Mirror of https://github.com/Xahau/xahaud.git (synced 2025-12-06 17:27:52 +00:00)
Squashed 'src/ripple/rocksdb/rocksdb/' content from commit 56589ab
git-subtree-dir: src/ripple/rocksdb/rocksdb
git-subtree-split: 56589ab81f6827ff7402e31b24a6d548f29a524f
include/rocksdb/arena.h (new file, 45 lines)
@@ -0,0 +1,45 @@
// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// Arena class defines memory allocation methods. It's used by memtable and
// skiplist.

#ifndef STORAGE_ROCKSDB_INCLUDE_ARENA_H_
#define STORAGE_ROCKSDB_INCLUDE_ARENA_H_

#include <limits>
#include <memory>

namespace rocksdb {

class Arena {
 public:
  Arena() {};
  virtual ~Arena() {};

  // Return a pointer to a newly allocated memory block of "bytes" bytes.
  virtual char* Allocate(size_t bytes) = 0;

  // Allocate memory with the normal alignment guarantees provided by malloc.
  virtual char* AllocateAligned(size_t bytes) = 0;

  // Returns an estimate of the total memory used by arena.
  virtual const size_t ApproximateMemoryUsage() = 0;

  // Returns the total number of bytes in all blocks allocated so far.
  virtual const size_t MemoryAllocatedBytes() = 0;

 private:
  // No copying allowed
  Arena(const Arena&);
  void operator=(const Arena&);
};

}  // namespace rocksdb

#endif  // STORAGE_ROCKSDB_INCLUDE_ARENA_H_
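Arena here is a pure interface: the commit ships only the abstract class, with the concrete block-based arenas elsewhere in the tree. A minimal sketch of a conforming subclass (illustrative only; TrivialArena is not part of this commit, and a real arena would carve allocations out of large blocks rather than malloc each request):

```cpp
#include <cstddef>
#include <vector>
#include "rocksdb/arena.h"

// Illustrative only: allocates every request separately and frees
// everything when the arena is destroyed.
class TrivialArena : public rocksdb::Arena {
 public:
  ~TrivialArena() {
    for (char* p : blocks_) delete[] p;  // arena-style bulk release
  }
  char* Allocate(size_t bytes) override {
    char* p = new char[bytes];
    blocks_.push_back(p);
    used_ += bytes;
    return p;
  }
  char* AllocateAligned(size_t bytes) override {
    return Allocate(bytes);  // new[] already returns suitably aligned memory
  }
  const size_t ApproximateMemoryUsage() override { return used_; }
  const size_t MemoryAllocatedBytes() override { return used_; }

 private:
  std::vector<char*> blocks_;
  size_t used_ = 0;
};
```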
include/rocksdb/c.h (new file, 285 lines)
@@ -0,0 +1,285 @@
/* Copyright (c) 2013, Facebook, Inc.  All rights reserved.
   This source code is licensed under the BSD-style license found in the
   LICENSE file in the root directory of this source tree. An additional grant
   of patent rights can be found in the PATENTS file in the same directory.
   Copyright (c) 2011 The LevelDB Authors. All rights reserved.
   Use of this source code is governed by a BSD-style license that can be
   found in the LICENSE file. See the AUTHORS file for names of contributors.

  C bindings for leveldb.  May be useful as a stable ABI that can be
  used by programs that keep leveldb in a shared library, or for
  a JNI api.

  Does not support:
  . getters for the option types
  . custom comparators that implement key shortening
  . capturing post-write-snapshot
  . custom iter, db, env, cache implementations using just the C bindings

  Some conventions:

  (1) We expose just opaque struct pointers and functions to clients.
      This allows us to change internal representations without having to
      recompile clients.

  (2) For simplicity, there is no equivalent to the Slice type.  Instead,
      the caller has to pass the pointer and length as separate
      arguments.

  (3) Errors are represented by a null-terminated c string.  NULL
      means no error.  All operations that can raise an error are passed
      a "char** errptr" as the last argument.  One of the following must
      be true on entry:
         *errptr == NULL
         *errptr points to a malloc()ed null-terminated error message
      On success, a leveldb routine leaves *errptr unchanged.
      On failure, leveldb frees the old value of *errptr and
      sets *errptr to a malloc()ed error message.

  (4) Bools have the type unsigned char (0 == false; rest == true)

  (5) All of the pointer arguments must be non-NULL.
*/

#ifndef STORAGE_ROCKSDB_INCLUDE_C_H_
#define STORAGE_ROCKSDB_INCLUDE_C_H_

#ifdef __cplusplus
extern "C" {
#endif

#include <stdarg.h>
#include <stddef.h>
#include <stdint.h>

/* Exported types */

typedef struct leveldb_t leveldb_t;
typedef struct leveldb_cache_t leveldb_cache_t;
typedef struct leveldb_comparator_t leveldb_comparator_t;
typedef struct leveldb_env_t leveldb_env_t;
typedef struct leveldb_filelock_t leveldb_filelock_t;
typedef struct leveldb_filterpolicy_t leveldb_filterpolicy_t;
typedef struct leveldb_iterator_t leveldb_iterator_t;
typedef struct leveldb_logger_t leveldb_logger_t;
typedef struct leveldb_options_t leveldb_options_t;
typedef struct leveldb_randomfile_t leveldb_randomfile_t;
typedef struct leveldb_readoptions_t leveldb_readoptions_t;
typedef struct leveldb_seqfile_t leveldb_seqfile_t;
typedef struct leveldb_snapshot_t leveldb_snapshot_t;
typedef struct leveldb_writablefile_t leveldb_writablefile_t;
typedef struct leveldb_writebatch_t leveldb_writebatch_t;
typedef struct leveldb_writeoptions_t leveldb_writeoptions_t;

/* DB operations */

extern leveldb_t* leveldb_open(
    const leveldb_options_t* options,
    const char* name,
    char** errptr);

extern void leveldb_close(leveldb_t* db);

extern void leveldb_put(
    leveldb_t* db,
    const leveldb_writeoptions_t* options,
    const char* key, size_t keylen,
    const char* val, size_t vallen,
    char** errptr);

extern void leveldb_delete(
    leveldb_t* db,
    const leveldb_writeoptions_t* options,
    const char* key, size_t keylen,
    char** errptr);

extern void leveldb_write(
    leveldb_t* db,
    const leveldb_writeoptions_t* options,
    leveldb_writebatch_t* batch,
    char** errptr);

/* Returns NULL if not found.  A malloc()ed array otherwise.
   Stores the length of the array in *vallen. */
extern char* leveldb_get(
    leveldb_t* db,
    const leveldb_readoptions_t* options,
    const char* key, size_t keylen,
    size_t* vallen,
    char** errptr);

extern leveldb_iterator_t* leveldb_create_iterator(
    leveldb_t* db,
    const leveldb_readoptions_t* options);

extern const leveldb_snapshot_t* leveldb_create_snapshot(
    leveldb_t* db);

extern void leveldb_release_snapshot(
    leveldb_t* db,
    const leveldb_snapshot_t* snapshot);

/* Returns NULL if property name is unknown.
   Else returns a pointer to a malloc()-ed null-terminated value. */
extern char* leveldb_property_value(
    leveldb_t* db,
    const char* propname);

extern void leveldb_approximate_sizes(
    leveldb_t* db,
    int num_ranges,
    const char* const* range_start_key, const size_t* range_start_key_len,
    const char* const* range_limit_key, const size_t* range_limit_key_len,
    uint64_t* sizes);

extern void leveldb_compact_range(
    leveldb_t* db,
    const char* start_key, size_t start_key_len,
    const char* limit_key, size_t limit_key_len);

/* Management operations */

extern void leveldb_destroy_db(
    const leveldb_options_t* options,
    const char* name,
    char** errptr);

extern void leveldb_repair_db(
    const leveldb_options_t* options,
    const char* name,
    char** errptr);

/* Iterator */

extern void leveldb_iter_destroy(leveldb_iterator_t*);
extern unsigned char leveldb_iter_valid(const leveldb_iterator_t*);
extern void leveldb_iter_seek_to_first(leveldb_iterator_t*);
extern void leveldb_iter_seek_to_last(leveldb_iterator_t*);
extern void leveldb_iter_seek(leveldb_iterator_t*, const char* k, size_t klen);
extern void leveldb_iter_next(leveldb_iterator_t*);
extern void leveldb_iter_prev(leveldb_iterator_t*);
extern const char* leveldb_iter_key(const leveldb_iterator_t*, size_t* klen);
extern const char* leveldb_iter_value(const leveldb_iterator_t*, size_t* vlen);
extern void leveldb_iter_get_error(const leveldb_iterator_t*, char** errptr);

/* Write batch */

extern leveldb_writebatch_t* leveldb_writebatch_create();
extern void leveldb_writebatch_destroy(leveldb_writebatch_t*);
extern void leveldb_writebatch_clear(leveldb_writebatch_t*);
extern void leveldb_writebatch_put(
    leveldb_writebatch_t*,
    const char* key, size_t klen,
    const char* val, size_t vlen);
extern void leveldb_writebatch_delete(
    leveldb_writebatch_t*,
    const char* key, size_t klen);
extern void leveldb_writebatch_iterate(
    leveldb_writebatch_t*,
    void* state,
    void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen),
    void (*deleted)(void*, const char* k, size_t klen));

/* Options */

extern leveldb_options_t* leveldb_options_create();
extern void leveldb_options_destroy(leveldb_options_t*);
extern void leveldb_options_set_comparator(
    leveldb_options_t*,
    leveldb_comparator_t*);
extern void leveldb_options_set_compression_per_level(
    leveldb_options_t* opt,
    int* level_values,
    size_t num_levels);
extern void leveldb_options_set_filter_policy(
    leveldb_options_t*,
    leveldb_filterpolicy_t*);
extern void leveldb_options_set_create_if_missing(
    leveldb_options_t*, unsigned char);
extern void leveldb_options_set_error_if_exists(
    leveldb_options_t*, unsigned char);
extern void leveldb_options_set_paranoid_checks(
    leveldb_options_t*, unsigned char);
extern void leveldb_options_set_env(leveldb_options_t*, leveldb_env_t*);
extern void leveldb_options_set_info_log(leveldb_options_t*, leveldb_logger_t*);
extern void leveldb_options_set_write_buffer_size(leveldb_options_t*, size_t);
extern void leveldb_options_set_max_open_files(leveldb_options_t*, int);
extern void leveldb_options_set_cache(leveldb_options_t*, leveldb_cache_t*);
extern void leveldb_options_set_block_size(leveldb_options_t*, size_t);
extern void leveldb_options_set_block_restart_interval(leveldb_options_t*, int);
extern void leveldb_options_set_compression_options(
    leveldb_options_t* opt, int w_bits, int level, int strategy);

enum {
  leveldb_no_compression = 0,
  leveldb_snappy_compression = 1
};
extern void leveldb_options_set_compression(leveldb_options_t*, int);

/* Comparator */

extern leveldb_comparator_t* leveldb_comparator_create(
    void* state,
    void (*destructor)(void*),
    int (*compare)(
        void*,
        const char* a, size_t alen,
        const char* b, size_t blen),
    const char* (*name)(void*));
extern void leveldb_comparator_destroy(leveldb_comparator_t*);

/* Filter policy */

extern leveldb_filterpolicy_t* leveldb_filterpolicy_create(
    void* state,
    void (*destructor)(void*),
    char* (*create_filter)(
        void*,
        const char* const* key_array, const size_t* key_length_array,
        int num_keys,
        size_t* filter_length),
    unsigned char (*key_may_match)(
        void*,
        const char* key, size_t length,
        const char* filter, size_t filter_length),
    const char* (*name)(void*));
extern void leveldb_filterpolicy_destroy(leveldb_filterpolicy_t*);

extern leveldb_filterpolicy_t* leveldb_filterpolicy_create_bloom(
    int bits_per_key);

/* Read options */

extern leveldb_readoptions_t* leveldb_readoptions_create();
extern void leveldb_readoptions_destroy(leveldb_readoptions_t*);
extern void leveldb_readoptions_set_verify_checksums(
    leveldb_readoptions_t*,
    unsigned char);
extern void leveldb_readoptions_set_fill_cache(
    leveldb_readoptions_t*, unsigned char);
extern void leveldb_readoptions_set_snapshot(
    leveldb_readoptions_t*,
    const leveldb_snapshot_t*);

/* Write options */

extern leveldb_writeoptions_t* leveldb_writeoptions_create();
extern void leveldb_writeoptions_destroy(leveldb_writeoptions_t*);
extern void leveldb_writeoptions_set_sync(
    leveldb_writeoptions_t*, unsigned char);

/* Cache */

extern leveldb_cache_t* leveldb_cache_create_lru(size_t capacity);
extern void leveldb_cache_destroy(leveldb_cache_t* cache);

/* Env */

extern leveldb_env_t* leveldb_create_default_env();
extern void leveldb_env_destroy(leveldb_env_t*);

#ifdef __cplusplus
}  /* end extern "C" */
#endif

#endif  /* STORAGE_ROCKSDB_INCLUDE_C_H_ */
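Convention (3) above, the char** errptr protocol, is the part callers most often get wrong. A minimal sketch of the intended call pattern (written as C-style code that also compiles as C++; the path /tmp/c_api_demo is illustrative, and the library is assumed to be linked in):

```cpp
#include <stdio.h>
#include <stdlib.h>
#include "rocksdb/c.h"

int main(void) {
  char* err = NULL;  /* must be NULL (or a malloc()ed message) on entry */

  leveldb_options_t* opts = leveldb_options_create();
  leveldb_options_set_create_if_missing(opts, 1);

  leveldb_t* db = leveldb_open(opts, "/tmp/c_api_demo", &err);
  if (err != NULL) {             /* non-NULL means the call failed */
    fprintf(stderr, "open failed: %s\n", err);
    free(err);                   /* caller owns the malloc()ed message */
    return 1;
  }

  leveldb_writeoptions_t* wopts = leveldb_writeoptions_create();
  /* if this fails, the library sets err; reusing &err below is fine,
     since a failed call frees the old value per convention (3) */
  leveldb_put(db, wopts, "key", 3, "value", 5, &err);

  leveldb_readoptions_t* ropts = leveldb_readoptions_create();
  size_t vallen = 0;
  char* val = leveldb_get(db, ropts, "key", 3, &vallen, &err);
  if (val != NULL) {
    printf("got %.*s\n", (int)vallen, val);
    free(val);                   /* leveldb_get returns a malloc()ed array */
  }

  leveldb_readoptions_destroy(ropts);
  leveldb_writeoptions_destroy(wopts);
  leveldb_options_destroy(opts);
  leveldb_close(db);
  return 0;
}
```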
include/rocksdb/cache.h (new file, 122 lines)
@@ -0,0 +1,122 @@
// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// A Cache is an interface that maps keys to values.  It has internal
// synchronization and may be safely accessed concurrently from
// multiple threads.  It may automatically evict entries to make room
// for new entries.  Values have a specified charge against the cache
// capacity.  For example, a cache where the values are variable
// length strings may use the length of the string as the charge for
// the string.
//
// A builtin cache implementation with a least-recently-used eviction
// policy is provided.  Clients may use their own implementations if
// they want something more sophisticated (like scan-resistance, a
// custom eviction policy, variable cache sizing, etc.)

#ifndef STORAGE_ROCKSDB_INCLUDE_CACHE_H_
#define STORAGE_ROCKSDB_INCLUDE_CACHE_H_

#include <memory>
#include <stdint.h>
#include "rocksdb/slice.h"

namespace rocksdb {

using std::shared_ptr;

class Cache;

// Create a new cache with a fixed size capacity. The cache is sharded
// to 2^numShardBits shards, by hash of the key. The total capacity
// is divided and evenly assigned to each shard. Inside each shard,
// eviction is done in two passes: first try to free space by evicting
// entries that are among the least-used removeScanCountLimit entries
// and are not referenced by anything other than the cache itself, in
// least-used order. If not enough space is freed, further free the
// entries in least-used order.
//
// The overloads without the numShardBits and/or removeScanCountLimit
// parameters use default values. removeScanCountLimit's default value
// is 0, which means a strict LRU order inside each shard.
extern shared_ptr<Cache> NewLRUCache(size_t capacity);
extern shared_ptr<Cache> NewLRUCache(size_t capacity, int numShardBits);
extern shared_ptr<Cache> NewLRUCache(size_t capacity, int numShardBits,
                                     int removeScanCountLimit);

class Cache {
 public:
  Cache() { }

  // Destroys all existing entries by calling the "deleter"
  // function that was passed to the constructor.
  virtual ~Cache();

  // Opaque handle to an entry stored in the cache.
  struct Handle { };

  // Insert a mapping from key->value into the cache and assign it
  // the specified charge against the total cache capacity.
  //
  // Returns a handle that corresponds to the mapping.  The caller
  // must call this->Release(handle) when the returned mapping is no
  // longer needed.
  //
  // When the inserted entry is no longer needed, the key and
  // value will be passed to "deleter".
  virtual Handle* Insert(const Slice& key, void* value, size_t charge,
                         void (*deleter)(const Slice& key, void* value)) = 0;

  // If the cache has no mapping for "key", returns nullptr.
  //
  // Else return a handle that corresponds to the mapping.  The caller
  // must call this->Release(handle) when the returned mapping is no
  // longer needed.
  virtual Handle* Lookup(const Slice& key) = 0;

  // Release a mapping returned by a previous Lookup().
  // REQUIRES: handle must not have been released yet.
  // REQUIRES: handle must have been returned by a method on *this.
  virtual void Release(Handle* handle) = 0;

  // Return the value encapsulated in a handle returned by a
  // successful Lookup().
  // REQUIRES: handle must not have been released yet.
  // REQUIRES: handle must have been returned by a method on *this.
  virtual void* Value(Handle* handle) = 0;

  // If the cache contains an entry for key, erase it.  Note that the
  // underlying entry will be kept around until all existing handles
  // to it have been released.
  virtual void Erase(const Slice& key) = 0;

  // Return a new numeric id.  May be used by multiple clients who are
  // sharing the same cache to partition the key space.  Typically the
  // client will allocate a new id at startup and prepend the id to
  // its cache keys.
  virtual uint64_t NewId() = 0;

  // Returns the maximum configured capacity of the cache.
  virtual size_t GetCapacity() = 0;

 private:
  void LRU_Remove(Handle* e);
  void LRU_Append(Handle* e);
  void Unref(Handle* e);

  struct Rep;
  Rep* rep_;

  // No copying allowed
  Cache(const Cache&);
  void operator=(const Cache&);
};

}  // namespace rocksdb

#endif  // STORAGE_ROCKSDB_INCLUDE_CACHE_H_
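The handle discipline implied by Insert/Lookup/Release is worth spelling out: every handle returned must eventually be Released, and the deleter only runs once the entry is both removed from the cache and unreferenced. A short sketch (the key, value, and 8 MB capacity are illustrative, not from the commit):

```cpp
#include <cassert>
#include <string>
#include "rocksdb/cache.h"

int main() {
  auto cache = rocksdb::NewLRUCache(8 * 1024 * 1024);  // 8 MB capacity

  // Captureless lambda converts to the plain function pointer Insert expects.
  auto deleter = [](const rocksdb::Slice& /*key*/, void* value) {
    delete static_cast<std::string*>(value);  // runs once entry is fully dropped
  };

  std::string* value = new std::string("hello");
  rocksdb::Cache::Handle* h =
      cache->Insert("greeting", value, value->size(), deleter);
  cache->Release(h);  // done with the handle returned by Insert

  if (rocksdb::Cache::Handle* found = cache->Lookup("greeting")) {
    assert(*static_cast<std::string*>(cache->Value(found)) == "hello");
    cache->Release(found);  // every successful Lookup needs a Release
  }
  return 0;
}
```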
include/rocksdb/compaction_filter.h (new file, 83 lines)
@@ -0,0 +1,83 @@
// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
// Copyright (c) 2013 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#ifndef STORAGE_ROCKSDB_INCLUDE_COMPACTION_FILTER_H_
#define STORAGE_ROCKSDB_INCLUDE_COMPACTION_FILTER_H_

#include <string>

namespace rocksdb {

class Slice;

// CompactionFilter allows an application to modify/delete a key-value at
// the time of compaction.

class CompactionFilter {
 public:

  // Context information of a compaction run
  struct Context {
    // Does this compaction run include all data files
    bool is_full_compaction;
  };

  virtual ~CompactionFilter() {}

  // The compaction process invokes this
  // method for each kv that is being compacted. A return value
  // of false indicates that the kv should be preserved in the
  // output of this compaction run and a return value of true
  // indicates that this key-value should be removed from the
  // output of the compaction. The application can inspect
  // the existing value of the key and make a decision based on it.
  //
  // When the value is to be preserved, the application has the option
  // to modify the existing_value and pass it back through new_value.
  // value_changed needs to be set to true in this case.
  virtual bool Filter(int level,
                      const Slice& key,
                      const Slice& existing_value,
                      std::string* new_value,
                      bool* value_changed) const = 0;

  // Returns a name that identifies this compaction filter.
  // The name will be printed to the LOG file on start up for diagnosis.
  virtual const char* Name() const = 0;
};

// Each compaction will create a new CompactionFilter, allowing the
// application to know about different compactions.
class CompactionFilterFactory {
 public:
  virtual ~CompactionFilterFactory() { };

  virtual std::unique_ptr<CompactionFilter> CreateCompactionFilter(
      const CompactionFilter::Context& context) = 0;

  // Returns a name that identifies this compaction filter factory.
  virtual const char* Name() const = 0;
};

// Default implementation of CompactionFilterFactory which does not
// return any filter.
class DefaultCompactionFilterFactory : public CompactionFilterFactory {
 public:
  virtual std::unique_ptr<CompactionFilter>
  CreateCompactionFilter(const CompactionFilter::Context& context) override {
    return std::unique_ptr<CompactionFilter>(nullptr);
  }

  virtual const char* Name() const override {
    return "DefaultCompactionFilterFactory";
  }
};

}  // namespace rocksdb

#endif  // STORAGE_ROCKSDB_INCLUDE_COMPACTION_FILTER_H_
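To illustrate the Filter contract (return true to drop the entry, or fill *new_value and set *value_changed to rewrite it), here is a sketch of a filter; the "tmp:" prefix is a made-up convention for the example, not anything in the commit:

```cpp
#include <cstring>
#include <string>
#include "rocksdb/compaction_filter.h"
#include "rocksdb/slice.h"

// Sketch: drop entries with empty values; strip a hypothetical "tmp:"
// prefix from everything else.
class DemoFilter : public rocksdb::CompactionFilter {
 public:
  bool Filter(int /*level*/, const rocksdb::Slice& /*key*/,
              const rocksdb::Slice& existing_value,
              std::string* new_value, bool* value_changed) const override {
    if (existing_value.size() == 0) {
      return true;  // true == remove this key-value from the compaction output
    }
    if (existing_value.size() >= 4 &&
        memcmp(existing_value.data(), "tmp:", 4) == 0) {
      // Keep the entry but rewrite its value without the prefix.
      new_value->assign(existing_value.data() + 4, existing_value.size() - 4);
      *value_changed = true;
    }
    return false;  // false == preserve the entry
  }

  const char* Name() const override { return "DemoFilter"; }
};
```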
include/rocksdb/comparator.h (new file, 67 lines)
@@ -0,0 +1,67 @@
// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#ifndef STORAGE_ROCKSDB_INCLUDE_COMPARATOR_H_
#define STORAGE_ROCKSDB_INCLUDE_COMPARATOR_H_

#include <string>

namespace rocksdb {

class Slice;

// A Comparator object provides a total order across slices that are
// used as keys in an sstable or a database.  A Comparator implementation
// must be thread-safe since rocksdb may invoke its methods concurrently
// from multiple threads.
class Comparator {
 public:
  virtual ~Comparator();

  // Three-way comparison.  Returns value:
  //   < 0 iff "a" < "b",
  //   == 0 iff "a" == "b",
  //   > 0 iff "a" > "b"
  virtual int Compare(const Slice& a, const Slice& b) const = 0;

  // The name of the comparator.  Used to check for comparator
  // mismatches (i.e., a DB created with one comparator is
  // accessed using a different comparator).
  //
  // The client of this package should switch to a new name whenever
  // the comparator implementation changes in a way that will cause
  // the relative ordering of any two keys to change.
  //
  // Names starting with "rocksdb." are reserved and should not be used
  // by any clients of this package.
  virtual const char* Name() const = 0;

  // Advanced functions: these are used to reduce the space requirements
  // for internal data structures like index blocks.

  // If *start < limit, changes *start to a short string in [start,limit).
  // Simple comparator implementations may return with *start unchanged,
  // i.e., an implementation of this method that does nothing is correct.
  virtual void FindShortestSeparator(
      std::string* start,
      const Slice& limit) const = 0;

  // Changes *key to a short string >= *key.
  // Simple comparator implementations may return with *key unchanged,
  // i.e., an implementation of this method that does nothing is correct.
  virtual void FindShortSuccessor(std::string* key) const = 0;
};

// Return a builtin comparator that uses lexicographic byte-wise
// ordering.  The result remains the property of this module and
// must not be deleted.
extern const Comparator* BytewiseComparator();

}  // namespace rocksdb

#endif  // STORAGE_ROCKSDB_INCLUDE_COMPARATOR_H_
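Because the header states that do-nothing implementations of the two advanced methods are correct, a custom comparator only strictly needs Compare and Name. A sketch that inverts the builtin bytewise order (illustrative only; the name "demo.ReverseBytewise" is made up):

```cpp
#include <string>
#include "rocksdb/comparator.h"
#include "rocksdb/slice.h"

// Sketch: descending bytewise order, by delegating to the builtin
// comparator and flipping the sign.
class ReverseBytewiseComparator : public rocksdb::Comparator {
 public:
  int Compare(const rocksdb::Slice& a,
              const rocksdb::Slice& b) const override {
    return -rocksdb::BytewiseComparator()->Compare(a, b);
  }

  // A distinct name: a DB written with this ordering must never be
  // reopened with a differently named comparator.
  const char* Name() const override { return "demo.ReverseBytewise"; }

  // Doing nothing here is documented to be correct, just suboptimal.
  void FindShortestSeparator(std::string*,
                             const rocksdb::Slice&) const override {}
  void FindShortSuccessor(std::string*) const override {}
};
```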
include/rocksdb/db.h (new file, 307 lines)
@@ -0,0 +1,307 @@
// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#ifndef STORAGE_ROCKSDB_INCLUDE_DB_H_
#define STORAGE_ROCKSDB_INCLUDE_DB_H_

#include <stdint.h>
#include <stdio.h>
#include <memory>
#include <vector>
#include "rocksdb/iterator.h"
#include "rocksdb/options.h"
#include "rocksdb/types.h"
#include "rocksdb/transaction_log.h"

namespace rocksdb {

using std::unique_ptr;

// Update Makefile if you change these
static const int kMajorVersion = 2;
static const int kMinorVersion = 0;

struct Options;
struct ReadOptions;
struct WriteOptions;
struct FlushOptions;
class WriteBatch;

// Metadata associated with each SST file.
struct LiveFileMetaData {
  std::string name;         // Name of the file
  int level;                // Level at which this file resides.
  size_t size;              // File size in bytes.
  std::string smallestkey;  // Smallest user defined key in the file.
  std::string largestkey;   // Largest user defined key in the file.
  SequenceNumber smallest_seqno;  // smallest seqno in file
  SequenceNumber largest_seqno;   // largest seqno in file
};

// Abstract handle to particular state of a DB.
// A Snapshot is an immutable object and can therefore be safely
// accessed from multiple threads without any external synchronization.
class Snapshot {
 protected:
  virtual ~Snapshot();
};

// A range of keys
struct Range {
  Slice start;  // Included in the range
  Slice limit;  // Not included in the range

  Range() { }
  Range(const Slice& s, const Slice& l) : start(s), limit(l) { }
};

// A DB is a persistent ordered map from keys to values.
// A DB is safe for concurrent access from multiple threads without
// any external synchronization.
class DB {
 public:
  // Open the database with the specified "name".
  // Stores a pointer to a heap-allocated database in *dbptr and returns
  // OK on success.
  // Stores nullptr in *dbptr and returns a non-OK status on error.
  // Caller should delete *dbptr when it is no longer needed.
  static Status Open(const Options& options,
                     const std::string& name,
                     DB** dbptr);

  // Open the database for read only. All DB interfaces
  // that modify data, like put/delete, will return error.
  // If the db is opened in read only mode, then no compactions
  // will happen.
  static Status OpenForReadOnly(const Options& options,
                                const std::string& name, DB** dbptr,
                                bool error_if_log_file_exist = false);

  DB() { }
  virtual ~DB();

  // Set the database entry for "key" to "value".
  // Returns OK on success, and a non-OK status on error.
  // Note: consider setting options.sync = true.
  virtual Status Put(const WriteOptions& options,
                     const Slice& key,
                     const Slice& value) = 0;

  // Remove the database entry (if any) for "key".  Returns OK on
  // success, and a non-OK status on error.  It is not an error if "key"
  // did not exist in the database.
  // Note: consider setting options.sync = true.
  virtual Status Delete(const WriteOptions& options, const Slice& key) = 0;

  // Merge the database entry for "key" with "value".  Returns OK on success,
  // and a non-OK status on error. The semantics of this operation are
  // determined by the user-provided merge_operator when opening the DB.
  // Note: consider setting options.sync = true.
  virtual Status Merge(const WriteOptions& options,
                       const Slice& key,
                       const Slice& value) = 0;

  // Apply the specified updates to the database.
  // Returns OK on success, non-OK on failure.
  // Note: consider setting options.sync = true.
  virtual Status Write(const WriteOptions& options, WriteBatch* updates) = 0;

  // If the database contains an entry for "key" store the
  // corresponding value in *value and return OK.
  //
  // If there is no entry for "key" leave *value unchanged and return
  // a status for which Status::IsNotFound() returns true.
  //
  // May return some other Status on an error.
  virtual Status Get(const ReadOptions& options,
                     const Slice& key,
                     std::string* value) = 0;

  // If keys[i] does not exist in the database, then the i'th returned
  // status will be one for which Status::IsNotFound() is true, and
  // (*values)[i] will be set to some arbitrary value (often "").  Otherwise,
  // the i'th returned status will have Status::ok() true, and (*values)[i]
  // will store the value associated with keys[i].
  //
  // (*values) will always be resized to be the same size as (keys).
  // Similarly, the number of returned statuses will be the number of keys.
  // Note: keys will not be "de-duplicated". Duplicate keys will return
  // duplicate values in order.
  virtual std::vector<Status> MultiGet(const ReadOptions& options,
                                       const std::vector<Slice>& keys,
                                       std::vector<std::string>* values) = 0;

  // If the key definitely does not exist in the database, then this method
  // returns false, else true. If the caller wants to obtain the value when
  // the key is found in memory, a bool for 'value_found' must be passed.
  // 'value_found' will be true on return if the value has been set properly.
  // This check is potentially lighter-weight than invoking DB::Get(). One
  // way to make this lighter weight is to avoid doing any IOs.
  // The default implementation here returns true and sets 'value_found' to
  // false.
  virtual bool KeyMayExist(const ReadOptions& options,
                           const Slice& key,
                           std::string* value,
                           bool* value_found = nullptr) {
    if (value_found != nullptr) {
      *value_found = false;
    }
    return true;
  }

  // Return a heap-allocated iterator over the contents of the database.
  // The result of NewIterator() is initially invalid (caller must
  // call one of the Seek methods on the iterator before using it).
  //
  // Caller should delete the iterator when it is no longer needed.
  // The returned iterator should be deleted before this db is deleted.
  virtual Iterator* NewIterator(const ReadOptions& options) = 0;

  // Return a handle to the current DB state.  Iterators created with
  // this handle will all observe a stable snapshot of the current DB
  // state.  The caller must call ReleaseSnapshot(result) when the
  // snapshot is no longer needed.
  virtual const Snapshot* GetSnapshot() = 0;

  // Release a previously acquired snapshot.  The caller must not
  // use "snapshot" after this call.
  virtual void ReleaseSnapshot(const Snapshot* snapshot) = 0;
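The snapshot and iterator lifetime rules above pair naturally in practice. A short sketch of the intended pattern (illustrative, not from the commit; the DB declaration continues below):

```cpp
#include "rocksdb/db.h"

// Sketch: iterate a stable view of the database, then release everything
// in reverse order of acquisition (iterator before snapshot).
void ScanStableView(rocksdb::DB* db) {
  const rocksdb::Snapshot* snap = db->GetSnapshot();
  rocksdb::ReadOptions ropts;
  ropts.snapshot = snap;  // reads ignore writes made after GetSnapshot()

  rocksdb::Iterator* it = db->NewIterator(ropts);
  for (it->SeekToFirst(); it->Valid(); it->Next()) {
    // it->key() and it->value() are only valid until the next move
  }
  delete it;                  // iterator must be deleted before the DB
  db->ReleaseSnapshot(snap);  // every GetSnapshot() needs a ReleaseSnapshot()
}
```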
  // DB implementations can export properties about their state
  // via this method.  If "property" is a valid property understood by this
  // DB implementation, fills "*value" with its current value and returns
  // true.  Otherwise returns false.
  //
  // Valid property names include:
  //
  //  "rocksdb.num-files-at-level<N>" - return the number of files at level
  //     <N>, where <N> is an ASCII representation of a level number
  //     (e.g. "0").
  //  "rocksdb.stats" - returns a multi-line string that describes statistics
  //     about the internal operation of the DB.
  //  "rocksdb.sstables" - returns a multi-line string that describes all
  //     of the sstables that make up the db contents.
  virtual bool GetProperty(const Slice& property, std::string* value) = 0;

  // For each i in [0,n-1], store in "sizes[i]" the approximate
  // file system space used by keys in "[range[i].start .. range[i].limit)".
  //
  // Note that the returned sizes measure file system space usage, so
  // if the user data compresses by a factor of ten, the returned
  // sizes will be one-tenth the size of the corresponding user data size.
  //
  // The results may not include the sizes of recently written data.
  virtual void GetApproximateSizes(const Range* range, int n,
                                   uint64_t* sizes) = 0;

  // Compact the underlying storage for the key range [*begin,*end].
  // In particular, deleted and overwritten versions are discarded,
  // and the data is rearranged to reduce the cost of operations
  // needed to access the data.  This operation should typically only
  // be invoked by users who understand the underlying implementation.
  //
  // begin==nullptr is treated as a key before all keys in the database.
  // end==nullptr is treated as a key after all keys in the database.
  // Therefore the following call will compact the entire database:
  //    db->CompactRange(nullptr, nullptr);
  // Note that after the entire database is compacted, all data are pushed
  // down to the last level containing any data. If the total data size
  // after compaction is reduced, that level might not be appropriate for
  // hosting all the files. In this case, the client could set reduce_level
  // to true, to move the files back to the minimum level capable of holding
  // the data set, or to a given level (specified by a non-negative
  // target_level).
  virtual void CompactRange(const Slice* begin, const Slice* end,
                            bool reduce_level = false,
                            int target_level = -1) = 0;

  // Number of levels used for this DB.
  virtual int NumberLevels() = 0;

  // Maximum level to which a new compacted memtable is pushed if it
  // does not create overlap.
  virtual int MaxMemCompactionLevel() = 0;

  // Number of files in level-0 that would stop writes.
  virtual int Level0StopWriteTrigger() = 0;

  // Flush all mem-table data.
  virtual Status Flush(const FlushOptions& options) = 0;

  // Prevent file deletions. Compactions will continue to occur,
  // but no obsolete files will be deleted. Calling this multiple
  // times has the same effect as calling it once.
  virtual Status DisableFileDeletions() = 0;

  // Allow compactions to delete obsolete files.
  virtual Status EnableFileDeletions() = 0;

  // GetLiveFiles followed by GetSortedWalFiles can generate a lossless
  // backup.
  //
  // THIS METHOD IS DEPRECATED. Use GetLiveFilesMetaData to get more
  // detailed information on the live files.
  // Retrieve the list of all files in the database. The files are
  // relative to the dbname and are not absolute paths. The valid size of
  // the manifest file is returned in manifest_file_size. The manifest file
  // is an ever growing file, but only the portion specified by
  // manifest_file_size is valid for this snapshot.
  // Setting flush_memtable to true does Flush before recording the live
  // files. Setting flush_memtable to false is useful when we don't want to
  // wait for a flush, which may have to wait for a compaction to complete,
  // taking an indeterminate time. But then GetSortedWalFiles must be used
  // after GetLiveFiles to compensate for memtables missed in this snapshot
  // due to the absence of Flush, using the WAL files to recover the
  // database consistently later.
  virtual Status GetLiveFiles(std::vector<std::string>&,
                              uint64_t* manifest_file_size,
                              bool flush_memtable = true) = 0;

  // Retrieve the sorted list of all wal files with earliest file first
  virtual Status GetSortedWalFiles(VectorLogPtr& files) = 0;

  // The sequence number of the most recent transaction.
  virtual SequenceNumber GetLatestSequenceNumber() const = 0;

  // Sets iter to an iterator that is positioned at a write-batch containing
  // seq_number. If the sequence number is non-existent, it returns an
  // iterator at the first available seq_no after the requested seq_no.
  // Returns Status::OK() if the iterator is valid.
  // Must set WAL_ttl_seconds or WAL_size_limit_MB to large values to
  // use this api, else the WAL files will get
  // cleared aggressively and the iterator might keep getting invalidated
  // before an update is read.
  virtual Status GetUpdatesSince(SequenceNumber seq_number,
                                 unique_ptr<TransactionLogIterator>* iter) = 0;

  // Delete the file name from the db directory and update the internal
  // state to reflect that. Supports deletion of sst and log files only.
  // 'name' must be a path relative to the db directory,
  // e.g. 000001.sst, /archive/000003.log.
  virtual Status DeleteFile(std::string name) = 0;

  // Returns a list of all table files with their level, start key
  // and end key.
  virtual void GetLiveFilesMetaData(
      std::vector<LiveFileMetaData> *metadata) {
  }

 private:
  // No copying allowed
  DB(const DB&);
  void operator=(const DB&);
};

// Destroy the contents of the specified database.
// Be very careful using this method.
Status DestroyDB(const std::string& name, const Options& options);

// If a DB cannot be opened, you may attempt to call this method to
// resurrect as much of the contents of the database as possible.
// Some data may be lost, so be careful when calling this function
// on a database that contains important information.
Status RepairDB(const std::string& dbname, const Options& options);

}  // namespace rocksdb

#endif  // STORAGE_ROCKSDB_INCLUDE_DB_H_
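Putting the core methods together, the basic lifecycle sketched by these declarations looks like the following (the path and assertions are illustrative):

```cpp
#include <cassert>
#include <string>
#include "rocksdb/db.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;  // otherwise Open fails on a fresh path

  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/db_demo", &db);
  assert(s.ok());

  s = db->Put(rocksdb::WriteOptions(), "key", "value");
  assert(s.ok());

  std::string value;
  s = db->Get(rocksdb::ReadOptions(), "key", &value);
  assert(s.ok() && value == "value");

  s = db->Delete(rocksdb::WriteOptions(), "key");
  s = db->Get(rocksdb::ReadOptions(), "key", &value);
  assert(s.IsNotFound());  // not an error state, just "no entry"

  delete db;  // the caller owns *dbptr
  return 0;
}
```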
include/rocksdb/env.h (new file, 649 lines)
@@ -0,0 +1,649 @@
// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// An Env is an interface used by the rocksdb implementation to access
// operating system functionality like the filesystem etc.  Callers
// may wish to provide a custom Env object when opening a database to
// get fine grain control; e.g., to rate limit file system operations.
//
// All Env implementations are safe for concurrent access from
// multiple threads without any external synchronization.

#ifndef STORAGE_ROCKSDB_INCLUDE_ENV_H_
#define STORAGE_ROCKSDB_INCLUDE_ENV_H_

#include <cstdarg>
#include <string>
#include <memory>
#include <vector>
#include <stdint.h>
#include "rocksdb/status.h"

namespace rocksdb {

class FileLock;
class Logger;
class RandomAccessFile;
class SequentialFile;
class Slice;
class WritableFile;
class RandomRWFile;
struct Options;

using std::unique_ptr;
using std::shared_ptr;

// Options while opening a file to read/write
struct EnvOptions {

  // construct with default Options
  EnvOptions();

  // construct from Options
  explicit EnvOptions(const Options& options);

  // If true, then allow caching of data in environment buffers
  bool use_os_buffer = true;

  // If true, then use mmap to read data
  bool use_mmap_reads = false;

  // If true, then use mmap to write data
  bool use_mmap_writes = true;

  // If true, set the FD_CLOEXEC on open fd.
  bool set_fd_cloexec = true;

  // Allows OS to incrementally sync files to disk while they are being
  // written, in the background. Issue one request for every bytes_per_sync
  // written. 0 turns it off.
  // Default: 0
  uint64_t bytes_per_sync = 0;
};

class Env {
 public:
  Env() { }
  virtual ~Env();

  // Return a default environment suitable for the current operating
  // system.  Sophisticated users may wish to provide their own Env
  // implementation instead of relying on this default environment.
  //
  // The result of Default() belongs to rocksdb and must never be deleted.
  static Env* Default();

  // Create a brand new sequentially-readable file with the specified name.
  // On success, stores a pointer to the new file in *result and returns OK.
  // On failure stores nullptr in *result and returns non-OK.  If the file
  // does not exist, returns a non-OK status.
  //
  // The returned file will only be accessed by one thread at a time.
  virtual Status NewSequentialFile(const std::string& fname,
                                   unique_ptr<SequentialFile>* result,
                                   const EnvOptions& options) = 0;

  // Create a brand new random access read-only file with the
  // specified name.  On success, stores a pointer to the new file in
  // *result and returns OK.  On failure stores nullptr in *result and
  // returns non-OK.  If the file does not exist, returns a non-OK
  // status.
  //
  // The returned file may be concurrently accessed by multiple threads.
  virtual Status NewRandomAccessFile(const std::string& fname,
                                     unique_ptr<RandomAccessFile>* result,
                                     const EnvOptions& options) = 0;

  // Create an object that writes to a new file with the specified
  // name.  Deletes any existing file with the same name and creates a
  // new file.  On success, stores a pointer to the new file in
  // *result and returns OK.  On failure stores nullptr in *result and
  // returns non-OK.
  //
  // The returned file will only be accessed by one thread at a time.
  virtual Status NewWritableFile(const std::string& fname,
                                 unique_ptr<WritableFile>* result,
                                 const EnvOptions& options) = 0;

  // Create an object that both reads and writes to a file on
  // specified offsets (random access). If the file already exists,
  // does not overwrite it. On success, stores a pointer to the
  // new file in *result and returns OK. On failure stores nullptr
  // in *result and returns non-OK.
  virtual Status NewRandomRWFile(const std::string& fname,
                                 unique_ptr<RandomRWFile>* result,
                                 const EnvOptions& options) = 0;

  // Returns true iff the named file exists.
  virtual bool FileExists(const std::string& fname) = 0;

  // Store in *result the names of the children of the specified directory.
  // The names are relative to "dir".
  // Original contents of *results are dropped.
  virtual Status GetChildren(const std::string& dir,
                             std::vector<std::string>* result) = 0;

  // Delete the named file.
  virtual Status DeleteFile(const std::string& fname) = 0;

  // Create the specified directory. Returns error if directory exists.
  virtual Status CreateDir(const std::string& dirname) = 0;

  // Creates the directory if missing. Returns OK if it exists, or was
  // successfully created.
  virtual Status CreateDirIfMissing(const std::string& dirname) = 0;

  // Delete the specified directory.
  virtual Status DeleteDir(const std::string& dirname) = 0;

  // Store the size of fname in *file_size.
  virtual Status GetFileSize(const std::string& fname,
                             uint64_t* file_size) = 0;

  // Store the last modification time of fname in *file_mtime.
  virtual Status GetFileModificationTime(const std::string& fname,
                                         uint64_t* file_mtime) = 0;

  // Rename file src to target.
  virtual Status RenameFile(const std::string& src,
                            const std::string& target) = 0;

  // Lock the specified file.  Used to prevent concurrent access to
  // the same db by multiple processes.  On failure, stores nullptr in
  // *lock and returns non-OK.
  //
  // On success, stores a pointer to the object that represents the
  // acquired lock in *lock and returns OK.  The caller should call
  // UnlockFile(*lock) to release the lock.  If the process exits,
  // the lock will be automatically released.
  //
  // If somebody else already holds the lock, finishes immediately
  // with a failure.  I.e., this call does not wait for existing locks
  // to go away.
  //
  // May create the named file if it does not already exist.
  virtual Status LockFile(const std::string& fname, FileLock** lock) = 0;

  // Release the lock acquired by a previous successful call to LockFile.
  // REQUIRES: lock was returned by a successful LockFile() call
  // REQUIRES: lock has not already been unlocked.
  virtual Status UnlockFile(FileLock* lock) = 0;

  enum Priority { LOW, HIGH, TOTAL };

  // Arrange to run "(*function)(arg)" once in a background thread, in
  // the thread pool specified by pri. By default, jobs go to the 'LOW'
  // priority thread pool.
  //
  // "function" may run in an unspecified thread.  Multiple functions
  // added to the same Env may run concurrently in different threads.
  // I.e., the caller may not assume that background work items are
  // serialized.
  virtual void Schedule(
      void (*function)(void* arg),
      void* arg,
      Priority pri = LOW) = 0;

  // Start a new thread, invoking "function(arg)" within the new thread.
  // When "function(arg)" returns, the thread will be destroyed.
  virtual void StartThread(void (*function)(void* arg), void* arg) = 0;

  // *path is set to a temporary directory that can be used for testing.
  // It may or may not have just been created. The directory may or may not
  // differ between runs of the same process, but subsequent calls will
  // return the same directory.
  virtual Status GetTestDirectory(std::string* path) = 0;

  // Create and return a log file for storing informational messages.
  virtual Status NewLogger(const std::string& fname,
                           shared_ptr<Logger>* result) = 0;

  // Returns the number of micro-seconds since some fixed point in time.
  // Only useful for computing deltas of time.
  virtual uint64_t NowMicros() = 0;

  // Returns the number of nano-seconds since some fixed point in time.
  // Only useful for computing deltas of time in one run.
  // Default implementation simply relies on NowMicros.
  virtual uint64_t NowNanos() {
    return NowMicros() * 1000;
  }

  // Sleep/delay the thread for the prescribed number of micro-seconds.
  virtual void SleepForMicroseconds(int micros) = 0;

  // Get the current host name.
  virtual Status GetHostName(char* name, uint64_t len) = 0;

  // Get the number of seconds since the Epoch, 1970-01-01 00:00:00 (UTC).
  virtual Status GetCurrentTime(int64_t* unix_time) = 0;

  // Get the full directory name for this db.
  virtual Status GetAbsolutePath(const std::string& db_path,
                                 std::string* output_path) = 0;

  // The number of background worker threads of a specific thread pool
  // for this environment. 'LOW' is the default pool.
  // default number: 1
  virtual void SetBackgroundThreads(int number, Priority pri = LOW) = 0;

  // Converts seconds-since-Jan-01-1970 to a printable string
  virtual std::string TimeToString(uint64_t time) = 0;

  // Generates a unique id that can be used to identify a db
  virtual std::string GenerateUniqueId();

 private:
  // No copying allowed
  Env(const Env&);
  void operator=(const Env&);
};
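As a quick illustration of the ownership rules stated above (Env::Default() is never deleted; new file objects come back through unique_ptr out-parameters), here is a sketch that writes and re-reads a small file; the path is illustrative and error handling is reduced to asserts:

```cpp
#include <cassert>
#include <memory>
#include "rocksdb/env.h"
#include "rocksdb/slice.h"

int main() {
  rocksdb::Env* env = rocksdb::Env::Default();  // owned by rocksdb; never delete
  rocksdb::EnvOptions env_opts;

  // Write: NewWritableFile truncates/creates and hands back ownership.
  std::unique_ptr<rocksdb::WritableFile> wf;
  rocksdb::Status s = env->NewWritableFile("/tmp/env_demo.txt", &wf, env_opts);
  assert(s.ok());
  wf->Append(rocksdb::Slice("hello env\n"));
  wf->Sync();
  wf->Close();

  // Read back sequentially; "result" may point into the scratch buffer,
  // so scratch must outlive any use of result.
  std::unique_ptr<rocksdb::SequentialFile> rf;
  s = env->NewSequentialFile("/tmp/env_demo.txt", &rf, env_opts);
  assert(s.ok());
  char scratch[64];
  rocksdb::Slice result;
  rf->Read(sizeof(scratch), &result, scratch);
  assert(result.size() == 10);
  return 0;
}
```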
// A file abstraction for reading sequentially through a file
|
||||
class SequentialFile {
|
||||
public:
|
||||
SequentialFile() { }
|
||||
virtual ~SequentialFile();
|
||||
|
||||
// Read up to "n" bytes from the file. "scratch[0..n-1]" may be
|
||||
// written by this routine. Sets "*result" to the data that was
|
||||
// read (including if fewer than "n" bytes were successfully read).
|
||||
// May set "*result" to point at data in "scratch[0..n-1]", so
|
||||
// "scratch[0..n-1]" must be live when "*result" is used.
|
||||
// If an error was encountered, returns a non-OK status.
|
||||
//
|
||||
// REQUIRES: External synchronization
|
||||
virtual Status Read(size_t n, Slice* result, char* scratch) = 0;
|
||||
|
||||
// Skip "n" bytes from the file. This is guaranteed to be no
|
||||
// slower that reading the same data, but may be faster.
|
||||
//
|
||||
// If end of file is reached, skipping will stop at the end of the
|
||||
// file, and Skip will return OK.
|
||||
//
|
||||
// REQUIRES: External synchronization
|
||||
virtual Status Skip(uint64_t n) = 0;
|
||||
|
||||
// Remove any kind of caching of data from the offset to offset+length
|
||||
// of this file. If the length is 0, then it refers to the end of file.
|
||||
// If the system is not caching the file contents, then this is a noop.
|
||||
virtual Status InvalidateCache(size_t offset, size_t length) {
|
||||
return Status::NotSupported("InvalidateCache not supported.");
|
||||
}
|
||||
};
|
||||
|
||||
// A file abstraction for randomly reading the contents of a file.
|
||||
class RandomAccessFile {
|
||||
public:
|
||||
RandomAccessFile() { }
|
||||
virtual ~RandomAccessFile();
|
||||
|
||||
// Read up to "n" bytes from the file starting at "offset".
|
||||
// "scratch[0..n-1]" may be written by this routine. Sets "*result"
|
||||
// to the data that was read (including if fewer than "n" bytes were
|
||||
// successfully read). May set "*result" to point at data in
|
||||
// "scratch[0..n-1]", so "scratch[0..n-1]" must be live when
|
||||
// "*result" is used. If an error was encountered, returns a non-OK
|
||||
// status.
|
||||
//
|
||||
// Safe for concurrent use by multiple threads.
|
||||
virtual Status Read(uint64_t offset, size_t n, Slice* result,
|
||||
char* scratch) const = 0;
|
||||
|
||||
// Tries to get an unique ID for this file that will be the same each time
|
||||
// the file is opened (and will stay the same while the file is open).
|
||||
// Furthermore, it tries to make this ID at most "max_size" bytes. If such an
|
||||
// ID can be created this function returns the length of the ID and places it
|
||||
// in "id"; otherwise, this function returns 0, in which case "id"
|
||||
// may not have been modified.
|
||||
//
|
||||
// This function guarantees, for IDs from a given environment, two unique ids
|
||||
// cannot be made equal to eachother by adding arbitrary bytes to one of
|
||||
// them. That is, no unique ID is the prefix of another.
|
||||
//
|
||||
// This function guarantees that the returned ID will not be interpretable as
|
||||
// a single varint.
|
||||
//
|
||||
// Note: these IDs are only valid for the duration of the process.
|
||||
virtual size_t GetUniqueId(char* id, size_t max_size) const {
|
||||
return 0; // Default implementation to prevent issues with backwards
|
||||
// compatibility.
|
||||
};
|
||||
|
||||
|
||||
enum AccessPattern { NORMAL, RANDOM, SEQUENTIAL, WILLNEED, DONTNEED };
|
||||
|
||||
virtual void Hint(AccessPattern pattern) {}
|
||||
|
||||
// Remove any kind of caching of data from the offset to offset+length
|
||||
// of this file. If the length is 0, then it refers to the end of file.
|
||||
// If the system is not caching the file contents, then this is a noop.
|
||||
virtual Status InvalidateCache(size_t offset, size_t length) {
|
||||
return Status::NotSupported("InvalidateCache not supported.");
|
||||
}
|
||||
};

// A file abstraction for sequential writing. The implementation
// must provide buffering since callers may append small fragments
// at a time to the file.
class WritableFile {
 public:
  WritableFile() : last_preallocated_block_(0), preallocation_block_size_(0) {
  }
  virtual ~WritableFile();

  virtual Status Append(const Slice& data) = 0;
  virtual Status Close() = 0;
  virtual Status Flush() = 0;
  virtual Status Sync() = 0; // sync data

  /*
   * Sync data and/or metadata as well.
   * By default, sync only data.
   * Override this method for environments where we need to sync
   * metadata as well.
   */
  virtual Status Fsync() {
    return Sync();
  }

  /*
   * Get the size of valid data in the file.
   */
  virtual uint64_t GetFileSize() {
    return 0;
  }

  /*
   * Get and set the default pre-allocation block size for writes to
   * this file. If non-zero, then Allocate will be used to extend the
   * underlying storage of a file (generally via fallocate) if the Env
   * instance supports it.
   */
  void SetPreallocationBlockSize(size_t size) {
    preallocation_block_size_ = size;
  }

  virtual void GetPreallocationStatus(size_t* block_size,
                                      size_t* last_allocated_block) {
    *last_allocated_block = last_preallocated_block_;
    *block_size = preallocation_block_size_;
  }

  // For documentation, refer to RandomAccessFile::GetUniqueId()
  virtual size_t GetUniqueId(char* id, size_t max_size) const {
    return 0; // Default implementation to prevent issues with backwards
              // compatibility.
  }

  // Remove any kind of caching of data from the offset to offset+length
  // of this file. If the length is 0, then it refers to the end of file.
  // If the system is not caching the file contents, then this is a noop.
  // This call has no effect on dirty pages in the cache.
  virtual Status InvalidateCache(size_t offset, size_t length) {
    return Status::NotSupported("InvalidateCache not supported.");
  }

 protected:
  // PrepareWrite performs any necessary preparation for a write
  // before the write actually occurs. This allows for pre-allocation
  // of space on devices where it can result in less file
  // fragmentation and/or less waste from over-zealous filesystem
  // pre-allocation.
  void PrepareWrite(size_t offset, size_t len) {
    if (preallocation_block_size_ == 0) {
      return;
    }
    // If this write would cross one or more preallocation blocks,
    // determine what the last preallocation block necessary to
    // cover this write would be and Allocate to that point.
    const auto block_size = preallocation_block_size_;
    size_t new_last_preallocated_block =
        (offset + len + block_size - 1) / block_size;
    if (new_last_preallocated_block > last_preallocated_block_) {
      size_t num_spanned_blocks =
          new_last_preallocated_block - last_preallocated_block_;
      Allocate(block_size * last_preallocated_block_,
               block_size * num_spanned_blocks);
      last_preallocated_block_ = new_last_preallocated_block;
    }
  }

  /*
   * Pre-allocate space for a file.
   */
  virtual Status Allocate(off_t offset, off_t len) {
    return Status::OK();
  }

  // Sync a file range with disk.
  // offset is the starting byte of the file range to be synchronized.
  // nbytes specifies the length of the range to be synchronized.
  // This asks the OS to initiate flushing the cached data to disk,
  // without waiting for completion.
  // Default implementation does nothing.
  virtual Status RangeSync(off_t offset, off_t nbytes) {
    return Status::OK();
  }

 private:
  size_t last_preallocated_block_;
  size_t preallocation_block_size_;
  // No copying allowed
  WritableFile(const WritableFile&);
  void operator=(const WritableFile&);
};
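
// The following usage sketch is illustrative and not part of the original
// header: it appends two small fragments (relying on the buffering this
// interface requires), flushes, syncs, and closes. It assumes EnvOptions is
// default-constructible; the file name comes from the caller.
inline Status ExampleWriteTwoFragments(Env* env, const std::string& fname) {
  unique_ptr<WritableFile> file;
  Status s = env->NewWritableFile(fname, &file, EnvOptions());
  if (!s.ok()) return s;
  s = file->Append(Slice("hello "));  // small fragments are buffered
  if (s.ok()) s = file->Append(Slice("world"));
  if (s.ok()) s = file->Flush();      // hand buffered bytes to the OS
  if (s.ok()) s = file->Sync();       // then make the data durable
  if (s.ok()) s = file->Close();
  return s;
}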

// A file abstraction for random reading and writing.
class RandomRWFile {
 public:
  RandomRWFile() {}
  virtual ~RandomRWFile() {}

  // Write data from Slice data to file starting from offset
  // Returns IOError on failure, but does not guarantee
  // atomicity of a write. Returns OK status on success.
  //
  // Safe for concurrent use.
  virtual Status Write(uint64_t offset, const Slice& data) = 0;
  // Read up to "n" bytes from the file starting at "offset".
  // "scratch[0..n-1]" may be written by this routine. Sets "*result"
  // to the data that was read (including if fewer than "n" bytes were
  // successfully read). May set "*result" to point at data in
  // "scratch[0..n-1]", so "scratch[0..n-1]" must be live when
  // "*result" is used. If an error was encountered, returns a non-OK
  // status.
  //
  // Safe for concurrent use by multiple threads.
  virtual Status Read(uint64_t offset, size_t n, Slice* result,
                      char* scratch) const = 0;
  virtual Status Close() = 0; // closes the file
  virtual Status Sync() = 0; // sync data

  /*
   * Sync data and/or metadata as well.
   * By default, sync only data.
   * Override this method for environments where we need to sync
   * metadata as well.
   */
  virtual Status Fsync() {
    return Sync();
  }

  /*
   * Pre-allocate space for a file.
   */
  virtual Status Allocate(off_t offset, off_t len) {
    return Status::OK();
  }

 private:
  // No copying allowed
  RandomRWFile(const RandomRWFile&);
  void operator=(const RandomRWFile&);
};

// An interface for writing log messages.
class Logger {
 public:
  enum { DO_NOT_SUPPORT_GET_LOG_FILE_SIZE = -1 };
  Logger() { }
  virtual ~Logger();

  // Write an entry to the log file with the specified format.
  virtual void Logv(const char* format, va_list ap) = 0;
  virtual size_t GetLogFileSize() const {
    return DO_NOT_SUPPORT_GET_LOG_FILE_SIZE;
  }
  // Flush to the OS buffers
  virtual void Flush() {}

 private:
  // No copying allowed
  Logger(const Logger&);
  void operator=(const Logger&);
};
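
// Illustrative sketch (not part of the original header): the smallest useful
// Logger, formatting straight to stderr with vfprintf. Assumes <cstdio> is
// available; a real implementation would also track bytes written so that
// GetLogFileSize() could report something better than the sentinel above.
class StderrLogger : public Logger {
 public:
  virtual void Logv(const char* format, va_list ap) {
    vfprintf(stderr, format, ap);  // format exactly as Log() passed it in
    fprintf(stderr, "\n");
  }
};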

// Identifies a locked file.
class FileLock {
 public:
  FileLock() { }
  virtual ~FileLock();
 private:
  // No copying allowed
  FileLock(const FileLock&);
  void operator=(const FileLock&);
};

extern void LogFlush(const shared_ptr<Logger>& info_log);

// Log the specified data to *info_log if info_log is non-nullptr.
extern void Log(const shared_ptr<Logger>& info_log, const char* format, ...)
# if defined(__GNUC__) || defined(__clang__)
    __attribute__((__format__ (__printf__, 2, 3)))
# endif
    ;

extern void LogFlush(Logger* info_log);

extern void Log(Logger* info_log, const char* format, ...)
# if defined(__GNUC__) || defined(__clang__)
    __attribute__((__format__ (__printf__, 2, 3)))
# endif
    ;

// A utility routine: write "data" to the named file.
extern Status WriteStringToFile(Env* env, const Slice& data,
                                const std::string& fname);

// A utility routine: read contents of named file into *data
extern Status ReadFileToString(Env* env, const std::string& fname,
                               std::string* data);
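
// Illustrative sketch (not part of the original header): round-tripping a
// payload through the two utility routines above. Env::Default() is declared
// earlier in this header; the file name here is only an example.
inline Status ExampleRoundTrip() {
  Env* env = Env::Default();
  Status s = WriteStringToFile(env, Slice("payload"), "/tmp/rocksdb_example");
  if (!s.ok()) return s;
  std::string contents;
  s = ReadFileToString(env, "/tmp/rocksdb_example", &contents);
  // On success, "contents" now holds exactly the bytes written above.
  return s;
}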

// An implementation of Env that forwards all calls to another Env.
// May be useful to clients who wish to override just part of the
// functionality of another Env.
class EnvWrapper : public Env {
 public:
  // Initialize an EnvWrapper that delegates all calls to *t
  explicit EnvWrapper(Env* t) : target_(t) { }
  virtual ~EnvWrapper();

  // Return the target to which this Env forwards all calls
  Env* target() const { return target_; }

  // The following text is boilerplate that forwards all methods to target()
  Status NewSequentialFile(const std::string& f,
                           unique_ptr<SequentialFile>* r,
                           const EnvOptions& options) {
    return target_->NewSequentialFile(f, r, options);
  }
  Status NewRandomAccessFile(const std::string& f,
                             unique_ptr<RandomAccessFile>* r,
                             const EnvOptions& options) {
    return target_->NewRandomAccessFile(f, r, options);
  }
  Status NewWritableFile(const std::string& f, unique_ptr<WritableFile>* r,
                         const EnvOptions& options) {
    return target_->NewWritableFile(f, r, options);
  }
  Status NewRandomRWFile(const std::string& f, unique_ptr<RandomRWFile>* r,
                         const EnvOptions& options) {
    return target_->NewRandomRWFile(f, r, options);
  }
  bool FileExists(const std::string& f) { return target_->FileExists(f); }
  Status GetChildren(const std::string& dir, std::vector<std::string>* r) {
    return target_->GetChildren(dir, r);
  }
  Status DeleteFile(const std::string& f) { return target_->DeleteFile(f); }
  Status CreateDir(const std::string& d) { return target_->CreateDir(d); }
  Status CreateDirIfMissing(const std::string& d) {
    return target_->CreateDirIfMissing(d);
  }
  Status DeleteDir(const std::string& d) { return target_->DeleteDir(d); }
  Status GetFileSize(const std::string& f, uint64_t* s) {
    return target_->GetFileSize(f, s);
  }

  Status GetFileModificationTime(const std::string& fname,
                                 uint64_t* file_mtime) {
    return target_->GetFileModificationTime(fname, file_mtime);
  }

  Status RenameFile(const std::string& s, const std::string& t) {
    return target_->RenameFile(s, t);
  }
  Status LockFile(const std::string& f, FileLock** l) {
    return target_->LockFile(f, l);
  }
  Status UnlockFile(FileLock* l) { return target_->UnlockFile(l); }
  void Schedule(void (*f)(void*), void* a, Priority pri) {
    return target_->Schedule(f, a, pri);
  }
  void StartThread(void (*f)(void*), void* a) {
    return target_->StartThread(f, a);
  }
  virtual Status GetTestDirectory(std::string* path) {
    return target_->GetTestDirectory(path);
  }
  virtual Status NewLogger(const std::string& fname,
                           shared_ptr<Logger>* result) {
    return target_->NewLogger(fname, result);
  }
  uint64_t NowMicros() {
    return target_->NowMicros();
  }
  void SleepForMicroseconds(int micros) {
    target_->SleepForMicroseconds(micros);
  }
  Status GetHostName(char* name, uint64_t len) {
    return target_->GetHostName(name, len);
  }
  Status GetCurrentTime(int64_t* unix_time) {
    return target_->GetCurrentTime(unix_time);
  }
  Status GetAbsolutePath(const std::string& db_path,
                         std::string* output_path) {
    return target_->GetAbsolutePath(db_path, output_path);
  }
  void SetBackgroundThreads(int num, Priority pri) {
    return target_->SetBackgroundThreads(num, pri);
  }
  std::string TimeToString(uint64_t time) {
    return target_->TimeToString(time);
  }

 private:
  Env* target_;
};
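
// Illustrative sketch (not part of the original header): the intended use of
// EnvWrapper is to override a single method and inherit the forwarding for
// everything else. This hypothetical variant counts writable-file opens.
class CountingEnv : public EnvWrapper {
 public:
  explicit CountingEnv(Env* base) : EnvWrapper(base), writable_opened_(0) {}
  virtual Status NewWritableFile(const std::string& f,
                                 unique_ptr<WritableFile>* r,
                                 const EnvOptions& options) {
    ++writable_opened_;  // observe the call, then forward to the target
    return EnvWrapper::NewWritableFile(f, r, options);
  }
  int writable_opened() const { return writable_opened_; }

 private:
  int writable_opened_;
};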

} // namespace rocksdb

#endif // STORAGE_ROCKSDB_INCLUDE_ENV_H_

74
include/rocksdb/filter_policy.h
Normal file
@@ -0,0 +1,74 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// A database can be configured with a custom FilterPolicy object.
// This object is responsible for creating a small filter from a set
// of keys. These filters are stored in rocksdb and are consulted
// automatically by rocksdb to decide whether or not to read some
// information from disk. In many cases, a filter can cut down the
// number of disk seeks from a handful to a single disk seek per
// DB::Get() call.
//
// Most people will want to use the builtin bloom filter support (see
// NewBloomFilterPolicy() below).

#ifndef STORAGE_ROCKSDB_INCLUDE_FILTER_POLICY_H_
#define STORAGE_ROCKSDB_INCLUDE_FILTER_POLICY_H_

#include <string>

namespace rocksdb {

class Slice;

class FilterPolicy {
 public:
  virtual ~FilterPolicy();

  // Return the name of this policy. Note that if the filter encoding
  // changes in an incompatible way, the name returned by this method
  // must be changed. Otherwise, old incompatible filters may be
  // passed to methods of this type.
  virtual const char* Name() const = 0;

  // keys[0,n-1] contains a list of keys (potentially with duplicates)
  // that are ordered according to the user supplied comparator.
  // Append a filter that summarizes keys[0,n-1] to *dst.
  //
  // Warning: do not change the initial contents of *dst. Instead,
  // append the newly constructed filter to *dst.
  virtual void CreateFilter(const Slice* keys, int n, std::string* dst)
      const = 0;

  // "filter" contains the data appended by a preceding call to
  // CreateFilter() on this class. This method must return true if
  // the key was in the list of keys passed to CreateFilter().
  // This method may return true or false if the key was not on the
  // list, but it should aim to return false with a high probability.
  virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const = 0;
};

// Return a new filter policy that uses a bloom filter with approximately
// the specified number of bits per key. A good value for bits_per_key
// is 10, which yields a filter with ~ 1% false positive rate.
//
// Callers must delete the result after any database that is using the
// result has been closed.
//
// Note: if you are using a custom comparator that ignores some parts
// of the keys being compared, you must not use NewBloomFilterPolicy()
// and must provide your own FilterPolicy that also ignores the
// corresponding parts of the keys. For example, if the comparator
// ignores trailing spaces, it would be incorrect to use a
// FilterPolicy (like NewBloomFilterPolicy) that does not ignore
// trailing spaces in keys.
extern const FilterPolicy* NewBloomFilterPolicy(int bits_per_key);

}

#endif // STORAGE_ROCKSDB_INCLUDE_FILTER_POLICY_H_
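
As a usage note (a minimal sketch, not part of the diff): wiring the builtin bloom filter into Options follows directly from the comment above. The caller owns the returned policy and must delete it only after every DB using it is closed; Options::filter_policy appears later in this commit.

#include "rocksdb/filter_policy.h"
#include "rocksdb/options.h"

void UseBloomFilter(rocksdb::Options* options) {
  // ~10 bits per key gives roughly a 1% false positive rate (see above).
  options->filter_policy = rocksdb::NewBloomFilterPolicy(10);
}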

64
include/rocksdb/flush_block_policy.h
Normal file
@@ -0,0 +1,64 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.

#pragma once

#include <string>

namespace rocksdb {

class Slice;
class BlockBuilder;

// FlushBlockPolicy provides a configurable way to determine when to flush a
// block in the block based tables.
class FlushBlockPolicy {
 public:
  // Keep track of the key/value sequences and return the boolean value to
  // determine if table builder should flush current data block.
  virtual bool Update(const Slice& key,
                      const Slice& value) = 0;

  virtual ~FlushBlockPolicy() { }
};

class FlushBlockPolicyFactory {
 public:
  // Return the name of the flush block policy.
  virtual const char* Name() const = 0;

  // Return a new block flush policy that flushes data blocks by data size.
  // FlushBlockPolicy may need to access the metadata of the data block
  // builder to determine when to flush the blocks.
  //
  // Callers must delete the result after any database that is using the
  // result has been closed.
  virtual FlushBlockPolicy* NewFlushBlockPolicy(
      const BlockBuilder& data_block_builder) const = 0;

  virtual ~FlushBlockPolicyFactory() { }
};

class FlushBlockBySizePolicyFactory : public FlushBlockPolicyFactory {
 public:
  FlushBlockBySizePolicyFactory(const uint64_t block_size,
                                const uint64_t block_size_deviation) :
      block_size_(block_size),
      block_size_deviation_(block_size_deviation) {
  }

  virtual const char* Name() const override {
    return "FlushBlockBySizePolicyFactory";
  }

  virtual FlushBlockPolicy* NewFlushBlockPolicy(
      const BlockBuilder& data_block_builder) const override;

 private:
  const uint64_t block_size_;
  const uint64_t block_size_deviation_;
};

} // rocksdb
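
A minimal sketch (not part of the diff) of constructing the factory above. The 4KB block size mirrors the Options::block_size default shown later in this commit; the deviation value is only an assumed example.

#include "rocksdb/flush_block_policy.h"

rocksdb::FlushBlockPolicyFactory* MakeExampleFlushPolicyFactory() {
  // Flush data blocks once they approach 4096 bytes; the second argument is
  // the block_size_deviation parameter of the constructor above.
  return new rocksdb::FlushBlockBySizePolicyFactory(4096, 10);
}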

106
include/rocksdb/iterator.h
Normal file
@@ -0,0 +1,106 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// An iterator yields a sequence of key/value pairs from a source.
// The following class defines the interface. Multiple implementations
// are provided by this library. In particular, iterators are provided
// to access the contents of a Table or a DB.
//
// Multiple threads can invoke const methods on an Iterator without
// external synchronization, but if any of the threads may call a
// non-const method, all threads accessing the same Iterator must use
// external synchronization.

#ifndef STORAGE_ROCKSDB_INCLUDE_ITERATOR_H_
#define STORAGE_ROCKSDB_INCLUDE_ITERATOR_H_

#include "rocksdb/slice.h"
#include "rocksdb/status.h"

namespace rocksdb {

class Iterator {
 public:
  Iterator();
  virtual ~Iterator();

  // An iterator is either positioned at a key/value pair, or
  // not valid. This method returns true iff the iterator is valid.
  virtual bool Valid() const = 0;

  // Position at the first key in the source. The iterator is Valid()
  // after this call iff the source is not empty.
  virtual void SeekToFirst() = 0;

  // Position at the last key in the source. The iterator is
  // Valid() after this call iff the source is not empty.
  virtual void SeekToLast() = 0;

  // Position at the first key in the source that is at or past target.
  // The iterator is Valid() after this call iff the source contains
  // an entry that comes at or past target.
  virtual void Seek(const Slice& target) = 0;

  // Moves to the next entry in the source. After this call, Valid() is
  // true iff the iterator was not positioned at the last entry in the source.
  // REQUIRES: Valid()
  virtual void Next() = 0;

  // Moves to the previous entry in the source. After this call, Valid() is
  // true iff the iterator was not positioned at the first entry in source.
  // REQUIRES: Valid()
  virtual void Prev() = 0;

  // Return the key for the current entry. The underlying storage for
  // the returned slice is valid only until the next modification of
  // the iterator.
  // REQUIRES: Valid()
  virtual Slice key() const = 0;

  // Return the value for the current entry. The underlying storage for
  // the returned slice is valid only until the next modification of
  // the iterator.
  // REQUIRES: !AtEnd() && !AtStart()
  virtual Slice value() const = 0;

  // If an error has occurred, return it. Else return an ok status.
  // If non-blocking IO is requested and this operation cannot be
  // satisfied without doing some IO, then this returns Status::Incomplete().
  virtual Status status() const = 0;

  // Clients are allowed to register function/arg1/arg2 triples that
  // will be invoked when this iterator is destroyed.
  //
  // Note that unlike all of the preceding methods, this method is
  // not abstract and therefore clients should not override it.
  typedef void (*CleanupFunction)(void* arg1, void* arg2);
  void RegisterCleanup(CleanupFunction function, void* arg1, void* arg2);

 private:
  struct Cleanup {
    CleanupFunction function;
    void* arg1;
    void* arg2;
    Cleanup* next;
  };
  Cleanup cleanup_;

  // No copying allowed
  Iterator(const Iterator&);
  void operator=(const Iterator&);
};

// Return an empty iterator (yields nothing).
extern Iterator* NewEmptyIterator();

// Return an empty iterator with the specified status.
extern Iterator* NewErrorIterator(const Status& status);

}  // namespace rocksdb

#endif // STORAGE_ROCKSDB_INCLUDE_ITERATOR_H_
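
A minimal usage sketch (not part of the diff): the canonical scan loop this interface is designed for. The iterator would come from a DB or Table, which is outside this header.

#include <cassert>
#include <cstddef>
#include "rocksdb/iterator.h"

size_t TotalValueBytes(rocksdb::Iterator* it) {
  size_t total = 0;
  for (it->SeekToFirst(); it->Valid(); it->Next()) {
    total += it->value().size();  // the slice is valid only until the next move
  }
  assert(it->status().ok());  // distinguish end-of-data from an error
  return total;
}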

18
include/rocksdb/ldb_tool.h
Normal file
@@ -0,0 +1,18 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#ifndef STORAGE_ROCKSDB_INCLUDE_LDB_TOOL_H
#define STORAGE_ROCKSDB_INCLUDE_LDB_TOOL_H
#include "rocksdb/options.h"

namespace rocksdb {

class LDBTool {
 public:
  void Run(int argc, char** argv, Options = Options());
};

} // namespace rocksdb

#endif // STORAGE_ROCKSDB_INCLUDE_LDB_TOOL_H

276
include/rocksdb/memtablerep.h
Normal file
@@ -0,0 +1,276 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// This file contains the interface that must be implemented by any collection
// to be used as the backing store for a MemTable. Such a collection must
// satisfy the following properties:
//  (1) It does not store duplicate items.
//  (2) It uses MemTableRep::KeyComparator to compare items for iteration and
//      equality.
//  (3) It can be accessed concurrently by multiple readers and can support
//      concurrent reads. However, it needn't support multiple concurrent writes.
//  (4) Items are never deleted.
// The liberal use of assertions is encouraged to enforce (1).
//
// The factory will be passed an Arena object when a new MemTableRep is
// requested. The API for this object is in rocksdb/arena.h.
//
// Users can implement their own memtable representations. We include five
// types built in:
//  - SkipListRep: This is the default; it is backed by a skip list.
//  - TransformRep: This is backed by a custom hash map.
//    On construction, they are given a SliceTransform object. This
//    object is applied to the user key of stored items which indexes into the
//    hash map to yield a skiplist containing all records that share the same
//    user key under the transform function.
//  - UnsortedRep: A subclass of TransformRep where the transform function is
//    the identity function. Optimized for point lookups.
//  - PrefixHashRep: A subclass of TransformRep where the transform function is
//    a fixed-size prefix extractor. If you use PrefixHashRepFactory, the
//    transform must be identical to options.prefix_extractor, otherwise it
//    will be discarded and the default will be used. It is optimized for
//    ranged scans over a prefix.
//  - VectorRep: This is backed by an unordered std::vector. On iteration, the
//    vector is sorted. It is intelligent about sorting; once MarkReadOnly()
//    has been called, the vector will only be sorted once. It is optimized for
//    random-write-heavy workloads.
//
// The last four implementations are designed for situations in which
// iteration over the entire collection is rare since doing so requires all the
// keys to be copied into a sorted data structure.

#ifndef STORAGE_ROCKSDB_DB_MEMTABLEREP_H_
#define STORAGE_ROCKSDB_DB_MEMTABLEREP_H_

#include <memory>
#include "rocksdb/arena.h"
#include "rocksdb/slice.h"
#include "rocksdb/slice_transform.h"

namespace rocksdb {

class MemTableRep {
 public:
  // KeyComparator provides a means to compare keys, which are internal keys
  // concatenated with values.
  class KeyComparator {
   public:
    // Compare a and b. Return a negative value if a is less than b, 0 if they
    // are equal, and a positive value if a is greater than b
    virtual int operator()(const char* a, const char* b) const = 0;

    virtual ~KeyComparator() { }
  };

  // Insert key into the collection. (The caller will pack key and value into a
  // single buffer and pass that in as the parameter to Insert)
  // REQUIRES: nothing that compares equal to key is currently in the
  // collection.
  virtual void Insert(const char* key) = 0;

  // Returns true iff an entry that compares equal to key is in the collection.
  virtual bool Contains(const char* key) const = 0;

  // Notify this table rep that it will no longer be added to. By default, does
  // nothing.
  virtual void MarkReadOnly() { }

  // Report an approximation of how much memory has been used other than memory
  // that was allocated through the arena.
  virtual size_t ApproximateMemoryUsage() = 0;

  virtual ~MemTableRep() { }

  // Iteration over the contents of a skip collection
  class Iterator {
   public:
    // Initialize an iterator over the specified collection.
    // The returned iterator is not valid.
    // explicit Iterator(const MemTableRep* collection);
    virtual ~Iterator() { };

    // Returns true iff the iterator is positioned at a valid node.
    virtual bool Valid() const = 0;

    // Returns the key at the current position.
    // REQUIRES: Valid()
    virtual const char* key() const = 0;

    // Advances to the next position.
    // REQUIRES: Valid()
    virtual void Next() = 0;

    // Advances to the previous position.
    // REQUIRES: Valid()
    virtual void Prev() = 0;

    // Advance to the first entry with a key >= target
    virtual void Seek(const char* target) = 0;

    // Position at the first entry in collection.
    // Final state of iterator is Valid() iff collection is not empty.
    virtual void SeekToFirst() = 0;

    // Position at the last entry in collection.
    // Final state of iterator is Valid() iff collection is not empty.
    virtual void SeekToLast() = 0;
  };

  // Return an iterator over the keys in this representation.
  virtual std::shared_ptr<Iterator> GetIterator() = 0;

  // Return an iterator over at least the keys with the specified user key. The
  // iterator may also allow access to other keys, but doesn't have to. Default:
  // GetIterator().
  virtual std::shared_ptr<Iterator> GetIterator(const Slice& user_key) {
    return GetIterator();
  }

  // Return an iterator over at least the keys with the specified prefix. The
  // iterator may also allow access to other keys, but doesn't have to. Default:
  // GetIterator().
  virtual std::shared_ptr<Iterator> GetPrefixIterator(const Slice& prefix) {
    return GetIterator();
  }

  // Return an iterator that has a special Seek semantics. The result of
  // a Seek might only include keys with the same prefix as the target key.
  virtual std::shared_ptr<Iterator> GetDynamicPrefixIterator() {
    return GetIterator();
  }

 protected:
  // When *key is an internal key concatenated with the value, returns the
  // user key.
  virtual Slice UserKey(const char* key) const;
};

// This is the base class for all factories that are used by RocksDB to create
// new MemTableRep objects
class MemTableRepFactory {
 public:
  virtual ~MemTableRepFactory() { };
  virtual std::shared_ptr<MemTableRep> CreateMemTableRep(
      MemTableRep::KeyComparator&, Arena*) = 0;
  virtual const char* Name() const = 0;
};

// This creates MemTableReps that are backed by an std::vector. On iteration,
// the vector is sorted. This is useful for workloads where iteration is very
// rare and writes are generally not issued after reads begin.
//
// Parameters:
//   count: Passed to the constructor of the underlying std::vector of each
//     VectorRep. On initialization, the underlying array will have at least
//     count bytes reserved for usage.
class VectorRepFactory : public MemTableRepFactory {
  const size_t count_;
 public:
  explicit VectorRepFactory(size_t count = 0) : count_(count) { }
  virtual std::shared_ptr<MemTableRep> CreateMemTableRep(
      MemTableRep::KeyComparator&, Arena*) override;
  virtual const char* Name() const override {
    return "VectorRepFactory";
  }
};

// This uses a skip list to store keys. It is the default.
class SkipListFactory : public MemTableRepFactory {
 public:
  virtual std::shared_ptr<MemTableRep> CreateMemTableRep(
      MemTableRep::KeyComparator&, Arena*) override;
  virtual const char* Name() const override {
    return "SkipListFactory";
  }
};

// TransformReps are backed by an unordered map of buffers to buckets. When
// looking up a key, the user key is extracted and a user-supplied transform
// function (see rocksdb/slice_transform.h) is applied to get the key into the
// unordered map. This allows the user to bin user keys based on arbitrary
// criteria. Two example implementations are UnsortedRepFactory and
// PrefixHashRepFactory.
//
// Iteration over the entire collection is implemented by dumping all the keys
// into an std::set. Thus, these data structures are best used when iteration
// over the entire collection is rare.
//
// Parameters:
//   transform: The SliceTransform to bucket user keys on. TransformRepFactory
//     owns the pointer.
//   bucket_count: Passed to the constructor of the underlying
//     std::unordered_map of each TransformRep. On initialization, the
//     underlying array will be at least bucket_count size.
//   num_locks: Number of read-write locks to have for the rep. Each bucket is
//     hashed onto a read-write lock which controls access to that bucket. More
//     locks means finer-grained concurrency but more memory overhead.
class TransformRepFactory : public MemTableRepFactory {
 public:
  explicit TransformRepFactory(const SliceTransform* transform,
                               size_t bucket_count, size_t num_locks = 1000)
      : transform_(transform),
        bucket_count_(bucket_count),
        num_locks_(num_locks) { }

  virtual ~TransformRepFactory() { delete transform_; }

  virtual std::shared_ptr<MemTableRep> CreateMemTableRep(
      MemTableRep::KeyComparator&, Arena*) override;

  virtual const char* Name() const override {
    return "TransformRepFactory";
  }

  const SliceTransform* GetTransform() { return transform_; }

 protected:
  const SliceTransform* transform_;
  const size_t bucket_count_;
  const size_t num_locks_;
};

// UnsortedReps bin user keys based on an identity function transform -- that
// is, transform(key) = key. This optimizes for point look-ups.
//
// Parameters: See TransformRepFactory.
class UnsortedRepFactory : public TransformRepFactory {
 public:
  explicit UnsortedRepFactory(size_t bucket_count = 0, size_t num_locks = 1000)
      : TransformRepFactory(NewNoopTransform(),
                            bucket_count,
                            num_locks) { }
  virtual const char* Name() const override {
    return "UnsortedRepFactory";
  }
};

// PrefixHashReps bin user keys based on a fixed-size prefix. This optimizes for
// short ranged scans over a given prefix.
//
// Parameters: See TransformRepFactory.
class PrefixHashRepFactory : public TransformRepFactory {
 public:
  explicit PrefixHashRepFactory(const SliceTransform* prefix_extractor,
                                size_t bucket_count = 0, size_t num_locks = 1000)
      : TransformRepFactory(prefix_extractor, bucket_count, num_locks)
  { }

  virtual std::shared_ptr<MemTableRep> CreateMemTableRep(
      MemTableRep::KeyComparator&, Arena*) override;

  virtual const char* Name() const override {
    return "PrefixHashRepFactory";
  }
};

// The same as TransformRepFactory except it doesn't use locks.
// Experimental, will replace TransformRepFactory once we are sure
// it performs better
extern MemTableRepFactory* NewHashSkipListRepFactory(
    const SliceTransform* transform, size_t bucket_count = 1000000);

}

#endif // STORAGE_ROCKSDB_DB_MEMTABLEREP_H_
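
A minimal sketch (not part of the diff) of selecting the vector-backed representation described above. It assumes Options exposes the shared_ptr<MemTableRepFactory> memtable_factory member this factory interface is designed to plug into.

#include "rocksdb/memtablerep.h"
#include "rocksdb/options.h"

void UseVectorMemtable(rocksdb::Options* options) {
  // 1000 is the "count" constructor parameter documented above.
  options->memtable_factory.reset(new rocksdb::VectorRepFactory(1000));
}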

148
include/rocksdb/merge_operator.h
Normal file
@@ -0,0 +1,148 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.

#ifndef STORAGE_ROCKSDB_INCLUDE_MERGE_OPERATOR_H_
#define STORAGE_ROCKSDB_INCLUDE_MERGE_OPERATOR_H_

#include <string>
#include <deque>
#include "rocksdb/slice.h"

namespace rocksdb {

class Slice;
class Logger;

// The Merge Operator
//
// Essentially, a MergeOperator specifies the SEMANTICS of a merge, which only
// the client knows. It could be numeric addition, list append, string
// concatenation, edit data structure, ... , anything.
// The library, on the other hand, is concerned with the exercise of this
// interface, at the right time (during get, iteration, compaction...)
//
// To use merge, the client needs to provide an object implementing one of
// the following interfaces:
//  a) AssociativeMergeOperator - for most simple semantics (always take
//     two values, and merge them into one value, which is then put back
//     into rocksdb); numeric addition and string concatenation are examples;
//
//  b) MergeOperator - the generic class for all the more abstract / complex
//     operations; one method (FullMerge) to merge a Put/Delete value with a
//     merge operand; and another method (PartialMerge) that merges two
//     operands together. This is especially useful if your key values have a
//     complex structure but you would still like to support client-specific
//     incremental updates.
//
// AssociativeMergeOperator is simpler to implement. MergeOperator is simply
// more powerful.
//
// Refer to rocksdb-merge wiki for more details and example implementations.
//
class MergeOperator {
 public:
  virtual ~MergeOperator() {}

  // Gives the client a way to express the read -> modify -> write semantics
  // key:      (IN) The key that's associated with this merge operation.
  //                Client could multiplex the merge operator based on it
  //                if the key space is partitioned and different subspaces
  //                refer to different types of data which have different
  //                merge operation semantics
  // existing: (IN) null indicates that the key does not exist before this op
  // operand_list:(IN) the sequence of merge operations to apply, front() first.
  // new_value:(OUT) Client is responsible for filling the merge result here
  // logger:   (IN) Client could use this to log errors during merge.
  //
  // Return true on success.
  // All values passed in will be client-specific values. So if this method
  // returns false, it is because client specified bad data or there was
  // internal corruption. This will be treated as an error by the library.
  //
  // Also make use of the *logger for error messages.
  virtual bool FullMerge(const Slice& key,
                         const Slice* existing_value,
                         const std::deque<std::string>& operand_list,
                         std::string* new_value,
                         Logger* logger) const = 0;

  // This function performs merge(left_op, right_op)
  // when both the operands are themselves merge operation types
  // that you would have passed to a DB::Merge() call in the same order
  // (i.e.: DB::Merge(key,left_op), followed by DB::Merge(key,right_op)).
  //
  // PartialMerge should combine them into a single merge operation that is
  // saved into *new_value, and then it should return true.
  // *new_value should be constructed such that a call to
  // DB::Merge(key, *new_value) would yield the same result as a call
  // to DB::Merge(key, left_op) followed by DB::Merge(key, right_op).
  //
  // If it is impossible or infeasible to combine the two operations,
  // leave new_value unchanged and return false. The library will
  // internally keep track of the operations, and apply them in the
  // correct order once a base-value (a Put/Delete/End-of-Database) is seen.
  //
  // TODO: Presently there is no way to differentiate between error/corruption
  // and simply "return false". For now, the client should simply return
  // false in any case it cannot perform partial-merge, regardless of reason.
  // If there is corruption in the data, handle it in the FullMerge() function,
  // and return false there.
  virtual bool PartialMerge(const Slice& key,
                            const Slice& left_operand,
                            const Slice& right_operand,
                            std::string* new_value,
                            Logger* logger) const = 0;

  // The name of the MergeOperator. Used to check for MergeOperator
  // mismatches (i.e., a DB created with one MergeOperator is
  // accessed using a different MergeOperator)
  // TODO: the name is currently not stored persistently and thus
  // no checking is enforced. Client is responsible for providing
  // consistent MergeOperator between DB opens.
  virtual const char* Name() const = 0;
};

// The simpler, associative merge operator.
class AssociativeMergeOperator : public MergeOperator {
 public:
  virtual ~AssociativeMergeOperator() {}

  // Gives the client a way to express the read -> modify -> write semantics
  // key:           (IN) The key that's associated with this merge operation.
  // existing_value:(IN) null indicates the key does not exist before this op
  // value:         (IN) the value to update/merge the existing_value with
  // new_value:    (OUT) Client is responsible for filling the merge result here
  // logger:        (IN) Client could use this to log errors during merge.
  //
  // Return true on success.
  // All values passed in will be client-specific values. So if this method
  // returns false, it is because client specified bad data or there was
  // internal corruption. The client should assume that this will be treated
  // as an error by the library.
  virtual bool Merge(const Slice& key,
                     const Slice* existing_value,
                     const Slice& value,
                     std::string* new_value,
                     Logger* logger) const = 0;

 private:
  // Default implementations of the MergeOperator functions
  virtual bool FullMerge(const Slice& key,
                         const Slice* existing_value,
                         const std::deque<std::string>& operand_list,
                         std::string* new_value,
                         Logger* logger) const override;

  virtual bool PartialMerge(const Slice& key,
                            const Slice& left_operand,
                            const Slice& right_operand,
                            std::string* new_value,
                            Logger* logger) const override;
};

} // namespace rocksdb

#endif // STORAGE_ROCKSDB_INCLUDE_MERGE_OPERATOR_H_
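
A minimal sketch (not part of the diff) of the numeric-addition case called out in the comments above, via AssociativeMergeOperator. The fixed-width in-memory encoding used here is only an assumed example; real code would pick an explicit serialization.

#include <cstring>
#include <stdint.h>
#include <string>
#include "rocksdb/merge_operator.h"

class UInt64AddOperator : public rocksdb::AssociativeMergeOperator {
 public:
  virtual bool Merge(const rocksdb::Slice& key,
                     const rocksdb::Slice* existing_value,
                     const rocksdb::Slice& value,
                     std::string* new_value,
                     rocksdb::Logger* logger) const {
    uint64_t base = 0;
    if (existing_value != nullptr && !Decode(*existing_value, &base)) {
      return false;  // malformed base value; treated as corruption
    }
    uint64_t operand = 0;
    if (!Decode(value, &operand)) return false;
    uint64_t sum = base + operand;
    new_value->assign(reinterpret_cast<const char*>(&sum), sizeof(sum));
    return true;
  }

  virtual const char* Name() const { return "UInt64AddOperator"; }

 private:
  // Hypothetical helper: accept only exactly 8-byte values.
  static bool Decode(const rocksdb::Slice& s, uint64_t* out) {
    if (s.size() != sizeof(*out)) return false;
    memcpy(out, s.data(), sizeof(*out));
    return true;
  }
};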

742
include/rocksdb/options.h
Normal file
@@ -0,0 +1,742 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#ifndef STORAGE_ROCKSDB_INCLUDE_OPTIONS_H_
#define STORAGE_ROCKSDB_INCLUDE_OPTIONS_H_

#include <stddef.h>
#include <string>
#include <memory>
#include <vector>
#include <stdint.h>

#include "rocksdb/memtablerep.h"
#include "rocksdb/slice.h"
#include "rocksdb/slice_transform.h"
#include "rocksdb/statistics.h"
#include "rocksdb/table_properties.h"
#include "rocksdb/universal_compaction.h"

namespace rocksdb {

class Cache;
class CompactionFilter;
class CompactionFilterFactory;
class Comparator;
class Env;
class FilterPolicy;
class Logger;
class MergeOperator;
class Snapshot;
class TableFactory;

using std::shared_ptr;

// DB contents are stored in a set of blocks, each of which holds a
// sequence of key,value pairs. Each block may be compressed before
// being stored in a file. The following enum describes which
// compression method (if any) is used to compress a block.
enum CompressionType : char {
  // NOTE: do not change the values of existing entries, as these are
  // part of the persistent format on disk.
  kNoCompression = 0x0,
  kSnappyCompression = 0x1,
  kZlibCompression = 0x2,
  kBZip2Compression = 0x3
};

enum CompactionStyle : char {
  kCompactionStyleLevel = 0x0,     // level based compaction style
  kCompactionStyleUniversal = 0x1  // Universal compaction style
};

// Compression options for different compression algorithms like Zlib
struct CompressionOptions {
  int window_bits;
  int level;
  int strategy;
  CompressionOptions() : window_bits(-14), level(-1), strategy(0) {}
  CompressionOptions(int wbits, int lev, int strategy)
      : window_bits(wbits), level(lev), strategy(strategy) {}
};

// Options to control the behavior of a database (passed to DB::Open)
struct Options {
  // -------------------
  // Parameters that affect behavior

  // Comparator used to define the order of keys in the table.
  // Default: a comparator that uses lexicographic byte-wise ordering
  //
  // REQUIRES: The client must ensure that the comparator supplied
  // here has the same name and orders keys *exactly* the same as the
  // comparator provided to previous open calls on the same DB.
  const Comparator* comparator;

  // REQUIRES: The client must provide a merge operator if Merge operation
  // needs to be accessed. Calling Merge on a DB without a merge operator
  // would result in Status::NotSupported. The client must ensure that the
  // merge operator supplied here has the same name and *exactly* the same
  // semantics as the merge operator provided to previous open calls on
  // the same DB. The only exception is reserved for upgrade, where a DB
  // previously without a merge operator is introduced to Merge operation
  // for the first time. It's necessary to specify a merge operator when
  // opening the DB in this case.
  // Default: nullptr
  shared_ptr<MergeOperator> merge_operator;

  // The client must provide compaction_filter_factory if it requires a new
  // compaction filter to be used for different compaction processes
  // Allows an application to modify/delete a key-value during background
  // compaction.
  // Ideally, client should specify only one of filter or factory.
  // compaction_filter takes precedence over compaction_filter_factory if
  // client specifies both.
  // Default: nullptr
  const CompactionFilter* compaction_filter;

  // If true, the database will be created if it is missing.
  // Default: false
  bool create_if_missing;

  // If true, an error is raised if the database already exists.
  // Default: false
  bool error_if_exists;

  // If true, the implementation will do aggressive checking of the
  // data it is processing and will stop early if it detects any
  // errors. This may have unforeseen ramifications: for example, a
  // corruption of one DB entry may cause a large number of entries to
  // become unreadable or for the entire DB to become unopenable.
  // If any of the writes to the database fails (Put, Delete, Merge, Write),
  // the database will switch to read-only mode and fail all other
  // Write operations.
  // Default: false
  bool paranoid_checks;

  // Use the specified object to interact with the environment,
  // e.g. to read/write files, schedule background work, etc.
  // Default: Env::Default()
  Env* env;

  // Any internal progress/error information generated by the db will
  // be written to info_log if it is non-nullptr, or to a file stored
  // in the same directory as the DB contents if info_log is nullptr.
  // Default: nullptr
  shared_ptr<Logger> info_log;

  // -------------------
  // Parameters that affect performance

  // Amount of data to build up in memory (backed by an unsorted log
  // on disk) before converting to a sorted on-disk file.
  //
  // Larger values increase performance, especially during bulk loads.
  // Up to max_write_buffer_number write buffers may be held in memory
  // at the same time,
  // so you may wish to adjust this parameter to control memory usage.
  // Also, a larger write buffer will result in a longer recovery time
  // the next time the database is opened.
  //
  // Default: 4MB
  size_t write_buffer_size;

  // The maximum number of write buffers that are built up in memory.
  // The default is 2, so that when 1 write buffer is being flushed to
  // storage, new writes can continue to the other write buffer.
  // Default: 2
  int max_write_buffer_number;

  // The minimum number of write buffers that will be merged together
  // before writing to storage. If set to 1, then
  // all write buffers are flushed to L0 as individual files and this increases
  // read amplification because a get request has to check in all of these
  // files. Also, an in-memory merge may result in writing less
  // data to storage if there are duplicate records in each of these
  // individual write buffers. Default: 1
  int min_write_buffer_number_to_merge;

  // Number of open files that can be used by the DB. You may need to
  // increase this if your database has a large working set (budget
  // one open file per 2MB of working set).
  //
  // Default: 1000
  int max_open_files;

  // Control over blocks (user data is stored in a set of blocks, and
  // a block is the unit of reading from disk).

  // If non-NULL use the specified cache for blocks.
  // If NULL, rocksdb will automatically create and use an 8MB internal cache.
  // Default: nullptr
  shared_ptr<Cache> block_cache;

  // If non-NULL use the specified cache for compressed blocks.
  // If NULL, rocksdb will not use a compressed block cache.
  // Default: nullptr
  shared_ptr<Cache> block_cache_compressed;

  // Approximate size of user data packed per block. Note that the
  // block size specified here corresponds to uncompressed data. The
  // actual size of the unit read from disk may be smaller if
  // compression is enabled. This parameter can be changed dynamically.
  //
  // Default: 4K
  size_t block_size;

  // Number of keys between restart points for delta encoding of keys.
  // This parameter can be changed dynamically. Most clients should
  // leave this parameter alone.
  //
  // Default: 16
  int block_restart_interval;

  // Compress blocks using the specified compression algorithm. This
  // parameter can be changed dynamically.
  //
  // Default: kSnappyCompression, which gives lightweight but fast
  // compression.
  //
  // Typical speeds of kSnappyCompression on an Intel(R) Core(TM)2 2.4GHz:
  //    ~200-500MB/s compression
  //    ~400-800MB/s decompression
  // Note that these speeds are significantly faster than most
  // persistent storage speeds, and therefore it is typically never
  // worth switching to kNoCompression. Even if the input data is
  // incompressible, the kSnappyCompression implementation will
  // efficiently detect that and will switch to uncompressed mode.
  CompressionType compression;

  // Different levels can have different compression policies. There
  // are cases where most lower levels would like to use a quick compression
  // algorithm while the higher levels (which have more data) use
  // compression algorithms that have better compression but could
  // be slower. This array, if non nullptr, should have an entry for
  // each level of the database. This array, if non nullptr, overrides the
  // value specified in the previous field 'compression'. The caller is
  // responsible for allocating memory and initializing the values in it
  // before invoking Open(). The caller is responsible for freeing this
  // array and it could be freed anytime after the return from Open().
  // This could have been a std::vector but that makes the equivalent
  // java/C api hard to construct.
  std::vector<CompressionType> compression_per_level;

  // different options for compression algorithms
  CompressionOptions compression_opts;

  // If non-nullptr, use the specified filter policy to reduce disk reads.
  // Many applications will benefit from passing the result of
  // NewBloomFilterPolicy() here.
  //
  // Default: nullptr
  const FilterPolicy* filter_policy;

  // If non-nullptr, use the specified function to determine the
  // prefixes for keys. These prefixes will be placed in the filter.
  // Depending on the workload, this can reduce the number of read-IOP
  // cost for scans when a prefix is passed via ReadOptions to
  // db.NewIterator(). For prefix filtering to work properly,
  // "prefix_extractor" and "comparator" must be such that the following
  // properties hold:
  //
  // 1) key.starts_with(prefix(key))
  // 2) Compare(prefix(key), key) <= 0.
  // 3) If Compare(k1, k2) <= 0, then Compare(prefix(k1), prefix(k2)) <= 0
  // 4) prefix(prefix(key)) == prefix(key)
  //
  // Default: nullptr
  const SliceTransform* prefix_extractor;
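
  // For illustration (a sketch, not part of the original comment): with keys
  // laid out as an 8-byte id followed by a field name, a fixed-size prefix
  // extractor satisfying properties 1-4 above could be installed as
  //   options.prefix_extractor = NewFixedPrefixTransform(8);
  // where NewFixedPrefixTransform is assumed to be declared in
  // rocksdb/slice_transform.h.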

  // If true, place whole keys in the filter (not just prefixes).
  // This must generally be true for gets to be efficient.
  //
  // Default: true
  bool whole_key_filtering;

  // Number of levels for this database
  int num_levels;

  // Number of files to trigger level-0 compaction. A value <0 means that
  // level-0 compaction will not be triggered by number of files at all.
  int level0_file_num_compaction_trigger;

  // Soft limit on number of level-0 files. We start slowing down writes at this
  // point. A value <0 means that no writing slow down will be triggered by
  // number of files in level-0.
  int level0_slowdown_writes_trigger;

  // Maximum number of level-0 files. We stop writes at this point.
  int level0_stop_writes_trigger;

  // Maximum level to which a new compacted memtable is pushed if it
  // does not create overlap. We try to push to level 2 to avoid the
  // relatively expensive level 0=>1 compactions and to avoid some
  // expensive manifest file operations. We do not push all the way to
  // the largest level since that can generate a lot of wasted disk
  // space if the same key space is being repeatedly overwritten.
  int max_mem_compaction_level;

  // Target file size for compaction.
  // target_file_size_base is per-file size for level-1.
  // Target file size for level L can be calculated by
  // target_file_size_base * (target_file_size_multiplier ^ (L-1))
  // For example, if target_file_size_base is 2MB and
  // target_file_size_multiplier is 10, then each file on level-1 will
  // be 2MB, and each file on level 2 will be 20MB,
  // and each file on level-3 will be 200MB.

  // by default target_file_size_base is 2MB.
  int target_file_size_base;
  // by default target_file_size_multiplier is 1, which means
  // by default files in different levels will have similar size.
  int target_file_size_multiplier;

  // Control maximum total data size for a level.
  // max_bytes_for_level_base is the max total for level-1.
  // Maximum number of bytes for level L can be calculated as
  // (max_bytes_for_level_base) * (max_bytes_for_level_multiplier ^ (L-1))
  // For example, if max_bytes_for_level_base is 20MB, and if
  // max_bytes_for_level_multiplier is 10, total data size for level-1
  // will be 20MB, total file size for level-2 will be 200MB,
  // and total file size for level-3 will be 2GB.

  // by default 'max_bytes_for_level_base' is 10MB.
  uint64_t max_bytes_for_level_base;
  // by default 'max_bytes_for_level_multiplier' is 10.
  int max_bytes_for_level_multiplier;

  // Different max-size multipliers for different levels.
  // These are multiplied by max_bytes_for_level_multiplier to arrive
  // at the max-size of each level.
  // Default: 1
  std::vector<int> max_bytes_for_level_multiplier_additional;

  // Maximum number of bytes in all compacted files. We avoid expanding
  // the lower level file set of a compaction if it would make the
  // total compaction cover more than
  // (expanded_compaction_factor * targetFileSizeLevel()) many bytes.
  int expanded_compaction_factor;

  // Maximum number of bytes in all source files to be compacted in a
  // single compaction run. We avoid picking too many files in the
  // source level so that the total source bytes for the compaction
  // do not exceed
  // (source_compaction_factor * targetFileSizeLevel()) many bytes.
  // Default:1, i.e. pick maxfilesize amount of data as the source of
  // a compaction.
  int source_compaction_factor;

  // Control maximum bytes of overlaps in grandparent (i.e., level+2) before we
  // stop building a single file in a level->level+1 compaction.
  int max_grandparent_overlap_factor;

  // If non-null, then we should collect metrics about database operations
  // Statistics objects should not be shared between DB instances as
  // it does not use any locks to prevent concurrent updates.
  shared_ptr<Statistics> statistics;

  // If true, then the contents of data files are not synced
  // to stable storage. Their contents remain in the OS buffers till the
  // OS decides to flush them. This option is good for bulk-loading
  // of data. Once the bulk-loading is complete, please issue a
  // sync to the OS to flush all dirty buffers to stable storage.
  // Default: false
  bool disableDataSync;

  // If true, then every store to stable storage will issue a fsync.
  // If false, then every store to stable storage will issue a fdatasync.
  // This parameter should be set to true while storing data to
  // filesystem like ext3 that can lose files after a reboot.
  // Default: false
  bool use_fsync;

  // This number controls how often a new scribe log about
  // db deploy stats is written out.
  // -1 indicates no logging at all.
  // Default value is 1800 (half an hour).
  int db_stats_log_interval;

  // This specifies the info LOG dir.
  // If it is empty, the log files will be in the same dir as data.
  // If it is non empty, the log files will be in the specified dir,
  // and the db data dir's absolute path will be used as the log file
  // name's prefix.
  std::string db_log_dir;

  // This specifies the absolute dir path for write-ahead logs (WAL).
  // If it is empty, the log files will be in the same dir as data,
  // dbname is used as the data dir by default.
  // If it is non empty, the log files will be kept in the specified dir.
  // When destroying the db,
  // all log files in wal_dir and the dir itself are deleted.
  std::string wal_dir;

  // Disable compaction triggered by seek.
  // With bloomfilter and fast storage, a miss on one level
  // is very cheap if the file handle is cached in table cache
  // (which is true if max_open_files is large).
  bool disable_seek_compaction;

  // The periodicity when obsolete files get deleted. The default
  // value is 6 hours. The files that get out of scope by compaction
  // process will still get automatically deleted on every compaction,
  // regardless of this setting.
  uint64_t delete_obsolete_files_period_micros;

  // Maximum number of concurrent background jobs, submitted to
  // the default LOW priority thread pool
  // Default: 1
  int max_background_compactions;

  // Maximum number of concurrent background memtable flush jobs, submitted to
  // the HIGH priority thread pool.
  // By default, all background jobs (major compaction and memtable flush) go
  // to the LOW priority pool. If this option is set to a positive number,
  // memtable flush jobs will be submitted to the HIGH priority pool.
  // It is important when the same Env is shared by multiple db instances.
  // Without a separate pool, long running major compaction jobs could
  // potentially block memtable flush jobs of other db instances, leading to
  // unnecessary Put stalls.
  // Default: 0
  int max_background_flushes;

  // Specify the maximal size of the info log file. If the log file
|
||||
// is larger than `max_log_file_size`, a new info log file will
|
||||
// be created.
|
||||
// If max_log_file_size == 0, all logs will be written to one
|
||||
// log file.
|
||||
size_t max_log_file_size;
|
||||
|
||||
// Time for the info log file to roll (in seconds).
|
||||
// If specified with non-zero value, log file will be rolled
|
||||
// if it has been active longer than `log_file_time_to_roll`.
|
||||
// Default: 0 (disabled)
|
||||
size_t log_file_time_to_roll;
|
||||
|
||||
// Maximal info log files to be kept.
|
||||
// Default: 1000
|
||||
size_t keep_log_file_num;
|
||||
|
||||
// Puts are delayed 0-1 ms when any level has a compaction score that exceeds
|
||||
// soft_rate_limit. This is ignored when == 0.0.
|
||||
// CONSTRAINT: soft_rate_limit <= hard_rate_limit. If this constraint does not
|
||||
// hold, RocksDB will set soft_rate_limit = hard_rate_limit
|
||||
// Default: 0 (disabled)
|
||||
double soft_rate_limit;
|
||||
|
||||
// Puts are delayed 1ms at a time when any level has a compaction score that
|
||||
// exceeds hard_rate_limit. This is ignored when <= 1.0.
|
||||
// Default: 0 (disabled)
|
||||
double hard_rate_limit;
|
||||
|
||||
// Max time a put will be stalled when hard_rate_limit is enforced. If 0, then
|
||||
// there is no limit.
|
||||
// Default: 1000
|
||||
unsigned int rate_limit_delay_max_milliseconds;
|
||||
|
||||
// manifest file is rolled over on reaching this limit.
|
||||
// The older manifest file be deleted.
|
||||
// The default value is MAX_INT so that roll-over does not take place.
|
||||
uint64_t max_manifest_file_size;
|
||||
|
||||
// Disable block cache. If this is set to true,
|
||||
// then no block cache should be used, and the block_cache should
|
||||
// point to a nullptr object.
|
||||
// Default: false
|
||||
bool no_block_cache;
|
||||
|
||||
// Number of shards used for table cache.
|
||||
int table_cache_numshardbits;
|
||||
|
||||
// During data eviction of table's LRU cache, it would be inefficient
|
||||
// to strictly follow LRU because this piece of memory will not really
|
||||
// be released unless its refcount falls to zero. Instead, make two
|
||||
// passes: the first pass will release items with refcount = 1,
|
||||
// and if not enough space releases after scanning the number of
|
||||
// elements specified by this parameter, we will remove items in LRU
|
||||
// order.
|
||||
int table_cache_remove_scan_count_limit;
|
||||
|
||||
// size of one block in arena memory allocation.
|
||||
// If <= 0, a proper value is automatically calculated (usually 1/10 of
|
||||
// writer_buffer_size).
|
||||
//
|
||||
// Default: 0
|
||||
size_t arena_block_size;
|
||||
|
||||
// Create an Options object with default values for all fields.
|
||||
Options();
|
||||
|
||||
void Dump(Logger* log) const;
|
||||
|
||||
// Set appropriate parameters for bulk loading.
|
||||
// The reason that this is a function that returns "this" instead of a
|
||||
// constructor is to enable chaining of multiple similar calls in the future.
|
||||
//
|
||||
// All data will be in level 0 without any automatic compaction.
|
||||
// It's recommended to manually call CompactRange(NULL, NULL) before reading
|
||||
// from the database, because otherwise the read can be very slow.
|
||||
Options* PrepareForBulkLoad();
|
||||
|
||||
// Disable automatic compactions. Manual compactions can still
|
||||
// be issued on this database.
|
||||
bool disable_auto_compactions;
|
||||
|
||||
// The following two fields affect how archived logs will be deleted.
|
||||
// 1. If both set to 0, logs will be deleted asap and will not get into
|
||||
// the archive.
|
||||
// 2. If WAL_ttl_seconds is 0 and WAL_size_limit_MB is not 0,
|
||||
// WAL files will be checked every 10 min and if total size is greater
|
||||
// then WAL_size_limit_MB, they will be deleted starting with the
|
||||
// earliest until size_limit is met. All empty files will be deleted.
|
||||
// 3. If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then
|
||||
// WAL files will be checked every WAL_ttl_secondsi / 2 and those that
|
||||
// are older than WAL_ttl_seconds will be deleted.
|
||||
// 4. If both are not 0, WAL files will be checked every 10 min and both
|
||||
// checks will be performed with ttl being first.
|
||||
uint64_t WAL_ttl_seconds;
|
||||
uint64_t WAL_size_limit_MB;
|
||||
|
||||
// Number of bytes to preallocate (via fallocate) the manifest
|
||||
// files. Default is 4mb, which is reasonable to reduce random IO
|
||||
// as well as prevent overallocation for mounts that preallocate
|
||||
// large amounts of data (such as xfs's allocsize option).
|
||||
size_t manifest_preallocation_size;
|
||||
|
||||
// Purge duplicate/deleted keys when a memtable is flushed to storage.
|
||||
// Default: true
|
||||
bool purge_redundant_kvs_while_flush;
|
||||
|
||||
// Data being read from file storage may be buffered in the OS
|
||||
// Default: true
|
||||
bool allow_os_buffer;
|
||||
|
||||
// Allow the OS to mmap file for reading sst tables. Default: false
|
||||
bool allow_mmap_reads;
|
||||
|
||||
// Allow the OS to mmap file for writing. Default: true
|
||||
bool allow_mmap_writes;
|
||||
|
||||
// Disable child process inherit open files. Default: true
|
||||
bool is_fd_close_on_exec;
|
||||
|
||||
// Skip log corruption error on recovery (If client is ok with
|
||||
// losing most recent changes)
|
||||
// Default: false
|
||||
bool skip_log_error_on_recovery;
|
||||
|
||||
// if not zero, dump rocksdb.stats to LOG every stats_dump_period_sec
|
||||
// Default: 3600 (1 hour)
|
||||
unsigned int stats_dump_period_sec;
|
||||
|
||||
// This is used to close a block before it reaches the configured
|
||||
// 'block_size'. If the percentage of free space in the current block is less
|
||||
// than this specified number and adding a new record to the block will
|
||||
// exceed the configured block size, then this block will be closed and the
|
||||
// new record will be written to the next block.
|
||||
// Default is 10.
|
||||
int block_size_deviation;
|
||||
|
||||
// If set true, will hint the underlying file system that the file
|
||||
// access pattern is random, when a sst file is opened.
|
||||
// Default: true
|
||||
bool advise_random_on_open;
|
||||
|
||||
// Specify the file access pattern once a compaction is started.
|
||||
// It will be applied to all input files of a compaction.
|
||||
// Default: NORMAL
|
||||
enum { NONE, NORMAL, SEQUENTIAL, WILLNEED } access_hint_on_compaction_start;
|
||||
|
||||
// Use adaptive mutex, which spins in the user space before resorting
|
||||
// to kernel. This could reduce context switch when the mutex is not
|
||||
// heavily contended. However, if the mutex is hot, we could end up
|
||||
// wasting spin time.
|
||||
// Default: false
|
||||
bool use_adaptive_mutex;
|
||||
|
||||
// Allows OS to incrementally sync files to disk while they are being
|
||||
// written, asynchronously, in the background.
|
||||
// Issue one request for every bytes_per_sync written. 0 turns it off.
|
||||
// Default: 0
|
||||
uint64_t bytes_per_sync;
|
||||
|
||||
// The compaction style. Default: kCompactionStyleLevel
|
||||
CompactionStyle compaction_style;
|
||||
|
||||
// The options needed to support Universal Style compactions
|
||||
CompactionOptionsUniversal compaction_options_universal;
|
||||
|
||||
// Use KeyMayExist API to filter deletes when this is true.
|
||||
// If KeyMayExist returns false, i.e. the key definitely does not exist, then
|
||||
// the delete is a noop. KeyMayExist only incurs in-memory look up.
|
||||
// This optimization avoids writing the delete to storage when appropriate.
|
||||
// Default: false
|
||||
bool filter_deletes;
|
||||
|
||||
// An iteration->Next() sequentially skips over keys with the same
|
||||
// user-key unless this option is set. This number specifies the number
|
||||
// of keys (with the same userkey) that will be sequentially
|
||||
// skipped before a reseek is issued.
|
||||
// Default: 8
|
||||
uint64_t max_sequential_skip_in_iterations;
|
||||
|
||||
// This is a factory that provides MemTableRep objects.
|
||||
// Default: a factory that provides a skip-list-based implementation of
|
||||
// MemTableRep.
|
||||
std::shared_ptr<MemTableRepFactory> memtable_factory;
|
||||
|
||||
// This is a factory that provides TableFactory objects.
|
||||
// Default: a factory that provides a default implementation of
|
||||
// Table and TableBuilder.
|
||||
std::shared_ptr<TableFactory> table_factory;
|
||||
|
||||
// This is a factory that provides compaction filter objects which allow
|
||||
// an application to modify/delete a key-value during background compaction.
|
||||
// Default: a factory that doesn't provide any object
|
||||
std::shared_ptr<CompactionFilterFactory> compaction_filter_factory;
|
||||
|
||||
// This option allows user to to collect their own interested statistics of
|
||||
// the tables.
|
||||
// Default: emtpy vector -- no user-defined statistics collection will be
|
||||
// performed.
|
||||
std::vector<std::shared_ptr<TablePropertiesCollector>>
|
||||
table_properties_collectors;
|
||||
|
||||
// Allows thread-safe inplace updates. Requires Updates iff
|
||||
// * key exists in current memtable
|
||||
// * new sizeof(new_value) <= sizeof(old_value)
|
||||
// * old_value for that key is a put i.e. kTypeValue
|
||||
// Default: false.
|
||||
bool inplace_update_support;
|
||||
|
||||
// Number of locks used for inplace update
|
||||
// Default: 10000, if inplace_update_support = true, else 0.
|
||||
size_t inplace_update_num_locks;
|
||||
};
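
// Editor's sketch, not part of the original header: the level-sizing knobs
// documented above interact multiplicatively. The helper below uses
// illustrative values only (it sits inside namespace rocksdb).
inline Options MakeLevelSizedOptionsSketch() {
  Options opts;                                   // start from defaults
  opts.target_file_size_base = 2 * 1048576;       // 2MB files on level-1
  opts.target_file_size_multiplier = 10;          // 20MB on L2, 200MB on L3
  opts.max_bytes_for_level_base = 20 * 1048576;   // 20MB total for level-1
  opts.max_bytes_for_level_multiplier = 10;       // 200MB for L2, 2GB for L3
  opts.level0_slowdown_writes_trigger = 8;        // begin stalling writes here
  opts.level0_stop_writes_trigger = 12;           // stop writes entirely here
  return opts;
}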

//
// An application can issue a read request (via Get/Iterators) and specify
// if that read should process data that ALREADY resides on a specified cache
// level. For example, if an application specifies kBlockCacheTier then the
// Get call will process data that is already processed in the memtable or
// the block cache. It will not page in data from the OS cache or data that
// resides in storage.
enum ReadTier {
  kReadAllTier = 0x0,     // data in memtable, block cache, OS cache or storage
  kBlockCacheTier = 0x1   // data in memtable or block cache
};

// Options that control read operations
struct ReadOptions {
  // If true, all data read from underlying storage will be
  // verified against corresponding checksums.
  // Default: false
  bool verify_checksums;

  // Should the "data block"/"index block"/"filter block" read for this
  // iteration be cached in memory?
  // Callers may wish to set this field to false for bulk scans.
  // Default: true
  bool fill_cache;

  // If this option is set and the memtable implementation allows, Seek
  // might only return keys with the same prefix as the seek-key.
  bool prefix_seek;

  // If "snapshot" is non-nullptr, read as of the supplied snapshot
  // (which must belong to the DB that is being read and which must
  // not have been released). If "snapshot" is nullptr, use an implicit
  // snapshot of the state at the beginning of this read operation.
  // Default: nullptr
  const Snapshot* snapshot;

  // If "prefix" is non-nullptr, and ReadOptions is being passed to
  // db.NewIterator, only return results when the key begins with this
  // prefix. This field is ignored by other calls (e.g., Get).
  // Options.prefix_extractor must also be set, and
  // prefix_extractor.InRange(prefix) must be true. The iterator
  // returned by NewIterator when this option is set will behave just
  // as if the underlying store did not contain any non-matching keys,
  // with two exceptions: Seek() only accepts keys starting with the
  // prefix, and SeekToLast() is not supported. A prefix filter with this
  // option will sometimes reduce the number of read IOPs.
  // Default: nullptr
  const Slice* prefix;

  // Specify if this read request should process data that ALREADY
  // resides in a particular cache. If the required data is not
  // found in the specified cache, then Status::Incomplete is returned.
  // Default: kReadAllTier
  ReadTier read_tier;

  ReadOptions()
      : verify_checksums(false),
        fill_cache(true),
        prefix_seek(false),
        snapshot(nullptr),
        prefix(nullptr),
        read_tier(kReadAllTier) {
  }
  ReadOptions(bool cksum, bool cache)
      : verify_checksums(cksum), fill_cache(cache),
        prefix_seek(false), snapshot(nullptr), prefix(nullptr),
        read_tier(kReadAllTier) {
  }
};
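
// Editor's sketch, not part of the original header: typical use of the
// snapshot and read_tier fields above. Assumes "rocksdb/db.h" for the DB
// type; the key and variable names are illustrative.
inline void SnapshotReadSketch(DB* db) {
  ReadOptions ro;
  ro.snapshot = db->GetSnapshot();     // pin a consistent view of the data
  ro.verify_checksums = true;          // checksum every block touched
  std::string value;
  Status s = db->Get(ro, "some-key", &value);

  ReadOptions cache_only;
  cache_only.read_tier = kBlockCacheTier;        // never page in from storage
  s = db->Get(cache_only, "some-key", &value);   // may be Status::Incomplete

  db->ReleaseSnapshot(ro.snapshot);    // snapshots must be released
}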

// Options that control write operations
struct WriteOptions {
  // If true, the write will be flushed from the operating system
  // buffer cache (by calling WritableFile::Sync()) before the write
  // is considered complete. If this flag is true, writes will be
  // slower.
  //
  // If this flag is false, and the machine crashes, some recent
  // writes may be lost. Note that if it is just the process that
  // crashes (i.e., the machine does not reboot), no writes will be
  // lost even if sync==false.
  //
  // In other words, a DB write with sync==false has similar
  // crash semantics as the "write()" system call. A DB write
  // with sync==true has similar crash semantics to a "write()"
  // system call followed by "fdatasync()".
  //
  // Default: false
  bool sync;

  // If true, writes will not first go to the write ahead log,
  // and the write may be lost after a crash.
  bool disableWAL;

  WriteOptions()
      : sync(false),
        disableWAL(false) {
  }
};
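
// Editor's sketch, not part of the original header: the sync flag trades
// write latency for crash durability. Assumes "rocksdb/db.h"; names are
// illustrative.
inline void WriteDurabilitySketch(DB* db) {
  WriteOptions durable;
  durable.sync = true;                 // survives machine crash, but slower
  db->Put(durable, "k1", "v1");

  WriteOptions buffered;               // sync == false by default
  db->Put(buffered, "k2", "v2");       // survives process crash only
}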

// Options that control flush operations
struct FlushOptions {
  // If true, the flush will wait until the flush is done.
  // Default: true
  bool wait;

  FlushOptions()
      : wait(true) {
  }
};

}  // namespace rocksdb

#endif  // STORAGE_ROCKSDB_INCLUDE_OPTIONS_H_
48
include/rocksdb/perf_context.h
Normal file
@@ -0,0 +1,48 @@
// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.

#ifndef STORAGE_ROCKSDB_INCLUDE_PERF_CONTEXT_H
#define STORAGE_ROCKSDB_INCLUDE_PERF_CONTEXT_H

#include <stdint.h>

namespace rocksdb {

enum PerfLevel {
  kDisable     = 0,  // disable perf stats
  kEnableCount = 1,  // enable only count stats
  kEnableTime  = 2   // enable time stats too
};

// set the perf stats level
void SetPerfLevel(PerfLevel level);

// A thread-local context for gathering performance counters efficiently
// and transparently.
struct PerfContext {

  void Reset();  // reset all performance counters to zero

  uint64_t user_key_comparison_count;  // total number of user key comparisons
  uint64_t block_cache_hit_count;      // total number of block cache hits
  uint64_t block_read_count;           // total number of block reads (with IO)
  uint64_t block_read_byte;            // total number of bytes from block reads
  uint64_t block_read_time;            // total time spent on block reads
  uint64_t block_checksum_time;        // total time spent on block checksums
  uint64_t block_decompress_time;      // total time spent on block decompression
  // total number of internal keys skipped over during iteration (overwritten or
  // deleted, to be more specific, hidden by a put or delete of the same key)
  uint64_t internal_key_skipped_count;
  // total number of deletes skipped over during iteration
  uint64_t internal_delete_skipped_count;
  uint64_t wal_write_time;  // total time spent on writing to the WAL
};

extern __thread PerfContext perf_context;

}  // namespace rocksdb

#endif  // STORAGE_ROCKSDB_INCLUDE_PERF_CONTEXT_H
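
The PerfContext above is easiest to see in action around a single read. A minimal sketch, assuming a DB handle from "rocksdb/db.h"; the function and key names are illustrative and not part of the original sources:

#include "rocksdb/db.h"
#include "rocksdb/perf_context.h"

void PerfContextSketch(rocksdb::DB* db) {
  rocksdb::SetPerfLevel(rocksdb::kEnableTime);  // gather counts and timings
  rocksdb::perf_context.Reset();                // zero this thread's counters

  std::string value;
  db->Get(rocksdb::ReadOptions(), "some-key", &value);

  // Counters now describe just the Get above, for this thread only.
  uint64_t comparisons = rocksdb::perf_context.user_key_comparison_count;
  uint64_t disk_blocks = rocksdb::perf_context.block_read_count;
  (void)comparisons; (void)disk_blocks;         // e.g. log or export these
}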
136
include/rocksdb/slice.h
Normal file
@@ -0,0 +1,136 @@
// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// Slice is a simple structure containing a pointer into some external
// storage and a size.  The user of a Slice must ensure that the slice
// is not used after the corresponding external storage has been
// deallocated.
//
// Multiple threads can invoke const methods on a Slice without
// external synchronization, but if any of the threads may call a
// non-const method, all threads accessing the same Slice must use
// external synchronization.

#ifndef STORAGE_ROCKSDB_INCLUDE_SLICE_H_
#define STORAGE_ROCKSDB_INCLUDE_SLICE_H_

#include <assert.h>
#include <stddef.h>
#include <string.h>
#include <string>

namespace rocksdb {

class Slice {
 public:
  // Create an empty slice.
  Slice() : data_(""), size_(0) { }

  // Create a slice that refers to d[0,n-1].
  Slice(const char* d, size_t n) : data_(d), size_(n) { }

  // Create a slice that refers to the contents of "s"
  /* implicit */
  Slice(const std::string& s) : data_(s.data()), size_(s.size()) { }

  // Create a slice that refers to s[0,strlen(s)-1]
  /* implicit */
  Slice(const char* s) : data_(s), size_(strlen(s)) { }

  // Return a pointer to the beginning of the referenced data
  const char* data() const { return data_; }

  // Return the length (in bytes) of the referenced data
  size_t size() const { return size_; }

  // Return true iff the length of the referenced data is zero
  bool empty() const { return size_ == 0; }

  // Return the ith byte in the referenced data.
  // REQUIRES: n < size()
  char operator[](size_t n) const {
    assert(n < size());
    return data_[n];
  }

  // Change this slice to refer to an empty array
  void clear() { data_ = ""; size_ = 0; }

  // Drop the first "n" bytes from this slice.
  void remove_prefix(size_t n) {
    assert(n <= size());
    data_ += n;
    size_ -= n;
  }

  // Return a string that contains the copy of the referenced data.
  std::string ToString(bool hex = false) const {
    if (hex) {
      std::string result;
      char buf[10];
      for (size_t i = 0; i < size_; i++) {
        snprintf(buf, 10, "%02X", (unsigned char)data_[i]);
        result += buf;
      }
      return result;
    } else {
      return std::string(data_, size_);
    }
  }

  // Three-way comparison.  Returns value:
  //   <  0 iff "*this" <  "b",
  //   == 0 iff "*this" == "b",
  //   >  0 iff "*this" >  "b"
  int compare(const Slice& b) const;

  // Return true iff "x" is a prefix of "*this"
  bool starts_with(const Slice& x) const {
    return ((size_ >= x.size_) &&
            (memcmp(data_, x.data_, x.size_) == 0));
  }

  // private: make these public for rocksdbjni access
  const char* data_;
  size_t size_;

  // Intentionally copyable
};

// A set of Slices that are virtually concatenated together.  'parts' points
// to an array of Slices.  The number of elements in the array is 'num_parts'.
struct SliceParts {
  SliceParts(const Slice* parts, int num_parts)
      : parts(parts), num_parts(num_parts) { }

  const Slice* parts;
  int num_parts;
};

inline bool operator==(const Slice& x, const Slice& y) {
  return ((x.size() == y.size()) &&
          (memcmp(x.data(), y.data(), x.size()) == 0));
}

inline bool operator!=(const Slice& x, const Slice& y) {
  return !(x == y);
}

inline int Slice::compare(const Slice& b) const {
  const int min_len = (size_ < b.size_) ? size_ : b.size_;
  int r = memcmp(data_, b.data_, min_len);
  if (r == 0) {
    if (size_ < b.size_) r = -1;
    else if (size_ > b.size_) r = +1;
  }
  return r;
}

}  // namespace rocksdb

#endif  // STORAGE_ROCKSDB_INCLUDE_SLICE_H_
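
Since a Slice only aliases external storage, the lifetime caveat in the header comment is worth a concrete illustration. A small sketch, assuming nothing beyond this header and the standard library (names illustrative, not part of the original sources):

#include <cassert>
#include <cstring>
#include <string>
#include "rocksdb/slice.h"

void SliceSketch() {
  std::string backing = "prefix/payload";
  rocksdb::Slice s(backing);                     // points into `backing`

  assert(s.starts_with("prefix/"));
  s.remove_prefix(strlen("prefix/"));            // now refers to "payload"
  assert(s.ToString() == "payload");
  assert(s.ToString(true) == "7061796C6F6164");  // hex form ("%02X", uppercase)
  // Caution: `s` dangles once `backing` is destroyed or reallocated.
}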
47
include/rocksdb/slice_transform.h
Normal file
@@ -0,0 +1,47 @@
// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// Class for specifying user-defined functions which perform a
// transformation on a slice.  It is not required that every slice
// belong to the domain and/or range of a function.  Subclasses should
// define InDomain and InRange to determine which slices are in either
// of these sets respectively.

#ifndef STORAGE_ROCKSDB_INCLUDE_SLICE_TRANSFORM_H_
#define STORAGE_ROCKSDB_INCLUDE_SLICE_TRANSFORM_H_

#include <string>

namespace rocksdb {

class Slice;

class SliceTransform {
 public:
  virtual ~SliceTransform() {};

  // Return the name of this transformation.
  virtual const char* Name() const = 0;

  // Transform a src in the domain to a dst in the range.
  virtual Slice Transform(const Slice& src) const = 0;

  // Determine whether this is a valid src to apply the function to.
  virtual bool InDomain(const Slice& src) const = 0;

  // Determine whether dst=Transform(src) for some src.
  virtual bool InRange(const Slice& dst) const = 0;
};

extern const SliceTransform* NewFixedPrefixTransform(size_t prefix_len);

extern const SliceTransform* NewNoopTransform();

}  // namespace rocksdb

#endif  // STORAGE_ROCKSDB_INCLUDE_SLICE_TRANSFORM_H_
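
A fixed-prefix transform is the most common concrete SliceTransform. A short sketch of the domain/range contract above (illustrative, not part of the original sources; ownership handling here is an assumption):

#include <cassert>
#include "rocksdb/slice.h"
#include "rocksdb/slice_transform.h"

void PrefixTransformSketch() {
  const rocksdb::SliceTransform* t = rocksdb::NewFixedPrefixTransform(4);
  rocksdb::Slice key("user1234|inbox");
  if (t->InDomain(key)) {                        // long enough to transform
    rocksdb::Slice prefix = t->Transform(key);   // first 4 bytes: "user"
    assert(t->InRange(prefix));                  // a valid transform output
    assert(prefix == rocksdb::Slice("user"));
  }
  delete t;                                      // caller owns the transform
}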
302
include/rocksdb/statistics.h
Normal file
@@ -0,0 +1,302 @@
// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.

#ifndef STORAGE_ROCKSDB_INCLUDE_STATISTICS_H_
#define STORAGE_ROCKSDB_INCLUDE_STATISTICS_H_

#include <atomic>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <string>
#include <memory>
#include <vector>

namespace rocksdb {

/**
 * Keep adding tickers here.
 * Any ticker should have a value less than TICKER_ENUM_MAX.
 * Add a new ticker by assigning it the current value of TICKER_ENUM_MAX,
 * add a string representation in TickersNameMap below,
 * and increment TICKER_ENUM_MAX.
 */
enum Tickers {
  // total block cache misses
  // REQUIRES: BLOCK_CACHE_MISS == BLOCK_CACHE_INDEX_MISS +
  //                               BLOCK_CACHE_FILTER_MISS +
  //                               BLOCK_CACHE_DATA_MISS;
  BLOCK_CACHE_MISS,
  // total block cache hits
  // REQUIRES: BLOCK_CACHE_HIT == BLOCK_CACHE_INDEX_HIT +
  //                              BLOCK_CACHE_FILTER_HIT +
  //                              BLOCK_CACHE_DATA_HIT;
  BLOCK_CACHE_HIT,
  // # of blocks added to block cache.
  BLOCK_CACHE_ADD,
  // # of times cache miss when accessing index block from block cache.
  BLOCK_CACHE_INDEX_MISS,
  // # of times cache hit when accessing index block from block cache.
  BLOCK_CACHE_INDEX_HIT,
  // # of times cache miss when accessing filter block from block cache.
  BLOCK_CACHE_FILTER_MISS,
  // # of times cache hit when accessing filter block from block cache.
  BLOCK_CACHE_FILTER_HIT,
  // # of times cache miss when accessing data block from block cache.
  BLOCK_CACHE_DATA_MISS,
  // # of times cache hit when accessing data block from block cache.
  BLOCK_CACHE_DATA_HIT,
  // # of times the bloom filter has avoided file reads.
  BLOOM_FILTER_USEFUL,

  /**
   * COMPACTION_KEY_DROP_* count the reasons for key drop during compaction.
   * There are 3 reasons currently.
   */
  COMPACTION_KEY_DROP_NEWER_ENTRY,  // key was written with a newer value.
  COMPACTION_KEY_DROP_OBSOLETE,     // the key is obsolete.
  COMPACTION_KEY_DROP_USER,  // user compaction function has dropped the key.

  // Number of keys written to the database via the Put and Write calls
  NUMBER_KEYS_WRITTEN,
  // Number of keys read
  NUMBER_KEYS_READ,
  // Number of keys updated, if inplace update is enabled
  NUMBER_KEYS_UPDATED,
  // Bytes written / read
  BYTES_WRITTEN,
  BYTES_READ,
  NO_FILE_CLOSES,
  NO_FILE_OPENS,
  NO_FILE_ERRORS,
  // Time system had to wait to do L0-L1 compactions
  STALL_L0_SLOWDOWN_MICROS,
  // Time system had to wait to move memtable to L1.
  STALL_MEMTABLE_COMPACTION_MICROS,
  // write throttle because of too many files in L0
  STALL_L0_NUM_FILES_MICROS,
  RATE_LIMIT_DELAY_MILLIS,

  NO_ITERATORS,  // number of iterators currently open

  // Number of MultiGet calls, keys read, and bytes read
  NUMBER_MULTIGET_CALLS,
  NUMBER_MULTIGET_KEYS_READ,
  NUMBER_MULTIGET_BYTES_READ,

  // Number of delete records that were not required to be
  // written to storage because the key does not exist
  NUMBER_FILTERED_DELETES,
  NUMBER_MERGE_FAILURES,
  SEQUENCE_NUMBER,

  // number of times bloom was checked before creating an iterator on a
  // file, and the number of times the check was useful in avoiding
  // iterator creation (and thus likely IOPs).
  BLOOM_FILTER_PREFIX_CHECKED,
  BLOOM_FILTER_PREFIX_USEFUL,

  // Number of times we had to reseek inside an iteration to skip
  // over a large number of keys with the same userkey.
  NUMBER_OF_RESEEKS_IN_ITERATION,

  // Record the number of calls to GetUpdatesSince. Useful to keep track of
  // transaction log iterator refreshes
  GET_UPDATES_SINCE_CALLS,

  BLOCK_CACHE_COMPRESSED_MISS,  // miss in the compressed block cache
  BLOCK_CACHE_COMPRESSED_HIT,   // hit in the compressed block cache

  TICKER_ENUM_MAX
};

// The order of items listed in Tickers should be the same as
// the order listed in TickersNameMap
const std::vector<std::pair<Tickers, std::string>> TickersNameMap = {
  { BLOCK_CACHE_MISS, "rocksdb.block.cache.miss" },
  { BLOCK_CACHE_HIT, "rocksdb.block.cache.hit" },
  { BLOCK_CACHE_ADD, "rocksdb.block.cache.add" },
  { BLOCK_CACHE_INDEX_MISS, "rocksdb.block.cache.index.miss" },
  { BLOCK_CACHE_INDEX_HIT, "rocksdb.block.cache.index.hit" },
  { BLOCK_CACHE_FILTER_MISS, "rocksdb.block.cache.filter.miss" },
  { BLOCK_CACHE_FILTER_HIT, "rocksdb.block.cache.filter.hit" },
  { BLOCK_CACHE_DATA_MISS, "rocksdb.block.cache.data.miss" },
  { BLOCK_CACHE_DATA_HIT, "rocksdb.block.cache.data.hit" },
  { BLOOM_FILTER_USEFUL, "rocksdb.bloom.filter.useful" },
  { COMPACTION_KEY_DROP_NEWER_ENTRY, "rocksdb.compaction.key.drop.new" },
  { COMPACTION_KEY_DROP_OBSOLETE, "rocksdb.compaction.key.drop.obsolete" },
  { COMPACTION_KEY_DROP_USER, "rocksdb.compaction.key.drop.user" },
  { NUMBER_KEYS_WRITTEN, "rocksdb.number.keys.written" },
  { NUMBER_KEYS_READ, "rocksdb.number.keys.read" },
  { NUMBER_KEYS_UPDATED, "rocksdb.number.keys.updated" },
  { BYTES_WRITTEN, "rocksdb.bytes.written" },
  { BYTES_READ, "rocksdb.bytes.read" },
  { NO_FILE_CLOSES, "rocksdb.no.file.closes" },
  { NO_FILE_OPENS, "rocksdb.no.file.opens" },
  { NO_FILE_ERRORS, "rocksdb.no.file.errors" },
  { STALL_L0_SLOWDOWN_MICROS, "rocksdb.l0.slowdown.micros" },
  { STALL_MEMTABLE_COMPACTION_MICROS, "rocksdb.memtable.compaction.micros" },
  { STALL_L0_NUM_FILES_MICROS, "rocksdb.l0.num.files.stall.micros" },
  { RATE_LIMIT_DELAY_MILLIS, "rocksdb.rate.limit.delay.millis" },
  { NO_ITERATORS, "rocksdb.num.iterators" },
  { NUMBER_MULTIGET_CALLS, "rocksdb.number.multiget.get" },
  { NUMBER_MULTIGET_KEYS_READ, "rocksdb.number.multiget.keys.read" },
  { NUMBER_MULTIGET_BYTES_READ, "rocksdb.number.multiget.bytes.read" },
  { NUMBER_FILTERED_DELETES, "rocksdb.number.deletes.filtered" },
  { NUMBER_MERGE_FAILURES, "rocksdb.number.merge.failures" },
  { SEQUENCE_NUMBER, "rocksdb.sequence.number" },
  { BLOOM_FILTER_PREFIX_CHECKED, "rocksdb.bloom.filter.prefix.checked" },
  { BLOOM_FILTER_PREFIX_USEFUL, "rocksdb.bloom.filter.prefix.useful" },
  { NUMBER_OF_RESEEKS_IN_ITERATION, "rocksdb.number.reseeks.iteration" },
  { GET_UPDATES_SINCE_CALLS, "rocksdb.getupdatessince.calls" },
  { BLOCK_CACHE_COMPRESSED_MISS, "rocksdb.block.cachecompressed.miss" },
  { BLOCK_CACHE_COMPRESSED_HIT, "rocksdb.block.cachecompressed.hit" }
};

/**
 * Keep adding histograms here.
 * Any histogram should have a value less than HISTOGRAM_ENUM_MAX.
 * Add a new histogram by assigning it the current value of HISTOGRAM_ENUM_MAX,
 * add a string representation in HistogramsNameMap below,
 * and increment HISTOGRAM_ENUM_MAX.
 */
enum Histograms {
  DB_GET,
  DB_WRITE,
  COMPACTION_TIME,
  TABLE_SYNC_MICROS,
  COMPACTION_OUTFILE_SYNC_MICROS,
  WAL_FILE_SYNC_MICROS,
  MANIFEST_FILE_SYNC_MICROS,
  // time spent in IO during table open
  TABLE_OPEN_IO_MICROS,
  DB_MULTIGET,
  READ_BLOCK_COMPACTION_MICROS,
  READ_BLOCK_GET_MICROS,
  WRITE_RAW_BLOCK_MICROS,

  STALL_L0_SLOWDOWN_COUNT,
  STALL_MEMTABLE_COMPACTION_COUNT,
  STALL_L0_NUM_FILES_COUNT,
  HARD_RATE_LIMIT_DELAY_COUNT,
  SOFT_RATE_LIMIT_DELAY_COUNT,
  NUM_FILES_IN_SINGLE_COMPACTION,
  HISTOGRAM_ENUM_MAX,
};

const std::vector<std::pair<Histograms, std::string>> HistogramsNameMap = {
  { DB_GET, "rocksdb.db.get.micros" },
  { DB_WRITE, "rocksdb.db.write.micros" },
  { COMPACTION_TIME, "rocksdb.compaction.times.micros" },
  { TABLE_SYNC_MICROS, "rocksdb.table.sync.micros" },
  { COMPACTION_OUTFILE_SYNC_MICROS, "rocksdb.compaction.outfile.sync.micros" },
  { WAL_FILE_SYNC_MICROS, "rocksdb.wal.file.sync.micros" },
  { MANIFEST_FILE_SYNC_MICROS, "rocksdb.manifest.file.sync.micros" },
  { TABLE_OPEN_IO_MICROS, "rocksdb.table.open.io.micros" },
  { DB_MULTIGET, "rocksdb.db.multiget.micros" },
  { READ_BLOCK_COMPACTION_MICROS, "rocksdb.read.block.compaction.micros" },
  { READ_BLOCK_GET_MICROS, "rocksdb.read.block.get.micros" },
  { WRITE_RAW_BLOCK_MICROS, "rocksdb.write.raw.block.micros" },
  { STALL_L0_SLOWDOWN_COUNT, "rocksdb.l0.slowdown.count" },
  { STALL_MEMTABLE_COMPACTION_COUNT, "rocksdb.memtable.compaction.count" },
  { STALL_L0_NUM_FILES_COUNT, "rocksdb.num.files.stall.count" },
  { HARD_RATE_LIMIT_DELAY_COUNT, "rocksdb.hard.rate.limit.delay.count" },
  { SOFT_RATE_LIMIT_DELAY_COUNT, "rocksdb.soft.rate.limit.delay.count" },
  { NUM_FILES_IN_SINGLE_COMPACTION, "rocksdb.numfiles.in.singlecompaction" },
};

struct HistogramData {
  double median;
  double percentile95;
  double percentile99;
  double average;
  double standard_deviation;
};

class Histogram {
 public:
  // clears the histogram
  virtual void Clear() = 0;
  virtual ~Histogram();
  // Add a value to be recorded in the histogram.
  virtual void Add(uint64_t value) = 0;

  virtual std::string ToString() const = 0;

  // Get statistics
  virtual double Median() const = 0;
  virtual double Percentile(double p) const = 0;
  virtual double Average() const = 0;
  virtual double StandardDeviation() const = 0;
  virtual void Data(HistogramData* const data) const = 0;
};

/**
 * A dumb ticker which keeps incrementing through its life time.
 * Thread safe. Locking managed by implementation of this interface.
 */
class Ticker {
 public:
  Ticker() : count_(0) { }

  inline void setTickerCount(uint64_t count) {
    count_ = count;
  }

  inline void recordTick(int count = 1) {
    count_ += count;
  }

  inline uint64_t getCount() {
    return count_;
  }

 private:
  std::atomic_uint_fast64_t count_;
};

// Analyze the performance of a db
class Statistics {
 public:
  virtual long getTickerCount(Tickers tickerType) = 0;
  virtual void recordTick(Tickers tickerType, uint64_t count = 0) = 0;
  virtual void setTickerCount(Tickers tickerType, uint64_t count) = 0;
  virtual void measureTime(Histograms histogramType, uint64_t time) = 0;

  virtual void histogramData(Histograms type, HistogramData* const data) = 0;
  // String representation of the statistics object.
  std::string ToString();
};

// Create a concrete DBStatistics object
std::shared_ptr<Statistics> CreateDBStatistics();

// Ease of Use functions
inline void RecordTick(std::shared_ptr<Statistics> statistics,
                       Tickers ticker,
                       uint64_t count = 1) {
  assert(HistogramsNameMap.size() == HISTOGRAM_ENUM_MAX);
  assert(TickersNameMap.size() == TICKER_ENUM_MAX);
  if (statistics) {
    statistics->recordTick(ticker, count);
  }
}

inline void SetTickerCount(std::shared_ptr<Statistics> statistics,
                           Tickers ticker,
                           uint64_t count) {
  assert(HistogramsNameMap.size() == HISTOGRAM_ENUM_MAX);
  assert(TickersNameMap.size() == TICKER_ENUM_MAX);
  if (statistics) {
    statistics->setTickerCount(ticker, count);
  }
}

}  // namespace rocksdb

#endif  // STORAGE_ROCKSDB_INCLUDE_STATISTICS_H_
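
To tie the pieces above together, a hedged sketch of attaching a Statistics object through Options and reading a ticker and a histogram back out (assumes "rocksdb/options.h"; not part of the original sources):

#include "rocksdb/options.h"
#include "rocksdb/statistics.h"

void StatisticsSketch() {
  rocksdb::Options opts;
  opts.statistics = rocksdb::CreateDBStatistics();  // set before DB::Open

  // ... open a DB with `opts` and serve some traffic, then:
  long hits = opts.statistics->getTickerCount(rocksdb::BLOCK_CACHE_HIT);
  rocksdb::HistogramData get_latency;
  opts.statistics->histogramData(rocksdb::DB_GET, &get_latency);
  (void)hits; (void)get_latency.percentile99;       // e.g. export to metrics
}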
130
include/rocksdb/status.h
Normal file
@@ -0,0 +1,130 @@
// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// A Status encapsulates the result of an operation.  It may indicate success,
// or it may indicate an error with an associated error message.
//
// Multiple threads can invoke const methods on a Status without
// external synchronization, but if any of the threads may call a
// non-const method, all threads accessing the same Status must use
// external synchronization.

#ifndef STORAGE_ROCKSDB_INCLUDE_STATUS_H_
#define STORAGE_ROCKSDB_INCLUDE_STATUS_H_

#include <string>
#include "rocksdb/slice.h"

namespace rocksdb {

class Status {
 public:
  // Create a success status.
  Status() : state_(nullptr) { }
  ~Status() { delete[] state_; }

  // Copy the specified status.
  Status(const Status& s);
  void operator=(const Status& s);

  // Return a success status.
  static Status OK() { return Status(); }

  // Return error status of an appropriate type.
  static Status NotFound(const Slice& msg, const Slice& msg2 = Slice()) {
    return Status(kNotFound, msg, msg2);
  }
  static Status Corruption(const Slice& msg, const Slice& msg2 = Slice()) {
    return Status(kCorruption, msg, msg2);
  }
  static Status NotSupported(const Slice& msg, const Slice& msg2 = Slice()) {
    return Status(kNotSupported, msg, msg2);
  }
  static Status InvalidArgument(const Slice& msg, const Slice& msg2 = Slice()) {
    return Status(kInvalidArgument, msg, msg2);
  }
  static Status IOError(const Slice& msg, const Slice& msg2 = Slice()) {
    return Status(kIOError, msg, msg2);
  }
  static Status MergeInProgress(const Slice& msg, const Slice& msg2 = Slice()) {
    return Status(kMergeInProgress, msg, msg2);
  }
  static Status Incomplete(const Slice& msg, const Slice& msg2 = Slice()) {
    return Status(kIncomplete, msg, msg2);
  }

  // Returns true iff the status indicates success.
  bool ok() const { return (state_ == nullptr); }

  // Returns true iff the status indicates a NotFound error.
  bool IsNotFound() const { return code() == kNotFound; }

  // Returns true iff the status indicates a Corruption error.
  bool IsCorruption() const { return code() == kCorruption; }

  // Returns true iff the status indicates a NotSupported error.
  bool IsNotSupported() const { return code() == kNotSupported; }

  // Returns true iff the status indicates an InvalidArgument error.
  bool IsInvalidArgument() const { return code() == kInvalidArgument; }

  // Returns true iff the status indicates an IOError.
  bool IsIOError() const { return code() == kIOError; }

  // Returns true iff the status indicates a MergeInProgress error.
  bool IsMergeInProgress() const { return code() == kMergeInProgress; }

  // Returns true iff the status indicates an Incomplete error.
  bool IsIncomplete() const { return code() == kIncomplete; }

  // Return a string representation of this status suitable for printing.
  // Returns the string "OK" for success.
  std::string ToString() const;

 private:
  // OK status has a nullptr state_.  Otherwise, state_ is a new[] array
  // of the following form:
  //    state_[0..3] == length of message
  //    state_[4]    == code
  //    state_[5..]  == message
  const char* state_;

  enum Code {
    kOk = 0,
    kNotFound = 1,
    kCorruption = 2,
    kNotSupported = 3,
    kInvalidArgument = 4,
    kIOError = 5,
    kMergeInProgress = 6,
    kIncomplete = 7
  };

  Code code() const {
    return (state_ == nullptr) ? kOk : static_cast<Code>(state_[4]);
  }

  Status(Code code, const Slice& msg, const Slice& msg2);
  static const char* CopyState(const char* s);
};

inline Status::Status(const Status& s) {
  state_ = (s.state_ == nullptr) ? nullptr : CopyState(s.state_);
}
inline void Status::operator=(const Status& s) {
  // The following condition catches both aliasing (when this == &s),
  // and the common case where both s and *this are ok.
  if (state_ != s.state_) {
    delete[] state_;
    state_ = (s.state_ == nullptr) ? nullptr : CopyState(s.state_);
  }
}

}  // namespace rocksdb

#endif  // STORAGE_ROCKSDB_INCLUDE_STATUS_H_
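
The class above encourages a check-then-branch idiom. A minimal sketch, where the DB handle and key are illustrative and not from the original sources:

#include <cstdio>
#include <string>
#include "rocksdb/db.h"

void StatusSketch(rocksdb::DB* db) {
  std::string value;
  rocksdb::Status s = db->Get(rocksdb::ReadOptions(), "maybe-key", &value);
  if (s.IsNotFound()) {
    // Absent key: an expected outcome, distinct from a real failure.
  } else if (!s.ok()) {
    fprintf(stderr, "Get failed: %s\n", s.ToString().c_str());
  }
}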
180
include/rocksdb/table.h
Normal file
@@ -0,0 +1,180 @@
// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#pragma once
#include <memory>
#include <stdint.h>
#include "rocksdb/env.h"
#include "rocksdb/iterator.h"
#include "rocksdb/table_properties.h"
#include "rocksdb/options.h"

namespace rocksdb {

struct Options;
class RandomAccessFile;
struct ReadOptions;
class TableCache;
class WritableFile;

using std::unique_ptr;

// TableBuilder provides the interface used to build a Table
// (an immutable and sorted map from keys to values).
//
// Multiple threads can invoke const methods on a TableBuilder without
// external synchronization, but if any of the threads may call a
// non-const method, all threads accessing the same TableBuilder must use
// external synchronization.
class TableBuilder {
 public:
  // REQUIRES: Either Finish() or Abandon() has been called.
  virtual ~TableBuilder() {}

  // Add key,value to the table being constructed.
  // REQUIRES: key is after any previously added key according to comparator.
  // REQUIRES: Finish(), Abandon() have not been called
  virtual void Add(const Slice& key, const Slice& value) = 0;

  // Return non-ok iff some error has been detected.
  virtual Status status() const = 0;

  // Finish building the table.
  // REQUIRES: Finish(), Abandon() have not been called
  virtual Status Finish() = 0;

  // Indicate that the contents of this builder should be abandoned.
  // If the caller is not going to call Finish(), it must call Abandon()
  // before destroying this builder.
  // REQUIRES: Finish(), Abandon() have not been called
  virtual void Abandon() = 0;

  // Number of calls to Add() so far.
  virtual uint64_t NumEntries() const = 0;

  // Size of the file generated so far.  If invoked after a successful
  // Finish() call, returns the size of the final generated file.
  virtual uint64_t FileSize() const = 0;
};

// A Table is a sorted map from strings to strings.  Tables are
// immutable and persistent.  A Table may be safely accessed from
// multiple threads without external synchronization.
class TableReader {
 public:
  virtual ~TableReader() {}

  // Determine whether there is a chance that the current table file
  // contains a key starting with internal_prefix. The specific
  // table implementation can use a bloom filter and/or other heuristics
  // to filter out this table as a whole.
  virtual bool PrefixMayMatch(const Slice& internal_prefix) = 0;

  // Returns a new iterator over the table contents.
  // The result of NewIterator() is initially invalid (caller must
  // call one of the Seek methods on the iterator before using it).
  virtual Iterator* NewIterator(const ReadOptions&) = 0;

  // Given a key, return an approximate byte offset in the file where
  // the data for that key begins (or would begin if the key were
  // present in the file).  The returned value is in terms of file
  // bytes, and so includes effects like compression of the underlying data.
  // E.g., the approximate offset of the last key in the table will
  // be close to the file length.
  virtual uint64_t ApproximateOffsetOf(const Slice& key) = 0;

  // Returns true if the block for the specified key is in cache.
  // REQUIRES: key is in this table.
  virtual bool TEST_KeyInCache(const ReadOptions& options,
                               const Slice& key) = 0;

  // Set up the table for compaction. Might change some parameters with
  // posix_fadvise
  virtual void SetupForCompaction() = 0;

  virtual TableProperties& GetTableProperties() = 0;

  // Calls (*result_handler)(handle_context, ...) repeatedly, starting with
  // the entry found after a call to Seek(key), until result_handler returns
  // false, where k is the actual internal key for a row found and v is the
  // value of the key. didIO is true if I/O is involved in the operation. May
  // not make such a call if the filter policy says that the key is not
  // present.
  //
  // mark_key_may_exist_handler needs to be called when it is configured to be
  // memory only and the key is not found in the block cache, with
  // the parameter to be handle_context.
  //
  // readOptions is the options for the read
  // key is the key to search for
  virtual Status Get(
      const ReadOptions& readOptions,
      const Slice& key,
      void* handle_context,
      bool (*result_handler)(void* handle_context, const Slice& k,
                             const Slice& v, bool didIO),
      void (*mark_key_may_exist_handler)(void* handle_context) = nullptr) = 0;
};

// A base class for table factories
class TableFactory {
 public:
  virtual ~TableFactory() {}

  // The type of the table.
  //
  // The client of this package should switch to a new name whenever
  // the table format implementation changes.
  //
  // Names starting with "rocksdb." are reserved and should not be used
  // by any clients of this package.
  virtual const char* Name() const = 0;

  // Returns a table reader object that can fetch data from the file specified
  // in parameter file. It's the caller's responsibility to make sure the
  // file is in the correct format.
  //
  // GetTableReader() is called in two places:
  // (1) TableCache::FindTable() calls the function on a table cache miss
  //     and caches the table object returned.
  // (2) SstFileReader (for SST Dump) opens the table and dumps the table
  //     contents using the iterator of the table.
  // options and soptions are options. options is the general options.
  // Multiple configuration options can be accessed from there, including and
  // not limited to block cache and key comparators.
  // file is a file handler to handle the file for the table.
  // file_size is the physical file size of the file.
  // table_reader is the output table reader.
  virtual Status GetTableReader(
      const Options& options, const EnvOptions& soptions,
      unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
      unique_ptr<TableReader>* table_reader) const = 0;

  // Return a table builder to write to a file for this table type.
  //
  // It is called in several places:
  // (1) When flushing a memtable to a level-0 output file, it creates a table
  //     builder (In DBImpl::WriteLevel0Table(), by calling BuildTable())
  // (2) During compaction, it gets the builder for writing compaction output
  //     files in DBImpl::OpenCompactionOutputFile().
  // (3) When recovering from transaction logs, it creates a table builder to
  //     write to a level-0 output file (In DBImpl::WriteLevel0TableForRecovery,
  //     by calling BuildTable())
  // (4) When running Repairer, it creates a table builder to convert logs to
  //     SST files (In Repairer::ConvertLogToTable() by calling BuildTable())
  //
  // options is the general options. Multiple configuration options can be
  // accessed from there, including and not limited to compression options.
  // file is a handle of a writable file. It is the caller's responsibility to
  // keep the file open and close the file after closing the table builder.
  // compression_type is the compression type to use in this table.
  virtual TableBuilder* GetTableBuilder(
      const Options& options, WritableFile* file,
      CompressionType compression_type) const = 0;
};
}  // namespace rocksdb
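
A custom table format plugs in through Options::table_factory. The sketch below is illustrative, not part of the original sources; it assumes the caller already has a concrete factory implementing the three pure virtual methods above:

#include <memory>
#include "rocksdb/options.h"
#include "rocksdb/table.h"

// Any concrete factory must implement Name(), GetTableReader(), and
// GetTableBuilder() as declared above.
void InstallTableFactorySketch(rocksdb::Options* opts,
                               std::shared_ptr<rocksdb::TableFactory> factory) {
  opts->table_factory = factory;
  // Flush/compaction output now goes through factory->GetTableBuilder(),
  // and table reads go through factory->GetTableReader().
}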
90
include/rocksdb/table_properties.h
Normal file
@@ -0,0 +1,90 @@
// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#pragma once

#include <string>
#include <unordered_map>

#include "rocksdb/status.h"

namespace rocksdb {

// TableProperties contains a bunch of read-only properties of its associated
// table.
struct TableProperties {
 public:
  // Other than basic table properties, each table may also have the user
  // collected properties.
  // The values of the user-collected properties are encoded as raw bytes --
  // users have to interpret these values by themselves.
  typedef
    std::unordered_map<std::string, std::string>
    UserCollectedProperties;

  // the total size of all data blocks.
  uint64_t data_size = 0;
  // the size of index block.
  uint64_t index_size = 0;
  // the size of filter block.
  uint64_t filter_size = 0;
  // total raw key size
  uint64_t raw_key_size = 0;
  // total raw value size
  uint64_t raw_value_size = 0;
  // the number of blocks in this table
  uint64_t num_data_blocks = 0;
  // the number of entries in this table
  uint64_t num_entries = 0;

  // The name of the filter policy used in this table.
  // If no filter policy is used, `filter_policy_name` will be an empty string.
  std::string filter_policy_name;

  // user collected properties
  UserCollectedProperties user_collected_properties;

  // convert this object to a human readable form
  //   @prop_delim: delimiter for each property.
  std::string ToString(
      const std::string& prop_delim = "; ",
      const std::string& kv_delim = "=") const;
};

// `TablePropertiesCollector` provides the mechanism for users to collect
// their own interested properties. This class is essentially a collection
// of callback functions that will be invoked during table building.
class TablePropertiesCollector {
 public:
  virtual ~TablePropertiesCollector() { }

  // Add() will be called when a new key/value pair is inserted into the table.
  // @params key    the original key that is inserted into the table.
  // @params value  the original value that is inserted into the table.
  virtual Status Add(const Slice& key, const Slice& value) = 0;

  // Finish() will be called when a table has already been built and is ready
  // for writing the properties block.
  // @params properties  User will add their collected statistics to
  //                     `properties`.
  virtual Status Finish(
      TableProperties::UserCollectedProperties* properties) = 0;

  // The name of the properties collector; can be used for debugging purposes.
  virtual const char* Name() const = 0;

  // Return the human-readable properties, where the key is property name and
  // the value is the human-readable form of value.
  virtual TableProperties::UserCollectedProperties
      GetReadableProperties() const = 0;
};

// Extra properties
// Below is a list of non-basic properties that are collected by the database
// itself, especially some properties regarding the internal keys (which
// are unknown to `table`).
extern uint64_t GetDeletedKeys(
    const TableProperties::UserCollectedProperties& props);

}  // namespace rocksdb
|
||||
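For reference, a minimal collector against this interface might look like the
sketch below. `CountingCollector` is a hypothetical name invented for the
example, and how a collector gets registered with the database is left out (it
is assumed to follow this revision's Options conventions); only the virtual
interface shown in the header above is relied upon.

#include <string>
#include "rocksdb/slice.h"
#include "rocksdb/status.h"
#include "rocksdb/table_properties.h"

// Hypothetical collector: counts entries whose value is empty.
class CountingCollector : public rocksdb::TablePropertiesCollector {
 public:
  rocksdb::Status Add(const rocksdb::Slice& key,
                      const rocksdb::Slice& value) override {
    if (value.size() == 0) ++empty_values_;
    return rocksdb::Status::OK();
  }

  rocksdb::Status Finish(
      rocksdb::TableProperties::UserCollectedProperties* properties) override {
    // Values are raw bytes; a decimal string keeps this sketch simple.
    (*properties)["example.empty-values"] = std::to_string(empty_values_);
    return rocksdb::Status::OK();
  }

  const char* Name() const override { return "CountingCollector"; }

  rocksdb::TableProperties::UserCollectedProperties
  GetReadableProperties() const override {
    return {{"empty-values", std::to_string(empty_values_)}};
  }

 private:
  uint64_t empty_values_ = 0;
};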
91
include/rocksdb/transaction_log.h
Normal file
@@ -0,0 +1,91 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.

#ifndef STORAGE_ROCKSDB_INCLUDE_TRANSACTION_LOG_ITERATOR_H_
#define STORAGE_ROCKSDB_INCLUDE_TRANSACTION_LOG_ITERATOR_H_

#include "rocksdb/status.h"
#include "rocksdb/types.h"
#include "rocksdb/write_batch.h"
#include <memory>
#include <vector>

namespace rocksdb {

class LogFile;
typedef std::vector<std::unique_ptr<LogFile>> VectorLogPtr;

enum WalFileType {
  /* Indicates that WAL file is in archive directory. WAL files are moved from
   * the main db directory to archive directory once they are not live and stay
   * there until cleaned up. Files are cleaned depending on archive size
   * (Options::WAL_size_limit_MB) and time since last cleaning
   * (Options::WAL_ttl_seconds).
   */
  kArchivedLogFile = 0,

  /* Indicates that WAL file is live and resides in the main db directory */
  kAliveLogFile = 1
};

class LogFile {
 public:
  LogFile() {}
  virtual ~LogFile() {}

  // Returns log file's pathname relative to the main db dir
  // Eg. For a live-log-file = /000003.log
  //     For an archived-log-file = /archive/000003.log
  virtual std::string PathName() const = 0;

  // Primary identifier for log file.
  // This is directly proportional to the creation time of the log file.
  virtual uint64_t LogNumber() const = 0;

  // Log file can be either alive or archived
  virtual WalFileType Type() const = 0;

  // Starting sequence number of the WriteBatch written in this log file
  virtual SequenceNumber StartSequence() const = 0;

  // Size of log file on disk in bytes
  virtual uint64_t SizeFileBytes() const = 0;
};

struct BatchResult {
  SequenceNumber sequence = SequenceNumber();
  std::unique_ptr<WriteBatch> writeBatchPtr;
};

// A TransactionLogIterator is used to iterate over the transactions in a db.
// One run of the iterator is continuous, i.e. the iterator will stop at the
// beginning of any gap in sequences.
class TransactionLogIterator {
 public:
  TransactionLogIterator() {}
  virtual ~TransactionLogIterator() {}

  // An iterator is either positioned at a WriteBatch or not valid.
  // This method returns true if the iterator is valid.
  // Data can be read only from a valid iterator.
  virtual bool Valid() = 0;

  // Moves the iterator to the next WriteBatch.
  // REQUIRES: Valid() to be true.
  virtual void Next() = 0;

  // Returns ok if the iterator is valid.
  // Returns the error when something has gone wrong.
  virtual Status status() = 0;

  // If valid, returns the current write batch and the sequence number of the
  // earliest transaction contained in the batch.
  // ONLY use if Valid() is true and status() is OK.
  virtual BatchResult GetBatch() = 0;
};
} // namespace rocksdb

#endif // STORAGE_ROCKSDB_INCLUDE_TRANSACTION_LOG_ITERATOR_H_
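As a usage sketch, assuming a DB* opened with WAL archiving enabled: the
helper name, variable names, and error handling below are illustrative, not
from the header; DB::GetUpdatesSince is the entry point declared elsewhere in
this tree (see stackable_db.h below).

#include <memory>
#include "rocksdb/db.h"
#include "rocksdb/transaction_log.h"

// Illustrative helper: visit all write batches committed at or after `since`.
void TailUpdates(rocksdb::DB* db, rocksdb::SequenceNumber since) {
  std::unique_ptr<rocksdb::TransactionLogIterator> iter;
  rocksdb::Status s = db->GetUpdatesSince(since, &iter);
  if (!s.ok()) return;  // e.g. the requested sequence was already purged
  for (; iter->Valid(); iter->Next()) {
    rocksdb::BatchResult result = iter->GetBatch();
    // result.sequence is the earliest sequence number in this batch;
    // result.writeBatchPtr owns the batch itself.
  }
}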
20
include/rocksdb/types.h
Normal file
@@ -0,0 +1,20 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.

#ifndef STORAGE_ROCKSDB_INCLUDE_TYPES_H_
#define STORAGE_ROCKSDB_INCLUDE_TYPES_H_

#include <stdint.h>

namespace rocksdb {

// Define all public custom types here.

// Represents a sequence number in a WAL file.
typedef uint64_t SequenceNumber;

} // namespace rocksdb

#endif //  STORAGE_ROCKSDB_INCLUDE_TYPES_H_
89
include/rocksdb/universal_compaction.h
Normal file
@@ -0,0 +1,89 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.

#ifndef STORAGE_ROCKSDB_UNIVERSAL_COMPACTION_OPTIONS_H
#define STORAGE_ROCKSDB_UNIVERSAL_COMPACTION_OPTIONS_H

#include <stddef.h>
#include <string>
#include <memory>
#include <vector>
#include <stdint.h>
#include <climits>
#include "rocksdb/slice.h"
#include "rocksdb/statistics.h"

namespace rocksdb {

//
// Algorithm used to make a compaction request stop picking new files
// into a single compaction run
//
enum CompactionStopStyle {
  kCompactionStopStyleSimilarSize, // pick files of similar size
  kCompactionStopStyleTotalSize    // total size of picked files > next file
};

class CompactionOptionsUniversal {
 public:

  // Percentage flexibility while comparing file size. If the candidate file(s)
  // size is 1% smaller than the next file's size, then include the next file
  // into this candidate set.
  // Default: 1
  unsigned int size_ratio;

  // The minimum number of files in a single compaction run. Default: 2
  unsigned int min_merge_width;

  // The maximum number of files in a single compaction run. Default: UINT_MAX
  unsigned int max_merge_width;

  // The size amplification is defined as the amount (in percentage) of
  // additional storage needed to store a single byte of data in the database.
  // For example, a size amplification of 2% means that a database that
  // contains 100 bytes of user-data may occupy up to 102 bytes of
  // physical storage. By this definition, a fully compacted database has
  // a size amplification of 0%. Rocksdb uses the following heuristic
  // to calculate size amplification: it assumes that all files excluding
  // the earliest file contribute to the size amplification.
  // Default: 200, which means that a 100 byte database could require up to
  // 300 bytes of storage.
  unsigned int max_size_amplification_percent;

  // If this option is set to be -1 (the default value), all the output files
  // will follow the compression type specified.
  //
  // If this option is not negative, we will try to make sure the compressed
  // size is just above this value. In normal cases, at least this percentage
  // of data will be compressed.
  // When we are compacting to a new file, here is the criterion for whether
  // it needs to be compressed: assuming here is the list of files sorted
  // by generation time:
  //    A1...An B1...Bm C1...Ct
  // where A1 is the newest and Ct is the oldest, and we are going to compact
  // B1...Bm, we calculate the total size of all the files as total_size, as
  // well as the total size of C1...Ct as total_C; the compaction output file
  // will be compressed iff
  //    total_C / total_size < this percentage
  int compression_size_percent;

  // The algorithm used to stop picking files into a single compaction run
  // Default: kCompactionStopStyleTotalSize
  CompactionStopStyle stop_style;

  // Default set of parameters
  CompactionOptionsUniversal() :
      size_ratio(1),
      min_merge_width(2),
      max_merge_width(UINT_MAX),
      max_size_amplification_percent(200),
      compression_size_percent(-1),
      stop_style(kCompactionStopStyleTotalSize) {
  }
};

} // namespace rocksdb

#endif // STORAGE_ROCKSDB_UNIVERSAL_COMPACTION_OPTIONS_H
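A minimal configuration sketch. This assumes the Options struct at this
revision exposes the usual compaction_style and compaction_options_universal
fields (both are assumptions here, not shown in this header); the function
name and chosen values are illustrative.

#include "rocksdb/options.h"
#include "rocksdb/universal_compaction.h"

rocksdb::Options MakeUniversalOptions() {
  rocksdb::Options options;
  // Assumed Options fields: compaction_style, compaction_options_universal.
  options.compaction_style = rocksdb::kCompactionStyleUniversal;
  options.compaction_options_universal.size_ratio = 1;
  options.compaction_options_universal.min_merge_width = 2;
  // Cap space overhead at 2x the logical data size.
  options.compaction_options_universal.max_size_amplification_percent = 100;
  options.compaction_options_universal.stop_style =
      rocksdb::kCompactionStopStyleTotalSize;
  return options;
}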
109
include/rocksdb/write_batch.h
Normal file
@@ -0,0 +1,109 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// WriteBatch holds a collection of updates to apply atomically to a DB.
//
// The updates are applied in the order in which they are added
// to the WriteBatch. For example, the value of "key" will be "v3"
// after the following batch is written:
//
//    batch.Put("key", "v1");
//    batch.Delete("key");
//    batch.Put("key", "v2");
//    batch.Put("key", "v3");
//
// Multiple threads can invoke const methods on a WriteBatch without
// external synchronization, but if any of the threads may call a
// non-const method, all threads accessing the same WriteBatch must use
// external synchronization.

#ifndef STORAGE_ROCKSDB_INCLUDE_WRITE_BATCH_H_
#define STORAGE_ROCKSDB_INCLUDE_WRITE_BATCH_H_

#include <string>
#include "rocksdb/status.h"

namespace rocksdb {

class Slice;
struct SliceParts;

class WriteBatch {
 public:
  WriteBatch();
  ~WriteBatch();

  // Store the mapping "key->value" in the database.
  void Put(const Slice& key, const Slice& value);

  // Variant of Put() that gathers output like writev(2). The key and value
  // that will be written to the database are concatenations of arrays of
  // slices.
  void Put(const SliceParts& key, const SliceParts& value);

  // Merge "value" with the existing value of "key" in the database.
  // "key->merge(existing, value)"
  void Merge(const Slice& key, const Slice& value);

  // If the database contains a mapping for "key", erase it. Else do nothing.
  void Delete(const Slice& key);

  // Append a blob of arbitrary size to the records in this batch. The blob
  // will be stored in the transaction log but not in any other file. In
  // particular, it will not be persisted to the SST files. When iterating
  // over this WriteBatch, WriteBatch::Handler::LogData will be called with
  // the contents of the blob as it is encountered. Blobs, puts, deletes, and
  // merges will be encountered in the same order in which they were inserted.
  // The blob will NOT consume sequence number(s) and will NOT increase the
  // count of the batch.
  //
  // Example application: add timestamps to the transaction log for use in
  // replication.
  void PutLogData(const Slice& blob);

  // Clear all updates buffered in this batch.
  void Clear();

  // Support for iterating over the contents of a batch.
  class Handler {
   public:
    virtual ~Handler();
    virtual void Put(const Slice& key, const Slice& value) = 0;
    // Merge and LogData are not pure virtual. Otherwise, we would break
    // existing clients of Handler on a source code level. The default
    // implementation of Merge simply throws a runtime exception.
    virtual void Merge(const Slice& key, const Slice& value);
    // The default implementation of LogData does nothing.
    virtual void LogData(const Slice& blob);
    virtual void Delete(const Slice& key) = 0;
    // Continue is called by WriteBatch::Iterate. If it returns false,
    // iteration is halted. Otherwise, it continues iterating. The default
    // implementation always returns true.
    virtual bool Continue();
  };
  Status Iterate(Handler* handler) const;

  // Retrieve the serialized version of this batch.
  std::string Data() { return rep_; }

  // Returns the number of updates in the batch
  int Count() const;

  // Constructor with a serialized string object
  explicit WriteBatch(std::string rep): rep_(rep) {}

 private:
  friend class WriteBatchInternal;

  std::string rep_;  // See comment in write_batch.cc for the format of rep_

  // Intentionally copyable
};

} // namespace rocksdb

#endif // STORAGE_ROCKSDB_INCLUDE_WRITE_BATCH_H_
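To make the Handler contract concrete, here is a small sketch that applies a
batch and then replays it through a counting handler. The handler class,
function name, and variables are invented for the example; the WriteBatch and
Handler calls are exactly those declared above.

#include <cstdio>
#include "rocksdb/db.h"
#include "rocksdb/write_batch.h"

// Counts the operations recorded in a batch by replaying it.
class CountingHandler : public rocksdb::WriteBatch::Handler {
 public:
  void Put(const rocksdb::Slice& key, const rocksdb::Slice& value) override {
    ++puts_;
  }
  void Delete(const rocksdb::Slice& key) override { ++deletes_; }
  int puts_ = 0;
  int deletes_ = 0;
};

void Demo(rocksdb::DB* db) {
  rocksdb::WriteBatch batch;
  batch.Put("key", "v1");
  batch.Delete("key");
  batch.Put("key", "v2");
  db->Write(rocksdb::WriteOptions(), &batch);  // applied atomically;
                                               // status check omitted here

  CountingHandler handler;
  batch.Iterate(&handler);  // replays: Put, Delete, Put
  std::printf("%d puts, %d deletes\n", handler.puts_, handler.deletes_);
}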
161
include/utilities/stackable_db.h
Normal file
@@ -0,0 +1,161 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#pragma once
#include "rocksdb/db.h"

namespace rocksdb {

// This class contains APIs to stack rocksdb wrappers. Eg. Stack TTL over base db
class StackableDB : public DB {
 public:
  explicit StackableDB(StackableDB* sdb) : sdb_(sdb) {}

  // Returns the DB object that is the lowermost component in the stack of DBs
  virtual DB* GetRawDB() {
    return sdb_->GetRawDB();
  }

  // convert a DB to StackableDB
  // TODO: This function does not work yet. Passing nullptr to StackableDB in
  // NewStackableDB's constructor will cause a segfault on the object's usage
  static StackableDB* DBToStackableDB(DB* db) {
    class NewStackableDB : public StackableDB {
     public:
      NewStackableDB(DB* db)
        : StackableDB(nullptr),
          db_(db) {}

      DB* GetRawDB() {
        return db_;
      }

     private:
      DB* db_;
    };
    return new NewStackableDB(db);
  }

  virtual Status Put(const WriteOptions& options,
                     const Slice& key,
                     const Slice& val) override {
    return sdb_->Put(options, key, val);
  }

  virtual Status Get(const ReadOptions& options,
                     const Slice& key,
                     std::string* value) override {
    return sdb_->Get(options, key, value);
  }

  virtual std::vector<Status> MultiGet(const ReadOptions& options,
                                       const std::vector<Slice>& keys,
                                       std::vector<std::string>* values)
      override {
    return sdb_->MultiGet(options, keys, values);
  }

  virtual bool KeyMayExist(const ReadOptions& options,
                           const Slice& key,
                           std::string* value,
                           bool* value_found = nullptr) override {
    return sdb_->KeyMayExist(options, key, value, value_found);
  }

  virtual Status Delete(const WriteOptions& wopts, const Slice& key) override {
    return sdb_->Delete(wopts, key);
  }

  virtual Status Merge(const WriteOptions& options,
                       const Slice& key,
                       const Slice& value) override {
    return sdb_->Merge(options, key, value);
  }

  virtual Status Write(const WriteOptions& opts, WriteBatch* updates)
      override {
    return sdb_->Write(opts, updates);
  }

  virtual Iterator* NewIterator(const ReadOptions& opts) override {
    return sdb_->NewIterator(opts);
  }

  virtual const Snapshot* GetSnapshot() override {
    return sdb_->GetSnapshot();
  }

  virtual void ReleaseSnapshot(const Snapshot* snapshot) override {
    return sdb_->ReleaseSnapshot(snapshot);
  }

  virtual bool GetProperty(const Slice& property, std::string* value)
      override {
    return sdb_->GetProperty(property, value);
  }

  virtual void GetApproximateSizes(const Range* r, int n, uint64_t* sizes)
      override {
    return sdb_->GetApproximateSizes(r, n, sizes);
  }

  virtual void CompactRange(const Slice* begin, const Slice* end,
                            bool reduce_level = false,
                            int target_level = -1) override {
    return sdb_->CompactRange(begin, end, reduce_level, target_level);
  }

  virtual int NumberLevels() override {
    return sdb_->NumberLevels();
  }

  virtual int MaxMemCompactionLevel() override {
    return sdb_->MaxMemCompactionLevel();
  }

  virtual int Level0StopWriteTrigger() override {
    return sdb_->Level0StopWriteTrigger();
  }

  virtual Status Flush(const FlushOptions& fopts) override {
    return sdb_->Flush(fopts);
  }

  virtual Status DisableFileDeletions() override {
    return sdb_->DisableFileDeletions();
  }

  virtual Status EnableFileDeletions() override {
    return sdb_->EnableFileDeletions();
  }

  virtual Status GetLiveFiles(std::vector<std::string>& vec, uint64_t* mfs,
                              bool flush_memtable = true) override {
    return sdb_->GetLiveFiles(vec, mfs, flush_memtable);
  }

  virtual SequenceNumber GetLatestSequenceNumber() const override {
    return sdb_->GetLatestSequenceNumber();
  }

  virtual Status GetSortedWalFiles(VectorLogPtr& files) override {
    return sdb_->GetSortedWalFiles(files);
  }

  virtual Status DeleteFile(std::string name) override {
    return sdb_->DeleteFile(name);
  }

  virtual Status GetUpdatesSince(SequenceNumber seq_number,
                                 unique_ptr<TransactionLogIterator>* iter)
      override {
    return sdb_->GetUpdatesSince(seq_number, iter);
  }

 protected:
  StackableDB* sdb_;
};

} // namespace rocksdb
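As an illustration of the stacking pattern, a hypothetical wrapper that counts
writes before delegating down the stack; the class name and counter are
invented for this sketch, and everything it overrides comes from StackableDB
above.

#include <atomic>
#include <cstdint>
#include "utilities/stackable_db.h"

// Hypothetical wrapper: counts Put() calls, then delegates to the wrapped db.
class CountingDB : public rocksdb::StackableDB {
 public:
  explicit CountingDB(rocksdb::StackableDB* base)
      : rocksdb::StackableDB(base) {}

  virtual rocksdb::Status Put(const rocksdb::WriteOptions& options,
                              const rocksdb::Slice& key,
                              const rocksdb::Slice& val) override {
    ++puts_;
    return sdb_->Put(options, key, val);  // sdb_ is the next DB in the stack
  }

  uint64_t puts() const { return puts_.load(); }

 private:
  std::atomic<uint64_t> puts_{0};
};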
50
include/utilities/utility_db.h
Normal file
@@ -0,0 +1,50 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#pragma once
#include "stackable_db.h"

namespace rocksdb {

// This class contains APIs to open rocksdb with specific support eg. TTL
class UtilityDB {

 public:
  // Open the database with TTL support.
  //
  // USE-CASES:
  // This API should be used to open the db when key-values inserted are
  // meant to be removed from the db in a non-strict 'ttl' amount of time.
  // Therefore, this guarantees that key-values inserted will remain in the
  // db for >= ttl amount of time, and the db will make efforts to remove the
  // key-values as soon as possible after ttl seconds of their insertion.
  //
  // BEHAVIOUR:
  // TTL is accepted in seconds.
  // (int32_t)Timestamp(creation) is suffixed to values in Put internally.
  // Expired TTL values are deleted in compaction only: (Timestamp+ttl < time_now).
  // Get/Iterator may return expired entries (compaction not run on them yet).
  // Different TTLs may be used during different Opens.
  // Example: Open1 at t=0 with ttl=4 and insert k1,k2, close at t=2.
  //          Open2 at t=3 with ttl=5. Now k1,k2 should be deleted at t>=5.
  // read_only=true opens in the usual read-only mode. Compactions will not be
  // triggered (neither manual nor automatic), so no expired entries are removed.
  //
  // CONSTRAINTS:
  // Not specifying/passing or non-positive TTL behaves like TTL = infinity.
  //
  // !!!WARNING!!!:
  // Calling DB::Open directly to re-open a db created by this API will get
  // corrupt values (timestamp suffixed) and there will be no ttl effect
  // during the second Open, so use this API consistently to open the db.
  // Be careful when passing ttl with a small positive value because the
  // whole database may be deleted in a small amount of time.
  static Status OpenTtlDB(const Options& options,
                          const std::string& name,
                          StackableDB** dbptr,
                          int32_t ttl = 0,
                          bool read_only = false);
};

} // namespace rocksdb
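A short usage sketch for the TTL opener; the function name, path, and TTL
value are placeholders, and only the OpenTtlDB signature above plus the
standard Options::create_if_missing flag are relied upon.

#include "utilities/utility_db.h"

// Open a TTL db where entries expire roughly one hour after insertion.
rocksdb::StackableDB* OpenWithTtl() {
  rocksdb::Options options;
  options.create_if_missing = true;
  rocksdb::StackableDB* db = nullptr;
  rocksdb::Status s = rocksdb::UtilityDB::OpenTtlDB(
      options, "/tmp/ttl_db" /* placeholder path */, &db, 3600 /* seconds */);
  if (!s.ok()) return nullptr;  // handle the error in real code
  return db;  // remember: always reopen via OpenTtlDB, never DB::Open
}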