Squashed 'src/ripple/rocksdb/rocksdb/' content from commit 56589ab

git-subtree-dir: src/ripple/rocksdb/rocksdb
git-subtree-split: 56589ab81f6827ff7402e31b24a6d548f29a524f
This commit is contained in:
Vinnie Falco
2013-11-21 16:24:10 -08:00
commit b156a49cff
262 changed files with 63141 additions and 0 deletions

45
include/rocksdb/arena.h Normal file
View File

@@ -0,0 +1,45 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// Arena class defines memory allocation methods. It's used by memtable and
// skiplist.
#ifndef STORAGE_ROCKSDB_INCLUDE_ARENA_H_
#define STORAGE_ROCKSDB_INCLUDE_ARENA_H_
#include <limits>
#include <memory>
namespace rocksdb {
class Arena {
public:
Arena() {};
virtual ~Arena() {};
// Return a pointer to a newly allocated memory block of "bytes" bytes.
virtual char* Allocate(size_t bytes) = 0;
// Allocate memory with the normal alignment guarantees provided by malloc.
virtual char* AllocateAligned(size_t bytes) = 0;
// Returns an estimate of the total memory used by arena.
virtual const size_t ApproximateMemoryUsage() = 0;
// Returns the total number of bytes in all blocks allocated so far.
virtual const size_t MemoryAllocatedBytes() = 0;
private:
// No copying allowed
Arena(const Arena&);
void operator=(const Arena&);
};
} // namespace rocksdb
#endif // STORAGE_ROCKSDB_INCLUDE_ARENA_H_

285
include/rocksdb/c.h Normal file
View File

@@ -0,0 +1,285 @@
/* Copyright (c) 2013, Facebook, Inc. All rights reserved.
This source code is licensed under the BSD-style license found in the
LICENSE file in the root directory of this source tree. An additional grant
of patent rights can be found in the PATENTS file in the same directory.
Copyright (c) 2011 The LevelDB Authors. All rights reserved.
Use of this source code is governed by a BSD-style license that can be
found in the LICENSE file. See the AUTHORS file for names of contributors.
C bindings for leveldb. May be useful as a stable ABI that can be
used by programs that keep leveldb in a shared library, or for
a JNI api.
Does not support:
. getters for the option types
. custom comparators that implement key shortening
. capturing post-write-snapshot
. custom iter, db, env, cache implementations using just the C bindings
Some conventions:
(1) We expose just opaque struct pointers and functions to clients.
This allows us to change internal representations without having to
recompile clients.
(2) For simplicity, there is no equivalent to the Slice type. Instead,
the caller has to pass the pointer and length as separate
arguments.
(3) Errors are represented by a null-terminated c string. NULL
means no error. All operations that can raise an error are passed
a "char** errptr" as the last argument. One of the following must
be true on entry:
*errptr == NULL
*errptr points to a malloc()ed null-terminated error message
On success, a leveldb routine leaves *errptr unchanged.
On failure, leveldb frees the old value of *errptr and
set *errptr to a malloc()ed error message.
(4) Bools have the type unsigned char (0 == false; rest == true)
(5) All of the pointer arguments must be non-NULL.
*/
#ifndef STORAGE_ROCKSDB_INCLUDE_C_H_
#define STORAGE_ROCKSDB_INCLUDE_C_H_
#ifdef __cplusplus
extern "C" {
#endif
#include <stdarg.h>
#include <stddef.h>
#include <stdint.h>
/* Exported types */
typedef struct leveldb_t leveldb_t;
typedef struct leveldb_cache_t leveldb_cache_t;
typedef struct leveldb_comparator_t leveldb_comparator_t;
typedef struct leveldb_env_t leveldb_env_t;
typedef struct leveldb_filelock_t leveldb_filelock_t;
typedef struct leveldb_filterpolicy_t leveldb_filterpolicy_t;
typedef struct leveldb_iterator_t leveldb_iterator_t;
typedef struct leveldb_logger_t leveldb_logger_t;
typedef struct leveldb_options_t leveldb_options_t;
typedef struct leveldb_randomfile_t leveldb_randomfile_t;
typedef struct leveldb_readoptions_t leveldb_readoptions_t;
typedef struct leveldb_seqfile_t leveldb_seqfile_t;
typedef struct leveldb_snapshot_t leveldb_snapshot_t;
typedef struct leveldb_writablefile_t leveldb_writablefile_t;
typedef struct leveldb_writebatch_t leveldb_writebatch_t;
typedef struct leveldb_writeoptions_t leveldb_writeoptions_t;
/* DB operations */
extern leveldb_t* leveldb_open(
const leveldb_options_t* options,
const char* name,
char** errptr);
extern void leveldb_close(leveldb_t* db);
extern void leveldb_put(
leveldb_t* db,
const leveldb_writeoptions_t* options,
const char* key, size_t keylen,
const char* val, size_t vallen,
char** errptr);
extern void leveldb_delete(
leveldb_t* db,
const leveldb_writeoptions_t* options,
const char* key, size_t keylen,
char** errptr);
extern void leveldb_write(
leveldb_t* db,
const leveldb_writeoptions_t* options,
leveldb_writebatch_t* batch,
char** errptr);
/* Returns NULL if not found. A malloc()ed array otherwise.
Stores the length of the array in *vallen. */
extern char* leveldb_get(
leveldb_t* db,
const leveldb_readoptions_t* options,
const char* key, size_t keylen,
size_t* vallen,
char** errptr);
extern leveldb_iterator_t* leveldb_create_iterator(
leveldb_t* db,
const leveldb_readoptions_t* options);
extern const leveldb_snapshot_t* leveldb_create_snapshot(
leveldb_t* db);
extern void leveldb_release_snapshot(
leveldb_t* db,
const leveldb_snapshot_t* snapshot);
/* Returns NULL if property name is unknown.
Else returns a pointer to a malloc()-ed null-terminated value. */
extern char* leveldb_property_value(
leveldb_t* db,
const char* propname);
extern void leveldb_approximate_sizes(
leveldb_t* db,
int num_ranges,
const char* const* range_start_key, const size_t* range_start_key_len,
const char* const* range_limit_key, const size_t* range_limit_key_len,
uint64_t* sizes);
extern void leveldb_compact_range(
leveldb_t* db,
const char* start_key, size_t start_key_len,
const char* limit_key, size_t limit_key_len);
/* Management operations */
extern void leveldb_destroy_db(
const leveldb_options_t* options,
const char* name,
char** errptr);
extern void leveldb_repair_db(
const leveldb_options_t* options,
const char* name,
char** errptr);
/* Iterator */
extern void leveldb_iter_destroy(leveldb_iterator_t*);
extern unsigned char leveldb_iter_valid(const leveldb_iterator_t*);
extern void leveldb_iter_seek_to_first(leveldb_iterator_t*);
extern void leveldb_iter_seek_to_last(leveldb_iterator_t*);
extern void leveldb_iter_seek(leveldb_iterator_t*, const char* k, size_t klen);
extern void leveldb_iter_next(leveldb_iterator_t*);
extern void leveldb_iter_prev(leveldb_iterator_t*);
extern const char* leveldb_iter_key(const leveldb_iterator_t*, size_t* klen);
extern const char* leveldb_iter_value(const leveldb_iterator_t*, size_t* vlen);
extern void leveldb_iter_get_error(const leveldb_iterator_t*, char** errptr);
/* Write batch */
extern leveldb_writebatch_t* leveldb_writebatch_create();
extern void leveldb_writebatch_destroy(leveldb_writebatch_t*);
extern void leveldb_writebatch_clear(leveldb_writebatch_t*);
extern void leveldb_writebatch_put(
leveldb_writebatch_t*,
const char* key, size_t klen,
const char* val, size_t vlen);
extern void leveldb_writebatch_delete(
leveldb_writebatch_t*,
const char* key, size_t klen);
extern void leveldb_writebatch_iterate(
leveldb_writebatch_t*,
void* state,
void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen),
void (*deleted)(void*, const char* k, size_t klen));
/* Options */
extern leveldb_options_t* leveldb_options_create();
extern void leveldb_options_destroy(leveldb_options_t*);
extern void leveldb_options_set_comparator(
leveldb_options_t*,
leveldb_comparator_t*);
extern void leveldb_options_set_compression_per_level(
leveldb_options_t* opt,
int* level_values,
size_t num_levels);
extern void leveldb_options_set_filter_policy(
leveldb_options_t*,
leveldb_filterpolicy_t*);
extern void leveldb_options_set_create_if_missing(
leveldb_options_t*, unsigned char);
extern void leveldb_options_set_error_if_exists(
leveldb_options_t*, unsigned char);
extern void leveldb_options_set_paranoid_checks(
leveldb_options_t*, unsigned char);
extern void leveldb_options_set_env(leveldb_options_t*, leveldb_env_t*);
extern void leveldb_options_set_info_log(leveldb_options_t*, leveldb_logger_t*);
extern void leveldb_options_set_write_buffer_size(leveldb_options_t*, size_t);
extern void leveldb_options_set_max_open_files(leveldb_options_t*, int);
extern void leveldb_options_set_cache(leveldb_options_t*, leveldb_cache_t*);
extern void leveldb_options_set_block_size(leveldb_options_t*, size_t);
extern void leveldb_options_set_block_restart_interval(leveldb_options_t*, int);
extern void leveldb_options_set_compression_options(
leveldb_options_t* opt, int w_bits, int level, int strategy);
enum {
leveldb_no_compression = 0,
leveldb_snappy_compression = 1
};
extern void leveldb_options_set_compression(leveldb_options_t*, int);
/* Comparator */
extern leveldb_comparator_t* leveldb_comparator_create(
void* state,
void (*destructor)(void*),
int (*compare)(
void*,
const char* a, size_t alen,
const char* b, size_t blen),
const char* (*name)(void*));
extern void leveldb_comparator_destroy(leveldb_comparator_t*);
/* Filter policy */
extern leveldb_filterpolicy_t* leveldb_filterpolicy_create(
void* state,
void (*destructor)(void*),
char* (*create_filter)(
void*,
const char* const* key_array, const size_t* key_length_array,
int num_keys,
size_t* filter_length),
unsigned char (*key_may_match)(
void*,
const char* key, size_t length,
const char* filter, size_t filter_length),
const char* (*name)(void*));
extern void leveldb_filterpolicy_destroy(leveldb_filterpolicy_t*);
extern leveldb_filterpolicy_t* leveldb_filterpolicy_create_bloom(
int bits_per_key);
/* Read options */
extern leveldb_readoptions_t* leveldb_readoptions_create();
extern void leveldb_readoptions_destroy(leveldb_readoptions_t*);
extern void leveldb_readoptions_set_verify_checksums(
leveldb_readoptions_t*,
unsigned char);
extern void leveldb_readoptions_set_fill_cache(
leveldb_readoptions_t*, unsigned char);
extern void leveldb_readoptions_set_snapshot(
leveldb_readoptions_t*,
const leveldb_snapshot_t*);
/* Write options */
extern leveldb_writeoptions_t* leveldb_writeoptions_create();
extern void leveldb_writeoptions_destroy(leveldb_writeoptions_t*);
extern void leveldb_writeoptions_set_sync(
leveldb_writeoptions_t*, unsigned char);
/* Cache */
extern leveldb_cache_t* leveldb_cache_create_lru(size_t capacity);
extern void leveldb_cache_destroy(leveldb_cache_t* cache);
/* Env */
extern leveldb_env_t* leveldb_create_default_env();
extern void leveldb_env_destroy(leveldb_env_t*);
#ifdef __cplusplus
} /* end extern "C" */
#endif
#endif /* STORAGE_ROCKSDB_INCLUDE_C_H_ */

122
include/rocksdb/cache.h Normal file
View File

@@ -0,0 +1,122 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// A Cache is an interface that maps keys to values. It has internal
// synchronization and may be safely accessed concurrently from
// multiple threads. It may automatically evict entries to make room
// for new entries. Values have a specified charge against the cache
// capacity. For example, a cache where the values are variable
// length strings, may use the length of the string as the charge for
// the string.
//
// A builtin cache implementation with a least-recently-used eviction
// policy is provided. Clients may use their own implementations if
// they want something more sophisticated (like scan-resistance, a
// custom eviction policy, variable cache sizing, etc.)
#ifndef STORAGE_ROCKSDB_INCLUDE_CACHE_H_
#define STORAGE_ROCKSDB_INCLUDE_CACHE_H_
#include <memory>
#include <stdint.h>
#include "rocksdb/slice.h"
namespace rocksdb {
using std::shared_ptr;
class Cache;
// Create a new cache with a fixed size capacity. The cache is sharded
// to 2^numShardBits shards, by hash of the key. The total capacity
// is divided and evenly assigned to each shard. Inside each shard,
// the eviction is done in two passes: first try to free spaces by
// evicting entries that are among the most least used removeScanCountLimit
// entries and do not have reference other than by the cache itself, in
// the least-used order. If not enough space is freed, further free the
// entries in least used order.
//
// The functions without parameter numShardBits and/or removeScanCountLimit
// use default values. removeScanCountLimit's default value is 0, which
// means a strict LRU order inside each shard.
extern shared_ptr<Cache> NewLRUCache(size_t capacity);
extern shared_ptr<Cache> NewLRUCache(size_t capacity, int numShardBits);
extern shared_ptr<Cache> NewLRUCache(size_t capacity, int numShardBits,
int removeScanCountLimit);
class Cache {
public:
Cache() { }
// Destroys all existing entries by calling the "deleter"
// function that was passed to the constructor.
virtual ~Cache();
// Opaque handle to an entry stored in the cache.
struct Handle { };
// Insert a mapping from key->value into the cache and assign it
// the specified charge against the total cache capacity.
//
// Returns a handle that corresponds to the mapping. The caller
// must call this->Release(handle) when the returned mapping is no
// longer needed.
//
// When the inserted entry is no longer needed, the key and
// value will be passed to "deleter".
virtual Handle* Insert(const Slice& key, void* value, size_t charge,
void (*deleter)(const Slice& key, void* value)) = 0;
// If the cache has no mapping for "key", returns nullptr.
//
// Else return a handle that corresponds to the mapping. The caller
// must call this->Release(handle) when the returned mapping is no
// longer needed.
virtual Handle* Lookup(const Slice& key) = 0;
// Release a mapping returned by a previous Lookup().
// REQUIRES: handle must not have been released yet.
// REQUIRES: handle must have been returned by a method on *this.
virtual void Release(Handle* handle) = 0;
// Return the value encapsulated in a handle returned by a
// successful Lookup().
// REQUIRES: handle must not have been released yet.
// REQUIRES: handle must have been returned by a method on *this.
virtual void* Value(Handle* handle) = 0;
// If the cache contains entry for key, erase it. Note that the
// underlying entry will be kept around until all existing handles
// to it have been released.
virtual void Erase(const Slice& key) = 0;
// Return a new numeric id. May be used by multiple clients who are
// sharing the same cache to partition the key space. Typically the
// client will allocate a new id at startup and prepend the id to
// its cache keys.
virtual uint64_t NewId() = 0;
// returns the maximum configured capacity of the cache
virtual size_t GetCapacity() = 0;
private:
void LRU_Remove(Handle* e);
void LRU_Append(Handle* e);
void Unref(Handle* e);
struct Rep;
Rep* rep_;
// No copying allowed
Cache(const Cache&);
void operator=(const Cache&);
};
} // namespace rocksdb
#endif // STORAGE_ROCKSDB_UTIL_CACHE_H_

View File

@@ -0,0 +1,83 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
// Copyright (c) 2013 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#ifndef STORAGE_ROCKSDB_INCLUDE_COMPACTION_FILTER_H_
#define STORAGE_ROCKSDB_INCLUDE_COMPACTION_FILTER_H_
#include <string>
namespace rocksdb {
class Slice;
// CompactionFilter allows an application to modify/delete a key-value at
// the time of compaction.
class CompactionFilter {
public:
// Context information of a compaction run
struct Context {
// Does this compaction run include all data files
bool is_full_compaction;
};
virtual ~CompactionFilter() {}
// The compaction process invokes this
// method for kv that is being compacted. A return value
// of false indicates that the kv should be preserved in the
// output of this compaction run and a return value of true
// indicates that this key-value should be removed from the
// output of the compaction. The application can inspect
// the existing value of the key and make decision based on it.
//
// When the value is to be preserved, the application has the option
// to modify the existing_value and pass it back through new_value.
// value_changed needs to be set to true in this case.
virtual bool Filter(int level,
const Slice& key,
const Slice& existing_value,
std::string* new_value,
bool* value_changed) const = 0;
// Returns a name that identifies this compaction filter.
// The name will be printed to LOG file on start up for diagnosis.
virtual const char* Name() const = 0;
};
// Each compaction will create a new CompactionFilter allowing the
// application to know about different campactions
class CompactionFilterFactory {
public:
virtual ~CompactionFilterFactory() { };
virtual std::unique_ptr<CompactionFilter> CreateCompactionFilter(
const CompactionFilter::Context& context) = 0;
// Returns a name that identifies this compaction filter factory.
virtual const char* Name() const = 0;
};
// Default implementaion of CompactionFilterFactory which does not
// return any filter
class DefaultCompactionFilterFactory : public CompactionFilterFactory {
public:
virtual std::unique_ptr<CompactionFilter>
CreateCompactionFilter(const CompactionFilter::Context& context) override {
return std::unique_ptr<CompactionFilter>(nullptr);
}
virtual const char* Name() const override {
return "DefaultCompactionFilterFactory";
}
};
} // namespace rocksdb
#endif // STORAGE_ROCKSDB_INCLUDE_COMPACTION_FILTER_H_

View File

@@ -0,0 +1,67 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#ifndef STORAGE_ROCKSDB_INCLUDE_COMPARATOR_H_
#define STORAGE_ROCKSDB_INCLUDE_COMPARATOR_H_
#include <string>
namespace rocksdb {
class Slice;
// A Comparator object provides a total order across slices that are
// used as keys in an sstable or a database. A Comparator implementation
// must be thread-safe since rocksdb may invoke its methods concurrently
// from multiple threads.
class Comparator {
public:
virtual ~Comparator();
// Three-way comparison. Returns value:
// < 0 iff "a" < "b",
// == 0 iff "a" == "b",
// > 0 iff "a" > "b"
virtual int Compare(const Slice& a, const Slice& b) const = 0;
// The name of the comparator. Used to check for comparator
// mismatches (i.e., a DB created with one comparator is
// accessed using a different comparator.
//
// The client of this package should switch to a new name whenever
// the comparator implementation changes in a way that will cause
// the relative ordering of any two keys to change.
//
// Names starting with "rocksdb." are reserved and should not be used
// by any clients of this package.
virtual const char* Name() const = 0;
// Advanced functions: these are used to reduce the space requirements
// for internal data structures like index blocks.
// If *start < limit, changes *start to a short string in [start,limit).
// Simple comparator implementations may return with *start unchanged,
// i.e., an implementation of this method that does nothing is correct.
virtual void FindShortestSeparator(
std::string* start,
const Slice& limit) const = 0;
// Changes *key to a short string >= *key.
// Simple comparator implementations may return with *key unchanged,
// i.e., an implementation of this method that does nothing is correct.
virtual void FindShortSuccessor(std::string* key) const = 0;
};
// Return a builtin comparator that uses lexicographic byte-wise
// ordering. The result remains the property of this module and
// must not be deleted.
extern const Comparator* BytewiseComparator();
} // namespace rocksdb
#endif // STORAGE_ROCKSDB_INCLUDE_COMPARATOR_H_

307
include/rocksdb/db.h Normal file
View File

@@ -0,0 +1,307 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#ifndef STORAGE_ROCKSDB_INCLUDE_DB_H_
#define STORAGE_ROCKSDB_INCLUDE_DB_H_
#include <stdint.h>
#include <stdio.h>
#include <memory>
#include <vector>
#include "rocksdb/iterator.h"
#include "rocksdb/options.h"
#include "rocksdb/types.h"
#include "rocksdb/transaction_log.h"
namespace rocksdb {
using std::unique_ptr;
// Update Makefile if you change these
static const int kMajorVersion = 2;
static const int kMinorVersion = 0;
struct Options;
struct ReadOptions;
struct WriteOptions;
struct FlushOptions;
class WriteBatch;
// Metadata associated with each SST file.
struct LiveFileMetaData {
std::string name; // Name of the file
int level; // Level at which this file resides.
size_t size; // File size in bytes.
std::string smallestkey; // Smallest user defined key in the file.
std::string largestkey; // Largest user defined key in the file.
SequenceNumber smallest_seqno; // smallest seqno in file
SequenceNumber largest_seqno; // largest seqno in file
};
// Abstract handle to particular state of a DB.
// A Snapshot is an immutable object and can therefore be safely
// accessed from multiple threads without any external synchronization.
class Snapshot {
protected:
virtual ~Snapshot();
};
// A range of keys
struct Range {
Slice start; // Included in the range
Slice limit; // Not included in the range
Range() { }
Range(const Slice& s, const Slice& l) : start(s), limit(l) { }
};
// A DB is a persistent ordered map from keys to values.
// A DB is safe for concurrent access from multiple threads without
// any external synchronization.
class DB {
public:
// Open the database with the specified "name".
// Stores a pointer to a heap-allocated database in *dbptr and returns
// OK on success.
// Stores nullptr in *dbptr and returns a non-OK status on error.
// Caller should delete *dbptr when it is no longer needed.
static Status Open(const Options& options,
const std::string& name,
DB** dbptr);
// Open the database for read only. All DB interfaces
// that modify data, like put/delete, will return error.
// If the db is opened in read only mode, then no compactions
// will happen.
static Status OpenForReadOnly(const Options& options,
const std::string& name, DB** dbptr,
bool error_if_log_file_exist = false);
DB() { }
virtual ~DB();
// Set the database entry for "key" to "value".
// Returns OK on success, and a non-OK status on error.
// Note: consider setting options.sync = true.
virtual Status Put(const WriteOptions& options,
const Slice& key,
const Slice& value) = 0;
// Remove the database entry (if any) for "key". Returns OK on
// success, and a non-OK status on error. It is not an error if "key"
// did not exist in the database.
// Note: consider setting options.sync = true.
virtual Status Delete(const WriteOptions& options, const Slice& key) = 0;
// Merge the database entry for "key" with "value". Returns OK on success,
// and a non-OK status on error. The semantics of this operation is
// determined by the user provided merge_operator when opening DB.
// Note: consider setting options.sync = true.
virtual Status Merge(const WriteOptions& options,
const Slice& key,
const Slice& value) = 0;
// Apply the specified updates to the database.
// Returns OK on success, non-OK on failure.
// Note: consider setting options.sync = true.
virtual Status Write(const WriteOptions& options, WriteBatch* updates) = 0;
// If the database contains an entry for "key" store the
// corresponding value in *value and return OK.
//
// If there is no entry for "key" leave *value unchanged and return
// a status for which Status::IsNotFound() returns true.
//
// May return some other Status on an error.
virtual Status Get(const ReadOptions& options,
const Slice& key,
std::string* value) = 0;
// If keys[i] does not exist in the database, then the i'th returned
// status will be one for which Status::IsNotFound() is true, and
// (*values)[i] will be set to some arbitrary value (often ""). Otherwise,
// the i'th returned status will have Status::ok() true, and (*values)[i]
// will store the value associated with keys[i].
//
// (*values) will always be resized to be the same size as (keys).
// Similarly, the number of returned statuses will be the number of keys.
// Note: keys will not be "de-duplicated". Duplicate keys will return
// duplicate values in order.
virtual std::vector<Status> MultiGet(const ReadOptions& options,
const std::vector<Slice>& keys,
std::vector<std::string>* values) = 0;
// If the key definitely does not exist in the database, then this method
// returns false, else true. If the caller wants to obtain value when the key
// is found in memory, a bool for 'value_found' must be passed. 'value_found'
// will be true on return if value has been set properly.
// This check is potentially lighter-weight than invoking DB::Get(). One way
// to make this lighter weight is to avoid doing any IOs.
// Default implementation here returns true and sets 'value_found' to false
virtual bool KeyMayExist(const ReadOptions& options,
const Slice& key,
std::string* value,
bool* value_found = nullptr) {
if (value_found != nullptr) {
*value_found = false;
}
return true;
}
// Return a heap-allocated iterator over the contents of the database.
// The result of NewIterator() is initially invalid (caller must
// call one of the Seek methods on the iterator before using it).
//
// Caller should delete the iterator when it is no longer needed.
// The returned iterator should be deleted before this db is deleted.
virtual Iterator* NewIterator(const ReadOptions& options) = 0;
// Return a handle to the current DB state. Iterators created with
// this handle will all observe a stable snapshot of the current DB
// state. The caller must call ReleaseSnapshot(result) when the
// snapshot is no longer needed.
virtual const Snapshot* GetSnapshot() = 0;
// Release a previously acquired snapshot. The caller must not
// use "snapshot" after this call.
virtual void ReleaseSnapshot(const Snapshot* snapshot) = 0;
// DB implementations can export properties about their state
// via this method. If "property" is a valid property understood by this
// DB implementation, fills "*value" with its current value and returns
// true. Otherwise returns false.
//
//
// Valid property names include:
//
// "rocksdb.num-files-at-level<N>" - return the number of files at level <N>,
// where <N> is an ASCII representation of a level number (e.g. "0").
// "rocksdb.stats" - returns a multi-line string that describes statistics
// about the internal operation of the DB.
// "rocksdb.sstables" - returns a multi-line string that describes all
// of the sstables that make up the db contents.
virtual bool GetProperty(const Slice& property, std::string* value) = 0;
// For each i in [0,n-1], store in "sizes[i]", the approximate
// file system space used by keys in "[range[i].start .. range[i].limit)".
//
// Note that the returned sizes measure file system space usage, so
// if the user data compresses by a factor of ten, the returned
// sizes will be one-tenth the size of the corresponding user data size.
//
// The results may not include the sizes of recently written data.
virtual void GetApproximateSizes(const Range* range, int n,
uint64_t* sizes) = 0;
// Compact the underlying storage for the key range [*begin,*end].
// In particular, deleted and overwritten versions are discarded,
// and the data is rearranged to reduce the cost of operations
// needed to access the data. This operation should typically only
// be invoked by users who understand the underlying implementation.
//
// begin==nullptr is treated as a key before all keys in the database.
// end==nullptr is treated as a key after all keys in the database.
// Therefore the following call will compact the entire database:
// db->CompactRange(nullptr, nullptr);
// Note that after the entire database is compacted, all data are pushed
// down to the last level containing any data. If the total data size
// after compaction is reduced, that level might not be appropriate for
// hosting all the files. In this case, client could set reduce_level
// to true, to move the files back to the minimum level capable of holding
// the data set or a given level (specified by non-negative target_level).
virtual void CompactRange(const Slice* begin, const Slice* end,
bool reduce_level = false,
int target_level = -1) = 0;
// Number of levels used for this DB.
virtual int NumberLevels() = 0;
// Maximum level to which a new compacted memtable is pushed if it
// does not create overlap.
virtual int MaxMemCompactionLevel() = 0;
// Number of files in level-0 that would stop writes.
virtual int Level0StopWriteTrigger() = 0;
// Flush all mem-table data.
virtual Status Flush(const FlushOptions& options) = 0;
// Prevent file deletions. Compactions will continue to occur,
// but no obsolete files will be deleted. Calling this multiple
// times have the same effect as calling it once.
virtual Status DisableFileDeletions() = 0;
// Allow compactions to delete obselete files.
virtual Status EnableFileDeletions() = 0;
// GetLiveFiles followed by GetSortedWalFiles can generate a lossless backup
// THIS METHOD IS DEPRECATED. Use the GetTableMetaData to get more
// detailed information on the live files.
// Retrieve the list of all files in the database. The files are
// relative to the dbname and are not absolute paths. The valid size of the
// manifest file is returned in manifest_file_size. The manifest file is an
// ever growing file, but only the portion specified by manifest_file_size is
// valid for this snapshot.
// Setting flush_memtable to true does Flush before recording the live files.
// Setting flush_memtable to false is useful when we don't want to wait for
// flush which may have to wait for compaction to complete taking an
// indeterminate time. But this will have to use GetSortedWalFiles after
// GetLiveFiles to compensate for memtables missed in this snapshot due to the
// absence of Flush, by WAL files to recover the database consistently later
virtual Status GetLiveFiles(std::vector<std::string>&,
uint64_t* manifest_file_size,
bool flush_memtable = true) = 0;
// Retrieve the sorted list of all wal files with earliest file first
virtual Status GetSortedWalFiles(VectorLogPtr& files) = 0;
// The sequence number of the most recent transaction.
virtual SequenceNumber GetLatestSequenceNumber() const = 0;
// Sets iter to an iterator that is positioned at a write-batch containing
// seq_number. If the sequence number is non existent, it returns an iterator
// at the first available seq_no after the requested seq_no
// Returns Status::Ok if iterator is valid
// Must set WAL_ttl_seconds or WAL_size_limit_MB to large values to
// use this api, else the WAL files will get
// cleared aggressively and the iterator might keep getting invalid before
// an update is read.
virtual Status GetUpdatesSince(SequenceNumber seq_number,
unique_ptr<TransactionLogIterator>* iter) = 0;
// Delete the file name from the db directory and update the internal state to
// reflect that. Supports deletion of sst and log files only. 'name' must be
// path relative to the db directory. eg. 000001.sst, /archive/000003.log
virtual Status DeleteFile(std::string name) = 0;
// Returns a list of all table files with their level, start key
// and end key
virtual void GetLiveFilesMetaData(
std::vector<LiveFileMetaData> *metadata) {
}
private:
// No copying allowed
DB(const DB&);
void operator=(const DB&);
};
// Destroy the contents of the specified database.
// Be very careful using this method.
Status DestroyDB(const std::string& name, const Options& options);
// If a DB cannot be opened, you may attempt to call this method to
// resurrect as much of the contents of the database as possible.
// Some data may be lost, so be careful when calling this function
// on a database that contains important information.
Status RepairDB(const std::string& dbname, const Options& options);
} // namespace rocksdb
#endif // STORAGE_ROCKSDB_INCLUDE_DB_H_

649
include/rocksdb/env.h Normal file
View File

@@ -0,0 +1,649 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// An Env is an interface used by the rocksdb implementation to access
// operating system functionality like the filesystem etc. Callers
// may wish to provide a custom Env object when opening a database to
// get fine gain control; e.g., to rate limit file system operations.
//
// All Env implementations are safe for concurrent access from
// multiple threads without any external synchronization.
#ifndef STORAGE_ROCKSDB_INCLUDE_ENV_H_
#define STORAGE_ROCKSDB_INCLUDE_ENV_H_
#include <cstdarg>
#include <string>
#include <memory>
#include <vector>
#include <stdint.h>
#include "rocksdb/status.h"
namespace rocksdb {
class FileLock;
class Logger;
class RandomAccessFile;
class SequentialFile;
class Slice;
class WritableFile;
class RandomRWFile;
struct Options;
using std::unique_ptr;
using std::shared_ptr;
// Options while opening a file to read/write
struct EnvOptions {
// construct with default Options
EnvOptions();
// construct from Options
explicit EnvOptions(const Options& options);
// If true, then allow caching of data in environment buffers
bool use_os_buffer = true;
// If true, then use mmap to read data
bool use_mmap_reads = false;
// If true, then use mmap to write data
bool use_mmap_writes = true;
// If true, set the FD_CLOEXEC on open fd.
bool set_fd_cloexec= true;
// Allows OS to incrementally sync files to disk while they are being
// written, in the background. Issue one request for every bytes_per_sync
// written. 0 turns it off.
// Default: 0
uint64_t bytes_per_sync = 0;
};
class Env {
public:
Env() { }
virtual ~Env();
// Return a default environment suitable for the current operating
// system. Sophisticated users may wish to provide their own Env
// implementation instead of relying on this default environment.
//
// The result of Default() belongs to rocksdb and must never be deleted.
static Env* Default();
// Create a brand new sequentially-readable file with the specified name.
// On success, stores a pointer to the new file in *result and returns OK.
// On failure stores nullptr in *result and returns non-OK. If the file does
// not exist, returns a non-OK status.
//
// The returned file will only be accessed by one thread at a time.
virtual Status NewSequentialFile(const std::string& fname,
unique_ptr<SequentialFile>* result,
const EnvOptions& options)
= 0;
// Create a brand new random access read-only file with the
// specified name. On success, stores a pointer to the new file in
// *result and returns OK. On failure stores nullptr in *result and
// returns non-OK. If the file does not exist, returns a non-OK
// status.
//
// The returned file may be concurrently accessed by multiple threads.
virtual Status NewRandomAccessFile(const std::string& fname,
unique_ptr<RandomAccessFile>* result,
const EnvOptions& options)
= 0;
// Create an object that writes to a new file with the specified
// name. Deletes any existing file with the same name and creates a
// new file. On success, stores a pointer to the new file in
// *result and returns OK. On failure stores nullptr in *result and
// returns non-OK.
//
// The returned file will only be accessed by one thread at a time.
virtual Status NewWritableFile(const std::string& fname,
unique_ptr<WritableFile>* result,
const EnvOptions& options) = 0;
// Create an object that both reads and writes to a file on
// specified offsets (random access). If file already exists,
// does not overwrite it. On success, stores a pointer to the
// new file in *result and returns OK. On failure stores nullptr
// in *result and returns non-OK.
virtual Status NewRandomRWFile(const std::string& fname,
unique_ptr<RandomRWFile>* result,
const EnvOptions& options) = 0;
// Returns true iff the named file exists.
virtual bool FileExists(const std::string& fname) = 0;
// Store in *result the names of the children of the specified directory.
// The names are relative to "dir".
// Original contents of *results are dropped.
virtual Status GetChildren(const std::string& dir,
std::vector<std::string>* result) = 0;
// Delete the named file.
virtual Status DeleteFile(const std::string& fname) = 0;
// Create the specified directory. Returns error if directory exists.
virtual Status CreateDir(const std::string& dirname) = 0;
// Creates directory if missing. Return Ok if it exists, or successful in
// Creating.
virtual Status CreateDirIfMissing(const std::string& dirname) = 0;
// Delete the specified directory.
virtual Status DeleteDir(const std::string& dirname) = 0;
// Store the size of fname in *file_size.
virtual Status GetFileSize(const std::string& fname, uint64_t* file_size) = 0;
// Store the last modification time of fname in *file_mtime.
virtual Status GetFileModificationTime(const std::string& fname,
uint64_t* file_mtime) = 0;
// Rename file src to target.
virtual Status RenameFile(const std::string& src,
const std::string& target) = 0;
// Lock the specified file. Used to prevent concurrent access to
// the same db by multiple processes. On failure, stores nullptr in
// *lock and returns non-OK.
//
// On success, stores a pointer to the object that represents the
// acquired lock in *lock and returns OK. The caller should call
// UnlockFile(*lock) to release the lock. If the process exits,
// the lock will be automatically released.
//
// If somebody else already holds the lock, finishes immediately
// with a failure. I.e., this call does not wait for existing locks
// to go away.
//
// May create the named file if it does not already exist.
virtual Status LockFile(const std::string& fname, FileLock** lock) = 0;
// Release the lock acquired by a previous successful call to LockFile.
// REQUIRES: lock was returned by a successful LockFile() call
// REQUIRES: lock has not already been unlocked.
virtual Status UnlockFile(FileLock* lock) = 0;
enum Priority { LOW, HIGH, TOTAL };
// Arrange to run "(*function)(arg)" once in a background thread, in
// the thread pool specified by pri. By default, jobs go to the 'LOW'
// priority thread pool.
// "function" may run in an unspecified thread. Multiple functions
// added to the same Env may run concurrently in different threads.
// I.e., the caller may not assume that background work items are
// serialized.
virtual void Schedule(
void (*function)(void* arg),
void* arg,
Priority pri = LOW) = 0;
// Start a new thread, invoking "function(arg)" within the new thread.
// When "function(arg)" returns, the thread will be destroyed.
virtual void StartThread(void (*function)(void* arg), void* arg) = 0;
// *path is set to a temporary directory that can be used for testing. It may
// or many not have just been created. The directory may or may not differ
// between runs of the same process, but subsequent calls will return the
// same directory.
virtual Status GetTestDirectory(std::string* path) = 0;
// Create and return a log file for storing informational messages.
virtual Status NewLogger(const std::string& fname,
shared_ptr<Logger>* result) = 0;
// Returns the number of micro-seconds since some fixed point in time. Only
// useful for computing deltas of time.
virtual uint64_t NowMicros() = 0;
// Returns the number of nano-seconds since some fixed point in time. Only
// useful for computing deltas of time in one run.
// Default implementation simply relies on NowMicros
virtual uint64_t NowNanos() {
return NowMicros() * 1000;
}
// Sleep/delay the thread for the perscribed number of micro-seconds.
virtual void SleepForMicroseconds(int micros) = 0;
// Get the current host name.
virtual Status GetHostName(char* name, uint64_t len) = 0;
// Get the number of seconds since the Epoch, 1970-01-01 00:00:00 (UTC).
virtual Status GetCurrentTime(int64_t* unix_time) = 0;
// Get full directory name for this db.
virtual Status GetAbsolutePath(const std::string& db_path,
std::string* output_path) = 0;
// The number of background worker threads of a specific thread pool
// for this environment. 'LOW' is the default pool.
// default number: 1
virtual void SetBackgroundThreads(int number, Priority pri = LOW) = 0;
// Converts seconds-since-Jan-01-1970 to a printable string
virtual std::string TimeToString(uint64_t time) = 0;
// Generates a unique id that can be used to identify a db
virtual std::string GenerateUniqueId();
private:
// No copying allowed
Env(const Env&);
void operator=(const Env&);
};
// A file abstraction for reading sequentially through a file
class SequentialFile {
public:
SequentialFile() { }
virtual ~SequentialFile();
// Read up to "n" bytes from the file. "scratch[0..n-1]" may be
// written by this routine. Sets "*result" to the data that was
// read (including if fewer than "n" bytes were successfully read).
// May set "*result" to point at data in "scratch[0..n-1]", so
// "scratch[0..n-1]" must be live when "*result" is used.
// If an error was encountered, returns a non-OK status.
//
// REQUIRES: External synchronization
virtual Status Read(size_t n, Slice* result, char* scratch) = 0;
// Skip "n" bytes from the file. This is guaranteed to be no
// slower that reading the same data, but may be faster.
//
// If end of file is reached, skipping will stop at the end of the
// file, and Skip will return OK.
//
// REQUIRES: External synchronization
virtual Status Skip(uint64_t n) = 0;
// Remove any kind of caching of data from the offset to offset+length
// of this file. If the length is 0, then it refers to the end of file.
// If the system is not caching the file contents, then this is a noop.
virtual Status InvalidateCache(size_t offset, size_t length) {
return Status::NotSupported("InvalidateCache not supported.");
}
};
// A file abstraction for randomly reading the contents of a file.
class RandomAccessFile {
public:
RandomAccessFile() { }
virtual ~RandomAccessFile();
// Read up to "n" bytes from the file starting at "offset".
// "scratch[0..n-1]" may be written by this routine. Sets "*result"
// to the data that was read (including if fewer than "n" bytes were
// successfully read). May set "*result" to point at data in
// "scratch[0..n-1]", so "scratch[0..n-1]" must be live when
// "*result" is used. If an error was encountered, returns a non-OK
// status.
//
// Safe for concurrent use by multiple threads.
virtual Status Read(uint64_t offset, size_t n, Slice* result,
char* scratch) const = 0;
// Tries to get an unique ID for this file that will be the same each time
// the file is opened (and will stay the same while the file is open).
// Furthermore, it tries to make this ID at most "max_size" bytes. If such an
// ID can be created this function returns the length of the ID and places it
// in "id"; otherwise, this function returns 0, in which case "id"
// may not have been modified.
//
// This function guarantees, for IDs from a given environment, two unique ids
// cannot be made equal to eachother by adding arbitrary bytes to one of
// them. That is, no unique ID is the prefix of another.
//
// This function guarantees that the returned ID will not be interpretable as
// a single varint.
//
// Note: these IDs are only valid for the duration of the process.
virtual size_t GetUniqueId(char* id, size_t max_size) const {
return 0; // Default implementation to prevent issues with backwards
// compatibility.
};
enum AccessPattern { NORMAL, RANDOM, SEQUENTIAL, WILLNEED, DONTNEED };
virtual void Hint(AccessPattern pattern) {}
// Remove any kind of caching of data from the offset to offset+length
// of this file. If the length is 0, then it refers to the end of file.
// If the system is not caching the file contents, then this is a noop.
virtual Status InvalidateCache(size_t offset, size_t length) {
return Status::NotSupported("InvalidateCache not supported.");
}
};
// A file abstraction for sequential writing. The implementation
// must provide buffering since callers may append small fragments
// at a time to the file.
class WritableFile {
public:
WritableFile() : last_preallocated_block_(0), preallocation_block_size_ (0) {
}
virtual ~WritableFile();
virtual Status Append(const Slice& data) = 0;
virtual Status Close() = 0;
virtual Status Flush() = 0;
virtual Status Sync() = 0; // sync data
/*
* Sync data and/or metadata as well.
* By default, sync only data.
* Override this method for environments where we need to sync
* metadata as well.
*/
virtual Status Fsync() {
return Sync();
}
/*
* Get the size of valid data in the file.
*/
virtual uint64_t GetFileSize() {
return 0;
}
/*
* Get and set the default pre-allocation block size for writes to
* this file. If non-zero, then Allocate will be used to extend the
* underlying storage of a file (generally via fallocate) if the Env
* instance supports it.
*/
void SetPreallocationBlockSize(size_t size) {
preallocation_block_size_ = size;
}
virtual void GetPreallocationStatus(size_t* block_size,
size_t* last_allocated_block) {
*last_allocated_block = last_preallocated_block_;
*block_size = preallocation_block_size_;
}
// For documentation, refer to RandomAccessFile::GetUniqueId()
virtual size_t GetUniqueId(char* id, size_t max_size) const {
return 0; // Default implementation to prevent issues with backwards
}
// Remove any kind of caching of data from the offset to offset+length
// of this file. If the length is 0, then it refers to the end of file.
// If the system is not caching the file contents, then this is a noop.
// This call has no effect on dirty pages in the cache.
virtual Status InvalidateCache(size_t offset, size_t length) {
return Status::NotSupported("InvalidateCache not supported.");
}
protected:
// PrepareWrite performs any necessary preparation for a write
// before the write actually occurs. This allows for pre-allocation
// of space on devices where it can result in less file
// fragmentation and/or less waste from over-zealous filesystem
// pre-allocation.
void PrepareWrite(size_t offset, size_t len) {
if (preallocation_block_size_ == 0) {
return;
}
// If this write would cross one or more preallocation blocks,
// determine what the last preallocation block necesessary to
// cover this write would be and Allocate to that point.
const auto block_size = preallocation_block_size_;
size_t new_last_preallocated_block =
(offset + len + block_size - 1) / block_size;
if (new_last_preallocated_block > last_preallocated_block_) {
size_t num_spanned_blocks =
new_last_preallocated_block - last_preallocated_block_;
Allocate(block_size * last_preallocated_block_,
block_size * num_spanned_blocks);
last_preallocated_block_ = new_last_preallocated_block;
}
}
/*
* Pre-allocate space for a file.
*/
virtual Status Allocate(off_t offset, off_t len) {
return Status::OK();
}
// Sync a file range with disk.
// offset is the starting byte of the file range to be synchronized.
// nbytes specifies the length of the range to be synchronized.
// This asks the OS to initiate flushing the cached data to disk,
// without waiting for completion.
// Default implementation does nothing.
virtual Status RangeSync(off_t offset, off_t nbytes) {
return Status::OK();
}
private:
size_t last_preallocated_block_;
size_t preallocation_block_size_;
// No copying allowed
WritableFile(const WritableFile&);
void operator=(const WritableFile&);
};
// A file abstraction for random reading and writing.
class RandomRWFile {
public:
RandomRWFile() {}
virtual ~RandomRWFile() {}
// Write data from Slice data to file starting from offset
// Returns IOError on failure, but does not guarantee
// atomicity of a write. Returns OK status on success.
//
// Safe for concurrent use.
virtual Status Write(uint64_t offset, const Slice& data) = 0;
// Read up to "n" bytes from the file starting at "offset".
// "scratch[0..n-1]" may be written by this routine. Sets "*result"
// to the data that was read (including if fewer than "n" bytes were
// successfully read). May set "*result" to point at data in
// "scratch[0..n-1]", so "scratch[0..n-1]" must be live when
// "*result" is used. If an error was encountered, returns a non-OK
// status.
//
// Safe for concurrent use by multiple threads.
virtual Status Read(uint64_t offset, size_t n, Slice* result,
char* scratch) const = 0;
virtual Status Close() = 0; // closes the file
virtual Status Sync() = 0; // sync data
/*
* Sync data and/or metadata as well.
* By default, sync only data.
* Override this method for environments where we need to sync
* metadata as well.
*/
virtual Status Fsync() {
return Sync();
}
/*
* Pre-allocate space for a file.
*/
virtual Status Allocate(off_t offset, off_t len) {
return Status::OK();
}
private:
// No copying allowed
RandomRWFile(const RandomRWFile&);
void operator=(const RandomRWFile&);
};
// An interface for writing log messages.
class Logger {
public:
enum { DO_NOT_SUPPORT_GET_LOG_FILE_SIZE = -1 };
Logger() { }
virtual ~Logger();
// Write an entry to the log file with the specified format.
virtual void Logv(const char* format, va_list ap) = 0;
virtual size_t GetLogFileSize() const {
return DO_NOT_SUPPORT_GET_LOG_FILE_SIZE;
}
// Flush to the OS buffers
virtual void Flush() {}
private:
// No copying allowed
Logger(const Logger&);
void operator=(const Logger&);
};
// Identifies a locked file.
class FileLock {
public:
FileLock() { }
virtual ~FileLock();
private:
// No copying allowed
FileLock(const FileLock&);
void operator=(const FileLock&);
};
extern void LogFlush(const shared_ptr<Logger>& info_log);
// Log the specified data to *info_log if info_log is non-nullptr.
extern void Log(const shared_ptr<Logger>& info_log, const char* format, ...)
# if defined(__GNUC__) || defined(__clang__)
__attribute__((__format__ (__printf__, 2, 3)))
# endif
;
extern void LogFlush(Logger *info_log);
extern void Log(Logger* info_log, const char* format, ...)
# if defined(__GNUC__) || defined(__clang__)
__attribute__((__format__ (__printf__, 2, 3)))
# endif
;
// A utility routine: write "data" to the named file.
extern Status WriteStringToFile(Env* env, const Slice& data,
const std::string& fname);
// A utility routine: read contents of named file into *data
extern Status ReadFileToString(Env* env, const std::string& fname,
std::string* data);
// An implementation of Env that forwards all calls to another Env.
// May be useful to clients who wish to override just part of the
// functionality of another Env.
class EnvWrapper : public Env {
public:
// Initialize an EnvWrapper that delegates all calls to *t
explicit EnvWrapper(Env* t) : target_(t) { }
virtual ~EnvWrapper();
// Return the target to which this Env forwards all calls
Env* target() const { return target_; }
// The following text is boilerplate that forwards all methods to target()
Status NewSequentialFile(const std::string& f,
unique_ptr<SequentialFile>* r,
const EnvOptions& options) {
return target_->NewSequentialFile(f, r, options);
}
Status NewRandomAccessFile(const std::string& f,
unique_ptr<RandomAccessFile>* r,
const EnvOptions& options) {
return target_->NewRandomAccessFile(f, r, options);
}
Status NewWritableFile(const std::string& f, unique_ptr<WritableFile>* r,
const EnvOptions& options) {
return target_->NewWritableFile(f, r, options);
}
Status NewRandomRWFile(const std::string& f, unique_ptr<RandomRWFile>* r,
const EnvOptions& options) {
return target_->NewRandomRWFile(f, r, options);
}
bool FileExists(const std::string& f) { return target_->FileExists(f); }
Status GetChildren(const std::string& dir, std::vector<std::string>* r) {
return target_->GetChildren(dir, r);
}
Status DeleteFile(const std::string& f) { return target_->DeleteFile(f); }
Status CreateDir(const std::string& d) { return target_->CreateDir(d); }
Status CreateDirIfMissing(const std::string& d) {
return target_->CreateDirIfMissing(d);
}
Status DeleteDir(const std::string& d) { return target_->DeleteDir(d); }
Status GetFileSize(const std::string& f, uint64_t* s) {
return target_->GetFileSize(f, s);
}
Status GetFileModificationTime(const std::string& fname,
uint64_t* file_mtime) {
return target_->GetFileModificationTime(fname, file_mtime);
}
Status RenameFile(const std::string& s, const std::string& t) {
return target_->RenameFile(s, t);
}
Status LockFile(const std::string& f, FileLock** l) {
return target_->LockFile(f, l);
}
Status UnlockFile(FileLock* l) { return target_->UnlockFile(l); }
void Schedule(void (*f)(void*), void* a, Priority pri) {
return target_->Schedule(f, a, pri);
}
void StartThread(void (*f)(void*), void* a) {
return target_->StartThread(f, a);
}
virtual Status GetTestDirectory(std::string* path) {
return target_->GetTestDirectory(path);
}
virtual Status NewLogger(const std::string& fname,
shared_ptr<Logger>* result) {
return target_->NewLogger(fname, result);
}
uint64_t NowMicros() {
return target_->NowMicros();
}
void SleepForMicroseconds(int micros) {
target_->SleepForMicroseconds(micros);
}
Status GetHostName(char* name, uint64_t len) {
return target_->GetHostName(name, len);
}
Status GetCurrentTime(int64_t* unix_time) {
return target_->GetCurrentTime(unix_time);
}
Status GetAbsolutePath(const std::string& db_path,
std::string* output_path) {
return target_->GetAbsolutePath(db_path, output_path);
}
void SetBackgroundThreads(int num, Priority pri) {
return target_->SetBackgroundThreads(num, pri);
}
std::string TimeToString(uint64_t time) {
return target_->TimeToString(time);
}
private:
Env* target_;
};
} // namespace rocksdb
#endif // STORAGE_ROCKSDB_INCLUDE_ENV_H_

View File

@@ -0,0 +1,74 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// A database can be configured with a custom FilterPolicy object.
// This object is responsible for creating a small filter from a set
// of keys. These filters are stored in rocksdb and are consulted
// automatically by rocksdb to decide whether or not to read some
// information from disk. In many cases, a filter can cut down the
// number of disk seeks form a handful to a single disk seek per
// DB::Get() call.
//
// Most people will want to use the builtin bloom filter support (see
// NewBloomFilterPolicy() below).
#ifndef STORAGE_ROCKSDB_INCLUDE_FILTER_POLICY_H_
#define STORAGE_ROCKSDB_INCLUDE_FILTER_POLICY_H_
#include <string>
namespace rocksdb {
class Slice;
class FilterPolicy {
public:
virtual ~FilterPolicy();
// Return the name of this policy. Note that if the filter encoding
// changes in an incompatible way, the name returned by this method
// must be changed. Otherwise, old incompatible filters may be
// passed to methods of this type.
virtual const char* Name() const = 0;
// keys[0,n-1] contains a list of keys (potentially with duplicates)
// that are ordered according to the user supplied comparator.
// Append a filter that summarizes keys[0,n-1] to *dst.
//
// Warning: do not change the initial contents of *dst. Instead,
// append the newly constructed filter to *dst.
virtual void CreateFilter(const Slice* keys, int n, std::string* dst)
const = 0;
// "filter" contains the data appended by a preceding call to
// CreateFilter() on this class. This method must return true if
// the key was in the list of keys passed to CreateFilter().
// This method may return true or false if the key was not on the
// list, but it should aim to return false with a high probability.
virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const = 0;
};
// Return a new filter policy that uses a bloom filter with approximately
// the specified number of bits per key. A good value for bits_per_key
// is 10, which yields a filter with ~ 1% false positive rate.
//
// Callers must delete the result after any database that is using the
// result has been closed.
//
// Note: if you are using a custom comparator that ignores some parts
// of the keys being compared, you must not use NewBloomFilterPolicy()
// and must provide your own FilterPolicy that also ignores the
// corresponding parts of the keys. For example, if the comparator
// ignores trailing spaces, it would be incorrect to use a
// FilterPolicy (like NewBloomFilterPolicy) that does not ignore
// trailing spaces in keys.
extern const FilterPolicy* NewBloomFilterPolicy(int bits_per_key);
}
#endif // STORAGE_ROCKSDB_INCLUDE_FILTER_POLICY_H_

View File

@@ -0,0 +1,64 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#pragma once
#include <string>
namespace rocksdb {
class Slice;
class BlockBuilder;
// FlushBlockPolicy provides a configurable way to determine when to flush a
// block in the block based tables,
class FlushBlockPolicy {
public:
// Keep track of the key/value sequences and return the boolean value to
// determine if table builder should flush current data block.
virtual bool Update(const Slice& key,
const Slice& value) = 0;
virtual ~FlushBlockPolicy() { }
};
class FlushBlockPolicyFactory {
public:
// Return the name of the flush block policy.
virtual const char* Name() const = 0;
// Return a new block flush policy that flushes data blocks by data size.
// FlushBlockPolicy may need to access the metadata of the data block
// builder to determine when to flush the blocks.
//
// Callers must delete the result after any database that is using the
// result has been closed.
virtual FlushBlockPolicy* NewFlushBlockPolicy(
const BlockBuilder& data_block_builder) const = 0;
virtual ~FlushBlockPolicyFactory() { }
};
class FlushBlockBySizePolicyFactory : public FlushBlockPolicyFactory {
public:
FlushBlockBySizePolicyFactory(const uint64_t block_size,
const uint64_t block_size_deviation) :
block_size_(block_size),
block_size_deviation_(block_size_deviation) {
}
virtual const char* Name() const override {
return "FlushBlockBySizePolicyFactory";
}
virtual FlushBlockPolicy* NewFlushBlockPolicy(
const BlockBuilder& data_block_builder) const override;
private:
const uint64_t block_size_;
const uint64_t block_size_deviation_;
};
} // rocksdb

106
include/rocksdb/iterator.h Normal file
View File

@@ -0,0 +1,106 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// An iterator yields a sequence of key/value pairs from a source.
// The following class defines the interface. Multiple implementations
// are provided by this library. In particular, iterators are provided
// to access the contents of a Table or a DB.
//
// Multiple threads can invoke const methods on an Iterator without
// external synchronization, but if any of the threads may call a
// non-const method, all threads accessing the same Iterator must use
// external synchronization.
#ifndef STORAGE_ROCKSDB_INCLUDE_ITERATOR_H_
#define STORAGE_ROCKSDB_INCLUDE_ITERATOR_H_
#include "rocksdb/slice.h"
#include "rocksdb/status.h"
namespace rocksdb {
class Iterator {
public:
Iterator();
virtual ~Iterator();
// An iterator is either positioned at a key/value pair, or
// not valid. This method returns true iff the iterator is valid.
virtual bool Valid() const = 0;
// Position at the first key in the source. The iterator is Valid()
// after this call iff the source is not empty.
virtual void SeekToFirst() = 0;
// Position at the last key in the source. The iterator is
// Valid() after this call iff the source is not empty.
virtual void SeekToLast() = 0;
// Position at the first key in the source that at or past target
// The iterator is Valid() after this call iff the source contains
// an entry that comes at or past target.
virtual void Seek(const Slice& target) = 0;
// Moves to the next entry in the source. After this call, Valid() is
// true iff the iterator was not positioned at the last entry in the source.
// REQUIRES: Valid()
virtual void Next() = 0;
// Moves to the previous entry in the source. After this call, Valid() is
// true iff the iterator was not positioned at the first entry in source.
// REQUIRES: Valid()
virtual void Prev() = 0;
// Return the key for the current entry. The underlying storage for
// the returned slice is valid only until the next modification of
// the iterator.
// REQUIRES: Valid()
virtual Slice key() const = 0;
// Return the value for the current entry. The underlying storage for
// the returned slice is valid only until the next modification of
// the iterator.
// REQUIRES: !AtEnd() && !AtStart()
virtual Slice value() const = 0;
// If an error has occurred, return it. Else return an ok status.
// If non-blocking IO is requested and this operation cannot be
// satisfied without doing some IO, then this returns Status::Incomplete().
virtual Status status() const = 0;
// Clients are allowed to register function/arg1/arg2 triples that
// will be invoked when this iterator is destroyed.
//
// Note that unlike all of the preceding methods, this method is
// not abstract and therefore clients should not override it.
typedef void (*CleanupFunction)(void* arg1, void* arg2);
void RegisterCleanup(CleanupFunction function, void* arg1, void* arg2);
private:
struct Cleanup {
CleanupFunction function;
void* arg1;
void* arg2;
Cleanup* next;
};
Cleanup cleanup_;
// No copying allowed
Iterator(const Iterator&);
void operator=(const Iterator&);
};
// Return an empty iterator (yields nothing).
extern Iterator* NewEmptyIterator();
// Return an empty iterator with the specified status.
extern Iterator* NewErrorIterator(const Status& status);
} // namespace rocksdb
#endif // STORAGE_ROCKSDB_INCLUDE_ITERATOR_H_

View File

@@ -0,0 +1,18 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#ifndef STORAGE_ROCKSDB_INCLUDE_LDB_TOOL_H
#define STORAGE_ROCKSDB_INCLUDE_LDB_TOOL_H
#include "rocksdb/options.h"
namespace rocksdb {
class LDBTool {
public:
void Run(int argc, char** argv, Options = Options());
};
} // namespace rocksdb
#endif // STORAGE_ROCKSDB_INCLUDE_LDB_TOOL_H

View File

@@ -0,0 +1,276 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// This file contains the interface that must be implemented by any collection
// to be used as the backing store for a MemTable. Such a collection must
// satisfy the following properties:
// (1) It does not store duplicate items.
// (2) It uses MemTableRep::KeyComparator to compare items for iteration and
// equality.
// (3) It can be accessed concurrently by multiple readers and can support
// during reads. However, it needn't support multiple concurrent writes.
// (4) Items are never deleted.
// The liberal use of assertions is encouraged to enforce (1).
//
// The factory will be passed an Arena object when a new MemTableRep is
// requested. The API for this object is in rocksdb/arena.h.
//
// Users can implement their own memtable representations. We include four
// types built in:
// - SkipListRep: This is the default; it is backed by a skip list.
// - TransformRep: This is backed by an custom hash map.
// On construction, they are given a SliceTransform object. This
// object is applied to the user key of stored items which indexes into the
// hash map to yield a skiplist containing all records that share the same
// user key under the transform function.
// - UnsortedRep: A subclass of TransformRep where the transform function is
// the identity function. Optimized for point lookups.
// - PrefixHashRep: A subclass of TransformRep where the transform function is
// a fixed-size prefix extractor. If you use PrefixHashRepFactory, the transform
// must be identical to options.prefix_extractor, otherwise it will be discarded
// and the default will be used. It is optimized for ranged scans over a
// prefix.
// - VectorRep: This is backed by an unordered std::vector. On iteration, the
// vector is sorted. It is intelligent about sorting; once the MarkReadOnly()
// has been called, the vector will only be sorted once. It is optimized for
// random-write-heavy workloads.
//
// The last four implementations are designed for situations in which
// iteration over the entire collection is rare since doing so requires all the
// keys to be copied into a sorted data structure.
#ifndef STORAGE_ROCKSDB_DB_MEMTABLEREP_H_
#define STORAGE_ROCKSDB_DB_MEMTABLEREP_H_
#include <memory>
#include "rocksdb/arena.h"
#include "rocksdb/slice.h"
#include "rocksdb/slice_transform.h"
namespace rocksdb {
class MemTableRep {
public:
// KeyComparator provides a means to compare keys, which are internal keys
// concatenated with values.
class KeyComparator {
public:
// Compare a and b. Return a negative value if a is less than b, 0 if they
// are equal, and a positive value if a is greater than b
virtual int operator()(const char* a, const char* b) const = 0;
virtual ~KeyComparator() { }
};
// Insert key into the collection. (The caller will pack key and value into a
// single buffer and pass that in as the parameter to Insert)
// REQUIRES: nothing that compares equal to key is currently in the
// collection.
virtual void Insert(const char* key) = 0;
// Returns true iff an entry that compares equal to key is in the collection.
virtual bool Contains(const char* key) const = 0;
// Notify this table rep that it will no longer be added to. By default, does
// nothing.
virtual void MarkReadOnly() { }
// Report an approximation of how much memory has been used other than memory
// that was allocated through the arena.
virtual size_t ApproximateMemoryUsage() = 0;
virtual ~MemTableRep() { }
// Iteration over the contents of a skip collection
class Iterator {
public:
// Initialize an iterator over the specified collection.
// The returned iterator is not valid.
// explicit Iterator(const MemTableRep* collection);
virtual ~Iterator() { };
// Returns true iff the iterator is positioned at a valid node.
virtual bool Valid() const = 0;
// Returns the key at the current position.
// REQUIRES: Valid()
virtual const char* key() const = 0;
// Advances to the next position.
// REQUIRES: Valid()
virtual void Next() = 0;
// Advances to the previous position.
// REQUIRES: Valid()
virtual void Prev() = 0;
// Advance to the first entry with a key >= target
virtual void Seek(const char* target) = 0;
// Position at the first entry in collection.
// Final state of iterator is Valid() iff collection is not empty.
virtual void SeekToFirst() = 0;
// Position at the last entry in collection.
// Final state of iterator is Valid() iff collection is not empty.
virtual void SeekToLast() = 0;
};
// Return an iterator over the keys in this representation.
virtual std::shared_ptr<Iterator> GetIterator() = 0;
// Return an iterator over at least the keys with the specified user key. The
// iterator may also allow access to other keys, but doesn't have to. Default:
// GetIterator().
virtual std::shared_ptr<Iterator> GetIterator(const Slice& user_key) {
return GetIterator();
}
// Return an iterator over at least the keys with the specified prefix. The
// iterator may also allow access to other keys, but doesn't have to. Default:
// GetIterator().
virtual std::shared_ptr<Iterator> GetPrefixIterator(const Slice& prefix) {
return GetIterator();
}
// Return an iterator that has a special Seek semantics. The result of
// a Seek might only include keys with the same prefix as the target key.
virtual std::shared_ptr<Iterator> GetDynamicPrefixIterator() {
return GetIterator();
}
protected:
// When *key is an internal key concatenated with the value, returns the
// user key.
virtual Slice UserKey(const char* key) const;
};
// This is the base class for all factories that are used by RocksDB to create
// new MemTableRep objects
class MemTableRepFactory {
public:
virtual ~MemTableRepFactory() { };
virtual std::shared_ptr<MemTableRep> CreateMemTableRep(
MemTableRep::KeyComparator&, Arena*) = 0;
virtual const char* Name() const = 0;
};
// This creates MemTableReps that are backed by an std::vector. On iteration,
// the vector is sorted. This is useful for workloads where iteration is very
// rare and writes are generally not issued after reads begin.
//
// Parameters:
// count: Passed to the constructor of the underlying std::vector of each
// VectorRep. On initialization, the underlying array will be at least count
// bytes reserved for usage.
class VectorRepFactory : public MemTableRepFactory {
const size_t count_;
public:
explicit VectorRepFactory(size_t count = 0) : count_(count) { }
virtual std::shared_ptr<MemTableRep> CreateMemTableRep(
MemTableRep::KeyComparator&, Arena*) override;
virtual const char* Name() const override {
return "VectorRepFactory";
}
};
// This uses a skip list to store keys. It is the default.
class SkipListFactory : public MemTableRepFactory {
public:
virtual std::shared_ptr<MemTableRep> CreateMemTableRep(
MemTableRep::KeyComparator&, Arena*) override;
virtual const char* Name() const override {
return "SkipListFactory";
}
};
// TransformReps are backed by an unordered map of buffers to buckets. When
// looking up a key, the user key is extracted and a user-supplied transform
// function (see rocksdb/slice_transform.h) is applied to get the key into the
// unordered map. This allows the user to bin user keys based on arbitrary
// criteria. Two example implementations are UnsortedRepFactory and
// PrefixHashRepFactory.
//
// Iteration over the entire collection is implemented by dumping all the keys
// into an std::set. Thus, these data structures are best used when iteration
// over the entire collection is rare.
//
// Parameters:
// transform: The SliceTransform to bucket user keys on. TransformRepFactory
// owns the pointer.
// bucket_count: Passed to the constructor of the underlying
// std::unordered_map of each TransformRep. On initialization, the
// underlying array will be at least bucket_count size.
// num_locks: Number of read-write locks to have for the rep. Each bucket is
// hashed onto a read-write lock which controls access to that lock. More
// locks means finer-grained concurrency but more memory overhead.
class TransformRepFactory : public MemTableRepFactory {
public:
explicit TransformRepFactory(const SliceTransform* transform,
size_t bucket_count, size_t num_locks = 1000)
: transform_(transform),
bucket_count_(bucket_count),
num_locks_(num_locks) { }
virtual ~TransformRepFactory() { delete transform_; }
virtual std::shared_ptr<MemTableRep> CreateMemTableRep(
MemTableRep::KeyComparator&, Arena*) override;
virtual const char* Name() const override {
return "TransformRepFactory";
}
const SliceTransform* GetTransform() { return transform_; }
protected:
const SliceTransform* transform_;
const size_t bucket_count_;
const size_t num_locks_;
};
// UnsortedReps bin user keys based on an identity function transform -- that
// is, transform(key) = key. This optimizes for point look-ups.
//
// Parameters: See TransformRepFactory.
class UnsortedRepFactory : public TransformRepFactory {
public:
explicit UnsortedRepFactory(size_t bucket_count = 0, size_t num_locks = 1000)
: TransformRepFactory(NewNoopTransform(),
bucket_count,
num_locks) { }
virtual const char* Name() const override {
return "UnsortedRepFactory";
}
};
// PrefixHashReps bin user keys based on a fixed-size prefix. This optimizes for
// short ranged scans over a given prefix.
//
// Parameters: See TransformRepFactory.
class PrefixHashRepFactory : public TransformRepFactory {
public:
explicit PrefixHashRepFactory(const SliceTransform* prefix_extractor,
size_t bucket_count = 0, size_t num_locks = 1000)
: TransformRepFactory(prefix_extractor, bucket_count, num_locks)
{ }
virtual std::shared_ptr<MemTableRep> CreateMemTableRep(
MemTableRep::KeyComparator&, Arena*) override;
virtual const char* Name() const override {
return "PrefixHashRepFactory";
}
};
// The same as TransformRepFactory except it doesn't use locks.
// Experimental, will replace TransformRepFactory once we are sure
// it performs better
extern MemTableRepFactory* NewHashSkipListRepFactory(
const SliceTransform* transform, size_t bucket_count = 1000000);
}
#endif // STORAGE_ROCKSDB_DB_MEMTABLEREP_H_

View File

@@ -0,0 +1,148 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#ifndef STORAGE_ROCKSDB_INCLUDE_MERGE_OPERATOR_H_
#define STORAGE_ROCKSDB_INCLUDE_MERGE_OPERATOR_H_
#include <string>
#include <deque>
#include "rocksdb/slice.h"
namespace rocksdb {
class Slice;
class Logger;
// The Merge Operator
//
// Essentially, a MergeOperator specifies the SEMANTICS of a merge, which only
// client knows. It could be numeric addition, list append, string
// concatenation, edit data structure, ... , anything.
// The library, on the other hand, is concerned with the exercise of this
// interface, at the right time (during get, iteration, compaction...)
//
// To use merge, the client needs to provide an object implementing one of
// the following interfaces:
// a) AssociativeMergeOperator - for most simple semantics (always take
// two values, and merge them into one value, which is then put back
// into rocksdb); numeric addition and string concatenation are examples;
//
// b) MergeOperator - the generic class for all the more abstract / complex
// operations; one method (FullMerge) to merge a Put/Delete value with a
// merge operand; and another method (PartialMerge) that merges two
// operands together. this is especially useful if your key values have a
// complex structure but you would still like to support client-specific
// incremental updates.
//
// AssociativeMergeOperator is simpler to implement. MergeOperator is simply
// more powerful.
//
// Refer to rocksdb-merge wiki for more details and example implementations.
//
class MergeOperator {
public:
virtual ~MergeOperator() {}
// Gives the client a way to express the read -> modify -> write semantics
// key: (IN) The key that's associated with this merge operation.
// Client could multiplex the merge operator based on it
// if the key space is partitioned and different subspaces
// refer to different types of data which have different
// merge operation semantics
// existing: (IN) null indicates that the key does not exist before this op
// operand_list:(IN) the sequence of merge operations to apply, front() first.
// new_value:(OUT) Client is responsible for filling the merge result here
// logger: (IN) Client could use this to log errors during merge.
//
// Return true on success.
// All values passed in will be client-specific values. So if this method
// returns false, it is because client specified bad data or there was
// internal corruption. This will be treated as an error by the library.
//
// Also make use of the *logger for error messages.
virtual bool FullMerge(const Slice& key,
const Slice* existing_value,
const std::deque<std::string>& operand_list,
std::string* new_value,
Logger* logger) const = 0;
// This function performs merge(left_op, right_op)
// when both the operands are themselves merge operation types
// that you would have passed to a DB::Merge() call in the same order
// (i.e.: DB::Merge(key,left_op), followed by DB::Merge(key,right_op)).
//
// PartialMerge should combine them into a single merge operation that is
// saved into *new_value, and then it should return true.
// *new_value should be constructed such that a call to
// DB::Merge(key, *new_value) would yield the same result as a call
// to DB::Merge(key, left_op) followed by DB::Merge(key, right_op).
//
// If it is impossible or infeasible to combine the two operations,
// leave new_value unchanged and return false. The library will
// internally keep track of the operations, and apply them in the
// correct order once a base-value (a Put/Delete/End-of-Database) is seen.
//
// TODO: Presently there is no way to differentiate between error/corruption
// and simply "return false". For now, the client should simply return
// false in any case it cannot perform partial-merge, regardless of reason.
// If there is corruption in the data, handle it in the FullMerge() function,
// and return false there.
virtual bool PartialMerge(const Slice& key,
const Slice& left_operand,
const Slice& right_operand,
std::string* new_value,
Logger* logger) const = 0;
// The name of the MergeOperator. Used to check for MergeOperator
// mismatches (i.e., a DB created with one MergeOperator is
// accessed using a different MergeOperator)
// TODO: the name is currently not stored persistently and thus
// no checking is enforced. Client is responsible for providing
// consistent MergeOperator between DB opens.
virtual const char* Name() const = 0;
};
// The simpler, associative merge operator.
class AssociativeMergeOperator : public MergeOperator {
public:
virtual ~AssociativeMergeOperator() {}
// Gives the client a way to express the read -> modify -> write semantics
// key: (IN) The key that's associated with this merge operation.
// existing_value:(IN) null indicates the key does not exist before this op
// value: (IN) the value to update/merge the existing_value with
// new_value: (OUT) Client is responsible for filling the merge result here
// logger: (IN) Client could use this to log errors during merge.
//
// Return true on success.
// All values passed in will be client-specific values. So if this method
// returns false, it is because client specified bad data or there was
// internal corruption. The client should assume that this will be treated
// as an error by the library.
virtual bool Merge(const Slice& key,
const Slice* existing_value,
const Slice& value,
std::string* new_value,
Logger* logger) const = 0;
private:
// Default implementations of the MergeOperator functions
virtual bool FullMerge(const Slice& key,
const Slice* existing_value,
const std::deque<std::string>& operand_list,
std::string* new_value,
Logger* logger) const override;
virtual bool PartialMerge(const Slice& key,
const Slice& left_operand,
const Slice& right_operand,
std::string* new_value,
Logger* logger) const override;
};
} // namespace rocksdb
#endif // STORAGE_ROCKSDB_INCLUDE_MERGE_OPERATOR_H_

742
include/rocksdb/options.h Normal file
View File

@@ -0,0 +1,742 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#ifndef STORAGE_ROCKSDB_INCLUDE_OPTIONS_H_
#define STORAGE_ROCKSDB_INCLUDE_OPTIONS_H_
#include <stddef.h>
#include <string>
#include <memory>
#include <vector>
#include <stdint.h>
#include "rocksdb/memtablerep.h"
#include "rocksdb/memtablerep.h"
#include "rocksdb/slice.h"
#include "rocksdb/slice_transform.h"
#include "rocksdb/slice_transform.h"
#include "rocksdb/statistics.h"
#include "rocksdb/table_properties.h"
#include "rocksdb/universal_compaction.h"
namespace rocksdb {
class Cache;
class CompactionFilter;
class CompactionFilterFactory;
class Comparator;
class Env;
class FilterPolicy;
class Logger;
class MergeOperator;
class Snapshot;
class TableFactory;
using std::shared_ptr;
// DB contents are stored in a set of blocks, each of which holds a
// sequence of key,value pairs. Each block may be compressed before
// being stored in a file. The following enum describes which
// compression method (if any) is used to compress a block.
enum CompressionType : char {
// NOTE: do not change the values of existing entries, as these are
// part of the persistent format on disk.
kNoCompression = 0x0,
kSnappyCompression = 0x1,
kZlibCompression = 0x2,
kBZip2Compression = 0x3
};
enum CompactionStyle : char {
kCompactionStyleLevel = 0x0, // level based compaction style
kCompactionStyleUniversal = 0x1 // Universal compaction style
};
// Compression options for different compression algorithms like Zlib
struct CompressionOptions {
int window_bits;
int level;
int strategy;
CompressionOptions():window_bits(-14),
level(-1),
strategy(0){}
CompressionOptions(int wbits, int lev, int strategy):window_bits(wbits),
level(lev),
strategy(strategy){}
};
// Options to control the behavior of a database (passed to DB::Open)
struct Options {
// -------------------
// Parameters that affect behavior
// Comparator used to define the order of keys in the table.
// Default: a comparator that uses lexicographic byte-wise ordering
//
// REQUIRES: The client must ensure that the comparator supplied
// here has the same name and orders keys *exactly* the same as the
// comparator provided to previous open calls on the same DB.
const Comparator* comparator;
// REQUIRES: The client must provide a merge operator if Merge operation
// needs to be accessed. Calling Merge on a DB without a merge operator
// would result in Status::NotSupported. The client must ensure that the
// merge operator supplied here has the same name and *exactly* the same
// semantics as the merge operator provided to previous open calls on
// the same DB. The only exception is reserved for upgrade, where a DB
// previously without a merge operator is introduced to Merge operation
// for the first time. It's necessary to specify a merge operator when
// openning the DB in this case.
// Default: nullptr
shared_ptr<MergeOperator> merge_operator;
// The client must provide compaction_filter_factory if it requires a new
// compaction filter to be used for different compaction processes
// Allows an application to modify/delete a key-value during background
// compaction.
// Ideally, client should specify only one of filter or factory.
// compaction_filter takes precedence over compaction_filter_factory if
// client specifies both.
// Default: nullptr
const CompactionFilter* compaction_filter;
// If true, the database will be created if it is missing.
// Default: false
bool create_if_missing;
// If true, an error is raised if the database already exists.
// Default: false
bool error_if_exists;
// If true, the implementation will do aggressive checking of the
// data it is processing and will stop early if it detects any
// errors. This may have unforeseen ramifications: for example, a
// corruption of one DB entry may cause a large number of entries to
// become unreadable or for the entire DB to become unopenable.
// If any of the writes to the database fails (Put, Delete, Merge, Write),
// the database will switch to read-only mode and fail all other
// Write operations.
// Default: false
bool paranoid_checks;
// Use the specified object to interact with the environment,
// e.g. to read/write files, schedule background work, etc.
// Default: Env::Default()
Env* env;
// Any internal progress/error information generated by the db will
// be written to info_log if it is non-nullptr, or to a file stored
// in the same directory as the DB contents if info_log is nullptr.
// Default: nullptr
shared_ptr<Logger> info_log;
// -------------------
// Parameters that affect performance
// Amount of data to build up in memory (backed by an unsorted log
// on disk) before converting to a sorted on-disk file.
//
// Larger values increase performance, especially during bulk loads.
// Up to max_write_buffer_number write buffers may be held in memory
// at the same time,
// so you may wish to adjust this parameter to control memory usage.
// Also, a larger write buffer will result in a longer recovery time
// the next time the database is opened.
//
// Default: 4MB
size_t write_buffer_size;
// The maximum number of write buffers that are built up in memory.
// The default is 2, so that when 1 write buffer is being flushed to
// storage, new writes can continue to the other write buffer.
// Default: 2
int max_write_buffer_number;
// The minimum number of write buffers that will be merged together
// before writing to storage. If set to 1, then
// all write buffers are fushed to L0 as individual files and this increases
// read amplification because a get request has to check in all of these
// files. Also, an in-memory merge may result in writing lesser
// data to storage if there are duplicate records in each of these
// individual write buffers. Default: 1
int min_write_buffer_number_to_merge;
// Number of open files that can be used by the DB. You may need to
// increase this if your database has a large working set (budget
// one open file per 2MB of working set).
//
// Default: 1000
int max_open_files;
// Control over blocks (user data is stored in a set of blocks, and
// a block is the unit of reading from disk).
// If non-NULL use the specified cache for blocks.
// If NULL, rocksdb will automatically create and use an 8MB internal cache.
// Default: nullptr
shared_ptr<Cache> block_cache;
// If non-NULL use the specified cache for compressed blocks.
// If NULL, rocksdb will not use a compressed block cache.
// Default: nullptr
shared_ptr<Cache> block_cache_compressed;
// Approximate size of user data packed per block. Note that the
// block size specified here corresponds to uncompressed data. The
// actual size of the unit read from disk may be smaller if
// compression is enabled. This parameter can be changed dynamically.
//
// Default: 4K
size_t block_size;
// Number of keys between restart points for delta encoding of keys.
// This parameter can be changed dynamically. Most clients should
// leave this parameter alone.
//
// Default: 16
int block_restart_interval;
// Compress blocks using the specified compression algorithm. This
// parameter can be changed dynamically.
//
// Default: kSnappyCompression, which gives lightweight but fast
// compression.
//
// Typical speeds of kSnappyCompression on an Intel(R) Core(TM)2 2.4GHz:
// ~200-500MB/s compression
// ~400-800MB/s decompression
// Note that these speeds are significantly faster than most
// persistent storage speeds, and therefore it is typically never
// worth switching to kNoCompression. Even if the input data is
// incompressible, the kSnappyCompression implementation will
// efficiently detect that and will switch to uncompressed mode.
CompressionType compression;
// Different levels can have different compression policies. There
// are cases where most lower levels would like to quick compression
// algorithm while the higher levels (which have more data) use
// compression algorithms that have better compression but could
// be slower. This array, if non nullptr, should have an entry for
// each level of the database. This array, if non nullptr, overides the
// value specified in the previous field 'compression'. The caller is
// reponsible for allocating memory and initializing the values in it
// before invoking Open(). The caller is responsible for freeing this
// array and it could be freed anytime after the return from Open().
// This could have been a std::vector but that makes the equivalent
// java/C api hard to construct.
std::vector<CompressionType> compression_per_level;
//different options for compression algorithms
CompressionOptions compression_opts;
// If non-nullptr, use the specified filter policy to reduce disk reads.
// Many applications will benefit from passing the result of
// NewBloomFilterPolicy() here.
//
// Default: nullptr
const FilterPolicy* filter_policy;
// If non-nullptr, use the specified function to determine the
// prefixes for keys. These prefixes will be placed in the filter.
// Depending on the workload, this can reduce the number of read-IOP
// cost for scans when a prefix is passed via ReadOptions to
// db.NewIterator(). For prefix filtering to work properly,
// "prefix_extractor" and "comparator" must be such that the following
// properties hold:
//
// 1) key.starts_with(prefix(key))
// 2) Compare(prefix(key), key) <= 0.
// 3) If Compare(k1, k2) <= 0, then Compare(prefix(k1), prefix(k2)) <= 0
// 4) prefix(prefix(key)) == prefix(key)
//
// Default: nullptr
const SliceTransform* prefix_extractor;
// If true, place whole keys in the filter (not just prefixes).
// This must generally be true for gets to be efficient.
//
// Default: true
bool whole_key_filtering;
// Number of levels for this database
int num_levels;
// Number of files to trigger level-0 compaction. A value <0 means that
// level-0 compaction will not be triggered by number of files at all.
int level0_file_num_compaction_trigger;
// Soft limit on number of level-0 files. We start slowing down writes at this
// point. A value <0 means that no writing slow down will be triggered by
// number of files in level-0.
int level0_slowdown_writes_trigger;
// Maximum number of level-0 files. We stop writes at this point.
int level0_stop_writes_trigger;
// Maximum level to which a new compacted memtable is pushed if it
// does not create overlap. We try to push to level 2 to avoid the
// relatively expensive level 0=>1 compactions and to avoid some
// expensive manifest file operations. We do not push all the way to
// the largest level since that can generate a lot of wasted disk
// space if the same key space is being repeatedly overwritten.
int max_mem_compaction_level;
// Target file size for compaction.
// target_file_size_base is per-file size for level-1.
// Target file size for level L can be calculated by
// target_file_size_base * (target_file_size_multiplier ^ (L-1))
// For example, if target_file_size_base is 2MB and
// target_file_size_multiplier is 10, then each file on level-1 will
// be 2MB, and each file on level 2 will be 20MB,
// and each file on level-3 will be 200MB.
// by default target_file_size_base is 2MB.
int target_file_size_base;
// by default target_file_size_multiplier is 1, which means
// by default files in different levels will have similar size.
int target_file_size_multiplier;
// Control maximum total data size for a level.
// max_bytes_for_level_base is the max total for level-1.
// Maximum number of bytes for level L can be calculated as
// (max_bytes_for_level_base) * (max_bytes_for_level_multiplier ^ (L-1))
// For example, if max_bytes_for_level_base is 20MB, and if
// max_bytes_for_level_multiplier is 10, total data size for level-1
// will be 20MB, total file size for level-2 will be 200MB,
// and total file size for level-3 will be 2GB.
// by default 'max_bytes_for_level_base' is 10MB.
uint64_t max_bytes_for_level_base;
// by default 'max_bytes_for_level_base' is 10.
int max_bytes_for_level_multiplier;
// Different max-size multipliers for different levels.
// These are multiplied by max_bytes_for_level_multiplier to arrive
// at the max-size of each level.
// Default: 1
std::vector<int> max_bytes_for_level_multiplier_additional;
// Maximum number of bytes in all compacted files. We avoid expanding
// the lower level file set of a compaction if it would make the
// total compaction cover more than
// (expanded_compaction_factor * targetFileSizeLevel()) many bytes.
int expanded_compaction_factor;
// Maximum number of bytes in all source files to be compacted in a
// single compaction run. We avoid picking too many files in the
// source level so that we do not exceed the total source bytes
// for compaction to exceed
// (source_compaction_factor * targetFileSizeLevel()) many bytes.
// Default:1, i.e. pick maxfilesize amount of data as the source of
// a compaction.
int source_compaction_factor;
// Control maximum bytes of overlaps in grandparent (i.e., level+2) before we
// stop building a single file in a level->level+1 compaction.
int max_grandparent_overlap_factor;
// If non-null, then we should collect metrics about database operations
// Statistics objects should not be shared between DB instances as
// it does not use any locks to prevent concurrent updates.
shared_ptr<Statistics> statistics;
// If true, then the contents of data files are not synced
// to stable storage. Their contents remain in the OS buffers till the
// OS decides to flush them. This option is good for bulk-loading
// of data. Once the bulk-loading is complete, please issue a
// sync to the OS to flush all dirty buffesrs to stable storage.
// Default: false
bool disableDataSync;
// If true, then every store to stable storage will issue a fsync.
// If false, then every store to stable storage will issue a fdatasync.
// This parameter should be set to true while storing data to
// filesystem like ext3 that can lose files after a reboot.
// Default: false
bool use_fsync;
// This number controls how often a new scribe log about
// db deploy stats is written out.
// -1 indicates no logging at all.
// Default value is 1800 (half an hour).
int db_stats_log_interval;
// This specifies the info LOG dir.
// If it is empty, the log files will be in the same dir as data.
// If it is non empty, the log files will be in the specified dir,
// and the db data dir's absolute path will be used as the log file
// name's prefix.
std::string db_log_dir;
// This specifies the absolute dir path for write-ahead logs (WAL).
// If it is empty, the log files will be in the same dir as data,
// dbname is used as the data dir by default
// If it is non empty, the log files will be in kept the specified dir.
// When destroying the db,
// all log files in wal_dir and the dir itself is deleted
std::string wal_dir;
// Disable compaction triggered by seek.
// With bloomfilter and fast storage, a miss on one level
// is very cheap if the file handle is cached in table cache
// (which is true if max_open_files is large).
bool disable_seek_compaction;
// The periodicity when obsolete files get deleted. The default
// value is 6 hours. The files that get out of scope by compaction
// process will still get automatically delete on every compaction,
// regardless of this setting
uint64_t delete_obsolete_files_period_micros;
// Maximum number of concurrent background jobs, submitted to
// the default LOW priority thread pool
// Default: 1
int max_background_compactions;
// Maximum number of concurrent background memtable flush jobs, submitted to
// the HIGH priority thread pool.
// By default, all background jobs (major compaction and memtable flush) go
// to the LOW priority pool. If this option is set to a positive number,
// memtable flush jobs will be submitted to the HIGH priority pool.
// It is important when the same Env is shared by multiple db instances.
// Without a separate pool, long running major compaction jobs could
// potentially block memtable flush jobs of other db instances, leading to
// unnecessary Put stalls.
// Default: 0
int max_background_flushes;
// Specify the maximal size of the info log file. If the log file
// is larger than `max_log_file_size`, a new info log file will
// be created.
// If max_log_file_size == 0, all logs will be written to one
// log file.
size_t max_log_file_size;
// Time for the info log file to roll (in seconds).
// If specified with non-zero value, log file will be rolled
// if it has been active longer than `log_file_time_to_roll`.
// Default: 0 (disabled)
size_t log_file_time_to_roll;
// Maximal info log files to be kept.
// Default: 1000
size_t keep_log_file_num;
// Puts are delayed 0-1 ms when any level has a compaction score that exceeds
// soft_rate_limit. This is ignored when == 0.0.
// CONSTRAINT: soft_rate_limit <= hard_rate_limit. If this constraint does not
// hold, RocksDB will set soft_rate_limit = hard_rate_limit
// Default: 0 (disabled)
double soft_rate_limit;
// Puts are delayed 1ms at a time when any level has a compaction score that
// exceeds hard_rate_limit. This is ignored when <= 1.0.
// Default: 0 (disabled)
double hard_rate_limit;
// Max time a put will be stalled when hard_rate_limit is enforced. If 0, then
// there is no limit.
// Default: 1000
unsigned int rate_limit_delay_max_milliseconds;
// manifest file is rolled over on reaching this limit.
// The older manifest file be deleted.
// The default value is MAX_INT so that roll-over does not take place.
uint64_t max_manifest_file_size;
// Disable block cache. If this is set to true,
// then no block cache should be used, and the block_cache should
// point to a nullptr object.
// Default: false
bool no_block_cache;
// Number of shards used for table cache.
int table_cache_numshardbits;
// During data eviction of table's LRU cache, it would be inefficient
// to strictly follow LRU because this piece of memory will not really
// be released unless its refcount falls to zero. Instead, make two
// passes: the first pass will release items with refcount = 1,
// and if not enough space releases after scanning the number of
// elements specified by this parameter, we will remove items in LRU
// order.
int table_cache_remove_scan_count_limit;
// size of one block in arena memory allocation.
// If <= 0, a proper value is automatically calculated (usually 1/10 of
// writer_buffer_size).
//
// Default: 0
size_t arena_block_size;
// Create an Options object with default values for all fields.
Options();
void Dump(Logger* log) const;
// Set appropriate parameters for bulk loading.
// The reason that this is a function that returns "this" instead of a
// constructor is to enable chaining of multiple similar calls in the future.
//
// All data will be in level 0 without any automatic compaction.
// It's recommended to manually call CompactRange(NULL, NULL) before reading
// from the database, because otherwise the read can be very slow.
Options* PrepareForBulkLoad();
// Disable automatic compactions. Manual compactions can still
// be issued on this database.
bool disable_auto_compactions;
// The following two fields affect how archived logs will be deleted.
// 1. If both set to 0, logs will be deleted asap and will not get into
// the archive.
// 2. If WAL_ttl_seconds is 0 and WAL_size_limit_MB is not 0,
// WAL files will be checked every 10 min and if total size is greater
// then WAL_size_limit_MB, they will be deleted starting with the
// earliest until size_limit is met. All empty files will be deleted.
// 3. If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then
// WAL files will be checked every WAL_ttl_secondsi / 2 and those that
// are older than WAL_ttl_seconds will be deleted.
// 4. If both are not 0, WAL files will be checked every 10 min and both
// checks will be performed with ttl being first.
uint64_t WAL_ttl_seconds;
uint64_t WAL_size_limit_MB;
// Number of bytes to preallocate (via fallocate) the manifest
// files. Default is 4mb, which is reasonable to reduce random IO
// as well as prevent overallocation for mounts that preallocate
// large amounts of data (such as xfs's allocsize option).
size_t manifest_preallocation_size;
// Purge duplicate/deleted keys when a memtable is flushed to storage.
// Default: true
bool purge_redundant_kvs_while_flush;
// Data being read from file storage may be buffered in the OS
// Default: true
bool allow_os_buffer;
// Allow the OS to mmap file for reading sst tables. Default: false
bool allow_mmap_reads;
// Allow the OS to mmap file for writing. Default: true
bool allow_mmap_writes;
// Disable child process inherit open files. Default: true
bool is_fd_close_on_exec;
// Skip log corruption error on recovery (If client is ok with
// losing most recent changes)
// Default: false
bool skip_log_error_on_recovery;
// if not zero, dump rocksdb.stats to LOG every stats_dump_period_sec
// Default: 3600 (1 hour)
unsigned int stats_dump_period_sec;
// This is used to close a block before it reaches the configured
// 'block_size'. If the percentage of free space in the current block is less
// than this specified number and adding a new record to the block will
// exceed the configured block size, then this block will be closed and the
// new record will be written to the next block.
// Default is 10.
int block_size_deviation;
// If set true, will hint the underlying file system that the file
// access pattern is random, when a sst file is opened.
// Default: true
bool advise_random_on_open;
// Specify the file access pattern once a compaction is started.
// It will be applied to all input files of a compaction.
// Default: NORMAL
enum { NONE, NORMAL, SEQUENTIAL, WILLNEED } access_hint_on_compaction_start;
// Use adaptive mutex, which spins in the user space before resorting
// to kernel. This could reduce context switch when the mutex is not
// heavily contended. However, if the mutex is hot, we could end up
// wasting spin time.
// Default: false
bool use_adaptive_mutex;
// Allows OS to incrementally sync files to disk while they are being
// written, asynchronously, in the background.
// Issue one request for every bytes_per_sync written. 0 turns it off.
// Default: 0
uint64_t bytes_per_sync;
// The compaction style. Default: kCompactionStyleLevel
CompactionStyle compaction_style;
// The options needed to support Universal Style compactions
CompactionOptionsUniversal compaction_options_universal;
// Use KeyMayExist API to filter deletes when this is true.
// If KeyMayExist returns false, i.e. the key definitely does not exist, then
// the delete is a noop. KeyMayExist only incurs in-memory look up.
// This optimization avoids writing the delete to storage when appropriate.
// Default: false
bool filter_deletes;
// An iteration->Next() sequentially skips over keys with the same
// user-key unless this option is set. This number specifies the number
// of keys (with the same userkey) that will be sequentially
// skipped before a reseek is issued.
// Default: 8
uint64_t max_sequential_skip_in_iterations;
// This is a factory that provides MemTableRep objects.
// Default: a factory that provides a skip-list-based implementation of
// MemTableRep.
std::shared_ptr<MemTableRepFactory> memtable_factory;
// This is a factory that provides TableFactory objects.
// Default: a factory that provides a default implementation of
// Table and TableBuilder.
std::shared_ptr<TableFactory> table_factory;
// This is a factory that provides compaction filter objects which allow
// an application to modify/delete a key-value during background compaction.
// Default: a factory that doesn't provide any object
std::shared_ptr<CompactionFilterFactory> compaction_filter_factory;
// This option allows user to to collect their own interested statistics of
// the tables.
// Default: emtpy vector -- no user-defined statistics collection will be
// performed.
std::vector<std::shared_ptr<TablePropertiesCollector>>
table_properties_collectors;
// Allows thread-safe inplace updates. Requires Updates iff
// * key exists in current memtable
// * new sizeof(new_value) <= sizeof(old_value)
// * old_value for that key is a put i.e. kTypeValue
// Default: false.
bool inplace_update_support;
// Number of locks used for inplace update
// Default: 10000, if inplace_update_support = true, else 0.
size_t inplace_update_num_locks;
};
//
// An application can issue a read request (via Get/Iterators) and specify
// if that read should process data that ALREADY resides on a specified cache
// level. For example, if an application specifies kBlockCacheTier then the
// Get call will process data that is already processed in the memtable or
// the block cache. It will not page in data from the OS cache or data that
// resides in storage.
enum ReadTier {
kReadAllTier = 0x0, // data in memtable, block cache, OS cache or storage
kBlockCacheTier = 0x1 // data in memtable or block cache
};
// Options that control read operations
struct ReadOptions {
// If true, all data read from underlying storage will be
// verified against corresponding checksums.
// Default: false
bool verify_checksums;
// Should the "data block"/"index block"/"filter block" read for this
// iteration be cached in memory?
// Callers may wish to set this field to false for bulk scans.
// Default: true
bool fill_cache;
// If this option is set and memtable implementation allows, Seek
// might only return keys with the same prefix as the seek-key
bool prefix_seek;
// If "snapshot" is non-nullptr, read as of the supplied snapshot
// (which must belong to the DB that is being read and which must
// not have been released). If "snapshot" is nullptr, use an impliicit
// snapshot of the state at the beginning of this read operation.
// Default: nullptr
const Snapshot* snapshot;
// If "prefix" is non-nullptr, and ReadOptions is being passed to
// db.NewIterator, only return results when the key begins with this
// prefix. This field is ignored by other calls (e.g., Get).
// Options.prefix_extractor must also be set, and
// prefix_extractor.InRange(prefix) must be true. The iterator
// returned by NewIterator when this option is set will behave just
// as if the underlying store did not contain any non-matching keys,
// with two exceptions. Seek() only accepts keys starting with the
// prefix, and SeekToLast() is not supported. prefix filter with this
// option will sometimes reduce the number of read IOPs.
// Default: nullptr
const Slice* prefix;
// Specify if this read request should process data that ALREADY
// resides on a particular cache. If the required data is not
// found at the specified cache, then Status::Incomplete is returned.
// Default: kReadAllTier
ReadTier read_tier;
ReadOptions()
: verify_checksums(false),
fill_cache(true),
prefix_seek(false),
snapshot(nullptr),
prefix(nullptr),
read_tier(kReadAllTier) {
}
ReadOptions(bool cksum, bool cache) :
verify_checksums(cksum), fill_cache(cache),
prefix_seek(false), snapshot(nullptr), prefix(nullptr),
read_tier(kReadAllTier) {
}
};
// Options that control write operations
struct WriteOptions {
// If true, the write will be flushed from the operating system
// buffer cache (by calling WritableFile::Sync()) before the write
// is considered complete. If this flag is true, writes will be
// slower.
//
// If this flag is false, and the machine crashes, some recent
// writes may be lost. Note that if it is just the process that
// crashes (i.e., the machine does not reboot), no writes will be
// lost even if sync==false.
//
// In other words, a DB write with sync==false has similar
// crash semantics as the "write()" system call. A DB write
// with sync==true has similar crash semantics to a "write()"
// system call followed by "fdatasync()".
//
// Default: false
bool sync;
// If true, writes will not first go to the write ahead log,
// and the write may got lost after a crash.
bool disableWAL;
WriteOptions()
: sync(false),
disableWAL(false) {
}
};
// Options that control flush operations
struct FlushOptions {
// If true, the flush will wait until the flush is done.
// Default: true
bool wait;
FlushOptions()
: wait(true) {
}
};
} // namespace rocksdb
#endif // STORAGE_ROCKSDB_INCLUDE_OPTIONS_H_

View File

@@ -0,0 +1,48 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#ifndef STORAGE_ROCKSDB_INCLUDE_PERF_CONTEXT_H
#define STORAGE_ROCKSDB_INCLUDE_PERF_CONTEXT_H
#include <stdint.h>
namespace rocksdb {
enum PerfLevel {
kDisable = 0, // disable perf stats
kEnableCount = 1, // enable only count stats
kEnableTime = 2 // enable time stats too
};
// set the perf stats level
void SetPerfLevel(PerfLevel level);
// A thread local context for gathering performance counter efficiently
// and transparently.
struct PerfContext {
void Reset(); // reset all performance counters to zero
uint64_t user_key_comparison_count; // total number of user key comparisons
uint64_t block_cache_hit_count; // total number of block cache hits
uint64_t block_read_count; // total number of block reads (with IO)
uint64_t block_read_byte; // total number of bytes from block reads
uint64_t block_read_time; // total time spent on block reads
uint64_t block_checksum_time; // total time spent on block checksum
uint64_t block_decompress_time; // total time spent on block decompression
// total number of internal keys skipped over during iteration (overwritten or
// deleted, to be more specific, hidden by a put or delete of the same key)
uint64_t internal_key_skipped_count;
// total number of deletes skipped over during iteration
uint64_t internal_delete_skipped_count;
uint64_t wal_write_time; // total time spent on writing to WAL
};
extern __thread PerfContext perf_context;
}
#endif

136
include/rocksdb/slice.h Normal file
View File

@@ -0,0 +1,136 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// Slice is a simple structure containing a pointer into some external
// storage and a size. The user of a Slice must ensure that the slice
// is not used after the corresponding external storage has been
// deallocated.
//
// Multiple threads can invoke const methods on a Slice without
// external synchronization, but if any of the threads may call a
// non-const method, all threads accessing the same Slice must use
// external synchronization.
#ifndef STORAGE_ROCKSDB_INCLUDE_SLICE_H_
#define STORAGE_ROCKSDB_INCLUDE_SLICE_H_
#include <assert.h>
#include <stddef.h>
#include <string.h>
#include <string>
namespace rocksdb {
class Slice {
public:
// Create an empty slice.
Slice() : data_(""), size_(0) { }
// Create a slice that refers to d[0,n-1].
Slice(const char* d, size_t n) : data_(d), size_(n) { }
// Create a slice that refers to the contents of "s"
/* implicit */
Slice(const std::string& s) : data_(s.data()), size_(s.size()) { }
// Create a slice that refers to s[0,strlen(s)-1]
/* implicit */
Slice(const char* s) : data_(s), size_(strlen(s)) { }
// Return a pointer to the beginning of the referenced data
const char* data() const { return data_; }
// Return the length (in bytes) of the referenced data
size_t size() const { return size_; }
// Return true iff the length of the referenced data is zero
bool empty() const { return size_ == 0; }
// Return the ith byte in the referenced data.
// REQUIRES: n < size()
char operator[](size_t n) const {
assert(n < size());
return data_[n];
}
// Change this slice to refer to an empty array
void clear() { data_ = ""; size_ = 0; }
// Drop the first "n" bytes from this slice.
void remove_prefix(size_t n) {
assert(n <= size());
data_ += n;
size_ -= n;
}
// Return a string that contains the copy of the referenced data.
std::string ToString(bool hex = false) const {
if (hex) {
std::string result;
char buf[10];
for (size_t i = 0; i < size_; i++) {
snprintf(buf, 10, "%02X", (unsigned char)data_[i]);
result += buf;
}
return result;
} else {
return std::string(data_, size_);
}
}
// Three-way comparison. Returns value:
// < 0 iff "*this" < "b",
// == 0 iff "*this" == "b",
// > 0 iff "*this" > "b"
int compare(const Slice& b) const;
// Return true iff "x" is a prefix of "*this"
bool starts_with(const Slice& x) const {
return ((size_ >= x.size_) &&
(memcmp(data_, x.data_, x.size_) == 0));
}
// private: make these public for rocksdbjni access
const char* data_;
size_t size_;
// Intentionally copyable
};
// A set of Slices that are virtually concatenated together. 'parts' points
// to an array of Slices. The number of elements in the array is 'num_parts'.
struct SliceParts {
SliceParts(const Slice* parts, int num_parts) :
parts(parts), num_parts(num_parts) { }
const Slice* parts;
int num_parts;
};
inline bool operator==(const Slice& x, const Slice& y) {
return ((x.size() == y.size()) &&
(memcmp(x.data(), y.data(), x.size()) == 0));
}
inline bool operator!=(const Slice& x, const Slice& y) {
return !(x == y);
}
inline int Slice::compare(const Slice& b) const {
const int min_len = (size_ < b.size_) ? size_ : b.size_;
int r = memcmp(data_, b.data_, min_len);
if (r == 0) {
if (size_ < b.size_) r = -1;
else if (size_ > b.size_) r = +1;
}
return r;
}
} // namespace rocksdb
#endif // STORAGE_ROCKSDB_INCLUDE_SLICE_H_

View File

@@ -0,0 +1,47 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// Class for specifying user-defined functions which perform a
// transformation on a slice. It is not required that every slice
// belong to the domain and/or range of a function. Subclasses should
// define InDomain and InRange to determine which slices are in either
// of these sets respectively.
#ifndef STORAGE_ROCKSDB_INCLUDE_SLICE_TRANSFORM_H_
#define STORAGE_ROCKSDB_INCLUDE_SLICE_TRANSFORM_H_
#include <string>
namespace rocksdb {
class Slice;
class SliceTransform {
public:
virtual ~SliceTransform() {};
// Return the name of this transformation.
virtual const char* Name() const = 0;
// transform a src in domain to a dst in the range
virtual Slice Transform(const Slice& src) const = 0;
// determine whether this is a valid src upon the function applies
virtual bool InDomain(const Slice& src) const = 0;
// determine whether dst=Transform(src) for some src
virtual bool InRange(const Slice& dst) const = 0;
};
extern const SliceTransform* NewFixedPrefixTransform(size_t prefix_len);
extern const SliceTransform* NewNoopTransform();
}
#endif // STORAGE_ROCKSDB_INCLUDE_SLICE_TRANSFORM_H_

View File

@@ -0,0 +1,302 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#ifndef STORAGE_ROCKSDB_INCLUDE_STATISTICS_H_
#define STORAGE_ROCKSDB_INCLUDE_STATISTICS_H_
#include <atomic>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <string>
#include <memory>
#include <vector>
namespace rocksdb {
/**
* Keep adding ticker's here.
* Any ticker should have a value less than TICKER_ENUM_MAX.
* Add a new ticker by assigning it the current value of TICKER_ENUM_MAX
* Add a string representation in TickersNameMap below.
* And incrementing TICKER_ENUM_MAX.
*/
enum Tickers {
// total block cache misses
// REQUIRES: BLOCK_CACHE_MISS == BLOCK_CACHE_INDEX_MISS +
// BLOCK_CACHE_FILTER_MISS +
// BLOCK_CACHE_DATA_MISS;
BLOCK_CACHE_MISS,
// total block cache hit
// REQUIRES: BLOCK_CACHE_HIT == BLOCK_CACHE_INDEX_HIT +
// BLOCK_CACHE_FILTER_HIT +
// BLOCK_CACHE_DATA_HIT;
BLOCK_CACHE_HIT,
// # of blocks added to block cache.
BLOCK_CACHE_ADD,
// # of times cache miss when accessing index block from block cache.
BLOCK_CACHE_INDEX_MISS,
// # of times cache hit when accessing index block from block cache.
BLOCK_CACHE_INDEX_HIT,
// # of times cache miss when accessing filter block from block cache.
BLOCK_CACHE_FILTER_MISS,
// # of times cache hit when accessing filter block from block cache.
BLOCK_CACHE_FILTER_HIT,
// # of times cache miss when accessing data block from block cache.
BLOCK_CACHE_DATA_MISS,
// # of times cache hit when accessing data block from block cache.
BLOCK_CACHE_DATA_HIT,
// # of times bloom filter has avoided file reads.
BLOOM_FILTER_USEFUL,
/**
* COMPACTION_KEY_DROP_* count the reasons for key drop during compaction
* There are 3 reasons currently.
*/
COMPACTION_KEY_DROP_NEWER_ENTRY, // key was written with a newer value.
COMPACTION_KEY_DROP_OBSOLETE, // The key is obsolete.
COMPACTION_KEY_DROP_USER, // user compaction function has dropped the key.
// Number of keys written to the database via the Put and Write call's
NUMBER_KEYS_WRITTEN,
// Number of Keys read,
NUMBER_KEYS_READ,
// Number keys updated, if inplace update is enabled
NUMBER_KEYS_UPDATED,
// Bytes written / read
BYTES_WRITTEN,
BYTES_READ,
NO_FILE_CLOSES,
NO_FILE_OPENS,
NO_FILE_ERRORS,
// Time system had to wait to do LO-L1 compactions
STALL_L0_SLOWDOWN_MICROS,
// Time system had to wait to move memtable to L1.
STALL_MEMTABLE_COMPACTION_MICROS,
// write throttle because of too many files in L0
STALL_L0_NUM_FILES_MICROS,
RATE_LIMIT_DELAY_MILLIS,
NO_ITERATORS, // number of iterators currently open
// Number of MultiGet calls, keys read, and bytes read
NUMBER_MULTIGET_CALLS,
NUMBER_MULTIGET_KEYS_READ,
NUMBER_MULTIGET_BYTES_READ,
// Number of deletes records that were not required to be
// written to storage because key does not exist
NUMBER_FILTERED_DELETES,
NUMBER_MERGE_FAILURES,
SEQUENCE_NUMBER,
// number of times bloom was checked before creating iterator on a
// file, and the number of times the check was useful in avoiding
// iterator creation (and thus likely IOPs).
BLOOM_FILTER_PREFIX_CHECKED,
BLOOM_FILTER_PREFIX_USEFUL,
// Number of times we had to reseek inside an iteration to skip
// over large number of keys with same userkey.
NUMBER_OF_RESEEKS_IN_ITERATION,
// Record the number of calls to GetUpadtesSince. Useful to keep track of
// transaction log iterator refreshes
GET_UPDATES_SINCE_CALLS,
BLOCK_CACHE_COMPRESSED_MISS, // miss in the compressed block cache
BLOCK_CACHE_COMPRESSED_HIT, // hit in the compressed block cache
TICKER_ENUM_MAX
};
// The order of items listed in Tickers should be the same as
// the order listed in TickersNameMap
const std::vector<std::pair<Tickers, std::string>> TickersNameMap = {
{ BLOCK_CACHE_MISS, "rocksdb.block.cache.miss" },
{ BLOCK_CACHE_HIT, "rocksdb.block.cache.hit" },
{ BLOCK_CACHE_ADD, "rocksdb.block.cache.add" },
{ BLOCK_CACHE_INDEX_MISS, "rocksdb.block.cache.index.miss" },
{ BLOCK_CACHE_INDEX_HIT, "rocksdb.block.cache.index.hit" },
{ BLOCK_CACHE_FILTER_MISS, "rocksdb.block.cache.filter.miss" },
{ BLOCK_CACHE_FILTER_HIT, "rocksdb.block.cache.filter.hit" },
{ BLOCK_CACHE_DATA_MISS, "rocksdb.block.cache.data.miss" },
{ BLOCK_CACHE_DATA_HIT, "rocksdb.block.cache.data.hit" },
{ BLOOM_FILTER_USEFUL, "rocksdb.bloom.filter.useful" },
{ COMPACTION_KEY_DROP_NEWER_ENTRY, "rocksdb.compaction.key.drop.new" },
{ COMPACTION_KEY_DROP_OBSOLETE, "rocksdb.compaction.key.drop.obsolete" },
{ COMPACTION_KEY_DROP_USER, "rocksdb.compaction.key.drop.user" },
{ NUMBER_KEYS_WRITTEN, "rocksdb.number.keys.written" },
{ NUMBER_KEYS_READ, "rocksdb.number.keys.read" },
{ NUMBER_KEYS_UPDATED, "rocksdb.number.keys.updated" },
{ BYTES_WRITTEN, "rocksdb.bytes.written" },
{ BYTES_READ, "rocksdb.bytes.read" },
{ NO_FILE_CLOSES, "rocksdb.no.file.closes" },
{ NO_FILE_OPENS, "rocksdb.no.file.opens" },
{ NO_FILE_ERRORS, "rocksdb.no.file.errors" },
{ STALL_L0_SLOWDOWN_MICROS, "rocksdb.l0.slowdown.micros" },
{ STALL_MEMTABLE_COMPACTION_MICROS, "rocksdb.memtable.compaction.micros" },
{ STALL_L0_NUM_FILES_MICROS, "rocksdb.l0.num.files.stall.micros" },
{ RATE_LIMIT_DELAY_MILLIS, "rocksdb.rate.limit.delay.millis" },
{ NO_ITERATORS, "rocksdb.num.iterators" },
{ NUMBER_MULTIGET_CALLS, "rocksdb.number.multiget.get" },
{ NUMBER_MULTIGET_KEYS_READ, "rocksdb.number.multiget.keys.read" },
{ NUMBER_MULTIGET_BYTES_READ, "rocksdb.number.multiget.bytes.read" },
{ NUMBER_FILTERED_DELETES, "rocksdb.number.deletes.filtered" },
{ NUMBER_MERGE_FAILURES, "rocksdb.number.merge.failures" },
{ SEQUENCE_NUMBER, "rocksdb.sequence.number" },
{ BLOOM_FILTER_PREFIX_CHECKED, "rocksdb.bloom.filter.prefix.checked" },
{ BLOOM_FILTER_PREFIX_USEFUL, "rocksdb.bloom.filter.prefix.useful" },
{ NUMBER_OF_RESEEKS_IN_ITERATION, "rocksdb.number.reseeks.iteration" },
{ GET_UPDATES_SINCE_CALLS, "rocksdb.getupdatessince.calls" },
{ BLOCK_CACHE_COMPRESSED_MISS, "rocksdb.block.cachecompressed.miss" },
{ BLOCK_CACHE_COMPRESSED_HIT, "rocksdb.block.cachecompressed.hit" }
};
/**
* Keep adding histogram's here.
* Any histogram whould have value less than HISTOGRAM_ENUM_MAX
* Add a new Histogram by assigning it the current value of HISTOGRAM_ENUM_MAX
* Add a string representation in HistogramsNameMap below
* And increment HISTOGRAM_ENUM_MAX
*/
enum Histograms {
DB_GET,
DB_WRITE,
COMPACTION_TIME,
TABLE_SYNC_MICROS,
COMPACTION_OUTFILE_SYNC_MICROS,
WAL_FILE_SYNC_MICROS,
MANIFEST_FILE_SYNC_MICROS,
// TIME SPENT IN IO DURING TABLE OPEN
TABLE_OPEN_IO_MICROS,
DB_MULTIGET,
READ_BLOCK_COMPACTION_MICROS,
READ_BLOCK_GET_MICROS,
WRITE_RAW_BLOCK_MICROS,
STALL_L0_SLOWDOWN_COUNT,
STALL_MEMTABLE_COMPACTION_COUNT,
STALL_L0_NUM_FILES_COUNT,
HARD_RATE_LIMIT_DELAY_COUNT,
SOFT_RATE_LIMIT_DELAY_COUNT,
NUM_FILES_IN_SINGLE_COMPACTION,
HISTOGRAM_ENUM_MAX,
};
const std::vector<std::pair<Histograms, std::string>> HistogramsNameMap = {
{ DB_GET, "rocksdb.db.get.micros" },
{ DB_WRITE, "rocksdb.db.write.micros" },
{ COMPACTION_TIME, "rocksdb.compaction.times.micros" },
{ TABLE_SYNC_MICROS, "rocksdb.table.sync.micros" },
{ COMPACTION_OUTFILE_SYNC_MICROS, "rocksdb.compaction.outfile.sync.micros" },
{ WAL_FILE_SYNC_MICROS, "rocksdb.wal.file.sync.micros" },
{ MANIFEST_FILE_SYNC_MICROS, "rocksdb.manifest.file.sync.micros" },
{ TABLE_OPEN_IO_MICROS, "rocksdb.table.open.io.micros" },
{ DB_MULTIGET, "rocksdb.db.multiget.micros" },
{ READ_BLOCK_COMPACTION_MICROS, "rocksdb.read.block.compaction.micros" },
{ READ_BLOCK_GET_MICROS, "rocksdb.read.block.get.micros" },
{ WRITE_RAW_BLOCK_MICROS, "rocksdb.write.raw.block.micros" },
{ STALL_L0_SLOWDOWN_COUNT, "rocksdb.l0.slowdown.count"},
{ STALL_MEMTABLE_COMPACTION_COUNT, "rocksdb.memtable.compaction.count"},
{ STALL_L0_NUM_FILES_COUNT, "rocksdb.num.files.stall.count"},
{ HARD_RATE_LIMIT_DELAY_COUNT, "rocksdb.hard.rate.limit.delay.count"},
{ SOFT_RATE_LIMIT_DELAY_COUNT, "rocksdb.soft.rate.limit.delay.count"},
{ NUM_FILES_IN_SINGLE_COMPACTION, "rocksdb.numfiles.in.singlecompaction" },
};
struct HistogramData {
double median;
double percentile95;
double percentile99;
double average;
double standard_deviation;
};
class Histogram {
public:
// clear's the histogram
virtual void Clear() = 0;
virtual ~Histogram();
// Add a value to be recorded in the histogram.
virtual void Add(uint64_t value) = 0;
virtual std::string ToString() const = 0;
// Get statistics
virtual double Median() const = 0;
virtual double Percentile(double p) const = 0;
virtual double Average() const = 0;
virtual double StandardDeviation() const = 0;
virtual void Data(HistogramData * const data) const = 0;
};
/**
* A dumb ticker which keeps incrementing through its life time.
* Thread safe. Locking managed by implementation of this interface.
*/
class Ticker {
public:
Ticker() : count_(0) { }
inline void setTickerCount(uint64_t count) {
count_ = count;
}
inline void recordTick(int count = 1) {
count_ += count;
}
inline uint64_t getCount() {
return count_;
}
private:
std::atomic_uint_fast64_t count_;
};
// Analyze the performance of a db
class Statistics {
public:
virtual long getTickerCount(Tickers tickerType) = 0;
virtual void recordTick(Tickers tickerType, uint64_t count = 0) = 0;
virtual void setTickerCount(Tickers tickerType, uint64_t count) = 0;
virtual void measureTime(Histograms histogramType, uint64_t time) = 0;
virtual void histogramData(Histograms type, HistogramData * const data) = 0;
// String representation of the statistic object.
std::string ToString();
};
// Create a concrete DBStatistics object
std::shared_ptr<Statistics> CreateDBStatistics();
// Ease of Use functions
inline void RecordTick(std::shared_ptr<Statistics> statistics,
Tickers ticker,
uint64_t count = 1) {
assert(HistogramsNameMap.size() == HISTOGRAM_ENUM_MAX);
assert(TickersNameMap.size() == TICKER_ENUM_MAX);
if (statistics) {
statistics->recordTick(ticker, count);
}
}
inline void SetTickerCount(std::shared_ptr<Statistics> statistics,
Tickers ticker,
uint64_t count) {
assert(HistogramsNameMap.size() == HISTOGRAM_ENUM_MAX);
assert(TickersNameMap.size() == TICKER_ENUM_MAX);
if (statistics) {
statistics->setTickerCount(ticker, count);
}
}
} // namespace rocksdb
#endif // STORAGE_ROCKSDB_INCLUDE_STATISTICS_H_

130
include/rocksdb/status.h Normal file
View File

@@ -0,0 +1,130 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// A Status encapsulates the result of an operation. It may indicate success,
// or it may indicate an error with an associated error message.
//
// Multiple threads can invoke const methods on a Status without
// external synchronization, but if any of the threads may call a
// non-const method, all threads accessing the same Status must use
// external synchronization.
#ifndef STORAGE_ROCKSDB_INCLUDE_STATUS_H_
#define STORAGE_ROCKSDB_INCLUDE_STATUS_H_
#include <string>
#include "rocksdb/slice.h"
namespace rocksdb {
class Status {
public:
// Create a success status.
Status() : state_(nullptr) { }
~Status() { delete[] state_; }
// Copy the specified status.
Status(const Status& s);
void operator=(const Status& s);
// Return a success status.
static Status OK() { return Status(); }
// Return error status of an appropriate type.
static Status NotFound(const Slice& msg, const Slice& msg2 = Slice()) {
return Status(kNotFound, msg, msg2);
}
static Status Corruption(const Slice& msg, const Slice& msg2 = Slice()) {
return Status(kCorruption, msg, msg2);
}
static Status NotSupported(const Slice& msg, const Slice& msg2 = Slice()) {
return Status(kNotSupported, msg, msg2);
}
static Status InvalidArgument(const Slice& msg, const Slice& msg2 = Slice()) {
return Status(kInvalidArgument, msg, msg2);
}
static Status IOError(const Slice& msg, const Slice& msg2 = Slice()) {
return Status(kIOError, msg, msg2);
}
static Status MergeInProgress(const Slice& msg, const Slice& msg2 = Slice()) {
return Status(kMergeInProgress, msg, msg2);
}
static Status Incomplete(const Slice& msg, const Slice& msg2 = Slice()) {
return Status(kIncomplete, msg, msg2);
}
// Returns true iff the status indicates success.
bool ok() const { return (state_ == nullptr); }
// Returns true iff the status indicates a NotFound error.
bool IsNotFound() const { return code() == kNotFound; }
// Returns true iff the status indicates a Corruption error.
bool IsCorruption() const { return code() == kCorruption; }
// Returns true iff the status indicates a NotSupported error.
bool IsNotSupported() const { return code() == kNotSupported; }
// Returns true iff the status indicates an InvalidArgument error.
bool IsInvalidArgument() const { return code() == kInvalidArgument; }
// Returns true iff the status indicates an IOError.
bool IsIOError() const { return code() == kIOError; }
// Returns true iff the status indicates an MergeInProgress.
bool IsMergeInProgress() const { return code() == kMergeInProgress; }
// Returns true iff the status indicates Incomplete
bool IsIncomplete() const { return code() == kIncomplete; }
// Return a string representation of this status suitable for printing.
// Returns the string "OK" for success.
std::string ToString() const;
private:
// OK status has a nullptr state_. Otherwise, state_ is a new[] array
// of the following form:
// state_[0..3] == length of message
// state_[4] == code
// state_[5..] == message
const char* state_;
enum Code {
kOk = 0,
kNotFound = 1,
kCorruption = 2,
kNotSupported = 3,
kInvalidArgument = 4,
kIOError = 5,
kMergeInProgress = 6,
kIncomplete = 7
};
Code code() const {
return (state_ == nullptr) ? kOk : static_cast<Code>(state_[4]);
}
Status(Code code, const Slice& msg, const Slice& msg2);
static const char* CopyState(const char* s);
};
inline Status::Status(const Status& s) {
state_ = (s.state_ == nullptr) ? nullptr : CopyState(s.state_);
}
inline void Status::operator=(const Status& s) {
// The following condition catches both aliasing (when this == &s),
// and the common case where both s and *this are ok.
if (state_ != s.state_) {
delete[] state_;
state_ = (s.state_ == nullptr) ? nullptr : CopyState(s.state_);
}
}
} // namespace rocksdb
#endif // STORAGE_ROCKSDB_INCLUDE_STATUS_H_

180
include/rocksdb/table.h Normal file
View File

@@ -0,0 +1,180 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#pragma once
#include <memory>
#include <stdint.h>
#include "rocksdb/env.h"
#include "rocksdb/iterator.h"
#include "rocksdb/table_properties.h"
#include "rocksdb/options.h"
namespace rocksdb {
struct Options;
class RandomAccessFile;
struct ReadOptions;
class TableCache;
class WritableFile;
using std::unique_ptr;
// TableBuilder provides the interface used to build a Table
// (an immutable and sorted map from keys to values).
//
// Multiple threads can invoke const methods on a TableBuilder without
// external synchronization, but if any of the threads may call a
// non-const method, all threads accessing the same TableBuilder must use
// external synchronization.
class TableBuilder {
public:
// REQUIRES: Either Finish() or Abandon() has been called.
virtual ~TableBuilder() {}
// Add key,value to the table being constructed.
// REQUIRES: key is after any previously added key according to comparator.
// REQUIRES: Finish(), Abandon() have not been called
virtual void Add(const Slice& key, const Slice& value) = 0;
// Return non-ok iff some error has been detected.
virtual Status status() const = 0;
// Finish building the table.
// REQUIRES: Finish(), Abandon() have not been called
virtual Status Finish() = 0;
// Indicate that the contents of this builder should be abandoned.
// If the caller is not going to call Finish(), it must call Abandon()
// before destroying this builder.
// REQUIRES: Finish(), Abandon() have not been called
virtual void Abandon() = 0;
// Number of calls to Add() so far.
virtual uint64_t NumEntries() const = 0;
// Size of the file generated so far. If invoked after a successful
// Finish() call, returns the size of the final generated file.
virtual uint64_t FileSize() const = 0;
};
// A Table is a sorted map from strings to strings. Tables are
// immutable and persistent. A Table may be safely accessed from
// multiple threads without external synchronization.
class TableReader {
public:
virtual ~TableReader() {}
// Determine whether there is a chance that the current table file
// contains the key a key starting with iternal_prefix. The specific
// table implementation can use bloom filter and/or other heuristic
// to filter out this table as a whole.
virtual bool PrefixMayMatch(const Slice& internal_prefix) = 0;
// Returns a new iterator over the table contents.
// The result of NewIterator() is initially invalid (caller must
// call one of the Seek methods on the iterator before using it).
virtual Iterator* NewIterator(const ReadOptions&) = 0;
// Given a key, return an approximate byte offset in the file where
// the data for that key begins (or would begin if the key were
// present in the file). The returned value is in terms of file
// bytes, and so includes effects like compression of the underlying data.
// E.g., the approximate offset of the last key in the table will
// be close to the file length.
virtual uint64_t ApproximateOffsetOf(const Slice& key) = 0;
// Returns true if the block for the specified key is in cache.
// REQUIRES: key is in this table.
virtual bool TEST_KeyInCache(const ReadOptions& options,
const Slice& key) = 0;
// Set up the table for Compaction. Might change some parameters with
// posix_fadvise
virtual void SetupForCompaction() = 0;
virtual TableProperties& GetTableProperties() = 0;
// Calls (*result_handler)(handle_context, ...) repeatedly, starting with
// the entry found after a call to Seek(key), until result_handler returns
// false, where k is the actual internal key for a row found and v as the
// value of the key. didIO is true if I/O is involved in the operation. May
// not make such a call if filter policy says that key is not present.
//
// mark_key_may_exist_handler needs to be called when it is configured to be
// memory only and the key is not found in the block cache, with
// the parameter to be handle_context.
//
// readOptions is the options for the read
// key is the key to search for
virtual Status Get(
const ReadOptions& readOptions,
const Slice& key,
void* handle_context,
bool (*result_handler)(void* handle_context, const Slice& k,
const Slice& v, bool didIO),
void (*mark_key_may_exist_handler)(void* handle_context) = nullptr) = 0;
};
// A base class for table factories
class TableFactory {
public:
virtual ~TableFactory() {}
// The type of the table.
//
// The client of this package should switch to a new name whenever
// the table format implementation changes.
//
// Names starting with "rocksdb." are reserved and should not be used
// by any clients of this package.
virtual const char* Name() const = 0;
// Returns a Table object table that can fetch data from file specified
// in parameter file. It's the caller's responsibility to make sure
// file is in the correct format.
//
// GetTableReader() is called in two places:
// (1) TableCache::FindTable() calls the function when table cache miss
// and cache the table object returned.
// (1) SstFileReader (for SST Dump) opens the table and dump the table
// contents using the interator of the table.
// options and soptions are options. options is the general options.
// Multiple configured can be accessed from there, including and not
// limited to block cache and key comparators.
// file is a file handler to handle the file for the table
// file_size is the physical file size of the file
// table_reader is the output table reader
virtual Status GetTableReader(
const Options& options, const EnvOptions& soptions,
unique_ptr<RandomAccessFile> && file, uint64_t file_size,
unique_ptr<TableReader>* table_reader) const = 0;
// Return a table builder to write to a file for this table type.
//
// It is called in several places:
// (1) When flushing memtable to a level-0 output file, it creates a table
// builder (In DBImpl::WriteLevel0Table(), by calling BuildTable())
// (2) During compaction, it gets the builder for writing compaction output
// files in DBImpl::OpenCompactionOutputFile().
// (3) When recovering from transaction logs, it creates a table builder to
// write to a level-0 output file (In DBImpl::WriteLevel0TableForRecovery,
// by calling BuildTable())
// (4) When running Repairer, it creates a table builder to convert logs to
// SST files (In Repairer::ConvertLogToTable() by calling BuildTable())
//
// options is the general options. Multiple configured can be acceseed from
// there, including and not limited to compression options.
// file is a handle of a writable file. It is the caller's responsibility to
// keep the file open and close the file after closing the table builder.
// compression_type is the compression type to use in this table.
virtual TableBuilder* GetTableBuilder(
const Options& options, WritableFile* file,
CompressionType compression_type) const = 0;
};
} // namespace rocksdb

View File

@@ -0,0 +1,90 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#pragma once
#include <string>
#include <unordered_map>
#include "rocksdb/status.h"
namespace rocksdb {
// TableProperties contains a bunch of read-only properties of its associated
// table.
struct TableProperties {
public:
// Other than basic table properties, each table may also have the user
// collected properties.
// The value of the user-collected properties are encoded as raw bytes --
// users have to interprete these values by themselves.
typedef
std::unordered_map<std::string, std::string>
UserCollectedProperties;
// the total size of all data blocks.
uint64_t data_size = 0;
// the size of index block.
uint64_t index_size = 0;
// the size of filter block.
uint64_t filter_size = 0;
// total raw key size
uint64_t raw_key_size = 0;
// total raw value size
uint64_t raw_value_size = 0;
// the number of blocks in this table
uint64_t num_data_blocks = 0;
// the number of entries in this table
uint64_t num_entries = 0;
// The name of the filter policy used in this table.
// If no filter policy is used, `filter_policy_name` will be an empty string.
std::string filter_policy_name;
// user collected properties
UserCollectedProperties user_collected_properties;
// convert this object to a human readable form
// @prop_delim: delimiter for each property.
std::string ToString(
const std::string& prop_delim = "; ",
const std::string& kv_delim = "=") const;
};
// `TablePropertiesCollector` provides the mechanism for users to collect
// their own interested properties. This class is essentially a collection
// of callback functions that will be invoked during table building.
class TablePropertiesCollector {
public:
virtual ~TablePropertiesCollector() { }
// Add() will be called when a new key/value pair is inserted into the table.
// @params key the original key that is inserted into the table.
// @params value the original value that is inserted into the table.
virtual Status Add(const Slice& key, const Slice& value) = 0;
// Finish() will be called when a table has already been built and is ready
// for writing the properties block.
// @params properties User will add their collected statistics to
// `properties`.
virtual Status Finish(
TableProperties::UserCollectedProperties* properties) = 0;
// The name of the properties collector can be used for debugging purpose.
virtual const char* Name() const = 0;
// Return the human-readable properties, where the key is property name and
// the value is the human-readable form of value.
virtual TableProperties::UserCollectedProperties
GetReadableProperties() const = 0;
};
// Extra properties
// Below is a list of non-basic properties that are collected by database
// itself. Especially some properties regarding to the internal keys (which
// is unknown to `table`).
extern uint64_t GetDeletedKeys(
const TableProperties::UserCollectedProperties& props);
} // namespace rocksdb

View File

@@ -0,0 +1,91 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#ifndef STORAGE_ROCKSDB_INCLUDE_TRANSACTION_LOG_ITERATOR_H_
#define STORAGE_ROCKSDB_INCLUDE_TRANSACTION_LOG_ITERATOR_H_
#include "rocksdb/status.h"
#include "rocksdb/types.h"
#include "rocksdb/write_batch.h"
#include <memory>
#include <vector>
namespace rocksdb {
class LogFile;
typedef std::vector<std::unique_ptr<LogFile>> VectorLogPtr;
enum WalFileType {
/* Indicates that WAL file is in archive directory. WAL files are moved from
* the main db directory to archive directory once they are not live and stay
* there until cleaned up. Files are cleaned depending on archive size
* (Options::WAL_size_limit_MB) and time since last cleaning
* (Options::WAL_ttl_seconds).
*/
kArchivedLogFile = 0,
/* Indicates that WAL file is live and resides in the main db directory */
kAliveLogFile = 1
} ;
class LogFile {
public:
LogFile() {}
virtual ~LogFile() {}
// Returns log file's pathname relative to the main db dir
// Eg. For a live-log-file = /000003.log
// For an archived-log-file = /archive/000003.log
virtual std::string PathName() const = 0;
// Primary identifier for log file.
// This is directly proportional to creation time of the log file
virtual uint64_t LogNumber() const = 0;
// Log file can be either alive or archived
virtual WalFileType Type() const = 0;
// Starting sequence number of writebatch written in this log file
virtual SequenceNumber StartSequence() const = 0;
// Size of log file on disk in Bytes
virtual uint64_t SizeFileBytes() const = 0;
};
struct BatchResult {
SequenceNumber sequence = SequenceNumber();
std::unique_ptr<WriteBatch> writeBatchPtr;
};
// A TransactionLogIterator is used to iterate over the transactions in a db.
// One run of the iterator is continuous, i.e. the iterator will stop at the
// beginning of any gap in sequences
class TransactionLogIterator {
public:
TransactionLogIterator() {}
virtual ~TransactionLogIterator() {}
// An iterator is either positioned at a WriteBatch or not valid.
// This method returns true if the iterator is valid.
// Can read data from a valid iterator.
virtual bool Valid() = 0;
// Moves the iterator to the next WriteBatch.
// REQUIRES: Valid() to be true.
virtual void Next() = 0;
// Returns ok if the iterator is valid.
// Returns the Error when something has gone wrong.
virtual Status status() = 0;
// If valid return's the current write_batch and the sequence number of the
// earliest transaction contained in the batch.
// ONLY use if Valid() is true and status() is OK.
virtual BatchResult GetBatch() = 0;
};
} // namespace rocksdb
#endif // STORAGE_ROCKSDB_INCLUDE_TRANSACTION_LOG_ITERATOR_H_

20
include/rocksdb/types.h Normal file
View File

@@ -0,0 +1,20 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#ifndef STORAGE_ROCKSDB_INCLUDE_TYPES_H_
#define STORAGE_ROCKSDB_INCLUDE_TYPES_H_
#include <stdint.h>
namespace rocksdb {
// Define all public custom types here.
// Represents a sequence number in a WAL file.
typedef uint64_t SequenceNumber;
} // namespace rocksdb
#endif // STORAGE_ROCKSDB_INCLUDE_TYPES_H_

View File

@@ -0,0 +1,89 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#ifndef STORAGE_ROCKSDB_UNIVERSAL_COMPACTION_OPTIONS_H
#define STORAGE_ROCKSDB_UNIVERSAL_COMPACTION_OPTIONS_H
#include <stddef.h>
#include <string>
#include <memory>
#include <vector>
#include <stdint.h>
#include <climits>
#include "rocksdb/slice.h"
#include "rocksdb/statistics.h"
namespace rocksdb {
//
// Algorithm used to make a compaction request stop picking new files
// into a single compaction run
//
enum CompactionStopStyle {
kCompactionStopStyleSimilarSize, // pick files of similar size
kCompactionStopStyleTotalSize // total size of picked files > next file
};
class CompactionOptionsUniversal {
public:
// Percentage flexibilty while comparing file size. If the candidate file(s)
// size is 1% smaller than the next file's size, then include next file into
// this candidate set. // Default: 1
unsigned int size_ratio;
// The minimum number of files in a single compaction run. Default: 2
unsigned int min_merge_width;
// The maximum number of files in a single compaction run. Default: UINT_MAX
unsigned int max_merge_width;
// The size amplification is defined as the amount (in percentage) of
// additional storage needed to store a single byte of data in the database.
// For example, a size amplification of 2% means that a database that
// contains 100 bytes of user-data may occupy upto 102 bytes of
// physical storage. By this definition, a fully compacted database has
// a size amplification of 0%. Rocksdb uses the following heuristic
// to calculate size amplification: it assumes that all files excluding
// the earliest file contribute to the size amplification.
// Default: 200, which means that a 100 byte database could require upto
// 300 bytes of storage.
unsigned int max_size_amplification_percent;
// If this option is set to be -1 (the default value), all the output files
// will follow compression type specified.
//
// If this option is not negative, we will try to make sure compressed
// size is just above this value. In normal cases, at least this percentage
// of data will be compressed.
// When we are compacting to a new file, here is the criteria whether
// it needs to be compressed: assuming here are the list of files sorted
// by generation time:
// A1...An B1...Bm C1...Ct
// where A1 is the newest and Ct is the oldest, and we are going to compact
// B1...Bm, we calculate the total size of all the files as total_size, as
// well as the total size of C1...Ct as total_C, the compaction output file
// will be compressed iff
// total_C / total_size < this percentage
int compression_size_percent;
// The algorithm used to stop picking files into a single compaction run
// Default: kCompactionStopStyleTotalSize
CompactionStopStyle stop_style;
// Default set of parameters
CompactionOptionsUniversal() :
size_ratio(1),
min_merge_width(2),
max_merge_width(UINT_MAX),
max_size_amplification_percent(200),
compression_size_percent(-1),
stop_style(kCompactionStopStyleTotalSize) {
}
};
} // namespace rocksdb
#endif // STORAGE_ROCKSDB_UNIVERSAL_COMPACTION_OPTIONS_H

View File

@@ -0,0 +1,109 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// WriteBatch holds a collection of updates to apply atomically to a DB.
//
// The updates are applied in the order in which they are added
// to the WriteBatch. For example, the value of "key" will be "v3"
// after the following batch is written:
//
// batch.Put("key", "v1");
// batch.Delete("key");
// batch.Put("key", "v2");
// batch.Put("key", "v3");
//
// Multiple threads can invoke const methods on a WriteBatch without
// external synchronization, but if any of the threads may call a
// non-const method, all threads accessing the same WriteBatch must use
// external synchronization.
#ifndef STORAGE_ROCKSDB_INCLUDE_WRITE_BATCH_H_
#define STORAGE_ROCKSDB_INCLUDE_WRITE_BATCH_H_
#include <string>
#include "rocksdb/status.h"
namespace rocksdb {
class Slice;
struct SliceParts;
class WriteBatch {
public:
WriteBatch();
~WriteBatch();
// Store the mapping "key->value" in the database.
void Put(const Slice& key, const Slice& value);
// Variant of Put() that gathers output like writev(2). The key and value
// that will be written to the database are concatentations of arrays of
// slices.
void Put(const SliceParts& key, const SliceParts& value);
// Merge "value" with the existing value of "key" in the database.
// "key->merge(existing, value)"
void Merge(const Slice& key, const Slice& value);
// If the database contains a mapping for "key", erase it. Else do nothing.
void Delete(const Slice& key);
// Append a blob of arbitrary size to the records in this batch. The blob will
// be stored in the transaction log but not in any other file. In particular,
// it will not be persisted to the SST files. When iterating over this
// WriteBatch, WriteBatch::Handler::LogData will be called with the contents
// of the blob as it is encountered. Blobs, puts, deletes, and merges will be
// encountered in the same order in thich they were inserted. The blob will
// NOT consume sequence number(s) and will NOT increase the count of the batch
//
// Example application: add timestamps to the transaction log for use in
// replication.
void PutLogData(const Slice& blob);
// Clear all updates buffered in this batch.
void Clear();
// Support for iterating over the contents of a batch.
class Handler {
public:
virtual ~Handler();
virtual void Put(const Slice& key, const Slice& value) = 0;
// Merge and LogData are not pure virtual. Otherwise, we would break
// existing clients of Handler on a source code level. The default
// implementation of Merge simply throws a runtime exception.
virtual void Merge(const Slice& key, const Slice& value);
// The default implementation of LogData does nothing.
virtual void LogData(const Slice& blob);
virtual void Delete(const Slice& key) = 0;
// Continue is called by WriteBatch::Iterate. If it returns false,
// iteration is halted. Otherwise, it continues iterating. The default
// implementation always returns true.
virtual bool Continue();
};
Status Iterate(Handler* handler) const;
// Retrieve the serialized version of this batch.
std::string Data() { return rep_; }
// Returns the number of updates in the batch
int Count() const;
// Constructor with a serialized string object
explicit WriteBatch(std::string rep): rep_(rep) {}
private:
friend class WriteBatchInternal;
std::string rep_; // See comment in write_batch.cc for the format of rep_
// Intentionally copyable
};
} // namespace rocksdb
#endif // STORAGE_ROCKSDB_INCLUDE_WRITE_BATCH_H_

View File

@@ -0,0 +1,161 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#pragma once
#include "rocksdb/db.h"
namespace rocksdb {
// This class contains APIs to stack rocksdb wrappers.Eg. Stack TTL over base d
class StackableDB : public DB {
public:
explicit StackableDB(StackableDB* sdb) : sdb_(sdb) {}
// Returns the DB object that is the lowermost component in the stack of DBs
virtual DB* GetRawDB() {
return sdb_->GetRawDB();
}
// convert a DB to StackableDB
// TODO: This function does not work yet. Passing nullptr to StackableDB in
// NewStackableDB's constructor will cause segfault on object's usage
static StackableDB* DBToStackableDB(DB* db) {
class NewStackableDB : public StackableDB {
public:
NewStackableDB(DB* db)
: StackableDB(nullptr),
db_(db) {}
DB* GetRawDB() {
return db_;
}
private:
DB* db_;
};
return new NewStackableDB(db);
}
virtual Status Put(const WriteOptions& options,
const Slice& key,
const Slice& val) override {
return sdb_->Put(options, key, val);
}
virtual Status Get(const ReadOptions& options,
const Slice& key,
std::string* value) override {
return sdb_->Get(options, key, value);
}
virtual std::vector<Status> MultiGet(const ReadOptions& options,
const std::vector<Slice>& keys,
std::vector<std::string>* values)
override {
return sdb_->MultiGet(options, keys, values);
}
virtual bool KeyMayExist(const ReadOptions& options,
const Slice& key,
std::string* value,
bool* value_found = nullptr) override {
return sdb_->KeyMayExist(options, key, value, value_found);
}
virtual Status Delete(const WriteOptions& wopts, const Slice& key) override {
return sdb_->Delete(wopts, key);
}
virtual Status Merge(const WriteOptions& options,
const Slice& key,
const Slice& value) override {
return sdb_->Merge(options, key, value);
}
virtual Status Write(const WriteOptions& opts, WriteBatch* updates)
override {
return sdb_->Write(opts, updates);
}
virtual Iterator* NewIterator(const ReadOptions& opts) override {
return sdb_->NewIterator(opts);
}
virtual const Snapshot* GetSnapshot() override {
return sdb_->GetSnapshot();
}
virtual void ReleaseSnapshot(const Snapshot* snapshot) override {
return sdb_->ReleaseSnapshot(snapshot);
}
virtual bool GetProperty(const Slice& property, std::string* value)
override {
return sdb_->GetProperty(property, value);
}
virtual void GetApproximateSizes(const Range* r, int n, uint64_t* sizes)
override {
return sdb_->GetApproximateSizes(r, n, sizes);
}
virtual void CompactRange(const Slice* begin, const Slice* end,
bool reduce_level = false,
int target_level = -1) override {
return sdb_->CompactRange(begin, end, reduce_level, target_level);
}
virtual int NumberLevels() override {
return sdb_->NumberLevels();
}
virtual int MaxMemCompactionLevel() override {
return sdb_->MaxMemCompactionLevel();
}
virtual int Level0StopWriteTrigger() override {
return sdb_->Level0StopWriteTrigger();
}
virtual Status Flush(const FlushOptions& fopts) override {
return sdb_->Flush(fopts);
}
virtual Status DisableFileDeletions() override {
return sdb_->DisableFileDeletions();
}
virtual Status EnableFileDeletions() override {
return sdb_->EnableFileDeletions();
}
virtual Status GetLiveFiles(std::vector<std::string>& vec, uint64_t* mfs,
bool flush_memtable = true) override {
return sdb_->GetLiveFiles(vec, mfs, flush_memtable);
}
virtual SequenceNumber GetLatestSequenceNumber() const override {
return sdb_->GetLatestSequenceNumber();
}
virtual Status GetSortedWalFiles(VectorLogPtr& files) override {
return sdb_->GetSortedWalFiles(files);
}
virtual Status DeleteFile(std::string name) override {
return sdb_->DeleteFile(name);
}
virtual Status GetUpdatesSince(SequenceNumber seq_number,
unique_ptr<TransactionLogIterator>* iter)
override {
return sdb_->GetUpdatesSince(seq_number, iter);
}
protected:
StackableDB* sdb_;
};
} // namespace rocksdb

View File

@@ -0,0 +1,50 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#pragma once
#include "stackable_db.h"
namespace rocksdb {
// This class contains APIs to open rocksdb with specific support eg. TTL
class UtilityDB {
public:
// Open the database with TTL support.
//
// USE-CASES:
// This API should be used to open the db when key-values inserted are
// meant to be removed from the db in a non-strict 'ttl' amount of time
// Therefore, this guarantees that key-values inserted will remain in the
// db for >= ttl amount of time and the db will make efforts to remove the
// key-values as soon as possible after ttl seconds of their insertion.
//
// BEHAVIOUR:
// TTL is accepted in seconds
// (int32_t)Timestamp(creation) is suffixed to values in Put internally
// Expired TTL values deleted in compaction only:(Timestamp+ttl<time_now)
// Get/Iterator may return expired entries(compaction not run on them yet)
// Different TTL may be used during different Opens
// Example: Open1 at t=0 with ttl=4 and insert k1,k2, close at t=2
// Open2 at t=3 with ttl=5. Now k1,k2 should be deleted at t>=5
// read_only=true opens in the usual read-only mode. Compactions will not be
// triggered(neither manual nor automatic), so no expired entries removed
//
// CONSTRAINTS:
// Not specifying/passing or non-positive TTL behaves like TTL = infinity
//
// !!!WARNING!!!:
// Calling DB::Open directly to re-open a db created by this API will get
// corrupt values(timestamp suffixed) and no ttl effect will be there
// during the second Open, so use this API consistently to open the db
// Be careful when passing ttl with a small positive value because the
// whole database may be deleted in a small amount of time
static Status OpenTtlDB(const Options& options,
const std::string& name,
StackableDB** dbptr,
int32_t ttl = 0,
bool read_only = false);
};
} // namespace rocksdb