Merge branch 'master' into columnfamilies

Conflicts:
	HISTORY.md
	db/db_impl.cc
	db/db_impl.h
	db/db_iter.cc
	db/db_test.cc
	db/dbformat.h
	db/memtable.cc
	db/memtable_list.cc
	db/memtable_list.h
	db/table_cache.cc
	db/table_cache.h
	db/version_edit.h
	db/version_set.cc
	db/version_set.h
	db/write_batch.cc
	db/write_batch_test.cc
	include/rocksdb/options.h
	util/options.cc
This commit is contained in:
Igor Canadi
2014-02-06 15:42:16 -08:00
104 changed files with 6225 additions and 2358 deletions

View File

@@ -1,45 +0,0 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// Arena class defines memory allocation methods. It's used by memtable and
// skiplist.
#ifndef STORAGE_ROCKSDB_INCLUDE_ARENA_H_
#define STORAGE_ROCKSDB_INCLUDE_ARENA_H_
#include <limits>
#include <memory>
namespace rocksdb {
class Arena {
public:
Arena() {};
virtual ~Arena() {};
// Return a pointer to a newly allocated memory block of "bytes" bytes.
virtual char* Allocate(size_t bytes) = 0;
// Allocate memory with the normal alignment guarantees provided by malloc.
virtual char* AllocateAligned(size_t bytes) = 0;
// Returns an estimate of the total memory used by arena.
virtual const size_t ApproximateMemoryUsage() = 0;
// Returns the total number of bytes in all blocks allocated so far.
virtual const size_t MemoryAllocatedBytes() = 0;
private:
// No copying allowed
Arena(const Arena&);
void operator=(const Arena&);
};
} // namespace rocksdb
#endif // STORAGE_ROCKSDB_INCLUDE_ARENA_H_

View File

@@ -102,7 +102,10 @@ class Cache {
virtual uint64_t NewId() = 0;
// returns the maximum configured capacity of the cache
virtual size_t GetCapacity() = 0;
virtual size_t GetCapacity() const = 0;
// returns the memory size for the entries residing in the cache.
virtual size_t GetUsage() const = 0;
// Call this on shutdown if you want to speed it up. Cache will disown
// any underlying data and will not free it on delete. This call will leak

View File

@@ -438,7 +438,7 @@ class WritableFile {
// This asks the OS to initiate flushing the cached data to disk,
// without waiting for completion.
// Default implementation does nothing.
virtual Status RangeSync(off64_t offset, off64_t nbytes) {
virtual Status RangeSync(off_t offset, off_t nbytes) {
return Status::OK();
}

View File

@@ -33,8 +33,7 @@
// iteration over the entire collection is rare since doing so requires all the
// keys to be copied into a sorted data structure.
#ifndef STORAGE_ROCKSDB_DB_MEMTABLEREP_H_
#define STORAGE_ROCKSDB_DB_MEMTABLEREP_H_
#pragma once
#include <memory>
@@ -52,7 +51,11 @@ class MemTableRep {
public:
// Compare a and b. Return a negative value if a is less than b, 0 if they
// are equal, and a positive value if a is greater than b
virtual int operator()(const char* a, const char* b) const = 0;
virtual int operator()(const char* prefix_len_key1,
const char* prefix_len_key2) const = 0;
virtual int operator()(const char* prefix_len_key,
const Slice& key) const = 0;
virtual ~KeyComparator() { }
};
@@ -100,7 +103,7 @@ class MemTableRep {
virtual void Prev() = 0;
// Advance to the first entry with a key >= target
virtual void Seek(const char* target) = 0;
virtual void Seek(const Slice& internal_key, const char* memtable_key) = 0;
// Position at the first entry in collection.
// Final state of iterator is Valid() iff collection is not empty.
@@ -175,26 +178,22 @@ public:
}
};
// HashSkipListRep is backed by hash map of buckets. Each bucket is a skip
// list. All the keys with the same prefix will be in the same bucket.
// The prefix is determined using user supplied SliceTransform. It has
// to match prefix_extractor in options.prefix_extractor.
//
// Iteration over the entire collection is implemented by dumping all the keys
// into a separate skip list. Thus, these data structures are best used when
// iteration over the entire collection is rare.
//
// Parameters:
// transform: The prefix extractor that returns prefix when supplied a user
// key. Has to match options.prefix_extractor
// bucket_count: Number of buckets in a hash_map. Each bucket needs
// 8 bytes. By default, we set buckets to one million, which
// will take 8MB of memory. If you know the number of keys you'll
// keep in hash map, set bucket count to be approximately twice
// the number of keys
// This class contains a fixed array of buckets, each
// pointing to a skiplist (null if the bucket is empty).
// bucket_count: number of fixed array buckets
// skiplist_height: the max height of the skiplist
// skiplist_branching_factor: probabilistic size ratio between adjacent
// link lists in the skiplist
extern MemTableRepFactory* NewHashSkipListRepFactory(
const SliceTransform* transform, size_t bucket_count = 1000000);
const SliceTransform* transform, size_t bucket_count = 1000000,
int32_t skiplist_height = 4, int32_t skiplist_branching_factor = 4
);
// The factory is to create memtables with a hashed linked list:
// it contains a fixed array of buckets, each pointing to a sorted single
// linked list (null if the bucket is empty).
// bucket_count: number of fixed array buckets
extern MemTableRepFactory* NewHashLinkListRepFactory(
const SliceTransform* transform, size_t bucket_count = 50000);
}
#endif // STORAGE_ROCKSDB_DB_MEMTABLEREP_H_

View File

@@ -34,6 +34,7 @@ class TablePropertiesCollector;
class Slice;
class SliceTransform;
class Statistics;
class InternalKeyComparator;
using std::shared_ptr;
@@ -65,6 +66,12 @@ struct CompressionOptions {
: window_bits(wbits), level(lev), strategy(strategy) {}
};
enum UpdateStatus { // Return status For inplace update callback
UPDATE_FAILED = 0, // Nothing to update
UPDATED_INPLACE = 1, // Value updated inplace
UPDATED = 2, // No inplace update. Merged value set
};
struct Options;
struct ColumnFamilyOptions {
@@ -410,13 +417,17 @@ struct ColumnFamilyOptions {
// the tables.
// Default: emtpy vector -- no user-defined statistics collection will be
// performed.
std::vector<std::shared_ptr<TablePropertiesCollector>>
table_properties_collectors;
typedef std::vector<std::shared_ptr<TablePropertiesCollector>>
TablePropertiesCollectors;
TablePropertiesCollectors table_properties_collectors;
// Allows thread-safe inplace updates. Requires Updates iff
// * key exists in current memtable
// * new sizeof(new_value) <= sizeof(old_value)
// * old_value for that key is a put i.e. kTypeValue
// Allows thread-safe inplace updates.
// If inplace_callback function is not set,
// Put(key, new_value) will update inplace the existing_value iff
// * key exists in current memtable
// * new sizeof(new_value) <= sizeof(existing_value)
// * existing_value for that key is a put i.e. kTypeValue
// If inplace_callback function is set, check doc for inplace_callback.
// Default: false.
bool inplace_update_support;
@@ -424,6 +435,55 @@ struct ColumnFamilyOptions {
// Default: 10000, if inplace_update_support = true, else 0.
size_t inplace_update_num_locks;
// existing_value - pointer to previous value (from both memtable and sst).
// nullptr if key doesn't exist
// existing_value_size - pointer to size of existing_value).
// nullptr if key doesn't exist
// delta_value - Delta value to be merged with the existing_value.
// Stored in transaction logs.
// merged_value - Set when delta is applied on the previous value.
// Applicable only when inplace_update_support is true,
// this callback function is called at the time of updating the memtable
// as part of a Put operation, lets say Put(key, delta_value). It allows the
// 'delta_value' specified as part of the Put operation to be merged with
// an 'existing_value' of the key in the database.
// If the merged value is smaller in size that the 'existing_value',
// then this function can update the 'existing_value' buffer inplace and
// the corresponding 'existing_value'_size pointer, if it wishes to.
// The callback should return UpdateStatus::UPDATED_INPLACE.
// In this case. (In this case, the snapshot-semantics of the rocksdb
// Iterator is not atomic anymore).
// If the merged value is larger in size than the 'existing_value' or the
// application does not wish to modify the 'existing_value' buffer inplace,
// then the merged value should be returned via *merge_value. It is set by
// merging the 'existing_value' and the Put 'delta_value'. The callback should
// return UpdateStatus::UPDATED in this case. This merged value will be added
// to the memtable.
// If merging fails or the application does not wish to take any action,
// then the callback should return UpdateStatus::UPDATE_FAILED.
// Please remember that the original call from the application is Put(key,
// delta_value). So the transaction log (if enabled) will still contain (key,
// delta_value). The 'merged_value' is not stored in the transaction log.
// Hence the inplace_callback function should be consistent across db reopens.
// Default: nullptr
UpdateStatus (*inplace_callback)(char* existing_value,
uint32_t* existing_value_size,
Slice delta_value,
std::string* merged_value);
// if prefix_extractor is set and bloom_bits is not 0, create prefix bloom
// for memtable
uint32_t memtable_prefix_bloom_bits;
// number of hash probes per key
uint32_t memtable_prefix_bloom_probes;
// Maximum number of successive merge operations on a key in the memtable.
//
// When a merge operation is added to the memtable and the maximum number of
@@ -473,9 +533,10 @@ struct DBOptions {
shared_ptr<Logger> info_log;
// Number of open files that can be used by the DB. You may need to
// increase this if your database has a large working set (budget
// one open file per 2MB of working set).
//
// increase this if your database has a large working set. Value -1 means
// files opened are always kept open. You can estimate number of files based
// on target_file_size_base and target_file_size_multiplier for level-based
// compaction. For universal-style compaction, you can usually set it to -1.
// Default: 1000
int max_open_files;

View File

@@ -38,7 +38,27 @@ struct PerfContext {
uint64_t internal_key_skipped_count;
// total number of deletes skipped over during iteration
uint64_t internal_delete_skipped_count;
uint64_t wal_write_time; // total time spent on writing to WAL
uint64_t get_snapshot_time; // total time spent on getting snapshot
uint64_t get_from_memtable_time; // total time spent on querying memtables
uint64_t get_from_memtable_count; // number of mem tables queried
// total time spent after Get() finds a key
uint64_t get_post_process_time;
uint64_t get_from_output_files_time; // total time reading from output files
// total time spent on seeking child iters
uint64_t seek_child_seek_time;
// number of seek issued in child iterators
uint64_t seek_child_seek_count;
uint64_t seek_min_heap_time; // total time spent on the merge heap
// total time spent on seeking the internal entries
uint64_t seek_internal_seek_time;
// total time spent on iterating internal entries to find the next user entry
uint64_t find_next_user_entry_time;
// total time spent on pre or post processing when writing a record
uint64_t write_pre_and_post_process_time;
uint64_t write_wal_time; // total time spent on writing to WAL
// total time spent on writing to mem tables
uint64_t write_memtable_time;
};
extern __thread PerfContext perf_context;

View File

@@ -7,7 +7,6 @@
#define STORAGE_ROCKSDB_INCLUDE_STATISTICS_H_
#include <atomic>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <string>
@@ -18,10 +17,8 @@ namespace rocksdb {
/**
* Keep adding ticker's here.
* Any ticker should have a value less than TICKER_ENUM_MAX.
* Add a new ticker by assigning it the current value of TICKER_ENUM_MAX
* Add a string representation in TickersNameMap below.
* And incrementing TICKER_ENUM_MAX.
* 1. Any ticker should be added before TICKER_ENUM_MAX.
* 2. Add a readable string in TickersNameMap below for the newly added ticker.
*/
enum Tickers {
// total block cache misses
@@ -252,7 +249,7 @@ class Statistics {
virtual void setTickerCount(Tickers tickerType, uint64_t count) = 0;
virtual void measureTime(Histograms histogramType, uint64_t time) = 0;
virtual void histogramData(Histograms type, HistogramData * const data) = 0;
virtual void histogramData(Histograms type, HistogramData* const data) = 0;
// String representation of the statistic object.
std::string ToString();
};

View File

@@ -1,127 +1,81 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// Currently we support two types of tables: plain table and block-based table.
// 1. Block-based table: this is the default table type that we inherited from
// LevelDB, which was designed for storing data in hard disk or flash
// device.
// 2. Plain table: it is one of RocksDB's SST file format optimized
// for low query latency on pure-memory or really low-latency media.
//
// A tutorial of rocksdb table formats is available here:
// https://github.com/facebook/rocksdb/wiki/A-Tutorial-of-RocksDB-SST-formats
//
// Example code is also available
// https://github.com/facebook/rocksdb/wiki/A-Tutorial-of-RocksDB-SST-formats#wiki-examples
#pragma once
#include <memory>
#include <stdint.h>
#include <string>
#include <unordered_map>
#include "rocksdb/env.h"
#include "rocksdb/iterator.h"
#include "rocksdb/table_properties.h"
#include "rocksdb/options.h"
#include "rocksdb/status.h"
namespace rocksdb {
struct Options;
// -- Block-based Table
class FlushBlockPolicyFactory;
class RandomAccessFile;
struct ReadOptions;
class TableCache;
class TableBuilder;
class TableReader;
class WritableFile;
struct EnvOptions;
struct Options;
using std::unique_ptr;
// TableBuilder provides the interface used to build a Table
// (an immutable and sorted map from keys to values).
//
// Multiple threads can invoke const methods on a TableBuilder without
// external synchronization, but if any of the threads may call a
// non-const method, all threads accessing the same TableBuilder must use
// external synchronization.
class TableBuilder {
public:
// REQUIRES: Either Finish() or Abandon() has been called.
virtual ~TableBuilder() {}
// For advanced user only
struct BlockBasedTableOptions {
// @flush_block_policy_factory creates the instances of flush block policy.
// which provides a configurable way to determine when to flush a block in
// the block based tables. If not set, table builder will use the default
// block flush policy, which cut blocks by block size (please refer to
// `FlushBlockBySizePolicy`).
std::shared_ptr<FlushBlockPolicyFactory> flush_block_policy_factory;
// Add key,value to the table being constructed.
// REQUIRES: key is after any previously added key according to comparator.
// REQUIRES: Finish(), Abandon() have not been called
virtual void Add(const Slice& key, const Slice& value) = 0;
// Return non-ok iff some error has been detected.
virtual Status status() const = 0;
// Finish building the table.
// REQUIRES: Finish(), Abandon() have not been called
virtual Status Finish() = 0;
// Indicate that the contents of this builder should be abandoned.
// If the caller is not going to call Finish(), it must call Abandon()
// before destroying this builder.
// REQUIRES: Finish(), Abandon() have not been called
virtual void Abandon() = 0;
// Number of calls to Add() so far.
virtual uint64_t NumEntries() const = 0;
// Size of the file generated so far. If invoked after a successful
// Finish() call, returns the size of the final generated file.
virtual uint64_t FileSize() const = 0;
// TODO(kailiu) Temporarily disable this feature by making the default value
// to be false.
//
// Indicating if we'd put index/filter blocks to the block cache.
// If not specified, each "table reader" object will pre-load index/filter
// block during table initialization.
bool cache_index_and_filter_blocks = false;
};
// A Table is a sorted map from strings to strings. Tables are
// immutable and persistent. A Table may be safely accessed from
// multiple threads without external synchronization.
class TableReader {
public:
virtual ~TableReader() {}
// Create default block based table factory.
extern TableFactory* NewBlockBasedTableFactory(
const BlockBasedTableOptions& table_options = BlockBasedTableOptions());
// Determine whether there is a chance that the current table file
// contains the key a key starting with iternal_prefix. The specific
// table implementation can use bloom filter and/or other heuristic
// to filter out this table as a whole.
virtual bool PrefixMayMatch(const Slice& internal_prefix) = 0;
// -- Plain Table
// @user_key_len: plain table has optimization for fix-sized keys, which can be
// specified via user_key_len. Alternatively, you can pass
// `kPlainTableVariableLength` if your keys have variable
// lengths.
// @bloom_bits_per_key: the number of bits used for bloom filer per key. You may
// disable it by passing a zero.
// @hash_table_ratio: the desired utilization of the hash table used for prefix
// hashing. hash_table_ratio = number of prefixes / #buckets
// in the hash table
const uint32_t kPlainTableVariableLength = 0;
extern TableFactory* NewPlainTableFactory(
uint32_t user_key_len = kPlainTableVariableLength,
int bloom_bits_per_key = 10, double hash_table_ratio = 0.75);
// Returns a new iterator over the table contents.
// The result of NewIterator() is initially invalid (caller must
// call one of the Seek methods on the iterator before using it).
virtual Iterator* NewIterator(const ReadOptions&) = 0;
// Given a key, return an approximate byte offset in the file where
// the data for that key begins (or would begin if the key were
// present in the file). The returned value is in terms of file
// bytes, and so includes effects like compression of the underlying data.
// E.g., the approximate offset of the last key in the table will
// be close to the file length.
virtual uint64_t ApproximateOffsetOf(const Slice& key) = 0;
// Returns true if the block for the specified key is in cache.
// REQUIRES: key is in this table.
virtual bool TEST_KeyInCache(const ReadOptions& options,
const Slice& key) = 0;
// Set up the table for Compaction. Might change some parameters with
// posix_fadvise
virtual void SetupForCompaction() = 0;
virtual TableProperties& GetTableProperties() = 0;
// Calls (*result_handler)(handle_context, ...) repeatedly, starting with
// the entry found after a call to Seek(key), until result_handler returns
// false, where k is the actual internal key for a row found and v as the
// value of the key. didIO is true if I/O is involved in the operation. May
// not make such a call if filter policy says that key is not present.
//
// mark_key_may_exist_handler needs to be called when it is configured to be
// memory only and the key is not found in the block cache, with
// the parameter to be handle_context.
//
// readOptions is the options for the read
// key is the key to search for
virtual Status Get(
const ReadOptions& readOptions,
const Slice& key,
void* handle_context,
bool (*result_handler)(void* handle_context, const Slice& k,
const Slice& v, bool didIO),
void (*mark_key_may_exist_handler)(void* handle_context) = nullptr) = 0;
};
// A base class for table factories
// A base class for table factories.
class TableFactory {
public:
virtual ~TableFactory() {}
@@ -139,7 +93,7 @@ class TableFactory {
// in parameter file. It's the caller's responsibility to make sure
// file is in the correct format.
//
// GetTableReader() is called in two places:
// NewTableReader() is called in two places:
// (1) TableCache::FindTable() calls the function when table cache miss
// and cache the table object returned.
// (1) SstFileReader (for SST Dump) opens the table and dump the table
@@ -150,9 +104,10 @@ class TableFactory {
// file is a file handler to handle the file for the table
// file_size is the physical file size of the file
// table_reader is the output table reader
virtual Status GetTableReader(
virtual Status NewTableReader(
const Options& options, const EnvOptions& soptions,
unique_ptr<RandomAccessFile> && file, uint64_t file_size,
const InternalKeyComparator& internal_comparator,
unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
unique_ptr<TableReader>* table_reader) const = 0;
// Return a table builder to write to a file for this table type.
@@ -173,8 +128,9 @@ class TableFactory {
// file is a handle of a writable file. It is the caller's responsibility to
// keep the file open and close the file after closing the table builder.
// compression_type is the compression type to use in this table.
virtual TableBuilder* GetTableBuilder(
const Options& options, WritableFile* file,
CompressionType compression_type) const = 0;
virtual TableBuilder* NewTableBuilder(
const Options& options, const InternalKeyComparator& internal_comparator,
WritableFile* file, CompressionType compression_type) const = 0;
};
} // namespace rocksdb

View File

@@ -1,28 +1,25 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#pragma once
#include <string>
#include <unordered_map>
#include "rocksdb/status.h"
namespace rocksdb {
// -- Table Properties
// Other than basic table properties, each table may also have the user
// collected properties.
// The value of the user-collected properties are encoded as raw bytes --
// users have to interprete these values by themselves.
typedef std::unordered_map<std::string, std::string> UserCollectedProperties;
// TableProperties contains a bunch of read-only properties of its associated
// table.
struct TableProperties {
public:
// Other than basic table properties, each table may also have the user
// collected properties.
// The value of the user-collected properties are encoded as raw bytes --
// users have to interprete these values by themselves.
typedef
std::unordered_map<std::string, std::string>
UserCollectedProperties;
// the total size of all data blocks.
uint64_t data_size = 0;
// the size of index block.
@@ -37,6 +34,10 @@ struct TableProperties {
uint64_t num_data_blocks = 0;
// the number of entries in this table
uint64_t num_entries = 0;
// format version, reserved for backward compatibility
uint64_t format_version = 0;
// If 0, key is variable length. Otherwise number of bytes for each key.
uint64_t fixed_key_len = 0;
// The name of the filter policy used in this table.
// If no filter policy is used, `filter_policy_name` will be an empty string.
@@ -47,17 +48,32 @@ struct TableProperties {
// convert this object to a human readable form
// @prop_delim: delimiter for each property.
std::string ToString(
const std::string& prop_delim = "; ",
const std::string& kv_delim = "=") const;
std::string ToString(const std::string& prop_delim = "; ",
const std::string& kv_delim = "=") const;
};
// table properties' human-readable names in the property block.
struct TablePropertiesNames {
static const std::string kDataSize;
static const std::string kIndexSize;
static const std::string kFilterSize;
static const std::string kRawKeySize;
static const std::string kRawValueSize;
static const std::string kNumDataBlocks;
static const std::string kNumEntries;
static const std::string kFormatVersion;
static const std::string kFixedKeyLen;
static const std::string kFilterPolicy;
};
extern const std::string kPropertiesBlock;
// `TablePropertiesCollector` provides the mechanism for users to collect
// their own interested properties. This class is essentially a collection
// of callback functions that will be invoked during table building.
class TablePropertiesCollector {
public:
virtual ~TablePropertiesCollector() { }
virtual ~TablePropertiesCollector() {}
// Add() will be called when a new key/value pair is inserted into the table.
// @params key the original key that is inserted into the table.
@@ -68,23 +84,20 @@ class TablePropertiesCollector {
// for writing the properties block.
// @params properties User will add their collected statistics to
// `properties`.
virtual Status Finish(
TableProperties::UserCollectedProperties* properties) = 0;
virtual Status Finish(UserCollectedProperties* properties) = 0;
// The name of the properties collector can be used for debugging purpose.
virtual const char* Name() const = 0;
// Return the human-readable properties, where the key is property name and
// the value is the human-readable form of value.
virtual TableProperties::UserCollectedProperties
GetReadableProperties() const = 0;
virtual UserCollectedProperties GetReadableProperties() const = 0;
};
// Extra properties
// Below is a list of non-basic properties that are collected by database
// itself. Especially some properties regarding to the internal keys (which
// is unknown to `table`).
extern uint64_t GetDeletedKeys(
const TableProperties::UserCollectedProperties& props);
extern uint64_t GetDeletedKeys(const UserCollectedProperties& props);
} // namespace rocksdb