Merge branch 'master' into columnfamilies

Conflicts:
	HISTORY.md
	db/db_impl.cc
	db/db_impl.h
	db/db_iter.cc
	db/db_test.cc
	db/dbformat.h
	db/memtable.cc
	db/memtable_list.cc
	db/memtable_list.h
	db/table_cache.cc
	db/table_cache.h
	db/version_edit.h
	db/version_set.cc
	db/version_set.h
	db/write_batch.cc
	db/write_batch_test.cc
	include/rocksdb/options.h
	util/options.cc
Igor Canadi
2014-02-06 15:42:16 -08:00
104 changed files with 6225 additions and 2358 deletions

View File

@@ -7,19 +7,19 @@
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "util/arena_impl.h"
#include "util/arena.h"
#include <algorithm>
namespace rocksdb {
const size_t ArenaImpl::kMinBlockSize = 4096;
const size_t ArenaImpl::kMaxBlockSize = 2 << 30;
const size_t Arena::kMinBlockSize = 4096;
const size_t Arena::kMaxBlockSize = 2 << 30;
static const int kAlignUnit = sizeof(void*);
size_t OptimizeBlockSize(size_t block_size) {
// Make sure block_size is in optimal range
block_size = std::max(ArenaImpl::kMinBlockSize, block_size);
block_size = std::min(ArenaImpl::kMaxBlockSize, block_size);
block_size = std::max(Arena::kMinBlockSize, block_size);
block_size = std::min(Arena::kMaxBlockSize, block_size);
// make sure block_size is the multiple of kAlignUnit
if (block_size % kAlignUnit != 0) {
@@ -29,19 +29,18 @@ size_t OptimizeBlockSize(size_t block_size) {
return block_size;
}
ArenaImpl::ArenaImpl(size_t block_size)
: kBlockSize(OptimizeBlockSize(block_size)) {
Arena::Arena(size_t block_size) : kBlockSize(OptimizeBlockSize(block_size)) {
assert(kBlockSize >= kMinBlockSize && kBlockSize <= kMaxBlockSize &&
kBlockSize % kAlignUnit == 0);
}
ArenaImpl::~ArenaImpl() {
Arena::~Arena() {
for (const auto& block : blocks_) {
delete[] block;
}
}
char* ArenaImpl::AllocateFallback(size_t bytes, bool aligned) {
char* Arena::AllocateFallback(size_t bytes, bool aligned) {
if (bytes > kBlockSize / 4) {
// Object is more than a quarter of our block size. Allocate it separately
// to avoid wasting too much space in leftover bytes.
@@ -63,7 +62,7 @@ char* ArenaImpl::AllocateFallback(size_t bytes, bool aligned) {
}
}
char* ArenaImpl::AllocateAligned(size_t bytes) {
char* Arena::AllocateAligned(size_t bytes) {
assert((kAlignUnit & (kAlignUnit - 1)) ==
0); // Pointer size should be a power of 2
size_t current_mod =
@@ -83,7 +82,7 @@ char* ArenaImpl::AllocateAligned(size_t bytes) {
return result;
}
char* ArenaImpl::AllocateNewBlock(size_t block_bytes) {
char* Arena::AllocateNewBlock(size_t block_bytes) {
char* block = new char[block_bytes];
blocks_memory_ += block_bytes;
blocks_.push_back(block);

View File

@@ -7,7 +7,7 @@
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
// ArenaImpl is an implementation of Arena class. For a request of small size,
// Arena is an implementation of Arena class. For a request of small size,
// it allocates a block with pre-defined block size. For a request of big
// size, it uses malloc to directly get the requested size.
@@ -16,37 +16,35 @@
#include <vector>
#include <assert.h>
#include <stdint.h>
#include "rocksdb/arena.h"
#include "util/arena.h"
namespace rocksdb {
class ArenaImpl : public Arena {
class Arena {
public:
// No copying allowed
ArenaImpl(const ArenaImpl&) = delete;
void operator=(const ArenaImpl&) = delete;
Arena(const Arena&) = delete;
void operator=(const Arena&) = delete;
static const size_t kMinBlockSize;
static const size_t kMaxBlockSize;
explicit ArenaImpl(size_t block_size = kMinBlockSize);
virtual ~ArenaImpl();
explicit Arena(size_t block_size = kMinBlockSize);
~Arena();
virtual char* Allocate(size_t bytes) override;
char* Allocate(size_t bytes);
virtual char* AllocateAligned(size_t bytes) override;
char* AllocateAligned(size_t bytes);
// Returns an estimate of the total memory usage of data allocated
// by the arena (exclude the space allocated but not yet used for future
// allocations).
virtual const size_t ApproximateMemoryUsage() {
const size_t ApproximateMemoryUsage() {
return blocks_memory_ + blocks_.capacity() * sizeof(char*) -
alloc_bytes_remaining_;
}
virtual const size_t MemoryAllocatedBytes() override {
return blocks_memory_;
}
const size_t MemoryAllocatedBytes() { return blocks_memory_; }
private:
// Number of bytes allocated in one block
@@ -72,7 +70,7 @@ class ArenaImpl : public Arena {
size_t blocks_memory_ = 0;
};
inline char* ArenaImpl::Allocate(size_t bytes) {
inline char* Arena::Allocate(size_t bytes) {
// The semantics of what to return are a bit messy if we allow
// 0-byte allocations, so we disallow them here (we don't need
// them for our internal use).
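
For orientation, here is a minimal usage sketch of the renamed Arena API (the functions come from the header above; the sizes are illustrative, not taken from this diff):

#include <cassert>
#include "util/arena.h"

// Sketch only: exercise Arena's accounting the same way arena_test.cc does.
void ArenaUsageSketch() {
  rocksdb::Arena arena(8192);                  // normalized by OptimizeBlockSize()
  char* small = arena.Allocate(100);           // carved out of the current block
  char* large = arena.AllocateAligned(10000);  // larger than a block, allocated on its own
  assert(small != nullptr && large != nullptr);
  assert(arena.MemoryAllocatedBytes() >= 8192 + 10000);
  // ApproximateMemoryUsage() excludes the unused tail of the current block.
  assert(arena.ApproximateMemoryUsage() <= arena.MemoryAllocatedBytes());
}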

View File

@@ -7,34 +7,32 @@
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "util/arena_impl.h"
#include "util/arena.h"
#include "util/random.h"
#include "util/testharness.h"
namespace rocksdb {
class ArenaImplTest { };
class ArenaTest {};
TEST(ArenaImplTest, Empty) {
ArenaImpl arena0;
}
TEST(ArenaTest, Empty) { Arena arena0; }
TEST(ArenaImplTest, MemoryAllocatedBytes) {
TEST(ArenaTest, MemoryAllocatedBytes) {
const int N = 17;
size_t req_sz; //requested size
size_t req_sz; // requested size
size_t bsz = 8192; // block size
size_t expected_memory_allocated;
ArenaImpl arena_impl(bsz);
Arena arena(bsz);
// requested size > quarter of a block:
// allocate requested size separately
req_sz = 3001;
for (int i = 0; i < N; i++) {
arena_impl.Allocate(req_sz);
arena.Allocate(req_sz);
}
expected_memory_allocated = req_sz * N;
ASSERT_EQ(arena_impl.MemoryAllocatedBytes(), expected_memory_allocated);
ASSERT_EQ(arena.MemoryAllocatedBytes(), expected_memory_allocated);
// requested size < quarter of a block:
// allocate a block with the default size, then try to use unused part
@@ -42,28 +40,28 @@ TEST(ArenaImplTest, MemoryAllocatedBytes) {
// Allocate(99) call. All the remaining calls won't lead to new allocation.
req_sz = 99;
for (int i = 0; i < N; i++) {
arena_impl.Allocate(req_sz);
arena.Allocate(req_sz);
}
expected_memory_allocated += bsz;
ASSERT_EQ(arena_impl.MemoryAllocatedBytes(), expected_memory_allocated);
ASSERT_EQ(arena.MemoryAllocatedBytes(), expected_memory_allocated);
// requested size > quarter of a block:
// allocate requested size separately
req_sz = 99999999;
for (int i = 0; i < N; i++) {
arena_impl.Allocate(req_sz);
arena.Allocate(req_sz);
}
expected_memory_allocated += req_sz * N;
ASSERT_EQ(arena_impl.MemoryAllocatedBytes(), expected_memory_allocated);
ASSERT_EQ(arena.MemoryAllocatedBytes(), expected_memory_allocated);
}
// Make sure we didn't count the allocate but not used memory space in
// Arena::ApproximateMemoryUsage()
TEST(ArenaImplTest, ApproximateMemoryUsageTest) {
TEST(ArenaTest, ApproximateMemoryUsageTest) {
const size_t kBlockSize = 4096;
const size_t kEntrySize = kBlockSize / 8;
const size_t kZero = 0;
ArenaImpl arena(kBlockSize);
const size_t kZero = 0;
Arena arena(kBlockSize);
ASSERT_EQ(kZero, arena.ApproximateMemoryUsage());
auto num_blocks = kBlockSize / kEntrySize;
@@ -83,9 +81,9 @@ TEST(ArenaImplTest, ApproximateMemoryUsageTest) {
ASSERT_GT(usage, mem_usage);
}
TEST(ArenaImplTest, Simple) {
TEST(ArenaTest, Simple) {
std::vector<std::pair<size_t, char*>> allocated;
ArenaImpl arena_impl;
Arena arena;
const int N = 100000;
size_t bytes = 0;
Random rnd(301);
@@ -104,9 +102,9 @@ TEST(ArenaImplTest, Simple) {
}
char* r;
if (rnd.OneIn(10)) {
r = arena_impl.AllocateAligned(s);
r = arena.AllocateAligned(s);
} else {
r = arena_impl.Allocate(s);
r = arena.Allocate(s);
}
for (unsigned int b = 0; b < s; b++) {
@@ -115,9 +113,9 @@ TEST(ArenaImplTest, Simple) {
}
bytes += s;
allocated.push_back(std::make_pair(s, r));
ASSERT_GE(arena_impl.ApproximateMemoryUsage(), bytes);
ASSERT_GE(arena.ApproximateMemoryUsage(), bytes);
if (i > N / 10) {
ASSERT_LE(arena_impl.ApproximateMemoryUsage(), bytes * 1.10);
ASSERT_LE(arena.ApproximateMemoryUsage(), bytes * 1.10);
}
}
for (unsigned int i = 0; i < allocated.size(); i++) {
@@ -132,6 +130,4 @@ TEST(ArenaImplTest, Simple) {
} // namespace rocksdb
int main(int argc, char** argv) {
return rocksdb::test::RunAllTests();
}
int main(int argc, char** argv) { return rocksdb::test::RunAllTests(); }

View File

@@ -57,11 +57,9 @@ class autovector {
typedef std::random_access_iterator_tag iterator_category;
iterator_impl(TAutoVector* vect, size_t index)
: vect_(vect)
, index_(index) {
};
: vect_(vect), index_(index) {};
iterator_impl(const iterator_impl&) = default;
~iterator_impl() { }
~iterator_impl() {}
iterator_impl& operator=(const iterator_impl&) = default;
// -- Advancement
@@ -130,9 +128,7 @@ class autovector {
return index_ == other.index_;
}
bool operator!=(const self_type& other) const {
return !(*this == other);
}
bool operator!=(const self_type& other) const { return !(*this == other); }
bool operator>(const self_type& other) const {
assert(vect_ == other.vect_);
@@ -174,13 +170,9 @@ class autovector {
return vect_.capacity() == 0;
}
size_type size() const {
return num_stack_items_ + vect_.size();
}
size_type size() const { return num_stack_items_ + vect_.size(); }
bool empty() const {
return size() == 0;
}
bool empty() const { return size() == 0; }
// will not check boundary
const_reference operator[](size_type n) const {
@@ -235,11 +227,9 @@ class autovector {
}
}
void push_back(const T& item) {
push_back(value_type(item));
}
void push_back(const T& item) { push_back(value_type(item)); }
template<class... Args>
template <class... Args>
void emplace_back(Args&&... args) {
push_back(value_type(args...));
}
@@ -261,13 +251,9 @@ class autovector {
// -- Copy and Assignment
autovector& assign(const autovector& other);
autovector(const autovector& other) {
assign(other);
}
autovector(const autovector& other) { assign(other); }
autovector& operator=(const autovector& other) {
return assign(other);
}
autovector& operator=(const autovector& other) { return assign(other); }
// move operation are disallowed since it is very hard to make sure both
// autovectors are allocated from the same function stack.
@@ -275,41 +261,29 @@ class autovector {
autovector(autovector&& other) = delete;
// -- Iterator Operations
iterator begin() {
return iterator(this, 0);
}
iterator begin() { return iterator(this, 0); }
const_iterator begin() const {
return const_iterator(this, 0);
}
const_iterator begin() const { return const_iterator(this, 0); }
iterator end() {
return iterator(this, this->size());
}
iterator end() { return iterator(this, this->size()); }
const_iterator end() const {
return const_iterator(this, this->size());
}
const_iterator end() const { return const_iterator(this, this->size()); }
reverse_iterator rbegin() {
return reverse_iterator(end());
}
reverse_iterator rbegin() { return reverse_iterator(end()); }
const_reverse_iterator rbegin() const {
return const_reverse_iterator(end());
}
reverse_iterator rend() {
return reverse_iterator(begin());
}
reverse_iterator rend() { return reverse_iterator(begin()); }
const_reverse_iterator rend() const {
return const_reverse_iterator(begin());
}
private:
size_type num_stack_items_ = 0; // current number of items
value_type values_[kSize]; // the first `kSize` items
size_type num_stack_items_ = 0; // current number of items
value_type values_[kSize]; // the first `kSize` items
// used only if there are more than `kSize` items.
std::vector<T> vect_;
};

View File

@@ -7,12 +7,16 @@
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include <gflags/gflags.h>
#include "rocksdb/filter_policy.h"
#include "util/logging.h"
#include "util/testharness.h"
#include "util/testutil.h"
DEFINE_int32(bits_per_key, 10, "");
namespace rocksdb {
static const int kVerbose = 1;
@@ -29,7 +33,7 @@ class BloomTest {
std::vector<std::string> keys_;
public:
BloomTest() : policy_(NewBloomFilterPolicy(10)) { }
BloomTest() : policy_(NewBloomFilterPolicy(FLAGS_bits_per_key)) { }
~BloomTest() {
delete policy_;
@@ -160,5 +164,7 @@ TEST(BloomTest, VaryingLengths) {
} // namespace rocksdb
int main(int argc, char** argv) {
google::ParseCommandLineFlags(&argc, &argv, true);
return rocksdb::test::RunAllTests();
}

View File

@@ -10,10 +10,10 @@
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <vector>
#include "rocksdb/cache.h"
#include "port/port.h"
#include "util/autovector.h"
#include "util/hash.h"
#include "util/mutexlock.h"
@@ -156,6 +156,13 @@ class LRUCache {
Cache::Handle* Lookup(const Slice& key, uint32_t hash);
void Release(Cache::Handle* handle);
void Erase(const Slice& key, uint32_t hash);
// Although on some platforms updates to size_t are atomic, to make sure
// GetUsage() works correctly on every platform, we protect this
// function with a mutex.
size_t GetUsage() const {
MutexLock l(&mutex_);
return usage_;
}
private:
void LRU_Remove(LRUHandle* e);
@@ -171,7 +178,9 @@ class LRUCache {
uint32_t remove_scan_count_limit_;
// mutex_ protects the following state.
port::Mutex mutex_;
// We don't count mutex_ as part of the cache's internal state, so
// semantically we don't mind const functions locking (and thus mutating) it.
mutable port::Mutex mutex_;
size_t usage_;
// Dummy head of LRU list.
@@ -255,8 +264,7 @@ Cache::Handle* LRUCache::Insert(
LRUHandle* e = reinterpret_cast<LRUHandle*>(
malloc(sizeof(LRUHandle)-1 + key.size()));
std::vector<LRUHandle*> last_reference_list;
last_reference_list.reserve(1);
autovector<LRUHandle*> last_reference_list;
e->value = value;
e->deleter = deleter;
@@ -342,10 +350,10 @@ static int kRemoveScanCountLimit = 0; // default values, can be overridden
class ShardedLRUCache : public Cache {
private:
LRUCache* shard_;
LRUCache* shards_;
port::Mutex id_mutex_;
uint64_t last_id_;
int numShardBits;
int num_shard_bits_;
size_t capacity_;
static inline uint32_t HashSlice(const Slice& s) {
@@ -354,18 +362,18 @@ class ShardedLRUCache : public Cache {
uint32_t Shard(uint32_t hash) {
// Note, hash >> 32 yields hash in gcc, not the zero we expect!
return (numShardBits > 0) ? (hash >> (32 - numShardBits)) : 0;
return (num_shard_bits_ > 0) ? (hash >> (32 - num_shard_bits_)) : 0;
}
void init(size_t capacity, int numbits, int removeScanCountLimit) {
numShardBits = numbits;
num_shard_bits_ = numbits;
capacity_ = capacity;
int numShards = 1 << numShardBits;
shard_ = new LRUCache[numShards];
const size_t per_shard = (capacity + (numShards - 1)) / numShards;
for (int s = 0; s < numShards; s++) {
shard_[s].SetCapacity(per_shard);
shard_[s].SetRemoveScanCountLimit(removeScanCountLimit);
int num_shards = 1 << num_shard_bits_;
shards_ = new LRUCache[num_shards];
const size_t per_shard = (capacity + (num_shards - 1)) / num_shards;
for (int s = 0; s < num_shards; s++) {
shards_[s].SetCapacity(per_shard);
shards_[s].SetRemoveScanCountLimit(removeScanCountLimit);
}
}
@@ -374,30 +382,30 @@ class ShardedLRUCache : public Cache {
: last_id_(0) {
init(capacity, kNumShardBits, kRemoveScanCountLimit);
}
ShardedLRUCache(size_t capacity, int numShardBits,
ShardedLRUCache(size_t capacity, int num_shard_bits,
int removeScanCountLimit)
: last_id_(0) {
init(capacity, numShardBits, removeScanCountLimit);
init(capacity, num_shard_bits, removeScanCountLimit);
}
virtual ~ShardedLRUCache() {
delete[] shard_;
delete[] shards_;
}
virtual Handle* Insert(const Slice& key, void* value, size_t charge,
void (*deleter)(const Slice& key, void* value)) {
const uint32_t hash = HashSlice(key);
return shard_[Shard(hash)].Insert(key, hash, value, charge, deleter);
return shards_[Shard(hash)].Insert(key, hash, value, charge, deleter);
}
virtual Handle* Lookup(const Slice& key) {
const uint32_t hash = HashSlice(key);
return shard_[Shard(hash)].Lookup(key, hash);
return shards_[Shard(hash)].Lookup(key, hash);
}
virtual void Release(Handle* handle) {
LRUHandle* h = reinterpret_cast<LRUHandle*>(handle);
shard_[Shard(h->hash)].Release(handle);
shards_[Shard(h->hash)].Release(handle);
}
virtual void Erase(const Slice& key) {
const uint32_t hash = HashSlice(key);
shard_[Shard(hash)].Erase(key, hash);
shards_[Shard(hash)].Erase(key, hash);
}
virtual void* Value(Handle* handle) {
return reinterpret_cast<LRUHandle*>(handle)->value;
@@ -406,11 +414,23 @@ class ShardedLRUCache : public Cache {
MutexLock l(&id_mutex_);
return ++(last_id_);
}
virtual size_t GetCapacity() {
virtual size_t GetCapacity() const {
return capacity_;
}
virtual size_t GetUsage() const {
// We will not lock the cache when getting the usage from shards.
// for (size_t i = 0; i < num_shard_bits_; ++i)
int num_shards = 1 << num_shard_bits_;
size_t usage = 0;
for (int s = 0; s < num_shards; s++) {
usage += shards_[s].GetUsage();
}
return usage;
}
virtual void DisownData() {
shard_ = nullptr;
shards_ = nullptr;
}
};
@@ -420,17 +440,17 @@ shared_ptr<Cache> NewLRUCache(size_t capacity) {
return NewLRUCache(capacity, kNumShardBits);
}
shared_ptr<Cache> NewLRUCache(size_t capacity, int numShardBits) {
return NewLRUCache(capacity, numShardBits, kRemoveScanCountLimit);
shared_ptr<Cache> NewLRUCache(size_t capacity, int num_shard_bits) {
return NewLRUCache(capacity, num_shard_bits, kRemoveScanCountLimit);
}
shared_ptr<Cache> NewLRUCache(size_t capacity, int numShardBits,
shared_ptr<Cache> NewLRUCache(size_t capacity, int num_shard_bits,
int removeScanCountLimit) {
if (numShardBits >= 20) {
if (num_shard_bits >= 20) {
return nullptr; // the cache cannot be sharded into too many fine pieces
}
return std::make_shared<ShardedLRUCache>(capacity,
numShardBits,
num_shard_bits,
removeScanCountLimit);
}
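
As a usage sketch of the renamed sharding pieces (num_shard_bits, shards_) and the new GetUsage(), assuming GetUsage() is exposed on the Cache interface as the test below implies:

#include <memory>
#include "rocksdb/cache.h"

// Sketch only: with num_shard_bits == 4 there are 1 << 4 == 16 LRUCache shards,
// each sized roughly capacity / 16, and a key is routed to shards_[hash >> (32 - 4)].
void ShardedCacheSketch() {
  std::shared_ptr<rocksdb::Cache> cache =
      rocksdb::NewLRUCache(/*capacity=*/1 << 20, /*num_shard_bits=*/4);
  // GetUsage() sums the per-shard usage_ counters, each read under its shard's mutex.
  size_t total_usage = cache->GetUsage();
  (void)total_usage;
}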

View File

@@ -107,6 +107,39 @@ class CacheTest {
};
CacheTest* CacheTest::current_;
void dumbDeleter(const Slice& key, void* value) { }
TEST(CacheTest, UsageTest) {
// cache is shared_ptr and will be automatically cleaned up.
const uint64_t kCapacity = 100000;
auto cache = NewLRUCache(kCapacity, 8, 200);
size_t usage = 0;
const char* value = "abcdef";
// make sure everything will be cached
for (int i = 1; i < 100; ++i) {
std::string key(i, 'a');
auto kv_size = key.size() + 5;
cache->Release(
cache->Insert(key, (void*)value, kv_size, dumbDeleter)
);
usage += kv_size;
ASSERT_EQ(usage, cache->GetUsage());
}
// make sure the cache will be overloaded
for (uint64_t i = 1; i < kCapacity; ++i) {
auto key = std::to_string(i);
cache->Release(
cache->Insert(key, (void*)value, key.size() + 5, dumbDeleter)
);
}
// the usage should be close to the capacity
ASSERT_GT(kCapacity, cache->GetUsage());
ASSERT_LT(kCapacity * 0.95, cache->GetUsage());
}
TEST(CacheTest, HitAndMiss) {
ASSERT_EQ(-1, Lookup(100));
@@ -353,7 +386,6 @@ void deleter(const Slice& key, void* value) {
delete (Value *)value;
}
TEST(CacheTest, BadEviction) {
int n = 10;

View File

@@ -9,131 +9,41 @@
#include "util/coding.h"
#include <algorithm>
#include "rocksdb/slice.h"
#include "rocksdb/slice_transform.h"
#include <algorithm>
namespace rocksdb {
void EncodeFixed32(char* buf, uint32_t value) {
#if __BYTE_ORDER == __LITTLE_ENDIAN
memcpy(buf, &value, sizeof(value));
#else
buf[0] = value & 0xff;
buf[1] = (value >> 8) & 0xff;
buf[2] = (value >> 16) & 0xff;
buf[3] = (value >> 24) & 0xff;
#endif
}
void EncodeFixed64(char* buf, uint64_t value) {
#if __BYTE_ORDER == __LITTLE_ENDIAN
memcpy(buf, &value, sizeof(value));
#else
buf[0] = value & 0xff;
buf[1] = (value >> 8) & 0xff;
buf[2] = (value >> 16) & 0xff;
buf[3] = (value >> 24) & 0xff;
buf[4] = (value >> 32) & 0xff;
buf[5] = (value >> 40) & 0xff;
buf[6] = (value >> 48) & 0xff;
buf[7] = (value >> 56) & 0xff;
#endif
}
void PutFixed32(std::string* dst, uint32_t value) {
char buf[sizeof(value)];
EncodeFixed32(buf, value);
dst->append(buf, sizeof(buf));
}
void PutFixed64(std::string* dst, uint64_t value) {
char buf[sizeof(value)];
EncodeFixed64(buf, value);
dst->append(buf, sizeof(buf));
}
char* EncodeVarint32(char* dst, uint32_t v) {
// Operate on characters as unsigneds
unsigned char* ptr = reinterpret_cast<unsigned char*>(dst);
static const int B = 128;
if (v < (1<<7)) {
if (v < (1 << 7)) {
*(ptr++) = v;
} else if (v < (1<<14)) {
} else if (v < (1 << 14)) {
*(ptr++) = v | B;
*(ptr++) = v>>7;
} else if (v < (1<<21)) {
*(ptr++) = v >> 7;
} else if (v < (1 << 21)) {
*(ptr++) = v | B;
*(ptr++) = (v>>7) | B;
*(ptr++) = v>>14;
} else if (v < (1<<28)) {
*(ptr++) = (v >> 7) | B;
*(ptr++) = v >> 14;
} else if (v < (1 << 28)) {
*(ptr++) = v | B;
*(ptr++) = (v>>7) | B;
*(ptr++) = (v>>14) | B;
*(ptr++) = v>>21;
*(ptr++) = (v >> 7) | B;
*(ptr++) = (v >> 14) | B;
*(ptr++) = v >> 21;
} else {
*(ptr++) = v | B;
*(ptr++) = (v>>7) | B;
*(ptr++) = (v>>14) | B;
*(ptr++) = (v>>21) | B;
*(ptr++) = v>>28;
*(ptr++) = (v >> 7) | B;
*(ptr++) = (v >> 14) | B;
*(ptr++) = (v >> 21) | B;
*(ptr++) = v >> 28;
}
return reinterpret_cast<char*>(ptr);
}
void PutVarint32(std::string* dst, uint32_t v) {
char buf[5];
char* ptr = EncodeVarint32(buf, v);
dst->append(buf, ptr - buf);
}
char* EncodeVarint64(char* dst, uint64_t v) {
static const unsigned int B = 128;
unsigned char* ptr = reinterpret_cast<unsigned char*>(dst);
while (v >= B) {
*(ptr++) = (v & (B-1)) | B;
v >>= 7;
}
*(ptr++) = static_cast<unsigned char>(v);
return reinterpret_cast<char*>(ptr);
}
void PutVarint64(std::string* dst, uint64_t v) {
char buf[10];
char* ptr = EncodeVarint64(buf, v);
dst->append(buf, ptr - buf);
}
void PutLengthPrefixedSlice(std::string* dst, const Slice& value) {
PutVarint32(dst, value.size());
dst->append(value.data(), value.size());
}
void PutLengthPrefixedSliceParts(std::string* dst,
const SliceParts& slice_parts) {
uint32_t total_bytes = 0;
for (int i = 0; i < slice_parts.num_parts; ++i) {
total_bytes += slice_parts.parts[i].size();
}
PutVarint32(dst, total_bytes);
for (int i = 0; i < slice_parts.num_parts; ++i) {
dst->append(slice_parts.parts[i].data(), slice_parts.parts[i].size());
}
}
int VarintLength(uint64_t v) {
int len = 1;
while (v >= 128) {
v >>= 7;
len++;
}
return len;
}
const char* GetVarint32PtrFallback(const char* p,
const char* limit,
const char* GetVarint32PtrFallback(const char* p, const char* limit,
uint32_t* value) {
uint32_t result = 0;
for (uint32_t shift = 0; shift <= 28 && p < limit; shift += 7) {
@@ -151,18 +61,6 @@ const char* GetVarint32PtrFallback(const char* p,
return nullptr;
}
bool GetVarint32(Slice* input, uint32_t* value) {
const char* p = input->data();
const char* limit = p + input->size();
const char* q = GetVarint32Ptr(p, limit, value);
if (q == nullptr) {
return false;
} else {
*input = Slice(q, limit - q);
return true;
}
}
const char* GetVarint64Ptr(const char* p, const char* limit, uint64_t* value) {
uint64_t result = 0;
for (uint32_t shift = 0; shift <= 63 && p < limit; shift += 7) {
@@ -180,58 +78,6 @@ const char* GetVarint64Ptr(const char* p, const char* limit, uint64_t* value) {
return nullptr;
}
bool GetVarint64(Slice* input, uint64_t* value) {
const char* p = input->data();
const char* limit = p + input->size();
const char* q = GetVarint64Ptr(p, limit, value);
if (q == nullptr) {
return false;
} else {
*input = Slice(q, limit - q);
return true;
}
}
const char* GetLengthPrefixedSlice(const char* p, const char* limit,
Slice* result) {
uint32_t len;
p = GetVarint32Ptr(p, limit, &len);
if (p == nullptr) return nullptr;
if (p + len > limit) return nullptr;
*result = Slice(p, len);
return p + len;
}
bool GetLengthPrefixedSlice(Slice* input, Slice* result) {
uint32_t len;
if (GetVarint32(input, &len) &&
input->size() >= len) {
*result = Slice(input->data(), len);
input->remove_prefix(len);
return true;
} else {
return false;
}
}
Slice GetLengthPrefixedSlice(const char* data) {
uint32_t len;
const char* p = data;
p = GetVarint32Ptr(p, p + 5, &len); // +5: we assume "p" is not corrupted
return Slice(p, len);
}
Slice GetSliceUntil(Slice* slice, char delimiter) {
uint32_t len;
for (len = 0; len < slice->size() && slice->data()[len] != delimiter; ++len) {
// nothing
}
Slice ret(slice->data(), len);
slice->remove_prefix(len + ((len < slice->size()) ? 1 : 0));
return ret;
}
void BitStreamPutInt(char* dst, size_t dstlen, size_t offset,
uint32_t bits, uint64_t value) {
assert((offset + bits + 7)/8 <= dstlen);
@@ -320,14 +166,4 @@ void BitStreamPutInt(std::string* dst, size_t offset, uint32_t bits,
BitStreamGetInt(dst, offset, bits));
}
uint64_t BitStreamGetInt(const std::string* src, size_t offset,
uint32_t bits) {
return BitStreamGetInt(src->data(), src->size(), offset, bits);
}
uint64_t BitStreamGetInt(const Slice* src, size_t offset,
uint32_t bits) {
return BitStreamGetInt(src->data(), src->size(), offset, bits);
}
} // namespace rocksdb

View File

@@ -13,6 +13,7 @@
// * Strings are encoded prefixed by their length in varint format
#pragma once
#include <algorithm>
#include <stdint.h>
#include <string.h>
#include <string>
@@ -40,6 +41,7 @@ extern void PutLengthPrefixedSliceParts(std::string* dst,
extern bool GetVarint32(Slice* input, uint32_t* value);
extern bool GetVarint64(Slice* input, uint64_t* value);
extern bool GetLengthPrefixedSlice(Slice* input, Slice* result);
// This function assumes data is well-formed.
extern Slice GetLengthPrefixedSlice(const char* data);
extern Slice GetSliceUntil(Slice* slice, char delimiter);
@@ -138,4 +140,155 @@ extern uint64_t BitStreamGetInt(const std::string* src, size_t offset,
extern uint64_t BitStreamGetInt(const Slice* src, size_t offset,
uint32_t bits);
// -- Implementation of the functions declared above
inline void EncodeFixed32(char* buf, uint32_t value) {
#if __BYTE_ORDER == __LITTLE_ENDIAN
memcpy(buf, &value, sizeof(value));
#else
buf[0] = value & 0xff;
buf[1] = (value >> 8) & 0xff;
buf[2] = (value >> 16) & 0xff;
buf[3] = (value >> 24) & 0xff;
#endif
}
inline void EncodeFixed64(char* buf, uint64_t value) {
#if __BYTE_ORDER == __LITTLE_ENDIAN
memcpy(buf, &value, sizeof(value));
#else
buf[0] = value & 0xff;
buf[1] = (value >> 8) & 0xff;
buf[2] = (value >> 16) & 0xff;
buf[3] = (value >> 24) & 0xff;
buf[4] = (value >> 32) & 0xff;
buf[5] = (value >> 40) & 0xff;
buf[6] = (value >> 48) & 0xff;
buf[7] = (value >> 56) & 0xff;
#endif
}
inline void PutFixed32(std::string* dst, uint32_t value) {
char buf[sizeof(value)];
EncodeFixed32(buf, value);
dst->append(buf, sizeof(buf));
}
inline void PutFixed64(std::string* dst, uint64_t value) {
char buf[sizeof(value)];
EncodeFixed64(buf, value);
dst->append(buf, sizeof(buf));
}
inline void PutVarint32(std::string* dst, uint32_t v) {
char buf[5];
char* ptr = EncodeVarint32(buf, v);
dst->append(buf, ptr - buf);
}
inline char* EncodeVarint64(char* dst, uint64_t v) {
static const unsigned int B = 128;
unsigned char* ptr = reinterpret_cast<unsigned char*>(dst);
while (v >= B) {
*(ptr++) = (v & (B - 1)) | B;
v >>= 7;
}
*(ptr++) = static_cast<unsigned char>(v);
return reinterpret_cast<char*>(ptr);
}
inline void PutVarint64(std::string* dst, uint64_t v) {
char buf[10];
char* ptr = EncodeVarint64(buf, v);
dst->append(buf, ptr - buf);
}
inline void PutLengthPrefixedSlice(std::string* dst, const Slice& value) {
PutVarint32(dst, value.size());
dst->append(value.data(), value.size());
}
inline void PutLengthPrefixedSliceParts(std::string* dst,
const SliceParts& slice_parts) {
uint32_t total_bytes = 0;
for (int i = 0; i < slice_parts.num_parts; ++i) {
total_bytes += slice_parts.parts[i].size();
}
PutVarint32(dst, total_bytes);
for (int i = 0; i < slice_parts.num_parts; ++i) {
dst->append(slice_parts.parts[i].data(), slice_parts.parts[i].size());
}
}
inline int VarintLength(uint64_t v) {
int len = 1;
while (v >= 128) {
v >>= 7;
len++;
}
return len;
}
inline bool GetVarint32(Slice* input, uint32_t* value) {
const char* p = input->data();
const char* limit = p + input->size();
const char* q = GetVarint32Ptr(p, limit, value);
if (q == nullptr) {
return false;
} else {
*input = Slice(q, limit - q);
return true;
}
}
inline bool GetVarint64(Slice* input, uint64_t* value) {
const char* p = input->data();
const char* limit = p + input->size();
const char* q = GetVarint64Ptr(p, limit, value);
if (q == nullptr) {
return false;
} else {
*input = Slice(q, limit - q);
return true;
}
}
inline bool GetLengthPrefixedSlice(Slice* input, Slice* result) {
uint32_t len = 0;
if (GetVarint32(input, &len) && input->size() >= len) {
*result = Slice(input->data(), len);
input->remove_prefix(len);
return true;
} else {
return false;
}
}
inline Slice GetLengthPrefixedSlice(const char* data) {
uint32_t len = 0;
// +5: we assume "data" is not corrupted
auto p = GetVarint32Ptr(data, data + 5 /* limit */, &len);
return Slice(p, len);
}
inline Slice GetSliceUntil(Slice* slice, char delimiter) {
uint32_t len = 0;
for (len = 0; len < slice->size() && slice->data()[len] != delimiter; ++len) {
// nothing
}
Slice ret(slice->data(), len);
slice->remove_prefix(len + ((len < slice->size()) ? 1 : 0));
return ret;
}
inline uint64_t BitStreamGetInt(const std::string* src, size_t offset,
uint32_t bits) {
return BitStreamGetInt(src->data(), src->size(), offset, bits);
}
inline uint64_t BitStreamGetInt(const Slice* src, size_t offset,
uint32_t bits) {
return BitStreamGetInt(src->data(), src->size(), offset, bits);
}
} // namespace rocksdb
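
A small round-trip sketch of the varint helpers that were moved inline above (the value is illustrative):

#include <cassert>
#include <string>
#include "rocksdb/slice.h"
#include "util/coding.h"

// Sketch only: each varint byte carries 7 payload bits plus a continuation bit,
// so 300 (>= 1 << 7 and < 1 << 14) encodes to exactly two bytes.
void VarintRoundTripSketch() {
  std::string buf;
  rocksdb::PutVarint32(&buf, 300);
  assert(buf.size() == 2);
  rocksdb::Slice input(buf);
  uint32_t value = 0;
  assert(rocksdb::GetVarint32(&input, &value));
  assert(value == 300 && input.empty());
}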

View File

@@ -41,7 +41,7 @@ TEST(Coding, Fixed64) {
const char* p = s.data();
for (int power = 0; power <= 63; power++) {
uint64_t v = static_cast<uint64_t>(1) << power;
uint64_t actual;
uint64_t actual = 0;
actual = DecodeFixed64(p);
ASSERT_EQ(v-1, actual);
p += sizeof(uint64_t);
@@ -90,7 +90,7 @@ TEST(Coding, Varint32) {
const char* limit = p + s.size();
for (uint32_t i = 0; i < (32 * 32); i++) {
uint32_t expected = (i / 32) << (i % 32);
uint32_t actual;
uint32_t actual = 0;
const char* start = p;
p = GetVarint32Ptr(p, limit, &actual);
ASSERT_TRUE(p != nullptr);
@@ -125,7 +125,7 @@ TEST(Coding, Varint64) {
const char* limit = p + s.size();
for (unsigned int i = 0; i < values.size(); i++) {
ASSERT_TRUE(p < limit);
uint64_t actual;
uint64_t actual = 0;
const char* start = p;
p = GetVarint64Ptr(p, limit, &actual);
ASSERT_TRUE(p != nullptr);

36
util/dynamic_bloom.cc Normal file
View File

@@ -0,0 +1,36 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#include "dynamic_bloom.h"
#include "rocksdb/slice.h"
#include "util/hash.h"
namespace rocksdb {
namespace {
static uint32_t BloomHash(const Slice& key) {
return Hash(key.data(), key.size(), 0xbc9f1d34);
}
}
DynamicBloom::DynamicBloom(uint32_t total_bits,
uint32_t (*hash_func)(const Slice& key),
uint32_t num_probes)
: hash_func_(hash_func),
kTotalBits((total_bits + 7) / 8 * 8),
kNumProbes(num_probes) {
assert(hash_func_);
assert(kNumProbes > 0);
assert(kTotalBits > 0);
data_.reset(new unsigned char[kTotalBits / 8]());
}
DynamicBloom::DynamicBloom(uint32_t total_bits,
uint32_t num_probes)
: DynamicBloom(total_bits, &BloomHash, num_probes) {
}
} // rocksdb

72
util/dynamic_bloom.h Normal file
View File

@@ -0,0 +1,72 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#pragma once
#include <atomic>
#include <memory>
namespace rocksdb {
class Slice;
class DynamicBloom {
public:
// total_bits: fixed total bits for the bloom
// hash_func: customized hash function
// num_probes: number of hash probes for a single key
DynamicBloom(uint32_t total_bits,
uint32_t (*hash_func)(const Slice& key),
uint32_t num_probes = 6);
explicit DynamicBloom(uint32_t total_bits, uint32_t num_probes = 6);
// Assuming single threaded access to this function.
void Add(const Slice& key);
// Assuming single threaded access to this function.
void AddHash(uint32_t hash);
// Multithreaded access to this function is OK
bool MayContain(const Slice& key);
// Multithreaded access to this function is OK
bool MayContainHash(uint32_t hash);
private:
uint32_t (*hash_func_)(const Slice& key);
const uint32_t kTotalBits;
const uint32_t kNumProbes;
std::unique_ptr<unsigned char[]> data_;
};
inline void DynamicBloom::Add(const Slice& key) { AddHash(hash_func_(key)); }
inline bool DynamicBloom::MayContain(const Slice& key) {
return (MayContainHash(hash_func_(key)));
}
inline bool DynamicBloom::MayContainHash(uint32_t h) {
const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits
for (uint32_t i = 0; i < kNumProbes; i++) {
const uint32_t bitpos = h % kTotalBits;
if (((data_[bitpos / 8]) & (1 << (bitpos % 8))) == 0) {
return false;
}
h += delta;
}
return true;
}
inline void DynamicBloom::AddHash(uint32_t h) {
const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits
for (uint32_t i = 0; i < kNumProbes; i++) {
const uint32_t bitpos = h % kTotalBits;
data_[bitpos / 8] |= (1 << (bitpos % 8));
h += delta;
}
}
} // rocksdb
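
A short usage sketch of the new DynamicBloom (the sizes are illustrative; the probing is the double hashing shown in AddHash/MayContainHash above):

#include <cassert>
#include "dynamic_bloom.h"
#include "rocksdb/slice.h"

// Sketch only: no false negatives; a key is reported present only if all
// kNumProbes probed bits are set.
void DynamicBloomSketch() {
  rocksdb::DynamicBloom bloom(/*total_bits=*/1024, /*num_probes=*/6);
  bloom.Add("prefix1");
  assert(bloom.MayContain("prefix1"));  // added keys always match
  // "prefix2" may still hit as a false positive, but usually will not.
}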

113
util/dynamic_bloom_test.cc Normal file
View File

@@ -0,0 +1,113 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#include <gflags/gflags.h>
#include "dynamic_bloom.h"
#include "util/logging.h"
#include "util/testharness.h"
#include "util/testutil.h"
DEFINE_int32(bits_per_key, 10, "");
DEFINE_int32(num_probes, 6, "");
namespace rocksdb {
static Slice Key(int i, char* buffer) {
memcpy(buffer, &i, sizeof(i));
return Slice(buffer, sizeof(i));
}
class DynamicBloomTest {
};
TEST(DynamicBloomTest, EmptyFilter) {
DynamicBloom bloom(100, 2);
ASSERT_TRUE(! bloom.MayContain("hello"));
ASSERT_TRUE(! bloom.MayContain("world"));
}
TEST(DynamicBloomTest, Small) {
DynamicBloom bloom(100, 2);
bloom.Add("hello");
bloom.Add("world");
ASSERT_TRUE(bloom.MayContain("hello"));
ASSERT_TRUE(bloom.MayContain("world"));
ASSERT_TRUE(! bloom.MayContain("x"));
ASSERT_TRUE(! bloom.MayContain("foo"));
}
static int NextLength(int length) {
if (length < 10) {
length += 1;
} else if (length < 100) {
length += 10;
} else if (length < 1000) {
length += 100;
} else {
length += 1000;
}
return length;
}
TEST(DynamicBloomTest, VaryingLengths) {
char buffer[sizeof(int)];
// Count number of filters that significantly exceed the false positive rate
int mediocre_filters = 0;
int good_filters = 0;
fprintf(stderr, "bits_per_key: %d num_probes: %d\n",
FLAGS_bits_per_key, FLAGS_num_probes);
for (int length = 1; length <= 10000; length = NextLength(length)) {
uint32_t bloom_bits = std::max(length * FLAGS_bits_per_key, 64);
DynamicBloom bloom(bloom_bits, FLAGS_num_probes);
for (int i = 0; i < length; i++) {
bloom.Add(Key(i, buffer));
ASSERT_TRUE(bloom.MayContain(Key(i, buffer)));
}
// All added keys must match
for (int i = 0; i < length; i++) {
ASSERT_TRUE(bloom.MayContain(Key(i, buffer)))
<< "Length " << length << "; key " << i;
}
// Check false positive rate
int result = 0;
for (int i = 0; i < 10000; i++) {
if (bloom.MayContain(Key(i + 1000000000, buffer))) {
result++;
}
}
double rate = result / 10000.0;
fprintf(stderr, "False positives: %5.2f%% @ length = %6d ; \n",
rate*100.0, length);
//ASSERT_LE(rate, 0.02); // Must not be over 2%
if (rate > 0.0125)
mediocre_filters++; // Allowed, but not too often
else
good_filters++;
}
fprintf(stderr, "Filters: %d good, %d mediocre\n",
good_filters, mediocre_filters);
ASSERT_LE(mediocre_filters, good_filters/5);
}
// Different bits-per-byte
} // namespace rocksdb
int main(int argc, char** argv) {
google::ParseCommandLineFlags(&argc, &argv, true);
return rocksdb::test::RunAllTests();
}

View File

@@ -306,7 +306,13 @@ class PosixMmapReadableFile: public RandomAccessFile {
assert(options.use_mmap_reads);
assert(options.use_os_buffer);
}
virtual ~PosixMmapReadableFile() { munmap(mmapped_region_, length_); }
virtual ~PosixMmapReadableFile() {
int ret = munmap(mmapped_region_, length_);
if (ret != 0) {
fprintf(stdout, "failed to munmap %p length %zu \n",
mmapped_region_, length_);
}
}
virtual Status Read(uint64_t offset, size_t n, Slice* result,
char* scratch) const {

470
util/hash_linklist_rep.cc Normal file
View File

@@ -0,0 +1,470 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
#include "util/hash_linklist_rep.h"
#include "rocksdb/memtablerep.h"
#include "util/arena.h"
#include "rocksdb/slice.h"
#include "rocksdb/slice_transform.h"
#include "port/port.h"
#include "port/atomic_pointer.h"
#include "util/murmurhash.h"
#include "db/memtable.h"
#include "db/skiplist.h"
namespace rocksdb {
namespace {
typedef const char* Key;
struct Node {
explicit Node(const Key& k) :
key(k) {
}
Key const key;
// Accessors/mutators for links. Wrapped in methods so we can
// add the appropriate barriers as necessary.
Node* Next() {
// Use an 'acquire load' so that we observe a fully initialized
// version of the returned Node.
return reinterpret_cast<Node*>(next_.Acquire_Load());
}
void SetNext(Node* x) {
// Use a 'release store' so that anybody who reads through this
// pointer observes a fully initialized version of the inserted node.
next_.Release_Store(x);
}
// No-barrier variants that can be safely used in a few locations.
Node* NoBarrier_Next() {
return reinterpret_cast<Node*>(next_.NoBarrier_Load());
}
void NoBarrier_SetNext(Node* x) {
next_.NoBarrier_Store(x);
}
private:
port::AtomicPointer next_;
};
class HashLinkListRep : public MemTableRep {
public:
HashLinkListRep(MemTableRep::KeyComparator& compare, Arena* arena,
const SliceTransform* transform, size_t bucket_size);
virtual void Insert(const char* key) override;
virtual bool Contains(const char* key) const override;
virtual size_t ApproximateMemoryUsage() override;
virtual ~HashLinkListRep();
virtual MemTableRep::Iterator* GetIterator() override;
virtual MemTableRep::Iterator* GetIterator(const Slice& slice) override;
virtual MemTableRep::Iterator* GetPrefixIterator(const Slice& prefix)
override;
virtual MemTableRep::Iterator* GetDynamicPrefixIterator() override;
private:
friend class DynamicIterator;
typedef SkipList<const char*, MemTableRep::KeyComparator&> FullList;
size_t bucket_size_;
// Maps slices (which are transformed user keys) to buckets of keys sharing
// the same transform.
port::AtomicPointer* buckets_;
// The user-supplied transform whose domain is the user keys.
const SliceTransform* transform_;
MemTableRep::KeyComparator& compare_;
// immutable after construction
Arena* const arena_;
bool BucketContains(Node* head, const Slice& key) const;
Slice GetPrefix(const Slice& internal_key) const {
return transform_->Transform(ExtractUserKey(internal_key));
}
size_t GetHash(const Slice& slice) const {
return MurmurHash(slice.data(), slice.size(), 0) % bucket_size_;
}
Node* GetBucket(size_t i) const {
return static_cast<Node*>(buckets_[i].Acquire_Load());
}
Node* GetBucket(const Slice& slice) const {
return GetBucket(GetHash(slice));
}
Node* NewNode(const Key& key) {
char* mem = arena_->AllocateAligned(sizeof(Node));
return new (mem) Node(key);
}
bool Equal(const Slice& a, const Key& b) const {
return (compare_(b, a) == 0);
}
bool Equal(const Key& a, const Key& b) const { return (compare_(a, b) == 0); }
bool KeyIsAfterNode(const Slice& internal_key, const Node* n) const {
// nullptr n is considered infinite
return (n != nullptr) && (compare_(n->key, internal_key) < 0);
}
bool KeyIsAfterNode(const Key& key, const Node* n) const {
// nullptr n is considered infinite
return (n != nullptr) && (compare_(n->key, key) < 0);
}
Node* FindGreaterOrEqualInBucket(Node* head, const Slice& key) const;
class FullListIterator : public MemTableRep::Iterator {
public:
explicit FullListIterator(FullList* list)
: iter_(list), full_list_(list) {}
virtual ~FullListIterator() {
}
// Returns true iff the iterator is positioned at a valid node.
virtual bool Valid() const {
return iter_.Valid();
}
// Returns the key at the current position.
// REQUIRES: Valid()
virtual const char* key() const {
assert(Valid());
return iter_.key();
}
// Advances to the next position.
// REQUIRES: Valid()
virtual void Next() {
assert(Valid());
iter_.Next();
}
// Advances to the previous position.
// REQUIRES: Valid()
virtual void Prev() {
assert(Valid());
iter_.Prev();
}
// Advance to the first entry with a key >= target
virtual void Seek(const Slice& internal_key, const char* memtable_key) {
const char* encoded_key =
(memtable_key != nullptr) ?
memtable_key : EncodeKey(&tmp_, internal_key);
iter_.Seek(encoded_key);
}
// Position at the first entry in collection.
// Final state of iterator is Valid() iff collection is not empty.
virtual void SeekToFirst() {
iter_.SeekToFirst();
}
// Position at the last entry in collection.
// Final state of iterator is Valid() iff collection is not empty.
virtual void SeekToLast() {
iter_.SeekToLast();
}
private:
FullList::Iterator iter_;
// To destruct with the iterator.
std::unique_ptr<FullList> full_list_;
std::string tmp_; // For passing to EncodeKey
};
class Iterator : public MemTableRep::Iterator {
public:
explicit Iterator(const HashLinkListRep* const hash_link_list_rep,
Node* head) :
hash_link_list_rep_(hash_link_list_rep), head_(head), node_(nullptr) {
}
virtual ~Iterator() {
}
// Returns true iff the iterator is positioned at a valid node.
virtual bool Valid() const {
return node_ != nullptr;
}
// Returns the key at the current position.
// REQUIRES: Valid()
virtual const char* key() const {
assert(Valid());
return node_->key;
}
// Advances to the next position.
// REQUIRES: Valid()
virtual void Next() {
assert(Valid());
node_ = node_->Next();
}
// Advances to the previous position.
// REQUIRES: Valid()
virtual void Prev() {
// Prefix iterator does not support total order.
// We simply set the iterator to invalid state
Reset(nullptr);
}
// Advance to the first entry with a key >= target
virtual void Seek(const Slice& internal_key, const char* memtable_key) {
node_ = hash_link_list_rep_->FindGreaterOrEqualInBucket(head_,
internal_key);
}
// Position at the first entry in collection.
// Final state of iterator is Valid() iff collection is not empty.
virtual void SeekToFirst() {
// Prefix iterator does not support total order.
// We simply set the iterator to invalid state
Reset(nullptr);
}
// Position at the last entry in collection.
// Final state of iterator is Valid() iff collection is not empty.
virtual void SeekToLast() {
// Prefix iterator does not support total order.
// We simply set the iterator to invalid state
Reset(nullptr);
}
protected:
void Reset(Node* head) {
head_ = head;
node_ = nullptr;
}
private:
friend class HashLinkListRep;
const HashLinkListRep* const hash_link_list_rep_;
Node* head_;
Node* node_;
std::string tmp_; // For passing to EncodeKey
virtual void SeekToHead() {
node_ = head_;
}
};
class DynamicIterator : public HashLinkListRep::Iterator {
public:
explicit DynamicIterator(HashLinkListRep& memtable_rep)
: HashLinkListRep::Iterator(&memtable_rep, nullptr),
memtable_rep_(memtable_rep) {}
// Advance to the first entry with a key >= target
virtual void Seek(const Slice& k, const char* memtable_key) {
auto transformed = memtable_rep_.GetPrefix(k);
Reset(memtable_rep_.GetBucket(transformed));
HashLinkListRep::Iterator::Seek(k, memtable_key);
}
private:
// the underlying memtable
const HashLinkListRep& memtable_rep_;
};
class EmptyIterator : public MemTableRep::Iterator {
// This is used when there wasn't a bucket. It is cheaper than
// instantiating an empty bucket over which to iterate.
public:
EmptyIterator() { }
virtual bool Valid() const {
return false;
}
virtual const char* key() const {
assert(false);
return nullptr;
}
virtual void Next() { }
virtual void Prev() { }
virtual void Seek(const Slice& user_key, const char* memtable_key) { }
virtual void SeekToFirst() { }
virtual void SeekToLast() { }
private:
};
};
HashLinkListRep::HashLinkListRep(MemTableRep::KeyComparator& compare,
Arena* arena, const SliceTransform* transform,
size_t bucket_size)
: bucket_size_(bucket_size),
transform_(transform),
compare_(compare),
arena_(arena) {
char* mem = arena_->AllocateAligned(
sizeof(port::AtomicPointer) * bucket_size);
buckets_ = new (mem) port::AtomicPointer[bucket_size];
for (size_t i = 0; i < bucket_size_; ++i) {
buckets_[i].NoBarrier_Store(nullptr);
}
}
HashLinkListRep::~HashLinkListRep() {
}
void HashLinkListRep::Insert(const char* key) {
assert(!Contains(key));
Slice internal_key = GetLengthPrefixedSlice(key);
auto transformed = GetPrefix(internal_key);
auto& bucket = buckets_[GetHash(transformed)];
Node* head = static_cast<Node*>(bucket.Acquire_Load());
if (!head) {
Node* x = NewNode(key);
// NoBarrier_SetNext() suffices since we will add a barrier when
// we publish a pointer to "x" in prev[i].
x->NoBarrier_SetNext(nullptr);
bucket.Release_Store(static_cast<void*>(x));
return;
}
Node* cur = head;
Node* prev = nullptr;
while (true) {
if (cur == nullptr) {
break;
}
Node* next = cur->Next();
// Make sure the lists are sorted.
// If x points to head_ or next points nullptr, it is trivially satisfied.
assert((cur == head) || (next == nullptr) ||
KeyIsAfterNode(next->key, cur));
if (KeyIsAfterNode(internal_key, cur)) {
// Keep searching in this list
prev = cur;
cur = next;
} else {
break;
}
}
// Our data structure does not allow duplicate insertion
assert(cur == nullptr || !Equal(key, cur->key));
Node* x = NewNode(key);
// NoBarrier_SetNext() suffices since we will add a barrier when
// we publish a pointer to "x" in prev[i].
x->NoBarrier_SetNext(cur);
if (prev) {
prev->SetNext(x);
} else {
bucket.Release_Store(static_cast<void*>(x));
}
}
bool HashLinkListRep::Contains(const char* key) const {
Slice internal_key = GetLengthPrefixedSlice(key);
auto transformed = GetPrefix(internal_key);
auto bucket = GetBucket(transformed);
if (bucket == nullptr) {
return false;
}
return BucketContains(bucket, internal_key);
}
size_t HashLinkListRep::ApproximateMemoryUsage() {
// Memory is always allocated from the arena.
return 0;
}
MemTableRep::Iterator* HashLinkListRep::GetIterator() {
auto list = new FullList(compare_, arena_);
for (size_t i = 0; i < bucket_size_; ++i) {
auto bucket = GetBucket(i);
if (bucket != nullptr) {
Iterator itr(this, bucket);
for (itr.SeekToHead(); itr.Valid(); itr.Next()) {
list->Insert(itr.key());
}
}
}
return new FullListIterator(list);
}
MemTableRep::Iterator* HashLinkListRep::GetPrefixIterator(
const Slice& prefix) {
auto bucket = GetBucket(prefix);
if (bucket == nullptr) {
return new EmptyIterator();
}
return new Iterator(this, bucket);
}
MemTableRep::Iterator* HashLinkListRep::GetIterator(const Slice& slice) {
return GetPrefixIterator(transform_->Transform(slice));
}
MemTableRep::Iterator* HashLinkListRep::GetDynamicPrefixIterator() {
return new DynamicIterator(*this);
}
bool HashLinkListRep::BucketContains(Node* head, const Slice& user_key) const {
Node* x = FindGreaterOrEqualInBucket(head, user_key);
return (x != nullptr && Equal(user_key, x->key));
}
Node* HashLinkListRep::FindGreaterOrEqualInBucket(Node* head,
const Slice& key) const {
Node* x = head;
while (true) {
if (x == nullptr) {
return x;
}
Node* next = x->Next();
// Make sure the lists are sorted.
// If x points to head_ or next points nullptr, it is trivially satisfied.
assert((x == head) || (next == nullptr) || KeyIsAfterNode(next->key, x));
if (KeyIsAfterNode(key, x)) {
// Keep searching in this list
x = next;
} else {
break;
}
}
return x;
}
} // anon namespace
MemTableRep* HashLinkListRepFactory::CreateMemTableRep(
MemTableRep::KeyComparator& compare, Arena* arena) {
return new HashLinkListRep(compare, arena, transform_, bucket_count_);
}
MemTableRepFactory* NewHashLinkListRepFactory(
const SliceTransform* transform, size_t bucket_count) {
return new HashLinkListRepFactory(transform, bucket_count);
}
} // namespace rocksdb
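
A hypothetical wiring sketch (not part of this diff) showing how the new rep could be plugged into Options; NewFixedPrefixTransform and memtable_factory are existing RocksDB APIs assumed here:

#include "rocksdb/memtablerep.h"
#include "rocksdb/options.h"
#include "rocksdb/slice_transform.h"

// Sketch only: bucket keys by a fixed 4-byte prefix; the factory deletes the
// transform in its destructor, so it gets its own instance.
void UseHashLinkListRep(rocksdb::Options* options) {
  options->memtable_factory.reset(rocksdb::NewHashLinkListRepFactory(
      rocksdb::NewFixedPrefixTransform(4), /*bucket_count=*/50000));
}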

39
util/hash_linklist_rep.h Normal file
View File

@@ -0,0 +1,39 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#pragma once
#include "rocksdb/slice_transform.h"
#include "rocksdb/memtablerep.h"
namespace rocksdb {
class HashLinkListRepFactory : public MemTableRepFactory {
public:
explicit HashLinkListRepFactory(
const SliceTransform* transform,
size_t bucket_count)
: transform_(transform),
bucket_count_(bucket_count) { }
virtual ~HashLinkListRepFactory() { delete transform_; }
virtual MemTableRep* CreateMemTableRep(MemTableRep::KeyComparator& compare,
Arena* arena) override;
virtual const char* Name() const override {
return "HashLinkListRepFactory";
}
const SliceTransform* GetTransform() { return transform_; }
private:
const SliceTransform* transform_;
const size_t bucket_count_;
};
}

View File

@@ -7,12 +7,13 @@
#include "util/hash_skiplist_rep.h"
#include "rocksdb/memtablerep.h"
#include "rocksdb/arena.h"
#include "util/arena.h"
#include "rocksdb/slice.h"
#include "rocksdb/slice_transform.h"
#include "port/port.h"
#include "port/atomic_pointer.h"
#include "util/murmurhash.h"
#include "db/memtable.h"
#include "db/skiplist.h"
namespace rocksdb {
@@ -21,7 +22,8 @@ namespace {
class HashSkipListRep : public MemTableRep {
public:
HashSkipListRep(MemTableRep::KeyComparator& compare, Arena* arena,
const SliceTransform* transform, size_t bucket_size);
const SliceTransform* transform, size_t bucket_size,
int32_t skiplist_height, int32_t skiplist_branching_factor);
virtual void Insert(const char* key) override;
@@ -46,6 +48,9 @@ class HashSkipListRep : public MemTableRep {
size_t bucket_size_;
const int32_t skiplist_height_;
const int32_t skiplist_branching_factor_;
// Maps slices (which are transformed user keys) to buckets of keys sharing
// the same transform.
port::AtomicPointer* buckets_;
@@ -112,9 +117,12 @@ class HashSkipListRep : public MemTableRep {
}
// Advance to the first entry with a key >= target
virtual void Seek(const char* target) {
virtual void Seek(const Slice& internal_key, const char* memtable_key) {
if (list_ != nullptr) {
iter_.Seek(target);
const char* encoded_key =
(memtable_key != nullptr) ?
memtable_key : EncodeKey(&tmp_, internal_key);
iter_.Seek(encoded_key);
}
}
@@ -151,6 +159,7 @@ class HashSkipListRep : public MemTableRep {
// here we track whether we own list_. If we own it, we are also
// responsible for cleaning it up. This is a poor man's shared_ptr
bool own_list_;
std::string tmp_; // For passing to EncodeKey
};
class DynamicIterator : public HashSkipListRep::Iterator {
@@ -160,11 +169,10 @@ class HashSkipListRep : public MemTableRep {
memtable_rep_(memtable_rep) {}
// Advance to the first entry with a key >= target
virtual void Seek(const char* target) {
auto transformed = memtable_rep_.transform_->Transform(
memtable_rep_.UserKey(target));
virtual void Seek(const Slice& k, const char* memtable_key) {
auto transformed = memtable_rep_.transform_->Transform(ExtractUserKey(k));
Reset(memtable_rep_.GetBucket(transformed));
HashSkipListRep::Iterator::Seek(target);
HashSkipListRep::Iterator::Seek(k, memtable_key);
}
// Position at the first entry in collection.
@@ -201,7 +209,8 @@ class HashSkipListRep : public MemTableRep {
}
virtual void Next() { }
virtual void Prev() { }
virtual void Seek(const char* target) { }
virtual void Seek(const Slice& internal_key,
const char* memtable_key) { }
virtual void SeekToFirst() { }
virtual void SeekToLast() { }
private:
@@ -210,8 +219,11 @@ class HashSkipListRep : public MemTableRep {
HashSkipListRep::HashSkipListRep(MemTableRep::KeyComparator& compare,
Arena* arena, const SliceTransform* transform,
size_t bucket_size)
size_t bucket_size, int32_t skiplist_height,
int32_t skiplist_branching_factor)
: bucket_size_(bucket_size),
skiplist_height_(skiplist_height),
skiplist_branching_factor_(skiplist_branching_factor),
transform_(transform),
compare_(compare),
arena_(arena) {
@@ -232,7 +244,8 @@ HashSkipListRep::Bucket* HashSkipListRep::GetInitializedBucket(
auto bucket = GetBucket(hash);
if (bucket == nullptr) {
auto addr = arena_->AllocateAligned(sizeof(Bucket));
bucket = new (addr) Bucket(compare_, arena_);
bucket = new (addr) Bucket(compare_, arena_, skiplist_height_,
skiplist_branching_factor_);
buckets_[hash].Release_Store(static_cast<void*>(bucket));
}
return bucket;
@@ -292,12 +305,15 @@ MemTableRep::Iterator* HashSkipListRep::GetDynamicPrefixIterator() {
MemTableRep* HashSkipListRepFactory::CreateMemTableRep(
MemTableRep::KeyComparator& compare, Arena* arena) {
return new HashSkipListRep(compare, arena, transform_, bucket_count_);
return new HashSkipListRep(compare, arena, transform_, bucket_count_,
skiplist_height_, skiplist_branching_factor_);
}
MemTableRepFactory* NewHashSkipListRepFactory(
const SliceTransform* transform, size_t bucket_count) {
return new HashSkipListRepFactory(transform, bucket_count);
const SliceTransform* transform, size_t bucket_count,
int32_t skiplist_height, int32_t skiplist_branching_factor) {
return new HashSkipListRepFactory(transform, bucket_count,
skiplist_height, skiplist_branching_factor);
}
} // namespace rocksdb

View File

@@ -14,10 +14,15 @@ namespace rocksdb {
class HashSkipListRepFactory : public MemTableRepFactory {
public:
explicit HashSkipListRepFactory(const SliceTransform* transform,
size_t bucket_count = 1000000)
: transform_(transform),
bucket_count_(bucket_count) { }
explicit HashSkipListRepFactory(
const SliceTransform* transform,
size_t bucket_count,
int32_t skiplist_height,
int32_t skiplist_branching_factor)
: transform_(transform),
bucket_count_(bucket_count),
skiplist_height_(skiplist_height),
skiplist_branching_factor_(skiplist_branching_factor) { }
virtual ~HashSkipListRepFactory() { delete transform_; }
@@ -33,6 +38,8 @@ class HashSkipListRepFactory : public MemTableRepFactory {
private:
const SliceTransform* transform_;
const size_t bucket_count_;
const int32_t skiplist_height_;
const int32_t skiplist_branching_factor_;
};
}
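
Similarly, a hypothetical sketch of the expanded NewHashSkipListRepFactory signature with the two new skip-list tuning parameters (values are illustrative):

#include "rocksdb/memtablerep.h"
#include "rocksdb/options.h"
#include "rocksdb/slice_transform.h"

// Sketch only: the last two arguments are the per-bucket skip list's maximum
// height and branching factor introduced in this change.
void UseHashSkipListRep(rocksdb::Options* options) {
  options->memtable_factory.reset(rocksdb::NewHashSkipListRepFactory(
      rocksdb::NewFixedPrefixTransform(8), /*bucket_count=*/1000000,
      /*skiplist_height=*/4, /*skiplist_branching_factor=*/4));
}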

View File

@@ -16,10 +16,11 @@
#include "rocksdb/comparator.h"
#include "rocksdb/env.h"
#include "rocksdb/filter_policy.h"
#include "rocksdb/merge_operator.h"
#include "rocksdb/memtablerep.h"
#include "rocksdb/merge_operator.h"
#include "rocksdb/slice.h"
#include "rocksdb/slice_transform.h"
#include "rocksdb/table.h"
#include "rocksdb/table_properties.h"
#include "table/block_based_table_factory.h"
@@ -73,6 +74,9 @@ ColumnFamilyOptions::ColumnFamilyOptions()
std::shared_ptr<TableFactory>(new BlockBasedTableFactory())),
inplace_update_support(false),
inplace_update_num_locks(10000),
inplace_callback(nullptr),
memtable_prefix_bloom_bits(0),
memtable_prefix_bloom_probes(6),
max_successive_merges(0) {
assert(memtable_factory.get() != nullptr);
}
@@ -131,6 +135,9 @@ ColumnFamilyOptions::ColumnFamilyOptions(const Options& options)
table_properties_collectors(options.table_properties_collectors),
inplace_update_support(options.inplace_update_support),
inplace_update_num_locks(options.inplace_update_num_locks),
inplace_callback(options.inplace_callback),
memtable_prefix_bloom_bits(options.memtable_prefix_bloom_bits),
memtable_prefix_bloom_probes(options.memtable_prefix_bloom_probes),
max_successive_merges(options.max_successive_merges) {
assert(memtable_factory.get() != nullptr);
}
@@ -396,6 +403,11 @@ Options::Dump(Logger* log) const
inplace_update_support);
Log(log, " Options.inplace_update_num_locks: %zd",
inplace_update_num_locks);
// TODO: easier config for bloom (maybe based on avg key/value size)
Log(log, " Options.memtable_prefix_bloom_bits: %d",
memtable_prefix_bloom_bits);
Log(log, " Options.memtable_prefix_bloom_probes: %d",
memtable_prefix_bloom_probes);
Log(log, " Options.max_successive_merges: %zd",
max_successive_merges);
} // Options::Dump
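
A hypothetical configuration sketch for the two new memtable prefix bloom knobs dumped above (the values are illustrative, not recommendations):

#include "rocksdb/options.h"

// Sketch only: size the per-memtable prefix bloom and its probe count.
void ConfigureMemtablePrefixBloom(rocksdb::Options* options) {
  options->memtable_prefix_bloom_bits = 8 * 1024 * 1024;  // 8M bits (~1 MB)
  options->memtable_prefix_bloom_probes = 6;              // hash probes per prefix
}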

View File

@@ -22,7 +22,20 @@ void PerfContext::Reset() {
block_decompress_time = 0;
internal_key_skipped_count = 0;
internal_delete_skipped_count = 0;
wal_write_time = 0;
write_wal_time = 0;
get_snapshot_time = 0;
get_from_memtable_time = 0;
get_from_memtable_count = 0;
get_post_process_time = 0;
get_from_output_files_time = 0;
seek_child_seek_time = 0;
seek_child_seek_count = 0;
seek_min_heap_time = 0;
seek_internal_seek_time = 0;
find_next_user_entry_time = 0;
write_pre_and_post_process_time = 0;
write_memtable_time = 0;
}
__thread PerfContext perf_context;

View File

@@ -70,8 +70,13 @@ public:
}
// Advance to the first entry with a key >= target
virtual void Seek(const char* target) override {
iter_.Seek(target);
virtual void Seek(const Slice& user_key, const char* memtable_key)
override {
if (memtable_key != nullptr) {
iter_.Seek(memtable_key);
} else {
iter_.Seek(EncodeKey(&tmp_, user_key));
}
}
// Position at the first entry in list.
@@ -85,6 +90,8 @@ public:
virtual void SeekToLast() override {
iter_.SeekToLast();
}
protected:
std::string tmp_; // For passing to EncodeKey
};
// Unhide default implementations of GetIterator

View File

@@ -8,6 +8,8 @@
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#pragma once
#include <string>
#include "db/dbformat.h"
#include "rocksdb/env.h"
#include "rocksdb/slice.h"
#include "util/random.h"
@@ -51,5 +53,28 @@ class ErrorEnv : public EnvWrapper {
}
};
// An internal comparator that simply forwards comparisons to the user
// comparator it wraps. Can be used to test entities that have no dependency
// on the internal key structure but consume an InternalKeyComparator, like
// BlockBasedTable.
class PlainInternalKeyComparator : public InternalKeyComparator {
public:
explicit PlainInternalKeyComparator(const Comparator* c)
: InternalKeyComparator(c) {}
virtual ~PlainInternalKeyComparator() {}
virtual int Compare(const Slice& a, const Slice& b) const override {
return user_comparator()->Compare(a, b);
}
virtual void FindShortestSeparator(std::string* start,
const Slice& limit) const override {
user_comparator()->FindShortestSeparator(start, limit);
}
virtual void FindShortSuccessor(std::string* key) const override {
user_comparator()->FindShortSuccessor(key);
}
};
} // namespace test
} // namespace rocksdb

View File

@@ -11,7 +11,8 @@
#include <algorithm>
#include <type_traits>
#include "rocksdb/arena.h"
#include "util/arena.h"
#include "db/memtable.h"
#include "port/port.h"
#include "util/mutexlock.h"
#include "util/stl_wrappers.h"
@@ -45,6 +46,7 @@ class VectorRep : public MemTableRep {
std::shared_ptr<std::vector<const char*>> bucket_;
typename std::vector<const char*>::const_iterator mutable cit_;
const KeyComparator& compare_;
std::string tmp_; // For passing to EncodeKey
bool mutable sorted_;
void DoSort() const;
public:
@@ -73,7 +75,7 @@ class VectorRep : public MemTableRep {
virtual void Prev() override;
// Advance to the first entry with a key >= target
virtual void Seek(const char* target) override;
virtual void Seek(const Slice& user_key, const char* memtable_key) override;
// Position at the first entry in collection.
// Final state of iterator is Valid() iff collection is not empty.
@@ -200,12 +202,15 @@ void VectorRep::Iterator::Prev() {
}
// Advance to the first entry with a key >= target
void VectorRep::Iterator::Seek(const char* target) {
void VectorRep::Iterator::Seek(const Slice& user_key,
const char* memtable_key) {
DoSort();
// Do binary search to find first value not less than the target
const char* encoded_key =
(memtable_key != nullptr) ? memtable_key : EncodeKey(&tmp_, user_key);
cit_ = std::equal_range(bucket_->begin(),
bucket_->end(),
target,
encoded_key,
[this] (const char* a, const char* b) {
return compare_(a, b) < 0;
}).first;