diff --git a/db/db_test.cc b/db/db_test.cc index aa94f089a7..c089dcf43a 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -4649,85 +4649,90 @@ void PrefixScanInit(DBTest *dbtest) { } TEST(DBTest, PrefixScan) { - ReadOptions ro = ReadOptions(); - int count; - Slice prefix; - Slice key; - char buf[100]; - Iterator* iter; - snprintf(buf, sizeof(buf), "03______:"); - prefix = Slice(buf, 8); - key = Slice(buf, 9); - auto prefix_extractor = NewFixedPrefixTransform(8); - auto memtable_factory = - std::make_shared(prefix_extractor); - - // db configs - env_->count_random_reads_ = true; - Options options = CurrentOptions(); - options.env = env_; - options.block_cache = NewLRUCache(0); // Prevent cache hits - options.filter_policy = NewBloomFilterPolicy(10); - options.prefix_extractor = prefix_extractor; - options.whole_key_filtering = false; - options.disable_auto_compactions = true; - options.max_background_compactions = 2; - options.create_if_missing = true; - options.disable_seek_compaction = true; - options.memtable_factory = memtable_factory; - - // prefix specified, with blooms: 2 RAND I/Os - // SeekToFirst - DestroyAndReopen(&options); - PrefixScanInit(this); - count = 0; - env_->random_read_counter_.Reset(); - ro.prefix = &prefix; - iter = db_->NewIterator(ro); - for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { - assert(iter->key().starts_with(prefix)); - count++; - } - ASSERT_OK(iter->status()); - delete iter; - ASSERT_EQ(count, 2); - ASSERT_EQ(env_->random_read_counter_.Read(), 2); - - // prefix specified, with blooms: 2 RAND I/Os - // Seek - DestroyAndReopen(&options); - PrefixScanInit(this); - count = 0; - env_->random_read_counter_.Reset(); - ro.prefix = &prefix; - iter = db_->NewIterator(ro); - for (iter->Seek(key); iter->Valid(); iter->Next()) { - assert(iter->key().starts_with(prefix)); - count++; - } - ASSERT_OK(iter->status()); - delete iter; - ASSERT_EQ(count, 2); - ASSERT_EQ(env_->random_read_counter_.Read(), 2); - - // no prefix specified: 11 
RAND I/Os - DestroyAndReopen(&options); - PrefixScanInit(this); - count = 0; - env_->random_read_counter_.Reset(); - iter = db_->NewIterator(ReadOptions()); - for (iter->Seek(prefix); iter->Valid(); iter->Next()) { - if (! iter->key().starts_with(prefix)) { - break; + for (int it = 0; it < 2; ++it) { + ReadOptions ro = ReadOptions(); + int count; + Slice prefix; + Slice key; + char buf[100]; + Iterator* iter; + snprintf(buf, sizeof(buf), "03______:"); + prefix = Slice(buf, 8); + key = Slice(buf, 9); + auto prefix_extractor = NewFixedPrefixTransform(8); + // db configs + env_->count_random_reads_ = true; + Options options = CurrentOptions(); + options.env = env_; + options.block_cache = NewLRUCache(0); // Prevent cache hits + options.filter_policy = NewBloomFilterPolicy(10); + options.prefix_extractor = prefix_extractor; + options.whole_key_filtering = false; + options.disable_auto_compactions = true; + options.max_background_compactions = 2; + options.create_if_missing = true; + options.disable_seek_compaction = true; + if (it == 0) { + options.memtable_factory = std::make_shared( + prefix_extractor); + } else { + options.memtable_factory = std::make_shared( + prefix_extractor); } - count++; + + // prefix specified, with blooms: 2 RAND I/Os + // SeekToFirst + DestroyAndReopen(&options); + PrefixScanInit(this); + count = 0; + env_->random_read_counter_.Reset(); + ro.prefix = &prefix; + iter = db_->NewIterator(ro); + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + assert(iter->key().starts_with(prefix)); + count++; + } + ASSERT_OK(iter->status()); + delete iter; + ASSERT_EQ(count, 2); + ASSERT_EQ(env_->random_read_counter_.Read(), 2); + + // prefix specified, with blooms: 2 RAND I/Os + // Seek + DestroyAndReopen(&options); + PrefixScanInit(this); + count = 0; + env_->random_read_counter_.Reset(); + ro.prefix = &prefix; + iter = db_->NewIterator(ro); + for (iter->Seek(key); iter->Valid(); iter->Next()) { + assert(iter->key().starts_with(prefix)); + count++; 
+ } + ASSERT_OK(iter->status()); + delete iter; + ASSERT_EQ(count, 2); + ASSERT_EQ(env_->random_read_counter_.Read(), 2); + + // no prefix specified: 11 RAND I/Os + DestroyAndReopen(&options); + PrefixScanInit(this); + count = 0; + env_->random_read_counter_.Reset(); + iter = db_->NewIterator(ReadOptions()); + for (iter->Seek(prefix); iter->Valid(); iter->Next()) { + if (! iter->key().starts_with(prefix)) { + break; + } + count++; + } + ASSERT_OK(iter->status()); + delete iter; + ASSERT_EQ(count, 2); + ASSERT_EQ(env_->random_read_counter_.Read(), 11); + Close(); + delete options.filter_policy; } - ASSERT_OK(iter->status()); - delete iter; - ASSERT_EQ(count, 2); - ASSERT_EQ(env_->random_read_counter_.Read(), 11); - Close(); - delete options.filter_policy; } std::string MakeKey(unsigned int num) { diff --git a/db/prefix_test.cc b/db/prefix_test.cc index 0ec84269b7..48238d52e4 100644 --- a/db/prefix_test.cc +++ b/db/prefix_test.cc @@ -11,6 +11,7 @@ #include "util/testharness.h" DEFINE_bool(use_prefix_hash_memtable, true, ""); +DEFINE_bool(use_nolock_version, true, ""); DEFINE_bool(trigger_deadlock, false, "issue delete in range scan to trigger PrefixHashMap deadlock"); DEFINE_uint64(bucket_count, 100000, "number of buckets"); @@ -93,15 +94,24 @@ class PrefixTest { if (FLAGS_use_prefix_hash_memtable) { auto prefix_extractor = NewFixedPrefixTransform(8); options.prefix_extractor = prefix_extractor; - options.memtable_factory = - std::make_shared( - prefix_extractor, FLAGS_bucket_count, FLAGS_num_locks); + if (FLAGS_use_nolock_version) { + options.memtable_factory = + std::make_shared( + prefix_extractor, FLAGS_bucket_count); + } else { + options.memtable_factory = + std::make_shared( + prefix_extractor, FLAGS_bucket_count, FLAGS_num_locks); + } } Status s = DB::Open(options, kDbName, &db); ASSERT_OK(s); return std::shared_ptr(db); } + ~PrefixTest() { + delete options.comparator; + } protected: Options options; }; diff --git a/include/rocksdb/memtablerep.h 
b/include/rocksdb/memtablerep.h index 361b2f9c98..1b3a71cc38 100644 --- a/include/rocksdb/memtablerep.h +++ b/include/rocksdb/memtablerep.h @@ -15,11 +15,11 @@ // Users can implement their own memtable representations. We include four // types built in: // - SkipListRep: This is the default; it is backed by a skip list. -// - TransformRep: This is backed by an std::unordered_map. On construction, they are given a SliceTransform object. This +// - TransformRep: This is backed by a custom hash map. +// On construction, they are given a SliceTransform object. This // object is applied to the user key of stored items which indexes into the -// unordered map to yield a set containing all records that share the same user -// key under the transform function. +// hash map to yield a skiplist containing all records that share the same +// user key under the transform function. // - UnsortedRep: A subclass of TransformRep where the transform function is // the identity function. Optimized for point lookups. // - PrefixHashRep: A subclass of TransformRep where the transform function is @@ -254,6 +254,68 @@ public: } }; + +// NO LOCKS VERSION + +// The same as TransformRepFactory except it doesn't use locks. 
+// Experimental, will replace TransformRepFactory once we are sure +// it performs better +class TransformRepNoLockFactory : public MemTableRepFactory { + public: + explicit TransformRepNoLockFactory(const SliceTransform* transform, + size_t bucket_count) + : transform_(transform), + bucket_count_(bucket_count) { } + + virtual ~TransformRepNoLockFactory() { delete transform_; } + + virtual std::shared_ptr CreateMemTableRep( + MemTableRep::KeyComparator&, Arena*) override; + + virtual const char* Name() const override { + return "TransformRepNoLockFactory"; + } + + const SliceTransform* GetTransform() { return transform_; } + + protected: + const SliceTransform* transform_; + const size_t bucket_count_; +}; + +// UnsortedReps bin user keys based on an identity function transform -- that +// is, transform(key) = key. This optimizes for point look-ups. +// +// Parameters: See TransformRepNoLockFactory. +class UnsortedRepNoLockFactory : public TransformRepNoLockFactory { +public: + explicit UnsortedRepNoLockFactory(size_t bucket_count = 1000000) + : TransformRepNoLockFactory(NewNoopTransform(), + bucket_count) { } + virtual const char* Name() const override { + return "UnsortedRepNoLockFactory"; + } +}; + +// PrefixHashReps bin user keys based on a fixed-size prefix. This optimizes for +// short ranged scans over a given prefix. +// +// Parameters: See TransformRepNoLockFactory. 
+class PrefixHashRepNoLockFactory : public TransformRepNoLockFactory { +public: + explicit PrefixHashRepNoLockFactory(const SliceTransform* prefix_extractor, + size_t bucket_count = 1000000) + : TransformRepNoLockFactory(prefix_extractor, bucket_count) + { } + + virtual std::shared_ptr CreateMemTableRep( + MemTableRep::KeyComparator&, Arena*) override; + + virtual const char* Name() const override { + return "PrefixHashRepNoLockFactory"; + } +}; + } #endif // STORAGE_ROCKSDB_DB_MEMTABLEREP_H_ diff --git a/util/transformrepnolock.cc b/util/transformrepnolock.cc new file mode 100644 index 0000000000..3fe520462b --- /dev/null +++ b/util/transformrepnolock.cc @@ -0,0 +1,262 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// + +#include "rocksdb/memtablerep.h" +#include "rocksdb/arena.h" +#include "rocksdb/slice.h" +#include "rocksdb/slice_transform.h" +#include "port/port.h" +#include "port/atomic_pointer.h" +#include "util/murmurhash.h" +#include "db/skiplist.h" + +namespace rocksdb { +namespace { + +class TransformRepNoLock : public MemTableRep { + public: + TransformRepNoLock(MemTableRep::KeyComparator& compare, Arena* arena, + const SliceTransform* transform, size_t bucket_size); + + virtual void Insert(const char* key) override; + + virtual bool Contains(const char* key) const override; + + virtual size_t ApproximateMemoryUsage() override; + + virtual ~TransformRepNoLock(); + + virtual std::shared_ptr GetIterator() override; + + virtual std::shared_ptr GetIterator( + const Slice& slice) override; + + std::shared_ptr GetTransformIterator( + const Slice& transformed); + + private: + typedef SkipList Bucket; + + size_t bucket_size_; + + // Maps slices (which are transformed user keys) to buckets of keys sharing + // the same 
transform. + port::AtomicPointer* buckets_; + + // The user-supplied transform whose domain is the user keys. + const SliceTransform* transform_; + + MemTableRep::KeyComparator& compare_; + // immutable after construction + Arena* const arena_; + + inline size_t GetHash(const Slice& slice) const { + return MurmurHash(slice.data(), slice.size(), 0) % bucket_size_; + } + inline Bucket* GetBucket(size_t i) const { + return static_cast(buckets_[i].Acquire_Load()); + } + inline Bucket* GetBucket(const Slice& slice) const { + return GetBucket(GetHash(slice)); + } + // Get a bucket from buckets_. If the bucket hasn't been initialized yet, + // initialize it before returning. + Bucket* GetInitializedBucket(const Slice& transformed); + + class Iterator : public MemTableRep::Iterator { + public: + explicit Iterator(Bucket* list, bool own_list = true) + : list_(list), + iter_(list), + own_list_(own_list) {} + + virtual ~Iterator() { + // if we own the list, we should also delete it + if (own_list_) { + delete list_; + } + }; + + // Returns true iff the iterator is positioned at a valid node. + virtual bool Valid() const { + return iter_.Valid(); + } + + // Returns the key at the current position. + // REQUIRES: Valid() + virtual const char* key() const { + return iter_.key(); + } + + // Advances to the next position. + // REQUIRES: Valid() + virtual void Next() { + iter_.Next(); + } + + // Advances to the previous position. + // REQUIRES: Valid() + virtual void Prev() { + iter_.Prev(); + } + + // Advance to the first entry with a key >= target + virtual void Seek(const char* target) { + iter_.Seek(target); + } + + // Position at the first entry in collection. + // Final state of iterator is Valid() iff collection is not empty. + virtual void SeekToFirst() { + iter_.SeekToFirst(); + } + + // Position at the last entry in collection. + // Final state of iterator is Valid() iff collection is not empty. 
+ virtual void SeekToLast() { + iter_.SeekToLast(); + } + private: + Bucket* list_; + Bucket::Iterator iter_; + // here we track if we own list_. If we own it, we are also + // responsible for its cleanup. This is a poor man's shared_ptr + bool own_list_; + }; + + class EmptyIterator : public MemTableRep::Iterator { + // This is used when there wasn't a bucket. It is cheaper than + // instantiating an empty bucket over which to iterate. + public: + EmptyIterator() { } + virtual bool Valid() const { + return false; + } + virtual const char* key() const { + assert(false); + return nullptr; + } + virtual void Next() { } + virtual void Prev() { } + virtual void Seek(const char* target) { } + virtual void SeekToFirst() { } + virtual void SeekToLast() { } + private: + }; + + std::shared_ptr empty_iterator_; +}; + +class PrefixHashRepNoLock : public TransformRepNoLock { + public: + PrefixHashRepNoLock(MemTableRep::KeyComparator& compare, Arena* arena, + const SliceTransform* transform, size_t bucket_size) + : TransformRepNoLock(compare, arena, transform, bucket_size) { } + + virtual std::shared_ptr GetPrefixIterator( + const Slice& prefix) override; +}; + +TransformRepNoLock::TransformRepNoLock(MemTableRep::KeyComparator& compare, + Arena* arena, const SliceTransform* transform, size_t bucket_size) + : bucket_size_(bucket_size), + transform_(transform), + compare_(compare), + arena_(arena), + empty_iterator_(std::make_shared()) { + + buckets_ = new port::AtomicPointer[bucket_size]; + + for (size_t i = 0; i < bucket_size_; ++i) { + buckets_[i].NoBarrier_Store(nullptr); + } +} + +TransformRepNoLock::~TransformRepNoLock() { + delete[] buckets_; +} + +TransformRepNoLock::Bucket* TransformRepNoLock::GetInitializedBucket( + const Slice& transformed) { + size_t hash = GetHash(transformed); + auto bucket = GetBucket(hash); + if (bucket == nullptr) { + auto addr = arena_->AllocateAligned(sizeof(Bucket)); + bucket = new (addr) Bucket(compare_, arena_); 
buckets_[hash].Release_Store(static_cast(bucket)); + } + return bucket; +} + +void TransformRepNoLock::Insert(const char* key) { + assert(!Contains(key)); + auto transformed = transform_->Transform(UserKey(key)); + auto bucket = GetInitializedBucket(transformed); + bucket->Insert(key); +} + +bool TransformRepNoLock::Contains(const char* key) const { + auto transformed = transform_->Transform(UserKey(key)); + auto bucket = GetBucket(transformed); + if (bucket == nullptr) { + return false; + } + return bucket->Contains(key); +} + +size_t TransformRepNoLock::ApproximateMemoryUsage() { + return sizeof(buckets_); +} + +std::shared_ptr TransformRepNoLock::GetIterator() { + auto list = new Bucket(compare_, arena_); + for (size_t i = 0; i < bucket_size_; ++i) { + auto bucket = GetBucket(i); + if (bucket != nullptr) { + Bucket::Iterator itr(bucket); + for (itr.SeekToFirst(); itr.Valid(); itr.Next()) { + list->Insert(itr.key()); + } + } + } + return std::make_shared(list); +} + +std::shared_ptr TransformRepNoLock::GetTransformIterator( + const Slice& transformed) { + auto bucket = GetBucket(transformed); + if (bucket == nullptr) { + return empty_iterator_; + } + return std::make_shared(bucket, false); +} + +std::shared_ptr TransformRepNoLock::GetIterator( + const Slice& slice) { + auto transformed = transform_->Transform(slice); + return GetTransformIterator(transformed); +} + +} // anon namespace + +std::shared_ptr TransformRepNoLockFactory::CreateMemTableRep( + MemTableRep::KeyComparator& compare, Arena* arena) { + return std::make_shared(compare, arena, transform_, + bucket_count_); +} + +std::shared_ptr PrefixHashRepNoLockFactory::CreateMemTableRep( + MemTableRep::KeyComparator& compare, Arena* arena) { + return std::make_shared(compare, arena, transform_, + bucket_count_); +} + +std::shared_ptr PrefixHashRepNoLock::GetPrefixIterator( + const Slice& prefix) { + return TransformRepNoLock::GetTransformIterator(prefix); +} + +} // namespace rocksdb