Merge branch 'master' into columnfamilies

Conflicts:
	db/db_impl.cc
	db/db_test.cc
	db/internal_stats.cc
	db/internal_stats.h
	db/version_edit.cc
	db/version_edit.h
	db/version_set.cc
	include/rocksdb/options.h
	util/options.cc
This commit is contained in:
Igor Canadi
2014-03-31 12:44:54 -07:00
41 changed files with 1813 additions and 254 deletions

View File

@@ -5,6 +5,9 @@
#include "dynamic_bloom.h"
#include <algorithm>
#include "port/port.h"
#include "rocksdb/slice.h"
#include "util/hash.h"
@@ -17,20 +20,31 @@ static uint32_t BloomHash(const Slice& key) {
}
DynamicBloom::DynamicBloom(uint32_t total_bits,
uint32_t (*hash_func)(const Slice& key),
uint32_t num_probes)
: hash_func_(hash_func),
kTotalBits((total_bits + 7) / 8 * 8),
kNumProbes(num_probes) {
assert(hash_func_);
uint32_t cl_per_block,
uint32_t num_probes,
uint32_t (*hash_func)(const Slice& key))
: kBlocked(cl_per_block > 0),
kBitsPerBlock(std::min(cl_per_block, num_probes) * CACHE_LINE_SIZE * 8),
kTotalBits((kBlocked ? (total_bits + kBitsPerBlock - 1) / kBitsPerBlock
* kBitsPerBlock :
total_bits + 7) / 8 * 8),
kNumBlocks(kBlocked ? kTotalBits / kBitsPerBlock : 1),
kNumProbes(num_probes),
hash_func_(hash_func == nullptr ? &BloomHash : hash_func) {
assert(kBlocked ? kTotalBits > 0 : kTotalBits >= kBitsPerBlock);
assert(kNumProbes > 0);
assert(kTotalBits > 0);
data_.reset(new unsigned char[kTotalBits / 8]());
}
DynamicBloom::DynamicBloom(uint32_t total_bits,
uint32_t num_probes)
: DynamicBloom(total_bits, &BloomHash, num_probes) {
uint32_t sz = kTotalBits / 8;
if (kBlocked) {
sz += CACHE_LINE_SIZE - 1;
}
raw_ = new unsigned char[sz]();
if (kBlocked && (reinterpret_cast<uint64_t>(raw_) % CACHE_LINE_SIZE)) {
data_ = raw_ + CACHE_LINE_SIZE -
reinterpret_cast<uint64_t>(raw_) % CACHE_LINE_SIZE;
} else {
data_ = raw_;
}
}
} // rocksdb

View File

@@ -15,13 +15,17 @@ class Slice;
class DynamicBloom {
public:
// total_bits: fixed total bits for the bloom
// hash_func: customized hash function
// num_probes: number of hash probes for a single key
DynamicBloom(uint32_t total_bits,
uint32_t (*hash_func)(const Slice& key),
uint32_t num_probes = 6);
// cl_per_block: block size in cache lines. When this is non-zero, a
// query/set is done within a block to improve cache locality.
// hash_func: customized hash function
explicit DynamicBloom(uint32_t total_bits, uint32_t cl_per_block = 0,
uint32_t num_probes = 6,
uint32_t (*hash_func)(const Slice& key) = nullptr);
explicit DynamicBloom(uint32_t total_bits, uint32_t num_probes = 6);
~DynamicBloom() {
delete[] raw_;
}
// Assuming single threaded access to this function.
void Add(const Slice& key);
@@ -36,10 +40,15 @@ class DynamicBloom {
bool MayContainHash(uint32_t hash);
private:
uint32_t (*hash_func_)(const Slice& key);
const bool kBlocked;
const uint32_t kBitsPerBlock;
const uint32_t kTotalBits;
const uint32_t kNumBlocks;
const uint32_t kNumProbes;
std::unique_ptr<unsigned char[]> data_;
uint32_t (*hash_func_)(const Slice& key);
unsigned char* data_;
unsigned char* raw_;
};
inline void DynamicBloom::Add(const Slice& key) { AddHash(hash_func_(key)); }
@@ -50,22 +59,42 @@ inline bool DynamicBloom::MayContain(const Slice& key) {
inline bool DynamicBloom::MayContainHash(uint32_t h) {
const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits
for (uint32_t i = 0; i < kNumProbes; i++) {
const uint32_t bitpos = h % kTotalBits;
if (((data_[bitpos / 8]) & (1 << (bitpos % 8))) == 0) {
return false;
if (kBlocked) {
uint32_t b = ((h >> 11 | (h << 21)) % kNumBlocks) * kBitsPerBlock;
for (uint32_t i = 0; i < kNumProbes; ++i) {
const uint32_t bitpos = b + h % kBitsPerBlock;
if (((data_[bitpos / 8]) & (1 << (bitpos % 8))) == 0) {
return false;
}
h += delta;
}
} else {
for (uint32_t i = 0; i < kNumProbes; ++i) {
const uint32_t bitpos = h % kTotalBits;
if (((data_[bitpos / 8]) & (1 << (bitpos % 8))) == 0) {
return false;
}
h += delta;
}
h += delta;
}
return true;
}
inline void DynamicBloom::AddHash(uint32_t h) {
const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits
for (uint32_t i = 0; i < kNumProbes; i++) {
const uint32_t bitpos = h % kTotalBits;
data_[bitpos / 8] |= (1 << (bitpos % 8));
h += delta;
if (kBlocked) {
uint32_t b = ((h >> 11 | (h << 21)) % kNumBlocks) * kBitsPerBlock;
for (uint32_t i = 0; i < kNumProbes; ++i) {
const uint32_t bitpos = b + h % kBitsPerBlock;
data_[bitpos / 8] |= (1 << (bitpos % 8));
h += delta;
}
} else {
for (uint32_t i = 0; i < kNumProbes; ++i) {
const uint32_t bitpos = h % kTotalBits;
data_[bitpos / 8] |= (1 << (bitpos % 8));
h += delta;
}
}
}

View File

@@ -3,19 +3,23 @@
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#include <algorithm>
#include <gflags/gflags.h>
#include "dynamic_bloom.h"
#include "port/port.h"
#include "util/logging.h"
#include "util/testharness.h"
#include "util/testutil.h"
#include "util/stop_watch.h"
DEFINE_int32(bits_per_key, 10, "");
DEFINE_int32(num_probes, 6, "");
DEFINE_bool(enable_perf, false, "");
namespace rocksdb {
static Slice Key(int i, char* buffer) {
static Slice Key(uint64_t i, char* buffer) {
memcpy(buffer, &i, sizeof(i));
return Slice(buffer, sizeof(i));
}
@@ -24,36 +28,48 @@ class DynamicBloomTest {
};
TEST(DynamicBloomTest, EmptyFilter) {
DynamicBloom bloom(100, 2);
ASSERT_TRUE(! bloom.MayContain("hello"));
ASSERT_TRUE(! bloom.MayContain("world"));
DynamicBloom bloom1(100, 0, 2);
ASSERT_TRUE(!bloom1.MayContain("hello"));
ASSERT_TRUE(!bloom1.MayContain("world"));
DynamicBloom bloom2(CACHE_LINE_SIZE * 8 * 2 - 1, 1, 2);
ASSERT_TRUE(!bloom2.MayContain("hello"));
ASSERT_TRUE(!bloom2.MayContain("world"));
}
TEST(DynamicBloomTest, Small) {
DynamicBloom bloom(100, 2);
bloom.Add("hello");
bloom.Add("world");
ASSERT_TRUE(bloom.MayContain("hello"));
ASSERT_TRUE(bloom.MayContain("world"));
ASSERT_TRUE(! bloom.MayContain("x"));
ASSERT_TRUE(! bloom.MayContain("foo"));
DynamicBloom bloom1(100, 0, 2);
bloom1.Add("hello");
bloom1.Add("world");
ASSERT_TRUE(bloom1.MayContain("hello"));
ASSERT_TRUE(bloom1.MayContain("world"));
ASSERT_TRUE(!bloom1.MayContain("x"));
ASSERT_TRUE(!bloom1.MayContain("foo"));
DynamicBloom bloom2(CACHE_LINE_SIZE * 8 * 2 - 1, 1, 2);
bloom2.Add("hello");
bloom2.Add("world");
ASSERT_TRUE(bloom2.MayContain("hello"));
ASSERT_TRUE(bloom2.MayContain("world"));
ASSERT_TRUE(!bloom2.MayContain("x"));
ASSERT_TRUE(!bloom2.MayContain("foo"));
}
static int NextLength(int length) {
if (length < 10) {
length += 1;
} else if (length < 100) {
length += 10;
} else if (length < 1000) {
length += 100;
static uint32_t NextNum(uint32_t num) {
if (num < 10) {
num += 1;
} else if (num < 100) {
num += 10;
} else if (num < 1000) {
num += 100;
} else {
length += 1000;
num += 1000;
}
return length;
return num;
}
TEST(DynamicBloomTest, VaryingLengths) {
char buffer[sizeof(int)];
char buffer[sizeof(uint64_t)];
// Count number of filters that significantly exceed the false positive rate
int mediocre_filters = 0;
@@ -62,47 +78,116 @@ TEST(DynamicBloomTest, VaryingLengths) {
fprintf(stderr, "bits_per_key: %d num_probes: %d\n",
FLAGS_bits_per_key, FLAGS_num_probes);
for (int length = 1; length <= 10000; length = NextLength(length)) {
uint32_t bloom_bits = std::max(length * FLAGS_bits_per_key, 64);
DynamicBloom bloom(bloom_bits, FLAGS_num_probes);
for (int i = 0; i < length; i++) {
bloom.Add(Key(i, buffer));
ASSERT_TRUE(bloom.MayContain(Key(i, buffer)));
}
// All added keys must match
for (int i = 0; i < length; i++) {
ASSERT_TRUE(bloom.MayContain(Key(i, buffer)))
<< "Length " << length << "; key " << i;
}
// Check false positive rate
int result = 0;
for (int i = 0; i < 10000; i++) {
if (bloom.MayContain(Key(i + 1000000000, buffer))) {
result++;
for (uint32_t cl_per_block = 0; cl_per_block < FLAGS_num_probes;
++cl_per_block) {
for (uint32_t num = 1; num <= 10000; num = NextNum(num)) {
uint32_t bloom_bits = 0;
if (cl_per_block == 0) {
bloom_bits = std::max(num * FLAGS_bits_per_key, 64U);
} else {
bloom_bits = std::max(num * FLAGS_bits_per_key,
cl_per_block * CACHE_LINE_SIZE * 8);
}
DynamicBloom bloom(bloom_bits, cl_per_block, FLAGS_num_probes);
for (uint64_t i = 0; i < num; i++) {
bloom.Add(Key(i, buffer));
ASSERT_TRUE(bloom.MayContain(Key(i, buffer)));
}
// All added keys must match
for (uint64_t i = 0; i < num; i++) {
ASSERT_TRUE(bloom.MayContain(Key(i, buffer)))
<< "Num " << num << "; key " << i;
}
// Check false positive rate
int result = 0;
for (uint64_t i = 0; i < 10000; i++) {
if (bloom.MayContain(Key(i + 1000000000, buffer))) {
result++;
}
}
double rate = result / 10000.0;
fprintf(stderr, "False positives: %5.2f%% @ num = %6u, bloom_bits = %6u, "
"cl per block = %u\n", rate*100.0, num, bloom_bits, cl_per_block);
if (rate > 0.0125)
mediocre_filters++; // Allowed, but not too often
else
good_filters++;
}
double rate = result / 10000.0;
fprintf(stderr, "False positives: %5.2f%% @ length = %6d ; \n",
rate*100.0, length);
//ASSERT_LE(rate, 0.02); // Must not be over 2%
if (rate > 0.0125)
mediocre_filters++; // Allowed, but not too often
else
good_filters++;
fprintf(stderr, "Filters: %d good, %d mediocre\n",
good_filters, mediocre_filters);
ASSERT_LE(mediocre_filters, good_filters/5);
}
fprintf(stderr, "Filters: %d good, %d mediocre\n",
good_filters, mediocre_filters);
ASSERT_LE(mediocre_filters, good_filters/5);
}
// Different bits-per-byte
TEST(DynamicBloomTest, perf) {
StopWatchNano timer(Env::Default());
if (!FLAGS_enable_perf) {
return;
}
for (uint64_t m = 1; m <= 8; ++m) {
const uint64_t num_keys = m * 8 * 1024 * 1024;
fprintf(stderr, "testing %luM keys\n", m * 8);
DynamicBloom std_bloom(num_keys * 10, 0, FLAGS_num_probes);
timer.Start();
for (uint64_t i = 1; i <= num_keys; ++i) {
std_bloom.Add(Slice(reinterpret_cast<const char*>(&i), 8));
}
uint64_t elapsed = timer.ElapsedNanos();
fprintf(stderr, "standard bloom, avg add latency %lu\n",
elapsed / num_keys);
uint64_t count = 0;
timer.Start();
for (uint64_t i = 1; i <= num_keys; ++i) {
if (std_bloom.MayContain(Slice(reinterpret_cast<const char*>(&i), 8))) {
++count;
}
}
elapsed = timer.ElapsedNanos();
fprintf(stderr, "standard bloom, avg query latency %lu\n",
elapsed / count);
ASSERT_TRUE(count == num_keys);
for (int cl_per_block = 1; cl_per_block <= FLAGS_num_probes;
++cl_per_block) {
DynamicBloom blocked_bloom(num_keys * 10, cl_per_block, FLAGS_num_probes);
timer.Start();
for (uint64_t i = 1; i <= num_keys; ++i) {
blocked_bloom.Add(Slice(reinterpret_cast<const char*>(&i), 8));
}
uint64_t elapsed = timer.ElapsedNanos();
fprintf(stderr, "blocked bloom(%d), avg add latency %lu\n",
cl_per_block, elapsed / num_keys);
uint64_t count = 0;
timer.Start();
for (uint64_t i = 1; i <= num_keys; ++i) {
if (blocked_bloom.MayContain(
Slice(reinterpret_cast<const char*>(&i), 8))) {
++count;
}
}
elapsed = timer.ElapsedNanos();
fprintf(stderr, "blocked bloom(%d), avg query latency %lu\n",
cl_per_block, elapsed / count);
ASSERT_TRUE(count == num_keys);
}
}
}
} // namespace rocksdb

View File

@@ -1363,7 +1363,10 @@ class PosixEnv : public Env {
EnvOptions OptimizeForLogWrite(const EnvOptions& env_options) const {
EnvOptions optimized = env_options;
optimized.use_mmap_writes = false;
optimized.fallocate_with_keep_size = false;
// TODO(icanadi) it's faster if fallocate_with_keep_size is false, but it
// breaks TransactionLogIteratorStallAtLastRecord unit test. Fix the unit
// test and make this false
optimized.fallocate_with_keep_size = true;
return optimized;
}

View File

@@ -481,9 +481,9 @@ class TestLogger : public Logger {
if (new_format[0] == '[') {
// "[DEBUG] "
ASSERT_TRUE(n <= 56 + (512 - sizeof(struct timeval)));
ASSERT_TRUE(n <= 56 + (512 - static_cast<int>(sizeof(struct timeval))));
} else {
ASSERT_TRUE(n <= 48 + (512 - sizeof(struct timeval)));
ASSERT_TRUE(n <= 48 + (512 - static_cast<int>(sizeof(struct timeval))));
}
va_end(backup_ap);
}

View File

@@ -33,7 +33,7 @@ ColumnFamilyOptions::ColumnFamilyOptions()
compaction_filter_factory(std::shared_ptr<CompactionFilterFactory>(
new DefaultCompactionFilterFactory())),
compaction_filter_factory_v2(
new DefaultCompactionFilterFactoryV2(NewFixedPrefixTransform(8))),
new DefaultCompactionFilterFactoryV2()),
write_buffer_size(4 << 20),
max_write_buffer_number(2),
min_write_buffer_number_to_merge(1),
@@ -47,8 +47,8 @@ ColumnFamilyOptions::ColumnFamilyOptions()
whole_key_filtering(true),
num_levels(7),
level0_file_num_compaction_trigger(4),
level0_slowdown_writes_trigger(8),
level0_stop_writes_trigger(12),
level0_slowdown_writes_trigger(20),
level0_stop_writes_trigger(24),
max_mem_compaction_level(2),
target_file_size_base(2 * 1048576),
target_file_size_multiplier(1),
@@ -58,7 +58,7 @@ ColumnFamilyOptions::ColumnFamilyOptions()
expanded_compaction_factor(25),
source_compaction_factor(1),
max_grandparent_overlap_factor(10),
disable_seek_compaction(false),
disable_seek_compaction(true),
soft_rate_limit(0.0),
hard_rate_limit(0.0),
rate_limit_delay_max_milliseconds(1000),
@@ -151,11 +151,11 @@ ColumnFamilyOptions::ColumnFamilyOptions(const Options& options)
DBOptions::DBOptions()
: create_if_missing(false),
error_if_exists(false),
paranoid_checks(false),
paranoid_checks(true),
env(Env::Default()),
info_log(nullptr),
info_log_level(INFO),
max_open_files(1000),
max_open_files(5000),
statistics(nullptr),
disableDataSync(false),
use_fsync(false),
@@ -176,7 +176,7 @@ DBOptions::DBOptions()
manifest_preallocation_size(4 * 1024 * 1024),
allow_os_buffer(true),
allow_mmap_reads(false),
allow_mmap_writes(true),
allow_mmap_writes(false),
is_fd_close_on_exec(true),
skip_log_error_on_recovery(false),
stats_dump_period_sec(3600),