Files
xahaud/table/block_based_table_builder.cc
Vinnie Falco c168d54495 Squashed 'src/rocksdb2/' changes from 25888ae..1fdd726
1fdd726 Hotfix RocksDB 3.5
d67500a Add `make install` to Makefile in 3.5.fb.
4cb631a update HISTORY.md
cfd0946 comments about the BlockBasedTableOptions migration in Options
REVERT: 25888ae Merge pull request #329 from fyrz/master
REVERT: 89833e5 Fixed signed-unsigned comparison warning in db_test.cc
REVERT: fcac705 Fixed compile warning on Mac caused by unused variables.
REVERT: b3343fd resolution for java build problem introduced by 5ec53f3edf62bec1b690ce12fb21a6c52203f3c8
REVERT: 187b299 ForwardIterator: update prev_key_ only if prefix hasn't changed
REVERT: 5ec53f3 make compaction related options changeable
REVERT: d122e7b Update INSTALL.md
REVERT: 986dad0 Merge pull request #324 from dalgaaf/wip-da-SCA-20140930
REVERT: 8ee75dc db/memtable.cc: remove unused variable merge_result
REVERT: 0fd8bbc db/db_impl.cc: reduce scope of prefix_initialized
REVERT: 676ff7b compaction_picker.cc: remove check for >=0 for unsigned
REVERT: e55aea5 document_db.cc: fix assert
REVERT: d517c83 in_table_factory.cc: use correct format specifier
REVERT: b140375 ttl/ttl_test.cc: prefer prefix ++operator for non-primitive types
REVERT: 43c789c spatialdb/spatial_db.cc: use !empty() instead of 'size() > 0'
REVERT: 0de452e document_db.cc: pass const parameter by reference
REVERT: 4cc8643 util/ldb_cmd.cc: prefer prefix ++operator for non-primitive types
REVERT: af8c2b2 util/signal_test.cc: suppress intentional null pointer deref
REVERT: 33580fa db/db_impl.cc: fix object handling, remove double lines
REVERT: 873f135 db_ttl_impl.h: pass func parameter by reference
REVERT: 8558457 ldb_cmd_execute_result.h: perform init in initialization list
REVERT: 063471b table/table_test.cc: pass func parameter by reference
REVERT: 93548ce table/cuckoo_table_reader.cc: pass func parameter by ref
REVERT: b8b7117 db/version_set.cc: use !empty() instead of 'size() > 0'
REVERT: 8ce050b table/bloom_block.*: pass func parameter by reference
REVERT: 53910dd db_test.cc: pass parameter by reference
REVERT: 68ca534 corruption_test.cc: pass parameter by reference
REVERT: 7506198 cuckoo_table_db_test.cc: add flush after delete
REVERT: 1f96330 Print MB per second compaction throughput separately for reads and writes
REVERT: ffe3d49 Add an instruction about SSE in INSTALL.md
REVERT: ee1f3cc Package generation for Ubuntu and CentOS
REVERT: f0f7955 Fixing comile errors on OS X
REVERT: 99fb613 remove 2 space linter
REVERT: b2d64a4 Fix linters, second try
REVERT: 747523d Print per column family metrics in db_bench
REVERT: 56ebd40 Fix arc lint (should fix #238)
REVERT: 637f891 Merge pull request #321 from eonnen/master
REVERT: 827e31c Make test use a compatible type in the size checks.
REVERT: fd5d80d CompactedDB: log using the correct info_log
REVERT: 2faf49d use GetContext to replace callback function pointer
REVERT: 983d2de Add AUTHORS file. Fix #203
REVERT: abd70c5 Merge pull request #316 from fyrz/ReverseBytewiseComparator
REVERT: 2dc6f62 handle kDelete type in cuckoo builder
REVERT: 8b8011a Changed name of ReverseBytewiseComparator based on review comment
REVERT: 389edb6 universal compaction picker: use double for potential overflow
REVERT: 5340484 Built-in comparator(s) in RocksJava
REVERT: d439451 delay initialization of cuckoo table iterator
REVERT: 94997ea reduce memory usage of cuckoo table builder
REVERT: c627595 improve memory efficiency of cuckoo reader
REVERT: 581442d option to choose module when calculating CuckooTable hash
REVERT: fbd2daf CompactedDBImpl::MultiGet() for better CuckooTable performance
REVERT: 3c68006 CompactedDBImpl
REVERT: f7375f3 Fix double deletes
REVERT: 21ddcf6 Remove allow_thread_local
REVERT: fb4a492 Merge pull request #311 from ankgup87/master
REVERT: 611e286 Merge branch 'master' of https://github.com/facebook/rocksdb
REVERT: 0103b44 Merge branch 'master' of ssh://github.com/ankgup87/rocksdb
REVERT: 1dfb7bb Add block based table config options
REVERT: cdaf44f Enlarge log size cap when printing file summary
REVERT: 7cc1ed7 Merge pull request #309 from naveenatceg/staticbuild
REVERT: ba6d660 Resolving merge conflict
REVERT: 51eeaf6 Addressing review comments
REVERT: fd7d3fe Addressing review comments (adding a env variable to override temp directory)
REVERT: cf7ace8 Addressing review comments
REVERT: 0a29ce5 re-enable BlockBasedTable::SetupForCompaction()
REVERT: 55af370 Remove TODO for checking index checksums
REVERT: 3d74f09 Fix compile
REVERT: 53b0039 Fix release compile
REVERT: d0de413 WriteBatchWithIndex to allow different Comparators for different column families
REVERT: 57a32f1 change target_file_size_base to uint64_t
REVERT: 5e6aee4 dont create backup_input if compaction filter v2 is not used
REVERT: 49b5f94 Merge pull request #306 from Liuchang0812/fix_cast
REVERT: 787cb4d remove cast, replace %llu with % PRIu64
REVERT: a7574d4 Update logging.cc
REVERT: 7e0dcb9 Update logging.cc
REVERT: 57fa3cc Merge pull request #304 from Liuchang0812/fix-check
REVERT: cd44522 Merge pull request #305 from Liuchang0812/fix-logging
REVERT: 6a031b6 remove unused variable
REVERT: 4436f17 fixed #303: replace %ld with % PRId64
REVERT: 7a1bd05 Merge pull request #302 from ankgup87/master
REVERT: 423e52c Merge branch 'master' of https://github.com/facebook/rocksdb
REVERT: bfeef94 Add rate limiter
REVERT: 32f2532 Print compression_size_percent as a signed int
REVERT: 976caca Skip AllocateTest if fallocate() is not supported in the file system
REVERT: 3b897cd Enable no-fbcode RocksDB build
REVERT: f445947 RocksDB: Format uint64 using PRIu64 in db_impl.cc
REVERT: e17bc65 Merge pull request #299 from ankgup87/master
REVERT: b93797a Fix build
REVERT: adae3ca [Java] Fix JNI link error caused by the removal of options.db_stats_log_interval
REVERT: 90b8c07 Fix unit tests errors
REVERT: 51af7c3 CuckooTable: add one option to allow identity function for the first hash function
REVERT: 0350435 Fixed a signed-unsigned comparison in spatial_db.cc -- issue #293
REVERT: 2fb1fea Fix syncronization issues
REVERT: ff76895 Remove some unnecessary constructors
REVERT: feadb9d fix cuckoo table builder test
REVERT: 3c232e1 Fix mac compile
REVERT: 54cada9 Run make format on PR #249
REVERT: 27b22f1 Merge pull request #249 from tdfischer/decompression-refactoring
REVERT: fb6456b Replace naked calls to operator new and delete (Fixes #222)
REVERT: 5600c8f cuckoo table: return estimated size - 1
REVERT: a062e1f SetOptions() for memtable related options
REVERT: e4eca6a Options conversion function for convenience
REVERT: a7c2094 Merge pull request #292 from saghmrossi/master
REVERT: 4d05234 Merge branch 'master' of github.com:saghmrossi/rocksdb
REVERT: 60a4aa1 Test use_mmap_reads
REVERT: 94e43a1 [Java] Fixed 32-bit overflowing issue when converting jlong to size_t
REVERT: f9eaaa6 added include for inttypes.h to fix nonworking printf statements
REVERT: f090575 Replaced "built on on earlier work" by "built on earlier work" in README.md
REVERT: faad439 Fix #284
REVERT: 49aacd8 Fix make install
REVERT: acb9348 [Java] Include WriteBatch into RocksDBSample.java, fix how DbBenchmark.java handles WriteBatch.
REVERT: 4a27a2f Don't sync manifest when disableDataSync = true
REVERT: 9b8480d Merge pull request #287 from yinqiwen/rate-limiter-crash-fix
REVERT: 28be16b fix rate limiter crash #286
REVERT: 04ce1b2 Fix #284
REVERT: add22e3 standardize scripts to run RocksDB benchmarks
REVERT: dee91c2 WriteThread
REVERT: 540a257 Fix WAL synced
REVERT: 24f034b Merge pull request #282 from Chilledheart/develop
REVERT: 49fe329 Fix build issue under macosx
REVERT: ebb5c65 Add make install
REVERT: 0352a9f add_wrapped_bloom_test
REVERT: 9c0e66c Don't run background jobs (flush, compactions) when bg_error_ is set
REVERT: a9639bd Fix valgrind test
REVERT: d1f24dc Relax FlushSchedule test
REVERT: 3d9e6f7 Push model for flushing memtables
REVERT: 059e584 [unit test] CompactRange should fail if we don't have space
REVERT: dd641b2 fix RocksDB java build
REVERT: 53404d9 add_qps_info_in cache bench
REVERT: a52cecb Fix Mac compile
REVERT: 092f97e Fix comments and typos
REVERT: 6cc1286 Added a few statistics for BackupableDB
REVERT: 0a42295 Fix SimpleWriteTimeoutTest
REVERT: 06d9862 Always pass MergeContext as pointer, not reference
REVERT: d343c3f Improve db recovery
REVERT: 6bb7e3e Merger test
REVERT: 88841bd Explicitly cast char to signed char in Hash()
REVERT: 5231146 MemTableOptions
REVERT: 1d284db Addressing review comments
REVERT: 55114e7 Some updates for SpatialDB
REVERT: 171d4ff remove TailingIterator reference in db_impl.h
REVERT: 9b0f7ff rename version_set options_ to db_options_ to avoid confusion
REVERT: 2d57828 Check stop level trigger-0 before slowdown level-0 trigger
REVERT: 659d2d5 move compaction_filter to immutable_options
REVERT: 048560a reduce references to cfd->options() in DBImpl
REVERT: 011241b DB::Flush() Do not wait for background threads when there is nothing in mem table
REVERT: a2bb7c3 Push- instead of pull-model for managing Write stalls
REVERT: 0af157f Implement full filter for block based table.
REVERT: 9360cc6 Fix valgrind issue
REVERT: 02d5bff Merge pull request #277 from wankai/master
REVERT: 88a2f44 fix comments
REVERT: 7c16e39 Merge pull request #276 from wankai/master
REVERT: 8237738 replace hard-coded number with named variable
REVERT: db8ca52 Merge pull request #273 from nbougalis/static-analysis
REVERT: b7b031f Merge pull request #274 from wankai/master
REVERT: 4c2b1f0 Merge remote-tracking branch 'upstream/master'
REVERT: a5d2863 typo improvement
REVERT: 9f8aa09 Don't leak data returned by opendir
REVERT: d1cfb71 Remove unused member(s)
REVERT: bfee319 sizeof(int*) where sizeof(int) was intended
REVERT: d40c1f7 Add missing break statement
REVERT: 2e97c38 Avoid off-by-one error when using readlink
REVERT: 40ddc3d add cache bench
REVERT: 9f1c80b Drop column family from write thread
REVERT: 8de151b Add db_bench with lots of column families to regression tests
REVERT: c9e419c rename options_ to db_options_ in DBImpl to avoid confusion
REVERT: 5cd0576 Fix compaction bug in Cuckoo Table Builder. Use kvs_.size() instead of num_entries in FileSize() method.
REVERT: 0fbb3fa fixed memory leak in unit test DBIteratorBoundTest
REVERT: adcd253 fix asan check
REVERT: 4092b7a Merge pull request #272 from project-zerus/patch-1
REVERT: bb6ae0f fix more compile warnings
REVERT: 6d31441 Merge pull request #271 from nbougalis/cleanups
REVERT: 0cd0ec4 Plug memory leak during index creation
REVERT: 4329d74 Fix swapped variable names to accurately reflect usage
REVERT: 45a5e3e Remove path with arena==nullptr from NewInternalIterator
REVERT: 5665e5e introduce ImmutableOptions
REVERT: e0b99d4 created a new ReadOptions parameter 'iterate_upper_bound'
REVERT: 51ea889 Fix travis builds
REVERT: a481626 Relax backupable rate limiting test
REVERT: f7f973d Merge pull request #269 from huahang/patch-2
REVERT: ef5b384 fix a few compile warnings
REVERT: 2fd3806 Merge pull request #263 from wankai/master
REVERT: 1785114 delete unused Comparator
REVERT: 1b1d961 update HISTORY.md
REVERT: 703c3ea comments about the BlockBasedTableOptions migration in Options
REVERT: 4b5ad88 Merge pull request #260 from wankai/master
REVERT: 19cc588 change to filter_block std::unique_ptr support RAII
REVERT: 9b976e3 Merge pull request #259 from wankai/master
REVERT: 5d25a46 Merge remote-tracking branch 'upstream/master'
REVERT: dff2b1a typo improvement
REVERT: 343e98a Reverting import change
REVERT: ddb8039 RocksDB static build Make file changes to download and build the dependencies .Load the shared library when RocksDB is initialized

git-subtree-dir: src/rocksdb2
git-subtree-split: 1fdd726a8254c13d0c66d8db8130ad17c13d7bcc
2014-10-27 11:36:32 -07:00

810 lines
29 KiB
C++

// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "table/block_based_table_builder.h"
#include <assert.h>
#include <inttypes.h>
#include <stdio.h>
#include <map>
#include <memory>
#include <string>
#include <unordered_map>
#include "db/dbformat.h"
#include "rocksdb/cache.h"
#include "rocksdb/comparator.h"
#include "rocksdb/env.h"
#include "rocksdb/filter_policy.h"
#include "rocksdb/flush_block_policy.h"
#include "rocksdb/options.h"
#include "rocksdb/table.h"
#include "table/block.h"
#include "table/block_based_table_reader.h"
#include "table/block_builder.h"
#include "table/filter_block.h"
#include "table/format.h"
#include "table/meta_blocks.h"
#include "table/table_builder.h"
#include "util/coding.h"
#include "util/crc32c.h"
#include "util/stop_watch.h"
#include "util/xxhash.h"
namespace rocksdb {
// Names under which HashIndexBuilder publishes its two meta blocks (the
// prefixes themselves and the per-prefix metadata). Only declared here; the
// definitions are not in this part of the file.
extern const std::string kHashIndexPrefixesBlock;
extern const std::string kHashIndexPrefixesMetadataBlock;

// Shorthand for the index-type selector consumed by CreateIndexBuilder().
using IndexType = BlockBasedTableOptions::IndexType;
// Interface for building the index of a table.
//
// How to add a new concrete IndexBuilder:
//  1. Derive a subclass from IndexBuilder.
//  2. Add an entry for that subclass to BlockBasedTableOptions::IndexType.
//  3. Add a creation branch for the new subclass to CreateIndexBuilder().
// Note: a more advanced design could make adding subclasses easier, but it
// would also make the code more complex. Since indexes are rarely added or
// changed, a straightforward design that just works is preferred.
class IndexBuilder {
 public:
  // What an index builder produces:
  //  1. One primary index block.
  //  2. (Optional) a set of meta blocks, keyed by name, that contain the
  //     metadata of the primary index.
  struct IndexBlocks {
    Slice index_block_contents;
    std::unordered_map<std::string, Slice> meta_blocks;
  };

  // `comparator` is stored as a raw pointer and made available to subclasses
  // through `comparator_`.
  explicit IndexBuilder(const Comparator* comparator)
      : comparator_(comparator) {}

  virtual ~IndexBuilder() = default;

  // Adds a new index entry to the index block.
  // To allow further optimization, both `last_key_in_current_block` and
  // `first_key_in_next_block` are provided, so that the implementation can
  // pick the best index key for the entry.
  // @last_key_in_current_block: may be overwritten with a "substitute key".
  // @first_key_in_next_block: nullptr if the entry being added is the last
  //                           one in the table.
  // @block_handle: handle of the data block the entry points to.
  //
  // REQUIRES: Finish() has not yet been called.
  virtual void AddIndexEntry(std::string* last_key_in_current_block,
                             const Slice* first_key_in_next_block,
                             const BlockHandle& block_handle) = 0;

  // Called whenever a key is added. The default does nothing; subclasses may
  // override it if they need to collect additional information.
  virtual void OnKeyAdded(const Slice& /*key*/) {}

  // Informs the index builder that all entries have been written, so it may
  // perform any operation required for block finalization. The finished
  // blocks are reported through `index_blocks`.
  //
  // REQUIRES: Finish() has not yet been called.
  virtual Status Finish(IndexBlocks* index_blocks) = 0;

  // Returns the estimated size of the index block.
  virtual size_t EstimatedSize() const = 0;

 protected:
  const Comparator* comparator_;
};
// This index builder builds space-efficient index block.
//
// Optimizations:
// 1. Made block's `block_restart_interval` to be 1, which will avoid linear
// search when doing index lookup.
// 2. Shorten the key length for index block. Other than honestly using the
// last key in the data block as the index key, we instead find a shortest
// substitute key that serves the same function.
class ShortenedIndexBuilder : public IndexBuilder {
public:
explicit ShortenedIndexBuilder(const Comparator* comparator)
: IndexBuilder(comparator),
index_block_builder_(1 /* block_restart_interval == 1 */) {}
virtual void AddIndexEntry(std::string* last_key_in_current_block,
const Slice* first_key_in_next_block,
const BlockHandle& block_handle) override {
if (first_key_in_next_block != nullptr) {
comparator_->FindShortestSeparator(last_key_in_current_block,
*first_key_in_next_block);
} else {
comparator_->FindShortSuccessor(last_key_in_current_block);
}
std::string handle_encoding;
block_handle.EncodeTo(&handle_encoding);
index_block_builder_.Add(*last_key_in_current_block, handle_encoding);
}
virtual Status Finish(IndexBlocks* index_blocks) {
index_blocks->index_block_contents = index_block_builder_.Finish();
return Status::OK();
}
virtual size_t EstimatedSize() const {
return index_block_builder_.CurrentSizeEstimate();
}
private:
BlockBuilder index_block_builder_;
};
// HashIndexBuilder contains a binary-searchable primary index and the
// metadata for secondary hash index construction.
// The metadata for hash index consists two parts:
// - a metablock that compactly contains a sequence of prefixes. All prefixes
// are stored consectively without any metadata (like, prefix sizes) being
// stored, which is kept in the other metablock.
// - a metablock contains the metadata of the prefixes, including prefix size,
// restart index and number of block it spans. The format looks like:
//
// +-----------------+---------------------------+---------------------+ <=prefix 1
// | length: 4 bytes | restart interval: 4 bytes | num-blocks: 4 bytes |
// +-----------------+---------------------------+---------------------+ <=prefix 2
// | length: 4 bytes | restart interval: 4 bytes | num-blocks: 4 bytes |
// +-----------------+---------------------------+---------------------+
// | |
// | .... |
// | |
// +-----------------+---------------------------+---------------------+ <=prefix n
// | length: 4 bytes | restart interval: 4 bytes | num-blocks: 4 bytes |
// +-----------------+---------------------------+---------------------+
//
// The reason of separating these two metablocks is to enable the efficiently
// reuse the first metablock during hash index construction without unnecessary
// data copy or small heap allocations for prefixes.
class HashIndexBuilder : public IndexBuilder {
public:
explicit HashIndexBuilder(const Comparator* comparator,
const SliceTransform* hash_key_extractor)
: IndexBuilder(comparator),
primary_index_builder(comparator),
hash_key_extractor_(hash_key_extractor) {}
virtual void AddIndexEntry(std::string* last_key_in_current_block,
const Slice* first_key_in_next_block,
const BlockHandle& block_handle) override {
++current_restart_index_;
primary_index_builder.AddIndexEntry(last_key_in_current_block,
first_key_in_next_block, block_handle);
}
virtual void OnKeyAdded(const Slice& key) override {
auto key_prefix = hash_key_extractor_->Transform(key);
bool is_first_entry = pending_block_num_ == 0;
// Keys may share the prefix
if (is_first_entry || pending_entry_prefix_ != key_prefix) {
if (!is_first_entry) {
FlushPendingPrefix();
}
// need a hard copy otherwise the underlying data changes all the time.
// TODO(kailiu) ToString() is expensive. We may speed up can avoid data
// copy.
pending_entry_prefix_ = key_prefix.ToString();
pending_block_num_ = 1;
pending_entry_index_ = current_restart_index_;
} else {
// entry number increments when keys share the prefix reside in
// differnt data blocks.
auto last_restart_index = pending_entry_index_ + pending_block_num_ - 1;
assert(last_restart_index <= current_restart_index_);
if (last_restart_index != current_restart_index_) {
++pending_block_num_;
}
}
}
virtual Status Finish(IndexBlocks* index_blocks) {
FlushPendingPrefix();
primary_index_builder.Finish(index_blocks);
index_blocks->meta_blocks.insert(
{kHashIndexPrefixesBlock.c_str(), prefix_block_});
index_blocks->meta_blocks.insert(
{kHashIndexPrefixesMetadataBlock.c_str(), prefix_meta_block_});
return Status::OK();
}
virtual size_t EstimatedSize() const {
return primary_index_builder.EstimatedSize() + prefix_block_.size() +
prefix_meta_block_.size();
}
private:
void FlushPendingPrefix() {
prefix_block_.append(pending_entry_prefix_.data(),
pending_entry_prefix_.size());
PutVarint32(&prefix_meta_block_, pending_entry_prefix_.size());
PutVarint32(&prefix_meta_block_, pending_entry_index_);
PutVarint32(&prefix_meta_block_, pending_block_num_);
}
ShortenedIndexBuilder primary_index_builder;
const SliceTransform* hash_key_extractor_;
// stores a sequence of prefixes
std::string prefix_block_;
// stores the metadata of prefixes
std::string prefix_meta_block_;
// The following 3 variables keeps unflushed prefix and its metadata.
// The details of block_num and entry_index can be found in
// "block_hash_index.{h,cc}"
uint32_t pending_block_num_ = 0;
uint32_t pending_entry_index_ = 0;
std::string pending_entry_prefix_;
uint64_t current_restart_index_ = 0;
};
// Creates an index builder for the given index type.
//
// @type: selects the concrete builder.
// @comparator: passed on to the created builder.
// @prefix_extractor: only used by the hash-search builder.
//
// Returns a builder allocated with `new` (the caller is responsible for
// deleting it), or nullptr if `type` is not recognized. An unrecognized type
// additionally trips an assertion in debug builds.
IndexBuilder* CreateIndexBuilder(IndexType type, const Comparator* comparator,
                                 const SliceTransform* prefix_extractor) {
  switch (type) {
    case BlockBasedTableOptions::kBinarySearch:
      return new ShortenedIndexBuilder(comparator);
    case BlockBasedTableOptions::kHashSearch:
      return new HashIndexBuilder(comparator, prefix_extractor);
    default:
      break;
  }
  // Only reached for an index type without a builder.
  assert(!"Do not recognize the index type ");
  return nullptr;
}
// Returns true when compression paid off, i.e. when `compressed_size` is
// strictly smaller than `raw_size` minus one eighth of `raw_size` (integer
// division), which means roughly 12.5% or more was saved.
bool GoodCompressionRatio(size_t compressed_size, size_t raw_size) {
  const size_t required_savings = raw_size / 8u;
  const size_t size_limit = raw_size - required_savings;
  return compressed_size < size_limit;
}
// Compresses `raw` with the method requested in `*type`.
//
// Returns the compressed contents (stored in `*compressed_output`) if (1) the
// compression method is supported on this platform and (2) the compression
// ratio is "good enough". Otherwise falls back to the uncompressed form:
// `*type` is set to kNoCompression and `raw` itself is returned.
Slice CompressBlock(const Slice& raw,
                    const CompressionOptions& compression_options,
                    CompressionType* type, std::string* compressed_output) {
  if (*type == kNoCompression) {
    return raw;
  }

  // Run the requested compressor; `compressed` stays false for a method that
  // is not recognized or that reports failure.
  bool compressed = false;
  switch (*type) {
    case kSnappyCompression:
      compressed = port::Snappy_Compress(compression_options, raw.data(),
                                         raw.size(), compressed_output);
      break;
    case kZlibCompression:
      compressed = port::Zlib_Compress(compression_options, raw.data(),
                                       raw.size(), compressed_output);
      break;
    case kBZip2Compression:
      compressed = port::BZip2_Compress(compression_options, raw.data(),
                                        raw.size(), compressed_output);
      break;
    case kLZ4Compression:
      compressed = port::LZ4_Compress(compression_options, raw.data(),
                                      raw.size(), compressed_output);
      break;
    case kLZ4HCCompression:
      compressed = port::LZ4HC_Compress(compression_options, raw.data(),
                                        raw.size(), compressed_output);
      break;
    default:
      break;  // Do not recognize this compression type
  }

  if (compressed &&
      GoodCompressionRatio(compressed_output->size(), raw.size())) {
    return *compressed_output;
  }

  // Compression method is not supported, or not good compression ratio, so
  // just fall back to uncompressed form.
  *type = kNoCompression;
  return raw;
}
// kBlockBasedTableMagicNumber was picked by running
// echo rocksdb.table.block_based | sha1sum
// and taking the leading 64 bits.
// Please note that kBlockBasedTableMagicNumber may also be accessed by
// other .cc files, so it has to be explicitly declared with "extern"
// (a namespace-scope const object would otherwise get internal linkage).
extern const uint64_t kBlockBasedTableMagicNumber = 0x88e241b785f4cff7ull;
// We also support reading and writing the legacy block based table format
// (for backwards compatibility). "extern" is needed here for the same reason.
extern const uint64_t kLegacyBlockBasedTableMagicNumber = 0xdb4775248b80fb57ull;
// Collects the properties that are specific to block-based tables.
// The class may look heavy-weight for now, since it only writes a single
// additional property (the index type), but more block-based-table-specific
// properties are expected to be added in the foreseeable future.
class BlockBasedTableBuilder::BlockBasedTablePropertiesCollector
    : public TablePropertiesCollector {
 public:
  explicit BlockBasedTablePropertiesCollector(
      BlockBasedTableOptions::IndexType index_type)
      : index_type_(index_type) {}

  // Intentionally a no-op: there is no interest in collecting stats for
  // individual key/value pairs.
  virtual Status Add(const Slice& /*key*/, const Slice& /*value*/) {
    return Status::OK();
  }

  // Stores the index type, encoded as a fixed 32-bit value, under
  // BlockBasedTablePropertyNames::kIndexType.
  virtual Status Finish(UserCollectedProperties* properties) {
    std::string encoded_index_type;
    PutFixed32(&encoded_index_type, static_cast<uint32_t>(index_type_));
    properties->insert(
        {BlockBasedTablePropertyNames::kIndexType, encoded_index_type});
    return Status::OK();
  }

  // The name of the properties collector can be used for debugging purpose.
  virtual const char* Name() const {
    return "BlockBasedTablePropertiesCollector";
  }

  // Intentionally returns an empty set of readable properties.
  virtual UserCollectedProperties GetReadableProperties() const {
    return UserCollectedProperties();
  }

 private:
  BlockBasedTableOptions::IndexType index_type_;
};
// Private state of a BlockBasedTableBuilder.
//
// NOTE: members are initialized in declaration order, and the constructor's
// initializer list relies on that order (e.g. `table_options` and `options`
// are read while initializing later members, and `data_block` is passed to
// the flush block policy factory). Do not reorder members without checking
// the initializer list.
struct BlockBasedTableBuilder::Rep {
// Copies of the options the builder was created with.
const Options options;
const BlockBasedTableOptions table_options;
// Held by reference; the referenced comparator is not owned by Rep.
const InternalKeyComparator& internal_comparator;
// Destination file; not deleted by Rep or by the builder's destructor.
WritableFile* file;
// File offset at which the next block will be written; advanced by
// WriteRawBlock() after a block and its trailer were written successfully.
uint64_t offset = 0;
// Result of the most recent file or cache operation.
Status status;
// Builder for the data block currently being filled.
BlockBuilder data_block;
// Wraps options.prefix_extractor; handed to the index builder.
InternalKeySliceTransform internal_prefix_transform;
std::unique_ptr<IndexBuilder> index_builder;
// The most recently added key.
std::string last_key;
// Compression requested for data blocks.
CompressionType compression_type;
TableProperties props;
bool closed = false; // Either Finish() or Abandon() has been called.
// nullptr when table_options.filter_policy is not set. Raw owning pointer:
// it is deleted in ~BlockBasedTableBuilder(), not here.
FilterBlockBuilder* filter_block;
// Cache key prefix for the compressed block cache. Both fields are filled in
// by the BlockBasedTableBuilder constructor, and only when
// table_options.block_cache_compressed is set; otherwise the size is left
// uninitialized and must not be read.
char compressed_cache_key_prefix[BlockBasedTable::kMaxCacheKeyPrefixSize];
size_t compressed_cache_key_prefix_size;
BlockHandle pending_handle; // Handle to add to index block
// Scratch buffer that receives compressed block contents; cleared after each
// block is written.
std::string compressed_output;
std::unique_ptr<FlushBlockPolicy> flush_block_policy;
// One collector per configured factory, plus the block-based-table specific
// collector appended last by the constructor.
std::vector<std::unique_ptr<TablePropertiesCollector>>
table_properties_collectors;
// Copies the options, creates the index builder, the optional filter block
// builder and the flush block policy, and instantiates all table properties
// collectors.
Rep(const Options& opt, const BlockBasedTableOptions& table_opt,
const InternalKeyComparator& icomparator,
WritableFile* f, CompressionType compression_type)
: options(opt),
table_options(table_opt),
internal_comparator(icomparator),
file(f),
data_block(table_options.block_restart_interval),
internal_prefix_transform(options.prefix_extractor.get()),
index_builder(CreateIndexBuilder(
table_options.index_type, &internal_comparator,
&this->internal_prefix_transform)),
// The parameter shadows the member here; the member is initialized from the
// parameter.
compression_type(compression_type),
filter_block(table_options.filter_policy == nullptr ?
nullptr :
new FilterBlockBuilder(opt, table_options, &internal_comparator)),
flush_block_policy(
table_options.flush_block_policy_factory->NewFlushBlockPolicy(
table_options, data_block)) {
// User-configured collectors come first.
for (auto& collector_factories :
options.table_properties_collector_factories) {
table_properties_collectors.emplace_back(
collector_factories->CreateTablePropertiesCollector());
}
// Always record the index type as a table property.
table_properties_collectors.emplace_back(
new BlockBasedTablePropertiesCollector(table_options.index_type));
}
};
// Creates a builder that writes a block-based table to `file`.
// Starts the first filter block (when a filter block builder exists) and,
// when a compressed block cache is configured, generates the cache key
// prefix used for blocks of this file.
BlockBasedTableBuilder::BlockBasedTableBuilder(
    const Options& options, const BlockBasedTableOptions& table_options,
    const InternalKeyComparator& internal_comparator, WritableFile* file,
    CompressionType compression_type)
    : rep_(new Rep(options, table_options, internal_comparator,
                   file, compression_type)) {
  if (FilterBlockBuilder* filter = rep_->filter_block) {
    filter->StartBlock(0);
  }

  auto* const compressed_cache = table_options.block_cache_compressed.get();
  if (compressed_cache != nullptr) {
    BlockBasedTable::GenerateCachePrefix(
        compressed_cache, file, &rep_->compressed_cache_key_prefix[0],
        &rep_->compressed_cache_key_prefix_size);
  }
}
// Releases the filter block builder and the internal state.
// REQUIRES: the builder has been closed (Rep::closed is set).
BlockBasedTableBuilder::~BlockBasedTableBuilder() {
  Rep* const r = rep_;
  // Catch errors where caller forgot to call Finish()
  assert(r->closed);
  // The filter block builder is a raw pointer owned through Rep.
  delete r->filter_block;
  delete r;
}
void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) {
Rep* r = rep_;
assert(!r->closed);
if (!ok()) return;
if (r->props.num_entries > 0) {
assert(r->internal_comparator.Compare(key, Slice(r->last_key)) > 0);
}
auto should_flush = r->flush_block_policy->Update(key, value);
if (should_flush) {
assert(!r->data_block.empty());
Flush();
// Add item to index block.
// We do not emit the index entry for a block until we have seen the
// first key for the next data block. This allows us to use shorter
// keys in the index block. For example, consider a block boundary
// between the keys "the quick brown fox" and "the who". We can use
// "the r" as the key for the index block entry since it is >= all
// entries in the first block and < all entries in subsequent
// blocks.
if (ok()) {
r->index_builder->AddIndexEntry(&r->last_key, &key, r->pending_handle);
}
}
if (r->filter_block != nullptr) {
r->filter_block->AddKey(ExtractUserKey(key));
}
r->last_key.assign(key.data(), key.size());
r->data_block.Add(key, value);
r->props.num_entries++;
r->props.raw_key_size += key.size();
r->props.raw_value_size += value.size();
r->index_builder->OnKeyAdded(key);
NotifyCollectTableCollectorsOnAdd(key, value, r->table_properties_collectors,
r->options.info_log.get());
}
void BlockBasedTableBuilder::Flush() {
Rep* r = rep_;
assert(!r->closed);
if (!ok()) return;
if (r->data_block.empty()) return;
WriteBlock(&r->data_block, &r->pending_handle);
if (ok()) {
r->status = r->file->Flush();
}
if (r->filter_block != nullptr) {
r->filter_block->StartBlock(r->offset);
}
r->props.data_size = r->offset;
++r->props.num_data_blocks;
}
// Finishes `block`, writes its contents through the Slice overload of
// WriteBlock() and resets the block builder so it can be reused.
// The location of the written block is reported through `handle`.
void BlockBasedTableBuilder::WriteBlock(BlockBuilder* block,
                                        BlockHandle* handle) {
  const Slice finished_contents = block->Finish();
  WriteBlock(finished_contents, handle);
  block->Reset();
}
void BlockBasedTableBuilder::WriteBlock(const Slice& raw_block_contents,
BlockHandle* handle) {
// File format contains a sequence of blocks where each block has:
// block_data: uint8[n]
// type: uint8
// crc: uint32
assert(ok());
Rep* r = rep_;
auto type = r->compression_type;
Slice block_contents;
if (raw_block_contents.size() < kCompressionSizeLimit) {
block_contents =
CompressBlock(raw_block_contents, r->options.compression_opts, &type,
&r->compressed_output);
} else {
RecordTick(r->options.statistics.get(), NUMBER_BLOCK_NOT_COMPRESSED);
type = kNoCompression;
block_contents = raw_block_contents;
}
WriteRawBlock(block_contents, type, handle);
r->compressed_output.clear();
}
void BlockBasedTableBuilder::WriteRawBlock(const Slice& block_contents,
CompressionType type,
BlockHandle* handle) {
Rep* r = rep_;
StopWatch sw(r->options.env, r->options.statistics.get(),
WRITE_RAW_BLOCK_MICROS);
handle->set_offset(r->offset);
handle->set_size(block_contents.size());
r->status = r->file->Append(block_contents);
if (r->status.ok()) {
char trailer[kBlockTrailerSize];
trailer[0] = type;
char* trailer_without_type = trailer + 1;
switch (r->table_options.checksum) {
case kNoChecksum:
// we don't support no checksum yet
assert(false);
// intentional fallthrough in release binary
case kCRC32c: {
auto crc = crc32c::Value(block_contents.data(), block_contents.size());
crc = crc32c::Extend(crc, trailer, 1); // Extend to cover block type
EncodeFixed32(trailer_without_type, crc32c::Mask(crc));
break;
}
case kxxHash: {
void* xxh = XXH32_init(0);
XXH32_update(xxh, block_contents.data(), block_contents.size());
XXH32_update(xxh, trailer, 1); // Extend to cover block type
EncodeFixed32(trailer_without_type, XXH32_digest(xxh));
break;
}
}
r->status = r->file->Append(Slice(trailer, kBlockTrailerSize));
if (r->status.ok()) {
r->status = InsertBlockInCache(block_contents, type, handle);
}
if (r->status.ok()) {
r->offset += block_contents.size() + kBlockTrailerSize;
}
}
}
// Returns a copy of the builder's current status.
Status BlockBasedTableBuilder::status() const {
  const Rep* const rep = rep_;
  return rep->status;
}
// Cache deleter for entries inserted by InsertBlockInCache(): `value` is the
// Block that was handed to the cache. `key` is unused.
static void DeleteCachedBlock(const Slice& key, void* value) {
  delete static_cast<Block*>(value);
}
//
// Makes a copy of the block contents and inserts it into the compressed
// block cache.
//
// Nothing happens when the block is not compressed or when no compressed
// block cache is configured. The cache key is the file's cache key prefix
// followed by the varint64-encoded block offset. Always returns OK.
//
Status BlockBasedTableBuilder::InsertBlockInCache(const Slice& block_contents,
                                                  const CompressionType type,
                                                  const BlockHandle* handle) {
  Rep* r = rep_;
  Cache* block_cache_compressed = r->table_options.block_cache_compressed.get();
  if (type == kNoCompression || block_cache_compressed == nullptr) {
    return Status::OK();
  }

  // Make a new heap copy: the block contents followed by one byte holding
  // the compression type. The buffer is handed to the Block below
  // (heap_allocated is set).
  const size_t size = block_contents.size();
  char* ubuf = new char[size + 1];
  memcpy(ubuf, block_contents.data(), size);
  ubuf[size] = type;

  BlockContents results;
  results.data = Slice(ubuf, size);
  results.cachable = true;  // XXX
  results.heap_allocated = true;
  results.compression_type = type;

  Block* block = new Block(results);

  // Make the cache key by appending the file offset to the cache prefix id.
  // The key is assembled in a local buffer with room for the prefix plus a
  // full varint64 (at most 10 bytes, 7 payload bits per byte). Appending in
  // place to the prefix buffer could write past its end if the prefix fills
  // that buffer.
  // NOTE(review): relies on Cache::Insert() copying the key bytes.
  static const size_t kMaxOffsetVarintBytes = 10;
  char cache_key[sizeof(r->compressed_cache_key_prefix) +
                 kMaxOffsetVarintBytes];
  const size_t prefix_size = r->compressed_cache_key_prefix_size;
  assert(prefix_size <= sizeof(r->compressed_cache_key_prefix));
  memcpy(cache_key, r->compressed_cache_key_prefix, prefix_size);
  char* end = EncodeVarint64(cache_key + prefix_size, handle->offset());
  Slice key(cache_key, static_cast<size_t>(end - cache_key));

  // Insert into compressed block cache. The handle is released right away:
  // the builder does not keep a reference to the cached block.
  Cache::Handle* cache_handle = block_cache_compressed->Insert(
      key, block, block->size(), &DeleteCachedBlock);
  block_cache_compressed->Release(cache_handle);

  // Invalidate OS cache. Best effort: the result is deliberately ignored.
  r->file->InvalidateCache(r->offset, size);

  return Status::OK();
}
// Finishes building the table and marks the builder as closed.
//
// After calling Flush(), writes in order: the filter block (if a filter block
// builder exists), the meta blocks produced by the index builder, the
// properties block, the metaindex block, the index block and the footer.
// On success the resulting table properties are logged.
//
// Returns rep_->status, or the index builder's error when finishing the
// index fails. In the latter case the error is also stored in rep_->status
// (unless an earlier error is already recorded there), so that status()
// agrees with the value returned here.
Status BlockBasedTableBuilder::Finish() {
  Rep* r = rep_;
  // Sampled before Flush(); presumably Flush() resets the data block, so this
  // must be read first -- TODO confirm against Flush().
  bool empty_data_block = r->data_block.empty();
  Flush();
  assert(!r->closed);
  r->closed = true;

  BlockHandle filter_block_handle,
      metaindex_block_handle,
      index_block_handle;

  // Write filter block
  if (ok() && r->filter_block != nullptr) {
    auto filter_contents = r->filter_block->Finish();
    r->props.filter_size = filter_contents.size();
    WriteRawBlock(filter_contents, kNoCompression, &filter_block_handle);
  }

  // To make sure properties block is able to keep the accurate size of index
  // block, we will finish writing all index entries here and flush them
  // to storage after metaindex block is written.
  if (ok() && !empty_data_block) {
    r->index_builder->AddIndexEntry(
        &r->last_key, nullptr /* no next data block */, r->pending_handle);
  }

  IndexBuilder::IndexBlocks index_blocks;
  auto s = r->index_builder->Finish(&index_blocks);
  if (!s.ok()) {
    // Record the failure so that status(), which reports rep_->status, does
    // not claim success after a failed Finish(). An earlier error is kept.
    if (r->status.ok()) {
      r->status = s;
    }
    return s;
  }

  // Write meta blocks and metaindex block with the following order.
  //    1. [meta block: filter]
  //    2. [other meta blocks]
  //    3. [meta block: properties]
  //    4. [metaindex block]
  // write meta blocks
  MetaIndexBuilder meta_index_builder;
  for (const auto& item : index_blocks.meta_blocks) {
    BlockHandle block_handle;
    WriteBlock(item.second, &block_handle);
    meta_index_builder.Add(item.first, block_handle);
  }

  if (ok()) {
    if (r->filter_block != nullptr) {
      // Add mapping from "<filter_block_prefix>.Name" to location
      // of filter data.
      std::string key = BlockBasedTable::kFilterBlockPrefix;
      key.append(r->table_options.filter_policy->Name());
      meta_index_builder.Add(key, filter_block_handle);
    }

    // Write properties block.
    {
      PropertyBlockBuilder property_block_builder;
      std::vector<std::string> failed_user_prop_collectors;
      r->props.filter_policy_name = r->table_options.filter_policy != nullptr ?
          r->table_options.filter_policy->Name() : "";
      // The index block has not been written yet, so its size is taken from
      // the index builder's estimate plus the block trailer.
      r->props.index_size =
          r->index_builder->EstimatedSize() + kBlockTrailerSize;

      // Add basic properties
      property_block_builder.AddTableProperty(r->props);

      // Add user-collected properties
      NotifyCollectTableCollectorsOnFinish(r->table_properties_collectors,
                                           r->options.info_log.get(),
                                           &property_block_builder);

      BlockHandle properties_block_handle;
      WriteRawBlock(
          property_block_builder.Finish(),
          kNoCompression,
          &properties_block_handle
      );

      meta_index_builder.Add(kPropertiesBlock, properties_block_handle);
    }  // end of properties block writing
  }    // meta blocks

  // Write index block
  if (ok()) {
    // flush the meta index block
    WriteRawBlock(meta_index_builder.Finish(), kNoCompression,
                  &metaindex_block_handle);
    WriteBlock(index_blocks.index_block_contents, &index_block_handle);
  }

  // Write footer
  if (ok()) {
    // No need to write out new footer if we're using default checksum.
    // We're writing legacy magic number because we want old versions of RocksDB
    // be able to read files generated with new release (just in case if
    // somebody wants to roll back after an upgrade)
    // TODO(icanadi) at some point in the future, when we're absolutely sure
    // nobody will roll back to RocksDB 2.x versions, retire the legacy magic
    // number and always write new table files with new magic number
    bool legacy = (r->table_options.checksum == kCRC32c);
    Footer footer(legacy ? kLegacyBlockBasedTableMagicNumber
                         : kBlockBasedTableMagicNumber);
    footer.set_metaindex_handle(metaindex_block_handle);
    footer.set_index_handle(index_block_handle);
    footer.set_checksum(r->table_options.checksum);
    std::string footer_encoding;
    footer.EncodeTo(&footer_encoding);
    r->status = r->file->Append(footer_encoding);
    if (r->status.ok()) {
      r->offset += footer_encoding.size();
    }
  }

  // Print out the table stats
  if (ok()) {
    // user collected properties, rendered as "name=value; " pairs
    std::string user_collected;
    user_collected.reserve(1024);
    for (const auto& collector : r->table_properties_collectors) {
      for (const auto& prop : collector->GetReadableProperties()) {
        user_collected.append(prop.first);
        user_collected.append("=");
        user_collected.append(prop.second);
        user_collected.append("; ");
      }
    }

    Log(
        r->options.info_log,
        "Table was constructed:\n"
        " [basic properties]: %s\n"
        " [user collected properties]: %s",
        r->props.ToString().c_str(),
        user_collected.c_str()
    );
  }

  return r->status;
}
// Marks the builder as closed without writing anything.
// The builder must not already be closed (checked by assert).
void BlockBasedTableBuilder::Abandon() {
  assert(!rep_->closed);
  rep_->closed = true;
}
uint64_t BlockBasedTableBuilder::NumEntries() const {
return rep_->props.num_entries;
}
uint64_t BlockBasedTableBuilder::FileSize() const {
return rep_->offset;
}
// Prefix of the metaindex key under which the filter block is registered;
// BlockBasedTableBuilder::Finish() appends the filter policy name to it.
const std::string BlockBasedTable::kFilterBlockPrefix("filter.");
} // namespace rocksdb