mirror of
https://github.com/XRPLF/rippled.git
synced 2025-12-06 17:27:55 +00:00
Materialize the hash index
Summary: Materialize the hash index to avoid the soaring cpu/flash usage when initializing the database. Test Plan: existing unit tests passed Reviewers: sdong, haobo Reviewed By: sdong CC: leveldb Differential Revision: https://reviews.facebook.net/D18339
This commit is contained in:
@@ -15,6 +15,8 @@
|
||||
|
||||
#include <map>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
|
||||
#include "db/dbformat.h"
|
||||
|
||||
@@ -41,6 +43,8 @@
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
extern const std::string kHashIndexPrefixesBlock;
|
||||
extern const std::string kHashIndexPrefixesMetadataBlock;
|
||||
namespace {
|
||||
|
||||
typedef BlockBasedTableOptions::IndexType IndexType;
|
||||
@@ -57,6 +61,14 @@ typedef BlockBasedTableOptions::IndexType IndexType;
|
||||
// design that just works.
|
||||
class IndexBuilder {
|
||||
public:
|
||||
// Index builder will construct a set of blocks which contain:
|
||||
// 1. One primary index block.
|
||||
// 2. (Optional) a set of metablocks that contains the metadata of the
|
||||
// primary index.
|
||||
struct IndexBlocks {
|
||||
Slice index_block_contents;
|
||||
std::unordered_map<std::string, Slice> meta_blocks;
|
||||
};
|
||||
explicit IndexBuilder(const Comparator* comparator)
|
||||
: comparator_(comparator) {}
|
||||
|
||||
@@ -72,15 +84,19 @@ class IndexBuilder {
|
||||
// the last one in the table
|
||||
//
|
||||
// REQUIRES: Finish() has not yet been called.
|
||||
virtual void AddEntry(std::string* last_key_in_current_block,
|
||||
const Slice* first_key_in_next_block,
|
||||
const BlockHandle& block_handle) = 0;
|
||||
virtual void AddIndexEntry(std::string* last_key_in_current_block,
|
||||
const Slice* first_key_in_next_block,
|
||||
const BlockHandle& block_handle) = 0;
|
||||
|
||||
// This method will be called whenever a key is added. The subclasses may
|
||||
// override OnKeyAdded() if they need to collect additional information.
|
||||
virtual void OnKeyAdded(const Slice& key) {}
|
||||
|
||||
// Inform the index builder that all entries has been written. Block builder
|
||||
// may therefore perform any operation required for block finalization.
|
||||
//
|
||||
// REQUIRES: Finish() has not yet been called.
|
||||
virtual Slice Finish() = 0;
|
||||
virtual Status Finish(IndexBlocks* index_blocks) = 0;
|
||||
|
||||
// Get the estimated size for index block.
|
||||
virtual size_t EstimatedSize() const = 0;
|
||||
@@ -103,9 +119,9 @@ class ShortenedIndexBuilder : public IndexBuilder {
|
||||
: IndexBuilder(comparator),
|
||||
index_block_builder_(1 /* block_restart_interval == 1 */, comparator) {}
|
||||
|
||||
virtual void AddEntry(std::string* last_key_in_current_block,
|
||||
const Slice* first_key_in_next_block,
|
||||
const BlockHandle& block_handle) override {
|
||||
virtual void AddIndexEntry(std::string* last_key_in_current_block,
|
||||
const Slice* first_key_in_next_block,
|
||||
const BlockHandle& block_handle) override {
|
||||
if (first_key_in_next_block != nullptr) {
|
||||
comparator_->FindShortestSeparator(last_key_in_current_block,
|
||||
*first_key_in_next_block);
|
||||
@@ -118,7 +134,10 @@ class ShortenedIndexBuilder : public IndexBuilder {
|
||||
index_block_builder_.Add(*last_key_in_current_block, handle_encoding);
|
||||
}
|
||||
|
||||
virtual Slice Finish() override { return index_block_builder_.Finish(); }
|
||||
virtual Status Finish(IndexBlocks* index_blocks) {
|
||||
index_blocks->index_block_contents = index_block_builder_.Finish();
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
virtual size_t EstimatedSize() const {
|
||||
return index_block_builder_.CurrentSizeEstimate();
|
||||
@@ -128,38 +147,125 @@ class ShortenedIndexBuilder : public IndexBuilder {
|
||||
BlockBuilder index_block_builder_;
|
||||
};
|
||||
|
||||
// FullKeyIndexBuilder is also based on BlockBuilder. It works pretty much like
|
||||
// ShortenedIndexBuilder, but preserves the full key instead the substitude key.
|
||||
class FullKeyIndexBuilder : public IndexBuilder {
|
||||
// HashIndexBuilder contains a binary-searchable primary index and the
|
||||
// metadata for secondary hash index construction.
|
||||
// The metadata for hash index consists two parts:
|
||||
// - a metablock that compactly contains a sequence of prefixes. All prefixes
|
||||
// are stored consectively without any metadata (like, prefix sizes) being
|
||||
// stored, which is kept in the other metablock.
|
||||
// - a metablock contains the metadata of the prefixes, including prefix size,
|
||||
// restart index and number of block it spans. The format looks like:
|
||||
//
|
||||
// +-----------------+---------------------------+---------------------+ <=prefix 1
|
||||
// | length: 4 bytes | restart interval: 4 bytes | num-blocks: 4 bytes |
|
||||
// +-----------------+---------------------------+---------------------+ <=prefix 2
|
||||
// | length: 4 bytes | restart interval: 4 bytes | num-blocks: 4 bytes |
|
||||
// +-----------------+---------------------------+---------------------+
|
||||
// | |
|
||||
// | .... |
|
||||
// | |
|
||||
// +-----------------+---------------------------+---------------------+ <=prefix n
|
||||
// | length: 4 bytes | restart interval: 4 bytes | num-blocks: 4 bytes |
|
||||
// +-----------------+---------------------------+---------------------+
|
||||
//
|
||||
// The reason of separating these two metablocks is to enable the efficiently
|
||||
// reuse the first metablock during hash index construction without unnecessary
|
||||
// data copy or small heap allocations for prefixes.
|
||||
class HashIndexBuilder : public IndexBuilder {
|
||||
public:
|
||||
explicit FullKeyIndexBuilder(const Comparator* comparator)
|
||||
explicit HashIndexBuilder(const Comparator* comparator,
|
||||
const SliceTransform* hash_key_extractor)
|
||||
: IndexBuilder(comparator),
|
||||
index_block_builder_(1 /* block_restart_interval == 1 */, comparator) {}
|
||||
primary_index_builder(comparator),
|
||||
hash_key_extractor_(hash_key_extractor) {}
|
||||
|
||||
virtual void AddEntry(std::string* last_key_in_current_block,
|
||||
const Slice* first_key_in_next_block,
|
||||
const BlockHandle& block_handle) override {
|
||||
std::string handle_encoding;
|
||||
block_handle.EncodeTo(&handle_encoding);
|
||||
index_block_builder_.Add(*last_key_in_current_block, handle_encoding);
|
||||
virtual void AddIndexEntry(std::string* last_key_in_current_block,
|
||||
const Slice* first_key_in_next_block,
|
||||
const BlockHandle& block_handle) override {
|
||||
++current_restart_index_;
|
||||
primary_index_builder.AddIndexEntry(last_key_in_current_block,
|
||||
first_key_in_next_block, block_handle);
|
||||
}
|
||||
|
||||
virtual Slice Finish() override { return index_block_builder_.Finish(); }
|
||||
virtual void OnKeyAdded(const Slice& key) override {
|
||||
auto key_prefix = hash_key_extractor_->Transform(key);
|
||||
bool is_first_entry = pending_block_num_ == 0;
|
||||
|
||||
// Keys may share the prefix
|
||||
if (is_first_entry || pending_entry_prefix_ != key_prefix) {
|
||||
if (!is_first_entry) {
|
||||
FlushPendingPrefix();
|
||||
}
|
||||
|
||||
// need a hard copy otherwise the underlying data changes all the time.
|
||||
// TODO(kailiu) ToString() is expensive. We may speed up can avoid data
|
||||
// copy.
|
||||
pending_entry_prefix_ = key_prefix.ToString();
|
||||
pending_block_num_ = 1;
|
||||
pending_entry_index_ = current_restart_index_;
|
||||
} else {
|
||||
// entry number increments when keys share the prefix reside in
|
||||
// differnt data blocks.
|
||||
auto last_restart_index = pending_entry_index_ + pending_block_num_ - 1;
|
||||
assert(last_restart_index <= current_restart_index_);
|
||||
if (last_restart_index != current_restart_index_) {
|
||||
++pending_block_num_;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
virtual Status Finish(IndexBlocks* index_blocks) {
|
||||
FlushPendingPrefix();
|
||||
primary_index_builder.Finish(index_blocks);
|
||||
index_blocks->meta_blocks.insert(
|
||||
{kHashIndexPrefixesBlock.c_str(), prefix_block_});
|
||||
index_blocks->meta_blocks.insert(
|
||||
{kHashIndexPrefixesMetadataBlock.c_str(), prefix_meta_block_});
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
virtual size_t EstimatedSize() const {
|
||||
return index_block_builder_.CurrentSizeEstimate();
|
||||
return primary_index_builder.EstimatedSize() + prefix_block_.size() +
|
||||
prefix_meta_block_.size();
|
||||
}
|
||||
|
||||
private:
|
||||
BlockBuilder index_block_builder_;
|
||||
void FlushPendingPrefix() {
|
||||
prefix_block_.append(pending_entry_prefix_.data(),
|
||||
pending_entry_prefix_.size());
|
||||
PutVarint32(&prefix_meta_block_, pending_entry_prefix_.size());
|
||||
PutVarint32(&prefix_meta_block_, pending_entry_index_);
|
||||
PutVarint32(&prefix_meta_block_, pending_block_num_);
|
||||
}
|
||||
|
||||
ShortenedIndexBuilder primary_index_builder;
|
||||
const SliceTransform* hash_key_extractor_;
|
||||
|
||||
// stores a sequence of prefixes
|
||||
std::string prefix_block_;
|
||||
// stores the metadata of prefixes
|
||||
std::string prefix_meta_block_;
|
||||
|
||||
// The following 3 variables keeps unflushed prefix and its metadata.
|
||||
// The details of block_num and entry_index can be found in
|
||||
// "block_hash_index.{h,cc}"
|
||||
uint32_t pending_block_num_ = 0;
|
||||
uint32_t pending_entry_index_ = 0;
|
||||
std::string pending_entry_prefix_;
|
||||
|
||||
uint64_t current_restart_index_ = 0;
|
||||
};
|
||||
|
||||
// Create a index builder based on its type.
|
||||
IndexBuilder* CreateIndexBuilder(IndexType type, const Comparator* comparator) {
|
||||
IndexBuilder* CreateIndexBuilder(IndexType type, const Comparator* comparator,
|
||||
const SliceTransform* prefix_extractor) {
|
||||
switch (type) {
|
||||
case BlockBasedTableOptions::kBinarySearch: {
|
||||
return new ShortenedIndexBuilder(comparator);
|
||||
}
|
||||
case BlockBasedTableOptions::kHashSearch: {
|
||||
return new HashIndexBuilder(comparator, prefix_extractor);
|
||||
}
|
||||
default: {
|
||||
assert(!"Do not recognize the index type ");
|
||||
return nullptr;
|
||||
@@ -249,7 +355,7 @@ extern const uint64_t kLegacyBlockBasedTableMagicNumber = 0xdb4775248b80fb57ull;
|
||||
class BlockBasedTableBuilder::BlockBasedTablePropertiesCollector
|
||||
: public TablePropertiesCollector {
|
||||
public:
|
||||
BlockBasedTablePropertiesCollector(
|
||||
explicit BlockBasedTablePropertiesCollector(
|
||||
BlockBasedTableOptions::IndexType index_type)
|
||||
: index_type_(index_type) {}
|
||||
|
||||
@@ -288,6 +394,8 @@ struct BlockBasedTableBuilder::Rep {
|
||||
uint64_t offset = 0;
|
||||
Status status;
|
||||
BlockBuilder data_block;
|
||||
|
||||
InternalKeySliceTransform internal_prefix_transform;
|
||||
std::unique_ptr<IndexBuilder> index_builder;
|
||||
|
||||
std::string last_key;
|
||||
@@ -316,8 +424,9 @@ struct BlockBasedTableBuilder::Rep {
|
||||
internal_comparator(icomparator),
|
||||
file(f),
|
||||
data_block(options, &internal_comparator),
|
||||
index_builder(
|
||||
CreateIndexBuilder(index_block_type, &internal_comparator)),
|
||||
internal_prefix_transform(options.prefix_extractor.get()),
|
||||
index_builder(CreateIndexBuilder(index_block_type, &internal_comparator,
|
||||
&this->internal_prefix_transform)),
|
||||
compression_type(compression_type),
|
||||
checksum_type(checksum_type),
|
||||
filter_block(opt.filter_policy == nullptr
|
||||
@@ -335,16 +444,13 @@ struct BlockBasedTableBuilder::Rep {
|
||||
}
|
||||
};
|
||||
|
||||
// TODO(sdong): Currently only write out binary search index. In
|
||||
// BlockBasedTableReader, Hash index will be built using binary search index.
|
||||
BlockBasedTableBuilder::BlockBasedTableBuilder(
|
||||
const Options& options, const BlockBasedTableOptions& table_options,
|
||||
const InternalKeyComparator& internal_comparator, WritableFile* file,
|
||||
CompressionType compression_type)
|
||||
: rep_(new Rep(options, internal_comparator, file,
|
||||
table_options.flush_block_policy_factory.get(),
|
||||
compression_type,
|
||||
BlockBasedTableOptions::IndexType::kBinarySearch,
|
||||
compression_type, table_options.index_type,
|
||||
table_options.checksum)) {
|
||||
if (rep_->filter_block != nullptr) {
|
||||
rep_->filter_block->StartBlock(0);
|
||||
@@ -370,7 +476,7 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) {
|
||||
if (r->props.num_entries > 0) {
|
||||
assert(r->internal_comparator.Compare(key, Slice(r->last_key)) > 0);
|
||||
}
|
||||
|
||||
r->index_builder->OnKeyAdded(key);
|
||||
auto should_flush = r->flush_block_policy->Update(key, value);
|
||||
if (should_flush) {
|
||||
assert(!r->data_block.empty());
|
||||
@@ -385,7 +491,7 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) {
|
||||
// entries in the first block and < all entries in subsequent
|
||||
// blocks.
|
||||
if (ok()) {
|
||||
r->index_builder->AddEntry(&r->last_key, &key, r->pending_handle);
|
||||
r->index_builder->AddIndexEntry(&r->last_key, &key, r->pending_handle);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -561,24 +667,36 @@ Status BlockBasedTableBuilder::Finish() {
|
||||
// block, we will finish writing all index entries here and flush them
|
||||
// to storage after metaindex block is written.
|
||||
if (ok() && !empty_data_block) {
|
||||
r->index_builder->AddEntry(&r->last_key, nullptr /* no next data block */,
|
||||
r->pending_handle);
|
||||
r->index_builder->AddIndexEntry(
|
||||
&r->last_key, nullptr /* no next data block */, r->pending_handle);
|
||||
}
|
||||
|
||||
IndexBuilder::IndexBlocks index_blocks;
|
||||
auto s = r->index_builder->Finish(&index_blocks);
|
||||
if (!s.ok()) {
|
||||
return s;
|
||||
}
|
||||
|
||||
// Write meta blocks and metaindex block with the following order.
|
||||
// 1. [meta block: filter]
|
||||
// 2. [meta block: properties]
|
||||
// 3. [metaindex block]
|
||||
if (ok()) {
|
||||
MetaIndexBuilder meta_index_builer;
|
||||
// 2. [other meta blocks]
|
||||
// 3. [meta block: properties]
|
||||
// 4. [metaindex block]
|
||||
// write meta blocks
|
||||
MetaIndexBuilder meta_index_builder;
|
||||
for (const auto& item : index_blocks.meta_blocks) {
|
||||
BlockHandle block_handle;
|
||||
WriteBlock(item.second, &block_handle);
|
||||
meta_index_builder.Add(item.first, block_handle);
|
||||
}
|
||||
|
||||
// Write filter block.
|
||||
if (ok()) {
|
||||
if (r->filter_block != nullptr) {
|
||||
// Add mapping from "<filter_block_prefix>.Name" to location
|
||||
// of filter data.
|
||||
std::string key = BlockBasedTable::kFilterBlockPrefix;
|
||||
key.append(r->options.filter_policy->Name());
|
||||
meta_index_builer.Add(key, filter_block_handle);
|
||||
meta_index_builder.Add(key, filter_block_handle);
|
||||
}
|
||||
|
||||
// Write properties block.
|
||||
@@ -605,20 +723,16 @@ Status BlockBasedTableBuilder::Finish() {
|
||||
&properties_block_handle
|
||||
);
|
||||
|
||||
meta_index_builer.Add(kPropertiesBlock,
|
||||
properties_block_handle);
|
||||
meta_index_builder.Add(kPropertiesBlock, properties_block_handle);
|
||||
} // end of properties block writing
|
||||
|
||||
WriteRawBlock(
|
||||
meta_index_builer.Finish(),
|
||||
kNoCompression,
|
||||
&metaindex_block_handle
|
||||
);
|
||||
} // meta blocks and metaindex block.
|
||||
} // meta blocks
|
||||
|
||||
// Write index block
|
||||
if (ok()) {
|
||||
WriteBlock(r->index_builder->Finish(), &index_block_handle);
|
||||
// flush the meta index block
|
||||
WriteRawBlock(meta_index_builder.Finish(), kNoCompression,
|
||||
&metaindex_block_handle);
|
||||
WriteBlock(index_blocks.index_block_contents, &index_block_handle);
|
||||
}
|
||||
|
||||
// Write footer
|
||||
@@ -685,7 +799,6 @@ uint64_t BlockBasedTableBuilder::FileSize() const {
|
||||
return rep_->offset;
|
||||
}
|
||||
|
||||
const std::string BlockBasedTable::kFilterBlockPrefix =
|
||||
"filter.";
|
||||
const std::string BlockBasedTable::kFilterBlockPrefix = "filter.";
|
||||
|
||||
} // namespace rocksdb
|
||||
|
||||
Reference in New Issue
Block a user