mirror of https://github.com/XRPLF/rippled.git (synced 2025-12-06 17:27:55 +00:00)
Merge branch 'master' into columnfamilies
Conflicts:
	HISTORY.md
	db/db_impl.cc
	db/db_impl.h
	db/db_iter.cc
	db/db_test.cc
	db/dbformat.h
	db/memtable.cc
	db/memtable_list.cc
	db/memtable_list.h
	db/table_cache.cc
	db/table_cache.h
	db/version_edit.h
	db/version_set.cc
	db/version_set.h
	db/write_batch.cc
	db/write_batch_test.cc
	include/rocksdb/options.h
	util/options.cc
@@ -17,15 +17,17 @@
#include "rocksdb/flush_block_policy.h"
#include "rocksdb/cache.h"
#include "rocksdb/comparator.h"
#include "rocksdb/table.h"
#include "table/table_builder.h"
#include "rocksdb/env.h"
#include "rocksdb/filter_policy.h"
#include "rocksdb/options.h"
#include "db/dbformat.h"
#include "table/block_based_table_reader.h"
#include "table/block.h"
#include "table/block_builder.h"
#include "table/filter_block.h"
#include "table/format.h"
#include "table/meta_blocks.h"
#include "util/coding.h"
#include "util/crc32c.h"
#include "util/stop_watch.h"
@@ -34,51 +36,24 @@ namespace rocksdb {

namespace {

struct BytewiseLessThan {
bool operator()(const std::string& key1, const std::string& key2) const {
// smaller entries will be placed in front.
return comparator->Compare(key1, key2) <= 0;
}
const Comparator* comparator = BytewiseComparator();
};

// When writing to a block that requires entries to be sorted by
// `BytewiseComparator`, we can buffer the content in `BytewiseSortedMap`
// before writing it out.
typedef std::map<std::string, std::string, BytewiseLessThan> BytewiseSortedMap;

void AddProperties(BytewiseSortedMap& props, std::string name, uint64_t val) {
assert(props.find(name) == props.end());

std::string dst;
PutVarint64(&dst, val);

props.insert(
std::make_pair(name, dst)
);
}
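AddProperties stores each uint64_t as a varint. For intuition, a minimal standalone sketch of the varint64 wire format that PutVarint64 produces (a reimplementation for illustration, not the library function itself):

#include <cassert>
#include <cstdint>
#include <string>

// Sketch of the varint64 encoding used by PutVarint64: seven payload bits
// per byte, high bit set on every byte except the last.
static void PutVarint64Sketch(std::string* dst, uint64_t v) {
  while (v >= 128) {
    dst->push_back(static_cast<char>((v & 0x7f) | 0x80));
    v >>= 7;
  }
  dst->push_back(static_cast<char>(v));
}

int main() {
  std::string s;
  PutVarint64Sketch(&s, 300);  // 300 = 0b10'0101100 -> bytes 0xAC 0x02
  assert(s.size() == 2 && (s[0] & 0xff) == 0xAC && (s[1] & 0xff) == 0x02);
}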

static bool GoodCompressionRatio(size_t compressed_size, size_t raw_size) {
// Keep the compressed form only if it is at least 12.5% smaller than raw.
return compressed_size < raw_size - (raw_size / 8u);
}
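In other words, a compressed block is kept only when compression saves at least one eighth of the raw size; otherwise the block is stored uncompressed and readers are spared the decompression cost. A quick worked check of the threshold arithmetic:

#include <cassert>
#include <cstddef>

int main() {
  size_t raw = 4096;
  // Threshold: compressed < 4096 - 512 = 3584, i.e. at least 12.5% savings.
  assert(3583 < raw - (raw / 8u));    // kept compressed
  assert(!(3584 < raw - (raw / 8u))); // stored uncompressed
}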

// Where we encounter an error during user-defined statistics collection,
// we'll write the warning message to the info log.
void LogPropertiesCollectionError(
Logger* info_log, const std::string& method, const std::string& name) {
assert(method == "Add" || method == "Finish");

std::string msg =
"[Warning] encountered error when calling TablePropertiesCollector::" +
method + "() with collector name: " + name;
Log(info_log, "%s", msg.c_str());
}

} // anonymous namespace

// kBlockBasedTableMagicNumber was picked by running
// echo http://code.google.com/p/leveldb/ | sha1sum
// and taking the leading 64 bits.
// Please note that kBlockBasedTableMagicNumber may also be accessed by
// other .cc files, so it has to be explicitly declared with "extern".
extern const uint64_t kBlockBasedTableMagicNumber
= 0xdb4775248b80fb57ull;
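Because the magic number occupies the last eight bytes of the fixed-length footer, a reader can cheaply reject files that are not block-based tables before parsing anything else. A minimal sketch of that check (assuming DecodeFixed64 from util/coding.h; the buffer handling is left to the caller and is not code from this commit):

#include <cstdint>
#include "util/coding.h"

// Sketch: 'tail8' must point at the last eight bytes of the file; the
// constant below is the same value as kBlockBasedTableMagicNumber above.
inline bool LooksLikeBlockBasedTable(const char* tail8) {
  return rocksdb::DecodeFixed64(tail8) == 0xdb4775248b80fb57ull;
}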

struct BlockBasedTableBuilder::Rep {
Options options;
const InternalKeyComparator& internal_comparator;
WritableFile* file;
uint64_t offset = 0;
Status status;
@@ -98,31 +73,30 @@ struct BlockBasedTableBuilder::Rep {
std::string compressed_output;
std::unique_ptr<FlushBlockPolicy> flush_block_policy;

Rep(const Options& opt,
WritableFile* f,
FlushBlockPolicyFactory* flush_block_policy_factory,
Rep(const Options& opt, const InternalKeyComparator& icomparator,
WritableFile* f, FlushBlockPolicyFactory* flush_block_policy_factory,
CompressionType compression_type)
: options(opt),
internal_comparator(icomparator),
file(f),
data_block(options),
data_block(options, &internal_comparator),
// To avoid a linear scan, we set block_restart_interval to `1`
// in the index block builder
index_block(1 /* block_restart_interval */, options.comparator),
index_block(1 /* block_restart_interval */, &internal_comparator),
compression_type(compression_type),
filter_block(opt.filter_policy == nullptr ? nullptr
: new FilterBlockBuilder(opt)),
filter_block(opt.filter_policy == nullptr
? nullptr
: new FilterBlockBuilder(opt, &internal_comparator)),
flush_block_policy(
flush_block_policy_factory->NewFlushBlockPolicy(data_block)) {
}
flush_block_policy_factory->NewFlushBlockPolicy(data_block)) {}
};

BlockBasedTableBuilder::BlockBasedTableBuilder(
const Options& options,
WritableFile* file,
FlushBlockPolicyFactory* flush_block_policy_factory,
const Options& options, const InternalKeyComparator& internal_comparator,
WritableFile* file, FlushBlockPolicyFactory* flush_block_policy_factory,
CompressionType compression_type)
: rep_(new Rep(options,
file, flush_block_policy_factory, compression_type)) {
: rep_(new Rep(options, internal_comparator, file,
flush_block_policy_factory, compression_type)) {
if (rep_->filter_block != nullptr) {
rep_->filter_block->StartBlock(0);
}
@@ -145,7 +119,7 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) {
assert(!r->closed);
if (!ok()) return;
if (r->props.num_entries > 0) {
assert(r->options.comparator->Compare(key, Slice(r->last_key)) > 0);
assert(r->internal_comparator.Compare(key, Slice(r->last_key)) > 0);
}

auto should_flush = r->flush_block_policy->Update(key, value);
@@ -162,7 +136,7 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) {
// entries in the first block and < all entries in subsequent
// blocks.
if (ok()) {
r->options.comparator->FindShortestSeparator(&r->last_key, key);
r->internal_comparator.FindShortestSeparator(&r->last_key, key);
std::string handle_encoding;
r->pending_handle.EncodeTo(&handle_encoding);
r->index_block.Add(r->last_key, Slice(handle_encoding));
@@ -179,16 +153,12 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) {
r->props.raw_key_size += key.size();
r->props.raw_value_size += value.size();

for (auto collector : r->options.table_properties_collectors) {
Status s = collector->Add(key, value);
if (!s.ok()) {
LogPropertiesCollectionError(
r->options.info_log.get(),
"Add", /* method */
collector->Name()
);
}
}
NotifyCollectTableCollectorsOnAdd(
key,
value,
r->options.table_properties_collectors,
r->options.info_log.get()
);
}
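The FindShortestSeparator call above lets the index store a short key that is >= every key in the finished block and < every key in the next one, keeping index entries small. An illustration of the documented bytewise-comparator behavior (standalone example, not code from this commit):

#include <cassert>
#include <string>
#include "rocksdb/comparator.h"

// Sketch: between "the quick" and "the who" the separator can be shortened
// to "the r" (common prefix "the ", then the diverging byte 'q' bumped to 'r').
void SeparatorExample() {
  std::string start = "the quick";
  rocksdb::BytewiseComparator()->FindShortestSeparator(&start, "the who");
  assert(start == "the r");
}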

void BlockBasedTableBuilder::Flush() {
@@ -370,7 +340,7 @@ Status BlockBasedTableBuilder::Finish() {
// block, we will finish writing all index entries here and flush them
// to storage after metaindex block is written.
if (ok() && !empty_data_block) {
r->options.comparator->FindShortSuccessor(&r->last_key);
r->internal_comparator.FindShortSuccessor(&r->last_key);

std::string handle_encoding;
r->pending_handle.EncodeTo(&handle_encoding);
@@ -382,14 +352,7 @@ Status BlockBasedTableBuilder::Finish() {
// 2. [meta block: properties]
// 3. [metaindex block]
if (ok()) {
// We use `BytewiseComparator` as the comparator for meta block.
BlockBuilder meta_index_block(
r->options.block_restart_interval,
BytewiseComparator()
);
// Key: meta block name
// Value: block handle to that meta block
BytewiseSortedMap meta_block_handles;
MetaIndexBuilder meta_index_builer;

// Write filter block.
if (r->filter_block != nullptr) {
@@ -397,104 +360,43 @@ Status BlockBasedTableBuilder::Finish() {
// of filter data.
std::string key = BlockBasedTable::kFilterBlockPrefix;
key.append(r->options.filter_policy->Name());
std::string handle_encoding;
filter_block_handle.EncodeTo(&handle_encoding);
meta_block_handles.insert(
std::make_pair(key, handle_encoding)
);
meta_index_builer.Add(key, filter_block_handle);
}

// Write properties block.
{
BlockBuilder properties_block(
r->options.block_restart_interval,
BytewiseComparator()
);

BytewiseSortedMap properties;

// Add basic properties
AddProperties(
properties,
BlockBasedTablePropertiesNames::kRawKeySize,
r->props.raw_key_size
);
AddProperties(
properties,
BlockBasedTablePropertiesNames::kRawValueSize,
r->props.raw_value_size
);
AddProperties(
properties,
BlockBasedTablePropertiesNames::kDataSize,
r->props.data_size
);
PropertyBlockBuilder property_block_builder;
std::vector<std::string> failed_user_prop_collectors;
r->props.filter_policy_name = r->options.filter_policy != nullptr ?
r->options.filter_policy->Name() : "";
r->props.index_size =
r->index_block.CurrentSizeEstimate() + kBlockTrailerSize;
AddProperties(
properties,
BlockBasedTablePropertiesNames::kIndexSize,
r->props.index_size
);
AddProperties(
properties,
BlockBasedTablePropertiesNames::kNumEntries,
r->props.num_entries
);
AddProperties(
properties,
BlockBasedTablePropertiesNames::kNumDataBlocks,
r->props.num_data_blocks);
if (r->filter_block != nullptr) {
properties.insert({
BlockBasedTablePropertiesNames::kFilterPolicy,
r->options.filter_policy->Name()
});
}
AddProperties(
properties,
BlockBasedTablePropertiesNames::kFilterSize,
r->props.filter_size
);

for (auto collector : r->options.table_properties_collectors) {
TableProperties::UserCollectedProperties user_collected_properties;
Status s =
collector->Finish(&user_collected_properties);
// Add basic properties
property_block_builder.AddTableProperty(r->props);

if (!s.ok()) {
LogPropertiesCollectionError(
r->options.info_log.get(),
"Finish", /* method */
collector->Name()
);
} else {
properties.insert(
user_collected_properties.begin(),
user_collected_properties.end()
);
}
}

for (const auto& stat : properties) {
properties_block.Add(stat.first, stat.second);
}
NotifyCollectTableCollectorsOnFinish(
r->options.table_properties_collectors,
r->options.info_log.get(),
&property_block_builder
);

BlockHandle properties_block_handle;
WriteBlock(&properties_block, &properties_block_handle);

std::string handle_encoding;
properties_block_handle.EncodeTo(&handle_encoding);
meta_block_handles.insert(
{ BlockBasedTable::kPropertiesBlock, handle_encoding }
WriteRawBlock(
property_block_builder.Finish(),
kNoCompression,
&properties_block_handle
);

meta_index_builer.Add(kPropertiesBlock,
properties_block_handle);
} // end of properties block writing

for (const auto& metablock : meta_block_handles) {
meta_index_block.Add(metablock.first, metablock.second);
}

WriteBlock(&meta_index_block, &metaindex_block_handle);
WriteRawBlock(
meta_index_builer.Finish(),
kNoCompression,
&metaindex_block_handle
);
} // meta blocks and metaindex block.

// Write index block
@@ -504,7 +406,7 @@ Status BlockBasedTableBuilder::Finish() {

// Write footer
if (ok()) {
Footer footer;
Footer footer(kBlockBasedTableMagicNumber);
footer.set_metaindex_handle(metaindex_block_handle);
footer.set_index_handle(index_block_handle);
std::string footer_encoding;
@@ -556,4 +458,7 @@ uint64_t BlockBasedTableBuilder::FileSize() const {
return rep_->offset;
}

const std::string BlockBasedTable::kFilterBlockPrefix =
"filter.";

} // namespace rocksdb

@@ -12,7 +12,7 @@
#include "rocksdb/flush_block_policy.h"
#include "rocksdb/options.h"
#include "rocksdb/status.h"
#include "rocksdb/table.h"
#include "table/table_builder.h"

namespace rocksdb {

@@ -20,13 +20,13 @@ class BlockBuilder;
class BlockHandle;
class WritableFile;


class BlockBasedTableBuilder : public TableBuilder {
public:
// Create a builder that will store the contents of the table it is
// building in *file. Does not close the file. It is up to the
// caller to close the file after calling Finish().
BlockBasedTableBuilder(const Options& options,
const InternalKeyComparator& internal_comparator,
WritableFile* file,
FlushBlockPolicyFactory* flush_block_policy_factory,
CompressionType compression_type);

@@ -18,17 +18,19 @@

namespace rocksdb {

Status BlockBasedTableFactory::GetTableReader(
Status BlockBasedTableFactory::NewTableReader(
const Options& options, const EnvOptions& soptions,
const InternalKeyComparator& internal_comparator,
unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
unique_ptr<TableReader>* table_reader) const {
return BlockBasedTable::Open(options, soptions, table_options_,
std::move(file), file_size, table_reader);
internal_comparator, std::move(file), file_size,
table_reader);
}

TableBuilder* BlockBasedTableFactory::GetTableBuilder(
const Options& options, WritableFile* file,
CompressionType compression_type) const {
TableBuilder* BlockBasedTableFactory::NewTableBuilder(
const Options& options, const InternalKeyComparator& internal_comparator,
WritableFile* file, CompressionType compression_type) const {
auto flush_block_policy_factory =
table_options_.flush_block_policy_factory.get();

@@ -45,11 +47,9 @@ TableBuilder* BlockBasedTableFactory::GetTableBuilder(
options.block_size_deviation);
}

auto table_builder = new BlockBasedTableBuilder(
options,
file,
flush_block_policy_factory,
compression_type);
auto table_builder =
new BlockBasedTableBuilder(options, internal_comparator, file,
flush_block_policy_factory, compression_type);

// Delete flush_block_policy_factory only when it's just created from the
// options.
@@ -63,4 +63,9 @@ TableBuilder* BlockBasedTableFactory::GetTableBuilder(
return table_builder;
}

TableFactory* NewBlockBasedTableFactory(
const BlockBasedTableOptions& table_options) {
return new BlockBasedTableFactory(table_options);
}

} // namespace rocksdb

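For orientation, a sketch of how a caller might wire this factory into Options (assumption: Options::table_factory is the shared_ptr<TableFactory> field of this vintage, and the header locations are illustrative):

#include "rocksdb/options.h"
#include "rocksdb/table.h"

// Sketch: install the block-based factory with default BlockBasedTableOptions;
// the DB then reaches table files through NewTableBuilder / NewTableReader.
rocksdb::Options MakeBlockBasedOptions() {
  rocksdb::Options options;
  options.table_factory.reset(
      rocksdb::NewBlockBasedTableFactory(rocksdb::BlockBasedTableOptions()));
  return options;
}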
@@ -14,7 +14,6 @@
#include "rocksdb/flush_block_policy.h"
#include "rocksdb/options.h"
#include "rocksdb/table.h"
#include "table/block_based_table_options.h"

namespace rocksdb {

@@ -22,31 +21,26 @@ struct Options;
struct EnvOptions;

using std::unique_ptr;
class Status;
class RandomAccessFile;
class WritableFile;
class Table;
class TableBuilder;
class BlockBasedTable;
class BlockBasedTableBuilder;

class BlockBasedTableFactory: public TableFactory {
class BlockBasedTableFactory : public TableFactory {
public:
BlockBasedTableFactory() : BlockBasedTableFactory(BlockBasedTableOptions()) {}
explicit BlockBasedTableFactory(const BlockBasedTableOptions& table_options)
explicit BlockBasedTableFactory(
const BlockBasedTableOptions& table_options = BlockBasedTableOptions())
: table_options_(table_options) {}

~BlockBasedTableFactory() {}

const char* Name() const override { return "BlockBasedTable"; }

Status GetTableReader(const Options& options, const EnvOptions& soptions,
Status NewTableReader(const Options& options, const EnvOptions& soptions,
const InternalKeyComparator& internal_comparator,
unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
unique_ptr<TableReader>* table_reader) const override;

TableBuilder* GetTableBuilder(const Options& options, WritableFile* file,
CompressionType compression_type)
const override;
TableBuilder* NewTableBuilder(
const Options& options, const InternalKeyComparator& internal_comparator,
WritableFile* file, CompressionType compression_type) const override;

private:
BlockBasedTableOptions table_options_;

@@ -1,31 +0,0 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.

#pragma once
#include <memory>

namespace rocksdb {

class FlushBlockPolicyFactory;

struct BlockBasedTableOptions {
// @flush_block_policy_factory creates the instances of flush block policy,
// which provides a configurable way to determine when to flush a block in
// the block based tables. If not set, the table builder will use the default
// block flush policy, which cuts blocks by block size (please refer to
// `FlushBlockBySizePolicy`).
std::shared_ptr<FlushBlockPolicyFactory> flush_block_policy_factory;

// TODO(kailiu) Temporarily disable this feature by making the default value
// to be false. Also in master branch, this file is non-public so no user
// will be able to change the value of `cache_index_and_filter_blocks`.
//
// Indicates whether to put index/filter blocks in the block cache.
// If not specified, each "table reader" object will pre-load the index/filter
// block during table initialization.
bool cache_index_and_filter_blocks = false;
};

} // namespace rocksdb
@@ -21,15 +21,17 @@
#include "table/block.h"
#include "table/filter_block.h"
#include "table/format.h"
#include "table/meta_blocks.h"
#include "table/two_level_iterator.h"

#include "util/coding.h"
#include "util/perf_context_imp.h"
#include "util/stop_watch.h"
#include "table/block_based_table_options.h"

namespace rocksdb {

extern uint64_t kBlockBasedTableMagicNumber;

// The longest prefix of the cache key used to identify blocks.
// We are using the fact that we know for Posix files the unique ID is three
// varints.
@@ -37,12 +39,13 @@ const size_t kMaxCacheKeyPrefixSize = kMaxVarint64Length*3+1;
using std::unique_ptr;

struct BlockBasedTable::Rep {
Rep(const EnvOptions& storage_options) :
soptions(storage_options) {
}
Rep(const EnvOptions& storage_options,
const InternalKeyComparator& internal_comparator)
: soptions(storage_options), internal_comparator_(internal_comparator) {}

Options options;
const EnvOptions& soptions;
const InternalKeyComparator& internal_comparator_;
Status status;
unique_ptr<RandomAccessFile> file;
char cache_key_prefix[kMaxCacheKeyPrefixSize];
@@ -223,34 +226,19 @@ Cache::Handle* GetFromBlockCache(

Status BlockBasedTable::Open(const Options& options, const EnvOptions& soptions,
const BlockBasedTableOptions& table_options,
const InternalKeyComparator& internal_comparator,
unique_ptr<RandomAccessFile>&& file,
uint64_t file_size,
unique_ptr<TableReader>* table_reader) {
table_reader->reset();

if (file_size < Footer::kEncodedLength) {
return Status::InvalidArgument("file is too short to be an sstable");
}

char footer_space[Footer::kEncodedLength];
Slice footer_input;
Status s = file->Read(file_size - Footer::kEncodedLength,
Footer::kEncodedLength, &footer_input, footer_space);
if (!s.ok()) return s;

// Check that we actually read the whole footer from the file. It may be
// that size isn't correct.
if (footer_input.size() != Footer::kEncodedLength) {
return Status::InvalidArgument("file is too short to be an sstable");
}

Footer footer;
s = footer.DecodeFrom(&footer_input);
Footer footer(kBlockBasedTableMagicNumber);
auto s = ReadFooterFromFile(file.get(), file_size, &footer);
if (!s.ok()) return s;

// We've successfully read the footer and the index block: we're
// ready to serve requests.
Rep* rep = new BlockBasedTable::Rep(soptions);
Rep* rep = new BlockBasedTable::Rep(soptions, internal_comparator);
rep->options = options;
rep->file = std::move(file);
rep->metaindex_handle = footer.metaindex_handle();
@@ -265,10 +253,11 @@ Status BlockBasedTable::Open(const Options& options, const EnvOptions& soptions,

// Read the properties
meta_iter->Seek(kPropertiesBlock);
if (meta_iter->Valid() && meta_iter->key() == Slice(kPropertiesBlock)) {
if (meta_iter->Valid() && meta_iter->key() == kPropertiesBlock) {
s = meta_iter->status();
if (s.ok()) {
s = ReadProperties(meta_iter->value(), rep, &rep->table_properties);
s = ReadProperties(meta_iter->value(), rep->file.get(), rep->options.env,
rep->options.info_log.get(), &rep->table_properties);
}

if (!s.ok()) {
@@ -350,7 +339,7 @@ void BlockBasedTable::SetupForCompaction() {
compaction_optimized_ = true;
}

TableProperties& BlockBasedTable::GetTableProperties() {
const TableProperties& BlockBasedTable::GetTableProperties() {
return rep_->table_properties;
}

@@ -415,96 +404,6 @@ FilterBlockReader* BlockBasedTable::ReadFilter (
rep->options, block.data, block.heap_allocated);
}

Status BlockBasedTable::ReadProperties(
const Slice& handle_value, Rep* rep, TableProperties* table_properties) {
assert(table_properties);

Slice v = handle_value;
BlockHandle handle;
if (!handle.DecodeFrom(&v).ok()) {
return Status::InvalidArgument("Failed to decode properties block handle");
}

BlockContents block_contents;
Status s = ReadBlockContents(
rep->file.get(),
ReadOptions(),
handle,
&block_contents,
rep->options.env,
false
);

if (!s.ok()) {
return s;
}

Block properties_block(block_contents);
std::unique_ptr<Iterator> iter(
properties_block.NewIterator(BytewiseComparator())
);

// All pre-defined properties of type uint64_t
std::unordered_map<std::string, uint64_t*> predefined_uint64_properties = {
{ BlockBasedTablePropertiesNames::kDataSize,
&table_properties->data_size },
{ BlockBasedTablePropertiesNames::kIndexSize,
&table_properties->index_size },
{ BlockBasedTablePropertiesNames::kFilterSize,
&table_properties->filter_size },
{ BlockBasedTablePropertiesNames::kRawKeySize,
&table_properties->raw_key_size },
{ BlockBasedTablePropertiesNames::kRawValueSize,
&table_properties->raw_value_size },
{ BlockBasedTablePropertiesNames::kNumDataBlocks,
&table_properties->num_data_blocks },
{ BlockBasedTablePropertiesNames::kNumEntries,
&table_properties->num_entries },
};

std::string last_key;
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
s = iter->status();
if (!s.ok()) {
break;
}

auto key = iter->key().ToString();
// properties block is strictly sorted with no duplicate key.
assert(
last_key.empty() ||
BytewiseComparator()->Compare(key, last_key) > 0
);
last_key = key;

auto raw_val = iter->value();
auto pos = predefined_uint64_properties.find(key);

if (pos != predefined_uint64_properties.end()) {
// handle predefined rocksdb properties
uint64_t val;
if (!GetVarint64(&raw_val, &val)) {
// skip malformed value
auto error_msg =
"[Warning] detect malformed value in properties meta-block:"
"\tkey: " + key + "\tval: " + raw_val.ToString();
Log(rep->options.info_log, "%s", error_msg.c_str());
continue;
}
*(pos->second) = val;
} else if (key == BlockBasedTablePropertiesNames::kFilterPolicy) {
table_properties->filter_policy_name = raw_val.ToString();
} else {
// handle user-collected properties
table_properties->user_collected_properties.insert(
std::make_pair(key, raw_val.ToString())
);
}
}

return s;
}

Status BlockBasedTable::GetBlock(
const BlockBasedTable* table,
const BlockHandle& handle,
@@ -764,7 +663,7 @@ Iterator* BlockBasedTable::BlockReader(void* arg,

Iterator* iter;
if (block != nullptr) {
iter = block->NewIterator(table->rep_->options.comparator);
iter = block->NewIterator(&(table->rep_->internal_comparator_));
if (cache_handle != nullptr) {
iter->RegisterCleanup(&ReleaseBlock, block_cache, cache_handle);
} else {
@@ -837,7 +736,7 @@ BlockBasedTable::GetFilter(bool no_io) const {
// Get the iterator from the index block.
Iterator* BlockBasedTable::IndexBlockReader(const ReadOptions& options) const {
if (rep_->index_block) {
return rep_->index_block->NewIterator(rep_->options.comparator);
return rep_->index_block->NewIterator(&(rep_->internal_comparator_));
}

// get index block from cache
@@ -858,7 +757,7 @@ Iterator* BlockBasedTable::IndexBlockReader(const ReadOptions& options) const {

Iterator* iter;
if (entry.value != nullptr) {
iter = entry.value->NewIterator(rep_->options.comparator);
iter = entry.value->NewIterator(&(rep_->internal_comparator_));
if (entry.cache_handle) {
iter->RegisterCleanup(
&ReleaseBlock, rep_->options.block_cache.get(), entry.cache_handle
@@ -872,9 +771,9 @@ Iterator* BlockBasedTable::IndexBlockReader(const ReadOptions& options) const {
return iter;
}

Iterator* BlockBasedTable::BlockReader(void* arg,
const ReadOptions& options,
Iterator* BlockBasedTable::BlockReader(void* arg, const ReadOptions& options,
const EnvOptions& soptions,
const InternalKeyComparator& icomparator,
const Slice& index_value,
bool for_compaction) {
return BlockReader(arg, options, index_value, nullptr, for_compaction);
@@ -965,20 +864,15 @@ Iterator* BlockBasedTable::NewIterator(const ReadOptions& options) {
}
}

return NewTwoLevelIterator(
IndexBlockReader(options),
&BlockBasedTable::BlockReader,
const_cast<BlockBasedTable*>(this),
options,
rep_->soptions
);
return NewTwoLevelIterator(IndexBlockReader(options),
&BlockBasedTable::BlockReader,
const_cast<BlockBasedTable*>(this), options,
rep_->soptions, rep_->internal_comparator_);
}

Status BlockBasedTable::Get(
const ReadOptions& readOptions,
const Slice& key,
void* handle_context,
bool (*result_handler)(void* handle_context, const Slice& k,
const ReadOptions& readOptions, const Slice& key, void* handle_context,
bool (*result_handler)(void* handle_context, const ParsedInternalKey& k,
const Slice& v, bool didIO),
void (*mark_key_may_exist_handler)(void* handle_context)) {
Status s;
@@ -1016,8 +910,13 @@ Status BlockBasedTable::Get(

// Call the *saver function on each entry/block until it returns false
for (block_iter->Seek(key); block_iter->Valid(); block_iter->Next()) {
if (!(*result_handler)(handle_context, block_iter->key(),
block_iter->value(), didIO)) {
ParsedInternalKey parsed_key;
if (!ParseInternalKey(block_iter->key(), &parsed_key)) {
s = Status::Corruption(Slice());
}

if (!(*result_handler)(handle_context, parsed_key, block_iter->value(),
didIO)) {
done = true;
break;
}
@@ -1034,7 +933,8 @@ Status BlockBasedTable::Get(
return s;
}
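Under the new signature, Get() hands each entry to the callback as an already-parsed internal key. A hedged sketch of a caller-side handler (the SaverContext type and field names are hypothetical, only the callback signature comes from this commit):

#include <string>
#include "db/dbformat.h"
#include "rocksdb/slice.h"

// Sketch: stop the scan at the first live entry matching the saved user key.
struct SaverContext {
  rocksdb::Slice user_key;
  std::string value;
  bool found = false;
};

static bool SaveValue(void* arg, const rocksdb::ParsedInternalKey& k,
                      const rocksdb::Slice& v, bool /*didIO*/) {
  auto* ctx = reinterpret_cast<SaverContext*>(arg);
  if (k.user_key == ctx->user_key && k.type == rocksdb::kTypeValue) {
    ctx->value.assign(v.data(), v.size());
    ctx->found = true;
    return false;  // returning false tells Get() to stop iterating
  }
  return true;  // keep scanning
}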

bool SaveDidIO(void* arg, const Slice& key, const Slice& value, bool didIO) {
bool SaveDidIO(void* arg, const ParsedInternalKey& key, const Slice& value,
bool didIO) {
*reinterpret_cast<bool*>(arg) = didIO;
return false;
}
@@ -1075,25 +975,4 @@ uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key) {
return result;
}

const std::string BlockBasedTable::kFilterBlockPrefix =
"filter.";
const std::string BlockBasedTable::kPropertiesBlock =
"rocksdb.properties";
const std::string BlockBasedTablePropertiesNames::kDataSize =
"rocksdb.data.size";
const std::string BlockBasedTablePropertiesNames::kIndexSize =
"rocksdb.index.size";
const std::string BlockBasedTablePropertiesNames::kFilterSize =
"rocksdb.filter.size";
const std::string BlockBasedTablePropertiesNames::kRawKeySize =
"rocksdb.raw.key.size";
const std::string BlockBasedTablePropertiesNames::kRawValueSize =
"rocksdb.raw.value.size";
const std::string BlockBasedTablePropertiesNames::kNumDataBlocks =
"rocksdb.num.data.blocks";
const std::string BlockBasedTablePropertiesNames::kNumEntries =
"rocksdb.num.entries";
const std::string BlockBasedTablePropertiesNames::kFilterPolicy =
"rocksdb.filter.policy";

} // namespace rocksdb

@@ -14,8 +14,7 @@
#include "rocksdb/env.h"
#include "rocksdb/iterator.h"
#include "rocksdb/statistics.h"
#include "rocksdb/table_properties.h"
#include "rocksdb/table.h"
#include "table/table_reader.h"
#include "util/coding.h"

namespace rocksdb {
@@ -39,7 +38,6 @@ using std::unique_ptr;
class BlockBasedTable : public TableReader {
public:
static const std::string kFilterBlockPrefix;
static const std::string kPropertiesBlock;

// Attempt to open the table that is stored in bytes [0..file_size)
// of "file", and read the metadata entries necessary to allow
@@ -53,6 +51,7 @@ class BlockBasedTable : public TableReader {
// *file must remain live while this Table is in use.
static Status Open(const Options& db_options, const EnvOptions& env_options,
const BlockBasedTableOptions& table_options,
const InternalKeyComparator& internal_key_comparator,
unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
unique_ptr<TableReader>* table_reader);

@@ -63,14 +62,13 @@ class BlockBasedTable : public TableReader {
// call one of the Seek methods on the iterator before using it).
Iterator* NewIterator(const ReadOptions&) override;

Status Get(
const ReadOptions& readOptions,
const Slice& key,
void* handle_context,
bool (*result_handler)(void* handle_context, const Slice& k,
const Slice& v, bool didIO),
void (*mark_key_may_exist_handler)(void* handle_context) = nullptr)
override;
Status Get(const ReadOptions& readOptions, const Slice& key,
void* handle_context,
bool (*result_handler)(void* handle_context,
const ParsedInternalKey& k, const Slice& v,
bool didIO),
void (*mark_key_may_exist_handler)(void* handle_context) =
nullptr) override;

// Given a key, return an approximate byte offset in the file where
// the data for that key begins (or would begin if the key were
@@ -82,13 +80,13 @@ class BlockBasedTable : public TableReader {

// Returns true if the block for the specified key is in cache.
// REQUIRES: key is in this table.
bool TEST_KeyInCache(const ReadOptions& options, const Slice& key) override;
bool TEST_KeyInCache(const ReadOptions& options, const Slice& key);

// Set up the table for Compaction. Might change some parameters with
// posix_fadvise
void SetupForCompaction() override;

TableProperties& GetTableProperties() override;
const TableProperties& GetTableProperties() override;

~BlockBasedTable();

@@ -101,8 +99,9 @@ class BlockBasedTable : public TableReader {
bool compaction_optimized_;

static Iterator* BlockReader(void*, const ReadOptions&,
const EnvOptions& soptions, const Slice&,
bool for_compaction);
const EnvOptions& soptions,
const InternalKeyComparator& icomparator,
const Slice&, bool for_compaction);

static Iterator* BlockReader(void*, const ReadOptions&, const Slice&,
bool* didIO, bool for_compaction = false);
@@ -142,7 +141,6 @@ class BlockBasedTable : public TableReader {

void ReadMeta(const Footer& footer);
void ReadFilter(const Slice& filter_handle_value);
static Status ReadProperties(const Slice& handle_value, Rep* rep);

// Read the meta block from sst.
static Status ReadMetaBlock(
@@ -156,10 +154,6 @@ class BlockBasedTable : public TableReader {
Rep* rep,
size_t* filter_size = nullptr);

// Read the table properties from properties block.
static Status ReadProperties(
const Slice& handle_value, Rep* rep, TableProperties* properties);

static void SetupCacheKeyPrefix(Rep* rep);

explicit BlockBasedTable(Rep* rep) :
@@ -181,15 +175,4 @@ class BlockBasedTable : public TableReader {
void operator=(const TableReader&) = delete;
};

struct BlockBasedTablePropertiesNames {
static const std::string kDataSize;
static const std::string kIndexSize;
static const std::string kFilterSize;
static const std::string kRawKeySize;
static const std::string kRawValueSize;
static const std::string kNumDataBlocks;
static const std::string kNumEntries;
static const std::string kFilterPolicy;
};

} // namespace rocksdb

@@ -36,6 +36,7 @@
#include <algorithm>
#include <assert.h>
#include "rocksdb/comparator.h"
#include "db/dbformat.h"
#include "util/coding.h"

namespace rocksdb {
@@ -51,9 +52,8 @@ BlockBuilder::BlockBuilder(int block_restart_interval,
restarts_.push_back(0); // First restart point is at offset 0
}

BlockBuilder::BlockBuilder(const Options& options)
: BlockBuilder(options.block_restart_interval, options.comparator) {
}
BlockBuilder::BlockBuilder(const Options& options, const Comparator* comparator)
: BlockBuilder(options.block_restart_interval, comparator) {}

void BlockBuilder::Reset() {
buffer_.clear();

@@ -21,7 +21,7 @@ class Comparator;
class BlockBuilder {
public:
BlockBuilder(int block_builder, const Comparator* comparator);
explicit BlockBuilder(const Options& options);
explicit BlockBuilder(const Options& options, const Comparator* comparator);

// Reset the contents as if the BlockBuilder was just constructed.
void Reset();

@@ -32,9 +32,12 @@ class BlockTest {};
TEST(BlockTest, SimpleTest) {
Random rnd(301);
Options options = Options();
std::unique_ptr<InternalKeyComparator> ic;
ic.reset(new test::PlainInternalKeyComparator(options.comparator));

std::vector<std::string> keys;
std::vector<std::string> values;
BlockBuilder builder(options);
BlockBuilder builder(options, ic.get());
int num_records = 100000;
char buf[10];
char* p = &buf[0];

@@ -21,11 +21,12 @@ namespace rocksdb {
static const size_t kFilterBaseLg = 11;
static const size_t kFilterBase = 1 << kFilterBaseLg;

FilterBlockBuilder::FilterBlockBuilder(const Options& opt)
: policy_(opt.filter_policy),
prefix_extractor_(opt.prefix_extractor),
whole_key_filtering_(opt.whole_key_filtering),
comparator_(opt.comparator){}
FilterBlockBuilder::FilterBlockBuilder(const Options& opt,
const Comparator* internal_comparator)
: policy_(opt.filter_policy),
prefix_extractor_(opt.prefix_extractor),
whole_key_filtering_(opt.whole_key_filtering),
comparator_(internal_comparator) {}

void FilterBlockBuilder::StartBlock(uint64_t block_offset) {
uint64_t filter_index = (block_offset / kFilterBase);

@@ -35,7 +35,8 @@ class FilterPolicy;
// (StartBlock AddKey*)* Finish
class FilterBlockBuilder {
public:
explicit FilterBlockBuilder(const Options& opt);
explicit FilterBlockBuilder(const Options& opt,
const Comparator* internal_comparator);

void StartBlock(uint64_t block_offset);
void AddKey(const Slice& key);

@@ -55,7 +55,7 @@ class FilterBlockTest {
};

TEST(FilterBlockTest, EmptyBuilder) {
FilterBlockBuilder builder(options_);
FilterBlockBuilder builder(options_, options_.comparator);
Slice block = builder.Finish();
ASSERT_EQ("\\x00\\x00\\x00\\x00\\x0b", EscapeString(block));
FilterBlockReader reader(options_, block);
@@ -64,7 +64,7 @@ TEST(FilterBlockTest, EmptyBuilder) {
}

TEST(FilterBlockTest, SingleChunk) {
FilterBlockBuilder builder(options_);
FilterBlockBuilder builder(options_, options_.comparator);
builder.StartBlock(100);
builder.AddKey("foo");
builder.AddKey("bar");
@@ -85,7 +85,7 @@ TEST(FilterBlockTest, SingleChunk) {
}

TEST(FilterBlockTest, MultiChunk) {
FilterBlockBuilder builder(options_);
FilterBlockBuilder builder(options_, options_.comparator);

// First filter
builder.StartBlock(0);

@@ -34,6 +34,7 @@ Status BlockHandle::DecodeFrom(Slice* input) {
return Status::Corruption("bad block handle");
}
}
const BlockHandle BlockHandle::kNullBlockHandle(0, 0);

void Footer::EncodeTo(std::string* dst) const {
#ifndef NDEBUG
@@ -72,6 +73,30 @@ Status Footer::DecodeFrom(Slice* input) {
return result;
}

Status ReadFooterFromFile(RandomAccessFile* file,
uint64_t file_size,
Footer* footer) {
if (file_size < Footer::kEncodedLength) {
return Status::InvalidArgument("file is too short to be an sstable");
}

char footer_space[Footer::kEncodedLength];
Slice footer_input;
Status s = file->Read(file_size - Footer::kEncodedLength,
Footer::kEncodedLength,
&footer_input,
footer_space);
if (!s.ok()) return s;

// Check that we actually read the whole footer from the file. It may be
// that size isn't correct.
if (footer_input.size() != Footer::kEncodedLength) {
return Status::InvalidArgument("file is too short to be an sstable");
}

return footer->DecodeFrom(&footer_input);
}
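Callers now construct a Footer primed with the expected magic number and let the helper validate length and decode in one step. A minimal sketch of the caller-side pattern, using only names introduced in this diff:

// Sketch: open-time footer validation for a block-based table file.
Status CheckTableFile(RandomAccessFile* file, uint64_t file_size) {
  Footer footer(kBlockBasedTableMagicNumber);
  Status s = ReadFooterFromFile(file, file_size, &footer);
  if (!s.ok()) return s;
  // footer.metaindex_handle() and footer.index_handle() are now usable.
  return Status::OK();
}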

Status ReadBlockContents(RandomAccessFile* file,
const ReadOptions& options,
const BlockHandle& handle,

@@ -26,6 +26,7 @@ struct ReadOptions;
class BlockHandle {
public:
BlockHandle();
BlockHandle(uint64_t offset, uint64_t size);

// The offset of the block in the file.
uint64_t offset() const { return offset_; }
@@ -38,19 +39,36 @@ class BlockHandle {
void EncodeTo(std::string* dst) const;
Status DecodeFrom(Slice* input);

// If the block handle's offset and size are both "0", we will view it
// as a null block handle that points nowhere.
bool IsNull() const {
return offset_ == 0 && size_ == 0;
}

static const BlockHandle& NullBlockHandle() {
return kNullBlockHandle;
}

// Maximum encoding length of a BlockHandle
enum { kMaxEncodedLength = 10 + 10 };

private:
uint64_t offset_;
uint64_t size_;
uint64_t offset_ = 0;
uint64_t size_ = 0;

static const BlockHandle kNullBlockHandle;
};

// Footer encapsulates the fixed information stored at the tail
// end of every table file.
class Footer {
public:
Footer() { }
// @table_magic_number serves two purposes:
// 1. Identify different types of the tables.
// 2. Help us to identify if a given file is a valid sst.
Footer(uint64_t table_magic_number) :
kTableMagicNumber(table_magic_number) {
}

// The block handle for the metaindex block of the table
const BlockHandle& metaindex_handle() const { return metaindex_handle_; }
@@ -77,12 +95,13 @@ class Footer {
private:
BlockHandle metaindex_handle_;
BlockHandle index_handle_;
const uint64_t kTableMagicNumber;
};

// kTableMagicNumber was picked by running
// echo http://code.google.com/p/leveldb/ | sha1sum
// and taking the leading 64 bits.
static const uint64_t kTableMagicNumber = 0xdb4775248b80fb57ull;
// Read the footer from file
Status ReadFooterFromFile(RandomAccessFile* file,
uint64_t file_size,
Footer* footer);

// 1-byte type + 32-bit crc
static const size_t kBlockTrailerSize = 5;
@@ -115,8 +134,13 @@ extern Status UncompressBlockContents(const char* data,
// Implementation details follow. Clients should ignore,

inline BlockHandle::BlockHandle()
: offset_(~static_cast<uint64_t>(0)),
size_(~static_cast<uint64_t>(0)) {
: BlockHandle(~static_cast<uint64_t>(0),
~static_cast<uint64_t>(0)) {
}

inline BlockHandle::BlockHandle(uint64_t offset, uint64_t size)
: offset_(offset),
size_(size) {
}

} // namespace rocksdb

@@ -11,8 +11,11 @@

#include "rocksdb/comparator.h"
#include "rocksdb/iterator.h"
#include "rocksdb/options.h"
#include "table/iter_heap.h"
#include "table/iterator_wrapper.h"
#include "util/stop_watch.h"
#include "util/perf_context_imp.h"

#include <vector>

@@ -22,10 +25,13 @@ namespace {

class MergingIterator : public Iterator {
public:
MergingIterator(const Comparator* comparator, Iterator** children, int n)
MergingIterator(Env* const env, const Comparator* comparator,
Iterator** children, int n)
: comparator_(comparator),
children_(n),
current_(nullptr),
use_heap_(true),
env_(env),
direction_(kForward),
maxHeap_(NewMaxIterHeap(comparator_)),
minHeap_ (NewMinIterHeap(comparator_)) {
@@ -70,15 +76,52 @@ class MergingIterator : public Iterator {
}

virtual void Seek(const Slice& target) {
ClearHeaps();
// Invalidate the heap.
use_heap_ = false;
IteratorWrapper* first_child = nullptr;
StopWatchNano child_seek_timer(env_, false);
StopWatchNano min_heap_timer(env_, false);
for (auto& child : children_) {
StartPerfTimer(&child_seek_timer);
child.Seek(target);
BumpPerfTime(&perf_context.seek_child_seek_time, &child_seek_timer);
BumpPerfCount(&perf_context.seek_child_seek_count);

if (child.Valid()) {
minHeap_.push(&child);
// This child has a valid key
if (!use_heap_) {
if (first_child == nullptr) {
// It's the first child with a valid key. Only put it into
// current_. Now the values in the heap should be invalid.
first_child = &child;
} else {
// We have more than one child with valid keys. Initialize
// the heap and put the first child into the heap.
StartPerfTimer(&min_heap_timer);
ClearHeaps();
BumpPerfTime(&perf_context.seek_min_heap_time, &child_seek_timer);
StartPerfTimer(&min_heap_timer);
minHeap_.push(first_child);
BumpPerfTime(&perf_context.seek_min_heap_time, &child_seek_timer);
}
}
if (use_heap_) {
StartPerfTimer(&min_heap_timer);
minHeap_.push(&child);
BumpPerfTime(&perf_context.seek_min_heap_time, &child_seek_timer);
}
}
}
FindSmallest();
direction_ = kForward;
if (use_heap_) {
// If the heap is valid, we need to put the smallest key into current_.
StartPerfTimer(&min_heap_timer);
FindSmallest();
BumpPerfTime(&perf_context.seek_min_heap_time, &child_seek_timer);
} else {
// The heap is not valid, then the current_ iterator is the first
// one, or null if there is no first child.
current_ = first_child;
}
}

virtual void Next() {
@@ -109,10 +152,14 @@ class MergingIterator : public Iterator {
// as the current points to the current record. move the iterator forward.
// and if it is valid add it to the heap.
current_->Next();
if (current_->Valid()){
minHeap_.push(current_);
if (use_heap_) {
if (current_->Valid()) {
minHeap_.push(current_);
}
FindSmallest();
} else if (!current_->Valid()) {
current_ = nullptr;
}
FindSmallest();
}

virtual void Prev() {
@@ -178,6 +225,11 @@ class MergingIterator : public Iterator {
const Comparator* comparator_;
std::vector<IteratorWrapper> children_;
IteratorWrapper* current_;
// If the value is true, both the iterators in the heap and current_
// contain valid rows. If it is false, only current_ can possibly contain
// valid rows.
bool use_heap_;
Env* const env_;
// Which direction is the iterator moving?
enum Direction {
kForward,
@@ -189,6 +241,7 @@ class MergingIterator : public Iterator {
};

void MergingIterator::FindSmallest() {
assert(use_heap_);
if (minHeap_.empty()) {
current_ = nullptr;
} else {
@@ -199,6 +252,7 @@ void MergingIterator::FindSmallest() {
}

void MergingIterator::FindLargest() {
assert(use_heap_);
if (maxHeap_.empty()) {
current_ = nullptr;
} else {
@@ -209,19 +263,21 @@ void MergingIterator::FindLargest() {
}

void MergingIterator::ClearHeaps() {
use_heap_ = true;
maxHeap_ = NewMaxIterHeap(comparator_);
minHeap_ = NewMinIterHeap(comparator_);
}
} // namespace

Iterator* NewMergingIterator(const Comparator* cmp, Iterator** list, int n) {
Iterator* NewMergingIterator(Env* const env, const Comparator* cmp,
Iterator** list, int n) {
assert(n >= 0);
if (n == 0) {
return NewEmptyIterator();
} else if (n == 1) {
return list[0];
} else {
return new MergingIterator(cmp, list, n);
return new MergingIterator(env, cmp, list, n);
}
}
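With the Env threaded through for the perf timers, call sites now pass the environment alongside the comparator. A short sketch of the new calling convention (the two child iterators are assumed to come from elsewhere; the merger takes ownership of them):

#include "rocksdb/comparator.h"
#include "rocksdb/env.h"
#include "rocksdb/iterator.h"

// Sketch: merge two child iterators under the new signature.
// Env::Default() supplies the clock used by the seek perf counters.
rocksdb::Iterator* MergeTwo(rocksdb::Iterator* a, rocksdb::Iterator* b) {
  rocksdb::Iterator* children[2] = {a, b};
  return rocksdb::NewMergingIterator(rocksdb::Env::Default(),
                                     rocksdb::BytewiseComparator(),
                                     children, 2);
}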


@@ -13,6 +13,7 @@ namespace rocksdb {

class Comparator;
class Iterator;
class Env;

// Return an iterator that provides the union of the data in
// children[0,n-1]. Takes ownership of the child iterators and
@@ -22,7 +23,8 @@ class Iterator;
// key is present in K child iterators, it will be yielded K times.
//
// REQUIRES: n >= 0
extern Iterator* NewMergingIterator(
const Comparator* comparator, Iterator** children, int n);
extern Iterator* NewMergingIterator(Env* const env,
const Comparator* comparator,
Iterator** children, int n);

} // namespace rocksdb

table/meta_blocks.cc (new file, 286 lines)
@@ -0,0 +1,286 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.

#include "table/meta_blocks.h"

#include <map>

#include "rocksdb/table.h"
#include "table/block.h"
#include "table/format.h"
#include "util/coding.h"

namespace rocksdb {

MetaIndexBuilder::MetaIndexBuilder()
: meta_index_block_(
new BlockBuilder(1 /* restart interval */, BytewiseComparator())) {
}

void MetaIndexBuilder::Add(const std::string& key,
const BlockHandle& handle) {
std::string handle_encoding;
handle.EncodeTo(&handle_encoding);
meta_block_handles_.insert({key, handle_encoding});
}

Slice MetaIndexBuilder::Finish() {
for (const auto& metablock : meta_block_handles_) {
meta_index_block_->Add(metablock.first, metablock.second);
}
return meta_index_block_->Finish();
}

PropertyBlockBuilder::PropertyBlockBuilder()
: properties_block_(
new BlockBuilder(1 /* restart interval */, BytewiseComparator())) {
}

void PropertyBlockBuilder::Add(const std::string& name,
const std::string& val) {
props_.insert({name, val});
}

void PropertyBlockBuilder::Add(const std::string& name, uint64_t val) {
assert(props_.find(name) == props_.end());

std::string dst;
PutVarint64(&dst, val);

Add(name, dst);
}

void PropertyBlockBuilder::Add(
const UserCollectedProperties& user_collected_properties) {
for (const auto& prop : user_collected_properties) {
Add(prop.first, prop.second);
}
}

void PropertyBlockBuilder::AddTableProperty(const TableProperties& props) {
Add(TablePropertiesNames::kRawKeySize, props.raw_key_size);
Add(TablePropertiesNames::kRawValueSize, props.raw_value_size);
Add(TablePropertiesNames::kDataSize, props.data_size);
Add(TablePropertiesNames::kIndexSize, props.index_size);
Add(TablePropertiesNames::kNumEntries, props.num_entries);
Add(TablePropertiesNames::kNumDataBlocks, props.num_data_blocks);
Add(TablePropertiesNames::kFilterSize, props.filter_size);
Add(TablePropertiesNames::kFormatVersion, props.format_version);
Add(TablePropertiesNames::kFixedKeyLen, props.fixed_key_len);

if (!props.filter_policy_name.empty()) {
Add(TablePropertiesNames::kFilterPolicy,
props.filter_policy_name);
}
}

Slice PropertyBlockBuilder::Finish() {
for (const auto& prop : props_) {
properties_block_->Add(prop.first, prop.second);
}

return properties_block_->Finish();
}
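Taken together, a table writer fills a PropertyBlockBuilder and hands the finished Slice to WriteRawBlock, mirroring the builder changes earlier in this diff. A hedged sketch (props and user_props are assumed to be populated by the caller):

// Sketch: how the table builder consumes these helpers. The returned Slice
// points into the builder's block contents, so 'builder' must outlive it.
Slice BuildPropertyBlock(const TableProperties& props,
                         const UserCollectedProperties& user_props,
                         PropertyBlockBuilder* builder) {
  builder->AddTableProperty(props);  // predefined uint64_t properties
  builder->Add(user_props);          // user-collected key/value pairs
  return builder->Finish();          // written uncompressed via WriteRawBlock()
}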
|
||||
void LogPropertiesCollectionError(
|
||||
Logger* info_log, const std::string& method, const std::string& name) {
|
||||
  assert(method == "Add" || method == "Finish");

  std::string msg =
    "[Warning] encountered error when calling TablePropertiesCollector::" +
    method + "() with collector name: " + name;
  Log(info_log, "%s", msg.c_str());
}

bool NotifyCollectTableCollectorsOnAdd(
    const Slice& key,
    const Slice& value,
    const Options::TablePropertiesCollectors& collectors,
    Logger* info_log) {
  bool all_succeeded = true;
  for (auto collector : collectors) {
    Status s = collector->Add(key, value);
    all_succeeded = all_succeeded && s.ok();
    if (!s.ok()) {
      LogPropertiesCollectionError(
          info_log, "Add" /* method */, collector->Name()
      );
    }
  }
  return all_succeeded;
}

bool NotifyCollectTableCollectorsOnFinish(
    const Options::TablePropertiesCollectors& collectors,
    Logger* info_log,
    PropertyBlockBuilder* builder) {
  bool all_succeeded = true;
  for (auto collector : collectors) {
    UserCollectedProperties user_collected_properties;
    Status s = collector->Finish(&user_collected_properties);

    all_succeeded = all_succeeded && s.ok();
    if (!s.ok()) {
      LogPropertiesCollectionError(
          info_log, "Finish" /* method */, collector->Name()
      );
    } else {
      builder->Add(user_collected_properties);
    }
  }

  return all_succeeded;
}

Status ReadProperties(
    const Slice& handle_value,
    RandomAccessFile* file,
    Env* env,
    Logger* logger,
    TableProperties* table_properties) {
  assert(table_properties);

  Slice v = handle_value;
  BlockHandle handle;
  if (!handle.DecodeFrom(&v).ok()) {
    return Status::InvalidArgument("Failed to decode properties block handle");
  }

  BlockContents block_contents;
  ReadOptions read_options;
  read_options.verify_checksums = false;
  Status s = ReadBlockContents(
      file,
      read_options,
      handle,
      &block_contents,
      env,
      false
  );

  if (!s.ok()) {
    return s;
  }

  Block properties_block(block_contents);
  std::unique_ptr<Iterator> iter(
      properties_block.NewIterator(BytewiseComparator())
  );

  // All pre-defined properties of type uint64_t
  std::unordered_map<std::string, uint64_t*> predefined_uint64_properties = {
    { TablePropertiesNames::kDataSize, &table_properties->data_size },
    { TablePropertiesNames::kIndexSize, &table_properties->index_size },
    { TablePropertiesNames::kFilterSize, &table_properties->filter_size },
    { TablePropertiesNames::kRawKeySize, &table_properties->raw_key_size },
    { TablePropertiesNames::kRawValueSize, &table_properties->raw_value_size },
    { TablePropertiesNames::kNumDataBlocks,
      &table_properties->num_data_blocks },
    { TablePropertiesNames::kNumEntries, &table_properties->num_entries },
    { TablePropertiesNames::kFormatVersion, &table_properties->format_version },
    { TablePropertiesNames::kFixedKeyLen, &table_properties->fixed_key_len },
  };

  std::string last_key;
  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
    s = iter->status();
    if (!s.ok()) {
      break;
    }

    auto key = iter->key().ToString();
    // The properties block is strictly sorted with no duplicate keys.
    assert(
        last_key.empty() ||
        BytewiseComparator()->Compare(key, last_key) > 0
    );
    last_key = key;

    auto raw_val = iter->value();
    auto pos = predefined_uint64_properties.find(key);

    if (pos != predefined_uint64_properties.end()) {
      // handle predefined rocksdb properties
      uint64_t val;
      if (!GetVarint64(&raw_val, &val)) {
        // skip malformed value
        auto error_msg =
          "[Warning] detected malformed value in properties meta-block:"
          "\tkey: " + key + "\tval: " + raw_val.ToString();
        Log(logger, "%s", error_msg.c_str());
        continue;
      }
      *(pos->second) = val;
    } else if (key == TablePropertiesNames::kFilterPolicy) {
      table_properties->filter_policy_name = raw_val.ToString();
    } else {
      // handle user-collected properties
      table_properties->user_collected_properties.insert(
          std::make_pair(key, raw_val.ToString())
      );
    }
  }

  return s;
}

Status ReadTableProperties(
    RandomAccessFile* file,
    uint64_t file_size,
    uint64_t table_magic_number,
    Env* env,
    Logger* info_log,
    TableProperties* properties) {
  // -- Read metaindex block
  Footer footer(table_magic_number);
  auto s = ReadFooterFromFile(file, file_size, &footer);
  if (!s.ok()) {
    return s;
  }

  auto metaindex_handle = footer.metaindex_handle();
  BlockContents metaindex_contents;
  ReadOptions read_options;
  read_options.verify_checksums = false;
  s = ReadBlockContents(
      file,
      read_options,
      metaindex_handle,
      &metaindex_contents,
      env,
      false
  );
  if (!s.ok()) {
    return s;
  }
  Block metaindex_block(metaindex_contents);
  std::unique_ptr<Iterator> meta_iter(
      metaindex_block.NewIterator(BytewiseComparator())
  );

  // -- Read property block
  meta_iter->Seek(kPropertiesBlock);
  if (meta_iter->Valid() &&
      meta_iter->key() == kPropertiesBlock &&
      meta_iter->status().ok()) {
    s = ReadProperties(
        meta_iter->value(),
        file,
        env,
        info_log,
        properties
    );
  } else {
    s = Status::Corruption(
        "Unable to read the property block from the plain table"
    );
  }

  return s;
}

}  // namespace rocksdb
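For context on the wire format ReadProperties() expects: every predefined property value above is decoded with GetVarint64(). The sketch below round-trips one varint64 in the same LEB128 style; EncodeVarint64/DecodeVarint64 are hypothetical stand-ins for the real helpers in util/coding.h, not their exact API.

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <string>

// Hypothetical stand-in: 7 payload bits per byte, high bit marks continuation.
void EncodeVarint64(std::string* dst, uint64_t v) {
  while (v >= 128) {
    dst->push_back(static_cast<char>((v & 0x7f) | 0x80));
    v >>= 7;
  }
  dst->push_back(static_cast<char>(v));
}

// Hypothetical stand-in for the decode side; returns false on a malformed or
// truncated value, like the "skip malformed value" path above.
bool DecodeVarint64(const std::string& src, size_t* pos, uint64_t* v) {
  uint64_t result = 0;
  for (int shift = 0; shift <= 63 && *pos < src.size(); shift += 7) {
    uint64_t byte = static_cast<unsigned char>(src[(*pos)++]);
    result |= (byte & 0x7f) << shift;
    if ((byte & 0x80) == 0) {
      *v = result;
      return true;
    }
  }
  return false;
}

int main() {
  std::string buf;
  EncodeVarint64(&buf, 4096);  // e.g. a rocksdb.data.size value
  size_t pos = 0;
  uint64_t val = 0;
  if (DecodeVarint64(buf, &pos, &val)) {
    std::cout << "decoded: " << val << "\n";  // prints 4096
  }
  return 0;
}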
121 table/meta_blocks.h Normal file
@@ -0,0 +1,121 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#pragma once

#include <map>
#include <memory>
#include <string>

#include "rocksdb/comparator.h"
#include "rocksdb/options.h"
#include "rocksdb/slice.h"
#include "rocksdb/table_properties.h"
#include "table/block_builder.h"

namespace rocksdb {

class BlockBuilder;
class BlockHandle;
class Env;
class Logger;
class RandomAccessFile;
struct TableProperties;

// An STL-style comparator that performs the bytewise comparison internally.
struct BytewiseLessThan {
  bool operator()(const std::string& key1, const std::string& key2) const {
    // smaller entries will be placed in front.
    return comparator->Compare(key1, key2) <= 0;
  }

  const Comparator* comparator = BytewiseComparator();
};

// When writing to a block that requires entries to be sorted by
// `BytewiseComparator`, we can buffer the content in a `BytewiseSortedMap`
// before writing it out.
typedef std::map<std::string, std::string, BytewiseLessThan> BytewiseSortedMap;

class MetaIndexBuilder {
 public:
  MetaIndexBuilder(const MetaIndexBuilder&) = delete;
  MetaIndexBuilder& operator=(const MetaIndexBuilder&) = delete;

  MetaIndexBuilder();
  void Add(const std::string& key, const BlockHandle& handle);

  // Write all the added key/value pairs to the block and return the contents
  // of the block.
  Slice Finish();

 private:
  // store the sorted key/handle of the metablocks.
  BytewiseSortedMap meta_block_handles_;
  std::unique_ptr<BlockBuilder> meta_index_block_;
};

class PropertyBlockBuilder {
 public:
  PropertyBlockBuilder(const PropertyBlockBuilder&) = delete;
  PropertyBlockBuilder& operator=(const PropertyBlockBuilder&) = delete;

  PropertyBlockBuilder();

  void AddTableProperty(const TableProperties& props);
  void Add(const std::string& key, uint64_t value);
  void Add(const std::string& key, const std::string& value);
  void Add(const UserCollectedProperties& user_collected_properties);

  // Write all the added entries to the block and return the block contents
  Slice Finish();

 private:
  std::unique_ptr<BlockBuilder> properties_block_;
  BytewiseSortedMap props_;
};

// If we encounter any error during user-defined statistics collection,
// we'll write a warning message to the info log.
void LogPropertiesCollectionError(
    Logger* info_log, const std::string& method, const std::string& name);

// Utility functions that help the table builder trigger batch events for
// user-defined property collectors.
// The return value indicates whether any error occurred; if one did, a
// warning message will have been logged.
// NotifyCollectTableCollectorsOnAdd() triggers the `Add` event for all
// property collectors.
bool NotifyCollectTableCollectorsOnAdd(
    const Slice& key,
    const Slice& value,
    const Options::TablePropertiesCollectors& collectors,
    Logger* info_log);

// NotifyCollectTableCollectorsOnFinish() triggers the `Finish` event for all
// property collectors. The collected properties will be added to `builder`.
bool NotifyCollectTableCollectorsOnFinish(
    const Options::TablePropertiesCollectors& collectors,
    Logger* info_log,
    PropertyBlockBuilder* builder);

// Read the properties from the table.
Status ReadProperties(
    const Slice& handle_value,
    RandomAccessFile* file,
    Env* env,
    Logger* logger,
    TableProperties* table_properties);

// Directly read the properties from the properties block of a plain table.
Status ReadTableProperties(
    RandomAccessFile* file,
    uint64_t file_size,
    uint64_t table_magic_number,
    Env* env,
    Logger* info_log,
    TableProperties* properties);

}  // namespace rocksdb
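The Notify* helpers declared above drive user-defined collectors. Below is a minimal sketch of what such a collector might look like, assuming only the interface implied by the calls in meta_blocks.cc (Add(), Finish(), Name()); the class name and the emitted property name are hypothetical.

#include <string>

#include "rocksdb/table_properties.h"

namespace rocksdb {

// Hypothetical collector that counts keys and publishes the count as a
// user-collected property.
class KeyCountCollector : public TablePropertiesCollector {
 public:
  // Called once per key/value via NotifyCollectTableCollectorsOnAdd().
  Status Add(const Slice& /*key*/, const Slice& /*value*/) override {
    ++num_keys_;
    return Status::OK();
  }

  // Called once per table via NotifyCollectTableCollectorsOnFinish(); the
  // emitted pairs end up in the user-collected section of the property block.
  Status Finish(UserCollectedProperties* properties) override {
    properties->insert({"example.num.keys", std::to_string(num_keys_)});
    return Status::OK();
  }

  const char* Name() const override { return "KeyCountCollector"; }

 private:
  uint64_t num_keys_ = 0;
};

}  // namespace rocksdb

Presumably such a collector is registered through options.table_properties_collectors, the list the Notify* helpers iterate over.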
198 table/plain_table_builder.cc Normal file
@@ -0,0 +1,198 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#include "table/plain_table_builder.h"

#include <assert.h>
#include <map>

#include "rocksdb/comparator.h"
#include "rocksdb/env.h"
#include "rocksdb/filter_policy.h"
#include "rocksdb/options.h"
#include "table/plain_table_factory.h"
#include "db/dbformat.h"
#include "table/block_builder.h"
#include "table/filter_block.h"
#include "table/format.h"
#include "table/meta_blocks.h"
#include "util/coding.h"
#include "util/crc32c.h"
#include "util/stop_watch.h"

namespace rocksdb {

namespace {

// A utility that helps write block content to the file.
// @offset will advance if @block_contents was successfully written.
// @block_handle the block handle of this particular block.
Status WriteBlock(
    const Slice& block_contents,
    WritableFile* file,
    uint64_t* offset,
    BlockHandle* block_handle) {
  block_handle->set_offset(*offset);
  block_handle->set_size(block_contents.size());
  Status s = file->Append(block_contents);

  if (s.ok()) {
    *offset += block_contents.size();
  }
  return s;
}

}  // namespace

// kPlainTableMagicNumber was picked by running
// echo rocksdb.plain.table | sha1sum
// and taking the leading 64 bits.
extern const uint64_t kPlainTableMagicNumber = 0x4f3418eb7a8f13b8ull;

PlainTableBuilder::PlainTableBuilder(const Options& options,
                                     WritableFile* file,
                                     uint32_t user_key_len) :
    options_(options), file_(file), user_key_len_(user_key_len) {
  properties_.fixed_key_len = user_key_len;

  // for plain table, we put all the data in one big chunk.
  properties_.num_data_blocks = 1;
  // emphasize that currently plain table doesn't have persistent index or
  // filter block.
  properties_.index_size = 0;
  properties_.filter_size = 0;
  properties_.format_version = 0;
}

PlainTableBuilder::~PlainTableBuilder() {
}

void PlainTableBuilder::Add(const Slice& key, const Slice& value) {
  size_t user_key_size = key.size() - 8;
  assert(user_key_len_ == 0 || user_key_size == user_key_len_);

  if (!IsFixedLength()) {
    // Write key length
    key_size_str_.clear();
    PutVarint32(&key_size_str_, user_key_size);
    file_->Append(key_size_str_);
    offset_ += key_size_str_.length();
  }

  // Write key
  ParsedInternalKey parsed_key;
  if (!ParseInternalKey(key, &parsed_key)) {
    status_ = Status::Corruption(Slice());
    return;
  }
  if (parsed_key.sequence == 0 && parsed_key.type == kTypeValue) {
    // Compact encoding: the user key plus a one-byte marker replaces the
    // full 8-byte internal-key footer, saving 7 bytes per row.
    file_->Append(Slice(key.data(), user_key_size));
    char tmp_char = PlainTableFactory::kValueTypeSeqId0;
    file_->Append(Slice(&tmp_char, 1));
    offset_ += key.size() - 7;
  } else {
    file_->Append(key);
    offset_ += key.size();
  }

  // Write value length
  value_size_str_.clear();
  int value_size = value.size();
  PutVarint32(&value_size_str_, value_size);
  file_->Append(value_size_str_);

  // Write value
  file_->Append(value);
  offset_ += value_size + value_size_str_.length();

  properties_.num_entries++;
  properties_.raw_key_size += key.size();
  properties_.raw_value_size += value.size();

  // notify property collectors
  NotifyCollectTableCollectorsOnAdd(
      key,
      value,
      options_.table_properties_collectors,
      options_.info_log.get()
  );
}

Status PlainTableBuilder::status() const { return status_; }

Status PlainTableBuilder::Finish() {
  assert(!closed_);
  closed_ = true;

  properties_.data_size = offset_;

  // Write the following blocks
  //  1. [meta block: properties]
  //  2. [metaindex block]
  //  3. [footer]
  MetaIndexBuilder meta_index_builder;

  PropertyBlockBuilder property_block_builder;
  // -- Add basic properties
  property_block_builder.AddTableProperty(properties_);

  // -- Add user collected properties
  NotifyCollectTableCollectorsOnFinish(
      options_.table_properties_collectors,
      options_.info_log.get(),
      &property_block_builder
  );

  // -- Write property block
  BlockHandle property_block_handle;
  auto s = WriteBlock(
      property_block_builder.Finish(),
      file_,
      &offset_,
      &property_block_handle
  );
  if (!s.ok()) {
    return s;
  }
  meta_index_builder.Add(kPropertiesBlock, property_block_handle);

  // -- Write metaindex block
  BlockHandle metaindex_block_handle;
  s = WriteBlock(
      meta_index_builder.Finish(),
      file_,
      &offset_,
      &metaindex_block_handle
  );
  if (!s.ok()) {
    return s;
  }

  // Write Footer
  Footer footer(kPlainTableMagicNumber);
  footer.set_metaindex_handle(metaindex_block_handle);
  footer.set_index_handle(BlockHandle::NullBlockHandle());
  std::string footer_encoding;
  footer.EncodeTo(&footer_encoding);
  s = file_->Append(footer_encoding);
  if (s.ok()) {
    offset_ += footer_encoding.size();
  }

  return s;
}

void PlainTableBuilder::Abandon() {
  closed_ = true;
}

uint64_t PlainTableBuilder::NumEntries() const {
  return properties_.num_entries;
}

uint64_t PlainTableBuilder::FileSize() const {
  return offset_;
}

}  // namespace rocksdb
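A rough end-to-end usage sketch of PlainTableBuilder, assuming this vintage's Env::NewWritableFile() signature and the InternalKey helper from db/dbformat.h; the path and keys are illustrative, not taken from the source.

#include <memory>

#include "db/dbformat.h"
#include "rocksdb/env.h"
#include "table/plain_table_builder.h"

using namespace rocksdb;

Status BuildTinyPlainTable() {
  Options options;
  EnvOptions env_options;
  std::unique_ptr<WritableFile> file;
  Status s = Env::Default()->NewWritableFile("/tmp/demo.ptb", &file,
                                             env_options);
  if (!s.ok()) {
    return s;
  }

  // 0 == variable-length user keys: a varint length prefix precedes each key.
  PlainTableBuilder builder(options, file.get(), 0);
  const char* keys[] = {"apple", "banana"};  // must be added in sorted order
  for (const char* k : keys) {
    // PlainTableBuilder expects internal keys: user key + 8-byte footer.
    InternalKey ikey(k, 1 /* sequence */, kTypeValue);
    builder.Add(ikey.Encode(), "some value");
  }
  s = builder.Finish();  // appends property block, metaindex block, footer
  if (s.ok()) {
    s = file->Close();   // the builder never closes the file itself
  }
  return s;
}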
85 table/plain_table_builder.h Normal file
@@ -0,0 +1,85 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// IndexedTable is a simple table format for UNIT TEST ONLY. It is not built
// to production quality.

#pragma once
#include <stdint.h>
#include "rocksdb/options.h"
#include "rocksdb/status.h"
#include "table/table_builder.h"
#include "rocksdb/table_properties.h"

namespace rocksdb {

class BlockBuilder;
class BlockHandle;
class WritableFile;
class TableBuilder;

class PlainTableBuilder: public TableBuilder {
 public:
  // Create a builder that will store the contents of the table it is
  // building in *file. Does not close the file. It is up to the
  // caller to close the file after calling Finish().
  PlainTableBuilder(const Options& options, WritableFile* file,
                    uint32_t user_key_len);

  // REQUIRES: Either Finish() or Abandon() has been called.
  ~PlainTableBuilder();

  // Add key,value to the table being constructed.
  // REQUIRES: key is after any previously added key according to comparator.
  // REQUIRES: Finish(), Abandon() have not been called
  void Add(const Slice& key, const Slice& value) override;

  // Return non-ok iff some error has been detected.
  Status status() const override;

  // Finish building the table. Stops using the file passed to the
  // constructor after this function returns.
  // REQUIRES: Finish(), Abandon() have not been called
  Status Finish() override;

  // Indicate that the contents of this builder should be abandoned. Stops
  // using the file passed to the constructor after this function returns.
  // If the caller is not going to call Finish(), it must call Abandon()
  // before destroying this builder.
  // REQUIRES: Finish(), Abandon() have not been called
  void Abandon() override;

  // Number of calls to Add() so far.
  uint64_t NumEntries() const override;

  // Size of the file generated so far. If invoked after a successful
  // Finish() call, returns the size of the final generated file.
  uint64_t FileSize() const override;

 private:
  Options options_;
  WritableFile* file_;
  uint64_t offset_ = 0;
  Status status_;
  TableProperties properties_;

  const size_t user_key_len_;
  bool closed_ = false;  // Either Finish() or Abandon() has been called.

  std::string key_size_str_;
  std::string value_size_str_;

  bool IsFixedLength() const {
    return user_key_len_ > 0;
  }

  // No copying allowed
  PlainTableBuilder(const PlainTableBuilder&) = delete;
  void operator=(const PlainTableBuilder&) = delete;
};

}  // namespace rocksdb
40 table/plain_table_factory.cc Normal file
@@ -0,0 +1,40 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#include "table/plain_table_factory.h"

#include <memory>
#include <stdint.h>
#include "db/dbformat.h"
#include "table/plain_table_builder.h"
#include "table/plain_table_reader.h"
#include "port/port.h"

namespace rocksdb {

Status PlainTableFactory::NewTableReader(const Options& options,
                                         const EnvOptions& soptions,
                                         const InternalKeyComparator& icomp,
                                         unique_ptr<RandomAccessFile>&& file,
                                         uint64_t file_size,
                                         unique_ptr<TableReader>* table) const {
  return PlainTableReader::Open(options, soptions, icomp, std::move(file),
                                file_size, table, bloom_bits_per_key_,
                                hash_table_ratio_);
}

TableBuilder* PlainTableFactory::NewTableBuilder(
    const Options& options, const InternalKeyComparator& internal_comparator,
    WritableFile* file, CompressionType compression_type) const {
  return new PlainTableBuilder(options, file, user_key_len_);
}

extern TableFactory* NewPlainTableFactory(uint32_t user_key_len,
                                          int bloom_bits_per_key,
                                          double hash_table_ratio) {
  return new PlainTableFactory(user_key_len, bloom_bits_per_key,
                               hash_table_ratio);
}

}  // namespace rocksdb
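A wiring sketch for the factory, assuming Options exposes a shared_ptr table_factory member in this vintage and that NewPlainTableFactory() is declared in rocksdb/table.h; the parameter values are illustrative. Note that PlainTableReader::Open() asserts allow_mmap_reads, so it must be enabled.

#include "rocksdb/options.h"
#include "rocksdb/table.h"

rocksdb::Options MakePlainTableOptions() {
  rocksdb::Options options;
  // PlainTableReader::Open() asserts allow_mmap_reads.
  options.allow_mmap_reads = true;
  options.table_factory.reset(rocksdb::NewPlainTableFactory(
      0,     // 0 == variable-length user keys here
      10,    // bloom bits per key
      0.75   // hash table ratio: prefixes / buckets
  ));
  return options;
}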
76 table/plain_table_factory.h Normal file
@@ -0,0 +1,76 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#pragma once
#include <memory>
#include <stdint.h>

#include "rocksdb/options.h"
#include "rocksdb/table.h"

namespace rocksdb {

struct Options;
struct EnvOptions;

using std::unique_ptr;
class Status;
class RandomAccessFile;
class WritableFile;
class Table;
class TableBuilder;

// IndexedTable requires a fixed-length key, configured as a constructor
// parameter of the factory class. Output file format:
// +-------------+-----------------+
// | version     | user_key_length |
// +------------++-----------------------------+  <= key1 offset
// | [key_size] | key1       | value_size |    |
// +------------+------------+------------+    |
// | value1                                    |
// |                                           |
// +-------------------------------------+----+  <= key2 offset
// | [key_size] | key2       | value_size |    |
// +------------+------------+------------+    |
// | value2                                    |
// |                                           |
// |        ......                             |
// +-----------------+-------------------------+
// If user_key_length = kPlainTableVariableLength, the keys are variable
// length, and an extra key_size field is encoded before every key.
class PlainTableFactory : public TableFactory {
 public:
  ~PlainTableFactory() {}
  // user_key_len is the length of the user key. If it is set to
  // kPlainTableVariableLength, it means variable length. Otherwise, all
  // the keys need to have the fixed length of this value. bloom_bits_per_key
  // is the number of bits used for the bloom filter per key. hash_table_ratio
  // is the desired utilization of the hash table used for prefix hashing.
  // hash_table_ratio = number of prefixes / #buckets in the hash table
  explicit PlainTableFactory(uint32_t user_key_len = kPlainTableVariableLength,
                             int bloom_bits_per_key = 0,
                             double hash_table_ratio = 0.75)
      : user_key_len_(user_key_len),
        bloom_bits_per_key_(bloom_bits_per_key),
        hash_table_ratio_(hash_table_ratio) {}
  const char* Name() const override { return "PlainTable"; }
  Status NewTableReader(const Options& options, const EnvOptions& soptions,
                        const InternalKeyComparator& internal_comparator,
                        unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
                        unique_ptr<TableReader>* table) const override;
  TableBuilder* NewTableBuilder(const Options& options,
                                const InternalKeyComparator& icomparator,
                                WritableFile* file,
                                CompressionType compression_type) const
      override;

  static const char kValueTypeSeqId0 = 0xFF;

 private:
  uint32_t user_key_len_;
  int bloom_bits_per_key_;
  double hash_table_ratio_;
};

}  // namespace rocksdb
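To make the diagram concrete, here is a self-contained sketch of encoding one variable-length row: a varint32 key size, the key bytes, a varint32 value size, then the value bytes. It deliberately elides the internal-key footer that PlainTableBuilder::Add() writes after the user key, and PutVarint32Sketch is a hypothetical stand-in for the util/coding.h helper.

#include <cstdint>
#include <string>

static void PutVarint32Sketch(std::string* dst, uint32_t v) {
  // LEB128-style varint, mirroring the encoding the format relies on.
  while (v >= 128) {
    dst->push_back(static_cast<char>((v & 0x7f) | 0x80));
    v >>= 7;
  }
  dst->push_back(static_cast<char>(v));
}

std::string EncodeRow(const std::string& key, const std::string& value) {
  std::string row;
  PutVarint32Sketch(&row, static_cast<uint32_t>(key.size()));
  row.append(key);    // key1 in the diagram
  PutVarint32Sketch(&row, static_cast<uint32_t>(value.size()));
  row.append(value);  // value1 in the diagram
  return row;         // the next row begins at the key2 offset
}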
695 table/plain_table_reader.cc Normal file
@@ -0,0 +1,695 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#include "table/plain_table_reader.h"

#include <string>

#include "db/dbformat.h"

#include "rocksdb/cache.h"
#include "rocksdb/comparator.h"
#include "rocksdb/env.h"
#include "rocksdb/filter_policy.h"
#include "rocksdb/options.h"
#include "rocksdb/statistics.h"

#include "table/block.h"
#include "table/filter_block.h"
#include "table/format.h"
#include "table/meta_blocks.h"
#include "table/two_level_iterator.h"
#include "table/plain_table_factory.h"

#include "util/coding.h"
#include "util/dynamic_bloom.h"
#include "util/hash.h"
#include "util/histogram.h"
#include "util/murmurhash.h"
#include "util/perf_context_imp.h"
#include "util/stop_watch.h"


namespace rocksdb {

namespace {

inline uint32_t GetSliceHash(Slice const& s) {
  return Hash(s.data(), s.size(), 397);
}

inline uint32_t GetBucketIdFromHash(uint32_t hash, uint32_t num_buckets) {
  return hash % num_buckets;
}

}  // namespace

// Iterator to iterate over an IndexedTable
class PlainTableIterator : public Iterator {
 public:
  explicit PlainTableIterator(PlainTableReader* table);
  ~PlainTableIterator();

  bool Valid() const;

  void SeekToFirst();

  void SeekToLast();

  void Seek(const Slice& target);

  void Next();

  void Prev();

  Slice key() const;

  Slice value() const;

  Status status() const;

 private:
  PlainTableReader* table_;
  uint32_t offset_;
  uint32_t next_offset_;
  Slice key_;
  Slice value_;
  Status status_;
  std::string tmp_str_;
  // No copying allowed
  PlainTableIterator(const PlainTableIterator&) = delete;
  void operator=(const PlainTableIterator&) = delete;
};

extern const uint64_t kPlainTableMagicNumber;
PlainTableReader::PlainTableReader(const EnvOptions& storage_options,
                                   const InternalKeyComparator& icomparator,
                                   uint64_t file_size, int bloom_bits_per_key,
                                   double hash_table_ratio,
                                   const TableProperties& table_properties)
    : soptions_(storage_options),
      internal_comparator_(icomparator),
      file_size_(file_size),
      kHashTableRatio(hash_table_ratio),
      kBloomBitsPerKey(bloom_bits_per_key),
      table_properties_(table_properties),
      data_end_offset_(table_properties_.data_size),
      user_key_len_(table_properties.fixed_key_len) {}

PlainTableReader::~PlainTableReader() {
  delete[] hash_table_;
  delete[] sub_index_;
  delete bloom_;
}

Status PlainTableReader::Open(const Options& options,
                              const EnvOptions& soptions,
                              const InternalKeyComparator& internal_comparator,
                              unique_ptr<RandomAccessFile>&& file,
                              uint64_t file_size,
                              unique_ptr<TableReader>* table_reader,
                              const int bloom_bits_per_key,
                              double hash_table_ratio) {
  assert(options.allow_mmap_reads);

  if (file_size > kMaxFileSize) {
    return Status::NotSupported("File is too large for PlainTableReader!");
  }

  TableProperties table_properties;
  auto s = ReadTableProperties(file.get(), file_size, kPlainTableMagicNumber,
                               options.env, options.info_log.get(),
                               &table_properties);
  if (!s.ok()) {
    return s;
  }

  std::unique_ptr<PlainTableReader> new_reader(new PlainTableReader(
      soptions, internal_comparator, file_size, bloom_bits_per_key,
      hash_table_ratio, table_properties));
  new_reader->file_ = std::move(file);
  new_reader->options_ = options;

  // -- Populate Index
  s = new_reader->PopulateIndex();
  if (!s.ok()) {
    return s;
  }

  *table_reader = std::move(new_reader);
  return s;
}

void PlainTableReader::SetupForCompaction() {
}

bool PlainTableReader::PrefixMayMatch(const Slice& internal_prefix) {
  return true;
}

Iterator* PlainTableReader::NewIterator(const ReadOptions& options) {
  return new PlainTableIterator(this);
}

struct PlainTableReader::IndexRecord {
  uint32_t hash;    // hash of the prefix
  uint32_t offset;  // offset of a row
  IndexRecord* next;
};

// Helper class to track all the index records
class PlainTableReader::IndexRecordList {
 public:
  explicit IndexRecordList(size_t num_records_per_group)
      : kNumRecordsPerGroup(num_records_per_group),
        current_group_(nullptr),
        num_records_in_current_group_(num_records_per_group) {}

  ~IndexRecordList() {
    for (size_t i = 0; i < groups_.size(); i++) {
      delete[] groups_[i];
    }
  }

  void AddRecord(murmur_t hash, uint32_t offset) {
    if (num_records_in_current_group_ == kNumRecordsPerGroup) {
      current_group_ = AllocateNewGroup();
      num_records_in_current_group_ = 0;
    }
    auto& new_record = current_group_[num_records_in_current_group_++];
    new_record.hash = hash;
    new_record.offset = offset;
    new_record.next = nullptr;
  }

  size_t GetNumRecords() const {
    return (groups_.size() - 1) * kNumRecordsPerGroup +
           num_records_in_current_group_;
  }
  IndexRecord* At(size_t index) {
    return &(groups_[index / kNumRecordsPerGroup][index % kNumRecordsPerGroup]);
  }

 private:
  IndexRecord* AllocateNewGroup() {
    IndexRecord* result = new IndexRecord[kNumRecordsPerGroup];
    groups_.push_back(result);
    return result;
  }

  const size_t kNumRecordsPerGroup;
  IndexRecord* current_group_;
  // List of arrays allocated
  std::vector<IndexRecord*> groups_;
  size_t num_records_in_current_group_;
};

int PlainTableReader::PopulateIndexRecordList(IndexRecordList* record_list) {
  Slice prev_key_prefix_slice;
  uint32_t prev_key_prefix_hash = 0;
  uint32_t pos = data_start_offset_;
  int key_index_within_prefix = 0;
  bool is_first_record = true;
  HistogramImpl keys_per_prefix_hist;

  int num_prefixes = 0;
  while (pos < data_end_offset_) {
    uint32_t key_offset = pos;
    ParsedInternalKey key;
    Slice value_slice;
    status_ = Next(pos, &key, &value_slice, pos);
    Slice key_prefix_slice = GetPrefix(key);

    if (is_first_record || prev_key_prefix_slice != key_prefix_slice) {
      ++num_prefixes;
      if (!is_first_record) {
        keys_per_prefix_hist.Add(key_index_within_prefix);
      }
      key_index_within_prefix = 0;
      prev_key_prefix_slice = key_prefix_slice;
      prev_key_prefix_hash = GetSliceHash(key_prefix_slice);
    }

    if (key_index_within_prefix++ % kIndexIntervalForSamePrefixKeys == 0) {
      // Add an index key for every kIndexIntervalForSamePrefixKeys keys
      record_list->AddRecord(prev_key_prefix_hash, key_offset);
    }
    is_first_record = false;
  }

  keys_per_prefix_hist.Add(key_index_within_prefix);
  Log(options_.info_log, "Number of Keys per prefix Histogram: %s",
      keys_per_prefix_hist.ToString().c_str());

  return num_prefixes;
}

void PlainTableReader::AllocateIndexAndBloom(int num_prefixes) {
  delete[] hash_table_;

  if (kBloomBitsPerKey > 0) {
    bloom_ = new DynamicBloom(num_prefixes * kBloomBitsPerKey);
  }
  double hash_table_size_multiplier =
      (kHashTableRatio > 1.0) ? 1.0 : 1.0 / kHashTableRatio;
  hash_table_size_ = num_prefixes * hash_table_size_multiplier + 1;
  hash_table_ = new uint32_t[hash_table_size_];
}

size_t PlainTableReader::BucketizeIndexesAndFillBloom(
    IndexRecordList& record_list, int num_prefixes,
    std::vector<IndexRecord*>* hash_to_offsets,
    std::vector<uint32_t>* bucket_count) {
  size_t sub_index_size_needed = 0;
  bool first = true;
  uint32_t prev_hash = 0;
  size_t num_records = record_list.GetNumRecords();
  for (size_t i = 0; i < num_records; i++) {
    IndexRecord* index_record = record_list.At(i);
    uint32_t cur_hash = index_record->hash;
    if (first || prev_hash != cur_hash) {
      prev_hash = cur_hash;
      first = false;
      if (bloom_) {
        bloom_->AddHash(cur_hash);
      }
    }
    uint32_t bucket = GetBucketIdFromHash(cur_hash, hash_table_size_);
    IndexRecord* prev_bucket_head = (*hash_to_offsets)[bucket];
    index_record->next = prev_bucket_head;
    (*hash_to_offsets)[bucket] = index_record;
    auto& item_count = (*bucket_count)[bucket];
    if (item_count > 0) {
      if (item_count == 1) {
        sub_index_size_needed += kOffsetLen + 1;
      }
      if (item_count == 127) {
        // Need more than one byte for the length
        sub_index_size_needed++;
      }
      sub_index_size_needed += kOffsetLen;
    }
    item_count++;
  }
  return sub_index_size_needed;
}

void PlainTableReader::FillIndexes(
    size_t sub_index_size_needed,
    const std::vector<IndexRecord*>& hash_to_offsets,
    const std::vector<uint32_t>& bucket_count) {
  Log(options_.info_log, "Reserving %zu bytes for sub index",
      sub_index_size_needed);
  // 8 bytes buffer for variable length size
  size_t buffer_size = 8 * 8;
  size_t buffer_used = 0;
  sub_index_size_needed += buffer_size;
  sub_index_ = new char[sub_index_size_needed];
  size_t sub_index_offset = 0;
  char* prev_ptr;
  char* cur_ptr;
  uint32_t* sub_index_ptr;
  for (int i = 0; i < hash_table_size_; i++) {
    uint32_t num_keys_for_bucket = bucket_count[i];
    switch (num_keys_for_bucket) {
    case 0:
      // No key for the bucket
      hash_table_[i] = data_end_offset_;
      break;
    case 1:
      // point directly to the file offset
      hash_table_[i] = hash_to_offsets[i]->offset;
      break;
    default:
      // point to second level indexes.
      hash_table_[i] = sub_index_offset | kSubIndexMask;
      prev_ptr = sub_index_ + sub_index_offset;
      cur_ptr = EncodeVarint32(prev_ptr, num_keys_for_bucket);
      sub_index_offset += (cur_ptr - prev_ptr);
      if (cur_ptr - prev_ptr > 2
          || (cur_ptr - prev_ptr == 2 && num_keys_for_bucket <= 127)) {
        // Need to resize sub_index. Exponentially grow the buffer.
        buffer_used += cur_ptr - prev_ptr - 1;
        if (buffer_used + 4 > buffer_size) {
          Log(options_.info_log, "Recalculate suffix_map length to %zu",
              sub_index_size_needed);

          sub_index_size_needed += buffer_size;
          buffer_size *= 2;
          char* new_sub_index = new char[sub_index_size_needed];
          memcpy(new_sub_index, sub_index_, sub_index_offset);
          delete[] sub_index_;
          sub_index_ = new_sub_index;
        }
      }
      sub_index_ptr = (uint32_t*) (sub_index_ + sub_index_offset);
      IndexRecord* record = hash_to_offsets[i];
      int j;
      for (j = num_keys_for_bucket - 1; j >= 0 && record;
           j--, record = record->next) {
        sub_index_ptr[j] = record->offset;
      }
      assert(j == -1 && record == nullptr);
      sub_index_offset += kOffsetLen * num_keys_for_bucket;
      break;
    }
  }

  Log(options_.info_log, "hash table size: %d, suffix_map length %zu",
      hash_table_size_, sub_index_size_needed);
}

Status PlainTableReader::PopulateIndex() {
  // Get mmapped memory to file_data_.
  Status s = file_->Read(0, file_size_, &file_data_, nullptr);
  if (!s.ok()) {
    return s;
  }

  IndexRecordList record_list(kRecordsPerGroup);
  // First, read the whole file and, for every kIndexIntervalForSamePrefixKeys
  // rows with the same prefix (starting from the first one), generate a record
  // of (hash, offset) and append it to the IndexRecordList, a data structure
  // created to store them.
  int num_prefixes = PopulateIndexRecordList(&record_list);
  // Calculate the hash table and bloom filter sizes and allocate memory for
  // the indexes and the bloom filter based on the number of prefixes.
  AllocateIndexAndBloom(num_prefixes);

  // Bucketize all the index records into a temp data structure in which, for
  // each bucket, we generate a linked list of IndexRecord, in reversed order.
  std::vector<IndexRecord*> hash_to_offsets(hash_table_size_, nullptr);
  std::vector<uint32_t> bucket_count(hash_table_size_, 0);
  size_t sub_index_size_needed = BucketizeIndexesAndFillBloom(
      record_list, num_prefixes, &hash_to_offsets, &bucket_count);
  // From the temp data structure, populate the indexes.
  FillIndexes(sub_index_size_needed, hash_to_offsets, bucket_count);

  return Status::OK();
}

Status PlainTableReader::GetOffset(const Slice& target, const Slice& prefix,
                                   uint32_t prefix_hash, bool& prefix_matched,
                                   uint32_t& ret_offset) {
  prefix_matched = false;
  int bucket = GetBucketIdFromHash(prefix_hash, hash_table_size_);
  uint32_t bucket_value = hash_table_[bucket];
  if (bucket_value == data_end_offset_) {
    ret_offset = data_end_offset_;
    return Status::OK();
  } else if ((bucket_value & kSubIndexMask) == 0) {
    // point directly to the file
    ret_offset = bucket_value;
    return Status::OK();
  }

  // points to the sub-index; need to do a binary search
  uint32_t low = 0;
  uint64_t prefix_index_offset = bucket_value ^ kSubIndexMask;

  const char* index_ptr = sub_index_ + prefix_index_offset;
  uint32_t upper_bound = 0;
  const uint32_t* base_ptr = (const uint32_t*) GetVarint32Ptr(index_ptr,
                                                              index_ptr + 4,
                                                              &upper_bound);
  uint32_t high = upper_bound;
  ParsedInternalKey mid_key;
  ParsedInternalKey parsed_target;
  if (!ParseInternalKey(target, &parsed_target)) {
    return Status::Corruption(Slice());
  }

  // The key is in [low, high). Do a binary search within it.
  while (high - low > 1) {
    uint32_t mid = (high + low) / 2;
    uint32_t file_offset = base_ptr[mid];
    size_t tmp;
    Status s = ReadKey(file_data_.data() + file_offset, &mid_key, tmp);
    if (!s.ok()) {
      return s;
    }
    int cmp_result = internal_comparator_.Compare(mid_key, parsed_target);
    if (cmp_result < 0) {
      low = mid;
    } else {
      if (cmp_result == 0) {
        // Happened to find the exact key, or target is smaller than the
        // first key after base_offset.
        prefix_matched = true;
        ret_offset = file_offset;
        return Status::OK();
      } else {
        high = mid;
      }
    }
  }
  // Both the key at position low and the one at low+1 could share the same
  // prefix as target. We need to rule out one of them to avoid going to the
  // wrong prefix.
  ParsedInternalKey low_key;
  size_t tmp;
  uint32_t low_key_offset = base_ptr[low];
  Status s = ReadKey(file_data_.data() + low_key_offset, &low_key, tmp);
  if (!s.ok()) {
    return s;
  }
  if (GetPrefix(low_key) == prefix) {
    prefix_matched = true;
    ret_offset = low_key_offset;
  } else if (low + 1 < upper_bound) {
    // There is possibly a next prefix; return it
    prefix_matched = false;
    ret_offset = base_ptr[low + 1];
  } else {
    // target is larger than any key of the last prefix in this bucket
    // but has a different prefix. The key does not exist.
    ret_offset = data_end_offset_;
  }
  return Status::OK();
}

bool PlainTableReader::MayHavePrefix(uint32_t hash) {
  return bloom_ == nullptr || bloom_->MayContainHash(hash);
}

Slice PlainTableReader::GetPrefix(const ParsedInternalKey& target) {
  return options_.prefix_extractor->Transform(target.user_key);
}

Status PlainTableReader::ReadKey(const char* row_ptr, ParsedInternalKey* key,
                                 size_t& bytes_read) {
  const char* key_ptr = nullptr;
  bytes_read = 0;
  size_t user_key_size = 0;
  if (IsFixedLength()) {
    user_key_size = user_key_len_;
    key_ptr = row_ptr;
  } else {
    uint32_t tmp_size = 0;
    key_ptr = GetVarint32Ptr(row_ptr, file_data_.data() + data_end_offset_,
                             &tmp_size);
    if (key_ptr == nullptr) {
      return Status::Corruption("Unable to read the next key");
    }
    user_key_size = (size_t)tmp_size;
    bytes_read = key_ptr - row_ptr;
  }
  if (key_ptr + user_key_size + 1 >= file_data_.data() + data_end_offset_) {
    return Status::Corruption("Unable to read the next key");
  }

  if (*(key_ptr + user_key_size) == PlainTableFactory::kValueTypeSeqId0) {
    // Special encoding for a row with seqID=0
    key->user_key = Slice(key_ptr, user_key_size);
    key->sequence = 0;
    key->type = kTypeValue;
    bytes_read += user_key_size + 1;
  } else {
    if (row_ptr + user_key_size + 8 >= file_data_.data() + data_end_offset_) {
      return Status::Corruption("Unable to read the next key");
    }
    if (!ParseInternalKey(Slice(key_ptr, user_key_size + 8), key)) {
      return Status::Corruption(Slice());
    }
    bytes_read += user_key_size + 8;
  }

  return Status::OK();
}

Status PlainTableReader::Next(uint32_t offset, ParsedInternalKey* key,
                              Slice* value, uint32_t& next_offset) {
  if (offset == data_end_offset_) {
    next_offset = data_end_offset_;
    return Status::OK();
  }

  if (offset > data_end_offset_) {
    return Status::Corruption("Offset is out of file size");
  }

  const char* row_ptr = file_data_.data() + offset;
  size_t bytes_for_key;
  Status s = ReadKey(row_ptr, key, bytes_for_key);
  if (!s.ok()) {
    return s;
  }
  uint32_t value_size;
  const char* value_ptr = GetVarint32Ptr(row_ptr + bytes_for_key,
                                         file_data_.data() + data_end_offset_,
                                         &value_size);
  if (value_ptr == nullptr) {
    return Status::Corruption("Error reading value length.");
  }
  next_offset = offset + (value_ptr - row_ptr) + value_size;
  if (next_offset > data_end_offset_) {
    return Status::Corruption("Reached end of file when reading value");
  }
  *value = Slice(value_ptr, value_size);

  return Status::OK();
}

Status PlainTableReader::Get(const ReadOptions& ro, const Slice& target,
                             void* arg,
                             bool (*saver)(void*, const ParsedInternalKey&,
                                           const Slice&, bool),
                             void (*mark_key_may_exist)(void*)) {
  // Check the bloom filter first.
  Slice prefix_slice = GetPrefix(target);
  uint32_t prefix_hash = GetSliceHash(prefix_slice);
  if (!MayHavePrefix(prefix_hash)) {
    return Status::OK();
  }
  uint32_t offset;
  bool prefix_match;
  Status s = GetOffset(target, prefix_slice, prefix_hash, prefix_match, offset);
  if (!s.ok()) {
    return s;
  }
  ParsedInternalKey found_key;
  ParsedInternalKey parsed_target;
  if (!ParseInternalKey(target, &parsed_target)) {
    return Status::Corruption(Slice());
  }

  Slice found_value;
  while (offset < data_end_offset_) {
    Status s = Next(offset, &found_key, &found_value, offset);
    if (!s.ok()) {
      return s;
    }
    if (!prefix_match) {
      // Need to verify the prefix of the first key found if it has not yet
      // been checked.
      if (GetPrefix(found_key) != prefix_slice) {
        return Status::OK();
      }
      prefix_match = true;
    }
    if (internal_comparator_.Compare(found_key, parsed_target) >= 0) {
      if (!(*saver)(arg, found_key, found_value, true)) {
        break;
      }
    }
  }
  return Status::OK();
}

uint64_t PlainTableReader::ApproximateOffsetOf(const Slice& key) {
  return 0;
}

PlainTableIterator::PlainTableIterator(PlainTableReader* table) :
    table_(table) {
  next_offset_ = offset_ = table_->data_end_offset_;
}

PlainTableIterator::~PlainTableIterator() {
}

bool PlainTableIterator::Valid() const {
  return offset_ < table_->data_end_offset_
      && offset_ >= table_->data_start_offset_;
}

void PlainTableIterator::SeekToFirst() {
  next_offset_ = table_->data_start_offset_;
  if (next_offset_ >= table_->data_end_offset_) {
    next_offset_ = offset_ = table_->data_end_offset_;
  } else {
    Next();
  }
}

void PlainTableIterator::SeekToLast() {
  assert(false);
}

void PlainTableIterator::Seek(const Slice& target) {
  Slice prefix_slice = table_->GetPrefix(target);
  uint32_t prefix_hash = GetSliceHash(prefix_slice);
  if (!table_->MayHavePrefix(prefix_hash)) {
    offset_ = next_offset_ = table_->data_end_offset_;
    return;
  }
  bool prefix_match;
  status_ = table_->GetOffset(target, prefix_slice, prefix_hash, prefix_match,
                              next_offset_);
  if (!status_.ok()) {
    offset_ = next_offset_ = table_->data_end_offset_;
    return;
  }

  if (next_offset_ < table_->data_end_offset_) {
    for (Next(); status_.ok() && Valid(); Next()) {
      if (!prefix_match) {
        // Need to verify the first key's prefix
        if (table_->GetPrefix(key()) != prefix_slice) {
          offset_ = next_offset_ = table_->data_end_offset_;
          break;
        }
        prefix_match = true;
      }
      if (table_->internal_comparator_.Compare(key(), target) >= 0) {
        break;
      }
    }
  } else {
    offset_ = table_->data_end_offset_;
  }
}

void PlainTableIterator::Next() {
  offset_ = next_offset_;
  if (offset_ < table_->data_end_offset_) {
    Slice tmp_slice;
    ParsedInternalKey parsed_key;
    status_ = table_->Next(next_offset_, &parsed_key, &value_, next_offset_);
    if (status_.ok()) {
      // Make a copy in this case. TODO: optimize.
      tmp_str_.clear();
      AppendInternalKey(&tmp_str_, parsed_key);
      key_ = Slice(tmp_str_);
    } else {
      offset_ = next_offset_ = table_->data_end_offset_;
    }
  }
}

void PlainTableIterator::Prev() {
  assert(false);
}

Slice PlainTableIterator::key() const {
  assert(Valid());
  return key_;
}

Slice PlainTableIterator::value() const {
  assert(Valid());
  return value_;
}

Status PlainTableIterator::status() const {
  return status_;
}

}  // namespace rocksdb
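GetOffset() above branches on the top bit of each hash_table_ bucket value. The sketch below makes that decode explicit; DecodeBucketValue is a hypothetical helper, the mask mirrors kSubIndexMask, and note that the real function first checks the empty-bucket sentinel (data_end_offset_) before this decode.

#include <cstdint>

struct BucketTarget {
  bool points_to_sub_index;  // true: offset indexes sub_index_; false: file
  uint32_t offset;           // lower 31 bits of the bucket value
};

BucketTarget DecodeBucketValue(uint32_t bucket_value) {
  const uint32_t kSubIndexMaskSketch = 0x80000000;  // same as kSubIndexMask
  BucketTarget t;
  t.points_to_sub_index = (bucket_value & kSubIndexMaskSketch) != 0;
  t.offset = bucket_value & ~kSubIndexMaskSketch;  // equals ^ mask when set
  return t;
}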
220 table/plain_table_reader.h Normal file
@@ -0,0 +1,220 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#pragma once
#include <unordered_map>
#include <memory>
#include <vector>
#include <string>
#include <stdint.h>

#include "db/dbformat.h"
#include "rocksdb/env.h"
#include "rocksdb/iterator.h"
#include "rocksdb/slice_transform.h"
#include "rocksdb/table.h"
#include "rocksdb/table_properties.h"
#include "table/table_reader.h"
#include "table/plain_table_factory.h"

namespace rocksdb {

class Block;
class BlockHandle;
class Footer;
struct Options;
class RandomAccessFile;
struct ReadOptions;
class TableCache;
class TableReader;
class DynamicBloom;
class InternalKeyComparator;

using std::unique_ptr;
using std::unordered_map;
extern const uint32_t kPlainTableVariableLength;

// Based on the output file format shown in plain_table_factory.h.
// When opening the output file, PlainTableReader creates a hash table
// from key prefixes to offsets into the output file. Each bucket either
// points to the data offset of the first key with that prefix or, if too
// many keys share the prefix, to a binary-searchable index from key suffix
// to file offset, kept on disk.
//
// The implementation of PlainTableReader requires the output file to be
// mmapped.
class PlainTableReader: public TableReader {
 public:
  static Status Open(const Options& options, const EnvOptions& soptions,
                     const InternalKeyComparator& internal_comparator,
                     unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
                     unique_ptr<TableReader>* table,
                     const int bloom_bits_per_key, double hash_table_ratio);

  bool PrefixMayMatch(const Slice& internal_prefix);

  Iterator* NewIterator(const ReadOptions&);

  Status Get(const ReadOptions&, const Slice& key, void* arg,
             bool (*result_handler)(void* arg, const ParsedInternalKey& k,
                                    const Slice& v, bool),
             void (*mark_key_may_exist)(void*) = nullptr);

  uint64_t ApproximateOffsetOf(const Slice& key);

  void SetupForCompaction();

  const TableProperties& GetTableProperties() { return table_properties_; }

  PlainTableReader(const EnvOptions& storage_options,
                   const InternalKeyComparator& internal_comparator,
                   uint64_t file_size, int bloom_num_bits,
                   double hash_table_ratio,
                   const TableProperties& table_properties);
  ~PlainTableReader();

 private:
  struct IndexRecord;
  class IndexRecordList;

  uint32_t* hash_table_ = nullptr;
  int hash_table_size_ = 0;
  char* sub_index_ = nullptr;

  Options options_;
  const EnvOptions& soptions_;
  const InternalKeyComparator internal_comparator_;
  Status status_;
  unique_ptr<RandomAccessFile> file_;

  Slice file_data_;
  uint32_t version_;
  uint32_t file_size_;

  const double kHashTableRatio;
  const int kBloomBitsPerKey;
  DynamicBloom* bloom_ = nullptr;

  TableProperties table_properties_;
  const uint32_t data_start_offset_ = 0;
  const uint32_t data_end_offset_;
  const size_t user_key_len_;

  static const size_t kNumInternalBytes = 8;
  static const uint32_t kSubIndexMask = 0x80000000;
  static const size_t kOffsetLen = sizeof(uint32_t);
  static const uint64_t kMaxFileSize = 1u << 31;
  static const size_t kRecordsPerGroup = 256;
  // To speed up the search for keys sharing the same prefix, we add an index
  // key for every N keys, where N is kIndexIntervalForSamePrefixKeys.
  static const size_t kIndexIntervalForSamePrefixKeys = 16;

  bool IsFixedLength() const {
    return user_key_len_ != kPlainTableVariableLength;
  }

  size_t GetFixedInternalKeyLength() const {
    return user_key_len_ + kNumInternalBytes;
  }

  friend class TableCache;
  friend class PlainTableIterator;

  // Internal helper function to generate an IndexRecordList object from all
  // the rows; it contains the index records as a list.
  int PopulateIndexRecordList(IndexRecordList* record_list);

  // Internal helper function to allocate memory for the indexes and bloom
  // filter.
  void AllocateIndexAndBloom(int num_prefixes);

  // Internal helper function to bucketize the index record list into hash
  // buckets.
  // hash_to_offsets has hash_table_size_ entries; each holds a linked list
  // of offsets for that hash, in reverse order.
  // bucket_count has hash_table_size_ entries; each value is the number of
  // index records in hash_to_offsets for the same bucket.
  size_t BucketizeIndexesAndFillBloom(
      IndexRecordList& record_list, int num_prefixes,
      std::vector<IndexRecord*>* hash_to_offsets,
      std::vector<uint32_t>* bucket_count);

  // Internal helper function to fill the indexes and bloom filters into the
  // internal data structures. hash_to_offsets and bucket_count are the
  // bucketized indexes and counts generated by BucketizeIndexesAndFillBloom().
  void FillIndexes(size_t sub_index_size_needed,
                   const std::vector<IndexRecord*>& hash_to_offsets,
                   const std::vector<uint32_t>& bucket_count);

  // PopulateIndex() builds the index of keys. It must be called before any
  // query to the table.
  //
  // hash_table_ contains hash_table_size_ buckets, each a 32-bit integer.
  // The lower 31 bits contain an offset value (explained below) and the top
  // bit of the integer indicates the type of the offset:
  //
  // +--------------+------------------------------------------------------+
  // | Flag (1 bit) | Offset to binary search buffer or file (31 bits)     +
  // +--------------+------------------------------------------------------+
  //
  // Explanation of the "flag bit":
  //
  // 0 indicates that the bucket contains only one prefix (no conflict when
  //   hashing this prefix), whose first row starts at this offset of the
  //   file.
  // 1 indicates that the bucket contains more than one prefix, or that there
  //   are too many rows for one prefix, so a binary search is needed. In
  //   this case, the offset indicates the offset into sub_index_ holding the
  //   binary search indexes of keys for those rows. Those binary search
  //   indexes are organized as follows:
  //
  // A varint32 first indicates how many records (N) are stored after it.
  // Then come N 32-bit integers, each the file offset of the start of a row.
  // Those offsets are guaranteed to be in ascending order, so the keys they
  // point to are also in ascending order, which lets us binary search over
  // them. Below is a visual presentation of a bucket:
  //
  // <begin>
  //   number_of_records:  varint32
  //   record 1 file offset:  fixedint32
  //   record 2 file offset:  fixedint32
  //   ....
  //   record N file offset:  fixedint32
  // <end>
  Status PopulateIndex();

  // Check the bloom filter to see whether it might contain this prefix.
  // The hash of the prefix is given, since it can be reused for the index
  // lookup too.
  bool MayHavePrefix(uint32_t hash);

  Status ReadKey(const char* row_ptr, ParsedInternalKey* key,
                 size_t& bytes_read);
  // Read the key and value at offset into *key and *value.
  // next_offset is returned as the offset of the next key.
  Status Next(uint32_t offset, ParsedInternalKey* key, Slice* value,
              uint32_t& next_offset);
  // Get the file offset for the key target.
  // The return value prefix_matched is set to true if the offset is confirmed
  // to be for a key with the same prefix as target.
  Status GetOffset(const Slice& target, const Slice& prefix,
                   uint32_t prefix_hash, bool& prefix_matched,
                   uint32_t& ret_offset);

  Slice GetPrefix(const Slice& target) {
    assert(target.size() >= 8);  // target is an internal key
    return options_.prefix_extractor->Transform(
        Slice(target.data(), target.size() - 8));
  }

  Slice GetPrefix(const ParsedInternalKey& target);

  // No copying allowed
  PlainTableReader(const PlainTableReader&) = delete;
  void operator=(const PlainTableReader&) = delete;
};
}  // namespace rocksdb
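Matching the bucket layout documented above, here is a self-contained sketch that encodes one sub-index blob: a varint32 record count followed by N fixed 32-bit row offsets in ascending order. The Put* helpers are hypothetical stand-ins for the util/coding.h routines.

#include <cstdint>
#include <cstring>
#include <string>
#include <vector>

static void PutVarint32Sketch(std::string* dst, uint32_t v) {
  while (v >= 128) {
    dst->push_back(static_cast<char>((v & 0x7f) | 0x80));
    v >>= 7;
  }
  dst->push_back(static_cast<char>(v));
}

static void PutFixed32Sketch(std::string* dst, uint32_t v) {
  char buf[4];
  memcpy(buf, &v, 4);  // native byte order, as the uint32_t* writes above use
  dst->append(buf, 4);
}

std::string EncodeBucketBlob(const std::vector<uint32_t>& row_offsets) {
  // row_offsets must already be ascending so readers can binary search.
  std::string blob;
  PutVarint32Sketch(&blob, static_cast<uint32_t>(row_offsets.size()));
  for (uint32_t offset : row_offsets) {
    PutFixed32Sketch(&blob, offset);
  }
  return blob;
}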
55 table/table_builder.h Normal file
@@ -0,0 +1,55 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#pragma once
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
class Slice;
|
||||
class Status;
|
||||
|
||||
// TableBuilder provides the interface used to build a Table
|
||||
// (an immutable and sorted map from keys to values).
|
||||
//
|
||||
// Multiple threads can invoke const methods on a TableBuilder without
|
||||
// external synchronization, but if any of the threads may call a
|
||||
// non-const method, all threads accessing the same TableBuilder must use
|
||||
// external synchronization.
|
||||
class TableBuilder {
|
||||
public:
|
||||
// REQUIRES: Either Finish() or Abandon() has been called.
|
||||
virtual ~TableBuilder() {}
|
||||
|
||||
// Add key,value to the table being constructed.
|
||||
// REQUIRES: key is after any previously added key according to comparator.
|
||||
// REQUIRES: Finish(), Abandon() have not been called
|
||||
virtual void Add(const Slice& key, const Slice& value) = 0;
|
||||
|
||||
// Return non-ok iff some error has been detected.
|
||||
virtual Status status() const = 0;
|
||||
|
||||
// Finish building the table.
|
||||
// REQUIRES: Finish(), Abandon() have not been called
|
||||
virtual Status Finish() = 0;
|
||||
|
||||
// Indicate that the contents of this builder should be abandoned.
|
||||
// If the caller is not going to call Finish(), it must call Abandon()
|
||||
// before destroying this builder.
|
||||
// REQUIRES: Finish(), Abandon() have not been called
|
||||
virtual void Abandon() = 0;
|
||||
|
||||
// Number of calls to Add() so far.
|
||||
virtual uint64_t NumEntries() const = 0;
|
||||
|
||||
// Size of the file generated so far. If invoked after a successful
|
||||
// Finish() call, returns the size of the final generated file.
|
||||
virtual uint64_t FileSize() const = 0;
|
||||
};
|
||||
|
||||
} // namespace rocksdb
|
||||
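To make the Add/Finish/Abandon contract above concrete, a minimal usage sketch; BuildTable and its pre-sorted `entries` input are hypothetical, not taken from the RocksDB tests.

#include <string>
#include <utility>
#include <vector>
#include "rocksdb/slice.h"
#include "rocksdb/status.h"
#include "table/table_builder.h"

// `entries` must already be sorted by the table's key comparator, since
// Add() requires keys in increasing order.
rocksdb::Status BuildTable(
    rocksdb::TableBuilder* builder,
    const std::vector<std::pair<std::string, std::string>>& entries) {
  for (const auto& kv : entries) {
    builder->Add(kv.first, kv.second);
    if (!builder->status().ok()) {
      builder->Abandon();  // required if Finish() will not be called
      return builder->status();
    }
  }
  // Exactly one of Finish()/Abandon() must run before the builder is destroyed.
  return builder->Finish();
}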
114 table/table_properties.cc Normal file
@@ -0,0 +1,114 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.

#include "rocksdb/table_properties.h"

namespace rocksdb {

namespace {
void AppendProperty(
    std::string& props,
    const std::string& key,
    const std::string& value,
    const std::string& prop_delim,
    const std::string& kv_delim) {
  props.append(key);
  props.append(kv_delim);
  props.append(value);
  props.append(prop_delim);
}

template <class TValue>
void AppendProperty(
    std::string& props,
    const std::string& key,
    const TValue& value,
    const std::string& prop_delim,
    const std::string& kv_delim) {
  AppendProperty(
      props, key, std::to_string(value), prop_delim, kv_delim
  );
}
}  // anonymous namespace

std::string TableProperties::ToString(
    const std::string& prop_delim,
    const std::string& kv_delim) const {
  std::string result;
  result.reserve(1024);

  // Basic Info
  AppendProperty(
      result, "# data blocks", num_data_blocks, prop_delim, kv_delim
  );
  AppendProperty(result, "# entries", num_entries, prop_delim, kv_delim);

  AppendProperty(result, "raw key size", raw_key_size, prop_delim, kv_delim);
  AppendProperty(
      result,
      "raw average key size",
      num_entries != 0 ? 1.0 * raw_key_size / num_entries : 0.0,
      prop_delim,
      kv_delim
  );
  AppendProperty(
      result, "raw value size", raw_value_size, prop_delim, kv_delim
  );
  AppendProperty(
      result,
      "raw average value size",
      num_entries != 0 ? 1.0 * raw_value_size / num_entries : 0.0,
      prop_delim,
      kv_delim
  );

  AppendProperty(result, "data block size", data_size, prop_delim, kv_delim);
  AppendProperty(result, "index block size", index_size, prop_delim, kv_delim);
  AppendProperty(
      result, "filter block size", filter_size, prop_delim, kv_delim
  );
  AppendProperty(
      result,
      "(estimated) table size",
      data_size + index_size + filter_size,
      prop_delim,
      kv_delim
  );

  AppendProperty(
      result,
      "filter policy name",
      filter_policy_name.empty() ? std::string("N/A") : filter_policy_name,
      prop_delim,
      kv_delim
  );

  return result;
}

const std::string TablePropertiesNames::kDataSize =
    "rocksdb.data.size";
const std::string TablePropertiesNames::kIndexSize =
    "rocksdb.index.size";
const std::string TablePropertiesNames::kFilterSize =
    "rocksdb.filter.size";
const std::string TablePropertiesNames::kRawKeySize =
    "rocksdb.raw.key.size";
const std::string TablePropertiesNames::kRawValueSize =
    "rocksdb.raw.value.size";
const std::string TablePropertiesNames::kNumDataBlocks =
    "rocksdb.num.data.blocks";
const std::string TablePropertiesNames::kNumEntries =
    "rocksdb.num.entries";
const std::string TablePropertiesNames::kFilterPolicy =
    "rocksdb.filter.policy";
const std::string TablePropertiesNames::kFormatVersion =
    "rocksdb.format.version";
const std::string TablePropertiesNames::kFixedKeyLen =
    "rocksdb.fixed.key.length";

extern const std::string kPropertiesBlock = "rocksdb.properties";

} // namespace rocksdb
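A quick usage sketch of ToString(); the delimiter arguments shown here ("; " and "=") are illustrative choices, not the declared defaults, and DumpTableProperties is a hypothetical helper.

#include <cstdio>
#include <string>
#include "rocksdb/table_properties.h"

// Prints something like: "# data blocks=12; # entries=4096; ..."
void DumpTableProperties(const rocksdb::TableProperties& props) {
  std::string s = props.ToString("; ", "=");
  printf("%s\n", s.c_str());
}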
71 table/table_reader.h Normal file
@@ -0,0 +1,71 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#pragma once

#include <stdint.h>

namespace rocksdb {

class Iterator;
struct ParsedInternalKey;
class Slice;
class Status;
struct ReadOptions;
struct TableProperties;

// A Table is a sorted map from strings to strings. Tables are
// immutable and persistent. A Table may be safely accessed from
// multiple threads without external synchronization.
class TableReader {
 public:
  virtual ~TableReader() {}

  // Determine whether there is a chance that the current table file
  // contains a key starting with internal_prefix. The specific
  // table implementation can use a bloom filter and/or other heuristics
  // to filter out this table as a whole.
  virtual bool PrefixMayMatch(const Slice& internal_prefix) = 0;

  // Returns a new iterator over the table contents.
  // The result of NewIterator() is initially invalid (caller must
  // call one of the Seek methods on the iterator before using it).
  virtual Iterator* NewIterator(const ReadOptions&) = 0;

  // Given a key, return an approximate byte offset in the file where
  // the data for that key begins (or would begin if the key were
  // present in the file). The returned value is in terms of file
  // bytes, and so includes effects like compression of the underlying data.
  // E.g., the approximate offset of the last key in the table will
  // be close to the file length.
  virtual uint64_t ApproximateOffsetOf(const Slice& key) = 0;

  // Set up the table for compaction. Might change some parameters with
  // posix_fadvise
  virtual void SetupForCompaction() = 0;

  virtual const TableProperties& GetTableProperties() = 0;

  // Calls (*result_handler)(handle_context, ...) repeatedly, starting with
  // the entry found after a call to Seek(key), until result_handler returns
  // false, where k is the actual internal key for a row found and v is the
  // value of the key. didIO is true if I/O is involved in the operation. May
  // not make such a call if the filter policy says that the key is not
  // present.
  //
  // mark_key_may_exist_handler needs to be called when it is configured to be
  // memory only and the key is not found in the block cache, with
  // the parameter to be handle_context.
  //
  // readOptions is the options for the read
  // key is the key to search for
  virtual Status Get(
      const ReadOptions& readOptions, const Slice& key, void* handle_context,
      bool (*result_handler)(void* arg, const ParsedInternalKey& k,
                             const Slice& v, bool didIO),
      void (*mark_key_may_exist_handler)(void* handle_context) = nullptr) = 0;
};

} // namespace rocksdb
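To illustrate the saver-callback contract of Get() above, a sketch of a result_handler; SaveContext and SaveFirstValue are hypothetical names modeled on how callers typically consume this interface, not copied from the actual db code.

#include <string>
#include "db/dbformat.h"
#include "rocksdb/slice.h"

struct SaveContext {
  std::string user_key;  // target user key we are looking for
  std::string value;
  bool found = false;
};

// Passed to TableReader::Get() as result_handler; returning false stops the scan.
static bool SaveFirstValue(void* arg, const rocksdb::ParsedInternalKey& k,
                           const rocksdb::Slice& v, bool /*didIO*/) {
  SaveContext* ctx = static_cast<SaveContext*>(arg);
  if (k.user_key != rocksdb::Slice(ctx->user_key)) {
    return false;  // scanned past the target key: stop
  }
  if (k.type == rocksdb::kTypeValue) {
    ctx->value.assign(v.data(), v.size());
    ctx->found = true;
  }
  return false;  // newest entry for the key is first; no need to continue
}

A caller would then invoke reader->Get(read_options, lookup_key, &ctx, SaveFirstValue) and check ctx.found afterwards.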
@@ -6,12 +6,13 @@
 #include <gflags/gflags.h>

 #include "rocksdb/db.h"
-#include "rocksdb/table.h"
 #include "rocksdb/slice_transform.h"
+#include "rocksdb/table.h"
 #include "db/db_impl.h"
 #include "db/dbformat.h"
 #include "port/atomic_pointer.h"
 #include "table/block_based_table_factory.h"
+#include "table/plain_table_factory.h"
 #include "util/histogram.h"
 #include "util/testharness.h"
 #include "util/testutil.h"
@@ -33,8 +34,8 @@ static std::string MakeKey(int i, int j, bool through_db) {
   return key.Encode().ToString();
 }

-static bool DummySaveValue(void* arg, const Slice& ikey, const Slice& v,
-                           bool didIO) {
+static bool DummySaveValue(void* arg, const ParsedInternalKey& ikey,
+                           const Slice& v, bool didIO) {
   return false;
 }

@@ -70,7 +71,7 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options,
   Status s;
   if (!through_db) {
     env->NewWritableFile(file_name, &file, env_options);
-    tb = opts.table_factory->GetTableBuilder(opts, file.get(),
+    tb = opts.table_factory->NewTableBuilder(opts, file.get(),
                                              CompressionType::kNoCompression);
   } else {
     s = DB::Open(opts, dbname, &db);
@@ -101,7 +102,7 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options,
     Status s = env->NewRandomAccessFile(file_name, &raf, env_options);
     uint64_t file_size;
     env->GetFileSize(file_name, &file_size);
-    s = opts.table_factory->GetTableReader(opts, env_options, std::move(raf),
+    s = opts.table_factory->NewTableReader(opts, env_options, std::move(raf),
                                            file_size, &table_reader);
   }

@@ -218,6 +219,8 @@ DEFINE_bool(iterator, false, "For test iterator");
 DEFINE_bool(through_db, false, "If enable, a DB instance will be created and "
             "the query will be against DB. Otherwise, will be directly against "
             "a table reader.");
+DEFINE_bool(plain_table, false, "Use PlainTable");
+

 int main(int argc, char** argv) {
   google::SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) +
@@ -230,10 +233,23 @@ int main(int argc, char** argv) {
     options.prefix_extractor = rocksdb::NewFixedPrefixTransform(
         FLAGS_prefix_len);
   }
-  options.SetUpDefaultFlushBlockPolicyFactory();
   rocksdb::ReadOptions ro;
   rocksdb::EnvOptions env_options;
   options.create_if_missing = true;
   options.compression = rocksdb::CompressionType::kNoCompression;
+  options.internal_comparator =
+      new rocksdb::InternalKeyComparator(options.comparator);
+
+  if (FLAGS_plain_table) {
+    options.allow_mmap_reads = true;
+    env_options.use_mmap_reads = true;
+    tf = new rocksdb::PlainTableFactory(16, (FLAGS_prefix_len == 16) ? 0 : 8,
+                                        0.75);
+    options.prefix_extractor = rocksdb::NewFixedPrefixTransform(
+        FLAGS_prefix_len);
+  } else {
+    tf = new rocksdb::BlockBasedTableFactory();
+  }
   options.table_factory =
       std::shared_ptr<rocksdb::TableFactory>(tf);
   TableReaderBenchmark(options, env_options, ro, FLAGS_num_keys1,
(File diff suppressed because it is too large)
@@ -20,18 +20,17 @@ namespace rocksdb {
 namespace {

 typedef Iterator* (*BlockFunction)(void*, const ReadOptions&,
-                                   const EnvOptions& soptions, const Slice&,
-                                   bool for_compaction);
+                                   const EnvOptions& soptions,
+                                   const InternalKeyComparator& icomparator,
+                                   const Slice&, bool for_compaction);

 class TwoLevelIterator: public Iterator {
  public:
-  TwoLevelIterator(
-      Iterator* index_iter,
-      BlockFunction block_function,
-      void* arg,
-      const ReadOptions& options,
-      const EnvOptions& soptions,
-      bool for_compaction);
+  TwoLevelIterator(Iterator* index_iter, BlockFunction block_function,
+                   void* arg, const ReadOptions& options,
+                   const EnvOptions& soptions,
+                   const InternalKeyComparator& internal_comparator,
+                   bool for_compaction);

   virtual ~TwoLevelIterator();

@@ -76,6 +75,7 @@ class TwoLevelIterator: public Iterator {
   void* arg_;
   const ReadOptions options_;
   const EnvOptions& soptions_;
+  const InternalKeyComparator& internal_comparator_;
   Status status_;
   IteratorWrapper index_iter_;
   IteratorWrapper data_iter_; // May be nullptr
@@ -86,20 +86,17 @@ class TwoLevelIterator: public Iterator {
 };

 TwoLevelIterator::TwoLevelIterator(
-    Iterator* index_iter,
-    BlockFunction block_function,
-    void* arg,
-    const ReadOptions& options,
-    const EnvOptions& soptions,
-    bool for_compaction)
+    Iterator* index_iter, BlockFunction block_function, void* arg,
+    const ReadOptions& options, const EnvOptions& soptions,
+    const InternalKeyComparator& internal_comparator, bool for_compaction)
     : block_function_(block_function),
       arg_(arg),
       options_(options),
       soptions_(soptions),
+      internal_comparator_(internal_comparator),
       index_iter_(index_iter),
       data_iter_(nullptr),
-      for_compaction_(for_compaction) {
-}
+      for_compaction_(for_compaction) {}

 TwoLevelIterator::~TwoLevelIterator() {
 }
@@ -181,8 +178,9 @@ void TwoLevelIterator::InitDataBlock() {
       // data_iter_ is already constructed with this iterator, so
       // no need to change anything
     } else {
-      Iterator* iter = (*block_function_)(arg_, options_, soptions_, handle,
-                                          for_compaction_);
+      Iterator* iter =
+          (*block_function_)(arg_, options_, soptions_, internal_comparator_,
+                             handle, for_compaction_);
       data_block_handle_.assign(handle.data(), handle.size());
       SetDataIterator(iter);
     }
@@ -191,15 +189,14 @@ void TwoLevelIterator::InitDataBlock() {

 } // namespace

-Iterator* NewTwoLevelIterator(
-    Iterator* index_iter,
-    BlockFunction block_function,
-    void* arg,
-    const ReadOptions& options,
-    const EnvOptions& soptions,
-    bool for_compaction) {
-  return new TwoLevelIterator(index_iter, block_function, arg,
-                              options, soptions, for_compaction);
+Iterator* NewTwoLevelIterator(Iterator* index_iter,
+                              BlockFunction block_function, void* arg,
+                              const ReadOptions& options,
+                              const EnvOptions& soptions,
+                              const InternalKeyComparator& internal_comparator,
+                              bool for_compaction) {
+  return new TwoLevelIterator(index_iter, block_function, arg, options,
+                              soptions, internal_comparator, for_compaction);
 }

 } // namespace rocksdb
@@ -14,6 +14,7 @@
 namespace rocksdb {

 struct ReadOptions;
+class InternalKeyComparator;

 // Return a new two level iterator. A two-level iterator contains an
 // index iterator whose values point to a sequence of blocks where
@@ -27,14 +28,11 @@ struct ReadOptions;
 extern Iterator* NewTwoLevelIterator(
     Iterator* index_iter,
     Iterator* (*block_function)(
-        void* arg,
-        const ReadOptions& options,
-        const EnvOptions& soptions,
-        const Slice& index_value,
-        bool for_compaction),
-    void* arg,
-    const ReadOptions& options,
-    const EnvOptions& soptions,
+        void* arg, const ReadOptions& options, const EnvOptions& soptions,
+        const InternalKeyComparator& internal_comparator,
+        const Slice& index_value, bool for_compaction),
+    void* arg, const ReadOptions& options, const EnvOptions& soptions,
+    const InternalKeyComparator& internal_comparator,
     bool for_compaction = false);

 } // namespace rocksdb
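Since this diff threads an InternalKeyComparator through the two-level iterator, a self-contained toy sketch of the two-level idea itself may help; all names here (KV, Block, TwoLevelSeek) are hypothetical, and this is not the committed implementation, which works over index and data block iterators rather than vectors.

#include <string>
#include <utility>
#include <vector>

using KV = std::pair<std::string, std::string>;
using Block = std::vector<KV>;  // one sorted "data block"

// Find the first entry with key >= target, the way a two-level iterator does:
// level 1 positions over whole blocks (the index), level 2 seeks inside one.
const KV* TwoLevelSeek(const std::vector<Block>& blocks,
                       const std::string& target) {
  for (const Block& block : blocks) {  // level 1: index iterator over blocks
    if (block.empty() || block.back().first < target) {
      continue;  // every key in this block is before target; skip it
    }
    for (const KV& kv : block) {  // level 2: data iterator within the block
      if (kv.first >= target) {
        return &kv;
      }
    }
  }
  return nullptr;  // target is past the last key
}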