Files
xahaud/table/plain_table_reader.cc
Vinnie Falco c168d54495 Squashed 'src/rocksdb2/' changes from 25888ae..1fdd726
1fdd726 Hotfix RocksDB 3.5
d67500a Add `make install` to Makefile in 3.5.fb.
4cb631a update HISTORY.md
cfd0946 comments about the BlockBasedTableOptions migration in Options
REVERT: 25888ae Merge pull request #329 from fyrz/master
REVERT: 89833e5 Fixed signed-unsigned comparison warning in db_test.cc
REVERT: fcac705 Fixed compile warning on Mac caused by unused variables.
REVERT: b3343fd resolution for java build problem introduced by 5ec53f3edf62bec1b690ce12fb21a6c52203f3c8
REVERT: 187b299 ForwardIterator: update prev_key_ only if prefix hasn't changed
REVERT: 5ec53f3 make compaction related options changeable
REVERT: d122e7b Update INSTALL.md
REVERT: 986dad0 Merge pull request #324 from dalgaaf/wip-da-SCA-20140930
REVERT: 8ee75dc db/memtable.cc: remove unused variable merge_result
REVERT: 0fd8bbc db/db_impl.cc: reduce scope of prefix_initialized
REVERT: 676ff7b compaction_picker.cc: remove check for >=0 for unsigned
REVERT: e55aea5 document_db.cc: fix assert
REVERT: d517c83 in_table_factory.cc: use correct format specifier
REVERT: b140375 ttl/ttl_test.cc: prefer prefix ++operator for non-primitive types
REVERT: 43c789c spatialdb/spatial_db.cc: use !empty() instead of 'size() > 0'
REVERT: 0de452e document_db.cc: pass const parameter by reference
REVERT: 4cc8643 util/ldb_cmd.cc: prefer prefix ++operator for non-primitive types
REVERT: af8c2b2 util/signal_test.cc: suppress intentional null pointer deref
REVERT: 33580fa db/db_impl.cc: fix object handling, remove double lines
REVERT: 873f135 db_ttl_impl.h: pass func parameter by reference
REVERT: 8558457 ldb_cmd_execute_result.h: perform init in initialization list
REVERT: 063471b table/table_test.cc: pass func parameter by reference
REVERT: 93548ce table/cuckoo_table_reader.cc: pass func parameter by ref
REVERT: b8b7117 db/version_set.cc: use !empty() instead of 'size() > 0'
REVERT: 8ce050b table/bloom_block.*: pass func parameter by reference
REVERT: 53910dd db_test.cc: pass parameter by reference
REVERT: 68ca534 corruption_test.cc: pass parameter by reference
REVERT: 7506198 cuckoo_table_db_test.cc: add flush after delete
REVERT: 1f96330 Print MB per second compaction throughput separately for reads and writes
REVERT: ffe3d49 Add an instruction about SSE in INSTALL.md
REVERT: ee1f3cc Package generation for Ubuntu and CentOS
REVERT: f0f7955 Fixing comile errors on OS X
REVERT: 99fb613 remove 2 space linter
REVERT: b2d64a4 Fix linters, second try
REVERT: 747523d Print per column family metrics in db_bench
REVERT: 56ebd40 Fix arc lint (should fix #238)
REVERT: 637f891 Merge pull request #321 from eonnen/master
REVERT: 827e31c Make test use a compatible type in the size checks.
REVERT: fd5d80d CompactedDB: log using the correct info_log
REVERT: 2faf49d use GetContext to replace callback function pointer
REVERT: 983d2de Add AUTHORS file. Fix #203
REVERT: abd70c5 Merge pull request #316 from fyrz/ReverseBytewiseComparator
REVERT: 2dc6f62 handle kDelete type in cuckoo builder
REVERT: 8b8011a Changed name of ReverseBytewiseComparator based on review comment
REVERT: 389edb6 universal compaction picker: use double for potential overflow
REVERT: 5340484 Built-in comparator(s) in RocksJava
REVERT: d439451 delay initialization of cuckoo table iterator
REVERT: 94997ea reduce memory usage of cuckoo table builder
REVERT: c627595 improve memory efficiency of cuckoo reader
REVERT: 581442d option to choose module when calculating CuckooTable hash
REVERT: fbd2daf CompactedDBImpl::MultiGet() for better CuckooTable performance
REVERT: 3c68006 CompactedDBImpl
REVERT: f7375f3 Fix double deletes
REVERT: 21ddcf6 Remove allow_thread_local
REVERT: fb4a492 Merge pull request #311 from ankgup87/master
REVERT: 611e286 Merge branch 'master' of https://github.com/facebook/rocksdb
REVERT: 0103b44 Merge branch 'master' of ssh://github.com/ankgup87/rocksdb
REVERT: 1dfb7bb Add block based table config options
REVERT: cdaf44f Enlarge log size cap when printing file summary
REVERT: 7cc1ed7 Merge pull request #309 from naveenatceg/staticbuild
REVERT: ba6d660 Resolving merge conflict
REVERT: 51eeaf6 Addressing review comments
REVERT: fd7d3fe Addressing review comments (adding a env variable to override temp directory)
REVERT: cf7ace8 Addressing review comments
REVERT: 0a29ce5 re-enable BlockBasedTable::SetupForCompaction()
REVERT: 55af370 Remove TODO for checking index checksums
REVERT: 3d74f09 Fix compile
REVERT: 53b0039 Fix release compile
REVERT: d0de413 WriteBatchWithIndex to allow different Comparators for different column families
REVERT: 57a32f1 change target_file_size_base to uint64_t
REVERT: 5e6aee4 dont create backup_input if compaction filter v2 is not used
REVERT: 49b5f94 Merge pull request #306 from Liuchang0812/fix_cast
REVERT: 787cb4d remove cast, replace %llu with % PRIu64
REVERT: a7574d4 Update logging.cc
REVERT: 7e0dcb9 Update logging.cc
REVERT: 57fa3cc Merge pull request #304 from Liuchang0812/fix-check
REVERT: cd44522 Merge pull request #305 from Liuchang0812/fix-logging
REVERT: 6a031b6 remove unused variable
REVERT: 4436f17 fixed #303: replace %ld with % PRId64
REVERT: 7a1bd05 Merge pull request #302 from ankgup87/master
REVERT: 423e52c Merge branch 'master' of https://github.com/facebook/rocksdb
REVERT: bfeef94 Add rate limiter
REVERT: 32f2532 Print compression_size_percent as a signed int
REVERT: 976caca Skip AllocateTest if fallocate() is not supported in the file system
REVERT: 3b897cd Enable no-fbcode RocksDB build
REVERT: f445947 RocksDB: Format uint64 using PRIu64 in db_impl.cc
REVERT: e17bc65 Merge pull request #299 from ankgup87/master
REVERT: b93797a Fix build
REVERT: adae3ca [Java] Fix JNI link error caused by the removal of options.db_stats_log_interval
REVERT: 90b8c07 Fix unit tests errors
REVERT: 51af7c3 CuckooTable: add one option to allow identity function for the first hash function
REVERT: 0350435 Fixed a signed-unsigned comparison in spatial_db.cc -- issue #293
REVERT: 2fb1fea Fix syncronization issues
REVERT: ff76895 Remove some unnecessary constructors
REVERT: feadb9d fix cuckoo table builder test
REVERT: 3c232e1 Fix mac compile
REVERT: 54cada9 Run make format on PR #249
REVERT: 27b22f1 Merge pull request #249 from tdfischer/decompression-refactoring
REVERT: fb6456b Replace naked calls to operator new and delete (Fixes #222)
REVERT: 5600c8f cuckoo table: return estimated size - 1
REVERT: a062e1f SetOptions() for memtable related options
REVERT: e4eca6a Options conversion function for convenience
REVERT: a7c2094 Merge pull request #292 from saghmrossi/master
REVERT: 4d05234 Merge branch 'master' of github.com:saghmrossi/rocksdb
REVERT: 60a4aa1 Test use_mmap_reads
REVERT: 94e43a1 [Java] Fixed 32-bit overflowing issue when converting jlong to size_t
REVERT: f9eaaa6 added include for inttypes.h to fix nonworking printf statements
REVERT: f090575 Replaced "built on on earlier work" by "built on earlier work" in README.md
REVERT: faad439 Fix #284
REVERT: 49aacd8 Fix make install
REVERT: acb9348 [Java] Include WriteBatch into RocksDBSample.java, fix how DbBenchmark.java handles WriteBatch.
REVERT: 4a27a2f Don't sync manifest when disableDataSync = true
REVERT: 9b8480d Merge pull request #287 from yinqiwen/rate-limiter-crash-fix
REVERT: 28be16b fix rate limiter crash #286
REVERT: 04ce1b2 Fix #284
REVERT: add22e3 standardize scripts to run RocksDB benchmarks
REVERT: dee91c2 WriteThread
REVERT: 540a257 Fix WAL synced
REVERT: 24f034b Merge pull request #282 from Chilledheart/develop
REVERT: 49fe329 Fix build issue under macosx
REVERT: ebb5c65 Add make install
REVERT: 0352a9f add_wrapped_bloom_test
REVERT: 9c0e66c Don't run background jobs (flush, compactions) when bg_error_ is set
REVERT: a9639bd Fix valgrind test
REVERT: d1f24dc Relax FlushSchedule test
REVERT: 3d9e6f7 Push model for flushing memtables
REVERT: 059e584 [unit test] CompactRange should fail if we don't have space
REVERT: dd641b2 fix RocksDB java build
REVERT: 53404d9 add_qps_info_in cache bench
REVERT: a52cecb Fix Mac compile
REVERT: 092f97e Fix comments and typos
REVERT: 6cc1286 Added a few statistics for BackupableDB
REVERT: 0a42295 Fix SimpleWriteTimeoutTest
REVERT: 06d9862 Always pass MergeContext as pointer, not reference
REVERT: d343c3f Improve db recovery
REVERT: 6bb7e3e Merger test
REVERT: 88841bd Explicitly cast char to signed char in Hash()
REVERT: 5231146 MemTableOptions
REVERT: 1d284db Addressing review comments
REVERT: 55114e7 Some updates for SpatialDB
REVERT: 171d4ff remove TailingIterator reference in db_impl.h
REVERT: 9b0f7ff rename version_set options_ to db_options_ to avoid confusion
REVERT: 2d57828 Check stop level trigger-0 before slowdown level-0 trigger
REVERT: 659d2d5 move compaction_filter to immutable_options
REVERT: 048560a reduce references to cfd->options() in DBImpl
REVERT: 011241b DB::Flush() Do not wait for background threads when there is nothing in mem table
REVERT: a2bb7c3 Push- instead of pull-model for managing Write stalls
REVERT: 0af157f Implement full filter for block based table.
REVERT: 9360cc6 Fix valgrind issue
REVERT: 02d5bff Merge pull request #277 from wankai/master
REVERT: 88a2f44 fix comments
REVERT: 7c16e39 Merge pull request #276 from wankai/master
REVERT: 8237738 replace hard-coded number with named variable
REVERT: db8ca52 Merge pull request #273 from nbougalis/static-analysis
REVERT: b7b031f Merge pull request #274 from wankai/master
REVERT: 4c2b1f0 Merge remote-tracking branch 'upstream/master'
REVERT: a5d2863 typo improvement
REVERT: 9f8aa09 Don't leak data returned by opendir
REVERT: d1cfb71 Remove unused member(s)
REVERT: bfee319 sizeof(int*) where sizeof(int) was intended
REVERT: d40c1f7 Add missing break statement
REVERT: 2e97c38 Avoid off-by-one error when using readlink
REVERT: 40ddc3d add cache bench
REVERT: 9f1c80b Drop column family from write thread
REVERT: 8de151b Add db_bench with lots of column families to regression tests
REVERT: c9e419c rename options_ to db_options_ in DBImpl to avoid confusion
REVERT: 5cd0576 Fix compaction bug in Cuckoo Table Builder. Use kvs_.size() instead of num_entries in FileSize() method.
REVERT: 0fbb3fa fixed memory leak in unit test DBIteratorBoundTest
REVERT: adcd253 fix asan check
REVERT: 4092b7a Merge pull request #272 from project-zerus/patch-1
REVERT: bb6ae0f fix more compile warnings
REVERT: 6d31441 Merge pull request #271 from nbougalis/cleanups
REVERT: 0cd0ec4 Plug memory leak during index creation
REVERT: 4329d74 Fix swapped variable names to accurately reflect usage
REVERT: 45a5e3e Remove path with arena==nullptr from NewInternalIterator
REVERT: 5665e5e introduce ImmutableOptions
REVERT: e0b99d4 created a new ReadOptions parameter 'iterate_upper_bound'
REVERT: 51ea889 Fix travis builds
REVERT: a481626 Relax backupable rate limiting test
REVERT: f7f973d Merge pull request #269 from huahang/patch-2
REVERT: ef5b384 fix a few compile warnings
REVERT: 2fd3806 Merge pull request #263 from wankai/master
REVERT: 1785114 delete unused Comparator
REVERT: 1b1d961 update HISTORY.md
REVERT: 703c3ea comments about the BlockBasedTableOptions migration in Options
REVERT: 4b5ad88 Merge pull request #260 from wankai/master
REVERT: 19cc588 change to filter_block std::unique_ptr support RAII
REVERT: 9b976e3 Merge pull request #259 from wankai/master
REVERT: 5d25a46 Merge remote-tracking branch 'upstream/master'
REVERT: dff2b1a typo improvement
REVERT: 343e98a Reverting import change
REVERT: ddb8039 RocksDB static build Make file changes to download and build the dependencies .Load the shared library when RocksDB is initialized

git-subtree-dir: src/rocksdb2
git-subtree-split: 1fdd726a8254c13d0c66d8db8130ad17c13d7bcc
2014-10-27 11:36:32 -07:00

715 lines
22 KiB
C++

// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#ifndef ROCKSDB_LITE
#include "table/plain_table_reader.h"
#include <string>
#include <vector>
#include "db/dbformat.h"
#include "rocksdb/cache.h"
#include "rocksdb/comparator.h"
#include "rocksdb/env.h"
#include "rocksdb/filter_policy.h"
#include "rocksdb/options.h"
#include "rocksdb/statistics.h"
#include "table/block.h"
#include "table/bloom_block.h"
#include "table/filter_block.h"
#include "table/format.h"
#include "table/meta_blocks.h"
#include "table/two_level_iterator.h"
#include "table/plain_table_factory.h"
#include "table/plain_table_key_coding.h"
#include "util/arena.h"
#include "util/coding.h"
#include "util/dynamic_bloom.h"
#include "util/hash.h"
#include "util/histogram.h"
#include "util/murmurhash.h"
#include "util/perf_context_imp.h"
#include "util/stop_watch.h"
namespace rocksdb {
namespace {
// Safely gets a uint32_t element from a char array where, starting from
// `base`, every 4 bytes are treated as a fixed 32-bit integer.
inline uint32_t GetFixed32Element(const char* base, size_t offset) {
  // Element `offset` lives `offset` 32-bit slots past `base`.
  const char* element_ptr = base + offset * sizeof(uint32_t);
  return DecodeFixed32(element_ptr);
}
} // namespace
// Iterator that walks the records served by a PlainTableReader.
class PlainTableIterator : public Iterator {
 public:
  // `table` is stored as a raw pointer and dereferenced by every operation;
  // the iterator never deletes it. `use_prefix_seek` selects whether Seek()
  // may rely on prefix-based lookup.
  explicit PlainTableIterator(PlainTableReader* table, bool use_prefix_seek);
  ~PlainTableIterator();

  // True while the current offset lies inside the table's data section.
  bool Valid() const;

  void SeekToFirst();

  // Backward positioning is not supported by this iterator: SeekToLast() and
  // Prev() assert in debug builds.
  void SeekToLast();

  void Seek(const Slice& target);

  void Next();

  void Prev();

  // key() and value() require Valid().
  Slice key() const;

  Slice value() const;

  Status status() const;

 private:
  PlainTableReader* table_;
  PlainTableKeyDecoder decoder_;
  bool use_prefix_seek_;
  // Offset of the current record and of the record that follows it.
  uint32_t offset_;
  uint32_t next_offset_;
  Slice key_;
  Slice value_;
  Status status_;

  // No copying allowed. The assignment operator must name this class (not
  // the Iterator base) for the deletion to cover PlainTableIterator copies.
  PlainTableIterator(const PlainTableIterator&) = delete;
  void operator=(const PlainTableIterator&) = delete;
};
// Declared here, defined in another translation unit. Passed to
// ReadTableProperties() and ReadMetaBlock() below when reading the file.
extern const uint64_t kPlainTableMagicNumber;
// Builds a reader over `file`, taking ownership of it.
//
// `table_properties` is dereferenced here (so it must be non-null) to copy
// data_size and fixed_key_len, but the pointer itself is not retained:
// table_properties_ starts out null. `storage_options` is not referenced by
// this constructor. The reader starts with full_scan_mode_ and enable_bloom_
// both false.
PlainTableReader::PlainTableReader(const Options& options,
unique_ptr<RandomAccessFile>&& file,
const EnvOptions& storage_options,
const InternalKeyComparator& icomparator,
EncodingType encoding_type,
uint64_t file_size,
const TableProperties* table_properties)
: internal_comparator_(icomparator),
encoding_type_(encoding_type),
full_scan_mode_(false),
data_end_offset_(table_properties->data_size),
user_key_len_(table_properties->fixed_key_len),
prefix_extractor_(options.prefix_extractor.get()),
enable_bloom_(false),
// NOTE(review): presumably (number of probes, custom hash function) --
// confirm against the declaration of bloom_'s type.
bloom_(6, nullptr),
options_(options),
file_(std::move(file)),
file_size_(file_size),
table_properties_(nullptr) {}
// Empty body: no explicit cleanup is performed here.
PlainTableReader::~PlainTableReader() {}
// Opens a PlainTable file and, on success, stores the new reader in
// `*table_reader`.
//
// Reads the table properties, validates the prefix extractor recorded in the
// file against `options.prefix_extractor` (skipped in full scan mode), maps
// the data file, and then either builds/loads the index or marks the reader
// as full-scan-only.
//
// Returns NotSupported when the file exceeds PlainTableIndex::kMaxFileSize,
// InvalidArgument on a prefix-extractor mismatch, Corruption when the stored
// encoding type is too short to decode, or whatever error the underlying
// read/index steps report.
Status PlainTableReader::Open(const Options& options,
                              const EnvOptions& soptions,
                              const InternalKeyComparator& internal_comparator,
                              unique_ptr<RandomAccessFile>&& file,
                              uint64_t file_size,
                              unique_ptr<TableReader>* table_reader,
                              const int bloom_bits_per_key,
                              double hash_table_ratio, size_t index_sparseness,
                              size_t huge_page_tlb_size, bool full_scan_mode) {
  assert(options.allow_mmap_reads);

  if (file_size > PlainTableIndex::kMaxFileSize) {
    return Status::NotSupported("File is too large for PlainTableReader!");
  }

  TableProperties* props = nullptr;
  auto s = ReadTableProperties(file.get(), file_size, kPlainTableMagicNumber,
                               options.env, options.info_log.get(), &props);
  if (!s.ok()) {
    return s;
  }
  // Hold the properties until the reader takes them over, so that none of
  // the early returns below (nor full scan mode, where the reader never
  // adopts them) leaks the allocation.
  std::unique_ptr<TableProperties> props_guard(props);

  assert(hash_table_ratio >= 0.0);
  auto& user_props = props->user_collected_properties;
  auto prefix_extractor_in_file =
      user_props.find(PlainTablePropertyNames::kPrefixExtractorName);

  if (!full_scan_mode && prefix_extractor_in_file != user_props.end()) {
    if (!options.prefix_extractor) {
      return Status::InvalidArgument(
          "Prefix extractor is missing when opening a PlainTable built "
          "using a prefix extractor");
    } else if (prefix_extractor_in_file->second.compare(
                   options.prefix_extractor->Name()) != 0) {
      return Status::InvalidArgument(
          "Prefix extractor given doesn't match the one used to build "
          "PlainTable");
    }
  }

  EncodingType encoding_type = kPlain;
  auto encoding_type_prop =
      user_props.find(PlainTablePropertyNames::kEncodingType);
  if (encoding_type_prop != user_props.end()) {
    // DecodeFixed32() consumes four bytes; refuse shorter values instead of
    // reading past the end of the property string.
    if (encoding_type_prop->second.size() < sizeof(uint32_t)) {
      return Status::Corruption(
          "Encoding type property in PlainTable is too short");
    }
    encoding_type = static_cast<EncodingType>(
        DecodeFixed32(encoding_type_prop->second.c_str()));
  }

  // The constructor copies the fields it needs out of `props`; it does not
  // keep the pointer.
  std::unique_ptr<PlainTableReader> new_reader(new PlainTableReader(
      options, std::move(file), soptions, internal_comparator, encoding_type,
      file_size, props));

  s = new_reader->MmapDataFile();
  if (!s.ok()) {
    return s;
  }

  if (!full_scan_mode) {
    // PopulateIndex() adopts the properties into table_properties_ as its
    // first step, so ownership is handed over here.
    s = new_reader->PopulateIndex(props_guard.release(), bloom_bits_per_key,
                                  hash_table_ratio, index_sparseness,
                                  huge_page_tlb_size);
    if (!s.ok()) {
      return s;
    }
  } else {
    // Flag to indicate it is a full scan mode so that none of the indexes
    // can be used. The reader keeps table_properties_ null in this mode, so
    // props_guard frees the properties when this function returns.
    new_reader->full_scan_mode_ = true;
  }

  *table_reader = std::move(new_reader);
  return s;
}
// Intentionally a no-op for this reader.
void PlainTableReader::SetupForCompaction() {}
// Creates an iterator over this table. When `arena` is non-null the iterator
// is placement-constructed in arena memory; otherwise it is heap-allocated.
// A request for total_order_seek on a table that is not in total order mode
// yields an error iterator instead.
Iterator* PlainTableReader::NewIterator(const ReadOptions& options,
                                        Arena* arena) {
  const bool unsupported_total_order =
      options.total_order_seek && !IsTotalOrderMode();
  if (unsupported_total_order) {
    return NewErrorIterator(
        Status::InvalidArgument("total_order_seek not supported"), arena);
  }

  const bool use_prefix_seek = (prefix_extractor_ != nullptr);
  if (arena != nullptr) {
    auto storage = arena->AllocateAligned(sizeof(PlainTableIterator));
    return new (storage) PlainTableIterator(this, use_prefix_seek);
  }
  return new PlainTableIterator(this, use_prefix_seek);
}
// Scans every record in the data section, feeding each key's prefix and file
// offset to `index_builder`, and then initializes index_ from the finished
// builder output.
//
// When the bloom filter is already enabled, each user key's hash is added to
// it directly. Otherwise the hash of every distinct consecutive prefix is
// appended to `prefix_hashes`. The hash of the last prefix seen is always
// appended after the scan.
Status PlainTableReader::PopulateIndexRecordList(
    PlainTableIndexBuilder* index_builder, vector<uint32_t>* prefix_hashes) {
  PlainTableKeyDecoder decoder(encoding_type_, user_key_len_,
                               options_.prefix_extractor.get());
  Slice last_prefix;
  Slice current_prefix;
  bool first_record = true;
  uint32_t cursor = data_start_offset_;

  while (cursor < data_end_offset_) {
    const uint32_t record_offset = cursor;
    ParsedInternalKey parsed_key;
    Slice record_value;
    bool seekable = false;
    Status read_status = Next(&decoder, &cursor, &parsed_key, nullptr,
                              &record_value, &seekable);
    if (!read_status.ok()) {
      return read_status;
    }

    current_prefix = GetPrefix(parsed_key);
    if (enable_bloom_) {
      bloom_.AddHash(GetSliceHash(parsed_key.user_key));
    } else if (first_record) {
      last_prefix = current_prefix;
    } else if (last_prefix != current_prefix) {
      // The previous prefix run has ended; record its hash.
      prefix_hashes->push_back(GetSliceHash(last_prefix));
      last_prefix = current_prefix;
    }

    index_builder->AddKeyPrefix(GetPrefix(parsed_key), record_offset);

    if (first_record && !seekable) {
      return Status::Corruption("Key for a prefix is not seekable");
    }
    first_record = false;
  }

  prefix_hashes->push_back(GetSliceHash(current_prefix));
  return index_.InitFromRawData(index_builder->Finish());
}
// Outside total order mode, sizes the bloom filter as
// num_prefixes * bloom_bits_per_key bits and, when that is non-zero, enables
// the filter and fills it with `prefix_hashes`. Does nothing in total order
// mode.
void PlainTableReader::AllocateAndFillBloom(int bloom_bits_per_key,
                                            int num_prefixes,
                                            size_t huge_page_tlb_size,
                                            vector<uint32_t>* prefix_hashes) {
  if (IsTotalOrderMode()) {
    return;
  }

  const uint32_t total_bits = num_prefixes * bloom_bits_per_key;
  if (total_bits == 0) {
    return;
  }

  enable_bloom_ = true;
  bloom_.SetTotalBits(&arena_, total_bits, options_.bloom_locality,
                      huge_page_tlb_size, options_.info_log.get());
  FillBloom(prefix_hashes);
}
// Adds every hash in `prefix_hashes` to the (already initialized) bloom
// filter.
void PlainTableReader::FillBloom(vector<uint32_t>* prefix_hashes) {
  assert(bloom_.IsInitialized());
  for (auto it = prefix_hashes->begin(); it != prefix_hashes->end(); ++it) {
    bloom_.AddHash(*it);
  }
}
// Reads the whole file (offset 0, file_size_ bytes) into file_data_. No
// scratch buffer is supplied, so the slice refers to memory provided by the
// file object (mmapped, per the original comment).
Status PlainTableReader::MmapDataFile() {
  Status read_status = file_->Read(0, file_size_, &file_data_, nullptr);
  return read_status;
}
// Takes ownership of `props` and prepares the index and bloom filter.
//
// If the file contains both a bloom block and an index block, they are used
// as-is. Otherwise the data section is scanned to build the index (and the
// bloom filter, when the configured bit count is non-zero) in memory.
// Finally two size properties are recorded in `props`.
//
// Returns NotSupported when a hash-based index is requested
// (hash_table_ratio != 0) without a prefix extractor, or the error of the
// index construction step.
Status PlainTableReader::PopulateIndex(TableProperties* props,
                                       int bloom_bits_per_key,
                                       double hash_table_ratio,
                                       size_t index_sparseness,
                                       size_t huge_page_tlb_size) {
  assert(props != nullptr);
  table_properties_.reset(props);

  BlockContents bloom_block_contents;
  auto s = ReadMetaBlock(file_.get(), file_size_, kPlainTableMagicNumber,
                         options_.env, BloomBlockBuilder::kBloomBlock,
                         &bloom_block_contents);
  bool index_in_file = s.ok();

  BlockContents index_block_contents;
  s = ReadMetaBlock(file_.get(), file_size_, kPlainTableMagicNumber,
                    options_.env, PlainTableIndexBuilder::kPlainTableIndexBlock,
                    &index_block_contents);
  index_in_file &= s.ok();

  // index_in_file == true only if both kBloomBlock and kPlainTableIndexBlock
  // are present in the file.
  Slice* bloom_block = index_in_file ? &bloom_block_contents.data : nullptr;
  Slice* index_block = index_in_file ? &index_block_contents.data : nullptr;

  if (hash_table_ratio != 0 && options_.prefix_extractor.get() == nullptr) {
    // options.prefix_extractor is required for a hash-based look-up.
    return Status::NotSupported(
        "PlainTable requires a prefix extractor enable prefix hash mode.");
  }

  if (index_in_file) {
    // Use the bloom data stored in the file.
    enable_bloom_ = true;
    uint32_t num_blocks = 0;
    auto num_blocks_property = props->user_collected_properties.find(
        PlainTablePropertyNames::kNumBloomBlocks);
    if (num_blocks_property != props->user_collected_properties.end()) {
      Slice temp_slice(num_blocks_property->second);
      if (!GetVarint32(&temp_slice, &num_blocks)) {
        num_blocks = 0;
      }
    }
    // cast away const qualifier, because bloom_ won't be changed
    bloom_.SetRawData(
        const_cast<unsigned char*>(
            reinterpret_cast<const unsigned char*>(bloom_block->data())),
        bloom_block->size() * 8, num_blocks);
  } else if (IsTotalOrderMode()) {
    // Allocate bloom filter here for total order mode.
    const uint32_t num_bloom_bits =
        table_properties_->num_entries * bloom_bits_per_key;
    if (num_bloom_bits > 0) {
      enable_bloom_ = true;
      bloom_.SetTotalBits(&arena_, num_bloom_bits, options_.bloom_locality,
                          huge_page_tlb_size, options_.info_log.get());
    }
  }

  PlainTableIndexBuilder index_builder(&arena_, options_, index_sparseness,
                                       hash_table_ratio, huge_page_tlb_size);
  std::vector<uint32_t> prefix_hashes;

  if (index_in_file) {
    s = index_.InitFromRawData(*index_block);
    if (!s.ok()) {
      return s;
    }
    props->user_collected_properties["plain_table_hash_table_size"] =
        std::to_string(0);
    props->user_collected_properties["plain_table_sub_index_size"] =
        std::to_string(0);
  } else {
    // Read the whole file; for every kIndexIntervalForSamePrefixKeys rows of
    // a prefix (starting from the first one), a (hash, offset) record is
    // generated and handed to the index builder.
    s = PopulateIndexRecordList(&index_builder, &prefix_hashes);
    if (!s.ok()) {
      return s;
    }
    // Calculate the bloom filter size and allocate memory for it based on
    // the number of prefixes, then fill it.
    AllocateAndFillBloom(bloom_bits_per_key, index_.GetNumPrefixes(),
                         huge_page_tlb_size, &prefix_hashes);
    props->user_collected_properties["plain_table_hash_table_size"] =
        std::to_string(index_.GetIndexSize() * PlainTableIndex::kOffsetLen);
    props->user_collected_properties["plain_table_sub_index_size"] =
        std::to_string(index_.GetSubIndexSize());
  }

  return Status::OK();
}
// Resolves `target` to a file offset from which a forward scan should start.
//
// `*offset` is set to data_end_offset_ when the hash bucket holds no prefix,
// or to the offset stored in the index when the bucket points directly into
// the file. Otherwise the bucket's sub-index is binary-searched.
// `prefix_matched` is set to true only when the returned offset is already
// known to belong to `prefix` (or is an exact key match).
//
// Returns Corruption when `target` is not a valid internal key, or any error
// reported while decoding a key from the file.
Status PlainTableReader::GetOffset(const Slice& target, const Slice& prefix,
                                   uint32_t prefix_hash, bool& prefix_matched,
                                   uint32_t* offset) const {
  prefix_matched = false;

  uint32_t prefix_index_offset;
  const auto lookup = index_.GetOffset(prefix_hash, &prefix_index_offset);
  if (lookup == PlainTableIndex::kNoPrefixForBucket) {
    *offset = data_end_offset_;
    return Status::OK();
  }
  if (lookup == PlainTableIndex::kDirectToFile) {
    *offset = prefix_index_offset;
    return Status::OK();
  }

  // The bucket points to a sub-index; binary-search it.
  uint32_t upper_bound;
  const char* base_ptr =
      index_.GetSubIndexBasePtrAndUpperBound(prefix_index_offset, &upper_bound);

  ParsedInternalKey parsed_target;
  if (!ParseInternalKey(target, &parsed_target)) {
    return Status::Corruption(Slice());
  }

  // Invariant: the wanted key is within [low, high).
  uint32_t low = 0;
  uint32_t high = upper_bound;
  while (high - low > 1) {
    const uint32_t mid = (high + low) / 2;
    const uint32_t file_offset = GetFixed32Element(base_ptr, mid);

    // A fresh decoder is used for every probe.
    ParsedInternalKey mid_key;
    size_t bytes_read;
    Status s = PlainTableKeyDecoder(encoding_type_, user_key_len_,
                                    options_.prefix_extractor.get())
                   .NextKey(file_data_.data() + file_offset,
                            file_data_.data() + data_end_offset_, &mid_key,
                            nullptr, &bytes_read);
    if (!s.ok()) {
      return s;
    }

    const int cmp_result = internal_comparator_.Compare(mid_key, parsed_target);
    if (cmp_result == 0) {
      // Found the exact key.
      prefix_matched = true;
      *offset = file_offset;
      return Status::OK();
    }
    if (cmp_result < 0) {
      low = mid;
    } else {
      high = mid;
    }
  }

  // Either the key at position low or the one at low+1 could share the
  // target's prefix. Rule one of them out to avoid going to the wrong prefix.
  ParsedInternalKey low_key;
  size_t bytes_read;
  const uint32_t low_key_offset = GetFixed32Element(base_ptr, low);
  Status s = PlainTableKeyDecoder(encoding_type_, user_key_len_,
                                  options_.prefix_extractor.get())
                 .NextKey(file_data_.data() + low_key_offset,
                          file_data_.data() + data_end_offset_, &low_key,
                          nullptr, &bytes_read);
  if (!s.ok()) {
    return s;
  }

  if (GetPrefix(low_key) == prefix) {
    prefix_matched = true;
    *offset = low_key_offset;
  } else if (low + 1 < upper_bound) {
    // There may be a next prefix; return it.
    prefix_matched = false;
    *offset = GetFixed32Element(base_ptr, low + 1);
  } else {
    // target is larger than a key of the last prefix in this bucket
    // but with a different prefix. Key does not exist.
    *offset = data_end_offset_;
  }
  return Status::OK();
}
// Returns true when `hash` may be present. With the bloom filter disabled
// every hash is treated as a possible match.
bool PlainTableReader::MatchBloom(uint32_t hash) const {
  if (!enable_bloom_) {
    return true;
  }
  return bloom_.MayContainHash(hash);
}
// Decodes the record starting at `*offset` and advances `*offset` past it.
//
// On success `parsed_key` (and `internal_key`, which is forwarded to the
// decoder and may be null at some call sites) describe the key and `*value`
// refers to the value bytes inside file_data_. When `*offset` already equals
// data_end_offset_ nothing is decoded and OK is returned.
//
// Returns Corruption when `*offset` lies beyond the data section, when the
// value length cannot be read, or when the value would extend past the end
// of the data section; `*offset` is left unchanged in those cases. Errors
// from the key decoder are passed through.
Status PlainTableReader::Next(PlainTableKeyDecoder* decoder, uint32_t* offset,
                              ParsedInternalKey* parsed_key,
                              Slice* internal_key, Slice* value,
                              bool* seekable) const {
  if (*offset == data_end_offset_) {
    return Status::OK();
  }
  if (*offset > data_end_offset_) {
    return Status::Corruption("Offset is out of file size");
  }

  const char* start = file_data_.data() + *offset;
  const char* limit = file_data_.data() + data_end_offset_;

  size_t bytes_for_key;
  Status s = decoder->NextKey(start, limit, parsed_key, internal_key,
                              &bytes_for_key, seekable);
  if (!s.ok()) {
    return s;
  }

  uint32_t value_size;
  const char* value_ptr =
      GetVarint32Ptr(start + bytes_for_key, limit, &value_size);
  if (value_ptr == nullptr) {
    return Status::Corruption(
        "Unexpected EOF when reading the next value's size.");
  }

  // Sum in 64 bits: a corrupt value_size close to UINT32_MAX would otherwise
  // wrap the 32-bit offset around and slip past the bounds check.
  const uint64_t next_offset = static_cast<uint64_t>(*offset) +
                               static_cast<uint64_t>(value_ptr - start) +
                               value_size;
  if (next_offset > data_end_offset_) {
    return Status::Corruption("Unexpected EOF when reading the next value. ");
  }

  *offset = static_cast<uint32_t>(next_offset);
  *value = Slice(value_ptr, value_size);
  return Status::OK();
}
// When the bloom filter is enabled, prefetches the filter location for the
// hash of `target`'s prefix. Otherwise does nothing.
void PlainTableReader::Prepare(const Slice& target) {
  if (!enable_bloom_) {
    return;
  }
  const uint32_t prefix_hash = GetSliceHash(GetPrefix(target));
  bloom_.Prefetch(prefix_hash);
}
// Point lookup for the internal key `target`.
//
// After a bloom-filter check, the index is used to find a starting offset and
// records are scanned forward from there. Every record that compares >=
// `target` is passed to `saver(arg, key, value)`; the scan stops as soon as
// `saver` returns false, a record with a different prefix is reached before
// the prefix was confirmed, or the data section ends. `ro` and
// `mark_key_may_exist` are not used by this implementation.
//
// Returns InvalidArgument in full scan mode (no index is built in that mode,
// so a lookup cannot be served), Corruption when `target` is not a valid
// internal key or a record cannot be decoded, and OK otherwise -- including
// when the key is simply not found.
Status PlainTableReader::Get(const ReadOptions& ro, const Slice& target,
                             void* arg,
                             bool (*saver)(void*, const ParsedInternalKey&,
                                           const Slice&),
                             void (*mark_key_may_exist)(void*)) {
  // Open() skips index construction in full scan mode, so refuse the lookup
  // up front rather than querying an index that was never populated.
  if (full_scan_mode_) {
    status_ =
        Status::InvalidArgument("Get() is not allowed in full scan mode.");
    return status_;
  }

  // Check bloom filter first.
  Slice prefix_slice;
  uint32_t prefix_hash;
  if (IsTotalOrderMode()) {
    // Match whole user key for bloom filter check.
    if (!MatchBloom(GetSliceHash(GetUserKey(target)))) {
      return Status::OK();
    }
    // in total order mode, there is only one bucket 0, and we always use empty
    // prefix.
    prefix_slice = Slice();
    prefix_hash = 0;
  } else {
    prefix_slice = GetPrefix(target);
    prefix_hash = GetSliceHash(prefix_slice);
    if (!MatchBloom(prefix_hash)) {
      return Status::OK();
    }
  }

  uint32_t offset;
  bool prefix_match;
  Status s =
      GetOffset(target, prefix_slice, prefix_hash, prefix_match, &offset);
  if (!s.ok()) {
    return s;
  }

  ParsedInternalKey found_key;
  ParsedInternalKey parsed_target;
  if (!ParseInternalKey(target, &parsed_target)) {
    return Status::Corruption(Slice());
  }

  Slice found_value;
  PlainTableKeyDecoder decoder(encoding_type_, user_key_len_,
                               options_.prefix_extractor.get());
  while (offset < data_end_offset_) {
    s = Next(&decoder, &offset, &found_key, nullptr, &found_value);
    if (!s.ok()) {
      return s;
    }
    if (!prefix_match) {
      // Need to verify prefix for the first key found if it is not yet
      // checked.
      if (GetPrefix(found_key) != prefix_slice) {
        return Status::OK();
      }
      prefix_match = true;
    }
    if (internal_comparator_.Compare(found_key, parsed_target) >= 0) {
      if (!(*saver)(arg, found_key, found_value)) {
        break;
      }
    }
  }
  return Status::OK();
}
// No estimate is computed here: the result is always 0, whatever `key` is.
uint64_t PlainTableReader::ApproximateOffsetOf(const Slice& key) {
  const uint64_t kNoEstimate = 0;
  return kNoEstimate;
}
// Creates an iterator over `table`. Both offsets start at the table's
// data_end_offset_, i.e. the iterator is not valid until it is positioned.
PlainTableIterator::PlainTableIterator(PlainTableReader* table,
                                       bool use_prefix_seek)
    : table_(table),
      decoder_(table->encoding_type_, table->user_key_len_,
               table->prefix_extractor_),
      use_prefix_seek_(use_prefix_seek),
      offset_(table->data_end_offset_),
      next_offset_(table->data_end_offset_) {}
// Empty body: the table pointer is not deleted here.
PlainTableIterator::~PlainTableIterator() {}
// The iterator is valid while offset_ lies within
// [data_start_offset_, data_end_offset_) of the table.
bool PlainTableIterator::Valid() const {
  const bool at_or_after_start = offset_ >= table_->data_start_offset_;
  const bool before_end = offset_ < table_->data_end_offset_;
  return before_end && at_or_after_start;
}
// Positions the iterator on the first record, or leaves it invalid (both
// offsets at data_end_offset_) when the data section is empty.
void PlainTableIterator::SeekToFirst() {
  next_offset_ = table_->data_start_offset_;
  if (next_offset_ < table_->data_end_offset_) {
    Next();
    return;
  }
  next_offset_ = offset_ = table_->data_end_offset_;
}
// Unsupported operation: asserts in debug builds and records a NotSupported
// status for callers to observe through status().
void PlainTableIterator::SeekToLast() {
  assert(false);
  status_ =
      Status::NotSupported("SeekToLast() is not supported in PlainTable");
}
// Positions the iterator at the first record whose key is >= `target` within
// the target's prefix, or invalidates it when there is no such record.
//
// Errors are reported through status():
//  - InvalidArgument when the table was opened in full scan mode (no index
//    is built in that mode, so no seek can be served);
//  - NotSupported when prefix seek is not in use and the index has more than
//    one bucket (this also asserts in debug builds);
//  - any error returned while locating or decoding records.
void PlainTableIterator::Seek(const Slice& target) {
  // Full scan mode never populates the index, regardless of whether a prefix
  // extractor is configured, so reject the seek before touching the index.
  if (table_->full_scan_mode_) {
    status_ =
        Status::InvalidArgument("Seek() is not allowed in full scan mode.");
    offset_ = next_offset_ = table_->data_end_offset_;
    return;
  }

  // If the user doesn't set prefix seek option and we are not able to do a
  // total Seek(), fail.
  if (!use_prefix_seek_ && table_->GetIndexSize() > 1) {
    assert(false);
    status_ = Status::NotSupported(
        "PlainTable cannot issue non-prefix seek unless in total order "
        "mode.");
    offset_ = next_offset_ = table_->data_end_offset_;
    return;
  }

  Slice prefix_slice = table_->GetPrefix(target);
  uint32_t prefix_hash = 0;
  // Bloom filter is ignored in total-order mode.
  if (!table_->IsTotalOrderMode()) {
    prefix_hash = GetSliceHash(prefix_slice);
    if (!table_->MatchBloom(prefix_hash)) {
      offset_ = next_offset_ = table_->data_end_offset_;
      return;
    }
  }

  // GetOffset() assigns prefix_match before doing anything else.
  bool prefix_match;
  status_ = table_->GetOffset(target, prefix_slice, prefix_hash, prefix_match,
                              &next_offset_);
  if (!status_.ok()) {
    offset_ = next_offset_ = table_->data_end_offset_;
    return;
  }

  if (next_offset_ < table_->data_end_offset_) {
    // Scan forward from the located offset until a key >= target is found.
    for (Next(); status_.ok() && Valid(); Next()) {
      if (!prefix_match) {
        // Need to verify the first key's prefix
        if (table_->GetPrefix(key()) != prefix_slice) {
          offset_ = next_offset_ = table_->data_end_offset_;
          break;
        }
        prefix_match = true;
      }
      if (table_->internal_comparator_.Compare(key(), target) >= 0) {
        break;
      }
    }
  } else {
    offset_ = table_->data_end_offset_;
  }
}
// Moves to the record at next_offset_ and decodes it into key_/value_. If
// decoding fails the error is kept in status_ and the iterator is
// invalidated.
void PlainTableIterator::Next() {
  offset_ = next_offset_;
  if (offset_ >= table_->data_end_offset_) {
    return;
  }

  ParsedInternalKey parsed_key;
  status_ =
      table_->Next(&decoder_, &next_offset_, &parsed_key, &key_, &value_);
  if (!status_.ok()) {
    offset_ = next_offset_ = table_->data_end_offset_;
  }
}
void PlainTableIterator::Prev() {
assert(false);
}
// Key of the current record. Requires Valid().
Slice PlainTableIterator::key() const {
  assert(Valid());
  return key_;
}
// Value of the current record. Requires Valid().
Slice PlainTableIterator::value() const {
  assert(Valid());
  return value_;
}
// Most recent status recorded by a positioning or decoding operation.
Status PlainTableIterator::status() const {
  return status_;
}
} // namespace rocksdb
#endif // ROCKSDB_LITE