mirror of
https://github.com/Xahau/xahaud.git
synced 2025-12-06 17:27:52 +00:00
25511b7 Merge branch 'master' of github.com:rescrv/HyperLevelDB into hyperdb ed01020 Make "source" universal 3784d92 Ignore the static file 507319b Don't package with snappy 3e2cc8b Tolerate -fno-rtti 4dcdd6e Drop revision down to 1.0.dev 2542163 Drop all but the latest kept for garbage reasons 9c270b7 Update .gitignore 5331878 Add upack script adc2a7a Explicitly add -lpthread for Ubuntu 7b57bbd Strip NULL chars passed to LiveBackup e3b87e7 Add write-buffer-size option to benchmark 2f11087 Followup to snappy support with -DSNAPPY af503da Improve efficiency of ReplayIterator; fix a bug 33c1f0c Add snappy support ce1cacf Fix a race in ReplayIterator 5c4679b Fix a bug in the replay_iterator ca332bd Fix sort algorithm used for compaction boundaries. d9ec544 make checK b83a9cd Fix a deadlock in the ReplayIterator dtor 273547b Fix a double-delete in ReplayIterator 3377c7a Add "all" to set of special timestamps 387f43a Timestamp comparison and validation. f9a6eb1 make distcheck 9a4d0b7 Add a ReplayIterator. 1d53869 Conditionally enable read-driven compaction. f6fa561 16% end-to-end performance improvement from the skiplist 28ffd32 Merge remote-tracking branch 'upstream/master' a58de73 Revert "Remove read-driven compactions." e19fc0c Fix upstream issue 200 748539c LevelDB 1.13 78b7812 Add install instructions to README e47a48e Make benchmark dir variable 820a096 Update distributed files. 486ca7f Live backup of LevelDB instances 6579884 Put a reference counter on log_/logfile_ 3075253 Update internal benchmark. 2a6b0bd Make the Version a parameter of PickCompaction 5bd76dc Release leveldb 1.12 git-subtree-dir: src/hyperleveldb git-subtree-split: 25511b7a9101b0bafb57349d2194ba80ccbf7bc3
1634 lines
51 KiB
C++
1634 lines
51 KiB
C++
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
|
|
#include "db/version_set.h"
|
|
|
|
#include <algorithm>
|
|
#include <stdio.h>
|
|
#include "dbformat.h"
|
|
#include "filename.h"
|
|
#include "log_reader.h"
|
|
#include "log_writer.h"
|
|
#include "memtable.h"
|
|
#include "table_cache.h"
|
|
#include "../hyperleveldb/env.h"
|
|
#include "../hyperleveldb/table_builder.h"
|
|
#include "../table/merger.h"
|
|
#include "../table/two_level_iterator.h"
|
|
#include "../util/coding.h"
|
|
#include "../util/logging.h"
|
|
|
|
namespace hyperleveldb {
|
|
|
|
static double MaxBytesForLevel(int level) {
|
|
assert(level < leveldb::config::kNumLevels);
|
|
static const double bytes[] = {10 * 1048576.0,
|
|
100 * 1048576.0,
|
|
100 * 1048576.0,
|
|
1000 * 1048576.0,
|
|
10000 * 1048576.0,
|
|
100000 * 1048576.0,
|
|
1000000 * 1048576.0};
|
|
return bytes[level];
|
|
}
|
|
|
|
static uint64_t MaxFileSizeForLevel(int level) {
|
|
assert(level < leveldb::config::kNumLevels);
|
|
static const uint64_t bytes[] = {8 * 1048576,
|
|
8 * 1048576,
|
|
8 * 1048576,
|
|
8 * 1048576,
|
|
8 * 1048576,
|
|
8 * 1048576,
|
|
8 * 1048576};
|
|
return bytes[level];
|
|
}
|
|
|
|
static uint64_t MaxCompactionBytesForLevel(int level) {
|
|
assert(level < leveldb::config::kNumLevels);
|
|
static const uint64_t bytes[] = {128 * 1048576,
|
|
128 * 1048576,
|
|
128 * 1048576,
|
|
256 * 1048576,
|
|
256 * 1048576,
|
|
256 * 1048576,
|
|
256 * 1048576};
|
|
return bytes[level];
|
|
}
|
|
|
|
// Sum of the on-disk sizes of every file in "files".
static int64_t TotalFileSize(const std::vector<FileMetaData*>& files) {
  int64_t total = 0;
  for (std::vector<FileMetaData*>::const_iterator it = files.begin();
       it != files.end(); ++it) {
    total += (*it)->file_size;
  }
  return total;
}
|
|
|
|
namespace {
|
|
std::string IntSetToString(const std::set<uint64_t>& s) {
|
|
std::string result = "{";
|
|
for (std::set<uint64_t>::const_iterator it = s.begin();
|
|
it != s.end();
|
|
++it) {
|
|
result += (result.size() > 1) ? "," : "";
|
|
result += NumberToString(*it);
|
|
}
|
|
result += "}";
|
|
return result;
|
|
}
|
|
} // namespace
|
|
|
|
// Destructor: unlink this version from the VersionSet's doubly-linked
// version list and release this version's reference on every file it
// points at.  REQUIRES: refs_ == 0 (reached only via Unref()).
Version::~Version() {
  assert(refs_ == 0);

  // Remove from linked list
  prev_->next_ = next_;
  next_->prev_ = prev_;

  // Drop references to files; a FileMetaData is deleted once the last
  // version (or Builder) referencing it lets go.
  for (int level = 0; level < config::kNumLevels; level++) {
    for (size_t i = 0; i < files_[level].size(); i++) {
      FileMetaData* f = files_[level][i];
      assert(f->refs > 0);
      f->refs--;
      if (f->refs <= 0) {
        delete f;
      }
    }
  }
}
|
|
|
|
int FindFile(const InternalKeyComparator& icmp,
|
|
const std::vector<FileMetaData*>& files,
|
|
const Slice& key) {
|
|
uint32_t left = 0;
|
|
uint32_t right = files.size();
|
|
while (left < right) {
|
|
uint32_t mid = (left + right) / 2;
|
|
const FileMetaData* f = files[mid];
|
|
if (icmp.InternalKeyComparator::Compare(f->largest.Encode(), key) < 0) {
|
|
// Key at "mid.largest" is < "target". Therefore all
|
|
// files at or before "mid" are uninteresting.
|
|
left = mid + 1;
|
|
} else {
|
|
// Key at "mid.largest" is >= "target". Therefore all files
|
|
// after "mid" are uninteresting.
|
|
right = mid;
|
|
}
|
|
}
|
|
return right;
|
|
}
|
|
|
|
static bool AfterFile(const Comparator* ucmp,
|
|
const Slice* user_key, const FileMetaData* f) {
|
|
// NULL user_key occurs before all keys and is therefore never after *f
|
|
return (user_key != NULL &&
|
|
ucmp->Compare(*user_key, f->largest.user_key()) > 0);
|
|
}
|
|
|
|
static bool BeforeFile(const Comparator* ucmp,
|
|
const Slice* user_key, const FileMetaData* f) {
|
|
// NULL user_key occurs after all keys and is therefore never before *f
|
|
return (user_key != NULL &&
|
|
ucmp->Compare(*user_key, f->smallest.user_key()) < 0);
|
|
}
|
|
|
|
// Returns true iff some file in "files" overlaps the user-key range
// [*smallest_user_key, *largest_user_key].
// smallest_user_key == NULL represents a key smaller than all DB keys;
// largest_user_key == NULL represents a key larger than all DB keys.
// When disjoint_sorted_files is true (levels > 0) the files are
// non-overlapping and sorted, enabling a binary search.
bool SomeFileOverlapsRange(
    const InternalKeyComparator& icmp,
    bool disjoint_sorted_files,
    const std::vector<FileMetaData*>& files,
    const Slice* smallest_user_key,
    const Slice* largest_user_key) {
  const Comparator* ucmp = icmp.user_comparator();
  if (!disjoint_sorted_files) {
    // Need to check against all files (level-0 files may overlap each other)
    for (size_t i = 0; i < files.size(); i++) {
      const FileMetaData* f = files[i];
      if (AfterFile(ucmp, smallest_user_key, f) ||
          BeforeFile(ucmp, largest_user_key, f)) {
        // No overlap
      } else {
        return true;  // Overlap
      }
    }
    return false;
  }

  // Binary search over file list
  uint32_t index = 0;
  if (smallest_user_key != NULL) {
    // Find the earliest possible internal key for smallest_user_key:
    // (kMaxSequenceNumber, kValueTypeForSeek) sorts before any real
    // entry carrying the same user key.
    InternalKey small(*smallest_user_key, kMaxSequenceNumber,kValueTypeForSeek);
    index = FindFile(icmp, files, small.Encode());
  }

  if (index >= files.size()) {
    // beginning of range is after all files, so no overlap.
    return false;
  }

  // files[index] ends at or after the range start; it overlaps unless it
  // begins entirely after the range end.
  return !BeforeFile(ucmp, largest_user_key, files[index]);
}
|
|
|
|
// An internal iterator. For a given version/level pair, yields
|
|
// information about the files in the level. For a given entry, key()
|
|
// is the largest key that occurs in the file, and value() is an
|
|
// 16-byte value containing the file number and file size, both
|
|
// encoded using EncodeFixed64.
|
|
//
|
|
// If num != 0, then do not call SeekToLast, Prev
|
|
class Version::LevelFileNumIterator : public Iterator {
 public:
  // icmp:  comparator for the internal keys used as iterator keys.
  // flist: level's file list; for levels > 0 it is sorted by largest key.
  // num:   minimum file number to yield — files numbered below "num" are
  //        skipped (see Bump()).  When num != 0, reverse iteration
  //        (SeekToLast/Prev) is unsupported, per the asserts below.
  LevelFileNumIterator(const InternalKeyComparator& icmp,
                       const std::vector<FileMetaData*>* flist,
                       uint64_t num)
      : icmp_(icmp),
        flist_(flist),
        index_(flist->size()),        // Marks as invalid
        number_(num) {
  }
  virtual bool Valid() const {
    return index_ < flist_->size();
  }
  virtual void Seek(const Slice& target) {
    index_ = FindFile(icmp_, *flist_, target);
    Bump();  // honor the minimum-file-number filter
  }
  virtual void SeekToFirst() {
    index_ = 0;
    Bump();
  }
  virtual void SeekToLast() {
    assert(number_ == 0);  // reverse iteration incompatible with the filter
    index_ = flist_->empty() ? 0 : flist_->size() - 1;
  }
  virtual void Next() {
    assert(Valid());
    index_++;
    Bump();
  }
  virtual void Prev() {
    assert(Valid());
    assert(number_ == 0);  // reverse iteration incompatible with the filter
    if (index_ == 0) {
      index_ = flist_->size();  // Marks as invalid
    } else {
      index_--;
    }
  }
  // Largest internal key occurring in the current file.
  Slice key() const {
    assert(Valid());
    return (*flist_)[index_]->largest.Encode();
  }
  // 16-byte value: file number then file size, each EncodeFixed64'd.
  // Decoded by GetFileIterator().
  Slice value() const {
    assert(Valid());
    EncodeFixed64(value_buf_, (*flist_)[index_]->number);
    EncodeFixed64(value_buf_+8, (*flist_)[index_]->file_size);
    return Slice(value_buf_, sizeof(value_buf_));
  }
  virtual Status status() const { return Status::OK(); }
 private:
  // Advance index_ past files whose number is below the filter number_.
  void Bump() {
    while (index_ < flist_->size() &&
           (*flist_)[index_]->number < number_) {
      ++index_;
    }
  }
  const InternalKeyComparator icmp_;
  const std::vector<FileMetaData*>* const flist_;
  uint32_t index_;   // current position; flist_->size() means invalid
  uint64_t number_;  // minimum file number to yield (0 = no filter)

  // Backing store for value().  Holds the file number and size.
  mutable char value_buf_[16];
};
|
|
|
|
// Second-level function for the two-level iterator: turns the 16-byte
// (file number, file size) value produced by LevelFileNumIterator into
// an iterator over that table's contents.
static Iterator* GetFileIterator(void* arg,
                                 const ReadOptions& options,
                                 const Slice& file_value) {
  TableCache* cache = reinterpret_cast<TableCache*>(arg);
  if (file_value.size() == 16) {
    const uint64_t file_number = DecodeFixed64(file_value.data());
    const uint64_t file_size = DecodeFixed64(file_value.data() + 8);
    return cache->NewIterator(options, file_number, file_size);
  }
  return NewErrorIterator(
      Status::Corruption("FileReader invoked with unexpected value"));
}
|
|
|
|
// Build a lazily-opening iterator over all files at "level" (> 0):
// the first level walks file metadata, the second opens each table on
// demand via the table cache.  Files numbered below "num" are skipped.
Iterator* Version::NewConcatenatingIterator(const ReadOptions& options,
                                            int level, uint64_t num) const {
  Iterator* file_index_iter =
      new LevelFileNumIterator(vset_->icmp_, &files_[level], num);
  return NewTwoLevelIterator(file_index_iter, &GetFileIterator,
                             vset_->table_cache_, options);
}
|
|
|
|
void Version::AddIterators(const ReadOptions& options,
|
|
std::vector<Iterator*>* iters) {
|
|
return AddSomeIterators(options, 0, iters);
|
|
}
|
|
|
|
// Append to *iters a sequence of iterators that together cover this
// version's contents: one iterator per level-0 file plus one
// concatenating iterator per non-empty higher level.  "num" is passed
// to the concatenating iterators, which skip files numbered below it.
// NOTE(review): the level-0 loop does not apply the "num" filter —
// confirm callers passing num != 0 expect all level-0 files included.
void Version::AddSomeIterators(const ReadOptions& options, uint64_t num,
                               std::vector<Iterator*>* iters) {
  // Merge all level zero files together since they may overlap
  for (size_t i = 0; i < files_[0].size(); i++) {
    iters->push_back(
        vset_->table_cache_->NewIterator(
            options, files_[0][i]->number, files_[0][i]->file_size));
  }

  // For levels > 0, we can use a concatenating iterator that sequentially
  // walks through the non-overlapping files in the level, opening them
  // lazily.
  for (int level = 1; level < config::kNumLevels; level++) {
    if (!files_[level].empty()) {
      iters->push_back(NewConcatenatingIterator(options, level, num));
    }
  }
}
|
|
|
|
// Callback from TableCache::Get()
namespace {
// Lookup outcome recorded by SaveValue() as table entries are scanned.
enum SaverState {
  kNotFound,   // no entry for the key seen yet
  kFound,      // live value found and copied into Saver::value
  kDeleted,    // newest entry for the key is a deletion marker
  kCorrupt,    // an internal key failed to parse
};
// Per-lookup state threaded through TableCache::Get() as the opaque arg.
struct Saver {
  SaverState state;        // current outcome (starts kNotFound)
  const Comparator* ucmp;  // user-key comparator
  Slice user_key;          // user key being looked up
  std::string* value;      // destination for the value when kFound
};
}
|
|
static void SaveValue(void* arg, const Slice& ikey, const Slice& v) {
|
|
Saver* s = reinterpret_cast<Saver*>(arg);
|
|
ParsedInternalKey parsed_key;
|
|
if (!ParseInternalKey(ikey, &parsed_key)) {
|
|
s->state = kCorrupt;
|
|
} else {
|
|
if (s->ucmp->Compare(parsed_key.user_key, s->user_key) == 0) {
|
|
s->state = (parsed_key.type == kTypeValue) ? kFound : kDeleted;
|
|
if (s->state == kFound) {
|
|
s->value->assign(v.data(), v.size());
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Sort predicate: most recently created file (highest number) first.
static bool NewestFirst(FileMetaData* a, FileMetaData* b) {
  return b->number < a->number;
}
|
|
|
|
// Invoke func(arg, level, f) for every file that may contain user_key,
// in order from newest to oldest.  Stops early when func returns false.
// internal_key must be the internal-key encoding of user_key (used for
// the per-level binary search).
void Version::ForEachOverlapping(Slice user_key, Slice internal_key,
                                 void* arg,
                                 bool (*func)(void*, int, FileMetaData*)) {
  // TODO(sanjay): Change Version::Get() to use this function.
  const Comparator* ucmp = vset_->icmp_.user_comparator();

  // Search level-0 in order from newest to oldest.
  std::vector<FileMetaData*> tmp;
  tmp.reserve(files_[0].size());
  for (uint32_t i = 0; i < files_[0].size(); i++) {
    FileMetaData* f = files_[0][i];
    // Level-0 files may overlap, so range-check each one individually.
    if (ucmp->Compare(user_key, f->smallest.user_key()) >= 0 &&
        ucmp->Compare(user_key, f->largest.user_key()) <= 0) {
      tmp.push_back(f);
    }
  }
  if (!tmp.empty()) {
    std::sort(tmp.begin(), tmp.end(), NewestFirst);
    for (uint32_t i = 0; i < tmp.size(); i++) {
      if (!(*func)(arg, 0, tmp[i])) {
        return;
      }
    }
  }

  // Search other levels: files are disjoint there, so at most one file
  // per level can contain user_key.
  for (int level = 1; level < config::kNumLevels; level++) {
    size_t num_files = files_[level].size();
    if (num_files == 0) continue;

    // Binary search to find earliest index whose largest key >= internal_key.
    uint32_t index = FindFile(vset_->icmp_, files_[level], internal_key);
    if (index < num_files) {
      FileMetaData* f = files_[level][index];
      if (ucmp->Compare(user_key, f->smallest.user_key()) < 0) {
        // All of "f" is past any data for user_key
      } else {
        if (!(*func)(arg, level, f)) {
          return;
        }
      }
    }
  }
}
|
|
|
|
// Look up the key "k" in this version's files.  On success, stores the
// value in *value and returns OK; returns NotFound if the key is absent
// or was deleted.  *stats records the first file that was searched
// without yielding a result when more than one file had to be read —
// UpdateStats() uses it to trigger seek-based compactions.
Status Version::Get(const ReadOptions& options,
                    const LookupKey& k,
                    std::string* value,
                    GetStats* stats) {
  Slice ikey = k.internal_key();
  Slice user_key = k.user_key();
  const Comparator* ucmp = vset_->icmp_.user_comparator();
  Status s;

  stats->seek_file = NULL;
  stats->seek_file_level = -1;
  FileMetaData* last_file_read = NULL;
  int last_file_read_level = -1;

  // We can search level-by-level since entries never hop across
  // levels.  Therefore we are guaranteed that if we find data
  // in an smaller level, later levels are irrelevant.
  std::vector<FileMetaData*> tmp;
  FileMetaData* tmp2;
  for (int level = 0; level < config::kNumLevels; level++) {
    size_t num_files = files_[level].size();
    if (num_files == 0) continue;

    // Get the list of files to search in this level
    FileMetaData* const* files = &files_[level][0];
    if (level == 0) {
      // Level-0 files may overlap each other.  Find all files that
      // overlap user_key and process them in order from newest to oldest.
      tmp.reserve(num_files);
      for (uint32_t i = 0; i < num_files; i++) {
        FileMetaData* f = files[i];
        if (ucmp->Compare(user_key, f->smallest.user_key()) >= 0 &&
            ucmp->Compare(user_key, f->largest.user_key()) <= 0) {
          tmp.push_back(f);
        }
      }
      if (tmp.empty()) continue;

      std::sort(tmp.begin(), tmp.end(), NewestFirst);
      files = &tmp[0];
      num_files = tmp.size();
    } else {
      // Binary search to find earliest index whose largest key >= ikey.
      uint32_t index = FindFile(vset_->icmp_, files_[level], ikey);
      if (index >= num_files) {
        files = NULL;
        num_files = 0;
      } else {
        tmp2 = files[index];
        if (ucmp->Compare(user_key, tmp2->smallest.user_key()) < 0) {
          // All of "tmp2" is past any data for user_key
          files = NULL;
          num_files = 0;
        } else {
          // Disjoint files: only this one candidate can hold user_key.
          files = &tmp2;
          num_files = 1;
        }
      }
    }

    for (uint32_t i = 0; i < num_files; ++i) {
      if (last_file_read != NULL && stats->seek_file == NULL) {
        // We have had more than one seek for this read.  Charge the 1st file.
        stats->seek_file = last_file_read;
        stats->seek_file_level = last_file_read_level;
      }

      FileMetaData* f = files[i];
      last_file_read = f;
      last_file_read_level = level;

      // SaveValue() (invoked via the table cache) records the outcome
      // of scanning this file into "saver".
      Saver saver;
      saver.state = kNotFound;
      saver.ucmp = ucmp;
      saver.user_key = user_key;
      saver.value = value;
      s = vset_->table_cache_->Get(options, f->number, f->file_size,
                                   ikey, &saver, SaveValue);
      if (!s.ok()) {
        return s;
      }
      switch (saver.state) {
        case kNotFound:
          break;      // Keep searching in other files
        case kFound:
          return s;
        case kDeleted:
          s = Status::NotFound(Slice());  // Use empty error message for speed
          return s;
        case kCorrupt:
          s = Status::Corruption("corrupted key for ", user_key);
          return s;
      }
    }
  }

  return Status::NotFound(Slice());  // Use an empty error message for speed
}
|
|
|
|
bool Version::UpdateStats(const GetStats& stats) {
|
|
FileMetaData* f = stats.seek_file;
|
|
if (f != NULL) {
|
|
f->allowed_seeks--;
|
|
if (f->allowed_seeks <= 0 && file_to_compact_ == NULL) {
|
|
file_to_compact_ = f;
|
|
file_to_compact_level_ = stats.seek_file_level;
|
|
return true;
|
|
}
|
|
}
|
|
return false;
|
|
}
|
|
|
|
// Record a sampled read at "internal_key".  Returns true if sampling
// pushed some file over its seek budget, i.e. a new compaction may need
// to be triggered.
bool Version::RecordReadSample(Slice internal_key) {
  ParsedInternalKey ikey;
  if (!ParseInternalKey(internal_key, &ikey)) {
    return false;
  }

  // Counts files overlapping the sampled key, remembering the first
  // (newest) match so it can be charged a seek.
  struct State {
    GetStats stats;  // Holds first matching file
    int matches;

    static bool Match(void* arg, int level, FileMetaData* f) {
      State* state = reinterpret_cast<State*>(arg);
      state->matches++;
      if (state->matches == 1) {
        // Remember first match.
        state->stats.seek_file = f;
        state->stats.seek_file_level = level;
      }
      // We can stop iterating once we have a second match.
      return state->matches < 2;
    }
  };

  State state;
  state.matches = 0;
  ForEachOverlapping(ikey.user_key, internal_key, &state, &State::Match);

  // Must have at least two matches since we want to merge across
  // files. But what if we have a single file that contains many
  // overwrites and deletions?  Should we have another mechanism for
  // finding such files?
  if (state.matches >= 2) {
    // 1MB cost is about 1 seek (see comment in Builder::Apply).
    return UpdateStats(state.stats);
  }
  return false;
}
|
|
|
|
// Acquire a reference, keeping this version (and the files it lists)
// alive while readers are using it.
void Version::Ref() {
  refs_ += 1;
}
|
|
|
|
// Release a reference; deletes this version when the count hits zero
// (the destructor then unlinks it from the version list).  Must never
// be called on the dummy list head.
void Version::Unref() {
  assert(this != &vset_->dummy_versions_);
  assert(refs_ >= 1);
  --refs_;
  if (refs_ == 0) {
    delete this;
  }
}
|
|
|
|
bool Version::OverlapInLevel(int level,
|
|
const Slice* smallest_user_key,
|
|
const Slice* largest_user_key) {
|
|
return SomeFileOverlapsRange(vset_->icmp_, (level > 0), files_[level],
|
|
smallest_user_key, largest_user_key);
|
|
}
|
|
|
|
int Version::PickLevelForMemTableOutput(
|
|
const Slice& smallest_user_key,
|
|
const Slice& largest_user_key) {
|
|
int level = 0;
|
|
if (!OverlapInLevel(0, &smallest_user_key, &largest_user_key)) {
|
|
// Push to next level if there is no overlap in next level,
|
|
// and the #bytes overlapping in the level after that are limited.
|
|
InternalKey start(smallest_user_key, kMaxSequenceNumber, kValueTypeForSeek);
|
|
InternalKey limit(largest_user_key, 0, static_cast<ValueType>(0));
|
|
std::vector<FileMetaData*> overlaps;
|
|
while (level < config::kMaxMemCompactLevel) {
|
|
if (OverlapInLevel(level + 1, &smallest_user_key, &largest_user_key)) {
|
|
break;
|
|
}
|
|
GetOverlappingInputs(level + 2, &start, &limit, &overlaps);
|
|
const int64_t sum = TotalFileSize(overlaps);
|
|
level++;
|
|
}
|
|
}
|
|
return level;
|
|
}
|
|
|
|
// Store in "*inputs" all files in "level" that overlap [begin,end].
// begin == NULL means "before all keys"; end == NULL means "after all
// keys".  For level 0 the search may restart: level-0 files overlap
// each other, so when a matched file widens the user-key range, every
// transitively overlapping file must be picked up too.
void Version::GetOverlappingInputs(
    int level,
    const InternalKey* begin,
    const InternalKey* end,
    std::vector<FileMetaData*>* inputs) {
  assert(level >= 0);
  assert(level < config::kNumLevels);
  inputs->clear();
  Slice user_begin, user_end;
  if (begin != NULL) {
    user_begin = begin->user_key();
  }
  if (end != NULL) {
    user_end = end->user_key();
  }
  const Comparator* user_cmp = vset_->icmp_.user_comparator();
  for (size_t i = 0; i < files_[level].size(); ) {
    FileMetaData* f = files_[level][i++];
    const Slice file_start = f->smallest.user_key();
    const Slice file_limit = f->largest.user_key();
    if (begin != NULL && user_cmp->Compare(file_limit, user_begin) < 0) {
      // "f" is completely before specified range; skip it
    } else if (end != NULL && user_cmp->Compare(file_start, user_end) > 0) {
      // "f" is completely after specified range; skip it
    } else {
      inputs->push_back(f);
      if (level == 0) {
        // Level-0 files may overlap each other.  So check if the newly
        // added file has expanded the range.  If so, restart search.
        if (begin != NULL && user_cmp->Compare(file_start, user_begin) < 0) {
          user_begin = file_start;
          inputs->clear();
          i = 0;
        } else if (end != NULL && user_cmp->Compare(file_limit, user_end) > 0) {
          user_end = file_limit;
          inputs->clear();
          i = 0;
        }
      }
    }
  }
}
|
|
|
|
std::string Version::DebugString() const {
|
|
std::string r;
|
|
for (int level = 0; level < config::kNumLevels; level++) {
|
|
// E.g.,
|
|
// --- level 1 ---
|
|
// 17:123['a' .. 'd']
|
|
// 20:43['e' .. 'g']
|
|
r.append("--- level ");
|
|
AppendNumberTo(&r, level);
|
|
r.append(" ---\n");
|
|
const std::vector<FileMetaData*>& files = files_[level];
|
|
for (size_t i = 0; i < files.size(); i++) {
|
|
r.push_back(' ');
|
|
AppendNumberTo(&r, files[i]->number);
|
|
r.push_back(':');
|
|
AppendNumberTo(&r, files[i]->file_size);
|
|
r.append("[");
|
|
r.append(files[i]->smallest.DebugString());
|
|
r.append(" .. ");
|
|
r.append(files[i]->largest.DebugString());
|
|
r.append("]\n");
|
|
}
|
|
}
|
|
return r;
|
|
}
|
|
|
|
// A helper class so we can efficiently apply a whole sequence
|
|
// of edits to a particular state without creating intermediate
|
|
// Versions that contain full copies of the intermediate state.
|
|
class VersionSet::Builder {
 private:
  // Helper to sort by v->files_[file_number].smallest
  struct BySmallestKey {
    const InternalKeyComparator* internal_comparator;

    bool operator()(FileMetaData* f1, FileMetaData* f2) const {
      int r = internal_comparator->Compare(f1->smallest, f2->smallest);
      if (r != 0) {
        return (r < 0);
      } else {
        // Break ties by file number
        return (f1->number < f2->number);
      }
    }
  };

  typedef std::set<FileMetaData*, BySmallestKey> FileSet;
  // Pending per-level changes accumulated by Apply().
  struct LevelState {
    std::set<uint64_t> deleted_files;  // file numbers removed at this level
    FileSet* added_files;              // added files, sorted by smallest key
  };

  VersionSet* vset_;
  Version* base_;   // version the edits apply on top of (reference held)
  LevelState levels_[config::kNumLevels];

 public:
  // Initialize a builder with the files from *base and other info from *vset
  Builder(VersionSet* vset, Version* base)
      : vset_(vset),
        base_(base) {
    base_->Ref();
    BySmallestKey cmp;
    cmp.internal_comparator = &vset_->icmp_;
    for (int level = 0; level < config::kNumLevels; level++) {
      levels_[level].added_files = new FileSet(cmp);
    }
  }

  ~Builder() {
    // Unreference (and possibly delete) every file still held in the
    // added sets, then drop our reference on the base version.
    for (int level = 0; level < config::kNumLevels; level++) {
      const FileSet* added = levels_[level].added_files;
      std::vector<FileMetaData*> to_unref;
      to_unref.reserve(added->size());
      for (FileSet::const_iterator it = added->begin();
           it != added->end(); ++it) {
        to_unref.push_back(*it);
      }
      delete added;
      for (uint32_t i = 0; i < to_unref.size(); i++) {
        FileMetaData* f = to_unref[i];
        f->refs--;
        if (f->refs <= 0) {
          delete f;
        }
      }
    }
    base_->Unref();
  }

  // Apply all of the edits in *edit to the current state.
  void Apply(VersionEdit* edit) {
    // Update compaction pointers
    for (size_t i = 0; i < edit->compact_pointers_.size(); i++) {
      const int level = edit->compact_pointers_[i].first;
      vset_->compact_pointer_[level] =
          edit->compact_pointers_[i].second.Encode().ToString();
    }

    // Delete files
    const VersionEdit::DeletedFileSet& del = edit->deleted_files_;
    for (VersionEdit::DeletedFileSet::const_iterator iter = del.begin();
         iter != del.end();
         ++iter) {
      const int level = iter->first;
      const uint64_t number = iter->second;
      levels_[level].deleted_files.insert(number);
    }

    // Add new files
    for (size_t i = 0; i < edit->new_files_.size(); i++) {
      const int level = edit->new_files_[i].first;
      FileMetaData* f = new FileMetaData(edit->new_files_[i].second);
      f->refs = 1;

      // We arrange to automatically compact this file after
      // a certain number of seeks.  Let's assume:
      //   (1) One seek costs 10ms
      //   (2) Writing or reading 1MB costs 10ms (100MB/s)
      //   (3) A compaction of 1MB does 25MB of IO:
      //         1MB read from this level
      //         10-12MB read from next level (boundaries may be misaligned)
      //         10-12MB written to next level
      // This implies that 25 seeks cost the same as the compaction
      // of 1MB of data.  I.e., one seek costs approximately the
      // same as the compaction of 40KB of data.  We are a little
      // conservative and allow approximately one seek for every 16KB
      // of data before triggering a compaction.
      f->allowed_seeks = (f->file_size / 16384);
      if (f->allowed_seeks < 100) f->allowed_seeks = 100;

      // A file added by this edit wins over a deletion of the same
      // number recorded earlier in the edit sequence.
      levels_[level].deleted_files.erase(f->number);
      levels_[level].added_files->insert(f);
    }
  }

  // Save the current state in *v.
  void SaveTo(Version* v) {
    BySmallestKey cmp;
    cmp.internal_comparator = &vset_->icmp_;
    for (int level = 0; level < config::kNumLevels; level++) {
      // Merge the set of added files with the set of pre-existing files.
      // Drop any deleted files.  Store the result in *v.
      const std::vector<FileMetaData*>& base_files = base_->files_[level];
      std::vector<FileMetaData*>::const_iterator base_iter = base_files.begin();
      std::vector<FileMetaData*>::const_iterator base_end = base_files.end();
      const FileSet* added = levels_[level].added_files;
      v->files_[level].reserve(base_files.size() + added->size());
      for (FileSet::const_iterator added_iter = added->begin();
           added_iter != added->end();
           ++added_iter) {
        // Add all smaller files listed in base_
        for (std::vector<FileMetaData*>::const_iterator bpos
                 = std::upper_bound(base_iter, base_end, *added_iter, cmp);
             base_iter != bpos;
             ++base_iter) {
          MaybeAddFile(v, level, *base_iter);
        }

        MaybeAddFile(v, level, *added_iter);
      }

      // Add remaining base files
      for (; base_iter != base_end; ++base_iter) {
        MaybeAddFile(v, level, *base_iter);
      }

#ifndef NDEBUG
      // Make sure there is no overlap in levels > 0
      if (level > 0) {
        for (uint32_t i = 1; i < v->files_[level].size(); i++) {
          const InternalKey& prev_end = v->files_[level][i-1]->largest;
          const InternalKey& this_begin = v->files_[level][i]->smallest;
          if (vset_->icmp_.Compare(prev_end, this_begin) >= 0) {
            fprintf(stderr, "overlapping ranges in same level %s vs. %s\n",
                    prev_end.DebugString().c_str(),
                    this_begin.DebugString().c_str());
            abort();
          }
        }
      }
#endif
    }
  }

  // Append f to *v at "level" unless this builder recorded its deletion.
  // Takes a new reference on f.
  void MaybeAddFile(Version* v, int level, FileMetaData* f) {
    if (levels_[level].deleted_files.count(f->number) > 0) {
      // File is deleted: do nothing
    } else {
      std::vector<FileMetaData*>* files = &v->files_[level];
      if (level > 0 && !files->empty()) {
        // Must not overlap the last file already placed at this level
        assert(vset_->icmp_.Compare((*files)[files->size()-1]->largest,
                                    f->smallest) < 0);
      }
      f->refs++;
      files->push_back(f);
    }
  }
};
|
|
|
|
// Construct an empty VersionSet whose current version lists no files.
// File numbers, sequence numbers, and the MANIFEST state are filled in
// later by Recover().
VersionSet::VersionSet(const std::string& dbname,
                       const Options* options,
                       TableCache* table_cache,
                       const InternalKeyComparator* cmp)
    : env_(options->env),
      dbname_(dbname),
      options_(options),
      table_cache_(table_cache),
      icmp_(*cmp),
      next_file_number_(2),
      manifest_file_number_(0),  // Filled by Recover()
      last_sequence_(0),
      log_number_(0),
      prev_log_number_(0),
      descriptor_file_(NULL),
      descriptor_log_(NULL),
      dummy_versions_(this),
      current_(NULL) {
  // Install an initial empty version as current_.
  AppendVersion(new Version(this));
}
|
|
|
|
// Tear down: drop our reference on the current version (which must be
// the only live version remaining) and close the MANIFEST writer/file.
VersionSet::~VersionSet() {
  current_->Unref();
  assert(dummy_versions_.next_ == &dummy_versions_);  // List must be empty
  delete descriptor_log_;
  delete descriptor_file_;
}
|
|
|
|
// Make "v" the current version and append it to the circular
// doubly-linked version list anchored at dummy_versions_.
// REQUIRES: v is freshly built (refs_ == 0) and not already current.
void VersionSet::AppendVersion(Version* v) {
  // Make "v" current
  assert(v->refs_ == 0);
  assert(v != current_);
  if (current_ != NULL) {
    current_->Unref();
  }
  current_ = v;
  v->Ref();

  // Append to linked list (insert just before the dummy head, i.e. at
  // the tail, so list order is oldest-to-newest)
  v->prev_ = dummy_versions_.prev_;
  v->next_ = &dummy_versions_;
  v->prev_->next_ = v;
  v->next_->prev_ = v;
}
|
|
|
|
Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu, port::CondVar* cv, bool* wt) {
|
|
while (*wt) {
|
|
cv->Wait();
|
|
}
|
|
*wt = true;
|
|
if (edit->has_log_number_) {
|
|
assert(edit->log_number_ >= log_number_);
|
|
assert(edit->log_number_ < next_file_number_);
|
|
} else {
|
|
edit->SetLogNumber(log_number_);
|
|
}
|
|
|
|
if (!edit->has_prev_log_number_) {
|
|
edit->SetPrevLogNumber(prev_log_number_);
|
|
}
|
|
|
|
edit->SetNextFile(next_file_number_);
|
|
edit->SetLastSequence(last_sequence_);
|
|
|
|
Version* v = new Version(this);
|
|
{
|
|
Builder builder(this, current_);
|
|
builder.Apply(edit);
|
|
builder.SaveTo(v);
|
|
}
|
|
Finalize(v);
|
|
|
|
// Initialize new descriptor log file if necessary by creating
|
|
// a temporary file that contains a snapshot of the current version.
|
|
std::string new_manifest_file;
|
|
Status s;
|
|
if (descriptor_log_ == NULL) {
|
|
// No reason to unlock *mu here since we only hit this path in the
|
|
// first call to LogAndApply (when opening the database).
|
|
assert(descriptor_file_ == NULL);
|
|
new_manifest_file = DescriptorFileName(dbname_, manifest_file_number_);
|
|
edit->SetNextFile(next_file_number_);
|
|
s = env_->NewWritableFile(new_manifest_file, &descriptor_file_);
|
|
if (s.ok()) {
|
|
descriptor_log_ = new log::Writer(descriptor_file_);
|
|
s = WriteSnapshot(descriptor_log_);
|
|
}
|
|
}
|
|
|
|
// Unlock during expensive MANIFEST log write
|
|
{
|
|
mu->Unlock();
|
|
|
|
// Write new record to MANIFEST log
|
|
if (s.ok()) {
|
|
std::string record;
|
|
edit->EncodeTo(&record);
|
|
s = descriptor_log_->AddRecord(record);
|
|
if (s.ok()) {
|
|
// XXX Unlock during expensive MANIFEST log write
|
|
s = descriptor_file_->Sync();
|
|
}
|
|
if (!s.ok()) {
|
|
Log(options_->info_log, "MANIFEST write: %s\n", s.ToString().c_str());
|
|
if (ManifestContains(record)) {
|
|
Log(options_->info_log,
|
|
"MANIFEST contains log record despite error; advancing to new "
|
|
"version to prevent mismatch between in-memory and logged state");
|
|
s = Status::OK();
|
|
}
|
|
}
|
|
}
|
|
|
|
// If we just created a new descriptor file, install it by writing a
|
|
// new CURRENT file that points to it.
|
|
if (s.ok() && !new_manifest_file.empty()) {
|
|
s = SetCurrentFile(env_, dbname_, manifest_file_number_);
|
|
// No need to double-check MANIFEST in case of error since it
|
|
// will be discarded below.
|
|
}
|
|
|
|
mu->Lock();
|
|
}
|
|
|
|
// Install the new version
|
|
if (s.ok()) {
|
|
AppendVersion(v);
|
|
log_number_ = edit->log_number_;
|
|
prev_log_number_ = edit->prev_log_number_;
|
|
} else {
|
|
delete v;
|
|
if (!new_manifest_file.empty()) {
|
|
delete descriptor_log_;
|
|
delete descriptor_file_;
|
|
descriptor_log_ = NULL;
|
|
descriptor_file_ = NULL;
|
|
env_->DeleteFile(new_manifest_file);
|
|
}
|
|
}
|
|
|
|
*wt = false;
|
|
cv->Signal();
|
|
return s;
|
|
}
|
|
|
|
// Recover the VersionSet state from persistent storage: read CURRENT to
// locate the active manifest, replay every VersionEdit it contains, and
// install the resulting Version as the current one.
Status VersionSet::Recover() {
  // Funnels corruption reports from the manifest log reader into a Status
  // so the read loop below can stop at the first error.
  struct LogReporter : public log::Reader::Reporter {
    Status* status;
    virtual void Corruption(size_t bytes, const Status& s) {
      if (this->status->ok()) *this->status = s;  // keep only the first error
    }
  };

  // Read "CURRENT" file, which contains a pointer to the current manifest file
  std::string current;
  Status s = ReadFileToString(env_, CurrentFileName(dbname_), &current);
  if (!s.ok()) {
    return s;
  }
  if (current.empty() || current[current.size()-1] != '\n') {
    return Status::Corruption("CURRENT file does not end with newline");
  }
  current.resize(current.size() - 1);  // strip the trailing newline

  std::string dscname = dbname_ + "/" + current;
  SequentialFile* file;
  s = env_->NewSequentialFile(dscname, &file);
  if (!s.ok()) {
    return s;
  }

  // Accumulators for the scalar metadata fields spread across the
  // manifest's records; later records override earlier ones.
  bool have_log_number = false;
  bool have_prev_log_number = false;
  bool have_next_file = false;
  bool have_last_sequence = false;
  uint64_t next_file = 0;
  uint64_t last_sequence = 0;
  uint64_t log_number = 0;
  uint64_t prev_log_number = 0;
  Builder builder(this, current_);

  {
    LogReporter reporter;
    reporter.status = &s;
    log::Reader reader(file, &reporter, true/*checksum*/, 0/*initial_offset*/);
    Slice record;
    std::string scratch;
    // Replay every VersionEdit in the manifest, applying each to the builder.
    while (reader.ReadRecord(&record, &scratch) && s.ok()) {
      VersionEdit edit;
      s = edit.DecodeFrom(record);
      if (s.ok()) {
        // The database must be opened with the comparator it was written with.
        if (edit.has_comparator_ &&
            edit.comparator_ != icmp_.user_comparator()->Name()) {
          s = Status::InvalidArgument(
              edit.comparator_ + " does not match existing comparator ",
              icmp_.user_comparator()->Name());
        }
      }

      if (s.ok()) {
        builder.Apply(&edit);
      }

      if (edit.has_log_number_) {
        log_number = edit.log_number_;
        have_log_number = true;
      }

      if (edit.has_prev_log_number_) {
        prev_log_number = edit.prev_log_number_;
        have_prev_log_number = true;
      }

      if (edit.has_next_file_number_) {
        next_file = edit.next_file_number_;
        have_next_file = true;
      }

      if (edit.has_last_sequence_) {
        last_sequence = edit.last_sequence_;
        have_last_sequence = true;
      }
    }
  }
  delete file;
  file = NULL;

  if (s.ok()) {
    // A usable manifest must have supplied all of the mandatory fields.
    if (!have_next_file) {
      s = Status::Corruption("no meta-nextfile entry in descriptor");
    } else if (!have_log_number) {
      s = Status::Corruption("no meta-lognumber entry in descriptor");
    } else if (!have_last_sequence) {
      s = Status::Corruption("no last-sequence-number entry in descriptor");
    }

    // The prev-log field is optional (older databases omit it).
    if (!have_prev_log_number) {
      prev_log_number = 0;
    }

    // Make sure freshly allocated file numbers do not collide with
    // numbers already referenced by the recovered state.
    MarkFileNumberUsed(prev_log_number);
    MarkFileNumberUsed(log_number);
  }

  if (s.ok()) {
    Version* v = new Version(this);
    builder.SaveTo(v);
    // Install recovered version
    Finalize(v);
    AppendVersion(v);
    manifest_file_number_ = next_file;
    next_file_number_ = next_file + 1;
    last_sequence_ = last_sequence;
    log_number_ = log_number;
    prev_log_number_ = prev_log_number;
  }

  return s;
}
|
|
|
|
void VersionSet::MarkFileNumberUsed(uint64_t number) {
|
|
if (next_file_number_ <= number) {
|
|
next_file_number_ = number + 1;
|
|
}
|
|
}
|
|
|
|
// Precompute, for every level that can feed a compaction, how close the
// level is to its limit.  A score >= 1 means "needs compaction"; the
// results are cached in v->compaction_scores_.
void VersionSet::Finalize(Version* v) {
  for (int level = 0; level + 1 < config::kNumLevels; ++level) {
    double score;
    if (level != 0) {
      // Compute the ratio of current size to size limit.
      const uint64_t level_bytes = TotalFileSize(v->files_[level]);
      score = static_cast<double>(level_bytes) / MaxBytesForLevel(level);
    } else {
      // We treat level-0 specially by bounding the number of files
      // instead of number of bytes for two reasons:
      //
      // (1) With larger write-buffer sizes, it is nice not to do too
      // many level-0 compactions.
      //
      // (2) The files in level-0 are merged on every read and
      // therefore we wish to avoid too many files when the individual
      // file size is small (perhaps because of a small write-buffer
      // setting, or very high compression ratios, or lots of
      // overwrites/deletions).
      score = v->files_[0].size() /
          static_cast<double>(config::kL0_CompactionTrigger);
    }
    v->compaction_scores_[level] = score;
  }
}
|
|
|
|
// Write the complete state of *current_ into "log" as one VersionEdit
// record, so a freshly created manifest can be recovered on its own
// without replaying any older manifest.
Status VersionSet::WriteSnapshot(log::Writer* log) {
  // TODO: Break up into multiple records to reduce memory usage on recovery?

  // Save metadata
  VersionEdit edit;
  edit.SetComparatorName(icmp_.user_comparator()->Name());

  // Save compaction pointers
  for (int level = 0; level < config::kNumLevels; level++) {
    if (!compact_pointer_[level].empty()) {
      // compact_pointer_ stores encoded keys; decode for the edit API.
      InternalKey key;
      key.DecodeFrom(compact_pointer_[level]);
      edit.SetCompactPointer(level, key);
    }
  }

  // Save files
  for (int level = 0; level < config::kNumLevels; level++) {
    const std::vector<FileMetaData*>& files = current_->files_[level];
    for (size_t i = 0; i < files.size(); i++) {
      const FileMetaData* f = files[i];
      edit.AddFile(level, f->number, f->file_size, f->smallest, f->largest);
    }
  }

  // Serialize the accumulated edit as a single manifest record.
  std::string record;
  edit.EncodeTo(&record);
  return log->AddRecord(record);
}
|
|
|
|
int VersionSet::NumLevelFiles(int level) const {
|
|
assert(level >= 0);
|
|
assert(level < config::kNumLevels);
|
|
return current_->files_[level].size();
|
|
}
|
|
|
|
// Render per-level file counts into scratch->buffer, e.g.
// "files[ 3 5 0 0 0 0 0 ]", and return a pointer to it.
//
// Built with a loop instead of a hard-coded 7-slot format string, so it
// automatically tracks config::kNumLevels (the old version asserted
// kNumLevels == 7 and would silently break on any other configuration).
// Output is truncated (never overrun) if the buffer is too small.
const char* VersionSet::LevelSummary(LevelSummaryStorage* scratch) const {
  int pos = snprintf(scratch->buffer, sizeof(scratch->buffer), "files[");
  for (int level = 0; level < config::kNumLevels; level++) {
    if (pos < 0 || static_cast<size_t>(pos) >= sizeof(scratch->buffer)) {
      break;  // buffer exhausted or encoding error; emit what we have
    }
    pos += snprintf(scratch->buffer + pos, sizeof(scratch->buffer) - pos,
                    " %d", int(current_->files_[level].size()));
  }
  if (pos >= 0 && static_cast<size_t>(pos) < sizeof(scratch->buffer)) {
    snprintf(scratch->buffer + pos, sizeof(scratch->buffer) - pos, " ]");
  }
  return scratch->buffer;
}
|
|
|
|
// Return true iff the manifest contains the specified record.
|
|
bool VersionSet::ManifestContains(const std::string& record) const {
|
|
std::string fname = DescriptorFileName(dbname_, manifest_file_number_);
|
|
Log(options_->info_log, "ManifestContains: checking %s\n", fname.c_str());
|
|
SequentialFile* file = NULL;
|
|
Status s = env_->NewSequentialFile(fname, &file);
|
|
if (!s.ok()) {
|
|
Log(options_->info_log, "ManifestContains: %s\n", s.ToString().c_str());
|
|
return false;
|
|
}
|
|
log::Reader reader(file, NULL, true/*checksum*/, 0);
|
|
Slice r;
|
|
std::string scratch;
|
|
bool result = false;
|
|
while (reader.ReadRecord(&r, &scratch)) {
|
|
if (r == Slice(record)) {
|
|
result = true;
|
|
break;
|
|
}
|
|
}
|
|
delete file;
|
|
Log(options_->info_log, "ManifestContains: result = %d\n", result ? 1 : 0);
|
|
return result;
|
|
}
|
|
|
|
// Approximate byte offset of "ikey" within version "v": the sum of the
// sizes of all files that sort entirely before it, plus the approximate
// offset of the key inside any file whose range contains it.
uint64_t VersionSet::ApproximateOffsetOf(Version* v, const InternalKey& ikey) {
  uint64_t result = 0;
  for (int level = 0; level < config::kNumLevels; level++) {
    const std::vector<FileMetaData*>& files = v->files_[level];
    for (size_t i = 0; i < files.size(); i++) {
      if (icmp_.Compare(files[i]->largest, ikey) <= 0) {
        // Entire file is before "ikey", so just add the file size
        result += files[i]->file_size;
      } else if (icmp_.Compare(files[i]->smallest, ikey) > 0) {
        // Entire file is after "ikey", so ignore
        if (level > 0) {
          // Files other than level 0 are sorted by meta->smallest, so
          // no further files in this level will contain data for
          // "ikey".
          break;
        }
      } else {
        // "ikey" falls in the range for this table.  Add the
        // approximate offset of "ikey" within the table.
        Table* tableptr;
        Iterator* iter = table_cache_->NewIterator(
            ReadOptions(), files[i]->number, files[i]->file_size, &tableptr);
        // tableptr may come back NULL (e.g. the table could not be
        // opened); in that case this file contributes nothing.
        if (tableptr != NULL) {
          result += tableptr->ApproximateOffsetOf(ikey.Encode());
        }
        delete iter;
      }
    }
  }
  return result;
}
|
|
|
|
void VersionSet::AddLiveFiles(std::set<uint64_t>* live) {
|
|
for (Version* v = dummy_versions_.next_;
|
|
v != &dummy_versions_;
|
|
v = v->next_) {
|
|
for (int level = 0; level < config::kNumLevels; level++) {
|
|
const std::vector<FileMetaData*>& files = v->files_[level];
|
|
for (size_t i = 0; i < files.size(); i++) {
|
|
live->insert(files[i]->number);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
int64_t VersionSet::NumLevelBytes(int level) const {
|
|
assert(level >= 0);
|
|
assert(level < config::kNumLevels);
|
|
return TotalFileSize(current_->files_[level]);
|
|
}
|
|
|
|
// Maximum, over all files at levels >= 1, of the total bytes in the next
// level that overlap that file -- i.e. the worst-case amount of data a
// single compaction of one file would have to rewrite.
int64_t VersionSet::MaxNextLevelOverlappingBytes() {
  int64_t result = 0;
  std::vector<FileMetaData*> overlaps;
  for (int level = 1; level < config::kNumLevels - 1; level++) {
    for (size_t i = 0; i < current_->files_[level].size(); i++) {
      const FileMetaData* f = current_->files_[level][i];
      // NOTE(review): "overlaps" is reused across iterations; this assumes
      // GetOverlappingInputs resets the output vector on each call rather
      // than appending -- confirm against its definition.
      current_->GetOverlappingInputs(level+1, &f->smallest, &f->largest,
                                     &overlaps);
      const int64_t sum = TotalFileSize(overlaps);
      if (sum > result) {
        result = sum;
      }
    }
  }
  return result;
}
|
|
|
|
// Stores the minimal range that covers all entries in inputs in
|
|
// *smallest, *largest.
|
|
// REQUIRES: inputs is not empty
|
|
void VersionSet::GetRange(const std::vector<FileMetaData*>& inputs,
|
|
InternalKey* smallest,
|
|
InternalKey* largest) {
|
|
assert(!inputs.empty());
|
|
smallest->Clear();
|
|
largest->Clear();
|
|
for (size_t i = 0; i < inputs.size(); i++) {
|
|
FileMetaData* f = inputs[i];
|
|
if (i == 0) {
|
|
*smallest = f->smallest;
|
|
*largest = f->largest;
|
|
} else {
|
|
if (icmp_.Compare(f->smallest, *smallest) < 0) {
|
|
*smallest = f->smallest;
|
|
}
|
|
if (icmp_.Compare(f->largest, *largest) > 0) {
|
|
*largest = f->largest;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Stores the minimal range that covers all entries in inputs1 and inputs2
|
|
// in *smallest, *largest.
|
|
// REQUIRES: inputs is not empty
|
|
void VersionSet::GetRange2(const std::vector<FileMetaData*>& inputs1,
|
|
const std::vector<FileMetaData*>& inputs2,
|
|
InternalKey* smallest,
|
|
InternalKey* largest) {
|
|
std::vector<FileMetaData*> all = inputs1;
|
|
all.insert(all.end(), inputs2.begin(), inputs2.end());
|
|
GetRange(all, smallest, largest);
|
|
}
|
|
|
|
// Build a single merged iterator over every entry in the compaction's
// input files; this is the source stream used to write the output tables.
Iterator* VersionSet::MakeInputIterator(Compaction* c) {
  ReadOptions options;
  options.verify_checksums = options_->paranoid_checks;
  // A compaction reads each block exactly once, so caching the blocks
  // would only evict more useful data.
  options.fill_cache = false;

  // Level-0 files have to be merged together. For other levels,
  // we will make a concatenating iterator per level.
  // TODO(opt): use concatenating iterator for level-0 if there is no overlap
  const int space = (c->level() == 0 ? c->inputs_[0].size() + 1 : 2);
  Iterator** list = new Iterator*[space];
  int num = 0;
  for (int which = 0; which < 2; which++) {
    if (!c->inputs_[which].empty()) {
      if (c->level() + which == 0) {
        // Level-0 files may overlap each other: one iterator per file.
        const std::vector<FileMetaData*>& files = c->inputs_[which];
        for (size_t i = 0; i < files.size(); i++) {
          list[num++] = table_cache_->NewIterator(
              options, files[i]->number, files[i]->file_size);
        }
      } else {
        // Create concatenating iterator for the files from this level
        list[num++] = NewTwoLevelIterator(
            new Version::LevelFileNumIterator(icmp_, &c->inputs_[which], 0),
            &GetFileIterator, table_cache_, options);
      }
    }
  }
  assert(num <= space);
  Iterator* result = NewMergingIterator(&icmp_, list, num);
  // NOTE(review): only the temporary array is freed here; presumably
  // NewMergingIterator takes ownership of the child iterators -- confirm
  // against its definition.
  delete[] list;
  return result;
}
|
|
|
|
// Half-open range [start, limit) of indices into the next level's sorted
// file list that a given file's key range overlaps.  start == limit means
// the file overlaps nothing in the next level (a trivial-move candidate).
struct CompactionBoundary {
  size_t start;
  size_t limit;
  CompactionBoundary() : start(0), limit(0) {}
  CompactionBoundary(size_t s, size_t l) : start(s), limit(l) {}
};
|
|
|
|
struct CmpByRange {
|
|
CmpByRange(const InternalKeyComparator* cmp) : cmp_(cmp) {}
|
|
bool operator () (const FileMetaData* lhs, const FileMetaData* rhs) {
|
|
int smallest = cmp_->Compare(lhs->smallest, rhs->smallest);
|
|
if (smallest == 0) {
|
|
return cmp_->Compare(lhs->largest, rhs->largest) < 0;
|
|
}
|
|
return smallest < 0;
|
|
}
|
|
private:
|
|
const InternalKeyComparator* cmp_;
|
|
};
|
|
|
|
// Stores the compaction boundaries between level and level + 1
//
// On return: *LA/*LB hold the files of level and level+1 sorted by key
// range; *LA_sizes/*LB_sizes are one-shifted prefix sums of file sizes
// (entry i+1 = total bytes of files [0, i], so a range [i, j) costs
// sizes[j] - sizes[i]); and (*boundaries)[i] is the half-open index range
// of LB files whose user-key ranges overlap LA[i].
void VersionSet::GetCompactionBoundaries(Version* v,
                                         int level,
                                         std::vector<FileMetaData*>* LA,
                                         std::vector<FileMetaData*>* LB,
                                         std::vector<uint64_t>* LA_sizes,
                                         std::vector<uint64_t>* LB_sizes,
                                         std::vector<CompactionBoundary>* boundaries)
{
  const Comparator* user_cmp = icmp_.user_comparator();
  *LA = v->files_[level + 0];
  *LB = v->files_[level + 1];
  *LA_sizes = std::vector<uint64_t>(LA->size() + 1, 0);
  *LB_sizes = std::vector<uint64_t>(LB->size() + 1, 0);
  std::sort(LA->begin(), LA->end(), CmpByRange(&icmp_));
  std::sort(LB->begin(), LB->end(), CmpByRange(&icmp_));
  boundaries->resize(LA->size());

  // compute sizes
  for (size_t i = 0; i < LA->size(); ++i) {
    (*LA_sizes)[i + 1] = (*LA_sizes)[i] + (*LA)[i]->file_size;
  }
  for (size_t i = 0; i < LB->size(); ++i) {
    (*LB_sizes)[i + 1] = (*LB_sizes)[i] + (*LB)[i]->file_size;
  }

  // compute boundaries
  // Two-pointer sweep: both lists are sorted, so start/limit only ever
  // move forward across iterations of the outer loop.
  size_t start = 0;
  size_t limit = 0;
  // figure out which range of LB each LA covers
  for (size_t i = 0; i < LA->size(); ++i) {
    // find smallest start s.t. LB[start] overlaps LA[i]
    while (start < LB->size() &&
           user_cmp->Compare((*LB)[start]->largest.user_key(),
                             (*LA)[i]->smallest.user_key()) < 0) {
      ++start;
    }
    // Keep limit monotone: it must never fall behind start.
    limit = std::max(start, limit);
    // find smallest limit >= start s.t. LB[limit] does not overlap LA[i]
    while (limit < LB->size() &&
           user_cmp->Compare((*LB)[limit]->smallest.user_key(),
                             (*LA)[i]->largest.user_key()) <= 0) {
      ++limit;
    }
    (*boundaries)[i].start = start;
    (*boundaries)[i].limit = limit;
  }
}
|
|
|
|
// Choose a level to compact: the lowest unlocked level whose score is
// >= 1 while the next level's score is < 1 (so the compaction does not
// immediately cascade).  Returns config::kNumLevels when nothing
// qualifies; when "seek_driven" is set, falls back to the level of the
// file whose seek allowance ran out.
int VersionSet::PickCompactionLevel(bool* locked, bool seek_driven) const {
  // Find an unlocked level has score >= 1 where level + 1 has score < 1.
  int level = config::kNumLevels;
  for (int i = 0; i + 1 < config::kNumLevels; ++i) {
    // Both the input level and the output level must be free.
    if (locked[i] || locked[i + 1]) {
      continue;
    }
    if (current_->compaction_scores_[i + 0] >= 1.0 &&
        (i + 2 >= config::kNumLevels ||
         current_->compaction_scores_[i + 1] < 1.0)) {
      level = i;
      break;
    }
  }
  // NOTE(review): the fallback below indexes locked[file_to_compact_level_
  // + 1]; this assumes file_to_compact_level_ < kNumLevels - 1 (or that
  // the locked array has an extra slot) -- confirm against the callers.
  if (seek_driven &&
      level == config::kNumLevels &&
      current_->file_to_compact_ != NULL &&
      !locked[current_->file_to_compact_level_ + 0] &&
      !locked[current_->file_to_compact_level_ + 1]) {
    level = current_->file_to_compact_level_;
  }
  return level;
}
|
|
|
|
// Order files by age: a smaller file number means the file was created
// earlier, so ascending number is oldest-first.
static bool OldestFirst(FileMetaData* a, FileMetaData* b) {
  return b->number > a->number;
}
|
|
|
|
// Build a compaction for "level" in version "v".  For level > 0, input
// files are chosen to maximize the ratio of level bytes to overlapping
// next-level bytes (work accomplished per byte rewritten); level 0 merges
// the oldest files.  Returns NULL if the level is empty; the caller owns
// the returned Compaction.
Compaction* VersionSet::PickCompaction(Version* v, int level) {
  assert(0 <= level && level < config::kNumLevels);
  // "trivial" = chosen files overlap nothing in level+1 and can be moved
  // down without merging.
  bool trivial = false;

  if (v->files_[level].empty()) {
    return NULL;
  }

  Compaction* c = new Compaction(level);
  c->input_version_ = v;
  c->input_version_->Ref();

  if (level > 0) {
    // Sorted file lists for this level (LA) and the next (LB), prefix
    // size sums, and per-file overlap ranges into LB.
    std::vector<FileMetaData*> LA;
    std::vector<FileMetaData*> LB;
    std::vector<uint64_t> LA_sizes;
    std::vector<uint64_t> LB_sizes;
    std::vector<CompactionBoundary> boundaries;
    GetCompactionBoundaries(v, level, &LA, &LB, &LA_sizes, &LB_sizes, &boundaries);

    // find the best set of files: maximize the ratio of sizeof(LA)/sizeof(LB)
    // while keeping sizeof(LA)+sizeof(LB) < some threshold. If there's a tie
    // for ratio, minimize size.
    size_t best_idx_start = 0;
    size_t best_idx_limit = 0;
    uint64_t best_size = 0;
    double best_ratio = -1;
    for (size_t i = 0; i < boundaries.size(); ++i) {
      for (size_t j = i; j < boundaries.size(); ++j) {
        // Candidate window: LA files [i, j] plus every LB file they touch.
        uint64_t sz_a = LA_sizes[j + 1] - LA_sizes[i];
        uint64_t sz_b = LB_sizes[boundaries[j].limit] - LB_sizes[boundaries[i].start];
        if (boundaries[j].start == boundaries[j].limit) {
          // LA[j] overlaps nothing in LB: handled by the trivial-move path.
          trivial = true;
          break;
        }
        if (sz_a + sz_b >= MaxCompactionBytesForLevel(level)) {
          // Extending j only adds bytes; try the next starting index.
          break;
        }
        assert(sz_b > 0); // true because we exclude trivial moves
        double ratio = double(sz_a) / double(sz_b);
        if (ratio > best_ratio ||
            (ratio == best_ratio && sz_a + sz_b < best_size)) {
          best_ratio = ratio;
          best_size = sz_a + sz_b;
          best_idx_start = i;
          best_idx_limit = j + 1;
        }
      }
    }

    // Trivial moves have a near-0 cost, so do them first.
    if (trivial) {
      // Collect every no-overlap file into a single move.
      for (size_t i = 0; i < LA.size(); ++i) {
        if (boundaries[i].start == boundaries[i].limit) {
          c->inputs_[0].push_back(LA[i]);
        }
      }
      trivial = level != 0;
      c->SetRatio(1.0);
    // If the best we could do would be wasteful and the best level has more
    // data in it than the next level would have, move it all
    } else if (level < 4 && best_ratio >= 0.0 &&
               LA_sizes.back() * best_ratio >= LB_sizes.back()) {
      for (size_t i = 0 ; i < LA.size(); ++i) {
        c->inputs_[0].push_back(LA[i]);
      }
      c->SetRatio(double(LA_sizes.back()) / double(LB_sizes.back()));
    // otherwise go with the best ratio
    } else if (best_ratio >= 0.0) {
      for (size_t i = best_idx_start; i < best_idx_limit; ++i) {
        assert(i >= 0 && i < LA.size());
        c->inputs_[0].push_back(LA[i]);
      }
      // Pull in the full contiguous range of LB files the window overlaps.
      for (size_t i = boundaries[best_idx_start].start;
           i < boundaries[best_idx_limit - 1].limit; ++i) {
        assert(i >= 0 && i < LB.size());
        c->inputs_[1].push_back(LB[i]);
      }
      c->SetRatio(best_ratio);
    // pick the file to compact in this level
    } else if (v->file_to_compact_ != NULL) {
      c->inputs_[0].push_back(v->file_to_compact_);
    // otherwise just pick the file with least overlap
    } else {
      assert(level >= 0);
      assert(level+1 < config::kNumLevels);
      // Pick the file that overlaps with the fewest files in the next level
      // NOTE(review): "largest" is never read after initialization.
      size_t largest = boundaries.size();
      size_t smallest = boundaries.size();
      for (size_t i = 0; i < boundaries.size(); ++i) {
        if (smallest == boundaries.size() ||
            boundaries[smallest].limit - boundaries[smallest].start >
            boundaries[i].limit - boundaries[i].start) {
          smallest = i;
        }
      }
      assert(smallest < boundaries.size());
      c->inputs_[0].push_back(LA[smallest]);
      for (size_t i = boundaries[smallest].start; i < boundaries[smallest].limit; ++i) {
        c->inputs_[1].push_back(LB[i]);
      }
    }
  } else {
    // Level 0: files may overlap each other, so merge the oldest files
    // together, capped at 32 inputs.
    std::vector<FileMetaData*> tmp(v->files_[0]);
    std::sort(tmp.begin(), tmp.end(), OldestFirst);
    for (size_t i = 0; i < tmp.size() && c->inputs_[0].size() < 32; ++i) {
      c->inputs_[0].push_back(tmp[i]);
    }
  }

  // Non-trivial compactions also need their level+1 inputs and an updated
  // compaction pointer.
  if (!trivial) {
    SetupOtherInputs(c);
  }
  return c;
}
|
|
|
|
// Complete a compaction whose level-N inputs are already chosen: compute
// the overlapping level-N+1 files and record where the next compaction of
// this level should start.
void VersionSet::SetupOtherInputs(Compaction* c) {
  const int level = c->level();
  InternalKey smallest, largest;
  // Key range covered by the chosen level-N inputs.
  GetRange(c->inputs_[0], &smallest, &largest);
  c->input_version_->GetOverlappingInputs(level+1, &smallest, &largest, &c->inputs_[1]);

  // Update the place where we will do the next compaction for this level.
  // We update this immediately instead of waiting for the VersionEdit
  // to be applied so that if the compaction fails, we will try a different
  // key range next time.
  //compact_pointer_[level] = largest.Encode().ToString();
  c->edit_.SetCompactPointer(level, largest);
}
|
|
|
|
// Build a compaction covering the files at "level" that overlap
// [begin, end] (NULL bounds are unbounded).  Returns NULL when nothing
// at that level overlaps the range; the caller owns the result.
Compaction* VersionSet::CompactRange(
    int level,
    const InternalKey* begin,
    const InternalKey* end) {
  std::vector<FileMetaData*> inputs;
  current_->GetOverlappingInputs(level, begin, end, &inputs);
  if (inputs.empty()) {
    return NULL;
  }

  // Avoid compacting too much in one shot in case the range is large.
  // But we cannot do this for level-0 since level-0 files can overlap
  // and we must not pick one file and drop another older file if the
  // two files overlap.
  if (level > 0) {
    const uint64_t limit = MaxFileSizeForLevel(level);
    uint64_t accumulated = 0;
    size_t keep = inputs.size();
    for (size_t i = 0; i < inputs.size(); ++i) {
      accumulated += inputs[i]->file_size;
      if (accumulated >= limit) {
        // Truncate after the file that crossed the limit.
        keep = i + 1;
        break;
      }
    }
    inputs.resize(keep);
  }

  Compaction* c = new Compaction(level);
  c->input_version_ = current_;
  c->input_version_->Ref();
  c->inputs_[0] = inputs;
  SetupOtherInputs(c);
  return c;
}
|
|
|
|
// Construct a compaction at "level"; input files and the input version
// are filled in afterwards by the VersionSet picker.
Compaction::Compaction(int level)
    : level_(level),
      max_output_file_size_(MaxFileSizeForLevel(level)),
      input_version_(NULL),
      ratio_(0) {
  // No per-level progress has been made yet for IsBaseLevelForKey().
  for (int lvl = 0; lvl < config::kNumLevels; ++lvl) {
    level_ptrs_[lvl] = 0;
  }
}
|
|
|
|
// Drop the reference on the input version if it was never released.
Compaction::~Compaction() {
  ReleaseInputs();
}
|
|
|
|
bool Compaction::IsTrivialMove() const {
|
|
return num_input_files(1) == 0;
|
|
}
|
|
|
|
// Record the deletion of every input file in *edit, to be applied once
// the compaction's output files have been installed.
void Compaction::AddInputDeletions(VersionEdit* edit) {
  for (int which = 0; which < 2; ++which) {
    const int input_level = level_ + which;
    const std::vector<FileMetaData*>& files = inputs_[which];
    for (size_t i = 0; i < files.size(); ++i) {
      edit->DeleteFile(input_level, files[i]->number);
    }
  }
}
|
|
|
|
// Returns true iff no level deeper than level_+1 could hold an entry for
// "user_key" -- used to decide whether a deletion marker can be dropped
// during compaction.  level_ptrs_ memoizes the scan position per level;
// NOTE(review): that memoization appears to assume successive calls see
// non-decreasing keys (the compaction iterates in sorted order) --
// confirm against the caller.
bool Compaction::IsBaseLevelForKey(const Slice& user_key) {
  // Maybe use binary search to find right entry instead of linear search?
  const Comparator* user_cmp = input_version_->vset_->icmp_.user_comparator();
  for (int lvl = level_ + 2; lvl < config::kNumLevels; lvl++) {
    const std::vector<FileMetaData*>& files = input_version_->files_[lvl];
    for (; level_ptrs_[lvl] < files.size(); ) {
      FileMetaData* f = files[level_ptrs_[lvl]];
      if (user_cmp->Compare(user_key, f->largest.user_key()) <= 0) {
        // We've advanced far enough
        if (user_cmp->Compare(user_key, f->smallest.user_key()) >= 0) {
          // Key falls in this file's range, so definitely not base level
          return false;
        }
        break;
      }
      // This file ends before user_key; skip it permanently.
      level_ptrs_[lvl]++;
    }
  }
  return true;
}
|
|
|
|
void Compaction::ReleaseInputs() {
|
|
if (input_version_ != NULL) {
|
|
input_version_->Unref();
|
|
input_version_ = NULL;
|
|
}
|
|
}
|
|
|
|
} // namespace hyperleveldb
|