Allow users to profile a query and see bottleneck of the query

Summary:
Provide a framework to profile a query in detail to figure out latency bottleneck. Currently, in Get(), Put() and iterators, 2-3 simple timing is used. We can easily add more profile counters to the framework later.

Test Plan: Enable this profiling in seveal existing tests.

Reviewers: haobo, dhruba, kailiu, emayanke, vamsi, igor

CC: leveldb

Differential Revision: https://reviews.facebook.net/D14001
This commit is contained in:
Siying Dong
2013-11-18 11:32:54 -08:00
committed by Siying Dong
parent 56589ab81f
commit 3d8ac31d71
11 changed files with 202 additions and 20 deletions

View File

@@ -22,6 +22,7 @@
#include "rocksdb/memtablerep.h"
#include "rocksdb/write_batch.h"
#include "rocksdb/statistics.h"
#include "rocksdb/perf_context.h"
#include "port/port.h"
#include "util/bit_set.h"
#include "util/crc32c.h"
@@ -350,6 +351,8 @@ DEFINE_int64(stats_interval, 0, "Stats are reported every N operations when "
DEFINE_int32(stats_per_interval, 0, "Reports additional stats per interval when"
" this is greater than 0.");
DEFINE_int32(perf_level, 0, "Level of perf collection");
static bool ValidateRateLimit(const char* flagname, double value) {
static constexpr double EPSILON = 1e-10;
if ( value < -EPSILON ) {
@@ -689,6 +692,7 @@ struct SharedState {
port::Mutex mu;
port::CondVar cv;
int total;
int perf_level;
// Each thread goes through the following states:
// (1) initializing
@@ -700,7 +704,7 @@ struct SharedState {
long num_done;
bool start;
SharedState() : cv(&mu) { }
SharedState() : cv(&mu), perf_level(FLAGS_perf_level) { }
};
// Per-thread state for concurrent executions of the same benchmark.
@@ -810,6 +814,7 @@ class Benchmark {
fprintf(stdout, "Memtablerep: vector\n");
break;
}
fprintf(stdout, "Perf Level: %d\n", FLAGS_perf_level);
PrintWarnings();
fprintf(stdout, "------------------------------------------------\n");
@@ -1150,6 +1155,7 @@ class Benchmark {
}
}
SetPerfLevel(static_cast<PerfLevel> (shared->perf_level));
thread->stats.Start(thread->tid);
(arg->bm->*(arg->method))(thread);
thread->stats.Stop();

View File

@@ -1035,7 +1035,7 @@ Status DBImpl::WriteLevel0Table(std::vector<MemTable*> &mems, VersionEdit* edit,
(unsigned long)m->GetLogNumber());
list.push_back(m->NewIterator());
}
Iterator* iter = NewMergingIterator(&internal_comparator_, &list[0],
Iterator* iter = NewMergingIterator(env_, &internal_comparator_, &list[0],
list.size());
const SequenceNumber newest_snapshot = snapshots_.GetNewest();
const SequenceNumber earliest_seqno_in_memtable =
@@ -2519,7 +2519,7 @@ Iterator* DBImpl::NewInternalIterator(const ReadOptions& options,
// Collect iterators for files in L0 - Ln
versions_->current()->AddIterators(options, storage_options_, &list);
Iterator* internal_iter =
NewMergingIterator(&internal_comparator_, &list[0], list.size());
NewMergingIterator(env_, &internal_comparator_, &list[0], list.size());
versions_->current()->Ref();
cleanup->mu = &mutex_;
@@ -2555,6 +2555,8 @@ Status DBImpl::GetImpl(const ReadOptions& options,
Status s;
StopWatch sw(env_, options_.statistics, DB_GET);
StopWatchNano snapshot_timer(env_, false);
StartPerfTimer(&snapshot_timer);
SequenceNumber snapshot;
mutex_.Lock();
if (options.snapshot != nullptr) {
@@ -2583,15 +2585,23 @@ Status DBImpl::GetImpl(const ReadOptions& options,
// s is both in/out. When in, s could either be OK or MergeInProgress.
// merge_operands will contain the sequence of merges in the latter case.
LookupKey lkey(key, snapshot);
BumpPerfTime(&perf_context.get_snapshot_time, &snapshot_timer);
if (mem->Get(lkey, value, &s, &merge_operands, options_)) {
// Done
} else if (imm.Get(lkey, value, &s, &merge_operands, options_)) {
// Done
} else {
StopWatchNano from_files_timer(env_, false);
StartPerfTimer(&from_files_timer);
current->Get(options, lkey, value, &s, &merge_operands, &stats,
options_, value_found);
have_stat_update = true;
BumpPerfTime(&perf_context.get_from_output_files_time, &from_files_timer);
}
StopWatchNano post_process_timer(env_, false);
StartPerfTimer(&post_process_timer);
mutex_.Lock();
if (!options_.disable_seek_compaction &&
@@ -2607,6 +2617,8 @@ Status DBImpl::GetImpl(const ReadOptions& options,
// Note, tickers are atomic now - no lock protection needed any more.
RecordTick(options_.statistics, NUMBER_KEYS_READ);
RecordTick(options_.statistics, BYTES_READ, value->size());
BumpPerfTime(&perf_context.get_post_process_time, &post_process_timer);
return s;
}
@@ -2615,6 +2627,8 @@ std::vector<Status> DBImpl::MultiGet(const ReadOptions& options,
std::vector<std::string>* values) {
StopWatch sw(env_, options_.statistics, DB_MULTIGET);
StopWatchNano snapshot_timer(env_, false);
StartPerfTimer(&snapshot_timer);
SequenceNumber snapshot;
mutex_.Lock();
if (options.snapshot != nullptr) {
@@ -2646,6 +2660,7 @@ std::vector<Status> DBImpl::MultiGet(const ReadOptions& options,
// Keep track of bytes that we read for statistics-recording later
uint64_t bytesRead = 0;
BumpPerfTime(&perf_context.get_snapshot_time, &snapshot_timer);
// For each of the given keys, apply the entire "get" process as follows:
// First look in the memtable, then in the immutable memtable (if any).
@@ -2672,6 +2687,8 @@ std::vector<Status> DBImpl::MultiGet(const ReadOptions& options,
}
// Post processing (decrement reference counts and record statistics)
StopWatchNano post_process_timer(env_, false);
StartPerfTimer(&post_process_timer);
mutex_.Lock();
if (!options_.disable_seek_compaction &&
have_stat_update && current->UpdateStats(stats)) {
@@ -2686,6 +2703,7 @@ std::vector<Status> DBImpl::MultiGet(const ReadOptions& options,
RecordTick(options_.statistics, NUMBER_MULTIGET_CALLS);
RecordTick(options_.statistics, NUMBER_MULTIGET_KEYS_READ, numKeys);
RecordTick(options_.statistics, NUMBER_MULTIGET_BYTES_READ, bytesRead);
BumpPerfTime(&perf_context.get_post_process_time, &post_process_timer);
return statList;
}
@@ -2754,6 +2772,8 @@ Status DBImpl::Delete(const WriteOptions& options, const Slice& key) {
}
Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) {
StopWatchNano pre_post_process_timer(env_, false);
StartPerfTimer(&pre_post_process_timer);
Writer w(&mutex_);
w.batch = my_batch;
w.sync = options.sync;
@@ -2800,12 +2820,13 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) {
if (options.disableWAL) {
flush_on_destroy_ = true;
}
BumpPerfTime(&perf_context.write_pre_and_post_process_time,
&pre_post_process_timer);
if (!options.disableWAL) {
StopWatchNano timer(env_);
StartPerfTimer(&timer);
status = log_->AddRecord(WriteBatchInternal::Contents(updates));
BumpPerfTime(&perf_context.wal_write_time, &timer);
if (status.ok() && options.sync) {
if (options_.use_fsync) {
StopWatch(env_, options_.statistics, WAL_FILE_SYNC_MICROS);
@@ -2815,10 +2836,14 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) {
status = log_->file()->Sync();
}
}
BumpPerfTime(&perf_context.write_wal_time, &timer);
}
if (status.ok()) {
StopWatchNano write_memtable_timer(env_, false);
StartPerfTimer(&write_memtable_timer);
status = WriteBatchInternal::InsertInto(updates, mem_, &options_, this,
options_.filter_deletes);
BumpPerfTime(&perf_context.write_memtable_time, &write_memtable_timer);
if (!status.ok()) {
// Panic for in-memory corruptions
// Note that existing logic was not sound. Any partial failure writing
@@ -2828,6 +2853,7 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) {
}
SetTickerCount(options_.statistics, SEQUENCE_NUMBER, last_sequence);
}
StartPerfTimer(&pre_post_process_timer);
LogFlush(options_.info_log);
mutex_.Lock();
if (status.ok()) {
@@ -2855,6 +2881,8 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) {
if (!writers_.empty()) {
writers_.front()->cv.Signal();
}
BumpPerfTime(&perf_context.write_pre_and_post_process_time,
&pre_post_process_timer);
return status;
}

View File

@@ -102,7 +102,8 @@ class DBIter: public Iterator {
virtual void SeekToLast();
private:
void FindNextUserEntry(bool skipping);
inline void FindNextUserEntry(bool skipping);
void FindNextUserEntryInternal(bool skipping);
void FindPrevUserEntry();
bool ParseKey(ParsedInternalKey* key);
void MergeValuesNewToOld();
@@ -191,7 +192,15 @@ void DBIter::Next() {
//
// NOTE: In between, saved_key_ can point to a user key that has
// a delete marker
void DBIter::FindNextUserEntry(bool skipping) {
inline void DBIter::FindNextUserEntry(bool skipping) {
StopWatchNano timer(env_, false);
StartPerfTimer(&timer);
FindNextUserEntryInternal(skipping);
BumpPerfTime(&perf_context.find_next_user_entry_time, &timer);
}
// Actual implementation of DBIter::FindNextUserEntry()
void DBIter::FindNextUserEntryInternal(bool skipping) {
// Loop until we hit an acceptable entry to yield
assert(iter_->Valid());
assert(direction_ == kForward);
@@ -431,7 +440,10 @@ void DBIter::Seek(const Slice& target) {
saved_key_.clear();
AppendInternalKey(
&saved_key_, ParsedInternalKey(target, sequence_, kValueTypeForSeek));
StopWatchNano internal_seek_timer(env_, false);
StartPerfTimer(&internal_seek_timer);
iter_->Seek(saved_key_);
BumpPerfTime(&perf_context.seek_internal_seek_time, &internal_seek_timer);
if (iter_->Valid()) {
FindNextUserEntry(false /*not skipping */);
} else {
@@ -442,7 +454,10 @@ void DBIter::Seek(const Slice& target) {
void DBIter::SeekToFirst() {
direction_ = kForward;
ClearSavedValue();
StopWatchNano internal_seek_timer(env_, false);
StartPerfTimer(&internal_seek_timer);
iter_->SeekToFirst();
BumpPerfTime(&perf_context.seek_internal_seek_time, &internal_seek_timer);
if (iter_->Valid()) {
FindNextUserEntry(false /* not skipping */);
} else {
@@ -461,7 +476,10 @@ void DBIter::SeekToLast() {
direction_ = kReverse;
ClearSavedValue();
StopWatchNano internal_seek_timer(env_, false);
StartPerfTimer(&internal_seek_timer);
iter_->SeekToLast();
BumpPerfTime(&perf_context.seek_internal_seek_time, &internal_seek_timer);
FindPrevUserEntry();
}

View File

@@ -22,6 +22,7 @@
#include "rocksdb/compaction_filter.h"
#include "rocksdb/env.h"
#include "rocksdb/table.h"
#include "rocksdb/perf_context.h"
#include "util/hash.h"
#include "util/logging.h"
#include "util/mutexlock.h"
@@ -1215,7 +1216,13 @@ TEST(DBTest, IterMulti) {
ASSERT_EQ(IterStatus(iter), "a->va");
iter->Seek("ax");
ASSERT_EQ(IterStatus(iter), "b->vb");
SetPerfLevel(kEnableTime);
perf_context.Reset();
iter->Seek("b");
ASSERT_TRUE((int) perf_context.seek_internal_seek_time > 0);
ASSERT_TRUE((int) perf_context.find_next_user_entry_time > 0);
SetPerfLevel(kDisable);
ASSERT_EQ(IterStatus(iter), "b->vb");
iter->Seek("z");
ASSERT_EQ(IterStatus(iter), "(invalid)");
@@ -1230,7 +1237,12 @@ TEST(DBTest, IterMulti) {
// Switch from forward to reverse
iter->SeekToFirst();
iter->Next();
SetPerfLevel(kEnableTime);
perf_context.Reset();
iter->Next();
ASSERT_EQ(0, (int) perf_context.seek_internal_seek_time);
ASSERT_TRUE((int) perf_context.find_next_user_entry_time > 0);
SetPerfLevel(kDisable);
iter->Prev();
ASSERT_EQ(IterStatus(iter), "b->vb");
@@ -1590,22 +1602,42 @@ TEST(DBTest, NumImmutableMemTable) {
std::string big_value(1000000, 'x');
std::string num;
SetPerfLevel(kEnableTime);;
ASSERT_OK(dbfull()->Put(writeOpt, "k1", big_value));
ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num));
ASSERT_EQ(num, "0");
perf_context.Reset();
Get("k1");
ASSERT_EQ(1, (int) perf_context.get_from_memtable_count);
ASSERT_OK(dbfull()->Put(writeOpt, "k2", big_value));
ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num));
ASSERT_EQ(num, "1");
perf_context.Reset();
Get("k1");
ASSERT_EQ(2, (int) perf_context.get_from_memtable_count);
perf_context.Reset();
Get("k2");
ASSERT_EQ(1, (int) perf_context.get_from_memtable_count);
ASSERT_OK(dbfull()->Put(writeOpt, "k3", big_value));
ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num));
ASSERT_EQ(num, "2");
perf_context.Reset();
Get("k2");
ASSERT_EQ(2, (int) perf_context.get_from_memtable_count);
perf_context.Reset();
Get("k3");
ASSERT_EQ(1, (int) perf_context.get_from_memtable_count);
perf_context.Reset();
Get("k1");
ASSERT_EQ(3, (int) perf_context.get_from_memtable_count);
dbfull()->Flush(FlushOptions());
ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num));
ASSERT_EQ(num, "0");
SetPerfLevel(kDisable);
} while (ChangeCompactOptions());
}
@@ -1614,11 +1646,16 @@ TEST(DBTest, FLUSH) {
Options options = CurrentOptions();
WriteOptions writeOpt = WriteOptions();
writeOpt.disableWAL = true;
SetPerfLevel(kEnableTime);;
ASSERT_OK(dbfull()->Put(writeOpt, "foo", "v1"));
// this will now also flush the last 2 writes
dbfull()->Flush(FlushOptions());
ASSERT_OK(dbfull()->Put(writeOpt, "bar", "v1"));
perf_context.Reset();
Get("foo");
ASSERT_TRUE((int) perf_context.get_from_output_files_time > 0);
Reopen();
ASSERT_EQ("v1", Get("foo"));
ASSERT_EQ("v1", Get("bar"));
@@ -1630,7 +1667,9 @@ TEST(DBTest, FLUSH) {
Reopen();
ASSERT_EQ("v2", Get("bar"));
perf_context.Reset();
ASSERT_EQ("v2", Get("foo"));
ASSERT_TRUE((int) perf_context.get_from_output_files_time > 0);
writeOpt.disableWAL = false;
ASSERT_OK(dbfull()->Put(writeOpt, "bar", "v3"));
@@ -1642,6 +1681,8 @@ TEST(DBTest, FLUSH) {
// has WAL enabled.
ASSERT_EQ("v3", Get("foo"));
ASSERT_EQ("v3", Get("bar"));
SetPerfLevel(kDisable);
} while (ChangeCompactOptions());
}

View File

@@ -19,6 +19,8 @@
#include "util/coding.h"
#include "util/mutexlock.h"
#include "util/murmurhash.h"
#include "util/perf_context_imp.h"
#include "util/stop_watch.h"
namespace std {
template <>
@@ -162,6 +164,9 @@ void MemTable::Add(SequenceNumber s, ValueType type,
bool MemTable::Get(const LookupKey& key, std::string* value, Status* s,
std::deque<std::string>* operands, const Options& options) {
StopWatchNano memtable_get_timer(options.env, false);
StartPerfTimer(&memtable_get_timer);
Slice memkey = key.memtable_key();
std::shared_ptr<MemTableRep::Iterator> iter(
table_->GetIterator(key.user_key()));
@@ -175,7 +180,8 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s,
auto logger = options.info_log;
std::string merge_result;
for (; iter->Valid(); iter->Next()) {
bool found_final_value = false;
for (; !found_final_value && iter->Valid(); iter->Next()) {
// entry format is:
// klength varint32
// userkey char[klength-8]
@@ -212,7 +218,8 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s,
if (options.inplace_update_support) {
GetLock(key.user_key())->Unlock();
}
return true;
found_final_value = true;
break;
}
case kTypeDeletion: {
if (merge_in_progress) {
@@ -226,7 +233,8 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s,
} else {
*s = Status::NotFound(Slice());
}
return true;
found_final_value = true;
break;
}
case kTypeMerge: {
Slice v = GetLengthPrefixedSlice(key_ptr + key_length);
@@ -260,10 +268,12 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s,
// No change to value, since we have not yet found a Put/Delete
if (merge_in_progress) {
if (!found_final_value && merge_in_progress) {
*s = Status::MergeInProgress("");
}
return false;
BumpPerfTime(&perf_context.get_from_memtable_time, &memtable_get_timer);
BumpPerfCount(&perf_context.get_from_memtable_count);
return found_final_value;
}
bool MemTable::Update(SequenceNumber seq, ValueType type,

View File

@@ -174,6 +174,13 @@ void ProfileKeyComparison() {
HistogramImpl hist_put;
HistogramImpl hist_get;
HistogramImpl hist_get_snapshot;
HistogramImpl hist_get_memtable;
HistogramImpl hist_get_post_process;
HistogramImpl hist_num_memtable_checked;
HistogramImpl hist_write_pre_post;
HistogramImpl hist_write_wal_time;
HistogramImpl hist_write_memtable_time;
std::cout << "Inserting " << FLAGS_total_keys << " key/value pairs\n...\n";
@@ -192,16 +199,37 @@ void ProfileKeyComparison() {
perf_context.Reset();
db->Put(write_options, key, value);
hist_write_pre_post.Add(perf_context.write_pre_and_post_process_time);
hist_write_wal_time.Add(perf_context.write_wal_time);
hist_write_memtable_time.Add(perf_context.write_memtable_time);
hist_put.Add(perf_context.user_key_comparison_count);
perf_context.Reset();
db->Get(read_options, key, &value);
hist_get_snapshot.Add(perf_context.get_snapshot_time);
hist_get_memtable.Add(perf_context.get_from_memtable_time);
hist_num_memtable_checked.Add(perf_context.get_from_memtable_count);
hist_get_post_process.Add(perf_context.get_post_process_time);
hist_get.Add(perf_context.user_key_comparison_count);
}
std::cout << "Put uesr key comparison: \n" << hist_put.ToString()
<< "Get uesr key comparison: \n" << hist_get.ToString();
std::cout << "Put(): Pre and Post Process Time: \n"
<< hist_write_pre_post.ToString()
<< " Writing WAL time: \n"
<< hist_write_wal_time.ToString() << "\n"
<< " Writing Mem Table time: \n"
<< hist_write_memtable_time.ToString() << "\n";
std::cout << "Get(): Time to get snapshot: \n"
<< hist_get_snapshot.ToString()
<< " Time to get value from memtables: \n"
<< hist_get_memtable.ToString() << "\n"
<< " Number of memtables checked: \n"
<< hist_num_memtable_checked.ToString() << "\n"
<< " Time to post process: \n"
<< hist_get_post_process.ToString() << "\n";
}
TEST(PerfContextTest, KeyComparisonCount) {
@@ -259,8 +287,8 @@ TEST(PerfContextTest, SeekKeyComparison) {
db->Put(write_options, key, value);
auto put_time = timer.ElapsedNanos();
hist_put_time.Add(put_time);
hist_wal_time.Add(perf_context.wal_write_time);
hist_time_diff.Add(put_time - perf_context.wal_write_time);
hist_wal_time.Add(perf_context.write_wal_time);
hist_time_diff.Add(put_time - perf_context.write_wal_time);
}
std::cout << "Put time:\n" << hist_put_time.ToString()

View File

@@ -2091,7 +2091,7 @@ Iterator* VersionSet::MakeInputIterator(Compaction* c) {
}
}
assert(num <= space);
Iterator* result = NewMergingIterator(&icmp_, list, num);
Iterator* result = NewMergingIterator(env_, &icmp_, list, num);
delete[] list;
return result;
}