Prefix filters for scans (v4)

Summary: Similar to v2 (db and table code understands prefixes), but use ReadOptions as in v3. Also, make the CreateFilter code faster and cleaner. Test Plan: make db_test; export LEVELDB_TESTS=PrefixScan; ./db_test Reviewers: dhruba Reviewed By: dhruba CC: haobo, emayanke Differential Revision: https://reviews.facebook.net/D12027
2025-12-06 17:27:55 +00:00 · 2013-08-13 14:04:56 -07:00
parent 3b81df34bd
commit f5f1842282
13 changed files with 519 additions and 44 deletions
--- a/db/db_impl.cc
+++ b/db/db_impl.cc
@@ -23,6 +23,7 @@
 #include "db/memtable.h"
 #include "db/memtablelist.h"
 #include "db/merge_helper.h"
+#include "db/prefix_filter_iterator.h"
 #include "db/table_cache.h"
 #include "db/version_set.h"
 #include "db/write_batch_internal.h"
@@ -2339,12 +2340,19 @@ bool DBImpl::KeyMayExist(const ReadOptions& options,

+// Builds the iterator handed back to the caller for the given read options:
+// an internal iterator wrapped in a DB iterator and, when options.prefix is
+// set, wrapped once more in a PrefixFilterIterator.
 Iterator* DBImpl::NewIterator(const ReadOptions& options) {
   SequenceNumber latest_snapshot;
-  Iterator* internal_iter = NewInternalIterator(options, &latest_snapshot);
-  return NewDBIterator(
-      &dbname_, env_, options_, user_comparator(), internal_iter,
-      (options.snapshot != nullptr
-       ? reinterpret_cast<const SnapshotImpl*>(options.snapshot)->number_
-       : latest_snapshot));
+  // latest_snapshot is filled in by NewInternalIterator through the pointer.
+  Iterator* iter = NewInternalIterator(options, &latest_snapshot);
+  // Read at the caller-supplied snapshot's sequence number when there is one,
+  // otherwise at latest_snapshot.
+  iter = NewDBIterator(
+             &dbname_, env_, options_, user_comparator(), iter,
+             (options.snapshot != nullptr
+              ? reinterpret_cast<const SnapshotImpl*>(options.snapshot)->number_
+              : latest_snapshot));
+  if (options.prefix) {
+    // use extra wrapper to exclude any keys from the results which
+    // don't begin with the prefix
+    // The wrapper deletes the iterator it wraps in its own destructor, so
+    // the caller still only deletes the single iterator returned here.
+    // NOTE(review): the prefix data that options.prefix refers to presumably
+    // has to stay valid for as long as the returned iterator is in use.
+    iter = new PrefixFilterIterator(iter, *options.prefix,
+                                    options_.prefix_extractor);
+  }
+  return iter;
 }

 const Snapshot* DBImpl::GetSnapshot() {
--- a/db/db_test.cc
+++ b/db/db_test.cc
@@ -3640,6 +3640,135 @@ TEST(DBTest, MultiGetEmpty) {
  } while (ChangeCompactOptions());
 }

+// Writes one key of the form "<NN>______:<suffix>" into the DB under test,
+// where <NN> is prefix_id printed as two zero-padded digits.  The value
+// stored is identical to the key.
+static void PutPrefixScanKey(DBTest* dbtest, int prefix_id,
+                             const char* suffix) {
+  char buf[100];
+  snprintf(buf, sizeof(buf), "%02d______:%s", prefix_id, suffix);
+  const std::string keystr(buf);
+  ASSERT_OK(dbtest->Put(keystr, keystr));
+}
+
+// Populates the DB under test with the file layout used by the PrefixScan
+// test.  Each TEST_CompactMemTable() call below follows one start/end key
+// pair, so the order of the statements determines the resulting layout and
+// must not be changed.
+void PrefixScanInit(DBTest *dbtest) {
+  const int small_range_sstfiles = 5;
+  const int big_range_sstfiles = 5;
+
+  // Generate 11 sst files with the following prefix ranges.
+  // GROUP 0: [0,10]                              (level 1)
+  // GROUP 1: [1,2], [2,3], [3,4], [4,5], [5, 6]  (level 0)
+  // GROUP 2: [0,6], [0,7], [0,8], [0,9], [0,10]  (level 0)
+  //
+  // A seek with the previous API would do 11 random I/Os (to all the
+  // files).  With the new API and a prefix filter enabled, we should
+  // only do 2 random I/Os, to the 2 files containing the key.
+
+  // GROUP 0
+  PutPrefixScanKey(dbtest, 0, "start");
+  PutPrefixScanKey(dbtest, 10, "end");
+  dbtest->dbfull()->TEST_CompactMemTable();
+  dbtest->dbfull()->CompactRange(nullptr, nullptr); // move to level 1
+
+  // GROUP 1: narrow ranges [i, i+1]
+  for (int i = 1; i <= small_range_sstfiles; i++) {
+    PutPrefixScanKey(dbtest, i, "start");
+    PutPrefixScanKey(dbtest, i + 1, "end");
+    dbtest->dbfull()->TEST_CompactMemTable();
+  }
+
+  // GROUP 2: wide ranges that all start at prefix 0
+  for (int i = 1; i <= big_range_sstfiles; i++) {
+    PutPrefixScanKey(dbtest, 0, "start");
+    PutPrefixScanKey(dbtest, small_range_sstfiles + i + 1, "end");
+    dbtest->dbfull()->TEST_CompactMemTable();
+  }
+}
+
+// Checks that a prefix-restricted iterator returns only keys with the
+// requested prefix and touches far fewer files than an unrestricted scan.
+// The data set is rebuilt from scratch before each of the three scans so the
+// random-read counter starts from a known state every time.
+TEST(DBTest, PrefixScan) {
+  ReadOptions ro = ReadOptions();
+  int count;
+  Slice prefix;
+  Slice key;
+  char buf[100];
+  Iterator* iter;
+  snprintf(buf, sizeof(buf), "03______:");
+  prefix = Slice(buf, 8);  // "03______"  - matches the 8-byte extractor below
+  key = Slice(buf, 9);     // "03______:" - a seek target inside that prefix
+
+  // db configs
+  env_->count_random_reads_ = true;
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.block_cache = NewLRUCache(0);  // Prevent cache hits
+  options.filter_policy =  NewBloomFilterPolicy(10);
+  options.prefix_extractor = NewFixedPrefixTransform(8);
+  options.whole_key_filtering = false;
+  options.disable_auto_compactions = true;
+  options.max_background_compactions = 2;
+  options.create_if_missing = true;
+  options.disable_seek_compaction = true;
+
+  // prefix specified, with blooms: 2 RAND I/Os
+  // SeekToFirst
+  DestroyAndReopen(&options);
+  PrefixScanInit(this);
+  count = 0;
+  env_->random_read_counter_.Reset();
+  ro.prefix = &prefix;
+  iter = db_->NewIterator(ro);
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    // ASSERT_TRUE rather than assert(): assert() compiles to nothing when
+    // NDEBUG is defined, which would silently drop this check.
+    ASSERT_TRUE(iter->key().starts_with(prefix));
+    count++;
+  }
+  ASSERT_TRUE(iter->status().ok());
+  delete iter;
+  ASSERT_EQ(count, 2);
+  ASSERT_EQ(env_->random_read_counter_.Read(), 2);
+
+  // prefix specified, with blooms: 2 RAND I/Os
+  // Seek
+  DestroyAndReopen(&options);
+  PrefixScanInit(this);
+  count = 0;
+  env_->random_read_counter_.Reset();
+  ro.prefix = &prefix;
+  iter = db_->NewIterator(ro);
+  for (iter->Seek(key); iter->Valid(); iter->Next()) {
+    ASSERT_TRUE(iter->key().starts_with(prefix));
+    count++;
+  }
+  ASSERT_TRUE(iter->status().ok());
+  delete iter;
+  ASSERT_EQ(count, 2);
+  ASSERT_EQ(env_->random_read_counter_.Read(), 2);
+
+  // no prefix specified: 11 RAND I/Os
+  DestroyAndReopen(&options);
+  PrefixScanInit(this);
+  count = 0;
+  env_->random_read_counter_.Reset();
+  iter = db_->NewIterator(ReadOptions());
+  for (iter->Seek(prefix); iter->Valid(); iter->Next()) {
+    // Without the prefix wrapper the iterator keeps going past the prefix,
+    // so the scan has to be cut off by hand.
+    if (! iter->key().starts_with(prefix)) {
+      break;
+    }
+    count++;
+  }
+  ASSERT_TRUE(iter->status().ok());
+  delete iter;
+  ASSERT_EQ(count, 2);
+  ASSERT_EQ(env_->random_read_counter_.Read(), 11);
+  Close();
+  delete options.filter_policy;
+  delete options.prefix_extractor;
+}
+
 std::string MakeKey(unsigned int num) {
  char buf[30];
  snprintf(buf, sizeof(buf), "%016u", num);
--- a/db/prefix_filter_iterator.h
+++ b/db/prefix_filter_iterator.h
@@ -0,0 +1,76 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Wrap an underlying iterator, but exclude any results not starting
+// with a given prefix.  Seeking to keys not beginning with the prefix
+// is invalid, and SeekToLast is not implemented (that would be
+// non-trivial), but otherwise this iterator will behave just like the
+// underlying iterator would if there happened to be no non-matching
+// keys in the dataset.
+
+#ifndef STORAGE_LEVELDB_DB_PREFIX_FILTER_ITERATOR_H_
+#define STORAGE_LEVELDB_DB_PREFIX_FILTER_ITERATOR_H_
+
+#include "leveldb/iterator.h"
+
+namespace leveldb {
+
+// Iterator adaptor that reports itself as valid only while the wrapped
+// iterator is positioned on a key whose extracted prefix equals the target
+// prefix.  Takes ownership of the wrapped iterator and deletes it on
+// destruction.
+//
+// Errors recorded by this wrapper (bad constructor arguments, a Seek outside
+// the prefix, SeekToLast) are kept in status_ and are never cleared; once
+// one is set, Valid() stays false and status() keeps returning it.
+class PrefixFilterIterator : public Iterator {
+ private:
+  Iterator* iter_;  // wrapped iterator; owned, deleted in the destructor
+  // Held by value so the wrapper does not depend on the lifetime of the
+  // caller's Slice object.
+  // NOTE(review): the bytes the Slice points at presumably still have to
+  // outlive this iterator - confirm against Slice semantics.
+  const Slice prefix_;
+  const SliceTransform *prefix_extractor_;  // not owned; may be nullptr
+  Status status_;  // first error raised by the wrapper itself, else OK
+
+ public:
+  // iter:             iterator to wrap; ownership passes to this object.
+  // prefix:           only keys whose extracted prefix equals this are
+  //                   exposed.
+  // prefix_extractor: maps a key to its prefix.  If it is nullptr, or if it
+  //                   reports prefix as out of range, the iterator is
+  //                   constructed in an InvalidArgument state.
+  PrefixFilterIterator(Iterator* iter,
+                       const Slice &prefix,
+                       const SliceTransform* prefix_extractor)
+                             : iter_(iter), prefix_(prefix),
+                               prefix_extractor_(prefix_extractor),
+                               status_(Status::OK()) {
+    if (prefix_extractor == nullptr) {
+      status_ = Status::InvalidArgument("A prefix filter may not be used "
+                                        "unless a function is also defined "
+                                        "for extracting prefixes");
+    } else if (!prefix_extractor_->InRange(prefix)) {
+      status_ = Status::InvalidArgument("Must provide a slice for prefix which "
+                                        "is a prefix for some key");
+    }
+  }
+  ~PrefixFilterIterator() {
+    delete iter_;
+  }
+  Slice key() const { return iter_->key(); }
+  Slice value() const { return iter_->value(); }
+  // The wrapper's own error takes precedence over the wrapped iterator's
+  // status.
+  Status status() const {
+    if (!status_.ok()) {
+      return status_;
+    }
+    return iter_->status();
+  }
+  void Next() { iter_->Next(); }
+  void Prev() { iter_->Prev(); }
+  // Positions the wrapped iterator at k, provided k carries the target
+  // prefix; otherwise records InvalidArgument and leaves the wrapped
+  // iterator untouched.
+  void Seek(const Slice& k) {
+    if (prefix_extractor_ == nullptr) {
+      // The constructor already recorded InvalidArgument in status_; there
+      // is no extractor to call, so do nothing instead of dereferencing
+      // a null pointer.
+      return;
+    }
+    if (prefix_extractor_->Transform(k) == prefix_) {
+      iter_->Seek(k);
+    } else {
+      status_ = Status::InvalidArgument("Seek must begin with target prefix");
+    }
+  }
+  // Seeks to the prefix itself, i.e. the smallest possible key that can
+  // carry it.
+  void SeekToFirst() {
+    Seek(prefix_);
+  }
+  void SeekToLast() {
+    status_ = Status::NotSupported("SeekToLast is incompatible with prefixes");
+  }
+  // status_ is checked first, so prefix_extractor_ is never dereferenced
+  // when the constructor rejected it.
+  bool Valid() const {
+    return (status_.ok() && iter_->Valid() &&
+            prefix_extractor_->Transform(iter_->key()) == prefix_);
+  }
+};
+
+}  // namespace leveldb
+
+#endif