From c168d54495d7d7b84639514f6443ad99b89ce996 Mon Sep 17 00:00:00 2001 From: Vinnie Falco Date: Mon, 27 Oct 2014 11:36:32 -0700 Subject: [PATCH] Squashed 'src/rocksdb2/' changes from 25888ae..1fdd726 1fdd726 Hotfix RocksDB 3.5 d67500a Add `make install` to Makefile in 3.5.fb. 4cb631a update HISTORY.md cfd0946 comments about the BlockBasedTableOptions migration in Options REVERT: 25888ae Merge pull request #329 from fyrz/master REVERT: 89833e5 Fixed signed-unsigned comparison warning in db_test.cc REVERT: fcac705 Fixed compile warning on Mac caused by unused variables. REVERT: b3343fd resolution for java build problem introduced by 5ec53f3edf62bec1b690ce12fb21a6c52203f3c8 REVERT: 187b299 ForwardIterator: update prev_key_ only if prefix hasn't changed REVERT: 5ec53f3 make compaction related options changeable REVERT: d122e7b Update INSTALL.md REVERT: 986dad0 Merge pull request #324 from dalgaaf/wip-da-SCA-20140930 REVERT: 8ee75dc db/memtable.cc: remove unused variable merge_result REVERT: 0fd8bbc db/db_impl.cc: reduce scope of prefix_initialized REVERT: 676ff7b compaction_picker.cc: remove check for >=0 for unsigned REVERT: e55aea5 document_db.cc: fix assert REVERT: d517c83 in_table_factory.cc: use correct format specifier REVERT: b140375 ttl/ttl_test.cc: prefer prefix ++operator for non-primitive types REVERT: 43c789c spatialdb/spatial_db.cc: use !empty() instead of 'size() > 0' REVERT: 0de452e document_db.cc: pass const parameter by reference REVERT: 4cc8643 util/ldb_cmd.cc: prefer prefix ++operator for non-primitive types REVERT: af8c2b2 util/signal_test.cc: suppress intentional null pointer deref REVERT: 33580fa db/db_impl.cc: fix object handling, remove double lines REVERT: 873f135 db_ttl_impl.h: pass func parameter by reference REVERT: 8558457 ldb_cmd_execute_result.h: perform init in initialization list REVERT: 063471b table/table_test.cc: pass func parameter by reference REVERT: 93548ce table/cuckoo_table_reader.cc: pass func parameter by ref REVERT: b8b7117 
db/version_set.cc: use !empty() instead of 'size() > 0' REVERT: 8ce050b table/bloom_block.*: pass func parameter by reference REVERT: 53910dd db_test.cc: pass parameter by reference REVERT: 68ca534 corruption_test.cc: pass parameter by reference REVERT: 7506198 cuckoo_table_db_test.cc: add flush after delete REVERT: 1f96330 Print MB per second compaction throughput separately for reads and writes REVERT: ffe3d49 Add an instruction about SSE in INSTALL.md REVERT: ee1f3cc Package generation for Ubuntu and CentOS REVERT: f0f7955 Fixing comile errors on OS X REVERT: 99fb613 remove 2 space linter REVERT: b2d64a4 Fix linters, second try REVERT: 747523d Print per column family metrics in db_bench REVERT: 56ebd40 Fix arc lint (should fix #238) REVERT: 637f891 Merge pull request #321 from eonnen/master REVERT: 827e31c Make test use a compatible type in the size checks. REVERT: fd5d80d CompactedDB: log using the correct info_log REVERT: 2faf49d use GetContext to replace callback function pointer REVERT: 983d2de Add AUTHORS file. 
Fix #203 REVERT: abd70c5 Merge pull request #316 from fyrz/ReverseBytewiseComparator REVERT: 2dc6f62 handle kDelete type in cuckoo builder REVERT: 8b8011a Changed name of ReverseBytewiseComparator based on review comment REVERT: 389edb6 universal compaction picker: use double for potential overflow REVERT: 5340484 Built-in comparator(s) in RocksJava REVERT: d439451 delay initialization of cuckoo table iterator REVERT: 94997ea reduce memory usage of cuckoo table builder REVERT: c627595 improve memory efficiency of cuckoo reader REVERT: 581442d option to choose module when calculating CuckooTable hash REVERT: fbd2daf CompactedDBImpl::MultiGet() for better CuckooTable performance REVERT: 3c68006 CompactedDBImpl REVERT: f7375f3 Fix double deletes REVERT: 21ddcf6 Remove allow_thread_local REVERT: fb4a492 Merge pull request #311 from ankgup87/master REVERT: 611e286 Merge branch 'master' of https://github.com/facebook/rocksdb REVERT: 0103b44 Merge branch 'master' of ssh://github.com/ankgup87/rocksdb REVERT: 1dfb7bb Add block based table config options REVERT: cdaf44f Enlarge log size cap when printing file summary REVERT: 7cc1ed7 Merge pull request #309 from naveenatceg/staticbuild REVERT: ba6d660 Resolving merge conflict REVERT: 51eeaf6 Addressing review comments REVERT: fd7d3fe Addressing review comments (adding a env variable to override temp directory) REVERT: cf7ace8 Addressing review comments REVERT: 0a29ce5 re-enable BlockBasedTable::SetupForCompaction() REVERT: 55af370 Remove TODO for checking index checksums REVERT: 3d74f09 Fix compile REVERT: 53b0039 Fix release compile REVERT: d0de413 WriteBatchWithIndex to allow different Comparators for different column families REVERT: 57a32f1 change target_file_size_base to uint64_t REVERT: 5e6aee4 dont create backup_input if compaction filter v2 is not used REVERT: 49b5f94 Merge pull request #306 from Liuchang0812/fix_cast REVERT: 787cb4d remove cast, replace %llu with % PRIu64 REVERT: a7574d4 Update logging.cc REVERT: 
7e0dcb9 Update logging.cc REVERT: 57fa3cc Merge pull request #304 from Liuchang0812/fix-check REVERT: cd44522 Merge pull request #305 from Liuchang0812/fix-logging REVERT: 6a031b6 remove unused variable REVERT: 4436f17 fixed #303: replace %ld with % PRId64 REVERT: 7a1bd05 Merge pull request #302 from ankgup87/master REVERT: 423e52c Merge branch 'master' of https://github.com/facebook/rocksdb REVERT: bfeef94 Add rate limiter REVERT: 32f2532 Print compression_size_percent as a signed int REVERT: 976caca Skip AllocateTest if fallocate() is not supported in the file system REVERT: 3b897cd Enable no-fbcode RocksDB build REVERT: f445947 RocksDB: Format uint64 using PRIu64 in db_impl.cc REVERT: e17bc65 Merge pull request #299 from ankgup87/master REVERT: b93797a Fix build REVERT: adae3ca [Java] Fix JNI link error caused by the removal of options.db_stats_log_interval REVERT: 90b8c07 Fix unit tests errors REVERT: 51af7c3 CuckooTable: add one option to allow identity function for the first hash function REVERT: 0350435 Fixed a signed-unsigned comparison in spatial_db.cc -- issue #293 REVERT: 2fb1fea Fix syncronization issues REVERT: ff76895 Remove some unnecessary constructors REVERT: feadb9d fix cuckoo table builder test REVERT: 3c232e1 Fix mac compile REVERT: 54cada9 Run make format on PR #249 REVERT: 27b22f1 Merge pull request #249 from tdfischer/decompression-refactoring REVERT: fb6456b Replace naked calls to operator new and delete (Fixes #222) REVERT: 5600c8f cuckoo table: return estimated size - 1 REVERT: a062e1f SetOptions() for memtable related options REVERT: e4eca6a Options conversion function for convenience REVERT: a7c2094 Merge pull request #292 from saghmrossi/master REVERT: 4d05234 Merge branch 'master' of github.com:saghmrossi/rocksdb REVERT: 60a4aa1 Test use_mmap_reads REVERT: 94e43a1 [Java] Fixed 32-bit overflowing issue when converting jlong to size_t REVERT: f9eaaa6 added include for inttypes.h to fix nonworking printf statements REVERT: f090575 
Replaced "built on on earlier work" by "built on earlier work" in README.md REVERT: faad439 Fix #284 REVERT: 49aacd8 Fix make install REVERT: acb9348 [Java] Include WriteBatch into RocksDBSample.java, fix how DbBenchmark.java handles WriteBatch. REVERT: 4a27a2f Don't sync manifest when disableDataSync = true REVERT: 9b8480d Merge pull request #287 from yinqiwen/rate-limiter-crash-fix REVERT: 28be16b fix rate limiter crash #286 REVERT: 04ce1b2 Fix #284 REVERT: add22e3 standardize scripts to run RocksDB benchmarks REVERT: dee91c2 WriteThread REVERT: 540a257 Fix WAL synced REVERT: 24f034b Merge pull request #282 from Chilledheart/develop REVERT: 49fe329 Fix build issue under macosx REVERT: ebb5c65 Add make install REVERT: 0352a9f add_wrapped_bloom_test REVERT: 9c0e66c Don't run background jobs (flush, compactions) when bg_error_ is set REVERT: a9639bd Fix valgrind test REVERT: d1f24dc Relax FlushSchedule test REVERT: 3d9e6f7 Push model for flushing memtables REVERT: 059e584 [unit test] CompactRange should fail if we don't have space REVERT: dd641b2 fix RocksDB java build REVERT: 53404d9 add_qps_info_in cache bench REVERT: a52cecb Fix Mac compile REVERT: 092f97e Fix comments and typos REVERT: 6cc1286 Added a few statistics for BackupableDB REVERT: 0a42295 Fix SimpleWriteTimeoutTest REVERT: 06d9862 Always pass MergeContext as pointer, not reference REVERT: d343c3f Improve db recovery REVERT: 6bb7e3e Merger test REVERT: 88841bd Explicitly cast char to signed char in Hash() REVERT: 5231146 MemTableOptions REVERT: 1d284db Addressing review comments REVERT: 55114e7 Some updates for SpatialDB REVERT: 171d4ff remove TailingIterator reference in db_impl.h REVERT: 9b0f7ff rename version_set options_ to db_options_ to avoid confusion REVERT: 2d57828 Check stop level trigger-0 before slowdown level-0 trigger REVERT: 659d2d5 move compaction_filter to immutable_options REVERT: 048560a reduce references to cfd->options() in DBImpl REVERT: 011241b DB::Flush() Do not wait for 
background threads when there is nothing in mem table REVERT: a2bb7c3 Push- instead of pull-model for managing Write stalls REVERT: 0af157f Implement full filter for block based table. REVERT: 9360cc6 Fix valgrind issue REVERT: 02d5bff Merge pull request #277 from wankai/master REVERT: 88a2f44 fix comments REVERT: 7c16e39 Merge pull request #276 from wankai/master REVERT: 8237738 replace hard-coded number with named variable REVERT: db8ca52 Merge pull request #273 from nbougalis/static-analysis REVERT: b7b031f Merge pull request #274 from wankai/master REVERT: 4c2b1f0 Merge remote-tracking branch 'upstream/master' REVERT: a5d2863 typo improvement REVERT: 9f8aa09 Don't leak data returned by opendir REVERT: d1cfb71 Remove unused member(s) REVERT: bfee319 sizeof(int*) where sizeof(int) was intended REVERT: d40c1f7 Add missing break statement REVERT: 2e97c38 Avoid off-by-one error when using readlink REVERT: 40ddc3d add cache bench REVERT: 9f1c80b Drop column family from write thread REVERT: 8de151b Add db_bench with lots of column families to regression tests REVERT: c9e419c rename options_ to db_options_ in DBImpl to avoid confusion REVERT: 5cd0576 Fix compaction bug in Cuckoo Table Builder. Use kvs_.size() instead of num_entries in FileSize() method. 
REVERT: 0fbb3fa fixed memory leak in unit test DBIteratorBoundTest REVERT: adcd253 fix asan check REVERT: 4092b7a Merge pull request #272 from project-zerus/patch-1 REVERT: bb6ae0f fix more compile warnings REVERT: 6d31441 Merge pull request #271 from nbougalis/cleanups REVERT: 0cd0ec4 Plug memory leak during index creation REVERT: 4329d74 Fix swapped variable names to accurately reflect usage REVERT: 45a5e3e Remove path with arena==nullptr from NewInternalIterator REVERT: 5665e5e introduce ImmutableOptions REVERT: e0b99d4 created a new ReadOptions parameter 'iterate_upper_bound' REVERT: 51ea889 Fix travis builds REVERT: a481626 Relax backupable rate limiting test REVERT: f7f973d Merge pull request #269 from huahang/patch-2 REVERT: ef5b384 fix a few compile warnings REVERT: 2fd3806 Merge pull request #263 from wankai/master REVERT: 1785114 delete unused Comparator REVERT: 1b1d961 update HISTORY.md REVERT: 703c3ea comments about the BlockBasedTableOptions migration in Options REVERT: 4b5ad88 Merge pull request #260 from wankai/master REVERT: 19cc588 change to filter_block std::unique_ptr support RAII REVERT: 9b976e3 Merge pull request #259 from wankai/master REVERT: 5d25a46 Merge remote-tracking branch 'upstream/master' REVERT: dff2b1a typo improvement REVERT: 343e98a Reverting import change REVERT: ddb8039 RocksDB static build Make file changes to download and build the dependencies .Load the shared library when RocksDB is initialized git-subtree-dir: src/rocksdb2 git-subtree-split: 1fdd726a8254c13d0c66d8db8130ad17c13d7bcc --- .gitignore | 2 - .travis.yml | 3 +- AUTHORS | 11 - HISTORY.md | 14 +- INSTALL.md | 6 +- Makefile | 78 +- README.md | 2 +- Vagrantfile | 16 - build_tools/build_detect_platform | 2 +- build_tools/make_package.sh | 116 -- build_tools/regression_build_test.sh | 34 - db/builder.cc | 40 +- db/builder.h | 11 +- db/c.cc | 18 +- db/column_family.cc | 266 +-- db/column_family.h | 115 +- db/compaction.cc | 20 +- db/compaction.h | 8 +- 
db/compaction_picker.cc | 257 +-- db/compaction_picker.h | 104 +- db/corruption_test.cc | 6 +- db/cuckoo_table_db_test.cc | 27 +- db/db_bench.cc | 147 +- db/db_filesnapshot.cc | 23 +- db/db_impl.cc | 1529 ++++++++++------- db/db_impl.h | 119 +- db/db_impl_debug.cc | 27 +- db/db_impl_readonly.cc | 90 +- db/db_impl_readonly.h | 13 + db/db_iter.cc | 133 +- db/db_iter.h | 12 +- db/db_iter_test.cc | 168 +- db/db_test.cc | 1165 ++----------- db/dbformat.h | 2 +- db/deletefile_test.cc | 1 - db/filename.cc | 3 - db/flush_scheduler.cc | 62 - db/flush_scheduler.h | 39 - db/forward_iterator.cc | 55 +- db/forward_iterator.h | 3 - db/internal_stats.cc | 4 - db/log_and_apply_bench.cc | 11 +- db/memtable.cc | 108 +- db/memtable.h | 54 +- db/memtable_list.cc | 15 +- db/memtable_list.h | 7 +- db/plain_table_db_test.cc | 22 +- db/repair.cc | 40 +- db/simple_table_db_test.cc | 810 +++++++++ db/snapshot.h | 2 +- db/table_cache.cc | 54 +- db/table_cache.h | 18 +- db/table_properties_collector_test.cc | 13 +- db/version_edit.h | 6 +- db/version_set.cc | 359 ++-- db/version_set.h | 52 +- db/write_batch.cc | 45 +- db/write_batch_internal.h | 3 - db/write_batch_test.cc | 14 +- db/write_controller.cc | 37 - db/write_controller.h | 78 - db/write_controller_test.cc | 40 - db/write_thread.cc | 147 -- db/write_thread.h | 80 - include/rocksdb/c.h | 6 +- include/rocksdb/cache.h | 3 + include/rocksdb/comparator.h | 4 - include/rocksdb/db.h | 10 +- include/rocksdb/filter_policy.h | 64 +- include/rocksdb/immutable_options.h | 87 - include/rocksdb/options.h | 62 +- include/rocksdb/statistics.h | 4 +- include/rocksdb/table.h | 74 +- include/rocksdb/utilities/backupable_db.h | 39 +- .../utilities/write_batch_with_index.h | 15 +- include/rocksdb/version.h | 2 +- java/Makefile | 2 +- java/RocksDBSample.java | 47 +- java/org/rocksdb/BlockBasedTableConfig.java | 139 +- .../org/rocksdb/GenericRateLimiterConfig.java | 36 - java/org/rocksdb/NativeLibraryLoader.java | 58 - java/org/rocksdb/Options.java | 102 
+- java/org/rocksdb/RateLimiterConfig.java | 20 - java/org/rocksdb/RocksDB.java | 41 +- java/org/rocksdb/benchmark/DbBenchmark.java | 2 +- java/org/rocksdb/test/OptionsTest.java | 12 + java/rocksjni/memtablejni.cc | 7 +- java/rocksjni/options.cc | 70 +- java/rocksjni/portal.h | 7 - java/rocksjni/ratelimiterjni.cc | 24 - java/rocksjni/rocksjni.cc | 24 - java/rocksjni/table.cc | 24 +- java/rocksjni/write_batch.cc | 14 +- .../lint_engine/FacebookFbcodeLintEngine.php | 13 +- port/stack_trace.cc | 2 +- table/adaptive_table_factory.cc | 18 +- table/adaptive_table_factory.h | 24 +- table/block.cc | 16 +- table/block.h | 22 +- table/block_based_filter_block.h | 101 -- table/block_based_filter_block_test.cc | 242 --- table/block_based_table_builder.cc | 117 +- table/block_based_table_builder.h | 6 +- table/block_based_table_factory.cc | 13 +- table/block_based_table_factory.h | 17 +- table/block_based_table_reader.cc | 374 ++-- table/block_based_table_reader.h | 19 +- table/block_builder.h | 6 +- table/block_prefix_index.cc | 4 +- table/block_prefix_index.h | 2 +- table/block_test.cc | 14 +- table/bloom_block.cc | 2 +- table/bloom_block.h | 2 +- table/cuckoo_table_builder.cc | 190 +- table/cuckoo_table_builder.h | 23 +- table/cuckoo_table_builder_test.cc | 78 +- table/cuckoo_table_factory.cc | 35 +- table/cuckoo_table_factory.h | 37 +- table/cuckoo_table_reader.cc | 215 +-- table/cuckoo_table_reader.h | 16 +- table/cuckoo_table_reader_test.cc | 143 +- ..._based_filter_block.cc => filter_block.cc} | 144 +- table/filter_block.h | 78 +- table/filter_block_test.cc | 139 ++ table/format.cc | 188 +- table/format.h | 26 +- table/full_filter_block.cc | 103 -- table/full_filter_block.h | 111 -- table/full_filter_block_test.cc | 182 -- table/get_context.cc | 101 -- table/get_context.h | 47 - table/merger_test.cc | 197 --- table/meta_blocks.cc | 40 +- table/plain_table_builder.cc | 30 +- table/plain_table_builder.h | 4 +- table/plain_table_factory.cc | 18 +- 
table/plain_table_factory.h | 23 +- table/plain_table_index.cc | 8 +- table/plain_table_index.h | 8 +- table/plain_table_key_coding.cc | 12 +- table/plain_table_key_coding.h | 4 +- table/plain_table_reader.cc | 61 +- table/plain_table_reader.h | 17 +- table/table_reader.h | 21 +- table/table_reader_bench.cc | 29 +- table/table_test.cc | 168 +- tools/auto_sanity_test.sh | 10 - tools/benchmark.sh | 205 --- tools/db_sanity_test.cc | 37 +- tools/db_stress.cc | 9 +- tools/reduce_levels_test.cc | 1 - tools/run_flash_bench.sh | 45 - tools/sst_dump.cc | 6 +- util/bloom.cc | 305 +--- util/bloom_test.cc | 147 +- util/cache_bench.cc | 278 --- util/comparator.cc | 23 +- util/db_info_dummper.cc | 3 - util/dynamic_bloom_test.cc | 3 - util/env_test.cc | 73 +- util/hash.cc | 22 +- util/hash_cuckoo_rep.cc | 2 +- util/histogram.cc | 2 +- util/histogram.h | 6 +- util/ldb_cmd.cc | 17 +- util/ldb_cmd_execute_result.h | 11 +- util/log_buffer.cc | 21 +- util/log_buffer.h | 9 +- util/logging.cc | 5 +- util/logging.h | 1 + util/mutable_cf_options.cc | 72 - util/mutable_cf_options.h | 102 -- util/options.cc | 70 +- util/options_helper.cc | 328 ---- util/options_helper.h | 18 - util/options_test.cc | 176 -- util/rate_limiter.cc | 2 +- util/rate_limiter_test.cc | 3 - util/scoped_arena_iterator.h | 28 - util/signal_test.cc | 1 - util/statistics.cc | 3 - util/testutil.cc | 9 - util/testutil.h | 2 - utilities/backupable/backupable_db.cc | 53 +- utilities/backupable/backupable_db_test.cc | 4 +- utilities/compacted_db/compacted_db_impl.cc | 156 -- utilities/compacted_db/compacted_db_impl.h | 96 -- utilities/document/document_db.cc | 5 +- utilities/document/json_document.cc | 3 - utilities/geodb/geodb_impl.cc | 2 - utilities/spatialdb/spatial_db.cc | 23 +- utilities/ttl/db_ttl_impl.h | 2 +- utilities/ttl/ttl_test.cc | 8 +- .../write_batch_with_index.cc | 79 +- .../write_batch_with_index_test.cc | 114 +- 195 files changed, 4433 insertions(+), 9442 deletions(-) delete mode 100644 AUTHORS delete mode 
100644 Vagrantfile delete mode 100755 build_tools/make_package.sh delete mode 100644 db/flush_scheduler.cc delete mode 100644 db/flush_scheduler.h create mode 100644 db/simple_table_db_test.cc delete mode 100644 db/write_controller.cc delete mode 100644 db/write_controller.h delete mode 100644 db/write_controller_test.cc delete mode 100644 db/write_thread.cc delete mode 100644 db/write_thread.h delete mode 100644 include/rocksdb/immutable_options.h delete mode 100644 java/org/rocksdb/GenericRateLimiterConfig.java delete mode 100644 java/org/rocksdb/NativeLibraryLoader.java delete mode 100644 java/org/rocksdb/RateLimiterConfig.java delete mode 100644 java/rocksjni/ratelimiterjni.cc delete mode 100644 table/block_based_filter_block.h delete mode 100644 table/block_based_filter_block_test.cc rename table/{block_based_filter_block.cc => filter_block.cc} (54%) create mode 100644 table/filter_block_test.cc delete mode 100644 table/full_filter_block.cc delete mode 100644 table/full_filter_block.h delete mode 100644 table/full_filter_block_test.cc delete mode 100644 table/get_context.cc delete mode 100644 table/get_context.h delete mode 100644 table/merger_test.cc delete mode 100755 tools/benchmark.sh delete mode 100755 tools/run_flash_bench.sh delete mode 100644 util/cache_bench.cc delete mode 100644 util/mutable_cf_options.cc delete mode 100644 util/mutable_cf_options.h delete mode 100644 util/options_helper.cc delete mode 100644 util/options_helper.h delete mode 100644 util/scoped_arena_iterator.h delete mode 100644 utilities/compacted_db/compacted_db_impl.cc delete mode 100644 utilities/compacted_db/compacted_db_impl.h diff --git a/.gitignore b/.gitignore index cbb817f61a..99a7d61d61 100644 --- a/.gitignore +++ b/.gitignore @@ -28,10 +28,8 @@ util/build_version.cc build_tools/VALGRIND_LOGS/ coverage/COVERAGE_REPORT .gdbhistory -package/ .phutil_module_cache tags java/*.log java/include/org_rocksdb_*.h unity.cc -.vagrant/ diff --git a/.travis.yml b/.travis.yml index 
bcb852cf04..66f37a5d28 100644 --- a/.travis.yml +++ b/.travis.yml @@ -14,6 +14,7 @@ before_install: - sudo dpkg -i libgflags-dev_2.0-1_amd64.deb # Lousy hack to disable use and testing of fallocate, which doesn't behave quite # as EnvPosixTest::AllocateTest expects within the Travis OpenVZ environment. -script: OPT=-DTRAVIS make check -j8 + - sed -i "s/fallocate(/HACK_NO_fallocate(/" build_tools/build_detect_platform +script: make check -j8 notifications: email: false diff --git a/AUTHORS b/AUTHORS deleted file mode 100644 index e644f5530f..0000000000 --- a/AUTHORS +++ /dev/null @@ -1,11 +0,0 @@ -Facebook Inc. -Facebook Engineering Team - -Google Inc. -# Initial version authors: -Jeffrey Dean -Sanjay Ghemawat - -# Partial list of contributors: -Kevin Regan -Johan Bilien diff --git a/HISTORY.md b/HISTORY.md index 41c49cc1ad..c6c566ede2 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,17 +1,6 @@ # Rocksdb Change Log -## Unreleased (will be released with 3.6) -### Disk format changes -* If you're using RocksDB on ARM platforms and you're using default bloom filter, there is a disk format change you need to be aware of. There are three steps you need to do when you convert to new release: 1. turn off filter policy, 2. compact the whole database, 3. turn on filter policy - -### Behavior changes -* We have refactored our system of stalling writes. Any stall-related statistics' meanings are changed. Instead of per-write stall counts, we now count stalls per-epoch, where epochs are periods between flushes and compactions. You'll find more information in our Tuning Perf Guide once we release RocksDB 3.6. -* When disableDataSync=true, we no longer sync the MANIFEST file. -* Add identity_as_first_hash property to CuckooTable. SST file needs to be rebuilt to be opened by reader properly. - -### Public API changes -* Change target_file_size_base type to uint64_t from int. -* Remove allow_thread_local. This feature was proved to be stable, so we are turning it always-on. 
+### Unreleased ----- Past Releases ----- @@ -31,7 +20,6 @@ * Support Multiple DB paths in universal style compactions * Add feature of storing plain table index and bloom filter in SST file. * CompactRange() will never output compacted files to level 0. This used to be the case when all the compaction input files were at level 0. -* Added iterate_upper_bound to define the extent upto which the forward iterator will return entries. This will prevent iterating over delete markers and overwritten entries for edge cases where you want to break out the iterator anyways. This may improve perfomance in case there are a large number of delete markers or overwritten entries. ### Public API changes * DBOptions.db_paths now is a vector of a DBPath structure which indicates both of path and target size diff --git a/INSTALL.md b/INSTALL.md index 21e8d26f05..8cf66e6ab2 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -15,10 +15,6 @@ There are few options when compiling RocksDB: * `make all` will compile our static library, and all our tools and unit tests. Our tools depend on gflags. You will need to have gflags installed to run `make all`. -* if Intel SSE instruction set is supported, set USE_SSE=" -msse -msse4.2 " to make sure -SSE4.2 is used to speed up CRC32 when calculating data checksum. - - ## Dependencies * You can link RocksDB with following compression libraries: @@ -85,4 +81,4 @@ SSE4.2 is used to speed up CRC32 when calculating data checksum. We did not run any production workloads on it. * **iOS**: - * Run: `TARGET_OS=IOS make static_lib`. When building the project which uses rocksdb iOS library, make sure to define two important pre-processing macros: `ROCKSDB_LITE` and `IOS_CROSS_COMPILE`. 
+ * Run: `TARGET_OS=IOS make static_lib` diff --git a/Makefile b/Makefile index 4deb8fc5ff..0e969e0fdd 100644 --- a/Makefile +++ b/Makefile @@ -110,18 +110,17 @@ TESTS = \ blob_store_test \ filelock_test \ filename_test \ - block_based_filter_block_test \ - full_filter_block_test \ + filter_block_test \ histogram_test \ log_test \ manual_compaction_test \ memenv_test \ merge_test \ - merger_test \ redis_test \ reduce_levels_test \ plain_table_db_test \ prefix_test \ + simple_table_db_test \ skiplist_test \ stringappend_test \ ttl_test \ @@ -132,8 +131,7 @@ TESTS = \ version_edit_test \ version_set_test \ file_indexer_test \ - write_batch_test \ - write_controller_test\ + write_batch_test\ deletefile_test \ table_test \ thread_local_test \ @@ -154,7 +152,7 @@ TOOLS = \ options_test \ blob_store_bench -PROGRAMS = db_bench signal_test table_reader_bench log_and_apply_bench cache_bench $(TOOLS) +PROGRAMS = db_bench signal_test table_reader_bench log_and_apply_bench $(TOOLS) # The library name is configurable since we are maintaining libraries of both # debug/release mode. @@ -164,9 +162,6 @@ endif LIBRARY = ${LIBNAME}.a MEMENVLIBRARY = libmemenv.a -ROCKSDB_MAJOR = 3 -ROCKSDB_MINOR = 4 - default: all #----------------------------------------------- @@ -181,8 +176,8 @@ SHARED3 = $(SHARED1) SHARED = $(SHARED1) else # Update db.h if you change these. 
-SHARED_MAJOR = $(ROCKSDB_MAJOR) -SHARED_MINOR = $(ROCKSDB_MINOR) +SHARED_MAJOR = 3 +SHARED_MINOR = 5 SHARED1 = ${LIBNAME}.$(PLATFORM_SHARED_EXT) SHARED2 = $(SHARED1).$(SHARED_MAJOR) SHARED3 = $(SHARED1).$(SHARED_MAJOR).$(SHARED_MINOR) @@ -198,9 +193,9 @@ $(SHARED3): endif # PLATFORM_SHARED_EXT -.PHONY: blackbox_crash_test check clean coverage crash_test ldb_tests package \ +.PHONY: blackbox_crash_test check clean coverage crash_test ldb_tests \ release tags valgrind_check whitebox_crash_test format static_lib shared_lib all \ - dbg rocksdbjavastatic rocksdbjava install uninstall + dbg all: $(LIBRARY) $(PROGRAMS) $(TESTS) @@ -270,7 +265,7 @@ unity: unity.cc unity.o clean: -rm -f $(PROGRAMS) $(TESTS) $(LIBRARY) $(SHARED) $(MEMENVLIBRARY) build_config.mk unity.cc -rm -rf ios-x86/* ios-arm/* - -find . -name "*.[oda]" -exec rm {} \; + -find . -name "*.[od]" -exec rm {} \; -find . -type f -regex ".*\.\(\(gcda\)\|\(gcno\)\)" -exec rm {} \; tags: ctags * -R @@ -279,9 +274,6 @@ tags: format: build_tools/format-diff.sh -package: - bash build_tools/make_package.sh $(SHARED_MAJOR).$(SHARED_MINOR) - # --------------------------------------------------------------------------- # Unit tests and tools # --------------------------------------------------------------------------- @@ -292,9 +284,6 @@ $(LIBRARY): $(LIBOBJECTS) db_bench: db/db_bench.o $(LIBOBJECTS) $(TESTUTIL) $(CXX) db/db_bench.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) -cache_bench: util/cache_bench.o $(LIBOBJECTS) $(TESTUTIL) - $(CXX) util/cache_bench.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) - block_hash_index_test: table/block_hash_index_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) table/block_hash_index_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) @@ -376,6 +365,9 @@ log_write_bench: util/log_write_bench.o $(LIBOBJECTS) $(TESTHARNESS) plain_table_db_test: db/plain_table_db_test.o $(LIBOBJECTS) 
$(TESTHARNESS) $(CXX) db/plain_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) +simple_table_db_test: db/simple_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) db/simple_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + table_reader_bench: table/table_reader_bench.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) table/table_reader_bench.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) -pg @@ -418,11 +410,8 @@ rate_limiter_test: util/rate_limiter_test.o $(LIBOBJECTS) $(TESTHARNESS) filename_test: db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) -block_based_filter_block_test: table/block_based_filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CXX) table/block_based_filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) - -full_filter_block_test: table/full_filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CXX) table/full_filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) +filter_block_test: table/filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) table/filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) log_test: db/log_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) db/log_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) @@ -451,15 +440,9 @@ reduce_levels_test: tools/reduce_levels_test.o $(LIBOBJECTS) $(TESTHARNESS) write_batch_test: db/write_batch_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) db/write_batch_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) -write_controller_test: db/write_controller_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CXX) db/write_controller_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) 
$(COVERAGEFLAGS) - merge_test: db/merge_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) db/merge_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) -merger_test: table/merger_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CXX) table/merger_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) - deletefile_test: db/deletefile_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) db/deletefile_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) @@ -520,37 +503,6 @@ ROCKSDBJNILIB = librocksdbjni.jnilib JAVA_INCLUDE = -I/System/Library/Frameworks/JavaVM.framework/Headers/ endif -libz.a: - -rm -rf zlib-1.2.8 - curl -O http://zlib.net/zlib-1.2.8.tar.gz - tar xvzf zlib-1.2.8.tar.gz - cd zlib-1.2.8 && CFLAGS='-fPIC' ./configure --static && make - cp zlib-1.2.8/libz.a . - -libbz2.a: - -rm -rf bzip2-1.0.6 - curl -O http://www.bzip.org/1.0.6/bzip2-1.0.6.tar.gz - tar xvzf bzip2-1.0.6.tar.gz - cd bzip2-1.0.6 && make CFLAGS='-fPIC -Wall -Winline -O2 -g -D_FILE_OFFSET_BITS=64' - cp bzip2-1.0.6/libbz2.a . - -libsnappy.a: - -rm -rf snappy-1.1.1 - curl -O https://snappy.googlecode.com/files/snappy-1.1.1.tar.gz - tar xvzf snappy-1.1.1.tar.gz - cd snappy-1.1.1 && ./configure --with-pic --enable-static - cd snappy-1.1.1 && make - cp snappy-1.1.1/.libs/libsnappy.a . - - -rocksdbjavastatic: libz.a libbz2.a libsnappy.a - OPT="-fPIC -DNDEBUG -O2" $(MAKE) $(LIBRARY) -j - cd java;$(MAKE) java; - rm -f ./java/$(ROCKSDBJNILIB) - $(CXX) $(CXXFLAGS) -I./java/. 
$(JAVA_INCLUDE) -shared -fPIC -o ./java/$(ROCKSDBJNILIB) $(JNI_NATIVE_SOURCES) $(LIBOBJECTS) $(COVERAGEFLAGS) libz.a libbz2.a libsnappy.a - cd java;jar -cf $(ROCKSDB_JAR) org/rocksdb/*.class org/rocksdb/util/*.class HISTORY*.md $(ROCKSDBJNILIB) - - rocksdbjava: OPT="-fPIC -DNDEBUG -O2" $(MAKE) $(LIBRARY) -j32 cd java;$(MAKE) java; @@ -633,10 +585,8 @@ ifneq ($(MAKECMDGOALS),clean) ifneq ($(MAKECMDGOALS),format) ifneq ($(MAKECMDGOALS),jclean) ifneq ($(MAKECMDGOALS),jtest) -ifneq ($(MAKECMDGOALS),package) -include $(DEPFILES) endif endif endif endif -endif diff --git a/README.md b/README.md index 916bdecdee..bda801fd77 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ [![Build Status](https://travis-ci.org/facebook/rocksdb.svg?branch=master)](https://travis-ci.org/facebook/rocksdb) RocksDB is developed and maintained by Facebook Database Engineering Team. -It is built on earlier work on LevelDB by Sanjay Ghemawat (sanjay@google.com) +It is built on on earlier work on LevelDB by Sanjay Ghemawat (sanjay@google.com) and Jeff Dean (jeff@google.com) This code is a library that forms the core building block for a fast diff --git a/Vagrantfile b/Vagrantfile deleted file mode 100644 index cdee5db533..0000000000 --- a/Vagrantfile +++ /dev/null @@ -1,16 +0,0 @@ -Vagrant.configure("2") do |config| - - config.vm.provider "virtualbox" do |v| - v.memory = 4096 - v.cpus = 2 - end - - config.vm.define "ubuntu14" do |box| - box.vm.box = "ubuntu/trusty64" - end - - config.vm.define "centos65" do |box| - box.vm.box = "chef/centos-6.5" - end - -end diff --git a/build_tools/build_detect_platform b/build_tools/build_detect_platform index 8479e31274..3389d2851b 100755 --- a/build_tools/build_detect_platform +++ b/build_tools/build_detect_platform @@ -46,7 +46,7 @@ PLATFORM_CXXFLAGS="-std=c++11" COMMON_FLAGS="-DROCKSDB_PLATFORM_POSIX" # Default to fbcode gcc on internal fb machines -if [ -z "$ROCKSDB_NO_FBCODE" -a -d /mnt/gvfs/third-party ]; then +if [ -d /mnt/gvfs/third-party -a -z 
"$CXX" ]; then FBCODE_BUILD="true" if [ -z "$USE_CLANG" ]; then CENTOS_VERSION=`rpm -q --qf "%{VERSION}" \ diff --git a/build_tools/make_package.sh b/build_tools/make_package.sh deleted file mode 100755 index 2ca28023de..0000000000 --- a/build_tools/make_package.sh +++ /dev/null @@ -1,116 +0,0 @@ -#/usr/bin/env bash - -set -e - -function log() { - echo "[+] $1" -} - -function fatal() { - echo "[!] $1" - exit 1 -} - -function platform() { - local __resultvar=$1 - if [[ -f "/etc/yum.conf" ]]; then - eval $__resultvar="centos" - elif [[ -f "/etc/dpkg/dpkg.cfg" ]]; then - eval $__resultvar="ubuntu" - else - fatal "Unknwon operating system" - fi -} -platform OS - -function package() { - if [[ $OS = "ubuntu" ]]; then - if dpkg --get-selections | grep --quiet $1; then - log "$1 is already installed. skipping." - else - apt-get install $@ -y - fi - elif [[ $OS = "centos" ]]; then - if rpm -qa | grep --quiet $1; then - log "$1 is already installed. skipping." - else - yum install $@ -y - fi - fi -} - -function detect_fpm_output() { - if [[ $OS = "ubuntu" ]]; then - export FPM_OUTPUT=deb - elif [[ $OS = "centos" ]]; then - export FPM_OUTPUT=rpm - fi -} -detect_fpm_output - -function gem_install() { - if gem list | grep --quiet $1; then - log "$1 is already installed. skipping." - else - gem install $@ - fi -} - -function main() { - if [[ $# -ne 1 ]]; then - fatal "Usage: $0 " - else - log "using rocksdb version: $1" - fi - - if [[ -d /vagrant ]]; then - if [[ $OS = "ubuntu" ]]; then - package g++-4.7 - export CXX=g++-4.7 - - # the deb would depend on libgflags2, but the static lib is the only thing - # installed by make install - package libgflags-dev - - package ruby-all-dev - elif [[ $OS = "centos" ]]; then - pushd /etc/yum.repos.d - if [[ ! 
-f /etc/yum.repos.d/devtools-1.1.repo ]]; then - wget http://people.centos.org/tru/devtools-1.1/devtools-1.1.repo - fi - package devtoolset-1.1-gcc --enablerepo=testing-1.1-devtools-6 - package devtoolset-1.1-gcc-c++ --enablerepo=testing-1.1-devtools-6 - export CC=/opt/centos/devtoolset-1.1/root/usr/bin/gcc - export CPP=/opt/centos/devtoolset-1.1/root/usr/bin/cpp - export CXX=/opt/centos/devtoolset-1.1/root/usr/bin/c++ - export PATH=$PATH:/opt/centos/devtoolset-1.1/root/usr/bin - popd - if ! rpm -qa | grep --quiet gflags; then - rpm -i https://github.com/schuhschuh/gflags/releases/download/v2.1.0/gflags-devel-2.1.0-1.amd64.rpm - fi - - package ruby - package ruby-devel - package rubygems - package rpm-build - fi - fi - gem_install fpm - - make static_lib - make install INSTALL_PATH=package - fpm \ - -s dir \ - -t $FPM_OUTPUT \ - -n rocksdb \ - -v $1 \ - --prefix /usr \ - --url http://rocksdb.org/ \ - -m rocksdb@fb.com \ - --license BSD \ - --vendor Facebook \ - --description "RocksDB is an embeddable persistent key-value store for fast storage." 
\ - package -} - -main $@ diff --git a/build_tools/regression_build_test.sh b/build_tools/regression_build_test.sh index ee2d334f0b..5e335afde2 100755 --- a/build_tools/regression_build_test.sh +++ b/build_tools/regression_build_test.sh @@ -344,38 +344,6 @@ common_in_mem_args="--db=/dev/shm/rocksdb \ --threads=32 \ --writes_per_second=81920 > ${STAT_FILE}.seekwhilewriting_in_ram -# measure fillseq with bunch of column families -./db_bench \ - --benchmarks=fillseq \ - --num_column_families=500 \ - --write_buffer_size=1048576 \ - --db=$DATA_DIR \ - --use_existing_db=0 \ - --num=$NUM \ - --writes=$NUM \ - --open_files=55000 \ - --statistics=1 \ - --histogram=1 \ - --disable_data_sync=1 \ - --disable_wal=1 \ - --sync=0 > ${STAT_FILE}.fillseq_lots_column_families - -# measure overwrite performance with bunch of column families -./db_bench \ - --benchmarks=overwrite \ - --num_column_families=500 \ - --write_buffer_size=1048576 \ - --db=$DATA_DIR \ - --use_existing_db=1 \ - --num=$NUM \ - --writes=$((NUM / 10)) \ - --open_files=55000 \ - --statistics=1 \ - --histogram=1 \ - --disable_data_sync=1 \ - --disable_wal=1 \ - --sync=0 \ - --threads=8 > ${STAT_FILE}.overwrite_lots_column_families # send data to ods function send_to_ods { @@ -424,5 +392,3 @@ send_benchmark_to_ods readrandom memtablereadrandom $STAT_FILE.memtablefillreadr send_benchmark_to_ods readwhilewriting readwhilewriting $STAT_FILE.readwhilewriting send_benchmark_to_ods readwhilewriting readwhilewriting_in_ram ${STAT_FILE}.readwhilewriting_in_ram send_benchmark_to_ods seekrandomwhilewriting seekwhilewriting_in_ram ${STAT_FILE}.seekwhilewriting_in_ram -send_benchmark_to_ods fillseq fillseq_lots_column_families ${STAT_FILE}.fillseq_lots_column_families -send_benchmark_to_ods overwrite overwrite_lots_column_families ${STAT_FILE}.overwrite_lots_column_families diff --git a/db/builder.cc b/db/builder.cc index 2c50943703..1084f04138 100644 --- a/db/builder.cc +++ b/db/builder.cc @@ -26,24 +26,21 @@ namespace 
rocksdb { class TableFactory; -TableBuilder* NewTableBuilder(const ImmutableCFOptions& ioptions, +TableBuilder* NewTableBuilder(const Options& options, const InternalKeyComparator& internal_comparator, WritableFile* file, - const CompressionType compression_type, - const CompressionOptions& compression_opts) { - return ioptions.table_factory->NewTableBuilder( - ioptions, internal_comparator, file, compression_type, compression_opts); + CompressionType compression_type) { + return options.table_factory->NewTableBuilder(options, internal_comparator, + file, compression_type); } -Status BuildTable(const std::string& dbname, Env* env, - const ImmutableCFOptions& ioptions, - const EnvOptions& env_options, TableCache* table_cache, +Status BuildTable(const std::string& dbname, Env* env, const Options& options, + const EnvOptions& soptions, TableCache* table_cache, Iterator* iter, FileMetaData* meta, const InternalKeyComparator& internal_comparator, const SequenceNumber newest_snapshot, const SequenceNumber earliest_seqno_in_memtable, const CompressionType compression, - const CompressionOptions& compression_opts, const Env::IOPriority io_priority) { Status s; meta->fd.file_size = 0; @@ -53,24 +50,23 @@ Status BuildTable(const std::string& dbname, Env* env, // If the sequence number of the smallest entry in the memtable is // smaller than the most recent snapshot, then we do not trigger // removal of duplicate/deleted keys as part of this builder. 
- bool purge = ioptions.purge_redundant_kvs_while_flush; + bool purge = options.purge_redundant_kvs_while_flush; if (earliest_seqno_in_memtable <= newest_snapshot) { purge = false; } - std::string fname = TableFileName(ioptions.db_paths, meta->fd.GetNumber(), + std::string fname = TableFileName(options.db_paths, meta->fd.GetNumber(), meta->fd.GetPathId()); if (iter->Valid()) { unique_ptr file; - s = env->NewWritableFile(fname, &file, env_options); + s = env->NewWritableFile(fname, &file, soptions); if (!s.ok()) { return s; } file->SetIOPriority(io_priority); - TableBuilder* builder = NewTableBuilder( - ioptions, internal_comparator, file.get(), - compression, compression_opts); + TableBuilder* builder = + NewTableBuilder(options, internal_comparator, file.get(), compression); // the first key is the smallest key Slice key = iter->key(); @@ -79,8 +75,8 @@ Status BuildTable(const std::string& dbname, Env* env, meta->largest_seqno = meta->smallest_seqno; MergeHelper merge(internal_comparator.user_comparator(), - ioptions.merge_operator, ioptions.info_log, - ioptions.min_partial_merge_operands, + options.merge_operator.get(), options.info_log.get(), + options.min_partial_merge_operands, true /* internal key corruption is not ok */); if (purge) { @@ -200,12 +196,12 @@ Status BuildTable(const std::string& dbname, Env* env, delete builder; // Finish and check for file errors - if (s.ok() && !ioptions.disable_data_sync) { - if (ioptions.use_fsync) { - StopWatch sw(env, ioptions.statistics, TABLE_SYNC_MICROS); + if (s.ok() && !options.disableDataSync) { + if (options.use_fsync) { + StopWatch sw(env, options.statistics.get(), TABLE_SYNC_MICROS); s = file->Fsync(); } else { - StopWatch sw(env, ioptions.statistics, TABLE_SYNC_MICROS); + StopWatch sw(env, options.statistics.get(), TABLE_SYNC_MICROS); s = file->Sync(); } } @@ -215,7 +211,7 @@ Status BuildTable(const std::string& dbname, Env* env, if (s.ok()) { // Verify that the table is usable - Iterator* it = 
table_cache->NewIterator(ReadOptions(), env_options, + Iterator* it = table_cache->NewIterator(ReadOptions(), soptions, internal_comparator, meta->fd); s = it->status(); delete it; diff --git a/db/builder.h b/db/builder.h index cf3ebd1ae0..f57501abd1 100644 --- a/db/builder.h +++ b/db/builder.h @@ -11,7 +11,6 @@ #include "rocksdb/status.h" #include "rocksdb/types.h" #include "rocksdb/options.h" -#include "rocksdb/immutable_options.h" namespace rocksdb { @@ -27,10 +26,8 @@ class TableBuilder; class WritableFile; extern TableBuilder* NewTableBuilder( - const ImmutableCFOptions& options, - const InternalKeyComparator& internal_comparator, - WritableFile* file, const CompressionType compression_type, - const CompressionOptions& compression_opts); + const Options& options, const InternalKeyComparator& internal_comparator, + WritableFile* file, CompressionType compression_type); // Build a Table file from the contents of *iter. The generated file // will be named according to number specified in meta. On success, the rest of @@ -38,15 +35,13 @@ extern TableBuilder* NewTableBuilder( // If no data is present in *iter, meta->file_size will be set to // zero, and no Table file will be produced. 
extern Status BuildTable(const std::string& dbname, Env* env, - const ImmutableCFOptions& options, - const EnvOptions& env_options, + const Options& options, const EnvOptions& soptions, TableCache* table_cache, Iterator* iter, FileMetaData* meta, const InternalKeyComparator& internal_comparator, const SequenceNumber newest_snapshot, const SequenceNumber earliest_seqno_in_memtable, const CompressionType compression, - const CompressionOptions& compression_opts, const Env::IOPriority io_priority = Env::IO_HIGH); } // namespace rocksdb diff --git a/db/c.cc b/db/c.cc index b3077aaad6..3114f35004 100644 --- a/db/c.cc +++ b/db/c.cc @@ -118,7 +118,7 @@ struct rocksdb_compactionfilter_t : public CompactionFilter { const Slice& existing_value, std::string* new_value, bool* value_changed) const { - char* c_new_value = nullptr; + char* c_new_value = NULL; size_t new_value_length = 0; unsigned char c_value_changed = 0; unsigned char result = (*filter_)( @@ -1355,8 +1355,8 @@ void rocksdb_options_set_purge_redundant_kvs_while_flush( opt->rep.purge_redundant_kvs_while_flush = v; } -void rocksdb_options_set_allow_os_buffer(rocksdb_options_t* opt, - unsigned char v) { +void rocksdb_options_set_allow_os_buffer( + rocksdb_options_t* opt, unsigned char v) { opt->rep.allow_os_buffer = v; } @@ -1581,6 +1581,11 @@ void rocksdb_options_set_bloom_locality( opt->rep.bloom_locality = v; } +void rocksdb_options_set_allow_thread_local( + rocksdb_options_t* opt, unsigned char v) { + opt->rep.allow_thread_local = v; +} + void rocksdb_options_set_inplace_update_support( rocksdb_options_t* opt, unsigned char v) { opt->rep.inplace_update_support = v; @@ -1839,13 +1844,6 @@ void rocksdb_readoptions_set_snapshot( opt->rep.snapshot = (snap ? 
snap->rep : nullptr); } -void rocksdb_readoptions_set_iterate_upper_bound( - rocksdb_readoptions_t* opt, - const char* key, size_t keylen) { - Slice prefix = Slice(key, keylen); - opt->rep.iterate_upper_bound = &prefix; -} - void rocksdb_readoptions_set_read_tier( rocksdb_readoptions_t* opt, int v) { opt->rep.read_tier = static_cast(v); diff --git a/db/column_family.cc b/db/column_family.cc index 0beb23c918..b1c9ba7e83 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -9,11 +9,6 @@ #include "db/column_family.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include #include #include #include @@ -24,43 +19,11 @@ #include "db/internal_stats.h" #include "db/compaction_picker.h" #include "db/table_properties_collector.h" -#include "db/write_controller.h" #include "util/autovector.h" #include "util/hash_skiplist_rep.h" -#include "util/options_helper.h" namespace rocksdb { -namespace { -// This function computes the amount of time in microseconds by which a write -// should be delayed based on the number of level-0 files according to the -// following formula: -// if n < bottom, return 0; -// if n >= top, return 1000; -// otherwise, let r = (n - bottom) / -// (top - bottom) -// and return r^2 * 1000. -// The goal of this formula is to gradually increase the rate at which writes -// are slowed. We also tried linear delay (r * 1000), but it seemed to do -// slightly worse. There is no other particular reason for choosing quadratic. -uint64_t SlowdownAmount(int n, double bottom, double top) { - uint64_t delay; - if (n >= top) { - delay = 1000; - } else if (n < bottom) { - delay = 0; - } else { - // If we are here, we know that: - // level0_start_slowdown <= n < level0_slowdown - // since the previous two conditions are false. 
- double how_much = static_cast(n - bottom) / (top - bottom); - delay = std::max(how_much * how_much * 1000, 100.0); - } - assert(delay <= 1000); - return delay; -} -} // namespace - ColumnFamilyHandleImpl::ColumnFamilyHandleImpl(ColumnFamilyData* cfd, DBImpl* db, port::Mutex* mutex) : cfd_(cfd), db_(db), mutex_(mutex) { @@ -86,10 +49,6 @@ ColumnFamilyHandleImpl::~ColumnFamilyHandleImpl() { uint32_t ColumnFamilyHandleImpl::GetID() const { return cfd()->GetID(); } -const Comparator* ColumnFamilyHandleImpl::user_comparator() const { - return cfd()->user_comparator(); -} - ColumnFamilyOptions SanitizeOptions(const InternalKeyComparator* icmp, const ColumnFamilyOptions& src) { ColumnFamilyOptions result = src; @@ -217,9 +176,9 @@ void SuperVersionUnrefHandle(void* ptr) { ColumnFamilyData::ColumnFamilyData(uint32_t id, const std::string& name, Version* dummy_versions, Cache* table_cache, - const ColumnFamilyOptions& cf_options, + const ColumnFamilyOptions& options, const DBOptions* db_options, - const EnvOptions& env_options, + const EnvOptions& storage_options, ColumnFamilySet* column_family_set) : id_(id), name_(name), @@ -227,10 +186,8 @@ ColumnFamilyData::ColumnFamilyData(uint32_t id, const std::string& name, current_(nullptr), refs_(0), dropped_(false), - internal_comparator_(cf_options.comparator), - options_(*db_options, SanitizeOptions(&internal_comparator_, cf_options)), - ioptions_(options_), - mutable_cf_options_(options_, ioptions_), + internal_comparator_(options.comparator), + options_(*db_options, SanitizeOptions(&internal_comparator_, options)), mem_(nullptr), imm_(options_.min_write_buffer_number_to_merge), super_version_(nullptr), @@ -239,33 +196,34 @@ ColumnFamilyData::ColumnFamilyData(uint32_t id, const std::string& name, next_(nullptr), prev_(nullptr), log_number_(0), + need_slowdown_for_num_level0_files_(false), column_family_set_(column_family_set) { Ref(); // if dummy_versions is nullptr, then this is a dummy column family. 
if (dummy_versions != nullptr) { internal_stats_.reset( - new InternalStats(ioptions_.num_levels, db_options->env, this)); - table_cache_.reset(new TableCache(ioptions_, env_options, table_cache)); - if (ioptions_.compaction_style == kCompactionStyleUniversal) { + new InternalStats(options_.num_levels, db_options->env, this)); + table_cache_.reset(new TableCache(&options_, storage_options, table_cache)); + if (options_.compaction_style == kCompactionStyleUniversal) { compaction_picker_.reset( - new UniversalCompactionPicker(ioptions_, &internal_comparator_)); - } else if (ioptions_.compaction_style == kCompactionStyleLevel) { + new UniversalCompactionPicker(&options_, &internal_comparator_)); + } else if (options_.compaction_style == kCompactionStyleLevel) { compaction_picker_.reset( - new LevelCompactionPicker(ioptions_, &internal_comparator_)); + new LevelCompactionPicker(&options_, &internal_comparator_)); } else { - assert(ioptions_.compaction_style == kCompactionStyleFIFO); + assert(options_.compaction_style == kCompactionStyleFIFO); compaction_picker_.reset( - new FIFOCompactionPicker(ioptions_, &internal_comparator_)); + new FIFOCompactionPicker(&options_, &internal_comparator_)); } - Log(ioptions_.info_log, "Options for column family \"%s\":\n", + Log(options_.info_log, "Options for column family \"%s\":\n", name.c_str()); const ColumnFamilyOptions* cf_options = &options_; - cf_options->Dump(ioptions_.info_log); + cf_options->Dump(options_.info_log.get()); } - RecalculateWriteStallConditions(mutable_cf_options_); + RecalculateWriteStallConditions(); } // DB mutex held @@ -318,107 +276,84 @@ ColumnFamilyData::~ColumnFamilyData() { } } -void ColumnFamilyData::RecalculateWriteStallConditions( - const MutableCFOptions& mutable_cf_options) { +void ColumnFamilyData::RecalculateWriteStallConditions() { + need_wait_for_num_memtables_ = + (imm()->size() == options()->max_write_buffer_number - 1); + if (current_ != nullptr) { - const double score = 
current_->MaxCompactionScore(); - const int max_level = current_->MaxCompactionScoreLevel(); + need_wait_for_num_level0_files_ = + (current_->NumLevelFiles(0) >= options()->level0_stop_writes_trigger); + } else { + need_wait_for_num_level0_files_ = false; + } - auto write_controller = column_family_set_->write_controller_; + RecalculateWriteStallRateLimitsConditions(); +} - if (imm()->size() == options_.max_write_buffer_number) { - write_controller_token_ = write_controller->GetStopToken(); - internal_stats_->AddCFStats(InternalStats::MEMTABLE_COMPACTION, 1); - Log(ioptions_.info_log, - "[%s] Stopping writes because we have %d immutable memtables " - "(waiting for flush)", - name_.c_str(), imm()->size()); - } else if (current_->NumLevelFiles(0) >= - mutable_cf_options.level0_stop_writes_trigger) { - write_controller_token_ = write_controller->GetStopToken(); - internal_stats_->AddCFStats(InternalStats::LEVEL0_NUM_FILES, 1); - Log(ioptions_.info_log, - "[%s] Stopping writes because we have %d level-0 files", - name_.c_str(), current_->NumLevelFiles(0)); - } else if (mutable_cf_options.level0_slowdown_writes_trigger >= 0 && - current_->NumLevelFiles(0) >= - mutable_cf_options.level0_slowdown_writes_trigger) { - uint64_t slowdown = SlowdownAmount( - current_->NumLevelFiles(0), - mutable_cf_options.level0_slowdown_writes_trigger, - mutable_cf_options.level0_stop_writes_trigger); - write_controller_token_ = write_controller->GetDelayToken(slowdown); - internal_stats_->AddCFStats(InternalStats::LEVEL0_SLOWDOWN, slowdown); - Log(ioptions_.info_log, - "[%s] Stalling writes because we have %d level-0 files (%" PRIu64 - "us)", - name_.c_str(), current_->NumLevelFiles(0), slowdown); - } else if (options_.hard_rate_limit > 1.0 && - score > options_.hard_rate_limit) { - uint64_t kHardLimitSlowdown = 1000; - write_controller_token_ = - write_controller->GetDelayToken(kHardLimitSlowdown); - internal_stats_->RecordLevelNSlowdown(max_level, kHardLimitSlowdown, - false); - 
Log(ioptions_.info_log, - "[%s] Stalling writes because we hit hard limit on level %d. " - "(%" PRIu64 "us)", - name_.c_str(), max_level, kHardLimitSlowdown); - } else if (options_.soft_rate_limit > 0.0 && - score > options_.soft_rate_limit) { - uint64_t slowdown = SlowdownAmount(score, options_.soft_rate_limit, - options_.hard_rate_limit); - write_controller_token_ = write_controller->GetDelayToken(slowdown); - internal_stats_->RecordLevelNSlowdown(max_level, slowdown, true); - Log(ioptions_.info_log, - "[%s] Stalling writes because we hit soft limit on level %d (%" PRIu64 - "us)", - name_.c_str(), max_level, slowdown); - } else { - write_controller_token_.reset(); - } +void ColumnFamilyData::RecalculateWriteStallRateLimitsConditions() { + if (current_ != nullptr) { + exceeds_hard_rate_limit_ = + (options()->hard_rate_limit > 1.0 && + current_->MaxCompactionScore() > options()->hard_rate_limit); + + exceeds_soft_rate_limit_ = + (options()->soft_rate_limit > 0.0 && + current_->MaxCompactionScore() > options()->soft_rate_limit); + } else { + exceeds_hard_rate_limit_ = false; + exceeds_soft_rate_limit_ = false; } } const EnvOptions* ColumnFamilyData::soptions() const { - return &(column_family_set_->env_options_); + return &(column_family_set_->storage_options_); } -void ColumnFamilyData::SetCurrent(Version* current) { current_ = current; } +void ColumnFamilyData::SetCurrent(Version* current) { + current_ = current; + need_slowdown_for_num_level0_files_ = + (options_.level0_slowdown_writes_trigger >= 0 && + current_->NumLevelFiles(0) >= options_.level0_slowdown_writes_trigger); +} -void ColumnFamilyData::CreateNewMemtable(const MemTableOptions& moptions) { +void ColumnFamilyData::CreateNewMemtable() { assert(current_ != nullptr); if (mem_ != nullptr) { delete mem_->Unref(); } - mem_ = new MemTable(internal_comparator_, ioptions_, moptions); + mem_ = new MemTable(internal_comparator_, options_); mem_->Ref(); } -Compaction* ColumnFamilyData::PickCompaction( - const 
MutableCFOptions& mutable_options, LogBuffer* log_buffer) { - auto result = compaction_picker_->PickCompaction( - mutable_options, current_, log_buffer); +Compaction* ColumnFamilyData::PickCompaction(LogBuffer* log_buffer) { + auto result = compaction_picker_->PickCompaction(current_, log_buffer); + RecalculateWriteStallRateLimitsConditions(); return result; } -Compaction* ColumnFamilyData::CompactRange( - const MutableCFOptions& mutable_cf_options, - int input_level, int output_level, uint32_t output_path_id, - const InternalKey* begin, const InternalKey* end, - InternalKey** compaction_end) { - return compaction_picker_->CompactRange( - mutable_cf_options, current_, input_level, output_level, - output_path_id, begin, end, compaction_end); +Compaction* ColumnFamilyData::CompactRange(int input_level, int output_level, + uint32_t output_path_id, + const InternalKey* begin, + const InternalKey* end, + InternalKey** compaction_end) { + return compaction_picker_->CompactRange(current_, input_level, output_level, + output_path_id, begin, end, + compaction_end); } SuperVersion* ColumnFamilyData::GetReferencedSuperVersion( port::Mutex* db_mutex) { SuperVersion* sv = nullptr; - sv = GetThreadLocalSuperVersion(db_mutex); - sv->Ref(); - if (!ReturnThreadLocalSuperVersion(sv)) { - sv->Unref(); + if (LIKELY(column_family_set_->db_options_->allow_thread_local)) { + sv = GetThreadLocalSuperVersion(db_mutex); + sv->Ref(); + if (!ReturnThreadLocalSuperVersion(sv)) { + sv->Unref(); + } + } else { + db_mutex->Lock(); + sv = super_version_->Ref(); + db_mutex->Unlock(); } return sv; } @@ -447,11 +382,11 @@ SuperVersion* ColumnFamilyData::GetThreadLocalSuperVersion( sv = static_cast(ptr); if (sv == SuperVersion::kSVObsolete || sv->version_number != super_version_number_.load()) { - RecordTick(ioptions_.statistics, NUMBER_SUPERVERSION_ACQUIRES); + RecordTick(options_.statistics.get(), NUMBER_SUPERVERSION_ACQUIRES); SuperVersion* sv_to_delete = nullptr; if (sv && sv->Unref()) { - 
RecordTick(ioptions_.statistics, NUMBER_SUPERVERSION_CLEANUPS); + RecordTick(options_.statistics.get(), NUMBER_SUPERVERSION_CLEANUPS); db_mutex->Lock(); // NOTE: underlying resources held by superversion (sst files) might // not be released until the next background job. @@ -489,24 +424,18 @@ bool ColumnFamilyData::ReturnThreadLocalSuperVersion(SuperVersion* sv) { SuperVersion* ColumnFamilyData::InstallSuperVersion( SuperVersion* new_superversion, port::Mutex* db_mutex) { - db_mutex->AssertHeld(); - return InstallSuperVersion(new_superversion, db_mutex, mutable_cf_options_); -} - -SuperVersion* ColumnFamilyData::InstallSuperVersion( - SuperVersion* new_superversion, port::Mutex* db_mutex, - const MutableCFOptions& mutable_cf_options) { new_superversion->db_mutex = db_mutex; - new_superversion->mutable_cf_options = mutable_cf_options; new_superversion->Init(mem_, imm_.current(), current_); SuperVersion* old_superversion = super_version_; super_version_ = new_superversion; ++super_version_number_; super_version_->version_number = super_version_number_; // Reset SuperVersions cached in thread local storage - ResetThreadLocalSuperVersions(); + if (column_family_set_->db_options_->allow_thread_local) { + ResetThreadLocalSuperVersions(); + } - RecalculateWriteStallConditions(mutable_cf_options); + RecalculateWriteStallConditions(); if (old_superversion != nullptr && old_superversion->Unref()) { old_superversion->Cleanup(); @@ -531,33 +460,19 @@ void ColumnFamilyData::ResetThreadLocalSuperVersions() { } } -bool ColumnFamilyData::SetOptions( - const std::unordered_map& options_map) { - MutableCFOptions new_mutable_cf_options; - if (GetMutableOptionsFromStrings(mutable_cf_options_, options_map, - &new_mutable_cf_options)) { - mutable_cf_options_ = new_mutable_cf_options; - mutable_cf_options_.RefreshDerivedOptions(ioptions_); - return true; - } - return false; -} - ColumnFamilySet::ColumnFamilySet(const std::string& dbname, const DBOptions* db_options, - const EnvOptions& 
env_options, - Cache* table_cache, - WriteController* write_controller) + const EnvOptions& storage_options, + Cache* table_cache) : max_column_family_(0), dummy_cfd_(new ColumnFamilyData(0, "", nullptr, nullptr, ColumnFamilyOptions(), db_options, - env_options, nullptr)), + storage_options_, nullptr)), default_cfd_cache_(nullptr), db_name_(dbname), db_options_(db_options), - env_options_(env_options), + storage_options_(storage_options), table_cache_(table_cache), - write_controller_(write_controller), spin_lock_(ATOMIC_FLAG_INIT) { // initialize linked list dummy_cfd_->prev_ = dummy_cfd_; @@ -622,7 +537,7 @@ ColumnFamilyData* ColumnFamilySet::CreateColumnFamily( assert(column_families_.find(name) == column_families_.end()); ColumnFamilyData* new_cfd = new ColumnFamilyData(id, name, dummy_versions, table_cache_, options, - db_options_, env_options_, this); + db_options_, storage_options_, this); Lock(); column_families_.insert({name, id}); column_family_data_.insert({id, new_cfd}); @@ -681,11 +596,6 @@ bool ColumnFamilyMemTablesImpl::Seek(uint32_t column_family_id) { column_family_set_->Lock(); current_ = column_family_set_->GetColumnFamily(column_family_id); column_family_set_->Unlock(); - // TODO(icanadi) Maybe remove column family from the hash table when it's - // dropped? 
- if (current_ != nullptr && current_->IsDropped()) { - current_ = nullptr; - } } handle_.SetCFD(current_); return current_ != nullptr; @@ -711,13 +621,6 @@ ColumnFamilyHandle* ColumnFamilyMemTablesImpl::GetColumnFamilyHandle() { return &handle_; } -void ColumnFamilyMemTablesImpl::CheckMemtableFull() { - if (current_ != nullptr && current_->mem()->ShouldScheduleFlush()) { - flush_scheduler_->ScheduleFlush(current_); - current_->mem()->MarkFlushScheduled(); - } -} - uint32_t GetColumnFamilyID(ColumnFamilyHandle* column_family) { uint32_t column_family_id = 0; if (column_family != nullptr) { @@ -727,13 +630,4 @@ uint32_t GetColumnFamilyID(ColumnFamilyHandle* column_family) { return column_family_id; } -const Comparator* GetColumnFamilyUserComparator( - ColumnFamilyHandle* column_family) { - if (column_family != nullptr) { - auto cfh = reinterpret_cast(column_family); - return cfh->user_comparator(); - } - return nullptr; -} - } // namespace rocksdb diff --git a/db/column_family.h b/db/column_family.h index 9c415c2a84..33bceadc62 100644 --- a/db/column_family.h +++ b/db/column_family.h @@ -19,11 +19,8 @@ #include "rocksdb/env.h" #include "db/memtable_list.h" #include "db/write_batch_internal.h" -#include "db/write_controller.h" #include "db/table_cache.h" #include "util/thread_local.h" -#include "db/flush_scheduler.h" -#include "util/mutable_cf_options.h" namespace rocksdb { @@ -49,7 +46,6 @@ class ColumnFamilyHandleImpl : public ColumnFamilyHandle { // destroy without mutex virtual ~ColumnFamilyHandleImpl(); virtual ColumnFamilyData* cfd() const { return cfd_; } - virtual const Comparator* user_comparator() const; virtual uint32_t GetID() const; @@ -82,7 +78,6 @@ struct SuperVersion { MemTable* mem; MemTableListVersion* imm; Version* current; - MutableCFOptions mutable_cf_options; std::atomic refs; // We need to_delete because during Cleanup(), imm->Unref() returns // all memtables that we need to free through this vector. 
We then @@ -137,7 +132,7 @@ class ColumnFamilyData { void Ref() { ++refs_; } // will just decrease reference count to 0, but will not delete it. returns // true if the ref count was decreased to zero. in that case, it can be - // deleted by the caller immediately, or later, by calling + // deleted by the caller immediatelly, or later, by calling // FreeDeadColumnFamilies() bool Unref() { assert(refs_ > 0); @@ -161,7 +156,6 @@ class ColumnFamilyData { // can't drop default CF assert(id_ != 0); dropped_ = true; - write_controller_token_.reset(); } bool IsDropped() const { return dropped_; } @@ -174,21 +168,6 @@ class ColumnFamilyData { // thread-safe const Options* options() const { return &options_; } const EnvOptions* soptions() const; - const ImmutableCFOptions* ioptions() const { return &ioptions_; } - // REQUIRES: DB mutex held - // This returns the MutableCFOptions used by current SuperVersion - // You shoul use this API to reference MutableCFOptions most of the time. - const MutableCFOptions* mutable_cf_options() const { - return &(super_version_->mutable_cf_options); - } - // REQUIRES: DB mutex held - // This returns the latest MutableCFOptions, which may be not in effect yet. 
- const MutableCFOptions* GetLatestMutableCFOptions() const { - return &mutable_cf_options_; - } - // REQUIRES: DB mutex held - bool SetOptions( - const std::unordered_map& options_map); InternalStats* internal_stats() { return internal_stats_.get(); } @@ -198,19 +177,16 @@ class ColumnFamilyData { Version* dummy_versions() { return dummy_versions_; } void SetMemtable(MemTable* new_mem) { mem_ = new_mem; } void SetCurrent(Version* current); - void CreateNewMemtable(const MemTableOptions& moptions); + void CreateNewMemtable(); TableCache* table_cache() const { return table_cache_.get(); } // See documentation in compaction_picker.h - // REQUIRES: DB mutex held - Compaction* PickCompaction(const MutableCFOptions& mutable_options, - LogBuffer* log_buffer); - Compaction* CompactRange( - const MutableCFOptions& mutable_cf_options, - int input_level, int output_level, uint32_t output_path_id, - const InternalKey* begin, const InternalKey* end, - InternalKey** compaction_end); + Compaction* PickCompaction(LogBuffer* log_buffer); + Compaction* CompactRange(int input_level, int output_level, + uint32_t output_path_id, const InternalKey* begin, + const InternalKey* end, + InternalKey** compaction_end); CompactionPicker* compaction_picker() { return compaction_picker_.get(); } // thread-safe @@ -242,20 +218,40 @@ class ColumnFamilyData { // if its reference count is zero and needs deletion or nullptr if not // As argument takes a pointer to allocated SuperVersion to enable // the clients to allocate SuperVersion outside of mutex. - SuperVersion* InstallSuperVersion(SuperVersion* new_superversion, - port::Mutex* db_mutex, - const MutableCFOptions& mutable_cf_options); SuperVersion* InstallSuperVersion(SuperVersion* new_superversion, port::Mutex* db_mutex); void ResetThreadLocalSuperVersions(); + // A Flag indicating whether write needs to slowdown because of there are + // too many number of level0 files. 
+ bool NeedSlowdownForNumLevel0Files() const { + return need_slowdown_for_num_level0_files_; + } + + bool NeedWaitForNumLevel0Files() const { + return need_wait_for_num_level0_files_; + } + + bool NeedWaitForNumMemtables() const { + return need_wait_for_num_memtables_; + } + + bool ExceedsSoftRateLimit() const { + return exceeds_soft_rate_limit_; + } + + bool ExceedsHardRateLimit() const { + return exceeds_hard_rate_limit_; + } + private: friend class ColumnFamilySet; ColumnFamilyData(uint32_t id, const std::string& name, Version* dummy_versions, Cache* table_cache, const ColumnFamilyOptions& options, - const DBOptions* db_options, const EnvOptions& env_options, + const DBOptions* db_options, + const EnvOptions& storage_options, ColumnFamilySet* column_family_set); // Recalculate some small conditions, which are changed only during @@ -263,8 +259,8 @@ class ColumnFamilyData { // recalculation of compaction score. These values are used in // DBImpl::MakeRoomForWrite function to decide, if it need to make // a write stall - void RecalculateWriteStallConditions( - const MutableCFOptions& mutable_cf_options); + void RecalculateWriteStallConditions(); + void RecalculateWriteStallRateLimitsConditions(); uint32_t id_; const std::string name_; @@ -276,9 +272,7 @@ class ColumnFamilyData { const InternalKeyComparator internal_comparator_; - const Options options_; - const ImmutableCFOptions ioptions_; - MutableCFOptions mutable_cf_options_; + Options const options_; std::unique_ptr table_cache_; @@ -307,13 +301,31 @@ class ColumnFamilyData { // recovered from uint64_t log_number_; + // A flag indicating whether we should delay writes because + // we have too many level 0 files + bool need_slowdown_for_num_level0_files_; + + // These 4 variables are updated only after compaction, + // adding new memtable, flushing memtables to files + // and/or add recalculation of compaction score. + // That's why theirs values are cached in ColumnFamilyData. 
+ // Recalculation is made by RecalculateWriteStallConditions and + // RecalculateWriteStallRateLimitsConditions functions. They are used + // in DBImpl::MakeRoomForWrite function to decide, if it needs + // to sleep during write operation + bool need_wait_for_num_memtables_; + + bool need_wait_for_num_level0_files_; + + bool exceeds_hard_rate_limit_; + + bool exceeds_soft_rate_limit_; + // An object that keeps all the compaction stats // and picks the next compaction std::unique_ptr<CompactionPicker> compaction_picker_; ColumnFamilySet* column_family_set_; - - std::unique_ptr write_controller_token_; }; // ColumnFamilySet has interesting thread-safety requirements @@ -355,8 +367,7 @@ class ColumnFamilySet { }; ColumnFamilySet(const std::string& dbname, const DBOptions* db_options, - const EnvOptions& env_options, Cache* table_cache, - WriteController* write_controller); + const EnvOptions& storage_options, Cache* table_cache); ~ColumnFamilySet(); ColumnFamilyData* GetDefault() const; @@ -409,9 +420,8 @@ class ColumnFamilySet { const std::string db_name_; const DBOptions* const db_options_; - const EnvOptions env_options_; + const EnvOptions storage_options_; Cache* table_cache_; - WriteController* write_controller_; std::atomic_flag spin_lock_; }; @@ -419,11 +429,8 @@ class ColumnFamilySet { // memtables of different column families (specified by ID in the write batch) class ColumnFamilyMemTablesImpl : public ColumnFamilyMemTables { public: - explicit ColumnFamilyMemTablesImpl(ColumnFamilySet* column_family_set, - FlushScheduler* flush_scheduler) - : column_family_set_(column_family_set), - current_(nullptr), - flush_scheduler_(flush_scheduler) {} + explicit ColumnFamilyMemTablesImpl(ColumnFamilySet* column_family_set) + : column_family_set_(column_family_set), current_(nullptr) {} // sets current_ to ColumnFamilyData with column_family_id // returns false if column family doesn't exist @@ -442,18 +449,12 @@ class ColumnFamilyMemTablesImpl : public ColumnFamilyMemTables { // Returns 
column family handle for the selected column family virtual ColumnFamilyHandle* GetColumnFamilyHandle() override; - virtual void CheckMemtableFull() override; - private: ColumnFamilySet* column_family_set_; ColumnFamilyData* current_; - FlushScheduler* flush_scheduler_; ColumnFamilyHandleInternal handle_; }; extern uint32_t GetColumnFamilyID(ColumnFamilyHandle* column_family); -extern const Comparator* GetColumnFamilyUserComparator( - ColumnFamilyHandle* column_family); - } // namespace rocksdb diff --git a/db/compaction.cc b/db/compaction.cc index f02feeee7a..0bffa0162f 100644 --- a/db/compaction.cc +++ b/db/compaction.cc @@ -9,10 +9,7 @@ #include "db/compaction.h" -#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS -#endif - #include #include @@ -56,6 +53,7 @@ Compaction::Compaction(Version* input_version, int start_level, int out_level, is_full_compaction_(false), is_manual_compaction_(false), level_ptrs_(std::vector(number_levels_)) { + cfd_->Ref(); input_version_->Ref(); edit_ = new VersionEdit(); @@ -112,8 +110,8 @@ void Compaction::AddInputDeletions(VersionEdit* edit) { } bool Compaction::KeyNotExistsBeyondOutputLevel(const Slice& user_key) { - assert(cfd_->ioptions()->compaction_style != kCompactionStyleFIFO); - if (cfd_->ioptions()->compaction_style == kCompactionStyleUniversal) { + assert(cfd_->options()->compaction_style != kCompactionStyleFIFO); + if (cfd_->options()->compaction_style == kCompactionStyleUniversal) { return bottommost_level_; } // Maybe use binary search to find right entry instead of linear search? @@ -176,8 +174,8 @@ void Compaction::MarkFilesBeingCompacted(bool mark_as_compacted) { // Is this compaction producing files at the bottommost level? 
void Compaction::SetupBottomMostLevel(bool is_manual) { - assert(cfd_->ioptions()->compaction_style != kCompactionStyleFIFO); - if (cfd_->ioptions()->compaction_style == kCompactionStyleUniversal) { + assert(cfd_->options()->compaction_style != kCompactionStyleFIFO); + if (cfd_->options()->compaction_style == kCompactionStyleUniversal) { // If universal compaction style is used and manual // compaction is occuring, then we are guaranteed that // all files will be picked in a single compaction @@ -266,12 +264,12 @@ void Compaction::Summary(char* output, int len) { snprintf(output + write, len - write, "]"); } -uint64_t Compaction::OutputFilePreallocationSize( - const MutableCFOptions& mutable_options) { +uint64_t Compaction::OutputFilePreallocationSize() { uint64_t preallocation_size = 0; - if (cfd_->ioptions()->compaction_style == kCompactionStyleLevel) { - preallocation_size = mutable_options.MaxFileSizeForLevel(output_level()); + if (cfd_->options()->compaction_style == kCompactionStyleLevel) { + preallocation_size = + cfd_->compaction_picker()->MaxFileSizeForLevel(output_level()); } else { for (int level = 0; level < num_input_levels(); ++level) { for (const auto& f : inputs_[level].files) { diff --git a/db/compaction.h b/db/compaction.h index 7c490946ac..6000f636be 100644 --- a/db/compaction.h +++ b/db/compaction.h @@ -10,7 +10,6 @@ #pragma once #include "util/arena.h" #include "util/autovector.h" -#include "util/mutable_cf_options.h" #include "db/version_set.h" namespace rocksdb { @@ -152,14 +151,10 @@ class Compaction { // Was this compaction triggered manually by the client? bool IsManualCompaction() { return is_manual_compaction_; } - // Return the MutableCFOptions that should be used throughout the compaction - // procedure - const MutableCFOptions* mutable_cf_options() { return &mutable_cf_options_; } - // Returns the size in bytes that the output file should be preallocated to. // In level compaction, that is max_file_size_. 
In universal compaction, that // is the sum of all input file sizes. - uint64_t OutputFilePreallocationSize(const MutableCFOptions& mutable_options); + uint64_t OutputFilePreallocationSize(); private: friend class CompactionPicker; @@ -176,7 +171,6 @@ class Compaction { const int output_level_; // levels to which output files are stored uint64_t max_output_file_size_; uint64_t max_grandparent_overlap_bytes_; - MutableCFOptions mutable_cf_options_; Version* input_version_; VersionEdit* edit_; int number_levels_; diff --git a/db/compaction_picker.cc b/db/compaction_picker.cc index 84bd95839b..e05d07776e 100644 --- a/db/compaction_picker.cc +++ b/db/compaction_picker.cc @@ -9,10 +9,7 @@ #include "db/compaction_picker.h" -#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS -#endif - #include #include #include "db/filename.h" @@ -35,36 +32,70 @@ namespace { // If enable_compression is false, then compression is always disabled no // matter what the values of the other two parameters are. // Otherwise, the compression type is determined based on options and level. -CompressionType GetCompressionType( - const ImmutableCFOptions& ioptions, int level, - const bool enable_compression = true) { +CompressionType GetCompressionType(const Options& options, int level, + const bool enable_compression = true) { if (!enable_compression) { // disable compression return kNoCompression; } // If the use has specified a different compression level for each level, - // then pick the compression for that level. - if (!ioptions.compression_per_level.empty()) { - const int n = ioptions.compression_per_level.size() - 1; + // then pick the compresison for that level. + if (!options.compression_per_level.empty()) { + const int n = options.compression_per_level.size() - 1; // It is possible for level_ to be -1; in that case, we use level // 0's compression. This occurs mostly in backwards compatibility // situations when the builder doesn't know what level the file - // belongs to. 
Likewise, if level is beyond the end of the + // belongs to. Likewise, if level_ is beyond the end of the // specified compression levels, use the last value. - return ioptions.compression_per_level[std::max(0, std::min(level, n))]; + return options.compression_per_level[std::max(0, std::min(level, n))]; } else { - return ioptions.compression; + return options.compression; } } +// Multiply two operands. If they overflow, return op1. +uint64_t MultiplyCheckOverflow(uint64_t op1, int op2) { + if (op1 == 0) { + return 0; + } + if (op2 <= 0) { + return op1; + } + uint64_t casted_op2 = (uint64_t) op2; + if (std::numeric_limits<uint64_t>::max() / op1 < casted_op2) { + return op1; + } + return op1 * casted_op2; +} } // anonymous namespace -CompactionPicker::CompactionPicker(const ImmutableCFOptions& ioptions, +CompactionPicker::CompactionPicker(const Options* options, const InternalKeyComparator* icmp) - : ioptions_(ioptions), - compactions_in_progress_(ioptions_.num_levels), + : compactions_in_progress_(options->num_levels), + options_(options), + num_levels_(options->num_levels), icmp_(icmp) { + + max_file_size_.reset(new uint64_t[NumberLevels()]); + level_max_bytes_.reset(new uint64_t[NumberLevels()]); + int target_file_size_multiplier = options_->target_file_size_multiplier; + int max_bytes_multiplier = options_->max_bytes_for_level_multiplier; + for (int i = 0; i < NumberLevels(); i++) { + if (i == 0 && options_->compaction_style == kCompactionStyleUniversal) { + max_file_size_[i] = ULLONG_MAX; + level_max_bytes_[i] = options_->max_bytes_for_level_base; + } else if (i > 1) { + max_file_size_[i] = MultiplyCheckOverflow(max_file_size_[i - 1], + target_file_size_multiplier); + level_max_bytes_[i] = MultiplyCheckOverflow( + MultiplyCheckOverflow(level_max_bytes_[i - 1], max_bytes_multiplier), + options_->max_bytes_for_level_multiplier_additional[i - 1]); + } else { + max_file_size_[i] = options_->target_file_size_base; + level_max_bytes_[i] = options_->max_bytes_for_level_base; + 
} + } } CompactionPicker::~CompactionPicker() {} @@ -92,6 +123,26 @@ void CompactionPicker::ReleaseCompactionFiles(Compaction* c, Status status) { } } +uint64_t CompactionPicker::MaxFileSizeForLevel(int level) const { + assert(level >= 0); + assert(level < NumberLevels()); + return max_file_size_[level]; +} + +uint64_t CompactionPicker::MaxGrandParentOverlapBytes(int level) { + uint64_t result = MaxFileSizeForLevel(level); + result *= options_->max_grandparent_overlap_factor; + return result; +} + +double CompactionPicker::MaxBytesForLevel(int level) { + // Note: the result for level zero is not really used since we set + // the level-0 compaction threshold based on number of files. + assert(level >= 0); + assert(level < NumberLevels()); + return level_max_bytes_[level]; +} + void CompactionPicker::GetRange(const std::vector& inputs, InternalKey* smallest, InternalKey* largest) { assert(!inputs.empty()); @@ -160,7 +211,7 @@ bool CompactionPicker::ExpandWhileOverlapping(Compaction* c) { // compaction, then we must drop/cancel this compaction. 
int parent_index = -1; if (c->inputs_[0].empty()) { - Log(ioptions_.info_log, + Log(options_->info_log, "[%s] ExpandWhileOverlapping() failure because zero input files", c->column_family_data()->GetName().c_str()); } @@ -175,6 +226,12 @@ bool CompactionPicker::ExpandWhileOverlapping(Compaction* c) { return true; } +uint64_t CompactionPicker::ExpandedCompactionByteSizeLimit(int level) { + uint64_t result = MaxFileSizeForLevel(level); + result *= options_->expanded_compaction_factor; + return result; +} + // Returns true if any one of specified files are being compacted bool CompactionPicker::FilesInCompaction(std::vector& files) { for (unsigned int i = 0; i < files.size(); i++) { @@ -202,8 +259,7 @@ bool CompactionPicker::ParentRangeInCompaction(Version* version, // Will also attempt to expand "level" if that doesn't expand "level+1" // or cause "level" to include a file for compaction that has an overlapping // user-key with another file. -void CompactionPicker::SetupOtherInputs( - const MutableCFOptions& mutable_cf_options, Compaction* c) { +void CompactionPicker::SetupOtherInputs(Compaction* c) { // If inputs are empty, then there is nothing to expand. 
// If both input and output levels are the same, no need to consider // files at level "level+1" @@ -239,7 +295,7 @@ void CompactionPicker::SetupOtherInputs( const uint64_t inputs0_size = TotalCompensatedFileSize(c->inputs_[0].files); const uint64_t inputs1_size = TotalCompensatedFileSize(c->inputs_[1].files); const uint64_t expanded0_size = TotalCompensatedFileSize(expanded0); - uint64_t limit = mutable_cf_options.ExpandedCompactionByteSizeLimit(level); + uint64_t limit = ExpandedCompactionByteSizeLimit(level); if (expanded0.size() > c->inputs_[0].size() && inputs1_size + expanded0_size < limit && !FilesInCompaction(expanded0) && @@ -252,7 +308,7 @@ void CompactionPicker::SetupOtherInputs( &c->parent_index_); if (expanded1.size() == c->inputs_[1].size() && !FilesInCompaction(expanded1)) { - Log(ioptions_.info_log, + Log(options_->info_log, "[%s] Expanding@%d %zu+%zu (%" PRIu64 "+%" PRIu64 " bytes) to %zu+%zu (%" PRIu64 "+%" PRIu64 "bytes)\n", c->column_family_data()->GetName().c_str(), level, @@ -277,20 +333,21 @@ void CompactionPicker::SetupOtherInputs( } } -Compaction* CompactionPicker::CompactRange( - const MutableCFOptions& mutable_cf_options, Version* version, - int input_level, int output_level, uint32_t output_path_id, - const InternalKey* begin, const InternalKey* end, - InternalKey** compaction_end) { +Compaction* CompactionPicker::CompactRange(Version* version, int input_level, + int output_level, + uint32_t output_path_id, + const InternalKey* begin, + const InternalKey* end, + InternalKey** compaction_end) { // CompactionPickerFIFO has its own implementation of compact range - assert(ioptions_.compaction_style != kCompactionStyleFIFO); + assert(options_->compaction_style != kCompactionStyleFIFO); std::vector inputs; bool covering_the_whole_range = true; // All files are 'overlapping' in universal style compaction. // We have to compact the entire range in one shot. 
- if (ioptions_.compaction_style == kCompactionStyleUniversal) { + if (options_->compaction_style == kCompactionStyleUniversal) { begin = nullptr; end = nullptr; } @@ -304,8 +361,8 @@ Compaction* CompactionPicker::CompactRange( // and we must not pick one file and drop another older file if the // two files overlap. if (input_level > 0) { - const uint64_t limit = mutable_cf_options.MaxFileSizeForLevel(input_level) * - mutable_cf_options.source_compaction_factor; + const uint64_t limit = + MaxFileSizeForLevel(input_level) * options_->source_compaction_factor; uint64_t total = 0; for (size_t i = 0; i + 1 < inputs.size(); ++i) { uint64_t s = inputs[i]->compensated_file_size; @@ -318,24 +375,22 @@ Compaction* CompactionPicker::CompactRange( } } } - assert(output_path_id < static_cast(ioptions_.db_paths.size())); + assert(output_path_id < static_cast(options_->db_paths.size())); Compaction* c = new Compaction( - version, input_level, output_level, - mutable_cf_options.MaxFileSizeForLevel(output_level), - mutable_cf_options.MaxGrandParentOverlapBytes(input_level), - output_path_id, - GetCompressionType(ioptions_, output_level)); + version, input_level, output_level, MaxFileSizeForLevel(output_level), + MaxGrandParentOverlapBytes(input_level), output_path_id, + GetCompressionType(*options_, output_level)); c->inputs_[0].files = inputs; if (ExpandWhileOverlapping(c) == false) { delete c; - Log(ioptions_.info_log, + Log(options_->info_log, "[%s] Could not compact due to expansion failure.\n", version->cfd_->GetName().c_str()); return nullptr; } - SetupOtherInputs(mutable_cf_options, c); + SetupOtherInputs(c); if (covering_the_whole_range) { *compaction_end = nullptr; @@ -350,14 +405,12 @@ Compaction* CompactionPicker::CompactRange( c->SetupBottomMostLevel(true); c->is_manual_compaction_ = true; - c->mutable_cf_options_ = mutable_cf_options; return c; } -Compaction* LevelCompactionPicker::PickCompaction( - const MutableCFOptions& mutable_cf_options, - Version* version, 
LogBuffer* log_buffer) { +Compaction* LevelCompactionPicker::PickCompaction(Version* version, + LogBuffer* log_buffer) { Compaction* c = nullptr; int level = -1; @@ -365,7 +418,7 @@ Compaction* LevelCompactionPicker::PickCompaction( // and also in LogAndApply(), otherwise the values could be stale. std::vector size_being_compacted(NumberLevels() - 1); SizeBeingCompacted(size_being_compacted); - version->ComputeCompactionScore(mutable_cf_options, size_being_compacted); + version->ComputeCompactionScore(size_being_compacted); // We prefer compactions triggered by too much data in a level over // the compactions triggered by seeks. @@ -376,8 +429,7 @@ Compaction* LevelCompactionPicker::PickCompaction( version->compaction_score_[i] <= version->compaction_score_[i - 1]); level = version->compaction_level_[i]; if ((version->compaction_score_[i] >= 1)) { - c = PickCompactionBySize(mutable_cf_options, version, level, - version->compaction_score_[i]); + c = PickCompactionBySize(version, level, version->compaction_score_[i]); if (c == nullptr || ExpandWhileOverlapping(c) == false) { delete c; c = nullptr; @@ -417,7 +469,7 @@ Compaction* LevelCompactionPicker::PickCompaction( } // Setup "level+1" files (inputs_[1]) - SetupOtherInputs(mutable_cf_options, c); + SetupOtherInputs(c); // mark all the files that are being compacted c->MarkFilesBeingCompacted(true); @@ -428,13 +480,12 @@ Compaction* LevelCompactionPicker::PickCompaction( // remember this currently undergoing compaction compactions_in_progress_[level].insert(c); - c->mutable_cf_options_ = mutable_cf_options; return c; } -Compaction* LevelCompactionPicker::PickCompactionBySize( - const MutableCFOptions& mutable_cf_options, - Version* version, int level, double score) { +Compaction* LevelCompactionPicker::PickCompactionBySize(Version* version, + int level, + double score) { Compaction* c = nullptr; // level 0 files are overlapping. 
So we cannot pick more @@ -447,10 +498,9 @@ Compaction* LevelCompactionPicker::PickCompactionBySize( assert(level >= 0); assert(level + 1 < NumberLevels()); - c = new Compaction(version, level, level + 1, - mutable_cf_options.MaxFileSizeForLevel(level + 1), - mutable_cf_options.MaxGrandParentOverlapBytes(level), 0, - GetCompressionType(ioptions_, level + 1)); + c = new Compaction(version, level, level + 1, MaxFileSizeForLevel(level + 1), + MaxGrandParentOverlapBytes(level), 0, + GetCompressionType(*options_, level + 1)); c->score_ = score; // Pick the largest file in this level that is not already @@ -510,37 +560,35 @@ Compaction* LevelCompactionPicker::PickCompactionBySize( // Universal style of compaction. Pick files that are contiguous in // time-range to compact. // -Compaction* UniversalCompactionPicker::PickCompaction( - const MutableCFOptions& mutable_cf_options, - Version* version, LogBuffer* log_buffer) { +Compaction* UniversalCompactionPicker::PickCompaction(Version* version, + LogBuffer* log_buffer) { int level = 0; double score = version->compaction_score_[0]; if ((version->files_[level].size() < - (unsigned int)mutable_cf_options.level0_file_num_compaction_trigger)) { + (unsigned int)options_->level0_file_num_compaction_trigger)) { LogToBuffer(log_buffer, "[%s] Universal: nothing to do\n", version->cfd_->GetName().c_str()); return nullptr; } Version::FileSummaryStorage tmp; - LogToBuffer(log_buffer, 3072, "[%s] Universal: candidate files(%zu): %s\n", + LogToBuffer(log_buffer, "[%s] Universal: candidate files(%zu): %s\n", version->cfd_->GetName().c_str(), version->files_[level].size(), version->LevelFileSummary(&tmp, 0)); // Check for size amplification first. 
Compaction* c; - if ((c = PickCompactionUniversalSizeAmp( - mutable_cf_options, version, score, log_buffer)) != nullptr) { + if ((c = PickCompactionUniversalSizeAmp(version, score, log_buffer)) != + nullptr) { LogToBuffer(log_buffer, "[%s] Universal: compacting for size amp\n", version->cfd_->GetName().c_str()); } else { // Size amplification is within limits. Try reducing read // amplification while maintaining file size ratios. - unsigned int ratio = ioptions_.compaction_options_universal.size_ratio; + unsigned int ratio = options_->compaction_options_universal.size_ratio; - if ((c = PickCompactionUniversalReadAmp( - mutable_cf_options, version, score, ratio, - UINT_MAX, log_buffer)) != nullptr) { + if ((c = PickCompactionUniversalReadAmp(version, score, ratio, UINT_MAX, + log_buffer)) != nullptr) { LogToBuffer(log_buffer, "[%s] Universal: compacting for size ratio\n", version->cfd_->GetName().c_str()); } else { @@ -549,10 +597,9 @@ Compaction* UniversalCompactionPicker::PickCompaction( // compaction without looking at filesize ratios and try to reduce // the number of files to fewer than level0_file_num_compaction_trigger. 
unsigned int num_files = version->files_[level].size() - - mutable_cf_options.level0_file_num_compaction_trigger; + options_->level0_file_num_compaction_trigger; if ((c = PickCompactionUniversalReadAmp( - mutable_cf_options, version, score, UINT_MAX, - num_files, log_buffer)) != nullptr) { + version, score, UINT_MAX, num_files, log_buffer)) != nullptr) { LogToBuffer(log_buffer, "[%s] Universal: compacting for file num\n", version->cfd_->GetName().c_str()); } @@ -578,7 +625,7 @@ Compaction* UniversalCompactionPicker::PickCompaction( c->bottommost_level_ = c->inputs_[0].files.back() == last_file; // update statistics - MeasureTime(ioptions_.statistics, + MeasureTime(options_->statistics.get(), NUM_FILES_IN_SINGLE_COMPACTION, c->inputs_[0].size()); // mark all the files that are being compacted @@ -592,12 +639,11 @@ Compaction* UniversalCompactionPicker::PickCompaction( c->is_full_compaction_ = (c->inputs_[0].size() == c->input_version_->files_[0].size()); - c->mutable_cf_options_ = mutable_cf_options; return c; } -uint32_t UniversalCompactionPicker::GetPathId( - const ImmutableCFOptions& ioptions, uint64_t file_size) { +uint32_t UniversalCompactionPicker::GetPathId(const Options& options, + uint64_t file_size) { // Two conditions need to be satisfied: // (1) the target path needs to be able to hold the file's size // (2) Total size left in this and previous paths need to be not @@ -613,11 +659,11 @@ uint32_t UniversalCompactionPicker::GetPathId( // considered in this algorithm. So the target size can be violated in // that case. We need to improve it. 
uint64_t accumulated_size = 0; - uint64_t future_size = file_size * - (100 - ioptions.compaction_options_universal.size_ratio) / 100; + uint64_t future_size = + file_size * (100 - options.compaction_options_universal.size_ratio) / 100; uint32_t p = 0; - for (; p < ioptions.db_paths.size() - 1; p++) { - uint64_t target_size = ioptions.db_paths[p].target_size; + for (; p < options.db_paths.size() - 1; p++) { + uint64_t target_size = options.db_paths[p].target_size; if (target_size > file_size && accumulated_size + (target_size - file_size) > future_size) { return p; @@ -632,15 +678,14 @@ uint32_t UniversalCompactionPicker::GetPathId( // the next file in time order. // Compaction* UniversalCompactionPicker::PickCompactionUniversalReadAmp( - const MutableCFOptions& mutable_cf_options, Version* version, - double score, unsigned int ratio, + Version* version, double score, unsigned int ratio, unsigned int max_number_of_files_to_compact, LogBuffer* log_buffer) { int level = 0; unsigned int min_merge_width = - ioptions_.compaction_options_universal.min_merge_width; + options_->compaction_options_universal.min_merge_width; unsigned int max_merge_width = - ioptions_.compaction_options_universal.max_merge_width; + options_->compaction_options_universal.max_merge_width; // The files are sorted from newest first to oldest last. const auto& files = version->files_[level]; @@ -698,16 +743,15 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalReadAmp( // default kCompactionStopStyleTotalSize; with // kCompactionStopStyleSimilarSize, it's simply the size of the last // picked file. 
- double sz = candidate_size * (100.0 + ratio) / 100.0; - if (sz < static_cast(f->fd.GetFileSize())) { + uint64_t sz = (candidate_size * (100L + ratio)) /100; + if (sz < f->fd.GetFileSize()) { break; } - if (ioptions_.compaction_options_universal.stop_style == - kCompactionStopStyleSimilarSize) { + if (options_->compaction_options_universal.stop_style == kCompactionStopStyleSimilarSize) { // Similar-size stopping rule: also check the last picked file isn't // far larger than the next candidate file. - sz = (f->fd.GetFileSize() * (100.0 + ratio)) / 100.0; - if (sz < static_cast(candidate_size)) { + sz = (f->fd.GetFileSize() * (100L + ratio)) / 100; + if (sz < candidate_size) { // If the small file we've encountered begins a run of similar-size // files, we'll pick them up on a future iteration of the outer // loop. If it's some lonely straggler, it'll eventually get picked @@ -747,7 +791,7 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalReadAmp( // size ratio of compression. 
bool enable_compression = true; int ratio_to_compress = - ioptions_.compaction_options_universal.compression_size_percent; + options_->compaction_options_universal.compression_size_percent; if (ratio_to_compress >= 0) { uint64_t total_size = version->NumLevelBytes(level); uint64_t older_file_size = 0; @@ -765,12 +809,11 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalReadAmp( for (unsigned int i = 0; i < first_index_after; i++) { estimated_total_size += files[i]->fd.GetFileSize(); } - uint32_t path_id = GetPathId(ioptions_, estimated_total_size); + uint32_t path_id = GetPathId(*options_, estimated_total_size); Compaction* c = new Compaction( - version, level, level, mutable_cf_options.MaxFileSizeForLevel(level), - LLONG_MAX, path_id, GetCompressionType(ioptions_, level, - enable_compression)); + version, level, level, MaxFileSizeForLevel(level), LLONG_MAX, path_id, + GetCompressionType(*options_, level, enable_compression)); c->score_ = score; for (unsigned int i = start_index; i < first_index_after; i++) { @@ -795,12 +838,11 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalReadAmp( // min_merge_width and max_merge_width). // Compaction* UniversalCompactionPicker::PickCompactionUniversalSizeAmp( - const MutableCFOptions& mutable_cf_options, Version* version, - double score, LogBuffer* log_buffer) { + Version* version, double score, LogBuffer* log_buffer) { int level = 0; // percentage flexibilty while reducing size amplification - uint64_t ratio = ioptions_.compaction_options_universal. + uint64_t ratio = options_->compaction_options_universal. max_size_amplification_percent; // The files are sorted from newest first to oldest last. 
@@ -875,21 +917,20 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalSizeAmp( "earliest-file-size %" PRIu64, version->cfd_->GetName().c_str(), candidate_size, earliest_file_size); } - assert(start_index < files.size() - 1); + assert(start_index >= 0 && start_index < files.size() - 1); // Estimate total file size uint64_t estimated_total_size = 0; for (unsigned int loop = start_index; loop < files.size(); loop++) { estimated_total_size += files[loop]->fd.GetFileSize(); } - uint32_t path_id = GetPathId(ioptions_, estimated_total_size); + uint32_t path_id = GetPathId(*options_, estimated_total_size); // create a compaction request // We always compact all the files, so always compress. Compaction* c = - new Compaction(version, level, level, - mutable_cf_options.MaxFileSizeForLevel(level), - LLONG_MAX, path_id, GetCompressionType(ioptions_, level)); + new Compaction(version, level, level, MaxFileSizeForLevel(level), + LLONG_MAX, path_id, GetCompressionType(*options_, level)); c->score_ = score; for (unsigned int loop = start_index; loop < files.size(); loop++) { f = c->input_version_->files_[level][loop]; @@ -904,23 +945,22 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalSizeAmp( return c; } -Compaction* FIFOCompactionPicker::PickCompaction( - const MutableCFOptions& mutable_cf_options, - Version* version, LogBuffer* log_buffer) { +Compaction* FIFOCompactionPicker::PickCompaction(Version* version, + LogBuffer* log_buffer) { assert(version->NumberLevels() == 1); uint64_t total_size = 0; for (const auto& file : version->files_[0]) { total_size += file->compensated_file_size; } - if (total_size <= ioptions_.compaction_options_fifo.max_table_files_size || + if (total_size <= options_->compaction_options_fifo.max_table_files_size || version->files_[0].size() == 0) { // total size not exceeded LogToBuffer(log_buffer, "[%s] FIFO compaction: nothing to do. 
Total size %" PRIu64 ", max size %" PRIu64 "\n", version->cfd_->GetName().c_str(), total_size, - ioptions_.compaction_options_fifo.max_table_files_size); + options_->compaction_options_fifo.max_table_files_size); return nullptr; } @@ -945,29 +985,28 @@ Compaction* FIFOCompactionPicker::PickCompaction( LogToBuffer(log_buffer, "[%s] FIFO compaction: picking file %" PRIu64 " with size %s for deletion", version->cfd_->GetName().c_str(), f->fd.GetNumber(), tmp_fsize); - if (total_size <= ioptions_.compaction_options_fifo.max_table_files_size) { + if (total_size <= options_->compaction_options_fifo.max_table_files_size) { break; } } c->MarkFilesBeingCompacted(true); compactions_in_progress_[0].insert(c); - c->mutable_cf_options_ = mutable_cf_options; + return c; } Compaction* FIFOCompactionPicker::CompactRange( - const MutableCFOptions& mutable_cf_options, Version* version, int input_level, int output_level, uint32_t output_path_id, const InternalKey* begin, const InternalKey* end, InternalKey** compaction_end) { assert(input_level == 0); assert(output_level == 0); *compaction_end = nullptr; - LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, ioptions_.info_log); - Compaction* c = PickCompaction(mutable_cf_options, version, &log_buffer); + LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, options_->info_log.get()); + Compaction* c = PickCompaction(version, &log_buffer); if (c != nullptr) { - assert(output_path_id < static_cast(ioptions_.db_paths.size())); + assert(output_path_id < static_cast(options_->db_paths.size())); c->output_path_id_ = output_path_id; } log_buffer.FlushBufferToLog(); diff --git a/db/compaction_picker.h b/db/compaction_picker.h index 138b97eb46..c1e27c4718 100644 --- a/db/compaction_picker.h +++ b/db/compaction_picker.h @@ -13,7 +13,6 @@ #include "rocksdb/status.h" #include "rocksdb/options.h" #include "rocksdb/env.h" -#include "util/mutable_cf_options.h" #include #include @@ -27,17 +26,15 @@ class Version; class CompactionPicker { public: - 
CompactionPicker(const ImmutableCFOptions& ioptions, - const InternalKeyComparator* icmp); + CompactionPicker(const Options* options, const InternalKeyComparator* icmp); virtual ~CompactionPicker(); // Pick level and inputs for a new compaction. // Returns nullptr if there is no compaction to be done. // Otherwise returns a pointer to a heap-allocated object that // describes the compaction. Caller should delete the result. - virtual Compaction* PickCompaction( - const MutableCFOptions& mutable_cf_options, - Version* version, LogBuffer* log_buffer) = 0; + virtual Compaction* PickCompaction(Version* version, + LogBuffer* log_buffer) = 0; // Return a compaction object for compacting the range [begin,end] in // the specified level. Returns nullptr if there is nothing in that @@ -50,11 +47,11 @@ class CompactionPicker { // compaction_end will be set to nullptr. // Client is responsible for compaction_end storage -- when called, // *compaction_end should point to valid InternalKey! - virtual Compaction* CompactRange( - const MutableCFOptions& mutable_cf_options, Version* version, - int input_level, int output_level, uint32_t output_path_id, - const InternalKey* begin, const InternalKey* end, - InternalKey** compaction_end); + virtual Compaction* CompactRange(Version* version, int input_level, + int output_level, uint32_t output_path_id, + const InternalKey* begin, + const InternalKey* end, + InternalKey** compaction_end); // Given the current number of levels, returns the lowest allowed level // for compaction input. @@ -67,8 +64,19 @@ class CompactionPicker { // compactions per level void SizeBeingCompacted(std::vector& sizes); + // Returns maximum total overlap bytes with grandparent + // level (i.e., level+2) before we stop building a single + // file in level->level+1 compaction. + uint64_t MaxGrandParentOverlapBytes(int level); + + // Returns maximum total bytes of data on a given level. 
+ double MaxBytesForLevel(int level); + + // Get the max file size in a given level. + uint64_t MaxFileSizeForLevel(int level) const; + protected: - int NumberLevels() const { return ioptions_.num_levels; } + int NumberLevels() const { return num_levels_; } // Stores the minimal range that covers all entries in inputs in // *smallest, *largest. @@ -95,6 +103,8 @@ class CompactionPicker { // Will return false if it is impossible to apply this compaction. bool ExpandWhileOverlapping(Compaction* c); + uint64_t ExpandedCompactionByteSizeLimit(int level); + // Returns true if any one of the specified files are being compacted bool FilesInCompaction(std::vector& files); @@ -103,27 +113,32 @@ class CompactionPicker { const InternalKey* largest, int level, int* index); - void SetupOtherInputs(const MutableCFOptions& mutable_cf_options, - Compaction* c); - - const ImmutableCFOptions& ioptions_; + void SetupOtherInputs(Compaction* c); // record all the ongoing compactions for all levels std::vector> compactions_in_progress_; + // Per-level target file size. + std::unique_ptr max_file_size_; + + // Per-level max bytes + std::unique_ptr level_max_bytes_; + + const Options* const options_; private: + int num_levels_; + const InternalKeyComparator* const icmp_; }; class UniversalCompactionPicker : public CompactionPicker { public: - UniversalCompactionPicker(const ImmutableCFOptions& ioptions, + UniversalCompactionPicker(const Options* options, const InternalKeyComparator* icmp) - : CompactionPicker(ioptions, icmp) {} - virtual Compaction* PickCompaction( - const MutableCFOptions& mutable_cf_options, - Version* version, LogBuffer* log_buffer) override; + : CompactionPicker(options, icmp) {} + virtual Compaction* PickCompaction(Version* version, + LogBuffer* log_buffer) override; // The maxinum allowed input level. Always return 0. 
virtual int MaxInputLevel(int current_num_levels) const override { @@ -132,30 +147,27 @@ class UniversalCompactionPicker : public CompactionPicker { private: // Pick Universal compaction to limit read amplification - Compaction* PickCompactionUniversalReadAmp( - const MutableCFOptions& mutable_cf_options, - Version* version, double score, unsigned int ratio, - unsigned int num_files, LogBuffer* log_buffer); + Compaction* PickCompactionUniversalReadAmp(Version* version, double score, + unsigned int ratio, + unsigned int num_files, + LogBuffer* log_buffer); // Pick Universal compaction to limit space amplification. - Compaction* PickCompactionUniversalSizeAmp( - const MutableCFOptions& mutable_cf_options, - Version* version, double score, LogBuffer* log_buffer); + Compaction* PickCompactionUniversalSizeAmp(Version* version, double score, + LogBuffer* log_buffer); // Pick a path ID to place a newly generated file, with its estimated file // size. - static uint32_t GetPathId(const ImmutableCFOptions& ioptions, - uint64_t file_size); + static uint32_t GetPathId(const Options& options, uint64_t file_size); }; class LevelCompactionPicker : public CompactionPicker { public: - LevelCompactionPicker(const ImmutableCFOptions& ioptions, + LevelCompactionPicker(const Options* options, const InternalKeyComparator* icmp) - : CompactionPicker(ioptions, icmp) {} - virtual Compaction* PickCompaction( - const MutableCFOptions& mutable_cf_options, - Version* version, LogBuffer* log_buffer) override; + : CompactionPicker(options, icmp) {} + virtual Compaction* PickCompaction(Version* version, + LogBuffer* log_buffer) override; // Returns current_num_levels - 2, meaning the last level cannot be // compaction input level. @@ -168,25 +180,23 @@ class LevelCompactionPicker : public CompactionPicker { // Returns nullptr if there is no compaction to be done. // If level is 0 and there is already a compaction on that level, this // function will return nullptr. 
- Compaction* PickCompactionBySize(const MutableCFOptions& mutable_cf_options, - Version* version, int level, double score); + Compaction* PickCompactionBySize(Version* version, int level, double score); }; class FIFOCompactionPicker : public CompactionPicker { public: - FIFOCompactionPicker(const ImmutableCFOptions& ioptions, + FIFOCompactionPicker(const Options* options, const InternalKeyComparator* icmp) - : CompactionPicker(ioptions, icmp) {} + : CompactionPicker(options, icmp) {} - virtual Compaction* PickCompaction( - const MutableCFOptions& mutable_cf_options, - Version* version, LogBuffer* log_buffer) override; + virtual Compaction* PickCompaction(Version* version, + LogBuffer* log_buffer) override; - virtual Compaction* CompactRange( - const MutableCFOptions& mutable_cf_options, Version* version, - int input_level, int output_level, uint32_t output_path_id, - const InternalKey* begin, const InternalKey* end, - InternalKey** compaction_end) override; + virtual Compaction* CompactRange(Version* version, int input_level, + int output_level, uint32_t output_path_id, + const InternalKey* begin, + const InternalKey* end, + InternalKey** compaction_end) override; // The maxinum allowed input level. Always return 0. 
virtual int MaxInputLevel(int current_num_levels) const override { diff --git a/db/corruption_test.cc b/db/corruption_test.cc index 4fcea0d5a8..7a1a5221b0 100644 --- a/db/corruption_test.cc +++ b/db/corruption_test.cc @@ -131,7 +131,7 @@ class CorruptionTest { ASSERT_GE(max_expected, correct); } - void CorruptFile(const std::string& fname, int offset, int bytes_to_corrupt) { + void CorruptFile(const std::string fname, int offset, int bytes_to_corrupt) { struct stat sbuf; if (stat(fname.c_str(), &sbuf) != 0) { const char* msg = strerror(errno); @@ -332,9 +332,6 @@ TEST(CorruptionTest, CorruptedDescriptor) { } TEST(CorruptionTest, CompactionInputError) { - Options options; - options.max_background_flushes = 0; - Reopen(&options); Build(10); DBImpl* dbi = reinterpret_cast(db_); dbi->TEST_FlushMemTable(); @@ -354,7 +351,6 @@ TEST(CorruptionTest, CompactionInputErrorParanoid) { options.paranoid_checks = true; options.write_buffer_size = 131072; options.max_write_buffer_number = 2; - options.max_background_flushes = 0; Reopen(&options); DBImpl* dbi = reinterpret_cast(db_); diff --git a/db/cuckoo_table_db_test.cc b/db/cuckoo_table_db_test.cc index 4beee59e4e..c1e59b1b56 100644 --- a/db/cuckoo_table_db_test.cc +++ b/db/cuckoo_table_db_test.cc @@ -218,7 +218,6 @@ TEST(CuckooTableDBTest, Uint64Comparator) { // Add more keys. ASSERT_OK(Delete(Uint64Key(2))); // Delete. - dbfull()->TEST_FlushMemTable(); ASSERT_OK(Put(Uint64Key(3), "v0")); // Update. 
ASSERT_OK(Put(Uint64Key(4), "v4")); dbfull()->TEST_FlushMemTable(); @@ -246,38 +245,14 @@ TEST(CuckooTableDBTest, CompactionTrigger) { ASSERT_OK(Put(Key(idx), std::string(10000, 'a' + idx))); } dbfull()->TEST_WaitForFlushMemTable(); - ASSERT_EQ("2", FilesPerLevel()); - dbfull()->TEST_CompactRange(0, nullptr, nullptr); + ASSERT_EQ("0,2", FilesPerLevel()); for (int idx = 0; idx < 22; ++idx) { ASSERT_EQ(std::string(10000, 'a' + idx), Get(Key(idx))); } } -TEST(CuckooTableDBTest, CompactionIntoMultipleFiles) { - // Create a big L0 file and check it compacts into multiple files in L1. - Options options = CurrentOptions(); - options.write_buffer_size = 270 << 10; - // Two SST files should be created, each containing 14 keys. - // Number of buckets will be 16. Total size ~156 KB. - options.target_file_size_base = 160 << 10; - Reopen(&options); - - // Write 28 values, each 10016 B ~ 10KB - for (int idx = 0; idx < 28; ++idx) { - ASSERT_OK(Put(Key(idx), std::string(10000, 'a' + idx))); - } - dbfull()->TEST_WaitForFlushMemTable(); - ASSERT_EQ("1", FilesPerLevel()); - - dbfull()->TEST_CompactRange(0, nullptr, nullptr); - ASSERT_EQ("0,2", FilesPerLevel()); - for (int idx = 0; idx < 28; ++idx) { - ASSERT_EQ(std::string(10000, 'a' + idx), Get(Key(idx))); - } -} - TEST(CuckooTableDBTest, SameKeyInsertedInTwoDifferentFilesAndCompacted) { // Insert same key twice so that they go to different SST files. Then wait for // compaction and check if the latest value is stored and old value removed. diff --git a/db/db_bench.cc b/db/db_bench.cc index bbd807c2cd..2f88e81ffa 100644 --- a/db/db_bench.cc +++ b/db/db_bench.cc @@ -7,9 +7,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
-#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS -#endif #ifndef GFLAGS #include @@ -39,8 +37,8 @@ int main() { #include "rocksdb/memtablerep.h" #include "rocksdb/write_batch.h" #include "rocksdb/slice.h" -#include "rocksdb/filter_policy.h" #include "rocksdb/slice_transform.h" +#include "rocksdb/statistics.h" #include "rocksdb/perf_context.h" #include "port/port.h" #include "port/stack_trace.h" @@ -167,8 +165,6 @@ DEFINE_int32(value_size, 100, "Size of each value"); DEFINE_bool(use_uint64_comparator, false, "use Uint64 user comparator"); -DEFINE_int64(batch_size, 1, "Batch size"); - static bool ValidateKeySize(const char* flagname, int32_t value) { return true; } @@ -309,7 +305,7 @@ DEFINE_string(wal_dir, "", "If not empty, use the given dir for WAL"); DEFINE_int32(num_levels, 7, "The total number of levels"); -DEFINE_int64(target_file_size_base, 2 * 1048576, "Target file size at level-1"); +DEFINE_int32(target_file_size_base, 2 * 1048576, "Target file size at level-1"); DEFINE_int32(target_file_size_multiplier, 1, "A multiplier to compute target level-N file size (N >= 2)"); @@ -516,9 +512,6 @@ DEFINE_int64(keys_per_prefix, 0, "control average number of keys generated " "i.e. use the prefix comes with the generated random number."); DEFINE_bool(enable_io_prio, false, "Lower the background flush/compaction " "threads' IO priority"); -DEFINE_bool(identity_as_first_hash, false, "the first hash function of cuckoo " - "table becomes an identity function. This is only valid when key " - "is 8 bytes"); enum RepFactory { kSkipList, @@ -558,9 +551,7 @@ DEFINE_double(cuckoo_hash_ratio, 0.9, "Hash ratio for Cuckoo SST table."); DEFINE_bool(use_hash_search, false, "if use kHashSearch " "instead of kBinarySearch. " "This is valid if only we use BlockTable"); -DEFINE_bool(use_block_based_filter, false, "if use kBlockBasedFilter " - "instead of kFullFilter for filter block. 
" - "This is valid if only we use BlockTable"); + DEFINE_string(merge_operator, "", "The merge operator to use with the database." "If a new merge operator is specified, be sure to use fresh" " database The possible merge operators are defined in" @@ -636,14 +627,6 @@ static void AppendWithSpace(std::string* str, Slice msg) { str->append(msg.data(), msg.size()); } -struct DBWithColumnFamilies { - std::vector cfh; - DB* db; - DBWithColumnFamilies() : db(nullptr) { - cfh.clear(); - } -}; - class Stats { private: int id_; @@ -707,7 +690,7 @@ class Stats { void SetId(int id) { id_ = id; } void SetExcludeFromMerge() { exclude_from_merge_ = true; } - void FinishedOps(DBWithColumnFamilies* db_with_cfh, DB* db, int64_t num_ops) { + void FinishedOps(DB* db, int64_t num_ops) { if (FLAGS_histogram) { double now = FLAGS_env->NowMicros(); double micros = now - last_op_finish_; @@ -747,17 +730,8 @@ class Stats { if (FLAGS_stats_per_interval) { std::string stats; - - if (db_with_cfh && db_with_cfh->cfh.size()) { - for (size_t i = 0; i < db_with_cfh->cfh.size(); ++i) { - if (db->GetProperty(db_with_cfh->cfh[i], "rocksdb.cfstats", - &stats)) - fprintf(stderr, "%s\n", stats.c_str()); - } - - } else if (db && db->GetProperty("rocksdb.stats", &stats)) { + if (db && db->GetProperty("rocksdb.stats", &stats)) fprintf(stderr, "%s\n", stats.c_str()); - } } fflush(stderr); @@ -876,6 +850,13 @@ class Benchmark { std::shared_ptr compressed_cache_; std::shared_ptr filter_policy_; const SliceTransform* prefix_extractor_; + struct DBWithColumnFamilies { + std::vector cfh; + DB* db; + DBWithColumnFamilies() : db(nullptr) { + cfh.clear(); + } + }; DBWithColumnFamilies db_; std::vector multi_dbs_; int64_t num_; @@ -1093,9 +1074,9 @@ class Benchmark { (FLAGS_cache_numshardbits >= 1 ? NewLRUCache(FLAGS_compressed_cache_size, FLAGS_cache_numshardbits) : NewLRUCache(FLAGS_compressed_cache_size)) : nullptr), - filter_policy_(FLAGS_bloom_bits >= 0 ? 
- NewBloomFilterPolicy(FLAGS_bloom_bits, FLAGS_use_block_based_filter) - : nullptr), + filter_policy_(FLAGS_bloom_bits >= 0 + ? NewBloomFilterPolicy(FLAGS_bloom_bits) + : nullptr), prefix_extractor_(NewFixedPrefixTransform(FLAGS_prefix_size)), num_(FLAGS_num), value_size_(FLAGS_value_size), @@ -1127,8 +1108,6 @@ class Benchmark { } ~Benchmark() { - std::for_each(db_.cfh.begin(), db_.cfh.end(), - [](ColumnFamilyHandle* cfh) { delete cfh; }); delete db_.db; delete prefix_extractor_; } @@ -1274,12 +1253,7 @@ class Benchmark { method = &Benchmark::ReadReverse; } else if (name == Slice("readrandom")) { method = &Benchmark::ReadRandom; - } else if (name == Slice("readrandomfast")) { - method = &Benchmark::ReadRandomFast; } else if (name == Slice("multireadrandom")) { - entries_per_batch_ = FLAGS_batch_size; - fprintf(stderr, "entries_per_batch = %" PRIi64 "\n", - entries_per_batch_); method = &Benchmark::MultiReadRandom; } else if (name == Slice("readmissing")) { ++key_size_; @@ -1358,8 +1332,6 @@ class Benchmark { method = nullptr; } else { if (db_.db != nullptr) { - std::for_each(db_.cfh.begin(), db_.cfh.end(), - [](ColumnFamilyHandle* cfh) { delete cfh; }); delete db_.db; db_.db = nullptr; db_.cfh.clear(); @@ -1491,7 +1463,7 @@ class Benchmark { uint32_t crc = 0; while (bytes < 500 * 1048576) { crc = crc32c::Value(data.data(), size); - thread->stats.FinishedOps(nullptr, nullptr, 1); + thread->stats.FinishedOps(nullptr, 1); bytes += size; } // Print so result is not dead @@ -1510,7 +1482,7 @@ class Benchmark { unsigned int xxh32 = 0; while (bytes < 500 * 1048576) { xxh32 = XXH32(data.data(), size, 0); - thread->stats.FinishedOps(nullptr, nullptr, 1); + thread->stats.FinishedOps(nullptr, 1); bytes += size; } // Print so result is not dead @@ -1531,7 +1503,7 @@ class Benchmark { ptr = ap.Acquire_Load(); } count++; - thread->stats.FinishedOps(nullptr, nullptr, 1); + thread->stats.FinishedOps(nullptr, 1); } if (ptr == nullptr) exit(1); // Disable unused variable warning. 
} @@ -1572,7 +1544,7 @@ class Benchmark { } produced += compressed.size(); bytes += input.size(); - thread->stats.FinishedOps(nullptr, nullptr, 1); + thread->stats.FinishedOps(nullptr, 1); } if (!ok) { @@ -1653,7 +1625,7 @@ class Benchmark { } delete[] uncompressed; bytes += input.size(); - thread->stats.FinishedOps(nullptr, nullptr, 1); + thread->stats.FinishedOps(nullptr, 1); } if (!ok) { @@ -1759,11 +1731,8 @@ class Benchmark { fprintf(stderr, "Invalid cuckoo_hash_ratio\n"); exit(1); } - rocksdb::CuckooTableOptions table_options; - table_options.hash_table_ratio = FLAGS_cuckoo_hash_ratio; - table_options.identity_as_first_hash = FLAGS_identity_as_first_hash; options.table_factory = std::shared_ptr( - NewCuckooTableFactory(table_options)); + NewCuckooTableFactory(FLAGS_cuckoo_hash_ratio)); } else { BlockBasedTableOptions block_based_options; if (FLAGS_use_hash_search) { @@ -2033,8 +2002,7 @@ class Benchmark { bytes += value_size_ + key_size_; } s = db_with_cfh->db->Write(write_options_, &batch); - thread->stats.FinishedOps(db_with_cfh, db_with_cfh->db, - entries_per_batch_); + thread->stats.FinishedOps(db_with_cfh->db, entries_per_batch_); if (!s.ok()) { fprintf(stderr, "put error: %s\n", s.ToString().c_str()); exit(1); @@ -2059,7 +2027,7 @@ class Benchmark { int64_t bytes = 0; for (iter->SeekToFirst(); i < reads_ && iter->Valid(); iter->Next()) { bytes += iter->key().size() + iter->value().size(); - thread->stats.FinishedOps(nullptr, db, 1); + thread->stats.FinishedOps(db, 1); ++i; } delete iter; @@ -2082,56 +2050,13 @@ class Benchmark { int64_t bytes = 0; for (iter->SeekToLast(); i < reads_ && iter->Valid(); iter->Prev()) { bytes += iter->key().size() + iter->value().size(); - thread->stats.FinishedOps(nullptr, db, 1); + thread->stats.FinishedOps(db, 1); ++i; } delete iter; thread->stats.AddBytes(bytes); } - void ReadRandomFast(ThreadState* thread) { - int64_t read = 0; - int64_t found = 0; - int64_t nonexist = 0; - ReadOptions options(FLAGS_verify_checksum, 
true); - Slice key = AllocateKey(); - std::unique_ptr key_guard(key.data()); - std::string value; - DB* db = SelectDBWithCfh(thread)->db; - - int64_t pot = 1; - while (pot < FLAGS_num) { - pot <<= 1; - } - - Duration duration(FLAGS_duration, reads_); - do { - for (int i = 0; i < 100; ++i) { - int64_t key_rand = thread->rand.Next() & (pot - 1); - GenerateKeyFromInt(key_rand, FLAGS_num, &key); - ++read; - if (db->Get(options, key, &value).ok()) { - ++found; - } - if (key_rand >= FLAGS_num) { - ++nonexist; - } - } - thread->stats.FinishedOps(nullptr, db, 100); - } while (!duration.Done(100)); - - char msg[100]; - snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found, " - "issued %" PRIu64 " non-exist keys)\n", - found, read, nonexist); - - thread->stats.AddMessage(msg); - - if (FLAGS_perf_level > 0) { - thread->stats.AddMessage(perf_context.ToString()); - } - } - void ReadRandom(ThreadState* thread) { int64_t read = 0; int64_t found = 0; @@ -2159,7 +2084,7 @@ class Benchmark { if (s.ok()) { found++; } - thread->stats.FinishedOps(db_with_cfh, db_with_cfh->db, 1); + thread->stats.FinishedOps(db_with_cfh->db, 1); } char msg[100]; @@ -2201,7 +2126,7 @@ class Benchmark { ++found; } } - thread->stats.FinishedOps(nullptr, db, entries_per_batch_); + thread->stats.FinishedOps(db, entries_per_batch_); } for (auto& k : keys) { delete k.data(); @@ -2220,7 +2145,7 @@ class Benchmark { DB* db = SelectDB(thread); Iterator* iter = db->NewIterator(options); delete iter; - thread->stats.FinishedOps(nullptr, db, 1); + thread->stats.FinishedOps(db, 1); } } @@ -2284,7 +2209,7 @@ class Benchmark { if (iter_to_use->Valid() && iter_to_use->key().compare(key) == 0) { found++; } - thread->stats.FinishedOps(&db_, db_.db, 1); + thread->stats.FinishedOps(db_.db, 1); } delete single_iter; for (auto iter : multi_iters) { @@ -2324,7 +2249,7 @@ class Benchmark { batch.Delete(key); } auto s = db->Write(write_options_, &batch); - thread->stats.FinishedOps(nullptr, db, entries_per_batch_); + 
thread->stats.FinishedOps(db, entries_per_batch_); if (!s.ok()) { fprintf(stderr, "del error: %s\n", s.ToString().c_str()); exit(1); @@ -2384,7 +2309,7 @@ class Benchmark { fprintf(stderr, "put error: %s\n", s.ToString().c_str()); exit(1); } - thread->stats.FinishedOps(&db_, db_.db, 1); + thread->stats.FinishedOps(db_.db, 1); ++num_writes; if (writes_per_second_by_10 && num_writes >= writes_per_second_by_10) { @@ -2544,7 +2469,7 @@ class Benchmark { deletes_done++; } - thread->stats.FinishedOps(&db_, db_.db, 1); + thread->stats.FinishedOps(db_.db, 1); } char msg[100]; snprintf(msg, sizeof(msg), @@ -2602,7 +2527,7 @@ class Benchmark { put_weight--; writes_done++; } - thread->stats.FinishedOps(nullptr, db, 1); + thread->stats.FinishedOps(db, 1); } char msg[100]; snprintf(msg, sizeof(msg), "( reads:%" PRIu64 " writes:%" PRIu64 \ @@ -2636,7 +2561,7 @@ class Benchmark { fprintf(stderr, "put error: %s\n", s.ToString().c_str()); exit(1); } - thread->stats.FinishedOps(nullptr, db, 1); + thread->stats.FinishedOps(db, 1); } char msg[100]; snprintf(msg, sizeof(msg), @@ -2683,7 +2608,7 @@ class Benchmark { fprintf(stderr, "put error: %s\n", s.ToString().c_str()); exit(1); } - thread->stats.FinishedOps(nullptr, db, 1); + thread->stats.FinishedOps(db, 1); } char msg[100]; @@ -2719,7 +2644,7 @@ class Benchmark { fprintf(stderr, "merge error: %s\n", s.ToString().c_str()); exit(1); } - thread->stats.FinishedOps(nullptr, db, 1); + thread->stats.FinishedOps(db, 1); } // Print some statistics @@ -2780,7 +2705,7 @@ class Benchmark { } - thread->stats.FinishedOps(nullptr, db, 1); + thread->stats.FinishedOps(db, 1); } char msg[100]; diff --git a/db/db_filesnapshot.cc b/db/db_filesnapshot.cc index 9f05b8d307..4185a40cab 100644 --- a/db/db_filesnapshot.cc +++ b/db/db_filesnapshot.cc @@ -9,10 +9,7 @@ #ifndef ROCKSDB_LITE -#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS -#endif - #include #include #include @@ -32,9 +29,9 @@ Status DBImpl::DisableFileDeletions() { MutexLock 
l(&mutex_); ++disable_delete_obsolete_files_; if (disable_delete_obsolete_files_ == 1) { - Log(db_options_.info_log, "File Deletions Disabled"); + Log(options_.info_log, "File Deletions Disabled"); } else { - Log(db_options_.info_log, + Log(options_.info_log, "File Deletions Disabled, but already disabled. Counter: %d", disable_delete_obsolete_files_); } @@ -53,11 +50,11 @@ Status DBImpl::EnableFileDeletions(bool force) { --disable_delete_obsolete_files_; } if (disable_delete_obsolete_files_ == 0) { - Log(db_options_.info_log, "File Deletions Enabled"); + Log(options_.info_log, "File Deletions Enabled"); should_purge_files = true; FindObsoleteFiles(deletion_state, true); } else { - Log(db_options_.info_log, + Log(options_.info_log, "File Deletions Enable, but not really enabled. Counter: %d", disable_delete_obsolete_files_); } @@ -65,7 +62,7 @@ Status DBImpl::EnableFileDeletions(bool force) { if (should_purge_files) { PurgeObsoleteFiles(deletion_state); } - LogFlush(db_options_.info_log); + LogFlush(options_.info_log); return Status::OK(); } @@ -98,7 +95,7 @@ Status DBImpl::GetLiveFiles(std::vector& ret, if (!status.ok()) { mutex_.Unlock(); - Log(db_options_.info_log, "Cannot Flush data %s\n", + Log(options_.info_log, "Cannot Flush data %s\n", status.ToString().c_str()); return status; } @@ -136,7 +133,7 @@ Status DBImpl::GetSortedWalFiles(VectorLogPtr& files) { Status s; // list wal files in main db dir. VectorLogPtr logs; - s = GetSortedWalsOfType(db_options_.wal_dir, logs, kAliveLogFile); + s = GetSortedWalsOfType(options_.wal_dir, logs, kAliveLogFile); if (!s.ok()) { return s; } @@ -149,7 +146,7 @@ Status DBImpl::GetSortedWalFiles(VectorLogPtr& files) { files.clear(); // list wal files in archive dir. 
- std::string archivedir = ArchivalDirectory(db_options_.wal_dir); + std::string archivedir = ArchivalDirectory(options_.wal_dir); if (env_->FileExists(archivedir)) { s = GetSortedWalsOfType(archivedir, files, kArchivedLogFile); if (!s.ok()) { @@ -160,7 +157,7 @@ Status DBImpl::GetSortedWalFiles(VectorLogPtr& files) { uint64_t latest_archived_log_number = 0; if (!files.empty()) { latest_archived_log_number = files.back()->LogNumber(); - Log(db_options_.info_log, "Latest Archived log: %" PRIu64, + Log(options_.info_log, "Latest Archived log: %" PRIu64, latest_archived_log_number); } @@ -173,7 +170,7 @@ Status DBImpl::GetSortedWalFiles(VectorLogPtr& files) { // same log in both db dir and archived dir. Simply // ignore the one in db dir. Note that, if we read // archived dir first, we would have missed the log file. - Log(db_options_.info_log, "%s already moved to archive", + Log(options_.info_log, "%s already moved to archive", log->PathName().c_str()); } } diff --git a/db/db_impl.cc b/db/db_impl.cc index 680a22cb3b..54faef2630 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -9,10 +9,7 @@ #include "db/db_impl.h" -#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS -#endif - #include #include #include @@ -77,6 +74,20 @@ const std::string kDefaultColumnFamilyName("default"); void DumpLeveldbBuildVersion(Logger * log); +// Information kept for every waiting writer +struct DBImpl::Writer { + Status status; + WriteBatch* batch; + bool sync; + bool disableWAL; + bool in_batch_group; + bool done; + uint64_t timeout_hint_us; + port::CondVar cv; + + explicit Writer(port::Mutex* mu) : cv(mu) { } +}; + struct DBImpl::WriteContext { autovector superversions_to_free_; autovector logs_to_free_; @@ -294,24 +305,24 @@ Status SanitizeDBOptionsByCFOptions( return Status::OK(); } -CompressionType GetCompressionFlush(const ImmutableCFOptions& ioptions) { +CompressionType GetCompressionFlush(const Options& options) { // Compressing memtable flushes might not help unless the 
sequential load // optimization is used for leveled compaction. Otherwise the CPU and // latency overhead is not offset by saving much space. bool can_compress; - if (ioptions.compaction_style == kCompactionStyleUniversal) { + if (options.compaction_style == kCompactionStyleUniversal) { can_compress = - (ioptions.compaction_options_universal.compression_size_percent < 0); + (options.compaction_options_universal.compression_size_percent < 0); } else { // For leveled compress when min_level_to_compress == 0. - can_compress = ioptions.compression_per_level.empty() || - ioptions.compression_per_level[0] != kNoCompression; + can_compress = options.compression_per_level.empty() || + options.compression_per_level[0] != kNoCompression; } if (can_compress) { - return ioptions.compression; + return options.compression; } else { return kNoCompression; } @@ -321,8 +332,8 @@ CompressionType GetCompressionFlush(const ImmutableCFOptions& ioptions) { DBImpl::DBImpl(const DBOptions& options, const std::string& dbname) : env_(options.env), dbname_(dbname), - db_options_(SanitizeOptions(dbname, options)), - stats_(db_options_.statistics.get()), + options_(SanitizeOptions(dbname, options)), + stats_(options_.statistics.get()), db_lock_(nullptr), mutex_(options.use_adaptive_mutex), shutting_down_(nullptr), @@ -344,7 +355,8 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname) last_stats_dump_time_microsec_(0), default_interval_to_delete_obsolete_WAL_(600), flush_on_destroy_(false), - env_options_(options), + delayed_writes_(0), + storage_options_(options), bg_work_gate_closed_(false), refitting_level_(false), opened_successfully_(false) { @@ -352,30 +364,30 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname) // Reserve ten files or so for other uses and give the rest to TableCache. // Give a large number for setting of "infinite" open files. - const int table_cache_size = (db_options_.max_open_files == -1) ? 
- 4194304 : db_options_.max_open_files - 10; + const int table_cache_size = + (options_.max_open_files == -1) ? 4194304 : options_.max_open_files - 10; // Reserve ten files or so for other uses and give the rest to TableCache. table_cache_ = - NewLRUCache(table_cache_size, db_options_.table_cache_numshardbits, - db_options_.table_cache_remove_scan_count_limit); + NewLRUCache(table_cache_size, options_.table_cache_numshardbits, + options_.table_cache_remove_scan_count_limit); - versions_.reset(new VersionSet(dbname_, &db_options_, env_options_, - table_cache_.get(), &write_controller_)); - column_family_memtables_.reset(new ColumnFamilyMemTablesImpl( - versions_->GetColumnFamilySet(), &flush_scheduler_)); + versions_.reset( + new VersionSet(dbname_, &options_, storage_options_, table_cache_.get())); + column_family_memtables_.reset( + new ColumnFamilyMemTablesImpl(versions_->GetColumnFamilySet())); - DumpLeveldbBuildVersion(db_options_.info_log.get()); - DumpDBFileSummary(db_options_, dbname_); - db_options_.Dump(db_options_.info_log.get()); + DumpLeveldbBuildVersion(options_.info_log.get()); + DumpDBFileSummary(options_, dbname_); + options_.Dump(options_.info_log.get()); - LogFlush(db_options_.info_log); + LogFlush(options_.info_log); } DBImpl::~DBImpl() { mutex_.Lock(); if (flush_on_destroy_) { for (auto cfd : *versions_->GetColumnFamilySet()) { - if (!cfd->mem()->IsEmpty()) { + if (cfd->mem()->GetFirstSequenceNumber() != 0) { cfd->Ref(); mutex_.Unlock(); FlushMemTable(cfd, FlushOptions()); @@ -392,8 +404,6 @@ DBImpl::~DBImpl() { bg_cv_.Wait(); } - flush_scheduler_.Clear(); - if (default_cf_handle_ != nullptr) { // we need to delete handle outside of lock because it does its own locking mutex_.Unlock(); @@ -401,22 +411,24 @@ DBImpl::~DBImpl() { mutex_.Lock(); } - // Clean up obsolete files due to SuperVersion release. 
- // (1) Need to delete to obsolete files before closing because RepairDB() - // scans all existing files in the file system and builds manifest file. - // Keeping obsolete files confuses the repair process. - // (2) Need to check if we Open()/Recover() the DB successfully before - // deleting because if VersionSet recover fails (may be due to corrupted - // manifest file), it is not able to identify live files correctly. As a - // result, all "live" files can get deleted by accident. However, corrupted - // manifest is recoverable by RepairDB(). - if (opened_successfully_) { - DeletionState deletion_state; - FindObsoleteFiles(deletion_state, true); - // manifest number starting from 2 - deletion_state.manifest_file_number = 1; - if (deletion_state.HaveSomethingToDelete()) { - PurgeObsoleteFiles(deletion_state); + if (options_.allow_thread_local) { + // Clean up obsolete files due to SuperVersion release. + // (1) Need to delete to obsolete files before closing because RepairDB() + // scans all existing files in the file system and builds manifest file. + // Keeping obsolete files confuses the repair process. + // (2) Need to check if we Open()/Recover() the DB successfully before + // deleting because if VersionSet recover fails (may be due to corrupted + // manifest file), it is not able to identify live files correctly. As a + // result, all "live" files can get deleted by accident. However, corrupted + // manifest is recoverable by RepairDB(). 
+ if (opened_successfully_) { + DeletionState deletion_state; + FindObsoleteFiles(deletion_state, true); + // manifest number starting from 2 + deletion_state.manifest_file_number = 1; + if (deletion_state.HaveSomethingToDelete()) { + PurgeObsoleteFiles(deletion_state); + } } } @@ -428,7 +440,7 @@ DBImpl::~DBImpl() { env_->UnlockFile(db_lock_); } - LogFlush(db_options_.info_log); + LogFlush(options_.info_log); } Status DBImpl::NewDB() { @@ -437,15 +449,15 @@ Status DBImpl::NewDB() { new_db.SetNextFile(2); new_db.SetLastSequence(0); - Log(db_options_.info_log, "Creating manifest 1 \n"); + Log(options_.info_log, "Creating manifest 1 \n"); const std::string manifest = DescriptorFileName(dbname_, 1); unique_ptr file; Status s = env_->NewWritableFile( - manifest, &file, env_->OptimizeForManifestWrite(env_options_)); + manifest, &file, env_->OptimizeForManifestWrite(storage_options_)); if (!s.ok()) { return s; } - file->SetPreallocationBlockSize(db_options_.manifest_preallocation_size); + file->SetPreallocationBlockSize(options_.manifest_preallocation_size); { log::Writer log(std::move(file)); std::string record; @@ -462,38 +474,38 @@ Status DBImpl::NewDB() { } void DBImpl::MaybeIgnoreError(Status* s) const { - if (s->ok() || db_options_.paranoid_checks) { + if (s->ok() || options_.paranoid_checks) { // No change needed } else { - Log(db_options_.info_log, "Ignoring error %s", s->ToString().c_str()); + Log(options_.info_log, "Ignoring error %s", s->ToString().c_str()); *s = Status::OK(); } } const Status DBImpl::CreateArchivalDirectory() { - if (db_options_.WAL_ttl_seconds > 0 || db_options_.WAL_size_limit_MB > 0) { - std::string archivalPath = ArchivalDirectory(db_options_.wal_dir); + if (options_.WAL_ttl_seconds > 0 || options_.WAL_size_limit_MB > 0) { + std::string archivalPath = ArchivalDirectory(options_.wal_dir); return env_->CreateDirIfMissing(archivalPath); } return Status::OK(); } void DBImpl::PrintStatistics() { - auto dbstats = db_options_.statistics.get(); + 
auto dbstats = options_.statistics.get(); if (dbstats) { - Log(db_options_.info_log, + Log(options_.info_log, "STATISTCS:\n %s", dbstats->ToString().c_str()); } } void DBImpl::MaybeDumpStats() { - if (db_options_.stats_dump_period_sec == 0) return; + if (options_.stats_dump_period_sec == 0) return; const uint64_t now_micros = env_->NowMicros(); if (last_stats_dump_time_microsec_ + - db_options_.stats_dump_period_sec * 1000000 + options_.stats_dump_period_sec * 1000000 <= now_micros) { // Multiple threads could race in here simultaneously. // However, the last one will update last_stats_dump_time_microsec_ @@ -517,8 +529,8 @@ void DBImpl::MaybeDumpStats() { default_cf_internal_stats_->GetStringProperty(db_property_type, "rocksdb.dbstats", &stats); } - Log(db_options_.info_log, "------- DUMPING STATS -------"); - Log(db_options_.info_log, "%s", stats.c_str()); + Log(options_.info_log, "------- DUMPING STATS -------"); + Log(options_.info_log, "%s", stats.c_str()); PrintStatistics(); } @@ -528,7 +540,7 @@ void DBImpl::MaybeDumpStats() { // of all files in the filesystem in 'candidate_files'. 
// no_full_scan = true -- never do the full scan using GetChildren() // force = false -- don't force the full scan, except every -// db_options_.delete_obsolete_files_period_micros +// options_.delete_obsolete_files_period_micros // force = true -- force the full scan void DBImpl::FindObsoleteFiles(DeletionState& deletion_state, bool force, @@ -545,12 +557,12 @@ void DBImpl::FindObsoleteFiles(DeletionState& deletion_state, // logic for figurint out if we're doing the full scan if (no_full_scan) { doing_the_full_scan = false; - } else if (force || db_options_.delete_obsolete_files_period_micros == 0) { + } else if (force || options_.delete_obsolete_files_period_micros == 0) { doing_the_full_scan = true; } else { const uint64_t now_micros = env_->NowMicros(); if (delete_obsolete_files_last_run_ + - db_options_.delete_obsolete_files_period_micros < now_micros) { + options_.delete_obsolete_files_period_micros < now_micros) { doing_the_full_scan = true; delete_obsolete_files_last_run_ = now_micros; } @@ -582,32 +594,29 @@ void DBImpl::FindObsoleteFiles(DeletionState& deletion_state, versions_->AddLiveFiles(&deletion_state.sst_live); if (doing_the_full_scan) { - for (uint32_t path_id = 0; - path_id < db_options_.db_paths.size(); path_id++) { + for (uint32_t path_id = 0; path_id < options_.db_paths.size(); path_id++) { // set of all files in the directory. We'll exclude files that are still // alive in the subsequent processings. 
std::vector files; - env_->GetChildren(db_options_.db_paths[path_id].path, + env_->GetChildren(options_.db_paths[path_id].path, &files); // Ignore errors for (std::string file : files) { - // TODO(icanadi) clean up this mess to avoid having one-off "/" prefixes - deletion_state.candidate_files.emplace_back("/" + file, path_id); + deletion_state.candidate_files.emplace_back(file, path_id); } } //Add log files in wal_dir - if (db_options_.wal_dir != dbname_) { + if (options_.wal_dir != dbname_) { std::vector log_files; - env_->GetChildren(db_options_.wal_dir, &log_files); // Ignore errors + env_->GetChildren(options_.wal_dir, &log_files); // Ignore errors for (std::string log_file : log_files) { deletion_state.candidate_files.emplace_back(log_file, 0); } } // Add info log files in db_log_dir - if (!db_options_.db_log_dir.empty() && db_options_.db_log_dir != dbname_) { + if (!options_.db_log_dir.empty() && options_.db_log_dir != dbname_) { std::vector info_log_files; - // Ignore errors - env_->GetChildren(db_options_.db_log_dir, &info_log_files); + env_->GetChildren(options_.db_log_dir, &info_log_files); // Ignore errors for (std::string log_file : info_log_files) { deletion_state.candidate_files.emplace_back(log_file, 0); } @@ -678,7 +687,7 @@ void DBImpl::PurgeObsoleteFiles(DeletionState& state) { candidate_files.end()); std::vector old_info_log_files; - InfoLogPrefix info_log_prefix(!db_options_.db_log_dir.empty(), dbname_); + InfoLogPrefix info_log_prefix(!options_.db_log_dir.empty(), dbname_); for (const auto& candidate_file : candidate_files) { std::string to_delete = candidate_file.file_name; uint32_t path_id = candidate_file.path_id; @@ -734,51 +743,51 @@ void DBImpl::PurgeObsoleteFiles(DeletionState& state) { if (type == kTableFile) { // evict from cache TableCache::Evict(table_cache_.get(), number); - fname = TableFileName(db_options_.db_paths, number, path_id); + fname = TableFileName(options_.db_paths, number, path_id); } else { - fname = ((type == 
kLogFile) ? - db_options_.wal_dir : dbname_) + "/" + to_delete; + fname = + ((type == kLogFile) ? options_.wal_dir : dbname_) + "/" + to_delete; } if (type == kLogFile && - (db_options_.WAL_ttl_seconds > 0 || - db_options_.WAL_size_limit_MB > 0)) { - auto archived_log_name = ArchivedLogFileName(db_options_.wal_dir, number); + (options_.WAL_ttl_seconds > 0 || options_.WAL_size_limit_MB > 0)) { + auto archived_log_name = ArchivedLogFileName(options_.wal_dir, number); // The sync point below is used in (DBTest,TransactionLogIteratorRace) TEST_SYNC_POINT("DBImpl::PurgeObsoleteFiles:1"); Status s = env_->RenameFile(fname, archived_log_name); // The sync point below is used in (DBTest,TransactionLogIteratorRace) TEST_SYNC_POINT("DBImpl::PurgeObsoleteFiles:2"); - Log(db_options_.info_log, + Log(options_.info_log, "Move log file %s to %s -- %s\n", fname.c_str(), archived_log_name.c_str(), s.ToString().c_str()); } else { Status s = env_->DeleteFile(fname); - Log(db_options_.info_log, "Delete %s type=%d #%" PRIu64 " -- %s\n", + Log(options_.info_log, "Delete %s type=%d #%" PRIu64 " -- %s\n", fname.c_str(), type, number, s.ToString().c_str()); } } // Delete old info log files. size_t old_info_log_file_count = old_info_log_files.size(); - if (old_info_log_file_count >= db_options_.keep_log_file_num) { + if (old_info_log_file_count >= options_.keep_log_file_num) { std::sort(old_info_log_files.begin(), old_info_log_files.end()); - size_t end = old_info_log_file_count - db_options_.keep_log_file_num; + size_t end = old_info_log_file_count - options_.keep_log_file_num; for (unsigned int i = 0; i <= end; i++) { std::string& to_delete = old_info_log_files.at(i); - std::string full_path_to_delete = (db_options_.db_log_dir.empty() ? - dbname_ : db_options_.db_log_dir) + "/" + to_delete; - Log(db_options_.info_log, "Delete info log file %s\n", + std::string full_path_to_delete = + (options_.db_log_dir.empty() ? 
dbname_ : options_.db_log_dir) + "/" + + to_delete; + Log(options_.info_log, "Delete info log file %s\n", full_path_to_delete.c_str()); Status s = env_->DeleteFile(full_path_to_delete); if (!s.ok()) { - Log(db_options_.info_log, "Delete info log file %s FAILED -- %s\n", + Log(options_.info_log, "Delete info log file %s FAILED -- %s\n", to_delete.c_str(), s.ToString().c_str()); } } } PurgeObsoleteWALFiles(); - LogFlush(db_options_.info_log); + LogFlush(options_.info_log); } void DBImpl::DeleteObsoleteFiles() { @@ -800,8 +809,8 @@ void DBImpl::DeleteObsoleteFiles() { // b. get sorted non-empty archived logs // c. delete what should be deleted void DBImpl::PurgeObsoleteWALFiles() { - bool const ttl_enabled = db_options_.WAL_ttl_seconds > 0; - bool const size_limit_enabled = db_options_.WAL_size_limit_MB > 0; + bool const ttl_enabled = options_.WAL_ttl_seconds > 0; + bool const size_limit_enabled = options_.WAL_size_limit_MB > 0; if (!ttl_enabled && !size_limit_enabled) { return; } @@ -809,14 +818,13 @@ void DBImpl::PurgeObsoleteWALFiles() { int64_t current_time; Status s = env_->GetCurrentTime(¤t_time); if (!s.ok()) { - Log(db_options_.info_log, "Can't get current time: %s", - s.ToString().c_str()); + Log(options_.info_log, "Can't get current time: %s", s.ToString().c_str()); assert(false); return; } uint64_t const now_seconds = static_cast(current_time); uint64_t const time_to_check = (ttl_enabled && !size_limit_enabled) ? 
- db_options_.WAL_ttl_seconds / 2 : default_interval_to_delete_obsolete_WAL_; + options_.WAL_ttl_seconds / 2 : default_interval_to_delete_obsolete_WAL_; if (purge_wal_files_last_run_ + time_to_check > now_seconds) { return; @@ -824,12 +832,11 @@ void DBImpl::PurgeObsoleteWALFiles() { purge_wal_files_last_run_ = now_seconds; - std::string archival_dir = ArchivalDirectory(db_options_.wal_dir); + std::string archival_dir = ArchivalDirectory(options_.wal_dir); std::vector files; s = env_->GetChildren(archival_dir, &files); if (!s.ok()) { - Log(db_options_.info_log, "Can't get archive files: %s", - s.ToString().c_str()); + Log(options_.info_log, "Can't get archive files: %s", s.ToString().c_str()); assert(false); return; } @@ -847,14 +854,14 @@ void DBImpl::PurgeObsoleteWALFiles() { Status const s = env_->GetFileModificationTime(file_path, &file_m_time); if (!s.ok()) { - Log(db_options_.info_log, "Can't get file mod time: %s: %s", + Log(options_.info_log, "Can't get file mod time: %s: %s", file_path.c_str(), s.ToString().c_str()); continue; } - if (now_seconds - file_m_time > db_options_.WAL_ttl_seconds) { + if (now_seconds - file_m_time > options_.WAL_ttl_seconds) { Status const s = env_->DeleteFile(file_path); if (!s.ok()) { - Log(db_options_.info_log, "Can't delete file: %s: %s", + Log(options_.info_log, "Can't delete file: %s: %s", file_path.c_str(), s.ToString().c_str()); continue; } else { @@ -869,7 +876,7 @@ void DBImpl::PurgeObsoleteWALFiles() { uint64_t file_size; Status const s = env_->GetFileSize(file_path, &file_size); if (!s.ok()) { - Log(db_options_.info_log, "Can't get file size: %s: %s", + Log(options_.info_log, "Can't get file size: %s: %s", file_path.c_str(), s.ToString().c_str()); return; } else { @@ -879,7 +886,7 @@ void DBImpl::PurgeObsoleteWALFiles() { } else { Status s = env_->DeleteFile(file_path); if (!s.ok()) { - Log(db_options_.info_log, "Can't delete file: %s: %s", + Log(options_.info_log, "Can't delete file: %s: %s", file_path.c_str(), 
s.ToString().c_str()); continue; } else { @@ -896,7 +903,7 @@ void DBImpl::PurgeObsoleteWALFiles() { return; } - size_t const files_keep_num = db_options_.WAL_size_limit_MB * + size_t const files_keep_num = options_.WAL_size_limit_MB * 1024 * 1024 / log_file_size; if (log_files_num <= files_keep_num) { return; @@ -907,7 +914,7 @@ void DBImpl::PurgeObsoleteWALFiles() { GetSortedWalsOfType(archival_dir, archived_logs, kArchivedLogFile); if (files_del_num > archived_logs.size()) { - Log(db_options_.info_log, "Trying to delete more archived log files than " + Log(options_.info_log, "Trying to delete more archived log files than " "exist. Deleting all"); files_del_num = archived_logs.size(); } @@ -916,7 +923,7 @@ void DBImpl::PurgeObsoleteWALFiles() { std::string const file_path = archived_logs[i]->PathName(); Status const s = DeleteFile(file_path); if (!s.ok()) { - Log(db_options_.info_log, "Can't delete file: %s: %s", + Log(options_.info_log, "Can't delete file: %s: %s", file_path.c_str(), s.ToString().c_str()); continue; } else { @@ -1024,7 +1031,7 @@ Status DBImpl::ReadFirstRecord(const WalFileType type, const uint64_t number, } Status s; if (type == kAliveLogFile) { - std::string fname = LogFileName(db_options_.wal_dir, number); + std::string fname = LogFileName(options_.wal_dir, number); s = ReadFirstLine(fname, sequence); if (env_->FileExists(fname) && !s.ok()) { // return any error that is not caused by non-existing file @@ -1034,8 +1041,7 @@ Status DBImpl::ReadFirstRecord(const WalFileType type, const uint64_t number, if (type == kArchivedLogFile || !s.ok()) { // check if the file got moved to archive. 
- std::string archived_file = - ArchivedLogFileName(db_options_.wal_dir, number); + std::string archived_file = ArchivedLogFileName(options_.wal_dir, number); s = ReadFirstLine(archived_file, sequence); } @@ -1056,7 +1062,7 @@ Status DBImpl::ReadFirstLine(const std::string& fname, const char* fname; Status* status; - bool ignore_error; // true if db_options_.paranoid_checks==false + bool ignore_error; // true if options_.paranoid_checks==false virtual void Corruption(size_t bytes, const Status& s) { Log(info_log, "%s%s: dropping %d bytes; %s", (this->ignore_error ? "(ignoring error) " : ""), fname, @@ -1069,7 +1075,7 @@ Status DBImpl::ReadFirstLine(const std::string& fname, }; unique_ptr file; - Status status = env_->NewSequentialFile(fname, &file, env_options_); + Status status = env_->NewSequentialFile(fname, &file, storage_options_); if (!status.ok()) { return status; @@ -1077,17 +1083,17 @@ Status DBImpl::ReadFirstLine(const std::string& fname, LogReporter reporter; reporter.env = env_; - reporter.info_log = db_options_.info_log.get(); + reporter.info_log = options_.info_log.get(); reporter.fname = fname.c_str(); reporter.status = &status; - reporter.ignore_error = !db_options_.paranoid_checks; + reporter.ignore_error = !options_.paranoid_checks; log::Reader reader(std::move(file), &reporter, true /*checksum*/, 0 /*initial_offset*/); std::string scratch; Slice record; if (reader.ReadRecord(&record, &scratch) && - (status.ok() || !db_options_.paranoid_checks)) { + (status.ok() || !options_.paranoid_checks)) { if (record.size() < 12) { reporter.Corruption(record.size(), Status::Corruption("log record too small")); @@ -1128,7 +1134,7 @@ Status DBImpl::Recover( return s; } - for (auto& db_path : db_options_.db_paths) { + for (auto& db_path : options_.db_paths) { s = env_->CreateDirIfMissing(db_path.path); if (!s.ok()) { return s; @@ -1146,7 +1152,7 @@ Status DBImpl::Recover( } if (!env_->FileExists(CurrentFileName(dbname_))) { - if (db_options_.create_if_missing) { 
+ if (options_.create_if_missing) { s = NewDB(); is_new_db = true; if (!s.ok()) { @@ -1157,7 +1163,7 @@ Status DBImpl::Recover( dbname_, "does not exist (create_if_missing is false)"); } } else { - if (db_options_.error_if_exists) { + if (options_.error_if_exists) { return Status::InvalidArgument( dbname_, "exists (error_if_exists is true)"); } @@ -1172,7 +1178,7 @@ Status DBImpl::Recover( } Status s = versions_->Recover(column_families, read_only); - if (db_options_.paranoid_checks && s.ok()) { + if (options_.paranoid_checks && s.ok()) { s = CheckConsistency(); } if (s.ok()) { @@ -1193,7 +1199,7 @@ Status DBImpl::Recover( const uint64_t min_log = versions_->MinLogNumber(); const uint64_t prev_log = versions_->PrevLogNumber(); std::vector filenames; - s = env_->GetChildren(db_options_.wal_dir, &filenames); + s = env_->GetChildren(options_.wal_dir, &filenames); if (!s.ok()) { return s; } @@ -1220,17 +1226,14 @@ Status DBImpl::Recover( "flag but a log file already exists"); } - if (!logs.empty()) { - // Recover in the order in which the logs were generated - std::sort(logs.begin(), logs.end()); - s = RecoverLogFiles(logs, &max_sequence, read_only); - if (!s.ok()) { - // Clear memtables if recovery failed - for (auto cfd : *versions_->GetColumnFamilySet()) { - cfd->CreateNewMemtable(MemTableOptions( - *cfd->GetLatestMutableCFOptions(), *cfd->options())); - } - } + // Recover in the order in which the logs were generated + std::sort(logs.begin(), logs.end()); + for (const auto& log : logs) { + // The previous incarnation may not have written any MANIFEST + // records after allocating this log number. So we manually + // update the file number allocation counter in VersionSet. 
+ versions_->MarkFileNumberUsed(log); + s = RecoverLogFile(log, &max_sequence, read_only); } SetTickerCount(stats_, SEQUENCE_NUMBER, versions_->LastSequence()); } @@ -1243,15 +1246,14 @@ Status DBImpl::Recover( return s; } -// REQUIRES: log_numbers are sorted in ascending order -Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, - SequenceNumber* max_sequence, bool read_only) { +Status DBImpl::RecoverLogFile(uint64_t log_number, SequenceNumber* max_sequence, + bool read_only) { struct LogReporter : public log::Reader::Reporter { Env* env; Logger* info_log; const char* fname; - Status* status; // nullptr if db_options_.paranoid_checks==false or - // db_options_.skip_log_error_on_recovery==true + Status* status; // nullptr if options_.paranoid_checks==false or + // options_.skip_log_error_on_recovery==true virtual void Corruption(size_t bytes, const Status& s) { Log(info_log, "%s%s: dropping %d bytes; %s", (this->status == nullptr ? "(ignoring error) " : ""), @@ -1261,7 +1263,7 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, }; mutex_.AssertHeld(); - Status status; + std::unordered_map version_edits; // no need to refcount because iteration is under mutex for (auto cfd : *versions_->GetColumnFamilySet()) { @@ -1270,80 +1272,67 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, version_edits.insert({cfd->GetID(), edit}); } - for (auto log_number : log_numbers) { - // The previous incarnation may not have written any MANIFEST - // records after allocating this log number. So we manually - // update the file number allocation counter in VersionSet. 
- versions_->MarkFileNumberUsed(log_number); - // Open the log file - std::string fname = LogFileName(db_options_.wal_dir, log_number); - unique_ptr file; - status = env_->NewSequentialFile(fname, &file, env_options_); + // Open the log file + std::string fname = LogFileName(options_.wal_dir, log_number); + unique_ptr file; + Status status = env_->NewSequentialFile(fname, &file, storage_options_); + if (!status.ok()) { + MaybeIgnoreError(&status); + return status; + } + + // Create the log reader. + LogReporter reporter; + reporter.env = env_; + reporter.info_log = options_.info_log.get(); + reporter.fname = fname.c_str(); + reporter.status = (options_.paranoid_checks && + !options_.skip_log_error_on_recovery ? &status : nullptr); + // We intentially make log::Reader do checksumming even if + // paranoid_checks==false so that corruptions cause entire commits + // to be skipped instead of propagating bad information (like overly + // large sequence numbers). + log::Reader reader(std::move(file), &reporter, true/*checksum*/, + 0/*initial_offset*/); + Log(options_.info_log, "Recovering log #%" PRIu64 "", log_number); + + // Read all the records and add to a memtable + std::string scratch; + Slice record; + WriteBatch batch; + while (reader.ReadRecord(&record, &scratch)) { + if (record.size() < 12) { + reporter.Corruption(record.size(), + Status::Corruption("log record too small")); + continue; + } + WriteBatchInternal::SetContents(&batch, record); + + // If column family was not found, it might mean that the WAL write + // batch references to the column family that was dropped after the + // insert. We don't want to fail the whole write batch in that case -- we + // just ignore the update. 
That's why we set ignore missing column families + // to true + status = WriteBatchInternal::InsertInto( + &batch, column_family_memtables_.get(), + true /* ignore missing column families */, log_number); + + MaybeIgnoreError(&status); if (!status.ok()) { - MaybeIgnoreError(&status); - if (!status.ok()) { - return status; - } else { - // Fail with one log file, but that's ok. - // Try next one. - continue; - } + return status; + } + const SequenceNumber last_seq = + WriteBatchInternal::Sequence(&batch) + + WriteBatchInternal::Count(&batch) - 1; + if (last_seq > *max_sequence) { + *max_sequence = last_seq; } - // Create the log reader. - LogReporter reporter; - reporter.env = env_; - reporter.info_log = db_options_.info_log.get(); - reporter.fname = fname.c_str(); - reporter.status = - (db_options_.paranoid_checks && !db_options_.skip_log_error_on_recovery - ? &status - : nullptr); - // We intentially make log::Reader do checksumming even if - // paranoid_checks==false so that corruptions cause entire commits - // to be skipped instead of propagating bad information (like overly - // large sequence numbers). - log::Reader reader(std::move(file), &reporter, true /*checksum*/, - 0 /*initial_offset*/); - Log(db_options_.info_log, "Recovering log #%" PRIu64 "", log_number); - - // Read all the records and add to a memtable - std::string scratch; - Slice record; - WriteBatch batch; - while (reader.ReadRecord(&record, &scratch)) { - if (record.size() < 12) { - reporter.Corruption(record.size(), - Status::Corruption("log record too small")); - continue; - } - WriteBatchInternal::SetContents(&batch, record); - - // If column family was not found, it might mean that the WAL write - // batch references to the column family that was dropped after the - // insert. We don't want to fail the whole write batch in that case -- - // we just ignore the update. 
- // That's why we set ignore missing column families to true - status = WriteBatchInternal::InsertInto( - &batch, column_family_memtables_.get(), true, log_number); - - MaybeIgnoreError(&status); - if (!status.ok()) { - return status; - } - const SequenceNumber last_seq = WriteBatchInternal::Sequence(&batch) + - WriteBatchInternal::Count(&batch) - 1; - if (last_seq > *max_sequence) { - *max_sequence = last_seq; - } - - if (!read_only) { - // we can do this because this is called before client has access to the - // DB and there is only a single thread operating on DB - ColumnFamilyData* cfd; - - while ((cfd = flush_scheduler_.GetNextColumnFamily()) != nullptr) { - cfd->Unref(); + if (!read_only) { + // no need to refcount since client still doesn't have access + // to the DB and can not drop column families while we iterate + for (auto cfd : *versions_->GetColumnFamilySet()) { + if (cfd->mem()->ShouldFlush()) { // If this asserts, it means that InsertInto failed in // filtering updates to already-flushed column families assert(cfd->GetLogNumber() <= log_number); @@ -1351,35 +1340,33 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, assert(iter != version_edits.end()); VersionEdit* edit = &iter->second; status = WriteLevel0TableForRecovery(cfd, cfd->mem(), edit); + // we still want to clear the memtable, even if the recovery failed + cfd->CreateNewMemtable(); if (!status.ok()) { // Reflect errors immediately so that conditions like full // file-systems cause the DB::Open() to fail. 
return status; } - cfd->CreateNewMemtable(MemTableOptions( - *cfd->GetLatestMutableCFOptions(), *cfd->options())); } } } + } - flush_scheduler_.Clear(); - if (versions_->LastSequence() < *max_sequence) { - versions_->SetLastSequence(*max_sequence); - } + if (versions_->LastSequence() < *max_sequence) { + versions_->SetLastSequence(*max_sequence); } if (!read_only) { // no need to refcount since client still doesn't have access // to the DB and can not drop column families while we iterate - auto max_log_number = log_numbers.back(); for (auto cfd : *versions_->GetColumnFamilySet()) { auto iter = version_edits.find(cfd->GetID()); assert(iter != version_edits.end()); VersionEdit* edit = &iter->second; - if (cfd->GetLogNumber() > max_log_number) { + if (cfd->GetLogNumber() > log_number) { // Column family cfd has already flushed the data - // from all logs. Memtable has to be empty because + // from log_number. Memtable has to be empty because // we filter the updates based on log_number // (in WriteBatch::InsertInto) assert(cfd->mem()->GetFirstSequenceNumber() == 0); @@ -1390,31 +1377,28 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, // flush the final memtable (if non-empty) if (cfd->mem()->GetFirstSequenceNumber() != 0) { status = WriteLevel0TableForRecovery(cfd, cfd->mem(), edit); - if (!status.ok()) { - // Recovery failed - break; - } - cfd->CreateNewMemtable(MemTableOptions( - *cfd->GetLatestMutableCFOptions(), *cfd->options())); + } + // we still want to clear the memtable, even if the recovery failed + cfd->CreateNewMemtable(); + if (!status.ok()) { + return status; } // write MANIFEST with update - // writing log_number in the manifest means that any log file + // writing log number in the manifest means that any log file // with number strongly less than (log_number + 1) is already // recovered and should be ignored on next reincarnation. 
- // Since we already recovered max_log_number, we want all logs - // with numbers `<= max_log_number` (includes this one) to be ignored - edit->SetLogNumber(max_log_number + 1); + // Since we already recovered log_number, we want all logs + // with numbers `<= log_number` (includes this one) to be ignored + edit->SetLogNumber(log_number + 1); // we must mark the next log number as used, even though it's // not actually used. that is because VersionSet assumes // VersionSet::next_file_number_ always to be strictly greater than any // log number - versions_->MarkFileNumberUsed(max_log_number + 1); - status = versions_->LogAndApply( - cfd, *cfd->GetLatestMutableCFOptions(), edit, &mutex_); + versions_->MarkFileNumberUsed(log_number + 1); + status = versions_->LogAndApply(cfd, edit, &mutex_); if (!status.ok()) { - // Recovery failed - break; + return status; } } } @@ -1431,32 +1415,30 @@ Status DBImpl::WriteLevel0TableForRecovery(ColumnFamilyData* cfd, MemTable* mem, pending_outputs_[meta.fd.GetNumber()] = 0; // path 0 for level 0 file. 
ReadOptions ro; ro.total_order_seek = true; - Arena arena; + Iterator* iter = mem->NewIterator(ro); + const SequenceNumber newest_snapshot = snapshots_.GetNewest(); + const SequenceNumber earliest_seqno_in_memtable = + mem->GetFirstSequenceNumber(); + Log(options_.info_log, "[%s] Level-0 table #%" PRIu64 ": started", + cfd->GetName().c_str(), meta.fd.GetNumber()); + Status s; { - ScopedArenaIterator iter(mem->NewIterator(ro, &arena)); - const SequenceNumber newest_snapshot = snapshots_.GetNewest(); - const SequenceNumber earliest_seqno_in_memtable = - mem->GetFirstSequenceNumber(); - Log(db_options_.info_log, "[%s] Level-0 table #%" PRIu64 ": started", - cfd->GetName().c_str(), meta.fd.GetNumber()); - - { - mutex_.Unlock(); - s = BuildTable( - dbname_, env_, *cfd->ioptions(), env_options_, cfd->table_cache(), - iter.get(), &meta, cfd->internal_comparator(), newest_snapshot, - earliest_seqno_in_memtable, GetCompressionFlush(*cfd->ioptions()), - cfd->ioptions()->compression_opts, Env::IO_HIGH); - LogFlush(db_options_.info_log); - mutex_.Lock(); - } - - Log(db_options_.info_log, - "[%s] Level-0 table #%" PRIu64 ": %" PRIu64 " bytes %s", - cfd->GetName().c_str(), meta.fd.GetNumber(), meta.fd.GetFileSize(), - s.ToString().c_str()); + mutex_.Unlock(); + s = BuildTable(dbname_, env_, *cfd->options(), storage_options_, + cfd->table_cache(), iter, &meta, cfd->internal_comparator(), + newest_snapshot, earliest_seqno_in_memtable, + GetCompressionFlush(*cfd->options()), Env::IO_HIGH); + LogFlush(options_.info_log); + mutex_.Lock(); } + + Log(options_.info_log, + "[%s] Level-0 table #%" PRIu64 ": %" PRIu64 " bytes %s", + cfd->GetName().c_str(), meta.fd.GetNumber(), meta.fd.GetFileSize(), + s.ToString().c_str()); + delete iter; + pending_outputs_.erase(meta.fd.GetNumber()); // Note that if file_size is zero, the file has been deleted and @@ -1480,9 +1462,8 @@ Status DBImpl::WriteLevel0TableForRecovery(ColumnFamilyData* cfd, MemTable* mem, } Status 
DBImpl::WriteLevel0Table(ColumnFamilyData* cfd, - const MutableCFOptions& mutable_cf_options, - const autovector& mems, - VersionEdit* edit, uint64_t* filenumber, LogBuffer* log_buffer) { + autovector& mems, VersionEdit* edit, + uint64_t* filenumber, LogBuffer* log_buffer) { mutex_.AssertHeld(); const uint64_t start_micros = env_->NowMicros(); FileMetaData meta; @@ -1503,34 +1484,29 @@ Status DBImpl::WriteLevel0Table(ColumnFamilyData* cfd, std::vector memtables; ReadOptions ro; ro.total_order_seek = true; - Arena arena; for (MemTable* m : mems) { - Log(db_options_.info_log, + Log(options_.info_log, "[%s] Flushing memtable with next log file: %" PRIu64 "\n", cfd->GetName().c_str(), m->GetNextLogNumber()); - memtables.push_back(m->NewIterator(ro, &arena)); + memtables.push_back(m->NewIterator(ro)); } - { - ScopedArenaIterator iter(NewMergingIterator(&cfd->internal_comparator(), - &memtables[0], - memtables.size(), &arena)); - Log(db_options_.info_log, - "[%s] Level-0 flush table #%" PRIu64 ": started", - cfd->GetName().c_str(), meta.fd.GetNumber()); + Iterator* iter = NewMergingIterator(&cfd->internal_comparator(), + &memtables[0], memtables.size()); + Log(options_.info_log, "[%s] Level-0 flush table #%" PRIu64 ": started", + cfd->GetName().c_str(), meta.fd.GetNumber()); - s = BuildTable( - dbname_, env_, *cfd->ioptions(), env_options_, cfd->table_cache(), - iter.get(), &meta, cfd->internal_comparator(), newest_snapshot, - earliest_seqno_in_memtable, GetCompressionFlush(*cfd->ioptions()), - cfd->ioptions()->compression_opts, Env::IO_HIGH); - LogFlush(db_options_.info_log); - } - Log(db_options_.info_log, + s = BuildTable(dbname_, env_, *cfd->options(), storage_options_, + cfd->table_cache(), iter, &meta, cfd->internal_comparator(), + newest_snapshot, earliest_seqno_in_memtable, + GetCompressionFlush(*cfd->options()), Env::IO_HIGH); + LogFlush(options_.info_log); + delete iter; + Log(options_.info_log, "[%s] Level-0 flush table #%" PRIu64 ": %" PRIu64 " bytes %s", 
cfd->GetName().c_str(), meta.fd.GetNumber(), meta.fd.GetFileSize(), s.ToString().c_str()); - if (!db_options_.disableDataSync) { + if (!options_.disableDataSync) { db_directory_->Fsync(); } mutex_.Lock(); @@ -1559,11 +1535,9 @@ Status DBImpl::WriteLevel0Table(ColumnFamilyData* cfd, // insert files directly into higher levels because some other // threads could be concurrently producing compacted files for // that key range. - if (base != nullptr && db_options_.max_background_compactions <= 1 && - db_options_.max_background_flushes == 0 && - cfd->ioptions()->compaction_style == kCompactionStyleLevel) { - level = base->PickLevelForMemTableOutput( - mutable_cf_options, min_user_key, max_user_key); + if (base != nullptr && options_.max_background_compactions <= 1 && + cfd->options()->compaction_style == kCompactionStyleLevel) { + level = base->PickLevelForMemTableOutput(min_user_key, max_user_key); } edit->AddFile(level, meta.fd.GetNumber(), meta.fd.GetPathId(), meta.fd.GetFileSize(), meta.smallest, meta.largest, @@ -1580,9 +1554,10 @@ Status DBImpl::WriteLevel0Table(ColumnFamilyData* cfd, return s; } -Status DBImpl::FlushMemTableToOutputFile( - ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options, - bool* madeProgress, DeletionState& deletion_state, LogBuffer* log_buffer) { +Status DBImpl::FlushMemTableToOutputFile(ColumnFamilyData* cfd, + bool* madeProgress, + DeletionState& deletion_state, + LogBuffer* log_buffer) { mutex_.AssertHeld(); assert(cfd->imm()->size() != 0); assert(cfd->imm()->IsFlushPending()); @@ -1609,10 +1584,8 @@ Status DBImpl::FlushMemTableToOutputFile( edit->SetLogNumber(mems.back()->GetNextLogNumber()); edit->SetColumnFamily(cfd->GetID()); - // This will release and re-acquire the mutex. 
- Status s = WriteLevel0Table(cfd, mutable_cf_options, mems, edit, - &file_number, log_buffer); + Status s = WriteLevel0Table(cfd, mems, edit, &file_number, log_buffer); if (s.ok() && shutting_down_.Acquire_Load() && cfd->IsDropped()) { s = Status::ShutdownInProgress( @@ -1624,13 +1597,13 @@ Status DBImpl::FlushMemTableToOutputFile( } else { // Replace immutable memtable with the generated Table s = cfd->imm()->InstallMemtableFlushResults( - cfd, mutable_cf_options, mems, versions_.get(), &mutex_, - db_options_.info_log.get(), file_number, &pending_outputs_, - &deletion_state.memtables_to_free, db_directory_.get(), log_buffer); + cfd, mems, versions_.get(), &mutex_, options_.info_log.get(), + file_number, &pending_outputs_, &deletion_state.memtables_to_free, + db_directory_.get(), log_buffer); } if (s.ok()) { - InstallSuperVersion(cfd, deletion_state, mutable_cf_options); + InstallSuperVersion(cfd, deletion_state); if (madeProgress) { *madeProgress = 1; } @@ -1650,7 +1623,7 @@ Status DBImpl::FlushMemTableToOutputFile( } } - if (!s.ok() && !s.IsShutdownInProgress() && db_options_.paranoid_checks && + if (!s.ok() && !s.IsShutdownInProgress() && options_.paranoid_checks && bg_error_.ok()) { // if a bad error happened (not ShutdownInProgress) and paranoid_checks is // true, mark DB read-only @@ -1664,7 +1637,7 @@ Status DBImpl::CompactRange(ColumnFamilyHandle* column_family, const Slice* begin, const Slice* end, bool reduce_level, int target_level, uint32_t target_path_id) { - if (target_path_id >= db_options_.db_paths.size()) { + if (target_path_id >= options_.db_paths.size()) { return Status::InvalidArgument("Invalid target path ID"); } @@ -1673,7 +1646,7 @@ Status DBImpl::CompactRange(ColumnFamilyHandle* column_family, Status s = FlushMemTable(cfd, FlushOptions()); if (!s.ok()) { - LogFlush(db_options_.info_log); + LogFlush(options_.info_log); return s; } @@ -1692,8 +1665,8 @@ Status DBImpl::CompactRange(ColumnFamilyHandle* column_family, // bottom-most level, the 
output level will be the same as input one. // level 0 can never be the bottommost level (i.e. if all files are in level // 0, we will compact to level 1) - if (cfd->ioptions()->compaction_style == kCompactionStyleUniversal || - cfd->ioptions()->compaction_style == kCompactionStyleFIFO || + if (cfd->options()->compaction_style == kCompactionStyleUniversal || + cfd->options()->compaction_style == kCompactionStyleFIFO || (level == max_level_with_files && level > 0)) { s = RunManualCompaction(cfd, level, level, target_path_id, begin, end); } else { @@ -1701,7 +1674,7 @@ Status DBImpl::CompactRange(ColumnFamilyHandle* column_family, end); } if (!s.ok()) { - LogFlush(db_options_.info_log); + LogFlush(options_.info_log); return s; } } @@ -1709,7 +1682,7 @@ Status DBImpl::CompactRange(ColumnFamilyHandle* column_family, if (reduce_level) { s = ReFitLevel(cfd, max_level_with_files, target_level); } - LogFlush(db_options_.info_log); + LogFlush(options_.info_log); { MutexLock l(&mutex_); @@ -1721,16 +1694,8 @@ Status DBImpl::CompactRange(ColumnFamilyHandle* column_family, return s; } -bool DBImpl::SetOptions(ColumnFamilyHandle* column_family, - const std::unordered_map& options_map) { - auto cfh = reinterpret_cast(column_family); - MutexLock l(&mutex_); - return cfh->cfd()->SetOptions(options_map); -} - // return the same level if it cannot be moved -int DBImpl::FindMinimumEmptyLevelFitting(ColumnFamilyData* cfd, - const MutableCFOptions& mutable_cf_options, int level) { +int DBImpl::FindMinimumEmptyLevelFitting(ColumnFamilyData* cfd, int level) { mutex_.AssertHeld(); Version* current = cfd->current(); int minimum_level = level; @@ -1738,7 +1703,7 @@ int DBImpl::FindMinimumEmptyLevelFitting(ColumnFamilyData* cfd, // stop if level i is not empty if (current->NumLevelFiles(i) > 0) break; // stop if level i is too small (cannot fit the level files) - if (mutable_cf_options.MaxBytesForLevel(i) < + if (cfd->compaction_picker()->MaxBytesForLevel(i) < current->NumLevelBytes(level)) 
{ break; } @@ -1759,7 +1724,7 @@ Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) { // only allow one thread refitting if (refitting_level_) { mutex_.Unlock(); - Log(db_options_.info_log, "ReFitLevel: another thread is refitting"); + Log(options_.info_log, "ReFitLevel: another thread is refitting"); delete new_superversion; return Status::NotSupported("another thread is refitting"); } @@ -1768,26 +1733,24 @@ Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) { // wait for all background threads to stop bg_work_gate_closed_ = true; while (bg_compaction_scheduled_ > 0 || bg_flush_scheduled_) { - Log(db_options_.info_log, + Log(options_.info_log, "RefitLevel: waiting for background threads to stop: %d %d", bg_compaction_scheduled_, bg_flush_scheduled_); bg_cv_.Wait(); } - const MutableCFOptions mutable_cf_options = - *cfd->GetLatestMutableCFOptions(); // move to a smaller level int to_level = target_level; if (target_level < 0) { - to_level = FindMinimumEmptyLevelFitting(cfd, mutable_cf_options, level); + to_level = FindMinimumEmptyLevelFitting(cfd, level); } assert(to_level <= level); Status status; if (to_level < level) { - Log(db_options_.info_log, "[%s] Before refitting:\n%s", - cfd->GetName().c_str(), cfd->current()->DebugString().data()); + Log(options_.info_log, "[%s] Before refitting:\n%s", cfd->GetName().c_str(), + cfd->current()->DebugString().data()); VersionEdit edit; edit.SetColumnFamily(cfd->GetID()); @@ -1797,20 +1760,18 @@ Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) { f->fd.GetFileSize(), f->smallest, f->largest, f->smallest_seqno, f->largest_seqno); } - Log(db_options_.info_log, "[%s] Apply version edit:\n%s", + Log(options_.info_log, "[%s] Apply version edit:\n%s", cfd->GetName().c_str(), edit.DebugString().data()); - status = versions_->LogAndApply(cfd, - mutable_cf_options, &edit, &mutex_, db_directory_.get()); - superversion_to_free = 
cfd->InstallSuperVersion( - new_superversion, &mutex_, mutable_cf_options); + status = versions_->LogAndApply(cfd, &edit, &mutex_, db_directory_.get()); + superversion_to_free = cfd->InstallSuperVersion(new_superversion, &mutex_); new_superversion = nullptr; - Log(db_options_.info_log, "[%s] LogAndApply: %s\n", cfd->GetName().c_str(), + Log(options_.info_log, "[%s] LogAndApply: %s\n", cfd->GetName().c_str(), status.ToString().data()); if (status.ok()) { - Log(db_options_.info_log, "[%s] After refitting:\n%s", + Log(options_.info_log, "[%s] After refitting:\n%s", cfd->GetName().c_str(), cfd->current()->DebugString().data()); } } @@ -1866,16 +1827,16 @@ Status DBImpl::RunManualCompaction(ColumnFamilyData* cfd, int input_level, // For universal compaction, we enforce every manual compaction to compact // all files. if (begin == nullptr || - cfd->ioptions()->compaction_style == kCompactionStyleUniversal || - cfd->ioptions()->compaction_style == kCompactionStyleFIFO) { + cfd->options()->compaction_style == kCompactionStyleUniversal || + cfd->options()->compaction_style == kCompactionStyleFIFO) { manual.begin = nullptr; } else { begin_storage = InternalKey(*begin, kMaxSequenceNumber, kValueTypeForSeek); manual.begin = &begin_storage; } if (end == nullptr || - cfd->ioptions()->compaction_style == kCompactionStyleUniversal || - cfd->ioptions()->compaction_style == kCompactionStyleFIFO) { + cfd->options()->compaction_style == kCompactionStyleUniversal || + cfd->options()->compaction_style == kCompactionStyleFIFO) { manual.end = nullptr; } else { end_storage = InternalKey(*end, 0, static_cast(0)); @@ -1900,26 +1861,24 @@ Status DBImpl::RunManualCompaction(ColumnFamilyData* cfd, int input_level, ++bg_manual_only_; while (bg_compaction_scheduled_ > 0) { - Log(db_options_.info_log, + Log(options_.info_log, "[%s] Manual compaction waiting for all other scheduled background " "compactions to finish", cfd->GetName().c_str()); bg_cv_.Wait(); } - Log(db_options_.info_log, "[%s] 
Manual compaction starting", + Log(options_.info_log, "[%s] Manual compaction starting", cfd->GetName().c_str()); - // We don't check bg_error_ here, because if we get the error in compaction, - // the compaction will set manual.status to bg_error_ and set manual.done to - // true. - while (!manual.done) { + while (!manual.done && !shutting_down_.Acquire_Load() && bg_error_.ok()) { assert(bg_manual_only_ > 0); if (manual_compaction_ != nullptr) { // Running either this or some other manual compaction bg_cv_.Wait(); } else { manual_compaction_ = &manual; + assert(bg_compaction_scheduled_ == 0); bg_compaction_scheduled_++; env_->Schedule(&DBImpl::BGWorkCompaction, this, Env::Priority::LOW); } @@ -1933,18 +1892,19 @@ Status DBImpl::RunManualCompaction(ColumnFamilyData* cfd, int input_level, Status DBImpl::FlushMemTable(ColumnFamilyData* cfd, const FlushOptions& options) { + Writer w(&mutex_); + w.batch = nullptr; + w.sync = false; + w.disableWAL = false; + w.in_batch_group = false; + w.done = false; + w.timeout_hint_us = kNoTimeOut; + Status s; { WriteContext context; MutexLock guard_lock(&mutex_); - - if (cfd->imm()->size() == 0 && cfd->mem()->IsEmpty()) { - // Nothing to flush - return Status::OK(); - } - - WriteThread::Writer w(&mutex_); - s = write_thread_.EnterWriteThread(&w, 0); + s = BeginWrite(&w, 0); assert(s.ok() && !w.done); // No timeout and nobody should do our job // SetNewMemtableAndNewLogFile() will release and reacquire mutex @@ -1953,9 +1913,12 @@ Status DBImpl::FlushMemTable(ColumnFamilyData* cfd, cfd->imm()->FlushRequested(); MaybeScheduleFlushOrCompaction(); - write_thread_.ExitWriteThread(&w, &w, s); + assert(!writers_.empty()); + assert(writers_.front() == &w); + EndWrite(&w, &w, s); } + if (s.ok() && options.wait) { // Wait until the compaction completes s = WaitForFlushMemTable(cfd); @@ -1993,10 +1956,10 @@ void DBImpl::MaybeScheduleFlushOrCompaction() { } if (is_flush_pending) { // memtable flush needed - if (bg_flush_scheduled_ < 
db_options_.max_background_flushes) { + if (bg_flush_scheduled_ < options_.max_background_flushes) { bg_flush_scheduled_++; env_->Schedule(&DBImpl::BGWorkFlush, this, Env::Priority::HIGH); - } else if (db_options_.max_background_flushes > 0) { + } else if (options_.max_background_flushes > 0) { bg_schedule_needed_ = true; } } @@ -2015,8 +1978,8 @@ void DBImpl::MaybeScheduleFlushOrCompaction() { // bg_manual_only_ == 0 if (!bg_manual_only_ && (is_compaction_needed || - (is_flush_pending && db_options_.max_background_flushes == 0))) { - if (bg_compaction_scheduled_ < db_options_.max_background_compactions) { + (is_flush_pending && options_.max_background_flushes == 0))) { + if (bg_compaction_scheduled_ < options_.max_background_compactions) { bg_compaction_scheduled_++; env_->Schedule(&DBImpl::BGWorkCompaction, this, Env::Priority::LOW); } else { @@ -2052,11 +2015,6 @@ Status DBImpl::BackgroundFlush(bool* madeProgress, DeletionState& deletion_state, LogBuffer* log_buffer) { mutex_.AssertHeld(); - - if (!bg_error_.ok()) { - return bg_error_; - } - // call_status is failure if at least one flush was a failure. even if // flushing one column family reports a failure, we will continue flushing // other column families. however, call_status will be a failure in that case. 
@@ -2065,17 +2023,15 @@ Status DBImpl::BackgroundFlush(bool* madeProgress, for (auto cfd : *versions_->GetColumnFamilySet()) { cfd->Ref(); Status flush_status; - const MutableCFOptions mutable_cf_options = - *cfd->GetLatestMutableCFOptions(); while (flush_status.ok() && cfd->imm()->IsFlushPending()) { LogToBuffer( log_buffer, "BackgroundCallFlush doing FlushMemTableToOutputFile with column " "family [%s], flush slots available %d", cfd->GetName().c_str(), - db_options_.max_background_flushes - bg_flush_scheduled_); - flush_status = FlushMemTableToOutputFile( - cfd, mutable_cf_options, madeProgress, deletion_state, log_buffer); + options_.max_background_flushes - bg_flush_scheduled_); + flush_status = FlushMemTableToOutputFile(cfd, madeProgress, + deletion_state, log_buffer); } if (call_status.ok() && !flush_status.ok()) { call_status = flush_status; @@ -2091,7 +2047,7 @@ void DBImpl::BackgroundCallFlush() { DeletionState deletion_state(true); assert(bg_flush_scheduled_); - LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, db_options_.info_log.get()); + LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, options_.info_log.get()); { MutexLock l(&mutex_); @@ -2107,12 +2063,12 @@ void DBImpl::BackgroundCallFlush() { default_cf_internal_stats_->BumpAndGetBackgroundErrorCount(); bg_cv_.SignalAll(); // In case a waiter can proceed despite the error mutex_.Unlock(); - Log(db_options_.info_log, + Log(options_.info_log, "Waiting after background flush error: %s" "Accumulated background error counts: %" PRIu64, s.ToString().c_str(), error_cnt); log_buffer.FlushBufferToLog(); - LogFlush(db_options_.info_log); + LogFlush(options_.info_log); env_->SleepForMicroseconds(1000000); mutex_.Lock(); } @@ -2158,7 +2114,7 @@ void DBImpl::BackgroundCallCompaction() { DeletionState deletion_state(true); MaybeDumpStats(); - LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, db_options_.info_log.get()); + LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, options_.info_log.get()); { MutexLock l(&mutex_); 
assert(bg_compaction_scheduled_); @@ -2175,11 +2131,11 @@ void DBImpl::BackgroundCallCompaction() { bg_cv_.SignalAll(); // In case a waiter can proceed despite the error mutex_.Unlock(); log_buffer.FlushBufferToLog(); - Log(db_options_.info_log, + Log(options_.info_log, "Waiting after background compaction error: %s, " "Accumulated background error counts: %" PRIu64, s.ToString().c_str(), error_cnt); - LogFlush(db_options_.info_log); + LogFlush(options_.info_log); env_->SleepForMicroseconds(1000000); mutex_.Lock(); } @@ -2223,7 +2179,7 @@ void DBImpl::BackgroundCallCompaction() { } if (madeProgress || bg_compaction_scheduled_ == 0 || bg_manual_only_ > 0) { // signal if - // * madeProgress -- need to wakeup DelayWrite + // * madeProgress -- need to wakeup MakeRoomForWrite // * bg_compaction_scheduled_ == 0 -- need to wakeup ~DBImpl // * bg_manual_only_ > 0 -- need to wakeup RunManualCompaction // If none of this is true, there is no need to signal since nobody is @@ -2246,16 +2202,6 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, bool is_manual = (manual_compaction_ != nullptr) && (manual_compaction_->in_progress == false); - if (!bg_error_.ok()) { - if (is_manual) { - manual_compaction_->status = bg_error_; - manual_compaction_->done = true; - manual_compaction_->in_progress = false; - manual_compaction_ = nullptr; - } - return bg_error_; - } - if (is_manual) { // another thread cannot pick up the same work manual_compaction_->in_progress = true; @@ -2268,17 +2214,15 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, // FLUSH preempts compaction Status flush_stat; for (auto cfd : *versions_->GetColumnFamilySet()) { - const MutableCFOptions mutable_cf_options = - *cfd->GetLatestMutableCFOptions(); while (cfd->imm()->IsFlushPending()) { LogToBuffer( log_buffer, "BackgroundCompaction doing FlushMemTableToOutputFile, " "compaction slots available %d", - db_options_.max_background_compactions - bg_compaction_scheduled_); + 
options_.max_background_compactions - bg_compaction_scheduled_); cfd->Ref(); - flush_stat = FlushMemTableToOutputFile( - cfd, mutable_cf_options, madeProgress, deletion_state, log_buffer); + flush_stat = FlushMemTableToOutputFile(cfd, madeProgress, deletion_state, + log_buffer); cfd->Unref(); if (!flush_stat.ok()) { if (is_manual) { @@ -2292,18 +2236,15 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, } } - // Compaction makes a copy of the latest MutableCFOptions. It should be used - // throughout the compaction procedure to make sure consistency. It will - // eventually be installed into SuperVersion unique_ptr c; InternalKey manual_end_storage; InternalKey* manual_end = &manual_end_storage; if (is_manual) { ManualCompaction* m = manual_compaction_; assert(m->in_progress); - c.reset(m->cfd->CompactRange( - *m->cfd->GetLatestMutableCFOptions(), m->input_level, m->output_level, - m->output_path_id, m->begin, m->end, &manual_end)); + c.reset(m->cfd->CompactRange(m->input_level, m->output_level, + m->output_path_id, m->begin, m->end, + &manual_end)); if (!c) { m->done = true; } @@ -2320,11 +2261,7 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, // no need to refcount in iteration since it's always under a mutex for (auto cfd : *versions_->GetColumnFamilySet()) { if (!cfd->options()->disable_auto_compactions) { - // NOTE: try to avoid unnecessary copy of MutableCFOptions if - // compaction is not necessary. 
Need to make sure mutex is held - // until we make a copy in the following code - c.reset(cfd->PickCompaction( - *cfd->GetLatestMutableCFOptions(), log_buffer)); + c.reset(cfd->PickCompaction(log_buffer)); if (c != nullptr) { // update statistics MeasureTime(stats_, NUM_FILES_IN_SINGLE_COMPACTION, @@ -2344,16 +2281,14 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, // file if there is alive snapshot pointing to it assert(c->num_input_files(1) == 0); assert(c->level() == 0); - assert(c->column_family_data()->ioptions()->compaction_style == + assert(c->column_family_data()->options()->compaction_style == kCompactionStyleFIFO); for (const auto& f : *c->inputs(0)) { c->edit()->DeleteFile(c->level(), f->fd.GetNumber()); } - status = versions_->LogAndApply( - c->column_family_data(), *c->mutable_cf_options(), c->edit(), - &mutex_, db_directory_.get()); - InstallSuperVersion(c->column_family_data(), deletion_state, - *c->mutable_cf_options()); + status = versions_->LogAndApply(c->column_family_data(), c->edit(), &mutex_, + db_directory_.get()); + InstallSuperVersion(c->column_family_data(), deletion_state); LogToBuffer(log_buffer, "[%s] Deleted %d files\n", c->column_family_data()->GetName().c_str(), c->num_input_files(0)); @@ -2367,28 +2302,23 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, c->edit()->AddFile(c->level() + 1, f->fd.GetNumber(), f->fd.GetPathId(), f->fd.GetFileSize(), f->smallest, f->largest, f->smallest_seqno, f->largest_seqno); - status = versions_->LogAndApply(c->column_family_data(), - *c->mutable_cf_options(), - c->edit(), &mutex_, db_directory_.get()); - // Use latest MutableCFOptions - InstallSuperVersion(c->column_family_data(), deletion_state, - *c->mutable_cf_options()); + status = versions_->LogAndApply(c->column_family_data(), c->edit(), &mutex_, + db_directory_.get()); + InstallSuperVersion(c->column_family_data(), deletion_state); Version::LevelSummaryStorage tmp; LogToBuffer( - log_buffer, - "[%s] Moved #%" PRIu64 " to 
level-%d %" PRIu64 " bytes %s: %s\n", + log_buffer, "[%s] Moved #%lld to level-%d %lld bytes %s: %s\n", c->column_family_data()->GetName().c_str(), - f->fd.GetNumber(), c->level() + 1, - f->fd.GetFileSize(), + static_cast(f->fd.GetNumber()), c->level() + 1, + static_cast(f->fd.GetFileSize()), status.ToString().c_str(), c->input_version()->LevelSummary(&tmp)); c->ReleaseCompactionFiles(status); *madeProgress = true; } else { MaybeScheduleFlushOrCompaction(); // do more compaction work in parallel. CompactionState* compact = new CompactionState(c.get()); - status = DoCompactionWork(compact, *c->mutable_cf_options(), - deletion_state, log_buffer); + status = DoCompactionWork(compact, deletion_state, log_buffer); CleanupCompaction(compact, status); c->ReleaseCompactionFiles(status); c->ReleaseInputs(); @@ -2401,9 +2331,9 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, } else if (status.IsShutdownInProgress()) { // Ignore compaction errors found during shutting down } else { - Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log, "Compaction error: %s", + Log(InfoLogLevel::WARN_LEVEL, options_.info_log, "Compaction error: %s", status.ToString().c_str()); - if (db_options_.paranoid_checks && bg_error_.ok()) { + if (options_.paranoid_checks && bg_error_.ok()) { bg_error_ = status; } } @@ -2434,8 +2364,8 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, // We only compacted part of the requested range. Update *m // to the range that is left to be compacted. 
// Universal and FIFO compactions should always compact the whole range - assert(m->cfd->ioptions()->compaction_style != kCompactionStyleUniversal); - assert(m->cfd->ioptions()->compaction_style != kCompactionStyleFIFO); + assert(m->cfd->options()->compaction_style != kCompactionStyleUniversal); + assert(m->cfd->options()->compaction_style != kCompactionStyleFIFO); m->tmp_storage = *manual_end; m->begin = &m->tmp_storage; } @@ -2490,8 +2420,7 @@ void DBImpl::ReleaseCompactionUnusedFileNumbers(CompactionState* compact) { } } -Status DBImpl::OpenCompactionOutputFile( - CompactionState* compact, const MutableCFOptions& mutable_cf_options) { +Status DBImpl::OpenCompactionOutputFile(CompactionState* compact) { assert(compact != nullptr); assert(compact->builder == nullptr); uint64_t file_number; @@ -2516,22 +2445,21 @@ Status DBImpl::OpenCompactionOutputFile( compact->outputs.push_back(out); // Make the output file - std::string fname = TableFileName(db_options_.db_paths, file_number, + std::string fname = TableFileName(options_.db_paths, file_number, compact->compaction->GetOutputPathId()); - Status s = env_->NewWritableFile(fname, &compact->outfile, env_options_); + Status s = env_->NewWritableFile(fname, &compact->outfile, storage_options_); if (s.ok()) { compact->outfile->SetIOPriority(Env::IO_LOW); compact->outfile->SetPreallocationBlockSize( - compact->compaction->OutputFilePreallocationSize(mutable_cf_options)); + compact->compaction->OutputFilePreallocationSize()); ColumnFamilyData* cfd = compact->compaction->column_family_data(); compact->builder.reset(NewTableBuilder( - *cfd->ioptions(), cfd->internal_comparator(), compact->outfile.get(), - compact->compaction->OutputCompressionType(), - cfd->ioptions()->compression_opts)); + *cfd->options(), cfd->internal_comparator(), compact->outfile.get(), + compact->compaction->OutputCompressionType())); } - LogFlush(db_options_.info_log); + LogFlush(options_.info_log); return s; } @@ -2559,8 +2487,8 @@ Status 
DBImpl::FinishCompactionOutputFile(CompactionState* compact, compact->builder.reset(); // Finish and check for file errors - if (s.ok() && !db_options_.disableDataSync) { - if (db_options_.use_fsync) { + if (s.ok() && !options_.disableDataSync) { + if (options_.use_fsync) { StopWatch sw(env_, stats_, COMPACTION_OUTFILE_SYNC_MICROS); s = compact->outfile->Fsync(); } else { @@ -2578,11 +2506,11 @@ Status DBImpl::FinishCompactionOutputFile(CompactionState* compact, ColumnFamilyData* cfd = compact->compaction->column_family_data(); FileDescriptor fd(output_number, output_path_id, current_bytes); Iterator* iter = cfd->table_cache()->NewIterator( - ReadOptions(), env_options_, cfd->internal_comparator(), fd); + ReadOptions(), storage_options_, cfd->internal_comparator(), fd); s = iter->status(); delete iter; if (s.ok()) { - Log(db_options_.info_log, "[%s] Generated table #%" PRIu64 ": %" PRIu64 + Log(options_.info_log, "[%s] Generated table #%" PRIu64 ": %" PRIu64 " keys, %" PRIu64 " bytes", cfd->GetName().c_str(), output_number, current_entries, current_bytes); @@ -2593,7 +2521,7 @@ Status DBImpl::FinishCompactionOutputFile(CompactionState* compact, Status DBImpl::InstallCompactionResults(CompactionState* compact, - const MutableCFOptions& mutable_cf_options, LogBuffer* log_buffer) { + LogBuffer* log_buffer) { mutex_.AssertHeld(); // paranoia: verify that the files that we started with @@ -2601,7 +2529,7 @@ Status DBImpl::InstallCompactionResults(CompactionState* compact, // This ensures that a concurrent compaction did not erroneously // pick the same files to compact. 
if (!versions_->VerifyCompactionFileConsistency(compact->compaction)) { - Log(db_options_.info_log, "[%s] Compaction %d@%d + %d@%d files aborted", + Log(options_.info_log, "[%s] Compaction %d@%d + %d@%d files aborted", compact->compaction->column_family_data()->GetName().c_str(), compact->compaction->num_input_files(0), compact->compaction->level(), compact->compaction->num_input_files(1), @@ -2627,7 +2555,6 @@ Status DBImpl::InstallCompactionResults(CompactionState* compact, out.smallest_seqno, out.largest_seqno); } return versions_->LogAndApply(compact->compaction->column_family_data(), - mutable_cf_options, compact->compaction->edit(), &mutex_, db_directory_.get()); } @@ -2651,7 +2578,7 @@ inline SequenceNumber DBImpl::findEarliestVisibleSnapshot( prev = cur; // assignment assert(prev); } - Log(db_options_.info_log, + Log(options_.info_log, "Looking for seqid %" PRIu64 " but maxseqid is %" PRIu64 "", in, snapshots[snapshots.size() - 1]); assert(0); @@ -2659,9 +2586,9 @@ inline SequenceNumber DBImpl::findEarliestVisibleSnapshot( } uint64_t DBImpl::CallFlushDuringCompaction(ColumnFamilyData* cfd, - const MutableCFOptions& mutable_cf_options, DeletionState& deletion_state, - LogBuffer* log_buffer) { - if (db_options_.max_background_flushes > 0) { + DeletionState& deletion_state, + LogBuffer* log_buffer) { + if (options_.max_background_flushes > 0) { // flush thread will take care of this return 0; } @@ -2670,10 +2597,9 @@ uint64_t DBImpl::CallFlushDuringCompaction(ColumnFamilyData* cfd, mutex_.Lock(); if (cfd->imm()->IsFlushPending()) { cfd->Ref(); - FlushMemTableToOutputFile(cfd, mutable_cf_options, nullptr, - deletion_state, log_buffer); + FlushMemTableToOutputFile(cfd, nullptr, deletion_state, log_buffer); cfd->Unref(); - bg_cv_.SignalAll(); // Wakeup DelayWrite() if necessary + bg_cv_.SignalAll(); // Wakeup MakeRoomForWrite() if necessary } mutex_.Unlock(); log_buffer->FlushBufferToLog(); @@ -2683,7 +2609,6 @@ uint64_t 
DBImpl::CallFlushDuringCompaction(ColumnFamilyData* cfd, } Status DBImpl::ProcessKeyValueCompaction( - const MutableCFOptions& mutable_cf_options, bool is_snapshot_supported, SequenceNumber visible_at_tip, SequenceNumber earliest_snapshot, @@ -2707,15 +2632,15 @@ Status DBImpl::ProcessKeyValueCompaction( SequenceNumber visible_in_snapshot = kMaxSequenceNumber; ColumnFamilyData* cfd = compact->compaction->column_family_data(); MergeHelper merge( - cfd->user_comparator(), cfd->ioptions()->merge_operator, - db_options_.info_log.get(), cfd->options()->min_partial_merge_operands, + cfd->user_comparator(), cfd->options()->merge_operator.get(), + options_.info_log.get(), cfd->options()->min_partial_merge_operands, false /* internal key corruption is expected */); - auto compaction_filter = cfd->ioptions()->compaction_filter; + auto compaction_filter = cfd->options()->compaction_filter; std::unique_ptr compaction_filter_from_factory = nullptr; if (!compaction_filter) { auto context = compact->GetFilterContextV1(); compaction_filter_from_factory = - cfd->ioptions()->compaction_filter_factory->CreateCompactionFilter( + cfd->options()->compaction_filter_factory->CreateCompactionFilter( context); compaction_filter = compaction_filter_from_factory.get(); } @@ -2747,8 +2672,7 @@ Status DBImpl::ProcessKeyValueCompaction( // TODO(icanadi) this currently only checks if flush is necessary on // compacting column family. we should also check if flush is necessary on // other column families, too - imm_micros += CallFlushDuringCompaction( - cfd, mutable_cf_options, deletion_state, log_buffer); + imm_micros += CallFlushDuringCompaction(cfd, deletion_state, log_buffer); Slice key; Slice value; @@ -2876,7 +2800,7 @@ Status DBImpl::ProcessKeyValueCompaction( // optimization in BuildTable. 
int steps = 0; merge.MergeUntil(input, prev_snapshot, bottommost_level, - db_options_.statistics.get(), &steps); + options_.statistics.get(), &steps); // Skip the Merge ops combined_idx = combined_idx - 1 + steps; @@ -2949,7 +2873,7 @@ Status DBImpl::ProcessKeyValueCompaction( // Open output file if necessary if (compact->builder == nullptr) { - status = OpenCompactionOutputFile(compact, mutable_cf_options); + status = OpenCompactionOutputFile(compact); if (!status.ok()) { break; } @@ -3086,12 +3010,12 @@ void DBImpl::CallCompactionFilterV2(CompactionState* compact, } Status DBImpl::DoCompactionWork(CompactionState* compact, - const MutableCFOptions& mutable_cf_options, DeletionState& deletion_state, LogBuffer* log_buffer) { assert(compact); compact->CleanupBatchBuffer(); compact->CleanupMergedBuffer(); + bool prefix_initialized = false; // Generate file_levels_ for compaction berfore making Iterator compact->compaction->GenerateFileLevels(); @@ -3103,7 +3027,7 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, cfd->GetName().c_str(), compact->compaction->num_input_files(0), compact->compaction->level(), compact->compaction->num_input_files(1), compact->compaction->output_level(), compact->compaction->score(), - db_options_.max_background_compactions - bg_compaction_scheduled_); + options_.max_background_compactions - bg_compaction_scheduled_); char scratch[2345]; compact->compaction->Summary(scratch, sizeof(scratch)); LogToBuffer(log_buffer, "[%s] Compaction start summary: %s\n", @@ -3143,6 +3067,9 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, const uint64_t start_micros = env_->NowMicros(); unique_ptr input(versions_->MakeInputIterator(compact->compaction)); input->SeekToFirst(); + shared_ptr backup_input( + versions_->MakeInputIterator(compact->compaction)); + backup_input->SeekToFirst(); Status status; ParsedInternalKey ikey; @@ -3150,57 +3077,38 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, = nullptr; auto context = 
compact->GetFilterContext(); compaction_filter_from_factory_v2 = - cfd->ioptions()->compaction_filter_factory_v2-> - CreateCompactionFilterV2(context); + cfd->options()->compaction_filter_factory_v2->CreateCompactionFilterV2( + context); auto compaction_filter_v2 = compaction_filter_from_factory_v2.get(); - if (!compaction_filter_v2) { - status = ProcessKeyValueCompaction( - mutable_cf_options, - is_snapshot_supported, - visible_at_tip, - earliest_snapshot, - latest_snapshot, - deletion_state, - bottommost_level, - imm_micros, - input.get(), - compact, - false, - log_buffer); - } else { - // temp_backup_input always point to the start of the current buffer - // temp_backup_input = backup_input; - // iterate through input, - // 1) buffer ineligible keys and value keys into 2 separate buffers; - // 2) send value_buffer to compaction filter and alternate the values; - // 3) merge value_buffer with ineligible_value_buffer; - // 4) run the modified "compaction" using the old for loop. - bool prefix_initialized = false; - shared_ptr backup_input( - versions_->MakeInputIterator(compact->compaction)); - backup_input->SeekToFirst(); + // temp_backup_input always point to the start of the current buffer + // temp_backup_input = backup_input; + // iterate through input, + // 1) buffer ineligible keys and value keys into 2 separate buffers; + // 2) send value_buffer to compaction filter and alternate the values; + // 3) merge value_buffer with ineligible_value_buffer; + // 4) run the modified "compaction" using the old for loop. + if (compaction_filter_v2) { while (backup_input->Valid() && !shutting_down_.Acquire_Load() && !cfd->IsDropped()) { // FLUSH preempts compaction // TODO(icanadi) this currently only checks if flush is necessary on // compacting column family. 
we should also check if flush is necessary on // other column families, too - imm_micros += CallFlushDuringCompaction(cfd, mutable_cf_options, - deletion_state, log_buffer); + imm_micros += CallFlushDuringCompaction(cfd, deletion_state, log_buffer); Slice key = backup_input->key(); Slice value = backup_input->value(); if (!ParseInternalKey(key, &ikey)) { // log error - Log(db_options_.info_log, "[%s] Failed to parse key: %s", + Log(options_.info_log, "[%s] Failed to parse key: %s", cfd->GetName().c_str(), key.ToString().c_str()); continue; } else { const SliceTransform* transformer = - cfd->ioptions()->compaction_filter_factory_v2->GetPrefixExtractor(); + cfd->options()->compaction_filter_factory_v2->GetPrefixExtractor(); const auto key_prefix = transformer->Transform(ikey.user_key); if (!prefix_initialized) { compact->cur_prefix_ = key_prefix.ToString(); @@ -3238,7 +3146,6 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, // Done buffering for the current prefix. Spit it out to disk // Now just iterate through all the kv-pairs status = ProcessKeyValueCompaction( - mutable_cf_options, is_snapshot_supported, visible_at_tip, earliest_snapshot, @@ -3275,7 +3182,6 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, compact->MergeKeyValueSliceBuffer(&cfd->internal_comparator()); status = ProcessKeyValueCompaction( - mutable_cf_options, is_snapshot_supported, visible_at_tip, earliest_snapshot, @@ -3298,7 +3204,6 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, } compact->MergeKeyValueSliceBuffer(&cfd->internal_comparator()); status = ProcessKeyValueCompaction( - mutable_cf_options, is_snapshot_supported, visible_at_tip, earliest_snapshot, @@ -3312,6 +3217,21 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, log_buffer); } // checking for compaction filter v2 + if (!compaction_filter_v2) { + status = ProcessKeyValueCompaction( + is_snapshot_supported, + visible_at_tip, + earliest_snapshot, + latest_snapshot, + deletion_state, + 
bottommost_level, + imm_micros, + input.get(), + compact, + false, + log_buffer); + } + if (status.ok() && (shutting_down_.Acquire_Load() || cfd->IsDropped())) { status = Status::ShutdownInProgress( "Database shutdown or Column family drop during compaction"); @@ -3324,7 +3244,7 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, } input.reset(); - if (!db_options_.disableDataSync) { + if (!options_.disableDataSync) { db_directory_->Fsync(); } @@ -3356,7 +3276,7 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, RecordCompactionIOStats(); - LogFlush(db_options_.info_log); + LogFlush(options_.info_log); mutex_.Lock(); cfd->internal_stats()->AddCompactionStats( compact->compaction->output_level(), stats); @@ -3366,20 +3286,18 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, ReleaseCompactionUnusedFileNumbers(compact); if (status.ok()) { - status = InstallCompactionResults(compact, mutable_cf_options, log_buffer); - InstallSuperVersion(cfd, deletion_state, mutable_cf_options); + status = InstallCompactionResults(compact, log_buffer); + InstallSuperVersion(cfd, deletion_state); } Version::LevelSummaryStorage tmp; LogToBuffer( log_buffer, - "[%s] compacted to: %s, MB/sec: %.1f rd, %.1f wr, level %d, " - "files in(%d, %d) out(%d) " + "[%s] compacted to: %s, %.1f MB/sec, level %d, files in(%d, %d) out(%d) " "MB in(%.1f, %.1f) out(%.1f), read-write-amplify(%.1f) " "write-amplify(%.1f) %s\n", cfd->GetName().c_str(), cfd->current()->LevelSummary(&tmp), - (stats.bytes_readn + stats.bytes_readnp1) / - static_cast(stats.micros), - stats.bytes_written / static_cast(stats.micros), + (stats.bytes_readn + stats.bytes_readnp1 + stats.bytes_written) / + (double)stats.micros, compact->compaction->output_level(), stats.files_in_leveln, stats.files_in_levelnp1, stats.files_out_levelnp1, stats.bytes_readn / 1048576.0, stats.bytes_readnp1 / 1048576.0, @@ -3428,18 +3346,31 @@ Iterator* DBImpl::NewInternalIterator(const ReadOptions& options, SuperVersion* 
super_version, Arena* arena) { Iterator* internal_iter; - assert(arena != nullptr); - // Need to create internal iterator from the arena. - MergeIteratorBuilder merge_iter_builder(&cfd->internal_comparator(), arena); - // Collect iterator for mutable mem - merge_iter_builder.AddIterator( - super_version->mem->NewIterator(options, arena)); - // Collect all needed child iterators for immutable memtables - super_version->imm->AddIterators(options, &merge_iter_builder); - // Collect iterators for files in L0 - Ln - super_version->current->AddIterators(options, env_options_, - &merge_iter_builder); - internal_iter = merge_iter_builder.Finish(); + if (arena != nullptr) { + // Need to create internal iterator from the arena. + MergeIteratorBuilder merge_iter_builder(&cfd->internal_comparator(), arena); + // Collect iterator for mutable mem + merge_iter_builder.AddIterator( + super_version->mem->NewIterator(options, arena)); + // Collect all needed child iterators for immutable memtables + super_version->imm->AddIterators(options, &merge_iter_builder); + // Collect iterators for files in L0 - Ln + super_version->current->AddIterators(options, storage_options_, + &merge_iter_builder); + internal_iter = merge_iter_builder.Finish(); + } else { + // Need to create internal iterator using malloc. 
+ std::vector iterator_list; + // Collect iterator for mutable mem + iterator_list.push_back(super_version->mem->NewIterator(options)); + // Collect all needed child iterators for immutable memtables + super_version->imm->AddIterators(options, &iterator_list); + // Collect iterators for files in L0 - Ln + super_version->current->AddIterators(options, storage_options_, + &iterator_list); + internal_iter = NewMergingIterator(&cfd->internal_comparator(), + &iterator_list[0], iterator_list.size()); + } IterState* cleanup = new IterState(this, &mutex_, super_version); internal_iter->RegisterCleanup(CleanupIteratorState, cleanup, nullptr); @@ -3466,16 +3397,15 @@ Status DBImpl::Get(const ReadOptions& options, // first call already used it. In that rare case, we take a hit and create a // new SuperVersion() inside of the mutex. We do similar thing // for superversion_to_free -void DBImpl::InstallSuperVersion( - ColumnFamilyData* cfd, DeletionState& deletion_state, - const MutableCFOptions& mutable_cf_options) { +void DBImpl::InstallSuperVersion(ColumnFamilyData* cfd, + DeletionState& deletion_state) { mutex_.AssertHeld(); // if new_superversion == nullptr, it means somebody already used it SuperVersion* new_superversion = (deletion_state.new_superversion != nullptr) ? 
deletion_state.new_superversion : new SuperVersion(); SuperVersion* old_superversion = - cfd->InstallSuperVersion(new_superversion, &mutex_, mutable_cf_options); + cfd->InstallSuperVersion(new_superversion, &mutex_); deletion_state.new_superversion = nullptr; deletion_state.superversions_to_free.push_back(old_superversion); } @@ -3509,10 +3439,10 @@ Status DBImpl::GetImpl(const ReadOptions& options, LookupKey lkey(key, snapshot); PERF_TIMER_STOP(get_snapshot_time); - if (sv->mem->Get(lkey, value, &s, &merge_context)) { + if (sv->mem->Get(lkey, value, &s, merge_context, *cfd->options())) { // Done RecordTick(stats_, MEMTABLE_HIT); - } else if (sv->imm->Get(lkey, value, &s, &merge_context)) { + } else if (sv->imm->Get(lkey, value, &s, merge_context, *cfd->options())) { // Done RecordTick(stats_, MEMTABLE_HIT); } else { @@ -3597,9 +3527,12 @@ std::vector DBImpl::MultiGet( assert(mgd_iter != multiget_cf_data.end()); auto mgd = mgd_iter->second; auto super_version = mgd->super_version; - if (super_version->mem->Get(lkey, value, &s, &merge_context)) { + auto cfd = mgd->cfd; + if (super_version->mem->Get(lkey, value, &s, merge_context, + *cfd->options())) { // Done - } else if (super_version->imm->Get(lkey, value, &s, &merge_context)) { + } else if (super_version->imm->Get(lkey, value, &s, merge_context, + *cfd->options())) { // Done } else { super_version->current->Get(options, lkey, value, &s, &merge_context); @@ -3659,24 +3592,21 @@ Status DBImpl::CreateColumnFamily(const ColumnFamilyOptions& options, // LogAndApply will both write the creation in MANIFEST and create // ColumnFamilyData object - Options opt(db_options_, options); - Status s = versions_->LogAndApply(nullptr, - MutableCFOptions(opt, ImmutableCFOptions(opt)), - &edit, &mutex_, db_directory_.get(), false, &options); + Status s = versions_->LogAndApply(nullptr, &edit, &mutex_, + db_directory_.get(), false, &options); if (s.ok()) { single_column_family_mode_ = false; auto cfd = 
versions_->GetColumnFamilySet()->GetColumnFamily(column_family_name); assert(cfd != nullptr); - delete cfd->InstallSuperVersion(new SuperVersion(), &mutex_, - *cfd->GetLatestMutableCFOptions()); + delete cfd->InstallSuperVersion(new SuperVersion(), &mutex_); *handle = new ColumnFamilyHandleImpl(cfd, this, &mutex_); - Log(db_options_.info_log, "Created column family [%s] (ID %u)", + Log(options_.info_log, "Created column family [%s] (ID %u)", column_family_name.c_str(), (unsigned)cfd->GetID()); max_total_in_memory_state_ += cfd->options()->write_buffer_size * cfd->options()->max_write_buffer_number; } else { - Log(db_options_.info_log, "Creating column family [%s] FAILED -- %s", + Log(options_.info_log, "Creating column family [%s] FAILED -- %s", column_family_name.c_str(), s.ToString().c_str()); } return s; @@ -3693,7 +3623,6 @@ Status DBImpl::DropColumnFamily(ColumnFamilyHandle* column_family) { edit.DropColumnFamily(); edit.SetColumnFamily(cfd->GetID()); - Status s; { MutexLock l(&mutex_); @@ -3701,13 +3630,7 @@ Status DBImpl::DropColumnFamily(ColumnFamilyHandle* column_family) { s = Status::InvalidArgument("Column family already dropped!\n"); } if (s.ok()) { - // we drop column family from a single write thread - WriteThread::Writer w(&mutex_); - s = write_thread_.EnterWriteThread(&w, 0); - assert(s.ok() && !w.done); // No timeout and nobody should do our job - s = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), - &edit, &mutex_); - write_thread_.ExitWriteThread(&w, &w, s); + s = versions_->LogAndApply(cfd, &edit, &mutex_); } } @@ -3715,11 +3638,9 @@ Status DBImpl::DropColumnFamily(ColumnFamilyHandle* column_family) { assert(cfd->IsDropped()); max_total_in_memory_state_ -= cfd->options()->write_buffer_size * cfd->options()->max_write_buffer_number; - Log(db_options_.info_log, "Dropped column family with id %u\n", - cfd->GetID()); + Log(options_.info_log, "Dropped column family with id %u\n", cfd->GetID()); } else { - Log(db_options_.info_log, - 
"Dropping column family with id %u FAILED -- %s\n", + Log(options_.info_log, "Dropping column family with id %u FAILED -- %s\n", cfd->GetID(), s.ToString().c_str()); } @@ -3743,21 +3664,21 @@ bool DBImpl::KeyMayExist(const ReadOptions& options, return s.ok() || s.IsIncomplete(); } -Iterator* DBImpl::NewIterator(const ReadOptions& read_options, +Iterator* DBImpl::NewIterator(const ReadOptions& options, ColumnFamilyHandle* column_family) { auto cfh = reinterpret_cast(column_family); auto cfd = cfh->cfd(); - if (read_options.tailing) { + if (options.tailing) { #ifdef ROCKSDB_LITE // not supported in lite version return nullptr; #else - auto iter = new ForwardIterator(this, read_options, cfd); - return NewDBIterator(env_, *cfd->ioptions(), cfd->user_comparator(), iter, - kMaxSequenceNumber, - cfd->options()->max_sequential_skip_in_iterations, - read_options.iterate_upper_bound); + // TODO(ljin): remove tailing iterator + auto iter = new ForwardIterator(this, options, cfd); + return NewDBIterator(env_, *cfd->options(), cfd->user_comparator(), iter, + kMaxSequenceNumber); +// return new TailingIterator(env_, this, options, cfd); #endif } else { SequenceNumber latest_snapshot = versions_->LastSequence(); @@ -3765,9 +3686,8 @@ Iterator* DBImpl::NewIterator(const ReadOptions& read_options, sv = cfd->GetReferencedSuperVersion(&mutex_); auto snapshot = - read_options.snapshot != nullptr - ? reinterpret_cast( - read_options.snapshot)->number_ + options.snapshot != nullptr + ? reinterpret_cast(options.snapshot)->number_ : latest_snapshot; // Try to generate a DB iterator tree in continuous memory area to be @@ -3813,22 +3733,17 @@ Iterator* DBImpl::NewIterator(const ReadOptions& read_options, // likely that any iterator pointer is close to the iterator it points to so // that they are likely to be in the same cache line and/or page. 
ArenaWrappedDBIter* db_iter = NewArenaWrappedDbIterator( - env_, *cfd->ioptions(), cfd->user_comparator(), - snapshot, cfd->options()->max_sequential_skip_in_iterations, - read_options.iterate_upper_bound); - + env_, *cfd->options(), cfd->user_comparator(), snapshot); Iterator* internal_iter = - NewInternalIterator(read_options, cfd, sv, db_iter->GetArena()); + NewInternalIterator(options, cfd, sv, db_iter->GetArena()); db_iter->SetIterUnderDBIter(internal_iter); return db_iter; } - // To stop compiler from complaining - return nullptr; } Status DBImpl::NewIterators( - const ReadOptions& read_options, + const ReadOptions& options, const std::vector& column_families, std::vector* iterators) { iterators->clear(); @@ -3837,7 +3752,7 @@ Status DBImpl::NewIterators( std::vector super_versions; super_versions.reserve(column_families.size()); - if (!read_options.tailing) { + if (!options.tailing) { mutex_.Lock(); latest_snapshot = versions_->LastSequence(); for (auto cfh : column_families) { @@ -3847,18 +3762,17 @@ Status DBImpl::NewIterators( mutex_.Unlock(); } - if (read_options.tailing) { + if (options.tailing) { #ifdef ROCKSDB_LITE return Status::InvalidArgument( "Tailing interator not supported in RocksDB lite"); #else for (auto cfh : column_families) { auto cfd = reinterpret_cast(cfh)->cfd(); - auto iter = new ForwardIterator(this, read_options, cfd); + auto iter = new ForwardIterator(this, options, cfd); iterators->push_back( - NewDBIterator(env_, *cfd->ioptions(), cfd->user_comparator(), iter, - kMaxSequenceNumber, - cfd->options()->max_sequential_skip_in_iterations)); + NewDBIterator(env_, *cfd->options(), cfd->user_comparator(), iter, + kMaxSequenceNumber)); } #endif } else { @@ -3867,18 +3781,14 @@ Status DBImpl::NewIterators( auto cfd = cfh->cfd(); auto snapshot = - read_options.snapshot != nullptr - ? reinterpret_cast( - read_options.snapshot)->number_ + options.snapshot != nullptr + ? 
reinterpret_cast(options.snapshot)->number_ : latest_snapshot; - ArenaWrappedDBIter* db_iter = NewArenaWrappedDbIterator( - env_, *cfd->ioptions(), cfd->user_comparator(), snapshot, - cfd->options()->max_sequential_skip_in_iterations); - Iterator* internal_iter = NewInternalIterator( - read_options, cfd, super_versions[i], db_iter->GetArena()); - db_iter->SetIterUnderDBIter(internal_iter); - iterators->push_back(db_iter); + auto iter = NewInternalIterator(options, cfd, super_versions[i]); + iter = NewDBIterator(env_, *cfd->options(), + cfd->user_comparator(), iter, snapshot); + iterators->push_back(iter); } } @@ -3915,7 +3825,7 @@ Status DBImpl::Put(const WriteOptions& o, ColumnFamilyHandle* column_family, Status DBImpl::Merge(const WriteOptions& o, ColumnFamilyHandle* column_family, const Slice& key, const Slice& val) { auto cfh = reinterpret_cast(column_family); - if (!cfh->cfd()->ioptions()->merge_operator) { + if (!cfh->cfd()->options()->merge_operator) { return Status::NotSupported("Provide a merge_operator when opening DB"); } else { return DB::Merge(o, column_family, key, val); @@ -3927,12 +3837,88 @@ Status DBImpl::Delete(const WriteOptions& options, return DB::Delete(options, column_family, key); } +// REQUIRES: mutex_ is held +Status DBImpl::BeginWrite(Writer* w, uint64_t expiration_time) { + // the following code block pushes the current writer "w" into the writer + // queue "writers_" and wait until one of the following conditions met: + // 1. the job of "w" has been done by some other writers. + // 2. "w" becomes the first writer in "writers_" + // 3. "w" timed-out. + mutex_.AssertHeld(); + writers_.push_back(w); + + bool timed_out = false; + while (!w->done && w != writers_.front()) { + if (expiration_time == 0) { + w->cv.Wait(); + } else if (w->cv.TimedWait(expiration_time)) { + if (w->in_batch_group) { + // then it means the front writer is currently doing the + // write on behalf of this "timed-out" writer. 
Then it + // should wait until the write completes. + expiration_time = 0; + } else { + timed_out = true; + break; + } + } + } + + if (timed_out) { +#ifndef NDEBUG + bool found = false; +#endif + for (auto iter = writers_.begin(); iter != writers_.end(); iter++) { + if (*iter == w) { + writers_.erase(iter); +#ifndef NDEBUG + found = true; +#endif + break; + } + } +#ifndef NDEBUG + assert(found); +#endif + // writers_.front() might still be in cond_wait without a time-out. + // As a result, we need to signal it to wake it up. Otherwise no + // one else will wake him up, and RocksDB will hang. + if (!writers_.empty()) { + writers_.front()->cv.Signal(); + } + return Status::TimedOut(); + } + return Status::OK(); +} + +// REQUIRES: mutex_ is held +void DBImpl::EndWrite(Writer* w, Writer* last_writer, Status status) { + // Pop out the current writer and all writers being pushed before the + // current writer from the writer queue. + mutex_.AssertHeld(); + while (!writers_.empty()) { + Writer* ready = writers_.front(); + writers_.pop_front(); + if (ready != w) { + ready->status = status; + ready->done = true; + ready->cv.Signal(); + } + if (ready == last_writer) break; + } + + // Notify new head of write queue + if (!writers_.empty()) { + writers_.front()->cv.Signal(); + } +} + Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { if (my_batch == nullptr) { return Status::Corruption("Batch is nullptr!"); } PERF_TIMER_GUARD(write_pre_and_post_process_time); - WriteThread::Writer w(&mutex_); + Writer w(&mutex_); w.batch = my_batch; w.sync = options.sync; w.disableWAL = options.disableWAL; @@ -3941,12 +3927,10 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { w.timeout_hint_us = options.timeout_hint_us; uint64_t expiration_time = 0; - bool has_timeout = false; if (w.timeout_hint_us == 0) { - w.timeout_hint_us = WriteThread::kNoTimeOut; + w.timeout_hint_us = kNoTimeOut; } else { expiration_time = env_->NowMicros() + 
w.timeout_hint_us; - has_timeout = true; } if (!options.disableWAL) { @@ -3956,7 +3940,7 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { WriteContext context; mutex_.Lock(); - Status status = write_thread_.EnterWriteThread(&w, expiration_time); + Status status = BeginWrite(&w, expiration_time); assert(status.ok() || status.IsTimedOut()); if (status.IsTimedOut()) { mutex_.Unlock(); @@ -3981,55 +3965,59 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { assert(!single_column_family_mode_ || versions_->GetColumnFamilySet()->NumberOfColumnFamilies() == 1); - uint64_t max_total_wal_size = (db_options_.max_total_wal_size == 0) + uint64_t flush_column_family_if_log_file = 0; + uint64_t max_total_wal_size = (options_.max_total_wal_size == 0) ? 4 * max_total_in_memory_state_ - : db_options_.max_total_wal_size; + : options_.max_total_wal_size; if (UNLIKELY(!single_column_family_mode_) && alive_log_files_.begin()->getting_flushed == false && total_log_size_ > max_total_wal_size) { - uint64_t flush_column_family_if_log_file = alive_log_files_.begin()->number; + flush_column_family_if_log_file = alive_log_files_.begin()->number; alive_log_files_.begin()->getting_flushed = true; - Log(db_options_.info_log, + Log(options_.info_log, "Flushing all column families with data in WAL number %" PRIu64 ". 
Total log size is %" PRIu64 " while max_total_wal_size is %" PRIu64, flush_column_family_if_log_file, total_log_size_, max_total_wal_size); - // no need to refcount because drop is happening in write thread, so can't - // happen while we're in the write thread + } + + if (LIKELY(single_column_family_mode_)) { + // fast path + status = MakeRoomForWrite(default_cf_handle_->cfd(), + &context, expiration_time); + } else { + // refcounting cfd in iteration + bool dead_cfd = false; for (auto cfd : *versions_->GetColumnFamilySet()) { - if (cfd->GetLogNumber() <= flush_column_family_if_log_file) { + cfd->Ref(); + if (flush_column_family_if_log_file != 0 && + cfd->GetLogNumber() <= flush_column_family_if_log_file) { + // log size excedded limit and we need to do flush + // SetNewMemtableAndNewLogFie may temporarily unlock and wait status = SetNewMemtableAndNewLogFile(cfd, &context); - if (!status.ok()) { - break; - } cfd->imm()->FlushRequested(); + MaybeScheduleFlushOrCompaction(); + } else { + // May temporarily unlock and wait. 
+ status = MakeRoomForWrite(cfd, &context, expiration_time); + } + + if (cfd->Unref()) { + dead_cfd = true; + } + if (!status.ok()) { + break; } } - MaybeScheduleFlushOrCompaction(); - } - - if (UNLIKELY(status.ok() && !bg_error_.ok())) { - status = bg_error_; - } - - if (UNLIKELY(status.ok() && !flush_scheduler_.Empty())) { - status = ScheduleFlushes(&context); - } - - if (UNLIKELY(status.ok()) && - (write_controller_.IsStopped() || write_controller_.GetDelay() > 0)) { - DelayWrite(expiration_time); - } - - if (UNLIKELY(status.ok() && has_timeout && - env_->NowMicros() > expiration_time)) { - status = Status::TimedOut(); + if (dead_cfd) { + versions_->GetColumnFamilySet()->FreeDeadColumnFamilies(); + } } uint64_t last_sequence = versions_->LastSequence(); - WriteThread::Writer* last_writer = &w; + Writer* last_writer = &w; if (status.ok()) { autovector write_batch_group; - write_thread_.BuildBatchGroup(&last_writer, &write_batch_group); + BuildBatchGroup(&last_writer, &write_batch_group); // Add to log and apply to memtable. 
We can release the lock // during this phase since &w is currently responsible for logging @@ -4069,13 +4057,14 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { alive_log_files_.back().AddSize(log_entry.size()); log_empty_ = false; log_size = log_entry.size(); + RecordTick(stats_, WAL_FILE_SYNCED); RecordTick(stats_, WAL_FILE_BYTES, log_size); if (status.ok() && options.sync) { - RecordTick(stats_, WAL_FILE_SYNCED); - StopWatch sw(env_, stats_, WAL_FILE_SYNC_MICROS); - if (db_options_.use_fsync) { + if (options_.use_fsync) { + StopWatch(env_, stats_, WAL_FILE_SYNC_MICROS); status = log_->file()->Fsync(); } else { + StopWatch(env_, stats_, WAL_FILE_SYNC_MICROS); status = log_->file()->Sync(); } } @@ -4115,12 +4104,12 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { } } } - if (db_options_.paranoid_checks && !status.ok() && + if (options_.paranoid_checks && !status.ok() && !status.IsTimedOut() && bg_error_.ok()) { bg_error_ = status; // stop compaction & fail any further writes } - write_thread_.ExitWriteThread(&w, last_writer, status); + EndWrite(&w, last_writer, status); mutex_.Unlock(); if (status.IsTimedOut()) { @@ -4130,47 +4119,241 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { return status; } -// REQUIRES: mutex_ is held -// REQUIRES: this thread is currently at the front of the writer queue -void DBImpl::DelayWrite(uint64_t expiration_time) { - StopWatch sw(env_, stats_, WRITE_STALL); - bool has_timeout = (expiration_time > 0); - auto delay = write_controller_.GetDelay(); - if (write_controller_.IsStopped() == false && delay > 0) { - mutex_.Unlock(); - env_->SleepForMicroseconds(delay); - mutex_.Lock(); +// This function will be called only when the first writer succeeds. +// All writers in the to-be-built batch group will be processed. 
+// +// REQUIRES: Writer list must be non-empty +// REQUIRES: First writer must have a non-nullptr batch +void DBImpl::BuildBatchGroup(Writer** last_writer, + autovector* write_batch_group) { + assert(!writers_.empty()); + Writer* first = writers_.front(); + assert(first->batch != nullptr); + + size_t size = WriteBatchInternal::ByteSize(first->batch); + write_batch_group->push_back(first->batch); + + // Allow the group to grow up to a maximum size, but if the + // original write is small, limit the growth so we do not slow + // down the small write too much. + size_t max_size = 1 << 20; + if (size <= (128<<10)) { + max_size = size + (128<<10); } - while (write_controller_.IsStopped()) { - if (has_timeout) { - bg_cv_.TimedWait(expiration_time); - if (env_->NowMicros() > expiration_time) { - break; - } - } else { - bg_cv_.Wait(); + *last_writer = first; + std::deque::iterator iter = writers_.begin(); + ++iter; // Advance past "first" + for (; iter != writers_.end(); ++iter) { + Writer* w = *iter; + if (w->sync && !first->sync) { + // Do not include a sync write into a batch handled by a non-sync write. + break; } + + if (!w->disableWAL && first->disableWAL) { + // Do not include a write that needs WAL into a batch that has + // WAL disabled. + break; + } + + if (w->timeout_hint_us < first->timeout_hint_us) { + // Do not include those writes with shorter timeout. Otherwise, we might + // execute a write that should instead be aborted because of timeout. + break; + } + + if (w->batch == nullptr) { + // Do not include those writes with nullptr batch. Those are not writes, + // those are something else. 
They want to be alone + break; + } + + size += WriteBatchInternal::ByteSize(w->batch); + if (size > max_size) { + // Do not make batch too big + break; + } + + write_batch_group->push_back(w->batch); + w->in_batch_group = true; + *last_writer = w; } } -Status DBImpl::ScheduleFlushes(WriteContext* context) { - bool schedule_bg_work = false; - ColumnFamilyData* cfd; - while ((cfd = flush_scheduler_.GetNextColumnFamily()) != nullptr) { - schedule_bg_work = true; - auto status = SetNewMemtableAndNewLogFile(cfd, context); - if (cfd->Unref()) { - delete cfd; - } - if (!status.ok()) { - return status; +// This function computes the amount of time in microseconds by which a write +// should be delayed based on the number of level-0 files according to the +// following formula: +// if n < bottom, return 0; +// if n >= top, return 1000; +// otherwise, let r = (n - bottom) / +// (top - bottom) +// and return r^2 * 1000. +// The goal of this formula is to gradually increase the rate at which writes +// are slowed. We also tried linear delay (r * 1000), but it seemed to do +// slightly worse. There is no other particular reason for choosing quadratic. +uint64_t DBImpl::SlowdownAmount(int n, double bottom, double top) { + uint64_t delay; + if (n >= top) { + delay = 1000; + } + else if (n < bottom) { + delay = 0; + } + else { + // If we are here, we know that: + // level0_start_slowdown <= n < level0_slowdown + // since the previous two conditions are false. 
+ double how_much = + (double) (n - bottom) / + (top - bottom); + delay = std::max(how_much * how_much * 1000, 100.0); + } + assert(delay <= 1000); + return delay; +} + +// REQUIRES: mutex_ is held +// REQUIRES: this thread is currently at the front of the writer queue +Status DBImpl::MakeRoomForWrite(ColumnFamilyData* cfd, + WriteContext* context, + uint64_t expiration_time) { + mutex_.AssertHeld(); + assert(!writers_.empty()); + bool allow_delay = true; + bool allow_hard_rate_limit_delay = true; + bool allow_soft_rate_limit_delay = true; + uint64_t rate_limit_delay_millis = 0; + Status s; + double score; + // Once we schedule background work, we shouldn't schedule it again, since it + // might generate a tight feedback loop, constantly scheduling more background + // work, even if additional background work is not needed + bool schedule_background_work = true; + bool has_timeout = (expiration_time > 0); + + while (true) { + if (!bg_error_.ok()) { + // Yield previous error + s = bg_error_; + break; + } else if (has_timeout && env_->NowMicros() > expiration_time) { + s = Status::TimedOut(); + break; + } else if (allow_delay && cfd->NeedSlowdownForNumLevel0Files()) { + // We are getting close to hitting a hard limit on the number of + // L0 files. Rather than delaying a single write by several + // seconds when we hit the hard limit, start delaying each + // individual write by 0-1ms to reduce latency variance. Also, + // this delay hands over some CPU to the compaction thread in + // case it is sharing the same core as the writer. 
+ uint64_t slowdown = + SlowdownAmount(cfd->current()->NumLevelFiles(0), + cfd->options()->level0_slowdown_writes_trigger, + cfd->options()->level0_stop_writes_trigger); + mutex_.Unlock(); + uint64_t delayed; + { + StopWatch sw(env_, stats_, STALL_L0_SLOWDOWN_COUNT, &delayed); + env_->SleepForMicroseconds(slowdown); + } + RecordTick(stats_, STALL_L0_SLOWDOWN_MICROS, delayed); + allow_delay = false; // Do not delay a single write more than once + mutex_.Lock(); + cfd->internal_stats()->AddCFStats( + InternalStats::LEVEL0_SLOWDOWN, delayed); + delayed_writes_++; + } else if (!cfd->mem()->ShouldFlush()) { + // There is room in current memtable + if (allow_delay) { + DelayLoggingAndReset(); + } + break; + } else if (cfd->NeedWaitForNumMemtables()) { + // We have filled up the current memtable, but the previous + // ones are still being flushed, so we wait. + DelayLoggingAndReset(); + Log(options_.info_log, "[%s] wait for memtable flush...\n", + cfd->GetName().c_str()); + if (schedule_background_work) { + MaybeScheduleFlushOrCompaction(); + schedule_background_work = false; + } + uint64_t stall; + { + StopWatch sw(env_, stats_, STALL_MEMTABLE_COMPACTION_COUNT, &stall); + if (!has_timeout) { + bg_cv_.Wait(); + } else { + bg_cv_.TimedWait(expiration_time); + } + } + RecordTick(stats_, STALL_MEMTABLE_COMPACTION_MICROS, stall); + cfd->internal_stats()->AddCFStats( + InternalStats::MEMTABLE_COMPACTION, stall); + } else if (cfd->NeedWaitForNumLevel0Files()) { + DelayLoggingAndReset(); + Log(options_.info_log, "[%s] wait for fewer level0 files...\n", + cfd->GetName().c_str()); + uint64_t stall; + { + StopWatch sw(env_, stats_, STALL_L0_NUM_FILES_COUNT, &stall); + if (!has_timeout) { + bg_cv_.Wait(); + } else { + bg_cv_.TimedWait(expiration_time); + } + } + RecordTick(stats_, STALL_L0_NUM_FILES_MICROS, stall); + cfd->internal_stats()->AddCFStats( + InternalStats::LEVEL0_NUM_FILES, stall); + } else if (allow_hard_rate_limit_delay && cfd->ExceedsHardRateLimit()) { + // Delay a 
write when the compaction score for any level is too large. + const int max_level = cfd->current()->MaxCompactionScoreLevel(); + score = cfd->current()->MaxCompactionScore(); + mutex_.Unlock(); + uint64_t delayed; + { + StopWatch sw(env_, stats_, HARD_RATE_LIMIT_DELAY_COUNT, &delayed); + env_->SleepForMicroseconds(1000); + } + // Make sure the following value doesn't round to zero. + uint64_t rate_limit = std::max((delayed / 1000), (uint64_t) 1); + rate_limit_delay_millis += rate_limit; + RecordTick(stats_, RATE_LIMIT_DELAY_MILLIS, rate_limit); + if (cfd->options()->rate_limit_delay_max_milliseconds > 0 && + rate_limit_delay_millis >= + (unsigned)cfd->options()->rate_limit_delay_max_milliseconds) { + allow_hard_rate_limit_delay = false; + } + mutex_.Lock(); + cfd->internal_stats()->RecordLevelNSlowdown(max_level, delayed, false); + } else if (allow_soft_rate_limit_delay && cfd->ExceedsSoftRateLimit()) { + const int max_level = cfd->current()->MaxCompactionScoreLevel(); + score = cfd->current()->MaxCompactionScore(); + // Delay a write when the compaction score for any level is too large. + // TODO: add statistics + uint64_t slowdown = SlowdownAmount(score, cfd->options()->soft_rate_limit, + cfd->options()->hard_rate_limit); + uint64_t elapsed = 0; + mutex_.Unlock(); + { + StopWatch sw(env_, stats_, SOFT_RATE_LIMIT_DELAY_COUNT, &elapsed); + env_->SleepForMicroseconds(slowdown); + rate_limit_delay_millis += slowdown; + } + allow_soft_rate_limit_delay = false; + mutex_.Lock(); + cfd->internal_stats()->RecordLevelNSlowdown(max_level, elapsed, true); + } else { + s = SetNewMemtableAndNewLogFile(cfd, context); + if (!s.ok()) { + break; + } + MaybeScheduleFlushOrCompaction(); } } - if (schedule_bg_work) { - MaybeScheduleFlushOrCompaction(); - } - return Status::OK(); + return s; } // REQUIRES: mutex_ is held @@ -4189,14 +4372,14 @@ Status DBImpl::SetNewMemtableAndNewLogFile(ColumnFamilyData* cfd, uint64_t new_log_number = creating_new_log ? 
versions_->NewFileNumber() : logfile_number_; SuperVersion* new_superversion = nullptr; - const MutableCFOptions mutable_cf_options = *cfd->GetLatestMutableCFOptions(); mutex_.Unlock(); Status s; { + DelayLoggingAndReset(); if (creating_new_log) { - s = env_->NewWritableFile( - LogFileName(db_options_.wal_dir, new_log_number), - &lfile, env_->OptimizeForLogWrite(env_options_)); + s = env_->NewWritableFile(LogFileName(options_.wal_dir, new_log_number), + &lfile, + env_->OptimizeForLogWrite(storage_options_)); if (s.ok()) { // Our final size should be less than write_buffer_size // (compression, etc) but err on the side of caution. @@ -4207,9 +4390,7 @@ Status DBImpl::SetNewMemtableAndNewLogFile(ColumnFamilyData* cfd, } if (s.ok()) { - new_mem = new MemTable(cfd->internal_comparator(), - *cfd->ioptions(), MemTableOptions(mutable_cf_options, - *cfd->options())); + new_mem = new MemTable(cfd->internal_comparator(), *cfd->options()); new_superversion = new SuperVersion(); } } @@ -4245,11 +4426,11 @@ Status DBImpl::SetNewMemtableAndNewLogFile(ColumnFamilyData* cfd, cfd->imm()->Add(cfd->mem()); new_mem->Ref(); cfd->SetMemtable(new_mem); - Log(db_options_.info_log, + Log(options_.info_log, "[%s] New memtable created with log file: #%" PRIu64 "\n", cfd->GetName().c_str(), logfile_number_); context->superversions_to_free_.push_back( - cfd->InstallSuperVersion(new_superversion, &mutex_, mutable_cf_options)); + cfd->InstallSuperVersion(new_superversion, &mutex_)); return s; } @@ -4350,12 +4531,20 @@ bool DBImpl::GetIntPropertyInternal(ColumnFamilyHandle* column_family, SuperVersion* DBImpl::GetAndRefSuperVersion(ColumnFamilyData* cfd) { // TODO(ljin): consider using GetReferencedSuperVersion() directly - return cfd->GetThreadLocalSuperVersion(&mutex_); + if (LIKELY(options_.allow_thread_local)) { + return cfd->GetThreadLocalSuperVersion(&mutex_); + } else { + MutexLock l(&mutex_); + return cfd->GetSuperVersion()->Ref(); + } } void 
DBImpl::ReturnAndCleanupSuperVersion(ColumnFamilyData* cfd, SuperVersion* sv) { - bool unref_sv = !cfd->ReturnThreadLocalSuperVersion(sv); + bool unref_sv = true; + if (LIKELY(options_.allow_thread_local)) { + unref_sv = !cfd->ReturnThreadLocalSuperVersion(sv); + } if (unref_sv) { // Release SuperVersion @@ -4398,6 +4587,13 @@ void DBImpl::GetApproximateSizes(ColumnFamilyHandle* column_family, } } +inline void DBImpl::DelayLoggingAndReset() { + if (delayed_writes_ > 0) { + Log(options_.info_log, "delayed %d write...\n", delayed_writes_ ); + delayed_writes_ = 0; + } +} + #ifndef ROCKSDB_LITE Status DBImpl::GetUpdatesSince( SequenceNumber seq, unique_ptr* iter, @@ -4420,8 +4616,8 @@ Status DBImpl::GetUpdatesSince( if (!s.ok()) { return s; } - iter->reset(new TransactionLogIteratorImpl(db_options_.wal_dir, &db_options_, - read_options, env_options_, + iter->reset(new TransactionLogIteratorImpl(options_.wal_dir, &options_, + read_options, storage_options_, seq, std::move(wal_files), this)); return (*iter)->status(); } @@ -4432,7 +4628,7 @@ Status DBImpl::DeleteFile(std::string name) { WalFileType log_type; if (!ParseFileName(name, &number, &type, &log_type) || (type != kTableFile && type != kLogFile)) { - Log(db_options_.info_log, "DeleteFile %s failed.\n", name.c_str()); + Log(options_.info_log, "DeleteFile %s failed.\n", name.c_str()); return Status::InvalidArgument("Invalid file name"); } @@ -4440,13 +4636,13 @@ Status DBImpl::DeleteFile(std::string name) { if (type == kLogFile) { // Only allow deleting archived log files if (log_type != kArchivedLogFile) { - Log(db_options_.info_log, "DeleteFile %s failed - not archived log.\n", + Log(options_.info_log, "DeleteFile %s failed - not archived log.\n", name.c_str()); return Status::NotSupported("Delete only supported for archived logs"); } - status = env_->DeleteFile(db_options_.wal_dir + "/" + name.c_str()); + status = env_->DeleteFile(options_.wal_dir + "/" + name.c_str()); if (!status.ok()) { - 
Log(db_options_.info_log, "DeleteFile %s failed -- %s.\n", + Log(options_.info_log, "DeleteFile %s failed -- %s.\n", name.c_str(), status.ToString().c_str()); } return status; @@ -4461,7 +4657,7 @@ Status DBImpl::DeleteFile(std::string name) { MutexLock l(&mutex_); status = versions_->GetMetadataForFile(number, &level, &metadata, &cfd); if (!status.ok()) { - Log(db_options_.info_log, "DeleteFile %s failed. File not found\n", + Log(options_.info_log, "DeleteFile %s failed. File not found\n", name.c_str()); return Status::InvalidArgument("File not found"); } @@ -4469,7 +4665,7 @@ Status DBImpl::DeleteFile(std::string name) { // If the file is being compacted no need to delete. if (metadata->being_compacted) { - Log(db_options_.info_log, + Log(options_.info_log, "DeleteFile %s Skipped. File about to be compacted\n", name.c_str()); return Status::OK(); } @@ -4479,21 +4675,19 @@ Status DBImpl::DeleteFile(std::string name) { // lost. Check that the level passed is the last level. for (int i = level + 1; i < cfd->NumberLevels(); i++) { if (cfd->current()->NumLevelFiles(i) != 0) { - Log(db_options_.info_log, + Log(options_.info_log, "DeleteFile %s FAILED. 
File not in last level\n", name.c_str()); return Status::InvalidArgument("File not in last level"); } } edit.DeleteFile(level, number); - status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), - &edit, &mutex_, db_directory_.get()); + status = versions_->LogAndApply(cfd, &edit, &mutex_, db_directory_.get()); if (status.ok()) { - InstallSuperVersion(cfd, deletion_state, - *cfd->GetLatestMutableCFOptions()); + InstallSuperVersion(cfd, deletion_state); } FindObsoleteFiles(deletion_state, false); } // lock released here - LogFlush(db_options_.info_log); + LogFlush(options_.info_log); // remove files outside the db-lock if (deletion_state.HaveSomethingToDelete()) { PurgeObsoleteFiles(deletion_state); @@ -4655,9 +4849,9 @@ Status DB::Open(const DBOptions& db_options, const std::string& dbname, } DBImpl* impl = new DBImpl(db_options, dbname); - s = impl->env_->CreateDirIfMissing(impl->db_options_.wal_dir); + s = impl->env_->CreateDirIfMissing(impl->options_.wal_dir); if (s.ok()) { - for (auto db_path : impl->db_options_.db_paths) { + for (auto db_path : impl->options_.db_paths) { s = impl->env_->CreateDirIfMissing(db_path.path); if (!s.ok()) { break; @@ -4682,9 +4876,9 @@ Status DB::Open(const DBOptions& db_options, const std::string& dbname, uint64_t new_log_number = impl->versions_->NewFileNumber(); unique_ptr lfile; EnvOptions soptions(db_options); - s = impl->db_options_.env->NewWritableFile( - LogFileName(impl->db_options_.wal_dir, new_log_number), &lfile, - impl->db_options_.env->OptimizeForLogWrite(soptions)); + s = impl->options_.env->NewWritableFile( + LogFileName(impl->options_.wal_dir, new_log_number), &lfile, + impl->options_.env->OptimizeForLogWrite(soptions)); if (s.ok()) { lfile->SetPreallocationBlockSize(1.1 * max_write_buffer_size); impl->logfile_number_ = new_log_number; @@ -4718,8 +4912,7 @@ Status DB::Open(const DBOptions& db_options, const std::string& dbname, } if (s.ok()) { for (auto cfd : *impl->versions_->GetColumnFamilySet()) { - 
delete cfd->InstallSuperVersion(new SuperVersion(), &impl->mutex_, - *cfd->GetLatestMutableCFOptions()); + delete cfd->InstallSuperVersion(new SuperVersion(), &impl->mutex_); } impl->alive_log_files_.push_back( DBImpl::LogFileNumberSize(impl->logfile_number_)); @@ -4731,8 +4924,8 @@ Status DB::Open(const DBOptions& db_options, const std::string& dbname, if (s.ok()) { for (auto cfd : *impl->versions_->GetColumnFamilySet()) { - if (cfd->ioptions()->compaction_style == kCompactionStyleUniversal || - cfd->ioptions()->compaction_style == kCompactionStyleFIFO) { + if (cfd->options()->compaction_style == kCompactionStyleUniversal || + cfd->options()->compaction_style == kCompactionStyleFIFO) { Version* current = cfd->current(); for (int i = 1; i < current->NumberLevels(); ++i) { int num_files = current->NumLevelFiles(i); @@ -4744,7 +4937,7 @@ Status DB::Open(const DBOptions& db_options, const std::string& dbname, } } } - if (cfd->ioptions()->merge_operator != nullptr && + if (cfd->options()->merge_operator != nullptr && !cfd->mem()->IsMergeOperatorSupported()) { s = Status::InvalidArgument( "The memtable of column family %s does not support merge operator " diff --git a/db/db_impl.h b/db/db_impl.h index f1a81e00cc..086ac9fd4e 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -30,11 +30,7 @@ #include "util/autovector.h" #include "util/stop_watch.h" #include "util/thread_local.h" -#include "util/scoped_arena_iterator.h" #include "db/internal_stats.h" -#include "db/write_controller.h" -#include "db/flush_scheduler.h" -#include "db/write_thread.h" namespace rocksdb { @@ -112,10 +108,6 @@ class DBImpl : public DB { bool reduce_level = false, int target_level = -1, uint32_t target_path_id = 0); - using DB::SetOptions; - bool SetOptions(ColumnFamilyHandle* column_family, - const std::unordered_map& options_map); - using DB::NumberLevels; virtual int NumberLevels(ColumnFamilyHandle* column_family); using DB::MaxMemCompactionLevel; @@ -181,8 +173,8 @@ class DBImpl : public DB { // 
Return an internal iterator over the current state of the database. // The keys of this iterator are internal keys (see format.h). // The returned iterator should be deleted when no longer needed. - Iterator* TEST_NewInternalIterator( - Arena* arena, ColumnFamilyHandle* column_family = nullptr); + Iterator* TEST_NewInternalIterator(ColumnFamilyHandle* column_family = + nullptr); // Return the maximum overlapping data (in bytes) at next level for any // file at a level >= 1. @@ -210,17 +202,6 @@ class DBImpl : public DB { SequenceNumber* sequence); Status TEST_ReadFirstLine(const std::string& fname, SequenceNumber* sequence); - - void TEST_LockMutex(); - - void TEST_UnlockMutex(); - - // REQUIRES: mutex locked - void* TEST_BeginWrite(); - - // REQUIRES: mutex locked - // pass the pointer that you got from TEST_BeginWrite() - void TEST_EndWrite(void* w); #endif // NDEBUG // Structure to store information for candidate files to delete. @@ -294,7 +275,7 @@ class DBImpl : public DB { // Returns the list of live files in 'live' and the list // of all files in the filesystem in 'candidate_files'. 
// If force == false and the last call was less than - // db_options_.delete_obsolete_files_period_micros microseconds ago, + // options_.delete_obsolete_files_period_micros microseconds ago, // it will not fill up the deletion_state void FindObsoleteFiles(DeletionState& deletion_state, bool force, @@ -312,22 +293,23 @@ class DBImpl : public DB { Env* const env_; const std::string dbname_; unique_ptr versions_; - const DBOptions db_options_; + const DBOptions options_; Statistics* stats_; Iterator* NewInternalIterator(const ReadOptions&, ColumnFamilyData* cfd, - SuperVersion* super_version, Arena* arena); + SuperVersion* super_version, + Arena* arena = nullptr); private: friend class DB; friend class InternalStats; #ifndef ROCKSDB_LITE + friend class TailingIterator; friend class ForwardIterator; #endif friend struct SuperVersion; - friend class CompactedDBImpl; struct CompactionState; - + struct Writer; struct WriteContext; Status NewDB(); @@ -347,13 +329,12 @@ class DBImpl : public DB { // Flush the in-memory write buffer to storage. Switches to a new // log-file/memtable and writes a new descriptor iff successful. - Status FlushMemTableToOutputFile( - ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options, - bool* madeProgress, DeletionState& deletion_state, LogBuffer* log_buffer); + Status FlushMemTableToOutputFile(ColumnFamilyData* cfd, bool* madeProgress, + DeletionState& deletion_state, + LogBuffer* log_buffer); - // REQUIRES: log_numbers are sorted in ascending order - Status RecoverLogFiles(const std::vector& log_numbers, - SequenceNumber* max_sequence, bool read_only); + Status RecoverLogFile(uint64_t log_number, SequenceNumber* max_sequence, + bool read_only); // The following two methods are used to flush a memtable to // storage. The first one is used atdatabase RecoveryTime (when the @@ -362,18 +343,47 @@ class DBImpl : public DB { // concurrent flush memtables to storage. 
Status WriteLevel0TableForRecovery(ColumnFamilyData* cfd, MemTable* mem, VersionEdit* edit); - Status WriteLevel0Table(ColumnFamilyData* cfd, - const MutableCFOptions& mutable_cf_options, - const autovector& mems, - VersionEdit* edit, uint64_t* filenumber, LogBuffer* log_buffer); + Status WriteLevel0Table(ColumnFamilyData* cfd, autovector& mems, + VersionEdit* edit, uint64_t* filenumber, + LogBuffer* log_buffer); - void DelayWrite(uint64_t expiration_time); + uint64_t SlowdownAmount(int n, double bottom, double top); - Status ScheduleFlushes(WriteContext* context); + // Before applying write operation (such as DBImpl::Write, DBImpl::Flush) + // thread should grab the mutex_ and be the first on writers queue. + // BeginWrite is used for it. + // Be aware! Writer's job can be done by other thread (see DBImpl::Write + // for examples), so check it via w.done before applying changes. + // + // Writer* w: writer to be placed in the queue + // uint64_t expiration_time: maximum time to be in the queue + // See also: EndWrite + Status BeginWrite(Writer* w, uint64_t expiration_time); + + // After doing write job, we need to remove already used writers from + // writers_ queue and notify head of the queue about it. + // EndWrite is used for this. + // + // Writer* w: Writer, that was added by BeginWrite function + // Writer* last_writer: Since we can join a few Writers (as DBImpl::Write + // does) + // we should pass last_writer as a parameter to + // EndWrite + // (if you don't touch other writers, just pass w) + // Status status: Status of write operation + // See also: BeginWrite + void EndWrite(Writer* w, Writer* last_writer, Status status); + + Status MakeRoomForWrite(ColumnFamilyData* cfd, + WriteContext* context, + uint64_t expiration_time); Status SetNewMemtableAndNewLogFile(ColumnFamilyData* cfd, WriteContext* context); + void BuildBatchGroup(Writer** last_writer, + autovector* write_batch_group); + // Force current memtable contents to be flushed. 
Status FlushMemTable(ColumnFamilyData* cfd, const FlushOptions& options); @@ -394,7 +404,6 @@ class DBImpl : public DB { LogBuffer* log_buffer); void CleanupCompaction(CompactionState* compact, Status status); Status DoCompactionWork(CompactionState* compact, - const MutableCFOptions& mutable_cf_options, DeletionState& deletion_state, LogBuffer* log_buffer); @@ -402,13 +411,12 @@ class DBImpl : public DB { // preempt compaction, since it's higher prioirty // Returns: micros spent executing uint64_t CallFlushDuringCompaction(ColumnFamilyData* cfd, - const MutableCFOptions& mutable_cf_options, DeletionState& deletion_state, - LogBuffer* log_buffer); + DeletionState& deletion_state, + LogBuffer* log_buffer); // Call compaction filter if is_compaction_v2 is not true. Then iterate // through input and compact the kv-pairs Status ProcessKeyValueCompaction( - const MutableCFOptions& mutable_cf_options, bool is_snapshot_supported, SequenceNumber visible_at_tip, SequenceNumber earliest_snapshot, @@ -425,11 +433,10 @@ class DBImpl : public DB { void CallCompactionFilterV2(CompactionState* compact, CompactionFilterV2* compaction_filter_v2); - Status OpenCompactionOutputFile(CompactionState* compact, - const MutableCFOptions& mutable_cf_options); + Status OpenCompactionOutputFile(CompactionState* compact); Status FinishCompactionOutputFile(CompactionState* compact, Iterator* input); Status InstallCompactionResults(CompactionState* compact, - const MutableCFOptions& mutable_cf_options, LogBuffer* log_buffer); + LogBuffer* log_buffer); void AllocateCompactionOutputFileNumbers(CompactionState* compact); void ReleaseCompactionUnusedFileNumbers(CompactionState* compact); @@ -471,8 +478,7 @@ class DBImpl : public DB { // Return the minimum empty level that could hold the total data in the // input level. Return the input level, if such level could not be found. 
- int FindMinimumEmptyLevelFitting(ColumnFamilyData* cfd, - const MutableCFOptions& mutable_cf_options, int level); + int FindMinimumEmptyLevelFitting(ColumnFamilyData* cfd, int level); // Move the files in the input level to the target level. // If target_level < 0, automatically calculate the minimum level that could @@ -522,13 +528,10 @@ class DBImpl : public DB { std::unique_ptr db_directory_; - WriteThread write_thread_; - + // Queue of writers. + std::deque writers_; WriteBatch tmp_batch_; - WriteController write_controller_; - FlushScheduler flush_scheduler_; - SnapshotList snapshots_; // cache for ReadFirstRecord() calls @@ -597,10 +600,14 @@ class DBImpl : public DB { bool flush_on_destroy_; // Used when disableWAL is true. static const int KEEP_LOG_FILE_NUM = 1000; + static const uint64_t kNoTimeOut = std::numeric_limits::max(); std::string db_absolute_path_; + // count of the number of contiguous delaying writes + int delayed_writes_; + // The options to access storage files - const EnvOptions env_options_; + const EnvOptions storage_options_; // A value of true temporarily disables scheduling of background work bool bg_work_gate_closed_; @@ -615,6 +622,9 @@ class DBImpl : public DB { DBImpl(const DBImpl&); void operator=(const DBImpl&); + // dump the delayed_writes_ to the log file and reset counter. + void DelayLoggingAndReset(); + // Return the earliest snapshot where seqno is visible. // Store the snapshot right before that, if any, in prev_snapshot inline SequenceNumber findEarliestVisibleSnapshot( @@ -626,8 +636,7 @@ class DBImpl : public DB { // the cfd->InstallSuperVersion() function. Background threads carry // deletion_state which can have new_superversion already allocated. void InstallSuperVersion(ColumnFamilyData* cfd, - DeletionState& deletion_state, - const MutableCFOptions& mutable_cf_options); + DeletionState& deletion_state); // Find Super version and reference it. Based on options, it might return // the thread local cached one. 
diff --git a/db/db_impl_debug.cc b/db/db_impl_debug.cc index 6c073d4d5e..8df66f6c6d 100644 --- a/db/db_impl_debug.cc +++ b/db/db_impl_debug.cc @@ -20,8 +20,7 @@ uint64_t DBImpl::TEST_GetLevel0TotalSize() { return default_cf_handle_->cfd()->current()->NumLevelBytes(0); } -Iterator* DBImpl::TEST_NewInternalIterator(Arena* arena, - ColumnFamilyHandle* column_family) { +Iterator* DBImpl::TEST_NewInternalIterator(ColumnFamilyHandle* column_family) { ColumnFamilyData* cfd; if (column_family == nullptr) { cfd = default_cf_handle_->cfd(); @@ -34,7 +33,7 @@ Iterator* DBImpl::TEST_NewInternalIterator(Arena* arena, SuperVersion* super_version = cfd->GetSuperVersion()->Ref(); mutex_.Unlock(); ReadOptions roptions; - return NewInternalIterator(roptions, cfd, super_version, arena); + return NewInternalIterator(roptions, cfd, super_version); } int64_t DBImpl::TEST_MaxNextLevelOverlappingBytes( @@ -130,27 +129,5 @@ Status DBImpl::TEST_ReadFirstLine(const std::string& fname, SequenceNumber* sequence) { return ReadFirstLine(fname, sequence); } - -void DBImpl::TEST_LockMutex() { - mutex_.Lock(); -} - -void DBImpl::TEST_UnlockMutex() { - mutex_.Unlock(); -} - -void* DBImpl::TEST_BeginWrite() { - auto w = new WriteThread::Writer(&mutex_); - Status s = write_thread_.EnterWriteThread(w, 0); - assert(s.ok() && !w->done); // No timeout and nobody should do our job - return reinterpret_cast(w); -} - -void DBImpl::TEST_EndWrite(void* w) { - auto writer = reinterpret_cast(w); - write_thread_.ExitWriteThread(writer, writer, Status::OK()); - delete writer; -} - } // namespace rocksdb #endif // ROCKSDB_LITE diff --git a/db/db_impl_readonly.cc b/db/db_impl_readonly.cc index 9faebd8c21..6c864aefd8 100644 --- a/db/db_impl_readonly.cc +++ b/db/db_impl_readonly.cc @@ -2,27 +2,56 @@ // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. 
An additional grant // of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2012 Facebook. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. #include "db/db_impl_readonly.h" -#include "utilities/compacted_db/compacted_db_impl.h" #include "db/db_impl.h" -#include "db/merge_context.h" + +#include +#include +#include +#include +#include +#include #include "db/db_iter.h" +#include "db/dbformat.h" +#include "db/filename.h" +#include "db/log_reader.h" +#include "db/log_writer.h" +#include "db/memtable.h" +#include "db/merge_context.h" +#include "db/table_cache.h" +#include "db/version_set.h" +#include "db/write_batch_internal.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/status.h" +#include "rocksdb/table.h" +#include "rocksdb/merge_operator.h" +#include "port/port.h" +#include "table/block.h" +#include "table/merger.h" +#include "table/two_level_iterator.h" +#include "util/coding.h" +#include "util/logging.h" +#include "util/build_version.h" namespace rocksdb { -DBImplReadOnly::DBImplReadOnly(const DBOptions& db_options, +DBImplReadOnly::DBImplReadOnly(const DBOptions& options, const std::string& dbname) - : DBImpl(db_options, dbname) { - Log(INFO_LEVEL, db_options_.info_log, "Opening the db in read only mode"); - LogFlush(db_options_.info_log); + : DBImpl(options, dbname) { + Log(options_.info_log, "Opening the db in read only mode"); } DBImplReadOnly::~DBImplReadOnly() { } // Implementations of the DB interface -Status DBImplReadOnly::Get(const ReadOptions& read_options, +Status DBImplReadOnly::Get(const ReadOptions& options, ColumnFamilyHandle* column_family, const Slice& key, std::string* value) { Status s; @@ -32,34 +61,33 @@ Status DBImplReadOnly::Get(const ReadOptions& read_options, SuperVersion* super_version = cfd->GetSuperVersion(); MergeContext merge_context; LookupKey lkey(key, snapshot); - if 
(super_version->mem->Get(lkey, value, &s, &merge_context)) { + if (super_version->mem->Get(lkey, value, &s, merge_context, + *cfd->options())) { } else { - super_version->current->Get(read_options, lkey, value, &s, &merge_context); + super_version->current->Get(options, lkey, value, &s, &merge_context); } return s; } -Iterator* DBImplReadOnly::NewIterator(const ReadOptions& read_options, +Iterator* DBImplReadOnly::NewIterator(const ReadOptions& options, ColumnFamilyHandle* column_family) { auto cfh = reinterpret_cast(column_family); auto cfd = cfh->cfd(); SuperVersion* super_version = cfd->GetSuperVersion()->Ref(); SequenceNumber latest_snapshot = versions_->LastSequence(); auto db_iter = NewArenaWrappedDbIterator( - env_, *cfd->ioptions(), cfd->user_comparator(), - (read_options.snapshot != nullptr - ? reinterpret_cast( - read_options.snapshot)->number_ - : latest_snapshot), - cfd->options()->max_sequential_skip_in_iterations); - auto internal_iter = NewInternalIterator( - read_options, cfd, super_version, db_iter->GetArena()); + env_, *cfd->options(), cfd->user_comparator(), + (options.snapshot != nullptr + ? reinterpret_cast(options.snapshot)->number_ + : latest_snapshot)); + auto internal_iter = + NewInternalIterator(options, cfd, super_version, db_iter->GetArena()); db_iter->SetIterUnderDBIter(internal_iter); return db_iter; } Status DBImplReadOnly::NewIterators( - const ReadOptions& read_options, + const ReadOptions& options, const std::vector& column_families, std::vector* iterators) { if (iterators == nullptr) { @@ -72,14 +100,12 @@ Status DBImplReadOnly::NewIterators( for (auto cfh : column_families) { auto cfd = reinterpret_cast(cfh)->cfd(); auto db_iter = NewArenaWrappedDbIterator( - env_, *cfd->ioptions(), cfd->user_comparator(), - (read_options.snapshot != nullptr - ? 
reinterpret_cast( - read_options.snapshot)->number_ - : latest_snapshot), - cfd->options()->max_sequential_skip_in_iterations); + env_, *cfd->options(), cfd->user_comparator(), + options.snapshot != nullptr + ? reinterpret_cast(options.snapshot)->number_ + : latest_snapshot); auto internal_iter = NewInternalIterator( - read_options, cfd, cfd->GetSuperVersion()->Ref(), db_iter->GetArena()); + options, cfd, cfd->GetSuperVersion()->Ref(), db_iter->GetArena()); db_iter->SetIterUnderDBIter(internal_iter); iterators->push_back(db_iter); } @@ -91,15 +117,6 @@ Status DB::OpenForReadOnly(const Options& options, const std::string& dbname, DB** dbptr, bool error_if_log_file_exist) { *dbptr = nullptr; - // Try to first open DB as fully compacted DB - Status s; -#ifndef ROCKSDB_LITE - s = CompactedDBImpl::Open(options, dbname, dbptr); - if (s.ok()) { - return s; - } -#endif - DBOptions db_options(options); ColumnFamilyOptions cf_options(options); std::vector column_families; @@ -107,7 +124,8 @@ Status DB::OpenForReadOnly(const Options& options, const std::string& dbname, ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options)); std::vector handles; - s = DB::OpenForReadOnly(db_options, dbname, column_families, &handles, dbptr); + Status s = + DB::OpenForReadOnly(db_options, dbname, column_families, &handles, dbptr); if (s.ok()) { assert(handles.size() == 1); // i can delete the handle since DBImpl is always holding a diff --git a/db/db_impl_readonly.h b/db/db_impl_readonly.h index 9b10b83fbb..1dfdf422ef 100644 --- a/db/db_impl_readonly.h +++ b/db/db_impl_readonly.h @@ -2,11 +2,24 @@ // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2012 Facebook. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
#pragma once #include "db/db_impl.h" + +#include +#include #include #include +#include "db/dbformat.h" +#include "db/log_writer.h" +#include "db/snapshot.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "port/port.h" namespace rocksdb { diff --git a/db/db_iter.cc b/db/db_iter.cc index db86ebc2c4..599a56a99c 100644 --- a/db/db_iter.cc +++ b/db/db_iter.cc @@ -58,25 +58,22 @@ class DBIter: public Iterator { kReverse }; - DBIter(Env* env, const ImmutableCFOptions& ioptions, - const Comparator* cmp, Iterator* iter, SequenceNumber s, - bool arena_mode, uint64_t max_sequential_skip_in_iterations, - const Slice* iterate_upper_bound = nullptr) + DBIter(Env* env, const Options& options, const Comparator* cmp, + Iterator* iter, SequenceNumber s, bool arena_mode) : arena_mode_(arena_mode), env_(env), - logger_(ioptions.info_log), + logger_(options.info_log.get()), user_comparator_(cmp), - user_merge_operator_(ioptions.merge_operator), + user_merge_operator_(options.merge_operator.get()), iter_(iter), sequence_(s), direction_(kForward), valid_(false), current_entry_is_merged_(false), - statistics_(ioptions.statistics), - iterate_upper_bound_(iterate_upper_bound) { + statistics_(options.statistics.get()) { RecordTick(statistics_, NO_ITERATORS); - prefix_extractor_ = ioptions.prefix_extractor; - max_skip_ = max_sequential_skip_in_iterations; + has_prefix_extractor_ = (options.prefix_extractor.get() != nullptr); + max_skip_ = options.max_sequential_skip_in_iterations; } virtual ~DBIter() { RecordTick(statistics_, NO_ITERATORS, -1); @@ -135,7 +132,7 @@ class DBIter: public Iterator { } } - const SliceTransform* prefix_extractor_; + bool has_prefix_extractor_; bool arena_mode_; Env* const env_; Logger* logger_; @@ -152,7 +149,6 @@ class DBIter: public Iterator { bool current_entry_is_merged_; Statistics* statistics_; uint64_t max_skip_; - const Slice* iterate_upper_bound_; // No copying allowed DBIter(const DBIter&); @@ -211,44 +207,36 @@ void 
DBIter::FindNextUserEntryInternal(bool skipping) { uint64_t num_skipped = 0; do { ParsedInternalKey ikey; - - if (ParseKey(&ikey)) { - if (iterate_upper_bound_ != nullptr && - ikey.user_key.compare(*iterate_upper_bound_) >= 0) { - break; - } - - if (ikey.sequence <= sequence_) { - if (skipping && - user_comparator_->Compare(ikey.user_key, saved_key_.GetKey()) <= 0) { - num_skipped++; // skip this entry - PERF_COUNTER_ADD(internal_key_skipped_count, 1); - } else { - skipping = false; - switch (ikey.type) { - case kTypeDeletion: - // Arrange to skip all upcoming entries for this key since - // they are hidden by this deletion. - saved_key_.SetKey(ikey.user_key); - skipping = true; - num_skipped = 0; - PERF_COUNTER_ADD(internal_delete_skipped_count, 1); - break; - case kTypeValue: - valid_ = true; - saved_key_.SetKey(ikey.user_key); - return; - case kTypeMerge: - // By now, we are sure the current ikey is going to yield a value - saved_key_.SetKey(ikey.user_key); - current_entry_is_merged_ = true; - valid_ = true; - MergeValuesNewToOld(); // Go to a different state machine - return; - default: - assert(false); - break; - } + if (ParseKey(&ikey) && ikey.sequence <= sequence_) { + if (skipping && + user_comparator_->Compare(ikey.user_key, saved_key_.GetKey()) <= 0) { + num_skipped++; // skip this entry + PERF_COUNTER_ADD(internal_key_skipped_count, 1); + } else { + skipping = false; + switch (ikey.type) { + case kTypeDeletion: + // Arrange to skip all upcoming entries for this key since + // they are hidden by this deletion. 
+ saved_key_.SetKey(ikey.user_key); + skipping = true; + num_skipped = 0; + PERF_COUNTER_ADD(internal_delete_skipped_count, 1); + break; + case kTypeValue: + valid_ = true; + saved_key_.SetKey(ikey.user_key); + return; + case kTypeMerge: + // By now, we are sure the current ikey is going to yield a value + saved_key_.SetKey(ikey.user_key); + current_entry_is_merged_ = true; + valid_ = true; + MergeValuesNewToOld(); // Go to a different state machine + return; + default: + assert(false); + break; } } } @@ -410,7 +398,6 @@ bool DBIter::FindValueForCurrentKey() { case kTypeDeletion: operands.clear(); last_not_merge_type = kTypeDeletion; - PERF_COUNTER_ADD(internal_delete_skipped_count, 1); break; case kTypeMerge: assert(user_merge_operator_ != nullptr); @@ -420,7 +407,6 @@ bool DBIter::FindValueForCurrentKey() { assert(false); } - PERF_COUNTER_ADD(internal_key_skipped_count, 1); assert(user_comparator_->Compare(ikey.user_key, saved_key_.GetKey()) == 0); iter_->Prev(); ++num_skipped; @@ -567,20 +553,6 @@ void DBIter::FindParseableKey(ParsedInternalKey* ikey, Direction direction) { void DBIter::Seek(const Slice& target) { StopWatch sw(env_, statistics_, DB_SEEK); - // total ordering is not guaranteed if prefix_extractor is set - // hence prefix based seeks will not give correct results - if (iterate_upper_bound_ != nullptr && prefix_extractor_ != nullptr) { - if (!prefix_extractor_->InDomain(*iterate_upper_bound_) || - !prefix_extractor_->InDomain(target) || - prefix_extractor_->Transform(*iterate_upper_bound_).compare( - prefix_extractor_->Transform(target)) != 0) { - status_ = Status::InvalidArgument("read_options.iterate_*_bound " - " and seek target need to have the same prefix."); - valid_ = false; - return; - } - } - saved_key_.Clear(); // now savved_key is used to store internal key. 
saved_key_.SetInternalKey(target, sequence_); @@ -602,7 +574,7 @@ void DBIter::Seek(const Slice& target) { void DBIter::SeekToFirst() { // Don't use iter_::Seek() if we set a prefix extractor // because prefix seek wiil be used. - if (prefix_extractor_ != nullptr) { + if (has_prefix_extractor_) { max_skip_ = std::numeric_limits::max(); } direction_ = kForward; @@ -623,7 +595,7 @@ void DBIter::SeekToFirst() { void DBIter::SeekToLast() { // Don't use iter_::Seek() if we set a prefix extractor // because prefix seek wiil be used. - if (prefix_extractor_ != nullptr) { + if (has_prefix_extractor_) { max_skip_ = std::numeric_limits::max(); } direction_ = kReverse; @@ -637,15 +609,12 @@ void DBIter::SeekToLast() { PrevInternal(); } -Iterator* NewDBIterator(Env* env, const ImmutableCFOptions& ioptions, +Iterator* NewDBIterator(Env* env, const Options& options, const Comparator* user_key_comparator, Iterator* internal_iter, - const SequenceNumber& sequence, - uint64_t max_sequential_skip_in_iterations, - const Slice* iterate_upper_bound) { - return new DBIter(env, ioptions, user_key_comparator, internal_iter, sequence, - false, max_sequential_skip_in_iterations, - iterate_upper_bound); + const SequenceNumber& sequence) { + return new DBIter(env, options, user_key_comparator, internal_iter, sequence, + false); } ArenaWrappedDBIter::~ArenaWrappedDBIter() { db_iter_->~DBIter(); } @@ -673,20 +642,14 @@ void ArenaWrappedDBIter::RegisterCleanup(CleanupFunction function, void* arg1, } ArenaWrappedDBIter* NewArenaWrappedDbIterator( - Env* env, const ImmutableCFOptions& ioptions, - const Comparator* user_key_comparator, - const SequenceNumber& sequence, - uint64_t max_sequential_skip_in_iterations, - const Slice* iterate_upper_bound) { + Env* env, const Options& options, const Comparator* user_key_comparator, + const SequenceNumber& sequence) { ArenaWrappedDBIter* iter = new ArenaWrappedDBIter(); Arena* arena = iter->GetArena(); auto mem = arena->AllocateAligned(sizeof(DBIter)); - 
DBIter* db_iter = new (mem) DBIter(env, ioptions, user_key_comparator, - nullptr, sequence, true, max_sequential_skip_in_iterations, - iterate_upper_bound); - + DBIter* db_iter = new (mem) + DBIter(env, options, user_key_comparator, nullptr, sequence, true); iter->SetDBIter(db_iter); - return iter; } diff --git a/db/db_iter.h b/db/db_iter.h index c676d6cda1..cb9840324f 100644 --- a/db/db_iter.h +++ b/db/db_iter.h @@ -24,12 +24,10 @@ class DBIter; // into appropriate user keys. extern Iterator* NewDBIterator( Env* env, - const ImmutableCFOptions& options, + const Options& options, const Comparator *user_key_comparator, Iterator* internal_iter, - const SequenceNumber& sequence, - uint64_t max_sequential_skip_in_iterations, - const Slice* iterate_upper_bound = nullptr); + const SequenceNumber& sequence); // A wrapper iterator which wraps DB Iterator and the arena, with which the DB // iterator is supposed be allocated. This class is used as an entry point of @@ -69,9 +67,7 @@ class ArenaWrappedDBIter : public Iterator { // Generate the arena wrapped iterator class. 
extern ArenaWrappedDBIter* NewArenaWrappedDbIterator( - Env* env, const ImmutableCFOptions& options, - const Comparator* user_key_comparator, - const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iterations, - const Slice* iterate_upper_bound = nullptr); + Env* env, const Options& options, const Comparator* user_key_comparator, + const SequenceNumber& sequence); } // namespace rocksdb diff --git a/db/db_iter_test.cc b/db/db_iter_test.cc index 2aa30e327c..4ce79da1ba 100644 --- a/db/db_iter_test.cc +++ b/db/db_iter_test.cc @@ -158,9 +158,7 @@ TEST(DBIteratorTest, DBIteratorPrevNext) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, ImmutableCFOptions(options), - BytewiseComparator(), internal_iter, 10, - options.max_sequential_skip_in_iterations)); + NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 10)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -193,9 +191,7 @@ TEST(DBIteratorTest, DBIteratorPrevNext) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, ImmutableCFOptions(options), - BytewiseComparator(), internal_iter, 10, - options.max_sequential_skip_in_iterations)); + NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 10)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); @@ -236,9 +232,7 @@ TEST(DBIteratorTest, DBIteratorPrevNext) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, ImmutableCFOptions(options), - BytewiseComparator(), internal_iter, 2, - options.max_sequential_skip_in_iterations)); + NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 2)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "b"); @@ -268,9 +262,7 @@ TEST(DBIteratorTest, DBIteratorPrevNext) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, ImmutableCFOptions(options), - BytewiseComparator(), internal_iter, 10, - options.max_sequential_skip_in_iterations)); + 
NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 10)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "c"); @@ -296,9 +288,7 @@ TEST(DBIteratorTest, DBIteratorEmpty) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, ImmutableCFOptions(options), - BytewiseComparator(), internal_iter, 0, - options.max_sequential_skip_in_iterations)); + NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 0)); db_iter->SeekToLast(); ASSERT_TRUE(!db_iter->Valid()); } @@ -308,9 +298,7 @@ TEST(DBIteratorTest, DBIteratorEmpty) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, ImmutableCFOptions(options), - BytewiseComparator(), internal_iter, 0, - options.max_sequential_skip_in_iterations)); + NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 0)); db_iter->SeekToFirst(); ASSERT_TRUE(!db_iter->Valid()); } @@ -330,9 +318,7 @@ TEST(DBIteratorTest, DBIteratorUseSkipCountSkips) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, ImmutableCFOptions(options), - BytewiseComparator(), internal_iter, 2, - options.max_sequential_skip_in_iterations)); + NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 2)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "c"); @@ -371,9 +357,7 @@ TEST(DBIteratorTest, DBIteratorUseSkip) { options.statistics = rocksdb::CreateDBStatistics(); std::unique_ptr db_iter(NewDBIterator( - env_, ImmutableCFOptions(options), - BytewiseComparator(), internal_iter, i + 2, - options.max_sequential_skip_in_iterations)); + env_, options, BytewiseComparator(), internal_iter, i + 2)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -407,9 +391,7 @@ TEST(DBIteratorTest, DBIteratorUseSkip) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ImmutableCFOptions(options), - BytewiseComparator(), internal_iter, i + 2, - 
options.max_sequential_skip_in_iterations)); + env_, options, BytewiseComparator(), internal_iter, i + 2)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -436,9 +418,7 @@ TEST(DBIteratorTest, DBIteratorUseSkip) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ImmutableCFOptions(options), - BytewiseComparator(), internal_iter, 202, - options.max_sequential_skip_in_iterations)); + env_, options, BytewiseComparator(), internal_iter, 202)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -469,9 +449,7 @@ TEST(DBIteratorTest, DBIteratorUseSkip) { internal_iter->AddPut("c", "200"); internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, ImmutableCFOptions(options), - BytewiseComparator(), internal_iter, i, - options.max_sequential_skip_in_iterations)); + NewDBIterator(env_, options, BytewiseComparator(), internal_iter, i)); db_iter->SeekToLast(); ASSERT_TRUE(!db_iter->Valid()); @@ -486,9 +464,7 @@ TEST(DBIteratorTest, DBIteratorUseSkip) { internal_iter->AddPut("c", "200"); internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, ImmutableCFOptions(options), - BytewiseComparator(), internal_iter, 200, - options.max_sequential_skip_in_iterations)); + NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 200)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "c"); @@ -521,9 +497,7 @@ TEST(DBIteratorTest, DBIteratorUseSkip) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ImmutableCFOptions(options), - BytewiseComparator(), internal_iter, i + 2, - options.max_sequential_skip_in_iterations)); + env_, options, BytewiseComparator(), internal_iter, i + 2)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -556,9 +530,7 @@ TEST(DBIteratorTest, DBIteratorUseSkip) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ImmutableCFOptions(options), - BytewiseComparator(), internal_iter, i + 2, - 
options.max_sequential_skip_in_iterations)); + env_, options, BytewiseComparator(), internal_iter, i + 2)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -598,9 +570,7 @@ TEST(DBIteratorTest, DBIterator) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, ImmutableCFOptions(options), - BytewiseComparator(), internal_iter, 1, - options.max_sequential_skip_in_iterations)); + NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 1)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -620,9 +590,7 @@ TEST(DBIteratorTest, DBIterator) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, ImmutableCFOptions(options), - BytewiseComparator(), internal_iter, 0, - options.max_sequential_skip_in_iterations)); + NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 0)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -641,9 +609,7 @@ TEST(DBIteratorTest, DBIterator) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, ImmutableCFOptions(options), - BytewiseComparator(), internal_iter, 2, - options.max_sequential_skip_in_iterations)); + NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 2)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -662,9 +628,7 @@ TEST(DBIteratorTest, DBIterator) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, ImmutableCFOptions(options), - BytewiseComparator(), internal_iter, 4, - options.max_sequential_skip_in_iterations)); + NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 4)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -690,9 +654,7 @@ TEST(DBIteratorTest, DBIterator) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, ImmutableCFOptions(options), - 
BytewiseComparator(), internal_iter, 0, - options.max_sequential_skip_in_iterations)); + NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 0)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -713,9 +675,7 @@ TEST(DBIteratorTest, DBIterator) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, ImmutableCFOptions(options), - BytewiseComparator(), internal_iter, 1, - options.max_sequential_skip_in_iterations)); + NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 1)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -736,9 +696,7 @@ TEST(DBIteratorTest, DBIterator) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, ImmutableCFOptions(options), - BytewiseComparator(), internal_iter, 2, - options.max_sequential_skip_in_iterations)); + NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 2)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -759,9 +717,7 @@ TEST(DBIteratorTest, DBIterator) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, ImmutableCFOptions(options), - BytewiseComparator(), internal_iter, 3, - options.max_sequential_skip_in_iterations)); + NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 3)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -782,9 +738,7 @@ TEST(DBIteratorTest, DBIterator) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, ImmutableCFOptions(options), - BytewiseComparator(), internal_iter, 4, - options.max_sequential_skip_in_iterations)); + NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 4)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -805,9 +759,7 @@ TEST(DBIteratorTest, DBIterator) { internal_iter->Finish(); 
std::unique_ptr db_iter( - NewDBIterator(env_, ImmutableCFOptions(options), - BytewiseComparator(), internal_iter, 5, - options.max_sequential_skip_in_iterations)); + NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 5)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -828,9 +780,7 @@ TEST(DBIteratorTest, DBIterator) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, ImmutableCFOptions(options), - BytewiseComparator(), internal_iter, 6, - options.max_sequential_skip_in_iterations)); + NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 6)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -853,9 +803,7 @@ TEST(DBIteratorTest, DBIterator) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, ImmutableCFOptions(options), - BytewiseComparator(), internal_iter, 0, - options.max_sequential_skip_in_iterations)); + NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 0)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -876,9 +824,7 @@ TEST(DBIteratorTest, DBIterator) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, ImmutableCFOptions(options), - BytewiseComparator(), internal_iter, 1, - options.max_sequential_skip_in_iterations)); + NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 1)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -899,9 +845,7 @@ TEST(DBIteratorTest, DBIterator) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, ImmutableCFOptions(options), - BytewiseComparator(), internal_iter, 2, - options.max_sequential_skip_in_iterations)); + NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 2)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ 
-922,9 +866,7 @@ TEST(DBIteratorTest, DBIterator) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, ImmutableCFOptions(options), - BytewiseComparator(), internal_iter, 3, - options.max_sequential_skip_in_iterations)); + NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 3)); db_iter->SeekToLast(); ASSERT_TRUE(!db_iter->Valid()); } @@ -941,9 +883,7 @@ TEST(DBIteratorTest, DBIterator) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, ImmutableCFOptions(options), - BytewiseComparator(), internal_iter, 4, - options.max_sequential_skip_in_iterations)); + NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 4)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -964,9 +904,7 @@ TEST(DBIteratorTest, DBIterator) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, ImmutableCFOptions(options), - BytewiseComparator(), internal_iter, 5, - options.max_sequential_skip_in_iterations)); + NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 5)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -987,9 +925,7 @@ TEST(DBIteratorTest, DBIterator) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, ImmutableCFOptions(options), - BytewiseComparator(), internal_iter, 6, - options.max_sequential_skip_in_iterations)); + NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 6)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1024,9 +960,7 @@ TEST(DBIteratorTest, DBIterator) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, ImmutableCFOptions(options), - BytewiseComparator(), internal_iter, 0, - options.max_sequential_skip_in_iterations)); + NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 0)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); 
ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1059,9 +993,7 @@ TEST(DBIteratorTest, DBIterator) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, ImmutableCFOptions(options), - BytewiseComparator(), internal_iter, 2, - options.max_sequential_skip_in_iterations)); + NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 2)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -1100,9 +1032,7 @@ TEST(DBIteratorTest, DBIterator) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, ImmutableCFOptions(options), - BytewiseComparator(), internal_iter, 4, - options.max_sequential_skip_in_iterations)); + NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 4)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -1141,9 +1071,7 @@ TEST(DBIteratorTest, DBIterator) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, ImmutableCFOptions(options), - BytewiseComparator(), internal_iter, 5, - options.max_sequential_skip_in_iterations)); + NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 5)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -1187,9 +1115,7 @@ TEST(DBIteratorTest, DBIterator) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, ImmutableCFOptions(options), - BytewiseComparator(), internal_iter, 6, - options.max_sequential_skip_in_iterations)); + NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 6)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -1234,9 +1160,7 @@ TEST(DBIteratorTest, DBIterator) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, ImmutableCFOptions(options), - BytewiseComparator(), internal_iter, 7, - options.max_sequential_skip_in_iterations)); + NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 7)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -1275,9 +1199,7 @@ TEST(DBIteratorTest, DBIterator) { internal_iter->Finish(); 
std::unique_ptr db_iter( - NewDBIterator(env_, ImmutableCFOptions(options), - BytewiseComparator(), internal_iter, 9, - options.max_sequential_skip_in_iterations)); + NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 9)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -1322,9 +1244,7 @@ TEST(DBIteratorTest, DBIterator) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ImmutableCFOptions(options), - BytewiseComparator(), internal_iter, 13, - options.max_sequential_skip_in_iterations)); + env_, options, BytewiseComparator(), internal_iter, 13)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -1370,9 +1290,7 @@ TEST(DBIteratorTest, DBIterator) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ImmutableCFOptions(options), - BytewiseComparator(), internal_iter, 14, - options.max_sequential_skip_in_iterations)); + env_, options, BytewiseComparator(), internal_iter, 14)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -1398,9 +1316,7 @@ TEST(DBIteratorTest, DBIterator) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, ImmutableCFOptions(options), - BytewiseComparator(), internal_iter, 10, - options.max_sequential_skip_in_iterations)); + NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 10)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "b"); diff --git a/db/db_test.cc b/db/db_test.cc index 7c2f051d05..6295f5921e 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -11,7 +11,6 @@ #include #include #include -#include #include #include @@ -42,7 +41,6 @@ #include "util/rate_limiter.h" #include "util/statistics.h" #include "util/testharness.h" -#include "util/scoped_arena_iterator.h" #include "util/sync_point.h" #include "util/testutil.h" @@ -123,9 +121,6 @@ class SpecialEnv : public EnvWrapper { // sstable Sync() calls are blocked while this pointer is non-nullptr. 
port::AtomicPointer delay_sstable_sync_; - // Drop writes on the floor while this pointer is non-nullptr. - port::AtomicPointer drop_writes_; - // Simulate no-space errors while this pointer is non-nullptr. port::AtomicPointer no_space_; @@ -151,11 +146,8 @@ class SpecialEnv : public EnvWrapper { std::atomic bytes_written_; - std::atomic sync_counter_; - explicit SpecialEnv(Env* base) : EnvWrapper(base) { delay_sstable_sync_.Release_Store(nullptr); - drop_writes_.Release_Store(nullptr); no_space_.Release_Store(nullptr); non_writable_.Release_Store(nullptr); count_random_reads_ = false; @@ -164,7 +156,6 @@ class SpecialEnv : public EnvWrapper { manifest_write_error_.Release_Store(nullptr); log_write_error_.Release_Store(nullptr); bytes_written_ = 0; - sync_counter_ = 0; } Status NewWritableFile(const std::string& f, unique_ptr* r, @@ -180,11 +171,9 @@ class SpecialEnv : public EnvWrapper { base_(std::move(base)) { } Status Append(const Slice& data) { - if (env_->drop_writes_.Acquire_Load() != nullptr) { + if (env_->no_space_.Acquire_Load() != nullptr) { // Drop writes on the floor return Status::OK(); - } else if (env_->no_space_.Acquire_Load() != nullptr) { - return Status::IOError("No space left on device"); } else { env_->bytes_written_ += data.size(); return base_->Append(data); @@ -193,7 +182,6 @@ class SpecialEnv : public EnvWrapper { Status Close() { return base_->Close(); } Status Flush() { return base_->Flush(); } Status Sync() { - ++env_->sync_counter_; while (env_->delay_sstable_sync_.Acquire_Load() != nullptr) { env_->SleepForMicroseconds(100000); } @@ -220,7 +208,6 @@ class SpecialEnv : public EnvWrapper { Status Close() { return base_->Close(); } Status Flush() { return base_->Flush(); } Status Sync() { - ++env_->sync_counter_; if (env_->manifest_sync_error_.Acquire_Load() != nullptr) { return Status::IOError("simulated sync error"); } else { @@ -244,10 +231,7 @@ class SpecialEnv : public EnvWrapper { } Status Close() { return base_->Close(); } Status 
Flush() { return base_->Flush(); } - Status Sync() { - ++env_->sync_counter_; - return base_->Sync(); - } + Status Sync() { return base_->Sync(); } }; if (non_writable_.Acquire_Load() != nullptr) { @@ -338,22 +322,21 @@ class DBTest { kHashCuckoo = 7, kMergePut = 8, kFilter = 9, - kFullFilter = 10, - kUncompressed = 11, - kNumLevel_3 = 12, - kDBLogDir = 13, - kWalDirAndMmapReads = 14, - kManifestFileSize = 15, - kCompactOnFlush = 16, - kPerfOptions = 17, - kDeletesFilterFirst = 18, - kHashSkipList = 19, - kUniversalCompaction = 20, - kCompressedBlockCache = 21, - kInfiniteMaxOpenFiles = 22, - kxxHashChecksum = 23, - kFIFOCompaction = 24, - kEnd = 25 + kUncompressed = 10, + kNumLevel_3 = 11, + kDBLogDir = 12, + kWalDir = 13, + kManifestFileSize = 14, + kCompactOnFlush = 15, + kPerfOptions = 16, + kDeletesFilterFirst = 17, + kHashSkipList = 18, + kUniversalCompaction = 19, + kCompressedBlockCache = 20, + kInfiniteMaxOpenFiles = 21, + kxxHashChecksum = 22, + kFIFOCompaction = 23, + kEnd = 24 }; int option_config_; @@ -377,7 +360,6 @@ class DBTest { kSkipNoSeekToLast = 32, kSkipHashCuckoo = 64, kSkipFIFOCompaction = 128, - kSkipMmapReads = 256, }; @@ -437,10 +419,6 @@ class DBTest { option_config_ == kFIFOCompaction) { continue; } - if ((skip_mask & kSkipMmapReads) && - option_config_ == kWalDirAndMmapReads) { - continue; - } break; } @@ -468,30 +446,6 @@ class DBTest { } } - // Switch between different filter policy - // Jump from kDefault to kFilter to kFullFilter - bool ChangeFilterOptions(Options* prev_options = nullptr) { - if (option_config_ == kDefault) { - option_config_ = kFilter; - if (prev_options == nullptr) { - prev_options = &last_options_; - } - Destroy(prev_options); - TryReopen(); - return true; - } else if (option_config_ == kFilter) { - option_config_ = kFullFilter; - if (prev_options == nullptr) { - prev_options = &last_options_; - } - Destroy(prev_options); - TryReopen(); - return true; - } else { - return false; - } - } - // Return the current 
option configuration. Options CurrentOptions( const anon::OptionsOverride& options_override = anon::OptionsOverride()) { @@ -530,10 +484,7 @@ class DBTest { options.merge_operator = MergeOperators::CreatePutOperator(); break; case kFilter: - table_options.filter_policy.reset(NewBloomFilterPolicy(10, true)); - break; - case kFullFilter: - table_options.filter_policy.reset(NewBloomFilterPolicy(10, false)); + table_options.filter_policy.reset(NewBloomFilterPolicy(10)); break; case kUncompressed: options.compression = kNoCompression; @@ -544,11 +495,8 @@ class DBTest { case kDBLogDir: options.db_log_dir = test::TmpDir(); break; - case kWalDirAndMmapReads: + case kWalDir: options.wal_dir = test::TmpDir() + "/wal"; - // mmap reads should be orthogonal to WalDir setting, so we piggyback to - // this option config to test mmap reads as well - options.allow_mmap_reads = true; break; case kManifestFileSize: options.max_manifest_file_size = 50; // 50 bytes @@ -807,12 +755,11 @@ class DBTest { } std::string AllEntriesFor(const Slice& user_key, int cf = 0) { - ScopedArenaIterator iter; - Arena arena; + Iterator* iter; if (cf == 0) { - iter.set(dbfull()->TEST_NewInternalIterator(&arena)); + iter = dbfull()->TEST_NewInternalIterator(); } else { - iter.set(dbfull()->TEST_NewInternalIterator(&arena, handles_[cf])); + iter = dbfull()->TEST_NewInternalIterator(handles_[cf]); } InternalKey target(user_key, kMaxSequenceNumber, kTypeValue); iter->Seek(target.Encode()); @@ -857,6 +804,7 @@ class DBTest { } result += "]"; } + delete iter; return result; } @@ -874,18 +822,6 @@ class DBTest { return atoi(property.c_str()); } - uint64_t SizeAtLevel(int level) { - std::vector metadata; - db_->GetLiveFilesMetaData(&metadata); - uint64_t sum = 0; - for (const auto& m : metadata) { - if (m.level == level) { - sum += m.size; - } - } - return sum; - } - int TotalTableFiles(int cf = 0, int levels = -1) { if (levels == -1) { levels = CurrentOptions().num_levels; @@ -1106,12 +1042,11 @@ class DBTest 
{ // Utility method to test InplaceUpdate void validateNumberOfEntries(int numValues, int cf = 0) { - ScopedArenaIterator iter; - Arena arena; + Iterator* iter; if (cf != 0) { - iter.set(dbfull()->TEST_NewInternalIterator(&arena, handles_[cf])); + iter = dbfull()->TEST_NewInternalIterator(handles_[cf]); } else { - iter.set(dbfull()->TEST_NewInternalIterator(&arena)); + iter = dbfull()->TEST_NewInternalIterator(); } iter->SeekToFirst(); ASSERT_EQ(iter->status().ok(), true); @@ -1125,6 +1060,7 @@ class DBTest { ASSERT_EQ(ikey.sequence, (unsigned)seq--); iter->Next(); } + delete iter; ASSERT_EQ(0, seq); } @@ -1179,17 +1115,6 @@ void VerifyTableProperties(DB* db, uint64_t expected_entries_size) { ASSERT_EQ(props.size(), unique_entries.size()); ASSERT_EQ(expected_entries_size, sum); } - -uint64_t GetNumberOfSstFilesForColumnFamily(DB* db, - std::string column_family_name) { - std::vector metadata; - db->GetLiveFilesMetaData(&metadata); - uint64_t result = 0; - for (auto& fileMetadata : metadata) { - result += (fileMetadata.column_family_name == column_family_name); - } - return result; -} } // namespace TEST(DBTest, Empty) { @@ -1282,98 +1207,6 @@ TEST(DBTest, ReadOnlyDB) { ASSERT_EQ("v2", Get("bar")); } -TEST(DBTest, CompactedDB) { - const uint64_t kFileSize = 1 << 20; - Options options; - options.disable_auto_compactions = true; - options.max_mem_compaction_level = 0; - options.write_buffer_size = kFileSize; - options.target_file_size_base = kFileSize; - options.max_bytes_for_level_base = 1 << 30; - options.compression = kNoCompression; - Reopen(&options); - // 1 L0 file, use CompactedDB if max_open_files = -1 - ASSERT_OK(Put("aaa", DummyString(kFileSize / 2, '1'))); - Flush(); - Close(); - ASSERT_OK(ReadOnlyReopen(&options)); - Status s = Put("new", "value"); - ASSERT_EQ(s.ToString(), - "Not implemented: Not supported operation in read only mode."); - ASSERT_EQ(DummyString(kFileSize / 2, '1'), Get("aaa")); - Close(); - options.max_open_files = -1; - 
ASSERT_OK(ReadOnlyReopen(&options)); - s = Put("new", "value"); - ASSERT_EQ(s.ToString(), - "Not implemented: Not supported in compacted db mode."); - ASSERT_EQ(DummyString(kFileSize / 2, '1'), Get("aaa")); - Close(); - Reopen(&options); - // Add more L0 files - ASSERT_OK(Put("bbb", DummyString(kFileSize / 2, '2'))); - Flush(); - ASSERT_OK(Put("aaa", DummyString(kFileSize / 2, 'a'))); - Flush(); - ASSERT_OK(Put("bbb", DummyString(kFileSize / 2, 'b'))); - Flush(); - Close(); - - ASSERT_OK(ReadOnlyReopen(&options)); - // Fallback to read-only DB - s = Put("new", "value"); - ASSERT_EQ(s.ToString(), - "Not implemented: Not supported operation in read only mode."); - Close(); - - // Full compaction - Reopen(&options); - // Add more keys - ASSERT_OK(Put("eee", DummyString(kFileSize / 2, 'e'))); - ASSERT_OK(Put("fff", DummyString(kFileSize / 2, 'f'))); - ASSERT_OK(Put("hhh", DummyString(kFileSize / 2, 'h'))); - ASSERT_OK(Put("iii", DummyString(kFileSize / 2, 'i'))); - ASSERT_OK(Put("jjj", DummyString(kFileSize / 2, 'j'))); - db_->CompactRange(nullptr, nullptr); - ASSERT_EQ(3, NumTableFilesAtLevel(1)); - Close(); - - // CompactedDB - ASSERT_OK(ReadOnlyReopen(&options)); - s = Put("new", "value"); - ASSERT_EQ(s.ToString(), - "Not implemented: Not supported in compacted db mode."); - ASSERT_EQ("NOT_FOUND", Get("abc")); - ASSERT_EQ(DummyString(kFileSize / 2, 'a'), Get("aaa")); - ASSERT_EQ(DummyString(kFileSize / 2, 'b'), Get("bbb")); - ASSERT_EQ("NOT_FOUND", Get("ccc")); - ASSERT_EQ(DummyString(kFileSize / 2, 'e'), Get("eee")); - ASSERT_EQ(DummyString(kFileSize / 2, 'f'), Get("fff")); - ASSERT_EQ("NOT_FOUND", Get("ggg")); - ASSERT_EQ(DummyString(kFileSize / 2, 'h'), Get("hhh")); - ASSERT_EQ(DummyString(kFileSize / 2, 'i'), Get("iii")); - ASSERT_EQ(DummyString(kFileSize / 2, 'j'), Get("jjj")); - ASSERT_EQ("NOT_FOUND", Get("kkk")); - - // MultiGet - std::vector values; - std::vector status_list = dbfull()->MultiGet(ReadOptions(), - std::vector({Slice("aaa"), Slice("ccc"), 
Slice("eee"), - Slice("ggg"), Slice("iii"), Slice("kkk")}), - &values); - ASSERT_EQ(status_list.size(), static_cast(6)); - ASSERT_EQ(values.size(), static_cast(6)); - ASSERT_OK(status_list[0]); - ASSERT_EQ(DummyString(kFileSize / 2, 'a'), values[0]); - ASSERT_TRUE(status_list[1].IsNotFound()); - ASSERT_OK(status_list[2]); - ASSERT_EQ(DummyString(kFileSize / 2, 'e'), values[2]); - ASSERT_TRUE(status_list[3].IsNotFound()); - ASSERT_OK(status_list[4]); - ASSERT_EQ(DummyString(kFileSize / 2, 'i'), values[4]); - ASSERT_TRUE(status_list[5].IsNotFound()); -} - // Make sure that when options.block_cache is set, after a new table is // created its index/filter blocks are added to block cache. TEST(DBTest, IndexAndFilterBlocksOfNewTableAddedToCache) { @@ -1429,7 +1262,6 @@ TEST(DBTest, IndexAndFilterBlocksOfNewTableAddedToCache) { TEST(DBTest, GetPropertiesOfAllTablesTest) { Options options = CurrentOptions(); - options.max_background_flushes = 0; Reopen(&options); // Create 4 tables for (int table = 0; table < 4; ++table) { @@ -1625,10 +1457,7 @@ TEST(DBTest, GetPicksCorrectFile) { TEST(DBTest, GetEncountersEmptyLevel) { do { - Options options = CurrentOptions(); - options.max_background_flushes = 0; - options.disableDataSync = true; - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}); // Arrange for the following to happen: // * sstable A in level 0 // * nothing in level 1 @@ -1791,8 +1620,8 @@ TEST(DBTest, NonBlockingIteration) { // This test verifies block cache behaviors, which is not used by plain // table format. 
// Exclude kHashCuckoo as it does not support iteration currently - } while (ChangeOptions(kSkipPlainTable | kSkipNoSeekToLast | kSkipHashCuckoo | - kSkipMmapReads)); + } while (ChangeOptions(kSkipPlainTable | kSkipNoSeekToLast | + kSkipHashCuckoo)); } // A delete is skipped for key if KeyMayExist(key) returns False @@ -2676,49 +2505,6 @@ class SleepingBackgroundTask { bool done_with_sleep_; }; -TEST(DBTest, FlushEmptyColumnFamily) { - // Block flush thread and disable compaction thread - env_->SetBackgroundThreads(1, Env::HIGH); - env_->SetBackgroundThreads(1, Env::LOW); - SleepingBackgroundTask sleeping_task_low; - env_->Schedule(&SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, - Env::Priority::LOW); - SleepingBackgroundTask sleeping_task_high; - env_->Schedule(&SleepingBackgroundTask::DoSleepTask, &sleeping_task_high, - Env::Priority::HIGH); - - Options options = CurrentOptions(); - // disable compaction - options.disable_auto_compactions = true; - WriteOptions writeOpt = WriteOptions(); - writeOpt.disableWAL = true; - options.max_write_buffer_number = 2; - options.min_write_buffer_number_to_merge = 1; - CreateAndReopenWithCF({"pikachu"}, &options); - - // Compaction can still go through even if no thread can flush the - // mem table. - ASSERT_OK(Flush(0)); - ASSERT_OK(Flush(1)); - - // Insert can go through - ASSERT_OK(dbfull()->Put(writeOpt, handles_[0], "foo", "v1")); - ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v1")); - - ASSERT_EQ("v1", Get(0, "foo")); - ASSERT_EQ("v1", Get(1, "bar")); - - sleeping_task_high.WakeUp(); - sleeping_task_high.WaitUntilDone(); - - // Flush can still go through. - ASSERT_OK(Flush(0)); - ASSERT_OK(Flush(1)); - - sleeping_task_low.WakeUp(); - sleeping_task_low.WaitUntilDone(); -} - TEST(DBTest, GetProperty) { // Set sizes to both background thread pool to be 1 and block them. 
env_->SetBackgroundThreads(1, Env::HIGH); @@ -2912,44 +2698,6 @@ TEST(DBTest, RecoverDuringMemtableCompaction) { } while (ChangeOptions()); } -TEST(DBTest, FlushSchedule) { - Options options = CurrentOptions(); - options.disable_auto_compactions = true; - options.level0_stop_writes_trigger = 1 << 10; - options.level0_slowdown_writes_trigger = 1 << 10; - options.min_write_buffer_number_to_merge = 1; - options.max_write_buffer_number = 2; - options.write_buffer_size = 100 * 1000; - CreateAndReopenWithCF({"pikachu"}, &options); - std::vector threads; - - std::atomic thread_num(0); - // each column family will have 5 thread, each thread generating 2 memtables. - // each column family should end up with 10 table files - for (int i = 0; i < 10; ++i) { - threads.emplace_back([&]() { - int a = thread_num.fetch_add(1); - Random rnd(a); - WriteOptions wo; - // this should fill up 2 memtables - for (int k = 0; k < 5000; ++k) { - ASSERT_OK(db_->Put(wo, handles_[a & 1], RandomString(&rnd, 13), "")); - } - }); - } - - for (auto& t : threads) { - t.join(); - } - - auto default_tables = GetNumberOfSstFilesForColumnFamily(db_, "default"); - auto pikachu_tables = GetNumberOfSstFilesForColumnFamily(db_, "pikachu"); - ASSERT_LE(default_tables, static_cast(10)); - ASSERT_GT(default_tables, static_cast(0)); - ASSERT_LE(pikachu_tables, static_cast(10)); - ASSERT_GT(pikachu_tables, static_cast(0)); -} - TEST(DBTest, MinorCompactionsHappen) { do { Options options; @@ -4462,25 +4210,22 @@ TEST(DBTest, CompactionFilter) { // TODO: figure out sequence number squashtoo int count = 0; int total = 0; - Arena arena; - { - ScopedArenaIterator iter( - dbfull()->TEST_NewInternalIterator(&arena, handles_[1])); - iter->SeekToFirst(); - ASSERT_OK(iter->status()); - while (iter->Valid()) { - ParsedInternalKey ikey(Slice(), 0, kTypeValue); - ikey.sequence = -1; - ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true); - total++; - if (ikey.sequence != 0) { - count++; - } - iter->Next(); + Iterator* iter = 
dbfull()->TEST_NewInternalIterator(handles_[1]); + iter->SeekToFirst(); + ASSERT_OK(iter->status()); + while (iter->Valid()) { + ParsedInternalKey ikey(Slice(), 0, kTypeValue); + ikey.sequence = -1; + ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true); + total++; + if (ikey.sequence != 0) { + count++; } + iter->Next(); } ASSERT_EQ(total, 100000); ASSERT_EQ(count, 1); + delete iter; // overwrite all the 100K keys once again. for (int i = 0; i < 100000; i++) { @@ -4535,7 +4280,7 @@ TEST(DBTest, CompactionFilter) { ASSERT_EQ(NumTableFilesAtLevel(1, 1), 0); // Scan the entire database to ensure that nothing is left - Iterator* iter = db_->NewIterator(ReadOptions(), handles_[1]); + iter = db_->NewIterator(ReadOptions(), handles_[1]); iter->SeekToFirst(); count = 0; while (iter->Valid()) { @@ -4551,20 +4296,18 @@ TEST(DBTest, CompactionFilter) { // TODO: remove the following or design a different // test count = 0; - { - ScopedArenaIterator iter( - dbfull()->TEST_NewInternalIterator(&arena, handles_[1])); - iter->SeekToFirst(); - ASSERT_OK(iter->status()); - while (iter->Valid()) { - ParsedInternalKey ikey(Slice(), 0, kTypeValue); - ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true); - ASSERT_NE(ikey.sequence, (unsigned)0); - count++; - iter->Next(); - } - ASSERT_EQ(count, 0); + iter = dbfull()->TEST_NewInternalIterator(handles_[1]); + iter->SeekToFirst(); + ASSERT_OK(iter->status()); + while (iter->Valid()) { + ParsedInternalKey ikey(Slice(), 0, kTypeValue); + ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true); + ASSERT_NE(ikey.sequence, (unsigned)0); + count++; + iter->Next(); } + ASSERT_EQ(count, 0); + delete iter; } // Tests the edge case where compaction does not produce any output -- all @@ -4686,24 +4429,22 @@ TEST(DBTest, CompactionFilterContextManual) { // Verify total number of keys is correct after manual compaction. 
int count = 0; int total = 0; - { - Arena arena; - ScopedArenaIterator iter(dbfull()->TEST_NewInternalIterator(&arena)); - iter->SeekToFirst(); - ASSERT_OK(iter->status()); - while (iter->Valid()) { - ParsedInternalKey ikey(Slice(), 0, kTypeValue); - ikey.sequence = -1; - ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true); - total++; - if (ikey.sequence != 0) { - count++; - } - iter->Next(); + Iterator* iter = dbfull()->TEST_NewInternalIterator(); + iter->SeekToFirst(); + ASSERT_OK(iter->status()); + while (iter->Valid()) { + ParsedInternalKey ikey(Slice(), 0, kTypeValue); + ikey.sequence = -1; + ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true); + total++; + if (ikey.sequence != 0) { + count++; } - ASSERT_EQ(total, 700); - ASSERT_EQ(count, 1); + iter->Next(); } + ASSERT_EQ(total, 700); + ASSERT_EQ(count, 1); + delete iter; } class KeepFilterV2 : public CompactionFilterV2 { @@ -4860,27 +4601,25 @@ TEST(DBTest, CompactionFilterV2) { // All the files are in the lowest level. int count = 0; int total = 0; - { - Arena arena; - ScopedArenaIterator iter(dbfull()->TEST_NewInternalIterator(&arena)); - iter->SeekToFirst(); - ASSERT_OK(iter->status()); - while (iter->Valid()) { - ParsedInternalKey ikey(Slice(), 0, kTypeValue); - ikey.sequence = -1; - ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true); - total++; - if (ikey.sequence != 0) { - count++; - } - iter->Next(); + Iterator* iter = dbfull()->TEST_NewInternalIterator(); + iter->SeekToFirst(); + ASSERT_OK(iter->status()); + while (iter->Valid()) { + ParsedInternalKey ikey(Slice(), 0, kTypeValue); + ikey.sequence = -1; + ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true); + total++; + if (ikey.sequence != 0) { + count++; } + iter->Next(); } ASSERT_EQ(total, 100000); // 1 snapshot only. 
Since we are using universal compacton, // the sequence no is cleared for better compression ASSERT_EQ(count, 1); + delete iter; // create a new database with the compaction // filter in such a way that it deletes all keys @@ -4904,7 +4643,7 @@ TEST(DBTest, CompactionFilterV2) { ASSERT_EQ(NumTableFilesAtLevel(1), 0); // Scan the entire database to ensure that nothing is left - Iterator* iter = db_->NewIterator(ReadOptions()); + iter = db_->NewIterator(ReadOptions()); iter->SeekToFirst(); count = 0; while (iter->Valid()) { @@ -5232,9 +4971,7 @@ TEST(DBTest, Snapshot) { TEST(DBTest, HiddenValuesAreRemoved) { do { - Options options = CurrentOptions(); - options.max_background_flushes = 0; - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}); Random rnd(301); FillLevels("a", "z", 1); @@ -5325,9 +5062,7 @@ TEST(DBTest, CompactBetweenSnapshots) { } TEST(DBTest, DeletionMarkers1) { - Options options = CurrentOptions(); - options.max_background_flushes = 0; - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}); Put(1, "foo", "v1"); ASSERT_OK(Flush(1)); const int last = CurrentOptions().max_mem_compaction_level; @@ -5362,9 +5097,7 @@ TEST(DBTest, DeletionMarkers1) { } TEST(DBTest, DeletionMarkers2) { - Options options = CurrentOptions(); - options.max_background_flushes = 0; - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}); Put(1, "foo", "v1"); ASSERT_OK(Flush(1)); const int last = CurrentOptions().max_mem_compaction_level; @@ -5393,9 +5126,7 @@ TEST(DBTest, DeletionMarkers2) { TEST(DBTest, OverlapInLevel0) { do { - Options options = CurrentOptions(); - options.max_background_flushes = 0; - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}); int tmp = CurrentOptions().max_mem_compaction_level; ASSERT_EQ(tmp, 2) << "Fix test to match config"; @@ -5573,9 +5304,7 @@ TEST(DBTest, CustomComparator) { } TEST(DBTest, ManualCompaction) { - Options 
options = CurrentOptions(); - options.max_background_flushes = 0; - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}); ASSERT_EQ(dbfull()->MaxMemCompactionLevel(), 2) << "Need to update this test to match kMaxMemCompactLevel"; @@ -5613,7 +5342,6 @@ TEST(DBTest, ManualCompaction) { if (iter == 0) { Options options = CurrentOptions(); - options.max_background_flushes = 0; options.num_levels = 3; options.create_if_missing = true; DestroyAndReopen(&options); @@ -5713,7 +5441,6 @@ TEST(DBTest, DBOpen_Options) { TEST(DBTest, DBOpen_Change_NumLevels) { Options opts; opts.create_if_missing = true; - opts.max_background_flushes = 0; DestroyAndReopen(&opts); ASSERT_TRUE(db_ != nullptr); CreateAndReopenWithCF({"pikachu"}, &opts); @@ -5764,8 +5491,8 @@ TEST(DBTest, DestroyDBMetaDatabase) { ASSERT_TRUE(!(DB::Open(opts, metametadbname, &db)).ok()); } -// Check that number of files does not grow when writes are dropped -TEST(DBTest, DropWrites) { +// Check that number of files does not grow when we are out of space +TEST(DBTest, NoSpace) { do { Options options = CurrentOptions(); options.env = env_; @@ -5776,7 +5503,7 @@ TEST(DBTest, DropWrites) { ASSERT_EQ("v1", Get("foo")); Compact("a", "z"); const int num_files = CountFiles(); - env_->drop_writes_.Release_Store(env_); // Force out-of-space errors + env_->no_space_.Release_Store(env_); // Force out-of-space errors env_->sleep_counter_.Reset(); for (int i = 0; i < 5; i++) { for (int level = 0; level < dbfull()->NumberLevels()-1; level++) { @@ -5788,7 +5515,7 @@ TEST(DBTest, DropWrites) { ASSERT_TRUE(db_->GetProperty("rocksdb.background-errors", &property_value)); ASSERT_EQ("5", property_value); - env_->drop_writes_.Release_Store(nullptr); + env_->no_space_.Release_Store(nullptr); ASSERT_LT(CountFiles(), num_files + 3); // Check that compaction attempts slept after errors @@ -5797,7 +5524,7 @@ TEST(DBTest, DropWrites) { } // Check background error counter bumped on flush failures. 
-TEST(DBTest, DropWritesFlush) { +TEST(DBTest, NoSpaceFlush) { do { Options options = CurrentOptions(); options.env = env_; @@ -5805,7 +5532,7 @@ TEST(DBTest, DropWritesFlush) { Reopen(&options); ASSERT_OK(Put("foo", "v1")); - env_->drop_writes_.Release_Store(env_); // Force out-of-space errors + env_->no_space_.Release_Store(env_); // Force out-of-space errors std::string property_value; // Background error count is 0 now. @@ -5829,30 +5556,6 @@ TEST(DBTest, DropWritesFlush) { } ASSERT_EQ("1", property_value); - env_->drop_writes_.Release_Store(nullptr); - } while (ChangeCompactOptions()); -} - -// Check that CompactRange() returns failure if there is not enough space left -// on device -TEST(DBTest, NoSpaceCompactRange) { - do { - Options options = CurrentOptions(); - options.env = env_; - options.disable_auto_compactions = true; - Reopen(&options); - - // generate 5 tables - for (int i = 0; i < 5; ++i) { - ASSERT_OK(Put(Key(i), Key(i) + "v")); - ASSERT_OK(Flush()); - } - - env_->no_space_.Release_Store(env_); // Force out-of-space errors - - Status s = db_->CompactRange(nullptr, nullptr); - ASSERT_TRUE(s.IsIOError()); - env_->no_space_.Release_Store(nullptr); } while (ChangeCompactOptions()); } @@ -5897,7 +5600,6 @@ TEST(DBTest, ManifestWriteError) { options.env = env_; options.create_if_missing = true; options.error_if_exists = false; - options.max_background_flushes = 0; DestroyAndReopen(&options); ASSERT_OK(Put("foo", "bar")); ASSERT_EQ("bar", Get("foo")); @@ -6031,166 +5733,6 @@ TEST(DBTest, BloomFilter) { } while (ChangeCompactOptions()); } -TEST(DBTest, BloomFilterRate) { - while (ChangeFilterOptions()) { - Options options = CurrentOptions(); - options.statistics = rocksdb::CreateDBStatistics(); - CreateAndReopenWithCF({"pikachu"}, &options); - - const int maxKey = 10000; - for (int i = 0; i < maxKey; i++) { - ASSERT_OK(Put(1, Key(i), Key(i))); - } - // Add a large key to make the file contain wide range - ASSERT_OK(Put(1, Key(maxKey + 55555), Key(maxKey + 
55555))); - Flush(1); - - // Check if they can be found - for (int i = 0; i < maxKey; i++) { - ASSERT_EQ(Key(i), Get(1, Key(i))); - } - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); - - // Check if filter is useful - for (int i = 0; i < maxKey; i++) { - ASSERT_EQ("NOT_FOUND", Get(1, Key(i+33333))); - } - ASSERT_GE(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), maxKey*0.98); - } -} - -TEST(DBTest, BloomFilterCompatibility) { - Options options; - options.statistics = rocksdb::CreateDBStatistics(); - BlockBasedTableOptions table_options; - table_options.filter_policy.reset(NewBloomFilterPolicy(10, true)); - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - - // Create with block based filter - CreateAndReopenWithCF({"pikachu"}, &options); - - const int maxKey = 10000; - for (int i = 0; i < maxKey; i++) { - ASSERT_OK(Put(1, Key(i), Key(i))); - } - ASSERT_OK(Put(1, Key(maxKey + 55555), Key(maxKey + 55555))); - Flush(1); - - // Check db with full filter - table_options.filter_policy.reset(NewBloomFilterPolicy(10, false)); - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - ReopenWithColumnFamilies({"default", "pikachu"}, &options); - - // Check if they can be found - for (int i = 0; i < maxKey; i++) { - ASSERT_EQ(Key(i), Get(1, Key(i))); - } - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); -} - -TEST(DBTest, BloomFilterReverseCompatibility) { - Options options; - options.statistics = rocksdb::CreateDBStatistics(); - BlockBasedTableOptions table_options; - table_options.filter_policy.reset(NewBloomFilterPolicy(10, false)); - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - - // Create with full filter - CreateAndReopenWithCF({"pikachu"}, &options); - - const int maxKey = 10000; - for (int i = 0; i < maxKey; i++) { - ASSERT_OK(Put(1, Key(i), Key(i))); - } - ASSERT_OK(Put(1, Key(maxKey + 55555), Key(maxKey + 55555))); - Flush(1); - - // Check db with block_based 
filter - table_options.filter_policy.reset(NewBloomFilterPolicy(10, true)); - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - ReopenWithColumnFamilies({"default", "pikachu"}, &options); - - // Check if they can be found - for (int i = 0; i < maxKey; i++) { - ASSERT_EQ(Key(i), Get(1, Key(i))); - } - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); -} - -namespace { -// A wrapped bloom over default FilterPolicy -class WrappedBloom : public FilterPolicy { - public: - explicit WrappedBloom(int bits_per_key) : - filter_(NewBloomFilterPolicy(bits_per_key)), - counter_(0) {} - - ~WrappedBloom() { delete filter_; } - - const char* Name() const override { return "WrappedRocksDbFilterPolicy"; } - - void CreateFilter(const rocksdb::Slice* keys, int n, std::string* dst) - const override { - std::unique_ptr user_keys(new rocksdb::Slice[n]); - for (int i = 0; i < n; ++i) { - user_keys[i] = convertKey(keys[i]); - } - return filter_->CreateFilter(user_keys.get(), n, dst); - } - - bool KeyMayMatch(const rocksdb::Slice& key, const rocksdb::Slice& filter) - const override { - counter_++; - return filter_->KeyMayMatch(convertKey(key), filter); - } - - uint32_t GetCounter() { return counter_; } - - private: - const FilterPolicy* filter_; - mutable uint32_t counter_; - - rocksdb::Slice convertKey(const rocksdb::Slice& key) const { - return key; - } -}; -} // namespace - -TEST(DBTest, BloomFilterWrapper) { - Options options; - options.statistics = rocksdb::CreateDBStatistics(); - - BlockBasedTableOptions table_options; - WrappedBloom* policy = new WrappedBloom(10); - table_options.filter_policy.reset(policy); - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - - CreateAndReopenWithCF({"pikachu"}, &options); - - const int maxKey = 10000; - for (int i = 0; i < maxKey; i++) { - ASSERT_OK(Put(1, Key(i), Key(i))); - } - // Add a large key to make the file contain wide range - ASSERT_OK(Put(1, Key(maxKey + 55555), Key(maxKey + 
55555))); - ASSERT_EQ(0U, policy->GetCounter()); - Flush(1); - - // Check if they can be found - for (int i = 0; i < maxKey; i++) { - ASSERT_EQ(Key(i), Get(1, Key(i))); - } - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); - ASSERT_EQ(1U * maxKey, policy->GetCounter()); - - // Check if filter is useful - for (int i = 0; i < maxKey; i++) { - ASSERT_EQ("NOT_FOUND", Get(1, Key(i+33333))); - } - ASSERT_GE(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), maxKey*0.98); - ASSERT_EQ(2U * maxKey, policy->GetCounter()); -} - TEST(DBTest, SnapshotFiles) { do { Options options = CurrentOptions(); @@ -6410,18 +5952,18 @@ namespace { std::vector ListSpecificFiles( Env* env, const std::string& path, const FileType expected_file_type) { std::vector files; - std::vector file_numbers; + std::vector log_files; env->GetChildren(path, &files); uint64_t number; FileType type; for (size_t i = 0; i < files.size(); ++i) { if (ParseFileName(files[i], &number, &type)) { if (type == expected_file_type) { - file_numbers.push_back(number); + log_files.push_back(number); } } } - return std::move(file_numbers); + return std::move(log_files); } std::vector ListLogFiles(Env* env, const std::string& path) { @@ -6455,130 +5997,6 @@ TEST(DBTest, FlushOneColumnFamily) { } } -// In https://reviews.facebook.net/D20661 we change -// recovery behavior: previously for each log file each column family -// memtable was flushed, even it was empty. 
Now it's changed: -// we try to create the smallest number of table files by merging -// updates from multiple logs -TEST(DBTest, RecoverCheckFileAmountWithSmallWriteBuffer) { - Options options; - options.write_buffer_size = 5000000; - CreateAndReopenWithCF({"pikachu", "dobrynia", "nikitich"}, &options); - - // Since we will reopen DB with smaller write_buffer_size, - // each key will go to new SST file - ASSERT_OK(Put(1, Key(10), DummyString(1000000))); - ASSERT_OK(Put(1, Key(10), DummyString(1000000))); - ASSERT_OK(Put(1, Key(10), DummyString(1000000))); - ASSERT_OK(Put(1, Key(10), DummyString(1000000))); - - ASSERT_OK(Put(3, Key(10), DummyString(1))); - // Make 'dobrynia' to be flushed and new WAL file to be created - ASSERT_OK(Put(2, Key(10), DummyString(7500000))); - ASSERT_OK(Put(2, Key(1), DummyString(1))); - dbfull()->TEST_WaitForFlushMemTable(handles_[2]); - { - auto tables = ListTableFiles(env_, dbname_); - ASSERT_EQ(tables.size(), static_cast(1)); - // Make sure 'dobrynia' was flushed: check sst files amount - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"), - static_cast(1)); - } - // New WAL file - ASSERT_OK(Put(1, Key(1), DummyString(1))); - ASSERT_OK(Put(1, Key(1), DummyString(1))); - ASSERT_OK(Put(3, Key(10), DummyString(1))); - ASSERT_OK(Put(3, Key(10), DummyString(1))); - ASSERT_OK(Put(3, Key(10), DummyString(1))); - - options.write_buffer_size = 10; - ReopenWithColumnFamilies({"default", "pikachu", "dobrynia", "nikitich"}, - &options); - { - // No inserts => default is empty - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"), - static_cast(0)); - // First 4 keys goes to separate SSTs + 1 more SST for 2 smaller keys - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"), - static_cast(5)); - // 1 SST for big key + 1 SST for small one - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"), - static_cast(2)); - // 1 SST for all keys - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"), - 
static_cast(1)); - } -} - -// In https://reviews.facebook.net/D20661 we change -// recovery behavior: previously for each log file each column family -// memtable was flushed, even it wasn't empty. Now it's changed: -// we try to create the smallest number of table files by merging -// updates from multiple logs -TEST(DBTest, RecoverCheckFileAmount) { - Options options; - options.write_buffer_size = 100000; - CreateAndReopenWithCF({"pikachu", "dobrynia", "nikitich"}, &options); - - ASSERT_OK(Put(0, Key(1), DummyString(1))); - ASSERT_OK(Put(1, Key(1), DummyString(1))); - ASSERT_OK(Put(2, Key(1), DummyString(1))); - - // Make 'nikitich' memtable to be flushed - ASSERT_OK(Put(3, Key(10), DummyString(1002400))); - ASSERT_OK(Put(3, Key(1), DummyString(1))); - dbfull()->TEST_WaitForFlushMemTable(handles_[3]); - // 4 memtable are not flushed, 1 sst file - { - auto tables = ListTableFiles(env_, dbname_); - ASSERT_EQ(tables.size(), static_cast(1)); - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"), - static_cast(1)); - } - // Memtable for 'nikitich' has flushed, new WAL file has opened - // 4 memtable still not flushed - - // Write to new WAL file - ASSERT_OK(Put(0, Key(1), DummyString(1))); - ASSERT_OK(Put(1, Key(1), DummyString(1))); - ASSERT_OK(Put(2, Key(1), DummyString(1))); - - // Fill up 'nikitich' one more time - ASSERT_OK(Put(3, Key(10), DummyString(1002400))); - // make it flush - ASSERT_OK(Put(3, Key(1), DummyString(1))); - dbfull()->TEST_WaitForFlushMemTable(handles_[3]); - // There are still 4 memtable not flushed, and 2 sst tables - ASSERT_OK(Put(0, Key(1), DummyString(1))); - ASSERT_OK(Put(1, Key(1), DummyString(1))); - ASSERT_OK(Put(2, Key(1), DummyString(1))); - - { - auto tables = ListTableFiles(env_, dbname_); - ASSERT_EQ(tables.size(), static_cast(2)); - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"), - static_cast(2)); - } - - ReopenWithColumnFamilies({"default", "pikachu", "dobrynia", "nikitich"}, - &options); - { - 
std::vector table_files = ListTableFiles(env_, dbname_); - // Check, that records for 'default', 'dobrynia' and 'pikachu' from - // first, second and third WALs went to the same SST. - // So, there is 6 SSTs: three for 'nikitich', one for 'default', one for - // 'dobrynia', one for 'pikachu' - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"), - static_cast(1)); - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"), - static_cast(3)); - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"), - static_cast(1)); - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"), - static_cast(1)); - } -} - TEST(DBTest, WALArchivalTtl) { do { Options options = CurrentOptions(); @@ -6714,7 +6132,7 @@ TEST(DBTest, PurgeInfoLogs) { ASSERT_EQ(5, info_log_count); Destroy(&options); - // For mode (1), test DestroyDB() to delete all the logs under DB dir. + // For mode (1), test DestoryDB() to delete all the logs under DB dir. // For mode (2), no info log file should have been put under DB dir. 
std::vector db_files; env_->GetChildren(dbname_, &db_files); @@ -7765,49 +7183,47 @@ void PrefixScanInit(DBTest *dbtest) { } // namespace TEST(DBTest, PrefixScan) { - while (ChangeFilterOptions()) { - int count; - Slice prefix; - Slice key; - char buf[100]; - Iterator* iter; - snprintf(buf, sizeof(buf), "03______:"); - prefix = Slice(buf, 8); - key = Slice(buf, 9); - // db configs - env_->count_random_reads_ = true; - Options options = CurrentOptions(); - options.env = env_; - options.prefix_extractor.reset(NewFixedPrefixTransform(8)); - options.disable_auto_compactions = true; - options.max_background_compactions = 2; - options.create_if_missing = true; - options.memtable_factory.reset(NewHashSkipListRepFactory(16)); + int count; + Slice prefix; + Slice key; + char buf[100]; + Iterator* iter; + snprintf(buf, sizeof(buf), "03______:"); + prefix = Slice(buf, 8); + key = Slice(buf, 9); + // db configs + env_->count_random_reads_ = true; + Options options = CurrentOptions(); + options.env = env_; + options.prefix_extractor.reset(NewFixedPrefixTransform(8)); + options.disable_auto_compactions = true; + options.max_background_compactions = 2; + options.create_if_missing = true; + options.memtable_factory.reset(NewHashSkipListRepFactory(16)); - BlockBasedTableOptions table_options; - table_options.no_block_cache = true; - table_options.filter_policy.reset(NewBloomFilterPolicy(10)); - table_options.whole_key_filtering = false; - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + BlockBasedTableOptions table_options; + table_options.no_block_cache = true; + table_options.filter_policy.reset(NewBloomFilterPolicy(10)); + table_options.whole_key_filtering = false; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - // 11 RAND I/Os - DestroyAndReopen(&options); - PrefixScanInit(this); - count = 0; - env_->random_read_counter_.Reset(); - iter = db_->NewIterator(ReadOptions()); - for (iter->Seek(prefix); iter->Valid(); iter->Next()) { 
- if (! iter->key().starts_with(prefix)) { - break; - } - count++; + // 11 RAND I/Os + DestroyAndReopen(&options); + PrefixScanInit(this); + count = 0; + env_->random_read_counter_.Reset(); + iter = db_->NewIterator(ReadOptions()); + for (iter->Seek(prefix); iter->Valid(); iter->Next()) { + if (! iter->key().starts_with(prefix)) { + break; } - ASSERT_OK(iter->status()); - delete iter; - ASSERT_EQ(count, 2); - ASSERT_EQ(env_->random_read_counter_.Read(), 2); - Close(); - } // end of while + count++; + } + ASSERT_OK(iter->status()); + delete iter; + ASSERT_EQ(count, 2); + ASSERT_EQ(env_->random_read_counter_.Read(), 2); + Close(); } TEST(DBTest, TailingIteratorSingle) { @@ -8122,25 +7538,18 @@ TEST(DBTest, FIFOCompactionTest) { } TEST(DBTest, SimpleWriteTimeoutTest) { - // Block compaction thread, which will also block the flushes because - // max_background_flushes == 0, so flushes are getting executed by the - // compaction thread - env_->SetBackgroundThreads(1, Env::LOW); - SleepingBackgroundTask sleeping_task_low; - env_->Schedule(&SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, - Env::Priority::LOW); - Options options; options.env = env_; options.create_if_missing = true; options.write_buffer_size = 100000; options.max_background_flushes = 0; options.max_write_buffer_number = 2; + options.min_write_buffer_number_to_merge = 3; options.max_total_wal_size = std::numeric_limits::max(); WriteOptions write_opt = WriteOptions(); write_opt.timeout_hint_us = 0; DestroyAndReopen(&options); - // fill the two write buffers + // fill the two write buffer ASSERT_OK(Put(Key(1), Key(1) + std::string(100000, 'v'), write_opt)); ASSERT_OK(Put(Key(2), Key(2) + std::string(100000, 'v'), write_opt)); // As the only two write buffers are full in this moment, the third @@ -8148,9 +7557,6 @@ TEST(DBTest, SimpleWriteTimeoutTest) { write_opt.timeout_hint_us = 50; ASSERT_TRUE( Put(Key(3), Key(3) + std::string(100000, 'v'), write_opt).IsTimedOut()); - - sleeping_task_low.WakeUp(); 
- sleeping_task_low.WaitUntilDone(); } // Multi-threaded Timeout Test @@ -8251,26 +7657,6 @@ TEST(DBTest, MTRandomTimeoutTest) { } } -TEST(DBTest, Level0StopWritesTest) { - Options options = CurrentOptions(); - options.level0_slowdown_writes_trigger = 2; - options.level0_stop_writes_trigger = 4; - options.disable_auto_compactions = 4; - options.max_mem_compaction_level = 0; - Reopen(&options); - - // create 4 level0 tables - for (int i = 0; i < 4; ++i) { - Put("a", "b"); - Flush(); - } - - WriteOptions woptions; - woptions.timeout_hint_us = 30 * 1000; // 30 ms - Status s = Put("a", "b", woptions); - ASSERT_TRUE(s.IsTimedOut()); -} - } // anonymous namespace /* @@ -8357,285 +7743,6 @@ TEST(DBTest, TableOptionsSanitizeTest) { ASSERT_TRUE(TryReopen(&options).IsNotSupported()); } -TEST(DBTest, DBIteratorBoundTest) { - Options options; - options.env = env_; - options.create_if_missing = true; - - options.prefix_extractor = nullptr; - DestroyAndReopen(&options); - ASSERT_OK(Put("a", "0")); - ASSERT_OK(Put("foo", "bar")); - ASSERT_OK(Put("foo1", "bar1")); - ASSERT_OK(Put("g1", "0")); - - // testing basic case with no iterate_upper_bound and no prefix_extractor - { - ReadOptions ro; - ro.iterate_upper_bound = nullptr; - - std::unique_ptr iter(db_->NewIterator(ro)); - - iter->Seek("foo"); - - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ(iter->key().compare(Slice("foo")), 0); - - iter->Next(); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ(iter->key().compare(Slice("foo1")), 0); - - iter->Next(); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ(iter->key().compare(Slice("g1")), 0); - } - - // testing iterate_upper_bound and forward iterator - // to make sure it stops at bound - { - ReadOptions ro; - // iterate_upper_bound points beyond the last expected entry - Slice prefix("foo2"); - ro.iterate_upper_bound = &prefix; - - std::unique_ptr iter(db_->NewIterator(ro)); - - iter->Seek("foo"); - - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ(iter->key().compare(Slice("foo")), 0); - - iter->Next(); - 
ASSERT_TRUE(iter->Valid()); - ASSERT_EQ(iter->key().compare(("foo1")), 0); - - iter->Next(); - // should stop here... - ASSERT_TRUE(!iter->Valid()); - } - - // prefix is the first letter of the key - options.prefix_extractor.reset(NewFixedPrefixTransform(1)); - - DestroyAndReopen(&options); - ASSERT_OK(Put("a", "0")); - ASSERT_OK(Put("foo", "bar")); - ASSERT_OK(Put("foo1", "bar1")); - ASSERT_OK(Put("g1", "0")); - - // testing with iterate_upper_bound and prefix_extractor - // Seek target and iterate_upper_bound are not is same prefix - // This should be an error - { - ReadOptions ro; - Slice prefix("g1"); - ro.iterate_upper_bound = &prefix; - - std::unique_ptr iter(db_->NewIterator(ro)); - - iter->Seek("foo"); - - ASSERT_TRUE(!iter->Valid()); - ASSERT_TRUE(iter->status().IsInvalidArgument()); - } - - // testing that iterate_upper_bound prevents iterating over deleted items - // if the bound has already reached - { - options.prefix_extractor = nullptr; - DestroyAndReopen(&options); - ASSERT_OK(Put("a", "0")); - ASSERT_OK(Put("b", "0")); - ASSERT_OK(Put("b1", "0")); - ASSERT_OK(Put("c", "0")); - ASSERT_OK(Put("d", "0")); - ASSERT_OK(Put("e", "0")); - ASSERT_OK(Delete("c")); - ASSERT_OK(Delete("d")); - - // base case with no bound - ReadOptions ro; - ro.iterate_upper_bound = nullptr; - - std::unique_ptr iter(db_->NewIterator(ro)); - - iter->Seek("b"); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ(iter->key().compare(Slice("b")), 0); - - iter->Next(); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ(iter->key().compare(("b1")), 0); - - perf_context.Reset(); - iter->Next(); - - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ(static_cast(perf_context.internal_delete_skipped_count), 2); - - // now testing with iterate_bound - Slice prefix("c"); - ro.iterate_upper_bound = &prefix; - - iter.reset(db_->NewIterator(ro)); - - perf_context.Reset(); - - iter->Seek("b"); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ(iter->key().compare(Slice("b")), 0); - - iter->Next(); - ASSERT_TRUE(iter->Valid()); - 
ASSERT_EQ(iter->key().compare(("b1")), 0); - - iter->Next(); - // the iteration should stop as soon as the the bound key is reached - // even though the key is deleted - // hence internal_delete_skipped_count should be 0 - ASSERT_TRUE(!iter->Valid()); - ASSERT_EQ(static_cast(perf_context.internal_delete_skipped_count), 0); - } -} - -TEST(DBTest, WriteSingleThreadEntry) { - std::vector threads; - dbfull()->TEST_LockMutex(); - auto w = dbfull()->TEST_BeginWrite(); - threads.emplace_back([&] { Put("a", "b"); }); - env_->SleepForMicroseconds(10000); - threads.emplace_back([&] { Flush(); }); - env_->SleepForMicroseconds(10000); - dbfull()->TEST_UnlockMutex(); - dbfull()->TEST_LockMutex(); - dbfull()->TEST_EndWrite(w); - dbfull()->TEST_UnlockMutex(); - - for (auto& t : threads) { - t.join(); - } -} - -TEST(DBTest, DisableDataSyncTest) { - // iter 0 -- no sync - // iter 1 -- sync - for (int iter = 0; iter < 2; ++iter) { - Options options = CurrentOptions(); - options.disableDataSync = iter == 0; - options.create_if_missing = true; - options.env = env_; - Reopen(&options); - CreateAndReopenWithCF({"pikachu"}, &options); - - MakeTables(10, "a", "z"); - Compact("a", "z"); - - if (iter == 0) { - ASSERT_EQ(env_->sync_counter_.load(), 0); - } else { - ASSERT_GT(env_->sync_counter_.load(), 0); - } - Destroy(&options); - } -} - -TEST(DBTest, DynamicCompactionOptions) { - const uint64_t k64KB = 1 << 16; - const uint64_t k128KB = 1 << 17; - const uint64_t k256KB = 1 << 18; - const uint64_t k5KB = 5 * 1024; - Options options; - options.env = env_; - options.create_if_missing = true; - options.compression = kNoCompression; - options.max_background_compactions = 4; - options.hard_rate_limit = 1.1; - options.write_buffer_size = k128KB; - options.max_write_buffer_number = 2; - // Compaction related options - options.level0_file_num_compaction_trigger = 3; - options.level0_slowdown_writes_trigger = 10; - options.level0_stop_writes_trigger = 20; - options.max_grandparent_overlap_factor = 
10; - options.expanded_compaction_factor = 25; - options.source_compaction_factor = 1; - options.target_file_size_base = k128KB; - options.target_file_size_multiplier = 1; - options.max_bytes_for_level_base = k256KB; - options.max_bytes_for_level_multiplier = 4; - DestroyAndReopen(&options); - - auto gen_l0_kb = [this](int start, int size, int stride = 1) { - Random rnd(301); - std::vector values; - for (int i = 0; i < size; i++) { - values.push_back(RandomString(&rnd, 1024)); - ASSERT_OK(Put(Key(start + stride * i), values[i])); - } - dbfull()->TEST_WaitForFlushMemTable(); - }; - - // Write 3 files that have the same key range, trigger compaction and - // result in one L1 file - gen_l0_kb(0, 128); - ASSERT_EQ(NumTableFilesAtLevel(0), 1); - gen_l0_kb(0, 128); - ASSERT_EQ(NumTableFilesAtLevel(0), 2); - gen_l0_kb(0, 128); - dbfull()->TEST_WaitForCompact(); - ASSERT_EQ("0,1", FilesPerLevel()); - std::vector metadata; - db_->GetLiveFilesMetaData(&metadata); - ASSERT_EQ(1U, metadata.size()); - ASSERT_LE(metadata[0].size, k128KB + k5KB); // < 128KB + 5KB - ASSERT_GE(metadata[0].size, k128KB - k5KB); // > 128B - 5KB - - // Make compaction trigger and file size smaller - ASSERT_TRUE(dbfull()->SetOptions({ - {"level0_file_num_compaction_trigger", "2"}, - {"target_file_size_base", "65536"} - })); - - gen_l0_kb(0, 128); - ASSERT_EQ("1,1", FilesPerLevel()); - gen_l0_kb(0, 128); - dbfull()->TEST_WaitForCompact(); - ASSERT_EQ("0,2", FilesPerLevel()); - metadata.clear(); - db_->GetLiveFilesMetaData(&metadata); - ASSERT_EQ(2U, metadata.size()); - ASSERT_LE(metadata[0].size, k64KB + k5KB); // < 64KB + 5KB - ASSERT_GE(metadata[0].size, k64KB - k5KB); // > 64KB - 5KB - - // Change base level size to 1MB - ASSERT_TRUE(dbfull()->SetOptions({ {"max_bytes_for_level_base", "1048576"} })); - - // writing 56 x 128KB => 7MB - // (L1 + L2) = (1 + 4) * 1MB = 5MB - for (int i = 0; i < 56; ++i) { - gen_l0_kb(i, 128, 56); - } - dbfull()->TEST_WaitForCompact(); - ASSERT_TRUE(SizeAtLevel(1) < 
1048576 * 1.1); - ASSERT_TRUE(SizeAtLevel(2) < 4 * 1048576 * 1.1); - - // Change multiplier to 2 with smaller base - ASSERT_TRUE(dbfull()->SetOptions({ - {"max_bytes_for_level_multiplier", "2"}, - {"max_bytes_for_level_base", "262144"} - })); - - // writing 16 x 128KB - // (L1 + L2 + L3) = (1 + 2 + 4) * 256KB - for (int i = 0; i < 16; ++i) { - gen_l0_kb(i, 128, 50); - } - dbfull()->TEST_WaitForCompact(); - ASSERT_TRUE(SizeAtLevel(1) < 262144 * 1.1); - ASSERT_TRUE(SizeAtLevel(2) < 2 * 262144 * 1.1); - ASSERT_TRUE(SizeAtLevel(3) < 4 * 262144 * 1.1); -} - } // namespace rocksdb int main(int argc, char** argv) { diff --git a/db/dbformat.h b/db/dbformat.h index 516a4693b3..eb5d8ed534 100644 --- a/db/dbformat.h +++ b/db/dbformat.h @@ -244,7 +244,7 @@ class IterKey { Slice GetKey() const { return Slice(key_, key_size_); } - size_t Size() { return key_size_; } + const size_t Size() { return key_size_; } void Clear() { key_size_ = 0; } diff --git a/db/deletefile_test.cc b/db/deletefile_test.cc index a5af312848..14f0324c17 100644 --- a/db/deletefile_test.cc +++ b/db/deletefile_test.cc @@ -34,7 +34,6 @@ class DeleteFileTest { DeleteFileTest() { db_ = nullptr; env_ = Env::Default(); - options_.max_background_flushes = 0; options_.write_buffer_size = 1024*1024*1000; options_.target_file_size_base = 1024*1024*1000; options_.max_bytes_for_level_base = 1024*1024*1000; diff --git a/db/filename.cc b/db/filename.cc index a8f6852968..42c7efb781 100644 --- a/db/filename.cc +++ b/db/filename.cc @@ -6,10 +6,7 @@ // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
-#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS -#endif - #include "db/filename.h" #include diff --git a/db/flush_scheduler.cc b/db/flush_scheduler.cc deleted file mode 100644 index 636ff5a98e..0000000000 --- a/db/flush_scheduler.cc +++ /dev/null @@ -1,62 +0,0 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. - -#include "db/flush_scheduler.h" - -#include - -#include "db/column_family.h" - -namespace rocksdb { - -void FlushScheduler::ScheduleFlush(ColumnFamilyData* cfd) { -#ifndef NDEBUG - assert(column_families_set_.find(cfd) == column_families_set_.end()); - column_families_set_.insert(cfd); -#endif // NDEBUG - cfd->Ref(); - column_families_.push_back(cfd); -} - -ColumnFamilyData* FlushScheduler::GetNextColumnFamily() { - ColumnFamilyData* cfd = nullptr; - while (column_families_.size() > 0) { - cfd = column_families_.front(); - column_families_.pop_front(); - if (cfd->IsDropped()) { - if (cfd->Unref()) { - delete cfd; - } - } else { - break; - } - } -#ifndef NDEBUG - if (cfd != nullptr) { - auto itr = column_families_set_.find(cfd); - assert(itr != column_families_set_.end()); - column_families_set_.erase(itr); - } -#endif // NDEBUG - return cfd; -} - -bool FlushScheduler::Empty() { return column_families_.empty(); } - -void FlushScheduler::Clear() { - for (auto cfd : column_families_) { -#ifndef NDEBUG - auto itr = column_families_set_.find(cfd); - assert(itr != column_families_set_.end()); - column_families_set_.erase(itr); -#endif // NDEBUG - if (cfd->Unref()) { - delete cfd; - } - } - column_families_.clear(); -} - -} // namespace rocksdb diff --git a/db/flush_scheduler.h b/db/flush_scheduler.h deleted file mode 100644 index 201e4a13c7..0000000000 --- a/db/flush_scheduler.h +++ /dev/null @@ -1,39 +0,0 @@ -// 
Copyright (c) 2013, Facebook, Inc. All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. - -#pragma once - -#include -#include -#include -#include - -namespace rocksdb { - -class ColumnFamilyData; - -// This class is thread-compatible. It's should only be accessed from single -// write thread (between BeginWrite() and EndWrite()) -class FlushScheduler { - public: - FlushScheduler() = default; - ~FlushScheduler() = default; - - void ScheduleFlush(ColumnFamilyData* cfd); - // Returns Ref()-ed column family. Client needs to Unref() - ColumnFamilyData* GetNextColumnFamily(); - - bool Empty(); - - void Clear(); - - private: - std::deque column_families_; -#ifndef NDEBUG - std::set column_families_set_; -#endif // NDEBUG -}; - -} // namespace rocksdb diff --git a/db/forward_iterator.cc b/db/forward_iterator.cc index 684045e056..74e6dd2492 100644 --- a/db/forward_iterator.cc +++ b/db/forward_iterator.cc @@ -125,19 +125,16 @@ ForwardIterator::ForwardIterator(DBImpl* db, const ReadOptions& read_options, mutable_iter_(nullptr), current_(nullptr), valid_(false), - is_prev_set_(false), - is_prev_inclusive_(false) {} + is_prev_set_(false) {} ForwardIterator::~ForwardIterator() { Cleanup(); } void ForwardIterator::Cleanup() { - if (mutable_iter_ != nullptr) { - mutable_iter_->~Iterator(); - } + delete mutable_iter_; for (auto* m : imm_iters_) { - m->~Iterator(); + delete m; } imm_iters_.clear(); for (auto* f : l0_iters_) { @@ -315,12 +312,11 @@ void ForwardIterator::SeekInternal(const Slice& internal_key, } } - if (seek_to_first) { + if (seek_to_first || immutable_min_heap_.empty()) { is_prev_set_ = false; } else { prev_key_.SetKey(internal_key); is_prev_set_ = true; - is_prev_inclusive_ = true; } } else if (current_ && current_ != mutable_iter_) { // current_ is one of immutable 
iterators, push it back to the heap @@ -345,20 +341,8 @@ void ForwardIterator::Next() { } } else if (current_ != mutable_iter_) { // It is going to advance immutable iterator - - bool update_prev_key = true; - if (is_prev_set_ && prefix_extractor_) { - // advance prev_key_ to current_ only if they share the same prefix - update_prev_key = - prefix_extractor_->Transform(prev_key_.GetKey()).compare( - prefix_extractor_->Transform(current_->key())) == 0; - } - - if (update_prev_key) { - prev_key_.SetKey(current_->key()); - is_prev_set_ = true; - is_prev_inclusive_ = false; - } + prev_key_.SetKey(current_->key()); + is_prev_set_ = true; } current_->Next(); @@ -417,8 +401,8 @@ void ForwardIterator::RebuildIterators() { Cleanup(); // New sv_ = cfd_->GetReferencedSuperVersion(&(db_->mutex_)); - mutable_iter_ = sv_->mem->NewIterator(read_options_, &arena_); - sv_->imm->AddIterators(read_options_, &imm_iters_, &arena_); + mutable_iter_ = sv_->mem->NewIterator(read_options_); + sv_->imm->AddIterators(read_options_, &imm_iters_); const auto& l0_files = sv_->current->files_[0]; l0_iters_.reserve(l0_files.size()); for (const auto* l0 : l0_files) { @@ -490,14 +474,7 @@ void ForwardIterator::UpdateCurrent() { } bool ForwardIterator::NeedToSeekImmutable(const Slice& target) { - // We maintain the interval (prev_key_, immutable_min_heap_.top()->key()) - // such that there are no records with keys within that range in - // immutable_min_heap_. Since immutable structures (SST files and immutable - // memtables) can't change in this version, we don't need to do a seek if - // 'target' belongs to that interval (immutable_min_heap_.top() is already - // at the correct position). 
- - if (!valid_ || !current_ || !is_prev_set_) { + if (!valid_ || !is_prev_set_) { return true; } Slice prev_key = prev_key_.GetKey(); @@ -506,17 +483,13 @@ bool ForwardIterator::NeedToSeekImmutable(const Slice& target) { return true; } if (cfd_->internal_comparator().InternalKeyComparator::Compare( - prev_key, target) >= (is_prev_inclusive_ ? 1 : 0)) { + prev_key, target) >= 0) { return true; } - - if (immutable_min_heap_.empty() && current_ == mutable_iter_) { - // Nothing to seek on. - return false; - } - if (cfd_->internal_comparator().InternalKeyComparator::Compare( - target, current_ == mutable_iter_ ? immutable_min_heap_.top()->key() - : current_->key()) > 0) { + if (immutable_min_heap_.empty() || + cfd_->internal_comparator().InternalKeyComparator::Compare( + target, current_ == mutable_iter_ ? immutable_min_heap_.top()->key() + : current_->key()) > 0) { return true; } return false; diff --git a/db/forward_iterator.h b/db/forward_iterator.h index 4d3761ee16..bbf423a507 100644 --- a/db/forward_iterator.h +++ b/db/forward_iterator.h @@ -14,7 +14,6 @@ #include "rocksdb/iterator.h" #include "rocksdb/options.h" #include "db/dbformat.h" -#include "util/arena.h" namespace rocksdb { @@ -101,8 +100,6 @@ class ForwardIterator : public Iterator { IterKey prev_key_; bool is_prev_set_; - bool is_prev_inclusive_; - Arena arena_; }; } // namespace rocksdb diff --git a/db/internal_stats.cc b/db/internal_stats.cc index c9f9306e29..3142d13b30 100644 --- a/db/internal_stats.cc +++ b/db/internal_stats.cc @@ -7,11 +7,7 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. 
#include "db/internal_stats.h" - -#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS -#endif - #include #include #include "db/column_family.h" diff --git a/db/log_and_apply_bench.cc b/db/log_and_apply_bench.cc index eba0a2787e..a5aa950173 100644 --- a/db/log_and_apply_bench.cc +++ b/db/log_and_apply_bench.cc @@ -9,7 +9,6 @@ #include "util/testharness.h" #include "util/benchharness.h" #include "db/version_set.h" -#include "db/write_controller.h" #include "util/mutexlock.h" namespace rocksdb { @@ -22,7 +21,6 @@ std::string MakeKey(unsigned int num) { void BM_LogAndApply(int iters, int num_base_files) { VersionSet* vset; - WriteController wc; ColumnFamilyData* default_cfd; uint64_t fnum = 1; port::Mutex mu; @@ -49,7 +47,7 @@ void BM_LogAndApply(int iters, int num_base_files) { options.db_paths.emplace_back(dbname, 0); // The parameter of table cache is passed in as null, so any file I/O // operation is likely to fail. - vset = new VersionSet(dbname, &options, sopt, nullptr, &wc); + vset = new VersionSet(dbname, &options, sopt, nullptr); std::vector dummy; dummy.push_back(ColumnFamilyDescriptor()); ASSERT_OK(vset->Recover(dummy)); @@ -60,8 +58,7 @@ void BM_LogAndApply(int iters, int num_base_files) { InternalKey limit(MakeKey(2 * fnum + 1), 1, kTypeDeletion); vbase.AddFile(2, ++fnum, 0, 1 /* file size */, start, limit, 1, 1); } - ASSERT_OK(vset->LogAndApply(default_cfd, - *default_cfd->GetLatestMutableCFOptions(), &vbase, &mu)); + ASSERT_OK(vset->LogAndApply(default_cfd, &vbase, &mu)); } for (int i = 0; i < iters; i++) { @@ -70,10 +67,8 @@ void BM_LogAndApply(int iters, int num_base_files) { InternalKey start(MakeKey(2 * fnum), 1, kTypeValue); InternalKey limit(MakeKey(2 * fnum + 1), 1, kTypeDeletion); vedit.AddFile(2, ++fnum, 0, 1 /* file size */, start, limit, 1, 1); - vset->LogAndApply(default_cfd, *default_cfd->GetLatestMutableCFOptions(), - &vedit, &mu); + vset->LogAndApply(default_cfd, &vedit, &mu); } - delete vset; } BENCHMARK_NAMED_PARAM(BM_LogAndApply, 
1000_iters_1_file, 1000, 1) diff --git a/db/memtable.cc b/db/memtable.cc index b9b99a6840..e9e7051c75 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -31,57 +31,41 @@ namespace rocksdb { -MemTableOptions::MemTableOptions( - const MutableCFOptions& mutable_cf_options, const Options& options) - : write_buffer_size(mutable_cf_options.write_buffer_size), - arena_block_size(mutable_cf_options.arena_block_size), - memtable_prefix_bloom_bits(mutable_cf_options.memtable_prefix_bloom_bits), - memtable_prefix_bloom_probes( - mutable_cf_options.memtable_prefix_bloom_probes), - memtable_prefix_bloom_huge_page_tlb_size( - mutable_cf_options.memtable_prefix_bloom_huge_page_tlb_size), - inplace_update_support(options.inplace_update_support), - inplace_update_num_locks(options.inplace_update_num_locks), - inplace_callback(options.inplace_callback), - max_successive_merges(mutable_cf_options.max_successive_merges), - filter_deletes(mutable_cf_options.filter_deletes) {} - -MemTable::MemTable(const InternalKeyComparator& cmp, - const ImmutableCFOptions& ioptions, - const MemTableOptions& moptions) +MemTable::MemTable(const InternalKeyComparator& cmp, const Options& options) : comparator_(cmp), - ioptions_(ioptions), - moptions_(moptions), refs_(0), - kArenaBlockSize(OptimizeBlockSize(moptions.arena_block_size)), - arena_(moptions.arena_block_size), - table_(ioptions.memtable_factory->CreateMemTableRep( - comparator_, &arena_, ioptions.prefix_extractor, ioptions.info_log)), + kArenaBlockSize(OptimizeBlockSize(options.arena_block_size)), + kWriteBufferSize(options.write_buffer_size), + arena_(options.arena_block_size), + table_(options.memtable_factory->CreateMemTableRep( + comparator_, &arena_, options.prefix_extractor.get(), + options.info_log.get())), num_entries_(0), flush_in_progress_(false), flush_completed_(false), file_number_(0), first_seqno_(0), mem_next_logfile_number_(0), - locks_(moptions.inplace_update_support ? 
moptions.inplace_update_num_locks - : 0), - prefix_extractor_(ioptions.prefix_extractor), - should_flush_(ShouldFlushNow()), - flush_scheduled_(false) { + locks_(options.inplace_update_support ? options.inplace_update_num_locks + : 0), + prefix_extractor_(options.prefix_extractor.get()), + should_flush_(ShouldFlushNow()) { // if should_flush_ == true without an entry inserted, something must have // gone wrong already. assert(!should_flush_); - if (prefix_extractor_ && moptions.memtable_prefix_bloom_bits > 0) { + if (prefix_extractor_ && options.memtable_prefix_bloom_bits > 0) { prefix_bloom_.reset(new DynamicBloom( &arena_, - moptions.memtable_prefix_bloom_bits, ioptions.bloom_locality, - moptions.memtable_prefix_bloom_probes, nullptr, - moptions.memtable_prefix_bloom_huge_page_tlb_size, - ioptions.info_log)); + options.memtable_prefix_bloom_bits, options.bloom_locality, + options.memtable_prefix_bloom_probes, nullptr, + options.memtable_prefix_bloom_huge_page_tlb_size, + options.info_log.get())); } } -MemTable::~MemTable() { assert(refs_ == 0); } +MemTable::~MemTable() { + assert(refs_ == 0); +} size_t MemTable::ApproximateMemoryUsage() { size_t arena_usage = arena_.ApproximateMemoryUsage(); @@ -113,16 +97,14 @@ bool MemTable::ShouldFlushNow() const { // if we can still allocate one more block without exceeding the // over-allocation ratio, then we should not flush. if (allocated_memory + kArenaBlockSize < - moptions_.write_buffer_size + - kArenaBlockSize * kAllowOverAllocationRatio) { + kWriteBufferSize + kArenaBlockSize * kAllowOverAllocationRatio) { return false; } - // if user keeps adding entries that exceeds moptions.write_buffer_size, - // we need to flush earlier even though we still have much available - // memory left. 
- if (allocated_memory > moptions_.write_buffer_size + - kArenaBlockSize * kAllowOverAllocationRatio) { + // if user keeps adding entries that exceeds kWriteBufferSize, we need to + // flush earlier even though we still have much available memory left. + if (allocated_memory > + kWriteBufferSize + kArenaBlockSize * kAllowOverAllocationRatio) { return true; } @@ -193,12 +175,12 @@ const char* EncodeKey(std::string* scratch, const Slice& target) { class MemTableIterator: public Iterator { public: MemTableIterator( - const MemTable& mem, const ReadOptions& read_options, Arena* arena) + const MemTable& mem, const ReadOptions& options, Arena* arena) : bloom_(nullptr), prefix_extractor_(mem.prefix_extractor_), valid_(false), arena_mode_(arena != nullptr) { - if (prefix_extractor_ != nullptr && !read_options.total_order_seek) { + if (prefix_extractor_ != nullptr && !options.total_order_seek) { bloom_ = mem.prefix_bloom_.get(); iter_ = mem.table_->GetDynamicPrefixIterator(arena); } else { @@ -266,10 +248,14 @@ class MemTableIterator: public Iterator { void operator=(const MemTableIterator&); }; -Iterator* MemTable::NewIterator(const ReadOptions& read_options, Arena* arena) { - assert(arena != nullptr); - auto mem = arena->AllocateAligned(sizeof(MemTableIterator)); - return new (mem) MemTableIterator(*this, read_options, arena); +Iterator* MemTable::NewIterator(const ReadOptions& options, Arena* arena) { + if (arena == nullptr) { + return new MemTableIterator(*this, options, nullptr); + } else { + auto mem = arena->AllocateAligned(sizeof(MemTableIterator)); + return new (mem) + MemTableIterator(*this, options, arena); + } } port::RWMutex* MemTable::GetLock(const Slice& key) { @@ -413,6 +399,7 @@ static bool SaveValue(void* arg, const char* entry) { *(s->found_final_value) = true; return false; } + std::string merge_result; // temporary area for merge results later Slice v = GetLengthPrefixedSlice(key_ptr + key_length); *(s->merge_in_progress) = true; 
merge_context->PushOperand(v); @@ -429,9 +416,9 @@ static bool SaveValue(void* arg, const char* entry) { } bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, - MergeContext* merge_context) { + MergeContext& merge_context, const Options& options) { // The sequence number is updated synchronously in version_set.h - if (IsEmpty()) { + if (first_seqno_ == 0) { // Avoiding recording stats for speed. return false; } @@ -453,11 +440,11 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, saver.value = value; saver.status = s; saver.mem = this; - saver.merge_context = merge_context; - saver.merge_operator = ioptions_.merge_operator; - saver.logger = ioptions_.info_log; - saver.inplace_update_support = moptions_.inplace_update_support; - saver.statistics = ioptions_.statistics; + saver.merge_context = &merge_context; + saver.merge_operator = options.merge_operator.get(); + saver.logger = options.info_log.get(); + saver.inplace_update_support = options.inplace_update_support; + saver.statistics = options.statistics.get(); table_->Get(key, &saver, SaveValue); } @@ -529,7 +516,8 @@ void MemTable::Update(SequenceNumber seq, bool MemTable::UpdateCallback(SequenceNumber seq, const Slice& key, - const Slice& delta) { + const Slice& delta, + const Options& options) { LookupKey lkey(key, seq); Slice memkey = lkey.memtable_key(); @@ -564,8 +552,8 @@ bool MemTable::UpdateCallback(SequenceNumber seq, std::string str_value; WriteLock wl(GetLock(lkey.user_key())); - auto status = moptions_.inplace_callback(prev_buffer, &new_prev_size, - delta, &str_value); + auto status = options.inplace_callback(prev_buffer, &new_prev_size, + delta, &str_value); if (status == UpdateStatus::UPDATED_INPLACE) { // Value already updated by callback. 
assert(new_prev_size <= prev_size); @@ -578,12 +566,12 @@ bool MemTable::UpdateCallback(SequenceNumber seq, memcpy(p, prev_buffer, new_prev_size); } } - RecordTick(ioptions_.statistics, NUMBER_KEYS_UPDATED); + RecordTick(options.statistics.get(), NUMBER_KEYS_UPDATED); should_flush_ = ShouldFlushNow(); return true; } else if (status == UpdateStatus::UPDATED) { Add(seq, kTypeValue, key, Slice(str_value)); - RecordTick(ioptions_.statistics, NUMBER_KEYS_WRITTEN); + RecordTick(options.statistics.get(), NUMBER_KEYS_WRITTEN); should_flush_ = ShouldFlushNow(); return true; } else if (status == UpdateStatus::UPDATE_FAILED) { diff --git a/db/memtable.h b/db/memtable.h index ce6cce7f6e..8bc281c6cf 100644 --- a/db/memtable.h +++ b/db/memtable.h @@ -10,18 +10,14 @@ #pragma once #include #include -#include #include -#include #include "db/dbformat.h" #include "db/skiplist.h" #include "db/version_edit.h" #include "rocksdb/db.h" #include "rocksdb/memtablerep.h" -#include "rocksdb/immutable_options.h" #include "util/arena.h" #include "util/dynamic_bloom.h" -#include "util/mutable_cf_options.h" namespace rocksdb { @@ -30,25 +26,6 @@ class Mutex; class MemTableIterator; class MergeContext; -struct MemTableOptions { - explicit MemTableOptions( - const MutableCFOptions& mutable_cf_options, - const Options& options); - size_t write_buffer_size; - size_t arena_block_size; - uint32_t memtable_prefix_bloom_bits; - uint32_t memtable_prefix_bloom_probes; - size_t memtable_prefix_bloom_huge_page_tlb_size; - bool inplace_update_support; - size_t inplace_update_num_locks; - UpdateStatus (*inplace_callback)(char* existing_value, - uint32_t* existing_value_size, - Slice delta_value, - std::string* merged_value); - size_t max_successive_merges; - bool filter_deletes; -}; - class MemTable { public: struct KeyComparator : public MemTableRep::KeyComparator { @@ -63,8 +40,7 @@ class MemTable { // MemTables are reference counted. 
The initial reference count // is zero and the caller must call Ref() at least once. explicit MemTable(const InternalKeyComparator& comparator, - const ImmutableCFOptions& ioptions, - const MemTableOptions& moptions); + const Options& options); ~MemTable(); @@ -91,11 +67,7 @@ class MemTable { // This method heuristically determines if the memtable should continue to // host more data. - bool ShouldScheduleFlush() const { - return flush_scheduled_ == false && should_flush_; - } - - void MarkFlushScheduled() { flush_scheduled_ = true; } + bool ShouldFlush() const { return should_flush_; } // Return an iterator that yields the contents of the memtable. // @@ -109,7 +81,8 @@ class MemTable { // arena: If not null, the arena needs to be used to allocate the Iterator. // Calling ~Iterator of the iterator will destroy all the states but // those allocated in arena. - Iterator* NewIterator(const ReadOptions& read_options, Arena* arena); + Iterator* NewIterator(const ReadOptions& options, + Arena* arena = nullptr); // Add an entry into memtable that maps key to value at the // specified sequence number and with the specified type. @@ -127,7 +100,7 @@ class MemTable { // store MergeInProgress in s, and return false. // Else, return false. 
bool Get(const LookupKey& key, std::string* value, Status* s, - MergeContext* merge_context); + MergeContext& merge_context, const Options& options); // Attempts to update the new_value inplace, else does normal Add // Pseudocode @@ -151,7 +124,8 @@ class MemTable { // else return false bool UpdateCallback(SequenceNumber seq, const Slice& key, - const Slice& delta); + const Slice& delta, + const Options& options); // Returns the number of successive merge entries starting from the newest // entry for the key up to the last non-merge entry or last entry for the @@ -164,9 +138,6 @@ class MemTable { // Returns the edits area that is needed for flushing the memtable VersionEdit* GetEdits() { return &edit_; } - // Returns if there is no entry inserted to the mem table. - bool IsEmpty() const { return first_seqno_ == 0; } - // Returns the sequence number of the first element that was inserted // into the memtable SequenceNumber GetFirstSequenceNumber() { return first_seqno_; } @@ -199,11 +170,8 @@ class MemTable { const Arena& TEST_GetArena() const { return arena_; } - const ImmutableCFOptions* GetImmutableOptions() const { return &ioptions_; } - const MemTableOptions* GetMemTableOptions() const { return &moptions_; } - private: - // Dynamically check if we can add more incoming entries + // Dynamically check if we can add more incoming entries. 
bool ShouldFlushNow() const; friend class MemTableIterator; @@ -211,10 +179,9 @@ class MemTable { friend class MemTableList; KeyComparator comparator_; - const ImmutableCFOptions& ioptions_; - const MemTableOptions moptions_; int refs_; const size_t kArenaBlockSize; + const size_t kWriteBufferSize; Arena arena_; unique_ptr table_; @@ -247,9 +214,6 @@ class MemTable { // a flag indicating if a memtable has met the criteria to flush bool should_flush_; - - // a flag indicating if flush has been scheduled - bool flush_scheduled_; }; extern const char* EncodeKey(std::string* scratch, const Slice& target); diff --git a/db/memtable_list.cc b/db/memtable_list.cc index bd48f1f475..d3fc1356b2 100644 --- a/db/memtable_list.cc +++ b/db/memtable_list.cc @@ -62,9 +62,10 @@ int MemTableList::size() const { // Return the most recent value found, if any. // Operands stores the list of merge operations to apply, so far. bool MemTableListVersion::Get(const LookupKey& key, std::string* value, - Status* s, MergeContext* merge_context) { + Status* s, MergeContext& merge_context, + const Options& options) { for (auto& memtable : memlist_) { - if (memtable->Get(key, value, s, merge_context)) { + if (memtable->Get(key, value, s, merge_context, options)) { return true; } } @@ -72,10 +73,9 @@ bool MemTableListVersion::Get(const LookupKey& key, std::string* value, } void MemTableListVersion::AddIterators(const ReadOptions& options, - std::vector* iterator_list, - Arena* arena) { + std::vector* iterator_list) { for (auto& m : memlist_) { - iterator_list->push_back(m->NewIterator(options, arena)); + iterator_list->push_back(m->NewIterator(options)); } } @@ -160,8 +160,7 @@ void MemTableList::RollbackMemtableFlush(const autovector& mems, // Record a successful flush in the manifest file Status MemTableList::InstallMemtableFlushResults( - ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options, - const autovector& mems, VersionSet* vset, + ColumnFamilyData* cfd, const autovector& mems, 
VersionSet* vset, port::Mutex* mu, Logger* info_log, uint64_t file_number, FileNumToPathIdMap* pending_outputs, autovector* to_delete, Directory* db_directory, LogBuffer* log_buffer) { @@ -198,7 +197,7 @@ Status MemTableList::InstallMemtableFlushResults( cfd->GetName().c_str(), (unsigned long)m->file_number_); // this can release and reacquire the mutex. - s = vset->LogAndApply(cfd, mutable_cf_options, &m->edit_, mu, db_directory); + s = vset->LogAndApply(cfd, &m->edit_, mu, db_directory); // we will be changing the version in the next code path, // so we better create a new one, since versions are immutable diff --git a/db/memtable_list.h b/db/memtable_list.h index d93c7df922..f4923e831a 100644 --- a/db/memtable_list.h +++ b/db/memtable_list.h @@ -46,10 +46,10 @@ class MemTableListVersion { // Search all the memtables starting from the most recent one. // Return the most recent value found, if any. bool Get(const LookupKey& key, std::string* value, Status* s, - MergeContext* merge_context); + MergeContext& merge_context, const Options& options); void AddIterators(const ReadOptions& options, - std::vector* iterator_list, Arena* arena); + std::vector* iterator_list); void AddIterators(const ReadOptions& options, MergeIteratorBuilder* merge_iter_builder); @@ -113,8 +113,7 @@ class MemTableList { // Commit a successful flush in the manifest file Status InstallMemtableFlushResults( - ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options, - const autovector& m, VersionSet* vset, + ColumnFamilyData* cfd, const autovector& m, VersionSet* vset, port::Mutex* mu, Logger* info_log, uint64_t file_number, FileNumToPathIdMap* pending_outputs, autovector* to_delete, Directory* db_directory, LogBuffer* log_buffer); diff --git a/db/plain_table_db_test.cc b/db/plain_table_db_test.cc index 1750d265c2..bb0f96f158 100644 --- a/db/plain_table_db_test.cc +++ b/db/plain_table_db_test.cc @@ -192,17 +192,16 @@ extern const uint64_t kPlainTableMagicNumber; class 
TestPlainTableReader : public PlainTableReader { public: - TestPlainTableReader(const EnvOptions& env_options, + TestPlainTableReader(const EnvOptions& storage_options, const InternalKeyComparator& icomparator, EncodingType encoding_type, uint64_t file_size, int bloom_bits_per_key, double hash_table_ratio, size_t index_sparseness, const TableProperties* table_properties, unique_ptr&& file, - const ImmutableCFOptions& ioptions, - bool* expect_bloom_not_match, + const Options& options, bool* expect_bloom_not_match, bool store_index_in_file) - : PlainTableReader(ioptions, std::move(file), env_options, icomparator, + : PlainTableReader(options, std::move(file), storage_options, icomparator, encoding_type, file_size, table_properties), expect_bloom_not_match_(expect_bloom_not_match) { Status s = MmapDataFile(); @@ -219,7 +218,7 @@ class TestPlainTableReader : public PlainTableReader { PlainTablePropertyNames::kBloomVersion); ASSERT_TRUE(bloom_version_ptr != props->user_collected_properties.end()); ASSERT_EQ(bloom_version_ptr->second, std::string("1")); - if (ioptions.bloom_locality > 0) { + if (options.bloom_locality > 0) { auto num_blocks_ptr = props->user_collected_properties.find( PlainTablePropertyNames::kNumBloomBlocks); ASSERT_TRUE(num_blocks_ptr != props->user_collected_properties.end()); @@ -254,26 +253,25 @@ class TestPlainTableFactory : public PlainTableFactory { store_index_in_file_(options.store_index_in_file), expect_bloom_not_match_(expect_bloom_not_match) {} - Status NewTableReader(const ImmutableCFOptions& ioptions, - const EnvOptions& env_options, + Status NewTableReader(const Options& options, const EnvOptions& soptions, const InternalKeyComparator& internal_comparator, unique_ptr&& file, uint64_t file_size, unique_ptr* table) const override { TableProperties* props = nullptr; auto s = ReadTableProperties(file.get(), file_size, kPlainTableMagicNumber, - ioptions.env, ioptions.info_log, &props); + options.env, options.info_log.get(), &props); 
ASSERT_TRUE(s.ok()); if (store_index_in_file_) { BlockHandle bloom_block_handle; s = FindMetaBlock(file.get(), file_size, kPlainTableMagicNumber, - ioptions.env, BloomBlockBuilder::kBloomBlock, + options.env, BloomBlockBuilder::kBloomBlock, &bloom_block_handle); ASSERT_TRUE(s.ok()); BlockHandle index_block_handle; s = FindMetaBlock( - file.get(), file_size, kPlainTableMagicNumber, ioptions.env, + file.get(), file_size, kPlainTableMagicNumber, options.env, PlainTableIndexBuilder::kPlainTableIndexBlock, &index_block_handle); ASSERT_TRUE(s.ok()); } @@ -286,9 +284,9 @@ class TestPlainTableFactory : public PlainTableFactory { DecodeFixed32(encoding_type_prop->second.c_str())); std::unique_ptr new_reader(new TestPlainTableReader( - env_options, internal_comparator, encoding_type, file_size, + soptions, internal_comparator, encoding_type, file_size, bloom_bits_per_key_, hash_table_ratio_, index_sparseness_, props, - std::move(file), ioptions, expect_bloom_not_match_, + std::move(file), options, expect_bloom_not_match_, store_index_in_file_)); *table = std::move(new_reader); diff --git a/db/repair.cc b/db/repair.cc index 80fb92bd9d..820cc19243 100644 --- a/db/repair.cc +++ b/db/repair.cc @@ -31,10 +31,7 @@ #ifndef ROCKSDB_LITE -#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS -#endif - #include #include "db/builder.h" #include "db/db_impl.h" @@ -49,9 +46,6 @@ #include "rocksdb/comparator.h" #include "rocksdb/db.h" #include "rocksdb/env.h" -#include "rocksdb/options.h" -#include "rocksdb/immutable_options.h" -#include "util/scoped_arena_iterator.h" namespace rocksdb { @@ -64,7 +58,6 @@ class Repairer { env_(options.env), icmp_(options.comparator), options_(SanitizeOptions(dbname, &icmp_, options)), - ioptions_(options_), raw_table_cache_( // TableCache can be small since we expect each table to be opened // once. 
@@ -72,7 +65,7 @@ class Repairer { options_.table_cache_remove_scan_count_limit)), next_file_number_(1) { table_cache_ = - new TableCache(ioptions_, env_options_, raw_table_cache_.get()); + new TableCache(&options_, storage_options_, raw_table_cache_.get()); edit_ = new VersionEdit(); } @@ -114,9 +107,8 @@ class Repairer { std::string const dbname_; Env* const env_; - const InternalKeyComparator icmp_; - const Options options_; - const ImmutableCFOptions ioptions_; + InternalKeyComparator const icmp_; + Options const options_; std::shared_ptr raw_table_cache_; TableCache* table_cache_; VersionEdit* edit_; @@ -126,7 +118,7 @@ class Repairer { std::vector logs_; std::vector tables_; uint64_t next_file_number_; - const EnvOptions env_options_; + const EnvOptions storage_options_; Status FindFiles() { std::vector filenames; @@ -198,7 +190,7 @@ class Repairer { // Open the log file std::string logname = LogFileName(dbname_, log); unique_ptr lfile; - Status status = env_->NewSequentialFile(logname, &lfile, env_options_); + Status status = env_->NewSequentialFile(logname, &lfile, storage_options_); if (!status.ok()) { return status; } @@ -219,8 +211,7 @@ class Repairer { std::string scratch; Slice record; WriteBatch batch; - MemTable* mem = new MemTable(icmp_, ioptions_, - MemTableOptions(MutableCFOptions(options_, ioptions_), options_)); + MemTable* mem = new MemTable(icmp_, options_); auto cf_mems_default = new ColumnFamilyMemTablesDefault(mem, &options_); mem->Ref(); int counter = 0; @@ -245,15 +236,12 @@ class Repairer { // since ExtractMetaData() will also generate edits. 
FileMetaData meta; meta.fd = FileDescriptor(next_file_number_++, 0, 0); - { - ReadOptions ro; - ro.total_order_seek = true; - Arena arena; - ScopedArenaIterator iter(mem->NewIterator(ro, &arena)); - status = BuildTable(dbname_, env_, ioptions_, env_options_, table_cache_, - iter.get(), &meta, icmp_, 0, 0, kNoCompression, - CompressionOptions()); - } + ReadOptions ro; + ro.total_order_seek = true; + Iterator* iter = mem->NewIterator(ro); + status = BuildTable(dbname_, env_, options_, storage_options_, table_cache_, + iter, &meta, icmp_, 0, 0, kNoCompression); + delete iter; delete mem->Unref(); delete cf_mems_default; mem = nullptr; @@ -298,7 +286,7 @@ class Repairer { file_size); if (status.ok()) { Iterator* iter = table_cache_->NewIterator( - ReadOptions(), env_options_, icmp_, t->meta.fd); + ReadOptions(), storage_options_, icmp_, t->meta.fd); bool empty = true; ParsedInternalKey parsed; t->min_sequence = 0; @@ -338,7 +326,7 @@ class Repairer { std::string tmp = TempFileName(dbname_, 1); unique_ptr file; Status status = env_->NewWritableFile( - tmp, &file, env_->OptimizeForManifestWrite(env_options_)); + tmp, &file, env_->OptimizeForManifestWrite(storage_options_)); if (!status.ok()) { return status; } diff --git a/db/simple_table_db_test.cc b/db/simple_table_db_test.cc new file mode 100644 index 0000000000..e88485070e --- /dev/null +++ b/db/simple_table_db_test.cc @@ -0,0 +1,810 @@ +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#include +#include + +#include "rocksdb/db.h" +#include "rocksdb/filter_policy.h" +#include "db/db_impl.h" +#include "db/filename.h" +#include "db/version_set.h" +#include "db/write_batch_internal.h" +#include "rocksdb/statistics.h" +#include "rocksdb/cache.h" +#include "rocksdb/compaction_filter.h" +#include "rocksdb/env.h" +#include "rocksdb/table.h" +#include "rocksdb/table_properties.h" +#include "table/table_builder.h" +#include "util/hash.h" +#include "util/logging.h" +#include "util/mutexlock.h" +#include "util/testharness.h" +#include "util/testutil.h" +#include "utilities/merge_operators.h" + +using std::unique_ptr; + +// IS THIS FILE STILL NEEDED? +namespace rocksdb { + +// SimpleTable is a simple table format for UNIT TEST ONLY. It is not built +// as production quality. +// SimpleTable requires the input key size to be fixed 16 bytes, value cannot +// be longer than 150000 bytes and stored data on disk in this format: +// +--------------------------------------------+ <= key1 offset +// | key1 | value_size (4 bytes) | | +// +----------------------------------------+ | +// | value1 | +// | | +// +----------------------------------------+---+ <= key2 offset +// | key2 | value_size (4 bytes) | | +// +----------------------------------------+ | +// | value2 | +// | | +// | ...... | +// +-----------------+--------------------------+ <= index_block_offset +// | key1 | key1 offset (8 bytes) | +// +-----------------+--------------------------+ +// | key2 | key2 offset (8 bytes) | +// +-----------------+--------------------------+ +// | key3 | key3 offset (8 bytes) | +// +-----------------+--------------------------+ +// | ...... 
| +// +-----------------+------------+-------------+ +// | index_block_offset (8 bytes) | +// +------------------------------+ + +// SimpleTable is a simple table format for UNIT TEST ONLY. It is not built +// as production quality. +class SimpleTableReader: public TableReader { +public: + // Attempt to open the table that is stored in bytes [0..file_size) + // of "file", and read the metadata entries necessary to allow + // retrieving data from the table. + // + // If successful, returns ok and sets "*table" to the newly opened + // table. The client should delete "*table" when no longer needed. + // If there was an error while initializing the table, sets "*table" + // to nullptr and returns a non-ok status. Does not take ownership of + // "*source", but the client must ensure that "source" remains live + // for the duration of the returned table's lifetime. + // + // *file must remain live while this Table is in use. + static Status Open(const Options& options, const EnvOptions& soptions, + unique_ptr && file, uint64_t file_size, + unique_ptr* table_reader); + + Iterator* NewIterator(const ReadOptions&, Arena* arena) override; + + Status Get(const ReadOptions&, const Slice& key, void* arg, + bool (*handle_result)(void* arg, const ParsedInternalKey& k, + const Slice& v), + void (*mark_key_may_exist)(void*) = nullptr) override; + + uint64_t ApproximateOffsetOf(const Slice& key) override; + + virtual size_t ApproximateMemoryUsage() const override { return 0; } + + void SetupForCompaction() override; + + std::shared_ptr GetTableProperties() const override; + + ~SimpleTableReader(); + +private: + struct Rep; + Rep* rep_; + + explicit SimpleTableReader(Rep* rep) { + rep_ = rep; + } + friend class TableCache; + friend class SimpleTableIterator; + + Status GetOffset(const Slice& target, uint64_t* offset); + + // No copying allowed + explicit SimpleTableReader(const TableReader&) = delete; + void operator=(const TableReader&) = delete; +}; + +// Iterator to iterate 
SimpleTable +class SimpleTableIterator: public Iterator { +public: + explicit SimpleTableIterator(SimpleTableReader* table); + ~SimpleTableIterator(); + + bool Valid() const; + + void SeekToFirst(); + + void SeekToLast(); + + void Seek(const Slice& target); + + void Next(); + + void Prev(); + + Slice key() const; + + Slice value() const; + + Status status() const; + +private: + SimpleTableReader* table_; + uint64_t offset_; + uint64_t next_offset_; + Slice key_; + Slice value_; + char tmp_str_[4]; + char* key_str_; + char* value_str_; + int value_str_len_; + Status status_; + // No copying allowed + SimpleTableIterator(const SimpleTableIterator&) = delete; + void operator=(const Iterator&) = delete; +}; + +struct SimpleTableReader::Rep { + ~Rep() { + } + Rep(const EnvOptions& storage_options, uint64_t index_start_offset, + int num_entries) : + soptions(storage_options), index_start_offset(index_start_offset), + num_entries(num_entries) { + } + + Options options; + const EnvOptions& soptions; + Status status; + unique_ptr file; + uint64_t index_start_offset; + int num_entries; + std::shared_ptr table_properties; + + const static int user_key_size = 16; + const static int offset_length = 8; + const static int key_footer_len = 8; + + static int GetInternalKeyLength() { + return user_key_size + key_footer_len; + } +}; + +SimpleTableReader::~SimpleTableReader() { + delete rep_; +} + +Status SimpleTableReader::Open(const Options& options, + const EnvOptions& soptions, + unique_ptr && file, + uint64_t size, + unique_ptr* table_reader) { + char footer_space[Rep::offset_length]; + Slice footer_input; + Status s = file->Read(size - Rep::offset_length, Rep::offset_length, + &footer_input, footer_space); + if (s.ok()) { + uint64_t index_start_offset = DecodeFixed64(footer_space); + + int num_entries = (size - Rep::offset_length - index_start_offset) + / (Rep::GetInternalKeyLength() + Rep::offset_length); + SimpleTableReader::Rep* rep = new SimpleTableReader::Rep(soptions, + 
index_start_offset, + num_entries); + + rep->file = std::move(file); + rep->options = options; + table_reader->reset(new SimpleTableReader(rep)); + } + return s; +} + +void SimpleTableReader::SetupForCompaction() { +} + +std::shared_ptr SimpleTableReader::GetTableProperties() + const { + return rep_->table_properties; +} + +Iterator* SimpleTableReader::NewIterator(const ReadOptions& options, + Arena* arena) { + if (arena == nullptr) { + return new SimpleTableIterator(this); + } else { + auto mem = arena->AllocateAligned(sizeof(SimpleTableIterator)); + return new (mem) SimpleTableIterator(this); + } +} + +Status SimpleTableReader::GetOffset(const Slice& target, uint64_t* offset) { + uint32_t left = 0; + uint32_t right = rep_->num_entries - 1; + char key_chars[Rep::GetInternalKeyLength()]; + Slice tmp_slice; + + uint32_t target_offset = 0; + while (left <= right) { + uint32_t mid = (left + right + 1) / 2; + + uint64_t offset_to_read = rep_->index_start_offset + + (Rep::GetInternalKeyLength() + Rep::offset_length) * mid; + Status s = rep_->file->Read(offset_to_read, Rep::GetInternalKeyLength(), + &tmp_slice, key_chars); + if (!s.ok()) { + return s; + } + + InternalKeyComparator ikc(rep_->options.comparator); + int compare_result = ikc.Compare(tmp_slice, target); + + if (compare_result < 0) { + if (left == right) { + target_offset = right + 1; + break; + } + left = mid; + } else { + if (left == right) { + target_offset = left; + break; + } + right = mid - 1; + } + } + + if (target_offset >= (uint32_t) rep_->num_entries) { + *offset = rep_->index_start_offset; + return Status::OK(); + } + + char value_offset_chars[Rep::offset_length]; + + int64_t offset_for_value_offset = rep_->index_start_offset + + (Rep::GetInternalKeyLength() + Rep::offset_length) * target_offset + + Rep::GetInternalKeyLength(); + Status s = rep_->file->Read(offset_for_value_offset, Rep::offset_length, + &tmp_slice, value_offset_chars); + if (s.ok()) { + *offset = DecodeFixed64(value_offset_chars); + 
} + return s; +} + +Status SimpleTableReader::Get(const ReadOptions& options, const Slice& k, + void* arg, + bool (*saver)(void*, const ParsedInternalKey&, + const Slice&), + void (*mark_key_may_exist)(void*)) { + Status s; + SimpleTableIterator* iter = new SimpleTableIterator(this); + for (iter->Seek(k); iter->Valid(); iter->Next()) { + ParsedInternalKey parsed_key; + if (!ParseInternalKey(iter->key(), &parsed_key)) { + return Status::Corruption(Slice()); + } + + if (!(*saver)(arg, parsed_key, iter->value())) { + break; + } + } + s = iter->status(); + delete iter; + return s; +} + +uint64_t SimpleTableReader::ApproximateOffsetOf(const Slice& key) { + return 0; +} + +SimpleTableIterator::SimpleTableIterator(SimpleTableReader* table) : + table_(table) { + key_str_ = new char[SimpleTableReader::Rep::GetInternalKeyLength()]; + value_str_len_ = -1; + SeekToFirst(); +} + +SimpleTableIterator::~SimpleTableIterator() { + delete[] key_str_; + if (value_str_len_ >= 0) { + delete[] value_str_; + } +} + +bool SimpleTableIterator::Valid() const { + return offset_ < table_->rep_->index_start_offset; +} + +void SimpleTableIterator::SeekToFirst() { + next_offset_ = 0; + Next(); +} + +void SimpleTableIterator::SeekToLast() { + assert(false); +} + +void SimpleTableIterator::Seek(const Slice& target) { + Status s = table_->GetOffset(target, &next_offset_); + if (!s.ok()) { + status_ = s; + } + Next(); +} + +void SimpleTableIterator::Next() { + offset_ = next_offset_; + if (offset_ >= table_->rep_->index_start_offset) { + return; + } + Slice result; + int internal_key_size = SimpleTableReader::Rep::GetInternalKeyLength(); + + Status s = table_->rep_->file->Read(next_offset_, internal_key_size, &result, + key_str_); + next_offset_ += internal_key_size; + key_ = result; + + Slice value_size_slice; + s = table_->rep_->file->Read(next_offset_, 4, &value_size_slice, tmp_str_); + next_offset_ += 4; + uint32_t value_size = DecodeFixed32(tmp_str_); + + Slice value_slice; + if ((int) 
value_size > value_str_len_) { + if (value_str_len_ >= 0) { + delete[] value_str_; + } + value_str_ = new char[value_size]; + value_str_len_ = value_size; + } + s = table_->rep_->file->Read(next_offset_, value_size, &value_slice, + value_str_); + next_offset_ += value_size; + value_ = value_slice; +} + +void SimpleTableIterator::Prev() { + assert(false); +} + +Slice SimpleTableIterator::key() const { + Log(table_->rep_->options.info_log, "key!!!!"); + return key_; +} + +Slice SimpleTableIterator::value() const { + return value_; +} + +Status SimpleTableIterator::status() const { + return status_; +} + +class SimpleTableBuilder: public TableBuilder { +public: + // Create a builder that will store the contents of the table it is + // building in *file. Does not close the file. It is up to the + // caller to close the file after calling Finish(). The output file + // will be part of level specified by 'level'. A value of -1 means + // that the caller does not know which level the output file will reside. + SimpleTableBuilder(const Options& options, WritableFile* file, + CompressionType compression_type); + + // REQUIRES: Either Finish() or Abandon() has been called. + ~SimpleTableBuilder(); + + // Add key,value to the table being constructed. + // REQUIRES: key is after any previously added key according to comparator. + // REQUIRES: Finish(), Abandon() have not been called + void Add(const Slice& key, const Slice& value) override; + + // Return non-ok iff some error has been detected. + Status status() const override; + + // Finish building the table. Stops using the file passed to the + // constructor after this function returns. + // REQUIRES: Finish(), Abandon() have not been called + Status Finish() override; + + // Indicate that the contents of this builder should be abandoned. Stops + // using the file passed to the constructor after this function returns. 
+ // If the caller is not going to call Finish(), it must call Abandon() + // before destroying this builder. + // REQUIRES: Finish(), Abandon() have not been called + void Abandon() override; + + // Number of calls to Add() so far. + uint64_t NumEntries() const override; + + // Size of the file generated so far. If invoked after a successful + // Finish() call, returns the size of the final generated file. + uint64_t FileSize() const override; + +private: + struct Rep; + Rep* rep_; + + // No copying allowed + SimpleTableBuilder(const SimpleTableBuilder&) = delete; + void operator=(const SimpleTableBuilder&) = delete; +}; + +struct SimpleTableBuilder::Rep { + Options options; + WritableFile* file; + uint64_t offset = 0; + Status status; + + uint64_t num_entries = 0; + + bool closed = false; // Either Finish() or Abandon() has been called. + + const static int user_key_size = 16; + const static int offset_length = 8; + const static int key_footer_len = 8; + + static int GetInternalKeyLength() { + return user_key_size + key_footer_len; + } + + std::string index; + + Rep(const Options& opt, WritableFile* f) : + options(opt), file(f) { + } + ~Rep() { + } +}; + +SimpleTableBuilder::SimpleTableBuilder(const Options& options, + WritableFile* file, + CompressionType compression_type) : + rep_(new SimpleTableBuilder::Rep(options, file)) { +} + +SimpleTableBuilder::~SimpleTableBuilder() { + delete (rep_); +} + +void SimpleTableBuilder::Add(const Slice& key, const Slice& value) { + assert((int ) key.size() == Rep::GetInternalKeyLength()); + + // Update index + rep_->index.append(key.data(), key.size()); + PutFixed64(&(rep_->index), rep_->offset); + + // Write key-value pair + rep_->file->Append(key); + rep_->offset += Rep::GetInternalKeyLength(); + + std::string size; + int value_size = value.size(); + PutFixed32(&size, value_size); + Slice sizeSlice(size); + rep_->file->Append(sizeSlice); + rep_->file->Append(value); + rep_->offset += value_size + 4; + + rep_->num_entries++; 
+} + +Status SimpleTableBuilder::status() const { + return Status::OK(); +} + +Status SimpleTableBuilder::Finish() { + Rep* r = rep_; + assert(!r->closed); + r->closed = true; + + uint64_t index_offset = rep_->offset; + Slice index_slice(rep_->index); + rep_->file->Append(index_slice); + rep_->offset += index_slice.size(); + + std::string index_offset_str; + PutFixed64(&index_offset_str, index_offset); + Slice foot_slice(index_offset_str); + rep_->file->Append(foot_slice); + rep_->offset += foot_slice.size(); + + return Status::OK(); +} + +void SimpleTableBuilder::Abandon() { + rep_->closed = true; +} + +uint64_t SimpleTableBuilder::NumEntries() const { + return rep_->num_entries; +} + +uint64_t SimpleTableBuilder::FileSize() const { + return rep_->offset; +} + +class SimpleTableFactory: public TableFactory { +public: + ~SimpleTableFactory() { + } + SimpleTableFactory() { + } + const char* Name() const override { + return "SimpleTable"; + } + Status NewTableReader(const Options& options, const EnvOptions& soptions, + const InternalKeyComparator& internal_key, + unique_ptr&& file, uint64_t file_size, + unique_ptr* table_reader) const; + + TableBuilder* NewTableBuilder(const Options& options, + const InternalKeyComparator& internal_key, + WritableFile* file, + CompressionType compression_type) const; + + virtual Status SanitizeDBOptions(const DBOptions* db_opts) const override { + return Status::OK(); + } + + virtual std::string GetPrintableTableOptions() const override { + return std::string(); + } +}; + +Status SimpleTableFactory::NewTableReader( + const Options& options, const EnvOptions& soptions, + const InternalKeyComparator& internal_key, + unique_ptr&& file, uint64_t file_size, + unique_ptr* table_reader) const { + + return SimpleTableReader::Open(options, soptions, std::move(file), file_size, + table_reader); +} + +TableBuilder* SimpleTableFactory::NewTableBuilder( + const Options& options, const InternalKeyComparator& internal_key, + WritableFile* file, 
CompressionType compression_type) const { + return new SimpleTableBuilder(options, file, compression_type); +} + +class SimpleTableDBTest { +protected: +public: + std::string dbname_; + Env* env_; + DB* db_; + + Options last_options_; + + SimpleTableDBTest() : + env_(Env::Default()) { + dbname_ = test::TmpDir() + "/simple_table_db_test"; + ASSERT_OK(DestroyDB(dbname_, Options())); + db_ = nullptr; + Reopen(); + } + + ~SimpleTableDBTest() { + delete db_; + ASSERT_OK(DestroyDB(dbname_, Options())); + } + + // Return the current option configuration. + Options CurrentOptions() { + Options options; + options.table_factory.reset(new SimpleTableFactory()); + return options; + } + + DBImpl* dbfull() { + return reinterpret_cast(db_); + } + + void Reopen(Options* options = nullptr) { + ASSERT_OK(TryReopen(options)); + } + + void Close() { + delete db_; + db_ = nullptr; + } + + void DestroyAndReopen(Options* options = nullptr) { + //Destroy using last options + Destroy(&last_options_); + ASSERT_OK(TryReopen(options)); + } + + void Destroy(Options* options) { + delete db_; + db_ = nullptr; + ASSERT_OK(DestroyDB(dbname_, *options)); + } + + Status PureReopen(Options* options, DB** db) { + return DB::Open(*options, dbname_, db); + } + + Status TryReopen(Options* options = nullptr) { + delete db_; + db_ = nullptr; + Options opts; + if (options != nullptr) { + opts = *options; + } else { + opts = CurrentOptions(); + opts.create_if_missing = true; + } + last_options_ = opts; + + return DB::Open(opts, dbname_, &db_); + } + + Status Put(const Slice& k, const Slice& v) { + return db_->Put(WriteOptions(), k, v); + } + + Status Delete(const std::string& k) { + return db_->Delete(WriteOptions(), k); + } + + std::string Get(const std::string& k, const Snapshot* snapshot = nullptr) { + ReadOptions options; + options.snapshot = snapshot; + std::string result; + Status s = db_->Get(options, k, &result); + if (s.IsNotFound()) { + result = "NOT_FOUND"; + } else if (!s.ok()) { + result = 
s.ToString(); + } + return result; + } + + + int NumTableFilesAtLevel(int level) { + std::string property; + ASSERT_TRUE( + db_->GetProperty("rocksdb.num-files-at-level" + NumberToString(level), + &property)); + return atoi(property.c_str()); + } + + // Return spread of files per level + std::string FilesPerLevel() { + std::string result; + int last_non_zero_offset = 0; + for (int level = 0; level < db_->NumberLevels(); level++) { + int f = NumTableFilesAtLevel(level); + char buf[100]; + snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f); + result += buf; + if (f > 0) { + last_non_zero_offset = result.size(); + } + } + result.resize(last_non_zero_offset); + return result; + } + + std::string IterStatus(Iterator* iter) { + std::string result; + if (iter->Valid()) { + result = iter->key().ToString() + "->" + iter->value().ToString(); + } else { + result = "(invalid)"; + } + return result; + } +}; + +TEST(SimpleTableDBTest, Empty) { + ASSERT_TRUE(db_ != nullptr); + ASSERT_EQ("NOT_FOUND", Get("0000000000000foo")); +} + +TEST(SimpleTableDBTest, ReadWrite) { + ASSERT_OK(Put("0000000000000foo", "v1")); + ASSERT_EQ("v1", Get("0000000000000foo")); + ASSERT_OK(Put("0000000000000bar", "v2")); + ASSERT_OK(Put("0000000000000foo", "v3")); + ASSERT_EQ("v3", Get("0000000000000foo")); + ASSERT_EQ("v2", Get("0000000000000bar")); +} + +TEST(SimpleTableDBTest, Flush) { + ASSERT_OK(Put("0000000000000foo", "v1")); + ASSERT_OK(Put("0000000000000bar", "v2")); + ASSERT_OK(Put("0000000000000foo", "v3")); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("v3", Get("0000000000000foo")); + ASSERT_EQ("v2", Get("0000000000000bar")); +} + +TEST(SimpleTableDBTest, Flush2) { + ASSERT_OK(Put("0000000000000bar", "b")); + ASSERT_OK(Put("0000000000000foo", "v1")); + dbfull()->TEST_FlushMemTable(); + + ASSERT_OK(Put("0000000000000foo", "v2")); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("v2", Get("0000000000000foo")); + + ASSERT_OK(Put("0000000000000eee", "v3")); + dbfull()->TEST_FlushMemTable(); + 
ASSERT_EQ("v3", Get("0000000000000eee")); + + ASSERT_OK(Delete("0000000000000bar")); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("NOT_FOUND", Get("0000000000000bar")); + + ASSERT_OK(Put("0000000000000eee", "v5")); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("v5", Get("0000000000000eee")); +} + +static std::string Key(int i) { + char buf[100]; + snprintf(buf, sizeof(buf), "key_______%06d", i); + return std::string(buf); +} + +static std::string RandomString(Random* rnd, int len) { + std::string r; + test::RandomString(rnd, len, &r); + return r; +} + +TEST(SimpleTableDBTest, CompactionTrigger) { + Options options = CurrentOptions(); + options.write_buffer_size = 100 << 10; //100KB + options.num_levels = 3; + options.max_mem_compaction_level = 0; + options.level0_file_num_compaction_trigger = 3; + Reopen(&options); + + Random rnd(301); + + for (int num = 0; num < options.level0_file_num_compaction_trigger - 1; + num++) { + std::vector values; + // Write 120KB (12 values, each 10K) + for (int i = 0; i < 12; i++) { + values.push_back(RandomString(&rnd, 10000)); + ASSERT_OK(Put(Key(i), values[i])); + } + dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_EQ(NumTableFilesAtLevel(0), num + 1); + } + + //generate one more file in level-0, and should trigger level-0 compaction + std::vector values; + for (int i = 0; i < 12; i++) { + values.push_back(RandomString(&rnd, 10000)); + ASSERT_OK(Put(Key(i), values[i])); + } + dbfull()->TEST_WaitForCompact(); + + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + ASSERT_EQ(NumTableFilesAtLevel(1), 1); +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +} diff --git a/db/snapshot.h b/db/snapshot.h index 51fa556c82..2c2e3eac80 100644 --- a/db/snapshot.h +++ b/db/snapshot.h @@ -71,7 +71,7 @@ class SnapshotList { } // get the sequence number of the most recent snapshot - SequenceNumber GetNewest() { + const SequenceNumber GetNewest() { if (empty()) { return 0; } diff --git a/db/table_cache.cc 
b/db/table_cache.cc index 580e8049d9..c362499a68 100644 --- a/db/table_cache.cc +++ b/db/table_cache.cc @@ -15,7 +15,6 @@ #include "rocksdb/statistics.h" #include "table/iterator_wrapper.h" #include "table/table_reader.h" -#include "table/get_context.h" #include "util/coding.h" #include "util/stop_watch.h" @@ -37,10 +36,12 @@ static Slice GetSliceForFileNumber(const uint64_t* file_number) { sizeof(*file_number)); } -TableCache::TableCache(const ImmutableCFOptions& ioptions, - const EnvOptions& env_options, Cache* const cache) - : ioptions_(ioptions), - env_options_(env_options), +TableCache::TableCache(const Options* options, + const EnvOptions& storage_options, Cache* const cache) + : env_(options->env), + db_paths_(options->db_paths), + options_(options), + storage_options_(storage_options), cache_(cache) {} TableCache::~TableCache() { @@ -54,7 +55,7 @@ void TableCache::ReleaseHandle(Cache::Handle* handle) { cache_->Release(handle); } -Status TableCache::FindTable(const EnvOptions& env_options, +Status TableCache::FindTable(const EnvOptions& toptions, const InternalKeyComparator& internal_comparator, const FileDescriptor& fd, Cache::Handle** handle, const bool no_io) { @@ -67,24 +68,24 @@ Status TableCache::FindTable(const EnvOptions& env_options, return Status::Incomplete("Table not found in table_cache, no_io is set"); } std::string fname = - TableFileName(ioptions_.db_paths, fd.GetNumber(), fd.GetPathId()); + TableFileName(db_paths_, fd.GetNumber(), fd.GetPathId()); unique_ptr file; unique_ptr table_reader; - s = ioptions_.env->NewRandomAccessFile(fname, &file, env_options); - RecordTick(ioptions_.statistics, NO_FILE_OPENS); + s = env_->NewRandomAccessFile(fname, &file, toptions); + RecordTick(options_->statistics.get(), NO_FILE_OPENS); if (s.ok()) { - if (ioptions_.advise_random_on_open) { + if (options_->advise_random_on_open) { file->Hint(RandomAccessFile::RANDOM); } - StopWatch sw(ioptions_.env, ioptions_.statistics, TABLE_OPEN_IO_MICROS); - s = 
ioptions_.table_factory->NewTableReader( - ioptions_, env_options, internal_comparator, std::move(file), + StopWatch sw(env_, options_->statistics.get(), TABLE_OPEN_IO_MICROS); + s = options_->table_factory->NewTableReader( + *options_, toptions, internal_comparator, std::move(file), fd.GetFileSize(), &table_reader); } if (!s.ok()) { assert(table_reader == nullptr); - RecordTick(ioptions_.statistics, NO_FILE_ERRORS); + RecordTick(options_->statistics.get(), NO_FILE_ERRORS); // We do not cache error results so that if the error is transient, // or somebody repairs the file, we recover automatically. } else { @@ -96,7 +97,7 @@ Status TableCache::FindTable(const EnvOptions& env_options, } Iterator* TableCache::NewIterator(const ReadOptions& options, - const EnvOptions& env_options, + const EnvOptions& toptions, const InternalKeyComparator& icomparator, const FileDescriptor& fd, TableReader** table_reader_ptr, @@ -108,7 +109,7 @@ Iterator* TableCache::NewIterator(const ReadOptions& options, Cache::Handle* handle = nullptr; Status s; if (table_reader == nullptr) { - s = FindTable(env_options, icomparator, fd, &handle, + s = FindTable(toptions, icomparator, fd, &handle, options.read_tier == kBlockCacheTier); if (!s.ok()) { return NewErrorIterator(s, arena); @@ -133,33 +134,34 @@ Iterator* TableCache::NewIterator(const ReadOptions& options, Status TableCache::Get(const ReadOptions& options, const InternalKeyComparator& internal_comparator, - const FileDescriptor& fd, const Slice& k, - GetContext* get_context) { + const FileDescriptor& fd, const Slice& k, void* arg, + bool (*saver)(void*, const ParsedInternalKey&, + const Slice&), + void (*mark_key_may_exist)(void*)) { TableReader* t = fd.table_reader; Status s; Cache::Handle* handle = nullptr; if (!t) { - s = FindTable(env_options_, internal_comparator, fd, &handle, + s = FindTable(storage_options_, internal_comparator, fd, &handle, options.read_tier == kBlockCacheTier); if (s.ok()) { t = GetTableReaderFromHandle(handle); 
} } if (s.ok()) { - s = t->Get(options, k, get_context); + s = t->Get(options, k, arg, saver, mark_key_may_exist); if (handle != nullptr) { ReleaseHandle(handle); } } else if (options.read_tier && s.IsIncomplete()) { // Couldnt find Table in cache but treat as kFound if no_io set - get_context->MarkKeyMayExist(); + (*mark_key_may_exist)(arg); return Status::OK(); } return s; } - Status TableCache::GetTableProperties( - const EnvOptions& env_options, + const EnvOptions& toptions, const InternalKeyComparator& internal_comparator, const FileDescriptor& fd, std::shared_ptr* properties, bool no_io) { Status s; @@ -172,7 +174,7 @@ Status TableCache::GetTableProperties( } Cache::Handle* table_handle = nullptr; - s = FindTable(env_options, internal_comparator, fd, &table_handle, no_io); + s = FindTable(toptions, internal_comparator, fd, &table_handle, no_io); if (!s.ok()) { return s; } @@ -184,7 +186,7 @@ Status TableCache::GetTableProperties( } size_t TableCache::GetMemoryUsageByTableReader( - const EnvOptions& env_options, + const EnvOptions& toptions, const InternalKeyComparator& internal_comparator, const FileDescriptor& fd) { Status s; @@ -195,7 +197,7 @@ size_t TableCache::GetMemoryUsageByTableReader( } Cache::Handle* table_handle = nullptr; - s = FindTable(env_options, internal_comparator, fd, &table_handle, true); + s = FindTable(toptions, internal_comparator, fd, &table_handle, true); if (!s.ok()) { return 0; } diff --git a/db/table_cache.h b/db/table_cache.h index 76bb1c0a2b..79090e0649 100644 --- a/db/table_cache.h +++ b/db/table_cache.h @@ -19,7 +19,6 @@ #include "rocksdb/cache.h" #include "rocksdb/env.h" #include "rocksdb/table.h" -#include "rocksdb/options.h" #include "table/table_reader.h" namespace rocksdb { @@ -27,12 +26,11 @@ namespace rocksdb { class Env; class Arena; struct FileDescriptor; -class GetContext; class TableCache { public: - TableCache(const ImmutableCFOptions& ioptions, - const EnvOptions& storage_options, Cache* cache); + TableCache(const 
Options* options, const EnvOptions& storage_options, + Cache* cache); ~TableCache(); // Return an iterator for the specified file number (the corresponding @@ -53,8 +51,10 @@ class TableCache { // it returns false. Status Get(const ReadOptions& options, const InternalKeyComparator& internal_comparator, - const FileDescriptor& file_fd, const Slice& k, - GetContext* get_context); + const FileDescriptor& file_fd, const Slice& k, void* arg, + bool (*handle_result)(void*, const ParsedInternalKey&, + const Slice&), + void (*mark_key_may_exist)(void*) = nullptr); // Evict any entry for the specified file number static void Evict(Cache* cache, uint64_t file_number); @@ -91,8 +91,10 @@ class TableCache { void ReleaseHandle(Cache::Handle* handle); private: - const ImmutableCFOptions& ioptions_; - const EnvOptions& env_options_; + Env* const env_; + const std::vector db_paths_; + const Options* options_; + const EnvOptions& storage_options_; Cache* const cache_; }; diff --git a/db/table_properties_collector_test.cc b/db/table_properties_collector_test.cc index 74abf86709..638b259f2a 100644 --- a/db/table_properties_collector_test.cc +++ b/db/table_properties_collector_test.cc @@ -11,7 +11,6 @@ #include "db/dbformat.h" #include "db/table_properties_collector.h" #include "rocksdb/table.h" -#include "rocksdb/immutable_options.h" #include "table/block_based_table_factory.h" #include "table/meta_blocks.h" #include "table/plain_table_factory.h" @@ -86,14 +85,12 @@ class DumbLogger : public Logger { // Utilities test functions namespace { void MakeBuilder(const Options& options, - const ImmutableCFOptions& ioptions, const InternalKeyComparator& internal_comparator, std::unique_ptr* writable, std::unique_ptr* builder) { writable->reset(new FakeWritableFile); - builder->reset(ioptions.table_factory->NewTableBuilder( - ioptions, internal_comparator, writable->get(), - options.compression, options.compression_opts)); + builder->reset(options.table_factory->NewTableBuilder( + options, 
internal_comparator, writable->get(), options.compression)); } } // namespace @@ -156,8 +153,7 @@ void TestCustomizedTablePropertiesCollector( // -- Step 1: build table std::unique_ptr builder; std::unique_ptr writable; - const ImmutableCFOptions ioptions(options); - MakeBuilder(options, ioptions, internal_comparator, &writable, &builder); + MakeBuilder(options, internal_comparator, &writable, &builder); for (const auto& kv : kvs) { if (encode_as_internal) { @@ -268,10 +264,9 @@ void TestInternalKeyPropertiesCollector( options.table_properties_collector_factories = { std::make_shared()}; } - const ImmutableCFOptions ioptions(options); for (int iter = 0; iter < 2; ++iter) { - MakeBuilder(options, ioptions, pikc, &writable, &builder); + MakeBuilder(options, pikc, &writable, &builder); for (const auto& k : keys) { builder->Add(k.Encode(), "val"); } diff --git a/db/version_edit.h b/db/version_edit.h index db133402c9..58edfed451 100644 --- a/db/version_edit.h +++ b/db/version_edit.h @@ -163,13 +163,13 @@ class VersionEdit { // Add the specified file at the specified number. 
// REQUIRES: This version has not been saved (see VersionSet::SaveTo) // REQUIRES: "smallest" and "largest" are smallest and largest keys in file - void AddFile(int level, uint64_t file, uint64_t file_path_id, - uint64_t file_size, const InternalKey& smallest, + void AddFile(int level, uint64_t file, uint64_t file_size, + uint64_t file_path_id, const InternalKey& smallest, const InternalKey& largest, const SequenceNumber& smallest_seqno, const SequenceNumber& largest_seqno) { assert(smallest_seqno <= largest_seqno); FileMetaData f; - f.fd = FileDescriptor(file, file_path_id, file_size); + f.fd = FileDescriptor(file, file_size, file_path_id); f.smallest = smallest; f.largest = largest; f.smallest_seqno = smallest_seqno; diff --git a/db/version_set.cc b/db/version_set.cc index a092277fab..3a15458532 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -9,10 +9,7 @@ #include "db/version_set.h" -#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS -#endif - #include #include #include @@ -37,7 +34,6 @@ #include "table/format.h" #include "table/plain_table_factory.h" #include "table/meta_blocks.h" -#include "table/get_context.h" #include "util/coding.h" #include "util/logging.h" #include "util/stop_watch.h" @@ -513,9 +509,9 @@ Status Version::GetTableProperties(std::shared_ptr* tp, const FileMetaData* file_meta, const std::string* fname) { auto table_cache = cfd_->table_cache(); - auto ioptions = cfd_->ioptions(); + auto options = cfd_->options(); Status s = table_cache->GetTableProperties( - vset_->env_options_, cfd_->internal_comparator(), file_meta->fd, + vset_->storage_options_, cfd_->internal_comparator(), file_meta->fd, tp, true /* no io */); if (s.ok()) { return s; @@ -531,13 +527,13 @@ Status Version::GetTableProperties(std::shared_ptr* tp, // directly from the properties block in the file. 
std::unique_ptr file; if (fname != nullptr) { - s = ioptions->env->NewRandomAccessFile( - *fname, &file, vset_->env_options_); + s = options->env->NewRandomAccessFile( + *fname, &file, vset_->storage_options_); } else { - s = ioptions->env->NewRandomAccessFile( - TableFileName(vset_->db_options_->db_paths, file_meta->fd.GetNumber(), + s = options->env->NewRandomAccessFile( + TableFileName(vset_->options_->db_paths, file_meta->fd.GetNumber(), file_meta->fd.GetPathId()), - &file, vset_->env_options_); + &file, vset_->storage_options_); } if (!s.ok()) { return s; @@ -549,11 +545,11 @@ Status Version::GetTableProperties(std::shared_ptr* tp, s = ReadTableProperties( file.get(), file_meta->fd.GetFileSize(), Footer::kInvalidTableMagicNumber /* table's magic number */, - vset_->env_, ioptions->info_log, &raw_table_properties); + vset_->env_, options->info_log.get(), &raw_table_properties); if (!s.ok()) { return s; } - RecordTick(ioptions->statistics, NUMBER_DIRECT_LOAD_TABLE_PROPERTIES); + RecordTick(options->statistics.get(), NUMBER_DIRECT_LOAD_TABLE_PROPERTIES); *tp = std::shared_ptr(raw_table_properties); return s; @@ -563,7 +559,7 @@ Status Version::GetPropertiesOfAllTables(TablePropertiesCollection* props) { for (int level = 0; level < num_levels_; level++) { for (const auto& file_meta : files_[level]) { auto fname = - TableFileName(vset_->db_options_->db_paths, file_meta->fd.GetNumber(), + TableFileName(vset_->options_->db_paths, file_meta->fd.GetNumber(), file_meta->fd.GetPathId()); // 1. If the table is already present in table cache, load table // properties from there. 
@@ -585,7 +581,7 @@ size_t Version::GetMemoryUsageByTableReaders() { for (auto& file_level : file_levels_) { for (size_t i = 0; i < file_level.num_files; i++) { total_usage += cfd_->table_cache()->GetMemoryUsageByTableReader( - vset_->env_options_, cfd_->internal_comparator(), + vset_->storage_options_, cfd_->internal_comparator(), file_level.files[i].fd); } } @@ -600,6 +596,31 @@ uint64_t Version::GetEstimatedActiveKeys() { return num_non_deletions_ - num_deletions_; } +void Version::AddIterators(const ReadOptions& read_options, + const EnvOptions& soptions, + std::vector* iters) { + // Merge all level zero files together since they may overlap + for (size_t i = 0; i < file_levels_[0].num_files; i++) { + const auto& file = file_levels_[0].files[i]; + iters->push_back(cfd_->table_cache()->NewIterator( + read_options, soptions, cfd_->internal_comparator(), file.fd)); + } + + // For levels > 0, we can use a concatenating iterator that sequentially + // walks through the non-overlapping files in the level, opening them + // lazily. 
+ for (int level = 1; level < num_levels_; level++) { + if (file_levels_[level].num_files != 0) { + iters->push_back(NewTwoLevelIterator(new LevelFileIteratorState( + cfd_->table_cache(), read_options, soptions, + cfd_->internal_comparator(), false /* for_compaction */, + cfd_->options()->prefix_extractor != nullptr), + new LevelFileNumIterator(cfd_->internal_comparator(), + &file_levels_[level]))); + } + } +} + void Version::AddIterators(const ReadOptions& read_options, const EnvOptions& soptions, MergeIteratorBuilder* merge_iter_builder) { @@ -620,14 +641,112 @@ void Version::AddIterators(const ReadOptions& read_options, new LevelFileIteratorState( cfd_->table_cache(), read_options, soptions, cfd_->internal_comparator(), false /* for_compaction */, - cfd_->ioptions()->prefix_extractor != nullptr), + cfd_->options()->prefix_extractor != nullptr), new LevelFileNumIterator(cfd_->internal_comparator(), &file_levels_[level]), merge_iter_builder->GetArena())); } } } +// Callback from TableCache::Get() +enum SaverState { + kNotFound, + kFound, + kDeleted, + kCorrupt, + kMerge // saver contains the current merge result (the operands) +}; +namespace version_set { +struct Saver { + SaverState state; + const Comparator* ucmp; + Slice user_key; + bool* value_found; // Is value set correctly? Used by KeyMayExist + std::string* value; + const MergeOperator* merge_operator; + // the merge operations encountered; + MergeContext* merge_context; + Logger* logger; + Statistics* statistics; +}; +} // namespace version_set + +// Called from TableCache::Get and Table::Get when file/block in which +// key may exist are not there in TableCache/BlockCache respectively. 
In this +// case we can't guarantee that key does not exist and are not permitted to do +// IO to be certain.Set the status=kFound and value_found=false to let the +// caller know that key may exist but is not there in memory +static void MarkKeyMayExist(void* arg) { + version_set::Saver* s = reinterpret_cast(arg); + s->state = kFound; + if (s->value_found != nullptr) { + *(s->value_found) = false; + } +} + +static bool SaveValue(void* arg, const ParsedInternalKey& parsed_key, + const Slice& v) { + version_set::Saver* s = reinterpret_cast(arg); + MergeContext* merge_contex = s->merge_context; + std::string merge_result; // temporary area for merge results later + + assert(s != nullptr && merge_contex != nullptr); + + // TODO: Merge? + if (s->ucmp->Compare(parsed_key.user_key, s->user_key) == 0) { + // Key matches. Process it + switch (parsed_key.type) { + case kTypeValue: + if (kNotFound == s->state) { + s->state = kFound; + s->value->assign(v.data(), v.size()); + } else if (kMerge == s->state) { + assert(s->merge_operator != nullptr); + s->state = kFound; + if (!s->merge_operator->FullMerge(s->user_key, &v, + merge_contex->GetOperands(), + s->value, s->logger)) { + RecordTick(s->statistics, NUMBER_MERGE_FAILURES); + s->state = kCorrupt; + } + } else { + assert(false); + } + return false; + + case kTypeDeletion: + if (kNotFound == s->state) { + s->state = kDeleted; + } else if (kMerge == s->state) { + s->state = kFound; + if (!s->merge_operator->FullMerge(s->user_key, nullptr, + merge_contex->GetOperands(), + s->value, s->logger)) { + RecordTick(s->statistics, NUMBER_MERGE_FAILURES); + s->state = kCorrupt; + } + } else { + assert(false); + } + return false; + + case kTypeMerge: + assert(s->state == kNotFound || s->state == kMerge); + s->state = kMerge; + merge_contex->PushOperand(v); + return true; + + default: + assert(false); + break; + } + } + + // s->state could be Corrupt, merge or notfound + + return false; +} Version::Version(ColumnFamilyData* cfd, 
VersionSet* vset, uint64_t version_number) @@ -638,10 +757,10 @@ Version::Version(ColumnFamilyData* cfd, VersionSet* vset, (cfd == nullptr) ? nullptr : internal_comparator_->user_comparator()), table_cache_((cfd == nullptr) ? nullptr : cfd->table_cache()), merge_operator_((cfd == nullptr) ? nullptr - : cfd->ioptions()->merge_operator), - info_log_((cfd == nullptr) ? nullptr : cfd->ioptions()->info_log), + : cfd->options()->merge_operator.get()), + info_log_((cfd == nullptr) ? nullptr : cfd->options()->info_log.get()), db_statistics_((cfd == nullptr) ? nullptr - : cfd->ioptions()->statistics), + : cfd->options()->statistics.get()), // cfd is nullptr if Version is dummy num_levels_(cfd == nullptr ? 0 : cfd->NumberLevels()), num_non_empty_levels_(num_levels_), @@ -672,7 +791,7 @@ Version::Version(ColumnFamilyData* cfd, VersionSet* vset, } } -void Version::Get(const ReadOptions& read_options, +void Version::Get(const ReadOptions& options, const LookupKey& k, std::string* value, Status* status, @@ -682,42 +801,46 @@ void Version::Get(const ReadOptions& read_options, Slice user_key = k.user_key(); assert(status->ok() || status->IsMergeInProgress()); - - GetContext get_context(user_comparator_, merge_operator_, info_log_, - db_statistics_, status->ok() ? GetContext::kNotFound : GetContext::kMerge, - user_key, value, value_found, merge_context); + version_set::Saver saver; + saver.state = status->ok()? 
kNotFound : kMerge; + saver.ucmp = user_comparator_; + saver.user_key = user_key; + saver.value_found = value_found; + saver.value = value; + saver.merge_operator = merge_operator_; + saver.merge_context = merge_context; + saver.logger = info_log_; + saver.statistics = db_statistics_; FilePicker fp(files_, user_key, ikey, &file_levels_, num_non_empty_levels_, &file_indexer_, user_comparator_, internal_comparator_); FdWithKeyRange* f = fp.GetNextFile(); while (f != nullptr) { - *status = table_cache_->Get(read_options, *internal_comparator_, f->fd, - ikey, &get_context); + *status = table_cache_->Get(options, *internal_comparator_, f->fd, ikey, + &saver, SaveValue, MarkKeyMayExist); // TODO: examine the behavior for corrupted key if (!status->ok()) { return; } - switch (get_context.State()) { - case GetContext::kNotFound: - // Keep searching in other files - break; - case GetContext::kFound: + switch (saver.state) { + case kNotFound: + break; // Keep searching in other files + case kFound: return; - case GetContext::kDeleted: - // Use empty error message for speed - *status = Status::NotFound(); + case kDeleted: + *status = Status::NotFound(); // Use empty error message for speed return; - case GetContext::kCorrupt: + case kCorrupt: *status = Status::Corruption("corrupted key for ", user_key); return; - case GetContext::kMerge: + case kMerge: break; } f = fp.GetNextFile(); } - if (GetContext::kMerge == get_context.State()) { + if (kMerge == saver.state) { if (!merge_operator_) { *status = Status::InvalidArgument( "merge_operator is not properly initialized."); @@ -726,7 +849,7 @@ void Version::Get(const ReadOptions& read_options, // merge_operands are in saver and we hit the beginning of the key history // do a final merge of nullptr and operands; if (merge_operator_->FullMerge(user_key, nullptr, - merge_context->GetOperands(), value, + saver.merge_context->GetOperands(), value, info_log_)) { *status = Status::OK(); } else { @@ -746,10 +869,9 @@ void 
Version::GenerateFileLevels() { } } -void Version::PrepareApply(const MutableCFOptions& mutable_cf_options, - std::vector& size_being_compacted) { +void Version::PrepareApply(std::vector& size_being_compacted) { UpdateTemporaryStats(); - ComputeCompactionScore(mutable_cf_options, size_being_compacted); + ComputeCompactionScore(size_being_compacted); UpdateFilesBySize(); UpdateNumNonEmptyLevels(); file_indexer_.UpdateIndex(&arena_, num_non_empty_levels_, files_); @@ -764,7 +886,7 @@ bool Version::MaybeInitializeFileMetaData(FileMetaData* file_meta) { Status s = GetTableProperties(&tp, file_meta); file_meta->init_stats_from_file = true; if (!s.ok()) { - Log(vset_->db_options_->info_log, + Log(vset_->options_->info_log, "Unable to load table properties for file %" PRIu64 " --- %s\n", file_meta->fd.GetNumber(), s.ToString().c_str()); return false; @@ -818,13 +940,13 @@ void Version::UpdateTemporaryStats() { } void Version::ComputeCompactionScore( - const MutableCFOptions& mutable_cf_options, std::vector& size_being_compacted) { double max_score = 0; int max_score_level = 0; int max_input_level = cfd_->compaction_picker()->MaxInputLevel(NumberLevels()); + for (int level = 0; level <= max_input_level; level++) { double score; if (level == 0) { @@ -847,25 +969,24 @@ void Version::ComputeCompactionScore( numfiles++; } } - if (cfd_->ioptions()->compaction_style == kCompactionStyleFIFO) { + if (cfd_->options()->compaction_style == kCompactionStyleFIFO) { score = static_cast(total_size) / cfd_->options()->compaction_options_fifo.max_table_files_size; - } else if (numfiles >= mutable_cf_options.level0_stop_writes_trigger) { + } else if (numfiles >= cfd_->options()->level0_stop_writes_trigger) { // If we are slowing down writes, then we better compact that first score = 1000000; - } else if (numfiles >= - mutable_cf_options.level0_slowdown_writes_trigger) { + } else if (numfiles >= cfd_->options()->level0_slowdown_writes_trigger) { score = 10000; } else { score = 
static_cast(numfiles) / - mutable_cf_options.level0_file_num_compaction_trigger; + cfd_->options()->level0_file_num_compaction_trigger; } } else { // Compute the ratio of current size to size limit. const uint64_t level_bytes = TotalCompensatedFileSize(files_[level]) - size_being_compacted[level]; score = static_cast(level_bytes) / - mutable_cf_options.MaxBytesForLevel(level); + cfd_->compaction_picker()->MaxBytesForLevel(level); if (max_score < score) { max_score = score; max_score_level = level; @@ -917,8 +1038,8 @@ void Version::UpdateNumNonEmptyLevels() { } void Version::UpdateFilesBySize() { - if (cfd_->ioptions()->compaction_style == kCompactionStyleFIFO || - cfd_->ioptions()->compaction_style == kCompactionStyleUniversal) { + if (cfd_->options()->compaction_style == kCompactionStyleFIFO || + cfd_->options()->compaction_style == kCompactionStyleUniversal) { // don't need this return; } @@ -995,7 +1116,6 @@ bool Version::OverlapInLevel(int level, } int Version::PickLevelForMemTableOutput( - const MutableCFOptions& mutable_cf_options, const Slice& smallest_user_key, const Slice& largest_user_key) { int level = 0; @@ -1016,7 +1136,7 @@ int Version::PickLevelForMemTableOutput( } GetOverlappingInputs(level + 2, &start, &limit, &overlaps); const uint64_t sum = TotalFileSize(overlaps); - if (sum > mutable_cf_options.MaxGrandParentOverlapBytes(level)) { + if (sum > cfd_->compaction_picker()->MaxGrandParentOverlapBytes(level)) { break; } level++; @@ -1249,7 +1369,7 @@ bool Version::HasOverlappingUserKey( return false; } -uint64_t Version::NumLevelBytes(int level) const { +int64_t Version::NumLevelBytes(int level) const { assert(level >= 0); assert(level < NumberLevels()); return TotalFileSize(files_[level]); @@ -1579,7 +1699,7 @@ class VersionSet::Builder { for (auto& file_meta : *(levels_[level].added_files)) { assert (!file_meta->table_reader_handle); cfd_->table_cache()->FindTable( - base_->vset_->env_options_, cfd_->internal_comparator(), + 
base_->vset_->storage_options_, cfd_->internal_comparator(), file_meta->fd, &file_meta->table_reader_handle, false); if (file_meta->table_reader_handle != nullptr) { // Load table_reader @@ -1607,14 +1727,13 @@ class VersionSet::Builder { } }; -VersionSet::VersionSet(const std::string& dbname, const DBOptions* db_options, - const EnvOptions& env_options, Cache* table_cache, - WriteController* write_controller) - : column_family_set_(new ColumnFamilySet(dbname, db_options, env_options, - table_cache, write_controller)), - env_(db_options->env), +VersionSet::VersionSet(const std::string& dbname, const DBOptions* options, + const EnvOptions& storage_options, Cache* table_cache) + : column_family_set_(new ColumnFamilySet(dbname, options, storage_options, + table_cache)), + env_(options->env), dbname_(dbname), - db_options_(db_options), + options_(options), next_file_number_(2), manifest_file_number_(0), // Filled by Recover() pending_manifest_file_number_(0), @@ -1622,8 +1741,8 @@ VersionSet::VersionSet(const std::string& dbname, const DBOptions* db_options, prev_log_number_(0), current_version_number_(0), manifest_file_size_(0), - env_options_(env_options), - env_options_compactions_(env_options_) {} + storage_options_(storage_options), + storage_options_compactions_(storage_options_) {} VersionSet::~VersionSet() { // we need to delete column_family_set_ because its destructor depends on @@ -1656,17 +1775,16 @@ void VersionSet::AppendVersion(ColumnFamilyData* column_family_data, } Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, - const MutableCFOptions& mutable_cf_options, VersionEdit* edit, port::Mutex* mu, Directory* db_directory, bool new_descriptor_log, - const ColumnFamilyOptions* new_cf_options) { + const ColumnFamilyOptions* options) { mu->AssertHeld(); // column_family_data can be nullptr only if this is column_family_add. 
// in that case, we also need to specify ColumnFamilyOptions if (column_family_data == nullptr) { assert(edit->is_column_family_add_); - assert(new_cf_options != nullptr); + assert(options != nullptr); } // queue our request @@ -1726,7 +1844,7 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, assert(pending_manifest_file_number_ == 0); if (!descriptor_log_ || - manifest_file_size_ > db_options_->max_manifest_file_size) { + manifest_file_size_ > options_->max_manifest_file_size) { pending_manifest_file_number_ = NewFileNumber(); batch_edits.back()->SetNextFile(next_file_number_); new_descriptor_log = true; @@ -1754,8 +1872,7 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, mu->Unlock(); - if (!edit->IsColumnFamilyManipulation() && - db_options_->max_open_files == -1) { + if (!edit->IsColumnFamilyManipulation() && options_->max_open_files == -1) { // unlimited table cache. Pre-load table handle now. // Need to do it out of the mutex. builder->LoadTableHandlers(); @@ -1765,15 +1882,15 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, // only one thread can be here at the same time if (new_descriptor_log) { // create manifest file - Log(db_options_->info_log, + Log(options_->info_log, "Creating manifest %" PRIu64 "\n", pending_manifest_file_number_); unique_ptr descriptor_file; s = env_->NewWritableFile( DescriptorFileName(dbname_, pending_manifest_file_number_), - &descriptor_file, env_->OptimizeForManifestWrite(env_options_)); + &descriptor_file, env_->OptimizeForManifestWrite(storage_options_)); if (s.ok()) { descriptor_file->SetPreallocationBlockSize( - db_options_->manifest_preallocation_size); + options_->manifest_preallocation_size); descriptor_log_.reset(new log::Writer(std::move(descriptor_file))); s = WriteSnapshot(descriptor_log_.get()); } @@ -1781,7 +1898,7 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, if (!edit->IsColumnFamilyManipulation()) { // This is cpu-heavy 
operations, which should be called outside mutex. - v->PrepareApply(mutable_cf_options, size_being_compacted); + v->PrepareApply(size_being_compacted); } // Write new record to MANIFEST log @@ -1794,20 +1911,19 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, break; } } - if (s.ok() && db_options_->disableDataSync == false) { - if (db_options_->use_fsync) { - StopWatch sw(env_, db_options_->statistics.get(), + if (s.ok()) { + if (options_->use_fsync) { + StopWatch sw(env_, options_->statistics.get(), MANIFEST_FILE_SYNC_MICROS); s = descriptor_log_->file()->Fsync(); } else { - StopWatch sw(env_, db_options_->statistics.get(), + StopWatch sw(env_, options_->statistics.get(), MANIFEST_FILE_SYNC_MICROS); s = descriptor_log_->file()->Sync(); } } if (!s.ok()) { - Log(db_options_->info_log, "MANIFEST write: %s\n", - s.ToString().c_str()); + Log(options_->info_log, "MANIFEST write: %s\n", s.ToString().c_str()); bool all_records_in = true; for (auto& e : batch_edits) { std::string record; @@ -1818,7 +1934,7 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, } } if (all_records_in) { - Log(db_options_->info_log, + Log(options_->info_log, "MANIFEST contains log record despite error; advancing to new " "version to prevent mismatch between in-memory and logged state" " If paranoid is set, then the db is now in readonly mode."); @@ -1831,10 +1947,10 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, // new CURRENT file that points to it. if (s.ok() && new_descriptor_log) { s = SetCurrentFile(env_, dbname_, pending_manifest_file_number_, - db_options_->disableDataSync ? 
nullptr : db_directory); + db_directory); if (s.ok() && pending_manifest_file_number_ > manifest_file_number_) { // delete old manifest file - Log(db_options_->info_log, + Log(options_->info_log, "Deleting manifest %" PRIu64 " current manifest %" PRIu64 "\n", manifest_file_number_, pending_manifest_file_number_); // we don't care about an error here, PurgeObsoleteFiles will take care @@ -1848,7 +1964,7 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, new_manifest_file_size = descriptor_log_->file()->GetFileSize(); } - LogFlush(db_options_->info_log); + LogFlush(options_->info_log); mu->Lock(); } @@ -1857,8 +1973,8 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, if (edit->is_column_family_add_) { // no group commit on column family add assert(batch_edits.size() == 1); - assert(new_cf_options != nullptr); - CreateColumnFamily(*new_cf_options, edit); + assert(options != nullptr); + CreateColumnFamily(*options, edit); } else if (edit->is_column_family_drop_) { assert(batch_edits.size() == 1); column_family_data->SetDropped(); @@ -1884,12 +2000,12 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, manifest_file_size_ = new_manifest_file_size; prev_log_number_ = edit->prev_log_number_; } else { - Log(db_options_->info_log, "Error in committing version %lu to [%s]", + Log(options_->info_log, "Error in committing version %lu to [%s]", (unsigned long)v->GetVersionNumber(), column_family_data->GetName().c_str()); delete v; if (new_descriptor_log) { - Log(db_options_->info_log, + Log(options_->info_log, "Deleting manifest %" PRIu64 " current manifest %" PRIu64 "\n", manifest_file_number_, pending_manifest_file_number_); descriptor_log_.reset(); @@ -1981,13 +2097,13 @@ Status VersionSet::Recover( return Status::Corruption("CURRENT file corrupted"); } - Log(db_options_->info_log, "Recovering from manifest file: %s\n", + Log(options_->info_log, "Recovering from manifest file: %s\n", manifest_filename.c_str()); 
manifest_filename = dbname_ + "/" + manifest_filename; unique_ptr manifest_file; s = env_->NewSequentialFile(manifest_filename, &manifest_file, - env_options_); + storage_options_); if (!s.ok()) { return s; } @@ -2114,7 +2230,7 @@ Status VersionSet::Recover( if (cfd != nullptr) { if (edit.has_log_number_) { if (cfd->GetLogNumber() > edit.log_number_) { - Log(db_options_->info_log, + Log(options_->info_log, "MANIFEST corruption detected, but ignored - Log numbers in " "records NOT monotonically increasing"); } else { @@ -2173,7 +2289,7 @@ Status VersionSet::Recover( // there were some column families in the MANIFEST that weren't specified // in the argument. This is OK in read_only mode - if (read_only == false && !column_families_not_found.empty()) { + if (read_only == false && column_families_not_found.size() > 0) { std::string list_of_not_found; for (const auto& cf : column_families_not_found) { list_of_not_found += ", " + cf.second; @@ -2190,7 +2306,7 @@ Status VersionSet::Recover( assert(builders_iter != builders.end()); auto builder = builders_iter->second; - if (db_options_->max_open_files == -1) { + if (options_->max_open_files == -1) { // unlimited table cache. Pre-load table handle now. // Need to do it out of the mutex. 
builder->LoadTableHandlers(); @@ -2202,7 +2318,7 @@ Status VersionSet::Recover( // Install recovered version std::vector size_being_compacted(v->NumberLevels() - 1); cfd->compaction_picker()->SizeBeingCompacted(size_being_compacted); - v->PrepareApply(*cfd->GetLatestMutableCFOptions(), size_being_compacted); + v->PrepareApply(size_being_compacted); AppendVersion(cfd, v); } @@ -2211,7 +2327,7 @@ Status VersionSet::Recover( last_sequence_ = last_sequence; prev_log_number_ = prev_log_number; - Log(db_options_->info_log, + Log(options_->info_log, "Recovered from manifest file:%s succeeded," "manifest_file_number is %lu, next_file_number is %lu, " "last_sequence is %lu, log_number is %lu," @@ -2223,7 +2339,7 @@ Status VersionSet::Recover( column_family_set_->GetMaxColumnFamily()); for (auto cfd : *column_family_set_) { - Log(db_options_->info_log, + Log(options_->info_log, "Column family [%s] (ID %u), log number is %" PRIu64 "\n", cfd->GetName().c_str(), cfd->GetID(), cfd->GetLogNumber()); } @@ -2306,7 +2422,7 @@ Status VersionSet::ListColumnFamilies(std::vector* column_families, #ifndef ROCKSDB_LITE Status VersionSet::ReduceNumberOfLevels(const std::string& dbname, const Options* options, - const EnvOptions& env_options, + const EnvOptions& storage_options, int new_levels) { if (new_levels <= 1) { return Status::InvalidArgument( @@ -2317,8 +2433,7 @@ Status VersionSet::ReduceNumberOfLevels(const std::string& dbname, std::shared_ptr tc(NewLRUCache( options->max_open_files - 10, options->table_cache_numshardbits, options->table_cache_remove_scan_count_limit)); - WriteController wc; - VersionSet versions(dbname, options, env_options, tc.get(), &wc); + VersionSet versions(dbname, options, storage_options, tc.get()); Status status; std::vector dummy; @@ -2378,20 +2493,18 @@ Status VersionSet::ReduceNumberOfLevels(const std::string& dbname, current_version->files_ = new_files_list; current_version->num_levels_ = new_levels; - MutableCFOptions mutable_cf_options(*options, 
ImmutableCFOptions(*options)); VersionEdit ve; port::Mutex dummy_mutex; MutexLock l(&dummy_mutex); - return versions.LogAndApply( - versions.GetColumnFamilySet()->GetDefault(), - mutable_cf_options, &ve, &dummy_mutex, nullptr, true); + return versions.LogAndApply(versions.GetColumnFamilySet()->GetDefault(), &ve, + &dummy_mutex, nullptr, true); } Status VersionSet::DumpManifest(Options& options, std::string& dscname, bool verbose, bool hex) { // Open the specified manifest file. unique_ptr file; - Status s = options.env->NewSequentialFile(dscname, &file, env_options_); + Status s = options.env->NewSequentialFile(dscname, &file, storage_options_); if (!s.ok()) { return s; } @@ -2536,7 +2649,7 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname, builder->SaveTo(v); std::vector size_being_compacted(v->NumberLevels() - 1); cfd->compaction_picker()->SizeBeingCompacted(size_being_compacted); - v->PrepareApply(*cfd->GetLatestMutableCFOptions(), size_being_compacted); + v->PrepareApply(size_being_compacted); delete builder; printf("--------------- Column family \"%s\" (ID %u) --------------\n", @@ -2633,12 +2746,12 @@ bool VersionSet::ManifestContains(uint64_t manifest_file_number, const std::string& record) const { std::string fname = DescriptorFileName(dbname_, manifest_file_number); - Log(db_options_->info_log, "ManifestContains: checking %s\n", fname.c_str()); + Log(options_->info_log, "ManifestContains: checking %s\n", fname.c_str()); unique_ptr file; - Status s = env_->NewSequentialFile(fname, &file, env_options_); + Status s = env_->NewSequentialFile(fname, &file, storage_options_); if (!s.ok()) { - Log(db_options_->info_log, "ManifestContains: %s\n", s.ToString().c_str()); - Log(db_options_->info_log, + Log(options_->info_log, "ManifestContains: %s\n", s.ToString().c_str()); + Log(options_->info_log, "ManifestContains: is unable to reopen the manifest file %s", fname.c_str()); return false; @@ -2653,7 +2766,7 @@ bool 
VersionSet::ManifestContains(uint64_t manifest_file_number, break; } } - Log(db_options_->info_log, "ManifestContains: result = %d\n", result ? 1 : 0); + Log(options_->info_log, "ManifestContains: result = %d\n", result ? 1 : 0); return result; } @@ -2681,7 +2794,7 @@ uint64_t VersionSet::ApproximateOffsetOf(Version* v, const InternalKey& ikey) { // approximate offset of "ikey" within the table. TableReader* table_reader_ptr; Iterator* iter = v->cfd_->table_cache()->NewIterator( - ReadOptions(), env_options_, v->cfd_->internal_comparator(), + ReadOptions(), storage_options_, v->cfd_->internal_comparator(), files[i]->fd, &table_reader_ptr); if (table_reader_ptr != nullptr) { result += table_reader_ptr->ApproximateOffsetOf(ikey.Encode()); @@ -2743,14 +2856,14 @@ Iterator* VersionSet::MakeInputIterator(Compaction* c) { const FileLevel* flevel = c->input_levels(which); for (size_t i = 0; i < flevel->num_files; i++) { list[num++] = cfd->table_cache()->NewIterator( - read_options, env_options_compactions_, + read_options, storage_options_compactions_, cfd->internal_comparator(), flevel->files[i].fd, nullptr, true /* for compaction */); } } else { // Create concatenating iterator for the files from this level list[num++] = NewTwoLevelIterator(new Version::LevelFileIteratorState( - cfd->table_cache(), read_options, env_options_, + cfd->table_cache(), read_options, storage_options_, cfd->internal_comparator(), true /* for_compaction */, false /* prefix enabled */), new Version::LevelFileNumIterator(cfd->internal_comparator(), @@ -2771,7 +2884,7 @@ bool VersionSet::VerifyCompactionFileConsistency(Compaction* c) { #ifndef NDEBUG Version* version = c->column_family_data()->current(); if (c->input_version() != version) { - Log(db_options_->info_log, + Log(options_->info_log, "[%s] VerifyCompactionFileConsistency version mismatch", c->column_family_data()->GetName().c_str()); } @@ -2842,11 +2955,11 @@ void VersionSet::GetLiveFilesMetaData(std::vector* metadata) { 
LiveFileMetaData filemetadata; filemetadata.column_family_name = cfd->GetName(); uint32_t path_id = file->fd.GetPathId(); - if (path_id < db_options_->db_paths.size()) { - filemetadata.db_path = db_options_->db_paths[path_id].path; + if (path_id < options_->db_paths.size()) { + filemetadata.db_path = options_->db_paths[path_id].path; } else { - assert(!db_options_->db_paths.empty()); - filemetadata.db_path = db_options_->db_paths.back().path; + assert(!options_->db_paths.empty()); + filemetadata.db_path = options_->db_paths.back().path; } filemetadata.name = MakeTableFileName("", file->fd.GetNumber()); filemetadata.level = level; @@ -2867,21 +2980,17 @@ void VersionSet::GetObsoleteFiles(std::vector* files) { } ColumnFamilyData* VersionSet::CreateColumnFamily( - const ColumnFamilyOptions& cf_options, VersionEdit* edit) { + const ColumnFamilyOptions& options, VersionEdit* edit) { assert(edit->is_column_family_add_); Version* dummy_versions = new Version(nullptr, this); auto new_cfd = column_family_set_->CreateColumnFamily( - edit->column_family_name_, edit->column_family_, dummy_versions, - cf_options); + edit->column_family_name_, edit->column_family_, dummy_versions, options); Version* v = new Version(new_cfd, this, current_version_number_++); AppendVersion(new_cfd, v); - // GetLatestMutableCFOptions() is safe here without mutex since the - // cfd is not available to client - new_cfd->CreateNewMemtable(MemTableOptions( - *new_cfd->GetLatestMutableCFOptions(), *new_cfd->options())); + new_cfd->CreateNewMemtable(); new_cfd->SetLogNumber(edit->log_number_); return new_cfd; } diff --git a/db/version_set.h b/db/version_set.h index 05e6e9a653..2f6d477a1d 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -34,7 +34,6 @@ #include "db/column_family.h" #include "db/log_reader.h" #include "db/file_indexer.h" -#include "db/write_controller.h" namespace rocksdb { @@ -87,6 +86,8 @@ class Version { // Append to *iters a sequence of iterators that will // yield the contents 
of this Version when merged together. // REQUIRES: This version has been saved (see VersionSet::SaveTo) + void AddIterators(const ReadOptions&, const EnvOptions& soptions, + std::vector* iters); void AddIterators(const ReadOptions&, const EnvOptions& soptions, MergeIteratorBuilder* merger_iter_builder); @@ -103,18 +104,14 @@ class Version { // We use compaction scores to figure out which compaction to do next // REQUIRES: If Version is not yet saved to current_, it can be called without // a lock. Once a version is saved to current_, call only with mutex held - void ComputeCompactionScore( - const MutableCFOptions& mutable_cf_options, - std::vector& size_being_compacted); + void ComputeCompactionScore(std::vector& size_being_compacted); // Generate file_levels_ from files_ void GenerateFileLevels(); // Update scores, pre-calculated variables. It needs to be called before // applying the version to the version set. - void PrepareApply( - const MutableCFOptions& mutable_cf_options, - std::vector& size_being_compacted); + void PrepareApply(std::vector& size_being_compacted); // Reference count management (so Versions do not disappear out from // under live iterators) @@ -173,8 +170,7 @@ class Version { // Return the level at which we should place a new memtable compaction // result that covers the range [smallest_user_key,largest_user_key]. - int PickLevelForMemTableOutput(const MutableCFOptions& mutable_cf_options, - const Slice& smallest_user_key, + int PickLevelForMemTableOutput(const Slice& smallest_user_key, const Slice& largest_user_key); int NumberLevels() const { return num_levels_; } @@ -183,15 +179,15 @@ class Version { int NumLevelFiles(int level) const { return files_[level].size(); } // Return the combined file size of all files at the specified level. - uint64_t NumLevelBytes(int level) const; + int64_t NumLevelBytes(int level) const; // Return a human-readable short (single-line) summary of the number // of files per level. 
Uses *scratch as backing store. struct LevelSummaryStorage { - char buffer[1000]; + char buffer[100]; }; struct FileSummaryStorage { - char buffer[3000]; + char buffer[1000]; }; const char* LevelSummary(LevelSummaryStorage* scratch) const; // Return a human-readable short (single-line) summary of files @@ -250,7 +246,6 @@ class Version { friend class Compaction; friend class VersionSet; friend class DBImpl; - friend class CompactedDBImpl; friend class ColumnFamilyData; friend class CompactionPicker; friend class LevelCompactionPicker; @@ -262,7 +257,7 @@ class Version { class LevelFileNumIterator; class LevelFileIteratorState; - bool PrefixMayMatch(const ReadOptions& read_options, Iterator* level_iter, + bool PrefixMayMatch(const ReadOptions& options, Iterator* level_iter, const Slice& internal_prefix) const; // Update num_non_empty_levels_. @@ -328,8 +323,8 @@ class Version { // These are used to pick the best compaction level std::vector compaction_score_; std::vector compaction_level_; - double max_compaction_score_ = 0.0; // max score in l1 to ln-1 - int max_compaction_score_level_ = 0; // level on which max score occurs + double max_compaction_score_; // max score in l1 to ln-1 + int max_compaction_score_level_; // level on which max score occurs // A version number that uniquely represents this version. This is // used for debugging and logging purposes only. @@ -363,9 +358,8 @@ class Version { class VersionSet { public: - VersionSet(const std::string& dbname, const DBOptions* db_options, - const EnvOptions& env_options, Cache* table_cache, - WriteController* write_controller); + VersionSet(const std::string& dbname, const DBOptions* options, + const EnvOptions& storage_options, Cache* table_cache); ~VersionSet(); // Apply *edit to the current version to form a new descriptor that @@ -374,9 +368,7 @@ class VersionSet { // column_family_options has to be set if edit is column family add // REQUIRES: *mu is held on entry. 
// REQUIRES: no other thread concurrently calls LogAndApply() - Status LogAndApply(ColumnFamilyData* column_family_data, - const MutableCFOptions& mutable_cf_options, - VersionEdit* edit, + Status LogAndApply(ColumnFamilyData* column_family_data, VersionEdit* edit, port::Mutex* mu, Directory* db_directory = nullptr, bool new_descriptor_log = false, const ColumnFamilyOptions* column_family_options = @@ -405,7 +397,7 @@ class VersionSet { // among [4-6] contains files. static Status ReduceNumberOfLevels(const std::string& dbname, const Options* options, - const EnvOptions& env_options, + const EnvOptions& storage_options, int new_levels); // printf contents (for debugging) @@ -514,14 +506,14 @@ class VersionSet { bool ManifestContains(uint64_t manifest_file_number, const std::string& record) const; - ColumnFamilyData* CreateColumnFamily(const ColumnFamilyOptions& cf_options, + ColumnFamilyData* CreateColumnFamily(const ColumnFamilyOptions& options, VersionEdit* edit); std::unique_ptr column_family_set_; Env* const env_; const std::string dbname_; - const DBOptions* const db_options_; + const DBOptions* const options_; uint64_t next_file_number_; uint64_t manifest_file_number_; uint64_t pending_manifest_file_number_; @@ -542,12 +534,12 @@ class VersionSet { std::vector obsolete_files_; - // env options for all reads and writes except compactions - const EnvOptions& env_options_; + // storage options for all reads and writes except compactions + const EnvOptions& storage_options_; - // env options used for compactions. This is a copy of - // env_options_ but with readaheads set to readahead_compactions_. - const EnvOptions env_options_compactions_; + // storage options used for compactions. This is a copy of + // storage_options_ but with readaheads set to readahead_compactions_. 
+ const EnvOptions storage_options_compactions_; // No copying allowed VersionSet(const VersionSet&); diff --git a/db/write_batch.cc b/db/write_batch.cc index b8d0322d85..bfa5e3f6f2 100644 --- a/db/write_batch.cc +++ b/db/write_batch.cc @@ -23,6 +23,7 @@ // data: uint8[len] #include "rocksdb/write_batch.h" +#include "rocksdb/options.h" #include "rocksdb/merge_operator.h" #include "db/dbformat.h" #include "db/db_impl.h" @@ -349,15 +350,14 @@ class MemTableInserter : public WriteBatch::Handler { return seek_status; } MemTable* mem = cf_mems_->GetMemTable(); - auto* ioptions = mem->GetImmutableOptions(); - auto* moptions = mem->GetMemTableOptions(); - if (!moptions->inplace_update_support) { + const Options* options = cf_mems_->GetOptions(); + if (!options->inplace_update_support) { mem->Add(sequence_, kTypeValue, key, value); - } else if (moptions->inplace_callback == nullptr) { + } else if (options->inplace_callback == nullptr) { mem->Update(sequence_, key, value); - RecordTick(ioptions->statistics, NUMBER_KEYS_UPDATED); + RecordTick(options->statistics.get(), NUMBER_KEYS_UPDATED); } else { - if (mem->UpdateCallback(sequence_, key, value)) { + if (mem->UpdateCallback(sequence_, key, value, *options)) { } else { // key not found in memtable. Do sst get, update, add SnapshotImpl read_from_snapshot; @@ -376,17 +376,17 @@ class MemTableInserter : public WriteBatch::Handler { char* prev_buffer = const_cast(prev_value.c_str()); uint32_t prev_size = prev_value.size(); - auto status = moptions->inplace_callback(s.ok() ? prev_buffer : nullptr, - s.ok() ? &prev_size : nullptr, - value, &merged_value); + auto status = options->inplace_callback(s.ok() ? prev_buffer : nullptr, + s.ok() ? &prev_size : nullptr, + value, &merged_value); if (status == UpdateStatus::UPDATED_INPLACE) { // prev_value is updated in-place with final value. 
mem->Add(sequence_, kTypeValue, key, Slice(prev_buffer, prev_size)); - RecordTick(ioptions->statistics, NUMBER_KEYS_WRITTEN); + RecordTick(options->statistics.get(), NUMBER_KEYS_WRITTEN); } else if (status == UpdateStatus::UPDATED) { // merged_value contains the final value. mem->Add(sequence_, kTypeValue, key, Slice(merged_value)); - RecordTick(ioptions->statistics, NUMBER_KEYS_WRITTEN); + RecordTick(options->statistics.get(), NUMBER_KEYS_WRITTEN); } } } @@ -394,7 +394,6 @@ class MemTableInserter : public WriteBatch::Handler { // sequence number. Even if the update eventually fails and does not result // in memtable add/update. sequence_++; - cf_mems_->CheckMemtableFull(); return Status::OK(); } @@ -406,18 +405,17 @@ class MemTableInserter : public WriteBatch::Handler { return seek_status; } MemTable* mem = cf_mems_->GetMemTable(); - auto* ioptions = mem->GetImmutableOptions(); - auto* moptions = mem->GetMemTableOptions(); + const Options* options = cf_mems_->GetOptions(); bool perform_merge = false; - if (moptions->max_successive_merges > 0 && db_ != nullptr) { + if (options->max_successive_merges > 0 && db_ != nullptr) { LookupKey lkey(key, sequence_); // Count the number of successive merges at the head // of the key in the memtable size_t num_merges = mem->CountSuccessiveMergeEntries(lkey); - if (num_merges >= moptions->max_successive_merges) { + if (num_merges >= options->max_successive_merges) { perform_merge = true; } } @@ -441,16 +439,16 @@ class MemTableInserter : public WriteBatch::Handler { Slice get_value_slice = Slice(get_value); // 2) Apply this merge - auto merge_operator = ioptions->merge_operator; + auto merge_operator = options->merge_operator.get(); assert(merge_operator); std::deque operands; operands.push_front(value.ToString()); std::string new_value; if (!merge_operator->FullMerge(key, &get_value_slice, operands, - &new_value, ioptions->info_log)) { + &new_value, options->info_log.get())) { // Failed to merge! 
- RecordTick(ioptions->statistics, NUMBER_MERGE_FAILURES); + RecordTick(options->statistics.get(), NUMBER_MERGE_FAILURES); // Store the delta in memtable perform_merge = false; @@ -466,7 +464,6 @@ class MemTableInserter : public WriteBatch::Handler { } sequence_++; - cf_mems_->CheckMemtableFull(); return Status::OK(); } @@ -477,9 +474,8 @@ class MemTableInserter : public WriteBatch::Handler { return seek_status; } MemTable* mem = cf_mems_->GetMemTable(); - auto* ioptions = mem->GetImmutableOptions(); - auto* moptions = mem->GetMemTableOptions(); - if (!dont_filter_deletes_ && moptions->filter_deletes) { + const Options* options = cf_mems_->GetOptions(); + if (!dont_filter_deletes_ && options->filter_deletes) { SnapshotImpl read_from_snapshot; read_from_snapshot.number_ = sequence_; ReadOptions ropts; @@ -490,13 +486,12 @@ class MemTableInserter : public WriteBatch::Handler { cf_handle = db_->DefaultColumnFamily(); } if (!db_->KeyMayExist(ropts, cf_handle, key, &value)) { - RecordTick(ioptions->statistics, NUMBER_FILTERED_DELETES); + RecordTick(options->statistics.get(), NUMBER_FILTERED_DELETES); return Status::OK(); } } mem->Add(sequence_, kTypeDeletion, key, Slice()); sequence_++; - cf_mems_->CheckMemtableFull(); return Status::OK(); } }; diff --git a/db/write_batch_internal.h b/db/write_batch_internal.h index 568cd70d81..615a47f5eb 100644 --- a/db/write_batch_internal.h +++ b/db/write_batch_internal.h @@ -28,7 +28,6 @@ class ColumnFamilyMemTables { virtual MemTable* GetMemTable() const = 0; virtual const Options* GetOptions() const = 0; virtual ColumnFamilyHandle* GetColumnFamilyHandle() = 0; - virtual void CheckMemtableFull() = 0; }; class ColumnFamilyMemTablesDefault : public ColumnFamilyMemTables { @@ -55,8 +54,6 @@ class ColumnFamilyMemTablesDefault : public ColumnFamilyMemTables { ColumnFamilyHandle* GetColumnFamilyHandle() override { return nullptr; } - void CheckMemtableFull() override {} - private: bool ok_; MemTable* mem_; diff --git 
a/db/write_batch_test.cc b/db/write_batch_test.cc index cb4048214a..1d30552b31 100644 --- a/db/write_batch_test.cc +++ b/db/write_batch_test.cc @@ -18,7 +18,6 @@ #include "rocksdb/utilities/write_batch_with_index.h" #include "util/logging.h" #include "util/testharness.h" -#include "util/scoped_arena_iterator.h" namespace rocksdb { @@ -27,16 +26,13 @@ static std::string PrintContents(WriteBatch* b) { auto factory = std::make_shared(); Options options; options.memtable_factory = factory; - ImmutableCFOptions ioptions(options); - MemTable* mem = new MemTable(cmp, ioptions, - MemTableOptions(MutableCFOptions(options, ioptions), options)); + MemTable* mem = new MemTable(cmp, options); mem->Ref(); std::string state; ColumnFamilyMemTablesDefault cf_mems_default(mem, &options); Status s = WriteBatchInternal::InsertInto(b, &cf_mems_default); int count = 0; - Arena arena; - ScopedArenaIterator iter(mem->NewIterator(ReadOptions(), &arena)); + Iterator* iter = mem->NewIterator(ReadOptions()); for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { ParsedInternalKey ikey; memset((void *)&ikey, 0, sizeof(ikey)); @@ -71,6 +67,7 @@ static std::string PrintContents(WriteBatch* b) { state.append("@"); state.append(NumberToString(ikey.sequence)); } + delete iter; if (!s.ok()) { state.append(s.ToString()); } else if (count != WriteBatchInternal::Count(b)) { @@ -290,9 +287,6 @@ class ColumnFamilyHandleImplDummy : public ColumnFamilyHandleImpl { explicit ColumnFamilyHandleImplDummy(int id) : ColumnFamilyHandleImpl(nullptr, nullptr, nullptr), id_(id) {} uint32_t GetID() const override { return id_; } - const Comparator* user_comparator() const override { - return BytewiseComparator(); - } private: uint32_t id_; @@ -324,7 +318,7 @@ TEST(WriteBatchTest, ColumnFamiliesBatchTest) { } TEST(WriteBatchTest, ColumnFamiliesBatchWithIndexTest) { - WriteBatchWithIndex batch; + WriteBatchWithIndex batch(BytewiseComparator(), 20); ColumnFamilyHandleImplDummy zero(0), two(2), three(3), eight(8); 
batch.Put(&zero, Slice("foo"), Slice("bar")); batch.Put(&two, Slice("twofoo"), Slice("bar2")); diff --git a/db/write_controller.cc b/db/write_controller.cc deleted file mode 100644 index bb6f8ecf75..0000000000 --- a/db/write_controller.cc +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. - -#include "db/write_controller.h" - -#include - -namespace rocksdb { - -std::unique_ptr WriteController::GetStopToken() { - ++total_stopped_; - return std::unique_ptr(new StopWriteToken(this)); -} - -std::unique_ptr WriteController::GetDelayToken( - uint64_t delay_us) { - total_delay_us_ += delay_us; - return std::unique_ptr( - new DelayWriteToken(this, delay_us)); -} - -bool WriteController::IsStopped() const { return total_stopped_ > 0; } -uint64_t WriteController::GetDelay() const { return total_delay_us_; } - -StopWriteToken::~StopWriteToken() { - assert(controller_->total_stopped_ >= 1); - --controller_->total_stopped_; -} - -DelayWriteToken::~DelayWriteToken() { - assert(controller_->total_delay_us_ >= delay_us_); - controller_->total_delay_us_ -= delay_us_; -} - -} // namespace rocksdb diff --git a/db/write_controller.h b/db/write_controller.h deleted file mode 100644 index 32e1d58f10..0000000000 --- a/db/write_controller.h +++ /dev/null @@ -1,78 +0,0 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. - -#pragma once - -#include - -#include - -namespace rocksdb { - -class WriteControllerToken; - -// WriteController is controlling write stalls in our write code-path. 
Write -// stalls happen when compaction can't keep up with write rate. -// All of the methods here (including WriteControllerToken's destructors) need -// to be called while holding DB mutex -class WriteController { - public: - WriteController() : total_stopped_(0), total_delay_us_(0) {} - ~WriteController() = default; - - // When an actor (column family) requests a stop token, all writes will be - // stopped until the stop token is released (deleted) - std::unique_ptr GetStopToken(); - // When an actor (column family) requests a delay token, total delay for all - // writes will be increased by delay_us. The delay will last until delay token - // is released - std::unique_ptr GetDelayToken(uint64_t delay_us); - - // these two metods are querying the state of the WriteController - bool IsStopped() const; - uint64_t GetDelay() const; - - private: - friend class WriteControllerToken; - friend class StopWriteToken; - friend class DelayWriteToken; - - int total_stopped_; - uint64_t total_delay_us_; -}; - -class WriteControllerToken { - public: - explicit WriteControllerToken(WriteController* controller) - : controller_(controller) {} - virtual ~WriteControllerToken() {} - - protected: - WriteController* controller_; - - private: - // no copying allowed - WriteControllerToken(const WriteControllerToken&) = delete; - void operator=(const WriteControllerToken&) = delete; -}; - -class StopWriteToken : public WriteControllerToken { - public: - explicit StopWriteToken(WriteController* controller) - : WriteControllerToken(controller) {} - virtual ~StopWriteToken(); -}; - -class DelayWriteToken : public WriteControllerToken { - public: - DelayWriteToken(WriteController* controller, uint64_t delay_us) - : WriteControllerToken(controller), delay_us_(delay_us) {} - virtual ~DelayWriteToken(); - - private: - uint64_t delay_us_; -}; - -} // namespace rocksdb diff --git a/db/write_controller_test.cc b/db/write_controller_test.cc deleted file mode 100644 index 1cec9658d4..0000000000 
--- a/db/write_controller_test.cc +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. -// -#include "db/write_controller.h" - -#include "util/testharness.h" - -namespace rocksdb { - -class WriteControllerTest {}; - -TEST(WriteControllerTest, SanityTest) { - WriteController controller; - auto stop_token_1 = controller.GetStopToken(); - auto stop_token_2 = controller.GetStopToken(); - - ASSERT_EQ(true, controller.IsStopped()); - stop_token_1.reset(); - ASSERT_EQ(true, controller.IsStopped()); - stop_token_2.reset(); - ASSERT_EQ(false, controller.IsStopped()); - - auto delay_token_1 = controller.GetDelayToken(5); - ASSERT_EQ(static_cast(5), controller.GetDelay()); - auto delay_token_2 = controller.GetDelayToken(8); - ASSERT_EQ(static_cast(13), controller.GetDelay()); - - delay_token_2.reset(); - ASSERT_EQ(static_cast(5), controller.GetDelay()); - delay_token_1.reset(); - ASSERT_EQ(static_cast(0), controller.GetDelay()); - delay_token_1.reset(); - ASSERT_EQ(false, controller.IsStopped()); -} - -} // namespace rocksdb - -int main(int argc, char** argv) { return rocksdb::test::RunAllTests(); } diff --git a/db/write_thread.cc b/db/write_thread.cc deleted file mode 100644 index 052e1209ef..0000000000 --- a/db/write_thread.cc +++ /dev/null @@ -1,147 +0,0 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. 
- -#include "db/write_thread.h" - -namespace rocksdb { - -Status WriteThread::EnterWriteThread(WriteThread::Writer* w, - uint64_t expiration_time) { - // the following code block pushes the current writer "w" into the writer - // queue "writers_" and wait until one of the following conditions met: - // 1. the job of "w" has been done by some other writers. - // 2. "w" becomes the first writer in "writers_" - // 3. "w" timed-out. - writers_.push_back(w); - - bool timed_out = false; - while (!w->done && w != writers_.front()) { - if (expiration_time == 0) { - w->cv.Wait(); - } else if (w->cv.TimedWait(expiration_time)) { - if (w->in_batch_group) { - // then it means the front writer is currently doing the - // write on behalf of this "timed-out" writer. Then it - // should wait until the write completes. - expiration_time = 0; - } else { - timed_out = true; - break; - } - } - } - - if (timed_out) { -#ifndef NDEBUG - bool found = false; -#endif - for (auto iter = writers_.begin(); iter != writers_.end(); iter++) { - if (*iter == w) { - writers_.erase(iter); -#ifndef NDEBUG - found = true; -#endif - break; - } - } -#ifndef NDEBUG - assert(found); -#endif - // writers_.front() might still be in cond_wait without a time-out. - // As a result, we need to signal it to wake it up. Otherwise no - // one else will wake him up, and RocksDB will hang. - if (!writers_.empty()) { - writers_.front()->cv.Signal(); - } - return Status::TimedOut(); - } - return Status::OK(); -} - -void WriteThread::ExitWriteThread(WriteThread::Writer* w, - WriteThread::Writer* last_writer, - Status status) { - // Pop out the current writer and all writers being pushed before the - // current writer from the writer queue. 
- while (!writers_.empty()) { - Writer* ready = writers_.front(); - writers_.pop_front(); - if (ready != w) { - ready->status = status; - ready->done = true; - ready->cv.Signal(); - } - if (ready == last_writer) break; - } - - // Notify new head of write queue - if (!writers_.empty()) { - writers_.front()->cv.Signal(); - } -} - -// This function will be called only when the first writer succeeds. -// All writers in the to-be-built batch group will be processed. -// -// REQUIRES: Writer list must be non-empty -// REQUIRES: First writer must have a non-nullptr batch -void WriteThread::BuildBatchGroup(WriteThread::Writer** last_writer, - autovector* write_batch_group) { - assert(!writers_.empty()); - Writer* first = writers_.front(); - assert(first->batch != nullptr); - - size_t size = WriteBatchInternal::ByteSize(first->batch); - write_batch_group->push_back(first->batch); - - // Allow the group to grow up to a maximum size, but if the - // original write is small, limit the growth so we do not slow - // down the small write too much. - size_t max_size = 1 << 20; - if (size <= (128<<10)) { - max_size = size + (128<<10); - } - - *last_writer = first; - std::deque::iterator iter = writers_.begin(); - ++iter; // Advance past "first" - for (; iter != writers_.end(); ++iter) { - Writer* w = *iter; - if (w->sync && !first->sync) { - // Do not include a sync write into a batch handled by a non-sync write. - break; - } - - if (!w->disableWAL && first->disableWAL) { - // Do not include a write that needs WAL into a batch that has - // WAL disabled. - break; - } - - if (w->timeout_hint_us < first->timeout_hint_us) { - // Do not include those writes with shorter timeout. Otherwise, we might - // execute a write that should instead be aborted because of timeout. - break; - } - - if (w->batch == nullptr) { - // Do not include those writes with nullptr batch. Those are not writes, - // those are something else. 
They want to be alone - break; - } - - size += WriteBatchInternal::ByteSize(w->batch); - if (size > max_size) { - // Do not make batch too big - break; - } - - write_batch_group->push_back(w->batch); - w->in_batch_group = true; - *last_writer = w; - } -} - -} // namespace rocksdb diff --git a/db/write_thread.h b/db/write_thread.h deleted file mode 100644 index 8c5baa664b..0000000000 --- a/db/write_thread.h +++ /dev/null @@ -1,80 +0,0 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. - -#pragma once - -#include -#include -#include -#include "rocksdb/status.h" -#include "db/write_batch_internal.h" -#include "util/autovector.h" -#include "port/port.h" - -namespace rocksdb { - -class WriteThread { - public: - static const uint64_t kNoTimeOut = std::numeric_limits::max(); - // Information kept for every waiting writer - struct Writer { - Status status; - WriteBatch* batch; - bool sync; - bool disableWAL; - bool in_batch_group; - bool done; - uint64_t timeout_hint_us; - port::CondVar cv; - - explicit Writer(port::Mutex* mu) - : batch(nullptr), - sync(false), - disableWAL(false), - in_batch_group(false), - done(false), - timeout_hint_us(kNoTimeOut), - cv(mu) {} - }; - - WriteThread() = default; - ~WriteThread() = default; - - // Before applying write operation (such as DBImpl::Write, DBImpl::Flush) - // thread should grab the mutex_ and be the first on writers queue. - // EnterWriteThread is used for it. - // Be aware! Writer's job can be done by other thread (see DBImpl::Write - // for examples), so check it via w.done before applying changes. 
- // - // Writer* w: writer to be placed in the queue - // uint64_t expiration_time: maximum time to be in the queue - // See also: ExitWriteThread - // REQUIRES: db mutex held - Status EnterWriteThread(Writer* w, uint64_t expiration_time); - - // After doing write job, we need to remove already used writers from - // writers_ queue and notify head of the queue about it. - // ExitWriteThread is used for this. - // - // Writer* w: Writer, that was added by EnterWriteThread function - // Writer* last_writer: Since we can join a few Writers (as DBImpl::Write - // does) - // we should pass last_writer as a parameter to - // ExitWriteThread - // (if you don't touch other writers, just pass w) - // Status status: Status of write operation - // See also: EnterWriteThread - // REQUIRES: db mutex held - void ExitWriteThread(Writer* w, Writer* last_writer, Status status); - - void BuildBatchGroup(Writer** last_writer, - autovector* write_batch_group); - - private: - // Queue of writers. - std::deque writers_; -}; - -} // namespace rocksdb diff --git a/include/rocksdb/c.h b/include/rocksdb/c.h index 726a1edc31..c54e6707f0 100644 --- a/include/rocksdb/c.h +++ b/include/rocksdb/c.h @@ -537,6 +537,8 @@ extern void rocksdb_options_set_min_partial_merge_operands( rocksdb_options_t*, uint32_t); extern void rocksdb_options_set_bloom_locality( rocksdb_options_t*, uint32_t); +extern void rocksdb_options_set_allow_thread_local( + rocksdb_options_t*, unsigned char); extern void rocksdb_options_set_inplace_update_support( rocksdb_options_t*, unsigned char); extern void rocksdb_options_set_inplace_update_num_locks( @@ -696,10 +698,6 @@ extern void rocksdb_readoptions_set_fill_cache( extern void rocksdb_readoptions_set_snapshot( rocksdb_readoptions_t*, const rocksdb_snapshot_t*); -extern void rocksdb_readoptions_set_iterate_upper_bound( - rocksdb_readoptions_t*, - const char* key, - size_t keylen); extern void rocksdb_readoptions_set_read_tier( rocksdb_readoptions_t*, int); extern void 
rocksdb_readoptions_set_tailing( diff --git a/include/rocksdb/cache.h b/include/rocksdb/cache.h index a8a6f9b73a..65d44b6cbf 100644 --- a/include/rocksdb/cache.h +++ b/include/rocksdb/cache.h @@ -127,6 +127,9 @@ class Cache { void LRU_Append(Handle* e); void Unref(Handle* e); + struct Rep; + Rep* rep_; + // No copying allowed Cache(const Cache&); void operator=(const Cache&); diff --git a/include/rocksdb/comparator.h b/include/rocksdb/comparator.h index 8e73667527..f3a8499a8f 100644 --- a/include/rocksdb/comparator.h +++ b/include/rocksdb/comparator.h @@ -62,10 +62,6 @@ class Comparator { // must not be deleted. extern const Comparator* BytewiseComparator(); -// Return a builtin comparator that uses reverse lexicographic byte-wise -// ordering. -extern const Comparator* ReverseBytewiseComparator(); - } // namespace rocksdb #endif // STORAGE_ROCKSDB_INCLUDE_COMPARATOR_H_ diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h index 0653a83868..d9be6b4270 100644 --- a/include/rocksdb/db.h +++ b/include/rocksdb/db.h @@ -123,7 +123,7 @@ class DB { // Open DB with column families. // db_options specify database specific options - // column_families is the vector of all column families in the database, + // column_families is the vector of all column families in the databse, // containing column family name and options. You need to open ALL column // families in the database. To get the list of column families, you can use // ListColumnFamilies(). Also, you can open only a subset of column families @@ -359,14 +359,6 @@ class DB { return CompactRange(DefaultColumnFamily(), begin, end, reduce_level, target_level, target_path_id); } - virtual bool SetOptions(ColumnFamilyHandle* column_family, - const std::unordered_map& new_options) { - return true; - } - virtual bool SetOptions( - const std::unordered_map& new_options) { - return SetOptions(DefaultColumnFamily(), new_options); - } // Number of levels used for this DB. 
virtual int NumberLevels(ColumnFamilyHandle* column_family) = 0; diff --git a/include/rocksdb/filter_policy.h b/include/rocksdb/filter_policy.h index 90aefb388b..fa44db45ff 100644 --- a/include/rocksdb/filter_policy.h +++ b/include/rocksdb/filter_policy.h @@ -21,52 +21,11 @@ #define STORAGE_ROCKSDB_INCLUDE_FILTER_POLICY_H_ #include -#include namespace rocksdb { class Slice; -// A class that takes a bunch of keys, then generates filter -class FilterBitsBuilder { - public: - virtual ~FilterBitsBuilder() {} - - // Add Key to filter, you could use any way to store the key. - // Such as: storing hashes or original keys - // Keys are in sorted order and duplicated keys are possible. - virtual void AddKey(const Slice& key) = 0; - - // Generate the filter using the keys that are added - // The return value of this function would be the filter bits, - // The ownership of actual data is set to buf - virtual Slice Finish(std::unique_ptr* buf) = 0; -}; - -// A class that checks if a key can be in filter -// It should be initialized by Slice generated by BitsBuilder -class FilterBitsReader { - public: - virtual ~FilterBitsReader() {} - - // Check if the entry match the bits in filter - virtual bool MayMatch(const Slice& entry) = 0; -}; - -// We add a new format of filter block called full filter block -// This new interface gives you more space of customization -// -// For the full filter block, you can plug in your version by implement -// the FilterBitsBuilder and FilterBitsReader -// -// There are two sets of interface in FilterPolicy -// Set 1: CreateFilter, KeyMayMatch: used for blockbased filter -// Set 2: GetFilterBitsBuilder, GetFilterBitsReader, they are used for -// full filter. -// Set 1 MUST be implemented correctly, Set 2 is optional -// RocksDB would first try using functions in Set 2. if they return nullptr, -// it would use Set 1 instead. 
-// You can choose filter type in NewBloomFilterPolicy class FilterPolicy { public: virtual ~FilterPolicy(); @@ -92,28 +51,11 @@ class FilterPolicy { // This method may return true or false if the key was not on the // list, but it should aim to return false with a high probability. virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const = 0; - - // Get the FilterBitsBuilder, which is ONLY used for full filter block - // It contains interface to take individual key, then generate filter - virtual FilterBitsBuilder* GetFilterBitsBuilder() const { - return nullptr; - } - - // Get the FilterBitsReader, which is ONLY used for full filter block - // It contains interface to tell if key can be in filter - // The input slice should NOT be deleted by FilterPolicy - virtual FilterBitsReader* GetFilterBitsReader(const Slice& contents) const { - return nullptr; - } }; // Return a new filter policy that uses a bloom filter with approximately -// the specified number of bits per key. -// -// bits_per_key: bits per key in bloom filter. A good value for bits_per_key +// the specified number of bits per key. A good value for bits_per_key // is 10, which yields a filter with ~ 1% false positive rate. -// use_block_based_builder: use block based filter rather than full fiter. -// If you want to builder full filter, it needs to be set to false. // // Callers must delete the result after any database that is using the // result has been closed. @@ -125,8 +67,8 @@ class FilterPolicy { // ignores trailing spaces, it would be incorrect to use a // FilterPolicy (like NewBloomFilterPolicy) that does not ignore // trailing spaces in keys. 
-extern const FilterPolicy* NewBloomFilterPolicy(int bits_per_key, - bool use_block_based_builder = true); +extern const FilterPolicy* NewBloomFilterPolicy(int bits_per_key); + } #endif // STORAGE_ROCKSDB_INCLUDE_FILTER_POLICY_H_ diff --git a/include/rocksdb/immutable_options.h b/include/rocksdb/immutable_options.h deleted file mode 100644 index 2dd50f7563..0000000000 --- a/include/rocksdb/immutable_options.h +++ /dev/null @@ -1,87 +0,0 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. - -#pragma once - -#include -#include "rocksdb/options.h" - -namespace rocksdb { - -// ImmutableCFOptions is a data struct used by RocksDB internal. It contains a -// subset of Options that should not be changed during the entire lifetime -// of DB. You shouldn't need to access this data structure unless you are -// implementing a new TableFactory. Raw pointers defined in this struct do -// not have ownership to the data they point to. Options contains shared_ptr -// to these data. -struct ImmutableCFOptions { - explicit ImmutableCFOptions(const Options& options); - - CompactionStyle compaction_style; - - CompactionOptionsUniversal compaction_options_universal; - CompactionOptionsFIFO compaction_options_fifo; - - const SliceTransform* prefix_extractor; - - const Comparator* comparator; - - MergeOperator* merge_operator; - - const CompactionFilter* compaction_filter; - - CompactionFilterFactory* compaction_filter_factory; - - CompactionFilterFactoryV2* compaction_filter_factory_v2; - - Logger* info_log; - - Statistics* statistics; - - InfoLogLevel info_log_level; - - Env* env; - - // Allow the OS to mmap file for reading sst tables. Default: false - bool allow_mmap_reads; - - // Allow the OS to mmap file for writing. 
Default: false - bool allow_mmap_writes; - - std::vector db_paths; - - MemTableRepFactory* memtable_factory; - - TableFactory* table_factory; - - Options::TablePropertiesCollectorFactories - table_properties_collector_factories; - - bool advise_random_on_open; - - // This options is required by PlainTableReader. May need to move it - // to PlainTalbeOptions just like bloom_bits_per_key - uint32_t bloom_locality; - - bool purge_redundant_kvs_while_flush; - - uint32_t min_partial_merge_operands; - - bool disable_data_sync; - - bool use_fsync; - - CompressionType compression; - - std::vector compression_per_level; - - CompressionOptions compression_opts; - - Options::AccessHint access_hint_on_compaction_start; - - int num_levels; -}; - -} // namespace rocksdb diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index 467c7bb1e6..11d976fb2b 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -14,7 +14,6 @@ #include #include #include -#include #include "rocksdb/version.h" #include "rocksdb/universal_compaction.h" @@ -58,7 +57,6 @@ enum CompactionStyle : char { kCompactionStyleFIFO = 0x2, // FIFO compaction style }; - struct CompactionOptionsFIFO { // once the total sum of table files reaches this, we will delete the oldest // table file @@ -226,12 +224,17 @@ struct ColumnFamilyOptions { CompressionType compression; // Different levels can have different compression policies. There - // are cases where most lower levels would like to use quick compression - // algorithms while the higher levels (which have more data) use + // are cases where most lower levels would like to quick compression + // algorithm while the higher levels (which have more data) use // compression algorithms that have better compression but could - // be slower. This array, if non-empty, should have an entry for - // each level of the database; these override the value specified in - // the previous field 'compression'. + // be slower. 
This array, if non nullptr, should have an entry for + // each level of the database. This array, if non nullptr, overides the + // value specified in the previous field 'compression'. The caller is + // reponsible for allocating memory and initializing the values in it + // before invoking Open(). The caller is responsible for freeing this + // array and it could be freed anytime after the return from Open(). + // This could have been a std::vector but that makes the equivalent + // java/C api hard to construct. std::vector compression_per_level; // different options for compression algorithms @@ -288,7 +291,7 @@ struct ColumnFamilyOptions { // and each file on level-3 will be 200MB. // by default target_file_size_base is 2MB. - uint64_t target_file_size_base; + int target_file_size_base; // by default target_file_size_multiplier is 1, which means // by default files in different levels will have similar size. int target_file_size_multiplier; @@ -344,7 +347,9 @@ struct ColumnFamilyOptions { // Default: 0 (disabled) double hard_rate_limit; - // DEPRECATED -- this options is no longer used + // Max time a put will be stalled when hard_rate_limit is enforced. If 0, then + // there is no limit. + // Default: 1000 unsigned int rate_limit_delay_max_milliseconds; // size of one block in arena memory allocation. @@ -612,7 +617,7 @@ struct DBOptions { // it does not use any locks to prevent concurrent updates. std::shared_ptr statistics; - // If true, then the contents of manifest and data files are not synced + // If true, then the contents of data files are not synced // to stable storage. Their contents remain in the OS buffers till the // OS decides to flush them. This option is good for bulk-loading // of data. Once the bulk-loading is complete, please issue a @@ -784,13 +789,12 @@ struct DBOptions { // Specify the file access pattern once a compaction is started. // It will be applied to all input files of a compaction. 
// Default: NORMAL - enum AccessHint { - NONE, - NORMAL, - SEQUENTIAL, - WILLNEED - }; - AccessHint access_hint_on_compaction_start; + enum { + NONE, + NORMAL, + SEQUENTIAL, + WILLNEED + } access_hint_on_compaction_start; // Use adaptive mutex, which spins in the user space before resorting // to kernel. This could reduce context switch when the mutex is not @@ -799,6 +803,10 @@ struct DBOptions { // Default: false bool use_adaptive_mutex; + // Allow RocksDB to use thread local storage to optimize performance. + // Default: true + bool allow_thread_local; + // Create DBOptions with default values for all fields DBOptions(); // Create DBOptions from Options @@ -895,18 +903,6 @@ struct ReadOptions { // ! DEPRECATED // const Slice* prefix; - // "iterate_upper_bound" defines the extent upto which the forward iterator - // can returns entries. Once the bound is reached, Valid() will be false. - // "iterate_upper_bound" is exclusive ie the bound value is - // not a valid entry. If iterator_extractor is not null, the Seek target - // and iterator_upper_bound need to have the same prefix. - // This is because ordering is not guaranteed outside of prefix domain. - // There is no lower bound on the iterator. If needed, that can be easily - // implemented - // - // Default: nullptr - const Slice* iterate_upper_bound; - // Specify if this read request should process data that ALREADY // resides on a particular cache. If the required data is not // found at the specified cache, then Status::Incomplete is returned. 
@@ -930,7 +926,6 @@ struct ReadOptions { : verify_checksums(true), fill_cache(true), snapshot(nullptr), - iterate_upper_bound(nullptr), read_tier(kReadAllTier), tailing(false), total_order_seek(false) {} @@ -938,7 +933,6 @@ struct ReadOptions { : verify_checksums(cksum), fill_cache(cache), snapshot(nullptr), - iterate_upper_bound(nullptr), read_tier(kReadAllTier), tailing(false), total_order_seek(false) {} @@ -1011,12 +1005,6 @@ extern Options GetOptions(size_t total_write_buffer_limit, int read_amplification_threshold = 8, int write_amplification_threshold = 32, uint64_t target_db_size = 68719476736 /* 64GB */); - -bool GetOptionsFromStrings( - const Options& base_options, - const std::unordered_map& options_map, - Options* new_options); - } // namespace rocksdb #endif // STORAGE_ROCKSDB_INCLUDE_OPTIONS_H_ diff --git a/include/rocksdb/statistics.h b/include/rocksdb/statistics.h index 87ac321c90..6785833b4d 100644 --- a/include/rocksdb/statistics.h +++ b/include/rocksdb/statistics.h @@ -115,7 +115,7 @@ enum Tickers : uint32_t { // head of the writers queue. WRITE_DONE_BY_SELF, WRITE_DONE_BY_OTHER, - WRITE_TIMEDOUT, // Number of writes ending up with timed-out. + WRITE_TIMEDOUT, // Number of writes ending up with timed-out. 
WRITE_WITH_WAL, // Number of Write calls that request WAL COMPACT_READ_BYTES, // Bytes read during compaction COMPACT_WRITE_BYTES, // Bytes written during compaction @@ -212,6 +212,7 @@ enum Histograms : uint32_t { READ_BLOCK_COMPACTION_MICROS, READ_BLOCK_GET_MICROS, WRITE_RAW_BLOCK_MICROS, + STALL_L0_SLOWDOWN_COUNT, STALL_MEMTABLE_COMPACTION_COUNT, STALL_L0_NUM_FILES_COUNT, @@ -219,7 +220,6 @@ enum Histograms : uint32_t { SOFT_RATE_LIMIT_DELAY_COUNT, NUM_FILES_IN_SINGLE_COMPACTION, DB_SEEK, - WRITE_STALL, HISTOGRAM_ENUM_MAX, }; diff --git a/include/rocksdb/table.h b/include/rocksdb/table.h index 4c06c23f7d..0f8b41074d 100644 --- a/include/rocksdb/table.h +++ b/include/rocksdb/table.h @@ -23,7 +23,6 @@ #include "rocksdb/env.h" #include "rocksdb/iterator.h" #include "rocksdb/options.h" -#include "rocksdb/immutable_options.h" #include "rocksdb/status.h" namespace rocksdb { @@ -251,46 +250,23 @@ struct CuckooTablePropertyNames { // Denotes if the key sorted in the file is Internal Key (if false) // or User Key only (if true). static const std::string kIsLastLevel; - // Indicate if using identity function for the first hash function. - static const std::string kIdentityAsFirstHash; - // Indicate if using module or bit and to calculate hash value - static const std::string kUseModuleHash; - // Fixed user key length - static const std::string kUserKeyLength; -}; - -struct CuckooTableOptions { - // Determines the utilization of hash tables. Smaller values - // result in larger hash tables with fewer collisions. - double hash_table_ratio = 0.9; - // A property used by builder to determine the depth to go to - // to search for a path to displace elements in case of - // collision. See Builder.MakeSpaceForKey method. Higher - // values result in more efficient hash tables with fewer - // lookups but take more time to build. 
- uint32_t max_search_depth = 100; - // In case of collision while inserting, the builder - // attempts to insert in the next cuckoo_block_size - // locations before skipping over to the next Cuckoo hash - // function. This makes lookups more cache friendly in case - // of collisions. - uint32_t cuckoo_block_size = 5; - // If this option is enabled, user key is treated as uint64_t and its value - // is used as hash value directly. This option changes builder's behavior. - // Reader ignore this option and behave according to what specified in table - // property. - bool identity_as_first_hash = false; - // If this option is set to true, module is used during hash calculation. - // This often yields better space efficiency at the cost of performance. - // If this optino is set to false, # of entries in table is constrained to be - // power of two, and bit and is used to calculate hash, which is faster in - // general. - bool use_module_hash = true; }; // Cuckoo Table Factory for SST table format using Cache Friendly Cuckoo Hashing -extern TableFactory* NewCuckooTableFactory( - const CuckooTableOptions& table_options = CuckooTableOptions()); +// @hash_table_ratio: Determines the utilization of hash tables. Smaller values +// result in larger hash tables with fewer collisions. +// @max_search_depth: A property used by builder to determine the depth to go to +// to search for a path to displace elements in case of +// collision. See Builder.MakeSpaceForKey method. Higher +// values result in more efficient hash tables with fewer +// lookups but take more time to build. +// @cuckoo_block_size: In case of collision while inserting, the builder +// attempts to insert in the next cuckoo_block_size +// locations before skipping over to the next Cuckoo hash +// function. This makes lookups more cache friendly in case +// of collisions. 
+extern TableFactory* NewCuckooTableFactory(double hash_table_ratio = 0.9, + uint32_t max_search_depth = 100, uint32_t cuckoo_block_size = 5); #endif // ROCKSDB_LITE @@ -317,15 +293,14 @@ class TableFactory { // and cache the table object returned. // (1) SstFileReader (for SST Dump) opens the table and dump the table // contents using the interator of the table. - // ImmutableCFOptions is a subset of Options that can not be altered. - // EnvOptions is a subset of Options that will be used by Env. + // options and soptions are options. options is the general options. // Multiple configured can be accessed from there, including and not // limited to block cache and key comparators. // file is a file handler to handle the file for the table // file_size is the physical file size of the file // table_reader is the output table reader virtual Status NewTableReader( - const ImmutableCFOptions& ioptions, const EnvOptions& env_options, + const Options& options, const EnvOptions& soptions, const InternalKeyComparator& internal_comparator, unique_ptr&& file, uint64_t file_size, unique_ptr* table_reader) const = 0; @@ -343,17 +318,14 @@ class TableFactory { // (4) When running Repairer, it creates a table builder to convert logs to // SST files (In Repairer::ConvertLogToTable() by calling BuildTable()) // - // ImmutableCFOptions is a subset of Options that can not be altered. - // Multiple configured can be acceseed from there, including and not limited - // to compression options. file is a handle of a writable file. - // It is the caller's responsibility to keep the file open and close the file - // after closing the table builder. compression_type is the compression type - // to use in this table. + // options is the general options. Multiple configured can be acceseed from + // there, including and not limited to compression options. + // file is a handle of a writable file. 
It is the caller's responsibility to + // keep the file open and close the file after closing the table builder. + // compression_type is the compression type to use in this table. virtual TableBuilder* NewTableBuilder( - const ImmutableCFOptions& ioptions, - const InternalKeyComparator& internal_comparator, - WritableFile* file, const CompressionType compression_type, - const CompressionOptions& compression_opts) const = 0; + const Options& options, const InternalKeyComparator& internal_comparator, + WritableFile* file, CompressionType compression_type) const = 0; // Sanitizes the specified DB Options. // diff --git a/include/rocksdb/utilities/backupable_db.h b/include/rocksdb/utilities/backupable_db.h index 57a8accdf8..78365769d2 100644 --- a/include/rocksdb/utilities/backupable_db.h +++ b/include/rocksdb/utilities/backupable_db.h @@ -10,10 +10,7 @@ #pragma once #ifndef ROCKSDB_LITE -#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS -#endif - #include #include #include @@ -130,41 +127,9 @@ struct BackupInfo { int64_t timestamp; uint64_t size; - uint32_t number_files; - BackupInfo() {} - - BackupInfo(BackupID _backup_id, int64_t _timestamp, uint64_t _size, - uint32_t _number_files) - : backup_id(_backup_id), timestamp(_timestamp), size(_size), - number_files(_number_files) {} -}; - -class BackupStatistics { - public: - BackupStatistics() { - number_success_backup = 0; - number_fail_backup = 0; - } - - BackupStatistics(uint32_t _number_success_backup, - uint32_t _number_fail_backup) - : number_success_backup(_number_success_backup), - number_fail_backup(_number_fail_backup) {} - - ~BackupStatistics() {} - - void IncrementNumberSuccessBackup(); - void IncrementNumberFailBackup(); - - uint32_t GetNumberSuccessBackup() const; - uint32_t GetNumberFailBackup() const; - - std::string ToString() const; - - private: - uint32_t number_success_backup; - uint32_t number_fail_backup; + BackupInfo(BackupID _backup_id, int64_t _timestamp, uint64_t _size) + : 
backup_id(_backup_id), timestamp(_timestamp), size(_size) {} }; class BackupEngineReadOnly { diff --git a/include/rocksdb/utilities/write_batch_with_index.h b/include/rocksdb/utilities/write_batch_with_index.h index 85c80850fc..c09f53d112 100644 --- a/include/rocksdb/utilities/write_batch_with_index.h +++ b/include/rocksdb/utilities/write_batch_with_index.h @@ -11,9 +11,8 @@ #pragma once -#include "rocksdb/comparator.h" -#include "rocksdb/slice.h" #include "rocksdb/status.h" +#include "rocksdb/slice.h" #include "rocksdb/write_batch.h" namespace rocksdb { @@ -57,14 +56,12 @@ class WBWIIterator { // A user can call NewIterator() to create an iterator. class WriteBatchWithIndex { public: - // backup_index_comparator: the backup comparator used to compare keys - // within the same column family, if column family is not given in the - // interface, or we can't find a column family from the column family handle - // passed in, backup_index_comparator will be used for the column family. + // index_comparator indicates the order when iterating data in the write + // batch. Technically, it doesn't have to be the same as the one used in + // the DB. // reserved_bytes: reserved bytes in underlying WriteBatch - explicit WriteBatchWithIndex( - const Comparator* backup_index_comparator = BytewiseComparator(), - size_t reserved_bytes = 0); + explicit WriteBatchWithIndex(const Comparator* index_comparator, + size_t reserved_bytes = 0); virtual ~WriteBatchWithIndex(); WriteBatch* GetWriteBatch(); diff --git a/include/rocksdb/version.h b/include/rocksdb/version.h index d6ccaeda59..4109868431 100644 --- a/include/rocksdb/version.h +++ b/include/rocksdb/version.h @@ -7,7 +7,7 @@ // Also update Makefile if you change these #define ROCKSDB_MAJOR 3 #define ROCKSDB_MINOR 5 -#define ROCKSDB_PATCH 0 +#define ROCKSDB_PATCH 1 // Do not use these. We made the mistake of declaring macros starting with // double underscore. Now we have to live with our choice. 
We'll deprecate these diff --git a/java/Makefile b/java/Makefile index b2f3674f06..47b2afb9e2 100644 --- a/java/Makefile +++ b/java/Makefile @@ -1,4 +1,4 @@ -NATIVE_JAVA_CLASSES = org.rocksdb.RocksDB org.rocksdb.Options org.rocksdb.WriteBatch org.rocksdb.WriteBatchInternal org.rocksdb.WriteBatchTest org.rocksdb.WriteOptions org.rocksdb.BackupableDB org.rocksdb.BackupableDBOptions org.rocksdb.Statistics org.rocksdb.RocksIterator org.rocksdb.VectorMemTableConfig org.rocksdb.SkipListMemTableConfig org.rocksdb.HashLinkedListMemTableConfig org.rocksdb.HashSkipListMemTableConfig org.rocksdb.PlainTableConfig org.rocksdb.BlockBasedTableConfig org.rocksdb.ReadOptions org.rocksdb.Filter org.rocksdb.BloomFilter org.rocksdb.RestoreOptions org.rocksdb.RestoreBackupableDB org.rocksdb.RocksEnv org.rocksdb.GenericRateLimiterConfig +NATIVE_JAVA_CLASSES = org.rocksdb.RocksDB org.rocksdb.Options org.rocksdb.WriteBatch org.rocksdb.WriteBatchInternal org.rocksdb.WriteBatchTest org.rocksdb.WriteOptions org.rocksdb.BackupableDB org.rocksdb.BackupableDBOptions org.rocksdb.Statistics org.rocksdb.RocksIterator org.rocksdb.VectorMemTableConfig org.rocksdb.SkipListMemTableConfig org.rocksdb.HashLinkedListMemTableConfig org.rocksdb.HashSkipListMemTableConfig org.rocksdb.PlainTableConfig org.rocksdb.BlockBasedTableConfig org.rocksdb.ReadOptions org.rocksdb.Filter org.rocksdb.BloomFilter org.rocksdb.RestoreOptions org.rocksdb.RestoreBackupableDB org.rocksdb.RocksEnv NATIVE_INCLUDE = ./include ROCKSDB_JAR = rocksdbjni.jar diff --git a/java/RocksDBSample.java b/java/RocksDBSample.java index bd5a85076c..72da4b5e86 100644 --- a/java/RocksDBSample.java +++ b/java/RocksDBSample.java @@ -72,34 +72,14 @@ public class RocksDBSample { assert(options.memTableFactoryName().equals("SkipListFactory")); options.setTableFormatConfig(new PlainTableConfig()); - // Plain-Table requires mmap read - options.setAllowMmapReads(true); assert(options.tableFactoryName().equals("PlainTable")); - - 
options.setRateLimiterConfig(new GenericRateLimiterConfig(10000000, - 10000, 10)); - options.setRateLimiterConfig(new GenericRateLimiterConfig(10000000)); BlockBasedTableConfig table_options = new BlockBasedTableConfig(); table_options.setBlockCacheSize(64 * SizeUnit.KB) .setFilterBitsPerKey(10) - .setCacheNumShardBits(6) - .setBlockSizeDeviation(5) - .setBlockRestartInterval(10) - .setCacheIndexAndFilterBlocks(true) - .setHashIndexAllowCollision(false) - .setBlockCacheCompressedSize(64 * SizeUnit.KB) - .setBlockCacheCompressedNumShardBits(10); - + .setCacheNumShardBits(6); assert(table_options.blockCacheSize() == 64 * SizeUnit.KB); assert(table_options.cacheNumShardBits() == 6); - assert(table_options.blockSizeDeviation() == 5); - assert(table_options.blockRestartInterval() == 10); - assert(table_options.cacheIndexAndFilterBlocks() == true); - assert(table_options.hashIndexAllowCollision() == false); - assert(table_options.blockCacheCompressedSize() == 64 * SizeUnit.KB); - assert(table_options.blockCacheCompressedNumShardBits() == 10); - options.setTableFormatConfig(table_options); assert(options.tableFactoryName().equals("BlockBasedTable")); @@ -108,8 +88,6 @@ public class RocksDBSample { db.put("hello".getBytes(), "world".getBytes()); byte[] value = db.get("hello".getBytes()); assert("world".equals(new String(value))); - String str = db.getProperty("rocksdb.stats"); - assert(str != null && str != ""); } catch (RocksDBException e) { System.out.format("[ERROR] caught the unexpceted exception -- %s\n", e); assert(db == null); @@ -143,29 +121,6 @@ public class RocksDBSample { System.out.println(""); } - // write batch test - WriteOptions writeOpt = new WriteOptions(); - for (int i = 10; i <= 19; ++i) { - WriteBatch batch = new WriteBatch(); - for (int j = 10; j <= 19; ++j) { - batch.put(String.format("%dx%d", i, j).getBytes(), - String.format("%d", i * j).getBytes()); - } - db.write(writeOpt, batch); - batch.dispose(); - } - for (int i = 10; i <= 19; ++i) { - for 
(int j = 10; j <= 19; ++j) { - assert(new String( - db.get(String.format("%dx%d", i, j).getBytes())).equals( - String.format("%d", i * j))); - System.out.format("%s ", new String(db.get( - String.format("%dx%d", i, j).getBytes()))); - } - System.out.println(""); - } - writeOpt.dispose(); - value = db.get("1x1".getBytes()); assert(value != null); value = db.get("world".getBytes()); diff --git a/java/org/rocksdb/BlockBasedTableConfig.java b/java/org/rocksdb/BlockBasedTableConfig.java index bdb27d6c2d..523a576911 100644 --- a/java/org/rocksdb/BlockBasedTableConfig.java +++ b/java/org/rocksdb/BlockBasedTableConfig.java @@ -14,14 +14,11 @@ public class BlockBasedTableConfig extends TableFormatConfig { public BlockBasedTableConfig() { noBlockCache_ = false; blockCacheSize_ = 8 * 1024 * 1024; - blockSize_ = 4 * 1024; - blockSizeDeviation_ = 10; - blockRestartInterval_ = 16; + blockSize_ = 4 * 1024; + blockSizeDeviation_ =10; + blockRestartInterval_ =16; wholeKeyFiltering_ = true; - bitsPerKey_ = 10; - cacheIndexAndFilterBlocks_ = false; - hashIndexAllowCollision_ = true; - blockCacheCompressedSize_ = 0; + bitsPerKey_ = 0; } /** @@ -74,8 +71,8 @@ public class BlockBasedTableConfig extends TableFormatConfig { * number means use default settings." * @return the reference to the current option. */ - public BlockBasedTableConfig setCacheNumShardBits(int blockCacheNumShardBits) { - blockCacheNumShardBits_ = blockCacheNumShardBits; + public BlockBasedTableConfig setCacheNumShardBits(int numShardBits) { + numShardBits_ = numShardBits; return this; } @@ -87,7 +84,7 @@ public class BlockBasedTableConfig extends TableFormatConfig { * @return the number of shard bits used in the block cache. 
*/ public int cacheNumShardBits() { - return blockCacheNumShardBits_; + return numShardBits_; } /** @@ -189,135 +186,25 @@ public class BlockBasedTableConfig extends TableFormatConfig { bitsPerKey_ = bitsPerKey; return this; } - - /** - * Indicating if we'd put index/filter blocks to the block cache. - If not specified, each "table reader" object will pre-load index/filter - block during table initialization. - * - * @return if index and filter blocks should be put in block cache. - */ - public boolean cacheIndexAndFilterBlocks() { - return cacheIndexAndFilterBlocks_; - } - - /** - * Indicating if we'd put index/filter blocks to the block cache. - If not specified, each "table reader" object will pre-load index/filter - block during table initialization. - * - * @param index and filter blocks should be put in block cache. - * @return the reference to the current config. - */ - public BlockBasedTableConfig setCacheIndexAndFilterBlocks( - boolean cacheIndexAndFilterBlocks) { - cacheIndexAndFilterBlocks_ = cacheIndexAndFilterBlocks; - return this; - } - - /** - * Influence the behavior when kHashSearch is used. - if false, stores a precise prefix to block range mapping - if true, does not store prefix and allows prefix hash collision - (less memory consumption) - * - * @return if hash collisions should be allowed. - */ - public boolean hashIndexAllowCollision() { - return hashIndexAllowCollision_; - } - - /** - * Influence the behavior when kHashSearch is used. - if false, stores a precise prefix to block range mapping - if true, does not store prefix and allows prefix hash collision - (less memory consumption) - * - * @param if hash collisions should be allowed. - * @return the reference to the current config. - */ - public BlockBasedTableConfig setHashIndexAllowCollision( - boolean hashIndexAllowCollision) { - hashIndexAllowCollision_ = hashIndexAllowCollision; - return this; - } - - /** - * Size of compressed block cache. 
If 0, then block_cache_compressed is set - * to null. - * - * @return size of compressed block cache. - */ - public long blockCacheCompressedSize() { - return blockCacheCompressedSize_; - } - - /** - * Size of compressed block cache. If 0, then block_cache_compressed is set - * to null. - * - * @param size of compressed block cache. - * @return the reference to the current config. - */ - public BlockBasedTableConfig setBlockCacheCompressedSize( - long blockCacheCompressedSize) { - blockCacheCompressedSize_ = blockCacheCompressedSize; - return this; - } - - /** - * Controls the number of shards for the block compressed cache. - * This is applied only if blockCompressedCacheSize is set to non-negative. - * - * @return numShardBits the number of shard bits. The resulting - * number of shards would be 2 ^ numShardBits. Any negative - * number means use default settings. - */ - public int blockCacheCompressedNumShardBits() { - return blockCacheCompressedNumShardBits_; - } - - /** - * Controls the number of shards for the block compressed cache. - * This is applied only if blockCompressedCacheSize is set to non-negative. - * - * @param numShardBits the number of shard bits. The resulting - * number of shards would be 2 ^ numShardBits. Any negative - * number means use default settings." - * @return the reference to the current option. 
- */ - public BlockBasedTableConfig setBlockCacheCompressedNumShardBits( - int blockCacheCompressedNumShardBits) { - blockCacheCompressedNumShardBits_ = blockCacheCompressedNumShardBits; - return this; - } @Override protected long newTableFactoryHandle() { - return newTableFactoryHandle(noBlockCache_, blockCacheSize_, - blockCacheNumShardBits_, blockSize_, blockSizeDeviation_, - blockRestartInterval_, wholeKeyFiltering_, bitsPerKey_, - cacheIndexAndFilterBlocks_, hashIndexAllowCollision_, - blockCacheCompressedSize_, blockCacheCompressedNumShardBits_); + return newTableFactoryHandle(noBlockCache_, blockCacheSize_, numShardBits_, + blockSize_, blockSizeDeviation_, blockRestartInterval_, + wholeKeyFiltering_, bitsPerKey_); } private native long newTableFactoryHandle( - boolean noBlockCache, long blockCacheSize, int blockCacheNumShardBits, + boolean noBlockCache, long blockCacheSize, int numShardbits, long blockSize, int blockSizeDeviation, int blockRestartInterval, - boolean wholeKeyFiltering, int bitsPerKey, - boolean cacheIndexAndFilterBlocks, boolean hashIndexAllowCollision, - long blockCacheCompressedSize, int blockCacheCompressedNumShardBits); + boolean wholeKeyFiltering, int bitsPerKey); private boolean noBlockCache_; private long blockCacheSize_; - private int blockCacheNumShardBits_; + private int numShardBits_; private long shard; private long blockSize_; private int blockSizeDeviation_; private int blockRestartInterval_; private boolean wholeKeyFiltering_; private int bitsPerKey_; - private boolean cacheIndexAndFilterBlocks_; - private boolean hashIndexAllowCollision_; - private long blockCacheCompressedSize_; - private int blockCacheCompressedNumShardBits_; } diff --git a/java/org/rocksdb/GenericRateLimiterConfig.java b/java/org/rocksdb/GenericRateLimiterConfig.java deleted file mode 100644 index 78b8b37ec6..0000000000 --- a/java/org/rocksdb/GenericRateLimiterConfig.java +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright (c) 2014, Facebook, Inc. 
All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. -package org.rocksdb; - -/** - * Config for rate limiter, which is used to control write rate of flush and - * compaction. - */ -public class GenericRateLimiterConfig extends RateLimiterConfig { - private static final long DEFAULT_REFILL_PERIOD_MICROS = (100 * 1000); - private static final int DEFAULT_FAIRNESS = 10; - - public GenericRateLimiterConfig(long rateBytesPerSecond, - long refillPeriodMicros, int fairness) { - rateBytesPerSecond_ = rateBytesPerSecond; - refillPeriodMicros_ = refillPeriodMicros; - fairness_ = fairness; - } - - public GenericRateLimiterConfig(long rateBytesPerSecond) { - this(rateBytesPerSecond, DEFAULT_REFILL_PERIOD_MICROS, DEFAULT_FAIRNESS); - } - - @Override protected long newRateLimiterHandle() { - return newRateLimiterHandle(rateBytesPerSecond_, refillPeriodMicros_, - fairness_); - } - - private native long newRateLimiterHandle(long rateBytesPerSecond, - long refillPeriodMicros, int fairness); - private final long rateBytesPerSecond_; - private final long refillPeriodMicros_; - private final int fairness_; -} diff --git a/java/org/rocksdb/NativeLibraryLoader.java b/java/org/rocksdb/NativeLibraryLoader.java deleted file mode 100644 index 4400565821..0000000000 --- a/java/org/rocksdb/NativeLibraryLoader.java +++ /dev/null @@ -1,58 +0,0 @@ -package org.rocksdb; - -import java.io.*; - - -/** - * This class is used to load the RocksDB shared library from within the jar. - * The shared library is extracted to a temp folder and loaded from there. 
- */ -public class NativeLibraryLoader { - private static String sharedLibraryName = "librocksdbjni.so"; - private static String tempFilePrefix = "librocksdbjni"; - private static String tempFileSuffix = ".so"; - - public static void loadLibraryFromJar(String tmpDir) - throws IOException { - File temp; - if(tmpDir == null || tmpDir.equals("")) - temp = File.createTempFile(tempFilePrefix, tempFileSuffix); - else - temp = new File(tmpDir + "/" + sharedLibraryName); - - temp.deleteOnExit(); - - if (!temp.exists()) { - throw new RuntimeException("File " + temp.getAbsolutePath() + " does not exist."); - } - - byte[] buffer = new byte[102400]; - int readBytes; - - InputStream is = ClassLoader.getSystemClassLoader().getResourceAsStream(sharedLibraryName); - if (is == null) { - throw new RuntimeException(sharedLibraryName + " was not found inside JAR."); - } - - OutputStream os = null; - try { - os = new FileOutputStream(temp); - while ((readBytes = is.read(buffer)) != -1) { - os.write(buffer, 0, readBytes); - } - } finally { - if(os != null) - os.close(); - - if(is != null) - is.close(); - } - - System.load(temp.getAbsolutePath()); - } - /** - * Private constructor to disallow instantiation - */ - private NativeLibraryLoader() { - } -} diff --git a/java/org/rocksdb/Options.java b/java/org/rocksdb/Options.java index 7ccc748349..125f06afdf 100644 --- a/java/org/rocksdb/Options.java +++ b/java/org/rocksdb/Options.java @@ -13,19 +13,8 @@ package org.rocksdb; * native resources will be released as part of the process. */ public class Options extends RocksObject { - static { - RocksDB.loadLibrary(); - } static final long DEFAULT_CACHE_SIZE = 8 << 20; static final int DEFAULT_NUM_SHARD_BITS = -1; - - /** - * Builtin RocksDB comparators - */ - public enum BuiltinComparator { - BYTEWISE_COMPARATOR, REVERSE_BYTEWISE_COMPARATOR; - } - /** * Construct options for opening a RocksDB. 
* @@ -86,21 +75,6 @@ public class Options extends RocksObject { return createIfMissing(nativeHandle_); } - /** - * Set BuiltinComparator to be used with RocksDB. - * - * Note: Comparator can be set once upon database creation. - * - * Default: BytewiseComparator. - * @param builtinComparator a BuiltinComparator type. - */ - public void setBuiltinComparator(BuiltinComparator builtinComparator) { - assert(isInitialized()); - setBuiltinComparator(nativeHandle_, builtinComparator.ordinal()); - } - - private native void setBuiltinComparator(long handle, int builtinComparator); - /** * Amount of data to build up in memory (backed by an unsorted log * on disk) before converting to a sorted on-disk file. @@ -331,6 +305,40 @@ public class Options extends RocksObject { } private native void setUseFsync(long handle, boolean useFsync); + /** + * The time interval in seconds between each two consecutive stats logs. + * This number controls how often a new scribe log about + * db deploy stats is written out. + * -1 indicates no logging at all. + * + * @return the time interval in seconds between each two consecutive + * stats logs. + */ + public int dbStatsLogInterval() { + assert(isInitialized()); + return dbStatsLogInterval(nativeHandle_); + } + private native int dbStatsLogInterval(long handle); + + /** + * The time interval in seconds between each two consecutive stats logs. + * This number controls how often a new scribe log about + * db deploy stats is written out. + * -1 indicates no logging at all. + * Default value is 1800 (half an hour). + * + * @param dbStatsLogInterval the time interval in seconds between each + * two consecutive stats logs. + * @return the reference to the current option. 
+ */ + public Options setDbStatsLogInterval(int dbStatsLogInterval) { + assert(isInitialized()); + setDbStatsLogInterval(nativeHandle_, dbStatsLogInterval); + return this; + } + private native void setDbStatsLogInterval( + long handle, int dbStatsLogInterval); + /** * Returns the directory of info log. * @@ -1093,6 +1101,33 @@ public class Options extends RocksObject { private native void setBytesPerSync( long handle, long bytesPerSync); + /** + * Allow RocksDB to use thread local storage to optimize performance. + * Default: true + * + * @return true if thread-local storage is allowed + */ + public boolean allowThreadLocal() { + assert(isInitialized()); + return allowThreadLocal(nativeHandle_); + } + private native boolean allowThreadLocal(long handle); + + /** + * Allow RocksDB to use thread local storage to optimize performance. + * Default: true + * + * @param allowThreadLocal true if thread-local storage is allowed. + * @return the reference to the current option. + */ + public Options setAllowThreadLocal(boolean allowThreadLocal) { + assert(isInitialized()); + setAllowThreadLocal(nativeHandle_, allowThreadLocal); + return this; + } + private native void setAllowThreadLocal( + long handle, boolean allowThreadLocal); + /** * Set the config for mem-table. * @@ -1103,19 +1138,6 @@ public class Options extends RocksObject { setMemTableFactory(nativeHandle_, config.newMemTableFactoryHandle()); return this; } - - /** - * Use to control write rate of flush and compaction. Flush has higher - * priority than compaction. Rate limiting is disabled if nullptr. - * Default: nullptr - * - * @param config rate limiter config. - * @return the instance of the current Options. - */ - public Options setRateLimiterConfig(RateLimiterConfig config) { - setRateLimiter(nativeHandle_, config.newRateLimiterHandle()); - return this; - } /** * Returns the name of the current mem table representation. 
@@ -2204,8 +2226,6 @@ public class Options extends RocksObject { private native long statisticsPtr(long optHandle); private native void setMemTableFactory(long handle, long factoryHandle); - private native void setRateLimiter(long handle, - long rateLimiterHandle); private native String memTableFactoryName(long handle); private native void setTableFactory(long handle, long factoryHandle); diff --git a/java/org/rocksdb/RateLimiterConfig.java b/java/org/rocksdb/RateLimiterConfig.java deleted file mode 100644 index 22de659219..0000000000 --- a/java/org/rocksdb/RateLimiterConfig.java +++ /dev/null @@ -1,20 +0,0 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. -package org.rocksdb; - -/** - * Config for rate limiter, which is used to control write rate of flush and - * compaction. - */ -public abstract class RateLimiterConfig { - /** - * This function should only be called by Options.setRateLimiter(), - * which will create a c++ shared-pointer to the c++ RateLimiter - * that is associated with the Java RateLimtierConifg. - * - * @see Options.setRateLimiter() - */ - abstract protected long newRateLimiterHandle(); -} diff --git a/java/org/rocksdb/RocksDB.java b/java/org/rocksdb/RocksDB.java index a165865514..f8968d14d1 100644 --- a/java/org/rocksdb/RocksDB.java +++ b/java/org/rocksdb/RocksDB.java @@ -11,7 +11,6 @@ import java.util.HashMap; import java.io.Closeable; import java.io.IOException; import org.rocksdb.util.Environment; -import org.rocksdb.NativeLibraryLoader; /** * A RocksDB is a persistent ordered map from keys to values. 
It is safe for @@ -24,19 +23,11 @@ public class RocksDB extends RocksObject { private static final String[] compressionLibs_ = { "snappy", "z", "bzip2", "lz4", "lz4hc"}; - static { - RocksDB.loadLibrary(); - } - /** * Loads the necessary library files. * Calling this method twice will have no effect. - * By default the method extracts the shared library for loading at - * java.io.tmpdir, however, you can override this temporary location by - * setting the environment variable ROCKSDB_SHAREDLIB_DIR. */ public static synchronized void loadLibrary() { - String tmpDir = System.getenv("ROCKSDB_SHAREDLIB_DIR"); // loading possibly necessary libraries. for (String lib : compressionLibs_) { try { @@ -45,14 +36,8 @@ public class RocksDB extends RocksObject { // since it may be optional, we ignore its loading failure here. } } - try - { - NativeLibraryLoader.loadLibraryFromJar(tmpDir); - } - catch (IOException e) - { - throw new RuntimeException("Unable to load the RocksDB shared library" + e); - } + // However, if any of them is required. We will see error here. + System.loadLibrary("rocksdbjni"); } /** @@ -324,26 +309,6 @@ public class RocksDB extends RocksObject { throws RocksDBException { remove(nativeHandle_, writeOpt.nativeHandle_, key, key.length); } - - /** - * DB implementations can export properties about their state - via this method. If "property" is a valid property understood by this - DB implementation, fills "*value" with its current value and returns - true. Otherwise returns false. - - - Valid property names include: - - "rocksdb.num-files-at-level" - return the number of files at level , - where is an ASCII representation of a level number (e.g. "0"). - "rocksdb.stats" - returns a multi-line string that describes statistics - about the internal operation of the DB. - "rocksdb.sstables" - returns a multi-line string that describes all - of the sstables that make up the db contents. 
- */ - public String getProperty(String property) throws RocksDBException { - return getProperty0(nativeHandle_, property, property.length()); - } /** * Return a heap-allocated iterator over the contents of the database. @@ -398,8 +363,6 @@ public class RocksDB extends RocksObject { protected native void remove( long handle, long writeOptHandle, byte[] key, int keyLen) throws RocksDBException; - protected native String getProperty0(long nativeHandle, - String property, int propertyLength) throws RocksDBException; protected native long iterator0(long optHandle); private native void disposeInternal(long handle); diff --git a/java/org/rocksdb/benchmark/DbBenchmark.java b/java/org/rocksdb/benchmark/DbBenchmark.java index 686d39445f..b715f9af15 100644 --- a/java/org/rocksdb/benchmark/DbBenchmark.java +++ b/java/org/rocksdb/benchmark/DbBenchmark.java @@ -255,7 +255,7 @@ public class DbBenchmark { for (long j = 0; j < entriesPerBatch_; j++) { getKey(key, i + j, keyRange_); DbBenchmark.this.gen_.generate(value); - batch.put(key, value); + db_.put(writeOpt_, key, value); stats_.finishedSingleOp(keySize_ + valueSize_); } db_.write(writeOpt_, batch); diff --git a/java/org/rocksdb/test/OptionsTest.java b/java/org/rocksdb/test/OptionsTest.java index d3abb48cd5..b065c9023d 100644 --- a/java/org/rocksdb/test/OptionsTest.java +++ b/java/org/rocksdb/test/OptionsTest.java @@ -52,6 +52,12 @@ public class OptionsTest { assert(opt.useFsync() == boolValue); } + { // DbStatsLogInterval test + int intValue = rand.nextInt(); + opt.setDbStatsLogInterval(intValue); + assert(opt.dbStatsLogInterval() == intValue); + } + { // DbLogDir test String str = "path/to/DbLogDir"; opt.setDbLogDir(str); @@ -184,6 +190,12 @@ public class OptionsTest { assert(opt.bytesPerSync() == longValue); } + { // AllowThreadLocal test + boolean boolValue = rand.nextBoolean(); + opt.setAllowThreadLocal(boolValue); + assert(opt.allowThreadLocal() == boolValue); + } + { // WriteBufferSize test long longValue = 
rand.nextLong(); opt.setWriteBufferSize(longValue); diff --git a/java/rocksjni/memtablejni.cc b/java/rocksjni/memtablejni.cc index 9b0dc252c0..a0d50f5f5e 100644 --- a/java/rocksjni/memtablejni.cc +++ b/java/rocksjni/memtablejni.cc @@ -5,7 +5,6 @@ // // This file implements the "bridge" between Java and C++ for MemTables. -#include "rocksjni/portal.h" #include "include/org_rocksdb_HashSkipListMemTableConfig.h" #include "include/org_rocksdb_HashLinkedListMemTableConfig.h" #include "include/org_rocksdb_VectorMemTableConfig.h" @@ -21,7 +20,7 @@ jlong Java_org_rocksdb_HashSkipListMemTableConfig_newMemTableFactoryHandle( JNIEnv* env, jobject jobj, jlong jbucket_count, jint jheight, jint jbranching_factor) { return reinterpret_cast(rocksdb::NewHashSkipListRepFactory( - rocksdb::jlong_to_size_t(jbucket_count), + static_cast(jbucket_count), static_cast(jheight), static_cast(jbranching_factor))); } @@ -34,7 +33,7 @@ jlong Java_org_rocksdb_HashSkipListMemTableConfig_newMemTableFactoryHandle( jlong Java_org_rocksdb_HashLinkedListMemTableConfig_newMemTableFactoryHandle( JNIEnv* env, jobject jobj, jlong jbucket_count) { return reinterpret_cast(rocksdb::NewHashLinkListRepFactory( - rocksdb::jlong_to_size_t(jbucket_count))); + static_cast(jbucket_count))); } /* @@ -45,7 +44,7 @@ jlong Java_org_rocksdb_HashLinkedListMemTableConfig_newMemTableFactoryHandle( jlong Java_org_rocksdb_VectorMemTableConfig_newMemTableFactoryHandle( JNIEnv* env, jobject jobj, jlong jreserved_size) { return reinterpret_cast(new rocksdb::VectorRepFactory( - rocksdb::jlong_to_size_t(jreserved_size))); + static_cast(jreserved_size))); } /* diff --git a/java/rocksjni/options.cc b/java/rocksjni/options.cc index 50416ef810..da420c78f2 100644 --- a/java/rocksjni/options.cc +++ b/java/rocksjni/options.cc @@ -21,8 +21,6 @@ #include "rocksdb/memtablerep.h" #include "rocksdb/table.h" #include "rocksdb/slice_transform.h" -#include "rocksdb/rate_limiter.h" -#include "rocksdb/comparator.h" /* * Class: org_rocksdb_Options 
@@ -64,23 +62,6 @@ jboolean Java_org_rocksdb_Options_createIfMissing( return reinterpret_cast(jhandle)->create_if_missing; } -/* - * Class: org_rocksdb_Options - * Method: useReverseBytewiseComparator - * Signature: (JI)V - */ -void Java_org_rocksdb_Options_setBuiltinComparator( - JNIEnv* env, jobject jobj, jlong jhandle, jint builtinComparator) { - switch (builtinComparator){ - case 1: - reinterpret_cast(jhandle)->comparator = rocksdb::ReverseBytewiseComparator(); - break; - default: - reinterpret_cast(jhandle)->comparator = rocksdb::BytewiseComparator(); - break; - } -} - /* * Class: org_rocksdb_Options * Method: setWriteBufferSize @@ -89,7 +70,7 @@ void Java_org_rocksdb_Options_setBuiltinComparator( void Java_org_rocksdb_Options_setWriteBufferSize( JNIEnv* env, jobject jobj, jlong jhandle, jlong jwrite_buffer_size) { reinterpret_cast(jhandle)->write_buffer_size = - rocksdb::jlong_to_size_t(jwrite_buffer_size); + static_cast(jwrite_buffer_size); } @@ -381,7 +362,7 @@ jlong Java_org_rocksdb_Options_maxLogFileSize( void Java_org_rocksdb_Options_setMaxLogFileSize( JNIEnv* env, jobject jobj, jlong jhandle, jlong max_log_file_size) { reinterpret_cast(jhandle)->max_log_file_size = - rocksdb::jlong_to_size_t(max_log_file_size); + static_cast(max_log_file_size); } /* @@ -402,7 +383,7 @@ jlong Java_org_rocksdb_Options_logFileTimeToRoll( void Java_org_rocksdb_Options_setLogFileTimeToRoll( JNIEnv* env, jobject jobj, jlong jhandle, jlong log_file_time_to_roll) { reinterpret_cast(jhandle)->log_file_time_to_roll = - rocksdb::jlong_to_size_t(log_file_time_to_roll); + static_cast(log_file_time_to_roll); } /* @@ -423,7 +404,7 @@ jlong Java_org_rocksdb_Options_keepLogFileNum( void Java_org_rocksdb_Options_setKeepLogFileNum( JNIEnv* env, jobject jobj, jlong jhandle, jlong keep_log_file_num) { reinterpret_cast(jhandle)->keep_log_file_num = - rocksdb::jlong_to_size_t(keep_log_file_num); + static_cast(keep_log_file_num); } /* @@ -478,17 +459,6 @@ void 
Java_org_rocksdb_Options_setMemTableFactory( reinterpret_cast(jfactory_handle)); } -/* - * Class: org_rocksdb_Options - * Method: setRateLimiter - * Signature: (JJ)V - */ -void Java_org_rocksdb_Options_setRateLimiter( - JNIEnv* env, jobject jobj, jlong jhandle, jlong jrate_limiter_handle) { - reinterpret_cast(jhandle)->rate_limiter.reset( - reinterpret_cast(jrate_limiter_handle)); -} - /* * Class: org_rocksdb_Options * Method: tableCacheNumshardbits @@ -539,8 +509,7 @@ void Java_org_rocksdb_Options_setTableCacheRemoveScanCountLimit( void Java_org_rocksdb_Options_useFixedLengthPrefixExtractor( JNIEnv* env, jobject jobj, jlong jhandle, jint jprefix_length) { reinterpret_cast(jhandle)->prefix_extractor.reset( - rocksdb::NewFixedPrefixTransform( - rocksdb::jlong_to_size_t(jprefix_length))); + rocksdb::NewFixedPrefixTransform(static_cast(jprefix_length))); } /* @@ -604,7 +573,7 @@ jlong Java_org_rocksdb_Options_manifestPreallocationSize( void Java_org_rocksdb_Options_setManifestPreallocationSize( JNIEnv* env, jobject jobj, jlong jhandle, jlong preallocation_size) { reinterpret_cast(jhandle)->manifest_preallocation_size = - rocksdb::jlong_to_size_t(preallocation_size); + static_cast(preallocation_size); } /* @@ -807,6 +776,27 @@ void Java_org_rocksdb_Options_setBytesPerSync( static_cast(bytes_per_sync); } +/* + * Class: org_rocksdb_Options + * Method: allowThreadLocal + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_Options_allowThreadLocal( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)->allow_thread_local; +} + +/* + * Class: org_rocksdb_Options + * Method: setAllowThreadLocal + * Signature: (JZ)V + */ +void Java_org_rocksdb_Options_setAllowThreadLocal( + JNIEnv* env, jobject jobj, jlong jhandle, jboolean allow_thread_local) { + reinterpret_cast(jhandle)->allow_thread_local = + static_cast(allow_thread_local); +} + /* * Method: tableFactoryName * Signature: (J)Ljava/lang/String @@ -1255,7 +1245,7 @@ jlong 
Java_org_rocksdb_Options_arenaBlockSize( void Java_org_rocksdb_Options_setArenaBlockSize( JNIEnv* env, jobject jobj, jlong jhandle, jlong jarena_block_size) { reinterpret_cast(jhandle)->arena_block_size = - rocksdb::jlong_to_size_t(jarena_block_size); + static_cast(jarena_block_size); } /* @@ -1420,7 +1410,7 @@ void Java_org_rocksdb_Options_setInplaceUpdateNumLocks( jlong jinplace_update_num_locks) { reinterpret_cast( jhandle)->inplace_update_num_locks = - rocksdb::jlong_to_size_t(jinplace_update_num_locks); + static_cast(jinplace_update_num_locks); } /* @@ -1511,7 +1501,7 @@ void Java_org_rocksdb_Options_setMaxSuccessiveMerges( JNIEnv* env, jobject jobj, jlong jhandle, jlong jmax_successive_merges) { reinterpret_cast(jhandle)->max_successive_merges = - rocksdb::jlong_to_size_t(jmax_successive_merges); + static_cast(jmax_successive_merges); } /* diff --git a/java/rocksjni/portal.h b/java/rocksjni/portal.h index 4c7a8b9b9b..28fe754f0f 100644 --- a/java/rocksjni/portal.h +++ b/java/rocksjni/portal.h @@ -11,19 +11,12 @@ #define JAVA_ROCKSJNI_PORTAL_H_ #include -#include #include "rocksdb/db.h" #include "rocksdb/filter_policy.h" #include "rocksdb/utilities/backupable_db.h" namespace rocksdb { -inline size_t jlong_to_size_t(const jlong& jvalue) { - return static_cast(jvalue) <= - static_cast(std::numeric_limits::max()) ? - static_cast(jvalue) : std::numeric_limits::max(); -} - // The portal class for org.rocksdb.RocksDB class RocksDBJni { public: diff --git a/java/rocksjni/ratelimiterjni.cc b/java/rocksjni/ratelimiterjni.cc deleted file mode 100644 index 5413978a00..0000000000 --- a/java/rocksjni/ratelimiterjni.cc +++ /dev/null @@ -1,24 +0,0 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. 
-// -// This file implements the "bridge" between Java and C++ for RateLimiter. - -#include "rocksjni/portal.h" -#include "include/org_rocksdb_GenericRateLimiterConfig.h" -#include "rocksdb/rate_limiter.h" - -/* - * Class: org_rocksdb_GenericRateLimiterConfig - * Method: newRateLimiterHandle - * Signature: (JJI)J - */ -jlong Java_org_rocksdb_GenericRateLimiterConfig_newRateLimiterHandle( - JNIEnv* env, jobject jobj, jlong jrate_bytes_per_second, - jlong jrefill_period_micros, jint jfairness) { - return reinterpret_cast(rocksdb::NewGenericRateLimiter( - rocksdb::jlong_to_size_t(jrate_bytes_per_second), - rocksdb::jlong_to_size_t(jrefill_period_micros), - static_cast(jfairness))); -} diff --git a/java/rocksjni/rocksjni.cc b/java/rocksjni/rocksjni.cc index f1b9cc758a..f55290f649 100644 --- a/java/rocksjni/rocksjni.cc +++ b/java/rocksjni/rocksjni.cc @@ -425,27 +425,3 @@ jlong Java_org_rocksdb_RocksDB_iterator0( rocksdb::Iterator* iterator = db->NewIterator(rocksdb::ReadOptions()); return reinterpret_cast(iterator); } - -/* - * Class: org_rocksdb_RocksDB - * Method: getProperty0 - * Signature: (JLjava/lang/String;I)Ljava/lang/String; - */ -jstring Java_org_rocksdb_RocksDB_getProperty0( - JNIEnv* env, jobject jdb, jlong db_handle, jstring jproperty, - jint jproperty_len) { - auto db = reinterpret_cast(db_handle); - - const char* property = env->GetStringUTFChars(jproperty, 0); - rocksdb::Slice property_slice(property, jproperty_len); - - std::string property_value; - bool retCode = db->GetProperty(property_slice, &property_value); - env->ReleaseStringUTFChars(jproperty, property); - - if (!retCode) { - rocksdb::RocksDBExceptionJni::ThrowNew(env, rocksdb::Status::NotFound()); - } - - return env->NewStringUTF(property_value.data()); -} diff --git a/java/rocksjni/table.cc b/java/rocksjni/table.cc index 500cb255e4..ffda1a2ba4 100644 --- a/java/rocksjni/table.cc +++ b/java/rocksjni/table.cc @@ -31,22 +31,20 @@ jlong Java_org_rocksdb_PlainTableConfig_newTableFactoryHandle( /* 
* Class: org_rocksdb_BlockBasedTableConfig * Method: newTableFactoryHandle - * Signature: (ZJIJIIZIZZJI)J + * Signature: (ZJIJIIZI)J */ jlong Java_org_rocksdb_BlockBasedTableConfig_newTableFactoryHandle( JNIEnv* env, jobject jobj, jboolean no_block_cache, jlong block_cache_size, - jint block_cache_num_shardbits, jlong block_size, jint block_size_deviation, + jint num_shardbits, jlong block_size, jint block_size_deviation, jint block_restart_interval, jboolean whole_key_filtering, - jint bits_per_key, jboolean cache_index_and_filter_blocks, - jboolean hash_index_allow_collision, jlong block_cache_compressed_size, - jint block_cache_compressd_num_shard_bits) { + jint bits_per_key) { rocksdb::BlockBasedTableOptions options; options.no_block_cache = no_block_cache; if (!no_block_cache && block_cache_size > 0) { - if (block_cache_num_shardbits > 0) { + if (num_shardbits > 0) { options.block_cache = - rocksdb::NewLRUCache(block_cache_size, block_cache_num_shardbits); + rocksdb::NewLRUCache(block_cache_size, num_shardbits); } else { options.block_cache = rocksdb::NewLRUCache(block_cache_size); } @@ -58,17 +56,5 @@ jlong Java_org_rocksdb_BlockBasedTableConfig_newTableFactoryHandle( if (bits_per_key > 0) { options.filter_policy.reset(rocksdb::NewBloomFilterPolicy(bits_per_key)); } - options.cache_index_and_filter_blocks = cache_index_and_filter_blocks; - options.hash_index_allow_collision = hash_index_allow_collision; - if (block_cache_compressed_size > 0) { - if (block_cache_compressd_num_shard_bits > 0) { - options.block_cache = - rocksdb::NewLRUCache(block_cache_compressed_size, - block_cache_compressd_num_shard_bits); - } else { - options.block_cache = rocksdb::NewLRUCache(block_cache_compressed_size); - } - } - return reinterpret_cast(rocksdb::NewBlockBasedTableFactory(options)); } diff --git a/java/rocksjni/write_batch.cc b/java/rocksjni/write_batch.cc index 46e7a6fa09..e8b2456eee 100644 --- a/java/rocksjni/write_batch.cc +++ b/java/rocksjni/write_batch.cc @@ -12,14 
+12,12 @@ #include "include/org_rocksdb_WriteBatchTest.h" #include "rocksjni/portal.h" #include "rocksdb/db.h" -#include "rocksdb/immutable_options.h" #include "db/memtable.h" #include "rocksdb/write_batch.h" #include "db/write_batch_internal.h" #include "rocksdb/env.h" #include "rocksdb/memtablerep.h" #include "util/logging.h" -#include "util/scoped_arena_iterator.h" #include "util/testharness.h" /* @@ -30,7 +28,7 @@ void Java_org_rocksdb_WriteBatch_newWriteBatch( JNIEnv* env, jobject jobj, jint jreserved_bytes) { rocksdb::WriteBatch* wb = new rocksdb::WriteBatch( - rocksdb::jlong_to_size_t(jreserved_bytes)); + static_cast(jreserved_bytes)); rocksdb::WriteBatchJni::setHandle(env, jobj, wb); } @@ -204,19 +202,14 @@ jbyteArray Java_org_rocksdb_WriteBatchTest_getContents( auto factory = std::make_shared(); rocksdb::Options options; options.memtable_factory = factory; - rocksdb::MemTable* mem = new rocksdb::MemTable( - cmp, rocksdb::ImmutableCFOptions(options), - rocksdb::MemTableOptions(rocksdb::MutableCFOptions(options, - rocksdb::ImmutableCFOptions(options)), options)); + rocksdb::MemTable* mem = new rocksdb::MemTable(cmp, options); mem->Ref(); std::string state; rocksdb::ColumnFamilyMemTablesDefault cf_mems_default(mem, &options); rocksdb::Status s = rocksdb::WriteBatchInternal::InsertInto(b, &cf_mems_default); int count = 0; - rocksdb::Arena arena; - rocksdb::ScopedArenaIterator iter(mem->NewIterator( - rocksdb::ReadOptions(), &arena)); + rocksdb::Iterator* iter = mem->NewIterator(rocksdb::ReadOptions()); for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { rocksdb::ParsedInternalKey ikey; memset(reinterpret_cast(&ikey), 0, sizeof(ikey)); @@ -251,6 +244,7 @@ jbyteArray Java_org_rocksdb_WriteBatchTest_getContents( state.append("@"); state.append(rocksdb::NumberToString(ikey.sequence)); } + delete iter; if (!s.ok()) { state.append(s.ToString()); } else if (count != rocksdb::WriteBatchInternal::Count(b)) { diff --git 
a/linters/lint_engine/FacebookFbcodeLintEngine.php b/linters/lint_engine/FacebookFbcodeLintEngine.php index 6765c33d1e..cb9cf9bdba 100644 --- a/linters/lint_engine/FacebookFbcodeLintEngine.php +++ b/linters/lint_engine/FacebookFbcodeLintEngine.php @@ -36,9 +36,16 @@ class FacebookFbcodeLintEngine extends ArcanistLintEngine { )); $linters[] = $java_text_linter; + $pep8_options = $this->getPEP8WithTextOptions().',E302'; + $python_linter = new ArcanistPEP8Linter(); + $python_linter->setConfig(array('options' => $pep8_options)); $linters[] = $python_linter; + $python_2space_linter = new ArcanistPEP8Linter(); + $python_2space_linter->setConfig(array('options' => $pep8_options.',E111')); + $linters[] = $python_2space_linter; + // Currently we can't run cpplint in commit hook mode, because it // depends on having access to the working directory. if (!$this->getCommitHookMode()) { @@ -112,7 +119,11 @@ class FacebookFbcodeLintEngine extends ArcanistLintEngine { $dir = dirname($dir); } while ($dir != '/' && $dir != '.'); - $cur_path_linter = $python_linter; + if ($space_count == 4) { + $cur_path_linter = $python_linter; + } else { + $cur_path_linter = $python_2space_linter; + } $cur_path_linter->addPath($path); $cur_path_linter->addData($path, $this->loadData($path)); diff --git a/port/stack_trace.cc b/port/stack_trace.cc index 296b1f6209..76866e63cc 100644 --- a/port/stack_trace.cc +++ b/port/stack_trace.cc @@ -33,7 +33,7 @@ const char* GetExecutableName() { char link[1024]; snprintf(link, sizeof(link), "/proc/%d/exe", getpid()); - auto read = readlink(link, name, sizeof(name) - 1); + auto read = readlink(link, name, sizeof(name)); if (-1 == read) { return nullptr; } else { diff --git a/table/adaptive_table_factory.cc b/table/adaptive_table_factory.cc index c693064af9..a259e79d8a 100644 --- a/table/adaptive_table_factory.cc +++ b/table/adaptive_table_factory.cc @@ -39,7 +39,7 @@ extern const uint64_t kLegacyBlockBasedTableMagicNumber; extern const uint64_t 
kCuckooTableMagicNumber; Status AdaptiveTableFactory::NewTableReader( - const ImmutableCFOptions& ioptions, const EnvOptions& env_options, + const Options& options, const EnvOptions& soptions, const InternalKeyComparator& icomp, unique_ptr&& file, uint64_t file_size, unique_ptr* table) const { Footer footer; @@ -50,26 +50,24 @@ Status AdaptiveTableFactory::NewTableReader( if (footer.table_magic_number() == kPlainTableMagicNumber || footer.table_magic_number() == kLegacyPlainTableMagicNumber) { return plain_table_factory_->NewTableReader( - ioptions, env_options, icomp, std::move(file), file_size, table); + options, soptions, icomp, std::move(file), file_size, table); } else if (footer.table_magic_number() == kBlockBasedTableMagicNumber || footer.table_magic_number() == kLegacyBlockBasedTableMagicNumber) { return block_based_table_factory_->NewTableReader( - ioptions, env_options, icomp, std::move(file), file_size, table); + options, soptions, icomp, std::move(file), file_size, table); } else if (footer.table_magic_number() == kCuckooTableMagicNumber) { return cuckoo_table_factory_->NewTableReader( - ioptions, env_options, icomp, std::move(file), file_size, table); + options, soptions, icomp, std::move(file), file_size, table); } else { return Status::NotSupported("Unidentified table format"); } } TableBuilder* AdaptiveTableFactory::NewTableBuilder( - const ImmutableCFOptions& ioptions, - const InternalKeyComparator& internal_comparator, - WritableFile* file, const CompressionType compression_type, - const CompressionOptions& compression_opts) const { - return table_factory_to_write_->NewTableBuilder( - ioptions, internal_comparator, file, compression_type, compression_opts); + const Options& options, const InternalKeyComparator& internal_comparator, + WritableFile* file, CompressionType compression_type) const { + return table_factory_to_write_->NewTableBuilder(options, internal_comparator, + file, compression_type); } std::string 
AdaptiveTableFactory::GetPrintableTableOptions() const { diff --git a/table/adaptive_table_factory.h b/table/adaptive_table_factory.h index f0920db97f..f119d97b1c 100644 --- a/table/adaptive_table_factory.h +++ b/table/adaptive_table_factory.h @@ -12,6 +12,7 @@ namespace rocksdb { +struct Options; struct EnvOptions; using std::unique_ptr; @@ -30,21 +31,16 @@ class AdaptiveTableFactory : public TableFactory { std::shared_ptr block_based_table_factory, std::shared_ptr plain_table_factory, std::shared_ptr cuckoo_table_factory); - const char* Name() const override { return "AdaptiveTableFactory"; } - - Status NewTableReader( - const ImmutableCFOptions& ioptions, const EnvOptions& env_options, - const InternalKeyComparator& internal_comparator, - unique_ptr&& file, uint64_t file_size, - unique_ptr* table) const override; - - TableBuilder* NewTableBuilder( - const ImmutableCFOptions& ioptions, - const InternalKeyComparator& icomparator, - WritableFile* file, - const CompressionType compression_type, - const CompressionOptions& compression_opts) const override; + Status NewTableReader(const Options& options, const EnvOptions& soptions, + const InternalKeyComparator& internal_comparator, + unique_ptr&& file, uint64_t file_size, + unique_ptr* table) const override; + TableBuilder* NewTableBuilder(const Options& options, + const InternalKeyComparator& icomparator, + WritableFile* file, + CompressionType compression_type) const + override; // Sanitizes the specified DB Options. 
Status SanitizeDBOptions(const DBOptions* db_opts) const override { diff --git a/table/block.cc b/table/block.cc index 592d175b15..0db23a1bd8 100644 --- a/table/block.cc +++ b/table/block.cc @@ -297,10 +297,12 @@ uint32_t Block::NumRestarts() const { return DecodeFixed32(data_ + size_ - sizeof(uint32_t)); } -Block::Block(BlockContents&& contents) - : contents_(std::move(contents)), - data_(contents_.data.data()), - size_(contents_.data.size()) { +Block::Block(const BlockContents& contents) + : data_(contents.data.data()), + size_(contents.data.size()), + owned_(contents.heap_allocated), + cachable_(contents.cachable), + compression_type_(contents.compression_type) { if (size_ < sizeof(uint32_t)) { size_ = 0; // Error marker } else { @@ -313,6 +315,12 @@ Block::Block(BlockContents&& contents) } } +Block::~Block() { + if (owned_) { + delete[] data_; + } +} + Iterator* Block::NewIterator( const Comparator* cmp, BlockIter* iter, bool total_order_seek) { if (size_ < 2*sizeof(uint32_t)) { diff --git a/table/block.h b/table/block.h index 68b16ea1f3..49bcf12cf3 100644 --- a/table/block.h +++ b/table/block.h @@ -14,10 +14,6 @@ #include "rocksdb/iterator.h" #include "rocksdb/options.h" #include "db/dbformat.h" -#include "table/block_prefix_index.h" -#include "table/block_hash_index.h" - -#include "format.h" namespace rocksdb { @@ -30,17 +26,15 @@ class BlockPrefixIndex; class Block { public: // Initialize the block with the specified contents. 
- explicit Block(BlockContents&& contents); + explicit Block(const BlockContents& contents); - ~Block() = default; + ~Block(); size_t size() const { return size_; } const char* data() const { return data_; } - bool cachable() const { return contents_.cachable; } + bool cachable() const { return cachable_; } uint32_t NumRestarts() const; - CompressionType compression_type() const { - return contents_.compression_type; - } + CompressionType compression_type() const { return compression_type_; } // If hash index lookup is enabled and `use_hash_index` is true. This block // will do hash lookup for the key prefix. @@ -64,10 +58,12 @@ class Block { size_t ApproximateMemoryUsage() const; private: - BlockContents contents_; - const char* data_; // contents_.data.data() - size_t size_; // contents_.data.size() + const char* data_; + size_t size_; uint32_t restart_offset_; // Offset in data_ of restart array + bool owned_; // Block owns data_[] + bool cachable_; + CompressionType compression_type_; std::unique_ptr hash_index_; std::unique_ptr prefix_index_; diff --git a/table/block_based_filter_block.h b/table/block_based_filter_block.h deleted file mode 100644 index 9621425e39..0000000000 --- a/table/block_based_filter_block.h +++ /dev/null @@ -1,101 +0,0 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. -// -// Copyright (c) 2012 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. -// -// A filter block is stored near the end of a Table file. It contains -// filters (e.g., bloom filters) for all data blocks in the table combined -// into a single filter block. 
- -#pragma once - -#include -#include -#include -#include -#include -#include "rocksdb/options.h" -#include "rocksdb/slice.h" -#include "rocksdb/slice_transform.h" -#include "table/filter_block.h" -#include "util/hash.h" - -namespace rocksdb { - - -// A BlockBasedFilterBlockBuilder is used to construct all of the filters for a -// particular Table. It generates a single string which is stored as -// a special block in the Table. -// -// The sequence of calls to BlockBasedFilterBlockBuilder must match the regexp: -// (StartBlock Add*)* Finish -class BlockBasedFilterBlockBuilder : public FilterBlockBuilder { - public: - BlockBasedFilterBlockBuilder(const SliceTransform* prefix_extractor, - const BlockBasedTableOptions& table_opt); - - virtual bool IsBlockBased() override { return true; } - virtual void StartBlock(uint64_t block_offset) override; - virtual void Add(const Slice& key) override; - virtual Slice Finish() override; - - private: - void AddKey(const Slice& key); - void AddPrefix(const Slice& key); - void GenerateFilter(); - - // important: all of these might point to invalid addresses - // at the time of destruction of this filter block. destructor - // should NOT dereference them. - const FilterPolicy* policy_; - const SliceTransform* prefix_extractor_; - bool whole_key_filtering_; - - std::string entries_; // Flattened entry contents - std::vector start_; // Starting index in entries_ of each entry - uint32_t added_to_start_; // To indicate if key is added - std::string result_; // Filter data computed so far - std::vector tmp_entries_; // policy_->CreateFilter() argument - std::vector filter_offsets_; - - // No copying allowed - BlockBasedFilterBlockBuilder(const BlockBasedFilterBlockBuilder&); - void operator=(const BlockBasedFilterBlockBuilder&); -}; - -// A FilterBlockReader is used to parse filter from SST table. 
-// KeyMayMatch and PrefixMayMatch would trigger filter checking -class BlockBasedFilterBlockReader : public FilterBlockReader { - public: - // REQUIRES: "contents" and *policy must stay live while *this is live. - BlockBasedFilterBlockReader(const SliceTransform* prefix_extractor, - const BlockBasedTableOptions& table_opt, - BlockContents&& contents); - virtual bool IsBlockBased() override { return true; } - virtual bool KeyMayMatch(const Slice& key, - uint64_t block_offset = kNotValid) override; - virtual bool PrefixMayMatch(const Slice& prefix, - uint64_t block_offset = kNotValid) override; - virtual size_t ApproximateMemoryUsage() const override; - - private: - const FilterPolicy* policy_; - const SliceTransform* prefix_extractor_; - bool whole_key_filtering_; - const char* data_; // Pointer to filter data (at block-start) - const char* offset_; // Pointer to beginning of offset array (at block-end) - size_t num_; // Number of entries in offset array - size_t base_lg_; // Encoding parameter (see kFilterBaseLg in .cc file) - BlockContents contents_; - - bool MayMatch(const Slice& entry, uint64_t block_offset); - - // No copying allowed - BlockBasedFilterBlockReader(const BlockBasedFilterBlockReader&); - void operator=(const BlockBasedFilterBlockReader&); -}; -} // namespace rocksdb diff --git a/table/block_based_filter_block_test.cc b/table/block_based_filter_block_test.cc deleted file mode 100644 index 28eea16ce8..0000000000 --- a/table/block_based_filter_block_test.cc +++ /dev/null @@ -1,242 +0,0 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. -// -// Copyright (c) 2012 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. 
See the AUTHORS file for names of contributors. - -#include "table/block_based_filter_block.h" - -#include "rocksdb/filter_policy.h" -#include "util/coding.h" -#include "util/hash.h" -#include "util/logging.h" -#include "util/testharness.h" -#include "util/testutil.h" - -namespace rocksdb { - -// For testing: emit an array with one hash value per key -class TestHashFilter : public FilterPolicy { - public: - virtual const char* Name() const { - return "TestHashFilter"; - } - - virtual void CreateFilter(const Slice* keys, int n, std::string* dst) const { - for (int i = 0; i < n; i++) { - uint32_t h = Hash(keys[i].data(), keys[i].size(), 1); - PutFixed32(dst, h); - } - } - - virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const { - uint32_t h = Hash(key.data(), key.size(), 1); - for (unsigned int i = 0; i + 4 <= filter.size(); i += 4) { - if (h == DecodeFixed32(filter.data() + i)) { - return true; - } - } - return false; - } -}; - -class FilterBlockTest { - public: - TestHashFilter policy_; - BlockBasedTableOptions table_options_; - - FilterBlockTest() { - table_options_.filter_policy.reset(new TestHashFilter()); - } -}; - -TEST(FilterBlockTest, EmptyBuilder) { - BlockBasedFilterBlockBuilder builder(nullptr, table_options_); - BlockContents block(builder.Finish(), false, kNoCompression); - ASSERT_EQ("\\x00\\x00\\x00\\x00\\x0b", EscapeString(block.data)); - BlockBasedFilterBlockReader reader(nullptr, table_options_, std::move(block)); - ASSERT_TRUE(reader.KeyMayMatch("foo", 0)); - ASSERT_TRUE(reader.KeyMayMatch("foo", 100000)); -} - -TEST(FilterBlockTest, SingleChunk) { - BlockBasedFilterBlockBuilder builder(nullptr, table_options_); - builder.StartBlock(100); - builder.Add("foo"); - builder.Add("bar"); - builder.Add("box"); - builder.StartBlock(200); - builder.Add("box"); - builder.StartBlock(300); - builder.Add("hello"); - BlockContents block(builder.Finish(), false, kNoCompression); - BlockBasedFilterBlockReader reader(nullptr, table_options_, 
std::move(block)); - ASSERT_TRUE(reader.KeyMayMatch("foo", 100)); - ASSERT_TRUE(reader.KeyMayMatch("bar", 100)); - ASSERT_TRUE(reader.KeyMayMatch("box", 100)); - ASSERT_TRUE(reader.KeyMayMatch("hello", 100)); - ASSERT_TRUE(reader.KeyMayMatch("foo", 100)); - ASSERT_TRUE(!reader.KeyMayMatch("missing", 100)); - ASSERT_TRUE(!reader.KeyMayMatch("other", 100)); -} - -TEST(FilterBlockTest, MultiChunk) { - BlockBasedFilterBlockBuilder builder(nullptr, table_options_); - - // First filter - builder.StartBlock(0); - builder.Add("foo"); - builder.StartBlock(2000); - builder.Add("bar"); - - // Second filter - builder.StartBlock(3100); - builder.Add("box"); - - // Third filter is empty - - // Last filter - builder.StartBlock(9000); - builder.Add("box"); - builder.Add("hello"); - - BlockContents block(builder.Finish(), false, kNoCompression); - BlockBasedFilterBlockReader reader(nullptr, table_options_, std::move(block)); - - // Check first filter - ASSERT_TRUE(reader.KeyMayMatch("foo", 0)); - ASSERT_TRUE(reader.KeyMayMatch("bar", 2000)); - ASSERT_TRUE(!reader.KeyMayMatch("box", 0)); - ASSERT_TRUE(!reader.KeyMayMatch("hello", 0)); - - // Check second filter - ASSERT_TRUE(reader.KeyMayMatch("box", 3100)); - ASSERT_TRUE(!reader.KeyMayMatch("foo", 3100)); - ASSERT_TRUE(!reader.KeyMayMatch("bar", 3100)); - ASSERT_TRUE(!reader.KeyMayMatch("hello", 3100)); - - // Check third filter (empty) - ASSERT_TRUE(!reader.KeyMayMatch("foo", 4100)); - ASSERT_TRUE(!reader.KeyMayMatch("bar", 4100)); - ASSERT_TRUE(!reader.KeyMayMatch("box", 4100)); - ASSERT_TRUE(!reader.KeyMayMatch("hello", 4100)); - - // Check last filter - ASSERT_TRUE(reader.KeyMayMatch("box", 9000)); - ASSERT_TRUE(reader.KeyMayMatch("hello", 9000)); - ASSERT_TRUE(!reader.KeyMayMatch("foo", 9000)); - ASSERT_TRUE(!reader.KeyMayMatch("bar", 9000)); -} - -// Test for block based filter block -// use new interface in FilterPolicy to create filter builder/reader -class BlockBasedFilterBlockTest { - public: - BlockBasedTableOptions 
table_options_; - - BlockBasedFilterBlockTest() { - table_options_.filter_policy.reset(NewBloomFilterPolicy(10)); - } - - ~BlockBasedFilterBlockTest() {} -}; - -TEST(BlockBasedFilterBlockTest, BlockBasedEmptyBuilder) { - FilterBlockBuilder* builder = new BlockBasedFilterBlockBuilder( - nullptr, table_options_); - BlockContents block(builder->Finish(), false, kNoCompression); - ASSERT_EQ("\\x00\\x00\\x00\\x00\\x0b", EscapeString(block.data)); - FilterBlockReader* reader = new BlockBasedFilterBlockReader( - nullptr, table_options_, std::move(block)); - ASSERT_TRUE(reader->KeyMayMatch("foo", 0)); - ASSERT_TRUE(reader->KeyMayMatch("foo", 100000)); - - delete builder; - delete reader; -} - -TEST(BlockBasedFilterBlockTest, BlockBasedSingleChunk) { - FilterBlockBuilder* builder = new BlockBasedFilterBlockBuilder( - nullptr, table_options_); - builder->StartBlock(100); - builder->Add("foo"); - builder->Add("bar"); - builder->Add("box"); - builder->StartBlock(200); - builder->Add("box"); - builder->StartBlock(300); - builder->Add("hello"); - BlockContents block(builder->Finish(), false, kNoCompression); - FilterBlockReader* reader = new BlockBasedFilterBlockReader( - nullptr, table_options_, std::move(block)); - ASSERT_TRUE(reader->KeyMayMatch("foo", 100)); - ASSERT_TRUE(reader->KeyMayMatch("bar", 100)); - ASSERT_TRUE(reader->KeyMayMatch("box", 100)); - ASSERT_TRUE(reader->KeyMayMatch("hello", 100)); - ASSERT_TRUE(reader->KeyMayMatch("foo", 100)); - ASSERT_TRUE(!reader->KeyMayMatch("missing", 100)); - ASSERT_TRUE(!reader->KeyMayMatch("other", 100)); - - delete builder; - delete reader; -} - -TEST(BlockBasedFilterBlockTest, BlockBasedMultiChunk) { - FilterBlockBuilder* builder = new BlockBasedFilterBlockBuilder( - nullptr, table_options_); - - // First filter - builder->StartBlock(0); - builder->Add("foo"); - builder->StartBlock(2000); - builder->Add("bar"); - - // Second filter - builder->StartBlock(3100); - builder->Add("box"); - - // Third filter is empty - - // Last 
filter - builder->StartBlock(9000); - builder->Add("box"); - builder->Add("hello"); - - BlockContents block(builder->Finish(), false, kNoCompression); - FilterBlockReader* reader = new BlockBasedFilterBlockReader( - nullptr, table_options_, std::move(block)); - - // Check first filter - ASSERT_TRUE(reader->KeyMayMatch("foo", 0)); - ASSERT_TRUE(reader->KeyMayMatch("bar", 2000)); - ASSERT_TRUE(!reader->KeyMayMatch("box", 0)); - ASSERT_TRUE(!reader->KeyMayMatch("hello", 0)); - - // Check second filter - ASSERT_TRUE(reader->KeyMayMatch("box", 3100)); - ASSERT_TRUE(!reader->KeyMayMatch("foo", 3100)); - ASSERT_TRUE(!reader->KeyMayMatch("bar", 3100)); - ASSERT_TRUE(!reader->KeyMayMatch("hello", 3100)); - - // Check third filter (empty) - ASSERT_TRUE(!reader->KeyMayMatch("foo", 4100)); - ASSERT_TRUE(!reader->KeyMayMatch("bar", 4100)); - ASSERT_TRUE(!reader->KeyMayMatch("box", 4100)); - ASSERT_TRUE(!reader->KeyMayMatch("hello", 4100)); - - // Check last filter - ASSERT_TRUE(reader->KeyMayMatch("box", 9000)); - ASSERT_TRUE(reader->KeyMayMatch("hello", 9000)); - ASSERT_TRUE(!reader->KeyMayMatch("foo", 9000)); - ASSERT_TRUE(!reader->KeyMayMatch("bar", 9000)); - - delete builder; - delete reader; -} - -} // namespace rocksdb - -int main(int argc, char** argv) { return rocksdb::test::RunAllTests(); } diff --git a/table/block_based_table_builder.cc b/table/block_based_table_builder.cc index 2f373fff19..03f1e199c6 100644 --- a/table/block_based_table_builder.cc +++ b/table/block_based_table_builder.cc @@ -17,7 +17,6 @@ #include #include #include -#include #include "db/dbformat.h" @@ -26,14 +25,13 @@ #include "rocksdb/env.h" #include "rocksdb/filter_policy.h" #include "rocksdb/flush_block_policy.h" +#include "rocksdb/options.h" #include "rocksdb/table.h" #include "table/block.h" #include "table/block_based_table_reader.h" #include "table/block_builder.h" #include "table/filter_block.h" -#include "table/block_based_filter_block.h" -#include "table/full_filter_block.h" #include 
"table/format.h" #include "table/meta_blocks.h" #include "table/table_builder.h" @@ -135,12 +133,12 @@ class ShortenedIndexBuilder : public IndexBuilder { index_block_builder_.Add(*last_key_in_current_block, handle_encoding); } - virtual Status Finish(IndexBlocks* index_blocks) override { + virtual Status Finish(IndexBlocks* index_blocks) { index_blocks->index_block_contents = index_block_builder_.Finish(); return Status::OK(); } - virtual size_t EstimatedSize() const override { + virtual size_t EstimatedSize() const { return index_block_builder_.CurrentSizeEstimate(); } @@ -177,14 +175,14 @@ class HashIndexBuilder : public IndexBuilder { explicit HashIndexBuilder(const Comparator* comparator, const SliceTransform* hash_key_extractor) : IndexBuilder(comparator), - primary_index_builder_(comparator), + primary_index_builder(comparator), hash_key_extractor_(hash_key_extractor) {} virtual void AddIndexEntry(std::string* last_key_in_current_block, const Slice* first_key_in_next_block, const BlockHandle& block_handle) override { ++current_restart_index_; - primary_index_builder_.AddIndexEntry(last_key_in_current_block, + primary_index_builder.AddIndexEntry(last_key_in_current_block, first_key_in_next_block, block_handle); } @@ -215,9 +213,9 @@ class HashIndexBuilder : public IndexBuilder { } } - virtual Status Finish(IndexBlocks* index_blocks) override { + virtual Status Finish(IndexBlocks* index_blocks) { FlushPendingPrefix(); - primary_index_builder_.Finish(index_blocks); + primary_index_builder.Finish(index_blocks); index_blocks->meta_blocks.insert( {kHashIndexPrefixesBlock.c_str(), prefix_block_}); index_blocks->meta_blocks.insert( @@ -225,8 +223,8 @@ class HashIndexBuilder : public IndexBuilder { return Status::OK(); } - virtual size_t EstimatedSize() const override { - return primary_index_builder_.EstimatedSize() + prefix_block_.size() + + virtual size_t EstimatedSize() const { + return primary_index_builder.EstimatedSize() + prefix_block_.size() + 
prefix_meta_block_.size(); } @@ -239,7 +237,7 @@ class HashIndexBuilder : public IndexBuilder { PutVarint32(&prefix_meta_block_, pending_block_num_); } - ShortenedIndexBuilder primary_index_builder_; + ShortenedIndexBuilder primary_index_builder; const SliceTransform* hash_key_extractor_; // stores a sequence of prefixes @@ -277,21 +275,6 @@ IndexBuilder* CreateIndexBuilder(IndexType type, const Comparator* comparator, return nullptr; } -// Create a index builder based on its type. -FilterBlockBuilder* CreateFilterBlockBuilder(const ImmutableCFOptions& opt, - const BlockBasedTableOptions& table_opt) { - if (table_opt.filter_policy == nullptr) return nullptr; - - FilterBitsBuilder* filter_bits_builder = - table_opt.filter_policy->GetFilterBitsBuilder(); - if (filter_bits_builder == nullptr) { - return new BlockBasedFilterBlockBuilder(opt.prefix_extractor, table_opt); - } else { - return new FullFilterBlockBuilder(opt.prefix_extractor, table_opt, - filter_bits_builder); - } -} - bool GoodCompressionRatio(size_t compressed_size, size_t raw_size) { // Check to see if compressed less than 12.5% return compressed_size < raw_size - (raw_size / 8u); @@ -383,6 +366,7 @@ class BlockBasedTableBuilder::BlockBasedTablePropertiesCollector std::string val; PutFixed32(&val, static_cast(index_type_)); properties->insert({BlockBasedTablePropertyNames::kIndexType, val}); + return Status::OK(); } @@ -401,7 +385,7 @@ class BlockBasedTableBuilder::BlockBasedTablePropertiesCollector }; struct BlockBasedTableBuilder::Rep { - const ImmutableCFOptions ioptions; + const Options options; const BlockBasedTableOptions table_options; const InternalKeyComparator& internal_comparator; WritableFile* file; @@ -413,12 +397,11 @@ struct BlockBasedTableBuilder::Rep { std::unique_ptr index_builder; std::string last_key; - const CompressionType compression_type; - const CompressionOptions compression_opts; + CompressionType compression_type; TableProperties props; bool closed = false; // Either Finish() 
or Abandon() has been called. - std::unique_ptr filter_block; + FilterBlockBuilder* filter_block; char compressed_cache_key_prefix[BlockBasedTable::kMaxCacheKeyPrefixSize]; size_t compressed_cache_key_prefix_size; @@ -430,27 +413,27 @@ struct BlockBasedTableBuilder::Rep { std::vector> table_properties_collectors; - Rep(const ImmutableCFOptions& ioptions, - const BlockBasedTableOptions& table_opt, + Rep(const Options& opt, const BlockBasedTableOptions& table_opt, const InternalKeyComparator& icomparator, - WritableFile* f, const CompressionType compression_type, - const CompressionOptions& compression_opts) - : ioptions(ioptions), + WritableFile* f, CompressionType compression_type) + : options(opt), table_options(table_opt), internal_comparator(icomparator), file(f), data_block(table_options.block_restart_interval), - internal_prefix_transform(ioptions.prefix_extractor), + internal_prefix_transform(options.prefix_extractor.get()), index_builder(CreateIndexBuilder( table_options.index_type, &internal_comparator, &this->internal_prefix_transform)), compression_type(compression_type), - filter_block(CreateFilterBlockBuilder(ioptions, table_options)), + filter_block(table_options.filter_policy == nullptr ? 
+ nullptr : + new FilterBlockBuilder(opt, table_options, &internal_comparator)), flush_block_policy( table_options.flush_block_policy_factory->NewFlushBlockPolicy( table_options, data_block)) { for (auto& collector_factories : - ioptions.table_properties_collector_factories) { + options.table_properties_collector_factories) { table_properties_collectors.emplace_back( collector_factories->CreateTablePropertiesCollector()); } @@ -460,13 +443,11 @@ struct BlockBasedTableBuilder::Rep { }; BlockBasedTableBuilder::BlockBasedTableBuilder( - const ImmutableCFOptions& ioptions, - const BlockBasedTableOptions& table_options, + const Options& options, const BlockBasedTableOptions& table_options, const InternalKeyComparator& internal_comparator, WritableFile* file, - const CompressionType compression_type, - const CompressionOptions& compression_opts) - : rep_(new Rep(ioptions, table_options, internal_comparator, - file, compression_type, compression_opts)) { + CompressionType compression_type) + : rep_(new Rep(options, table_options, internal_comparator, + file, compression_type)) { if (rep_->filter_block != nullptr) { rep_->filter_block->StartBlock(0); } @@ -480,6 +461,7 @@ BlockBasedTableBuilder::BlockBasedTableBuilder( BlockBasedTableBuilder::~BlockBasedTableBuilder() { assert(rep_->closed); // Catch errors where caller forgot to call Finish() + delete rep_->filter_block; delete rep_; } @@ -510,7 +492,7 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) { } if (r->filter_block != nullptr) { - r->filter_block->Add(ExtractUserKey(key)); + r->filter_block->AddKey(ExtractUserKey(key)); } r->last_key.assign(key.data(), key.size()); @@ -521,7 +503,7 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) { r->index_builder->OnKeyAdded(key); NotifyCollectTableCollectorsOnAdd(key, value, r->table_properties_collectors, - r->ioptions.info_log); + r->options.info_log.get()); } void BlockBasedTableBuilder::Flush() { @@ -559,10 +541,10 @@ void 
BlockBasedTableBuilder::WriteBlock(const Slice& raw_block_contents, Slice block_contents; if (raw_block_contents.size() < kCompressionSizeLimit) { block_contents = - CompressBlock(raw_block_contents, r->compression_opts, &type, + CompressBlock(raw_block_contents, r->options.compression_opts, &type, &r->compressed_output); } else { - RecordTick(r->ioptions.statistics, NUMBER_BLOCK_NOT_COMPRESSED); + RecordTick(r->options.statistics.get(), NUMBER_BLOCK_NOT_COMPRESSED); type = kNoCompression; block_contents = raw_block_contents; } @@ -574,7 +556,8 @@ void BlockBasedTableBuilder::WriteRawBlock(const Slice& block_contents, CompressionType type, BlockHandle* handle) { Rep* r = rep_; - StopWatch sw(r->ioptions.env, r->ioptions.statistics, WRITE_RAW_BLOCK_MICROS); + StopWatch sw(r->options.env, r->options.statistics.get(), + WRITE_RAW_BLOCK_MICROS); handle->set_offset(r->offset); handle->set_size(block_contents.size()); r->status = r->file->Append(block_contents); @@ -635,13 +618,18 @@ Status BlockBasedTableBuilder::InsertBlockInCache(const Slice& block_contents, Cache::Handle* cache_handle = nullptr; size_t size = block_contents.size(); - std::unique_ptr ubuf(new char[size + 1]); - memcpy(ubuf.get(), block_contents.data(), size); + char* ubuf = new char[size + 1]; // make a new copy + memcpy(ubuf, block_contents.data(), size); ubuf[size] = type; - BlockContents results(std::move(ubuf), size, true, type); + BlockContents results; + Slice sl(ubuf, size); + results.data = sl; + results.cachable = true; // XXX + results.heap_allocated = true; + results.compression_type = type; - Block* block = new Block(std::move(results)); + Block* block = new Block(results); // make cache key by appending the file offset to the cache prefix id char* end = EncodeVarint64( @@ -669,7 +657,10 @@ Status BlockBasedTableBuilder::Finish() { assert(!r->closed); r->closed = true; - BlockHandle filter_block_handle, metaindex_block_handle, index_block_handle; + BlockHandle filter_block_handle, + 
metaindex_block_handle, + index_block_handle; + // Write filter block if (ok() && r->filter_block != nullptr) { auto filter_contents = r->filter_block->Finish(); @@ -708,12 +699,7 @@ Status BlockBasedTableBuilder::Finish() { if (r->filter_block != nullptr) { // Add mapping from ".Name" to location // of filter data. - std::string key; - if (r->filter_block->IsBlockBased()) { - key = BlockBasedTable::kFilterBlockPrefix; - } else { - key = BlockBasedTable::kFullFilterBlockPrefix; - } + std::string key = BlockBasedTable::kFilterBlockPrefix; key.append(r->table_options.filter_policy->Name()); meta_index_builder.Add(key, filter_block_handle); } @@ -732,7 +718,7 @@ Status BlockBasedTableBuilder::Finish() { // Add use collected properties NotifyCollectTableCollectorsOnFinish(r->table_properties_collectors, - r->ioptions.info_log, + r->options.info_log.get(), &property_block_builder); BlockHandle properties_block_handle; @@ -791,12 +777,14 @@ Status BlockBasedTableBuilder::Finish() { } } - Log(r->ioptions.info_log, + Log( + r->options.info_log, "Table was constructed:\n" " [basic properties]: %s\n" " [user collected properties]: %s", r->props.ToString().c_str(), - user_collected.c_str()); + user_collected.c_str() + ); } return r->status; @@ -817,6 +805,5 @@ uint64_t BlockBasedTableBuilder::FileSize() const { } const std::string BlockBasedTable::kFilterBlockPrefix = "filter."; -const std::string BlockBasedTable::kFullFilterBlockPrefix = "fullfilter."; } // namespace rocksdb diff --git a/table/block_based_table_builder.h b/table/block_based_table_builder.h index 6fde329199..72a2f207a6 100644 --- a/table/block_based_table_builder.h +++ b/table/block_based_table_builder.h @@ -28,12 +28,10 @@ class BlockBasedTableBuilder : public TableBuilder { // Create a builder that will store the contents of the table it is // building in *file. Does not close the file. It is up to the // caller to close the file after calling Finish(). 
- BlockBasedTableBuilder(const ImmutableCFOptions& ioptions, + BlockBasedTableBuilder(const Options& options, const BlockBasedTableOptions& table_options, const InternalKeyComparator& internal_comparator, - WritableFile* file, - const CompressionType compression_type, - const CompressionOptions& compression_opts); + WritableFile* file, CompressionType compression_type); // REQUIRES: Either Finish() or Abandon() has been called. ~BlockBasedTableBuilder(); diff --git a/table/block_based_table_factory.cc b/table/block_based_table_factory.cc index b4e2e7d1fe..de30fb383e 100644 --- a/table/block_based_table_factory.cc +++ b/table/block_based_table_factory.cc @@ -41,24 +41,21 @@ BlockBasedTableFactory::BlockBasedTableFactory( } Status BlockBasedTableFactory::NewTableReader( - const ImmutableCFOptions& ioptions, const EnvOptions& soptions, + const Options& options, const EnvOptions& soptions, const InternalKeyComparator& internal_comparator, unique_ptr&& file, uint64_t file_size, unique_ptr* table_reader) const { - return BlockBasedTable::Open(ioptions, soptions, table_options_, + return BlockBasedTable::Open(options, soptions, table_options_, internal_comparator, std::move(file), file_size, table_reader); } TableBuilder* BlockBasedTableFactory::NewTableBuilder( - const ImmutableCFOptions& ioptions, - const InternalKeyComparator& internal_comparator, - WritableFile* file, const CompressionType compression_type, - const CompressionOptions& compression_opts) const { + const Options& options, const InternalKeyComparator& internal_comparator, + WritableFile* file, CompressionType compression_type) const { auto table_builder = new BlockBasedTableBuilder( - ioptions, table_options_, internal_comparator, file, - compression_type, compression_opts); + options, table_options_, internal_comparator, file, compression_type); return table_builder; } diff --git a/table/block_based_table_factory.h b/table/block_based_table_factory.h index 2dcfda6d48..d7045346af 100644 --- 
a/table/block_based_table_factory.h +++ b/table/block_based_table_factory.h @@ -14,11 +14,13 @@ #include #include "rocksdb/flush_block_policy.h" +#include "rocksdb/options.h" #include "rocksdb/table.h" #include "db/dbformat.h" namespace rocksdb { +struct Options; struct EnvOptions; using std::unique_ptr; @@ -33,17 +35,14 @@ class BlockBasedTableFactory : public TableFactory { const char* Name() const override { return "BlockBasedTable"; } - Status NewTableReader( - const ImmutableCFOptions& ioptions, const EnvOptions& soptions, - const InternalKeyComparator& internal_comparator, - unique_ptr&& file, uint64_t file_size, - unique_ptr* table_reader) const override; + Status NewTableReader(const Options& options, const EnvOptions& soptions, + const InternalKeyComparator& internal_comparator, + unique_ptr&& file, uint64_t file_size, + unique_ptr* table_reader) const override; TableBuilder* NewTableBuilder( - const ImmutableCFOptions& ioptions, - const InternalKeyComparator& internal_comparator, - WritableFile* file, const CompressionType compression_type, - const CompressionOptions& compression_opts) const override; + const Options& options, const InternalKeyComparator& internal_comparator, + WritableFile* file, CompressionType compression_type) const override; // Sanitizes the specified DB Options. 
Status SanitizeDBOptions(const DBOptions* db_opts) const override { diff --git a/table/block_based_table_reader.cc b/table/block_based_table_reader.cc index 4b2050e03c..0be38a1dc3 100644 --- a/table/block_based_table_reader.cc +++ b/table/block_based_table_reader.cc @@ -26,14 +26,11 @@ #include "table/block.h" #include "table/filter_block.h" -#include "table/block_based_filter_block.h" -#include "table/full_filter_block.h" #include "table/block_hash_index.h" #include "table/block_prefix_index.h" #include "table/format.h" #include "table/meta_blocks.h" #include "table/two_level_iterator.h" -#include "table/get_context.h" #include "util/coding.h" #include "util/perf_context_imp.h" @@ -49,6 +46,7 @@ using std::unique_ptr; typedef BlockBasedTable::IndexReader IndexReader; namespace { + // The longest the prefix of the cache key used to identify blocks can be. // We are using the fact that we know for Posix files the unique ID is three // varints. @@ -67,7 +65,7 @@ Status ReadBlockFromFile(RandomAccessFile* file, const Footer& footer, Status s = ReadBlockContents(file, footer, options, handle, &contents, env, do_uncompress); if (s.ok()) { - *result = new Block(std::move(contents)); + *result = new Block(contents); } return s; @@ -253,6 +251,9 @@ class HashIndexReader : public IndexReader { &prefixes_meta_contents, env, true /* do decompression */); if (!s.ok()) { + if (prefixes_contents.heap_allocated) { + delete[] prefixes_contents.data.data(); + } // TODO: log error return Status::OK(); } @@ -267,7 +268,7 @@ class HashIndexReader : public IndexReader { // TODO: log error if (s.ok()) { new_index_reader->index_block_->SetBlockHashIndex(hash_index); - new_index_reader->OwnPrefixesContents(std::move(prefixes_contents)); + new_index_reader->OwnPrefixesContents(prefixes_contents); } } else { BlockPrefixIndex* prefix_index = nullptr; @@ -281,6 +282,18 @@ class HashIndexReader : public IndexReader { } } + // Always release prefix meta block + if 
(prefixes_meta_contents.heap_allocated) { + delete[] prefixes_meta_contents.data.data(); + } + + // Release prefix content block if we don't own it. + if (!new_index_reader->own_prefixes_contents_) { + if (prefixes_contents.heap_allocated) { + delete[] prefixes_contents.data.data(); + } + } + return Status::OK(); } @@ -299,33 +312,39 @@ class HashIndexReader : public IndexReader { private: HashIndexReader(const Comparator* comparator, Block* index_block) - : IndexReader(comparator), index_block_(index_block) { + : IndexReader(comparator), + index_block_(index_block), + own_prefixes_contents_(false) { assert(index_block_ != nullptr); } ~HashIndexReader() { + if (own_prefixes_contents_ && prefixes_contents_.heap_allocated) { + delete[] prefixes_contents_.data.data(); + } } - void OwnPrefixesContents(BlockContents&& prefixes_contents) { - prefixes_contents_ = std::move(prefixes_contents); + void OwnPrefixesContents(const BlockContents& prefixes_contents) { + prefixes_contents_ = prefixes_contents; + own_prefixes_contents_ = true; } std::unique_ptr index_block_; + bool own_prefixes_contents_; BlockContents prefixes_contents_; }; struct BlockBasedTable::Rep { - Rep(const ImmutableCFOptions& ioptions, - const EnvOptions& env_options, + Rep(const EnvOptions& storage_options, const BlockBasedTableOptions& table_opt, const InternalKeyComparator& internal_comparator) - : ioptions(ioptions), env_options(env_options), table_options(table_opt), + : soptions(storage_options), table_options(table_opt), filter_policy(table_opt.filter_policy.get()), internal_comparator(internal_comparator) {} - const ImmutableCFOptions& ioptions; - const EnvOptions& env_options; + Options options; + const EnvOptions& soptions; const BlockBasedTableOptions& table_options; const FilterPolicy* const filter_policy; const InternalKeyComparator& internal_comparator; @@ -427,8 +446,7 @@ void BlockBasedTable::GenerateCachePrefix(Cache* cc, } } -Status BlockBasedTable::Open(const ImmutableCFOptions& 
ioptions, - const EnvOptions& env_options, +Status BlockBasedTable::Open(const Options& options, const EnvOptions& soptions, const BlockBasedTableOptions& table_options, const InternalKeyComparator& internal_comparator, unique_ptr&& file, @@ -443,7 +461,8 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions, // We've successfully read the footer and the index block: we're // ready to serve requests. Rep* rep = new BlockBasedTable::Rep( - ioptions, env_options, table_options, internal_comparator); + soptions, table_options, internal_comparator); + rep->options = options; rep->file = std::move(file); rep->footer = footer; rep->index_type = table_options.index_type; @@ -465,7 +484,7 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions, TableProperties* table_properties = nullptr; if (s.ok()) { s = ReadProperties(meta_iter->value(), rep->file.get(), rep->footer, - rep->ioptions.env, rep->ioptions.info_log, + rep->options.env, rep->options.info_log.get(), &table_properties); } @@ -473,12 +492,12 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions, auto err_msg = "[Warning] Encountered error while reading data from properties " "block " + s.ToString(); - Log(rep->ioptions.info_log, "%s", err_msg.c_str()); + Log(rep->options.info_log, "%s", err_msg.c_str()); } else { rep->table_properties.reset(table_properties); } } else { - Log(WARN_LEVEL, rep->ioptions.info_log, + Log(WARN_LEVEL, rep->options.info_log, "Cannot find Properties block from file."); } @@ -499,6 +518,7 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions, // pre-load these blocks, which will kept in member variables in Rep // and with a same life-time as this table object. 
IndexReader* index_reader = nullptr; + // TODO: we never really verify check sum for index block s = new_table->CreateIndexReader(&index_reader, meta_iter.get()); if (s.ok()) { @@ -506,18 +526,11 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions, // Set filter block if (rep->filter_policy) { - // First try reading full_filter, then reading block_based_filter - for (auto filter_block_prefix : { kFullFilterBlockPrefix, - kFilterBlockPrefix }) { - std::string key = filter_block_prefix; - key.append(rep->filter_policy->Name()); - - BlockHandle handle; - if (FindMetaBlock(meta_iter.get(), key, &handle).ok()) { - rep->filter.reset(ReadFilter(handle, rep, - filter_block_prefix, nullptr)); - break; - } + std::string key = kFilterBlockPrefix; + key.append(rep->filter_policy->Name()); + BlockHandle handle; + if (FindMetaBlock(meta_iter.get(), key, &handle).ok()) { + rep->filter.reset(ReadFilter(handle, rep)); } } } else { @@ -533,7 +546,7 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions, } void BlockBasedTable::SetupForCompaction() { - switch (rep_->ioptions.access_hint_on_compaction_start) { + switch (rep_->options.access_hint_on_compaction_start) { case Options::NONE: break; case Options::NORMAL: @@ -583,13 +596,13 @@ Status BlockBasedTable::ReadMetaBlock( ReadOptions(), rep->footer.metaindex_handle(), &meta, - rep->ioptions.env); + rep->options.env); if (!s.ok()) { auto err_msg = "[Warning] Encountered error while reading data from properties" "block " + s.ToString(); - Log(rep->ioptions.info_log, "%s", err_msg.c_str()); + Log(rep->options.info_log, "%s", err_msg.c_str()); } if (!s.ok()) { delete meta; @@ -653,7 +666,7 @@ Status BlockBasedTable::GetDataBlockFromCache( // Insert uncompressed block into block cache if (s.ok()) { - block->value = new Block(std::move(contents)); // uncompressed block + block->value = new Block(contents); // uncompressed block assert(block->value->compression_type() == kNoCompression); if (block_cache != 
nullptr && block->value->cachable() && read_options.fill_cache) { @@ -691,7 +704,7 @@ Status BlockBasedTable::PutDataBlockToCache( } if (raw_block->compression_type() != kNoCompression) { - block->value = new Block(std::move(contents)); // uncompressed block + block->value = new Block(contents); // uncompressed block } else { block->value = raw_block; raw_block = nullptr; @@ -725,15 +738,15 @@ Status BlockBasedTable::PutDataBlockToCache( return s; } -FilterBlockReader* BlockBasedTable::ReadFilter( - const BlockHandle& filter_handle, BlockBasedTable::Rep* rep, - const std::string& filter_block_prefix, size_t* filter_size) { +FilterBlockReader* BlockBasedTable::ReadFilter(const BlockHandle& filter_handle, + BlockBasedTable::Rep* rep, + size_t* filter_size) { // TODO: We might want to unify with ReadBlockFromFile() if we start // requiring checksum verification in Table::Open. ReadOptions opt; BlockContents block; if (!ReadBlockContents(rep->file.get(), rep->footer, opt, filter_handle, - &block, rep->ioptions.env, false).ok()) { + &block, rep->options.env, false).ok()) { return nullptr; } @@ -741,25 +754,12 @@ FilterBlockReader* BlockBasedTable::ReadFilter( *filter_size = block.data.size(); } - assert(rep->filter_policy); - if (kFilterBlockPrefix == filter_block_prefix) { - return new BlockBasedFilterBlockReader( - rep->ioptions.prefix_extractor, rep->table_options, std::move(block)); - } else if (kFullFilterBlockPrefix == filter_block_prefix) { - auto filter_bits_reader = rep->filter_policy-> - GetFilterBitsReader(block.data); - - if (filter_bits_reader != nullptr) { - return new FullFilterBlockReader(rep->ioptions.prefix_extractor, - rep->table_options, std::move(block), - filter_bits_reader); - } - } - return nullptr; + return new FilterBlockReader( + rep->options, rep->table_options, block.data, block.heap_allocated); } BlockBasedTable::CachableEntry BlockBasedTable::GetFilter( - bool no_io) const { + bool no_io) const { // filter pre-populated if (rep_->filter != 
nullptr) { return {rep_->filter.get(), nullptr /* cache handle */}; @@ -773,20 +773,22 @@ BlockBasedTable::CachableEntry BlockBasedTable::GetFilter( // Fetching from the cache char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; - auto key = GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, - rep_->footer.metaindex_handle(), - cache_key + auto key = GetCacheKey( + rep_->cache_key_prefix, + rep_->cache_key_prefix_size, + rep_->footer.metaindex_handle(), + cache_key ); - Statistics* statistics = rep_->ioptions.statistics; + Statistics* statistics = rep_->options.statistics.get(); auto cache_handle = GetEntryFromCache(block_cache, key, BLOCK_CACHE_FILTER_MISS, BLOCK_CACHE_FILTER_HIT, statistics); FilterBlockReader* filter = nullptr; if (cache_handle != nullptr) { - filter = reinterpret_cast( - block_cache->Value(cache_handle)); + filter = reinterpret_cast( + block_cache->Value(cache_handle)); } else if (no_io) { // Do not invoke any io. return CachableEntry(); @@ -797,22 +799,17 @@ BlockBasedTable::CachableEntry BlockBasedTable::GetFilter( auto s = ReadMetaBlock(rep_, &meta, &iter); if (s.ok()) { - // First try reading full_filter, then reading block_based_filter - for (auto filter_block_prefix : {kFullFilterBlockPrefix, - kFilterBlockPrefix}) { - std::string filter_block_key = filter_block_prefix; - filter_block_key.append(rep_->filter_policy->Name()); - BlockHandle handle; - if (FindMetaBlock(iter.get(), filter_block_key, &handle).ok()) { - filter = ReadFilter(handle, rep_, filter_block_prefix, &filter_size); + std::string filter_block_key = kFilterBlockPrefix; + filter_block_key.append(rep_->filter_policy->Name()); + BlockHandle handle; + if (FindMetaBlock(iter.get(), filter_block_key, &handle).ok()) { + filter = ReadFilter(handle, rep_, &filter_size); + assert(filter); + assert(filter_size > 0); - if (filter == nullptr) break; // err happen in ReadFilter - assert(filter_size > 0); - cache_handle = block_cache->Insert( - key, filter, 
filter_size, &DeleteCachedEntry); - RecordTick(statistics, BLOCK_CACHE_ADD); - break; - } + cache_handle = block_cache->Insert( + key, filter, filter_size, &DeleteCachedEntry); + RecordTick(statistics, BLOCK_CACHE_ADD); } } } @@ -833,7 +830,7 @@ Iterator* BlockBasedTable::NewIndexIterator(const ReadOptions& read_options, char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; auto key = GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, rep_->footer.index_handle(), cache_key); - Statistics* statistics = rep_->ioptions.statistics; + Statistics* statistics = rep_->options.statistics.get(); auto cache_handle = GetEntryFromCache(block_cache, key, BLOCK_CACHE_INDEX_MISS, BLOCK_CACHE_INDEX_HIT, statistics); @@ -909,7 +906,7 @@ Iterator* BlockBasedTable::NewDataBlockIterator(Rep* rep, // If either block cache is enabled, we'll try to read from it. if (block_cache != nullptr || block_cache_compressed != nullptr) { - Statistics* statistics = rep->ioptions.statistics; + Statistics* statistics = rep->options.statistics.get(); char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; char compressed_cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; Slice key, /* key to the block cache */ @@ -917,8 +914,8 @@ Iterator* BlockBasedTable::NewDataBlockIterator(Rep* rep, // create key for block cache if (block_cache != nullptr) { - key = GetCacheKey(rep->cache_key_prefix, rep->cache_key_prefix_size, - handle, cache_key); + key = GetCacheKey(rep->cache_key_prefix, + rep->cache_key_prefix_size, handle, cache_key); } if (block_cache_compressed != nullptr) { @@ -933,9 +930,9 @@ Iterator* BlockBasedTable::NewDataBlockIterator(Rep* rep, if (block.value == nullptr && !no_io && ro.fill_cache) { Block* raw_block = nullptr; { - StopWatch sw(rep->ioptions.env, statistics, READ_BLOCK_GET_MICROS); + StopWatch sw(rep->options.env, statistics, READ_BLOCK_GET_MICROS); s = ReadBlockFromFile(rep->file.get(), rep->footer, ro, handle, - &raw_block, rep->ioptions.env, + 
&raw_block, rep->options.env, block_cache_compressed == nullptr); } @@ -958,7 +955,7 @@ Iterator* BlockBasedTable::NewDataBlockIterator(Rep* rep, } } s = ReadBlockFromFile(rep->file.get(), rep->footer, ro, handle, - &block.value, rep->ioptions.env); + &block.value, rep->options.env); } Iterator* iter; @@ -985,8 +982,7 @@ class BlockBasedTable::BlockEntryIteratorState : public TwoLevelIteratorState { public: BlockEntryIteratorState(BlockBasedTable* table, const ReadOptions& read_options) - : TwoLevelIteratorState( - table->rep_->ioptions.prefix_extractor != nullptr), + : TwoLevelIteratorState(table->rep_->options.prefix_extractor != nullptr), table_(table), read_options_(read_options) {} @@ -1024,8 +1020,8 @@ bool BlockBasedTable::PrefixMayMatch(const Slice& internal_key) { return true; } - assert(rep_->ioptions.prefix_extractor != nullptr); - auto prefix = rep_->ioptions.prefix_extractor->Transform( + assert(rep_->options.prefix_extractor != nullptr); + auto prefix = rep_->options.prefix_extractor->Transform( ExtractUserKey(internal_key)); InternalKey internal_key_prefix(prefix, 0, kTypeValue); auto internal_prefix = internal_key_prefix.Encode(); @@ -1038,59 +1034,50 @@ bool BlockBasedTable::PrefixMayMatch(const Slice& internal_key) { // loaded to memory. 
ReadOptions no_io_read_options; no_io_read_options.read_tier = kBlockCacheTier; + unique_ptr iiter(NewIndexIterator(no_io_read_options)); + iiter->Seek(internal_prefix); - // First, try check with full filter - auto filter_entry = GetFilter(true /* no io */); - FilterBlockReader* filter = filter_entry.value; - if (filter != nullptr && !filter->IsBlockBased()) { - may_match = filter->PrefixMayMatch(prefix); + if (!iiter->Valid()) { + // we're past end of file + // if it's incomplete, it means that we avoided I/O + // and we're not really sure that we're past the end + // of the file + may_match = iiter->status().IsIncomplete(); + } else if (ExtractUserKey(iiter->key()).starts_with( + ExtractUserKey(internal_prefix))) { + // we need to check for this subtle case because our only + // guarantee is that "the key is a string >= last key in that data + // block" according to the doc/table_format.txt spec. + // + // Suppose iiter->key() starts with the desired prefix; it is not + // necessarily the case that the corresponding data block will + // contain the prefix, since iiter->key() need not be in the + // block. However, the next data block may contain the prefix, so + // we return true to play it safe. + may_match = true; + } else { + // iiter->key() does NOT start with the desired prefix. Because + // Seek() finds the first key that is >= the seek target, this + // means that iiter->key() > prefix. Thus, any data blocks coming + // after the data block corresponding to iiter->key() cannot + // possibly contain the key. Thus, the corresponding data block + // is the only one which could potentially contain the prefix. 
+ Slice handle_value = iiter->value(); + BlockHandle handle; + s = handle.DecodeFrom(&handle_value); + assert(s.ok()); + auto filter_entry = GetFilter(true /* no io */); + may_match = filter_entry.value == nullptr || + filter_entry.value->PrefixMayMatch(handle.offset(), prefix); + filter_entry.Release(rep_->table_options.block_cache.get()); } - // Then, try find it within each block - if (may_match) { - unique_ptr iiter(NewIndexIterator(no_io_read_options)); - iiter->Seek(internal_prefix); - - if (!iiter->Valid()) { - // we're past end of file - // if it's incomplete, it means that we avoided I/O - // and we're not really sure that we're past the end - // of the file - may_match = iiter->status().IsIncomplete(); - } else if (ExtractUserKey(iiter->key()).starts_with( - ExtractUserKey(internal_prefix))) { - // we need to check for this subtle case because our only - // guarantee is that "the key is a string >= last key in that data - // block" according to the doc/table_format.txt spec. - // - // Suppose iiter->key() starts with the desired prefix; it is not - // necessarily the case that the corresponding data block will - // contain the prefix, since iiter->key() need not be in the - // block. However, the next data block may contain the prefix, so - // we return true to play it safe. - may_match = true; - } else if (filter != nullptr && filter->IsBlockBased()) { - // iiter->key() does NOT start with the desired prefix. Because - // Seek() finds the first key that is >= the seek target, this - // means that iiter->key() > prefix. Thus, any data blocks coming - // after the data block corresponding to iiter->key() cannot - // possibly contain the key. Thus, the corresponding data block - // is the only on could potentially contain the prefix. 
- Slice handle_value = iiter->value(); - BlockHandle handle; - s = handle.DecodeFrom(&handle_value); - assert(s.ok()); - may_match = filter->PrefixMayMatch(prefix, handle.offset()); - } - } - - Statistics* statistics = rep_->ioptions.statistics; + Statistics* statistics = rep_->options.statistics.get(); RecordTick(statistics, BLOOM_FILTER_PREFIX_CHECKED); if (!may_match) { RecordTick(statistics, BLOOM_FILTER_PREFIX_USEFUL); } - filter_entry.Release(rep_->table_options.block_cache.get()); return may_match; } @@ -1101,74 +1088,69 @@ Iterator* BlockBasedTable::NewIterator(const ReadOptions& read_options, } Status BlockBasedTable::Get( - const ReadOptions& read_options, const Slice& key, - GetContext* get_context) { + const ReadOptions& read_options, const Slice& key, void* handle_context, + bool (*result_handler)(void* handle_context, const ParsedInternalKey& k, + const Slice& v), + void (*mark_key_may_exist_handler)(void* handle_context)) { Status s; + BlockIter iiter; + NewIndexIterator(read_options, &iiter); + auto filter_entry = GetFilter(read_options.read_tier == kBlockCacheTier); FilterBlockReader* filter = filter_entry.value; + bool done = false; + for (iiter.Seek(key); iiter.Valid() && !done; iiter.Next()) { + Slice handle_value = iiter.value(); - // First check the full filter - // If full filter not useful, Then go into each block - if (filter != nullptr && !filter->IsBlockBased() - && !filter->KeyMayMatch(ExtractUserKey(key))) { - RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_USEFUL); - } else { - BlockIter iiter; - NewIndexIterator(read_options, &iiter); + BlockHandle handle; + bool may_not_exist_in_filter = + filter != nullptr && handle.DecodeFrom(&handle_value).ok() && + !filter->KeyMayMatch(handle.offset(), ExtractUserKey(key)); - bool done = false; - for (iiter.Seek(key); iiter.Valid() && !done; iiter.Next()) { - Slice handle_value = iiter.value(); + if (may_not_exist_in_filter) { + // Not found + // TODO: think about interaction with Merge. 
If a user key cannot + // cross one data block, we should be fine. + RecordTick(rep_->options.statistics.get(), BLOOM_FILTER_USEFUL); + break; + } else { + BlockIter biter; + NewDataBlockIterator(rep_, read_options, iiter.value(), &biter); - BlockHandle handle; - bool not_exist_in_filter = - filter != nullptr && filter->IsBlockBased() == true && - handle.DecodeFrom(&handle_value).ok() && - !filter->KeyMayMatch(ExtractUserKey(key), handle.offset()); - - if (not_exist_in_filter) { - // Not found - // TODO: think about interaction with Merge. If a user key cannot - // cross one data block, we should be fine. - RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_USEFUL); + if (read_options.read_tier && biter.status().IsIncomplete()) { + // couldn't get block from block_cache + // Update Saver.state to Found because we are only looking for whether + // we can guarantee the key is not there when "no_io" is set + (*mark_key_may_exist_handler)(handle_context); break; - } else { - BlockIter biter; - NewDataBlockIterator(rep_, read_options, iiter.value(), &biter); - - if (read_options.read_tier && biter.status().IsIncomplete()) { - // couldn't get block from block_cache - // Update Saver.state to Found because we are only looking for whether - // we can guarantee the key is not there when "no_io" is set - get_context->MarkKeyMayExist(); - break; - } - if (!biter.status().ok()) { - s = biter.status(); - break; - } - - // Call the *saver function on each entry/block until it returns false - for (biter.Seek(key); biter.Valid(); biter.Next()) { - ParsedInternalKey parsed_key; - if (!ParseInternalKey(biter.key(), &parsed_key)) { - s = Status::Corruption(Slice()); - } - - if (!get_context->SaveValue(parsed_key, biter.value())) { - done = true; - break; - } - } - s = biter.status(); } - } - if (s.ok()) { - s = iiter.status(); + if (!biter.status().ok()) { + s = biter.status(); + break; + } + + // Call the *saver function on each entry/block until it returns false + for 
(biter.Seek(key); biter.Valid(); biter.Next()) { + ParsedInternalKey parsed_key; + if (!ParseInternalKey(biter.key(), &parsed_key)) { + s = Status::Corruption(Slice()); + } + + if (!(*result_handler)(handle_context, parsed_key, + biter.value())) { + done = true; + break; + } + } + s = biter.status(); } } filter_entry.Release(rep_->table_options.block_cache.get()); + if (s.ok()) { + s = iiter.status(); + } + return s; } @@ -1188,8 +1170,8 @@ bool BlockBasedTable::TEST_KeyInCache(const ReadOptions& options, char cache_key_storage[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; Slice cache_key = - GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, - handle, cache_key_storage); + GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, handle, + cache_key_storage); Slice ckey; s = GetDataBlockFromCache(cache_key, ckey, block_cache, nullptr, nullptr, @@ -1223,13 +1205,13 @@ Status BlockBasedTable::CreateIndexReader(IndexReader** index_reader, } auto file = rep_->file.get(); - auto env = rep_->ioptions.env; + auto env = rep_->options.env; auto comparator = &rep_->internal_comparator; const Footer& footer = rep_->footer; if (index_type_on_file == BlockBasedTableOptions::kHashSearch && - rep_->ioptions.prefix_extractor == nullptr) { - Log(rep_->ioptions.info_log, + rep_->options.prefix_extractor == nullptr) { + Log(rep_->options.info_log, "BlockBasedTableOptions::kHashSearch requires " "options.prefix_extractor to be set." " Fall back to binary seach index."); @@ -1250,7 +1232,7 @@ Status BlockBasedTable::CreateIndexReader(IndexReader** index_reader, if (!s.ok()) { // we simply fall back to binary search in case there is any // problem with prefix hash index loading. - Log(rep_->ioptions.info_log, + Log(rep_->options.info_log, "Unable to read the metaindex block." 
" Fall back to binary seach index."); return BinarySearchIndexReader::Create( @@ -1262,7 +1244,7 @@ Status BlockBasedTable::CreateIndexReader(IndexReader** index_reader, // We need to wrap data with internal_prefix_transform to make sure it can // handle prefix correctly. rep_->internal_prefix_transform.reset( - new InternalKeySliceTransform(rep_->ioptions.prefix_extractor)); + new InternalKeySliceTransform(rep_->options.prefix_extractor.get())); return HashIndexReader::Create( rep_->internal_prefix_transform.get(), footer, file, env, comparator, footer.index_handle(), meta_index_iter, index_reader, diff --git a/table/block_based_table_reader.h b/table/block_based_table_reader.h index b272c4d13a..3ff97dda68 100644 --- a/table/block_based_table_reader.h +++ b/table/block_based_table_reader.h @@ -14,7 +14,6 @@ #include #include -#include "rocksdb/options.h" #include "rocksdb/statistics.h" #include "rocksdb/status.h" #include "rocksdb/table.h" @@ -28,8 +27,6 @@ class BlockIter; class BlockHandle; class Cache; class FilterBlockReader; -class BlockBasedFilterBlockReader; -class FullFilterBlockReader; class Footer; class InternalKeyComparator; class Iterator; @@ -39,8 +36,8 @@ class TableReader; class WritableFile; struct BlockBasedTableOptions; struct EnvOptions; +struct Options; struct ReadOptions; -class GetContext; using std::unique_ptr; @@ -50,7 +47,6 @@ using std::unique_ptr; class BlockBasedTable : public TableReader { public: static const std::string kFilterBlockPrefix; - static const std::string kFullFilterBlockPrefix; // Attempt to open the table that is stored in bytes [0..file_size) // of "file", and read the metadata entries necessary to allow @@ -62,8 +58,7 @@ class BlockBasedTable : public TableReader { // to nullptr and returns a non-ok status. // // *file must remain live while this Table is in use. 
- static Status Open(const ImmutableCFOptions& ioptions, - const EnvOptions& env_options, + static Status Open(const Options& db_options, const EnvOptions& env_options, const BlockBasedTableOptions& table_options, const InternalKeyComparator& internal_key_comparator, unique_ptr&& file, uint64_t file_size, @@ -77,7 +72,11 @@ class BlockBasedTable : public TableReader { Iterator* NewIterator(const ReadOptions&, Arena* arena = nullptr) override; Status Get(const ReadOptions& readOptions, const Slice& key, - GetContext* get_context) override; + void* handle_context, + bool (*result_handler)(void* handle_context, + const ParsedInternalKey& k, const Slice& v), + void (*mark_key_may_exist_handler)(void* handle_context) = + nullptr) override; // Given a key, return an approximate byte offset in the file where // the data for that key begins (or would begin if the key were @@ -184,9 +183,7 @@ class BlockBasedTable : public TableReader { // Create the filter from the filter block. static FilterBlockReader* ReadFilter(const BlockHandle& filter_handle, - Rep* rep, - const std::string& filter_block_prefix, - size_t* filter_size = nullptr); + Rep* rep, size_t* filter_size = nullptr); static void SetupCacheKeyPrefix(Rep* rep); diff --git a/table/block_builder.h b/table/block_builder.h index c01a23bea9..3b5b2b4444 100644 --- a/table/block_builder.h +++ b/table/block_builder.h @@ -15,13 +15,15 @@ namespace rocksdb { +class Comparator; + class BlockBuilder { public: BlockBuilder(const BlockBuilder&) = delete; void operator=(const BlockBuilder&) = delete; - + explicit BlockBuilder(int block_restart_interval); - + // Reset the contents as if the BlockBuilder was just constructed. 
void Reset(); diff --git a/table/block_prefix_index.cc b/table/block_prefix_index.cc index d64b73b984..f06dcd9fe7 100644 --- a/table/block_prefix_index.cc +++ b/table/block_prefix_index.cc @@ -210,8 +210,8 @@ Status BlockPrefixIndex::Create(const SliceTransform* internal_prefix_extractor, return s; } -uint32_t BlockPrefixIndex::GetBlocks(const Slice& key, - uint32_t** blocks) { +const uint32_t BlockPrefixIndex::GetBlocks(const Slice& key, + uint32_t** blocks) { Slice prefix = internal_prefix_extractor_->Transform(key); uint32_t bucket = PrefixToBucket(prefix, num_buckets_); diff --git a/table/block_prefix_index.h b/table/block_prefix_index.h index 662bc09aae..2afecadd26 100644 --- a/table/block_prefix_index.h +++ b/table/block_prefix_index.h @@ -23,7 +23,7 @@ class BlockPrefixIndex { // the key, based on the prefix. // Returns the total number of relevant blocks, 0 means the key does // not exist. - uint32_t GetBlocks(const Slice& key, uint32_t** blocks); + const uint32_t GetBlocks(const Slice& key, uint32_t** blocks); size_t ApproximateMemoryUsage() const { return sizeof(BlockPrefixIndex) + diff --git a/table/block_test.cc b/table/block_test.cc index 6b82c4d93e..b36787f8f5 100644 --- a/table/block_test.cc +++ b/table/block_test.cc @@ -92,7 +92,8 @@ TEST(BlockTest, SimpleTest) { BlockContents contents; contents.data = rawblock; contents.cachable = false; - Block reader(std::move(contents)); + contents.heap_allocated = false; + Block reader(contents); // read contents of block sequentially int count = 0; @@ -142,6 +143,7 @@ BlockContents GetBlockContents(std::unique_ptr *builder, BlockContents contents; contents.data = rawblock; contents.cachable = false; + contents.heap_allocated = false; return contents; } @@ -151,10 +153,8 @@ void CheckBlockContents(BlockContents contents, const int max_key, const std::vector &values) { const size_t prefix_size = 6; // create block reader - BlockContents contents_ref(contents.data, contents.cachable, - contents.compression_type); 
- Block reader1(std::move(contents)); - Block reader2(std::move(contents_ref)); + Block reader1(contents); + Block reader2(contents); std::unique_ptr prefix_extractor( NewFixedPrefixTransform(prefix_size)); @@ -212,7 +212,7 @@ TEST(BlockTest, SimpleIndexHash) { std::unique_ptr builder; auto contents = GetBlockContents(&builder, keys, values); - CheckBlockContents(std::move(contents), kMaxKey, keys, values); + CheckBlockContents(contents, kMaxKey, keys, values); } TEST(BlockTest, IndexHashWithSharedPrefix) { @@ -231,7 +231,7 @@ TEST(BlockTest, IndexHashWithSharedPrefix) { std::unique_ptr builder; auto contents = GetBlockContents(&builder, keys, values, kPrefixGroup); - CheckBlockContents(std::move(contents), kMaxKey, keys, values); + CheckBlockContents(contents, kMaxKey, keys, values); } } // namespace rocksdb diff --git a/table/bloom_block.cc b/table/bloom_block.cc index cfea8a2c5d..c44ab66ca2 100644 --- a/table/bloom_block.cc +++ b/table/bloom_block.cc @@ -11,7 +11,7 @@ namespace rocksdb { -void BloomBlockBuilder::AddKeysHashes(const std::vector& keys_hashes) { +void BloomBlockBuilder::AddKeysHashes(const std::vector keys_hashes) { for (auto hash : keys_hashes) { bloom_.AddHash(hash); } diff --git a/table/bloom_block.h b/table/bloom_block.h index 7ef5d14b6d..d55453edaa 100644 --- a/table/bloom_block.h +++ b/table/bloom_block.h @@ -26,7 +26,7 @@ class BloomBlockBuilder { uint32_t GetNumBlocks() const { return bloom_.GetNumBlocks(); } - void AddKeysHashes(const std::vector& keys_hashes); + void AddKeysHashes(const std::vector keys_hashes); Slice Finish(); diff --git a/table/cuckoo_table_builder.cc b/table/cuckoo_table_builder.cc index 6ff1fa0cf7..6326d37876 100644 --- a/table/cuckoo_table_builder.cc +++ b/table/cuckoo_table_builder.cc @@ -35,12 +35,6 @@ const std::string CuckooTablePropertyNames::kIsLastLevel = "rocksdb.cuckoo.file.islastlevel"; const std::string CuckooTablePropertyNames::kCuckooBlockSize = "rocksdb.cuckoo.hash.cuckooblocksize"; -const std::string 
CuckooTablePropertyNames::kIdentityAsFirstHash = - "rocksdb.cuckoo.hash.identityfirst"; -const std::string CuckooTablePropertyNames::kUseModuleHash = - "rocksdb.cuckoo.hash.usemodule"; -const std::string CuckooTablePropertyNames::kUserKeyLength = - "rocksdb.cuckoo.hash.userkeylength"; // Obtained by running echo rocksdb.table.cuckoo | sha1sum extern const uint64_t kCuckooTableMagicNumber = 0x926789d0c5f17873ull; @@ -49,7 +43,6 @@ CuckooTableBuilder::CuckooTableBuilder( WritableFile* file, double max_hash_table_ratio, uint32_t max_num_hash_table, uint32_t max_search_depth, const Comparator* user_comparator, uint32_t cuckoo_block_size, - bool use_module_hash, bool identity_as_first_hash, uint64_t (*get_slice_hash)(const Slice&, uint32_t, uint64_t)) : num_hash_func_(2), file_(file), @@ -57,19 +50,13 @@ CuckooTableBuilder::CuckooTableBuilder( max_num_hash_func_(max_num_hash_table), max_search_depth_(max_search_depth), cuckoo_block_size_(std::max(1U, cuckoo_block_size)), - hash_table_size_(use_module_hash ? 0 : 2), + hash_table_size_(2), is_last_level_file_(false), has_seen_first_key_(false), - has_seen_first_value_(false), - key_size_(0), - value_size_(0), - num_entries_(0), - num_values_(0), ucomp_(user_comparator), - use_module_hash_(use_module_hash), - identity_as_first_hash_(identity_as_first_hash), get_slice_hash_(get_slice_hash), closed_(false) { + properties_.num_entries = 0; // Data is in a huge block. 
properties_.num_data_blocks = 1; properties_.index_size = 0; @@ -77,7 +64,7 @@ CuckooTableBuilder::CuckooTableBuilder( } void CuckooTableBuilder::Add(const Slice& key, const Slice& value) { - if (num_entries_ >= kMaxVectorIdx - 1) { + if (properties_.num_entries >= kMaxVectorIdx - 1) { status_ = Status::NotSupported("Number of keys in a file must be < 2^32-1"); return; } @@ -86,12 +73,6 @@ void CuckooTableBuilder::Add(const Slice& key, const Slice& value) { status_ = Status::Corruption("Unable to parse key into inernal key."); return; } - if (ikey.type != kTypeDeletion && ikey.type != kTypeValue) { - status_ = Status::NotSupported("Unsupported key type " + - std::to_string(ikey.type)); - return; - } - // Determine if we can ignore the sequence number and value type from // internal keys by looking at sequence number from first key. We assume // that if first key has a zero sequence number, then all the remaining @@ -101,40 +82,15 @@ void CuckooTableBuilder::Add(const Slice& key, const Slice& value) { has_seen_first_key_ = true; smallest_user_key_.assign(ikey.user_key.data(), ikey.user_key.size()); largest_user_key_.assign(ikey.user_key.data(), ikey.user_key.size()); - key_size_ = is_last_level_file_ ? ikey.user_key.size() : key.size(); - } - if (key_size_ != (is_last_level_file_ ? ikey.user_key.size() : key.size())) { - status_ = Status::NotSupported("all keys have to be the same size"); - return; } // Even if one sequence number is non-zero, then it is not last level. 
assert(!is_last_level_file_ || ikey.sequence == 0); - - if (ikey.type == kTypeValue) { - if (!has_seen_first_value_) { - has_seen_first_value_ = true; - value_size_ = value.size(); - } - if (value_size_ != value.size()) { - status_ = Status::NotSupported("all values have to be the same size"); - return; - } - - if (is_last_level_file_) { - kvs_.append(ikey.user_key.data(), ikey.user_key.size()); - } else { - kvs_.append(key.data(), key.size()); - } - kvs_.append(value.data(), value.size()); - ++num_values_; + if (is_last_level_file_) { + kvs_.emplace_back(std::make_pair( + ikey.user_key.ToString(), value.ToString())); } else { - if (is_last_level_file_) { - deleted_keys_.append(ikey.user_key.data(), ikey.user_key.size()); - } else { - deleted_keys_.append(key.data(), key.size()); - } + kvs_.emplace_back(std::make_pair(key.ToString(), value.ToString())); } - ++num_entries_; // In order to fill the empty buckets in the hash table, we identify a // key which is not used so far (unused_user_key). We determine this by @@ -146,52 +102,25 @@ void CuckooTableBuilder::Add(const Slice& key, const Slice& value) { } else if (ikey.user_key.compare(largest_user_key_) > 0) { largest_user_key_.assign(ikey.user_key.data(), ikey.user_key.size()); } - if (!use_module_hash_) { - if (hash_table_size_ < num_entries_ / max_hash_table_ratio_) { - hash_table_size_ *= 2; - } + if (hash_table_size_ < kvs_.size() / max_hash_table_ratio_) { + hash_table_size_ *= 2; } } -bool CuckooTableBuilder::IsDeletedKey(uint64_t idx) const { - assert(closed_); - return idx >= num_values_; -} - -Slice CuckooTableBuilder::GetKey(uint64_t idx) const { - assert(closed_); - if (IsDeletedKey(idx)) { - return Slice(&deleted_keys_[(idx - num_values_) * key_size_], key_size_); - } - return Slice(&kvs_[idx * (key_size_ + value_size_)], key_size_); -} - -Slice CuckooTableBuilder::GetUserKey(uint64_t idx) const { - assert(closed_); - return is_last_level_file_ ? 
GetKey(idx) : ExtractUserKey(GetKey(idx)); -} - -Slice CuckooTableBuilder::GetValue(uint64_t idx) const { - assert(closed_); - if (IsDeletedKey(idx)) { - static std::string empty_value(value_size_, 'a'); - return Slice(empty_value); - } - return Slice(&kvs_[idx * (key_size_ + value_size_) + key_size_], value_size_); -} - Status CuckooTableBuilder::MakeHashTable(std::vector* buckets) { - buckets->resize(hash_table_size_ + cuckoo_block_size_ - 1); + uint64_t hash_table_size_minus_one = hash_table_size_ - 1; + buckets->resize(hash_table_size_minus_one + cuckoo_block_size_); uint64_t make_space_for_key_call_id = 0; - for (uint32_t vector_idx = 0; vector_idx < num_entries_; vector_idx++) { + for (uint32_t vector_idx = 0; vector_idx < kvs_.size(); vector_idx++) { uint64_t bucket_id; bool bucket_found = false; autovector hash_vals; - Slice user_key = GetUserKey(vector_idx); + Slice user_key = is_last_level_file_ ? kvs_[vector_idx].first : + ExtractUserKey(kvs_[vector_idx].first); for (uint32_t hash_cnt = 0; hash_cnt < num_hash_func_ && !bucket_found; ++hash_cnt) { - uint64_t hash_val = CuckooHash(user_key, hash_cnt, use_module_hash_, - hash_table_size_, identity_as_first_hash_, get_slice_hash_); + uint64_t hash_val = CuckooHash(user_key, hash_cnt, + hash_table_size_minus_one, get_slice_hash_); // If there is a collision, check next cuckoo_block_size_ locations for // empty locations. While checking, if we reach end of the hash table, // stop searching and proceed for next hash function. @@ -202,8 +131,10 @@ Status CuckooTableBuilder::MakeHashTable(std::vector* buckets) { bucket_found = true; break; } else { - if (ucomp_->Compare(user_key, - GetUserKey((*buckets)[hash_val].vector_idx)) == 0) { + if (ucomp_->Compare(user_key, is_last_level_file_ + ? 
Slice(kvs_[(*buckets)[hash_val].vector_idx].first) + : ExtractUserKey( + kvs_[(*buckets)[hash_val].vector_idx].first)) == 0) { return Status::NotSupported("Same key is being inserted again."); } hash_vals.push_back(hash_val); @@ -218,8 +149,8 @@ Status CuckooTableBuilder::MakeHashTable(std::vector* buckets) { } // We don't really need to rehash the entire table because old hashes are // still valid and we only increased the number of hash functions. - uint64_t hash_val = CuckooHash(user_key, num_hash_func_, use_module_hash_, - hash_table_size_, identity_as_first_hash_, get_slice_hash_); + uint64_t hash_val = CuckooHash(user_key, num_hash_func_, + hash_table_size_minus_one, get_slice_hash_); ++num_hash_func_; for (uint32_t block_idx = 0; block_idx < cuckoo_block_size_; ++block_idx, ++hash_val) { @@ -243,11 +174,7 @@ Status CuckooTableBuilder::Finish() { std::vector buckets; Status s; std::string unused_bucket; - if (num_entries_ > 0) { - // Calculate the real hash size if module hash is enabled. - if (use_module_hash_) { - hash_table_size_ = num_entries_ / max_hash_table_ratio_; - } + if (!kvs_.empty()) { s = MakeHashTable(&buckets); if (!s.ok()) { return s; @@ -284,13 +211,14 @@ Status CuckooTableBuilder::Finish() { AppendInternalKey(&unused_bucket, ikey); } } - properties_.num_entries = num_entries_; - properties_.fixed_key_len = key_size_; + properties_.num_entries = kvs_.size(); + properties_.fixed_key_len = unused_bucket.size(); + uint32_t value_length = kvs_.empty() ? 0 : kvs_[0].second.size(); + uint32_t bucket_size = value_length + properties_.fixed_key_len; properties_.user_collected_properties[ CuckooTablePropertyNames::kValueLength].assign( - reinterpret_cast(&value_size_), sizeof(value_size_)); + reinterpret_cast(&value_length), sizeof(value_length)); - uint64_t bucket_size = key_size_ + value_size_; unused_bucket.resize(bucket_size, 'a'); // Write the table. 
uint32_t num_added = 0; @@ -299,11 +227,9 @@ Status CuckooTableBuilder::Finish() { s = file_->Append(Slice(unused_bucket)); } else { ++num_added; - s = file_->Append(GetKey(bucket.vector_idx)); + s = file_->Append(kvs_[bucket.vector_idx].first); if (s.ok()) { - if (value_size_ > 0) { - s = file_->Append(GetValue(bucket.vector_idx)); - } + s = file_->Append(kvs_[bucket.vector_idx].second); } } if (!s.ok()) { @@ -312,7 +238,7 @@ Status CuckooTableBuilder::Finish() { } assert(num_added == NumEntries()); properties_.raw_key_size = num_added * properties_.fixed_key_len; - properties_.raw_value_size = num_added * value_size_; + properties_.raw_value_size = num_added * value_length; uint64_t offset = buckets.size() * bucket_size; properties_.data_size = offset; @@ -323,10 +249,11 @@ Status CuckooTableBuilder::Finish() { CuckooTablePropertyNames::kNumHashFunc].assign( reinterpret_cast(&num_hash_func_), sizeof(num_hash_func_)); + uint64_t hash_table_size = buckets.size() - cuckoo_block_size_ + 1; properties_.user_collected_properties[ CuckooTablePropertyNames::kHashTableSize].assign( - reinterpret_cast(&hash_table_size_), - sizeof(hash_table_size_)); + reinterpret_cast(&hash_table_size), + sizeof(hash_table_size)); properties_.user_collected_properties[ CuckooTablePropertyNames::kIsLastLevel].assign( reinterpret_cast(&is_last_level_file_), @@ -335,19 +262,6 @@ Status CuckooTableBuilder::Finish() { CuckooTablePropertyNames::kCuckooBlockSize].assign( reinterpret_cast(&cuckoo_block_size_), sizeof(cuckoo_block_size_)); - properties_.user_collected_properties[ - CuckooTablePropertyNames::kIdentityAsFirstHash].assign( - reinterpret_cast(&identity_as_first_hash_), - sizeof(identity_as_first_hash_)); - properties_.user_collected_properties[ - CuckooTablePropertyNames::kUseModuleHash].assign( - reinterpret_cast(&use_module_hash_), - sizeof(use_module_hash_)); - uint32_t user_key_len = static_cast(smallest_user_key_.size()); - properties_.user_collected_properties[ - 
CuckooTablePropertyNames::kUserKeyLength].assign( - reinterpret_cast(&user_key_len), - sizeof(user_key_len)); // Write meta blocks. MetaIndexBuilder meta_index_builder; @@ -391,30 +305,26 @@ void CuckooTableBuilder::Abandon() { } uint64_t CuckooTableBuilder::NumEntries() const { - return num_entries_; + return kvs_.size(); } uint64_t CuckooTableBuilder::FileSize() const { if (closed_) { return file_->GetFileSize(); - } else if (num_entries_ == 0) { + } else if (properties_.num_entries == 0) { return 0; } - if (use_module_hash_) { - return (key_size_ + value_size_) * num_entries_ / max_hash_table_ratio_; - } else { - // Account for buckets being a power of two. - // As elements are added, file size remains constant for a while and - // doubles its size. Since compaction algorithm stops adding elements - // only after it exceeds the file limit, we account for the extra element - // being added here. - uint64_t expected_hash_table_size = hash_table_size_; - if (expected_hash_table_size < (num_entries_ + 1) / max_hash_table_ratio_) { - expected_hash_table_size *= 2; - } - return (key_size_ + value_size_) * expected_hash_table_size - 1; + // Account for buckets being a power of two. + // As elements are added, file size remains constant for a while and doubles + // its size. Since compaction algorithm stops adding elements only after it + // exceeds the file limit, we account for the extra element being added here. + uint64_t expected_hash_table_size = hash_table_size_; + if (expected_hash_table_size < (kvs_.size() + 1) / max_hash_table_ratio_) { + expected_hash_table_size *= 2; } + return (kvs_[0].first.size() + kvs_[0].second.size()) * + expected_hash_table_size; } // This method is invoked when there is no place to insert the target key. @@ -449,13 +359,14 @@ bool CuckooTableBuilder::MakeSpaceForKey( // of the method. We store this number into the nodes that we explore in // current method call. 
// It is unlikely for the increment operation to overflow because the maximum - // no. of times this will be called is <= max_num_hash_func_ + num_entries_. + // no. of times this will be called is <= max_num_hash_func_ + kvs_.size(). for (uint32_t hash_cnt = 0; hash_cnt < num_hash_func_; ++hash_cnt) { uint64_t bucket_id = hash_vals[hash_cnt]; (*buckets)[bucket_id].make_space_for_key_call_id = make_space_for_key_call_id; tree.push_back(CuckooNode(bucket_id, 0, 0)); } + uint64_t hash_table_size_minus_one = hash_table_size_ - 1; bool null_found = false; uint32_t curr_pos = 0; while (!null_found && curr_pos < tree.size()) { @@ -467,9 +378,10 @@ bool CuckooTableBuilder::MakeSpaceForKey( CuckooBucket& curr_bucket = (*buckets)[curr_node.bucket_id]; for (uint32_t hash_cnt = 0; hash_cnt < num_hash_func_ && !null_found; ++hash_cnt) { - uint64_t child_bucket_id = CuckooHash(GetUserKey(curr_bucket.vector_idx), - hash_cnt, use_module_hash_, hash_table_size_, identity_as_first_hash_, - get_slice_hash_); + uint64_t child_bucket_id = CuckooHash( + (is_last_level_file_ ? kvs_[curr_bucket.vector_idx].first : + ExtractUserKey(Slice(kvs_[curr_bucket.vector_idx].first))), + hash_cnt, hash_table_size_minus_one, get_slice_hash_); // Iterate inside Cuckoo Block. for (uint32_t block_idx = 0; block_idx < cuckoo_block_size_; ++block_idx, ++child_bucket_id) { diff --git a/table/cuckoo_table_builder.h b/table/cuckoo_table_builder.h index 6898c1ef68..2bf206102e 100644 --- a/table/cuckoo_table_builder.h +++ b/table/cuckoo_table_builder.h @@ -24,7 +24,6 @@ class CuckooTableBuilder: public TableBuilder { WritableFile* file, double max_hash_table_ratio, uint32_t max_num_hash_func, uint32_t max_search_depth, const Comparator* user_comparator, uint32_t cuckoo_block_size, - bool use_module_hash, bool identity_as_first_hash, uint64_t (*get_slice_hash)(const Slice&, uint32_t, uint64_t)); // REQUIRES: Either Finish() or Abandon() has been called. 
@@ -75,11 +74,6 @@ class CuckooTableBuilder: public TableBuilder { uint64_t* bucket_id); Status MakeHashTable(std::vector* buckets); - inline bool IsDeletedKey(uint64_t idx) const; - inline Slice GetKey(uint64_t idx) const; - inline Slice GetUserKey(uint64_t idx) const; - inline Slice GetValue(uint64_t idx) const; - uint32_t num_hash_func_; WritableFile* file_; const double max_hash_table_ratio_; @@ -88,24 +82,11 @@ class CuckooTableBuilder: public TableBuilder { const uint32_t cuckoo_block_size_; uint64_t hash_table_size_; bool is_last_level_file_; - bool has_seen_first_key_; - bool has_seen_first_value_; - uint64_t key_size_; - uint64_t value_size_; - // A list of fixed-size key-value pairs concatenating into a string. - // Use GetKey(), GetUserKey(), and GetValue() to retrieve a specific - // key / value given an index - std::string kvs_; - std::string deleted_keys_; - // Number of key-value pairs stored in kvs_ + number of deleted keys - uint64_t num_entries_; - // Number of keys that contain value (non-deletion op) - uint64_t num_values_; Status status_; + std::vector> kvs_; TableProperties properties_; + bool has_seen_first_key_; const Comparator* ucomp_; - bool use_module_hash_; - bool identity_as_first_hash_; uint64_t (*get_slice_hash_)(const Slice& s, uint32_t index, uint64_t max_num_buckets); std::string largest_user_key_ = ""; diff --git a/table/cuckoo_table_builder_test.cc b/table/cuckoo_table_builder_test.cc index d3b3a713e0..69647d410e 100644 --- a/table/cuckoo_table_builder_test.cc +++ b/table/cuckoo_table_builder_test.cc @@ -50,6 +50,12 @@ class CuckooBuilderTest { TableProperties* props = nullptr; ASSERT_OK(ReadTableProperties(read_file.get(), read_file_size, kCuckooTableMagicNumber, env_, nullptr, &props)); + ASSERT_EQ(props->num_entries, keys.size()); + ASSERT_EQ(props->fixed_key_len, keys.empty() ? 
0 : keys[0].size()); + ASSERT_EQ(props->data_size, expected_unused_bucket.size() * + (expected_table_size + expected_cuckoo_block_size - 1)); + ASSERT_EQ(props->raw_key_size, keys.size()*props->fixed_key_len); + // Check unused bucket. std::string unused_key = props->user_collected_properties[ CuckooTablePropertyNames::kEmptyKey]; @@ -77,12 +83,6 @@ class CuckooBuilderTest { *reinterpret_cast(props->user_collected_properties[ CuckooTablePropertyNames::kIsLastLevel].data()); ASSERT_EQ(expected_is_last_level, is_last_level_found); - - ASSERT_EQ(props->num_entries, keys.size()); - ASSERT_EQ(props->fixed_key_len, keys.empty() ? 0 : keys[0].size()); - ASSERT_EQ(props->data_size, expected_unused_bucket.size() * - (expected_table_size + expected_cuckoo_block_size - 1)); - ASSERT_EQ(props->raw_key_size, keys.size()*props->fixed_key_len); delete props; // Check contents of the bucket. @@ -133,12 +133,11 @@ TEST(CuckooBuilderTest, SuccessWithEmptyFile) { fname = test::TmpDir() + "/EmptyFile"; ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); CuckooTableBuilder builder(writable_file.get(), kHashTableRatio, - 4, 100, BytewiseComparator(), 1, false, false, GetSliceHash); + 4, 100, BytewiseComparator(), 1, GetSliceHash); ASSERT_OK(builder.status()); - ASSERT_EQ(0UL, builder.FileSize()); ASSERT_OK(builder.Finish()); ASSERT_OK(writable_file->Close()); - CheckFileContents({}, {}, {}, "", 2, 2, false); + CheckFileContents({}, {}, {}, "", 0, 2, false); } TEST(CuckooBuilderTest, WriteSuccessNoCollisionFullKey) { @@ -156,25 +155,22 @@ TEST(CuckooBuilderTest, WriteSuccessNoCollisionFullKey) { for (auto& user_key : user_keys) { keys.push_back(GetInternalKey(user_key, false)); } - uint32_t expected_table_size = NextPowOf2(keys.size() / kHashTableRatio); unique_ptr writable_file; fname = test::TmpDir() + "/NoCollisionFullKey"; ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); CuckooTableBuilder builder(writable_file.get(), kHashTableRatio, - 
num_hash_fun, 100, BytewiseComparator(), 1, false, false, GetSliceHash); + num_hash_fun, 100, BytewiseComparator(), 1, GetSliceHash); ASSERT_OK(builder.status()); for (uint32_t i = 0; i < user_keys.size(); i++) { builder.Add(Slice(keys[i]), Slice(values[i])); ASSERT_EQ(builder.NumEntries(), i + 1); ASSERT_OK(builder.status()); } - uint32_t bucket_size = keys[0].size() + values[0].size(); - ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize()); ASSERT_OK(builder.Finish()); ASSERT_OK(writable_file->Close()); - ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); + uint32_t expected_table_size = NextPowOf2(keys.size() / kHashTableRatio); std::string expected_unused_bucket = GetInternalKey("key00", true); expected_unused_bucket += std::string(values[0].size(), 'a'); CheckFileContents(keys, values, expected_locations, @@ -196,25 +192,22 @@ TEST(CuckooBuilderTest, WriteSuccessWithCollisionFullKey) { for (auto& user_key : user_keys) { keys.push_back(GetInternalKey(user_key, false)); } - uint32_t expected_table_size = NextPowOf2(keys.size() / kHashTableRatio); unique_ptr writable_file; fname = test::TmpDir() + "/WithCollisionFullKey"; ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); CuckooTableBuilder builder(writable_file.get(), kHashTableRatio, - num_hash_fun, 100, BytewiseComparator(), 1, false, false, GetSliceHash); + num_hash_fun, 100, BytewiseComparator(), 1, GetSliceHash); ASSERT_OK(builder.status()); for (uint32_t i = 0; i < user_keys.size(); i++) { builder.Add(Slice(keys[i]), Slice(values[i])); ASSERT_EQ(builder.NumEntries(), i + 1); ASSERT_OK(builder.status()); } - uint32_t bucket_size = keys[0].size() + values[0].size(); - ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize()); ASSERT_OK(builder.Finish()); ASSERT_OK(writable_file->Close()); - ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); + uint32_t expected_table_size = NextPowOf2(keys.size() / kHashTableRatio); std::string 
expected_unused_bucket = GetInternalKey("key00", true); expected_unused_bucket += std::string(values[0].size(), 'a'); CheckFileContents(keys, values, expected_locations, @@ -236,27 +229,23 @@ TEST(CuckooBuilderTest, WriteSuccessWithCollisionAndCuckooBlock) { for (auto& user_key : user_keys) { keys.push_back(GetInternalKey(user_key, false)); } - uint32_t expected_table_size = NextPowOf2(keys.size() / kHashTableRatio); unique_ptr writable_file; uint32_t cuckoo_block_size = 2; fname = test::TmpDir() + "/WithCollisionFullKey2"; ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); CuckooTableBuilder builder(writable_file.get(), kHashTableRatio, - num_hash_fun, 100, BytewiseComparator(), cuckoo_block_size, - false, false, GetSliceHash); + num_hash_fun, 100, BytewiseComparator(), cuckoo_block_size, GetSliceHash); ASSERT_OK(builder.status()); for (uint32_t i = 0; i < user_keys.size(); i++) { builder.Add(Slice(keys[i]), Slice(values[i])); ASSERT_EQ(builder.NumEntries(), i + 1); ASSERT_OK(builder.status()); } - uint32_t bucket_size = keys[0].size() + values[0].size(); - ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize()); ASSERT_OK(builder.Finish()); ASSERT_OK(writable_file->Close()); - ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); + uint32_t expected_table_size = NextPowOf2(keys.size() / kHashTableRatio); std::string expected_unused_bucket = GetInternalKey("key00", true); expected_unused_bucket += std::string(values[0].size(), 'a'); CheckFileContents(keys, values, expected_locations, @@ -283,25 +272,22 @@ TEST(CuckooBuilderTest, WithCollisionPathFullKey) { for (auto& user_key : user_keys) { keys.push_back(GetInternalKey(user_key, false)); } - uint32_t expected_table_size = NextPowOf2(keys.size() / kHashTableRatio); unique_ptr writable_file; fname = test::TmpDir() + "/WithCollisionPathFullKey"; ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); CuckooTableBuilder builder(writable_file.get(), 
kHashTableRatio, - num_hash_fun, 100, BytewiseComparator(), 1, false, false, GetSliceHash); + num_hash_fun, 100, BytewiseComparator(), 1, GetSliceHash); ASSERT_OK(builder.status()); for (uint32_t i = 0; i < user_keys.size(); i++) { builder.Add(Slice(keys[i]), Slice(values[i])); ASSERT_EQ(builder.NumEntries(), i + 1); ASSERT_OK(builder.status()); } - uint32_t bucket_size = keys[0].size() + values[0].size(); - ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize()); ASSERT_OK(builder.Finish()); ASSERT_OK(writable_file->Close()); - ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); + uint32_t expected_table_size = NextPowOf2(keys.size() / kHashTableRatio); std::string expected_unused_bucket = GetInternalKey("key00", true); expected_unused_bucket += std::string(values[0].size(), 'a'); CheckFileContents(keys, values, expected_locations, @@ -325,25 +311,22 @@ TEST(CuckooBuilderTest, WithCollisionPathFullKeyAndCuckooBlock) { for (auto& user_key : user_keys) { keys.push_back(GetInternalKey(user_key, false)); } - uint32_t expected_table_size = NextPowOf2(keys.size() / kHashTableRatio); unique_ptr writable_file; fname = test::TmpDir() + "/WithCollisionPathFullKeyAndCuckooBlock"; ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); CuckooTableBuilder builder(writable_file.get(), kHashTableRatio, - num_hash_fun, 100, BytewiseComparator(), 2, false, false, GetSliceHash); + num_hash_fun, 100, BytewiseComparator(), 2, GetSliceHash); ASSERT_OK(builder.status()); for (uint32_t i = 0; i < user_keys.size(); i++) { builder.Add(Slice(keys[i]), Slice(values[i])); ASSERT_EQ(builder.NumEntries(), i + 1); ASSERT_OK(builder.status()); } - uint32_t bucket_size = keys[0].size() + values[0].size(); - ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize()); ASSERT_OK(builder.Finish()); ASSERT_OK(writable_file->Close()); - ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); + uint32_t expected_table_size = 
NextPowOf2(keys.size() / kHashTableRatio); std::string expected_unused_bucket = GetInternalKey("key00", true); expected_unused_bucket += std::string(values[0].size(), 'a'); CheckFileContents(keys, values, expected_locations, @@ -361,25 +344,22 @@ TEST(CuckooBuilderTest, WriteSuccessNoCollisionUserKey) { {user_keys[3], {3, 4, 5, 6}} }; std::vector expected_locations = {0, 1, 2, 3}; - uint32_t expected_table_size = NextPowOf2(user_keys.size() / kHashTableRatio); unique_ptr writable_file; fname = test::TmpDir() + "/NoCollisionUserKey"; ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); CuckooTableBuilder builder(writable_file.get(), kHashTableRatio, - num_hash_fun, 100, BytewiseComparator(), 1, false, false, GetSliceHash); + num_hash_fun, 100, BytewiseComparator(), 1, GetSliceHash); ASSERT_OK(builder.status()); for (uint32_t i = 0; i < user_keys.size(); i++) { builder.Add(Slice(GetInternalKey(user_keys[i], true)), Slice(values[i])); ASSERT_EQ(builder.NumEntries(), i + 1); ASSERT_OK(builder.status()); } - uint32_t bucket_size = user_keys[0].size() + values[0].size(); - ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize()); ASSERT_OK(builder.Finish()); ASSERT_OK(writable_file->Close()); - ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); + uint32_t expected_table_size = NextPowOf2(user_keys.size() / kHashTableRatio); std::string expected_unused_bucket = "key00"; expected_unused_bucket += std::string(values[0].size(), 'a'); CheckFileContents(user_keys, values, expected_locations, @@ -397,25 +377,22 @@ TEST(CuckooBuilderTest, WriteSuccessWithCollisionUserKey) { {user_keys[3], {0, 1, 2, 3}}, }; std::vector expected_locations = {0, 1, 2, 3}; - uint32_t expected_table_size = NextPowOf2(user_keys.size() / kHashTableRatio); unique_ptr writable_file; fname = test::TmpDir() + "/WithCollisionUserKey"; ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); CuckooTableBuilder builder(writable_file.get(), 
kHashTableRatio, - num_hash_fun, 100, BytewiseComparator(), 1, false, false, GetSliceHash); + num_hash_fun, 100, BytewiseComparator(), 1, GetSliceHash); ASSERT_OK(builder.status()); for (uint32_t i = 0; i < user_keys.size(); i++) { builder.Add(Slice(GetInternalKey(user_keys[i], true)), Slice(values[i])); ASSERT_EQ(builder.NumEntries(), i + 1); ASSERT_OK(builder.status()); } - uint32_t bucket_size = user_keys[0].size() + values[0].size(); - ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize()); ASSERT_OK(builder.Finish()); ASSERT_OK(writable_file->Close()); - ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); + uint32_t expected_table_size = NextPowOf2(user_keys.size() / kHashTableRatio); std::string expected_unused_bucket = "key00"; expected_unused_bucket += std::string(values[0].size(), 'a'); CheckFileContents(user_keys, values, expected_locations, @@ -435,25 +412,22 @@ TEST(CuckooBuilderTest, WithCollisionPathUserKey) { {user_keys[4], {0, 2}}, }; std::vector expected_locations = {0, 1, 3, 4, 2}; - uint32_t expected_table_size = NextPowOf2(user_keys.size() / kHashTableRatio); unique_ptr writable_file; fname = test::TmpDir() + "/WithCollisionPathUserKey"; ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); CuckooTableBuilder builder(writable_file.get(), kHashTableRatio, - num_hash_fun, 2, BytewiseComparator(), 1, false, false, GetSliceHash); + num_hash_fun, 2, BytewiseComparator(), 1, GetSliceHash); ASSERT_OK(builder.status()); for (uint32_t i = 0; i < user_keys.size(); i++) { builder.Add(Slice(GetInternalKey(user_keys[i], true)), Slice(values[i])); ASSERT_EQ(builder.NumEntries(), i + 1); ASSERT_OK(builder.status()); } - uint32_t bucket_size = user_keys[0].size() + values[0].size(); - ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize()); ASSERT_OK(builder.Finish()); ASSERT_OK(writable_file->Close()); - ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); + uint32_t expected_table_size = 
NextPowOf2(user_keys.size() / kHashTableRatio); std::string expected_unused_bucket = "key00"; expected_unused_bucket += std::string(values[0].size(), 'a'); CheckFileContents(user_keys, values, expected_locations, @@ -479,7 +453,7 @@ TEST(CuckooBuilderTest, FailWhenCollisionPathTooLong) { fname = test::TmpDir() + "/WithCollisionPathUserKey"; ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); CuckooTableBuilder builder(writable_file.get(), kHashTableRatio, - num_hash_fun, 2, BytewiseComparator(), 1, false, false, GetSliceHash); + num_hash_fun, 2, BytewiseComparator(), 1, GetSliceHash); ASSERT_OK(builder.status()); for (uint32_t i = 0; i < user_keys.size(); i++) { builder.Add(Slice(GetInternalKey(user_keys[i], false)), Slice("value")); @@ -499,7 +473,7 @@ TEST(CuckooBuilderTest, FailWhenSameKeyInserted) { fname = test::TmpDir() + "/FailWhenSameKeyInserted"; ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); CuckooTableBuilder builder(writable_file.get(), kHashTableRatio, - num_hash_fun, 100, BytewiseComparator(), 1, false, false, GetSliceHash); + num_hash_fun, 100, BytewiseComparator(), 1, GetSliceHash); ASSERT_OK(builder.status()); builder.Add(Slice(GetInternalKey(user_key, false)), Slice("value1")); diff --git a/table/cuckoo_table_factory.cc b/table/cuckoo_table_factory.cc index 4afc9fc2e2..e2cc6fd891 100644 --- a/table/cuckoo_table_factory.cc +++ b/table/cuckoo_table_factory.cc @@ -11,12 +11,11 @@ #include "table/cuckoo_table_reader.h" namespace rocksdb { - -Status CuckooTableFactory::NewTableReader(const ImmutableCFOptions& ioptions, - const EnvOptions& env_options, const InternalKeyComparator& icomp, +Status CuckooTableFactory::NewTableReader(const Options& options, + const EnvOptions& soptions, const InternalKeyComparator& icomp, std::unique_ptr&& file, uint64_t file_size, std::unique_ptr* table) const { - std::unique_ptr new_reader(new CuckooTableReader(ioptions, + std::unique_ptr new_reader(new CuckooTableReader(options, 
std::move(file), file_size, icomp.user_comparator(), nullptr)); Status s = new_reader->status(); if (s.ok()) { @@ -26,15 +25,10 @@ Status CuckooTableFactory::NewTableReader(const ImmutableCFOptions& ioptions, } TableBuilder* CuckooTableFactory::NewTableBuilder( - const ImmutableCFOptions& ioptions, - const InternalKeyComparator& internal_comparator, - WritableFile* file, const CompressionType, - const CompressionOptions&) const { - // TODO: change builder to take the option struct - return new CuckooTableBuilder(file, table_options_.hash_table_ratio, 64, - table_options_.max_search_depth, internal_comparator.user_comparator(), - table_options_.cuckoo_block_size, table_options_.use_module_hash, - table_options_.identity_as_first_hash, nullptr); + const Options& options, const InternalKeyComparator& internal_comparator, + WritableFile* file, CompressionType compression_type) const { + return new CuckooTableBuilder(file, hash_table_ratio_, 64, max_search_depth_, + internal_comparator.user_comparator(), cuckoo_block_size_, nullptr); } std::string CuckooTableFactory::GetPrintableTableOptions() const { @@ -44,22 +38,21 @@ std::string CuckooTableFactory::GetPrintableTableOptions() const { char buffer[kBufferSize]; snprintf(buffer, kBufferSize, " hash_table_ratio: %lf\n", - table_options_.hash_table_ratio); + hash_table_ratio_); ret.append(buffer); snprintf(buffer, kBufferSize, " max_search_depth: %u\n", - table_options_.max_search_depth); + max_search_depth_); ret.append(buffer); snprintf(buffer, kBufferSize, " cuckoo_block_size: %u\n", - table_options_.cuckoo_block_size); - ret.append(buffer); - snprintf(buffer, kBufferSize, " identity_as_first_hash: %d\n", - table_options_.identity_as_first_hash); + cuckoo_block_size_); ret.append(buffer); return ret; } -TableFactory* NewCuckooTableFactory(const CuckooTableOptions& table_options) { - return new CuckooTableFactory(table_options); +TableFactory* NewCuckooTableFactory(double hash_table_ratio, + uint32_t max_search_depth, 
uint32_t cuckoo_block_size) { + return new CuckooTableFactory( + hash_table_ratio, max_search_depth, cuckoo_block_size); } } // namespace rocksdb diff --git a/table/cuckoo_table_factory.h b/table/cuckoo_table_factory.h index 599908678b..5799a7f23f 100644 --- a/table/cuckoo_table_factory.h +++ b/table/cuckoo_table_factory.h @@ -9,33 +9,21 @@ #include #include "rocksdb/table.h" #include "util/murmurhash.h" -#include "rocksdb/options.h" namespace rocksdb { const uint32_t kCuckooMurmurSeedMultiplier = 816922183; static inline uint64_t CuckooHash( - const Slice& user_key, uint32_t hash_cnt, bool use_module_hash, - uint64_t table_size_, bool identity_as_first_hash, + const Slice& user_key, uint32_t hash_cnt, uint64_t table_size_minus_one, uint64_t (*get_slice_hash)(const Slice&, uint32_t, uint64_t)) { #ifndef NDEBUG // This part is used only in unit tests. if (get_slice_hash != nullptr) { - return get_slice_hash(user_key, hash_cnt, table_size_); + return get_slice_hash(user_key, hash_cnt, table_size_minus_one + 1); } #endif - uint64_t value = 0; - if (hash_cnt == 0 && identity_as_first_hash) { - value = (*reinterpret_cast(user_key.data())); - } else { - value = MurmurHash(user_key.data(), user_key.size(), - kCuckooMurmurSeedMultiplier * hash_cnt); - } - if (use_module_hash) { - return value % table_size_; - } else { - return value & (table_size_ - 1); - } + return MurmurHash(user_key.data(), user_key.size(), + kCuckooMurmurSeedMultiplier * hash_cnt) & table_size_minus_one; } // Cuckoo Table is designed for applications that require fast point lookups @@ -47,21 +35,24 @@ static inline uint64_t CuckooHash( // - Does not support Merge operations. 
class CuckooTableFactory : public TableFactory { public: - explicit CuckooTableFactory(const CuckooTableOptions& table_options) - : table_options_(table_options) {} + CuckooTableFactory(double hash_table_ratio, uint32_t max_search_depth, + uint32_t cuckoo_block_size) + : hash_table_ratio_(hash_table_ratio), + max_search_depth_(max_search_depth), + cuckoo_block_size_(cuckoo_block_size) {} ~CuckooTableFactory() {} const char* Name() const override { return "CuckooTable"; } Status NewTableReader( - const ImmutableCFOptions& ioptions, const EnvOptions& env_options, + const Options& options, const EnvOptions& soptions, const InternalKeyComparator& internal_comparator, unique_ptr&& file, uint64_t file_size, unique_ptr* table) const override; - TableBuilder* NewTableBuilder(const ImmutableCFOptions& options, + TableBuilder* NewTableBuilder(const Options& options, const InternalKeyComparator& icomparator, WritableFile* file, - const CompressionType, const CompressionOptions&) const override; + CompressionType compression_type) const override; // Sanitizes the specified DB Options. 
Status SanitizeDBOptions(const DBOptions* db_opts) const override { @@ -71,7 +62,9 @@ class CuckooTableFactory : public TableFactory { std::string GetPrintableTableOptions() const override; private: - const CuckooTableOptions table_options_; + const double hash_table_ratio_; + const uint32_t max_search_depth_; + const uint32_t cuckoo_block_size_; }; } // namespace rocksdb diff --git a/table/cuckoo_table_reader.cc b/table/cuckoo_table_reader.cc index f39900addb..f1dcbc3bb6 100644 --- a/table/cuckoo_table_reader.cc +++ b/table/cuckoo_table_reader.cc @@ -16,23 +16,20 @@ #include #include #include "rocksdb/iterator.h" -#include "rocksdb/table.h" #include "table/meta_blocks.h" #include "table/cuckoo_table_factory.h" -#include "table/get_context.h" #include "util/arena.h" #include "util/coding.h" namespace rocksdb { namespace { -const uint64_t CACHE_LINE_MASK = ~((uint64_t)CACHE_LINE_SIZE - 1); -const uint32_t kInvalidIndex = std::numeric_limits::max(); + static const uint64_t CACHE_LINE_MASK = ~((uint64_t)CACHE_LINE_SIZE - 1); } extern const uint64_t kCuckooTableMagicNumber; CuckooTableReader::CuckooTableReader( - const ImmutableCFOptions& ioptions, + const Options& options, std::unique_ptr&& file, uint64_t file_size, const Comparator* comparator, @@ -40,12 +37,12 @@ CuckooTableReader::CuckooTableReader( : file_(std::move(file)), ucomp_(comparator), get_slice_hash_(get_slice_hash) { - if (!ioptions.allow_mmap_reads) { + if (!options.allow_mmap_reads) { status_ = Status::InvalidArgument("File is not mmaped"); } TableProperties* props = nullptr; status_ = ReadTableProperties(file_.get(), file_size, kCuckooTableMagicNumber, - ioptions.env, ioptions.info_log, &props); + options.env, options.info_log.get(), &props); if (!status_.ok()) { return; } @@ -53,29 +50,21 @@ CuckooTableReader::CuckooTableReader( auto& user_props = props->user_collected_properties; auto hash_funs = user_props.find(CuckooTablePropertyNames::kNumHashFunc); if (hash_funs == user_props.end()) { - status_ 
= Status::Corruption("Number of hash functions not found"); + status_ = Status::InvalidArgument("Number of hash functions not found"); return; } num_hash_func_ = *reinterpret_cast(hash_funs->second.data()); auto unused_key = user_props.find(CuckooTablePropertyNames::kEmptyKey); if (unused_key == user_props.end()) { - status_ = Status::Corruption("Empty bucket value not found"); + status_ = Status::InvalidArgument("Empty bucket value not found"); return; } unused_key_ = unused_key->second; key_length_ = props->fixed_key_len; - auto user_key_len = user_props.find(CuckooTablePropertyNames::kUserKeyLength); - if (user_key_len == user_props.end()) { - status_ = Status::Corruption("User key length not found"); - return; - } - user_key_length_ = *reinterpret_cast( - user_key_len->second.data()); - auto value_length = user_props.find(CuckooTablePropertyNames::kValueLength); if (value_length == user_props.end()) { - status_ = Status::Corruption("Value length not found"); + status_ = Status::InvalidArgument("Value length not found"); return; } value_length_ = *reinterpret_cast( @@ -85,40 +74,21 @@ CuckooTableReader::CuckooTableReader( auto hash_table_size = user_props.find( CuckooTablePropertyNames::kHashTableSize); if (hash_table_size == user_props.end()) { - status_ = Status::Corruption("Hash table size not found"); + status_ = Status::InvalidArgument("Hash table size not found"); return; } - table_size_ = *reinterpret_cast( - hash_table_size->second.data()); - + table_size_minus_one_ = *reinterpret_cast( + hash_table_size->second.data()) - 1; auto is_last_level = user_props.find(CuckooTablePropertyNames::kIsLastLevel); if (is_last_level == user_props.end()) { - status_ = Status::Corruption("Is last level not found"); + status_ = Status::InvalidArgument("Is last level not found"); return; } is_last_level_ = *reinterpret_cast(is_last_level->second.data()); - - auto identity_as_first_hash = user_props.find( - CuckooTablePropertyNames::kIdentityAsFirstHash); - if 
(identity_as_first_hash == user_props.end()) { - status_ = Status::Corruption("identity as first hash not found"); - return; - } - identity_as_first_hash_ = *reinterpret_cast( - identity_as_first_hash->second.data()); - - auto use_module_hash = user_props.find( - CuckooTablePropertyNames::kUseModuleHash); - if (use_module_hash == user_props.end()) { - status_ = Status::Corruption("hash type is not found"); - return; - } - use_module_hash_ = *reinterpret_cast( - use_module_hash->second.data()); auto cuckoo_block_size = user_props.find( CuckooTablePropertyNames::kCuckooBlockSize); if (cuckoo_block_size == user_props.end()) { - status_ = Status::Corruption("Cuckoo block size not found"); + status_ = Status::InvalidArgument("Cuckoo block size not found"); return; } cuckoo_block_size_ = *reinterpret_cast( @@ -127,32 +97,36 @@ CuckooTableReader::CuckooTableReader( status_ = file_->Read(0, file_size, &file_data_, nullptr); } -Status CuckooTableReader::Get(const ReadOptions& readOptions, const Slice& key, - GetContext* get_context) { +Status CuckooTableReader::Get( + const ReadOptions& readOptions, const Slice& key, void* handle_context, + bool (*result_handler)(void* arg, const ParsedInternalKey& k, + const Slice& v), + void (*mark_key_may_exist_handler)(void* handle_context)) { assert(key.size() == key_length_ + (is_last_level_ ? 
8 : 0)); Slice user_key = ExtractUserKey(key); for (uint32_t hash_cnt = 0; hash_cnt < num_hash_func_; ++hash_cnt) { uint64_t offset = bucket_length_ * CuckooHash( - user_key, hash_cnt, use_module_hash_, table_size_, - identity_as_first_hash_, get_slice_hash_); + user_key, hash_cnt, table_size_minus_one_, get_slice_hash_); const char* bucket = &file_data_.data()[offset]; for (uint32_t block_idx = 0; block_idx < cuckoo_block_size_; - ++block_idx, bucket += bucket_length_) { + ++block_idx, bucket += bucket_length_) { if (ucomp_->Compare(Slice(unused_key_.data(), user_key.size()), - Slice(bucket, user_key.size())) == 0) { + Slice(bucket, user_key.size())) == 0) { return Status::OK(); } // Here, we compare only the user key part as we support only one entry // per user key and we don't support sanpshot. if (ucomp_->Compare(user_key, Slice(bucket, user_key.size())) == 0) { - Slice value(bucket + key_length_, value_length_); + Slice value = Slice(&bucket[key_length_], value_length_); if (is_last_level_) { - get_context->SaveValue(value); + ParsedInternalKey found_ikey( + Slice(bucket, key_length_), 0, kTypeValue); + result_handler(handle_context, found_ikey, value); } else { Slice full_key(bucket, key_length_); ParsedInternalKey found_ikey; ParseInternalKey(full_key, &found_ikey); - get_context->SaveValue(found_ikey, value); + result_handler(handle_context, found_ikey, value); } // We don't support merge operations. So, we return here. return Status::OK(); @@ -166,8 +140,7 @@ void CuckooTableReader::Prepare(const Slice& key) { // Prefetch the first Cuckoo Block. 
Slice user_key = ExtractUserKey(key); uint64_t addr = reinterpret_cast(file_data_.data()) + - bucket_length_ * CuckooHash(user_key, 0, use_module_hash_, table_size_, - identity_as_first_hash_, nullptr); + bucket_length_ * CuckooHash(user_key, 0, table_size_minus_one_, nullptr); uint64_t end_addr = addr + cuckoo_block_bytes_minus_one_; for (addr &= CACHE_LINE_MASK; addr < end_addr; addr += CACHE_LINE_SIZE) { PREFETCH(reinterpret_cast(addr), 0, 3); @@ -187,43 +160,33 @@ class CuckooTableIterator : public Iterator { Slice key() const override; Slice value() const override; Status status() const override { return status_; } - void InitIfNeeded(); + void LoadKeysFromReader(); private: - struct BucketComparator { - BucketComparator(const Slice& file_data, const Comparator* ucomp, - uint32_t bucket_len, uint32_t user_key_len, - const Slice target = Slice()) - : file_data_(file_data), - ucomp_(ucomp), - bucket_len_(bucket_len), - user_key_len_(user_key_len), - target_(target) {} - bool operator()(const uint32_t first, const uint32_t second) const { - const char* first_bucket = - (first == kInvalidIndex) ? target_.data() : - &file_data_.data()[first * bucket_len_]; - const char* second_bucket = - (second == kInvalidIndex) ? 
target_.data() : - &file_data_.data()[second * bucket_len_]; - return ucomp_->Compare(Slice(first_bucket, user_key_len_), - Slice(second_bucket, user_key_len_)) < 0; + struct CompareKeys { + CompareKeys(const Comparator* ucomp, const bool last_level) + : ucomp_(ucomp), + is_last_level_(last_level) {} + bool operator()(const std::pair& first, + const std::pair& second) const { + if (is_last_level_) { + return ucomp_->Compare(first.first, second.first) < 0; + } else { + return ucomp_->Compare(ExtractUserKey(first.first), + ExtractUserKey(second.first)) < 0; + } } - private: - const Slice file_data_; - const Comparator* ucomp_; - const uint32_t bucket_len_; - const uint32_t user_key_len_; - const Slice target_; - }; - const BucketComparator bucket_comparator_; + private: + const Comparator* ucomp_; + const bool is_last_level_; + }; + const CompareKeys comparator_; void PrepareKVAtCurrIdx(); CuckooTableReader* reader_; - bool initialized_; Status status_; // Contains a map of keys to bucket_id sorted in key order. - std::vector sorted_bucket_ids_; + std::vector> key_to_bucket_id_; // We assume that the number of items can be stored in uint32 (4 Billion). 
uint32_t curr_key_idx_; Slice curr_value_; @@ -234,66 +197,57 @@ class CuckooTableIterator : public Iterator { }; CuckooTableIterator::CuckooTableIterator(CuckooTableReader* reader) - : bucket_comparator_(reader->file_data_, reader->ucomp_, - reader->bucket_length_, reader->user_key_length_), + : comparator_(reader->ucomp_, reader->is_last_level_), reader_(reader), - initialized_(false), - curr_key_idx_(kInvalidIndex) { - sorted_bucket_ids_.clear(); + curr_key_idx_(std::numeric_limits::max()) { + key_to_bucket_id_.clear(); curr_value_.clear(); curr_key_.Clear(); } -void CuckooTableIterator::InitIfNeeded() { - if (initialized_) { - return; - } - sorted_bucket_ids_.reserve(reader_->GetTableProperties()->num_entries); - uint64_t num_buckets = reader_->table_size_ + reader_->cuckoo_block_size_ - 1; - assert(num_buckets < kInvalidIndex); - const char* bucket = reader_->file_data_.data(); - for (uint32_t bucket_id = 0; bucket_id < num_buckets; ++bucket_id) { - if (Slice(bucket, reader_->key_length_) != Slice(reader_->unused_key_)) { - sorted_bucket_ids_.push_back(bucket_id); +void CuckooTableIterator::LoadKeysFromReader() { + key_to_bucket_id_.reserve(reader_->GetTableProperties()->num_entries); + uint64_t num_buckets = reader_->table_size_minus_one_ + + reader_->cuckoo_block_size_; + for (uint32_t bucket_id = 0; bucket_id < num_buckets; bucket_id++) { + Slice read_key; + status_ = reader_->file_->Read(bucket_id * reader_->bucket_length_, + reader_->key_length_, &read_key, nullptr); + if (read_key != Slice(reader_->unused_key_)) { + key_to_bucket_id_.push_back(std::make_pair(read_key, bucket_id)); } - bucket += reader_->bucket_length_; } - assert(sorted_bucket_ids_.size() == + assert(key_to_bucket_id_.size() == reader_->GetTableProperties()->num_entries); - std::sort(sorted_bucket_ids_.begin(), sorted_bucket_ids_.end(), - bucket_comparator_); - curr_key_idx_ = kInvalidIndex; - initialized_ = true; + std::sort(key_to_bucket_id_.begin(), key_to_bucket_id_.end(), 
comparator_); + curr_key_idx_ = key_to_bucket_id_.size(); } void CuckooTableIterator::SeekToFirst() { - InitIfNeeded(); curr_key_idx_ = 0; PrepareKVAtCurrIdx(); } void CuckooTableIterator::SeekToLast() { - InitIfNeeded(); - curr_key_idx_ = sorted_bucket_ids_.size() - 1; + curr_key_idx_ = key_to_bucket_id_.size() - 1; PrepareKVAtCurrIdx(); } void CuckooTableIterator::Seek(const Slice& target) { - InitIfNeeded(); - const BucketComparator seek_comparator( - reader_->file_data_, reader_->ucomp_, - reader_->bucket_length_, reader_->user_key_length_, - ExtractUserKey(target)); - auto seek_it = std::lower_bound(sorted_bucket_ids_.begin(), - sorted_bucket_ids_.end(), - kInvalidIndex, - seek_comparator); - curr_key_idx_ = std::distance(sorted_bucket_ids_.begin(), seek_it); + // We assume that the target is an internal key. If this is last level file, + // we need to take only the user key part to seek. + Slice target_to_search = reader_->is_last_level_ ? + ExtractUserKey(target) : target; + auto seek_it = std::lower_bound(key_to_bucket_id_.begin(), + key_to_bucket_id_.end(), + std::make_pair(target_to_search, 0), + comparator_); + curr_key_idx_ = std::distance(key_to_bucket_id_.begin(), seek_it); PrepareKVAtCurrIdx(); } bool CuckooTableIterator::Valid() const { - return curr_key_idx_ < sorted_bucket_ids_.size(); + return curr_key_idx_ < key_to_bucket_id_.size(); } void CuckooTableIterator::PrepareKVAtCurrIdx() { @@ -302,17 +256,15 @@ void CuckooTableIterator::PrepareKVAtCurrIdx() { curr_key_.Clear(); return; } - uint32_t id = sorted_bucket_ids_[curr_key_idx_]; - const char* offset = reader_->file_data_.data() + - id * reader_->bucket_length_; + uint64_t offset = ((uint64_t) key_to_bucket_id_[curr_key_idx_].second + * reader_->bucket_length_) + reader_->key_length_; + status_ = reader_->file_->Read(offset, reader_->value_length_, + &curr_value_, nullptr); if (reader_->is_last_level_) { // Always return internal key. 
- curr_key_.SetInternalKey(Slice(offset, reader_->user_key_length_), - 0, kTypeValue); - } else { - curr_key_.SetKey(Slice(offset, reader_->key_length_)); + curr_key_.SetInternalKey( + key_to_bucket_id_[curr_key_idx_].first, 0, kTypeValue); } - curr_value_ = Slice(offset + reader_->key_length_, reader_->value_length_); } void CuckooTableIterator::Next() { @@ -327,7 +279,7 @@ void CuckooTableIterator::Next() { void CuckooTableIterator::Prev() { if (curr_key_idx_ == 0) { - curr_key_idx_ = sorted_bucket_ids_.size(); + curr_key_idx_ = key_to_bucket_id_.size(); } if (!Valid()) { curr_value_.clear(); @@ -340,7 +292,11 @@ void CuckooTableIterator::Prev() { Slice CuckooTableIterator::key() const { assert(Valid()); - return curr_key_.GetKey(); + if (reader_->is_last_level_) { + return curr_key_.GetKey(); + } else { + return key_to_bucket_id_[curr_key_idx_].first; + } } Slice CuckooTableIterator::value() const { @@ -367,6 +323,9 @@ Iterator* CuckooTableReader::NewIterator( auto iter_mem = arena->AllocateAligned(sizeof(CuckooTableIterator)); iter = new (iter_mem) CuckooTableIterator(this); } + if (iter->status().ok()) { + iter->LoadKeysFromReader(); + } return iter; } diff --git a/table/cuckoo_table_reader.h b/table/cuckoo_table_reader.h index 4f00a9e417..05d5c33978 100644 --- a/table/cuckoo_table_reader.h +++ b/table/cuckoo_table_reader.h @@ -16,7 +16,6 @@ #include "db/dbformat.h" #include "rocksdb/env.h" -#include "rocksdb/options.h" #include "table/table_reader.h" namespace rocksdb { @@ -27,7 +26,7 @@ class TableReader; class CuckooTableReader: public TableReader { public: CuckooTableReader( - const ImmutableCFOptions& ioptions, + const Options& options, std::unique_ptr&& file, uint64_t file_size, const Comparator* user_comparator, @@ -40,8 +39,12 @@ class CuckooTableReader: public TableReader { Status status() const { return status_; } - Status Get(const ReadOptions& read_options, const Slice& key, - GetContext* get_context) override; + Status Get( + const ReadOptions& 
readOptions, const Slice& key, void* handle_context, + bool (*result_handler)(void* arg, const ParsedInternalKey& k, + const Slice& v), + void (*mark_key_may_exist_handler)(void* handle_context) = nullptr) + override; Iterator* NewIterator(const ReadOptions&, Arena* arena = nullptr) override; void Prepare(const Slice& target) override; @@ -60,19 +63,16 @@ class CuckooTableReader: public TableReader { std::unique_ptr file_; Slice file_data_; bool is_last_level_; - bool identity_as_first_hash_; - bool use_module_hash_; std::shared_ptr table_props_; Status status_; uint32_t num_hash_func_; std::string unused_key_; uint32_t key_length_; - uint32_t user_key_length_; uint32_t value_length_; uint32_t bucket_length_; uint32_t cuckoo_block_size_; uint32_t cuckoo_block_bytes_minus_one_; - uint64_t table_size_; + uint64_t table_size_minus_one_; const Comparator* ucomp_; uint64_t (*get_slice_hash_)(const Slice& s, uint32_t index, uint64_t max_num_buckets); diff --git a/table/cuckoo_table_reader_test.cc b/table/cuckoo_table_reader_test.cc index 66d88fc713..63fe0ae5ba 100644 --- a/table/cuckoo_table_reader_test.cc +++ b/table/cuckoo_table_reader_test.cc @@ -11,10 +11,7 @@ int main() { } #else -#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS -#endif - #include #include #include @@ -25,7 +22,6 @@ int main() { #include "table/cuckoo_table_builder.h" #include "table/cuckoo_table_reader.h" #include "table/cuckoo_table_factory.h" -#include "table/get_context.h" #include "util/arena.h" #include "util/random.h" #include "util/testharness.h" @@ -39,7 +35,6 @@ DEFINE_string(file_dir, "", "Directory where the files will be created" DEFINE_bool(enable_perf, false, "Run Benchmark Tests too."); DEFINE_bool(write, false, "Should write new values to file in performance tests?"); -DEFINE_bool(identity_as_first_hash, true, "use identity as first hash"); namespace rocksdb { @@ -62,6 +57,25 @@ uint64_t GetSliceHash(const Slice& s, uint32_t index, return hash_map[s.ToString()][index]; } 
+// Methods, variables for checking key and values read. +struct ValuesToAssert { + ValuesToAssert(const std::string& key, const Slice& value) + : expected_user_key(key), + expected_value(value), + call_count(0) {} + std::string expected_user_key; + Slice expected_value; + int call_count; +}; + +bool AssertValues(void* assert_obj, + const ParsedInternalKey& k, const Slice& v) { + ValuesToAssert *ptr = reinterpret_cast(assert_obj); + ASSERT_EQ(ptr->expected_value.ToString(), v.ToString()); + ASSERT_EQ(ptr->expected_user_key, k.user_key.ToString()); + ++ptr->call_count; + return false; +} } // namespace class CuckooReaderTest { @@ -92,8 +106,7 @@ class CuckooReaderTest { std::unique_ptr writable_file; ASSERT_OK(env->NewWritableFile(fname, &writable_file, env_options)); CuckooTableBuilder builder( - writable_file.get(), 0.9, kNumHashFunc, 100, ucomp, 2, - false, false, GetSliceHash); + writable_file.get(), 0.9, kNumHashFunc, 100, ucomp, 2, GetSliceHash); ASSERT_OK(builder.status()); for (uint32_t key_idx = 0; key_idx < num_items; ++key_idx) { builder.Add(Slice(keys[key_idx]), Slice(values[key_idx])); @@ -108,22 +121,18 @@ class CuckooReaderTest { // Check reader now. 
std::unique_ptr read_file; ASSERT_OK(env->NewRandomAccessFile(fname, &read_file, env_options)); - const ImmutableCFOptions ioptions(options); CuckooTableReader reader( - ioptions, + options, std::move(read_file), file_size, ucomp, GetSliceHash); ASSERT_OK(reader.status()); - // Assume no merge/deletion for (uint32_t i = 0; i < num_items; ++i) { - std::string value; - GetContext get_context(ucomp, nullptr, nullptr, nullptr, - GetContext::kNotFound, Slice(user_keys[i]), &value, - nullptr, nullptr); - ASSERT_OK(reader.Get(ReadOptions(), Slice(keys[i]), &get_context)); - ASSERT_EQ(values[i], value); + ValuesToAssert v(user_keys[i], values[i]); + ASSERT_OK(reader.Get( + ReadOptions(), Slice(keys[i]), &v, AssertValues, nullptr)); + ASSERT_EQ(1, v.call_count); } } void UpdateKeys(bool with_zero_seqno) { @@ -138,9 +147,8 @@ class CuckooReaderTest { void CheckIterator(const Comparator* ucomp = BytewiseComparator()) { std::unique_ptr read_file; ASSERT_OK(env->NewRandomAccessFile(fname, &read_file, env_options)); - const ImmutableCFOptions ioptions(options); CuckooTableReader reader( - ioptions, + options, std::move(read_file), file_size, ucomp, @@ -314,16 +322,14 @@ TEST(CuckooReaderTest, WhenKeyNotFound) { // Make all hash values collide. AddHashLookups(user_keys[i], 0, kNumHashFunc); } - auto* ucmp = BytewiseComparator(); CreateCuckooFileAndCheckReader(); std::unique_ptr read_file; ASSERT_OK(env->NewRandomAccessFile(fname, &read_file, env_options)); - const ImmutableCFOptions ioptions(options); CuckooTableReader reader( - ioptions, + options, std::move(read_file), file_size, - ucmp, + BytewiseComparator(), GetSliceHash); ASSERT_OK(reader.status()); // Search for a key with colliding hash values. 
@@ -332,11 +338,10 @@ TEST(CuckooReaderTest, WhenKeyNotFound) { AddHashLookups(not_found_user_key, 0, kNumHashFunc); ParsedInternalKey ikey(not_found_user_key, 1000, kTypeValue); AppendInternalKey(¬_found_key, ikey); - std::string value; - GetContext get_context(ucmp, nullptr, nullptr, nullptr, GetContext::kNotFound, - Slice(not_found_key), &value, nullptr, nullptr); - ASSERT_OK(reader.Get(ReadOptions(), Slice(not_found_key), &get_context)); - ASSERT_TRUE(value.empty()); + ValuesToAssert v("", ""); + ASSERT_OK(reader.Get( + ReadOptions(), Slice(not_found_key), &v, AssertValues, nullptr)); + ASSERT_EQ(0, v.call_count); ASSERT_OK(reader.status()); // Search for a key with an independent hash value. std::string not_found_user_key2 = "key" + NumToStr(num_items + 1); @@ -344,11 +349,9 @@ TEST(CuckooReaderTest, WhenKeyNotFound) { ParsedInternalKey ikey2(not_found_user_key2, 1000, kTypeValue); std::string not_found_key2; AppendInternalKey(¬_found_key2, ikey2); - GetContext get_context2(ucmp, nullptr, nullptr, nullptr, - GetContext::kNotFound, Slice(not_found_key2), &value, - nullptr, nullptr); - ASSERT_OK(reader.Get(ReadOptions(), Slice(not_found_key2), &get_context2)); - ASSERT_TRUE(value.empty()); + ASSERT_OK(reader.Get( + ReadOptions(), Slice(not_found_key2), &v, AssertValues, nullptr)); + ASSERT_EQ(0, v.call_count); ASSERT_OK(reader.status()); // Test read when key is unused key. @@ -358,25 +361,34 @@ TEST(CuckooReaderTest, WhenKeyNotFound) { // Add hash values that map to empty buckets. 
AddHashLookups(ExtractUserKey(unused_key).ToString(), kNumHashFunc, kNumHashFunc); - GetContext get_context3(ucmp, nullptr, nullptr, nullptr, - GetContext::kNotFound, Slice(unused_key), &value, - nullptr, nullptr); - ASSERT_OK(reader.Get(ReadOptions(), Slice(unused_key), &get_context3)); - ASSERT_TRUE(value.empty()); + ASSERT_OK(reader.Get( + ReadOptions(), Slice(unused_key), &v, AssertValues, nullptr)); + ASSERT_EQ(0, v.call_count); ASSERT_OK(reader.status()); } // Performance tests namespace { +bool DoNothing(void* arg, const ParsedInternalKey& k, const Slice& v) { + // Deliberately empty. + return false; +} + +bool CheckValue(void* cnt_ptr, const ParsedInternalKey& k, const Slice& v) { + ++*reinterpret_cast(cnt_ptr); + std::string expected_value; + AppendInternalKey(&expected_value, k); + ASSERT_EQ(0, v.compare(Slice(&expected_value[0], v.size()))); + return false; +} + void GetKeys(uint64_t num, std::vector* keys) { - keys->clear(); IterKey k; k.SetInternalKey("", 0, kTypeValue); std::string internal_key_suffix = k.GetKey().ToString(); ASSERT_EQ(static_cast(8), internal_key_suffix.size()); for (uint64_t key_idx = 0; key_idx < num; ++key_idx) { - uint64_t value = 2 * key_idx; - std::string new_key(reinterpret_cast(&value), sizeof(value)); + std::string new_key(reinterpret_cast(&key_idx), sizeof(key_idx)); new_key += internal_key_suffix; keys->push_back(new_key); } @@ -404,8 +416,7 @@ void WriteFile(const std::vector& keys, ASSERT_OK(env->NewWritableFile(fname, &writable_file, env_options)); CuckooTableBuilder builder( writable_file.get(), hash_ratio, - 64, 1000, test::Uint64Comparator(), 5, - false, FLAGS_identity_as_first_hash, nullptr); + 64, 1000, test::Uint64Comparator(), 5, nullptr); ASSERT_OK(builder.status()); for (uint64_t key_idx = 0; key_idx < num; ++key_idx) { // Value is just a part of key. 
@@ -422,21 +433,18 @@ void WriteFile(const std::vector& keys, std::unique_ptr read_file; ASSERT_OK(env->NewRandomAccessFile(fname, &read_file, env_options)); - const ImmutableCFOptions ioptions(options); CuckooTableReader reader( - ioptions, std::move(read_file), file_size, + options, std::move(read_file), file_size, test::Uint64Comparator(), nullptr); ASSERT_OK(reader.status()); ReadOptions r_options; - std::string value; - // Assume only the fast path is triggered - GetContext get_context(nullptr, nullptr, nullptr, nullptr, - GetContext::kNotFound, Slice(), &value, - nullptr, nullptr); for (uint64_t i = 0; i < num; ++i) { - value.clear(); - ASSERT_OK(reader.Get(r_options, Slice(keys[i]), &get_context)); - ASSERT_TRUE(Slice(keys[i]) == Slice(&keys[i][0], 4)); + int cnt = 0; + ASSERT_OK(reader.Get(r_options, Slice(keys[i]), &cnt, CheckValue, nullptr)); + if (cnt != 1) { + fprintf(stderr, "%" PRIu64 " not found.\n", i); + ASSERT_EQ(1, cnt); + } } } @@ -452,9 +460,8 @@ void ReadKeys(uint64_t num, uint32_t batch_size) { std::unique_ptr read_file; ASSERT_OK(env->NewRandomAccessFile(fname, &read_file, env_options)); - const ImmutableCFOptions ioptions(options); CuckooTableReader reader( - ioptions, std::move(read_file), file_size, test::Uint64Comparator(), + options, std::move(read_file), file_size, test::Uint64Comparator(), nullptr); ASSERT_OK(reader.status()); const UserCollectedProperties user_props = @@ -467,33 +474,21 @@ void ReadKeys(uint64_t num, uint32_t batch_size) { " hash functions: %u.\n", num, num * 100.0 / (table_size), num_hash_fun); ReadOptions r_options; - std::vector keys; - keys.reserve(num); - for (uint64_t i = 0; i < num; ++i) { - keys.push_back(2 * i); - } - std::random_shuffle(keys.begin(), keys.end()); - - std::string value; - // Assume only the fast path is triggered - GetContext get_context(nullptr, nullptr, nullptr, nullptr, - GetContext::kNotFound, Slice(), &value, - nullptr, nullptr); uint64_t start_time = env->NowMicros(); if (batch_size > 
0) { for (uint64_t i = 0; i < num; i += batch_size) { for (uint64_t j = i; j < i+batch_size && j < num; ++j) { - reader.Prepare(Slice(reinterpret_cast(&keys[j]), 16)); + reader.Prepare(Slice(reinterpret_cast(&j), 16)); } for (uint64_t j = i; j < i+batch_size && j < num; ++j) { - reader.Get(r_options, Slice(reinterpret_cast(&keys[j]), 16), - &get_context); + reader.Get(r_options, Slice(reinterpret_cast(&j), 16), + nullptr, DoNothing, nullptr); } } } else { for (uint64_t i = 0; i < num; i++) { - reader.Get(r_options, Slice(reinterpret_cast(&keys[i]), 16), - &get_context); + reader.Get(r_options, Slice(reinterpret_cast(&i), 16), nullptr, + DoNothing, nullptr); } } float time_per_op = (env->NowMicros() - start_time) * 1.0 / num; @@ -511,16 +506,16 @@ TEST(CuckooReaderTest, TestReadPerformance) { // These numbers are chosen to have a hash utilizaiton % close to // 0.9, 0.75, 0.6 and 0.5 respectively. // They all create 128 M buckets. - std::vector nums = {120*1024*1024, 100*1024*1024, 80*1024*1024, - 70*1024*1024}; + std::vector nums = {120*1000*1000, 100*1000*1000, 80*1000*1000, + 70*1000*1000}; #ifndef NDEBUG fprintf(stdout, "WARNING: Not compiled with DNDEBUG. Performance tests may be slow.\n"); #endif std::vector keys; + GetKeys(*std::max_element(nums.begin(), nums.end()), &keys); for (uint64_t num : nums) { if (FLAGS_write || !Env::Default()->FileExists(GetFileName(num))) { - GetKeys(num, &keys); WriteFile(keys, num, hash_ratio); } ReadKeys(num, 0); diff --git a/table/block_based_filter_block.cc b/table/filter_block.cc similarity index 54% rename from table/block_based_filter_block.cc rename to table/filter_block.cc index fea37b67f8..6b4ff1c10d 100644 --- a/table/block_based_filter_block.cc +++ b/table/filter_block.cc @@ -7,7 +7,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
-#include "table/block_based_filter_block.h" +#include "table/filter_block.h" #include "db/dbformat.h" #include "rocksdb/filter_policy.h" @@ -15,39 +15,21 @@ namespace rocksdb { -namespace { -bool SamePrefix(const SliceTransform* prefix_extractor, - const Slice& key1, const Slice& key2) { - if (!prefix_extractor->InDomain(key1) && - !prefix_extractor->InDomain(key2)) { - return true; - } else if (!prefix_extractor->InDomain(key1) || - !prefix_extractor->InDomain(key2)) { - return false; - } else { - return (prefix_extractor->Transform(key1) == - prefix_extractor->Transform(key2)); - } -} -} // namespace - - // See doc/table_format.txt for an explanation of the filter block format. // Generate new filter every 2KB of data static const size_t kFilterBaseLg = 11; static const size_t kFilterBase = 1 << kFilterBaseLg; -BlockBasedFilterBlockBuilder::BlockBasedFilterBlockBuilder( - const SliceTransform* prefix_extractor, - const BlockBasedTableOptions& table_opt) +FilterBlockBuilder::FilterBlockBuilder(const Options& opt, + const BlockBasedTableOptions& table_opt, + const Comparator* internal_comparator) : policy_(table_opt.filter_policy.get()), - prefix_extractor_(prefix_extractor), - whole_key_filtering_(table_opt.whole_key_filtering) { - assert(policy_); -} + prefix_extractor_(opt.prefix_extractor.get()), + whole_key_filtering_(table_opt.whole_key_filtering), + comparator_(internal_comparator) {} -void BlockBasedFilterBlockBuilder::StartBlock(uint64_t block_offset) { +void FilterBlockBuilder::StartBlock(uint64_t block_offset) { uint64_t filter_index = (block_offset / kFilterBase); assert(filter_index >= filter_offsets_.size()); while (filter_index > filter_offsets_.size()) { @@ -55,45 +37,53 @@ void BlockBasedFilterBlockBuilder::StartBlock(uint64_t block_offset) { } } -void BlockBasedFilterBlockBuilder::Add(const Slice& key) { - added_to_start_ = 0; - if (whole_key_filtering_) { - AddKey(key); - added_to_start_ = 1; - } - if (prefix_extractor_ && 
prefix_extractor_->InDomain(key)) { - AddPrefix(key); +bool FilterBlockBuilder::SamePrefix(const Slice &key1, + const Slice &key2) const { + if (!prefix_extractor_->InDomain(key1) && + !prefix_extractor_->InDomain(key2)) { + return true; + } else if (!prefix_extractor_->InDomain(key1) || + !prefix_extractor_->InDomain(key2)) { + return false; + } else { + return (prefix_extractor_->Transform(key1) == + prefix_extractor_->Transform(key2)); } } -// Add key to filter if needed -inline void BlockBasedFilterBlockBuilder::AddKey(const Slice& key) { - start_.push_back(entries_.size()); - entries_.append(key.data(), key.size()); -} - -// Add prefix to filter if needed -inline void BlockBasedFilterBlockBuilder::AddPrefix(const Slice& key) { +void FilterBlockBuilder::AddKey(const Slice& key) { // get slice for most recently added entry Slice prev; - if (start_.size() > added_to_start_) { - size_t prev_start = start_[start_.size() - 1 - added_to_start_]; + size_t added_to_start = 0; + + // add key to filter if needed + if (whole_key_filtering_) { + start_.push_back(entries_.size()); + ++added_to_start; + entries_.append(key.data(), key.size()); + } + + if (start_.size() > added_to_start) { + size_t prev_start = start_[start_.size() - 1 - added_to_start]; const char* base = entries_.data() + prev_start; size_t length = entries_.size() - prev_start; prev = Slice(base, length); } - // this assumes prefix(prefix(key)) == prefix(key), as the last - // entry in entries_ may be either a key or prefix, and we use - // prefix(last entry) to get the prefix of the last key. 
- if (prev.size() == 0 || !SamePrefix(prefix_extractor_, key, prev)) { - Slice prefix = prefix_extractor_->Transform(key); - start_.push_back(entries_.size()); - entries_.append(prefix.data(), prefix.size()); + // add prefix to filter if needed + if (prefix_extractor_ && prefix_extractor_->InDomain(key)) { + // this assumes prefix(prefix(key)) == prefix(key), as the last + // entry in entries_ may be either a key or prefix, and we use + // prefix(last entry) to get the prefix of the last key. + if (prev.size() == 0 || !SamePrefix(key, prev)) { + Slice prefix = prefix_extractor_->Transform(key); + start_.push_back(entries_.size()); + entries_.append(prefix.data(), prefix.size()); + } } } -Slice BlockBasedFilterBlockBuilder::Finish() { +Slice FilterBlockBuilder::Finish() { if (!start_.empty()) { GenerateFilter(); } @@ -109,7 +99,7 @@ Slice BlockBasedFilterBlockBuilder::Finish() { return Slice(result_); } -void BlockBasedFilterBlockBuilder::GenerateFilter() { +void FilterBlockBuilder::GenerateFilter() { const size_t num_entries = start_.size(); if (num_entries == 0) { // Fast path if there are no keys for this filter @@ -122,7 +112,7 @@ void BlockBasedFilterBlockBuilder::GenerateFilter() { tmp_entries_.resize(num_entries); for (size_t i = 0; i < num_entries; i++) { const char* base = entries_.data() + start_[i]; - size_t length = start_[i + 1] - start_[i]; + size_t length = start_[i+1] - start_[i]; tmp_entries_[i] = Slice(base, length); } @@ -135,52 +125,50 @@ void BlockBasedFilterBlockBuilder::GenerateFilter() { start_.clear(); } -BlockBasedFilterBlockReader::BlockBasedFilterBlockReader( - const SliceTransform* prefix_extractor, - const BlockBasedTableOptions& table_opt, BlockContents&& contents) +FilterBlockReader::FilterBlockReader( + const Options& opt, const BlockBasedTableOptions& table_opt, + const Slice& contents, bool delete_contents_after_use) : policy_(table_opt.filter_policy.get()), - prefix_extractor_(prefix_extractor), + 
prefix_extractor_(opt.prefix_extractor.get()), whole_key_filtering_(table_opt.whole_key_filtering), data_(nullptr), offset_(nullptr), num_(0), - base_lg_(0), - contents_(std::move(contents)) { - assert(policy_); - size_t n = contents_.data.size(); + base_lg_(0) { + size_t n = contents.size(); if (n < 5) return; // 1 byte for base_lg_ and 4 for start of offset array - base_lg_ = contents_.data[n - 1]; - uint32_t last_word = DecodeFixed32(contents_.data.data() + n - 5); + base_lg_ = contents[n-1]; + uint32_t last_word = DecodeFixed32(contents.data() + n - 5); if (last_word > n - 5) return; - data_ = contents_.data.data(); + data_ = contents.data(); offset_ = data_ + last_word; num_ = (n - 5 - last_word) / 4; + if (delete_contents_after_use) { + filter_data.reset(contents.data()); + } } -bool BlockBasedFilterBlockReader::KeyMayMatch(const Slice& key, - uint64_t block_offset) { - assert(block_offset != kNotValid); +bool FilterBlockReader::KeyMayMatch(uint64_t block_offset, + const Slice& key) { if (!whole_key_filtering_) { return true; } - return MayMatch(key, block_offset); + return MayMatch(block_offset, key); } -bool BlockBasedFilterBlockReader::PrefixMayMatch(const Slice& prefix, - uint64_t block_offset) { - assert(block_offset != kNotValid); +bool FilterBlockReader::PrefixMayMatch(uint64_t block_offset, + const Slice& prefix) { if (!prefix_extractor_) { return true; } - return MayMatch(prefix, block_offset); + return MayMatch(block_offset, prefix); } -bool BlockBasedFilterBlockReader::MayMatch(const Slice& entry, - uint64_t block_offset) { +bool FilterBlockReader::MayMatch(uint64_t block_offset, const Slice& entry) { uint64_t index = block_offset >> base_lg_; if (index < num_) { - uint32_t start = DecodeFixed32(offset_ + index * 4); - uint32_t limit = DecodeFixed32(offset_ + index * 4 + 4); + uint32_t start = DecodeFixed32(offset_ + index*4); + uint32_t limit = DecodeFixed32(offset_ + index*4 + 4); if (start <= limit && limit <= (uint32_t)(offset_ - data_)) { 
Slice filter = Slice(data_ + start, limit - start); return policy_->KeyMayMatch(entry, filter); @@ -192,7 +180,7 @@ bool BlockBasedFilterBlockReader::MayMatch(const Slice& entry, return true; // Errors are treated as potential matches } -size_t BlockBasedFilterBlockReader::ApproximateMemoryUsage() const { +size_t FilterBlockReader::ApproximateMemoryUsage() const { return num_ * 4 + 5 + (offset_ - data_); } } diff --git a/table/filter_block.h b/table/filter_block.h index 1976768276..5041393f6b 100644 --- a/table/filter_block.h +++ b/table/filter_block.h @@ -10,11 +10,6 @@ // A filter block is stored near the end of a Table file. It contains // filters (e.g., bloom filters) for all data blocks in the table combined // into a single filter block. -// -// It is a base class for BlockBasedFilter and FullFilter. -// These two are both used in BlockBasedTable. The first one contain filter -// For a part of keys in sst file, the second contain filter for all keys -// in sst file. #pragma once @@ -28,11 +23,9 @@ #include "rocksdb/slice_transform.h" #include "rocksdb/table.h" #include "util/hash.h" -#include "format.h" namespace rocksdb { -const uint64_t kNotValid = ULLONG_MAX; class FilterPolicy; // A FilterBlockBuilder is used to construct all of the filters for a @@ -40,45 +33,64 @@ class FilterPolicy; // a special block in the Table. // // The sequence of calls to FilterBlockBuilder must match the regexp: -// (StartBlock Add*)* Finish -// -// BlockBased/Full FilterBlock would be called in the same way. 
+// (StartBlock AddKey*)* Finish class FilterBlockBuilder { public: - explicit FilterBlockBuilder() {} - virtual ~FilterBlockBuilder() {} + explicit FilterBlockBuilder(const Options& opt, + const BlockBasedTableOptions& table_opt, + const Comparator* internal_comparator); - virtual bool IsBlockBased() = 0; // If is blockbased filter - virtual void StartBlock(uint64_t block_offset) = 0; // Start new block filter - virtual void Add(const Slice& key) = 0; // Add a key to current filter - virtual Slice Finish() = 0; // Generate Filter + void StartBlock(uint64_t block_offset); + void AddKey(const Slice& key); + Slice Finish(); private: + bool SamePrefix(const Slice &key1, const Slice &key2) const; + void GenerateFilter(); + + // important: all of these might point to invalid addresses + // at the time of destruction of this filter block. destructor + // should NOT dereference them. + const FilterPolicy* policy_; + const SliceTransform* prefix_extractor_; + bool whole_key_filtering_; + const Comparator* comparator_; + + std::string entries_; // Flattened entry contents + std::vector start_; // Starting index in entries_ of each entry + std::string result_; // Filter data computed so far + std::vector tmp_entries_; // policy_->CreateFilter() argument + std::vector filter_offsets_; + // No copying allowed FilterBlockBuilder(const FilterBlockBuilder&); void operator=(const FilterBlockBuilder&); }; -// A FilterBlockReader is used to parse filter from SST table. -// KeyMayMatch and PrefixMayMatch would trigger filter checking -// -// BlockBased/Full FilterBlock would be called in the same way. 
class FilterBlockReader { public: - explicit FilterBlockReader() {} - virtual ~FilterBlockReader() {} - - virtual bool IsBlockBased() = 0; // If is blockbased filter - virtual bool KeyMayMatch(const Slice& key, - uint64_t block_offset = kNotValid) = 0; - virtual bool PrefixMayMatch(const Slice& prefix, - uint64_t block_offset = kNotValid) = 0; - virtual size_t ApproximateMemoryUsage() const = 0; + // REQUIRES: "contents" and *policy must stay live while *this is live. + FilterBlockReader( + const Options& opt, + const BlockBasedTableOptions& table_opt, + const Slice& contents, + bool delete_contents_after_use = false); + bool KeyMayMatch(uint64_t block_offset, const Slice& key); + bool PrefixMayMatch(uint64_t block_offset, const Slice& prefix); + size_t ApproximateMemoryUsage() const; private: - // No copying allowed - FilterBlockReader(const FilterBlockReader&); - void operator=(const FilterBlockReader&); + const FilterPolicy* policy_; + const SliceTransform* prefix_extractor_; + bool whole_key_filtering_; + const char* data_; // Pointer to filter data (at block-start) + const char* offset_; // Pointer to beginning of offset array (at block-end) + size_t num_; // Number of entries in offset array + size_t base_lg_; // Encoding parameter (see kFilterBaseLg in .cc file) + std::unique_ptr filter_data; + + + bool MayMatch(uint64_t block_offset, const Slice& entry); }; -} // namespace rocksdb +} diff --git a/table/filter_block_test.cc b/table/filter_block_test.cc new file mode 100644 index 0000000000..95496a82c2 --- /dev/null +++ b/table/filter_block_test.cc @@ -0,0 +1,139 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "table/filter_block.h" + +#include "rocksdb/filter_policy.h" +#include "util/coding.h" +#include "util/hash.h" +#include "util/logging.h" +#include "util/testharness.h" +#include "util/testutil.h" + +namespace rocksdb { + +// For testing: emit an array with one hash value per key +class TestHashFilter : public FilterPolicy { + public: + virtual const char* Name() const { + return "TestHashFilter"; + } + + virtual void CreateFilter(const Slice* keys, int n, std::string* dst) const { + for (int i = 0; i < n; i++) { + uint32_t h = Hash(keys[i].data(), keys[i].size(), 1); + PutFixed32(dst, h); + } + } + + virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const { + uint32_t h = Hash(key.data(), key.size(), 1); + for (unsigned int i = 0; i + 4 <= filter.size(); i += 4) { + if (h == DecodeFixed32(filter.data() + i)) { + return true; + } + } + return false; + } +}; + +class FilterBlockTest { + public: + Options options_; + BlockBasedTableOptions table_options_; + + FilterBlockTest() { + options_ = Options(); + table_options_.filter_policy.reset(new TestHashFilter()); + } +}; + +TEST(FilterBlockTest, EmptyBuilder) { + FilterBlockBuilder builder(options_, table_options_, options_.comparator); + Slice block = builder.Finish(); + ASSERT_EQ("\\x00\\x00\\x00\\x00\\x0b", EscapeString(block)); + FilterBlockReader reader(options_, table_options_, block); + ASSERT_TRUE(reader.KeyMayMatch(0, "foo")); + ASSERT_TRUE(reader.KeyMayMatch(100000, "foo")); +} + +TEST(FilterBlockTest, SingleChunk) { + FilterBlockBuilder builder(options_, table_options_, options_.comparator); + builder.StartBlock(100); + builder.AddKey("foo"); + builder.AddKey("bar"); + builder.AddKey("box"); + builder.StartBlock(200); + builder.AddKey("box"); + builder.StartBlock(300); + builder.AddKey("hello"); + Slice block = builder.Finish(); + 
FilterBlockReader reader(options_, table_options_, block); + ASSERT_TRUE(reader.KeyMayMatch(100, "foo")); + ASSERT_TRUE(reader.KeyMayMatch(100, "bar")); + ASSERT_TRUE(reader.KeyMayMatch(100, "box")); + ASSERT_TRUE(reader.KeyMayMatch(100, "hello")); + ASSERT_TRUE(reader.KeyMayMatch(100, "foo")); + ASSERT_TRUE(! reader.KeyMayMatch(100, "missing")); + ASSERT_TRUE(! reader.KeyMayMatch(100, "other")); +} + +TEST(FilterBlockTest, MultiChunk) { + FilterBlockBuilder builder(options_, table_options_, options_.comparator); + + // First filter + builder.StartBlock(0); + builder.AddKey("foo"); + builder.StartBlock(2000); + builder.AddKey("bar"); + + // Second filter + builder.StartBlock(3100); + builder.AddKey("box"); + + // Third filter is empty + + // Last filter + builder.StartBlock(9000); + builder.AddKey("box"); + builder.AddKey("hello"); + + Slice block = builder.Finish(); + FilterBlockReader reader(options_, table_options_, block); + + // Check first filter + ASSERT_TRUE(reader.KeyMayMatch(0, "foo")); + ASSERT_TRUE(reader.KeyMayMatch(2000, "bar")); + ASSERT_TRUE(! reader.KeyMayMatch(0, "box")); + ASSERT_TRUE(! reader.KeyMayMatch(0, "hello")); + + // Check second filter + ASSERT_TRUE(reader.KeyMayMatch(3100, "box")); + ASSERT_TRUE(! reader.KeyMayMatch(3100, "foo")); + ASSERT_TRUE(! reader.KeyMayMatch(3100, "bar")); + ASSERT_TRUE(! reader.KeyMayMatch(3100, "hello")); + + // Check third filter (empty) + ASSERT_TRUE(! reader.KeyMayMatch(4100, "foo")); + ASSERT_TRUE(! reader.KeyMayMatch(4100, "bar")); + ASSERT_TRUE(! reader.KeyMayMatch(4100, "box")); + ASSERT_TRUE(! reader.KeyMayMatch(4100, "hello")); + + // Check last filter + ASSERT_TRUE(reader.KeyMayMatch(9000, "box")); + ASSERT_TRUE(reader.KeyMayMatch(9000, "hello")); + ASSERT_TRUE(! reader.KeyMayMatch(9000, "foo")); + ASSERT_TRUE(! 
reader.KeyMayMatch(9000, "bar")); +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +} diff --git a/table/format.cc b/table/format.cc index db11f9d4a8..46105247f3 100644 --- a/table/format.cc +++ b/table/format.cc @@ -135,7 +135,7 @@ Status Footer::DecodeFrom(Slice* input) { snprintf(buffer, sizeof(buffer) - 1, "not an sstable (bad magic number --- %lx)", (long)magic); - return Status::Corruption(buffer); + return Status::InvalidArgument(buffer); } } else { set_table_magic_number(magic); @@ -156,7 +156,7 @@ Status Footer::DecodeFrom(Slice* input) { // It consists of the checksum type, two block handles, padding, // a version number, and a magic number if (input->size() < kVersion1EncodedLength) { - return Status::Corruption("input is too short to be an sstable"); + return Status::InvalidArgument("input is too short to be an sstable"); } else { input->remove_prefix(input->size() - kVersion1EncodedLength); } @@ -183,7 +183,7 @@ Status ReadFooterFromFile(RandomAccessFile* file, uint64_t file_size, Footer* footer) { if (file_size < Footer::kMinEncodedLength) { - return Status::Corruption("file is too short to be an sstable"); + return Status::InvalidArgument("file is too short to be an sstable"); } char footer_space[Footer::kMaxEncodedLength]; @@ -198,7 +198,7 @@ Status ReadFooterFromFile(RandomAccessFile* file, // Check that we actually read the whole footer from the file. It may be // that size isn't correct. 
if (footer_input.size() < Footer::kMinEncodedLength) { - return Status::Corruption("file is too short to be an sstable"); + return Status::InvalidArgument("file is too short to be an sstable"); } return footer->DecodeFrom(&footer_input); @@ -255,54 +255,112 @@ Status ReadBlock(RandomAccessFile* file, const Footer& footer, return s; } -Status ReadBlockContents(RandomAccessFile* file, const Footer& footer, - const ReadOptions& options, const BlockHandle& handle, - BlockContents* contents, Env* env, - bool decompression_requested) { - Status status; - Slice slice; - size_t n = static_cast(handle.size()); - std::unique_ptr heap_buf; - char stack_buf[DefaultStackBufferSize]; - char* used_buf = nullptr; - rocksdb::CompressionType compression_type; +// Decompress a block according to params +// May need to malloc a space for cache usage +Status DecompressBlock(BlockContents* result, size_t block_size, + bool do_uncompress, const char* buf, + const Slice& contents, bool use_stack_buf) { + Status s; + size_t n = block_size; + const char* data = contents.data(); - if (decompression_requested && - n + kBlockTrailerSize < DefaultStackBufferSize) { - // If we've got a small enough hunk of data, read it in to the - // trivially allocated stack buffer instead of needing a full malloc() - used_buf = &stack_buf[0]; - } else { - heap_buf = std::unique_ptr(new char[n + kBlockTrailerSize]); - used_buf = heap_buf.get(); - } - - status = ReadBlock(file, footer, options, handle, &slice, used_buf); - - if (!status.ok()) { - return status; - } + result->data = Slice(); + result->cachable = false; + result->heap_allocated = false; PERF_TIMER_GUARD(block_decompress_time); + rocksdb::CompressionType compression_type = + static_cast(data[n]); + // If the caller has requested that the block not be uncompressed + if (!do_uncompress || compression_type == kNoCompression) { + if (data != buf) { + // File implementation gave us pointer to some other data. 
+ // Use it directly under the assumption that it will be live + // while the file is open. + result->data = Slice(data, n); + result->heap_allocated = false; + result->cachable = false; // Do not double-cache + } else { + if (use_stack_buf) { + // Need to allocate space in heap for cache usage + char* new_buf = new char[n]; + memcpy(new_buf, buf, n); + result->data = Slice(new_buf, n); + } else { + result->data = Slice(buf, n); + } - compression_type = static_cast(slice.data()[n]); + result->heap_allocated = true; + result->cachable = true; + } + result->compression_type = compression_type; + s = Status::OK(); + } else { + s = UncompressBlockContents(data, n, result); + } + return s; +} - if (decompression_requested && compression_type != kNoCompression) { - return UncompressBlockContents(slice.data(), n, contents); +// Read and Decompress block +// Use buf in stack as temp reading buffer +Status ReadAndDecompressFast(RandomAccessFile* file, const Footer& footer, + const ReadOptions& options, + const BlockHandle& handle, BlockContents* result, + Env* env, bool do_uncompress) { + Status s; + Slice contents; + size_t n = static_cast(handle.size()); + char buf[DefaultStackBufferSize]; + + s = ReadBlock(file, footer, options, handle, &contents, buf); + if (!s.ok()) { + return s; + } + s = DecompressBlock(result, n, do_uncompress, buf, contents, true); + if (!s.ok()) { + return s; + } + return s; +} + +// Read and Decompress block +// Use buf in heap as temp reading buffer +Status ReadAndDecompress(RandomAccessFile* file, const Footer& footer, + const ReadOptions& options, const BlockHandle& handle, + BlockContents* result, Env* env, bool do_uncompress) { + Status s; + Slice contents; + size_t n = static_cast(handle.size()); + char* buf = new char[n + kBlockTrailerSize]; + + s = ReadBlock(file, footer, options, handle, &contents, buf); + if (!s.ok()) { + delete[] buf; + return s; + } + s = DecompressBlock(result, n, do_uncompress, buf, contents, false); + if (!s.ok()) 
{ + delete[] buf; + return s; } - if (slice.data() != used_buf) { - *contents = BlockContents(Slice(slice.data(), n), false, compression_type); - return status; + if (result->data.data() != buf) { + delete[] buf; } + return s; +} - if (used_buf == &stack_buf[0]) { - heap_buf = std::unique_ptr(new char[n]); - memcpy(heap_buf.get(), stack_buf, n); +Status ReadBlockContents(RandomAccessFile* file, const Footer& footer, + const ReadOptions& options, const BlockHandle& handle, + BlockContents* result, Env* env, bool do_uncompress) { + size_t n = static_cast(handle.size()); + if (do_uncompress && n + kBlockTrailerSize < DefaultStackBufferSize) { + return ReadAndDecompressFast(file, footer, options, handle, result, env, + do_uncompress); + } else { + return ReadAndDecompress(file, footer, options, handle, result, env, + do_uncompress); } - - *contents = BlockContents(std::move(heap_buf), n, true, compression_type); - return status; } // @@ -312,8 +370,8 @@ Status ReadBlockContents(RandomAccessFile* file, const Footer& footer, // buffer is returned via 'result' and it is upto the caller to // free this buffer. 
Status UncompressBlockContents(const char* data, size_t n, - BlockContents* contents) { - std::unique_ptr ubuf; + BlockContents* result) { + char* ubuf = nullptr; int decompress_size = 0; assert(data[n] != kNoCompression); switch (data[n]) { @@ -324,60 +382,64 @@ Status UncompressBlockContents(const char* data, size_t n, if (!port::Snappy_GetUncompressedLength(data, n, &ulength)) { return Status::Corruption(snappy_corrupt_msg); } - ubuf = std::unique_ptr(new char[ulength]); - if (!port::Snappy_Uncompress(data, n, ubuf.get())) { + ubuf = new char[ulength]; + if (!port::Snappy_Uncompress(data, n, ubuf)) { + delete[] ubuf; return Status::Corruption(snappy_corrupt_msg); } - *contents = BlockContents(std::move(ubuf), ulength, true, kNoCompression); + result->data = Slice(ubuf, ulength); + result->heap_allocated = true; + result->cachable = true; break; } case kZlibCompression: - ubuf = std::unique_ptr( - port::Zlib_Uncompress(data, n, &decompress_size)); + ubuf = port::Zlib_Uncompress(data, n, &decompress_size); static char zlib_corrupt_msg[] = "Zlib not supported or corrupted Zlib compressed block contents"; if (!ubuf) { return Status::Corruption(zlib_corrupt_msg); } - *contents = - BlockContents(std::move(ubuf), decompress_size, true, kNoCompression); + result->data = Slice(ubuf, decompress_size); + result->heap_allocated = true; + result->cachable = true; break; case kBZip2Compression: - ubuf = std::unique_ptr( - port::BZip2_Uncompress(data, n, &decompress_size)); + ubuf = port::BZip2_Uncompress(data, n, &decompress_size); static char bzip2_corrupt_msg[] = "Bzip2 not supported or corrupted Bzip2 compressed block contents"; if (!ubuf) { return Status::Corruption(bzip2_corrupt_msg); } - *contents = - BlockContents(std::move(ubuf), decompress_size, true, kNoCompression); + result->data = Slice(ubuf, decompress_size); + result->heap_allocated = true; + result->cachable = true; break; case kLZ4Compression: - ubuf = std::unique_ptr( - port::LZ4_Uncompress(data, n, 
&decompress_size)); + ubuf = port::LZ4_Uncompress(data, n, &decompress_size); static char lz4_corrupt_msg[] = "LZ4 not supported or corrupted LZ4 compressed block contents"; if (!ubuf) { return Status::Corruption(lz4_corrupt_msg); } - *contents = - BlockContents(std::move(ubuf), decompress_size, true, kNoCompression); + result->data = Slice(ubuf, decompress_size); + result->heap_allocated = true; + result->cachable = true; break; case kLZ4HCCompression: - ubuf = std::unique_ptr( - port::LZ4_Uncompress(data, n, &decompress_size)); + ubuf = port::LZ4_Uncompress(data, n, &decompress_size); static char lz4hc_corrupt_msg[] = "LZ4HC not supported or corrupted LZ4HC compressed block contents"; if (!ubuf) { return Status::Corruption(lz4hc_corrupt_msg); } - *contents = - BlockContents(std::move(ubuf), decompress_size, true, kNoCompression); + result->data = Slice(ubuf, decompress_size); + result->heap_allocated = true; + result->cachable = true; break; default: return Status::Corruption("bad block type"); } + result->compression_type = kNoCompression; // not compressed any more return Status::OK(); } diff --git a/table/format.h b/table/format.h index 986164d81b..a971c1a67c 100644 --- a/table/format.h +++ b/table/format.h @@ -160,29 +160,18 @@ static const size_t kBlockTrailerSize = 5; struct BlockContents { Slice data; // Actual contents of data bool cachable; // True iff data can be cached + bool heap_allocated; // True iff caller should delete[] data.data() CompressionType compression_type; - std::unique_ptr allocation; - - BlockContents() : cachable(false), compression_type(kNoCompression) {} - - BlockContents(const Slice& _data, bool _cachable, - CompressionType _compression_type) - : data(_data), cachable(_cachable), compression_type(_compression_type) {} - - BlockContents(std::unique_ptr&& _data, size_t _size, bool _cachable, - CompressionType _compression_type) - : data(_data.get(), _size), - cachable(_cachable), - compression_type(_compression_type), - 
allocation(std::move(_data)) {} }; // Read the block identified by "handle" from "file". On failure // return non-OK. On success fill *result and return OK. -extern Status ReadBlockContents(RandomAccessFile* file, const Footer& footer, +extern Status ReadBlockContents(RandomAccessFile* file, + const Footer& footer, const ReadOptions& options, const BlockHandle& handle, - BlockContents* contents, Env* env, + BlockContents* result, + Env* env, bool do_uncompress); // The 'data' points to the raw block contents read in from file. @@ -190,8 +179,9 @@ extern Status ReadBlockContents(RandomAccessFile* file, const Footer& footer, // contents are uncompresed into this buffer. This buffer is // returned via 'result' and it is upto the caller to // free this buffer. -extern Status UncompressBlockContents(const char* data, size_t n, - BlockContents* contents); +extern Status UncompressBlockContents(const char* data, + size_t n, + BlockContents* result); // Implementation details follow. Clients should ignore, diff --git a/table/full_filter_block.cc b/table/full_filter_block.cc deleted file mode 100644 index 4113ec57a9..0000000000 --- a/table/full_filter_block.cc +++ /dev/null @@ -1,103 +0,0 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. 
- -#include "table/full_filter_block.h" - -#include "rocksdb/filter_policy.h" -#include "port/port.h" -#include "util/coding.h" - -namespace rocksdb { - -FullFilterBlockBuilder::FullFilterBlockBuilder( - const SliceTransform* prefix_extractor, - const BlockBasedTableOptions& table_opt, - FilterBitsBuilder* filter_bits_builder) - : prefix_extractor_(prefix_extractor), - whole_key_filtering_(table_opt.whole_key_filtering), - num_added_(0) { - assert(filter_bits_builder != nullptr); - filter_bits_builder_.reset(filter_bits_builder); -} - -void FullFilterBlockBuilder::Add(const Slice& key) { - if (whole_key_filtering_) { - AddKey(key); - } - if (prefix_extractor_ && prefix_extractor_->InDomain(key)) { - AddPrefix(key); - } -} - -// Add key to filter if needed -inline void FullFilterBlockBuilder::AddKey(const Slice& key) { - filter_bits_builder_->AddKey(key); - num_added_++; -} - -// Add prefix to filter if needed -inline void FullFilterBlockBuilder::AddPrefix(const Slice& key) { - Slice prefix = prefix_extractor_->Transform(key); - filter_bits_builder_->AddKey(prefix); - num_added_++; -} - -Slice FullFilterBlockBuilder::Finish() { - if (num_added_ != 0) { - num_added_ = 0; - return filter_bits_builder_->Finish(&filter_data_); - } - return Slice(); -} - -FullFilterBlockReader::FullFilterBlockReader( - const SliceTransform* prefix_extractor, - const BlockBasedTableOptions& table_opt, const Slice& contents, - FilterBitsReader* filter_bits_reader) - : prefix_extractor_(prefix_extractor), - whole_key_filtering_(table_opt.whole_key_filtering), - contents_(contents) { - assert(filter_bits_reader != nullptr); - filter_bits_reader_.reset(filter_bits_reader); -} - -FullFilterBlockReader::FullFilterBlockReader( - const SliceTransform* prefix_extractor, - const BlockBasedTableOptions& table_opt, BlockContents&& contents, - FilterBitsReader* filter_bits_reader) - : FullFilterBlockReader(prefix_extractor, table_opt, contents.data, - filter_bits_reader) { - block_contents_ = 
std::move(contents); -} - -bool FullFilterBlockReader::KeyMayMatch(const Slice& key, - uint64_t block_offset) { - assert(block_offset == kNotValid); - if (!whole_key_filtering_) { - return true; - } - return MayMatch(key); -} - -bool FullFilterBlockReader::PrefixMayMatch(const Slice& prefix, - uint64_t block_offset) { - assert(block_offset == kNotValid); - if (!prefix_extractor_) { - return true; - } - return MayMatch(prefix); -} - -bool FullFilterBlockReader::MayMatch(const Slice& entry) { - if (contents_.size() != 0) { - return filter_bits_reader_->MayMatch(entry); - } - return true; // remain the same with block_based filter -} - -size_t FullFilterBlockReader::ApproximateMemoryUsage() const { - return contents_.size(); -} -} // namespace rocksdb diff --git a/table/full_filter_block.h b/table/full_filter_block.h deleted file mode 100644 index 6d6294cf24..0000000000 --- a/table/full_filter_block.h +++ /dev/null @@ -1,111 +0,0 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. - -#pragma once - -#include -#include -#include -#include -#include -#include "rocksdb/options.h" -#include "rocksdb/slice.h" -#include "rocksdb/slice_transform.h" -#include "db/dbformat.h" -#include "util/hash.h" -#include "table/filter_block.h" - -namespace rocksdb { - -class FilterPolicy; -class FilterBitsBuilder; -class FilterBitsReader; - -// A FullFilterBlockBuilder is used to construct a full filter for a -// particular Table. It generates a single string which is stored as -// a special block in the Table. 
-// The format of full filter block is: -// +----------------------------------------------------------------+ -// | full filter for all keys in sst file | -// +----------------------------------------------------------------+ -// The full filter can be very large. At the end of it, we put -// num_probes: how many hash functions are used in bloom filter -// -class FullFilterBlockBuilder : public FilterBlockBuilder { - public: - explicit FullFilterBlockBuilder(const SliceTransform* prefix_extractor, - const BlockBasedTableOptions& table_opt, - FilterBitsBuilder* filter_bits_builder); - // bits_builder is created in filter_policy, it should be passed in here - // directly. and be deleted here - ~FullFilterBlockBuilder() {} - - virtual bool IsBlockBased() override { return false; } - virtual void StartBlock(uint64_t block_offset) override {} - virtual void Add(const Slice& key) override; - virtual Slice Finish() override; - - private: - // important: all of these might point to invalid addresses - // at the time of destruction of this filter block. destructor - // should NOT dereference them. - const SliceTransform* prefix_extractor_; - bool whole_key_filtering_; - - uint32_t num_added_; - std::unique_ptr filter_bits_builder_; - std::unique_ptr filter_data_; - - void AddKey(const Slice& key); - void AddPrefix(const Slice& key); - - // No copying allowed - FullFilterBlockBuilder(const FullFilterBlockBuilder&); - void operator=(const FullFilterBlockBuilder&); -}; - -// A FilterBlockReader is used to parse filter from SST table. -// KeyMayMatch and PrefixMayMatch would trigger filter checking -class FullFilterBlockReader : public FilterBlockReader { - public: - // REQUIRES: "contents" and filter_bits_reader must stay live - // while *this is live. 
- explicit FullFilterBlockReader(const SliceTransform* prefix_extractor, - const BlockBasedTableOptions& table_opt, - const Slice& contents, - FilterBitsReader* filter_bits_reader); - explicit FullFilterBlockReader(const SliceTransform* prefix_extractor, - const BlockBasedTableOptions& table_opt, - BlockContents&& contents, - FilterBitsReader* filter_bits_reader); - - // bits_reader is created in filter_policy, it should be passed in here - // directly. and be deleted here - ~FullFilterBlockReader() {} - - virtual bool IsBlockBased() override { return false; } - virtual bool KeyMayMatch(const Slice& key, - uint64_t block_offset = kNotValid) override; - virtual bool PrefixMayMatch(const Slice& prefix, - uint64_t block_offset = kNotValid) override; - virtual size_t ApproximateMemoryUsage() const override; - - private: - const SliceTransform* prefix_extractor_; - bool whole_key_filtering_; - - std::unique_ptr filter_bits_reader_; - Slice contents_; - BlockContents block_contents_; - std::unique_ptr filter_data_; - - bool MayMatch(const Slice& entry); - - // No copying allowed - FullFilterBlockReader(const FullFilterBlockReader&); - void operator=(const FullFilterBlockReader&); -}; - -} // namespace rocksdb diff --git a/table/full_filter_block_test.cc b/table/full_filter_block_test.cc deleted file mode 100644 index 7bf61f2383..0000000000 --- a/table/full_filter_block_test.cc +++ /dev/null @@ -1,182 +0,0 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. 
- -#include "table/full_filter_block.h" - -#include "rocksdb/filter_policy.h" -#include "util/coding.h" -#include "util/hash.h" -#include "util/logging.h" -#include "util/testharness.h" -#include "util/testutil.h" - -namespace rocksdb { - -class TestFilterBitsBuilder : public FilterBitsBuilder { - public: - explicit TestFilterBitsBuilder() {} - - // Add Key to filter - virtual void AddKey(const Slice& key) override { - hash_entries_.push_back(Hash(key.data(), key.size(), 1)); - } - - // Generate the filter using the keys that are added - virtual Slice Finish(std::unique_ptr* buf) override { - uint32_t len = hash_entries_.size() * 4; - char* data = new char[len]; - for (size_t i = 0; i < hash_entries_.size(); i++) { - EncodeFixed32(data + i * 4, hash_entries_[i]); - } - const char* const_data = data; - buf->reset(const_data); - return Slice(data, len); - } - - private: - std::vector hash_entries_; -}; - -class TestFilterBitsReader : public FilterBitsReader { - public: - explicit TestFilterBitsReader(const Slice& contents) - : data_(contents.data()), len_(contents.size()) {} - - virtual bool MayMatch(const Slice& entry) override { - uint32_t h = Hash(entry.data(), entry.size(), 1); - for (size_t i = 0; i + 4 <= len_; i += 4) { - if (h == DecodeFixed32(data_ + i)) { - return true; - } - } - return false; - } - - private: - const char* data_; - uint32_t len_; -}; - - -class TestHashFilter : public FilterPolicy { - public: - virtual const char* Name() const { - return "TestHashFilter"; - } - - virtual void CreateFilter(const Slice* keys, int n, std::string* dst) const { - for (int i = 0; i < n; i++) { - uint32_t h = Hash(keys[i].data(), keys[i].size(), 1); - PutFixed32(dst, h); - } - } - - virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const { - uint32_t h = Hash(key.data(), key.size(), 1); - for (unsigned int i = 0; i + 4 <= filter.size(); i += 4) { - if (h == DecodeFixed32(filter.data() + i)) { - return true; - } - } - return false; - } - - virtual 
FilterBitsBuilder* GetFilterBitsBuilder() const override { - return new TestFilterBitsBuilder(); - } - - virtual FilterBitsReader* GetFilterBitsReader(const Slice& contents) - const override { - return new TestFilterBitsReader(contents); - } -}; - -class PluginFullFilterBlockTest { - public: - BlockBasedTableOptions table_options_; - - PluginFullFilterBlockTest() { - table_options_.filter_policy.reset(new TestHashFilter()); - } -}; - -TEST(PluginFullFilterBlockTest, PluginEmptyBuilder) { - FullFilterBlockBuilder builder(nullptr, table_options_, - table_options_.filter_policy->GetFilterBitsBuilder()); - Slice block = builder.Finish(); - ASSERT_EQ("", EscapeString(block)); - - FullFilterBlockReader reader(nullptr, table_options_, block, - table_options_.filter_policy->GetFilterBitsReader(block)); - // Remain same symantic with blockbased filter - ASSERT_TRUE(reader.KeyMayMatch("foo")); -} - -TEST(PluginFullFilterBlockTest, PluginSingleChunk) { - FullFilterBlockBuilder builder(nullptr, table_options_, - table_options_.filter_policy->GetFilterBitsBuilder()); - builder.Add("foo"); - builder.Add("bar"); - builder.Add("box"); - builder.Add("box"); - builder.Add("hello"); - Slice block = builder.Finish(); - FullFilterBlockReader reader(nullptr, table_options_, block, - table_options_.filter_policy->GetFilterBitsReader(block)); - ASSERT_TRUE(reader.KeyMayMatch("foo")); - ASSERT_TRUE(reader.KeyMayMatch("bar")); - ASSERT_TRUE(reader.KeyMayMatch("box")); - ASSERT_TRUE(reader.KeyMayMatch("hello")); - ASSERT_TRUE(reader.KeyMayMatch("foo")); - ASSERT_TRUE(!reader.KeyMayMatch("missing")); - ASSERT_TRUE(!reader.KeyMayMatch("other")); -} - -class FullFilterBlockTest { - public: - BlockBasedTableOptions table_options_; - - FullFilterBlockTest() { - table_options_.filter_policy.reset(NewBloomFilterPolicy(10, false)); - } - - ~FullFilterBlockTest() {} -}; - -TEST(FullFilterBlockTest, EmptyBuilder) { - FullFilterBlockBuilder builder(nullptr, table_options_, - 
table_options_.filter_policy->GetFilterBitsBuilder()); - Slice block = builder.Finish(); - ASSERT_EQ("", EscapeString(block)); - - FullFilterBlockReader reader(nullptr, table_options_, block, - table_options_.filter_policy->GetFilterBitsReader(block)); - // Remain same symantic with blockbased filter - ASSERT_TRUE(reader.KeyMayMatch("foo")); -} - -TEST(FullFilterBlockTest, SingleChunk) { - FullFilterBlockBuilder builder(nullptr, table_options_, - table_options_.filter_policy->GetFilterBitsBuilder()); - builder.Add("foo"); - builder.Add("bar"); - builder.Add("box"); - builder.Add("box"); - builder.Add("hello"); - Slice block = builder.Finish(); - FullFilterBlockReader reader(nullptr, table_options_, block, - table_options_.filter_policy->GetFilterBitsReader(block)); - ASSERT_TRUE(reader.KeyMayMatch("foo")); - ASSERT_TRUE(reader.KeyMayMatch("bar")); - ASSERT_TRUE(reader.KeyMayMatch("box")); - ASSERT_TRUE(reader.KeyMayMatch("hello")); - ASSERT_TRUE(reader.KeyMayMatch("foo")); - ASSERT_TRUE(!reader.KeyMayMatch("missing")); - ASSERT_TRUE(!reader.KeyMayMatch("other")); -} - -} // namespace rocksdb - -int main(int argc, char** argv) { return rocksdb::test::RunAllTests(); } diff --git a/table/get_context.cc b/table/get_context.cc deleted file mode 100644 index 59dfa41e6c..0000000000 --- a/table/get_context.cc +++ /dev/null @@ -1,101 +0,0 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. 
- -#include "table/get_context.h" -#include "rocksdb/merge_operator.h" -#include "rocksdb/statistics.h" -#include "util/statistics.h" - -namespace rocksdb { - -GetContext::GetContext(const Comparator* ucmp, - const MergeOperator* merge_operator, - Logger* logger, Statistics* statistics, - GetState init_state, const Slice& user_key, std::string* ret_value, - bool* value_found, MergeContext* merge_context) - : ucmp_(ucmp), - merge_operator_(merge_operator), - logger_(logger), - statistics_(statistics), - state_(init_state), - user_key_(user_key), - value_(ret_value), - value_found_(value_found), - merge_context_(merge_context) { -} - -// Called from TableCache::Get and Table::Get when file/block in which -// key may exist are not there in TableCache/BlockCache respectively. In this -// case we can't guarantee that key does not exist and are not permitted to do -// IO to be certain.Set the status=kFound and value_found=false to let the -// caller know that key may exist but is not there in memory -void GetContext::MarkKeyMayExist() { - state_ = kFound; - if (value_found_ != nullptr) { - *value_found_ = false; - } -} - -void GetContext::SaveValue(const Slice& value) { - state_ = kFound; - value_->assign(value.data(), value.size()); -} - -bool GetContext::SaveValue(const ParsedInternalKey& parsed_key, - const Slice& value) { - assert((state_ != kMerge && parsed_key.type != kTypeMerge) || - merge_context_ != nullptr); - if (ucmp_->Compare(parsed_key.user_key, user_key_) == 0) { - // Key matches. 
Process it - switch (parsed_key.type) { - case kTypeValue: - assert(state_ == kNotFound || state_ == kMerge); - if (kNotFound == state_) { - state_ = kFound; - value_->assign(value.data(), value.size()); - } else if (kMerge == state_) { - assert(merge_operator_ != nullptr); - state_ = kFound; - if (!merge_operator_->FullMerge(user_key_, &value, - merge_context_->GetOperands(), - value_, logger_)) { - RecordTick(statistics_, NUMBER_MERGE_FAILURES); - state_ = kCorrupt; - } - } - return false; - - case kTypeDeletion: - assert(state_ == kNotFound || state_ == kMerge); - if (kNotFound == state_) { - state_ = kDeleted; - } else if (kMerge == state_) { - state_ = kFound; - if (!merge_operator_->FullMerge(user_key_, nullptr, - merge_context_->GetOperands(), - value_, logger_)) { - RecordTick(statistics_, NUMBER_MERGE_FAILURES); - state_ = kCorrupt; - } - } - return false; - - case kTypeMerge: - assert(state_ == kNotFound || state_ == kMerge); - state_ = kMerge; - merge_context_->PushOperand(value); - return true; - - default: - assert(false); - break; - } - } - - // state_ could be Corrupt, merge or notfound - return false; -} - -} // namespace rocksdb diff --git a/table/get_context.h b/table/get_context.h deleted file mode 100644 index a38f3c5339..0000000000 --- a/table/get_context.h +++ /dev/null @@ -1,47 +0,0 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. 
- -#pragma once -#include -#include "db/merge_context.h" - -namespace rocksdb { -class MergeContext; - -class GetContext { - public: - enum GetState { - kNotFound, - kFound, - kDeleted, - kCorrupt, - kMerge // saver contains the current merge result (the operands) - }; - - GetContext(const Comparator* ucmp, const MergeOperator* merge_operator, - Logger* logger, Statistics* statistics, - GetState init_state, const Slice& user_key, std::string* ret_value, - bool* value_found, MergeContext* merge_context); - - void MarkKeyMayExist(); - void SaveValue(const Slice& value); - bool SaveValue(const ParsedInternalKey& parsed_key, const Slice& value); - GetState State() const { return state_; } - - private: - const Comparator* ucmp_; - const MergeOperator* merge_operator_; - // the merge operations encountered; - Logger* logger_; - Statistics* statistics_; - - GetState state_; - Slice user_key_; - std::string* value_; - bool* value_found_; // Is value set correctly? Used by KeyMayExist - MergeContext* merge_context_; -}; - -} // namespace rocksdb diff --git a/table/merger_test.cc b/table/merger_test.cc deleted file mode 100644 index 3a10527f44..0000000000 --- a/table/merger_test.cc +++ /dev/null @@ -1,197 +0,0 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. 
- -#include -#include -#include - -#include "rocksdb/iterator.h" -#include "table/merger.h" -#include "util/testharness.h" -#include "util/testutil.h" - -namespace rocksdb { - -class VectorIterator : public Iterator { - public: - explicit VectorIterator(const std::vector& keys) - : keys_(keys), current_(keys.size()) { - std::sort(keys_.begin(), keys_.end()); - } - - virtual bool Valid() const { return current_ < keys_.size(); } - - virtual void SeekToFirst() { current_ = 0; } - virtual void SeekToLast() { current_ = keys_.size() - 1; } - - virtual void Seek(const Slice& target) { - current_ = std::lower_bound(keys_.begin(), keys_.end(), target.ToString()) - - keys_.begin(); - } - - virtual void Next() { current_++; } - virtual void Prev() { current_--; } - - virtual Slice key() const { return Slice(keys_[current_]); } - virtual Slice value() const { return Slice(); } - - virtual Status status() const { return Status::OK(); } - - private: - std::vector keys_; - size_t current_; -}; - -class MergerTest { - public: - MergerTest() - : rnd_(3), merging_iterator_(nullptr), single_iterator_(nullptr) {} - ~MergerTest() = default; - std::vector GenerateStrings(int len, int string_len) { - std::vector ret; - for (int i = 0; i < len; ++i) { - ret.push_back(test::RandomHumanReadableString(&rnd_, string_len)); - } - return ret; - } - - void AssertEquivalence() { - auto a = merging_iterator_.get(); - auto b = single_iterator_.get(); - if (!a->Valid()) { - ASSERT_TRUE(!b->Valid()); - } else { - ASSERT_TRUE(b->Valid()); - ASSERT_EQ(b->key().ToString(), a->key().ToString()); - ASSERT_EQ(b->value().ToString(), a->value().ToString()); - } - } - - void SeekToRandom() { Seek(test::RandomHumanReadableString(&rnd_, 5)); } - - void Seek(std::string target) { - merging_iterator_->Seek(target); - single_iterator_->Seek(target); - } - - void SeekToFirst() { - merging_iterator_->SeekToFirst(); - single_iterator_->SeekToFirst(); - } - - void SeekToLast() { - merging_iterator_->SeekToLast(); - 
single_iterator_->SeekToLast(); - } - - void Next(int times) { - for (int i = 0; i < times && merging_iterator_->Valid(); ++i) { - AssertEquivalence(); - merging_iterator_->Next(); - single_iterator_->Next(); - } - AssertEquivalence(); - } - - void Prev(int times) { - for (int i = 0; i < times && merging_iterator_->Valid(); ++i) { - AssertEquivalence(); - merging_iterator_->Prev(); - single_iterator_->Prev(); - } - AssertEquivalence(); - } - - void NextAndPrev(int times) { - for (int i = 0; i < times && merging_iterator_->Valid(); ++i) { - AssertEquivalence(); - if (rnd_.OneIn(2)) { - merging_iterator_->Prev(); - single_iterator_->Prev(); - } else { - merging_iterator_->Next(); - single_iterator_->Next(); - } - } - AssertEquivalence(); - } - - void Generate(size_t num_iterators, size_t strings_per_iterator, - size_t letters_per_string) { - std::vector small_iterators; - for (size_t i = 0; i < num_iterators; ++i) { - auto strings = GenerateStrings(strings_per_iterator, letters_per_string); - small_iterators.push_back(new VectorIterator(strings)); - all_keys_.insert(all_keys_.end(), strings.begin(), strings.end()); - } - - merging_iterator_.reset(NewMergingIterator( - BytewiseComparator(), &small_iterators[0], small_iterators.size())); - single_iterator_.reset(new VectorIterator(all_keys_)); - } - - Random rnd_; - std::unique_ptr merging_iterator_; - std::unique_ptr single_iterator_; - std::vector all_keys_; -}; - -TEST(MergerTest, SeekToRandomNextTest) { - Generate(1000, 50, 50); - for (int i = 0; i < 10; ++i) { - SeekToRandom(); - AssertEquivalence(); - Next(50000); - } -} - -TEST(MergerTest, SeekToRandomNextSmallStringsTest) { - Generate(1000, 50, 2); - for (int i = 0; i < 10; ++i) { - SeekToRandom(); - AssertEquivalence(); - Next(50000); - } -} - -TEST(MergerTest, SeekToRandomPrevTest) { - Generate(1000, 50, 50); - for (int i = 0; i < 10; ++i) { - SeekToRandom(); - AssertEquivalence(); - Prev(50000); - } -} - -TEST(MergerTest, SeekToRandomRandomTest) { - 
Generate(200, 50, 50); - for (int i = 0; i < 3; ++i) { - SeekToRandom(); - AssertEquivalence(); - NextAndPrev(5000); - } -} - -TEST(MergerTest, SeekToFirstTest) { - Generate(1000, 50, 50); - for (int i = 0; i < 10; ++i) { - SeekToFirst(); - AssertEquivalence(); - Next(50000); - } -} - -TEST(MergerTest, SeekToLastTest) { - Generate(1000, 50, 50); - for (int i = 0; i < 10; ++i) { - SeekToLast(); - AssertEquivalence(); - Prev(50000); - } -} - -} // namespace rocksdb - -int main(int argc, char** argv) { return rocksdb::test::RunAllTests(); } diff --git a/table/meta_blocks.cc b/table/meta_blocks.cc index 5aabffcb0a..d9d0ed6c99 100644 --- a/table/meta_blocks.cc +++ b/table/meta_blocks.cc @@ -141,15 +141,14 @@ Status ReadProperties(const Slice &handle_value, RandomAccessFile *file, BlockContents block_contents; ReadOptions read_options; read_options.verify_checksums = false; - Status s; - s = ReadBlockContents(file, footer, read_options, handle, &block_contents, - env, false); + Status s = ReadBlockContents(file, footer, read_options, handle, + &block_contents, env, false); if (!s.ok()) { return s; } - Block properties_block(std::move(block_contents)); + Block properties_block(block_contents); std::unique_ptr iter( properties_block.NewIterator(BytewiseComparator())); @@ -234,7 +233,7 @@ Status ReadTableProperties(RandomAccessFile* file, uint64_t file_size, if (!s.ok()) { return s; } - Block metaindex_block(std::move(metaindex_contents)); + Block metaindex_block(metaindex_contents); std::unique_ptr meta_iter( metaindex_block.NewIterator(BytewiseComparator())); @@ -288,7 +287,7 @@ Status FindMetaBlock(RandomAccessFile* file, uint64_t file_size, if (!s.ok()) { return s; } - Block metaindex_block(std::move(metaindex_contents)); + Block metaindex_block(metaindex_contents); std::unique_ptr meta_iter; meta_iter.reset(metaindex_block.NewIterator(BytewiseComparator())); @@ -300,11 +299,10 @@ Status ReadMetaBlock(RandomAccessFile* file, uint64_t file_size, uint64_t 
table_magic_number, Env* env, const std::string& meta_block_name, BlockContents* contents) { - Status status; Footer footer(table_magic_number); - status = ReadFooterFromFile(file, file_size, &footer); - if (!status.ok()) { - return status; + auto s = ReadFooterFromFile(file, file_size, &footer); + if (!s.ok()) { + return s; } // Reading metaindex block @@ -312,28 +310,30 @@ Status ReadMetaBlock(RandomAccessFile* file, uint64_t file_size, BlockContents metaindex_contents; ReadOptions read_options; read_options.verify_checksums = false; - status = ReadBlockContents(file, footer, read_options, metaindex_handle, - &metaindex_contents, env, false); - if (!status.ok()) { - return status; + s = ReadBlockContents(file, footer, read_options, metaindex_handle, + &metaindex_contents, env, false); + if (!s.ok()) { + return s; } // Finding metablock - Block metaindex_block(std::move(metaindex_contents)); + Block metaindex_block(metaindex_contents); std::unique_ptr meta_iter; meta_iter.reset(metaindex_block.NewIterator(BytewiseComparator())); BlockHandle block_handle; - status = FindMetaBlock(meta_iter.get(), meta_block_name, &block_handle); + s = FindMetaBlock(meta_iter.get(), meta_block_name, &block_handle); - if (!status.ok()) { - return status; + if (!s.ok()) { + return s; } // Reading metablock - return ReadBlockContents(file, footer, read_options, block_handle, contents, - env, false); + s = ReadBlockContents(file, footer, read_options, block_handle, contents, env, + false); + + return s; } } // namespace rocksdb diff --git a/table/plain_table_builder.cc b/table/plain_table_builder.cc index b5914554bd..4f3b62ad4e 100644 --- a/table/plain_table_builder.cc +++ b/table/plain_table_builder.cc @@ -20,6 +20,7 @@ #include "table/block_builder.h" #include "table/bloom_block.h" #include "table/plain_table_index.h" +#include "table/filter_block.h" #include "table/format.h" #include "table/meta_blocks.h" #include "util/coding.h" @@ -57,24 +58,24 @@ extern const uint64_t 
kPlainTableMagicNumber = 0x8242229663bf9564ull; extern const uint64_t kLegacyPlainTableMagicNumber = 0x4f3418eb7a8f13b8ull; PlainTableBuilder::PlainTableBuilder( - const ImmutableCFOptions& ioptions, WritableFile* file, - uint32_t user_key_len, EncodingType encoding_type, size_t index_sparseness, + const Options& options, WritableFile* file, uint32_t user_key_len, + EncodingType encoding_type, size_t index_sparseness, uint32_t bloom_bits_per_key, uint32_t num_probes, size_t huge_page_tlb_size, double hash_table_ratio, bool store_index_in_file) - : ioptions_(ioptions), + : options_(options), bloom_block_(num_probes), file_(file), bloom_bits_per_key_(bloom_bits_per_key), huge_page_tlb_size_(huge_page_tlb_size), - encoder_(encoding_type, user_key_len, ioptions.prefix_extractor, + encoder_(encoding_type, user_key_len, options.prefix_extractor.get(), index_sparseness), store_index_in_file_(store_index_in_file), - prefix_extractor_(ioptions.prefix_extractor) { + prefix_extractor_(options.prefix_extractor.get()) { // Build index block and save it in the file if hash_table_ratio > 0 if (store_index_in_file_) { assert(hash_table_ratio > 0 || IsTotalOrderMode()); index_builder_.reset( - new PlainTableIndexBuilder(&arena_, ioptions, index_sparseness, + new PlainTableIndexBuilder(&arena_, options, index_sparseness, hash_table_ratio, huge_page_tlb_size_)); assert(bloom_bits_per_key_ > 0); properties_.user_collected_properties @@ -92,10 +93,10 @@ PlainTableBuilder::PlainTableBuilder( // plain encoding. properties_.format_version = (encoding_type == kPlain) ? 
0 : 1; - if (ioptions_.prefix_extractor) { + if (options_.prefix_extractor) { properties_.user_collected_properties [PlainTablePropertyNames::kPrefixExtractorName] = - ioptions_.prefix_extractor->Name(); + options_.prefix_extractor->Name(); } std::string val; @@ -104,7 +105,7 @@ PlainTableBuilder::PlainTableBuilder( [PlainTablePropertyNames::kEncodingType] = val; for (auto& collector_factories : - ioptions.table_properties_collector_factories) { + options.table_properties_collector_factories) { table_properties_collectors_.emplace_back( collector_factories->CreateTablePropertiesCollector()); } @@ -123,11 +124,11 @@ void PlainTableBuilder::Add(const Slice& key, const Slice& value) { // Store key hash if (store_index_in_file_) { - if (ioptions_.prefix_extractor == nullptr) { + if (options_.prefix_extractor.get() == nullptr) { keys_or_prefixes_hashes_.push_back(GetSliceHash(internal_key.user_key)); } else { Slice prefix = - ioptions_.prefix_extractor->Transform(internal_key.user_key); + options_.prefix_extractor->Transform(internal_key.user_key); keys_or_prefixes_hashes_.push_back(GetSliceHash(prefix)); } } @@ -159,7 +160,7 @@ void PlainTableBuilder::Add(const Slice& key, const Slice& value) { // notify property collectors NotifyCollectTableCollectorsOnAdd(key, value, table_properties_collectors_, - ioptions_.info_log); + options_.info_log.get()); } Status PlainTableBuilder::status() const { return status_; } @@ -182,8 +183,7 @@ Status PlainTableBuilder::Finish() { if (store_index_in_file_ && (properties_.num_entries > 0)) { bloom_block_.SetTotalBits( &arena_, properties_.num_entries * bloom_bits_per_key_, - ioptions_.bloom_locality, huge_page_tlb_size_, - ioptions_.info_log); + options_.bloom_locality, huge_page_tlb_size_, options_.info_log.get()); PutVarint32(&properties_.user_collected_properties [PlainTablePropertyNames::kNumBloomBlocks], @@ -224,7 +224,7 @@ Status PlainTableBuilder::Finish() { // -- Add user collected properties 
NotifyCollectTableCollectorsOnFinish(table_properties_collectors_, - ioptions_.info_log, + options_.info_log.get(), &property_block_builder); // -- Write property block diff --git a/table/plain_table_builder.h b/table/plain_table_builder.h index c3af080727..2871d887e8 100644 --- a/table/plain_table_builder.h +++ b/table/plain_table_builder.h @@ -30,7 +30,7 @@ class PlainTableBuilder: public TableBuilder { // caller to close the file after calling Finish(). The output file // will be part of level specified by 'level'. A value of -1 means // that the caller does not know which level the output file will reside. - PlainTableBuilder(const ImmutableCFOptions& ioptions, WritableFile* file, + PlainTableBuilder(const Options& options, WritableFile* file, uint32_t user_key_size, EncodingType encoding_type, size_t index_sparseness, uint32_t bloom_bits_per_key, uint32_t num_probes = 6, size_t huge_page_tlb_size = 0, @@ -71,7 +71,7 @@ class PlainTableBuilder: public TableBuilder { private: Arena arena_; - const ImmutableCFOptions& ioptions_; + Options options_; std::vector> table_properties_collectors_; diff --git a/table/plain_table_factory.cc b/table/plain_table_factory.cc index fae0d8018b..145179bae1 100644 --- a/table/plain_table_factory.cc +++ b/table/plain_table_factory.cc @@ -14,24 +14,22 @@ namespace rocksdb { -Status PlainTableFactory::NewTableReader(const ImmutableCFOptions& ioptions, - const EnvOptions& env_options, +Status PlainTableFactory::NewTableReader(const Options& options, + const EnvOptions& soptions, const InternalKeyComparator& icomp, unique_ptr&& file, uint64_t file_size, unique_ptr* table) const { - return PlainTableReader::Open(ioptions, env_options, icomp, std::move(file), + return PlainTableReader::Open(options, soptions, icomp, std::move(file), file_size, table, bloom_bits_per_key_, hash_table_ratio_, index_sparseness_, huge_page_tlb_size_, full_scan_mode_); } TableBuilder* PlainTableFactory::NewTableBuilder( - const ImmutableCFOptions& ioptions, - 
const InternalKeyComparator& internal_comparator, - WritableFile* file, const CompressionType, - const CompressionOptions&) const { - return new PlainTableBuilder(ioptions, file, user_key_len_, encoding_type_, + const Options& options, const InternalKeyComparator& internal_comparator, + WritableFile* file, CompressionType compression_type) const { + return new PlainTableBuilder(options, file, user_key_len_, encoding_type_, index_sparseness_, bloom_bits_per_key_, 6, huge_page_tlb_size_, hash_table_ratio_, store_index_in_file_); @@ -52,10 +50,10 @@ std::string PlainTableFactory::GetPrintableTableOptions() const { snprintf(buffer, kBufferSize, " hash_table_ratio: %lf\n", hash_table_ratio_); ret.append(buffer); - snprintf(buffer, kBufferSize, " index_sparseness: %zu\n", + snprintf(buffer, kBufferSize, " index_sparseness: %zd\n", index_sparseness_); ret.append(buffer); - snprintf(buffer, kBufferSize, " huge_page_tlb_size: %zu\n", + snprintf(buffer, kBufferSize, " huge_page_tlb_size: %zd\n", huge_page_tlb_size_); ret.append(buffer); snprintf(buffer, kBufferSize, " encoding_type: %d\n", diff --git a/table/plain_table_factory.h b/table/plain_table_factory.h index e79475221d..d1cf0cae61 100644 --- a/table/plain_table_factory.h +++ b/table/plain_table_factory.h @@ -14,6 +14,7 @@ namespace rocksdb { +struct Options; struct EnvOptions; using std::unique_ptr; @@ -127,7 +128,7 @@ class TableBuilder; class PlainTableFactory : public TableFactory { public: ~PlainTableFactory() {} - // user_key_len is the length of the user key. If it is set to be + // user_key_size is the length of the user key. If it is set to be // kPlainTableVariableLength, then it means variable length. Otherwise, all // the keys need to have the fix length of this value. bloom_bits_per_key is // number of bits used for bloom filer per key. 
hash_table_ratio is @@ -153,17 +154,15 @@ class PlainTableFactory : public TableFactory { full_scan_mode_(options.full_scan_mode), store_index_in_file_(options.store_index_in_file) {} const char* Name() const override { return "PlainTable"; } - Status NewTableReader( - const ImmutableCFOptions& options, const EnvOptions& soptions, - const InternalKeyComparator& internal_comparator, - unique_ptr&& file, uint64_t file_size, - unique_ptr* table) const override; - TableBuilder* NewTableBuilder( - const ImmutableCFOptions& options, - const InternalKeyComparator& icomparator, - WritableFile* file, - const CompressionType, - const CompressionOptions&) const override; + Status NewTableReader(const Options& options, const EnvOptions& soptions, + const InternalKeyComparator& internal_comparator, + unique_ptr&& file, uint64_t file_size, + unique_ptr* table) const override; + TableBuilder* NewTableBuilder(const Options& options, + const InternalKeyComparator& icomparator, + WritableFile* file, + CompressionType compression_type) const + override; std::string GetPrintableTableOptions() const override; diff --git a/table/plain_table_index.cc b/table/plain_table_index.cc index 61f9e335b4..efba9b71da 100644 --- a/table/plain_table_index.cc +++ b/table/plain_table_index.cc @@ -93,7 +93,7 @@ Slice PlainTableIndexBuilder::Finish() { BucketizeIndexes(&hash_to_offsets, &entries_per_bucket); keys_per_prefix_hist_.Add(num_keys_per_prefix_); - Log(ioptions_.info_log, "Number of Keys per prefix Histogram: %s", + Log(options_.info_log, "Number of Keys per prefix Histogram: %s", keys_per_prefix_hist_.ToString().c_str()); // From the temp data structure, populate indexes. 
@@ -147,11 +147,11 @@ void PlainTableIndexBuilder::BucketizeIndexes( Slice PlainTableIndexBuilder::FillIndexes( const std::vector& hash_to_offsets, const std::vector& entries_per_bucket) { - Log(ioptions_.info_log, "Reserving %zu bytes for plain table's sub_index", + Log(options_.info_log, "Reserving %zu bytes for plain table's sub_index", sub_index_size_); auto total_allocate_size = GetTotalSize(); char* allocated = arena_->AllocateAligned( - total_allocate_size, huge_page_tlb_size_, ioptions_.info_log); + total_allocate_size, huge_page_tlb_size_, options_.info_log.get()); auto temp_ptr = EncodeVarint32(allocated, index_size_); uint32_t* index = @@ -191,7 +191,7 @@ Slice PlainTableIndexBuilder::FillIndexes( } assert(sub_index_offset == sub_index_size_); - Log(ioptions_.info_log, "hash table size: %d, suffix_map length %zu", + Log(options_.info_log, "hash table size: %d, suffix_map length %zu", index_size_, sub_index_size_); return Slice(allocated, GetTotalSize()); } diff --git a/table/plain_table_index.h b/table/plain_table_index.h index 0b26ecd0d0..f63bbd0d52 100644 --- a/table/plain_table_index.h +++ b/table/plain_table_index.h @@ -108,11 +108,11 @@ class PlainTableIndex { // #wiki-in-memory-index-format class PlainTableIndexBuilder { public: - PlainTableIndexBuilder(Arena* arena, const ImmutableCFOptions& ioptions, + PlainTableIndexBuilder(Arena* arena, const Options& options, uint32_t index_sparseness, double hash_table_ratio, double huge_page_tlb_size) : arena_(arena), - ioptions_(ioptions), + options_(options), record_list_(kRecordsPerGroup), is_first_record_(true), due_index_(false), @@ -120,7 +120,7 @@ class PlainTableIndexBuilder { num_keys_per_prefix_(0), prev_key_prefix_hash_(0), index_sparseness_(index_sparseness), - prefix_extractor_(ioptions.prefix_extractor), + prefix_extractor_(options.prefix_extractor.get()), hash_table_ratio_(hash_table_ratio), huge_page_tlb_size_(huge_page_tlb_size) {} @@ -196,7 +196,7 @@ class PlainTableIndexBuilder { const 
std::vector& entries_per_bucket); Arena* arena_; - const ImmutableCFOptions ioptions_; + Options options_; HistogramImpl keys_per_prefix_hist_; IndexRecordList record_list_; bool is_first_record_; diff --git a/table/plain_table_key_coding.cc b/table/plain_table_key_coding.cc index c553752e17..eedf58aeaa 100644 --- a/table/plain_table_key_coding.cc +++ b/table/plain_table_key_coding.cc @@ -30,7 +30,7 @@ const unsigned char kSizeInlineLimit = 0x3F; size_t EncodeSize(EntryType type, uint32_t key_size, char* out_buffer) { out_buffer[0] = type << 6; - if (key_size < static_cast(kSizeInlineLimit)) { + if (key_size < 0x3F) { // size inlined out_buffer[0] |= static_cast(key_size); return 1; @@ -97,9 +97,9 @@ Status PlainTableKeyEncoder::AppendKey(const Slice& key, WritableFile* file, Slice prefix = prefix_extractor_->Transform(Slice(key.data(), user_key_size)); - if (key_count_for_prefix_ == 0 || prefix != pre_prefix_.GetKey() || - key_count_for_prefix_ % index_sparseness_ == 0) { - key_count_for_prefix_ = 1; + if (key_count_for_prefix == 0 || prefix != pre_prefix_.GetKey() || + key_count_for_prefix % index_sparseness_ == 0) { + key_count_for_prefix = 1; pre_prefix_.SetKey(prefix); size_bytes_pos += EncodeSize(kFullKey, user_key_size, size_bytes); Status s = file->Append(Slice(size_bytes, size_bytes_pos)); @@ -108,8 +108,8 @@ Status PlainTableKeyEncoder::AppendKey(const Slice& key, WritableFile* file, } *offset += size_bytes_pos; } else { - key_count_for_prefix_++; - if (key_count_for_prefix_ == 2) { + key_count_for_prefix++; + if (key_count_for_prefix == 2) { // For second key within a prefix, need to encode prefix length size_bytes_pos += EncodeSize(kPrefixFromPreviousKey, pre_prefix_.GetKey().size(), diff --git a/table/plain_table_key_coding.h b/table/plain_table_key_coding.h index 9047087aed..ba66c26452 100644 --- a/table/plain_table_key_coding.h +++ b/table/plain_table_key_coding.h @@ -26,7 +26,7 @@ class PlainTableKeyEncoder { fixed_user_key_len_(user_key_len), 
prefix_extractor_(prefix_extractor), index_sparseness_((index_sparseness > 1) ? index_sparseness : 1), - key_count_for_prefix_(0) {} + key_count_for_prefix(0) {} // key: the key to write out, in the format of internal key. // file: the output file to write out // offset: offset in the file. Needs to be updated after appending bytes @@ -45,7 +45,7 @@ class PlainTableKeyEncoder { uint32_t fixed_user_key_len_; const SliceTransform* prefix_extractor_; const size_t index_sparseness_; - size_t key_count_for_prefix_; + size_t key_count_for_prefix; IterKey pre_prefix_; }; diff --git a/table/plain_table_reader.cc b/table/plain_table_reader.cc index db37241a9b..b5eccd310c 100644 --- a/table/plain_table_reader.cc +++ b/table/plain_table_reader.cc @@ -26,7 +26,6 @@ #include "table/two_level_iterator.h" #include "table/plain_table_factory.h" #include "table/plain_table_key_coding.h" -#include "table/get_context.h" #include "util/arena.h" #include "util/coding.h" @@ -88,7 +87,7 @@ class PlainTableIterator : public Iterator { }; extern const uint64_t kPlainTableMagicNumber; -PlainTableReader::PlainTableReader(const ImmutableCFOptions& ioptions, +PlainTableReader::PlainTableReader(const Options& options, unique_ptr&& file, const EnvOptions& storage_options, const InternalKeyComparator& icomparator, @@ -100,10 +99,10 @@ PlainTableReader::PlainTableReader(const ImmutableCFOptions& ioptions, full_scan_mode_(false), data_end_offset_(table_properties->data_size), user_key_len_(table_properties->fixed_key_len), - prefix_extractor_(ioptions.prefix_extractor), + prefix_extractor_(options.prefix_extractor.get()), enable_bloom_(false), bloom_(6, nullptr), - ioptions_(ioptions), + options_(options), file_(std::move(file)), file_size_(file_size), table_properties_(nullptr) {} @@ -111,8 +110,8 @@ PlainTableReader::PlainTableReader(const ImmutableCFOptions& ioptions, PlainTableReader::~PlainTableReader() { } -Status PlainTableReader::Open(const ImmutableCFOptions& ioptions, - const EnvOptions& 
env_options, +Status PlainTableReader::Open(const Options& options, + const EnvOptions& soptions, const InternalKeyComparator& internal_comparator, unique_ptr&& file, uint64_t file_size, @@ -120,14 +119,14 @@ Status PlainTableReader::Open(const ImmutableCFOptions& ioptions, const int bloom_bits_per_key, double hash_table_ratio, size_t index_sparseness, size_t huge_page_tlb_size, bool full_scan_mode) { - assert(ioptions.allow_mmap_reads); + assert(options.allow_mmap_reads); if (file_size > PlainTableIndex::kMaxFileSize) { return Status::NotSupported("File is too large for PlainTableReader!"); } TableProperties* props = nullptr; auto s = ReadTableProperties(file.get(), file_size, kPlainTableMagicNumber, - ioptions.env, ioptions.info_log, &props); + options.env, options.info_log.get(), &props); if (!s.ok()) { return s; } @@ -138,12 +137,12 @@ Status PlainTableReader::Open(const ImmutableCFOptions& ioptions, user_props.find(PlainTablePropertyNames::kPrefixExtractorName); if (!full_scan_mode && prefix_extractor_in_file != user_props.end()) { - if (!ioptions.prefix_extractor) { + if (!options.prefix_extractor) { return Status::InvalidArgument( "Prefix extractor is missing when opening a PlainTable built " "using a prefix extractor"); } else if (prefix_extractor_in_file->second.compare( - ioptions.prefix_extractor->Name()) != 0) { + options.prefix_extractor->Name()) != 0) { return Status::InvalidArgument( "Prefix extractor given doesn't match the one used to build " "PlainTable"); @@ -159,8 +158,8 @@ Status PlainTableReader::Open(const ImmutableCFOptions& ioptions, } std::unique_ptr new_reader(new PlainTableReader( - ioptions, std::move(file), env_options, internal_comparator, - encoding_type, file_size, props)); + options, std::move(file), soptions, internal_comparator, encoding_type, + file_size, props)); s = new_reader->MmapDataFile(); if (!s.ok()) { @@ -208,7 +207,7 @@ Status PlainTableReader::PopulateIndexRecordList( bool is_first_record = true; Slice 
key_prefix_slice; PlainTableKeyDecoder decoder(encoding_type_, user_key_len_, - ioptions_.prefix_extractor); + options_.prefix_extractor.get()); while (pos < data_end_offset_) { uint32_t key_offset = pos; ParsedInternalKey key; @@ -253,8 +252,8 @@ void PlainTableReader::AllocateAndFillBloom(int bloom_bits_per_key, uint32_t bloom_total_bits = num_prefixes * bloom_bits_per_key; if (bloom_total_bits > 0) { enable_bloom_ = true; - bloom_.SetTotalBits(&arena_, bloom_total_bits, ioptions_.bloom_locality, - huge_page_tlb_size, ioptions_.info_log); + bloom_.SetTotalBits(&arena_, bloom_total_bits, options_.bloom_locality, + huge_page_tlb_size, options_.info_log.get()); FillBloom(prefix_hashes); } } @@ -282,14 +281,14 @@ Status PlainTableReader::PopulateIndex(TableProperties* props, BlockContents bloom_block_contents; auto s = ReadMetaBlock(file_.get(), file_size_, kPlainTableMagicNumber, - ioptions_.env, BloomBlockBuilder::kBloomBlock, + options_.env, BloomBlockBuilder::kBloomBlock, &bloom_block_contents); bool index_in_file = s.ok(); BlockContents index_block_contents; s = ReadMetaBlock(file_.get(), file_size_, kPlainTableMagicNumber, - ioptions_.env, PlainTableIndexBuilder::kPlainTableIndexBlock, - &index_block_contents); + options_.env, PlainTableIndexBuilder::kPlainTableIndexBlock, + &index_block_contents); index_in_file &= s.ok(); @@ -311,9 +310,8 @@ Status PlainTableReader::PopulateIndex(TableProperties* props, index_block = nullptr; } - if ((ioptions_.prefix_extractor == nullptr) && - (hash_table_ratio != 0)) { - // ioptions.prefix_extractor is requried for a hash-based look-up. + if ((options_.prefix_extractor.get() == nullptr) && (hash_table_ratio != 0)) { + // options.prefix_extractor is requried for a hash-based look-up. 
return Status::NotSupported( "PlainTable requires a prefix extractor enable prefix hash mode."); } @@ -330,8 +328,8 @@ Status PlainTableReader::PopulateIndex(TableProperties* props, table_properties_->num_entries * bloom_bits_per_key; if (num_bloom_bits > 0) { enable_bloom_ = true; - bloom_.SetTotalBits(&arena_, num_bloom_bits, ioptions_.bloom_locality, - huge_page_tlb_size, ioptions_.info_log); + bloom_.SetTotalBits(&arena_, num_bloom_bits, options_.bloom_locality, + huge_page_tlb_size, options_.info_log.get()); } } } else { @@ -353,7 +351,7 @@ Status PlainTableReader::PopulateIndex(TableProperties* props, bloom_block->size() * 8, num_blocks); } - PlainTableIndexBuilder index_builder(&arena_, ioptions_, index_sparseness, + PlainTableIndexBuilder index_builder(&arena_, options_, index_sparseness, hash_table_ratio, huge_page_tlb_size); std::vector prefix_hashes; @@ -424,7 +422,7 @@ Status PlainTableReader::GetOffset(const Slice& target, const Slice& prefix, uint32_t file_offset = GetFixed32Element(base_ptr, mid); size_t tmp; Status s = PlainTableKeyDecoder(encoding_type_, user_key_len_, - ioptions_.prefix_extractor) + options_.prefix_extractor.get()) .NextKey(file_data_.data() + file_offset, file_data_.data() + data_end_offset_, &mid_key, nullptr, &tmp); @@ -453,7 +451,7 @@ Status PlainTableReader::GetOffset(const Slice& target, const Slice& prefix, size_t tmp; uint32_t low_key_offset = GetFixed32Element(base_ptr, low); Status s = PlainTableKeyDecoder(encoding_type_, user_key_len_, - ioptions_.prefix_extractor) + options_.prefix_extractor.get()) .NextKey(file_data_.data() + low_key_offset, file_data_.data() + data_end_offset_, &low_key, nullptr, &tmp); @@ -526,7 +524,10 @@ void PlainTableReader::Prepare(const Slice& target) { } Status PlainTableReader::Get(const ReadOptions& ro, const Slice& target, - GetContext* get_context) { + void* arg, + bool (*saver)(void*, const ParsedInternalKey&, + const Slice&), + void (*mark_key_may_exist)(void*)) { // Check bloom filter 
first. Slice prefix_slice; uint32_t prefix_hash; @@ -564,7 +565,7 @@ Status PlainTableReader::Get(const ReadOptions& ro, const Slice& target, } Slice found_value; PlainTableKeyDecoder decoder(encoding_type_, user_key_len_, - ioptions_.prefix_extractor); + options_.prefix_extractor.get()); while (offset < data_end_offset_) { Status s = Next(&decoder, &offset, &found_key, nullptr, &found_value); if (!s.ok()) { @@ -578,10 +579,8 @@ Status PlainTableReader::Get(const ReadOptions& ro, const Slice& target, } prefix_match = true; } - // TODO(ljin): since we know the key comparison result here, - // can we enable the fast path? if (internal_comparator_.Compare(found_key, parsed_target) >= 0) { - if (!get_context->SaveValue(found_key, found_value)) { + if (!(*saver)(arg, found_key, found_value)) { break; } } diff --git a/table/plain_table_reader.h b/table/plain_table_reader.h index 531ac8e8b9..4a626979a8 100644 --- a/table/plain_table_reader.h +++ b/table/plain_table_reader.h @@ -36,7 +36,6 @@ class TableCache; class TableReader; class InternalKeyComparator; class PlainTableKeyDecoder; -class GetContext; using std::unique_ptr; using std::unordered_map; @@ -53,8 +52,7 @@ extern const uint32_t kPlainTableVariableLength; // The implementation of IndexedTableReader requires output file is mmaped class PlainTableReader: public TableReader { public: - static Status Open(const ImmutableCFOptions& ioptions, - const EnvOptions& env_options, + static Status Open(const Options& options, const EnvOptions& soptions, const InternalKeyComparator& internal_comparator, unique_ptr&& file, uint64_t file_size, unique_ptr* table, @@ -66,8 +64,10 @@ class PlainTableReader: public TableReader { void Prepare(const Slice& target); - Status Get(const ReadOptions&, const Slice& key, - GetContext* get_context) override; + Status Get(const ReadOptions&, const Slice& key, void* arg, + bool (*result_handler)(void* arg, const ParsedInternalKey& k, + const Slice& v), + void (*mark_key_may_exist)(void*) = 
nullptr); uint64_t ApproximateOffsetOf(const Slice& key); @@ -82,9 +82,8 @@ class PlainTableReader: public TableReader { return arena_.MemoryAllocatedBytes(); } - PlainTableReader(const ImmutableCFOptions& ioptions, - unique_ptr&& file, - const EnvOptions& env_options, + PlainTableReader(const Options& options, unique_ptr&& file, + const EnvOptions& storage_options, const InternalKeyComparator& internal_comparator, EncodingType encoding_type, uint64_t file_size, const TableProperties* table_properties); @@ -133,7 +132,7 @@ class PlainTableReader: public TableReader { DynamicBloom bloom_; Arena arena_; - const ImmutableCFOptions& ioptions_; + const Options& options_; unique_ptr file_; uint32_t file_size_; std::shared_ptr table_properties_; diff --git a/table/table_reader.h b/table/table_reader.h index 2f6360ad10..22f5a859e4 100644 --- a/table/table_reader.h +++ b/table/table_reader.h @@ -18,7 +18,6 @@ class Slice; class Arena; struct ReadOptions; struct TableProperties; -class GetContext; // A Table is a sorted map from strings to strings. Tables are // immutable and persistent. A Table may be safely accessed from @@ -56,17 +55,23 @@ class TableReader { // Report an approximation of how much memory has been used. virtual size_t ApproximateMemoryUsage() const = 0; - // Calls get_context->SaveValue() repeatedly, starting with - // the entry found after a call to Seek(key), until it returns false. - // May not make such a call if filter policy says that key is not present. + // Calls (*result_handler)(handle_context, ...) repeatedly, starting with + // the entry found after a call to Seek(key), until result_handler returns + // false, where k is the actual internal key for a row found and v as the + // value of the key. May not make such a call if filter policy says that key + // is not present. // - // get_context->MarkKeyMayExist needs to be called when it is configured to be - // memory only and the key is not found in the block cache. 
+ // mark_key_may_exist_handler needs to be called when it is configured to be + // memory only and the key is not found in the block cache, with + // the parameter to be handle_context. // // readOptions is the options for the read // key is the key to search for - virtual Status Get(const ReadOptions& readOptions, const Slice& key, - GetContext* get_context) = 0; + virtual Status Get( + const ReadOptions& readOptions, const Slice& key, void* handle_context, + bool (*result_handler)(void* arg, const ParsedInternalKey& k, + const Slice& v), + void (*mark_key_may_exist_handler)(void* handle_context) = nullptr) = 0; }; } // namespace rocksdb diff --git a/table/table_reader_bench.cc b/table/table_reader_bench.cc index 52fa20ec0b..ed2c7c52dc 100644 --- a/table/table_reader_bench.cc +++ b/table/table_reader_bench.cc @@ -22,7 +22,6 @@ int main() { #include "table/block_based_table_factory.h" #include "table/plain_table_factory.h" #include "table/table_builder.h" -#include "table/get_context.h" #include "util/histogram.h" #include "util/testharness.h" #include "util/testutil.h" @@ -49,6 +48,11 @@ static std::string MakeKey(int i, int j, bool through_db) { return key.Encode().ToString(); } +static bool DummySaveValue(void* arg, const ParsedInternalKey& ikey, + const Slice& v) { + return false; +} + uint64_t Now(Env* env, bool measured_by_nanosecond) { return measured_by_nanosecond ? 
env->NowNanos() : env->NowMicros(); } @@ -84,12 +88,10 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options, TableBuilder* tb = nullptr; DB* db = nullptr; Status s; - const ImmutableCFOptions ioptions(opts); if (!through_db) { env->NewWritableFile(file_name, &file, env_options); - tb = opts.table_factory->NewTableBuilder(ioptions, ikc, file.get(), - CompressionType::kNoCompression, - CompressionOptions()); + tb = opts.table_factory->NewTableBuilder(opts, ikc, file.get(), + CompressionType::kNoCompression); } else { s = DB::Open(opts, dbname, &db); ASSERT_OK(s); @@ -120,13 +122,14 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options, uint64_t file_size; env->GetFileSize(file_name, &file_size); s = opts.table_factory->NewTableReader( - ioptions, env_options, ikc, std::move(raf), file_size, &table_reader); + opts, env_options, ikc, std::move(raf), file_size, &table_reader); } Random rnd(301); std::string result; HistogramImpl hist; + void* arg = nullptr; for (int it = 0; it < num_iter; it++) { for (int i = 0; i < num_keys1; i++) { for (int j = 0; j < num_keys2; j++) { @@ -142,13 +145,8 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options, std::string key = MakeKey(r1, r2, through_db); uint64_t start_time = Now(env, measured_by_nanosecond); if (!through_db) { - std::string value; - MergeContext merge_context; - GetContext get_context(ioptions.comparator, ioptions.merge_operator, - ioptions.info_log, ioptions.statistics, - GetContext::kNotFound, Slice(key), &value, - nullptr, &merge_context); - s = table_reader->Get(read_options, key, &get_context); + s = table_reader->Get(read_options, key, arg, DummySaveValue, + nullptr); } else { s = db->Get(read_options, key, &result); } @@ -260,9 +258,8 @@ int main(int argc, char** argv) { if (FLAGS_table_factory == "cuckoo_hash") { options.allow_mmap_reads = true; env_options.use_mmap_reads = true; - rocksdb::CuckooTableOptions table_options; - table_options.hash_table_ratio = 0.75; - 
tf.reset(rocksdb::NewCuckooTableFactory(table_options)); + + tf.reset(rocksdb::NewCuckooTableFactory(0.75)); } else if (FLAGS_table_factory == "plain_table") { options.allow_mmap_reads = true; env_options.use_mmap_reads = true; diff --git a/table/table_test.cc b/table/table_test.cc index e4657e8cd9..500abf48f2 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -37,13 +37,11 @@ #include "table/format.h" #include "table/meta_blocks.h" #include "table/plain_table_factory.h" -#include "table/get_context.h" #include "util/random.h" #include "util/statistics.h" #include "util/testharness.h" #include "util/testutil.h" -#include "util/scoped_arena_iterator.h" namespace rocksdb { @@ -196,7 +194,6 @@ class Constructor { // been added so far. Returns the keys in sorted order in "*keys" // and stores the key/value pairs in "*kvmap" void Finish(const Options& options, - const ImmutableCFOptions& ioptions, const BlockBasedTableOptions& table_options, const InternalKeyComparator& internal_comparator, std::vector* keys, KVMap* kvmap) { @@ -209,14 +206,12 @@ class Constructor { keys->push_back(it->first); } data_.clear(); - Status s = FinishImpl(options, ioptions, table_options, - internal_comparator, *kvmap); + Status s = FinishImpl(options, table_options, internal_comparator, *kvmap); ASSERT_TRUE(s.ok()) << s.ToString(); } // Construct the data structure from the data in "data" virtual Status FinishImpl(const Options& options, - const ImmutableCFOptions& ioptions, const BlockBasedTableOptions& table_options, const InternalKeyComparator& internal_comparator, const KVMap& data) = 0; @@ -225,12 +220,8 @@ class Constructor { virtual const KVMap& data() { return data_; } - virtual bool IsArenaMode() const { return false; } - virtual DB* db() const { return nullptr; } // Overridden in DBConstructor - virtual bool AnywayDeleteIterator() const { return false; } - protected: const InternalKeyComparator* last_internal_key_; @@ -248,7 +239,6 @@ class BlockConstructor: public 
Constructor { delete block_; } virtual Status FinishImpl(const Options& options, - const ImmutableCFOptions& ioptions, const BlockBasedTableOptions& table_options, const InternalKeyComparator& internal_comparator, const KVMap& data) { @@ -266,7 +256,8 @@ class BlockConstructor: public Constructor { BlockContents contents; contents.data = data_; contents.cachable = false; - block_ = new Block(std::move(contents)); + contents.heap_allocated = false; + block_ = new Block(contents); return Status::OK(); } virtual Iterator* NewIterator() const { @@ -284,15 +275,8 @@ class BlockConstructor: public Constructor { // A helper class that converts internal format keys into user keys class KeyConvertingIterator: public Iterator { public: - KeyConvertingIterator(Iterator* iter, bool arena_mode = false) - : iter_(iter), arena_mode_(arena_mode) {} - virtual ~KeyConvertingIterator() { - if (arena_mode_) { - iter_->~Iterator(); - } else { - delete iter_; - } - } + explicit KeyConvertingIterator(Iterator* iter) : iter_(iter) { } + virtual ~KeyConvertingIterator() { delete iter_; } virtual bool Valid() const { return iter_->Valid(); } virtual void Seek(const Slice& target) { ParsedInternalKey ikey(target, kMaxSequenceNumber, kTypeValue); @@ -323,7 +307,6 @@ class KeyConvertingIterator: public Iterator { private: mutable Status status_; Iterator* iter_; - bool arena_mode_; // No copying allowed KeyConvertingIterator(const KeyConvertingIterator&); @@ -339,16 +322,14 @@ class TableConstructor: public Constructor { ~TableConstructor() { Reset(); } virtual Status FinishImpl(const Options& options, - const ImmutableCFOptions& ioptions, const BlockBasedTableOptions& table_options, const InternalKeyComparator& internal_comparator, const KVMap& data) { Reset(); sink_.reset(new StringSink()); unique_ptr builder; - builder.reset(ioptions.table_factory->NewTableBuilder( - ioptions, internal_comparator, sink_.get(), options.compression, - CompressionOptions())); + 
builder.reset(options.table_factory->NewTableBuilder( + options, internal_comparator, sink_.get(), options.compression)); for (KVMap::const_iterator it = data.begin(); it != data.end(); @@ -371,9 +352,9 @@ class TableConstructor: public Constructor { // Open the table uniq_id_ = cur_uniq_id_++; source_.reset(new StringSource(sink_->contents(), uniq_id_, - ioptions.allow_mmap_reads)); - return ioptions.table_factory->NewTableReader( - ioptions, soptions, internal_comparator, std::move(source_), + options.allow_mmap_reads)); + return options.table_factory->NewTableReader( + options, soptions, internal_comparator, std::move(source_), sink_->contents().size(), &table_reader_); } @@ -391,12 +372,12 @@ class TableConstructor: public Constructor { return table_reader_->ApproximateOffsetOf(key); } - virtual Status Reopen(const ImmutableCFOptions& ioptions) { + virtual Status Reopen(const Options& options) { source_.reset( new StringSource(sink_->contents(), uniq_id_, - ioptions.allow_mmap_reads)); - return ioptions.table_factory->NewTableReader( - ioptions, soptions, *last_internal_key_, std::move(source_), + options.allow_mmap_reads)); + return options.table_factory->NewTableReader( + options, soptions, *last_internal_key_, std::move(source_), sink_->contents().size(), &table_reader_); } @@ -404,10 +385,6 @@ class TableConstructor: public Constructor { return table_reader_.get(); } - virtual bool AnywayDeleteIterator() const override { - return convert_to_internal_key_; - } - private: void Reset() { uniq_id_ = 0; @@ -415,12 +392,12 @@ class TableConstructor: public Constructor { sink_.reset(); source_.reset(); } + bool convert_to_internal_key_; uint64_t uniq_id_; unique_ptr sink_; unique_ptr source_; unique_ptr table_reader_; - bool convert_to_internal_key_; TableConstructor(); @@ -437,25 +414,20 @@ class MemTableConstructor: public Constructor { table_factory_(new SkipListFactory) { Options options; options.memtable_factory = table_factory_; - ImmutableCFOptions 
ioptions(options); - memtable_ = new MemTable(internal_comparator_, ioptions, - MemTableOptions(MutableCFOptions(options, ioptions), options)); + memtable_ = new MemTable(internal_comparator_, options); memtable_->Ref(); } ~MemTableConstructor() { delete memtable_->Unref(); } - virtual Status FinishImpl(const Options&, - const ImmutableCFOptions& ioptions, + virtual Status FinishImpl(const Options& options, const BlockBasedTableOptions& table_options, const InternalKeyComparator& internal_comparator, const KVMap& data) { delete memtable_->Unref(); - Options options; - options.memtable_factory = table_factory_; - ImmutableCFOptions mem_ioptions(options); - memtable_ = new MemTable(internal_comparator_, mem_ioptions, - MemTableOptions(MutableCFOptions(options, mem_ioptions), options)); + Options memtable_options; + memtable_options.memtable_factory = table_factory_; + memtable_ = new MemTable(internal_comparator_, memtable_options); memtable_->Ref(); int seq = 1; for (KVMap::const_iterator it = data.begin(); @@ -467,16 +439,10 @@ class MemTableConstructor: public Constructor { return Status::OK(); } virtual Iterator* NewIterator() const { - return new KeyConvertingIterator( - memtable_->NewIterator(ReadOptions(), &arena_), true); + return new KeyConvertingIterator(memtable_->NewIterator(ReadOptions())); } - virtual bool AnywayDeleteIterator() const override { return true; } - - virtual bool IsArenaMode() const override { return true; } - private: - mutable Arena arena_; InternalKeyComparator internal_comparator_; MemTable* memtable_; std::shared_ptr table_factory_; @@ -494,7 +460,6 @@ class DBConstructor: public Constructor { delete db_; } virtual Status FinishImpl(const Options& options, - const ImmutableCFOptions& ioptions, const BlockBasedTableOptions& table_options, const InternalKeyComparator& internal_comparator, const KVMap& data) { @@ -705,7 +670,7 @@ class FixedOrLessPrefixTransform : public SliceTransform { class Harness { public: - Harness() : 
ioptions_(options_), constructor_(nullptr) {} + Harness() : constructor_(nullptr) { } void Init(const TestArgs& args) { delete constructor_; @@ -791,7 +756,6 @@ class Harness { constructor_ = new DBConstructor(options_.comparator); break; } - ioptions_ = ImmutableCFOptions(options_); } ~Harness() { @@ -805,8 +769,8 @@ class Harness { void Test(Random* rnd) { std::vector keys; KVMap data; - constructor_->Finish(options_, ioptions_, table_options_, - *internal_comparator_, &keys, &data); + constructor_->Finish(options_, table_options_, *internal_comparator_, + &keys, &data); TestForwardScan(keys, data); if (support_prev_) { @@ -827,11 +791,7 @@ class Harness { iter->Next(); } ASSERT_TRUE(!iter->Valid()); - if (constructor_->IsArenaMode() && !constructor_->AnywayDeleteIterator()) { - iter->~Iterator(); - } else { - delete iter; - } + delete iter; } void TestBackwardScan(const std::vector& keys, @@ -846,11 +806,7 @@ class Harness { iter->Prev(); } ASSERT_TRUE(!iter->Valid()); - if (constructor_->IsArenaMode() && !constructor_->AnywayDeleteIterator()) { - iter->~Iterator(); - } else { - delete iter; - } + delete iter; } void TestRandomAccess(Random* rnd, @@ -920,11 +876,7 @@ class Harness { } } } - if (constructor_->IsArenaMode() && !constructor_->AnywayDeleteIterator()) { - iter->~Iterator(); - } else { - delete iter; - } + delete iter; } std::string ToString(const KVMap& data, const KVMap::const_iterator& it) { @@ -987,7 +939,6 @@ class Harness { private: Options options_ = Options(); - ImmutableCFOptions ioptions_; BlockBasedTableOptions table_options_ = BlockBasedTableOptions(); Constructor* constructor_; bool support_prev_; @@ -1087,8 +1038,7 @@ TEST(BlockBasedTableTest, BasicBlockBasedTableProperties) { table_options.block_restart_interval = 1; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - const ImmutableCFOptions ioptions(options); - c.Finish(options, ioptions, table_options, + c.Finish(options, table_options, 
GetPlainInternalComparator(options.comparator), &keys, &kvmap); auto& props = *c.GetTableReader()->GetTableProperties(); @@ -1121,8 +1071,7 @@ TEST(BlockBasedTableTest, FilterPolicyNameProperties) { Options options; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - const ImmutableCFOptions ioptions(options); - c.Finish(options, ioptions, table_options, + c.Finish(options, table_options, GetPlainInternalComparator(options.comparator), &keys, &kvmap); auto& props = *c.GetTableReader()->GetTableProperties(); ASSERT_EQ("rocksdb.BuiltinBloomFilter", props.filter_policy_name); @@ -1173,8 +1122,7 @@ TEST(BlockBasedTableTest, TotalOrderSeekOnHashIndex) { c.Add("cccc2", std::string('a', 56)); std::vector keys; KVMap kvmap; - const ImmutableCFOptions ioptions(options); - c.Finish(options, ioptions, table_options, + c.Finish(options, table_options, GetPlainInternalComparator(options.comparator), &keys, &kvmap); auto props = c.GetTableReader()->GetTableProperties(); ASSERT_EQ(7u, props->num_data_blocks); @@ -1218,7 +1166,7 @@ static std::string RandomString(Random* rnd, int len) { return r; } -void AddInternalKey(TableConstructor* c, const std::string& prefix, +void AddInternalKey(TableConstructor* c, const std::string prefix, int suffix_len = 800) { static Random rnd(1023); InternalKey k(prefix + RandomString(&rnd, 800), 0, kTypeValue); @@ -1258,8 +1206,7 @@ TEST(TableTest, HashIndexTest) { std::unique_ptr comparator( new InternalKeyComparator(BytewiseComparator())); - const ImmutableCFOptions ioptions(options); - c.Finish(options, ioptions, table_options, *comparator, &keys, &kvmap); + c.Finish(options, table_options, *comparator, &keys, &kvmap); auto reader = c.GetTableReader(); auto props = reader->GetTableProperties(); @@ -1367,8 +1314,7 @@ TEST(BlockBasedTableTest, IndexSizeStat) { table_options.block_restart_interval = 1; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - const ImmutableCFOptions ioptions(options); - 
c.Finish(options, ioptions, table_options, + c.Finish(options, table_options, GetPlainInternalComparator(options.comparator), &ks, &kvmap); auto index_size = c.GetTableReader()->GetTableProperties()->index_size; ASSERT_GT(index_size, last_index_size); @@ -1394,8 +1340,7 @@ TEST(BlockBasedTableTest, NumBlockStat) { std::vector ks; KVMap kvmap; - const ImmutableCFOptions ioptions(options); - c.Finish(options, ioptions, table_options, + c.Finish(options, table_options, GetPlainInternalComparator(options.comparator), &ks, &kvmap); ASSERT_EQ(kvmap.size(), c.GetTableReader()->GetTableProperties()->num_data_blocks); @@ -1471,8 +1416,7 @@ TEST(BlockBasedTableTest, BlockCacheDisabledTest) { TableConstructor c(BytewiseComparator(), true); c.Add("key", "value"); - const ImmutableCFOptions ioptions(options); - c.Finish(options, ioptions, table_options, + c.Finish(options, table_options, GetPlainInternalComparator(options.comparator), &keys, &kvmap); // preloading filter/index blocks is enabled. @@ -1488,11 +1432,8 @@ TEST(BlockBasedTableTest, BlockCacheDisabledTest) { } { - GetContext get_context(options.comparator, nullptr, nullptr, nullptr, - GetContext::kNotFound, Slice(), nullptr, - nullptr, nullptr); // a hack that just to trigger BlockBasedTable::GetFilter. - reader->Get(ReadOptions(), "non-exist-key", &get_context); + reader->Get(ReadOptions(), "non-exist-key", nullptr, nullptr, nullptr); BlockCachePropertiesSnapshot props(options.statistics.get()); props.AssertIndexBlockStat(0, 0); props.AssertFilterBlockStat(0, 0); @@ -1517,8 +1458,7 @@ TEST(BlockBasedTableTest, FilterBlockInBlockCache) { TableConstructor c(BytewiseComparator()); c.Add("key", "value"); - const ImmutableCFOptions ioptions(options); - c.Finish(options, ioptions, table_options, + c.Finish(options, table_options, GetPlainInternalComparator(options.comparator), &keys, &kvmap); // preloading filter/index blocks is prohibited. 
auto reader = dynamic_cast(c.GetTableReader()); @@ -1572,8 +1512,7 @@ TEST(BlockBasedTableTest, FilterBlockInBlockCache) { table_options.block_cache.reset(); options.table_factory.reset(new BlockBasedTableFactory(table_options)); options.statistics = CreateDBStatistics(); // reset the stats - const ImmutableCFOptions ioptions1(options); - c.Reopen(ioptions1); + c.Reopen(options); table_options.no_block_cache = false; { @@ -1590,8 +1529,7 @@ TEST(BlockBasedTableTest, FilterBlockInBlockCache) { // too small to fit even one entry. table_options.block_cache = NewLRUCache(1); options.table_factory.reset(new BlockBasedTableFactory(table_options)); - const ImmutableCFOptions ioptions2(options); - c.Reopen(ioptions2); + c.Reopen(options); { BlockCachePropertiesSnapshot props(options.statistics.get()); props.AssertEqual(1, // index block miss @@ -1645,8 +1583,7 @@ TEST(BlockBasedTableTest, BlockCacheLeak) { c.Add("k07", std::string(100000, 'x')); std::vector keys; KVMap kvmap; - const ImmutableCFOptions ioptions(opt); - c.Finish(opt, ioptions, table_options, *ikc, &keys, &kvmap); + c.Finish(opt, table_options, *ikc, &keys, &kvmap); unique_ptr iter(c.NewIterator()); iter->SeekToFirst(); @@ -1657,8 +1594,7 @@ TEST(BlockBasedTableTest, BlockCacheLeak) { } ASSERT_OK(iter->status()); - const ImmutableCFOptions ioptions1(opt); - ASSERT_OK(c.Reopen(ioptions1)); + ASSERT_OK(c.Reopen(opt)); auto table_reader = dynamic_cast(c.GetTableReader()); for (const std::string& key : keys) { ASSERT_TRUE(table_reader->TEST_KeyInCache(ReadOptions(), key)); @@ -1667,8 +1603,7 @@ TEST(BlockBasedTableTest, BlockCacheLeak) { // rerun with different block cache table_options.block_cache = NewLRUCache(16 * 1024 * 1024); opt.table_factory.reset(NewBlockBasedTableFactory(table_options)); - const ImmutableCFOptions ioptions2(opt); - ASSERT_OK(c.Reopen(ioptions2)); + ASSERT_OK(c.Reopen(opt)); table_reader = dynamic_cast(c.GetTableReader()); for (const std::string& key : keys) { 
ASSERT_TRUE(!table_reader->TEST_KeyInCache(ReadOptions(), key)); @@ -1684,11 +1619,9 @@ TEST(PlainTableTest, BasicPlainTableProperties) { PlainTableFactory factory(plain_table_options); StringSink sink; Options options; - const ImmutableCFOptions ioptions(options); InternalKeyComparator ikc(options.comparator); std::unique_ptr builder( - factory.NewTableBuilder(ioptions, ikc, &sink, kNoCompression, - CompressionOptions())); + factory.NewTableBuilder(options, ikc, &sink, kNoCompression)); for (char c = 'a'; c <= 'z'; ++c) { std::string key(8, c); @@ -1731,9 +1664,7 @@ TEST(GeneralTableTest, ApproximateOffsetOfPlain) { options.compression = kNoCompression; BlockBasedTableOptions table_options; table_options.block_size = 1024; - const ImmutableCFOptions ioptions(options); - c.Finish(options, ioptions, table_options, internal_comparator, - &keys, &kvmap); + c.Finish(options, table_options, internal_comparator, &keys, &kvmap); ASSERT_TRUE(Between(c.ApproximateOffsetOf("abc"), 0, 0)); ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01"), 0, 0)); @@ -1763,8 +1694,7 @@ static void DoCompressionTest(CompressionType comp) { options.compression = comp; BlockBasedTableOptions table_options; table_options.block_size = 1024; - const ImmutableCFOptions ioptions(options); - c.Finish(options, ioptions, table_options, ikc, &keys, &kvmap); + c.Finish(options, table_options, ikc, &keys, &kvmap); ASSERT_TRUE(Between(c.ApproximateOffsetOf("abc"), 0, 0)); ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01"), 0, 0)); @@ -1866,9 +1796,7 @@ TEST(MemTableTest, Simple) { auto table_factory = std::make_shared(); Options options; options.memtable_factory = table_factory; - ImmutableCFOptions ioptions(options); - MemTable* memtable = new MemTable(cmp, ioptions, - MemTableOptions(MutableCFOptions(options, ioptions), options)); + MemTable* memtable = new MemTable(cmp, options); memtable->Ref(); WriteBatch batch; WriteBatchInternal::SetSequence(&batch, 100); @@ -1879,8 +1807,7 @@ TEST(MemTableTest, Simple) 
{ ColumnFamilyMemTablesDefault cf_mems_default(memtable, &options); ASSERT_TRUE(WriteBatchInternal::InsertInto(&batch, &cf_mems_default).ok()); - Arena arena; - ScopedArenaIterator iter(memtable->NewIterator(ReadOptions(), &arena)); + Iterator* iter = memtable->NewIterator(ReadOptions()); iter->SeekToFirst(); while (iter->Valid()) { fprintf(stderr, "key: '%s' -> '%s'\n", @@ -1889,6 +1816,7 @@ TEST(MemTableTest, Simple) { iter->Next(); } + delete iter; delete memtable->Unref(); } diff --git a/tools/auto_sanity_test.sh b/tools/auto_sanity_test.sh index 138c855c08..2d63c0a85f 100755 --- a/tools/auto_sanity_test.sh +++ b/tools/auto_sanity_test.sh @@ -37,11 +37,6 @@ echo "Running db sanity check with commits $commit_new and $commit_old." echo "=============================================================" echo "Making build $commit_new" -git checkout $commit_new -if [ $? -ne 0 ]; then - echo "[ERROR] Can't checkout $commit_new" - exit 1 -fi makestuff mv db_sanity_test new_db_sanity_test echo "Creating db based on the new commit --- $commit_new" @@ -49,11 +44,6 @@ echo "Creating db based on the new commit --- $commit_new" echo "=============================================================" echo "Making build $commit_old" -git checkout $commit_old -if [ $? 
-ne 0 ]; then - echo "[ERROR] Can't checkout $commit_old" - exit 1 -fi makestuff mv db_sanity_test old_db_sanity_test echo "Creating db based on the old commit --- $commit_old" diff --git a/tools/benchmark.sh b/tools/benchmark.sh deleted file mode 100755 index cde545801c..0000000000 --- a/tools/benchmark.sh +++ /dev/null @@ -1,205 +0,0 @@ -#!/bin/bash -# REQUIRE: db_bench binary exists in the current directory - -if [ $# -ne 1 ]; then - echo "./benchmark.sh [bulkload/fillseq/overwrite/filluniquerandom/readrandom/readwhilewriting]" - exit 0 -fi - -# size constants -K=1024 -M=$((1024 * K)) -G=$((1024 * M)) - -if [ -z $DB_DIR ]; then - echo "DB_DIR is not defined" - exit 0 -fi - -if [ -z $WAL_DIR ]; then - echo "WAL_DIR is not defined" - exit 0 -fi - -output_dir=${OUTPUT_DIR:-/tmp/} -if [ ! -d $output_dir ]; then - mkdir -p $output_dir -fi - -num_read_threads=${NUM_READ_THREADS:-16} -writes_per_second=${WRITES_PER_SEC:-$((80 * K))} # (only for readwhilewriting) -cache_size=$((16 * G)) -duration=${DURATION:-0} - -num_keys=${NUM_KEYS:-$((1 * G))} -key_size=20 -value_size=800 - -const_params=" - --db=$DB_DIR \ - --wal_dir=$WAL_DIR \ - \ - --num_levels=6 \ - --key_size=$key_size \ - --value_size=$value_size \ - --block_size=4096 \ - --cache_size=$cache_size \ - --cache_numshardbits=6 \ - --compression_type=snappy \ - --compression_ratio=0.5 \ - \ - --hard_rate_limit=2 \ - --rate_limit_delay_max_milliseconds=1000000 \ - --write_buffer_size=$((128 * M)) \ - --max_write_buffer_number=2 \ - --target_file_size_base=$((128 * M)) \ - --max_bytes_for_level_base=$((1 * G)) \ - \ - --sync=0 \ - --disable_data_sync=1 \ - --verify_checksum=1 \ - --delete_obsolete_files_period_micros=$((60 * M)) \ - --max_grandparent_overlap_factor=10 \ - \ - --statistics=1 \ - --stats_per_interval=1 \ - --stats_interval=$((1 * M)) \ - --histogram=1 \ - \ - --memtablerep=skip_list \ - --bloom_bits=10 \ - --open_files=$((20 * K))" - -l0_config=" - --level0_file_num_compaction_trigger=8 \ - 
--level0_slowdown_writes_trigger=16 \ - --level0_stop_writes_trigger=24" - -if [ $duration -gt 0 ]; then - const_params="$const_params --duration=$duration" -fi - -params_r="$const_params $l0_config --max_background_compactions=4 --max_background_flushes=1" -params_w="$const_params $l0_config --max_background_compactions=16 --max_background_flushes=16" -params_bulkload="$const_params --max_background_compactions=16 --max_background_flushes=16 \ - --level0_file_num_compaction_trigger=$((100 * M)) \ - --level0_slowdown_writes_trigger=$((100 * M)) \ - --level0_stop_writes_trigger=$((100 * M))" - -function run_bulkload { - echo "Bulk loading $num_keys random keys into database..." - cmd="./db_bench $params_bulkload --benchmarks=fillrandom \ - --use_existing_db=0 \ - --num=$num_keys \ - --disable_auto_compactions=1 \ - --disable_data_sync=1 \ - --threads=1 2>&1 | tee $output_dir/benchmark_bulkload_fillrandom.log" - echo $cmd | tee $output_dir/benchmark_bulkload_fillrandom.log - eval $cmd - echo "Compacting..." - cmd="./db_bench $params_w --benchmarks=compact \ - --use_existing_db=1 \ - --num=$num_keys \ - --disable_auto_compactions=1 \ - --disable_data_sync=1 \ - --threads=1 2>&1 | tee $output_dir/benchmark_bulkload_compact.log" - echo $cmd | tee $output_dir/benchmark_bulkload_compact.log - eval $cmd -} - -function run_fillseq { - echo "Loading $num_keys keys sequentially into database..." - cmd="./db_bench $params_w --benchmarks=fillseq \ - --use_existing_db=0 \ - --num=$num_keys \ - --threads=1 2>&1 | tee $output_dir/benchmark_fillseq.log" - echo $cmd | tee $output_dir/benchmark_fillseq.log - eval $cmd -} - -function run_overwrite { - echo "Loading $num_keys keys sequentially into database..." 
- cmd="./db_bench $params_w --benchmarks=overwrite \ - --use_existing_db=1 \ - --num=$num_keys \ - --threads=1 2>&1 | tee $output_dir/benchmark_overwrite.log" - echo $cmd | tee $output_dir/benchmark_overwrite.log - eval $cmd -} - -function run_filluniquerandom { - echo "Loading $num_keys unique keys randomly into database..." - cmd="./db_bench $params_w --benchmarks=filluniquerandom \ - --use_existing_db=0 \ - --num=$num_keys \ - --threads=1 2>&1 | tee $output_dir/benchmark_filluniquerandom.log" - echo $cmd | tee $output_dir/benchmark_filluniquerandom.log - eval $cmd -} - -function run_readrandom { - echo "Reading $num_keys random keys from database..." - cmd="./db_bench $params_r --benchmarks=readrandom \ - --use_existing_db=1 \ - --num=$num_keys \ - --threads=$num_read_threads \ - --disable_auto_compactions=1 \ - 2>&1 | tee $output_dir/benchmark_readrandom.log" - echo $cmd | tee $output_dir/benchmark_readrandom.log - eval $cmd -} - -function run_readwhilewriting { - echo "Reading $num_keys random keys from database whiling writing.." - cmd="./db_bench $params_r --benchmarks=readwhilewriting \ - --use_existing_db=1 \ - --num=$num_keys \ - --threads=$num_read_threads \ - --writes_per_second=$writes_per_second \ - 2>&1 | tee $output_dir/benchmark_readwhilewriting.log" - echo $cmd | tee $output_dir/benchmark_readwhilewriting.log - eval $cmd -} - -function now() { - echo `date +"%s"` -} - -report="$output_dir/report.txt" - -# print start time -echo "===== Benchmark =====" - -# Run!!! 
-IFS=',' read -a jobs <<< $1 -for job in ${jobs[@]}; do - echo "Start $job at `date`" | tee -a $report - start=$(now) - if [ $job = bulkload ]; then - run_bulkload - elif [ $job = fillseq ]; then - run_fillseq - elif [ $job = overwrite ]; then - run_overwrite - elif [ $job = filluniquerandom ]; then - run_filluniquerandom - elif [ $job = readrandom ]; then - run_readrandom - elif [ $job = readwhilewriting ]; then - run_readwhilewriting - else - echo "unknown job $job" - exit - fi - end=$(now) - - echo "Complete $job in $((end-start)) seconds" | tee -a $report - if [[ $job = readrandom || $job = readwhilewriting ]]; then - qps=$(grep "micros\/op" "$output_dir/benchmark_$job.log" | grep "ops\/sec" | awk '{print $5}') - line=$(grep "rocksdb.db.get.micros" "$output_dir/benchmark_$job.log") - p50=$(echo $line | awk '{print $7}') - p99=$(echo $line | awk '{print $13}') - echo "Read latency p50 = $p50 us, p99 = $p99 us" | tee -a $report - echo "QPS = $qps ops/sec" | tee -a $report - fi -done diff --git a/tools/db_sanity_test.cc b/tools/db_sanity_test.cc index 237ef07d0d..4ae120c21e 100644 --- a/tools/db_sanity_test.cc +++ b/tools/db_sanity_test.cc @@ -8,15 +8,14 @@ #include #include -#include "rocksdb/db.h" -#include "rocksdb/options.h" -#include "rocksdb/env.h" -#include "rocksdb/slice.h" -#include "rocksdb/status.h" -#include "rocksdb/comparator.h" -#include "rocksdb/table.h" -#include "rocksdb/slice_transform.h" -#include "rocksdb/filter_policy.h" +#include "include/rocksdb/db.h" +#include "include/rocksdb/options.h" +#include "include/rocksdb/env.h" +#include "include/rocksdb/slice.h" +#include "include/rocksdb/status.h" +#include "include/rocksdb/comparator.h" +#include "include/rocksdb/table.h" +#include "include/rocksdb/slice_transform.h" namespace rocksdb { @@ -50,7 +49,7 @@ class SanityTest { return s; } } - return db->Flush(FlushOptions()); + return Status::OK(); } Status Verify() { DB* db; @@ -147,29 +146,13 @@ class SanityTestPlainTableFactory : public 
SanityTest { Options options_; }; -class SanityTestBloomFilter : public SanityTest { - public: - explicit SanityTestBloomFilter(const std::string& path) : SanityTest(path) { - BlockBasedTableOptions table_options; - table_options.filter_policy.reset(NewBloomFilterPolicy(10)); - options_.table_factory.reset(NewBlockBasedTableFactory(table_options)); - } - ~SanityTestBloomFilter() {} - virtual Options GetOptions() const { return options_; } - virtual std::string Name() const { return "BloomFilter"; } - - private: - Options options_; -}; - namespace { bool RunSanityTests(const std::string& command, const std::string& path) { std::vector sanity_tests = { new SanityTestBasic(path), new SanityTestSpecialComparator(path), new SanityTestZlibCompression(path), - new SanityTestPlainTableFactory(path), - new SanityTestBloomFilter(path)}; + new SanityTestPlainTableFactory(path)}; if (command == "create") { fprintf(stderr, "Creating...\n"); diff --git a/tools/db_stress.cc b/tools/db_stress.cc index b5c79bf3b3..e9955953df 100644 --- a/tools/db_stress.cc +++ b/tools/db_stress.cc @@ -209,9 +209,6 @@ static const bool FLAGS_reopen_dummy __attribute__((unused)) = DEFINE_int32(bloom_bits, 10, "Bloom filter bits per key. " "Negative means use default settings."); -DEFINE_bool(use_block_based_filter, false, "use block based filter" - "instead of full filter for block based table"); - DEFINE_string(db, "", "Use the db with the following name."); DEFINE_bool(verify_checksum, false, @@ -760,10 +757,8 @@ class StressTest { ? NewLRUCache(FLAGS_compressed_cache_size) : nullptr), filter_policy_(FLAGS_bloom_bits >= 0 - ? FLAGS_use_block_based_filter - ? NewBloomFilterPolicy(FLAGS_bloom_bits, true) - : NewBloomFilterPolicy(FLAGS_bloom_bits, false) - : nullptr), + ? 
NewBloomFilterPolicy(FLAGS_bloom_bits) + : nullptr), db_(nullptr), new_column_family_name_(1), num_times_reopened_(0) { diff --git a/tools/reduce_levels_test.cc b/tools/reduce_levels_test.cc index b1d58e10ec..b41f36d010 100644 --- a/tools/reduce_levels_test.cc +++ b/tools/reduce_levels_test.cc @@ -76,7 +76,6 @@ Status ReduceLevelTest::OpenDB(bool create_if_missing, int num_levels, opt.num_levels = num_levels; opt.create_if_missing = create_if_missing; opt.max_mem_compaction_level = mem_table_compact_level; - opt.max_background_flushes = 0; rocksdb::Status st = rocksdb::DB::Open(opt, dbname_, &db_); if (!st.ok()) { fprintf(stderr, "Can't open the db:%s\n", st.ToString().c_str()); diff --git a/tools/run_flash_bench.sh b/tools/run_flash_bench.sh deleted file mode 100755 index be7d1631f6..0000000000 --- a/tools/run_flash_bench.sh +++ /dev/null @@ -1,45 +0,0 @@ -#!/bin/bash -# REQUIRE: benchmark.sh exists in the current directory -# After execution of this script, log files are generated in $output_dir. -# report.txt provides a high level statistics - -# Size constants -K=1024 -M=$((1024 * K)) -G=$((1024 * M)) - -n=$((1 * G)) -wps=$((80 * K)) -duration=$((6 * 60 * 60)) -num_read_threads=24 - -# Update these parameters before execution !!! 
-db_dir="/tmp/rocksdb/" -wal_dir="/tmp/rocksdb/" -output_dir="/tmp/output" - -# Test 1: bulk load -OUTPUT_DIR=$output_dir NUM_KEYS=$n DB_DIR=$db_dir WAL_DIR=$wal_dir \ - ./benchmark.sh bulkload - -# Test 2: sequential fill -OUTPUT_DIR=$output_dir NUM_KEYS=$n DB_DIR=$db_dir WAL_DIR=$wal_dir \ - ./benchmark.sh fillseq - -# Test 3: overwrite -OUTPUT_DIR=$output_dir NUM_KEYS=$n DB_DIR=$db_dir WAL_DIR=$wal_dir \ - ./benchmark.sh overwrite - -# Prepare: populate DB with random data -OUTPUT_DIR=$output_dir NUM_KEYS=$n DB_DIR=$db_dir WAL_DIR=$wal_dir \ - ./benchmark.sh filluniquerandom - -# Test 4: random read -OUTPUT_DIR=$output_dir NUM_KEYS=$n DB_DIR=$db_dir WAL_DIR=$wal_dir \ - DURATION=$duration NUM_READ_THREADS=$num_read_threads \ - ./benchmark.sh readrandom - -# Test 5: random read while writing -OUTPUT_DIR=$output_dir NUM_KEYS=$n DB_DIR=$db_dir WAL_DIR=$wal_dir \ - DURATION=$duration NUM_READ_THREADS=$num_read_threads WRITES_PER_SECOND=$wps \ - ./benchmark.sh readwhilewriting diff --git a/tools/sst_dump.cc b/tools/sst_dump.cc index 6c496e8ddb..9b130c7c63 100644 --- a/tools/sst_dump.cc +++ b/tools/sst_dump.cc @@ -68,7 +68,6 @@ class SstFileReader { // options_ and internal_comparator_ will also be used in // ReadSequential internally (specifically, seek-related operations) Options options_; - const ImmutableCFOptions ioptions_; InternalKeyComparator internal_comparator_; unique_ptr table_properties_; }; @@ -77,8 +76,7 @@ SstFileReader::SstFileReader(const std::string& file_path, bool verify_checksum, bool output_hex) :file_name_(file_path), read_num_(0), verify_checksum_(verify_checksum), - output_hex_(output_hex), ioptions_(options_), - internal_comparator_(BytewiseComparator()) { + output_hex_(output_hex), internal_comparator_(BytewiseComparator()) { fprintf(stdout, "Process %s\n", file_path.c_str()); init_result_ = NewTableReader(file_name_); @@ -125,7 +123,7 @@ Status SstFileReader::NewTableReader(const std::string& file_path) { if (s.ok()) { s = 
options_.table_factory->NewTableReader( - ioptions_, soptions_, internal_comparator_, std::move(file_), file_size, + options_, soptions_, internal_comparator_, std::move(file_), file_size, &table_reader_); } return s; diff --git a/util/bloom.cc b/util/bloom.cc index 19d8edead2..723adf843c 100644 --- a/util/bloom.cc +++ b/util/bloom.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2013, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -10,266 +10,42 @@ #include "rocksdb/filter_policy.h" #include "rocksdb/slice.h" -#include "table/block_based_filter_block.h" -#include "table/full_filter_block.h" #include "util/hash.h" -#include "util/coding.h" namespace rocksdb { -class BlockBasedFilterBlockBuilder; -class FullFilterBlockBuilder; - namespace { -class FullFilterBitsBuilder : public FilterBitsBuilder { - public: - explicit FullFilterBitsBuilder(const size_t bits_per_key, - const size_t num_probes) - : bits_per_key_(bits_per_key), - num_probes_(num_probes) { - assert(bits_per_key_); - } - - ~FullFilterBitsBuilder() {} - - virtual void AddKey(const Slice& key) override { - uint32_t hash = BloomHash(key); - if (hash_entries_.size() == 0 || hash != hash_entries_.back()) { - hash_entries_.push_back(hash); - } - } - - // Create a filter that for hashes [0, n-1], the filter is allocated here - // When creating filter, it is ensured that - // total_bits = num_lines * CACHE_LINE_SIZE * 8 - // dst len is >= 5, 1 for num_probes, 4 for num_lines - // Then total_bits = (len - 5) * 8, and cache_line_size could be calulated - // +----------------------------------------------------------------+ - // | filter data with length total_bits/8 | - // +----------------------------------------------------------------+ - // | | - 
// | ... | - // | | - // +----------------------------------------------------------------+ - // | ... | num_probes : 1 byte | num_lines : 4 bytes | - // +----------------------------------------------------------------+ - virtual Slice Finish(std::unique_ptr* buf) override { - uint32_t total_bits, num_lines; - char* data = ReserveSpace(hash_entries_.size(), &total_bits, &num_lines); - assert(data); - - if (total_bits != 0 && num_lines != 0) { - for (auto h : hash_entries_) { - AddHash(h, data, num_lines, total_bits); - } - } - data[total_bits/8] = static_cast(num_probes_); - EncodeFixed32(data + total_bits/8 + 1, static_cast(num_lines)); - - const char* const_data = data; - buf->reset(const_data); - hash_entries_.clear(); - - return Slice(data, total_bits / 8 + 5); - } +class BloomFilterPolicy : public FilterPolicy { private: size_t bits_per_key_; - size_t num_probes_; - std::vector hash_entries_; + size_t k_; + uint32_t (*hash_func_)(const Slice& key); - // Get totalbits that optimized for cpu cache line - uint32_t GetTotalBitsForLocality(uint32_t total_bits); - - // Reserve space for new filter - char* ReserveSpace(const int num_entry, uint32_t* total_bits, - uint32_t* num_lines); - - // Assuming single threaded access to this function. - void AddHash(uint32_t h, char* data, uint32_t num_lines, - uint32_t total_bits); - - // No Copy allowed - FullFilterBitsBuilder(const FullFilterBitsBuilder&); - void operator=(const FullFilterBitsBuilder&); -}; - -uint32_t FullFilterBitsBuilder::GetTotalBitsForLocality(uint32_t total_bits) { - uint32_t num_lines = - (total_bits + CACHE_LINE_SIZE * 8 - 1) / (CACHE_LINE_SIZE * 8); - - // Make num_lines an odd number to make sure more bits are involved - // when determining which block. 
- if (num_lines % 2 == 0) { - num_lines++; - } - return num_lines * (CACHE_LINE_SIZE * 8); -} - -char* FullFilterBitsBuilder::ReserveSpace(const int num_entry, - uint32_t* total_bits, uint32_t* num_lines) { - assert(bits_per_key_); - char* data = nullptr; - if (num_entry != 0) { - uint32_t total_bits_tmp = num_entry * bits_per_key_; - - *total_bits = GetTotalBitsForLocality(total_bits_tmp); - *num_lines = *total_bits / (CACHE_LINE_SIZE * 8); - assert(*total_bits > 0 && *total_bits % 8 == 0); - } else { - // filter is empty, just leave space for metadata - *total_bits = 0; - *num_lines = 0; + void initialize() { + // We intentionally round down to reduce probing cost a little bit + k_ = static_cast(bits_per_key_ * 0.69); // 0.69 =~ ln(2) + if (k_ < 1) k_ = 1; + if (k_ > 30) k_ = 30; } - // Reserve space for Filter - uint32_t sz = *total_bits / 8; - sz += 5; // 4 bytes for num_lines, 1 byte for num_probes - - data = new char[sz]; - memset(data, 0, sz); - return data; -} - -inline void FullFilterBitsBuilder::AddHash(uint32_t h, char* data, - uint32_t num_lines, uint32_t total_bits) { - assert(num_lines > 0 && total_bits > 0); - - const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits - uint32_t b = (h % num_lines) * (CACHE_LINE_SIZE * 8); - - for (uint32_t i = 0; i < num_probes_; ++i) { - // Since CACHE_LINE_SIZE is defined as 2^n, this line will be optimized - // to a simple operation by compiler. 
- const uint32_t bitpos = b + (h % (CACHE_LINE_SIZE * 8)); - data[bitpos / 8] |= (1 << (bitpos % 8)); - - h += delta; - } -} - -class FullFilterBitsReader : public FilterBitsReader { public: - explicit FullFilterBitsReader(const Slice& contents) - : data_(const_cast(contents.data())), - data_len_(contents.size()), - num_probes_(0), num_lines_(0) { - assert(data_); - GetFilterMeta(contents, &num_probes_, &num_lines_); - // Sanitize broken parameter - if (num_lines_ != 0 && (data_len_-5) % num_lines_ != 0) { - num_lines_ = 0; - num_probes_ = 0; - } + explicit BloomFilterPolicy(int bits_per_key, + uint32_t (*hash_func)(const Slice& key)) + : bits_per_key_(bits_per_key), hash_func_(hash_func) { + initialize(); } - - ~FullFilterBitsReader() {} - - virtual bool MayMatch(const Slice& entry) override { - if (data_len_ <= 5) { // remain same with original filter - return false; - } - // Other Error params, including a broken filter, regarded as match - if (num_probes_ == 0 || num_lines_ == 0) return true; - uint32_t hash = BloomHash(entry); - return HashMayMatch(hash, Slice(data_, data_len_), - num_probes_, num_lines_); - } - - private: - // Filter meta data - char* data_; - uint32_t data_len_; - size_t num_probes_; - uint32_t num_lines_; - - // Get num_probes, and num_lines from filter - // If filter format broken, set both to 0. - void GetFilterMeta(const Slice& filter, size_t* num_probes, - uint32_t* num_lines); - - // "filter" contains the data appended by a preceding call to - // CreateFilterFromHash() on this class. This method must return true if - // the key was in the list of keys passed to CreateFilter(). - // This method may return true or false if the key was not on the - // list, but it should aim to return false with a high probability. 
- // - // hash: target to be checked - // filter: the whole filter, including meta data bytes - // num_probes: number of probes, read before hand - // num_lines: filter metadata, read before hand - // Before calling this function, need to ensure the input meta data - // is valid. - bool HashMayMatch(const uint32_t& hash, const Slice& filter, - const size_t& num_probes, const uint32_t& num_lines); - - // No Copy allowed - FullFilterBitsReader(const FullFilterBitsReader&); - void operator=(const FullFilterBitsReader&); -}; - -void FullFilterBitsReader::GetFilterMeta(const Slice& filter, - size_t* num_probes, uint32_t* num_lines) { - uint32_t len = filter.size(); - if (len <= 5) { - // filter is empty or broken - *num_probes = 0; - *num_lines = 0; - return; - } - - *num_probes = filter.data()[len - 5]; - *num_lines = DecodeFixed32(filter.data() + len - 4); -} - -bool FullFilterBitsReader::HashMayMatch(const uint32_t& hash, - const Slice& filter, const size_t& num_probes, - const uint32_t& num_lines) { - uint32_t len = filter.size(); - if (len <= 5) return false; // remain the same with original filter - - // It is ensured the params are valid before calling it - assert(num_probes != 0); - assert(num_lines != 0 && (len - 5) % num_lines == 0); - uint32_t cache_line_size = (len - 5) / num_lines; - const char* data = filter.data(); - - uint32_t h = hash; - const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits - uint32_t b = (h % num_lines) * (cache_line_size * 8); - - for (uint32_t i = 0; i < num_probes; ++i) { - // Since CACHE_LINE_SIZE is defined as 2^n, this line will be optimized - // to a simple and operation by compiler. 
- const uint32_t bitpos = b + (h % (cache_line_size * 8)); - if (((data[bitpos / 8]) & (1 << (bitpos % 8))) == 0) { - return false; - } - - h += delta; - } - - return true; -} - -// An implementation of filter policy -class BloomFilterPolicy : public FilterPolicy { - public: - explicit BloomFilterPolicy(int bits_per_key, bool use_block_based_builder) - : bits_per_key_(bits_per_key), hash_func_(BloomHash), - use_block_based_builder_(use_block_based_builder) { + explicit BloomFilterPolicy(int bits_per_key) + : bits_per_key_(bits_per_key) { + hash_func_ = BloomHash; initialize(); } - ~BloomFilterPolicy() { - } - - virtual const char* Name() const override { + virtual const char* Name() const { return "rocksdb.BuiltinBloomFilter"; } - virtual void CreateFilter(const Slice* keys, int n, - std::string* dst) const override { + virtual void CreateFilter(const Slice* keys, int n, std::string* dst) const { // Compute bloom filter size (in both bits and bytes) size_t bits = n * bits_per_key_; @@ -282,14 +58,14 @@ class BloomFilterPolicy : public FilterPolicy { const size_t init_size = dst->size(); dst->resize(init_size + bytes, 0); - dst->push_back(static_cast(num_probes_)); // Remember # of probes + dst->push_back(static_cast(k_)); // Remember # of probes in filter char* array = &(*dst)[init_size]; for (size_t i = 0; i < (size_t)n; i++) { // Use double-hashing to generate a sequence of hash values. // See analysis in [Kirsch,Mitzenmacher 2006]. 
uint32_t h = hash_func_(keys[i]); const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits - for (size_t j = 0; j < num_probes_; j++) { + for (size_t j = 0; j < k_; j++) { const uint32_t bitpos = h % bits; array[bitpos/8] |= (1 << (bitpos % 8)); h += delta; @@ -297,8 +73,7 @@ class BloomFilterPolicy : public FilterPolicy { } } - virtual bool KeyMayMatch(const Slice& key, - const Slice& bloom_filter) const override { + virtual bool KeyMayMatch(const Slice& key, const Slice& bloom_filter) const { const size_t len = bloom_filter.size(); if (len < 2) return false; @@ -323,43 +98,11 @@ class BloomFilterPolicy : public FilterPolicy { } return true; } - - virtual FilterBitsBuilder* GetFilterBitsBuilder() const override { - if (use_block_based_builder_) { - return nullptr; - } - - return new FullFilterBitsBuilder(bits_per_key_, num_probes_); - } - - virtual FilterBitsReader* GetFilterBitsReader(const Slice& contents) - const override { - return new FullFilterBitsReader(contents); - } - - // If choose to use block based builder - bool UseBlockBasedBuilder() { return use_block_based_builder_; } - - private: - size_t bits_per_key_; - size_t num_probes_; - uint32_t (*hash_func_)(const Slice& key); - - const bool use_block_based_builder_; - - void initialize() { - // We intentionally round down to reduce probing cost a little bit - num_probes_ = static_cast(bits_per_key_ * 0.69); // 0.69 =~ ln(2) - if (num_probes_ < 1) num_probes_ = 1; - if (num_probes_ > 30) num_probes_ = 30; - } }; +} -} // namespace - -const FilterPolicy* NewBloomFilterPolicy(int bits_per_key, - bool use_block_based_builder) { - return new BloomFilterPolicy(bits_per_key, use_block_based_builder); +const FilterPolicy* NewBloomFilterPolicy(int bits_per_key) { + return new BloomFilterPolicy(bits_per_key); } } // namespace rocksdb diff --git a/util/bloom_test.cc b/util/bloom_test.cc index 275592b70a..881e3b0f59 100644 --- a/util/bloom_test.cc +++ b/util/bloom_test.cc @@ -16,13 +16,12 @@ int main() { 
#else #include -#include #include "rocksdb/filter_policy.h" + #include "util/logging.h" #include "util/testharness.h" #include "util/testutil.h" -#include "util/arena.h" using GFLAGS::ParseCommandLineFlags; @@ -37,19 +36,6 @@ static Slice Key(int i, char* buffer) { return Slice(buffer, sizeof(i)); } -static int NextLength(int length) { - if (length < 10) { - length += 1; - } else if (length < 100) { - length += 10; - } else if (length < 1000) { - length += 100; - } else { - length += 1000; - } - return length; -} - class BloomTest { private: const FilterPolicy* policy_; @@ -57,8 +43,7 @@ class BloomTest { std::vector keys_; public: - BloomTest() : policy_( - NewBloomFilterPolicy(FLAGS_bits_per_key)) {} + BloomTest() : policy_(NewBloomFilterPolicy(FLAGS_bits_per_key)) { } ~BloomTest() { delete policy_; @@ -132,6 +117,19 @@ TEST(BloomTest, Small) { ASSERT_TRUE(! Matches("foo")); } +static int NextLength(int length) { + if (length < 10) { + length += 1; + } else if (length < 100) { + length += 10; + } else if (length < 1000) { + length += 100; + } else { + length += 1000; + } + return length; +} + TEST(BloomTest, VaryingLengths) { char buffer[sizeof(int)]; @@ -173,121 +171,6 @@ TEST(BloomTest, VaryingLengths) { // Different bits-per-byte -class FullBloomTest { - private: - const FilterPolicy* policy_; - std::unique_ptr bits_builder_; - std::unique_ptr bits_reader_; - std::unique_ptr buf_; - size_t filter_size_; - - public: - FullBloomTest() : - policy_(NewBloomFilterPolicy(FLAGS_bits_per_key, false)), - filter_size_(0) { - Reset(); - } - - ~FullBloomTest() { - delete policy_; - } - - void Reset() { - bits_builder_.reset(policy_->GetFilterBitsBuilder()); - bits_reader_.reset(nullptr); - buf_.reset(nullptr); - filter_size_ = 0; - } - - void Add(const Slice& s) { - bits_builder_->AddKey(s); - } - - void Build() { - Slice filter = bits_builder_->Finish(&buf_); - bits_reader_.reset(policy_->GetFilterBitsReader(filter)); - filter_size_ = filter.size(); - } - - size_t 
FilterSize() const { - return filter_size_; - } - - bool Matches(const Slice& s) { - if (bits_reader_ == nullptr) { - Build(); - } - return bits_reader_->MayMatch(s); - } - - double FalsePositiveRate() { - char buffer[sizeof(int)]; - int result = 0; - for (int i = 0; i < 10000; i++) { - if (Matches(Key(i + 1000000000, buffer))) { - result++; - } - } - return result / 10000.0; - } -}; - -TEST(FullBloomTest, FullEmptyFilter) { - // Empty filter is not match, at this level - ASSERT_TRUE(!Matches("hello")); - ASSERT_TRUE(!Matches("world")); -} - -TEST(FullBloomTest, FullSmall) { - Add("hello"); - Add("world"); - ASSERT_TRUE(Matches("hello")); - ASSERT_TRUE(Matches("world")); - ASSERT_TRUE(!Matches("x")); - ASSERT_TRUE(!Matches("foo")); -} - -TEST(FullBloomTest, FullVaryingLengths) { - char buffer[sizeof(int)]; - - // Count number of filters that significantly exceed the false positive rate - int mediocre_filters = 0; - int good_filters = 0; - - for (int length = 1; length <= 10000; length = NextLength(length)) { - Reset(); - for (int i = 0; i < length; i++) { - Add(Key(i, buffer)); - } - Build(); - - ASSERT_LE(FilterSize(), (size_t)((length * 10 / 8) + 128 + 5)) << length; - - // All added keys must match - for (int i = 0; i < length; i++) { - ASSERT_TRUE(Matches(Key(i, buffer))) - << "Length " << length << "; key " << i; - } - - // Check false positive rate - double rate = FalsePositiveRate(); - if (kVerbose >= 1) { - fprintf(stderr, "False positives: %5.2f%% @ length = %6d ; bytes = %6d\n", - rate*100.0, length, static_cast(FilterSize())); - } - ASSERT_LE(rate, 0.02); // Must not be over 2% - if (rate > 0.0125) - mediocre_filters++; // Allowed, but not too often - else - good_filters++; - } - if (kVerbose >= 1) { - fprintf(stderr, "Filters: %d good, %d mediocre\n", - good_filters, mediocre_filters); - } - ASSERT_LE(mediocre_filters, good_filters/5); -} - } // namespace rocksdb int main(int argc, char** argv) { diff --git a/util/cache_bench.cc b/util/cache_bench.cc 
deleted file mode 100644 index 3d006ecf88..0000000000 --- a/util/cache_bench.cc +++ /dev/null @@ -1,278 +0,0 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. - -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif -#ifndef GFLAGS -#include -int main() { - fprintf(stderr, "Please install gflags to run rocksdb tools\n"); - return 1; -} -#else - -#include -#include -#include -#include - -#include "rocksdb/db.h" -#include "rocksdb/cache.h" -#include "rocksdb/env.h" -#include "port/port.h" -#include "util/mutexlock.h" -#include "util/random.h" - -using GFLAGS::ParseCommandLineFlags; - -static const uint32_t KB = 1024; - -DEFINE_int32(threads, 16, "Number of concurrent threads to run."); -DEFINE_int64(cache_size, 8 * KB * KB, - "Number of bytes to use as a cache of uncompressed data."); -DEFINE_int32(num_shard_bits, 4, "shard_bits."); - -DEFINE_int64(max_key, 1 * KB * KB * KB, "Max number of key to place in cache"); -DEFINE_uint64(ops_per_thread, 1200000, "Number of operations per thread."); - -DEFINE_bool(populate_cache, false, "Populate cache before operations"); -DEFINE_int32(insert_percent, 40, - "Ratio of insert to total workload (expressed as a percentage)"); -DEFINE_int32(lookup_percent, 50, - "Ratio of lookup to total workload (expressed as a percentage)"); -DEFINE_int32(erase_percent, 10, - "Ratio of erase to total workload (expressed as a percentage)"); - -namespace rocksdb { - -class CacheBench; -namespace { -void deleter(const Slice& key, void* value) { - delete reinterpret_cast(value); -} - -// State shared by all concurrent executions of the same benchmark. 
-class SharedState { - public: - explicit SharedState(CacheBench* cache_bench) - : cv_(&mu_), - num_threads_(FLAGS_threads), - num_initialized_(0), - start_(false), - num_done_(0), - cache_bench_(cache_bench) { - } - - ~SharedState() {} - - port::Mutex* GetMutex() { - return &mu_; - } - - port::CondVar* GetCondVar() { - return &cv_; - } - - CacheBench* GetCacheBench() const { - return cache_bench_; - } - - void IncInitialized() { - num_initialized_++; - } - - void IncDone() { - num_done_++; - } - - bool AllInitialized() const { - return num_initialized_ >= num_threads_; - } - - bool AllDone() const { - return num_done_ >= num_threads_; - } - - void SetStart() { - start_ = true; - } - - bool Started() const { - return start_; - } - - private: - port::Mutex mu_; - port::CondVar cv_; - - const uint64_t num_threads_; - uint64_t num_initialized_; - bool start_; - uint64_t num_done_; - - CacheBench* cache_bench_; -}; - -// Per-thread state for concurrent executions of the same benchmark. -struct ThreadState { - uint32_t tid; - Random rnd; - SharedState* shared; - - ThreadState(uint32_t index, SharedState *shared) - : tid(index), - rnd(1000 + index), - shared(shared) {} -}; -} // namespace - -class CacheBench { - public: - CacheBench() : - cache_(NewLRUCache(FLAGS_cache_size, FLAGS_num_shard_bits)), - num_threads_(FLAGS_threads) {} - - ~CacheBench() {} - - void PopulateCache() { - Random rnd(1); - for (int64_t i = 0; i < FLAGS_cache_size; i++) { - uint64_t rand_key = rnd.Next() % FLAGS_max_key; - // Cast uint64* to be char*, data would be copied to cache - Slice key(reinterpret_cast(&rand_key), 8); - // do insert - auto handle = cache_->Insert(key, new char[10], 1, &deleter); - cache_->Release(handle); - } - } - - bool Run() { - rocksdb::Env* env = rocksdb::Env::Default(); - - PrintEnv(); - SharedState shared(this); - std::vector threads(num_threads_); - for (uint32_t i = 0; i < num_threads_; i++) { - threads[i] = new ThreadState(i, &shared); - 
env->StartThread(ThreadBody, threads[i]); - } - { - MutexLock l(shared.GetMutex()); - while (!shared.AllInitialized()) { - shared.GetCondVar()->Wait(); - } - // Record start time - uint64_t start_time = env->NowMicros(); - - // Start all threads - shared.SetStart(); - shared.GetCondVar()->SignalAll(); - - // Wait threads to complete - while (!shared.AllDone()) { - shared.GetCondVar()->Wait(); - } - - // Record end time - uint64_t end_time = env->NowMicros(); - double elapsed = static_cast(end_time - start_time) * 1e-6; - uint32_t qps = static_cast( - static_cast(FLAGS_threads * FLAGS_ops_per_thread) / elapsed); - fprintf(stdout, "Complete in %.3f s; QPS = %u\n", elapsed, qps); - } - return true; - } - - private: - std::shared_ptr cache_; - uint32_t num_threads_; - - static void ThreadBody(void* v) { - ThreadState* thread = reinterpret_cast(v); - SharedState* shared = thread->shared; - - { - MutexLock l(shared->GetMutex()); - shared->IncInitialized(); - if (shared->AllInitialized()) { - shared->GetCondVar()->SignalAll(); - } - while (!shared->Started()) { - shared->GetCondVar()->Wait(); - } - } - thread->shared->GetCacheBench()->OperateCache(thread); - - { - MutexLock l(shared->GetMutex()); - shared->IncDone(); - if (shared->AllDone()) { - shared->GetCondVar()->SignalAll(); - } - } - } - - void OperateCache(ThreadState* thread) { - for (uint64_t i = 0; i < FLAGS_ops_per_thread; i++) { - uint64_t rand_key = thread->rnd.Next() % FLAGS_max_key; - // Cast uint64* to be char*, data would be copied to cache - Slice key(reinterpret_cast(&rand_key), 8); - int32_t prob_op = thread->rnd.Uniform(100); - if (prob_op >= 0 && prob_op < FLAGS_insert_percent) { - // do insert - auto handle = cache_->Insert(key, new char[10], 1, &deleter); - cache_->Release(handle); - } else if (prob_op -= FLAGS_insert_percent && - prob_op < FLAGS_lookup_percent) { - // do lookup - auto handle = cache_->Lookup(key); - if (handle) { - cache_->Release(handle); - } - } else if (prob_op -= 
FLAGS_lookup_percent && - prob_op < FLAGS_erase_percent) { - // do erase - cache_->Erase(key); - } - } - } - - void PrintEnv() const { - printf("RocksDB version : %d.%d\n", kMajorVersion, kMinorVersion); - printf("Number of threads : %d\n", FLAGS_threads); - printf("Ops per thread : %" PRIu64 "\n", FLAGS_ops_per_thread); - printf("Cache size : %" PRIu64 "\n", FLAGS_cache_size); - printf("Num shard bits : %d\n", FLAGS_num_shard_bits); - printf("Max key : %" PRIu64 "\n", FLAGS_max_key); - printf("Populate cache : %d\n", FLAGS_populate_cache); - printf("Insert percentage : %d%%\n", FLAGS_insert_percent); - printf("Lookup percentage : %d%%\n", FLAGS_lookup_percent); - printf("Erase percentage : %d%%\n", FLAGS_erase_percent); - printf("----------------------------\n"); - } -}; -} // namespace rocksdb - -int main(int argc, char** argv) { - ParseCommandLineFlags(&argc, &argv, true); - - if (FLAGS_threads <= 0) { - fprintf(stderr, "threads number <= 0\n"); - exit(1); - } - - rocksdb::CacheBench bench; - if (FLAGS_populate_cache) { - bench.PopulateCache(); - } - if (bench.Run()) { - return 0; - } else { - return 1; - } -} - -#endif // GFLAGS diff --git a/util/comparator.cc b/util/comparator.cc index bbf0262f0c..adeacac0ac 100644 --- a/util/comparator.cc +++ b/util/comparator.cc @@ -69,29 +69,13 @@ class BytewiseComparatorImpl : public Comparator { // *key is a run of 0xffs. Leave it alone. 
} }; - -class ReverseBytewiseComparatorImpl : public BytewiseComparatorImpl { - public: - ReverseBytewiseComparatorImpl() { } - - virtual const char* Name() const { - return "rocksdb.ReverseBytewiseComparator"; - } - - virtual int Compare(const Slice& a, const Slice& b) const { - return -a.compare(b); - } -}; - -}// namespace +} // namespace static port::OnceType once = LEVELDB_ONCE_INIT; static const Comparator* bytewise; -static const Comparator* rbytewise; static void InitModule() { bytewise = new BytewiseComparatorImpl; - rbytewise= new ReverseBytewiseComparatorImpl; } const Comparator* BytewiseComparator() { @@ -99,9 +83,4 @@ const Comparator* BytewiseComparator() { return bytewise; } -const Comparator* ReverseBytewiseComparator() { - port::InitOnce(&once, InitModule); - return rbytewise; -} - } // namespace rocksdb diff --git a/util/db_info_dummper.cc b/util/db_info_dummper.cc index 2e0d344816..d5dd97ad2c 100644 --- a/util/db_info_dummper.cc +++ b/util/db_info_dummper.cc @@ -6,10 +6,7 @@ // Must not be included from any .h files to avoid polluting the namespace // with macros. -#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS -#endif - #include #include #include diff --git a/util/dynamic_bloom_test.cc b/util/dynamic_bloom_test.cc index 6d228e81df..3e55488f22 100644 --- a/util/dynamic_bloom_test.cc +++ b/util/dynamic_bloom_test.cc @@ -11,10 +11,7 @@ int main() { } #else -#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS -#endif - #include #include #include diff --git a/util/env_test.cc b/util/env_test.cc index 1779f1aa08..c0d00ce94d 100644 --- a/util/env_test.cc +++ b/util/env_test.cc @@ -17,11 +17,6 @@ #include #endif -#ifdef ROCKSDB_FALLOCATE_PRESENT -#include -#include -#endif - #include "rocksdb/env.h" #include "port/port.h" #include "util/coding.h" @@ -397,9 +392,6 @@ TEST(EnvPosixTest, DecreaseNumBgThreads) { } #ifdef OS_LINUX -// Travis doesn't support fallocate or getting unique ID from files for whatever -// reason. 
-#ifndef TRAVIS // To make sure the Env::GetUniqueId() related tests work correctly, The files // should be stored in regular storage like "hard disk" or "flash device". // Otherwise we cannot get the correct id. @@ -483,31 +475,6 @@ TEST(EnvPosixTest, RandomAccessUniqueID) { #ifdef ROCKSDB_FALLOCATE_PRESENT TEST(EnvPosixTest, AllocateTest) { std::string fname = GetOnDiskTestDir() + "/preallocate_testfile"; - - // Try fallocate in a file to see whether the target file system supports it. - // Skip the test if fallocate is not supported. - std::string fname_test_fallocate = - GetOnDiskTestDir() + "/preallocate_testfile_2"; - int fd = -1; - do { - fd = open(fname_test_fallocate.c_str(), O_CREAT | O_RDWR | O_TRUNC, 0644); - } while (fd < 0 && errno == EINTR); - ASSERT_GT(fd, 0); - - int alloc_status = fallocate(fd, 0, 0, 1); - - int err_number = 0; - if (alloc_status != 0) { - err_number = errno; - fprintf(stderr, "Warning: fallocate() fails, %s\n", strerror(err_number)); - } - close(fd); - ASSERT_OK(env_->DeleteFile(fname_test_fallocate)); - if (alloc_status != 0 && err_number == EOPNOTSUPP) { - // The filesystem containing the file does not support fallocate - return; - } - EnvOptions soptions; soptions.use_mmap_writes = false; unique_ptr wfile; @@ -540,7 +507,7 @@ TEST(EnvPosixTest, AllocateTest) { // verify that preallocated blocks were deallocated on file close ASSERT_GT(st_blocks, f_stat.st_blocks); } -#endif // ROCKSDB_FALLOCATE_PRESENT +#endif // Returns true if any of the strings in ss are the prefix of another string. 
bool HasPrefix(const std::unordered_set& ss) { @@ -671,8 +638,7 @@ TEST(EnvPosixTest, InvalidateCache) { // Delete the file ASSERT_OK(env_->DeleteFile(fname)); } -#endif // not TRAVIS -#endif // OS_LINUX +#endif TEST(EnvPosixTest, PosixRandomRWFileTest) { EnvOptions soptions; @@ -768,41 +734,6 @@ TEST(EnvPosixTest, LogBufferTest) { ASSERT_EQ(10, test_logger.char_x_count); } -class TestLogger2 : public Logger { - public: - explicit TestLogger2(size_t max_log_size) : max_log_size_(max_log_size) {} - virtual void Logv(const char* format, va_list ap) override { - char new_format[2000]; - std::fill_n(new_format, sizeof(new_format), '2'); - { - va_list backup_ap; - va_copy(backup_ap, ap); - int n = vsnprintf(new_format, sizeof(new_format) - 1, format, backup_ap); - // 48 bytes for extra information + bytes allocated - ASSERT_TRUE( - n <= 48 + static_cast(max_log_size_ - sizeof(struct timeval))); - ASSERT_TRUE(n > static_cast(max_log_size_ - sizeof(struct timeval))); - va_end(backup_ap); - } - } - size_t max_log_size_; -}; - -TEST(EnvPosixTest, LogBufferMaxSizeTest) { - char bytes9000[9000]; - std::fill_n(bytes9000, sizeof(bytes9000), '1'); - bytes9000[sizeof(bytes9000) - 1] = '\0'; - - for (size_t max_log_size = 256; max_log_size <= 1024; - max_log_size += 1024 - 256) { - TestLogger2 test_logger(max_log_size); - test_logger.SetInfoLogLevel(InfoLogLevel::INFO_LEVEL); - LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, &test_logger); - LogToBuffer(&log_buffer, max_log_size, "%s", bytes9000); - log_buffer.FlushBufferToLog(); - } -} - } // namespace rocksdb int main(int argc, char** argv) { diff --git a/util/hash.cc b/util/hash.cc index 37eaa4057f..e38c186c3b 100644 --- a/util/hash.cc +++ b/util/hash.cc @@ -31,26 +31,14 @@ uint32_t Hash(const char* data, size_t n, uint32_t seed) { // Pick up remaining bytes switch (limit - data) { - // Note: It would be better if this was cast to unsigned char, but that - // would be a disk format change since we previously didn't have any cast 
- // at all (so gcc used signed char). - // To understand the difference between shifting unsigned and signed chars, - // let's use 250 as an example. unsigned char will be 250, while signed char - // will be -6. Bit-wise, they are equivalent: 11111010. However, when - // converting negative number (signed char) to int, it will be converted - // into negative int (of equivalent value, which is -6), while converting - // positive number (unsigned char) will be converted to 250. Bitwise, - // this looks like this: - // signed char 11111010 -> int 11111111111111111111111111111010 - // unsigned char 11111010 -> int 00000000000000000000000011111010 case 3: - h += static_cast(static_cast(data[2]) << 16); - // fall through + h += data[2] << 16; + // fall through case 2: - h += static_cast(static_cast(data[1]) << 8); - // fall through + h += data[1] << 8; + // fall through case 1: - h += static_cast(static_cast(data[0])); + h += data[0]; h *= m; h ^= (h >> r); break; diff --git a/util/hash_cuckoo_rep.cc b/util/hash_cuckoo_rep.cc index 2ee05faac5..a9a79a2742 100644 --- a/util/hash_cuckoo_rep.cc +++ b/util/hash_cuckoo_rep.cc @@ -70,7 +70,7 @@ class HashCuckooRep : public MemTableRep { } cuckoo_path_ = reinterpret_cast( - arena_->Allocate(sizeof(int) * (cuckoo_path_max_depth_ + 1))); + arena_->Allocate(sizeof(int*) * (cuckoo_path_max_depth_ + 1))); is_nearly_full_ = false; } diff --git a/util/histogram.cc b/util/histogram.cc index 0dbfba7d62..968769cef5 100644 --- a/util/histogram.cc +++ b/util/histogram.cc @@ -53,7 +53,7 @@ HistogramBucketMapper::HistogramBucketMapper() } } -size_t HistogramBucketMapper::IndexForValue(const uint64_t value) const { +const size_t HistogramBucketMapper::IndexForValue(const uint64_t value) const { if (value >= maxBucketValue_) { return bucketValues_.size() - 1; } else if ( value >= minBucketValue_ ) { diff --git a/util/histogram.h b/util/histogram.h index af3a019d80..d95588dc2d 100644 --- a/util/histogram.h +++ b/util/histogram.h @@ -23,10 
+23,10 @@ class HistogramBucketMapper { HistogramBucketMapper(); // converts a value to the bucket index. - size_t IndexForValue(const uint64_t value) const; + const size_t IndexForValue(const uint64_t value) const; // number of buckets required. - size_t BucketCount() const { + const size_t BucketCount() const { return bucketValues_.size(); } @@ -65,8 +65,6 @@ class HistogramImpl { virtual double StandardDeviation() const; virtual void Data(HistogramData * const data) const; - virtual ~HistogramImpl() {} - private: // To be able to use HistogramImpl as thread local variable, its constructor // has to be static. That's why we're using manually values from BucketMapper diff --git a/util/ldb_cmd.cc b/util/ldb_cmd.cc index 8eda39bf92..1aa3856a35 100644 --- a/util/ldb_cmd.cc +++ b/util/ldb_cmd.cc @@ -14,7 +14,6 @@ #include "rocksdb/write_batch.h" #include "rocksdb/cache.h" #include "util/coding.h" -#include "util/scoped_arena_iterator.h" #include "utilities/ttl/db_ttl_impl.h" #include @@ -325,7 +324,7 @@ bool LDBCommand::ParseKeyValue(const string& line, string* key, string* value, bool LDBCommand::ValidateCmdLineOptions() { for (map::const_iterator itr = option_map_.begin(); - itr != option_map_.end(); ++itr) { + itr != option_map_.end(); itr++) { if (find(valid_cmd_line_options_.begin(), valid_cmd_line_options_.end(), itr->first) == valid_cmd_line_options_.end()) { @@ -335,7 +334,7 @@ bool LDBCommand::ValidateCmdLineOptions() { } for (vector::const_iterator itr = flags_.begin(); - itr != flags_.end(); ++itr) { + itr != flags_.end(); itr++) { if (find(valid_cmd_line_options_.begin(), valid_cmd_line_options_.end(), *itr) == valid_cmd_line_options_.end()) { @@ -541,7 +540,6 @@ void ManifestDumpCommand::DoCommand() { } else { exec_state_ = LDBCommandExecuteResult::FAILED( "Multiple MANIFEST files found; use --path to select one"); - closedir(d); return; } } @@ -564,8 +562,7 @@ void ManifestDumpCommand::DoCommand() { // if VersionSet::DumpManifest() depends on any option 
done by // SanitizeOptions(), we need to initialize it manually. options.db_paths.emplace_back("dummy", 0); - WriteController wc; - VersionSet versions(dbname, &options, sopt, tc.get(), &wc); + VersionSet versions(dbname, &options, sopt, tc.get()); Status s = versions.DumpManifest(options, file, verbose_, is_key_hex_); if (!s.ok()) { printf("Error in processing file %s %s\n", manifestfile.c_str(), @@ -742,8 +739,7 @@ void InternalDumpCommand::DoCommand() { uint64_t c=0; uint64_t s1=0,s2=0; // Setup internal key iterator - Arena arena; - ScopedArenaIterator iter(idb->TEST_NewInternalIterator(&arena)); + auto iter = unique_ptr(idb->TEST_NewInternalIterator()); Status st = iter->status(); if (!st.ok()) { exec_state_ = LDBCommandExecuteResult::FAILED("Iterator error:" @@ -1090,8 +1086,7 @@ Status ReduceDBLevelsCommand::GetOldNumOfLevels(Options& opt, NewLRUCache(opt.max_open_files - 10, opt.table_cache_numshardbits, opt.table_cache_remove_scan_count_limit)); const InternalKeyComparator cmp(opt.comparator); - WriteController wc; - VersionSet versions(db_path_, &opt, soptions, tc.get(), &wc); + VersionSet versions(db_path_, &opt, soptions, tc.get()); std::vector dummy; ColumnFamilyDescriptor dummy_descriptor(kDefaultColumnFamilyName, ColumnFamilyOptions(opt)); @@ -1538,7 +1533,7 @@ void BatchPutCommand::DoCommand() { WriteBatch batch; for (vector>::const_iterator itr - = key_values_.begin(); itr != key_values_.end(); ++itr) { + = key_values_.begin(); itr != key_values_.end(); itr++) { batch.Put(itr->first, itr->second); } Status st = db_->Write(WriteOptions(), &batch); diff --git a/util/ldb_cmd_execute_result.h b/util/ldb_cmd_execute_result.h index b8e6c4634f..b9121b2b0a 100644 --- a/util/ldb_cmd_execute_result.h +++ b/util/ldb_cmd_execute_result.h @@ -13,10 +13,15 @@ public: EXEC_NOT_STARTED = 0, EXEC_SUCCEED = 1, EXEC_FAILED = 2, }; - LDBCommandExecuteResult() : state_(EXEC_NOT_STARTED), message_("") {} + LDBCommandExecuteResult() { + state_ = EXEC_NOT_STARTED; + 
message_ = ""; + } - LDBCommandExecuteResult(State state, std::string& msg) : - state_(state), message_(msg) {} + LDBCommandExecuteResult(State state, std::string& msg) { + state_ = state; + message_ = msg; + } std::string ToString() { std::string ret; diff --git a/util/log_buffer.cc b/util/log_buffer.cc index ddddaec9fd..726c01442b 100644 --- a/util/log_buffer.cc +++ b/util/log_buffer.cc @@ -13,17 +13,17 @@ LogBuffer::LogBuffer(const InfoLogLevel log_level, Logger*info_log) : log_level_(log_level), info_log_(info_log) {} -void LogBuffer::AddLogToBuffer(size_t max_log_size, const char* format, - va_list ap) { +void LogBuffer::AddLogToBuffer(const char* format, va_list ap) { if (!info_log_ || log_level_ < info_log_->GetInfoLogLevel()) { // Skip the level because of its level. return; } - char* alloc_mem = arena_.AllocateAligned(max_log_size); + const size_t kLogSizeLimit = 512; + char* alloc_mem = arena_.AllocateAligned(kLogSizeLimit); BufferedLog* buffered_log = new (alloc_mem) BufferedLog(); char* p = buffered_log->message; - char* limit = alloc_mem + max_log_size - 1; + char* limit = alloc_mem + kLogSizeLimit - 1; // store the time gettimeofday(&(buffered_log->now_tv), nullptr); @@ -61,22 +61,11 @@ void LogBuffer::FlushBufferToLog() { logs_.clear(); } -void LogToBuffer(LogBuffer* log_buffer, size_t max_log_size, const char* format, - ...) { - if (log_buffer != nullptr) { - va_list ap; - va_start(ap, format); - log_buffer->AddLogToBuffer(max_log_size, format, ap); - va_end(ap); - } -} - void LogToBuffer(LogBuffer* log_buffer, const char* format, ...) 
{ - const size_t kDefaultMaxLogSize = 512; if (log_buffer != nullptr) { va_list ap; va_start(ap, format); - log_buffer->AddLogToBuffer(kDefaultMaxLogSize, format, ap); + log_buffer->AddLogToBuffer(format, ap); va_end(ap); } } diff --git a/util/log_buffer.h b/util/log_buffer.h index 2d790086ee..2a24bf854c 100644 --- a/util/log_buffer.h +++ b/util/log_buffer.h @@ -21,9 +21,8 @@ class LogBuffer { // info_log: logger to write the logs to LogBuffer(const InfoLogLevel log_level, Logger* info_log); - // Add a log entry to the buffer. Use default max_log_size. - // max_log_size indicates maximize log size, including some metadata. - void AddLogToBuffer(size_t max_log_size, const char* format, va_list ap); + // Add a log entry to the buffer. + void AddLogToBuffer(const char* format, va_list ap); size_t IsEmpty() const { return logs_.empty(); } @@ -45,10 +44,6 @@ class LogBuffer { // Add log to the LogBuffer for a delayed info logging. It can be used when // we want to add some logs inside a mutex. -// max_log_size indicates maximize log size, including some metadata. -extern void LogToBuffer(LogBuffer* log_buffer, size_t max_log_size, - const char* format, ...); -// Same as previous function, but with default max log size. 
extern void LogToBuffer(LogBuffer* log_buffer, const char* format, ...); } // namespace rocksdb diff --git a/util/logging.cc b/util/logging.cc index 98d96b82bb..1b5549d731 100644 --- a/util/logging.cc +++ b/util/logging.cc @@ -9,10 +9,7 @@ #include "util/logging.h" -#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS -#endif - #include #include #include @@ -45,7 +42,7 @@ int AppendHumanBytes(uint64_t bytes, char* output, int len) { void AppendNumberTo(std::string* str, uint64_t num) { char buf[30]; - snprintf(buf, sizeof(buf), "%" PRIu64, num); + snprintf(buf, sizeof(buf), "%llu", (unsigned long long) num); str->append(buf); } diff --git a/util/logging.h b/util/logging.h index 7ca8ae0a30..ce02697268 100644 --- a/util/logging.h +++ b/util/logging.h @@ -19,6 +19,7 @@ namespace rocksdb { class Slice; +class WritableFile; // Append a human-readable size in bytes int AppendHumanBytes(uint64_t bytes, char* output, int len); diff --git a/util/mutable_cf_options.cc b/util/mutable_cf_options.cc deleted file mode 100644 index 1c710c6562..0000000000 --- a/util/mutable_cf_options.cc +++ /dev/null @@ -1,72 +0,0 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. - -#include -#include -#include "rocksdb/options.h" -#include "rocksdb/immutable_options.h" -#include "util/mutable_cf_options.h" - -namespace rocksdb { - -namespace { -// Multiple two operands. If they overflow, return op1. 
-uint64_t MultiplyCheckOverflow(uint64_t op1, int op2) { - if (op1 == 0) { - return 0; - } - if (op2 <= 0) { - return op1; - } - uint64_t casted_op2 = (uint64_t) op2; - if (std::numeric_limits::max() / op1 < casted_op2) { - return op1; - } - return op1 * casted_op2; -} -} // anonymous namespace - -void MutableCFOptions::RefreshDerivedOptions( - const ImmutableCFOptions& ioptions) { - max_file_size.resize(ioptions.num_levels); - level_max_bytes.resize(ioptions.num_levels); - for (int i = 0; i < ioptions.num_levels; ++i) { - if (i == 0 && ioptions.compaction_style == kCompactionStyleUniversal) { - max_file_size[i] = ULLONG_MAX; - level_max_bytes[i] = max_bytes_for_level_base; - } else if (i > 1) { - max_file_size[i] = MultiplyCheckOverflow(max_file_size[i - 1], - target_file_size_multiplier); - level_max_bytes[i] = MultiplyCheckOverflow( - MultiplyCheckOverflow(level_max_bytes[i - 1], - max_bytes_for_level_multiplier), - max_bytes_for_level_multiplier_additional[i - 1]); - } else { - max_file_size[i] = target_file_size_base; - level_max_bytes[i] = max_bytes_for_level_base; - } - } -} - -uint64_t MutableCFOptions::MaxFileSizeForLevel(int level) const { - assert(level >= 0); - assert(level < (int)max_file_size.size()); - return max_file_size[level]; -} -uint64_t MutableCFOptions::MaxBytesForLevel(int level) const { - // Note: the result for level zero is not really used since we set - // the level-0 compaction threshold based on number of files. 
- assert(level >= 0); - assert(level < (int)level_max_bytes.size()); - return level_max_bytes[level]; -} -uint64_t MutableCFOptions::MaxGrandParentOverlapBytes(int level) const { - return MaxFileSizeForLevel(level) * max_grandparent_overlap_factor; -} -uint64_t MutableCFOptions::ExpandedCompactionByteSizeLimit(int level) const { - return MaxFileSizeForLevel(level) * expanded_compaction_factor; -} - -} // namespace rocksdb diff --git a/util/mutable_cf_options.h b/util/mutable_cf_options.h deleted file mode 100644 index 02f63fed4a..0000000000 --- a/util/mutable_cf_options.h +++ /dev/null @@ -1,102 +0,0 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. - -#pragma once - -#include -#include "rocksdb/options.h" -#include "rocksdb/immutable_options.h" - -namespace rocksdb { - -struct MutableCFOptions { - MutableCFOptions(const Options& options, const ImmutableCFOptions& ioptions) - : write_buffer_size(options.write_buffer_size), - arena_block_size(options.arena_block_size), - memtable_prefix_bloom_bits(options.memtable_prefix_bloom_bits), - memtable_prefix_bloom_probes(options.memtable_prefix_bloom_probes), - memtable_prefix_bloom_huge_page_tlb_size( - options.memtable_prefix_bloom_huge_page_tlb_size), - max_successive_merges(options.max_successive_merges), - filter_deletes(options.filter_deletes), - level0_file_num_compaction_trigger( - options.level0_file_num_compaction_trigger), - level0_slowdown_writes_trigger(options.level0_slowdown_writes_trigger), - level0_stop_writes_trigger(options.level0_stop_writes_trigger), - max_grandparent_overlap_factor(options.max_grandparent_overlap_factor), - expanded_compaction_factor(options.expanded_compaction_factor), - source_compaction_factor(options.source_compaction_factor), - 
target_file_size_base(options.target_file_size_base), - target_file_size_multiplier(options.target_file_size_multiplier), - max_bytes_for_level_base(options.max_bytes_for_level_base), - max_bytes_for_level_multiplier(options.max_bytes_for_level_multiplier), - max_bytes_for_level_multiplier_additional( - options.max_bytes_for_level_multiplier_additional) - { - RefreshDerivedOptions(ioptions); - } - MutableCFOptions() - : write_buffer_size(0), - arena_block_size(0), - memtable_prefix_bloom_bits(0), - memtable_prefix_bloom_probes(0), - memtable_prefix_bloom_huge_page_tlb_size(0), - max_successive_merges(0), - filter_deletes(false), - level0_file_num_compaction_trigger(0), - level0_slowdown_writes_trigger(0), - level0_stop_writes_trigger(0), - max_grandparent_overlap_factor(0), - expanded_compaction_factor(0), - source_compaction_factor(0), - target_file_size_base(0), - target_file_size_multiplier(0), - max_bytes_for_level_base(0), - max_bytes_for_level_multiplier(0) - {} - - // Must be called after any change to MutableCFOptions - void RefreshDerivedOptions(const ImmutableCFOptions& ioptions); - - // Get the max file size in a given level. - uint64_t MaxFileSizeForLevel(int level) const; - // Returns maximum total bytes of data on a given level. - uint64_t MaxBytesForLevel(int level) const; - // Returns maximum total overlap bytes with grandparent - // level (i.e., level+2) before we stop building a single - // file in level->level+1 compaction. 
- uint64_t MaxGrandParentOverlapBytes(int level) const; - uint64_t ExpandedCompactionByteSizeLimit(int level) const; - - // Memtable related options - size_t write_buffer_size; - size_t arena_block_size; - uint32_t memtable_prefix_bloom_bits; - uint32_t memtable_prefix_bloom_probes; - size_t memtable_prefix_bloom_huge_page_tlb_size; - size_t max_successive_merges; - bool filter_deletes; - - // Compaction related options - int level0_file_num_compaction_trigger; - int level0_slowdown_writes_trigger; - int level0_stop_writes_trigger; - int max_grandparent_overlap_factor; - int expanded_compaction_factor; - int source_compaction_factor; - int target_file_size_base; - int target_file_size_multiplier; - uint64_t max_bytes_for_level_base; - int max_bytes_for_level_multiplier; - std::vector max_bytes_for_level_multiplier_additional; - - // Derived options - // Per-level target file size. - std::vector max_file_size; - // Per-level max bytes - std::vector level_max_bytes; -}; - -} // namespace rocksdb diff --git a/util/options.cc b/util/options.cc index b5dc98317d..b16c6f2f57 100644 --- a/util/options.cc +++ b/util/options.cc @@ -8,12 +8,8 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. 
#include "rocksdb/options.h" -#include "rocksdb/immutable_options.h" -#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS -#endif - #include #include @@ -32,38 +28,6 @@ namespace rocksdb { -ImmutableCFOptions::ImmutableCFOptions(const Options& options) - : compaction_style(options.compaction_style), - compaction_options_universal(options.compaction_options_universal), - compaction_options_fifo(options.compaction_options_fifo), - prefix_extractor(options.prefix_extractor.get()), - comparator(options.comparator), - merge_operator(options.merge_operator.get()), - compaction_filter(options.compaction_filter), - compaction_filter_factory(options.compaction_filter_factory.get()), - compaction_filter_factory_v2(options.compaction_filter_factory_v2.get()), - info_log(options.info_log.get()), - statistics(options.statistics.get()), - env(options.env), - allow_mmap_reads(options.allow_mmap_reads), - allow_mmap_writes(options.allow_mmap_writes), - db_paths(options.db_paths), - memtable_factory(options.memtable_factory.get()), - table_factory(options.table_factory.get()), - table_properties_collector_factories( - options.table_properties_collector_factories), - advise_random_on_open(options.advise_random_on_open), - bloom_locality(options.bloom_locality), - purge_redundant_kvs_while_flush(options.purge_redundant_kvs_while_flush), - min_partial_merge_operands(options.min_partial_merge_operands), - disable_data_sync(options.disableDataSync), - use_fsync(options.use_fsync), - compression(options.compression), - compression_per_level(options.compression_per_level), - compression_opts(options.compression_opts), - access_hint_on_compaction_start(options.access_hint_on_compaction_start), - num_levels(options.num_levels) {} - ColumnFamilyOptions::ColumnFamilyOptions() : comparator(BytewiseComparator()), merge_operator(nullptr), @@ -215,6 +179,7 @@ DBOptions::DBOptions() advise_random_on_open(true), access_hint_on_compaction_start(NORMAL), use_adaptive_mutex(false), + 
allow_thread_local(true), bytes_per_sync(0) {} DBOptions::DBOptions(const Options& options) @@ -257,6 +222,7 @@ DBOptions::DBOptions(const Options& options) advise_random_on_open(options.advise_random_on_open), access_hint_on_compaction_start(options.access_hint_on_compaction_start), use_adaptive_mutex(options.use_adaptive_mutex), + allow_thread_local(options.allow_thread_local), bytes_per_sync(options.bytes_per_sync) {} static const char* const access_hints[] = { @@ -274,8 +240,8 @@ void DBOptions::Dump(Logger* log) const { Log(log, " Options.disableDataSync: %d", disableDataSync); Log(log, " Options.use_fsync: %d", use_fsync); Log(log, " Options.max_log_file_size: %zu", max_log_file_size); - Log(log, "Options.max_manifest_file_size: %" PRIu64, - max_manifest_file_size); + Log(log, "Options.max_manifest_file_size: %lu", + (unsigned long)max_manifest_file_size); Log(log, " Options.log_file_time_to_roll: %zu", log_file_time_to_roll); Log(log, " Options.keep_log_file_num: %zu", keep_log_file_num); Log(log, " Options.allow_os_buffer: %d", allow_os_buffer); @@ -291,16 +257,16 @@ void DBOptions::Dump(Logger* log) const { table_cache_numshardbits); Log(log, " Options.table_cache_remove_scan_count_limit: %d", table_cache_remove_scan_count_limit); - Log(log, " Options.delete_obsolete_files_period_micros: %" PRIu64, - delete_obsolete_files_period_micros); + Log(log, " Options.delete_obsolete_files_period_micros: %lu", + (unsigned long)delete_obsolete_files_period_micros); Log(log, " Options.max_background_compactions: %d", max_background_compactions); Log(log, " Options.max_background_flushes: %d", max_background_flushes); - Log(log, " Options.WAL_ttl_seconds: %" PRIu64, - WAL_ttl_seconds); - Log(log, " Options.WAL_size_limit_MB: %" PRIu64, - WAL_size_limit_MB); + Log(log, " Options.WAL_ttl_seconds: %lu", + (unsigned long)WAL_ttl_seconds); + Log(log, " Options.WAL_size_limit_MB: %lu", + (unsigned long)WAL_size_limit_MB); Log(log, " Options.manifest_preallocation_size: %zu", 
manifest_preallocation_size); Log(log, " Options.allow_os_buffer: %d", @@ -323,8 +289,8 @@ void DBOptions::Dump(Logger* log) const { use_adaptive_mutex); Log(log, " Options.rate_limiter: %p", rate_limiter.get()); - Log(log, " Options.bytes_per_sync: %" PRIu64, - bytes_per_sync); + Log(log, " Options.bytes_per_sync: %lu", + (unsigned long)bytes_per_sync); } // DBOptions::Dump void ColumnFamilyOptions::Dump(Logger* log) const { @@ -372,20 +338,20 @@ void ColumnFamilyOptions::Dump(Logger* log) const { level0_stop_writes_trigger); Log(log," Options.max_mem_compaction_level: %d", max_mem_compaction_level); - Log(log," Options.target_file_size_base: %" PRIu64, + Log(log," Options.target_file_size_base: %d", target_file_size_base); Log(log," Options.target_file_size_multiplier: %d", target_file_size_multiplier); - Log(log," Options.max_bytes_for_level_base: %" PRIu64, - max_bytes_for_level_base); + Log(log," Options.max_bytes_for_level_base: %lu", + (unsigned long)max_bytes_for_level_base); Log(log," Options.max_bytes_for_level_multiplier: %d", max_bytes_for_level_multiplier); for (int i = 0; i < num_levels; i++) { Log(log,"Options.max_bytes_for_level_multiplier_addtl[%d]: %d", i, max_bytes_for_level_multiplier_additional[i]); } - Log(log," Options.max_sequential_skip_in_iterations: %" PRIu64, - max_sequential_skip_in_iterations); + Log(log," Options.max_sequential_skip_in_iterations: %lu", + (unsigned long)max_sequential_skip_in_iterations); Log(log," Options.expanded_compaction_factor: %d", expanded_compaction_factor); Log(log," Options.source_compaction_factor: %d", @@ -420,7 +386,7 @@ void ColumnFamilyOptions::Dump(Logger* log) const { "max_size_amplification_percent: %u", compaction_options_universal.max_size_amplification_percent); Log(log, - "Options.compaction_options_universal.compression_size_percent: %d", + "Options.compaction_options_universal.compression_size_percent: %u", compaction_options_universal.compression_size_percent); Log(log, 
"Options.compaction_options_fifo.max_table_files_size: %" PRIu64, compaction_options_fifo.max_table_files_size); diff --git a/util/options_helper.cc b/util/options_helper.cc deleted file mode 100644 index 2a61c8b69f..0000000000 --- a/util/options_helper.cc +++ /dev/null @@ -1,328 +0,0 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. - -#include -#include -#include "rocksdb/options.h" -#include "util/options_helper.h" - -namespace rocksdb { - -namespace { -CompressionType ParseCompressionType(const std::string& type) { - if (type == "kNoCompression") { - return kNoCompression; - } else if (type == "kSnappyCompression") { - return kSnappyCompression; - } else if (type == "kZlibCompression") { - return kZlibCompression; - } else if (type == "kBZip2Compression") { - return kBZip2Compression; - } else if (type == "kLZ4Compression") { - return kLZ4Compression; - } else if (type == "kLZ4HCCompression") { - return kLZ4HCCompression; - } else { - throw "unknown compression type: " + type; - } - return kNoCompression; -} - -bool ParseBoolean(const std::string& type, const std::string& value) { - if (value == "true" || value == "1") { - return true; - } else if (value == "false" || value == "0") { - return false; - } else { - throw type; - } -} -uint32_t ParseInt(const std::string& value) { - return std::stoi(value); -} - -uint32_t ParseUint32(const std::string& value) { - return std::stoul(value); -} - -uint64_t ParseUint64(const std::string& value) { - return std::stoull(value); -} - -int64_t ParseInt64(const std::string& value) { - return std::stol(value); -} - -double ParseDouble(const std::string& value) { - return std::stod(value); -} - -CompactionStyle ParseCompactionStyle(const std::string& type) { - if (type == 
"kCompactionStyleLevel") { - return kCompactionStyleLevel; - } else if (type == "kCompactionStyleUniversal") { - return kCompactionStyleUniversal; - } else if (type == "kCompactionStyleFIFO") { - return kCompactionStyleFIFO; - } else { - throw "unknown compaction style: " + type; - } - return kCompactionStyleLevel; -} -} // anonymouse namespace - -template -bool ParseMemtableOptions(const std::string& name, const std::string& value, - OptionsType* new_options) { - if (name == "write_buffer_size") { - new_options->write_buffer_size = ParseInt64(value); - } else if (name == "arena_block_size") { - new_options->arena_block_size = ParseInt64(value); - } else if (name == "memtable_prefix_bloom_bits") { - new_options->memtable_prefix_bloom_bits = stoul(value); - } else if (name == "memtable_prefix_bloom_probes") { - new_options->memtable_prefix_bloom_probes = stoul(value); - } else if (name == "memtable_prefix_bloom_huge_page_tlb_size") { - new_options->memtable_prefix_bloom_huge_page_tlb_size = - ParseInt64(value); - } else if (name == "max_successive_merges") { - new_options->max_successive_merges = ParseInt64(value); - } else if (name == "filter_deletes") { - new_options->filter_deletes = ParseBoolean(name, value); - } else { - return false; - } - return true; -} - -template -bool ParseCompactionOptions(const std::string& name, const std::string& value, - OptionsType* new_options) { - if (name == "level0_file_num_compaction_trigger") { - new_options->level0_file_num_compaction_trigger = ParseInt(value); - } else if (name == "level0_slowdown_writes_trigger") { - new_options->level0_slowdown_writes_trigger = ParseInt(value); - } else if (name == "level0_stop_writes_trigger") { - new_options->level0_stop_writes_trigger = ParseInt(value); - } else if (name == "max_grandparent_overlap_factor") { - new_options->max_grandparent_overlap_factor = ParseInt(value); - } else if (name == "expanded_compaction_factor") { - new_options->expanded_compaction_factor = ParseInt(value); - 
} else if (name == "source_compaction_factor") { - new_options->source_compaction_factor = ParseInt(value); - } else if (name == "target_file_size_base") { - new_options->target_file_size_base = ParseInt(value); - } else if (name == "target_file_size_multiplier") { - new_options->target_file_size_multiplier = ParseInt(value); - } else if (name == "max_bytes_for_level_base") { - new_options->max_bytes_for_level_base = ParseUint64(value); - } else if (name == "max_bytes_for_level_multiplier") { - new_options->max_bytes_for_level_multiplier = ParseInt(value); - } else if (name == "max_bytes_for_level_multiplier_additional") { - new_options->max_bytes_for_level_multiplier_additional.clear(); - size_t start = 0; - while (true) { - size_t end = value.find_first_of(':', start); - if (end == std::string::npos) { - new_options->max_bytes_for_level_multiplier_additional.push_back( - ParseInt(value.substr(start))); - break; - } else { - new_options->max_bytes_for_level_multiplier_additional.push_back( - ParseInt(value.substr(start, end - start))); - start = end + 1; - } - } - } else { - return false; - } - return true; -} - -bool GetMutableOptionsFromStrings( - const MutableCFOptions& base_options, - const std::unordered_map& options_map, - MutableCFOptions* new_options) { - assert(new_options); - *new_options = base_options; - try { - for (const auto& o : options_map) { - if (ParseMemtableOptions(o.first, o.second, new_options)) { - } else if (ParseCompactionOptions(o.first, o.second, new_options)) { - } else { - return false; - } - } - } catch (std::exception) { - return false; - } - return true; -} - -bool GetOptionsFromStrings( - const Options& base_options, - const std::unordered_map& options_map, - Options* new_options) { - assert(new_options); - *new_options = base_options; - for (const auto& o : options_map) { - try { - if (ParseMemtableOptions(o.first, o.second, new_options)) { - } else if (ParseCompactionOptions(o.first, o.second, new_options)) { - } else if 
(o.first == "max_write_buffer_number") { - new_options->max_write_buffer_number = ParseInt(o.second); - } else if (o.first == "min_write_buffer_number_to_merge") { - new_options->min_write_buffer_number_to_merge = ParseInt(o.second); - } else if (o.first == "compression") { - new_options->compression = ParseCompressionType(o.second); - } else if (o.first == "compression_per_level") { - new_options->compression_per_level.clear(); - size_t start = 0; - while (true) { - size_t end = o.second.find_first_of(':', start); - if (end == std::string::npos) { - new_options->compression_per_level.push_back( - ParseCompressionType(o.second.substr(start))); - break; - } else { - new_options->compression_per_level.push_back( - ParseCompressionType(o.second.substr(start, end - start))); - start = end + 1; - } - } - } else if (o.first == "compression_opts") { - size_t start = 0; - size_t end = o.second.find_first_of(':'); - if (end == std::string::npos) { - throw o.first; - } - new_options->compression_opts.window_bits = - ParseInt(o.second.substr(start, end - start)); - start = end + 1; - end = o.second.find_first_of(':', start); - if (end == std::string::npos) { - throw o.first; - } - new_options->compression_opts.level = - ParseInt(o.second.substr(start, end - start)); - start = end + 1; - if (start >= o.second.size()) { - throw o.first; - } - new_options->compression_opts.strategy = - ParseInt(o.second.substr(start, o.second.size() - start)); - } else if (o.first == "num_levels") { - new_options->num_levels = ParseInt(o.second); - } else if (o.first == "max_mem_compaction_level") { - new_options->max_mem_compaction_level = ParseInt(o.second); - } else if (o.first == "soft_rate_limit") { - new_options->soft_rate_limit = ParseDouble(o.second); - } else if (o.first == "hard_rate_limit") { - new_options->hard_rate_limit = ParseDouble(o.second); - } else if (o.first == "disable_auto_compactions") { - new_options->disable_auto_compactions = ParseBoolean(o.first, o.second); - } else 
if (o.first == "purge_redundant_kvs_while_flush") { - new_options->purge_redundant_kvs_while_flush = - ParseBoolean(o.first, o.second); - } else if (o.first == "compaction_style") { - new_options->compaction_style = ParseCompactionStyle(o.second); - } else if (o.first == "verify_checksums_in_compaction") { - new_options->verify_checksums_in_compaction = - ParseBoolean(o.first, o.second); - } else if (o.first == "compaction_options_universal") { - // TODO(ljin): add support - throw o.first; - } else if (o.first == "compaction_options_fifo") { - new_options->compaction_options_fifo.max_table_files_size - = ParseUint64(o.second); - } else if (o.first == "max_sequential_skip_in_iterations") { - new_options->max_sequential_skip_in_iterations = ParseUint64(o.second); - } else if (o.first == "inplace_update_support") { - new_options->inplace_update_support = ParseBoolean(o.first, o.second); - } else if (o.first == "inplace_update_num_locks") { - new_options->inplace_update_num_locks = ParseInt64(o.second); - } else if (o.first == "bloom_locality") { - new_options->bloom_locality = ParseUint32(o.second); - } else if (o.first == "min_partial_merge_operands") { - new_options->min_partial_merge_operands = ParseUint32(o.second); - } else if (o.first == "create_if_missing") { - new_options->create_if_missing = ParseBoolean(o.first, o.second); - } else if (o.first == "create_missing_column_families") { - new_options->create_missing_column_families = - ParseBoolean(o.first, o.second); - } else if (o.first == "error_if_exists") { - new_options->error_if_exists = ParseBoolean(o.first, o.second); - } else if (o.first == "paranoid_checks") { - new_options->paranoid_checks = ParseBoolean(o.first, o.second); - } else if (o.first == "max_open_files") { - new_options->max_open_files = ParseInt(o.second); - } else if (o.first == "max_total_wal_size") { - new_options->max_total_wal_size = ParseUint64(o.second); - } else if (o.first == "disable_data_sync") { - new_options->disableDataSync = 
ParseBoolean(o.first, o.second); - } else if (o.first == "use_fsync") { - new_options->use_fsync = ParseBoolean(o.first, o.second); - } else if (o.first == "db_paths") { - // TODO(ljin): add support - throw o.first; - } else if (o.first == "db_log_dir") { - new_options->db_log_dir = o.second; - } else if (o.first == "wal_dir") { - new_options->wal_dir = o.second; - } else if (o.first == "delete_obsolete_files_period_micros") { - new_options->delete_obsolete_files_period_micros = - ParseUint64(o.second); - } else if (o.first == "max_background_compactions") { - new_options->max_background_compactions = ParseInt(o.second); - } else if (o.first == "max_background_flushes") { - new_options->max_background_flushes = ParseInt(o.second); - } else if (o.first == "max_log_file_size") { - new_options->max_log_file_size = ParseInt64(o.second); - } else if (o.first == "log_file_time_to_roll") { - new_options->log_file_time_to_roll = ParseInt64(o.second); - } else if (o.first == "keep_log_file_num") { - new_options->keep_log_file_num = ParseInt64(o.second); - } else if (o.first == "max_manifest_file_size") { - new_options->max_manifest_file_size = ParseUint64(o.second); - } else if (o.first == "table_cache_numshardbits") { - new_options->table_cache_numshardbits = ParseInt(o.second); - } else if (o.first == "table_cache_remove_scan_count_limit") { - new_options->table_cache_remove_scan_count_limit = ParseInt(o.second); - } else if (o.first == "WAL_ttl_seconds") { - new_options->WAL_ttl_seconds = ParseUint64(o.second); - } else if (o.first == "WAL_size_limit_MB") { - new_options->WAL_size_limit_MB = ParseUint64(o.second); - } else if (o.first == "manifest_preallocation_size") { - new_options->manifest_preallocation_size = ParseInt64(o.second); - } else if (o.first == "allow_os_buffer") { - new_options->allow_os_buffer = ParseBoolean(o.first, o.second); - } else if (o.first == "allow_mmap_reads") { - new_options->allow_mmap_reads = ParseBoolean(o.first, o.second); - } else if 
(o.first == "allow_mmap_writes") { - new_options->allow_mmap_writes = ParseBoolean(o.first, o.second); - } else if (o.first == "is_fd_close_on_exec") { - new_options->is_fd_close_on_exec = ParseBoolean(o.first, o.second); - } else if (o.first == "skip_log_error_on_recovery") { - new_options->skip_log_error_on_recovery = - ParseBoolean(o.first, o.second); - } else if (o.first == "stats_dump_period_sec") { - new_options->stats_dump_period_sec = ParseUint32(o.second); - } else if (o.first == "advise_random_on_open") { - new_options->advise_random_on_open = ParseBoolean(o.first, o.second); - } else if (o.first == "use_adaptive_mutex") { - new_options->use_adaptive_mutex = ParseBoolean(o.first, o.second); - } else if (o.first == "bytes_per_sync") { - new_options->bytes_per_sync = ParseUint64(o.second); - } else { - return false; - } - } catch (std::exception) { - return false; - } - } - return true; -} - -} // namespace rocksdb diff --git a/util/options_helper.h b/util/options_helper.h deleted file mode 100644 index c04d2a5d7c..0000000000 --- a/util/options_helper.h +++ /dev/null @@ -1,18 +0,0 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. - -#pragma once - -#include -#include "util/mutable_cf_options.h" - -namespace rocksdb { - -bool GetMutableOptionsFromStrings( - const MutableCFOptions& base_options, - const std::unordered_map& options_map, - MutableCFOptions* new_options); - -} // namespace rocksdb diff --git a/util/options_test.cc b/util/options_test.cc index 1e26c343d4..be07a83f52 100644 --- a/util/options_test.cc +++ b/util/options_test.cc @@ -7,11 +7,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
-#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS -#endif - -#include #include #include @@ -76,178 +72,6 @@ TEST(OptionsTest, LooseCondition) { // Both tight amplifications PrintAndGetOptions(128 * 1024 * 1024, 4, 8); } - -TEST(OptionsTest, GetOptionsFromStringsTest) { - std::unordered_map options_map = { - {"write_buffer_size", "1"}, - {"max_write_buffer_number", "2"}, - {"min_write_buffer_number_to_merge", "3"}, - {"compression", "kSnappyCompression"}, - {"compression_per_level", "kNoCompression:" - "kSnappyCompression:" - "kZlibCompression:" - "kBZip2Compression:" - "kLZ4Compression:" - "kLZ4HCCompression"}, - {"compression_opts", "4:5:6"}, - {"num_levels", "7"}, - {"level0_file_num_compaction_trigger", "8"}, - {"level0_slowdown_writes_trigger", "9"}, - {"level0_stop_writes_trigger", "10"}, - {"max_mem_compaction_level", "11"}, - {"target_file_size_base", "12"}, - {"target_file_size_multiplier", "13"}, - {"max_bytes_for_level_base", "14"}, - {"max_bytes_for_level_multiplier", "15"}, - {"max_bytes_for_level_multiplier_additional", "16:17:18"}, - {"expanded_compaction_factor", "19"}, - {"source_compaction_factor", "20"}, - {"max_grandparent_overlap_factor", "21"}, - {"soft_rate_limit", "1.1"}, - {"hard_rate_limit", "2.1"}, - {"arena_block_size", "22"}, - {"disable_auto_compactions", "true"}, - {"purge_redundant_kvs_while_flush", "1"}, - {"compaction_style", "kCompactionStyleLevel"}, - {"verify_checksums_in_compaction", "false"}, - {"compaction_options_fifo", "23"}, - {"filter_deletes", "0"}, - {"max_sequential_skip_in_iterations", "24"}, - {"inplace_update_support", "true"}, - {"inplace_update_num_locks", "25"}, - {"memtable_prefix_bloom_bits", "26"}, - {"memtable_prefix_bloom_probes", "27"}, - {"memtable_prefix_bloom_huge_page_tlb_size", "28"}, - {"bloom_locality", "29"}, - {"max_successive_merges", "30"}, - {"min_partial_merge_operands", "31"}, - {"create_if_missing", "false"}, - {"create_missing_column_families", "true"}, - {"error_if_exists", 
"false"}, - {"paranoid_checks", "true"}, - {"max_open_files", "32"}, - {"max_total_wal_size", "33"}, - {"disable_data_sync", "false"}, - {"use_fsync", "true"}, - {"db_log_dir", "/db_log_dir"}, - {"wal_dir", "/wal_dir"}, - {"delete_obsolete_files_period_micros", "34"}, - {"max_background_compactions", "35"}, - {"max_background_flushes", "36"}, - {"max_log_file_size", "37"}, - {"log_file_time_to_roll", "38"}, - {"keep_log_file_num", "39"}, - {"max_manifest_file_size", "40"}, - {"table_cache_numshardbits", "41"}, - {"table_cache_remove_scan_count_limit", "42"}, - {"WAL_ttl_seconds", "43"}, - {"WAL_size_limit_MB", "44"}, - {"manifest_preallocation_size", "45"}, - {"allow_os_buffer", "false"}, - {"allow_mmap_reads", "true"}, - {"allow_mmap_writes", "false"}, - {"is_fd_close_on_exec", "true"}, - {"skip_log_error_on_recovery", "false"}, - {"stats_dump_period_sec", "46"}, - {"advise_random_on_open", "true"}, - {"use_adaptive_mutex", "false"}, - {"bytes_per_sync", "47"}, - }; - - Options base_opt; - Options new_opt; - ASSERT_TRUE(GetOptionsFromStrings(base_opt, options_map, &new_opt)); - ASSERT_EQ(new_opt.write_buffer_size, 1U); - ASSERT_EQ(new_opt.max_write_buffer_number, 2); - ASSERT_EQ(new_opt.min_write_buffer_number_to_merge, 3); - ASSERT_EQ(new_opt.compression, kSnappyCompression); - ASSERT_EQ(new_opt.compression_per_level.size(), 6U); - ASSERT_EQ(new_opt.compression_per_level[0], kNoCompression); - ASSERT_EQ(new_opt.compression_per_level[1], kSnappyCompression); - ASSERT_EQ(new_opt.compression_per_level[2], kZlibCompression); - ASSERT_EQ(new_opt.compression_per_level[3], kBZip2Compression); - ASSERT_EQ(new_opt.compression_per_level[4], kLZ4Compression); - ASSERT_EQ(new_opt.compression_per_level[5], kLZ4HCCompression); - ASSERT_EQ(new_opt.compression_opts.window_bits, 4); - ASSERT_EQ(new_opt.compression_opts.level, 5); - ASSERT_EQ(new_opt.compression_opts.strategy, 6); - ASSERT_EQ(new_opt.num_levels, 7); - ASSERT_EQ(new_opt.level0_file_num_compaction_trigger, 8); - 
ASSERT_EQ(new_opt.level0_slowdown_writes_trigger, 9); - ASSERT_EQ(new_opt.level0_stop_writes_trigger, 10); - ASSERT_EQ(new_opt.max_mem_compaction_level, 11); - ASSERT_EQ(new_opt.target_file_size_base, static_cast(12)); - ASSERT_EQ(new_opt.target_file_size_multiplier, 13); - ASSERT_EQ(new_opt.max_bytes_for_level_base, 14U); - ASSERT_EQ(new_opt.max_bytes_for_level_multiplier, 15); - ASSERT_EQ(new_opt.max_bytes_for_level_multiplier_additional.size(), 3U); - ASSERT_EQ(new_opt.max_bytes_for_level_multiplier_additional[0], 16); - ASSERT_EQ(new_opt.max_bytes_for_level_multiplier_additional[1], 17); - ASSERT_EQ(new_opt.max_bytes_for_level_multiplier_additional[2], 18); - ASSERT_EQ(new_opt.expanded_compaction_factor, 19); - ASSERT_EQ(new_opt.source_compaction_factor, 20); - ASSERT_EQ(new_opt.max_grandparent_overlap_factor, 21); - ASSERT_EQ(new_opt.soft_rate_limit, 1.1); - ASSERT_EQ(new_opt.hard_rate_limit, 2.1); - ASSERT_EQ(new_opt.arena_block_size, 22U); - ASSERT_EQ(new_opt.disable_auto_compactions, true); - ASSERT_EQ(new_opt.purge_redundant_kvs_while_flush, true); - ASSERT_EQ(new_opt.compaction_style, kCompactionStyleLevel); - ASSERT_EQ(new_opt.verify_checksums_in_compaction, false); - ASSERT_EQ(new_opt.compaction_options_fifo.max_table_files_size, - static_cast(23)); - ASSERT_EQ(new_opt.filter_deletes, false); - ASSERT_EQ(new_opt.max_sequential_skip_in_iterations, - static_cast(24)); - ASSERT_EQ(new_opt.inplace_update_support, true); - ASSERT_EQ(new_opt.inplace_update_num_locks, 25U); - ASSERT_EQ(new_opt.memtable_prefix_bloom_bits, 26U); - ASSERT_EQ(new_opt.memtable_prefix_bloom_probes, 27U); - ASSERT_EQ(new_opt.memtable_prefix_bloom_huge_page_tlb_size, 28U); - ASSERT_EQ(new_opt.bloom_locality, 29U); - ASSERT_EQ(new_opt.max_successive_merges, 30U); - ASSERT_EQ(new_opt.min_partial_merge_operands, 31U); - ASSERT_EQ(new_opt.create_if_missing, false); - ASSERT_EQ(new_opt.create_missing_column_families, true); - ASSERT_EQ(new_opt.error_if_exists, false); - 
ASSERT_EQ(new_opt.paranoid_checks, true); - ASSERT_EQ(new_opt.max_open_files, 32); - ASSERT_EQ(new_opt.max_total_wal_size, static_cast(33)); - ASSERT_EQ(new_opt.disableDataSync, false); - ASSERT_EQ(new_opt.use_fsync, true); - ASSERT_EQ(new_opt.db_log_dir, "/db_log_dir"); - ASSERT_EQ(new_opt.wal_dir, "/wal_dir"); - ASSERT_EQ(new_opt.delete_obsolete_files_period_micros, - static_cast(34)); - ASSERT_EQ(new_opt.max_background_compactions, 35); - ASSERT_EQ(new_opt.max_background_flushes, 36); - ASSERT_EQ(new_opt.max_log_file_size, 37U); - ASSERT_EQ(new_opt.log_file_time_to_roll, 38U); - ASSERT_EQ(new_opt.keep_log_file_num, 39U); - ASSERT_EQ(new_opt.max_manifest_file_size, static_cast(40)); - ASSERT_EQ(new_opt.table_cache_numshardbits, 41); - ASSERT_EQ(new_opt.table_cache_remove_scan_count_limit, 42); - ASSERT_EQ(new_opt.WAL_ttl_seconds, static_cast(43)); - ASSERT_EQ(new_opt.WAL_size_limit_MB, static_cast(44)); - ASSERT_EQ(new_opt.manifest_preallocation_size, 45U); - ASSERT_EQ(new_opt.allow_os_buffer, false); - ASSERT_EQ(new_opt.allow_mmap_reads, true); - ASSERT_EQ(new_opt.allow_mmap_writes, false); - ASSERT_EQ(new_opt.is_fd_close_on_exec, true); - ASSERT_EQ(new_opt.skip_log_error_on_recovery, false); - ASSERT_EQ(new_opt.stats_dump_period_sec, 46U); - ASSERT_EQ(new_opt.advise_random_on_open, true); - ASSERT_EQ(new_opt.use_adaptive_mutex, false); - ASSERT_EQ(new_opt.bytes_per_sync, static_cast(47)); - - options_map["write_buffer_size"] = "hello"; - ASSERT_TRUE(!GetOptionsFromStrings(base_opt, options_map, &new_opt)); - options_map["write_buffer_size"] = "1"; - ASSERT_TRUE(GetOptionsFromStrings(base_opt, options_map, &new_opt)); - options_map["unknown_option"] = "1"; - ASSERT_TRUE(!GetOptionsFromStrings(base_opt, options_map, &new_opt)); -} - } // namespace rocksdb int main(int argc, char** argv) { diff --git a/util/rate_limiter.cc b/util/rate_limiter.cc index 47f96de84b..cde86f3c9e 100644 --- a/util/rate_limiter.cc +++ b/util/rate_limiter.cc @@ -60,7 +60,7 @@ 
GenericRateLimiter::~GenericRateLimiter() { } void GenericRateLimiter::Request(int64_t bytes, const Env::IOPriority pri) { - assert(bytes <= refill_bytes_per_period_); + assert(bytes < refill_bytes_per_period_); MutexLock g(&request_mutex_); if (stop_) { diff --git a/util/rate_limiter_test.cc b/util/rate_limiter_test.cc index 9d6cfb7e6a..1b72e4ed0b 100644 --- a/util/rate_limiter_test.cc +++ b/util/rate_limiter_test.cc @@ -7,10 +7,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS -#endif - #include #include #include "util/testharness.h" diff --git a/util/scoped_arena_iterator.h b/util/scoped_arena_iterator.h deleted file mode 100644 index 2021d2dc22..0000000000 --- a/util/scoped_arena_iterator.h +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. 
-#pragma once - -#include "rocksdb/iterator.h" - -namespace rocksdb { -class ScopedArenaIterator { - public: - explicit ScopedArenaIterator(Iterator* iter = nullptr) : iter_(iter) {} - - Iterator* operator->() { return iter_; } - - void set(Iterator* iter) { iter_ = iter; } - - Iterator* get() { return iter_; } - - ~ScopedArenaIterator() { iter_->~Iterator(); } - - private: - Iterator* iter_; -}; -} // namespace rocksdb diff --git a/util/signal_test.cc b/util/signal_test.cc index b23ad6a98b..f51fa548ef 100644 --- a/util/signal_test.cc +++ b/util/signal_test.cc @@ -9,7 +9,6 @@ namespace { void f0() { char *p = nullptr; - // cppcheck-suppress nullPointer *p = 10; /* SIGSEGV here!! */ } diff --git a/util/statistics.cc b/util/statistics.cc index 9d828a6feb..24957c9b6f 100644 --- a/util/statistics.cc +++ b/util/statistics.cc @@ -5,10 +5,7 @@ // #include "util/statistics.h" -#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS -#endif - #include #include "rocksdb/statistics.h" #include "port/likely.h" diff --git a/util/testutil.cc b/util/testutil.cc index 20f22c2dc4..363b8ff191 100644 --- a/util/testutil.cc +++ b/util/testutil.cc @@ -23,15 +23,6 @@ Slice RandomString(Random* rnd, int len, std::string* dst) { return Slice(*dst); } -extern std::string RandomHumanReadableString(Random* rnd, int len) { - std::string ret; - ret.resize(len); - for (int i = 0; i < len; ++i) { - ret[i] = static_cast('a' + rnd->Uniform(26)); - } - return ret; -} - std::string RandomKey(Random* rnd, int len) { // Make sure to generate a wide variety of characters so we // test the boundary conditions for short-key optimizations. diff --git a/util/testutil.h b/util/testutil.h index eff0d7e7d8..c615fc1e7a 100644 --- a/util/testutil.h +++ b/util/testutil.h @@ -21,8 +21,6 @@ namespace test { // references the generated data. 
extern Slice RandomString(Random* rnd, int len, std::string* dst); -extern std::string RandomHumanReadableString(Random* rnd, int len); - // Return a random key with the specified length that may contain interesting // characters (e.g. \x00, \xff, etc.). extern std::string RandomKey(Random* rnd, int len); diff --git a/utilities/backupable/backupable_db.cc b/utilities/backupable/backupable_db.cc index 20ec9db85e..436f4c2d68 100644 --- a/utilities/backupable/backupable_db.cc +++ b/utilities/backupable/backupable_db.cc @@ -15,9 +15,7 @@ #include "util/crc32c.h" #include "rocksdb/transaction_log.h" -#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS -#endif #include #include @@ -72,27 +70,6 @@ class BackupRateLimiter { }; } // namespace -void BackupStatistics::IncrementNumberSuccessBackup() { - number_success_backup++; -} -void BackupStatistics::IncrementNumberFailBackup() { - number_fail_backup++; -} - -uint32_t BackupStatistics::GetNumberSuccessBackup() const { - return number_success_backup; -} -uint32_t BackupStatistics::GetNumberFailBackup() const { - return number_fail_backup; -} - -std::string BackupStatistics::ToString() const { - char result[50]; - snprintf(result, sizeof(result), "# success backup: %u, # fail backup: %u", - GetNumberSuccessBackup(), GetNumberFailBackup()); - return result; -} - void BackupableDBOptions::Dump(Logger* logger) const { Log(logger, " Options.backup_dir: %s", backup_dir.c_str()); Log(logger, " Options.backup_env: %p", backup_env); @@ -165,9 +142,6 @@ class BackupEngineImpl : public BackupEngine { uint64_t GetSize() const { return size_; } - uint32_t GetNumberFiles() { - return files_.size(); - } void SetSequenceNumber(uint64_t sequence_number) { sequence_number_ = sequence_number; } @@ -312,7 +286,6 @@ class BackupEngineImpl : public BackupEngine { static const size_t kDefaultCopyFileBufferSize = 5 * 1024 * 1024LL; // 5MB size_t copy_file_buffer_size_; bool read_only_; - BackupStatistics backup_statistics_; }; BackupEngine* 
BackupEngine::NewBackupEngine( @@ -468,8 +441,6 @@ Status BackupEngineImpl::CreateNewBackup(DB* db, bool flush_before_backup) { new_backup.RecordTimestamp(); new_backup.SetSequenceNumber(sequence_number); - auto start_backup = backup_env_-> NowMicros(); - Log(options_.info_log, "Started the backup process -- creating backup %u", new_backup_id); @@ -534,8 +505,6 @@ Status BackupEngineImpl::CreateNewBackup(DB* db, bool flush_before_backup) { GetAbsolutePath(GetPrivateFileRel(new_backup_id, false))); } - auto backup_time = backup_env_->NowMicros() - start_backup; - if (s.ok()) { // persist the backup metadata on the disk s = new_backup.StoreToFile(options_.sync); @@ -566,15 +535,9 @@ Status BackupEngineImpl::CreateNewBackup(DB* db, bool flush_before_backup) { } } - if (s.ok()) { - backup_statistics_.IncrementNumberSuccessBackup(); - } if (!s.ok()) { - backup_statistics_.IncrementNumberFailBackup(); // clean all the files we might have created Log(options_.info_log, "Backup failed -- %s", s.ToString().c_str()); - Log(options_.info_log, "Backup Statistics %s\n", - backup_statistics_.ToString().c_str()); backups_.erase(new_backup_id); GarbageCollection(true); return s; @@ -584,17 +547,6 @@ Status BackupEngineImpl::CreateNewBackup(DB* db, bool flush_before_backup) { // in the LATEST_BACKUP file latest_backup_id_ = new_backup_id; Log(options_.info_log, "Backup DONE. 
All is good"); - - // backup_speed is in byte/second - double backup_speed = new_backup.GetSize() / (1.048576 * backup_time); - Log(options_.info_log, "Backup number of files: %u", - new_backup.GetNumberFiles()); - Log(options_.info_log, "Backup size: %" PRIu64 " bytes", - new_backup.GetSize()); - Log(options_.info_log, "Backup time: %" PRIu64 " microseconds", backup_time); - Log(options_.info_log, "Backup speed: %.3f MB/s", backup_speed); - Log(options_.info_log, "Backup Statistics %s", - backup_statistics_.ToString().c_str()); return s; } @@ -630,9 +582,8 @@ void BackupEngineImpl::GetBackupInfo(std::vector* backup_info) { backup_info->reserve(backups_.size()); for (auto& backup : backups_) { if (!backup.second.Empty()) { - backup_info->push_back(BackupInfo( - backup.first, backup.second.GetTimestamp(), backup.second.GetSize(), - backup.second.GetNumberFiles())); + backup_info->push_back(BackupInfo( + backup.first, backup.second.GetTimestamp(), backup.second.GetSize())); } } } diff --git a/utilities/backupable/backupable_db_test.cc b/utilities/backupable/backupable_db_test.cc index a585d1a9cf..1d876cd501 100644 --- a/utilities/backupable/backupable_db_test.cc +++ b/utilities/backupable/backupable_db_test.cc @@ -916,7 +916,7 @@ TEST(BackupableDBTest, RateLimiting) { auto backup_time = env_->NowMicros() - start_backup; auto rate_limited_backup_time = (bytes_written * kMicrosPerSec) / backupable_options_->backup_rate_limit; - ASSERT_GT(backup_time, 0.8 * rate_limited_backup_time); + ASSERT_GT(backup_time, 0.9 * rate_limited_backup_time); CloseBackupableDB(); @@ -927,7 +927,7 @@ TEST(BackupableDBTest, RateLimiting) { CloseRestoreDB(); auto rate_limited_restore_time = (bytes_written * kMicrosPerSec) / backupable_options_->restore_rate_limit; - ASSERT_GT(restore_time, 0.8 * rate_limited_restore_time); + ASSERT_GT(restore_time, 0.9 * rate_limited_restore_time); AssertBackupConsistency(0, 0, 100000, 100010); } diff --git a/utilities/compacted_db/compacted_db_impl.cc 
b/utilities/compacted_db/compacted_db_impl.cc deleted file mode 100644 index 775033e2a8..0000000000 --- a/utilities/compacted_db/compacted_db_impl.cc +++ /dev/null @@ -1,156 +0,0 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. - -#ifndef ROCKSDB_LITE -#include "utilities/compacted_db/compacted_db_impl.h" -#include "db/db_impl.h" -#include "db/version_set.h" -#include "table/get_context.h" - -namespace rocksdb { - -extern void MarkKeyMayExist(void* arg); -extern bool SaveValue(void* arg, const ParsedInternalKey& parsed_key, - const Slice& v, bool hit_and_return); - -CompactedDBImpl::CompactedDBImpl( - const DBOptions& options, const std::string& dbname) - : DBImpl(options, dbname) { -} - -CompactedDBImpl::~CompactedDBImpl() { -} - -size_t CompactedDBImpl::FindFile(const Slice& key) { - size_t left = 0; - size_t right = files_.num_files - 1; - while (left < right) { - size_t mid = (left + right) >> 1; - const FdWithKeyRange& f = files_.files[mid]; - if (user_comparator_->Compare(ExtractUserKey(f.largest_key), key) < 0) { - // Key at "mid.largest" is < "target". Therefore all - // files at or before "mid" are uninteresting. - left = mid + 1; - } else { - // Key at "mid.largest" is >= "target". Therefore all files - // after "mid" are uninteresting. 
- right = mid; - } - } - return right; -} - -Status CompactedDBImpl::Get(const ReadOptions& options, - ColumnFamilyHandle*, const Slice& key, std::string* value) { - GetContext get_context(user_comparator_, nullptr, nullptr, nullptr, - GetContext::kNotFound, key, value, nullptr, nullptr); - LookupKey lkey(key, kMaxSequenceNumber); - files_.files[FindFile(key)].fd.table_reader->Get( - options, lkey.internal_key(), &get_context); - if (get_context.State() == GetContext::kFound) { - return Status::OK(); - } - return Status::NotFound(); -} - -std::vector CompactedDBImpl::MultiGet(const ReadOptions& options, - const std::vector&, - const std::vector& keys, std::vector* values) { - autovector reader_list; - for (const auto& key : keys) { - const FdWithKeyRange& f = files_.files[FindFile(key)]; - if (user_comparator_->Compare(key, ExtractUserKey(f.smallest_key)) < 0) { - reader_list.push_back(nullptr); - } else { - LookupKey lkey(key, kMaxSequenceNumber); - f.fd.table_reader->Prepare(lkey.internal_key()); - reader_list.push_back(f.fd.table_reader); - } - } - std::vector statuses(keys.size(), Status::NotFound()); - values->resize(keys.size()); - int idx = 0; - for (auto* r : reader_list) { - if (r != nullptr) { - GetContext get_context(user_comparator_, nullptr, nullptr, nullptr, - GetContext::kNotFound, keys[idx], &(*values)[idx], - nullptr, nullptr); - LookupKey lkey(keys[idx], kMaxSequenceNumber); - r->Get(options, lkey.internal_key(), &get_context); - if (get_context.State() == GetContext::kFound) { - statuses[idx] = Status::OK(); - } - } - ++idx; - } - return statuses; -} - -Status CompactedDBImpl::Init(const Options& options) { - mutex_.Lock(); - ColumnFamilyDescriptor cf(kDefaultColumnFamilyName, - ColumnFamilyOptions(options)); - Status s = Recover({ cf }, true /* read only */, false); - if (s.ok()) { - cfd_ = reinterpret_cast( - DefaultColumnFamily())->cfd(); - delete cfd_->InstallSuperVersion(new SuperVersion(), &mutex_); - } - mutex_.Unlock(); - if (!s.ok()) { - 
return s; - } - version_ = cfd_->GetSuperVersion()->current; - user_comparator_ = cfd_->user_comparator(); - // L0 should not have files - if (version_->file_levels_[0].num_files > 1) { - return Status::NotSupported("L0 contain more than 1 file"); - } - if (version_->file_levels_[0].num_files == 1) { - if (version_->num_non_empty_levels_ > 1) { - return Status::NotSupported("Both L0 and other level contain files"); - } - files_ = version_->file_levels_[0]; - return Status::OK(); - } - - for (int i = 1; i < version_->num_non_empty_levels_ - 1; ++i) { - if (version_->file_levels_[i].num_files > 0) { - return Status::NotSupported("Other levels also contain files"); - } - } - - int level = version_->num_non_empty_levels_ - 1; - if (version_->file_levels_[level].num_files > 0) { - files_ = version_->file_levels_[version_->num_non_empty_levels_ - 1]; - return Status::OK(); - } - return Status::NotSupported("no file exists"); -} - -Status CompactedDBImpl::Open(const Options& options, - const std::string& dbname, DB** dbptr) { - *dbptr = nullptr; - - if (options.max_open_files != -1) { - return Status::InvalidArgument("require max_open_files = -1"); - } - if (options.merge_operator.get() != nullptr) { - return Status::InvalidArgument("merge operator is not supported"); - } - DBOptions db_options(options); - std::unique_ptr db(new CompactedDBImpl(db_options, dbname)); - Status s = db->Init(options); - if (s.ok()) { - Log(INFO_LEVEL, db->db_options_.info_log, - "Opened the db as fully compacted mode"); - LogFlush(db->db_options_.info_log); - *dbptr = db.release(); - } - return s; -} - -} // namespace rocksdb -#endif // ROCKSDB_LITE diff --git a/utilities/compacted_db/compacted_db_impl.h b/utilities/compacted_db/compacted_db_impl.h deleted file mode 100644 index 49aca53b1b..0000000000 --- a/utilities/compacted_db/compacted_db_impl.h +++ /dev/null @@ -1,96 +0,0 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. 
-// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. - -#pragma once -#ifndef ROCKSDB_LITE -#include "db/db_impl.h" -#include -#include - -namespace rocksdb { - -class CompactedDBImpl : public DBImpl { - public: - CompactedDBImpl(const DBOptions& options, const std::string& dbname); - virtual ~CompactedDBImpl(); - - static Status Open(const Options& options, const std::string& dbname, - DB** dbptr); - - // Implementations of the DB interface - using DB::Get; - virtual Status Get(const ReadOptions& options, - ColumnFamilyHandle* column_family, const Slice& key, - std::string* value) override; - using DB::MultiGet; - virtual std::vector MultiGet( - const ReadOptions& options, - const std::vector&, - const std::vector& keys, std::vector* values) - override; - - using DBImpl::Put; - virtual Status Put(const WriteOptions& options, - ColumnFamilyHandle* column_family, const Slice& key, - const Slice& value) override { - return Status::NotSupported("Not supported in compacted db mode."); - } - using DBImpl::Merge; - virtual Status Merge(const WriteOptions& options, - ColumnFamilyHandle* column_family, const Slice& key, - const Slice& value) override { - return Status::NotSupported("Not supported in compacted db mode."); - } - using DBImpl::Delete; - virtual Status Delete(const WriteOptions& options, - ColumnFamilyHandle* column_family, - const Slice& key) override { - return Status::NotSupported("Not supported in compacted db mode."); - } - virtual Status Write(const WriteOptions& options, - WriteBatch* updates) override { - return Status::NotSupported("Not supported in compacted db mode."); - } - using DBImpl::CompactRange; - virtual Status CompactRange(ColumnFamilyHandle* column_family, - const Slice* begin, const Slice* end, - bool reduce_level = false, int target_level = -1, - uint32_t 
target_path_id = 0) override { - return Status::NotSupported("Not supported in compacted db mode."); - } - - virtual Status DisableFileDeletions() override { - return Status::NotSupported("Not supported in compacted db mode."); - } - virtual Status EnableFileDeletions(bool force) override { - return Status::NotSupported("Not supported in compacted db mode."); - } - virtual Status GetLiveFiles(std::vector&, - uint64_t* manifest_file_size, - bool flush_memtable = true) override { - return Status::NotSupported("Not supported in compacted db mode."); - } - using DBImpl::Flush; - virtual Status Flush(const FlushOptions& options, - ColumnFamilyHandle* column_family) override { - return Status::NotSupported("Not supported in compacted db mode."); - } - - private: - friend class DB; - inline size_t FindFile(const Slice& key); - Status Init(const Options& options); - - ColumnFamilyData* cfd_; - Version* version_; - const Comparator* user_comparator_; - FileLevel files_; - - // No copying allowed - CompactedDBImpl(const CompactedDBImpl&); - void operator=(const CompactedDBImpl&); -}; -} -#endif // ROCKSDB_LITE diff --git a/utilities/document/document_db.cc b/utilities/document/document_db.cc index 901e91163f..c12a1f253f 100644 --- a/utilities/document/document_db.cc +++ b/utilities/document/document_db.cc @@ -33,7 +33,7 @@ namespace { // > 0 <=> lhs == rhs // TODO(icanadi) move this to JSONDocument? 
int DocumentCompare(const JSONDocument& lhs, const JSONDocument& rhs) { - assert(lhs.IsObject() == false && rhs.IsObject() == false && + assert(rhs.IsObject() == false && rhs.IsObject() == false && lhs.type() == rhs.type()); switch (lhs.type()) { @@ -376,7 +376,7 @@ class IndexKey { class SimpleSortedIndex : public Index { public: - SimpleSortedIndex(const std::string& field, const std::string& name) + SimpleSortedIndex(const std::string field, const std::string& name) : field_(field), name_(name) {} virtual const char* Name() const override { return name_.c_str(); } @@ -736,7 +736,6 @@ class DocumentDBImpl : public DocumentDB { CreateColumnFamily(ColumnFamilyOptions(rocksdb_options_), InternalSecondaryIndexName(index.name), &cf_handle); if (!s.ok()) { - delete index_obj; return s; } diff --git a/utilities/document/json_document.cc b/utilities/document/json_document.cc index 4368b759d5..641f4ee09e 100644 --- a/utilities/document/json_document.cc +++ b/utilities/document/json_document.cc @@ -6,10 +6,7 @@ #include "rocksdb/utilities/json_document.h" -#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS -#endif - #include #include #include diff --git a/utilities/geodb/geodb_impl.cc b/utilities/geodb/geodb_impl.cc index 6c13fd6916..f63c91c3e5 100644 --- a/utilities/geodb/geodb_impl.cc +++ b/utilities/geodb/geodb_impl.cc @@ -7,9 +7,7 @@ #include "utilities/geodb/geodb_impl.h" -#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS -#endif #include #include diff --git a/utilities/spatialdb/spatial_db.cc b/utilities/spatialdb/spatial_db.cc index 6fbb780bc1..8b9e49bd44 100644 --- a/utilities/spatialdb/spatial_db.cc +++ b/utilities/spatialdb/spatial_db.cc @@ -7,10 +7,7 @@ #include "rocksdb/utilities/spatial_db.h" -#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS -#endif - #include #include #include @@ -22,7 +19,6 @@ #include "rocksdb/options.h" #include "rocksdb/memtablerep.h" #include "rocksdb/slice_transform.h" -#include "rocksdb/statistics.h" #include 
"rocksdb/table.h" #include "rocksdb/db.h" #include "rocksdb/utilities/stackable_db.h" @@ -222,7 +218,6 @@ std::string FeatureSet::DebugString() const { switch (iter.second.type()) { case Variant::kNull: out.append("null"); - break; case Variant::kBool: if (iter.second.get_bool()) { out.append("true"); @@ -369,7 +364,7 @@ class SpatialIndexCursor : public Cursor { } delete spatial_iterator; - valid_ = valid_ && !primary_key_ids_.empty(); + valid_ = valid_ && primary_key_ids_.size() > 0; if (valid_) { primary_keys_iterator_ = primary_key_ids_.begin(); @@ -517,7 +512,6 @@ class SpatialDBImpl : public SpatialDB { return Status::InvalidArgument("Spatial indexes can't be empty"); } - const size_t kWriteOutEveryBytes = 1024 * 1024; // 1MB uint64_t id = next_id_.fetch_add(1); for (const auto& si : spatial_indexes) { @@ -539,13 +533,6 @@ class SpatialDBImpl : public SpatialDB { &key, GetQuadKeyFromTile(x, y, spatial_index.tile_bits)); PutFixed64BigEndian(&key, id); batch.Put(itr->second.column_family, key, Slice()); - if (batch.GetDataSize() >= kWriteOutEveryBytes) { - Status s = Write(write_options, &batch); - batch.Clear(); - if (!s.ok()) { - return s; - } - } } } } @@ -562,7 +549,6 @@ class SpatialDBImpl : public SpatialDB { } virtual Status Compact() override { - // TODO(icanadi) maybe do this in parallel? 
Status s, t; for (auto& iter : name_to_index_) { t = Flush(FlushOptions(), iter.second.column_family); @@ -635,7 +621,6 @@ class SpatialDBImpl : public SpatialDB { namespace { DBOptions GetDBOptions(const SpatialDBOptions& options) { DBOptions db_options; - db_options.max_open_files = 50000; db_options.max_background_compactions = 3 * options.num_threads / 4; db_options.max_background_flushes = options.num_threads - db_options.max_background_compactions; @@ -643,12 +628,8 @@ DBOptions GetDBOptions(const SpatialDBOptions& options) { Env::LOW); db_options.env->SetBackgroundThreads(db_options.max_background_flushes, Env::HIGH); - db_options.statistics = CreateDBStatistics(); if (options.bulk_load) { - db_options.stats_dump_period_sec = 600; db_options.disableDataSync = true; - } else { - db_options.stats_dump_period_sec = 1800; // 30min } return db_options; } @@ -658,8 +639,6 @@ ColumnFamilyOptions GetColumnFamilyOptions(const SpatialDBOptions& options, ColumnFamilyOptions column_family_options; column_family_options.write_buffer_size = 128 * 1024 * 1024; // 128MB column_family_options.max_write_buffer_number = 4; - column_family_options.max_bytes_for_level_base = 256 * 1024 * 1024; // 256MB - column_family_options.target_file_size_base = 64 * 1024 * 1024; // 64MB column_family_options.level0_file_num_compaction_trigger = 2; column_family_options.level0_slowdown_writes_trigger = 16; column_family_options.level0_slowdown_writes_trigger = 32; diff --git a/utilities/ttl/db_ttl_impl.h b/utilities/ttl/db_ttl_impl.h index 92b8eab7fb..84fb555681 100644 --- a/utilities/ttl/db_ttl_impl.h +++ b/utilities/ttl/db_ttl_impl.h @@ -206,7 +206,7 @@ class TtlCompactionFilterFactory : public CompactionFilterFactory { class TtlMergeOperator : public MergeOperator { public: - explicit TtlMergeOperator(const std::shared_ptr& merge_op, + explicit TtlMergeOperator(const std::shared_ptr merge_op, Env* env) : user_merge_op_(merge_op), env_(env) { assert(merge_op); diff --git 
a/utilities/ttl/ttl_test.cc b/utilities/ttl/ttl_test.cc index 66cabe8e39..e6d64e54ea 100644 --- a/utilities/ttl/ttl_test.cc +++ b/utilities/ttl/ttl_test.cc @@ -120,7 +120,7 @@ class TtlTest { static FlushOptions flush_opts; WriteBatch batch; kv_it_ = kvmap_.begin(); - for (int i = 0; i < num_ops && kv_it_ != kvmap_.end(); i++, ++kv_it_) { + for (int i = 0; i < num_ops && kv_it_ != kvmap_.end(); i++, kv_it_++) { switch (batch_ops[i]) { case PUT: batch.Put(kv_it_->first, kv_it_->second); @@ -145,7 +145,7 @@ class TtlTest { static FlushOptions flush_opts; kv_it_ = kvmap_.begin(); advance(kv_it_, start_pos_map); - for (int i = 0; kv_it_ != kvmap_.end() && i < num_entries; i++, ++kv_it_) { + for (int i = 0; kv_it_ != kvmap_.end() && i < num_entries; i++, kv_it_++) { ASSERT_OK(cf == nullptr ? db_ttl_->Put(wopts, kv_it_->first, kv_it_->second) : db_ttl_->Put(wopts, cf, kv_it_->first, kv_it_->second)); @@ -207,7 +207,7 @@ class TtlTest { kv_it_ = kvmap_.begin(); advance(kv_it_, st_pos); std::string v; - for (int i = 0; kv_it_ != kvmap_.end() && i < span; i++, ++kv_it_) { + for (int i = 0; kv_it_ != kvmap_.end() && i < span; i++, kv_it_++) { Status s = (cf == nullptr) ? 
db_ttl_->Get(ropts, kv_it_->first, &v) : db_ttl_->Get(ropts, cf, kv_it_->first, &v); if (s.ok() != check) { @@ -252,7 +252,7 @@ class TtlTest { } else { // dbiter should have found out kvmap_[st_pos] for (int i = st_pos; kv_it_ != kvmap_.end() && i < st_pos + span; - i++, ++kv_it_) { + i++, kv_it_++) { ASSERT_TRUE(dbiter->Valid()); ASSERT_EQ(dbiter->value().compare(kv_it_->second), 0); dbiter->Next(); diff --git a/utilities/write_batch_with_index/write_batch_with_index.cc b/utilities/write_batch_with_index/write_batch_with_index.cc index 2caa2e4ccc..68b3d3970e 100644 --- a/utilities/write_batch_with_index/write_batch_with_index.cc +++ b/utilities/write_batch_with_index/write_batch_with_index.cc @@ -20,6 +20,7 @@ class ReadableWriteBatch : public WriteBatch { Status GetEntryFromDataOffset(size_t data_offset, WriteType* type, Slice* Key, Slice* value, Slice* blob) const; }; +} // namespace // Key used by skip list, as the binary searchable index of WriteBatchWithIndex. struct WriteBatchIndexEntry { @@ -37,28 +38,44 @@ struct WriteBatchIndexEntry { class WriteBatchEntryComparator { public: - WriteBatchEntryComparator(const Comparator* default_comparator, + WriteBatchEntryComparator(const Comparator* comparator, const ReadableWriteBatch* write_batch) - : default_comparator_(default_comparator), write_batch_(write_batch) {} + : comparator_(comparator), write_batch_(write_batch) {} // Compare a and b. 
Return a negative value if a is less than b, 0 if they // are equal, and a positive value if a is greater than b int operator()(const WriteBatchIndexEntry* entry1, const WriteBatchIndexEntry* entry2) const; - void SetComparatorForCF(uint32_t column_family_id, - const Comparator* comparator) { - cf_comparator_map_[column_family_id] = comparator; - } - private: - const Comparator* default_comparator_; - std::unordered_map cf_comparator_map_; + const Comparator* comparator_; const ReadableWriteBatch* write_batch_; }; typedef SkipList WriteBatchEntrySkipList; +struct WriteBatchWithIndex::Rep { + Rep(const Comparator* index_comparator, size_t reserved_bytes = 0) + : write_batch(reserved_bytes), + comparator(index_comparator, &write_batch), + skip_list(comparator, &arena) {} + ReadableWriteBatch write_batch; + WriteBatchEntryComparator comparator; + Arena arena; + WriteBatchEntrySkipList skip_list; + + WriteBatchIndexEntry* GetEntry(ColumnFamilyHandle* column_family) { + return GetEntryWithCfId(GetColumnFamilyID(column_family)); + } + + WriteBatchIndexEntry* GetEntryWithCfId(uint32_t column_family_id) { + auto* mem = arena.Allocate(sizeof(WriteBatchIndexEntry)); + auto* index_entry = new (mem) + WriteBatchIndexEntry(write_batch.GetDataSize(), column_family_id); + return index_entry; + } +}; + class WBWIIteratorImpl : public WBWIIterator { public: WBWIIteratorImpl(uint32_t column_family_id, @@ -121,35 +138,6 @@ class WBWIIteratorImpl : public WBWIIterator { } } }; -} // namespace - -struct WriteBatchWithIndex::Rep { - Rep(const Comparator* index_comparator, size_t reserved_bytes = 0) - : write_batch(reserved_bytes), - comparator(index_comparator, &write_batch), - skip_list(comparator, &arena) {} - ReadableWriteBatch write_batch; - WriteBatchEntryComparator comparator; - Arena arena; - WriteBatchEntrySkipList skip_list; - - WriteBatchIndexEntry* GetEntry(ColumnFamilyHandle* column_family) { - uint32_t cf_id = GetColumnFamilyID(column_family); - const auto* cf_cmp = 
GetColumnFamilyUserComparator(column_family); - if (cf_cmp != nullptr) { - comparator.SetComparatorForCF(cf_id, cf_cmp); - } - - return GetEntryWithCfId(cf_id); - } - - WriteBatchIndexEntry* GetEntryWithCfId(uint32_t column_family_id) { - auto* mem = arena.Allocate(sizeof(WriteBatchIndexEntry)); - auto* index_entry = new (mem) - WriteBatchIndexEntry(write_batch.GetDataSize(), column_family_id); - return index_entry; - } -}; Status ReadableWriteBatch::GetEntryFromDataOffset(size_t data_offset, WriteType* type, Slice* Key, @@ -191,9 +179,9 @@ Status ReadableWriteBatch::GetEntryFromDataOffset(size_t data_offset, return Status::OK(); } -WriteBatchWithIndex::WriteBatchWithIndex( - const Comparator* default_index_comparator, size_t reserved_bytes) - : rep(new Rep(default_index_comparator, reserved_bytes)) {} +WriteBatchWithIndex::WriteBatchWithIndex(const Comparator* index_comparator, + size_t reserved_bytes) + : rep(new Rep(index_comparator, reserved_bytes)) {} WriteBatchWithIndex::~WriteBatchWithIndex() { delete rep; } @@ -299,14 +287,7 @@ int WriteBatchEntryComparator::operator()( key2 = *(entry2->search_key); } - int cmp; - auto comparator_for_cf = cf_comparator_map_.find(entry1->column_family); - if (comparator_for_cf != cf_comparator_map_.end()) { - cmp = comparator_for_cf->second->Compare(key1, key2); - } else { - cmp = default_comparator_->Compare(key1, key2); - } - + int cmp = comparator_->Compare(key1, key2); if (cmp != 0) { return cmp; } else if (entry1->offset > entry2->offset) { diff --git a/utilities/write_batch_with_index/write_batch_with_index_test.cc b/utilities/write_batch_with_index/write_batch_with_index_test.cc index ad8c110c12..fdceed4c44 100644 --- a/utilities/write_batch_with_index/write_batch_with_index_test.cc +++ b/utilities/write_batch_with_index/write_batch_with_index_test.cc @@ -19,16 +19,12 @@ namespace rocksdb { namespace { class ColumnFamilyHandleImplDummy : public ColumnFamilyHandleImpl { public: - explicit 
ColumnFamilyHandleImplDummy(int id, const Comparator* comparator) - : ColumnFamilyHandleImpl(nullptr, nullptr, nullptr), - id_(id), - comparator_(comparator) {} + explicit ColumnFamilyHandleImplDummy(int id) + : ColumnFamilyHandleImpl(nullptr, nullptr, nullptr), id_(id) {} uint32_t GetID() const override { return id_; } - const Comparator* user_comparator() const override { return comparator_; } private: uint32_t id_; - const Comparator* comparator_; }; struct Entry { @@ -94,9 +90,8 @@ TEST(WriteBatchWithIndexTest, TestValueAsSecondaryIndex) { index_map[e.value].push_back(&e); } - WriteBatchWithIndex batch(nullptr, 20); - ColumnFamilyHandleImplDummy data(6, BytewiseComparator()); - ColumnFamilyHandleImplDummy index(8, BytewiseComparator()); + WriteBatchWithIndex batch(BytewiseComparator(), 20); + ColumnFamilyHandleImplDummy data(6), index(8); for (auto& e : entries) { if (e.type == kPutRecord) { batch.Put(&data, e.key, e.value); @@ -235,107 +230,6 @@ TEST(WriteBatchWithIndexTest, TestValueAsSecondaryIndex) { } } -class ReverseComparator : public Comparator { - public: - ReverseComparator() {} - - virtual const char* Name() const override { - return "rocksdb.ReverseComparator"; - } - - virtual int Compare(const Slice& a, const Slice& b) const override { - return 0 - BytewiseComparator()->Compare(a, b); - } - - virtual void FindShortestSeparator(std::string* start, - const Slice& limit) const {} - virtual void FindShortSuccessor(std::string* key) const {} -}; - -TEST(WriteBatchWithIndexTest, TestComparatorForCF) { - ReverseComparator reverse_cmp; - ColumnFamilyHandleImplDummy cf1(6, nullptr); - ColumnFamilyHandleImplDummy reverse_cf(66, &reverse_cmp); - ColumnFamilyHandleImplDummy cf2(88, BytewiseComparator()); - WriteBatchWithIndex batch(BytewiseComparator(), 20); - - batch.Put(&cf1, "ddd", ""); - batch.Put(&cf2, "aaa", ""); - batch.Put(&cf2, "eee", ""); - batch.Put(&cf1, "ccc", ""); - batch.Put(&reverse_cf, "a11", ""); - batch.Put(&cf1, "bbb", ""); - 
batch.Put(&reverse_cf, "a33", ""); - batch.Put(&reverse_cf, "a22", ""); - - { - std::unique_ptr iter(batch.NewIterator(&cf1)); - iter->Seek(""); - ASSERT_OK(iter->status()); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ("bbb", iter->Entry().key.ToString()); - iter->Next(); - ASSERT_OK(iter->status()); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ("ccc", iter->Entry().key.ToString()); - iter->Next(); - ASSERT_OK(iter->status()); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ("ddd", iter->Entry().key.ToString()); - iter->Next(); - ASSERT_OK(iter->status()); - ASSERT_TRUE(!iter->Valid()); - } - - { - std::unique_ptr iter(batch.NewIterator(&cf2)); - iter->Seek(""); - ASSERT_OK(iter->status()); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ("aaa", iter->Entry().key.ToString()); - iter->Next(); - ASSERT_OK(iter->status()); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ("eee", iter->Entry().key.ToString()); - iter->Next(); - ASSERT_OK(iter->status()); - ASSERT_TRUE(!iter->Valid()); - } - - { - std::unique_ptr iter(batch.NewIterator(&reverse_cf)); - iter->Seek(""); - ASSERT_OK(iter->status()); - ASSERT_TRUE(!iter->Valid()); - - iter->Seek("z"); - ASSERT_OK(iter->status()); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ("a33", iter->Entry().key.ToString()); - iter->Next(); - ASSERT_OK(iter->status()); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ("a22", iter->Entry().key.ToString()); - iter->Next(); - ASSERT_OK(iter->status()); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ("a11", iter->Entry().key.ToString()); - iter->Next(); - ASSERT_OK(iter->status()); - ASSERT_TRUE(!iter->Valid()); - - iter->Seek("a22"); - ASSERT_OK(iter->status()); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ("a22", iter->Entry().key.ToString()); - - iter->Seek("a13"); - ASSERT_OK(iter->status()); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ("a11", iter->Entry().key.ToString()); - } -} - } // namespace int main(int argc, char** argv) { return rocksdb::test::RunAllTests(); }