From 8514b88974c71c0fa85bb154507536ee49c33458 Mon Sep 17 00:00:00 2001 From: Vinnie Falco Date: Wed, 4 Jun 2014 16:10:38 -0700 Subject: [PATCH] Squashed 'src/rocksdb/' content from commit 457bae6 git-subtree-dir: src/rocksdb git-subtree-split: 457bae6911343c2e03222e24d0c946b94dca82b4 --- .arcconfig | 10 + .clang-format | 5 + .gitignore | 34 + .travis.yml | 20 + CONTRIBUTING.md | 20 + HISTORY.md | 89 + INSTALL.md | 84 + LICENSE | Bin 0 -> 1646 bytes Makefile | 523 ++ PATENTS | 23 + README.md | 26 + ROCKSDB_LITE.md | 20 + build_tools/build_detect_platform | 320 + build_tools/build_detect_version | 22 + build_tools/fbcode.clang31.sh | 74 + build_tools/fbcode.gcc471.sh | 70 + build_tools/fbcode.gcc481.sh | 81 + build_tools/format-diff.sh | 107 + build_tools/mac-install-gflags.sh | 25 + build_tools/make_new_version.sh | 46 + build_tools/regression_build_test.sh | 330 + build_tools/valgrind_test.sh | 15 + coverage/coverage_test.sh | 78 + coverage/parse_gcov_output.py | 118 + db/builder.cc | 224 + db/builder.h | 45 + db/c.cc | 1476 ++++ db/c_test.c | 494 ++ db/column_family.cc | 604 ++ db/column_family.h | 419 + db/column_family_test.cc | 977 +++ db/compaction.cc | 253 + db/compaction.h | 158 + db/compaction_picker.cc | 960 +++ db/compaction_picker.h | 181 + db/corruption_test.cc | 440 ++ db/db_bench.cc | 2642 +++++++ db/db_filesnapshot.cc | 179 + db/db_impl.cc | 4703 +++++++++++ db/db_impl.h | 635 ++ db/db_impl_debug.cc | 133 + db/db_impl_readonly.cc | 154 + db/db_impl_readonly.h | 103 + db/db_iter.cc | 517 ++ db/db_iter.h | 73 + db/db_stats_logger.cc | 95 + db/db_test.cc | 6852 +++++++++++++++++ db/dbformat.cc | 169 + db/dbformat.h | 345 + db/dbformat_test.cc | 117 + db/deletefile_test.cc | 295 + db/file_indexer.cc | 202 + db/file_indexer.h | 129 + db/file_indexer_test.cc | 330 + db/filename.cc | 266 + db/filename.h | 110 + db/filename_test.cc | 140 + db/forward_iterator.cc | 383 + db/forward_iterator.h | 105 + db/internal_stats.cc | 369 + db/internal_stats.h | 187 + db/log_and_apply_bench.cc | 79 + db/log_format.h | 35 + db/log_reader.cc | 339 + db/log_reader.h | 130 + db/log_test.cc | 689 ++ db/log_writer.cc | 108 + db/log_writer.h | 55 + db/memtable.cc | 620 ++ db/memtable.h | 222 + db/memtable_list.cc | 286 + db/memtable_list.h | 156 + db/merge_context.h | 69 + db/merge_helper.cc | 209 + db/merge_helper.h | 105 + db/merge_operator.cc | 77 + db/merge_test.cc | 472 ++ db/perf_context_test.cc | 358 + db/plain_table_db_test.cc | 853 ++ db/prefix_test.cc | 499 ++ db/repair.cc | 403 + db/simple_table_db_test.cc | 800 ++ db/skiplist.h | 429 ++ db/skiplist_test.cc | 383 + db/snapshot.h | 86 + db/table_cache.cc | 198 + db/table_cache.h | 95 + db/table_properties_collector.cc | 83 + db/table_properties_collector.h | 95 + db/table_properties_collector_test.cc | 313 + db/tailing_iter.cc | 221 + db/tailing_iter.h | 97 + db/transaction_log_impl.cc | 262 + db/transaction_log_impl.h | 120 + db/version_edit.cc | 364 + db/version_edit.h | 176 + db/version_edit_test.cc | 65 + db/version_set.cc | 2822 +++++++ db/version_set.h | 499 ++ db/version_set_test.cc | 184 + db/write_batch.cc | 489 ++ db/write_batch_internal.h | 123 + db/write_batch_test.cc | 323 + doc/doc.css | 89 + doc/index.html | 831 ++ doc/log_format.txt | 75 + doc/rockslogo.jpg | Bin 0 -> 137232 bytes doc/rockslogo.png | Bin 0 -> 61703 bytes examples/.gitignore | 2 + examples/Makefile | 9 + examples/README.md | 1 + examples/column_families_example.cc | 72 + examples/simple_example.cc | 41 + hdfs/README | 23 + hdfs/env_hdfs.h | 327 + hdfs/setup.sh 
| 7 + helpers/memenv/memenv.cc | 395 + helpers/memenv/memenv_test.cc | 231 + include/rocksdb/c.h | 575 ++ include/rocksdb/cache.h | 140 + include/rocksdb/compaction_filter.h | 198 + include/rocksdb/comparator.h | 67 + include/rocksdb/db.h | 495 ++ include/rocksdb/env.h | 772 ++ include/rocksdb/filter_policy.h | 74 + include/rocksdb/flush_block_policy.h | 58 + include/rocksdb/iterator.h | 106 + include/rocksdb/ldb_tool.h | 18 + include/rocksdb/memtablerep.h | 284 + include/rocksdb/merge_operator.h | 182 + include/rocksdb/options.h | 975 +++ include/rocksdb/perf_context.h | 75 + include/rocksdb/slice.h | 136 + include/rocksdb/slice_transform.h | 47 + include/rocksdb/statistics.h | 268 + include/rocksdb/status.h | 145 + include/rocksdb/table.h | 206 + include/rocksdb/table_properties.h | 127 + include/rocksdb/transaction_log.h | 104 + include/rocksdb/types.h | 20 + include/rocksdb/universal_compaction.h | 83 + include/rocksdb/version.h | 17 + include/rocksdb/write_batch.h | 158 + include/utilities/backupable_db.h | 251 + include/utilities/db_ttl.h | 68 + include/utilities/geo_db.h | 105 + include/utilities/stackable_db.h | 215 + include/utilities/utility_db.h | 30 + java/Makefile | 31 + java/RocksDBSample.java | 253 + java/jdb_bench.sh | 1 + java/org/rocksdb/BackupableDB.java | 80 + java/org/rocksdb/BackupableDBOptions.java | 43 + java/org/rocksdb/BloomFilter.java | 37 + java/org/rocksdb/Filter.java | 31 + .../rocksdb/HashLinkedListMemTableConfig.java | 52 + .../rocksdb/HashSkipListMemTableConfig.java | 97 + java/org/rocksdb/HistogramData.java | 43 + java/org/rocksdb/HistogramType.java | 39 + java/org/rocksdb/MemTableConfig.java | 27 + java/org/rocksdb/Options.java | 2354 ++++++ java/org/rocksdb/PlainTableConfig.java | 123 + java/org/rocksdb/ReadOptions.java | 125 + java/org/rocksdb/RocksDB.java | 370 + java/org/rocksdb/RocksDBException.java | 23 + java/org/rocksdb/RocksIterator.java | 136 + java/org/rocksdb/RocksObject.java | 72 + java/org/rocksdb/SkipListMemTableConfig.java | 15 + java/org/rocksdb/Statistics.java | 38 + java/org/rocksdb/TableFormatConfig.java | 20 + java/org/rocksdb/TickerType.java | 123 + java/org/rocksdb/VectorMemTableConfig.java | 40 + java/org/rocksdb/WriteBatch.java | 112 + java/org/rocksdb/WriteBatchTest.java | 124 + java/org/rocksdb/WriteOptions.java | 99 + java/org/rocksdb/benchmark/DbBenchmark.java | 1590 ++++ java/org/rocksdb/test/BackupableDBTest.java | 41 + java/org/rocksdb/test/OptionsTest.java | 424 + java/org/rocksdb/test/ReadOptionsTest.java | 40 + java/org/rocksdb/util/Environment.java | 37 + java/org/rocksdb/util/SizeUnit.java | 16 + java/rocksjni/backupablejni.cc | 85 + java/rocksjni/filter.cc | 38 + java/rocksjni/iterator.cc | 145 + java/rocksjni/memtablejni.cc | 58 + java/rocksjni/options.cc | 1805 +++++ java/rocksjni/portal.h | 383 + java/rocksjni/rocksjni.cc | 440 ++ java/rocksjni/statistics.cc | 50 + java/rocksjni/table.cc | 25 + java/rocksjni/write_batch.cc | 261 + linters/__phutil_library_init__.php | 3 + linters/__phutil_library_map__.php | 27 + linters/cpp_linter/ArcanistCpplintLinter.php | 88 + linters/cpp_linter/FbcodeCppLinter.php | 99 + linters/cpp_linter/PfffCppLinter.php | 68 + linters/cpp_linter/cpplint.py | 4767 ++++++++++++ .../lint_engine/FacebookFbcodeLintEngine.php | 147 + port/README | 10 + port/atomic_pointer.h | 157 + port/likely.h | 21 + port/port.h | 22 + port/port_example.h | 133 + port/port_posix.cc | 109 + port/port_posix.h | 488 ++ port/stack_trace.cc | 132 + port/stack_trace.h | 19 + port/win/stdint.h | 24 + table/block.cc | 
307 + table/block.h | 61 + table/block_based_table_builder.cc | 804 ++ table/block_based_table_builder.h | 92 + table/block_based_table_factory.cc | 63 + table/block_based_table_factory.h | 53 + table/block_based_table_reader.cc | 1176 +++ table/block_based_table_reader.h | 201 + table/block_builder.cc | 134 + table/block_builder.h | 65 + table/block_hash_index.cc | 157 + table/block_hash_index.h | 85 + table/block_hash_index_test.cc | 117 + table/block_test.cc | 242 + table/filter_block.cc | 187 + table/filter_block.h | 92 + table/filter_block_test.cc | 139 + table/flush_block_policy.cc | 70 + table/format.cc | 371 + table/format.h | 198 + table/iter_heap.h | 44 + table/iterator.cc | 92 + table/iterator_wrapper.h | 81 + table/merger.cc | 356 + table/merger.h | 60 + table/meta_blocks.cc | 276 + table/meta_blocks.h | 131 + table/plain_table_builder.cc | 211 + table/plain_table_builder.h | 87 + table/plain_table_factory.cc | 54 + table/plain_table_factory.h | 94 + table/plain_table_reader.cc | 776 ++ table/plain_table_reader.h | 265 + table/table_builder.h | 55 + table/table_properties.cc | 115 + table/table_reader.h | 71 + table/table_reader_bench.cc | 284 + table/table_test.cc | 1805 +++++ table/two_level_iterator.cc | 199 + table/two_level_iterator.h | 50 + third-party/rapidjson/document.h | 821 ++ third-party/rapidjson/filestream.h | 46 + third-party/rapidjson/internal/pow10.h | 54 + third-party/rapidjson/internal/stack.h | 82 + third-party/rapidjson/internal/strfunc.h | 24 + third-party/rapidjson/license.txt | 19 + third-party/rapidjson/prettywriter.h | 156 + third-party/rapidjson/rapidjson.h | 525 ++ third-party/rapidjson/reader.h | 683 ++ third-party/rapidjson/stringbuffer.h | 49 + third-party/rapidjson/writer.h | 241 + tools/auto_sanity_test.sh | 71 + tools/blob_store_bench.cc | 280 + tools/db_crashtest.py | 150 + tools/db_crashtest2.py | 174 + tools/db_repl_stress.cc | 147 + tools/db_sanity_test.cc | 204 + tools/db_stress.cc | 1808 +++++ tools/ldb.cc | 13 + tools/ldb_test.py | 383 + tools/reduce_levels_test.cc | 197 + tools/sst_dump.cc | 367 + util/arena.cc | 130 + util/arena.h | 128 + util/arena_test.cc | 142 + util/auto_roll_logger.cc | 116 + util/auto_roll_logger.h | 91 + util/auto_roll_logger_test.cc | 292 + util/autovector.h | 319 + util/autovector_test.cc | 316 + util/benchharness.cc | 398 + util/benchharness.h | 357 + util/benchharness_test.cc | 67 + util/blob_store.cc | 270 + util/blob_store.h | 163 + util/blob_store_test.cc | 200 + util/bloom.cc | 111 + util/bloom_test.cc | 182 + util/build_version.h | 16 + util/cache.cc | 481 ++ util/cache_test.cc | 449 ++ util/coding.cc | 169 + util/coding.h | 294 + util/coding_test.cc | 296 + util/comparator.cc | 86 + util/crc32c.cc | 393 + util/crc32c.h | 46 + util/crc32c_test.cc | 77 + util/dynamic_bloom.cc | 60 + util/dynamic_bloom.h | 119 + util/dynamic_bloom_test.cc | 215 + util/env.cc | 251 + util/env_hdfs.cc | 591 ++ util/env_posix.cc | 1726 +++++ util/env_test.cc | 741 ++ util/filelock_test.cc | 58 + util/filter_policy.cc | 16 + util/hash.cc | 49 + util/hash.h | 20 + util/hash_cuckoo_rep.cc | 636 ++ util/hash_cuckoo_rep.h | 42 + util/hash_linklist_rep.cc | 495 ++ util/hash_linklist_rep.h | 38 + util/hash_skiplist_rep.cc | 347 + util/hash_skiplist_rep.h | 43 + util/histogram.cc | 198 + util/histogram.h | 79 + util/histogram_test.cc | 62 + util/ldb_cmd.cc | 1839 +++++ util/ldb_cmd.h | 722 ++ util/ldb_cmd_execute_result.h | 76 + util/ldb_tool.cc | 107 + util/log_buffer.cc | 73 + util/log_buffer.h | 49 + util/log_write_bench.cc | 82 
+ util/logging.cc | 99 + util/logging.h | 47 + util/manual_compaction_test.cc | 156 + util/murmurhash.cc | 183 + util/murmurhash.h | 42 + util/mutexlock.h | 78 + util/options.cc | 553 ++ util/perf_context.cc | 86 + util/perf_context_imp.h | 88 + util/posix_logger.h | 161 + util/random.h | 90 + util/signal_test.cc | 34 + util/skiplistrep.cc | 129 + util/slice.cc | 74 + util/statistics.cc | 94 + util/statistics.h | 66 + util/stats_logger.h | 26 + util/status.cc | 86 + util/stl_wrappers.h | 32 + util/stop_watch.h | 67 + util/string_util.cc | 23 + util/string_util.h | 15 + util/sync_point.cc | 64 + util/sync_point.h | 80 + util/testharness.cc | 84 + util/testharness.h | 142 + util/testutil.cc | 56 + util/testutil.h | 80 + util/thread_local.cc | 243 + util/thread_local.h | 166 + util/thread_local_test.cc | 472 ++ util/vectorrep.cc | 294 + util/xxhash.cc | 475 ++ util/xxhash.h | 164 + utilities/backupable/backupable_db.cc | 1306 ++++ utilities/backupable/backupable_db_test.cc | 974 +++ utilities/geodb/geodb_impl.cc | 431 ++ utilities/geodb/geodb_impl.h | 191 + utilities/geodb/geodb_test.cc | 123 + utilities/merge_operators.h | 45 + utilities/merge_operators/put.cc | 68 + .../string_append/stringappend.cc | 60 + .../string_append/stringappend.h | 31 + .../string_append/stringappend2.cc | 113 + .../string_append/stringappend2.h | 51 + .../string_append/stringappend_test.cc | 595 ++ utilities/merge_operators/uint64add.cc | 65 + utilities/redis/README | 14 + utilities/redis/redis_list_exception.h | 22 + utilities/redis/redis_list_iterator.h | 310 + utilities/redis/redis_lists.cc | 552 ++ utilities/redis/redis_lists.h | 108 + utilities/redis/redis_lists_test.cc | 884 +++ utilities/ttl/db_ttl_impl.cc | 284 + utilities/ttl/db_ttl_impl.h | 314 + utilities/ttl/ttl_test.cc | 595 ++ 379 files changed, 108179 insertions(+) create mode 100644 .arcconfig create mode 100644 .clang-format create mode 100644 .gitignore create mode 100644 .travis.yml create mode 100644 CONTRIBUTING.md create mode 100644 HISTORY.md create mode 100644 INSTALL.md create mode 100644 LICENSE create mode 100644 Makefile create mode 100644 PATENTS create mode 100644 README.md create mode 100644 ROCKSDB_LITE.md create mode 100755 build_tools/build_detect_platform create mode 100755 build_tools/build_detect_version create mode 100644 build_tools/fbcode.clang31.sh create mode 100644 build_tools/fbcode.gcc471.sh create mode 100644 build_tools/fbcode.gcc481.sh create mode 100755 build_tools/format-diff.sh create mode 100755 build_tools/mac-install-gflags.sh create mode 100755 build_tools/make_new_version.sh create mode 100755 build_tools/regression_build_test.sh create mode 100755 build_tools/valgrind_test.sh create mode 100755 coverage/coverage_test.sh create mode 100644 coverage/parse_gcov_output.py create mode 100644 db/builder.cc create mode 100644 db/builder.h create mode 100644 db/c.cc create mode 100644 db/c_test.c create mode 100644 db/column_family.cc create mode 100644 db/column_family.h create mode 100644 db/column_family_test.cc create mode 100644 db/compaction.cc create mode 100644 db/compaction.h create mode 100644 db/compaction_picker.cc create mode 100644 db/compaction_picker.h create mode 100644 db/corruption_test.cc create mode 100644 db/db_bench.cc create mode 100644 db/db_filesnapshot.cc create mode 100644 db/db_impl.cc create mode 100644 db/db_impl.h create mode 100644 db/db_impl_debug.cc create mode 100644 db/db_impl_readonly.cc create mode 100644 db/db_impl_readonly.h create mode 100644 db/db_iter.cc create mode 100644 
db/db_iter.h create mode 100644 db/db_stats_logger.cc create mode 100644 db/db_test.cc create mode 100644 db/dbformat.cc create mode 100644 db/dbformat.h create mode 100644 db/dbformat_test.cc create mode 100644 db/deletefile_test.cc create mode 100644 db/file_indexer.cc create mode 100644 db/file_indexer.h create mode 100644 db/file_indexer_test.cc create mode 100644 db/filename.cc create mode 100644 db/filename.h create mode 100644 db/filename_test.cc create mode 100644 db/forward_iterator.cc create mode 100644 db/forward_iterator.h create mode 100644 db/internal_stats.cc create mode 100644 db/internal_stats.h create mode 100644 db/log_and_apply_bench.cc create mode 100644 db/log_format.h create mode 100644 db/log_reader.cc create mode 100644 db/log_reader.h create mode 100644 db/log_test.cc create mode 100644 db/log_writer.cc create mode 100644 db/log_writer.h create mode 100644 db/memtable.cc create mode 100644 db/memtable.h create mode 100644 db/memtable_list.cc create mode 100644 db/memtable_list.h create mode 100644 db/merge_context.h create mode 100644 db/merge_helper.cc create mode 100644 db/merge_helper.h create mode 100644 db/merge_operator.cc create mode 100644 db/merge_test.cc create mode 100644 db/perf_context_test.cc create mode 100644 db/plain_table_db_test.cc create mode 100644 db/prefix_test.cc create mode 100644 db/repair.cc create mode 100644 db/simple_table_db_test.cc create mode 100644 db/skiplist.h create mode 100644 db/skiplist_test.cc create mode 100644 db/snapshot.h create mode 100644 db/table_cache.cc create mode 100644 db/table_cache.h create mode 100644 db/table_properties_collector.cc create mode 100644 db/table_properties_collector.h create mode 100644 db/table_properties_collector_test.cc create mode 100644 db/tailing_iter.cc create mode 100644 db/tailing_iter.h create mode 100644 db/transaction_log_impl.cc create mode 100644 db/transaction_log_impl.h create mode 100644 db/version_edit.cc create mode 100644 db/version_edit.h create mode 100644 db/version_edit_test.cc create mode 100644 db/version_set.cc create mode 100644 db/version_set.h create mode 100644 db/version_set_test.cc create mode 100644 db/write_batch.cc create mode 100644 db/write_batch_internal.h create mode 100644 db/write_batch_test.cc create mode 100644 doc/doc.css create mode 100644 doc/index.html create mode 100644 doc/log_format.txt create mode 100644 doc/rockslogo.jpg create mode 100644 doc/rockslogo.png create mode 100644 examples/.gitignore create mode 100644 examples/Makefile create mode 100644 examples/README.md create mode 100644 examples/column_families_example.cc create mode 100644 examples/simple_example.cc create mode 100644 hdfs/README create mode 100644 hdfs/env_hdfs.h create mode 100644 hdfs/setup.sh create mode 100644 helpers/memenv/memenv.cc create mode 100644 helpers/memenv/memenv_test.cc create mode 100644 include/rocksdb/c.h create mode 100644 include/rocksdb/cache.h create mode 100644 include/rocksdb/compaction_filter.h create mode 100644 include/rocksdb/comparator.h create mode 100644 include/rocksdb/db.h create mode 100644 include/rocksdb/env.h create mode 100644 include/rocksdb/filter_policy.h create mode 100644 include/rocksdb/flush_block_policy.h create mode 100644 include/rocksdb/iterator.h create mode 100644 include/rocksdb/ldb_tool.h create mode 100644 include/rocksdb/memtablerep.h create mode 100644 include/rocksdb/merge_operator.h create mode 100644 include/rocksdb/options.h create mode 100644 include/rocksdb/perf_context.h create mode 100644 
include/rocksdb/slice.h create mode 100644 include/rocksdb/slice_transform.h create mode 100644 include/rocksdb/statistics.h create mode 100644 include/rocksdb/status.h create mode 100644 include/rocksdb/table.h create mode 100644 include/rocksdb/table_properties.h create mode 100644 include/rocksdb/transaction_log.h create mode 100644 include/rocksdb/types.h create mode 100644 include/rocksdb/universal_compaction.h create mode 100644 include/rocksdb/version.h create mode 100644 include/rocksdb/write_batch.h create mode 100644 include/utilities/backupable_db.h create mode 100644 include/utilities/db_ttl.h create mode 100644 include/utilities/geo_db.h create mode 100644 include/utilities/stackable_db.h create mode 100644 include/utilities/utility_db.h create mode 100644 java/Makefile create mode 100644 java/RocksDBSample.java create mode 100755 java/jdb_bench.sh create mode 100644 java/org/rocksdb/BackupableDB.java create mode 100644 java/org/rocksdb/BackupableDBOptions.java create mode 100644 java/org/rocksdb/BloomFilter.java create mode 100644 java/org/rocksdb/Filter.java create mode 100644 java/org/rocksdb/HashLinkedListMemTableConfig.java create mode 100644 java/org/rocksdb/HashSkipListMemTableConfig.java create mode 100644 java/org/rocksdb/HistogramData.java create mode 100644 java/org/rocksdb/HistogramType.java create mode 100644 java/org/rocksdb/MemTableConfig.java create mode 100644 java/org/rocksdb/Options.java create mode 100644 java/org/rocksdb/PlainTableConfig.java create mode 100644 java/org/rocksdb/ReadOptions.java create mode 100644 java/org/rocksdb/RocksDB.java create mode 100644 java/org/rocksdb/RocksDBException.java create mode 100644 java/org/rocksdb/RocksIterator.java create mode 100644 java/org/rocksdb/RocksObject.java create mode 100644 java/org/rocksdb/SkipListMemTableConfig.java create mode 100644 java/org/rocksdb/Statistics.java create mode 100644 java/org/rocksdb/TableFormatConfig.java create mode 100644 java/org/rocksdb/TickerType.java create mode 100644 java/org/rocksdb/VectorMemTableConfig.java create mode 100644 java/org/rocksdb/WriteBatch.java create mode 100644 java/org/rocksdb/WriteBatchTest.java create mode 100644 java/org/rocksdb/WriteOptions.java create mode 100644 java/org/rocksdb/benchmark/DbBenchmark.java create mode 100644 java/org/rocksdb/test/BackupableDBTest.java create mode 100644 java/org/rocksdb/test/OptionsTest.java create mode 100644 java/org/rocksdb/test/ReadOptionsTest.java create mode 100644 java/org/rocksdb/util/Environment.java create mode 100644 java/org/rocksdb/util/SizeUnit.java create mode 100644 java/rocksjni/backupablejni.cc create mode 100644 java/rocksjni/filter.cc create mode 100644 java/rocksjni/iterator.cc create mode 100644 java/rocksjni/memtablejni.cc create mode 100644 java/rocksjni/options.cc create mode 100644 java/rocksjni/portal.h create mode 100644 java/rocksjni/rocksjni.cc create mode 100644 java/rocksjni/statistics.cc create mode 100644 java/rocksjni/table.cc create mode 100644 java/rocksjni/write_batch.cc create mode 100644 linters/__phutil_library_init__.php create mode 100644 linters/__phutil_library_map__.php create mode 100644 linters/cpp_linter/ArcanistCpplintLinter.php create mode 100644 linters/cpp_linter/FbcodeCppLinter.php create mode 100644 linters/cpp_linter/PfffCppLinter.php create mode 100755 linters/cpp_linter/cpplint.py create mode 100644 linters/lint_engine/FacebookFbcodeLintEngine.php create mode 100644 port/README create mode 100644 port/atomic_pointer.h create mode 100644 port/likely.h create mode 
100644 port/port.h create mode 100644 port/port_example.h create mode 100644 port/port_posix.cc create mode 100644 port/port_posix.h create mode 100644 port/stack_trace.cc create mode 100644 port/stack_trace.h create mode 100644 port/win/stdint.h create mode 100644 table/block.cc create mode 100644 table/block.h create mode 100644 table/block_based_table_builder.cc create mode 100644 table/block_based_table_builder.h create mode 100644 table/block_based_table_factory.cc create mode 100644 table/block_based_table_factory.h create mode 100644 table/block_based_table_reader.cc create mode 100644 table/block_based_table_reader.h create mode 100644 table/block_builder.cc create mode 100644 table/block_builder.h create mode 100644 table/block_hash_index.cc create mode 100644 table/block_hash_index.h create mode 100644 table/block_hash_index_test.cc create mode 100644 table/block_test.cc create mode 100644 table/filter_block.cc create mode 100644 table/filter_block.h create mode 100644 table/filter_block_test.cc create mode 100644 table/flush_block_policy.cc create mode 100644 table/format.cc create mode 100644 table/format.h create mode 100644 table/iter_heap.h create mode 100644 table/iterator.cc create mode 100644 table/iterator_wrapper.h create mode 100644 table/merger.cc create mode 100644 table/merger.h create mode 100644 table/meta_blocks.cc create mode 100644 table/meta_blocks.h create mode 100644 table/plain_table_builder.cc create mode 100644 table/plain_table_builder.h create mode 100644 table/plain_table_factory.cc create mode 100644 table/plain_table_factory.h create mode 100644 table/plain_table_reader.cc create mode 100644 table/plain_table_reader.h create mode 100644 table/table_builder.h create mode 100644 table/table_properties.cc create mode 100644 table/table_reader.h create mode 100644 table/table_reader_bench.cc create mode 100644 table/table_test.cc create mode 100644 table/two_level_iterator.cc create mode 100644 table/two_level_iterator.h create mode 100644 third-party/rapidjson/document.h create mode 100644 third-party/rapidjson/filestream.h create mode 100644 third-party/rapidjson/internal/pow10.h create mode 100644 third-party/rapidjson/internal/stack.h create mode 100644 third-party/rapidjson/internal/strfunc.h create mode 100644 third-party/rapidjson/license.txt create mode 100644 third-party/rapidjson/prettywriter.h create mode 100644 third-party/rapidjson/rapidjson.h create mode 100644 third-party/rapidjson/reader.h create mode 100644 third-party/rapidjson/stringbuffer.h create mode 100644 third-party/rapidjson/writer.h create mode 100755 tools/auto_sanity_test.sh create mode 100644 tools/blob_store_bench.cc create mode 100644 tools/db_crashtest.py create mode 100644 tools/db_crashtest2.py create mode 100644 tools/db_repl_stress.cc create mode 100644 tools/db_sanity_test.cc create mode 100644 tools/db_stress.cc create mode 100644 tools/ldb.cc create mode 100644 tools/ldb_test.py create mode 100644 tools/reduce_levels_test.cc create mode 100644 tools/sst_dump.cc create mode 100644 util/arena.cc create mode 100644 util/arena.h create mode 100644 util/arena_test.cc create mode 100644 util/auto_roll_logger.cc create mode 100644 util/auto_roll_logger.h create mode 100755 util/auto_roll_logger_test.cc create mode 100644 util/autovector.h create mode 100644 util/autovector_test.cc create mode 100644 util/benchharness.cc create mode 100644 util/benchharness.h create mode 100644 util/benchharness_test.cc create mode 100644 util/blob_store.cc create mode 100644 
util/blob_store.h create mode 100644 util/blob_store_test.cc create mode 100644 util/bloom.cc create mode 100644 util/bloom_test.cc create mode 100644 util/build_version.h create mode 100644 util/cache.cc create mode 100644 util/cache_test.cc create mode 100644 util/coding.cc create mode 100644 util/coding.h create mode 100644 util/coding_test.cc create mode 100644 util/comparator.cc create mode 100644 util/crc32c.cc create mode 100644 util/crc32c.h create mode 100644 util/crc32c_test.cc create mode 100644 util/dynamic_bloom.cc create mode 100644 util/dynamic_bloom.h create mode 100644 util/dynamic_bloom_test.cc create mode 100644 util/env.cc create mode 100644 util/env_hdfs.cc create mode 100644 util/env_posix.cc create mode 100644 util/env_test.cc create mode 100644 util/filelock_test.cc create mode 100644 util/filter_policy.cc create mode 100644 util/hash.cc create mode 100644 util/hash.h create mode 100644 util/hash_cuckoo_rep.cc create mode 100644 util/hash_cuckoo_rep.h create mode 100644 util/hash_linklist_rep.cc create mode 100644 util/hash_linklist_rep.h create mode 100644 util/hash_skiplist_rep.cc create mode 100644 util/hash_skiplist_rep.h create mode 100644 util/histogram.cc create mode 100644 util/histogram.h create mode 100644 util/histogram_test.cc create mode 100644 util/ldb_cmd.cc create mode 100644 util/ldb_cmd.h create mode 100644 util/ldb_cmd_execute_result.h create mode 100644 util/ldb_tool.cc create mode 100644 util/log_buffer.cc create mode 100644 util/log_buffer.h create mode 100644 util/log_write_bench.cc create mode 100644 util/logging.cc create mode 100644 util/logging.h create mode 100644 util/manual_compaction_test.cc create mode 100644 util/murmurhash.cc create mode 100644 util/murmurhash.h create mode 100644 util/mutexlock.h create mode 100644 util/options.cc create mode 100644 util/perf_context.cc create mode 100644 util/perf_context_imp.h create mode 100644 util/posix_logger.h create mode 100644 util/random.h create mode 100644 util/signal_test.cc create mode 100644 util/skiplistrep.cc create mode 100644 util/slice.cc create mode 100644 util/statistics.cc create mode 100644 util/statistics.h create mode 100644 util/stats_logger.h create mode 100644 util/status.cc create mode 100644 util/stl_wrappers.h create mode 100644 util/stop_watch.h create mode 100644 util/string_util.cc create mode 100644 util/string_util.h create mode 100644 util/sync_point.cc create mode 100644 util/sync_point.h create mode 100644 util/testharness.cc create mode 100644 util/testharness.h create mode 100644 util/testutil.cc create mode 100644 util/testutil.h create mode 100644 util/thread_local.cc create mode 100644 util/thread_local.h create mode 100644 util/thread_local_test.cc create mode 100644 util/vectorrep.cc create mode 100644 util/xxhash.cc create mode 100644 util/xxhash.h create mode 100644 utilities/backupable/backupable_db.cc create mode 100644 utilities/backupable/backupable_db_test.cc create mode 100644 utilities/geodb/geodb_impl.cc create mode 100644 utilities/geodb/geodb_impl.h create mode 100644 utilities/geodb/geodb_test.cc create mode 100644 utilities/merge_operators.h create mode 100644 utilities/merge_operators/put.cc create mode 100644 utilities/merge_operators/string_append/stringappend.cc create mode 100644 utilities/merge_operators/string_append/stringappend.h create mode 100644 utilities/merge_operators/string_append/stringappend2.cc create mode 100644 utilities/merge_operators/string_append/stringappend2.h create mode 100644 
utilities/merge_operators/string_append/stringappend_test.cc create mode 100644 utilities/merge_operators/uint64add.cc create mode 100644 utilities/redis/README create mode 100644 utilities/redis/redis_list_exception.h create mode 100644 utilities/redis/redis_list_iterator.h create mode 100644 utilities/redis/redis_lists.cc create mode 100644 utilities/redis/redis_lists.h create mode 100644 utilities/redis/redis_lists_test.cc create mode 100644 utilities/ttl/db_ttl_impl.cc create mode 100644 utilities/ttl/db_ttl_impl.h create mode 100644 utilities/ttl/ttl_test.cc

diff --git a/.arcconfig b/.arcconfig
new file mode 100644
index 0000000000..85ca38f253
--- /dev/null
+++ b/.arcconfig
@@ -0,0 +1,10 @@
+{
+  "project_id" : "rocksdb",
+  "conduit_uri" : "https://reviews.facebook.net/",
+  "copyright_holder" : "Facebook",
+  "load" : [
+    "linters"
+  ],
+  "lint.engine" : "FacebookFbcodeLintEngine",
+  "lint.engine.single.linter" : "FbcodeCppLinter"
+}
diff --git a/.clang-format b/.clang-format
new file mode 100644
index 0000000000..7c279811ac
--- /dev/null
+++ b/.clang-format
@@ -0,0 +1,5 @@
+# Complete list of style options can be found at:
+# http://clang.llvm.org/docs/ClangFormatStyleOptions.html
+---
+BasedOnStyle: Google
+...
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000000..cc5116ec76
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,34 @@
+TARGETS
+build_config.mk
+
+*.a
+*.arc
+*.d
+*.dylib*
+*.gcda
+*.gcno
+*.o
+*.so
+*.so.*
+*_test
+*_bench
+*_stress
+*.out
+*.class
+*.jar
+*.*jnilib*
+*.d-e
+*.o-*
+*.swp
+
+ldb
+manifest_dump
+sst_dump
+util/build_version.cc
+build_tools/VALGRIND_LOGS/
+coverage/COVERAGE_REPORT
+.gdbhistory
+.phutil_module_cache
+tags
+java/*.log
+java/include/org_rocksdb_*.h
diff --git a/.travis.yml b/.travis.yml
new file mode 100644
index 0000000000..66f37a5d28
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,20 @@
+language: cpp
+compiler: gcc
+before_install:
+# As of this writing (10 May 2014) the Travis build environment is Ubuntu 12.04,
+# which needs the following ugly dependency incantations to build RocksDB:
+  - sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test
+  - sudo apt-get update -qq
+  - sudo apt-get install -y -qq gcc-4.8 g++-4.8 zlib1g-dev libbz2-dev libsnappy-dev libjemalloc-dev
+  - sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-4.8 50
+  - sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-4.8 50
+  - wget https://gflags.googlecode.com/files/libgflags0_2.0-1_amd64.deb
+  - sudo dpkg -i libgflags0_2.0-1_amd64.deb
+  - wget https://gflags.googlecode.com/files/libgflags-dev_2.0-1_amd64.deb
+  - sudo dpkg -i libgflags-dev_2.0-1_amd64.deb
+# Lousy hack to disable use and testing of fallocate, which doesn't behave quite
+# as EnvPosixTest::AllocateTest expects within the Travis OpenVZ environment.
+  - sed -i "s/fallocate(/HACK_NO_fallocate(/" build_tools/build_detect_platform
+script: make check -j8
+notifications:
+  email: false
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
new file mode 100644
index 0000000000..7270d0c213
--- /dev/null
+++ b/CONTRIBUTING.md
@@ -0,0 +1,20 @@
+# Contributing to RocksDB
+
+## Contributor License Agreement ("CLA")
+
+In order to accept your pull request, we need you to submit a CLA. You
+only need to do this once, so if you've done this for another Facebook
+open source project, you're good to go. If you are submitting a pull
+request for the first time, just let us know that you have completed
+the CLA and we can cross-check with your GitHub username.
+
+Complete your CLA here:
+
+If you don't have a Facebook account, we can send you a PDF that you can
+sign offline. Send us an e-mail or create a new GitHub issue to
+request the CLA in PDF format.
+
+## License
+
+By contributing to RocksDB, you agree that your contributions will be
+licensed under the [BSD License](LICENSE).
diff --git a/HISTORY.md b/HISTORY.md
new file mode 100644
index 0000000000..9b014c1f20
--- /dev/null
+++ b/HISTORY.md
@@ -0,0 +1,89 @@
+# RocksDB Change Log
+
+## 3.1.0 (05/21/2014)
+
+### Public API changes
+* Replaced ColumnFamilyOptions::table_properties_collectors with ColumnFamilyOptions::table_properties_collector_factories
+
+### New Features
+* The hash index for block-based tables is now materialized and reconstructed more efficiently. Previously, the hash index was rebuilt by scanning the whole table every time the table was opened.
+* FIFO compaction style
+
+## 3.0.0 (05/05/2014)
+
+### Public API changes
+* Added _LEVEL to all InfoLogLevel enums
+* Deprecated ReadOptions.prefix and ReadOptions.prefix_seek. Seek() defaults to prefix-based seek when Options.prefix_extractor is supplied (see the sketch after this section). More detail is documented in https://github.com/facebook/rocksdb/wiki/Prefix-Seek-API-Changes
+* MemTableRepFactory::CreateMemTableRep() takes an info logger as an extra parameter.
+
+### New Features
+* Column family support
+* Added an option to use different checksum functions in BlockBasedTableOptions
+* Added ApplyToAllCacheEntries() function to Cache
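As a rough illustration of the 3.0.0 prefix-seek change above, here is a minimal sketch. It is not part of the patch itself; the 8-byte prefix length, the key `"user0001"`, and the `PrefixScan` helper are made up for the example.

```cpp
#include <memory>
#include <string>
#include "rocksdb/db.h"
#include "rocksdb/options.h"
#include "rocksdb/slice_transform.h"

// Sketch only: with Options.prefix_extractor set, Seek() performs a
// prefix-based seek rather than a total-order seek.
void PrefixScan(const std::string& path) {
  rocksdb::Options options;
  options.create_if_missing = true;
  // Assume keys start with a fixed 8-byte prefix (e.g. "user0001").
  options.prefix_extractor.reset(rocksdb::NewFixedPrefixTransform(8));

  rocksdb::DB* db;
  if (!rocksdb::DB::Open(options, path, &db).ok()) return;

  std::unique_ptr<rocksdb::Iterator> it(db->NewIterator(rocksdb::ReadOptions()));
  // Visit every key sharing the prefix "user0001".
  for (it->Seek("user0001"); it->Valid() && it->key().starts_with("user0001");
       it->Next()) {
    // use it->key() / it->value()
  }
  it.reset();  // release the iterator before closing the DB
  delete db;
}
```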
+## 2.8.0 (04/04/2014)
+
+* Removed arena.h from public header files.
+* By default, checksums are verified on every read from the database.
+* Changed the default value of several options, including: paranoid_checks=true, max_open_files=5000, level0_slowdown_writes_trigger=20, level0_stop_writes_trigger=24, disable_seek_compaction=true, max_background_flushes=1 and allow_mmap_writes=false
+* Added is_manual_compaction to CompactionFilter::Context
+* Added "virtual void WaitForJoin()" in class Env. The default implementation is a no-op.
+* Removed BackupEngine::DeleteBackupsNewerThan() function
+* Added new option -- verify_checksums_in_compaction
+* Changed Options.prefix_extractor from raw pointer to shared_ptr (takes ownership)
+  Changed the HashSkipListRepFactory and HashLinkListRepFactory constructors to not take a SliceTransform object (they use Options.prefix_extractor implicitly)
+* Added Env::GetThreadPoolQueueLen(), which returns the waiting queue length of thread pools
+* Added a command "checkconsistency" in ldb tool, which checks
+  if file system state matches DB state (file existence and file sizes)
+* Separated options related to block based tables into a new struct, BlockBasedTableOptions.
+* WriteBatch has a new function Count() to return the total number of entries in the batch, and Data() now returns a reference instead of a copy (see the sketch after this change log)
+* Added more counters to perf context.
+* Supports several more DB properties: compaction-pending, background-errors and cur-size-active-mem-table.
+
+### New Features
+* If we find one truncated record at the end of the MANIFEST or WAL files,
+  we will ignore it. We assume that writers of these records were interrupted
+  and that the record can safely be ignored.
+* A new SST format, "PlainTable", is added, which is optimized for memory-only workloads. It can be created through NewPlainTableFactory() or NewTotalOrderPlainTableFactory().
+* A new memtable implementation, hash linked list, optimized for the case where there are only a few keys for each prefix. It can be created through NewHashLinkListRepFactory().
+* Merge operator supports a new function, PartialMergeMulti(), to allow users to do partial merges against multiple operands.
+* The compaction filter now has a V2 interface. It buffers the kv-pairs sharing the same key prefix, processes them in batches, and returns the batched results back to the DB. The new interface uses a new structure, CompactionFilterContext, for the same purpose as CompactionFilter::Context in V1.
+* Geo-spatial support for locations and radial search.
+
+## 2.7.0 (01/28/2014)
+
+### Public API changes
+
+* Renamed `StackableDB::GetRawDB()` to `StackableDB::GetBaseDB()`.
+* Renamed `WriteBatch::Data()` to `const std::string& Data() const`.
+* Renamed class `TableStats` to `TableProperties`.
+* Deleted class `PrefixHashRepFactory`. Please use `NewHashSkipListRepFactory()` instead.
+* Supported multi-threaded `EnableFileDeletions()` and `DisableFileDeletions()`.
+* Added `DB::GetOptions()`.
+* Added `DB::GetDbIdentity()`.
+
+### New Features
+
+* Added [BackupableDB](https://github.com/facebook/rocksdb/wiki/How-to-backup-RocksDB%3F)
+* Implemented [TailingIterator](https://github.com/facebook/rocksdb/wiki/Tailing-Iterator), a special type of iterator that
+  doesn't create a snapshot (so it can be used to read newly inserted data)
+  and is optimized for doing sequential reads.
+* Added a property block for tables, which allows (1) a table to store
+  its metadata and (2) end users to collect and store properties they
+  are interested in.
+* Enabled caching index and filter blocks in the block cache (turned off by default).
+* Supported error reporting when doing manual compaction.
+* Supported additional Linux platform flavors and Mac OS.
+* Put with `SliceParts` - a variant of `Put()` that gathers output like `writev(2)`
+* Bug fixes and code refactoring for compatibility with the upcoming Column
+  Family feature.
+
+### Performance Improvements
+
+* Huge benchmark performance improvements by multiple efforts. For example, increase in readonly QPS from about 530k in the 2.6 release to 1.1 million in 2.7 [1]
+* Sped up the way RocksDB deletes obsolete files - no longer listing the whole directory under a lock -- decrease in p99
+* Use raw pointer instead of shared pointer for statistics: [5b825d](https://github.com/facebook/rocksdb/commit/5b825d6964e26ec3b4bb6faa708ebb1787f1d7bd) -- huge increase in performance -- shared pointers are slow
+* Optimized locking for `Get()` -- [1fdb3f](https://github.com/facebook/rocksdb/commit/1fdb3f7dc60e96394e3e5b69a46ede5d67fb976c) -- 1.5x QPS increase for some workloads
+* Cache speedup - [e8d40c3](https://github.com/facebook/rocksdb/commit/e8d40c31b3cca0c3e1ae9abe9b9003b1288026a9)
+* Implemented autovector, which allocates the first N elements on the stack. Most vectors in RocksDB are small. Also, we never want to allocate heap objects while holding a mutex. -- [c01676e4](https://github.com/facebook/rocksdb/commit/c01676e46d3be08c3c140361ef1f5884f47d3b3c)
+* Lots of effort to move malloc, memcpy and IO outside of locks
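To make the 2.8.0 `WriteBatch` notes above concrete, a small sketch. The `AtomicMove` helper and key names are hypothetical; it assumes `db` points to an already-open database.

```cpp
#include <string>
#include "rocksdb/db.h"
#include "rocksdb/write_batch.h"

// Sketch only: apply a delete and a put atomically, then inspect the batch.
rocksdb::Status AtomicMove(rocksdb::DB* db) {
  rocksdb::WriteBatch batch;
  batch.Delete("old_key");        // remove the old entry
  batch.Put("new_key", "value");  // add the replacement
  int entries = batch.Count();    // 2: one Delete plus one Put
  const std::string& rep = batch.Data();  // now a reference, not a copy
  (void)entries;
  (void)rep;
  return db->Write(rocksdb::WriteOptions(), &batch);
}
```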
diff --git a/INSTALL.md b/INSTALL.md
new file mode 100644
index 0000000000..a4ae08f104
--- /dev/null
+++ b/INSTALL.md
@@ -0,0 +1,84 @@
+## Compilation
+
+RocksDB's library should be able to compile without any dependencies installed,
+although we recommend installing some compression libraries (see below).
+We do depend on a newer gcc with C++11 support.
+
+There are a few options when compiling RocksDB:
+
+* [recommended] `make static_lib` will compile librocksdb.a, the RocksDB static library.
+
+* `make shared_lib` will compile librocksdb.so, the RocksDB shared library.
+
+* `make check` will compile and run all the unit tests.
+
+* `make all` will compile our static library, and all our tools and unit tests. Our tools
+depend on gflags. You will need to have gflags installed to run `make all`.
+
+## Dependencies
+
+* You can link RocksDB with the following compression libraries:
+  - [zlib](http://www.zlib.net/) - a library for data compression.
+  - [bzip2](http://www.bzip.org/) - a library for data compression.
+  - [snappy](https://code.google.com/p/snappy/) - a library for fast
+    data compression.
+
+* All our tools depend on:
+  - [gflags](https://code.google.com/p/gflags/) - a library that handles
+    command line flags processing. You can compile the rocksdb library even
+    if you don't have gflags installed.
+
+## Supported platforms
+
+* **Linux - Ubuntu**
+    * Upgrade your gcc to at least version 4.7 to get C++11 support.
+    * Install gflags. First, try: `sudo apt-get install libgflags-dev`
+      If this doesn't work and you're using Ubuntu, here's a nice tutorial:
+      (http://askubuntu.com/questions/312173/installing-gflags-12-04)
+    * Install snappy. This is usually as easy as:
+      `sudo apt-get install libsnappy-dev`.
+    * Install zlib. Try: `sudo apt-get install zlib1g-dev`.
+    * Install bzip2: `sudo apt-get install libbz2-dev`.
+* **Linux - CentOS**
+    * Upgrade your gcc to at least version 4.7 to get C++11 support:
+      `yum install gcc47-c++`
+    * Install gflags:
+
+        wget https://gflags.googlecode.com/files/gflags-2.0-no-svn-files.tar.gz
+        tar -xzvf gflags-2.0-no-svn-files.tar.gz
+        cd gflags-2.0
+        ./configure && make && sudo make install
+
+    * Install snappy:
+
+        wget https://snappy.googlecode.com/files/snappy-1.1.1.tar.gz
+        tar -xzvf snappy-1.1.1.tar.gz
+        cd snappy-1.1.1
+        ./configure && make && sudo make install
+
+    * Install zlib:
+
+        sudo yum install zlib
+        sudo yum install zlib-devel
+
+    * Install bzip2:
+
+        sudo yum install bzip2
+        sudo yum install bzip2-devel
+
+* **OS X**:
+    * Install the latest C++ compiler that supports C++11:
+        * Update Xcode: run `xcode-select --install` (or install it from the Xcode app's settings).
+        * Install via [homebrew](http://brew.sh/).
+            * If you're a first-time developer on Mac OS, you still need to run `xcode-select --install` in your command line.
+            * Run `brew tap homebrew/dupes; brew install gcc47 --use-llvm` to install gcc 4.7 (or higher).
+    * Install zlib, bzip2 and snappy libraries for compression.
+    * Install gflags. We have included a script
+      `build_tools/mac-install-gflags.sh`, which should automatically install it.
+      If you installed gflags by other means (for example, `brew install gflags`),
+      please set `LIBRARY_PATH` and `CPATH` accordingly.
+    * Please note that some of the optimizations/features are disabled on OS X.
+      We did not run any production workloads on it.
+
+* **iOS**:
+  * Run: `TARGET_OS=IOS make static_lib`
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..b1329018690188a5419af36f960a3e804a427eb8
GIT binary patch
literal 1646
[binary data truncated]
$(VALGRIND_DIR)/valgrind_failed_tests; \
+	echo TIMES in seconds TAKEN BY TESTS ON VALGRIND > $(VALGRIND_DIR)/valgrind_tests_times; \
+	for t in $(filter-out skiplist_test,$(TESTS)); do \
+		stime=`date '+%s'`; \
+		$(VALGRIND_VER) $(VALGRIND_OPTS) ./$$t; \
+		if [ $$? -eq $(VALGRIND_ERROR) ] ; then \
+			echo $$t >> $(VALGRIND_DIR)/valgrind_failed_tests; \
+		fi; \
+		etime=`date '+%s'`; \
+		echo $$t $$((etime - stime)) >> $(VALGRIND_DIR)/valgrind_tests_times; \
+	done
+
+clean:
+	-rm -f $(PROGRAMS) $(TESTS) $(LIBRARY) $(SHARED) $(MEMENVLIBRARY) build_config.mk
+	-rm -rf ios-x86/* ios-arm/*
+	-find . -name "*.[od]" -exec rm {} \;
+	-find . -type f -regex ".*\.\(\(gcda\)\|\(gcno\)\)" -exec rm {} \;
+tags:
+	ctags * -R
+	cscope -b `find . -name '*.cc'` `find . -name '*.h'`
+
+format:
+	build_tools/format-diff.sh
+
+# ---------------------------------------------------------------------------
+# Unit tests and tools
+# ---------------------------------------------------------------------------
+$(LIBRARY): $(LIBOBJECTS)
+	rm -f $@
+	$(AR) -rs $@ $(LIBOBJECTS)
+
+db_bench: db/db_bench.o $(LIBOBJECTS) $(TESTUTIL)
+	$(CXX) db/db_bench.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+block_hash_index_test: table/block_hash_index_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) table/block_hash_index_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+db_stress: tools/db_stress.o $(LIBOBJECTS) $(TESTUTIL)
+	$(CXX) tools/db_stress.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+db_sanity_test: tools/db_sanity_test.o $(LIBOBJECTS) $(TESTUTIL)
+	$(CXX) tools/db_sanity_test.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+db_repl_stress: tools/db_repl_stress.o $(LIBOBJECTS) $(TESTUTIL)
+	$(CXX) tools/db_repl_stress.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+blob_store_bench: tools/blob_store_bench.o $(LIBOBJECTS) $(TESTUTIL)
+	$(CXX) tools/blob_store_bench.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+signal_test: util/signal_test.o $(LIBOBJECTS)
+	$(CXX) util/signal_test.o $(LIBOBJECTS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+arena_test: util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+autovector_test: util/autovector_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) util/autovector_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+column_family_test: db/column_family_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) db/column_family_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+table_properties_collector_test: db/table_properties_collector_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) db/table_properties_collector_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+bloom_test: util/bloom_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) util/bloom_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+dynamic_bloom_test: util/dynamic_bloom_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) util/dynamic_bloom_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+c_test: db/c_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) db/c_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+cache_test: util/cache_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) util/cache_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+coding_test: util/coding_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) util/coding_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+blob_store_test: util/blob_store_test.o $(LIBOBJECTS) $(TESTHARNESS) $(TESTUTIL)
+	$(CXX) util/blob_store_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o$@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+stringappend_test: utilities/merge_operators/string_append/stringappend_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) utilities/merge_operators/string_append/stringappend_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+redis_test: utilities/redis/redis_lists_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) utilities/redis/redis_lists_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+benchharness_test: util/benchharness_test.o $(LIBOBJECTS) $(TESTHARNESS) $(BENCHHARNESS)
+	$(CXX) util/benchharness_test.o $(LIBOBJECTS) $(TESTHARNESS) $(BENCHHARNESS) $(EXEC_LDFLAGS) -o$@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+histogram_test: util/histogram_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) util/histogram_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o$@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+thread_local_test: util/thread_local_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) util/thread_local_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+corruption_test: db/corruption_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) db/corruption_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+crc32c_test: util/crc32c_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) util/crc32c_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+db_test: db/db_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) db/db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+log_write_bench: util/log_write_bench.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) util/log_write_bench.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) -pg
+
+plain_table_db_test: db/plain_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) db/plain_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+simple_table_db_test: db/simple_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) db/simple_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+table_reader_bench: table/table_reader_bench.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) table/table_reader_bench.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) -pg
+
+log_and_apply_bench: db/log_and_apply_bench.o $(LIBOBJECTS) $(TESTHARNESS) $(BENCHHARNESS)
+	$(CXX) db/log_and_apply_bench.o $(LIBOBJECTS) $(TESTHARNESS) $(BENCHHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) -pg
+
+perf_context_test: db/perf_context_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) db/perf_context_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS)
+
+prefix_test: db/prefix_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) db/prefix_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS)
+
+backupable_db_test: utilities/backupable/backupable_db_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) utilities/backupable/backupable_db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+ttl_test: utilities/ttl/ttl_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) utilities/ttl/ttl_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+dbformat_test: db/dbformat_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) db/dbformat_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+env_test: util/env_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) util/env_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+filename_test: db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+filter_block_test: table/filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) table/filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+log_test: db/log_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) db/log_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+table_test: table/table_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) table/table_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+block_test: table/block_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) table/block_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+skiplist_test: db/skiplist_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) db/skiplist_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+version_edit_test: db/version_edit_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) db/version_edit_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+version_set_test: db/version_set_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) db/version_set_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+file_indexer_test : db/file_indexer_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) db/file_indexer_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+reduce_levels_test: tools/reduce_levels_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) tools/reduce_levels_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+write_batch_test: db/write_batch_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) db/write_batch_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+merge_test: db/merge_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) db/merge_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+deletefile_test: db/deletefile_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) db/deletefile_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS)
+
+geodb_test: utilities/geodb/geodb_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) utilities/geodb/geodb_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+$(MEMENVLIBRARY) : $(MEMENVOBJECTS)
+	rm -f $@
+	$(AR) -rs $@ $(MEMENVOBJECTS)
+
+memenv_test : helpers/memenv/memenv_test.o $(MEMENVOBJECTS) $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) helpers/memenv/memenv_test.o $(MEMENVOBJECTS) $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+manual_compaction_test: util/manual_compaction_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) util/manual_compaction_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+rocksdb_shell: tools/shell/ShellContext.o tools/shell/ShellState.o tools/shell/LeveldbShell.o tools/shell/DBClientProxy.o tools/shell/ShellContext.h tools/shell/ShellState.h tools/shell/DBClientProxy.h $(LIBOBJECTS)
+	$(CXX) tools/shell/ShellContext.o tools/shell/ShellState.o tools/shell/LeveldbShell.o tools/shell/DBClientProxy.o $(LIBOBJECTS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+DBClientProxy_test: tools/shell/test/DBClientProxyTest.o tools/shell/DBClientProxy.o $(LIBRARY)
+	$(CXX) tools/shell/test/DBClientProxyTest.o tools/shell/DBClientProxy.o $(LIBRARY) $(EXEC_LDFLAGS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+filelock_test: util/filelock_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) util/filelock_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+auto_roll_logger_test: util/auto_roll_logger_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) util/auto_roll_logger_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+sst_dump: tools/sst_dump.o $(LIBOBJECTS)
+	$(CXX) tools/sst_dump.o $(LIBOBJECTS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+ldb: tools/ldb.o $(LIBOBJECTS)
+	$(CXX) tools/ldb.o $(LIBOBJECTS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+# ---------------------------------------------------------------------------
+# Jni stuff
+# ---------------------------------------------------------------------------
+
+JNI_NATIVE_SOURCES = ./java/rocksjni/*.cc
+JAVA_INCLUDE = -I$(JAVA_HOME)/include/ -I$(JAVA_HOME)/include/linux
+ROCKSDBJNILIB = ./java/librocksdbjni.so
+
+ifeq ($(PLATFORM), OS_MACOSX)
+ROCKSDBJNILIB = ./java/librocksdbjni.jnilib
+JAVA_INCLUDE = -I/System/Library/Frameworks/JavaVM.framework/Headers/
+endif
+
+rocksdbjava: clean
+	OPT="-fPIC -DNDEBUG -O2" $(MAKE) $(LIBRARY) -j32
+	cd java;$(MAKE) java;
+	rm -f $(ROCKSDBJNILIB)
+	$(CXX) $(CXXFLAGS) -I./java/. $(JAVA_INCLUDE) -shared -fPIC -o $(ROCKSDBJNILIB) $(JNI_NATIVE_SOURCES) $(LIBOBJECTS) $(LDFLAGS) $(COVERAGEFLAGS)
+
+jclean:
+	cd java;$(MAKE) clean;
+	rm -f $(ROCKSDBJNILIB)
+
+jtest:
+	cd java;$(MAKE) sample;$(MAKE) test;
+
+jdb_bench:
+	cd java;$(MAKE) db_bench;
+
+# ---------------------------------------------------------------------------
+# Platform-specific compilation
+# ---------------------------------------------------------------------------
+
+ifeq ($(PLATFORM), IOS)
+# For iOS, create universal object files to be used on both the simulator and
+# a device.
+PLATFORMSROOT=/Applications/Xcode.app/Contents/Developer/Platforms
+SIMULATORROOT=$(PLATFORMSROOT)/iPhoneSimulator.platform/Developer
+DEVICEROOT=$(PLATFORMSROOT)/iPhoneOS.platform/Developer
+IOSVERSION=$(shell defaults read $(PLATFORMSROOT)/iPhoneOS.platform/version CFBundleShortVersionString)
+
+.cc.o:
+	mkdir -p ios-x86/$(dir $@)
+	$(CXX) $(CXXFLAGS) -isysroot $(SIMULATORROOT)/SDKs/iPhoneSimulator$(IOSVERSION).sdk -arch i686 -arch x86_64 -c $< -o ios-x86/$@
+	mkdir -p ios-arm/$(dir $@)
+	xcrun -sdk iphoneos $(CXX) $(CXXFLAGS) -isysroot $(DEVICEROOT)/SDKs/iPhoneOS$(IOSVERSION).sdk -arch armv6 -arch armv7 -arch armv7s -arch arm64 -c $< -o ios-arm/$@
+	lipo ios-x86/$@ ios-arm/$@ -create -output $@
+
+.c.o:
+	mkdir -p ios-x86/$(dir $@)
+	$(CC) $(CFLAGS) -isysroot $(SIMULATORROOT)/SDKs/iPhoneSimulator$(IOSVERSION).sdk -arch i686 -arch x86_64 -c $< -o ios-x86/$@
+	mkdir -p ios-arm/$(dir $@)
+	xcrun -sdk iphoneos $(CC) $(CFLAGS) -isysroot $(DEVICEROOT)/SDKs/iPhoneOS$(IOSVERSION).sdk -arch armv6 -arch armv7 -arch armv7s -arch arm64 -c $< -o ios-arm/$@
+	lipo ios-x86/$@ ios-arm/$@ -create -output $@
+
+else
+.cc.o:
+	$(CXX) $(CXXFLAGS) -c $< -o $@ $(COVERAGEFLAGS)
+
+.c.o:
+	$(CC) $(CFLAGS) -c $< -o $@
+endif
+
+# ---------------------------------------------------------------------------
+# Source files dependencies detection
+# ---------------------------------------------------------------------------
+
+# Add proper dependency support so changing a .h file forces a .cc file to
+# rebuild.
+
+# The .d file indicates .cc file's dependencies on .h files. We generate such
+# dependency by g++'s -MM option, whose output is a make dependency rule.
+# The sed command makes sure the "target" file in the generated .d file has
+# the correct path prefix.
+%.d: %.cc
+	$(CXX) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) -MM $< -o $@
+ifeq ($(PLATFORM), OS_MACOSX)
+	@sed -i '' -e 's,.*:,$*.o:,' $@
+else
+	@sed -i -e 's,.*:,$*.o:,' $@
+endif
+
+DEPFILES = $(filter-out util/build_version.d,$(SOURCES:.cc=.d))
+
+depend: $(DEPFILES)
+
+# if the make goal is either "clean" or "format", we shouldn't
+# try to import the *.d files.
+# TODO(kailiu) The unfamiliarity of Make's conditions leads to the ugly
+# working solution.
+ifneq ($(MAKECMDGOALS),clean)
+ifneq ($(MAKECMDGOALS),format)
+ifneq ($(MAKECMDGOALS),jclean)
+ifneq ($(MAKECMDGOALS),jtest)
+-include $(DEPFILES)
+endif
+endif
+endif
+endif
diff --git a/PATENTS b/PATENTS
new file mode 100644
index 0000000000..8a6fca4d2b
--- /dev/null
+++ b/PATENTS
@@ -0,0 +1,23 @@
+Additional Grant of Patent Rights
+
+“Software” means the rocksdb software distributed by Facebook, Inc.
+
+Facebook hereby grants you a perpetual, worldwide, royalty-free,
+non-exclusive, irrevocable (subject to the termination provision below)
+license under any rights in any patent claims owned by Facebook, to make,
+have made, use, sell, offer to sell, import, and otherwise transfer the
+Software. For avoidance of doubt, no license is granted under Facebook’s
+rights in any patent claims that are infringed by (i) modifications to the
+Software made by you or a third party, or (ii) the Software in combination
+with any software or other technology provided by you or a third party.
+ +The license granted hereunder will terminate, automatically and without +notice, for anyone that makes any claim (including by filing any lawsuit, +assertion or other action) alleging (a) direct, indirect, or contributory +infringement or inducement to infringe any patent: (i) by Facebook or any +of its subsidiaries or affiliates, whether or not such claim is related +to the Software, (ii) by any party if such claim arises in whole or in +part from any software, product or service of Facebook or any of its +subsidiaries or affiliates, whether or not such claim is related to the +Software, or (iii) by any party relating to the Software; or (b) that +any right in any patent claim of Facebook is invalid or unenforceable. diff --git a/README.md b/README.md new file mode 100644 index 0000000000..fabced9a61 --- /dev/null +++ b/README.md @@ -0,0 +1,26 @@ +## RocksDB: A Persistent Key-Value Store for Flash and RAM Storage + +[![Build Status](https://travis-ci.org/facebook/rocksdb.svg?branch=master)](https://travis-ci.org/facebook/rocksdb) + +RocksDB is developed and maintained by the Facebook Database Engineering Team. +It is built on earlier work on LevelDB by Sanjay Ghemawat (sanjay@google.com) +and Jeff Dean (jeff@google.com). + +This code is a library that forms the core building block for a fast +key-value server, especially suited for storing data on flash drives. +It has a Log-Structured-Merge-Database (LSM) design with flexible tradeoffs +between Write-Amplification-Factor (WAF), Read-Amplification-Factor (RAF) +and Space-Amplification-Factor (SAF). It has multi-threaded compactions, +making it especially suitable for storing multiple terabytes of data in a +single database. + +Start with example usage here: https://github.com/facebook/rocksdb/tree/master/examples + +See [doc/index.html](https://github.com/facebook/rocksdb/blob/master/doc/index.html) and the +[github wiki](https://github.com/facebook/rocksdb/wiki) for more explanation. + +The public interface is in `include/`. Callers should not include or +rely on the details of any other header files in this package. Those +internal APIs may be changed without warning. + +Design discussions are conducted in https://www.facebook.com/groups/rocksdb.dev/ diff --git a/ROCKSDB_LITE.md b/ROCKSDB_LITE.md new file mode 100644 index 0000000000..e7e3752c8c --- /dev/null +++ b/ROCKSDB_LITE.md @@ -0,0 +1,20 @@ +# RocksDBLite + +RocksDBLite is a project focused on mobile use cases, which don't need a lot of the fancy things we've built for server workloads and which are very sensitive to binary size. For that reason, we added a compile flag ROCKSDB_LITE that compiles out a lot of the nonessential code and keeps the binary lean. + +Some examples of the features disabled by ROCKSDB_LITE: +* No compiled-in support for the LDB tool +* No backupable DB +* No support for replication (which we provide in the form of TransactionalIterator) +* No advanced monitoring tools +* No special-purpose memtables that are highly optimized for specific use cases + +When adding a big new feature to RocksDB, please add a ROCKSDB_LITE compile guard if: +* Nobody from mobile really needs your feature, +* Your feature is adding a lot of weight to the binary. + +Don't add a ROCKSDB_LITE compile guard if: +* It would introduce a lot of code complexity. Compile guards make code harder to read. It's a trade-off. +* Your feature is not adding a lot of weight. + +If unsure, ask. :)
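+ +As a sketch (ROCKSDB_LITE is the real macro; the file and function names below are made-up placeholders), a lite-excluded feature typically looks like this: + +    // my_big_feature.cc (hypothetical) +    #ifndef ROCKSDB_LITE +    void EnableBigFeature() { /* heavyweight, server-only code */ } +    #endif  // !ROCKSDB_LITE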
diff --git a/build_tools/build_detect_platform b/build_tools/build_detect_platform new file mode 100755 index 0000000000..c8ed00487a --- /dev/null +++ b/build_tools/build_detect_platform @@ -0,0 +1,320 @@ +#!/bin/sh +# +# Detects the OS we're compiling on and outputs a file specified by the first +# argument, which in turn gets read while processing Makefile. +# +# The output will set the following variables: +# CC C Compiler path +# CXX C++ Compiler path +# PLATFORM_LDFLAGS Linker flags +# PLATFORM_SHARED_EXT Extension for shared libraries +# PLATFORM_SHARED_LDFLAGS Flags for building shared library +# PLATFORM_SHARED_CFLAGS Flags for compiling objects for shared library +# PLATFORM_CCFLAGS C compiler flags +# PLATFORM_CXXFLAGS C++ compiler flags +# PLATFORM_SHARED_VERSIONED Set to 'true' if platform supports versioned +# shared libraries, empty otherwise. +# +# The PLATFORM_CCFLAGS and PLATFORM_CXXFLAGS might include the following: +# +# -DLEVELDB_PLATFORM_POSIX if cstdatomic is present +# -DLEVELDB_PLATFORM_NOATOMIC if it is not +# -DSNAPPY if the Snappy library is present +# -DLZ4 if the LZ4 library is present +# +# Using gflags in rocksdb: +# Our project depends on gflags, which requires users to take some extra steps +# before they can compile the whole repository: +# 1. Install gflags. You may download it from here: +# https://code.google.com/p/gflags/ +# 2. Once installed, add the include path and lib path for gflags to CPATH +# and LIBRARY_PATH respectively. If installed in default mode, the lib and +# include paths will be /usr/local/lib and /usr/local/include. +# Mac users can do this by running build_tools/mac-install-gflags.sh + +OUTPUT=$1 +if test -z "$OUTPUT"; then + echo "usage: $0 <output-filename>" >&2 + exit 1 +fi + +# we depend on C++11 +PLATFORM_CXXFLAGS="-std=c++11" +# we currently depend on POSIX platform +COMMON_FLAGS="-DROCKSDB_PLATFORM_POSIX" + +# Default to fbcode gcc on internal fb machines +if [ -d /mnt/gvfs/third-party -a -z "$CXX" ]; then + FBCODE_BUILD="true" + if [ -z "$USE_CLANG" ]; then + CENTOS_VERSION=`rpm -q --qf "%{VERSION}" \ + $(rpm -q --whatprovides redhat-release)` + if [ "$CENTOS_VERSION" = "6" ]; then + source "$PWD/build_tools/fbcode.gcc481.sh" + else + source "$PWD/build_tools/fbcode.gcc471.sh" + fi + else + source "$PWD/build_tools/fbcode.clang31.sh" + fi +fi + +# Delete existing output, if it exists +rm -f "$OUTPUT" +touch "$OUTPUT" + +if test -z "$CC"; then + CC=cc +fi + +if test -z "$CXX"; then + CXX=g++ +fi + +# Detect OS +if test -z "$TARGET_OS"; then + TARGET_OS=`uname -s` +fi + +COMMON_FLAGS="$COMMON_FLAGS ${CFLAGS}" +CROSS_COMPILE= +PLATFORM_CCFLAGS= +PLATFORM_CXXFLAGS="$PLATFORM_CXXFLAGS ${CXXFLAGS}" +PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS" +PLATFORM_SHARED_EXT="so" +PLATFORM_SHARED_LDFLAGS="-shared -Wl,-soname -Wl," +PLATFORM_SHARED_CFLAGS="-fPIC" +PLATFORM_SHARED_VERSIONED=false + +# generic port files (working on all platforms by #ifdef) go directly in /port +GENERIC_PORT_FILES=`cd "$ROCKSDB_ROOT"; find port -name '*.cc' | tr "\n" " "` + +# On GCC, we pick libc's memcmp over GCC's memcmp via -fno-builtin-memcmp +case "$TARGET_OS" in + Darwin) + PLATFORM=OS_MACOSX + COMMON_FLAGS="$COMMON_FLAGS -DOS_MACOSX" + PLATFORM_SHARED_EXT=dylib + PLATFORM_SHARED_LDFLAGS="-dynamiclib -install_name " + # PORT_FILES=port/darwin/darwin_specific.cc + ;; + IOS) + PLATFORM=IOS + COMMON_FLAGS="$COMMON_FLAGS -DOS_MACOSX -DIOS_CROSS_COMPILE -DROCKSDB_LITE" + PLATFORM_SHARED_EXT=dylib + PLATFORM_SHARED_LDFLAGS="-dynamiclib -install_name " + CROSS_COMPILE=true + ;; + 
Linux) + PLATFORM=OS_LINUX + COMMON_FLAGS="$COMMON_FLAGS -DOS_LINUX" + if [ -z "$USE_CLANG" ]; then + COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp" + fi + PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread -lrt" + # PORT_FILES=port/linux/linux_specific.cc + ;; + SunOS) + PLATFORM=OS_SOLARIS + COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_SOLARIS" + PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread -lrt" + # PORT_FILES=port/sunos/sunos_specific.cc + ;; + FreeBSD) + PLATFORM=OS_FREEBSD + COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_FREEBSD" + PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread" + # PORT_FILES=port/freebsd/freebsd_specific.cc + ;; + NetBSD) + PLATFORM=OS_NETBSD + COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_NETBSD" + PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread -lgcc_s" + # PORT_FILES=port/netbsd/netbsd_specific.cc + ;; + OpenBSD) + PLATFORM=OS_OPENBSD + COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_OPENBSD" + PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -pthread" + # PORT_FILES=port/openbsd/openbsd_specific.cc + ;; + DragonFly) + PLATFORM=OS_DRAGONFLYBSD + COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_DRAGONFLYBSD" + PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread" + # PORT_FILES=port/dragonfly/dragonfly_specific.cc + ;; + OS_ANDROID_CROSSCOMPILE) + PLATFORM=OS_ANDROID + COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_ANDROID -DLEVELDB_PLATFORM_POSIX" + PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS " # All pthread features are in the Android C library + # PORT_FILES=port/android/android.cc + CROSS_COMPILE=true + ;; + *) + echo "Unknown platform!" >&2 + exit 1 +esac + +if test -z "$DO_NOT_RUN_BUILD_DETECT_VERSION"; then + "$PWD/build_tools/build_detect_version" +fi + +# We want to make a list of all cc files within util, db, table, and utilities +# except for the test and benchmark files. By default, find will output a list +# of all files matching either rule, so we need to append -print to make the +# prune take effect. +DIRS="util db table utilities" + +set -f # temporarily disable globbing so that our patterns aren't expanded +PRUNE_TEST="-name *test*.cc -prune" +PRUNE_BENCH="-name *bench*.cc -prune" +PORTABLE_FILES=`cd "$ROCKSDB_ROOT"; find $DIRS $PRUNE_TEST -o $PRUNE_BENCH -o -name '*.cc' -print | sort | tr "\n" " "` +PORTABLE_CPP=`cd "$ROCKSDB_ROOT"; find $DIRS $PRUNE_TEST -o $PRUNE_BENCH -o -name '*.cpp' -print | sort | tr "\n" " "` +set +f # re-enable globbing + +# The sources consist of the portable files, plus the platform-specific port +# file. +echo "SOURCES=$PORTABLE_FILES $GENERIC_PORT_FILES $PORT_FILES" >> "$OUTPUT" +echo "SOURCESCPP=$PORTABLE_CPP" >> "$OUTPUT" +echo "MEMENV_SOURCES=helpers/memenv/memenv.cc" >> "$OUTPUT" + +if [ "$CROSS_COMPILE" = "true" -o "$FBCODE_BUILD" = "true" ]; then + # Cross-compiling; do not try any compilation tests. + # Also don't need any compilation tests if compiling on fbcode + true +else + # If -std=c++0x works, use <cstdatomic>. Otherwise use port_posix.h. + $CXX $CFLAGS -std=c++0x -x c++ - -o /dev/null 2>/dev/null <<EOF + #include <cstdatomic> + int main() {} +EOF + if [ "$?" = 0 ]; then + COMMON_FLAGS="$COMMON_FLAGS -DROCKSDB_ATOMIC_PRESENT" + fi + + # Test whether fallocate is available + $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null <<EOF + #include <fcntl.h> + int main() { + int fd = open("/dev/null", 0); + fallocate(fd, 0, 0, 1024); + } +EOF + if [ "$?"
= 0 ]; then + COMMON_FLAGS="$COMMON_FLAGS -DROCKSDB_FALLOCATE_PRESENT" + fi + + # Test whether Snappy library is installed + # http://code.google.com/p/snappy/ + $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null <<EOF + #include <snappy.h> + int main() {} +EOF + if [ "$?" = 0 ]; then + COMMON_FLAGS="$COMMON_FLAGS -DSNAPPY" + PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lsnappy" + fi + + + # Test whether gflags library is installed + # http://code.google.com/p/gflags/ + # check if the namespace is gflags + $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null <<EOF + #include <gflags/gflags.h> + using namespace gflags; + int main() {} +EOF + if [ "$?" = 0 ]; then + COMMON_FLAGS="$COMMON_FLAGS -DGFLAGS=gflags" + PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lgflags" + fi + + # check if namespace is google + $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null <<EOF + #include <gflags/gflags.h> + using namespace google; + int main() {} +EOF + if [ "$?" = 0 ]; then + COMMON_FLAGS="$COMMON_FLAGS -DGFLAGS=google" + PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lgflags" + fi + + # Test whether zlib library is installed + $CXX $CFLAGS $COMMON_FLAGS -x c++ - -o /dev/null 2>/dev/null <<EOF + #include <zlib.h> + int main() {} +EOF + if [ "$?" = 0 ]; then + COMMON_FLAGS="$COMMON_FLAGS -DZLIB" + PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lz" + fi + + # Test whether bzip library is installed + $CXX $CFLAGS $COMMON_FLAGS -x c++ - -o /dev/null 2>/dev/null <<EOF + #include <bzlib.h> + int main() {} +EOF + if [ "$?" = 0 ]; then + COMMON_FLAGS="$COMMON_FLAGS -DBZIP2" + PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lbz2" + fi + + # Test whether lz4 library is installed + $CXX $CFLAGS $COMMON_FLAGS -x c++ - -o /dev/null 2>/dev/null <<EOF + #include <lz4.h> + #include <lz4hc.h> + int main() {} +EOF + if [ "$?" = 0 ]; then + COMMON_FLAGS="$COMMON_FLAGS -DLZ4" + PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -llz4" + fi + + # Test whether tcmalloc is available + $CXX $CFLAGS -x c++ - -o /dev/null -ltcmalloc 2>/dev/null <<EOF + int main() {} +EOF + if [ "$?" = 0 ]; then + PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -ltcmalloc" + fi +fi + +PLATFORM_CCFLAGS="$PLATFORM_CCFLAGS $COMMON_FLAGS" +PLATFORM_CXXFLAGS="$PLATFORM_CXXFLAGS $COMMON_FLAGS" + +echo "CC=$CC" >> "$OUTPUT" +echo "CXX=$CXX" >> "$OUTPUT" +echo "PLATFORM=$PLATFORM" >> "$OUTPUT" +echo "PLATFORM_LDFLAGS=$PLATFORM_LDFLAGS" >> "$OUTPUT" +echo "VALGRIND_VER=$VALGRIND_VER" >> "$OUTPUT" +echo "PLATFORM_CCFLAGS=$PLATFORM_CCFLAGS" >> "$OUTPUT" +echo "PLATFORM_CXXFLAGS=$PLATFORM_CXXFLAGS" >> "$OUTPUT" +echo "PLATFORM_SHARED_CFLAGS=$PLATFORM_SHARED_CFLAGS" >> "$OUTPUT" +echo "PLATFORM_SHARED_EXT=$PLATFORM_SHARED_EXT" >> "$OUTPUT" +echo "PLATFORM_SHARED_LDFLAGS=$PLATFORM_SHARED_LDFLAGS" >> "$OUTPUT" +echo "PLATFORM_SHARED_VERSIONED=$PLATFORM_SHARED_VERSIONED" >> "$OUTPUT" +echo "EXEC_LDFLAGS=$EXEC_LDFLAGS" >> "$OUTPUT" +echo "JEMALLOC_INCLUDE=$JEMALLOC_INCLUDE" >> "$OUTPUT" +echo "JEMALLOC_LIB=$JEMALLOC_LIB" >> "$OUTPUT" diff --git a/build_tools/build_detect_version b/build_tools/build_detect_version new file mode 100755 index 0000000000..f7d711f0dd --- /dev/null +++ b/build_tools/build_detect_version @@ -0,0 +1,22 @@ +#!/bin/sh +# +# Record the version of the source that we are compiling. +# We keep a record of the git revision in util/version.cc. This source file +# is then built as a regular source file as part of the compilation process. +# One can run "strings executable_filename | grep _build_" to find the version of +# the source that we used to build the executable file. + +OUTFILE="$PWD/util/build_version.cc" + +GIT_SHA="" +if command -v git >/dev/null 2>&1; then + GIT_SHA=$(git rev-parse HEAD 2>/dev/null) +fi + +cat > "${OUTFILE}" < /dev/null +then + echo "You don't have clang-format-diff.py available on your computer!" + echo "You can download it by running: " + echo " curl http://goo.gl/iUW1u2" + exit 128 +fi + +# Check argparse, a library that clang-format-diff.py requires.
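+# The heredoc below simply tries to import argparse; python exits non-zero when +# the import fails, which is what the "$?" check that follows relies on.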
+python 2>/dev/null << EOF +import argparse +EOF + +if [ "$?" != 0 ] +then + echo "To run clang-format-diff.py, we'll need the library 'argparse' to be" + echo "installed. You can try one of the following ways to install it:" + echo " 1. Manually download argparse: https://pypi.python.org/pypi/argparse" + echo " 2. easy_install argparse (if you have easy_install)" + echo " 3. pip install argparse (if you have pip)" + exit 129 +fi + +# TODO(kailiu) The following work is not complete, since we still need to figure +# out how to add the files modified by the pre-commit hook to git's commit index. +# +# Check if this script has already been added to the pre-commit hook. +# Will suggest the user add this script to the pre-commit hook if their +# pre-commit is empty. +# PRE_COMMIT_SCRIPT_PATH="`git rev-parse --show-toplevel`/.git/hooks/pre-commit" +# if ! ls $PRE_COMMIT_SCRIPT_PATH &> /dev/null +# then +# echo "Would you like to add this script to pre-commit hook, which will do " +# echo -n "the format check for all the affected lines before you check in (y/n):" +# read add_to_hook +# if [ "$add_to_hook" == "y" ] +# then +# ln -s `git rev-parse --show-toplevel`/build_tools/format-diff.sh $PRE_COMMIT_SCRIPT_PATH +# fi +# fi +set -e + +uncommitted_code=`git diff HEAD` + +# If there are no uncommitted changes, we assume the user is doing a post-commit +# format check, in which case we'll check the modified lines of the latest commit. +# Otherwise, we'll check the format of the uncommitted code only. +if [ -z "$uncommitted_code" ] +then + # Check the format of the last commit + diffs=$(git diff -U0 HEAD^ | $CLANG_FORMAT_DIFF -p 1) +else + # Check the format of the uncommitted lines only. + diffs=$(git diff -U0 HEAD | $CLANG_FORMAT_DIFF -p 1) +fi + +if [ -z "$diffs" ] +then + echo "Nothing needs to be reformatted!" + exit 0 +fi + +# Highlight the insertions/deletions from clang-format-diff.py's output +COLOR_END="\033[0m" +COLOR_RED="\033[0;31m" +COLOR_GREEN="\033[0;32m" + +echo -e "Detected lines that don't follow the format rules:\r" +# Add color to the diff: added lines will be green; removed lines will be red. +echo "$diffs" | + sed -e "s/\(^-.*$\)/`echo -e \"$COLOR_RED\1$COLOR_END\"`/" | + sed -e "s/\(^+.*$\)/`echo -e \"$COLOR_GREEN\1$COLOR_END\"`/" +echo -e "Would you like to fix the format automatically (y/n): \c" + +# Make sure we can read user input regardless of how we were invoked. +exec < /dev/tty +read to_fix + +if [ "$to_fix" != "y" ] +then + exit 1 +fi + +# Do in-place format adjustment. +git diff -U0 HEAD^ | $CLANG_FORMAT_DIFF -i -p 1 +echo "Files reformatted!" + +# Amend the last commit if the user did a post-commit format check +if [ -z "$uncommitted_code" ]; then + echo -e "Would you like to amend the changes to the last commit (`git log HEAD --oneline | head -1`)? (y/n): \c" + read to_amend + + if [ "$to_amend" == "y" ] + then + git commit -a --amend --reuse-message HEAD + echo "Amended to last commit" + fi +fi diff --git a/build_tools/mac-install-gflags.sh b/build_tools/mac-install-gflags.sh new file mode 100755 index 0000000000..ef0339c30d --- /dev/null +++ b/build_tools/mac-install-gflags.sh @@ -0,0 +1,25 @@ +#!/bin/sh +# Install gflags for Mac developers.
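+# Assumed prerequisites: wget is available and /usr/local is writable; gflags +# 2.0 is fetched from Google Code, its upstream host at the time of this patch.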
+ +set -e + +DIR=`mktemp -d /tmp/rocksdb_gflags_XXXX` + +cd $DIR +wget https://gflags.googlecode.com/files/gflags-2.0.tar.gz +tar xvfz gflags-2.0.tar.gz +cd gflags-2.0 + +./configure +make +make install + +# Add include/lib paths for g++ +echo 'export LIBRARY_PATH+=":/usr/local/lib"' >> ~/.bash_profile +echo 'export CPATH+=":/usr/local/include"' >> ~/.bash_profile + +echo "" +echo "-----------------------------------------------------------------------------" +echo "| Installation Completed |" +echo "-----------------------------------------------------------------------------" +echo "Please run `. ~/.bash_profile` to be able to compile with gflags" diff --git a/build_tools/make_new_version.sh b/build_tools/make_new_version.sh new file mode 100755 index 0000000000..a8d524fcc3 --- /dev/null +++ b/build_tools/make_new_version.sh @@ -0,0 +1,46 @@ +#!/bin/bash +# Copyright (c) 2013, Facebook, Inc. All rights reserved. +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. An additional grant +# of patent rights can be found in the PATENTS file in the same directory. + +set -e +if [ -z "$GIT" ] +then + GIT="git" +fi + +# Print out colored progress info so that it can be easily +# distinguished by users. +function title() { + echo -e "\033[1;32m$*\033[0m" +} + +usage="Create new RocksDB version and prepare it for the release process\n" +usage+="USAGE: ./make_new_version.sh <version>" + +# -- Pre-check +if [[ $# < 1 ]]; then + echo -e $usage + exit 1 +fi + +ROCKSDB_VERSION=$1 + +GIT_BRANCH=`git rev-parse --abbrev-ref HEAD` +echo $GIT_BRANCH + +if [ $GIT_BRANCH != "master" ]; then + echo "Error: Current branch is '$GIT_BRANCH'. Please switch to the master branch." + exit 1 +fi + +title "Adding a new branch for this release ..." +BRANCH="$ROCKSDB_VERSION.fb" +$GIT checkout -b $BRANCH + +# Setting up the proxy for remote repo access +title "Pushing new branch to remote repo ..." +git push origin --set-upstream $BRANCH + +title "Branch $BRANCH is pushed to github;" diff --git a/build_tools/regression_build_test.sh b/build_tools/regression_build_test.sh new file mode 100755 index 0000000000..82e3380fa8 --- /dev/null +++ b/build_tools/regression_build_test.sh @@ -0,0 +1,330 @@ +#!/bin/bash + +set -e + +NUM=10000000 + +if [ $# -eq 1 ];then + DATA_DIR=$1 +elif [ $# -eq 2 ];then + DATA_DIR=$1 + STAT_FILE=$2 +fi + +# On the production build servers, set data and stat +# files/directories not in /tmp or else the tempdir cleaning +# scripts will make you very unhappy.
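+# The ${VAR:-default} expansions below keep a caller-supplied DATA_DIR or +# STAT_FILE and fall back to a fresh mktemp path only when none was given.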
+DATA_DIR=${DATA_DIR:-$(mktemp -t -d rocksdb_XXXX)} +STAT_FILE=${STAT_FILE:-$(mktemp -t -u rocksdb_test_stats_XXXX)} + +function cleanup { + rm -rf $DATA_DIR + rm -f $STAT_FILE.fillseq + rm -f $STAT_FILE.readrandom + rm -f $STAT_FILE.overwrite + rm -f $STAT_FILE.memtablefillreadrandom +} + +trap cleanup EXIT + +if [ -z $GIT_BRANCH ]; then + git_br=`git rev-parse --abbrev-ref HEAD` +else + git_br=$(basename $GIT_BRANCH) +fi + +if [ $git_br == "master" ]; then + git_br="" +else + git_br="."$git_br +fi + +make release + +# measure fillseq + fill up the DB for overwrite benchmark +./db_bench \ + --benchmarks=fillseq \ + --db=$DATA_DIR \ + --use_existing_db=0 \ + --bloom_bits=10 \ + --num=$NUM \ + --writes=$NUM \ + --cache_size=6442450944 \ + --cache_numshardbits=6 \ + --table_cache_numshardbits=4 \ + --open_files=55000 \ + --statistics=1 \ + --histogram=1 \ + --disable_data_sync=1 \ + --disable_wal=1 \ + --sync=0 > ${STAT_FILE}.fillseq + +# measure overwrite performance +./db_bench \ + --benchmarks=overwrite \ + --db=$DATA_DIR \ + --use_existing_db=1 \ + --bloom_bits=10 \ + --num=$NUM \ + --writes=$((NUM / 10)) \ + --cache_size=6442450944 \ + --cache_numshardbits=6 \ + --table_cache_numshardbits=4 \ + --open_files=55000 \ + --statistics=1 \ + --histogram=1 \ + --disable_data_sync=1 \ + --disable_wal=1 \ + --sync=0 \ + --threads=8 > ${STAT_FILE}.overwrite + +# fill up the db for readrandom benchmark (1GB total size) +./db_bench \ + --benchmarks=fillseq \ + --db=$DATA_DIR \ + --use_existing_db=0 \ + --bloom_bits=10 \ + --num=$NUM \ + --writes=$NUM \ + --cache_size=6442450944 \ + --cache_numshardbits=6 \ + --table_cache_numshardbits=4 \ + --open_files=55000 \ + --statistics=1 \ + --histogram=1 \ + --disable_data_sync=1 \ + --disable_wal=1 \ + --sync=0 \ + --threads=1 > /dev/null + +# measure readrandom with 6GB block cache +./db_bench \ + --benchmarks=readrandom \ + --db=$DATA_DIR \ + --use_existing_db=1 \ + --bloom_bits=10 \ + --num=$NUM \ + --reads=$((NUM / 5)) \ + --cache_size=6442450944 \ + --cache_numshardbits=6 \ + --table_cache_numshardbits=4 \ + --open_files=55000 \ + --disable_seek_compaction=1 \ + --statistics=1 \ + --histogram=1 \ + --disable_data_sync=1 \ + --disable_wal=1 \ + --sync=0 \ + --threads=16 > ${STAT_FILE}.readrandom + +# measure readrandom with 6GB block cache and tailing iterator +./db_bench \ + --benchmarks=readrandom \ + --db=$DATA_DIR \ + --use_existing_db=1 \ + --bloom_bits=10 \ + --num=$NUM \ + --reads=$((NUM / 5)) \ + --cache_size=6442450944 \ + --cache_numshardbits=6 \ + --table_cache_numshardbits=4 \ + --open_files=55000 \ + --disable_seek_compaction=1 \ + --use_tailing_iterator=1 \ + --statistics=1 \ + --histogram=1 \ + --disable_data_sync=1 \ + --disable_wal=1 \ + --sync=0 \ + --threads=16 > ${STAT_FILE}.readrandomtailing + +# measure readrandom with 100MB block cache +./db_bench \ + --benchmarks=readrandom \ + --db=$DATA_DIR \ + --use_existing_db=1 \ + --bloom_bits=10 \ + --num=$NUM \ + --reads=$((NUM / 5)) \ + --cache_size=104857600 \ + --cache_numshardbits=6 \ + --table_cache_numshardbits=4 \ + --open_files=55000 \ + --disable_seek_compaction=1 \ + --statistics=1 \ + --histogram=1 \ + --disable_data_sync=1 \ + --disable_wal=1 \ + --sync=0 \ + --threads=16 > ${STAT_FILE}.readrandomsmallblockcache + +# measure readrandom with 8k data in memtable +./db_bench \ + --benchmarks=overwrite,readrandom \ + --db=$DATA_DIR \ + --use_existing_db=1 \ + --bloom_bits=10 \ + --num=$NUM \ + --reads=$((NUM / 5)) \ + --writes=512 \ + --cache_size=6442450944 \ + 
--cache_numshardbits=6 \ + --table_cache_numshardbits=4 \ + --write_buffer_size=1000000000 \ + --open_files=55000 \ + --disable_seek_compaction=1 \ + --statistics=1 \ + --histogram=1 \ + --disable_data_sync=1 \ + --disable_wal=1 \ + --sync=0 \ + --threads=16 > ${STAT_FILE}.readrandom_mem_sst + + +# fill up the db for readrandom benchmark with filluniquerandom (1GB total size) +./db_bench \ + --benchmarks=filluniquerandom \ + --db=$DATA_DIR \ + --use_existing_db=0 \ + --bloom_bits=10 \ + --num=$((NUM / 4)) \ + --writes=$((NUM / 4)) \ + --cache_size=6442450944 \ + --cache_numshardbits=6 \ + --table_cache_numshardbits=4 \ + --open_files=55000 \ + --statistics=1 \ + --histogram=1 \ + --disable_data_sync=1 \ + --disable_wal=1 \ + --sync=0 \ + --threads=1 > /dev/null + +# dummy test just to compact the data +./db_bench \ + --benchmarks=readrandom \ + --db=$DATA_DIR \ + --use_existing_db=1 \ + --bloom_bits=10 \ + --num=$((NUM / 1000)) \ + --reads=$((NUM / 1000)) \ + --cache_size=6442450944 \ + --cache_numshardbits=6 \ + --table_cache_numshardbits=4 \ + --open_files=55000 \ + --statistics=1 \ + --histogram=1 \ + --disable_data_sync=1 \ + --disable_wal=1 \ + --sync=0 \ + --threads=16 > /dev/null + +# measure readrandom after load with filluniquerandom with 6GB block cache +./db_bench \ + --benchmarks=readrandom \ + --db=$DATA_DIR \ + --use_existing_db=1 \ + --bloom_bits=10 \ + --num=$((NUM / 4)) \ + --reads=$((NUM / 4)) \ + --cache_size=6442450944 \ + --cache_numshardbits=6 \ + --table_cache_numshardbits=4 \ + --open_files=55000 \ + --disable_seek_compaction=1 \ + --disable_auto_compactions=1 \ + --statistics=1 \ + --histogram=1 \ + --disable_data_sync=1 \ + --disable_wal=1 \ + --sync=0 \ + --threads=16 > ${STAT_FILE}.readrandom_filluniquerandom + +# measure readwhilewriting after load with filluniquerandom with 6GB block cache +./db_bench \ + --benchmarks=readwhilewriting \ + --db=$DATA_DIR \ + --use_existing_db=1 \ + --bloom_bits=10 \ + --num=$((NUM / 4)) \ + --reads=$((NUM / 4)) \ + --writes_per_second=1000 \ + --write_buffer_size=100000000 \ + --cache_size=6442450944 \ + --cache_numshardbits=6 \ + --table_cache_numshardbits=4 \ + --open_files=55000 \ + --disable_seek_compaction=1 \ + --statistics=1 \ + --histogram=1 \ + --disable_data_sync=1 \ + --disable_wal=1 \ + --sync=0 \ + --threads=16 > ${STAT_FILE}.readwhilewriting + +# measure memtable performance -- none of the data gets flushed to disk +./db_bench \ + --benchmarks=fillrandom,readrandom, \ + --db=$DATA_DIR \ + --use_existing_db=0 \ + --num=$((NUM / 10)) \ + --reads=$NUM \ + --cache_size=6442450944 \ + --cache_numshardbits=6 \ + --table_cache_numshardbits=4 \ + --write_buffer_size=1000000000 \ + --open_files=55000 \ + --disable_seek_compaction=1 \ + --statistics=1 \ + --histogram=1 \ + --disable_data_sync=1 \ + --disable_wal=1 \ + --sync=0 \ + --value_size=10 \ + --threads=16 > ${STAT_FILE}.memtablefillreadrandom + +# send data to ods +function send_to_ods { + key="$1" + value="$2" + + if [ -z $JENKINS_HOME ]; then + # running on devbox, just print out the values + echo $1 $2 + return + fi + + if [ -z "$value" ];then + echo >&2 "ERROR: Key $key doesn't have a value." 
+ return + fi + curl -s "https://www.intern.facebook.com/intern/agent/ods_set.php?entity=rocksdb_build$git_br&key=$key&value=$value" \ + --connect-timeout 60 +} + +function send_benchmark_to_ods { + bench="$1" + bench_key="$2" + file="$3" + + QPS=$(grep $bench $file | awk '{print $5}') + P50_MICROS=$(grep $bench $file -A 6 | grep "Percentiles" | awk '{print $3}' ) + P75_MICROS=$(grep $bench $file -A 6 | grep "Percentiles" | awk '{print $5}' ) + P99_MICROS=$(grep $bench $file -A 6 | grep "Percentiles" | awk '{print $7}' ) + + send_to_ods rocksdb.build.$bench_key.qps $QPS + send_to_ods rocksdb.build.$bench_key.p50_micros $P50_MICROS + send_to_ods rocksdb.build.$bench_key.p75_micros $P75_MICROS + send_to_ods rocksdb.build.$bench_key.p99_micros $P99_MICROS +} + +send_benchmark_to_ods overwrite overwrite $STAT_FILE.overwrite +send_benchmark_to_ods fillseq fillseq $STAT_FILE.fillseq +send_benchmark_to_ods readrandom readrandom $STAT_FILE.readrandom +send_benchmark_to_ods readrandom readrandom_tailing $STAT_FILE.readrandomtailing +send_benchmark_to_ods readrandom readrandom_smallblockcache $STAT_FILE.readrandomsmallblockcache +send_benchmark_to_ods readrandom readrandom_memtable_sst $STAT_FILE.readrandom_mem_sst +send_benchmark_to_ods readrandom readrandom_fillunique_random $STAT_FILE.readrandom_filluniquerandom +send_benchmark_to_ods fillrandom memtablefillrandom $STAT_FILE.memtablefillreadrandom +send_benchmark_to_ods readrandom memtablereadrandom $STAT_FILE.memtablefillreadrandom +send_benchmark_to_ods readwhilewriting readwhilewriting $STAT_FILE.readwhilewriting diff --git a/build_tools/valgrind_test.sh b/build_tools/valgrind_test.sh new file mode 100755 index 0000000000..8c7e521345 --- /dev/null +++ b/build_tools/valgrind_test.sh @@ -0,0 +1,15 @@ +#!/bin/bash +# A shell script for Jenkins to run valgrind on rocksdb tests +# Returns 0 on success when there are no failed tests + +VALGRIND_DIR=build_tools/VALGRIND_LOGS +make clean +make -j$(nproc) valgrind_check +NUM_FAILED_TESTS=$((`wc -l $VALGRIND_DIR/valgrind_failed_tests | awk '{print $1}'` - 1)) +if [ $NUM_FAILED_TESTS -lt 1 ]; then + echo No tests have valgrind errors + exit 0 +else + cat $VALGRIND_DIR/valgrind_failed_tests + exit 1 +fi diff --git a/coverage/coverage_test.sh b/coverage/coverage_test.sh new file mode 100755 index 0000000000..08dbd05a59 --- /dev/null +++ b/coverage/coverage_test.sh @@ -0,0 +1,78 @@ +#!/bin/bash + +# Exit on error. +set -e + +if [ -n "$USE_CLANG" ]; then + echo "Error: Coverage test is supported only for gcc." + exit 1 +fi + +ROOT=".." +# Fetch the right version of gcov +if [ -d /mnt/gvfs/third-party -a -z "$CXX" ]; then + source $ROOT/build_tools/fbcode.gcc471.sh + GCOV=$TOOLCHAIN_EXECUTABLES/gcc/gcc-4.7.1/cc6c9dc/bin/gcov +else + GCOV=$(which gcov) +fi + +COVERAGE_DIR="$PWD/COVERAGE_REPORT" +mkdir -p $COVERAGE_DIR + +# Find all gcno files to generate the coverage report + +GCNO_FILES=`find $ROOT -name "*.gcno"` +$GCOV --preserve-paths --relative-only --no-output $GCNO_FILES 2>/dev/null | + # Parse the raw gcov report into a more human-readable form. + python $ROOT/coverage/parse_gcov_output.py | + # Write the output to both stdout and the report file. + tee $COVERAGE_DIR/coverage_report_all.txt && +echo -e "Generated coverage report for all files: $COVERAGE_DIR/coverage_report_all.txt\n" + +# TODO: we also need to get the files of the latest commits. +# Get the most recently committed files.
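+# git show --pretty="format:" --name-only HEAD prints one path per line for the +# latest commit; grep -v "^$" drops the blank line and paste -s -d, joins the +# paths into the comma-separated list that parse_gcov_output.py expects.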
+LATEST_FILES=` + git show --pretty="format:" --name-only HEAD | + grep -v "^$" | + paste -s -d,` +RECENT_REPORT=$COVERAGE_DIR/coverage_report_recent.txt + +echo -e "Recently updated files: $LATEST_FILES\n" > $RECENT_REPORT +$GCOV --preserve-paths --relative-only --no-output $GCNO_FILES 2>/dev/null | + python $ROOT/coverage/parse_gcov_output.py --interested-files $LATEST_FILES | + tee -a $RECENT_REPORT && +echo -e "Generated coverage report for recently updated files: $RECENT_REPORT\n" + +# Unless otherwise specified, we'll not generate the html report by default +if [ -z "$HTML" ]; then + exit 0 +fi + +# Generate the html report. If we cannot find lcov on this machine, we'll simply +# skip this step. +echo "Generating the html coverage report..." + +LCOV=$(which lcov || true 2>/dev/null) +if [ -z $LCOV ] +then + echo "Skip: Cannot find lcov to generate the html report." + exit 0 +fi + +LCOV_VERSION=$(lcov -v | grep 1.1 || true) +if [ $LCOV_VERSION ] +then + echo "Unsupported lcov version. Expected lcov 1.1." + exit 0 +fi + +(cd $ROOT; lcov --no-external \ + --capture \ + --directory $PWD \ + --gcov-tool $GCOV \ + --output-file $COVERAGE_DIR/coverage.info) + +genhtml $COVERAGE_DIR/coverage.info -o $COVERAGE_DIR + +echo "The HTML coverage report is generated in $COVERAGE_DIR" diff --git a/coverage/parse_gcov_output.py b/coverage/parse_gcov_output.py new file mode 100644 index 0000000000..72e8b07230 --- /dev/null +++ b/coverage/parse_gcov_output.py @@ -0,0 +1,118 @@ +import optparse +import re +import sys + +from optparse import OptionParser + +# The gcov report follows a certain pattern: each file has two lines of +# report, from which we can extract the file name, the total line count and +# the coverage percentage. +def parse_gcov_report(gcov_input): + per_file_coverage = {} + total_coverage = None + current_file = None + + for line in gcov_input: + line = line.strip() + + # -- First line of the coverage report (with the file name in it)? + match_obj = re.match("^File '(.*)'$", line) + if match_obj: + # fetch the file name from the first line of the report. + current_file = match_obj.group(1) + continue + + # -- Second line of the file report (with the coverage percentage) + match_obj = re.match("^Lines executed:(.*)% of (.*)", line) + + if match_obj: + coverage = float(match_obj.group(1)) + lines = int(match_obj.group(2)) + + if current_file is not None: + per_file_coverage[current_file] = (coverage, lines) + current_file = None + else: + # If current_file is not set, we have reached the last line of the + # report, which contains the summarized coverage percentage. + total_coverage = (coverage, lines) + continue + + # If the line doesn't fall into the above categories, we can simply + # ignore it: it's either an empty line or a line for a file with no + # executable lines. + current_file = None + + return per_file_coverage, total_coverage + +def get_option_parser(): + usage = "Parse the gcov output and generate a more human-readable code " +\ + "coverage report." + parser = OptionParser(usage) + + parser.add_option( + "--interested-files", "-i", + dest="filenames", + help="Comma-separated file names. If specified, we will display " + + "the coverage report only for the interested source files. " + + "Otherwise we will display the coverage report for all " + + "source files." + ) + return parser + +def display_file_coverage(per_file_coverage, total_coverage): + # To print auto-adjusted columns, we need to know the length of the + # longest file name.
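+ # That width feeds the %Ns format strings built below, so the + # file-name column is exactly as wide as its longest entry.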
+ max_file_name_length = max( + len(fname) for fname in per_file_coverage.keys() + ) + + # -- Print header + # The size of the separator is determined by the 3 column sizes: + # file name, coverage percentage and lines. + header_template = \ + "%" + str(max_file_name_length) + "s\t%s\t%s" + separator = "-" * (max_file_name_length + 10 + 20) + print header_template % ("Filename", "Coverage", "Lines") + print separator + + # -- Print body + # Template for printing the coverage report for each file. + record_template = "%" + str(max_file_name_length) + "s\t%5.2f%%\t%10d" + + for fname, coverage_info in per_file_coverage.items(): + coverage, lines = coverage_info + print record_template % (fname, coverage, lines) + + # -- Print footer + if total_coverage: + print separator + print record_template % ("Total", total_coverage[0], total_coverage[1]) + +def report_coverage(): + parser = get_option_parser() + (options, args) = parser.parse_args() + + interested_files = set() + if options.filenames is not None: + interested_files = set(f.strip() for f in options.filenames.split(',')) + + # To make things simple, right now we only read the gcov report from stdin + per_file_coverage, total_coverage = parse_gcov_report(sys.stdin) + + # Check if we need to display coverage info for interested files. + if len(interested_files): + per_file_coverage = dict( + (fname, per_file_coverage[fname]) for fname in interested_files + if fname in per_file_coverage + ) + # If we're only interested in a few files, it makes no sense to report + # the total_coverage + total_coverage = None + + if not len(per_file_coverage): + print >> sys.stderr, "Cannot find coverage info for the given files." + return + display_file_coverage(per_file_coverage, total_coverage) + +if __name__ == "__main__": + report_coverage() diff --git a/db/builder.cc b/db/builder.cc new file mode 100644 index 0000000000..ce85ae589c --- /dev/null +++ b/db/builder.cc @@ -0,0 +1,224 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors.
+ +#include "db/builder.h" + +#include "db/dbformat.h" +#include "db/filename.h" +#include "db/merge_helper.h" +#include "db/table_cache.h" +#include "db/version_edit.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/iterator.h" +#include "rocksdb/options.h" +#include "rocksdb/table.h" +#include "table/block_based_table_builder.h" +#include "util/stop_watch.h" + +namespace rocksdb { + +class TableFactory; + +TableBuilder* NewTableBuilder(const Options& options, + const InternalKeyComparator& internal_comparator, + WritableFile* file, + CompressionType compression_type) { + return options.table_factory->NewTableBuilder(options, internal_comparator, + file, compression_type); +} + +Status BuildTable(const std::string& dbname, Env* env, const Options& options, + const EnvOptions& soptions, TableCache* table_cache, + Iterator* iter, FileMetaData* meta, + const InternalKeyComparator& internal_comparator, + const SequenceNumber newest_snapshot, + const SequenceNumber earliest_seqno_in_memtable, + const CompressionType compression) { + Status s; + meta->file_size = 0; + meta->smallest_seqno = meta->largest_seqno = 0; + iter->SeekToFirst(); + + // If the sequence number of the smallest entry in the memtable is + // smaller than the most recent snapshot, then we do not trigger + // removal of duplicate/deleted keys as part of this builder. + bool purge = options.purge_redundant_kvs_while_flush; + if (earliest_seqno_in_memtable <= newest_snapshot) { + purge = false; + } + + std::string fname = TableFileName(dbname, meta->number); + if (iter->Valid()) { + unique_ptr<WritableFile> file; + s = env->NewWritableFile(fname, &file, soptions); + if (!s.ok()) { + return s; + } + + TableBuilder* builder = + NewTableBuilder(options, internal_comparator, file.get(), compression); + + // the first key is the smallest key + Slice key = iter->key(); + meta->smallest.DecodeFrom(key); + meta->smallest_seqno = GetInternalKeySeqno(key); + meta->largest_seqno = meta->smallest_seqno; + + MergeHelper merge(internal_comparator.user_comparator(), + options.merge_operator.get(), options.info_log.get(), + options.min_partial_merge_operands, + true /* internal key corruption is not ok */); + + if (purge) { + // Ugly workaround to avoid a compiler error in release builds + bool ok __attribute__((unused)) = true; + + // Will write to builder if current key != prev key + ParsedInternalKey prev_ikey; + std::string prev_key; + bool is_first_key = true; // Also write if this is the very first key + + while (iter->Valid()) { + bool iterator_at_next = false; + + // Get current key + ParsedInternalKey this_ikey; + Slice key = iter->key(); + Slice value = iter->value(); + + // In-memory key corruption is not ok; + // TODO: find a clean way to treat in memory key corruption + ok = ParseInternalKey(key, &this_ikey); + assert(ok); + assert(this_ikey.sequence >= earliest_seqno_in_memtable); + + // If the key is the same as the previous key (and it is not the + // first key), then we skip it, since it is an older version. + // Otherwise we output the key and mark it as the "new" previous key.
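+ // Note that Compare() returns 0 when the two user keys are equal, so + // the negated call below reads as "same user key as the previous entry".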
+ if (!is_first_key && !internal_comparator.user_comparator()->Compare( + prev_ikey.user_key, this_ikey.user_key)) { + // seqnos within the same key are in decreasing order + assert(this_ikey.sequence < prev_ikey.sequence); + } else { + is_first_key = false; + + if (this_ikey.type == kTypeMerge) { + // Handle merge-type keys using the MergeHelper + // TODO: pass statistics to MergeUntil + merge.MergeUntil(iter, 0 /* don't worry about snapshot */); + iterator_at_next = true; + if (merge.IsSuccess()) { + // Merge completed correctly. + // Add the resulting merge key/value and continue to next + builder->Add(merge.key(), merge.value()); + prev_key.assign(merge.key().data(), merge.key().size()); + ok = ParseInternalKey(Slice(prev_key), &prev_ikey); + assert(ok); + } else { + // Merge did not find a Put/Delete. + // Cannot compact these merges into a kValueType. + // Write them out one-by-one (proceeding from back() to front()). + const std::deque<std::string>& keys = merge.keys(); + const std::deque<std::string>& values = merge.values(); + assert(keys.size() == values.size() && keys.size() >= 1); + std::deque<std::string>::const_reverse_iterator key_iter; + std::deque<std::string>::const_reverse_iterator value_iter; + for (key_iter=keys.rbegin(), value_iter = values.rbegin(); + key_iter != keys.rend() && value_iter != values.rend(); + ++key_iter, ++value_iter) { + + builder->Add(Slice(*key_iter), Slice(*value_iter)); + } + + // Sanity check. Both iterators should end at the same time + assert(key_iter == keys.rend() && value_iter == values.rend()); + + prev_key.assign(keys.front()); + ok = ParseInternalKey(Slice(prev_key), &prev_ikey); + assert(ok); + } + } else { + // Handle Put/Delete-type keys by simply writing them + builder->Add(key, value); + prev_key.assign(key.data(), key.size()); + ok = ParseInternalKey(Slice(prev_key), &prev_ikey); + assert(ok); + } + } + + if (!iterator_at_next) iter->Next(); + } + + // The last key is the largest key + meta->largest.DecodeFrom(Slice(prev_key)); + SequenceNumber seqno = GetInternalKeySeqno(Slice(prev_key)); + meta->smallest_seqno = std::min(meta->smallest_seqno, seqno); + meta->largest_seqno = std::max(meta->largest_seqno, seqno); + + } else { + for (; iter->Valid(); iter->Next()) { + Slice key = iter->key(); + meta->largest.DecodeFrom(key); + builder->Add(key, iter->value()); + SequenceNumber seqno = GetInternalKeySeqno(key); + meta->smallest_seqno = std::min(meta->smallest_seqno, seqno); + meta->largest_seqno = std::max(meta->largest_seqno, seqno); + } + } + + // Finish and check for builder errors + if (s.ok()) { + s = builder->Finish(); + if (s.ok()) { + meta->file_size = builder->FileSize(); + assert(meta->file_size > 0); + } + } else { + builder->Abandon(); + } + delete builder; + + // Finish and check for file errors + if (s.ok() && !options.disableDataSync) { + if (options.use_fsync) { + StopWatch sw(env, options.statistics.get(), TABLE_SYNC_MICROS); + s = file->Fsync(); + } else { + StopWatch sw(env, options.statistics.get(), TABLE_SYNC_MICROS); + s = file->Sync(); + } + } + if (s.ok()) { + s = file->Close(); + } + + if (s.ok()) { + // Verify that the table is usable + Iterator* it = table_cache->NewIterator(ReadOptions(), soptions, + internal_comparator, *meta); + s = it->status(); + delete it; + } + } + + // Check for input iterator errors + if (!iter->status().ok()) { + s = iter->status(); + } + + if (s.ok() && meta->file_size > 0) { + // Keep it + } else { + env->DeleteFile(fname); + } + return s; +} + +} // namespace rocksdb diff --git a/db/builder.h b/db/builder.h new file mode 100644 index
0000000000..630162968b --- /dev/null +++ b/db/builder.h @@ -0,0 +1,45 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#pragma once +#include "rocksdb/comparator.h" +#include "rocksdb/status.h" +#include "rocksdb/types.h" +#include "rocksdb/options.h" + +namespace rocksdb { + +struct Options; +struct FileMetaData; + +class Env; +struct EnvOptions; +class Iterator; +class TableCache; +class VersionEdit; +class TableBuilder; +class WritableFile; + +extern TableBuilder* NewTableBuilder( + const Options& options, const InternalKeyComparator& internal_comparator, + WritableFile* file, CompressionType compression_type); + +// Build a Table file from the contents of *iter. The generated file +// will be named according to meta->number. On success, the rest of +// *meta will be filled with metadata about the generated table. +// If no data is present in *iter, meta->file_size will be set to +// zero, and no Table file will be produced. +extern Status BuildTable(const std::string& dbname, Env* env, + const Options& options, const EnvOptions& soptions, + TableCache* table_cache, Iterator* iter, + FileMetaData* meta, + const InternalKeyComparator& internal_comparator, + const SequenceNumber newest_snapshot, + const SequenceNumber earliest_seqno_in_memtable, + const CompressionType compression); + +} // namespace rocksdb diff --git a/db/c.cc b/db/c.cc new file mode 100644 index 0000000000..b50e59ecc5 --- /dev/null +++ b/db/c.cc @@ -0,0 +1,1476 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#ifndef ROCKSDB_LITE + +#include "rocksdb/c.h" + +#include <stdlib.h> +#include <unistd.h> +#include "rocksdb/cache.h" +#include "rocksdb/comparator.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/filter_policy.h" +#include "rocksdb/iterator.h" +#include "rocksdb/merge_operator.h" +#include "rocksdb/options.h" +#include "rocksdb/status.h" +#include "rocksdb/write_batch.h" +#include "rocksdb/memtablerep.h" +#include "rocksdb/universal_compaction.h" +#include "rocksdb/statistics.h" +#include "rocksdb/slice_transform.h" +#include "rocksdb/table.h" + +using rocksdb::Cache; +using rocksdb::Comparator; +using rocksdb::CompressionType; +using rocksdb::DB; +using rocksdb::Env; +using rocksdb::InfoLogLevel; +using rocksdb::FileLock; +using rocksdb::FilterPolicy; +using rocksdb::FlushOptions; +using rocksdb::Iterator; +using rocksdb::Logger; +using rocksdb::MergeOperator; +using rocksdb::NewBloomFilterPolicy; +using rocksdb::NewLRUCache; +using rocksdb::Options; +using rocksdb::RandomAccessFile; +using rocksdb::Range; +using rocksdb::ReadOptions; +using rocksdb::SequentialFile; +using rocksdb::Slice; +using rocksdb::SliceTransform; +using rocksdb::Snapshot; +using rocksdb::Status; +using rocksdb::WritableFile; +using rocksdb::WriteBatch; +using rocksdb::WriteOptions; +using rocksdb::LiveFileMetaData; + +using std::shared_ptr; + +extern "C" { + +struct rocksdb_t { DB* rep; }; +struct rocksdb_iterator_t { Iterator* rep; }; +struct rocksdb_writebatch_t { WriteBatch rep; }; +struct rocksdb_snapshot_t { const Snapshot* rep; }; +struct rocksdb_flushoptions_t { FlushOptions rep; }; +struct rocksdb_readoptions_t { ReadOptions rep; }; +struct rocksdb_writeoptions_t { WriteOptions rep; }; +struct rocksdb_options_t { Options rep; }; +struct rocksdb_seqfile_t { SequentialFile* rep; }; +struct rocksdb_randomfile_t { RandomAccessFile* rep; }; +struct rocksdb_writablefile_t { WritableFile* rep; }; +struct rocksdb_filelock_t { FileLock* rep; }; +struct rocksdb_logger_t { shared_ptr<Logger> rep; }; +struct rocksdb_cache_t { shared_ptr<Cache> rep; }; +struct rocksdb_livefiles_t { std::vector<LiveFileMetaData> rep; }; + +struct rocksdb_comparator_t : public Comparator { + void* state_; + void (*destructor_)(void*); + int (*compare_)( + void*, + const char* a, size_t alen, + const char* b, size_t blen); + const char* (*name_)(void*); + + virtual ~rocksdb_comparator_t() { + (*destructor_)(state_); + } + + virtual int Compare(const Slice& a, const Slice& b) const { + return (*compare_)(state_, a.data(), a.size(), b.data(), b.size()); + } + + virtual const char* Name() const { + return (*name_)(state_); + } + + // No-ops since the C binding does not support key shortening methods.
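+ // Leaving these empty is safe: key shortening is only an optimization that + // lets a comparator shrink index keys; correctness never depends on it.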
+ virtual void FindShortestSeparator(std::string*, const Slice&) const { } + virtual void FindShortSuccessor(std::string* key) const { } +}; + +struct rocksdb_filterpolicy_t : public FilterPolicy { + void* state_; + void (*destructor_)(void*); + const char* (*name_)(void*); + char* (*create_)( + void*, + const char* const* key_array, const size_t* key_length_array, + int num_keys, + size_t* filter_length); + unsigned char (*key_match_)( + void*, + const char* key, size_t length, + const char* filter, size_t filter_length); + void (*delete_filter_)( + void*, + const char* filter, size_t filter_length); + + virtual ~rocksdb_filterpolicy_t() { + (*destructor_)(state_); + } + + virtual const char* Name() const { + return (*name_)(state_); + } + + virtual void CreateFilter(const Slice* keys, int n, std::string* dst) const { + std::vector<const char*> key_pointers(n); + std::vector<size_t> key_sizes(n); + for (int i = 0; i < n; i++) { + key_pointers[i] = keys[i].data(); + key_sizes[i] = keys[i].size(); + } + size_t len; + char* filter = (*create_)(state_, &key_pointers[0], &key_sizes[0], n, &len); + dst->append(filter, len); + + if (delete_filter_ != nullptr) { + (*delete_filter_)(state_, filter, len); + } else { + free(filter); + } + } + + virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const { + return (*key_match_)(state_, key.data(), key.size(), + filter.data(), filter.size()); + } +}; + +struct rocksdb_mergeoperator_t : public MergeOperator { + void* state_; + void (*destructor_)(void*); + const char* (*name_)(void*); + char* (*full_merge_)( + void*, + const char* key, size_t key_length, + const char* existing_value, size_t existing_value_length, + const char* const* operands_list, const size_t* operands_list_length, + int num_operands, + unsigned char* success, size_t* new_value_length); + char* (*partial_merge_)(void*, const char* key, size_t key_length, + const char* const* operands_list, + const size_t* operands_list_length, int num_operands, + unsigned char* success, size_t* new_value_length); + void (*delete_value_)( + void*, + const char* value, size_t value_length); + + virtual ~rocksdb_mergeoperator_t() { + (*destructor_)(state_); + } + + virtual const char* Name() const { + return (*name_)(state_); + } + + virtual bool FullMerge( + const Slice& key, + const Slice* existing_value, + const std::deque<std::string>& operand_list, + std::string* new_value, + Logger* logger) const { + + size_t n = operand_list.size(); + std::vector<const char*> operand_pointers(n); + std::vector<size_t> operand_sizes(n); + for (size_t i = 0; i < n; i++) { + Slice operand(operand_list[i]); + operand_pointers[i] = operand.data(); + operand_sizes[i] = operand.size(); + } + + const char* existing_value_data = nullptr; + size_t existing_value_len = 0; + if (existing_value != nullptr) { + existing_value_data = existing_value->data(); + existing_value_len = existing_value->size(); + } + + unsigned char success; + size_t new_value_len; + char* tmp_new_value = (*full_merge_)( + state_, + key.data(), key.size(), + existing_value_data, existing_value_len, + &operand_pointers[0], &operand_sizes[0], n, + &success, &new_value_len); + new_value->assign(tmp_new_value, new_value_len); + + if (delete_value_ != nullptr) { + (*delete_value_)(state_, tmp_new_value, new_value_len); + } else { + free(tmp_new_value); + } + + return success; + } + + virtual bool PartialMergeMulti(const Slice& key, + const std::deque<Slice>& operand_list, + std::string* new_value, Logger* logger) const { + size_t operand_count = operand_list.size(); + std::vector<const char*>
operand_pointers(operand_count); + std::vector<size_t> operand_sizes(operand_count); + for (size_t i = 0; i < operand_count; ++i) { + Slice operand(operand_list[i]); + operand_pointers[i] = operand.data(); + operand_sizes[i] = operand.size(); + } + + unsigned char success; + size_t new_value_len; + char* tmp_new_value = (*partial_merge_)( + state_, key.data(), key.size(), &operand_pointers[0], &operand_sizes[0], + operand_count, &success, &new_value_len); + new_value->assign(tmp_new_value, new_value_len); + + if (delete_value_ != nullptr) { + (*delete_value_)(state_, tmp_new_value, new_value_len); + } else { + free(tmp_new_value); + } + + return success; + } +}; + +struct rocksdb_env_t { + Env* rep; + bool is_default; +}; + +struct rocksdb_slicetransform_t : public SliceTransform { + void* state_; + void (*destructor_)(void*); + const char* (*name_)(void*); + char* (*transform_)( + void*, + const char* key, size_t length, + size_t* dst_length); + unsigned char (*in_domain_)( + void*, + const char* key, size_t length); + unsigned char (*in_range_)( + void*, + const char* key, size_t length); + + virtual ~rocksdb_slicetransform_t() { + (*destructor_)(state_); + } + + virtual const char* Name() const { + return (*name_)(state_); + } + + virtual Slice Transform(const Slice& src) const { + size_t len; + char* dst = (*transform_)(state_, src.data(), src.size(), &len); + return Slice(dst, len); + } + + virtual bool InDomain(const Slice& src) const { + return (*in_domain_)(state_, src.data(), src.size()); + } + + virtual bool InRange(const Slice& src) const { + return (*in_range_)(state_, src.data(), src.size()); + } +}; + +struct rocksdb_universal_compaction_options_t { + rocksdb::CompactionOptionsUniversal *rep; +}; + +static bool SaveError(char** errptr, const Status& s) { + assert(errptr != nullptr); + if (s.ok()) { + return false; + } else if (*errptr == nullptr) { + *errptr = strdup(s.ToString().c_str()); + } else { + // TODO(sanjay): Merge with existing error?
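+ // For now, simply free the previous message and replace it with the new one.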
+ free(*errptr); + *errptr = strdup(s.ToString().c_str()); + } + return true; +} + +static char* CopyString(const std::string& str) { + char* result = reinterpret_cast<char*>(malloc(sizeof(char) * str.size())); + memcpy(result, str.data(), sizeof(char) * str.size()); + return result; +} + +rocksdb_t* rocksdb_open( + const rocksdb_options_t* options, + const char* name, + char** errptr) { + DB* db; + if (SaveError(errptr, DB::Open(options->rep, std::string(name), &db))) { + return nullptr; + } + rocksdb_t* result = new rocksdb_t; + result->rep = db; + return result; +} + +rocksdb_t* rocksdb_open_for_read_only( + const rocksdb_options_t* options, + const char* name, + unsigned char error_if_log_file_exist, + char** errptr) { + DB* db; + if (SaveError(errptr, DB::OpenForReadOnly(options->rep, std::string(name), &db, error_if_log_file_exist))) { + return nullptr; + } + rocksdb_t* result = new rocksdb_t; + result->rep = db; + return result; +} + +void rocksdb_close(rocksdb_t* db) { + delete db->rep; + delete db; +} + +void rocksdb_put( + rocksdb_t* db, + const rocksdb_writeoptions_t* options, + const char* key, size_t keylen, + const char* val, size_t vallen, + char** errptr) { + SaveError(errptr, + db->rep->Put(options->rep, Slice(key, keylen), Slice(val, vallen))); +} + +void rocksdb_delete( + rocksdb_t* db, + const rocksdb_writeoptions_t* options, + const char* key, size_t keylen, + char** errptr) { + SaveError(errptr, db->rep->Delete(options->rep, Slice(key, keylen))); +} + +void rocksdb_merge( + rocksdb_t* db, + const rocksdb_writeoptions_t* options, + const char* key, size_t keylen, + const char* val, size_t vallen, + char** errptr) { + SaveError(errptr, + db->rep->Merge(options->rep, Slice(key, keylen), Slice(val, vallen))); +} + +void rocksdb_write( + rocksdb_t* db, + const rocksdb_writeoptions_t* options, + rocksdb_writebatch_t* batch, + char** errptr) { + SaveError(errptr, db->rep->Write(options->rep, &batch->rep)); +} + +char* rocksdb_get( + rocksdb_t* db, + const rocksdb_readoptions_t* options, + const char* key, size_t keylen, + size_t* vallen, + char** errptr) { + char* result = nullptr; + std::string tmp; + Status s = db->rep->Get(options->rep, Slice(key, keylen), &tmp); + if (s.ok()) { + *vallen = tmp.size(); + result = CopyString(tmp); + } else { + *vallen = 0; + if (!s.IsNotFound()) { + SaveError(errptr, s); + } + } + return result; +} + +rocksdb_iterator_t* rocksdb_create_iterator( + rocksdb_t* db, + const rocksdb_readoptions_t* options) { + rocksdb_iterator_t* result = new rocksdb_iterator_t; + result->rep = db->rep->NewIterator(options->rep); + return result; +} + +const rocksdb_snapshot_t* rocksdb_create_snapshot( + rocksdb_t* db) { + rocksdb_snapshot_t* result = new rocksdb_snapshot_t; + result->rep = db->rep->GetSnapshot(); + return result; +} + +void rocksdb_release_snapshot( + rocksdb_t* db, + const rocksdb_snapshot_t* snapshot) { + db->rep->ReleaseSnapshot(snapshot->rep); + delete snapshot; +} + +char* rocksdb_property_value( + rocksdb_t* db, + const char* propname) { + std::string tmp; + if (db->rep->GetProperty(Slice(propname), &tmp)) { + // We use strdup() since we expect human readable output.
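+ // strdup returns a malloc'd copy, so callers can release it with free(), + // just like other strings returned by this C API.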
+ return strdup(tmp.c_str()); + } else { + return nullptr; + } +} + +void rocksdb_approximate_sizes( + rocksdb_t* db, + int num_ranges, + const char* const* range_start_key, const size_t* range_start_key_len, + const char* const* range_limit_key, const size_t* range_limit_key_len, + uint64_t* sizes) { + Range* ranges = new Range[num_ranges]; + for (int i = 0; i < num_ranges; i++) { + ranges[i].start = Slice(range_start_key[i], range_start_key_len[i]); + ranges[i].limit = Slice(range_limit_key[i], range_limit_key_len[i]); + } + db->rep->GetApproximateSizes(ranges, num_ranges, sizes); + delete[] ranges; +} + +void rocksdb_delete_file( + rocksdb_t* db, + const char* name) { + db->rep->DeleteFile(name); +} + +const rocksdb_livefiles_t* rocksdb_livefiles( + rocksdb_t* db) { + rocksdb_livefiles_t* result = new rocksdb_livefiles_t; + db->rep->GetLiveFilesMetaData(&result->rep); + return result; +} + +void rocksdb_compact_range( + rocksdb_t* db, + const char* start_key, size_t start_key_len, + const char* limit_key, size_t limit_key_len) { + Slice a, b; + db->rep->CompactRange( + // Pass nullptr Slice if corresponding "const char*" is nullptr + (start_key ? (a = Slice(start_key, start_key_len), &a) : nullptr), + (limit_key ? (b = Slice(limit_key, limit_key_len), &b) : nullptr)); +} + +void rocksdb_flush( + rocksdb_t* db, + const rocksdb_flushoptions_t* options, + char** errptr) { + SaveError(errptr, db->rep->Flush(options->rep)); +} + +void rocksdb_disable_file_deletions( + rocksdb_t* db, + char** errptr) { + SaveError(errptr, db->rep->DisableFileDeletions()); +} + +void rocksdb_enable_file_deletions( + rocksdb_t* db, + unsigned char force, + char** errptr) { + SaveError(errptr, db->rep->EnableFileDeletions(force)); +} + +void rocksdb_destroy_db( + const rocksdb_options_t* options, + const char* name, + char** errptr) { + SaveError(errptr, DestroyDB(name, options->rep)); +} + +void rocksdb_repair_db( + const rocksdb_options_t* options, + const char* name, + char** errptr) { + SaveError(errptr, RepairDB(name, options->rep)); +} + +void rocksdb_iter_destroy(rocksdb_iterator_t* iter) { + delete iter->rep; + delete iter; +} + +unsigned char rocksdb_iter_valid(const rocksdb_iterator_t* iter) { + return iter->rep->Valid(); +} + +void rocksdb_iter_seek_to_first(rocksdb_iterator_t* iter) { + iter->rep->SeekToFirst(); +} + +void rocksdb_iter_seek_to_last(rocksdb_iterator_t* iter) { + iter->rep->SeekToLast(); +} + +void rocksdb_iter_seek(rocksdb_iterator_t* iter, const char* k, size_t klen) { + iter->rep->Seek(Slice(k, klen)); +} + +void rocksdb_iter_next(rocksdb_iterator_t* iter) { + iter->rep->Next(); +} + +void rocksdb_iter_prev(rocksdb_iterator_t* iter) { + iter->rep->Prev(); +} + +const char* rocksdb_iter_key(const rocksdb_iterator_t* iter, size_t* klen) { + Slice s = iter->rep->key(); + *klen = s.size(); + return s.data(); +} + +const char* rocksdb_iter_value(const rocksdb_iterator_t* iter, size_t* vlen) { + Slice s = iter->rep->value(); + *vlen = s.size(); + return s.data(); +} + +void rocksdb_iter_get_error(const rocksdb_iterator_t* iter, char** errptr) { + SaveError(errptr, iter->rep->status()); +} + +rocksdb_writebatch_t* rocksdb_writebatch_create() { + return new rocksdb_writebatch_t; +} + +void rocksdb_writebatch_destroy(rocksdb_writebatch_t* b) { + delete b; +} + +void rocksdb_writebatch_clear(rocksdb_writebatch_t* b) { + b->rep.Clear(); +} + +int rocksdb_writebatch_count(rocksdb_writebatch_t* b) { + return b->rep.Count(); +} + +void rocksdb_writebatch_put( + rocksdb_writebatch_t* b, + const 
char* key, size_t klen,
+    const char* val, size_t vlen) {
+  b->rep.Put(Slice(key, klen), Slice(val, vlen));
+}
+
+void rocksdb_writebatch_merge(
+    rocksdb_writebatch_t* b,
+    const char* key, size_t klen,
+    const char* val, size_t vlen) {
+  b->rep.Merge(Slice(key, klen), Slice(val, vlen));
+}
+
+void rocksdb_writebatch_delete(
+    rocksdb_writebatch_t* b,
+    const char* key, size_t klen) {
+  b->rep.Delete(Slice(key, klen));
+}
+
+void rocksdb_writebatch_iterate(
+    rocksdb_writebatch_t* b,
+    void* state,
+    void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen),
+    void (*deleted)(void*, const char* k, size_t klen)) {
+  class H : public WriteBatch::Handler {
+   public:
+    void* state_;
+    void (*put_)(void*, const char* k, size_t klen, const char* v, size_t vlen);
+    void (*deleted_)(void*, const char* k, size_t klen);
+    virtual void Put(const Slice& key, const Slice& value) {
+      (*put_)(state_, key.data(), key.size(), value.data(), value.size());
+    }
+    virtual void Delete(const Slice& key) {
+      (*deleted_)(state_, key.data(), key.size());
+    }
+  };
+  H handler;
+  handler.state_ = state;
+  handler.put_ = put;
+  handler.deleted_ = deleted;
+  b->rep.Iterate(&handler);
+}
+
+const char* rocksdb_writebatch_data(rocksdb_writebatch_t* b, size_t* size) {
+  *size = b->rep.GetDataSize();
+  return b->rep.Data().c_str();
+}
+
+rocksdb_options_t* rocksdb_options_create() {
+  return new rocksdb_options_t;
+}
+
+void rocksdb_options_destroy(rocksdb_options_t* options) {
+  delete options;
+}
+
+void rocksdb_options_set_comparator(
+    rocksdb_options_t* opt,
+    rocksdb_comparator_t* cmp) {
+  opt->rep.comparator = cmp;
+}
+
+void rocksdb_options_set_merge_operator(
+    rocksdb_options_t* opt,
+    rocksdb_mergeoperator_t* merge_operator) {
+  opt->rep.merge_operator = std::shared_ptr<MergeOperator>(merge_operator);
+}
+
+void rocksdb_options_set_filter_policy(
+    rocksdb_options_t* opt,
+    rocksdb_filterpolicy_t* policy) {
+  opt->rep.filter_policy = policy;
+}
+
+void rocksdb_options_set_create_if_missing(
+    rocksdb_options_t* opt, unsigned char v) {
+  opt->rep.create_if_missing = v;
+}
+
+void rocksdb_options_set_error_if_exists(
+    rocksdb_options_t* opt, unsigned char v) {
+  opt->rep.error_if_exists = v;
+}
+
+void rocksdb_options_set_paranoid_checks(
+    rocksdb_options_t* opt, unsigned char v) {
+  opt->rep.paranoid_checks = v;
+}
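+// A minimal usage sketch of the C API above (caller-side names are
+// illustrative). Errors follow the errptr convention: *errptr is left
+// untouched on success and set to a malloc'd message on failure, which the
+// caller must free():
+//
+//   char* err = NULL;
+//   rocksdb_options_t* opts = rocksdb_options_create();
+//   rocksdb_options_set_create_if_missing(opts, 1);
+//   rocksdb_t* db = rocksdb_open(opts, "/tmp/example_db", &err);
+//   if (err) { fprintf(stderr, "open: %s\n", err); free(err); }
+//   rocksdb_writeoptions_t* wopts = rocksdb_writeoptions_create();
+//   rocksdb_put(db, wopts, "key", 3, "value", 5, &err);
+//   rocksdb_writeoptions_destroy(wopts);
+//   rocksdb_options_destroy(opts);
+//   rocksdb_close(db);
+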
+void rocksdb_options_set_env(rocksdb_options_t* opt, rocksdb_env_t* env) {
+  opt->rep.env = (env ? env->rep : nullptr);
+}
+
+void rocksdb_options_set_info_log(rocksdb_options_t* opt, rocksdb_logger_t* l) {
+  if (l) {
+    opt->rep.info_log = l->rep;
+  }
+}
+
+void rocksdb_options_set_info_log_level(
+    rocksdb_options_t* opt, int v) {
+  opt->rep.info_log_level = static_cast<InfoLogLevel>(v);
+}
+
+void rocksdb_options_set_write_buffer_size(rocksdb_options_t* opt, size_t s) {
+  opt->rep.write_buffer_size = s;
+}
+
+void rocksdb_options_set_max_open_files(rocksdb_options_t* opt, int n) {
+  opt->rep.max_open_files = n;
+}
+
+void rocksdb_options_set_cache(rocksdb_options_t* opt, rocksdb_cache_t* c) {
+  if (c) {
+    opt->rep.block_cache = c->rep;
+  }
+}
+
+void rocksdb_options_set_cache_compressed(rocksdb_options_t* opt, rocksdb_cache_t* c) {
+  if (c) {
+    opt->rep.block_cache_compressed = c->rep;
+  }
+}
+
+void rocksdb_options_set_block_size(rocksdb_options_t* opt, size_t s) {
+  opt->rep.block_size = s;
+}
+
+void rocksdb_options_set_block_restart_interval(rocksdb_options_t* opt, int n) {
+  opt->rep.block_restart_interval = n;
+}
+
+void rocksdb_options_set_target_file_size_base(
+    rocksdb_options_t* opt, uint64_t n) {
+  opt->rep.target_file_size_base = n;
+}
+
+void rocksdb_options_set_target_file_size_multiplier(
+    rocksdb_options_t* opt, int n) {
+  opt->rep.target_file_size_multiplier = n;
+}
+
+void rocksdb_options_set_max_bytes_for_level_base(
+    rocksdb_options_t* opt, uint64_t n) {
+  opt->rep.max_bytes_for_level_base = n;
+}
+
+void rocksdb_options_set_max_bytes_for_level_multiplier(
+    rocksdb_options_t* opt, int n) {
+  opt->rep.max_bytes_for_level_multiplier = n;
+}
+
+void rocksdb_options_set_expanded_compaction_factor(
+    rocksdb_options_t* opt, int n) {
+  opt->rep.expanded_compaction_factor = n;
+}
+
+void rocksdb_options_set_max_grandparent_overlap_factor(
+    rocksdb_options_t* opt, int n) {
+  opt->rep.max_grandparent_overlap_factor = n;
+}
+
+void rocksdb_options_set_max_bytes_for_level_multiplier_additional(
+    rocksdb_options_t* opt, int* level_values, size_t num_levels) {
+  opt->rep.max_bytes_for_level_multiplier_additional.resize(num_levels);
+  for (size_t i = 0; i < num_levels; ++i) {
+    opt->rep.max_bytes_for_level_multiplier_additional[i] = level_values[i];
+  }
+}
+
+void rocksdb_options_enable_statistics(rocksdb_options_t* opt) {
+  opt->rep.statistics = rocksdb::CreateDBStatistics();
+}
+
+void rocksdb_options_set_num_levels(rocksdb_options_t* opt, int n) {
+  opt->rep.num_levels = n;
+}
+
+void rocksdb_options_set_level0_file_num_compaction_trigger(
+    rocksdb_options_t* opt, int n) {
+  opt->rep.level0_file_num_compaction_trigger = n;
+}
+
+void rocksdb_options_set_level0_slowdown_writes_trigger(
+    rocksdb_options_t* opt, int n) {
+  opt->rep.level0_slowdown_writes_trigger = n;
+}
+
+void rocksdb_options_set_level0_stop_writes_trigger(
+    rocksdb_options_t* opt, int n) {
+  opt->rep.level0_stop_writes_trigger = n;
+}
+
+void rocksdb_options_set_max_mem_compaction_level(
+    rocksdb_options_t* opt, int n) {
+  opt->rep.max_mem_compaction_level = n;
+}
+
+void rocksdb_options_set_compression(rocksdb_options_t* opt, int t) {
+  opt->rep.compression = static_cast<CompressionType>(t);
+}
+
+void rocksdb_options_set_compression_per_level(rocksdb_options_t* opt,
+                                               int* level_values,
+                                               size_t num_levels) {
+  opt->rep.compression_per_level.resize(num_levels);
+  for (size_t i = 0; i < num_levels; ++i) {
+    opt->rep.compression_per_level[i] =
+        static_cast<CompressionType>(level_values[i]);
+  }
+}
+
+void rocksdb_options_set_compression_options(
+    rocksdb_options_t* opt, int w_bits, int level, int strategy) {
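+  // These map to the zlib-style parameters: a negative window_bits selects
+  // raw deflate without the zlib header, level -1 asks for the library
+  // default, and strategy 0 is the default strategy. db/c_test.c exercises
+  // exactly that combination:
+  //   rocksdb_options_set_compression_options(options, -14, -1, 0);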
opt->rep.compression_opts.window_bits = w_bits; + opt->rep.compression_opts.level = level; + opt->rep.compression_opts.strategy = strategy; +} + +void rocksdb_options_set_prefix_extractor( + rocksdb_options_t* opt, rocksdb_slicetransform_t* prefix_extractor) { + opt->rep.prefix_extractor.reset(prefix_extractor); +} + +void rocksdb_options_set_whole_key_filtering( + rocksdb_options_t* opt, unsigned char v) { + opt->rep.whole_key_filtering = v; +} + +void rocksdb_options_set_disable_data_sync( + rocksdb_options_t* opt, int disable_data_sync) { + opt->rep.disableDataSync = disable_data_sync; +} + +void rocksdb_options_set_use_fsync( + rocksdb_options_t* opt, int use_fsync) { + opt->rep.use_fsync = use_fsync; +} + +void rocksdb_options_set_db_stats_log_interval( + rocksdb_options_t* opt, int db_stats_log_interval) { + opt->rep.db_stats_log_interval = db_stats_log_interval; +} + +void rocksdb_options_set_db_log_dir( + rocksdb_options_t* opt, const char* db_log_dir) { + opt->rep.db_log_dir = db_log_dir; +} + +void rocksdb_options_set_wal_dir( + rocksdb_options_t* opt, const char* v) { + opt->rep.wal_dir = v; +} + +void rocksdb_options_set_WAL_ttl_seconds(rocksdb_options_t* opt, uint64_t ttl) { + opt->rep.WAL_ttl_seconds = ttl; +} + +void rocksdb_options_set_WAL_size_limit_MB( + rocksdb_options_t* opt, uint64_t limit) { + opt->rep.WAL_size_limit_MB = limit; +} + +void rocksdb_options_set_manifest_preallocation_size( + rocksdb_options_t* opt, size_t v) { + opt->rep.manifest_preallocation_size = v; +} + +void rocksdb_options_set_purge_redundant_kvs_while_flush( + rocksdb_options_t* opt, unsigned char v) { + opt->rep.purge_redundant_kvs_while_flush = v; +} + +void rocksdb_options_set_allow_os_buffer( + rocksdb_options_t* opt, unsigned char v) { + opt->rep.allow_os_buffer = v; +} + +void rocksdb_options_set_allow_mmap_reads( + rocksdb_options_t* opt, unsigned char v) { + opt->rep.allow_mmap_reads = v; +} + +void rocksdb_options_set_allow_mmap_writes( + rocksdb_options_t* opt, unsigned char v) { + opt->rep.allow_mmap_writes = v; +} + +void rocksdb_options_set_is_fd_close_on_exec( + rocksdb_options_t* opt, unsigned char v) { + opt->rep.is_fd_close_on_exec = v; +} + +void rocksdb_options_set_skip_log_error_on_recovery( + rocksdb_options_t* opt, unsigned char v) { + opt->rep.skip_log_error_on_recovery = v; +} + +void rocksdb_options_set_stats_dump_period_sec( + rocksdb_options_t* opt, unsigned int v) { + opt->rep.stats_dump_period_sec = v; +} + +void rocksdb_options_set_block_size_deviation( + rocksdb_options_t* opt, int v) { + opt->rep.block_size_deviation = v; +} + +void rocksdb_options_set_advise_random_on_open( + rocksdb_options_t* opt, unsigned char v) { + opt->rep.advise_random_on_open = v; +} + +void rocksdb_options_set_access_hint_on_compaction_start( + rocksdb_options_t* opt, int v) { + switch(v) { + case 0: + opt->rep.access_hint_on_compaction_start = rocksdb::Options::NONE; + break; + case 1: + opt->rep.access_hint_on_compaction_start = rocksdb::Options::NORMAL; + break; + case 2: + opt->rep.access_hint_on_compaction_start = rocksdb::Options::SEQUENTIAL; + break; + case 3: + opt->rep.access_hint_on_compaction_start = rocksdb::Options::WILLNEED; + break; + } +} + +void rocksdb_options_set_use_adaptive_mutex( + rocksdb_options_t* opt, unsigned char v) { + opt->rep.use_adaptive_mutex = v; +} + +void rocksdb_options_set_bytes_per_sync( + rocksdb_options_t* opt, uint64_t v) { + opt->rep.bytes_per_sync = v; +} + +void rocksdb_options_set_verify_checksums_in_compaction( + rocksdb_options_t* opt, 
unsigned char v) {
+  opt->rep.verify_checksums_in_compaction = v;
+}
+
+void rocksdb_options_set_filter_deletes(
+    rocksdb_options_t* opt, unsigned char v) {
+  opt->rep.filter_deletes = v;
+}
+
+void rocksdb_options_set_max_sequential_skip_in_iterations(
+    rocksdb_options_t* opt, uint64_t v) {
+  opt->rep.max_sequential_skip_in_iterations = v;
+}
+
+void rocksdb_options_set_max_write_buffer_number(rocksdb_options_t* opt, int n) {
+  opt->rep.max_write_buffer_number = n;
+}
+
+void rocksdb_options_set_min_write_buffer_number_to_merge(rocksdb_options_t* opt, int n) {
+  opt->rep.min_write_buffer_number_to_merge = n;
+}
+
+void rocksdb_options_set_max_background_compactions(rocksdb_options_t* opt, int n) {
+  opt->rep.max_background_compactions = n;
+}
+
+void rocksdb_options_set_max_background_flushes(rocksdb_options_t* opt, int n) {
+  opt->rep.max_background_flushes = n;
+}
+
+void rocksdb_options_set_max_log_file_size(rocksdb_options_t* opt, size_t v) {
+  opt->rep.max_log_file_size = v;
+}
+
+void rocksdb_options_set_log_file_time_to_roll(rocksdb_options_t* opt, size_t v) {
+  opt->rep.log_file_time_to_roll = v;
+}
+
+void rocksdb_options_set_keep_log_file_num(rocksdb_options_t* opt, size_t v) {
+  opt->rep.keep_log_file_num = v;
+}
+
+void rocksdb_options_set_soft_rate_limit(rocksdb_options_t* opt, double v) {
+  opt->rep.soft_rate_limit = v;
+}
+
+void rocksdb_options_set_hard_rate_limit(rocksdb_options_t* opt, double v) {
+  opt->rep.hard_rate_limit = v;
+}
+
+void rocksdb_options_set_rate_limit_delay_max_milliseconds(
+    rocksdb_options_t* opt, unsigned int v) {
+  opt->rep.rate_limit_delay_max_milliseconds = v;
+}
+
+void rocksdb_options_set_max_manifest_file_size(
+    rocksdb_options_t* opt, size_t v) {
+  opt->rep.max_manifest_file_size = v;
+}
+
+void rocksdb_options_set_no_block_cache(
+    rocksdb_options_t* opt, unsigned char v) {
+  opt->rep.no_block_cache = v;
+}
+
+void rocksdb_options_set_table_cache_numshardbits(
+    rocksdb_options_t* opt, int v) {
+  opt->rep.table_cache_numshardbits = v;
+}
+
+void rocksdb_options_set_table_cache_remove_scan_count_limit(
+    rocksdb_options_t* opt, int v) {
+  opt->rep.table_cache_remove_scan_count_limit = v;
+}
+
+void rocksdb_options_set_arena_block_size(
+    rocksdb_options_t* opt, size_t v) {
+  opt->rep.arena_block_size = v;
+}
+
+void rocksdb_options_set_disable_auto_compactions(rocksdb_options_t* opt, int disable) {
+  opt->rep.disable_auto_compactions = disable;
+}
+
+void rocksdb_options_set_disable_seek_compaction(rocksdb_options_t* opt, int disable) {
+  opt->rep.disable_seek_compaction = disable;
+}
+
+void rocksdb_options_set_delete_obsolete_files_period_micros(
+    rocksdb_options_t* opt, uint64_t v) {
+  opt->rep.delete_obsolete_files_period_micros = v;
+}
+
+void rocksdb_options_set_source_compaction_factor(
+    rocksdb_options_t* opt, int n) {
+  opt->rep.source_compaction_factor = n;
+}
+
+void rocksdb_options_prepare_for_bulk_load(rocksdb_options_t* opt) {
+  opt->rep.PrepareForBulkLoad();
+}
+
+void rocksdb_options_set_memtable_vector_rep(rocksdb_options_t *opt) {
+  // Each options object gets its own factory instance; handing one static
+  // instance to several shared_ptr resets would lead to a double delete.
+  opt->rep.memtable_factory.reset(new rocksdb::VectorRepFactory);
+}
+
+void rocksdb_options_set_memtable_prefix_bloom_bits(
+    rocksdb_options_t* opt, uint32_t v) {
+  opt->rep.memtable_prefix_bloom_bits = v;
+}
+
+void rocksdb_options_set_memtable_prefix_bloom_probes(
+    rocksdb_options_t* opt, uint32_t v) {
+  opt->rep.memtable_prefix_bloom_probes = v;
+}
+
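+// Sketch (illustrative): the prefix-hashed memtable representations set up
+// below are typically combined with a prefix extractor on the same options
+// object, as db/c_test.c does in its "prefix" phase:
+//
+//   rocksdb_options_set_prefix_extractor(
+//       opts, rocksdb_slicetransform_create_fixed_prefix(3));
+//   rocksdb_options_set_hash_skip_list_rep(opts, 50000, 4, 4);
+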
+void rocksdb_options_set_hash_skip_list_rep(
+    rocksdb_options_t *opt, size_t bucket_count,
+    int32_t skiplist_height, int32_t skiplist_branching_factor) {
+  opt->rep.memtable_factory.reset(rocksdb::NewHashSkipListRepFactory(
+      bucket_count, skiplist_height, skiplist_branching_factor));
+}
+
+void rocksdb_options_set_hash_link_list_rep(
+    rocksdb_options_t *opt, size_t bucket_count) {
+  opt->rep.memtable_factory.reset(
+      rocksdb::NewHashLinkListRepFactory(bucket_count));
+}
+
+void rocksdb_options_set_plain_table_factory(
+    rocksdb_options_t *opt, uint32_t user_key_len, int bloom_bits_per_key,
+    double hash_table_ratio, size_t index_sparseness) {
+  opt->rep.table_factory.reset(rocksdb::NewPlainTableFactory(
+      user_key_len, bloom_bits_per_key,
+      hash_table_ratio, index_sparseness));
+}
+
+void rocksdb_options_set_max_successive_merges(
+    rocksdb_options_t* opt, size_t v) {
+  opt->rep.max_successive_merges = v;
+}
+
+void rocksdb_options_set_min_partial_merge_operands(
+    rocksdb_options_t* opt, uint32_t v) {
+  opt->rep.min_partial_merge_operands = v;
+}
+
+void rocksdb_options_set_bloom_locality(
+    rocksdb_options_t* opt, uint32_t v) {
+  opt->rep.bloom_locality = v;
+}
+
+void rocksdb_options_set_allow_thread_local(
+    rocksdb_options_t* opt, unsigned char v) {
+  opt->rep.allow_thread_local = v;
+}
+
+void rocksdb_options_set_inplace_update_support(
+    rocksdb_options_t* opt, unsigned char v) {
+  opt->rep.inplace_update_support = v;
+}
+
+void rocksdb_options_set_inplace_update_num_locks(
+    rocksdb_options_t* opt, size_t v) {
+  opt->rep.inplace_update_num_locks = v;
+}
+
+void rocksdb_options_set_compaction_style(rocksdb_options_t *opt, int style) {
+  opt->rep.compaction_style = static_cast<rocksdb::CompactionStyle>(style);
+}
+
+void rocksdb_options_set_universal_compaction_options(rocksdb_options_t *opt, rocksdb_universal_compaction_options_t *uco) {
+  opt->rep.compaction_options_universal = *(uco->rep);
+}
+
+/*
+TODO:
+DB::MultiGet
+DB::KeyMayExist
+DB::GetOptions
+DB::GetSortedWalFiles
+DB::GetLatestSequenceNumber
+DB::GetUpdatesSince
+DB::GetDbIdentity
+DB::RunManualCompaction
+custom cache
+compaction_filter
+table_properties_collectors
+*/
+
+rocksdb_comparator_t* rocksdb_comparator_create(
+    void* state,
+    void (*destructor)(void*),
+    int (*compare)(
+        void*,
+        const char* a, size_t alen,
+        const char* b, size_t blen),
+    const char* (*name)(void*)) {
+  rocksdb_comparator_t* result = new rocksdb_comparator_t;
+  result->state_ = state;
+  result->destructor_ = destructor;
+  result->compare_ = compare;
+  result->name_ = name;
+  return result;
+}
+
+void rocksdb_comparator_destroy(rocksdb_comparator_t* cmp) {
+  delete cmp;
+}
+
+rocksdb_filterpolicy_t* rocksdb_filterpolicy_create(
+    void* state,
+    void (*destructor)(void*),
+    char* (*create_filter)(
+        void*,
+        const char* const* key_array, const size_t* key_length_array,
+        int num_keys,
+        size_t* filter_length),
+    unsigned char (*key_may_match)(
+        void*,
+        const char* key, size_t length,
+        const char* filter, size_t filter_length),
+    void (*delete_filter)(
+        void*,
+        const char* filter, size_t filter_length),
+    const char* (*name)(void*)) {
+  rocksdb_filterpolicy_t* result = new rocksdb_filterpolicy_t;
+  result->state_ = state;
+  result->destructor_ = destructor;
+  result->create_
= create_filter; + result->key_match_ = key_may_match; + result->delete_filter_ = delete_filter; + result->name_ = name; + return result; +} + +void rocksdb_filterpolicy_destroy(rocksdb_filterpolicy_t* filter) { + delete filter; +} + +rocksdb_filterpolicy_t* rocksdb_filterpolicy_create_bloom(int bits_per_key) { + // Make a rocksdb_filterpolicy_t, but override all of its methods so + // they delegate to a NewBloomFilterPolicy() instead of user + // supplied C functions. + struct Wrapper : public rocksdb_filterpolicy_t { + const FilterPolicy* rep_; + ~Wrapper() { delete rep_; } + const char* Name() const { return rep_->Name(); } + void CreateFilter(const Slice* keys, int n, std::string* dst) const { + return rep_->CreateFilter(keys, n, dst); + } + bool KeyMayMatch(const Slice& key, const Slice& filter) const { + return rep_->KeyMayMatch(key, filter); + } + static void DoNothing(void*) { } + }; + Wrapper* wrapper = new Wrapper; + wrapper->rep_ = NewBloomFilterPolicy(bits_per_key); + wrapper->state_ = nullptr; + wrapper->delete_filter_ = nullptr; + wrapper->destructor_ = &Wrapper::DoNothing; + return wrapper; +} + +rocksdb_mergeoperator_t* rocksdb_mergeoperator_create( + void* state, void (*destructor)(void*), + char* (*full_merge)(void*, const char* key, size_t key_length, + const char* existing_value, + size_t existing_value_length, + const char* const* operands_list, + const size_t* operands_list_length, int num_operands, + unsigned char* success, size_t* new_value_length), + char* (*partial_merge)(void*, const char* key, size_t key_length, + const char* const* operands_list, + const size_t* operands_list_length, int num_operands, + unsigned char* success, size_t* new_value_length), + void (*delete_value)(void*, const char* value, size_t value_length), + const char* (*name)(void*)) { + rocksdb_mergeoperator_t* result = new rocksdb_mergeoperator_t; + result->state_ = state; + result->destructor_ = destructor; + result->full_merge_ = full_merge; + result->partial_merge_ = partial_merge; + result->delete_value_ = delete_value; + result->name_ = name; + return result; +} + +void rocksdb_mergeoperator_destroy(rocksdb_mergeoperator_t* merge_operator) { + delete merge_operator; +} + +rocksdb_readoptions_t* rocksdb_readoptions_create() { + return new rocksdb_readoptions_t; +} + +void rocksdb_readoptions_destroy(rocksdb_readoptions_t* opt) { + delete opt; +} + +void rocksdb_readoptions_set_verify_checksums( + rocksdb_readoptions_t* opt, + unsigned char v) { + opt->rep.verify_checksums = v; +} + +void rocksdb_readoptions_set_fill_cache( + rocksdb_readoptions_t* opt, unsigned char v) { + opt->rep.fill_cache = v; +} + +void rocksdb_readoptions_set_snapshot( + rocksdb_readoptions_t* opt, + const rocksdb_snapshot_t* snap) { + opt->rep.snapshot = (snap ? 
snap->rep : nullptr);
+}
+
+void rocksdb_readoptions_set_read_tier(
+    rocksdb_readoptions_t* opt, int v) {
+  opt->rep.read_tier = static_cast<ReadTier>(v);
+}
+
+void rocksdb_readoptions_set_tailing(
+    rocksdb_readoptions_t* opt, unsigned char v) {
+  opt->rep.tailing = v;
+}
+
+rocksdb_writeoptions_t* rocksdb_writeoptions_create() {
+  return new rocksdb_writeoptions_t;
+}
+
+void rocksdb_writeoptions_destroy(rocksdb_writeoptions_t* opt) {
+  delete opt;
+}
+
+void rocksdb_writeoptions_set_sync(
+    rocksdb_writeoptions_t* opt, unsigned char v) {
+  opt->rep.sync = v;
+}
+
+void rocksdb_writeoptions_disable_WAL(rocksdb_writeoptions_t* opt, int disable) {
+  opt->rep.disableWAL = disable;
+}
+
+rocksdb_flushoptions_t* rocksdb_flushoptions_create() {
+  return new rocksdb_flushoptions_t;
+}
+
+void rocksdb_flushoptions_destroy(rocksdb_flushoptions_t* opt) {
+  delete opt;
+}
+
+void rocksdb_flushoptions_set_wait(
+    rocksdb_flushoptions_t* opt, unsigned char v) {
+  opt->rep.wait = v;
+}
+
+rocksdb_cache_t* rocksdb_cache_create_lru(size_t capacity) {
+  rocksdb_cache_t* c = new rocksdb_cache_t;
+  c->rep = NewLRUCache(capacity);
+  return c;
+}
+
+void rocksdb_cache_destroy(rocksdb_cache_t* cache) {
+  delete cache;
+}
+
+rocksdb_env_t* rocksdb_create_default_env() {
+  rocksdb_env_t* result = new rocksdb_env_t;
+  result->rep = Env::Default();
+  result->is_default = true;
+  return result;
+}
+
+void rocksdb_env_set_background_threads(rocksdb_env_t* env, int n) {
+  env->rep->SetBackgroundThreads(n);
+}
+
+void rocksdb_env_set_high_priority_background_threads(rocksdb_env_t* env, int n) {
+  env->rep->SetBackgroundThreads(n, Env::HIGH);
+}
+
+void rocksdb_env_destroy(rocksdb_env_t* env) {
+  if (!env->is_default) delete env->rep;
+  delete env;
+}
+
+rocksdb_slicetransform_t* rocksdb_slicetransform_create(
+    void* state,
+    void (*destructor)(void*),
+    char* (*transform)(
+        void*,
+        const char* key, size_t length,
+        size_t* dst_length),
+    unsigned char (*in_domain)(
+        void*,
+        const char* key, size_t length),
+    unsigned char (*in_range)(
+        void*,
+        const char* key, size_t length),
+    const char* (*name)(void*)) {
+  rocksdb_slicetransform_t* result = new rocksdb_slicetransform_t;
+  result->state_ = state;
+  result->destructor_ = destructor;
+  result->transform_ = transform;
+  result->in_domain_ = in_domain;
+  result->in_range_ = in_range;
+  result->name_ = name;
+  return result;
+}
+
+void rocksdb_slicetransform_destroy(rocksdb_slicetransform_t* st) {
+  delete st;
+}
+
+rocksdb_slicetransform_t* rocksdb_slicetransform_create_fixed_prefix(size_t prefixLen) {
+  struct Wrapper : public rocksdb_slicetransform_t {
+    const SliceTransform* rep_;
+    ~Wrapper() { delete rep_; }
+    const char* Name() const { return rep_->Name(); }
+    Slice Transform(const Slice& src) const {
+      return rep_->Transform(src);
+    }
+    bool InDomain(const Slice& src) const {
+      return rep_->InDomain(src);
+    }
+    bool InRange(const Slice& src) const {
+      return rep_->InRange(src);
+    }
+    static void DoNothing(void*) { }
+  };
+  Wrapper* wrapper = new Wrapper;
+  wrapper->rep_ = rocksdb::NewFixedPrefixTransform(prefixLen);
+  wrapper->state_ = nullptr;
+  wrapper->destructor_ = &Wrapper::DoNothing;
+  return wrapper;
+}
+
+rocksdb_universal_compaction_options_t* rocksdb_universal_compaction_options_create() {
+  rocksdb_universal_compaction_options_t* result = new rocksdb_universal_compaction_options_t;
+  result->rep = new rocksdb::CompactionOptionsUniversal;
+  return result;
+}
+
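+// Illustrative use: the universal compaction options object is created,
+// applied, and then destroyed by the caller.
+// rocksdb_options_set_universal_compaction_options() copies the underlying
+// struct, so destroying uco right afterwards is safe:
+//
+//   rocksdb_universal_compaction_options_t* uco =
+//       rocksdb_universal_compaction_options_create();
+//   rocksdb_universal_compaction_options_set_size_ratio(uco, 2);
+//   rocksdb_options_set_universal_compaction_options(opts, uco);
+//   rocksdb_universal_compaction_options_destroy(uco);
+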
+void rocksdb_universal_compaction_options_set_size_ratio(
+    rocksdb_universal_compaction_options_t* uco, int ratio) {
+  uco->rep->size_ratio = ratio;
+}
+
+void rocksdb_universal_compaction_options_set_min_merge_width(
+    rocksdb_universal_compaction_options_t* uco, int w) {
+  uco->rep->min_merge_width = w;
+}
+
+void rocksdb_universal_compaction_options_set_max_merge_width(
+    rocksdb_universal_compaction_options_t* uco, int w) {
+  uco->rep->max_merge_width = w;
+}
+
+void rocksdb_universal_compaction_options_set_max_size_amplification_percent(
+    rocksdb_universal_compaction_options_t* uco, int p) {
+  uco->rep->max_size_amplification_percent = p;
+}
+
+void rocksdb_universal_compaction_options_set_compression_size_percent(
+    rocksdb_universal_compaction_options_t* uco, int p) {
+  uco->rep->compression_size_percent = p;
+}
+
+void rocksdb_universal_compaction_options_set_stop_style(
+    rocksdb_universal_compaction_options_t* uco, int style) {
+  uco->rep->stop_style = static_cast<rocksdb::CompactionStopStyle>(style);
+}
+
+void rocksdb_universal_compaction_options_destroy(
+    rocksdb_universal_compaction_options_t* uco) {
+  delete uco->rep;
+  delete uco;
+}
+
+void rocksdb_options_set_min_level_to_compress(rocksdb_options_t* opt, int level) {
+  if (level >= 0) {
+    assert(level <= opt->rep.num_levels);
+    opt->rep.compression_per_level.resize(opt->rep.num_levels);
+    for (int i = 0; i < level; i++) {
+      opt->rep.compression_per_level[i] = rocksdb::kNoCompression;
+    }
+    for (int i = level; i < opt->rep.num_levels; i++) {
+      opt->rep.compression_per_level[i] = opt->rep.compression;
+    }
+  }
+}
+
+int rocksdb_livefiles_count(
+    const rocksdb_livefiles_t* lf) {
+  return lf->rep.size();
+}
+
+const char* rocksdb_livefiles_name(
+    const rocksdb_livefiles_t* lf,
+    int index) {
+  return lf->rep[index].name.c_str();
+}
+
+int rocksdb_livefiles_level(
+    const rocksdb_livefiles_t* lf,
+    int index) {
+  return lf->rep[index].level;
+}
+
+size_t rocksdb_livefiles_size(
+    const rocksdb_livefiles_t* lf,
+    int index) {
+  return lf->rep[index].size;
+}
+
+const char* rocksdb_livefiles_smallestkey(
+    const rocksdb_livefiles_t* lf,
+    int index,
+    size_t* size) {
+  *size = lf->rep[index].smallestkey.size();
+  return lf->rep[index].smallestkey.data();
+}
+
+const char* rocksdb_livefiles_largestkey(
+    const rocksdb_livefiles_t* lf,
+    int index,
+    size_t* size) {
+  *size = lf->rep[index].largestkey.size();
+  return lf->rep[index].largestkey.data();
+}
+
+extern void rocksdb_livefiles_destroy(
+    const rocksdb_livefiles_t* lf) {
+  delete lf;
+}
+
+}  // end extern "C"
+
+#endif  // ROCKSDB_LITE
diff --git a/db/c_test.c b/db/c_test.c
new file mode 100644
index 0000000000..8ebce9085c
--- /dev/null
+++ b/db/c_test.c
@@ -0,0 +1,494 @@
+/* Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+   Use of this source code is governed by a BSD-style license that can be
+   found in the LICENSE file. See the AUTHORS file for names of contributors.
*/
+
+#include "rocksdb/c.h"
+
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+const char* phase = "";
+static char dbname[200];
+
+static void StartPhase(const char* name) {
+  fprintf(stderr, "=== Test %s\n", name);
+  phase = name;
+}
+
+static const char* GetTempDir(void) {
+  const char* ret = getenv("TEST_TMPDIR");
+  if (ret == NULL || ret[0] == '\0')
+    ret = "/tmp";
+  return ret;
+}
+
+#define CheckNoError(err)                                               \
+  if ((err) != NULL) {                                                  \
+    fprintf(stderr, "%s:%d: %s: %s\n", __FILE__, __LINE__, phase, (err)); \
+    abort();                                                            \
+  }
+
+#define CheckCondition(cond)                                            \
+  if (!(cond)) {                                                        \
+    fprintf(stderr, "%s:%d: %s: %s\n", __FILE__, __LINE__, phase, #cond); \
+    abort();                                                            \
+  }
+
+static void CheckEqual(const char* expected, const char* v, size_t n) {
+  if (expected == NULL && v == NULL) {
+    // ok
+  } else if (expected != NULL && v != NULL && n == strlen(expected) &&
+             memcmp(expected, v, n) == 0) {
+    // ok
+    return;
+  } else {
+    fprintf(stderr, "%s: expected '%s', got '%s'\n",
+            phase,
+            (expected ? expected : "(null)"),
+            (v ? v : "(null)"));
+    abort();
+  }
+}
+
+static void Free(char** ptr) {
+  if (*ptr) {
+    free(*ptr);
+    *ptr = NULL;
+  }
+}
+
+static void CheckGet(
+    rocksdb_t* db,
+    const rocksdb_readoptions_t* options,
+    const char* key,
+    const char* expected) {
+  char* err = NULL;
+  size_t val_len;
+  char* val;
+  val = rocksdb_get(db, options, key, strlen(key), &val_len, &err);
+  CheckNoError(err);
+  CheckEqual(expected, val, val_len);
+  Free(&val);
+}
+
+static void CheckIter(rocksdb_iterator_t* iter,
+                      const char* key, const char* val) {
+  size_t len;
+  const char* str;
+  str = rocksdb_iter_key(iter, &len);
+  CheckEqual(key, str, len);
+  str = rocksdb_iter_value(iter, &len);
+  CheckEqual(val, str, len);
+}
+
+// Callback from rocksdb_writebatch_iterate()
+static void CheckPut(void* ptr,
+                     const char* k, size_t klen,
+                     const char* v, size_t vlen) {
+  int* state = (int*) ptr;
+  CheckCondition(*state < 2);
+  switch (*state) {
+    case 0:
+      CheckEqual("bar", k, klen);
+      CheckEqual("b", v, vlen);
+      break;
+    case 1:
+      CheckEqual("box", k, klen);
+      CheckEqual("c", v, vlen);
+      break;
+  }
+  (*state)++;
+}
+
+// Callback from rocksdb_writebatch_iterate()
+static void CheckDel(void* ptr, const char* k, size_t klen) {
+  int* state = (int*) ptr;
+  CheckCondition(*state == 2);
+  CheckEqual("bar", k, klen);
+  (*state)++;
+}
+
+static void CmpDestroy(void* arg) { }
+
+static int CmpCompare(void* arg, const char* a, size_t alen,
+                      const char* b, size_t blen) {
+  int n = (alen < blen) ?
alen : blen; + int r = memcmp(a, b, n); + if (r == 0) { + if (alen < blen) r = -1; + else if (alen > blen) r = +1; + } + return r; +} + +static const char* CmpName(void* arg) { + return "foo"; +} + +// Custom filter policy +static unsigned char fake_filter_result = 1; +static void FilterDestroy(void* arg) { } +static const char* FilterName(void* arg) { + return "TestFilter"; +} +static char* FilterCreate( + void* arg, + const char* const* key_array, const size_t* key_length_array, + int num_keys, + size_t* filter_length) { + *filter_length = 4; + char* result = malloc(4); + memcpy(result, "fake", 4); + return result; +} +static unsigned char FilterKeyMatch( + void* arg, + const char* key, size_t length, + const char* filter, size_t filter_length) { + CheckCondition(filter_length == 4); + CheckCondition(memcmp(filter, "fake", 4) == 0); + return fake_filter_result; +} + +// Custom merge operator +static void MergeOperatorDestroy(void* arg) { } +static const char* MergeOperatorName(void* arg) { + return "TestMergeOperator"; +} +static char* MergeOperatorFullMerge( + void* arg, + const char* key, size_t key_length, + const char* existing_value, size_t existing_value_length, + const char* const* operands_list, const size_t* operands_list_length, + int num_operands, + unsigned char* success, size_t* new_value_length) { + *new_value_length = 4; + *success = 1; + char* result = malloc(4); + memcpy(result, "fake", 4); + return result; +} +static char* MergeOperatorPartialMerge( + void* arg, + const char* key, size_t key_length, + const char* const* operands_list, const size_t* operands_list_length, + int num_operands, + unsigned char* success, size_t* new_value_length) { + *new_value_length = 4; + *success = 1; + char* result = malloc(4); + memcpy(result, "fake", 4); + return result; +} + +int main(int argc, char** argv) { + rocksdb_t* db; + rocksdb_comparator_t* cmp; + rocksdb_cache_t* cache; + rocksdb_env_t* env; + rocksdb_options_t* options; + rocksdb_readoptions_t* roptions; + rocksdb_writeoptions_t* woptions; + char* err = NULL; + int run = -1; + + snprintf(dbname, sizeof(dbname), + "%s/rocksdb_c_test-%d", + GetTempDir(), + ((int) geteuid())); + + StartPhase("create_objects"); + cmp = rocksdb_comparator_create(NULL, CmpDestroy, CmpCompare, CmpName); + env = rocksdb_create_default_env(); + cache = rocksdb_cache_create_lru(100000); + + options = rocksdb_options_create(); + rocksdb_options_set_comparator(options, cmp); + rocksdb_options_set_error_if_exists(options, 1); + rocksdb_options_set_cache(options, cache); + rocksdb_options_set_env(options, env); + rocksdb_options_set_info_log(options, NULL); + rocksdb_options_set_write_buffer_size(options, 100000); + rocksdb_options_set_paranoid_checks(options, 1); + rocksdb_options_set_max_open_files(options, 10); + rocksdb_options_set_block_size(options, 1024); + rocksdb_options_set_block_restart_interval(options, 8); + rocksdb_options_set_compression(options, rocksdb_no_compression); + rocksdb_options_set_compression_options(options, -14, -1, 0); + int compression_levels[] = {rocksdb_no_compression, rocksdb_no_compression, + rocksdb_no_compression, rocksdb_no_compression}; + rocksdb_options_set_compression_per_level(options, compression_levels, 4); + + roptions = rocksdb_readoptions_create(); + rocksdb_readoptions_set_verify_checksums(roptions, 1); + rocksdb_readoptions_set_fill_cache(roptions, 0); + + woptions = rocksdb_writeoptions_create(); + rocksdb_writeoptions_set_sync(woptions, 1); + + StartPhase("destroy"); + rocksdb_destroy_db(options, dbname, 
&err); + Free(&err); + + StartPhase("open_error"); + db = rocksdb_open(options, dbname, &err); + CheckCondition(err != NULL); + Free(&err); + + StartPhase("open"); + rocksdb_options_set_create_if_missing(options, 1); + db = rocksdb_open(options, dbname, &err); + CheckNoError(err); + CheckGet(db, roptions, "foo", NULL); + + StartPhase("put"); + rocksdb_put(db, woptions, "foo", 3, "hello", 5, &err); + CheckNoError(err); + CheckGet(db, roptions, "foo", "hello"); + + StartPhase("compactall"); + rocksdb_compact_range(db, NULL, 0, NULL, 0); + CheckGet(db, roptions, "foo", "hello"); + + StartPhase("compactrange"); + rocksdb_compact_range(db, "a", 1, "z", 1); + CheckGet(db, roptions, "foo", "hello"); + + StartPhase("writebatch"); + { + rocksdb_writebatch_t* wb = rocksdb_writebatch_create(); + rocksdb_writebatch_put(wb, "foo", 3, "a", 1); + rocksdb_writebatch_clear(wb); + rocksdb_writebatch_put(wb, "bar", 3, "b", 1); + rocksdb_writebatch_put(wb, "box", 3, "c", 1); + rocksdb_writebatch_delete(wb, "bar", 3); + rocksdb_write(db, woptions, wb, &err); + CheckNoError(err); + CheckGet(db, roptions, "foo", "hello"); + CheckGet(db, roptions, "bar", NULL); + CheckGet(db, roptions, "box", "c"); + int pos = 0; + rocksdb_writebatch_iterate(wb, &pos, CheckPut, CheckDel); + CheckCondition(pos == 3); + rocksdb_writebatch_destroy(wb); + } + + StartPhase("iter"); + { + rocksdb_iterator_t* iter = rocksdb_create_iterator(db, roptions); + CheckCondition(!rocksdb_iter_valid(iter)); + rocksdb_iter_seek_to_first(iter); + CheckCondition(rocksdb_iter_valid(iter)); + CheckIter(iter, "box", "c"); + rocksdb_iter_next(iter); + CheckIter(iter, "foo", "hello"); + rocksdb_iter_prev(iter); + CheckIter(iter, "box", "c"); + rocksdb_iter_prev(iter); + CheckCondition(!rocksdb_iter_valid(iter)); + rocksdb_iter_seek_to_last(iter); + CheckIter(iter, "foo", "hello"); + rocksdb_iter_seek(iter, "b", 1); + CheckIter(iter, "box", "c"); + rocksdb_iter_get_error(iter, &err); + CheckNoError(err); + rocksdb_iter_destroy(iter); + } + + StartPhase("approximate_sizes"); + { + int i; + int n = 20000; + char keybuf[100]; + char valbuf[100]; + uint64_t sizes[2]; + const char* start[2] = { "a", "k00000000000000010000" }; + size_t start_len[2] = { 1, 21 }; + const char* limit[2] = { "k00000000000000010000", "z" }; + size_t limit_len[2] = { 21, 1 }; + rocksdb_writeoptions_set_sync(woptions, 0); + for (i = 0; i < n; i++) { + snprintf(keybuf, sizeof(keybuf), "k%020d", i); + snprintf(valbuf, sizeof(valbuf), "v%020d", i); + rocksdb_put(db, woptions, keybuf, strlen(keybuf), valbuf, strlen(valbuf), + &err); + CheckNoError(err); + } + rocksdb_approximate_sizes(db, 2, start, start_len, limit, limit_len, sizes); + CheckCondition(sizes[0] > 0); + CheckCondition(sizes[1] > 0); + } + + StartPhase("property"); + { + char* prop = rocksdb_property_value(db, "nosuchprop"); + CheckCondition(prop == NULL); + prop = rocksdb_property_value(db, "rocksdb.stats"); + CheckCondition(prop != NULL); + Free(&prop); + } + + StartPhase("snapshot"); + { + const rocksdb_snapshot_t* snap; + snap = rocksdb_create_snapshot(db); + rocksdb_delete(db, woptions, "foo", 3, &err); + CheckNoError(err); + rocksdb_readoptions_set_snapshot(roptions, snap); + CheckGet(db, roptions, "foo", "hello"); + rocksdb_readoptions_set_snapshot(roptions, NULL); + CheckGet(db, roptions, "foo", NULL); + rocksdb_release_snapshot(db, snap); + } + + StartPhase("repair"); + { + // If we do not compact here, then the lazy deletion of + // files (https://reviews.facebook.net/D6123) would leave + // around deleted files 
and the repair process will find
+    // those files and put them back into the database.
+    rocksdb_compact_range(db, NULL, 0, NULL, 0);
+    rocksdb_close(db);
+    rocksdb_options_set_create_if_missing(options, 0);
+    rocksdb_options_set_error_if_exists(options, 0);
+    rocksdb_repair_db(options, dbname, &err);
+    CheckNoError(err);
+    db = rocksdb_open(options, dbname, &err);
+    CheckNoError(err);
+    CheckGet(db, roptions, "foo", NULL);
+    CheckGet(db, roptions, "bar", NULL);
+    CheckGet(db, roptions, "box", "c");
+    rocksdb_options_set_create_if_missing(options, 1);
+    rocksdb_options_set_error_if_exists(options, 1);
+  }
+
+  StartPhase("filter");
+  for (run = 0; run < 2; run++) {
+    // First run uses custom filter, second run uses bloom filter
+    CheckNoError(err);
+    rocksdb_filterpolicy_t* policy;
+    if (run == 0) {
+      policy = rocksdb_filterpolicy_create(
+          NULL, FilterDestroy, FilterCreate, FilterKeyMatch, NULL, FilterName);
+    } else {
+      policy = rocksdb_filterpolicy_create_bloom(10);
+    }
+
+    // Create new database
+    rocksdb_close(db);
+    rocksdb_destroy_db(options, dbname, &err);
+    rocksdb_options_set_filter_policy(options, policy);
+    db = rocksdb_open(options, dbname, &err);
+    CheckNoError(err);
+    rocksdb_put(db, woptions, "foo", 3, "foovalue", 8, &err);
+    CheckNoError(err);
+    rocksdb_put(db, woptions, "bar", 3, "barvalue", 8, &err);
+    CheckNoError(err);
+    rocksdb_compact_range(db, NULL, 0, NULL, 0);
+
+    fake_filter_result = 1;
+    CheckGet(db, roptions, "foo", "foovalue");
+    CheckGet(db, roptions, "bar", "barvalue");
+    if (run == 0) {
+      // Must not find value when custom filter returns false
+      fake_filter_result = 0;
+      CheckGet(db, roptions, "foo", NULL);
+      CheckGet(db, roptions, "bar", NULL);
+      fake_filter_result = 1;
+
+      CheckGet(db, roptions, "foo", "foovalue");
+      CheckGet(db, roptions, "bar", "barvalue");
+    }
+    rocksdb_options_set_filter_policy(options, NULL);
+    rocksdb_filterpolicy_destroy(policy);
+  }
+
+  StartPhase("merge_operator");
+  {
+    rocksdb_mergeoperator_t* merge_operator;
+    merge_operator = rocksdb_mergeoperator_create(
+        NULL, MergeOperatorDestroy, MergeOperatorFullMerge,
+        MergeOperatorPartialMerge, NULL, MergeOperatorName);
+    // Create new database
+    rocksdb_close(db);
+    rocksdb_destroy_db(options, dbname, &err);
+    rocksdb_options_set_merge_operator(options, merge_operator);
+    db = rocksdb_open(options, dbname, &err);
+    CheckNoError(err);
+    rocksdb_put(db, woptions, "foo", 3, "foovalue", 8, &err);
+    CheckNoError(err);
+    CheckGet(db, roptions, "foo", "foovalue");
+    rocksdb_merge(db, woptions, "foo", 3, "barvalue", 8, &err);
+    CheckNoError(err);
+    CheckGet(db, roptions, "foo", "fake");
+
+    // Merge of a non-existing value
+    rocksdb_merge(db, woptions, "bar", 3, "barvalue", 8, &err);
+    CheckNoError(err);
+    CheckGet(db, roptions, "bar", "fake");
+  }
+
+  StartPhase("prefix");
+  {
+    // Create new database
+    rocksdb_close(db);
+    rocksdb_destroy_db(options, dbname, &err);
+
+    rocksdb_filterpolicy_t* policy = rocksdb_filterpolicy_create_bloom(10);
+    rocksdb_options_set_filter_policy(options, policy);
+    rocksdb_options_set_prefix_extractor(options, rocksdb_slicetransform_create_fixed_prefix(3));
+    rocksdb_options_set_hash_skip_list_rep(options, 50000, 4, 4);
+    rocksdb_options_set_plain_table_factory(options, 4, 10, 0.75, 16);
+
+    db = rocksdb_open(options, dbname, &err);
+    CheckNoError(err);
+
+    rocksdb_put(db, woptions, "foo1", 4, "foo", 3, &err);
+    CheckNoError(err);
+    rocksdb_put(db, woptions, "foo2", 4, "foo", 3, &err);
+    CheckNoError(err);
+    rocksdb_put(db, woptions, "foo3", 4, "foo", 3, &err);
+    CheckNoError(err);
+    rocksdb_put(db, woptions, "bar1", 4, "bar", 3, &err);
+    CheckNoError(err);
+    rocksdb_put(db, woptions, "bar2", 4, "bar", 3, &err);
+    CheckNoError(err);
+    rocksdb_put(db, woptions, "bar3", 4, "bar", 3, &err);
+    CheckNoError(err);
+
+    rocksdb_iterator_t* iter = rocksdb_create_iterator(db, roptions);
+    CheckCondition(!rocksdb_iter_valid(iter));
+
+    rocksdb_iter_seek(iter, "bar", 3);
+    rocksdb_iter_get_error(iter, &err);
+    CheckNoError(err);
+    CheckCondition(rocksdb_iter_valid(iter));
+
+    CheckIter(iter, "bar1", "bar");
+    rocksdb_iter_next(iter);
+    CheckIter(iter, "bar2", "bar");
+    rocksdb_iter_next(iter);
+    CheckIter(iter, "bar3", "bar");
+    rocksdb_iter_get_error(iter, &err);
+    CheckNoError(err);
+    rocksdb_iter_destroy(iter);
+    rocksdb_filterpolicy_destroy(policy);
+  }
+
+  StartPhase("cleanup");
+  rocksdb_close(db);
+  rocksdb_options_destroy(options);
+  rocksdb_readoptions_destroy(roptions);
+  rocksdb_writeoptions_destroy(woptions);
+  rocksdb_cache_destroy(cache);
+  rocksdb_comparator_destroy(cmp);
+  rocksdb_env_destroy(env);
+
+  fprintf(stderr, "PASS\n");
+  return 0;
+}
diff --git a/db/column_family.cc b/db/column_family.cc
new file mode 100644
index 0000000000..45a3e9a82a
--- /dev/null
+++ b/db/column_family.cc
@@ -0,0 +1,604 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/column_family.h"
+
+#include <vector>
+#include <string>
+#include <algorithm>
+#include <limits>
+
+#include "db/db_impl.h"
+#include "db/version_set.h"
+#include "db/internal_stats.h"
+#include "db/compaction_picker.h"
+#include "db/table_properties_collector.h"
+#include "util/autovector.h"
+#include "util/hash_skiplist_rep.h"
+
+namespace rocksdb {
+
+ColumnFamilyHandleImpl::ColumnFamilyHandleImpl(ColumnFamilyData* cfd,
+                                               DBImpl* db, port::Mutex* mutex)
+    : cfd_(cfd), db_(db), mutex_(mutex) {
+  if (cfd_ != nullptr) {
+    cfd_->Ref();
+  }
+}
+
+ColumnFamilyHandleImpl::~ColumnFamilyHandleImpl() {
+  if (cfd_ != nullptr) {
+    DBImpl::DeletionState deletion_state;
+    mutex_->Lock();
+    if (cfd_->Unref()) {
+      delete cfd_;
+    }
+    db_->FindObsoleteFiles(deletion_state, false, true);
+    mutex_->Unlock();
+    if (deletion_state.HaveSomethingToDelete()) {
+      db_->PurgeObsoleteFiles(deletion_state);
+    }
+  }
+}
+
+uint32_t ColumnFamilyHandleImpl::GetID() const { return cfd()->GetID(); }
+
+namespace {
+// Fix user-supplied options to be reasonable
+template <class T, class V>
+static void ClipToRange(T* ptr, V minvalue, V maxvalue) {
+  if (static_cast<V>(*ptr) > maxvalue) *ptr = maxvalue;
+  if (static_cast<V>(*ptr) < minvalue) *ptr = minvalue;
+}
+}  // anonymous namespace
+
+ColumnFamilyOptions SanitizeOptions(const InternalKeyComparator* icmp,
+                                    const InternalFilterPolicy* ipolicy,
+                                    const ColumnFamilyOptions& src) {
+  ColumnFamilyOptions result = src;
+  result.comparator = icmp;
+  result.filter_policy = (src.filter_policy != nullptr) ? ipolicy : nullptr;
+#ifdef OS_MACOSX
+  // TODO(icanadi) make write_buffer_size uint64_t instead of size_t
+  ClipToRange(&result.write_buffer_size, ((size_t)64) << 10, ((size_t)1) << 30);
+#else
+  ClipToRange(&result.write_buffer_size,
+              ((size_t)64) << 10, ((size_t)64) << 30);
+#endif
+  // if user sets arena_block_size, we trust the user to use this value.
+  // Otherwise, calculate a proper value from write_buffer_size.
+  if (result.arena_block_size <= 0) {
+    result.arena_block_size = result.write_buffer_size / 10;
+  }
+  result.min_write_buffer_number_to_merge =
+      std::min(result.min_write_buffer_number_to_merge,
+               result.max_write_buffer_number - 1);
+  if (result.block_cache == nullptr && !result.no_block_cache) {
+    result.block_cache = NewLRUCache(8 << 20);
+  }
+  result.compression_per_level = src.compression_per_level;
+  if (result.block_size_deviation < 0 || result.block_size_deviation > 100) {
+    result.block_size_deviation = 0;
+  }
+  if (result.max_mem_compaction_level >= result.num_levels) {
+    result.max_mem_compaction_level = result.num_levels - 1;
+  }
+  if (result.soft_rate_limit > result.hard_rate_limit) {
+    result.soft_rate_limit = result.hard_rate_limit;
+  }
+  if (!result.prefix_extractor) {
+    assert(result.memtable_factory);
+    Slice name = result.memtable_factory->Name();
+    if (name.compare("HashSkipListRepFactory") == 0 ||
+        name.compare("HashLinkListRepFactory") == 0) {
+      result.memtable_factory = std::make_shared<SkipListFactory>();
+    }
+  }
+
+  // -- Sanitize the table properties collector
+  // All user defined properties collectors will be wrapped by
+  // UserKeyTablePropertiesCollector since for them they only have the
+  // knowledge of the user keys; internal keys are invisible to them.
+  auto& collector_factories = result.table_properties_collector_factories;
+  for (size_t i = 0; i < result.table_properties_collector_factories.size();
+       ++i) {
+    assert(collector_factories[i]);
+    collector_factories[i] =
+        std::make_shared<UserKeyTablePropertiesCollectorFactory>(
+            collector_factories[i]);
+  }
+  // Add collector to collect internal key statistics
+  collector_factories.push_back(
+      std::make_shared<InternalKeyPropertiesCollectorFactory>());
+
+  if (result.compaction_style == kCompactionStyleFIFO) {
+    result.num_levels = 1;
+    // since we delete level0 files in FIFO compaction when there are too many
+    // of them, these options don't really mean anything
+    result.level0_file_num_compaction_trigger =
+        std::numeric_limits<int>::max();
+    result.level0_slowdown_writes_trigger = std::numeric_limits<int>::max();
+    result.level0_stop_writes_trigger = std::numeric_limits<int>::max();
+  }
+
+  return result;
+}
+
+int SuperVersion::dummy = 0;
+void* const SuperVersion::kSVInUse = &SuperVersion::dummy;
+void* const SuperVersion::kSVObsolete = nullptr;
+
+SuperVersion::~SuperVersion() {
+  for (auto td : to_delete) {
+    delete td;
+  }
+}
+
+SuperVersion* SuperVersion::Ref() {
+  refs.fetch_add(1, std::memory_order_relaxed);
+  return this;
+}
+
+bool SuperVersion::Unref() {
+  // fetch_sub returns the previous value of ref
+  uint32_t previous_refs = refs.fetch_sub(1, std::memory_order_relaxed);
+  assert(previous_refs > 0);
+  return previous_refs == 1;
+}
+
+void SuperVersion::Cleanup() {
+  assert(refs.load(std::memory_order_relaxed) == 0);
+  imm->Unref(&to_delete);
+  MemTable* m = mem->Unref();
+  if (m != nullptr) {
+    to_delete.push_back(m);
+  }
+  current->Unref();
+}
+
+void SuperVersion::Init(MemTable* new_mem, MemTableListVersion* new_imm,
+                        Version* new_current) {
+  mem = new_mem;
+  imm = new_imm;
+  current = new_current;
+  mem->Ref();
+  imm->Ref();
+  current->Ref();
+  refs.store(1, std::memory_order_relaxed);
+}
+
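+// SuperVersion lifecycle in brief: readers take a reference with Ref()
+// (often through the thread-local cache handled below), while writers swap
+// in a replacement via InstallSuperVersion(). Whoever drops the last
+// reference calls Cleanup() with the DB mutex held and deletes the object
+// after releasing the mutex.
+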
+namespace {
+void SuperVersionUnrefHandle(void* ptr) {
+  // UnrefHandle is called when a thread exits or a ThreadLocalPtr gets
+  // destroyed. When the former happens, the thread shouldn't see kSVInUse.
+  // When the latter happens, we are in ~ColumnFamilyData(), so no get should
+  // happen as well.
+  SuperVersion* sv = static_cast<SuperVersion*>(ptr);
+  if (sv->Unref()) {
+    sv->db_mutex->Lock();
+    sv->Cleanup();
+    sv->db_mutex->Unlock();
+    delete sv;
+  }
+}
+}  // anonymous namespace
+
+ColumnFamilyData::ColumnFamilyData(const std::string& dbname, uint32_t id,
+                                   const std::string& name,
+                                   Version* dummy_versions, Cache* table_cache,
+                                   const ColumnFamilyOptions& options,
+                                   const DBOptions* db_options,
+                                   const EnvOptions& storage_options,
+                                   ColumnFamilySet* column_family_set)
+    : id_(id),
+      name_(name),
+      dummy_versions_(dummy_versions),
+      current_(nullptr),
+      refs_(0),
+      dropped_(false),
+      internal_comparator_(options.comparator),
+      internal_filter_policy_(options.filter_policy),
+      options_(*db_options, SanitizeOptions(&internal_comparator_,
+                                            &internal_filter_policy_, options)),
+      mem_(nullptr),
+      imm_(options_.min_write_buffer_number_to_merge),
+      super_version_(nullptr),
+      super_version_number_(0),
+      local_sv_(new ThreadLocalPtr(&SuperVersionUnrefHandle)),
+      next_(nullptr),
+      prev_(nullptr),
+      log_number_(0),
+      need_slowdown_for_num_level0_files_(false),
+      column_family_set_(column_family_set) {
+  Ref();
+
+  // if dummy_versions is nullptr, then this is a dummy column family.
+  if (dummy_versions != nullptr) {
+    internal_stats_.reset(new InternalStats(
+        options_.num_levels, db_options->env, db_options->statistics.get()));
+    table_cache_.reset(
+        new TableCache(dbname, &options_, storage_options, table_cache));
+    if (options_.compaction_style == kCompactionStyleUniversal) {
+      compaction_picker_.reset(
+          new UniversalCompactionPicker(&options_, &internal_comparator_));
+    } else if (options_.compaction_style == kCompactionStyleLevel) {
+      compaction_picker_.reset(
+          new LevelCompactionPicker(&options_, &internal_comparator_));
+    } else {
+      assert(options_.compaction_style == kCompactionStyleFIFO);
+      compaction_picker_.reset(
+          new FIFOCompactionPicker(&options_, &internal_comparator_));
+    }
+
+    Log(options_.info_log, "Options for column family \"%s\":\n",
+        name.c_str());
+    const ColumnFamilyOptions* cf_options = &options_;
+    cf_options->Dump(options_.info_log.get());
+  }
+}
+
+// DB mutex held
+ColumnFamilyData::~ColumnFamilyData() {
+  assert(refs_ == 0);
+  // remove from linked list
+  auto prev = prev_;
+  auto next = next_;
+  prev->next_ = next;
+  next->prev_ = prev;
+
+  // it's nullptr for dummy CFD
+  if (column_family_set_ != nullptr) {
+    // remove from column_family_set
+    column_family_set_->RemoveColumnFamily(this);
+  }
+
+  if (current_ != nullptr) {
+    current_->Unref();
+  }
+
+  if (super_version_ != nullptr) {
+    // Release SuperVersion reference kept in ThreadLocalPtr.
+    // This must be done outside of mutex_ since unref handler can lock mutex.
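+    // (local_sv_.reset() runs SuperVersionUnrefHandle for every thread's
+    // cached entry, and that handler takes db_mutex when it drops the last
+    // reference; hence the Unlock()/Lock() bracket around the reset.)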
+    super_version_->db_mutex->Unlock();
+    local_sv_.reset();
+    super_version_->db_mutex->Lock();
+
+    bool is_last_reference __attribute__((unused));
+    is_last_reference = super_version_->Unref();
+    assert(is_last_reference);
+    super_version_->Cleanup();
+    delete super_version_;
+    super_version_ = nullptr;
+  }
+
+  if (dummy_versions_ != nullptr) {
+    // List must be empty
+    assert(dummy_versions_->next_ == dummy_versions_);
+    delete dummy_versions_;
+  }
+
+  if (mem_ != nullptr) {
+    delete mem_->Unref();
+  }
+  autovector<MemTable*> to_delete;
+  imm_.current()->Unref(&to_delete);
+  for (MemTable* m : to_delete) {
+    delete m;
+  }
+}
+
+const EnvOptions* ColumnFamilyData::soptions() const {
+  return &(column_family_set_->storage_options_);
+}
+
+void ColumnFamilyData::SetCurrent(Version* current) {
+  current_ = current;
+  need_slowdown_for_num_level0_files_ =
+      (options_.level0_slowdown_writes_trigger >= 0 &&
+       current_->NumLevelFiles(0) >= options_.level0_slowdown_writes_trigger);
+}
+
+void ColumnFamilyData::CreateNewMemtable() {
+  assert(current_ != nullptr);
+  if (mem_ != nullptr) {
+    delete mem_->Unref();
+  }
+  mem_ = new MemTable(internal_comparator_, options_);
+  mem_->Ref();
+}
+
+Compaction* ColumnFamilyData::PickCompaction(LogBuffer* log_buffer) {
+  return compaction_picker_->PickCompaction(current_, log_buffer);
+}
+
+Compaction* ColumnFamilyData::CompactRange(int input_level, int output_level,
+                                           const InternalKey* begin,
+                                           const InternalKey* end,
+                                           InternalKey** compaction_end) {
+  return compaction_picker_->CompactRange(current_, input_level, output_level,
+                                          begin, end, compaction_end);
+}
+
+SuperVersion* ColumnFamilyData::GetReferencedSuperVersion(
+    port::Mutex* db_mutex) {
+  SuperVersion* sv = nullptr;
+  if (LIKELY(column_family_set_->db_options_->allow_thread_local)) {
+    sv = GetThreadLocalSuperVersion(db_mutex);
+    sv->Ref();
+    if (!ReturnThreadLocalSuperVersion(sv)) {
+      sv->Unref();
+    }
+  } else {
+    db_mutex->Lock();
+    sv = super_version_->Ref();
+    db_mutex->Unlock();
+  }
+  return sv;
+}
+
+SuperVersion* ColumnFamilyData::GetThreadLocalSuperVersion(
+    port::Mutex* db_mutex) {
+  SuperVersion* sv = nullptr;
+  // The SuperVersion is cached in thread local storage to avoid acquiring
+  // mutex when SuperVersion does not change since the last use. When a new
+  // SuperVersion is installed, the compaction or flush thread cleans up
+  // cached SuperVersion in all existing thread local storage. To avoid
+  // acquiring mutex for this operation, we use atomic Swap() on the thread
+  // local pointer to guarantee exclusive access. If the thread local pointer
+  // is being used while a new SuperVersion is installed, the cached
+  // SuperVersion can become stale. In that case, the background thread would
+  // have swapped in kSVObsolete. We re-check the value when returning the
+  // SuperVersion back to thread local storage, with an atomic compare and
+  // swap. The SuperVersion will need to be released if detected to be stale.
+  void* ptr = local_sv_->Swap(SuperVersion::kSVInUse);
+  // Invariant:
+  // (1) Scrape (always) installs kSVObsolete in ThreadLocal storage
+  // (2) the Swap above (always) installs kSVInUse, ThreadLocal storage
+  // should only keep kSVInUse before ReturnThreadLocalSuperVersion call
+  // (if no Scrape happens).
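+  // At this point ptr is therefore either a valid cached SuperVersion* or
+  // kSVObsolete; kSVInUse is only ever observed by the owning thread between
+  // the Swap() above and the CompareAndSwap() that returns it.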
+  assert(ptr != SuperVersion::kSVInUse);
+  sv = static_cast<SuperVersion*>(ptr);
+  if (sv == SuperVersion::kSVObsolete ||
+      sv->version_number != super_version_number_.load()) {
+    RecordTick(options_.statistics.get(), NUMBER_SUPERVERSION_ACQUIRES);
+    SuperVersion* sv_to_delete = nullptr;
+
+    if (sv && sv->Unref()) {
+      RecordTick(options_.statistics.get(), NUMBER_SUPERVERSION_CLEANUPS);
+      db_mutex->Lock();
+      // NOTE: underlying resources held by superversion (sst files) might
+      // not be released until the next background job.
+      sv->Cleanup();
+      sv_to_delete = sv;
+    } else {
+      db_mutex->Lock();
+    }
+    sv = super_version_->Ref();
+    db_mutex->Unlock();
+
+    delete sv_to_delete;
+  }
+  assert(sv != nullptr);
+  return sv;
+}
+
+bool ColumnFamilyData::ReturnThreadLocalSuperVersion(SuperVersion* sv) {
+  assert(sv != nullptr);
+  // Put the SuperVersion back
+  void* expected = SuperVersion::kSVInUse;
+  if (local_sv_->CompareAndSwap(static_cast<void*>(sv), expected)) {
+    // When we see kSVInUse in the ThreadLocal, we are sure ThreadLocal
+    // storage has not been altered and no Scrape has happened. The
+    // SuperVersion is still current.
+    return true;
+  } else {
+    // ThreadLocal scrape happened in the process of this GetImpl call (after
+    // thread local Swap() at the beginning and before CompareAndSwap()).
+    // This means the SuperVersion it holds is obsolete.
+    assert(expected == SuperVersion::kSVObsolete);
+  }
+  return false;
+}
+
+SuperVersion* ColumnFamilyData::InstallSuperVersion(
+    SuperVersion* new_superversion, port::Mutex* db_mutex) {
+  new_superversion->db_mutex = db_mutex;
+  new_superversion->Init(mem_, imm_.current(), current_);
+  SuperVersion* old_superversion = super_version_;
+  super_version_ = new_superversion;
+  ++super_version_number_;
+  super_version_->version_number = super_version_number_;
+  // Reset SuperVersions cached in thread local storage
+  if (column_family_set_->db_options_->allow_thread_local) {
+    ResetThreadLocalSuperVersions();
+  }
+  if (old_superversion != nullptr && old_superversion->Unref()) {
+    old_superversion->Cleanup();
+    return old_superversion;  // will let caller delete outside of mutex
+  }
+  return nullptr;
+}
+
+void ColumnFamilyData::ResetThreadLocalSuperVersions() {
+  autovector<void*> sv_ptrs;
+  local_sv_->Scrape(&sv_ptrs, SuperVersion::kSVObsolete);
+  for (auto ptr : sv_ptrs) {
+    assert(ptr);
+    if (ptr == SuperVersion::kSVInUse) {
+      continue;
+    }
+    auto sv = static_cast<SuperVersion*>(ptr);
+    if (sv->Unref()) {
+      sv->Cleanup();
+      delete sv;
+    }
+  }
+}
+
+ColumnFamilySet::ColumnFamilySet(const std::string& dbname,
+                                 const DBOptions* db_options,
+                                 const EnvOptions& storage_options,
+                                 Cache* table_cache)
+    : max_column_family_(0),
+      dummy_cfd_(new ColumnFamilyData(dbname, 0, "", nullptr, nullptr,
+                                      ColumnFamilyOptions(), db_options,
+                                      storage_options_, nullptr)),
+      default_cfd_cache_(nullptr),
+      db_name_(dbname),
+      db_options_(db_options),
+      storage_options_(storage_options),
+      table_cache_(table_cache),
+      spin_lock_(ATOMIC_FLAG_INIT) {
+  // initialize linked list
+  dummy_cfd_->prev_ = dummy_cfd_;
+  dummy_cfd_->next_ = dummy_cfd_;
+}
+
+ColumnFamilySet::~ColumnFamilySet() {
+  while (column_family_data_.size() > 0) {
+    // cfd destructor will delete itself from column_family_data_
+    auto cfd = column_family_data_.begin()->second;
+    cfd->Unref();
+    delete cfd;
+  }
+  dummy_cfd_->Unref();
+  delete dummy_cfd_;
+}
+
+ColumnFamilyData* ColumnFamilySet::GetDefault() const {
+  assert(default_cfd_cache_ != nullptr);
+  return default_cfd_cache_;
+}
+
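+// Lookups go through two maps: column_families_ maps a name to an id, and
+// column_family_data_ maps that id to the ColumnFamilyData*; a by-name
+// lookup therefore does both hops (see the overload below). Writers update
+// the maps under the spin lock; readers rely on the DB mutex or lock
+// explicitly, as ColumnFamilyMemTablesImpl::Seek() does.
+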
+ColumnFamilyData* ColumnFamilySet::GetColumnFamily(uint32_t id) const {
+  auto cfd_iter = column_family_data_.find(id);
+  if (cfd_iter != column_family_data_.end()) {
+    return cfd_iter->second;
+  } else {
+    return nullptr;
+  }
+}
+
+ColumnFamilyData* ColumnFamilySet::GetColumnFamily(const std::string& name)
+    const {
+  auto cfd_iter = column_families_.find(name);
+  if (cfd_iter != column_families_.end()) {
+    auto cfd = GetColumnFamily(cfd_iter->second);
+    assert(cfd != nullptr);
+    return cfd;
+  } else {
+    return nullptr;
+  }
+}
+
+uint32_t ColumnFamilySet::GetNextColumnFamilyID() {
+  return ++max_column_family_;
+}
+
+uint32_t ColumnFamilySet::GetMaxColumnFamily() { return max_column_family_; }
+
+void ColumnFamilySet::UpdateMaxColumnFamily(uint32_t new_max_column_family) {
+  max_column_family_ = std::max(new_max_column_family, max_column_family_);
+}
+
+size_t ColumnFamilySet::NumberOfColumnFamilies() const {
+  return column_families_.size();
+}
+
+// under a DB mutex
+ColumnFamilyData* ColumnFamilySet::CreateColumnFamily(
+    const std::string& name, uint32_t id, Version* dummy_versions,
+    const ColumnFamilyOptions& options) {
+  assert(column_families_.find(name) == column_families_.end());
+  ColumnFamilyData* new_cfd =
+      new ColumnFamilyData(db_name_, id, name, dummy_versions, table_cache_,
+                           options, db_options_, storage_options_, this);
+  Lock();
+  column_families_.insert({name, id});
+  column_family_data_.insert({id, new_cfd});
+  Unlock();
+  max_column_family_ = std::max(max_column_family_, id);
+  // add to linked list
+  new_cfd->next_ = dummy_cfd_;
+  auto prev = dummy_cfd_->prev_;
+  new_cfd->prev_ = prev;
+  prev->next_ = new_cfd;
+  dummy_cfd_->prev_ = new_cfd;
+  if (id == 0) {
+    default_cfd_cache_ = new_cfd;
+  }
+  return new_cfd;
+}
+
+void ColumnFamilySet::Lock() {
+  // spin lock
+  while (spin_lock_.test_and_set(std::memory_order_acquire)) {
+  }
+}
+
+void ColumnFamilySet::Unlock() { spin_lock_.clear(std::memory_order_release); }
+
+// REQUIRES: DB mutex held
+void ColumnFamilySet::FreeDeadColumnFamilies() {
+  autovector<ColumnFamilyData*> to_delete;
+  for (auto cfd = dummy_cfd_->next_; cfd != dummy_cfd_; cfd = cfd->next_) {
+    if (cfd->refs_ == 0) {
+      to_delete.push_back(cfd);
+    }
+  }
+  for (auto cfd : to_delete) {
+    // this is very rare, so it's not a problem that we do it under a mutex
+    delete cfd;
+  }
+}
+
+// under a DB mutex
+void ColumnFamilySet::RemoveColumnFamily(ColumnFamilyData* cfd) {
+  auto cfd_iter = column_family_data_.find(cfd->GetID());
+  assert(cfd_iter != column_family_data_.end());
+  Lock();
+  column_family_data_.erase(cfd_iter);
+  column_families_.erase(cfd->GetName());
+  Unlock();
+}
+
+bool ColumnFamilyMemTablesImpl::Seek(uint32_t column_family_id) {
+  if (column_family_id == 0) {
+    // optimization for common case
+    current_ = column_family_set_->GetDefault();
+  } else {
+    // maybe outside of db mutex, should lock
+    column_family_set_->Lock();
+    current_ = column_family_set_->GetColumnFamily(column_family_id);
+    column_family_set_->Unlock();
+  }
+  handle_.SetCFD(current_);
+  return current_ != nullptr;
+}
+
+uint64_t ColumnFamilyMemTablesImpl::GetLogNumber() const {
+  assert(current_ != nullptr);
+  return current_->GetLogNumber();
+}
+
+MemTable* ColumnFamilyMemTablesImpl::GetMemTable() const {
+  assert(current_ != nullptr);
+  return current_->mem();
+}
+
+const Options* ColumnFamilyMemTablesImpl::GetOptions() const {
+  assert(current_ != nullptr);
+  return current_->options();
+}
+
+ColumnFamilyHandle* ColumnFamilyMemTablesImpl::GetColumnFamilyHandle() {
+  assert(current_ != nullptr);
+  return &handle_;
+}
+
+}  // namespace rocksdb
--git a/db/column_family.h b/db/column_family.h
new file mode 100644
index 0000000000..991bb01123
--- /dev/null
+++ b/db/column_family.h
@@ -0,0 +1,419 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <unordered_map>
+#include <string>
+#include <vector>
+#include <atomic>
+
+#include "rocksdb/options.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "db/memtable_list.h"
+#include "db/write_batch_internal.h"
+#include "db/table_cache.h"
+#include "util/thread_local.h"
+
+namespace rocksdb {
+
+class Version;
+class VersionSet;
+class MemTable;
+class MemTableListVersion;
+class CompactionPicker;
+class Compaction;
+class InternalKey;
+class InternalStats;
+class ColumnFamilyData;
+class DBImpl;
+class LogBuffer;
+
+// ColumnFamilyHandleImpl is the class that clients use to access different
+// column families. It has a non-trivial destructor, which gets called when
+// the client is done using the column family
+class ColumnFamilyHandleImpl : public ColumnFamilyHandle {
+ public:
+  // create while holding the mutex
+  ColumnFamilyHandleImpl(ColumnFamilyData* cfd, DBImpl* db, port::Mutex* mutex);
+  // destroy without mutex
+  virtual ~ColumnFamilyHandleImpl();
+  virtual ColumnFamilyData* cfd() const { return cfd_; }
+
+  virtual uint32_t GetID() const;
+
+ private:
+  ColumnFamilyData* cfd_;
+  DBImpl* db_;
+  port::Mutex* mutex_;
+};
+
+// Does not ref-count ColumnFamilyData
+// We use this dummy ColumnFamilyHandleImpl because sometimes MemTableInserter
+// calls DBImpl methods. When this happens, MemTableInserter needs access to a
+// ColumnFamilyHandle (just as a client would). In that case, we feed
+// MemTableInserter a dummy ColumnFamilyHandle and enable it to call DBImpl
+// methods
+class ColumnFamilyHandleInternal : public ColumnFamilyHandleImpl {
+ public:
+  ColumnFamilyHandleInternal()
+      : ColumnFamilyHandleImpl(nullptr, nullptr, nullptr) {}
+
+  void SetCFD(ColumnFamilyData* cfd) { internal_cfd_ = cfd; }
+  virtual ColumnFamilyData* cfd() const override { return internal_cfd_; }
+
+ private:
+  ColumnFamilyData* internal_cfd_;
+};
+
+// holds references to memtable, all immutable memtables and version
+struct SuperVersion {
+  MemTable* mem;
+  MemTableListVersion* imm;
+  Version* current;
+  std::atomic<uint32_t> refs;
+  // We need to_delete because during Cleanup(), imm->Unref() returns
+  // all memtables that we need to free through this vector. We then
+  // delete all those memtables outside of the mutex, during destruction
+  autovector<MemTable*> to_delete;
+  // Version number of the current SuperVersion
+  uint64_t version_number;
+  port::Mutex* db_mutex;
+
+  // should be called outside the mutex
+  SuperVersion() = default;
+  ~SuperVersion();
+  SuperVersion* Ref();
+
+  bool Unref();
+
+  // call these two methods with db mutex held
+  // Cleanup unrefs mem, imm and current. Also, it stores all memtables
+  // that need to be deleted in the to_delete vector. Unreferencing those
+  // objects needs to be done under the mutex
+  void Cleanup();
+  void Init(MemTable* new_mem, MemTableListVersion* new_imm,
+            Version* new_current);
+
+  // The value of dummy is not actually used. kSVInUse takes its address as a
+  // mark in the thread local storage to indicate the SuperVersion is in use
+  // by a thread. This way, the value of kSVInUse is guaranteed to have no
+  // conflict with any SuperVersion object address, and is portable across
+  // different platforms.
+  static int dummy;
+  static void* const kSVInUse;
+  static void* const kSVObsolete;
+};
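+
+// (Illustrative sketch, not part of the original patch: one plausible
+// release path for a SuperVersion that follows the contract above --
+// Unref() can happen anywhere, Cleanup() requires the DB mutex, and the
+// deletion that frees the memtables collected in to_delete happens outside
+// of the mutex. The names "sv" and "db_mutex" are assumed here.)
+//
+//   if (sv->Unref()) {
+//     db_mutex->Lock();
+//     sv->Cleanup();   // unrefs mem, imm and current; fills to_delete
+//     db_mutex->Unlock();
+//     delete sv;       // ~SuperVersion() frees the collected memtables
+//   }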
+
+extern ColumnFamilyOptions SanitizeOptions(const InternalKeyComparator* icmp,
+                                           const InternalFilterPolicy* ipolicy,
+                                           const ColumnFamilyOptions& src);
+
+class ColumnFamilySet;
+
+// This class keeps all the data that a column family needs. It's mostly dumb
+// and used just to provide access to metadata.
+// Most methods require the DB mutex to be held, unless otherwise noted
+class ColumnFamilyData {
+ public:
+  ~ColumnFamilyData();
+
+  // thread-safe
+  uint32_t GetID() const { return id_; }
+  // thread-safe
+  const std::string& GetName() const { return name_; }
+
+  void Ref() { ++refs_; }
+  // Will just decrease the reference count, but will not delete the object.
+  // Returns true if the ref count was decreased to zero. In that case, it
+  // can be deleted by the caller immediately, or later, by calling
+  // FreeDeadColumnFamilies()
+  bool Unref() {
+    assert(refs_ > 0);
+    return --refs_ == 0;
+  }
+
+  // This can only be called from single-threaded VersionSet::LogAndApply().
+  // After dropping a column family, no other operation on that column family
+  // will be executed. All the files and memory will, however, be kept around
+  // until the client drops the column family handle. That way, the client
+  // can still access data from the dropped column family.
+  // A column family can be dropped and still be alive. In that state:
+  // *) The column family is not included in the iteration.
+  // *) Compaction and flush are not executed on the dropped column family.
+  // *) Clients can continue writing to and reading from the column family.
+  //    However, all writes stay in the current memtable.
+  // When the dropped column family is unreferenced, then we:
+  // *) delete all memory associated with that column family
+  // *) delete all the files associated with that column family
+  void SetDropped() {
+    // can't drop default CF
+    assert(id_ != 0);
+    dropped_ = true;
+  }
+  bool IsDropped() const { return dropped_; }
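+
+  // (Illustrative sketch, not part of the original patch: the drop lifecycle
+  // described above, as seen through the public API, with an assumed client
+  // handle "h":
+  //   db->DropColumnFamily(h);                 // marks the CFD as dropped
+  //   db->Get(ReadOptions(), h, key, &value);  // reads are still allowed
+  //   delete h;  // last reference gone -> memory and files are deleted)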
+
+  // thread-safe
+  int NumberLevels() const { return options_.num_levels; }
+
+  void SetLogNumber(uint64_t log_number) { log_number_ = log_number; }
+  uint64_t GetLogNumber() const { return log_number_; }
+
+  // thread-safe
+  const Options* options() const { return &options_; }
+  const EnvOptions* soptions() const;
+
+  InternalStats* internal_stats() { return internal_stats_.get(); }
+
+  MemTableList* imm() { return &imm_; }
+  MemTable* mem() { return mem_; }
+  Version* current() { return current_; }
+  Version* dummy_versions() { return dummy_versions_; }
+  void SetMemtable(MemTable* new_mem) { mem_ = new_mem; }
+  void SetCurrent(Version* current);
+  void CreateNewMemtable();
+
+  TableCache* table_cache() const { return table_cache_.get(); }
+
+  // See documentation in compaction_picker.h
+  Compaction* PickCompaction(LogBuffer* log_buffer);
+  Compaction* CompactRange(int input_level, int output_level,
+                           const InternalKey* begin, const InternalKey* end,
+                           InternalKey** compaction_end);
+
+  CompactionPicker* compaction_picker() { return compaction_picker_.get(); }
+  // thread-safe
+  const Comparator* user_comparator() const {
+    return internal_comparator_.user_comparator();
+  }
+  // thread-safe
+  const InternalKeyComparator& internal_comparator() const {
+    return internal_comparator_;
+  }
+
+  SuperVersion* GetSuperVersion() { return super_version_; }
+  // thread-safe
+  // Returns an already-referenced SuperVersion that can be used safely.
+  SuperVersion* GetReferencedSuperVersion(port::Mutex* db_mutex);
+  // thread-safe
+  // Get the SuperVersion stored in thread local storage. If it does not
+  // exist, get a reference from the current SuperVersion.
+  SuperVersion* GetThreadLocalSuperVersion(port::Mutex* db_mutex);
+  // Try to return the SuperVersion back to thread local storage. Returns
+  // true on success and false on failure. It fails when the thread local
+  // storage contains anything other than the SuperVersion::kSVInUse flag.
+  bool ReturnThreadLocalSuperVersion(SuperVersion* sv);
+  // thread-safe
+  uint64_t GetSuperVersionNumber() const {
+    return super_version_number_.load();
+  }
+  // Returns a pointer to the previous SuperVersion if its reference count
+  // dropped to zero and it needs deletion, or nullptr if not.
+  // Takes a pointer to an allocated SuperVersion as an argument, to enable
+  // callers to allocate the SuperVersion outside of the mutex.
+  SuperVersion* InstallSuperVersion(SuperVersion* new_superversion,
+                                    port::Mutex* db_mutex);
+
+  void ResetThreadLocalSuperVersions();
+
+  // A flag indicating whether writes need to slow down because there are
+  // too many level-0 files.
+  bool NeedSlowdownForNumLevel0Files() const {
+    return need_slowdown_for_num_level0_files_;
+  }
+
+ private:
+  friend class ColumnFamilySet;
+  ColumnFamilyData(const std::string& dbname, uint32_t id,
+                   const std::string& name, Version* dummy_versions,
+                   Cache* table_cache, const ColumnFamilyOptions& options,
+                   const DBOptions* db_options,
+                   const EnvOptions& storage_options,
+                   ColumnFamilySet* column_family_set);
+
+  uint32_t id_;
+  const std::string name_;
+  Version* dummy_versions_;  // Head of circular doubly-linked list of versions.
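+  // (Clarifying note, not part of the original patch: dummy_versions_ acts
+  // as a sentinel, so a full traversal of the version list would look like
+  //   for (Version* v = dummy_versions_->next_; v != dummy_versions_;
+  //        v = v->next_) { ... }
+  // assuming Version exposes next_/prev_ links analogous to the
+  // next_/prev_ pointers on ColumnFamilyData below.)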
+  Version* current_;         // == dummy_versions->prev_
+
+  int refs_;                 // outstanding references to ColumnFamilyData
+  bool dropped_;             // true if client dropped it
+
+  const InternalKeyComparator internal_comparator_;
+  const InternalFilterPolicy internal_filter_policy_;
+
+  Options const options_;
+
+  std::unique_ptr<TableCache> table_cache_;
+
+  std::unique_ptr<InternalStats> internal_stats_;
+
+  MemTable* mem_;
+  MemTableList imm_;
+  SuperVersion* super_version_;
+
+  // An ordinal representing the current SuperVersion. Updated by
+  // InstallSuperVersion(), i.e. incremented every time super_version_
+  // changes.
+  std::atomic<uint64_t> super_version_number_;
+
+  // Thread's local copy of SuperVersion pointer
+  // This needs to be destructed before mutex_
+  std::unique_ptr<ThreadLocalPtr> local_sv_;
+
+  // pointers for a circular linked list. we use it to support iterations
+  // that can be concurrent with writes
+  ColumnFamilyData* next_;
+  ColumnFamilyData* prev_;
+
+  // This is the earliest log file number that contains data from this
+  // Column Family. All earlier log files must be ignored and not
+  // recovered from
+  uint64_t log_number_;
+
+  // A flag indicating whether we should delay writes because
+  // we have too many level 0 files
+  bool need_slowdown_for_num_level0_files_;
+
+  // An object that keeps all the compaction stats
+  // and picks the next compaction
+  std::unique_ptr<CompactionPicker> compaction_picker_;
+
+  ColumnFamilySet* column_family_set_;
+};
+
+// ColumnFamilySet has interesting thread-safety requirements
+// * CreateColumnFamily() or RemoveColumnFamily() -- need to be protected by
+// the DB mutex. Inside, column_family_data_ and column_families_ will be
+// protected by Lock() and Unlock(). CreateColumnFamily() should ONLY be
+// called from VersionSet::LogAndApply() at normal runtime. It is also called
+// during recovery and in DumpManifest(). RemoveColumnFamily() is called
+// from the ColumnFamilyData destructor
+// * Iteration -- hold the DB mutex, but you can release it in the body of
+// the iteration. If you release the DB mutex in the body, reference the
+// column family before releasing the mutex and unreference it after you
+// reacquire it, since the column family might get dropped when the DB mutex
+// is released
+// * GetDefault() -- thread safe
+// * GetColumnFamily() -- either inside of DB mutex or call Lock() <-> Unlock()
+// * GetNextColumnFamilyID(), GetMaxColumnFamily(), UpdateMaxColumnFamily(),
+// NumberOfColumnFamilies -- inside of DB mutex
+class ColumnFamilySet {
+ public:
+  // ColumnFamilySet supports iteration
+  class iterator {
+   public:
+    explicit iterator(ColumnFamilyData* cfd)
+        : current_(cfd) {}
+    iterator& operator++() {
+      // dummy is never dead or dropped, so this will never be infinite
+      do {
+        current_ = current_->next_;
+      } while (current_->refs_ == 0 || current_->IsDropped());
+      return *this;
+    }
+    bool operator!=(const iterator& other) {
+      return this->current_ != other.current_;
+    }
+    ColumnFamilyData* operator*() { return current_; }
+
+   private:
+    ColumnFamilyData* current_;
+  };
+
+  ColumnFamilySet(const std::string& dbname, const DBOptions* db_options,
+                  const EnvOptions& storage_options, Cache* table_cache);
+  ~ColumnFamilySet();
+
+  ColumnFamilyData* GetDefault() const;
+  // GetColumnFamily() calls return nullptr if column family is not found
+  ColumnFamilyData* GetColumnFamily(uint32_t id) const;
+  ColumnFamilyData* GetColumnFamily(const std::string& name) const;
+  // This call will return the next available column family ID. It guarantees
+  // that there is no column family with an ID greater than or equal to the
+  // returned value in the currently running instance, or at any point in
+  // this RocksDB instance's history.
+  uint32_t GetNextColumnFamilyID();
+  uint32_t GetMaxColumnFamily();
+  void UpdateMaxColumnFamily(uint32_t new_max_column_family);
+  size_t NumberOfColumnFamilies() const;
+
+  ColumnFamilyData* CreateColumnFamily(const std::string& name, uint32_t id,
+                                       Version* dummy_version,
+                                       const ColumnFamilyOptions& options);
+
+  iterator begin() { return iterator(dummy_cfd_->next_); }
+  iterator end() { return iterator(dummy_cfd_); }
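+
+  // (Illustrative sketch, not part of the original patch: the safe iteration
+  // pattern from the comment above when the loop body releases the DB mutex;
+  // "mutex" and "column_family_set" are assumed names:
+  //   for (auto cfd : *column_family_set) {
+  //     cfd->Ref();
+  //     mutex->Unlock();
+  //     ...            // work that does not need the DB mutex
+  //     mutex->Lock();
+  //     cfd->Unref();  // dead CFDs are reaped later, outside the iteration,
+  //   }                // by FreeDeadColumnFamilies()
+  // )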
+
+  void Lock();
+  void Unlock();
+
+  // REQUIRES: DB mutex held
+  // Don't call while iterating over ColumnFamilySet
+  void FreeDeadColumnFamilies();
+
+ private:
+  friend class ColumnFamilyData;
+  // helper function that gets called from cfd destructor
+  // REQUIRES: DB mutex held
+  void RemoveColumnFamily(ColumnFamilyData* cfd);
+
+  // column_families_ and column_family_data_ need to be protected:
+  // * when mutating: 1. DB mutex locked first, 2. spinlock locked second
+  // * when reading, either: 1. lock DB mutex, or 2. lock spinlock
+  //   (if both, respect the ordering to avoid deadlock!)
+  std::unordered_map<std::string, uint32_t> column_families_;
+  std::unordered_map<uint32_t, ColumnFamilyData*> column_family_data_;
+
+  uint32_t max_column_family_;
+  ColumnFamilyData* dummy_cfd_;
+  // We don't hold the refcount here, since the default column family always
+  // exists. We are also not responsible for cleaning up default_cfd_cache_.
+  // This is just a cache that makes the common case (accessing the default
+  // column family) faster
+  ColumnFamilyData* default_cfd_cache_;
+
+  const std::string db_name_;
+  const DBOptions* const db_options_;
+  const EnvOptions storage_options_;
+  Cache* table_cache_;
+  std::atomic_flag spin_lock_;
+};
+
+// We use ColumnFamilyMemTablesImpl to provide WriteBatch a way to access
+// memtables of different column families (specified by ID in the write batch)
+class ColumnFamilyMemTablesImpl : public ColumnFamilyMemTables {
+ public:
+  explicit ColumnFamilyMemTablesImpl(ColumnFamilySet* column_family_set)
+      : column_family_set_(column_family_set), current_(nullptr) {}
+
+  // sets current_ to the ColumnFamilyData with column_family_id
+  // returns false if the column family doesn't exist
+  bool Seek(uint32_t column_family_id) override;
+
+  // Returns log number of the selected column family
+  uint64_t GetLogNumber() const override;
+
+  // REQUIRES: Seek() called first
+  virtual MemTable* GetMemTable() const override;
+
+  // Returns options for the selected column family
+  // REQUIRES: Seek() called first
+  virtual const Options* GetOptions() const override;
+
+  // Returns the column family handle for the selected column family
+  virtual ColumnFamilyHandle* GetColumnFamilyHandle() override;
+
+ private:
+  ColumnFamilySet* column_family_set_;
+  ColumnFamilyData* current_;
+  ColumnFamilyHandleInternal handle_;
+};
+
+}  // namespace rocksdb
diff --git a/db/column_family_test.cc b/db/column_family_test.cc
new file mode 100644
index 0000000000..5f7ff48a8c
--- /dev/null
+++ b/db/column_family_test.cc
@@ -0,0 +1,977 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include +#include +#include + +#include "db/db_impl.h" +#include "rocksdb/env.h" +#include "rocksdb/db.h" +#include "util/testharness.h" +#include "util/testutil.h" +#include "util/coding.h" +#include "utilities/merge_operators.h" + +namespace rocksdb { + +namespace { +std::string RandomString(Random* rnd, int len) { + std::string r; + test::RandomString(rnd, len, &r); + return r; +} +} // anonymous namespace + +// counts how many operations were performed +class EnvCounter : public EnvWrapper { + public: + explicit EnvCounter(Env* base) + : EnvWrapper(base), num_new_writable_file_(0) {} + int GetNumberOfNewWritableFileCalls() { + return num_new_writable_file_; + } + Status NewWritableFile(const std::string& f, unique_ptr* r, + const EnvOptions& soptions) { + ++num_new_writable_file_; + return EnvWrapper::NewWritableFile(f, r, soptions); + } + + private: + int num_new_writable_file_; +}; + +class ColumnFamilyTest { + public: + ColumnFamilyTest() : rnd_(139) { + env_ = new EnvCounter(Env::Default()); + dbname_ = test::TmpDir() + "/column_family_test"; + db_options_.create_if_missing = true; + db_options_.env = env_; + DestroyDB(dbname_, Options(db_options_, column_family_options_)); + } + + ~ColumnFamilyTest() { + delete env_; + } + + void Close() { + for (auto h : handles_) { + delete h; + } + handles_.clear(); + names_.clear(); + delete db_; + db_ = nullptr; + } + + Status TryOpen(std::vector cf, + std::vector options = {}) { + std::vector column_families; + names_.clear(); + for (size_t i = 0; i < cf.size(); ++i) { + column_families.push_back(ColumnFamilyDescriptor( + cf[i], options.size() == 0 ? column_family_options_ : options[i])); + names_.push_back(cf[i]); + } + return DB::Open(db_options_, dbname_, column_families, &handles_, &db_); + } + + Status OpenReadOnly(std::vector cf, + std::vector options = {}) { + std::vector column_families; + names_.clear(); + for (size_t i = 0; i < cf.size(); ++i) { + column_families.push_back(ColumnFamilyDescriptor( + cf[i], options.size() == 0 ? column_family_options_ : options[i])); + names_.push_back(cf[i]); + } + return DB::OpenForReadOnly(db_options_, dbname_, column_families, &handles_, + &db_); + } + + void AssertOpenReadOnly(std::vector cf, + std::vector options = {}) { + ASSERT_OK(OpenReadOnly(cf, options)); + } + + + void Open(std::vector cf, + std::vector options = {}) { + ASSERT_OK(TryOpen(cf, options)); + } + + void Open() { + Open({"default"}); + } + + DBImpl* dbfull() { return reinterpret_cast(db_); } + + int GetProperty(int cf, std::string property) { + std::string value; + ASSERT_TRUE(dbfull()->GetProperty(handles_[cf], property, &value)); + return std::stoi(value); + } + + void Destroy() { + for (auto h : handles_) { + delete h; + } + handles_.clear(); + names_.clear(); + delete db_; + db_ = nullptr; + ASSERT_OK(DestroyDB(dbname_, Options(db_options_, column_family_options_))); + } + + void CreateColumnFamilies( + const std::vector& cfs, + const std::vector options = {}) { + int cfi = handles_.size(); + handles_.resize(cfi + cfs.size()); + names_.resize(cfi + cfs.size()); + for (size_t i = 0; i < cfs.size(); ++i) { + ASSERT_OK(db_->CreateColumnFamily( + options.size() == 0 ? 
column_family_options_ : options[i], cfs[i],
+          &handles_[cfi]));
+      names_[cfi] = cfs[i];
+      cfi++;
+    }
+  }
+
+  void Reopen(const std::vector<ColumnFamilyOptions> options = {}) {
+    std::vector<std::string> names;
+    for (auto name : names_) {
+      if (name != "") {
+        names.push_back(name);
+      }
+    }
+    Close();
+    assert(options.size() == 0 || names.size() == options.size());
+    Open(names, options);
+  }
+
+  void CreateColumnFamiliesAndReopen(const std::vector<std::string>& cfs) {
+    CreateColumnFamilies(cfs);
+    Reopen();
+  }
+
+  void DropColumnFamilies(const std::vector<int>& cfs) {
+    for (auto cf : cfs) {
+      ASSERT_OK(db_->DropColumnFamily(handles_[cf]));
+      delete handles_[cf];
+      handles_[cf] = nullptr;
+      names_[cf] = "";
+    }
+  }
+
+  void PutRandomData(int cf, int num, int key_value_size) {
+    for (int i = 0; i < num; ++i) {
+      // 10 bytes for key, rest is value
+      ASSERT_OK(Put(cf, test::RandomKey(&rnd_, 10),
+                    RandomString(&rnd_, key_value_size - 10)));
+    }
+  }
+
+  void WaitForFlush(int cf) {
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[cf]));
+  }
+
+  void WaitForCompaction() { ASSERT_OK(dbfull()->TEST_WaitForCompact()); }
+
+  Status Put(int cf, const std::string& key, const std::string& value) {
+    return db_->Put(WriteOptions(), handles_[cf], Slice(key), Slice(value));
+  }
+  Status Merge(int cf, const std::string& key, const std::string& value) {
+    return db_->Merge(WriteOptions(), handles_[cf], Slice(key), Slice(value));
+  }
+  Status Flush(int cf) {
+    return db_->Flush(FlushOptions(), handles_[cf]);
+  }
+
+  std::string Get(int cf, const std::string& key) {
+    ReadOptions options;
+    options.verify_checksums = true;
+    std::string result;
+    Status s = db_->Get(options, handles_[cf], Slice(key), &result);
+    if (s.IsNotFound()) {
+      result = "NOT_FOUND";
+    } else if (!s.ok()) {
+      result = s.ToString();
+    }
+    return result;
+  }
+
+  void CompactAll(int cf) {
+    ASSERT_OK(db_->CompactRange(handles_[cf], nullptr, nullptr));
+  }
+
+  void Compact(int cf, const Slice& start, const Slice& limit) {
+    ASSERT_OK(db_->CompactRange(handles_[cf], &start, &limit));
+  }
+
+  int NumTableFilesAtLevel(int level, int cf) {
+    return GetProperty(cf,
+                       "rocksdb.num-files-at-level" + std::to_string(level));
+  }
+
+  // Return spread of files per level
+  std::string FilesPerLevel(int cf) {
+    std::string result;
+    int last_non_zero_offset = 0;
+    for (int level = 0; level < dbfull()->NumberLevels(handles_[cf]); level++) {
+      int f = NumTableFilesAtLevel(level, cf);
+      char buf[100];
+      snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f);
+      result += buf;
+      if (f > 0) {
+        last_non_zero_offset = result.size();
+      }
+    }
+    result.resize(last_non_zero_offset);
+    return result;
+  }
+
+  int CountLiveFiles() {
+    std::vector<LiveFileMetaData> metadata;
+    db_->GetLiveFilesMetaData(&metadata);
+    return static_cast<int>(metadata.size());
+  }
+
+  // Do n memtable flushes, each of which produces an sstable
+  // covering the range [small,large].
+  void MakeTables(int cf, int n, const std::string& small,
+                  const std::string& large) {
+    for (int i = 0; i < n; i++) {
+      ASSERT_OK(Put(cf, small, "begin"));
+      ASSERT_OK(Put(cf, large, "end"));
+      ASSERT_OK(db_->Flush(FlushOptions(), handles_[cf]));
+    }
+  }
+
+  int CountLiveLogFiles() {
+    int micros_wait_for_log_deletion = 20000;
+    env_->SleepForMicroseconds(micros_wait_for_log_deletion);
+    int ret = 0;
+    VectorLogPtr wal_files;
+    Status s;
+    // GetSortedWalFiles is a flaky function -- it gets all the wal_dir
+    // children files and then later checks for their existence. If some of
+    // the log files don't exist anymore, it reports an error. It does all of
+    // this without the DB mutex held, so if a background process deletes a
+    // log file while the function is being executed, it returns an error. We
+    // retry the function 10 times to avoid the error failing the test
+    for (int retries = 0; retries < 10; ++retries) {
+      wal_files.clear();
+      s = db_->GetSortedWalFiles(wal_files);
+      if (s.ok()) {
+        break;
+      }
+    }
+    ASSERT_OK(s);
+    for (const auto& wal : wal_files) {
+      if (wal->Type() == kAliveLogFile) {
+        ++ret;
+      }
+    }
+    return ret;
+  }
+
+  void AssertNumberOfImmutableMemtables(std::vector<int> num_per_cf) {
+    assert(num_per_cf.size() == handles_.size());
+
+    for (size_t i = 0; i < num_per_cf.size(); ++i) {
+      ASSERT_EQ(num_per_cf[i],
+                GetProperty(i, "rocksdb.num-immutable-mem-table"));
+    }
+  }
+
+  void CopyFile(const std::string& source, const std::string& destination,
+                uint64_t size = 0) {
+    const EnvOptions soptions;
+    unique_ptr<SequentialFile> srcfile;
+    ASSERT_OK(env_->NewSequentialFile(source, &srcfile, soptions));
+    unique_ptr<WritableFile> destfile;
+    ASSERT_OK(env_->NewWritableFile(destination, &destfile, soptions));
+
+    if (size == 0) {
+      // default argument means copy everything
+      ASSERT_OK(env_->GetFileSize(source, &size));
+    }
+
+    char buffer[4096];
+    Slice slice;
+    while (size > 0) {
+      uint64_t one = std::min(uint64_t(sizeof(buffer)), size);
+      ASSERT_OK(srcfile->Read(one, &slice, buffer));
+      ASSERT_OK(destfile->Append(slice));
+      size -= slice.size();
+    }
+    ASSERT_OK(destfile->Close());
+  }
+
+  std::vector<ColumnFamilyHandle*> handles_;
+  std::vector<std::string> names_;
+  ColumnFamilyOptions column_family_options_;
+  DBOptions db_options_;
+  std::string dbname_;
+  DB* db_ = nullptr;
+  EnvCounter* env_;
+  Random rnd_;
+};
+
+TEST(ColumnFamilyTest, DontReuseColumnFamilyID) {
+  for (int iter = 0; iter < 3; ++iter) {
+    Open();
+    CreateColumnFamilies({"one", "two", "three"});
+    for (size_t i = 0; i < handles_.size(); ++i) {
+      auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(handles_[i]);
+      ASSERT_EQ(i, cfh->GetID());
+    }
+    if (iter == 1) {
+      Reopen();
+    }
+    DropColumnFamilies({3});
+    Reopen();
+    if (iter == 2) {
+      // this tests if max_column_family is correctly persisted with
+      // WriteSnapshot()
+      Reopen();
+    }
+    CreateColumnFamilies({"three2"});
+    // ID 3 that was used for the dropped column family "three" should not be
+    // reused
+    auto cfh3 = reinterpret_cast<ColumnFamilyHandleImpl*>(handles_[3]);
+    ASSERT_EQ(4U, cfh3->GetID());
+    Close();
+    Destroy();
+  }
+}
+
+
+TEST(ColumnFamilyTest, AddDrop) {
+  Open();
+  CreateColumnFamilies({"one", "two", "three"});
+  ASSERT_EQ("NOT_FOUND", Get(1, "fodor"));
+  ASSERT_EQ("NOT_FOUND", Get(2, "fodor"));
+  DropColumnFamilies({2});
+  ASSERT_EQ("NOT_FOUND", Get(1, "fodor"));
+  CreateColumnFamilies({"four"});
+  ASSERT_EQ("NOT_FOUND", Get(3, "fodor"));
+  ASSERT_OK(Put(1, "fodor", "mirko"));
+  ASSERT_EQ("mirko", Get(1, "fodor"));
+  ASSERT_EQ("NOT_FOUND", Get(3, "fodor"));
+  Close();
+  ASSERT_TRUE(TryOpen({"default"}).IsInvalidArgument());
+  Open({"default", "one", "three", "four"});
+  DropColumnFamilies({1});
+  Reopen();
+  Close();
+
+  std::vector<std::string> families;
+  ASSERT_OK(DB::ListColumnFamilies(db_options_, dbname_, &families));
+  sort(families.begin(), families.end());
+  ASSERT_TRUE(families ==
+              std::vector<std::string>({"default", "four", "three"}));
+}
+
+TEST(ColumnFamilyTest, DropTest) {
+  // first iteration - don't reopen DB before dropping
+  // second iteration - reopen DB before dropping
+  for (int iter = 0; iter < 2; ++iter) {
+    Open({"default"});
+    CreateColumnFamiliesAndReopen({"pikachu"});
+    for (int i = 0; i < 100; ++i) {
+      ASSERT_OK(Put(1, std::to_string(i), "bar" + std::to_string(i)));
+    }
+
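+    // (Clarifying note, not part of the original patch: the flush below
+    // persists the memtable into exactly one live SST file, which is what
+    // the CountLiveFiles() == 1 assertion further down counts.)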
ASSERT_OK(Flush(1)); + + if (iter == 1) { + Reopen(); + } + ASSERT_EQ("bar1", Get(1, "1")); + + ASSERT_EQ(CountLiveFiles(), 1); + DropColumnFamilies({1}); + // make sure that all files are deleted when we drop the column family + ASSERT_EQ(CountLiveFiles(), 0); + Destroy(); + } +} + +TEST(ColumnFamilyTest, WriteBatchFailure) { + Open(); + CreateColumnFamiliesAndReopen({"one", "two"}); + WriteBatch batch; + batch.Put(handles_[1], Slice("non-existing"), Slice("column-family")); + ASSERT_OK(db_->Write(WriteOptions(), &batch)); + DropColumnFamilies({1}); + Status s = db_->Write(WriteOptions(), &batch); + ASSERT_TRUE(s.IsInvalidArgument()); + Close(); +} + +TEST(ColumnFamilyTest, ReadWrite) { + Open(); + CreateColumnFamiliesAndReopen({"one", "two"}); + ASSERT_OK(Put(0, "foo", "v1")); + ASSERT_OK(Put(0, "bar", "v2")); + ASSERT_OK(Put(1, "mirko", "v3")); + ASSERT_OK(Put(0, "foo", "v2")); + ASSERT_OK(Put(2, "fodor", "v5")); + + for (int iter = 0; iter <= 3; ++iter) { + ASSERT_EQ("v2", Get(0, "foo")); + ASSERT_EQ("v2", Get(0, "bar")); + ASSERT_EQ("v3", Get(1, "mirko")); + ASSERT_EQ("v5", Get(2, "fodor")); + ASSERT_EQ("NOT_FOUND", Get(0, "fodor")); + ASSERT_EQ("NOT_FOUND", Get(1, "fodor")); + ASSERT_EQ("NOT_FOUND", Get(2, "foo")); + if (iter <= 1) { + Reopen(); + } + } + Close(); +} + +TEST(ColumnFamilyTest, IgnoreRecoveredLog) { + std::string backup_logs = dbname_ + "/backup_logs"; + + // delete old files in backup_logs directory + ASSERT_OK(env_->CreateDirIfMissing(dbname_)); + ASSERT_OK(env_->CreateDirIfMissing(backup_logs)); + std::vector old_files; + env_->GetChildren(backup_logs, &old_files); + for (auto& file : old_files) { + if (file != "." && file != "..") { + env_->DeleteFile(backup_logs + "/" + file); + } + } + + column_family_options_.merge_operator = + MergeOperators::CreateUInt64AddOperator(); + db_options_.wal_dir = dbname_ + "/logs"; + Destroy(); + Open(); + CreateColumnFamilies({"cf1", "cf2"}); + + // fill up the DB + std::string one, two, three; + PutFixed64(&one, 1); + PutFixed64(&two, 2); + PutFixed64(&three, 3); + ASSERT_OK(Merge(0, "foo", one)); + ASSERT_OK(Merge(1, "mirko", one)); + ASSERT_OK(Merge(0, "foo", one)); + ASSERT_OK(Merge(2, "bla", one)); + ASSERT_OK(Merge(2, "fodor", one)); + ASSERT_OK(Merge(0, "bar", one)); + ASSERT_OK(Merge(2, "bla", one)); + ASSERT_OK(Merge(1, "mirko", two)); + ASSERT_OK(Merge(1, "franjo", one)); + + // copy the logs to backup + std::vector logs; + env_->GetChildren(db_options_.wal_dir, &logs); + for (auto& log : logs) { + if (log != ".." && log != ".") { + CopyFile(db_options_.wal_dir + "/" + log, backup_logs + "/" + log); + } + } + + // recover the DB + Close(); + + // 1. check consistency + // 2. copy the logs from backup back to WAL dir. if the recovery happens + // again on the same log files, this should lead to incorrect results + // due to applying merge operator twice + // 3. check consistency + for (int iter = 0; iter < 2; ++iter) { + // assert consistency + Open({"default", "cf1", "cf2"}); + ASSERT_EQ(two, Get(0, "foo")); + ASSERT_EQ(one, Get(0, "bar")); + ASSERT_EQ(three, Get(1, "mirko")); + ASSERT_EQ(one, Get(1, "franjo")); + ASSERT_EQ(one, Get(2, "fodor")); + ASSERT_EQ(two, Get(2, "bla")); + Close(); + + if (iter == 0) { + // copy the logs from backup back to wal dir + for (auto& log : logs) { + if (log != ".." 
&& log != ".") { + CopyFile(backup_logs + "/" + log, db_options_.wal_dir + "/" + log); + } + } + } + } +} + +TEST(ColumnFamilyTest, FlushTest) { + Open(); + CreateColumnFamiliesAndReopen({"one", "two"}); + ASSERT_OK(Put(0, "foo", "v1")); + ASSERT_OK(Put(0, "bar", "v2")); + ASSERT_OK(Put(1, "mirko", "v3")); + ASSERT_OK(Put(0, "foo", "v2")); + ASSERT_OK(Put(2, "fodor", "v5")); + for (int i = 0; i < 3; ++i) { + Flush(i); + } + Reopen(); + + for (int iter = 0; iter <= 2; ++iter) { + ASSERT_EQ("v2", Get(0, "foo")); + ASSERT_EQ("v2", Get(0, "bar")); + ASSERT_EQ("v3", Get(1, "mirko")); + ASSERT_EQ("v5", Get(2, "fodor")); + ASSERT_EQ("NOT_FOUND", Get(0, "fodor")); + ASSERT_EQ("NOT_FOUND", Get(1, "fodor")); + ASSERT_EQ("NOT_FOUND", Get(2, "foo")); + if (iter <= 1) { + Reopen(); + } + } + Close(); +} + +// Makes sure that obsolete log files get deleted +TEST(ColumnFamilyTest, LogDeletionTest) { + db_options_.max_total_wal_size = std::numeric_limits::max(); + column_family_options_.write_buffer_size = 100000; // 100KB + Open(); + CreateColumnFamilies({"one", "two", "three", "four"}); + // Each bracket is one log file. if number is in (), it means + // we don't need it anymore (it's been flushed) + // [] + ASSERT_EQ(CountLiveLogFiles(), 0); + PutRandomData(0, 1, 100); + // [0] + PutRandomData(1, 1, 100); + // [0, 1] + PutRandomData(1, 1000, 100); + WaitForFlush(1); + // [0, (1)] [1] + ASSERT_EQ(CountLiveLogFiles(), 2); + PutRandomData(0, 1, 100); + // [0, (1)] [0, 1] + ASSERT_EQ(CountLiveLogFiles(), 2); + PutRandomData(2, 1, 100); + // [0, (1)] [0, 1, 2] + PutRandomData(2, 1000, 100); + WaitForFlush(2); + // [0, (1)] [0, 1, (2)] [2] + ASSERT_EQ(CountLiveLogFiles(), 3); + PutRandomData(2, 1000, 100); + WaitForFlush(2); + // [0, (1)] [0, 1, (2)] [(2)] [2] + ASSERT_EQ(CountLiveLogFiles(), 4); + PutRandomData(3, 1, 100); + // [0, (1)] [0, 1, (2)] [(2)] [2, 3] + PutRandomData(1, 1, 100); + // [0, (1)] [0, 1, (2)] [(2)] [1, 2, 3] + ASSERT_EQ(CountLiveLogFiles(), 4); + PutRandomData(1, 1000, 100); + WaitForFlush(1); + // [0, (1)] [0, (1), (2)] [(2)] [(1), 2, 3] [1] + ASSERT_EQ(CountLiveLogFiles(), 5); + PutRandomData(0, 1000, 100); + WaitForFlush(0); + // [(0), (1)] [(0), (1), (2)] [(2)] [(1), 2, 3] [1, (0)] [0] + // delete obsolete logs --> + // [(1), 2, 3] [1, (0)] [0] + ASSERT_EQ(CountLiveLogFiles(), 3); + PutRandomData(0, 1000, 100); + WaitForFlush(0); + // [(1), 2, 3] [1, (0)], [(0)] [0] + ASSERT_EQ(CountLiveLogFiles(), 4); + PutRandomData(1, 1000, 100); + WaitForFlush(1); + // [(1), 2, 3] [(1), (0)] [(0)] [0, (1)] [1] + ASSERT_EQ(CountLiveLogFiles(), 5); + PutRandomData(2, 1000, 100); + WaitForFlush(2); + // [(1), (2), 3] [(1), (0)] [(0)] [0, (1)] [1, (2)], [2] + ASSERT_EQ(CountLiveLogFiles(), 6); + PutRandomData(3, 1000, 100); + WaitForFlush(3); + // [(1), (2), (3)] [(1), (0)] [(0)] [0, (1)] [1, (2)], [2, (3)] [3] + // delete obsolete logs --> + // [0, (1)] [1, (2)], [2, (3)] [3] + ASSERT_EQ(CountLiveLogFiles(), 4); + Close(); +} + +// Makes sure that obsolete log files get deleted +TEST(ColumnFamilyTest, DifferentWriteBufferSizes) { + // disable flushing stale column families + db_options_.max_total_wal_size = std::numeric_limits::max(); + Open(); + CreateColumnFamilies({"one", "two", "three"}); + ColumnFamilyOptions default_cf, one, two, three; + // setup options. 
all column families have max_write_buffer_number setup to 10 + // "default" -> 100KB memtable, start flushing immediatelly + // "one" -> 200KB memtable, start flushing with two immutable memtables + // "two" -> 1MB memtable, start flushing with three immutable memtables + // "three" -> 90KB memtable, start flushing with four immutable memtables + default_cf.write_buffer_size = 100000; + default_cf.max_write_buffer_number = 10; + default_cf.min_write_buffer_number_to_merge = 1; + one.write_buffer_size = 200000; + one.max_write_buffer_number = 10; + one.min_write_buffer_number_to_merge = 2; + two.write_buffer_size = 1000000; + two.max_write_buffer_number = 10; + two.min_write_buffer_number_to_merge = 3; + three.write_buffer_size = 90000; + three.max_write_buffer_number = 10; + three.min_write_buffer_number_to_merge = 4; + + Reopen({default_cf, one, two, three}); + + int micros_wait_for_flush = 10000; + PutRandomData(0, 100, 1000); + WaitForFlush(0); + AssertNumberOfImmutableMemtables({0, 0, 0, 0}); + ASSERT_EQ(CountLiveLogFiles(), 1); + PutRandomData(1, 200, 1000); + env_->SleepForMicroseconds(micros_wait_for_flush); + AssertNumberOfImmutableMemtables({0, 1, 0, 0}); + ASSERT_EQ(CountLiveLogFiles(), 2); + PutRandomData(2, 1000, 1000); + env_->SleepForMicroseconds(micros_wait_for_flush); + AssertNumberOfImmutableMemtables({0, 1, 1, 0}); + ASSERT_EQ(CountLiveLogFiles(), 3); + PutRandomData(2, 1000, 1000); + env_->SleepForMicroseconds(micros_wait_for_flush); + AssertNumberOfImmutableMemtables({0, 1, 2, 0}); + ASSERT_EQ(CountLiveLogFiles(), 4); + PutRandomData(3, 90, 1000); + env_->SleepForMicroseconds(micros_wait_for_flush); + AssertNumberOfImmutableMemtables({0, 1, 2, 1}); + ASSERT_EQ(CountLiveLogFiles(), 5); + PutRandomData(3, 90, 1000); + env_->SleepForMicroseconds(micros_wait_for_flush); + AssertNumberOfImmutableMemtables({0, 1, 2, 2}); + ASSERT_EQ(CountLiveLogFiles(), 6); + PutRandomData(3, 90, 1000); + env_->SleepForMicroseconds(micros_wait_for_flush); + AssertNumberOfImmutableMemtables({0, 1, 2, 3}); + ASSERT_EQ(CountLiveLogFiles(), 7); + PutRandomData(0, 100, 1000); + WaitForFlush(0); + AssertNumberOfImmutableMemtables({0, 1, 2, 3}); + ASSERT_EQ(CountLiveLogFiles(), 8); + PutRandomData(2, 100, 10000); + WaitForFlush(2); + AssertNumberOfImmutableMemtables({0, 1, 0, 3}); + ASSERT_EQ(CountLiveLogFiles(), 9); + PutRandomData(3, 90, 1000); + WaitForFlush(3); + AssertNumberOfImmutableMemtables({0, 1, 0, 0}); + ASSERT_EQ(CountLiveLogFiles(), 10); + PutRandomData(3, 90, 1000); + env_->SleepForMicroseconds(micros_wait_for_flush); + AssertNumberOfImmutableMemtables({0, 1, 0, 1}); + ASSERT_EQ(CountLiveLogFiles(), 11); + PutRandomData(1, 200, 1000); + WaitForFlush(1); + AssertNumberOfImmutableMemtables({0, 0, 0, 1}); + ASSERT_EQ(CountLiveLogFiles(), 5); + PutRandomData(3, 90*6, 1000); + WaitForFlush(3); + AssertNumberOfImmutableMemtables({0, 0, 0, 0}); + ASSERT_EQ(CountLiveLogFiles(), 12); + PutRandomData(0, 100, 1000); + WaitForFlush(0); + AssertNumberOfImmutableMemtables({0, 0, 0, 0}); + ASSERT_EQ(CountLiveLogFiles(), 12); + PutRandomData(2, 3*100, 10000); + WaitForFlush(2); + AssertNumberOfImmutableMemtables({0, 0, 0, 0}); + ASSERT_EQ(CountLiveLogFiles(), 12); + PutRandomData(1, 2*200, 1000); + WaitForFlush(1); + AssertNumberOfImmutableMemtables({0, 0, 0, 0}); + ASSERT_EQ(CountLiveLogFiles(), 7); + Close(); +} + +TEST(ColumnFamilyTest, DifferentMergeOperators) { + Open(); + CreateColumnFamilies({"first", "second"}); + ColumnFamilyOptions default_cf, first, second; + first.merge_operator = 
MergeOperators::CreateUInt64AddOperator(); + second.merge_operator = MergeOperators::CreateStringAppendOperator(); + Reopen({default_cf, first, second}); + + std::string one, two, three; + PutFixed64(&one, 1); + PutFixed64(&two, 2); + PutFixed64(&three, 3); + + ASSERT_OK(Put(0, "foo", two)); + ASSERT_OK(Put(0, "foo", one)); + ASSERT_TRUE(Merge(0, "foo", two).IsNotSupported()); + ASSERT_EQ(Get(0, "foo"), one); + + ASSERT_OK(Put(1, "foo", two)); + ASSERT_OK(Put(1, "foo", one)); + ASSERT_OK(Merge(1, "foo", two)); + ASSERT_EQ(Get(1, "foo"), three); + + ASSERT_OK(Put(2, "foo", two)); + ASSERT_OK(Put(2, "foo", one)); + ASSERT_OK(Merge(2, "foo", two)); + ASSERT_EQ(Get(2, "foo"), one + "," + two); + Close(); +} + +TEST(ColumnFamilyTest, DifferentCompactionStyles) { + Open(); + CreateColumnFamilies({"one", "two"}); + ColumnFamilyOptions default_cf, one, two; + db_options_.max_open_files = 20; // only 10 files in file cache + db_options_.disableDataSync = true; + + default_cf.compaction_style = kCompactionStyleLevel; + default_cf.num_levels = 3; + default_cf.write_buffer_size = 64 << 10; // 64KB + default_cf.target_file_size_base = 30 << 10; + default_cf.filter_policy = nullptr; + default_cf.no_block_cache = true; + default_cf.source_compaction_factor = 100; + default_cf.disable_seek_compaction = false; + + one.compaction_style = kCompactionStyleUniversal; + // trigger compaction if there are >= 4 files + one.level0_file_num_compaction_trigger = 4; + one.write_buffer_size = 100000; + + two.compaction_style = kCompactionStyleLevel; + two.num_levels = 4; + two.max_mem_compaction_level = 0; + two.level0_file_num_compaction_trigger = 3; + two.write_buffer_size = 100000; + + Reopen({default_cf, one, two}); + + // SETUP column family "default" - test read compaction + ASSERT_EQ("", FilesPerLevel(0)); + PutRandomData(0, 1, 4096); + ASSERT_OK(Flush(0)); + ASSERT_EQ("0,0,1", FilesPerLevel(0)); + // write 8MB + PutRandomData(0, 2000, 4096); + ASSERT_OK(Flush(0)); + // clear levels 0 and 1 + dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[0]); + dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[0]); + ASSERT_EQ(NumTableFilesAtLevel(0, 0), 0); + ASSERT_EQ(NumTableFilesAtLevel(1, 0), 0); + // write some new keys into level 0 and 1 + PutRandomData(0, 1024, 512); + ASSERT_OK(Flush(0)); + WaitForCompaction(); + PutRandomData(0, 10, 512); + ASSERT_OK(Flush(0)); + // remember number of files in each level + int l1 = NumTableFilesAtLevel(0, 0); + int l2 = NumTableFilesAtLevel(1, 0); + int l3 = NumTableFilesAtLevel(2, 0); + ASSERT_NE(l1, 0); + ASSERT_NE(l2, 0); + ASSERT_NE(l3, 0); + + // SETUP column family "one" -- universal style + for (int i = 0; i < one.level0_file_num_compaction_trigger - 1; ++i) { + PutRandomData(1, 11, 10000); + WaitForFlush(1); + ASSERT_EQ(std::to_string(i + 1), FilesPerLevel(1)); + } + + // SETUP column family "two" -- level style with 4 levels + for (int i = 0; i < two.level0_file_num_compaction_trigger - 1; ++i) { + PutRandomData(2, 15, 10000); + WaitForFlush(2); + ASSERT_EQ(std::to_string(i + 1), FilesPerLevel(2)); + } + + // TRIGGER compaction "default" + // read a bunch of times, trigger read compaction + for (int i = 0; i < 200000; ++i) { + Get(0, std::to_string(i)); + } + + // TRIGGER compaction "one" + PutRandomData(1, 12, 10000); + + // TRIGGER compaction "two" + PutRandomData(2, 10, 10000); + + // WAIT for compactions + WaitForCompaction(); + + // VERIFY compaction "default" + // verify that the number of files have decreased + // in some level, indicating that 
there was a compaction + ASSERT_TRUE(NumTableFilesAtLevel(0, 0) < l1 || + NumTableFilesAtLevel(1, 0) < l2 || + NumTableFilesAtLevel(2, 0) < l3); + + // VERIFY compaction "one" + ASSERT_EQ("1", FilesPerLevel(1)); + + // VERIFY compaction "two" + ASSERT_EQ("0,1", FilesPerLevel(2)); + CompactAll(2); + ASSERT_EQ("0,1", FilesPerLevel(2)); + + Close(); +} + +namespace { +std::string IterStatus(Iterator* iter) { + std::string result; + if (iter->Valid()) { + result = iter->key().ToString() + "->" + iter->value().ToString(); + } else { + result = "(invalid)"; + } + return result; +} +} // anonymous namespace + +TEST(ColumnFamilyTest, NewIteratorsTest) { + // iter == 0 -- no tailing + // iter == 2 -- tailing + for (int iter = 0; iter < 2; ++iter) { + Open(); + CreateColumnFamiliesAndReopen({"one", "two"}); + ASSERT_OK(Put(0, "a", "b")); + ASSERT_OK(Put(1, "b", "a")); + ASSERT_OK(Put(2, "c", "m")); + ASSERT_OK(Put(2, "v", "t")); + std::vector iterators; + ReadOptions options; + options.tailing = (iter == 1); + ASSERT_OK(db_->NewIterators(options, handles_, &iterators)); + + for (auto it : iterators) { + it->SeekToFirst(); + } + ASSERT_EQ(IterStatus(iterators[0]), "a->b"); + ASSERT_EQ(IterStatus(iterators[1]), "b->a"); + ASSERT_EQ(IterStatus(iterators[2]), "c->m"); + + ASSERT_OK(Put(1, "x", "x")); + + for (auto it : iterators) { + it->Next(); + } + + ASSERT_EQ(IterStatus(iterators[0]), "(invalid)"); + if (iter == 0) { + // no tailing + ASSERT_EQ(IterStatus(iterators[1]), "(invalid)"); + } else { + // tailing + ASSERT_EQ(IterStatus(iterators[1]), "x->x"); + } + ASSERT_EQ(IterStatus(iterators[2]), "v->t"); + + for (auto it : iterators) { + delete it; + } + Destroy(); + } +} + +TEST(ColumnFamilyTest, ReadOnlyDBTest) { + Open(); + CreateColumnFamiliesAndReopen({"one", "two", "three", "four"}); + ASSERT_OK(Put(1, "foo", "bla")); + ASSERT_OK(Put(2, "foo", "blabla")); + ASSERT_OK(Put(3, "foo", "blablabla")); + ASSERT_OK(Put(4, "foo", "blablablabla")); + + DropColumnFamilies({2}); + Close(); + // open only a subset of column families + AssertOpenReadOnly({"default", "one", "four"}); + ASSERT_EQ("NOT_FOUND", Get(0, "foo")); + ASSERT_EQ("bla", Get(1, "foo")); + ASSERT_EQ("blablablabla", Get(2, "foo")); + + Close(); + // can't open dropped column family + Status s = OpenReadOnly({"default", "one", "two"}); + ASSERT_TRUE(!s.ok()); + + // Can't open without specifying default column family + s = OpenReadOnly({"one", "four"}); + ASSERT_TRUE(!s.ok()); +} + +TEST(ColumnFamilyTest, DontRollEmptyLogs) { + Open(); + CreateColumnFamiliesAndReopen({"one", "two", "three", "four"}); + + for (size_t i = 0; i < handles_.size(); ++i) { + PutRandomData(i, 10, 100); + } + int num_writable_file_start = env_->GetNumberOfNewWritableFileCalls(); + // this will trigger the flushes + ASSERT_OK(db_->Write(WriteOptions(), nullptr)); + + for (int i = 0; i < 4; ++i) { + dbfull()->TEST_WaitForFlushMemTable(handles_[i]); + } + int total_new_writable_files = + env_->GetNumberOfNewWritableFileCalls() - num_writable_file_start; + ASSERT_EQ(static_cast(total_new_writable_files), handles_.size() + 1); + Close(); +} + +TEST(ColumnFamilyTest, FlushStaleColumnFamilies) { + Open(); + CreateColumnFamilies({"one", "two"}); + ColumnFamilyOptions default_cf, one, two; + default_cf.write_buffer_size = 100000; // small write buffer size + default_cf.disable_auto_compactions = true; + one.disable_auto_compactions = true; + two.disable_auto_compactions = true; + db_options_.max_total_wal_size = 210000; + + Reopen({default_cf, one, two}); + + 
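+  // (Clarifying note, not part of the original patch: with
+  // max_total_wal_size = 210000 and roughly 100KB of fresh data written to
+  // "default" per iteration below, the old WAL that the single tiny write
+  // to column family "two" still references soon pushes the total WAL size
+  // over the limit; that is what forces the "stale" flush asserted below.)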
+  PutRandomData(2, 1, 10);  // 10 bytes
+  for (int i = 0; i < 2; ++i) {
+    PutRandomData(0, 100, 1000);  // flush
+    WaitForFlush(0);
+    ASSERT_EQ(i + 1, CountLiveFiles());
+  }
+  // third flush. now, CF [two] should be detected as stale and flushed
+  // column family 1 should not be flushed since it's empty
+  PutRandomData(0, 100, 1000);  // flush
+  WaitForFlush(0);
+  WaitForFlush(2);
+  // 3 files for the default column family, 1 file for column family [two],
+  // zero files for column family [one], because it's empty
+  ASSERT_EQ(4, CountLiveFiles());
+  Close();
+}
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  return rocksdb::test::RunAllTests();
+}
diff --git a/db/compaction.cc b/db/compaction.cc
new file mode 100644
index 0000000000..a8caa59efd
--- /dev/null
+++ b/db/compaction.cc
@@ -0,0 +1,253 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/compaction.h"
+
+#define __STDC_FORMAT_MACROS
+#include <inttypes.h>
+#include <vector>
+
+#include "db/column_family.h"
+#include "util/logging.h"
+
+namespace rocksdb {
+
+static uint64_t TotalFileSize(const std::vector<FileMetaData*>& files) {
+  uint64_t sum = 0;
+  for (size_t i = 0; i < files.size() && files[i]; i++) {
+    sum += files[i]->file_size;
+  }
+  return sum;
+}
+
+Compaction::Compaction(Version* input_version, int level, int out_level,
+                       uint64_t target_file_size,
+                       uint64_t max_grandparent_overlap_bytes,
+                       bool seek_compaction, bool enable_compression,
+                       bool deletion_compaction)
+    : level_(level),
+      out_level_(out_level),
+      max_output_file_size_(target_file_size),
+      max_grandparent_overlap_bytes_(max_grandparent_overlap_bytes),
+      input_version_(input_version),
+      number_levels_(input_version_->NumberLevels()),
+      cfd_(input_version_->cfd_),
+      seek_compaction_(seek_compaction),
+      enable_compression_(enable_compression),
+      deletion_compaction_(deletion_compaction),
+      grandparent_index_(0),
+      seen_key_(false),
+      overlapped_bytes_(0),
+      base_index_(-1),
+      parent_index_(-1),
+      score_(0),
+      bottommost_level_(false),
+      is_full_compaction_(false),
+      is_manual_compaction_(false),
+      level_ptrs_(std::vector<size_t>(number_levels_)) {
+
+  cfd_->Ref();
+  input_version_->Ref();
+  edit_ = new VersionEdit();
+  edit_->SetColumnFamily(cfd_->GetID());
+  for (int i = 0; i < number_levels_; i++) {
+    level_ptrs_[i] = 0;
+  }
+}
+
+Compaction::~Compaction() {
+  delete edit_;
+  if (input_version_ != nullptr) {
+    input_version_->Unref();
+  }
+  if (cfd_ != nullptr) {
+    if (cfd_->Unref()) {
+      delete cfd_;
+    }
+  }
+}
+
+bool Compaction::IsTrivialMove() const {
+  // Avoid a move if there is lots of overlapping grandparent data.
+  // Otherwise, the move could create a parent file that will require
+  // a very expensive merge later on.
+  // If level_ == out_level_, the purpose is to force the compaction filter
+  // to be applied to that level, and thus it cannot be a trivial move.
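+  // (Illustrative note, not part of the original patch: e.g. a compaction
+  // that picks exactly one file in level N, has no input files in level N+1,
+  // and overlaps little enough data in level N+2 can be performed by simply
+  // re-linking the file into level N+1 in the manifest; nothing is
+  // rewritten.)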
+ return (level_ != out_level_ && + num_input_files(0) == 1 && + num_input_files(1) == 0 && + TotalFileSize(grandparents_) <= max_grandparent_overlap_bytes_); +} + +bool Compaction::IsDeletionCompaction() const { return deletion_compaction_; } + +void Compaction::AddInputDeletions(VersionEdit* edit) { + for (int which = 0; which < 2; which++) { + for (size_t i = 0; i < inputs_[which].size(); i++) { + edit->DeleteFile(level_ + which, inputs_[which][i]->number); + } + } +} + +bool Compaction::IsBaseLevelForKey(const Slice& user_key) { + assert(cfd_->options()->compaction_style != kCompactionStyleFIFO); + if (cfd_->options()->compaction_style == kCompactionStyleUniversal) { + return bottommost_level_; + } + // Maybe use binary search to find right entry instead of linear search? + const Comparator* user_cmp = cfd_->user_comparator(); + for (int lvl = level_ + 2; lvl < number_levels_; lvl++) { + const std::vector& files = input_version_->files_[lvl]; + for (; level_ptrs_[lvl] < files.size(); ) { + FileMetaData* f = files[level_ptrs_[lvl]]; + if (user_cmp->Compare(user_key, f->largest.user_key()) <= 0) { + // We've advanced far enough + if (user_cmp->Compare(user_key, f->smallest.user_key()) >= 0) { + // Key falls in this file's range, so definitely not base level + return false; + } + break; + } + level_ptrs_[lvl]++; + } + } + return true; +} + +bool Compaction::ShouldStopBefore(const Slice& internal_key) { + // Scan to find earliest grandparent file that contains key. + const InternalKeyComparator* icmp = &cfd_->internal_comparator(); + while (grandparent_index_ < grandparents_.size() && + icmp->Compare(internal_key, + grandparents_[grandparent_index_]->largest.Encode()) > 0) { + if (seen_key_) { + overlapped_bytes_ += grandparents_[grandparent_index_]->file_size; + } + assert(grandparent_index_ + 1 >= grandparents_.size() || + icmp->Compare(grandparents_[grandparent_index_]->largest.Encode(), + grandparents_[grandparent_index_+1]->smallest.Encode()) + < 0); + grandparent_index_++; + } + seen_key_ = true; + + if (overlapped_bytes_ > max_grandparent_overlap_bytes_) { + // Too much overlap for current output; start new output + overlapped_bytes_ = 0; + return true; + } else { + return false; + } +} + +// Mark (or clear) each file that is being compacted +void Compaction::MarkFilesBeingCompacted(bool value) { + for (int i = 0; i < 2; i++) { + std::vector v = inputs_[i]; + for (unsigned int j = 0; j < inputs_[i].size(); j++) { + assert(value ? !inputs_[i][j]->being_compacted : + inputs_[i][j]->being_compacted); + inputs_[i][j]->being_compacted = value; + } + } +} + +// Is this compaction producing files at the bottommost level? +void Compaction::SetupBottomMostLevel(bool isManual) { + assert(cfd_->options()->compaction_style != kCompactionStyleFIFO); + if (cfd_->options()->compaction_style == kCompactionStyleUniversal) { + // If universal compaction style is used and manual + // compaction is occuring, then we are guaranteed that + // all files will be picked in a single compaction + // run. We can safely set bottommost_level_ = true. + // If it is not manual compaction, then bottommost_level_ + // is already set when the Compaction was created. 
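+  // (Clarifying note, not part of the original patch: a manual universal
+  // compaction takes every file as input, so its output is necessarily the
+  // bottommost data for the column family.)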
+ if (isManual) { + bottommost_level_ = true; + } + return; + } + bottommost_level_ = true; + for (int i = output_level() + 1; i < number_levels_; i++) { + if (input_version_->NumLevelFiles(i) > 0) { + bottommost_level_ = false; + break; + } + } +} + +void Compaction::ReleaseInputs() { + if (input_version_ != nullptr) { + input_version_->Unref(); + input_version_ = nullptr; + } + if (cfd_ != nullptr) { + if (cfd_->Unref()) { + delete cfd_; + } + cfd_ = nullptr; + } +} + +void Compaction::ReleaseCompactionFiles(Status status) { + cfd_->compaction_picker()->ReleaseCompactionFiles(this, status); +} + +void Compaction::ResetNextCompactionIndex() { + input_version_->ResetNextCompactionIndex(level_); +} + +namespace { +int InputSummary(const std::vector& files, char* output, + int len) { + *output = '\0'; + int write = 0; + for (unsigned int i = 0; i < files.size(); i++) { + int sz = len - write; + int ret; + char sztxt[16]; + AppendHumanBytes(files.at(i)->file_size, sztxt, 16); + ret = snprintf(output + write, sz, "%" PRIu64 "(%s) ", files.at(i)->number, + sztxt); + if (ret < 0 || ret >= sz) break; + write += ret; + } + // if files.size() is non-zero, overwrite the last space + return write - !!files.size(); +} +} // namespace + +void Compaction::Summary(char* output, int len) { + int write = + snprintf(output, len, "Base version %" PRIu64 + " Base level %d, seek compaction:%d, inputs: [", + input_version_->GetVersionNumber(), level_, seek_compaction_); + if (write < 0 || write >= len) { + return; + } + + write += InputSummary(inputs_[0], output + write, len - write); + if (write < 0 || write >= len) { + return; + } + + write += snprintf(output + write, len - write, "], ["); + if (write < 0 || write >= len) { + return; + } + + write += InputSummary(inputs_[1], output + write, len - write); + if (write < 0 || write >= len) { + return; + } + + snprintf(output + write, len - write, "]"); +} + +} // namespace rocksdb diff --git a/db/compaction.h b/db/compaction.h new file mode 100644 index 0000000000..aaa4023038 --- /dev/null +++ b/db/compaction.h @@ -0,0 +1,158 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include "db/version_set.h" + +namespace rocksdb { + +class Version; +class ColumnFamilyData; + +// A Compaction encapsulates information about a compaction. +class Compaction { + public: + ~Compaction(); + + // Return the level that is being compacted. Inputs from "level" + // will be merged. + int level() const { return level_; } + + // Outputs will go to this level + int output_level() const { return out_level_; } + + // Return the object that holds the edits to the descriptor done + // by this compaction. + VersionEdit* edit() { return edit_; } + + // "which" must be either 0 or 1 + int num_input_files(int which) const { return inputs_[which].size(); } + + // Returns input version of the compaction + Version* input_version() const { return input_version_; } + + ColumnFamilyData* column_family_data() const { return cfd_; } + + // Return the ith input file at "level()+which" ("which" must be 0 or 1). 
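+  // (Illustrative note, not part of the original patch: the canonical loop
+  // over both input sets, mirroring AddInputDeletions() in compaction.cc:
+  //   for (int which = 0; which < 2; which++)
+  //     for (int i = 0; i < num_input_files(which); i++)
+  //       ... input(which, i) ...
+  // )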
+ FileMetaData* input(int which, int i) const { return inputs_[which][i]; } + + std::vector* inputs(int which) { return &inputs_[which]; } + + // Maximum size of files to build during this compaction. + uint64_t MaxOutputFileSize() const { return max_output_file_size_; } + + // Whether compression will be enabled for compaction outputs + bool enable_compression() const { return enable_compression_; } + + // Is this a trivial compaction that can be implemented by just + // moving a single input file to the next level (no merging or splitting) + bool IsTrivialMove() const; + + // If true, just delete all files in inputs_[0] + bool IsDeletionCompaction() const; + + // Add all inputs to this compaction as delete operations to *edit. + void AddInputDeletions(VersionEdit* edit); + + // Returns true if the information we have available guarantees that + // the compaction is producing data in "level+1" for which no data exists + // in levels greater than "level+1". + bool IsBaseLevelForKey(const Slice& user_key); + + // Returns true iff we should stop building the current output + // before processing "internal_key". + bool ShouldStopBefore(const Slice& internal_key); + + // Release the input version for the compaction, once the compaction + // is successful. + void ReleaseInputs(); + + // Clear all files to indicate that they are not being compacted + // Delete this compaction from the list of running compactions. + void ReleaseCompactionFiles(Status status); + + void Summary(char* output, int len); + + // Return the score that was used to pick this compaction run. + double score() const { return score_; } + + // Is this compaction creating a file in the bottom most level? + bool BottomMostLevel() { return bottommost_level_; } + + // Does this compaction include all sst files? + bool IsFullCompaction() { return is_full_compaction_; } + + // Was this compaction triggered manually by the client? + bool IsManualCompaction() { return is_manual_compaction_; } + + private: + friend class CompactionPicker; + friend class UniversalCompactionPicker; + friend class FIFOCompactionPicker; + friend class LevelCompactionPicker; + + Compaction(Version* input_version, int level, int out_level, + uint64_t target_file_size, uint64_t max_grandparent_overlap_bytes, + bool seek_compaction = false, bool enable_compression = true, + bool deletion_compaction = false); + + int level_; + int out_level_; // levels to which output files are stored + uint64_t max_output_file_size_; + uint64_t max_grandparent_overlap_bytes_; + Version* input_version_; + VersionEdit* edit_; + int number_levels_; + ColumnFamilyData* cfd_; + + bool seek_compaction_; + bool enable_compression_; + // if true, just delete files in inputs_[0] + bool deletion_compaction_; + + // Each compaction reads inputs from "level_" and "level_+1" + std::vector inputs_[2]; // The two sets of inputs + + // State used to check for number of of overlapping grandparent files + // (parent == level_ + 1, grandparent == level_ + 2) + std::vector grandparents_; + size_t grandparent_index_; // Index in grandparent_starts_ + bool seen_key_; // Some output key has been seen + uint64_t overlapped_bytes_; // Bytes of overlap between current output + // and grandparent files + int base_index_; // index of the file in files_[level_] + int parent_index_; // index of some file with same range in files_[level_+1] + double score_; // score that was used to pick this compaction. + + // Is this compaction creating a file in the bottom most level? 
+  // Is this compaction creating a file in the bottommost level?
+  bool bottommost_level_;
+  // Does this compaction include all sst files?
+  bool is_full_compaction_;
+
+  // Is this compaction requested by the client?
+  bool is_manual_compaction_;
+
+  // level_ptrs_ holds indices into input_version_->levels_: our state
+  // is that we are positioned at one of the file ranges for each
+  // higher level than the ones involved in this compaction (i.e. for
+  // all L >= level_ + 2).
+  std::vector<size_t> level_ptrs_;
+
+  // mark (or clear) all files that are being compacted
+  void MarkFilesBeingCompacted(bool);
+
+  // Initialize whether the compaction is producing files at the
+  // bottommost level
+  void SetupBottomMostLevel(bool isManual);
+
+  // In case of compaction error, reset the nextIndex that is used
+  // to pick up the next file to be compacted from files_by_size_
+  void ResetNextCompactionIndex();
+};
+
+}  // namespace rocksdb
diff --git a/db/compaction_picker.cc b/db/compaction_picker.cc
new file mode 100644
index 0000000000..3416a0bac9
--- /dev/null
+++ b/db/compaction_picker.cc
@@ -0,0 +1,960 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/compaction_picker.h"
+
+#define __STDC_FORMAT_MACROS
+#include <inttypes.h>
+#include <limits>
+#include "util/log_buffer.h"
+#include "util/statistics.h"
+
+namespace rocksdb {
+
+namespace {
+
+uint64_t TotalFileSize(const std::vector<FileMetaData*>& files) {
+  uint64_t sum = 0;
+  for (size_t i = 0; i < files.size() && files[i]; i++) {
+    sum += files[i]->file_size;
+  }
+  return sum;
+}
+
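InputSummary() and Summary() in compaction.cc above rely on snprintf's return convention to append safely into a fixed buffer: a negative result signals an error, a result at least as large as the remaining space signals truncation, and either one stops the appending. The same write-cursor pattern as a self-contained toy (AppendF is a hypothetical helper, not part of this patch):

#include <cstdarg>
#include <cstdio>

// Append formatted text at *cursor inside a buffer of len bytes.
// vsnprintf returns a negative value on error and the would-be length
// on truncation (>= remaining); both end the appending.
static bool AppendF(char* buf, int len, int* cursor, const char* fmt, ...) {
  int remaining = len - *cursor;
  if (remaining <= 0) {
    return false;
  }
  va_list ap;
  va_start(ap, fmt);
  int ret = std::vsnprintf(buf + *cursor, remaining, fmt, ap);
  va_end(ap);
  if (ret < 0 || ret >= remaining) {
    return false;  // error or truncated: stop, like InputSummary does
  }
  *cursor += ret;
  return true;
}

int main() {
  const int kLen = 32;
  char buf[kLen];
  int cursor = 0;
  bool ok = AppendF(buf, kLen, &cursor, "inputs: [");
  for (int file = 1; ok && file <= 3; file++) {
    ok = AppendF(buf, kLen, &cursor, "%d(4MB) ", file);
  }
  AppendF(buf, kLen, &cursor, "]");
  std::printf("%s\n", buf);  // inputs: [1(4MB) 2(4MB) 3(4MB) ]
  return 0;
}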
+// Multiply two operands. If they overflow, return op1.
+uint64_t MultiplyCheckOverflow(uint64_t op1, int op2) {
+  if (op1 == 0) {
+    return 0;
+  }
+  if (op2 <= 0) {
+    return op1;
+  }
+  uint64_t casted_op2 = (uint64_t) op2;
+  if (std::numeric_limits<uint64_t>::max() / op1 < casted_op2) {
+    return op1;
+  }
+  return op1 * casted_op2;
+}
+
+}  // anonymous namespace
+
+CompactionPicker::CompactionPicker(const Options* options,
+                                   const InternalKeyComparator* icmp)
+    : compactions_in_progress_(options->num_levels),
+      options_(options),
+      num_levels_(options->num_levels),
+      icmp_(icmp) {
+  max_file_size_.reset(new uint64_t[NumberLevels()]);
+  level_max_bytes_.reset(new uint64_t[NumberLevels()]);
+  int target_file_size_multiplier = options_->target_file_size_multiplier;
+  int max_bytes_multiplier = options_->max_bytes_for_level_multiplier;
+  for (int i = 0; i < NumberLevels(); i++) {
+    if (i == 0 && options_->compaction_style == kCompactionStyleUniversal) {
+      max_file_size_[i] = ULLONG_MAX;
+      level_max_bytes_[i] = options_->max_bytes_for_level_base;
+    } else if (i > 1) {
+      max_file_size_[i] = MultiplyCheckOverflow(max_file_size_[i - 1],
+                                                target_file_size_multiplier);
+      level_max_bytes_[i] = MultiplyCheckOverflow(
+          MultiplyCheckOverflow(level_max_bytes_[i - 1], max_bytes_multiplier),
+          options_->max_bytes_for_level_multiplier_additional[i - 1]);
+    } else {
+      max_file_size_[i] = options_->target_file_size_base;
+      level_max_bytes_[i] = options_->max_bytes_for_level_base;
+    }
+  }
+}
+
+CompactionPicker::~CompactionPicker() {}
+
+void CompactionPicker::SizeBeingCompacted(std::vector<uint64_t>& sizes) {
+  for (int level = 0; level < NumberLevels() - 1; level++) {
+    uint64_t total = 0;
+    for (auto c : compactions_in_progress_[level]) {
+      assert(c->level() == level);
+      for (int i = 0; i < c->num_input_files(0); i++) {
+        total += c->input(0, i)->file_size;
+      }
+    }
+    sizes[level] = total;
+  }
+}
+
+// Clear all files to indicate that they are not being compacted.
+// Delete this compaction from the list of running compactions.
+void CompactionPicker::ReleaseCompactionFiles(Compaction* c, Status status) {
+  c->MarkFilesBeingCompacted(false);
+  compactions_in_progress_[c->level()].erase(c);
+  if (!status.ok()) {
+    c->ResetNextCompactionIndex();
+  }
+}
+
+uint64_t CompactionPicker::MaxFileSizeForLevel(int level) const {
+  assert(level >= 0);
+  assert(level < NumberLevels());
+  return max_file_size_[level];
+}
+
+uint64_t CompactionPicker::MaxGrandParentOverlapBytes(int level) {
+  uint64_t result = MaxFileSizeForLevel(level);
+  result *= options_->max_grandparent_overlap_factor;
+  return result;
+}
+
+double CompactionPicker::MaxBytesForLevel(int level) {
+  // Note: the result for level zero is not really used since we set
+  // the level-0 compaction threshold based on number of files.
+  assert(level >= 0);
+  assert(level < NumberLevels());
+  return level_max_bytes_[level];
+}
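To make the sizing loop above concrete: with the defaults db_bench uses later in this patch (target_file_size_base = 2 * 1048576, target_file_size_multiplier = 1, max_bytes_for_level_base = 10 * 1048576, max_bytes_for_level_multiplier = 10), levels 0 and 1 get a 10MB budget and every deeper level multiplies it by ten, while the 2MB file target stays flat. A standalone re-run of the same arithmetic (the per-level additional fanout is omitted here, since it defaults to 1):

#define __STDC_FORMAT_MACROS
#include <cinttypes>
#include <cstdint>
#include <cstdio>
#include <limits>

// Same saturating multiply as above.
static uint64_t MultiplyCheckOverflow(uint64_t op1, int op2) {
  if (op1 == 0) return 0;
  if (op2 <= 0) return op1;
  uint64_t casted_op2 = (uint64_t)op2;
  if (std::numeric_limits<uint64_t>::max() / op1 < casted_op2) return op1;
  return op1 * casted_op2;
}

int main() {
  const int kNumLevels = 7;
  const uint64_t target_file_size_base = 2 * 1048576;
  const uint64_t max_bytes_for_level_base = 10 * 1048576;
  const int target_file_size_multiplier = 1;
  const int max_bytes_multiplier = 10;
  uint64_t max_file_size[kNumLevels];
  uint64_t level_max_bytes[kNumLevels];
  for (int i = 0; i < kNumLevels; i++) {
    if (i > 1) {
      max_file_size[i] = MultiplyCheckOverflow(max_file_size[i - 1],
                                               target_file_size_multiplier);
      level_max_bytes[i] =
          MultiplyCheckOverflow(level_max_bytes[i - 1], max_bytes_multiplier);
    } else {
      max_file_size[i] = target_file_size_base;
      level_max_bytes[i] = max_bytes_for_level_base;
    }
    // Prints 2MB file targets throughout; budgets 10, 10, 100, 1000,
    // 10000, 100000, 1000000 MB for L0..L6.
    std::printf("L%d: file target %" PRIu64 " MB, level budget %" PRIu64
                " MB\n", i, max_file_size[i] >> 20, level_max_bytes[i] >> 20);
  }
  return 0;
}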
+
+void CompactionPicker::GetRange(const std::vector<FileMetaData*>& inputs,
+                                InternalKey* smallest, InternalKey* largest) {
+  assert(!inputs.empty());
+  smallest->Clear();
+  largest->Clear();
+  for (size_t i = 0; i < inputs.size(); i++) {
+    FileMetaData* f = inputs[i];
+    if (i == 0) {
+      *smallest = f->smallest;
+      *largest = f->largest;
+    } else {
+      if (icmp_->Compare(f->smallest, *smallest) < 0) {
+        *smallest = f->smallest;
+      }
+      if (icmp_->Compare(f->largest, *largest) > 0) {
+        *largest = f->largest;
+      }
+    }
+  }
+}
+
+void CompactionPicker::GetRange(const std::vector<FileMetaData*>& inputs1,
+                                const std::vector<FileMetaData*>& inputs2,
+                                InternalKey* smallest, InternalKey* largest) {
+  std::vector<FileMetaData*> all = inputs1;
+  all.insert(all.end(), inputs2.begin(), inputs2.end());
+  GetRange(all, smallest, largest);
+}
+
+bool CompactionPicker::ExpandWhileOverlapping(Compaction* c) {
+  // If inputs are empty then there is nothing to expand.
+  if (!c || c->inputs_[0].empty()) {
+    return true;
+  }
+
+  // GetOverlappingInputs will always do the right thing for level-0.
+  // So we don't need to do any expansion if level == 0.
+  if (c->level() == 0) {
+    return true;
+  }
+
+  const int level = c->level();
+  InternalKey smallest, largest;
+
+  // Keep expanding c->inputs_[0] until we are sure that there is a
+  // "clean cut" boundary between the files in input and the surrounding files.
+  // This will ensure that no parts of a key are lost during compaction.
+  int hint_index = -1;
+  size_t old_size;
+  do {
+    old_size = c->inputs_[0].size();
+    GetRange(c->inputs_[0], &smallest, &largest);
+    c->inputs_[0].clear();
+    c->input_version_->GetOverlappingInputs(
+        level, &smallest, &largest, &c->inputs_[0], hint_index, &hint_index);
+  } while (c->inputs_[0].size() > old_size);
+
+  // Get the new range
+  GetRange(c->inputs_[0], &smallest, &largest);
+
+  // If, after the expansion, there are files that are already under
+  // compaction, then we must drop/cancel this compaction.
+  int parent_index = -1;
+  if (c->inputs_[0].empty()) {
+    Log(options_->info_log,
+        "[%s] ExpandWhileOverlapping() failure because zero input files",
+        c->column_family_data()->GetName().c_str());
+  }
+  if (c->inputs_[0].empty() || FilesInCompaction(c->inputs_[0]) ||
+      (c->level() != c->output_level() &&
+       ParentRangeInCompaction(c->input_version_, &smallest, &largest, level,
+                               &parent_index))) {
+    c->inputs_[0].clear();
+    c->inputs_[1].clear();
+    return false;
+  }
+  return true;
+}
+
+uint64_t CompactionPicker::ExpandedCompactionByteSizeLimit(int level) {
+  uint64_t result = MaxFileSizeForLevel(level);
+  result *= options_->expanded_compaction_factor;
+  return result;
+}
+
+// Returns true if any one of specified files are being compacted
+bool CompactionPicker::FilesInCompaction(std::vector<FileMetaData*>& files) {
+  for (unsigned int i = 0; i < files.size(); i++) {
+    if (files[i]->being_compacted) {
+      return true;
+    }
+  }
+  return false;
+}
+
+// Returns true if any one of the parent files are being compacted
+bool CompactionPicker::ParentRangeInCompaction(Version* version,
+                                               const InternalKey* smallest,
+                                               const InternalKey* largest,
+                                               int level, int* parent_index) {
+  std::vector<FileMetaData*> inputs;
+  assert(level + 1 < NumberLevels());
+
+  version->GetOverlappingInputs(level + 1, smallest, largest, &inputs,
+                                *parent_index, parent_index);
+  return FilesInCompaction(inputs);
+}
+
+// Populates the set of inputs from "level+1" that overlap with "level".
+// Will also attempt to expand "level" if that doesn't expand "level+1"
+// or cause "level" to include a file for compaction that has an overlapping
+// user-key with another file.
+void CompactionPicker::SetupOtherInputs(Compaction* c) {
+  // If inputs are empty, then there is nothing to expand.
+  // If both input and output levels are the same, no need to consider
+  // files at level "level+1"
+  if (c->inputs_[0].empty() || c->level() == c->output_level()) {
+    return;
+  }
+
+  const int level = c->level();
+  InternalKey smallest, largest;
+
+  // Get the range one last time.
+  GetRange(c->inputs_[0], &smallest, &largest);
+
+  // Populate the set of next-level files (inputs_[1]) to include in compaction
+  c->input_version_->GetOverlappingInputs(level + 1, &smallest, &largest,
+                                          &c->inputs_[1], c->parent_index_,
+                                          &c->parent_index_);
+
+  // Get entire range covered by compaction
+  InternalKey all_start, all_limit;
+  GetRange(c->inputs_[0], c->inputs_[1], &all_start, &all_limit);
+
+  // See if we can further grow the number of inputs in "level" without
+  // changing the number of "level+1" files we pick up. We also choose NOT
+  // to expand if this would cause "level" to include some entries for some
+  // user key, while excluding other entries for the same user key. This
+  // can happen when one user key spans multiple files.
+  if (!c->inputs_[1].empty()) {
+    std::vector<FileMetaData*> expanded0;
+    c->input_version_->GetOverlappingInputs(
+        level, &all_start, &all_limit, &expanded0, c->base_index_, nullptr);
+    const uint64_t inputs0_size = TotalFileSize(c->inputs_[0]);
+    const uint64_t inputs1_size = TotalFileSize(c->inputs_[1]);
+    const uint64_t expanded0_size = TotalFileSize(expanded0);
+    uint64_t limit = ExpandedCompactionByteSizeLimit(level);
+    if (expanded0.size() > c->inputs_[0].size() &&
+        inputs1_size + expanded0_size < limit &&
+        !FilesInCompaction(expanded0) &&
+        !c->input_version_->HasOverlappingUserKey(&expanded0, level)) {
+      InternalKey new_start, new_limit;
+      GetRange(expanded0, &new_start, &new_limit);
+      std::vector<FileMetaData*> expanded1;
+      c->input_version_->GetOverlappingInputs(level + 1, &new_start, &new_limit,
+                                              &expanded1, c->parent_index_,
+                                              &c->parent_index_);
+      if (expanded1.size() == c->inputs_[1].size() &&
+          !FilesInCompaction(expanded1)) {
+        Log(options_->info_log,
+            "[%s] Expanding@%lu %lu+%lu (%lu+%lu bytes) to %lu+%lu (%lu+%lu "
+            "bytes)\n",
+            c->column_family_data()->GetName().c_str(), (unsigned long)level,
+            (unsigned long)(c->inputs_[0].size()),
+            (unsigned long)(c->inputs_[1].size()), (unsigned long)inputs0_size,
+            (unsigned long)inputs1_size, (unsigned long)(expanded0.size()),
+            (unsigned long)(expanded1.size()), (unsigned long)expanded0_size,
+            (unsigned long)inputs1_size);
+        smallest = new_start;
+        largest = new_limit;
+        c->inputs_[0] = expanded0;
+        c->inputs_[1] = expanded1;
+        GetRange(c->inputs_[0], c->inputs_[1], &all_start, &all_limit);
+      }
+    }
+  }
+
+  // Compute the set of grandparent files that overlap this compaction
+  // (parent == level+1; grandparent == level+2)
+  if (level + 2 < NumberLevels()) {
+    c->input_version_->GetOverlappingInputs(level + 2, &all_start, &all_limit,
+                                            &c->grandparents_);
+  }
+}
+
+
+Compaction* CompactionPicker::CompactRange(Version* version, int input_level,
+                                           int output_level,
+                                           const InternalKey* begin,
+                                           const InternalKey* end,
+                                           InternalKey** compaction_end) {
+  // CompactionPickerFIFO has its own implementation of compact range
+  assert(options_->compaction_style != kCompactionStyleFIFO);
+
+  std::vector<FileMetaData*> inputs;
+  bool
covering_the_whole_range = true; + + // All files are 'overlapping' in universal style compaction. + // We have to compact the entire range in one shot. + if (options_->compaction_style == kCompactionStyleUniversal) { + begin = nullptr; + end = nullptr; + } + version->GetOverlappingInputs(input_level, begin, end, &inputs); + if (inputs.empty()) { + return nullptr; + } + + // Avoid compacting too much in one shot in case the range is large. + // But we cannot do this for level-0 since level-0 files can overlap + // and we must not pick one file and drop another older file if the + // two files overlap. + if (input_level > 0) { + const uint64_t limit = + MaxFileSizeForLevel(input_level) * options_->source_compaction_factor; + uint64_t total = 0; + for (size_t i = 0; i + 1 < inputs.size(); ++i) { + uint64_t s = inputs[i]->file_size; + total += s; + if (total >= limit) { + **compaction_end = inputs[i + 1]->smallest; + covering_the_whole_range = false; + inputs.resize(i + 1); + break; + } + } + } + Compaction* c = new Compaction(version, input_level, output_level, + MaxFileSizeForLevel(output_level), + MaxGrandParentOverlapBytes(input_level)); + + c->inputs_[0] = inputs; + if (ExpandWhileOverlapping(c) == false) { + delete c; + Log(options_->info_log, + "[%s] Could not compact due to expansion failure.\n", + version->cfd_->GetName().c_str()); + return nullptr; + } + + SetupOtherInputs(c); + + if (covering_the_whole_range) { + *compaction_end = nullptr; + } + + // These files that are to be manaully compacted do not trample + // upon other files because manual compactions are processed when + // the system has a max of 1 background compaction thread. + c->MarkFilesBeingCompacted(true); + + // Is this compaction creating a file at the bottommost level + c->SetupBottomMostLevel(true); + + c->is_manual_compaction_ = true; + + return c; +} + +Compaction* LevelCompactionPicker::PickCompaction(Version* version, + LogBuffer* log_buffer) { + Compaction* c = nullptr; + int level = -1; + + // Compute the compactions needed. It is better to do it here + // and also in LogAndApply(), otherwise the values could be stale. + std::vector size_being_compacted(NumberLevels() - 1); + SizeBeingCompacted(size_being_compacted); + version->ComputeCompactionScore(size_being_compacted); + + // We prefer compactions triggered by too much data in a level over + // the compactions triggered by seeks. + // + // Find the compactions by size on all levels. + for (int i = 0; i < NumberLevels() - 1; i++) { + assert(i == 0 || + version->compaction_score_[i] <= version->compaction_score_[i - 1]); + level = version->compaction_level_[i]; + if ((version->compaction_score_[i] >= 1)) { + c = PickCompactionBySize(version, level, version->compaction_score_[i]); + if (ExpandWhileOverlapping(c) == false) { + delete c; + c = nullptr; + } else { + break; + } + } + } + + // Find compactions needed by seeks + FileMetaData* f = version->file_to_compact_; + if (c == nullptr && f != nullptr && !f->being_compacted) { + + level = version->file_to_compact_level_; + int parent_index = -1; + + // Only allow one level 0 compaction at a time. + // Do not pick this file if its parents at level+1 are being compacted. 
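Note the contract CompactRange() establishes above for *compaction_end: when the range is truncated it receives the smallest key of the first file left out, and when the whole range was covered it is set to nullptr. A caller is therefore expected to loop, feeding the previous end back in as the next begin. A hedged sketch of such a driver; RunCompaction-style execution and the Version handling are stand-ins for DBImpl machinery that is not shown in this patch:

// Drive CompactRange() to completion over [begin, end] (sketch).
void CompactRangeToCompletion(CompactionPicker* picker, Version* v,
                              int input_level, int output_level,
                              const InternalKey* begin,
                              const InternalKey* end) {
  InternalKey resume;                     // storage the callee may fill in
  InternalKey* compaction_end = &resume;  // must point at valid storage
  for (;;) {
    Compaction* c = picker->CompactRange(v, input_level, output_level, begin,
                                         end, &compaction_end);
    if (c == nullptr) {
      break;  // nothing (left) that overlaps the range
    }
    // ... run the compaction job and install its results (not shown) ...
    c->ReleaseCompactionFiles(Status::OK());
    c->ReleaseInputs();
    delete c;
    if (compaction_end == nullptr) {
      break;  // the whole remaining range was covered in one shot
    }
    begin = compaction_end;  // resume where the last shot stopped
  }
}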
+ if (level != 0 || compactions_in_progress_[0].empty()) { + if (!ParentRangeInCompaction(version, &f->smallest, &f->largest, level, + &parent_index)) { + c = new Compaction(version, level, level + 1, + MaxFileSizeForLevel(level + 1), + MaxGrandParentOverlapBytes(level), true); + c->inputs_[0].push_back(f); + c->parent_index_ = parent_index; + c->input_version_->file_to_compact_ = nullptr; + if (ExpandWhileOverlapping(c) == false) { + return nullptr; + } + } + } + } + + if (c == nullptr) { + return nullptr; + } + + // Two level 0 compaction won't run at the same time, so don't need to worry + // about files on level 0 being compacted. + if (level == 0) { + assert(compactions_in_progress_[0].empty()); + InternalKey smallest, largest; + GetRange(c->inputs_[0], &smallest, &largest); + // Note that the next call will discard the file we placed in + // c->inputs_[0] earlier and replace it with an overlapping set + // which will include the picked file. + c->inputs_[0].clear(); + c->input_version_->GetOverlappingInputs(0, &smallest, &largest, + &c->inputs_[0]); + + // If we include more L0 files in the same compaction run it can + // cause the 'smallest' and 'largest' key to get extended to a + // larger range. So, re-invoke GetRange to get the new key range + GetRange(c->inputs_[0], &smallest, &largest); + if (ParentRangeInCompaction(c->input_version_, &smallest, &largest, level, + &c->parent_index_)) { + delete c; + return nullptr; + } + assert(!c->inputs_[0].empty()); + } + + // Setup "level+1" files (inputs_[1]) + SetupOtherInputs(c); + + // mark all the files that are being compacted + c->MarkFilesBeingCompacted(true); + + // Is this compaction creating a file at the bottommost level + c->SetupBottomMostLevel(false); + + // remember this currently undergoing compaction + compactions_in_progress_[level].insert(c); + + return c; +} + +Compaction* LevelCompactionPicker::PickCompactionBySize(Version* version, + int level, + double score) { + Compaction* c = nullptr; + + // level 0 files are overlapping. So we cannot pick more + // than one concurrent compactions at this level. This + // could be made better by looking at key-ranges that are + // being compacted at level 0. + if (level == 0 && compactions_in_progress_[level].size() == 1) { + return nullptr; + } + + assert(level >= 0); + assert(level + 1 < NumberLevels()); + c = new Compaction(version, level, level + 1, MaxFileSizeForLevel(level + 1), + MaxGrandParentOverlapBytes(level)); + c->score_ = score; + + // Pick the largest file in this level that is not already + // being compacted + std::vector& file_size = c->input_version_->files_by_size_[level]; + + // record the first file that is not yet compacted + int nextIndex = -1; + + for (unsigned int i = c->input_version_->next_file_to_compact_by_size_[level]; + i < file_size.size(); i++) { + int index = file_size[i]; + FileMetaData* f = c->input_version_->files_[level][index]; + + // check to verify files are arranged in descending size + assert((i == file_size.size() - 1) || + (i >= Version::number_of_files_to_sort_ - 1) || + (f->file_size >= + c->input_version_->files_[level][file_size[i + 1]]->file_size)); + + // do not pick a file to compact if it is being compacted + // from n-1 level. + if (f->being_compacted) { + continue; + } + + // remember the startIndex for the next call to PickCompaction + if (nextIndex == -1) { + nextIndex = i; + } + + // Do not pick this file if its parents at level+1 are being compacted. 
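Both the level-0 gathering above and ExpandWhileOverlapping() earlier in this file rely on the same fixpoint idea: widen the key range, re-query for overlapping files, and repeat until the picked set stops growing, so no user key is split between picked and unpicked files. The shape of that loop restated over plain integer ranges (a standalone toy, not the real InternalKey machinery; real level files would not normally overlap like this, the toy only exercises the loop):

#include <algorithm>
#include <cstdio>
#include <vector>

struct Range { int lo, hi; };

// All files in `level` whose closed range intersects [lo, hi].
static std::vector<Range> Overlapping(const std::vector<Range>& level,
                                      int lo, int hi) {
  std::vector<Range> out;
  for (const Range& r : level) {
    if (r.hi >= lo && r.lo <= hi) out.push_back(r);
  }
  return out;
}

int main() {
  std::vector<Range> level = {{0, 9}, {10, 19}, {15, 29}, {40, 49}};
  std::vector<Range> picked = {{10, 19}};
  size_t old_size;
  do {
    old_size = picked.size();
    int lo = picked.front().lo, hi = picked.front().hi;
    for (const Range& r : picked) {  // current covered span
      lo = std::min(lo, r.lo);
      hi = std::max(hi, r.hi);
    }
    picked = Overlapping(level, lo, hi);  // may pull in more files
  } while (picked.size() > old_size);
  // Converges to [10,19] and [15,29]: a clean cut against [0,9]/[40,49].
  for (const Range& r : picked) std::printf("[%d,%d] ", r.lo, r.hi);
  std::printf("\n");
  return 0;
}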
+ // Maybe we can avoid redoing this work in SetupOtherInputs + int parent_index = -1; + if (ParentRangeInCompaction(c->input_version_, &f->smallest, &f->largest, + level, &parent_index)) { + continue; + } + c->inputs_[0].push_back(f); + c->base_index_ = index; + c->parent_index_ = parent_index; + break; + } + + if (c->inputs_[0].empty()) { + delete c; + c = nullptr; + } + + // store where to start the iteration in the next call to PickCompaction + version->next_file_to_compact_by_size_[level] = nextIndex; + + return c; +} + +// Universal style of compaction. Pick files that are contiguous in +// time-range to compact. +// +Compaction* UniversalCompactionPicker::PickCompaction(Version* version, + LogBuffer* log_buffer) { + int level = 0; + double score = version->compaction_score_[0]; + + if ((version->files_[level].size() < + (unsigned int)options_->level0_file_num_compaction_trigger)) { + LogToBuffer(log_buffer, "[%s] Universal: nothing to do\n", + version->cfd_->GetName().c_str()); + return nullptr; + } + Version::FileSummaryStorage tmp; + LogToBuffer(log_buffer, "[%s] Universal: candidate files(%zu): %s\n", + version->cfd_->GetName().c_str(), version->files_[level].size(), + version->LevelFileSummary(&tmp, 0)); + + // Check for size amplification first. + Compaction* c; + if ((c = PickCompactionUniversalSizeAmp(version, score, log_buffer)) != + nullptr) { + LogToBuffer(log_buffer, "[%s] Universal: compacting for size amp\n", + version->cfd_->GetName().c_str()); + } else { + // Size amplification is within limits. Try reducing read + // amplification while maintaining file size ratios. + unsigned int ratio = options_->compaction_options_universal.size_ratio; + + if ((c = PickCompactionUniversalReadAmp(version, score, ratio, UINT_MAX, + log_buffer)) != nullptr) { + LogToBuffer(log_buffer, "[%s] Universal: compacting for size ratio\n", + version->cfd_->GetName().c_str()); + } else { + // Size amplification and file size ratios are within configured limits. + // If max read amplification is exceeding configured limits, then force + // compaction without looking at filesize ratios and try to reduce + // the number of files to fewer than level0_file_num_compaction_trigger. + unsigned int num_files = version->files_[level].size() - + options_->level0_file_num_compaction_trigger; + if ((c = PickCompactionUniversalReadAmp( + version, score, UINT_MAX, num_files, log_buffer)) != nullptr) { + LogToBuffer(log_buffer, "[%s] Universal: compacting for file num\n", + version->cfd_->GetName().c_str()); + } + } + } + if (c == nullptr) { + return nullptr; + } + assert(c->inputs_[0].size() > 1); + + // validate that all the chosen files are non overlapping in time + FileMetaData* newerfile __attribute__((unused)) = nullptr; + for (unsigned int i = 0; i < c->inputs_[0].size(); i++) { + FileMetaData* f = c->inputs_[0][i]; + assert (f->smallest_seqno <= f->largest_seqno); + assert(newerfile == nullptr || + newerfile->smallest_seqno > f->largest_seqno); + newerfile = f; + } + + // The files are sorted from newest first to oldest last. + std::vector& file_by_time = c->input_version_->files_by_size_[level]; + + // Is the earliest file part of this compaction? 
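PickCompactionBySize() above scans files_by_size_, a per-level vector of file indices sorted largest-first, skips files already being compacted, and saves the first viable position in next_file_to_compact_by_size_ so the next pick resumes there instead of rescanning from the top. The cursor in miniature (standalone toy):

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <numeric>
#include <vector>

struct File { uint64_t size; bool being_compacted; };

int main() {
  std::vector<File> files = {{50, false}, {90, true}, {20, false}, {70, false}};
  // by_size holds indices into `files`, largest first (files_by_size_).
  std::vector<int> by_size(files.size());
  std::iota(by_size.begin(), by_size.end(), 0);
  std::sort(by_size.begin(), by_size.end(),
            [&](int a, int b) { return files[a].size > files[b].size; });

  int cursor = 0;  // plays next_file_to_compact_by_size_, kept across picks
  int picked = -1;
  int next_cursor = -1;
  for (int i = cursor; i < (int)by_size.size(); i++) {
    const File& f = files[by_size[i]];
    if (f.being_compacted) continue;         // skip in-flight files
    if (next_cursor == -1) next_cursor = i;  // resume point for next pick
    picked = by_size[i];
    break;
  }
  if (picked < 0) return 0;  // everything is already being compacted
  std::printf("picked file %d (size %llu), resume at rank %d\n", picked,
              (unsigned long long)files[picked].size, next_cursor);
  return 0;
}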
+ int last_index = file_by_time[file_by_time.size()-1]; + FileMetaData* last_file = c->input_version_->files_[level][last_index]; + if (c->inputs_[0][c->inputs_[0].size()-1] == last_file) { + c->bottommost_level_ = true; + } + + // update statistics + MeasureTime(options_->statistics.get(), NUM_FILES_IN_SINGLE_COMPACTION, + c->inputs_[0].size()); + + // mark all the files that are being compacted + c->MarkFilesBeingCompacted(true); + + // remember this currently undergoing compaction + compactions_in_progress_[level].insert(c); + + // Record whether this compaction includes all sst files. + // For now, it is only relevant in universal compaction mode. + c->is_full_compaction_ = + (c->inputs_[0].size() == c->input_version_->files_[0].size()); + + return c; +} + +// +// Consider compaction files based on their size differences with +// the next file in time order. +// +Compaction* UniversalCompactionPicker::PickCompactionUniversalReadAmp( + Version* version, double score, unsigned int ratio, + unsigned int max_number_of_files_to_compact, LogBuffer* log_buffer) { + int level = 0; + + unsigned int min_merge_width = + options_->compaction_options_universal.min_merge_width; + unsigned int max_merge_width = + options_->compaction_options_universal.max_merge_width; + + // The files are sorted from newest first to oldest last. + std::vector& file_by_time = version->files_by_size_[level]; + FileMetaData* f = nullptr; + bool done = false; + int start_index = 0; + unsigned int candidate_count = 0; + assert(file_by_time.size() == version->files_[level].size()); + + unsigned int max_files_to_compact = std::min(max_merge_width, + max_number_of_files_to_compact); + min_merge_width = std::max(min_merge_width, 2U); + + // Considers a candidate file only if it is smaller than the + // total size accumulated so far. + for (unsigned int loop = 0; loop < file_by_time.size(); loop++) { + + candidate_count = 0; + + // Skip files that are already being compacted + for (f = nullptr; loop < file_by_time.size(); loop++) { + int index = file_by_time[loop]; + f = version->files_[level][index]; + + if (!f->being_compacted) { + candidate_count = 1; + break; + } + LogToBuffer( + log_buffer, "[%s] Universal: file %lu[%d] being compacted, skipping", + version->cfd_->GetName().c_str(), (unsigned long)f->number, loop); + f = nullptr; + } + + // This file is not being compacted. Consider it as the + // first candidate to be compacted. + uint64_t candidate_size = f != nullptr? f->file_size : 0; + if (f != nullptr) { + LogToBuffer( + log_buffer, "[%s] Universal: Possible candidate file %lu[%d].", + version->cfd_->GetName().c_str(), (unsigned long)f->number, loop); + } + + // Check if the suceeding files need compaction. + for (unsigned int i = loop+1; + candidate_count < max_files_to_compact && i < file_by_time.size(); + i++) { + int index = file_by_time[i]; + FileMetaData* f = version->files_[level][index]; + if (f->being_compacted) { + break; + } + // Pick files if the total/last candidate file size (increased by the + // specified ratio) is still larger than the next candidate file. + // candidate_size is the total size of files picked so far with the + // default kCompactionStopStyleTotalSize; with + // kCompactionStopStyleSimilarSize, it's simply the size of the last + // picked file. 
+ uint64_t sz = (candidate_size * (100L + ratio)) /100; + if (sz < f->file_size) { + break; + } + if (options_->compaction_options_universal.stop_style == kCompactionStopStyleSimilarSize) { + // Similar-size stopping rule: also check the last picked file isn't + // far larger than the next candidate file. + sz = (f->file_size * (100L + ratio)) / 100; + if (sz < candidate_size) { + // If the small file we've encountered begins a run of similar-size + // files, we'll pick them up on a future iteration of the outer + // loop. If it's some lonely straggler, it'll eventually get picked + // by the last-resort read amp strategy which disregards size ratios. + break; + } + candidate_size = f->file_size; + } else { // default kCompactionStopStyleTotalSize + candidate_size += f->file_size; + } + candidate_count++; + } + + // Found a series of consecutive files that need compaction. + if (candidate_count >= (unsigned int)min_merge_width) { + start_index = loop; + done = true; + break; + } else { + for (unsigned int i = loop; + i < loop + candidate_count && i < file_by_time.size(); i++) { + int index = file_by_time[i]; + FileMetaData* f = version->files_[level][index]; + LogToBuffer(log_buffer, + "[%s] Universal: Skipping file %lu[%d] with size %lu %d\n", + version->cfd_->GetName().c_str(), (unsigned long)f->number, + i, (unsigned long)f->file_size, f->being_compacted); + } + } + } + if (!done || candidate_count <= 1) { + return nullptr; + } + unsigned int first_index_after = start_index + candidate_count; + // Compression is enabled if files compacted earlier already reached + // size ratio of compression. + bool enable_compression = true; + int ratio_to_compress = + options_->compaction_options_universal.compression_size_percent; + if (ratio_to_compress >= 0) { + uint64_t total_size = version->NumLevelBytes(level); + uint64_t older_file_size = 0; + for (unsigned int i = file_by_time.size() - 1; i >= first_index_after; + i--) { + older_file_size += version->files_[level][file_by_time[i]]->file_size; + if (older_file_size * 100L >= total_size * (long) ratio_to_compress) { + enable_compression = false; + break; + } + } + } + Compaction* c = + new Compaction(version, level, level, MaxFileSizeForLevel(level), + LLONG_MAX, false, enable_compression); + c->score_ = score; + + for (unsigned int i = start_index; i < first_index_after; i++) { + int index = file_by_time[i]; + FileMetaData* f = c->input_version_->files_[level][index]; + c->inputs_[0].push_back(f); + LogToBuffer(log_buffer, + "[%s] Universal: Picking file %lu[%d] with size %lu\n", + version->cfd_->GetName().c_str(), (unsigned long)f->number, i, + (unsigned long)f->file_size); + } + return c; +} + +// Look at overall size amplification. If size amplification +// exceeeds the configured value, then do a compaction +// of the candidate files all the way upto the earliest +// base file (overrides configured values of file-size ratios, +// min_merge_width and max_merge_width). +// +Compaction* UniversalCompactionPicker::PickCompactionUniversalSizeAmp( + Version* version, double score, LogBuffer* log_buffer) { + int level = 0; + + // percentage flexibilty while reducing size amplification + uint64_t ratio = options_->compaction_options_universal. + max_size_amplification_percent; + + // The files are sorted from newest first to oldest last. 
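The joining rule above is easiest to see with numbers. With size_ratio = 1 and newest-to-oldest file sizes of 1, 1, 1, 1, 100 (MB), the run absorbs the four small files (each is covered by the accumulated total bumped by one percent) and stops at the 100MB file, which only gets merged once the newer side has grown comparably large. A standalone re-run of the default kCompactionStopStyleTotalSize arithmetic:

#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  const uint64_t ratio = 1;  // size_ratio, in percent
  std::vector<uint64_t> sizes = {1, 1, 1, 1, 100};  // newest -> oldest, MB
  uint64_t candidate_size = sizes[0];
  size_t count = 1;
  for (size_t i = 1; i < sizes.size(); i++) {
    // The run keeps absorbing the next-older file while the accumulated
    // size (bumped by the ratio) still covers it.
    uint64_t sz = (candidate_size * (100 + ratio)) / 100;
    if (sz < sizes[i]) break;  // next file too large: the run ends here
    candidate_size += sizes[i];
    count++;
  }
  // Prints: run of 4 files, 4 MB total
  std::printf("run of %zu files, %llu MB total\n", count,
              (unsigned long long)candidate_size);
  return 0;
}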
+ std::vector& file_by_time = version->files_by_size_[level]; + assert(file_by_time.size() == version->files_[level].size()); + + unsigned int candidate_count = 0; + uint64_t candidate_size = 0; + unsigned int start_index = 0; + FileMetaData* f = nullptr; + + // Skip files that are already being compacted + for (unsigned int loop = 0; loop < file_by_time.size() - 1; loop++) { + int index = file_by_time[loop]; + f = version->files_[level][index]; + if (!f->being_compacted) { + start_index = loop; // Consider this as the first candidate. + break; + } + LogToBuffer(log_buffer, + "[%s] Universal: skipping file %lu[%d] compacted %s", + version->cfd_->GetName().c_str(), (unsigned long)f->number, + loop, " cannot be a candidate to reduce size amp.\n"); + f = nullptr; + } + if (f == nullptr) { + return nullptr; // no candidate files + } + + LogToBuffer(log_buffer, "[%s] Universal: First candidate file %lu[%d] %s", + version->cfd_->GetName().c_str(), (unsigned long)f->number, + start_index, " to reduce size amp.\n"); + + // keep adding up all the remaining files + for (unsigned int loop = start_index; loop < file_by_time.size() - 1; + loop++) { + int index = file_by_time[loop]; + f = version->files_[level][index]; + if (f->being_compacted) { + LogToBuffer( + log_buffer, "[%s] Universal: Possible candidate file %lu[%d] %s.", + version->cfd_->GetName().c_str(), (unsigned long)f->number, loop, + " is already being compacted. No size amp reduction possible.\n"); + return nullptr; + } + candidate_size += f->file_size; + candidate_count++; + } + if (candidate_count == 0) { + return nullptr; + } + + // size of earliest file + int index = file_by_time[file_by_time.size() - 1]; + uint64_t earliest_file_size = version->files_[level][index]->file_size; + + // size amplification = percentage of additional size + if (candidate_size * 100 < ratio * earliest_file_size) { + LogToBuffer( + log_buffer, + "[%s] Universal: size amp not needed. newer-files-total-size %lu " + "earliest-file-size %lu", + version->cfd_->GetName().c_str(), (unsigned long)candidate_size, + (unsigned long)earliest_file_size); + return nullptr; + } else { + LogToBuffer(log_buffer, + "[%s] Universal: size amp needed. newer-files-total-size %lu " + "earliest-file-size %lu", + version->cfd_->GetName().c_str(), (unsigned long)candidate_size, + (unsigned long)earliest_file_size); + } + assert(start_index >= 0 && start_index < file_by_time.size() - 1); + + // create a compaction request + // We always compact all the files, so always compress. + Compaction* c = + new Compaction(version, level, level, MaxFileSizeForLevel(level), + LLONG_MAX, false, true); + c->score_ = score; + for (unsigned int loop = start_index; loop < file_by_time.size(); loop++) { + int index = file_by_time[loop]; + f = c->input_version_->files_[level][index]; + c->inputs_[0].push_back(f); + LogToBuffer(log_buffer, + "[%s] Universal: size amp picking file %lu[%d] with size %lu", + version->cfd_->GetName().c_str(), (unsigned long)f->number, + index, (unsigned long)f->file_size); + } + return c; +} + +Compaction* FIFOCompactionPicker::PickCompaction(Version* version, + LogBuffer* log_buffer) { + assert(version->NumberLevels() == 1); + uint64_t total_size = 0; + for (const auto& file : version->files_[0]) { + total_size += file->file_size; + } + + if (total_size <= options_->compaction_options_fifo.max_table_files_size || + version->files_[0].size() == 0) { + // total size not exceeded + LogToBuffer(log_buffer, + "[%s] FIFO compaction: nothing to do. 
Total size %" PRIu64 + ", max size %" PRIu64 "\n", + version->cfd_->GetName().c_str(), total_size, + options_->compaction_options_fifo.max_table_files_size); + return nullptr; + } + + if (compactions_in_progress_[0].size() > 0) { + LogToBuffer(log_buffer, + "[%s] FIFO compaction: Already executing compaction. No need " + "to run parallel compactions since compactions are very fast", + version->cfd_->GetName().c_str()); + return nullptr; + } + + Compaction* c = new Compaction(version, 0, 0, 0, 0, false, false, + true /* is deletion compaction */); + // delete old files (FIFO) + for (auto ritr = version->files_[0].rbegin(); + ritr != version->files_[0].rend(); ++ritr) { + auto f = *ritr; + total_size -= f->file_size; + c->inputs_[0].push_back(f); + char tmp_fsize[16]; + AppendHumanBytes(f->file_size, tmp_fsize, sizeof(tmp_fsize)); + LogToBuffer(log_buffer, "[%s] FIFO compaction: picking file %" PRIu64 + " with size %s for deletion", + version->cfd_->GetName().c_str(), f->number, tmp_fsize); + if (total_size <= options_->compaction_options_fifo.max_table_files_size) { + break; + } + } + + c->MarkFilesBeingCompacted(true); + compactions_in_progress_[0].insert(c); + + return c; +} + +Compaction* FIFOCompactionPicker::CompactRange(Version* version, + int input_level, + int output_level, + const InternalKey* begin, + const InternalKey* end, + InternalKey** compaction_end) { + assert(input_level == 0); + assert(output_level == 0); + *compaction_end = nullptr; + LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, options_->info_log.get()); + auto c = PickCompaction(version, &log_buffer); + log_buffer.FlushBufferToLog(); + return c; +} + +} // namespace rocksdb diff --git a/db/compaction_picker.h b/db/compaction_picker.h new file mode 100644 index 0000000000..65b1bc37ac --- /dev/null +++ b/db/compaction_picker.h @@ -0,0 +1,181 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include "db/version_set.h" +#include "db/compaction.h" +#include "rocksdb/status.h" +#include "rocksdb/options.h" +#include "rocksdb/env.h" + +#include +#include +#include + +namespace rocksdb { + +class LogBuffer; +class Compaction; +class Version; + +class CompactionPicker { + public: + CompactionPicker(const Options* options, const InternalKeyComparator* icmp); + virtual ~CompactionPicker(); + + // Pick level and inputs for a new compaction. + // Returns nullptr if there is no compaction to be done. + // Otherwise returns a pointer to a heap-allocated object that + // describes the compaction. Caller should delete the result. + virtual Compaction* PickCompaction(Version* version, + LogBuffer* log_buffer) = 0; + + // Return a compaction object for compacting the range [begin,end] in + // the specified level. Returns nullptr if there is nothing in that + // level that overlaps the specified range. Caller should delete + // the result. + // + // The returned Compaction might not include the whole requested range. + // In that case, compaction_end will be set to the next key that needs + // compacting. 
In case the compaction will compact the whole range, + // compaction_end will be set to nullptr. + // Client is responsible for compaction_end storage -- when called, + // *compaction_end should point to valid InternalKey! + virtual Compaction* CompactRange(Version* version, int input_level, + int output_level, const InternalKey* begin, + const InternalKey* end, + InternalKey** compaction_end); + + // Free up the files that participated in a compaction + void ReleaseCompactionFiles(Compaction* c, Status status); + + // Return the total amount of data that is undergoing + // compactions per level + void SizeBeingCompacted(std::vector& sizes); + + // Returns maximum total overlap bytes with grandparent + // level (i.e., level+2) before we stop building a single + // file in level->level+1 compaction. + uint64_t MaxGrandParentOverlapBytes(int level); + + // Returns maximum total bytes of data on a given level. + double MaxBytesForLevel(int level); + + // Get the max file size in a given level. + uint64_t MaxFileSizeForLevel(int level) const; + + protected: + int NumberLevels() const { return num_levels_; } + + // Stores the minimal range that covers all entries in inputs in + // *smallest, *largest. + // REQUIRES: inputs is not empty + void GetRange(const std::vector& inputs, InternalKey* smallest, + InternalKey* largest); + + // Stores the minimal range that covers all entries in inputs1 and inputs2 + // in *smallest, *largest. + // REQUIRES: inputs is not empty + void GetRange(const std::vector& inputs1, + const std::vector& inputs2, + InternalKey* smallest, InternalKey* largest); + + // Add more files to the inputs on "level" to make sure that + // no newer version of a key is compacted to "level+1" while leaving an older + // version in a "level". Otherwise, any Get() will search "level" first, + // and will likely return an old/stale value for the key, since it always + // searches in increasing order of level to find the value. This could + // also scramble the order of merge operands. This function should be + // called any time a new Compaction is created, and its inputs_[0] are + // populated. + // + // Will return false if it is impossible to apply this compaction. + bool ExpandWhileOverlapping(Compaction* c); + + uint64_t ExpandedCompactionByteSizeLimit(int level); + + // Returns true if any one of the specified files are being compacted + bool FilesInCompaction(std::vector& files); + + // Returns true if any one of the parent files are being compacted + bool ParentRangeInCompaction(Version* version, const InternalKey* smallest, + const InternalKey* largest, int level, + int* index); + + void SetupOtherInputs(Compaction* c); + + // record all the ongoing compactions for all levels + std::vector> compactions_in_progress_; + + // Per-level target file size. 
+  std::unique_ptr<uint64_t[]> max_file_size_;
+
+  // Per-level max bytes
+  std::unique_ptr<uint64_t[]> level_max_bytes_;
+
+  const Options* const options_;
+
+ private:
+  int num_levels_;
+
+  const InternalKeyComparator* const icmp_;
+};
+
+class UniversalCompactionPicker : public CompactionPicker {
+ public:
+  UniversalCompactionPicker(const Options* options,
+                            const InternalKeyComparator* icmp)
+      : CompactionPicker(options, icmp) {}
+  virtual Compaction* PickCompaction(Version* version,
+                                     LogBuffer* log_buffer) override;
+
+ private:
+  // Pick Universal compaction to limit read amplification
+  Compaction* PickCompactionUniversalReadAmp(Version* version, double score,
+                                             unsigned int ratio,
+                                             unsigned int num_files,
+                                             LogBuffer* log_buffer);
+
+  // Pick Universal compaction to limit space amplification.
+  Compaction* PickCompactionUniversalSizeAmp(Version* version, double score,
+                                             LogBuffer* log_buffer);
+};
+
+class LevelCompactionPicker : public CompactionPicker {
+ public:
+  LevelCompactionPicker(const Options* options,
+                        const InternalKeyComparator* icmp)
+      : CompactionPicker(options, icmp) {}
+  virtual Compaction* PickCompaction(Version* version,
+                                     LogBuffer* log_buffer) override;
+
+ private:
+  // For the specified level, pick a compaction.
+  // Returns nullptr if there is no compaction to be done.
+  // If level is 0 and there is already a compaction on that level, this
+  // function will return nullptr.
+  Compaction* PickCompactionBySize(Version* version, int level, double score);
+};
+
+class FIFOCompactionPicker : public CompactionPicker {
+ public:
+  FIFOCompactionPicker(const Options* options,
+                       const InternalKeyComparator* icmp)
+      : CompactionPicker(options, icmp) {}
+
+  virtual Compaction* PickCompaction(Version* version,
+                                     LogBuffer* log_buffer) override;
+
+  virtual Compaction* CompactRange(Version* version, int input_level,
+                                   int output_level, const InternalKey* begin,
+                                   const InternalKey* end,
+                                   InternalKey** compaction_end) override;
+};
+
+}  // namespace rocksdb
diff --git a/db/corruption_test.cc b/db/corruption_test.cc
new file mode 100644
index 0000000000..4726e92b92
--- /dev/null
+++ b/db/corruption_test.cc
@@ -0,0 +1,440 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
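FIFOCompactionPicker, declared last in the header above, implements the policy shown earlier in compaction_picker.cc: once the level's total size exceeds max_table_files_size, whole files are deleted oldest-first until the budget is met again. A standalone re-run of that accounting with concrete numbers:

#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  const uint64_t max_table_files_size = 100;        // budget, in MB
  std::vector<uint64_t> files = {30, 30, 30, 30};   // newest -> oldest, MB
  uint64_t total = 120;                             // sum of the sizes above
  size_t deleted = 0;
  // Walk the files in reverse (oldest first), dropping whole files
  // until the remaining total fits under the budget.
  for (auto it = files.rbegin(); it != files.rend(); ++it) {
    total -= *it;
    deleted++;
    if (total <= max_table_files_size) break;
  }
  // Prints: deleted 1 file(s), 90 MB remain
  std::printf("deleted %zu file(s), %llu MB remain\n", deleted,
              (unsigned long long)total);
  return 0;
}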
+ +#include "rocksdb/db.h" + +#include +#include +#include +#include +#include "rocksdb/cache.h" +#include "rocksdb/env.h" +#include "rocksdb/table.h" +#include "rocksdb/write_batch.h" +#include "db/db_impl.h" +#include "db/filename.h" +#include "db/log_format.h" +#include "db/version_set.h" +#include "util/logging.h" +#include "util/testharness.h" +#include "util/testutil.h" + +namespace rocksdb { + +static const int kValueSize = 1000; + +class CorruptionTest { + public: + test::ErrorEnv env_; + std::string dbname_; + shared_ptr tiny_cache_; + Options options_; + DB* db_; + + CorruptionTest() { + tiny_cache_ = NewLRUCache(100); + options_.env = &env_; + dbname_ = test::TmpDir() + "/corruption_test"; + DestroyDB(dbname_, options_); + + db_ = nullptr; + options_.create_if_missing = true; + options_.block_size_deviation = 0; // make unit test pass for now + Reopen(); + options_.create_if_missing = false; + } + + ~CorruptionTest() { + delete db_; + DestroyDB(dbname_, Options()); + } + + Status TryReopen(Options* options = nullptr) { + delete db_; + db_ = nullptr; + Options opt = (options ? *options : options_); + opt.env = &env_; + opt.block_cache = tiny_cache_; + opt.block_size_deviation = 0; + opt.arena_block_size = 4096; + return DB::Open(opt, dbname_, &db_); + } + + void Reopen(Options* options = nullptr) { + ASSERT_OK(TryReopen(options)); + } + + void RepairDB() { + delete db_; + db_ = nullptr; + ASSERT_OK(::rocksdb::RepairDB(dbname_, options_)); + } + + void Build(int n) { + std::string key_space, value_space; + WriteBatch batch; + for (int i = 0; i < n; i++) { + //if ((i % 100) == 0) fprintf(stderr, "@ %d of %d\n", i, n); + Slice key = Key(i, &key_space); + batch.Clear(); + batch.Put(key, Value(i, &value_space)); + ASSERT_OK(db_->Write(WriteOptions(), &batch)); + } + } + + void Check(int min_expected, int max_expected) { + unsigned int next_expected = 0; + int missed = 0; + int bad_keys = 0; + int bad_values = 0; + int correct = 0; + std::string value_space; + // Do not verify checksums. If we verify checksums then the + // db itself will raise errors because data is corrupted. + // Instead, we want the reads to be successful and this test + // will detect whether the appropriate corruptions have + // occured. 
+ Iterator* iter = db_->NewIterator(ReadOptions(false, true)); + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + uint64_t key; + Slice in(iter->key()); + if (!ConsumeDecimalNumber(&in, &key) || + !in.empty() || + key < next_expected) { + bad_keys++; + continue; + } + missed += (key - next_expected); + next_expected = key + 1; + if (iter->value() != Value(key, &value_space)) { + bad_values++; + } else { + correct++; + } + } + delete iter; + + fprintf(stderr, + "expected=%d..%d; got=%d; bad_keys=%d; bad_values=%d; missed=%d\n", + min_expected, max_expected, correct, bad_keys, bad_values, missed); + ASSERT_LE(min_expected, correct); + ASSERT_GE(max_expected, correct); + } + + void CorruptFile(const std::string fname, int offset, int bytes_to_corrupt) { + struct stat sbuf; + if (stat(fname.c_str(), &sbuf) != 0) { + const char* msg = strerror(errno); + ASSERT_TRUE(false) << fname << ": " << msg; + } + + if (offset < 0) { + // Relative to end of file; make it absolute + if (-offset > sbuf.st_size) { + offset = 0; + } else { + offset = sbuf.st_size + offset; + } + } + if (offset > sbuf.st_size) { + offset = sbuf.st_size; + } + if (offset + bytes_to_corrupt > sbuf.st_size) { + bytes_to_corrupt = sbuf.st_size - offset; + } + + // Do it + std::string contents; + Status s = ReadFileToString(Env::Default(), fname, &contents); + ASSERT_TRUE(s.ok()) << s.ToString(); + for (int i = 0; i < bytes_to_corrupt; i++) { + contents[i + offset] ^= 0x80; + } + s = WriteStringToFile(Env::Default(), contents, fname); + ASSERT_TRUE(s.ok()) << s.ToString(); + } + + void Corrupt(FileType filetype, int offset, int bytes_to_corrupt) { + // Pick file to corrupt + std::vector filenames; + ASSERT_OK(env_.GetChildren(dbname_, &filenames)); + uint64_t number; + FileType type; + std::string fname; + int picked_number = -1; + for (unsigned int i = 0; i < filenames.size(); i++) { + if (ParseFileName(filenames[i], &number, &type) && + type == filetype && + static_cast(number) > picked_number) { // Pick latest file + fname = dbname_ + "/" + filenames[i]; + picked_number = number; + } + } + ASSERT_TRUE(!fname.empty()) << filetype; + + CorruptFile(fname, offset, bytes_to_corrupt); + } + + // corrupts exactly one file at level `level`. if no file found at level, + // asserts + void CorruptTableFileAtLevel(int level, int offset, int bytes_to_corrupt) { + std::vector metadata; + db_->GetLiveFilesMetaData(&metadata); + for (const auto& m : metadata) { + if (m.level == level) { + CorruptFile(dbname_ + "/" + m.name, offset, bytes_to_corrupt); + return; + } + } + ASSERT_TRUE(false) << "no file found at level"; + } + + + int Property(const std::string& name) { + std::string property; + int result; + if (db_->GetProperty(name, &property) && + sscanf(property.c_str(), "%d", &result) == 1) { + return result; + } else { + return -1; + } + } + + // Return the ith key + Slice Key(int i, std::string* storage) { + char buf[100]; + snprintf(buf, sizeof(buf), "%016d", i); + storage->assign(buf, strlen(buf)); + return Slice(*storage); + } + + // Return the value to associate with the specified key + Slice Value(int k, std::string* storage) { + Random r(k); + return test::RandomString(&r, kValueSize, storage); + } +}; + +TEST(CorruptionTest, Recovery) { + Build(100); + Check(100, 100); + Corrupt(kLogFile, 19, 1); // WriteBatch tag for first record + Corrupt(kLogFile, log::kBlockSize + 1000, 1); // Somewhere in second block + Reopen(); + + // The 64 records in the first two log blocks are completely lost. 
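CorruptFile() above clamps the requested window to the file bounds (negative offsets count from the end) and XORs the top bit of every byte in the window before writing the contents back. The same transform as a freestanding helper built on plain C++ streams (hypothetical; the test itself goes through Env's ReadFileToString/WriteStringToFile):

#include <fstream>
#include <string>

// Flip the top bit of n bytes starting at offset (negative offsets
// count from the end of the file). Returns false on I/O failure.
static bool FlipBytes(const std::string& fname, long offset, long n) {
  std::fstream f(fname, std::ios::in | std::ios::out | std::ios::binary);
  if (!f) {
    return false;
  }
  f.seekg(0, std::ios::end);
  long size = static_cast<long>(f.tellg());
  if (offset < 0) {
    offset = (-offset > size) ? 0 : size + offset;  // relative to the end
  }
  if (offset > size) {
    offset = size;
  }
  if (offset + n > size) {
    n = size - offset;  // clamp the window to the file
  }
  if (n <= 0) {
    return true;  // nothing to corrupt
  }
  std::string buf(static_cast<size_t>(n), '\0');
  f.seekg(offset);
  f.read(&buf[0], n);
  for (char& c : buf) {
    c ^= 0x80;  // the same transform the test applies
  }
  f.seekp(offset);
  f.write(buf.data(), n);
  return f.good();
}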
+ Check(36, 36); +} + +TEST(CorruptionTest, RecoverWriteError) { + env_.writable_file_error_ = true; + Status s = TryReopen(); + ASSERT_TRUE(!s.ok()); +} + +TEST(CorruptionTest, NewFileErrorDuringWrite) { + // Do enough writing to force minor compaction + env_.writable_file_error_ = true; + const int num = 3 + (Options().write_buffer_size / kValueSize); + std::string value_storage; + Status s; + bool failed = false; + for (int i = 0; i < num; i++) { + WriteBatch batch; + batch.Put("a", Value(100, &value_storage)); + s = db_->Write(WriteOptions(), &batch); + if (!s.ok()) { + failed = true; + } + ASSERT_TRUE(!failed || !s.ok()); + } + ASSERT_TRUE(!s.ok()); + ASSERT_GE(env_.num_writable_file_errors_, 1); + env_.writable_file_error_ = false; + Reopen(); +} + +TEST(CorruptionTest, TableFile) { + Build(100); + DBImpl* dbi = reinterpret_cast(db_); + dbi->TEST_FlushMemTable(); + dbi->TEST_CompactRange(0, nullptr, nullptr); + dbi->TEST_CompactRange(1, nullptr, nullptr); + + Corrupt(kTableFile, 100, 1); + Check(99, 99); +} + +TEST(CorruptionTest, TableFileIndexData) { + Build(10000); // Enough to build multiple Tables + DBImpl* dbi = reinterpret_cast(db_); + dbi->TEST_FlushMemTable(); + + Corrupt(kTableFile, -2000, 500); + Reopen(); + Check(5000, 9999); +} + +TEST(CorruptionTest, MissingDescriptor) { + Build(1000); + RepairDB(); + Reopen(); + Check(1000, 1000); +} + +TEST(CorruptionTest, SequenceNumberRecovery) { + ASSERT_OK(db_->Put(WriteOptions(), "foo", "v1")); + ASSERT_OK(db_->Put(WriteOptions(), "foo", "v2")); + ASSERT_OK(db_->Put(WriteOptions(), "foo", "v3")); + ASSERT_OK(db_->Put(WriteOptions(), "foo", "v4")); + ASSERT_OK(db_->Put(WriteOptions(), "foo", "v5")); + RepairDB(); + Reopen(); + std::string v; + ASSERT_OK(db_->Get(ReadOptions(), "foo", &v)); + ASSERT_EQ("v5", v); + // Write something. If sequence number was not recovered properly, + // it will be hidden by an earlier write. 
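The test relies on RepairDB() reconstructing a last sequence number at least as large as the newest surviving write; if repair lost it, the Put below would be stamped with a stale sequence number and shadowed by v5. Minimal usage of the repair entry point (hypothetical helper; RepairDB requires the database to be closed first):

#include <string>
#include "rocksdb/db.h"
#include "rocksdb/options.h"

// Rebuild the MANIFEST from the surviving data files, then reopen.
rocksdb::Status RepairAndReopen(rocksdb::DB** db, const std::string& name,
                                const rocksdb::Options& options) {
  delete *db;  // close the handle before repairing
  *db = nullptr;
  rocksdb::Status s = rocksdb::RepairDB(name, options);
  if (!s.ok()) {
    return s;
  }
  return rocksdb::DB::Open(options, name, db);
}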
+ ASSERT_OK(db_->Put(WriteOptions(), "foo", "v6")); + ASSERT_OK(db_->Get(ReadOptions(), "foo", &v)); + ASSERT_EQ("v6", v); + Reopen(); + ASSERT_OK(db_->Get(ReadOptions(), "foo", &v)); + ASSERT_EQ("v6", v); +} + +TEST(CorruptionTest, CorruptedDescriptor) { + ASSERT_OK(db_->Put(WriteOptions(), "foo", "hello")); + DBImpl* dbi = reinterpret_cast(db_); + dbi->TEST_FlushMemTable(); + dbi->TEST_CompactRange(0, nullptr, nullptr); + + Corrupt(kDescriptorFile, 0, 1000); + Status s = TryReopen(); + ASSERT_TRUE(!s.ok()); + + RepairDB(); + Reopen(); + std::string v; + ASSERT_OK(db_->Get(ReadOptions(), "foo", &v)); + ASSERT_EQ("hello", v); +} + +TEST(CorruptionTest, CompactionInputError) { + Build(10); + DBImpl* dbi = reinterpret_cast(db_); + dbi->TEST_FlushMemTable(); + const int last = dbi->MaxMemCompactionLevel(); + ASSERT_EQ(1, Property("rocksdb.num-files-at-level" + NumberToString(last))); + + Corrupt(kTableFile, 100, 1); + Check(9, 9); + + // Force compactions by writing lots of values + Build(10000); + Check(10000, 10000); +} + +TEST(CorruptionTest, CompactionInputErrorParanoid) { + Options options; + options.paranoid_checks = true; + options.write_buffer_size = 131072; + options.max_write_buffer_number = 2; + Reopen(&options); + DBImpl* dbi = reinterpret_cast(db_); + + // Fill levels >= 1 so memtable flush outputs to level 0 + for (int level = 1; level < dbi->NumberLevels(); level++) { + dbi->Put(WriteOptions(), "", "begin"); + dbi->Put(WriteOptions(), "~", "end"); + dbi->TEST_FlushMemTable(); + } + + options.max_mem_compaction_level = 0; + Reopen(&options); + + dbi = reinterpret_cast(db_); + Build(10); + dbi->TEST_FlushMemTable(); + dbi->TEST_WaitForCompact(); + ASSERT_EQ(1, Property("rocksdb.num-files-at-level0")); + + CorruptTableFileAtLevel(0, 100, 1); + Check(9, 9); + + // Write must eventually fail because of corrupted table + Status s; + std::string tmp1, tmp2; + bool failed = false; + for (int i = 0; i < 10000; i++) { + s = db_->Put(WriteOptions(), Key(i, &tmp1), Value(i, &tmp2)); + if (!s.ok()) { + failed = true; + } + // if one write failed, every subsequent write must fail, too + ASSERT_TRUE(!failed || !s.ok()) << "write did not fail in a corrupted db"; + } + ASSERT_TRUE(!s.ok()) << "write did not fail in corrupted paranoid db"; +} + +TEST(CorruptionTest, UnrelatedKeys) { + Build(10); + DBImpl* dbi = reinterpret_cast(db_); + dbi->TEST_FlushMemTable(); + Corrupt(kTableFile, 100, 1); + + std::string tmp1, tmp2; + ASSERT_OK(db_->Put(WriteOptions(), Key(1000, &tmp1), Value(1000, &tmp2))); + std::string v; + ASSERT_OK(db_->Get(ReadOptions(), Key(1000, &tmp1), &v)); + ASSERT_EQ(Value(1000, &tmp2).ToString(), v); + dbi->TEST_FlushMemTable(); + ASSERT_OK(db_->Get(ReadOptions(), Key(1000, &tmp1), &v)); + ASSERT_EQ(Value(1000, &tmp2).ToString(), v); +} + +TEST(CorruptionTest, FileSystemStateCorrupted) { + for (int iter = 0; iter < 2; ++iter) { + Options options; + options.paranoid_checks = true; + options.create_if_missing = true; + Reopen(&options); + Build(10); + ASSERT_OK(db_->Flush(FlushOptions())); + DBImpl* dbi = reinterpret_cast(db_); + std::vector metadata; + dbi->GetLiveFilesMetaData(&metadata); + ASSERT_GT(metadata.size(), size_t(0)); + std::string filename = dbname_ + metadata[0].name; + + delete db_; + db_ = nullptr; + + if (iter == 0) { // corrupt file size + unique_ptr file; + env_.NewWritableFile(filename, &file, EnvOptions()); + file->Append(Slice("corrupted sst")); + file.reset(); + } else { // delete the file + env_.DeleteFile(filename); + } + + Status x = TryReopen(&options); 
+ ASSERT_TRUE(x.IsCorruption()); + DestroyDB(dbname_, options_); + Reopen(&options); + } +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +} diff --git a/db/db_bench.cc b/db/db_bench.cc new file mode 100644 index 0000000000..b8e4b32134 --- /dev/null +++ b/db/db_bench.cc @@ -0,0 +1,2642 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#define __STDC_FORMAT_MACROS + +#ifndef GFLAGS +#include +int main() { + fprintf(stderr, "Please install gflags to run rocksdb tools\n"); + return 1; +} +#else + +#include +#include +#include +#include +#include +#include +#include "db/db_impl.h" +#include "db/version_set.h" +#include "rocksdb/statistics.h" +#include "rocksdb/options.h" +#include "rocksdb/cache.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/memtablerep.h" +#include "rocksdb/write_batch.h" +#include "rocksdb/slice.h" +#include "rocksdb/slice_transform.h" +#include "rocksdb/statistics.h" +#include "rocksdb/perf_context.h" +#include "port/port.h" +#include "port/stack_trace.h" +#include "util/crc32c.h" +#include "util/histogram.h" +#include "util/mutexlock.h" +#include "util/random.h" +#include "util/string_util.h" +#include "util/statistics.h" +#include "util/testutil.h" +#include "util/xxhash.h" +#include "hdfs/env_hdfs.h" +#include "utilities/merge_operators.h" + +using GFLAGS::ParseCommandLineFlags; +using GFLAGS::RegisterFlagValidator; +using GFLAGS::SetUsageMessage; + +DEFINE_string(benchmarks, + "fillseq," + "fillsync," + "fillrandom," + "overwrite," + "readrandom," + "newiterator," + "newiteratorwhilewriting," + "seekrandom," + "seekrandomwhilewriting," + "readseq," + "readreverse," + "compact," + "readrandom," + "multireadrandom," + "readseq," + "readtocache," + "readreverse," + "readwhilewriting," + "readrandomwriterandom," + "updaterandom," + "randomwithverify," + "fill100K," + "crc32c," + "xxhash," + "compress," + "uncompress," + "acquireload,", + + "Comma-separated list of operations to run in the specified order" + "Actual benchmarks:\n" + "\tfillseq -- write N values in sequential key" + " order in async mode\n" + "\tfillrandom -- write N values in random key order in async" + " mode\n" + "\toverwrite -- overwrite N values in random key order in" + " async mode\n" + "\tfillsync -- write N/100 values in random key order in " + "sync mode\n" + "\tfill100K -- write N/1000 100K values in random order in" + " async mode\n" + "\tdeleteseq -- delete N keys in sequential order\n" + "\tdeleterandom -- delete N keys in random order\n" + "\treadseq -- read N times sequentially\n" + "\treadtocache -- 1 thread reading database sequentially\n" + "\treadreverse -- read N times in reverse order\n" + "\treadrandom -- read N times in random order\n" + "\treadmissing -- read N missing keys in random order\n" + "\treadhot -- read N times in random order from 1% section " + "of DB\n" + "\treadwhilewriting -- 1 writer, N threads doing random " + "reads\n" + "\treadrandomwriterandom -- N threads doing random-read, " + "random-write\n" + "\tprefixscanrandom -- 
prefix scan N times in random order\n" + "\tupdaterandom -- N threads doing read-modify-write for random " + "keys\n" + "\tappendrandom -- N threads doing read-modify-write with " + "growing values\n" + "\tmergerandom -- same as updaterandom/appendrandom using merge" + " operator. " + "Must be used with merge_operator\n" + "\treadrandommergerandom -- perform N random read-or-merge " + "operations. Must be used with merge_operator\n" + "\tnewiterator -- repeated iterator creation\n" + "\tseekrandom -- N random seeks\n" + "\tseekrandom -- 1 writer, N threads doing random seeks\n" + "\tcrc32c -- repeated crc32c of 4K of data\n" + "\txxhash -- repeated xxHash of 4K of data\n" + "\tacquireload -- load N*1000 times\n" + "Meta operations:\n" + "\tcompact -- Compact the entire DB\n" + "\tstats -- Print DB stats\n" + "\tlevelstats -- Print the number of files and bytes per level\n" + "\tsstables -- Print sstable info\n" + "\theapprofile -- Dump a heap profile (if supported by this" + " port)\n"); + +DEFINE_int64(num, 1000000, "Number of key/values to place in database"); + +DEFINE_int64(numdistinct, 1000, + "Number of distinct keys to use. Used in RandomWithVerify to " + "read/write on fewer keys so that gets are more likely to find the" + " key and puts are more likely to update the same key"); + +DEFINE_int64(merge_keys, -1, + "Number of distinct keys to use for MergeRandom and " + "ReadRandomMergeRandom. " + "If negative, there will be FLAGS_num keys."); + +DEFINE_int64(reads, -1, "Number of read operations to do. " + "If negative, do FLAGS_num reads."); + +DEFINE_int32(bloom_locality, 0, "Control bloom filter probes locality"); + +DEFINE_int64(seed, 0, "Seed base for random number generators. " + "When 0 it is deterministic."); + +DEFINE_int32(threads, 1, "Number of concurrent threads to run."); + +DEFINE_int32(duration, 0, "Time in seconds for the random-ops tests to run." + " When 0 then num & reads determine the test duration"); + +DEFINE_int32(value_size, 100, "Size of each value"); + + +static bool ValidateKeySize(const char* flagname, int32_t value) { + return true; +} + +DEFINE_int32(key_size, 16, "size of each key"); + +DEFINE_int32(num_multi_db, 0, + "Number of DBs used in the benchmark. 0 means single DB."); + +DEFINE_double(compression_ratio, 0.5, "Arrange to generate values that shrink" + " to this fraction of their original size after compression"); + +DEFINE_bool(histogram, false, "Print histogram of operation timings"); + +DEFINE_int64(write_buffer_size, rocksdb::Options().write_buffer_size, + "Number of bytes to buffer in memtable before compacting"); + +DEFINE_int32(max_write_buffer_number, + rocksdb::Options().max_write_buffer_number, + "The number of in-memory memtables. Each memtable is of size" + "write_buffer_size."); + +DEFINE_int32(min_write_buffer_number_to_merge, + rocksdb::Options().min_write_buffer_number_to_merge, + "The minimum number of write buffers that will be merged together" + "before writing to storage. This is cheap because it is an" + "in-memory merge. If this feature is not enabled, then all these" + "write buffers are flushed to L0 as separate files and this " + "increases read amplification because a get request has to check" + " in all of these files. 
Also, an in-memory merge may result in"
+             " writing less data to storage if there are duplicate records"
+             " in each of these individual write buffers.");
+
+DEFINE_int32(max_background_compactions,
+             rocksdb::Options().max_background_compactions,
+             "The maximum number of concurrent background compactions"
+             " that can occur in parallel.");
+
+DEFINE_int32(max_background_flushes,
+             rocksdb::Options().max_background_flushes,
+             "The maximum number of concurrent background flushes"
+             " that can occur in parallel.");
+
+static rocksdb::CompactionStyle FLAGS_compaction_style_e;
+DEFINE_int32(compaction_style, (int32_t) rocksdb::Options().compaction_style,
+             "style of compaction: level-based vs universal");
+
+DEFINE_int32(universal_size_ratio, 0,
+             "Percentage flexibility while comparing file size"
+             " (for universal compaction only).");
+
+DEFINE_int32(universal_min_merge_width, 0, "The minimum number of files in a"
+             " single compaction run (for universal compaction only).");
+
+DEFINE_int32(universal_max_merge_width, 0, "The max number of files to compact"
+             " in universal style compaction");
+
+DEFINE_int32(universal_max_size_amplification_percent, 0,
+             "The max size amplification for universal style compaction");
+
+DEFINE_int32(universal_compression_size_percent, -1,
+             "The percentage of the database to compress for universal "
+             "compaction. -1 means compress everything.");
+
+DEFINE_int64(cache_size, -1, "Number of bytes to use as a cache of uncompressed "
+             "data. Negative means use default settings.");
+
+DEFINE_int32(block_size, rocksdb::Options().block_size,
+             "Number of bytes in a block.");
+
+DEFINE_int64(compressed_cache_size, -1,
+             "Number of bytes to use as a cache of compressed data.");
+
+DEFINE_int32(open_files, rocksdb::Options().max_open_files,
+             "Maximum number of files to keep open at the same time"
+             " (use default if == 0)");
+
+DEFINE_int32(bloom_bits, -1, "Bloom filter bits per key. Negative means"
+             " use default settings.");
+DEFINE_int32(memtable_bloom_bits, 0, "Bloom filter bits per key for memtable. "
+             "Negative means no bloom filter.");
+
+DEFINE_bool(use_existing_db, false, "If true, do not destroy the existing"
+            " database. If you set this flag and also specify a benchmark that"
+            " wants a fresh database, that benchmark will fail.");
+
+DEFINE_string(db, "", "Use the db with the following name.");
+
+static bool ValidateCacheNumshardbits(const char* flagname, int32_t value) {
+  if (value >= 20) {
+    fprintf(stderr, "Invalid value for --%s: %d, must be < 20\n",
+            flagname, value);
+    return false;
+  }
+  return true;
+}
+DEFINE_int32(cache_numshardbits, -1, "Number of shards for the block cache"
+             " is 2 ** cache_numshardbits. Negative means use default settings."
+             " This is applied only if FLAGS_cache_size is non-negative.");
+
+DEFINE_int32(cache_remove_scan_count_limit, 32, "");
+
+DEFINE_bool(verify_checksum, false, "Verify checksum for every block read"
+            " from storage");
+
+DEFINE_bool(statistics, false, "Database statistics");
+static std::shared_ptr<rocksdb::Statistics> dbstats;
+
+DEFINE_int64(writes, -1, "Number of write operations to do. If negative, do"
+             " --num writes.");
+
+DEFINE_int32(writes_per_second, 0, "Per-thread rate limit on writes per second."
+             " No limit when <= 0. 
Only for the readwhilewriting test.");
+
+DEFINE_bool(sync, false, "Sync all writes to disk");
+
+DEFINE_bool(disable_data_sync, false, "If true, do not wait until data is"
+            " synced to disk.");
+
+DEFINE_bool(use_fsync, false, "If true, issue fsync instead of fdatasync");
+
+DEFINE_bool(disable_wal, false, "If true, do not write WAL for write.");
+
+DEFINE_string(wal_dir, "", "If not empty, use the given dir for WAL");
+
+DEFINE_int32(num_levels, 7, "The total number of levels");
+
+DEFINE_int32(target_file_size_base, 2 * 1048576, "Target file size at level-1");
+
+DEFINE_int32(target_file_size_multiplier, 1,
+             "A multiplier to compute target level-N file size (N >= 2)");
+
+DEFINE_uint64(max_bytes_for_level_base, 10 * 1048576, "Max bytes for level-1");
+
+DEFINE_int32(max_bytes_for_level_multiplier, 10,
+             "A multiplier to compute max bytes for level-N (N >= 2)");
+
+static std::vector<int> FLAGS_max_bytes_for_level_multiplier_additional_v;
+DEFINE_string(max_bytes_for_level_multiplier_additional, "",
+              "A vector that specifies additional fanout per level");
+
+DEFINE_int32(level0_stop_writes_trigger, 12, "Number of files in level-0"
+             " that will trigger put stop.");
+
+DEFINE_int32(level0_slowdown_writes_trigger, 8, "Number of files in level-0"
+             " that will slow down writes.");
+
+DEFINE_int32(level0_file_num_compaction_trigger, 4, "Number of files in level-0"
+             " when compactions start");
+
+static bool ValidateInt32Percent(const char* flagname, int32_t value) {
+  if (value <= 0 || value >= 100) {
+    fprintf(stderr, "Invalid value for --%s: %d, must be 0 < pct < 100\n",
+            flagname, value);
+    return false;
+  }
+  return true;
+}
+DEFINE_int32(readwritepercent, 90, "Ratio of reads to reads/writes (expressed"
+             " as percentage) for the ReadRandomWriteRandom workload. The "
+             "default value 90 means 90% of all read and write operations"
+             " are reads. In other words, 9 gets for every 1 put.");
+
+DEFINE_int32(mergereadpercent, 70, "Ratio of merges to merges&reads (expressed"
+             " as percentage) for the ReadRandomMergeRandom workload. The"
+             " default value 70 means 70% out of all read and merge operations"
+             " are merges. In other words, 7 merges for every 3 gets.");
+
+DEFINE_int32(deletepercent, 2, "Percentage of deletes out of reads/writes/"
+             "deletes (used in RandomWithVerify only). RandomWithVerify "
+             "calculates writepercent as (100 - FLAGS_readwritepercent - "
+             "deletepercent), so deletepercent must be smaller than (100 - "
+             "FLAGS_readwritepercent)");
+
+DEFINE_bool(disable_seek_compaction, false, "Option to disable compaction"
+            " triggered by read.");
+
+DEFINE_uint64(delete_obsolete_files_period_micros, 0, "Option to delete "
+              "obsolete files periodically. 
0 means that obsolete files are" + " deleted after every compaction run."); + +namespace { +enum rocksdb::CompressionType StringToCompressionType(const char* ctype) { + assert(ctype); + + if (!strcasecmp(ctype, "none")) + return rocksdb::kNoCompression; + else if (!strcasecmp(ctype, "snappy")) + return rocksdb::kSnappyCompression; + else if (!strcasecmp(ctype, "zlib")) + return rocksdb::kZlibCompression; + else if (!strcasecmp(ctype, "bzip2")) + return rocksdb::kBZip2Compression; + else if (!strcasecmp(ctype, "lz4")) + return rocksdb::kLZ4Compression; + else if (!strcasecmp(ctype, "lz4hc")) + return rocksdb::kLZ4HCCompression; + + fprintf(stdout, "Cannot parse compression type '%s'\n", ctype); + return rocksdb::kSnappyCompression; //default value +} +} // namespace + +DEFINE_string(compression_type, "snappy", + "Algorithm to use to compress the database"); +static enum rocksdb::CompressionType FLAGS_compression_type_e = + rocksdb::kSnappyCompression; + +DEFINE_int32(compression_level, -1, + "Compression level. For zlib this should be -1 for the " + "default level, or between 0 and 9."); + +static bool ValidateCompressionLevel(const char* flagname, int32_t value) { + if (value < -1 || value > 9) { + fprintf(stderr, "Invalid value for --%s: %d, must be between -1 and 9\n", + flagname, value); + return false; + } + return true; +} + +static const bool FLAGS_compression_level_dummy __attribute__((unused)) = + RegisterFlagValidator(&FLAGS_compression_level, &ValidateCompressionLevel); + +DEFINE_int32(min_level_to_compress, -1, "If non-negative, compression starts" + " from this level. Levels with number < min_level_to_compress are" + " not compressed. Otherwise, apply compression_type to " + "all levels."); + +static bool ValidateTableCacheNumshardbits(const char* flagname, + int32_t value) { + if (0 >= value || value > 20) { + fprintf(stderr, "Invalid value for --%s: %d, must be 0 < val <= 20\n", + flagname, value); + return false; + } + return true; +} +DEFINE_int32(table_cache_numshardbits, 4, ""); + +DEFINE_string(hdfs, "", "Name of hdfs environment"); +// posix or hdfs environment +static rocksdb::Env* FLAGS_env = rocksdb::Env::Default(); + +DEFINE_int64(stats_interval, 0, "Stats are reported every N operations when " + "this is greater than zero. 
When 0 the interval grows over time.");
+
+DEFINE_int32(stats_per_interval, 0, "Reports additional stats per interval when"
+             " this is greater than 0.");
+
+DEFINE_int32(perf_level, 0, "Level of perf collection");
+
+static bool ValidateRateLimit(const char* flagname, double value) {
+  static constexpr double EPSILON = 1e-10;
+  if (value < -EPSILON) {
+    fprintf(stderr, "Invalid value for --%s: %12.6f, must be >= 0.0\n",
+            flagname, value);
+    return false;
+  }
+  return true;
+}
+DEFINE_double(soft_rate_limit, 0.0, "");
+
+DEFINE_double(hard_rate_limit, 0.0, "When not equal to 0 this makes threads "
+              "sleep at each stats reporting interval until the compaction"
+              " score for all levels is less than or equal to this value.");
+
+DEFINE_int32(rate_limit_delay_max_milliseconds, 1000,
+             "When hard_rate_limit is set then this is the max time a put will"
+             " be stalled.");
+
+DEFINE_int32(max_grandparent_overlap_factor, 10, "Control maximum bytes of "
+             "overlaps in grandparent (i.e., level+2) before we stop building a"
+             " single file in a level->level+1 compaction.");
+
+DEFINE_bool(readonly, false, "Run read only benchmarks.");
+
+DEFINE_bool(disable_auto_compactions, false, "Do not auto trigger compactions");
+
+DEFINE_int32(source_compaction_factor, 1, "Cap the size of data in level-K for"
+             " a compaction run that compacts Level-K with Level-(K+1) (for"
+             " K >= 1)");
+
+DEFINE_uint64(wal_ttl_seconds, 0, "Set the TTL for the WAL Files in seconds.");
+DEFINE_uint64(wal_size_limit_MB, 0, "Set the size limit for the WAL Files"
+              " in MB.");
+
+DEFINE_bool(bufferedio, rocksdb::EnvOptions().use_os_buffer,
+            "Allow buffered io using OS buffers");
+
+DEFINE_bool(mmap_read, rocksdb::EnvOptions().use_mmap_reads,
+            "Allow reads to occur via mmap-ing files");
+
+DEFINE_bool(mmap_write, rocksdb::EnvOptions().use_mmap_writes,
+            "Allow writes to occur via mmap-ing files");
+
+DEFINE_bool(advise_random_on_open, rocksdb::Options().advise_random_on_open,
+            "Advise random access on table file open");
+
+DEFINE_string(compaction_fadvice, "NORMAL",
+              "Access pattern advice when a file is compacted");
+static auto FLAGS_compaction_fadvice_e =
+    rocksdb::Options().access_hint_on_compaction_start;
+
+DEFINE_bool(use_tailing_iterator, false,
+            "Use tailing iterator to access a series of keys instead of get");
+DEFINE_int64(iter_refresh_interval_us, -1,
+             "How often to refresh iterators. Disable refresh when -1");
+
+DEFINE_bool(use_adaptive_mutex, rocksdb::Options().use_adaptive_mutex,
+            "Use adaptive mutex");
+
+DEFINE_uint64(bytes_per_sync, rocksdb::Options().bytes_per_sync,
+              "Allows OS to incrementally sync files to disk while they are"
+              " being written, in the background. Issue one request for every"
+              " bytes_per_sync written. 0 turns it off.");
+DEFINE_bool(filter_deletes, false, "On true, deletes use bloom-filter and drop"
+            " the delete if key not present");
+
+DEFINE_int32(max_successive_merges, 0, "Maximum number of successive merge"
+             " operations on a key in the memtable");
+
+static bool ValidatePrefixSize(const char* flagname, int32_t value) {
+  if (value < 0 || value >= 2000000000) {
+    fprintf(stderr,
+            "Invalid value for --%s: %d, must be 0 <= PrefixSize <= 2000000000\n",
+            flagname, value);
+    return false;
+  }
+  return true;
+}
+DEFINE_int32(prefix_size, 0, "control the prefix size for HashSkipList and "
+             "plain table");
+DEFINE_int64(keys_per_prefix, 0, "control average number of keys generated "
+             "per prefix, 0 means no special handling of the prefix, "
+             "i.e. 
use the prefix that comes with the generated random number.");
+
+enum RepFactory {
+  kSkipList,
+  kPrefixHash,
+  kVectorRep,
+  kHashLinkedList,
+  kCuckoo
+};
+
+namespace {
+enum RepFactory StringToRepFactory(const char* ctype) {
+  assert(ctype);
+
+  if (!strcasecmp(ctype, "skip_list"))
+    return kSkipList;
+  else if (!strcasecmp(ctype, "prefix_hash"))
+    return kPrefixHash;
+  else if (!strcasecmp(ctype, "vector"))
+    return kVectorRep;
+  else if (!strcasecmp(ctype, "hash_linkedlist"))
+    return kHashLinkedList;
+  else if (!strcasecmp(ctype, "cuckoo"))
+    return kCuckoo;
+
+  fprintf(stdout, "Cannot parse memreptable %s\n", ctype);
+  return kSkipList;
+}
+}  // namespace
+
+static enum RepFactory FLAGS_rep_factory;
+DEFINE_string(memtablerep, "skip_list", "");
+DEFINE_int64(hash_bucket_count, 1024 * 1024, "hash bucket count");
+DEFINE_bool(use_plain_table, false, "if true, use plain table "
+            "instead of block-based table format");
+
+DEFINE_string(merge_operator, "", "The merge operator to use with the database. "
+              "If a new merge operator is specified, be sure to use a fresh"
+              " database. The possible merge operators are defined in"
+              " utilities/merge_operators.h");
+
+static const bool FLAGS_soft_rate_limit_dummy __attribute__((unused)) =
+    RegisterFlagValidator(&FLAGS_soft_rate_limit, &ValidateRateLimit);
+
+static const bool FLAGS_hard_rate_limit_dummy __attribute__((unused)) =
+    RegisterFlagValidator(&FLAGS_hard_rate_limit, &ValidateRateLimit);
+
+static const bool FLAGS_prefix_size_dummy __attribute__((unused)) =
+    RegisterFlagValidator(&FLAGS_prefix_size, &ValidatePrefixSize);
+
+static const bool FLAGS_key_size_dummy __attribute__((unused)) =
+    RegisterFlagValidator(&FLAGS_key_size, &ValidateKeySize);
+
+static const bool FLAGS_cache_numshardbits_dummy __attribute__((unused)) =
+    RegisterFlagValidator(&FLAGS_cache_numshardbits,
+                          &ValidateCacheNumshardbits);
+
+static const bool FLAGS_readwritepercent_dummy __attribute__((unused)) =
+    RegisterFlagValidator(&FLAGS_readwritepercent, &ValidateInt32Percent);
+
+static const bool FLAGS_deletepercent_dummy __attribute__((unused)) =
+    RegisterFlagValidator(&FLAGS_deletepercent, &ValidateInt32Percent);
+static const bool FLAGS_table_cache_numshardbits_dummy __attribute__((unused)) =
+    RegisterFlagValidator(&FLAGS_table_cache_numshardbits,
+                          &ValidateTableCacheNumshardbits);
+
+namespace rocksdb {
+
+// Helper for quickly generating random data.
+class RandomGenerator {
+ private:
+  std::string data_;
+  unsigned int pos_;
+
+ public:
+  RandomGenerator() {
+    // We use a limited amount of data over and over again and ensure
+    // that it is larger than the compression window (32KB), and also
+    // large enough to serve all typical value sizes we want to write.
+    Random rnd(301);
+    std::string piece;
+    while (data_.size() < (unsigned)std::max(1048576, FLAGS_value_size)) {
+      // Add a short fragment that is as compressible as specified
+      // by FLAGS_compression_ratio. 
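+      // (CompressibleString builds each fragment so that a general-purpose
+      // compressor should shrink it to roughly FLAGS_compression_ratio of
+      // its original size, e.g. about 50 bytes out of 100 at the default
+      // ratio of 0.5, so compression-related benchmarks see representative
+      // input.)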
+ test::CompressibleString(&rnd, FLAGS_compression_ratio, 100, &piece); + data_.append(piece); + } + pos_ = 0; + } + + Slice Generate(unsigned int len) { + if (pos_ + len > data_.size()) { + pos_ = 0; + assert(len < data_.size()); + } + pos_ += len; + return Slice(data_.data() + pos_ - len, len); + } +}; + +static void AppendWithSpace(std::string* str, Slice msg) { + if (msg.empty()) return; + if (!str->empty()) { + str->push_back(' '); + } + str->append(msg.data(), msg.size()); +} + +class Stats { + private: + int id_; + double start_; + double finish_; + double seconds_; + int64_t done_; + int64_t last_report_done_; + int64_t next_report_; + int64_t bytes_; + double last_op_finish_; + double last_report_finish_; + HistogramImpl hist_; + std::string message_; + bool exclude_from_merge_; + + public: + Stats() { Start(-1); } + + void Start(int id) { + id_ = id; + next_report_ = FLAGS_stats_interval ? FLAGS_stats_interval : 100; + last_op_finish_ = start_; + hist_.Clear(); + done_ = 0; + last_report_done_ = 0; + bytes_ = 0; + seconds_ = 0; + start_ = FLAGS_env->NowMicros(); + finish_ = start_; + last_report_finish_ = start_; + message_.clear(); + // When set, stats from this thread won't be merged with others. + exclude_from_merge_ = false; + } + + void Merge(const Stats& other) { + if (other.exclude_from_merge_) + return; + + hist_.Merge(other.hist_); + done_ += other.done_; + bytes_ += other.bytes_; + seconds_ += other.seconds_; + if (other.start_ < start_) start_ = other.start_; + if (other.finish_ > finish_) finish_ = other.finish_; + + // Just keep the messages from one thread + if (message_.empty()) message_ = other.message_; + } + + void Stop() { + finish_ = FLAGS_env->NowMicros(); + seconds_ = (finish_ - start_) * 1e-6; + } + + void AddMessage(Slice msg) { + AppendWithSpace(&message_, msg); + } + + void SetId(int id) { id_ = id; } + void SetExcludeFromMerge() { exclude_from_merge_ = true; } + + void FinishedSingleOp(DB* db) { + if (FLAGS_histogram) { + double now = FLAGS_env->NowMicros(); + double micros = now - last_op_finish_; + hist_.Add(micros); + if (micros > 20000 && !FLAGS_stats_interval) { + fprintf(stderr, "long op: %.1f micros%30s\r", micros, ""); + fflush(stderr); + } + last_op_finish_ = now; + } + + done_++; + if (done_ >= next_report_) { + if (!FLAGS_stats_interval) { + if (next_report_ < 1000) next_report_ += 100; + else if (next_report_ < 5000) next_report_ += 500; + else if (next_report_ < 10000) next_report_ += 1000; + else if (next_report_ < 50000) next_report_ += 5000; + else if (next_report_ < 100000) next_report_ += 10000; + else if (next_report_ < 500000) next_report_ += 50000; + else next_report_ += 100000; + fprintf(stderr, "... finished %" PRIu64 " ops%30s\r", done_, ""); + fflush(stderr); + } else { + double now = FLAGS_env->NowMicros(); + fprintf(stderr, + "%s ... 
thread %d: (%" PRIu64 ",%" PRIu64 ") ops and " + "(%.1f,%.1f) ops/second in (%.6f,%.6f) seconds\n", + FLAGS_env->TimeToString((uint64_t) now/1000000).c_str(), + id_, + done_ - last_report_done_, done_, + (done_ - last_report_done_) / + ((now - last_report_finish_) / 1000000.0), + done_ / ((now - start_) / 1000000.0), + (now - last_report_finish_) / 1000000.0, + (now - start_) / 1000000.0); + + if (FLAGS_stats_per_interval) { + std::string stats; + if (db && db->GetProperty("rocksdb.stats", &stats)) + fprintf(stderr, "%s\n", stats.c_str()); + } + + fflush(stderr); + next_report_ += FLAGS_stats_interval; + last_report_finish_ = now; + last_report_done_ = done_; + } + } + } + + void AddBytes(int64_t n) { + bytes_ += n; + } + + void Report(const Slice& name) { + // Pretend at least one op was done in case we are running a benchmark + // that does not call FinishedSingleOp(). + if (done_ < 1) done_ = 1; + + std::string extra; + if (bytes_ > 0) { + // Rate is computed on actual elapsed time, not the sum of per-thread + // elapsed times. + double elapsed = (finish_ - start_) * 1e-6; + char rate[100]; + snprintf(rate, sizeof(rate), "%6.1f MB/s", + (bytes_ / 1048576.0) / elapsed); + extra = rate; + } + AppendWithSpace(&extra, message_); + double elapsed = (finish_ - start_) * 1e-6; + double throughput = (double)done_/elapsed; + + fprintf(stdout, "%-12s : %11.3f micros/op %ld ops/sec;%s%s\n", + name.ToString().c_str(), + elapsed * 1e6 / done_, + (long)throughput, + (extra.empty() ? "" : " "), + extra.c_str()); + if (FLAGS_histogram) { + fprintf(stdout, "Microseconds per op:\n%s\n", hist_.ToString().c_str()); + } + fflush(stdout); + } +}; + +// State shared by all concurrent executions of the same benchmark. +struct SharedState { + port::Mutex mu; + port::CondVar cv; + int total; + int perf_level; + + // Each thread goes through the following states: + // (1) initializing + // (2) waiting for others to be initialized + // (3) running + // (4) done + + long num_initialized; + long num_done; + bool start; + + SharedState() : cv(&mu), perf_level(FLAGS_perf_level) { } +}; + +// Per-thread state for concurrent executions of the same benchmark. +struct ThreadState { + int tid; // 0..n-1 when running in n threads + Random64 rand; // Has different seeds for different threads + Stats stats; + SharedState* shared; + + /* implicit */ ThreadState(int index) + : tid(index), + rand((FLAGS_seed ? 
FLAGS_seed : 1000) + index) {
+  }
+};
+
+class Duration {
+ public:
+  Duration(int max_seconds, int64_t max_ops) {
+    max_seconds_ = max_seconds;
+    max_ops_= max_ops;
+    ops_ = 0;
+    start_at_ = FLAGS_env->NowMicros();
+  }
+
+  bool Done(int64_t increment) {
+    if (increment <= 0) increment = 1;    // avoid Done(0) and infinite loops
+    ops_ += increment;
+
+    if (max_seconds_) {
+      // Recheck every appx 1000 ops (exact iff increment is factor of 1000)
+      if ((ops_/1000) != ((ops_-increment)/1000)) {
+        double now = FLAGS_env->NowMicros();
+        return ((now - start_at_) / 1000000.0) >= max_seconds_;
+      } else {
+        return false;
+      }
+    } else {
+      return ops_ > max_ops_;
+    }
+  }
+
+ private:
+  int max_seconds_;
+  int64_t max_ops_;
+  int64_t ops_;
+  double start_at_;
+};
+
+class Benchmark {
+ private:
+  shared_ptr<Cache> cache_;
+  shared_ptr<Cache> compressed_cache_;
+  const FilterPolicy* filter_policy_;
+  const SliceTransform* prefix_extractor_;
+  DB* db_;
+  std::vector<DB*> multi_dbs_;
+  int64_t num_;
+  int value_size_;
+  int key_size_;
+  int prefix_size_;
+  int64_t keys_per_prefix_;
+  int64_t entries_per_batch_;
+  WriteOptions write_options_;
+  int64_t reads_;
+  int64_t writes_;
+  int64_t readwrites_;
+  int64_t merge_keys_;
+
+  void PrintHeader() {
+    PrintEnvironment();
+    fprintf(stdout, "Keys: %d bytes each\n", FLAGS_key_size);
+    fprintf(stdout, "Values: %d bytes each (%d bytes after compression)\n",
+            FLAGS_value_size,
+            static_cast<int>(FLAGS_value_size * FLAGS_compression_ratio + 0.5));
+    fprintf(stdout, "Entries: %" PRIu64 "\n", num_);
+    fprintf(stdout, "Prefix: %d bytes\n", FLAGS_prefix_size);
+    fprintf(stdout, "Keys per prefix: %" PRIu64 "\n", keys_per_prefix_);
+    fprintf(stdout, "RawSize: %.1f MB (estimated)\n",
+            ((static_cast<int64_t>(FLAGS_key_size + FLAGS_value_size) * num_)
+             / 1048576.0));
+    fprintf(stdout, "FileSize: %.1f MB (estimated)\n",
+            (((FLAGS_key_size + FLAGS_value_size * FLAGS_compression_ratio)
+              * num_)
+             / 1048576.0));
+    fprintf(stdout, "Write rate limit: %d\n", FLAGS_writes_per_second);
+    switch (FLAGS_compression_type_e) {
+      case rocksdb::kNoCompression:
+        fprintf(stdout, "Compression: none\n");
+        break;
+      case rocksdb::kSnappyCompression:
+        fprintf(stdout, "Compression: snappy\n");
+        break;
+      case rocksdb::kZlibCompression:
+        fprintf(stdout, "Compression: zlib\n");
+        break;
+      case rocksdb::kBZip2Compression:
+        fprintf(stdout, "Compression: bzip2\n");
+        break;
+      case rocksdb::kLZ4Compression:
+        fprintf(stdout, "Compression: lz4\n");
+        break;
+      case rocksdb::kLZ4HCCompression:
+        fprintf(stdout, "Compression: lz4hc\n");
+        break;
+    }
+
+    switch (FLAGS_rep_factory) {
+      case kPrefixHash:
+        fprintf(stdout, "Memtablerep: prefix_hash\n");
+        break;
+      case kSkipList:
+        fprintf(stdout, "Memtablerep: skip_list\n");
+        break;
+      case kVectorRep:
+        fprintf(stdout, "Memtablerep: vector\n");
+        break;
+      case kHashLinkedList:
+        fprintf(stdout, "Memtablerep: hash_linkedlist\n");
+        break;
+      case kCuckoo:
+        fprintf(stdout, "Memtablerep: cuckoo\n");
+        break;
+    }
+    fprintf(stdout, "Perf Level: %d\n", FLAGS_perf_level);
+
+    PrintWarnings();
+    fprintf(stdout, "------------------------------------------------\n");
+  }
+
+  void PrintWarnings() {
+#if defined(__GNUC__) && !defined(__OPTIMIZE__)
+    fprintf(stdout,
+            "WARNING: Optimization is disabled: benchmarks unnecessarily slow\n"
+            );
+#endif
+#ifndef NDEBUG
+    fprintf(stdout,
+            "WARNING: Assertions are enabled; benchmarks unnecessarily slow\n");
+#endif
+    if (FLAGS_compression_type_e != rocksdb::kNoCompression) {
+      // The test string should not be too small. 
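+      // Quick startup self-test: a block_size buffer of repeated 'y' bytes
+      // is highly compressible, so if compression fails outright the
+      // library is not compiled in, and if the output is not smaller than
+      // the input the compressor is not doing useful work.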
+ const int len = FLAGS_block_size; + char* text = (char*) malloc(len+1); + bool result = true; + const char* name = nullptr; + std::string compressed; + + memset(text, (int) 'y', len); + text[len] = '\0'; + switch (FLAGS_compression_type_e) { + case kSnappyCompression: + result = port::Snappy_Compress(Options().compression_opts, text, + strlen(text), &compressed); + name = "Snappy"; + break; + case kZlibCompression: + result = port::Zlib_Compress(Options().compression_opts, text, + strlen(text), &compressed); + name = "Zlib"; + break; + case kBZip2Compression: + result = port::BZip2_Compress(Options().compression_opts, text, + strlen(text), &compressed); + name = "BZip2"; + break; + case kLZ4Compression: + result = port::LZ4_Compress(Options().compression_opts, text, + strlen(text), &compressed); + name = "LZ4"; + break; + case kLZ4HCCompression: + result = port::LZ4HC_Compress(Options().compression_opts, text, + strlen(text), &compressed); + name = "LZ4HC"; + break; + case kNoCompression: + assert(false); // cannot happen + break; + } + + if (!result) { + fprintf(stdout, "WARNING: %s compression is not enabled\n", name); + } else if (name && compressed.size() >= strlen(text)) { + fprintf(stdout, "WARNING: %s compression is not effective\n", name); + } + + free(text); + } + } + +// Current the following isn't equivalent to OS_LINUX. +#if defined(__linux) + static Slice TrimSpace(Slice s) { + unsigned int start = 0; + while (start < s.size() && isspace(s[start])) { + start++; + } + unsigned int limit = s.size(); + while (limit > start && isspace(s[limit-1])) { + limit--; + } + return Slice(s.data() + start, limit - start); + } +#endif + + void PrintEnvironment() { + fprintf(stderr, "LevelDB: version %d.%d\n", + kMajorVersion, kMinorVersion); + +#if defined(__linux) + time_t now = time(nullptr); + fprintf(stderr, "Date: %s", ctime(&now)); // ctime() adds newline + + FILE* cpuinfo = fopen("/proc/cpuinfo", "r"); + if (cpuinfo != nullptr) { + char line[1000]; + int num_cpus = 0; + std::string cpu_type; + std::string cache_size; + while (fgets(line, sizeof(line), cpuinfo) != nullptr) { + const char* sep = strchr(line, ':'); + if (sep == nullptr) { + continue; + } + Slice key = TrimSpace(Slice(line, sep - 1 - line)); + Slice val = TrimSpace(Slice(sep + 1)); + if (key == "model name") { + ++num_cpus; + cpu_type = val.ToString(); + } else if (key == "cache size") { + cache_size = val.ToString(); + } + } + fclose(cpuinfo); + fprintf(stderr, "CPU: %d * %s\n", num_cpus, cpu_type.c_str()); + fprintf(stderr, "CPUCache: %s\n", cache_size.c_str()); + } +#endif + } + + public: + Benchmark() + : cache_(FLAGS_cache_size >= 0 ? + (FLAGS_cache_numshardbits >= 1 ? + NewLRUCache(FLAGS_cache_size, FLAGS_cache_numshardbits, + FLAGS_cache_remove_scan_count_limit) : + NewLRUCache(FLAGS_cache_size)) : nullptr), + compressed_cache_(FLAGS_compressed_cache_size >= 0 ? + (FLAGS_cache_numshardbits >= 1 ? + NewLRUCache(FLAGS_compressed_cache_size, FLAGS_cache_numshardbits) : + NewLRUCache(FLAGS_compressed_cache_size)) : nullptr), + filter_policy_(FLAGS_bloom_bits >= 0 + ? NewBloomFilterPolicy(FLAGS_bloom_bits) + : nullptr), + prefix_extractor_(NewFixedPrefixTransform(FLAGS_prefix_size)), + db_(nullptr), + num_(FLAGS_num), + value_size_(FLAGS_value_size), + key_size_(FLAGS_key_size), + prefix_size_(FLAGS_prefix_size), + keys_per_prefix_(FLAGS_keys_per_prefix), + entries_per_batch_(1), + reads_(FLAGS_reads < 0 ? FLAGS_num : FLAGS_reads), + writes_(FLAGS_writes < 0 ? 
FLAGS_num : FLAGS_writes),
+    readwrites_((FLAGS_writes < 0 && FLAGS_reads < 0) ? FLAGS_num :
+                ((FLAGS_writes > FLAGS_reads) ? FLAGS_writes : FLAGS_reads)
+               ),
+    merge_keys_(FLAGS_merge_keys < 0 ? FLAGS_num : FLAGS_merge_keys) {
+    if (FLAGS_prefix_size > FLAGS_key_size) {
+      fprintf(stderr, "prefix size is larger than key size");
+      exit(1);
+    }
+
+    std::vector<std::string> files;
+    FLAGS_env->GetChildren(FLAGS_db, &files);
+    for (unsigned int i = 0; i < files.size(); i++) {
+      if (Slice(files[i]).starts_with("heap-")) {
+        FLAGS_env->DeleteFile(FLAGS_db + "/" + files[i]);
+      }
+    }
+    if (!FLAGS_use_existing_db) {
+      DestroyDB(FLAGS_db, Options());
+    }
+  }
+
+  ~Benchmark() {
+    delete db_;
+    delete filter_policy_;
+    delete prefix_extractor_;
+  }
+
+  Slice AllocateKey() {
+    return Slice(new char[key_size_], key_size_);
+  }
+
+  // Generate key according to the given specification and random number.
+  // The resulting key will have the following format (if keys_per_prefix_
+  // is positive), extra trailing bytes are either cut off or padded with '0'.
+  // The prefix value is derived from key value.
+  //   ----------------------------
+  //   | prefix 00000 | key 00000 |
+  //   ----------------------------
+  // If keys_per_prefix_ is 0, the key is simply a binary representation of
+  // random number followed by trailing '0's
+  //   ----------------------------
+  //   |        key 00000         |
+  //   ----------------------------
+  void GenerateKeyFromInt(uint64_t v, int64_t num_keys, Slice* key) {
+    char* start = const_cast<char*>(key->data());
+    char* pos = start;
+    if (keys_per_prefix_ > 0) {
+      int64_t num_prefix = num_keys / keys_per_prefix_;
+      int64_t prefix = v % num_prefix;
+      int bytes_to_fill = std::min(prefix_size_, 8);
+      if (port::kLittleEndian) {
+        for (int i = 0; i < bytes_to_fill; ++i) {
+          pos[i] = (prefix >> ((bytes_to_fill - i - 1) << 3)) & 0xFF;
+        }
+      } else {
+        memcpy(pos, static_cast<void*>(&prefix), bytes_to_fill);
+      }
+      if (prefix_size_ > 8) {
+        // fill the rest with 0s
+        memset(pos + 8, '0', prefix_size_ - 8);
+      }
+      pos += prefix_size_;
+    }
+
+    int bytes_to_fill = std::min(key_size_ - static_cast<int>(pos - start), 8);
+    if (port::kLittleEndian) {
+      for (int i = 0; i < bytes_to_fill; ++i) {
+        pos[i] = (v >> ((bytes_to_fill - i - 1) << 3)) & 0xFF;
+      }
+    } else {
+      memcpy(pos, static_cast<void*>(&v), bytes_to_fill);
+    }
+    pos += bytes_to_fill;
+    if (key_size_ > pos - start) {
+      memset(pos, '0', key_size_ - (pos - start));
+    }
+  }
+
+  std::string GetDbNameForMultiple(std::string base_name, size_t id) {
+    return base_name + std::to_string(id);
+  }
+
+  void Run() {
+    PrintHeader();
+    Open();
+    const char* benchmarks = FLAGS_benchmarks.c_str();
+    while (benchmarks != nullptr) {
+      const char* sep = strchr(benchmarks, ',');
+      Slice name;
+      if (sep == nullptr) {
+        name = benchmarks;
+        benchmarks = nullptr;
+      } else {
+        name = Slice(benchmarks, sep - benchmarks);
+        benchmarks = sep + 1;
+      }
+
+      // Sanitize parameters
+      num_ = FLAGS_num;
+      reads_ = (FLAGS_reads < 0 ? FLAGS_num : FLAGS_reads);
+      writes_ = (FLAGS_writes < 0 ? 
FLAGS_num : FLAGS_writes); + value_size_ = FLAGS_value_size; + key_size_ = FLAGS_key_size; + entries_per_batch_ = 1; + write_options_ = WriteOptions(); + if (FLAGS_sync) { + write_options_.sync = true; + } + write_options_.disableWAL = FLAGS_disable_wal; + + void (Benchmark::*method)(ThreadState*) = nullptr; + bool fresh_db = false; + int num_threads = FLAGS_threads; + + if (name == Slice("fillseq")) { + fresh_db = true; + method = &Benchmark::WriteSeq; + } else if (name == Slice("fillbatch")) { + fresh_db = true; + entries_per_batch_ = 1000; + method = &Benchmark::WriteSeq; + } else if (name == Slice("fillrandom")) { + fresh_db = true; + method = &Benchmark::WriteRandom; + } else if (name == Slice("filluniquerandom")) { + fresh_db = true; + if (num_threads > 1) { + fprintf(stderr, "filluniquerandom multithreaded not supported" + ", use 1 thread"); + num_threads = 1; + } + method = &Benchmark::WriteUniqueRandom; + } else if (name == Slice("overwrite")) { + fresh_db = false; + method = &Benchmark::WriteRandom; + } else if (name == Slice("fillsync")) { + fresh_db = true; + num_ /= 1000; + write_options_.sync = true; + method = &Benchmark::WriteRandom; + } else if (name == Slice("fill100K")) { + fresh_db = true; + num_ /= 1000; + value_size_ = 100 * 1000; + method = &Benchmark::WriteRandom; + } else if (name == Slice("readseq")) { + method = &Benchmark::ReadSequential; + } else if (name == Slice("readtocache")) { + method = &Benchmark::ReadSequential; + num_threads = 1; + reads_ = num_; + } else if (name == Slice("readreverse")) { + method = &Benchmark::ReadReverse; + } else if (name == Slice("readrandom")) { + method = &Benchmark::ReadRandom; + } else if (name == Slice("multireadrandom")) { + method = &Benchmark::MultiReadRandom; + } else if (name == Slice("readmissing")) { + ++key_size_; + method = &Benchmark::ReadRandom; + } else if (name == Slice("newiterator")) { + method = &Benchmark::IteratorCreation; + } else if (name == Slice("newiteratorwhilewriting")) { + num_threads++; // Add extra thread for writing + method = &Benchmark::IteratorCreationWhileWriting; + } else if (name == Slice("seekrandom")) { + method = &Benchmark::SeekRandom; + } else if (name == Slice("seekrandomwhilewriting")) { + num_threads++; // Add extra thread for writing + method = &Benchmark::SeekRandomWhileWriting; + } else if (name == Slice("readrandomsmall")) { + reads_ /= 1000; + method = &Benchmark::ReadRandom; + } else if (name == Slice("deleteseq")) { + method = &Benchmark::DeleteSeq; + } else if (name == Slice("deleterandom")) { + method = &Benchmark::DeleteRandom; + } else if (name == Slice("readwhilewriting")) { + num_threads++; // Add extra thread for writing + method = &Benchmark::ReadWhileWriting; + } else if (name == Slice("readrandomwriterandom")) { + method = &Benchmark::ReadRandomWriteRandom; + } else if (name == Slice("readrandommergerandom")) { + if (FLAGS_merge_operator.empty()) { + fprintf(stdout, "%-12s : skipped (--merge_operator is unknown)\n", + name.ToString().c_str()); + exit(1); + } + method = &Benchmark::ReadRandomMergeRandom; + } else if (name == Slice("updaterandom")) { + method = &Benchmark::UpdateRandom; + } else if (name == Slice("appendrandom")) { + method = &Benchmark::AppendRandom; + } else if (name == Slice("mergerandom")) { + if (FLAGS_merge_operator.empty()) { + fprintf(stdout, "%-12s : skipped (--merge_operator is unknown)\n", + name.ToString().c_str()); + exit(1); + } + method = &Benchmark::MergeRandom; + } else if (name == Slice("randomwithverify")) { + method = 
&Benchmark::RandomWithVerify; + } else if (name == Slice("compact")) { + method = &Benchmark::Compact; + } else if (name == Slice("crc32c")) { + method = &Benchmark::Crc32c; + } else if (name == Slice("xxhash")) { + method = &Benchmark::xxHash; + } else if (name == Slice("acquireload")) { + method = &Benchmark::AcquireLoad; + } else if (name == Slice("compress")) { + method = &Benchmark::Compress; + } else if (name == Slice("uncompress")) { + method = &Benchmark::Uncompress; + } else if (name == Slice("stats")) { + PrintStats("rocksdb.stats"); + } else if (name == Slice("levelstats")) { + PrintStats("rocksdb.levelstats"); + } else if (name == Slice("sstables")) { + PrintStats("rocksdb.sstables"); + } else { + if (name != Slice()) { // No error message for empty name + fprintf(stderr, "unknown benchmark '%s'\n", name.ToString().c_str()); + exit(1); + } + } + + if (fresh_db) { + if (FLAGS_use_existing_db) { + fprintf(stdout, "%-12s : skipped (--use_existing_db is true)\n", + name.ToString().c_str()); + method = nullptr; + } else { + if (db_ != nullptr) { + delete db_; + db_ = nullptr; + DestroyDB(FLAGS_db, Options()); + } + for (size_t i = 0; i < multi_dbs_.size(); i++) { + delete multi_dbs_[i]; + DestroyDB(GetDbNameForMultiple(FLAGS_db, i), Options()); + } + multi_dbs_.clear(); + } + Open(); + } + + if (method != nullptr) { + fprintf(stdout, "DB path: [%s]\n", FLAGS_db.c_str()); + RunBenchmark(num_threads, name, method); + } + } + if (FLAGS_statistics) { + fprintf(stdout, "STATISTICS:\n%s\n", dbstats->ToString().c_str()); + } + } + + private: + struct ThreadArg { + Benchmark* bm; + SharedState* shared; + ThreadState* thread; + void (Benchmark::*method)(ThreadState*); + }; + + static void ThreadBody(void* v) { + ThreadArg* arg = reinterpret_cast(v); + SharedState* shared = arg->shared; + ThreadState* thread = arg->thread; + { + MutexLock l(&shared->mu); + shared->num_initialized++; + if (shared->num_initialized >= shared->total) { + shared->cv.SignalAll(); + } + while (!shared->start) { + shared->cv.Wait(); + } + } + + SetPerfLevel(static_cast (shared->perf_level)); + thread->stats.Start(thread->tid); + (arg->bm->*(arg->method))(thread); + thread->stats.Stop(); + + { + MutexLock l(&shared->mu); + shared->num_done++; + if (shared->num_done >= shared->total) { + shared->cv.SignalAll(); + } + } + } + + void RunBenchmark(int n, Slice name, + void (Benchmark::*method)(ThreadState*)) { + SharedState shared; + shared.total = n; + shared.num_initialized = 0; + shared.num_done = 0; + shared.start = false; + + ThreadArg* arg = new ThreadArg[n]; + for (int i = 0; i < n; i++) { + arg[i].bm = this; + arg[i].method = method; + arg[i].shared = &shared; + arg[i].thread = new ThreadState(i); + arg[i].thread->shared = &shared; + FLAGS_env->StartThread(ThreadBody, &arg[i]); + } + + shared.mu.Lock(); + while (shared.num_initialized < n) { + shared.cv.Wait(); + } + + shared.start = true; + shared.cv.SignalAll(); + while (shared.num_done < n) { + shared.cv.Wait(); + } + shared.mu.Unlock(); + + // Stats for some threads can be excluded. 
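+    // (For example, the dedicated writer thread in the *whilewriting
+    // benchmarks calls SetExcludeFromMerge(), so the report below covers
+    // only the threads running the measured workload.)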
+ Stats merge_stats; + for (int i = 0; i < n; i++) { + merge_stats.Merge(arg[i].thread->stats); + } + merge_stats.Report(name); + + for (int i = 0; i < n; i++) { + delete arg[i].thread; + } + delete[] arg; + } + + void Crc32c(ThreadState* thread) { + // Checksum about 500MB of data total + const int size = 4096; + const char* label = "(4K per op)"; + std::string data(size, 'x'); + int64_t bytes = 0; + uint32_t crc = 0; + while (bytes < 500 * 1048576) { + crc = crc32c::Value(data.data(), size); + thread->stats.FinishedSingleOp(nullptr); + bytes += size; + } + // Print so result is not dead + fprintf(stderr, "... crc=0x%x\r", static_cast(crc)); + + thread->stats.AddBytes(bytes); + thread->stats.AddMessage(label); + } + + void xxHash(ThreadState* thread) { + // Checksum about 500MB of data total + const int size = 4096; + const char* label = "(4K per op)"; + std::string data(size, 'x'); + int64_t bytes = 0; + unsigned int xxh32 = 0; + while (bytes < 500 * 1048576) { + xxh32 = XXH32(data.data(), size, 0); + thread->stats.FinishedSingleOp(nullptr); + bytes += size; + } + // Print so result is not dead + fprintf(stderr, "... xxh32=0x%x\r", static_cast(xxh32)); + + thread->stats.AddBytes(bytes); + thread->stats.AddMessage(label); + } + + void AcquireLoad(ThreadState* thread) { + int dummy; + port::AtomicPointer ap(&dummy); + int count = 0; + void *ptr = nullptr; + thread->stats.AddMessage("(each op is 1000 loads)"); + while (count < 100000) { + for (int i = 0; i < 1000; i++) { + ptr = ap.Acquire_Load(); + } + count++; + thread->stats.FinishedSingleOp(nullptr); + } + if (ptr == nullptr) exit(1); // Disable unused variable warning. + } + + void Compress(ThreadState *thread) { + RandomGenerator gen; + Slice input = gen.Generate(Options().block_size); + int64_t bytes = 0; + int64_t produced = 0; + bool ok = true; + std::string compressed; + + // Compress 1G + while (ok && bytes < int64_t(1) << 30) { + switch (FLAGS_compression_type_e) { + case rocksdb::kSnappyCompression: + ok = port::Snappy_Compress(Options().compression_opts, input.data(), + input.size(), &compressed); + break; + case rocksdb::kZlibCompression: + ok = port::Zlib_Compress(Options().compression_opts, input.data(), + input.size(), &compressed); + break; + case rocksdb::kBZip2Compression: + ok = port::BZip2_Compress(Options().compression_opts, input.data(), + input.size(), &compressed); + break; + case rocksdb::kLZ4Compression: + ok = port::LZ4_Compress(Options().compression_opts, input.data(), + input.size(), &compressed); + break; + case rocksdb::kLZ4HCCompression: + ok = port::LZ4HC_Compress(Options().compression_opts, input.data(), + input.size(), &compressed); + break; + default: + ok = false; + } + produced += compressed.size(); + bytes += input.size(); + thread->stats.FinishedSingleOp(nullptr); + } + + if (!ok) { + thread->stats.AddMessage("(compression failure)"); + } else { + char buf[100]; + snprintf(buf, sizeof(buf), "(output: %.1f%%)", + (produced * 100.0) / bytes); + thread->stats.AddMessage(buf); + thread->stats.AddBytes(bytes); + } + } + + void Uncompress(ThreadState *thread) { + RandomGenerator gen; + Slice input = gen.Generate(Options().block_size); + std::string compressed; + + bool ok; + switch (FLAGS_compression_type_e) { + case rocksdb::kSnappyCompression: + ok = port::Snappy_Compress(Options().compression_opts, input.data(), + input.size(), &compressed); + break; + case rocksdb::kZlibCompression: + ok = port::Zlib_Compress(Options().compression_opts, input.data(), + input.size(), &compressed); + break; + case 
rocksdb::kBZip2Compression: + ok = port::BZip2_Compress(Options().compression_opts, input.data(), + input.size(), &compressed); + break; + case rocksdb::kLZ4Compression: + ok = port::LZ4_Compress(Options().compression_opts, input.data(), + input.size(), &compressed); + break; + case rocksdb::kLZ4HCCompression: + ok = port::LZ4HC_Compress(Options().compression_opts, input.data(), + input.size(), &compressed); + break; + default: + ok = false; + } + + int64_t bytes = 0; + int decompress_size; + while (ok && bytes < 1024 * 1048576) { + char *uncompressed = nullptr; + switch (FLAGS_compression_type_e) { + case rocksdb::kSnappyCompression: + // allocate here to make comparison fair + uncompressed = new char[input.size()]; + ok = port::Snappy_Uncompress(compressed.data(), compressed.size(), + uncompressed); + break; + case rocksdb::kZlibCompression: + uncompressed = port::Zlib_Uncompress( + compressed.data(), compressed.size(), &decompress_size); + ok = uncompressed != nullptr; + break; + case rocksdb::kBZip2Compression: + uncompressed = port::BZip2_Uncompress( + compressed.data(), compressed.size(), &decompress_size); + ok = uncompressed != nullptr; + break; + case rocksdb::kLZ4Compression: + uncompressed = port::LZ4_Uncompress( + compressed.data(), compressed.size(), &decompress_size); + ok = uncompressed != nullptr; + break; + case rocksdb::kLZ4HCCompression: + uncompressed = port::LZ4_Uncompress( + compressed.data(), compressed.size(), &decompress_size); + ok = uncompressed != nullptr; + break; + default: + ok = false; + } + delete[] uncompressed; + bytes += input.size(); + thread->stats.FinishedSingleOp(nullptr); + } + + if (!ok) { + thread->stats.AddMessage("(compression failure)"); + } else { + thread->stats.AddBytes(bytes); + } + } + + void Open() { + assert(db_ == nullptr); + Options options; + options.create_if_missing = !FLAGS_use_existing_db; + options.block_cache = cache_; + options.block_cache_compressed = compressed_cache_; + if (cache_ == nullptr) { + options.no_block_cache = true; + } + options.write_buffer_size = FLAGS_write_buffer_size; + options.max_write_buffer_number = FLAGS_max_write_buffer_number; + options.min_write_buffer_number_to_merge = + FLAGS_min_write_buffer_number_to_merge; + options.max_background_compactions = FLAGS_max_background_compactions; + options.max_background_flushes = FLAGS_max_background_flushes; + options.compaction_style = FLAGS_compaction_style_e; + options.block_size = FLAGS_block_size; + options.filter_policy = filter_policy_; + if (FLAGS_use_plain_table) { + options.prefix_extractor.reset( + NewFixedPrefixTransform(FLAGS_prefix_size)); + } + options.memtable_prefix_bloom_bits = FLAGS_memtable_bloom_bits; + options.bloom_locality = FLAGS_bloom_locality; + options.max_open_files = FLAGS_open_files; + options.statistics = dbstats; + options.env = FLAGS_env; + options.disableDataSync = FLAGS_disable_data_sync; + options.use_fsync = FLAGS_use_fsync; + options.wal_dir = FLAGS_wal_dir; + options.num_levels = FLAGS_num_levels; + options.target_file_size_base = FLAGS_target_file_size_base; + options.target_file_size_multiplier = FLAGS_target_file_size_multiplier; + options.max_bytes_for_level_base = FLAGS_max_bytes_for_level_base; + options.max_bytes_for_level_multiplier = + FLAGS_max_bytes_for_level_multiplier; + options.filter_deletes = FLAGS_filter_deletes; + if ((FLAGS_prefix_size == 0) && (FLAGS_rep_factory == kPrefixHash || + FLAGS_rep_factory == kHashLinkedList)) { + fprintf(stderr, "prefix_size should be non-zero if PrefixHash or " + 
"HashLinkedList memtablerep is used\n"); + exit(1); + } + switch (FLAGS_rep_factory) { + case kPrefixHash: + options.memtable_factory.reset(NewHashSkipListRepFactory( + FLAGS_hash_bucket_count)); + break; + case kSkipList: + // no need to do anything + break; + case kHashLinkedList: + options.memtable_factory.reset(NewHashLinkListRepFactory( + FLAGS_hash_bucket_count)); + break; + case kVectorRep: + options.memtable_factory.reset( + new VectorRepFactory + ); + break; + case kCuckoo: + options.memtable_factory.reset(NewHashCuckooRepFactory( + options.write_buffer_size, FLAGS_key_size + FLAGS_value_size)); + break; + } + if (FLAGS_use_plain_table) { + if (FLAGS_rep_factory != kPrefixHash && + FLAGS_rep_factory != kHashLinkedList) { + fprintf(stderr, "Waring: plain table is used with skipList\n"); + } + if (!FLAGS_mmap_read && !FLAGS_mmap_write) { + fprintf(stderr, "plain table format requires mmap to operate\n"); + exit(1); + } + + int bloom_bits_per_key = FLAGS_bloom_bits; + if (bloom_bits_per_key < 0) { + bloom_bits_per_key = 0; + } + options.table_factory = std::shared_ptr( + NewPlainTableFactory(FLAGS_key_size, bloom_bits_per_key, 0.75)); + } + if (FLAGS_max_bytes_for_level_multiplier_additional_v.size() > 0) { + if (FLAGS_max_bytes_for_level_multiplier_additional_v.size() != + (unsigned int)FLAGS_num_levels) { + fprintf(stderr, "Insufficient number of fanouts specified %d\n", + (int)FLAGS_max_bytes_for_level_multiplier_additional_v.size()); + exit(1); + } + options.max_bytes_for_level_multiplier_additional = + FLAGS_max_bytes_for_level_multiplier_additional_v; + } + options.level0_stop_writes_trigger = FLAGS_level0_stop_writes_trigger; + options.level0_file_num_compaction_trigger = + FLAGS_level0_file_num_compaction_trigger; + options.level0_slowdown_writes_trigger = + FLAGS_level0_slowdown_writes_trigger; + options.compression = FLAGS_compression_type_e; + options.compression_opts.level = FLAGS_compression_level; + options.WAL_ttl_seconds = FLAGS_wal_ttl_seconds; + options.WAL_size_limit_MB = FLAGS_wal_size_limit_MB; + if (FLAGS_min_level_to_compress >= 0) { + assert(FLAGS_min_level_to_compress <= FLAGS_num_levels); + options.compression_per_level.resize(FLAGS_num_levels); + for (int i = 0; i < FLAGS_min_level_to_compress; i++) { + options.compression_per_level[i] = kNoCompression; + } + for (int i = FLAGS_min_level_to_compress; + i < FLAGS_num_levels; i++) { + options.compression_per_level[i] = FLAGS_compression_type_e; + } + } + options.disable_seek_compaction = FLAGS_disable_seek_compaction; + options.delete_obsolete_files_period_micros = + FLAGS_delete_obsolete_files_period_micros; + options.soft_rate_limit = FLAGS_soft_rate_limit; + options.hard_rate_limit = FLAGS_hard_rate_limit; + options.rate_limit_delay_max_milliseconds = + FLAGS_rate_limit_delay_max_milliseconds; + options.table_cache_numshardbits = FLAGS_table_cache_numshardbits; + options.max_grandparent_overlap_factor = + FLAGS_max_grandparent_overlap_factor; + options.disable_auto_compactions = FLAGS_disable_auto_compactions; + options.source_compaction_factor = FLAGS_source_compaction_factor; + + // fill storage options + options.allow_os_buffer = FLAGS_bufferedio; + options.allow_mmap_reads = FLAGS_mmap_read; + options.allow_mmap_writes = FLAGS_mmap_write; + options.advise_random_on_open = FLAGS_advise_random_on_open; + options.access_hint_on_compaction_start = FLAGS_compaction_fadvice_e; + options.use_adaptive_mutex = FLAGS_use_adaptive_mutex; + options.bytes_per_sync = FLAGS_bytes_per_sync; + + // merge operator 
options + options.merge_operator = MergeOperators::CreateFromStringId( + FLAGS_merge_operator); + if (options.merge_operator == nullptr && !FLAGS_merge_operator.empty()) { + fprintf(stderr, "invalid merge operator: %s\n", + FLAGS_merge_operator.c_str()); + exit(1); + } + options.max_successive_merges = FLAGS_max_successive_merges; + + // set universal style compaction configurations, if applicable + if (FLAGS_universal_size_ratio != 0) { + options.compaction_options_universal.size_ratio = + FLAGS_universal_size_ratio; + } + if (FLAGS_universal_min_merge_width != 0) { + options.compaction_options_universal.min_merge_width = + FLAGS_universal_min_merge_width; + } + if (FLAGS_universal_max_merge_width != 0) { + options.compaction_options_universal.max_merge_width = + FLAGS_universal_max_merge_width; + } + if (FLAGS_universal_max_size_amplification_percent != 0) { + options.compaction_options_universal.max_size_amplification_percent = + FLAGS_universal_max_size_amplification_percent; + } + if (FLAGS_universal_compression_size_percent != -1) { + options.compaction_options_universal.compression_size_percent = + FLAGS_universal_compression_size_percent; + } + + if (FLAGS_num_multi_db <= 1) { + OpenDb(options, FLAGS_db, &db_); + } else { + multi_dbs_.clear(); + for (int i = 0; i < FLAGS_num_multi_db; i++) { + DB* db; + OpenDb(options, GetDbNameForMultiple(FLAGS_db, i), &db); + multi_dbs_.push_back(db); + } + } + if (FLAGS_min_level_to_compress >= 0) { + options.compression_per_level.clear(); + } + } + + void OpenDb(Options options, std::string db_name, DB** db) { + Status s; + if(FLAGS_readonly) { + s = DB::OpenForReadOnly(options, db_name, db); + } else { + s = DB::Open(options, db_name, db); + } + if (!s.ok()) { + fprintf(stderr, "open error: %s\n", s.ToString().c_str()); + exit(1); + } + } + + enum WriteMode { + RANDOM, SEQUENTIAL, UNIQUE_RANDOM + }; + + void WriteSeq(ThreadState* thread) { + DoWrite(thread, SEQUENTIAL); + } + + void WriteRandom(ThreadState* thread) { + DoWrite(thread, RANDOM); + } + + void WriteUniqueRandom(ThreadState* thread) { + DoWrite(thread, UNIQUE_RANDOM); + } + + class KeyGenerator { + public: + KeyGenerator(Random64* rand, WriteMode mode, + uint64_t num, uint64_t num_per_set = 64 * 1024) + : rand_(rand), + mode_(mode), + num_(num), + next_(0) { + if (mode_ == UNIQUE_RANDOM) { + // NOTE: if memory consumption of this approach becomes a concern, + // we can either break it into pieces and only random shuffle a section + // each time. Alternatively, use a bit map implementation + // (https://reviews.facebook.net/differential/diff/54627/) + values_.resize(num_); + for (uint64_t i = 0; i < num_; ++i) { + values_[i] = i; + } + std::shuffle(values_.begin(), values_.end(), + std::default_random_engine(FLAGS_seed)); + } + } + + uint64_t Next() { + switch (mode_) { + case SEQUENTIAL: + return next_++; + case RANDOM: + return rand_->Next() % num_; + case UNIQUE_RANDOM: + return values_[next_++]; + } + assert(false); + return std::numeric_limits::max(); + } + + private: + Random64* rand_; + WriteMode mode_; + const uint64_t num_; + uint64_t next_; + std::vector values_; + }; + + DB* SelectDB(ThreadState* thread) { + if (db_ != nullptr) { + return db_; + } else { + return multi_dbs_[thread->rand.Next() % multi_dbs_.size()]; + } + } + + void DoWrite(ThreadState* thread, WriteMode write_mode) { + const int test_duration = write_mode == RANDOM ? FLAGS_duration : 0; + const int64_t num_ops = writes_ == 0 ? 
num_ : writes_;
+
+    size_t num_key_gens = 1;
+    if (db_ == nullptr) {
+      num_key_gens = multi_dbs_.size();
+    }
+    std::vector<std::unique_ptr<KeyGenerator>> key_gens(num_key_gens);
+    Duration duration(test_duration, num_ops * num_key_gens);
+    for (size_t i = 0; i < num_key_gens; i++) {
+      key_gens[i].reset(new KeyGenerator(&(thread->rand), write_mode, num_ops));
+    }
+
+    if (num_ != FLAGS_num) {
+      char msg[100];
+      snprintf(msg, sizeof(msg), "(%" PRIu64 " ops)", num_);
+      thread->stats.AddMessage(msg);
+    }
+
+    RandomGenerator gen;
+    WriteBatch batch;
+    Status s;
+    int64_t bytes = 0;
+
+    Slice key = AllocateKey();
+    std::unique_ptr<const char[]> key_guard(key.data());
+    while (!duration.Done(entries_per_batch_)) {
+      size_t id = 0;
+      DB* db_to_write = db_;
+      if (db_to_write == nullptr) {
+        id = thread->rand.Next() % num_key_gens;
+        db_to_write = multi_dbs_[id];
+      }
+      batch.Clear();
+      for (int64_t j = 0; j < entries_per_batch_; j++) {
+        GenerateKeyFromInt(key_gens[id]->Next(), FLAGS_num, &key);
+        batch.Put(key, gen.Generate(value_size_));
+        bytes += value_size_ + key_size_;
+        thread->stats.FinishedSingleOp(db_to_write);
+      }
+      s = db_to_write->Write(write_options_, &batch);
+      if (!s.ok()) {
+        fprintf(stderr, "put error: %s\n", s.ToString().c_str());
+        exit(1);
+      }
+    }
+    thread->stats.AddBytes(bytes);
+  }
+
+  void ReadSequential(ThreadState* thread) {
+    if (db_ != nullptr) {
+      ReadSequential(thread, db_);
+    } else {
+      for (DB* db : multi_dbs_) {
+        ReadSequential(thread, db);
+      }
+    }
+  }
+
+  void ReadSequential(ThreadState* thread, DB* db) {
+    Iterator* iter = db->NewIterator(ReadOptions(FLAGS_verify_checksum, true));
+    int64_t i = 0;
+    int64_t bytes = 0;
+    for (iter->SeekToFirst(); i < reads_ && iter->Valid(); iter->Next()) {
+      bytes += iter->key().size() + iter->value().size();
+      thread->stats.FinishedSingleOp(db);
+      ++i;
+    }
+    delete iter;
+    thread->stats.AddBytes(bytes);
+  }
+
+  void ReadReverse(ThreadState* thread) {
+    if (db_ != nullptr) {
+      ReadReverse(thread, db_);
+    } else {
+      for (DB* db : multi_dbs_) {
+        ReadReverse(thread, db);
+      }
+    }
+  }
+
+  void ReadReverse(ThreadState* thread, DB* db) {
+    Iterator* iter = db->NewIterator(ReadOptions(FLAGS_verify_checksum, true));
+    int64_t i = 0;
+    int64_t bytes = 0;
+    for (iter->SeekToLast(); i < reads_ && iter->Valid(); iter->Prev()) {
+      bytes += iter->key().size() + iter->value().size();
+      thread->stats.FinishedSingleOp(db);
+      ++i;
+    }
+    delete iter;
+    thread->stats.AddBytes(bytes);
+  }
+
+  void ReadRandom(ThreadState* thread) {
+    int64_t read = 0;
+    int64_t found = 0;
+    ReadOptions options(FLAGS_verify_checksum, true);
+    Slice key = AllocateKey();
+    std::unique_ptr<const char[]> key_guard(key.data());
+    std::string value;
+
+    Duration duration(FLAGS_duration, reads_);
+    while (!duration.Done(1)) {
+      DB* db = SelectDB(thread);
+      GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key);
+      read++;
+      if (db->Get(options, key, &value).ok()) {
+        found++;
+      }
+      thread->stats.FinishedSingleOp(db);
+    }
+
+    char msg[100];
+    snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)\n",
+             found, read);
+
+    thread->stats.AddMessage(msg);
+
+    if (FLAGS_perf_level > 0) {
+      thread->stats.AddMessage(perf_context.ToString());
+    }
+  }
+
+  // Calls MultiGet over a list of keys from a random distribution.
+  // Reports the number of keys found via the per-thread stats message. 
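+  // Each iteration draws entries_per_batch_ random keys and issues one
+  // MultiGet covering the whole batch, so read/found advance by the batch
+  // size at a time.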
+ void MultiReadRandom(ThreadState* thread) { + int64_t read = 0; + int64_t found = 0; + ReadOptions options(FLAGS_verify_checksum, true); + std::vector keys; + std::vector values(entries_per_batch_); + while (static_cast(keys.size()) < entries_per_batch_) { + keys.push_back(AllocateKey()); + } + + Duration duration(FLAGS_duration, reads_); + while (!duration.Done(1)) { + DB* db = SelectDB(thread); + for (int64_t i = 0; i < entries_per_batch_; ++i) { + GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, + FLAGS_num, &keys[i]); + } + std::vector statuses = db->MultiGet(options, keys, &values); + assert(static_cast(statuses.size()) == entries_per_batch_); + + read += entries_per_batch_; + for (int64_t i = 0; i < entries_per_batch_; ++i) { + if (statuses[i].ok()) { + ++found; + } + } + } + for (auto& k : keys) { + delete k.data(); + } + + char msg[100]; + snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)", + found, read); + thread->stats.AddMessage(msg); + } + + void IteratorCreation(ThreadState* thread) { + Duration duration(FLAGS_duration, reads_); + ReadOptions options(FLAGS_verify_checksum, true); + while (!duration.Done(1)) { + DB* db = SelectDB(thread); + Iterator* iter = db->NewIterator(options); + delete iter; + thread->stats.FinishedSingleOp(db); + } + } + + void IteratorCreationWhileWriting(ThreadState* thread) { + if (thread->tid > 0) { + IteratorCreation(thread); + } else { + BGWriter(thread); + } + } + + void SeekRandom(ThreadState* thread) { + int64_t read = 0; + int64_t found = 0; + ReadOptions options(FLAGS_verify_checksum, true); + options.tailing = FLAGS_use_tailing_iterator; + + Iterator* single_iter = nullptr; + std::vector multi_iters; + if (db_ != nullptr) { + single_iter = db_->NewIterator(options); + } else { + for (DB* db : multi_dbs_) { + multi_iters.push_back(db->NewIterator(options)); + } + } + uint64_t last_refresh = FLAGS_env->NowMicros(); + + Slice key = AllocateKey(); + std::unique_ptr key_guard(key.data()); + + Duration duration(FLAGS_duration, reads_); + while (!duration.Done(1)) { + if (!FLAGS_use_tailing_iterator && FLAGS_iter_refresh_interval_us >= 0) { + uint64_t now = FLAGS_env->NowMicros(); + if (now - last_refresh > (uint64_t)FLAGS_iter_refresh_interval_us) { + if (db_ != nullptr) { + delete single_iter; + single_iter = db_->NewIterator(options); + } else { + for (auto iter : multi_iters) { + delete iter; + } + multi_iters.clear(); + for (DB* db : multi_dbs_) { + multi_iters.push_back(db->NewIterator(options)); + } + } + } + last_refresh = now; + } + // Pick a Iterator to use + Iterator* iter_to_use = single_iter; + if (single_iter == nullptr) { + iter_to_use = multi_iters[thread->rand.Next() % multi_iters.size()]; + } + + GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key); + iter_to_use->Seek(key); + read++; + if (iter_to_use->Valid() && iter_to_use->key().compare(key) == 0) { + found++; + } + thread->stats.FinishedSingleOp(db_); + } + delete single_iter; + for (auto iter : multi_iters) { + delete iter; + } + + char msg[100]; + snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)\n", + found, read); + thread->stats.AddMessage(msg); + if (FLAGS_perf_level > 0) { + thread->stats.AddMessage(perf_context.ToString()); + } + } + + void SeekRandomWhileWriting(ThreadState* thread) { + if (thread->tid > 0) { + SeekRandom(thread); + } else { + BGWriter(thread); + } + } + + void DoDelete(ThreadState* thread, bool seq) { + WriteBatch batch; + Duration duration(seq ? 
+  void DoDelete(ThreadState* thread, bool seq) {
+    WriteBatch batch;
+    Duration duration(seq ? 0 : FLAGS_duration, num_);
+    int64_t i = 0;
+    Slice key = AllocateKey();
+    std::unique_ptr<const char[]> key_guard(key.data());
+
+    while (!duration.Done(entries_per_batch_)) {
+      DB* db = SelectDB(thread);
+      batch.Clear();
+      for (int64_t j = 0; j < entries_per_batch_; ++j) {
+        const int64_t k = seq ? i + j : (thread->rand.Next() % FLAGS_num);
+        GenerateKeyFromInt(k, FLAGS_num, &key);
+        batch.Delete(key);
+        thread->stats.FinishedSingleOp(db);
+      }
+      auto s = db->Write(write_options_, &batch);
+      if (!s.ok()) {
+        fprintf(stderr, "del error: %s\n", s.ToString().c_str());
+        exit(1);
+      }
+      i += entries_per_batch_;
+    }
+  }
+
+  void DeleteSeq(ThreadState* thread) {
+    DoDelete(thread, true);
+  }
+
+  void DeleteRandom(ThreadState* thread) {
+    DoDelete(thread, false);
+  }
+
+  void ReadWhileWriting(ThreadState* thread) {
+    if (thread->tid > 0) {
+      ReadRandom(thread);
+    } else {
+      BGWriter(thread);
+    }
+  }
+
+  void BGWriter(ThreadState* thread) {
+    // Special thread that keeps writing until other threads are done.
+    RandomGenerator gen;
+    double last = FLAGS_env->NowMicros();
+    int writes_per_second_by_10 = 0;
+    int num_writes = 0;
+
+    // The --writes_per_second rate limit is enforced per 100 millisecond
+    // interval to avoid a burst of writes at the start of each second.
+    if (FLAGS_writes_per_second > 0)
+      writes_per_second_by_10 = FLAGS_writes_per_second / 10;
+
+    // Don't merge stats from this thread with the readers.
+    thread->stats.SetExcludeFromMerge();
+
+    Slice key = AllocateKey();
+    std::unique_ptr<const char[]> key_guard(key.data());
+
+    while (true) {
+      DB* db = SelectDB(thread);
+      {
+        MutexLock l(&thread->shared->mu);
+        if (thread->shared->num_done + 1 >= thread->shared->num_initialized) {
+          // Other threads have finished
+          break;
+        }
+      }
+
+      GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key);
+      Status s = db->Put(write_options_, key, gen.Generate(value_size_));
+      if (!s.ok()) {
+        fprintf(stderr, "put error: %s\n", s.ToString().c_str());
+        exit(1);
+      }
+      thread->stats.FinishedSingleOp(db);
+
+      ++num_writes;
+      if (writes_per_second_by_10 && num_writes >= writes_per_second_by_10) {
+        double now = FLAGS_env->NowMicros();
+        double usecs_since_last = now - last;
+
+        num_writes = 0;
+        last = now;
+
+        if (usecs_since_last < 100000.0) {
+          FLAGS_env->SleepForMicroseconds(
+              static_cast<int>(100000.0 - usecs_since_last));
+          last = FLAGS_env->NowMicros();
+        }
+      }
+    }
+  }
+
+  // Given a key K and value V, this puts (K+"0", V), (K+"1", V), (K+"2", V)
+  // in the DB atomically, i.e. in a single batch. Also refer to GetMany.
+  Status PutMany(DB* db, const WriteOptions& writeoptions, const Slice& key,
+                 const Slice& value) {
+    std::string suffixes[3] = {"2", "1", "0"};
+    std::string keys[3];
+
+    WriteBatch batch;
+    Status s;
+    for (int i = 0; i < 3; i++) {
+      keys[i] = key.ToString() + suffixes[i];
+      batch.Put(keys[i], value);
+    }
+
+    s = db->Write(writeoptions, &batch);
+    return s;
+  }
+
+  // Given a key K, this deletes K+"0", K+"1" and K+"2" from the DB
+  // atomically, i.e. in a single batch. Also refer to GetMany.
+  Status DeleteMany(DB* db, const WriteOptions& writeoptions,
+                    const Slice& key) {
+    std::string suffixes[3] = {"1", "2", "0"};
+    std::string keys[3];
+
+    WriteBatch batch;
+    Status s;
+    for (int i = 0; i < 3; i++) {
+      keys[i] = key.ToString() + suffixes[i];
+      batch.Delete(keys[i]);
+    }
+
+    s = db->Write(writeoptions, &batch);
+    return s;
+  }
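+
+  // Because a WriteBatch is applied atomically, a reader holding a snapshot
+  // sees either all three suffixed keys written by PutMany or none of them;
+  // GetMany below relies on exactly this invariant. Sketch:
+  //   WriteBatch b;
+  //   b.Put("k0", v); b.Put("k1", v); b.Put("k2", v);
+  //   db->Write(WriteOptions(), &b);  // all-or-nothing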
+  // Given a key K and value V, this gets values for K+"0", K+"1" and K+"2"
+  // in the same snapshot, and verifies that all the values are identical.
+  // ASSUMES that PutMany was used to put (K, V) into the DB.
+  Status GetMany(DB* db, const ReadOptions& readoptions, const Slice& key,
+                 std::string* value) {
+    std::string suffixes[3] = {"0", "1", "2"};
+    std::string keys[3];
+    Slice key_slices[3];
+    std::string values[3];
+    ReadOptions readoptionscopy = readoptions;
+    readoptionscopy.snapshot = db->GetSnapshot();
+    Status s;
+    for (int i = 0; i < 3; i++) {
+      keys[i] = key.ToString() + suffixes[i];
+      key_slices[i] = keys[i];
+      s = db->Get(readoptionscopy, key_slices[i], value);
+      if (!s.ok() && !s.IsNotFound()) {
+        fprintf(stderr, "get error: %s\n", s.ToString().c_str());
+        values[i] = "";
+        // we continue after error rather than exiting so that we can
+        // find more errors if any
+      } else if (s.IsNotFound()) {
+        values[i] = "";
+      } else {
+        values[i] = *value;
+      }
+    }
+    db->ReleaseSnapshot(readoptionscopy.snapshot);
+
+    if ((values[0] != values[1]) || (values[1] != values[2])) {
+      fprintf(stderr, "inconsistent values for key %s: %s, %s, %s\n",
+              key.ToString().c_str(), values[0].c_str(), values[1].c_str(),
+              values[2].c_str());
+      // we continue after error rather than exiting so that we can
+      // find more errors if any
+    }
+
+    return s;
+  }
+
+  // Differs from readrandomwriterandom in the following ways:
+  // (a) Uses GetMany/PutMany to read/write key values. Refer to those funcs.
+  // (b) Does deletes as well (per FLAGS_deletepercent)
+  // (c) In order to achieve a high % of 'found' during lookups, and to do
+  //     multiple writes (including puts and deletes), it uses up to
+  //     FLAGS_numdistinct distinct keys instead of FLAGS_num distinct keys.
+  // (d) Does not have a MultiGet option.
+  void RandomWithVerify(ThreadState* thread) {
+    ReadOptions options(FLAGS_verify_checksum, true);
+    RandomGenerator gen;
+    std::string value;
+    int64_t found = 0;
+    int get_weight = 0;
+    int put_weight = 0;
+    int delete_weight = 0;
+    int64_t gets_done = 0;
+    int64_t puts_done = 0;
+    int64_t deletes_done = 0;
+
+    Slice key = AllocateKey();
+    std::unique_ptr<const char[]> key_guard(key.data());
+
+    // the number of iterations is the larger of read_ or write_
+    for (int64_t i = 0; i < readwrites_; i++) {
+      DB* db = SelectDB(thread);
+      if (get_weight == 0 && put_weight == 0 && delete_weight == 0) {
+        // one batch completed, reinitialize for next batch
+        get_weight = FLAGS_readwritepercent;
+        delete_weight = FLAGS_deletepercent;
+        put_weight = 100 - get_weight - delete_weight;
+      }
+      GenerateKeyFromInt(thread->rand.Next() % FLAGS_numdistinct,
+                         FLAGS_numdistinct, &key);
+      if (get_weight > 0) {
+        // do all the gets first
+        Status s = GetMany(db, options, key, &value);
+        if (!s.ok() && !s.IsNotFound()) {
+          fprintf(stderr, "getmany error: %s\n", s.ToString().c_str());
+          // we continue after error rather than exiting so that we can
+          // find more errors if any
+        } else if (!s.IsNotFound()) {
+          found++;
+        }
+        get_weight--;
+        gets_done++;
+      } else if (put_weight > 0) {
+        // then do the corresponding number of puts
+        // for all the gets we have done earlier
+        Status s = PutMany(db, write_options_, key, gen.Generate(value_size_));
+        if (!s.ok()) {
+          fprintf(stderr, "putmany error: %s\n", s.ToString().c_str());
+          exit(1);
+        }
+        put_weight--;
+        puts_done++;
+      } else if (delete_weight > 0) {
+        Status s = DeleteMany(db, write_options_, key);
+        if (!s.ok()) {
+          fprintf(stderr, "deletemany error: %s\n", s.ToString().c_str());
+          exit(1);
+        }
+        delete_weight--;
+        deletes_done++;
+      }
+
+      thread->stats.FinishedSingleOp(db);
+    }
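+    // Cycle recap: with, e.g., --readwritepercent=60 and --deletepercent=2,
+    // each pass of 100 iterations performs 60 GetMany calls, then 38 PutMany,
+    // then 2 DeleteMany before the weights reset.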
PRIu64 " put:%" PRIu64 " del:%" PRIu64 " total:%" \ + PRIu64 " found:%" PRIu64 ")", + gets_done, puts_done, deletes_done, readwrites_, found); + thread->stats.AddMessage(msg); + } + + // This is different from ReadWhileWriting because it does not use + // an extra thread. + void ReadRandomWriteRandom(ThreadState* thread) { + ReadOptions options(FLAGS_verify_checksum, true); + RandomGenerator gen; + std::string value; + int64_t found = 0; + int get_weight = 0; + int put_weight = 0; + int64_t reads_done = 0; + int64_t writes_done = 0; + Duration duration(FLAGS_duration, readwrites_); + + Slice key = AllocateKey(); + std::unique_ptr key_guard(key.data()); + + // the number of iterations is the larger of read_ or write_ + while (!duration.Done(1)) { + DB* db = SelectDB(thread); + GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key); + if (get_weight == 0 && put_weight == 0) { + // one batch completed, reinitialize for next batch + get_weight = FLAGS_readwritepercent; + put_weight = 100 - get_weight; + } + if (get_weight > 0) { + // do all the gets first + Status s = db->Get(options, key, &value); + if (!s.ok() && !s.IsNotFound()) { + fprintf(stderr, "get error: %s\n", s.ToString().c_str()); + // we continue after error rather than exiting so that we can + // find more errors if any + } else if (!s.IsNotFound()) { + found++; + } + get_weight--; + reads_done++; + } else if (put_weight > 0) { + // then do all the corresponding number of puts + // for all the gets we have done earlier + Status s = db->Put(write_options_, key, gen.Generate(value_size_)); + if (!s.ok()) { + fprintf(stderr, "put error: %s\n", s.ToString().c_str()); + exit(1); + } + put_weight--; + writes_done++; + } + thread->stats.FinishedSingleOp(db); + } + char msg[100]; + snprintf(msg, sizeof(msg), "( reads:%" PRIu64 " writes:%" PRIu64 \ + " total:%" PRIu64 " found:%" PRIu64 ")", + reads_done, writes_done, readwrites_, found); + thread->stats.AddMessage(msg); + } + + // + // Read-modify-write for random keys + void UpdateRandom(ThreadState* thread) { + ReadOptions options(FLAGS_verify_checksum, true); + RandomGenerator gen; + std::string value; + int64_t found = 0; + Duration duration(FLAGS_duration, readwrites_); + + Slice key = AllocateKey(); + std::unique_ptr key_guard(key.data()); + // the number of iterations is the larger of read_ or write_ + while (!duration.Done(1)) { + DB* db = SelectDB(thread); + GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key); + + if (db->Get(options, key, &value).ok()) { + found++; + } + + Status s = db->Put(write_options_, key, gen.Generate(value_size_)); + if (!s.ok()) { + fprintf(stderr, "put error: %s\n", s.ToString().c_str()); + exit(1); + } + thread->stats.FinishedSingleOp(db); + } + char msg[100]; + snprintf(msg, sizeof(msg), + "( updates:%" PRIu64 " found:%" PRIu64 ")", readwrites_, found); + thread->stats.AddMessage(msg); + } + + // Read-modify-write for random keys. + // Each operation causes the key grow by value_size (simulating an append). 
+  // Read-modify-write for random keys.
+  // Each operation causes the value to grow by value_size (simulating an
+  // append). Generally used for benchmarking against merges of similar type.
+  void AppendRandom(ThreadState* thread) {
+    ReadOptions options(FLAGS_verify_checksum, true);
+    RandomGenerator gen;
+    std::string value;
+    int64_t found = 0;
+
+    Slice key = AllocateKey();
+    std::unique_ptr<const char[]> key_guard(key.data());
+    // The number of iterations is the larger of read_ or write_
+    Duration duration(FLAGS_duration, readwrites_);
+    while (!duration.Done(1)) {
+      DB* db = SelectDB(thread);
+      GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key);
+
+      // Get the existing value
+      if (db->Get(options, key, &value).ok()) {
+        found++;
+      } else {
+        // If not existing, then just assume an empty string of data
+        value.clear();
+      }
+
+      // Update the value (by appending data)
+      Slice operand = gen.Generate(value_size_);
+      if (value.size() > 0) {
+        // Use a delimiter to match the semantics for StringAppendOperator
+        value.append(1, ',');
+      }
+      value.append(operand.data(), operand.size());
+
+      // Write back to the database
+      Status s = db->Put(write_options_, key, value);
+      if (!s.ok()) {
+        fprintf(stderr, "put error: %s\n", s.ToString().c_str());
+        exit(1);
+      }
+      thread->stats.FinishedSingleOp(db);
+    }
+
+    char msg[100];
+    snprintf(msg, sizeof(msg), "( updates:%" PRIu64 " found:%" PRIu64 ")",
+             readwrites_, found);
+    thread->stats.AddMessage(msg);
+  }
+
+  // Read-modify-write for random keys (using MergeOperator).
+  // The merge operator to use should be defined by FLAGS_merge_operator.
+  // Adjust FLAGS_value_size so that the operands are reasonable for this
+  // operator. Assumes that the merge operator is non-null (i.e. well-defined).
+  //
+  // For example, use FLAGS_merge_operator="uint64add" and FLAGS_value_size=8
+  // to simulate random additions over 64-bit integers using merge.
+  //
+  // The number of merges on the same key can be controlled by adjusting
+  // FLAGS_merge_keys.
+  void MergeRandom(ThreadState* thread) {
+    RandomGenerator gen;
+
+    Slice key = AllocateKey();
+    std::unique_ptr<const char[]> key_guard(key.data());
+    // The number of iterations is the larger of read_ or write_
+    Duration duration(FLAGS_duration, readwrites_);
+    while (!duration.Done(1)) {
+      DB* db = SelectDB(thread);
+      GenerateKeyFromInt(thread->rand.Next() % merge_keys_, merge_keys_, &key);
+
+      Status s = db->Merge(write_options_, key, gen.Generate(value_size_));
+
+      if (!s.ok()) {
+        fprintf(stderr, "merge error: %s\n", s.ToString().c_str());
+        exit(1);
+      }
+      thread->stats.FinishedSingleOp(db);
+    }
+
+    // Print some statistics
+    char msg[100];
+    snprintf(msg, sizeof(msg), "( updates:%" PRIu64 ")", readwrites_);
+    thread->stats.AddMessage(msg);
+  }
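+
+  // Worked example of the uint64add case mentioned above (assuming the
+  // uint64add operator from utilities/merge_operators, which treats each
+  // 8-byte operand as a little-endian uint64):
+  //   std::string v;
+  //   PutFixed64(&v, 5);
+  //   db->Put(WriteOptions(), "ctr", v);
+  //   v.clear();
+  //   PutFixed64(&v, 3);
+  //   db->Merge(WriteOptions(), "ctr", v);
+  //   // A subsequent Get("ctr") decodes to 8.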
+  // Read and merge random keys. The amount of reads and merges are controlled
+  // by adjusting FLAGS_num and FLAGS_mergereadpercent. The number of distinct
+  // keys (and thus also the number of reads and merges on the same key) can be
+  // adjusted with FLAGS_merge_keys.
+  //
+  // As with MergeRandom, the merge operator to use should be defined by
+  // FLAGS_merge_operator.
+  void ReadRandomMergeRandom(ThreadState* thread) {
+    ReadOptions options(FLAGS_verify_checksum, true);
+    RandomGenerator gen;
+    std::string value;
+    int64_t num_hits = 0;
+    int64_t num_gets = 0;
+    int64_t num_merges = 0;
+    size_t max_length = 0;
+
+    Slice key = AllocateKey();
+    std::unique_ptr<const char[]> key_guard(key.data());
+    // the number of iterations is the larger of read_ or write_
+    Duration duration(FLAGS_duration, readwrites_);
+    while (!duration.Done(1)) {
+      DB* db = SelectDB(thread);
+      GenerateKeyFromInt(thread->rand.Next() % merge_keys_, merge_keys_, &key);
+
+      bool do_merge = int(thread->rand.Next() % 100) < FLAGS_mergereadpercent;
+
+      if (do_merge) {
+        Status s = db->Merge(write_options_, key, gen.Generate(value_size_));
+        if (!s.ok()) {
+          fprintf(stderr, "merge error: %s\n", s.ToString().c_str());
+          exit(1);
+        }
+        num_merges++;
+      } else {
+        Status s = db->Get(options, key, &value);
+        if (value.length() > max_length) {
+          max_length = value.length();
+        }
+
+        if (!s.ok() && !s.IsNotFound()) {
+          fprintf(stderr, "get error: %s\n", s.ToString().c_str());
+          // we continue after error rather than exiting so that we can
+          // find more errors if any
+        } else if (!s.IsNotFound()) {
+          num_hits++;
+        }
+        num_gets++;
+      }
+
+      thread->stats.FinishedSingleOp(db);
+    }
+
+    char msg[100];
+    snprintf(msg, sizeof(msg),
+             "(reads:%" PRIu64 " merges:%" PRIu64 " total:%" PRIu64
+             " hits:%" PRIu64 " maxlength:%zu)",
+             num_gets, num_merges, readwrites_, num_hits, max_length);
+    thread->stats.AddMessage(msg);
+  }
+
+  void Compact(ThreadState* thread) {
+    DB* db = SelectDB(thread);
+    db->CompactRange(nullptr, nullptr);
+  }
+
+  void PrintStats(const char* key) {
+    if (db_ != nullptr) {
+      PrintStats(db_, key, false);
+    }
+    for (DB* db : multi_dbs_) {
+      PrintStats(db, key, true);
+    }
+  }
+
+  void PrintStats(DB* db, const char* key, bool print_header = false) {
+    if (print_header) {
+      fprintf(stdout, "\n==== DB: %s ====\n", db->GetName().c_str());
+    }
+    std::string stats;
+    if (!db->GetProperty(key, &stats)) {
+      stats = "(failed)";
+    }
+    fprintf(stdout, "\n%s\n", stats.c_str());
+  }
+};
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  rocksdb::port::InstallStackTraceHandler();
+  SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) +
+                  " [OPTIONS]...");
+  ParseCommandLineFlags(&argc, &argv, true);
+
+  FLAGS_compaction_style_e = (rocksdb::CompactionStyle) FLAGS_compaction_style;
+  if (FLAGS_statistics) {
+    dbstats = rocksdb::CreateDBStatistics();
+  }
+
+  std::vector<std::string> fanout =
+      rocksdb::stringSplit(FLAGS_max_bytes_for_level_multiplier_additional,
+                           ',');
+  for (unsigned int j = 0; j < fanout.size(); j++) {
+    FLAGS_max_bytes_for_level_multiplier_additional_v.push_back(
+        std::stoi(fanout[j]));
+  }
+
+  FLAGS_compression_type_e =
+      StringToCompressionType(FLAGS_compression_type.c_str());
+
+  if (!FLAGS_hdfs.empty()) {
+    FLAGS_env = new rocksdb::HdfsEnv(FLAGS_hdfs);
+  }
+
+  if (!strcasecmp(FLAGS_compaction_fadvice.c_str(), "NONE"))
+    FLAGS_compaction_fadvice_e = rocksdb::Options::NONE;
+  else if (!strcasecmp(FLAGS_compaction_fadvice.c_str(), "NORMAL"))
+    FLAGS_compaction_fadvice_e = rocksdb::Options::NORMAL;
+  else if (!strcasecmp(FLAGS_compaction_fadvice.c_str(), "SEQUENTIAL"))
+    FLAGS_compaction_fadvice_e = rocksdb::Options::SEQUENTIAL;
+  else if (!strcasecmp(FLAGS_compaction_fadvice.c_str(), "WILLNEED"))
+    FLAGS_compaction_fadvice_e = rocksdb::Options::WILLNEED;
+  else {
+    fprintf(stdout, "Unknown compaction fadvice: %s\n",
+            FLAGS_compaction_fadvice.c_str());
+  }
+
+  FLAGS_rep_factory = StringToRepFactory(FLAGS_memtablerep.c_str());
+
+  // The number of background threads should be at least as large as the
+  // max number of concurrent compactions.
+  FLAGS_env->SetBackgroundThreads(FLAGS_max_background_compactions);
+  // Choose a location for the test database if none given with --db=<path>
+  if (FLAGS_db.empty()) {
+    std::string default_db_path;
+    rocksdb::Env::Default()->GetTestDirectory(&default_db_path);
+    default_db_path += "/dbbench";
+    FLAGS_db = default_db_path;
+  }
+
+  rocksdb::Benchmark benchmark;
+  benchmark.Run();
+  return 0;
+}
+
+#endif  // GFLAGS
diff --git a/db/db_filesnapshot.cc b/db/db_filesnapshot.cc
new file mode 100644
index 0000000000..582355ccdf
--- /dev/null
+++ b/db/db_filesnapshot.cc
@@ -0,0 +1,179 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2012 Facebook.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef ROCKSDB_LITE
+
+#define __STDC_FORMAT_MACROS
+#include <inttypes.h>
+#include <algorithm>
+#include <string>
+#include <stdint.h>
+#include "db/db_impl.h"
+#include "db/filename.h"
+#include "db/version_set.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "port/port.h"
+#include "util/mutexlock.h"
+#include "util/sync_point.h"
+
+namespace rocksdb {
+
+Status DBImpl::DisableFileDeletions() {
+  MutexLock l(&mutex_);
+  ++disable_delete_obsolete_files_;
+  if (disable_delete_obsolete_files_ == 1) {
+    Log(options_.info_log, "File Deletions Disabled");
+  } else {
+    Log(options_.info_log,
+        "File Deletions Disabled, but already disabled. Counter: %d",
+        disable_delete_obsolete_files_);
+  }
+  return Status::OK();
+}
+
+Status DBImpl::EnableFileDeletions(bool force) {
+  DeletionState deletion_state;
+  bool should_purge_files = false;
+  {
+    MutexLock l(&mutex_);
+    if (force) {
+      // if force, we need to enable file deletions right away
+      disable_delete_obsolete_files_ = 0;
+    } else if (disable_delete_obsolete_files_ > 0) {
+      --disable_delete_obsolete_files_;
+    }
+    if (disable_delete_obsolete_files_ == 0) {
+      Log(options_.info_log, "File Deletions Enabled");
+      should_purge_files = true;
+      FindObsoleteFiles(deletion_state, true);
+    } else {
+      Log(options_.info_log,
+          "File Deletions Enabled, but not really enabled. Counter: %d",
+          disable_delete_obsolete_files_);
+    }
+  }
+  if (should_purge_files) {
+    PurgeObsoleteFiles(deletion_state);
+  }
+  LogFlush(options_.info_log);
+  return Status::OK();
+}
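+
+// Typical backup-flow usage of the pair above together with GetLiveFiles
+// below (a sketch; error handling omitted):
+//   db->DisableFileDeletions();
+//   std::vector<std::string> files;
+//   uint64_t manifest_size;
+//   db->GetLiveFiles(files, &manifest_size, true /* flush_memtable */);
+//   /* copy the returned files out of the db directory */
+//   db->EnableFileDeletions(false /* force */);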
+Status DBImpl::GetLiveFiles(std::vector<std::string>& ret,
+                            uint64_t* manifest_file_size,
+                            bool flush_memtable) {
+
+  *manifest_file_size = 0;
+
+  mutex_.Lock();
+
+  if (flush_memtable) {
+    // flush all dirty data to disk.
+    Status status;
+    for (auto cfd : *versions_->GetColumnFamilySet()) {
+      cfd->Ref();
+      mutex_.Unlock();
+      status = FlushMemTable(cfd, FlushOptions());
+      mutex_.Lock();
+      cfd->Unref();
+      if (!status.ok()) {
+        break;
+      }
+    }
+    versions_->GetColumnFamilySet()->FreeDeadColumnFamilies();
+
+    if (!status.ok()) {
+      mutex_.Unlock();
+      Log(options_.info_log, "Cannot Flush data %s\n",
+          status.ToString().c_str());
+      return status;
+    }
+  }
+
+  // Make a set of all of the live *.sst files
+  std::set<uint64_t> live;
+  for (auto cfd : *versions_->GetColumnFamilySet()) {
+    cfd->current()->AddLiveFiles(&live);
+  }
+
+  ret.clear();
+  ret.reserve(live.size() + 2);  // *.sst + CURRENT + MANIFEST
+
+  // create names of the live files. The names are not absolute
+  // paths; instead they are relative to dbname_.
+  for (auto live_file : live) {
+    ret.push_back(TableFileName("", live_file));
+  }
+
+  ret.push_back(CurrentFileName(""));
+  ret.push_back(DescriptorFileName("", versions_->ManifestFileNumber()));
+
+  // find length of manifest file while holding the mutex lock
+  *manifest_file_size = versions_->ManifestFileSize();
+
+  mutex_.Unlock();
+  return Status::OK();
+}
+
+Status DBImpl::GetSortedWalFiles(VectorLogPtr& files) {
+  // First get sorted files in db dir, then get sorted files from archived
+  // dir, to avoid a race condition where a log file is moved to archived
+  // dir in between.
+  Status s;
+  // list wal files in main db dir.
+  VectorLogPtr logs;
+  s = GetSortedWalsOfType(options_.wal_dir, logs, kAliveLogFile);
+  if (!s.ok()) {
+    return s;
+  }
+
+  // Reproduce the race condition where a log file is moved
+  // to archived dir, between these two sync points, used in
+  // (DBTest,TransactionLogIteratorRace)
+  TEST_SYNC_POINT("DBImpl::GetSortedWalFiles:1");
+  TEST_SYNC_POINT("DBImpl::GetSortedWalFiles:2");
+
+  files.clear();
+  // list wal files in archive dir.
+  std::string archivedir = ArchivalDirectory(options_.wal_dir);
+  if (env_->FileExists(archivedir)) {
+    s = GetSortedWalsOfType(archivedir, files, kArchivedLogFile);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  uint64_t latest_archived_log_number = 0;
+  if (!files.empty()) {
+    latest_archived_log_number = files.back()->LogNumber();
+    Log(options_.info_log, "Latest Archived log: %" PRIu64,
+        latest_archived_log_number);
+  }
+
+  files.reserve(files.size() + logs.size());
+  for (auto& log : logs) {
+    if (log->LogNumber() > latest_archived_log_number) {
+      files.push_back(std::move(log));
+    } else {
+      // When the race condition happens, we could see the
+      // same log in both db dir and archived dir. Simply
+      // ignore the one in db dir. Note that, if we read
+      // archived dir first, we would have missed the log file.
+      Log(options_.info_log, "%s already moved to archive",
+          log->PathName().c_str());
+    }
+  }
+
+  return s;
+}
+
+}  // namespace rocksdb
+
+#endif  // ROCKSDB_LITE
diff --git a/db/db_impl.cc b/db/db_impl.cc
new file mode 100644
index 0000000000..5301fa5991
--- /dev/null
+++ b/db/db_impl.cc
@@ -0,0 +1,4703 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_impl.h"
+
+#define __STDC_FORMAT_MACROS
+#include <inttypes.h>
+#include <algorithm>
+#include <climits>
+#include <cstdio>
+#include <set>
+#include <stdexcept>
+#include <stdint.h>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include "db/builder.h"
+#include "db/db_iter.h"
+#include "db/dbformat.h"
+#include "db/filename.h"
+#include "db/log_reader.h"
+#include "db/log_writer.h"
+#include "db/memtable.h"
+#include "db/memtable_list.h"
+#include "db/merge_context.h"
+#include "db/merge_helper.h"
+#include "db/table_cache.h"
+#include "db/table_properties_collector.h"
+#include "db/tailing_iter.h"
+#include "db/forward_iterator.h"
+#include "db/transaction_log_impl.h"
+#include "db/version_set.h"
+#include "db/write_batch_internal.h"
+#include "port/port.h"
+#include "rocksdb/cache.h"
+#include "port/likely.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/status.h"
+#include "rocksdb/table.h"
+#include "table/block.h"
+#include "table/block_based_table_factory.h"
+#include "table/merger.h"
+#include "table/table_builder.h"
+#include "table/two_level_iterator.h"
+#include "util/auto_roll_logger.h"
+#include "util/autovector.h"
+#include "util/build_version.h"
+#include "util/coding.h"
+#include "util/hash_skiplist_rep.h"
+#include "util/hash_linklist_rep.h"
+#include "util/logging.h"
+#include "util/log_buffer.h"
+#include "util/mutexlock.h"
+#include "util/perf_context_imp.h"
+#include "util/stop_watch.h"
+#include "util/sync_point.h"
+
+namespace rocksdb {
+
+const std::string kDefaultColumnFamilyName("default");
+
+void DumpLeveldbBuildVersion(Logger* log);
+
+// Information kept for every waiting writer
+struct DBImpl::Writer {
+  Status status;
+  WriteBatch* batch;
+  bool sync;
+  bool disableWAL;
+  bool done;
+  port::CondVar cv;
+
+  explicit Writer(port::Mutex* mu) : cv(mu) { }
+};
+
+struct DBImpl::CompactionState {
+  Compaction* const compaction;
+
+  // If there were two snapshots with seq numbers s1 and
+  // s2, with s1 < s2, and if we find two instances of a key k1 that lie
+  // entirely between s1 and s2, then the earlier version of k1 can be safely
+  // deleted because that version is not visible in any snapshot.
+  std::vector<SequenceNumber> existing_snapshots;
+
+  // Files produced by compaction
+  struct Output {
+    uint64_t number;
+    uint64_t file_size;
+    InternalKey smallest, largest;
+    SequenceNumber smallest_seqno, largest_seqno;
+  };
+  std::vector<Output> outputs;
+  std::list<uint64_t> allocated_file_numbers;
+
+  // State kept for output being generated
+  unique_ptr<WritableFile> outfile;
+  unique_ptr<TableBuilder> builder;
+
+  uint64_t total_bytes;
+
+  Output* current_output() { return &outputs[outputs.size() - 1]; }
+
+  explicit CompactionState(Compaction* c)
+      : compaction(c),
+        total_bytes(0) {
+  }
+
+  // Create a client visible context of this compaction
+  CompactionFilter::Context GetFilterContextV1() {
+    CompactionFilter::Context context;
+    context.is_full_compaction = compaction->IsFullCompaction();
+    context.is_manual_compaction = compaction->IsManualCompaction();
+    return context;
+  }
+
+  // Create a client visible context of this compaction
+  CompactionFilterContext GetFilterContext() {
+    CompactionFilterContext context;
+    context.is_full_compaction = compaction->IsFullCompaction();
+    context.is_manual_compaction = compaction->IsManualCompaction();
+    return context;
+  }
+
+  std::vector<Slice> key_buf_;
+  std::vector<Slice> existing_value_buf_;
+  std::vector<std::string> key_str_buf_;
+  std::vector<std::string> existing_value_str_buf_;
+  // new_value_buf_ will only be appended to if a value changes
+  std::vector<std::string> new_value_buf_;
+  // if value_changed_buf_[i] is true,
+  // new_value_buf_ will hold a new entry with the changed value
+  std::vector<bool> value_changed_buf_;
+  // to_delete_buf_[i] is true iff key_buf_[i] is deleted
+  std::vector<bool> to_delete_buf_;
+  // buffer for the parsed internal keys; the string buffer is backed
+  // by key_str_buf_
+  std::vector<ParsedInternalKey> ikey_buf_;
+
+  std::vector<Slice> other_key_buf_;
+  std::vector<Slice> other_value_buf_;
+  std::vector<std::string> other_key_str_buf_;
+  std::vector<std::string> other_value_str_buf_;
+
+  std::vector<Slice> combined_key_buf_;
+  std::vector<Slice> combined_value_buf_;
+
+  std::string cur_prefix_;
+
+  // Buffers the kv-pair that will be run through compaction filter V2
+  // in the future.
+  void BufferKeyValueSlices(const Slice& key, const Slice& value) {
+    key_str_buf_.emplace_back(key.ToString());
+    existing_value_str_buf_.emplace_back(value.ToString());
+    key_buf_.emplace_back(Slice(key_str_buf_.back()));
+    existing_value_buf_.emplace_back(Slice(existing_value_str_buf_.back()));
+
+    ParsedInternalKey ikey;
+    ParseInternalKey(key_buf_.back(), &ikey);
+    ikey_buf_.emplace_back(ikey);
+  }
+
+  // Buffers the kv-pair that will not be run through compaction filter V2
+  // in the future.
+  void BufferOtherKeyValueSlices(const Slice& key, const Slice& value) {
+    other_key_str_buf_.emplace_back(key.ToString());
+    other_value_str_buf_.emplace_back(value.ToString());
+    other_key_buf_.emplace_back(Slice(other_key_str_buf_.back()));
+    other_value_buf_.emplace_back(Slice(other_value_str_buf_.back()));
+  }
+
+  // Add a kv-pair to the combined buffer
+  void AddToCombinedKeyValueSlices(const Slice& key, const Slice& value) {
+    // The real strings are stored in the batch buffers
+    combined_key_buf_.emplace_back(key);
+    combined_value_buf_.emplace_back(value);
+  }
+
+  // Merging the two buffers
+  void MergeKeyValueSliceBuffer(const InternalKeyComparator* comparator) {
+    size_t i = 0;
+    size_t j = 0;
+    size_t total_size = key_buf_.size() + other_key_buf_.size();
+    combined_key_buf_.reserve(total_size);
+    combined_value_buf_.reserve(total_size);
+
+    while (i + j < total_size) {
+      int comp_res = 0;
+      if (i < key_buf_.size() && j < other_key_buf_.size()) {
+        comp_res = comparator->Compare(key_buf_[i], other_key_buf_[j]);
+      } else if (i >= key_buf_.size() && j < other_key_buf_.size()) {
+        comp_res = 1;
+      } else if (j >= other_key_buf_.size() && i < key_buf_.size()) {
+        comp_res = -1;
+      }
+      if (comp_res > 0) {
+        AddToCombinedKeyValueSlices(other_key_buf_[j], other_value_buf_[j]);
+        j++;
+      } else if (comp_res < 0) {
+        AddToCombinedKeyValueSlices(key_buf_[i], existing_value_buf_[i]);
+        i++;
+      }
+    }
+  }
+
+  void CleanupBatchBuffer() {
+    to_delete_buf_.clear();
+    key_buf_.clear();
+    existing_value_buf_.clear();
+    key_str_buf_.clear();
+    existing_value_str_buf_.clear();
+    new_value_buf_.clear();
+    value_changed_buf_.clear();
+    ikey_buf_.clear();
+
+    to_delete_buf_.shrink_to_fit();
+    key_buf_.shrink_to_fit();
+    existing_value_buf_.shrink_to_fit();
+    key_str_buf_.shrink_to_fit();
+    existing_value_str_buf_.shrink_to_fit();
+    new_value_buf_.shrink_to_fit();
+    value_changed_buf_.shrink_to_fit();
+    ikey_buf_.shrink_to_fit();
+
+    other_key_buf_.clear();
+    other_value_buf_.clear();
+    other_key_str_buf_.clear();
+    other_value_str_buf_.clear();
+    other_key_buf_.shrink_to_fit();
+    other_value_buf_.shrink_to_fit();
+    other_key_str_buf_.shrink_to_fit();
+    other_value_str_buf_.shrink_to_fit();
+  }
+
+  void CleanupMergedBuffer() {
+    combined_key_buf_.clear();
+    combined_value_buf_.clear();
+    combined_key_buf_.shrink_to_fit();
+    combined_value_buf_.shrink_to_fit();
+  }
+};
+
+namespace {
+// Fix user-supplied options to be reasonable
+template <class T, class V>
+static void ClipToRange(T* ptr, V minvalue, V maxvalue) {
+  if (static_cast<V>(*ptr) > maxvalue) *ptr = maxvalue;
+  if (static_cast<V>(*ptr) < minvalue) *ptr = minvalue;
+}
+}  // anonymous namespace
+
+Options SanitizeOptions(const std::string& dbname,
+                        const InternalKeyComparator* icmp,
+                        const InternalFilterPolicy* ipolicy,
+                        const Options& src) {
+  auto db_options = SanitizeOptions(dbname, DBOptions(src));
+  auto cf_options = SanitizeOptions(icmp, ipolicy, ColumnFamilyOptions(src));
+  return Options(db_options, cf_options);
+}
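+
+// Worked example of ClipToRange as used in SanitizeOptions below:
+//   int v = 5;        ClipToRange(&v, 20, 1000000);  // v becomes 20
+//   int w = 2000000;  ClipToRange(&w, 20, 1000000);  // w becomes 1000000
+// Values already inside [minvalue, maxvalue] are left untouched.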
+DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src) {
+  DBOptions result = src;
+  // result.max_open_files == -1 means an "infinite" number of open files.
+  if (result.max_open_files != -1) {
+    ClipToRange(&result.max_open_files, 20, 1000000);
+  }
+
+  if (result.info_log == nullptr) {
+    Status s = CreateLoggerFromOptions(dbname, result.db_log_dir, src.env,
+                                       result, &result.info_log);
+    if (!s.ok()) {
+      // No place suitable for logging
+      result.info_log = nullptr;
+    }
+  }
+
+  if (result.wal_dir.empty()) {
+    // Use dbname as default
+    result.wal_dir = dbname;
+  }
+  if (result.wal_dir.back() == '/') {
+    result.wal_dir = result.wal_dir.substr(0, result.wal_dir.size() - 1);
+  }
+
+  return result;
+}
+
+CompressionType GetCompressionType(const Options& options, int level,
+                                   const bool enable_compression) {
+  if (!enable_compression) {
+    // disable compression
+    return kNoCompression;
+  }
+  // If the user has specified a different compression level for each level,
+  // then pick the compression for that level.
+  if (!options.compression_per_level.empty()) {
+    const int n = options.compression_per_level.size() - 1;
+    // It is possible for level to be -1; in that case, we use level
+    // 0's compression. This occurs mostly in backwards compatibility
+    // situations when the builder doesn't know what level the file
+    // belongs to. Likewise, if level is beyond the end of the
+    // specified compression levels, use the last value.
+    return options.compression_per_level[std::max(0, std::min(level, n))];
+  } else {
+    return options.compression;
+  }
+}
+
+CompressionType GetCompressionFlush(const Options& options) {
+  // Compressing memtable flushes might not help unless the sequential load
+  // optimization is used for leveled compaction. Otherwise the CPU and
+  // latency overhead is not offset by saving much space.
+
+  bool can_compress;
+
+  if (options.compaction_style == kCompactionStyleUniversal) {
+    can_compress =
+        (options.compaction_options_universal.compression_size_percent < 0);
+  } else {
+    // For leveled, compress when min_level_to_compress == 0.
+    can_compress = (GetCompressionType(options, 0, true) != kNoCompression);
+  }
+
+  if (can_compress) {
+    return options.compression;
+  } else {
+    return kNoCompression;
+  }
+}
+
+DBImpl::DBImpl(const DBOptions& options, const std::string& dbname)
+    : env_(options.env),
+      dbname_(dbname),
+      options_(SanitizeOptions(dbname, options)),
+      db_lock_(nullptr),
+      mutex_(options.use_adaptive_mutex),
+      shutting_down_(nullptr),
+      bg_cv_(&mutex_),
+      logfile_number_(0),
+      log_empty_(true),
+      default_cf_handle_(nullptr),
+      total_log_size_(0),
+      max_total_in_memory_state_(0),
+      tmp_batch_(),
+      bg_schedule_needed_(false),
+      bg_compaction_scheduled_(0),
+      bg_manual_only_(0),
+      bg_flush_scheduled_(0),
+      bg_logstats_scheduled_(false),
+      manual_compaction_(nullptr),
+      logger_(nullptr),
+      disable_delete_obsolete_files_(0),
+      delete_obsolete_files_last_run_(options.env->NowMicros()),
+      purge_wal_files_last_run_(0),
+      last_stats_dump_time_microsec_(0),
+      default_interval_to_delete_obsolete_WAL_(600),
+      flush_on_destroy_(false),
+      delayed_writes_(0),
+      storage_options_(options),
+      bg_work_gate_closed_(false),
+      refitting_level_(false),
+      opened_successfully_(false) {
+  env_->GetAbsolutePath(dbname, &db_absolute_path_);
+
+  // Reserve ten files or so for other uses and give the rest to TableCache.
+  // Use a large number for the "infinite" open files setting.
+  const int table_cache_size =
+      (options_.max_open_files == -1) ? 4194304 : options_.max_open_files - 10;
+  table_cache_ =
+      NewLRUCache(table_cache_size, options_.table_cache_numshardbits,
+                  options_.table_cache_remove_scan_count_limit);
+
+  versions_.reset(
+      new VersionSet(dbname_, &options_, storage_options_,
+                     table_cache_.get()));
+  column_family_memtables_.reset(
+      new ColumnFamilyMemTablesImpl(versions_->GetColumnFamilySet()));
+
+  DumpLeveldbBuildVersion(options_.info_log.get());
+  options_.Dump(options_.info_log.get());
+
+  char name[100];
+  Status s = env_->GetHostName(name, 100L);
+  if (s.ok()) {
+    host_name_ = name;
+  } else {
+    Log(options_.info_log, "Can't get hostname, use localhost as host name.");
+    host_name_ = "localhost";
+  }
+  last_log_ts = 0;
+
+  LogFlush(options_.info_log);
+}
+
+DBImpl::~DBImpl() {
+  mutex_.Lock();
+  if (flush_on_destroy_) {
+    for (auto cfd : *versions_->GetColumnFamilySet()) {
+      if (cfd->mem()->GetFirstSequenceNumber() != 0) {
+        cfd->Ref();
+        mutex_.Unlock();
+        FlushMemTable(cfd, FlushOptions());
+        mutex_.Lock();
+        cfd->Unref();
+      }
+    }
+    versions_->GetColumnFamilySet()->FreeDeadColumnFamilies();
+  }
+
+  // Wait for background work to finish
+  shutting_down_.Release_Store(this);  // Any non-nullptr value is ok
+  while (bg_compaction_scheduled_ ||
+         bg_flush_scheduled_ ||
+         bg_logstats_scheduled_) {
+    bg_cv_.Wait();
+  }
+
+  if (default_cf_handle_ != nullptr) {
+    // we need to delete handle outside of lock because it does its own locking
+    mutex_.Unlock();
+    delete default_cf_handle_;
+    mutex_.Lock();
+  }
+
+  if (options_.allow_thread_local) {
+    // Clean up obsolete files due to SuperVersion release.
+    // (1) Need to delete obsolete files before closing because RepairDB()
+    // scans all existing files in the file system and builds the manifest
+    // file. Keeping obsolete files confuses the repair process.
+    // (2) Need to check if we Open()/Recover() the DB successfully before
+    // deleting because if VersionSet recovery fails (maybe due to a corrupted
+    // manifest file), it is not able to identify live files correctly. As a
+    // result, all "live" files could get deleted by accident. However, a
+    // corrupted manifest is recoverable by RepairDB().
+    if (opened_successfully_) {
+      DeletionState deletion_state;
+      FindObsoleteFiles(deletion_state, true);
+      // manifest number starting from 2
+      deletion_state.manifest_file_number = 1;
+      if (deletion_state.HaveSomethingToDelete()) {
+        PurgeObsoleteFiles(deletion_state);
+      }
+    }
+  }
+
+  // versions need to be destroyed before table_cache since it can hold
+  // references to table_cache.
+  versions_.reset();
+  mutex_.Unlock();
+  if (db_lock_ != nullptr) {
+    env_->UnlockFile(db_lock_);
+  }
+
+  LogFlush(options_.info_log);
+}
+
+Status DBImpl::NewDB() {
+  VersionEdit new_db;
+  new_db.SetLogNumber(0);
+  new_db.SetNextFile(2);
+  new_db.SetLastSequence(0);
+
+  const std::string manifest = DescriptorFileName(dbname_, 1);
+  unique_ptr<WritableFile> file;
+  Status s = env_->NewWritableFile(
+      manifest, &file, env_->OptimizeForManifestWrite(storage_options_));
+  if (!s.ok()) {
+    return s;
+  }
+  file->SetPreallocationBlockSize(options_.manifest_preallocation_size);
+  {
+    log::Writer log(std::move(file));
+    std::string record;
+    new_db.EncodeTo(&record);
+    s = log.AddRecord(record);
+  }
+  if (s.ok()) {
+    // Make "CURRENT" file that points to the new manifest file.
+    s = SetCurrentFile(env_, dbname_, 1, db_directory_.get());
+  } else {
+    env_->DeleteFile(manifest);
+  }
+  return s;
+}
+
+void DBImpl::MaybeIgnoreError(Status* s) const {
+  if (s->ok() || options_.paranoid_checks) {
+    // No change needed
+  } else {
+    Log(options_.info_log, "Ignoring error %s", s->ToString().c_str());
+    *s = Status::OK();
+  }
+}
+
+const Status DBImpl::CreateArchivalDirectory() {
+  if (options_.WAL_ttl_seconds > 0 || options_.WAL_size_limit_MB > 0) {
+    std::string archivalPath = ArchivalDirectory(options_.wal_dir);
+    return env_->CreateDirIfMissing(archivalPath);
+  }
+  return Status::OK();
+}
+
+void DBImpl::PrintStatistics() {
+  auto dbstats = options_.statistics.get();
+  if (dbstats) {
+    Log(options_.info_log,
+        "STATISTICS:\n %s",
+        dbstats->ToString().c_str());
+  }
+}
+
+void DBImpl::MaybeDumpStats() {
+  if (options_.stats_dump_period_sec == 0) return;
+
+  const uint64_t now_micros = env_->NowMicros();
+
+  if (last_stats_dump_time_microsec_ +
+      options_.stats_dump_period_sec * 1000000
+      <= now_micros) {
+    // Multiple threads could race in here simultaneously.
+    // However, the last one will update last_stats_dump_time_microsec_
+    // atomically. We could see more than one dump during one dump
+    // period in rare cases.
+    last_stats_dump_time_microsec_ = now_micros;
+    std::string stats;
+    GetProperty("rocksdb.stats", &stats);
+    Log(options_.info_log, "%s", stats.c_str());
+    PrintStatistics();
+  }
+}
+
+// Returns the list of live files in 'sst_live' and the list
+// of all files in the filesystem in 'candidate_files'.
+// no_full_scan = true -- never do the full scan using GetChildren()
+// force = false -- don't force the full scan, except every
+//                  options_.delete_obsolete_files_period_micros
+// force = true -- force the full scan
+void DBImpl::FindObsoleteFiles(DeletionState& deletion_state,
+                               bool force,
+                               bool no_full_scan) {
+  mutex_.AssertHeld();
+
+  // if deletion is disabled, do nothing
+  if (disable_delete_obsolete_files_ > 0) {
+    return;
+  }
+
+  bool doing_the_full_scan = false;
+
+  // logic for figuring out if we're doing the full scan
+  if (no_full_scan) {
+    doing_the_full_scan = false;
+  } else if (force || options_.delete_obsolete_files_period_micros == 0) {
+    doing_the_full_scan = true;
+  } else {
+    const uint64_t now_micros = env_->NowMicros();
+    if (delete_obsolete_files_last_run_ +
+        options_.delete_obsolete_files_period_micros < now_micros) {
+      doing_the_full_scan = true;
+      delete_obsolete_files_last_run_ = now_micros;
+    }
+  }
+
+  // get obsolete files
+  versions_->GetObsoleteFiles(&deletion_state.sst_delete_files);
+
+  // store the current filenum, lognum, etc
+  deletion_state.manifest_file_number = versions_->ManifestFileNumber();
+  deletion_state.pending_manifest_file_number =
+      versions_->PendingManifestFileNumber();
+  deletion_state.log_number = versions_->MinLogNumber();
+  deletion_state.prev_log_number = versions_->PrevLogNumber();
+
+  if (!doing_the_full_scan && !deletion_state.HaveSomethingToDelete()) {
+    // avoid filling up sst_live if we're sure that we
+    // are not going to do the full scan and that we don't have
+    // anything to delete at the moment
+    return;
+  }
+
+  // don't delete live files
+  deletion_state.sst_live.assign(pending_outputs_.begin(),
+                                 pending_outputs_.end());
+  versions_->AddLiveFiles(&deletion_state.sst_live);
+
+  if (doing_the_full_scan) {
+    // set of all files in the directory. We'll exclude files that are still
+    // alive in the subsequent processing.
+    env_->GetChildren(
+        dbname_, &deletion_state.candidate_files);  // Ignore errors
+
+    // Add log files in wal_dir
+    if (options_.wal_dir != dbname_) {
+      std::vector<std::string> log_files;
+      env_->GetChildren(options_.wal_dir, &log_files);  // Ignore errors
+      deletion_state.candidate_files.insert(
+          deletion_state.candidate_files.end(),
+          log_files.begin(),
+          log_files.end());
+    }
+  }
+}
+
+// Diffs the files listed in 'candidate_files' against the live files, and
+// possibly removes those that do not belong to any live file. Also removes
+// all the files in sst_delete_files and log_delete_files.
+// It is not necessary to hold the mutex when invoking this method.
+void DBImpl::PurgeObsoleteFiles(DeletionState& state) {
+  // we'd better have something to delete
+  assert(state.HaveSomethingToDelete());
+
+  // this checks if FindObsoleteFiles() was run before. If not, don't do
+  // PurgeObsoleteFiles(). If FindObsoleteFiles() was run, we need to also
+  // run PurgeObsoleteFiles(), even if disable_delete_obsolete_files_ is true
+  if (state.manifest_file_number == 0) {
+    return;
+  }
+
+  // Now, convert the live list to an unordered set, WITHOUT mutex held;
+  // set is slow.
+  std::unordered_set<uint64_t> sst_live(state.sst_live.begin(),
+                                        state.sst_live.end());
+
+  auto& candidate_files = state.candidate_files;
+  candidate_files.reserve(
+      candidate_files.size() +
+      state.sst_delete_files.size() +
+      state.log_delete_files.size());
+  // We may ignore the dbname when generating the file names.
+  const char* kDumbDbName = "";
+  for (auto file : state.sst_delete_files) {
+    candidate_files.push_back(
+        TableFileName(kDumbDbName, file->number).substr(1));
+    delete file;
+  }
+
+  for (auto file_num : state.log_delete_files) {
+    if (file_num > 0) {
+      candidate_files.push_back(LogFileName(kDumbDbName, file_num).substr(1));
+    }
+  }
+
+  // dedup state.candidate_files so we don't try to delete the same
+  // file twice
+  sort(candidate_files.begin(), candidate_files.end());
+  candidate_files.erase(unique(candidate_files.begin(), candidate_files.end()),
+                        candidate_files.end());
+
+  std::vector<std::string> old_info_log_files;
+
+  for (const auto& to_delete : candidate_files) {
+    uint64_t number;
+    FileType type;
+    // Ignore file if we cannot recognize it.
+    if (!ParseFileName(to_delete, &number, &type)) {
+      continue;
+    }
+
+    bool keep = true;
+    switch (type) {
+      case kLogFile:
+        keep = ((number >= state.log_number) ||
+                (number == state.prev_log_number));
+        break;
+      case kDescriptorFile:
+        // Keep my manifest file, and any newer incarnations'
+        // (can happen during manifest roll)
+        keep = (number >= state.manifest_file_number);
+        break;
+      case kTableFile:
+        keep = (sst_live.find(number) != sst_live.end());
+        break;
+      case kTempFile:
+        // Any temp files that are currently being written to must
+        // be recorded in pending_outputs_, which is inserted into "live".
+        // Also, SetCurrentFile creates a temp file when writing out the new
+        // manifest, which is equal to state.pending_manifest_file_number. We
+        // should not delete that file.
+        keep = (sst_live.find(number) != sst_live.end()) ||
+               (number == state.pending_manifest_file_number);
+        break;
+      case kInfoLogFile:
+        keep = true;
+        if (number != 0) {
+          old_info_log_files.push_back(to_delete);
+        }
+        break;
+      case kCurrentFile:
+      case kDBLockFile:
+      case kIdentityFile:
+      case kMetaDatabase:
+        keep = true;
+        break;
+    }
+
+    if (keep) {
+      continue;
+    }
+
+    if (type == kTableFile) {
+      // evict from cache
+      TableCache::Evict(table_cache_.get(), number);
+    }
+
+    std::string fname =
+        ((type == kLogFile) ? options_.wal_dir : dbname_) + "/" + to_delete;
+    if (type == kLogFile &&
+        (options_.WAL_ttl_seconds > 0 || options_.WAL_size_limit_MB > 0)) {
+      auto archived_log_name = ArchivedLogFileName(options_.wal_dir, number);
+      // The sync point below is used in (DBTest,TransactionLogIteratorRace)
+      TEST_SYNC_POINT("DBImpl::PurgeObsoleteFiles:1");
+      Status s = env_->RenameFile(fname, archived_log_name);
+      // The sync point below is used in (DBTest,TransactionLogIteratorRace)
+      TEST_SYNC_POINT("DBImpl::PurgeObsoleteFiles:2");
+      Log(options_.info_log,
+          "Move log file %s to %s -- %s\n",
+          fname.c_str(), archived_log_name.c_str(), s.ToString().c_str());
+    } else {
+      Status s = env_->DeleteFile(fname);
+      Log(options_.info_log, "Delete %s type=%d #%lu -- %s\n",
+          fname.c_str(), type, (unsigned long)number,
+          s.ToString().c_str());
+    }
+  }
+
+  // Delete old info log files.
+  size_t old_info_log_file_count = old_info_log_files.size();
+  // NOTE: Currently we only support log purge when options_.db_log_dir is
+  // located in the `dbname` directory.
+  if (old_info_log_file_count >= options_.keep_log_file_num &&
+      options_.db_log_dir.empty()) {
+    std::sort(old_info_log_files.begin(), old_info_log_files.end());
+    size_t end = old_info_log_file_count - options_.keep_log_file_num;
+    for (unsigned int i = 0; i <= end; i++) {
+      std::string& to_delete = old_info_log_files.at(i);
+      Log(options_.info_log, "Delete info log file %s\n", to_delete.c_str());
+      Status s = env_->DeleteFile(dbname_ + "/" + to_delete);
+      if (!s.ok()) {
+        Log(options_.info_log, "Delete info log file %s FAILED -- %s\n",
+            to_delete.c_str(), s.ToString().c_str());
+      }
+    }
+  }
+  PurgeObsoleteWALFiles();
+  LogFlush(options_.info_log);
+}
+
+void DBImpl::DeleteObsoleteFiles() {
+  mutex_.AssertHeld();
+  DeletionState deletion_state;
+  FindObsoleteFiles(deletion_state, true);
+  if (deletion_state.HaveSomethingToDelete()) {
+    PurgeObsoleteFiles(deletion_state);
+  }
+}
+
+#ifndef ROCKSDB_LITE
+// 1. Go through all archived files and
+//    a. if ttl is enabled, delete outdated files
+//    b. if archive size limit is enabled, delete empty files,
+//       compute file number and size.
+// 2. If size limit is enabled:
+//    a. compute how many files should be deleted
+//    b. get sorted non-empty archived logs
+//    c. delete what should be deleted
+void DBImpl::PurgeObsoleteWALFiles() {
+  bool const ttl_enabled = options_.WAL_ttl_seconds > 0;
+  bool const size_limit_enabled = options_.WAL_size_limit_MB > 0;
+  if (!ttl_enabled && !size_limit_enabled) {
+    return;
+  }
+
+  int64_t current_time;
+  Status s = env_->GetCurrentTime(&current_time);
+  if (!s.ok()) {
+    Log(options_.info_log, "Can't get current time: %s", s.ToString().c_str());
+    assert(false);
+    return;
+  }
+  uint64_t const now_seconds = static_cast<uint64_t>(current_time);
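+  // e.g. with options_.WAL_ttl_seconds == 3600 and no size limit, the ternary
+  // below yields time_to_check == 1800, so the archive is scanned at most
+  // every half hour; once the size limit is enabled it falls back to
+  // default_interval_to_delete_obsolete_WAL_ (600 seconds).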
+  uint64_t const time_to_check = (ttl_enabled && !size_limit_enabled)
+      ? options_.WAL_ttl_seconds / 2
+      : default_interval_to_delete_obsolete_WAL_;
+
+  if (purge_wal_files_last_run_ + time_to_check > now_seconds) {
+    return;
+  }
+
+  purge_wal_files_last_run_ = now_seconds;
+
+  std::string archival_dir = ArchivalDirectory(options_.wal_dir);
+  std::vector<std::string> files;
+  s = env_->GetChildren(archival_dir, &files);
+  if (!s.ok()) {
+    Log(options_.info_log, "Can't get archive files: %s",
+        s.ToString().c_str());
+    assert(false);
+    return;
+  }
+
+  size_t log_files_num = 0;
+  uint64_t log_file_size = 0;
+
+  for (auto& f : files) {
+    uint64_t number;
+    FileType type;
+    if (ParseFileName(f, &number, &type) && type == kLogFile) {
+      std::string const file_path = archival_dir + "/" + f;
+      if (ttl_enabled) {
+        uint64_t file_m_time;
+        Status const s = env_->GetFileModificationTime(file_path,
+                                                       &file_m_time);
+        if (!s.ok()) {
+          Log(options_.info_log, "Can't get file mod time: %s: %s",
+              file_path.c_str(), s.ToString().c_str());
+          continue;
+        }
+        if (now_seconds - file_m_time > options_.WAL_ttl_seconds) {
+          Status const s = env_->DeleteFile(file_path);
+          if (!s.ok()) {
+            Log(options_.info_log, "Can't delete file: %s: %s",
+                file_path.c_str(), s.ToString().c_str());
+            continue;
+          } else {
+            MutexLock l(&read_first_record_cache_mutex_);
+            read_first_record_cache_.erase(number);
+          }
+          continue;
+        }
+      }
+
+      if (size_limit_enabled) {
+        uint64_t file_size;
+        Status const s = env_->GetFileSize(file_path, &file_size);
+        if (!s.ok()) {
+          Log(options_.info_log, "Can't get file size: %s: %s",
+              file_path.c_str(), s.ToString().c_str());
+          return;
+        } else {
+          if (file_size > 0) {
+            log_file_size = std::max(log_file_size, file_size);
+            ++log_files_num;
+          } else {
+            Status s = env_->DeleteFile(file_path);
+            if (!s.ok()) {
+              Log(options_.info_log, "Can't delete file: %s: %s",
+                  file_path.c_str(), s.ToString().c_str());
+              continue;
+            } else {
+              MutexLock l(&read_first_record_cache_mutex_);
+              read_first_record_cache_.erase(number);
+            }
+          }
+        }
+      }
+    }
+  }
+
+  if (0 == log_files_num || !size_limit_enabled) {
+    return;
+  }
+
+  size_t const files_keep_num = options_.WAL_size_limit_MB *
+      1024 * 1024 / log_file_size;
+  if (log_files_num <= files_keep_num) {
+    return;
+  }
+
+  size_t files_del_num = log_files_num - files_keep_num;
+  VectorLogPtr archived_logs;
+  GetSortedWalsOfType(archival_dir, archived_logs, kArchivedLogFile);
+
+  if (files_del_num > archived_logs.size()) {
+    Log(options_.info_log, "Trying to delete more archived log files than "
Deleting all"); + files_del_num = archived_logs.size(); + } + + for (size_t i = 0; i < files_del_num; ++i) { + std::string const file_path = archived_logs[i]->PathName(); + Status const s = DeleteFile(file_path); + if (!s.ok()) { + Log(options_.info_log, "Can't delete file: %s: %s", + file_path.c_str(), s.ToString().c_str()); + continue; + } else { + MutexLock l(&read_first_record_cache_mutex_); + read_first_record_cache_.erase(archived_logs[i]->LogNumber()); + } + } +} + +namespace { +struct CompareLogByPointer { + bool operator()(const unique_ptr& a, const unique_ptr& b) { + LogFileImpl* a_impl = dynamic_cast(a.get()); + LogFileImpl* b_impl = dynamic_cast(b.get()); + return *a_impl < *b_impl; + } +}; +} + +Status DBImpl::GetSortedWalsOfType(const std::string& path, + VectorLogPtr& log_files, + WalFileType log_type) { + std::vector all_files; + const Status status = env_->GetChildren(path, &all_files); + if (!status.ok()) { + return status; + } + log_files.reserve(all_files.size()); + for (const auto& f : all_files) { + uint64_t number; + FileType type; + if (ParseFileName(f, &number, &type) && type == kLogFile) { + SequenceNumber sequence; + Status s = ReadFirstRecord(log_type, number, &sequence); + if (!s.ok()) { + return s; + } + if (sequence == 0) { + // empty file + continue; + } + + // Reproduce the race condition where a log file is moved + // to archived dir, between these two sync points, used in + // (DBTest,TransactionLogIteratorRace) + TEST_SYNC_POINT("DBImpl::GetSortedWalsOfType:1"); + TEST_SYNC_POINT("DBImpl::GetSortedWalsOfType:2"); + + uint64_t size_bytes; + s = env_->GetFileSize(LogFileName(path, number), &size_bytes); + // re-try in case the alive log file has been moved to archive. + if (!s.ok() && log_type == kAliveLogFile && + env_->FileExists(ArchivedLogFileName(path, number))) { + s = env_->GetFileSize(ArchivedLogFileName(path, number), &size_bytes); + } + if (!s.ok()) { + return s; + } + + log_files.push_back(std::move(unique_ptr( + new LogFileImpl(number, log_type, sequence, size_bytes)))); + } + } + CompareLogByPointer compare_log_files; + std::sort(log_files.begin(), log_files.end(), compare_log_files); + return status; +} + +Status DBImpl::RetainProbableWalFiles(VectorLogPtr& all_logs, + const SequenceNumber target) { + int64_t start = 0; // signed to avoid overflow when target is < first file. + int64_t end = static_cast(all_logs.size()) - 1; + // Binary Search. avoid opening all files. + while (end >= start) { + int64_t mid = start + (end - start) / 2; // Avoid overflow. + SequenceNumber current_seq_num = all_logs.at(mid)->StartSequence(); + if (current_seq_num == target) { + end = mid; + break; + } else if (current_seq_num < target) { + start = mid + 1; + } else { + end = mid - 1; + } + } + // end could be -ve. 
+Status DBImpl::ReadFirstRecord(const WalFileType type, const uint64_t number,
+                               SequenceNumber* sequence) {
+  if (type != kAliveLogFile && type != kArchivedLogFile) {
+    return Status::NotSupported("File Type Not Known " + std::to_string(type));
+  }
+  {
+    MutexLock l(&read_first_record_cache_mutex_);
+    auto itr = read_first_record_cache_.find(number);
+    if (itr != read_first_record_cache_.end()) {
+      *sequence = itr->second;
+      return Status::OK();
+    }
+  }
+  Status s;
+  if (type == kAliveLogFile) {
+    std::string fname = LogFileName(options_.wal_dir, number);
+    s = ReadFirstLine(fname, sequence);
+    if (env_->FileExists(fname) && !s.ok()) {
+      // return any error that is not caused by a non-existing file
+      return s;
+    }
+  }
+
+  if (type == kArchivedLogFile || !s.ok()) {
+    // check if the file got moved to archive.
+    std::string archived_file = ArchivedLogFileName(options_.wal_dir, number);
+    s = ReadFirstLine(archived_file, sequence);
+  }
+
+  if (s.ok() && *sequence != 0) {
+    MutexLock l(&read_first_record_cache_mutex_);
+    read_first_record_cache_.insert({number, *sequence});
+  }
+  return s;
+}
+
+// The function returns status.ok() and sequence == 0 if the file exists, but
+// is empty.
+Status DBImpl::ReadFirstLine(const std::string& fname,
+                             SequenceNumber* sequence) {
+  struct LogReporter : public log::Reader::Reporter {
+    Env* env;
+    Logger* info_log;
+    const char* fname;
+
+    Status* status;
+    bool ignore_error;  // true if options_.paranoid_checks==false
+    virtual void Corruption(size_t bytes, const Status& s) {
+      Log(info_log, "%s%s: dropping %d bytes; %s",
+          (this->ignore_error ? "(ignoring error) " : ""), fname,
+          static_cast<int>(bytes), s.ToString().c_str());
+      if (this->status->ok()) {
+        // only keep the first error
+        *this->status = s;
+      }
+    }
+  };
+
+  unique_ptr<SequentialFile> file;
+  Status status = env_->NewSequentialFile(fname, &file, storage_options_);
+
+  if (!status.ok()) {
+    return status;
+  }
+
+  LogReporter reporter;
+  reporter.env = env_;
+  reporter.info_log = options_.info_log.get();
+  reporter.fname = fname.c_str();
+  reporter.status = &status;
+  reporter.ignore_error = !options_.paranoid_checks;
+  log::Reader reader(std::move(file), &reporter, true /*checksum*/,
+                     0 /*initial_offset*/);
+  std::string scratch;
+  Slice record;
+
+  if (reader.ReadRecord(&record, &scratch) &&
+      (status.ok() || !options_.paranoid_checks)) {
+    if (record.size() < 12) {
+      reporter.Corruption(record.size(),
+                          Status::Corruption("log record too small"));
+      // TODO: read records till the first non-corrupt entry?
+    } else {
+      WriteBatch batch;
+      WriteBatchInternal::SetContents(&batch, record);
+      *sequence = WriteBatchInternal::Sequence(&batch);
+      return Status::OK();
+    }
+  }
+
+  // ReadRecord returns false on EOF, which means that the log file is empty.
+  // We return status.ok() in that case and set the sequence number to 0.
+  *sequence = 0;
+  return status;
+}
+
+#endif  // ROCKSDB_LITE
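+
+// (For the record-size check above, and the same check in RecoverLogFile
+// below: 12 bytes is the WriteBatch header, an 8-byte sequence number
+// followed by a 4-byte entry count, so anything shorter cannot be a valid
+// batch.)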
+Status DBImpl::Recover(
+    const std::vector<ColumnFamilyDescriptor>& column_families, bool read_only,
+    bool error_if_log_file_exist) {
+  mutex_.AssertHeld();
+
+  bool is_new_db = false;
+  assert(db_lock_ == nullptr);
+  if (!read_only) {
+    // We call CreateDirIfMissing() as the directory may already exist (if we
+    // are reopening a DB); when this happens we don't want creating the
+    // directory to cause an error. However, we need to check if creating the
+    // directory fails or else we may get an obscure message about the lock
+    // file not existing. One real-world example of this occurring is if
+    // env->CreateDirIfMissing() doesn't create intermediate directories, e.g.
+    // when dbname_ is "dir/db" but "dir" doesn't exist.
+    Status s = env_->CreateDirIfMissing(dbname_);
+    if (!s.ok()) {
+      return s;
+    }
+
+    s = env_->NewDirectory(dbname_, &db_directory_);
+    if (!s.ok()) {
+      return s;
+    }
+
+    s = env_->LockFile(LockFileName(dbname_), &db_lock_);
+    if (!s.ok()) {
+      return s;
+    }
+
+    if (!env_->FileExists(CurrentFileName(dbname_))) {
+      if (options_.create_if_missing) {
+        // TODO: add merge_operator name check
+        s = NewDB();
+        is_new_db = true;
+        if (!s.ok()) {
+          return s;
+        }
+      } else {
+        return Status::InvalidArgument(
+            dbname_, "does not exist (create_if_missing is false)");
+      }
+    } else {
+      if (options_.error_if_exists) {
+        return Status::InvalidArgument(
+            dbname_, "exists (error_if_exists is true)");
+      }
+    }
+    // Check for the IDENTITY file and create it if not there
+    if (!env_->FileExists(IdentityFileName(dbname_))) {
+      s = SetIdentityFile(env_, dbname_);
+      if (!s.ok()) {
+        return s;
+      }
+    }
+  }
+
+  Status s = versions_->Recover(column_families, read_only);
+  if (options_.paranoid_checks && s.ok()) {
+    s = CheckConsistency();
+  }
+  if (s.ok()) {
+    SequenceNumber max_sequence(0);
+    default_cf_handle_ = new ColumnFamilyHandleImpl(
+        versions_->GetColumnFamilySet()->GetDefault(), this, &mutex_);
+
+    // Recover from all log files newer than the ones named in the
+    // descriptor (new log files may have been added by the previous
+    // incarnation without registering them in the descriptor).
+    //
+    // Note that PrevLogNumber() is no longer used, but we pay
+    // attention to it in case we are recovering a database
+    // produced by an older version of rocksdb.
+    const uint64_t min_log = versions_->MinLogNumber();
+    const uint64_t prev_log = versions_->PrevLogNumber();
+    std::vector<std::string> filenames;
+    s = env_->GetChildren(options_.wal_dir, &filenames);
+    if (!s.ok()) {
+      return s;
+    }
+
+    std::vector<uint64_t> logs;
+    for (size_t i = 0; i < filenames.size(); i++) {
+      uint64_t number;
+      FileType type;
+      if (ParseFileName(filenames[i], &number, &type) && type == kLogFile) {
+        if (is_new_db) {
+          return Status::Corruption(
+              "While creating a new DB, wal_dir contains "
+              "existing log file: ",
+              filenames[i]);
+        } else if ((number >= min_log) || (number == prev_log)) {
+          logs.push_back(number);
+        }
+      }
+    }
+
+    if (logs.size() > 0 && error_if_log_file_exist) {
+      return Status::Corruption(
+          "The db was opened in readonly mode with error_if_log_file_exist "
+          "flag but a log file already exists");
+    }
+
+    // Recover in the order in which the logs were generated
+    std::sort(logs.begin(), logs.end());
+    for (const auto& log : logs) {
+      // The previous incarnation may not have written any MANIFEST
+      // records after allocating this log number. So we manually
+      // update the file number allocation counter in VersionSet.
+ versions_->MarkFileNumberUsed(log);
+ s = RecoverLogFile(log, &max_sequence, read_only);
+ }
+ SetTickerCount(options_.statistics.get(), SEQUENCE_NUMBER,
+ versions_->LastSequence());
+ }
+
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ max_total_in_memory_state_ += cfd->options()->write_buffer_size *
+ cfd->options()->max_write_buffer_number;
+ }
+
+ return s;
+ }
+
+ Status DBImpl::RecoverLogFile(uint64_t log_number, SequenceNumber* max_sequence,
+ bool read_only) {
+ struct LogReporter : public log::Reader::Reporter {
+ Env* env;
+ Logger* info_log;
+ const char* fname;
+ Status* status; // nullptr if options_.paranoid_checks==false or
+ // options_.skip_log_error_on_recovery==true
+ virtual void Corruption(size_t bytes, const Status& s) {
+ Log(info_log, "%s%s: dropping %d bytes; %s",
+ (this->status == nullptr ? "(ignoring error) " : ""),
+ fname, static_cast<int>(bytes), s.ToString().c_str());
+ if (this->status != nullptr && this->status->ok()) *this->status = s;
+ }
+ };
+
+ mutex_.AssertHeld();
+
+ std::unordered_map<int, VersionEdit> version_edits;
+ // no need to refcount because iteration is under mutex
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ VersionEdit edit;
+ edit.SetColumnFamily(cfd->GetID());
+ version_edits.insert({cfd->GetID(), edit});
+ }
+
+ // Open the log file
+ std::string fname = LogFileName(options_.wal_dir, log_number);
+ unique_ptr<SequentialFile> file;
+ Status status = env_->NewSequentialFile(fname, &file, storage_options_);
+ if (!status.ok()) {
+ MaybeIgnoreError(&status);
+ return status;
+ }
+
+ // Create the log reader.
+ LogReporter reporter;
+ reporter.env = env_;
+ reporter.info_log = options_.info_log.get();
+ reporter.fname = fname.c_str();
+ reporter.status = (options_.paranoid_checks &&
+ !options_.skip_log_error_on_recovery ? &status : nullptr);
+ // We intentionally make log::Reader do checksumming even if
+ // paranoid_checks==false so that corruptions cause entire commits
+ // to be skipped instead of propagating bad information (like overly
+ // large sequence numbers).
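+ // (Sketch of the failure mode this guards against, hypothetical values: a
+ // record whose sequence field was bit-flipped to ~2^60 would, without
+ // checksum verification, push LastSequence() far ahead for the life of the
+ // DB; with checksums the record fails verification and Corruption() above
+ // simply drops those bytes.)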
+ log::Reader reader(std::move(file), &reporter, true/*checksum*/,
+ 0/*initial_offset*/);
+ Log(options_.info_log, "Recovering log #%lu",
+ (unsigned long) log_number);
+
+ // Read all the records and add to a memtable
+ std::string scratch;
+ Slice record;
+ WriteBatch batch;
+ while (reader.ReadRecord(&record, &scratch)) {
+ if (record.size() < 12) {
+ reporter.Corruption(
+ record.size(), Status::Corruption("log record too small"));
+ continue;
+ }
+ WriteBatchInternal::SetContents(&batch, record);
+
+ status = WriteBatchInternal::InsertInto(
+ &batch, column_family_memtables_.get(), true, log_number);
+
+ MaybeIgnoreError(&status);
+ if (!status.ok()) {
+ return status;
+ }
+ const SequenceNumber last_seq =
+ WriteBatchInternal::Sequence(&batch) +
+ WriteBatchInternal::Count(&batch) - 1;
+ if (last_seq > *max_sequence) {
+ *max_sequence = last_seq;
+ }
+
+ if (!read_only) {
+ // no need to refcount since client still doesn't have access
+ // to the DB and can not drop column families while we iterate
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ if (cfd->mem()->ShouldFlush()) {
+ // If this asserts, it means that InsertInto failed in
+ // filtering updates to already-flushed column families
+ assert(cfd->GetLogNumber() <= log_number);
+ auto iter = version_edits.find(cfd->GetID());
+ assert(iter != version_edits.end());
+ VersionEdit* edit = &iter->second;
+ status = WriteLevel0TableForRecovery(cfd, cfd->mem(), edit);
+ // we still want to clear the memtable, even if the recovery failed
+ cfd->CreateNewMemtable();
+ if (!status.ok()) {
+ // Reflect errors immediately so that conditions like full
+ // file-systems cause the DB::Open() to fail.
+ return status;
+ }
+ }
+ }
+ }
+ }
+
+ if (versions_->LastSequence() < *max_sequence) {
+ versions_->SetLastSequence(*max_sequence);
+ }
+
+ if (!read_only) {
+ // no need to refcount since client still doesn't have access
+ // to the DB and can not drop column families while we iterate
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ auto iter = version_edits.find(cfd->GetID());
+ assert(iter != version_edits.end());
+ VersionEdit* edit = &iter->second;
+
+ if (cfd->GetLogNumber() > log_number) {
+ // Column family cfd has already flushed the data
+ // from log_number. Memtable has to be empty because
+ // we filter the updates based on log_number
+ // (in WriteBatch::InsertInto)
+ assert(cfd->mem()->GetFirstSequenceNumber() == 0);
+ assert(edit->NumEntries() == 0);
+ continue;
+ }
+
+ // flush the final memtable (if non-empty)
+ if (cfd->mem()->GetFirstSequenceNumber() != 0) {
+ status = WriteLevel0TableForRecovery(cfd, cfd->mem(), edit);
+ }
+ // we still want to clear the memtable, even if the recovery failed
+ cfd->CreateNewMemtable();
+ if (!status.ok()) {
+ return status;
+ }
+
+ // write MANIFEST with update
+ // writing the log number in the manifest means that any log file
+ // with a number strictly less than (log_number + 1) is already
+ // recovered and should be ignored on the next reincarnation.
+ // Since we already recovered log_number, we want all logs
+ // with numbers `<= log_number` (includes this one) to be ignored
+ edit->SetLogNumber(log_number + 1);
+ // we must mark the next log number as used, even though it's
+ // not actually used.
+ // That is because VersionSet assumes
+ // VersionSet::next_file_number_ is always strictly greater than any
+ // log number
+ versions_->MarkFileNumberUsed(log_number + 1);
+ status = versions_->LogAndApply(cfd, edit, &mutex_);
+ if (!status.ok()) {
+ return status;
+ }
+ }
+ }
+
+ return status;
+ }
+
+ Status DBImpl::WriteLevel0TableForRecovery(ColumnFamilyData* cfd, MemTable* mem,
+ VersionEdit* edit) {
+ mutex_.AssertHeld();
+ const uint64_t start_micros = env_->NowMicros();
+ FileMetaData meta;
+ meta.number = versions_->NewFileNumber();
+ pending_outputs_.insert(meta.number);
+ Iterator* iter = mem->NewIterator(ReadOptions(), true);
+ const SequenceNumber newest_snapshot = snapshots_.GetNewest();
+ const SequenceNumber earliest_seqno_in_memtable =
+ mem->GetFirstSequenceNumber();
+ Log(options_.info_log, "[%s] Level-0 table #%lu: started",
+ cfd->GetName().c_str(), (unsigned long)meta.number);
+
+ Status s;
+ {
+ mutex_.Unlock();
+ s = BuildTable(dbname_, env_, *cfd->options(), storage_options_,
+ cfd->table_cache(), iter, &meta, cfd->internal_comparator(),
+ newest_snapshot, earliest_seqno_in_memtable,
+ GetCompressionFlush(*cfd->options()));
+ LogFlush(options_.info_log);
+ mutex_.Lock();
+ }
+
+ Log(options_.info_log, "[%s] Level-0 table #%lu: %lu bytes %s",
+ cfd->GetName().c_str(), (unsigned long)meta.number,
+ (unsigned long)meta.file_size, s.ToString().c_str());
+ delete iter;
+
+ pending_outputs_.erase(meta.number);
+
+ // Note that if file_size is zero, the file has been deleted and
+ // should not be added to the manifest.
+ int level = 0;
+ if (s.ok() && meta.file_size > 0) {
+ edit->AddFile(level, meta.number, meta.file_size,
+ meta.smallest, meta.largest,
+ meta.smallest_seqno, meta.largest_seqno);
+ }
+
+ InternalStats::CompactionStats stats;
+ stats.micros = env_->NowMicros() - start_micros;
+ stats.bytes_written = meta.file_size;
+ stats.files_out_levelnp1 = 1;
+ cfd->internal_stats()->AddCompactionStats(level, stats);
+ RecordTick(options_.statistics.get(), COMPACT_WRITE_BYTES, meta.file_size);
+ return s;
+ }
+
+ Status DBImpl::WriteLevel0Table(ColumnFamilyData* cfd,
+ autovector<MemTable*>& mems, VersionEdit* edit,
+ uint64_t* filenumber, LogBuffer* log_buffer) {
+ mutex_.AssertHeld();
+ const uint64_t start_micros = env_->NowMicros();
+ FileMetaData meta;
+ meta.number = versions_->NewFileNumber();
+ *filenumber = meta.number;
+ pending_outputs_.insert(meta.number);
+
+ const SequenceNumber newest_snapshot = snapshots_.GetNewest();
+ const SequenceNumber earliest_seqno_in_memtable =
+ mems[0]->GetFirstSequenceNumber();
+ Version* base = cfd->current();
+ base->Ref(); // it is likely that we do not need this reference
+ Status s;
+ {
+ mutex_.Unlock();
+ log_buffer->FlushBufferToLog();
+ std::vector<Iterator*> memtables;
+ for (MemTable* m : mems) {
+ Log(options_.info_log, "[%s] Flushing memtable with next log file: %lu\n",
+ cfd->GetName().c_str(), (unsigned long)m->GetNextLogNumber());
+ memtables.push_back(m->NewIterator(ReadOptions(), true));
+ }
+ Iterator* iter = NewMergingIterator(&cfd->internal_comparator(),
+ &memtables[0], memtables.size());
+ Log(options_.info_log, "[%s] Level-0 flush table #%lu: started",
+ cfd->GetName().c_str(), (unsigned long)meta.number);
+
+ s = BuildTable(dbname_, env_, *cfd->options(), storage_options_,
+ cfd->table_cache(), iter, &meta, cfd->internal_comparator(),
+ newest_snapshot, earliest_seqno_in_memtable,
+ GetCompressionFlush(*cfd->options()));
+ LogFlush(options_.info_log);
+ delete iter;
+ Log(options_.info_log, "[%s] Level-0 flush table #%lu: %lu bytes %s",
+ cfd->GetName().c_str(), (unsigned long)meta.number,
+ (unsigned long)meta.file_size, s.ToString().c_str());
+
+ if (!options_.disableDataSync) {
+ db_directory_->Fsync();
+ }
+ mutex_.Lock();
+ }
+ base->Unref();
+
+ // re-acquire the most current version
+ base = cfd->current();
+
+ // There could be multiple threads writing to its own level-0 file.
+ // The pending_outputs cannot be cleared here, otherwise this newly
+ // created file might not be considered a live file by another
+ // compaction thread that is concurrently deleting obsolete files.
+ // The pending_outputs can be cleared only after the new version is
+ // committed so that other threads can recognize this file as a
+ // valid one.
+ // pending_outputs_.erase(meta.number);
+
+ // Note that if file_size is zero, the file has been deleted and
+ // should not be added to the manifest.
+ int level = 0;
+ if (s.ok() && meta.file_size > 0) {
+ const Slice min_user_key = meta.smallest.user_key();
+ const Slice max_user_key = meta.largest.user_key();
+ // if we have more than 1 background thread, then we cannot
+ // insert files directly into higher levels because some other
+ // threads could be concurrently producing compacted files for
+ // that key range.
+ if (base != nullptr && options_.max_background_compactions <= 1 &&
+ cfd->options()->compaction_style == kCompactionStyleLevel) {
+ level = base->PickLevelForMemTableOutput(min_user_key, max_user_key);
+ }
+ edit->AddFile(level, meta.number, meta.file_size,
+ meta.smallest, meta.largest,
+ meta.smallest_seqno, meta.largest_seqno);
+ }
+
+ InternalStats::CompactionStats stats;
+ stats.micros = env_->NowMicros() - start_micros;
+ stats.bytes_written = meta.file_size;
+ cfd->internal_stats()->AddCompactionStats(level, stats);
+ RecordTick(options_.statistics.get(), COMPACT_WRITE_BYTES, meta.file_size);
+ return s;
+ }
+
+ Status DBImpl::FlushMemTableToOutputFile(ColumnFamilyData* cfd,
+ bool* madeProgress,
+ DeletionState& deletion_state,
+ LogBuffer* log_buffer) {
+ mutex_.AssertHeld();
+ assert(cfd->imm()->size() != 0);
+ assert(cfd->imm()->IsFlushPending());
+
+ // Save the contents of the earliest memtable as a new Table
+ uint64_t file_number;
+ autovector<MemTable*> mems;
+ cfd->imm()->PickMemtablesToFlush(&mems);
+ if (mems.empty()) {
+ LogToBuffer(log_buffer, "[%s] Nothing in memtable to flush",
+ cfd->GetName().c_str());
+ return Status::OK();
+ }
+
+ // record the logfile_number_ before we release the mutex.
+ // entries in mems are (implicitly) sorted in ascending order by their
+ // creation time. We will use the first memtable's `edit` to keep the meta
+ // info for this flush.
+ MemTable* m = mems[0];
+ VersionEdit* edit = m->GetEdits();
+ edit->SetPrevLogNumber(0);
+ // SetLogNumber(log_num) indicates that logs with numbers smaller than
+ // log_num will no longer be picked up for recovery.
+ edit->SetLogNumber(mems.back()->GetNextLogNumber());
+ edit->SetColumnFamily(cfd->GetID());
+
+ // This will release and re-acquire the mutex.
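+ // (Illustrative, hypothetical numbers: if the memtables picked above were
+ // backed by logs 4..6 and mems.back()->GetNextLogNumber() == 7, committing
+ // this edit records log number 7, making logs 4..6 eligible for the
+ // alive_log_files_ sweep further below.)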
+ Status s = WriteLevel0Table(cfd, mems, edit, &file_number, log_buffer);
+
+ if (s.ok() && shutting_down_.Acquire_Load() && cfd->IsDropped()) {
+ s = Status::ShutdownInProgress(
+ "Database shutdown or Column family drop during flush");
+ }
+
+ if (!s.ok()) {
+ cfd->imm()->RollbackMemtableFlush(mems, file_number, &pending_outputs_);
+ } else {
+ // Replace immutable memtable with the generated Table
+ s = cfd->imm()->InstallMemtableFlushResults(
+ cfd, mems, versions_.get(), &mutex_, options_.info_log.get(),
+ file_number, pending_outputs_, &deletion_state.memtables_to_free,
+ db_directory_.get(), log_buffer);
+ }
+
+ if (s.ok()) {
+ InstallSuperVersion(cfd, deletion_state);
+ if (madeProgress) {
+ *madeProgress = true;
+ }
+ Version::LevelSummaryStorage tmp;
+ LogToBuffer(log_buffer, "[%s] Level summary: %s\n", cfd->GetName().c_str(),
+ cfd->current()->LevelSummary(&tmp));
+
+ MaybeScheduleLogDBDeployStats();
+
+ if (disable_delete_obsolete_files_ == 0) {
+ // add to deletion state
+ while (alive_log_files_.size() &&
+ alive_log_files_.begin()->number < versions_->MinLogNumber()) {
+ const auto& earliest = *alive_log_files_.begin();
+ deletion_state.log_delete_files.push_back(earliest.number);
+ total_log_size_ -= earliest.size;
+ alive_log_files_.pop_front();
+ }
+ }
+ }
+
+ if (!s.ok() && !s.IsShutdownInProgress() && options_.paranoid_checks &&
+ bg_error_.ok()) {
+ // if a bad error happened (not ShutdownInProgress) and paranoid_checks is
+ // true, mark DB read-only
+ bg_error_ = s;
+ }
+ return s;
+ }
+
+ Status DBImpl::CompactRange(ColumnFamilyHandle* column_family,
+ const Slice* begin, const Slice* end,
+ bool reduce_level, int target_level) {
+ auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+ auto cfd = cfh->cfd();
+
+ Status s = FlushMemTable(cfd, FlushOptions());
+ if (!s.ok()) {
+ LogFlush(options_.info_log);
+ return s;
+ }
+
+ int max_level_with_files = 0;
+ {
+ MutexLock l(&mutex_);
+ Version* base = cfd->current();
+ for (int level = 1; level < cfd->NumberLevels(); level++) {
+ if (base->OverlapInLevel(level, begin, end)) {
+ max_level_with_files = level;
+ }
+ }
+ }
+ for (int level = 0; level <= max_level_with_files; level++) {
+ // in case the compaction is universal or if we're compacting the
+ // bottom-most level, the output level will be the same as the input one
+ if (cfd->options()->compaction_style == kCompactionStyleUniversal ||
+ cfd->options()->compaction_style == kCompactionStyleFIFO ||
+ level == max_level_with_files) {
+ s = RunManualCompaction(cfd, level, level, begin, end);
+ } else {
+ s = RunManualCompaction(cfd, level, level + 1, begin, end);
+ }
+ if (!s.ok()) {
+ LogFlush(options_.info_log);
+ return s;
+ }
+ }
+
+ if (reduce_level) {
+ s = ReFitLevel(cfd, max_level_with_files, target_level);
+ }
+ LogFlush(options_.info_log);
+
+ return s;
+ }
+
+ // return the same level if it cannot be moved
+ int DBImpl::FindMinimumEmptyLevelFitting(ColumnFamilyData* cfd, int level) {
+ mutex_.AssertHeld();
+ Version* current = cfd->current();
+ int minimum_level = level;
+ for (int i = level - 1; i > 0; --i) {
+ // stop if level i is not empty
+ if (current->NumLevelFiles(i) > 0) break;
+ // stop if level i is too small (cannot fit the level files)
+ if (cfd->compaction_picker()->MaxBytesForLevel(i) <
+ current->NumLevelBytes(level)) {
+ break;
+ }
+
+ minimum_level = i;
+ }
+ return minimum_level;
+ }
+
+ Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) {
+ assert(level < cfd->NumberLevels());
+
+ SuperVersion* superversion_to_free = nullptr;
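+ // Pre-allocate the replacement SuperVersion before taking the mutex so
+ // the install step below can swap it in without allocating while mutex_
+ // is held.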
+ SuperVersion* new_superversion = new SuperVersion();
+
+ mutex_.Lock();
+
+ // only allow one thread refitting
+ if (refitting_level_) {
+ mutex_.Unlock();
+ Log(options_.info_log, "ReFitLevel: another thread is refitting");
+ delete new_superversion;
+ return Status::NotSupported("another thread is refitting");
+ }
+ refitting_level_ = true;
+
+ // wait for all background threads to stop
+ bg_work_gate_closed_ = true;
+ while (bg_compaction_scheduled_ > 0 || bg_flush_scheduled_) {
+ Log(options_.info_log,
+ "ReFitLevel: waiting for background threads to stop: %d %d",
+ bg_compaction_scheduled_, bg_flush_scheduled_);
+ bg_cv_.Wait();
+ }
+
+ // move to a smaller level
+ int to_level = target_level;
+ if (target_level < 0) {
+ to_level = FindMinimumEmptyLevelFitting(cfd, level);
+ }
+
+ assert(to_level <= level);
+
+ Status status;
+ if (to_level < level) {
+ Log(options_.info_log, "[%s] Before refitting:\n%s", cfd->GetName().c_str(),
+ cfd->current()->DebugString().data());
+
+ VersionEdit edit;
+ edit.SetColumnFamily(cfd->GetID());
+ for (const auto& f : cfd->current()->files_[level]) {
+ edit.DeleteFile(level, f->number);
+ edit.AddFile(to_level, f->number, f->file_size, f->smallest, f->largest,
+ f->smallest_seqno, f->largest_seqno);
+ }
+ Log(options_.info_log, "[%s] Apply version edit:\n%s",
+ cfd->GetName().c_str(), edit.DebugString().data());
+
+ status = versions_->LogAndApply(cfd, &edit, &mutex_, db_directory_.get());
+ superversion_to_free = cfd->InstallSuperVersion(new_superversion, &mutex_);
+ new_superversion = nullptr;
+
+ Log(options_.info_log, "[%s] LogAndApply: %s\n", cfd->GetName().c_str(),
+ status.ToString().data());
+
+ if (status.ok()) {
+ Log(options_.info_log, "[%s] After refitting:\n%s",
+ cfd->GetName().c_str(), cfd->current()->DebugString().data());
+ }
+ }
+
+ refitting_level_ = false;
+ bg_work_gate_closed_ = false;
+
+ mutex_.Unlock();
+ delete superversion_to_free;
+ delete new_superversion;
+ return status;
+ }
+
+ int DBImpl::NumberLevels(ColumnFamilyHandle* column_family) {
+ auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+ return cfh->cfd()->NumberLevels();
+ }
+
+ int DBImpl::MaxMemCompactionLevel(ColumnFamilyHandle* column_family) {
+ auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+ return cfh->cfd()->options()->max_mem_compaction_level;
+ }
+
+ int DBImpl::Level0StopWriteTrigger(ColumnFamilyHandle* column_family) {
+ auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+ return cfh->cfd()->options()->level0_stop_writes_trigger;
+ }
+
+ Status DBImpl::Flush(const FlushOptions& options,
+ ColumnFamilyHandle* column_family) {
+ auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+ return FlushMemTable(cfh->cfd(), options);
+ }
+
+ SequenceNumber DBImpl::GetLatestSequenceNumber() const {
+ return versions_->LastSequence();
+ }
+
+ Status DBImpl::RunManualCompaction(ColumnFamilyData* cfd, int input_level,
+ int output_level, const Slice* begin,
+ const Slice* end) {
+ assert(input_level >= 0);
+
+ InternalKey begin_storage, end_storage;
+
+ ManualCompaction manual;
+ manual.cfd = cfd;
+ manual.input_level = input_level;
+ manual.output_level = output_level;
+ manual.done = false;
+ manual.in_progress = false;
+ // For universal compaction, we enforce every manual compaction to compact
+ // all files.
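+ // (Example with hypothetical keys: a CompactRange("a", "m") request on a
+ // universal-style column family is widened below to (nullptr, nullptr),
+ // i.e. the whole key space, since universal compaction always operates on
+ // entire level-0 files.)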
+ if (begin == nullptr ||
+ cfd->options()->compaction_style == kCompactionStyleUniversal ||
+ cfd->options()->compaction_style == kCompactionStyleFIFO) {
+ manual.begin = nullptr;
+ } else {
+ begin_storage = InternalKey(*begin, kMaxSequenceNumber, kValueTypeForSeek);
+ manual.begin = &begin_storage;
+ }
+ if (end == nullptr ||
+ cfd->options()->compaction_style == kCompactionStyleUniversal ||
+ cfd->options()->compaction_style == kCompactionStyleFIFO) {
+ manual.end = nullptr;
+ } else {
+ end_storage = InternalKey(*end, 0, static_cast<ValueType>(0));
+ manual.end = &end_storage;
+ }
+
+ MutexLock l(&mutex_);
+
+ // When a manual compaction arrives, temporarily disable scheduling of
+ // non-manual compactions and wait until the number of scheduled compaction
+ // jobs drops to zero. This is needed to ensure that this manual compaction
+ // can compact any range of keys/files.
+ //
+ // bg_manual_only_ is non-zero when at least one thread is inside
+ // RunManualCompaction(), i.e. during that time no other compaction will
+ // get scheduled (see MaybeScheduleFlushOrCompaction).
+ //
+ // Note that the following loop doesn't stop more than one thread calling
+ // RunManualCompaction() from getting to the second while loop below.
+ // However, only one of them will actually schedule compaction, while
+ // others will wait on a condition variable until it completes.
+
+ ++bg_manual_only_;
+ while (bg_compaction_scheduled_ > 0) {
+ Log(options_.info_log,
+ "[%s] Manual compaction waiting for all other scheduled background "
+ "compactions to finish",
+ cfd->GetName().c_str());
+ bg_cv_.Wait();
+ }
+
+ Log(options_.info_log, "[%s] Manual compaction starting",
+ cfd->GetName().c_str());
+
+ while (!manual.done && !shutting_down_.Acquire_Load() && bg_error_.ok()) {
+ assert(bg_manual_only_ > 0);
+ if (manual_compaction_ != nullptr) {
+ // Running either this or some other manual compaction
+ bg_cv_.Wait();
+ } else {
+ manual_compaction_ = &manual;
+ MaybeScheduleFlushOrCompaction();
+ }
+ }
+
+ assert(!manual.in_progress);
+ assert(bg_manual_only_ > 0);
+ --bg_manual_only_;
+ return manual.status;
+ }
+
+ Status DBImpl::FlushMemTable(ColumnFamilyData* cfd,
+ const FlushOptions& options) {
+ // nullptr batch means just wait for earlier writes to be done
+ Status s = Write(WriteOptions(), nullptr);
+ if (s.ok() && options.wait) {
+ // Wait until the flush completes
+ s = WaitForFlushMemTable(cfd);
+ }
+ return s;
+ }
+
+ Status DBImpl::WaitForFlushMemTable(ColumnFamilyData* cfd) {
+ Status s;
+ // Wait until the flush completes
+ MutexLock l(&mutex_);
+ while (cfd->imm()->size() > 0 && bg_error_.ok()) {
+ bg_cv_.Wait();
+ }
+ if (!bg_error_.ok()) {
+ s = bg_error_;
+ }
+ return s;
+ }
+
+ void DBImpl::MaybeScheduleFlushOrCompaction() {
+ mutex_.AssertHeld();
+ bg_schedule_needed_ = false;
+ if (bg_work_gate_closed_) {
+ // gate closed for background work
+ } else if (shutting_down_.Acquire_Load()) {
+ // DB is being deleted; no more background compactions
+ } else {
+ bool is_flush_pending = false;
+ // no need to refcount since we're under a mutex
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ if (cfd->imm()->IsFlushPending()) {
+ is_flush_pending = true;
+ }
+ }
+ if (is_flush_pending) {
+ // memtable flush needed
+ if (bg_flush_scheduled_ < options_.max_background_flushes) {
+ bg_flush_scheduled_++;
+ env_->Schedule(&DBImpl::BGWorkFlush, this, Env::Priority::HIGH);
+ } else if (options_.max_background_flushes > 0) {
+ bg_schedule_needed_ = true;
+ }
+ }
+ bool is_compaction_needed = false;
+ // no need to refcount since we're under a mutex
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ if (cfd->current()->NeedsCompaction()) {
+ is_compaction_needed = true;
+ break;
+ }
+ }
+
+ // Schedule BGWorkCompaction if there's a compaction pending (or a memtable
+ // flush, but the HIGH pool is not enabled)
+ // Do it only if max_background_compactions hasn't been reached and, in case
+ // bg_manual_only_ > 0, if it's a manual compaction.
+ if ((manual_compaction_ || is_compaction_needed ||
+ (is_flush_pending && options_.max_background_flushes == 0)) &&
+ (!bg_manual_only_ || manual_compaction_)) {
+ if (bg_compaction_scheduled_ < options_.max_background_compactions) {
+ bg_compaction_scheduled_++;
+ env_->Schedule(&DBImpl::BGWorkCompaction, this, Env::Priority::LOW);
+ } else {
+ bg_schedule_needed_ = true;
+ }
+ }
+ }
+ }
+
+ void DBImpl::BGWorkFlush(void* db) {
+ reinterpret_cast<DBImpl*>(db)->BackgroundCallFlush();
+ }
+
+ void DBImpl::BGWorkCompaction(void* db) {
+ reinterpret_cast<DBImpl*>(db)->BackgroundCallCompaction();
+ }
+
+ Status DBImpl::BackgroundFlush(bool* madeProgress,
+ DeletionState& deletion_state,
+ LogBuffer* log_buffer) {
+ mutex_.AssertHeld();
+ // call_status is a failure if at least one flush was a failure. Even if
+ // flushing one column family reports a failure, we will continue flushing
+ // other column families. However, call_status will be a failure in that
+ // case.
+ Status call_status;
+ // refcounting in iteration
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ cfd->Ref();
+ Status flush_status;
+ while (flush_status.ok() && cfd->imm()->IsFlushPending()) {
+ LogToBuffer(
+ log_buffer,
+ "BackgroundCallFlush doing FlushMemTableToOutputFile with column "
+ "family [%s], flush slots available %d",
+ cfd->GetName().c_str(),
+ options_.max_background_flushes - bg_flush_scheduled_);
+ flush_status = FlushMemTableToOutputFile(cfd, madeProgress,
+ deletion_state, log_buffer);
+ }
+ if (call_status.ok() && !flush_status.ok()) {
+ call_status = flush_status;
+ }
+ cfd->Unref();
+ }
+ versions_->GetColumnFamilySet()->FreeDeadColumnFamilies();
+ return call_status;
+ }
+
+ void DBImpl::BackgroundCallFlush() {
+ bool madeProgress = false;
+ DeletionState deletion_state(true);
+ assert(bg_flush_scheduled_);
+
+ LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, options_.info_log.get());
+ {
+ MutexLock l(&mutex_);
+
+ Status s;
+ if (!shutting_down_.Acquire_Load()) {
+ s = BackgroundFlush(&madeProgress, deletion_state, &log_buffer);
+ if (!s.ok()) {
+ // Wait a little bit before retrying background flush in
+ // case this is an environmental problem and we do not want to
+ // chew up resources for failed flushes for the duration of
+ // the problem.
+ uint64_t error_cnt = default_cf_handle_->cfd()
+ ->internal_stats()
+ ->BumpAndGetBackgroundErrorCount();
+ bg_cv_.SignalAll(); // In case a waiter can proceed despite the error
+ mutex_.Unlock();
+ Log(options_.info_log,
+ "Waiting after background flush error: %s, "
+ "Accumulated background error counts: %" PRIu64,
+ s.ToString().c_str(), error_cnt);
+ log_buffer.FlushBufferToLog();
+ LogFlush(options_.info_log);
+ env_->SleepForMicroseconds(1000000);
+ mutex_.Lock();
+ }
+ }
+
+ // If !s.ok(), this means that Flush failed.
+ // In that case, we want to delete all obsolete files and we force
+ // FindObsoleteFiles()
+ FindObsoleteFiles(deletion_state, !s.ok());
+ // delete unnecessary files if any, this is done outside the mutex
+ if (deletion_state.HaveSomethingToDelete() || !log_buffer.IsEmpty()) {
+ mutex_.Unlock();
+ // Have to flush the info logs before bg_flush_scheduled_--
+ // because if bg_flush_scheduled_ becomes 0 and the lock is
+ // released, the destructor of DB can kick in and destroy all the
+ // state of DB, so info_log might not be available after that point.
+ // The same applies to accessing other state that DB owns.
+ log_buffer.FlushBufferToLog();
+ if (deletion_state.HaveSomethingToDelete()) {
+ PurgeObsoleteFiles(deletion_state);
+ }
+ mutex_.Lock();
+ }
+
+ bg_flush_scheduled_--;
+ // Any time the mutex is released after finding the work to do, another
+ // thread might execute MaybeScheduleFlushOrCompaction(). It is possible
+ // that there is a pending job but it is not scheduled because of the
+ // max thread limit.
+ if (madeProgress || bg_schedule_needed_) {
+ MaybeScheduleFlushOrCompaction();
+ }
+ bg_cv_.SignalAll();
+ // IMPORTANT: there should be no code after calling SignalAll. This call may
+ // signal the DB destructor that it's OK to proceed with destruction. In
+ // that case, all DB variables will be deallocated and referencing them
+ // will cause trouble.
+ }
+ }
+
+ void DBImpl::BackgroundCallCompaction() {
+ bool madeProgress = false;
+ DeletionState deletion_state(true);
+
+ MaybeDumpStats();
+ LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, options_.info_log.get());
+ {
+ MutexLock l(&mutex_);
+ assert(bg_compaction_scheduled_);
+ Status s;
+ if (!shutting_down_.Acquire_Load()) {
+ s = BackgroundCompaction(&madeProgress, deletion_state, &log_buffer);
+ if (!s.ok()) {
+ // Wait a little bit before retrying background compaction in
+ // case this is an environmental problem and we do not want to
+ // chew up resources for failed compactions for the duration of
+ // the problem.
+ uint64_t error_cnt = default_cf_handle_->cfd()
+ ->internal_stats()
+ ->BumpAndGetBackgroundErrorCount();
+ bg_cv_.SignalAll(); // In case a waiter can proceed despite the error
+ mutex_.Unlock();
+ log_buffer.FlushBufferToLog();
+ Log(options_.info_log,
+ "Waiting after background compaction error: %s, "
+ "Accumulated background error counts: %" PRIu64,
+ s.ToString().c_str(), error_cnt);
+ LogFlush(options_.info_log);
+ env_->SleepForMicroseconds(1000000);
+ mutex_.Lock();
+ }
+ }
+
+ // If !s.ok(), this means that Compaction failed. In that case, we want
+ // to delete all obsolete files we might have created and we force
+ // FindObsoleteFiles(). This is because deletion_state does not catch
+ // all created files if compaction failed.
+ FindObsoleteFiles(deletion_state, !s.ok());
+
+ // delete unnecessary files if any, this is done outside the mutex
+ if (deletion_state.HaveSomethingToDelete() || !log_buffer.IsEmpty()) {
+ mutex_.Unlock();
+ // Have to flush the info logs before bg_compaction_scheduled_--
+ // because if bg_compaction_scheduled_ becomes 0 and the lock is
+ // released, the destructor of DB can kick in and destroy all the
+ // state of DB, so info_log might not be available after that point.
+ // The same applies to accessing other state that DB owns.
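+ // (Sketch of the interaction assumed here: ~DBImpl typically holds mutex_
+ // and waits on bg_cv_ until bg_compaction_scheduled_ and
+ // bg_flush_scheduled_ drop to zero, so the decrement below can be the
+ // last event keeping this object alive.)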
+ log_buffer.FlushBufferToLog();
+ if (deletion_state.HaveSomethingToDelete()) {
+ PurgeObsoleteFiles(deletion_state);
+ }
+ mutex_.Lock();
+ }
+
+ bg_compaction_scheduled_--;
+
+ MaybeScheduleLogDBDeployStats();
+
+ versions_->GetColumnFamilySet()->FreeDeadColumnFamilies();
+
+ // Previous compaction may have produced too many files in a level,
+ // so reschedule another compaction if we made progress in the
+ // last compaction.
+ //
+ // Also, any time the mutex is released after finding the work to do,
+ // another thread might execute MaybeScheduleFlushOrCompaction(). It is
+ // possible that there is a pending job but it is not scheduled because of
+ // the max thread limit.
+ if (madeProgress || bg_schedule_needed_) {
+ MaybeScheduleFlushOrCompaction();
+ }
+ if (madeProgress || bg_compaction_scheduled_ == 0 || bg_manual_only_ > 0) {
+ // signal if
+ // * madeProgress -- need to wakeup MakeRoomForWrite
+ // * bg_compaction_scheduled_ == 0 -- need to wakeup ~DBImpl
+ // * bg_manual_only_ > 0 -- need to wakeup RunManualCompaction
+ // If none of this is true, there is no need to signal since nobody is
+ // waiting for it
+ bg_cv_.SignalAll();
+ }
+ // IMPORTANT: there should be no code after calling SignalAll. This call may
+ // signal the DB destructor that it's OK to proceed with destruction. In
+ // that case, all DB variables will be deallocated and referencing them
+ // will cause trouble.
+ }
+ }
+
+ Status DBImpl::BackgroundCompaction(bool* madeProgress,
+ DeletionState& deletion_state,
+ LogBuffer* log_buffer) {
+ *madeProgress = false;
+ mutex_.AssertHeld();
+
+ bool is_manual = (manual_compaction_ != nullptr) &&
+ (manual_compaction_->in_progress == false);
+
+ if (is_manual) {
+ // another thread cannot pick up the same work
+ manual_compaction_->in_progress = true;
+ }
+
+ // FLUSH preempts compaction
+ Status flush_stat;
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ while (cfd->imm()->IsFlushPending()) {
+ LogToBuffer(
+ log_buffer,
+ "BackgroundCompaction doing FlushMemTableToOutputFile, "
+ "compaction slots available %d",
+ options_.max_background_compactions - bg_compaction_scheduled_);
+ cfd->Ref();
+ flush_stat = FlushMemTableToOutputFile(cfd, madeProgress, deletion_state,
+ log_buffer);
+ cfd->Unref();
+ if (!flush_stat.ok()) {
+ if (is_manual) {
+ manual_compaction_->status = flush_stat;
+ manual_compaction_->done = true;
+ manual_compaction_->in_progress = false;
+ manual_compaction_ = nullptr;
+ }
+ return flush_stat;
+ }
+ }
+ }
+
+ unique_ptr<Compaction> c;
+ InternalKey manual_end_storage;
+ InternalKey* manual_end = &manual_end_storage;
+ if (is_manual) {
+ ManualCompaction* m = manual_compaction_;
+ assert(m->in_progress);
+ c.reset(m->cfd->CompactRange(m->input_level, m->output_level, m->begin,
+ m->end, &manual_end));
+ if (!c) {
+ m->done = true;
+ }
+ LogToBuffer(log_buffer,
+ "[%s] Manual compaction from level-%d to level-%d from %s .. "
+ "%s; will stop at %s\n",
+ m->cfd->GetName().c_str(), m->input_level, m->output_level,
+ (m->begin ? m->begin->DebugString().c_str() : "(begin)"),
+ (m->end ? m->end->DebugString().c_str() : "(end)"),
+ ((m->done || manual_end == nullptr)
+ ? "(end)"
+ : manual_end->DebugString().c_str()));
+ } else {
+ // no need to refcount in iteration since it's always under a mutex
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ if (!cfd->options()->disable_auto_compactions) {
+ c.reset(cfd->PickCompaction(log_buffer));
+ if (c != nullptr) {
+ // update statistics
+ MeasureTime(options_.statistics.get(), NUM_FILES_IN_SINGLE_COMPACTION,
+ c->inputs(0)->size());
+ break;
+ }
+ }
+ }
+ }
+
+ Status status;
+ if (!c) {
+ // Nothing to do
+ LogToBuffer(log_buffer, "Compaction nothing to do");
+ } else if (c->IsDeletionCompaction()) {
+ // TODO(icanadi) Do we want to honor snapshots here? i.e. not delete old
+ // file if there is an alive snapshot pointing to it
+ assert(c->num_input_files(1) == 0);
+ assert(c->level() == 0);
+ assert(c->column_family_data()->options()->compaction_style ==
+ kCompactionStyleFIFO);
+ for (const auto& f : *c->inputs(0)) {
+ c->edit()->DeleteFile(c->level(), f->number);
+ }
+ status = versions_->LogAndApply(c->column_family_data(), c->edit(), &mutex_,
+ db_directory_.get());
+ InstallSuperVersion(c->column_family_data(), deletion_state);
+ LogToBuffer(log_buffer, "[%s] Deleted %d files\n",
+ c->column_family_data()->GetName().c_str(),
+ c->num_input_files(0));
+ c->ReleaseCompactionFiles(status);
+ *madeProgress = true;
+ } else if (!is_manual && c->IsTrivialMove()) {
+ // Move file to next level
+ assert(c->num_input_files(0) == 1);
+ FileMetaData* f = c->input(0, 0);
+ c->edit()->DeleteFile(c->level(), f->number);
+ c->edit()->AddFile(c->level() + 1, f->number, f->file_size,
+ f->smallest, f->largest,
+ f->smallest_seqno, f->largest_seqno);
+ status = versions_->LogAndApply(c->column_family_data(), c->edit(), &mutex_,
+ db_directory_.get());
+ InstallSuperVersion(c->column_family_data(), deletion_state);
+
+ Version::LevelSummaryStorage tmp;
+ LogToBuffer(log_buffer, "[%s] Moved #%lld to level-%d %lld bytes %s: %s\n",
+ c->column_family_data()->GetName().c_str(),
+ static_cast<long long>(f->number), c->level() + 1,
+ static_cast<long long>(f->file_size),
+ status.ToString().c_str(),
+ c->input_version()->LevelSummary(&tmp));
+ c->ReleaseCompactionFiles(status);
+ *madeProgress = true;
+ } else {
+ MaybeScheduleFlushOrCompaction(); // do more compaction work in parallel.
+ CompactionState* compact = new CompactionState(c.get());
+ status = DoCompactionWork(compact, deletion_state, log_buffer);
+ CleanupCompaction(compact, status);
+ c->ReleaseCompactionFiles(status);
+ c->ReleaseInputs();
+ *madeProgress = true;
+ }
+ c.reset();
+
+ if (status.ok()) {
+ // Done
+ } else if (shutting_down_.Acquire_Load()) {
+ // Ignore compaction errors found during shutting down
+ } else {
+ Log(InfoLogLevel::WARN_LEVEL, options_.info_log, "Compaction error: %s",
+ status.ToString().c_str());
+ if (options_.paranoid_checks && bg_error_.ok()) {
+ bg_error_ = status;
+ }
+ }
+
+ if (is_manual) {
+ ManualCompaction* m = manual_compaction_;
+ if (!status.ok()) {
+ m->status = status;
+ m->done = true;
+ }
+ // For universal compaction:
+ // Universal compaction always happens at level 0, so one
+ // compaction will pick up all overlapped files. No files will be
+ // filtered out due to size limit and left for a successive compaction.
+ // So we can safely conclude the current compaction.
+ //
+ // Also note that, if we don't stop here, then the current compaction
+ // writes a new file back to level 0, which will be used in successive
+ // compaction. Hence the manual compaction will never finish.
+ //
+ // Stop the compaction if manual_end points to nullptr -- this means
+ // that we compacted the whole range. manual_end should always point
+ // to nullptr in case of universal compaction
+ if (manual_end == nullptr) {
+ m->done = true;
+ }
+ if (!m->done) {
+ // We only compacted part of the requested range. Update *m
+ // to the range that is left to be compacted.
+ // Universal and FIFO compactions should always compact the whole range
+ assert(m->cfd->options()->compaction_style != kCompactionStyleUniversal);
+ assert(m->cfd->options()->compaction_style != kCompactionStyleFIFO);
+ m->tmp_storage = *manual_end;
+ m->begin = &m->tmp_storage;
+ }
+ m->in_progress = false; // not being processed anymore
+ manual_compaction_ = nullptr;
+ }
+ return status;
+ }
+
+ void DBImpl::CleanupCompaction(CompactionState* compact, Status status) {
+ mutex_.AssertHeld();
+ if (compact->builder != nullptr) {
+ // May happen if we get a shutdown call in the middle of compaction
+ compact->builder->Abandon();
+ compact->builder.reset();
+ } else {
+ assert(compact->outfile == nullptr);
+ }
+ for (size_t i = 0; i < compact->outputs.size(); i++) {
+ const CompactionState::Output& out = compact->outputs[i];
+ pending_outputs_.erase(out.number);
+
+ // If this file was inserted into the table cache then remove
+ // it here because this compaction was not committed.
+ if (!status.ok()) {
+ TableCache::Evict(table_cache_.get(), out.number);
+ }
+ }
+ delete compact;
+ }
+
+ // Allocate the file numbers for the output files. We allocate as
+ // many output file numbers as there are files in level+1 (at least one).
+ // Insert them into pending_outputs so that they do not get deleted.
+ void DBImpl::AllocateCompactionOutputFileNumbers(CompactionState* compact) {
+ mutex_.AssertHeld();
+ assert(compact != nullptr);
+ assert(compact->builder == nullptr);
+ int filesNeeded = compact->compaction->num_input_files(1);
+ for (int i = 0; i < std::max(filesNeeded, 1); i++) {
+ uint64_t file_number = versions_->NewFileNumber();
+ pending_outputs_.insert(file_number);
+ compact->allocated_file_numbers.push_back(file_number);
+ }
+ }
+
+ // Frees up unused file numbers.
+ void DBImpl::ReleaseCompactionUnusedFileNumbers(CompactionState* compact) {
+ mutex_.AssertHeld();
+ for (const auto file_number : compact->allocated_file_numbers) {
+ pending_outputs_.erase(file_number);
+ }
+ }
+
+ Status DBImpl::OpenCompactionOutputFile(CompactionState* compact) {
+ assert(compact != nullptr);
+ assert(compact->builder == nullptr);
+ uint64_t file_number;
+ // If we have not yet exhausted the pre-allocated file numbers,
+ // then use the one from the front. Otherwise, we have to acquire
+ // the heavyweight lock and allocate a new file number.
+ if (!compact->allocated_file_numbers.empty()) {
+ file_number = compact->allocated_file_numbers.front();
+ compact->allocated_file_numbers.pop_front();
+ } else {
+ mutex_.Lock();
+ file_number = versions_->NewFileNumber();
+ pending_outputs_.insert(file_number);
+ mutex_.Unlock();
+ }
+ CompactionState::Output out;
+ out.number = file_number;
+ out.smallest.Clear();
+ out.largest.Clear();
+ out.smallest_seqno = out.largest_seqno = 0;
+ compact->outputs.push_back(out);
+
+ // Make the output file
+ std::string fname = TableFileName(dbname_, file_number);
+ Status s = env_->NewWritableFile(fname, &compact->outfile, storage_options_);
+
+ if (s.ok()) {
+ // Over-estimate slightly so we don't end up just barely crossing
+ // the threshold.
+ ColumnFamilyData* cfd = compact->compaction->column_family_data(); + compact->outfile->SetPreallocationBlockSize( + 1.1 * cfd->compaction_picker()->MaxFileSizeForLevel( + compact->compaction->output_level())); + + CompressionType compression_type = + GetCompressionType(*cfd->options(), compact->compaction->output_level(), + compact->compaction->enable_compression()); + + compact->builder.reset( + NewTableBuilder(*cfd->options(), cfd->internal_comparator(), + compact->outfile.get(), compression_type)); + } + LogFlush(options_.info_log); + return s; +} + +Status DBImpl::FinishCompactionOutputFile(CompactionState* compact, + Iterator* input) { + assert(compact != nullptr); + assert(compact->outfile); + assert(compact->builder != nullptr); + + const uint64_t output_number = compact->current_output()->number; + assert(output_number != 0); + + // Check for iterator errors + Status s = input->status(); + const uint64_t current_entries = compact->builder->NumEntries(); + if (s.ok()) { + s = compact->builder->Finish(); + } else { + compact->builder->Abandon(); + } + const uint64_t current_bytes = compact->builder->FileSize(); + compact->current_output()->file_size = current_bytes; + compact->total_bytes += current_bytes; + compact->builder.reset(); + + // Finish and check for file errors + if (s.ok() && !options_.disableDataSync) { + if (options_.use_fsync) { + StopWatch sw(env_, options_.statistics.get(), + COMPACTION_OUTFILE_SYNC_MICROS, false); + s = compact->outfile->Fsync(); + } else { + StopWatch sw(env_, options_.statistics.get(), + COMPACTION_OUTFILE_SYNC_MICROS, false); + s = compact->outfile->Sync(); + } + } + if (s.ok()) { + s = compact->outfile->Close(); + } + compact->outfile.reset(); + + if (s.ok() && current_entries > 0) { + // Verify that the table is usable + ColumnFamilyData* cfd = compact->compaction->column_family_data(); + FileMetaData meta(output_number, current_bytes); + Iterator* iter = cfd->table_cache()->NewIterator( + ReadOptions(), storage_options_, cfd->internal_comparator(), meta); + s = iter->status(); + delete iter; + if (s.ok()) { + Log(options_.info_log, "[%s] Generated table #%lu: %lu keys, %lu bytes", + cfd->GetName().c_str(), (unsigned long)output_number, + (unsigned long)current_entries, (unsigned long)current_bytes); + } + } + return s; +} + + +Status DBImpl::InstallCompactionResults(CompactionState* compact, + LogBuffer* log_buffer) { + mutex_.AssertHeld(); + + // paranoia: verify that the files that we started with + // still exist in the current version and in the same original level. + // This ensures that a concurrent compaction did not erroneously + // pick the same files to compact. 
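+ // (Illustrative failure this catches, hypothetical scenario: if a racing
+ // LogAndApply had already removed one of this job's level-n inputs from
+ // the current version, committing the edit below would corrupt the
+ // MANIFEST; aborting with Corruption here is the cheap way out.)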
+ if (!versions_->VerifyCompactionFileConsistency(compact->compaction)) {
+ Log(options_.info_log, "[%s] Compaction %d@%d + %d@%d files aborted",
+ compact->compaction->column_family_data()->GetName().c_str(),
+ compact->compaction->num_input_files(0), compact->compaction->level(),
+ compact->compaction->num_input_files(1),
+ compact->compaction->output_level());
+ return Status::Corruption("Compaction input files inconsistent");
+ }
+
+ LogToBuffer(log_buffer, "[%s] Compacted %d@%d + %d@%d files => %lld bytes",
+ compact->compaction->column_family_data()->GetName().c_str(),
+ compact->compaction->num_input_files(0),
+ compact->compaction->level(),
+ compact->compaction->num_input_files(1),
+ compact->compaction->output_level(),
+ static_cast<long long>(compact->total_bytes));
+
+ // Add compaction outputs
+ compact->compaction->AddInputDeletions(compact->compaction->edit());
+ for (size_t i = 0; i < compact->outputs.size(); i++) {
+ const CompactionState::Output& out = compact->outputs[i];
+ compact->compaction->edit()->AddFile(
+ compact->compaction->output_level(), out.number, out.file_size,
+ out.smallest, out.largest, out.smallest_seqno, out.largest_seqno);
+ }
+ return versions_->LogAndApply(compact->compaction->column_family_data(),
+ compact->compaction->edit(), &mutex_,
+ db_directory_.get());
+ }
+
+ // Given a sequence number, return the sequence number of the
+ // earliest snapshot that this sequence number is visible in.
+ // The snapshots themselves are arranged in ascending order of
+ // sequence numbers.
+ // Employ a sequential search because the total number of
+ // snapshots is typically small.
+ inline SequenceNumber DBImpl::findEarliestVisibleSnapshot(
+ SequenceNumber in, std::vector<SequenceNumber>& snapshots,
+ SequenceNumber* prev_snapshot) {
+ if (!IsSnapshotSupported()) {
+ return 0;
+ }
+ SequenceNumber prev __attribute__((unused)) = 0;
+ for (const auto cur : snapshots) {
+ assert(prev <= cur);
+ if (cur >= in) {
+ *prev_snapshot = prev;
+ return cur;
+ }
+ prev = cur; // assignment
+ assert(prev);
+ }
+ Log(options_.info_log,
+ "Looking for seqid %lu but maxseqid is %lu",
+ (unsigned long)in,
+ (unsigned long)snapshots[snapshots.size()-1]);
+ assert(0);
+ return 0;
+ }
+
+ uint64_t DBImpl::CallFlushDuringCompaction(ColumnFamilyData* cfd,
+ DeletionState& deletion_state,
+ LogBuffer* log_buffer) {
+ if (cfd->imm()->imm_flush_needed.NoBarrier_Load() != nullptr) {
+ const uint64_t imm_start = env_->NowMicros();
+ mutex_.Lock();
+ if (cfd->imm()->IsFlushPending()) {
+ cfd->Ref();
+ FlushMemTableToOutputFile(cfd, nullptr, deletion_state, log_buffer);
+ cfd->Unref();
+ bg_cv_.SignalAll(); // Wakeup MakeRoomForWrite() if necessary
+ }
+ mutex_.Unlock();
+ log_buffer->FlushBufferToLog();
+ return env_->NowMicros() - imm_start;
+ }
+ return 0;
+ }
+
+ Status DBImpl::ProcessKeyValueCompaction(
+ SequenceNumber visible_at_tip,
+ SequenceNumber earliest_snapshot,
+ SequenceNumber latest_snapshot,
+ DeletionState& deletion_state,
+ bool bottommost_level,
+ int64_t& imm_micros,
+ Iterator* input,
+ CompactionState* compact,
+ bool is_compaction_v2,
+ LogBuffer* log_buffer) {
+ size_t combined_idx = 0;
+ Status status;
+ std::string compaction_filter_value;
+ ParsedInternalKey ikey;
+ IterKey current_user_key;
+ bool has_current_user_key = false;
+ IterKey delete_key;
+ SequenceNumber last_sequence_for_key __attribute__((unused)) =
+ kMaxSequenceNumber;
+ SequenceNumber visible_in_snapshot = kMaxSequenceNumber;
+ ColumnFamilyData* cfd = compact->compaction->column_family_data();
+ MergeHelper merge(
+ cfd->user_comparator(), cfd->options()->merge_operator.get(),
+ options_.info_log.get(), cfd->options()->min_partial_merge_operands,
+ false /* internal key corruption is expected */);
+ auto compaction_filter = cfd->options()->compaction_filter;
+ std::unique_ptr<CompactionFilter> compaction_filter_from_factory = nullptr;
+ if (!compaction_filter) {
+ auto context = compact->GetFilterContextV1();
+ compaction_filter_from_factory =
+ cfd->options()->compaction_filter_factory->CreateCompactionFilter(
+ context);
+ compaction_filter = compaction_filter_from_factory.get();
+ }
+
+ while (input->Valid() && !shutting_down_.Acquire_Load() &&
+ !cfd->IsDropped()) {
+ // FLUSH preempts compaction
+ // TODO(icanadi) this currently only checks if flush is necessary on
+ // compacting column family. we should also check if flush is necessary
+ // on other column families, too
+ imm_micros += CallFlushDuringCompaction(cfd, deletion_state, log_buffer);
+
+ Slice key;
+ Slice value;
+ // If is_compaction_v2 is on, kv-pairs are reset to the prefix batch.
+ // This prefix batch should contain results after calling
+ // compaction_filter_v2.
+ //
+ // If is_compaction_v2 is off, this function will go through all the
+ // kv-pairs in input.
+ if (!is_compaction_v2) {
+ key = input->key();
+ value = input->value();
+ } else {
+ if (combined_idx >= compact->combined_key_buf_.size()) {
+ break;
+ }
+ assert(combined_idx < compact->combined_key_buf_.size());
+ key = compact->combined_key_buf_[combined_idx];
+ value = compact->combined_value_buf_[combined_idx];
+
+ ++combined_idx;
+ }
+
+ if (compact->compaction->ShouldStopBefore(key) &&
+ compact->builder != nullptr) {
+ status = FinishCompactionOutputFile(compact, input);
+ if (!status.ok()) {
+ break;
+ }
+ }
+
+ // Handle key/value, add to state, etc.
+ bool drop = false;
+ bool current_entry_is_merging = false;
+ if (!ParseInternalKey(key, &ikey)) {
+ // Do not hide error keys
+ // TODO: error key stays in db forever? Figure out the intention/rationale
+ // v10 error v8 : we cannot hide v8 even though it's pretty obvious.
+ current_user_key.Clear();
+ has_current_user_key = false;
+ last_sequence_for_key = kMaxSequenceNumber;
+ visible_in_snapshot = kMaxSequenceNumber;
+ } else {
+ if (!has_current_user_key ||
+ cfd->user_comparator()->Compare(ikey.user_key,
+ current_user_key.GetKey()) != 0) {
+ // First occurrence of this user key
+ current_user_key.SetKey(ikey.user_key);
+ has_current_user_key = true;
+ last_sequence_for_key = kMaxSequenceNumber;
+ visible_in_snapshot = kMaxSequenceNumber;
+ // apply the compaction filter to the first occurrence of the user key
+ if (compaction_filter && !is_compaction_v2 &&
+ ikey.type == kTypeValue &&
+ (visible_at_tip || ikey.sequence > latest_snapshot)) {
+ // If the user has specified a compaction filter and the sequence
+ // number is greater than any external snapshot, then invoke the
+ // filter.
+ // If the return value of the compaction filter is true, replace
+ // the entry with a delete marker.
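+ // (Minimal sketch of a user-side filter this call site would invoke; the
+ // class name and TTL logic are hypothetical:
+ //
+ //   class TtlFilter : public CompactionFilter {
+ //    public:
+ //     bool Filter(int level, const Slice& key, const Slice& value,
+ //                 std::string* new_value,
+ //                 bool* value_changed) const override {
+ //       // returning true asks compaction to drop this key
+ //       return DecodeTimestamp(value) + kTtlSeconds < NowSeconds();
+ //     }
+ //     const char* Name() const override { return "TtlFilter"; }
+ //   };
+ // )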
+ bool value_changed = false;
+ compaction_filter_value.clear();
+ bool to_delete = compaction_filter->Filter(
+ compact->compaction->level(), ikey.user_key, value,
+ &compaction_filter_value, &value_changed);
+ if (to_delete) {
+ // make a copy of the original key and convert it to a delete
+ delete_key.SetInternalKey(ExtractUserKey(key), ikey.sequence,
+ kTypeDeletion);
+ // anchor the key again
+ key = delete_key.GetKey();
+ // needed because ikey is backed by key
+ ParseInternalKey(key, &ikey);
+ // no value associated with delete
+ value.clear();
+ RecordTick(options_.statistics.get(), COMPACTION_KEY_DROP_USER);
+ } else if (value_changed) {
+ value = compaction_filter_value;
+ }
+ }
+ }
+
+ // If there are no snapshots, then this kv affects visibility at tip.
+ // Otherwise, search through all existing snapshots to find
+ // the earliest snapshot that is affected by this kv.
+ SequenceNumber prev_snapshot = 0; // 0 means no previous snapshot
+ SequenceNumber visible = visible_at_tip ?
+ visible_at_tip :
+ findEarliestVisibleSnapshot(ikey.sequence,
+ compact->existing_snapshots,
+ &prev_snapshot);
+
+ if (visible_in_snapshot == visible) {
+ // If the earliest snapshot in which this key is visible is the
+ // same as the visibility of a previous instance of the same key,
+ // then this kv is not visible in any snapshot.
+ // Hidden by a newer entry for the same user key
+ // TODO: why not > ?
+ assert(last_sequence_for_key >= ikey.sequence);
+ drop = true; // (A)
+ RecordTick(options_.statistics.get(), COMPACTION_KEY_DROP_NEWER_ENTRY);
+ } else if (ikey.type == kTypeDeletion &&
+ ikey.sequence <= earliest_snapshot &&
+ compact->compaction->IsBaseLevelForKey(ikey.user_key)) {
+ // For this user key:
+ // (1) there is no data in higher levels
+ // (2) data in lower levels will have larger sequence numbers
+ // (3) data in layers that are being compacted here and have
+ // smaller sequence numbers will be dropped in the next
+ // few iterations of this loop (by rule (A) above).
+ // Therefore this deletion marker is obsolete and can be dropped.
+ drop = true;
+ RecordTick(options_.statistics.get(), COMPACTION_KEY_DROP_OBSOLETE);
+ } else if (ikey.type == kTypeMerge) {
+ // We know the merge type entry is not hidden, otherwise we would
+ // have hit (A)
+ // We encapsulate the merge related state machine in a different
+ // object to minimize change to the existing flow. It turns out this
+ // logic could also be nicely re-used for memtable flush purge
+ // optimization in BuildTable.
+ int steps = 0;
+ merge.MergeUntil(input, prev_snapshot, bottommost_level,
+ options_.statistics.get(), &steps);
+ // Skip the Merge ops
+ combined_idx = combined_idx - 1 + steps;
+
+ current_entry_is_merging = true;
+ if (merge.IsSuccess()) {
+ // Successfully found Put/Delete/(end-of-key-range) while merging
+ // Get the merge result
+ key = merge.key();
+ ParseInternalKey(key, &ikey);
+ value = merge.value();
+ } else {
+ // Did not find a Put/Delete/(end-of-key-range) while merging
+ // We now have some stack of merge operands to write out.
+ // NOTE: key, value, and ikey are now referring to old entries.
+ // These will be correctly set below.
+ assert(!merge.keys().empty());
+ assert(merge.keys().size() == merge.values().size());
+
+ // Hack to make sure last_sequence_for_key is correct
+ ParseInternalKey(merge.keys().front(), &ikey);
+ }
+ }
+
+ last_sequence_for_key = ikey.sequence;
+ visible_in_snapshot = visible;
+ }
+
+ if (!drop) {
+ // We may write a single key (e.g.: for Put/Delete or successful merge).
+ // Or we may instead have to write a sequence/list of keys.
+ // We have to write a sequence iff we have an unsuccessful merge
+ bool has_merge_list = current_entry_is_merging && !merge.IsSuccess();
+ const std::deque<std::string>* keys = nullptr;
+ const std::deque<std::string>* values = nullptr;
+ std::deque<std::string>::const_reverse_iterator key_iter;
+ std::deque<std::string>::const_reverse_iterator value_iter;
+ if (has_merge_list) {
+ keys = &merge.keys();
+ values = &merge.values();
+ key_iter = keys->rbegin(); // The back (*rbegin()) is the first key
+ value_iter = values->rbegin();
+
+ key = Slice(*key_iter);
+ value = Slice(*value_iter);
+ }
+
+ // If we have a list of keys to write, traverse the list.
+ // If we have a single key to write, simply write that key.
+ while (true) {
+ // Invariant: key, value, ikey will always be the next entry to write
+ char* kptr = (char*)key.data();
+ std::string kstr;
+
+ // Zeroing out the sequence number leads to better compression.
+ // If this is the bottommost level (no files in lower levels)
+ // and the earliest snapshot is larger than this seqno
+ // then we can squash the seqno to zero.
+ if (bottommost_level && ikey.sequence < earliest_snapshot &&
+ ikey.type != kTypeMerge) {
+ assert(ikey.type != kTypeDeletion);
+ // make a copy because updating in place would cause problems
+ // with the priority queue that is managing the input key iterator
+ kstr.assign(key.data(), key.size());
+ kptr = (char *)kstr.c_str();
+ UpdateInternalKey(kptr, key.size(), (uint64_t)0, ikey.type);
+ }
+
+ Slice newkey(kptr, key.size());
+ assert((key.clear(), 1)); // we do not need 'key' anymore
+
+ // Open output file if necessary
+ if (compact->builder == nullptr) {
+ status = OpenCompactionOutputFile(compact);
+ if (!status.ok()) {
+ break;
+ }
+ }
+
+ SequenceNumber seqno = GetInternalKeySeqno(newkey);
+ if (compact->builder->NumEntries() == 0) {
+ compact->current_output()->smallest.DecodeFrom(newkey);
+ compact->current_output()->smallest_seqno = seqno;
+ } else {
+ compact->current_output()->smallest_seqno =
+ std::min(compact->current_output()->smallest_seqno, seqno);
+ }
+ compact->current_output()->largest.DecodeFrom(newkey);
+ compact->builder->Add(newkey, value);
+ compact->current_output()->largest_seqno =
+ std::max(compact->current_output()->largest_seqno, seqno);
+
+ // Close output file if it is big enough
+ if (compact->builder->FileSize() >=
+ compact->compaction->MaxOutputFileSize()) {
+ status = FinishCompactionOutputFile(compact, input);
+ if (!status.ok()) {
+ break;
+ }
+ }
+
+ // If we have a list of entries, move to next element
+ // If we only had one entry, then break the loop.
+ if (has_merge_list) {
+ ++key_iter;
+ ++value_iter;
+
+ // If at end of list
+ if (key_iter == keys->rend() || value_iter == values->rend()) {
+ // Sanity Check: if one ends, then both end
+ assert(key_iter == keys->rend() && value_iter == values->rend());
+ break;
+ }
+
+ // Otherwise not at end of list. Update key, value, and ikey.
+ key = Slice(*key_iter);
+ value = Slice(*value_iter);
+ ParseInternalKey(key, &ikey);
+
+ } else {
+ // Only had one item to begin with (Put/Delete)
+ break;
+ }
+ }
+ }
+
+ // MergeUntil has moved input to the next entry
+ if (!current_entry_is_merging) {
+ input->Next();
+ }
+ }
+
+ return status;
+ }
+
+ void DBImpl::CallCompactionFilterV2(CompactionState* compact,
+ CompactionFilterV2* compaction_filter_v2) {
+ if (compact == nullptr || compaction_filter_v2 == nullptr) {
+ return;
+ }
+
+ std::vector<Slice> user_key_buf;
+ for (const auto& key : compact->ikey_buf_) {
+ user_key_buf.emplace_back(key.user_key);
+ }
+
+ // If the user has specified a compaction filter and the sequence
+ // number is greater than any external snapshot, then invoke the
+ // filter.
+ // If the return value of the compaction filter is true, replace
+ // the entry with a delete marker.
+ compact->to_delete_buf_ = compaction_filter_v2->Filter(
+ compact->compaction->level(),
+ user_key_buf, compact->existing_value_buf_,
+ &compact->new_value_buf_,
+ &compact->value_changed_buf_);
+
+ // new_value_buf_.size() <= to_delete_buf_.size(). "=" iff all
+ // kv-pairs in this compaction run need to be deleted.
+ assert(compact->to_delete_buf_.size() ==
+ compact->key_buf_.size());
+ assert(compact->to_delete_buf_.size() ==
+ compact->existing_value_buf_.size());
+ assert(compact->to_delete_buf_.size() ==
+ compact->value_changed_buf_.size());
+
+ int new_value_idx = 0;
+ for (unsigned int i = 0; i < compact->to_delete_buf_.size(); ++i) {
+ if (compact->to_delete_buf_[i]) {
+ // update the string buffer directly
+ // the Slice buffer points to the updated buffer
+ UpdateInternalKey(&compact->key_str_buf_[i][0],
+ compact->key_str_buf_[i].size(),
+ compact->ikey_buf_[i].sequence,
+ kTypeDeletion);
+
+ // no value associated with delete
+ compact->existing_value_buf_[i].clear();
+ RecordTick(options_.statistics.get(), COMPACTION_KEY_DROP_USER);
+ } else if (compact->value_changed_buf_[i]) {
+ compact->existing_value_buf_[i] =
+ Slice(compact->new_value_buf_[new_value_idx++]);
+ }
+ } // for
+ }
+
+ Status DBImpl::DoCompactionWork(CompactionState* compact,
+ DeletionState& deletion_state,
+ LogBuffer* log_buffer) {
+ assert(compact);
+ compact->CleanupBatchBuffer();
+ compact->CleanupMergedBuffer();
+ bool prefix_initialized = false;
+
+ int64_t imm_micros = 0; // Micros spent doing imm_ compactions
+ ColumnFamilyData* cfd = compact->compaction->column_family_data();
+ LogToBuffer(
+ log_buffer,
+ "[%s] Compacting %d@%d + %d@%d files, score %.2f, slots available %d",
+ cfd->GetName().c_str(), compact->compaction->num_input_files(0),
+ compact->compaction->level(), compact->compaction->num_input_files(1),
+ compact->compaction->output_level(), compact->compaction->score(),
+ options_.max_background_compactions - bg_compaction_scheduled_);
+ char scratch[2345];
+ compact->compaction->Summary(scratch, sizeof(scratch));
+ LogToBuffer(log_buffer, "[%s] Compaction start summary: %s\n",
+ cfd->GetName().c_str(), scratch);
+
+ assert(cfd->current()->NumLevelFiles(compact->compaction->level()) > 0);
+ assert(compact->builder == nullptr);
+ assert(!compact->outfile);
+
+ SequenceNumber visible_at_tip = 0;
+ SequenceNumber earliest_snapshot;
+ SequenceNumber latest_snapshot = 0;
+ snapshots_.getAll(compact->existing_snapshots);
+ if (compact->existing_snapshots.size() == 0) {
+ // optimize for the fast path if there are no snapshots
+ visible_at_tip = versions_->LastSequence();
+ earliest_snapshot = visible_at_tip;
+ } else {
+ latest_snapshot = compact->existing_snapshots.back();
+    // Add the current seqno as the 'latest' virtual
+    // snapshot to the end of this list.
+    compact->existing_snapshots.push_back(versions_->LastSequence());
+    earliest_snapshot = compact->existing_snapshots[0];
+  }
+
+  // Is this compaction producing files at the bottommost level?
+  bool bottommost_level = compact->compaction->BottomMostLevel();
+
+  // Allocate the output file numbers before we release the lock
+  AllocateCompactionOutputFileNumbers(compact);
+
+  // Release mutex while we're actually doing the compaction work
+  mutex_.Unlock();
+  log_buffer->FlushBufferToLog();
+
+  const uint64_t start_micros = env_->NowMicros();
+  unique_ptr<Iterator> input(versions_->MakeInputIterator(compact->compaction));
+  input->SeekToFirst();
+  shared_ptr<Iterator> backup_input(
+      versions_->MakeInputIterator(compact->compaction));
+  backup_input->SeekToFirst();
+
+  Status status;
+  ParsedInternalKey ikey;
+  std::unique_ptr<CompactionFilterV2> compaction_filter_from_factory_v2
+      = nullptr;
+  auto context = compact->GetFilterContext();
+  compaction_filter_from_factory_v2 =
+      cfd->options()->compaction_filter_factory_v2->CreateCompactionFilterV2(
+          context);
+  auto compaction_filter_v2 =
+      compaction_filter_from_factory_v2.get();
+
+  // temp_backup_input always points to the start of the current buffer
+  // temp_backup_input = backup_input;
+  // iterate through input,
+  // 1) buffer ineligible keys and value keys into 2 separate buffers;
+  // 2) send value_buffer to the compaction filter and alter the values;
+  // 3) merge value_buffer with ineligible_value_buffer;
+  // 4) run the modified "compaction" using the old for loop.
+  if (compaction_filter_v2) {
+    while (backup_input->Valid() && !shutting_down_.Acquire_Load() &&
+           !cfd->IsDropped()) {
+      // FLUSH preempts compaction
+      // TODO(icanadi) this currently only checks if flush is necessary on
+      // the compacting column family. we should also check if flush is
+      // necessary on other column families, too
+      imm_micros += CallFlushDuringCompaction(cfd, deletion_state, log_buffer);
+
+      Slice key = backup_input->key();
+      Slice value = backup_input->value();
+
+      const SliceTransform* transformer =
+          cfd->options()->compaction_filter_factory_v2->GetPrefixExtractor();
+      const auto key_prefix = transformer->Transform(key);
+      if (!prefix_initialized) {
+        compact->cur_prefix_ = key_prefix.ToString();
+        prefix_initialized = true;
+      }
+      if (!ParseInternalKey(key, &ikey)) {
+        // log error
+        Log(options_.info_log, "[%s] Failed to parse key: %s",
+            cfd->GetName().c_str(), key.ToString().c_str());
+        continue;
+      } else {
+        // If the prefix remains the same, keep buffering
+        if (key_prefix.compare(Slice(compact->cur_prefix_)) == 0) {
+          // Apply the compaction filter V2 to all the kv pairs sharing
+          // the same prefix
+          if (ikey.type == kTypeValue &&
+              (visible_at_tip || ikey.sequence > latest_snapshot)) {
+            // Buffer all keys sharing the same prefix for CompactionFilterV2
+            // Iterate through keys to check prefix
+            compact->BufferKeyValueSlices(key, value);
+          } else {
+            // buffer ineligible keys
+            compact->BufferOtherKeyValueSlices(key, value);
+          }
+          backup_input->Next();
+          continue;
+          // finish changing values for eligible keys
+        } else {
+          // Now prefix changes, this batch is done.
+ // Call compaction filter on the buffered values to change the value + if (compact->key_buf_.size() > 0) { + CallCompactionFilterV2(compact, compaction_filter_v2); + } + compact->cur_prefix_ = key_prefix.ToString(); + } + } + + // Merge this batch of data (values + ineligible keys) + compact->MergeKeyValueSliceBuffer(&cfd->internal_comparator()); + + // Done buffering for the current prefix. Spit it out to disk + // Now just iterate through all the kv-pairs + status = ProcessKeyValueCompaction( + visible_at_tip, + earliest_snapshot, + latest_snapshot, + deletion_state, + bottommost_level, + imm_micros, + input.get(), + compact, + true, + log_buffer); + + if (!status.ok()) { + break; + } + + // After writing the kv-pairs, we can safely remove the reference + // to the string buffer and clean them up + compact->CleanupBatchBuffer(); + compact->CleanupMergedBuffer(); + // Buffer the key that triggers the mismatch in prefix + if (ikey.type == kTypeValue && + (visible_at_tip || ikey.sequence > latest_snapshot)) { + compact->BufferKeyValueSlices(key, value); + } else { + compact->BufferOtherKeyValueSlices(key, value); + } + backup_input->Next(); + if (!backup_input->Valid()) { + // If this is the single last value, we need to merge it. + if (compact->key_buf_.size() > 0) { + CallCompactionFilterV2(compact, compaction_filter_v2); + } + compact->MergeKeyValueSliceBuffer(&cfd->internal_comparator()); + + status = ProcessKeyValueCompaction( + visible_at_tip, + earliest_snapshot, + latest_snapshot, + deletion_state, + bottommost_level, + imm_micros, + input.get(), + compact, + true, + log_buffer); + + compact->CleanupBatchBuffer(); + compact->CleanupMergedBuffer(); + } + } // done processing all prefix batches + // finish the last batch + if (compact->key_buf_.size() > 0) { + CallCompactionFilterV2(compact, compaction_filter_v2); + } + compact->MergeKeyValueSliceBuffer(&cfd->internal_comparator()); + status = ProcessKeyValueCompaction( + visible_at_tip, + earliest_snapshot, + latest_snapshot, + deletion_state, + bottommost_level, + imm_micros, + input.get(), + compact, + true, + log_buffer); + } // checking for compaction filter v2 + + if (!compaction_filter_v2) { + status = ProcessKeyValueCompaction( + visible_at_tip, + earliest_snapshot, + latest_snapshot, + deletion_state, + bottommost_level, + imm_micros, + input.get(), + compact, + false, + log_buffer); + } + + if (status.ok() && (shutting_down_.Acquire_Load() || cfd->IsDropped())) { + status = Status::ShutdownInProgress( + "Database shutdown or Column family drop during compaction"); + } + if (status.ok() && compact->builder != nullptr) { + status = FinishCompactionOutputFile(compact, input.get()); + } + if (status.ok()) { + status = input->status(); + } + input.reset(); + + if (!options_.disableDataSync) { + db_directory_->Fsync(); + } + + InternalStats::CompactionStats stats; + stats.micros = env_->NowMicros() - start_micros - imm_micros; + MeasureTime(options_.statistics.get(), COMPACTION_TIME, stats.micros); + stats.files_in_leveln = compact->compaction->num_input_files(0); + stats.files_in_levelnp1 = compact->compaction->num_input_files(1); + + int num_output_files = compact->outputs.size(); + if (compact->builder != nullptr) { + // An error occurred so ignore the last output. 
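// [Editorial note] The accounting below feeds the "read-write-amplify" and
// "write-amplify" figures logged at the end of DoCompactionWork. A worked
// example with assumed numbers: if a compaction reads 100 MB from level n
// (bytes_readn) and 400 MB from level n+1 (bytes_readnp1), and writes 450 MB
// (bytes_written), then
//   read-write-amplify = (100 + 400 + 450) / 100 = 9.5
//   write-amplify      = 450 / 100               = 4.5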
+    assert(num_output_files > 0);
+    --num_output_files;
+  }
+  stats.files_out_levelnp1 = num_output_files;
+
+  for (int i = 0; i < compact->compaction->num_input_files(0); i++) {
+    stats.bytes_readn += compact->compaction->input(0, i)->file_size;
+    RecordTick(options_.statistics.get(), COMPACT_READ_BYTES,
+               compact->compaction->input(0, i)->file_size);
+  }
+
+  for (int i = 0; i < compact->compaction->num_input_files(1); i++) {
+    stats.bytes_readnp1 += compact->compaction->input(1, i)->file_size;
+    RecordTick(options_.statistics.get(), COMPACT_READ_BYTES,
+               compact->compaction->input(1, i)->file_size);
+  }
+
+  for (int i = 0; i < num_output_files; i++) {
+    stats.bytes_written += compact->outputs[i].file_size;
+    RecordTick(options_.statistics.get(), COMPACT_WRITE_BYTES,
+               compact->outputs[i].file_size);
+  }
+
+  LogFlush(options_.info_log);
+  mutex_.Lock();
+  cfd->internal_stats()->AddCompactionStats(compact->compaction->output_level(),
+                                            stats);
+
+  // if there were any unused file numbers (mostly in case of
+  // compaction error), free up the entries from pending_outputs
+  ReleaseCompactionUnusedFileNumbers(compact);
+
+  if (status.ok()) {
+    status = InstallCompactionResults(compact, log_buffer);
+    InstallSuperVersion(cfd, deletion_state);
+  }
+  Version::LevelSummaryStorage tmp;
+  LogToBuffer(
+      log_buffer,
+      "[%s] compacted to: %s, %.1f MB/sec, level %d, files in(%d, %d) out(%d) "
+      "MB in(%.1f, %.1f) out(%.1f), read-write-amplify(%.1f) "
+      "write-amplify(%.1f) %s\n",
+      cfd->GetName().c_str(), cfd->current()->LevelSummary(&tmp),
+      (stats.bytes_readn + stats.bytes_readnp1 + stats.bytes_written) /
+          (double)stats.micros,
+      compact->compaction->output_level(), stats.files_in_leveln,
+      stats.files_in_levelnp1, stats.files_out_levelnp1,
+      stats.bytes_readn / 1048576.0, stats.bytes_readnp1 / 1048576.0,
+      stats.bytes_written / 1048576.0,
+      (stats.bytes_written + stats.bytes_readnp1 + stats.bytes_readn) /
+          (double)stats.bytes_readn,
+      stats.bytes_written / (double)stats.bytes_readn,
+      status.ToString().c_str());
+
+  return status;
+}
+
+namespace {
+struct IterState {
+  IterState(DBImpl* db, port::Mutex* mu, SuperVersion* super_version)
+      : db(db), mu(mu), super_version(super_version) {}
+
+  DBImpl* db;
+  port::Mutex* mu;
+  SuperVersion* super_version;
+};
+
+static void CleanupIteratorState(void* arg1, void* arg2) {
+  IterState* state = reinterpret_cast<IterState*>(arg1);
+
+  if (state->super_version->Unref()) {
+    DBImpl::DeletionState deletion_state;
+
+    state->mu->Lock();
+    state->super_version->Cleanup();
+    state->db->FindObsoleteFiles(deletion_state, false, true);
+    state->mu->Unlock();
+
+    delete state->super_version;
+    if (deletion_state.HaveSomethingToDelete()) {
+      state->db->PurgeObsoleteFiles(deletion_state);
+    }
+  }
+
+  delete state;
+}
+}  // namespace
+
+Iterator* DBImpl::NewInternalIterator(const ReadOptions& options,
+                                      ColumnFamilyData* cfd,
+                                      SuperVersion* super_version,
+                                      Arena* arena) {
+  Iterator* internal_iter;
+  if (arena != nullptr) {
+    // Need to create internal iterator from the arena.
+    MergeIteratorBuilder merge_iter_builder(&cfd->internal_comparator(), arena);
+    // Collect iterator for mutable mem
+    merge_iter_builder.AddIterator(
+        super_version->mem->NewIterator(options, false, arena));
+    // Collect all needed child iterators for immutable memtables
+    super_version->imm->AddIterators(options, &merge_iter_builder);
+    // Collect iterators for files in L0 - Ln
+    super_version->current->AddIterators(options, storage_options_,
+                                         &merge_iter_builder);
+    internal_iter = merge_iter_builder.Finish();
+  } else {
+    // Need to create internal iterator using malloc.
+    std::vector<Iterator*> iterator_list;
+    // Collect iterator for mutable mem
+    iterator_list.push_back(super_version->mem->NewIterator(options));
+    // Collect all needed child iterators for immutable memtables
+    super_version->imm->AddIterators(options, &iterator_list);
+    // Collect iterators for files in L0 - Ln
+    super_version->current->AddIterators(options, storage_options_,
+                                         &iterator_list);
+    internal_iter = NewMergingIterator(&cfd->internal_comparator(),
+                                       &iterator_list[0], iterator_list.size());
+  }
+  IterState* cleanup = new IterState(this, &mutex_, super_version);
+  internal_iter->RegisterCleanup(CleanupIteratorState, cleanup, nullptr);
+
+  return internal_iter;
+}
+
+ColumnFamilyHandle* DBImpl::DefaultColumnFamily() const {
+  return default_cf_handle_;
+}
+
+Status DBImpl::Get(const ReadOptions& options,
+                   ColumnFamilyHandle* column_family, const Slice& key,
+                   std::string* value) {
+  return GetImpl(options, column_family, key, value);
+}
+
+// DeletionState gets created and destructed outside of the lock -- we
+// use this conveniently to:
+// * malloc one SuperVersion() outside of the lock -- new_superversion
+// * delete SuperVersion()s outside of the lock -- superversions_to_free
+//
+// However, if InstallSuperVersion() gets called twice with the same
+// deletion_state, we can't reuse the SuperVersion() that got malloced because
+// the first call already used it. In that rare case, we take a hit and create
+// a new SuperVersion() inside of the mutex. We do a similar thing
+// for superversions_to_free
+void DBImpl::InstallSuperVersion(ColumnFamilyData* cfd,
+                                 DeletionState& deletion_state) {
+  mutex_.AssertHeld();
+  // if new_superversion == nullptr, it means somebody already used it
+  SuperVersion* new_superversion =
+      (deletion_state.new_superversion != nullptr) ?
+ deletion_state.new_superversion : new SuperVersion(); + SuperVersion* old_superversion = + cfd->InstallSuperVersion(new_superversion, &mutex_); + deletion_state.new_superversion = nullptr; + deletion_state.superversions_to_free.push_back(old_superversion); +} + +Status DBImpl::GetImpl(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + std::string* value, bool* value_found) { + StopWatch sw(env_, options_.statistics.get(), DB_GET, false); + PERF_TIMER_AUTO(get_snapshot_time); + + auto cfh = reinterpret_cast(column_family); + auto cfd = cfh->cfd(); + + SequenceNumber snapshot; + if (options.snapshot != nullptr) { + snapshot = reinterpret_cast(options.snapshot)->number_; + } else { + snapshot = versions_->LastSequence(); + } + + // Acquire SuperVersion + SuperVersion* sv = nullptr; + // TODO(ljin): consider using GetReferencedSuperVersion() directly + if (LIKELY(options_.allow_thread_local)) { + sv = cfd->GetThreadLocalSuperVersion(&mutex_); + } else { + mutex_.Lock(); + sv = cfd->GetSuperVersion()->Ref(); + mutex_.Unlock(); + } + + bool have_stat_update = false; + Version::GetStats stats; + + // Prepare to store a list of merge operations if merge occurs. + MergeContext merge_context; + + Status s; + // First look in the memtable, then in the immutable memtable (if any). + // s is both in/out. When in, s could either be OK or MergeInProgress. + // merge_operands will contain the sequence of merges in the latter case. + LookupKey lkey(key, snapshot); + PERF_TIMER_STOP(get_snapshot_time); + if (sv->mem->Get(lkey, value, &s, merge_context, *cfd->options())) { + // Done + RecordTick(options_.statistics.get(), MEMTABLE_HIT); + } else if (sv->imm->Get(lkey, value, &s, merge_context, *cfd->options())) { + // Done + RecordTick(options_.statistics.get(), MEMTABLE_HIT); + } else { + PERF_TIMER_START(get_from_output_files_time); + + sv->current->Get(options, lkey, value, &s, &merge_context, &stats, + value_found); + have_stat_update = true; + PERF_TIMER_STOP(get_from_output_files_time); + RecordTick(options_.statistics.get(), MEMTABLE_MISS); + } + + PERF_TIMER_START(get_post_process_time); + + if (!cfd->options()->disable_seek_compaction && have_stat_update) { + mutex_.Lock(); + if (sv->current->UpdateStats(stats)) { + MaybeScheduleFlushOrCompaction(); + } + mutex_.Unlock(); + } + + bool unref_sv = true; + if (LIKELY(options_.allow_thread_local)) { + unref_sv = !cfd->ReturnThreadLocalSuperVersion(sv); + } + + if (unref_sv) { + // Release SuperVersion + if (sv->Unref()) { + mutex_.Lock(); + sv->Cleanup(); + mutex_.Unlock(); + delete sv; + RecordTick(options_.statistics.get(), NUMBER_SUPERVERSION_CLEANUPS); + } + RecordTick(options_.statistics.get(), NUMBER_SUPERVERSION_RELEASES); + } + + RecordTick(options_.statistics.get(), NUMBER_KEYS_READ); + RecordTick(options_.statistics.get(), BYTES_READ, value->size()); + PERF_TIMER_STOP(get_post_process_time); + return s; +} + +std::vector DBImpl::MultiGet( + const ReadOptions& options, + const std::vector& column_family, + const std::vector& keys, std::vector* values) { + + StopWatch sw(env_, options_.statistics.get(), DB_MULTIGET, false); + PERF_TIMER_AUTO(get_snapshot_time); + + SequenceNumber snapshot; + + struct MultiGetColumnFamilyData { + ColumnFamilyData* cfd; + SuperVersion* super_version; + Version::GetStats stats; + bool have_stat_update = false; + }; + std::unordered_map multiget_cf_data; + // fill up and allocate outside of mutex + for (auto cf : column_family) { + auto cfh = reinterpret_cast(cf); + auto cfd = 
cfh->cfd(); + if (multiget_cf_data.find(cfd->GetID()) == multiget_cf_data.end()) { + auto mgcfd = new MultiGetColumnFamilyData(); + mgcfd->cfd = cfd; + multiget_cf_data.insert({cfd->GetID(), mgcfd}); + } + } + + mutex_.Lock(); + if (options.snapshot != nullptr) { + snapshot = reinterpret_cast(options.snapshot)->number_; + } else { + snapshot = versions_->LastSequence(); + } + for (auto mgd_iter : multiget_cf_data) { + mgd_iter.second->super_version = + mgd_iter.second->cfd->GetSuperVersion()->Ref(); + } + mutex_.Unlock(); + + // Contain a list of merge operations if merge occurs. + MergeContext merge_context; + + // Note: this always resizes the values array + size_t num_keys = keys.size(); + std::vector stat_list(num_keys); + values->resize(num_keys); + + // Keep track of bytes that we read for statistics-recording later + uint64_t bytes_read = 0; + PERF_TIMER_STOP(get_snapshot_time); + + // For each of the given keys, apply the entire "get" process as follows: + // First look in the memtable, then in the immutable memtable (if any). + // s is both in/out. When in, s could either be OK or MergeInProgress. + // merge_operands will contain the sequence of merges in the latter case. + for (size_t i = 0; i < num_keys; ++i) { + merge_context.Clear(); + Status& s = stat_list[i]; + std::string* value = &(*values)[i]; + + LookupKey lkey(keys[i], snapshot); + auto cfh = reinterpret_cast(column_family[i]); + auto mgd_iter = multiget_cf_data.find(cfh->cfd()->GetID()); + assert(mgd_iter != multiget_cf_data.end()); + auto mgd = mgd_iter->second; + auto super_version = mgd->super_version; + auto cfd = mgd->cfd; + if (super_version->mem->Get(lkey, value, &s, merge_context, + *cfd->options())) { + // Done + } else if (super_version->imm->Get(lkey, value, &s, merge_context, + *cfd->options())) { + // Done + } else { + super_version->current->Get(options, lkey, value, &s, &merge_context, + &mgd->stats); + mgd->have_stat_update = true; + } + + if (s.ok()) { + bytes_read += value->size(); + } + } + + // Post processing (decrement reference counts and record statistics) + PERF_TIMER_START(get_post_process_time); + autovector superversions_to_delete; + + bool schedule_flush_or_compaction = false; + mutex_.Lock(); + for (auto mgd_iter : multiget_cf_data) { + auto mgd = mgd_iter.second; + auto cfd = mgd->cfd; + if (!cfd->options()->disable_seek_compaction && mgd->have_stat_update) { + if (mgd->super_version->current->UpdateStats(mgd->stats)) { + schedule_flush_or_compaction = true; + } + } + if (mgd->super_version->Unref()) { + mgd->super_version->Cleanup(); + superversions_to_delete.push_back(mgd->super_version); + } + } + if (schedule_flush_or_compaction) { + MaybeScheduleFlushOrCompaction(); + } + mutex_.Unlock(); + + for (auto td : superversions_to_delete) { + delete td; + } + for (auto mgd : multiget_cf_data) { + delete mgd.second; + } + + RecordTick(options_.statistics.get(), NUMBER_MULTIGET_CALLS); + RecordTick(options_.statistics.get(), NUMBER_MULTIGET_KEYS_READ, num_keys); + RecordTick(options_.statistics.get(), NUMBER_MULTIGET_BYTES_READ, bytes_read); + PERF_TIMER_STOP(get_post_process_time); + + return stat_list; +} + +Status DBImpl::CreateColumnFamily(const ColumnFamilyOptions& options, + const std::string& column_family_name, + ColumnFamilyHandle** handle) { + *handle = nullptr; + MutexLock l(&mutex_); + + if (versions_->GetColumnFamilySet()->GetColumnFamily(column_family_name) != + nullptr) { + return Status::InvalidArgument("Column family already exists"); + } + VersionEdit edit; + 
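// [Editorial note] A minimal client-side sketch of the column family API
// implemented here and in DropColumnFamily below (names and error handling
// are illustrative, not from this patch):
//
//   rocksdb::ColumnFamilyHandle* cf = nullptr;
//   rocksdb::Status s =
//       db->CreateColumnFamily(rocksdb::ColumnFamilyOptions(), "new_cf", &cf);
//   if (s.ok()) {
//     s = db->Put(rocksdb::WriteOptions(), cf, rocksdb::Slice("k"),
//                 rocksdb::Slice("v"));
//     s = db->DropColumnFamily(cf);
//     delete cf;  // the handle must still be deleted after the drop
//   }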
+  edit.AddColumnFamily(column_family_name);
+  uint32_t new_id = versions_->GetColumnFamilySet()->GetNextColumnFamilyID();
+  edit.SetColumnFamily(new_id);
+  edit.SetLogNumber(logfile_number_);
+  edit.SetComparatorName(options.comparator->Name());
+
+  // LogAndApply will both write the creation in MANIFEST and create
+  // the ColumnFamilyData object
+  Status s = versions_->LogAndApply(nullptr, &edit, &mutex_,
+                                    db_directory_.get(), false, &options);
+  if (s.ok()) {
+    auto cfd =
+        versions_->GetColumnFamilySet()->GetColumnFamily(column_family_name);
+    assert(cfd != nullptr);
+    delete cfd->InstallSuperVersion(new SuperVersion(), &mutex_);
+    *handle = new ColumnFamilyHandleImpl(cfd, this, &mutex_);
+    Log(options_.info_log, "Created column family [%s] (ID %u)",
+        column_family_name.c_str(), (unsigned)cfd->GetID());
+    max_total_in_memory_state_ += cfd->options()->write_buffer_size *
+                                  cfd->options()->max_write_buffer_number;
+  } else {
+    Log(options_.info_log, "Creating column family [%s] FAILED -- %s",
+        column_family_name.c_str(), s.ToString().c_str());
+  }
+  return s;
+}
+
+Status DBImpl::DropColumnFamily(ColumnFamilyHandle* column_family) {
+  auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+  auto cfd = cfh->cfd();
+  if (cfd->GetID() == 0) {
+    return Status::InvalidArgument("Can't drop default column family");
+  }
+
+  VersionEdit edit;
+  edit.DropColumnFamily();
+  edit.SetColumnFamily(cfd->GetID());
+
+  Status s;
+  {
+    MutexLock l(&mutex_);
+    if (cfd->IsDropped()) {
+      s = Status::InvalidArgument("Column family already dropped!\n");
+    }
+    if (s.ok()) {
+      s = versions_->LogAndApply(cfd, &edit, &mutex_);
+    }
+  }
+
+  if (s.ok()) {
+    assert(cfd->IsDropped());
+    max_total_in_memory_state_ -= cfd->options()->write_buffer_size *
+                                  cfd->options()->max_write_buffer_number;
+    Log(options_.info_log, "Dropped column family with id %u\n", cfd->GetID());
+    // Flush the memtables. This will make all WAL files referencing the
+    // dropped column family obsolete. They will be deleted once the user
+    // deletes the column family handle
+    Write(WriteOptions(), nullptr);  // ignore error
+  } else {
+    Log(options_.info_log, "Dropping column family with id %u FAILED -- %s\n",
+        cfd->GetID(), s.ToString().c_str());
+  }
+
+  return s;
+}
+
+bool DBImpl::KeyMayExist(const ReadOptions& options,
+                         ColumnFamilyHandle* column_family, const Slice& key,
+                         std::string* value, bool* value_found) {
+  if (value_found != nullptr) {
+    // falsify later if key-may-exist but can't fetch value
+    *value_found = true;
+  }
+  ReadOptions roptions = options;
+  roptions.read_tier = kBlockCacheTier;  // read from block cache only
+  auto s = GetImpl(roptions, column_family, key, value, value_found);
+
+  // If options.block_cache != nullptr and the index block of the table is
+  // not present in block_cache, the return value will be Status::Incomplete.
+  // In this case, the key may still exist in the table.
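// [Editorial note] A hedged usage sketch of KeyMayExist (variables assumed):
// since the probe only consults memtables and the block cache, "true" is a
// hint that may be a false positive, while "false" is definitive.
//
//   std::string value;
//   bool value_found = false;
//   bool may_exist = db->KeyMayExist(rocksdb::ReadOptions(), cf,
//                                    rocksdb::Slice("k"), &value, &value_found);
//   if (may_exist && value_found) {
//     // value was served cheaply from memory; otherwise a full Get() is
//     // needed to confirm existence
//   }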
+ return s.ok() || s.IsIncomplete(); +} + +Iterator* DBImpl::NewIterator(const ReadOptions& options, + ColumnFamilyHandle* column_family) { + auto cfh = reinterpret_cast(column_family); + auto cfd = cfh->cfd(); + + if (options.tailing) { +#ifdef ROCKSDB_LITE + // not supported in lite version + return nullptr; +#else + // TODO(ljin): remove tailing iterator + auto iter = new ForwardIterator(this, options, cfd); + return NewDBIterator(env_, *cfd->options(), cfd->user_comparator(), iter, + kMaxSequenceNumber); +// return new TailingIterator(env_, this, options, cfd); +#endif + } else { + SequenceNumber latest_snapshot = versions_->LastSequence(); + SuperVersion* sv = nullptr; + sv = cfd->GetReferencedSuperVersion(&mutex_); + + auto snapshot = + options.snapshot != nullptr + ? reinterpret_cast(options.snapshot)->number_ + : latest_snapshot; + + // Try to generate a DB iterator tree in continuous memory area to be + // cache friendly. Here is an example of result: + // +-------------------------------+ + // | | + // | ArenaWrappedDBIter | + // | + | + // | +---> Inner Iterator ------------+ + // | | | | + // | | +-- -- -- -- -- -- -- --+ | + // | +--- | Arena | | + // | | | | + // | Allocated Memory: | | + // | | +-------------------+ | + // | | | DBIter | <---+ + // | | + | + // | | | +-> iter_ ------------+ + // | | | | | + // | | +-------------------+ | + // | | | MergingIterator | <---+ + // | | + | + // | | | +->child iter1 ------------+ + // | | | | | | + // | | +->child iter2 ----------+ | + // | | | | | | | + // | | | +->child iter3 --------+ | | + // | | | | | | + // | | +-------------------+ | | | + // | | | Iterator1 | <--------+ + // | | +-------------------+ | | + // | | | Iterator2 | <------+ + // | | +-------------------+ | + // | | | Iterator3 | <----+ + // | | +-------------------+ + // | | | + // +-------+-----------------------+ + // + // ArenaWrappedDBIter inlines an arena area where all the iterartor in the + // the iterator tree is allocated in the order of being accessed when + // querying. + // Laying out the iterators in the order of being accessed makes it more + // likely that any iterator pointer is close to the iterator it points to so + // that they are likely to be in the same cache line and/or page. 
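// [Editorial note] From the caller's side the arena layout sketched above is
// transparent; a minimal scan over the default column family (illustrative):
//
//   std::unique_ptr<rocksdb::Iterator> it(
//       db->NewIterator(rocksdb::ReadOptions()));
//   for (it->SeekToFirst(); it->Valid(); it->Next()) {
//     // it->key() / it->value() stay valid until the next call on "it"
//   }
//   assert(it->status().ok());  // surface any scan error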
+ ArenaWrappedDBIter* db_iter = NewArenaWrappedDbIterator( + env_, *cfd->options(), cfd->user_comparator(), snapshot); + Iterator* internal_iter = + NewInternalIterator(options, cfd, sv, db_iter->GetArena()); + db_iter->SetIterUnderDBIter(internal_iter); + + return db_iter; + } +} + +Status DBImpl::NewIterators( + const ReadOptions& options, + const std::vector& column_families, + std::vector* iterators) { + iterators->clear(); + iterators->reserve(column_families.size()); + SequenceNumber latest_snapshot = 0; + std::vector super_versions; + super_versions.reserve(column_families.size()); + + if (!options.tailing) { + mutex_.Lock(); + latest_snapshot = versions_->LastSequence(); + for (auto cfh : column_families) { + auto cfd = reinterpret_cast(cfh)->cfd(); + super_versions.push_back(cfd->GetSuperVersion()->Ref()); + } + mutex_.Unlock(); + } + + if (options.tailing) { +#ifdef ROCKSDB_LITE + return Status::InvalidArgument( + "Tailing interator not supported in RocksDB lite"); +#else + for (auto cfh : column_families) { + auto cfd = reinterpret_cast(cfh)->cfd(); + iterators->push_back(new TailingIterator(env_, this, options, cfd)); + } +#endif + } else { + for (size_t i = 0; i < column_families.size(); ++i) { + auto cfh = reinterpret_cast(column_families[i]); + auto cfd = cfh->cfd(); + + auto snapshot = + options.snapshot != nullptr + ? reinterpret_cast(options.snapshot)->number_ + : latest_snapshot; + + auto iter = NewInternalIterator(options, cfd, super_versions[i]); + iter = NewDBIterator(env_, *cfd->options(), + cfd->user_comparator(), iter, snapshot); + iterators->push_back(iter); + } + } + + return Status::OK(); +} + +bool DBImpl::IsSnapshotSupported() const { + for (auto cfd : *versions_->GetColumnFamilySet()) { + if (!cfd->mem()->IsSnapshotSupported()) { + return false; + } + } + return true; +} + +const Snapshot* DBImpl::GetSnapshot() { + // returns null if the underlying memtable does not support snapshot. 
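// [Editorial note] Typical snapshot usage against the methods below, sketched
// with illustrative names:
//
//   const rocksdb::Snapshot* snap = db->GetSnapshot();
//   if (snap != nullptr) {  // nullptr if the memtable lacks snapshot support
//     rocksdb::ReadOptions ro;
//     ro.snapshot = snap;   // reads ignore sequence numbers newer than snap
//     std::string v;
//     db->Get(ro, rocksdb::Slice("k"), &v);
//     db->ReleaseSnapshot(snap);  // must be released, or it pins old data
//   }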
+ if (!IsSnapshotSupported()) return nullptr; + MutexLock l(&mutex_); + return snapshots_.New(versions_->LastSequence()); +} + +void DBImpl::ReleaseSnapshot(const Snapshot* s) { + MutexLock l(&mutex_); + snapshots_.Delete(reinterpret_cast(s)); +} + +// Convenience methods +Status DBImpl::Put(const WriteOptions& o, ColumnFamilyHandle* column_family, + const Slice& key, const Slice& val) { + return DB::Put(o, column_family, key, val); +} + +Status DBImpl::Merge(const WriteOptions& o, ColumnFamilyHandle* column_family, + const Slice& key, const Slice& val) { + auto cfh = reinterpret_cast(column_family); + if (!cfh->cfd()->options()->merge_operator) { + return Status::NotSupported("Provide a merge_operator when opening DB"); + } else { + return DB::Merge(o, column_family, key, val); + } +} + +Status DBImpl::Delete(const WriteOptions& options, + ColumnFamilyHandle* column_family, const Slice& key) { + return DB::Delete(options, column_family, key); +} + +Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { + PERF_TIMER_AUTO(write_pre_and_post_process_time); + Writer w(&mutex_); + w.batch = my_batch; + w.sync = options.sync; + w.disableWAL = options.disableWAL; + w.done = false; + + StopWatch sw(env_, options_.statistics.get(), DB_WRITE, false); + mutex_.Lock(); + writers_.push_back(&w); + while (!w.done && &w != writers_.front()) { + w.cv.Wait(); + } + + if (!options.disableWAL) { + RecordTick(options_.statistics.get(), WRITE_WITH_WAL, 1); + } + + if (w.done) { + mutex_.Unlock(); + RecordTick(options_.statistics.get(), WRITE_DONE_BY_OTHER, 1); + return w.status; + } else { + RecordTick(options_.statistics.get(), WRITE_DONE_BY_SELF, 1); + } + + uint64_t flush_column_family_if_log_file = 0; + uint64_t max_total_wal_size = (options_.max_total_wal_size == 0) + ? 4 * max_total_in_memory_state_ + : options_.max_total_wal_size; + if (versions_->GetColumnFamilySet()->NumberOfColumnFamilies() > 1 && + alive_log_files_.begin()->getting_flushed == false && + total_log_size_ > max_total_wal_size) { + flush_column_family_if_log_file = alive_log_files_.begin()->number; + alive_log_files_.begin()->getting_flushed = true; + Log(options_.info_log, + "Flushing all column families with data in WAL number %" PRIu64 + ". Total log size is %" PRIu64 " while max_total_wal_size is %" PRIu64, + flush_column_family_if_log_file, total_log_size_, max_total_wal_size); + } + + Status status; + // refcounting cfd in iteration + bool dead_cfd = false; + autovector superversions_to_free; + autovector logs_to_free; + for (auto cfd : *versions_->GetColumnFamilySet()) { + cfd->Ref(); + bool force_flush = my_batch == nullptr || + (flush_column_family_if_log_file != 0 && + cfd->GetLogNumber() <= flush_column_family_if_log_file); + // May temporarily unlock and wait. + status = MakeRoomForWrite(cfd, force_flush, &superversions_to_free, + &logs_to_free); + if (cfd->Unref()) { + dead_cfd = true; + } + if (!status.ok()) { + break; + } + } + if (dead_cfd) { + versions_->GetColumnFamilySet()->FreeDeadColumnFamilies(); + } + + uint64_t last_sequence = versions_->LastSequence(); + Writer* last_writer = &w; + if (status.ok() && my_batch != nullptr) { // nullptr batch is for compactions + autovector write_batch_group; + BuildBatchGroup(&last_writer, &write_batch_group); + + // Add to log and apply to memtable. 
We can release the lock + // during this phase since &w is currently responsible for logging + // and protects against concurrent loggers and concurrent writes + // into memtables + { + mutex_.Unlock(); + WriteBatch* updates = nullptr; + if (write_batch_group.size() == 1) { + updates = write_batch_group[0]; + } else { + updates = &tmp_batch_; + for (size_t i = 0; i < write_batch_group.size(); ++i) { + WriteBatchInternal::Append(updates, write_batch_group[i]); + } + } + + const SequenceNumber current_sequence = last_sequence + 1; + WriteBatchInternal::SetSequence(updates, current_sequence); + int my_batch_count = WriteBatchInternal::Count(updates); + last_sequence += my_batch_count; + // Record statistics + RecordTick(options_.statistics.get(), + NUMBER_KEYS_WRITTEN, my_batch_count); + RecordTick(options_.statistics.get(), + BYTES_WRITTEN, + WriteBatchInternal::ByteSize(updates)); + if (options.disableWAL) { + flush_on_destroy_ = true; + } + PERF_TIMER_STOP(write_pre_and_post_process_time); + + if (!options.disableWAL) { + PERF_TIMER_START(write_wal_time); + Slice log_entry = WriteBatchInternal::Contents(updates); + status = log_->AddRecord(log_entry); + total_log_size_ += log_entry.size(); + alive_log_files_.back().AddSize(log_entry.size()); + log_empty_ = false; + RecordTick(options_.statistics.get(), WAL_FILE_SYNCED, 1); + RecordTick(options_.statistics.get(), WAL_FILE_BYTES, log_entry.size()); + if (status.ok() && options.sync) { + if (options_.use_fsync) { + StopWatch(env_, options_.statistics.get(), WAL_FILE_SYNC_MICROS); + status = log_->file()->Fsync(); + } else { + StopWatch(env_, options_.statistics.get(), WAL_FILE_SYNC_MICROS); + status = log_->file()->Sync(); + } + } + PERF_TIMER_STOP(write_wal_time); + } + if (status.ok()) { + PERF_TIMER_START(write_memtable_time); + status = WriteBatchInternal::InsertInto( + updates, column_family_memtables_.get(), false, 0, this, false); + PERF_TIMER_STOP(write_memtable_time); + + if (!status.ok()) { + // Iteration failed (either in-memory writebatch corruption (very + // bad), or the client specified invalid column family). Return + // failure. + // Note that existing logic was not sound. Any partial failure writing + // into the memtable would result in a state that some write ops might + // have succeeded in memtable but Status reports error for all writes. 
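// [Editorial note] A client-side sketch of the write path above (flags
// illustrative); batched and single writes share the same group-commit queue:
//
//   rocksdb::WriteBatch batch;
//   batch.Put(rocksdb::Slice("k1"), rocksdb::Slice("v1"));
//   batch.Delete(rocksdb::Slice("k2"));
//   rocksdb::WriteOptions wo;
//   wo.sync = true;  // sync the WAL (Fsync or Sync per use_fsync) on commit
//   rocksdb::Status s = db->Write(wo, &batch);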
+ return status; + } + SetTickerCount(options_.statistics.get(), SEQUENCE_NUMBER, + last_sequence); + } + PERF_TIMER_START(write_pre_and_post_process_time); + if (updates == &tmp_batch_) tmp_batch_.Clear(); + mutex_.Lock(); + if (status.ok()) { + versions_->SetLastSequence(last_sequence); + } + } + } + if (options_.paranoid_checks && !status.ok() && bg_error_.ok()) { + bg_error_ = status; // stop compaction & fail any further writes + } + + while (true) { + Writer* ready = writers_.front(); + writers_.pop_front(); + if (ready != &w) { + ready->status = status; + ready->done = true; + ready->cv.Signal(); + } + if (ready == last_writer) break; + } + + // Notify new head of write queue + if (!writers_.empty()) { + writers_.front()->cv.Signal(); + } + mutex_.Unlock(); + + for (auto& sv : superversions_to_free) { + delete sv; + } + for (auto& log : logs_to_free) { + delete log; + } + + PERF_TIMER_STOP(write_pre_and_post_process_time); + return status; +} + +// REQUIRES: Writer list must be non-empty +// REQUIRES: First writer must have a non-nullptr batch +void DBImpl::BuildBatchGroup(Writer** last_writer, + autovector* write_batch_group) { + assert(!writers_.empty()); + Writer* first = writers_.front(); + assert(first->batch != nullptr); + + size_t size = WriteBatchInternal::ByteSize(first->batch); + write_batch_group->push_back(first->batch); + + // Allow the group to grow up to a maximum size, but if the + // original write is small, limit the growth so we do not slow + // down the small write too much. + size_t max_size = 1 << 20; + if (size <= (128<<10)) { + max_size = size + (128<<10); + } + + *last_writer = first; + std::deque::iterator iter = writers_.begin(); + ++iter; // Advance past "first" + for (; iter != writers_.end(); ++iter) { + Writer* w = *iter; + if (w->sync && !first->sync) { + // Do not include a sync write into a batch handled by a non-sync write. + break; + } + + if (!w->disableWAL && first->disableWAL) { + // Do not include a write that needs WAL into a batch that has + // WAL disabled. + break; + } + + if (w->batch != nullptr) { + size += WriteBatchInternal::ByteSize(w->batch); + if (size > max_size) { + // Do not make batch too big + break; + } + + write_batch_group->push_back(w->batch); + } + *last_writer = w; + } +} + +// This function computes the amount of time in microseconds by which a write +// should be delayed based on the number of level-0 files according to the +// following formula: +// if n < bottom, return 0; +// if n >= top, return 1000; +// otherwise, let r = (n - bottom) / +// (top - bottom) +// and return r^2 * 1000. +// The goal of this formula is to gradually increase the rate at which writes +// are slowed. We also tried linear delay (r * 1000), but it seemed to do +// slightly worse. There is no other particular reason for choosing quadratic. +uint64_t DBImpl::SlowdownAmount(int n, double bottom, double top) { + uint64_t delay; + if (n >= top) { + delay = 1000; + } + else if (n < bottom) { + delay = 0; + } + else { + // If we are here, we know that: + // level0_start_slowdown <= n < level0_slowdown + // since the previous two conditions are false. 
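// [Editorial note] A worked example of the quadratic slowdown with assumed
// option values: bottom = 8 (level0_slowdown_writes_trigger), top = 12
// (level0_stop_writes_trigger), and n = 10 level-0 files gives
//   r = (10 - 8) / (12 - 8) = 0.5  =>  delay = max(0.5^2 * 1000, 100) = 250 us
// The floor of 100 microseconds below keeps the delay meaningful just past
// the slowdown trigger.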
+ double how_much = + (double) (n - bottom) / + (top - bottom); + delay = std::max(how_much * how_much * 1000, 100.0); + } + assert(delay <= 1000); + return delay; +} + +// REQUIRES: mutex_ is held +// REQUIRES: this thread is currently at the front of the writer queue +Status DBImpl::MakeRoomForWrite( + ColumnFamilyData* cfd, bool force, + autovector* superversions_to_free, + autovector* logs_to_free) { + mutex_.AssertHeld(); + assert(!writers_.empty()); + bool allow_delay = !force; + bool allow_hard_rate_limit_delay = !force; + bool allow_soft_rate_limit_delay = !force; + uint64_t rate_limit_delay_millis = 0; + Status s; + double score; + // Once we schedule background work, we shouldn't schedule it again, since it + // might generate a tight feedback loop, constantly scheduling more background + // work, even if additional background work is not needed + bool schedule_background_work = true; + + while (true) { + if (!bg_error_.ok()) { + // Yield previous error + s = bg_error_; + break; + } else if (allow_delay && cfd->NeedSlowdownForNumLevel0Files()) { + // We are getting close to hitting a hard limit on the number of + // L0 files. Rather than delaying a single write by several + // seconds when we hit the hard limit, start delaying each + // individual write by 0-1ms to reduce latency variance. Also, + // this delay hands over some CPU to the compaction thread in + // case it is sharing the same core as the writer. + uint64_t slowdown = + SlowdownAmount(cfd->current()->NumLevelFiles(0), + cfd->options()->level0_slowdown_writes_trigger, + cfd->options()->level0_stop_writes_trigger); + mutex_.Unlock(); + uint64_t delayed; + { + StopWatch sw(env_, options_.statistics.get(), STALL_L0_SLOWDOWN_COUNT); + env_->SleepForMicroseconds(slowdown); + delayed = sw.ElapsedMicros(); + } + RecordTick(options_.statistics.get(), STALL_L0_SLOWDOWN_MICROS, delayed); + cfd->internal_stats()->RecordWriteStall(InternalStats::LEVEL0_SLOWDOWN, + delayed); + allow_delay = false; // Do not delay a single write more than once + mutex_.Lock(); + delayed_writes_++; + } else if (!force && !cfd->mem()->ShouldFlush()) { + // There is room in current memtable + if (allow_delay) { + DelayLoggingAndReset(); + } + break; + } else if (cfd->imm()->size() == + cfd->options()->max_write_buffer_number - 1) { + // We have filled up the current memtable, but the previous + // ones are still being flushed, so we wait. + DelayLoggingAndReset(); + Log(options_.info_log, "[%s] wait for memtable flush...\n", + cfd->GetName().c_str()); + if (schedule_background_work) { + MaybeScheduleFlushOrCompaction(); + schedule_background_work = false; + } + uint64_t stall; + { + StopWatch sw(env_, options_.statistics.get(), + STALL_MEMTABLE_COMPACTION_COUNT); + bg_cv_.Wait(); + stall = sw.ElapsedMicros(); + } + RecordTick(options_.statistics.get(), + STALL_MEMTABLE_COMPACTION_MICROS, stall); + cfd->internal_stats()->RecordWriteStall( + InternalStats::MEMTABLE_COMPACTION, stall); + } else if (cfd->current()->NumLevelFiles(0) >= + cfd->options()->level0_stop_writes_trigger) { + // There are too many level-0 files. 
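// [Editorial note] The stall branches in this loop are governed by
// per-column-family options; an illustrative tuning sketch (values assumed):
//
//   rocksdb::Options opts;
//   opts.level0_slowdown_writes_trigger = 8;  // begin 0-1ms per-write delays
//   opts.level0_stop_writes_trigger = 12;     // stall writes completely
//   opts.max_write_buffer_number = 4;         // memtables before write waits
//   opts.soft_rate_limit = 2.0;               // one-shot score-based delay
//   opts.hard_rate_limit = 4.0;               // repeated 1ms delays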
+ DelayLoggingAndReset(); + Log(options_.info_log, "[%s] wait for fewer level0 files...\n", + cfd->GetName().c_str()); + uint64_t stall; + { + StopWatch sw(env_, options_.statistics.get(), + STALL_L0_NUM_FILES_COUNT); + bg_cv_.Wait(); + stall = sw.ElapsedMicros(); + } + RecordTick(options_.statistics.get(), STALL_L0_NUM_FILES_MICROS, stall); + cfd->internal_stats()->RecordWriteStall(InternalStats::LEVEL0_NUM_FILES, + stall); + } else if (allow_hard_rate_limit_delay && + cfd->options()->hard_rate_limit > 1.0 && + (score = cfd->current()->MaxCompactionScore()) > + cfd->options()->hard_rate_limit) { + // Delay a write when the compaction score for any level is too large. + int max_level = cfd->current()->MaxCompactionScoreLevel(); + mutex_.Unlock(); + uint64_t delayed; + { + StopWatch sw(env_, options_.statistics.get(), + HARD_RATE_LIMIT_DELAY_COUNT); + env_->SleepForMicroseconds(1000); + delayed = sw.ElapsedMicros(); + } + cfd->internal_stats()->RecordLevelNSlowdown(max_level, delayed); + // Make sure the following value doesn't round to zero. + uint64_t rate_limit = std::max((delayed / 1000), (uint64_t) 1); + rate_limit_delay_millis += rate_limit; + RecordTick(options_.statistics.get(), + RATE_LIMIT_DELAY_MILLIS, rate_limit); + if (cfd->options()->rate_limit_delay_max_milliseconds > 0 && + rate_limit_delay_millis >= + (unsigned)cfd->options()->rate_limit_delay_max_milliseconds) { + allow_hard_rate_limit_delay = false; + } + mutex_.Lock(); + } else if (allow_soft_rate_limit_delay && + cfd->options()->soft_rate_limit > 0.0 && + (score = cfd->current()->MaxCompactionScore()) > + cfd->options()->soft_rate_limit) { + // Delay a write when the compaction score for any level is too large. + // TODO: add statistics + mutex_.Unlock(); + { + StopWatch sw(env_, options_.statistics.get(), + SOFT_RATE_LIMIT_DELAY_COUNT); + env_->SleepForMicroseconds( + SlowdownAmount(score, cfd->options()->soft_rate_limit, + cfd->options()->hard_rate_limit)); + rate_limit_delay_millis += sw.ElapsedMicros(); + } + allow_soft_rate_limit_delay = false; + mutex_.Lock(); + + } else { + unique_ptr lfile; + log::Writer* new_log = nullptr; + MemTable* new_mem = nullptr; + + // Attempt to switch to a new memtable and trigger flush of old. + // Do this without holding the dbmutex lock. + assert(versions_->PrevLogNumber() == 0); + bool creating_new_log = !log_empty_; + uint64_t new_log_number = + creating_new_log ? versions_->NewFileNumber() : logfile_number_; + SuperVersion* new_superversion = nullptr; + mutex_.Unlock(); + { + DelayLoggingAndReset(); + if (creating_new_log) { + s = env_->NewWritableFile( + LogFileName(options_.wal_dir, new_log_number), &lfile, + env_->OptimizeForLogWrite(storage_options_)); + if (s.ok()) { + // Our final size should be less than write_buffer_size + // (compression, etc) but err on the side of caution. + lfile->SetPreallocationBlockSize(1.1 * + cfd->options()->write_buffer_size); + new_log = new log::Writer(std::move(lfile)); + } + } + + if (s.ok()) { + new_mem = new MemTable(cfd->internal_comparator(), *cfd->options()); + new_superversion = new SuperVersion(); + } + } + mutex_.Lock(); + if (!s.ok()) { + // how do we fail if we're not creating new log? + assert(creating_new_log); + // Avoid chewing through file number space in a tight loop. 
+ versions_->ReuseFileNumber(new_log_number); + assert(!new_mem); + assert(!new_log); + break; + } + if (creating_new_log) { + logfile_number_ = new_log_number; + assert(new_log != nullptr); + logs_to_free->push_back(log_.release()); + log_.reset(new_log); + log_empty_ = true; + alive_log_files_.push_back(LogFileNumberSize(logfile_number_)); + for (auto cfd : *versions_->GetColumnFamilySet()) { + // all this is just optimization to delete logs that + // are no longer needed -- if CF is empty, that means it + // doesn't need that particular log to stay alive, so we just + // advance the log number. no need to persist this in the manifest + if (cfd->mem()->GetFirstSequenceNumber() == 0 && + cfd->imm()->size() == 0) { + cfd->SetLogNumber(logfile_number_); + } + } + } + cfd->mem()->SetNextLogNumber(logfile_number_); + cfd->imm()->Add(cfd->mem()); + if (force) { + cfd->imm()->FlushRequested(); + } + new_mem->Ref(); + cfd->SetMemtable(new_mem); + Log(options_.info_log, "[%s] New memtable created with log file: #%lu\n", + cfd->GetName().c_str(), (unsigned long)logfile_number_); + force = false; // Do not force another compaction if have room + MaybeScheduleFlushOrCompaction(); + superversions_to_free->push_back( + cfd->InstallSuperVersion(new_superversion, &mutex_)); + } + } + return s; +} + +#ifndef ROCKSDB_LITE +Status DBImpl::GetPropertiesOfAllTables(ColumnFamilyHandle* column_family, + TablePropertiesCollection* props) { + auto cfh = reinterpret_cast(column_family); + auto cfd = cfh->cfd(); + + // Increment the ref count + mutex_.Lock(); + auto version = cfd->current(); + version->Ref(); + mutex_.Unlock(); + + auto s = version->GetPropertiesOfAllTables(props); + + // Decrement the ref count + mutex_.Lock(); + version->Unref(); + mutex_.Unlock(); + + return s; +} +#endif // ROCKSDB_LITE + +const std::string& DBImpl::GetName() const { + return dbname_; +} + +Env* DBImpl::GetEnv() const { + return env_; +} + +const Options& DBImpl::GetOptions(ColumnFamilyHandle* column_family) const { + auto cfh = reinterpret_cast(column_family); + return *cfh->cfd()->options(); +} + +bool DBImpl::GetProperty(ColumnFamilyHandle* column_family, + const Slice& property, std::string* value) { + value->clear(); + auto cfh = reinterpret_cast(column_family); + auto cfd = cfh->cfd(); + DBPropertyType property_type = GetPropertyType(property); + MutexLock l(&mutex_); + return cfd->internal_stats()->GetProperty(property_type, property, value, + cfd); +} + +void DBImpl::GetApproximateSizes(ColumnFamilyHandle* column_family, + const Range* range, int n, uint64_t* sizes) { + // TODO(opt): better implementation + Version* v; + auto cfh = reinterpret_cast(column_family); + auto cfd = cfh->cfd(); + { + MutexLock l(&mutex_); + v = cfd->current(); + v->Ref(); + } + + for (int i = 0; i < n; i++) { + // Convert user_key into a corresponding internal key. + InternalKey k1(range[i].start, kMaxSequenceNumber, kValueTypeForSeek); + InternalKey k2(range[i].limit, kMaxSequenceNumber, kValueTypeForSeek); + uint64_t start = versions_->ApproximateOffsetOf(v, k1); + uint64_t limit = versions_->ApproximateOffsetOf(v, k2); + sizes[i] = (limit >= start ? 
limit - start : 0); + } + + { + MutexLock l(&mutex_); + v->Unref(); + } +} + +inline void DBImpl::DelayLoggingAndReset() { + if (delayed_writes_ > 0) { + Log(options_.info_log, "delayed %d write...\n", delayed_writes_ ); + delayed_writes_ = 0; + } +} + +#ifndef ROCKSDB_LITE +Status DBImpl::GetUpdatesSince( + SequenceNumber seq, unique_ptr* iter, + const TransactionLogIterator::ReadOptions& read_options) { + + RecordTick(options_.statistics.get(), GET_UPDATES_SINCE_CALLS); + if (seq > versions_->LastSequence()) { + return Status::NotFound("Requested sequence not yet written in the db"); + } + // Get all sorted Wal Files. + // Do binary search and open files and find the seq number. + + std::unique_ptr wal_files(new VectorLogPtr); + Status s = GetSortedWalFiles(*wal_files); + if (!s.ok()) { + return s; + } + + s = RetainProbableWalFiles(*wal_files, seq); + if (!s.ok()) { + return s; + } + iter->reset(new TransactionLogIteratorImpl(options_.wal_dir, &options_, + read_options, storage_options_, + seq, std::move(wal_files), this)); + return (*iter)->status(); +} + +Status DBImpl::DeleteFile(std::string name) { + uint64_t number; + FileType type; + WalFileType log_type; + if (!ParseFileName(name, &number, &type, &log_type) || + (type != kTableFile && type != kLogFile)) { + Log(options_.info_log, "DeleteFile %s failed.\n", name.c_str()); + return Status::InvalidArgument("Invalid file name"); + } + + Status status; + if (type == kLogFile) { + // Only allow deleting archived log files + if (log_type != kArchivedLogFile) { + Log(options_.info_log, "DeleteFile %s failed - not archived log.\n", + name.c_str()); + return Status::NotSupported("Delete only supported for archived logs"); + } + status = env_->DeleteFile(options_.wal_dir + "/" + name.c_str()); + if (!status.ok()) { + Log(options_.info_log, "DeleteFile %s failed -- %s.\n", + name.c_str(), status.ToString().c_str()); + } + return status; + } + + int level; + FileMetaData* metadata; + ColumnFamilyData* cfd; + VersionEdit edit; + DeletionState deletion_state(true); + { + MutexLock l(&mutex_); + status = versions_->GetMetadataForFile(number, &level, &metadata, &cfd); + if (!status.ok()) { + Log(options_.info_log, "DeleteFile %s failed. File not found\n", + name.c_str()); + return Status::InvalidArgument("File not found"); + } + assert((level > 0) && (level < cfd->NumberLevels())); + + // If the file is being compacted no need to delete. + if (metadata->being_compacted) { + Log(options_.info_log, + "DeleteFile %s Skipped. File about to be compacted\n", name.c_str()); + return Status::OK(); + } + + // Only the files in the last level can be deleted externally. + // This is to make sure that any deletion tombstones are not + // lost. Check that the level passed is the last level. + for (int i = level + 1; i < cfd->NumberLevels(); i++) { + if (cfd->current()->NumLevelFiles(i) != 0) { + Log(options_.info_log, + "DeleteFile %s FAILED. 
File not in last level\n", name.c_str()); + return Status::InvalidArgument("File not in last level"); + } + } + edit.DeleteFile(level, number); + status = versions_->LogAndApply(cfd, &edit, &mutex_, db_directory_.get()); + if (status.ok()) { + InstallSuperVersion(cfd, deletion_state); + } + FindObsoleteFiles(deletion_state, false); + } // lock released here + LogFlush(options_.info_log); + // remove files outside the db-lock + if (deletion_state.HaveSomethingToDelete()) { + PurgeObsoleteFiles(deletion_state); + } + { + MutexLock l(&mutex_); + // schedule flush if file deletion means we freed the space for flushes to + // continue + MaybeScheduleFlushOrCompaction(); + } + return status; +} + +void DBImpl::GetLiveFilesMetaData(std::vector* metadata) { + MutexLock l(&mutex_); + versions_->GetLiveFilesMetaData(metadata); +} +#endif // ROCKSDB_LITE + +Status DBImpl::CheckConsistency() { + mutex_.AssertHeld(); + std::vector metadata; + versions_->GetLiveFilesMetaData(&metadata); + + std::string corruption_messages; + for (const auto& md : metadata) { + std::string file_path = dbname_ + md.name; + uint64_t fsize = 0; + Status s = env_->GetFileSize(file_path, &fsize); + if (!s.ok()) { + corruption_messages += + "Can't access " + md.name + ": " + s.ToString() + "\n"; + } else if (fsize != md.size) { + corruption_messages += "Sst file size mismatch: " + md.name + + ". Size recorded in manifest " + + std::to_string(md.size) + ", actual size " + + std::to_string(fsize) + "\n"; + } + } + if (corruption_messages.size() == 0) { + return Status::OK(); + } else { + return Status::Corruption(corruption_messages); + } +} + +Status DBImpl::GetDbIdentity(std::string& identity) { + std::string idfilename = IdentityFileName(dbname_); + unique_ptr idfile; + const EnvOptions soptions; + Status s = env_->NewSequentialFile(idfilename, &idfile, soptions); + if (!s.ok()) { + return s; + } + uint64_t file_size; + s = env_->GetFileSize(idfilename, &file_size); + if (!s.ok()) { + return s; + } + char buffer[file_size]; + Slice id; + s = idfile->Read(file_size, &id, buffer); + if (!s.ok()) { + return s; + } + identity.assign(id.ToString()); + // If last character is '\n' remove it from identity + if (identity.size() > 0 && identity.back() == '\n') { + identity.pop_back(); + } + return s; +} + +// Default implementations of convenience methods that subclasses of DB +// can call if they wish +Status DB::Put(const WriteOptions& opt, ColumnFamilyHandle* column_family, + const Slice& key, const Slice& value) { + // Pre-allocate size of write batch conservatively. + // 8 bytes are taken by header, 4 bytes for count, 1 byte for type, + // and we allocate 11 extra bytes for key length, as well as value length. 
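// [Editorial note] The 24-byte headroom used below is 8 (sequence header) +
// 4 (count) + 1 (record type) + 11 (varint key/value lengths), per the
// comment above. So a Put of a 3-byte key and a 5-byte value reserves
// 3 + 5 + 24 = 32 bytes and normally avoids reallocating the batch's buffer.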
+ WriteBatch batch(key.size() + value.size() + 24); + batch.Put(column_family, key, value); + return Write(opt, &batch); +} + +Status DB::Delete(const WriteOptions& opt, ColumnFamilyHandle* column_family, + const Slice& key) { + WriteBatch batch; + batch.Delete(column_family, key); + return Write(opt, &batch); +} + +Status DB::Merge(const WriteOptions& opt, ColumnFamilyHandle* column_family, + const Slice& key, const Slice& value) { + WriteBatch batch; + batch.Merge(column_family, key, value); + return Write(opt, &batch); +} + +// Default implementation -- returns not supported status +Status DB::CreateColumnFamily(const ColumnFamilyOptions& options, + const std::string& column_family_name, + ColumnFamilyHandle** handle) { + return Status::NotSupported(""); +} +Status DB::DropColumnFamily(ColumnFamilyHandle* column_family) { + return Status::NotSupported(""); +} + +DB::~DB() { } + +Status DB::Open(const Options& options, const std::string& dbname, DB** dbptr) { + DBOptions db_options(options); + ColumnFamilyOptions cf_options(options); + std::vector column_families; + column_families.push_back( + ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options)); + std::vector handles; + Status s = DB::Open(db_options, dbname, column_families, &handles, dbptr); + if (s.ok()) { + assert(handles.size() == 1); + // i can delete the handle since DBImpl is always holding a reference to + // default column family + delete handles[0]; + } + return s; +} + +Status DB::Open(const DBOptions& db_options, const std::string& dbname, + const std::vector& column_families, + std::vector* handles, DB** dbptr) { + *dbptr = nullptr; + handles->clear(); + + size_t max_write_buffer_size = 0; + for (auto cf : column_families) { + max_write_buffer_size = + std::max(max_write_buffer_size, cf.options.write_buffer_size); + if (cf.options.block_cache != nullptr && cf.options.no_block_cache) { + return Status::InvalidArgument( + "no_block_cache is true while block_cache is not nullptr"); + } + } + + DBImpl* impl = new DBImpl(db_options, dbname); + Status s = impl->env_->CreateDirIfMissing(impl->options_.wal_dir); + if (!s.ok()) { + delete impl; + return s; + } + + s = impl->CreateArchivalDirectory(); + if (!s.ok()) { + delete impl; + return s; + } + impl->mutex_.Lock(); + // Handles create_if_missing, error_if_exists + s = impl->Recover(column_families); + if (s.ok()) { + uint64_t new_log_number = impl->versions_->NewFileNumber(); + unique_ptr lfile; + EnvOptions soptions(db_options); + s = impl->options_.env->NewWritableFile( + LogFileName(impl->options_.wal_dir, new_log_number), &lfile, + impl->options_.env->OptimizeForLogWrite(soptions)); + if (s.ok()) { + lfile->SetPreallocationBlockSize(1.1 * max_write_buffer_size); + impl->logfile_number_ = new_log_number; + impl->log_.reset(new log::Writer(std::move(lfile))); + + // set column family handles + for (auto cf : column_families) { + auto cfd = + impl->versions_->GetColumnFamilySet()->GetColumnFamily(cf.name); + if (cfd == nullptr) { + s = Status::InvalidArgument("Column family not found: ", cf.name); + break; + } + handles->push_back( + new ColumnFamilyHandleImpl(cfd, impl, &impl->mutex_)); + } + } + if (s.ok()) { + for (auto cfd : *impl->versions_->GetColumnFamilySet()) { + delete cfd->InstallSuperVersion(new SuperVersion(), &impl->mutex_); + } + impl->alive_log_files_.push_back( + DBImpl::LogFileNumberSize(impl->logfile_number_)); + impl->DeleteObsoleteFiles(); + impl->MaybeScheduleFlushOrCompaction(); + impl->MaybeScheduleLogDBDeployStats(); + s = 
impl->db_directory_->Fsync(); + } + } + + if (s.ok()) { + for (auto cfd : *impl->versions_->GetColumnFamilySet()) { + if (cfd->options()->compaction_style == kCompactionStyleUniversal || + cfd->options()->compaction_style == kCompactionStyleFIFO) { + Version* current = cfd->current(); + for (int i = 1; i < current->NumberLevels(); ++i) { + int num_files = current->NumLevelFiles(i); + if (num_files > 0) { + s = Status::InvalidArgument( + "Not all files are at level 0. Cannot " + "open with universal or FIFO compaction style."); + break; + } + } + } + if (cfd->options()->merge_operator != nullptr && + !cfd->mem()->IsMergeOperatorSupported()) { + s = Status::InvalidArgument( + "The memtable of column family %s does not support merge operator " + "its options.merge_operator is non-null", cfd->GetName().c_str()); + } + if (!s.ok()) { + break; + } + } + } + + impl->mutex_.Unlock(); + + if (s.ok()) { + impl->opened_successfully_ = true; + *dbptr = impl; + } else { + for (auto h : *handles) { + delete h; + } + handles->clear(); + delete impl; + } + return s; +} + +Status DB::ListColumnFamilies(const DBOptions& db_options, + const std::string& name, + std::vector* column_families) { + return VersionSet::ListColumnFamilies(column_families, name, db_options.env); +} + +Snapshot::~Snapshot() { +} + +Status DestroyDB(const std::string& dbname, const Options& options) { + const InternalKeyComparator comparator(options.comparator); + const InternalFilterPolicy filter_policy(options.filter_policy); + const Options& soptions(SanitizeOptions( + dbname, &comparator, &filter_policy, options)); + Env* env = soptions.env; + std::vector filenames; + std::vector archiveFiles; + + std::string archivedir = ArchivalDirectory(dbname); + // Ignore error in case directory does not exist + env->GetChildren(dbname, &filenames); + + if (dbname != soptions.wal_dir) { + std::vector logfilenames; + env->GetChildren(soptions.wal_dir, &logfilenames); + filenames.insert(filenames.end(), logfilenames.begin(), logfilenames.end()); + archivedir = ArchivalDirectory(soptions.wal_dir); + } + + if (filenames.empty()) { + return Status::OK(); + } + + FileLock* lock; + const std::string lockname = LockFileName(dbname); + Status result = env->LockFile(lockname, &lock); + if (result.ok()) { + uint64_t number; + FileType type; + for (size_t i = 0; i < filenames.size(); i++) { + if (ParseFileName(filenames[i], &number, &type) && + type != kDBLockFile) { // Lock file will be deleted at end + Status del; + if (type == kMetaDatabase) { + del = DestroyDB(dbname + "/" + filenames[i], options); + } else if (type == kLogFile) { + del = env->DeleteFile(soptions.wal_dir + "/" + filenames[i]); + } else { + del = env->DeleteFile(dbname + "/" + filenames[i]); + } + if (result.ok() && !del.ok()) { + result = del; + } + } + } + + env->GetChildren(archivedir, &archiveFiles); + // Delete archival files. + for (size_t i = 0; i < archiveFiles.size(); ++i) { + if (ParseFileName(archiveFiles[i], &number, &type) && + type == kLogFile) { + Status del = env->DeleteFile(archivedir + "/" + archiveFiles[i]); + if (result.ok() && !del.ok()) { + result = del; + } + } + } + // ignore case where no archival directory is present. 
+ env->DeleteDir(archivedir); + + env->UnlockFile(lock); // Ignore error since state is already gone + env->DeleteFile(lockname); + env->DeleteDir(dbname); // Ignore error in case dir contains other files + env->DeleteDir(soptions.wal_dir); + } + return result; +} + +// +// A global method that can dump out the build version +void DumpLeveldbBuildVersion(Logger * log) { +#if !defined(IOS_CROSS_COMPILE) + // if we compile with Xcode, we don't run build_detect_version, so we don't generate util/build_version.cc + Log(log, "Git sha %s", rocksdb_build_git_sha); + Log(log, "Compile time %s %s", + rocksdb_build_compile_time, rocksdb_build_compile_date); +#endif +} + +} // namespace rocksdb diff --git a/db/db_impl.h b/db/db_impl.h new file mode 100644 index 0000000000..6049db8d6d --- /dev/null +++ b/db/db_impl.h @@ -0,0 +1,635 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "db/dbformat.h" +#include "db/log_writer.h" +#include "db/snapshot.h" +#include "db/column_family.h" +#include "db/version_edit.h" +#include "memtable_list.h" +#include "port/port.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/memtablerep.h" +#include "rocksdb/transaction_log.h" +#include "util/autovector.h" +#include "util/stats_logger.h" +#include "util/thread_local.h" +#include "db/internal_stats.h" + +namespace rocksdb { + +class MemTable; +class TableCache; +class Version; +class VersionEdit; +class VersionSet; +class CompactionFilterV2; +class Arena; + +class DBImpl : public DB { + public: + DBImpl(const DBOptions& options, const std::string& dbname); + virtual ~DBImpl(); + + // Implementations of the DB interface + using DB::Put; + virtual Status Put(const WriteOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value); + using DB::Merge; + virtual Status Merge(const WriteOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value); + using DB::Delete; + virtual Status Delete(const WriteOptions& options, + ColumnFamilyHandle* column_family, const Slice& key); + using DB::Write; + virtual Status Write(const WriteOptions& options, WriteBatch* updates); + using DB::Get; + virtual Status Get(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + std::string* value); + using DB::MultiGet; + virtual std::vector<Status> MultiGet( + const ReadOptions& options, + const std::vector<ColumnFamilyHandle*>& column_family, + const std::vector<Slice>& keys, std::vector<std::string>* values); + + virtual Status CreateColumnFamily(const ColumnFamilyOptions& options, + const std::string& column_family, + ColumnFamilyHandle** handle); + virtual Status DropColumnFamily(ColumnFamilyHandle* column_family); + + // Returns false if key doesn't exist in the database and true if it may. + // If value_found is not passed in as null, then return the value if found in + // memory. On return, if value was found, then value_found will be set to + // true, otherwise false.
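+ //
+ // Example usage (an illustrative sketch, assuming caller-side variables
+ // 'db', 'cf' and 'key' that are not part of this interface):
+ //   bool value_found = false;
+ //   std::string value;
+ //   if (db->KeyMayExist(ReadOptions(), cf, key, &value, &value_found) &&
+ //       value_found) {
+ //     // value was served from memory without touching the table files
+ //   }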
+ using DB::KeyMayExist; + virtual bool KeyMayExist(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + std::string* value, bool* value_found = nullptr); + using DB::NewIterator; + virtual Iterator* NewIterator(const ReadOptions& options, + ColumnFamilyHandle* column_family); + virtual Status NewIterators( + const ReadOptions& options, + const std::vector<ColumnFamilyHandle*>& column_families, + std::vector<Iterator*>* iterators); + virtual const Snapshot* GetSnapshot(); + virtual void ReleaseSnapshot(const Snapshot* snapshot); + using DB::GetProperty; + virtual bool GetProperty(ColumnFamilyHandle* column_family, + const Slice& property, std::string* value); + using DB::GetApproximateSizes; + virtual void GetApproximateSizes(ColumnFamilyHandle* column_family, + const Range* range, int n, uint64_t* sizes); + using DB::CompactRange; + virtual Status CompactRange(ColumnFamilyHandle* column_family, + const Slice* begin, const Slice* end, + bool reduce_level = false, int target_level = -1); + + using DB::NumberLevels; + virtual int NumberLevels(ColumnFamilyHandle* column_family); + using DB::MaxMemCompactionLevel; + virtual int MaxMemCompactionLevel(ColumnFamilyHandle* column_family); + using DB::Level0StopWriteTrigger; + virtual int Level0StopWriteTrigger(ColumnFamilyHandle* column_family); + virtual const std::string& GetName() const; + virtual Env* GetEnv() const; + using DB::GetOptions; + virtual const Options& GetOptions(ColumnFamilyHandle* column_family) const; + using DB::Flush; + virtual Status Flush(const FlushOptions& options, + ColumnFamilyHandle* column_family); + + virtual SequenceNumber GetLatestSequenceNumber() const; + +#ifndef ROCKSDB_LITE + virtual Status DisableFileDeletions(); + virtual Status EnableFileDeletions(bool force); + // All the returned filenames start with "/" + virtual Status GetLiveFiles(std::vector<std::string>&, + uint64_t* manifest_file_size, + bool flush_memtable = true); + virtual Status GetSortedWalFiles(VectorLogPtr& files); + + virtual Status GetUpdatesSince( + SequenceNumber seq_number, unique_ptr<TransactionLogIterator>* iter, + const TransactionLogIterator::ReadOptions& + read_options = TransactionLogIterator::ReadOptions()); + virtual Status DeleteFile(std::string name); + + virtual void GetLiveFilesMetaData(std::vector<LiveFileMetaData>* metadata); +#endif // ROCKSDB_LITE + + // checks if all live files exist on file system and that their file sizes + // match our in-memory records + virtual Status CheckConsistency(); + + virtual Status GetDbIdentity(std::string& identity); + + Status RunManualCompaction(ColumnFamilyData* cfd, int input_level, + int output_level, const Slice* begin, + const Slice* end); + +#ifndef ROCKSDB_LITE + // Extra methods (for testing) that are not in the public DB interface + // Implemented in db_impl_debug.cc + + // Compact any files in the named level that overlap [*begin, *end] + Status TEST_CompactRange(int level, const Slice* begin, const Slice* end, + ColumnFamilyHandle* column_family = nullptr); + + // Force current memtable contents to be flushed. + Status TEST_FlushMemTable(bool wait = true); + + // Wait for memtable flush + Status TEST_WaitForFlushMemTable(ColumnFamilyHandle* column_family = nullptr); + + // Wait for any compaction + Status TEST_WaitForCompact(); + + // Return an internal iterator over the current state of the database. + // The keys of this iterator are internal keys (see format.h). + // The returned iterator should be deleted when no longer needed.
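+ //
+ // Test-only usage sketch ('dbi' is an illustrative DBImpl* owned by the
+ // caller, not part of this interface):
+ //   Iterator* it = dbi->TEST_NewInternalIterator();
+ //   for (it->SeekToFirst(); it->Valid(); it->Next()) {
+ //     // it->key() is an internal key: (user key, sequence, type)
+ //   }
+ //   delete it;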
+ Iterator* TEST_NewInternalIterator(ColumnFamilyHandle* column_family = + nullptr); + + // Return the maximum overlapping data (in bytes) at next level for any + // file at a level >= 1. + int64_t TEST_MaxNextLevelOverlappingBytes(ColumnFamilyHandle* column_family = + nullptr); + + // Return the current manifest file number. + uint64_t TEST_Current_Manifest_FileNo(); + + // Triggers a background call for testing. + void TEST_PurgeObsoleteWAL(); + + // Get total level0 file size. Only for testing. + uint64_t TEST_GetLevel0TotalSize(); + + void TEST_SetDefaultTimeToCheck(uint64_t default_interval_to_delete_obsolete_WAL) + { + default_interval_to_delete_obsolete_WAL_ = default_interval_to_delete_obsolete_WAL; + } + + void TEST_GetFilesMetaData(ColumnFamilyHandle* column_family, + std::vector<std::vector<FileMetaData>>* metadata); + + Status TEST_ReadFirstRecord(const WalFileType type, const uint64_t number, + SequenceNumber* sequence); + + Status TEST_ReadFirstLine(const std::string& fname, SequenceNumber* sequence); +#endif // ROCKSDB_LITE + + // needed for CleanupIteratorState + struct DeletionState { + inline bool HaveSomethingToDelete() const { + return candidate_files.size() || + sst_delete_files.size() || + log_delete_files.size(); + } + + // a list of all files that we'll consider deleting + // (every once in a while this is filled up with all files + // in the DB directory) + std::vector<std::string> candidate_files; + + // the list of all live sst files that cannot be deleted + std::vector<uint64_t> sst_live; + + // a list of sst files that we need to delete + std::vector<FileMetaData*> sst_delete_files; + + // a list of log files that we need to delete + std::vector<uint64_t> log_delete_files; + + // a list of memtables to be freed + autovector<MemTable*> memtables_to_free; + + autovector<SuperVersion*> superversions_to_free; + + SuperVersion* new_superversion; // if nullptr no new superversion + + // the current manifest_file_number, log_number and prev_log_number + // that corresponds to the set of files in 'live'. + uint64_t manifest_file_number, pending_manifest_file_number, log_number, + prev_log_number; + + explicit DeletionState(bool create_superversion = false) { + manifest_file_number = 0; + pending_manifest_file_number = 0; + log_number = 0; + prev_log_number = 0; + new_superversion = create_superversion ? new SuperVersion() : nullptr; + } + + ~DeletionState() { + // free pending memtables + for (auto m : memtables_to_free) { + delete m; + } + // free superversions + for (auto s : superversions_to_free) { + delete s; + } + // if new_superversion was not used, it will be non-nullptr and needs + // to be freed here + delete new_superversion; + } + }; + + // Returns the list of live files in 'live' and the list + // of all files in the filesystem in 'candidate_files'. + // If force == false and the last call was less than + // options_.delete_obsolete_files_period_micros microseconds ago, + // it will not fill up the deletion_state + void FindObsoleteFiles(DeletionState& deletion_state, + bool force, + bool no_full_scan = false); + + // Diffs the files listed in filenames and those that do not + // belong to live files are possibly removed. Also, removes all the + // files in sst_delete_files and log_delete_files. + // It is not necessary to hold the mutex when invoking this method.
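+ //
+ // Typical call pattern (a sketch of how FindObsoleteFiles and
+ // PurgeObsoleteFiles are meant to pair up; locking shown for clarity):
+ //   DeletionState deletion_state;
+ //   mutex_.Lock();
+ //   FindObsoleteFiles(deletion_state, true /* force */);
+ //   mutex_.Unlock();
+ //   if (deletion_state.HaveSomethingToDelete()) {
+ //     PurgeObsoleteFiles(deletion_state);
+ //   }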
+ void PurgeObsoleteFiles(DeletionState& deletion_state); + + ColumnFamilyHandle* DefaultColumnFamily() const; + + protected: + Env* const env_; + const std::string dbname_; + unique_ptr<VersionSet> versions_; + const DBOptions options_; + + Iterator* NewInternalIterator(const ReadOptions&, ColumnFamilyData* cfd, + SuperVersion* super_version, + Arena* arena = nullptr); + + private: + friend class DB; + friend class InternalStats; +#ifndef ROCKSDB_LITE + friend class TailingIterator; + friend class ForwardIterator; +#endif + friend struct SuperVersion; + struct CompactionState; + struct Writer; + + Status NewDB(); + + // Recover the descriptor from persistent storage. May do a significant + // amount of work to recover recently logged updates. Any changes to + // be made to the descriptor are added to *edit. + Status Recover(const std::vector<ColumnFamilyDescriptor>& column_families, + bool read_only = false, bool error_if_log_file_exist = false); + + void MaybeIgnoreError(Status* s) const; + + const Status CreateArchivalDirectory(); + + // Delete any unneeded files and stale in-memory entries. + void DeleteObsoleteFiles(); + + // Flush the in-memory write buffer to storage. Switches to a new + // log-file/memtable and writes a new descriptor iff successful. + Status FlushMemTableToOutputFile(ColumnFamilyData* cfd, bool* madeProgress, + DeletionState& deletion_state, + LogBuffer* log_buffer); + + Status RecoverLogFile(uint64_t log_number, SequenceNumber* max_sequence, + bool read_only); + + // The following two methods are used to flush a memtable to + // storage. The first one is used at database recovery time (when the + // database is opened) and is heavyweight because it holds the mutex + // for the entire period. The second method WriteLevel0Table supports + // flushing memtables to storage concurrently. + Status WriteLevel0TableForRecovery(ColumnFamilyData* cfd, MemTable* mem, + VersionEdit* edit); + Status WriteLevel0Table(ColumnFamilyData* cfd, autovector<MemTable*>& mems, + VersionEdit* edit, uint64_t* filenumber, + LogBuffer* log_buffer); + + uint64_t SlowdownAmount(int n, double bottom, double top); + + // TODO(icanadi) free superversion_to_free and old_log outside of mutex + Status MakeRoomForWrite(ColumnFamilyData* cfd, + bool force /* flush even if there is room? */, + autovector<SuperVersion*>* superversions_to_free, + autovector<log::Writer*>* logs_to_free); + + void BuildBatchGroup(Writer** last_writer, + autovector<WriteBatch*>* write_batch_group); + + // Force current memtable contents to be flushed. + Status FlushMemTable(ColumnFamilyData* cfd, const FlushOptions& options); + + // Wait for memtable to be flushed + Status WaitForFlushMemTable(ColumnFamilyData* cfd); + + void MaybeScheduleLogDBDeployStats(); + +#ifndef ROCKSDB_LITE + static void BGLogDBDeployStats(void* db); + void LogDBDeployStats(); +#endif // ROCKSDB_LITE + + void MaybeScheduleFlushOrCompaction(); + static void BGWorkCompaction(void* db); + static void BGWorkFlush(void* db); + void BackgroundCallCompaction(); + void BackgroundCallFlush(); + Status BackgroundCompaction(bool* madeProgress, DeletionState& deletion_state, + LogBuffer* log_buffer); + Status BackgroundFlush(bool* madeProgress, DeletionState& deletion_state, + LogBuffer* log_buffer); + void CleanupCompaction(CompactionState* compact, Status status); + Status DoCompactionWork(CompactionState* compact, + DeletionState& deletion_state, + LogBuffer* log_buffer); + + // This function is called as part of compaction.
It enables the flush process to + // preempt compaction, since flush has higher priority + // Returns: micros spent executing + uint64_t CallFlushDuringCompaction(ColumnFamilyData* cfd, + DeletionState& deletion_state, + LogBuffer* log_buffer); + + // Call compaction filter if is_compaction_v2 is not true. Then iterate + // through input and compact the kv-pairs + Status ProcessKeyValueCompaction( + SequenceNumber visible_at_tip, + SequenceNumber earliest_snapshot, + SequenceNumber latest_snapshot, + DeletionState& deletion_state, + bool bottommost_level, + int64_t& imm_micros, + Iterator* input, + CompactionState* compact, + bool is_compaction_v2, + LogBuffer* log_buffer); + + // Call compaction_filter_v2->Filter() on kv-pairs in compact + void CallCompactionFilterV2(CompactionState* compact, + CompactionFilterV2* compaction_filter_v2); + + Status OpenCompactionOutputFile(CompactionState* compact); + Status FinishCompactionOutputFile(CompactionState* compact, Iterator* input); + Status InstallCompactionResults(CompactionState* compact, + LogBuffer* log_buffer); + void AllocateCompactionOutputFileNumbers(CompactionState* compact); + void ReleaseCompactionUnusedFileNumbers(CompactionState* compact); + +#ifdef ROCKSDB_LITE + void PurgeObsoleteWALFiles() { + // this function is used for archiving WAL files. we don't need this in + // ROCKSDB_LITE + } +#else + void PurgeObsoleteWALFiles(); + + Status GetSortedWalsOfType(const std::string& path, + VectorLogPtr& log_files, + WalFileType type); + + // Requires: all_logs should be sorted with earliest log file first + // Retains all log files in all_logs which contain updates with sequence + // numbers greater than or equal to the requested SequenceNumber. + Status RetainProbableWalFiles(VectorLogPtr& all_logs, + const SequenceNumber target); + + Status ReadFirstRecord(const WalFileType type, const uint64_t number, + SequenceNumber* sequence); + + Status ReadFirstLine(const std::string& fname, SequenceNumber* sequence); +#endif // ROCKSDB_LITE + + void PrintStatistics(); + + // dump rocksdb.stats to LOG + void MaybeDumpStats(); + + // Return true if the current db supports snapshot. If the current + // DB does not support snapshot, then calling GetSnapshot() will always + // return nullptr. + // + // @see GetSnapshot() + virtual bool IsSnapshotSupported() const; + + // Return the minimum empty level that could hold the total data in the + // input level. Return the input level, if such level could not be found. + int FindMinimumEmptyLevelFitting(ColumnFamilyData* cfd, int level); + + // Move the files in the input level to the target level. + // If target_level < 0, automatically calculate the minimum level that could + // hold the data set. + Status ReFitLevel(ColumnFamilyData* cfd, int level, int target_level = -1); + + // table_cache_ provides its own synchronization + std::shared_ptr<TableCache> table_cache_; + + // Lock over the persistent DB state. Non-nullptr iff successfully acquired. + FileLock* db_lock_; + + // State below is protected by mutex_ + port::Mutex mutex_; + port::AtomicPointer shutting_down_; + // This condition variable is signaled on these conditions: + // * whenever bg_compaction_scheduled_ goes down to 0 + // * if bg_manual_only_ > 0, whenever a compaction finishes, even if it hasn't + // made any progress + // * whenever a compaction made any progress + // * whenever bg_flush_scheduled_ value decreases (i.e.
whenever a flush is + // done, even if it didn't make any progress) + // * whenever there is an error in background flush or compaction + // * whenever bg_logstats_scheduled_ turns to false + port::CondVar bg_cv_; + uint64_t logfile_number_; + unique_ptr<log::Writer> log_; + bool log_empty_; + ColumnFamilyHandleImpl* default_cf_handle_; + unique_ptr<ColumnFamilyMemTablesImpl> column_family_memtables_; + struct LogFileNumberSize { + explicit LogFileNumberSize(uint64_t _number) + : number(_number), size(0), getting_flushed(false) {} + void AddSize(uint64_t new_size) { size += new_size; } + uint64_t number; + uint64_t size; + bool getting_flushed; + }; + std::deque<LogFileNumberSize> alive_log_files_; + uint64_t total_log_size_; + // only used for dynamically adjusting max_total_wal_size. it is a sum of + // [write_buffer_size * max_write_buffer_number] over all column families + uint64_t max_total_in_memory_state_; + + std::string host_name_; + + std::unique_ptr<Directory> db_directory_; + + // Queue of writers. + std::deque<Writer*> writers_; + WriteBatch tmp_batch_; + + SnapshotList snapshots_; + + // cache for ReadFirstRecord() calls + std::unordered_map<uint64_t, SequenceNumber> read_first_record_cache_; + port::Mutex read_first_record_cache_mutex_; + + // Set of table files to protect from deletion because they are + // part of ongoing compactions. + std::set<uint64_t> pending_outputs_; + + // At least one compaction or flush job is pending but not yet scheduled + // because of the max background thread limit. + bool bg_schedule_needed_; + + // count how many background compactions are running or have been scheduled + int bg_compaction_scheduled_; + + // If non-zero, MaybeScheduleFlushOrCompaction() will only schedule manual + // compactions (if manual_compaction_ is not null). This mechanism enables + // manual compactions to wait until all other compactions are finished. + int bg_manual_only_; + + // number of background memtable flush jobs, submitted to the HIGH pool + int bg_flush_scheduled_; + + // Has a background stats log thread scheduled? + bool bg_logstats_scheduled_; + + // Information for a manual compaction + struct ManualCompaction { + ColumnFamilyData* cfd; + int input_level; + int output_level; + bool done; + Status status; + bool in_progress; // compaction request being processed? + const InternalKey* begin; // nullptr means beginning of key range + const InternalKey* end; // nullptr means end of key range + InternalKey tmp_storage; // Used to keep track of compaction progress + }; + ManualCompaction* manual_compaction_; + + // Have we encountered a background error in paranoid mode? + Status bg_error_; + + std::unique_ptr<StatsLogger> logger_; + + int64_t volatile last_log_ts; + + // Controls deletion of obsolete files: + // if 0, deletion is enabled; + // if non-zero, files will not be deleted. + // This enables two different threads to call + // EnableFileDeletions() and DisableFileDeletions() + // without any synchronization + int disable_delete_obsolete_files_; + + // last time when DeleteObsoleteFiles was invoked + uint64_t delete_obsolete_files_last_run_; + + // last time when PurgeObsoleteWALFiles ran. + uint64_t purge_wal_files_last_run_; + + // last time stats were dumped to LOG + std::atomic<uint64_t> last_stats_dump_time_microsec_; + + // obsolete files will be deleted every this many seconds if ttl deletion is + // enabled and archive size_limit is disabled. + uint64_t default_interval_to_delete_obsolete_WAL_; + + bool flush_on_destroy_; // Used when disableWAL is true.
+ + static const int KEEP_LOG_FILE_NUM = 1000; + std::string db_absolute_path_; + + // count of the number of contiguous delaying writes + int delayed_writes_; + + // The options to access storage files + const EnvOptions storage_options_; + + // A value of true temporarily disables scheduling of background work + bool bg_work_gate_closed_; + + // Guard against multiple concurrent refitting + bool refitting_level_; + + // Indicate DB was opened successfully + bool opened_successfully_; + + // No copying allowed + DBImpl(const DBImpl&); + void operator=(const DBImpl&); + + // dump the delayed_writes_ to the log file and reset counter. + void DelayLoggingAndReset(); + + // Return the earliest snapshot where seqno is visible. + // Store the snapshot right before that, if any, in prev_snapshot + inline SequenceNumber findEarliestVisibleSnapshot( + SequenceNumber in, + std::vector<SequenceNumber>& snapshots, + SequenceNumber* prev_snapshot); + + // Background threads call this function, which is just a wrapper around + // the cfd->InstallSuperVersion() function. Background threads carry + // deletion_state which can have new_superversion already allocated. + void InstallSuperVersion(ColumnFamilyData* cfd, + DeletionState& deletion_state); + +#ifndef ROCKSDB_LITE + using DB::GetPropertiesOfAllTables; + virtual Status GetPropertiesOfAllTables(ColumnFamilyHandle* column_family, + TablePropertiesCollection* props) + override; +#endif // ROCKSDB_LITE + + // Function that Get and KeyMayExist call with no_io true or false + // Note: 'value_found' from KeyMayExist propagates here + Status GetImpl(const ReadOptions& options, ColumnFamilyHandle* column_family, + const Slice& key, std::string* value, + bool* value_found = nullptr); +}; + +// Sanitize db options. The caller should delete result.info_log if +// it is not equal to src.info_log. +extern Options SanitizeOptions(const std::string& db, + const InternalKeyComparator* icmp, + const InternalFilterPolicy* ipolicy, + const Options& src); +extern DBOptions SanitizeOptions(const std::string& db, const DBOptions& src); + +// Determine compression type, based on user options, level of the output +// file and whether compression is disabled. +// If enable_compression is false, then compression is always disabled no +// matter what the values of the other two parameters are. +// Otherwise, the compression type is determined based on options and level. +CompressionType GetCompressionType(const Options& options, int level, + const bool enable_compression); + +// Determine compression type for L0 file written by memtable flush. +CompressionType GetCompressionFlush(const Options& options); + +} // namespace rocksdb diff --git a/db/db_impl_debug.cc b/db/db_impl_debug.cc new file mode 100644 index 0000000000..927a01a043 --- /dev/null +++ b/db/db_impl_debug.cc @@ -0,0 +1,133 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors.
+ +#ifndef ROCKSDB_LITE + +#include "db/db_impl.h" + +namespace rocksdb { + +void DBImpl::TEST_PurgeObsoleteWAL() { PurgeObsoleteWALFiles(); } + +uint64_t DBImpl::TEST_GetLevel0TotalSize() { + MutexLock l(&mutex_); + return default_cf_handle_->cfd()->current()->NumLevelBytes(0); +} + +Iterator* DBImpl::TEST_NewInternalIterator(ColumnFamilyHandle* column_family) { + ColumnFamilyData* cfd; + if (column_family == nullptr) { + cfd = default_cf_handle_->cfd(); + } else { + auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family); + cfd = cfh->cfd(); + } + + mutex_.Lock(); + SuperVersion* super_version = cfd->GetSuperVersion()->Ref(); + mutex_.Unlock(); + ReadOptions roptions; + return NewInternalIterator(roptions, cfd, super_version); +} + +int64_t DBImpl::TEST_MaxNextLevelOverlappingBytes( + ColumnFamilyHandle* column_family) { + ColumnFamilyData* cfd; + if (column_family == nullptr) { + cfd = default_cf_handle_->cfd(); + } else { + auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family); + cfd = cfh->cfd(); + } + MutexLock l(&mutex_); + return cfd->current()->MaxNextLevelOverlappingBytes(); +} + +void DBImpl::TEST_GetFilesMetaData( + ColumnFamilyHandle* column_family, + std::vector<std::vector<FileMetaData>>* metadata) { + auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family); + auto cfd = cfh->cfd(); + MutexLock l(&mutex_); + metadata->resize(NumberLevels()); + for (int level = 0; level < NumberLevels(); level++) { + const std::vector<FileMetaData*>& files = cfd->current()->files_[level]; + + (*metadata)[level].clear(); + for (const auto& f : files) { + (*metadata)[level].push_back(*f); + } + } +} + +uint64_t DBImpl::TEST_Current_Manifest_FileNo() { + return versions_->ManifestFileNumber(); +} + +Status DBImpl::TEST_CompactRange(int level, const Slice* begin, + const Slice* end, + ColumnFamilyHandle* column_family) { + ColumnFamilyData* cfd; + if (column_family == nullptr) { + cfd = default_cf_handle_->cfd(); + } else { + auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family); + cfd = cfh->cfd(); + } + int output_level = + (cfd->options()->compaction_style == kCompactionStyleUniversal || + cfd->options()->compaction_style == kCompactionStyleFIFO) + ? level + : level + 1; + return RunManualCompaction(cfd, level, output_level, begin, end); +} + +Status DBImpl::TEST_FlushMemTable(bool wait) { + FlushOptions fo; + fo.wait = wait; + return FlushMemTable(default_cf_handle_->cfd(), fo); +} + +Status DBImpl::TEST_WaitForFlushMemTable(ColumnFamilyHandle* column_family) { + ColumnFamilyData* cfd; + if (column_family == nullptr) { + cfd = default_cf_handle_->cfd(); + } else { + auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family); + cfd = cfh->cfd(); + } + return WaitForFlushMemTable(cfd); +} + +Status DBImpl::TEST_WaitForCompact() { + // Wait until the compaction completes + + // TODO: a bug here. This function actually does not necessarily + // wait for compact. It actually waits for scheduled compaction + // OR flush to finish.
+ + MutexLock l(&mutex_); + while ((bg_compaction_scheduled_ || bg_flush_scheduled_) && bg_error_.ok()) { + bg_cv_.Wait(); + } + return bg_error_; +} + +Status DBImpl::TEST_ReadFirstRecord(const WalFileType type, + const uint64_t number, + SequenceNumber* sequence) { + return ReadFirstRecord(type, number, sequence); +} + +Status DBImpl::TEST_ReadFirstLine(const std::string& fname, + SequenceNumber* sequence) { + return ReadFirstLine(fname, sequence); +} +} // namespace rocksdb +#endif // ROCKSDB_LITE diff --git a/db/db_impl_readonly.cc b/db/db_impl_readonly.cc new file mode 100644 index 0000000000..43083746da --- /dev/null +++ b/db/db_impl_readonly.cc @@ -0,0 +1,154 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2012 Facebook. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "db/db_impl_readonly.h" +#include "db/db_impl.h" + +#include +#include +#include +#include +#include +#include +#include +#include "db/db_iter.h" +#include "db/dbformat.h" +#include "db/filename.h" +#include "db/log_reader.h" +#include "db/log_writer.h" +#include "db/memtable.h" +#include "db/merge_context.h" +#include "db/table_cache.h" +#include "db/version_set.h" +#include "db/write_batch_internal.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/status.h" +#include "rocksdb/table.h" +#include "rocksdb/merge_operator.h" +#include "port/port.h" +#include "table/block.h" +#include "table/merger.h" +#include "table/two_level_iterator.h" +#include "util/coding.h" +#include "util/logging.h" +#include "util/build_version.h" + +namespace rocksdb { + +DBImplReadOnly::DBImplReadOnly(const DBOptions& options, + const std::string& dbname) + : DBImpl(options, dbname) { + Log(options_.info_log, "Opening the db in read only mode"); +} + +DBImplReadOnly::~DBImplReadOnly() { +} + +// Implementations of the DB interface +Status DBImplReadOnly::Get(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + std::string* value) { + Status s; + SequenceNumber snapshot = versions_->LastSequence(); + auto cfh = reinterpret_cast(column_family); + auto cfd = cfh->cfd(); + SuperVersion* super_version = cfd->GetSuperVersion(); + MergeContext merge_context; + LookupKey lkey(key, snapshot); + if (super_version->mem->Get(lkey, value, &s, merge_context, + *cfd->options())) { + } else { + Version::GetStats stats; + super_version->current->Get(options, lkey, value, &s, &merge_context, + &stats); + } + return s; +} + +Iterator* DBImplReadOnly::NewIterator(const ReadOptions& options, + ColumnFamilyHandle* column_family) { + auto cfh = reinterpret_cast(column_family); + auto cfd = cfh->cfd(); + SuperVersion* super_version = cfd->GetSuperVersion()->Ref(); + SequenceNumber latest_snapshot = versions_->LastSequence(); + Iterator* internal_iter = NewInternalIterator(options, cfd, super_version); + return NewDBIterator( + env_, *cfd->options(), cfd->user_comparator(), internal_iter, + (options.snapshot != nullptr + ? 
reinterpret_cast<const SnapshotImpl*>(options.snapshot)->number_ + : latest_snapshot)); +} + +Status DB::OpenForReadOnly(const Options& options, const std::string& dbname, + DB** dbptr, bool error_if_log_file_exist) { + *dbptr = nullptr; + + DBOptions db_options(options); + ColumnFamilyOptions cf_options(options); + std::vector<ColumnFamilyDescriptor> column_families; + column_families.push_back( + ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options)); + std::vector<ColumnFamilyHandle*> handles; + + Status s = + DB::OpenForReadOnly(db_options, dbname, column_families, &handles, dbptr); + if (s.ok()) { + assert(handles.size() == 1); + // We can delete the handle since DBImpl always holds a + // reference to the default column family + delete handles[0]; + } + return s; +} + +Status DB::OpenForReadOnly( + const DBOptions& db_options, const std::string& dbname, + const std::vector<ColumnFamilyDescriptor>& column_families, + std::vector<ColumnFamilyHandle*>* handles, DB** dbptr, + bool error_if_log_file_exist) { + *dbptr = nullptr; + handles->clear(); + + DBImplReadOnly* impl = new DBImplReadOnly(db_options, dbname); + impl->mutex_.Lock(); + Status s = impl->Recover(column_families, true /* read only */, + error_if_log_file_exist); + if (s.ok()) { + // set column family handles + for (auto cf : column_families) { + auto cfd = + impl->versions_->GetColumnFamilySet()->GetColumnFamily(cf.name); + if (cfd == nullptr) { + s = Status::InvalidArgument("Column family not found: ", cf.name); + break; + } + handles->push_back(new ColumnFamilyHandleImpl(cfd, impl, &impl->mutex_)); + } + } + if (s.ok()) { + for (auto cfd : *impl->versions_->GetColumnFamilySet()) { + delete cfd->InstallSuperVersion(new SuperVersion(), &impl->mutex_); + } + } + impl->mutex_.Unlock(); + if (s.ok()) { + *dbptr = impl; + } else { + for (auto h : *handles) { + delete h; + } + handles->clear(); + delete impl; + } + return s; +} + + +} // namespace rocksdb diff --git a/db/db_impl_readonly.h b/db/db_impl_readonly.h new file mode 100644 index 0000000000..c4703ba695 --- /dev/null +++ b/db/db_impl_readonly.h @@ -0,0 +1,103 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2012 Facebook. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#pragma once +#include "db/db_impl.h" + +#include +#include +#include +#include +#include "db/dbformat.h" +#include "db/log_writer.h" +#include "db/snapshot.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "port/port.h" +#include "util/stats_logger.h" + +namespace rocksdb { + +class DBImplReadOnly : public DBImpl { + public: + DBImplReadOnly(const DBOptions& options, const std::string& dbname); + virtual ~DBImplReadOnly(); + + // Implementations of the DB interface + using DB::Get; + virtual Status Get(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + std::string* value); + + // TODO: Implement ReadOnly MultiGet?
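+ //
+ // Typical read-only usage (an illustrative sketch; the path and key
+ // shown are hypothetical):
+ //   DB* db = nullptr;
+ //   Status s = DB::OpenForReadOnly(Options(), "/path/to/db", &db);
+ //   std::string value;
+ //   if (s.ok()) {
+ //     s = db->Get(ReadOptions(), "key", &value);
+ //   }
+ //   delete db;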
+ + using DBImpl::NewIterator; + virtual Iterator* NewIterator(const ReadOptions&, + ColumnFamilyHandle* column_family); + + virtual Status NewIterators( + const ReadOptions& options, + const std::vector& column_family, + std::vector* iterators) { + // TODO + return Status::NotSupported("Not supported yet."); + } + + using DBImpl::Put; + virtual Status Put(const WriteOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value) { + return Status::NotSupported("Not supported operation in read only mode."); + } + using DBImpl::Merge; + virtual Status Merge(const WriteOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value) { + return Status::NotSupported("Not supported operation in read only mode."); + } + using DBImpl::Delete; + virtual Status Delete(const WriteOptions& options, + ColumnFamilyHandle* column_family, const Slice& key) { + return Status::NotSupported("Not supported operation in read only mode."); + } + virtual Status Write(const WriteOptions& options, WriteBatch* updates) { + return Status::NotSupported("Not supported operation in read only mode."); + } + using DBImpl::CompactRange; + virtual Status CompactRange(ColumnFamilyHandle* column_family, + const Slice* begin, const Slice* end, + bool reduce_level = false, + int target_level = -1) { + return Status::NotSupported("Not supported operation in read only mode."); + } + virtual Status DisableFileDeletions() { + return Status::NotSupported("Not supported operation in read only mode."); + } + virtual Status EnableFileDeletions(bool force) { + return Status::NotSupported("Not supported operation in read only mode."); + } + virtual Status GetLiveFiles(std::vector&, + uint64_t* manifest_file_size, + bool flush_memtable = true) { + return Status::NotSupported("Not supported operation in read only mode."); + } + using DBImpl::Flush; + virtual Status Flush(const FlushOptions& options, + ColumnFamilyHandle* column_family) { + return Status::NotSupported("Not supported operation in read only mode."); + } + + private: + friend class DB; + + // No copying allowed + DBImplReadOnly(const DBImplReadOnly&); + void operator=(const DBImplReadOnly&); +}; +} diff --git a/db/db_iter.cc b/db/db_iter.cc new file mode 100644 index 0000000000..1f49b7aa7f --- /dev/null +++ b/db/db_iter.cc @@ -0,0 +1,517 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "db/db_iter.h" +#include +#include + +#include "db/filename.h" +#include "db/dbformat.h" +#include "rocksdb/env.h" +#include "rocksdb/options.h" +#include "rocksdb/iterator.h" +#include "rocksdb/merge_operator.h" +#include "port/port.h" +#include "util/arena.h" +#include "util/logging.h" +#include "util/mutexlock.h" +#include "util/perf_context_imp.h" + +namespace rocksdb { + +#if 0 +static void DumpInternalIter(Iterator* iter) { + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ParsedInternalKey k; + if (!ParseInternalKey(iter->key(), &k)) { + fprintf(stderr, "Corrupt '%s'\n", EscapeString(iter->key()).c_str()); + } else { + fprintf(stderr, "@ '%s'\n", k.DebugString().c_str()); + } + } +} +#endif + +// Memtables and sstables that make the DB representation contain +// (userkey,seq,type) => uservalue entries. DBIter +// combines multiple entries for the same userkey found in the DB +// representation into a single entry while accounting for sequence +// numbers, deletion markers, overwrites, etc. +class DBIter: public Iterator { + public: + // The following is grossly complicated. TODO: clean it up + // Which direction is the iterator currently moving? + // (1) When moving forward, the internal iterator is positioned at + // the exact entry that yields this->key(), this->value() + // (2) When moving backwards, the internal iterator is positioned + // just before all entries whose user key == this->key(). + enum Direction { + kForward, + kReverse + }; + + DBIter(Env* env, const Options& options, const Comparator* cmp, + Iterator* iter, SequenceNumber s, bool arena_mode) + : arena_mode_(arena_mode), + env_(env), + logger_(options.info_log.get()), + user_comparator_(cmp), + user_merge_operator_(options.merge_operator.get()), + iter_(iter), + sequence_(s), + direction_(kForward), + valid_(false), + current_entry_is_merged_(false), + statistics_(options.statistics.get()) { + RecordTick(statistics_, NO_ITERATORS, 1); + max_skip_ = options.max_sequential_skip_in_iterations; + } + virtual ~DBIter() { + RecordTick(statistics_, NO_ITERATORS, -1); + if (!arena_mode_) { + delete iter_; + } else { + iter_->~Iterator(); + } + } + virtual void SetIter(Iterator* iter) { + assert(iter_ == nullptr); + iter_ = iter; + } + virtual bool Valid() const { return valid_; } + virtual Slice key() const { + assert(valid_); + return saved_key_.GetKey(); + } + virtual Slice value() const { + assert(valid_); + return (direction_ == kForward && !current_entry_is_merged_) ? 
+ iter_->value() : saved_value_; + } + virtual Status status() const { + if (status_.ok()) { + return iter_->status(); + } else { + return status_; + } + } + + virtual void Next(); + virtual void Prev(); + virtual void Seek(const Slice& target); + virtual void SeekToFirst(); + virtual void SeekToLast(); + + private: + inline void FindNextUserEntry(bool skipping); + void FindNextUserEntryInternal(bool skipping); + void FindPrevUserEntry(); + bool ParseKey(ParsedInternalKey* key); + void MergeValuesNewToOld(); + + inline void ClearSavedValue() { + if (saved_value_.capacity() > 1048576) { + std::string empty; + swap(empty, saved_value_); + } else { + saved_value_.clear(); + } + } + + bool arena_mode_; + Env* const env_; + Logger* logger_; + const Comparator* const user_comparator_; + const MergeOperator* const user_merge_operator_; + Iterator* iter_; + SequenceNumber const sequence_; + + Status status_; + IterKey saved_key_; // == current key when direction_==kReverse + std::string saved_value_; // == current raw value when direction_==kReverse + Direction direction_; + bool valid_; + bool current_entry_is_merged_; + Statistics* statistics_; + uint64_t max_skip_; + + // No copying allowed + DBIter(const DBIter&); + void operator=(const DBIter&); +}; + +inline bool DBIter::ParseKey(ParsedInternalKey* ikey) { + if (!ParseInternalKey(iter_->key(), ikey)) { + status_ = Status::Corruption("corrupted internal key in DBIter"); + Log(logger_, "corrupted internal key in DBIter: %s", + iter_->key().ToString(true).c_str()); + return false; + } else { + return true; + } +} + +void DBIter::Next() { + assert(valid_); + + if (direction_ == kReverse) { // Switch directions? + direction_ = kForward; + // iter_ is pointing just before the entries for this->key(), + // so advance into the range of entries for this->key() and then + // use the normal skipping code below. + if (!iter_->Valid()) { + iter_->SeekToFirst(); + } else { + iter_->Next(); + } + if (!iter_->Valid()) { + valid_ = false; + saved_key_.Clear(); + return; + } + } + + // If the current value is merged, we might already hit end of iter_ + if (!iter_->Valid()) { + valid_ = false; + return; + } + FindNextUserEntry(true /* skipping the current user key */); +} + + +// PRE: saved_key_ has the current user key if skipping +// POST: saved_key_ should have the next user key if valid_, +// if the current entry is a result of merge +// current_entry_is_merged_ => true +// saved_value_ => the merged value +// +// NOTE: In between, saved_key_ can point to a user key that has +// a delete marker +inline void DBIter::FindNextUserEntry(bool skipping) { + PERF_TIMER_AUTO(find_next_user_entry_time); + FindNextUserEntryInternal(skipping); + PERF_TIMER_STOP(find_next_user_entry_time); +} + +// Actual implementation of DBIter::FindNextUserEntry() +void DBIter::FindNextUserEntryInternal(bool skipping) { + // Loop until we hit an acceptable entry to yield + assert(iter_->Valid()); + assert(direction_ == kForward); + current_entry_is_merged_ = false; + uint64_t num_skipped = 0; + do { + ParsedInternalKey ikey; + if (ParseKey(&ikey) && ikey.sequence <= sequence_) { + if (skipping && + user_comparator_->Compare(ikey.user_key, saved_key_.GetKey()) <= 0) { + num_skipped++; // skip this entry + PERF_COUNTER_ADD(internal_key_skipped_count, 1); + } else { + skipping = false; + switch (ikey.type) { + case kTypeDeletion: + // Arrange to skip all upcoming entries for this key since + // they are hidden by this deletion. 
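+ // For example, given entries (newest first)
+ //   ("k", seq 7, kTypeDeletion), ("k", seq 5, kTypeValue "v1"),
+ // the deletion at seq 7 hides the older put, so "k" yields nothing
+ // (a hypothetical trace, for illustration only).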
+ saved_key_.SetKey(ikey.user_key); + skipping = true; + num_skipped = 0; + PERF_COUNTER_ADD(internal_delete_skipped_count, 1); + break; + case kTypeValue: + valid_ = true; + saved_key_.SetKey(ikey.user_key); + return; + case kTypeMerge: + // By now, we are sure the current ikey is going to yield a value + saved_key_.SetKey(ikey.user_key); + current_entry_is_merged_ = true; + valid_ = true; + MergeValuesNewToOld(); // Go to a different state machine + return; + default: + assert(false); + break; + } + } + } + // If we have sequentially iterated via numerous keys and still not + // found the next user-key, then it is better to seek so that we can + // avoid too many key comparisons. We seek to the last occurrence of + // our current key by looking for sequence number 0. + if (skipping && num_skipped > max_skip_) { + num_skipped = 0; + std::string last_key; + AppendInternalKey(&last_key, ParsedInternalKey(saved_key_.GetKey(), 0, + kValueTypeForSeek)); + iter_->Seek(last_key); + RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION); + } else { + iter_->Next(); + } + } while (iter_->Valid()); + valid_ = false; +} + +// Merge values of the same user key starting from the current iter_ position +// Scan from the newer entries to older entries. +// PRE: iter_->key() points to the first merge type entry +// saved_key_ stores the user key +// POST: saved_value_ has the merged value for the user key +// iter_ points to the next entry (or invalid) +void DBIter::MergeValuesNewToOld() { + if (!user_merge_operator_) { + Log(logger_, "Options::merge_operator is null."); + throw std::logic_error("DBIter::MergeValuesNewToOld() with" + " Options::merge_operator null"); + } + + // Start the merge process by pushing the first operand + std::deque<std::string> operands; + operands.push_front(iter_->value().ToString()); + + std::string merge_result; // Temporary string to hold merge result later + ParsedInternalKey ikey; + for (iter_->Next(); iter_->Valid(); iter_->Next()) { + if (!ParseKey(&ikey)) { + // skip corrupted key + continue; + } + + if (user_comparator_->Compare(ikey.user_key, saved_key_.GetKey()) != 0) { + // hit the next user key, stop right here + break; + } + + if (kTypeDeletion == ikey.type) { + // hit a delete with the same user key, stop right here + // iter_ is positioned after delete + iter_->Next(); + break; + } + + if (kTypeValue == ikey.type) { + // hit a put, merge the put value with operands and store the + // final result in saved_value_. We are done! + // ignore corruption if there is any. + const Slice value = iter_->value(); + user_merge_operator_->FullMerge(ikey.user_key, &value, operands, + &saved_value_, logger_); + // iter_ is positioned after put + iter_->Next(); + return; + } + + if (kTypeMerge == ikey.type) { + // hit a merge, add the value as an operand and run associative merge. + // when complete, add result to operands and continue. + const Slice& value = iter_->value(); + operands.push_front(value.ToString()); + } + } + + // we either exhausted all internal keys under this user key, or hit + // a deletion marker. + // feed null as the existing value to the merge operator, such that + // client can differentiate this scenario and do things accordingly.
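+ // For example, a counter-style merge operator that sums operands would
+ // typically treat the nullptr existing value as zero; that behavior is
+ // defined by the operator, not mandated here.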
+ user_merge_operator_->FullMerge(saved_key_.GetKey(), nullptr, operands, + &saved_value_, logger_); +} + +void DBIter::Prev() { + assert(valid_); + + // Throw an exception now if merge_operator is provided + // TODO: support backward iteration + if (user_merge_operator_) { + Log(logger_, "Prev not supported yet if merge_operator is provided"); + throw std::logic_error("DBIter::Prev backward iteration not supported" + " if merge_operator is provided"); + } + + if (direction_ == kForward) { // Switch directions? + // iter_ is pointing at the current entry. Scan backwards until + // the key changes so we can use the normal reverse scanning code. + assert(iter_->Valid()); // Otherwise valid_ would have been false + saved_key_.SetKey(ExtractUserKey(iter_->key())); + while (true) { + iter_->Prev(); + if (!iter_->Valid()) { + valid_ = false; + saved_key_.Clear(); + ClearSavedValue(); + return; + } + if (user_comparator_->Compare(ExtractUserKey(iter_->key()), + saved_key_.GetKey()) < 0) { + break; + } + } + direction_ = kReverse; + } + + FindPrevUserEntry(); +} + +void DBIter::FindPrevUserEntry() { + assert(direction_ == kReverse); + uint64_t num_skipped = 0; + + ValueType value_type = kTypeDeletion; + bool saved_key_valid = true; + if (iter_->Valid()) { + do { + ParsedInternalKey ikey; + if (ParseKey(&ikey) && ikey.sequence <= sequence_) { + if ((value_type != kTypeDeletion) && + user_comparator_->Compare(ikey.user_key, saved_key_.GetKey()) < 0) { + // We encountered a non-deleted value in entries for previous keys; stop. + break; + } + value_type = ikey.type; + if (value_type == kTypeDeletion) { + saved_key_.Clear(); + ClearSavedValue(); + saved_key_valid = false; + } else { + Slice raw_value = iter_->value(); + if (saved_value_.capacity() > raw_value.size() + 1048576) { + std::string empty; + swap(empty, saved_value_); + } + saved_key_.SetKey(ExtractUserKey(iter_->key())); + saved_value_.assign(raw_value.data(), raw_value.size()); + } + } else { + // In the case of ikey.sequence > sequence_, we might have already + // iterated to a different user key. + saved_key_valid = false; + } + num_skipped++; + // If we have sequentially iterated via numerous keys and still not + // found the prev user-key, then it is better to seek so that we can + // avoid too many key comparisons. We seek to the first occurrence of + // our current key by looking for max sequence number. + if (saved_key_valid && num_skipped > max_skip_) { + num_skipped = 0; + std::string last_key; + AppendInternalKey(&last_key, ParsedInternalKey(saved_key_.GetKey(), + kMaxSequenceNumber, + kValueTypeForSeek)); + iter_->Seek(last_key); + RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION); + } else { + iter_->Prev(); + } + } while (iter_->Valid()); + } + + if (value_type == kTypeDeletion) { + // End + valid_ = false; + saved_key_.Clear(); + ClearSavedValue(); + direction_ = kForward; + } else { + valid_ = true; + } +} + +void DBIter::Seek(const Slice& target) { + saved_key_.Clear(); + // now saved_key_ is used to store the internal key.
+ saved_key_.SetInternalKey(target, sequence_); + PERF_TIMER_AUTO(seek_internal_seek_time); + iter_->Seek(saved_key_.GetKey()); + PERF_TIMER_STOP(seek_internal_seek_time); + if (iter_->Valid()) { + direction_ = kForward; + ClearSavedValue(); + FindNextUserEntry(false /*not skipping */); + } else { + valid_ = false; + } +} + +void DBIter::SeekToFirst() { + direction_ = kForward; + ClearSavedValue(); + PERF_TIMER_AUTO(seek_internal_seek_time); + iter_->SeekToFirst(); + PERF_TIMER_STOP(seek_internal_seek_time); + if (iter_->Valid()) { + FindNextUserEntry(false /* not skipping */); + } else { + valid_ = false; + } +} + +void DBIter::SeekToLast() { + // Throw an exception for now if merge_operator is provided + // TODO: support backward iteration + if (user_merge_operator_) { + Log(logger_, "SeekToLast not supported yet if merge_operator is provided"); + throw std::logic_error("DBIter::SeekToLast: backward iteration not" + " supported if merge_operator is provided"); + } + + direction_ = kReverse; + ClearSavedValue(); + PERF_TIMER_AUTO(seek_internal_seek_time); + iter_->SeekToLast(); + PERF_TIMER_STOP(seek_internal_seek_time); + FindPrevUserEntry(); +} + +Iterator* NewDBIterator(Env* env, const Options& options, + const Comparator* user_key_comparator, + Iterator* internal_iter, + const SequenceNumber& sequence) { + return new DBIter(env, options, user_key_comparator, internal_iter, sequence, + false); +} + +ArenaWrappedDBIter::~ArenaWrappedDBIter() { db_iter_->~DBIter(); } + +void ArenaWrappedDBIter::SetDBIter(DBIter* iter) { db_iter_ = iter; } + +void ArenaWrappedDBIter::SetIterUnderDBIter(Iterator* iter) { + static_cast(db_iter_)->SetIter(iter); +} + +inline bool ArenaWrappedDBIter::Valid() const { return db_iter_->Valid(); } +inline void ArenaWrappedDBIter::SeekToFirst() { db_iter_->SeekToFirst(); } +inline void ArenaWrappedDBIter::SeekToLast() { db_iter_->SeekToLast(); } +inline void ArenaWrappedDBIter::Seek(const Slice& target) { + db_iter_->Seek(target); +} +inline void ArenaWrappedDBIter::Next() { db_iter_->Next(); } +inline void ArenaWrappedDBIter::Prev() { db_iter_->Prev(); } +inline Slice ArenaWrappedDBIter::key() const { return db_iter_->key(); } +inline Slice ArenaWrappedDBIter::value() const { return db_iter_->value(); } +inline Status ArenaWrappedDBIter::status() const { return db_iter_->status(); } +void ArenaWrappedDBIter::RegisterCleanup(CleanupFunction function, void* arg1, + void* arg2) { + db_iter_->RegisterCleanup(function, arg1, arg2); +} + +ArenaWrappedDBIter* NewArenaWrappedDbIterator( + Env* env, const Options& options, const Comparator* user_key_comparator, + const SequenceNumber& sequence) { + ArenaWrappedDBIter* iter = new ArenaWrappedDBIter(); + Arena* arena = iter->GetArena(); + auto mem = arena->AllocateAligned(sizeof(DBIter)); + DBIter* db_iter = new (mem) + DBIter(env, options, user_key_comparator, nullptr, sequence, true); + iter->SetDBIter(db_iter); + return iter; +} + +} // namespace rocksdb diff --git a/db/db_iter.h b/db/db_iter.h new file mode 100644 index 0000000000..cb9840324f --- /dev/null +++ b/db/db_iter.h @@ -0,0 +1,73 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include +#include "rocksdb/db.h" +#include "db/dbformat.h" +#include "util/arena.h" +#include "util/autovector.h" + +namespace rocksdb { + +class Arena; +class DBIter; + +// Return a new iterator that converts internal keys (yielded by +// "*internal_iter") that were live at the specified "sequence" number +// into appropriate user keys. +extern Iterator* NewDBIterator( + Env* env, + const Options& options, + const Comparator *user_key_comparator, + Iterator* internal_iter, + const SequenceNumber& sequence); + +// A wrapper iterator which wraps DB Iterator and the arena, with which the DB +// iterator is supposed to be allocated. This class is used as an entry point of +// an iterator hierarchy whose memory can be allocated inline. In that way, +// accessing the iterator tree can be more cache friendly. It is also faster +// to allocate. +class ArenaWrappedDBIter : public Iterator { + public: + virtual ~ArenaWrappedDBIter(); + + // Get the arena to be used to allocate memory for DBIter to be wrapped, + // as well as child iterators in it. + virtual Arena* GetArena() { return &arena_; } + + // Set the DB Iterator to be wrapped + virtual void SetDBIter(DBIter* iter); + + // Set the internal iterator wrapped inside the DB Iterator. Usually it is + // a merging iterator. + virtual void SetIterUnderDBIter(Iterator* iter); + virtual bool Valid() const override; + virtual void SeekToFirst() override; + virtual void SeekToLast() override; + virtual void Seek(const Slice& target) override; + virtual void Next() override; + virtual void Prev() override; + virtual Slice key() const override; + virtual Slice value() const override; + virtual Status status() const override; + void RegisterCleanup(CleanupFunction function, void* arg1, void* arg2); + + private: + DBIter* db_iter_; + Arena arena_; +}; + +// Generate the arena wrapped iterator class. +extern ArenaWrappedDBIter* NewArenaWrappedDbIterator( + Env* env, const Options& options, const Comparator* user_key_comparator, + const SequenceNumber& sequence); + +} // namespace rocksdb diff --git a/db/db_stats_logger.cc b/db/db_stats_logger.cc new file mode 100644 index 0000000000..288e1bf80d --- /dev/null +++ b/db/db_stats_logger.cc @@ -0,0 +1,95 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/db_impl.h" +#include +#include +#include +#include "db/version_set.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "port/port.h" +#include "util/mutexlock.h" + +namespace rocksdb { + +void DBImpl::MaybeScheduleLogDBDeployStats() { +// we did say maybe +#ifndef ROCKSDB_LITE + // There is a lock in the actual logger.
+ if (!logger_ || options_.db_stats_log_interval < 0 + || host_name_.empty()) { + return; + } + + if (bg_logstats_scheduled_ || shutting_down_.Acquire_Load()) { + // Already scheduled + } else { + int64_t current_ts = 0; + Status st = env_->GetCurrentTime(&current_ts); + if (!st.ok()) { + return; + } + if ((current_ts - last_log_ts) < options_.db_stats_log_interval) { + return; + } + last_log_ts = current_ts; + bg_logstats_scheduled_ = true; + env_->Schedule(&DBImpl::BGLogDBDeployStats, this); + } +} + +void DBImpl::BGLogDBDeployStats(void* db) { + DBImpl* db_inst = reinterpret_cast<DBImpl*>(db); + db_inst->LogDBDeployStats(); +} + +void DBImpl::LogDBDeployStats() { + mutex_.Lock(); + + if (shutting_down_.Acquire_Load()) { + bg_logstats_scheduled_ = false; + bg_cv_.SignalAll(); + mutex_.Unlock(); + return; + } + + char tmp_ver[100]; + sprintf(tmp_ver, "%d.%d", kMajorVersion, kMinorVersion); + std::string version_info(tmp_ver); + + uint64_t file_total_size = 0; + uint32_t file_total_num = 0; + Version* current = default_cf_handle_->cfd()->current(); + for (int i = 0; i < current->NumberLevels(); i++) { + file_total_num += current->NumLevelFiles(i); + file_total_size += current->NumLevelBytes(i); + } + + Version::LevelSummaryStorage scratch; + const char* file_num_summary = current->LevelSummary(&scratch); + std::string file_num_per_level(file_num_summary); + std::string data_size_per_level(file_num_summary); + + mutex_.Unlock(); + + int64_t unix_ts; + env_->GetCurrentTime(&unix_ts); + + logger_->Log_Deploy_Stats(version_info, host_name_, + db_absolute_path_, file_total_size, file_total_num, file_num_per_level, + data_size_per_level, unix_ts); + + mutex_.Lock(); + bg_logstats_scheduled_ = false; + bg_cv_.SignalAll(); + mutex_.Unlock(); +#endif +} +} diff --git a/db/db_test.cc b/db/db_test.cc new file mode 100644 index 0000000000..5e30b33f7e --- /dev/null +++ b/db/db_test.cc @@ -0,0 +1,6852 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <algorithm>
+#include <string>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "db/db_impl.h"
+#include "db/filename.h"
+#include "db/version_set.h"
+#include "db/write_batch_internal.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/perf_context.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/table.h"
+#include "rocksdb/table_properties.h"
+#include "table/block_based_table_factory.h"
+#include "table/plain_table_factory.h"
+#include "util/hash.h"
+#include "util/hash_linklist_rep.h"
+#include "utilities/merge_operators.h"
+#include "util/logging.h"
+#include "util/mutexlock.h"
+#include "util/statistics.h"
+#include "util/testharness.h"
+#include "util/sync_point.h"
+#include "util/testutil.h"
+
+namespace rocksdb {
+
+static bool SnappyCompressionSupported(const CompressionOptions& options) {
+  std::string out;
+  Slice in = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
+  return port::Snappy_Compress(options, in.data(), in.size(), &out);
+}
+
+static bool ZlibCompressionSupported(const CompressionOptions& options) {
+  std::string out;
+  Slice in = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
+  return port::Zlib_Compress(options, in.data(), in.size(), &out);
+}
+
+static bool BZip2CompressionSupported(const CompressionOptions& options) {
+  std::string out;
+  Slice in = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
+  return port::BZip2_Compress(options, in.data(), in.size(), &out);
+}
+
+static bool LZ4CompressionSupported(const CompressionOptions& options) {
+  std::string out;
+  Slice in = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
+  return port::LZ4_Compress(options, in.data(), in.size(), &out);
+}
+
+static bool LZ4HCCompressionSupported(const CompressionOptions& options) {
+  std::string out;
+  Slice in = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
+  return port::LZ4HC_Compress(options, in.data(), in.size(), &out);
+}
+
+static std::string RandomString(Random* rnd, int len) {
+  std::string r;
+  test::RandomString(rnd, len, &r);
+  return r;
+}
+
+namespace anon {
+class AtomicCounter {
+ private:
+  port::Mutex mu_;
+  int count_;
+ public:
+  AtomicCounter() : count_(0) { }
+  void Increment() {
+    MutexLock l(&mu_);
+    count_++;
+  }
+  int Read() {
+    MutexLock l(&mu_);
+    return count_;
+  }
+  void Reset() {
+    MutexLock l(&mu_);
+    count_ = 0;
+  }
+};
+
+}  // namespace anon
+
+// Special Env used to delay background operations
+class SpecialEnv : public EnvWrapper {
+ public:
+  // sstable Sync() calls are blocked while this pointer is non-nullptr.
+  port::AtomicPointer delay_sstable_sync_;
+
+  // Simulate no-space errors while this pointer is non-nullptr.
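+  // (While it is set, sstable Append() calls still return OK but the bytes
+  // are silently dropped, mimicking a full disk that keeps accepting
+  // writes.)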
+  port::AtomicPointer no_space_;
+
+  // Simulate non-writable file system while this pointer is non-nullptr
+  port::AtomicPointer non_writable_;
+
+  // Force sync of manifest files to fail while this pointer is non-nullptr
+  port::AtomicPointer manifest_sync_error_;
+
+  // Force write to manifest files to fail while this pointer is non-nullptr
+  port::AtomicPointer manifest_write_error_;
+
+  // Force write to log files to fail while this pointer is non-nullptr
+  port::AtomicPointer log_write_error_;
+
+  bool count_random_reads_;
+  anon::AtomicCounter random_read_counter_;
+
+  bool count_sequential_reads_;
+  anon::AtomicCounter sequential_read_counter_;
+
+  anon::AtomicCounter sleep_counter_;
+
+  explicit SpecialEnv(Env* base) : EnvWrapper(base) {
+    delay_sstable_sync_.Release_Store(nullptr);
+    no_space_.Release_Store(nullptr);
+    non_writable_.Release_Store(nullptr);
+    count_random_reads_ = false;
+    count_sequential_reads_ = false;
+    manifest_sync_error_.Release_Store(nullptr);
+    manifest_write_error_.Release_Store(nullptr);
+    log_write_error_.Release_Store(nullptr);
+  }
+
+  Status NewWritableFile(const std::string& f, unique_ptr<WritableFile>* r,
+                         const EnvOptions& soptions) {
+    class SSTableFile : public WritableFile {
+     private:
+      SpecialEnv* env_;
+      unique_ptr<WritableFile> base_;
+
+     public:
+      SSTableFile(SpecialEnv* env, unique_ptr<WritableFile>&& base)
+          : env_(env),
+            base_(std::move(base)) {
+      }
+      Status Append(const Slice& data) {
+        if (env_->no_space_.Acquire_Load() != nullptr) {
+          // Drop writes on the floor
+          return Status::OK();
+        } else {
+          return base_->Append(data);
+        }
+      }
+      Status Close() { return base_->Close(); }
+      Status Flush() { return base_->Flush(); }
+      Status Sync() {
+        while (env_->delay_sstable_sync_.Acquire_Load() != nullptr) {
+          env_->SleepForMicroseconds(100000);
+        }
+        return base_->Sync();
+      }
+    };
+    class ManifestFile : public WritableFile {
+     private:
+      SpecialEnv* env_;
+      unique_ptr<WritableFile> base_;
+     public:
+      ManifestFile(SpecialEnv* env, unique_ptr<WritableFile>&& b)
+          : env_(env), base_(std::move(b)) { }
+      Status Append(const Slice& data) {
+        if (env_->manifest_write_error_.Acquire_Load() != nullptr) {
+          return Status::IOError("simulated writer error");
+        } else {
+          return base_->Append(data);
+        }
+      }
+      Status Close() { return base_->Close(); }
+      Status Flush() { return base_->Flush(); }
+      Status Sync() {
+        if (env_->manifest_sync_error_.Acquire_Load() != nullptr) {
+          return Status::IOError("simulated sync error");
+        } else {
+          return base_->Sync();
+        }
+      }
+    };
+    class LogFile : public WritableFile {
+     private:
+      SpecialEnv* env_;
+      unique_ptr<WritableFile> base_;
+     public:
+      LogFile(SpecialEnv* env, unique_ptr<WritableFile>&& b)
+          : env_(env), base_(std::move(b)) { }
+      Status Append(const Slice& data) {
+        if (env_->log_write_error_.Acquire_Load() != nullptr) {
+          return Status::IOError("simulated writer error");
+        } else {
+          return base_->Append(data);
+        }
+      }
+      Status Close() { return base_->Close(); }
+      Status Flush() { return base_->Flush(); }
+      Status Sync() { return base_->Sync(); }
+    };
+
+    if (non_writable_.Acquire_Load() != nullptr) {
+      return Status::IOError("simulated write error");
+    }
+
+    Status s = target()->NewWritableFile(f, r, soptions);
+    if (s.ok()) {
+      if (strstr(f.c_str(), ".sst") != nullptr) {
+        r->reset(new SSTableFile(this, std::move(*r)));
+      } else if (strstr(f.c_str(), "MANIFEST") != nullptr) {
+        r->reset(new ManifestFile(this, std::move(*r)));
+      } else if (strstr(f.c_str(), "log") != nullptr) {
+        r->reset(new LogFile(this, std::move(*r)));
+      }
+    }
+    return s;
+  }
+
+  Status NewRandomAccessFile(const std::string& f,
+                             unique_ptr<RandomAccessFile>* r,
+                             const EnvOptions& soptions) {
+    class CountingFile : public RandomAccessFile {
+     private:
+      unique_ptr<RandomAccessFile> target_;
+      anon::AtomicCounter* counter_;
+     public:
+      CountingFile(unique_ptr<RandomAccessFile>&& target,
+                   anon::AtomicCounter* counter)
+          : target_(std::move(target)), counter_(counter) {
+      }
+      virtual Status Read(uint64_t offset, size_t n, Slice* result,
+                          char* scratch) const {
+        counter_->Increment();
+        return target_->Read(offset, n, result, scratch);
+      }
+    };
+
+    Status s = target()->NewRandomAccessFile(f, r, soptions);
+    if (s.ok() && count_random_reads_) {
+      r->reset(new CountingFile(std::move(*r), &random_read_counter_));
+    }
+    return s;
+  }
+
+  Status NewSequentialFile(const std::string& f,
+                           unique_ptr<SequentialFile>* r,
+                           const EnvOptions& soptions) {
+    class CountingFile : public SequentialFile {
+     private:
+      unique_ptr<SequentialFile> target_;
+      anon::AtomicCounter* counter_;
+
+     public:
+      CountingFile(unique_ptr<SequentialFile>&& target,
+                   anon::AtomicCounter* counter)
+          : target_(std::move(target)), counter_(counter) {}
+      virtual Status Read(size_t n, Slice* result, char* scratch) {
+        counter_->Increment();
+        return target_->Read(n, result, scratch);
+      }
+      virtual Status Skip(uint64_t n) { return target_->Skip(n); }
+    };
+
+    Status s = target()->NewSequentialFile(f, r, soptions);
+    if (s.ok() && count_sequential_reads_) {
+      r->reset(new CountingFile(std::move(*r), &sequential_read_counter_));
+    }
+    return s;
+  }
+
+  virtual void SleepForMicroseconds(int micros) {
+    sleep_counter_.Increment();
+    target()->SleepForMicroseconds(micros);
+  }
+};
+
+class DBTest {
+ private:
+  const FilterPolicy* filter_policy_;
+
+ protected:
+  // Sequence of option configurations to try
+  enum OptionConfig {
+    kBlockBasedTableWithWholeKeyHashIndex,
+    kDefault,
+    kBlockBasedTableWithPrefixHashIndex,
+    kPlainTableFirstBytePrefix,
+    kPlainTableAllBytesPrefix,
+    kVectorRep,
+    kHashLinkList,
+    kHashCuckoo,
+    kMergePut,
+    kFilter,
+    kUncompressed,
+    kNumLevel_3,
+    kDBLogDir,
+    kWalDir,
+    kManifestFileSize,
+    kCompactOnFlush,
+    kPerfOptions,
+    kDeletesFilterFirst,
+    kHashSkipList,
+    kUniversalCompaction,
+    kCompressedBlockCache,
+    kInfiniteMaxOpenFiles,
+    kxxHashChecksum,
+    kFIFOCompaction,
+    kEnd
+  };
+  int option_config_;
+
+ public:
+  std::string dbname_;
+  SpecialEnv* env_;
+  DB* db_;
+  std::vector<ColumnFamilyHandle*> handles_;
+
+  Options last_options_;
+
+  // Skip some options, as they may not be applicable to a specific test.
+  // To add more skip constants, use values 4, 8, 16, etc.
+  enum OptionSkip {
+    kNoSkip = 0,
+    kSkipDeletesFilterFirst = 1,
+    kSkipUniversalCompaction = 2,
+    kSkipMergePut = 4,
+    kSkipPlainTable = 8,
+    kSkipHashIndex = 16,
+    kSkipNoSeekToLast = 32,
+    kSkipHashCuckoo = 64,
+    kSkipFIFOCompaction = 128,
+  };
+
+  DBTest() : option_config_(kDefault),
+             env_(new SpecialEnv(Env::Default())) {
+    filter_policy_ = NewBloomFilterPolicy(10);
+    dbname_ = test::TmpDir() + "/db_test";
+    ASSERT_OK(DestroyDB(dbname_, Options()));
+    db_ = nullptr;
+    Reopen();
+  }
+
+  ~DBTest() {
+    Close();
+    ASSERT_OK(DestroyDB(dbname_, Options()));
+    delete env_;
+    delete filter_policy_;
+  }
+
+  // Switch to a fresh database with the next option configuration to
+  // test. Return false if there are no more configurations to test.
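+  // Callers OR together OptionSkip flags to leave out configurations a test
+  // cannot run under, e.g. ChangeOptions(kSkipMergePut | kSkipPlainTable).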
+  bool ChangeOptions(int skip_mask = kNoSkip) {
+    for (option_config_++; option_config_ < kEnd; option_config_++) {
+      if ((skip_mask & kSkipDeletesFilterFirst) &&
+          option_config_ == kDeletesFilterFirst) {
+        continue;
+      }
+      if ((skip_mask & kSkipUniversalCompaction) &&
+          option_config_ == kUniversalCompaction) {
+        continue;
+      }
+      if ((skip_mask & kSkipMergePut) && option_config_ == kMergePut) {
+        continue;
+      }
+      if ((skip_mask & kSkipNoSeekToLast) &&
+          (option_config_ == kHashLinkList ||
+           option_config_ == kHashSkipList)) {
+        continue;
+      }
+      if ((skip_mask & kSkipPlainTable)
+          && (option_config_ == kPlainTableAllBytesPrefix
+              || option_config_ == kPlainTableFirstBytePrefix)) {
+        continue;
+      }
+      if ((skip_mask & kSkipHashIndex) &&
+          (option_config_ == kBlockBasedTableWithPrefixHashIndex ||
+           option_config_ == kBlockBasedTableWithWholeKeyHashIndex)) {
+        continue;
+      }
+      if ((skip_mask & kSkipHashCuckoo) && (option_config_ == kHashCuckoo)) {
+        continue;
+      }
+      if ((skip_mask & kSkipFIFOCompaction) &&
+          option_config_ == kFIFOCompaction) {
+        continue;
+      }
+      break;
+    }
+
+    if (option_config_ >= kEnd) {
+      Destroy(&last_options_);
+      return false;
+    } else {
+      DestroyAndReopen();
+      return true;
+    }
+  }
+
+  // Switch between different compaction styles (we have only 2 now).
+  bool ChangeCompactOptions(Options* prev_options = nullptr) {
+    if (option_config_ == kDefault) {
+      option_config_ = kUniversalCompaction;
+      if (prev_options == nullptr) {
+        prev_options = &last_options_;
+      }
+      Destroy(prev_options);
+      TryReopen();
+      return true;
+    } else {
+      return false;
+    }
+  }
+
+  // Return the current option configuration.
+  Options CurrentOptions() {
+    Options options;
+    return CurrentOptions(options);
+  }
+
+  Options CurrentOptions(const Options& defaultOptions) {
+    // this redundant copy is to minimize code change w/o causing a lint error
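+    // The switch below layers the per-configuration tweaks for
+    // option_config_ on top of the caller-supplied defaults.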
+    Options options = defaultOptions;
+    switch (option_config_) {
+      case kHashSkipList:
+        options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+        options.memtable_factory.reset(NewHashSkipListRepFactory());
+        break;
+      case kPlainTableFirstBytePrefix:
+        options.table_factory.reset(new PlainTableFactory());
+        options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+        options.allow_mmap_reads = true;
+        options.max_sequential_skip_in_iterations = 999999;
+        break;
+      case kPlainTableAllBytesPrefix:
+        options.table_factory.reset(new PlainTableFactory());
+        options.prefix_extractor.reset(NewNoopTransform());
+        options.allow_mmap_reads = true;
+        options.max_sequential_skip_in_iterations = 999999;
+        break;
+      case kMergePut:
+        options.merge_operator = MergeOperators::CreatePutOperator();
+        break;
+      case kFilter:
+        options.filter_policy = filter_policy_;
+        break;
+      case kUncompressed:
+        options.compression = kNoCompression;
+        break;
+      case kNumLevel_3:
+        options.num_levels = 3;
+        break;
+      case kDBLogDir:
+        options.db_log_dir = test::TmpDir();
+        break;
+      case kWalDir:
+        options.wal_dir = "/tmp/wal";
+        break;
+      case kManifestFileSize:
+        options.max_manifest_file_size = 50;  // 50 bytes
+        break;
+      case kCompactOnFlush:
+        options.purge_redundant_kvs_while_flush =
+            !options.purge_redundant_kvs_while_flush;
+        break;
+      case kPerfOptions:
+        options.hard_rate_limit = 2.0;
+        options.rate_limit_delay_max_milliseconds = 2;
+        // TODO -- test more options
+        break;
+      case kDeletesFilterFirst:
+        options.filter_deletes = true;
+        break;
+      case kVectorRep:
+        options.memtable_factory.reset(new VectorRepFactory(100));
+        break;
+      case kHashLinkList:
+        options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+        options.memtable_factory.reset(NewHashLinkListRepFactory(4, 0));
+        break;
+      case kHashCuckoo:
+        options.memtable_factory.reset(
+            NewHashCuckooRepFactory(options.write_buffer_size));
+        break;
+      case kUniversalCompaction:
+        options.compaction_style = kCompactionStyleUniversal;
+        break;
+      case kCompressedBlockCache:
+        options.allow_mmap_writes = true;
+        options.block_cache_compressed = NewLRUCache(8 * 1024 * 1024);
+        break;
+      case kInfiniteMaxOpenFiles:
+        options.max_open_files = -1;
+        break;
+      case kxxHashChecksum: {
+        BlockBasedTableOptions table_options;
+        table_options.checksum = kxxHash;
+        options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+        break;
+      }
+      case kFIFOCompaction: {
+        options.compaction_style = kCompactionStyleFIFO;
+        break;
+      }
+      case kBlockBasedTableWithPrefixHashIndex: {
+        BlockBasedTableOptions table_options;
+        table_options.index_type = BlockBasedTableOptions::kHashSearch;
+        options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+        options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+        break;
+      }
+      case kBlockBasedTableWithWholeKeyHashIndex: {
+        BlockBasedTableOptions table_options;
+        table_options.index_type = BlockBasedTableOptions::kHashSearch;
+        options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+        options.prefix_extractor.reset(NewNoopTransform());
+        break;
+      }
+      default:
+        break;
+    }
+    return options;
+  }
+
+  DBImpl* dbfull() {
+    return reinterpret_cast<DBImpl*>(db_);
+  }
+
+  void CreateColumnFamilies(const std::vector<std::string>& cfs,
+                            const ColumnFamilyOptions* options = nullptr) {
+    ColumnFamilyOptions cf_opts;
+    if (options != nullptr) {
+      cf_opts = ColumnFamilyOptions(*options);
+    } else {
+      cf_opts = ColumnFamilyOptions(CurrentOptions());
+    }
+    int cfi = handles_.size();
+    handles_.resize(cfi + cfs.size());
+    for (auto cf : cfs) {
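+      // Create each requested family and record its handle in handles_.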
+      ASSERT_OK(db_->CreateColumnFamily(cf_opts, cf, &handles_[cfi++]));
+    }
+  }
+
+  void CreateAndReopenWithCF(const std::vector<std::string>& cfs,
+                             const Options* options = nullptr) {
+    CreateColumnFamilies(cfs, options);
+    std::vector<std::string> cfs_plus_default = cfs;
+    cfs_plus_default.insert(cfs_plus_default.begin(),
+                            kDefaultColumnFamilyName);
+    ReopenWithColumnFamilies(cfs_plus_default, options);
+  }
+
+  void ReopenWithColumnFamilies(const std::vector<std::string>& cfs,
+                                const std::vector<const Options*>& options) {
+    ASSERT_OK(TryReopenWithColumnFamilies(cfs, options));
+  }
+
+  void ReopenWithColumnFamilies(const std::vector<std::string>& cfs,
+                                const Options* options = nullptr) {
+    ASSERT_OK(TryReopenWithColumnFamilies(cfs, options));
+  }
+
+  Status TryReopenWithColumnFamilies(
+      const std::vector<std::string>& cfs,
+      const std::vector<const Options*>& options) {
+    Close();
+    ASSERT_EQ(cfs.size(), options.size());
+    std::vector<ColumnFamilyDescriptor> column_families;
+    for (size_t i = 0; i < cfs.size(); ++i) {
+      column_families.push_back(ColumnFamilyDescriptor(cfs[i], *options[i]));
+    }
+    DBOptions db_opts = DBOptions(*options[0]);
+    return DB::Open(db_opts, dbname_, column_families, &handles_, &db_);
+  }
+
+  Status TryReopenWithColumnFamilies(const std::vector<std::string>& cfs,
+                                     const Options* options = nullptr) {
+    Close();
+    Options opts = (options == nullptr) ? CurrentOptions() : *options;
+    std::vector<const Options*> v_opts(cfs.size(), &opts);
+    return TryReopenWithColumnFamilies(cfs, v_opts);
+  }
+
+  void Reopen(Options* options = nullptr) {
+    ASSERT_OK(TryReopen(options));
+  }
+
+  void Close() {
+    for (auto h : handles_) {
+      delete h;
+    }
+    handles_.clear();
+    delete db_;
+    db_ = nullptr;
+  }
+
+  void DestroyAndReopen(Options* options = nullptr) {
+    // Destroy using last options
+    Destroy(&last_options_);
+    ASSERT_OK(TryReopen(options));
+  }
+
+  void Destroy(Options* options) {
+    Close();
+    ASSERT_OK(DestroyDB(dbname_, *options));
+  }
+
+  Status ReadOnlyReopen(Options* options) {
+    return DB::OpenForReadOnly(*options, dbname_, &db_);
+  }
+
+  Status TryReopen(Options* options = nullptr) {
+    Close();
+    Options opts;
+    if (options != nullptr) {
+      opts = *options;
+    } else {
+      opts = CurrentOptions();
+      opts.create_if_missing = true;
+    }
+    last_options_ = opts;
+
+    return DB::Open(opts, dbname_, &db_);
+  }
+
+  Status Flush(int cf = 0) {
+    if (cf == 0) {
+      return db_->Flush(FlushOptions());
+    } else {
+      return db_->Flush(FlushOptions(), handles_[cf]);
+    }
+  }
+
+  Status Put(const Slice& k, const Slice& v,
+             WriteOptions wo = WriteOptions()) {
+    if (kMergePut == option_config_) {
+      return db_->Merge(wo, k, v);
+    } else {
+      return db_->Put(wo, k, v);
+    }
+  }
+
+  Status Put(int cf, const Slice& k, const Slice& v,
+             WriteOptions wo = WriteOptions()) {
+    if (kMergePut == option_config_) {
+      return db_->Merge(wo, handles_[cf], k, v);
+    } else {
+      return db_->Put(wo, handles_[cf], k, v);
+    }
+  }
+
+  Status Delete(const std::string& k) {
+    return db_->Delete(WriteOptions(), k);
+  }
+
+  Status Delete(int cf, const std::string& k) {
+    return db_->Delete(WriteOptions(), handles_[cf], k);
+  }
+
+  std::string Get(const std::string& k, const Snapshot* snapshot = nullptr) {
+    ReadOptions options;
+    options.verify_checksums = true;
+    options.snapshot = snapshot;
+    std::string result;
+    Status s = db_->Get(options, k, &result);
+    if (s.IsNotFound()) {
+      result = "NOT_FOUND";
+    } else if (!s.ok()) {
+      result = s.ToString();
+    }
+    return result;
+  }
+
+  std::string Get(int cf, const std::string& k,
+                  const Snapshot* snapshot = nullptr) {
+    ReadOptions options;
+    options.verify_checksums = true;
+    options.snapshot = snapshot;
+    std::string result;
+    Status s = db_->Get(options, handles_[cf], k, &result);
+    if (s.IsNotFound()) {
+      result = "NOT_FOUND";
+    } else if (!s.ok()) {
+      result = s.ToString();
+    }
+    return result;
+  }
+
+  // Return a string that contains all key,value pairs in order,
+  // formatted like "(k1->v1)(k2->v2)".
+  std::string Contents(int cf = 0) {
+    std::vector<std::string> forward;
+    std::string result;
+    Iterator* iter = (cf == 0) ? db_->NewIterator(ReadOptions())
+                               : db_->NewIterator(ReadOptions(), handles_[cf]);
+    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+      std::string s = IterStatus(iter);
+      result.push_back('(');
+      result.append(s);
+      result.push_back(')');
+      forward.push_back(s);
+    }
+
+    // Check reverse iteration results are the reverse of forward results
+    unsigned int matched = 0;
+    for (iter->SeekToLast(); iter->Valid(); iter->Prev()) {
+      ASSERT_LT(matched, forward.size());
+      ASSERT_EQ(IterStatus(iter), forward[forward.size() - matched - 1]);
+      matched++;
+    }
+    ASSERT_EQ(matched, forward.size());
+
+    delete iter;
+    return result;
+  }
+
+  std::string AllEntriesFor(const Slice& user_key, int cf = 0) {
+    Iterator* iter;
+    if (cf == 0) {
+      iter = dbfull()->TEST_NewInternalIterator();
+    } else {
+      iter = dbfull()->TEST_NewInternalIterator(handles_[cf]);
+    }
+    InternalKey target(user_key, kMaxSequenceNumber, kTypeValue);
+    iter->Seek(target.Encode());
+    std::string result;
+    if (!iter->status().ok()) {
+      result = iter->status().ToString();
+    } else {
+      result = "[ ";
+      bool first = true;
+      while (iter->Valid()) {
+        ParsedInternalKey ikey(Slice(), 0, kTypeValue);
+        if (!ParseInternalKey(iter->key(), &ikey)) {
+          result += "CORRUPTED";
+        } else {
+          if (last_options_.comparator->Compare(ikey.user_key, user_key) !=
+              0) {
+            break;
+          }
+          if (!first) {
+            result += ", ";
+          }
+          first = false;
+          switch (ikey.type) {
+            case kTypeValue:
+              result += iter->value().ToString();
+              break;
+            case kTypeMerge:
+              // keep it the same as kTypeValue for testing kMergePut
+              result += iter->value().ToString();
+              break;
+            case kTypeDeletion:
+              result += "DEL";
+              break;
+            default:
+              assert(false);
+              break;
+          }
+        }
+        iter->Next();
+      }
+      if (!first) {
+        result += " ";
+      }
+      result += "]";
+    }
+    delete iter;
+    return result;
+  }
+
+  int NumTableFilesAtLevel(int level, int cf = 0) {
+    std::string property;
+    if (cf == 0) {
+      // default cfd
+      ASSERT_TRUE(db_->GetProperty(
+          "rocksdb.num-files-at-level" + NumberToString(level), &property));
+    } else {
+      ASSERT_TRUE(db_->GetProperty(
+          handles_[cf], "rocksdb.num-files-at-level" + NumberToString(level),
+          &property));
+    }
+    return atoi(property.c_str());
+  }
+
+  int TotalTableFiles(int cf = 0, int levels = -1) {
+    if (levels == -1) {
+      levels = CurrentOptions().num_levels;
+    }
+    int result = 0;
+    for (int level = 0; level < levels; level++) {
+      result += NumTableFilesAtLevel(level, cf);
+    }
+    return result;
+  }
+
+  // Return spread of files per level
+  std::string FilesPerLevel(int cf = 0) {
+    int num_levels =
+        (cf == 0) ? db_->NumberLevels() : db_->NumberLevels(handles_[1]);
+    std::string result;
+    int last_non_zero_offset = 0;
+    for (int level = 0; level < num_levels; level++) {
+      int f = NumTableFilesAtLevel(level, cf);
+      char buf[100];
+      snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f);
"," : ""), f); + result += buf; + if (f > 0) { + last_non_zero_offset = result.size(); + } + } + result.resize(last_non_zero_offset); + return result; + } + + int CountFiles() { + std::vector files; + env_->GetChildren(dbname_, &files); + + std::vector logfiles; + if (dbname_ != last_options_.wal_dir) { + env_->GetChildren(last_options_.wal_dir, &logfiles); + } + + return static_cast(files.size() + logfiles.size()); + } + + int CountLiveFiles() { + std::vector metadata; + db_->GetLiveFilesMetaData(&metadata); + return metadata.size(); + } + + uint64_t Size(const Slice& start, const Slice& limit, int cf = 0) { + Range r(start, limit); + uint64_t size; + if (cf == 0) { + db_->GetApproximateSizes(&r, 1, &size); + } else { + db_->GetApproximateSizes(handles_[1], &r, 1, &size); + } + return size; + } + + void Compact(int cf, const Slice& start, const Slice& limit) { + ASSERT_OK(db_->CompactRange(handles_[cf], &start, &limit)); + } + + void Compact(const Slice& start, const Slice& limit) { + ASSERT_OK(db_->CompactRange(&start, &limit)); + } + + // Do n memtable compactions, each of which produces an sstable + // covering the range [small,large]. + void MakeTables(int n, const std::string& small, const std::string& large, + int cf = 0) { + for (int i = 0; i < n; i++) { + ASSERT_OK(Put(cf, small, "begin")); + ASSERT_OK(Put(cf, large, "end")); + ASSERT_OK(Flush(cf)); + } + } + + // Prevent pushing of new sstables into deeper levels by adding + // tables that cover a specified range to all levels. + void FillLevels(const std::string& smallest, const std::string& largest, + int cf) { + MakeTables(db_->NumberLevels(handles_[cf]), smallest, largest, cf); + } + + void DumpFileCounts(const char* label) { + fprintf(stderr, "---\n%s:\n", label); + fprintf(stderr, "maxoverlap: %lld\n", + static_cast( + dbfull()->TEST_MaxNextLevelOverlappingBytes())); + for (int level = 0; level < db_->NumberLevels(); level++) { + int num = NumTableFilesAtLevel(level); + if (num > 0) { + fprintf(stderr, " level %3d : %d files\n", level, num); + } + } + } + + std::string DumpSSTableList() { + std::string property; + db_->GetProperty("rocksdb.sstables", &property); + return property; + } + + std::string IterStatus(Iterator* iter) { + std::string result; + if (iter->Valid()) { + result = iter->key().ToString() + "->" + iter->value().ToString(); + } else { + result = "(invalid)"; + } + return result; + } + + Options OptionsForLogIterTest() { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.WAL_ttl_seconds = 1000; + return options; + } + + std::unique_ptr OpenTransactionLogIter( + const SequenceNumber seq) { + unique_ptr iter; + Status status = dbfull()->GetUpdatesSince(seq, &iter); + ASSERT_OK(status); + ASSERT_TRUE(iter->Valid()); + return std::move(iter); + } + + std::string DummyString(size_t len, char c = 'a') { + return std::string(len, c); + } + + void VerifyIterLast(std::string expected_key, int cf = 0) { + Iterator* iter; + ReadOptions ro; + if (cf == 0) { + iter = db_->NewIterator(ro); + } else { + iter = db_->NewIterator(ro, handles_[cf]); + } + iter->SeekToLast(); + ASSERT_EQ(IterStatus(iter), expected_key); + delete iter; + } + + // Used to test InplaceUpdate + + // If previous value is nullptr or delta is > than previous value, + // sets newValue with delta + // If previous value is not empty, + // updates previous value with 'b' string of previous value size - 1. 
+  static UpdateStatus
+  updateInPlaceSmallerSize(char* prevValue, uint32_t* prevSize,
+                           Slice delta, std::string* newValue) {
+    if (prevValue == nullptr) {
+      *newValue = std::string(delta.size(), 'c');
+      return UpdateStatus::UPDATED;
+    } else {
+      *prevSize = *prevSize - 1;
+      std::string str_b = std::string(*prevSize, 'b');
+      memcpy(prevValue, str_b.c_str(), str_b.size());
+      return UpdateStatus::UPDATED_INPLACE;
+    }
+  }
+
+  static UpdateStatus
+  updateInPlaceSmallerVarintSize(char* prevValue, uint32_t* prevSize,
+                                 Slice delta, std::string* newValue) {
+    if (prevValue == nullptr) {
+      *newValue = std::string(delta.size(), 'c');
+      return UpdateStatus::UPDATED;
+    } else {
+      *prevSize = 1;
+      std::string str_b = std::string(*prevSize, 'b');
+      memcpy(prevValue, str_b.c_str(), str_b.size());
+      return UpdateStatus::UPDATED_INPLACE;
+    }
+  }
+
+  static UpdateStatus
+  updateInPlaceLargerSize(char* prevValue, uint32_t* prevSize,
+                          Slice delta, std::string* newValue) {
+    *newValue = std::string(delta.size(), 'c');
+    return UpdateStatus::UPDATED;
+  }
+
+  static UpdateStatus
+  updateInPlaceNoAction(char* prevValue, uint32_t* prevSize,
+                        Slice delta, std::string* newValue) {
+    return UpdateStatus::UPDATE_FAILED;
+  }
+
+  // Utility method to test InplaceUpdate
+  void validateNumberOfEntries(int numValues, int cf = 0) {
+    Iterator* iter;
+    if (cf != 0) {
+      iter = dbfull()->TEST_NewInternalIterator(handles_[cf]);
+    } else {
+      iter = dbfull()->TEST_NewInternalIterator();
+    }
+    iter->SeekToFirst();
+    ASSERT_EQ(iter->status().ok(), true);
+    int seq = numValues;
+    while (iter->Valid()) {
+      ParsedInternalKey ikey;
+      ikey.sequence = -1;
+      ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true);
+
+      // checks sequence number for updates
+      ASSERT_EQ(ikey.sequence, (unsigned)seq--);
+      iter->Next();
+    }
+    delete iter;
+    ASSERT_EQ(0, seq);
+  }
+
+  void CopyFile(const std::string& source, const std::string& destination,
+                uint64_t size = 0) {
+    const EnvOptions soptions;
+    unique_ptr<SequentialFile> srcfile;
+    ASSERT_OK(env_->NewSequentialFile(source, &srcfile, soptions));
+    unique_ptr<WritableFile> destfile;
+    ASSERT_OK(env_->NewWritableFile(destination, &destfile, soptions));
+
+    if (size == 0) {
+      // default argument means copy everything
+      ASSERT_OK(env_->GetFileSize(source, &size));
+    }
+
+    char buffer[4096];
+    Slice slice;
+    while (size > 0) {
+      uint64_t one = std::min(uint64_t(sizeof(buffer)), size);
+      ASSERT_OK(srcfile->Read(one, &slice, buffer));
+      ASSERT_OK(destfile->Append(slice));
+      size -= slice.size();
+    }
+    ASSERT_OK(destfile->Close());
+  }
+};
+
+static std::string Key(int i) {
+  char buf[100];
+  snprintf(buf, sizeof(buf), "key%06d", i);
+  return std::string(buf);
+}
+
+static long TestGetTickerCount(const Options& options, Tickers ticker_type) {
+  return options.statistics->getTickerCount(ticker_type);
+}
+
+// A helper function that ensures the table properties returned in
+// `GetPropertiesOfAllTablesTest` are correct.
+// This test assumes the number of entries is different for each of the
+// tables.
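+// (Each of the four tables gets a distinct entry count, so checking that the
+// per-table counts are pairwise unique and that they sum to the expected
+// total pins down every table's properties at once.)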
+namespace {
+void VerifyTableProperties(DB* db, uint64_t expected_entries_size) {
+  TablePropertiesCollection props;
+  ASSERT_OK(db->GetPropertiesOfAllTables(&props));
+
+  ASSERT_EQ(4U, props.size());
+  std::unordered_set<uint64_t> unique_entries;
+
+  // Indirect test
+  uint64_t sum = 0;
+  for (const auto& item : props) {
+    unique_entries.insert(item.second->num_entries);
+    sum += item.second->num_entries;
+  }
+
+  ASSERT_EQ(props.size(), unique_entries.size());
+  ASSERT_EQ(expected_entries_size, sum);
+}
+}  // namespace
+
+TEST(DBTest, Empty) {
+  do {
+    Options options;
+    options.env = env_;
+    options.write_buffer_size = 100000;  // Small write buffer
+    options = CurrentOptions(options);
+    CreateAndReopenWithCF({"pikachu"}, &options);
+
+    std::string num;
+    ASSERT_TRUE(dbfull()->GetProperty(
+        handles_[1], "rocksdb.num-entries-active-mem-table", &num));
+    ASSERT_EQ("0", num);
+
+    ASSERT_OK(Put(1, "foo", "v1"));
+    ASSERT_EQ("v1", Get(1, "foo"));
+    ASSERT_TRUE(dbfull()->GetProperty(
+        handles_[1], "rocksdb.num-entries-active-mem-table", &num));
+    ASSERT_EQ("1", num);
+
+    env_->delay_sstable_sync_.Release_Store(env_);  // Block sync calls
+    Put(1, "k1", std::string(100000, 'x'));         // Fill memtable
+    ASSERT_TRUE(dbfull()->GetProperty(
+        handles_[1], "rocksdb.num-entries-active-mem-table", &num));
+    ASSERT_EQ("2", num);
+
+    Put(1, "k2", std::string(100000, 'y'));  // Trigger compaction
+    ASSERT_TRUE(dbfull()->GetProperty(
+        handles_[1], "rocksdb.num-entries-active-mem-table", &num));
+    ASSERT_EQ("1", num);
+
+    ASSERT_EQ("v1", Get(1, "foo"));
+    env_->delay_sstable_sync_.Release_Store(nullptr);  // Release sync calls
+  } while (ChangeOptions());
+}
+
+TEST(DBTest, ReadOnlyDB) {
+  ASSERT_OK(Put("foo", "v1"));
+  ASSERT_OK(Put("bar", "v2"));
+  ASSERT_OK(Put("foo", "v3"));
+  Close();
+
+  Options options;
+  ASSERT_OK(ReadOnlyReopen(&options));
+  ASSERT_EQ("v3", Get("foo"));
+  ASSERT_EQ("v2", Get("bar"));
+  Iterator* iter = db_->NewIterator(ReadOptions());
+  int count = 0;
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    ASSERT_OK(iter->status());
+    ++count;
+  }
+  ASSERT_EQ(count, 2);
+  delete iter;
+}
+
+// Make sure that when options.block_cache is set, after a new table is
+// created its index/filter blocks are added to block cache.
+TEST(DBTest, IndexAndFilterBlocksOfNewTableAddedToCache) {
+  Options options = CurrentOptions();
+  std::unique_ptr<const FilterPolicy> filter_policy(
+      NewBloomFilterPolicy(20));
+  options.filter_policy = filter_policy.get();
+  options.create_if_missing = true;
+  options.statistics = rocksdb::CreateDBStatistics();
+  BlockBasedTableOptions table_options;
+  table_options.cache_index_and_filter_blocks = true;
+  options.table_factory.reset(new BlockBasedTableFactory(table_options));
+  CreateAndReopenWithCF({"pikachu"}, &options);
+
+  ASSERT_OK(Put(1, "key", "val"));
+  // Create a new table.
+  ASSERT_OK(Flush(1));
+
+  // index/filter blocks added to block cache right after table creation.
+  ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+  ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+  ASSERT_EQ(2, /* only index/filter were added */
+            TestGetTickerCount(options, BLOCK_CACHE_ADD));
+  ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_DATA_MISS));
+
+  // Make sure filter block is in cache.
+  std::string value;
+  ReadOptions ropt;
+  db_->KeyMayExist(ReadOptions(), handles_[1], "key", &value);
+
+  // Miss count should remain the same.
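+  // (KeyMayExist only consults the already-cached filter block, so it adds
+  // filter-cache hits without producing new misses or file reads.)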
+  ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+  ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+
+  db_->KeyMayExist(ReadOptions(), handles_[1], "key", &value);
+  ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+  ASSERT_EQ(2, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+
+  // Make sure index block is in cache.
+  auto index_block_hit = TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT);
+  value = Get(1, "key");
+  ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+  ASSERT_EQ(index_block_hit + 1,
+            TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+
+  value = Get(1, "key");
+  ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+  ASSERT_EQ(index_block_hit + 2,
+            TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+}
+
+TEST(DBTest, GetPropertiesOfAllTablesTest) {
+  Options options = CurrentOptions();
+  Reopen(&options);
+  // Create 4 tables
+  for (int table = 0; table < 4; ++table) {
+    for (int i = 0; i < 10 + table; ++i) {
+      db_->Put(WriteOptions(), std::to_string(table * 100 + i), "val");
+    }
+    db_->Flush(FlushOptions());
+  }
+
+  // 1. Read table properties directly from file
+  Reopen(&options);
+  VerifyTableProperties(db_, 10 + 11 + 12 + 13);
+
+  // 2. Put two tables into the table cache and verify
+  Reopen(&options);
+  // fetch keys from the 1st and 2nd table, which internally places those
+  // tables in the table cache.
+  for (int i = 0; i < 2; ++i) {
+    Get(std::to_string(i * 100 + 0));
+  }
+
+  VerifyTableProperties(db_, 10 + 11 + 12 + 13);
+
+  // 3. Put all tables into the table cache
+  Reopen(&options);
+  // fetch one key from each of the four tables, which internally places
+  // every table in the table cache.
+  for (int i = 0; i < 4; ++i) {
+    Get(std::to_string(i * 100 + 0));
+  }
+  VerifyTableProperties(db_, 10 + 11 + 12 + 13);
+}
+
+TEST(DBTest, LevelLimitReopen) {
+  Options options = CurrentOptions();
+  CreateAndReopenWithCF({"pikachu"}, &options);
+
+  const std::string value(1024 * 1024, ' ');
+  int i = 0;
+  while (NumTableFilesAtLevel(2, 1) == 0) {
+    ASSERT_OK(Put(1, Key(i++), value));
+  }
+
+  options.num_levels = 1;
+  options.max_bytes_for_level_multiplier_additional.resize(1, 1);
+  Status s = TryReopenWithColumnFamilies({"default", "pikachu"}, &options);
+  ASSERT_EQ(s.IsInvalidArgument(), true);
+  ASSERT_EQ(s.ToString(),
+            "Invalid argument: db has more levels than options.num_levels");
+
+  options.num_levels = 10;
+  options.max_bytes_for_level_multiplier_additional.resize(10, 1);
+  ASSERT_OK(TryReopenWithColumnFamilies({"default", "pikachu"}, &options));
+}
+
+TEST(DBTest, Preallocation) {
+  const std::string src = dbname_ + "/alloc_test";
+  unique_ptr<WritableFile> srcfile;
+  const EnvOptions soptions;
+  ASSERT_OK(env_->NewWritableFile(src, &srcfile, soptions));
+  srcfile->SetPreallocationBlockSize(1024 * 1024);
+
+  // No writes should mean no preallocation
+  size_t block_size, last_allocated_block;
+  srcfile->GetPreallocationStatus(&block_size, &last_allocated_block);
+  ASSERT_EQ(last_allocated_block, 0UL);
+
+  // Small write should preallocate one block
+  srcfile->Append("test");
+  srcfile->GetPreallocationStatus(&block_size, &last_allocated_block);
+  ASSERT_EQ(last_allocated_block, 1UL);
+
+  // Write an entire preallocation block, make sure we increased by two.
+  std::string buf(block_size, ' ');
+  srcfile->Append(buf);
+  srcfile->GetPreallocationStatus(&block_size, &last_allocated_block);
+  ASSERT_EQ(last_allocated_block, 2UL);
+
+  // Write five more blocks at once, ensure we're where we need to be.
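+  // (Two blocks are allocated so far; five more should bring the total to
+  // seven.)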
+ buf = std::string(block_size * 5, ' '); + srcfile->Append(buf); + srcfile->GetPreallocationStatus(&block_size, &last_allocated_block); + ASSERT_EQ(last_allocated_block, 7UL); +} + +TEST(DBTest, PutDeleteGet) { + do { + CreateAndReopenWithCF({"pikachu"}); + ASSERT_OK(Put(1, "foo", "v1")); + ASSERT_EQ("v1", Get(1, "foo")); + ASSERT_OK(Put(1, "foo", "v2")); + ASSERT_EQ("v2", Get(1, "foo")); + ASSERT_OK(Delete(1, "foo")); + ASSERT_EQ("NOT_FOUND", Get(1, "foo")); + } while (ChangeOptions()); +} + + +TEST(DBTest, GetFromImmutableLayer) { + do { + Options options; + options.env = env_; + options.write_buffer_size = 100000; // Small write buffer + options = CurrentOptions(options); + CreateAndReopenWithCF({"pikachu"}, &options); + + ASSERT_OK(Put(1, "foo", "v1")); + ASSERT_EQ("v1", Get(1, "foo")); + + env_->delay_sstable_sync_.Release_Store(env_); // Block sync calls + Put(1, "k1", std::string(100000, 'x')); // Fill memtable + Put(1, "k2", std::string(100000, 'y')); // Trigger flush + ASSERT_EQ("v1", Get(1, "foo")); + ASSERT_EQ("NOT_FOUND", Get(0, "foo")); + env_->delay_sstable_sync_.Release_Store(nullptr); // Release sync calls + } while (ChangeOptions()); +} + +TEST(DBTest, GetFromVersions) { + do { + CreateAndReopenWithCF({"pikachu"}); + ASSERT_OK(Put(1, "foo", "v1")); + ASSERT_OK(Flush(1)); + ASSERT_EQ("v1", Get(1, "foo")); + ASSERT_EQ("NOT_FOUND", Get(0, "foo")); + } while (ChangeOptions()); +} + +TEST(DBTest, GetSnapshot) { + do { + CreateAndReopenWithCF({"pikachu"}); + // Try with both a short key and a long key + for (int i = 0; i < 2; i++) { + std::string key = (i == 0) ? std::string("foo") : std::string(200, 'x'); + ASSERT_OK(Put(1, key, "v1")); + const Snapshot* s1 = db_->GetSnapshot(); + ASSERT_OK(Put(1, key, "v2")); + ASSERT_EQ("v2", Get(1, key)); + ASSERT_EQ("v1", Get(1, key, s1)); + ASSERT_OK(Flush(1)); + ASSERT_EQ("v2", Get(1, key)); + ASSERT_EQ("v1", Get(1, key, s1)); + db_->ReleaseSnapshot(s1); + } + // skip as HashCuckooRep does not support snapshot + } while (ChangeOptions(kSkipHashCuckoo)); +} + +TEST(DBTest, GetLevel0Ordering) { + do { + CreateAndReopenWithCF({"pikachu"}); + // Check that we process level-0 files in correct order. The code + // below generates two level-0 files where the earlier one comes + // before the later one in the level-0 file list since the earlier + // one has a smaller "smallest" key. + ASSERT_OK(Put(1, "bar", "b")); + ASSERT_OK(Put(1, "foo", "v1")); + ASSERT_OK(Flush(1)); + ASSERT_OK(Put(1, "foo", "v2")); + ASSERT_OK(Flush(1)); + ASSERT_EQ("v2", Get(1, "foo")); + } while (ChangeOptions()); +} + +TEST(DBTest, GetOrderedByLevels) { + do { + CreateAndReopenWithCF({"pikachu"}); + ASSERT_OK(Put(1, "foo", "v1")); + Compact(1, "a", "z"); + ASSERT_EQ("v1", Get(1, "foo")); + ASSERT_OK(Put(1, "foo", "v2")); + ASSERT_EQ("v2", Get(1, "foo")); + ASSERT_OK(Flush(1)); + ASSERT_EQ("v2", Get(1, "foo")); + } while (ChangeOptions()); +} + +TEST(DBTest, GetPicksCorrectFile) { + do { + CreateAndReopenWithCF({"pikachu"}); + // Arrange to have multiple files in a non-level-0 level. 
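+    // Each Put/Compact pair below flushes a single key and compacts it out
+    // of level 0, leaving three disjoint files in a deeper level.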
+ ASSERT_OK(Put(1, "a", "va")); + Compact(1, "a", "b"); + ASSERT_OK(Put(1, "x", "vx")); + Compact(1, "x", "y"); + ASSERT_OK(Put(1, "f", "vf")); + Compact(1, "f", "g"); + ASSERT_EQ("va", Get(1, "a")); + ASSERT_EQ("vf", Get(1, "f")); + ASSERT_EQ("vx", Get(1, "x")); + } while (ChangeOptions()); +} + +TEST(DBTest, GetEncountersEmptyLevel) { + do { + CreateAndReopenWithCF({"pikachu"}); + // Arrange for the following to happen: + // * sstable A in level 0 + // * nothing in level 1 + // * sstable B in level 2 + // Then do enough Get() calls to arrange for an automatic compaction + // of sstable A. A bug would cause the compaction to be marked as + // occuring at level 1 (instead of the correct level 0). + + // Step 1: First place sstables in levels 0 and 2 + int compaction_count = 0; + while (NumTableFilesAtLevel(0, 1) == 0 || NumTableFilesAtLevel(2, 1) == 0) { + ASSERT_LE(compaction_count, 100) << "could not fill levels 0 and 2"; + compaction_count++; + Put(1, "a", "begin"); + Put(1, "z", "end"); + ASSERT_OK(Flush(1)); + } + + // Step 2: clear level 1 if necessary. + dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]); + ASSERT_EQ(NumTableFilesAtLevel(0, 1), 1); + ASSERT_EQ(NumTableFilesAtLevel(1, 1), 0); + ASSERT_EQ(NumTableFilesAtLevel(2, 1), 1); + + // Step 3: read a bunch of times + for (int i = 0; i < 1000; i++) { + ASSERT_EQ("NOT_FOUND", Get(1, "missing")); + } + + // Step 4: Wait for compaction to finish + env_->SleepForMicroseconds(1000000); + + ASSERT_EQ(NumTableFilesAtLevel(0, 1), 1); // XXX + } while (ChangeOptions(kSkipUniversalCompaction | kSkipFIFOCompaction)); +} + +// KeyMayExist can lead to a few false positives, but not false negatives. +// To make test deterministic, use a much larger number of bits per key-20 than +// bits in the key, so that false positives are eliminated +TEST(DBTest, KeyMayExist) { + do { + ReadOptions ropts; + std::string value; + Options options = CurrentOptions(); + options.filter_policy = NewBloomFilterPolicy(20); + options.statistics = rocksdb::CreateDBStatistics(); + CreateAndReopenWithCF({"pikachu"}, &options); + + ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "a", &value)); + + ASSERT_OK(Put(1, "a", "b")); + bool value_found = false; + ASSERT_TRUE( + db_->KeyMayExist(ropts, handles_[1], "a", &value, &value_found)); + ASSERT_TRUE(value_found); + ASSERT_EQ("b", value); + + ASSERT_OK(Flush(1)); + value.clear(); + + long numopen = TestGetTickerCount(options, NO_FILE_OPENS); + long cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD); + ASSERT_TRUE( + db_->KeyMayExist(ropts, handles_[1], "a", &value, &value_found)); + ASSERT_TRUE(!value_found); + // assert that no new files were opened and no new blocks were + // read into block cache. 
+    ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS));
+    ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD));
+
+    ASSERT_OK(Delete(1, "a"));
+
+    numopen = TestGetTickerCount(options, NO_FILE_OPENS);
+    cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD);
+    ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "a", &value));
+    ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS));
+    ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD));
+
+    ASSERT_OK(Flush(1));
+    db_->CompactRange(handles_[1], nullptr, nullptr);
+
+    numopen = TestGetTickerCount(options, NO_FILE_OPENS);
+    cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD);
+    ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "a", &value));
+    ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS));
+    ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD));
+
+    ASSERT_OK(Delete(1, "c"));
+
+    numopen = TestGetTickerCount(options, NO_FILE_OPENS);
+    cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD);
+    ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "c", &value));
+    ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS));
+    ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD));
+
+    delete options.filter_policy;
+
+    // KeyMayExist function only checks data in block caches, which is not
+    // used by plain table format.
+  } while (
+      ChangeOptions(kSkipPlainTable | kSkipHashIndex | kSkipFIFOCompaction));
+}
+
+TEST(DBTest, NonBlockingIteration) {
+  do {
+    ReadOptions non_blocking_opts, regular_opts;
+    Options options = CurrentOptions();
+    options.statistics = rocksdb::CreateDBStatistics();
+    non_blocking_opts.read_tier = kBlockCacheTier;
+    CreateAndReopenWithCF({"pikachu"}, &options);
+    // write one kv to the database.
+    ASSERT_OK(Put(1, "a", "b"));
+
+    // scan using non-blocking iterator. We should find it because
+    // it is in memtable.
+    Iterator* iter = db_->NewIterator(non_blocking_opts, handles_[1]);
+    int count = 0;
+    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+      ASSERT_OK(iter->status());
+      count++;
+    }
+    ASSERT_EQ(count, 1);
+    delete iter;
+
+    // flush memtable to storage. Now, the key should be neither in the
+    // memtable nor in the block cache.
+    ASSERT_OK(Flush(1));
+
+    // verify that a non-blocking iterator does not find any
+    // kvs. Neither does it do any IOs to storage.
+    long numopen = TestGetTickerCount(options, NO_FILE_OPENS);
+    long cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD);
+    iter = db_->NewIterator(non_blocking_opts, handles_[1]);
+    count = 0;
+    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+      count++;
+    }
+    ASSERT_EQ(count, 0);
+    ASSERT_TRUE(iter->status().IsIncomplete());
+    ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS));
+    ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD));
+    delete iter;
+
+    // read in the specified block via a regular get
+    ASSERT_EQ(Get(1, "a"), "b");
+
+    // verify that we can find it via a non-blocking scan
+    numopen = TestGetTickerCount(options, NO_FILE_OPENS);
+    cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD);
+    iter = db_->NewIterator(non_blocking_opts, handles_[1]);
+    count = 0;
+    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+      ASSERT_OK(iter->status());
+      count++;
+    }
+    ASSERT_EQ(count, 1);
+    ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS));
+    ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD));
+    delete iter;
+
+    // This test verifies block cache behavior, which is not exercised by
+    // the plain table format.
+    // Exclude kHashCuckoo as it does not support iteration currently
+  } while (ChangeOptions(kSkipPlainTable | kSkipNoSeekToLast |
+                         kSkipHashCuckoo));
+}
+
+// A delete is skipped for a key if KeyMayExist(key) returns false.
+// Tests WriteBatch consistency and proper delete behaviour
+TEST(DBTest, FilterDeletes) {
+  do {
+    Options options = CurrentOptions();
+    options.filter_policy = NewBloomFilterPolicy(20);
+    options.filter_deletes = true;
+    CreateAndReopenWithCF({"pikachu"}, &options);
+    WriteBatch batch;
+
+    batch.Delete(handles_[1], "a");
+    dbfull()->Write(WriteOptions(), &batch);
+    ASSERT_EQ(AllEntriesFor("a", 1), "[ ]");  // Delete skipped
+    batch.Clear();
+
+    batch.Put(handles_[1], "a", "b");
+    batch.Delete(handles_[1], "a");
+    dbfull()->Write(WriteOptions(), &batch);
+    ASSERT_EQ(Get(1, "a"), "NOT_FOUND");
+    ASSERT_EQ(AllEntriesFor("a", 1), "[ DEL, b ]");  // Delete issued
+    batch.Clear();
+
+    batch.Delete(handles_[1], "c");
+    batch.Put(handles_[1], "c", "d");
+    dbfull()->Write(WriteOptions(), &batch);
+    ASSERT_EQ(Get(1, "c"), "d");
+    ASSERT_EQ(AllEntriesFor("c", 1), "[ d ]");  // Delete skipped
+    batch.Clear();
+
+    ASSERT_OK(Flush(1));  // A stray Flush
+
+    batch.Delete(handles_[1], "c");
+    dbfull()->Write(WriteOptions(), &batch);
+    ASSERT_EQ(AllEntriesFor("c", 1), "[ DEL, d ]");  // Delete issued
+    batch.Clear();
+
+    delete options.filter_policy;
+  } while (ChangeCompactOptions());
+}
+
+TEST(DBTest, IterSeekBeforePrev) {
+  ASSERT_OK(Put("a", "b"));
+  ASSERT_OK(Put("c", "d"));
+  dbfull()->Flush(FlushOptions());
+  ASSERT_OK(Put("0", "f"));
+  ASSERT_OK(Put("1", "h"));
+  dbfull()->Flush(FlushOptions());
+  ASSERT_OK(Put("2", "j"));
+  auto iter = db_->NewIterator(ReadOptions());
+  iter->Seek(Slice("c"));
+  iter->Prev();
+  iter->Seek(Slice("a"));
+  iter->Prev();
+  delete iter;
+}
+
+namespace {
+std::string MakeLongKey(size_t length, char c) {
+  return std::string(length, c);
+}
+}  // namespace
+
+TEST(DBTest, IterLongKeys) {
+  ASSERT_OK(Put(MakeLongKey(20, 0), "0"));
+  ASSERT_OK(Put(MakeLongKey(32, 2), "2"));
+  ASSERT_OK(Put("a", "b"));
+  dbfull()->Flush(FlushOptions());
+  ASSERT_OK(Put(MakeLongKey(50, 1), "1"));
+  ASSERT_OK(Put(MakeLongKey(127, 3), "3"));
+  ASSERT_OK(Put(MakeLongKey(64, 4), "4"));
+  auto iter = db_->NewIterator(ReadOptions());
+
iter->Seek(MakeLongKey(20, 0)); + ASSERT_EQ(IterStatus(iter), MakeLongKey(20, 0) + "->0"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), MakeLongKey(50, 1) + "->1"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), MakeLongKey(32, 2) + "->2"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), MakeLongKey(127, 3) + "->3"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), MakeLongKey(64, 4) + "->4"); + delete iter; + + iter = db_->NewIterator(ReadOptions()); + iter->Seek(MakeLongKey(50, 1)); + ASSERT_EQ(IterStatus(iter), MakeLongKey(50, 1) + "->1"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), MakeLongKey(32, 2) + "->2"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), MakeLongKey(127, 3) + "->3"); + delete iter; +} + + +TEST(DBTest, IterNextWithNewerSeq) { + ASSERT_OK(Put("0", "0")); + dbfull()->Flush(FlushOptions()); + ASSERT_OK(Put("a", "b")); + ASSERT_OK(Put("c", "d")); + ASSERT_OK(Put("d", "e")); + auto iter = db_->NewIterator(ReadOptions()); + + // Create a key that needs to be skipped for Seq too new + for (uint64_t i = 0; i < last_options_.max_sequential_skip_in_iterations + 1; + i++) { + ASSERT_OK(Put("b", "f")); + } + + iter->Seek(Slice("a")); + ASSERT_EQ(IterStatus(iter), "a->b"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "c->d"); + delete iter; +} + +TEST(DBTest, IterPrevWithNewerSeq) { + ASSERT_OK(Put("0", "0")); + dbfull()->Flush(FlushOptions()); + ASSERT_OK(Put("a", "b")); + ASSERT_OK(Put("c", "d")); + ASSERT_OK(Put("d", "e")); + auto iter = db_->NewIterator(ReadOptions()); + + // Create a key that needs to be skipped for Seq too new + for (uint64_t i = 0; i < last_options_.max_sequential_skip_in_iterations + 1; + i++) { + ASSERT_OK(Put("b", "f")); + } + + iter->Seek(Slice("d")); + ASSERT_EQ(IterStatus(iter), "d->e"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "c->d"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "a->b"); + + iter->Prev(); + delete iter; +} + +TEST(DBTest, IterPrevWithNewerSeq2) { + ASSERT_OK(Put("0", "0")); + dbfull()->Flush(FlushOptions()); + ASSERT_OK(Put("a", "b")); + ASSERT_OK(Put("c", "d")); + ASSERT_OK(Put("d", "e")); + auto iter = db_->NewIterator(ReadOptions()); + iter->Seek(Slice("c")); + ASSERT_EQ(IterStatus(iter), "c->d"); + + // Create a key that needs to be skipped for Seq too new + for (uint64_t i = 0; i < last_options_.max_sequential_skip_in_iterations + 1; + i++) { + ASSERT_OK(Put("b", "f")); + } + + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "a->b"); + + iter->Prev(); + delete iter; +} + +TEST(DBTest, IterEmpty) { + do { + CreateAndReopenWithCF({"pikachu"}); + Iterator* iter = db_->NewIterator(ReadOptions(), handles_[1]); + + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->SeekToLast(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->Seek("foo"); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + delete iter; + } while (ChangeCompactOptions()); +} + +TEST(DBTest, IterSingle) { + do { + CreateAndReopenWithCF({"pikachu"}); + ASSERT_OK(Put(1, "a", "va")); + Iterator* iter = db_->NewIterator(ReadOptions(), handles_[1]); + + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->SeekToLast(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + iter->SeekToLast(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + 
iter->Seek(""); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->Seek("a"); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->Seek("b"); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + delete iter; + } while (ChangeCompactOptions()); +} + +TEST(DBTest, IterMulti) { + do { + CreateAndReopenWithCF({"pikachu"}); + ASSERT_OK(Put(1, "a", "va")); + ASSERT_OK(Put(1, "b", "vb")); + ASSERT_OK(Put(1, "c", "vc")); + Iterator* iter = db_->NewIterator(ReadOptions(), handles_[1]); + + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "b->vb"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->SeekToLast(); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "b->vb"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + iter->SeekToLast(); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->Seek(""); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Seek("a"); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Seek("ax"); + ASSERT_EQ(IterStatus(iter), "b->vb"); + + iter->Seek("b"); + ASSERT_EQ(IterStatus(iter), "b->vb"); + iter->Seek("z"); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + // Switch from reverse to forward + iter->SeekToLast(); + iter->Prev(); + iter->Prev(); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "b->vb"); + + // Switch from forward to reverse + iter->SeekToFirst(); + iter->Next(); + iter->Next(); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "b->vb"); + + // Make sure iter stays at snapshot + ASSERT_OK(Put(1, "a", "va2")); + ASSERT_OK(Put(1, "a2", "va3")); + ASSERT_OK(Put(1, "b", "vb2")); + ASSERT_OK(Put(1, "c", "vc2")); + ASSERT_OK(Delete(1, "b")); + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "b->vb"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + iter->SeekToLast(); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "b->vb"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + delete iter; + } while (ChangeCompactOptions()); +} + +// Check that we can skip over a run of user keys +// by using reseek rather than sequential scan +TEST(DBTest, IterReseek) { + Options options = CurrentOptions(); + options.max_sequential_skip_in_iterations = 3; + options.create_if_missing = true; + options.statistics = rocksdb::CreateDBStatistics(); + DestroyAndReopen(&options); + CreateAndReopenWithCF({"pikachu"}, &options); + + // insert two keys with same userkey and verify that + // reseek is not invoked. For each of these test cases, + // verify that we can find the next key "b". 
+ ASSERT_OK(Put(1, "a", "one")); + ASSERT_OK(Put(1, "a", "two")); + ASSERT_OK(Put(1, "b", "bone")); + Iterator* iter = db_->NewIterator(ReadOptions(), handles_[1]); + iter->SeekToFirst(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0); + ASSERT_EQ(IterStatus(iter), "a->two"); + iter->Next(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0); + ASSERT_EQ(IterStatus(iter), "b->bone"); + delete iter; + + // insert a total of three keys with same userkey and verify + // that reseek is still not invoked. + ASSERT_OK(Put(1, "a", "three")); + iter = db_->NewIterator(ReadOptions(), handles_[1]); + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter), "a->three"); + iter->Next(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0); + ASSERT_EQ(IterStatus(iter), "b->bone"); + delete iter; + + // insert a total of four keys with same userkey and verify + // that reseek is invoked. + ASSERT_OK(Put(1, "a", "four")); + iter = db_->NewIterator(ReadOptions(), handles_[1]); + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter), "a->four"); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0); + iter->Next(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 1); + ASSERT_EQ(IterStatus(iter), "b->bone"); + delete iter; + + // Testing reverse iterator + // At this point, we have three versions of "a" and one version of "b". + // The reseek statistics is already at 1. + int num_reseeks = + (int)TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION); + + // Insert another version of b and assert that reseek is not invoked + ASSERT_OK(Put(1, "b", "btwo")); + iter = db_->NewIterator(ReadOptions(), handles_[1]); + iter->SeekToLast(); + ASSERT_EQ(IterStatus(iter), "b->btwo"); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), + num_reseeks); + iter->Prev(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), + num_reseeks + 1); + ASSERT_EQ(IterStatus(iter), "a->four"); + delete iter; + + // insert two more versions of b. This makes a total of 4 versions + // of b and 4 versions of a. 
+ ASSERT_OK(Put(1, "b", "bthree")); + ASSERT_OK(Put(1, "b", "bfour")); + iter = db_->NewIterator(ReadOptions(), handles_[1]); + iter->SeekToLast(); + ASSERT_EQ(IterStatus(iter), "b->bfour"); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), + num_reseeks + 2); + iter->Prev(); + + // the previous Prev call should have invoked reseek + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), + num_reseeks + 3); + ASSERT_EQ(IterStatus(iter), "a->four"); + delete iter; +} + +TEST(DBTest, IterSmallAndLargeMix) { + do { + CreateAndReopenWithCF({"pikachu"}); + ASSERT_OK(Put(1, "a", "va")); + ASSERT_OK(Put(1, "b", std::string(100000, 'b'))); + ASSERT_OK(Put(1, "c", "vc")); + ASSERT_OK(Put(1, "d", std::string(100000, 'd'))); + ASSERT_OK(Put(1, "e", std::string(100000, 'e'))); + + Iterator* iter = db_->NewIterator(ReadOptions(), handles_[1]); + + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "b->" + std::string(100000, 'b')); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "d->" + std::string(100000, 'd')); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "e->" + std::string(100000, 'e')); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->SeekToLast(); + ASSERT_EQ(IterStatus(iter), "e->" + std::string(100000, 'e')); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "d->" + std::string(100000, 'd')); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "b->" + std::string(100000, 'b')); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + delete iter; + } while (ChangeCompactOptions()); +} + +TEST(DBTest, IterMultiWithDelete) { + do { + CreateAndReopenWithCF({"pikachu"}); + ASSERT_OK(Put(1, "ka", "va")); + ASSERT_OK(Put(1, "kb", "vb")); + ASSERT_OK(Put(1, "kc", "vc")); + ASSERT_OK(Delete(1, "kb")); + ASSERT_EQ("NOT_FOUND", Get(1, "kb")); + + Iterator* iter = db_->NewIterator(ReadOptions(), handles_[1]); + iter->Seek("kc"); + ASSERT_EQ(IterStatus(iter), "kc->vc"); + if (!CurrentOptions().merge_operator) { + // TODO: merge operator does not support backward iteration yet + if (kPlainTableAllBytesPrefix != option_config_&& + kBlockBasedTableWithWholeKeyHashIndex != option_config_ && + kHashLinkList != option_config_) { + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "ka->va"); + } + } + delete iter; + } while (ChangeOptions()); +} + +TEST(DBTest, IterPrevMaxSkip) { + do { + CreateAndReopenWithCF({"pikachu"}); + for (int i = 0; i < 2; i++) { + ASSERT_OK(Put(1, "key1", "v1")); + ASSERT_OK(Put(1, "key2", "v2")); + ASSERT_OK(Put(1, "key3", "v3")); + ASSERT_OK(Put(1, "key4", "v4")); + ASSERT_OK(Put(1, "key5", "v5")); + } + + VerifyIterLast("key5->v5", 1); + + ASSERT_OK(Delete(1, "key5")); + VerifyIterLast("key4->v4", 1); + + ASSERT_OK(Delete(1, "key4")); + VerifyIterLast("key3->v3", 1); + + ASSERT_OK(Delete(1, "key3")); + VerifyIterLast("key2->v2", 1); + + ASSERT_OK(Delete(1, "key2")); + VerifyIterLast("key1->v1", 1); + + ASSERT_OK(Delete(1, "key1")); + VerifyIterLast("(invalid)", 1); + } while (ChangeOptions(kSkipMergePut | kSkipNoSeekToLast)); +} + +TEST(DBTest, IterWithSnapshot) { + do { + CreateAndReopenWithCF({"pikachu"}); + ASSERT_OK(Put(1, "key1", "val1")); + ASSERT_OK(Put(1, "key2", "val2")); + ASSERT_OK(Put(1, "key3", "val3")); + ASSERT_OK(Put(1, "key4", "val4")); + ASSERT_OK(Put(1, "key5", "val5")); + + const Snapshot 
*snapshot = db_->GetSnapshot(); + ReadOptions options; + options.snapshot = snapshot; + Iterator* iter = db_->NewIterator(options, handles_[1]); + + // Put more values after the snapshot + ASSERT_OK(Put(1, "key100", "val100")); + ASSERT_OK(Put(1, "key101", "val101")); + + iter->Seek("key5"); + ASSERT_EQ(IterStatus(iter), "key5->val5"); + if (!CurrentOptions().merge_operator) { + // TODO: merge operator does not support backward iteration yet + if (kPlainTableAllBytesPrefix != option_config_&& + kBlockBasedTableWithWholeKeyHashIndex != option_config_ && + kHashLinkList != option_config_) { + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "key4->val4"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "key3->val3"); + + iter->Next(); + ASSERT_EQ(IterStatus(iter), "key4->val4"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "key5->val5"); + } + iter->Next(); + ASSERT_TRUE(!iter->Valid()); + } + db_->ReleaseSnapshot(snapshot); + delete iter; + // skip as HashCuckooRep does not support snapshot + } while (ChangeOptions(kSkipHashCuckoo)); +} + +TEST(DBTest, Recover) { + do { + CreateAndReopenWithCF({"pikachu"}); + ASSERT_OK(Put(1, "foo", "v1")); + ASSERT_OK(Put(1, "baz", "v5")); + + ReopenWithColumnFamilies({"default", "pikachu"}); + ASSERT_EQ("v1", Get(1, "foo")); + + ASSERT_EQ("v1", Get(1, "foo")); + ASSERT_EQ("v5", Get(1, "baz")); + ASSERT_OK(Put(1, "bar", "v2")); + ASSERT_OK(Put(1, "foo", "v3")); + + ReopenWithColumnFamilies({"default", "pikachu"}); + ASSERT_EQ("v3", Get(1, "foo")); + ASSERT_OK(Put(1, "foo", "v4")); + ASSERT_EQ("v4", Get(1, "foo")); + ASSERT_EQ("v2", Get(1, "bar")); + ASSERT_EQ("v5", Get(1, "baz")); + } while (ChangeOptions()); +} + +TEST(DBTest, RecoverWithTableHandle) { + do { + Options options; + options.create_if_missing = true; + options.write_buffer_size = 100; + options.disable_auto_compactions = true; + options = CurrentOptions(options); + DestroyAndReopen(&options); + CreateAndReopenWithCF({"pikachu"}, &options); + + ASSERT_OK(Put(1, "foo", "v1")); + ASSERT_OK(Put(1, "bar", "v2")); + ASSERT_OK(Flush(1)); + ASSERT_OK(Put(1, "foo", "v3")); + ASSERT_OK(Put(1, "bar", "v4")); + ASSERT_OK(Flush(1)); + ASSERT_OK(Put(1, "big", std::string(100, 'a'))); + ReopenWithColumnFamilies({"default", "pikachu"}); + + std::vector> files; + dbfull()->TEST_GetFilesMetaData(handles_[1], &files); + int total_files = 0; + for (const auto& level : files) { + total_files += level.size(); + } + ASSERT_EQ(total_files, 3); + for (const auto& level : files) { + for (const auto& file : level) { + if (kInfiniteMaxOpenFiles == option_config_) { + ASSERT_TRUE(file.table_reader_handle != nullptr); + } else { + ASSERT_TRUE(file.table_reader_handle == nullptr); + } + } + } + } while (ChangeOptions()); +} + +TEST(DBTest, IgnoreRecoveredLog) { + std::string backup_logs = dbname_ + "/backup_logs"; + + // delete old files in backup_logs directory + env_->CreateDirIfMissing(backup_logs); + std::vector old_files; + env_->GetChildren(backup_logs, &old_files); + for (auto& file : old_files) { + if (file != "." 
&& file != "..") { + env_->DeleteFile(backup_logs + "/" + file); + } + } + + do { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.merge_operator = MergeOperators::CreateUInt64AddOperator(); + options.wal_dir = dbname_ + "/logs"; + DestroyAndReopen(&options); + + // fill up the DB + std::string one, two; + PutFixed64(&one, 1); + PutFixed64(&two, 2); + ASSERT_OK(db_->Merge(WriteOptions(), Slice("foo"), Slice(one))); + ASSERT_OK(db_->Merge(WriteOptions(), Slice("foo"), Slice(one))); + ASSERT_OK(db_->Merge(WriteOptions(), Slice("bar"), Slice(one))); + + // copy the logs to backup + std::vector logs; + env_->GetChildren(options.wal_dir, &logs); + for (auto& log : logs) { + if (log != ".." && log != ".") { + CopyFile(options.wal_dir + "/" + log, backup_logs + "/" + log); + } + } + + // recover the DB + Reopen(&options); + ASSERT_EQ(two, Get("foo")); + ASSERT_EQ(one, Get("bar")); + Close(); + + // copy the logs from backup back to wal dir + for (auto& log : logs) { + if (log != ".." && log != ".") { + CopyFile(backup_logs + "/" + log, options.wal_dir + "/" + log); + } + } + // this should ignore the log files, recovery should not happen again + // if the recovery happens, the same merge operator would be called twice, + // leading to incorrect results + Reopen(&options); + ASSERT_EQ(two, Get("foo")); + ASSERT_EQ(one, Get("bar")); + Close(); + Destroy(&options); + Reopen(&options); + Close(); + + // copy the logs from backup back to wal dir + env_->CreateDirIfMissing(options.wal_dir); + for (auto& log : logs) { + if (log != ".." && log != ".") { + CopyFile(backup_logs + "/" + log, options.wal_dir + "/" + log); + } + } + // assert that we successfully recovered only from logs, even though we + // destroyed the DB + Reopen(&options); + ASSERT_EQ(two, Get("foo")); + ASSERT_EQ(one, Get("bar")); + + // Recovery will fail if DB directory doesn't exist. + Destroy(&options); + // copy the logs from backup back to wal dir + env_->CreateDirIfMissing(options.wal_dir); + for (auto& log : logs) { + if (log != ".." && log != ".") { + CopyFile(backup_logs + "/" + log, options.wal_dir + "/" + log); + // we won't be needing this file no more + env_->DeleteFile(backup_logs + "/" + log); + } + } + Status s = TryReopen(&options); + ASSERT_TRUE(!s.ok()); + } while (ChangeOptions(kSkipHashCuckoo)); +} + +TEST(DBTest, RollLog) { + do { + CreateAndReopenWithCF({"pikachu"}); + ASSERT_OK(Put(1, "foo", "v1")); + ASSERT_OK(Put(1, "baz", "v5")); + + ReopenWithColumnFamilies({"default", "pikachu"}); + for (int i = 0; i < 10; i++) { + ReopenWithColumnFamilies({"default", "pikachu"}); + } + ASSERT_OK(Put(1, "foo", "v4")); + for (int i = 0; i < 10; i++) { + ReopenWithColumnFamilies({"default", "pikachu"}); + } + } while (ChangeOptions()); +} + +TEST(DBTest, WAL) { + do { + CreateAndReopenWithCF({"pikachu"}); + WriteOptions writeOpt = WriteOptions(); + writeOpt.disableWAL = true; + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v1")); + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v1")); + + ReopenWithColumnFamilies({"default", "pikachu"}); + ASSERT_EQ("v1", Get(1, "foo")); + ASSERT_EQ("v1", Get(1, "bar")); + + writeOpt.disableWAL = false; + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v2")); + writeOpt.disableWAL = true; + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v2")); + + ReopenWithColumnFamilies({"default", "pikachu"}); + // Both value's should be present. 
+ ASSERT_EQ("v2", Get(1, "bar")); + ASSERT_EQ("v2", Get(1, "foo")); + + writeOpt.disableWAL = true; + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v3")); + writeOpt.disableWAL = false; + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v3")); + + ReopenWithColumnFamilies({"default", "pikachu"}); + // again both values should be present. + ASSERT_EQ("v3", Get(1, "foo")); + ASSERT_EQ("v3", Get(1, "bar")); + } while (ChangeCompactOptions()); +} + +TEST(DBTest, CheckLock) { + do { + DB* localdb; + Options options = CurrentOptions(); + ASSERT_OK(TryReopen(&options)); + + // second open should fail + ASSERT_TRUE(!(DB::Open(options, dbname_, &localdb)).ok()); + } while (ChangeCompactOptions()); +} + +TEST(DBTest, FlushMultipleMemtable) { + do { + Options options = CurrentOptions(); + WriteOptions writeOpt = WriteOptions(); + writeOpt.disableWAL = true; + options.max_write_buffer_number = 4; + options.min_write_buffer_number_to_merge = 3; + CreateAndReopenWithCF({"pikachu"}, &options); + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v1")); + ASSERT_OK(Flush(1)); + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v1")); + + ASSERT_EQ("v1", Get(1, "foo")); + ASSERT_EQ("v1", Get(1, "bar")); + ASSERT_OK(Flush(1)); + } while (ChangeCompactOptions()); +} + +TEST(DBTest, NumImmutableMemTable) { + do { + Options options = CurrentOptions(); + WriteOptions writeOpt = WriteOptions(); + writeOpt.disableWAL = true; + options.max_write_buffer_number = 4; + options.min_write_buffer_number_to_merge = 3; + options.write_buffer_size = 1000000; + CreateAndReopenWithCF({"pikachu"}, &options); + + std::string big_value(1000000 * 2, 'x'); + std::string num; + SetPerfLevel(kEnableTime);; + + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "k1", big_value)); + ASSERT_TRUE(dbfull()->GetProperty(handles_[1], + "rocksdb.num-immutable-mem-table", &num)); + ASSERT_EQ(num, "0"); + ASSERT_TRUE(dbfull()->GetProperty( + handles_[1], "rocksdb.num-entries-active-mem-table", &num)); + ASSERT_EQ(num, "1"); + perf_context.Reset(); + Get(1, "k1"); + ASSERT_EQ(1, (int) perf_context.get_from_memtable_count); + + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "k2", big_value)); + ASSERT_TRUE(dbfull()->GetProperty(handles_[1], + "rocksdb.num-immutable-mem-table", &num)); + ASSERT_EQ(num, "1"); + ASSERT_TRUE(dbfull()->GetProperty( + handles_[1], "rocksdb.num-entries-active-mem-table", &num)); + ASSERT_EQ(num, "1"); + ASSERT_TRUE(dbfull()->GetProperty( + handles_[1], "rocksdb.num-entries-imm-mem-tables", &num)); + ASSERT_EQ(num, "1"); + + perf_context.Reset(); + Get(1, "k1"); + ASSERT_EQ(2, (int) perf_context.get_from_memtable_count); + perf_context.Reset(); + Get(1, "k2"); + ASSERT_EQ(1, (int) perf_context.get_from_memtable_count); + + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "k3", big_value)); + ASSERT_TRUE(dbfull()->GetProperty( + handles_[1], "rocksdb.cur-size-active-mem-table", &num)); + ASSERT_TRUE(dbfull()->GetProperty(handles_[1], + "rocksdb.num-immutable-mem-table", &num)); + ASSERT_EQ(num, "2"); + ASSERT_TRUE(dbfull()->GetProperty( + handles_[1], "rocksdb.num-entries-active-mem-table", &num)); + ASSERT_EQ(num, "1"); + ASSERT_TRUE(dbfull()->GetProperty( + handles_[1], "rocksdb.num-entries-imm-mem-tables", &num)); + ASSERT_EQ(num, "2"); + perf_context.Reset(); + Get(1, "k2"); + ASSERT_EQ(2, (int) perf_context.get_from_memtable_count); + perf_context.Reset(); + Get(1, "k3"); + ASSERT_EQ(1, (int) perf_context.get_from_memtable_count); + perf_context.Reset(); + Get(1, "k1"); + ASSERT_EQ(3, (int) 
perf_context.get_from_memtable_count); + + ASSERT_OK(Flush(1)); + ASSERT_TRUE(dbfull()->GetProperty(handles_[1], + "rocksdb.num-immutable-mem-table", &num)); + ASSERT_EQ(num, "0"); + ASSERT_TRUE(dbfull()->GetProperty( + handles_[1], "rocksdb.cur-size-active-mem-table", &num)); + // "200" is the size of the metadata of an empty skiplist, this would + // break if we change the default skiplist implementation + ASSERT_EQ(num, "200"); + SetPerfLevel(kDisable); + } while (ChangeCompactOptions()); +} + +class SleepingBackgroundTask { + public: + SleepingBackgroundTask() + : bg_cv_(&mutex_), should_sleep_(true), done_with_sleep_(false) {} + void DoSleep() { + MutexLock l(&mutex_); + while (should_sleep_) { + bg_cv_.Wait(); + } + done_with_sleep_ = true; + bg_cv_.SignalAll(); + } + void WakeUp() { + MutexLock l(&mutex_); + should_sleep_ = false; + bg_cv_.SignalAll(); + } + void WaitUntilDone() { + MutexLock l(&mutex_); + while (!done_with_sleep_) { + bg_cv_.Wait(); + } + } + + static void DoSleepTask(void* arg) { + reinterpret_cast(arg)->DoSleep(); + } + + private: + port::Mutex mutex_; + port::CondVar bg_cv_; // Signalled when background work finishes + bool should_sleep_; + bool done_with_sleep_; +}; + +TEST(DBTest, GetProperty) { + // Set sizes to both background thread pool to be 1 and block them. + env_->SetBackgroundThreads(1, Env::HIGH); + env_->SetBackgroundThreads(1, Env::LOW); + SleepingBackgroundTask sleeping_task_low; + env_->Schedule(&SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, + Env::Priority::LOW); + SleepingBackgroundTask sleeping_task_high; + env_->Schedule(&SleepingBackgroundTask::DoSleepTask, &sleeping_task_high, + Env::Priority::HIGH); + + Options options = CurrentOptions(); + WriteOptions writeOpt = WriteOptions(); + writeOpt.disableWAL = true; + options.compaction_style = kCompactionStyleUniversal; + options.level0_file_num_compaction_trigger = 1; + options.compaction_options_universal.size_ratio = 50; + options.max_background_compactions = 1; + options.max_background_flushes = 1; + options.max_write_buffer_number = 10; + options.min_write_buffer_number_to_merge = 1; + options.write_buffer_size = 1000000; + Reopen(&options); + + std::string big_value(1000000 * 2, 'x'); + std::string num; + SetPerfLevel(kEnableTime); + + ASSERT_OK(dbfull()->Put(writeOpt, "k1", big_value)); + ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num)); + ASSERT_EQ(num, "0"); + ASSERT_TRUE(dbfull()->GetProperty("rocksdb.mem-table-flush-pending", &num)); + ASSERT_EQ(num, "0"); + ASSERT_TRUE(dbfull()->GetProperty("rocksdb.compaction-pending", &num)); + ASSERT_EQ(num, "0"); + perf_context.Reset(); + + ASSERT_OK(dbfull()->Put(writeOpt, "k2", big_value)); + ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num)); + ASSERT_EQ(num, "1"); + ASSERT_OK(dbfull()->Put(writeOpt, "k3", big_value)); + ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num)); + ASSERT_EQ(num, "2"); + ASSERT_TRUE(dbfull()->GetProperty("rocksdb.mem-table-flush-pending", &num)); + ASSERT_EQ(num, "1"); + ASSERT_TRUE(dbfull()->GetProperty("rocksdb.compaction-pending", &num)); + ASSERT_EQ(num, "0"); + + sleeping_task_high.WakeUp(); + sleeping_task_high.WaitUntilDone(); + dbfull()->TEST_WaitForFlushMemTable(); + + ASSERT_OK(dbfull()->Put(writeOpt, "k4", big_value)); + ASSERT_OK(dbfull()->Put(writeOpt, "k5", big_value)); + dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_TRUE(dbfull()->GetProperty("rocksdb.mem-table-flush-pending", &num)); + ASSERT_EQ(num, "0"); + 
ASSERT_TRUE(dbfull()->GetProperty("rocksdb.compaction-pending", &num)); + ASSERT_EQ(num, "1"); + sleeping_task_low.WakeUp(); + sleeping_task_low.WaitUntilDone(); +} + +TEST(DBTest, FLUSH) { + do { + CreateAndReopenWithCF({"pikachu"}); + WriteOptions writeOpt = WriteOptions(); + writeOpt.disableWAL = true; + SetPerfLevel(kEnableTime);; + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v1")); + // this will now also flush the last 2 writes + ASSERT_OK(Flush(1)); + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v1")); + + perf_context.Reset(); + Get(1, "foo"); + ASSERT_TRUE((int) perf_context.get_from_output_files_time > 0); + + ReopenWithColumnFamilies({"default", "pikachu"}); + ASSERT_EQ("v1", Get(1, "foo")); + ASSERT_EQ("v1", Get(1, "bar")); + + writeOpt.disableWAL = true; + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v2")); + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v2")); + ASSERT_OK(Flush(1)); + + ReopenWithColumnFamilies({"default", "pikachu"}); + ASSERT_EQ("v2", Get(1, "bar")); + perf_context.Reset(); + ASSERT_EQ("v2", Get(1, "foo")); + ASSERT_TRUE((int) perf_context.get_from_output_files_time > 0); + + writeOpt.disableWAL = false; + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v3")); + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v3")); + ASSERT_OK(Flush(1)); + + ReopenWithColumnFamilies({"default", "pikachu"}); + // 'foo' should be there because its put + // has WAL enabled. + ASSERT_EQ("v3", Get(1, "foo")); + ASSERT_EQ("v3", Get(1, "bar")); + + SetPerfLevel(kDisable); + } while (ChangeCompactOptions()); +} + +TEST(DBTest, RecoveryWithEmptyLog) { + do { + CreateAndReopenWithCF({"pikachu"}); + ASSERT_OK(Put(1, "foo", "v1")); + ASSERT_OK(Put(1, "foo", "v2")); + ReopenWithColumnFamilies({"default", "pikachu"}); + ReopenWithColumnFamilies({"default", "pikachu"}); + ASSERT_OK(Put(1, "foo", "v3")); + ReopenWithColumnFamilies({"default", "pikachu"}); + ASSERT_EQ("v3", Get(1, "foo")); + } while (ChangeOptions()); +} + +// Check that writes done during a memtable compaction are recovered +// if the database is shutdown during the memtable compaction. 
+TEST(DBTest, RecoverDuringMemtableCompaction) {
+  do {
+    Options options;
+    options.env = env_;
+    options.write_buffer_size = 1000000;
+    options = CurrentOptions(options);
+    CreateAndReopenWithCF({"pikachu"}, &options);
+
+    // Trigger a long memtable compaction and reopen the database during it
+    ASSERT_OK(Put(1, "foo", "v1"));                         // Goes to 1st log file
+    ASSERT_OK(Put(1, "big1", std::string(10000000, 'x')));  // Fills memtable
+    ASSERT_OK(Put(1, "big2", std::string(1000, 'y')));      // Triggers compaction
+    ASSERT_OK(Put(1, "bar", "v2"));                         // Goes to new log file
+
+    ReopenWithColumnFamilies({"default", "pikachu"}, &options);
+    ASSERT_EQ("v1", Get(1, "foo"));
+    ASSERT_EQ("v2", Get(1, "bar"));
+    ASSERT_EQ(std::string(10000000, 'x'), Get(1, "big1"));
+    ASSERT_EQ(std::string(1000, 'y'), Get(1, "big2"));
+  } while (ChangeOptions());
+}
+
+TEST(DBTest, MinorCompactionsHappen) {
+  do {
+    Options options;
+    options.write_buffer_size = 10000;
+    options = CurrentOptions(options);
+    CreateAndReopenWithCF({"pikachu"}, &options);
+
+    const int N = 500;
+
+    int starting_num_tables = TotalTableFiles(1);
+    for (int i = 0; i < N; i++) {
+      ASSERT_OK(Put(1, Key(i), Key(i) + std::string(1000, 'v')));
+    }
+    int ending_num_tables = TotalTableFiles(1);
+    ASSERT_GT(ending_num_tables, starting_num_tables);
+
+    for (int i = 0; i < N; i++) {
+      ASSERT_EQ(Key(i) + std::string(1000, 'v'), Get(1, Key(i)));
+    }
+
+    ReopenWithColumnFamilies({"default", "pikachu"}, &options);
+
+    for (int i = 0; i < N; i++) {
+      ASSERT_EQ(Key(i) + std::string(1000, 'v'), Get(1, Key(i)));
+    }
+  } while (ChangeCompactOptions());
+}
+
+TEST(DBTest, ManifestRollOver) {
+  do {
+    Options options;
+    options.max_manifest_file_size = 10;  // 10 bytes
+    options = CurrentOptions(options);
+    CreateAndReopenWithCF({"pikachu"}, &options);
+    {
+      ASSERT_OK(Put(1, "manifest_key1", std::string(1000, '1')));
+      ASSERT_OK(Put(1, "manifest_key2", std::string(1000, '2')));
+      ASSERT_OK(Put(1, "manifest_key3", std::string(1000, '3')));
+      uint64_t manifest_before_flush = dbfull()->TEST_Current_Manifest_FileNo();
+      ASSERT_OK(Flush(1));  // This should trigger LogAndApply.
+      uint64_t manifest_after_flush = dbfull()->TEST_Current_Manifest_FileNo();
+      ASSERT_GT(manifest_after_flush, manifest_before_flush);
+      ReopenWithColumnFamilies({"default", "pikachu"}, &options);
+      ASSERT_GT(dbfull()->TEST_Current_Manifest_FileNo(), manifest_after_flush);
+      // check if a new manifest file got inserted or not.
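+      // The values written before the manifest roll-over should still be
+      // intact: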
+      ASSERT_EQ(std::string(1000, '1'), Get(1, "manifest_key1"));
+      ASSERT_EQ(std::string(1000, '2'), Get(1, "manifest_key2"));
+      ASSERT_EQ(std::string(1000, '3'), Get(1, "manifest_key3"));
+    }
+  } while (ChangeCompactOptions());
+}
+
+TEST(DBTest, IdentityAcrossRestarts) {
+  do {
+    std::string id1;
+    ASSERT_OK(db_->GetDbIdentity(id1));
+
+    Options options = CurrentOptions();
+    Reopen(&options);
+    std::string id2;
+    ASSERT_OK(db_->GetDbIdentity(id2));
+    // id1 should match id2 because identity was not regenerated
+    ASSERT_EQ(id1.compare(id2), 0);
+
+    std::string idfilename = IdentityFileName(dbname_);
+    ASSERT_OK(env_->DeleteFile(idfilename));
+    Reopen(&options);
+    std::string id3;
+    ASSERT_OK(db_->GetDbIdentity(id3));
+    // id1 should NOT match id3 because identity was regenerated
+    ASSERT_NE(id1.compare(id3), 0);
+  } while (ChangeCompactOptions());
+}
+
+TEST(DBTest, RecoverWithLargeLog) {
+  do {
+    {
+      Options options = CurrentOptions();
+      CreateAndReopenWithCF({"pikachu"}, &options);
+      ASSERT_OK(Put(1, "big1", std::string(200000, '1')));
+      ASSERT_OK(Put(1, "big2", std::string(200000, '2')));
+      ASSERT_OK(Put(1, "small3", std::string(10, '3')));
+      ASSERT_OK(Put(1, "small4", std::string(10, '4')));
+      ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+    }
+
+    // Make sure that if we re-open with a small write buffer size that
+    // we flush table files in the middle of a large log file.
+    Options options;
+    options.write_buffer_size = 100000;
+    options = CurrentOptions(options);
+    ReopenWithColumnFamilies({"default", "pikachu"}, &options);
+    ASSERT_EQ(NumTableFilesAtLevel(0, 1), 3);
+    ASSERT_EQ(std::string(200000, '1'), Get(1, "big1"));
+    ASSERT_EQ(std::string(200000, '2'), Get(1, "big2"));
+    ASSERT_EQ(std::string(10, '3'), Get(1, "small3"));
+    ASSERT_EQ(std::string(10, '4'), Get(1, "small4"));
+    ASSERT_GT(NumTableFilesAtLevel(0, 1), 1);
+  } while (ChangeCompactOptions());
+}
+
+TEST(DBTest, CompactionsGenerateMultipleFiles) {
+  Options options;
+  options.write_buffer_size = 100000000;  // Large write buffer
+  options = CurrentOptions(options);
+  CreateAndReopenWithCF({"pikachu"}, &options);
+
+  Random rnd(301);
+
+  // Write 8MB (80 values, each 100K)
+  ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+  std::vector<std::string> values;
+  for (int i = 0; i < 80; i++) {
+    values.push_back(RandomString(&rnd, 100000));
+    ASSERT_OK(Put(1, Key(i), values[i]));
+  }
+
+  // Reopening moves updates to level-0
+  ReopenWithColumnFamilies({"default", "pikachu"}, &options);
+  dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]);
+
+  ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+  ASSERT_GT(NumTableFilesAtLevel(1, 1), 1);
+  for (int i = 0; i < 80; i++) {
+    ASSERT_EQ(Get(1, Key(i)), values[i]);
+  }
+}
+
+TEST(DBTest, CompactionTrigger) {
+  Options options;
+  options.write_buffer_size = 100 << 10;  // 100KB
+  options.num_levels = 3;
+  options.max_mem_compaction_level = 0;
+  options.level0_file_num_compaction_trigger = 3;
+  options = CurrentOptions(options);
+  CreateAndReopenWithCF({"pikachu"}, &options);
+
+  Random rnd(301);
+
+  for (int num = 0; num < options.level0_file_num_compaction_trigger - 1;
+       num++) {
+    std::vector<std::string> values;
+    // Write 120KB (12 values, each 10K)
+    for (int i = 0; i < 12; i++) {
+      values.push_back(RandomString(&rnd, 10000));
+      ASSERT_OK(Put(1, Key(i), values[i]));
+    }
+    dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
+    ASSERT_EQ(NumTableFilesAtLevel(0, 1), num + 1);
+  }
+
+  // generate one more file in level-0, which should trigger level-0 compaction
+  std::vector<std::string> values;
+  for (int i = 0; i < 12; i++) {
+    values.push_back(RandomString(&rnd, 10000));
+    ASSERT_OK(Put(1, Key(i), values[i]));
+  }
+  dbfull()->TEST_WaitForCompact();
+
+  ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(1, 1), 1);
+}
+
+// This is a static filter used for filtering
+// kvs during the compaction process.
+static int cfilter_count;
+static std::string NEW_VALUE = "NewValue";
+
+class KeepFilter : public CompactionFilter {
+ public:
+  virtual bool Filter(int level, const Slice& key, const Slice& value,
+                      std::string* new_value, bool* value_changed) const
+      override {
+    cfilter_count++;
+    return false;
+  }
+
+  virtual const char* Name() const override { return "KeepFilter"; }
+};
+
+class DeleteFilter : public CompactionFilter {
+ public:
+  virtual bool Filter(int level, const Slice& key, const Slice& value,
+                      std::string* new_value, bool* value_changed) const
+      override {
+    cfilter_count++;
+    return true;
+  }
+
+  virtual const char* Name() const override { return "DeleteFilter"; }
+};
+
+class ChangeFilter : public CompactionFilter {
+ public:
+  explicit ChangeFilter() {}
+
+  virtual bool Filter(int level, const Slice& key, const Slice& value,
+                      std::string* new_value, bool* value_changed) const
+      override {
+    assert(new_value != nullptr);
+    *new_value = NEW_VALUE;
+    *value_changed = true;
+    return false;
+  }
+
+  virtual const char* Name() const override { return "ChangeFilter"; }
+};
+
+class KeepFilterFactory : public CompactionFilterFactory {
+ public:
+  explicit KeepFilterFactory(bool check_context = false)
+      : check_context_(check_context) {}
+
+  virtual std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+      const CompactionFilter::Context& context) override {
+    if (check_context_) {
+      ASSERT_EQ(expect_full_compaction_.load(), context.is_full_compaction);
+      ASSERT_EQ(expect_manual_compaction_.load(), context.is_manual_compaction);
+    }
+    return std::unique_ptr<CompactionFilter>(new KeepFilter());
+  }
+
+  virtual const char* Name() const override { return "KeepFilterFactory"; }
+  bool check_context_;
+  std::atomic_bool expect_full_compaction_;
+  std::atomic_bool expect_manual_compaction_;
+};
+
+class DeleteFilterFactory : public CompactionFilterFactory {
+ public:
+  virtual std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+      const CompactionFilter::Context& context) override {
+    if (context.is_manual_compaction) {
+      return std::unique_ptr<CompactionFilter>(new DeleteFilter());
+    } else {
+      return std::unique_ptr<CompactionFilter>(nullptr);
+    }
+  }
+
+  virtual const char* Name() const override { return "DeleteFilterFactory"; }
+};
+
+class ChangeFilterFactory : public CompactionFilterFactory {
+ public:
+  explicit ChangeFilterFactory() {}
+
+  virtual std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+      const CompactionFilter::Context& context) override {
+    return std::unique_ptr<CompactionFilter>(new ChangeFilter());
+  }
+
+  virtual const char* Name() const override { return "ChangeFilterFactory"; }
+};
+
+// TODO(kailiu) The tests on UniversalCompaction have some issues:
+//   1. A lot of magic numbers ("11" or "12").
+//   2. Made assumptions on the memtable flush conditions, which may change
+//      from time to time.
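+//
+// For reference, the size-ratio picking these tests lean on works roughly as
+// follows (a sketch under assumed file sizes, not the actual picker code):
+// starting from the newest level-0 file, an older file joins the candidate
+// set only while its size stays within size_ratio percent of the bytes
+// accumulated so far:
+//
+//   uint64_t candidate_size = newest_file_size;
+//   while (next_older_file_size <= candidate_size * (100 + size_ratio) / 100) {
+//     candidate_size += next_older_file_size;  // include it, keep scanning
+//   }
+//
+// which is why a run of fresh size-1 files compacts together while an older
+// size-4 file is left alone until enough newer data accumulates.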
+TEST(DBTest, UniversalCompactionTrigger) {
+  Options options;
+  options.compaction_style = kCompactionStyleUniversal;
+  options.write_buffer_size = 100 << 10;  // 100KB
+  // trigger compaction if there are >= 4 files
+  options.level0_file_num_compaction_trigger = 4;
+  KeepFilterFactory* filter = new KeepFilterFactory(true);
+  filter->expect_manual_compaction_.store(false);
+  options.compaction_filter_factory.reset(filter);
+
+  options = CurrentOptions(options);
+  CreateAndReopenWithCF({"pikachu"}, &options);
+
+  Random rnd(301);
+  int key_idx = 0;
+
+  filter->expect_full_compaction_.store(true);
+  // Stage 1:
+  // Generate a set of files at level 0, but don't trigger level-0
+  // compaction.
+  for (int num = 0; num < options.level0_file_num_compaction_trigger - 1;
+       num++) {
+    // Write 120KB (12 values, each 10K)
+    for (int i = 0; i < 12; i++) {
+      ASSERT_OK(Put(1, Key(key_idx), RandomString(&rnd, 10000)));
+      key_idx++;
+    }
+    dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
+    ASSERT_EQ(NumTableFilesAtLevel(0, 1), num + 1);
+  }
+
+  // Generate one more file at level-0, which should trigger level-0
+  // compaction.
+  for (int i = 0; i < 11; i++) {
+    ASSERT_OK(Put(1, Key(key_idx), RandomString(&rnd, 10000)));
+    key_idx++;
+  }
+  dbfull()->TEST_WaitForCompact();
+  // Suppose each file flushed from mem table has size 1. Now we compact
+  // level0_file_num_compaction_trigger = 4 files and should have a big
+  // file of size 4.
+  ASSERT_EQ(NumTableFilesAtLevel(0, 1), 1);
+  for (int i = 1; i < options.num_levels; i++) {
+    ASSERT_EQ(NumTableFilesAtLevel(i, 1), 0);
+  }
+
+  // Stage 2:
+  // Now we have one file at level 0, with size 4. We also have some data in
+  // mem table. Let's continue generating new files at level 0, but don't
+  // trigger level-0 compaction.
+  // First, clean up memtable before inserting new data. This will generate
+  // a level-0 file, with size around 0.4 (according to previously written
+  // data amount).
+  filter->expect_full_compaction_.store(false);
+  ASSERT_OK(Flush(1));
+  for (int num = 0; num < options.level0_file_num_compaction_trigger - 3;
+       num++) {
+    // Write 110KB (11 values, each 10K)
+    for (int i = 0; i < 11; i++) {
+      ASSERT_OK(Put(1, Key(key_idx), RandomString(&rnd, 10000)));
+      key_idx++;
+    }
+    dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
+    ASSERT_EQ(NumTableFilesAtLevel(0, 1), num + 3);
+  }
+
+  // Generate one more file at level-0, which should trigger level-0
+  // compaction.
+  for (int i = 0; i < 11; i++) {
+    ASSERT_OK(Put(1, Key(key_idx), RandomString(&rnd, 10000)));
+    key_idx++;
+  }
+  dbfull()->TEST_WaitForCompact();
+  // Before compaction, we have 4 files at level 0, with size 4, 0.4, 1, 1.
+  // After compaction, we should have 2 files, with size 4, 2.4.
+  ASSERT_EQ(NumTableFilesAtLevel(0, 1), 2);
+  for (int i = 1; i < options.num_levels; i++) {
+    ASSERT_EQ(NumTableFilesAtLevel(i, 1), 0);
+  }
+
+  // Stage 3:
+  // Now we have 2 files at level 0, with size 4 and 2.4. Continue
+  // generating new files at level 0.
+  for (int num = 0; num < options.level0_file_num_compaction_trigger - 3;
+       num++) {
+    // Write 110KB (11 values, each 10K)
+    for (int i = 0; i < 11; i++) {
+      ASSERT_OK(Put(1, Key(key_idx), RandomString(&rnd, 10000)));
+      key_idx++;
+    }
+    dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
+    ASSERT_EQ(NumTableFilesAtLevel(0, 1), num + 3);
+  }
+
+  // Generate one more file at level-0, which should trigger level-0
+  // compaction.
+ for (int i = 0; i < 12; i++) { + ASSERT_OK(Put(1, Key(key_idx), RandomString(&rnd, 10000))); + key_idx++; + } + dbfull()->TEST_WaitForCompact(); + // Before compaction, we have 4 files at level 0, with size 4, 2.4, 1, 1. + // After comapction, we should have 3 files, with size 4, 2.4, 2. + ASSERT_EQ(NumTableFilesAtLevel(0, 1), 3); + for (int i = 1; i < options.num_levels ; i++) { + ASSERT_EQ(NumTableFilesAtLevel(i, 1), 0); + } + + // Stage 4: + // Now we have 3 files at level 0, with size 4, 2.4, 2. Let's generate a + // new file of size 1. + for (int i = 0; i < 11; i++) { + ASSERT_OK(Put(1, Key(key_idx), RandomString(&rnd, 10000))); + key_idx++; + } + dbfull()->TEST_WaitForCompact(); + // Level-0 compaction is triggered, but no file will be picked up. + ASSERT_EQ(NumTableFilesAtLevel(0, 1), 4); + for (int i = 1; i < options.num_levels ; i++) { + ASSERT_EQ(NumTableFilesAtLevel(i, 1), 0); + } + + // Stage 5: + // Now we have 4 files at level 0, with size 4, 2.4, 2, 1. Let's generate + // a new file of size 1. + filter->expect_full_compaction_.store(true); + for (int i = 0; i < 11; i++) { + ASSERT_OK(Put(1, Key(key_idx), RandomString(&rnd, 10000))); + key_idx++; + } + dbfull()->TEST_WaitForCompact(); + // All files at level 0 will be compacted into a single one. + ASSERT_EQ(NumTableFilesAtLevel(0, 1), 1); + for (int i = 1; i < options.num_levels ; i++) { + ASSERT_EQ(NumTableFilesAtLevel(i, 1), 0); + } +} + +TEST(DBTest, UniversalCompactionSizeAmplification) { + Options options; + options.compaction_style = kCompactionStyleUniversal; + options.write_buffer_size = 100<<10; //100KB + options.level0_file_num_compaction_trigger = 3; + CreateAndReopenWithCF({"pikachu"}, &options); + + // Trigger compaction if size amplification exceeds 110% + options.compaction_options_universal.max_size_amplification_percent = 110; + options = CurrentOptions(options); + ReopenWithColumnFamilies({"default", "pikachu"}, &options); + + Random rnd(301); + int key_idx = 0; + + // Generate two files in Level 0. Both files are approx the same size. + for (int num = 0; num < options.level0_file_num_compaction_trigger - 1; + num++) { + // Write 110KB (11 values, each 10K) + for (int i = 0; i < 11; i++) { + ASSERT_OK(Put(1, Key(key_idx), RandomString(&rnd, 10000))); + key_idx++; + } + dbfull()->TEST_WaitForFlushMemTable(handles_[1]); + ASSERT_EQ(NumTableFilesAtLevel(0, 1), num + 1); + } + ASSERT_EQ(NumTableFilesAtLevel(0, 1), 2); + + // Flush whatever is remaining in memtable. This is typically + // small, which should not trigger size ratio based compaction + // but will instead trigger size amplification. 
+  ASSERT_OK(Flush(1));
+
+  dbfull()->TEST_WaitForCompact();
+
+  // Verify that size amplification did occur
+  ASSERT_EQ(NumTableFilesAtLevel(0, 1), 1);
+}
+
+TEST(DBTest, UniversalCompactionOptions) {
+  Options options;
+  options.compaction_style = kCompactionStyleUniversal;
+  options.write_buffer_size = 100 << 10;  // 100KB
+  options.level0_file_num_compaction_trigger = 4;
+  options.num_levels = 1;
+  options.compaction_options_universal.compression_size_percent = -1;
+  options = CurrentOptions(options);
+  CreateAndReopenWithCF({"pikachu"}, &options);
+
+  Random rnd(301);
+  int key_idx = 0;
+
+  for (int num = 0; num < options.level0_file_num_compaction_trigger; num++) {
+    // Write 110KB (11 values, each 10K)
+    for (int i = 0; i < 11; i++) {
+      ASSERT_OK(Put(1, Key(key_idx), RandomString(&rnd, 10000)));
+      key_idx++;
+    }
+    dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
+
+    if (num < options.level0_file_num_compaction_trigger - 1) {
+      ASSERT_EQ(NumTableFilesAtLevel(0, 1), num + 1);
+    }
+  }
+
+  dbfull()->TEST_WaitForCompact();
+  ASSERT_EQ(NumTableFilesAtLevel(0, 1), 1);
+  for (int i = 1; i < options.num_levels; i++) {
+    ASSERT_EQ(NumTableFilesAtLevel(i, 1), 0);
+  }
+}
+
+TEST(DBTest, UniversalCompactionStopStyleSimilarSize) {
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleUniversal;
+  options.write_buffer_size = 100 << 10;  // 100KB
+  // trigger compaction if there are >= 4 files
+  options.level0_file_num_compaction_trigger = 4;
+  options.compaction_options_universal.size_ratio = 10;
+  options.compaction_options_universal.stop_style =
+      kCompactionStopStyleSimilarSize;
+  options.num_levels = 1;
+  Reopen(&options);
+
+  Random rnd(301);
+  int key_idx = 0;
+
+  // Stage 1:
+  // Generate a set of files at level 0, but don't trigger level-0
+  // compaction.
+  for (int num = 0; num < options.level0_file_num_compaction_trigger - 1;
+       num++) {
+    // Write 110KB (11 values, each 10K)
+    for (int i = 0; i < 11; i++) {
+      ASSERT_OK(Put(Key(key_idx), RandomString(&rnd, 10000)));
+      key_idx++;
+    }
+    dbfull()->TEST_WaitForFlushMemTable();
+    ASSERT_EQ(NumTableFilesAtLevel(0), num + 1);
+  }
+
+  // Generate one more file at level-0, which should trigger level-0
+  // compaction.
+  for (int i = 0; i < 11; i++) {
+    ASSERT_OK(Put(Key(key_idx), RandomString(&rnd, 10000)));
+    key_idx++;
+  }
+  dbfull()->TEST_WaitForCompact();
+  // Suppose each file flushed from mem table has size 1. Now we compact
+  // level0_file_num_compaction_trigger = 4 files and should have a big
+  // file of size 4.
+  ASSERT_EQ(NumTableFilesAtLevel(0), 1);
+
+  // Stage 2:
+  // Now we have one file at level 0, with size 4. We also have some data in
+  // mem table. Let's continue generating new files at level 0, but don't
+  // trigger level-0 compaction.
+  // First, clean up memtable before inserting new data. This will generate
+  // a level-0 file, with size around 0.4 (according to previously written
+  // data amount).
+  dbfull()->Flush(FlushOptions());
+  for (int num = 0; num < options.level0_file_num_compaction_trigger - 3;
+       num++) {
+    // Write 110KB (11 values, each 10K)
+    for (int i = 0; i < 11; i++) {
+      ASSERT_OK(Put(Key(key_idx), RandomString(&rnd, 10000)));
+      key_idx++;
+    }
+    dbfull()->TEST_WaitForFlushMemTable();
+    ASSERT_EQ(NumTableFilesAtLevel(0), num + 3);
+  }
+
+  // Generate one more file at level-0, which should trigger level-0
+  // compaction.
+ for (int i = 0; i < 11; i++) { + ASSERT_OK(Put(Key(key_idx), RandomString(&rnd, 10000))); + key_idx++; + } + dbfull()->TEST_WaitForCompact(); + // Before compaction, we have 4 files at level 0, with size 4, 0.4, 1, 1. + // After compaction, we should have 3 files, with size 4, 0.4, 2. + ASSERT_EQ(NumTableFilesAtLevel(0), 3); + // Stage 3: + // Now we have 3 files at level 0, with size 4, 0.4, 2. Generate one + // more file at level-0, which should trigger level-0 compaction. + for (int i = 0; i < 11; i++) { + ASSERT_OK(Put(Key(key_idx), RandomString(&rnd, 10000))); + key_idx++; + } + dbfull()->TEST_WaitForCompact(); + // Level-0 compaction is triggered, but no file will be picked up. + ASSERT_EQ(NumTableFilesAtLevel(0), 4); +} + +#if defined(SNAPPY) +TEST(DBTest, CompressedCache) { + int num_iter = 80; + + // Run this test three iterations. + // Iteration 1: only a uncompressed block cache + // Iteration 2: only a compressed block cache + // Iteration 3: both block cache and compressed cache + // Iteration 4: both block cache and compressed cache, but DB is not + // compressed + for (int iter = 0; iter < 4; iter++) { + Options options = CurrentOptions(); + options.write_buffer_size = 64*1024; // small write buffer + options.statistics = rocksdb::CreateDBStatistics(); + + switch (iter) { + case 0: + // only uncompressed block cache + options.block_cache = NewLRUCache(8*1024); + options.block_cache_compressed = nullptr; + break; + case 1: + // no block cache, only compressed cache + options.no_block_cache = true; + options.block_cache = nullptr; + options.block_cache_compressed = NewLRUCache(8*1024); + break; + case 2: + // both compressed and uncompressed block cache + options.block_cache = NewLRUCache(1024); + options.block_cache_compressed = NewLRUCache(8*1024); + break; + case 3: + // both block cache and compressed cache, but DB is not compressed + // also, make block cache sizes bigger, to trigger block cache hits + options.block_cache = NewLRUCache(1024 * 1024); + options.block_cache_compressed = NewLRUCache(8 * 1024 * 1024); + options.compression = kNoCompression; + break; + default: + ASSERT_TRUE(false); + } + CreateAndReopenWithCF({"pikachu"}, &options); + // default column family doesn't have block cache + Options no_block_cache_opts; + no_block_cache_opts.no_block_cache = true; + no_block_cache_opts.statistics = options.statistics; + options = CurrentOptions(options); + ReopenWithColumnFamilies({"default", "pikachu"}, + {&no_block_cache_opts, &options}); + + Random rnd(301); + + // Write 8MB (80 values, each 100K) + ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); + std::vector values; + std::string str; + for (int i = 0; i < num_iter; i++) { + if (i % 4 == 0) { // high compression ratio + str = RandomString(&rnd, 1000); + } + values.push_back(str); + ASSERT_OK(Put(1, Key(i), values[i])); + } + + // flush all data from memtable so that reads are from block cache + ASSERT_OK(Flush(1)); + + for (int i = 0; i < num_iter; i++) { + ASSERT_EQ(Get(1, Key(i)), values[i]); + } + + // check that we triggered the appropriate code paths in the cache + switch (iter) { + case 0: + // only uncompressed block cache + ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_MISS), 0); + ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS), 0); + break; + case 1: + // no block cache, only compressed cache + ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_MISS), 0); + ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS), 0); + break; + case 2: + // both compressed and uncompressed 
block cache + ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_MISS), 0); + ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS), 0); + break; + case 3: + // both compressed and uncompressed block cache + ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_MISS), 0); + ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_HIT), 0); + ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS), 0); + // compressed doesn't have any hits since blocks are not compressed on + // storage + ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_HIT), 0); + break; + default: + ASSERT_TRUE(false); + } + + options.create_if_missing = true; + DestroyAndReopen(&options); + } +} + +static std::string CompressibleString(Random* rnd, int len) { + std::string r; + test::CompressibleString(rnd, 0.8, len, &r); + return r; +} + +TEST(DBTest, UniversalCompactionCompressRatio1) { + Options options; + options.compaction_style = kCompactionStyleUniversal; + options.write_buffer_size = 100<<10; //100KB + options.level0_file_num_compaction_trigger = 2; + options.num_levels = 1; + options.compaction_options_universal.compression_size_percent = 70; + options = CurrentOptions(options); + Reopen(&options); + + Random rnd(301); + int key_idx = 0; + + // The first compaction (2) is compressed. + for (int num = 0; num < 2; num++) { + // Write 110KB (11 values, each 10K) + for (int i = 0; i < 11; i++) { + ASSERT_OK(Put(Key(key_idx), CompressibleString(&rnd, 10000))); + key_idx++; + } + dbfull()->TEST_WaitForFlushMemTable(); + dbfull()->TEST_WaitForCompact(); + } + ASSERT_LT((int)dbfull()->TEST_GetLevel0TotalSize(), 110000 * 2 * 0.9); + + // The second compaction (4) is compressed + for (int num = 0; num < 2; num++) { + // Write 110KB (11 values, each 10K) + for (int i = 0; i < 11; i++) { + ASSERT_OK(Put(Key(key_idx), CompressibleString(&rnd, 10000))); + key_idx++; + } + dbfull()->TEST_WaitForFlushMemTable(); + dbfull()->TEST_WaitForCompact(); + } + ASSERT_LT((int)dbfull()->TEST_GetLevel0TotalSize(), 110000 * 4 * 0.9); + + // The third compaction (2 4) is compressed since this time it is + // (1 1 3.2) and 3.2/5.2 doesn't reach ratio. + for (int num = 0; num < 2; num++) { + // Write 110KB (11 values, each 10K) + for (int i = 0; i < 11; i++) { + ASSERT_OK(Put(Key(key_idx), CompressibleString(&rnd, 10000))); + key_idx++; + } + dbfull()->TEST_WaitForFlushMemTable(); + dbfull()->TEST_WaitForCompact(); + } + ASSERT_LT((int)dbfull()->TEST_GetLevel0TotalSize(), 110000 * 6 * 0.9); + + // When we start for the compaction up to (2 4 8), the latest + // compressed is not compressed. + for (int num = 0; num < 8; num++) { + // Write 110KB (11 values, each 10K) + for (int i = 0; i < 11; i++) { + ASSERT_OK(Put(Key(key_idx), CompressibleString(&rnd, 10000))); + key_idx++; + } + dbfull()->TEST_WaitForFlushMemTable(); + dbfull()->TEST_WaitForCompact(); + } + ASSERT_GT((int)dbfull()->TEST_GetLevel0TotalSize(), + 110000 * 11 * 0.8 + 110000 * 2); +} + +TEST(DBTest, UniversalCompactionCompressRatio2) { + Options options; + options.compaction_style = kCompactionStyleUniversal; + options.write_buffer_size = 100<<10; //100KB + options.level0_file_num_compaction_trigger = 2; + options.num_levels = 1; + options.compaction_options_universal.compression_size_percent = 95; + options = CurrentOptions(options); + Reopen(&options); + + Random rnd(301); + int key_idx = 0; + + // When we start for the compaction up to (2 4 8), the latest + // compressed is compressed given the size ratio to compress. 
+ for (int num = 0; num < 14; num++) { + // Write 120KB (12 values, each 10K) + for (int i = 0; i < 12; i++) { + ASSERT_OK(Put(Key(key_idx), CompressibleString(&rnd, 10000))); + key_idx++; + } + dbfull()->TEST_WaitForFlushMemTable(); + dbfull()->TEST_WaitForCompact(); + } + ASSERT_LT((int)dbfull()->TEST_GetLevel0TotalSize(), + 120000 * 12 * 0.8 + 120000 * 2); +} +#endif + +TEST(DBTest, ConvertCompactionStyle) { + Random rnd(301); + int max_key_level_insert = 200; + int max_key_universal_insert = 600; + + // Stage 1: generate a db with level compaction + Options options; + options.write_buffer_size = 100<<10; //100KB + options.num_levels = 4; + options.level0_file_num_compaction_trigger = 3; + options.max_bytes_for_level_base = 500<<10; // 500KB + options.max_bytes_for_level_multiplier = 1; + options.target_file_size_base = 200<<10; // 200KB + options.target_file_size_multiplier = 1; + options = CurrentOptions(options); + CreateAndReopenWithCF({"pikachu"}, &options); + + for (int i = 0; i <= max_key_level_insert; i++) { + // each value is 10K + ASSERT_OK(Put(1, Key(i), RandomString(&rnd, 10000))); + } + ASSERT_OK(Flush(1)); + dbfull()->TEST_WaitForCompact(); + + ASSERT_GT(TotalTableFiles(1, 4), 1); + int non_level0_num_files = 0; + for (int i = 1; i < options.num_levels; i++) { + non_level0_num_files += NumTableFilesAtLevel(i, 1); + } + ASSERT_GT(non_level0_num_files, 0); + + // Stage 2: reopen with universal compaction - should fail + options = CurrentOptions(); + options.compaction_style = kCompactionStyleUniversal; + options = CurrentOptions(options); + Status s = TryReopenWithColumnFamilies({"default", "pikachu"}, &options); + ASSERT_TRUE(s.IsInvalidArgument()); + + // Stage 3: compact into a single file and move the file to level 0 + options = CurrentOptions(); + options.disable_auto_compactions = true; + options.target_file_size_base = INT_MAX; + options.target_file_size_multiplier = 1; + options.max_bytes_for_level_base = INT_MAX; + options.max_bytes_for_level_multiplier = 1; + options = CurrentOptions(options); + ReopenWithColumnFamilies({"default", "pikachu"}, &options); + + dbfull()->CompactRange(handles_[1], nullptr, nullptr, true /* reduce level */, + 0 /* reduce to level 0 */); + + for (int i = 0; i < options.num_levels; i++) { + int num = NumTableFilesAtLevel(i, 1); + if (i == 0) { + ASSERT_EQ(num, 1); + } else { + ASSERT_EQ(num, 0); + } + } + + // Stage 4: re-open in universal compaction style and do some db operations + options = CurrentOptions(); + options.compaction_style = kCompactionStyleUniversal; + options.write_buffer_size = 100<<10; //100KB + options.level0_file_num_compaction_trigger = 3; + options = CurrentOptions(options); + ReopenWithColumnFamilies({"default", "pikachu"}, &options); + + for (int i = max_key_level_insert / 2; i <= max_key_universal_insert; i++) { + ASSERT_OK(Put(1, Key(i), RandomString(&rnd, 10000))); + } + dbfull()->Flush(FlushOptions()); + ASSERT_OK(Flush(1)); + dbfull()->TEST_WaitForCompact(); + + for (int i = 1; i < options.num_levels; i++) { + ASSERT_EQ(NumTableFilesAtLevel(i, 1), 0); + } + + // verify keys inserted in both level compaction style and universal + // compaction style + std::string keys_in_db; + Iterator* iter = dbfull()->NewIterator(ReadOptions(), handles_[1]); + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + keys_in_db.append(iter->key().ToString()); + keys_in_db.push_back(','); + } + delete iter; + + std::string expected_keys; + for (int i = 0; i <= max_key_universal_insert; i++) { + expected_keys.append(Key(i)); 
+    expected_keys.push_back(',');
+  }
+
+  ASSERT_EQ(keys_in_db, expected_keys);
+}
+
+namespace {
+void MinLevelHelper(DBTest* self, Options& options) {
+  Random rnd(301);
+
+  for (int num = 0; num < options.level0_file_num_compaction_trigger - 1;
+       num++) {
+    std::vector<std::string> values;
+    // Write 120KB (12 values, each 10K)
+    for (int i = 0; i < 12; i++) {
+      values.push_back(RandomString(&rnd, 10000));
+      ASSERT_OK(self->Put(Key(i), values[i]));
+    }
+    self->dbfull()->TEST_WaitForFlushMemTable();
+    ASSERT_EQ(self->NumTableFilesAtLevel(0), num + 1);
+  }
+
+  // generate one more file in level-0, which should trigger level-0 compaction
+  std::vector<std::string> values;
+  for (int i = 0; i < 12; i++) {
+    values.push_back(RandomString(&rnd, 10000));
+    ASSERT_OK(self->Put(Key(i), values[i]));
+  }
+  self->dbfull()->TEST_WaitForCompact();
+
+  ASSERT_EQ(self->NumTableFilesAtLevel(0), 0);
+  ASSERT_EQ(self->NumTableFilesAtLevel(1), 1);
+}
+
+// returns false if the calling test should be skipped
+bool MinLevelToCompress(CompressionType& type, Options& options, int wbits,
+                        int lev, int strategy) {
+  fprintf(stderr,
+          "Test with compression options : window_bits = %d, level = %d, "
+          "strategy = %d\n",
+          wbits, lev, strategy);
+  options.write_buffer_size = 100 << 10;  // 100KB
+  options.num_levels = 3;
+  options.max_mem_compaction_level = 0;
+  options.level0_file_num_compaction_trigger = 3;
+  options.create_if_missing = true;
+
+  if (SnappyCompressionSupported(CompressionOptions(wbits, lev, strategy))) {
+    type = kSnappyCompression;
+    fprintf(stderr, "using snappy\n");
+  } else if (ZlibCompressionSupported(
+                 CompressionOptions(wbits, lev, strategy))) {
+    type = kZlibCompression;
+    fprintf(stderr, "using zlib\n");
+  } else if (BZip2CompressionSupported(
+                 CompressionOptions(wbits, lev, strategy))) {
+    type = kBZip2Compression;
+    fprintf(stderr, "using bzip2\n");
+  } else if (LZ4CompressionSupported(
+                 CompressionOptions(wbits, lev, strategy))) {
+    type = kLZ4Compression;
+    fprintf(stderr, "using lz4\n");
+  } else if (LZ4HCCompressionSupported(
+                 CompressionOptions(wbits, lev, strategy))) {
+    type = kLZ4HCCompression;
+    fprintf(stderr, "using lz4hc\n");
+  } else {
+    fprintf(stderr, "skipping test, compression disabled\n");
+    return false;
+  }
+  options.compression_per_level.resize(options.num_levels);
+
+  // do not compress L0
+  for (int i = 0; i < 1; i++) {
+    options.compression_per_level[i] = kNoCompression;
+  }
+  for (int i = 1; i < options.num_levels; i++) {
+    options.compression_per_level[i] = type;
+  }
+  return true;
+}
+}  // namespace
+
+TEST(DBTest, MinLevelToCompress1) {
+  Options options = CurrentOptions();
+  CompressionType type;
+  if (!MinLevelToCompress(type, options, -14, -1, 0)) {
+    return;
+  }
+  Reopen(&options);
+  MinLevelHelper(this, options);
+
+  // do not compress L0 and L1
+  for (int i = 0; i < 2; i++) {
+    options.compression_per_level[i] = kNoCompression;
+  }
+  for (int i = 2; i < options.num_levels; i++) {
+    options.compression_per_level[i] = type;
+  }
+  DestroyAndReopen(&options);
+  MinLevelHelper(this, options);
+}
+
+TEST(DBTest, MinLevelToCompress2) {
+  Options options = CurrentOptions();
+  CompressionType type;
+  if (!MinLevelToCompress(type, options, 15, -1, 0)) {
+    return;
+  }
+  Reopen(&options);
+  MinLevelHelper(this, options);
+
+  // do not compress L0 and L1
+  for (int i = 0; i < 2; i++) {
+    options.compression_per_level[i] = kNoCompression;
+  }
+  for (int i = 2; i < options.num_levels; i++) {
+    options.compression_per_level[i] = type;
+  }
+  DestroyAndReopen(&options);
+  MinLevelHelper(this, options);
+}
+
+TEST(DBTest, RepeatedWritesToSameKey) {
+  do {
+    Options options;
+    options.env = env_;
+    options.write_buffer_size = 100000;  // Small write buffer
+    options = CurrentOptions(options);
+    CreateAndReopenWithCF({"pikachu"}, &options);
+
+    // We must have at most one file per level except for level-0,
+    // which may have up to kL0_StopWritesTrigger files.
+    const int kMaxFiles =
+        options.num_levels + options.level0_stop_writes_trigger;
+
+    Random rnd(301);
+    std::string value = RandomString(&rnd, 2 * options.write_buffer_size);
+    for (int i = 0; i < 5 * kMaxFiles; i++) {
+      ASSERT_OK(Put(1, "key", value));
+      ASSERT_LE(TotalTableFiles(1), kMaxFiles);
+    }
+  } while (ChangeCompactOptions());
+}
+
+TEST(DBTest, InPlaceUpdate) {
+  do {
+    Options options;
+    options.create_if_missing = true;
+    options.inplace_update_support = true;
+    options.env = env_;
+    options.write_buffer_size = 100000;
+    options = CurrentOptions(options);
+    CreateAndReopenWithCF({"pikachu"}, &options);
+
+    // Update key with values of smaller size
+    int numValues = 10;
+    for (int i = numValues; i > 0; i--) {
+      std::string value = DummyString(i, 'a');
+      ASSERT_OK(Put(1, "key", value));
+      ASSERT_EQ(value, Get(1, "key"));
+    }
+
+    // Only 1 instance for that key.
+    validateNumberOfEntries(1, 1);
+
+  } while (ChangeCompactOptions());
+}
+
+TEST(DBTest, InPlaceUpdateLargeNewValue) {
+  do {
+    Options options;
+    options.create_if_missing = true;
+    options.inplace_update_support = true;
+    options.env = env_;
+    options.write_buffer_size = 100000;
+    options = CurrentOptions(options);
+    CreateAndReopenWithCF({"pikachu"}, &options);
+
+    // Update key with values of larger size
+    int numValues = 10;
+    for (int i = 0; i < numValues; i++) {
+      std::string value = DummyString(i, 'a');
+      ASSERT_OK(Put(1, "key", value));
+      ASSERT_EQ(value, Get(1, "key"));
+    }
+
+    // All 10 updates exist in the internal iterator
+    validateNumberOfEntries(numValues, 1);
+
+  } while (ChangeCompactOptions());
+}
+
+TEST(DBTest, InPlaceUpdateCallbackSmallerSize) {
+  do {
+    Options options;
+    options.create_if_missing = true;
+    options.inplace_update_support = true;
+    options.env = env_;
+    options.write_buffer_size = 100000;
+    options.inplace_callback =
+        rocksdb::DBTest::updateInPlaceSmallerSize;
+    options = CurrentOptions(options);
+    CreateAndReopenWithCF({"pikachu"}, &options);
+
+    // Update key with values of smaller size
+    int numValues = 10;
+    ASSERT_OK(Put(1, "key", DummyString(numValues, 'a')));
+    ASSERT_EQ(DummyString(numValues, 'c'), Get(1, "key"));
+
+    for (int i = numValues; i > 0; i--) {
+      ASSERT_OK(Put(1, "key", DummyString(i, 'a')));
+      ASSERT_EQ(DummyString(i - 1, 'b'), Get(1, "key"));
+    }
+
+    // Only 1 instance for that key.
+    validateNumberOfEntries(1, 1);
+
+  } while (ChangeCompactOptions());
+}
+
+TEST(DBTest, InPlaceUpdateCallbackSmallerVarintSize) {
+  do {
+    Options options;
+    options.create_if_missing = true;
+    options.inplace_update_support = true;
+    options.env = env_;
+    options.write_buffer_size = 100000;
+    options.inplace_callback =
+        rocksdb::DBTest::updateInPlaceSmallerVarintSize;
+    options = CurrentOptions(options);
+    CreateAndReopenWithCF({"pikachu"}, &options);
+
+    // Update key with values of smaller varint size
+    int numValues = 265;
+    ASSERT_OK(Put(1, "key", DummyString(numValues, 'a')));
+    ASSERT_EQ(DummyString(numValues, 'c'), Get(1, "key"));
+
+    for (int i = numValues; i > 0; i--) {
+      ASSERT_OK(Put(1, "key", DummyString(i, 'a')));
+      ASSERT_EQ(DummyString(1, 'b'), Get(1, "key"));
+    }
+
+    // Only 1 instance for that key.
+    validateNumberOfEntries(1, 1);
+
+  } while (ChangeCompactOptions());
+}
+
+TEST(DBTest, InPlaceUpdateCallbackLargeNewValue) {
+  do {
+    Options options;
+    options.create_if_missing = true;
+    options.inplace_update_support = true;
+    options.env = env_;
+    options.write_buffer_size = 100000;
+    options.inplace_callback =
+        rocksdb::DBTest::updateInPlaceLargerSize;
+    options = CurrentOptions(options);
+    CreateAndReopenWithCF({"pikachu"}, &options);
+
+    // Update key with values of larger size
+    int numValues = 10;
+    for (int i = 0; i < numValues; i++) {
+      ASSERT_OK(Put(1, "key", DummyString(i, 'a')));
+      ASSERT_EQ(DummyString(i, 'c'), Get(1, "key"));
+    }
+
+    // No inplace updates. All updates are puts with new seq number
+    // All 10 updates exist in the internal iterator
+    validateNumberOfEntries(numValues, 1);
+
+  } while (ChangeCompactOptions());
+}
+
+TEST(DBTest, InPlaceUpdateCallbackNoAction) {
+  do {
+    Options options;
+    options.create_if_missing = true;
+    options.inplace_update_support = true;
+    options.env = env_;
+    options.write_buffer_size = 100000;
+    options.inplace_callback =
+        rocksdb::DBTest::updateInPlaceNoAction;
+    options = CurrentOptions(options);
+    CreateAndReopenWithCF({"pikachu"}, &options);
+
+    // Callback function requests no actions from db
+    ASSERT_OK(Put(1, "key", DummyString(1, 'a')));
+    ASSERT_EQ(Get(1, "key"), "NOT_FOUND");
+
+  } while (ChangeCompactOptions());
+}
+
+TEST(DBTest, CompactionFilter) {
+  Options options = CurrentOptions();
+  options.max_open_files = -1;
+  options.num_levels = 3;
+  options.max_mem_compaction_level = 0;
+  options.compaction_filter_factory = std::make_shared<KeepFilterFactory>();
+  options = CurrentOptions(options);
+  CreateAndReopenWithCF({"pikachu"}, &options);
+
+  // Write 100K keys, these are written to a few files in L0.
+  const std::string value(10, 'x');
+  for (int i = 0; i < 100000; i++) {
+    char key[100];
+    snprintf(key, sizeof(key), "B%010d", i);
+    Put(1, key, value);
+  }
+  ASSERT_OK(Flush(1));
+
+  // Push all files to the highest level L2. Verify that
+  // the compaction in each level invokes the filter for
+  // all the keys in that level.
+  cfilter_count = 0;
+  dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]);
+  ASSERT_EQ(cfilter_count, 100000);
+  cfilter_count = 0;
+  dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]);
+  ASSERT_EQ(cfilter_count, 100000);
+
+  ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(1, 1), 0);
+  ASSERT_NE(NumTableFilesAtLevel(2, 1), 0);
+  cfilter_count = 0;
+
+  // All the files are in the lowest level.
+  // Verify that all but the 100001st record
+  // has sequence number zero. The 100001st record
+  // is at the tip of this snapshot and cannot
+  // be zeroed out.
+  // TODO: figure out sequence number squash too
+  int count = 0;
+  int total = 0;
+  Iterator* iter = dbfull()->TEST_NewInternalIterator(handles_[1]);
+  iter->SeekToFirst();
+  ASSERT_OK(iter->status());
+  while (iter->Valid()) {
+    ParsedInternalKey ikey(Slice(), 0, kTypeValue);
+    ikey.sequence = -1;
+    ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true);
+    total++;
+    if (ikey.sequence != 0) {
+      count++;
+    }
+    iter->Next();
+  }
+  ASSERT_EQ(total, 100000);
+  ASSERT_EQ(count, 1);
+  delete iter;
+
+  // overwrite all the 100K keys once again.
+  for (int i = 0; i < 100000; i++) {
+    char key[100];
+    snprintf(key, sizeof(key), "B%010d", i);
+    ASSERT_OK(Put(1, key, value));
+  }
+  ASSERT_OK(Flush(1));
+
+  // push all files to the highest level L2. This
+  // means that all keys should pass at least once
+  // via the compaction filter
+  cfilter_count = 0;
+  dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]);
+  ASSERT_EQ(cfilter_count, 100000);
+  cfilter_count = 0;
+  dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]);
+  ASSERT_EQ(cfilter_count, 100000);
+  ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(1, 1), 0);
+  ASSERT_NE(NumTableFilesAtLevel(2, 1), 0);
+
+  // create a new database with the compaction
+  // filter in such a way that it deletes all keys
+  options.compaction_filter_factory = std::make_shared<DeleteFilterFactory>();
+  options.create_if_missing = true;
+  DestroyAndReopen(&options);
+  CreateAndReopenWithCF({"pikachu"}, &options);
+
+  // write all the keys once again.
+  for (int i = 0; i < 100000; i++) {
+    char key[100];
+    snprintf(key, sizeof(key), "B%010d", i);
+    ASSERT_OK(Put(1, key, value));
+  }
+  ASSERT_OK(Flush(1));
+  ASSERT_NE(NumTableFilesAtLevel(0, 1), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(1, 1), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(2, 1), 0);
+
+  // Push all files to the highest level L2. This
+  // triggers the compaction filter to delete all keys,
+  // verify that at the end of the compaction process,
+  // nothing is left.
+  cfilter_count = 0;
+  dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]);
+  ASSERT_EQ(cfilter_count, 100000);
+  cfilter_count = 0;
+  dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]);
+  ASSERT_EQ(cfilter_count, 0);
+  ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(1, 1), 0);
+
+  // Scan the entire database to ensure that nothing is left
+  iter = db_->NewIterator(ReadOptions(), handles_[1]);
+  iter->SeekToFirst();
+  count = 0;
+  while (iter->Valid()) {
+    count++;
+    iter->Next();
+  }
+  ASSERT_EQ(count, 0);
+  delete iter;
+
+  // The sequence number of the remaining record
+  // is not zeroed out even though it is at the
+  // level Lmax because this record is at the tip
+  // TODO: remove the following or design a different
+  // test
+  count = 0;
+  iter = dbfull()->TEST_NewInternalIterator(handles_[1]);
+  iter->SeekToFirst();
+  ASSERT_OK(iter->status());
+  while (iter->Valid()) {
+    ParsedInternalKey ikey(Slice(), 0, kTypeValue);
+    ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true);
+    ASSERT_NE(ikey.sequence, (unsigned)0);
+    count++;
+    iter->Next();
+  }
+  ASSERT_EQ(count, 0);
+  delete iter;
+}
+
+// Tests the edge case where compaction does not produce any output -- all
+// entries are deleted. The compaction should create a bunch of 'DeleteFile'
+// entries in VersionEdit, but none of the 'AddFile's.
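+// (Note that DeleteFilterFactory only hands out a DeleteFilter for manual
+// compactions, so it is the explicit CompactRange call below that actually
+// drops every key.)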
+
+// Tests the edge case where compaction does not produce any output: all
+// entries are deleted. The compaction should create a bunch of 'DeleteFile'
+// entries in the VersionEdit, but none of the 'AddFile's.
+TEST(DBTest, CompactionFilterDeletesAll) {
+  Options options;
+  options.compaction_filter_factory = std::make_shared<DeleteFilterFactory>();
+  options.disable_auto_compactions = true;
+  options.create_if_missing = true;
+  DestroyAndReopen(&options);
+
+  // Put some data.
+  for (int table = 0; table < 4; ++table) {
+    for (int i = 0; i < 10 + table; ++i) {
+      Put(std::to_string(table * 100 + i), "val");
+    }
+    Flush();
+  }
+
+  // This will produce an empty file (the delete compaction filter drops
+  // every entry).
+  ASSERT_OK(db_->CompactRange(nullptr, nullptr));
+  ASSERT_EQ(0, CountLiveFiles());
+
+  Reopen(&options);
+
+  Iterator* itr = db_->NewIterator(ReadOptions());
+  itr->SeekToFirst();
+  // The db is empty.
+  ASSERT_TRUE(!itr->Valid());
+
+  delete itr;
+}
+
+TEST(DBTest, CompactionFilterWithValueChange) {
+  do {
+    Options options;
+    options.num_levels = 3;
+    options.max_mem_compaction_level = 0;
+    options.compaction_filter_factory =
+        std::make_shared<ChangeFilterFactory>();
+    options = CurrentOptions(options);
+    CreateAndReopenWithCF({"pikachu"}, &options);
+
+    // Write 100K+1 keys; these are written to a few files
+    // in L0. We do this so that the current snapshot points
+    // to the 100001st key. The compaction filter is not invoked
+    // on keys that are visible via a snapshot because we
+    // cannot delete them anyway.
+    const std::string value(10, 'x');
+    for (int i = 0; i < 100001; i++) {
+      char key[100];
+      snprintf(key, sizeof(key), "B%010d", i);
+      Put(1, key, value);
+    }
+
+    // Push all files to lower levels.
+    ASSERT_OK(Flush(1));
+    dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]);
+    dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]);
+
+    // Re-write all the data.
+    for (int i = 0; i < 100001; i++) {
+      char key[100];
+      snprintf(key, sizeof(key), "B%010d", i);
+      Put(1, key, value);
+    }
+
+    // Push all files to lower levels. This should
+    // invoke the compaction filter for all 100000 keys.
+    ASSERT_OK(Flush(1));
+    dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]);
+    dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]);
+
+    // Verify that all keys now have the new value that
+    // was set by the compaction process.
+    for (int i = 0; i < 100001; i++) {
+      char key[100];
+      snprintf(key, sizeof(key), "B%010d", i);
+      std::string newvalue = Get(1, key);
+      ASSERT_EQ(newvalue.compare(NEW_VALUE), 0);
+    }
+  } while (ChangeCompactOptions());
+}
+
+TEST(DBTest, CompactionFilterContextManual) {
+  KeepFilterFactory* filter = new KeepFilterFactory();
+
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleUniversal;
+  options.compaction_filter_factory.reset(filter);
+  options.compression = kNoCompression;
+  options.level0_file_num_compaction_trigger = 8;
+  Reopen(&options);
+  int num_keys_per_file = 400;
+  for (int j = 0; j < 3; j++) {
+    // Write several keys.
+    const std::string value(10, 'x');
+    for (int i = 0; i < num_keys_per_file; i++) {
+      char key[100];
+      snprintf(key, sizeof(key), "B%08d%02d", i, j);
+      Put(key, value);
+    }
+    dbfull()->TEST_FlushMemTable();
+    // Make sure the next file is much smaller, so automatic compaction
+    // will not be triggered.
+    num_keys_per_file /= 2;
+  }
+
+  // Force a manual compaction.
+  cfilter_count = 0;
+  filter->expect_manual_compaction_.store(true);
+  filter->expect_full_compaction_.store(false);  // Manual compaction always
+                                                 // sets this flag.
+  dbfull()->CompactRange(nullptr, nullptr);
+  ASSERT_EQ(cfilter_count, 700);
+  ASSERT_EQ(NumTableFilesAtLevel(0), 1);
+
+  // Verify the total number of keys is correct after manual compaction.
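+  // The check below walks the raw internal iterator and parses each internal
+  // key: entries whose sequence number was zeroed out by compaction count as
+  // settled, and only the single entry at the snapshot tip is expected to
+  // keep a non-zero sequence number.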
+  int count = 0;
+  int total = 0;
+  Iterator* iter = dbfull()->TEST_NewInternalIterator();
+  iter->SeekToFirst();
+  ASSERT_OK(iter->status());
+  while (iter->Valid()) {
+    ParsedInternalKey ikey(Slice(), 0, kTypeValue);
+    ikey.sequence = -1;
+    ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true);
+    total++;
+    if (ikey.sequence != 0) {
+      count++;
+    }
+    iter->Next();
+  }
+  ASSERT_EQ(total, 700);
+  ASSERT_EQ(count, 1);
+  delete iter;
+}
+
+class KeepFilterV2 : public CompactionFilterV2 {
+ public:
+  virtual std::vector<bool> Filter(int level,
+                                   const SliceVector& keys,
+                                   const SliceVector& existing_values,
+                                   std::vector<std::string>* new_values,
+                                   std::vector<bool>* values_changed)
+      const override {
+    cfilter_count++;
+    std::vector<bool> ret;
+    new_values->clear();
+    values_changed->clear();
+    for (unsigned int i = 0; i < keys.size(); ++i) {
+      values_changed->push_back(false);
+      ret.push_back(false);
+    }
+    return ret;
+  }
+
+  virtual const char* Name() const override {
+    return "KeepFilterV2";
+  }
+};
+
+class DeleteFilterV2 : public CompactionFilterV2 {
+ public:
+  virtual std::vector<bool> Filter(int level,
+                                   const SliceVector& keys,
+                                   const SliceVector& existing_values,
+                                   std::vector<std::string>* new_values,
+                                   std::vector<bool>* values_changed)
+      const override {
+    cfilter_count++;
+    new_values->clear();
+    values_changed->clear();
+    std::vector<bool> ret;
+    for (unsigned int i = 0; i < keys.size(); ++i) {
+      values_changed->push_back(false);
+      ret.push_back(true);
+    }
+    return ret;
+  }
+
+  virtual const char* Name() const override {
+    return "DeleteFilterV2";
+  }
+};
+
+class ChangeFilterV2 : public CompactionFilterV2 {
+ public:
+  virtual std::vector<bool> Filter(int level,
+                                   const SliceVector& keys,
+                                   const SliceVector& existing_values,
+                                   std::vector<std::string>* new_values,
+                                   std::vector<bool>* values_changed)
+      const override {
+    std::vector<bool> ret;
+    new_values->clear();
+    values_changed->clear();
+    for (unsigned int i = 0; i < keys.size(); ++i) {
+      values_changed->push_back(true);
+      new_values->push_back(NEW_VALUE);
+      ret.push_back(false);
+    }
+    return ret;
+  }
+
+  virtual const char* Name() const override {
+    return "ChangeFilterV2";
+  }
+};
+
+class KeepFilterFactoryV2 : public CompactionFilterFactoryV2 {
+ public:
+  explicit KeepFilterFactoryV2(const SliceTransform* prefix_extractor)
+      : CompactionFilterFactoryV2(prefix_extractor) { }
+
+  virtual std::unique_ptr<CompactionFilterV2>
+  CreateCompactionFilterV2(
+      const CompactionFilterContext& context) override {
+    return std::unique_ptr<CompactionFilterV2>(new KeepFilterV2());
+  }
+
+  virtual const char* Name() const override {
+    return "KeepFilterFactoryV2";
+  }
+};
+
+class DeleteFilterFactoryV2 : public CompactionFilterFactoryV2 {
+ public:
+  explicit DeleteFilterFactoryV2(const SliceTransform* prefix_extractor)
+      : CompactionFilterFactoryV2(prefix_extractor) { }
+
+  virtual std::unique_ptr<CompactionFilterV2>
+  CreateCompactionFilterV2(
+      const CompactionFilterContext& context) override {
+    return std::unique_ptr<CompactionFilterV2>(new DeleteFilterV2());
+  }
+
+  virtual const char* Name() const override {
+    return "DeleteFilterFactoryV2";
+  }
+};
+
+class ChangeFilterFactoryV2 : public CompactionFilterFactoryV2 {
+ public:
+  explicit ChangeFilterFactoryV2(const SliceTransform* prefix_extractor)
+      : CompactionFilterFactoryV2(prefix_extractor) { }
+
+  virtual std::unique_ptr<CompactionFilterV2>
+  CreateCompactionFilterV2(
+      const CompactionFilterContext& context) override {
+    return std::unique_ptr<CompactionFilterV2>(new ChangeFilterV2());
+  }
+
+  virtual const char* Name() const override {
+    return "ChangeFilterFactoryV2";
+  }
+};
+
+TEST(DBTest, CompactionFilterV2) {
+  Options options = CurrentOptions();
+  options.num_levels = 3;
+  options.max_mem_compaction_level = 0;
+  // Extract an 8-byte fixed prefix.
+  std::unique_ptr<const SliceTransform> prefix_extractor;
+  prefix_extractor.reset(NewFixedPrefixTransform(8));
+
+  options.compaction_filter_factory_v2
+      = std::make_shared<KeepFilterFactoryV2>(prefix_extractor.get());
+  // In a testing environment, we can only flush the application
+  // compaction filter buffer using universal compaction.
+  option_config_ = kUniversalCompaction;
+  options.compaction_style = kCompactionStyleUniversal;
+  Reopen(&options);
+
+  // Write 100K keys; these are written to a few files in L0.
+  const std::string value(10, 'x');
+  for (int i = 0; i < 100000; i++) {
+    char key[100];
+    snprintf(key, sizeof(key), "B%08d%010d", i, i);
+    Put(key, value);
+  }
+
+  dbfull()->TEST_FlushMemTable();
+
+  dbfull()->TEST_CompactRange(0, nullptr, nullptr);
+  dbfull()->TEST_CompactRange(1, nullptr, nullptr);
+
+  ASSERT_EQ(NumTableFilesAtLevel(0), 1);
+
+  // All the files are in the lowest level.
+  int count = 0;
+  int total = 0;
+  Iterator* iter = dbfull()->TEST_NewInternalIterator();
+  iter->SeekToFirst();
+  ASSERT_OK(iter->status());
+  while (iter->Valid()) {
+    ParsedInternalKey ikey(Slice(), 0, kTypeValue);
+    ikey.sequence = -1;
+    ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true);
+    total++;
+    if (ikey.sequence != 0) {
+      count++;
+    }
+    iter->Next();
+  }
+
+  ASSERT_EQ(total, 100000);
+  // One snapshot only. Since we are using universal compaction,
+  // the sequence number is cleared for better compression.
+  ASSERT_EQ(count, 1);
+  delete iter;
+
+  // Create a new database with the compaction
+  // filter in such a way that it deletes all keys.
+  options.compaction_filter_factory_v2 =
+      std::make_shared<DeleteFilterFactoryV2>(prefix_extractor.get());
+  options.create_if_missing = true;
+  DestroyAndReopen(&options);
+
+  // Write all the keys once again.
+  for (int i = 0; i < 100000; i++) {
+    char key[100];
+    snprintf(key, sizeof(key), "B%08d%010d", i, i);
+    Put(key, value);
+  }
+
+  dbfull()->TEST_FlushMemTable();
+  ASSERT_NE(NumTableFilesAtLevel(0), 0);
+
+  dbfull()->TEST_CompactRange(0, nullptr, nullptr);
+  dbfull()->TEST_CompactRange(1, nullptr, nullptr);
+  ASSERT_EQ(NumTableFilesAtLevel(1), 0);
+
+  // Scan the entire database to ensure that nothing is left.
+  iter = db_->NewIterator(ReadOptions());
+  iter->SeekToFirst();
+  count = 0;
+  while (iter->Valid()) {
+    count++;
+    iter->Next();
+  }
+
+  ASSERT_EQ(count, 0);
+  delete iter;
+}
+
+TEST(DBTest, CompactionFilterV2WithValueChange) {
+  Options options = CurrentOptions();
+  options.num_levels = 3;
+  options.max_mem_compaction_level = 0;
+  std::unique_ptr<const SliceTransform> prefix_extractor;
+  prefix_extractor.reset(NewFixedPrefixTransform(8));
+  options.compaction_filter_factory_v2 =
+      std::make_shared<ChangeFilterFactoryV2>(prefix_extractor.get());
+  // In a testing environment, we can only flush the application
+  // compaction filter buffer using universal compaction.
+  option_config_ = kUniversalCompaction;
+  options.compaction_style = kCompactionStyleUniversal;
+  options = CurrentOptions(options);
+  Reopen(&options);
+
+  // Write 100K+1 keys; these are written to a few files
+  // in L0. We do this so that the current snapshot points
+  // to the 100001st key. The compaction filter is not invoked
+  // on keys that are visible via a snapshot because we
+  // cannot delete them anyway.
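+  // (CompactionFilterV2 batches keys that share a prefix, as produced by the
+  // NewFixedPrefixTransform(8) extractor configured above, and hands each
+  // batch to Filter() in a single call; the values_changed and new_values
+  // entries are paired positionally with the incoming keys, as in
+  // ChangeFilterV2 above.)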
+  const std::string value(10, 'x');
+  for (int i = 0; i < 100001; i++) {
+    char key[100];
+    snprintf(key, sizeof(key), "B%08d%010d", i, i);
+    Put(key, value);
+  }
+
+  // Push all files to lower levels.
+  dbfull()->TEST_FlushMemTable();
+  dbfull()->TEST_CompactRange(0, nullptr, nullptr);
+  dbfull()->TEST_CompactRange(1, nullptr, nullptr);
+
+  // Verify that all keys now have the new value that
+  // was set by the compaction process.
+  for (int i = 0; i < 100001; i++) {
+    char key[100];
+    snprintf(key, sizeof(key), "B%08d%010d", i, i);
+    std::string newvalue = Get(key);
+    ASSERT_EQ(newvalue.compare(NEW_VALUE), 0);
+  }
+}
+
+TEST(DBTest, CompactionFilterV2NULLPrefix) {
+  Options options = CurrentOptions();
+  options.num_levels = 3;
+  options.max_mem_compaction_level = 0;
+  std::unique_ptr<const SliceTransform> prefix_extractor;
+  prefix_extractor.reset(NewFixedPrefixTransform(8));
+  options.compaction_filter_factory_v2 =
+      std::make_shared<ChangeFilterFactoryV2>(prefix_extractor.get());
+  // In a testing environment, we can only flush the application
+  // compaction filter buffer using universal compaction.
+  option_config_ = kUniversalCompaction;
+  options.compaction_style = kCompactionStyleUniversal;
+  Reopen(&options);
+
+  // Write 100K+1 keys; these are written to a few files
+  // in L0. We do this so that the current snapshot points
+  // to the 100001st key. The compaction filter is not invoked
+  // on keys that are visible via a snapshot because we
+  // cannot delete them anyway.
+  const std::string value(10, 'x');
+  char first_key[100];
+  snprintf(first_key, sizeof(first_key), "%s0000%010d", "NULL", 1);
+  Put(first_key, value);
+  for (int i = 1; i < 100000; i++) {
+    char key[100];
+    snprintf(key, sizeof(key), "%08d%010d", i, i);
+    Put(key, value);
+  }
+
+  char last_key[100];
+  snprintf(last_key, sizeof(last_key), "%s0000%010d", "NULL", 2);
+  Put(last_key, value);
+
+  // Push all files to lower levels.
+  dbfull()->TEST_FlushMemTable();
+  dbfull()->TEST_CompactRange(0, nullptr, nullptr);
+
+  // Verify that all keys now have the new value that
+  // was set by the compaction process.
+  std::string newvalue = Get(first_key);
+  ASSERT_EQ(newvalue.compare(NEW_VALUE), 0);
+  newvalue = Get(last_key);
+  ASSERT_EQ(newvalue.compare(NEW_VALUE), 0);
+  for (int i = 1; i < 100000; i++) {
+    char key[100];
+    snprintf(key, sizeof(key), "%08d%010d", i, i);
+    std::string newvalue = Get(key);
+    ASSERT_EQ(newvalue.compare(NEW_VALUE), 0);
+  }
+}
+
+TEST(DBTest, SparseMerge) {
+  do {
+    Options options = CurrentOptions();
+    options.compression = kNoCompression;
+    CreateAndReopenWithCF({"pikachu"}, &options);
+
+    FillLevels("A", "Z", 1);
+
+    // Suppose there is:
+    //    a small amount of data with prefix A
+    //    a large amount of data with prefix B
+    //    a small amount of data with prefix C
+    // and that recent updates have made small changes to all three prefixes.
+    // Check that we do not do a compaction that merges all of B in one shot.
+    const std::string value(1000, 'x');
+    Put(1, "A", "va");
+    // Write approximately 100MB of "B" values.
+    for (int i = 0; i < 100000; i++) {
+      char key[100];
+      snprintf(key, sizeof(key), "B%010d", i);
+      Put(1, key, value);
+    }
+    Put(1, "C", "vc");
+    ASSERT_OK(Flush(1));
+    dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]);
+
+    // Make a sparse update.
+    Put(1, "A", "va2");
+    Put(1, "B100", "bvalue2");
+    Put(1, "C", "vc2");
+    ASSERT_OK(Flush(1));
+
+    // Compactions should not cause us to create a situation where
+    // a file overlaps too much data at the next level.
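+    // (The 20MB ceiling asserted below tracks the grandparent-overlap limit
+    // that level-style compactions enforce; the exact constant depends on
+    // the default target file size at this revision, so treat the bound as
+    // indicative rather than normative.)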
+    ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(handles_[1]),
+              20 * 1048576);
+    dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]);
+    ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(handles_[1]),
+              20 * 1048576);
+    dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]);
+    ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(handles_[1]),
+              20 * 1048576);
+  } while (ChangeCompactOptions());
+}
+
+static bool Between(uint64_t val, uint64_t low, uint64_t high) {
+  bool result = (val >= low) && (val <= high);
+  if (!result) {
+    fprintf(stderr, "Value %llu is not in range [%llu, %llu]\n",
+            (unsigned long long)(val),
+            (unsigned long long)(low),
+            (unsigned long long)(high));
+  }
+  return result;
+}
+
+TEST(DBTest, ApproximateSizes) {
+  do {
+    Options options;
+    options.write_buffer_size = 100000000;  // Large write buffer
+    options.compression = kNoCompression;
+    options = CurrentOptions(options);
+    DestroyAndReopen();
+    CreateAndReopenWithCF({"pikachu"}, &options);
+
+    ASSERT_TRUE(Between(Size("", "xyz", 1), 0, 0));
+    ReopenWithColumnFamilies({"default", "pikachu"}, &options);
+    ASSERT_TRUE(Between(Size("", "xyz", 1), 0, 0));
+
+    // Write 8MB (80 values, each 100K)
+    ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+    const int N = 80;
+    static const int S1 = 100000;
+    static const int S2 = 105000;  // Allow some expansion from metadata
+    Random rnd(301);
+    for (int i = 0; i < N; i++) {
+      ASSERT_OK(Put(1, Key(i), RandomString(&rnd, S1)));
+    }
+
+    // 0 because GetApproximateSizes() does not account for memtable space.
+    ASSERT_TRUE(Between(Size("", Key(50), 1), 0, 0));
+
+    // Check sizes across recovery by reopening a few times.
+    for (int run = 0; run < 3; run++) {
+      ReopenWithColumnFamilies({"default", "pikachu"}, &options);
+
+      for (int compact_start = 0; compact_start < N; compact_start += 10) {
+        for (int i = 0; i < N; i += 10) {
+          ASSERT_TRUE(Between(Size("", Key(i), 1), S1 * i, S2 * i));
+          ASSERT_TRUE(Between(Size("", Key(i) + ".suffix", 1), S1 * (i + 1),
+                              S2 * (i + 1)));
+          ASSERT_TRUE(Between(Size(Key(i), Key(i + 10), 1), S1 * 10, S2 * 10));
+        }
+        ASSERT_TRUE(Between(Size("", Key(50), 1), S1 * 50, S2 * 50));
+        ASSERT_TRUE(
+            Between(Size("", Key(50) + ".suffix", 1), S1 * 50, S2 * 50));
+
+        std::string cstart_str = Key(compact_start);
+        std::string cend_str = Key(compact_start + 9);
+        Slice cstart = cstart_str;
+        Slice cend = cend_str;
+        dbfull()->TEST_CompactRange(0, &cstart, &cend, handles_[1]);
+      }
+
+      ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+      ASSERT_GT(NumTableFilesAtLevel(1, 1), 0);
+    }
+    // ApproximateOffsetOf() is not yet implemented in plain table format.
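+    // For reference, the Size() helper used above wraps the public API along
+    // these lines (a sketch, assuming the column-family overload of
+    // GetApproximateSizes() at this revision):
+    //   Range r(start, limit);
+    //   uint64_t sz;
+    //   db_->GetApproximateSizes(handles_[1], &r, 1, &sz);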
+  } while (ChangeOptions(kSkipUniversalCompaction | kSkipFIFOCompaction |
+                         kSkipPlainTable));
+}
+
+TEST(DBTest, ApproximateSizes_MixOfSmallAndLarge) {
+  do {
+    Options options = CurrentOptions();
+    options.compression = kNoCompression;
+    CreateAndReopenWithCF({"pikachu"}, &options);
+
+    Random rnd(301);
+    std::string big1 = RandomString(&rnd, 100000);
+    ASSERT_OK(Put(1, Key(0), RandomString(&rnd, 10000)));
+    ASSERT_OK(Put(1, Key(1), RandomString(&rnd, 10000)));
+    ASSERT_OK(Put(1, Key(2), big1));
+    ASSERT_OK(Put(1, Key(3), RandomString(&rnd, 10000)));
+    ASSERT_OK(Put(1, Key(4), big1));
+    ASSERT_OK(Put(1, Key(5), RandomString(&rnd, 10000)));
+    ASSERT_OK(Put(1, Key(6), RandomString(&rnd, 300000)));
+    ASSERT_OK(Put(1, Key(7), RandomString(&rnd, 10000)));
+
+    // Check sizes across recovery by reopening a few times.
+    for (int run = 0; run < 3; run++) {
+      ReopenWithColumnFamilies({"default", "pikachu"}, &options);
+
+      ASSERT_TRUE(Between(Size("", Key(0), 1), 0, 0));
+      ASSERT_TRUE(Between(Size("", Key(1), 1), 10000, 11000));
+      ASSERT_TRUE(Between(Size("", Key(2), 1), 20000, 21000));
+      ASSERT_TRUE(Between(Size("", Key(3), 1), 120000, 121000));
+      ASSERT_TRUE(Between(Size("", Key(4), 1), 130000, 131000));
+      ASSERT_TRUE(Between(Size("", Key(5), 1), 230000, 231000));
+      ASSERT_TRUE(Between(Size("", Key(6), 1), 240000, 241000));
+      ASSERT_TRUE(Between(Size("", Key(7), 1), 540000, 541000));
+      ASSERT_TRUE(Between(Size("", Key(8), 1), 550000, 560000));
+
+      ASSERT_TRUE(Between(Size(Key(3), Key(5), 1), 110000, 111000));
+
+      dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]);
+    }
+    // ApproximateOffsetOf() is not yet implemented in plain table format.
+  } while (ChangeOptions(kSkipPlainTable));
+}
+
+TEST(DBTest, IteratorPinsRef) {
+  do {
+    CreateAndReopenWithCF({"pikachu"});
+    Put(1, "foo", "hello");
+
+    // Get an iterator that will yield the current contents of the DB.
+    Iterator* iter = db_->NewIterator(ReadOptions(), handles_[1]);
+
+    // Write to force compactions.
+    Put(1, "foo", "newvalue1");
+    for (int i = 0; i < 100; i++) {
+      // 100K values
+      ASSERT_OK(Put(1, Key(i), Key(i) + std::string(100000, 'v')));
+    }
+    Put(1, "foo", "newvalue2");
+
+    iter->SeekToFirst();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("foo", iter->key().ToString());
+    ASSERT_EQ("hello", iter->value().ToString());
+    iter->Next();
+    ASSERT_TRUE(!iter->Valid());
+    delete iter;
+  } while (ChangeCompactOptions());
+}
+
+TEST(DBTest, Snapshot) {
+  do {
+    CreateAndReopenWithCF({"pikachu"});
+    Put(0, "foo", "0v1");
+    Put(1, "foo", "1v1");
+    const Snapshot* s1 = db_->GetSnapshot();
+    Put(0, "foo", "0v2");
+    Put(1, "foo", "1v2");
+    const Snapshot* s2 = db_->GetSnapshot();
+    Put(0, "foo", "0v3");
+    Put(1, "foo", "1v3");
+    const Snapshot* s3 = db_->GetSnapshot();
+
+    Put(0, "foo", "0v4");
+    Put(1, "foo", "1v4");
+    ASSERT_EQ("0v1", Get(0, "foo", s1));
+    ASSERT_EQ("1v1", Get(1, "foo", s1));
+    ASSERT_EQ("0v2", Get(0, "foo", s2));
+    ASSERT_EQ("1v2", Get(1, "foo", s2));
+    ASSERT_EQ("0v3", Get(0, "foo", s3));
+    ASSERT_EQ("1v3", Get(1, "foo", s3));
+    ASSERT_EQ("0v4", Get(0, "foo"));
+    ASSERT_EQ("1v4", Get(1, "foo"));
+
+    db_->ReleaseSnapshot(s3);
+    ASSERT_EQ("0v1", Get(0, "foo", s1));
+    ASSERT_EQ("1v1", Get(1, "foo", s1));
+    ASSERT_EQ("0v2", Get(0, "foo", s2));
+    ASSERT_EQ("1v2", Get(1, "foo", s2));
+    ASSERT_EQ("0v4", Get(0, "foo"));
+    ASSERT_EQ("1v4", Get(1, "foo"));
+
+    db_->ReleaseSnapshot(s1);
+    ASSERT_EQ("0v2", Get(0, "foo", s2));
+    ASSERT_EQ("1v2", Get(1, "foo", s2));
+    ASSERT_EQ("0v4", Get(0, "foo"));
+    ASSERT_EQ("1v4", Get(1, "foo"));
+
+    db_->ReleaseSnapshot(s2);
+    ASSERT_EQ("0v4", Get(0, "foo"));
+    ASSERT_EQ("1v4", Get(1, "foo"));
+  } while (ChangeOptions(kSkipHashCuckoo));
+}
+
+TEST(DBTest, HiddenValuesAreRemoved) {
+  do {
+    CreateAndReopenWithCF({"pikachu"});
+    Random rnd(301);
+    FillLevels("a", "z", 1);
+
+    std::string big = RandomString(&rnd, 50000);
+    Put(1, "foo", big);
+    Put(1, "pastfoo", "v");
+    const Snapshot* snapshot = db_->GetSnapshot();
+    Put(1, "foo", "tiny");
+    Put(1, "pastfoo2", "v2");  // Advance sequence number one more
+
+    ASSERT_OK(Flush(1));
+    ASSERT_GT(NumTableFilesAtLevel(0, 1), 0);
+
+    ASSERT_EQ(big, Get(1, "foo", snapshot));
+    ASSERT_TRUE(Between(Size("", "pastfoo", 1), 50000, 60000));
+    db_->ReleaseSnapshot(snapshot);
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ tiny, " + big + " ]");
+    Slice x("x");
+    dbfull()->TEST_CompactRange(0, nullptr, &x, handles_[1]);
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ tiny ]");
+    ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+    ASSERT_GE(NumTableFilesAtLevel(1, 1), 1);
+    dbfull()->TEST_CompactRange(1, nullptr, &x, handles_[1]);
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ tiny ]");
+
+    ASSERT_TRUE(Between(Size("", "pastfoo", 1), 0, 1000));
+    // ApproximateOffsetOf() is not yet implemented in plain table format,
+    // which is used by Size().
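+    // Releasing the snapshot above is what made the 50KB value eligible for
+    // garbage collection; the shrinking Size() estimate is the observable
+    // evidence that compaction physically dropped it.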
+    // skip HashCuckooRep as it does not support snapshot
+  } while (ChangeOptions(kSkipUniversalCompaction | kSkipFIFOCompaction |
+                         kSkipPlainTable | kSkipHashCuckoo));
+}
+
+TEST(DBTest, CompactBetweenSnapshots) {
+  do {
+    Options options = CurrentOptions();
+    options.disable_auto_compactions = true;
+    CreateAndReopenWithCF({"pikachu"}, &options);
+    Random rnd(301);
+    FillLevels("a", "z", 1);
+
+    Put(1, "foo", "first");
+    const Snapshot* snapshot1 = db_->GetSnapshot();
+    Put(1, "foo", "second");
+    Put(1, "foo", "third");
+    Put(1, "foo", "fourth");
+    const Snapshot* snapshot2 = db_->GetSnapshot();
+    Put(1, "foo", "fifth");
+    Put(1, "foo", "sixth");
+
+    // All entries (including duplicates) exist
+    // before any compaction is triggered.
+    ASSERT_OK(Flush(1));
+    ASSERT_EQ("sixth", Get(1, "foo"));
+    ASSERT_EQ("fourth", Get(1, "foo", snapshot2));
+    ASSERT_EQ("first", Get(1, "foo", snapshot1));
+    ASSERT_EQ(AllEntriesFor("foo", 1),
+              "[ sixth, fifth, fourth, third, second, first ]");
+
+    // After a compaction, "second", "third" and "fifth" should
+    // be removed.
+    FillLevels("a", "z", 1);
+    dbfull()->CompactRange(handles_[1], nullptr, nullptr);
+    ASSERT_EQ("sixth", Get(1, "foo"));
+    ASSERT_EQ("fourth", Get(1, "foo", snapshot2));
+    ASSERT_EQ("first", Get(1, "foo", snapshot1));
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ sixth, fourth, first ]");
+
+    // After we release snapshot1, only two values are left.
+    db_->ReleaseSnapshot(snapshot1);
+    FillLevels("a", "z", 1);
+    dbfull()->CompactRange(handles_[1], nullptr, nullptr);
+
+    // We have only one valid snapshot, snapshot2. Since snapshot1 is
+    // no longer valid, "first" should be removed by a compaction.
+    ASSERT_EQ("sixth", Get(1, "foo"));
+    ASSERT_EQ("fourth", Get(1, "foo", snapshot2));
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ sixth, fourth ]");
+
+    // After we release snapshot2, only one value should be left.
+    db_->ReleaseSnapshot(snapshot2);
+    FillLevels("a", "z", 1);
+    dbfull()->CompactRange(handles_[1], nullptr, nullptr);
+    ASSERT_EQ("sixth", Get(1, "foo"));
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ sixth ]");
+    // skip HashCuckooRep as it does not support snapshot
+  } while (ChangeOptions(kSkipHashCuckoo | kSkipFIFOCompaction));
+}
+
+TEST(DBTest, DeletionMarkers1) {
+  CreateAndReopenWithCF({"pikachu"});
+  Put(1, "foo", "v1");
+  ASSERT_OK(Flush(1));
+  const int last = CurrentOptions().max_mem_compaction_level;
+  // foo => v1 is now in last level
+  ASSERT_EQ(NumTableFilesAtLevel(last, 1), 1);
+
+  // Place a table at level last-1 to prevent merging with preceding mutation.
+  Put(1, "a", "begin");
+  Put(1, "z", "end");
+  Flush(1);
+  ASSERT_EQ(NumTableFilesAtLevel(last, 1), 1);
+  ASSERT_EQ(NumTableFilesAtLevel(last - 1, 1), 1);
+
+  Delete(1, "foo");
+  Put(1, "foo", "v2");
+  ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, DEL, v1 ]");
+  ASSERT_OK(Flush(1));  // Moves to level last-2
+  if (CurrentOptions().purge_redundant_kvs_while_flush) {
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, v1 ]");
+  } else {
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, DEL, v1 ]");
+  }
+  Slice z("z");
+  dbfull()->TEST_CompactRange(last - 2, nullptr, &z, handles_[1]);
+  // DEL eliminated, but v1 remains because we aren't compacting that level
+  // (DEL can be eliminated because v2 hides v1).
+  ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, v1 ]");
+  dbfull()->TEST_CompactRange(last - 1, nullptr, nullptr, handles_[1]);
+  // Merging last-1 with last, so we are the base level for "foo";
+  // the DEL is removed, as is v1.
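+  // (A deletion marker can only be dropped once the compaction output sits
+  // at the bottommost level for its key range; otherwise the DEL must
+  // survive to keep shadowing any older versions further down.)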
+  ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2 ]");
+}
+
+TEST(DBTest, DeletionMarkers2) {
+  CreateAndReopenWithCF({"pikachu"});
+  Put(1, "foo", "v1");
+  ASSERT_OK(Flush(1));
+  const int last = CurrentOptions().max_mem_compaction_level;
+  // foo => v1 is now in last level
+  ASSERT_EQ(NumTableFilesAtLevel(last, 1), 1);
+
+  // Place a table at level last-1 to prevent merging with preceding mutation.
+  Put(1, "a", "begin");
+  Put(1, "z", "end");
+  Flush(1);
+  ASSERT_EQ(NumTableFilesAtLevel(last, 1), 1);
+  ASSERT_EQ(NumTableFilesAtLevel(last - 1, 1), 1);
+
+  Delete(1, "foo");
+  ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v1 ]");
+  ASSERT_OK(Flush(1));  // Moves to level last-2
+  ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v1 ]");
+  dbfull()->TEST_CompactRange(last - 2, nullptr, nullptr, handles_[1]);
+  // DEL kept: the "last" file overlaps.
+  ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v1 ]");
+  dbfull()->TEST_CompactRange(last - 1, nullptr, nullptr, handles_[1]);
+  // Merging last-1 with last, so we are the base level for "foo";
+  // the DEL is removed, as is v1.
+  ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]");
+}
+
+TEST(DBTest, OverlapInLevel0) {
+  do {
+    CreateAndReopenWithCF({"pikachu"});
+    int tmp = CurrentOptions().max_mem_compaction_level;
+    ASSERT_EQ(tmp, 2) << "Fix test to match config";
+
+    // Fill levels 1 and 2 to disable pushing of new memtables to levels > 0.
+    ASSERT_OK(Put(1, "100", "v100"));
+    ASSERT_OK(Put(1, "999", "v999"));
+    Flush(1);
+    ASSERT_OK(Delete(1, "100"));
+    ASSERT_OK(Delete(1, "999"));
+    Flush(1);
+    ASSERT_EQ("0,1,1", FilesPerLevel(1));
+
+    // Make files spanning the following ranges in level-0:
+    //  files[0]  200 .. 900
+    //  files[1]  300 .. 500
+    // Note that files are sorted by smallest key.
+    ASSERT_OK(Put(1, "300", "v300"));
+    ASSERT_OK(Put(1, "500", "v500"));
+    Flush(1);
+    ASSERT_OK(Put(1, "200", "v200"));
+    ASSERT_OK(Put(1, "600", "v600"));
+    ASSERT_OK(Put(1, "900", "v900"));
+    Flush(1);
+    ASSERT_EQ("2,1,1", FilesPerLevel(1));
+
+    // Compact away the placeholder files we created initially.
+    dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]);
+    dbfull()->TEST_CompactRange(2, nullptr, nullptr, handles_[1]);
+    ASSERT_EQ("2", FilesPerLevel(1));
+
+    // Do a memtable compaction. Before the bug fix, the compaction would
+    // not detect the overlap with level-0 files and would incorrectly place
+    // the deletion in a deeper level.
+    ASSERT_OK(Delete(1, "600"));
+    Flush(1);
+    ASSERT_EQ("3", FilesPerLevel(1));
+    ASSERT_EQ("NOT_FOUND", Get(1, "600"));
+  } while (ChangeOptions(kSkipUniversalCompaction | kSkipFIFOCompaction));
+}
+
+TEST(DBTest, L0_CompactionBug_Issue44_a) {
+  do {
+    CreateAndReopenWithCF({"pikachu"});
+    ASSERT_OK(Put(1, "b", "v"));
+    ReopenWithColumnFamilies({"default", "pikachu"});
+    ASSERT_OK(Delete(1, "b"));
+    ASSERT_OK(Delete(1, "a"));
+    ReopenWithColumnFamilies({"default", "pikachu"});
+    ASSERT_OK(Delete(1, "a"));
+    ReopenWithColumnFamilies({"default", "pikachu"});
+    ASSERT_OK(Put(1, "a", "v"));
+    ReopenWithColumnFamilies({"default", "pikachu"});
+    ReopenWithColumnFamilies({"default", "pikachu"});
+    ASSERT_EQ("(a->v)", Contents(1));
+    env_->SleepForMicroseconds(1000000);  // Wait for compaction to finish
+    ASSERT_EQ("(a->v)", Contents(1));
+  } while (ChangeCompactOptions());
+}
+
+TEST(DBTest, L0_CompactionBug_Issue44_b) {
+  do {
+    CreateAndReopenWithCF({"pikachu"});
+    Put(1, "", "");
+    ReopenWithColumnFamilies({"default", "pikachu"});
+    Delete(1, "e");
+    Put(1, "", "");
+    ReopenWithColumnFamilies({"default", "pikachu"});
+    Put(1, "c", "cv");
+    ReopenWithColumnFamilies({"default", "pikachu"});
+    Put(1, "", "");
+    ReopenWithColumnFamilies({"default", "pikachu"});
+    Put(1, "", "");
+    env_->SleepForMicroseconds(1000000);  // Wait for compaction to finish
+    ReopenWithColumnFamilies({"default", "pikachu"});
+    Put(1, "d", "dv");
+    ReopenWithColumnFamilies({"default", "pikachu"});
+    Put(1, "", "");
+    ReopenWithColumnFamilies({"default", "pikachu"});
+    Delete(1, "d");
+    Delete(1, "b");
+    ReopenWithColumnFamilies({"default", "pikachu"});
+    ASSERT_EQ("(->)(c->cv)", Contents(1));
+    env_->SleepForMicroseconds(1000000);  // Wait for compaction to finish
+    ASSERT_EQ("(->)(c->cv)", Contents(1));
+  } while (ChangeCompactOptions());
+}
+
+TEST(DBTest, ComparatorCheck) {
+  class NewComparator : public Comparator {
+   public:
+    virtual const char* Name() const { return "rocksdb.NewComparator"; }
+    virtual int Compare(const Slice& a, const Slice& b) const {
+      return BytewiseComparator()->Compare(a, b);
+    }
+    virtual void FindShortestSeparator(std::string* s, const Slice& l) const {
+      BytewiseComparator()->FindShortestSeparator(s, l);
+    }
+    virtual void FindShortSuccessor(std::string* key) const {
+      BytewiseComparator()->FindShortSuccessor(key);
+    }
+  };
+  Options new_options, options;
+  NewComparator cmp;
+  do {
+    CreateAndReopenWithCF({"pikachu"});
+    options = CurrentOptions();
+    new_options = CurrentOptions();
+    new_options.comparator = &cmp;
+    // Only the non-default column family has a non-matching comparator.
+    Status s = TryReopenWithColumnFamilies({"default", "pikachu"},
+                                           {&options, &new_options});
+    ASSERT_TRUE(!s.ok());
+    ASSERT_TRUE(s.ToString().find("comparator") != std::string::npos)
+        << s.ToString();
+  } while (ChangeCompactOptions(&new_options));
+}
+
+TEST(DBTest, CustomComparator) {
+  class NumberComparator : public Comparator {
+   public:
+    virtual const char* Name() const { return "test.NumberComparator"; }
+    virtual int Compare(const Slice& a, const Slice& b) const {
+      return ToNumber(a) - ToNumber(b);
+    }
+    virtual void FindShortestSeparator(std::string* s, const Slice& l) const {
+      ToNumber(*s);  // Check format
+      ToNumber(l);   // Check format
+    }
+    virtual void FindShortSuccessor(std::string* key) const {
+      ToNumber(*key);  // Check format
+    }
+   private:
+    static int ToNumber(const Slice& x) {
+      // Check that there are no extra characters.
+      ASSERT_TRUE(x.size() >= 2 && x[0] == '[' && x[x.size() - 1] == ']')
+          << EscapeString(x);
+      int val;
+      char ignored;
+      ASSERT_TRUE(sscanf(x.ToString().c_str(), "[%i]%c", &val, &ignored) == 1)
+          << EscapeString(x);
+      return val;
+    }
+  };
+  Options new_options;
+  NumberComparator cmp;
+  do {
+    new_options = CurrentOptions();
+    new_options.create_if_missing = true;
+    new_options.comparator = &cmp;
+    new_options.filter_policy = nullptr;   // Cannot use bloom filters
+    new_options.write_buffer_size = 1000;  // Compact more often
+    new_options = CurrentOptions(new_options);
+    DestroyAndReopen(&new_options);
+    CreateAndReopenWithCF({"pikachu"}, &new_options);
+    ASSERT_OK(Put(1, "[10]", "ten"));
+    ASSERT_OK(Put(1, "[0x14]", "twenty"));
+    for (int i = 0; i < 2; i++) {
+      ASSERT_EQ("ten", Get(1, "[10]"));
+      ASSERT_EQ("ten", Get(1, "[0xa]"));
+      ASSERT_EQ("twenty", Get(1, "[20]"));
+      ASSERT_EQ("twenty", Get(1, "[0x14]"));
+      ASSERT_EQ("NOT_FOUND", Get(1, "[15]"));
+      ASSERT_EQ("NOT_FOUND", Get(1, "[0xf]"));
+      Compact(1, "[0]", "[9999]");
+    }
+
+    for (int run = 0; run < 2; run++) {
+      for (int i = 0; i < 1000; i++) {
+        char buf[100];
+        snprintf(buf, sizeof(buf), "[%d]", i * 10);
+        ASSERT_OK(Put(1, buf, buf));
+      }
+      Compact(1, "[0]", "[1000000]");
+    }
+  } while (ChangeCompactOptions(&new_options));
+}
+
+TEST(DBTest, ManualCompaction) {
+  CreateAndReopenWithCF({"pikachu"});
+  ASSERT_EQ(dbfull()->MaxMemCompactionLevel(), 2)
+      << "Need to update this test to match kMaxMemCompactLevel";
+
+  // iter == 0 runs with 7 levels
+  // iter == 1 runs with 3 levels
+  for (int iter = 0; iter < 2; ++iter) {
+    MakeTables(3, "p", "q", 1);
+    ASSERT_EQ("1,1,1", FilesPerLevel(1));
+
+    // Compaction range falls before files
+    Compact(1, "", "c");
+    ASSERT_EQ("1,1,1", FilesPerLevel(1));
+
+    // Compaction range falls after files
+    Compact(1, "r", "z");
+    ASSERT_EQ("1,1,1", FilesPerLevel(1));
+
+    // Compaction range overlaps files
+    Compact(1, "p1", "p9");
+    ASSERT_EQ("0,0,1", FilesPerLevel(1));
+
+    // Populate a different range
+    MakeTables(3, "c", "e", 1);
+    ASSERT_EQ("1,1,2", FilesPerLevel(1));
+
+    // Compact just the new range
+    Compact(1, "b", "f");
+    ASSERT_EQ("0,0,2", FilesPerLevel(1));
+
+    // Compact all
+    MakeTables(1, "a", "z", 1);
+    ASSERT_EQ("0,1,2", FilesPerLevel(1));
+    db_->CompactRange(handles_[1], nullptr, nullptr);
+    ASSERT_EQ("0,0,1", FilesPerLevel(1));
+
+    if (iter == 0) {
+      Options options = CurrentOptions();
+      options.num_levels = 3;
+      options.create_if_missing = true;
+      DestroyAndReopen(&options);
+      CreateAndReopenWithCF({"pikachu"}, &options);
+    }
+  }
+}
+
+TEST(DBTest, DBOpen_Options) {
+  std::string dbname = test::TmpDir() + "/db_options_test";
+  ASSERT_OK(DestroyDB(dbname, Options()));
+
+  // Does not exist, and create_if_missing == false: error
+  DB* db = nullptr;
+  Options opts;
+  opts.create_if_missing = false;
+  Status s = DB::Open(opts, dbname, &db);
+  ASSERT_TRUE(strstr(s.ToString().c_str(), "does not exist") != nullptr);
+  ASSERT_TRUE(db == nullptr);
+
+  // Does not exist, and create_if_missing == true: OK
+  opts.create_if_missing = true;
+  s = DB::Open(opts, dbname, &db);
+  ASSERT_OK(s);
+  ASSERT_TRUE(db != nullptr);
+
+  delete db;
+  db = nullptr;
+
+  // Does exist, and error_if_exists == true: error
+  opts.create_if_missing = false;
+  opts.error_if_exists = true;
+  s = DB::Open(opts, dbname, &db);
+  ASSERT_TRUE(strstr(s.ToString().c_str(), "exists") != nullptr);
+  ASSERT_TRUE(db == nullptr);
+
+  // Does exist, and error_if_exists == false: OK
+  opts.create_if_missing = true;
+  opts.error_if_exists = false;
+  s = DB::Open(opts, dbname, &db);
+  ASSERT_OK(s);
+  ASSERT_TRUE(db != nullptr);
+
+  delete db;
+  db = nullptr;
+}
+
+TEST(DBTest, DBOpen_Change_NumLevels) {
+  Options opts;
+  opts.create_if_missing = true;
+  DestroyAndReopen(&opts);
+  ASSERT_TRUE(db_ != nullptr);
+  CreateAndReopenWithCF({"pikachu"}, &opts);
+
+  ASSERT_OK(Put(1, "a", "123"));
+  ASSERT_OK(Put(1, "b", "234"));
+  db_->CompactRange(handles_[1], nullptr, nullptr);
+  Close();
+
+  opts.create_if_missing = false;
+  opts.num_levels = 2;
+  Status s = TryReopenWithColumnFamilies({"default", "pikachu"}, &opts);
+  ASSERT_TRUE(strstr(s.ToString().c_str(), "Invalid argument") != nullptr);
+  ASSERT_TRUE(db_ == nullptr);
+}
+
+TEST(DBTest, DestroyDBMetaDatabase) {
+  std::string dbname = test::TmpDir() + "/db_meta";
+  std::string metadbname = MetaDatabaseName(dbname, 0);
+  std::string metametadbname = MetaDatabaseName(metadbname, 0);
+
+  // Destroy previous versions if they exist. Using the long way.
+  ASSERT_OK(DestroyDB(metametadbname, Options()));
+  ASSERT_OK(DestroyDB(metadbname, Options()));
+  ASSERT_OK(DestroyDB(dbname, Options()));
+
+  // Setup databases
+  Options opts;
+  opts.create_if_missing = true;
+  DB* db = nullptr;
+  ASSERT_OK(DB::Open(opts, dbname, &db));
+  delete db;
+  db = nullptr;
+  ASSERT_OK(DB::Open(opts, metadbname, &db));
+  delete db;
+  db = nullptr;
+  ASSERT_OK(DB::Open(opts, metametadbname, &db));
+  delete db;
+  db = nullptr;
+
+  // Delete databases
+  ASSERT_OK(DestroyDB(dbname, Options()));
+
+  // Check if deletion worked.
+  opts.create_if_missing = false;
+  ASSERT_TRUE(!(DB::Open(opts, dbname, &db)).ok());
+  ASSERT_TRUE(!(DB::Open(opts, metadbname, &db)).ok());
+  ASSERT_TRUE(!(DB::Open(opts, metametadbname, &db)).ok());
+}
+
+// Check that the number of files does not grow when we are out of space.
+TEST(DBTest, NoSpace) {
+  do {
+    Options options = CurrentOptions();
+    options.env = env_;
+    options.paranoid_checks = false;
+    Reopen(&options);
+
+    ASSERT_OK(Put("foo", "v1"));
+    ASSERT_EQ("v1", Get("foo"));
+    Compact("a", "z");
+    const int num_files = CountFiles();
+    env_->no_space_.Release_Store(env_);  // Force out-of-space errors
+    env_->sleep_counter_.Reset();
+    for (int i = 0; i < 5; i++) {
+      for (int level = 0; level < dbfull()->NumberLevels() - 1; level++) {
+        dbfull()->TEST_CompactRange(level, nullptr, nullptr);
+      }
+    }
+
+    std::string property_value;
+    ASSERT_TRUE(db_->GetProperty("rocksdb.background-errors", &property_value));
+    ASSERT_EQ("5", property_value);
+
+    env_->no_space_.Release_Store(nullptr);
+    ASSERT_LT(CountFiles(), num_files + 3);
+
+    // Check that compaction attempts slept after errors.
+    ASSERT_GE(env_->sleep_counter_.Read(), 5);
+  } while (ChangeCompactOptions());
+}
+
+// Check that the background error counter is bumped on flush failures.
+TEST(DBTest, NoSpaceFlush) {
+  do {
+    Options options = CurrentOptions();
+    options.env = env_;
+    options.max_background_flushes = 1;
+    Reopen(&options);
+
+    ASSERT_OK(Put("foo", "v1"));
+    env_->no_space_.Release_Store(env_);  // Force out-of-space errors
+
+    std::string property_value;
+    // Background error count is 0 now.
+    ASSERT_TRUE(db_->GetProperty("rocksdb.background-errors", &property_value));
+    ASSERT_EQ("0", property_value);
+
+    dbfull()->TEST_FlushMemTable(false);
+
+    // Wait up to 300 milliseconds, or until background-errors goes
+    // from 0 to 1.
+    int time_to_sleep_limit = 300000;
+    while (time_to_sleep_limit > 0) {
+      int to_sleep = (time_to_sleep_limit > 1000) ? 1000 : time_to_sleep_limit;
+      time_to_sleep_limit -= to_sleep;
+      env_->SleepForMicroseconds(to_sleep);
+
+      ASSERT_TRUE(
+          db_->GetProperty("rocksdb.background-errors", &property_value));
+      if (property_value == "1") {
+        break;
+      }
+    }
+    ASSERT_EQ("1", property_value);
+
+    env_->no_space_.Release_Store(nullptr);
+  } while (ChangeCompactOptions());
+}
+
+TEST(DBTest, NonWritableFileSystem) {
+  do {
+    Options options = CurrentOptions();
+    options.write_buffer_size = 1000;
+    options.env = env_;
+    Reopen(&options);
+    ASSERT_OK(Put("foo", "v1"));
+    env_->non_writable_.Release_Store(env_);  // Force errors for new files
+    std::string big(100000, 'x');
+    int errors = 0;
+    for (int i = 0; i < 20; i++) {
+      if (!Put("foo", big).ok()) {
+        errors++;
+        env_->SleepForMicroseconds(100000);
+      }
+    }
+    ASSERT_GT(errors, 0);
+    env_->non_writable_.Release_Store(nullptr);
+  } while (ChangeCompactOptions());
+}
+
+TEST(DBTest, ManifestWriteError) {
+  // Test for the following problem:
+  // (a) Compaction produces file F
+  // (b) Log record containing F is written to MANIFEST file, but Sync() fails
+  // (c) GC deletes F
+  // (d) After reopening DB, reads fail since deleted F is named in log record
+
+  // We iterate twice. In the second iteration, everything is the
+  // same except the log record never makes it to the MANIFEST file.
+  for (int iter = 0; iter < 2; iter++) {
+    port::AtomicPointer* error_type = (iter == 0)
+        ? &env_->manifest_sync_error_
+        : &env_->manifest_write_error_;
+
+    // Insert foo=>bar mapping
+    Options options = CurrentOptions();
+    options.env = env_;
+    options.create_if_missing = true;
+    options.error_if_exists = false;
+    DestroyAndReopen(&options);
+    ASSERT_OK(Put("foo", "bar"));
+    ASSERT_EQ("bar", Get("foo"));
+
+    // Memtable compaction (will succeed)
+    Flush();
+    ASSERT_EQ("bar", Get("foo"));
+    const int last = dbfull()->MaxMemCompactionLevel();
+    ASSERT_EQ(NumTableFilesAtLevel(last), 1);  // foo=>bar is now in last level
+
+    // Merging compaction (will fail)
+    error_type->Release_Store(env_);
+    dbfull()->TEST_CompactRange(last, nullptr, nullptr);  // Should fail
+    ASSERT_EQ("bar", Get("foo"));
+
+    // Recovery: should not lose data
+    error_type->Release_Store(nullptr);
+    Reopen(&options);
+    ASSERT_EQ("bar", Get("foo"));
+  }
+}
+
+TEST(DBTest, PutFailsParanoid) {
+  // Test the following:
+  // (a) A random put fails in paranoid mode (simulated by a sync failure)
+  // (b) All other puts have to fail, even if the writes would succeed
+  // (c) All of that should happen ONLY if paranoid_checks = true
+
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  options.error_if_exists = false;
+  options.paranoid_checks = true;
+  DestroyAndReopen(&options);
+  CreateAndReopenWithCF({"pikachu"}, &options);
+  Status s;
+
+  ASSERT_OK(Put(1, "foo", "bar"));
+  ASSERT_OK(Put(1, "foo1", "bar1"));
+  // Simulate an error.
+  env_->log_write_error_.Release_Store(env_);
+  s = Put(1, "foo2", "bar2");
+  ASSERT_TRUE(!s.ok());
+  env_->log_write_error_.Release_Store(nullptr);
+  s = Put(1, "foo3", "bar3");
+  // The next put should fail, too.
+  ASSERT_TRUE(!s.ok());
+  // But we're still able to read.
+  ASSERT_EQ("bar", Get(1, "foo"));
+
+  // Do the same thing with paranoid checks off.
+  options.paranoid_checks = false;
+  DestroyAndReopen(&options);
+  CreateAndReopenWithCF({"pikachu"}, &options);
+
+  ASSERT_OK(Put(1, "foo", "bar"));
+  ASSERT_OK(Put(1, "foo1", "bar1"));
+  // Simulate an error.
+  env_->log_write_error_.Release_Store(env_);
+  s = Put(1, "foo2", "bar2");
+  ASSERT_TRUE(!s.ok());
+
+  env_->log_write_error_.Release_Store(nullptr);
+  s = Put(1, "foo3", "bar3");
+  // The next put should NOT fail.
+  ASSERT_TRUE(s.ok());
+}
+
+TEST(DBTest, FilesDeletedAfterCompaction) {
+  do {
+    CreateAndReopenWithCF({"pikachu"});
+    ASSERT_OK(Put(1, "foo", "v2"));
+    Compact(1, "a", "z");
+    const int num_files = CountLiveFiles();
+    for (int i = 0; i < 10; i++) {
+      ASSERT_OK(Put(1, "foo", "v2"));
+      Compact(1, "a", "z");
+    }
+    ASSERT_EQ(CountLiveFiles(), num_files);
+  } while (ChangeCompactOptions());
+}
+
+TEST(DBTest, BloomFilter) {
+  do {
+    env_->count_random_reads_ = true;
+    Options options = CurrentOptions();
+    options.env = env_;
+    options.no_block_cache = true;
+    options.filter_policy = NewBloomFilterPolicy(10);
+    CreateAndReopenWithCF({"pikachu"}, &options);
+
+    // Populate multiple layers.
+    const int N = 10000;
+    for (int i = 0; i < N; i++) {
+      ASSERT_OK(Put(1, Key(i), Key(i)));
+    }
+    Compact(1, "a", "z");
+    for (int i = 0; i < N; i += 100) {
+      ASSERT_OK(Put(1, Key(i), Key(i)));
+    }
+    Flush(1);
+
+    // Prevent auto compactions triggered by seeks.
+    env_->delay_sstable_sync_.Release_Store(env_);
+
+    // Lookup present keys. Should rarely read from the small sstable.
+    env_->random_read_counter_.Reset();
+    for (int i = 0; i < N; i++) {
+      ASSERT_EQ(Key(i), Get(1, Key(i)));
+    }
+    int reads = env_->random_read_counter_.Read();
+    fprintf(stderr, "%d present => %d reads\n", N, reads);
+    ASSERT_GE(reads, N);
+    ASSERT_LE(reads, N + 2 * N / 100);
+
+    // Lookup missing keys. Should rarely read from either sstable.
+    env_->random_read_counter_.Reset();
+    for (int i = 0; i < N; i++) {
+      ASSERT_EQ("NOT_FOUND", Get(1, Key(i) + ".missing"));
+    }
+    reads = env_->random_read_counter_.Read();
+    fprintf(stderr, "%d missing => %d reads\n", N, reads);
+    ASSERT_LE(reads, 3 * N / 100);
+
+    env_->delay_sstable_sync_.Release_Store(nullptr);
+    Close();
+    delete options.filter_policy;
+  } while (ChangeCompactOptions());
+}
+
+TEST(DBTest, SnapshotFiles) {
+  do {
+    Options options = CurrentOptions();
+    options.write_buffer_size = 100000000;  // Large write buffer
+    CreateAndReopenWithCF({"pikachu"}, &options);
+
+    Random rnd(301);
+
+    // Write 8MB (80 values, each 100K)
+    ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+    std::vector<std::string> values;
+    for (int i = 0; i < 80; i++) {
+      values.push_back(RandomString(&rnd, 100000));
+      ASSERT_OK(Put((i < 40), Key(i), values[i]));
+    }
+
+    // Assert that nothing makes it to disk yet.
+    ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+
+    // Get a file snapshot.
+    uint64_t manifest_number = 0;
+    uint64_t manifest_size = 0;
+    std::vector<std::string> files;
+    dbfull()->DisableFileDeletions();
+    dbfull()->GetLiveFiles(files, &manifest_size);
+
+    // CURRENT, MANIFEST, *.sst files (one for each CF)
+    ASSERT_EQ(files.size(), 4U);
+
+    uint64_t number = 0;
+    FileType type;
+
+    // Copy these files to a new snapshot directory.
+    std::string snapdir = dbname_ + ".snapdir/";
+    std::string mkdir = "mkdir -p " + snapdir;
+    ASSERT_EQ(system(mkdir.c_str()), 0);
+
+    for (unsigned int i = 0; i < files.size(); i++) {
+      // Our clients require that GetLiveFiles returns
+      // files with "/" as the first character!
+      ASSERT_EQ(files[i][0], '/');
+      std::string src = dbname_ + files[i];
+      std::string dest = snapdir + files[i];
+
+      uint64_t size;
+      ASSERT_OK(env_->GetFileSize(src, &size));
+
+      // Record the number and the size of the
+      // latest manifest file.
+      if (ParseFileName(files[i].substr(1), &number, &type)) {
+        if (type == kDescriptorFile) {
+          if (number > manifest_number) {
+            manifest_number = number;
+            ASSERT_GE(size, manifest_size);
+            size = manifest_size;  // copy only valid MANIFEST data
+          }
+        }
+      }
+      CopyFile(src, dest, size);
+    }
+
+    // Release the file snapshot.
+    dbfull()->EnableFileDeletions();
+
+    // Overwrite one key; this key should not appear in the snapshot.
+    std::vector<std::string> extras;
+    for (unsigned int i = 0; i < 1; i++) {
+      extras.push_back(RandomString(&rnd, 100000));
+      ASSERT_OK(Put(0, Key(i), extras[i]));
+    }
+
+    // Verify that the data in the snapshot is correct.
+    std::vector<ColumnFamilyDescriptor> column_families;
+    column_families.emplace_back("default", ColumnFamilyOptions());
+    column_families.emplace_back("pikachu", ColumnFamilyOptions());
+    std::vector<ColumnFamilyHandle*> cf_handles;
+    DB* snapdb;
+    DBOptions opts;
+    opts.create_if_missing = false;
+    Status stat =
+        DB::Open(opts, snapdir, column_families, &cf_handles, &snapdb);
+    ASSERT_OK(stat);
+
+    ReadOptions roptions;
+    std::string val;
+    for (unsigned int i = 0; i < 80; i++) {
+      stat = snapdb->Get(roptions, cf_handles[i < 40], Key(i), &val);
+      ASSERT_EQ(values[i].compare(val), 0);
+    }
+    for (auto cfh : cf_handles) {
+      delete cfh;
+    }
+    delete snapdb;
+
+    // Look at the new live files after we added an 'extra' key
+    // and after we took the first snapshot.
+    uint64_t new_manifest_number = 0;
+    uint64_t new_manifest_size = 0;
+    std::vector<std::string> newfiles;
+    dbfull()->DisableFileDeletions();
+    dbfull()->GetLiveFiles(newfiles, &new_manifest_size);
+
+    // Find the new manifest file. Assert that this manifest file is
+    // the same one as in the previous snapshot. But its size should be
+    // larger because we added an extra key after taking the
+    // previous snapshot.
+    for (unsigned int i = 0; i < newfiles.size(); i++) {
+      std::string src = dbname_ + "/" + newfiles[i];
+      // Record the number and the size of the
+      // latest manifest file.
+      if (ParseFileName(newfiles[i].substr(1), &number, &type)) {
+        if (type == kDescriptorFile) {
+          if (number > new_manifest_number) {
+            uint64_t size;
+            new_manifest_number = number;
+            ASSERT_OK(env_->GetFileSize(src, &size));
+            ASSERT_GE(size, new_manifest_size);
+          }
+        }
+      }
+    }
+    ASSERT_EQ(manifest_number, new_manifest_number);
+    ASSERT_GT(new_manifest_size, manifest_size);
+
+    // Release the file snapshot.
+    dbfull()->EnableFileDeletions();
+  } while (ChangeCompactOptions());
+}
+
+TEST(DBTest, CompactOnFlush) {
+  do {
+    Options options = CurrentOptions();
+    options.purge_redundant_kvs_while_flush = true;
+    options.disable_auto_compactions = true;
+    CreateAndReopenWithCF({"pikachu"}, &options);
+
+    Put(1, "foo", "v1");
+    ASSERT_OK(Flush(1));
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ v1 ]");
+
+    // Write two new keys.
+    Put(1, "a", "begin");
+    Put(1, "z", "end");
+    Flush(1);
+
+    // Case 1: Delete followed by a put.
+    Delete(1, "foo");
+    Put(1, "foo", "v2");
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, DEL, v1 ]");
+
+    // After the current memtable is flushed, the DEL should
+    // have been removed.
+    ASSERT_OK(Flush(1));
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, v1 ]");
+
+    dbfull()->CompactRange(handles_[1], nullptr, nullptr);
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2 ]");
+
+    // Case 2: Delete followed by another delete.
+    Delete(1, "foo");
+    Delete(1, "foo");
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, DEL, v2 ]");
+    ASSERT_OK(Flush(1));
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v2 ]");
+    dbfull()->CompactRange(handles_[1], nullptr, nullptr);
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]");
+
+    // Case 3: Put followed by a delete.
+    Put(1, "foo", "v3");
+    Delete(1, "foo");
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v3 ]");
+    ASSERT_OK(Flush(1));
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL ]");
+    dbfull()->CompactRange(handles_[1], nullptr, nullptr);
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]");
+
+    // Case 4: Put followed by another Put.
+    Put(1, "foo", "v4");
+    Put(1, "foo", "v5");
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ v5, v4 ]");
+    ASSERT_OK(Flush(1));
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ v5 ]");
+    dbfull()->CompactRange(handles_[1], nullptr, nullptr);
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ v5 ]");
+
+    // Clear the database.
+    Delete(1, "foo");
+    dbfull()->CompactRange(handles_[1], nullptr, nullptr);
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]");
+
+    // Case 5: Put followed by a snapshot followed by another Put.
+    // Both puts should remain.
+    Put(1, "foo", "v6");
+    const Snapshot* snapshot = db_->GetSnapshot();
+    Put(1, "foo", "v7");
+    ASSERT_OK(Flush(1));
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ v7, v6 ]");
+    db_->ReleaseSnapshot(snapshot);
+
+    // Clear the database.
+    Delete(1, "foo");
+    dbfull()->CompactRange(handles_[1], nullptr, nullptr);
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]");
+
+    // Case 6: snapshot followed by a Put followed by another Put.
+    // Only the last put should remain.
+    const Snapshot* snapshot1 = db_->GetSnapshot();
+    Put(1, "foo", "v8");
+    Put(1, "foo", "v9");
+    ASSERT_OK(Flush(1));
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ v9 ]");
+    db_->ReleaseSnapshot(snapshot1);
+  } while (ChangeCompactOptions());
+}
+
+namespace {
+std::vector<uint64_t> ListLogFiles(Env* env, const std::string& path) {
+  std::vector<std::string> files;
+  std::vector<uint64_t> log_files;
+  env->GetChildren(path, &files);
+  uint64_t number;
+  FileType type;
+  for (size_t i = 0; i < files.size(); ++i) {
+    if (ParseFileName(files[i], &number, &type)) {
+      if (type == kLogFile) {
+        log_files.push_back(number);
+      }
+    }
+  }
+  return log_files;
+}
+}  // namespace
+
+TEST(DBTest, WALArchivalTtl) {
+  do {
+    Options options = CurrentOptions();
+    options.create_if_missing = true;
+    options.WAL_ttl_seconds = 1000;
+    DestroyAndReopen(&options);
+
+    // TEST: Create a DB with a ttl and no size limit.
+    // Put some keys and count the log files present in the DB just after
+    // the inserts. Re-open the db, which causes deletion/archival to take
+    // place. Assert that the files moved under "/archive". Then reopen the
+    // db with a small ttl and assert that the archive was removed.
+
+    std::string archiveDir = ArchivalDirectory(dbname_);
+
+    for (int i = 0; i < 10; ++i) {
+      for (int j = 0; j < 10; ++j) {
+        ASSERT_OK(Put(Key(10 * i + j), DummyString(1024)));
+      }
+
+      std::vector<uint64_t> log_files = ListLogFiles(env_, dbname_);
+
+      options.create_if_missing = false;
+      Reopen(&options);
+
+      std::vector<uint64_t> logs = ListLogFiles(env_, archiveDir);
+      std::set<uint64_t> archivedFiles(logs.begin(), logs.end());
+
+      for (auto& log : log_files) {
+        ASSERT_TRUE(archivedFiles.find(log) != archivedFiles.end());
+      }
+    }
+
+    std::vector<uint64_t> log_files = ListLogFiles(env_, archiveDir);
+    ASSERT_TRUE(log_files.size() > 0);
+
+    options.WAL_ttl_seconds = 1;
+    env_->SleepForMicroseconds(2 * 1000 * 1000);
+    Reopen(&options);
+
+    log_files = ListLogFiles(env_, archiveDir);
+    ASSERT_TRUE(log_files.empty());
+  } while (ChangeCompactOptions());
+}
+
+namespace {
+uint64_t GetLogDirSize(std::string dir_path, SpecialEnv* env) {
+  uint64_t dir_size = 0;
+  std::vector<std::string> files;
+  env->GetChildren(dir_path, &files);
+  for (auto& f : files) {
+    uint64_t number;
+    FileType type;
+    if (ParseFileName(f, &number, &type) && type == kLogFile) {
+      std::string const file_path = dir_path + "/" + f;
+      uint64_t file_size;
+      env->GetFileSize(file_path, &file_size);
+      dir_size += file_size;
+    }
+  }
+  return dir_size;
+}
+}  // namespace
+
+TEST(DBTest, WALArchivalSizeLimit) {
+  do {
+    Options options = CurrentOptions();
+    options.create_if_missing = true;
+    options.WAL_ttl_seconds = 0;
+    options.WAL_size_limit_MB = 1000;
+
+    // TEST: Create a DB with a huge size limit and no ttl.
+    // Put some keys, count the archived log files present in the DB
+    // just after the inserts, and assert that there are enough of them.
+    // Change the size limit and re-open the db. Assert that the archive
+    // is no larger than WAL_size_limit_MB. Set ttl and time_to_check_ to
+    // small values and re-open the db. Assert that there are no archived
+    // logs left.
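+    // (For reference: archived and live WALs can also be enumerated through
+    // the public GetSortedWalFiles() API used elsewhere in this file, which
+    // yields LogFile objects exposing PathName(), LogNumber(), and
+    // SizeFileBytes(); here the size bound is instead checked directly on
+    // the archive directory.)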
+
+    DestroyAndReopen(&options);
+    for (int i = 0; i < 128 * 128; ++i) {
+      ASSERT_OK(Put(Key(i), DummyString(1024)));
+    }
+    Reopen(&options);
+
+    std::string archive_dir = ArchivalDirectory(dbname_);
+    std::vector<uint64_t> log_files = ListLogFiles(env_, archive_dir);
+    ASSERT_TRUE(log_files.size() > 2);
+
+    options.WAL_size_limit_MB = 8;
+    Reopen(&options);
+    dbfull()->TEST_PurgeObsoleteteWAL();
+
+    uint64_t archive_size = GetLogDirSize(archive_dir, env_);
+    ASSERT_TRUE(archive_size <= options.WAL_size_limit_MB * 1024 * 1024);
+
+    options.WAL_ttl_seconds = 1;
+    dbfull()->TEST_SetDefaultTimeToCheck(1);
+    env_->SleepForMicroseconds(2 * 1000 * 1000);
+    Reopen(&options);
+    dbfull()->TEST_PurgeObsoleteteWAL();
+
+    log_files = ListLogFiles(env_, archive_dir);
+    ASSERT_TRUE(log_files.empty());
+  } while (ChangeCompactOptions());
+}
+
+namespace {
+SequenceNumber ReadRecords(
+    std::unique_ptr<TransactionLogIterator>& iter,
+    int& count) {
+  count = 0;
+  SequenceNumber lastSequence = 0;
+  BatchResult res;
+  while (iter->Valid()) {
+    res = iter->GetBatch();
+    ASSERT_TRUE(res.sequence > lastSequence);
+    ++count;
+    lastSequence = res.sequence;
+    ASSERT_OK(iter->status());
+    iter->Next();
+  }
+  return res.sequence;
+}
+
+void ExpectRecords(
+    const int expected_no_records,
+    std::unique_ptr<TransactionLogIterator>& iter) {
+  int num_records;
+  ReadRecords(iter, num_records);
+  ASSERT_EQ(num_records, expected_no_records);
+}
+}  // namespace
+
+TEST(DBTest, TransactionLogIterator) {
+  do {
+    Options options = OptionsForLogIterTest();
+    DestroyAndReopen(&options);
+    CreateAndReopenWithCF({"pikachu"}, &options);
+    Put(0, "key1", DummyString(1024));
+    Put(1, "key2", DummyString(1024));
+    Put(1, "key2", DummyString(1024));
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 3U);
+    {
+      auto iter = OpenTransactionLogIter(0);
+      ExpectRecords(3, iter);
+    }
+    ReopenWithColumnFamilies({"default", "pikachu"}, &options);
+    env_->SleepForMicroseconds(2 * 1000 * 1000);
+    {
+      Put(0, "key4", DummyString(1024));
+      Put(1, "key5", DummyString(1024));
+      Put(0, "key6", DummyString(1024));
+    }
+    {
+      auto iter = OpenTransactionLogIter(0);
+      ExpectRecords(6, iter);
+    }
+  } while (ChangeCompactOptions());
+}
+
+#ifndef NDEBUG  // sync points are not included in the -DNDEBUG build
+TEST(DBTest, TransactionLogIteratorRace) {
+  static const int LOG_ITERATOR_RACE_TEST_COUNT = 2;
+  static const char* sync_points[LOG_ITERATOR_RACE_TEST_COUNT][4] =
+      { { "DBImpl::GetSortedWalFiles:1", "DBImpl::PurgeObsoleteFiles:1",
+          "DBImpl::PurgeObsoleteFiles:2", "DBImpl::GetSortedWalFiles:2" },
+        { "DBImpl::GetSortedWalsOfType:1", "DBImpl::PurgeObsoleteFiles:1",
+          "DBImpl::PurgeObsoleteFiles:2", "DBImpl::GetSortedWalsOfType:2" }};
+  for (int test = 0; test < LOG_ITERATOR_RACE_TEST_COUNT; ++test) {
+    // Set up a sync point dependency to reproduce the race condition of
+    // a log file being moved to the archived dir in the middle of
+    // GetSortedWalFiles.
+    rocksdb::SyncPoint::GetInstance()->LoadDependency(
+        { { sync_points[test][0], sync_points[test][1] },
+          { sync_points[test][2], sync_points[test][3] },
+        });
+
+    do {
+      rocksdb::SyncPoint::GetInstance()->ClearTrace();
+      rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+      Options options = OptionsForLogIterTest();
+      DestroyAndReopen(&options);
+      Put("key1", DummyString(1024));
+      dbfull()->Flush(FlushOptions());
+      Put("key2", DummyString(1024));
+      dbfull()->Flush(FlushOptions());
+      Put("key3", DummyString(1024));
+      dbfull()->Flush(FlushOptions());
+      Put("key4", DummyString(1024));
+      ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 4U);
+
+      {
+        auto iter = OpenTransactionLogIter(0);
+        ExpectRecords(4, iter);
+      }
+
+      rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+      // Trigger an async flush and a log move. The log move will
+      // wait on the GetSortedWalFiles:1 sync point to reproduce the
+      // race condition.
+      FlushOptions flush_options;
+      flush_options.wait = false;
+      dbfull()->Flush(flush_options);
+
+      // "key5" would be written in a new memtable and log.
+      Put("key5", DummyString(1024));
+      {
+        // This iter would miss "key4" if not fixed.
+        auto iter = OpenTransactionLogIter(0);
+        ExpectRecords(5, iter);
+      }
+    } while (ChangeCompactOptions());
+  }
+}
+#endif
+
+TEST(DBTest, TransactionLogIteratorMoveOverZeroFiles) {
+  do {
+    Options options = OptionsForLogIterTest();
+    DestroyAndReopen(&options);
+    CreateAndReopenWithCF({"pikachu"}, &options);
+    // Do a plain Reopen.
+    Put(1, "key1", DummyString(1024));
+    // Two reopens should create a zero-record WAL file.
+    ReopenWithColumnFamilies({"default", "pikachu"}, &options);
+    ReopenWithColumnFamilies({"default", "pikachu"}, &options);
+
+    Put(1, "key2", DummyString(1024));
+
+    auto iter = OpenTransactionLogIter(0);
+    ExpectRecords(2, iter);
+  } while (ChangeCompactOptions());
+}
+
+TEST(DBTest, TransactionLogIteratorStallAtLastRecord) {
+  do {
+    Options options = OptionsForLogIterTest();
+    DestroyAndReopen(&options);
+    Put("key1", DummyString(1024));
+    auto iter = OpenTransactionLogIter(0);
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(iter->Valid());
+    iter->Next();
+    ASSERT_TRUE(!iter->Valid());
+    ASSERT_OK(iter->status());
+    Put("key2", DummyString(1024));
+    iter->Next();
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(iter->Valid());
+  } while (ChangeCompactOptions());
+}
+
+TEST(DBTest, TransactionLogIteratorJustEmptyFile) {
+  do {
+    Options options = OptionsForLogIterTest();
+    DestroyAndReopen(&options);
+    unique_ptr<TransactionLogIterator> iter;
+    Status status = dbfull()->GetUpdatesSince(0, &iter);
+    // Check that an empty iterator is returned.
+    ASSERT_TRUE(!iter->Valid());
+  } while (ChangeCompactOptions());
+}
+
+TEST(DBTest, TransactionLogIteratorCheckAfterRestart) {
+  do {
+    Options options = OptionsForLogIterTest();
+    DestroyAndReopen(&options);
+    Put("key1", DummyString(1024));
+    Put("key2", DummyString(1023));
+    dbfull()->Flush(FlushOptions());
+    Reopen(&options);
+    auto iter = OpenTransactionLogIter(0);
+    ExpectRecords(2, iter);
+  } while (ChangeCompactOptions());
+}
+
+TEST(DBTest, TransactionLogIteratorCorruptedLog) {
+  do {
+    Options options = OptionsForLogIterTest();
+    DestroyAndReopen(&options);
+    for (int i = 0; i < 1024; i++) {
+      Put("key" + std::to_string(i), DummyString(10));
+    }
+    dbfull()->Flush(FlushOptions());
+    // Corrupt this log to create a gap.
+    rocksdb::VectorLogPtr wal_files;
+    ASSERT_OK(dbfull()->GetSortedWalFiles(wal_files));
+    const auto logfilePath = dbname_ + "/" + wal_files.front()->PathName();
+    ASSERT_EQ(
+        0,
+        truncate(logfilePath.c_str(), wal_files.front()->SizeFileBytes() / 2));
+    // Insert a new entry to a new log file.
+    Put("key1025", DummyString(10));
+    // Try to read from the beginning. Should stop before the gap and read
+    // fewer than 1025 entries.
Should stop before the gap and read less + // than 1025 entries + auto iter = OpenTransactionLogIter(0); + int count; + int last_sequence_read = ReadRecords(iter, count); + ASSERT_LT(last_sequence_read, 1025); + // Try to read past the gap, should be able to seek to key1025 + auto iter2 = OpenTransactionLogIter(last_sequence_read + 1); + ExpectRecords(1, iter2); + } while (ChangeCompactOptions()); +} + +TEST(DBTest, TransactionLogIteratorBatchOperations) { + do { + Options options = OptionsForLogIterTest(); + DestroyAndReopen(&options); + CreateAndReopenWithCF({"pikachu"}, &options); + WriteBatch batch; + batch.Put(handles_[1], "key1", DummyString(1024)); + batch.Put(handles_[0], "key2", DummyString(1024)); + batch.Put(handles_[1], "key3", DummyString(1024)); + batch.Delete(handles_[0], "key2"); + dbfull()->Write(WriteOptions(), &batch); + Flush(1); + Flush(0); + ReopenWithColumnFamilies({"default", "pikachu"}, &options); + Put(1, "key4", DummyString(1024)); + auto iter = OpenTransactionLogIter(3); + ExpectRecords(2, iter); + } while (ChangeCompactOptions()); +} + +TEST(DBTest, TransactionLogIteratorBlobs) { + Options options = OptionsForLogIterTest(); + DestroyAndReopen(&options); + CreateAndReopenWithCF({"pikachu"}, &options); + { + WriteBatch batch; + batch.Put(handles_[1], "key1", DummyString(1024)); + batch.Put(handles_[0], "key2", DummyString(1024)); + batch.PutLogData(Slice("blob1")); + batch.Put(handles_[1], "key3", DummyString(1024)); + batch.PutLogData(Slice("blob2")); + batch.Delete(handles_[0], "key2"); + dbfull()->Write(WriteOptions(), &batch); + ReopenWithColumnFamilies({"default", "pikachu"}, &options); + } + + auto res = OpenTransactionLogIter(0)->GetBatch(); + struct Handler : public WriteBatch::Handler { + std::string seen; + virtual Status PutCF(uint32_t cf, const Slice& key, const Slice& value) { + seen += "Put(" + std::to_string(cf) + ", " + key.ToString() + ", " + + std::to_string(value.size()) + ")"; + return Status::OK(); + } + virtual Status MergeCF(uint32_t cf, const Slice& key, const Slice& value) { + seen += "Merge(" + std::to_string(cf) + ", " + key.ToString() + ", " + + std::to_string(value.size()) + ")"; + return Status::OK(); + } + virtual void LogData(const Slice& blob) { + seen += "LogData(" + blob.ToString() + ")"; + } + virtual Status DeleteCF(uint32_t cf, const Slice& key) { + seen += "Delete(" + std::to_string(cf) + ", " + key.ToString() + ")"; + return Status::OK(); + } + } handler; + res.writeBatchPtr->Iterate(&handler); + ASSERT_EQ( + "Put(1, key1, 1024)" + "Put(0, key2, 1024)" + "LogData(blob1)" + "Put(1, key3, 1024)" + "LogData(blob2)" + "Delete(0, key2)", + handler.seen); +} + +TEST(DBTest, ReadFirstRecordCache) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + DestroyAndReopen(&options); + + std::string path = dbname_ + "/000001.log"; + unique_ptr file; + ASSERT_OK(env_->NewWritableFile(path, &file, EnvOptions())); + + SequenceNumber s; + ASSERT_OK(dbfull()->TEST_ReadFirstLine(path, &s)); + ASSERT_EQ(s, 0U); + + ASSERT_OK(dbfull()->TEST_ReadFirstRecord(kAliveLogFile, 1, &s)); + ASSERT_EQ(s, 0U); + + log::Writer writer(std::move(file)); + WriteBatch batch; + batch.Put("foo", "bar"); + WriteBatchInternal::SetSequence(&batch, 10); + writer.AddRecord(WriteBatchInternal::Contents(&batch)); + + env_->count_sequential_reads_ = true; + // sequential_read_counter_ sanity test + ASSERT_EQ(env_->sequential_read_counter_.Read(), 0); + + ASSERT_OK(dbfull()->TEST_ReadFirstRecord(kAliveLogFile, 1, &s)); + 
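+  // (the WriteBatch above was stamped with sequence 10 via SetSequence,
+  // so this lookup should return 10 after exactly one sequential read)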
ASSERT_EQ(s, 10U);
+  // did a read
+  ASSERT_EQ(env_->sequential_read_counter_.Read(), 1);
+
+  ASSERT_OK(dbfull()->TEST_ReadFirstRecord(kAliveLogFile, 1, &s));
+  ASSERT_EQ(s, 10U);
+  // no new reads since the value is cached
+  ASSERT_EQ(env_->sequential_read_counter_.Read(), 1);
+}
+
+TEST(DBTest, ReadCompaction) {
+  std::string value(4096, '4');  // a string of size 4K
+  {
+    Options options = CurrentOptions();
+    options.create_if_missing = true;
+    options.max_open_files = 20;  // only 10 files in file-cache
+    options.target_file_size_base = 512;
+    options.write_buffer_size = 64 * 1024;
+    options.filter_policy = nullptr;
+    options.block_size = 4096;
+    options.no_block_cache = true;
+    options.disable_seek_compaction = false;
+
+    CreateAndReopenWithCF({"pikachu"}, &options);
+
+    // Write 8MB (2000 values, each 4K)
+    ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+    std::vector<std::string> values;
+    for (int i = 0; i < 2000; i++) {
+      ASSERT_OK(Put(1, Key(i), value));
+    }
+
+    // clear level 0 and 1 if necessary.
+    Flush(1);
+    dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]);
+    dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]);
+    ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+    ASSERT_EQ(NumTableFilesAtLevel(1, 1), 0);
+
+    // write some new keys into level 0
+    for (int i = 0; i < 2000; i = i + 16) {
+      ASSERT_OK(Put(1, Key(i), value));
+    }
+    Flush(1);
+
+    // Wait for any write compaction to finish
+    dbfull()->TEST_WaitForCompact();
+
+    // remember number of files in each level
+    int l1 = NumTableFilesAtLevel(0, 1);
+    int l2 = NumTableFilesAtLevel(1, 1);
+    int l3 = NumTableFilesAtLevel(2, 1);
+    ASSERT_NE(NumTableFilesAtLevel(0, 1), 0);
+    ASSERT_NE(NumTableFilesAtLevel(1, 1), 0);
+    ASSERT_NE(NumTableFilesAtLevel(2, 1), 0);
+
+    // read a bunch of times, trigger read compaction
+    for (int j = 0; j < 100; j++) {
+      for (int i = 0; i < 2000; i++) {
+        Get(1, Key(i));
+      }
+    }
+    // wait for read compaction to finish
+    env_->SleepForMicroseconds(1000000);
+
+    // verify that the number of files has decreased
+    // in some level, indicating that there was a compaction
+    ASSERT_TRUE(NumTableFilesAtLevel(0, 1) < l1 ||
+                NumTableFilesAtLevel(1, 1) < l2 ||
+                NumTableFilesAtLevel(2, 1) < l3);
+  }
+}
+
+// Multi-threaded test:
+namespace {
+
+static const int kColumnFamilies = 10;
+static const int kNumThreads = 10;
+static const int kTestSeconds = 10;
+static const int kNumKeys = 1000;
+
+struct MTState {
+  DBTest* test;
+  port::AtomicPointer stop;
+  port::AtomicPointer counter[kNumThreads];
+  port::AtomicPointer thread_done[kNumThreads];
+};
+
+struct MTThread {
+  MTState* state;
+  int id;
+};
+
+static void MTThreadBody(void* arg) {
+  MTThread* t = reinterpret_cast<MTThread*>(arg);
+  int id = t->id;
+  DB* db = t->state->test->db_;
+  uintptr_t counter = 0;
+  fprintf(stderr, "... starting thread %d\n", id);
+  Random rnd(1000 + id);
+  char valbuf[1500];
+  while (t->state->stop.Acquire_Load() == nullptr) {
+    t->state->counter[id].Release_Store(reinterpret_cast<void*>(counter));
+
+    int key = rnd.Uniform(kNumKeys);
+    char keybuf[20];
+    snprintf(keybuf, sizeof(keybuf), "%016d", key);
+
+    if (rnd.OneIn(2)) {
+      // Write values of the form <key, id, counter, cf, unique_id>
+      // into each of the CFs
+      // We add some padding to force compactions.
+      int unique_id = rnd.Uniform(1000000);
+      WriteBatch batch;
+      for (int cf = 0; cf < kColumnFamilies; ++cf) {
+        snprintf(valbuf, sizeof(valbuf), "%d.%d.%d.%d.%-1000d", key, id,
+                 static_cast<int>(counter), cf, unique_id);
+        batch.Put(t->state->test->handles_[cf], Slice(keybuf), Slice(valbuf));
+      }
+      ASSERT_OK(db->Write(WriteOptions(), &batch));
+    } else {
+      // Read a value and verify that it matches the pattern written above
+      // and that writes to all column families were atomic (unique_id is the
+      // same)
+      std::vector<Slice> keys(kColumnFamilies, Slice(keybuf));
+      std::vector<std::string> values;
+      std::vector<Status> statuses =
+          db->MultiGet(ReadOptions(), t->state->test->handles_, keys, &values);
+      Status s = statuses[0];
+      // all statuses have to be the same
+      for (size_t i = 1; i < statuses.size(); ++i) {
+        // they are either both ok or both not-found
+        ASSERT_TRUE((s.ok() && statuses[i].ok()) ||
+                    (s.IsNotFound() && statuses[i].IsNotFound()));
+      }
+      if (s.IsNotFound()) {
+        // Key has not yet been written
+      } else {
+        // Check that the writer thread counter is >= the counter in the value
+        ASSERT_OK(s);
+        int unique_id = -1;
+        for (int i = 0; i < kColumnFamilies; ++i) {
+          int k, w, c, cf, u;
+          ASSERT_EQ(5, sscanf(values[i].c_str(), "%d.%d.%d.%d.%d", &k, &w,
+                              &c, &cf, &u))
+              << values[i];
+          ASSERT_EQ(k, key);
+          ASSERT_GE(w, 0);
+          ASSERT_LT(w, kNumThreads);
+          ASSERT_LE((unsigned int)c, reinterpret_cast<uintptr_t>(
+                                         t->state->counter[w].Acquire_Load()));
+          ASSERT_EQ(cf, i);
+          if (i == 0) {
+            unique_id = u;
+          } else {
+            // this checks that updates across column families happened
+            // atomically -- all unique ids are the same
+            ASSERT_EQ(u, unique_id);
+          }
+        }
+      }
+    }
+    counter++;
+  }
+  t->state->thread_done[id].Release_Store(t);
+  fprintf(stderr, "... stopping thread %d after %d ops\n", id, int(counter));
+}
+
+}  // namespace
+
+TEST(DBTest, MultiThreaded) {
+  do {
+    std::vector<std::string> cfs;
+    for (int i = 1; i < kColumnFamilies; ++i) {
+      cfs.push_back(std::to_string(i));
+    }
+    CreateAndReopenWithCF(cfs);
+    // Initialize state
+    MTState mt;
+    mt.test = this;
+    mt.stop.Release_Store(0);
+    for (int id = 0; id < kNumThreads; id++) {
+      mt.counter[id].Release_Store(0);
+      mt.thread_done[id].Release_Store(0);
+    }
+
+    // Start threads
+    MTThread thread[kNumThreads];
+    for (int id = 0; id < kNumThreads; id++) {
+      thread[id].state = &mt;
+      thread[id].id = id;
+      env_->StartThread(MTThreadBody, &thread[id]);
+    }
+
+    // Let them run for a while
+    env_->SleepForMicroseconds(kTestSeconds * 1000000);
+
+    // Stop the threads and wait for them to finish
+    mt.stop.Release_Store(&mt);
+    for (int id = 0; id < kNumThreads; id++) {
+      while (mt.thread_done[id].Acquire_Load() == nullptr) {
+        env_->SleepForMicroseconds(100000);
+      }
+    }
+    // skip as HashCuckooRep does not support snapshot
+  } while (ChangeOptions(kSkipHashCuckoo));
+}
+
+// Group commit test:
+namespace {
+
+static const int kGCNumThreads = 4;
+static const int kGCNumKeys = 1000;
+
+struct GCThread {
+  DB* db;
+  int id;
+  std::atomic<bool> done;
+};
+
+static void GCThreadBody(void* arg) {
+  GCThread* t = reinterpret_cast<GCThread*>(arg);
+  int id = t->id;
+  DB* db = t->db;
+  WriteOptions wo;
+
+  for (int i = 0; i < kGCNumKeys; ++i) {
+    std::string kv(std::to_string(i + id * kGCNumKeys));
+    ASSERT_OK(db->Put(wo, kv, kv));
+  }
+  t->done = true;
+}
+
+}  // namespace
+
+TEST(DBTest, GroupCommitTest) {
+  do {
+    Options options = CurrentOptions();
+    options.statistics = rocksdb::CreateDBStatistics();
+    Reopen(&options);
+
+    // Start threads
+    GCThread thread[kGCNumThreads];
+    for (int id = 0; id < kGCNumThreads; id++) {
+      thread[id].id = id;
+      thread[id].db = db_;
+      thread[id].done = false;
+      env_->StartThread(GCThreadBody, &thread[id]);
+    }
+
+    for (int id = 0; id < kGCNumThreads; id++) {
+      while (thread[id].done == false) {
+        env_->SleepForMicroseconds(100000);
+      }
+    }
+    ASSERT_GT(TestGetTickerCount(options, WRITE_DONE_BY_OTHER), 0);
+
+    std::vector<std::string> expected_db;
+    for (int i = 0; i < kGCNumThreads * kGCNumKeys; ++i) {
+      expected_db.push_back(std::to_string(i));
+    }
+    sort(expected_db.begin(), expected_db.end());
+
+    Iterator* itr = db_->NewIterator(ReadOptions());
+    itr->SeekToFirst();
+    for (auto x : expected_db) {
+      ASSERT_TRUE(itr->Valid());
+      ASSERT_EQ(itr->key().ToString(), x);
+      ASSERT_EQ(itr->value().ToString(), x);
+      itr->Next();
+    }
+    ASSERT_TRUE(!itr->Valid());
+    delete itr;
+
+  } while (ChangeOptions(kSkipNoSeekToLast));
+}
+
+namespace {
+typedef std::map<std::string, std::string> KVMap;
+}
+
+class ModelDB: public DB {
+ public:
+  class ModelSnapshot : public Snapshot {
+   public:
+    KVMap map_;
+  };
+
+  explicit ModelDB(const Options& options) : options_(options) {}
+  using DB::Put;
+  virtual Status Put(const WriteOptions& o, ColumnFamilyHandle* cf,
+                     const Slice& k, const Slice& v) {
+    WriteBatch batch;
+    batch.Put(cf, k, v);
+    return Write(o, &batch);
+  }
+  using DB::Merge;
+  virtual Status Merge(const WriteOptions& o, ColumnFamilyHandle* cf,
+                       const Slice& k, const Slice& v) {
+    WriteBatch batch;
+    batch.Merge(cf, k, v);
+    return Write(o, &batch);
+  }
+  using DB::Delete;
+  virtual Status Delete(const WriteOptions& o, ColumnFamilyHandle* cf,
+                        const Slice& key) {
+    WriteBatch batch;
+    batch.Delete(cf, key);
+    return Write(o, &batch);
+  }
+  using DB::Get;
+  virtual Status Get(const ReadOptions& options, ColumnFamilyHandle* cf,
+                     const Slice& key, std::string* value) {
+    return Status::NotSupported(key);
+  }
+
+  using DB::MultiGet;
+  virtual std::vector<Status> MultiGet(
+      const ReadOptions& options,
+      const std::vector<ColumnFamilyHandle*>& column_family,
+      const std::vector<Slice>& keys, std::vector<std::string>* values) {
+    std::vector<Status> s(keys.size(),
+                          Status::NotSupported("Not implemented."));
+    return s;
+  }
+
+  using DB::GetPropertiesOfAllTables;
+  virtual Status GetPropertiesOfAllTables(ColumnFamilyHandle* column_family,
+                                          TablePropertiesCollection* props) {
+    return Status();
+  }
+
+  using DB::KeyMayExist;
+  virtual bool KeyMayExist(const ReadOptions& options,
+                           ColumnFamilyHandle* column_family, const Slice& key,
+                           std::string* value, bool* value_found = nullptr) {
+    if (value_found != nullptr) {
+      *value_found = false;
+    }
+    return true;  // Not Supported directly
+  }
+  using DB::NewIterator;
+  virtual Iterator* NewIterator(const ReadOptions& options,
+                                ColumnFamilyHandle* column_family) {
+    if (options.snapshot == nullptr) {
+      KVMap* saved = new KVMap;
+      *saved = map_;
+      return new ModelIter(saved, true);
+    } else {
+      const KVMap* snapshot_state =
+          &(reinterpret_cast<const ModelSnapshot*>(options.snapshot)->map_);
+      return new ModelIter(snapshot_state, false);
+    }
+  }
+  virtual Status NewIterators(
+      const ReadOptions& options,
+      const std::vector<ColumnFamilyHandle*>& column_family,
+      std::vector<Iterator*>* iterators) {
+    return Status::NotSupported("Not supported yet");
+  }
+  virtual const Snapshot* GetSnapshot() {
+    ModelSnapshot* snapshot = new ModelSnapshot;
+    snapshot->map_ = map_;
+    return snapshot;
+  }
+
+  virtual void ReleaseSnapshot(const Snapshot* snapshot) {
+    delete reinterpret_cast<const ModelSnapshot*>(snapshot);
+  }
+
+  virtual Status Write(const WriteOptions& options, WriteBatch* batch) {
+    class Handler : public WriteBatch::Handler {
+     public:
+      KVMap* map_;
+      virtual void Put(const Slice& key, const
Slice& value) { + (*map_)[key.ToString()] = value.ToString(); + } + virtual void Merge(const Slice& key, const Slice& value) { + // ignore merge for now + //(*map_)[key.ToString()] = value.ToString(); + } + virtual void Delete(const Slice& key) { + map_->erase(key.ToString()); + } + }; + Handler handler; + handler.map_ = &map_; + return batch->Iterate(&handler); + } + + using DB::GetProperty; + virtual bool GetProperty(ColumnFamilyHandle* column_family, + const Slice& property, std::string* value) { + return false; + } + using DB::GetApproximateSizes; + virtual void GetApproximateSizes(ColumnFamilyHandle* column_family, + const Range* range, int n, uint64_t* sizes) { + for (int i = 0; i < n; i++) { + sizes[i] = 0; + } + } + using DB::CompactRange; + virtual Status CompactRange(ColumnFamilyHandle* column_family, + const Slice* start, const Slice* end, + bool reduce_level, int target_level) { + return Status::NotSupported("Not supported operation."); + } + + using DB::NumberLevels; + virtual int NumberLevels(ColumnFamilyHandle* column_family) { return 1; } + + using DB::MaxMemCompactionLevel; + virtual int MaxMemCompactionLevel(ColumnFamilyHandle* column_family) { + return 1; + } + + using DB::Level0StopWriteTrigger; + virtual int Level0StopWriteTrigger(ColumnFamilyHandle* column_family) { + return -1; + } + + virtual const std::string& GetName() const { + return name_; + } + + virtual Env* GetEnv() const { + return nullptr; + } + + using DB::GetOptions; + virtual const Options& GetOptions(ColumnFamilyHandle* column_family) const { + return options_; + } + + using DB::Flush; + virtual Status Flush(const rocksdb::FlushOptions& options, + ColumnFamilyHandle* column_family) { + Status ret; + return ret; + } + + virtual Status DisableFileDeletions() { + return Status::OK(); + } + virtual Status EnableFileDeletions(bool force) { + return Status::OK(); + } + virtual Status GetLiveFiles(std::vector&, uint64_t* size, + bool flush_memtable = true) { + return Status::OK(); + } + + virtual Status GetSortedWalFiles(VectorLogPtr& files) { + return Status::OK(); + } + + virtual Status DeleteFile(std::string name) { + return Status::OK(); + } + + virtual Status GetDbIdentity(std::string& identity) { + return Status::OK(); + } + + virtual SequenceNumber GetLatestSequenceNumber() const { + return 0; + } + virtual Status GetUpdatesSince( + rocksdb::SequenceNumber, unique_ptr*, + const TransactionLogIterator::ReadOptions& + read_options = TransactionLogIterator::ReadOptions()) { + return Status::NotSupported("Not supported in Model DB"); + } + + virtual ColumnFamilyHandle* DefaultColumnFamily() const { return nullptr; } + + private: + class ModelIter: public Iterator { + public: + ModelIter(const KVMap* map, bool owned) + : map_(map), owned_(owned), iter_(map_->end()) { + } + ~ModelIter() { + if (owned_) delete map_; + } + virtual bool Valid() const { return iter_ != map_->end(); } + virtual void SeekToFirst() { iter_ = map_->begin(); } + virtual void SeekToLast() { + if (map_->empty()) { + iter_ = map_->end(); + } else { + iter_ = map_->find(map_->rbegin()->first); + } + } + virtual void Seek(const Slice& k) { + iter_ = map_->lower_bound(k.ToString()); + } + virtual void Next() { ++iter_; } + virtual void Prev() { --iter_; } + virtual Slice key() const { return iter_->first; } + virtual Slice value() const { return iter_->second; } + virtual Status status() const { return Status::OK(); } + private: + const KVMap* const map_; + const bool owned_; // Do we own map_ + KVMap::const_iterator iter_; + }; + const 
Options options_; + KVMap map_; + std::string name_ = ""; +}; + +static std::string RandomKey(Random* rnd, int minimum = 0) { + int len; + do { + len = (rnd->OneIn(3) + ? 1 // Short sometimes to encourage collisions + : (rnd->OneIn(100) ? rnd->Skewed(10) : rnd->Uniform(10))); + } while (len < minimum); + return test::RandomKey(rnd, len); +} + +static bool CompareIterators(int step, + DB* model, + DB* db, + const Snapshot* model_snap, + const Snapshot* db_snap) { + ReadOptions options; + options.snapshot = model_snap; + Iterator* miter = model->NewIterator(options); + options.snapshot = db_snap; + Iterator* dbiter = db->NewIterator(options); + bool ok = true; + int count = 0; + for (miter->SeekToFirst(), dbiter->SeekToFirst(); + ok && miter->Valid() && dbiter->Valid(); + miter->Next(), dbiter->Next()) { + count++; + if (miter->key().compare(dbiter->key()) != 0) { + fprintf(stderr, "step %d: Key mismatch: '%s' vs. '%s'\n", + step, + EscapeString(miter->key()).c_str(), + EscapeString(dbiter->key()).c_str()); + ok = false; + break; + } + + if (miter->value().compare(dbiter->value()) != 0) { + fprintf(stderr, "step %d: Value mismatch for key '%s': '%s' vs. '%s'\n", + step, + EscapeString(miter->key()).c_str(), + EscapeString(miter->value()).c_str(), + EscapeString(miter->value()).c_str()); + ok = false; + } + } + + if (ok) { + if (miter->Valid() != dbiter->Valid()) { + fprintf(stderr, "step %d: Mismatch at end of iterators: %d vs. %d\n", + step, miter->Valid(), dbiter->Valid()); + ok = false; + } + } + delete miter; + delete dbiter; + return ok; +} + +TEST(DBTest, Randomized) { + Random rnd(test::RandomSeed()); + do { + ModelDB model(CurrentOptions()); + const int N = 10000; + const Snapshot* model_snap = nullptr; + const Snapshot* db_snap = nullptr; + std::string k, v; + for (int step = 0; step < N; step++) { + // TODO(sanjay): Test Get() works + int p = rnd.Uniform(100); + int minimum = 0; + if (option_config_ == kHashSkipList || + option_config_ == kHashLinkList || + option_config_ == kHashCuckoo || + option_config_ == kPlainTableFirstBytePrefix || + option_config_ == kBlockBasedTableWithWholeKeyHashIndex || + option_config_ == kBlockBasedTableWithPrefixHashIndex) { + minimum = 1; + } + if (p < 45) { // Put + k = RandomKey(&rnd, minimum); + v = RandomString(&rnd, + rnd.OneIn(20) + ? 
100 + rnd.Uniform(100)
+                               : rnd.Uniform(8));
+        ASSERT_OK(model.Put(WriteOptions(), k, v));
+        ASSERT_OK(db_->Put(WriteOptions(), k, v));
+
+      } else if (p < 90) {  // Delete
+        k = RandomKey(&rnd, minimum);
+        ASSERT_OK(model.Delete(WriteOptions(), k));
+        ASSERT_OK(db_->Delete(WriteOptions(), k));
+
+      } else {  // Multi-element batch
+        WriteBatch b;
+        const int num = rnd.Uniform(8);
+        for (int i = 0; i < num; i++) {
+          if (i == 0 || !rnd.OneIn(10)) {
+            k = RandomKey(&rnd, minimum);
+          } else {
+            // Periodically re-use the same key from the previous iter, so
+            // we have multiple entries in the write batch for the same key
+          }
+          if (rnd.OneIn(2)) {
+            v = RandomString(&rnd, rnd.Uniform(10));
+            b.Put(k, v);
+          } else {
+            b.Delete(k);
+          }
+        }
+        ASSERT_OK(model.Write(WriteOptions(), &b));
+        ASSERT_OK(db_->Write(WriteOptions(), &b));
+      }
+
+      if ((step % 100) == 0) {
+        ASSERT_TRUE(CompareIterators(step, &model, db_, nullptr, nullptr));
+        ASSERT_TRUE(CompareIterators(step, &model, db_, model_snap, db_snap));
+
+        // Save a snapshot from each DB this time that we'll use next
+        // time we compare things, to make sure the current state is
+        // preserved with the snapshot
+        if (model_snap != nullptr) model.ReleaseSnapshot(model_snap);
+        if (db_snap != nullptr) db_->ReleaseSnapshot(db_snap);
+
+        Reopen();
+        ASSERT_TRUE(CompareIterators(step, &model, db_, nullptr, nullptr));
+
+        model_snap = model.GetSnapshot();
+        db_snap = db_->GetSnapshot();
+      }
+    }
+    if (model_snap != nullptr) model.ReleaseSnapshot(model_snap);
+    if (db_snap != nullptr) db_->ReleaseSnapshot(db_snap);
+    // skip cuckoo hash as it does not support snapshot.
+  } while (ChangeOptions(kSkipDeletesFilterFirst |
+                         kSkipNoSeekToLast | kSkipHashCuckoo));
+}
+
+TEST(DBTest, MultiGetSimple) {
+  do {
+    CreateAndReopenWithCF({"pikachu"});
+    ASSERT_OK(Put(1, "k1", "v1"));
+    ASSERT_OK(Put(1, "k2", "v2"));
+    ASSERT_OK(Put(1, "k3", "v3"));
+    ASSERT_OK(Put(1, "k4", "v4"));
+    ASSERT_OK(Delete(1, "k4"));
+    ASSERT_OK(Put(1, "k5", "v5"));
+    ASSERT_OK(Delete(1, "no_key"));
+
+    std::vector<Slice> keys({"k1", "k2", "k3", "k4", "k5", "no_key"});
+
+    std::vector<std::string> values(20, "Temporary data to be overwritten");
+    std::vector<ColumnFamilyHandle*> cfs(keys.size(), handles_[1]);
+
+    std::vector<Status> s = db_->MultiGet(ReadOptions(), cfs, keys, &values);
+    ASSERT_EQ(values.size(), keys.size());
+    ASSERT_EQ(values[0], "v1");
+    ASSERT_EQ(values[1], "v2");
+    ASSERT_EQ(values[2], "v3");
+    ASSERT_EQ(values[4], "v5");
+
+    ASSERT_OK(s[0]);
+    ASSERT_OK(s[1]);
+    ASSERT_OK(s[2]);
+    ASSERT_TRUE(s[3].IsNotFound());
+    ASSERT_OK(s[4]);
+    ASSERT_TRUE(s[5].IsNotFound());
+  } while (ChangeCompactOptions());
+}
+
+TEST(DBTest, MultiGetEmpty) {
+  do {
+    CreateAndReopenWithCF({"pikachu"});
+    // Empty Key Set
+    std::vector<Slice> keys;
+    std::vector<std::string> values;
+    std::vector<ColumnFamilyHandle*> cfs;
+    std::vector<Status> s = db_->MultiGet(ReadOptions(), cfs, keys, &values);
+    ASSERT_EQ(s.size(), 0U);
+
+    // Empty Database, Empty Key Set
+    DestroyAndReopen();
+    CreateAndReopenWithCF({"pikachu"});
+    s = db_->MultiGet(ReadOptions(), cfs, keys, &values);
+    ASSERT_EQ(s.size(), 0U);
+
+    // Empty Database, Search for Keys
+    keys.resize(2);
+    keys[0] = "a";
+    keys[1] = "b";
+    cfs.push_back(handles_[0]);
+    cfs.push_back(handles_[1]);
+    s = db_->MultiGet(ReadOptions(), cfs, keys, &values);
+    ASSERT_EQ((int)s.size(), 2);
+    ASSERT_TRUE(s[0].IsNotFound() && s[1].IsNotFound());
+  } while (ChangeCompactOptions());
+}
+
+namespace {
+void PrefixScanInit(DBTest *dbtest) {
+  char buf[100];
+  std::string keystr;
+  const int small_range_sstfiles = 5;
+  const int big_range_sstfiles = 5;
+
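+  // keys look like "NN______:start"; the first 8 bytes ("NN______") form
+  // the fixed prefix that the bloom-filter seeks below rely on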
+ // Generate 11 sst files with the following prefix ranges. + // GROUP 0: [0,10] (level 1) + // GROUP 1: [1,2], [2,3], [3,4], [4,5], [5, 6] (level 0) + // GROUP 2: [0,6], [0,7], [0,8], [0,9], [0,10] (level 0) + // + // A seek with the previous API would do 11 random I/Os (to all the + // files). With the new API and a prefix filter enabled, we should + // only do 2 random I/O, to the 2 files containing the key. + + // GROUP 0 + snprintf(buf, sizeof(buf), "%02d______:start", 0); + keystr = std::string(buf); + ASSERT_OK(dbtest->Put(keystr, keystr)); + snprintf(buf, sizeof(buf), "%02d______:end", 10); + keystr = std::string(buf); + ASSERT_OK(dbtest->Put(keystr, keystr)); + dbtest->Flush(); + dbtest->dbfull()->CompactRange(nullptr, nullptr); // move to level 1 + + // GROUP 1 + for (int i = 1; i <= small_range_sstfiles; i++) { + snprintf(buf, sizeof(buf), "%02d______:start", i); + keystr = std::string(buf); + ASSERT_OK(dbtest->Put(keystr, keystr)); + snprintf(buf, sizeof(buf), "%02d______:end", i+1); + keystr = std::string(buf); + ASSERT_OK(dbtest->Put(keystr, keystr)); + dbtest->Flush(); + } + + // GROUP 2 + for (int i = 1; i <= big_range_sstfiles; i++) { + std::string keystr; + snprintf(buf, sizeof(buf), "%02d______:start", 0); + keystr = std::string(buf); + ASSERT_OK(dbtest->Put(keystr, keystr)); + snprintf(buf, sizeof(buf), "%02d______:end", + small_range_sstfiles+i+1); + keystr = std::string(buf); + ASSERT_OK(dbtest->Put(keystr, keystr)); + dbtest->Flush(); + } +} +} // namespace + +TEST(DBTest, PrefixScan) { + int count; + Slice prefix; + Slice key; + char buf[100]; + Iterator* iter; + snprintf(buf, sizeof(buf), "03______:"); + prefix = Slice(buf, 8); + key = Slice(buf, 9); + // db configs + env_->count_random_reads_ = true; + Options options = CurrentOptions(); + options.env = env_; + options.no_block_cache = true; + options.filter_policy = NewBloomFilterPolicy(10); + options.prefix_extractor.reset(NewFixedPrefixTransform(8)); + options.whole_key_filtering = false; + options.disable_auto_compactions = true; + options.max_background_compactions = 2; + options.create_if_missing = true; + options.disable_seek_compaction = true; + options.memtable_factory.reset(NewHashSkipListRepFactory()); + + // 11 RAND I/Os + DestroyAndReopen(&options); + PrefixScanInit(this); + count = 0; + env_->random_read_counter_.Reset(); + iter = db_->NewIterator(ReadOptions()); + for (iter->Seek(prefix); iter->Valid(); iter->Next()) { + if (! 
iter->key().starts_with(prefix)) { + break; + } + count++; + } + ASSERT_OK(iter->status()); + delete iter; + ASSERT_EQ(count, 2); + ASSERT_EQ(env_->random_read_counter_.Read(), 2); + Close(); + delete options.filter_policy; +} + +TEST(DBTest, TailingIteratorSingle) { + ReadOptions read_options; + read_options.tailing = true; + + std::unique_ptr iter(db_->NewIterator(read_options)); + iter->SeekToFirst(); + ASSERT_TRUE(!iter->Valid()); + + // add a record and check that iter can see it + ASSERT_OK(db_->Put(WriteOptions(), "mirko", "fodor")); + iter->SeekToFirst(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().ToString(), "mirko"); + + iter->Next(); + ASSERT_TRUE(!iter->Valid()); +} + +TEST(DBTest, TailingIteratorKeepAdding) { + CreateAndReopenWithCF({"pikachu"}); + ReadOptions read_options; + read_options.tailing = true; + + std::unique_ptr iter(db_->NewIterator(read_options, handles_[1])); + std::string value(1024, 'a'); + + const int num_records = 10000; + for (int i = 0; i < num_records; ++i) { + char buf[32]; + snprintf(buf, sizeof(buf), "%016d", i); + + Slice key(buf, 16); + ASSERT_OK(Put(1, key, value)); + + iter->Seek(key); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(key), 0); + } +} + +TEST(DBTest, TailingIteratorDeletes) { + CreateAndReopenWithCF({"pikachu"}); + ReadOptions read_options; + read_options.tailing = true; + + std::unique_ptr iter(db_->NewIterator(read_options, handles_[1])); + + // write a single record, read it using the iterator, then delete it + ASSERT_OK(Put(1, "0test", "test")); + iter->SeekToFirst(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().ToString(), "0test"); + ASSERT_OK(Delete(1, "0test")); + + // write many more records + const int num_records = 10000; + std::string value(1024, 'A'); + + for (int i = 0; i < num_records; ++i) { + char buf[32]; + snprintf(buf, sizeof(buf), "1%015d", i); + + Slice key(buf, 16); + ASSERT_OK(Put(1, key, value)); + } + + // force a flush to make sure that no records are read from memtable + ASSERT_OK(Flush(1)); + + // skip "0test" + iter->Next(); + + // make sure we can read all new records using the existing iterator + int count = 0; + for (; iter->Valid(); iter->Next(), ++count) ; + + ASSERT_EQ(count, num_records); +} + +TEST(DBTest, TailingIteratorPrefixSeek) { + ReadOptions read_options; + read_options.tailing = true; + + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + options.disable_auto_compactions = true; + options.prefix_extractor.reset(NewFixedPrefixTransform(2)); + options.memtable_factory.reset(NewHashSkipListRepFactory()); + DestroyAndReopen(&options); + CreateAndReopenWithCF({"pikachu"}, &options); + + std::unique_ptr iter(db_->NewIterator(read_options, handles_[1])); + ASSERT_OK(Put(1, "0101", "test")); + + ASSERT_OK(Flush(1)); + + ASSERT_OK(Put(1, "0202", "test")); + + // Seek(0102) shouldn't find any records since 0202 has a different prefix + iter->Seek("0102"); + ASSERT_TRUE(!iter->Valid()); + + iter->Seek("0202"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().ToString(), "0202"); + + iter->Next(); + ASSERT_TRUE(!iter->Valid()); +} + +TEST(DBTest, ChecksumTest) { + BlockBasedTableOptions table_options; + Options options = CurrentOptions(); + + table_options.checksum = kCRC32c; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + Reopen(&options); + ASSERT_OK(Put("a", "b")); + ASSERT_OK(Put("c", "d")); + ASSERT_OK(Flush()); // table with crc checksum + + table_options.checksum = kxxHash; + 
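+  // a fresh table factory must be installed for the checksum change to
+  // take effect; tables already on disk keep the checksum type recorded
+  // in their footer at write time, which the mixed reads below rely on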
options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + Reopen(&options); + ASSERT_OK(Put("e", "f")); + ASSERT_OK(Put("g", "h")); + ASSERT_OK(Flush()); // table with xxhash checksum + + table_options.checksum = kCRC32c; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + Reopen(&options); + ASSERT_EQ("b", Get("a")); + ASSERT_EQ("d", Get("c")); + ASSERT_EQ("f", Get("e")); + ASSERT_EQ("h", Get("g")); + + table_options.checksum = kCRC32c; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + Reopen(&options); + ASSERT_EQ("b", Get("a")); + ASSERT_EQ("d", Get("c")); + ASSERT_EQ("f", Get("e")); + ASSERT_EQ("h", Get("g")); +} + +TEST(DBTest, FIFOCompactionTest) { + for (int iter = 0; iter < 2; ++iter) { + // first iteration -- auto compaction + // second iteration -- manual compaction + Options options; + options.compaction_style = kCompactionStyleFIFO; + options.write_buffer_size = 100 << 10; // 100KB + options.compaction_options_fifo.max_table_files_size = 500 << 10; // 500KB + options.compression = kNoCompression; + options.create_if_missing = true; + if (iter == 1) { + options.disable_auto_compactions = true; + } + DestroyAndReopen(&options); + + Random rnd(301); + for (int i = 0; i < 6; ++i) { + for (int j = 0; j < 100; ++j) { + ASSERT_OK(Put(std::to_string(i * 100 + j), RandomString(&rnd, 1024))); + } + // flush should happen here + } + if (iter == 0) { + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + } else { + ASSERT_OK(db_->CompactRange(nullptr, nullptr)); + } + // only 5 files should survive + ASSERT_EQ(NumTableFilesAtLevel(0), 5); + for (int i = 0; i < 50; ++i) { + // these keys should be deleted in previous compaction + ASSERT_EQ("NOT_FOUND", Get(std::to_string(i))); + } + } +} +} // namespace rocksdb + +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +} diff --git a/db/dbformat.cc b/db/dbformat.cc new file mode 100644 index 0000000000..e53d16dc1b --- /dev/null +++ b/db/dbformat.cc @@ -0,0 +1,169 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
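+
+// An internal key is the user key plus an 8-byte trailer: a 56-bit
+// sequence number packed with an 8-bit ValueType, written little-endian
+// by PutFixed64. A worked example with illustrative values: user key
+// "foo", sequence 100, kTypeValue (0x1) encodes as
+//
+//   "foo" + EncodeFixed64((100 << 8) | 0x1)
+//
+// i.e. trailer bytes 01 64 00 00 00 00 00 00, low byte first.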
+#include "db/dbformat.h" + +#include +#include "port/port.h" +#include "util/coding.h" +#include "util/perf_context_imp.h" + +namespace rocksdb { + +uint64_t PackSequenceAndType(uint64_t seq, ValueType t) { + assert(seq <= kMaxSequenceNumber); + assert(t <= kValueTypeForSeek); + return (seq << 8) | t; +} + +void AppendInternalKey(std::string* result, const ParsedInternalKey& key) { + result->append(key.user_key.data(), key.user_key.size()); + PutFixed64(result, PackSequenceAndType(key.sequence, key.type)); +} + +std::string ParsedInternalKey::DebugString(bool hex) const { + char buf[50]; + snprintf(buf, sizeof(buf), "' @ %llu : %d", + (unsigned long long) sequence, + int(type)); + std::string result = "'"; + result += user_key.ToString(hex); + result += buf; + return result; +} + +std::string InternalKey::DebugString(bool hex) const { + std::string result; + ParsedInternalKey parsed; + if (ParseInternalKey(rep_, &parsed)) { + result = parsed.DebugString(hex); + } else { + result = "(bad)"; + result.append(EscapeString(rep_)); + } + return result; +} + +const char* InternalKeyComparator::Name() const { + return name_.c_str(); +} + +int InternalKeyComparator::Compare(const Slice& akey, const Slice& bkey) const { + // Order by: + // increasing user key (according to user-supplied comparator) + // decreasing sequence number + // decreasing type (though sequence# should be enough to disambiguate) + int r = user_comparator_->Compare(ExtractUserKey(akey), ExtractUserKey(bkey)); + PERF_COUNTER_ADD(user_key_comparison_count, 1); + if (r == 0) { + const uint64_t anum = DecodeFixed64(akey.data() + akey.size() - 8); + const uint64_t bnum = DecodeFixed64(bkey.data() + bkey.size() - 8); + if (anum > bnum) { + r = -1; + } else if (anum < bnum) { + r = +1; + } + } + return r; +} + +int InternalKeyComparator::Compare(const ParsedInternalKey& a, + const ParsedInternalKey& b) const { + // Order by: + // increasing user key (according to user-supplied comparator) + // decreasing sequence number + // decreasing type (though sequence# should be enough to disambiguate) + int r = user_comparator_->Compare(a.user_key, b.user_key); + PERF_COUNTER_ADD(user_key_comparison_count, 1); + if (r == 0) { + if (a.sequence > b.sequence) { + r = -1; + } else if (a.sequence < b.sequence) { + r = +1; + } else if (a.type > b.type) { + r = -1; + } else if (a.type < b.type) { + r = +1; + } + } + return r; +} + +void InternalKeyComparator::FindShortestSeparator( + std::string* start, + const Slice& limit) const { + // Attempt to shorten the user portion of the key + Slice user_start = ExtractUserKey(*start); + Slice user_limit = ExtractUserKey(limit); + std::string tmp(user_start.data(), user_start.size()); + user_comparator_->FindShortestSeparator(&tmp, user_limit); + if (tmp.size() < user_start.size() && + user_comparator_->Compare(user_start, tmp) < 0) { + // User key has become shorter physically, but larger logically. + // Tack on the earliest possible number to the shortened user key. 
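+    // (kMaxSequenceNumber packed with kValueTypeForSeek is the tag that
+    // sorts first among all internal keys sharing this user key, since
+    // Compare() above orders equal user keys by decreasing tag)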
+ PutFixed64(&tmp, PackSequenceAndType(kMaxSequenceNumber,kValueTypeForSeek)); + assert(this->Compare(*start, tmp) < 0); + assert(this->Compare(tmp, limit) < 0); + start->swap(tmp); + } +} + +void InternalKeyComparator::FindShortSuccessor(std::string* key) const { + Slice user_key = ExtractUserKey(*key); + std::string tmp(user_key.data(), user_key.size()); + user_comparator_->FindShortSuccessor(&tmp); + if (tmp.size() < user_key.size() && + user_comparator_->Compare(user_key, tmp) < 0) { + // User key has become shorter physically, but larger logically. + // Tack on the earliest possible number to the shortened user key. + PutFixed64(&tmp, PackSequenceAndType(kMaxSequenceNumber,kValueTypeForSeek)); + assert(this->Compare(*key, tmp) < 0); + key->swap(tmp); + } +} + +const char* InternalFilterPolicy::Name() const { + return user_policy_->Name(); +} + +void InternalFilterPolicy::CreateFilter(const Slice* keys, int n, + std::string* dst) const { + // We rely on the fact that the code in table.cc does not mind us + // adjusting keys[]. + Slice* mkey = const_cast(keys); + for (int i = 0; i < n; i++) { + mkey[i] = ExtractUserKey(keys[i]); + // TODO(sanjay): Suppress dups? + } + user_policy_->CreateFilter(keys, n, dst); +} + +bool InternalFilterPolicy::KeyMayMatch(const Slice& key, const Slice& f) const { + return user_policy_->KeyMayMatch(ExtractUserKey(key), f); +} + +LookupKey::LookupKey(const Slice& user_key, SequenceNumber s) { + size_t usize = user_key.size(); + size_t needed = usize + 13; // A conservative estimate + char* dst; + if (needed <= sizeof(space_)) { + dst = space_; + } else { + dst = new char[needed]; + } + start_ = dst; + dst = EncodeVarint32(dst, usize + 8); + kstart_ = dst; + memcpy(dst, user_key.data(), usize); + dst += usize; + EncodeFixed64(dst, PackSequenceAndType(s, kValueTypeForSeek)); + dst += 8; + end_ = dst; +} + +} // namespace rocksdb diff --git a/db/dbformat.h b/db/dbformat.h new file mode 100644 index 0000000000..9640372d72 --- /dev/null +++ b/db/dbformat.h @@ -0,0 +1,345 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include +#include "rocksdb/comparator.h" +#include "rocksdb/db.h" +#include "rocksdb/filter_policy.h" +#include "rocksdb/slice.h" +#include "rocksdb/slice_transform.h" +#include "rocksdb/table.h" +#include "rocksdb/types.h" +#include "util/coding.h" +#include "util/logging.h" + +namespace rocksdb { + +class InternalKey; + +// Value types encoded as the last component of internal keys. +// DO NOT CHANGE THESE ENUM VALUES: they are embedded in the on-disk +// data structures. +// The highest bit of the value type needs to be reserved to SST tables +// for them to do more flexible encoding. +enum ValueType : unsigned char { + kTypeDeletion = 0x0, + kTypeValue = 0x1, + kTypeMerge = 0x2, + // Following types are used only in write ahead logs. 
They are not used in + // memtables or sst files: + kTypeLogData = 0x3, + kTypeColumnFamilyDeletion = 0x4, + kTypeColumnFamilyValue = 0x5, + kTypeColumnFamilyMerge = 0x6, + kMaxValue = 0x7F +}; + +// kValueTypeForSeek defines the ValueType that should be passed when +// constructing a ParsedInternalKey object for seeking to a particular +// sequence number (since we sort sequence numbers in decreasing order +// and the value type is embedded as the low 8 bits in the sequence +// number in internal keys, we need to use the highest-numbered +// ValueType, not the lowest). +static const ValueType kValueTypeForSeek = kTypeMerge; + +// We leave eight bits empty at the bottom so a type and sequence# +// can be packed together into 64-bits. +static const SequenceNumber kMaxSequenceNumber = + ((0x1ull << 56) - 1); + +struct ParsedInternalKey { + Slice user_key; + SequenceNumber sequence; + ValueType type; + + ParsedInternalKey() { } // Intentionally left uninitialized (for speed) + ParsedInternalKey(const Slice& u, const SequenceNumber& seq, ValueType t) + : user_key(u), sequence(seq), type(t) { } + std::string DebugString(bool hex = false) const; +}; + +// Return the length of the encoding of "key". +inline size_t InternalKeyEncodingLength(const ParsedInternalKey& key) { + return key.user_key.size() + 8; +} + +extern uint64_t PackSequenceAndType(uint64_t seq, ValueType t); + +// Append the serialization of "key" to *result. +extern void AppendInternalKey(std::string* result, + const ParsedInternalKey& key); + +// Attempt to parse an internal key from "internal_key". On success, +// stores the parsed data in "*result", and returns true. +// +// On error, returns false, leaves "*result" in an undefined state. +extern bool ParseInternalKey(const Slice& internal_key, + ParsedInternalKey* result); + +// Returns the user key portion of an internal key. +inline Slice ExtractUserKey(const Slice& internal_key) { + assert(internal_key.size() >= 8); + return Slice(internal_key.data(), internal_key.size() - 8); +} + +inline ValueType ExtractValueType(const Slice& internal_key) { + assert(internal_key.size() >= 8); + const size_t n = internal_key.size(); + uint64_t num = DecodeFixed64(internal_key.data() + n - 8); + unsigned char c = num & 0xff; + return static_cast(c); +} + +// A comparator for internal keys that uses a specified comparator for +// the user key portion and breaks ties by decreasing sequence number. 
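+// For instance, with equal user keys, ("foo", seq=100) orders before
+// ("foo", seq=99): the comparator below compares the packed 8-byte tag in
+// decreasing numeric order. A sketch of the tag round-trip (illustrative
+// local names, not part of this header):
+//
+//   uint64_t tag = PackSequenceAndType(100, kTypeValue);  // (100 << 8) | 0x1
+//   SequenceNumber seq = tag >> 8;                        // 100 again
+//   ValueType t = static_cast<ValueType>(tag & 0xff);     // kTypeValue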
+class InternalKeyComparator : public Comparator { + private: + const Comparator* user_comparator_; + std::string name_; + public: + explicit InternalKeyComparator(const Comparator* c) : user_comparator_(c), + name_("rocksdb.InternalKeyComparator:" + + std::string(user_comparator_->Name())) { + } + virtual ~InternalKeyComparator() {} + + virtual const char* Name() const; + virtual int Compare(const Slice& a, const Slice& b) const; + virtual void FindShortestSeparator( + std::string* start, + const Slice& limit) const; + virtual void FindShortSuccessor(std::string* key) const; + + const Comparator* user_comparator() const { return user_comparator_; } + + int Compare(const InternalKey& a, const InternalKey& b) const; + int Compare(const ParsedInternalKey& a, const ParsedInternalKey& b) const; +}; + +// Filter policy wrapper that converts from internal keys to user keys +class InternalFilterPolicy : public FilterPolicy { + private: + const FilterPolicy* const user_policy_; + public: + explicit InternalFilterPolicy(const FilterPolicy* p) : user_policy_(p) { } + virtual const char* Name() const; + virtual void CreateFilter(const Slice* keys, int n, std::string* dst) const; + virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const; +}; + +// Modules in this directory should keep internal keys wrapped inside +// the following class instead of plain strings so that we do not +// incorrectly use string comparisons instead of an InternalKeyComparator. +class InternalKey { + private: + std::string rep_; + public: + InternalKey() { } // Leave rep_ as empty to indicate it is invalid + InternalKey(const Slice& user_key, SequenceNumber s, ValueType t) { + AppendInternalKey(&rep_, ParsedInternalKey(user_key, s, t)); + } + + bool Valid() const { + ParsedInternalKey parsed; + return ParseInternalKey(Slice(rep_), &parsed); + } + + void DecodeFrom(const Slice& s) { rep_.assign(s.data(), s.size()); } + Slice Encode() const { + assert(!rep_.empty()); + return rep_; + } + + Slice user_key() const { return ExtractUserKey(rep_); } + + void SetFrom(const ParsedInternalKey& p) { + rep_.clear(); + AppendInternalKey(&rep_, p); + } + + void Clear() { rep_.clear(); } + + std::string DebugString(bool hex = false) const; +}; + +inline int InternalKeyComparator::Compare( + const InternalKey& a, const InternalKey& b) const { + return Compare(a.Encode(), b.Encode()); +} + +inline bool ParseInternalKey(const Slice& internal_key, + ParsedInternalKey* result) { + const size_t n = internal_key.size(); + if (n < 8) return false; + uint64_t num = DecodeFixed64(internal_key.data() + n - 8); + unsigned char c = num & 0xff; + result->sequence = num >> 8; + result->type = static_cast(c); + assert(result->type <= ValueType::kMaxValue); + result->user_key = Slice(internal_key.data(), n - 8); + return (c <= static_cast(kValueTypeForSeek)); +} + +// Update the sequence number in the internal key +inline void UpdateInternalKey(char* internal_key, + const size_t internal_key_size, + uint64_t seq, ValueType t) { + assert(internal_key_size >= 8); + char* seqtype = internal_key + internal_key_size - 8; + uint64_t newval = (seq << 8) | t; + EncodeFixed64(seqtype, newval); +} + +// Get the sequence number from the internal key +inline uint64_t GetInternalKeySeqno(const Slice& internal_key) { + const size_t n = internal_key.size(); + assert(n >= 8); + uint64_t num = DecodeFixed64(internal_key.data() + n - 8); + return num >> 8; +} + + +// A helper class useful for DBImpl::Get() +class LookupKey { + public: + // Initialize *this for 
looking up user_key at a snapshot with + // the specified sequence number. + LookupKey(const Slice& user_key, SequenceNumber sequence); + + ~LookupKey(); + + // Return a key suitable for lookup in a MemTable. + Slice memtable_key() const { return Slice(start_, end_ - start_); } + + // Return an internal key (suitable for passing to an internal iterator) + Slice internal_key() const { return Slice(kstart_, end_ - kstart_); } + + // Return the user key + Slice user_key() const { return Slice(kstart_, end_ - kstart_ - 8); } + + private: + // We construct a char array of the form: + // klength varint32 <-- start_ + // userkey char[klength] <-- kstart_ + // tag uint64 + // <-- end_ + // The array is a suitable MemTable key. + // The suffix starting with "userkey" can be used as an InternalKey. + const char* start_; + const char* kstart_; + const char* end_; + char space_[200]; // Avoid allocation for short keys + + // No copying allowed + LookupKey(const LookupKey&); + void operator=(const LookupKey&); +}; + +inline LookupKey::~LookupKey() { + if (start_ != space_) delete[] start_; +} + +class IterKey { + public: + IterKey() : key_(space_), buf_size_(sizeof(space_)), key_size_(0) {} + + ~IterKey() { ResetBuffer(); } + + Slice GetKey() const { return Slice(key_, key_size_); } + + void Clear() { key_size_ = 0; } + + void SetKey(const Slice& key) { + size_t size = key.size(); + EnlargeBufferIfNeeded(size); + memcpy(key_, key.data(), size); + key_size_ = size; + } + + void SetInternalKey(const Slice& user_key, SequenceNumber s, + ValueType value_type = kValueTypeForSeek) { + size_t usize = user_key.size(); + EnlargeBufferIfNeeded(usize + sizeof(uint64_t)); + memcpy(key_, user_key.data(), usize); + EncodeFixed64(key_ + usize, PackSequenceAndType(s, value_type)); + key_size_ = usize + sizeof(uint64_t); + } + + void SetInternalKey(const ParsedInternalKey& parsed_key) { + SetInternalKey(parsed_key.user_key, parsed_key.sequence, parsed_key.type); + } + + private: + char* key_; + size_t buf_size_; + size_t key_size_; + char space_[32]; // Avoid allocation for short keys + + void ResetBuffer() { + if (key_ != nullptr && key_ != space_) { + delete[] key_; + } + key_ = space_; + buf_size_ = sizeof(space_); + key_size_ = 0; + } + + // Enlarge the buffer size if needed based on key_size. + // By default, static allocated buffer is used. Once there is a key + // larger than the static allocated buffer, another buffer is dynamically + // allocated, until a larger key buffer is requested. In that case, we + // reallocate buffer and delete the old one. + void EnlargeBufferIfNeeded(size_t key_size) { + // If size is smaller than buffer size, continue using current buffer, + // or the static allocated one, as default + if (key_size > buf_size_) { + // Need to enlarge the buffer. 
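+      // (ResetBuffer() releases any previous heap allocation and falls
+      // back to space_ first, so repeated growth cannot leak memory)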
+ ResetBuffer(); + key_ = new char[key_size]; + buf_size_ = key_size; + } + } + + // No copying allowed + IterKey(const IterKey&) = delete; + void operator=(const IterKey&) = delete; +}; + +class InternalKeySliceTransform : public SliceTransform { + public: + explicit InternalKeySliceTransform(const SliceTransform* transform) + : transform_(transform) {} + + virtual const char* Name() const { return transform_->Name(); } + + virtual Slice Transform(const Slice& src) const { + auto user_key = ExtractUserKey(src); + return transform_->Transform(user_key); + } + + virtual bool InDomain(const Slice& src) const { + auto user_key = ExtractUserKey(src); + return transform_->InDomain(user_key); + } + + virtual bool InRange(const Slice& dst) const { + auto user_key = ExtractUserKey(dst); + return transform_->InRange(user_key); + } + + const SliceTransform* user_prefix_extractor() const { return transform_; } + + private: + // Like comparator, InternalKeySliceTransform will not take care of the + // deletion of transform_ + const SliceTransform* const transform_; +}; + +} // namespace rocksdb diff --git a/db/dbformat_test.cc b/db/dbformat_test.cc new file mode 100644 index 0000000000..b520f3c4ad --- /dev/null +++ b/db/dbformat_test.cc @@ -0,0 +1,117 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/dbformat.h" +#include "util/logging.h" +#include "util/testharness.h" + +namespace rocksdb { + +static std::string IKey(const std::string& user_key, + uint64_t seq, + ValueType vt) { + std::string encoded; + AppendInternalKey(&encoded, ParsedInternalKey(user_key, seq, vt)); + return encoded; +} + +static std::string Shorten(const std::string& s, const std::string& l) { + std::string result = s; + InternalKeyComparator(BytewiseComparator()).FindShortestSeparator(&result, l); + return result; +} + +static std::string ShortSuccessor(const std::string& s) { + std::string result = s; + InternalKeyComparator(BytewiseComparator()).FindShortSuccessor(&result); + return result; +} + +static void TestKey(const std::string& key, + uint64_t seq, + ValueType vt) { + std::string encoded = IKey(key, seq, vt); + + Slice in(encoded); + ParsedInternalKey decoded("", 0, kTypeValue); + + ASSERT_TRUE(ParseInternalKey(in, &decoded)); + ASSERT_EQ(key, decoded.user_key.ToString()); + ASSERT_EQ(seq, decoded.sequence); + ASSERT_EQ(vt, decoded.type); + + ASSERT_TRUE(!ParseInternalKey(Slice("bar"), &decoded)); +} + +class FormatTest { }; + +TEST(FormatTest, InternalKey_EncodeDecode) { + const char* keys[] = { "", "k", "hello", "longggggggggggggggggggggg" }; + const uint64_t seq[] = { + 1, 2, 3, + (1ull << 8) - 1, 1ull << 8, (1ull << 8) + 1, + (1ull << 16) - 1, 1ull << 16, (1ull << 16) + 1, + (1ull << 32) - 1, 1ull << 32, (1ull << 32) + 1 + }; + for (unsigned int k = 0; k < sizeof(keys) / sizeof(keys[0]); k++) { + for (unsigned int s = 0; s < sizeof(seq) / sizeof(seq[0]); s++) { + TestKey(keys[k], seq[s], kTypeValue); + TestKey("hello", 1, kTypeDeletion); + } + } +} + +TEST(FormatTest, InternalKeyShortSeparator) { + // When user keys are same + 
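+  // (no shortening is possible here: the user keys are identical, so the
+  // separator must come back unchanged)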
ASSERT_EQ(IKey("foo", 100, kTypeValue), + Shorten(IKey("foo", 100, kTypeValue), + IKey("foo", 99, kTypeValue))); + ASSERT_EQ(IKey("foo", 100, kTypeValue), + Shorten(IKey("foo", 100, kTypeValue), + IKey("foo", 101, kTypeValue))); + ASSERT_EQ(IKey("foo", 100, kTypeValue), + Shorten(IKey("foo", 100, kTypeValue), + IKey("foo", 100, kTypeValue))); + ASSERT_EQ(IKey("foo", 100, kTypeValue), + Shorten(IKey("foo", 100, kTypeValue), + IKey("foo", 100, kTypeDeletion))); + + // When user keys are misordered + ASSERT_EQ(IKey("foo", 100, kTypeValue), + Shorten(IKey("foo", 100, kTypeValue), + IKey("bar", 99, kTypeValue))); + + // When user keys are different, but correctly ordered + ASSERT_EQ(IKey("g", kMaxSequenceNumber, kValueTypeForSeek), + Shorten(IKey("foo", 100, kTypeValue), + IKey("hello", 200, kTypeValue))); + + // When start user key is prefix of limit user key + ASSERT_EQ(IKey("foo", 100, kTypeValue), + Shorten(IKey("foo", 100, kTypeValue), + IKey("foobar", 200, kTypeValue))); + + // When limit user key is prefix of start user key + ASSERT_EQ(IKey("foobar", 100, kTypeValue), + Shorten(IKey("foobar", 100, kTypeValue), + IKey("foo", 200, kTypeValue))); +} + +TEST(FormatTest, InternalKeyShortestSuccessor) { + ASSERT_EQ(IKey("g", kMaxSequenceNumber, kValueTypeForSeek), + ShortSuccessor(IKey("foo", 100, kTypeValue))); + ASSERT_EQ(IKey("\xff\xff", 100, kTypeValue), + ShortSuccessor(IKey("\xff\xff", 100, kTypeValue))); +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +} diff --git a/db/deletefile_test.cc b/db/deletefile_test.cc new file mode 100644 index 0000000000..14f0324c17 --- /dev/null +++ b/db/deletefile_test.cc @@ -0,0 +1,295 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
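+
+// The tests below classify directory entries with ParseFileName() from
+// db/filename.h. A minimal sketch of that idiom, assuming a hypothetical
+// directory listing `files` (not part of this test file):
+//
+//   uint64_t number;
+//   FileType type;
+//   for (const auto& f : files) {
+//     if (ParseFileName(f, &number, &type) && type == kTableFile) {
+//       // f names an SST file such as "000123.sst"
+//     }
+//   }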
+
+#include "rocksdb/db.h"
+#include "db/db_impl.h"
+#include "db/filename.h"
+#include "db/version_set.h"
+#include "db/write_batch_internal.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+#include "rocksdb/env.h"
+#include "rocksdb/transaction_log.h"
+#include <vector>
+#include <stdlib.h>
+#include <map>
+#include <string>
+
+namespace rocksdb {
+
+class DeleteFileTest {
+ public:
+  std::string dbname_;
+  Options options_;
+  DB* db_;
+  Env* env_;
+  int numlevels_;
+
+  DeleteFileTest() {
+    db_ = nullptr;
+    env_ = Env::Default();
+    options_.write_buffer_size = 1024 * 1024 * 1000;
+    options_.target_file_size_base = 1024 * 1024 * 1000;
+    options_.max_bytes_for_level_base = 1024 * 1024 * 1000;
+    options_.WAL_ttl_seconds = 300;     // Used to test log files
+    options_.WAL_size_limit_MB = 1024;  // Used to test log files
+    dbname_ = test::TmpDir() + "/deletefile_test";
+    options_.wal_dir = dbname_ + "/wal_files";
+
+    // clean up all the files that might have been there before
+    std::vector<std::string> old_files;
+    env_->GetChildren(dbname_, &old_files);
+    for (auto file : old_files) {
+      env_->DeleteFile(dbname_ + "/" + file);
+    }
+    env_->GetChildren(options_.wal_dir, &old_files);
+    for (auto file : old_files) {
+      env_->DeleteFile(options_.wal_dir + "/" + file);
+    }
+
+    DestroyDB(dbname_, options_);
+    numlevels_ = 7;
+    ASSERT_OK(ReopenDB(true));
+  }
+
+  Status ReopenDB(bool create) {
+    delete db_;
+    if (create) {
+      DestroyDB(dbname_, options_);
+    }
+    db_ = nullptr;
+    options_.create_if_missing = create;
+    return DB::Open(options_, dbname_, &db_);
+  }
+
+  void CloseDB() {
+    delete db_;
+  }
+
+  void AddKeys(int numkeys, int startkey = 0) {
+    WriteOptions options;
+    options.sync = false;
+    ReadOptions roptions;
+    for (int i = startkey; i < (numkeys + startkey); i++) {
+      std::string temp = std::to_string(i);
+      Slice key(temp);
+      Slice value(temp);
+      ASSERT_OK(db_->Put(options, key, value));
+    }
+  }
+
+  int numKeysInLevels(
+      std::vector<LiveFileMetaData>& metadata,
+      std::vector<int>* keysperlevel = nullptr) {
+
+    if (keysperlevel != nullptr) {
+      keysperlevel->resize(numlevels_);
+    }
+
+    int numKeys = 0;
+    for (size_t i = 0; i < metadata.size(); i++) {
+      int startkey = atoi(metadata[i].smallestkey.c_str());
+      int endkey = atoi(metadata[i].largestkey.c_str());
+      int numkeysinfile = (endkey - startkey + 1);
+      numKeys += numkeysinfile;
+      if (keysperlevel != nullptr) {
+        (*keysperlevel)[(int)metadata[i].level] += numkeysinfile;
+      }
+      fprintf(stderr, "level %d name %s smallest %s largest %s\n",
+              metadata[i].level, metadata[i].name.c_str(),
+              metadata[i].smallestkey.c_str(),
+              metadata[i].largestkey.c_str());
+    }
+    return numKeys;
+  }
+
+  void CreateTwoLevels() {
+    AddKeys(50000, 10000);
+    DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+    ASSERT_OK(dbi->TEST_FlushMemTable());
+    ASSERT_OK(dbi->TEST_WaitForFlushMemTable());
+
+    AddKeys(50000, 10000);
+    ASSERT_OK(dbi->TEST_FlushMemTable());
+    ASSERT_OK(dbi->TEST_WaitForFlushMemTable());
+  }
+
+  void CheckFileTypeCounts(std::string& dir,
+                           int required_log,
+                           int required_sst,
+                           int required_manifest) {
+    std::vector<std::string> filenames;
+    env_->GetChildren(dir, &filenames);
+
+    int log_cnt = 0, sst_cnt = 0, manifest_cnt = 0;
+    for (auto file : filenames) {
+      uint64_t number;
+      FileType type;
+      if (ParseFileName(file, &number, &type)) {
+        log_cnt += (type == kLogFile);
+        sst_cnt += (type == kTableFile);
+        manifest_cnt += (type == kDescriptorFile);
+      }
+    }
+    ASSERT_EQ(required_log, log_cnt);
+    ASSERT_EQ(required_sst, sst_cnt);
+    ASSERT_EQ(required_manifest, manifest_cnt);
+  }
+};
+
+TEST(DeleteFileTest, AddKeysAndQueryLevels) {
+  CreateTwoLevels();
+  std::vector<LiveFileMetaData> metadata;
+  std::vector<int> keysinlevel;
+  db_->GetLiveFilesMetaData(&metadata);
+
+  std::string level1file = "";
+  int level1keycount = 0;
+  std::string level2file = "";
+  int level2keycount = 0;
+  int level1index = 0;
+  int level2index = 1;
+
+  ASSERT_EQ((int)metadata.size(), 2);
+  if (metadata[0].level == 2) {
+    level1index = 1;
+    level2index = 0;
+  }
+
+  level1file = metadata[level1index].name;
+  int startkey = atoi(metadata[level1index].smallestkey.c_str());
+  int endkey = atoi(metadata[level1index].largestkey.c_str());
+  level1keycount = (endkey - startkey + 1);
+  level2file = metadata[level2index].name;
+  startkey = atoi(metadata[level2index].smallestkey.c_str());
+  endkey = atoi(metadata[level2index].largestkey.c_str());
+  level2keycount = (endkey - startkey + 1);
+
+  // Controlled setup. Levels 1 and 2 should each hold 50K keys.
+  // This is a little fragile as it depends on the current
+  // compaction heuristics.
+  ASSERT_EQ(level1keycount, 50000);
+  ASSERT_EQ(level2keycount, 50000);
+
+  Status status = db_->DeleteFile("0.sst");
+  ASSERT_TRUE(status.IsInvalidArgument());
+
+  // intermediate level files cannot be deleted.
+  status = db_->DeleteFile(level1file);
+  ASSERT_TRUE(status.IsInvalidArgument());
+
+  // Lowest level file deletion should succeed.
+  ASSERT_OK(db_->DeleteFile(level2file));
+
+  CloseDB();
+}
+
+TEST(DeleteFileTest, PurgeObsoleteFilesTest) {
+  CreateTwoLevels();
+  // there should be only one (empty) log file because CreateTwoLevels()
+  // flushes the memtables to disk
+  CheckFileTypeCounts(options_.wal_dir, 1, 0, 0);
+  // 2 ssts, 1 manifest
+  CheckFileTypeCounts(dbname_, 0, 2, 1);
+  std::string first("0"), last("999999");
+  Slice first_slice(first), last_slice(last);
+  db_->CompactRange(&first_slice, &last_slice, true, 2);
+  // 1 sst after compaction
+  CheckFileTypeCounts(dbname_, 0, 1, 1);
+
+  // this time, we keep an iterator alive
+  ReopenDB(true);
+  Iterator* itr = 0;
+  CreateTwoLevels();
+  itr = db_->NewIterator(ReadOptions());
+  db_->CompactRange(&first_slice, &last_slice, true, 2);
+  // 3 sst after compaction with live iterator
+  CheckFileTypeCounts(dbname_, 0, 3, 1);
+  delete itr;
+  // 1 sst after iterator deletion
+  CheckFileTypeCounts(dbname_, 0, 1, 1);
+
+  CloseDB();
+}
+
+TEST(DeleteFileTest, DeleteFileWithIterator) {
+  CreateTwoLevels();
+  ReadOptions options;
+  Iterator* it = db_->NewIterator(options);
+  std::vector<LiveFileMetaData> metadata;
+  db_->GetLiveFilesMetaData(&metadata);
+
+  std::string level2file = "";
+
+  ASSERT_EQ((int)metadata.size(), 2);
+  if (metadata[0].level == 1) {
+    level2file = metadata[1].name;
+  } else {
+    level2file = metadata[0].name;
+  }
+
+  Status status = db_->DeleteFile(level2file);
+  fprintf(stdout, "Deletion status %s: %s\n",
+          level2file.c_str(), status.ToString().c_str());
+  ASSERT_TRUE(status.ok());
+  it->SeekToFirst();
+  int numKeysIterated = 0;
+  while (it->Valid()) {
+    numKeysIterated++;
+    it->Next();
+  }
+  ASSERT_EQ(numKeysIterated, 50000);
+  delete it;
+  CloseDB();
+}
+
+TEST(DeleteFileTest, DeleteLogFiles) {
+  AddKeys(10, 0);
+  VectorLogPtr logfiles;
+  db_->GetSortedWalFiles(logfiles);
+  ASSERT_GT(logfiles.size(), 0UL);
+  // Take the last log file which is expected to be alive and try to delete it
+  // Should not succeed because live logs are not allowed to be deleted
+  std::unique_ptr<LogFile> alive_log = std::move(logfiles.back());
+  ASSERT_EQ(alive_log->Type(), kAliveLogFile);
+  ASSERT_TRUE(env_->FileExists(options_.wal_dir + "/" + alive_log->PathName()));
+  fprintf(stdout, "Deleting alive log file 
%s\n", + alive_log->PathName().c_str()); + ASSERT_TRUE(!db_->DeleteFile(alive_log->PathName()).ok()); + ASSERT_TRUE(env_->FileExists(options_.wal_dir + "/" + alive_log->PathName())); + logfiles.clear(); + + // Call Flush to bring about a new working log file and add more keys + // Call Flush again to flush out memtable and move alive log to archived log + // and try to delete the archived log file + FlushOptions fopts; + db_->Flush(fopts); + AddKeys(10, 0); + db_->Flush(fopts); + db_->GetSortedWalFiles(logfiles); + ASSERT_GT(logfiles.size(), 0UL); + std::unique_ptr archived_log = std::move(logfiles.front()); + ASSERT_EQ(archived_log->Type(), kArchivedLogFile); + ASSERT_TRUE(env_->FileExists(options_.wal_dir + "/" + + archived_log->PathName())); + fprintf(stdout, "Deleting archived log file %s\n", + archived_log->PathName().c_str()); + ASSERT_OK(db_->DeleteFile(archived_log->PathName())); + ASSERT_TRUE(!env_->FileExists(options_.wal_dir + "/" + + archived_log->PathName())); + CloseDB(); +} + +} //namespace rocksdb + +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +} + diff --git a/db/file_indexer.cc b/db/file_indexer.cc new file mode 100644 index 0000000000..2de7660241 --- /dev/null +++ b/db/file_indexer.cc @@ -0,0 +1,202 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/file_indexer.h" +#include +#include "rocksdb/comparator.h" +#include "db/version_edit.h" + +namespace rocksdb { + +FileIndexer::FileIndexer(const uint32_t num_levels, + const Comparator* ucmp) + : num_levels_(num_levels), + ucmp_(ucmp), + next_level_index_(num_levels), + level_rb_(num_levels, -1) { +} + + +uint32_t FileIndexer::NumLevelIndex() { + return next_level_index_.size(); +} + +uint32_t FileIndexer::LevelIndexSize(uint32_t level) { + return next_level_index_[level].size(); +} + +void FileIndexer::GetNextLevelIndex( + const uint32_t level, const uint32_t file_index, const int cmp_smallest, + const int cmp_largest, int32_t* left_bound, int32_t* right_bound) { + assert(level > 0); + + // Last level, no hint + if (level == num_levels_ - 1) { + *left_bound = 0; + *right_bound = -1; + return; + } + + assert(level < num_levels_ - 1); + assert(static_cast(file_index) <= level_rb_[level]); + + const auto& index = next_level_index_[level][file_index]; + + if (cmp_smallest < 0) { + *left_bound = (level > 0 && file_index > 0) ? 
+
+void FileIndexer::ClearIndex() {
+  for (uint32_t level = 1; level < num_levels_; ++level) {
+    next_level_index_[level].clear();
+  }
+}
+
+void FileIndexer::UpdateIndex(std::vector<FileMetaData*>* const files) {
+  if (files == nullptr) {
+    return;
+  }
+
+  // L1 - Ln-1
+  for (uint32_t level = 1; level < num_levels_ - 1; ++level) {
+    const auto& upper_files = files[level];
+    const int32_t upper_size = upper_files.size();
+    const auto& lower_files = files[level + 1];
+    level_rb_[level] = upper_files.size() - 1;
+    if (upper_size == 0) {
+      continue;
+    }
+    auto& index = next_level_index_[level];
+    index.resize(upper_size);
+
+    CalculateLB(upper_files, lower_files, &index,
+        [this](const FileMetaData* a, const FileMetaData* b) -> int {
+          return ucmp_->Compare(a->smallest.user_key(), b->largest.user_key());
+        },
+        [](IndexUnit* index, int32_t f_idx) {
+          index->smallest_lb = f_idx;
+        });
+    CalculateLB(upper_files, lower_files, &index,
+        [this](const FileMetaData* a, const FileMetaData* b) -> int {
+          return ucmp_->Compare(a->largest.user_key(), b->largest.user_key());
+        },
+        [](IndexUnit* index, int32_t f_idx) {
+          index->largest_lb = f_idx;
+        });
+    CalculateRB(upper_files, lower_files, &index,
+        [this](const FileMetaData* a, const FileMetaData* b) -> int {
+          return ucmp_->Compare(a->smallest.user_key(), b->smallest.user_key());
+        },
+        [](IndexUnit* index, int32_t f_idx) {
+          index->smallest_rb = f_idx;
+        });
+    CalculateRB(upper_files, lower_files, &index,
+        [this](const FileMetaData* a, const FileMetaData* b) -> int {
+          return ucmp_->Compare(a->largest.user_key(), b->smallest.user_key());
+        },
+        [](IndexUnit* index, int32_t f_idx) {
+          index->largest_rb = f_idx;
+        });
+  }
+  level_rb_[num_levels_ - 1] = files[num_levels_ - 1].size() - 1;
+}
+
+void FileIndexer::CalculateLB(
+    const std::vector<FileMetaData*>& upper_files,
+    const std::vector<FileMetaData*>& lower_files,
+    std::vector<IndexUnit>* index,
+    std::function<int(const FileMetaData*, const FileMetaData*)> cmp_op,
+    std::function<void(IndexUnit*, int32_t)> set_index) {
+  const int32_t upper_size = upper_files.size();
+  const int32_t lower_size = lower_files.size();
+  int32_t upper_idx = 0;
+  int32_t lower_idx = 0;
+  while (upper_idx < upper_size && lower_idx < lower_size) {
+    int cmp = cmp_op(upper_files[upper_idx], lower_files[lower_idx]);
+
+    if (cmp == 0) {
+      set_index(&(*index)[upper_idx], lower_idx);
+      ++upper_idx;
+      ++lower_idx;
+    } else if (cmp > 0) {
+      // Lower level's file (largest) is smaller, a key won't hit in that
+      // file. Move to next lower file
+      ++lower_idx;
+    } else {
+      // Lower level's file becomes larger, update the index, and
+      // move to the next upper file
+      set_index(&(*index)[upper_idx], lower_idx);
+      ++upper_idx;
+    }
+  }
+
+  while (upper_idx < upper_size) {
+    // Lower files are exhausted, that means the remaining upper files are
+    // greater than any lower files. Set the index to be the lower level size.
+    set_index(&(*index)[upper_idx], lower_size);
+    ++upper_idx;
+  }
+}
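+
+// Editor's illustrative walk-through of CalculateLB (not part of the
+// original commit), with cmp_op comparing upper.smallest against
+// lower.largest, i.e. the pass that fills smallest_lb:
+//   upper = {[100, 200], [300, 400]}, lower = {[150, 250], [350, 450]}
+//   compare(100, 250) < 0 -> smallest_lb of upper file 0 is lower file 0
+//   compare(300, 250) > 0 -> lower file 0 can never match, advance
+//   compare(300, 450) < 0 -> smallest_lb of upper file 1 is lower file 1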
+
+void FileIndexer::CalculateRB(
+    const std::vector<FileMetaData*>& upper_files,
+    const std::vector<FileMetaData*>& lower_files,
+    std::vector<IndexUnit>* index,
+    std::function<int(const FileMetaData*, const FileMetaData*)> cmp_op,
+    std::function<void(IndexUnit*, int32_t)> set_index) {
+  const int32_t upper_size = upper_files.size();
+  const int32_t lower_size = lower_files.size();
+  int32_t upper_idx = upper_size - 1;
+  int32_t lower_idx = lower_size - 1;
+  while (upper_idx >= 0 && lower_idx >= 0) {
+    int cmp = cmp_op(upper_files[upper_idx], lower_files[lower_idx]);
+
+    if (cmp == 0) {
+      set_index(&(*index)[upper_idx], lower_idx);
+      --upper_idx;
+      --lower_idx;
+    } else if (cmp < 0) {
+      // Lower level's file (smallest) is larger, a key won't hit in that
+      // file. Move to next lower file.
+      --lower_idx;
+    } else {
+      // Lower level's file becomes smaller, update the index, and move to
+      // the next upper file
+      set_index(&(*index)[upper_idx], lower_idx);
+      --upper_idx;
+    }
+  }
+  while (upper_idx >= 0) {
+    // Lower files are exhausted, that means the remaining upper files are
+    // smaller than any lower files. Set it to -1.
+    set_index(&(*index)[upper_idx], -1);
+    --upper_idx;
+  }
+}
+
+} // namespace rocksdb
diff --git a/db/file_indexer.h b/db/file_indexer.h
new file mode 100644
index 0000000000..5e405dfe9d
--- /dev/null
+++ b/db/file_indexer.h
@@ -0,0 +1,129 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <cstdint>
+#include <functional>
+#include <limits>
+#include <vector>
+
+namespace rocksdb {
+
+class Comparator;
+struct FileMetaData;
+
+// The file tree structure in Version is prebuilt and the range of each file
+// is known. On Version::Get(), it uses binary search to find a potential file
+// and then checks if a target key can be found in the file by comparing the
+// key to each file's smallest and largest key. The results of these
+// comparisons can be reused beyond checking if a key falls into a file's
+// range.
+// With some pre-calculated knowledge, each key comparison that has been done
+// can serve as a hint to narrow down further searches: if a key compares
+// smaller than a file's smallest or largest key, that comparison can be used
+// to find the right bound of the next binary search. Similarly, if a key
+// compares larger than a file's smallest or largest key, it can be utilized
+// to find the left bound of the next binary search.
+// With these hints, the range of each binary search can be greatly reduced,
+// especially for bottom levels, given that one file most likely overlaps with
+// only N files from the level below (where N is
+// max_bytes_for_level_multiplier). So on level L, we will only look at ~N
+// files instead of N^L files with the naive approach.
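+//
+// Editor's illustrative sketch (not part of the original commit) of how the
+// class below is driven, based only on the declarations in this header:
+//
+//   FileIndexer indexer(num_levels, ucmp);   // ucmp: user-key comparator
+//   indexer.UpdateIndex(files);              // files: per-level FileMetaData*
+//   int32_t left, right;
+//   indexer.GetNextLevelIndex(level, file_index, cmp_smallest, cmp_largest,
+//                             &left, &right);  // bounds for the next level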
+class FileIndexer {
+ public:
+  FileIndexer(const uint32_t num_levels, const Comparator* ucmp);
+
+  uint32_t NumLevelIndex();
+
+  uint32_t LevelIndexSize(uint32_t level);
+
+  // Return a file index range in the next level to search for a key based on
+  // smallest and largest key comparison for the current file specified by
+  // level and file_index. When *left_index < *right_index, both indexes
+  // should be valid and fit in the vector size.
+  void GetNextLevelIndex(
+      const uint32_t level, const uint32_t file_index, const int cmp_smallest,
+      const int cmp_largest, int32_t* left_bound, int32_t* right_bound);
+
+  void ClearIndex();
+
+  void UpdateIndex(std::vector<FileMetaData*>* const files);
+
+  enum {
+    kLevelMaxIndex = std::numeric_limits<int32_t>::max()
+  };
+
+ private:
+  const uint32_t num_levels_;
+  const Comparator* ucmp_;
+
+  struct IndexUnit {
+    IndexUnit()
+        : smallest_lb(0), largest_lb(0), smallest_rb(-1), largest_rb(-1) {}
+    // During file search, a key is compared against smallest and largest
+    // from a FileMetaData. It can have 3 possible outcomes:
+    // (1) key is smaller than smallest, implying it is also smaller than
+    //     largest. Precalculated index based on "smallest < smallest" can
+    //     be used to provide a right bound.
+    // (2) key is in between smallest and largest.
+    //     Precalculated index based on "smallest > greatest" can be used to
+    //     provide a left bound.
+    //     Precalculated index based on "largest < smallest" can be used to
+    //     provide a right bound.
+    // (3) key is larger than largest, implying it is also larger than
+    //     smallest. Precalculated index based on "largest > largest" can be
+    //     used to provide a left bound.
+    //
+    // As a result, we will need to do:
+    // Compare smallest (<=) and largest keys from upper level file with
+    // smallest key from lower level to get a right bound.
+    // Compare smallest (>=) and largest keys from upper level file with
+    // largest key from lower level to get a left bound.
+    //
+    // Example:
+    //    level 1:              [50 - 60]
+    //    level 2:        [1 - 40], [45 - 55], [58 - 80]
+    // A key 35, compared to be less than 50, 3rd file on level 2 can be
+    // skipped according to rule (1). LB = 0, RB = 1.
+    // A key 53, sits in the middle of 50 and 60. 1st file on level 2 can be
+    // skipped according to rule (2)-a, but the 3rd file cannot be skipped
+    // because 60 is greater than 58. LB = 1, RB = 2.
+    // A key 70, compared to be larger than 60. 1st and 2nd file can be
+    // skipped according to rule (3). LB = 2, RB = 2.
+    //
+    // Point to a left most file in a lower level that may contain a key,
+    // which compares greater than smallest of a FileMetaData (upper level)
+    int32_t smallest_lb;
+    // Point to a left most file in a lower level that may contain a key,
+    // which compares greater than largest of a FileMetaData (upper level)
+    int32_t largest_lb;
+    // Point to a right most file in a lower level that may contain a key,
+    // which compares smaller than smallest of a FileMetaData (upper level)
+    int32_t smallest_rb;
+    // Point to a right most file in a lower level that may contain a key,
+    // which compares smaller than largest of a FileMetaData (upper level)
+    int32_t largest_rb;
+  };
+
+  void CalculateLB(
+      const std::vector<FileMetaData*>& upper_files,
+      const std::vector<FileMetaData*>& lower_files,
+      std::vector<IndexUnit>* index,
+      std::function<int(const FileMetaData*, const FileMetaData*)> cmp_op,
+      std::function<void(IndexUnit*, int32_t)> set_index);
+
+  void CalculateRB(
+      const std::vector<FileMetaData*>& upper_files,
+      const std::vector<FileMetaData*>& lower_files,
+      std::vector<IndexUnit>* index,
+      std::function<int(const FileMetaData*, const FileMetaData*)> cmp_op,
+      std::function<void(IndexUnit*, int32_t)> set_index);
+
+  std::vector<std::vector<IndexUnit>> next_level_index_;
+  std::vector<int32_t> level_rb_;
+};
+
+} // namespace rocksdb
diff --git a/db/file_indexer_test.cc b/db/file_indexer_test.cc
new file mode 100644
index 0000000000..14d67f4e83
--- /dev/null
+++ b/db/file_indexer_test.cc
@@ -0,0 +1,330 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <string>
+#include "db/file_indexer.h"
+#include "db/dbformat.h"
+#include "db/version_edit.h"
+#include "rocksdb/comparator.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+
+namespace rocksdb {
+
+class IntComparator : public Comparator {
+ public:
+  int Compare(const Slice& a, const Slice& b) const {
+    assert(a.size() == 8);
+    assert(b.size() == 8);
+    return *reinterpret_cast<const int64_t*>(a.data()) -
+           *reinterpret_cast<const int64_t*>(b.data());
+  }
+
+  const char* Name() const {
+    return "IntComparator";
+  }
+
+  void FindShortestSeparator(std::string* start, const Slice& limit) const {}
+
+  void FindShortSuccessor(std::string* key) const {}
+};
+
+
+struct FileIndexerTest {
+ public:
+  FileIndexerTest() :
+      kNumLevels(4), indexer(kNumLevels, &ucmp),
+      files(new std::vector<FileMetaData*>[kNumLevels]) {
+  }
+
+  ~FileIndexerTest() {
+    Reset();
+    delete[] files;
+  }
+
+  void AddFile(int level, int64_t smallest, int64_t largest) {
+    auto* f = new FileMetaData();
+    f->smallest = IntKey(smallest);
+    f->largest = IntKey(largest);
+    files[level].push_back(f);
+  }
+
+  InternalKey IntKey(int64_t v) {
+    return InternalKey(Slice(reinterpret_cast<char*>(&v), 8), 0, kTypeValue);
+  }
+
+  void Reset() {
+    for (uint32_t i = 0; i < kNumLevels; ++i) {
+      for (auto* f : files[i]) {
+        delete f;
+      }
+      files[i].clear();
+    }
+    indexer.ClearIndex();
+  }
+
+  void GetNextLevelIndex(const uint32_t level, const uint32_t file_index,
+      const int cmp_smallest, const int cmp_largest, int32_t* left_index,
+      int32_t* right_index) {
+    *left_index = 100;
+    *right_index = 100;
+    indexer.GetNextLevelIndex(level, file_index, cmp_smallest, cmp_largest,
+                              left_index, right_index);
+  }
+
+  const uint32_t kNumLevels;
+  IntComparator ucmp;
+  FileIndexer indexer;
+
+  std::vector<FileMetaData*>* files;
+};
+
+TEST(FileIndexerTest, next_level_hint) {
+  for (uint32_t i = 0; i < kNumLevels; ++i) {
+    ASSERT_EQ(0U, indexer.LevelIndexSize(i));
+  }
+
+  // Case 1: no overlap, files are on the left of next level files
+  // level 1
+  AddFile(1, 100, 200);
+  AddFile(1, 300, 400);
+  AddFile(1, 500, 600);
+  // level 2
+  AddFile(2, 1500, 1600);
+  AddFile(2, 1601, 1699);
+  AddFile(2, 1700, 1800);
+  // level 3
+  AddFile(3, 2500, 2600);
+  AddFile(3, 2601, 2699);
+  AddFile(3, 2700, 2800);
+  indexer.UpdateIndex(files);
+  int32_t left = 100;
+  int32_t right = 100;
+  for (uint32_t level = 1; level < 3; ++level) {
+    for (uint32_t f = 0; f < 3; ++f) {
+      GetNextLevelIndex(level, f, -1, -1, &left, &right);
+      ASSERT_EQ(0, left);
+      ASSERT_EQ(-1, right);
+      GetNextLevelIndex(level, f, 0, -1, &left, &right);
+      ASSERT_EQ(0, left);
+      ASSERT_EQ(-1, right);
+      GetNextLevelIndex(level, f, 1, -1, &left, &right);
+      ASSERT_EQ(0, left);
+      ASSERT_EQ(-1, right);
+      GetNextLevelIndex(level, f, 1, 0, &left, &right);
+      ASSERT_EQ(0, left);
+      ASSERT_EQ(-1, right);
+      GetNextLevelIndex(level, f, 1, 1, &left, &right);
+      ASSERT_EQ(0, left);
+      ASSERT_EQ(2, right);
+    }
+  }
+
+  // Case 2: no overlap, files are on the right of next level files
+  Reset();
+  for (uint32_t i = 1; i < kNumLevels; ++i) {
+    ASSERT_EQ(0U, indexer.LevelIndexSize(i));
+  }
+  // level 1
+  AddFile(1, 2100, 2200);
+
AddFile(1, 2300, 2400); + AddFile(1, 2500, 2600); + // level 2 + AddFile(2, 1500, 1600); + AddFile(2, 1501, 1699); + AddFile(2, 1700, 1800); + // level 3 + AddFile(3, 500, 600); + AddFile(3, 501, 699); + AddFile(3, 700, 800); + indexer.UpdateIndex(files); + for (uint32_t level = 1; level < 3; ++level) { + for (uint32_t f = 0; f < 3; ++f) { + GetNextLevelIndex(level, f, -1, -1, &left, &right); + ASSERT_EQ(f == 0 ? 0 : 3, left); + ASSERT_EQ(2, right); + GetNextLevelIndex(level, f, 0, -1, &left, &right); + ASSERT_EQ(3, left); + ASSERT_EQ(2, right); + GetNextLevelIndex(level, f, 1, -1, &left, &right); + ASSERT_EQ(3, left); + ASSERT_EQ(2, right); + GetNextLevelIndex(level, f, 1, -1, &left, &right); + ASSERT_EQ(3, left); + ASSERT_EQ(2, right); + GetNextLevelIndex(level, f, 1, 0, &left, &right); + ASSERT_EQ(3, left); + ASSERT_EQ(2, right); + GetNextLevelIndex(level, f, 1, 1, &left, &right); + ASSERT_EQ(3, left); + ASSERT_EQ(2, right); + } + } + + // Case 3: empty L2 + Reset(); + for (uint32_t i = 1; i < kNumLevels; ++i) { + ASSERT_EQ(0U, indexer.LevelIndexSize(i)); + } + // level 1 + AddFile(1, 2100, 2200); + AddFile(1, 2300, 2400); + AddFile(1, 2500, 2600); + // level 3 + AddFile(3, 500, 600); + AddFile(3, 501, 699); + AddFile(3, 700, 800); + indexer.UpdateIndex(files); + for (uint32_t f = 0; f < 3; ++f) { + GetNextLevelIndex(1, f, -1, -1, &left, &right); + ASSERT_EQ(0, left); + ASSERT_EQ(-1, right); + GetNextLevelIndex(1, f, 0, -1, &left, &right); + ASSERT_EQ(0, left); + ASSERT_EQ(-1, right); + GetNextLevelIndex(1, f, 1, -1, &left, &right); + ASSERT_EQ(0, left); + ASSERT_EQ(-1, right); + GetNextLevelIndex(1, f, 1, -1, &left, &right); + ASSERT_EQ(0, left); + ASSERT_EQ(-1, right); + GetNextLevelIndex(1, f, 1, 0, &left, &right); + ASSERT_EQ(0, left); + ASSERT_EQ(-1, right); + GetNextLevelIndex(1, f, 1, 1, &left, &right); + ASSERT_EQ(0, left); + ASSERT_EQ(-1, right); + } + + + // Case 4: mixed + Reset(); + for (uint32_t i = 1; i < kNumLevels; ++i) { + ASSERT_EQ(0U, indexer.LevelIndexSize(i)); + } + // level 1 + AddFile(1, 100, 200); + AddFile(1, 250, 400); + AddFile(1, 450, 500); + // level 2 + AddFile(2, 100, 150); // 0 + AddFile(2, 200, 250); // 1 + AddFile(2, 251, 300); // 2 + AddFile(2, 301, 350); // 3 + AddFile(2, 500, 600); // 4 + // level 3 + AddFile(3, 0, 50); + AddFile(3, 100, 200); + AddFile(3, 201, 250); + indexer.UpdateIndex(files); + // level 1, 0 + GetNextLevelIndex(1, 0, -1, -1, &left, &right); + ASSERT_EQ(0, left); + ASSERT_EQ(0, right); + GetNextLevelIndex(1, 0, 0, -1, &left, &right); + ASSERT_EQ(0, left); + ASSERT_EQ(0, right); + GetNextLevelIndex(1, 0, 1, -1, &left, &right); + ASSERT_EQ(0, left); + ASSERT_EQ(1, right); + GetNextLevelIndex(1, 0, 1, 0, &left, &right); + ASSERT_EQ(1, left); + ASSERT_EQ(1, right); + GetNextLevelIndex(1, 0, 1, 1, &left, &right); + ASSERT_EQ(1, left); + ASSERT_EQ(4, right); + // level 1, 1 + GetNextLevelIndex(1, 1, -1, -1, &left, &right); + ASSERT_EQ(1, left); + ASSERT_EQ(1, right); + GetNextLevelIndex(1, 1, 0, -1, &left, &right); + ASSERT_EQ(1, left); + ASSERT_EQ(1, right); + GetNextLevelIndex(1, 1, 1, -1, &left, &right); + ASSERT_EQ(1, left); + ASSERT_EQ(3, right); + GetNextLevelIndex(1, 1, 1, 0, &left, &right); + ASSERT_EQ(4, left); + ASSERT_EQ(3, right); + GetNextLevelIndex(1, 1, 1, 1, &left, &right); + ASSERT_EQ(4, left); + ASSERT_EQ(4, right); + // level 1, 2 + GetNextLevelIndex(1, 2, -1, -1, &left, &right); + ASSERT_EQ(4, left); + ASSERT_EQ(3, right); + GetNextLevelIndex(1, 2, 0, -1, &left, &right); + ASSERT_EQ(4, left); + ASSERT_EQ(3, right); + 
GetNextLevelIndex(1, 2, 1, -1, &left, &right);
+  ASSERT_EQ(4, left);
+  ASSERT_EQ(4, right);
+  GetNextLevelIndex(1, 2, 1, 0, &left, &right);
+  ASSERT_EQ(4, left);
+  ASSERT_EQ(4, right);
+  GetNextLevelIndex(1, 2, 1, 1, &left, &right);
+  ASSERT_EQ(4, left);
+  ASSERT_EQ(4, right);
+  // level 2, 0
+  GetNextLevelIndex(2, 0, -1, -1, &left, &right);
+  ASSERT_EQ(0, left);
+  ASSERT_EQ(1, right);
+  GetNextLevelIndex(2, 0, 0, -1, &left, &right);
+  ASSERT_EQ(1, left);
+  ASSERT_EQ(1, right);
+  GetNextLevelIndex(2, 0, 1, -1, &left, &right);
+  ASSERT_EQ(1, left);
+  ASSERT_EQ(1, right);
+  GetNextLevelIndex(2, 0, 1, 0, &left, &right);
+  ASSERT_EQ(1, left);
+  ASSERT_EQ(1, right);
+  GetNextLevelIndex(2, 0, 1, 1, &left, &right);
+  ASSERT_EQ(1, left);
+  ASSERT_EQ(2, right);
+  // level 2, 1
+  GetNextLevelIndex(2, 1, -1, -1, &left, &right);
+  ASSERT_EQ(1, left);
+  ASSERT_EQ(1, right);
+  GetNextLevelIndex(2, 1, 0, -1, &left, &right);
+  ASSERT_EQ(1, left);
+  ASSERT_EQ(1, right);
+  GetNextLevelIndex(2, 1, 1, -1, &left, &right);
+  ASSERT_EQ(1, left);
+  ASSERT_EQ(2, right);
+  GetNextLevelIndex(2, 1, 1, 0, &left, &right);
+  ASSERT_EQ(2, left);
+  ASSERT_EQ(2, right);
+  GetNextLevelIndex(2, 1, 1, 1, &left, &right);
+  ASSERT_EQ(2, left);
+  ASSERT_EQ(2, right);
+  // level 2, [2 - 4], no overlap
+  for (uint32_t f = 2; f <= 4; ++f) {
+    GetNextLevelIndex(2, f, -1, -1, &left, &right);
+    ASSERT_EQ(f == 2 ? 2 : 3, left);
+    ASSERT_EQ(2, right);
+    GetNextLevelIndex(2, f, 0, -1, &left, &right);
+    ASSERT_EQ(3, left);
+    ASSERT_EQ(2, right);
+    GetNextLevelIndex(2, f, 1, -1, &left, &right);
+    ASSERT_EQ(3, left);
+    ASSERT_EQ(2, right);
+    GetNextLevelIndex(2, f, 1, 0, &left, &right);
+    ASSERT_EQ(3, left);
+    ASSERT_EQ(2, right);
+    GetNextLevelIndex(2, f, 1, 1, &left, &right);
+    ASSERT_EQ(3, left);
+    ASSERT_EQ(2, right);
+  }
+}
+
+} // namespace rocksdb
+
+int main(int argc, char** argv) {
+  return rocksdb::test::RunAllTests();
+}
diff --git a/db/filename.cc b/db/filename.cc
new file mode 100644
index 0000000000..d19f0fd536
--- /dev/null
+++ b/db/filename.cc
@@ -0,0 +1,266 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/filename.h"
+
+#include <ctype.h>
+#include <stdio.h>
+#include "db/dbformat.h"
+#include "rocksdb/env.h"
+#include "util/logging.h"
+
+namespace rocksdb {
+
+// Given a path, flatten the path name by replacing all chars not in
+// {[0-9,a-z,A-Z,-,_,.]} with _, e.g. "/tmp/db1" becomes "tmp_db1". And
+// append '\0' at the end.
+// Return the number of chars stored in dest not including the trailing '\0'.
+static int FlattenPath(const std::string& path, char* dest, int len) {
+  int write_idx = 0;
+  int i = 0;
+  int src_len = path.size();
+
+  while (i < src_len && write_idx < len - 1) {
+    if ((path[i] >= 'a' && path[i] <= 'z') ||
+        (path[i] >= '0' && path[i] <= '9') ||
+        (path[i] >= 'A' && path[i] <= 'Z') ||
+        path[i] == '-' ||
+        path[i] == '.' ||
+        path[i] == '_') {
+      dest[write_idx++] = path[i];
+    } else {
+      if (i > 0)
+        dest[write_idx++] = '_';
+    }
+    i++;
+  }
+
+  dest[write_idx] = '\0';
+  return write_idx;
+}
+
+static std::string MakeFileName(const std::string& name, uint64_t number,
+                                const char* suffix) {
+  char buf[100];
+  snprintf(buf, sizeof(buf), "/%06llu.%s",
+           static_cast<unsigned long long>(number),
+           suffix);
+  return name + buf;
+}
+
+std::string LogFileName(const std::string& name, uint64_t number) {
+  assert(number > 0);
+  return MakeFileName(name, number, "log");
+}
+
+std::string ArchivalDirectory(const std::string& dir) {
+  return dir + "/" + ARCHIVAL_DIR;
+}
+std::string ArchivedLogFileName(const std::string& name, uint64_t number) {
+  assert(number > 0);
+  return MakeFileName(name + "/" + ARCHIVAL_DIR, number, "log");
+}
+
+std::string TableFileName(const std::string& name, uint64_t number) {
+  assert(number > 0);
+  return MakeFileName(name, number, "sst");
+}
+
+std::string DescriptorFileName(const std::string& dbname, uint64_t number) {
+  assert(number > 0);
+  char buf[100];
+  snprintf(buf, sizeof(buf), "/MANIFEST-%06llu",
+           static_cast<unsigned long long>(number));
+  return dbname + buf;
+}
+
+std::string CurrentFileName(const std::string& dbname) {
+  return dbname + "/CURRENT";
+}
+
+std::string LockFileName(const std::string& dbname) {
+  return dbname + "/LOCK";
+}
+
+std::string TempFileName(const std::string& dbname, uint64_t number) {
+  return MakeFileName(dbname, number, "dbtmp");
+}
+
+std::string InfoLogFileName(const std::string& dbname,
+    const std::string& db_path, const std::string& log_dir) {
+  if (log_dir.empty())
+    return dbname + "/LOG";
+
+  char flatten_db_path[256];
+  FlattenPath(db_path, flatten_db_path, 256);
+  return log_dir + "/" + flatten_db_path + "_LOG";
+}
+
+// Return the name of the old info log file for "dbname".
+std::string OldInfoLogFileName(const std::string& dbname, uint64_t ts,
+    const std::string& db_path, const std::string& log_dir) {
+  char buf[50];
+  snprintf(buf, sizeof(buf), "%llu", static_cast<unsigned long long>(ts));
+
+  if (log_dir.empty())
+    return dbname + "/LOG.old." + buf;
+
+  char flatten_db_path[256];
+  FlattenPath(db_path, flatten_db_path, 256);
+  return log_dir + "/" + flatten_db_path + "_LOG.old." + buf;
+}
+
+std::string MetaDatabaseName(const std::string& dbname, uint64_t number) {
+  char buf[100];
+  snprintf(buf, sizeof(buf), "/METADB-%llu",
+           static_cast<unsigned long long>(number));
+  return dbname + buf;
+}
+
+std::string IdentityFileName(const std::string& dbname) {
+  return dbname + "/IDENTITY";
+}
+
+// Owned filenames have the form:
+//    dbname/IDENTITY
+//    dbname/CURRENT
+//    dbname/LOCK
+//    dbname/LOG
+//    dbname/LOG.old.[0-9]+
+//    dbname/MANIFEST-[0-9]+
+//    dbname/[0-9]+.(log|sst)
+//    dbname/METADB-[0-9]+
+// Disregards / at the beginning
+bool ParseFileName(const std::string& fname,
+                   uint64_t* number,
+                   FileType* type,
+                   WalFileType* log_type) {
+  Slice rest(fname);
+  if (fname.length() > 1 && fname[0] == '/') {
+    rest.remove_prefix(1);
+  }
+  if (rest == "IDENTITY") {
+    *number = 0;
+    *type = kIdentityFile;
+  } else if (rest == "CURRENT") {
+    *number = 0;
+    *type = kCurrentFile;
+  } else if (rest == "LOCK") {
+    *number = 0;
+    *type = kDBLockFile;
+  } else if (rest == "LOG" || rest == "LOG.old") {
+    *number = 0;
+    *type = kInfoLogFile;
+  } else if (rest.starts_with("LOG.old.")) {
+    uint64_t ts_suffix;
+    // sizeof also counts the trailing '\0'.
+ rest.remove_prefix(sizeof("LOG.old.") - 1); + if (!ConsumeDecimalNumber(&rest, &ts_suffix)) { + return false; + } + *number = ts_suffix; + *type = kInfoLogFile; + } else if (rest.starts_with("MANIFEST-")) { + rest.remove_prefix(strlen("MANIFEST-")); + uint64_t num; + if (!ConsumeDecimalNumber(&rest, &num)) { + return false; + } + if (!rest.empty()) { + return false; + } + *type = kDescriptorFile; + *number = num; + } else if (rest.starts_with("METADB-")) { + rest.remove_prefix(strlen("METADB-")); + uint64_t num; + if (!ConsumeDecimalNumber(&rest, &num)) { + return false; + } + if (!rest.empty()) { + return false; + } + *type = kMetaDatabase; + *number = num; + } else { + // Avoid strtoull() to keep filename format independent of the + // current locale + bool archive_dir_found = false; + if (rest.starts_with(ARCHIVAL_DIR)) { + if (rest.size() <= ARCHIVAL_DIR.size()) { + return false; + } + rest.remove_prefix(ARCHIVAL_DIR.size() + 1); // Add 1 to remove / also + if (log_type) { + *log_type = kArchivedLogFile; + } + archive_dir_found = true; + } + uint64_t num; + if (!ConsumeDecimalNumber(&rest, &num)) { + return false; + } + Slice suffix = rest; + if (suffix == Slice(".log")) { + *type = kLogFile; + if (log_type && !archive_dir_found) { + *log_type = kAliveLogFile; + } + } else if (archive_dir_found) { + return false; // Archive dir can contain only log files + } else if (suffix == Slice(".sst")) { + *type = kTableFile; + } else if (suffix == Slice(".dbtmp")) { + *type = kTempFile; + } else { + return false; + } + *number = num; + } + return true; +} + +Status SetCurrentFile(Env* env, const std::string& dbname, + uint64_t descriptor_number, + Directory* directory_to_fsync) { + // Remove leading "dbname/" and add newline to manifest file name + std::string manifest = DescriptorFileName(dbname, descriptor_number); + Slice contents = manifest; + assert(contents.starts_with(dbname + "/")); + contents.remove_prefix(dbname.size() + 1); + std::string tmp = TempFileName(dbname, descriptor_number); + Status s = WriteStringToFile(env, contents.ToString() + "\n", tmp, true); + if (s.ok()) { + s = env->RenameFile(tmp, CurrentFileName(dbname)); + } + if (s.ok()) { + if (directory_to_fsync != nullptr) { + directory_to_fsync->Fsync(); + } + } else { + env->DeleteFile(tmp); + } + return s; +} + +Status SetIdentityFile(Env* env, const std::string& dbname) { + std::string id = env->GenerateUniqueId(); + assert(!id.empty()); + // Reserve the filename dbname/000000.dbtmp for the temporary identity file + std::string tmp = TempFileName(dbname, 0); + Status s = WriteStringToFile(env, id, tmp, true); + if (s.ok()) { + s = env->RenameFile(tmp, IdentityFileName(dbname)); + } + if (!s.ok()) { + env->DeleteFile(tmp); + } + return s; +} + +} // namespace rocksdb diff --git a/db/filename.h b/db/filename.h new file mode 100644 index 0000000000..c4c306946b --- /dev/null +++ b/db/filename.h @@ -0,0 +1,110 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+// +// File names used by DB code + +#pragma once +#include +#include +#include "rocksdb/slice.h" +#include "rocksdb/status.h" +#include "rocksdb/transaction_log.h" +#include "port/port.h" + +namespace rocksdb { + +class Env; +class Directory; + +enum FileType { + kLogFile, + kDBLockFile, + kTableFile, + kDescriptorFile, + kCurrentFile, + kTempFile, + kInfoLogFile, // Either the current one, or an old one + kMetaDatabase, + kIdentityFile +}; + +// Return the name of the log file with the specified number +// in the db named by "dbname". The result will be prefixed with +// "dbname". +extern std::string LogFileName(const std::string& dbname, uint64_t number); + +static const std::string ARCHIVAL_DIR = "archive"; + +extern std::string ArchivalDirectory(const std::string& dbname); + +// Return the name of the archived log file with the specified number +// in the db named by "dbname". The result will be prefixed with "dbname". +extern std::string ArchivedLogFileName(const std::string& dbname, + uint64_t num); + +// Return the name of the sstable with the specified number +// in the db named by "dbname". The result will be prefixed with +// "dbname". +extern std::string TableFileName(const std::string& dbname, uint64_t number); + +// Return the name of the descriptor file for the db named by +// "dbname" and the specified incarnation number. The result will be +// prefixed with "dbname". +extern std::string DescriptorFileName(const std::string& dbname, + uint64_t number); + +// Return the name of the current file. This file contains the name +// of the current manifest file. The result will be prefixed with +// "dbname". +extern std::string CurrentFileName(const std::string& dbname); + +// Return the name of the lock file for the db named by +// "dbname". The result will be prefixed with "dbname". +extern std::string LockFileName(const std::string& dbname); + +// Return the name of a temporary file owned by the db named "dbname". +// The result will be prefixed with "dbname". +extern std::string TempFileName(const std::string& dbname, uint64_t number); + +// Return the name of the info log file for "dbname". +extern std::string InfoLogFileName(const std::string& dbname, + const std::string& db_path="", const std::string& log_dir=""); + +// Return the name of the old info log file for "dbname". +extern std::string OldInfoLogFileName(const std::string& dbname, uint64_t ts, + const std::string& db_path="", const std::string& log_dir=""); + +// Return the name to use for a metadatabase. The result will be prefixed with +// "dbname". +extern std::string MetaDatabaseName(const std::string& dbname, + uint64_t number); + +// Return the name of the Identity file which stores a unique number for the db +// that will get regenerated if the db loses all its data and is recreated fresh +// either from a backup-image or empty +extern std::string IdentityFileName(const std::string& dbname); + +// If filename is a rocksdb file, store the type of the file in *type. +// The number encoded in the filename is stored in *number. If the +// filename was successfully parsed, returns true. Else return false. +extern bool ParseFileName(const std::string& filename, + uint64_t* number, + FileType* type, + WalFileType* log_type = nullptr); + +// Make the CURRENT file point to the descriptor file with the +// specified number. 
+extern Status SetCurrentFile(Env* env, const std::string& dbname, + uint64_t descriptor_number, + Directory* directory_to_fsync); + +// Make the IDENTITY file for the db +extern Status SetIdentityFile(Env* env, const std::string& dbname); + +} // namespace rocksdb diff --git a/db/filename_test.cc b/db/filename_test.cc new file mode 100644 index 0000000000..0baa7fdae1 --- /dev/null +++ b/db/filename_test.cc @@ -0,0 +1,140 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/filename.h" + +#include "db/dbformat.h" +#include "port/port.h" +#include "util/logging.h" +#include "util/testharness.h" + +namespace rocksdb { + +class FileNameTest { }; + +TEST(FileNameTest, Parse) { + Slice db; + FileType type; + uint64_t number; + + // Successful parses + static struct { + const char* fname; + uint64_t number; + FileType type; + } cases[] = { + { "100.log", 100, kLogFile }, + { "0.log", 0, kLogFile }, + { "0.sst", 0, kTableFile }, + { "CURRENT", 0, kCurrentFile }, + { "LOCK", 0, kDBLockFile }, + { "MANIFEST-2", 2, kDescriptorFile }, + { "MANIFEST-7", 7, kDescriptorFile }, + { "METADB-2", 2, kMetaDatabase }, + { "METADB-7", 7, kMetaDatabase }, + { "LOG", 0, kInfoLogFile }, + { "LOG.old", 0, kInfoLogFile }, + { "18446744073709551615.log", 18446744073709551615ull, kLogFile }, + }; + for (unsigned int i = 0; i < sizeof(cases) / sizeof(cases[0]); i++) { + std::string f = cases[i].fname; + ASSERT_TRUE(ParseFileName(f, &number, &type)) << f; + ASSERT_EQ(cases[i].type, type) << f; + ASSERT_EQ(cases[i].number, number) << f; + } + + // Errors + static const char* errors[] = { + "", + "foo", + "foo-dx-100.log", + ".log", + "", + "manifest", + "CURREN", + "CURRENTX", + "MANIFES", + "MANIFEST", + "MANIFEST-", + "XMANIFEST-3", + "MANIFEST-3x", + "META", + "METADB", + "METADB-", + "XMETADB-3", + "METADB-3x", + "LOC", + "LOCKx", + "LO", + "LOGx", + "18446744073709551616.log", + "184467440737095516150.log", + "100", + "100.", + "100.lop" + }; + for (unsigned int i = 0; i < sizeof(errors) / sizeof(errors[0]); i++) { + std::string f = errors[i]; + ASSERT_TRUE(!ParseFileName(f, &number, &type)) << f; + }; +} + +TEST(FileNameTest, Construction) { + uint64_t number; + FileType type; + std::string fname; + + fname = CurrentFileName("foo"); + ASSERT_EQ("foo/", std::string(fname.data(), 4)); + ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type)); + ASSERT_EQ(0U, number); + ASSERT_EQ(kCurrentFile, type); + + fname = LockFileName("foo"); + ASSERT_EQ("foo/", std::string(fname.data(), 4)); + ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type)); + ASSERT_EQ(0U, number); + ASSERT_EQ(kDBLockFile, type); + + fname = LogFileName("foo", 192); + ASSERT_EQ("foo/", std::string(fname.data(), 4)); + ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type)); + ASSERT_EQ(192U, number); + ASSERT_EQ(kLogFile, type); + + fname = TableFileName("bar", 200); + ASSERT_EQ("bar/", std::string(fname.data(), 4)); + ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type)); + ASSERT_EQ(200U, number); + ASSERT_EQ(kTableFile, type); + + fname = 
DescriptorFileName("bar", 100); + ASSERT_EQ("bar/", std::string(fname.data(), 4)); + ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type)); + ASSERT_EQ(100U, number); + ASSERT_EQ(kDescriptorFile, type); + + fname = TempFileName("tmp", 999); + ASSERT_EQ("tmp/", std::string(fname.data(), 4)); + ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type)); + ASSERT_EQ(999U, number); + ASSERT_EQ(kTempFile, type); + + fname = MetaDatabaseName("met", 100); + ASSERT_EQ("met/", std::string(fname.data(), 4)); + ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type)); + ASSERT_EQ(100U, number); + ASSERT_EQ(kMetaDatabase, type); +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +} diff --git a/db/forward_iterator.cc b/db/forward_iterator.cc new file mode 100644 index 0000000000..35a31ddc56 --- /dev/null +++ b/db/forward_iterator.cc @@ -0,0 +1,383 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#ifndef ROCKSDB_LITE +#include "db/forward_iterator.h" + +#include +#include +#include +#include "db/db_impl.h" +#include "db/db_iter.h" +#include "db/column_family.h" +#include "rocksdb/env.h" +#include "rocksdb/slice.h" +#include "rocksdb/slice_transform.h" +#include "table/merger.h" +#include "db/dbformat.h" + +namespace rocksdb { + +// Usage: +// LevelIterator iter; +// iter.SetFileIndex(file_index); +// iter.Seek(target); +// iter.Next() +class LevelIterator : public Iterator { + public: + LevelIterator(const ColumnFamilyData* const cfd, + const ReadOptions& read_options, + const std::vector& files) + : cfd_(cfd), read_options_(read_options), files_(files), valid_(false), + file_index_(std::numeric_limits::max()) {} + + void SetFileIndex(uint32_t file_index) { + assert(file_index < files_.size()); + if (file_index != file_index_) { + file_index_ = file_index; + file_iter_.reset(cfd_->table_cache()->NewIterator( + read_options_, *(cfd_->soptions()), cfd_->internal_comparator(), + *(files_[file_index_]), nullptr /* table_reader_ptr */, false)); + } + valid_ = false; + } + void SeekToLast() override { + status_ = Status::NotSupported("LevelIterator::SeekToLast()"); + valid_ = false; + } + void Prev() { + status_ = Status::NotSupported("LevelIterator::Prev()"); + valid_ = false; + } + bool Valid() const override { + return valid_; + } + void SeekToFirst() override { + SetFileIndex(0); + file_iter_->SeekToFirst(); + valid_ = file_iter_->Valid(); + } + void Seek(const Slice& internal_key) override { + assert(file_iter_ != nullptr); + file_iter_->Seek(internal_key); + valid_ = file_iter_->Valid(); + assert(valid_); + } + void Next() override { + assert(valid_); + file_iter_->Next(); + while (!file_iter_->Valid()) { + if (file_index_ + 1 >= files_.size()) { + valid_ = false; + return; + } + SetFileIndex(file_index_ + 1); + file_iter_->SeekToFirst(); + } + valid_ = file_iter_->Valid(); + } + Slice key() const override { + assert(valid_); + return file_iter_->key(); + } + Slice value() const override { + assert(valid_); + return file_iter_->value(); + } + Status status() const override { + return status_; + } + + private: + const ColumnFamilyData* const cfd_; + const ReadOptions& read_options_; + const std::vector& files_; + + bool valid_; + uint32_t file_index_; + Status status_; + std::unique_ptr 
+};
+
+ForwardIterator::ForwardIterator(DBImpl* db, const ReadOptions& read_options,
+                                 ColumnFamilyData* cfd)
+    : db_(db),
+      read_options_(read_options),
+      cfd_(cfd),
+      prefix_extractor_(cfd->options()->prefix_extractor.get()),
+      user_comparator_(cfd->user_comparator()),
+      immutable_min_heap_(MinIterComparator(&cfd_->internal_comparator())),
+      sv_(nullptr),
+      mutable_iter_(nullptr),
+      current_(nullptr),
+      valid_(false),
+      is_prev_set_(false) {}
+
+ForwardIterator::~ForwardIterator() {
+  Cleanup();
+}
+
+void ForwardIterator::Cleanup() {
+  delete mutable_iter_;
+  for (auto* m : imm_iters_) {
+    delete m;
+  }
+  imm_iters_.clear();
+  for (auto* f : l0_iters_) {
+    delete f;
+  }
+  l0_iters_.clear();
+  for (auto* l : level_iters_) {
+    delete l;
+  }
+  level_iters_.clear();
+
+  if (sv_ != nullptr && sv_->Unref()) {
+    DBImpl::DeletionState deletion_state;
+    db_->mutex_.Lock();
+    sv_->Cleanup();
+    db_->FindObsoleteFiles(deletion_state, false, true);
+    db_->mutex_.Unlock();
+    delete sv_;
+    if (deletion_state.HaveSomethingToDelete()) {
+      db_->PurgeObsoleteFiles(deletion_state);
+    }
+  }
+}
+
+bool ForwardIterator::Valid() const {
+  return valid_;
+}
+
+void ForwardIterator::SeekToFirst() {
+  if (sv_ == nullptr ||
+      sv_->version_number != cfd_->GetSuperVersionNumber()) {
+    RebuildIterators();
+  }
+  SeekInternal(Slice(), true);
+}
+
+void ForwardIterator::Seek(const Slice& internal_key) {
+  if (sv_ == nullptr ||
+      sv_->version_number != cfd_->GetSuperVersionNumber()) {
+    RebuildIterators();
+  }
+  SeekInternal(internal_key, false);
+}
+
+void ForwardIterator::SeekInternal(const Slice& internal_key,
+                                   bool seek_to_first) {
+  // mutable
+  seek_to_first ? mutable_iter_->SeekToFirst() :
+                  mutable_iter_->Seek(internal_key);
+
+  // immutable
+  // TODO(ljin): NeedToSeekImmutable has negative impact on performance
+  // if it turns to need to seek immutable often. We probably want to have
+  // an option to turn it off.
+  if (seek_to_first || NeedToSeekImmutable(internal_key)) {
+    {
+      auto tmp = MinIterHeap(MinIterComparator(&cfd_->internal_comparator()));
+      immutable_min_heap_.swap(tmp);
+    }
+    for (auto* m : imm_iters_) {
+      seek_to_first ? m->SeekToFirst() : m->Seek(internal_key);
+      if (m->Valid()) {
+        immutable_min_heap_.push(m);
+      }
+    }
+
+    auto* files = sv_->current->files_;
+    for (uint32_t i = 0; i < files[0].size(); ++i) {
+      if (seek_to_first) {
+        l0_iters_[i]->SeekToFirst();
+      } else {
+        // If the target key passes over the largest key, we are sure Next()
+        // won't go over this file.
+        if (user_comparator_->Compare(ExtractUserKey(internal_key),
+              files[0][i]->largest.user_key()) > 0) {
+          continue;
+        }
+        l0_iters_[i]->Seek(internal_key);
+      }
+      if (l0_iters_[i]->Valid()) {
+        immutable_min_heap_.push(l0_iters_[i]);
+      }
+    }
+    for (int32_t level = 1; level < sv_->current->NumberLevels(); ++level) {
+      if (files[level].empty()) {
+        continue;
+      }
+      assert(level_iters_[level - 1] != nullptr);
+      uint32_t f_idx = 0;
+      if (!seek_to_first) {
+        f_idx = FindFileInRange(
+            files[level], internal_key, 0, files[level].size());
+      }
+      if (f_idx < files[level].size()) {
+        level_iters_[level - 1]->SetFileIndex(f_idx);
+        seek_to_first ?
+            level_iters_[level - 1]->SeekToFirst() :
+            level_iters_[level - 1]->Seek(internal_key);
+        if (level_iters_[level - 1]->Valid()) {
+          immutable_min_heap_.push(level_iters_[level - 1]);
+        }
+      }
+    }
+
+    if (seek_to_first || immutable_min_heap_.empty()) {
+      is_prev_set_ = false;
+    } else {
+      prev_key_.SetKey(internal_key);
+      is_prev_set_ = true;
+    }
+  }
+
+  UpdateCurrent();
+}
+
+void ForwardIterator::Next() {
+  assert(valid_);
+
+  if (sv_ == nullptr ||
+      sv_->version_number != cfd_->GetSuperVersionNumber()) {
+    std::string current_key = key().ToString();
+    Slice old_key(current_key.data(), current_key.size());
+
+    RebuildIterators();
+    SeekInternal(old_key, false);
+    if (!valid_ || key().compare(old_key) != 0) {
+      return;
+    }
+  } else if (current_ != mutable_iter_) {
+    // It is going to advance immutable iterator
+    prev_key_.SetKey(current_->key());
+    is_prev_set_ = true;
+  }
+
+  current_->Next();
+  if (current_->Valid() && current_ != mutable_iter_) {
+    immutable_min_heap_.push(current_);
+  }
+  UpdateCurrent();
+}
+
+Slice ForwardIterator::key() const {
+  assert(valid_);
+  return current_->key();
+}
+
+Slice ForwardIterator::value() const {
+  assert(valid_);
+  return current_->value();
+}
+
+Status ForwardIterator::status() const {
+  if (!status_.ok()) {
+    return status_;
+  } else if (!mutable_iter_->status().ok()) {
+    return mutable_iter_->status();
+  }
+  return Status::OK();
+}
+
+void ForwardIterator::RebuildIterators() {
+  // Clean up
+  Cleanup();
+  // New
+  sv_ = cfd_->GetReferencedSuperVersion(&(db_->mutex_));
+  mutable_iter_ = sv_->mem->NewIterator(read_options_);
+  sv_->imm->AddIterators(read_options_, &imm_iters_);
+  const auto& l0_files = sv_->current->files_[0];
+  l0_iters_.reserve(l0_files.size());
+  for (const auto* l0 : l0_files) {
+    l0_iters_.push_back(cfd_->table_cache()->NewIterator(
+        read_options_, *cfd_->soptions(), cfd_->internal_comparator(), *l0));
+  }
+  level_iters_.reserve(sv_->current->NumberLevels() - 1);
+  for (int32_t level = 1; level < sv_->current->NumberLevels(); ++level) {
+    if (sv_->current->files_[level].empty()) {
+      level_iters_.push_back(nullptr);
+    } else {
+      level_iters_.push_back(new LevelIterator(cfd_, read_options_,
+          sv_->current->files_[level]));
+    }
+  }
+
+  current_ = nullptr;
+  is_prev_set_ = false;
+}
+
+void ForwardIterator::UpdateCurrent() {
+  if (immutable_min_heap_.empty() && !mutable_iter_->Valid()) {
+    current_ = nullptr;
+  } else if (immutable_min_heap_.empty()) {
+    current_ = mutable_iter_;
+  } else if (!mutable_iter_->Valid()) {
+    current_ = immutable_min_heap_.top();
+    immutable_min_heap_.pop();
+  } else {
+    current_ = immutable_min_heap_.top();
+    assert(current_ != nullptr);
+    assert(current_->Valid());
+    int cmp = cfd_->internal_comparator().InternalKeyComparator::Compare(
+        mutable_iter_->key(), current_->key());
+    assert(cmp != 0);
+    if (cmp > 0) {
+      immutable_min_heap_.pop();
+    } else {
+      current_ = mutable_iter_;
+    }
+  }
+  valid_ = (current_ != nullptr);
+  if (!status_.ok()) {
+    status_ = Status::OK();
+  }
+}
+
+bool ForwardIterator::NeedToSeekImmutable(const Slice& target) {
+  if (!is_prev_set_) {
+    return true;
+  }
+  Slice prev_key = prev_key_.GetKey();
+  if (prefix_extractor_ && prefix_extractor_->Transform(target).compare(
+        prefix_extractor_->Transform(prev_key)) != 0) {
+    return true;
+  }
+  if (cfd_->internal_comparator().InternalKeyComparator::Compare(
+        prev_key, target) >= 0) {
+    return true;
+  }
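+  // Editor's note (illustrative, not part of the original commit): the check
+  // below asks whether the target is beyond the smallest key the immutable
+  // iterators are currently positioned at; if it is, the previous immutable
+  // seek cannot be reused and a fresh seek is required.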
+  if (immutable_min_heap_.empty() ||
+      cfd_->internal_comparator().InternalKeyComparator::Compare(
+          target, current_ == mutable_iter_ ? immutable_min_heap_.top()->key()
+                                            : current_->key()) > 0) {
+    return true;
+  }
+  return false;
+}
+
+uint32_t ForwardIterator::FindFileInRange(
+    const std::vector<FileMetaData*>& files, const Slice& internal_key,
+    uint32_t left, uint32_t right) {
+  while (left < right) {
+    uint32_t mid = (left + right) / 2;
+    const FileMetaData* f = files[mid];
+    if (cfd_->internal_comparator().InternalKeyComparator::Compare(
+          f->largest.Encode(), internal_key) < 0) {
+      // Key at "mid.largest" is < "target". Therefore all
+      // files at or before "mid" are uninteresting.
+      left = mid + 1;
+    } else {
+      // Key at "mid.largest" is >= "target". Therefore all files
+      // after "mid" are uninteresting.
+      right = mid;
+    }
+  }
+  return right;
+}
+
+} // namespace rocksdb
+
+#endif // ROCKSDB_LITE
diff --git a/db/forward_iterator.h b/db/forward_iterator.h
new file mode 100644
index 0000000000..d539ae3c70
--- /dev/null
+++ b/db/forward_iterator.h
@@ -0,0 +1,105 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <string>
+#include <vector>
+#include <queue>
+
+#include "rocksdb/db.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/options.h"
+#include "db/dbformat.h"
+
+namespace rocksdb {
+
+class DBImpl;
+class Env;
+struct SuperVersion;
+class ColumnFamilyData;
+class LevelIterator;
+struct FileMetaData;
+
+class MinIterComparator {
+ public:
+  explicit MinIterComparator(const Comparator* comparator) :
+      comparator_(comparator) {}
+
+  bool operator()(Iterator* a, Iterator* b) {
+    return comparator_->Compare(a->key(), b->key()) > 0;
+  }
+ private:
+  const Comparator* comparator_;
+};
+
+typedef std::priority_queue<Iterator*, std::vector<Iterator*>,
+                            MinIterComparator> MinIterHeap;
+
+/**
+ * ForwardIterator is a special type of iterator that only supports Seek()
+ * and Next(). It is expected to perform better than TailingIterator by
+ * removing the encapsulation and making all information accessible within
+ * the iterator. In the current implementation, a snapshot is taken at the
+ * time Seek() is called; Next() calls that follow do not see entries that
+ * were written after that point.
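+ *
+ * Editor's illustrative sketch (not part of the original commit): a typical
+ * read loop over an instance `iter` of this class looks like
+ *
+ *     iter->Seek(start_key);
+ *     while (iter->Valid()) {
+ *       Use(iter->key(), iter->value());  // Use() is a placeholder
+ *       iter->Next();
+ *     }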
+ */
+class ForwardIterator : public Iterator {
+ public:
+  ForwardIterator(DBImpl* db, const ReadOptions& read_options,
+                  ColumnFamilyData* cfd);
+  virtual ~ForwardIterator();
+
+  void SeekToLast() override {
+    status_ = Status::NotSupported("ForwardIterator::SeekToLast()");
+    valid_ = false;
+  }
+  void Prev() {
+    status_ = Status::NotSupported("ForwardIterator::Prev");
+    valid_ = false;
+  }
+
+  virtual bool Valid() const override;
+  void SeekToFirst() override;
+  virtual void Seek(const Slice& target) override;
+  virtual void Next() override;
+  virtual Slice key() const override;
+  virtual Slice value() const override;
+  virtual Status status() const override;
+
+ private:
+  void Cleanup();
+  void RebuildIterators();
+  void SeekInternal(const Slice& internal_key, bool seek_to_first);
+  void UpdateCurrent();
+  bool NeedToSeekImmutable(const Slice& internal_key);
+  uint32_t FindFileInRange(
+      const std::vector<FileMetaData*>& files, const Slice& internal_key,
+      uint32_t left, uint32_t right);
+
+  DBImpl* const db_;
+  const ReadOptions read_options_;
+  ColumnFamilyData* const cfd_;
+  const SliceTransform* const prefix_extractor_;
+  const Comparator* user_comparator_;
+  MinIterHeap immutable_min_heap_;
+
+  SuperVersion* sv_;
+  Iterator* mutable_iter_;
+  std::vector<Iterator*> imm_iters_;
+  std::vector<Iterator*> l0_iters_;
+  std::vector<LevelIterator*> level_iters_;
+  Iterator* current_;
+  // internal iterator status
+  Status status_;
+  bool valid_;
+
+  IterKey prev_key_;
+  bool is_prev_set_;
+};
+
+} // namespace rocksdb
+#endif // ROCKSDB_LITE
diff --git a/db/internal_stats.cc b/db/internal_stats.cc
new file mode 100644
index 0000000000..e8b22a7f82
--- /dev/null
+++ b/db/internal_stats.cc
@@ -0,0 +1,369 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
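+//
+// Editor's note (illustrative, not part of the original commit): the
+// properties computed below are surfaced through DB::GetProperty(), e.g.
+//
+//   std::string v;
+//   db->GetProperty("rocksdb.stats", &v);                // human-readable dump
+//   db->GetProperty("rocksdb.num-files-at-level0", &v);  // per-level count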
+ +#include "db/internal_stats.h" +#include "db/column_family.h" + +#include + +namespace rocksdb { + +DBPropertyType GetPropertyType(const Slice& property) { + Slice in = property; + Slice prefix("rocksdb."); + if (!in.starts_with(prefix)) return kUnknown; + in.remove_prefix(prefix.size()); + + if (in.starts_with("num-files-at-level")) { + return kNumFilesAtLevel; + } else if (in == "levelstats") { + return kLevelStats; + } else if (in == "stats") { + return kStats; + } else if (in == "sstables") { + return kSsTables; + } else if (in == "num-immutable-mem-table") { + return kNumImmutableMemTable; + } else if (in == "mem-table-flush-pending") { + return kMemtableFlushPending; + } else if (in == "compaction-pending") { + return kCompactionPending; + } else if (in == "background-errors") { + return kBackgroundErrors; + } else if (in == "cur-size-active-mem-table") { + return kCurSizeActiveMemTable; + } else if (in == "num-entries-active-mem-table") { + return kNumEntriesInMutableMemtable; + } else if (in == "num-entries-imm-mem-tables") { + return kNumEntriesInImmutableMemtable; + } + return kUnknown; +} + +bool InternalStats::GetProperty(DBPropertyType property_type, + const Slice& property, std::string* value, + ColumnFamilyData* cfd) { + Version* current = cfd->current(); + Slice in = property; + + switch (property_type) { + case kNumFilesAtLevel: { + in.remove_prefix(strlen("rocksdb.num-files-at-level")); + uint64_t level; + bool ok = ConsumeDecimalNumber(&in, &level) && in.empty(); + if (!ok || (int)level >= number_levels_) { + return false; + } else { + char buf[100]; + snprintf(buf, sizeof(buf), "%d", + current->NumLevelFiles(static_cast(level))); + *value = buf; + return true; + } + } + case kLevelStats: { + char buf[1000]; + snprintf(buf, sizeof(buf), + "Level Files Size(MB)\n" + "--------------------\n"); + value->append(buf); + + for (int level = 0; level < number_levels_; level++) { + snprintf(buf, sizeof(buf), "%3d %8d %8.0f\n", level, + current->NumLevelFiles(level), + current->NumLevelBytes(level) / 1048576.0); + value->append(buf); + } + return true; + } + case kStats: { + char buf[1000]; + + uint64_t wal_bytes = 0; + uint64_t wal_synced = 0; + uint64_t user_bytes_written = 0; + uint64_t write_other = 0; + uint64_t write_self = 0; + uint64_t write_with_wal = 0; + uint64_t total_bytes_written = 0; + uint64_t total_bytes_read = 0; + uint64_t micros_up = env_->NowMicros() - started_at_; + // Add "+1" to make sure seconds_up is > 0 and avoid NaN later + double seconds_up = (micros_up + 1) / 1000000.0; + uint64_t total_slowdown = 0; + uint64_t total_slowdown_count = 0; + uint64_t interval_bytes_written = 0; + uint64_t interval_bytes_read = 0; + uint64_t interval_bytes_new = 0; + double interval_seconds_up = 0; + + if (statistics_) { + wal_bytes = statistics_->getTickerCount(WAL_FILE_BYTES); + wal_synced = statistics_->getTickerCount(WAL_FILE_SYNCED); + user_bytes_written = statistics_->getTickerCount(BYTES_WRITTEN); + write_other = statistics_->getTickerCount(WRITE_DONE_BY_OTHER); + write_self = statistics_->getTickerCount(WRITE_DONE_BY_SELF); + write_with_wal = statistics_->getTickerCount(WRITE_WITH_WAL); + } + + snprintf( + buf, sizeof(buf), + " Compactions\n" + "Level Files Size(MB) Score Time(sec) Read(MB) Write(MB) Rn(MB) " + " " + "Rnp1(MB) Wnew(MB) RW-Amplify Read(MB/s) Write(MB/s) Rn " + "Rnp1 " + " Wnp1 NewW Count msComp msStall Ln-stall Stall-cnt\n" + "--------------------------------------------------------------------" + "--" + 
"--------------------------------------------------------------------" + "--" + "----------------------------------------------------------------\n"); + value->append(buf); + for (int level = 0; level < number_levels_; level++) { + int files = current->NumLevelFiles(level); + if (compaction_stats_[level].micros > 0 || files > 0) { + int64_t bytes_read = compaction_stats_[level].bytes_readn + + compaction_stats_[level].bytes_readnp1; + int64_t bytes_new = compaction_stats_[level].bytes_written - + compaction_stats_[level].bytes_readnp1; + double amplify = + (compaction_stats_[level].bytes_readn == 0) + ? 0.0 + : (compaction_stats_[level].bytes_written + + compaction_stats_[level].bytes_readnp1 + + compaction_stats_[level].bytes_readn) / + (double)compaction_stats_[level].bytes_readn; + + total_bytes_read += bytes_read; + total_bytes_written += compaction_stats_[level].bytes_written; + + uint64_t stalls = level == 0 ? (stall_counts_[LEVEL0_SLOWDOWN] + + stall_counts_[LEVEL0_NUM_FILES] + + stall_counts_[MEMTABLE_COMPACTION]) + : stall_leveln_slowdown_count_[level]; + + double stall_us = level == 0 ? (stall_micros_[LEVEL0_SLOWDOWN] + + stall_micros_[LEVEL0_NUM_FILES] + + stall_micros_[MEMTABLE_COMPACTION]) + : stall_leveln_slowdown_[level]; + + snprintf(buf, sizeof(buf), + "%3d %8d %8.0f %5.1f %9.0f %9.0f %9.0f %9.0f %9.0f %9.0f " + "%10.1f %9.1f %11.1f %8d %8d %8d %8d %8d %8d %9.1f %9.1f " + "%9lu\n", + level, files, current->NumLevelBytes(level) / 1048576.0, + current->NumLevelBytes(level) / + cfd->compaction_picker()->MaxBytesForLevel(level), + compaction_stats_[level].micros / 1e6, + bytes_read / 1048576.0, + compaction_stats_[level].bytes_written / 1048576.0, + compaction_stats_[level].bytes_readn / 1048576.0, + compaction_stats_[level].bytes_readnp1 / 1048576.0, + bytes_new / 1048576.0, amplify, + // +1 to avoid division by 0 + (bytes_read / 1048576.0) / + ((compaction_stats_[level].micros + 1) / 1000000.0), + (compaction_stats_[level].bytes_written / 1048576.0) / + ((compaction_stats_[level].micros + 1) / 1000000.0), + compaction_stats_[level].files_in_leveln, + compaction_stats_[level].files_in_levelnp1, + compaction_stats_[level].files_out_levelnp1, + compaction_stats_[level].files_out_levelnp1 - + compaction_stats_[level].files_in_levelnp1, + compaction_stats_[level].count, + (int)((double)compaction_stats_[level].micros / 1000.0 / + (compaction_stats_[level].count + 1)), + (double)stall_us / 1000.0 / (stalls + 1), + stall_us / 1000000.0, (unsigned long)stalls); + total_slowdown += stall_leveln_slowdown_[level]; + total_slowdown_count += stall_leveln_slowdown_count_[level]; + value->append(buf); + } + } + + interval_bytes_new = user_bytes_written - last_stats_.ingest_bytes_; + interval_bytes_read = + total_bytes_read - last_stats_.compaction_bytes_read_; + interval_bytes_written = + total_bytes_written - last_stats_.compaction_bytes_written_; + interval_seconds_up = seconds_up - last_stats_.seconds_up_; + + snprintf(buf, sizeof(buf), "Uptime(secs): %.1f total, %.1f interval\n", + seconds_up, interval_seconds_up); + value->append(buf); + + snprintf(buf, sizeof(buf), + "Writes cumulative: %llu total, %llu batches, " + "%.1f per batch, %.2f ingest GB\n", + (unsigned long long)(write_other + write_self), + (unsigned long long)write_self, + (write_other + write_self) / (double)(write_self + 1), + user_bytes_written / (1048576.0 * 1024)); + value->append(buf); + + snprintf(buf, sizeof(buf), + "WAL cumulative: %llu WAL writes, %llu WAL syncs, " + "%.2f writes per sync, %.2f GB written\n", + 
(unsigned long long)write_with_wal, + (unsigned long long)wal_synced, + write_with_wal / (double)(wal_synced + 1), + wal_bytes / (1048576.0 * 1024)); + value->append(buf); + + snprintf(buf, sizeof(buf), + "Compaction IO cumulative (GB): " + "%.2f new, %.2f read, %.2f write, %.2f read+write\n", + user_bytes_written / (1048576.0 * 1024), + total_bytes_read / (1048576.0 * 1024), + total_bytes_written / (1048576.0 * 1024), + (total_bytes_read + total_bytes_written) / (1048576.0 * 1024)); + value->append(buf); + + snprintf( + buf, sizeof(buf), + "Compaction IO cumulative (MB/sec): " + "%.1f new, %.1f read, %.1f write, %.1f read+write\n", + user_bytes_written / 1048576.0 / seconds_up, + total_bytes_read / 1048576.0 / seconds_up, + total_bytes_written / 1048576.0 / seconds_up, + (total_bytes_read + total_bytes_written) / 1048576.0 / seconds_up); + value->append(buf); + + // +1 to avoid divide by 0 and NaN + snprintf( + buf, sizeof(buf), + "Amplification cumulative: %.1f write, %.1f compaction\n", + (double)(total_bytes_written + wal_bytes) / (user_bytes_written + 1), + (double)(total_bytes_written + total_bytes_read + wal_bytes) / + (user_bytes_written + 1)); + value->append(buf); + + uint64_t interval_write_other = write_other - last_stats_.write_other_; + uint64_t interval_write_self = write_self - last_stats_.write_self_; + + snprintf(buf, sizeof(buf), + "Writes interval: %llu total, %llu batches, " + "%.1f per batch, %.1f ingest MB\n", + (unsigned long long)(interval_write_other + interval_write_self), + (unsigned long long)interval_write_self, + (double)(interval_write_other + interval_write_self) / + (interval_write_self + 1), + (user_bytes_written - last_stats_.ingest_bytes_) / 1048576.0); + value->append(buf); + + uint64_t interval_write_with_wal = + write_with_wal - last_stats_.write_with_wal_; + + uint64_t interval_wal_synced = wal_synced - last_stats_.wal_synced_; + uint64_t interval_wal_bytes = wal_bytes - last_stats_.wal_bytes_; + + snprintf(buf, sizeof(buf), + "WAL interval: %llu WAL writes, %llu WAL syncs, " + "%.2f writes per sync, %.2f MB written\n", + (unsigned long long)interval_write_with_wal, + (unsigned long long)interval_wal_synced, + interval_write_with_wal / (double)(interval_wal_synced + 1), + interval_wal_bytes / 1048576.0); + value->append(buf); + + snprintf(buf, sizeof(buf), + "Compaction IO interval (MB): " + "%.2f new, %.2f read, %.2f write, %.2f read+write\n", + interval_bytes_new / 1048576.0, interval_bytes_read / 1048576.0, + interval_bytes_written / 1048576.0, + (interval_bytes_read + interval_bytes_written) / 1048576.0); + value->append(buf); + + snprintf(buf, sizeof(buf), + "Compaction IO interval (MB/sec): " + "%.1f new, %.1f read, %.1f write, %.1f read+write\n", + interval_bytes_new / 1048576.0 / interval_seconds_up, + interval_bytes_read / 1048576.0 / interval_seconds_up, + interval_bytes_written / 1048576.0 / interval_seconds_up, + (interval_bytes_read + interval_bytes_written) / 1048576.0 / + interval_seconds_up); + value->append(buf); + + // +1 to avoid divide by 0 and NaN + snprintf( + buf, sizeof(buf), + "Amplification interval: %.1f write, %.1f compaction\n", + (double)(interval_bytes_written + interval_wal_bytes) / + (interval_bytes_new + 1), + (double)(interval_bytes_written + interval_bytes_read + + interval_wal_bytes) / + (interval_bytes_new + 1)); + value->append(buf); + + snprintf(buf, sizeof(buf), + "Stalls(secs): %.3f level0_slowdown, %.3f level0_numfiles, " + "%.3f memtable_compaction, %.3f leveln_slowdown\n", + stall_micros_[LEVEL0_SLOWDOWN] /
1000000.0, + stall_micros_[LEVEL0_NUM_FILES] / 1000000.0, + stall_micros_[MEMTABLE_COMPACTION] / 1000000.0, + total_slowdown / 1000000.0); + value->append(buf); + + snprintf(buf, sizeof(buf), + "Stalls(count): %lu level0_slowdown, %lu level0_numfiles, " + "%lu memtable_compaction, %lu leveln_slowdown\n", + (unsigned long)stall_counts_[LEVEL0_SLOWDOWN], + (unsigned long)stall_counts_[LEVEL0_NUM_FILES], + (unsigned long)stall_counts_[MEMTABLE_COMPACTION], + (unsigned long)total_slowdown_count); + value->append(buf); + + last_stats_.compaction_bytes_read_ = total_bytes_read; + last_stats_.compaction_bytes_written_ = total_bytes_written; + last_stats_.ingest_bytes_ = user_bytes_written; + last_stats_.seconds_up_ = seconds_up; + last_stats_.wal_bytes_ = wal_bytes; + last_stats_.wal_synced_ = wal_synced; + last_stats_.write_with_wal_ = write_with_wal; + last_stats_.write_other_ = write_other; + last_stats_.write_self_ = write_self; + + return true; + } + case kSsTables: + *value = current->DebugString(); + return true; + case kNumImmutableMemTable: + *value = std::to_string(cfd->imm()->size()); + return true; + case kMemtableFlushPending: + // Return number of mem tables that are ready to flush (made immutable) + *value = std::to_string(cfd->imm()->IsFlushPending() ? 1 : 0); + return true; + case kCompactionPending: + // 1 if the system already determines at least one compaction is needed. + // 0 otherwise. + *value = std::to_string(current->NeedsCompaction() ? 1 : 0); + return true; + case kBackgroundErrors: + // Accumulated number of errors in background flushes or compactions. + *value = std::to_string(GetBackgroundErrorCount()); + return true; + case kCurSizeActiveMemTable: + // Current size of the active memtable + *value = std::to_string(cfd->mem()->ApproximateMemoryUsage()); + return true; + case kNumEntriesInMutableMemtable: + // Number of entries in the active memtable + *value = std::to_string(cfd->mem()->GetNumEntries()); + return true; + case kNumEntriesInImmutableMemtable: + // Total number of entries in all the immutable mem tables + *value = std::to_string(cfd->imm()->current()->GetTotalNumEntries()); + return true; + default: + return false; + } +} + +} // namespace rocksdb diff --git a/db/internal_stats.h b/db/internal_stats.h new file mode 100644 index 0000000000..2a743593d9 --- /dev/null +++ b/db/internal_stats.h @@ -0,0 +1,187 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// + +#pragma once +#include "rocksdb/statistics.h" +#include "util/statistics.h" +#include "db/version_set.h" + +#include <vector> +#include <string> + +class ColumnFamilyData; + +namespace rocksdb { + +class MemTableList; +class DBImpl; + +enum DBPropertyType { + kNumFilesAtLevel, // Number of files at a specific level + kLevelStats, // Return number of files and total sizes of each level + kStats, // Return general statistics of DB + kSsTables, // Return a human readable string of current SST files + kNumImmutableMemTable, // Return number of immutable mem tables + kMemtableFlushPending, // Return 1 if mem table flushing is pending, + // otherwise 0.
+ kCompactionPending, // Return 1 if a compaction is pending. Otherwise 0. + kBackgroundErrors, // Return accumulated background errors encountered. + kCurSizeActiveMemTable, // Return current size of the active memtable + kNumEntriesInMutableMemtable, // Return number of entries in the mutable + // memtable. + kNumEntriesInImmutableMemtable, // Return sum of number of entries in all + // the immutable mem tables. + kUnknown, +}; + +extern DBPropertyType GetPropertyType(const Slice& property); + +class InternalStats { + public: + enum WriteStallType { + LEVEL0_SLOWDOWN, + MEMTABLE_COMPACTION, + LEVEL0_NUM_FILES, + WRITE_STALLS_ENUM_MAX, + }; + + InternalStats(int num_levels, Env* env, Statistics* statistics) + : compaction_stats_(num_levels), + stall_micros_(WRITE_STALLS_ENUM_MAX, 0), + stall_counts_(WRITE_STALLS_ENUM_MAX, 0), + stall_leveln_slowdown_(num_levels, 0), + stall_leveln_slowdown_count_(num_levels, 0), + bg_error_count_(0), + number_levels_(num_levels), + statistics_(statistics), + env_(env), + started_at_(env->NowMicros()) {} + + // Per level compaction stats. compaction_stats_[level] stores the stats for + // compactions that produced data for the specified "level". + struct CompactionStats { + uint64_t micros; + + // Bytes read from level N during compaction between levels N and N+1 + int64_t bytes_readn; + + // Bytes read from level N+1 during compaction between levels N and N+1 + int64_t bytes_readnp1; + + // Total bytes written during compaction between levels N and N+1 + int64_t bytes_written; + + // Files read from level N during compaction between levels N and N+1 + int files_in_leveln; + + // Files read from level N+1 during compaction between levels N and N+1 + int files_in_levelnp1; + + // Files written during compaction between levels N and N+1 + int files_out_levelnp1; + + // Number of compactions done + int count; + + CompactionStats() + : micros(0), + bytes_readn(0), + bytes_readnp1(0), + bytes_written(0), + files_in_leveln(0), + files_in_levelnp1(0), + files_out_levelnp1(0), + count(0) {} + + void Add(const CompactionStats& c) { + this->micros += c.micros; + this->bytes_readn += c.bytes_readn; + this->bytes_readnp1 += c.bytes_readnp1; + this->bytes_written += c.bytes_written; + this->files_in_leveln += c.files_in_leveln; + this->files_in_levelnp1 += c.files_in_levelnp1; + this->files_out_levelnp1 += c.files_out_levelnp1; + this->count += 1; + } + }; + + void AddCompactionStats(int level, const CompactionStats& stats) { + compaction_stats_[level].Add(stats); + } + + void RecordWriteStall(WriteStallType write_stall_type, uint64_t micros) { + stall_micros_[write_stall_type] += micros; + stall_counts_[write_stall_type]++; + } + + void RecordLevelNSlowdown(int level, uint64_t micros) { + stall_leveln_slowdown_[level] += micros; + stall_leveln_slowdown_count_[level]++; + } + + uint64_t GetBackgroundErrorCount() const { return bg_error_count_; } + + uint64_t BumpAndGetBackgroundErrorCount() { return ++bg_error_count_; } + + bool GetProperty(DBPropertyType property_type, const Slice& property, + std::string* value, ColumnFamilyData* cfd); + + private: + std::vector<CompactionStats> compaction_stats_; + + // Used to compute per-interval statistics + struct StatsSnapshot { + uint64_t compaction_bytes_read_; // Bytes read by compaction + uint64_t compaction_bytes_written_; // Bytes written by compaction + uint64_t ingest_bytes_; // Bytes written by user + uint64_t wal_bytes_; // Bytes written to WAL + uint64_t wal_synced_; // Number of times WAL is synced + uint64_t
write_with_wal_; // Number of writes that request WAL + // These count the number of writes processed by the calling thread or + // another thread. + uint64_t write_other_; + uint64_t write_self_; + double seconds_up_; + + StatsSnapshot() + : compaction_bytes_read_(0), + compaction_bytes_written_(0), + ingest_bytes_(0), + wal_bytes_(0), + wal_synced_(0), + write_with_wal_(0), + write_other_(0), + write_self_(0), + seconds_up_(0) {} + }; + + // Counters from the previous time per-interval stats were computed + StatsSnapshot last_stats_; + + // These count the number of microseconds for which MakeRoomForWrite stalls. + std::vector<uint64_t> stall_micros_; + std::vector<uint64_t> stall_counts_; + std::vector<uint64_t> stall_leveln_slowdown_; + std::vector<uint64_t> stall_leveln_slowdown_count_; + + // Total number of background errors encountered. Every time a flush task + // or compaction task fails, this counter is incremented. The failure can + // be caused by any possible reason, including file system errors, out of + // resources, or input file corruption. Failing when retrying the same flush + // or compaction will cause the counter to increase too. + uint64_t bg_error_count_; + + int number_levels_; + Statistics* statistics_; + Env* env_; + uint64_t started_at_; +}; + +} // namespace rocksdb diff --git a/db/log_and_apply_bench.cc b/db/log_and_apply_bench.cc new file mode 100644 index 0000000000..ab9716deb3 --- /dev/null +++ b/db/log_and_apply_bench.cc @@ -0,0 +1,79 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + + +#include <vector> + +#include "util/testharness.h" +#include "util/benchharness.h" +#include "db/version_set.h" +#include "util/mutexlock.h" + +namespace rocksdb { + +std::string MakeKey(unsigned int num) { + char buf[30]; + snprintf(buf, sizeof(buf), "%016u", num); + return std::string(buf); +} + +void BM_LogAndApply(int iters, int num_base_files) { + VersionSet* vset; + ColumnFamilyData* default_cfd; + uint64_t fnum = 1; + port::Mutex mu; + MutexLock l(&mu); + + BENCHMARK_SUSPEND { + std::string dbname = test::TmpDir() + "/rocksdb_test_benchmark"; + ASSERT_OK(DestroyDB(dbname, Options())); + + DB* db = nullptr; + Options opts; + opts.create_if_missing = true; + Status s = DB::Open(opts, dbname, &db); + ASSERT_OK(s); + ASSERT_TRUE(db != nullptr); + + delete db; + db = nullptr; + + Options options; + EnvOptions sopt; + vset = new VersionSet(dbname, &options, sopt, nullptr); + std::vector<ColumnFamilyDescriptor> dummy; + dummy.push_back(ColumnFamilyDescriptor()); + ASSERT_OK(vset->Recover(dummy)); + default_cfd = vset->GetColumnFamilySet()->GetDefault(); + VersionEdit vbase; + for (int i = 0; i < num_base_files; i++) { + InternalKey start(MakeKey(2 * fnum), 1, kTypeValue); + InternalKey limit(MakeKey(2 * fnum + 1), 1, kTypeDeletion); + vbase.AddFile(2, ++fnum, 1 /* file size */, start, limit, 1, 1); + } + ASSERT_OK(vset->LogAndApply(default_cfd, &vbase, &mu)); + } + + for (int i = 0; i < iters; i++) { + VersionEdit vedit; + vedit.DeleteFile(2, fnum); + InternalKey start(MakeKey(2 * fnum), 1, kTypeValue); + InternalKey limit(MakeKey(2 * fnum + 1), 1, kTypeDeletion); + vedit.AddFile(2, ++fnum, 1 /* file size */, start, limit, 1, 1); + vset->LogAndApply(default_cfd, &vedit, &mu); + } +} + +BENCHMARK_NAMED_PARAM(BM_LogAndApply, 1000_iters_1_file, 1000, 1) +BENCHMARK_NAMED_PARAM(BM_LogAndApply,
1000_iters_100_files, 1000, 100) +BENCHMARK_NAMED_PARAM(BM_LogAndApply, 1000_iters_10000_files, 1000, 10000) +BENCHMARK_NAMED_PARAM(BM_LogAndApply, 100_iters_100000_files, 100, 100000) + +} // namespace rocksdb + +int main(int argc, char** argv) { + rocksdb::benchmark::RunBenchmarks(); + return 0; +} diff --git a/db/log_format.h b/db/log_format.h new file mode 100644 index 0000000000..919c087e24 --- /dev/null +++ b/db/log_format.h @@ -0,0 +1,35 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Log format information shared by reader and writer. +// See ../doc/log_format.txt for more detail. + +#pragma once +namespace rocksdb { +namespace log { + +enum RecordType { + // Zero is reserved for preallocated files + kZeroType = 0, + kFullType = 1, + + // For fragments + kFirstType = 2, + kMiddleType = 3, + kLastType = 4 +}; +static const int kMaxRecordType = kLastType; + +static const unsigned int kBlockSize = 32768; + +// Header is checksum (4 bytes), length (2 bytes), type (1 byte). +static const int kHeaderSize = 4 + 2 + 1; + +} // namespace log +} // namespace rocksdb diff --git a/db/log_reader.cc b/db/log_reader.cc new file mode 100644 index 0000000000..be1fb8ceb6 --- /dev/null +++ b/db/log_reader.cc @@ -0,0 +1,339 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors.
+ +#include "db/log_reader.h" + +#include +#include "rocksdb/env.h" +#include "util/coding.h" +#include "util/crc32c.h" + +namespace rocksdb { +namespace log { + +Reader::Reporter::~Reporter() { +} + +Reader::Reader(unique_ptr&& file, Reporter* reporter, + bool checksum, uint64_t initial_offset) + : file_(std::move(file)), + reporter_(reporter), + checksum_(checksum), + backing_store_(new char[kBlockSize]), + buffer_(), + eof_(false), + read_error_(false), + eof_offset_(0), + last_record_offset_(0), + end_of_buffer_offset_(0), + initial_offset_(initial_offset) { +} + +Reader::~Reader() { + delete[] backing_store_; +} + +bool Reader::SkipToInitialBlock() { + size_t offset_in_block = initial_offset_ % kBlockSize; + uint64_t block_start_location = initial_offset_ - offset_in_block; + + // Don't search a block if we'd be in the trailer + if (offset_in_block > kBlockSize - 6) { + offset_in_block = 0; + block_start_location += kBlockSize; + } + + end_of_buffer_offset_ = block_start_location; + + // Skip to start of first block that can contain the initial record + if (block_start_location > 0) { + Status skip_status = file_->Skip(block_start_location); + if (!skip_status.ok()) { + ReportDrop(block_start_location, skip_status); + return false; + } + } + + return true; +} + +bool Reader::ReadRecord(Slice* record, std::string* scratch) { + if (last_record_offset_ < initial_offset_) { + if (!SkipToInitialBlock()) { + return false; + } + } + + scratch->clear(); + record->clear(); + bool in_fragmented_record = false; + // Record offset of the logical record that we're reading + // 0 is a dummy value to make compilers happy + uint64_t prospective_record_offset = 0; + + Slice fragment; + while (true) { + uint64_t physical_record_offset = end_of_buffer_offset_ - buffer_.size(); + const unsigned int record_type = ReadPhysicalRecord(&fragment); + switch (record_type) { + case kFullType: + if (in_fragmented_record) { + // Handle bug in earlier versions of log::Writer where + // it could emit an empty kFirstType record at the tail end + // of a block followed by a kFullType or kFirstType record + // at the beginning of the next block. + if (scratch->empty()) { + in_fragmented_record = false; + } else { + ReportCorruption(scratch->size(), "partial record without end(1)"); + } + } + prospective_record_offset = physical_record_offset; + scratch->clear(); + *record = fragment; + last_record_offset_ = prospective_record_offset; + return true; + + case kFirstType: + if (in_fragmented_record) { + // Handle bug in earlier versions of log::Writer where + // it could emit an empty kFirstType record at the tail end + // of a block followed by a kFullType or kFirstType record + // at the beginning of the next block. 
+ if (scratch->empty()) { + in_fragmented_record = false; + } else { + ReportCorruption(scratch->size(), "partial record without end(2)"); + } + } + prospective_record_offset = physical_record_offset; + scratch->assign(fragment.data(), fragment.size()); + in_fragmented_record = true; + break; + + case kMiddleType: + if (!in_fragmented_record) { + ReportCorruption(fragment.size(), + "missing start of fragmented record(1)"); + } else { + scratch->append(fragment.data(), fragment.size()); + } + break; + + case kLastType: + if (!in_fragmented_record) { + ReportCorruption(fragment.size(), + "missing start of fragmented record(2)"); + } else { + scratch->append(fragment.data(), fragment.size()); + *record = Slice(*scratch); + last_record_offset_ = prospective_record_offset; + return true; + } + break; + + case kEof: + if (in_fragmented_record) { + // This can be caused by the writer dying immediately after + // writing a physical record but before completing the next; don't + // treat it as a corruption, just ignore the entire logical record. + scratch->clear(); + } + return false; + + case kBadRecord: + if (in_fragmented_record) { + ReportCorruption(scratch->size(), "error in middle of record"); + in_fragmented_record = false; + scratch->clear(); + } + break; + + default: { + char buf[40]; + snprintf(buf, sizeof(buf), "unknown record type %u", record_type); + ReportCorruption( + (fragment.size() + (in_fragmented_record ? scratch->size() : 0)), + buf); + in_fragmented_record = false; + scratch->clear(); + break; + } + } + } + return false; +} + +uint64_t Reader::LastRecordOffset() { + return last_record_offset_; +} + +void Reader::UnmarkEOF() { + if (read_error_) { + return; + } + + eof_ = false; + + if (eof_offset_ == 0) { + return; + } + + // If the EOF was in the middle of a block (a partial block was read) we have + // to read the rest of the block as ReadPhysicalRecord can only read full + // blocks and expects the file position indicator to be aligned to the start + // of a block. + // + // consumed_bytes + buffer_size() + remaining == kBlockSize + + size_t consumed_bytes = eof_offset_ - buffer_.size(); + size_t remaining = kBlockSize - eof_offset_; + + // backing_store_ is used to concatenate what is left in buffer_ and + // the remainder of the block. If buffer_ already uses backing_store_, + // we just append the new data. + if (buffer_.data() != backing_store_ + consumed_bytes) { + // Buffer_ does not use backing_store_ for storage. + // Copy what is left in buffer_ to backing_store. 
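+ // memmove is used because the source and destination ranges may + // overlap; afterwards the unread bytes occupy + // [consumed_bytes, eof_offset_) of backing_store_, immediately before + // the space the rest of the block is read into below.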
+ memmove(backing_store_ + consumed_bytes, buffer_.data(), buffer_.size()); + } + + Slice read_buffer; + Status status = file_->Read(remaining, &read_buffer, + backing_store_ + eof_offset_); + + size_t added = read_buffer.size(); + end_of_buffer_offset_ += added; + + if (!status.ok()) { + if (added > 0) { + ReportDrop(added, status); + } + + read_error_ = true; + return; + } + + if (read_buffer.data() != backing_store_ + eof_offset_) { + // Read did not write to backing_store_ + memmove(backing_store_ + eof_offset_, read_buffer.data(), + read_buffer.size()); + } + + buffer_ = Slice(backing_store_ + consumed_bytes, + eof_offset_ + added - consumed_bytes); + + if (added < remaining) { + eof_ = true; + eof_offset_ += added; + } else { + eof_offset_ = 0; + } +} + +void Reader::ReportCorruption(size_t bytes, const char* reason) { + ReportDrop(bytes, Status::Corruption(reason)); +} + +void Reader::ReportDrop(size_t bytes, const Status& reason) { + if (reporter_ != nullptr && + end_of_buffer_offset_ - buffer_.size() - bytes >= initial_offset_) { + reporter_->Corruption(bytes, reason); + } +} + +unsigned int Reader::ReadPhysicalRecord(Slice* result) { + while (true) { + if (buffer_.size() < (size_t)kHeaderSize) { + if (!eof_ && !read_error_) { + // Last read was a full read, so this is a trailer to skip + buffer_.clear(); + Status status = file_->Read(kBlockSize, &buffer_, backing_store_); + end_of_buffer_offset_ += buffer_.size(); + if (!status.ok()) { + buffer_.clear(); + ReportDrop(kBlockSize, status); + read_error_ = true; + return kEof; + } else if (buffer_.size() < (size_t)kBlockSize) { + eof_ = true; + eof_offset_ = buffer_.size(); + } + continue; + } else { + // Note that if buffer_ is non-empty, we have a truncated header at the + // end of the file, which can be caused by the writer crashing in the + // middle of writing the header. Instead of considering this an error, + // just report EOF. + buffer_.clear(); + return kEof; + } + } + + // Parse the header + const char* header = buffer_.data(); + const uint32_t a = static_cast<uint32_t>(header[4]) & 0xff; + const uint32_t b = static_cast<uint32_t>(header[5]) & 0xff; + const unsigned int type = header[6]; + const uint32_t length = a | (b << 8); + if (kHeaderSize + length > buffer_.size()) { + size_t drop_size = buffer_.size(); + buffer_.clear(); + if (!eof_) { + ReportCorruption(drop_size, "bad record length"); + return kBadRecord; + } + // If the end of the file has been reached without reading |length| bytes + // of payload, assume the writer died in the middle of writing the record. + // Don't report a corruption. + return kEof; + } + + if (type == kZeroType && length == 0) { + // Skip zero length record without reporting any drops since + // such records are produced by the mmap based writing code in + // env_posix.cc that preallocates file regions. + // NOTE: this should never happen in DB written by new RocksDB versions, + // since we turn off mmap writes to manifest and log files + buffer_.clear(); + return kBadRecord; + } + + // Check crc + if (checksum_) { + uint32_t expected_crc = crc32c::Unmask(DecodeFixed32(header)); + uint32_t actual_crc = crc32c::Value(header + 6, 1 + length); + if (actual_crc != expected_crc) { + // Drop the rest of the buffer since "length" itself may have + // been corrupted and if we trust it, we could find some + // fragment of a real log record that just happens to look + // like a valid log record.
+ size_t drop_size = buffer_.size(); + buffer_.clear(); + ReportCorruption(drop_size, "checksum mismatch"); + return kBadRecord; + } + } + + buffer_.remove_prefix(kHeaderSize + length); + + // Skip physical record that started before initial_offset_ + if (end_of_buffer_offset_ - buffer_.size() - kHeaderSize - length < + initial_offset_) { + result->clear(); + return kBadRecord; + } + + *result = Slice(header + kHeaderSize, length); + return type; + } +} + +} // namespace log +} // namespace rocksdb diff --git a/db/log_reader.h b/db/log_reader.h new file mode 100644 index 0000000000..81d334da29 --- /dev/null +++ b/db/log_reader.h @@ -0,0 +1,130 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include <memory> +#include <stdint.h> + +#include "db/log_format.h" +#include "rocksdb/slice.h" +#include "rocksdb/status.h" + +namespace rocksdb { + +class SequentialFile; +using std::unique_ptr; + +namespace log { + +class Reader { + public: + // Interface for reporting errors. + class Reporter { + public: + virtual ~Reporter(); + + // Some corruption was detected. "bytes" is the approximate number + // of bytes dropped due to the corruption. + virtual void Corruption(size_t bytes, const Status& status) = 0; + }; + + // Create a reader that will return log records from "*file". + // "*file" must remain live while this Reader is in use. + // + // If "reporter" is non-nullptr, it is notified whenever some data is + // dropped due to a detected corruption. "*reporter" must remain + // live while this Reader is in use. + // + // If "checksum" is true, verify checksums if available. + // + // The Reader will start reading at the first record located at physical + // position >= initial_offset within the file. + Reader(unique_ptr<SequentialFile>&& file, Reporter* reporter, + bool checksum, uint64_t initial_offset); + + ~Reader(); + + // Read the next record into *record. Returns true if read + // successfully, false if we hit end of the input. May use + // "*scratch" as temporary storage. The contents filled in *record + // will only be valid until the next mutating operation on this + // reader or the next mutation to *scratch. + bool ReadRecord(Slice* record, std::string* scratch); + + // Returns the physical offset of the last record returned by ReadRecord. + // + // Undefined before the first call to ReadRecord. + uint64_t LastRecordOffset(); + + // Returns true if the reader has encountered an eof condition. + bool IsEOF() { + return eof_; + } + + // When we know more data has been written to the file, we can use this + // function to force the reader to look again in the file. + // Also aligns the file position indicator to the start of the next block + // by reading the rest of the data from the EOF position to the end of the + // block that was partially read.
+ void UnmarkEOF(); + + SequentialFile* file() { return file_.get(); } + + private: + const unique_ptr<SequentialFile> file_; + Reporter* const reporter_; + bool const checksum_; + char* const backing_store_; + Slice buffer_; + bool eof_; // Last Read() indicated EOF by returning < kBlockSize + bool read_error_; // Error occurred while reading from file + + // Offset of the file position indicator within the last block when an + // EOF was detected. + size_t eof_offset_; + + // Offset of the last record returned by ReadRecord. + uint64_t last_record_offset_; + // Offset of the first location past the end of buffer_. + uint64_t end_of_buffer_offset_; + + // Offset at which to start looking for the first record to return + uint64_t const initial_offset_; + + // Extend record types with the following special values + enum { + kEof = kMaxRecordType + 1, + // Returned whenever we find an invalid physical record. + // Currently there are three situations in which this happens: + // * The record has an invalid CRC (ReadPhysicalRecord reports a drop) + // * The record is a 0-length record (No drop is reported) + // * The record is below constructor's initial_offset (No drop is reported) + kBadRecord = kMaxRecordType + 2 + }; + + // Skips all blocks that are completely before "initial_offset_". + // + // Returns true on success. Handles reporting. + bool SkipToInitialBlock(); + + // Return type, or one of the preceding special values + unsigned int ReadPhysicalRecord(Slice* result); + + // Reports dropped bytes to the reporter. + // buffer_ must be updated to remove the dropped bytes prior to invocation. + void ReportCorruption(size_t bytes, const char* reason); + void ReportDrop(size_t bytes, const Status& reason); + + // No copying allowed + Reader(const Reader&); + void operator=(const Reader&); +}; + +} // namespace log +} // namespace rocksdb diff --git a/db/log_test.cc b/db/log_test.cc new file mode 100644 index 0000000000..6577a6a9cb --- /dev/null +++ b/db/log_test.cc @@ -0,0 +1,689 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/log_reader.h" +#include "db/log_writer.h" +#include "rocksdb/env.h" +#include "util/coding.h" +#include "util/crc32c.h" +#include "util/random.h" +#include "util/testharness.h" + +namespace rocksdb { +namespace log { + +// Construct a string of the specified length made out of the supplied +// partial string.
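+// For example, BigString("ab", 5) yields "ababa".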
+static std::string BigString(const std::string& partial_string, size_t n) { + std::string result; + while (result.size() < n) { + result.append(partial_string); + } + result.resize(n); + return result; +} + +// Construct a string from a number +static std::string NumberString(int n) { + char buf[50]; + snprintf(buf, sizeof(buf), "%d.", n); + return std::string(buf); +} + +// Return a skewed potentially long string +static std::string RandomSkewedString(int i, Random* rnd) { + return BigString(NumberString(i), rnd->Skewed(17)); +} + +class LogTest { + private: + class StringDest : public WritableFile { + public: + std::string contents_; + + explicit StringDest(Slice& reader_contents) : + WritableFile(), + contents_(""), + reader_contents_(reader_contents), + last_flush_(0) { + reader_contents_ = Slice(contents_.data(), 0); + }; + + virtual Status Close() { return Status::OK(); } + virtual Status Flush() { + ASSERT_TRUE(reader_contents_.size() <= last_flush_); + size_t offset = last_flush_ - reader_contents_.size(); + reader_contents_ = Slice( + contents_.data() + offset, + contents_.size() - offset); + last_flush_ = contents_.size(); + + return Status::OK(); + } + virtual Status Sync() { return Status::OK(); } + virtual Status Append(const Slice& slice) { + contents_.append(slice.data(), slice.size()); + return Status::OK(); + } + void Drop(size_t bytes) { + contents_.resize(contents_.size() - bytes); + reader_contents_ = Slice( + reader_contents_.data(), reader_contents_.size() - bytes); + last_flush_ = contents_.size(); + } + + private: + Slice& reader_contents_; + size_t last_flush_; + }; + + class StringSource : public SequentialFile { + public: + Slice& contents_; + bool force_error_; + size_t force_error_position_; + bool force_eof_; + size_t force_eof_position_; + bool returned_partial_; + explicit StringSource(Slice& contents) : + contents_(contents), + force_error_(false), + force_error_position_(0), + force_eof_(false), + force_eof_position_(0), + returned_partial_(false) { } + + virtual Status Read(size_t n, Slice* result, char* scratch) { + ASSERT_TRUE(!returned_partial_) << "must not Read() after eof/error"; + + if (force_error_) { + if (force_error_position_ >= n) { + force_error_position_ -= n; + } else { + *result = Slice(contents_.data(), force_error_position_); + contents_.remove_prefix(force_error_position_); + force_error_ = false; + returned_partial_ = true; + return Status::Corruption("read error"); + } + } + + if (contents_.size() < n) { + n = contents_.size(); + returned_partial_ = true; + } + + if (force_eof_) { + if (force_eof_position_ >= n) { + force_eof_position_ -= n; + } else { + force_eof_ = false; + n = force_eof_position_; + returned_partial_ = true; + } + } + + // By using scratch we ensure that caller has control over the + // lifetime of result.data() + memcpy(scratch, contents_.data(), n); + *result = Slice(scratch, n); + + contents_.remove_prefix(n); + return Status::OK(); + } + + virtual Status Skip(uint64_t n) { + if (n > contents_.size()) { + contents_.clear(); + return Status::NotFound("in-memory file skipped past end"); + } + + contents_.remove_prefix(n); + + return Status::OK(); + } + }; + + class ReportCollector : public Reader::Reporter { + public: + size_t dropped_bytes_; + std::string message_; + + ReportCollector() : dropped_bytes_(0) { } + virtual void Corruption(size_t bytes, const Status& status) { + dropped_bytes_ += bytes; + message_.append(status.ToString()); + } + }; + + std::string& dest_contents() { + auto dest =
dynamic_cast<StringDest*>(writer_.file()); + assert(dest); + return dest->contents_; + } + + const std::string& dest_contents() const { + auto dest = dynamic_cast<const StringDest*>(writer_.file()); + assert(dest); + return dest->contents_; + } + + void reset_source_contents() { + auto src = dynamic_cast<StringSource*>(reader_.file()); + assert(src); + src->contents_ = dest_contents(); + } + + Slice reader_contents_; + unique_ptr<StringDest> dest_holder_; + unique_ptr<StringSource> source_holder_; + ReportCollector report_; + Writer writer_; + Reader reader_; + + // Record metadata for testing initial offset functionality + static size_t initial_offset_record_sizes_[]; + static uint64_t initial_offset_last_record_offsets_[]; + + public: + LogTest() : reader_contents_(), + dest_holder_(new StringDest(reader_contents_)), + source_holder_(new StringSource(reader_contents_)), + writer_(std::move(dest_holder_)), + reader_(std::move(source_holder_), &report_, true/*checksum*/, + 0/*initial_offset*/) { + } + + void Write(const std::string& msg) { + writer_.AddRecord(Slice(msg)); + } + + size_t WrittenBytes() const { + return dest_contents().size(); + } + + std::string Read() { + std::string scratch; + Slice record; + if (reader_.ReadRecord(&record, &scratch)) { + return record.ToString(); + } else { + return "EOF"; + } + } + + void IncrementByte(int offset, int delta) { + dest_contents()[offset] += delta; + } + + void SetByte(int offset, char new_byte) { + dest_contents()[offset] = new_byte; + } + + void ShrinkSize(int bytes) { + auto dest = dynamic_cast<StringDest*>(writer_.file()); + assert(dest); + dest->Drop(bytes); + } + + void FixChecksum(int header_offset, int len) { + // Compute crc of type/len/data + uint32_t crc = crc32c::Value(&dest_contents()[header_offset+6], 1 + len); + crc = crc32c::Mask(crc); + EncodeFixed32(&dest_contents()[header_offset], crc); + } + + void ForceError(size_t position = 0) { + auto src = dynamic_cast<StringSource*>(reader_.file()); + src->force_error_ = true; + src->force_error_position_ = position; + } + + size_t DroppedBytes() const { + return report_.dropped_bytes_; + } + + std::string ReportMessage() const { + return report_.message_; + } + + void ForceEOF(size_t position = 0) { + auto src = dynamic_cast<StringSource*>(reader_.file()); + src->force_eof_ = true; + src->force_eof_position_ = position; + } + + void UnmarkEOF() { + auto src = dynamic_cast<StringSource*>(reader_.file()); + src->returned_partial_ = false; + reader_.UnmarkEOF(); + } + + bool IsEOF() { + return reader_.IsEOF(); + } + + // Returns OK iff recorded error message contains "msg" + std::string MatchError(const std::string& msg) const { + if (report_.message_.find(msg) == std::string::npos) { + return report_.message_; + } else { + return "OK"; + } + } + + void WriteInitialOffsetLog() { + for (int i = 0; i < 4; i++) { + std::string record(initial_offset_record_sizes_[i], + static_cast<char>('a' + i)); + Write(record); + } + } + + void CheckOffsetPastEndReturnsNoRecords(uint64_t offset_past_end) { + WriteInitialOffsetLog(); + unique_ptr<StringSource> source(new StringSource(reader_contents_)); + unique_ptr<Reader> offset_reader( + new Reader(std::move(source), &report_, true/*checksum*/, + WrittenBytes() + offset_past_end)); + Slice record; + std::string scratch; + ASSERT_TRUE(!offset_reader->ReadRecord(&record, &scratch)); + } + + void CheckInitialOffsetRecord(uint64_t initial_offset, + int expected_record_offset) { + WriteInitialOffsetLog(); + unique_ptr<StringSource> source(new StringSource(reader_contents_)); + unique_ptr<Reader> offset_reader( + new Reader(std::move(source), &report_, true/*checksum*/, + initial_offset)); + Slice record; + std::string scratch; +
ASSERT_TRUE(offset_reader->ReadRecord(&record, &scratch)); + ASSERT_EQ(initial_offset_record_sizes_[expected_record_offset], + record.size()); + ASSERT_EQ(initial_offset_last_record_offsets_[expected_record_offset], + offset_reader->LastRecordOffset()); + ASSERT_EQ((char)('a' + expected_record_offset), record.data()[0]); + } + +}; + +size_t LogTest::initial_offset_record_sizes_[] = + {10000, // Two sizable records in first block + 10000, + 2 * log::kBlockSize - 1000, // Span three blocks + 1}; + +uint64_t LogTest::initial_offset_last_record_offsets_[] = + {0, + kHeaderSize + 10000, + 2 * (kHeaderSize + 10000), + 2 * (kHeaderSize + 10000) + + (2 * log::kBlockSize - 1000) + 3 * kHeaderSize}; + + +TEST(LogTest, Empty) { + ASSERT_EQ("EOF", Read()); +} + +TEST(LogTest, ReadWrite) { + Write("foo"); + Write("bar"); + Write(""); + Write("xxxx"); + ASSERT_EQ("foo", Read()); + ASSERT_EQ("bar", Read()); + ASSERT_EQ("", Read()); + ASSERT_EQ("xxxx", Read()); + ASSERT_EQ("EOF", Read()); + ASSERT_EQ("EOF", Read()); // Make sure reads at eof work +} + +TEST(LogTest, ManyBlocks) { + for (int i = 0; i < 100000; i++) { + Write(NumberString(i)); + } + for (int i = 0; i < 100000; i++) { + ASSERT_EQ(NumberString(i), Read()); + } + ASSERT_EQ("EOF", Read()); +} + +TEST(LogTest, Fragmentation) { + Write("small"); + Write(BigString("medium", 50000)); + Write(BigString("large", 100000)); + ASSERT_EQ("small", Read()); + ASSERT_EQ(BigString("medium", 50000), Read()); + ASSERT_EQ(BigString("large", 100000), Read()); + ASSERT_EQ("EOF", Read()); +} + +TEST(LogTest, MarginalTrailer) { + // Make a trailer that is exactly the same length as an empty record. + const int n = kBlockSize - 2*kHeaderSize; + Write(BigString("foo", n)); + ASSERT_EQ((unsigned int)(kBlockSize - kHeaderSize), WrittenBytes()); + Write(""); + Write("bar"); + ASSERT_EQ(BigString("foo", n), Read()); + ASSERT_EQ("", Read()); + ASSERT_EQ("bar", Read()); + ASSERT_EQ("EOF", Read()); +} + +TEST(LogTest, MarginalTrailer2) { + // Make a trailer that is exactly the same length as an empty record. 
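+ // The record occupies kHeaderSize + n = kBlockSize - kHeaderSize bytes, + // leaving exactly kHeaderSize bytes in the block: just enough room for + // one more header with a zero-length payload.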
+ const int n = kBlockSize - 2*kHeaderSize; + Write(BigString("foo", n)); + ASSERT_EQ((unsigned int)(kBlockSize - kHeaderSize), WrittenBytes()); + Write("bar"); + ASSERT_EQ(BigString("foo", n), Read()); + ASSERT_EQ("bar", Read()); + ASSERT_EQ("EOF", Read()); + ASSERT_EQ(0U, DroppedBytes()); + ASSERT_EQ("", ReportMessage()); +} + +TEST(LogTest, ShortTrailer) { + const int n = kBlockSize - 2*kHeaderSize + 4; + Write(BigString("foo", n)); + ASSERT_EQ((unsigned int)(kBlockSize - kHeaderSize + 4), WrittenBytes()); + Write(""); + Write("bar"); + ASSERT_EQ(BigString("foo", n), Read()); + ASSERT_EQ("", Read()); + ASSERT_EQ("bar", Read()); + ASSERT_EQ("EOF", Read()); +} + +TEST(LogTest, AlignedEof) { + const int n = kBlockSize - 2*kHeaderSize + 4; + Write(BigString("foo", n)); + ASSERT_EQ((unsigned int)(kBlockSize - kHeaderSize + 4), WrittenBytes()); + ASSERT_EQ(BigString("foo", n), Read()); + ASSERT_EQ("EOF", Read()); +} + +TEST(LogTest, RandomRead) { + const int N = 500; + Random write_rnd(301); + for (int i = 0; i < N; i++) { + Write(RandomSkewedString(i, &write_rnd)); + } + Random read_rnd(301); + for (int i = 0; i < N; i++) { + ASSERT_EQ(RandomSkewedString(i, &read_rnd), Read()); + } + ASSERT_EQ("EOF", Read()); +} + +// Tests of all the error paths in log_reader.cc follow: + +TEST(LogTest, ReadError) { + Write("foo"); + ForceError(); + ASSERT_EQ("EOF", Read()); + ASSERT_EQ((unsigned int)kBlockSize, DroppedBytes()); + ASSERT_EQ("OK", MatchError("read error")); +} + +TEST(LogTest, BadRecordType) { + Write("foo"); + // Type is stored in header[6] + IncrementByte(6, 100); + FixChecksum(0, 3); + ASSERT_EQ("EOF", Read()); + ASSERT_EQ(3U, DroppedBytes()); + ASSERT_EQ("OK", MatchError("unknown record type")); +} + +TEST(LogTest, TruncatedTrailingRecordIsIgnored) { + Write("foo"); + ShrinkSize(4); // Drop all payload as well as a header byte + ASSERT_EQ("EOF", Read()); + // Truncated last record is ignored, not treated as an error + ASSERT_EQ(0U, DroppedBytes()); + ASSERT_EQ("", ReportMessage()); +} + +TEST(LogTest, BadLength) { + const int kPayloadSize = kBlockSize - kHeaderSize; + Write(BigString("bar", kPayloadSize)); + Write("foo"); + // Least significant size byte is stored in header[4]. 
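+ // Bumping the length byte makes the stored length exceed the bytes left + // in the block, so the reader drops the whole block with a + // "bad record length" report and resynchronizes at the next block.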
+ IncrementByte(4, 1); + ASSERT_EQ("foo", Read()); + ASSERT_EQ(kBlockSize, DroppedBytes()); + ASSERT_EQ("OK", MatchError("bad record length")); +} + +TEST(LogTest, BadLengthAtEndIsIgnored) { + Write("foo"); + ShrinkSize(1); + ASSERT_EQ("EOF", Read()); + ASSERT_EQ(0U, DroppedBytes()); + ASSERT_EQ("", ReportMessage()); +} + +TEST(LogTest, ChecksumMismatch) { + Write("foo"); + IncrementByte(0, 10); + ASSERT_EQ("EOF", Read()); + ASSERT_EQ(10U, DroppedBytes()); + ASSERT_EQ("OK", MatchError("checksum mismatch")); +} + +TEST(LogTest, UnexpectedMiddleType) { + Write("foo"); + SetByte(6, kMiddleType); + FixChecksum(0, 3); + ASSERT_EQ("EOF", Read()); + ASSERT_EQ(3U, DroppedBytes()); + ASSERT_EQ("OK", MatchError("missing start")); +} + +TEST(LogTest, UnexpectedLastType) { + Write("foo"); + SetByte(6, kLastType); + FixChecksum(0, 3); + ASSERT_EQ("EOF", Read()); + ASSERT_EQ(3U, DroppedBytes()); + ASSERT_EQ("OK", MatchError("missing start")); +} + +TEST(LogTest, UnexpectedFullType) { + Write("foo"); + Write("bar"); + SetByte(6, kFirstType); + FixChecksum(0, 3); + ASSERT_EQ("bar", Read()); + ASSERT_EQ("EOF", Read()); + ASSERT_EQ(3U, DroppedBytes()); + ASSERT_EQ("OK", MatchError("partial record without end")); +} + +TEST(LogTest, UnexpectedFirstType) { + Write("foo"); + Write(BigString("bar", 100000)); + SetByte(6, kFirstType); + FixChecksum(0, 3); + ASSERT_EQ(BigString("bar", 100000), Read()); + ASSERT_EQ("EOF", Read()); + ASSERT_EQ(3U, DroppedBytes()); + ASSERT_EQ("OK", MatchError("partial record without end")); +} + +TEST(LogTest, MissingLastIsIgnored) { + Write(BigString("bar", kBlockSize)); + // Remove the LAST block, including header. + ShrinkSize(14); + ASSERT_EQ("EOF", Read()); + ASSERT_EQ("", ReportMessage()); + ASSERT_EQ(0U, DroppedBytes()); +} + +TEST(LogTest, PartialLastIsIgnored) { + Write(BigString("bar", kBlockSize)); + // Cause a bad record length in the LAST block. + ShrinkSize(1); + ASSERT_EQ("EOF", Read()); + ASSERT_EQ("", ReportMessage()); + ASSERT_EQ(0U, DroppedBytes()); +} + +TEST(LogTest, ErrorJoinsRecords) { + // Consider two fragmented records: + // first(R1) last(R1) first(R2) last(R2) + // where the middle two fragments disappear. We do not want + // first(R1),last(R2) to get joined and returned as a valid record. 
+ + // Write records that span two blocks + Write(BigString("foo", kBlockSize)); + Write(BigString("bar", kBlockSize)); + Write("correct"); + + // Wipe the middle block + for (unsigned int offset = kBlockSize; offset < 2*kBlockSize; offset++) { + SetByte(offset, 'x'); + } + + ASSERT_EQ("correct", Read()); + ASSERT_EQ("EOF", Read()); + const unsigned int dropped = DroppedBytes(); + ASSERT_LE(dropped, 2*kBlockSize + 100); + ASSERT_GE(dropped, 2*kBlockSize); +} + +TEST(LogTest, ReadStart) { + CheckInitialOffsetRecord(0, 0); +} + +TEST(LogTest, ReadSecondOneOff) { + CheckInitialOffsetRecord(1, 1); +} + +TEST(LogTest, ReadSecondTenThousand) { + CheckInitialOffsetRecord(10000, 1); +} + +TEST(LogTest, ReadSecondStart) { + CheckInitialOffsetRecord(10007, 1); +} + +TEST(LogTest, ReadThirdOneOff) { + CheckInitialOffsetRecord(10008, 2); +} + +TEST(LogTest, ReadThirdStart) { + CheckInitialOffsetRecord(20014, 2); +} + +TEST(LogTest, ReadFourthOneOff) { + CheckInitialOffsetRecord(20015, 3); +} + +TEST(LogTest, ReadFourthFirstBlockTrailer) { + CheckInitialOffsetRecord(log::kBlockSize - 4, 3); +} + +TEST(LogTest, ReadFourthMiddleBlock) { + CheckInitialOffsetRecord(log::kBlockSize + 1, 3); +} + +TEST(LogTest, ReadFourthLastBlock) { + CheckInitialOffsetRecord(2 * log::kBlockSize + 1, 3); +} + +TEST(LogTest, ReadFourthStart) { + CheckInitialOffsetRecord( + 2 * (kHeaderSize + 1000) + (2 * log::kBlockSize - 1000) + 3 * kHeaderSize, + 3); +} + +TEST(LogTest, ReadEnd) { + CheckOffsetPastEndReturnsNoRecords(0); +} + +TEST(LogTest, ReadPastEnd) { + CheckOffsetPastEndReturnsNoRecords(5); +} + +TEST(LogTest, ClearEofSingleBlock) { + Write("foo"); + Write("bar"); + ForceEOF(3 + kHeaderSize + 2); + ASSERT_EQ("foo", Read()); + UnmarkEOF(); + ASSERT_EQ("bar", Read()); + ASSERT_TRUE(IsEOF()); + ASSERT_EQ("EOF", Read()); + Write("xxx"); + UnmarkEOF(); + ASSERT_EQ("xxx", Read()); + ASSERT_TRUE(IsEOF()); +} + +TEST(LogTest, ClearEofMultiBlock) { + size_t num_full_blocks = 5; + size_t n = (kBlockSize - kHeaderSize) * num_full_blocks + 25; + Write(BigString("foo", n)); + Write(BigString("bar", n)); + ForceEOF(n + num_full_blocks * kHeaderSize + 10); + ASSERT_EQ(BigString("foo", n), Read()); + ASSERT_TRUE(IsEOF()); + UnmarkEOF(); + ASSERT_EQ(BigString("bar", n), Read()); + ASSERT_TRUE(IsEOF()); + Write(BigString("xxx", n)); + UnmarkEOF(); + ASSERT_EQ(BigString("xxx", n), Read()); + ASSERT_TRUE(IsEOF()); +} + +TEST(LogTest, ClearEofError) { + // If an error occurs during Read() in UnmarkEOF(), the records contained + // in the buffer should be returned on subsequent calls of ReadRecord() + // until no more full records are left, whereafter ReadRecord() should return + // false to indicate that it cannot read any further. + + Write("foo"); + Write("bar"); + UnmarkEOF(); + ASSERT_EQ("foo", Read()); + ASSERT_TRUE(IsEOF()); + Write("xxx"); + ForceError(0); + UnmarkEOF(); + ASSERT_EQ("bar", Read()); + ASSERT_EQ("EOF", Read()); +} + +TEST(LogTest, ClearEofError2) { + Write("foo"); + Write("bar"); + UnmarkEOF(); + ASSERT_EQ("foo", Read()); + Write("xxx"); + ForceError(3); + UnmarkEOF(); + ASSERT_EQ("bar", Read()); + ASSERT_EQ("EOF", Read()); + ASSERT_EQ(3U, DroppedBytes()); + ASSERT_EQ("OK", MatchError("read error")); +} + +} // namespace log +} // namespace rocksdb + +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +} diff --git a/db/log_writer.cc b/db/log_writer.cc new file mode 100644 index 0000000000..df601a4706 --- /dev/null +++ b/db/log_writer.cc @@ -0,0 +1,108 @@ +// Copyright (c) 2013, Facebook, Inc. 
All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/log_writer.h" + +#include <stdint.h> +#include "rocksdb/env.h" +#include "util/coding.h" +#include "util/crc32c.h" + +namespace rocksdb { +namespace log { + +Writer::Writer(unique_ptr<WritableFile>&& dest) + : dest_(std::move(dest)), + block_offset_(0) { + for (int i = 0; i <= kMaxRecordType; i++) { + char t = static_cast<char>(i); + type_crc_[i] = crc32c::Value(&t, 1); + } +} + +Writer::~Writer() { +} + +Status Writer::AddRecord(const Slice& slice) { + const char* ptr = slice.data(); + size_t left = slice.size(); + + // Fragment the record if necessary and emit it. Note that if slice + // is empty, we still want to iterate once to emit a single + // zero-length record + Status s; + bool begin = true; + do { + const int leftover = kBlockSize - block_offset_; + assert(leftover >= 0); + if (leftover < kHeaderSize) { + // Switch to a new block + if (leftover > 0) { + // Fill the trailer (literal below relies on kHeaderSize being 7) + assert(kHeaderSize == 7); + dest_->Append(Slice("\x00\x00\x00\x00\x00\x00", leftover)); + } + block_offset_ = 0; + } + + // Invariant: we never leave < kHeaderSize bytes in a block. + assert(kBlockSize - block_offset_ - kHeaderSize >= 0); + + const size_t avail = kBlockSize - block_offset_ - kHeaderSize; + const size_t fragment_length = (left < avail) ? left : avail; + + RecordType type; + const bool end = (left == fragment_length); + if (begin && end) { + type = kFullType; + } else if (begin) { + type = kFirstType; + } else if (end) { + type = kLastType; + } else { + type = kMiddleType; + } + + s = EmitPhysicalRecord(type, ptr, fragment_length); + ptr += fragment_length; + left -= fragment_length; + begin = false; + } while (s.ok() && left > 0); + return s; +} + +Status Writer::EmitPhysicalRecord(RecordType t, const char* ptr, size_t n) { + assert(n <= 0xffff); // Must fit in two bytes + assert(block_offset_ + kHeaderSize + n <= kBlockSize); + + // Format the header + char buf[kHeaderSize]; + buf[4] = static_cast<char>(n & 0xff); + buf[5] = static_cast<char>(n >> 8); + buf[6] = static_cast<char>(t); + + // Compute the crc of the record type and the payload. + uint32_t crc = crc32c::Extend(type_crc_[t], ptr, n); + crc = crc32c::Mask(crc); // Adjust for storage + EncodeFixed32(buf, crc); + + // Write the header and the payload + Status s = dest_->Append(Slice(buf, kHeaderSize)); + if (s.ok()) { + s = dest_->Append(Slice(ptr, n)); + if (s.ok()) { + s = dest_->Flush(); + } + } + block_offset_ += kHeaderSize + n; + return s; +} + +} // namespace log +} // namespace rocksdb diff --git a/db/log_writer.h b/db/log_writer.h new file mode 100644 index 0000000000..d7b7afff09 --- /dev/null +++ b/db/log_writer.h @@ -0,0 +1,55 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include <memory> +#include <stdint.h> +#include "db/log_format.h" +#include "rocksdb/slice.h" +#include "rocksdb/status.h" + +namespace rocksdb { + +class WritableFile; + +using std::unique_ptr; + +namespace log { + +class Writer { + public: + // Create a writer that will append data to "*dest". + // "*dest" must be initially empty. + // "*dest" must remain live while this Writer is in use. + explicit Writer(unique_ptr<WritableFile>&& dest); + ~Writer(); + + Status AddRecord(const Slice& slice); + + WritableFile* file() { return dest_.get(); } + const WritableFile* file() const { return dest_.get(); } + + private: + unique_ptr<WritableFile> dest_; + int block_offset_; // Current offset in block + + // crc32c values for all supported record types. These are + // pre-computed to reduce the overhead of computing the crc of the + // record type stored in the header. + uint32_t type_crc_[kMaxRecordType + 1]; + + Status EmitPhysicalRecord(RecordType type, const char* ptr, size_t length); + + // No copying allowed + Writer(const Writer&); + void operator=(const Writer&); +}; + +} // namespace log +} // namespace rocksdb diff --git a/db/memtable.cc b/db/memtable.cc new file mode 100644 index 0000000000..c6b915b997 --- /dev/null +++ b/db/memtable.cc @@ -0,0 +1,620 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/memtable.h" + +#include <memory> +#include <algorithm> +#include <limits> + +#include "db/dbformat.h" +#include "db/merge_context.h" +#include "rocksdb/comparator.h" +#include "rocksdb/env.h" +#include "rocksdb/iterator.h" +#include "rocksdb/merge_operator.h" +#include "rocksdb/slice_transform.h" +#include "table/merger.h" +#include "util/arena.h" +#include "util/coding.h" +#include "util/murmurhash.h" +#include "util/mutexlock.h" +#include "util/perf_context_imp.h" +#include "util/statistics.h" +#include "util/stop_watch.h" + +namespace rocksdb { + +MemTable::MemTable(const InternalKeyComparator& cmp, const Options& options) + : comparator_(cmp), + refs_(0), + kArenaBlockSize(OptimizeBlockSize(options.arena_block_size)), + kWriteBufferSize(options.write_buffer_size), + arena_(options.arena_block_size), + table_(options.memtable_factory->CreateMemTableRep( + comparator_, &arena_, options.prefix_extractor.get(), + options.info_log.get())), + num_entries_(0), + flush_in_progress_(false), + flush_completed_(false), + file_number_(0), + first_seqno_(0), + mem_next_logfile_number_(0), + locks_(options.inplace_update_support ? options.inplace_update_num_locks + : 0), + prefix_extractor_(options.prefix_extractor.get()), + should_flush_(ShouldFlushNow()) { + // if should_flush_ == true without an entry inserted, something must have + // gone wrong already.
+ assert(!should_flush_); + if (prefix_extractor_ && options.memtable_prefix_bloom_bits > 0) { + prefix_bloom_.reset(new DynamicBloom( + options.memtable_prefix_bloom_bits, options.bloom_locality, + options.memtable_prefix_bloom_probes, nullptr, + options.memtable_prefix_bloom_huge_page_tlb_size, + options.info_log.get())); + } +} + +MemTable::~MemTable() { + assert(refs_ == 0); +} + +size_t MemTable::ApproximateMemoryUsage() { + size_t arena_usage = arena_.ApproximateMemoryUsage(); + size_t table_usage = table_->ApproximateMemoryUsage(); + // let MAX_USAGE = std::numeric_limits<size_t>::max() + // then if arena_usage + total_usage >= MAX_USAGE, return MAX_USAGE. + // the following variation is to avoid numeric overflow. + if (arena_usage >= std::numeric_limits<size_t>::max() - table_usage) { + return std::numeric_limits<size_t>::max(); + } + // otherwise, return the actual usage + return arena_usage + table_usage; +} + +bool MemTable::ShouldFlushNow() const { + // Much of the time, we cannot allocate arena blocks that exactly match the + // buffer size. Thus we have to decide if we should over-allocate or + // under-allocate. + // This constant can be interpreted as: if we still have more than + // "kAllowOverAllocationRatio * kArenaBlockSize" space left, we'd try to + // over-allocate one more block. + const double kAllowOverAllocationRatio = 0.6; + + // If the arena still has room for new block allocation, we can safely say it + // shouldn't flush. + auto allocated_memory = + table_->ApproximateMemoryUsage() + arena_.MemoryAllocatedBytes(); + + // if we can still allocate one more block without exceeding the + // over-allocation ratio, then we should not flush. + if (allocated_memory + kArenaBlockSize < + kWriteBufferSize + kArenaBlockSize * kAllowOverAllocationRatio) { + return false; + } + + // if the allocated memory already exceeds kWriteBufferSize plus the allowed + // slack, we need to flush even though there may still be unused memory left + // in the last block. + if (allocated_memory > + kWriteBufferSize + kArenaBlockSize * kAllowOverAllocationRatio) { + return true; + } + + // In this code path, Arena has already allocated its "last block", which + // means the total allocated memory size is either: + // (1) "moderately" over-allocated (no more than `0.6 * arena block size`). + // Or, + // (2) the allocated memory is less than write buffer size, but we'll stop + // here since if we allocate a new arena block, we'll over-allocate too much + // more (half of the arena block size) memory. + // + // In either case, to avoid over-allocating, the last block will stop + // allocation when its usage reaches a certain ratio, which we carefully + // choose "0.75 full" as the stop condition because it addresses the + // following issue with great simplicity: What if the next inserted entry's + // size is bigger than AllocatedAndUnused()? + // + // The answer is: if the entry size is also bigger than 0.25 * + // kArenaBlockSize, a dedicated block will be allocated for it; otherwise + // arena will anyway skip the AllocatedAndUnused() and allocate a new, empty + // and regular block. In either case, we would have *overly* over-allocated. + // + // Therefore, setting the last block to be at most "0.75 full" avoids both + // cases. + // + // NOTE: the average percentage of wasted space of this approach can be + // estimated as: "arena block size * 0.25 / write buffer size". Users who + // specify a small write buffer size and/or a big arena block size may + // suffer.
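+ // For example, with a 4 MB arena block the memtable is flushed once less + // than 1 MB of the last block remains unused, i.e. once the block is more + // than three quarters full.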
+
+int MemTable::KeyComparator::operator()(const char* prefix_len_key1,
+                                        const char* prefix_len_key2) const {
+  // Internal keys are encoded as length-prefixed strings.
+  Slice k1 = GetLengthPrefixedSlice(prefix_len_key1);
+  Slice k2 = GetLengthPrefixedSlice(prefix_len_key2);
+  return comparator.Compare(k1, k2);
+}
+
+int MemTable::KeyComparator::operator()(const char* prefix_len_key,
+                                        const Slice& key) const {
+  // Internal keys are encoded as length-prefixed strings.
+  Slice a = GetLengthPrefixedSlice(prefix_len_key);
+  return comparator.Compare(a, key);
+}
+
+Slice MemTableRep::UserKey(const char* key) const {
+  Slice slice = GetLengthPrefixedSlice(key);
+  return Slice(slice.data(), slice.size() - 8);
+}
+
+KeyHandle MemTableRep::Allocate(const size_t len, char** buf) {
+  *buf = arena_->Allocate(len);
+  return static_cast<KeyHandle>(*buf);
+}
+
+// Encode a suitable internal key target for "target" and return it.
+// Uses *scratch as scratch space, and the returned pointer will point
+// into this scratch space.
+const char* EncodeKey(std::string* scratch, const Slice& target) {
+  scratch->clear();
+  PutVarint32(scratch, target.size());
+  scratch->append(target.data(), target.size());
+  return scratch->data();
+}
+
+class MemTableIterator: public Iterator {
+ public:
+  MemTableIterator(const MemTable& mem, const ReadOptions& options,
+                   bool enforce_total_order, Arena* arena)
+      : bloom_(nullptr),
+        prefix_extractor_(mem.prefix_extractor_),
+        valid_(false),
+        arena_mode_(arena != nullptr) {
+    if (prefix_extractor_ != nullptr && !enforce_total_order) {
+      bloom_ = mem.prefix_bloom_.get();
+      iter_ = mem.table_->GetDynamicPrefixIterator(arena);
+    } else {
+      iter_ = mem.table_->GetIterator(arena);
+    }
+  }
+
+  ~MemTableIterator() {
+    if (arena_mode_) {
+      iter_->~Iterator();
+    } else {
+      delete iter_;
+    }
+  }
+
+  virtual bool Valid() const { return valid_; }
+  virtual void Seek(const Slice& k) {
+    if (bloom_ != nullptr &&
+        !bloom_->MayContain(prefix_extractor_->Transform(ExtractUserKey(k)))) {
+      valid_ = false;
+      return;
+    }
+    iter_->Seek(k, nullptr);
+    valid_ = iter_->Valid();
+  }
+  virtual void SeekToFirst() {
+    iter_->SeekToFirst();
+    valid_ = iter_->Valid();
+  }
+  virtual void SeekToLast() {
+    iter_->SeekToLast();
+    valid_ = iter_->Valid();
+  }
+  virtual void Next() {
+    assert(Valid());
+    iter_->Next();
+    valid_ = iter_->Valid();
+  }
+  virtual void Prev() {
+    assert(Valid());
+    iter_->Prev();
+    valid_ = iter_->Valid();
+  }
+  virtual Slice key() const {
+    assert(Valid());
+    return GetLengthPrefixedSlice(iter_->key());
+  }
+  virtual Slice value() const {
+    assert(Valid());
+    Slice key_slice = GetLengthPrefixedSlice(iter_->key());
+    return GetLengthPrefixedSlice(key_slice.data() + key_slice.size());
+  }
+
+  virtual Status status() const { return Status::OK(); }
+
+ private:
+  DynamicBloom* bloom_;
+  const SliceTransform* const prefix_extractor_;
+  MemTableRep::Iterator* iter_;
+  bool valid_;
+  bool arena_mode_;
+
+  // No copying allowed
+  MemTableIterator(const MemTableIterator&);
+  void operator=(const MemTableIterator&);
+};
+
+Iterator* MemTable::NewIterator(const ReadOptions& options,
+                                bool enforce_total_order, Arena* arena) {
+  if (arena == nullptr) {
+    return new MemTableIterator(*this, options, enforce_total_order, nullptr);
+  } else {
+    auto mem = arena->AllocateAligned(sizeof(MemTableIterator));
+    return new (mem)
+        MemTableIterator(*this, options, enforce_total_order, arena);
+  }
+}
+
+port::RWMutex* MemTable::GetLock(const Slice& key) {
+  static murmur_hash hash;
+  return &locks_[hash(key) % locks_.size()];
+}
+
+void MemTable::Add(SequenceNumber s, ValueType type,
+                   const Slice& key, /* user key */
+                   const Slice& value) {
+  // Format of an entry is concatenation of:
+  //  key_size     : varint32 of internal_key.size()
+  //  key bytes    : char[internal_key.size()]
+  //  value_size   : varint32 of value.size()
+  //  value bytes  : char[value.size()]
+  size_t key_size = key.size();
+  size_t val_size = value.size();
+  size_t internal_key_size = key_size + 8;
+  const size_t encoded_len =
+      VarintLength(internal_key_size) + internal_key_size +
+      VarintLength(val_size) + val_size;
+  char* buf = nullptr;
+  KeyHandle handle = table_->Allocate(encoded_len, &buf);
+  assert(buf != nullptr);
+  char* p = EncodeVarint32(buf, internal_key_size);
+  memcpy(p, key.data(), key_size);
+  p += key_size;
+  EncodeFixed64(p, (s << 8) | type);
+  p += 8;
+  p = EncodeVarint32(p, val_size);
+  memcpy(p, value.data(), val_size);
+  assert((unsigned)(p + val_size - buf) == (unsigned)encoded_len);
+  table_->Insert(handle);
+  num_entries_++;
+
+  if (prefix_bloom_) {
+    assert(prefix_extractor_);
+    prefix_bloom_->Add(prefix_extractor_->Transform(key));
+  }
+
+  // The first sequence number inserted into the memtable
+  assert(first_seqno_ == 0 || s > first_seqno_);
+  if (first_seqno_ == 0) {
+    first_seqno_ = s;
+  }
+
+  should_flush_ = ShouldFlushNow();
+}
+
+// Callback from MemTable::Get()
+namespace {
+
+struct Saver {
+  Status* status;
+  const LookupKey* key;
+  bool* found_final_value;  // Is value set correctly? Used by KeyMayExist
+  bool* merge_in_progress;
+  std::string* value;
+  const MergeOperator* merge_operator;
+  // the merge operations encountered
+  MergeContext* merge_context;
+  MemTable* mem;
+  Logger* logger;
+  Statistics* statistics;
+  bool inplace_update_support;
+};
+}  // namespace
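The byte layout that Add() writes (and that SaveValue() below decodes) can be illustrated with a self-contained sketch; PutVarint32 is re-implemented locally so the snippet stands alone, and the little-endian memcpy stands in for EncodeFixed64 on common platforms:

    #include <cstdint>
    #include <cstring>
    #include <string>

    namespace {
    // Local little-endian base-128 varint, mirroring util/coding.h.
    void PutVarint32(std::string* dst, uint32_t v) {
      while (v >= 0x80) {
        dst->push_back(static_cast<char>(v | 0x80));
        v >>= 7;
      }
      dst->push_back(static_cast<char>(v));
    }
    }  // namespace

    // Builds the byte string MemTable::Add() stores in the arena:
    //   varint32(klength) | user key | fixed64(seq << 8 | type) |
    //   varint32(vlength) | value
    std::string EncodeMemTableEntry(uint64_t seq, uint8_t type,
                                    const std::string& user_key,
                                    const std::string& value) {
      std::string buf;
      PutVarint32(&buf, static_cast<uint32_t>(user_key.size() + 8));
      buf.append(user_key);
      const uint64_t tag = (seq << 8) | type;
      char fixed[8];
      std::memcpy(fixed, &tag, sizeof(tag));  // little-endian assumed
      buf.append(fixed, sizeof(fixed));
      PutVarint32(&buf, static_cast<uint32_t>(value.size()));
      buf.append(value);
      return buf;
    }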
+
+static bool SaveValue(void* arg, const char* entry) {
+  Saver* s = reinterpret_cast<Saver*>(arg);
+  MergeContext* merge_context = s->merge_context;
+  const MergeOperator* merge_operator = s->merge_operator;
+
+  assert(s != nullptr && merge_context != nullptr);
+
+  // entry format is:
+  //    klength  varint32
+  //    userkey  char[klength-8]
+  //    tag      uint64
+  //    vlength  varint32
+  //    value    char[vlength]
+  // Check that it belongs to same user key.  We do not check the
+  // sequence number since the Seek() call above should have skipped
+  // all entries with overly large sequence numbers.
+  uint32_t key_length;
+  const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
+  if (s->mem->GetInternalKeyComparator().user_comparator()->Compare(
+          Slice(key_ptr, key_length - 8), s->key->user_key()) == 0) {
+    // Correct user key
+    const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8);
+    switch (static_cast<ValueType>(tag & 0xff)) {
+      case kTypeValue: {
+        if (s->inplace_update_support) {
+          s->mem->GetLock(s->key->user_key())->ReadLock();
+        }
+        Slice v = GetLengthPrefixedSlice(key_ptr + key_length);
+        *(s->status) = Status::OK();
+        if (*(s->merge_in_progress)) {
+          assert(merge_operator);
+          if (!merge_operator->FullMerge(s->key->user_key(), &v,
+                                         merge_context->GetOperands(),
+                                         s->value, s->logger)) {
+            RecordTick(s->statistics, NUMBER_MERGE_FAILURES);
+            *(s->status) =
+                Status::Corruption("Error: Could not perform merge.");
+          }
+        } else {
+          s->value->assign(v.data(), v.size());
+        }
+        if (s->inplace_update_support) {
+          s->mem->GetLock(s->key->user_key())->Unlock();
+        }
+        *(s->found_final_value) = true;
+        return false;
+      }
+      case kTypeDeletion: {
+        if (*(s->merge_in_progress)) {
+          assert(merge_operator);
+          *(s->status) = Status::OK();
+          if (!merge_operator->FullMerge(s->key->user_key(), nullptr,
+                                         merge_context->GetOperands(),
+                                         s->value, s->logger)) {
+            RecordTick(s->statistics, NUMBER_MERGE_FAILURES);
+            *(s->status) =
+                Status::Corruption("Error: Could not perform merge.");
+          }
+        } else {
+          *(s->status) = Status::NotFound();
+        }
+        *(s->found_final_value) = true;
+        return false;
+      }
+      case kTypeMerge: {
+        std::string merge_result;  // temporary area for merge results later
+        Slice v = GetLengthPrefixedSlice(key_ptr + key_length);
+        *(s->merge_in_progress) = true;
+        merge_context->PushOperand(v);
+        return true;
+      }
+      default:
+        assert(false);
+        return true;
+    }
+  }
+
+  // s->state could be Corrupt, merge or notfound
+  return false;
+}
+
+bool MemTable::Get(const LookupKey& key, std::string* value, Status* s,
+                   MergeContext& merge_context, const Options& options) {
+  PERF_TIMER_AUTO(get_from_memtable_time);
+
+  Slice user_key = key.user_key();
+  bool found_final_value = false;
+  bool merge_in_progress = s->IsMergeInProgress();
+
+  if (prefix_bloom_ &&
+      !prefix_bloom_->MayContain(prefix_extractor_->Transform(user_key))) {
+    // iter is null if prefix bloom says the key does not exist
+  } else {
+    Saver saver;
+    saver.status = s;
+    saver.found_final_value = &found_final_value;
+    saver.merge_in_progress = &merge_in_progress;
+    saver.key = &key;
+    saver.value = value;
+    saver.mem = this;
+    saver.merge_context = &merge_context;
+    saver.merge_operator = options.merge_operator.get();
+    saver.logger = options.info_log.get();
+    saver.inplace_update_support = options.inplace_update_support;
+    saver.statistics = options.statistics.get();
+    table_->Get(key, &saver, SaveValue);
+  }
+
+  // No change to value, since we have not yet found a Put/Delete
+  if (!found_final_value && merge_in_progress) {
+    *s = Status::MergeInProgress("");
+  }
+  PERF_TIMER_STOP(get_from_memtable_time);
+  PERF_COUNTER_ADD(get_from_memtable_count, 1);
+  return found_final_value;
+}
+
+void MemTable::Update(SequenceNumber seq,
+                      const Slice& key,
+                      const Slice& value) {
+  LookupKey lkey(key, seq);
+  Slice mem_key = lkey.memtable_key();
+
+  std::unique_ptr<MemTableRep::Iterator> iter(
+      table_->GetIterator(lkey.user_key()));
+  iter->Seek(lkey.internal_key(), mem_key.data());
+
+  if (iter->Valid()) {
+    // entry format is:
+    //    key_length  varint32
+    //    userkey     char[klength-8]
+    //    tag         uint64
+    //    vlength     varint32
+    //    value       char[vlength]
+    // Check that it belongs to same user key.  We do not check the
+    // sequence number since the Seek() call above should have skipped
+    // all entries with overly large sequence numbers.
+    const char* entry = iter->key();
+    uint32_t key_length = 0;
+    const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
+    if (comparator_.comparator.user_comparator()->Compare(
+            Slice(key_ptr, key_length - 8), lkey.user_key()) == 0) {
+      // Correct user key
+      const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8);
+      switch (static_cast<ValueType>(tag & 0xff)) {
+        case kTypeValue: {
+          Slice prev_value = GetLengthPrefixedSlice(key_ptr + key_length);
+          uint32_t prev_size = prev_value.size();
+          uint32_t new_size = value.size();
+
+          // Update value, if new value size <= previous value size
+          if (new_size <= prev_size) {
+            char* p = EncodeVarint32(const_cast<char*>(key_ptr) + key_length,
+                                     new_size);
+            WriteLock wl(GetLock(lkey.user_key()));
+            memcpy(p, value.data(), value.size());
+            assert((unsigned)((p + value.size()) - entry) ==
+                   (unsigned)(VarintLength(key_length) + key_length +
+                              VarintLength(value.size()) + value.size()));
+            return;
+          }
+        }
+        default:
+          // If the latest value is kTypeDeletion, kTypeMerge or kTypeLogData
+          // we don't have enough space for update inplace
+          Add(seq, kTypeValue, key, value);
+          return;
+      }
+    }
+  }
+
+  // key doesn't exist
+  Add(seq, kTypeValue, key, value);
+}
+
+bool MemTable::UpdateCallback(SequenceNumber seq,
+                              const Slice& key,
+                              const Slice& delta,
+                              const Options& options) {
+  LookupKey lkey(key, seq);
+  Slice memkey = lkey.memtable_key();
+
+  std::unique_ptr<MemTableRep::Iterator> iter(
+      table_->GetIterator(lkey.user_key()));
+  iter->Seek(lkey.internal_key(), memkey.data());
+
+  if (iter->Valid()) {
+    // entry format is:
+    //    key_length  varint32
+    //    userkey     char[klength-8]
+    //    tag         uint64
+    //    vlength     varint32
+    //    value       char[vlength]
+    // Check that it belongs to same user key.  We do not check the
+    // sequence number since the Seek() call above should have skipped
+    // all entries with overly large sequence numbers.
+    const char* entry = iter->key();
+    uint32_t key_length = 0;
+    const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
+    if (comparator_.comparator.user_comparator()->Compare(
+            Slice(key_ptr, key_length - 8), lkey.user_key()) == 0) {
+      // Correct user key
+      const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8);
+      switch (static_cast<ValueType>(tag & 0xff)) {
+        case kTypeValue: {
+          Slice prev_value = GetLengthPrefixedSlice(key_ptr + key_length);
+          uint32_t prev_size = prev_value.size();
+
+          char* prev_buffer = const_cast<char*>(prev_value.data());
+          uint32_t new_prev_size = prev_size;
+
+          std::string str_value;
+          WriteLock wl(GetLock(lkey.user_key()));
+          auto status = options.inplace_callback(prev_buffer, &new_prev_size,
+                                                 delta, &str_value);
+          if (status == UpdateStatus::UPDATED_INPLACE) {
+            // Value already updated by callback.
+            assert(new_prev_size <= prev_size);
+            if (new_prev_size < prev_size) {
+              // overwrite the new prev_size
+              char* p = EncodeVarint32(const_cast<char*>(key_ptr) + key_length,
+                                       new_prev_size);
+              if (VarintLength(new_prev_size) < VarintLength(prev_size)) {
+                // shift the value buffer as well.
+                memcpy(p, prev_buffer, new_prev_size);
+              }
+            }
+            RecordTick(options.statistics.get(), NUMBER_KEYS_UPDATED);
+            should_flush_ = ShouldFlushNow();
+            return true;
+          } else if (status == UpdateStatus::UPDATED) {
+            Add(seq, kTypeValue, key, Slice(str_value));
+            RecordTick(options.statistics.get(), NUMBER_KEYS_WRITTEN);
+            should_flush_ = ShouldFlushNow();
+            return true;
+          } else if (status == UpdateStatus::UPDATE_FAILED) {
+            // No action required. Return.
+            should_flush_ = ShouldFlushNow();
+            return true;
+          }
+        }
+        default:
+          break;
+      }
+    }
+  }
+  // If the latest value is not kTypeValue
+  // or key doesn't exist
+  return false;
+}
+
+size_t MemTable::CountSuccessiveMergeEntries(const LookupKey& key) {
+  Slice memkey = key.memtable_key();
+
+  // A total ordered iterator is costly for some memtablerep (prefix aware
+  // reps). By passing in the user key, we allow efficient iterator creation.
+  // The iterator only needs to be ordered within the same user key.
+  std::unique_ptr<MemTableRep::Iterator> iter(
+      table_->GetIterator(key.user_key()));
+  iter->Seek(key.internal_key(), memkey.data());
+
+  size_t num_successive_merges = 0;
+
+  for (; iter->Valid(); iter->Next()) {
+    const char* entry = iter->key();
+    uint32_t key_length = 0;
+    const char* iter_key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
+    if (comparator_.comparator.user_comparator()->Compare(
+            Slice(iter_key_ptr, key_length - 8), key.user_key()) != 0) {
+      break;
+    }
+
+    const uint64_t tag = DecodeFixed64(iter_key_ptr + key_length - 8);
+    if (static_cast<ValueType>(tag & 0xff) != kTypeMerge) {
+      break;
+    }
+
+    ++num_successive_merges;
+  }
+
+  return num_successive_merges;
+}
+
+void MemTableRep::Get(const LookupKey& k, void* callback_args,
+                      bool (*callback_func)(void* arg, const char* entry)) {
+  auto iter = GetIterator(k.user_key());
+  for (iter->Seek(k.internal_key(), k.memtable_key().data());
+       iter->Valid() && callback_func(callback_args, iter->key());
+       iter->Next()) {
+  }
+}
+
+}  // namespace rocksdb
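Update()/UpdateCallback() above drive Options::inplace_callback. As a hedged illustration (the replace-if-it-fits policy here is invented, not RocksDB's), a compatible callback could look like:

    #include <cstring>
    #include <string>
    #include "rocksdb/options.h"  // UpdateStatus
    #include "rocksdb/slice.h"

    // In-place updates may only shrink the stored value (see the assert in
    // UpdateCallback above); anything larger is returned via merged_value.
    rocksdb::UpdateStatus ReplaceOrGrow(char* existing_value,
                                        uint32_t* existing_value_size,
                                        rocksdb::Slice delta,
                                        std::string* merged_value) {
      if (delta.size() <= *existing_value_size) {
        std::memcpy(existing_value, delta.data(), delta.size());
        *existing_value_size = static_cast<uint32_t>(delta.size());
        return rocksdb::UpdateStatus::UPDATED_INPLACE;
      }
      merged_value->assign(delta.data(), delta.size());
      return rocksdb::UpdateStatus::UPDATED;  // caller re-adds the new value
    }

Under the same assumptions, wiring it up would be a matter of setting options.inplace_update_support = true and options.inplace_callback = ReplaceOrGrow.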
diff --git a/db/memtable.h b/db/memtable.h
new file mode 100644
index 0000000000..8bad2773a3
--- /dev/null
+++ b/db/memtable.h
@@ -0,0 +1,222 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <string>
+#include <memory>
+#include <vector>
+#include "db/dbformat.h"
+#include "db/skiplist.h"
+#include "db/version_edit.h"
+#include "rocksdb/db.h"
+#include "rocksdb/memtablerep.h"
+#include "util/arena.h"
+#include "util/dynamic_bloom.h"
+
+namespace rocksdb {
+
+class Arena;
+class Mutex;
+class MemTableIterator;
+class MergeContext;
+
+class MemTable {
+ public:
+  struct KeyComparator : public MemTableRep::KeyComparator {
+    const InternalKeyComparator comparator;
+    explicit KeyComparator(const InternalKeyComparator& c) : comparator(c) { }
+    virtual int operator()(const char* prefix_len_key1,
+                           const char* prefix_len_key2) const;
+    virtual int operator()(const char* prefix_len_key,
+                           const Slice& key) const override;
+  };
+
+  // MemTables are reference counted.  The initial reference count
+  // is zero and the caller must call Ref() at least once.
+  explicit MemTable(const InternalKeyComparator& comparator,
+                    const Options& options);
+
+  ~MemTable();
+
+  // Increase reference count.
+  void Ref() { ++refs_; }
+
+  // Drop reference count.
+  // If the refcount goes to zero return this memtable, otherwise return null
+  MemTable* Unref() {
+    --refs_;
+    assert(refs_ >= 0);
+    if (refs_ <= 0) {
+      return this;
+    }
+    return nullptr;
+  }
+
+  // Returns an estimate of the number of bytes of data in use by this
+  // data structure.
+  //
+  // REQUIRES: external synchronization to prevent simultaneous
+  // operations on the same MemTable.
+  size_t ApproximateMemoryUsage();
+
+  // This method heuristically determines if the memtable should continue to
+  // host more data.
+  bool ShouldFlush() const { return should_flush_; }
+
+  // Return an iterator that yields the contents of the memtable.
+  //
+  // The caller must ensure that the underlying MemTable remains live
+  // while the returned iterator is live.  The keys returned by this
+  // iterator are internal keys encoded by AppendInternalKey in the
+  // db/dbformat.{h,cc} module.
+  //
+  // By default, it returns an iterator for prefix seek if prefix_extractor
+  // is configured in Options.
+  // arena: If not null, the arena needs to be used to allocate the Iterator.
+  //        Calling ~Iterator of the iterator will destroy all the states but
+  //        those allocated in arena.
+  Iterator* NewIterator(const ReadOptions& options,
+                        bool enforce_total_order = false,
+                        Arena* arena = nullptr);
+
+  // Add an entry into memtable that maps key to value at the
+  // specified sequence number and with the specified type.
+  // Typically value will be empty if type==kTypeDeletion.
+  void Add(SequenceNumber seq, ValueType type,
+           const Slice& key,
+           const Slice& value);
+
+  // If memtable contains a value for key, store it in *value and return true.
+  // If memtable contains a deletion for key, store a NotFound() error
+  // in *status and return true.
+  // If memtable contains a Merge operation as the most recent entry for a
+  // key, and the merge process does not stop (not reaching a value or
+  // delete), prepend the current merge operand to *operands,
+  // store MergeInProgress in s, and return false.
+  // Else, return false.
+  bool Get(const LookupKey& key, std::string* value, Status* s,
+           MergeContext& merge_context, const Options& options);
+
+  // Attempts to update the new_value inplace, else does normal Add
+  // Pseudocode
+  //   if key exists in current memtable && prev_value is of type kTypeValue
+  //     if new sizeof(new_value) <= sizeof(prev_value)
+  //       update inplace
+  //     else add(key, new_value)
+  //   else add(key, new_value)
+  void Update(SequenceNumber seq,
+              const Slice& key,
+              const Slice& value);
+
+  // If prev_value for key exists, attempts to update it inplace,
+  // else returns false
+  // Pseudocode
+  //   if key exists in current memtable && prev_value is of type kTypeValue
+  //     new_value = delta(prev_value)
+  //     if sizeof(new_value) <= sizeof(prev_value)
+  //       update inplace
+  //     else add(key, new_value)
+  //   else return false
+  bool UpdateCallback(SequenceNumber seq,
+                      const Slice& key,
+                      const Slice& delta,
+                      const Options& options);
+
+  // Returns the number of successive merge entries starting from the newest
+  // entry for the key up to the last non-merge entry or last entry for the
+  // key in the memtable.
+  size_t CountSuccessiveMergeEntries(const LookupKey& key);
+
+  // Get total number of entries in the mem table.
+  uint64_t GetNumEntries() const { return num_entries_; }
+
+  // Returns the edits area that is needed for flushing the memtable
+  VersionEdit* GetEdits() { return &edit_; }
+
+  // Returns the sequence number of the first element that was inserted
+  // into the memtable
+  SequenceNumber GetFirstSequenceNumber() { return first_seqno_; }
+
+  // Returns the next active logfile number when this memtable is about to
+  // be flushed to storage
+  uint64_t GetNextLogNumber() { return mem_next_logfile_number_; }
+
+  // Sets the next active logfile number when this memtable is about to
+  // be flushed to storage
+  void SetNextLogNumber(uint64_t num) { mem_next_logfile_number_ = num; }
+
+  // Notify the underlying storage that no more items will be added
+  void MarkImmutable() { table_->MarkReadOnly(); }
+
+  // return true if the current MemTableRep supports merge operator.
+  bool IsMergeOperatorSupported() const {
+    return table_->IsMergeOperatorSupported();
+  }
+
+  // return true if the current MemTableRep supports snapshots.
+  bool IsSnapshotSupported() const { return table_->IsSnapshotSupported(); }
+
+  // Get the lock associated for the key
+  port::RWMutex* GetLock(const Slice& key);
+
+  const InternalKeyComparator& GetInternalKeyComparator() const {
+    return comparator_.comparator;
+  }
+
+  const Arena& TEST_GetArena() const { return arena_; }
+
+ private:
+  // Dynamically check if we can add more incoming entries.
+  bool ShouldFlushNow() const;
+
+  friend class MemTableIterator;
+  friend class MemTableBackwardIterator;
+  friend class MemTableList;
+
+  KeyComparator comparator_;
+  int refs_;
+  const size_t kArenaBlockSize;
+  const size_t kWriteBufferSize;
+  Arena arena_;
+  unique_ptr<MemTableRep> table_;
+
+  uint64_t num_entries_;
+
+  // These are used to manage memtable flushes to storage
+  bool flush_in_progress_;  // started the flush
+  bool flush_completed_;    // finished the flush
+  uint64_t file_number_;    // filled up after flush is complete
+
+  // The updates to be applied to the transaction log when this
+  // memtable is flushed to storage.
+  VersionEdit edit_;
+
+  // The sequence number of the kv that was inserted first
+  SequenceNumber first_seqno_;
+
+  // The log files earlier than this number can be deleted.
+  uint64_t mem_next_logfile_number_;
+
+  // rw locks for inplace updates
+  std::vector<port::RWMutex> locks_;
+
+  // No copying allowed
+  MemTable(const MemTable&);
+  void operator=(const MemTable&);
+
+  const SliceTransform* const prefix_extractor_;
+  std::unique_ptr<DynamicBloom> prefix_bloom_;
+
+  // a flag indicating if a memtable has met the criteria to flush
+  bool should_flush_;
+};
+
+extern const char* EncodeKey(std::string* scratch, const Slice& target);
+
+}  // namespace rocksdb
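A hedged, single-threaded usage sketch of the interface above (default Options, arbitrary sequence numbers; purely illustrative, not code from this patch):

    #include <cassert>
    #include <string>
    #include "db/memtable.h"
    #include "db/merge_context.h"
    #include "rocksdb/comparator.h"

    void MemTableSmokeTest() {
      using namespace rocksdb;
      InternalKeyComparator cmp(BytewiseComparator());
      Options options;
      MemTable* mem = new MemTable(cmp, options);
      mem->Ref();  // initial refcount is zero; callers must Ref() first

      mem->Add(1 /* seq */, kTypeValue, "key1", "value1");

      std::string value;
      Status s;
      MergeContext merge_context;
      LookupKey lkey("key1", 2 /* read at snapshot seq 2 */);
      bool found = mem->Get(lkey, &value, &s, merge_context, options);
      assert(found && value == "value1");

      if (MemTable* to_delete = mem->Unref()) {
        delete to_delete;  // refcount hit zero; we own the deletion
      }
    }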
diff --git a/db/memtable_list.cc b/db/memtable_list.cc
new file mode 100644
index 0000000000..de1a18eee2
--- /dev/null
+++ b/db/memtable_list.cc
@@ -0,0 +1,286 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+#include "db/memtable_list.h"
+
+#include <string>
+#include "rocksdb/db.h"
+#include "db/memtable.h"
+#include "db/version_set.h"
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "table/merger.h"
+#include "util/coding.h"
+#include "util/log_buffer.h"
+
+namespace rocksdb {
+
+class InternalKeyComparator;
+class Mutex;
+class VersionSet;
+
+MemTableListVersion::MemTableListVersion(MemTableListVersion* old) {
+  if (old != nullptr) {
+    memlist_ = old->memlist_;
+    size_ = old->size_;
+    for (auto& m : memlist_) {
+      m->Ref();
+    }
+  }
+}
+
+void MemTableListVersion::Ref() { ++refs_; }
+
+void MemTableListVersion::Unref(autovector<MemTable*>* to_delete) {
+  assert(refs_ >= 1);
+  --refs_;
+  if (refs_ == 0) {
+    // if to_delete is equal to nullptr it means we're confident
+    // that refs_ will not be zero
+    assert(to_delete != nullptr);
+    for (const auto& m : memlist_) {
+      MemTable* x = m->Unref();
+      if (x != nullptr) {
+        to_delete->push_back(x);
+      }
+    }
+    delete this;
+  }
+}
+
+int MemTableListVersion::size() const { return size_; }
+
+// Returns the total number of memtables in the list
+int MemTableList::size() const {
+  assert(num_flush_not_started_ <= current_->size_);
+  return current_->size_;
+}
+
+// Search all the memtables starting from the most recent one.
+// Return the most recent value found, if any.
+// Operands stores the list of merge operations to apply, so far.
+bool MemTableListVersion::Get(const LookupKey& key, std::string* value,
+                              Status* s, MergeContext& merge_context,
+                              const Options& options) {
+  for (auto& memtable : memlist_) {
+    if (memtable->Get(key, value, s, merge_context, options)) {
+      return true;
+    }
+  }
+  return false;
+}
+
+void MemTableListVersion::AddIterators(const ReadOptions& options,
+                                       std::vector<Iterator*>* iterator_list) {
+  for (auto& m : memlist_) {
+    iterator_list->push_back(m->NewIterator(options));
+  }
+}
+
+void MemTableListVersion::AddIterators(
+    const ReadOptions& options, MergeIteratorBuilder* merge_iter_builder) {
+  for (auto& m : memlist_) {
+    merge_iter_builder->AddIterator(m->NewIterator(
+        options, false /* enforce_total_order */,
+        merge_iter_builder->GetArena()));
+  }
+}
+
+uint64_t MemTableListVersion::GetTotalNumEntries() const {
+  uint64_t total_num = 0;
+  for (auto& m : memlist_) {
+    total_num += m->GetNumEntries();
+  }
+  return total_num;
+}
+
+// caller is responsible for referencing m
+void MemTableListVersion::Add(MemTable* m) {
+  assert(refs_ == 1);  // only when refs_ == 1 is MemTableListVersion mutable
+  memlist_.push_front(m);
+  ++size_;
+}
+
+// caller is responsible for unreferencing m
+void MemTableListVersion::Remove(MemTable* m) {
+  assert(refs_ == 1);  // only when refs_ == 1 is MemTableListVersion mutable
+  memlist_.remove(m);
+  --size_;
+}
+
+// Returns true if there is at least one memtable on which flush has
+// not yet started.
+bool MemTableList::IsFlushPending() const {
+  if ((flush_requested_ && num_flush_not_started_ >= 1) ||
+      (num_flush_not_started_ >= min_write_buffer_number_to_merge_)) {
+    assert(imm_flush_needed.NoBarrier_Load() != nullptr);
+    return true;
+  }
+  return false;
+}
+
+// Returns the memtables that need to be flushed.
+void MemTableList::PickMemtablesToFlush(autovector<MemTable*>* ret) {
+  const auto& memlist = current_->memlist_;
+  for (auto it = memlist.rbegin(); it != memlist.rend(); ++it) {
+    MemTable* m = *it;
+    if (!m->flush_in_progress_) {
+      assert(!m->flush_completed_);
+      num_flush_not_started_--;
+      if (num_flush_not_started_ == 0) {
+        imm_flush_needed.Release_Store(nullptr);
+      }
+      m->flush_in_progress_ = true;  // flushing will start very soon
+      ret->push_back(m);
+    }
+  }
+  flush_requested_ = false;  // start-flush request is complete
+}
+
+void MemTableList::RollbackMemtableFlush(const autovector<MemTable*>& mems,
+                                         uint64_t file_number,
+                                         std::set<uint64_t>* pending_outputs) {
+  assert(!mems.empty());
+
+  // If the flush was not successful, then just reset state.
+  // Maybe a succeeding attempt to flush will be successful.
+  for (MemTable* m : mems) {
+    assert(m->flush_in_progress_);
+    assert(m->file_number_ == 0);
+
+    m->flush_in_progress_ = false;
+    m->flush_completed_ = false;
+    m->edit_.Clear();
+    num_flush_not_started_++;
+  }
+  pending_outputs->erase(file_number);
+  imm_flush_needed.Release_Store(reinterpret_cast<void*>(1));
+}
+
+// Record a successful flush in the manifest file
+Status MemTableList::InstallMemtableFlushResults(
+    ColumnFamilyData* cfd, const autovector<MemTable*>& mems, VersionSet* vset,
+    port::Mutex* mu, Logger* info_log, uint64_t file_number,
+    std::set<uint64_t>& pending_outputs, autovector<MemTable*>* to_delete,
+    Directory* db_directory, LogBuffer* log_buffer) {
+  mu->AssertHeld();
+
+  // flush was successful
+  for (size_t i = 0; i < mems.size(); ++i) {
+    // All the edits are associated with the first memtable of this batch.
+    assert(i == 0 || mems[i]->GetEdits()->NumEntries() == 0);
+
+    mems[i]->flush_completed_ = true;
+    mems[i]->file_number_ = file_number;
+  }
+
+  // if some other thread is already committing, then return
+  Status s;
+  if (commit_in_progress_) {
+    return s;
+  }
+
+  // Only a single thread can be executing this piece of code
+  commit_in_progress_ = true;
+
+  // scan all memtables from the earliest, and commit those
+  // (in that order) that have finished flushing. Memtables
+  // are always committed in the order that they were created.
+  while (!current_->memlist_.empty() && s.ok()) {
+    MemTable* m = current_->memlist_.back();  // get the last element
+    if (!m->flush_completed_) {
+      break;
+    }
+
+    LogToBuffer(log_buffer, "[%s] Level-0 commit table #%lu started",
+                cfd->GetName().c_str(), (unsigned long)m->file_number_);
+
+    // this can release and reacquire the mutex.
+    s = vset->LogAndApply(cfd, &m->edit_, mu, db_directory);
+
+    // we will be changing the version in the next code path,
+    // so we better create a new one, since versions are immutable
+    InstallNewVersion();
+
+    // All the later memtables that have the same filenum
+    // are part of the same batch. They can be committed now.
+    uint64_t mem_id = 1;  // how many memtables have been flushed.
+    do {
+      if (s.ok()) {  // commit new state
+        LogToBuffer(log_buffer,
+                    "[%s] Level-0 commit table #%lu: memtable #%lu done",
+                    cfd->GetName().c_str(), (unsigned long)m->file_number_,
+                    (unsigned long)mem_id);
+        current_->Remove(m);
+        assert(m->file_number_ > 0);
+
+        // pending_outputs can be cleared only after the newly created file
+        // has been written to a committed version so that other concurrently
+        // executing compaction threads do not mistakenly assume that this
+        // file is not live.
+        pending_outputs.erase(m->file_number_);
+        if (m->Unref() != nullptr) {
+          to_delete->push_back(m);
+        }
+      } else {
+        // commit failed. Setup state so that we can flush again.
+        Log(info_log,
+            "Level-0 commit table #%lu: memtable #%lu failed",
+            (unsigned long)m->file_number_,
+            (unsigned long)mem_id);
+        m->flush_completed_ = false;
+        m->flush_in_progress_ = false;
+        m->edit_.Clear();
+        num_flush_not_started_++;
+        pending_outputs.erase(m->file_number_);
+        m->file_number_ = 0;
+        imm_flush_needed.Release_Store((void *)1);
+      }
+      ++mem_id;
+    } while (!current_->memlist_.empty() && (m = current_->memlist_.back()) &&
+             m->file_number_ == file_number);
+  }
+  commit_in_progress_ = false;
+  return s;
+}
+
+// New memtables are inserted at the front of the list.
+void MemTableList::Add(MemTable* m) {
+  assert(current_->size_ >= num_flush_not_started_);
+  InstallNewVersion();
+  // this method is used to move mutable memtable into an immutable list.
+  // since mutable memtable is already refcounted by the DBImpl,
+  // and when moving to the immutable list we don't unref it,
+  // we don't have to ref the memtable here. we just take over the
+  // reference from the DBImpl.
+  current_->Add(m);
+  m->MarkImmutable();
+  num_flush_not_started_++;
+  if (num_flush_not_started_ == 1) {
+    imm_flush_needed.Release_Store((void *)1);
+  }
+}
+
+// Returns an estimate of the number of bytes of data in use.
+size_t MemTableList::ApproximateMemoryUsage() {
+  size_t size = 0;
+  for (auto& memtable : current_->memlist_) {
+    size += memtable->ApproximateMemoryUsage();
+  }
+  return size;
+}
+
+void MemTableList::InstallNewVersion() {
+  if (current_->refs_ == 1) {
+    // we're the only one using the version, just keep using it
+  } else {
+    // somebody else holds the current version, we need to create new one
+    MemTableListVersion* version = current_;
+    current_ = new MemTableListVersion(current_);
+    current_->Ref();
+    version->Unref();
+  }
+}
+
+}  // namespace rocksdb
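For context, here is a compressed sketch of how a caller (DBImpl in RocksDB) is expected to drive the flush state machine above. Every parameter stands in for state the real caller owns, and WriteLevel0Table is a named placeholder, not an API from this patch:

    #include <set>
    #include "db/memtable_list.h"
    #include "db/version_set.h"

    // Placeholder for the code that actually writes the SST file.
    extern rocksdb::Status WriteLevel0Table(
        const rocksdb::autovector<rocksdb::MemTable*>& mems,
        uint64_t file_number);

    rocksdb::Status FlushOneBatch(
        rocksdb::MemTableList* imm, rocksdb::ColumnFamilyData* cfd,
        rocksdb::VersionSet* vset, rocksdb::port::Mutex* mu,
        rocksdb::Logger* info_log, uint64_t file_number,
        std::set<uint64_t>* pending_outputs,
        rocksdb::autovector<rocksdb::MemTable*>* to_delete,
        rocksdb::Directory* db_directory, rocksdb::LogBuffer* log_buffer) {
      rocksdb::autovector<rocksdb::MemTable*> mems;
      imm->PickMemtablesToFlush(&mems);  // marks them flush_in_progress_
      rocksdb::Status s = WriteLevel0Table(mems, file_number);
      if (s.ok()) {
        s = imm->InstallMemtableFlushResults(cfd, mems, vset, mu, info_log,
                                             file_number, *pending_outputs,
                                             to_delete, db_directory,
                                             log_buffer);
      } else {
        // Put the memtables back so a later flush attempt can retry them.
        imm->RollbackMemtableFlush(mems, file_number, pending_outputs);
      }
      return s;
    }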
diff --git a/db/memtable_list.h b/db/memtable_list.h
new file mode 100644
index 0000000000..e56710fc98
--- /dev/null
+++ b/db/memtable_list.h
@@ -0,0 +1,156 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+#pragma once
+
+#include <string>
+#include <list>
+#include <vector>
+#include <set>
+#include <deque>
+
+#include "db/dbformat.h"
+#include "db/skiplist.h"
+#include "db/memtable.h"
+#include "rocksdb/db.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/options.h"
+#include "util/autovector.h"
+#include "util/log_buffer.h"
+
+namespace rocksdb {
+
+class ColumnFamilyData;
+class InternalKeyComparator;
+class Mutex;
+class MergeIteratorBuilder;
+
+// keeps a list of immutable memtables. the list is immutable
+// if refcount is bigger than one. It is used as a state for Get() and
+// Iterator code paths
+class MemTableListVersion {
+ public:
+  explicit MemTableListVersion(MemTableListVersion* old = nullptr);
+
+  void Ref();
+  void Unref(autovector<MemTable*>* to_delete = nullptr);
+
+  int size() const;
+
+  // Search all the memtables starting from the most recent one.
+  // Return the most recent value found, if any.
+  bool Get(const LookupKey& key, std::string* value, Status* s,
+           MergeContext& merge_context, const Options& options);
+
+  void AddIterators(const ReadOptions& options,
+                    std::vector<Iterator*>* iterator_list);
+
+  void AddIterators(const ReadOptions& options,
+                    MergeIteratorBuilder* merge_iter_builder);
+
+  uint64_t GetTotalNumEntries() const;
+
+ private:
+  // REQUIRE: m is a mutable memtable
+  void Add(MemTable* m);
+  // REQUIRE: m is a mutable memtable
+  void Remove(MemTable* m);
+
+  friend class MemTableList;
+  std::list<MemTable*> memlist_;
+  int size_ = 0;
+  int refs_ = 0;
+};
+
+// This class stores references to all the immutable memtables.
+// The memtables are flushed to L0 as soon as possible and in
+// any order. If there is more than one immutable memtable, their
+// flushes can occur concurrently. However, they are 'committed'
+// to the manifest in FIFO order to maintain correctness and
+// recoverability from a crash.
+class MemTableList {
+ public:
+  // A list of memtables.
+  explicit MemTableList(int min_write_buffer_number_to_merge)
+      : min_write_buffer_number_to_merge_(min_write_buffer_number_to_merge),
+        current_(new MemTableListVersion()),
+        num_flush_not_started_(0),
+        commit_in_progress_(false),
+        flush_requested_(false) {
+    imm_flush_needed.Release_Store(nullptr);
+    current_->Ref();
+  }
+  ~MemTableList() {}
+
+  MemTableListVersion* current() { return current_; }
+
+  // so that background threads can detect non-nullptr pointer to
+  // determine whether there is anything more to start flushing.
+  port::AtomicPointer imm_flush_needed;
+
+  // Returns the total number of memtables in the list
+  int size() const;
+
+  // Returns true if there is at least one memtable on which flush has
+  // not yet started.
+  bool IsFlushPending() const;
+
+  // Returns the earliest memtables that need to be flushed. The returned
+  // memtables are guaranteed to be in the ascending order of created time.
+  void PickMemtablesToFlush(autovector<MemTable*>* mems);
+
+  // Reset status of the given memtable list back to pending state so that
+  // they can get picked up again on the next round of flush.
+  void RollbackMemtableFlush(const autovector<MemTable*>& mems,
+                             uint64_t file_number,
+                             std::set<uint64_t>* pending_outputs);
+
+  // Commit a successful flush in the manifest file
+  Status InstallMemtableFlushResults(ColumnFamilyData* cfd,
+                                     const autovector<MemTable*>& m,
+                                     VersionSet* vset, port::Mutex* mu,
+                                     Logger* info_log, uint64_t file_number,
+                                     std::set<uint64_t>& pending_outputs,
+                                     autovector<MemTable*>* to_delete,
+                                     Directory* db_directory,
+                                     LogBuffer* log_buffer);
+
+  // New memtables are inserted at the front of the list.
+  // Takes ownership of the reference held on *m by the caller of Add().
+  void Add(MemTable* m);
+
+  // Returns an estimate of the number of bytes of data in use.
+  size_t ApproximateMemoryUsage();
+
+  // Request a flush of all existing memtables to storage
+  void FlushRequested() { flush_requested_ = true; }
+
+  // Copying allowed
+  // MemTableList(const MemTableList&);
+  // void operator=(const MemTableList&);
+
+ private:
+  // DB mutex held
+  void InstallNewVersion();
+
+  int min_write_buffer_number_to_merge_;
+
+  MemTableListVersion* current_;
+
+  // the number of elements that still need flushing
+  int num_flush_not_started_;
+
+  // committing in progress
+  bool commit_in_progress_;
+
+  // Requested a flush of all memtables to storage
+  bool flush_requested_;
+};
+
+}  // namespace rocksdb
diff --git a/db/merge_context.h b/db/merge_context.h
new file mode 100644
index 0000000000..bf483a8275
--- /dev/null
+++ b/db/merge_context.h
@@ -0,0 +1,69 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+#pragma once
+#include "db/dbformat.h"
+#include "rocksdb/slice.h"
+#include <string>
+#include <deque>
+
+namespace rocksdb {
+
+const std::deque<std::string> empty_operand_list;
+
+// The merge context for merging a user key.
+// When doing a Get(), DB will create such a class and pass it when
+// issuing Get() operation to memtables and version_set. The operands
+// will be fetched from the context when issuing partial or full merge.
+class MergeContext {
+ public:
+  // Clear all the operands
+  void Clear() {
+    if (operand_list) {
+      operand_list->clear();
+    }
+  }
+  // Replace all operands with merge_result, which is expected to be the
+  // merge result of them.
+  void PushPartialMergeResult(std::string& merge_result) {
+    assert(operand_list);
+    operand_list->clear();
+    operand_list->push_front(std::move(merge_result));
+  }
+  // Push a merge operand
+  void PushOperand(const Slice& operand_slice) {
+    Initialize();
+    operand_list->push_front(operand_slice.ToString());
+  }
+  // return total number of operands in the list
+  size_t GetNumOperands() const {
+    if (!operand_list) {
+      return 0;
+    }
+    return operand_list->size();
+  }
+  // Get the operand at the index.
+  Slice GetOperand(int index) const {
+    assert(operand_list);
+    return (*operand_list)[index];
+  }
+  // Return all the operands.
+  const std::deque<std::string>& GetOperands() const {
+    if (!operand_list) {
+      return empty_operand_list;
+    }
+    return *operand_list;
+  }
+
+ private:
+  void Initialize() {
+    if (!operand_list) {
+      operand_list.reset(new std::deque<std::string>());
+    }
+  }
+  std::unique_ptr<std::deque<std::string>> operand_list;
+};
+
+}  // namespace rocksdb
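A small hedged sketch of how a reader (such as SaveValue() earlier) accumulates operands in a MergeContext; the string payloads are invented:

    #include <cassert>
    #include "db/merge_context.h"

    void CollectOperands() {
      rocksdb::MergeContext ctx;
      // Entries are scanned newest-to-oldest and pushed to the front, so
      // GetOperands() ends up ordered oldest-to-newest for FullMerge().
      ctx.PushOperand(rocksdb::Slice("+2"));  // newest operand, seen first
      ctx.PushOperand(rocksdb::Slice("+1"));  // older operand, seen second
      assert(ctx.GetNumOperands() == 2);
      assert(ctx.GetOperand(0) == rocksdb::Slice("+1"));  // oldest first
    }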
diff --git a/db/merge_helper.cc b/db/merge_helper.cc
new file mode 100644
index 0000000000..0e36f6ae07
--- /dev/null
+++ b/db/merge_helper.cc
@@ -0,0 +1,209 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+#include "merge_helper.h"
+#include "db/dbformat.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/merge_operator.h"
+#include "util/statistics.h"
+#include <string>
+#include <stdio.h>
+
+namespace rocksdb {
+
+// PRE:  iter points to the first merge type entry
+// POST: iter points to the first entry beyond the merge process (or the end)
+//       keys_, operands_ are updated to reflect the merge result.
+//       keys_ stores the list of keys encountered while merging.
+//       operands_ stores the list of merge operands encountered while
+//       merging. keys_[i] corresponds to operands_[i] for each i.
+void MergeHelper::MergeUntil(Iterator* iter, SequenceNumber stop_before,
+                             bool at_bottom, Statistics* stats, int* steps) {
+  // Get a copy of the internal key, before it's invalidated by iter->Next()
+  // Also maintain the list of merge operands seen.
+  keys_.clear();
+  operands_.clear();
+  keys_.push_front(iter->key().ToString());
+  operands_.push_front(iter->value().ToString());
+
+  success_ = false;  // Will become true if we hit Put/Delete or bottom
+
+  // We need to parse the internal key again as the parsed key is
+  // backed by the internal key!
+  // Assume no internal key corruption as it has been successfully parsed
+  // by the caller.
+  // Invariant: keys_.back() will not change. Hence, orig_ikey is always valid.
+  ParsedInternalKey orig_ikey;
+  ParseInternalKey(keys_.back(), &orig_ikey);
+
+  bool hit_the_next_user_key = false;
+  std::string merge_result;  // Temporary value for merge results
+  if (steps) {
+    ++(*steps);
+  }
+  for (iter->Next(); iter->Valid(); iter->Next()) {
+    ParsedInternalKey ikey;
+    assert(operands_.size() >= 1);  // Should be invariants!
+    assert(keys_.size() == operands_.size());
+
+    if (!ParseInternalKey(iter->key(), &ikey)) {
+      // stop at corrupted key
+      if (assert_valid_internal_key_) {
+        assert(!"corrupted internal key is not expected");
+      }
+      break;
+    }
+
+    if (user_comparator_->Compare(ikey.user_key, orig_ikey.user_key) != 0) {
+      // hit a different user key, stop right here
+      hit_the_next_user_key = true;
+      break;
+    }
+
+    if (stop_before && ikey.sequence <= stop_before) {
+      // hit an entry that's visible by the previous snapshot, can't touch that
+      break;
+    }
+
+    // At this point we are guaranteed that we need to process this key.
+
+    if (kTypeDeletion == ikey.type) {
+      // hit a delete
+      //   => merge nullptr with operands_
+      //   => store result in operands_.back() (and update keys_.back())
+      //   => change the entry type to kTypeValue for keys_.back()
+      // We are done! Return a success if the merge passes.
+      success_ = user_merge_operator_->FullMerge(ikey.user_key, nullptr,
+                                                 operands_, &merge_result,
+                                                 logger_);
+
+      // We store the result in keys_.back() and operands_.back()
+      // if nothing went wrong (i.e.: no operand corruption on disk)
+      if (success_) {
+        std::string& key = keys_.back();  // The original key encountered
+        orig_ikey.type = kTypeValue;
+        UpdateInternalKey(&key[0], key.size(),
+                          orig_ikey.sequence, orig_ikey.type);
+        swap(operands_.back(), merge_result);
+      } else {
+        RecordTick(stats, NUMBER_MERGE_FAILURES);
+      }
+
+      // move iter to the next entry (before doing anything else)
+      iter->Next();
+      if (steps) {
+        ++(*steps);
+      }
+      return;
+    }
+
+    if (kTypeValue == ikey.type) {
+      // hit a put
+      //   => merge the put value with operands_
+      //   => store result in operands_.back() (and update keys_.back())
+      //   => change the entry type to kTypeValue for keys_.back()
+      // We are done! Success!
+      const Slice value = iter->value();
+      success_ = user_merge_operator_->FullMerge(ikey.user_key, &value,
+                                                 operands_, &merge_result,
+                                                 logger_);
+
+      // We store the result in keys_.back() and operands_.back()
+      // if nothing went wrong (i.e.: no operand corruption on disk)
+      if (success_) {
+        std::string& key = keys_.back();  // The original key encountered
+        orig_ikey.type = kTypeValue;
+        UpdateInternalKey(&key[0], key.size(),
+                          orig_ikey.sequence, orig_ikey.type);
+        swap(operands_.back(), merge_result);
+      } else {
+        RecordTick(stats, NUMBER_MERGE_FAILURES);
+      }
+
+      // move iter to the next entry
+      iter->Next();
+      if (steps) {
+        ++(*steps);
+      }
+      return;
+    }
+
+    if (kTypeMerge == ikey.type) {
+      // hit a merge
+      //   => merge the operand into the front of the operands_ list
+      //   => use the user's associative merge function to determine how.
+      //   => then continue because we haven't yet seen a Put/Delete.
+      assert(!operands_.empty());  // Should have at least one element in it
+
+      // keep queuing keys and operands until we either meet a put / delete
+      // request or later do a partial merge.
+      keys_.push_front(iter->key().ToString());
+      operands_.push_front(iter->value().ToString());
+      if (steps) {
+        ++(*steps);
+      }
+    }
+  }
+
+  // We are sure we have seen this key's entire history if we are at the
+  // last level and exhausted all internal keys of this user key.
+  // NOTE: !iter->Valid() does not necessarily mean we hit the
+  // beginning of a user key, as versions of a user key might be
+  // split into multiple files (even files on the same level)
+  // and some files might not be included in the compaction/merge.
+  //
+  // There are also cases where we have seen the root of history of this
+  // key without being sure of it. Then, we simply miss the opportunity
+  // to combine the keys. Since VersionSet::SetupOtherInputs() always makes
+  // sure that all merge-operands on the same level get compacted together,
+  // this will simply lead to these merge operands moving to the next level.
+  //
+  // So, we only perform the following logic (to merge all operands together
+  // without a Put/Delete) if we are certain that we have seen the end of key.
+  bool surely_seen_the_beginning = hit_the_next_user_key && at_bottom;
+  if (surely_seen_the_beginning) {
+    // do a final merge with nullptr as the existing value and say
+    // bye to the merge type (it's now converted to a Put)
+    assert(kTypeMerge == orig_ikey.type);
+    assert(operands_.size() >= 1);
+    assert(operands_.size() == keys_.size());
+    success_ = user_merge_operator_->FullMerge(orig_ikey.user_key, nullptr,
+                                               operands_, &merge_result,
+                                               logger_);
+
+    if (success_) {
+      std::string& key = keys_.back();  // The original key encountered
+      orig_ikey.type = kTypeValue;
+      UpdateInternalKey(&key[0], key.size(),
+                        orig_ikey.sequence, orig_ikey.type);
+
+      // The final value() is always stored in operands_.back()
+      swap(operands_.back(), merge_result);
+    } else {
+      RecordTick(stats, NUMBER_MERGE_FAILURES);
+      // Do nothing if not success_. Leave keys() and operands() as they are.
+    }
+  } else {
+    // We haven't seen the beginning of the key nor a Put/Delete.
+    // Attempt to use the user's associative merge function to
+    // merge the stacked merge operands into a single operand.
+
+    if (operands_.size() >= 2 &&
+        operands_.size() >= min_partial_merge_operands_ &&
+        user_merge_operator_->PartialMergeMulti(
+            orig_ikey.user_key,
+            std::deque<Slice>(operands_.begin(), operands_.end()),
+            &merge_result, logger_)) {
+      // Merging of operands (associative merge) was successful.
+      // Replace operands with the merge result
+      operands_.clear();
+      operands_.push_front(std::move(merge_result));
+      keys_.erase(keys_.begin(), keys_.end() - 1);
+    }
+  }
+}
+
+}  // namespace rocksdb
diff --git a/db/merge_helper.h b/db/merge_helper.h
new file mode 100644
index 0000000000..fef153eb0d
--- /dev/null
+++ b/db/merge_helper.h
@@ -0,0 +1,105 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+#ifndef MERGE_HELPER_H
+#define MERGE_HELPER_H
+
+#include "db/dbformat.h"
+#include "rocksdb/slice.h"
+#include <string>
+#include <deque>
+
+namespace rocksdb {
+
+class Comparator;
+class Iterator;
+class Logger;
+class MergeOperator;
+class Statistics;
+
+class MergeHelper {
+ public:
+  MergeHelper(const Comparator* user_comparator,
+              const MergeOperator* user_merge_operator, Logger* logger,
+              unsigned min_partial_merge_operands,
+              bool assert_valid_internal_key)
+      : user_comparator_(user_comparator),
+        user_merge_operator_(user_merge_operator),
+        logger_(logger),
+        min_partial_merge_operands_(min_partial_merge_operands),
+        assert_valid_internal_key_(assert_valid_internal_key),
+        keys_(),
+        operands_(),
+        success_(false) {}
+
+  // Merge entries until we hit
+  //   - a corrupted key
+  //   - a Put/Delete,
+  //   - a different user key,
+  //   - a specific sequence number (snapshot boundary),
+  //  or - the end of iteration
+  // iter:        (IN)  points to the first merge type entry
+  //              (OUT) points to the first entry not included in the merge
+  //                    process
+  // stop_before: (IN)  a sequence number that merge should not cross.
+  //                    0 means no restriction
+  // at_bottom:   (IN)  true if the iterator covers the bottom level, which
+  //                    means we could reach the start of the history of this
+  //                    user key.
+  void MergeUntil(Iterator* iter, SequenceNumber stop_before = 0,
+                  bool at_bottom = false, Statistics* stats = nullptr,
+                  int* steps = nullptr);
+
+  // Query the merge result
+  // These are valid until the next MergeUntil call
+  // If the merging was successful:
+  //   - IsSuccess() will be true
+  //   - key() will have the latest sequence number of the merges.
+  //           The type will be Put or Merge. See IMPORTANT 1 note, below.
+  //   - value() will be the result of merging all the operands together
+  //   - The user should ignore keys() and values().
+  //
+  //   IMPORTANT 1: the key type could change after the MergeUntil call.
+  //        Put/Delete + Merge + ... + Merge => Put
+  //        Merge + ... + Merge => Merge
+  //
+  // If the merge operator is not associative, and if a Put/Delete is not
+  // found then the merging will be unsuccessful. In this case:
+  //   - IsSuccess() will be false
+  //   - keys() contains the list of internal keys seen in order of iteration.
+  //   - values() contains the list of values (merges) seen in the same order.
+  //              values() is parallel to keys() so that the first entry in
+  //              keys() is the key associated with the first entry in
+  //              values() and so on. These lists will be the same length.
+  //              All of these pairs will be merges over the same user key.
+  //              See IMPORTANT 2 note below.
+  //   - The user should ignore key() and value().
+  //
+  //   IMPORTANT 2: The entries were traversed in order from BACK to FRONT.
+  //                So keys().back() was the first key seen by iterator.
+  // TODO: Re-style this comment to be like the first one
+  bool IsSuccess() { return success_; }
+  Slice key() { assert(success_); return Slice(keys_.back()); }
+  Slice value() { assert(success_); return Slice(operands_.back()); }
+  const std::deque<std::string>& keys() { assert(!success_); return keys_; }
+  const std::deque<std::string>& values() {
+    assert(!success_); return operands_;
+  }
+
+ private:
+  const Comparator* user_comparator_;
+  const MergeOperator* user_merge_operator_;
+  Logger* logger_;
+  unsigned min_partial_merge_operands_;
+  bool assert_valid_internal_key_;  // enforce no internal key corruption?
+
+  // the scratch area that holds the result of MergeUntil
+  // valid up to the next MergeUntil call
+  std::deque<std::string> keys_;      // Keeps track of the sequence of keys seen
+  std::deque<std::string> operands_;  // Parallel with keys_; stores the values
+  bool success_;
+};
+
+}  // namespace rocksdb
+
+#endif
diff --git a/db/merge_operator.cc b/db/merge_operator.cc
new file mode 100644
index 0000000000..a14df8a871
--- /dev/null
+++ b/db/merge_operator.cc
@@ -0,0 +1,77 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+/**
+ * Back-end implementation details specific to the Merge Operator.
+ */
+
+#include "rocksdb/merge_operator.h"
+
+namespace rocksdb {
+
+// The default implementation of PartialMergeMulti, which invokes
+// PartialMerge multiple times internally and merges two operands at
+// a time.
+bool MergeOperator::PartialMergeMulti(const Slice& key,
+                                      const std::deque<Slice>& operand_list,
+                                      std::string* new_value,
+                                      Logger* logger) const {
+  assert(operand_list.size() >= 2);
+  // Simply loop through the operands
+  std::string temp_value;
+  Slice temp_slice(operand_list[0]);
+
+  for (size_t i = 1; i < operand_list.size(); ++i) {
+    auto& operand = operand_list[i];
+    if (!PartialMerge(key, temp_slice, operand, &temp_value, logger)) {
+      return false;
+    }
+    swap(temp_value, *new_value);
+    temp_slice = Slice(*new_value);
+  }
+
+  // The result will be in *new_value. All merges succeeded.
+  return true;
+}
+
+// Given a "real" merge from the library, call the user's
+// associative merge function one-by-one on each of the operands.
+// NOTE: It is assumed that the client's merge-operator will handle any errors.
+bool AssociativeMergeOperator::FullMerge(
+    const Slice& key,
+    const Slice* existing_value,
+    const std::deque<std::string>& operand_list,
+    std::string* new_value,
+    Logger* logger) const {
+
+  // Simply loop through the operands
+  Slice temp_existing;
+  std::string temp_value;
+  for (const auto& operand : operand_list) {
+    Slice value(operand);
+    if (!Merge(key, existing_value, value, &temp_value, logger)) {
+      return false;
+    }
+    swap(temp_value, *new_value);
+    temp_existing = Slice(*new_value);
+    existing_value = &temp_existing;
+  }
+
+  // The result will be in *new_value. All merges succeeded.
+  return true;
+}
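To show how the two hooks above cooperate, here is a hedged sketch of a concrete AssociativeMergeOperator; it mirrors the UInt64AddOperator that merge_test.cc below builds on, with fixed-width little-endian encoding assumed for brevity:

    #include <cstdint>
    #include <cstring>
    #include <string>
    #include "rocksdb/merge_operator.h"

    class Uint64AddOperator : public rocksdb::AssociativeMergeOperator {
     public:
      virtual bool Merge(const rocksdb::Slice& key,
                         const rocksdb::Slice* existing_value,
                         const rocksdb::Slice& value, std::string* new_value,
                         rocksdb::Logger* logger) const override {
        uint64_t base = 0;
        if (existing_value != nullptr) {
          if (existing_value->size() != sizeof(base)) return false;  // corrupt
          std::memcpy(&base, existing_value->data(), sizeof(base));
        }
        uint64_t operand = 0;
        if (value.size() != sizeof(operand)) return false;
        std::memcpy(&operand, value.data(), sizeof(operand));
        const uint64_t sum = base + operand;
        new_value->assign(reinterpret_cast<const char*>(&sum), sizeof(sum));
        return true;
      }
      virtual const char* Name() const override { return "Uint64AddOperator"; }
    };

Because addition is associative, the single Merge() hook is enough: FullMerge() folds it over the whole operand list, and PartialMerge() (below) reuses it to combine two adjacent operands.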
+
+// Call the user-defined simple merge on the operands.
+// NOTE: It is assumed that the client's merge-operator will handle any errors.
+bool AssociativeMergeOperator::PartialMerge(
+    const Slice& key,
+    const Slice& left_operand,
+    const Slice& right_operand,
+    std::string* new_value,
+    Logger* logger) const {
+  return Merge(key, &left_operand, right_operand, new_value, logger);
+}
+
+}  // namespace rocksdb
diff --git a/db/merge_test.cc b/db/merge_test.cc
new file mode 100644
index 0000000000..9bdf54332e
--- /dev/null
+++ b/db/merge_test.cc
@@ -0,0 +1,472 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+#include <assert.h>
+#include <memory>
+#include <iostream>
+
+#include "rocksdb/cache.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/merge_operator.h"
+#include "db/dbformat.h"
+#include "db/db_impl.h"
+#include "db/write_batch_internal.h"
+#include "utilities/merge_operators.h"
+#include "util/testharness.h"
+#include "utilities/db_ttl.h"
+
+using namespace std;
+using namespace rocksdb;
+
+namespace {
+  int numMergeOperatorCalls;
+  void resetNumMergeOperatorCalls() {
+    numMergeOperatorCalls = 0;
+  }
+
+  int num_partial_merge_calls;
+  void resetNumPartialMergeCalls() {
+    num_partial_merge_calls = 0;
+  }
+}
+
+class CountMergeOperator : public AssociativeMergeOperator {
+ public:
+  CountMergeOperator() {
+    mergeOperator_ = MergeOperators::CreateUInt64AddOperator();
+  }
+
+  virtual bool Merge(const Slice& key,
+                     const Slice* existing_value,
+                     const Slice& value,
+                     std::string* new_value,
+                     Logger* logger) const override {
+    ++numMergeOperatorCalls;
+    if (existing_value == nullptr) {
+      new_value->assign(value.data(), value.size());
+      return true;
+    }
+
+    return mergeOperator_->PartialMerge(
+        key,
+        *existing_value,
+        value,
+        new_value,
+        logger);
+  }
+
+  virtual bool PartialMergeMulti(const Slice& key,
+                                 const std::deque<Slice>& operand_list,
+                                 std::string* new_value,
+                                 Logger* logger) const {
+    ++num_partial_merge_calls;
+    return mergeOperator_->PartialMergeMulti(key, operand_list, new_value,
+                                             logger);
+  }
+
+  virtual const char* Name() const override {
+    return "UInt64AddOperator";
+  }
+
+ private:
+  std::shared_ptr<MergeOperator> mergeOperator_;
+};
+
+namespace {
+std::shared_ptr<DB> OpenDb(const string& dbname, const bool ttl = false,
+                           const size_t max_successive_merges = 0,
+                           const uint32_t min_partial_merge_operands = 2) {
+  DB* db;
+  Options options;
+  options.create_if_missing = true;
+  options.merge_operator = std::make_shared<CountMergeOperator>();
+  options.max_successive_merges = max_successive_merges;
+  options.min_partial_merge_operands = min_partial_merge_operands;
+  Status s;
+  DestroyDB(dbname, Options());
+  if (ttl) {
+    cout << "Opening database with TTL\n";
+    DBWithTTL* db_with_ttl;
+    s = DBWithTTL::Open(options, dbname, &db_with_ttl);
+    db = db_with_ttl;
+  } else {
+    s = DB::Open(options, dbname, &db);
+  }
+  if (!s.ok()) {
+    cerr << s.ToString() << endl;
+    assert(false);
+  }
+  return std::shared_ptr<DB>(db);
+}
+}  // namespace
+
+// Imagine we are maintaining a set of uint64 counters.
+// Each counter has a distinct name. And we would like
+// to support four high level operations:
+// set, add, get and remove
+// This is a quick implementation without a Merge operation.
+class Counters {
+
+ protected:
+  std::shared_ptr<DB> db_;
+
+  WriteOptions put_option_;
+  ReadOptions get_option_;
+  WriteOptions delete_option_;
+
+  uint64_t default_;
+
+ public:
+  explicit Counters(std::shared_ptr<DB> db, uint64_t defaultCount = 0)
+      : db_(db),
+        put_option_(),
+        get_option_(),
+        delete_option_(),
+        default_(defaultCount) {
+    assert(db_);
+  }
+
+  virtual ~Counters() {}
+
+  // public interface of Counters.
+  // All four functions return false
+  // if the underlying db operation failed.
+
+  // mapped to a rocksdb Put
+  bool set(const string& key, uint64_t value) {
+    // just treat the internal rep of int64 as the string
+    Slice slice((char *)&value, sizeof(value));
+    auto s = db_->Put(put_option_, key, slice);
+
+    if (s.ok()) {
+      return true;
+    } else {
+      cerr << s.ToString() << endl;
+      return false;
+    }
+  }
+
+  // mapped to a rocksdb Delete
+  bool remove(const string& key) {
+    auto s = db_->Delete(delete_option_, key);
+
+    if (s.ok()) {
+      return true;
+    } else {
+      cerr << s.ToString() << std::endl;
+      return false;
+    }
+  }
+
+  // mapped to a rocksdb Get
+  bool get(const string& key, uint64_t* value) {
+    string str;
+    auto s = db_->Get(get_option_, key, &str);
+
+    if (s.IsNotFound()) {
+      // return default value if not found;
+      *value = default_;
+      return true;
+    } else if (s.ok()) {
+      // deserialization
+      if (str.size() != sizeof(uint64_t)) {
+        cerr << "value corruption\n";
+        return false;
+      }
+      *value = DecodeFixed64(&str[0]);
+      return true;
+    } else {
+      cerr << s.ToString() << std::endl;
+      return false;
+    }
+  }
+
+  // 'add' is implemented as get -> modify -> set
+  // An alternative is a single merge operation, see MergeBasedCounters
+  virtual bool add(const string& key, uint64_t value) {
+    uint64_t base = default_;
+    return get(key, &base) && set(key, base + value);
+  }
+
+
+  // convenience functions for testing
+  void assert_set(const string& key, uint64_t value) {
+    assert(set(key, value));
+  }
+
+  void assert_remove(const string& key) {
+    assert(remove(key));
+  }
+
+  uint64_t assert_get(const string& key) {
+    uint64_t value = default_;
+    int result = get(key, &value);
+    assert(result);
+    if (result == 0) exit(1);  // Disable unused variable warning.
+    return value;
+  }
+
+  void assert_add(const string& key, uint64_t value) {
+    int result = add(key, value);
+    assert(result);
+    if (result == 0) exit(1);  // Disable unused variable warning.
+ } +}; + +// Implement 'add' directly with the new Merge operation +class MergeBasedCounters : public Counters { + private: + WriteOptions merge_option_; // for merge + + public: + explicit MergeBasedCounters(std::shared_ptr db, uint64_t defaultCount = 0) + : Counters(db, defaultCount), + merge_option_() { + } + + // mapped to a rocksdb Merge operation + virtual bool add(const string& key, uint64_t value) override { + char encoded[sizeof(uint64_t)]; + EncodeFixed64(encoded, value); + Slice slice(encoded, sizeof(uint64_t)); + auto s = db_->Merge(merge_option_, key, slice); + + if (s.ok()) { + return true; + } else { + cerr << s.ToString() << endl; + return false; + } + } +}; + +namespace { +void dumpDb(DB* db) { + auto it = unique_ptr(db->NewIterator(ReadOptions())); + for (it->SeekToFirst(); it->Valid(); it->Next()) { + uint64_t value = DecodeFixed64(it->value().data()); + cout << it->key().ToString() << ": " << value << endl; + } + assert(it->status().ok()); // Check for any errors found during the scan +} + +void testCounters(Counters& counters, DB* db, bool test_compaction) { + + FlushOptions o; + o.wait = true; + + counters.assert_set("a", 1); + + if (test_compaction) db->Flush(o); + + assert(counters.assert_get("a") == 1); + + counters.assert_remove("b"); + + // defaut value is 0 if non-existent + assert(counters.assert_get("b") == 0); + + counters.assert_add("a", 2); + + if (test_compaction) db->Flush(o); + + // 1+2 = 3 + assert(counters.assert_get("a")== 3); + + dumpDb(db); + + std::cout << "1\n"; + + // 1+...+49 = ? + uint64_t sum = 0; + for (int i = 1; i < 50; i++) { + counters.assert_add("b", i); + sum += i; + } + assert(counters.assert_get("b") == sum); + + std::cout << "2\n"; + dumpDb(db); + + std::cout << "3\n"; + + if (test_compaction) { + db->Flush(o); + + cout << "Compaction started ...\n"; + db->CompactRange(nullptr, nullptr); + cout << "Compaction ended\n"; + + dumpDb(db); + + assert(counters.assert_get("a")== 3); + assert(counters.assert_get("b") == sum); + } +} + +void testSuccessiveMerge( + Counters& counters, int max_num_merges, int num_merges) { + + counters.assert_remove("z"); + uint64_t sum = 0; + + for (int i = 1; i <= num_merges; ++i) { + resetNumMergeOperatorCalls(); + counters.assert_add("z", i); + sum += i; + + if (i % (max_num_merges + 1) == 0) { + assert(numMergeOperatorCalls == max_num_merges + 1); + } else { + assert(numMergeOperatorCalls == 0); + } + + resetNumMergeOperatorCalls(); + assert(counters.assert_get("z") == sum); + assert(numMergeOperatorCalls == i % (max_num_merges + 1)); + } +} + +void testPartialMerge(Counters* counters, DB* db, int max_merge, int min_merge, + int count) { + FlushOptions o; + o.wait = true; + + // Test case 1: partial merge should be called when the number of merge + // operands exceeds the threshold. + uint64_t tmp_sum = 0; + resetNumPartialMergeCalls(); + for (int i = 1; i <= count; i++) { + counters->assert_add("b", i); + tmp_sum += i; + } + db->Flush(o); + db->CompactRange(nullptr, nullptr); + ASSERT_EQ(tmp_sum, counters->assert_get("b")); + if (count > max_merge) { + // in this case, FullMerge should be called instead. + ASSERT_EQ(num_partial_merge_calls, 0); + } else { + // if count >= min_merge, then partial merge should be called once. + ASSERT_EQ((count >= min_merge), (num_partial_merge_calls == 1)); + } + + // Test case 2: partial merge should not be called when a put is found. 
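+  // (A Put supplies a complete base value, so compaction resolves the
+  // following operands against it with FullMerge; no operand-only run
+  // remains for PartialMerge/PartialMergeMulti to combine.)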
+ resetNumPartialMergeCalls(); + tmp_sum = 0; + db->Put(rocksdb::WriteOptions(), "c", "10"); + for (int i = 1; i <= count; i++) { + counters->assert_add("c", i); + tmp_sum += i; + } + db->Flush(o); + db->CompactRange(nullptr, nullptr); + ASSERT_EQ(tmp_sum, counters->assert_get("c")); + ASSERT_EQ(num_partial_merge_calls, 0); +} + +void testSingleBatchSuccessiveMerge( + DB* db, + int max_num_merges, + int num_merges) { + assert(num_merges > max_num_merges); + + Slice key("BatchSuccessiveMerge"); + uint64_t merge_value = 1; + Slice merge_value_slice((char *)&merge_value, sizeof(merge_value)); + + // Create the batch + WriteBatch batch; + for (int i = 0; i < num_merges; ++i) { + batch.Merge(key, merge_value_slice); + } + + // Apply to memtable and count the number of merges + resetNumMergeOperatorCalls(); + { + Status s = db->Write(WriteOptions(), &batch); + assert(s.ok()); + } + assert(numMergeOperatorCalls == + num_merges - (num_merges % (max_num_merges + 1))); + + // Get the value + resetNumMergeOperatorCalls(); + string get_value_str; + { + Status s = db->Get(ReadOptions(), key, &get_value_str); + assert(s.ok()); + } + assert(get_value_str.size() == sizeof(uint64_t)); + uint64_t get_value = DecodeFixed64(&get_value_str[0]); + ASSERT_EQ(get_value, num_merges * merge_value); + ASSERT_EQ(numMergeOperatorCalls, (num_merges % (max_num_merges + 1))); +} + +void runTest(int argc, const string& dbname, const bool use_ttl = false) { + auto db = OpenDb(dbname, use_ttl); + + { + cout << "Test read-modify-write counters... \n"; + Counters counters(db, 0); + testCounters(counters, db.get(), true); + } + + bool compact = false; + if (argc > 1) { + compact = true; + cout << "Turn on Compaction\n"; + } + + { + cout << "Test merge-based counters... \n"; + MergeBasedCounters counters(db, 0); + testCounters(counters, db.get(), compact); + } + + DestroyDB(dbname, Options()); + db.reset(); + + { + cout << "Test merge in memtable... \n"; + size_t max_merge = 5; + auto db = OpenDb(dbname, use_ttl, max_merge); + MergeBasedCounters counters(db, 0); + testCounters(counters, db.get(), compact); + testSuccessiveMerge(counters, max_merge, max_merge * 2); + testSingleBatchSuccessiveMerge(db.get(), 5, 7); + DestroyDB(dbname, Options()); + } + + { + cout << "Test Partial-Merge\n"; + size_t max_merge = 100; + for (uint32_t min_merge = 5; min_merge < 25; min_merge += 5) { + for (uint32_t count = min_merge - 1; count <= min_merge + 1; count++) { + auto db = OpenDb(dbname, use_ttl, max_merge, min_merge); + MergeBasedCounters counters(db, 0); + testPartialMerge(&counters, db.get(), max_merge, min_merge, count); + DestroyDB(dbname, Options()); + } + { + auto db = OpenDb(dbname, use_ttl, max_merge, min_merge); + MergeBasedCounters counters(db, 0); + testPartialMerge(&counters, db.get(), max_merge, min_merge, + min_merge * 10); + DestroyDB(dbname, Options()); + } + } + } +} +} // namespace + +int main(int argc, char *argv[]) { + //TODO: Make this test like a general rocksdb unit-test + runTest(argc, test::TmpDir() + "/merge_testdb"); + runTest(argc, test::TmpDir() + "/merge_testdbttl", true); // Run test on TTL database + printf("Passed all tests!\n"); + return 0; +} diff --git a/db/perf_context_test.cc b/db/perf_context_test.cc new file mode 100644 index 0000000000..a182fb5214 --- /dev/null +++ b/db/perf_context_test.cc @@ -0,0 +1,358 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+#include <algorithm>
+#include <iostream>
+#include <vector>
+#include "/usr/include/valgrind/callgrind.h"
+
+#include "rocksdb/db.h"
+#include "rocksdb/perf_context.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/memtablerep.h"
+#include "util/histogram.h"
+#include "util/stop_watch.h"
+#include "util/testharness.h"
+
+
+bool FLAGS_random_key = false;
+bool FLAGS_use_set_based_memetable = false;
+int FLAGS_total_keys = 100;
+int FLAGS_write_buffer_size = 1000000000;
+int FLAGS_max_write_buffer_number = 8;
+int FLAGS_min_write_buffer_number_to_merge = 7;
+
+// Path to the database on file system
+const std::string kDbName = rocksdb::test::TmpDir() + "/perf_context_test";
+
+namespace rocksdb {
+
+std::shared_ptr<DB> OpenDb() {
+  DB* db;
+  Options options;
+  options.create_if_missing = true;
+  options.write_buffer_size = FLAGS_write_buffer_size;
+  options.max_write_buffer_number = FLAGS_max_write_buffer_number;
+  options.min_write_buffer_number_to_merge =
+      FLAGS_min_write_buffer_number_to_merge;
+
+  if (FLAGS_use_set_based_memetable) {
+    auto prefix_extractor = rocksdb::NewFixedPrefixTransform(0);
+    options.memtable_factory.reset(
+        NewHashSkipListRepFactory(prefix_extractor));
+  }
+
+  Status s = DB::Open(options, kDbName, &db);
+  ASSERT_OK(s);
+  return std::shared_ptr<DB>(db);
+}
+
+class PerfContextTest { };
+
+TEST(PerfContextTest, SeekIntoDeletion) {
+  DestroyDB(kDbName, Options());
+  auto db = OpenDb();
+  WriteOptions write_options;
+  ReadOptions read_options;
+
+  for (int i = 0; i < FLAGS_total_keys; ++i) {
+    std::string key = "k" + std::to_string(i);
+    std::string value = "v" + std::to_string(i);
+
+    db->Put(write_options, key, value);
+  }
+
+  for (int i = 0; i < FLAGS_total_keys - 1; ++i) {
+    std::string key = "k" + std::to_string(i);
+    db->Delete(write_options, key);
+  }
+
+  HistogramImpl hist_get;
+  HistogramImpl hist_get_time;
+  for (int i = 0; i < FLAGS_total_keys - 1; ++i) {
+    std::string key = "k" + std::to_string(i);
+    std::string value;
+
+    perf_context.Reset();
+    StopWatchNano timer(Env::Default(), true);
+    auto status = db->Get(read_options, key, &value);
+    auto elapsed_nanos = timer.ElapsedNanos();
+    ASSERT_TRUE(status.IsNotFound());
+    hist_get.Add(perf_context.user_key_comparison_count);
+    hist_get_time.Add(elapsed_nanos);
+  }
+
+  std::cout << "Get user key comparison: \n" << hist_get.ToString()
+            << "Get time: \n" << hist_get_time.ToString();
+
+  HistogramImpl hist_seek_to_first;
+  std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+
+  perf_context.Reset();
+  StopWatchNano timer(Env::Default(), true);
+  iter->SeekToFirst();
+  hist_seek_to_first.Add(perf_context.user_key_comparison_count);
+  auto elapsed_nanos = timer.ElapsedNanos();
+
+  std::cout << "SeekToFirst user key comparison: \n"
+            << hist_seek_to_first.ToString()
+            << "ikey skipped: " << perf_context.internal_key_skipped_count << "\n"
+            << "idelete skipped: " << perf_context.internal_delete_skipped_count << "\n"
+            << "elapsed: " << elapsed_nanos << "\n";
+
+  HistogramImpl hist_seek;
+  for (int i = 0; i < FLAGS_total_keys; ++i) {
+    std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+    std::string key = "k" + std::to_string(i);
+
+    perf_context.Reset();
+    StopWatchNano timer(Env::Default(), true);
+    iter->Seek(key);
+    auto elapsed_nanos = timer.ElapsedNanos();
+    hist_seek.Add(perf_context.user_key_comparison_count);
+    std::cout << "seek cmp: " << perf_context.user_key_comparison_count
+              << " ikey skipped " <<
perf_context.internal_key_skipped_count + << " idelete skipped " << perf_context.internal_delete_skipped_count + << " elapsed: " << elapsed_nanos << "ns\n"; + + perf_context.Reset(); + ASSERT_TRUE(iter->Valid()); + StopWatchNano timer2(Env::Default(), true); + iter->Next(); + auto elapsed_nanos2 = timer2.ElapsedNanos(); + std::cout << "next cmp: " << perf_context.user_key_comparison_count + << "elapsed: " << elapsed_nanos2 << "ns\n"; + } + + std::cout << "Seek uesr key comparison: \n" << hist_seek.ToString(); +} + +TEST(PerfContextTest, StopWatchNanoOverhead) { + // profile the timer cost by itself! + const int kTotalIterations = 1000000; + std::vector timings(kTotalIterations); + + StopWatchNano timer(Env::Default(), true); + for (auto& timing : timings) { + timing = timer.ElapsedNanos(true /* reset */); + } + + HistogramImpl histogram; + for (const auto timing : timings) { + histogram.Add(timing); + } + + std::cout << histogram.ToString(); +} + +TEST(PerfContextTest, StopWatchOverhead) { + // profile the timer cost by itself! + const int kTotalIterations = 1000000; + std::vector timings(kTotalIterations); + + StopWatch timer(Env::Default()); + for (auto& timing : timings) { + timing = timer.ElapsedMicros(); + } + + HistogramImpl histogram; + uint64_t prev_timing = 0; + for (const auto timing : timings) { + histogram.Add(timing - prev_timing); + prev_timing = timing; + } + + std::cout << histogram.ToString(); +} + +void ProfileKeyComparison() { + DestroyDB(kDbName, Options()); // Start this test with a fresh DB + + auto db = OpenDb(); + + WriteOptions write_options; + ReadOptions read_options; + + HistogramImpl hist_put; + HistogramImpl hist_get; + HistogramImpl hist_get_snapshot; + HistogramImpl hist_get_memtable; + HistogramImpl hist_get_post_process; + HistogramImpl hist_num_memtable_checked; + HistogramImpl hist_write_pre_post; + HistogramImpl hist_write_wal_time; + HistogramImpl hist_write_memtable_time; + + std::cout << "Inserting " << FLAGS_total_keys << " key/value pairs\n...\n"; + + std::vector keys; + for (int i = 0; i < FLAGS_total_keys; ++i) { + keys.push_back(i); + } + + if (FLAGS_random_key) { + std::random_shuffle(keys.begin(), keys.end()); + } + + for (const int i : keys) { + std::string key = "k" + std::to_string(i); + std::string value = "v" + std::to_string(i); + + perf_context.Reset(); + db->Put(write_options, key, value); + hist_write_pre_post.Add(perf_context.write_pre_and_post_process_time); + hist_write_wal_time.Add(perf_context.write_wal_time); + hist_write_memtable_time.Add(perf_context.write_memtable_time); + hist_put.Add(perf_context.user_key_comparison_count); + + perf_context.Reset(); + db->Get(read_options, key, &value); + hist_get_snapshot.Add(perf_context.get_snapshot_time); + hist_get_memtable.Add(perf_context.get_from_memtable_time); + hist_num_memtable_checked.Add(perf_context.get_from_memtable_count); + hist_get_post_process.Add(perf_context.get_post_process_time); + hist_get.Add(perf_context.user_key_comparison_count); + } + + std::cout << "Put uesr key comparison: \n" << hist_put.ToString() + << "Get uesr key comparison: \n" << hist_get.ToString(); + std::cout << "Put(): Pre and Post Process Time: \n" + << hist_write_pre_post.ToString() + << " Writing WAL time: \n" + << hist_write_wal_time.ToString() << "\n" + << " Writing Mem Table time: \n" + << hist_write_memtable_time.ToString() << "\n"; + + std::cout << "Get(): Time to get snapshot: \n" + << hist_get_snapshot.ToString() + << " Time to get value from memtables: \n" + << 
hist_get_memtable.ToString() << "\n" + << " Number of memtables checked: \n" + << hist_num_memtable_checked.ToString() << "\n" + << " Time to post process: \n" + << hist_get_post_process.ToString() << "\n"; +} + +TEST(PerfContextTest, KeyComparisonCount) { + SetPerfLevel(kEnableCount); + ProfileKeyComparison(); + + SetPerfLevel(kDisable); + ProfileKeyComparison(); + + SetPerfLevel(kEnableTime); + ProfileKeyComparison(); +} + +// make perf_context_test +// export ROCKSDB_TESTS=PerfContextTest.SeekKeyComparison +// For one memtable: +// ./perf_context_test --write_buffer_size=500000 --total_keys=10000 +// For two memtables: +// ./perf_context_test --write_buffer_size=250000 --total_keys=10000 +// Specify --random_key=1 to shuffle the key before insertion +// Results show that, for sequential insertion, worst-case Seek Key comparison +// is close to the total number of keys (linear), when there is only one +// memtable. When there are two memtables, even the avg Seek Key comparison +// starts to become linear to the input size. + +TEST(PerfContextTest, SeekKeyComparison) { + DestroyDB(kDbName, Options()); + auto db = OpenDb(); + WriteOptions write_options; + ReadOptions read_options; + + std::cout << "Inserting " << FLAGS_total_keys << " key/value pairs\n...\n"; + + std::vector keys; + for (int i = 0; i < FLAGS_total_keys; ++i) { + keys.push_back(i); + } + + if (FLAGS_random_key) { + std::random_shuffle(keys.begin(), keys.end()); + } + + HistogramImpl hist_put_time; + HistogramImpl hist_wal_time; + HistogramImpl hist_time_diff; + + SetPerfLevel(kEnableTime); + StopWatchNano timer(Env::Default()); + for (const int i : keys) { + std::string key = "k" + std::to_string(i); + std::string value = "v" + std::to_string(i); + + perf_context.Reset(); + timer.Start(); + db->Put(write_options, key, value); + auto put_time = timer.ElapsedNanos(); + hist_put_time.Add(put_time); + hist_wal_time.Add(perf_context.write_wal_time); + hist_time_diff.Add(put_time - perf_context.write_wal_time); + } + + std::cout << "Put time:\n" << hist_put_time.ToString() + << "WAL time:\n" << hist_wal_time.ToString() + << "time diff:\n" << hist_time_diff.ToString(); + + HistogramImpl hist_seek; + HistogramImpl hist_next; + + for (int i = 0; i < FLAGS_total_keys; ++i) { + std::string key = "k" + std::to_string(i); + std::string value = "v" + std::to_string(i); + + std::unique_ptr iter(db->NewIterator(read_options)); + perf_context.Reset(); + iter->Seek(key); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->value().ToString(), value); + hist_seek.Add(perf_context.user_key_comparison_count); + } + + std::unique_ptr iter(db->NewIterator(read_options)); + for (iter->SeekToFirst(); iter->Valid();) { + perf_context.Reset(); + iter->Next(); + hist_next.Add(perf_context.user_key_comparison_count); + } + + std::cout << "Seek:\n" << hist_seek.ToString() + << "Next:\n" << hist_next.ToString(); +} + +} + +int main(int argc, char** argv) { + + for (int i = 1; i < argc; i++) { + int n; + char junk; + + if (sscanf(argv[i], "--write_buffer_size=%d%c", &n, &junk) == 1) { + FLAGS_write_buffer_size = n; + } + + if (sscanf(argv[i], "--total_keys=%d%c", &n, &junk) == 1) { + FLAGS_total_keys = n; + } + + if (sscanf(argv[i], "--random_key=%d%c", &n, &junk) == 1 && + (n == 0 || n == 1)) { + FLAGS_random_key = n; + } + + if (sscanf(argv[i], "--use_set_based_memetable=%d%c", &n, &junk) == 1 && + (n == 0 || n == 1)) { + FLAGS_use_set_based_memetable = n; + } + + } + + std::cout << kDbName << "\n"; + + rocksdb::test::RunAllTests(); + return 0; +} diff --git 
a/db/plain_table_db_test.cc b/db/plain_table_db_test.cc
new file mode 100644
index 0000000000..17e3e61d89
--- /dev/null
+++ b/db/plain_table_db_test.cc
@@ -0,0 +1,853 @@
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include <algorithm>
+#include <set>
+
+#include "db/db_impl.h"
+#include "db/filename.h"
+#include "db/version_set.h"
+#include "db/write_batch_internal.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/table.h"
+#include "table/meta_blocks.h"
+#include "table/plain_table_factory.h"
+#include "table/plain_table_reader.h"
+#include "util/hash.h"
+#include "util/logging.h"
+#include "util/mutexlock.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+#include "utilities/merge_operators.h"
+
+using std::unique_ptr;
+
+namespace rocksdb {
+
+class PlainTableDBTest {
+ protected:
+ private:
+  std::string dbname_;
+  Env* env_;
+  DB* db_;
+
+  Options last_options_;
+
+ public:
+  PlainTableDBTest() : env_(Env::Default()) {
+    dbname_ = test::TmpDir() + "/plain_table_db_test";
+    ASSERT_OK(DestroyDB(dbname_, Options()));
+    db_ = nullptr;
+    Reopen();
+  }
+
+  ~PlainTableDBTest() {
+    delete db_;
+    ASSERT_OK(DestroyDB(dbname_, Options()));
+  }
+
+  // Return the current option configuration.
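+  // (Sketch of how the NewPlainTableFactory arguments below appear to map
+  // to its declaration in rocksdb/table.h: user_key_len = 16 bytes,
+  // bloom_bits_per_key = 2, hash_table_ratio = 0.8, index_sparseness = 3.
+  // The 8-byte fixed prefix extractor must not exceed the key length.)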
+  Options CurrentOptions() {
+    Options options;
+    options.table_factory.reset(NewPlainTableFactory(16, 2, 0.8, 3));
+    options.prefix_extractor.reset(NewFixedPrefixTransform(8));
+    options.allow_mmap_reads = true;
+    return options;
+  }
+
+  DBImpl* dbfull() {
+    return reinterpret_cast<DBImpl*>(db_);
+  }
+
+  void Reopen(Options* options = nullptr) {
+    ASSERT_OK(TryReopen(options));
+  }
+
+  void Close() {
+    delete db_;
+    db_ = nullptr;
+  }
+
+  void DestroyAndReopen(Options* options = nullptr) {
+    // Destroy using last options
+    Destroy(&last_options_);
+    ASSERT_OK(TryReopen(options));
+  }
+
+  void Destroy(Options* options) {
+    delete db_;
+    db_ = nullptr;
+    ASSERT_OK(DestroyDB(dbname_, *options));
+  }
+
+  Status PureReopen(Options* options, DB** db) {
+    return DB::Open(*options, dbname_, db);
+  }
+
+  Status TryReopen(Options* options = nullptr) {
+    delete db_;
+    db_ = nullptr;
+    Options opts;
+    if (options != nullptr) {
+      opts = *options;
+    } else {
+      opts = CurrentOptions();
+      opts.create_if_missing = true;
+    }
+    last_options_ = opts;
+
+    return DB::Open(opts, dbname_, &db_);
+  }
+
+  Status Put(const Slice& k, const Slice& v) {
+    return db_->Put(WriteOptions(), k, v);
+  }
+
+  Status Delete(const std::string& k) {
+    return db_->Delete(WriteOptions(), k);
+  }
+
+  std::string Get(const std::string& k, const Snapshot* snapshot = nullptr) {
+    ReadOptions options;
+    options.snapshot = snapshot;
+    std::string result;
+    Status s = db_->Get(options, k, &result);
+    if (s.IsNotFound()) {
+      result = "NOT_FOUND";
+    } else if (!s.ok()) {
+      result = s.ToString();
+    }
+    return result;
+  }
+
+
+  int NumTableFilesAtLevel(int level) {
+    std::string property;
+    ASSERT_TRUE(
+        db_->GetProperty("rocksdb.num-files-at-level" + NumberToString(level),
+                         &property));
+    return atoi(property.c_str());
+  }
+
+  // Return spread of files per level
+  std::string FilesPerLevel() {
+    std::string result;
+    int last_non_zero_offset = 0;
+    for (int level = 0; level < db_->NumberLevels(); level++) {
+      int f = NumTableFilesAtLevel(level);
+      char buf[100];
+      snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f);
+      result += buf;
+      if (f > 0) {
+        last_non_zero_offset = result.size();
+      }
+    }
+    result.resize(last_non_zero_offset);
+    return result;
+  }
+
+  std::string IterStatus(Iterator* iter) {
+    std::string result;
+    if (iter->Valid()) {
+      result = iter->key().ToString() + "->" + iter->value().ToString();
+    } else {
+      result = "(invalid)";
+    }
+    return result;
+  }
+};
+
+TEST(PlainTableDBTest, Empty) {
+  ASSERT_TRUE(dbfull() != nullptr);
+  ASSERT_EQ("NOT_FOUND", Get("0000000000000foo"));
+}
+
+class TestPlainTableReader : public PlainTableReader {
+ public:
+  TestPlainTableReader(const EnvOptions& storage_options,
+                       const InternalKeyComparator& icomparator,
+                       uint64_t file_size, int bloom_bits_per_key,
+                       double hash_table_ratio, size_t index_sparseness,
+                       const TableProperties* table_properties,
+                       unique_ptr<RandomAccessFile>&& file,
+                       const Options& options, bool* expect_bloom_not_match)
+      : PlainTableReader(options, std::move(file), storage_options, icomparator,
+                         file_size, bloom_bits_per_key, hash_table_ratio,
+                         index_sparseness, table_properties, 2 * 1024 * 1024),
+        expect_bloom_not_match_(expect_bloom_not_match) {
+    Status s = PopulateIndex(const_cast<TableProperties*>(table_properties));
+    ASSERT_TRUE(s.ok());
+  }
+
+  virtual ~TestPlainTableReader() {}
+
+ private:
+  virtual bool MatchBloom(uint32_t hash) const override {
+    bool ret = PlainTableReader::MatchBloom(hash);
+    ASSERT_TRUE(!*expect_bloom_not_match_ || !ret);
+    return ret;
+  }
+  bool* expect_bloom_not_match_;
+};
+
+extern const uint64_t kPlainTableMagicNumber;
+class TestPlainTableFactory : public PlainTableFactory {
+ public:
+  explicit TestPlainTableFactory(bool* expect_bloom_not_match,
+                                 uint32_t user_key_len, int bloom_bits_per_key,
+                                 double hash_table_ratio,
+                                 size_t index_sparseness,
+                                 size_t huge_page_tlb_size)
+      : PlainTableFactory(user_key_len, user_key_len, hash_table_ratio,
+                          index_sparseness, huge_page_tlb_size),
+        bloom_bits_per_key_(bloom_bits_per_key),
+        hash_table_ratio_(hash_table_ratio),
+        index_sparseness_(index_sparseness),
+        expect_bloom_not_match_(expect_bloom_not_match) {}
+
+  Status NewTableReader(const Options& options, const EnvOptions& soptions,
+                        const InternalKeyComparator& internal_comparator,
+                        unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
+                        unique_ptr<TableReader>* table) const override {
+    TableProperties* props = nullptr;
+    auto s = ReadTableProperties(file.get(), file_size, kPlainTableMagicNumber,
+                                 options.env, options.info_log.get(), &props);
+    ASSERT_TRUE(s.ok());
+
+    std::unique_ptr<PlainTableReader> new_reader(new TestPlainTableReader(
+        soptions, internal_comparator, file_size, bloom_bits_per_key_,
+        hash_table_ratio_, index_sparseness_, props, std::move(file), options,
+        expect_bloom_not_match_));
+
+    *table = std::move(new_reader);
+    return s;
+  }
+
+ private:
+  int bloom_bits_per_key_;
+  double hash_table_ratio_;
+  size_t index_sparseness_;
+  bool* expect_bloom_not_match_;
+};
+
+TEST(PlainTableDBTest, Flush) {
+  for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024;
+       huge_page_tlb_size += 2 * 1024 * 1024) {
+    for (int bloom_bits = 0; bloom_bits <= 117; bloom_bits += 117) {
+      for (int total_order = 0; total_order <= 1; total_order++) {
+        Options options = CurrentOptions();
+        options.create_if_missing = true;
+        // Set only one bucket to force bucket conflict.
+ // Test index interval for the same prefix to be 1, 2 and 4 + if (total_order) { + options.table_factory.reset(NewTotalOrderPlainTableFactory( + 16, bloom_bits, 2, huge_page_tlb_size)); + } else { + options.table_factory.reset(NewPlainTableFactory( + 16, bloom_bits, 0.75, 16, huge_page_tlb_size)); + } + DestroyAndReopen(&options); + + ASSERT_OK(Put("1000000000000foo", "v1")); + ASSERT_OK(Put("0000000000000bar", "v2")); + ASSERT_OK(Put("1000000000000foo", "v3")); + dbfull()->TEST_FlushMemTable(); + + TablePropertiesCollection ptc; + reinterpret_cast(dbfull())->GetPropertiesOfAllTables(&ptc); + ASSERT_EQ(1U, ptc.size()); + auto row = ptc.begin(); + auto tp = row->second; + ASSERT_EQ(total_order ? "4" : "12", (tp->user_collected_properties).at( + "plain_table_hash_table_size")); + ASSERT_EQ(total_order ? "9" : "0", (tp->user_collected_properties).at( + "plain_table_sub_index_size")); + + ASSERT_EQ("v3", Get("1000000000000foo")); + ASSERT_EQ("v2", Get("0000000000000bar")); + } + } + } +} + +TEST(PlainTableDBTest, Flush2) { + for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024; + huge_page_tlb_size += 2 * 1024 * 1024) { + for (int bloom_bits = 0; bloom_bits <= 117; bloom_bits += 117) { + for (int total_order = 0; total_order <= 1; total_order++) { + bool expect_bloom_not_match = false; + Options options = CurrentOptions(); + options.create_if_missing = true; + // Set only one bucket to force bucket conflict. + // Test index interval for the same prefix to be 1, 2 and 4 + if (total_order) { + options.prefix_extractor = nullptr; + options.table_factory.reset( + new TestPlainTableFactory(&expect_bloom_not_match, 16, bloom_bits, + 0, 2, huge_page_tlb_size)); + } else { + options.table_factory.reset( + new TestPlainTableFactory(&expect_bloom_not_match, 16, bloom_bits, + 0.75, 16, huge_page_tlb_size)); + } + DestroyAndReopen(&options); + ASSERT_OK(Put("0000000000000bar", "b")); + ASSERT_OK(Put("1000000000000foo", "v1")); + dbfull()->TEST_FlushMemTable(); + + ASSERT_OK(Put("1000000000000foo", "v2")); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("v2", Get("1000000000000foo")); + + ASSERT_OK(Put("0000000000000eee", "v3")); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("v3", Get("0000000000000eee")); + + ASSERT_OK(Delete("0000000000000bar")); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("NOT_FOUND", Get("0000000000000bar")); + + ASSERT_OK(Put("0000000000000eee", "v5")); + ASSERT_OK(Put("9000000000000eee", "v5")); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("v5", Get("0000000000000eee")); + + // Test Bloom Filter + if (bloom_bits > 0) { + // Neither key nor value should exist. + expect_bloom_not_match = true; + ASSERT_EQ("NOT_FOUND", Get("5_not00000000bar")); + + // Key doesn't exist any more but prefix exists. + if (total_order) { + ASSERT_EQ("NOT_FOUND", Get("1000000000000not")); + ASSERT_EQ("NOT_FOUND", Get("0000000000000not")); + } + expect_bloom_not_match = false; + } + } + } + } +} + +TEST(PlainTableDBTest, Iterator) { + for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024; + huge_page_tlb_size += 2 * 1024 * 1024) { + for (int bloom_bits = 0; bloom_bits <= 117; bloom_bits += 117) { + for (int total_order = 0; total_order <= 1; total_order++) { + bool expect_bloom_not_match = false; + Options options = CurrentOptions(); + options.create_if_missing = true; + // Set only one bucket to force bucket conflict. 
+ // Test index interval for the same prefix to be 1, 2 and 4 + if (total_order) { + options.prefix_extractor = nullptr; + options.table_factory.reset( + new TestPlainTableFactory(&expect_bloom_not_match, 16, bloom_bits, + 0, 2, huge_page_tlb_size)); + } else { + options.table_factory.reset( + new TestPlainTableFactory(&expect_bloom_not_match, 16, bloom_bits, + 0.75, 16, huge_page_tlb_size)); + } + DestroyAndReopen(&options); + + ASSERT_OK(Put("1000000000foo002", "v_2")); + ASSERT_OK(Put("0000000000000bar", "random")); + ASSERT_OK(Put("1000000000foo001", "v1")); + ASSERT_OK(Put("3000000000000bar", "bar_v")); + ASSERT_OK(Put("1000000000foo003", "v__3")); + ASSERT_OK(Put("1000000000foo004", "v__4")); + ASSERT_OK(Put("1000000000foo005", "v__5")); + ASSERT_OK(Put("1000000000foo007", "v__7")); + ASSERT_OK(Put("1000000000foo008", "v__8")); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("v1", Get("1000000000foo001")); + ASSERT_EQ("v__3", Get("1000000000foo003")); + Iterator* iter = dbfull()->NewIterator(ReadOptions()); + iter->Seek("1000000000foo000"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo001", iter->key().ToString()); + ASSERT_EQ("v1", iter->value().ToString()); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo002", iter->key().ToString()); + ASSERT_EQ("v_2", iter->value().ToString()); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo003", iter->key().ToString()); + ASSERT_EQ("v__3", iter->value().ToString()); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo004", iter->key().ToString()); + ASSERT_EQ("v__4", iter->value().ToString()); + + iter->Seek("3000000000000bar"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("3000000000000bar", iter->key().ToString()); + ASSERT_EQ("bar_v", iter->value().ToString()); + + iter->Seek("1000000000foo000"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo001", iter->key().ToString()); + ASSERT_EQ("v1", iter->value().ToString()); + + iter->Seek("1000000000foo005"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo005", iter->key().ToString()); + ASSERT_EQ("v__5", iter->value().ToString()); + + iter->Seek("1000000000foo006"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo007", iter->key().ToString()); + ASSERT_EQ("v__7", iter->value().ToString()); + + iter->Seek("1000000000foo008"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo008", iter->key().ToString()); + ASSERT_EQ("v__8", iter->value().ToString()); + + if (total_order == 0) { + iter->Seek("1000000000foo009"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("3000000000000bar", iter->key().ToString()); + } + + // Test Bloom Filter + if (bloom_bits > 0) { + if (!total_order) { + // Neither key nor value should exist. 
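+            // (While expect_bloom_not_match is true, TestPlainTableReader's
+            // MatchBloom override asserts that every bloom probe reports a
+            // miss, so any false positive fails the test immediately.)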
+ expect_bloom_not_match = true; + iter->Seek("2not000000000bar"); + ASSERT_TRUE(!iter->Valid()); + ASSERT_EQ("NOT_FOUND", Get("2not000000000bar")); + expect_bloom_not_match = false; + } else { + expect_bloom_not_match = true; + ASSERT_EQ("NOT_FOUND", Get("2not000000000bar")); + expect_bloom_not_match = false; + } + } + + delete iter; + } + } + } +} + +namespace { +std::string MakeLongKey(size_t length, char c) { + return std::string(length, c); +} +} // namespace + +TEST(PlainTableDBTest, IteratorLargeKeys) { + Options options = CurrentOptions(); + options.table_factory.reset(NewTotalOrderPlainTableFactory(0, 0, 16)); + options.create_if_missing = true; + options.prefix_extractor.reset(); + DestroyAndReopen(&options); + + std::string key_list[] = { + MakeLongKey(30, '0'), + MakeLongKey(16, '1'), + MakeLongKey(32, '2'), + MakeLongKey(60, '3'), + MakeLongKey(90, '4'), + MakeLongKey(50, '5'), + MakeLongKey(26, '6') + }; + + for (size_t i = 0; i < 7; i++) { + ASSERT_OK(Put(key_list[i], std::to_string(i))); + } + + dbfull()->TEST_FlushMemTable(); + + Iterator* iter = dbfull()->NewIterator(ReadOptions()); + iter->Seek(key_list[0]); + + for (size_t i = 0; i < 7; i++) { + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(key_list[i], iter->key().ToString()); + ASSERT_EQ(std::to_string(i), iter->value().ToString()); + iter->Next(); + } + + ASSERT_TRUE(!iter->Valid()); + + delete iter; +} + +// A test comparator which compare two strings in this way: +// (1) first compare prefix of 8 bytes in alphabet order, +// (2) if two strings share the same prefix, sort the other part of the string +// in the reverse alphabet order. +class SimpleSuffixReverseComparator : public Comparator { + public: + SimpleSuffixReverseComparator() {} + + virtual const char* Name() const { return "SimpleSuffixReverseComparator"; } + + virtual int Compare(const Slice& a, const Slice& b) const { + Slice prefix_a = Slice(a.data(), 8); + Slice prefix_b = Slice(b.data(), 8); + int prefix_comp = prefix_a.compare(prefix_b); + if (prefix_comp != 0) { + return prefix_comp; + } else { + Slice suffix_a = Slice(a.data() + 8, a.size() - 8); + Slice suffix_b = Slice(b.data() + 8, b.size() - 8); + return -(suffix_a.compare(suffix_b)); + } + } + virtual void FindShortestSeparator(std::string* start, + const Slice& limit) const {} + + virtual void FindShortSuccessor(std::string* key) const {} +}; + +TEST(PlainTableDBTest, IteratorReverseSuffixComparator) { + Options options = CurrentOptions(); + options.create_if_missing = true; + // Set only one bucket to force bucket conflict. 
+  // Test index interval for the same prefix to be 1, 2 and 4
+  SimpleSuffixReverseComparator comp;
+  options.comparator = &comp;
+  DestroyAndReopen(&options);
+
+  ASSERT_OK(Put("1000000000foo002", "v_2"));
+  ASSERT_OK(Put("0000000000000bar", "random"));
+  ASSERT_OK(Put("1000000000foo001", "v1"));
+  ASSERT_OK(Put("3000000000000bar", "bar_v"));
+  ASSERT_OK(Put("1000000000foo003", "v__3"));
+  ASSERT_OK(Put("1000000000foo004", "v__4"));
+  ASSERT_OK(Put("1000000000foo005", "v__5"));
+  ASSERT_OK(Put("1000000000foo007", "v__7"));
+  ASSERT_OK(Put("1000000000foo008", "v__8"));
+  dbfull()->TEST_FlushMemTable();
+  ASSERT_EQ("v1", Get("1000000000foo001"));
+  ASSERT_EQ("v__3", Get("1000000000foo003"));
+  Iterator* iter = dbfull()->NewIterator(ReadOptions());
+  iter->Seek("1000000000foo009");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("1000000000foo008", iter->key().ToString());
+  ASSERT_EQ("v__8", iter->value().ToString());
+
+  iter->Next();
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("1000000000foo007", iter->key().ToString());
+  ASSERT_EQ("v__7", iter->value().ToString());
+
+  iter->Next();
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("1000000000foo005", iter->key().ToString());
+  ASSERT_EQ("v__5", iter->value().ToString());
+
+  iter->Next();
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("1000000000foo004", iter->key().ToString());
+  ASSERT_EQ("v__4", iter->value().ToString());
+
+  iter->Seek("3000000000000bar");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("3000000000000bar", iter->key().ToString());
+  ASSERT_EQ("bar_v", iter->value().ToString());
+
+  iter->Seek("1000000000foo005");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("1000000000foo005", iter->key().ToString());
+  ASSERT_EQ("v__5", iter->value().ToString());
+
+  iter->Seek("1000000000foo006");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("1000000000foo005", iter->key().ToString());
+  ASSERT_EQ("v__5", iter->value().ToString());
+
+  iter->Seek("1000000000foo008");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("1000000000foo008", iter->key().ToString());
+  ASSERT_EQ("v__8", iter->value().ToString());
+
+  iter->Seek("1000000000foo000");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("3000000000000bar", iter->key().ToString());
+
+  delete iter;
+}
+
+TEST(PlainTableDBTest, HashBucketConflict) {
+  for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024;
+       huge_page_tlb_size += 2 * 1024 * 1024) {
+    for (unsigned char i = 1; i <= 3; i++) {
+      Options options = CurrentOptions();
+      options.create_if_missing = true;
+      // Set only one bucket to force bucket conflict.
+      // Test index interval for the same prefix to be 1, 2 and 4
+      options.table_factory.reset(
+          NewTotalOrderPlainTableFactory(16, 0, 2 ^ i, huge_page_tlb_size));
+      DestroyAndReopen(&options);
+      ASSERT_OK(Put("5000000000000fo0", "v1"));
+      ASSERT_OK(Put("5000000000000fo1", "v2"));
+      ASSERT_OK(Put("5000000000000fo2", "v"));
+      ASSERT_OK(Put("2000000000000fo0", "v3"));
+      ASSERT_OK(Put("2000000000000fo1", "v4"));
+      ASSERT_OK(Put("2000000000000fo2", "v"));
+      ASSERT_OK(Put("2000000000000fo3", "v"));
+
+      dbfull()->TEST_FlushMemTable();
+
+      ASSERT_EQ("v1", Get("5000000000000fo0"));
+      ASSERT_EQ("v2", Get("5000000000000fo1"));
+      ASSERT_EQ("v3", Get("2000000000000fo0"));
+      ASSERT_EQ("v4", Get("2000000000000fo1"));
+
+      ASSERT_EQ("NOT_FOUND", Get("5000000000000bar"));
+      ASSERT_EQ("NOT_FOUND", Get("2000000000000bar"));
+      ASSERT_EQ("NOT_FOUND", Get("5000000000000fo8"));
+      ASSERT_EQ("NOT_FOUND", Get("2000000000000fo8"));
+
+      ReadOptions ro;
+      Iterator* iter = dbfull()->NewIterator(ro);
+
+      iter->Seek("5000000000000fo0");
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("5000000000000fo0", iter->key().ToString());
+      iter->Next();
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("5000000000000fo1", iter->key().ToString());
+
+      iter->Seek("5000000000000fo1");
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("5000000000000fo1", iter->key().ToString());
+
+      iter->Seek("2000000000000fo0");
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("2000000000000fo0", iter->key().ToString());
+      iter->Next();
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("2000000000000fo1", iter->key().ToString());
+
+      iter->Seek("2000000000000fo1");
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("2000000000000fo1", iter->key().ToString());
+
+      iter->Seek("2000000000000bar");
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("2000000000000fo0", iter->key().ToString());
+
+      iter->Seek("5000000000000bar");
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("5000000000000fo0", iter->key().ToString());
+
+      iter->Seek("2000000000000fo8");
+      ASSERT_TRUE(!iter->Valid() ||
+                  options.comparator->Compare(iter->key(), "20000001") > 0);
+
+      iter->Seek("5000000000000fo8");
+      ASSERT_TRUE(!iter->Valid());
+
+      iter->Seek("1000000000000fo2");
+      ASSERT_TRUE(!iter->Valid());
+
+      iter->Seek("3000000000000fo2");
+      ASSERT_TRUE(!iter->Valid());
+
+      iter->Seek("8000000000000fo2");
+      ASSERT_TRUE(!iter->Valid());
+
+      delete iter;
+    }
+  }
+}
+
+TEST(PlainTableDBTest, HashBucketConflictReverseSuffixComparator) {
+  for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024;
+       huge_page_tlb_size += 2 * 1024 * 1024) {
+    for (unsigned char i = 1; i <= 3; i++) {
+      Options options = CurrentOptions();
+      options.create_if_missing = true;
+      SimpleSuffixReverseComparator comp;
+      options.comparator = &comp;
+      // Set only one bucket to force bucket conflict.
+ // Test index interval for the same prefix to be 1, 2 and 4 + options.table_factory.reset( + NewTotalOrderPlainTableFactory(16, 0, 2 ^ i, huge_page_tlb_size)); + DestroyAndReopen(&options); + ASSERT_OK(Put("5000000000000fo0", "v1")); + ASSERT_OK(Put("5000000000000fo1", "v2")); + ASSERT_OK(Put("5000000000000fo2", "v")); + ASSERT_OK(Put("2000000000000fo0", "v3")); + ASSERT_OK(Put("2000000000000fo1", "v4")); + ASSERT_OK(Put("2000000000000fo2", "v")); + ASSERT_OK(Put("2000000000000fo3", "v")); + + dbfull()->TEST_FlushMemTable(); + + ASSERT_EQ("v1", Get("5000000000000fo0")); + ASSERT_EQ("v2", Get("5000000000000fo1")); + ASSERT_EQ("v3", Get("2000000000000fo0")); + ASSERT_EQ("v4", Get("2000000000000fo1")); + + ASSERT_EQ("NOT_FOUND", Get("5000000000000bar")); + ASSERT_EQ("NOT_FOUND", Get("2000000000000bar")); + ASSERT_EQ("NOT_FOUND", Get("5000000000000fo8")); + ASSERT_EQ("NOT_FOUND", Get("2000000000000fo8")); + + ReadOptions ro; + Iterator* iter = dbfull()->NewIterator(ro); + + iter->Seek("5000000000000fo1"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("5000000000000fo1", iter->key().ToString()); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("5000000000000fo0", iter->key().ToString()); + + iter->Seek("5000000000000fo1"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("5000000000000fo1", iter->key().ToString()); + + iter->Seek("2000000000000fo1"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("2000000000000fo1", iter->key().ToString()); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("2000000000000fo0", iter->key().ToString()); + + iter->Seek("2000000000000fo1"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("2000000000000fo1", iter->key().ToString()); + + iter->Seek("2000000000000var"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("2000000000000fo3", iter->key().ToString()); + + iter->Seek("5000000000000var"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("5000000000000fo2", iter->key().ToString()); + + std::string seek_key = "2000000000000bar"; + iter->Seek(seek_key); + ASSERT_TRUE(!iter->Valid() || + options.prefix_extractor->Transform(iter->key()) != + options.prefix_extractor->Transform(seek_key)); + + iter->Seek("1000000000000fo2"); + ASSERT_TRUE(!iter->Valid()); + + iter->Seek("3000000000000fo2"); + ASSERT_TRUE(!iter->Valid()); + + iter->Seek("8000000000000fo2"); + ASSERT_TRUE(!iter->Valid()); + + delete iter; + } + } +} + +TEST(PlainTableDBTest, NonExistingKeyToNonEmptyBucket) { + Options options = CurrentOptions(); + options.create_if_missing = true; + // Set only one bucket to force bucket conflict. 
+  // Test index interval for the same prefix to be 1, 2 and 4
+  options.table_factory.reset(NewTotalOrderPlainTableFactory(16, 0, 5));
+  DestroyAndReopen(&options);
+  ASSERT_OK(Put("5000000000000fo0", "v1"));
+  ASSERT_OK(Put("5000000000000fo1", "v2"));
+  ASSERT_OK(Put("5000000000000fo2", "v3"));
+
+  dbfull()->TEST_FlushMemTable();
+
+  ASSERT_EQ("v1", Get("5000000000000fo0"));
+  ASSERT_EQ("v2", Get("5000000000000fo1"));
+  ASSERT_EQ("v3", Get("5000000000000fo2"));
+
+  ASSERT_EQ("NOT_FOUND", Get("8000000000000bar"));
+  ASSERT_EQ("NOT_FOUND", Get("1000000000000bar"));
+
+  Iterator* iter = dbfull()->NewIterator(ReadOptions());
+
+  iter->Seek("5000000000000bar");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("5000000000000fo0", iter->key().ToString());
+
+  iter->Seek("5000000000000fo8");
+  ASSERT_TRUE(!iter->Valid());
+
+  iter->Seek("1000000000000fo2");
+  ASSERT_TRUE(!iter->Valid());
+
+  iter->Seek("8000000000000fo2");
+  ASSERT_TRUE(!iter->Valid());
+
+  delete iter;
+}
+
+static std::string Key(int i) {
+  char buf[100];
+  snprintf(buf, sizeof(buf), "key_______%06d", i);
+  return std::string(buf);
+}
+
+static std::string RandomString(Random* rnd, int len) {
+  std::string r;
+  test::RandomString(rnd, len, &r);
+  return r;
+}
+
+TEST(PlainTableDBTest, CompactionTrigger) {
+  Options options = CurrentOptions();
+  options.write_buffer_size = 100 << 10;  // 100KB
+  options.num_levels = 3;
+  options.max_mem_compaction_level = 0;
+  options.level0_file_num_compaction_trigger = 3;
+  Reopen(&options);
+
+  Random rnd(301);
+
+  for (int num = 0; num < options.level0_file_num_compaction_trigger - 1;
+       num++) {
+    std::vector<std::string> values;
+    // Write 120KB (12 values, each 10K)
+    for (int i = 0; i < 12; i++) {
+      values.push_back(RandomString(&rnd, 10000));
+      ASSERT_OK(Put(Key(i), values[i]));
+    }
+    dbfull()->TEST_WaitForFlushMemTable();
+    ASSERT_EQ(NumTableFilesAtLevel(0), num + 1);
+  }
+
+  // generate one more file in level-0, and should trigger level-0 compaction
+  std::vector<std::string> values;
+  for (int i = 0; i < 12; i++) {
+    values.push_back(RandomString(&rnd, 10000));
+    ASSERT_OK(Put(Key(i), values[i]));
+  }
+  dbfull()->TEST_WaitForCompact();
+
+  ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(1), 1);
+}
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  return rocksdb::test::RunAllTests();
+}
diff --git a/db/prefix_test.cc b/db/prefix_test.cc
new file mode 100644
index 0000000000..64a4d06172
--- /dev/null
+++ b/db/prefix_test.cc
@@ -0,0 +1,499 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
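+//
+// The keys in this test are fixed-width 16-byte structs (an 8-byte prefix
+// followed by an 8-byte suffix) passed around as raw Slices. A sketch of
+// the round trip used by PutKey/SeekIterator below, with TestKey,
+// TestKeyToSlice and SliceToTestKey as defined in this file:
+//   TestKey tk(1, 6);                         // prefix 1, suffix 6
+//   Slice s = TestKeyToSlice(tk);             // 16-byte binary key
+//   const TestKey* back = SliceToTestKey(s);  // view it as a struct again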
+
+#ifndef GFLAGS
+#include <cstdio>
+int main() {
+  fprintf(stderr, "Please install gflags to run rocksdb tools\n");
+  return 1;
+}
+#else
+
+#include <algorithm>
+#include <iostream>
+#include <vector>
+
+#include <gflags/gflags.h>
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/perf_context.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/memtablerep.h"
+#include "util/histogram.h"
+#include "util/stop_watch.h"
+#include "util/testharness.h"
+
+using GFLAGS::ParseCommandLineFlags;
+
+DEFINE_bool(trigger_deadlock, false,
+            "issue delete in range scan to trigger PrefixHashMap deadlock");
+DEFINE_uint64(bucket_count, 100000, "number of buckets");
+DEFINE_uint64(num_locks, 10001, "number of locks");
+DEFINE_bool(random_prefix, false, "randomize prefix");
+DEFINE_uint64(total_prefixes, 100000, "total number of prefixes");
+DEFINE_uint64(items_per_prefix, 1, "total number of values per prefix");
+DEFINE_int64(write_buffer_size, 33554432, "");
+DEFINE_int64(max_write_buffer_number, 2, "");
+DEFINE_int64(min_write_buffer_number_to_merge, 1, "");
+DEFINE_int32(skiplist_height, 4, "");
+DEFINE_int32(memtable_prefix_bloom_bits, 10000000, "");
+DEFINE_int32(memtable_prefix_bloom_probes, 10, "");
+DEFINE_int32(memtable_prefix_bloom_huge_page_tlb_size, 2 * 1024 * 1024, "");
+DEFINE_int32(value_size, 40, "");
+
+// Path to the database on file system
+const std::string kDbName = rocksdb::test::TmpDir() + "/prefix_test";
+
+namespace rocksdb {
+
+struct TestKey {
+  uint64_t prefix;
+  uint64_t sorted;
+
+  TestKey(uint64_t prefix, uint64_t sorted) : prefix(prefix), sorted(sorted) {}
+};
+
+// return a slice backed by test_key
+inline Slice TestKeyToSlice(const TestKey& test_key) {
+  return Slice((const char*)&test_key, sizeof(test_key));
+}
+
+inline const TestKey* SliceToTestKey(const Slice& slice) {
+  return (const TestKey*)slice.data();
+}
+
+class TestKeyComparator : public Comparator {
+ public:
+
+  // Compare needs to be aware of the possibility of a and/or b is
+  // prefix only
+  virtual int Compare(const Slice& a, const Slice& b) const {
+    const TestKey* key_a = SliceToTestKey(a);
+    const TestKey* key_b = SliceToTestKey(b);
+    if (key_a->prefix != key_b->prefix) {
+      if (key_a->prefix < key_b->prefix) return -1;
+      if (key_a->prefix > key_b->prefix) return 1;
+    } else {
+      ASSERT_TRUE(key_a->prefix == key_b->prefix);
+      // note, both a and b could be prefix only
+      if (a.size() != b.size()) {
+        // one of them is prefix
+        ASSERT_TRUE(
+            (a.size() == sizeof(uint64_t) && b.size() == sizeof(TestKey)) ||
+            (b.size() == sizeof(uint64_t) && a.size() == sizeof(TestKey)));
+        if (a.size() < b.size()) return -1;
+        if (a.size() > b.size()) return 1;
+      } else {
+        // both a and b are prefix
+        if (a.size() == sizeof(uint64_t)) {
+          return 0;
+        }
+
+        // both a and b are whole key
+        ASSERT_TRUE(a.size() == sizeof(TestKey) && b.size() == sizeof(TestKey));
+        if (key_a->sorted < key_b->sorted) return -1;
+        if (key_a->sorted > key_b->sorted) return 1;
+        if (key_a->sorted == key_b->sorted) return 0;
+      }
+    }
+    return 0;
+  }
+
+  virtual const char* Name() const override {
+    return "TestKeyComparator";
+  }
+
+  virtual void FindShortestSeparator(
+      std::string* start,
+      const Slice& limit) const {
+  }
+
+  virtual void FindShortSuccessor(std::string* key) const {}
+
+};
+
+namespace {
+void PutKey(DB* db, WriteOptions write_options, uint64_t prefix,
+            uint64_t suffix, const Slice& value) {
+  TestKey test_key(prefix, suffix);
+  Slice key = TestKeyToSlice(test_key);
+  ASSERT_OK(db->Put(write_options, key, value));
+}
+
+void SeekIterator(Iterator*
iter, uint64_t prefix, uint64_t suffix) { + TestKey test_key(prefix, suffix); + Slice key = TestKeyToSlice(test_key); + iter->Seek(key); +} + +const std::string kNotFoundResult = "NOT_FOUND"; + +std::string Get(DB* db, const ReadOptions& read_options, uint64_t prefix, + uint64_t suffix) { + TestKey test_key(prefix, suffix); + Slice key = TestKeyToSlice(test_key); + + std::string result; + Status s = db->Get(read_options, key, &result); + if (s.IsNotFound()) { + result = kNotFoundResult; + } else if (!s.ok()) { + result = s.ToString(); + } + return result; +} +} // namespace + +class PrefixTest { + public: + std::shared_ptr OpenDb() { + DB* db; + + options.create_if_missing = true; + options.write_buffer_size = FLAGS_write_buffer_size; + options.max_write_buffer_number = FLAGS_max_write_buffer_number; + options.min_write_buffer_number_to_merge = + FLAGS_min_write_buffer_number_to_merge; + + options.memtable_prefix_bloom_bits = FLAGS_memtable_prefix_bloom_bits; + options.memtable_prefix_bloom_probes = FLAGS_memtable_prefix_bloom_probes; + options.memtable_prefix_bloom_huge_page_tlb_size = + FLAGS_memtable_prefix_bloom_huge_page_tlb_size; + + Status s = DB::Open(options, kDbName, &db); + ASSERT_OK(s); + return std::shared_ptr(db); + } + + void FirstOption() { + option_config_ = kBegin; + } + + bool NextOptions(int bucket_count) { + // skip some options + option_config_++; + if (option_config_ < kEnd) { + options.prefix_extractor.reset(NewFixedPrefixTransform(8)); + switch(option_config_) { + case kHashSkipList: + options.memtable_factory.reset( + NewHashSkipListRepFactory(bucket_count, FLAGS_skiplist_height)); + return true; + case kHashLinkList: + options.memtable_factory.reset( + NewHashLinkListRepFactory(bucket_count)); + return true; + case kHashLinkListHugePageTlb: + options.memtable_factory.reset( + NewHashLinkListRepFactory(bucket_count, 2 * 1024 * 1024)); + return true; + default: + return false; + } + } + return false; + } + + PrefixTest() : option_config_(kBegin) { + options.comparator = new TestKeyComparator(); + } + ~PrefixTest() { + delete options.comparator; + } + protected: + enum OptionConfig { + kBegin, + kHashSkipList, + kHashLinkList, + kHashLinkListHugePageTlb, + kEnd + }; + int option_config_; + Options options; +}; + +TEST(PrefixTest, TestResult) { + for (int num_buckets = 1; num_buckets <= 2; num_buckets++) { + FirstOption(); + while (NextOptions(num_buckets)) { + std::cout << "*** Mem table: " << options.memtable_factory->Name() + << " number of buckets: " << num_buckets + << std::endl; + DestroyDB(kDbName, Options()); + auto db = OpenDb(); + WriteOptions write_options; + ReadOptions read_options; + + // 1. Insert one row. 
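+      // (Value names encode prefix and suffix: v16 is the value stored
+      // under TestKey{prefix=1, sorted=6}, v17 under {1, 7}, and so on.)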
+ Slice v16("v16"); + PutKey(db.get(), write_options, 1, 6, v16); + std::unique_ptr iter(db->NewIterator(read_options)); + SeekIterator(iter.get(), 1, 6); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v16 == iter->value()); + SeekIterator(iter.get(), 1, 5); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v16 == iter->value()); + SeekIterator(iter.get(), 1, 5); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v16 == iter->value()); + iter->Next(); + ASSERT_TRUE(!iter->Valid()); + + SeekIterator(iter.get(), 2, 0); + ASSERT_TRUE(!iter->Valid()); + + ASSERT_EQ(v16.ToString(), Get(db.get(), read_options, 1, 6)); + ASSERT_EQ(kNotFoundResult, Get(db.get(), read_options, 1, 5)); + ASSERT_EQ(kNotFoundResult, Get(db.get(), read_options, 1, 7)); + ASSERT_EQ(kNotFoundResult, Get(db.get(), read_options, 0, 6)); + ASSERT_EQ(kNotFoundResult, Get(db.get(), read_options, 2, 6)); + + // 2. Insert an entry for the same prefix as the last entry in the bucket. + Slice v17("v17"); + PutKey(db.get(), write_options, 1, 7, v17); + iter.reset(db->NewIterator(read_options)); + SeekIterator(iter.get(), 1, 7); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v17 == iter->value()); + + SeekIterator(iter.get(), 1, 6); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v16 == iter->value()); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v17 == iter->value()); + iter->Next(); + ASSERT_TRUE(!iter->Valid()); + + SeekIterator(iter.get(), 2, 0); + ASSERT_TRUE(!iter->Valid()); + + // 3. Insert an entry for the same prefix as the head of the bucket. + Slice v15("v15"); + PutKey(db.get(), write_options, 1, 5, v15); + iter.reset(db->NewIterator(read_options)); + + SeekIterator(iter.get(), 1, 7); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v17 == iter->value()); + + SeekIterator(iter.get(), 1, 5); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v15 == iter->value()); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v16 == iter->value()); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v17 == iter->value()); + + SeekIterator(iter.get(), 1, 5); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v15 == iter->value()); + + ASSERT_EQ(v15.ToString(), Get(db.get(), read_options, 1, 5)); + ASSERT_EQ(v16.ToString(), Get(db.get(), read_options, 1, 6)); + ASSERT_EQ(v17.ToString(), Get(db.get(), read_options, 1, 7)); + + // 4. Insert an entry with a larger prefix + Slice v22("v22"); + PutKey(db.get(), write_options, 2, 2, v22); + iter.reset(db->NewIterator(read_options)); + + SeekIterator(iter.get(), 2, 2); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v22 == iter->value()); + SeekIterator(iter.get(), 2, 0); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v22 == iter->value()); + + SeekIterator(iter.get(), 1, 5); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v15 == iter->value()); + + SeekIterator(iter.get(), 1, 7); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v17 == iter->value()); + + // 5. 
Insert an entry with a smaller prefix + Slice v02("v02"); + PutKey(db.get(), write_options, 0, 2, v02); + iter.reset(db->NewIterator(read_options)); + + SeekIterator(iter.get(), 0, 2); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v02 == iter->value()); + SeekIterator(iter.get(), 0, 0); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v02 == iter->value()); + + SeekIterator(iter.get(), 2, 0); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v22 == iter->value()); + + SeekIterator(iter.get(), 1, 5); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v15 == iter->value()); + + SeekIterator(iter.get(), 1, 7); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v17 == iter->value()); + + // 6. Insert to the beginning and the end of the first prefix + Slice v13("v13"); + Slice v18("v18"); + PutKey(db.get(), write_options, 1, 3, v13); + PutKey(db.get(), write_options, 1, 8, v18); + iter.reset(db->NewIterator(read_options)); + SeekIterator(iter.get(), 1, 7); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v17 == iter->value()); + + SeekIterator(iter.get(), 1, 3); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v13 == iter->value()); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v15 == iter->value()); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v16 == iter->value()); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v17 == iter->value()); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v18 == iter->value()); + + SeekIterator(iter.get(), 0, 0); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v02 == iter->value()); + + SeekIterator(iter.get(), 2, 0); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v22 == iter->value()); + + ASSERT_EQ(v22.ToString(), Get(db.get(), read_options, 2, 2)); + ASSERT_EQ(v02.ToString(), Get(db.get(), read_options, 0, 2)); + ASSERT_EQ(v13.ToString(), Get(db.get(), read_options, 1, 3)); + ASSERT_EQ(v15.ToString(), Get(db.get(), read_options, 1, 5)); + ASSERT_EQ(v16.ToString(), Get(db.get(), read_options, 1, 6)); + ASSERT_EQ(v17.ToString(), Get(db.get(), read_options, 1, 7)); + ASSERT_EQ(v18.ToString(), Get(db.get(), read_options, 1, 8)); + } + } +} + +TEST(PrefixTest, DynamicPrefixIterator) { + while (NextOptions(FLAGS_bucket_count)) { + std::cout << "*** Mem table: " << options.memtable_factory->Name() + << std::endl; + DestroyDB(kDbName, Options()); + auto db = OpenDb(); + WriteOptions write_options; + ReadOptions read_options; + + std::vector prefixes; + for (uint64_t i = 0; i < FLAGS_total_prefixes; ++i) { + prefixes.push_back(i); + } + + if (FLAGS_random_prefix) { + std::random_shuffle(prefixes.begin(), prefixes.end()); + } + + HistogramImpl hist_put_time; + HistogramImpl hist_put_comparison; + + // insert x random prefix, each with y continuous element. 
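+    // (Here x = FLAGS_total_prefixes and y = FLAGS_items_per_prefix, so the
+    // loop below writes x * y keys, timing each Put and counting user-key
+    // comparisons via perf_context.)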
+ for (auto prefix : prefixes) { + for (uint64_t sorted = 0; sorted < FLAGS_items_per_prefix; sorted++) { + TestKey test_key(prefix, sorted); + + Slice key = TestKeyToSlice(test_key); + std::string value(FLAGS_value_size, 0); + + perf_context.Reset(); + StopWatchNano timer(Env::Default(), true); + ASSERT_OK(db->Put(write_options, key, value)); + hist_put_time.Add(timer.ElapsedNanos()); + hist_put_comparison.Add(perf_context.user_key_comparison_count); + } + } + + std::cout << "Put key comparison: \n" << hist_put_comparison.ToString() + << "Put time: \n" << hist_put_time.ToString(); + + // test seek existing keys + HistogramImpl hist_seek_time; + HistogramImpl hist_seek_comparison; + + std::unique_ptr iter(db->NewIterator(read_options)); + + for (auto prefix : prefixes) { + TestKey test_key(prefix, FLAGS_items_per_prefix / 2); + Slice key = TestKeyToSlice(test_key); + std::string value = "v" + std::to_string(0); + + perf_context.Reset(); + StopWatchNano timer(Env::Default(), true); + auto key_prefix = options.prefix_extractor->Transform(key); + uint64_t total_keys = 0; + for (iter->Seek(key); + iter->Valid() && iter->key().starts_with(key_prefix); + iter->Next()) { + if (FLAGS_trigger_deadlock) { + std::cout << "Behold the deadlock!\n"; + db->Delete(write_options, iter->key()); + } + total_keys++; + } + hist_seek_time.Add(timer.ElapsedNanos()); + hist_seek_comparison.Add(perf_context.user_key_comparison_count); + ASSERT_EQ(total_keys, FLAGS_items_per_prefix - FLAGS_items_per_prefix/2); + } + + std::cout << "Seek key comparison: \n" + << hist_seek_comparison.ToString() + << "Seek time: \n" + << hist_seek_time.ToString(); + + // test non-existing keys + HistogramImpl hist_no_seek_time; + HistogramImpl hist_no_seek_comparison; + + for (auto prefix = FLAGS_total_prefixes; + prefix < FLAGS_total_prefixes + 10000; + prefix++) { + TestKey test_key(prefix, 0); + Slice key = TestKeyToSlice(test_key); + + perf_context.Reset(); + StopWatchNano timer(Env::Default(), true); + iter->Seek(key); + hist_no_seek_time.Add(timer.ElapsedNanos()); + hist_no_seek_comparison.Add(perf_context.user_key_comparison_count); + ASSERT_TRUE(!iter->Valid()); + } + + std::cout << "non-existing Seek key comparison: \n" + << hist_no_seek_comparison.ToString() + << "non-existing Seek time: \n" + << hist_no_seek_time.ToString(); + } +} + +} + +int main(int argc, char** argv) { + ParseCommandLineFlags(&argc, &argv, true); + std::cout << kDbName << "\n"; + + rocksdb::test::RunAllTests(); + return 0; +} + +#endif // GFLAGS diff --git a/db/repair.cc b/db/repair.cc new file mode 100644 index 0000000000..03571a8294 --- /dev/null +++ b/db/repair.cc @@ -0,0 +1,403 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// We recover the contents of the descriptor from the other files we find. 
+// (1) Any log files are first converted to tables +// (2) We scan every table to compute +// (a) smallest/largest for the table +// (b) largest sequence number in the table +// (3) We generate descriptor contents: +// - log number is set to zero +// - next-file-number is set to 1 + largest file number we found +// - last-sequence-number is set to largest sequence# found across +// all tables (see 2c) +// - compaction pointers are cleared +// - every table file is added at level 0 +// +// Possible optimization 1: +// (a) Compute total size and use to pick appropriate max-level M +// (b) Sort tables by largest sequence# in the table +// (c) For each table: if it overlaps earlier table, place in level-0, +// else place in level-M. +// Possible optimization 2: +// Store per-table metadata (smallest, largest, largest-seq#, ...) +// in the table's meta section to speed up ScanTable. + +#ifndef ROCKSDB_LITE + +#include "db/builder.h" +#include "db/db_impl.h" +#include "db/dbformat.h" +#include "db/filename.h" +#include "db/log_reader.h" +#include "db/log_writer.h" +#include "db/memtable.h" +#include "db/table_cache.h" +#include "db/version_edit.h" +#include "db/write_batch_internal.h" +#include "rocksdb/comparator.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" + +namespace rocksdb { + +namespace { + +class Repairer { + public: + Repairer(const std::string& dbname, const Options& options) + : dbname_(dbname), + env_(options.env), + icmp_(options.comparator), + ipolicy_(options.filter_policy), + options_(SanitizeOptions(dbname, &icmp_, &ipolicy_, options)), + raw_table_cache_( + // TableCache can be small since we expect each table to be opened + // once. + NewLRUCache(10, options_.table_cache_numshardbits, + options_.table_cache_remove_scan_count_limit)), + next_file_number_(1) { + table_cache_ = new TableCache(dbname_, &options_, storage_options_, + raw_table_cache_.get()); + edit_ = new VersionEdit(); + } + + ~Repairer() { + delete table_cache_; + raw_table_cache_.reset(); + delete edit_; + } + + Status Run() { + Status status = FindFiles(); + if (status.ok()) { + ConvertLogFilesToTables(); + ExtractMetaData(); + status = WriteDescriptor(); + } + if (status.ok()) { + unsigned long long bytes = 0; + for (size_t i = 0; i < tables_.size(); i++) { + bytes += tables_[i].meta.file_size; + } + Log(options_.info_log, + "**** Repaired rocksdb %s; " + "recovered %d files; %llu bytes. " + "Some data may have been lost. 
" + "****", + dbname_.c_str(), + static_cast(tables_.size()), + bytes); + } + return status; + } + + private: + struct TableInfo { + FileMetaData meta; + SequenceNumber min_sequence; + SequenceNumber max_sequence; + }; + + std::string const dbname_; + Env* const env_; + InternalKeyComparator const icmp_; + InternalFilterPolicy const ipolicy_; + Options const options_; + std::shared_ptr raw_table_cache_; + TableCache* table_cache_; + VersionEdit* edit_; + + std::vector manifests_; + std::vector table_numbers_; + std::vector logs_; + std::vector tables_; + uint64_t next_file_number_; + const EnvOptions storage_options_; + + Status FindFiles() { + std::vector filenames; + Status status = env_->GetChildren(dbname_, &filenames); + if (!status.ok()) { + return status; + } + if (filenames.empty()) { + return Status::Corruption(dbname_, "repair found no files"); + } + + uint64_t number; + FileType type; + for (size_t i = 0; i < filenames.size(); i++) { + if (ParseFileName(filenames[i], &number, &type)) { + if (type == kDescriptorFile) { + manifests_.push_back(filenames[i]); + } else { + if (number + 1 > next_file_number_) { + next_file_number_ = number + 1; + } + if (type == kLogFile) { + logs_.push_back(number); + } else if (type == kTableFile) { + table_numbers_.push_back(number); + } else { + // Ignore other files + } + } + } + } + return status; + } + + void ConvertLogFilesToTables() { + for (size_t i = 0; i < logs_.size(); i++) { + std::string logname = LogFileName(dbname_, logs_[i]); + Status status = ConvertLogToTable(logs_[i]); + if (!status.ok()) { + Log(options_.info_log, "Log #%llu: ignoring conversion error: %s", + (unsigned long long) logs_[i], + status.ToString().c_str()); + } + ArchiveFile(logname); + } + } + + Status ConvertLogToTable(uint64_t log) { + struct LogReporter : public log::Reader::Reporter { + Env* env; + std::shared_ptr info_log; + uint64_t lognum; + virtual void Corruption(size_t bytes, const Status& s) { + // We print error messages for corruption, but continue repairing. + Log(info_log, "Log #%llu: dropping %d bytes; %s", + (unsigned long long) lognum, + static_cast(bytes), + s.ToString().c_str()); + } + }; + + // Open the log file + std::string logname = LogFileName(dbname_, log); + unique_ptr lfile; + Status status = env_->NewSequentialFile(logname, &lfile, storage_options_); + if (!status.ok()) { + return status; + } + + // Create the log reader. + LogReporter reporter; + reporter.env = env_; + reporter.info_log = options_.info_log; + reporter.lognum = log; + // We intentially make log::Reader do checksumming so that + // corruptions cause entire commits to be skipped instead of + // propagating bad information (like overly large sequence + // numbers). 
+ log::Reader reader(std::move(lfile), &reporter, false/*do not checksum*/, + 0/*initial_offset*/); + + // Read all the records and add to a memtable + std::string scratch; + Slice record; + WriteBatch batch; + MemTable* mem = new MemTable(icmp_, options_); + auto cf_mems_default = new ColumnFamilyMemTablesDefault(mem, &options_); + mem->Ref(); + int counter = 0; + while (reader.ReadRecord(&record, &scratch)) { + if (record.size() < 12) { + reporter.Corruption( + record.size(), Status::Corruption("log record too small")); + continue; + } + WriteBatchInternal::SetContents(&batch, record); + status = WriteBatchInternal::InsertInto(&batch, cf_mems_default); + if (status.ok()) { + counter += WriteBatchInternal::Count(&batch); + } else { + Log(options_.info_log, "Log #%llu: ignoring %s", + (unsigned long long) log, + status.ToString().c_str()); + status = Status::OK(); // Keep going with rest of file + } + } + + // Do not record a version edit for this conversion to a Table + // since ExtractMetaData() will also generate edits. + FileMetaData meta; + meta.number = next_file_number_++; + ReadOptions ro; + Iterator* iter = mem->NewIterator(ro, true /* enforce_total_order */); + status = BuildTable(dbname_, env_, options_, storage_options_, table_cache_, + iter, &meta, icmp_, 0, 0, kNoCompression); + delete iter; + delete mem->Unref(); + delete cf_mems_default; + mem = nullptr; + if (status.ok()) { + if (meta.file_size > 0) { + table_numbers_.push_back(meta.number); + } + } + Log(options_.info_log, "Log #%llu: %d ops saved to Table #%llu %s", + (unsigned long long) log, + counter, + (unsigned long long) meta.number, + status.ToString().c_str()); + return status; + } + + void ExtractMetaData() { + for (size_t i = 0; i < table_numbers_.size(); i++) { + TableInfo t; + t.meta.number = table_numbers_[i]; + Status status = ScanTable(&t); + if (!status.ok()) { + std::string fname = TableFileName(dbname_, table_numbers_[i]); + Log(options_.info_log, "Table #%llu: ignoring %s", + (unsigned long long) table_numbers_[i], + status.ToString().c_str()); + ArchiveFile(fname); + } else { + tables_.push_back(t); + } + } + } + + Status ScanTable(TableInfo* t) { + std::string fname = TableFileName(dbname_, t->meta.number); + int counter = 0; + Status status = env_->GetFileSize(fname, &t->meta.file_size); + if (status.ok()) { + FileMetaData dummy_meta(t->meta.number, t->meta.file_size); + Iterator* iter = table_cache_->NewIterator( + ReadOptions(), storage_options_, icmp_, dummy_meta); + bool empty = true; + ParsedInternalKey parsed; + t->min_sequence = 0; + t->max_sequence = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + Slice key = iter->key(); + if (!ParseInternalKey(key, &parsed)) { + Log(options_.info_log, "Table #%llu: unparsable key %s", + (unsigned long long) t->meta.number, + EscapeString(key).c_str()); + continue; + } + + counter++; + if (empty) { + empty = false; + t->meta.smallest.DecodeFrom(key); + } + t->meta.largest.DecodeFrom(key); + if (parsed.sequence < t->min_sequence) { + t->min_sequence = parsed.sequence; + } + if (parsed.sequence > t->max_sequence) { + t->max_sequence = parsed.sequence; + } + } + if (!iter->status().ok()) { + status = iter->status(); + } + delete iter; + } + Log(options_.info_log, "Table #%llu: %d entries %s", + (unsigned long long) t->meta.number, + counter, + status.ToString().c_str()); + return status; + } + + Status WriteDescriptor() { + std::string tmp = TempFileName(dbname_, 1); + unique_ptr file; + Status status = env_->NewWritableFile( + tmp, &file, 
env_->OptimizeForManifestWrite(storage_options_)); + if (!status.ok()) { + return status; + } + + SequenceNumber max_sequence = 0; + for (size_t i = 0; i < tables_.size(); i++) { + if (max_sequence < tables_[i].max_sequence) { + max_sequence = tables_[i].max_sequence; + } + } + + edit_->SetComparatorName(icmp_.user_comparator()->Name()); + edit_->SetLogNumber(0); + edit_->SetNextFile(next_file_number_); + edit_->SetLastSequence(max_sequence); + + for (size_t i = 0; i < tables_.size(); i++) { + // TODO(opt): separate out into multiple levels + const TableInfo& t = tables_[i]; + edit_->AddFile(0, t.meta.number, t.meta.file_size, + t.meta.smallest, t.meta.largest, + t.min_sequence, t.max_sequence); + } + + //fprintf(stderr, "NewDescriptor:\n%s\n", edit_.DebugString().c_str()); + { + log::Writer log(std::move(file)); + std::string record; + edit_->EncodeTo(&record); + status = log.AddRecord(record); + } + + if (!status.ok()) { + env_->DeleteFile(tmp); + } else { + // Discard older manifests + for (size_t i = 0; i < manifests_.size(); i++) { + ArchiveFile(dbname_ + "/" + manifests_[i]); + } + + // Install new manifest + status = env_->RenameFile(tmp, DescriptorFileName(dbname_, 1)); + if (status.ok()) { + status = SetCurrentFile(env_, dbname_, 1, nullptr); + } else { + env_->DeleteFile(tmp); + } + } + return status; + } + + void ArchiveFile(const std::string& fname) { + // Move into another directory. E.g., for + // dir/foo + // rename to + // dir/lost/foo + const char* slash = strrchr(fname.c_str(), '/'); + std::string new_dir; + if (slash != nullptr) { + new_dir.assign(fname.data(), slash - fname.data()); + } + new_dir.append("/lost"); + env_->CreateDir(new_dir); // Ignore error + std::string new_file = new_dir; + new_file.append("/"); + new_file.append((slash == nullptr) ? fname.c_str() : slash + 1); + Status s = env_->RenameFile(fname, new_file); + Log(options_.info_log, "Archiving %s: %s\n", + fname.c_str(), s.ToString().c_str()); + } +}; +} // namespace + +Status RepairDB(const std::string& dbname, const Options& options) { + Repairer repairer(dbname, options); + return repairer.Run(); +} + +} // namespace rocksdb + +#endif // ROCKSDB_LITE diff --git a/db/simple_table_db_test.cc b/db/simple_table_db_test.cc new file mode 100644 index 0000000000..a86ff0a17d --- /dev/null +++ b/db/simple_table_db_test.cc @@ -0,0 +1,800 @@ +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+#include +#include + +#include "rocksdb/db.h" +#include "rocksdb/filter_policy.h" +#include "db/db_impl.h" +#include "db/filename.h" +#include "db/version_set.h" +#include "db/write_batch_internal.h" +#include "rocksdb/statistics.h" +#include "rocksdb/cache.h" +#include "rocksdb/compaction_filter.h" +#include "rocksdb/env.h" +#include "rocksdb/table.h" +#include "rocksdb/table_properties.h" +#include "table/table_builder.h" +#include "util/hash.h" +#include "util/logging.h" +#include "util/mutexlock.h" +#include "util/testharness.h" +#include "util/testutil.h" +#include "utilities/merge_operators.h" + +using std::unique_ptr; + +// IS THIS FILE STILL NEEDED? +namespace rocksdb { + +// SimpleTable is a simple table format for UNIT TEST ONLY. It is not built +// as production quality. +// SimpleTable requires the input key size to be fixed 16 bytes, value cannot +// be longer than 150000 bytes and stored data on disk in this format: +// +--------------------------------------------+ <= key1 offset +// | key1 | value_size (4 bytes) | | +// +----------------------------------------+ | +// | value1 | +// | | +// +----------------------------------------+---+ <= key2 offset +// | key2 | value_size (4 bytes) | | +// +----------------------------------------+ | +// | value2 | +// | | +// | ...... | +// +-----------------+--------------------------+ <= index_block_offset +// | key1 | key1 offset (8 bytes) | +// +-----------------+--------------------------+ +// | key2 | key2 offset (8 bytes) | +// +-----------------+--------------------------+ +// | key3 | key3 offset (8 bytes) | +// +-----------------+--------------------------+ +// | ...... | +// +-----------------+------------+-------------+ +// | index_block_offset (8 bytes) | +// +------------------------------+ + +// SimpleTable is a simple table format for UNIT TEST ONLY. It is not built +// as production quality. +class SimpleTableReader: public TableReader { +public: + // Attempt to open the table that is stored in bytes [0..file_size) + // of "file", and read the metadata entries necessary to allow + // retrieving data from the table. + // + // If successful, returns ok and sets "*table" to the newly opened + // table. The client should delete "*table" when no longer needed. + // If there was an error while initializing the table, sets "*table" + // to nullptr and returns a non-ok status. Does not take ownership of + // "*source", but the client must ensure that "source" remains live + // for the duration of the returned table's lifetime. + // + // *file must remain live while this Table is in use. 
+ static Status Open(const Options& options, const EnvOptions& soptions, + unique_ptr && file, uint64_t file_size, + unique_ptr* table_reader); + + Iterator* NewIterator(const ReadOptions&, Arena* arena) override; + + Status Get(const ReadOptions&, const Slice& key, void* arg, + bool (*handle_result)(void* arg, const ParsedInternalKey& k, + const Slice& v, bool), + void (*mark_key_may_exist)(void*) = nullptr) override; + + uint64_t ApproximateOffsetOf(const Slice& key) override; + + void SetupForCompaction() override; + + std::shared_ptr GetTableProperties() const override; + + ~SimpleTableReader(); + +private: + struct Rep; + Rep* rep_; + + explicit SimpleTableReader(Rep* rep) { + rep_ = rep; + } + friend class TableCache; + friend class SimpleTableIterator; + + Status GetOffset(const Slice& target, uint64_t* offset); + + // No copying allowed + explicit SimpleTableReader(const TableReader&) = delete; + void operator=(const TableReader&) = delete; +}; + +// Iterator to iterate SimpleTable +class SimpleTableIterator: public Iterator { +public: + explicit SimpleTableIterator(SimpleTableReader* table); + ~SimpleTableIterator(); + + bool Valid() const; + + void SeekToFirst(); + + void SeekToLast(); + + void Seek(const Slice& target); + + void Next(); + + void Prev(); + + Slice key() const; + + Slice value() const; + + Status status() const; + +private: + SimpleTableReader* table_; + uint64_t offset_; + uint64_t next_offset_; + Slice key_; + Slice value_; + char tmp_str_[4]; + char* key_str_; + char* value_str_; + int value_str_len_; + Status status_; + // No copying allowed + SimpleTableIterator(const SimpleTableIterator&) = delete; + void operator=(const Iterator&) = delete; +}; + +struct SimpleTableReader::Rep { + ~Rep() { + } + Rep(const EnvOptions& storage_options, uint64_t index_start_offset, + int num_entries) : + soptions(storage_options), index_start_offset(index_start_offset), + num_entries(num_entries) { + } + + Options options; + const EnvOptions& soptions; + Status status; + unique_ptr file; + uint64_t index_start_offset; + int num_entries; + std::shared_ptr table_properties; + + const static int user_key_size = 16; + const static int offset_length = 8; + const static int key_footer_len = 8; + + static int GetInternalKeyLength() { + return user_key_size + key_footer_len; + } +}; + +SimpleTableReader::~SimpleTableReader() { + delete rep_; +} + +Status SimpleTableReader::Open(const Options& options, + const EnvOptions& soptions, + unique_ptr && file, + uint64_t size, + unique_ptr* table_reader) { + char footer_space[Rep::offset_length]; + Slice footer_input; + Status s = file->Read(size - Rep::offset_length, Rep::offset_length, + &footer_input, footer_space); + if (s.ok()) { + uint64_t index_start_offset = DecodeFixed64(footer_space); + + int num_entries = (size - Rep::offset_length - index_start_offset) + / (Rep::GetInternalKeyLength() + Rep::offset_length); + SimpleTableReader::Rep* rep = new SimpleTableReader::Rep(soptions, + index_start_offset, + num_entries); + + rep->file = std::move(file); + rep->options = options; + table_reader->reset(new SimpleTableReader(rep)); + } + return s; +} + +void SimpleTableReader::SetupForCompaction() { +} + +std::shared_ptr SimpleTableReader::GetTableProperties() + const { + return rep_->table_properties; +} + +Iterator* SimpleTableReader::NewIterator(const ReadOptions& options, + Arena* arena) { + if (arena == nullptr) { + return new SimpleTableIterator(this); + } else { + auto mem = arena->AllocateAligned(sizeof(SimpleTableIterator)); + return 
new (mem) SimpleTableIterator(this); + } +} + +Status SimpleTableReader::GetOffset(const Slice& target, uint64_t* offset) { + uint32_t left = 0; + uint32_t right = rep_->num_entries - 1; + char key_chars[Rep::GetInternalKeyLength()]; + Slice tmp_slice; + + uint32_t target_offset = 0; + while (left <= right) { + uint32_t mid = (left + right + 1) / 2; + + uint64_t offset_to_read = rep_->index_start_offset + + (Rep::GetInternalKeyLength() + Rep::offset_length) * mid; + Status s = rep_->file->Read(offset_to_read, Rep::GetInternalKeyLength(), + &tmp_slice, key_chars); + if (!s.ok()) { + return s; + } + + InternalKeyComparator ikc(rep_->options.comparator); + int compare_result = ikc.Compare(tmp_slice, target); + + if (compare_result < 0) { + if (left == right) { + target_offset = right + 1; + break; + } + left = mid; + } else { + if (left == right) { + target_offset = left; + break; + } + right = mid - 1; + } + } + + if (target_offset >= (uint32_t) rep_->num_entries) { + *offset = rep_->index_start_offset; + return Status::OK(); + } + + char value_offset_chars[Rep::offset_length]; + + int64_t offset_for_value_offset = rep_->index_start_offset + + (Rep::GetInternalKeyLength() + Rep::offset_length) * target_offset + + Rep::GetInternalKeyLength(); + Status s = rep_->file->Read(offset_for_value_offset, Rep::offset_length, + &tmp_slice, value_offset_chars); + if (s.ok()) { + *offset = DecodeFixed64(value_offset_chars); + } + return s; +} + +Status SimpleTableReader::Get(const ReadOptions& options, const Slice& k, + void* arg, + bool (*saver)(void*, const ParsedInternalKey&, + const Slice&, bool), + void (*mark_key_may_exist)(void*)) { + Status s; + SimpleTableIterator* iter = new SimpleTableIterator(this); + for (iter->Seek(k); iter->Valid(); iter->Next()) { + ParsedInternalKey parsed_key; + if (!ParseInternalKey(iter->key(), &parsed_key)) { + return Status::Corruption(Slice()); + } + + if (!(*saver)(arg, parsed_key, iter->value(), true)) { + break; + } + } + s = iter->status(); + delete iter; + return s; +} + +uint64_t SimpleTableReader::ApproximateOffsetOf(const Slice& key) { + return 0; +} + +SimpleTableIterator::SimpleTableIterator(SimpleTableReader* table) : + table_(table) { + key_str_ = new char[SimpleTableReader::Rep::GetInternalKeyLength()]; + value_str_len_ = -1; + SeekToFirst(); +} + +SimpleTableIterator::~SimpleTableIterator() { + delete[] key_str_; + if (value_str_len_ >= 0) { + delete[] value_str_; + } +} + +bool SimpleTableIterator::Valid() const { + return offset_ < table_->rep_->index_start_offset; +} + +void SimpleTableIterator::SeekToFirst() { + next_offset_ = 0; + Next(); +} + +void SimpleTableIterator::SeekToLast() { + assert(false); +} + +void SimpleTableIterator::Seek(const Slice& target) { + Status s = table_->GetOffset(target, &next_offset_); + if (!s.ok()) { + status_ = s; + } + Next(); +} + +void SimpleTableIterator::Next() { + offset_ = next_offset_; + if (offset_ >= table_->rep_->index_start_offset) { + return; + } + Slice result; + int internal_key_size = SimpleTableReader::Rep::GetInternalKeyLength(); + + Status s = table_->rep_->file->Read(next_offset_, internal_key_size, &result, + key_str_); + next_offset_ += internal_key_size; + key_ = result; + + Slice value_size_slice; + s = table_->rep_->file->Read(next_offset_, 4, &value_size_slice, tmp_str_); + next_offset_ += 4; + uint32_t value_size = DecodeFixed32(tmp_str_); + + Slice value_slice; + if ((int) value_size > value_str_len_) { + if (value_str_len_ >= 0) { + delete[] value_str_; + } + value_str_ = new 
char[value_size]; + value_str_len_ = value_size; + } + s = table_->rep_->file->Read(next_offset_, value_size, &value_slice, + value_str_); + next_offset_ += value_size; + value_ = value_slice; +} + +void SimpleTableIterator::Prev() { + assert(false); +} + +Slice SimpleTableIterator::key() const { + Log(table_->rep_->options.info_log, "key!!!!"); + return key_; +} + +Slice SimpleTableIterator::value() const { + return value_; +} + +Status SimpleTableIterator::status() const { + return status_; +} + +class SimpleTableBuilder: public TableBuilder { +public: + // Create a builder that will store the contents of the table it is + // building in *file. Does not close the file. It is up to the + // caller to close the file after calling Finish(). The output file + // will be part of level specified by 'level'. A value of -1 means + // that the caller does not know which level the output file will reside. + SimpleTableBuilder(const Options& options, WritableFile* file, + CompressionType compression_type); + + // REQUIRES: Either Finish() or Abandon() has been called. + ~SimpleTableBuilder(); + + // Add key,value to the table being constructed. + // REQUIRES: key is after any previously added key according to comparator. + // REQUIRES: Finish(), Abandon() have not been called + void Add(const Slice& key, const Slice& value) override; + + // Return non-ok iff some error has been detected. + Status status() const override; + + // Finish building the table. Stops using the file passed to the + // constructor after this function returns. + // REQUIRES: Finish(), Abandon() have not been called + Status Finish() override; + + // Indicate that the contents of this builder should be abandoned. Stops + // using the file passed to the constructor after this function returns. + // If the caller is not going to call Finish(), it must call Abandon() + // before destroying this builder. + // REQUIRES: Finish(), Abandon() have not been called + void Abandon() override; + + // Number of calls to Add() so far. + uint64_t NumEntries() const override; + + // Size of the file generated so far. If invoked after a successful + // Finish() call, returns the size of the final generated file. + uint64_t FileSize() const override; + +private: + struct Rep; + Rep* rep_; + + // No copying allowed + SimpleTableBuilder(const SimpleTableBuilder&) = delete; + void operator=(const SimpleTableBuilder&) = delete; +}; + +struct SimpleTableBuilder::Rep { + Options options; + WritableFile* file; + uint64_t offset = 0; + Status status; + + uint64_t num_entries = 0; + + bool closed = false; // Either Finish() or Abandon() has been called. 
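+  // On-disk layout constants (mirroring SimpleTableReader::Rep above):
+  // a fixed 16-byte user key, an 8-byte internal-key footer (sequence
+  // number and value type), and fixed 8-byte offsets in the index block.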
+ + const static int user_key_size = 16; + const static int offset_length = 8; + const static int key_footer_len = 8; + + static int GetInternalKeyLength() { + return user_key_size + key_footer_len; + } + + std::string index; + + Rep(const Options& opt, WritableFile* f) : + options(opt), file(f) { + } + ~Rep() { + } +}; + +SimpleTableBuilder::SimpleTableBuilder(const Options& options, + WritableFile* file, + CompressionType compression_type) : + rep_(new SimpleTableBuilder::Rep(options, file)) { +} + +SimpleTableBuilder::~SimpleTableBuilder() { + delete (rep_); +} + +void SimpleTableBuilder::Add(const Slice& key, const Slice& value) { + assert((int ) key.size() == Rep::GetInternalKeyLength()); + + // Update index + rep_->index.append(key.data(), key.size()); + PutFixed64(&(rep_->index), rep_->offset); + + // Write key-value pair + rep_->file->Append(key); + rep_->offset += Rep::GetInternalKeyLength(); + + std::string size; + int value_size = value.size(); + PutFixed32(&size, value_size); + Slice sizeSlice(size); + rep_->file->Append(sizeSlice); + rep_->file->Append(value); + rep_->offset += value_size + 4; + + rep_->num_entries++; +} + +Status SimpleTableBuilder::status() const { + return Status::OK(); +} + +Status SimpleTableBuilder::Finish() { + Rep* r = rep_; + assert(!r->closed); + r->closed = true; + + uint64_t index_offset = rep_->offset; + Slice index_slice(rep_->index); + rep_->file->Append(index_slice); + rep_->offset += index_slice.size(); + + std::string index_offset_str; + PutFixed64(&index_offset_str, index_offset); + Slice foot_slice(index_offset_str); + rep_->file->Append(foot_slice); + rep_->offset += foot_slice.size(); + + return Status::OK(); +} + +void SimpleTableBuilder::Abandon() { + rep_->closed = true; +} + +uint64_t SimpleTableBuilder::NumEntries() const { + return rep_->num_entries; +} + +uint64_t SimpleTableBuilder::FileSize() const { + return rep_->offset; +} + +class SimpleTableFactory: public TableFactory { +public: + ~SimpleTableFactory() { + } + SimpleTableFactory() { + } + const char* Name() const override { + return "SimpleTable"; + } + Status NewTableReader(const Options& options, const EnvOptions& soptions, + const InternalKeyComparator& internal_key, + unique_ptr&& file, uint64_t file_size, + unique_ptr* table_reader) const; + + TableBuilder* NewTableBuilder(const Options& options, + const InternalKeyComparator& internal_key, + WritableFile* file, + CompressionType compression_type) const; +}; + +Status SimpleTableFactory::NewTableReader( + const Options& options, const EnvOptions& soptions, + const InternalKeyComparator& internal_key, + unique_ptr&& file, uint64_t file_size, + unique_ptr* table_reader) const { + + return SimpleTableReader::Open(options, soptions, std::move(file), file_size, + table_reader); +} + +TableBuilder* SimpleTableFactory::NewTableBuilder( + const Options& options, const InternalKeyComparator& internal_key, + WritableFile* file, CompressionType compression_type) const { + return new SimpleTableBuilder(options, file, compression_type); +} + +class SimpleTableDBTest { +protected: +public: + std::string dbname_; + Env* env_; + DB* db_; + + Options last_options_; + + SimpleTableDBTest() : + env_(Env::Default()) { + dbname_ = test::TmpDir() + "/simple_table_db_test"; + ASSERT_OK(DestroyDB(dbname_, Options())); + db_ = nullptr; + Reopen(); + } + + ~SimpleTableDBTest() { + delete db_; + ASSERT_OK(DestroyDB(dbname_, Options())); + } + + // Return the current option configuration. 
+ Options CurrentOptions() { + Options options; + options.table_factory.reset(new SimpleTableFactory()); + return options; + } + + DBImpl* dbfull() { + return reinterpret_cast(db_); + } + + void Reopen(Options* options = nullptr) { + ASSERT_OK(TryReopen(options)); + } + + void Close() { + delete db_; + db_ = nullptr; + } + + void DestroyAndReopen(Options* options = nullptr) { + //Destroy using last options + Destroy(&last_options_); + ASSERT_OK(TryReopen(options)); + } + + void Destroy(Options* options) { + delete db_; + db_ = nullptr; + ASSERT_OK(DestroyDB(dbname_, *options)); + } + + Status PureReopen(Options* options, DB** db) { + return DB::Open(*options, dbname_, db); + } + + Status TryReopen(Options* options = nullptr) { + delete db_; + db_ = nullptr; + Options opts; + if (options != nullptr) { + opts = *options; + } else { + opts = CurrentOptions(); + opts.create_if_missing = true; + } + last_options_ = opts; + + return DB::Open(opts, dbname_, &db_); + } + + Status Put(const Slice& k, const Slice& v) { + return db_->Put(WriteOptions(), k, v); + } + + Status Delete(const std::string& k) { + return db_->Delete(WriteOptions(), k); + } + + std::string Get(const std::string& k, const Snapshot* snapshot = nullptr) { + ReadOptions options; + options.snapshot = snapshot; + std::string result; + Status s = db_->Get(options, k, &result); + if (s.IsNotFound()) { + result = "NOT_FOUND"; + } else if (!s.ok()) { + result = s.ToString(); + } + return result; + } + + + int NumTableFilesAtLevel(int level) { + std::string property; + ASSERT_TRUE( + db_->GetProperty("rocksdb.num-files-at-level" + NumberToString(level), + &property)); + return atoi(property.c_str()); + } + + // Return spread of files per level + std::string FilesPerLevel() { + std::string result; + int last_non_zero_offset = 0; + for (int level = 0; level < db_->NumberLevels(); level++) { + int f = NumTableFilesAtLevel(level); + char buf[100]; + snprintf(buf, sizeof(buf), "%s%d", (level ? 
"," : ""), f); + result += buf; + if (f > 0) { + last_non_zero_offset = result.size(); + } + } + result.resize(last_non_zero_offset); + return result; + } + + std::string IterStatus(Iterator* iter) { + std::string result; + if (iter->Valid()) { + result = iter->key().ToString() + "->" + iter->value().ToString(); + } else { + result = "(invalid)"; + } + return result; + } +}; + +TEST(SimpleTableDBTest, Empty) { + ASSERT_TRUE(db_ != nullptr); + ASSERT_EQ("NOT_FOUND", Get("0000000000000foo")); +} + +TEST(SimpleTableDBTest, ReadWrite) { + ASSERT_OK(Put("0000000000000foo", "v1")); + ASSERT_EQ("v1", Get("0000000000000foo")); + ASSERT_OK(Put("0000000000000bar", "v2")); + ASSERT_OK(Put("0000000000000foo", "v3")); + ASSERT_EQ("v3", Get("0000000000000foo")); + ASSERT_EQ("v2", Get("0000000000000bar")); +} + +TEST(SimpleTableDBTest, Flush) { + ASSERT_OK(Put("0000000000000foo", "v1")); + ASSERT_OK(Put("0000000000000bar", "v2")); + ASSERT_OK(Put("0000000000000foo", "v3")); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("v3", Get("0000000000000foo")); + ASSERT_EQ("v2", Get("0000000000000bar")); +} + +TEST(SimpleTableDBTest, Flush2) { + ASSERT_OK(Put("0000000000000bar", "b")); + ASSERT_OK(Put("0000000000000foo", "v1")); + dbfull()->TEST_FlushMemTable(); + + ASSERT_OK(Put("0000000000000foo", "v2")); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("v2", Get("0000000000000foo")); + + ASSERT_OK(Put("0000000000000eee", "v3")); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("v3", Get("0000000000000eee")); + + ASSERT_OK(Delete("0000000000000bar")); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("NOT_FOUND", Get("0000000000000bar")); + + ASSERT_OK(Put("0000000000000eee", "v5")); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("v5", Get("0000000000000eee")); +} + +static std::string Key(int i) { + char buf[100]; + snprintf(buf, sizeof(buf), "key_______%06d", i); + return std::string(buf); +} + +static std::string RandomString(Random* rnd, int len) { + std::string r; + test::RandomString(rnd, len, &r); + return r; +} + +TEST(SimpleTableDBTest, CompactionTrigger) { + Options options = CurrentOptions(); + options.write_buffer_size = 100 << 10; //100KB + options.num_levels = 3; + options.max_mem_compaction_level = 0; + options.level0_file_num_compaction_trigger = 3; + Reopen(&options); + + Random rnd(301); + + for (int num = 0; num < options.level0_file_num_compaction_trigger - 1; + num++) { + std::vector values; + // Write 120KB (12 values, each 10K) + for (int i = 0; i < 12; i++) { + values.push_back(RandomString(&rnd, 10000)); + ASSERT_OK(Put(Key(i), values[i])); + } + dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_EQ(NumTableFilesAtLevel(0), num + 1); + } + + //generate one more file in level-0, and should trigger level-0 compaction + std::vector values; + for (int i = 0; i < 12; i++) { + values.push_back(RandomString(&rnd, 10000)); + ASSERT_OK(Put(Key(i), values[i])); + } + dbfull()->TEST_WaitForCompact(); + + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + ASSERT_EQ(NumTableFilesAtLevel(1), 1); +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +} diff --git a/db/skiplist.h b/db/skiplist.h new file mode 100644 index 0000000000..751f7c3ec9 --- /dev/null +++ b/db/skiplist.h @@ -0,0 +1,429 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Thread safety
+// -------------
+//
+// Writes require external synchronization, most likely a mutex.
+// Reads require a guarantee that the SkipList will not be destroyed
+// while the read is in progress. Apart from that, reads progress
+// without any internal locking or synchronization.
+//
+// Invariants:
+//
+// (1) Allocated nodes are never deleted until the SkipList is
+// destroyed. This is trivially guaranteed by the code since we
+// never delete any skip list nodes.
+//
+// (2) The contents of a Node except for the next/prev pointers are
+// immutable after the Node has been linked into the SkipList.
+// Only Insert() modifies the list, and it is careful to initialize
+// a node and use release-stores to publish the nodes in one or
+// more lists.
+//
+// ... prev vs. next pointer ordering ...
+//
+
+#pragma once
+#include <assert.h>
+#include <stdlib.h>
+#include "util/arena.h"
+#include "port/port.h"
+#include "util/arena.h"
+#include "util/random.h"
+
+namespace rocksdb {
+
+template <typename Key, class Comparator>
+class SkipList {
+ private:
+  struct Node;
+
+ public:
+  // Create a new SkipList object that will use "cmp" for comparing keys,
+  // and will allocate memory using "*arena". Objects allocated in the arena
+  // must remain allocated for the lifetime of the skiplist object.
+  explicit SkipList(Comparator cmp, Arena* arena,
+                    int32_t max_height = 12, int32_t branching_factor = 4);
+
+  // Insert key into the list.
+  // REQUIRES: nothing that compares equal to key is currently in the list.
+  void Insert(const Key& key);
+
+  // Returns true iff an entry that compares equal to key is in the list.
+  bool Contains(const Key& key) const;
+
+  // Iteration over the contents of a skip list
+  class Iterator {
+   public:
+    // Initialize an iterator over the specified list.
+    // The returned iterator is not valid.
+    explicit Iterator(const SkipList* list);
+
+    // Change the underlying skiplist used for this iterator.
+    // This lets us reuse an iterator without deallocating the old one
+    // and then allocating a new one.
+    void SetList(const SkipList* list);
+
+    // Returns true iff the iterator is positioned at a valid node.
+    bool Valid() const;
+
+    // Returns the key at the current position.
+    // REQUIRES: Valid()
+    const Key& key() const;
+
+    // Advances to the next position.
+    // REQUIRES: Valid()
+    void Next();
+
+    // Advances to the previous position.
+    // REQUIRES: Valid()
+    void Prev();
+
+    // Advance to the first entry with a key >= target
+    void Seek(const Key& target);
+
+    // Position at the first entry in list.
+    // Final state of iterator is Valid() iff list is not empty.
+    void SeekToFirst();
+
+    // Position at the last entry in list.
+    // Final state of iterator is Valid() iff list is not empty.
+    void SeekToLast();
+
+   private:
+    const SkipList* list_;
+    Node* node_;
+    // Intentionally copyable
+  };
+
+ private:
+  const int32_t kMaxHeight_;
+  const int32_t kBranching_;
+
+  // Immutable after construction
+  Comparator const compare_;
+  Arena* const arena_;    // Arena used for allocations of nodes
+
+  Node* const head_;
+
+  // Modified only by Insert(). Read racily by readers, but stale
+  // values are ok.
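+  // (A reader that sees a stale height simply starts its search one level
+  // lower, or follows a still-nullptr higher-level pointer from head_ and
+  // immediately drops down a level; see the detailed comment in Insert()
+  // below.)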
+  port::AtomicPointer max_height_;   // Height of the entire list
+
+  // Used for optimizing sequential insert patterns
+  Node** prev_;
+  int32_t prev_height_;
+
+  inline int GetMaxHeight() const {
+    return static_cast<int>(
+        reinterpret_cast<intptr_t>(max_height_.NoBarrier_Load()));
+  }
+
+  // Read/written only by Insert().
+  Random rnd_;
+
+  Node* NewNode(const Key& key, int height);
+  int RandomHeight();
+  bool Equal(const Key& a, const Key& b) const { return (compare_(a, b) == 0); }
+
+  // Return true if key is greater than the data stored in "n"
+  bool KeyIsAfterNode(const Key& key, Node* n) const;
+
+  // Return the earliest node that comes at or after key.
+  // Return nullptr if there is no such node.
+  //
+  // If prev is non-nullptr, fills prev[level] with pointer to previous
+  // node at "level" for every level in [0..max_height_-1].
+  Node* FindGreaterOrEqual(const Key& key, Node** prev) const;
+
+  // Return the latest node with a key < key.
+  // Return head_ if there is no such node.
+  Node* FindLessThan(const Key& key) const;
+
+  // Return the last node in the list.
+  // Return head_ if list is empty.
+  Node* FindLast() const;
+
+  // No copying allowed
+  SkipList(const SkipList&);
+  void operator=(const SkipList&);
+};
+
+// Implementation details follow
+template <typename Key, class Comparator>
+struct SkipList<Key, Comparator>::Node {
+  explicit Node(const Key& k) : key(k) { }
+
+  Key const key;
+
+  // Accessors/mutators for links.  Wrapped in methods so we can
+  // add the appropriate barriers as necessary.
+  Node* Next(int n) {
+    assert(n >= 0);
+    // Use an 'acquire load' so that we observe a fully initialized
+    // version of the returned Node.
+    return reinterpret_cast<Node*>(next_[n].Acquire_Load());
+  }
+  void SetNext(int n, Node* x) {
+    assert(n >= 0);
+    // Use a 'release store' so that anybody who reads through this
+    // pointer observes a fully initialized version of the inserted node.
+    next_[n].Release_Store(x);
+  }
+
+  // No-barrier variants that can be safely used in a few locations.
+  Node* NoBarrier_Next(int n) {
+    assert(n >= 0);
+    return reinterpret_cast<Node*>(next_[n].NoBarrier_Load());
+  }
+  void NoBarrier_SetNext(int n, Node* x) {
+    assert(n >= 0);
+    next_[n].NoBarrier_Store(x);
+  }
+
+ private:
+  // Array of length equal to the node height.  next_[0] is lowest level link.
+  port::AtomicPointer next_[1];
+};
+
+template <typename Key, class Comparator>
+typename SkipList<Key, Comparator>::Node*
+SkipList<Key, Comparator>::NewNode(const Key& key, int height) {
+  char* mem = arena_->AllocateAligned(
+      sizeof(Node) + sizeof(port::AtomicPointer) * (height - 1));
+  return new (mem) Node(key);
+}
+
+template <typename Key, class Comparator>
+inline SkipList<Key, Comparator>::Iterator::Iterator(const SkipList* list) {
+  SetList(list);
+}
+
+template <typename Key, class Comparator>
+inline void SkipList<Key, Comparator>::Iterator::SetList(const SkipList* list) {
+  list_ = list;
+  node_ = nullptr;
+}
+
+template <typename Key, class Comparator>
+inline bool SkipList<Key, Comparator>::Iterator::Valid() const {
+  return node_ != nullptr;
+}
+
+template <typename Key, class Comparator>
+inline const Key& SkipList<Key, Comparator>::Iterator::key() const {
+  assert(Valid());
+  return node_->key;
+}
+
+template <typename Key, class Comparator>
+inline void SkipList<Key, Comparator>::Iterator::Next() {
+  assert(Valid());
+  node_ = node_->Next(0);
+}
+
+template <typename Key, class Comparator>
+inline void SkipList<Key, Comparator>::Iterator::Prev() {
+  // Instead of using explicit "prev" links, we just search for the
+  // last node that falls before key.
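+  // (Note this makes Prev() an expected O(log n) search from the head,
+  // unlike Next(), which is a single O(1) pointer step; the structure
+  // stores no per-node back-links.)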
+  assert(Valid());
+  node_ = list_->FindLessThan(node_->key);
+  if (node_ == list_->head_) {
+    node_ = nullptr;
+  }
+}
+
+template <typename Key, class Comparator>
+inline void SkipList<Key, Comparator>::Iterator::Seek(const Key& target) {
+  node_ = list_->FindGreaterOrEqual(target, nullptr);
+}
+
+template <typename Key, class Comparator>
+inline void SkipList<Key, Comparator>::Iterator::SeekToFirst() {
+  node_ = list_->head_->Next(0);
+}
+
+template <typename Key, class Comparator>
+inline void SkipList<Key, Comparator>::Iterator::SeekToLast() {
+  node_ = list_->FindLast();
+  if (node_ == list_->head_) {
+    node_ = nullptr;
+  }
+}
+
+template <typename Key, class Comparator>
+int SkipList<Key, Comparator>::RandomHeight() {
+  // Increase height with probability 1 in kBranching
+  int height = 1;
+  while (height < kMaxHeight_ && ((rnd_.Next() % kBranching_) == 0)) {
+    height++;
+  }
+  assert(height > 0);
+  assert(height <= kMaxHeight_);
+  return height;
+}
+
+template <typename Key, class Comparator>
+bool SkipList<Key, Comparator>::KeyIsAfterNode(const Key& key, Node* n) const {
+  // nullptr n is considered infinite
+  return (n != nullptr) && (compare_(n->key, key) < 0);
+}
+
+template <typename Key, class Comparator>
+typename SkipList<Key, Comparator>::Node*
+SkipList<Key, Comparator>::FindGreaterOrEqual(const Key& key,
+                                              Node** prev) const {
+  // Use prev as an optimization hint and fallback to slow path
+  if (prev && !KeyIsAfterNode(key, prev[0]->Next(0))) {
+    Node* x = prev[0];
+    Node* next = x->Next(0);
+    if ((x == head_) || KeyIsAfterNode(key, x)) {
+      // Adjust all relevant insertion points to the previous entry
+      for (int i = 1; i < prev_height_; i++) {
+        prev[i] = x;
+      }
+      return next;
+    }
+  }
+  // Normal lookup
+  Node* x = head_;
+  int level = GetMaxHeight() - 1;
+  while (true) {
+    Node* next = x->Next(level);
+    // Make sure the lists are sorted.
+    // If x points to head_ or next points nullptr, it is trivially satisfied.
+    assert((x == head_) || (next == nullptr) || KeyIsAfterNode(next->key, x));
+    if (KeyIsAfterNode(key, next)) {
+      // Keep searching in this list
+      x = next;
+    } else {
+      if (prev != nullptr) prev[level] = x;
+      if (level == 0) {
+        return next;
+      } else {
+        // Switch to next list
+        level--;
+      }
+    }
+  }
+}
+
+template <typename Key, class Comparator>
+typename SkipList<Key, Comparator>::Node*
+SkipList<Key, Comparator>::FindLessThan(const Key& key) const {
+  Node* x = head_;
+  int level = GetMaxHeight() - 1;
+  while (true) {
+    assert(x == head_ || compare_(x->key, key) < 0);
+    Node* next = x->Next(level);
+    if (next == nullptr || compare_(next->key, key) >= 0) {
+      if (level == 0) {
+        return x;
+      } else {
+        // Switch to next list
+        level--;
+      }
+    } else {
+      x = next;
+    }
+  }
+}
+
+template <typename Key, class Comparator>
+typename SkipList<Key, Comparator>::Node*
+SkipList<Key, Comparator>::FindLast() const {
+  Node* x = head_;
+  int level = GetMaxHeight() - 1;
+  while (true) {
+    Node* next = x->Next(level);
+    if (next == nullptr) {
+      if (level == 0) {
+        return x;
+      } else {
+        // Switch to next list
+        level--;
+      }
+    } else {
+      x = next;
+    }
+  }
+}
+
+template <typename Key, class Comparator>
+SkipList<Key, Comparator>::SkipList(const Comparator cmp, Arena* arena,
+                                    int32_t max_height,
+                                    int32_t branching_factor)
+    : kMaxHeight_(max_height),
+      kBranching_(branching_factor),
+      compare_(cmp),
+      arena_(arena),
+      head_(NewNode(0 /* any key will do */, max_height)),
+      max_height_(reinterpret_cast<void*>(1)),
+      prev_height_(1),
+      rnd_(0xdeadbeef) {
+  assert(kMaxHeight_ > 0);
+  assert(kBranching_ > 0);
+  // Allocate the prev_ Node* array, directly from the passed-in arena.
+  // prev_ does not need to be freed, as its life cycle is tied up with
+  // the arena as a whole.
+  prev_ = (Node**) arena_->AllocateAligned(sizeof(Node*) * kMaxHeight_);
+  for (int i = 0; i < kMaxHeight_; i++) {
+    head_->SetNext(i, nullptr);
+    prev_[i] = head_;
+  }
+}
+
+template <typename Key, class Comparator>
+void SkipList<Key, Comparator>::Insert(const Key& key) {
+  // TODO(opt): We can use a barrier-free variant of FindGreaterOrEqual()
+  // here since Insert() is externally synchronized.
+  Node* x = FindGreaterOrEqual(key, prev_);
+
+  // Our data structure does not allow duplicate insertion
+  assert(x == nullptr || !Equal(key, x->key));
+
+  int height = RandomHeight();
+  if (height > GetMaxHeight()) {
+    for (int i = GetMaxHeight(); i < height; i++) {
+      prev_[i] = head_;
+    }
+    //fprintf(stderr, "Change height from %d to %d\n", max_height_, height);
+
+    // It is ok to mutate max_height_ without any synchronization
+    // with concurrent readers.  A concurrent reader that observes
+    // the new value of max_height_ will see either the old value of
+    // new level pointers from head_ (nullptr), or a new value set in
+    // the loop below.  In the former case the reader will
+    // immediately drop to the next level since nullptr sorts after all
+    // keys.  In the latter case the reader will use the new node.
+    max_height_.NoBarrier_Store(reinterpret_cast<void*>(height));
+  }
+
+  x = NewNode(key, height);
+  for (int i = 0; i < height; i++) {
+    // NoBarrier_SetNext() suffices since we will add a barrier when
+    // we publish a pointer to "x" in prev[i].
+    x->NoBarrier_SetNext(i, prev_[i]->NoBarrier_Next(i));
+    prev_[i]->SetNext(i, x);
+  }
+  prev_[0] = x;
+  prev_height_ = height;
+}
+
+template <typename Key, class Comparator>
+bool SkipList<Key, Comparator>::Contains(const Key& key) const {
+  Node* x = FindGreaterOrEqual(key, nullptr);
+  if (x != nullptr && Equal(key, x->key)) {
+    return true;
+  } else {
+    return false;
+  }
+}
+
+}  // namespace rocksdb
diff --git a/db/skiplist_test.cc b/db/skiplist_test.cc
new file mode 100644
index 0000000000..b87ddcbb03
--- /dev/null
+++ b/db/skiplist_test.cc
@@ -0,0 +1,383 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
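+//
+// Quick orientation -- minimal single-threaded use of the SkipList under
+// test (a sketch using the Key and TestComparator types defined just
+// below):
+//
+//   Arena arena;
+//   TestComparator cmp;
+//   SkipList<Key, TestComparator> list(cmp, &arena);
+//   list.Insert(42);
+//   assert(list.Contains(42));
+//   SkipList<Key, TestComparator>::Iterator it(&list);
+//   it.Seek(42);                  // lands on the first entry >= 42
+//   assert(it.Valid() && it.key() == 42);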
+ +#include "db/skiplist.h" +#include +#include "rocksdb/env.h" +#include "util/arena.h" +#include "util/hash.h" +#include "util/random.h" +#include "util/testharness.h" + +namespace rocksdb { + +typedef uint64_t Key; + +struct TestComparator { + int operator()(const Key& a, const Key& b) const { + if (a < b) { + return -1; + } else if (a > b) { + return +1; + } else { + return 0; + } + } +}; + +class SkipTest { }; + +TEST(SkipTest, Empty) { + Arena arena; + TestComparator cmp; + SkipList list(cmp, &arena); + ASSERT_TRUE(!list.Contains(10)); + + SkipList::Iterator iter(&list); + ASSERT_TRUE(!iter.Valid()); + iter.SeekToFirst(); + ASSERT_TRUE(!iter.Valid()); + iter.Seek(100); + ASSERT_TRUE(!iter.Valid()); + iter.SeekToLast(); + ASSERT_TRUE(!iter.Valid()); +} + +TEST(SkipTest, InsertAndLookup) { + const int N = 2000; + const int R = 5000; + Random rnd(1000); + std::set keys; + Arena arena; + TestComparator cmp; + SkipList list(cmp, &arena); + for (int i = 0; i < N; i++) { + Key key = rnd.Next() % R; + if (keys.insert(key).second) { + list.Insert(key); + } + } + + for (int i = 0; i < R; i++) { + if (list.Contains(i)) { + ASSERT_EQ(keys.count(i), 1U); + } else { + ASSERT_EQ(keys.count(i), 0U); + } + } + + // Simple iterator tests + { + SkipList::Iterator iter(&list); + ASSERT_TRUE(!iter.Valid()); + + iter.Seek(0); + ASSERT_TRUE(iter.Valid()); + ASSERT_EQ(*(keys.begin()), iter.key()); + + iter.SeekToFirst(); + ASSERT_TRUE(iter.Valid()); + ASSERT_EQ(*(keys.begin()), iter.key()); + + iter.SeekToLast(); + ASSERT_TRUE(iter.Valid()); + ASSERT_EQ(*(keys.rbegin()), iter.key()); + } + + // Forward iteration test + for (int i = 0; i < R; i++) { + SkipList::Iterator iter(&list); + iter.Seek(i); + + // Compare against model iterator + std::set::iterator model_iter = keys.lower_bound(i); + for (int j = 0; j < 3; j++) { + if (model_iter == keys.end()) { + ASSERT_TRUE(!iter.Valid()); + break; + } else { + ASSERT_TRUE(iter.Valid()); + ASSERT_EQ(*model_iter, iter.key()); + ++model_iter; + iter.Next(); + } + } + } + + // Backward iteration test + { + SkipList::Iterator iter(&list); + iter.SeekToLast(); + + // Compare against model iterator + for (std::set::reverse_iterator model_iter = keys.rbegin(); + model_iter != keys.rend(); + ++model_iter) { + ASSERT_TRUE(iter.Valid()); + ASSERT_EQ(*model_iter, iter.key()); + iter.Prev(); + } + ASSERT_TRUE(!iter.Valid()); + } +} + +// We want to make sure that with a single writer and multiple +// concurrent readers (with no synchronization other than when a +// reader's iterator is created), the reader always observes all the +// data that was present in the skip list when the iterator was +// constructor. Because insertions are happening concurrently, we may +// also observe new values that were inserted since the iterator was +// constructed, but we should never miss any values that were present +// at iterator construction time. +// +// We generate multi-part keys: +// +// where: +// key is in range [0..K-1] +// gen is a generation number for key +// hash is hash(key,gen) +// +// The insertion code picks a random key, sets gen to be 1 + the last +// generation number inserted for that key, and sets hash to Hash(key,gen). +// +// At the beginning of a read, we snapshot the last inserted +// generation number for each key. We then iterate, including random +// calls to Next() and Seek(). For every key we encounter, we +// check that it is either expected given the initial snapshot or has +// been concurrently added since the iterator started. 
+class ConcurrentTest { + private: + static const uint32_t K = 4; + + static uint64_t key(Key key) { return (key >> 40); } + static uint64_t gen(Key key) { return (key >> 8) & 0xffffffffu; } + static uint64_t hash(Key key) { return key & 0xff; } + + static uint64_t HashNumbers(uint64_t k, uint64_t g) { + uint64_t data[2] = { k, g }; + return Hash(reinterpret_cast(data), sizeof(data), 0); + } + + static Key MakeKey(uint64_t k, uint64_t g) { + assert(sizeof(Key) == sizeof(uint64_t)); + assert(k <= K); // We sometimes pass K to seek to the end of the skiplist + assert(g <= 0xffffffffu); + return ((k << 40) | (g << 8) | (HashNumbers(k, g) & 0xff)); + } + + static bool IsValidKey(Key k) { + return hash(k) == (HashNumbers(key(k), gen(k)) & 0xff); + } + + static Key RandomTarget(Random* rnd) { + switch (rnd->Next() % 10) { + case 0: + // Seek to beginning + return MakeKey(0, 0); + case 1: + // Seek to end + return MakeKey(K, 0); + default: + // Seek to middle + return MakeKey(rnd->Next() % K, 0); + } + } + + // Per-key generation + struct State { + port::AtomicPointer generation[K]; + void Set(int k, intptr_t v) { + generation[k].Release_Store(reinterpret_cast(v)); + } + intptr_t Get(int k) { + return reinterpret_cast(generation[k].Acquire_Load()); + } + + State() { + for (unsigned int k = 0; k < K; k++) { + Set(k, 0); + } + } + }; + + // Current state of the test + State current_; + + Arena arena_; + + // SkipList is not protected by mu_. We just use a single writer + // thread to modify it. + SkipList list_; + + public: + ConcurrentTest() : list_(TestComparator(), &arena_) {} + + // REQUIRES: External synchronization + void WriteStep(Random* rnd) { + const uint32_t k = rnd->Next() % K; + const intptr_t g = current_.Get(k) + 1; + const Key key = MakeKey(k, g); + list_.Insert(key); + current_.Set(k, g); + } + + void ReadStep(Random* rnd) { + // Remember the initial committed state of the skiplist. + State initial_state; + for (unsigned int k = 0; k < K; k++) { + initial_state.Set(k, current_.Get(k)); + } + + Key pos = RandomTarget(rnd); + SkipList::Iterator iter(&list_); + iter.Seek(pos); + while (true) { + Key current; + if (!iter.Valid()) { + current = MakeKey(K, 0); + } else { + current = iter.key(); + ASSERT_TRUE(IsValidKey(current)) << current; + } + ASSERT_LE(pos, current) << "should not go backwards"; + + // Verify that everything in [pos,current) was not present in + // initial_state. + while (pos < current) { + ASSERT_LT(key(pos), K) << pos; + + // Note that generation 0 is never inserted, so it is ok if + // <*,0,*> is missing. + ASSERT_TRUE((gen(pos) == 0U) || + (gen(pos) > (uint64_t)initial_state.Get(key(pos))) + ) << "key: " << key(pos) + << "; gen: " << gen(pos) + << "; initgen: " + << initial_state.Get(key(pos)); + + // Advance to next key in the valid key space + if (key(pos) < key(current)) { + pos = MakeKey(key(pos) + 1, 0); + } else { + pos = MakeKey(key(pos), gen(pos) + 1); + } + } + + if (!iter.Valid()) { + break; + } + + if (rnd->Next() % 2) { + iter.Next(); + pos = MakeKey(key(pos), gen(pos) + 1); + } else { + Key new_target = RandomTarget(rnd); + if (new_target > pos) { + pos = new_target; + iter.Seek(new_target); + } + } + } + } +}; +const uint32_t ConcurrentTest::K; + +// Simple test that does single-threaded testing of the ConcurrentTest +// scaffolding. 
+TEST(SkipTest, ConcurrentWithoutThreads) { + ConcurrentTest test; + Random rnd(test::RandomSeed()); + for (int i = 0; i < 10000; i++) { + test.ReadStep(&rnd); + test.WriteStep(&rnd); + } +} + +class TestState { + public: + ConcurrentTest t_; + int seed_; + port::AtomicPointer quit_flag_; + + enum ReaderState { + STARTING, + RUNNING, + DONE + }; + + explicit TestState(int s) + : seed_(s), + quit_flag_(nullptr), + state_(STARTING), + state_cv_(&mu_) {} + + void Wait(ReaderState s) { + mu_.Lock(); + while (state_ != s) { + state_cv_.Wait(); + } + mu_.Unlock(); + } + + void Change(ReaderState s) { + mu_.Lock(); + state_ = s; + state_cv_.Signal(); + mu_.Unlock(); + } + + private: + port::Mutex mu_; + ReaderState state_; + port::CondVar state_cv_; +}; + +static void ConcurrentReader(void* arg) { + TestState* state = reinterpret_cast(arg); + Random rnd(state->seed_); + int64_t reads = 0; + state->Change(TestState::RUNNING); + while (!state->quit_flag_.Acquire_Load()) { + state->t_.ReadStep(&rnd); + ++reads; + } + state->Change(TestState::DONE); +} + +static void RunConcurrent(int run) { + const int seed = test::RandomSeed() + (run * 100); + Random rnd(seed); + const int N = 1000; + const int kSize = 1000; + for (int i = 0; i < N; i++) { + if ((i % 100) == 0) { + fprintf(stderr, "Run %d of %d\n", i, N); + } + TestState state(seed + 1); + Env::Default()->Schedule(ConcurrentReader, &state); + state.Wait(TestState::RUNNING); + for (int i = 0; i < kSize; i++) { + state.t_.WriteStep(&rnd); + } + state.quit_flag_.Release_Store(&state); // Any non-nullptr arg will do + state.Wait(TestState::DONE); + } +} + +TEST(SkipTest, Concurrent1) { RunConcurrent(1); } +TEST(SkipTest, Concurrent2) { RunConcurrent(2); } +TEST(SkipTest, Concurrent3) { RunConcurrent(3); } +TEST(SkipTest, Concurrent4) { RunConcurrent(4); } +TEST(SkipTest, Concurrent5) { RunConcurrent(5); } + +} // namespace rocksdb + +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +} diff --git a/db/snapshot.h b/db/snapshot.h new file mode 100644 index 0000000000..2c2e3eac80 --- /dev/null +++ b/db/snapshot.h @@ -0,0 +1,86 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include "rocksdb/db.h" + +namespace rocksdb { + +class SnapshotList; + +// Snapshots are kept in a doubly-linked list in the DB. +// Each SnapshotImpl corresponds to a particular sequence number. 
+class SnapshotImpl : public Snapshot {
+ public:
+  SequenceNumber number_;  // const after creation
+
+ private:
+  friend class SnapshotList;
+
+  // SnapshotImpl is kept in a doubly-linked circular list
+  SnapshotImpl* prev_;
+  SnapshotImpl* next_;
+
+  SnapshotList* list_;                 // just for sanity checks
+};
+
+class SnapshotList {
+ public:
+  SnapshotList() {
+    list_.prev_ = &list_;
+    list_.next_ = &list_;
+    list_.number_ = 0xFFFFFFFFL;      // placeholder marker, for debugging
+  }
+
+  bool empty() const { return list_.next_ == &list_; }
+  SnapshotImpl* oldest() const { assert(!empty()); return list_.next_; }
+  SnapshotImpl* newest() const { assert(!empty()); return list_.prev_; }
+
+  const SnapshotImpl* New(SequenceNumber seq) {
+    SnapshotImpl* s = new SnapshotImpl;
+    s->number_ = seq;
+    s->list_ = this;
+    s->next_ = &list_;
+    s->prev_ = list_.prev_;
+    s->prev_->next_ = s;
+    s->next_->prev_ = s;
+    return s;
+  }
+
+  void Delete(const SnapshotImpl* s) {
+    assert(s->list_ == this);
+    s->prev_->next_ = s->next_;
+    s->next_->prev_ = s->prev_;
+    delete s;
+  }
+
+  // retrieve all snapshot numbers. They are sorted in ascending order.
+  void getAll(std::vector<SequenceNumber>& ret) {
+    if (empty()) return;
+    SnapshotImpl* s = &list_;
+    while (s->next_ != &list_) {
+      ret.push_back(s->next_->number_);
+      s = s->next_;
+    }
+  }
+
+  // get the sequence number of the most recent snapshot
+  const SequenceNumber GetNewest() {
+    if (empty()) {
+      return 0;
+    }
+    return newest()->number_;
+  }
+
+ private:
+  // Dummy head of doubly-linked list of snapshots
+  SnapshotImpl list_;
+};
+
+}  // namespace rocksdb
diff --git a/db/table_cache.cc b/db/table_cache.cc
new file mode 100644
index 0000000000..f4757cbfe4
--- /dev/null
+++ b/db/table_cache.cc
@@ -0,0 +1,198 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+ +#include "db/table_cache.h" + +#include "db/filename.h" +#include "db/version_edit.h" + +#include "rocksdb/statistics.h" +#include "table/iterator_wrapper.h" +#include "table/table_reader.h" +#include "util/coding.h" +#include "util/stop_watch.h" + +namespace rocksdb { + +static void DeleteEntry(const Slice& key, void* value) { + TableReader* table_reader = reinterpret_cast<TableReader*>(value); + delete table_reader; +} + +static void UnrefEntry(void* arg1, void* arg2) { + Cache* cache = reinterpret_cast<Cache*>(arg1); + Cache::Handle* h = reinterpret_cast<Cache::Handle*>(arg2); + cache->Release(h); +} + +static Slice GetSliceForFileNumber(uint64_t* file_number) { + return Slice(reinterpret_cast<const char*>(file_number), + sizeof(*file_number)); +} + +TableCache::TableCache(const std::string& dbname, const Options* options, + const EnvOptions& storage_options, Cache* const cache) + : env_(options->env), + dbname_(dbname), + options_(options), + storage_options_(storage_options), + cache_(cache) {} + +TableCache::~TableCache() { +} + +TableReader* TableCache::GetTableReaderFromHandle(Cache::Handle* handle) { + return reinterpret_cast<TableReader*>(cache_->Value(handle)); +} + +void TableCache::ReleaseHandle(Cache::Handle* handle) { + cache_->Release(handle); +} + +Status TableCache::FindTable(const EnvOptions& toptions, + const InternalKeyComparator& internal_comparator, + uint64_t file_number, uint64_t file_size, + Cache::Handle** handle, bool* table_io, + const bool no_io) { + Status s; + Slice key = GetSliceForFileNumber(&file_number); + *handle = cache_->Lookup(key); + if (*handle == nullptr) { + if (no_io) { // Don't do IO and return a not-found status + return Status::Incomplete("Table not found in table_cache, no_io is set"); + } + if (table_io != nullptr) { + *table_io = true; // we had to do IO from storage + } + std::string fname = TableFileName(dbname_, file_number); + unique_ptr<RandomAccessFile> file; + unique_ptr<TableReader> table_reader; + s = env_->NewRandomAccessFile(fname, &file, toptions); + RecordTick(options_->statistics.get(), NO_FILE_OPENS); + if (s.ok()) { + if (options_->advise_random_on_open) { + file->Hint(RandomAccessFile::RANDOM); + } + StopWatch sw(env_, options_->statistics.get(), TABLE_OPEN_IO_MICROS); + s = options_->table_factory->NewTableReader( + *options_, toptions, internal_comparator, std::move(file), file_size, + &table_reader); + } + + if (!s.ok()) { + assert(table_reader == nullptr); + RecordTick(options_->statistics.get(), NO_FILE_ERRORS); + // We do not cache error results so that if the error is transient, + // or somebody repairs the file, we recover automatically.
+ } else { + assert(file.get() == nullptr); + *handle = cache_->Insert(key, table_reader.release(), 1, &DeleteEntry); + } + } + return s; +} + +Iterator* TableCache::NewIterator(const ReadOptions& options, + const EnvOptions& toptions, + const InternalKeyComparator& icomparator, + const FileMetaData& file_meta, + TableReader** table_reader_ptr, + bool for_compaction, Arena* arena) { + if (table_reader_ptr != nullptr) { + *table_reader_ptr = nullptr; + } + TableReader* table_reader = file_meta.table_reader; + Cache::Handle* handle = nullptr; + Status s; + if (table_reader == nullptr) { + s = FindTable(toptions, icomparator, file_meta.number, file_meta.file_size, + &handle, nullptr, options.read_tier == kBlockCacheTier); + if (!s.ok()) { + return NewErrorIterator(s, arena); + } + table_reader = GetTableReaderFromHandle(handle); + } + + Iterator* result = table_reader->NewIterator(options, arena); + if (handle != nullptr) { + result->RegisterCleanup(&UnrefEntry, cache_, handle); + } + if (table_reader_ptr != nullptr) { + *table_reader_ptr = table_reader; + } + + if (for_compaction) { + table_reader->SetupForCompaction(); + } + + return result; +} + +Status TableCache::Get(const ReadOptions& options, + const InternalKeyComparator& internal_comparator, + const FileMetaData& file_meta, const Slice& k, void* arg, + bool (*saver)(void*, const ParsedInternalKey&, + const Slice&, bool), + bool* table_io, void (*mark_key_may_exist)(void*)) { + TableReader* t = file_meta.table_reader; + Status s; + Cache::Handle* handle = nullptr; + if (!t) { + s = FindTable(storage_options_, internal_comparator, file_meta.number, + file_meta.file_size, &handle, table_io, + options.read_tier == kBlockCacheTier); + if (s.ok()) { + t = GetTableReaderFromHandle(handle); + } + } + if (s.ok()) { + s = t->Get(options, k, arg, saver, mark_key_may_exist); + if (handle != nullptr) { + ReleaseHandle(handle); + } + } else if (options.read_tier && s.IsIncomplete()) { + // Couldn't find the table in the cache, but treat it as kFound if no_io + // is set + (*mark_key_may_exist)(arg); + return Status::OK(); + } + return s; +} +Status TableCache::GetTableProperties( + const EnvOptions& toptions, + const InternalKeyComparator& internal_comparator, + const FileMetaData& file_meta, + std::shared_ptr<const TableProperties>* properties, bool no_io) { + Status s; + auto table_reader = file_meta.table_reader; + // has the table already been pre-loaded? + if (table_reader) { + *properties = table_reader->GetTableProperties(); + + return s; + } + + bool table_io; + Cache::Handle* table_handle = nullptr; + s = FindTable(toptions, internal_comparator, file_meta.number, + file_meta.file_size, &table_handle, &table_io, no_io); + if (!s.ok()) { + return s; + } + assert(table_handle); + auto table = GetTableReaderFromHandle(table_handle); + *properties = table->GetTableProperties(); + ReleaseHandle(table_handle); + return s; +} + +void TableCache::Evict(Cache* cache, uint64_t file_number) { + cache->Erase(GetSliceForFileNumber(&file_number)); +} + +} // namespace rocksdb diff --git a/db/table_cache.h b/db/table_cache.h new file mode 100644 index 0000000000..1aa61db014 --- /dev/null +++ b/db/table_cache.h @@ -0,0 +1,95 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Thread-safe (provides internal synchronization) + +#pragma once +#include <string> +#include <stdint.h> + +#include "db/dbformat.h" +#include "port/port.h" +#include "rocksdb/cache.h" +#include "rocksdb/env.h" +#include "rocksdb/table.h" +#include "table/table_reader.h" + +namespace rocksdb { + +class Env; +class Arena; +struct FileMetaData; + +// TODO(sdong): try to come up with a better API to pass the file information +// other than simply passing FileMetaData. +class TableCache { + public: + TableCache(const std::string& dbname, const Options* options, + const EnvOptions& storage_options, Cache* cache); + ~TableCache(); + + // Return an iterator for the specified file number (the corresponding + // file length must be exactly "file_size" bytes). If "tableptr" is + // non-nullptr, also sets "*tableptr" to point to the Table object + // underlying the returned iterator, or nullptr if no Table object underlies + // the returned iterator. The returned "*tableptr" object is owned by + // the cache and should not be deleted, and is valid for as long as the + // returned iterator is live. + Iterator* NewIterator(const ReadOptions& options, const EnvOptions& toptions, + const InternalKeyComparator& internal_comparator, + const FileMetaData& file_meta, + TableReader** table_reader_ptr = nullptr, + bool for_compaction = false, Arena* arena = nullptr); + + // If a seek to internal key "k" in the specified file finds an entry, + // call (*handle_result)(arg, found_key, found_value) repeatedly until + // it returns false. + Status Get(const ReadOptions& options, + const InternalKeyComparator& internal_comparator, + const FileMetaData& file_meta, const Slice& k, void* arg, + bool (*handle_result)(void*, const ParsedInternalKey&, + const Slice&, bool), + bool* table_io, void (*mark_key_may_exist)(void*) = nullptr); + + // Evict any entry for the specified file number + static void Evict(Cache* cache, uint64_t file_number); + + // Find table reader + Status FindTable(const EnvOptions& toptions, + const InternalKeyComparator& internal_comparator, + uint64_t file_number, uint64_t file_size, Cache::Handle**, + bool* table_io = nullptr, const bool no_io = false); + + // Get TableReader from a cache handle. + TableReader* GetTableReaderFromHandle(Cache::Handle* handle); + + // Get the table properties of a given table. + // @no_io: if true, do not load the table into the cache when it is not + // already present there. + // @returns: `properties` will be reset on success. Please note that we will + // return Status::Incomplete() if the table is not present in the + // cache and `no_io` is set to true. + Status GetTableProperties(const EnvOptions& toptions, + const InternalKeyComparator& internal_comparator, + const FileMetaData& file_meta, + std::shared_ptr<const TableProperties>* properties, + bool no_io = false); + + // Release the handle from a cache + void ReleaseHandle(Cache::Handle* handle); + + private: + Env* const env_; + const std::string dbname_; + const Options* options_; + const EnvOptions& storage_options_; + Cache* const cache_; +}; + +} // namespace rocksdb diff --git a/db/table_properties_collector.cc b/db/table_properties_collector.cc new file mode 100644 index 0000000000..25bd700362 --- /dev/null +++ b/db/table_properties_collector.cc @@ -0,0 +1,83 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#include "db/table_properties_collector.h" + +#include "db/dbformat.h" +#include "util/coding.h" + +namespace rocksdb { + +Status InternalKeyPropertiesCollector::Add( + const Slice& key, const Slice& value) { + ParsedInternalKey ikey; + if (!ParseInternalKey(key, &ikey)) { + return Status::InvalidArgument("Invalid internal key"); + } + + if (ikey.type == ValueType::kTypeDeletion) { + ++deleted_keys_; + } + + return Status::OK(); +} + +Status InternalKeyPropertiesCollector::Finish( + UserCollectedProperties* properties) { + assert(properties); + assert(properties->find( + InternalKeyTablePropertiesNames::kDeletedKeys) == properties->end()); + std::string val; + + PutVarint64(&val, deleted_keys_); + properties->insert({ InternalKeyTablePropertiesNames::kDeletedKeys, val }); + + return Status::OK(); +} + +UserCollectedProperties +InternalKeyPropertiesCollector::GetReadableProperties() const { + return { + { "kDeletedKeys", std::to_string(deleted_keys_) } + }; +} + + +Status UserKeyTablePropertiesCollector::Add( + const Slice& key, const Slice& value) { + ParsedInternalKey ikey; + if (!ParseInternalKey(key, &ikey)) { + return Status::InvalidArgument("Invalid internal key"); + } + + return collector_->Add(ikey.user_key, value); +} + +Status UserKeyTablePropertiesCollector::Finish( + UserCollectedProperties* properties) { + return collector_->Finish(properties); +} + +UserCollectedProperties +UserKeyTablePropertiesCollector::GetReadableProperties() const { + return collector_->GetReadableProperties(); +} + + +const std::string InternalKeyTablePropertiesNames::kDeletedKeys + = "rocksdb.deleted.keys"; + +uint64_t GetDeletedKeys( + const UserCollectedProperties& props) { + auto pos = props.find(InternalKeyTablePropertiesNames::kDeletedKeys); + if (pos == props.end()) { + return 0; + } + Slice raw = pos->second; + uint64_t val = 0; + return GetVarint64(&raw, &val) ? val : 0; +} + +} // namespace rocksdb diff --git a/db/table_properties_collector.h b/db/table_properties_collector.h new file mode 100644 index 0000000000..aafcb52021 --- /dev/null +++ b/db/table_properties_collector.h @@ -0,0 +1,95 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// This file defines a collection of statistics collectors. +#pragma once + +#include "rocksdb/table_properties.h" + +#include <memory> +#include <string> +#include <vector> + +namespace rocksdb { + +struct InternalKeyTablePropertiesNames { + static const std::string kDeletedKeys; +}; + +// Collects statistics for internal keys. Visible only to internal +// rocksdb modules.
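+// +// The deleted-key count is encoded as a varint64 under +// InternalKeyTablePropertiesNames::kDeletedKeys ("rocksdb.deleted.keys") +// and can be recovered with the free function GetDeletedKeys(), e.g. +// (sketch; "props" is a TableProperties read back from a table file): +// +// uint64_t n = GetDeletedKeys(props.user_collected_properties);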
+class InternalKeyPropertiesCollector : public TablePropertiesCollector { + public: + virtual Status Add(const Slice& key, const Slice& value) override; + + virtual Status Finish(UserCollectedProperties* properties) override; + + virtual const char* Name() const override { + return "InternalKeyPropertiesCollector"; + } + + UserCollectedProperties GetReadableProperties() const override; + + private: + uint64_t deleted_keys_ = 0; +}; + +class InternalKeyPropertiesCollectorFactory + : public TablePropertiesCollectorFactory { + public: + virtual TablePropertiesCollector* CreateTablePropertiesCollector() { + return new InternalKeyPropertiesCollector(); + } + + virtual const char* Name() const override { + return "InternalKeyPropertiesCollectorFactory"; + } +}; + +// When rocksdb creates a new table, it will encode all "user keys" into +// "internal keys", which contain meta information for a given entry. +// +// This class extracts the user key from the encoded internal key when Add() +// is invoked. +class UserKeyTablePropertiesCollector : public TablePropertiesCollector { + public: + // transfer of ownership + explicit UserKeyTablePropertiesCollector(TablePropertiesCollector* collector) + : collector_(collector) {} + + virtual ~UserKeyTablePropertiesCollector() {} + + virtual Status Add(const Slice& key, const Slice& value) override; + + virtual Status Finish(UserCollectedProperties* properties) override; + + virtual const char* Name() const override { return collector_->Name(); } + + UserCollectedProperties GetReadableProperties() const override; + + protected: + std::unique_ptr<TablePropertiesCollector> collector_; +}; + +class UserKeyTablePropertiesCollectorFactory + : public TablePropertiesCollectorFactory { + public: + explicit UserKeyTablePropertiesCollectorFactory( + std::shared_ptr<TablePropertiesCollectorFactory> user_collector_factory) + : user_collector_factory_(user_collector_factory) {} + virtual TablePropertiesCollector* CreateTablePropertiesCollector() { + return new UserKeyTablePropertiesCollector( + user_collector_factory_->CreateTablePropertiesCollector()); + } + + virtual const char* Name() const override { + return user_collector_factory_->Name(); + } + + private: + std::shared_ptr<TablePropertiesCollectorFactory> user_collector_factory_; +}; + +} // namespace rocksdb diff --git a/db/table_properties_collector_test.cc b/db/table_properties_collector_test.cc new file mode 100644 index 0000000000..dd4e8d1107 --- /dev/null +++ b/db/table_properties_collector_test.cc @@ -0,0 +1,313 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#include <map> +#include <memory> +#include <string> + +#include "db/db_impl.h" +#include "db/dbformat.h" +#include "db/table_properties_collector.h" +#include "rocksdb/table.h" +#include "table/block_based_table_factory.h" +#include "table/meta_blocks.h" +#include "table/plain_table_factory.h" +#include "table/table_builder.h" +#include "util/coding.h" +#include "util/testharness.h" +#include "util/testutil.h" + +namespace rocksdb { + +class TablePropertiesTest { +}; + +// TODO(kailiu) the following classes should be moved to some more general +// places, so that other tests can also make use of them. +// `FakeWritableFile` and `FakeRandomeAccessFile` bypass the real file system +// and therefore enable us to quickly set up the tests.
+class FakeWritableFile : public WritableFile { + public: + ~FakeWritableFile() { } + + const std::string& contents() const { return contents_; } + + virtual Status Close() { return Status::OK(); } + virtual Status Flush() { return Status::OK(); } + virtual Status Sync() { return Status::OK(); } + + virtual Status Append(const Slice& data) { + contents_.append(data.data(), data.size()); + return Status::OK(); + } + + private: + std::string contents_; +}; + + +class FakeRandomeAccessFile : public RandomAccessFile { + public: + explicit FakeRandomeAccessFile(const Slice& contents) + : contents_(contents.data(), contents.size()) { + } + + virtual ~FakeRandomeAccessFile() { } + + uint64_t Size() const { return contents_.size(); } + + virtual Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const { + if (offset > contents_.size()) { + return Status::InvalidArgument("invalid Read offset"); + } + if (offset + n > contents_.size()) { + n = contents_.size() - offset; + } + memcpy(scratch, &contents_[offset], n); + *result = Slice(scratch, n); + return Status::OK(); + } + + private: + std::string contents_; +}; + + +class DumbLogger : public Logger { + public: + virtual void Logv(const char* format, va_list ap) { } + virtual size_t GetLogFileSize() const { return 0; } +}; + +// Utility test functions +namespace { +void MakeBuilder(const Options& options, + const InternalKeyComparator& internal_comparator, + std::unique_ptr<FakeWritableFile>* writable, + std::unique_ptr<TableBuilder>* builder) { + writable->reset(new FakeWritableFile); + builder->reset(options.table_factory->NewTableBuilder( + options, internal_comparator, writable->get(), options.compression)); +} +} // namespace + +// Collects keys that start with "A" in a table. +class RegularKeysStartWithA: public TablePropertiesCollector { + public: + const char* Name() const { return "RegularKeysStartWithA"; } + + Status Finish(UserCollectedProperties* properties) { + std::string encoded; + PutVarint32(&encoded, count_); + *properties = UserCollectedProperties { + { "TablePropertiesTest", "Rocksdb" }, + { "Count", encoded } + }; + return Status::OK(); + } + + Status Add(const Slice& user_key, const Slice& value) { + // simply assume all user keys are not empty. + if (user_key.data()[0] == 'A') { + ++count_; + } + return Status::OK(); + } + + virtual UserCollectedProperties GetReadableProperties() const { + return UserCollectedProperties{}; + } + + private: + uint32_t count_ = 0; +}; + +class RegularKeysStartWithAFactory : public TablePropertiesCollectorFactory { + public: + virtual TablePropertiesCollector* CreateTablePropertiesCollector() { + return new RegularKeysStartWithA(); + } + const char* Name() const { return "RegularKeysStartWithA"; } +}; + +extern uint64_t kBlockBasedTableMagicNumber; +extern uint64_t kPlainTableMagicNumber; +namespace { +void TestCustomizedTablePropertiesCollector( + uint64_t magic_number, bool encode_as_internal, const Options& options, + const InternalKeyComparator& internal_comparator) { + // make sure the entries will be inserted in order.
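+ // (std::map iterates its keys in sorted, i.e. lexicographic, order, so the + // three 'A'-prefixed keys below are added to the builder first; the test + // later expects exactly those three to be counted.)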
+ std::map<std::string, std::string> kvs = { + {"About ", "val5"}, // starts with 'A' + {"Abstract", "val2"}, // starts with 'A' + {"Around ", "val7"}, // starts with 'A' + {"Beyond ", "val3"}, + {"Builder ", "val1"}, + {"Cancel ", "val4"}, + {"Find ", "val6"}, + }; + + // -- Step 1: build table + std::unique_ptr<TableBuilder> builder; + std::unique_ptr<FakeWritableFile> writable; + MakeBuilder(options, internal_comparator, &writable, &builder); + + for (const auto& kv : kvs) { + if (encode_as_internal) { + InternalKey ikey(kv.first, 0, ValueType::kTypeValue); + builder->Add(ikey.Encode(), kv.second); + } else { + builder->Add(kv.first, kv.second); + } + } + ASSERT_OK(builder->Finish()); + + // -- Step 2: Read properties + FakeRandomeAccessFile readable(writable->contents()); + TableProperties* props; + Status s = ReadTableProperties( + &readable, + writable->contents().size(), + magic_number, + Env::Default(), + nullptr, + &props + ); + std::unique_ptr<TableProperties> props_guard(props); + ASSERT_OK(s); + + auto user_collected = props->user_collected_properties; + + ASSERT_EQ("Rocksdb", user_collected.at("TablePropertiesTest")); + + uint32_t starts_with_A = 0; + Slice key(user_collected.at("Count")); + ASSERT_TRUE(GetVarint32(&key, &starts_with_A)); + ASSERT_EQ(3u, starts_with_A); +} +} // namespace + +TEST(TablePropertiesTest, CustomizedTablePropertiesCollector) { + // Test properties collectors with internal keys or regular keys + // for block based table + for (bool encode_as_internal : { true, false }) { + Options options; + std::shared_ptr<TablePropertiesCollectorFactory> collector_factory( + new RegularKeysStartWithAFactory()); + if (encode_as_internal) { + options.table_properties_collector_factories.emplace_back( + new UserKeyTablePropertiesCollectorFactory(collector_factory)); + } else { + options.table_properties_collector_factories.resize(1); + options.table_properties_collector_factories[0] = collector_factory; + } + test::PlainInternalKeyComparator ikc(options.comparator); + TestCustomizedTablePropertiesCollector(kBlockBasedTableMagicNumber, + encode_as_internal, options, ikc); + } + + // test plain table + Options options; + options.table_properties_collector_factories.emplace_back( + new RegularKeysStartWithAFactory()); + options.table_factory = std::make_shared<PlainTableFactory>(8, 8, 0); + test::PlainInternalKeyComparator ikc(options.comparator); + TestCustomizedTablePropertiesCollector(kPlainTableMagicNumber, true, options, + ikc); +} + +namespace { +void TestInternalKeyPropertiesCollector( + uint64_t magic_number, + bool sanitized, + std::shared_ptr<TableFactory> table_factory) { + InternalKey keys[] = { + InternalKey("A ", 0, ValueType::kTypeValue), + InternalKey("B ", 0, ValueType::kTypeValue), + InternalKey("C ", 0, ValueType::kTypeValue), + InternalKey("W ", 0, ValueType::kTypeDeletion), + InternalKey("X ", 0, ValueType::kTypeDeletion), + InternalKey("Y ", 0, ValueType::kTypeDeletion), + InternalKey("Z ", 0, ValueType::kTypeDeletion), + }; + + std::unique_ptr<TableBuilder> builder; + std::unique_ptr<FakeWritableFile> writable; + Options options; + test::PlainInternalKeyComparator pikc(options.comparator); + + options.table_factory = table_factory; + if (sanitized) { + options.table_properties_collector_factories.emplace_back( + new RegularKeysStartWithAFactory()); + // with sanitization, even regular properties collector will be able to + // handle internal keys. + auto comparator = options.comparator; + // HACK: Set options.info_log to avoid writing log in + // SanitizeOptions().
+ options.info_log = std::make_shared<DumbLogger>(); + options = SanitizeOptions("db", // just a placeholder + &pikc, nullptr, // don't care filter policy + options); + options.comparator = comparator; + } else { + options.table_properties_collector_factories = { + std::make_shared<InternalKeyPropertiesCollectorFactory>()}; + } + + for (int iter = 0; iter < 2; ++iter) { + MakeBuilder(options, pikc, &writable, &builder); + for (const auto& k : keys) { + builder->Add(k.Encode(), "val"); + } + + ASSERT_OK(builder->Finish()); + + FakeRandomeAccessFile readable(writable->contents()); + TableProperties* props; + Status s = + ReadTableProperties(&readable, writable->contents().size(), + magic_number, Env::Default(), nullptr, &props); + ASSERT_OK(s); + + std::unique_ptr<TableProperties> props_guard(props); + auto user_collected = props->user_collected_properties; + uint64_t deleted = GetDeletedKeys(user_collected); + ASSERT_EQ(4u, deleted); + + if (sanitized) { + uint32_t starts_with_A = 0; + Slice key(user_collected.at("Count")); + ASSERT_TRUE(GetVarint32(&key, &starts_with_A)); + ASSERT_EQ(1u, starts_with_A); + } + } +} +} // namespace + +TEST(TablePropertiesTest, InternalKeyPropertiesCollector) { + TestInternalKeyPropertiesCollector( + kBlockBasedTableMagicNumber, + true /* sanitize */, + std::make_shared<BlockBasedTableFactory>() + ); + TestInternalKeyPropertiesCollector( + kBlockBasedTableMagicNumber, + false /* not sanitize */, + std::make_shared<BlockBasedTableFactory>() + ); + TestInternalKeyPropertiesCollector( + kPlainTableMagicNumber, + false /* not sanitize */, + std::make_shared<PlainTableFactory>(8, 8, 0) + ); +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +} diff --git a/db/tailing_iter.cc b/db/tailing_iter.cc new file mode 100644 index 0000000000..67b59b2c95 --- /dev/null +++ b/db/tailing_iter.cc @@ -0,0 +1,221 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#ifndef ROCKSDB_LITE +#include "db/tailing_iter.h" + +#include <limits> +#include <string> +#include <utility> +#include "db/db_impl.h" +#include "db/db_iter.h" +#include "db/column_family.h" +#include "rocksdb/env.h" +#include "rocksdb/slice.h" +#include "rocksdb/slice_transform.h" +#include "table/merger.h" + +namespace rocksdb { + +TailingIterator::TailingIterator(Env* const env, DBImpl* db, + const ReadOptions& read_options, ColumnFamilyData* cfd) + : env_(env), + db_(db), + read_options_(read_options), + cfd_(cfd), + super_version_(nullptr), + current_(nullptr), + status_(Status::InvalidArgument("Seek() not called on this iterator")) {} + +TailingIterator::~TailingIterator() { + Cleanup(); +} + +bool TailingIterator::Valid() const { + return current_ != nullptr; +} + +void TailingIterator::SeekToFirst() { + if (!IsCurrentVersion()) { + CreateIterators(); + } + + mutable_->SeekToFirst(); + immutable_->SeekToFirst(); + UpdateCurrent(); +} + +void TailingIterator::Seek(const Slice& target) { + if (!IsCurrentVersion()) { + CreateIterators(); + } + + mutable_->Seek(target); + + // We maintain the interval (prev_key_, immutable_->key()] such that there + // are no records with keys within that range in immutable_ other than + // immutable_->key(). Since immutable_ can't change in this version, we don't + // need to do a seek if 'target' belongs to that interval (i.e. immutable_ is + // already at the correct position)!
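+ // + // Worked example: with prev_key_ == "b" (exclusive) and + // immutable_->key() == "f", a Seek("d") needs no SeekImmutable(), since + // by the invariant no immutable entry other than "f" lies in ("b", "f"].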
+ // + // If prefix seek is used and immutable_ is not valid, seek if target has a + // different prefix than prev_key. + // + // prev_key_ is updated by Next(). SeekImmutable() sets prev_key_ to + // 'target' -- in this case, prev_key_ is included in the interval, so + // prev_inclusive_ has to be set. + + const Comparator* cmp = cfd_->user_comparator(); + if (!is_prev_set_ || cmp->Compare(prev_key_, target) >= !is_prev_inclusive_ || + (immutable_->Valid() && cmp->Compare(target, immutable_->key()) > 0) || + (cfd_->options()->prefix_extractor != nullptr && !IsSamePrefix(target))) { + SeekImmutable(target); + } + + UpdateCurrent(); +} + +void TailingIterator::Next() { + assert(Valid()); + + if (!IsCurrentVersion()) { + // save the current key, create new iterators and then seek + std::string current_key = key().ToString(); + Slice key_slice(current_key.data(), current_key.size()); + + CreateIterators(); + Seek(key_slice); + + if (!Valid() || key().compare(key_slice) != 0) { + // record with current_key no longer exists + return; + } + + } else if (current_ == immutable_.get()) { + // immutable iterator is advanced -- update prev_key_ + prev_key_ = key().ToString(); + is_prev_inclusive_ = false; + is_prev_set_ = true; + } + + current_->Next(); + UpdateCurrent(); +} + +Slice TailingIterator::key() const { + assert(Valid()); + return current_->key(); +} + +Slice TailingIterator::value() const { + assert(Valid()); + return current_->value(); +} + +Status TailingIterator::status() const { + if (!status_.ok()) { + return status_; + } else if (!mutable_->status().ok()) { + return mutable_->status(); + } else { + return immutable_->status(); + } +} + +void TailingIterator::Prev() { + status_ = Status::NotSupported("This iterator doesn't support Prev()"); +} + +void TailingIterator::SeekToLast() { + status_ = Status::NotSupported("This iterator doesn't support SeekToLast()"); +} + +void TailingIterator::Cleanup() { + // Release old super version if necessary + mutable_.reset(); + immutable_.reset(); + if (super_version_ != nullptr && super_version_->Unref()) { + DBImpl::DeletionState deletion_state; + db_->mutex_.Lock(); + super_version_->Cleanup(); + db_->FindObsoleteFiles(deletion_state, false, true); + db_->mutex_.Unlock(); + delete super_version_; + if (deletion_state.HaveSomethingToDelete()) { + db_->PurgeObsoleteFiles(deletion_state); + } + } +} + +void TailingIterator::CreateIterators() { + Cleanup(); + super_version_ = cfd_->GetReferencedSuperVersion(&(db_->mutex_)); + + Iterator* mutable_iter = super_version_->mem->NewIterator(read_options_); + // create a DBIter that only uses memtable content; see NewIterator() + mutable_.reset( + NewDBIterator(env_, *cfd_->options(), cfd_->user_comparator(), + mutable_iter, kMaxSequenceNumber)); + + std::vector<Iterator*> list; + super_version_->imm->AddIterators(read_options_, &list); + super_version_->current->AddIterators( + read_options_, *cfd_->soptions(), &list); + Iterator* immutable_iter = + NewMergingIterator(&cfd_->internal_comparator(), &list[0], list.size()); + + // create a DBIter over immutable memtables and SST files; see NewIterator() + immutable_.reset( + NewDBIterator(env_, *cfd_->options(), cfd_->user_comparator(), + immutable_iter, kMaxSequenceNumber)); + + current_ = nullptr; + is_prev_set_ = false; +} + +void TailingIterator::UpdateCurrent() { + current_ = nullptr; + + if (mutable_->Valid()) { + current_ = mutable_.get(); + } + const Comparator* cmp = cfd_->user_comparator(); + if (immutable_->Valid() && + (current_ == nullptr || + cmp->Compare(immutable_->key(), current_->key()) < 0)) { + current_ = immutable_.get(); + } + + if (!status_.ok()) { + // reset status that was set by Prev() or SeekToLast() + status_ = Status::OK(); + } +} + +bool TailingIterator::IsCurrentVersion() const { + return super_version_ != nullptr && + super_version_->version_number == cfd_->GetSuperVersionNumber(); +} + +bool TailingIterator::IsSamePrefix(const Slice& target) const { + const SliceTransform* extractor = cfd_->options()->prefix_extractor.get(); + + assert(extractor); + assert(is_prev_set_); + + return extractor->Transform(target) + .compare(extractor->Transform(prev_key_)) == 0; +} + +void TailingIterator::SeekImmutable(const Slice& target) { + prev_key_ = target.ToString(); + is_prev_inclusive_ = true; + is_prev_set_ = true; + + immutable_->Seek(target); +} + +} // namespace rocksdb +#endif // ROCKSDB_LITE diff --git a/db/tailing_iter.h b/db/tailing_iter.h new file mode 100644 index 0000000000..6b9c513753 --- /dev/null +++ b/db/tailing_iter.h @@ -0,0 +1,97 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +#pragma once + +#ifndef ROCKSDB_LITE + +#include <string> + +#include "rocksdb/db.h" +#include "rocksdb/iterator.h" +#include "rocksdb/options.h" + +namespace rocksdb { + +class DBImpl; +class Env; +struct SuperVersion; +class ColumnFamilyData; + +/** + * TailingIterator is a special type of iterator that doesn't use an (implicit) + * snapshot. In other words, it can be used to read data that was added to the + * db after the iterator had been created. + * + * TailingIterator is optimized for sequential reading. It doesn't support + * Prev() and SeekToLast() operations. + */ +class TailingIterator : public Iterator { + public: + TailingIterator(Env* const env, DBImpl* db, const ReadOptions& read_options, + ColumnFamilyData* cfd); + virtual ~TailingIterator(); + + virtual bool Valid() const override; + virtual void SeekToFirst() override; + virtual void SeekToLast() override; + virtual void Seek(const Slice& target) override; + virtual void Next() override; + virtual void Prev() override; + virtual Slice key() const override; + virtual Slice value() const override; + virtual Status status() const override; + + private: + void Cleanup(); + + Env* const env_; + DBImpl* const db_; + const ReadOptions read_options_; + ColumnFamilyData* const cfd_; + SuperVersion* super_version_; + + // TailingIterator merges the contents of the two iterators below (one using + // mutable memtable contents only, the other over SSTs and immutable + // memtables). See DBIter::GetTailingIteratorPair(). + std::unique_ptr<Iterator> mutable_; + std::unique_ptr<Iterator> immutable_; + + // points to either mutable_ or immutable_ + Iterator* current_; + + // key that precedes immutable iterator's current key + std::string prev_key_; + + // unless is_prev_set_ is true, prev_key_ is not valid and shouldn't be + // used; reset by CreateIterators() + bool is_prev_set_; + + // prev_key_ was set by SeekImmutable(), which means that the interval of + // keys covered by immutable_ is [prev_key_, current], i.e.
it includes the + // left endpoint + bool is_prev_inclusive_; + + // internal iterator status + Status status_; + + // check if this iterator's version matches DB's version + bool IsCurrentVersion() const; + + // check if SeekImmutable() is needed due to target having a different prefix + // than prev_key_ (used when in prefix seek mode) + bool IsSamePrefix(const Slice& target) const; + + // creates mutable_ and immutable_ iterators and updates version_number_ + void CreateIterators(); + + // set current_ to be one of the iterators with the smallest key + void UpdateCurrent(); + + // seek on immutable_ and update prev_key + void SeekImmutable(const Slice& target); +}; + +} // namespace rocksdb +#endif // ROCKSDB_LITE diff --git a/db/transaction_log_impl.cc b/db/transaction_log_impl.cc new file mode 100644 index 0000000000..bfcf7b3281 --- /dev/null +++ b/db/transaction_log_impl.cc @@ -0,0 +1,262 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#ifndef ROCKSDB_LITE +#include "db/transaction_log_impl.h" +#include "db/write_batch_internal.h" + +namespace rocksdb { + +TransactionLogIteratorImpl::TransactionLogIteratorImpl( + const std::string& dir, const DBOptions* options, + const TransactionLogIterator::ReadOptions& read_options, + const EnvOptions& soptions, const SequenceNumber seq, + std::unique_ptr<VectorLogPtr> files, DBImpl const* const dbimpl) + : dir_(dir), + options_(options), + read_options_(read_options), + soptions_(soptions), + startingSequenceNumber_(seq), + files_(std::move(files)), + started_(false), + isValid_(false), + currentFileIndex_(0), + currentBatchSeq_(0), + currentLastSeq_(0), + dbimpl_(dbimpl) { + assert(files_ != nullptr); + assert(dbimpl_ != nullptr); + + reporter_.env = options_->env; + reporter_.info_log = options_->info_log.get(); + SeekToStartSequence(); // Seek till starting sequence +} + +Status TransactionLogIteratorImpl::OpenLogFile( + const LogFile* logFile, + unique_ptr<SequentialFile>* file) { + Env* env = options_->env; + if (logFile->Type() == kArchivedLogFile) { + std::string fname = ArchivedLogFileName(dir_, logFile->LogNumber()); + return env->NewSequentialFile(fname, file, soptions_); + } else { + std::string fname = LogFileName(dir_, logFile->LogNumber()); + Status status = env->NewSequentialFile(fname, file, soptions_); + if (!status.ok()) { + // If cannot open file in DB directory. + // Try the archive dir, as it could have moved in the meanwhile. + fname = ArchivedLogFileName(dir_, logFile->LogNumber()); + status = env->NewSequentialFile(fname, file, soptions_); + } + return status; + } +} + +BatchResult TransactionLogIteratorImpl::GetBatch() { + assert(isValid_); // cannot be called in an invalid state.
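+ // Note: ownership of the current batch moves to the caller below (the + // writeBatchPtr is std::move'd), so a second GetBatch() without an + // intervening Next() would yield an empty batch.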
+ BatchResult result; + result.sequence = currentBatchSeq_; + result.writeBatchPtr = std::move(currentBatch_); + return result; +} + +Status TransactionLogIteratorImpl::status() { + return currentStatus_; +} + +bool TransactionLogIteratorImpl::Valid() { + return started_ && isValid_; +} + +bool TransactionLogIteratorImpl::RestrictedRead( + Slice* record, + std::string* scratch) { + // Don't read if no more complete entries to read from logs + if (currentLastSeq_ >= dbimpl_->GetLatestSequenceNumber()) { + return false; + } + return currentLogReader_->ReadRecord(record, scratch); +} + +void TransactionLogIteratorImpl::SeekToStartSequence( + uint64_t startFileIndex, + bool strict) { + std::string scratch; + Slice record; + started_ = false; + isValid_ = false; + if (files_->size() <= startFileIndex) { + return; + } + Status s = OpenLogReader(files_->at(startFileIndex).get()); + if (!s.ok()) { + currentStatus_ = s; + reporter_.Info(currentStatus_.ToString().c_str()); + return; + } + while (RestrictedRead(&record, &scratch)) { + if (record.size() < 12) { + reporter_.Corruption( + record.size(), Status::Corruption("very small log record")); + continue; + } + UpdateCurrentWriteBatch(record); + if (currentLastSeq_ >= startingSequenceNumber_) { + if (strict && currentBatchSeq_ != startingSequenceNumber_) { + currentStatus_ = Status::Corruption("Gap in sequence number. Could not " + "seek to required sequence number"); + reporter_.Info(currentStatus_.ToString().c_str()); + return; + } else if (strict) { + reporter_.Info("Could seek required sequence number. Iterator will " + "continue."); + } + isValid_ = true; + started_ = true; // set started_ as we could seek till starting sequence + return; + } else { + isValid_ = false; + } + } + + // Could not find start sequence in first file. Normally this must be the + // only file. Otherwise log the error and let the iterator return next entry + // If strict is set, we want to seek exactly till the start sequence and it + // should have been present in the file we scanned above + if (strict) { + currentStatus_ = Status::Corruption("Gap in sequence number. Could not " + "seek to required sequence number"); + reporter_.Info(currentStatus_.ToString().c_str()); + } else if (files_->size() != 1) { + currentStatus_ = Status::Corruption("Start sequence was not found, " + "skipping to the next available"); + reporter_.Info(currentStatus_.ToString().c_str()); + // Let NextImpl find the next available entry. 
started_ remains false + // because we don't want to check for gaps while moving to start sequence + NextImpl(true); + } +} + +void TransactionLogIteratorImpl::Next() { + return NextImpl(false); +} + +void TransactionLogIteratorImpl::NextImpl(bool internal) { + std::string scratch; + Slice record; + isValid_ = false; + if (!internal && !started_) { + // Runs every time until we can seek to the start sequence + return SeekToStartSequence(); + } + while (true) { + assert(currentLogReader_); + if (currentLogReader_->IsEOF()) { + currentLogReader_->UnmarkEOF(); + } + while (RestrictedRead(&record, &scratch)) { + if (record.size() < 12) { + reporter_.Corruption( + record.size(), Status::Corruption("very small log record")); + continue; + } else { + // started_ should be true if called by application + assert(internal || started_); + // started_ should be false if called internally + assert(!internal || !started_); + UpdateCurrentWriteBatch(record); + if (internal && !started_) { + started_ = true; + } + return; + } + } + + // Open the next file + if (currentFileIndex_ < files_->size() - 1) { + ++currentFileIndex_; + Status status = OpenLogReader(files_->at(currentFileIndex_).get()); + if (!status.ok()) { + isValid_ = false; + currentStatus_ = status; + return; + } + } else { + isValid_ = false; + if (currentLastSeq_ == dbimpl_->GetLatestSequenceNumber()) { + currentStatus_ = Status::OK(); + } else { + currentStatus_ = Status::Corruption("NO MORE DATA LEFT"); + } + return; + } + } +} + +bool TransactionLogIteratorImpl::IsBatchExpected( + const WriteBatch* batch, + const SequenceNumber expectedSeq) { + assert(batch); + SequenceNumber batchSeq = WriteBatchInternal::Sequence(batch); + if (batchSeq != expectedSeq) { + char buf[200]; + snprintf(buf, sizeof(buf), + "Discontinuity in log records. Got seq=%lu, Expected seq=%lu, " + "Last flushed seq=%lu. Log iterator will reseek the correct " + "batch.", + (unsigned long)batchSeq, + (unsigned long)expectedSeq, + (unsigned long)dbimpl_->GetLatestSequenceNumber()); + reporter_.Info(buf); + return false; + } + return true; +} + +void TransactionLogIteratorImpl::UpdateCurrentWriteBatch(const Slice& record) { + std::unique_ptr<WriteBatch> batch(new WriteBatch()); + WriteBatchInternal::SetContents(batch.get(), record); + + SequenceNumber expectedSeq = currentLastSeq_ + 1; + // If the iterator has started, then confirm that we get continuous batches + if (started_ && !IsBatchExpected(batch.get(), expectedSeq)) { + // Seek to the batch having expected sequence number + if (expectedSeq < files_->at(currentFileIndex_)->StartSequence()) { + // Expected batch must lie in the previous log file + // Avoid underflow.
+ if (currentFileIndex_ != 0) { + currentFileIndex_--; + } + } + startingSequenceNumber_ = expectedSeq; + // currentStatus_ will be set to Ok if reseek succeeds + currentStatus_ = Status::NotFound("Gap in sequence numbers"); + return SeekToStartSequence(currentFileIndex_, true); + } + + currentBatchSeq_ = WriteBatchInternal::Sequence(batch.get()); + currentLastSeq_ = currentBatchSeq_ + + WriteBatchInternal::Count(batch.get()) - 1; + // currentBatchSeq_ can only change here + assert(currentLastSeq_ <= dbimpl_->GetLatestSequenceNumber()); + + currentBatch_ = std::move(batch); + isValid_ = true; + currentStatus_ = Status::OK(); +} + +Status TransactionLogIteratorImpl::OpenLogReader(const LogFile* logFile) { + unique_ptr<SequentialFile> file; + Status status = OpenLogFile(logFile, &file); + if (!status.ok()) { + return status; + } + assert(file); + currentLogReader_.reset(new log::Reader(std::move(file), &reporter_, + read_options_.verify_checksums_, 0)); + return Status::OK(); +} +} // namespace rocksdb +#endif // ROCKSDB_LITE diff --git a/db/transaction_log_impl.h b/db/transaction_log_impl.h new file mode 100644 index 0000000000..319b01cb13 --- /dev/null +++ b/db/transaction_log_impl.h @@ -0,0 +1,120 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#ifndef ROCKSDB_LITE +#pragma once +#include <vector> + +#include "rocksdb/env.h" +#include "rocksdb/options.h" +#include "rocksdb/types.h" +#include "rocksdb/transaction_log.h" +#include "db/db_impl.h" +#include "db/log_reader.h" +#include "db/filename.h" + +namespace rocksdb { + +struct LogReporter : public log::Reader::Reporter { + Env* env; + Logger* info_log; + virtual void Corruption(size_t bytes, const Status& s) { + Log(info_log, "dropping %zu bytes; %s", bytes, s.ToString().c_str()); + } + virtual void Info(const char* s) { + Log(info_log, "%s", s); + } +}; + +class LogFileImpl : public LogFile { + public: + LogFileImpl(uint64_t logNum, WalFileType logType, SequenceNumber startSeq, + uint64_t sizeBytes) : + logNumber_(logNum), + type_(logType), + startSequence_(startSeq), + sizeFileBytes_(sizeBytes) { + } + + std::string PathName() const { + if (type_ == kArchivedLogFile) { + return ArchivedLogFileName("", logNumber_); + } + return LogFileName("", logNumber_); + } + + uint64_t LogNumber() const { return logNumber_; } + + WalFileType Type() const { return type_; } + + SequenceNumber StartSequence() const { return startSequence_; } + + uint64_t SizeFileBytes() const { return sizeFileBytes_; } + + bool operator < (const LogFile& that) const { + return LogNumber() < that.LogNumber(); + } + + private: + uint64_t logNumber_; + WalFileType type_; + SequenceNumber startSequence_; + uint64_t sizeFileBytes_; + +}; + +class TransactionLogIteratorImpl : public TransactionLogIterator { + public: + TransactionLogIteratorImpl( + const std::string& dir, const DBOptions* options, + const TransactionLogIterator::ReadOptions& read_options, + const EnvOptions& soptions, const SequenceNumber seqNum, + std::unique_ptr<VectorLogPtr> files, DBImpl const* const dbimpl); + + virtual bool Valid(); + + virtual void Next(); + + virtual Status status(); + + virtual BatchResult GetBatch(); + + private: + const std::string& dir_; + const DBOptions* options_; + const TransactionLogIterator::ReadOptions read_options_; + const EnvOptions& soptions_; + SequenceNumber startingSequenceNumber_; + std::unique_ptr<VectorLogPtr> files_; + bool started_; + bool isValid_; // not valid when it starts off. + Status currentStatus_; + size_t currentFileIndex_; + std::unique_ptr<WriteBatch> currentBatch_; + unique_ptr<log::Reader> currentLogReader_; + Status OpenLogFile(const LogFile* logFile, unique_ptr<SequentialFile>* file); + LogReporter reporter_; + SequenceNumber currentBatchSeq_; // sequence number at start of current batch + SequenceNumber currentLastSeq_; // last sequence in the current batch + DBImpl const * const dbimpl_; // The db on whose log files this iterates + + // Reads from transaction log only if the writebatch record has been written + bool RestrictedRead(Slice* record, std::string* scratch); + // Seeks to startingSequenceNumber reading from startFileIndex in files_. + // If strict is set, then it must get a batch starting with + // startingSequenceNumber_. + void SeekToStartSequence(uint64_t startFileIndex = 0, bool strict = false); + // Implementation of Next. SeekToStartSequence calls it internally with + // internal=true to let it find next entry even if it has to jump gaps because + // the iterator may start off from the first available entry but promises to + // be continuous after that + void NextImpl(bool internal = false); + // Check if batch is expected, else return false + bool IsBatchExpected(const WriteBatch* batch, SequenceNumber expectedSeq); + // Update the current batch if a continuous batch is found; otherwise reseek + void UpdateCurrentWriteBatch(const Slice& record); + Status OpenLogReader(const LogFile* file); +}; +} // namespace rocksdb +#endif // ROCKSDB_LITE diff --git a/db/version_edit.cc b/db/version_edit.cc new file mode 100644 index 0000000000..2ac35c58cc --- /dev/null +++ b/db/version_edit.cc @@ -0,0 +1,364 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/version_edit.h" + +#include "db/version_set.h" +#include "util/coding.h" +#include "rocksdb/slice.h" + +namespace rocksdb { + +// Tag numbers for serialized VersionEdit. These numbers are written to +// disk and should not be changed.
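+// +// Each encoded record is a varint32 tag followed by a tag-specific payload +// (a varint or a length-prefixed slice); for example, a kLogNumber record is +// the tag followed by the log number as a varint64. See EncodeTo() and +// DecodeFrom() below.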
+enum Tag { + kComparator = 1, + kLogNumber = 2, + kNextFileNumber = 3, + kLastSequence = 4, + kCompactPointer = 5, + kDeletedFile = 6, + kNewFile = 7, + // 8 was used for large value refs + kPrevLogNumber = 9, + + // these are new formats divergent from open source leveldb + kNewFile2 = 100, // store smallest & largest seqno + + kColumnFamily = 200, // specify column family for version edit + kColumnFamilyAdd = 201, + kColumnFamilyDrop = 202, + kMaxColumnFamily = 203, +}; + +void VersionEdit::Clear() { + comparator_.clear(); + max_level_ = 0; + log_number_ = 0; + prev_log_number_ = 0; + last_sequence_ = 0; + next_file_number_ = 0; + max_column_family_ = 0; + has_comparator_ = false; + has_log_number_ = false; + has_prev_log_number_ = false; + has_next_file_number_ = false; + has_last_sequence_ = false; + has_max_column_family_ = false; + deleted_files_.clear(); + new_files_.clear(); + column_family_ = 0; + is_column_family_add_ = 0; + is_column_family_drop_ = 0; + column_family_name_.clear(); +} + +void VersionEdit::EncodeTo(std::string* dst) const { + if (has_comparator_) { + PutVarint32(dst, kComparator); + PutLengthPrefixedSlice(dst, comparator_); + } + if (has_log_number_) { + PutVarint32(dst, kLogNumber); + PutVarint64(dst, log_number_); + } + if (has_prev_log_number_) { + PutVarint32(dst, kPrevLogNumber); + PutVarint64(dst, prev_log_number_); + } + if (has_next_file_number_) { + PutVarint32(dst, kNextFileNumber); + PutVarint64(dst, next_file_number_); + } + if (has_last_sequence_) { + PutVarint32(dst, kLastSequence); + PutVarint64(dst, last_sequence_); + } + if (has_max_column_family_) { + PutVarint32(dst, kMaxColumnFamily); + PutVarint32(dst, max_column_family_); + } + + for (const auto& deleted : deleted_files_) { + PutVarint32(dst, kDeletedFile); + PutVarint32(dst, deleted.first /* level */); + PutVarint64(dst, deleted.second /* file number */); + } + + for (size_t i = 0; i < new_files_.size(); i++) { + const FileMetaData& f = new_files_[i].second; + PutVarint32(dst, kNewFile2); + PutVarint32(dst, new_files_[i].first); // level + PutVarint64(dst, f.number); + PutVarint64(dst, f.file_size); + PutLengthPrefixedSlice(dst, f.smallest.Encode()); + PutLengthPrefixedSlice(dst, f.largest.Encode()); + PutVarint64(dst, f.smallest_seqno); + PutVarint64(dst, f.largest_seqno); + } + + // 0 is default and does not need to be explicitly written + if (column_family_ != 0) { + PutVarint32(dst, kColumnFamily); + PutVarint32(dst, column_family_); + } + + if (is_column_family_add_) { + PutVarint32(dst, kColumnFamilyAdd); + PutLengthPrefixedSlice(dst, Slice(column_family_name_)); + } + + if (is_column_family_drop_) { + PutVarint32(dst, kColumnFamilyDrop); + } +} + +static bool GetInternalKey(Slice* input, InternalKey* dst) { + Slice str; + if (GetLengthPrefixedSlice(input, &str)) { + dst->DecodeFrom(str); + return dst->Valid(); + } else { + return false; + } +} + +bool VersionEdit::GetLevel(Slice* input, int* level, const char** msg) { + uint32_t v; + if (GetVarint32(input, &v)) { + *level = v; + if (max_level_ < *level) { + max_level_ = *level; + } + return true; + } else { + return false; + } +} + +Status VersionEdit::DecodeFrom(const Slice& src) { + Clear(); + Slice input = src; + const char* msg = nullptr; + uint32_t tag; + + // Temporary storage for parsing + int level; + uint64_t number; + FileMetaData f; + Slice str; + InternalKey key; + + while (msg == nullptr && GetVarint32(&input, &tag)) { + switch (tag) { + case kComparator: + if (GetLengthPrefixedSlice(&input, &str)) { + comparator_ = 
str.ToString(); + has_comparator_ = true; + } else { + msg = "comparator name"; + } + break; + + case kLogNumber: + if (GetVarint64(&input, &log_number_)) { + has_log_number_ = true; + } else { + msg = "log number"; + } + break; + + case kPrevLogNumber: + if (GetVarint64(&input, &prev_log_number_)) { + has_prev_log_number_ = true; + } else { + msg = "previous log number"; + } + break; + + case kNextFileNumber: + if (GetVarint64(&input, &next_file_number_)) { + has_next_file_number_ = true; + } else { + msg = "next file number"; + } + break; + + case kLastSequence: + if (GetVarint64(&input, &last_sequence_)) { + has_last_sequence_ = true; + } else { + msg = "last sequence number"; + } + break; + + case kMaxColumnFamily: + if (GetVarint32(&input, &max_column_family_)) { + has_max_column_family_ = true; + } else { + msg = "max column family"; + } + break; + + case kCompactPointer: + if (GetLevel(&input, &level, &msg) && + GetInternalKey(&input, &key)) { + // we don't use compact pointers anymore, + // but we should not fail if they are still + // in manifest + } else { + if (!msg) { + msg = "compaction pointer"; + } + } + break; + + case kDeletedFile: + if (GetLevel(&input, &level, &msg) && + GetVarint64(&input, &number)) { + deleted_files_.insert(std::make_pair(level, number)); + } else { + if (!msg) { + msg = "deleted file"; + } + } + break; + + case kNewFile: + if (GetLevel(&input, &level, &msg) && + GetVarint64(&input, &f.number) && + GetVarint64(&input, &f.file_size) && + GetInternalKey(&input, &f.smallest) && + GetInternalKey(&input, &f.largest)) { + new_files_.push_back(std::make_pair(level, f)); + } else { + if (!msg) { + msg = "new-file entry"; + } + } + break; + + case kNewFile2: + if (GetLevel(&input, &level, &msg) && + GetVarint64(&input, &f.number) && + GetVarint64(&input, &f.file_size) && + GetInternalKey(&input, &f.smallest) && + GetInternalKey(&input, &f.largest) && + GetVarint64(&input, &f.smallest_seqno) && + GetVarint64(&input, &f.largest_seqno) ) { + new_files_.push_back(std::make_pair(level, f)); + } else { + if (!msg) { + msg = "new-file2 entry"; + } + } + break; + + case kColumnFamily: + if (!GetVarint32(&input, &column_family_)) { + if (!msg) { + msg = "set column family id"; + } + } + break; + + case kColumnFamilyAdd: + if (GetLengthPrefixedSlice(&input, &str)) { + is_column_family_add_ = true; + column_family_name_ = str.ToString(); + } else { + if (!msg) { + msg = "column family add"; + } + } + break; + + case kColumnFamilyDrop: + is_column_family_drop_ = true; + break; + + default: + msg = "unknown tag"; + break; + } + } + + if (msg == nullptr && !input.empty()) { + msg = "invalid tag"; + } + + Status result; + if (msg != nullptr) { + result = Status::Corruption("VersionEdit", msg); + } + return result; +} + +std::string VersionEdit::DebugString(bool hex_key) const { + std::string r; + r.append("VersionEdit {"); + if (has_comparator_) { + r.append("\n Comparator: "); + r.append(comparator_); + } + if (has_log_number_) { + r.append("\n LogNumber: "); + AppendNumberTo(&r, log_number_); + } + if (has_prev_log_number_) { + r.append("\n PrevLogNumber: "); + AppendNumberTo(&r, prev_log_number_); + } + if (has_next_file_number_) { + r.append("\n NextFile: "); + AppendNumberTo(&r, next_file_number_); + } + if (has_last_sequence_) { + r.append("\n LastSeq: "); + AppendNumberTo(&r, last_sequence_); + } + for (DeletedFileSet::const_iterator iter = deleted_files_.begin(); + iter != deleted_files_.end(); + ++iter) { + r.append("\n DeleteFile: "); + AppendNumberTo(&r, 
iter->first); + r.append(" "); + AppendNumberTo(&r, iter->second); + } + for (size_t i = 0; i < new_files_.size(); i++) { + const FileMetaData& f = new_files_[i].second; + r.append("\n AddFile: "); + AppendNumberTo(&r, new_files_[i].first); + r.append(" "); + AppendNumberTo(&r, f.number); + r.append(" "); + AppendNumberTo(&r, f.file_size); + r.append(" "); + r.append(f.smallest.DebugString(hex_key)); + r.append(" .. "); + r.append(f.largest.DebugString(hex_key)); + } + r.append("\n ColumnFamily: "); + AppendNumberTo(&r, column_family_); + if (is_column_family_add_) { + r.append("\n ColumnFamilyAdd: "); + r.append(column_family_name_); + } + if (is_column_family_drop_) { + r.append("\n ColumnFamilyDrop"); + } + if (has_max_column_family_) { + r.append("\n MaxColumnFamily: "); + AppendNumberTo(&r, max_column_family_); + } + r.append("\n}\n"); + return r; +} + +} // namespace rocksdb diff --git a/db/version_edit.h b/db/version_edit.h new file mode 100644 index 0000000000..acaec8a4f1 --- /dev/null +++ b/db/version_edit.h @@ -0,0 +1,176 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include <algorithm> +#include <set> +#include <utility> +#include <vector> +#include "rocksdb/cache.h" +#include "db/dbformat.h" + +namespace rocksdb { + +class VersionSet; + +struct FileMetaData { + int refs; + int allowed_seeks; // Seeks allowed until compaction + uint64_t number; + uint64_t file_size; // File size in bytes + InternalKey smallest; // Smallest internal key served by table + InternalKey largest; // Largest internal key served by table + bool being_compacted; // Is this file undergoing compaction? + SequenceNumber smallest_seqno; // The smallest seqno in this file + SequenceNumber largest_seqno; // The largest seqno in this file + + // Needs to be disposed when refs becomes 0. + Cache::Handle* table_reader_handle; + // Table reader in table_reader_handle + TableReader* table_reader; + + FileMetaData(uint64_t number, uint64_t file_size) + : refs(0), + allowed_seeks(1 << 30), + number(number), + file_size(file_size), + being_compacted(false), + table_reader_handle(nullptr), + table_reader(nullptr) {} + FileMetaData() : FileMetaData(0, 0) {} +}; + +class VersionEdit { + public: + VersionEdit() { Clear(); } + ~VersionEdit() { } + + void Clear(); + + void SetComparatorName(const Slice& name) { + has_comparator_ = true; + comparator_ = name.ToString(); + } + void SetLogNumber(uint64_t num) { + has_log_number_ = true; + log_number_ = num; + } + void SetPrevLogNumber(uint64_t num) { + has_prev_log_number_ = true; + prev_log_number_ = num; + } + void SetNextFile(uint64_t num) { + has_next_file_number_ = true; + next_file_number_ = num; + } + void SetLastSequence(SequenceNumber seq) { + has_last_sequence_ = true; + last_sequence_ = seq; + } + void SetMaxColumnFamily(uint32_t max_column_family) { + has_max_column_family_ = true; + max_column_family_ = max_column_family; + } + + // Add the specified file at the specified number.
+  // REQUIRES: This version has not been saved (see VersionSet::SaveTo)
+  // REQUIRES: "smallest" and "largest" are smallest and largest keys in file
+  void AddFile(int level, uint64_t file,
+               uint64_t file_size,
+               const InternalKey& smallest,
+               const InternalKey& largest,
+               const SequenceNumber& smallest_seqno,
+               const SequenceNumber& largest_seqno) {
+    assert(smallest_seqno <= largest_seqno);
+    FileMetaData f;
+    f.number = file;
+    f.file_size = file_size;
+    f.smallest = smallest;
+    f.largest = largest;
+    f.smallest_seqno = smallest_seqno;
+    f.largest_seqno = largest_seqno;
+    new_files_.push_back(std::make_pair(level, f));
+  }
+
+  // Delete the specified "file" from the specified "level".
+  void DeleteFile(int level, uint64_t file) {
+    deleted_files_.insert({level, file});
+  }
+
+  // Number of edits
+  int NumEntries() {
+    return new_files_.size() + deleted_files_.size();
+  }
+
+  bool IsColumnFamilyManipulation() {
+    return is_column_family_add_ || is_column_family_drop_;
+  }
+
+  void SetColumnFamily(uint32_t column_family_id) {
+    column_family_ = column_family_id;
+  }
+
+  // set column family ID by calling SetColumnFamily()
+  void AddColumnFamily(const std::string& name) {
+    assert(!is_column_family_drop_);
+    assert(!is_column_family_add_);
+    assert(NumEntries() == 0);
+    is_column_family_add_ = true;
+    column_family_name_ = name;
+  }
+
+  // set column family ID by calling SetColumnFamily()
+  void DropColumnFamily() {
+    assert(!is_column_family_drop_);
+    assert(!is_column_family_add_);
+    assert(NumEntries() == 0);
+    is_column_family_drop_ = true;
+  }
+
+  void EncodeTo(std::string* dst) const;
+  Status DecodeFrom(const Slice& src);
+
+  std::string DebugString(bool hex_key = false) const;
+
+ private:
+  friend class VersionSet;
+
+  typedef std::set<std::pair<int, uint64_t>> DeletedFileSet;
+
+  bool GetLevel(Slice* input, int* level, const char** msg);
+
+  int max_level_;
+  std::string comparator_;
+  uint64_t log_number_;
+  uint64_t prev_log_number_;
+  uint64_t next_file_number_;
+  uint32_t max_column_family_;
+  SequenceNumber last_sequence_;
+  bool has_comparator_;
+  bool has_log_number_;
+  bool has_prev_log_number_;
+  bool has_next_file_number_;
+  bool has_last_sequence_;
+  bool has_max_column_family_;
+
+  DeletedFileSet deleted_files_;
+  std::vector<std::pair<int, FileMetaData>> new_files_;
+
+  // Each version edit record should have column_family_id set
+  // If it's not set, it is default (0)
+  uint32_t column_family_;
+  // a version edit can be either column_family add or
+  // column_family drop. If it's column family add,
+  // it also includes column family name.
+  bool is_column_family_drop_;
+  bool is_column_family_add_;
+  std::string column_family_name_;
+};
+
+}  // namespace rocksdb
diff --git a/db/version_edit_test.cc b/db/version_edit_test.cc
new file mode 100644
index 0000000000..7842b32634
--- /dev/null
+++ b/db/version_edit_test.cc
@@ -0,0 +1,65 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+ +#include "db/version_edit.h" +#include "util/testharness.h" + +namespace rocksdb { + +static void TestEncodeDecode(const VersionEdit& edit) { + std::string encoded, encoded2; + edit.EncodeTo(&encoded); + VersionEdit parsed; + Status s = parsed.DecodeFrom(encoded); + ASSERT_TRUE(s.ok()) << s.ToString(); + parsed.EncodeTo(&encoded2); + ASSERT_EQ(encoded, encoded2); +} + +class VersionEditTest { }; + +TEST(VersionEditTest, EncodeDecode) { + static const uint64_t kBig = 1ull << 50; + + VersionEdit edit; + for (int i = 0; i < 4; i++) { + TestEncodeDecode(edit); + edit.AddFile(3, kBig + 300 + i, kBig + 400 + i, + InternalKey("foo", kBig + 500 + i, kTypeValue), + InternalKey("zoo", kBig + 600 + i, kTypeDeletion), + kBig + 500 + i, + kBig + 600 + i); + edit.DeleteFile(4, kBig + 700 + i); + } + + edit.SetComparatorName("foo"); + edit.SetLogNumber(kBig + 100); + edit.SetNextFile(kBig + 200); + edit.SetLastSequence(kBig + 1000); + TestEncodeDecode(edit); +} + +TEST(VersionEditTest, ColumnFamilyTest) { + VersionEdit edit; + edit.SetColumnFamily(2); + edit.AddColumnFamily("column_family"); + edit.SetMaxColumnFamily(5); + TestEncodeDecode(edit); + + edit.Clear(); + edit.SetColumnFamily(3); + edit.DropColumnFamily(); + TestEncodeDecode(edit); +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +} diff --git a/db/version_set.cc b/db/version_set.cc new file mode 100644 index 0000000000..c6a9e6ab12 --- /dev/null +++ b/db/version_set.cc @@ -0,0 +1,2822 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+
+#include "db/version_set.h"
+
+#define __STDC_FORMAT_MACROS
+#include <inttypes.h>
+#include <algorithm>
+#include <climits>
+#include <map>
+#include <set>
+#include <unordered_map>
+#include <vector>
+
+#include "db/filename.h"
+#include "db/log_reader.h"
+#include "db/log_writer.h"
+#include "db/memtable.h"
+#include "db/merge_context.h"
+#include "db/table_cache.h"
+#include "db/compaction.h"
+#include "rocksdb/env.h"
+#include "rocksdb/merge_operator.h"
+#include "table/table_reader.h"
+#include "table/merger.h"
+#include "table/two_level_iterator.h"
+#include "table/format.h"
+#include "table/plain_table_factory.h"
+#include "table/meta_blocks.h"
+#include "util/coding.h"
+#include "util/logging.h"
+#include "util/stop_watch.h"
+
+namespace rocksdb {
+
+static uint64_t TotalFileSize(const std::vector<FileMetaData*>& files) {
+  uint64_t sum = 0;
+  for (size_t i = 0; i < files.size() && files[i]; i++) {
+    sum += files[i]->file_size;
+  }
+  return sum;
+}
+
+Version::~Version() {
+  assert(refs_ == 0);
+
+  // Remove from linked list
+  prev_->next_ = next_;
+  next_->prev_ = prev_;
+
+  // Drop references to files
+  for (int level = 0; level < num_levels_; level++) {
+    for (size_t i = 0; i < files_[level].size(); i++) {
+      FileMetaData* f = files_[level][i];
+      assert(f->refs > 0);
+      f->refs--;
+      if (f->refs <= 0) {
+        if (f->table_reader_handle) {
+          cfd_->table_cache()->ReleaseHandle(f->table_reader_handle);
+          f->table_reader_handle = nullptr;
+        }
+        vset_->obsolete_files_.push_back(f);
+      }
+    }
+  }
+  delete[] files_;
+}
+
+int FindFileInRange(const InternalKeyComparator& icmp,
+                    const std::vector<FileMetaData*>& files,
+                    const Slice& key,
+                    uint32_t left,
+                    uint32_t right) {
+  while (left < right) {
+    uint32_t mid = (left + right) / 2;
+    const FileMetaData* f = files[mid];
+    if (icmp.InternalKeyComparator::Compare(f->largest.Encode(), key) < 0) {
+      // Key at "mid.largest" is < "target". Therefore all
+      // files at or before "mid" are uninteresting.
+      left = mid + 1;
+    } else {
+      // Key at "mid.largest" is >= "target". Therefore all files
+      // after "mid" are uninteresting.
+      right = mid;
+    }
+  }
+  return right;
+}
+
+int FindFile(const InternalKeyComparator& icmp,
+             const std::vector<FileMetaData*>& files,
+             const Slice& key) {
+  return FindFileInRange(icmp, files, key, 0, files.size());
+}
+
+static bool AfterFile(const Comparator* ucmp,
+                      const Slice* user_key, const FileMetaData* f) {
+  // nullptr user_key occurs before all keys and is therefore never after *f
+  return (user_key != nullptr &&
+          ucmp->Compare(*user_key, f->largest.user_key()) > 0);
+}
+
+static bool BeforeFile(const Comparator* ucmp,
+                       const Slice* user_key, const FileMetaData* f) {
+  // nullptr user_key occurs after all keys and is therefore never before *f
+  return (user_key != nullptr &&
+          ucmp->Compare(*user_key, f->smallest.user_key()) < 0);
+}
+
+bool SomeFileOverlapsRange(
+    const InternalKeyComparator& icmp,
+    bool disjoint_sorted_files,
+    const std::vector<FileMetaData*>& files,
+    const Slice* smallest_user_key,
+    const Slice* largest_user_key) {
+  const Comparator* ucmp = icmp.user_comparator();
+  if (!disjoint_sorted_files) {
+    // Need to check against all files
+    for (size_t i = 0; i < files.size(); i++) {
+      const FileMetaData* f = files[i];
+      if (AfterFile(ucmp, smallest_user_key, f) ||
+          BeforeFile(ucmp, largest_user_key, f)) {
+        // No overlap
+      } else {
+        return true;  // Overlap
+      }
+    }
+    return false;
+  }
+
+  // Binary search over file list
+  uint32_t index = 0;
+  if (smallest_user_key != nullptr) {
+    // Find the earliest possible internal key for smallest_user_key
+    InternalKey small(*smallest_user_key, kMaxSequenceNumber,
+                      kValueTypeForSeek);
+    index = FindFile(icmp, files, small.Encode());
+  }
+
+  if (index >= files.size()) {
+    // beginning of range is after all files, so no overlap.
+    return false;
+  }
+
+  return !BeforeFile(ucmp, largest_user_key, files[index]);
+}
+
+namespace {
+// Used for LevelFileNumIterator to pass "block handle" value,
+// which actually means file information in this iterator.
+// It contains a subset of the fields of FileMetaData that is sufficient
+// for the table cache to use.
+struct EncodedFileMetaData {
+  uint64_t number;            // file number
+  uint64_t file_size;         // file size
+  TableReader* table_reader;  // cached table reader
+};
+}  // namespace
+
+// An internal iterator. For a given version/level pair, yields
+// information about the files in the level. For a given entry, key()
+// is the largest key that occurs in the file, and value() is a
+// 16-byte value containing the file number and file size, both
+// encoded using EncodeFixed64.
+class Version::LevelFileNumIterator : public Iterator {
+ public:
+  LevelFileNumIterator(const InternalKeyComparator& icmp,
+                       const std::vector<FileMetaData*>* flist)
+      : icmp_(icmp),
+        flist_(flist),
+        index_(flist->size()) {  // Marks as invalid
+  }
+  virtual bool Valid() const {
+    return index_ < flist_->size();
+  }
+  virtual void Seek(const Slice& target) {
+    index_ = FindFile(icmp_, *flist_, target);
+  }
+  virtual void SeekToFirst() { index_ = 0; }
+  virtual void SeekToLast() {
+    index_ = flist_->empty() ? 0 : flist_->size() - 1;
+  }
+  virtual void Next() {
+    assert(Valid());
+    index_++;
+  }
+  virtual void Prev() {
+    assert(Valid());
+    if (index_ == 0) {
+      index_ = flist_->size();  // Marks as invalid
+    } else {
+      index_--;
+    }
+  }
+  Slice key() const {
+    assert(Valid());
+    return (*flist_)[index_]->largest.Encode();
+  }
+  Slice value() const {
+    assert(Valid());
+    auto* file_meta = (*flist_)[index_];
+    current_value_.number = file_meta->number;
+    current_value_.file_size = file_meta->file_size;
+    current_value_.table_reader = file_meta->table_reader;
+    return Slice(reinterpret_cast<const char*>(&current_value_),
+                 sizeof(EncodedFileMetaData));
+  }
+  virtual Status status() const { return Status::OK(); }
+ private:
+  const InternalKeyComparator icmp_;
+  const std::vector<FileMetaData*>* const flist_;
+  uint32_t index_;
+  mutable EncodedFileMetaData current_value_;
+};
+
+class Version::LevelFileIteratorState : public TwoLevelIteratorState {
+ public:
+  LevelFileIteratorState(TableCache* table_cache,
+      const ReadOptions& read_options, const EnvOptions& env_options,
+      const InternalKeyComparator& icomparator, bool for_compaction,
+      bool prefix_enabled)
+      : TwoLevelIteratorState(prefix_enabled),
+        table_cache_(table_cache), read_options_(read_options),
+        env_options_(env_options), icomparator_(icomparator),
+        for_compaction_(for_compaction) {}
+
+  Iterator* NewSecondaryIterator(const Slice& meta_handle) override {
+    if (meta_handle.size() != sizeof(EncodedFileMetaData)) {
+      return NewErrorIterator(
+          Status::Corruption("FileReader invoked with unexpected value"));
+    } else {
+      const EncodedFileMetaData* encoded_meta =
+          reinterpret_cast<const EncodedFileMetaData*>(meta_handle.data());
+      FileMetaData meta(encoded_meta->number, encoded_meta->file_size);
+      meta.table_reader = encoded_meta->table_reader;
+      return table_cache_->NewIterator(read_options_, env_options_,
+          icomparator_, meta, nullptr /* don't need reference to table */,
+          for_compaction_);
+    }
+  }
+
+  bool PrefixMayMatch(const Slice& internal_key) override {
+    return true;
+  }
+
+ private:
+  TableCache* table_cache_;
+  const ReadOptions read_options_;
+  const EnvOptions& env_options_;
+  const InternalKeyComparator& icomparator_;
+  bool for_compaction_;
+};
+
+Status Version::GetPropertiesOfAllTables(TablePropertiesCollection* props) {
+  auto table_cache = cfd_->table_cache();
+  auto options = cfd_->options();
+  for (int level = 0; level < num_levels_; level++) {
+    for (const auto& file_meta : files_[level]) {
+      auto fname = TableFileName(vset_->dbname_, file_meta->number);
+      // 1. If the table is already present in table cache, load table
+      // properties from there.
+      std::shared_ptr<TableProperties> table_properties;
+      Status s = table_cache->GetTableProperties(
+          vset_->storage_options_, cfd_->internal_comparator(), *file_meta,
+          &table_properties, true /* no io */);
+      if (s.ok()) {
+        props->insert({fname, table_properties});
+        continue;
+      }
+
+      // We only ignore the error type `Incomplete` since it's by design
+      // that we disallow reading the table when it's not in the table cache.
+      if (!s.IsIncomplete()) {
+        return s;
+      }
+
+      // 2. Table is not present in table cache, we'll read the table
+      // properties directly from the properties block in the file.
+      std::unique_ptr<RandomAccessFile> file;
+      s = options->env->NewRandomAccessFile(fname, &file,
+                                            vset_->storage_options_);
+      if (!s.ok()) {
+        return s;
+      }
+
+      TableProperties* raw_table_properties;
+      // By setting the magic number to kInvalidTableMagicNumber, we can
+      // bypass the magic number check in the footer.
+ s = ReadTableProperties( + file.get(), file_meta->file_size, + Footer::kInvalidTableMagicNumber /* table's magic number */, + vset_->env_, options->info_log.get(), &raw_table_properties); + if (!s.ok()) { + return s; + } + RecordTick(options->statistics.get(), + NUMBER_DIRECT_LOAD_TABLE_PROPERTIES); + + props->insert({fname, std::shared_ptr( + raw_table_properties)}); + } + } + + return Status::OK(); +} + +void Version::AddIterators(const ReadOptions& read_options, + const EnvOptions& soptions, + std::vector* iters) { + // Merge all level zero files together since they may overlap + for (const FileMetaData* file : files_[0]) { + iters->push_back(cfd_->table_cache()->NewIterator( + read_options, soptions, cfd_->internal_comparator(), *file)); + } + + // For levels > 0, we can use a concatenating iterator that sequentially + // walks through the non-overlapping files in the level, opening them + // lazily. + for (int level = 1; level < num_levels_; level++) { + if (!files_[level].empty()) { + iters->push_back(NewTwoLevelIterator(new LevelFileIteratorState( + cfd_->table_cache(), read_options, soptions, + cfd_->internal_comparator(), false /* for_compaction */, + cfd_->options()->prefix_extractor != nullptr), + new LevelFileNumIterator(cfd_->internal_comparator(), &files_[level]))); + } + } +} + +void Version::AddIterators(const ReadOptions& read_options, + const EnvOptions& soptions, + MergeIteratorBuilder* merge_iter_builder) { + // Merge all level zero files together since they may overlap + for (const FileMetaData* file : files_[0]) { + merge_iter_builder->AddIterator(cfd_->table_cache()->NewIterator( + read_options, soptions, cfd_->internal_comparator(), *file, nullptr, + false, merge_iter_builder->GetArena())); + } + + // For levels > 0, we can use a concatenating iterator that sequentially + // walks through the non-overlapping files in the level, opening them + // lazily. + for (int level = 1; level < num_levels_; level++) { + if (!files_[level].empty()) { + merge_iter_builder->AddIterator(NewTwoLevelIterator( + new LevelFileIteratorState( + cfd_->table_cache(), read_options, soptions, + cfd_->internal_comparator(), false /* for_compaction */, + cfd_->options()->prefix_extractor != nullptr), + new LevelFileNumIterator(cfd_->internal_comparator(), &files_[level]), + merge_iter_builder->GetArena())); + } + } +} + +// Callback from TableCache::Get() +namespace { +enum SaverState { + kNotFound, + kFound, + kDeleted, + kCorrupt, + kMerge // saver contains the current merge result (the operands) +}; +struct Saver { + SaverState state; + const Comparator* ucmp; + Slice user_key; + bool* value_found; // Is value set correctly? Used by KeyMayExist + std::string* value; + const MergeOperator* merge_operator; + // the merge operations encountered; + MergeContext* merge_context; + Logger* logger; + bool didIO; // did we do any disk io? + Statistics* statistics; +}; +} + +// Called from TableCache::Get and Table::Get when file/block in which +// key may exist are not there in TableCache/BlockCache respectively. 
+// In this case, we can't guarantee that the key does not exist, and we are
+// not permitted to do IO to be certain. Set state=kFound and
+// value_found=false to let the caller know that the key may exist but is
+// not there in memory.
+static void MarkKeyMayExist(void* arg) {
+  Saver* s = reinterpret_cast<Saver*>(arg);
+  s->state = kFound;
+  if (s->value_found != nullptr) {
+    *(s->value_found) = false;
+  }
+}
+
+static bool SaveValue(void* arg, const ParsedInternalKey& parsed_key,
+                      const Slice& v, bool didIO) {
+  Saver* s = reinterpret_cast<Saver*>(arg);
+  MergeContext* merge_context = s->merge_context;
+  std::string merge_result;  // temporary area for merge results later
+
+  assert(s != nullptr && merge_context != nullptr);
+
+  // TODO: didIO and Merge?
+  s->didIO = didIO;
+  if (s->ucmp->Compare(parsed_key.user_key, s->user_key) == 0) {
+    // Key matches. Process it
+    switch (parsed_key.type) {
+      case kTypeValue:
+        if (kNotFound == s->state) {
+          s->state = kFound;
+          s->value->assign(v.data(), v.size());
+        } else if (kMerge == s->state) {
+          assert(s->merge_operator != nullptr);
+          s->state = kFound;
+          if (!s->merge_operator->FullMerge(s->user_key, &v,
+                                            merge_context->GetOperands(),
+                                            s->value, s->logger)) {
+            RecordTick(s->statistics, NUMBER_MERGE_FAILURES);
+            s->state = kCorrupt;
+          }
+        } else {
+          assert(false);
+        }
+        return false;
+
+      case kTypeDeletion:
+        if (kNotFound == s->state) {
+          s->state = kDeleted;
+        } else if (kMerge == s->state) {
+          s->state = kFound;
+          if (!s->merge_operator->FullMerge(s->user_key, nullptr,
+                                            merge_context->GetOperands(),
+                                            s->value, s->logger)) {
+            RecordTick(s->statistics, NUMBER_MERGE_FAILURES);
+            s->state = kCorrupt;
+          }
+        } else {
+          assert(false);
+        }
+        return false;
+
+      case kTypeMerge:
+        assert(s->state == kNotFound || s->state == kMerge);
+        s->state = kMerge;
+        merge_context->PushOperand(v);
+        return true;
+
+      default:
+        assert(false);
+        break;
+    }
+  }
+
+  // s->state could be kCorrupt, kMerge or kNotFound
+
+  return false;
+}
+
+namespace {
+bool NewestFirst(FileMetaData* a, FileMetaData* b) {
+  return a->number > b->number;
+}
+bool NewestFirstBySeqNo(FileMetaData* a, FileMetaData* b) {
+  if (a->smallest_seqno != b->smallest_seqno) {
+    return a->smallest_seqno > b->smallest_seqno;
+  }
+  if (a->largest_seqno != b->largest_seqno) {
+    return a->largest_seqno > b->largest_seqno;
+  }
+  // Break ties by file number
+  return NewestFirst(a, b);
+}
+bool BySmallestKey(FileMetaData* a, FileMetaData* b,
+                   const InternalKeyComparator* cmp) {
+  int r = cmp->Compare(a->smallest, b->smallest);
+  if (r != 0) {
+    return (r < 0);
+  }
+  // Break ties by file number
+  return (a->number < b->number);
+}
+}  // anonymous namespace
+
+Version::Version(ColumnFamilyData* cfd, VersionSet* vset,
+                 uint64_t version_number)
+    : cfd_(cfd),
+      internal_comparator_((cfd == nullptr) ? nullptr
+                                            : &cfd->internal_comparator()),
+      user_comparator_((cfd == nullptr)
+                           ? nullptr
+                           : internal_comparator_->user_comparator()),
+      table_cache_((cfd == nullptr) ? nullptr : cfd->table_cache()),
+      merge_operator_((cfd == nullptr) ? nullptr
+                                       : cfd->options()->merge_operator.get()),
+      info_log_((cfd == nullptr) ? nullptr : cfd->options()->info_log.get()),
+      db_statistics_((cfd == nullptr) ? nullptr
+                                      : cfd->options()->statistics.get()),
+      vset_(vset),
+      next_(this),
+      prev_(this),
+      refs_(0),
+      // cfd is nullptr if Version is dummy
+      num_levels_(cfd == nullptr ?
0 : cfd->NumberLevels()), + files_(new std::vector[num_levels_]), + files_by_size_(num_levels_), + next_file_to_compact_by_size_(num_levels_), + file_to_compact_(nullptr), + file_to_compact_level_(-1), + compaction_score_(num_levels_), + compaction_level_(num_levels_), + version_number_(version_number), + file_indexer_(num_levels_, cfd == nullptr ? nullptr + : cfd->internal_comparator().user_comparator()) { +} + +void Version::Get(const ReadOptions& options, + const LookupKey& k, + std::string* value, + Status* status, + MergeContext* merge_context, + GetStats* stats, + bool* value_found) { + Slice ikey = k.internal_key(); + Slice user_key = k.user_key(); + + assert(status->ok() || status->IsMergeInProgress()); + Saver saver; + saver.state = status->ok()? kNotFound : kMerge; + saver.ucmp = user_comparator_; + saver.user_key = user_key; + saver.value_found = value_found; + saver.value = value; + saver.merge_operator = merge_operator_; + saver.merge_context = merge_context; + saver.logger = info_log_; + saver.didIO = false; + saver.statistics = db_statistics_; + + stats->seek_file = nullptr; + stats->seek_file_level = -1; + FileMetaData* last_file_read = nullptr; + int last_file_read_level = -1; + + // We can search level-by-level since entries never hop across + // levels. Therefore we are guaranteed that if we find data + // in an smaller level, later levels are irrelevant (unless we + // are MergeInProgress). + + int32_t search_left_bound = 0; + int32_t search_right_bound = FileIndexer::kLevelMaxIndex; + for (int level = 0; level < num_levels_; ++level) { + int num_files = files_[level].size(); + if (num_files == 0) { + // When current level is empty, the search bound generated from upper + // level must be [0, -1] or [0, FileIndexer::kLevelMaxIndex] if it is + // also empty. + assert(search_left_bound == 0); + assert(search_right_bound == -1 || + search_right_bound == FileIndexer::kLevelMaxIndex); + // Since current level is empty, it will need to search all files in the + // next level + search_left_bound = 0; + search_right_bound = FileIndexer::kLevelMaxIndex; + continue; + } + + // Get the list of files to search in this level + FileMetaData* const* files = &files_[level][0]; + + // Some files may overlap each other. We find + // all files that overlap user_key and process them in order from + // newest to oldest. In the context of merge-operator, + // this can occur at any level. Otherwise, it only occurs + // at Level-0 (since Put/Deletes are always compacted into a single entry). + int32_t start_index; + if (level == 0) { + // On Level-0, we read through all files to check for overlap. + start_index = 0; + } else { + // On Level-n (n>=1), files are sorted. Binary search to find the earliest + // file whose largest key >= ikey. Search left bound and right bound are + // used to narrow the range. + if (search_left_bound == search_right_bound) { + start_index = search_left_bound; + } else if (search_left_bound < search_right_bound) { + if (search_right_bound == FileIndexer::kLevelMaxIndex) { + search_right_bound = num_files - 1; + } + start_index = FindFileInRange(cfd_->internal_comparator(), + files_[level], ikey, search_left_bound, search_right_bound); + } else { + // search_left_bound > search_right_bound, key does not exist in this + // level. Since no comparision is done in this level, it will need to + // search all files in the next level. 
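+        // As a hypothetical example: if the level above compared the key
+        // against its files and the indexer derived an empty range such as
+        // [3, 2] for this level, the key is known to be absent here without
+        // any comparison being made, and with no comparison there is
+        // nothing to narrow the level below with, so the bounds fall back
+        // to the full range: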
+ search_left_bound = 0; + search_right_bound = FileIndexer::kLevelMaxIndex; + continue; + } + } + // Traverse each relevant file to find the desired key +#ifndef NDEBUG + FileMetaData* prev_file = nullptr; +#endif + + for (int32_t i = start_index; i < num_files;) { + FileMetaData* f = files[i]; + // Check if key is within a file's range. If search left bound and right + // bound point to the same find, we are sure key falls in range. + assert(level == 0 || i == start_index || + user_comparator_->Compare(user_key, f->smallest.user_key()) <= 0); + + int cmp_smallest = user_comparator_->Compare(user_key, f->smallest.user_key()); + int cmp_largest = -1; + if (cmp_smallest >= 0) { + cmp_largest = user_comparator_->Compare(user_key, f->largest.user_key()); + } + + // Setup file search bound for the next level based on the comparison + // results + if (level > 0) { + file_indexer_.GetNextLevelIndex(level, i, cmp_smallest, cmp_largest, + &search_left_bound, &search_right_bound); + } + // Key falls out of current file's range + if (cmp_smallest < 0 || cmp_largest > 0) { + if (level == 0) { + ++i; + continue; + } else { + break; + } + } + +#ifndef NDEBUG + // Sanity check to make sure that the files are correctly sorted + if (prev_file) { + if (level != 0) { + int comp_sign = + internal_comparator_->Compare(prev_file->largest, f->smallest); + assert(comp_sign < 0); + } else { + // level == 0, the current file cannot be newer than the previous one. + if (cfd_->options()->compaction_style == kCompactionStyleUniversal) { + assert(!NewestFirstBySeqNo(f, prev_file)); + } else { + assert(!NewestFirst(f, prev_file)); + } + } + } + prev_file = f; +#endif + bool tableIO = false; + *status = table_cache_->Get(options, *internal_comparator_, *f, ikey, + &saver, SaveValue, &tableIO, MarkKeyMayExist); + // TODO: examine the behavior for corrupted key + if (!status->ok()) { + return; + } + + if (last_file_read != nullptr && stats->seek_file == nullptr) { + // We have had more than one seek for this read. Charge the 1st file. + stats->seek_file = last_file_read; + stats->seek_file_level = last_file_read_level; + } + + // If we did any IO as part of the read, then we remember it because + // it is a possible candidate for seek-based compaction. saver.didIO + // is true if the block had to be read in from storage and was not + // pre-exisiting in the block cache. Also, if this file was not pre- + // existing in the table cache and had to be freshly opened that needed + // the index blocks to be read-in, then tableIO is true. One thing + // to note is that the index blocks are not part of the block cache. 
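+      // Concretely: if reading this file's data block needed disk IO
+      // (saver.didIO), or its table had to be freshly opened (tableIO), the
+      // file is remembered in last_file_read below; should a second file
+      // also have to be consulted for the same key, the first file is
+      // recorded in stats, and UpdateStats() later decrements its
+      // allowed_seeks until the file becomes a seek-compaction candidate.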
+ if (saver.didIO || tableIO) { + last_file_read = f; + last_file_read_level = level; + } + + switch (saver.state) { + case kNotFound: + break; // Keep searching in other files + case kFound: + return; + case kDeleted: + *status = Status::NotFound(); // Use empty error message for speed + return; + case kCorrupt: + *status = Status::Corruption("corrupted key for ", user_key); + return; + case kMerge: + break; + } + if (level > 0 && cmp_largest < 0) { + break; + } else { + ++i; + } + } + } + + + if (kMerge == saver.state) { + // merge_operands are in saver and we hit the beginning of the key history + // do a final merge of nullptr and operands; + if (merge_operator_->FullMerge(user_key, nullptr, + saver.merge_context->GetOperands(), value, + info_log_)) { + *status = Status::OK(); + } else { + RecordTick(db_statistics_, NUMBER_MERGE_FAILURES); + *status = Status::Corruption("could not perform end-of-key merge for ", + user_key); + } + } else { + *status = Status::NotFound(); // Use an empty error message for speed + } +} + +bool Version::UpdateStats(const GetStats& stats) { + FileMetaData* f = stats.seek_file; + if (f != nullptr) { + f->allowed_seeks--; + if (f->allowed_seeks <= 0 && file_to_compact_ == nullptr) { + file_to_compact_ = f; + file_to_compact_level_ = stats.seek_file_level; + return true; + } + } + return false; +} + +void Version::ComputeCompactionScore( + std::vector& size_being_compacted) { + double max_score = 0; + int max_score_level = 0; + + int num_levels_to_check = + (cfd_->options()->compaction_style != kCompactionStyleUniversal && + cfd_->options()->compaction_style != kCompactionStyleFIFO) + ? NumberLevels() - 1 + : 1; + + for (int level = 0; level < num_levels_to_check; level++) { + double score; + if (level == 0) { + // We treat level-0 specially by bounding the number of files + // instead of number of bytes for two reasons: + // + // (1) With larger write-buffer sizes, it is nice not to do too + // many level-0 compactions. + // + // (2) The files in level-0 are merged on every read and + // therefore we wish to avoid too many files when the individual + // file size is small (perhaps because of a small write-buffer + // setting, or very high compression ratios, or lots of + // overwrites/deletions). + int numfiles = 0; + uint64_t total_size = 0; + for (unsigned int i = 0; i < files_[level].size(); i++) { + if (!files_[level][i]->being_compacted) { + total_size += files_[level][i]->file_size; + numfiles++; + } + } + if (cfd_->options()->compaction_style == kCompactionStyleFIFO) { + score = static_cast(total_size) / + cfd_->options()->compaction_options_fifo.max_table_files_size; + } else if (numfiles >= cfd_->options()->level0_stop_writes_trigger) { + // If we are slowing down writes, then we better compact that first + score = 1000000; + } else if (numfiles >= cfd_->options()->level0_slowdown_writes_trigger) { + score = 10000; + } else { + score = static_cast(numfiles) / + cfd_->options()->level0_file_num_compaction_trigger; + } + } else { + // Compute the ratio of current size to size limit. 
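+      // e.g. with MaxBytesForLevel(2) == 100MB, 300MB of files at level 2
+      // and 50MB of that already being compacted, the score comes out to
+      //   (300MB - 50MB) / 100MB = 2.5
+      // and the levels are later sorted so the highest score compacts first.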
+ const uint64_t level_bytes = + TotalFileSize(files_[level]) - size_being_compacted[level]; + score = static_cast(level_bytes) / + cfd_->compaction_picker()->MaxBytesForLevel(level); + if (max_score < score) { + max_score = score; + max_score_level = level; + } + } + compaction_level_[level] = level; + compaction_score_[level] = score; + } + + // update the max compaction score in levels 1 to n-1 + max_compaction_score_ = max_score; + max_compaction_score_level_ = max_score_level; + + // sort all the levels based on their score. Higher scores get listed + // first. Use bubble sort because the number of entries are small. + for (int i = 0; i < NumberLevels() - 2; i++) { + for (int j = i + 1; j < NumberLevels() - 1; j++) { + if (compaction_score_[i] < compaction_score_[j]) { + double score = compaction_score_[i]; + int level = compaction_level_[i]; + compaction_score_[i] = compaction_score_[j]; + compaction_level_[i] = compaction_level_[j]; + compaction_score_[j] = score; + compaction_level_[j] = level; + } + } + } +} + +namespace { + +// Compator that is used to sort files based on their size +// In normal mode: descending size +bool CompareSizeDescending(const Version::Fsize& first, + const Version::Fsize& second) { + return (first.file->file_size > second.file->file_size); +} +// A static compator used to sort files based on their seqno +// In universal style : descending seqno +bool CompareSeqnoDescending(const Version::Fsize& first, + const Version::Fsize& second) { + if (first.file->smallest_seqno > second.file->smallest_seqno) { + assert(first.file->largest_seqno > second.file->largest_seqno); + return true; + } + assert(first.file->largest_seqno <= second.file->largest_seqno); + return false; +} + +} // anonymous namespace + +void Version::UpdateFilesBySize() { + if (cfd_->options()->compaction_style == kCompactionStyleFIFO) { + // don't need this + return; + } + // No need to sort the highest level because it is never compacted. + int max_level = + (cfd_->options()->compaction_style == kCompactionStyleUniversal) + ? 
NumberLevels() + : NumberLevels() - 1; + + for (int level = 0; level < max_level; level++) { + const std::vector& files = files_[level]; + std::vector& files_by_size = files_by_size_[level]; + assert(files_by_size.size() == 0); + + // populate a temp vector for sorting based on size + std::vector temp(files.size()); + for (unsigned int i = 0; i < files.size(); i++) { + temp[i].index = i; + temp[i].file = files[i]; + } + + // sort the top number_of_files_to_sort_ based on file size + if (cfd_->options()->compaction_style == kCompactionStyleUniversal) { + int num = temp.size(); + std::partial_sort(temp.begin(), temp.begin() + num, temp.end(), + CompareSeqnoDescending); + } else { + int num = Version::number_of_files_to_sort_; + if (num > (int)temp.size()) { + num = temp.size(); + } + std::partial_sort(temp.begin(), temp.begin() + num, temp.end(), + CompareSizeDescending); + } + assert(temp.size() == files.size()); + + // initialize files_by_size_ + for (unsigned int i = 0; i < temp.size(); i++) { + files_by_size.push_back(temp[i].index); + } + next_file_to_compact_by_size_[level] = 0; + assert(files_[level].size() == files_by_size_[level].size()); + } +} + +void Version::Ref() { + ++refs_; +} + +bool Version::Unref() { + assert(refs_ >= 1); + --refs_; + if (refs_ == 0) { + delete this; + return true; + } + return false; +} + +bool Version::NeedsCompaction() const { + if (file_to_compact_ != nullptr) { + return true; + } + // In universal compaction case, this check doesn't really + // check the compaction condition, but checks num of files threshold + // only. We are not going to miss any compaction opportunity + // but it's likely that more compactions are scheduled but + // ending up with nothing to do. We can improve it later. + // TODO(sdong): improve this function to be accurate for universal + // compactions. + int num_levels_to_check = + (cfd_->options()->compaction_style != kCompactionStyleUniversal && + cfd_->options()->compaction_style != kCompactionStyleFIFO) + ? NumberLevels() - 1 + : 1; + for (int i = 0; i < num_levels_to_check; i++) { + if (compaction_score_[i] >= 1) { + return true; + } + } + return false; +} + +bool Version::OverlapInLevel(int level, + const Slice* smallest_user_key, + const Slice* largest_user_key) { + return SomeFileOverlapsRange(cfd_->internal_comparator(), (level > 0), + files_[level], smallest_user_key, + largest_user_key); +} + +int Version::PickLevelForMemTableOutput( + const Slice& smallest_user_key, + const Slice& largest_user_key) { + int level = 0; + if (!OverlapInLevel(0, &smallest_user_key, &largest_user_key)) { + // Push to next level if there is no overlap in next level, + // and the #bytes overlapping in the level after that are limited. 
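+    // e.g. a flush covering ['k' .. 'm'] (hypothetical keys) that overlaps
+    // nothing in level 1 may be placed there directly, and keeps descending
+    // while the next level stays non-overlapping and the bytes overlapped
+    // two levels further down stay under MaxGrandParentOverlapBytes(level),
+    // never going past max_mem_compaction_level: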
+ InternalKey start(smallest_user_key, kMaxSequenceNumber, kValueTypeForSeek); + InternalKey limit(largest_user_key, 0, static_cast(0)); + std::vector overlaps; + int max_mem_compact_level = cfd_->options()->max_mem_compaction_level; + while (max_mem_compact_level > 0 && level < max_mem_compact_level) { + if (OverlapInLevel(level + 1, &smallest_user_key, &largest_user_key)) { + break; + } + if (level + 2 >= num_levels_) { + level++; + break; + } + GetOverlappingInputs(level + 2, &start, &limit, &overlaps); + const uint64_t sum = TotalFileSize(overlaps); + if (sum > cfd_->compaction_picker()->MaxGrandParentOverlapBytes(level)) { + break; + } + level++; + } + } + + return level; +} + +// Store in "*inputs" all files in "level" that overlap [begin,end] +// If hint_index is specified, then it points to a file in the +// overlapping range. +// The file_index returns a pointer to any file in an overlapping range. +void Version::GetOverlappingInputs(int level, + const InternalKey* begin, + const InternalKey* end, + std::vector* inputs, + int hint_index, + int* file_index) { + inputs->clear(); + Slice user_begin, user_end; + if (begin != nullptr) { + user_begin = begin->user_key(); + } + if (end != nullptr) { + user_end = end->user_key(); + } + if (file_index) { + *file_index = -1; + } + const Comparator* user_cmp = cfd_->internal_comparator().user_comparator(); + if (begin != nullptr && end != nullptr && level > 0) { + GetOverlappingInputsBinarySearch(level, user_begin, user_end, inputs, + hint_index, file_index); + return; + } + for (size_t i = 0; i < files_[level].size(); ) { + FileMetaData* f = files_[level][i++]; + const Slice file_start = f->smallest.user_key(); + const Slice file_limit = f->largest.user_key(); + if (begin != nullptr && user_cmp->Compare(file_limit, user_begin) < 0) { + // "f" is completely before specified range; skip it + } else if (end != nullptr && user_cmp->Compare(file_start, user_end) > 0) { + // "f" is completely after specified range; skip it + } else { + inputs->push_back(f); + if (level == 0) { + // Level-0 files may overlap each other. So check if the newly + // added file has expanded the range. If so, restart search. + if (begin != nullptr && user_cmp->Compare(file_start, user_begin) < 0) { + user_begin = file_start; + inputs->clear(); + i = 0; + } else if (end != nullptr + && user_cmp->Compare(file_limit, user_end) > 0) { + user_end = file_limit; + inputs->clear(); + i = 0; + } + } else if (file_index) { + *file_index = i-1; + } + } + } +} + +// Store in "*inputs" all files in "level" that overlap [begin,end] +// Employ binary search to find at least one file that overlaps the +// specified range. From that file, iterate backwards and +// forwards to find all overlapping files. +void Version::GetOverlappingInputsBinarySearch( + int level, + const Slice& user_begin, + const Slice& user_end, + std::vector* inputs, + int hint_index, + int* file_index) { + assert(level > 0); + int min = 0; + int mid = 0; + int max = files_[level].size() -1; + bool foundOverlap = false; + const Comparator* user_cmp = cfd_->internal_comparator().user_comparator(); + + // if the caller already knows the index of a file that has overlap, + // then we can skip the binary search. 
+ if (hint_index != -1) { + mid = hint_index; + foundOverlap = true; + } + + while (!foundOverlap && min <= max) { + mid = (min + max)/2; + FileMetaData* f = files_[level][mid]; + const Slice file_start = f->smallest.user_key(); + const Slice file_limit = f->largest.user_key(); + if (user_cmp->Compare(file_limit, user_begin) < 0) { + min = mid + 1; + } else if (user_cmp->Compare(user_end, file_start) < 0) { + max = mid - 1; + } else { + foundOverlap = true; + break; + } + } + + // If there were no overlapping files, return immediately. + if (!foundOverlap) { + return; + } + // returns the index where an overlap is found + if (file_index) { + *file_index = mid; + } + ExtendOverlappingInputs(level, user_begin, user_end, inputs, mid); +} + +// Store in "*inputs" all files in "level" that overlap [begin,end] +// The midIndex specifies the index of at least one file that +// overlaps the specified range. From that file, iterate backward +// and forward to find all overlapping files. +void Version::ExtendOverlappingInputs( + int level, + const Slice& user_begin, + const Slice& user_end, + std::vector* inputs, + unsigned int midIndex) { + + const Comparator* user_cmp = cfd_->internal_comparator().user_comparator(); +#ifndef NDEBUG + { + // assert that the file at midIndex overlaps with the range + assert(midIndex < files_[level].size()); + FileMetaData* f = files_[level][midIndex]; + const Slice fstart = f->smallest.user_key(); + const Slice flimit = f->largest.user_key(); + if (user_cmp->Compare(fstart, user_begin) >= 0) { + assert(user_cmp->Compare(fstart, user_end) <= 0); + } else { + assert(user_cmp->Compare(flimit, user_begin) >= 0); + } + } +#endif + int startIndex = midIndex + 1; + int endIndex = midIndex; + int count __attribute__((unused)) = 0; + + // check backwards from 'mid' to lower indices + for (int i = midIndex; i >= 0 ; i--) { + FileMetaData* f = files_[level][i]; + const Slice file_limit = f->largest.user_key(); + if (user_cmp->Compare(file_limit, user_begin) >= 0) { + startIndex = i; + assert((count++, true)); + } else { + break; + } + } + // check forward from 'mid+1' to higher indices + for (unsigned int i = midIndex+1; i < files_[level].size(); i++) { + FileMetaData* f = files_[level][i]; + const Slice file_start = f->smallest.user_key(); + if (user_cmp->Compare(file_start, user_end) <= 0) { + assert((count++, true)); + endIndex = i; + } else { + break; + } + } + assert(count == endIndex - startIndex + 1); + + // insert overlapping files into vector + for (int i = startIndex; i <= endIndex; i++) { + FileMetaData* f = files_[level][i]; + inputs->push_back(f); + } +} + +// Returns true iff the first or last file in inputs contains +// an overlapping user key to the file "just outside" of it (i.e. +// just after the last file, or just before the first file) +// REQUIRES: "*inputs" is a sorted list of non-overlapping files +bool Version::HasOverlappingUserKey( + const std::vector* inputs, + int level) { + + // If inputs empty, there is no overlap. + // If level == 0, it is assumed that all needed files were already included. + if (inputs->empty() || level == 0){ + return false; + } + + const Comparator* user_cmp = cfd_->internal_comparator().user_comparator(); + const std::vector& files = files_[level]; + const size_t kNumFiles = files.size(); + + // Check the last file in inputs against the file after it + size_t last_file = FindFile(cfd_->internal_comparator(), files, + inputs->back()->largest.Encode()); + assert(0 <= last_file && last_file < kNumFiles); // File should exist! 
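+  // e.g. with hypothetical files F1['a' .. 'c'] and F2['c' .. 'e'] in this
+  // level (the same user key 'c' ends F1 and starts F2 at different
+  // sequence numbers), compacting F1 alone would push the newer entries for
+  // 'c' down a level while older ones stay behind in F2, inverting their
+  // order, so the boundary checks below must report the overlap.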
+ if (last_file < kNumFiles-1) { // If not the last file + const Slice last_key_in_input = files[last_file]->largest.user_key(); + const Slice first_key_after = files[last_file+1]->smallest.user_key(); + if (user_cmp->Compare(last_key_in_input, first_key_after) == 0) { + // The last user key in input overlaps with the next file's first key + return true; + } + } + + // Check the first file in inputs against the file just before it + size_t first_file = FindFile(cfd_->internal_comparator(), files, + inputs->front()->smallest.Encode()); + assert(0 <= first_file && first_file <= last_file); // File should exist! + if (first_file > 0) { // If not first file + const Slice& first_key_in_input = files[first_file]->smallest.user_key(); + const Slice& last_key_before = files[first_file-1]->largest.user_key(); + if (user_cmp->Compare(first_key_in_input, last_key_before) == 0) { + // The first user key in input overlaps with the previous file's last key + return true; + } + } + + return false; +} + +int64_t Version::NumLevelBytes(int level) const { + assert(level >= 0); + assert(level < NumberLevels()); + return TotalFileSize(files_[level]); +} + +const char* Version::LevelSummary(LevelSummaryStorage* scratch) const { + int len = snprintf(scratch->buffer, sizeof(scratch->buffer), "files["); + for (int i = 0; i < NumberLevels(); i++) { + int sz = sizeof(scratch->buffer) - len; + int ret = snprintf(scratch->buffer + len, sz, "%d ", int(files_[i].size())); + if (ret < 0 || ret >= sz) break; + len += ret; + } + if (len > 0) { + // overwrite the last space + --len; + } + snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, "]"); + return scratch->buffer; +} + +const char* Version::LevelFileSummary(FileSummaryStorage* scratch, + int level) const { + int len = snprintf(scratch->buffer, sizeof(scratch->buffer), "files_size["); + for (const auto& f : files_[level]) { + int sz = sizeof(scratch->buffer) - len; + char sztxt[16]; + AppendHumanBytes(f->file_size, sztxt, 16); + int ret = snprintf(scratch->buffer + len, sz, + "#%" PRIu64 "(seq=%" PRIu64 ",sz=%s,%d) ", f->number, + f->smallest_seqno, sztxt, + static_cast(f->being_compacted)); + if (ret < 0 || ret >= sz) + break; + len += ret; + } + // overwrite the last space (only if files_[level].size() is non-zero) + if (files_[level].size() && len > 0) { + --len; + } + snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, "]"); + return scratch->buffer; +} + +int64_t Version::MaxNextLevelOverlappingBytes() { + uint64_t result = 0; + std::vector overlaps; + for (int level = 1; level < NumberLevels() - 1; level++) { + for (const auto& f : files_[level]) { + GetOverlappingInputs(level + 1, &f->smallest, &f->largest, &overlaps); + const uint64_t sum = TotalFileSize(overlaps); + if (sum > result) { + result = sum; + } + } + } + return result; +} + +void Version::AddLiveFiles(std::set* live) { + for (int level = 0; level < NumberLevels(); level++) { + const std::vector& files = files_[level]; + for (const auto& file : files) { + live->insert(file->number); + } + } +} + +std::string Version::DebugString(bool hex) const { + std::string r; + for (int level = 0; level < num_levels_; level++) { + // E.g., + // --- level 1 --- + // 17:123['a' .. 'd'] + // 20:43['e' .. 
'g'] + r.append("--- level "); + AppendNumberTo(&r, level); + r.append(" --- version# "); + AppendNumberTo(&r, version_number_); + r.append(" ---\n"); + const std::vector& files = files_[level]; + for (size_t i = 0; i < files.size(); i++) { + r.push_back(' '); + AppendNumberTo(&r, files[i]->number); + r.push_back(':'); + AppendNumberTo(&r, files[i]->file_size); + r.append("["); + r.append(files[i]->smallest.DebugString(hex)); + r.append(" .. "); + r.append(files[i]->largest.DebugString(hex)); + r.append("]\n"); + } + } + return r; +} + +// this is used to batch writes to the manifest file +struct VersionSet::ManifestWriter { + Status status; + bool done; + port::CondVar cv; + ColumnFamilyData* cfd; + VersionEdit* edit; + + explicit ManifestWriter(port::Mutex* mu, ColumnFamilyData* cfd, + VersionEdit* e) + : done(false), cv(mu), cfd(cfd), edit(e) {} +}; + +// A helper class so we can efficiently apply a whole sequence +// of edits to a particular state without creating intermediate +// Versions that contain full copies of the intermediate state. +class VersionSet::Builder { + private: + // Helper to sort v->files_ + // kLevel0LevelCompaction -- NewestFirst (also used for FIFO compaction) + // kLevel0UniversalCompaction -- NewestFirstBySeqNo + // kLevelNon0 -- BySmallestKey + struct FileComparator { + enum SortMethod { + kLevel0LevelCompaction = 0, + kLevel0UniversalCompaction = 1, + kLevelNon0 = 2, + } sort_method; + const InternalKeyComparator* internal_comparator; + + bool operator()(FileMetaData* f1, FileMetaData* f2) const { + switch (sort_method) { + case kLevel0LevelCompaction: + return NewestFirst(f1, f2); + case kLevel0UniversalCompaction: + return NewestFirstBySeqNo(f1, f2); + case kLevelNon0: + return BySmallestKey(f1, f2, internal_comparator); + } + assert(false); + return false; + } + }; + + typedef std::set FileSet; + struct LevelState { + std::set deleted_files; + FileSet* added_files; + }; + + ColumnFamilyData* cfd_; + Version* base_; + LevelState* levels_; + FileComparator level_zero_cmp_; + FileComparator level_nonzero_cmp_; + + public: + Builder(ColumnFamilyData* cfd) : cfd_(cfd), base_(cfd->current()) { + base_->Ref(); + levels_ = new LevelState[base_->NumberLevels()]; + level_zero_cmp_.sort_method = + (cfd_->options()->compaction_style == kCompactionStyleUniversal) + ? 
FileComparator::kLevel0UniversalCompaction + : FileComparator::kLevel0LevelCompaction; + level_nonzero_cmp_.sort_method = FileComparator::kLevelNon0; + level_nonzero_cmp_.internal_comparator = &cfd->internal_comparator(); + + levels_[0].added_files = new FileSet(level_zero_cmp_); + for (int level = 1; level < base_->NumberLevels(); level++) { + levels_[level].added_files = new FileSet(level_nonzero_cmp_); + } + } + + ~Builder() { + for (int level = 0; level < base_->NumberLevels(); level++) { + const FileSet* added = levels_[level].added_files; + std::vector to_unref; + to_unref.reserve(added->size()); + for (FileSet::const_iterator it = added->begin(); + it != added->end(); ++it) { + to_unref.push_back(*it); + } + delete added; + for (uint32_t i = 0; i < to_unref.size(); i++) { + FileMetaData* f = to_unref[i]; + f->refs--; + if (f->refs <= 0) { + if (f->table_reader_handle) { + cfd_->table_cache()->ReleaseHandle(f->table_reader_handle); + f->table_reader_handle = nullptr; + } + delete f; + } + } + } + + delete[] levels_; + base_->Unref(); + } + + void CheckConsistency(Version* v) { +#ifndef NDEBUG + // make sure the files are sorted correctly + for (int level = 0; level < v->NumberLevels(); level++) { + for (size_t i = 1; i < v->files_[level].size(); i++) { + auto f1 = v->files_[level][i - 1]; + auto f2 = v->files_[level][i]; + if (level == 0) { + assert(level_zero_cmp_(f1, f2)); + if (cfd_->options()->compaction_style == kCompactionStyleUniversal) { + assert(f1->largest_seqno > f2->largest_seqno); + } + } else { + assert(level_nonzero_cmp_(f1, f2)); + + // Make sure there is no overlap in levels > 0 + if (cfd_->internal_comparator().Compare(f1->largest, f2->smallest) >= + 0) { + fprintf(stderr, "overlapping ranges in same level %s vs. %s\n", + (f1->largest).DebugString().c_str(), + (f2->smallest).DebugString().c_str()); + abort(); + } + } + } + } +#endif + } + + void CheckConsistencyForDeletes(VersionEdit* edit, unsigned int number, + int level) { +#ifndef NDEBUG + // a file to be deleted better exist in the previous version + bool found = false; + for (int l = 0; !found && l < base_->NumberLevels(); l++) { + const std::vector& base_files = base_->files_[l]; + for (unsigned int i = 0; i < base_files.size(); i++) { + FileMetaData* f = base_files[i]; + if (f->number == number) { + found = true; + break; + } + } + } + // if the file did not exist in the previous version, then it + // is possibly moved from lower level to higher level in current + // version + for (int l = level+1; !found && l < base_->NumberLevels(); l++) { + const FileSet* added = levels_[l].added_files; + for (FileSet::const_iterator added_iter = added->begin(); + added_iter != added->end(); ++added_iter) { + FileMetaData* f = *added_iter; + if (f->number == number) { + found = true; + break; + } + } + } + + // maybe this file was added in a previous edit that was Applied + if (!found) { + const FileSet* added = levels_[level].added_files; + for (FileSet::const_iterator added_iter = added->begin(); + added_iter != added->end(); ++added_iter) { + FileMetaData* f = *added_iter; + if (f->number == number) { + found = true; + break; + } + } + } + assert(found); +#endif + } + + // Apply all of the edits in *edit to the current state. 
+ void Apply(VersionEdit* edit) { + CheckConsistency(base_); + + // Delete files + const VersionEdit::DeletedFileSet& del = edit->deleted_files_; + for (const auto& del_file : del) { + const auto level = del_file.first; + const auto number = del_file.second; + levels_[level].deleted_files.insert(number); + CheckConsistencyForDeletes(edit, number, level); + } + + // Add new files + for (const auto& new_file : edit->new_files_) { + const int level = new_file.first; + FileMetaData* f = new FileMetaData(new_file.second); + f->refs = 1; + + // We arrange to automatically compact this file after + // a certain number of seeks. Let's assume: + // (1) One seek costs 10ms + // (2) Writing or reading 1MB costs 10ms (100MB/s) + // (3) A compaction of 1MB does 25MB of IO: + // 1MB read from this level + // 10-12MB read from next level (boundaries may be misaligned) + // 10-12MB written to next level + // This implies that 25 seeks cost the same as the compaction + // of 1MB of data. I.e., one seek costs approximately the + // same as the compaction of 40KB of data. We are a little + // conservative and allow approximately one seek for every 16KB + // of data before triggering a compaction. + f->allowed_seeks = (f->file_size / 16384); + if (f->allowed_seeks < 100) f->allowed_seeks = 100; + + levels_[level].deleted_files.erase(f->number); + levels_[level].added_files->insert(f); + } + } + + // Save the current state in *v. + void SaveTo(Version* v) { + CheckConsistency(base_); + CheckConsistency(v); + + for (int level = 0; level < base_->NumberLevels(); level++) { + const auto& cmp = (level == 0) ? level_zero_cmp_ : level_nonzero_cmp_; + // Merge the set of added files with the set of pre-existing files. + // Drop any deleted files. Store the result in *v. + const auto& base_files = base_->files_[level]; + auto base_iter = base_files.begin(); + auto base_end = base_files.end(); + const auto& added_files = *levels_[level].added_files; + v->files_[level].reserve(base_files.size() + added_files.size()); + + for (const auto& added : added_files) { + // Add all smaller files listed in base_ + for (auto bpos = std::upper_bound(base_iter, base_end, added, cmp); + base_iter != bpos; + ++base_iter) { + MaybeAddFile(v, level, *base_iter); + } + + MaybeAddFile(v, level, added); + } + + // Add remaining base files + for (; base_iter != base_end; ++base_iter) { + MaybeAddFile(v, level, *base_iter); + } + } + + CheckConsistency(v); + + v->file_indexer_.UpdateIndex(v->files_); + } + + void LoadTableHandlers() { + for (int level = 0; level < cfd_->NumberLevels(); level++) { + for (auto& file_meta : *(levels_[level].added_files)) { + assert (!file_meta->table_reader_handle); + bool table_io; + cfd_->table_cache()->FindTable( + base_->vset_->storage_options_, cfd_->internal_comparator(), + file_meta->number, file_meta->file_size, + &file_meta->table_reader_handle, &table_io, false); + if (file_meta->table_reader_handle != nullptr) { + // Load table_reader + file_meta->table_reader = + cfd_->table_cache()->GetTableReaderFromHandle( + file_meta->table_reader_handle); + } + } + } + } + + void MaybeAddFile(Version* v, int level, FileMetaData* f) { + if (levels_[level].deleted_files.count(f->number) > 0) { + // File is deleted: do nothing + } else { + auto* files = &v->files_[level]; + if (level > 0 && !files->empty()) { + // Must not overlap + assert(cfd_->internal_comparator().Compare( + (*files)[files->size() - 1]->largest, f->smallest) < 0); + } + f->refs++; + files->push_back(f); + } + } +}; + 
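+// A worked example of the allowed_seeks heuristic in Builder::Apply()
+// above: a 2MB file gets 2 * 1048576 / 16384 = 128 allowed seeks, while any
+// file smaller than about 1.6MB hits the floor of 100. Under the stated
+// cost model (one seek costing about as much as compacting 40KB), the floor
+// keeps small files from being scheduled for seek-triggered compaction
+// after only a handful of reads.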
+VersionSet::VersionSet(const std::string& dbname, const DBOptions* options, + const EnvOptions& storage_options, Cache* table_cache) + : column_family_set_(new ColumnFamilySet(dbname, options, storage_options, + table_cache)), + env_(options->env), + dbname_(dbname), + options_(options), + next_file_number_(2), + manifest_file_number_(0), // Filled by Recover() + pending_manifest_file_number_(0), + last_sequence_(0), + prev_log_number_(0), + current_version_number_(0), + manifest_file_size_(0), + storage_options_(storage_options), + storage_options_compactions_(storage_options_) {} + +VersionSet::~VersionSet() { + // we need to delete column_family_set_ because its destructor depends on + // VersionSet + column_family_set_.reset(); + for (auto file : obsolete_files_) { + delete file; + } + obsolete_files_.clear(); +} + +void VersionSet::AppendVersion(ColumnFamilyData* column_family_data, + Version* v) { + // Make "v" current + assert(v->refs_ == 0); + Version* current = column_family_data->current(); + assert(v != current); + if (current != nullptr) { + assert(current->refs_ > 0); + current->Unref(); + } + column_family_data->SetCurrent(v); + v->Ref(); + + // Append to linked list + v->prev_ = column_family_data->dummy_versions()->prev_; + v->next_ = column_family_data->dummy_versions(); + v->prev_->next_ = v; + v->next_->prev_ = v; +} + +Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, + VersionEdit* edit, port::Mutex* mu, + Directory* db_directory, bool new_descriptor_log, + const ColumnFamilyOptions* options) { + mu->AssertHeld(); + + // column_family_data can be nullptr only if this is column_family_add. + // in that case, we also need to specify ColumnFamilyOptions + if (column_family_data == nullptr) { + assert(edit->is_column_family_add_); + assert(options != nullptr); + } + + // queue our request + ManifestWriter w(mu, column_family_data, edit); + manifest_writers_.push_back(&w); + while (!w.done && &w != manifest_writers_.front()) { + w.cv.Wait(); + } + if (w.done) { + return w.status; + } + if (column_family_data != nullptr && column_family_data->IsDropped()) { + // if column family is dropped by the time we get here, no need to write + // anything to the manifest + manifest_writers_.pop_front(); + // Notify new head of write queue + if (!manifest_writers_.empty()) { + manifest_writers_.front()->cv.Signal(); + } + return Status::OK(); + } + + std::vector batch_edits; + Version* v = nullptr; + std::unique_ptr builder(nullptr); + + // process all requests in the queue + ManifestWriter* last_writer = &w; + assert(!manifest_writers_.empty()); + assert(manifest_writers_.front() == &w); + if (edit->IsColumnFamilyManipulation()) { + // no group commits for column family add or drop + LogAndApplyCFHelper(edit); + batch_edits.push_back(edit); + } else { + v = new Version(column_family_data, this, current_version_number_++); + builder.reset(new Builder(column_family_data)); + for (const auto& writer : manifest_writers_) { + if (writer->edit->IsColumnFamilyManipulation() || + writer->cfd->GetID() != column_family_data->GetID()) { + // no group commits for column family add or drop + // also, group commits across column families are not supported + break; + } + last_writer = writer; + LogAndApplyHelper(column_family_data, builder.get(), v, last_writer->edit, + mu); + batch_edits.push_back(last_writer->edit); + } + builder->SaveTo(v); + } + + // Initialize new descriptor log file if necessary by creating + // a temporary file that contains a snapshot of the current 
version. + uint64_t new_manifest_file_size = 0; + Status s; + + assert(pending_manifest_file_number_ == 0); + if (!descriptor_log_ || + manifest_file_size_ > options_->max_manifest_file_size) { + pending_manifest_file_number_ = NewFileNumber(); + batch_edits.back()->SetNextFile(next_file_number_); + new_descriptor_log = true; + } else { + pending_manifest_file_number_ = manifest_file_number_; + } + + if (new_descriptor_log) { + // if we're writing out new snapshot make sure to persist max column family + if (column_family_set_->GetMaxColumnFamily() > 0) { + edit->SetMaxColumnFamily(column_family_set_->GetMaxColumnFamily()); + } + } + + // Unlock during expensive operations. New writes cannot get here + // because &w is ensuring that all new writes get queued. + { + std::vector size_being_compacted; + if (!edit->IsColumnFamilyManipulation()) { + size_being_compacted.resize(v->NumberLevels() - 1); + // calculate the amount of data being compacted at every level + column_family_data->compaction_picker()->SizeBeingCompacted( + size_being_compacted); + } + + mu->Unlock(); + + if (!edit->IsColumnFamilyManipulation() && options_->max_open_files == -1) { + // unlimited table cache. Pre-load table handle now. + // Need to do it out of the mutex. + builder->LoadTableHandlers(); + } + + // This is fine because everything inside of this block is serialized -- + // only one thread can be here at the same time + if (new_descriptor_log) { + unique_ptr descriptor_file; + s = env_->NewWritableFile( + DescriptorFileName(dbname_, pending_manifest_file_number_), + &descriptor_file, env_->OptimizeForManifestWrite(storage_options_)); + if (s.ok()) { + descriptor_file->SetPreallocationBlockSize( + options_->manifest_preallocation_size); + descriptor_log_.reset(new log::Writer(std::move(descriptor_file))); + s = WriteSnapshot(descriptor_log_.get()); + } + } + + if (!edit->IsColumnFamilyManipulation()) { + // The calls to ComputeCompactionScore and UpdateFilesBySize are cpu-heavy + // and is best called outside the mutex. + v->ComputeCompactionScore(size_being_compacted); + v->UpdateFilesBySize(); + } + + // Write new record to MANIFEST log + if (s.ok()) { + for (auto& e : batch_edits) { + std::string record; + e->EncodeTo(&record); + s = descriptor_log_->AddRecord(record); + if (!s.ok()) { + break; + } + } + if (s.ok()) { + if (options_->use_fsync) { + StopWatch sw(env_, options_->statistics.get(), + MANIFEST_FILE_SYNC_MICROS); + s = descriptor_log_->file()->Fsync(); + } else { + StopWatch sw(env_, options_->statistics.get(), + MANIFEST_FILE_SYNC_MICROS); + s = descriptor_log_->file()->Sync(); + } + } + if (!s.ok()) { + Log(options_->info_log, "MANIFEST write: %s\n", s.ToString().c_str()); + bool all_records_in = true; + for (auto& e : batch_edits) { + std::string record; + e->EncodeTo(&record); + if (!ManifestContains(pending_manifest_file_number_, record)) { + all_records_in = false; + break; + } + } + if (all_records_in) { + Log(options_->info_log, + "MANIFEST contains log record despite error; advancing to new " + "version to prevent mismatch between in-memory and logged state" + " If paranoid is set, then the db is now in readonly mode."); + s = Status::OK(); + } + } + } + + // If we just created a new descriptor file, install it by writing a + // new CURRENT file that points to it. 
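// [Editor's note] An illustrative sketch of what SetCurrentFile() (defined in
// db/filename.cc) does in the block below: publish the new manifest by writing
// "MANIFEST-<number>\n" to a temporary file and renaming it over CURRENT, so a
// reader never observes a partially written pointer. Plain POSIX calls stand in
// for the Env abstraction; the temp-file name is an assumption, and the real
// code also syncs the file and directory before trusting the rename.
#include <cstdint>
#include <cstdio>
#include <string>

bool InstallCurrentFile(const std::string& dbname, uint64_t manifest_number) {
  char contents[64];
  std::snprintf(contents, sizeof(contents), "MANIFEST-%06llu\n",
                static_cast<unsigned long long>(manifest_number));
  const std::string tmp = dbname + "/CURRENT.dbtmp";  // hypothetical temp name
  std::FILE* f = std::fopen(tmp.c_str(), "w");
  if (f == nullptr) return false;
  const bool wrote = std::fputs(contents, f) >= 0 && std::fflush(f) == 0;
  std::fclose(f);
  if (!wrote) return false;
  // rename() is atomic on POSIX: CURRENT always names a complete manifest.
  return std::rename(tmp.c_str(), (dbname + "/CURRENT").c_str()) == 0;
}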
+ if (s.ok() && new_descriptor_log) { + s = SetCurrentFile(env_, dbname_, pending_manifest_file_number_, + db_directory); + if (s.ok() && pending_manifest_file_number_ > manifest_file_number_) { + // delete old manifest file + Log(options_->info_log, + "Deleting manifest %" PRIu64 " current manifest %" PRIu64 "\n", + manifest_file_number_, pending_manifest_file_number_); + // we don't care about an error here, PurgeObsoleteFiles will take care + // of it later + env_->DeleteFile(DescriptorFileName(dbname_, manifest_file_number_)); + } + } + + if (s.ok()) { + // find offset in manifest file where this version is stored. + new_manifest_file_size = descriptor_log_->file()->GetFileSize(); + } + + LogFlush(options_->info_log); + mu->Lock(); + } + + // Install the new version + if (s.ok()) { + if (edit->is_column_family_add_) { + // no group commit on column family add + assert(batch_edits.size() == 1); + assert(options != nullptr); + CreateColumnFamily(*options, edit); + } else if (edit->is_column_family_drop_) { + assert(batch_edits.size() == 1); + column_family_data->SetDropped(); + if (column_family_data->Unref()) { + delete column_family_data; + } + } else { + uint64_t max_log_number_in_batch = 0; + for (auto& e : batch_edits) { + if (e->has_log_number_) { + max_log_number_in_batch = + std::max(max_log_number_in_batch, e->log_number_); + } + } + if (max_log_number_in_batch != 0) { + assert(column_family_data->GetLogNumber() <= max_log_number_in_batch); + column_family_data->SetLogNumber(max_log_number_in_batch); + } + AppendVersion(column_family_data, v); + } + + manifest_file_number_ = pending_manifest_file_number_; + manifest_file_size_ = new_manifest_file_size; + prev_log_number_ = edit->prev_log_number_; + } else { + Log(options_->info_log, "Error in committing version %lu to [%s]", + (unsigned long)v->GetVersionNumber(), + column_family_data->GetName().c_str()); + delete v; + if (new_descriptor_log) { + descriptor_log_.reset(); + env_->DeleteFile( + DescriptorFileName(dbname_, pending_manifest_file_number_)); + } + } + pending_manifest_file_number_ = 0; + + // wake up all the waiting writers + while (true) { + ManifestWriter* ready = manifest_writers_.front(); + manifest_writers_.pop_front(); + if (ready != &w) { + ready->status = s; + ready->done = true; + ready->cv.Signal(); + } + if (ready == last_writer) break; + } + // Notify new head of write queue + if (!manifest_writers_.empty()) { + manifest_writers_.front()->cv.Signal(); + } + return s; +} + +void VersionSet::LogAndApplyCFHelper(VersionEdit* edit) { + assert(edit->IsColumnFamilyManipulation()); + edit->SetNextFile(next_file_number_); + edit->SetLastSequence(last_sequence_); + if (edit->is_column_family_drop_) { + // if we drop column family, we have to make sure to save max column family, + // so that we don't reuse existing ID + edit->SetMaxColumnFamily(column_family_set_->GetMaxColumnFamily()); + } +} + +void VersionSet::LogAndApplyHelper(ColumnFamilyData* cfd, Builder* builder, + Version* v, VersionEdit* edit, + port::Mutex* mu) { + mu->AssertHeld(); + assert(!edit->IsColumnFamilyManipulation()); + + if (edit->has_log_number_) { + assert(edit->log_number_ >= cfd->GetLogNumber()); + assert(edit->log_number_ < next_file_number_); + } + + if (!edit->has_prev_log_number_) { + edit->SetPrevLogNumber(prev_log_number_); + } + edit->SetNextFile(next_file_number_); + edit->SetLastSequence(last_sequence_); + + builder->Apply(edit); +} + +Status VersionSet::Recover( + const std::vector& column_families, + bool read_only) { + 
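// [Editor's note] Recover(), which begins here, first dereferences CURRENT to
// find the manifest. A minimal standalone sketch of that first step, using
// plain iostreams instead of Env/ReadFileToString: read the file, insist on a
// trailing newline (a torn write leaves none), and strip it to get the name.
#include <fstream>
#include <sstream>
#include <string>

bool ReadCurrentFile(const std::string& dbname, std::string* manifest_name) {
  std::ifstream in(dbname + "/CURRENT", std::ios::binary);
  if (!in) return false;
  std::ostringstream buf;
  buf << in.rdbuf();
  std::string contents = buf.str();
  if (contents.empty() || contents.back() != '\n') {
    return false;  // corruption: CURRENT must end with a newline
  }
  contents.pop_back();
  *manifest_name = contents;  // e.g. "MANIFEST-000005"
  return true;
}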
std::unordered_map cf_name_to_options; + for (auto cf : column_families) { + cf_name_to_options.insert({cf.name, cf.options}); + } + // keeps track of column families in manifest that were not found in + // column families parameters. if those column families are not dropped + // by subsequent manifest records, Recover() will return failure status + std::unordered_map column_families_not_found; + + // Read "CURRENT" file, which contains a pointer to the current manifest file + std::string manifest_filename; + Status s = ReadFileToString( + env_, CurrentFileName(dbname_), &manifest_filename + ); + if (!s.ok()) { + return s; + } + if (manifest_filename.empty() || + manifest_filename.back() != '\n') { + return Status::Corruption("CURRENT file does not end with newline"); + } + // remove the trailing '\n' + manifest_filename.resize(manifest_filename.size() - 1); + FileType type; + bool parse_ok = + ParseFileName(manifest_filename, &manifest_file_number_, &type); + if (!parse_ok || type != kDescriptorFile) { + return Status::Corruption("CURRENT file corrupted"); + } + + Log(options_->info_log, "Recovering from manifest file: %s\n", + manifest_filename.c_str()); + + manifest_filename = dbname_ + "/" + manifest_filename; + unique_ptr manifest_file; + s = env_->NewSequentialFile(manifest_filename, &manifest_file, + storage_options_); + if (!s.ok()) { + return s; + } + uint64_t manifest_file_size; + s = env_->GetFileSize(manifest_filename, &manifest_file_size); + if (!s.ok()) { + return s; + } + + bool have_log_number = false; + bool have_prev_log_number = false; + bool have_next_file = false; + bool have_last_sequence = false; + uint64_t next_file = 0; + uint64_t last_sequence = 0; + uint64_t log_number = 0; + uint64_t prev_log_number = 0; + uint32_t max_column_family = 0; + std::unordered_map builders; + + // add default column family + auto default_cf_iter = cf_name_to_options.find(kDefaultColumnFamilyName); + if (default_cf_iter == cf_name_to_options.end()) { + return Status::InvalidArgument("Default column family not specified"); + } + VersionEdit default_cf_edit; + default_cf_edit.AddColumnFamily(kDefaultColumnFamilyName); + default_cf_edit.SetColumnFamily(0); + ColumnFamilyData* default_cfd = + CreateColumnFamily(default_cf_iter->second, &default_cf_edit); + builders.insert({0, new Builder(default_cfd)}); + + { + VersionSet::LogReporter reporter; + reporter.status = &s; + log::Reader reader(std::move(manifest_file), &reporter, true /*checksum*/, + 0 /*initial_offset*/); + Slice record; + std::string scratch; + while (reader.ReadRecord(&record, &scratch) && s.ok()) { + VersionEdit edit; + s = edit.DecodeFrom(record); + if (!s.ok()) { + break; + } + + // Not found means that user didn't supply that column + // family option AND we encountered column family add + // record. Once we encounter column family drop record, + // we will delete the column family from + // column_families_not_found. 
+ bool cf_in_not_found = + column_families_not_found.find(edit.column_family_) != + column_families_not_found.end(); + // in builders means that user supplied that column family + // option AND that we encountered column family add record + bool cf_in_builders = + builders.find(edit.column_family_) != builders.end(); + + // they can't both be true + assert(!(cf_in_not_found && cf_in_builders)); + + ColumnFamilyData* cfd = nullptr; + + if (edit.is_column_family_add_) { + if (cf_in_builders || cf_in_not_found) { + s = Status::Corruption( + "Manifest adding the same column family twice"); + break; + } + auto cf_options = cf_name_to_options.find(edit.column_family_name_); + if (cf_options == cf_name_to_options.end()) { + column_families_not_found.insert( + {edit.column_family_, edit.column_family_name_}); + } else { + cfd = CreateColumnFamily(cf_options->second, &edit); + builders.insert({edit.column_family_, new Builder(cfd)}); + } + } else if (edit.is_column_family_drop_) { + if (cf_in_builders) { + auto builder = builders.find(edit.column_family_); + assert(builder != builders.end()); + delete builder->second; + builders.erase(builder); + cfd = column_family_set_->GetColumnFamily(edit.column_family_); + if (cfd->Unref()) { + delete cfd; + cfd = nullptr; + } else { + // who else can have reference to cfd!? + assert(false); + } + } else if (cf_in_not_found) { + column_families_not_found.erase(edit.column_family_); + } else { + s = Status::Corruption( + "Manifest - dropping non-existing column family"); + break; + } + } else if (!cf_in_not_found) { + if (!cf_in_builders) { + s = Status::Corruption( + "Manifest record referencing unknown column family"); + break; + } + + cfd = column_family_set_->GetColumnFamily(edit.column_family_); + // this should never happen since cf_in_builders is true + assert(cfd != nullptr); + if (edit.max_level_ >= cfd->current()->NumberLevels()) { + s = Status::InvalidArgument( + "db has more levels than options.num_levels"); + break; + } + + // if it is not column family add or column family drop, + // then it's a file add/delete, which should be forwarded + // to builder + auto builder = builders.find(edit.column_family_); + assert(builder != builders.end()); + builder->second->Apply(&edit); + } + + if (cfd != nullptr) { + if (edit.has_log_number_) { + if (cfd->GetLogNumber() > edit.log_number_) { + Log(options_->info_log, + "MANIFEST corruption detected, but ignored - Log numbers in " + "records NOT monotonically increasing"); + } else { + cfd->SetLogNumber(edit.log_number_); + have_log_number = true; + } + } + if (edit.has_comparator_ && + edit.comparator_ != cfd->user_comparator()->Name()) { + s = Status::InvalidArgument( + cfd->user_comparator()->Name(), + "does not match existing comparator " + edit.comparator_); + break; + } + } + + if (edit.has_prev_log_number_) { + prev_log_number = edit.prev_log_number_; + have_prev_log_number = true; + } + + if (edit.has_next_file_number_) { + next_file = edit.next_file_number_; + have_next_file = true; + } + + if (edit.has_max_column_family_) { + max_column_family = edit.max_column_family_; + } + + if (edit.has_last_sequence_) { + last_sequence = edit.last_sequence_; + have_last_sequence = true; + } + } + } + + if (s.ok()) { + if (!have_next_file) { + s = Status::Corruption("no meta-nextfile entry in descriptor"); + } else if (!have_log_number) { + s = Status::Corruption("no meta-lognumber entry in descriptor"); + } else if (!have_last_sequence) { + s = Status::Corruption("no last-sequence-number entry in descriptor"); + 
}
+
+    if (!have_prev_log_number) {
+      prev_log_number = 0;
+    }
+
+    column_family_set_->UpdateMaxColumnFamily(max_column_family);
+
+    MarkFileNumberUsed(prev_log_number);
+    MarkFileNumberUsed(log_number);
+  }
+
+  // there were some column families in the MANIFEST that weren't specified
+  // in the argument. This is OK in read_only mode
+  if (read_only == false && column_families_not_found.size() > 0) {
+    std::string list_of_not_found;
+    for (const auto& cf : column_families_not_found) {
+      list_of_not_found += ", " + cf.second;
+    }
+    list_of_not_found = list_of_not_found.substr(2);
+    s = Status::InvalidArgument(
+        "You have to open all column families. Column families not opened: " +
+        list_of_not_found);
+  }
+
+  if (s.ok()) {
+    for (auto cfd : *column_family_set_) {
+      auto builders_iter = builders.find(cfd->GetID());
+      assert(builders_iter != builders.end());
+      auto builder = builders_iter->second;
+
+      if (options_->max_open_files == -1) {
+        // unlimited table cache. Pre-load table handle now.
+        // Need to do it out of the mutex.
+        builder->LoadTableHandlers();
+      }
+
+      Version* v = new Version(cfd, this, current_version_number_++);
+      builder->SaveTo(v);
+
+      // Install recovered version
+      std::vector<uint64_t> size_being_compacted(v->NumberLevels() - 1);
+      cfd->compaction_picker()->SizeBeingCompacted(size_being_compacted);
+      v->ComputeCompactionScore(size_being_compacted);
+      v->UpdateFilesBySize();
+      AppendVersion(cfd, v);
+    }
+
+    manifest_file_size_ = manifest_file_size;
+    next_file_number_ = next_file + 1;
+    last_sequence_ = last_sequence;
+    prev_log_number_ = prev_log_number;
+
+    Log(options_->info_log,
+        "Recovered from manifest file %s succeeded, "
+        "manifest_file_number is %lu, next_file_number is %lu, "
+        "last_sequence is %lu, log_number is %lu, "
+        "prev_log_number is %lu, "
+        "max_column_family is %u\n",
+        manifest_filename.c_str(),
+        (unsigned long)manifest_file_number_,
+        (unsigned long)next_file_number_,
+        (unsigned long)last_sequence_,
+        (unsigned long)log_number,
+        (unsigned long)prev_log_number_,
+        column_family_set_->GetMaxColumnFamily());
+
+    for (auto cfd : *column_family_set_) {
+      Log(options_->info_log,
+          "Column family [%s] (ID %u), log number is %" PRIu64 "\n",
+          cfd->GetName().c_str(), cfd->GetID(), cfd->GetLogNumber());
+    }
+  }
+
+  for (auto builder : builders) {
+    delete builder.second;
+  }
+
+  return s;
+}
+
+Status VersionSet::ListColumnFamilies(
+    std::vector<std::string>* column_families,
+    const std::string& dbname, Env* env) {
+  // these are just for performance reasons, not correctness,
+  // so we're fine using the defaults
+  EnvOptions soptions;
+  // Read "CURRENT" file, which contains a pointer to the current manifest file
+  std::string current;
+  Status s = ReadFileToString(env, CurrentFileName(dbname), &current);
+  if (!s.ok()) {
+    return s;
+  }
+  if (current.empty() || current[current.size() - 1] != '\n') {
+    return Status::Corruption("CURRENT file does not end with newline");
+  }
+  current.resize(current.size() - 1);
+
+  std::string dscname = dbname + "/" + current;
+  unique_ptr<SequentialFile> file;
+  s = env->NewSequentialFile(dscname, &file, soptions);
+  if (!s.ok()) {
+    return s;
+  }
+
+  std::map<uint32_t, std::string> column_family_names;
+  // default column family is always implicitly there
+  column_family_names.insert({0, kDefaultColumnFamilyName});
+  VersionSet::LogReporter reporter;
+  reporter.status = &s;
+  log::Reader reader(std::move(file), &reporter, true /*checksum*/,
+                     0 /*initial_offset*/);
+  Slice record;
+  std::string scratch;
+  while (reader.ReadRecord(&record, &scratch) && s.ok()) {
+    VersionEdit edit;
+    s = edit.DecodeFrom(record);
+    if (!s.ok()) {
+      break;
+    }
+    if (edit.is_column_family_add_) {
+      if (column_family_names.find(edit.column_family_) !=
+          column_family_names.end()) {
+        s = Status::Corruption("Manifest adding the same column family twice");
+        break;
+      }
+      column_family_names.insert(
+          {edit.column_family_, edit.column_family_name_});
+    } else if (edit.is_column_family_drop_) {
+      if (column_family_names.find(edit.column_family_) ==
+          column_family_names.end()) {
+        s = Status::Corruption(
+            "Manifest - dropping non-existing column family");
+        break;
+      }
+      column_family_names.erase(edit.column_family_);
+    }
+  }
+
+  column_families->clear();
+  if (s.ok()) {
+    for (const auto& iter : column_family_names) {
+      column_families->push_back(iter.second);
+    }
+  }
+
+  return s;
+}
+
+#ifndef ROCKSDB_LITE
+Status VersionSet::ReduceNumberOfLevels(const std::string& dbname,
+                                        const Options* options,
+                                        const EnvOptions& storage_options,
+                                        int new_levels) {
+  if (new_levels <= 1) {
+    return Status::InvalidArgument(
+        "Number of levels needs to be bigger than 1");
+  }
+
+  ColumnFamilyOptions cf_options(*options);
+  std::shared_ptr<Cache> tc(NewLRUCache(
+      options->max_open_files - 10, options->table_cache_numshardbits,
+      options->table_cache_remove_scan_count_limit));
+  VersionSet versions(dbname, options, storage_options, tc.get());
+  Status status;
+
+  std::vector<ColumnFamilyDescriptor> dummy;
+  ColumnFamilyDescriptor dummy_descriptor(kDefaultColumnFamilyName,
+                                          ColumnFamilyOptions(*options));
+  dummy.push_back(dummy_descriptor);
+  status = versions.Recover(dummy);
+  if (!status.ok()) {
+    return status;
+  }
+
+  Version* current_version =
+      versions.GetColumnFamilySet()->GetDefault()->current();
+  int current_levels = current_version->NumberLevels();
+
+  if (current_levels <= new_levels) {
+    return Status::OK();
+  }
+
+  // Make sure there are files on only one level from
+  // (new_levels-1) to (current_levels-1)
+  int first_nonempty_level = -1;
+  int first_nonempty_level_filenum = 0;
+  for (int i = new_levels - 1; i < current_levels; i++) {
+    int file_num = current_version->NumLevelFiles(i);
+    if (file_num != 0) {
+      if (first_nonempty_level < 0) {
+        first_nonempty_level = i;
+        first_nonempty_level_filenum = file_num;
+      } else {
+        char msg[255];
+        snprintf(msg, sizeof(msg),
+                 "Found at least two levels containing files: "
+                 "[%d:%d],[%d:%d].\n",
+                 first_nonempty_level, first_nonempty_level_filenum, i,
+                 file_num);
+        return Status::InvalidArgument(msg);
+      }
+    }
+  }
+
+  std::vector<FileMetaData*>* old_files_list = current_version->files_;
+  // we need to allocate an array sized to the old number of levels to
+  // avoid SIGSEGV in WriteSnapshot();
+  // however, all levels greater than or equal to new_levels will be empty
+  std::vector<FileMetaData*>* new_files_list =
+      new std::vector<FileMetaData*>[current_levels];
+  for (int i = 0; i < new_levels - 1; i++) {
+    new_files_list[i] = old_files_list[i];
+  }
+
+  if (first_nonempty_level > 0) {
+    new_files_list[new_levels - 1] = old_files_list[first_nonempty_level];
+  }
+
+  delete[] current_version->files_;
+  current_version->files_ = new_files_list;
+  current_version->num_levels_ = new_levels;
+
+  VersionEdit ve;
+  port::Mutex dummy_mutex;
+  MutexLock l(&dummy_mutex);
+  return versions.LogAndApply(versions.GetColumnFamilySet()->GetDefault(), &ve,
+                              &dummy_mutex, nullptr, true);
+}
+
+Status VersionSet::DumpManifest(Options& options, std::string& dscname,
+                                bool verbose, bool hex) {
+  // Open the specified manifest file.
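// [Editor's note] The level-shrinking precondition that ReduceNumberOfLevels()
// above checks, reduced to a pure function: shrinking is legal only if at most
// one level in [new_levels-1, current_levels) holds files, since only that
// single level can be relocated to the new bottom level. Names and the main()
// cases are illustrative.
#include <cassert>
#include <vector>

// Returns the sole non-empty level in [new_levels-1, current_levels),
// -1 if all of them are empty, or -2 if more than one is non-empty (illegal).
int SoleNonEmptyLevel(const std::vector<int>& files_per_level, int new_levels) {
  int found = -1;
  for (int i = new_levels - 1;
       i < static_cast<int>(files_per_level.size()); i++) {
    if (files_per_level[i] == 0) continue;
    if (found >= 0) return -2;
    found = i;
  }
  return found;
}

int main() {
  // 7 levels, files only on level 5: shrinking to 5 levels is allowed,
  // and level 5's files become the new level 4.
  assert(SoleNonEmptyLevel({4, 2, 0, 0, 0, 9, 0}, 5) == 5);
  // Files on both level 4 and level 6: shrinking to 5 levels is rejected.
  assert(SoleNonEmptyLevel({4, 2, 0, 0, 3, 0, 9}, 5) == -2);
  return 0;
}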
+ unique_ptr file; + Status s = options.env->NewSequentialFile(dscname, &file, storage_options_); + if (!s.ok()) { + return s; + } + + bool have_prev_log_number = false; + bool have_next_file = false; + bool have_last_sequence = false; + uint64_t next_file = 0; + uint64_t last_sequence = 0; + uint64_t prev_log_number = 0; + int count = 0; + std::unordered_map comparators; + std::unordered_map builders; + + // add default column family + VersionEdit default_cf_edit; + default_cf_edit.AddColumnFamily(kDefaultColumnFamilyName); + default_cf_edit.SetColumnFamily(0); + ColumnFamilyData* default_cfd = + CreateColumnFamily(ColumnFamilyOptions(options), &default_cf_edit); + builders.insert({0, new Builder(default_cfd)}); + + { + VersionSet::LogReporter reporter; + reporter.status = &s; + log::Reader reader(std::move(file), &reporter, true/*checksum*/, + 0/*initial_offset*/); + Slice record; + std::string scratch; + while (reader.ReadRecord(&record, &scratch) && s.ok()) { + VersionEdit edit; + s = edit.DecodeFrom(record); + if (!s.ok()) { + break; + } + + // Write out each individual edit + if (verbose) { + printf("*************************Edit[%d] = %s\n", + count, edit.DebugString(hex).c_str()); + } + count++; + + bool cf_in_builders = + builders.find(edit.column_family_) != builders.end(); + + if (edit.has_comparator_) { + comparators.insert({edit.column_family_, edit.comparator_}); + } + + ColumnFamilyData* cfd = nullptr; + + if (edit.is_column_family_add_) { + if (cf_in_builders) { + s = Status::Corruption( + "Manifest adding the same column family twice"); + break; + } + cfd = CreateColumnFamily(ColumnFamilyOptions(options), &edit); + builders.insert({edit.column_family_, new Builder(cfd)}); + } else if (edit.is_column_family_drop_) { + if (!cf_in_builders) { + s = Status::Corruption( + "Manifest - dropping non-existing column family"); + break; + } + auto builder_iter = builders.find(edit.column_family_); + delete builder_iter->second; + builders.erase(builder_iter); + comparators.erase(edit.column_family_); + cfd = column_family_set_->GetColumnFamily(edit.column_family_); + assert(cfd != nullptr); + cfd->Unref(); + delete cfd; + cfd = nullptr; + } else { + if (!cf_in_builders) { + s = Status::Corruption( + "Manifest record referencing unknown column family"); + break; + } + + cfd = column_family_set_->GetColumnFamily(edit.column_family_); + // this should never happen since cf_in_builders is true + assert(cfd != nullptr); + + // if it is not column family add or column family drop, + // then it's a file add/delete, which should be forwarded + // to builder + auto builder = builders.find(edit.column_family_); + assert(builder != builders.end()); + builder->second->Apply(&edit); + } + + if (cfd != nullptr && edit.has_log_number_) { + cfd->SetLogNumber(edit.log_number_); + } + + if (edit.has_prev_log_number_) { + prev_log_number = edit.prev_log_number_; + have_prev_log_number = true; + } + + if (edit.has_next_file_number_) { + next_file = edit.next_file_number_; + have_next_file = true; + } + + if (edit.has_last_sequence_) { + last_sequence = edit.last_sequence_; + have_last_sequence = true; + } + + if (edit.has_max_column_family_) { + column_family_set_->UpdateMaxColumnFamily(edit.max_column_family_); + } + } + } + file.reset(); + + if (s.ok()) { + if (!have_next_file) { + s = Status::Corruption("no meta-nextfile entry in descriptor"); + printf("no meta-nextfile entry in descriptor"); + } else if (!have_last_sequence) { + printf("no last-sequence-number entry in descriptor"); + s = 
Status::Corruption("no last-sequence-number entry in descriptor");
+    }
+
+    if (!have_prev_log_number) {
+      prev_log_number = 0;
+    }
+  }
+
+  if (s.ok()) {
+    for (auto cfd : *column_family_set_) {
+      auto builders_iter = builders.find(cfd->GetID());
+      assert(builders_iter != builders.end());
+      auto builder = builders_iter->second;
+
+      Version* v = new Version(cfd, this, current_version_number_++);
+      builder->SaveTo(v);
+      std::vector<uint64_t> size_being_compacted(v->NumberLevels() - 1);
+      cfd->compaction_picker()->SizeBeingCompacted(size_being_compacted);
+      v->ComputeCompactionScore(size_being_compacted);
+      v->UpdateFilesBySize();
+      delete builder;
+
+      printf("--------------- Column family \"%s\" (ID %u) --------------\n",
+             cfd->GetName().c_str(), (unsigned int)cfd->GetID());
+      printf("log number: %lu\n", (unsigned long)cfd->GetLogNumber());
+      auto comparator = comparators.find(cfd->GetID());
+      if (comparator != comparators.end()) {
+        printf("comparator: %s\n", comparator->second.c_str());
+      } else {
+        printf("comparator: <NO COMPARATOR>\n");
+      }
+      printf("%s \n", v->DebugString(hex).c_str());
+      delete v;
+    }
+
+    next_file_number_ = next_file + 1;
+    last_sequence_ = last_sequence;
+    prev_log_number_ = prev_log_number;
+
+    printf(
+        "next_file_number %lu last_sequence "
+        "%lu prev_log_number %lu max_column_family %u\n",
+        (unsigned long)next_file_number_, (unsigned long)last_sequence,
+        (unsigned long)prev_log_number,
+        column_family_set_->GetMaxColumnFamily());
+  }
+
+  return s;
+}
+#endif  // ROCKSDB_LITE
+
+void VersionSet::MarkFileNumberUsed(uint64_t number) {
+  if (next_file_number_ <= number) {
+    next_file_number_ = number + 1;
+  }
+}
+
+Status VersionSet::WriteSnapshot(log::Writer* log) {
+  // TODO: Break up into multiple records to reduce memory usage on recovery?
+
+  // WARNING: This method doesn't hold a mutex!!
+
+  // This is done without DB mutex lock held, but only within single-threaded
+  // LogAndApply. Column family manipulations can only happen within
+  // LogAndApply (the same single thread), so we're safe to iterate.
+  for (auto cfd : *column_family_set_) {
+    {
+      // Store column family info
+      VersionEdit edit;
+      if (cfd->GetID() != 0) {
+        // default column family is always there,
+        // no need to explicitly write it
+        edit.AddColumnFamily(cfd->GetName());
+        edit.SetColumnFamily(cfd->GetID());
+      }
+      edit.SetComparatorName(
+          cfd->internal_comparator().user_comparator()->Name());
+      std::string record;
+      edit.EncodeTo(&record);
+      Status s = log->AddRecord(record);
+      if (!s.ok()) {
+        return s;
+      }
+    }
+
+    {
+      // Save files
+      VersionEdit edit;
+      edit.SetColumnFamily(cfd->GetID());
+
+      for (int level = 0; level < cfd->NumberLevels(); level++) {
+        for (const auto& f : cfd->current()->files_[level]) {
+          edit.AddFile(level,
+                       f->number,
+                       f->file_size,
+                       f->smallest,
+                       f->largest,
+                       f->smallest_seqno,
+                       f->largest_seqno);
+        }
+      }
+      edit.SetLogNumber(cfd->GetLogNumber());
+      std::string record;
+      edit.EncodeTo(&record);
+      Status s = log->AddRecord(record);
+      if (!s.ok()) {
+        return s;
+      }
+    }
+  }
+
+  return Status::OK();
+}
+
+// Opens the manifest file and reads all records
+// until it finds the record we are looking for.
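// [Editor's note] ManifestContains(), defined next, is the recovery path for
// an ambiguous sync failure in LogAndApply(): when AddRecord()/Sync() reports
// an error, the record may or may not have reached disk, so the manifest is
// re-read and scanned for a byte-identical record. A minimal sketch of that
// decision, with the log reader abstracted away as a list of records:
#include <string>
#include <vector>

bool ContainsRecord(const std::vector<std::string>& records_on_disk,
                    const std::string& record) {
  for (const auto& r : records_on_disk) {
    if (r == record) return true;  // the write landed despite the error
  }
  return false;  // the write really was lost; the error stands
}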
+bool VersionSet::ManifestContains(uint64_t manifest_file_number, + const std::string& record) const { + std::string fname = + DescriptorFileName(dbname_, manifest_file_number); + Log(options_->info_log, "ManifestContains: checking %s\n", fname.c_str()); + unique_ptr file; + Status s = env_->NewSequentialFile(fname, &file, storage_options_); + if (!s.ok()) { + Log(options_->info_log, "ManifestContains: %s\n", s.ToString().c_str()); + Log(options_->info_log, + "ManifestContains: is unable to reopen the manifest file %s", + fname.c_str()); + return false; + } + log::Reader reader(std::move(file), nullptr, true/*checksum*/, 0); + Slice r; + std::string scratch; + bool result = false; + while (reader.ReadRecord(&r, &scratch)) { + if (r == Slice(record)) { + result = true; + break; + } + } + Log(options_->info_log, "ManifestContains: result = %d\n", result ? 1 : 0); + return result; +} + + +uint64_t VersionSet::ApproximateOffsetOf(Version* v, const InternalKey& ikey) { + uint64_t result = 0; + for (int level = 0; level < v->NumberLevels(); level++) { + const std::vector& files = v->files_[level]; + for (size_t i = 0; i < files.size(); i++) { + if (v->cfd_->internal_comparator().Compare(files[i]->largest, ikey) <= + 0) { + // Entire file is before "ikey", so just add the file size + result += files[i]->file_size; + } else if (v->cfd_->internal_comparator().Compare(files[i]->smallest, + ikey) > 0) { + // Entire file is after "ikey", so ignore + if (level > 0) { + // Files other than level 0 are sorted by meta->smallest, so + // no further files in this level will contain data for + // "ikey". + break; + } + } else { + // "ikey" falls in the range for this table. Add the + // approximate offset of "ikey" within the table. + TableReader* table_reader_ptr; + Iterator* iter = v->cfd_->table_cache()->NewIterator( + ReadOptions(), storage_options_, v->cfd_->internal_comparator(), + *(files[i]), &table_reader_ptr); + if (table_reader_ptr != nullptr) { + result += table_reader_ptr->ApproximateOffsetOf(ikey.Encode()); + } + delete iter; + } + } + } + return result; +} + +void VersionSet::AddLiveFiles(std::vector* live_list) { + // pre-calculate space requirement + int64_t total_files = 0; + for (auto cfd : *column_family_set_) { + Version* dummy_versions = cfd->dummy_versions(); + for (Version* v = dummy_versions->next_; v != dummy_versions; + v = v->next_) { + for (int level = 0; level < v->NumberLevels(); level++) { + total_files += v->files_[level].size(); + } + } + } + + // just one time extension to the right size + live_list->reserve(live_list->size() + total_files); + + for (auto cfd : *column_family_set_) { + Version* dummy_versions = cfd->dummy_versions(); + for (Version* v = dummy_versions->next_; v != dummy_versions; + v = v->next_) { + for (int level = 0; level < v->NumberLevels(); level++) { + for (const auto& f : v->files_[level]) { + live_list->push_back(f->number); + } + } + } + } +} + +Iterator* VersionSet::MakeInputIterator(Compaction* c) { + auto cfd = c->column_family_data(); + ReadOptions read_options; + read_options.verify_checksums = + cfd->options()->verify_checksums_in_compaction; + read_options.fill_cache = false; + + // Level-0 files have to be merged together. For other levels, + // we will make a concatenating iterator per level. + // TODO(opt): use concatenating iterator for level-0 if there is no overlap + const int space = (c->level() == 0 ? 
c->inputs(0)->size() + 1 : 2);
+  Iterator** list = new Iterator*[space];
+  int num = 0;
+  for (int which = 0; which < 2; which++) {
+    if (!c->inputs(which)->empty()) {
+      if (c->level() + which == 0) {
+        for (const auto& file : *c->inputs(which)) {
+          list[num++] = cfd->table_cache()->NewIterator(
+              read_options, storage_options_compactions_,
+              cfd->internal_comparator(), *file, nullptr,
+              true /* for compaction */);
+        }
+      } else {
+        // Create concatenating iterator for the files from this level
+        list[num++] = NewTwoLevelIterator(new Version::LevelFileIteratorState(
+              cfd->table_cache(), read_options, storage_options_,
+              cfd->internal_comparator(), true /* for_compaction */,
+              false /* prefix enabled */),
+            new Version::LevelFileNumIterator(cfd->internal_comparator(),
+                                              c->inputs(which)));
+      }
+    }
+  }
+  assert(num <= space);
+  Iterator* result = NewMergingIterator(
+      &c->column_family_data()->internal_comparator(), list, num);
+  delete[] list;
+  return result;
+}
+
+// verify that the files listed in this compaction are present
+// in the current version
+bool VersionSet::VerifyCompactionFileConsistency(Compaction* c) {
+#ifndef NDEBUG
+  Version* version = c->column_family_data()->current();
+  if (c->input_version() != version) {
+    Log(options_->info_log,
+        "[%s] VerifyCompactionFileConsistency version mismatch",
+        c->column_family_data()->GetName().c_str());
+  }
+
+  // verify files in level
+  int level = c->level();
+  for (int i = 0; i < c->num_input_files(0); i++) {
+    uint64_t number = c->input(0, i)->number;
+
+    // look for this file in the current version
+    bool found = false;
+    for (unsigned int j = 0; j < version->files_[level].size(); j++) {
+      FileMetaData* f = version->files_[level][j];
+      if (f->number == number) {
+        found = true;
+        break;
+      }
+    }
+    if (!found) {
+      return false;  // input file nonexistent in current version
+    }
+  }
+  // verify level+1 files
+  level++;
+  for (int i = 0; i < c->num_input_files(1); i++) {
+    uint64_t number = c->input(1, i)->number;
+
+    // look for this file in the current version
+    bool found = false;
+    for (unsigned int j = 0; j < version->files_[level].size(); j++) {
+      FileMetaData* f = version->files_[level][j];
+      if (f->number == number) {
+        found = true;
+        break;
+      }
+    }
+    if (!found) {
+      return false;  // input file nonexistent in current version
+    }
+  }
+#endif
+  return true;  // everything good
+}
+
+Status VersionSet::GetMetadataForFile(uint64_t number, int* filelevel,
+                                      FileMetaData** meta,
+                                      ColumnFamilyData** cfd) {
+  for (auto cfd_iter : *column_family_set_) {
+    Version* version = cfd_iter->current();
+    for (int level = 0; level < version->NumberLevels(); level++) {
+      for (const auto& file : version->files_[level]) {
+        if (file->number == number) {
+          *meta = file;
+          *filelevel = level;
+          *cfd = cfd_iter;
+          return Status::OK();
+        }
+      }
+    }
+  }
+  return Status::NotFound("File not present in any level");
+}
+
+void VersionSet::GetLiveFilesMetaData(
+    std::vector<LiveFileMetaData>* metadata) {
+  for (auto cfd : *column_family_set_) {
+    for (int level = 0; level < cfd->NumberLevels(); level++) {
+      for (const auto& file : cfd->current()->files_[level]) {
+        LiveFileMetaData filemetadata;
+        filemetadata.column_family_name = cfd->GetName();
+        filemetadata.name = TableFileName("", file->number);
+        filemetadata.level = level;
+        filemetadata.size = file->file_size;
+        filemetadata.smallestkey = file->smallest.user_key().ToString();
+        filemetadata.largestkey = file->largest.user_key().ToString();
+        filemetadata.smallest_seqno = file->smallest_seqno;
filemetadata.largest_seqno = file->largest_seqno; + metadata->push_back(filemetadata); + } + } + } +} + +void VersionSet::GetObsoleteFiles(std::vector* files) { + files->insert(files->end(), obsolete_files_.begin(), obsolete_files_.end()); + obsolete_files_.clear(); +} + +ColumnFamilyData* VersionSet::CreateColumnFamily( + const ColumnFamilyOptions& options, VersionEdit* edit) { + assert(edit->is_column_family_add_); + + Version* dummy_versions = new Version(nullptr, this); + auto new_cfd = column_family_set_->CreateColumnFamily( + edit->column_family_name_, edit->column_family_, dummy_versions, options); + + Version* v = new Version(new_cfd, this, current_version_number_++); + + AppendVersion(new_cfd, v); + new_cfd->CreateNewMemtable(); + new_cfd->SetLogNumber(edit->log_number_); + return new_cfd; +} + +} // namespace rocksdb diff --git a/db/version_set.h b/db/version_set.h new file mode 100644 index 0000000000..7c8d7146e8 --- /dev/null +++ b/db/version_set.h @@ -0,0 +1,499 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// The representation of a DBImpl consists of a set of Versions. The +// newest version is called "current". Older versions may be kept +// around to provide a consistent view to live iterators. +// +// Each Version keeps track of a set of Table files per level. The +// entire set of versions is maintained in a VersionSet. +// +// Version,VersionSet are thread-compatible, but require external +// synchronization on all accesses. + +#pragma once +#include +#include +#include +#include +#include +#include +#include +#include "db/dbformat.h" +#include "db/version_edit.h" +#include "port/port.h" +#include "db/table_cache.h" +#include "db/compaction.h" +#include "db/compaction_picker.h" +#include "db/column_family.h" +#include "db/log_reader.h" +#include "db/file_indexer.h" + +namespace rocksdb { + +namespace log { class Writer; } + +class Compaction; +class CompactionPicker; +class Iterator; +class LogBuffer; +class LookupKey; +class MemTable; +class Version; +class VersionSet; +class MergeContext; +class ColumnFamilyData; +class ColumnFamilySet; +class TableCache; +class MergeIteratorBuilder; + +// Return the smallest index i such that files[i]->largest >= key. +// Return files.size() if there is no such file. +// REQUIRES: "files" contains a sorted list of non-overlapping files. +extern int FindFile(const InternalKeyComparator& icmp, + const std::vector& files, + const Slice& key); + +// Returns true iff some file in "files" overlaps the user key range +// [*smallest,*largest]. +// smallest==nullptr represents a key smaller than all keys in the DB. +// largest==nullptr represents a key largest than all keys in the DB. +// REQUIRES: If disjoint_sorted_files, files[] contains disjoint ranges +// in sorted order. 
+extern bool SomeFileOverlapsRange(
+    const InternalKeyComparator& icmp,
+    bool disjoint_sorted_files,
+    const std::vector<FileMetaData*>& files,
+    const Slice* smallest_user_key,
+    const Slice* largest_user_key);
+
+class Version {
+ public:
+  // Append to *iters a sequence of iterators that will
+  // yield the contents of this Version when merged together.
+  // REQUIRES: This version has been saved (see VersionSet::SaveTo)
+  void AddIterators(const ReadOptions&, const EnvOptions& soptions,
+                    std::vector<Iterator*>* iters);
+
+  void AddIterators(const ReadOptions&, const EnvOptions& soptions,
+                    MergeIteratorBuilder* merger_iter_builder);
+
+  // Lookup the value for key. If found, store it in *val and
+  // return OK. Else return a non-OK status. Fills *stats.
+  // Uses *operands to store merge_operator operations to apply later.
+  // REQUIRES: lock is not held
+  struct GetStats {
+    FileMetaData* seek_file;
+    int seek_file_level;
+  };
+  void Get(const ReadOptions&, const LookupKey& key, std::string* val,
+           Status* status, MergeContext* merge_context, GetStats* stats,
+           bool* value_found = nullptr);
+
+  // Adds "stats" into the current state. Returns true if a new
+  // compaction may need to be triggered, false otherwise.
+  // REQUIRES: lock is held
+  bool UpdateStats(const GetStats& stats);
+
+  // Updates internal structures that keep track of compaction scores.
+  // We use compaction scores to figure out which compaction to do next.
+  // REQUIRES: If Version is not yet saved to current_, it can be called
+  // without a lock. Once a version is saved to current_, call only with
+  // mutex held
+  void ComputeCompactionScore(std::vector<uint64_t>& size_being_compacted);
+
+  // Reference count management (so Versions do not disappear out from
+  // under live iterators)
+  void Ref();
+  // Decrease reference count. Delete the object if no reference left
+  // and return true. Otherwise, return false.
+  bool Unref();
+
+  // Returns true iff some level needs a compaction.
+  bool NeedsCompaction() const;
+
+  // Returns the maximum compaction score for levels 1 to max
+  double MaxCompactionScore() const { return max_compaction_score_; }
+
+  // See field declaration
+  int MaxCompactionScoreLevel() const { return max_compaction_score_level_; }
+
+  void GetOverlappingInputs(
+      int level,
+      const InternalKey* begin,    // nullptr means before all keys
+      const InternalKey* end,      // nullptr means after all keys
+      std::vector<FileMetaData*>* inputs,
+      int hint_index = -1,         // index of overlap file
+      int* file_index = nullptr);  // return index of overlap file
+
+  void GetOverlappingInputsBinarySearch(
+      int level,
+      const Slice& begin,  // nullptr means before all keys
+      const Slice& end,    // nullptr means after all keys
+      std::vector<FileMetaData*>* inputs,
+      int hint_index,      // index of overlap file
+      int* file_index);    // return index of overlap file
+
+  void ExtendOverlappingInputs(
+      int level,
+      const Slice& begin,   // nullptr means before all keys
+      const Slice& end,     // nullptr means after all keys
+      std::vector<FileMetaData*>* inputs,
+      unsigned int index);  // start extending from this index
+
+  // Returns true iff some file in the specified level overlaps
+  // some part of [*smallest_user_key,*largest_user_key].
+  // smallest_user_key==NULL represents a key smaller than all keys in the DB.
+  // largest_user_key==NULL represents a key larger than all keys in the DB.
+  bool OverlapInLevel(int level,
+                      const Slice* smallest_user_key,
+                      const Slice* largest_user_key);
+
+  // Returns true iff the first or last file in inputs contains
+  // an overlapping user key to the file "just outside" of it (i.e.
+  // just after the last file, or just before the first file)
+  // REQUIRES: "*inputs" is a sorted list of non-overlapping files
+  bool HasOverlappingUserKey(const std::vector<FileMetaData*>* inputs,
+                             int level);
+
+  // Return the level at which we should place a new memtable compaction
+  // result that covers the range [smallest_user_key,largest_user_key].
+  int PickLevelForMemTableOutput(const Slice& smallest_user_key,
+                                 const Slice& largest_user_key);
+
+  int NumberLevels() const { return num_levels_; }
+
+  // REQUIRES: lock is held
+  int NumLevelFiles(int level) const { return files_[level].size(); }
+
+  // Return the combined file size of all files at the specified level.
+  int64_t NumLevelBytes(int level) const;
+
+  // Return a human-readable short (single-line) summary of the number
+  // of files per level. Uses *scratch as backing store.
+  struct LevelSummaryStorage {
+    char buffer[100];
+  };
+  struct FileSummaryStorage {
+    char buffer[1000];
+  };
+  const char* LevelSummary(LevelSummaryStorage* scratch) const;
+  // Return a human-readable short (single-line) summary of files
+  // in a specified level. Uses *scratch as backing store.
+  const char* LevelFileSummary(FileSummaryStorage* scratch, int level) const;
+
+  // Return the maximum overlapping data (in bytes) at next level for any
+  // file at a level >= 1.
+  int64_t MaxNextLevelOverlappingBytes();
+
+  // Add all files listed in the current version to *live.
+  void AddLiveFiles(std::set<uint64_t>* live);
+
+  // Return a human-readable string that describes this version's contents.
+  std::string DebugString(bool hex = false) const;
+
+  // Returns the version number of this version
+  uint64_t GetVersionNumber() const { return version_number_; }
+
+  // REQUIRES: lock is held
+  // On success, *props will be populated with all SSTables' table properties.
+  // The keys of `props` are the sst file name, the values of `props` are the
+  // tables' properties, represented as shared_ptr.
+  Status GetPropertiesOfAllTables(TablePropertiesCollection* props);
+
+  // used to sort files by size
+  struct Fsize {
+    int index;
+    FileMetaData* file;
+  };
+
+ private:
+  friend class Compaction;
+  friend class VersionSet;
+  friend class DBImpl;
+  friend class ColumnFamilyData;
+  friend class CompactionPicker;
+  friend class LevelCompactionPicker;
+  friend class UniversalCompactionPicker;
+  friend class FIFOCompactionPicker;
+  friend class ForwardIterator;
+
+  class LevelFileNumIterator;
+  class LevelFileIteratorState;
+
+  bool PrefixMayMatch(const ReadOptions& options, Iterator* level_iter,
+                      const Slice& internal_prefix) const;
+
+  // Sort all files for this version based on their file size and
+  // record results in files_by_size_. The largest files are listed first.
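// [Editor's note] A sketch of the partial sort that UpdateFilesBySize()
// (declared just below) performs: only the few dozen largest files matter
// when picking size-based compaction candidates, so sorting an entire level
// is wasted work. Indices into the level's file list are ordered by file
// size, largest first; names are illustrative, and assumptions end there.
#include <algorithm>
#include <cstdint>
#include <vector>

std::vector<int> IndicesOfLargestFiles(const std::vector<uint64_t>& file_sizes,
                                       size_t num_to_sort) {
  std::vector<int> idx(file_sizes.size());
  for (size_t i = 0; i < idx.size(); i++) idx[i] = static_cast<int>(i);
  num_to_sort = std::min(num_to_sort, idx.size());
  // Order only the first num_to_sort slots by descending size; the tail is
  // left in arbitrary order, mirroring how files_by_size_ is maintained.
  std::partial_sort(
      idx.begin(), idx.begin() + num_to_sort, idx.end(),
      [&](int a, int b) { return file_sizes[a] > file_sizes[b]; });
  return idx;
}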
+ void UpdateFilesBySize(); + + ColumnFamilyData* cfd_; // ColumnFamilyData to which this Version belongs + const InternalKeyComparator* internal_comparator_; + const Comparator* user_comparator_; + TableCache* table_cache_; + const MergeOperator* merge_operator_; + Logger* info_log_; + Statistics* db_statistics_; + VersionSet* vset_; // VersionSet to which this Version belongs + Version* next_; // Next version in linked list + Version* prev_; // Previous version in linked list + int refs_; // Number of live refs to this version + int num_levels_; // Number of levels + + // List of files per level, files in each level are arranged + // in increasing order of keys + std::vector* files_; + + // A list for the same set of files that are stored in files_, + // but files in each level are now sorted based on file + // size. The file with the largest size is at the front. + // This vector stores the index of the file from files_. + std::vector> files_by_size_; + + // An index into files_by_size_ that specifies the first + // file that is not yet compacted + std::vector next_file_to_compact_by_size_; + + // Only the first few entries of files_by_size_ are sorted. + // There is no need to sort all the files because it is likely + // that on a running system, we need to look at only the first + // few largest files because a new version is created every few + // seconds/minutes (because of concurrent compactions). + static const int number_of_files_to_sort_ = 50; + + // Next file to compact based on seek stats. + FileMetaData* file_to_compact_; + int file_to_compact_level_; + + // Level that should be compacted next and its compaction score. + // Score < 1 means compaction is not strictly needed. These fields + // are initialized by Finalize(). + // The most critical level to be compacted is listed first + // These are used to pick the best compaction level + std::vector compaction_score_; + std::vector compaction_level_; + double max_compaction_score_; // max score in l1 to ln-1 + int max_compaction_score_level_; // level on which max score occurs + + // A version number that uniquely represents this version. This is + // used for debugging and logging purposes only. + uint64_t version_number_; + + Version(ColumnFamilyData* cfd, VersionSet* vset, uint64_t version_number = 0); + FileIndexer file_indexer_; + + ~Version(); + + // re-initializes the index that is used to offset into files_by_size_ + // to find the next compaction candidate file. + void ResetNextCompactionIndex(int level) { + next_file_to_compact_by_size_[level] = 0; + } + + // No copying allowed + Version(const Version&); + void operator=(const Version&); +}; + +class VersionSet { + public: + VersionSet(const std::string& dbname, const DBOptions* options, + const EnvOptions& storage_options, Cache* table_cache); + ~VersionSet(); + + // Apply *edit to the current version to form a new descriptor that + // is both saved to persistent state and installed as the new + // current version. Will release *mu while actually writing to the file. + // column_family_options has to be set if edit is column family add + // REQUIRES: *mu is held on entry. + // REQUIRES: no other thread concurrently calls LogAndApply() + Status LogAndApply(ColumnFamilyData* column_family_data, VersionEdit* edit, + port::Mutex* mu, Directory* db_directory = nullptr, + bool new_descriptor_log = false, + const ColumnFamilyOptions* column_family_options = + nullptr); + + // Recover the last saved descriptor from persistent storage. 
+  // If read_only == true, Recover() will not complain if some column families
+  // are not opened
+  Status Recover(const std::vector<ColumnFamilyDescriptor>& column_families,
+                 bool read_only = false);
+
+  // Reads a manifest file and returns a list of column families in
+  // column_families.
+  static Status ListColumnFamilies(std::vector<std::string>* column_families,
+                                   const std::string& dbname, Env* env);
+
+#ifndef ROCKSDB_LITE
+  // Try to reduce the number of levels. This call is valid only when a
+  // single level between the new max level and the old max level contains
+  // files.
+  // The call is static, since the number of levels is immutable during
+  // the lifetime of a RocksDB instance. It reduces the number of levels
+  // in a DB by applying changes to the manifest.
+  // For example, a db currently has 7 levels [0-6], and a call to
+  // reduce to 5 [0-4] can only be executed when only one level
+  // among [4-6] contains files.
+  static Status ReduceNumberOfLevels(const std::string& dbname,
+                                     const Options* options,
+                                     const EnvOptions& storage_options,
+                                     int new_levels);
+
+  // printf contents (for debugging)
+  Status DumpManifest(Options& options, std::string& manifestFileName,
+                      bool verbose, bool hex = false);
+
+#endif  // ROCKSDB_LITE
+
+  // Return the current manifest file number
+  uint64_t ManifestFileNumber() const { return manifest_file_number_; }
+
+  uint64_t PendingManifestFileNumber() const {
+    return pending_manifest_file_number_;
+  }
+
+  // Allocate and return a new file number
+  uint64_t NewFileNumber() { return next_file_number_++; }
+
+  // Arrange to reuse "file_number" unless a newer file number has
+  // already been allocated.
+  // REQUIRES: "file_number" was returned by a call to NewFileNumber().
+  void ReuseFileNumber(uint64_t file_number) {
+    if (next_file_number_ == file_number + 1) {
+      next_file_number_ = file_number;
+    }
+  }
+
+  // Return the last sequence number.
+  uint64_t LastSequence() const {
+    return last_sequence_.load(std::memory_order_acquire);
+  }
+
+  // Set the last sequence number to s.
+  void SetLastSequence(uint64_t s) {
+    assert(s >= last_sequence_);
+    last_sequence_.store(s, std::memory_order_release);
+  }
+
+  // Mark the specified file number as used.
+  void MarkFileNumberUsed(uint64_t number);
+
+  // Return the log file number for the log file that is currently
+  // being compacted, or zero if there is no such log file.
+  uint64_t PrevLogNumber() const { return prev_log_number_; }
+
+  // Returns the minimum log number such that all
+  // log numbers less than or equal to it can be deleted
+  uint64_t MinLogNumber() const {
+    uint64_t min_log_num = std::numeric_limits<uint64_t>::max();
+    for (auto cfd : *column_family_set_) {
+      if (min_log_num > cfd->GetLogNumber()) {
+        min_log_num = cfd->GetLogNumber();
+      }
+    }
+    return min_log_num;
+  }
+
+  // Create an iterator that reads over the compaction inputs for "*c".
+  // The caller should delete the iterator when no longer needed.
+  Iterator* MakeInputIterator(Compaction* c);
+
+  // Add all files listed in any live version to *live.
+  void AddLiveFiles(std::vector<uint64_t>* live_list);
+
+  // Return the approximate offset in the database of the data for
+  // "key" as of version "v".
+  uint64_t ApproximateOffsetOf(Version* v, const InternalKey& key);
+
+  // Return the size of the current manifest file
+  uint64_t ManifestFileSize() const { return manifest_file_size_; }
+
+  // verify that the files that we started with for a compaction
+  // still exist in the current version and in the same original level.
+  // This ensures that a concurrent compaction did not erroneously
+  // pick the same files to compact.
+  bool VerifyCompactionFileConsistency(Compaction* c);
+
+  Status GetMetadataForFile(uint64_t number, int* filelevel,
+                            FileMetaData** metadata, ColumnFamilyData** cfd);
+
+  void GetLiveFilesMetaData(std::vector<LiveFileMetaData>* metadata);
+
+  void GetObsoleteFiles(std::vector<FileMetaData*>* files);
+
+  ColumnFamilySet* GetColumnFamilySet() { return column_family_set_.get(); }
+
+ private:
+  class Builder;
+  struct ManifestWriter;
+
+  friend class Version;
+
+  struct LogReporter : public log::Reader::Reporter {
+    Status* status;
+    virtual void Corruption(size_t bytes, const Status& s) {
+      if (this->status->ok()) *this->status = s;
+    }
+  };
+
+  // Save current contents to *log
+  Status WriteSnapshot(log::Writer* log);
+
+  void AppendVersion(ColumnFamilyData* column_family_data, Version* v);
+
+  bool ManifestContains(uint64_t manifest_file_number,
+                        const std::string& record) const;
+
+  ColumnFamilyData* CreateColumnFamily(const ColumnFamilyOptions& options,
+                                       VersionEdit* edit);
+
+  std::unique_ptr<ColumnFamilySet> column_family_set_;
+
+  Env* const env_;
+  const std::string dbname_;
+  const DBOptions* const options_;
+  uint64_t next_file_number_;
+  uint64_t manifest_file_number_;
+  uint64_t pending_manifest_file_number_;
+  std::atomic<uint64_t> last_sequence_;
+  uint64_t prev_log_number_;  // 0 or backing store for memtable being compacted
+
+  // Opened lazily
+  unique_ptr<log::Writer> descriptor_log_;
+
+  // generates an increasing version number for every new version
+  uint64_t current_version_number_;
+
+  // Queue of writers to the manifest file
+  std::deque<ManifestWriter*> manifest_writers_;
+
+  // Current size of manifest file
+  uint64_t manifest_file_size_;
+
+  std::vector<FileMetaData*> obsolete_files_;
+
+  // storage options for all reads and writes except compactions
+  const EnvOptions& storage_options_;
+
+  // storage options used for compactions. This is a copy of
+  // storage_options_ but with readaheads set to readahead_compactions_.
+  const EnvOptions storage_options_compactions_;
+
+  // No copying allowed
+  VersionSet(const VersionSet&);
+  void operator=(const VersionSet&);
+
+  void LogAndApplyCFHelper(VersionEdit* edit);
+  void LogAndApplyHelper(ColumnFamilyData* cfd, Builder* b, Version* v,
+                         VersionEdit* edit, port::Mutex* mu);
+};
+
+} // namespace rocksdb
diff --git a/db/version_set_test.cc b/db/version_set_test.cc
new file mode 100644
index 0000000000..1af95dd3fe
--- /dev/null
+++ b/db/version_set_test.cc
@@ -0,0 +1,184 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
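// [Editor's note] The tests that follow pin down FindFile()'s contract: the
// smallest index i with files[i]->largest >= key, or files.size() if there is
// no such file. That is exactly a lower_bound over the per-file largest keys,
// sketched here with integer keys standing in for InternalKeys:
#include <algorithm>
#include <cassert>
#include <vector>

size_t FindFileByLargestKey(const std::vector<int>& largest_keys, int key) {
  // largest_keys must be sorted ascending (non-overlapping, ordered files).
  return static_cast<size_t>(
      std::lower_bound(largest_keys.begin(), largest_keys.end(), key) -
      largest_keys.begin());
}

int main() {
  // Files with largest keys 200, 250, 350, 450 (cf. FindFileTest.Multiple).
  std::vector<int> largest = {200, 250, 350, 450};
  assert(FindFileByLargestKey(largest, 100) == 0);  // before all files
  assert(FindFileByLargestKey(largest, 200) == 0);  // boundary is inclusive
  assert(FindFileByLargestKey(largest, 201) == 1);
  assert(FindFileByLargestKey(largest, 451) == 4);  // past the last file
  return 0;
}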
+ +#include "db/version_set.h" +#include "util/logging.h" +#include "util/testharness.h" +#include "util/testutil.h" + +namespace rocksdb { + +class FindFileTest { + public: + std::vector files_; + bool disjoint_sorted_files_; + + FindFileTest() : disjoint_sorted_files_(true) { } + + ~FindFileTest() { + for (unsigned int i = 0; i < files_.size(); i++) { + delete files_[i]; + } + } + + void Add(const char* smallest, const char* largest, + SequenceNumber smallest_seq = 100, + SequenceNumber largest_seq = 100) { + FileMetaData* f = new FileMetaData; + f->number = files_.size() + 1; + f->smallest = InternalKey(smallest, smallest_seq, kTypeValue); + f->largest = InternalKey(largest, largest_seq, kTypeValue); + files_.push_back(f); + } + + int Find(const char* key) { + InternalKey target(key, 100, kTypeValue); + InternalKeyComparator cmp(BytewiseComparator()); + return FindFile(cmp, files_, target.Encode()); + } + + bool Overlaps(const char* smallest, const char* largest) { + InternalKeyComparator cmp(BytewiseComparator()); + Slice s(smallest != nullptr ? smallest : ""); + Slice l(largest != nullptr ? largest : ""); + return SomeFileOverlapsRange(cmp, disjoint_sorted_files_, files_, + (smallest != nullptr ? &s : nullptr), + (largest != nullptr ? &l : nullptr)); + } +}; + +TEST(FindFileTest, Empty) { + ASSERT_EQ(0, Find("foo")); + ASSERT_TRUE(! Overlaps("a", "z")); + ASSERT_TRUE(! Overlaps(nullptr, "z")); + ASSERT_TRUE(! Overlaps("a", nullptr)); + ASSERT_TRUE(! Overlaps(nullptr, nullptr)); +} + +TEST(FindFileTest, Single) { + Add("p", "q"); + ASSERT_EQ(0, Find("a")); + ASSERT_EQ(0, Find("p")); + ASSERT_EQ(0, Find("p1")); + ASSERT_EQ(0, Find("q")); + ASSERT_EQ(1, Find("q1")); + ASSERT_EQ(1, Find("z")); + + ASSERT_TRUE(! Overlaps("a", "b")); + ASSERT_TRUE(! Overlaps("z1", "z2")); + ASSERT_TRUE(Overlaps("a", "p")); + ASSERT_TRUE(Overlaps("a", "q")); + ASSERT_TRUE(Overlaps("a", "z")); + ASSERT_TRUE(Overlaps("p", "p1")); + ASSERT_TRUE(Overlaps("p", "q")); + ASSERT_TRUE(Overlaps("p", "z")); + ASSERT_TRUE(Overlaps("p1", "p2")); + ASSERT_TRUE(Overlaps("p1", "z")); + ASSERT_TRUE(Overlaps("q", "q")); + ASSERT_TRUE(Overlaps("q", "q1")); + + ASSERT_TRUE(! Overlaps(nullptr, "j")); + ASSERT_TRUE(! Overlaps("r", nullptr)); + ASSERT_TRUE(Overlaps(nullptr, "p")); + ASSERT_TRUE(Overlaps(nullptr, "p1")); + ASSERT_TRUE(Overlaps("q", nullptr)); + ASSERT_TRUE(Overlaps(nullptr, nullptr)); +} + + +TEST(FindFileTest, Multiple) { + Add("150", "200"); + Add("200", "250"); + Add("300", "350"); + Add("400", "450"); + ASSERT_EQ(0, Find("100")); + ASSERT_EQ(0, Find("150")); + ASSERT_EQ(0, Find("151")); + ASSERT_EQ(0, Find("199")); + ASSERT_EQ(0, Find("200")); + ASSERT_EQ(1, Find("201")); + ASSERT_EQ(1, Find("249")); + ASSERT_EQ(1, Find("250")); + ASSERT_EQ(2, Find("251")); + ASSERT_EQ(2, Find("299")); + ASSERT_EQ(2, Find("300")); + ASSERT_EQ(2, Find("349")); + ASSERT_EQ(2, Find("350")); + ASSERT_EQ(3, Find("351")); + ASSERT_EQ(3, Find("400")); + ASSERT_EQ(3, Find("450")); + ASSERT_EQ(4, Find("451")); + + ASSERT_TRUE(! Overlaps("100", "149")); + ASSERT_TRUE(! Overlaps("251", "299")); + ASSERT_TRUE(! Overlaps("451", "500")); + ASSERT_TRUE(! 
Overlaps("351", "399")); + + ASSERT_TRUE(Overlaps("100", "150")); + ASSERT_TRUE(Overlaps("100", "200")); + ASSERT_TRUE(Overlaps("100", "300")); + ASSERT_TRUE(Overlaps("100", "400")); + ASSERT_TRUE(Overlaps("100", "500")); + ASSERT_TRUE(Overlaps("375", "400")); + ASSERT_TRUE(Overlaps("450", "450")); + ASSERT_TRUE(Overlaps("450", "500")); +} + +TEST(FindFileTest, MultipleNullBoundaries) { + Add("150", "200"); + Add("200", "250"); + Add("300", "350"); + Add("400", "450"); + ASSERT_TRUE(! Overlaps(nullptr, "149")); + ASSERT_TRUE(! Overlaps("451", nullptr)); + ASSERT_TRUE(Overlaps(nullptr, nullptr)); + ASSERT_TRUE(Overlaps(nullptr, "150")); + ASSERT_TRUE(Overlaps(nullptr, "199")); + ASSERT_TRUE(Overlaps(nullptr, "200")); + ASSERT_TRUE(Overlaps(nullptr, "201")); + ASSERT_TRUE(Overlaps(nullptr, "400")); + ASSERT_TRUE(Overlaps(nullptr, "800")); + ASSERT_TRUE(Overlaps("100", nullptr)); + ASSERT_TRUE(Overlaps("200", nullptr)); + ASSERT_TRUE(Overlaps("449", nullptr)); + ASSERT_TRUE(Overlaps("450", nullptr)); +} + +TEST(FindFileTest, OverlapSequenceChecks) { + Add("200", "200", 5000, 3000); + ASSERT_TRUE(! Overlaps("199", "199")); + ASSERT_TRUE(! Overlaps("201", "300")); + ASSERT_TRUE(Overlaps("200", "200")); + ASSERT_TRUE(Overlaps("190", "200")); + ASSERT_TRUE(Overlaps("200", "210")); +} + +TEST(FindFileTest, OverlappingFiles) { + Add("150", "600"); + Add("400", "500"); + disjoint_sorted_files_ = false; + ASSERT_TRUE(! Overlaps("100", "149")); + ASSERT_TRUE(! Overlaps("601", "700")); + ASSERT_TRUE(Overlaps("100", "150")); + ASSERT_TRUE(Overlaps("100", "200")); + ASSERT_TRUE(Overlaps("100", "300")); + ASSERT_TRUE(Overlaps("100", "400")); + ASSERT_TRUE(Overlaps("100", "500")); + ASSERT_TRUE(Overlaps("375", "400")); + ASSERT_TRUE(Overlaps("450", "450")); + ASSERT_TRUE(Overlaps("450", "500")); + ASSERT_TRUE(Overlaps("450", "700")); + ASSERT_TRUE(Overlaps("600", "700")); +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +} diff --git a/db/write_batch.cc b/db/write_batch.cc new file mode 100644 index 0000000000..734d1e376d --- /dev/null +++ b/db/write_batch.cc @@ -0,0 +1,489 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// WriteBatch::rep_ := +// sequence: fixed64 +// count: fixed32 +// data: record[count] +// record := +// kTypeValue varstring varstring +// kTypeMerge varstring varstring +// kTypeDeletion varstring +// kTypeColumnFamilyValue varint32 varstring varstring +// kTypeColumnFamilyMerge varint32 varstring varstring +// kTypeColumnFamilyDeletion varint32 varstring varstring +// varstring := +// len: varint32 +// data: uint8[len] + +#include "rocksdb/write_batch.h" +#include "rocksdb/options.h" +#include "rocksdb/merge_operator.h" +#include "db/dbformat.h" +#include "db/db_impl.h" +#include "db/memtable.h" +#include "db/snapshot.h" +#include "db/write_batch_internal.h" +#include "util/coding.h" +#include "util/statistics.h" +#include + +namespace rocksdb { + +// WriteBatch header has an 8-byte sequence number followed by a 4-byte count. 
+static const size_t kHeader = 12; + +WriteBatch::WriteBatch(size_t reserved_bytes) { + rep_.reserve((reserved_bytes > kHeader) ? reserved_bytes : kHeader); + Clear(); +} + +WriteBatch::~WriteBatch() { } + +WriteBatch::Handler::~Handler() { } + +void WriteBatch::Handler::Put(const Slice& key, const Slice& value) { + // you need to either implement Put or PutCF + throw std::runtime_error("Handler::Put not implemented!"); +} + +void WriteBatch::Handler::Merge(const Slice& key, const Slice& value) { + throw std::runtime_error("Handler::Merge not implemented!"); +} + +void WriteBatch::Handler::Delete(const Slice& key) { + // you need to either implement Delete or DeleteCF + throw std::runtime_error("Handler::Delete not implemented!"); +} + +void WriteBatch::Handler::LogData(const Slice& blob) { + // If the user has not specified something to do with blobs, then we ignore + // them. +} + +bool WriteBatch::Handler::Continue() { + return true; +} + +void WriteBatch::Clear() { + rep_.clear(); + rep_.resize(kHeader); +} + +int WriteBatch::Count() const { + return WriteBatchInternal::Count(this); +} + +Status WriteBatch::Iterate(Handler* handler) const { + Slice input(rep_); + if (input.size() < kHeader) { + return Status::Corruption("malformed WriteBatch (too small)"); + } + + input.remove_prefix(kHeader); + Slice key, value, blob; + int found = 0; + Status s; + while (s.ok() && !input.empty() && handler->Continue()) { + char tag = input[0]; + input.remove_prefix(1); + uint32_t column_family = 0; // default + switch (tag) { + case kTypeColumnFamilyValue: + if (!GetVarint32(&input, &column_family)) { + return Status::Corruption("bad WriteBatch Put"); + } + // intentional fallthrough + case kTypeValue: + if (GetLengthPrefixedSlice(&input, &key) && + GetLengthPrefixedSlice(&input, &value)) { + s = handler->PutCF(column_family, key, value); + found++; + } else { + return Status::Corruption("bad WriteBatch Put"); + } + break; + case kTypeColumnFamilyDeletion: + if (!GetVarint32(&input, &column_family)) { + return Status::Corruption("bad WriteBatch Delete"); + } + // intentional fallthrough + case kTypeDeletion: + if (GetLengthPrefixedSlice(&input, &key)) { + s = handler->DeleteCF(column_family, key); + found++; + } else { + return Status::Corruption("bad WriteBatch Delete"); + } + break; + case kTypeColumnFamilyMerge: + if (!GetVarint32(&input, &column_family)) { + return Status::Corruption("bad WriteBatch Merge"); + } + // intentional fallthrough + case kTypeMerge: + if (GetLengthPrefixedSlice(&input, &key) && + GetLengthPrefixedSlice(&input, &value)) { + s = handler->MergeCF(column_family, key, value); + found++; + } else { + return Status::Corruption("bad WriteBatch Merge"); + } + break; + case kTypeLogData: + if (GetLengthPrefixedSlice(&input, &blob)) { + handler->LogData(blob); + } else { + return Status::Corruption("bad WriteBatch Blob"); + } + break; + default: + return Status::Corruption("unknown WriteBatch tag"); + } + } + if (!s.ok()) { + return s; + } + if (found != WriteBatchInternal::Count(this)) { + return Status::Corruption("WriteBatch has wrong count"); + } else { + return Status::OK(); + } +} + +int WriteBatchInternal::Count(const WriteBatch* b) { + return DecodeFixed32(b->rep_.data() + 8); +} + +void WriteBatchInternal::SetCount(WriteBatch* b, int n) { + EncodeFixed32(&b->rep_[8], n); +} + +SequenceNumber WriteBatchInternal::Sequence(const WriteBatch* b) { + return SequenceNumber(DecodeFixed64(b->rep_.data())); +} + +void WriteBatchInternal::SetSequence(WriteBatch* b, SequenceNumber 
seq) { + EncodeFixed64(&b->rep_[0], seq); +} + +void WriteBatchInternal::Put(WriteBatch* b, uint32_t column_family_id, + const Slice& key, const Slice& value) { + WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1); + if (column_family_id == 0) { + b->rep_.push_back(static_cast(kTypeValue)); + } else { + b->rep_.push_back(static_cast(kTypeColumnFamilyValue)); + PutVarint32(&b->rep_, column_family_id); + } + PutLengthPrefixedSlice(&b->rep_, key); + PutLengthPrefixedSlice(&b->rep_, value); +} + +namespace { +inline uint32_t GetColumnFamilyID(ColumnFamilyHandle* column_family) { + uint32_t column_family_id = 0; + if (column_family != nullptr) { + auto cfh = reinterpret_cast(column_family); + column_family_id = cfh->GetID(); + } + return column_family_id; +} +} // namespace + +void WriteBatch::Put(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value) { + WriteBatchInternal::Put(this, GetColumnFamilyID(column_family), key, value); +} + +void WriteBatchInternal::Put(WriteBatch* b, uint32_t column_family_id, + const SliceParts& key, const SliceParts& value) { + WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1); + if (column_family_id == 0) { + b->rep_.push_back(static_cast(kTypeValue)); + } else { + b->rep_.push_back(static_cast(kTypeColumnFamilyValue)); + PutVarint32(&b->rep_, column_family_id); + } + PutLengthPrefixedSliceParts(&b->rep_, key); + PutLengthPrefixedSliceParts(&b->rep_, value); +} + +void WriteBatch::Put(ColumnFamilyHandle* column_family, const SliceParts& key, + const SliceParts& value) { + WriteBatchInternal::Put(this, GetColumnFamilyID(column_family), key, value); +} + +void WriteBatchInternal::Delete(WriteBatch* b, uint32_t column_family_id, + const Slice& key) { + WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1); + if (column_family_id == 0) { + b->rep_.push_back(static_cast(kTypeDeletion)); + } else { + b->rep_.push_back(static_cast(kTypeColumnFamilyDeletion)); + PutVarint32(&b->rep_, column_family_id); + } + PutLengthPrefixedSlice(&b->rep_, key); +} + +void WriteBatch::Delete(ColumnFamilyHandle* column_family, const Slice& key) { + WriteBatchInternal::Delete(this, GetColumnFamilyID(column_family), key); +} + +void WriteBatchInternal::Merge(WriteBatch* b, uint32_t column_family_id, + const Slice& key, const Slice& value) { + WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1); + if (column_family_id == 0) { + b->rep_.push_back(static_cast(kTypeMerge)); + } else { + b->rep_.push_back(static_cast(kTypeColumnFamilyMerge)); + PutVarint32(&b->rep_, column_family_id); + } + PutLengthPrefixedSlice(&b->rep_, key); + PutLengthPrefixedSlice(&b->rep_, value); +} + +void WriteBatch::Merge(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value) { + WriteBatchInternal::Merge(this, GetColumnFamilyID(column_family), key, value); +} + +void WriteBatch::PutLogData(const Slice& blob) { + rep_.push_back(static_cast(kTypeLogData)); + PutLengthPrefixedSlice(&rep_, blob); +} + +namespace { +class MemTableInserter : public WriteBatch::Handler { + public: + SequenceNumber sequence_; + ColumnFamilyMemTables* cf_mems_; + bool recovery_; + uint64_t log_number_; + DBImpl* db_; + const bool dont_filter_deletes_; + + MemTableInserter(SequenceNumber sequence, ColumnFamilyMemTables* cf_mems, + bool recovery, uint64_t log_number, DB* db, + const bool dont_filter_deletes) + : sequence_(sequence), + cf_mems_(cf_mems), + recovery_(recovery), + log_number_(log_number), + db_(reinterpret_cast(db)), + 
dont_filter_deletes_(dont_filter_deletes) { + assert(cf_mems); + if (!dont_filter_deletes_) { + assert(db_); + } + } + + bool SeekToColumnFamily(uint32_t column_family_id, Status* s) { + bool found = cf_mems_->Seek(column_family_id); + if (recovery_ && (!found || log_number_ < cf_mems_->GetLogNumber())) { + // if in recovery envoronment: + // * If column family was not found, it might mean that the WAL write + // batch references to the column family that was dropped after the + // insert. We don't want to fail the whole write batch in that case -- we + // just ignore the update. + // * If log_number_ < cf_mems_->GetLogNumber(), this means that column + // family already contains updates from this log. We can't apply updates + // twice because of update-in-place or merge workloads -- ignore the + // update + *s = Status::OK(); + return false; + } + if (!found) { + assert(!recovery_); + // If the column family was not found in non-recovery enviornment + // (client's write code-path), we have to fail the write and return + // the failure status to the client. + *s = Status::InvalidArgument( + "Invalid column family specified in write batch"); + return false; + } + return true; + } + + virtual Status PutCF(uint32_t column_family_id, const Slice& key, + const Slice& value) { + Status seek_status; + if (!SeekToColumnFamily(column_family_id, &seek_status)) { + ++sequence_; + return seek_status; + } + MemTable* mem = cf_mems_->GetMemTable(); + const Options* options = cf_mems_->GetOptions(); + if (!options->inplace_update_support) { + mem->Add(sequence_, kTypeValue, key, value); + } else if (options->inplace_callback == nullptr) { + mem->Update(sequence_, key, value); + RecordTick(options->statistics.get(), NUMBER_KEYS_UPDATED); + } else { + if (mem->UpdateCallback(sequence_, key, value, *options)) { + } else { + // key not found in memtable. Do sst get, update, add + SnapshotImpl read_from_snapshot; + read_from_snapshot.number_ = sequence_; + ReadOptions ropts; + ropts.snapshot = &read_from_snapshot; + + std::string prev_value; + std::string merged_value; + + auto cf_handle = cf_mems_->GetColumnFamilyHandle(); + if (cf_handle == nullptr) { + cf_handle = db_->DefaultColumnFamily(); + } + Status s = db_->Get(ropts, cf_handle, key, &prev_value); + + char* prev_buffer = const_cast(prev_value.c_str()); + uint32_t prev_size = prev_value.size(); + auto status = options->inplace_callback(s.ok() ? prev_buffer : nullptr, + s.ok() ? &prev_size : nullptr, + value, &merged_value); + if (status == UpdateStatus::UPDATED_INPLACE) { + // prev_value is updated in-place with final value. + mem->Add(sequence_, kTypeValue, key, Slice(prev_buffer, prev_size)); + RecordTick(options->statistics.get(), NUMBER_KEYS_WRITTEN); + } else if (status == UpdateStatus::UPDATED) { + // merged_value contains the final value. + mem->Add(sequence_, kTypeValue, key, Slice(merged_value)); + RecordTick(options->statistics.get(), NUMBER_KEYS_WRITTEN); + } + } + } + // Since all Puts are logged in trasaction logs (if enabled), always bump + // sequence number. Even if the update eventually fails and does not result + // in memtable add/update. 
+ sequence_++; + return Status::OK(); + } + + virtual Status MergeCF(uint32_t column_family_id, const Slice& key, + const Slice& value) { + Status seek_status; + if (!SeekToColumnFamily(column_family_id, &seek_status)) { + ++sequence_; + return seek_status; + } + MemTable* mem = cf_mems_->GetMemTable(); + const Options* options = cf_mems_->GetOptions(); + bool perform_merge = false; + + if (options->max_successive_merges > 0 && db_ != nullptr) { + LookupKey lkey(key, sequence_); + + // Count the number of successive merges at the head + // of the key in the memtable + size_t num_merges = mem->CountSuccessiveMergeEntries(lkey); + + if (num_merges >= options->max_successive_merges) { + perform_merge = true; + } + } + + if (perform_merge) { + // 1) Get the existing value + std::string get_value; + + // Pass in the sequence number so that we also include previous merge + // operations in the same batch. + SnapshotImpl read_from_snapshot; + read_from_snapshot.number_ = sequence_; + ReadOptions read_options; + read_options.snapshot = &read_from_snapshot; + + auto cf_handle = cf_mems_->GetColumnFamilyHandle(); + if (cf_handle == nullptr) { + cf_handle = db_->DefaultColumnFamily(); + } + db_->Get(read_options, cf_handle, key, &get_value); + Slice get_value_slice = Slice(get_value); + + // 2) Apply this merge + auto merge_operator = options->merge_operator.get(); + assert(merge_operator); + + std::deque operands; + operands.push_front(value.ToString()); + std::string new_value; + if (!merge_operator->FullMerge(key, &get_value_slice, operands, + &new_value, options->info_log.get())) { + // Failed to merge! + RecordTick(options->statistics.get(), NUMBER_MERGE_FAILURES); + + // Store the delta in memtable + perform_merge = false; + } else { + // 3) Add value to memtable + mem->Add(sequence_, kTypeValue, key, new_value); + } + } + + if (!perform_merge) { + // Add merge operator to memtable + mem->Add(sequence_, kTypeMerge, key, value); + } + + sequence_++; + return Status::OK(); + } + + virtual Status DeleteCF(uint32_t column_family_id, const Slice& key) { + Status seek_status; + if (!SeekToColumnFamily(column_family_id, &seek_status)) { + ++sequence_; + return seek_status; + } + MemTable* mem = cf_mems_->GetMemTable(); + const Options* options = cf_mems_->GetOptions(); + if (!dont_filter_deletes_ && options->filter_deletes) { + SnapshotImpl read_from_snapshot; + read_from_snapshot.number_ = sequence_; + ReadOptions ropts; + ropts.snapshot = &read_from_snapshot; + std::string value; + auto cf_handle = cf_mems_->GetColumnFamilyHandle(); + if (cf_handle == nullptr) { + cf_handle = db_->DefaultColumnFamily(); + } + if (!db_->KeyMayExist(ropts, cf_handle, key, &value)) { + RecordTick(options->statistics.get(), NUMBER_FILTERED_DELETES); + return Status::OK(); + } + } + mem->Add(sequence_, kTypeDeletion, key, Slice()); + sequence_++; + return Status::OK(); + } +}; +} // namespace + +Status WriteBatchInternal::InsertInto(const WriteBatch* b, + ColumnFamilyMemTables* memtables, + bool recovery, uint64_t log_number, + DB* db, const bool dont_filter_deletes) { + MemTableInserter inserter(WriteBatchInternal::Sequence(b), memtables, + recovery, log_number, db, dont_filter_deletes); + return b->Iterate(&inserter); +} + +void WriteBatchInternal::SetContents(WriteBatch* b, const Slice& contents) { + assert(contents.size() >= kHeader); + b->rep_.assign(contents.data(), contents.size()); +} + +void WriteBatchInternal::Append(WriteBatch* dst, const WriteBatch* src) { + SetCount(dst, Count(dst) + Count(src)); + 
assert(src->rep_.size() >= kHeader); + dst->rep_.append(src->rep_.data() + kHeader, src->rep_.size() - kHeader); +} + +} // namespace rocksdb diff --git a/db/write_batch_internal.h b/db/write_batch_internal.h new file mode 100644 index 0000000000..85e85b33d7 --- /dev/null +++ b/db/write_batch_internal.h @@ -0,0 +1,123 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include "rocksdb/types.h" +#include "rocksdb/write_batch.h" +#include "rocksdb/db.h" +#include "rocksdb/options.h" + +namespace rocksdb { + +class MemTable; + +class ColumnFamilyMemTables { + public: + virtual ~ColumnFamilyMemTables() {} + virtual bool Seek(uint32_t column_family_id) = 0; + // returns true if the update to memtable should be ignored + // (useful when recovering from log whose updates have already + // been processed) + virtual uint64_t GetLogNumber() const = 0; + virtual MemTable* GetMemTable() const = 0; + virtual const Options* GetOptions() const = 0; + virtual ColumnFamilyHandle* GetColumnFamilyHandle() = 0; +}; + +class ColumnFamilyMemTablesDefault : public ColumnFamilyMemTables { + public: + ColumnFamilyMemTablesDefault(MemTable* mem, const Options* options) + : ok_(false), mem_(mem), options_(options) {} + + bool Seek(uint32_t column_family_id) override { + ok_ = (column_family_id == 0); + return ok_; + } + + uint64_t GetLogNumber() const override { return 0; } + + MemTable* GetMemTable() const override { + assert(ok_); + return mem_; + } + + const Options* GetOptions() const override { + assert(ok_); + return options_; + } + + ColumnFamilyHandle* GetColumnFamilyHandle() override { return nullptr; } + + private: + bool ok_; + MemTable* mem_; + const Options* const options_; +}; + +// WriteBatchInternal provides static methods for manipulating a +// WriteBatch that we don't want in the public WriteBatch interface. +class WriteBatchInternal { + public: + // WriteBatch methods with column_family_id instead of ColumnFamilyHandle* + static void Put(WriteBatch* batch, uint32_t column_family_id, + const Slice& key, const Slice& value); + + static void Put(WriteBatch* batch, uint32_t column_family_id, + const SliceParts& key, const SliceParts& value); + + static void Delete(WriteBatch* batch, uint32_t column_family_id, + const Slice& key); + + static void Merge(WriteBatch* batch, uint32_t column_family_id, + const Slice& key, const Slice& value); + + // Return the number of entries in the batch. + static int Count(const WriteBatch* batch); + + // Set the count for the number of entries in the batch. + static void SetCount(WriteBatch* batch, int n); + + // Return the seqeunce number for the start of this batch. + static SequenceNumber Sequence(const WriteBatch* batch); + + // Store the specified number as the seqeunce number for the start of + // this batch. 
+ static void SetSequence(WriteBatch* batch, SequenceNumber seq); + + static Slice Contents(const WriteBatch* batch) { + return Slice(batch->rep_); + } + + static size_t ByteSize(const WriteBatch* batch) { + return batch->rep_.size(); + } + + static void SetContents(WriteBatch* batch, const Slice& contents); + + // Inserts batch entries into memtable + // If dont_filter_deletes is false AND options.filter_deletes is true, + // then --> Drops deletes in batch if db->KeyMayExist returns false + // If recovery == true, this means InsertInto is executed on a recovery + // code-path. WriteBatch referencing a dropped column family can be + // found on a recovery code-path and should be ignored (recovery should not + // fail). Additionally, the memtable will be updated only if + // memtables->GetLogNumber() >= log_number + // However, if recovery == false, any WriteBatch referencing + // non-existing column family will return a failure. Also, log_number is + // ignored in that case + static Status InsertInto(const WriteBatch* batch, + ColumnFamilyMemTables* memtables, + bool recovery = false, uint64_t log_number = 0, + DB* db = nullptr, + const bool dont_filter_deletes = true); + + static void Append(WriteBatch* dst, const WriteBatch* src); +}; + +} // namespace rocksdb diff --git a/db/write_batch_test.cc b/db/write_batch_test.cc new file mode 100644 index 0000000000..febd35c05d --- /dev/null +++ b/db/write_batch_test.cc @@ -0,0 +1,323 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "rocksdb/db.h" + +#include +#include "db/memtable.h" +#include "db/column_family.h" +#include "db/write_batch_internal.h" +#include "rocksdb/env.h" +#include "rocksdb/memtablerep.h" +#include "util/logging.h" +#include "util/testharness.h" + +namespace rocksdb { + +static std::string PrintContents(WriteBatch* b) { + InternalKeyComparator cmp(BytewiseComparator()); + auto factory = std::make_shared(); + Options options; + options.memtable_factory = factory; + MemTable* mem = new MemTable(cmp, options); + mem->Ref(); + std::string state; + ColumnFamilyMemTablesDefault cf_mems_default(mem, &options); + Status s = WriteBatchInternal::InsertInto(b, &cf_mems_default); + int count = 0; + Iterator* iter = mem->NewIterator(ReadOptions()); + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ParsedInternalKey ikey; + memset((void *)&ikey, 0, sizeof(ikey)); + ASSERT_TRUE(ParseInternalKey(iter->key(), &ikey)); + switch (ikey.type) { + case kTypeValue: + state.append("Put("); + state.append(ikey.user_key.ToString()); + state.append(", "); + state.append(iter->value().ToString()); + state.append(")"); + count++; + break; + case kTypeMerge: + state.append("Merge("); + state.append(ikey.user_key.ToString()); + state.append(", "); + state.append(iter->value().ToString()); + state.append(")"); + count++; + break; + case kTypeDeletion: + state.append("Delete("); + state.append(ikey.user_key.ToString()); + state.append(")"); + count++; + break; + default: + assert(false); + break; + } + state.append("@"); + state.append(NumberToString(ikey.sequence)); + } + delete iter; + if (!s.ok()) { + state.append(s.ToString()); + } else if (count != WriteBatchInternal::Count(b)) { + state.append("CountMismatch()"); + } + delete mem->Unref(); + return state; +} + +class WriteBatchTest { }; + +TEST(WriteBatchTest, Empty) { + WriteBatch batch; + ASSERT_EQ("", PrintContents(&batch)); + ASSERT_EQ(0, WriteBatchInternal::Count(&batch)); + ASSERT_EQ(0, batch.Count()); +} + +TEST(WriteBatchTest, Multiple) { + WriteBatch batch; + batch.Put(Slice("foo"), Slice("bar")); + batch.Delete(Slice("box")); + batch.Put(Slice("baz"), Slice("boo")); + WriteBatchInternal::SetSequence(&batch, 100); + ASSERT_EQ(100U, WriteBatchInternal::Sequence(&batch)); + ASSERT_EQ(3, WriteBatchInternal::Count(&batch)); + ASSERT_EQ("Put(baz, boo)@102" + "Delete(box)@101" + "Put(foo, bar)@100", + PrintContents(&batch)); + ASSERT_EQ(3, batch.Count()); +} + +TEST(WriteBatchTest, Corruption) { + WriteBatch batch; + batch.Put(Slice("foo"), Slice("bar")); + batch.Delete(Slice("box")); + WriteBatchInternal::SetSequence(&batch, 200); + Slice contents = WriteBatchInternal::Contents(&batch); + WriteBatchInternal::SetContents(&batch, + Slice(contents.data(),contents.size()-1)); + ASSERT_EQ("Put(foo, bar)@200" + "Corruption: bad WriteBatch Delete", + PrintContents(&batch)); +} + +TEST(WriteBatchTest, Append) { + WriteBatch b1, b2; + WriteBatchInternal::SetSequence(&b1, 200); + WriteBatchInternal::SetSequence(&b2, 300); + WriteBatchInternal::Append(&b1, &b2); + ASSERT_EQ("", + PrintContents(&b1)); + ASSERT_EQ(0, b1.Count()); + b2.Put("a", "va"); + WriteBatchInternal::Append(&b1, &b2); + ASSERT_EQ("Put(a, va)@200", + PrintContents(&b1)); + ASSERT_EQ(1, b1.Count()); + b2.Clear(); + b2.Put("b", "vb"); + WriteBatchInternal::Append(&b1, &b2); + ASSERT_EQ("Put(a, va)@200" + "Put(b, vb)@201", + PrintContents(&b1)); + ASSERT_EQ(2, b1.Count()); + b2.Delete("foo"); + WriteBatchInternal::Append(&b1, &b2); + ASSERT_EQ("Put(a, va)@200" + "Put(b, vb)@202" + "Put(b, 
vb)@201" + "Delete(foo)@203", + PrintContents(&b1)); + ASSERT_EQ(4, b1.Count()); +} + +namespace { + struct TestHandler : public WriteBatch::Handler { + std::string seen; + virtual Status PutCF(uint32_t column_family_id, const Slice& key, + const Slice& value) { + if (column_family_id == 0) { + seen += "Put(" + key.ToString() + ", " + value.ToString() + ")"; + } else { + seen += "PutCF(" + std::to_string(column_family_id) + ", " + + key.ToString() + ", " + value.ToString() + ")"; + } + return Status::OK(); + } + virtual Status MergeCF(uint32_t column_family_id, const Slice& key, + const Slice& value) { + if (column_family_id == 0) { + seen += "Merge(" + key.ToString() + ", " + value.ToString() + ")"; + } else { + seen += "MergeCF(" + std::to_string(column_family_id) + ", " + + key.ToString() + ", " + value.ToString() + ")"; + } + return Status::OK(); + } + virtual void LogData(const Slice& blob) { + seen += "LogData(" + blob.ToString() + ")"; + } + virtual Status DeleteCF(uint32_t column_family_id, const Slice& key) { + if (column_family_id == 0) { + seen += "Delete(" + key.ToString() + ")"; + } else { + seen += "DeleteCF(" + std::to_string(column_family_id) + ", " + + key.ToString() + ")"; + } + return Status::OK(); + } + }; +} + +TEST(WriteBatchTest, Blob) { + WriteBatch batch; + batch.Put(Slice("k1"), Slice("v1")); + batch.Put(Slice("k2"), Slice("v2")); + batch.Put(Slice("k3"), Slice("v3")); + batch.PutLogData(Slice("blob1")); + batch.Delete(Slice("k2")); + batch.PutLogData(Slice("blob2")); + batch.Merge(Slice("foo"), Slice("bar")); + ASSERT_EQ(5, batch.Count()); + ASSERT_EQ("Merge(foo, bar)@4" + "Put(k1, v1)@0" + "Delete(k2)@3" + "Put(k2, v2)@1" + "Put(k3, v3)@2", + PrintContents(&batch)); + + TestHandler handler; + batch.Iterate(&handler); + ASSERT_EQ( + "Put(k1, v1)" + "Put(k2, v2)" + "Put(k3, v3)" + "LogData(blob1)" + "Delete(k2)" + "LogData(blob2)" + "Merge(foo, bar)", + handler.seen); +} + +TEST(WriteBatchTest, Continue) { + WriteBatch batch; + + struct Handler : public TestHandler { + int num_seen = 0; + virtual Status PutCF(uint32_t column_family_id, const Slice& key, + const Slice& value) { + ++num_seen; + return TestHandler::PutCF(column_family_id, key, value); + } + virtual Status MergeCF(uint32_t column_family_id, const Slice& key, + const Slice& value) { + ++num_seen; + return TestHandler::MergeCF(column_family_id, key, value); + } + virtual void LogData(const Slice& blob) { + ++num_seen; + TestHandler::LogData(blob); + } + virtual Status DeleteCF(uint32_t column_family_id, const Slice& key) { + ++num_seen; + return TestHandler::DeleteCF(column_family_id, key); + } + virtual bool Continue() override { + return num_seen < 3; + } + } handler; + + batch.Put(Slice("k1"), Slice("v1")); + batch.PutLogData(Slice("blob1")); + batch.Delete(Slice("k1")); + batch.PutLogData(Slice("blob2")); + batch.Merge(Slice("foo"), Slice("bar")); + batch.Iterate(&handler); + ASSERT_EQ( + "Put(k1, v1)" + "LogData(blob1)" + "Delete(k1)", + handler.seen); +} + +TEST(WriteBatchTest, PutGatherSlices) { + WriteBatch batch; + batch.Put(Slice("foo"), Slice("bar")); + + { + // Try a write where the key is one slice but the value is two + Slice key_slice("baz"); + Slice value_slices[2] = { Slice("header"), Slice("payload") }; + batch.Put(SliceParts(&key_slice, 1), + SliceParts(value_slices, 2)); + } + + { + // One where the key is composite but the value is a single slice + Slice key_slices[3] = { Slice("key"), Slice("part2"), Slice("part3") }; + Slice value_slice("value"); + batch.Put(SliceParts(key_slices, 
3), + SliceParts(&value_slice, 1)); + } + + WriteBatchInternal::SetSequence(&batch, 100); + ASSERT_EQ("Put(baz, headerpayload)@101" + "Put(foo, bar)@100" + "Put(keypart2part3, value)@102", + PrintContents(&batch)); + ASSERT_EQ(3, batch.Count()); +} + +namespace { +class ColumnFamilyHandleImplDummy : public ColumnFamilyHandleImpl { + public: + explicit ColumnFamilyHandleImplDummy(int id) + : ColumnFamilyHandleImpl(nullptr, nullptr, nullptr), id_(id) {} + uint32_t GetID() const override { return id_; } + + private: + uint32_t id_; +}; +} // namespace anonymous + +TEST(WriteBatchTest, ColumnFamiliesBatchTest) { + WriteBatch batch; + ColumnFamilyHandleImplDummy zero(0), two(2), three(3), eight(8); + batch.Put(&zero, Slice("foo"), Slice("bar")); + batch.Put(&two, Slice("twofoo"), Slice("bar2")); + batch.Put(&eight, Slice("eightfoo"), Slice("bar8")); + batch.Delete(&eight, Slice("eightfoo")); + batch.Merge(&three, Slice("threethree"), Slice("3three")); + batch.Put(&zero, Slice("foo"), Slice("bar")); + batch.Merge(Slice("omom"), Slice("nom")); + + TestHandler handler; + batch.Iterate(&handler); + ASSERT_EQ( + "Put(foo, bar)" + "PutCF(2, twofoo, bar2)" + "PutCF(8, eightfoo, bar8)" + "DeleteCF(8, eightfoo)" + "MergeCF(3, threethree, 3three)" + "Put(foo, bar)" + "Merge(omom, nom)", + handler.seen); +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +} diff --git a/doc/doc.css b/doc/doc.css new file mode 100644 index 0000000000..700c564e43 --- /dev/null +++ b/doc/doc.css @@ -0,0 +1,89 @@ +body { + margin-left: 0.5in; + margin-right: 0.5in; + background: white; + color: black; +} + +h1 { + margin-left: -0.2in; + font-size: 14pt; +} +h2 { + margin-left: -0in; + font-size: 12pt; +} +h3 { + margin-left: -0in; +} +h4 { + margin-left: -0in; +} +hr { + margin-left: -0in; +} + +/* Definition lists: definition term bold */ +dt { + font-weight: bold; +} + +address { + text-align: center; +} +code,samp,var { + color: blue; +} +kbd { + color: #600000; +} +div.note p { + float: right; + width: 3in; + margin-right: 0%; + padding: 1px; + border: 2px solid #6060a0; + background-color: #fffff0; +} + +ul { + margin-top: -0em; + margin-bottom: -0em; +} + +ol { + margin-top: -0em; + margin-bottom: -0em; +} + +UL.nobullets { + list-style-type: none; + list-style-image: none; + margin-left: -1em; +} + +p { + margin: 1em 0 1em 0; + padding: 0 0 0 0; +} + +pre { + line-height: 1.3em; + padding: 0.4em 0 0.8em 0; + margin: 0 0 0 0; + border: 0 0 0 0; + color: blue; +} + +.datatable { + margin-left: auto; + margin-right: auto; + margin-top: 2em; + margin-bottom: 2em; + border: 1px solid; +} + +.datatable td,th { + padding: 0 0.5em 0 0.5em; + text-align: right; +} diff --git a/doc/index.html b/doc/index.html new file mode 100644 index 0000000000..71f515e761 --- /dev/null +++ b/doc/index.html @@ -0,0 +1,831 @@ + + + + +RocksDB + + + +

RocksDB

+
The Facebook Database Engineering Team
+
Built on earlier work on leveldb by Sanjay Ghemawat
+(sanjay@google.com) and Jeff Dean (jeff@google.com)
+

+The rocksdb library provides a persistent key-value store. Keys and
+values are arbitrary byte arrays. The keys are ordered within the
+key-value store according to a user-specified comparator function.
+

+

Opening A Database

+

+A rocksdb database has a name which corresponds to a file system
+directory. All of the contents of the database are stored in this
+directory. The following example shows how to open a database,
+creating it if necessary:
+

+

+  #include <cassert>
+  #include "rocksdb/db.h"
+
+  rocksdb::DB* db;
+  rocksdb::Options options;
+  options.create_if_missing = true;
+  rocksdb::Status status = rocksdb::DB::Open(options, "/tmp/testdb", &db);
+  assert(status.ok());
+  ...
+
+If you want to raise an error if the database already exists, add +the following line before the rocksdb::DB::Open call: +
+  options.error_if_exists = true;
+
+

Status

+

+You may have noticed the rocksdb::Status type above. Values of this +type are returned by most functions in rocksdb that may encounter an +error. You can check if such a result is ok, and also print an +associated error message: +

+

+   rocksdb::Status s = ...;
+   if (!s.ok()) cerr << s.ToString() << endl;
+
+

Closing A Database

+

+When you are done with a database, just delete the database object. +Example: +

+

+  ... open the db as described above ...
+  ... do something with db ...
+  delete db;
+
+

Reads And Writes

+

+The database provides Put, Delete, and Get methods to +modify/query the database. For example, the following code +moves the value stored under key1 to key2. +

+  std::string value;
+  rocksdb::Status s = db->Get(rocksdb::ReadOptions(), key1, &value);
+  if (s.ok()) s = db->Put(rocksdb::WriteOptions(), key2, value);
+  if (s.ok()) s = db->Delete(rocksdb::WriteOptions(), key1);
+
+ +

Atomic Updates

+

+Note that if the process dies after the Put of key2 but before the +delete of key1, the same value may be left stored under multiple keys. +Such problems can be avoided by using the WriteBatch class to +atomically apply a set of updates: +

+

+  #include "rocksdb/write_batch.h"
+  ...
+  std::string value;
+  rocksdb::Status s = db->Get(rocksdb::ReadOptions(), key1, &value);
+  if (s.ok()) {
+    rocksdb::WriteBatch batch;
+    batch.Delete(key1);
+    batch.Put(key2, value);
+    s = db->Write(rocksdb::WriteOptions(), &batch);
+  }
+
+The WriteBatch holds a sequence of edits to be made to the database, +and these edits within the batch are applied in order. Note that we +called Delete before Put so that if key1 is identical to key2, +we do not end up erroneously dropping the value entirely. +

+Apart from its atomicity benefits, WriteBatch may also be used to +speed up bulk updates by placing lots of individual mutations into the +same batch. + +
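+As a rough sketch (the keys and values here are invented for
+illustration), such a bulk update might look like:
+
+  rocksdb::WriteBatch batch;
+  for (int i = 0; i < 1000; i++) {
+    batch.Put("key" + std::to_string(i), "value" + std::to_string(i));
+  }
+  rocksdb::Status s = db->Write(rocksdb::WriteOptions(), &batch);
+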

Synchronous Writes

+By default, each write to rocksdb is asynchronous: it
+returns after pushing the write from the process into the operating
+system. The transfer from operating system memory to the underlying
+persistent storage happens asynchronously. The sync flag
+can be turned on for a particular write to make the write operation
+not return until the data being written has been pushed all the way to
+persistent storage. (On POSIX systems, this is implemented by calling
+either fsync(...) or fdatasync(...) or
+msync(..., MS_SYNC) before the write operation returns.)
+
+  rocksdb::WriteOptions write_options;
+  write_options.sync = true;
+  db->Put(write_options, ...);
+
+Asynchronous writes are often more than a thousand times as fast as +synchronous writes. The downside of asynchronous writes is that a +crash of the machine may cause the last few updates to be lost. Note +that a crash of just the writing process (i.e., not a reboot) will not +cause any loss since even when sync is false, an update +is pushed from the process memory into the operating system before it +is considered done. + +

+Asynchronous writes can often be used safely. For example, when +loading a large amount of data into the database you can handle lost +updates by restarting the bulk load after a crash. A hybrid scheme is +also possible where every Nth write is synchronous, and in the event +of a crash, the bulk load is restarted just after the last synchronous +write finished by the previous run. (The synchronous write can update +a marker that describes where to restart on a crash.) + +
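+One possible shape of that hybrid scheme (the batch size, marker key,
+and updates container below are invented for illustration):
+
+  rocksdb::WriteOptions async_options;  // sync is false by default
+  rocksdb::WriteOptions sync_options;
+  sync_options.sync = true;
+
+  for (size_t i = 0; i < updates.size(); i++) {
+    bool checkpoint = ((i + 1) % 1000 == 0);  // every 1000th write
+    db->Put(checkpoint ? sync_options : async_options,
+            updates[i].key, updates[i].value);
+    if (checkpoint) {
+      // Record where to restart the bulk load after a crash.
+      db->Put(sync_options, "bulkload_progress", std::to_string(i));
+    }
+  }
+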

+WriteBatch provides an alternative to asynchronous writes. +Multiple updates may be placed in the same WriteBatch and +applied together using a synchronous write (i.e., +write_options.sync is set to true). The extra cost of +the synchronous write will be amortized across all of the writes in +the batch. + +

+We also provide a way to completely disable the Write Ahead Log for a
+particular write. If you set write_options.disableWAL to true, the
+write will not go to the log at all and may be lost in the event of a
+process crash.
+
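+For example (key and value as in the earlier examples):
+
+  rocksdb::WriteOptions write_options;
+  write_options.disableWAL = true;
+  db->Put(write_options, key, value);  // fast, but unlogged
+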

+When opening a DB, you can disable syncing of data files by setting +Options::disableDataSync to true. This can be useful when doing +bulk-loading or big idempotent operations. Once the operation is +finished, you can manually call sync() to flush all dirty buffers +to stable storage. + +

+RocksDB by default uses the faster fdatasync() to sync files. If you want
+to use fsync(), you can set Options::use_fsync to true. You should set
+this to true on filesystems like ext3 that can lose files after a
+reboot.
+
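+A sketch of the two settings discussed above:
+
+  rocksdb::Options options;
+  options.disableDataSync = true;  // skip syncing data files (bulk load)
+  options.use_fsync = true;        // use fsync() instead of fdatasync()
+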

+

Concurrency

+

+A database may only be opened by one process at a time.
+The rocksdb implementation acquires a lock from the
+operating system to prevent misuse. Within a single process, the
+same rocksdb::DB object may be safely shared by multiple
+concurrent threads. I.e., different threads may write into or fetch
+iterators or call Get on the same database without any
+external synchronization (the rocksdb implementation will
+automatically do the required synchronization). However, other objects
+(like Iterator and WriteBatch) may require external synchronization.
+If two threads share such an object, they must protect access to it
+using their own locking protocol. More details are available in
+the public header files.
+

+

Merge operators

+

+Merge operators provide efficient support for read-modify-write operations.
+More on the interface and implementation can be found at:
+

+ + Merge Operator +

+ + Merge Operator Implementation + +
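+As a brief sketch of the interface, a merge operator that treats values
+as string-encoded integers and adds them together could be built on the
+AssociativeMergeOperator helper (the class name and encoding here are
+illustrative):
+
+  class UInt64AddOperator : public rocksdb::AssociativeMergeOperator {
+   public:
+    virtual bool Merge(const rocksdb::Slice& key,
+                       const rocksdb::Slice* existing_value,
+                       const rocksdb::Slice& value,
+                       std::string* new_value,
+                       rocksdb::Logger* logger) const {
+      uint64_t existing = 0;
+      if (existing_value != nullptr) {
+        existing = std::stoull(existing_value->ToString());
+      }
+      *new_value = std::to_string(existing + std::stoull(value.ToString()));
+      return true;
+    }
+    virtual const char* Name() const { return "UInt64AddOperator"; }
+  };
+
+It is installed through Options::merge_operator and driven with
+db->Merge(), e.g. db->Merge(rocksdb::WriteOptions(), "counter", "1").
+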

+

Iteration

+

+The following example demonstrates how to print all key-value pairs
+in a database.
+

+

+  rocksdb::Iterator* it = db->NewIterator(rocksdb::ReadOptions());
+  for (it->SeekToFirst(); it->Valid(); it->Next()) {
+    cout << it->key().ToString() << ": "  << it->value().ToString() << endl;
+  }
+  assert(it->status().ok());  // Check for any errors found during the scan
+  delete it;
+
+The following variation shows how to process just the keys in the +range [start,limit): +

+

+  for (it->Seek(start);
+       it->Valid() && it->key().ToString() < limit;
+       it->Next()) {
+    ...
+  }
+
+You can also process entries in reverse order. (Caveat: reverse +iteration may be somewhat slower than forward iteration.) +

+

+  for (it->SeekToLast(); it->Valid(); it->Prev()) {
+    ...
+  }
+
+

Snapshots

+

+Snapshots provide consistent read-only views over the entire state of +the key-value store. ReadOptions::snapshot may be non-NULL to indicate +that a read should operate on a particular version of the DB state. +If ReadOptions::snapshot is NULL, the read will operate on an +implicit snapshot of the current state. +

+Snapshots are created by the DB::GetSnapshot() method: +

+

+  rocksdb::ReadOptions options;
+  options.snapshot = db->GetSnapshot();
+  ... apply some updates to db ...
+  rocksdb::Iterator* iter = db->NewIterator(options);
+  ... read using iter to view the state when the snapshot was created ...
+  delete iter;
+  db->ReleaseSnapshot(options.snapshot);
+
+Note that when a snapshot is no longer needed, it should be released +using the DB::ReleaseSnapshot interface. This allows the +implementation to get rid of state that was being maintained just to +support reading as of that snapshot. +

Slice

+

+The return values of the it->key() and it->value() calls above
+are instances of the rocksdb::Slice type. Slice is a simple
+structure that contains a length and a pointer to an external byte
+array. Returning a Slice is a cheaper alternative to returning a
+std::string since we do not need to copy potentially large keys and
+values. In addition, rocksdb methods do not return null-terminated
+C-style strings since rocksdb keys and values are allowed to
+contain '\0' bytes.
+

+C++ strings and null-terminated C-style strings can be easily converted +to a Slice: +

+

+   rocksdb::Slice s1 = "hello";
+
+   std::string str("world");
+   rocksdb::Slice s2 = str;
+
+A Slice can be easily converted back to a C++ string: +
+   std::string str = s1.ToString();
+   assert(str == std::string("hello"));
+
+Be careful when using Slices since it is up to the caller to ensure that
+the external byte array to which the Slice points remains live while
+the Slice is in use. For example, the following is buggy:
+

+

+   rocksdb::Slice slice;
+   if (...) {
+     std::string str = ...;
+     slice = str;
+   }
+   Use(slice);
+
+When the if statement goes out of scope, str will be destroyed and the +backing storage for slice will disappear. +

+

Comparators

+

+The preceding examples used the default ordering function for keys,
+which orders bytes lexicographically. You can, however, supply a custom
+comparator when opening a database. For example, suppose each
+database key consists of two numbers and we want to sort by the first
+number, breaking ties with the second number. First, define a proper
+subclass of rocksdb::Comparator that expresses these rules:
+

+

+  class TwoPartComparator : public rocksdb::Comparator {
+   public:
+    // Three-way comparison function:
+    //   if a < b: negative result
+    //   if a > b: positive result
+    //   else: zero result
+    int Compare(const rocksdb::Slice& a, const rocksdb::Slice& b) const {
+      int a1, a2, b1, b2;
+      ParseKey(a, &a1, &a2);
+      ParseKey(b, &b1, &b2);
+      if (a1 < b1) return -1;
+      if (a1 > b1) return +1;
+      if (a2 < b2) return -1;
+      if (a2 > b2) return +1;
+      return 0;
+    }
+
+    // Ignore the following methods for now:
+    const char* Name() const { return "TwoPartComparator"; }
+    void FindShortestSeparator(std::string*, const rocksdb::Slice&) const { }
+    void FindShortSuccessor(std::string*) const { }
+  };
+
+Now create a database using this custom comparator: +

+

+  TwoPartComparator cmp;
+  rocksdb::DB* db;
+  rocksdb::Options options;
+  options.create_if_missing = true;
+  options.comparator = &cmp;
+  rocksdb::Status status = rocksdb::DB::Open(options, "/tmp/testdb", &db);
+  ...
+
+

Backwards compatibility

+

+The result of the comparator's Name method is attached to the +database when it is created, and is checked on every subsequent +database open. If the name changes, the rocksdb::DB::Open call will +fail. Therefore, change the name if and only if the new key format +and comparison function are incompatible with existing databases, and +it is ok to discard the contents of all existing databases. +

+You can, however, still gradually evolve your key format over time with
+a little bit of pre-planning. For example, you could store a version
+number at the end of each key (one byte should suffice for most uses).
+When you wish to switch to a new key format (e.g., adding an optional
+third part to the keys processed by TwoPartComparator),
+(a) keep the same comparator name, (b) increment the version number
+for new keys, and (c) change the comparator function so that it uses the
+version numbers found in the keys to decide how to interpret them.
+
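+A tiny sketch of that evolution, with an invented helper that stamps a
+trailing version byte on new keys:
+
+  std::string MakeVersionedKey(const rocksdb::Slice& user_key) {
+    std::string key = user_key.ToString();
+    key.push_back('\x02');  // version 2: keys now carry a third part
+    return key;
+  }
+
+The comparator keeps its old Name(), inspects the trailing byte, and
+interprets version-1 and version-2 keys accordingly.
+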

+

MemTable and Table factories

+

+By default, we keep the data in memory in a skiplist memtable and the data
+on disk in a table format described here:
+
+ RocksDB Table Format.
+

+Since one of the goals of RocksDB is to have
+different parts of the system easily pluggable, we support different
+implementations of both memtable and table format. You can supply
+your own memtable factory by setting Options::memtable_factory
+and your own table factory by setting Options::table_factory.
+For available memtable factories, please refer to
+rocksdb/memtablerep.h, and for table factories to
+rocksdb/table.h. Both features are in active development;
+please be wary of any API changes that might break your application
+going forward.
+
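+A sketch of wiring in the factories (PlainTable has further requirements,
+such as a prefix extractor, that are not shown here):
+
+  #include "rocksdb/memtablerep.h"
+  #include "rocksdb/table.h"
+
+  rocksdb::Options options;
+  // The default skiplist memtable, stated explicitly:
+  options.memtable_factory.reset(new rocksdb::SkipListFactory);
+  // A plain-table SST format instead of the default block-based tables:
+  options.table_factory.reset(rocksdb::NewPlainTableFactory());
+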

+You can also read more about memtables here: + +Memtables wiki + + +

+

Performance

+

+Performance can be tuned by changing the default values of the +types defined in include/rocksdb/options.h. + +

+

Block size

+

+rocksdb groups adjacent keys together into the same block and such a
+block is the unit of transfer to and from persistent storage. The
+default block size is approximately 4096 uncompressed bytes.
+Applications that mostly do bulk scans over the contents of the
+database may wish to increase this size. Applications that do a lot
+of point reads of small values may wish to switch to a smaller block
+size if performance measurements indicate an improvement. There isn't
+much benefit in using blocks smaller than one kilobyte, or larger than
+a few megabytes. Also note that compression will be more effective
+with larger block sizes. To change the block size, set
+Options::block_size.
+
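+For example, a scan-heavy application might use larger blocks:
+
+  rocksdb::Options options;
+  options.block_size = 64 * 1024;  // 64KB blocks instead of the ~4KB default
+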

+

Write buffer

+

+Options::write_buffer_size specifies the amount of data
+to build up in memory before converting to a sorted on-disk file.
+Larger values increase performance, especially during bulk loads.
+Up to max_write_buffer_number write buffers may be held in memory
+at the same time,
+so you may wish to adjust this parameter to control memory usage.
+Also, a larger write buffer will result in a longer recovery time
+the next time the database is opened.
+A related option is
+Options::max_write_buffer_number, which is the maximum number
+of write buffers that are built up in memory. The default is 2, so that
+when 1 write buffer is being flushed to storage, new writes can continue
+to the other write buffer.
+Options::min_write_buffer_number_to_merge is the minimum number
+of write buffers that will be merged together before writing to storage.
+If set to 1, then all write buffers are flushed to L0 as individual files and
+this increases read amplification because a get request has to check
+all of these files. Also, an in-memory merge may result in writing less
+data to storage if there are duplicate records in each of these
+individual write buffers. Default: 1
+
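+A sketch of these three write-buffer knobs together (the sizes are
+illustrative):
+
+  rocksdb::Options options;
+  options.write_buffer_size = 64 * 1024 * 1024;  // 64MB memtable
+  options.max_write_buffer_number = 3;           // up to 3 memtables in memory
+  options.min_write_buffer_number_to_merge = 2;  // merge 2 before flushing
+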

+

Compression

+

+Each block is individually compressed before being written to +persistent storage. Compression is on by default since the default +compression method is very fast, and is automatically disabled for +uncompressible data. In rare cases, applications may want to disable +compression entirely, but should only do so if benchmarks show a +performance improvement: +

+

+  rocksdb::Options options;
+  options.compression = rocksdb::kNoCompression;
+  ... rocksdb::DB::Open(options, name, ...) ....
+
+

Cache

+

+The contents of the database are stored in a set of files in the
+filesystem and each file stores a sequence of compressed blocks. If
+options.block_cache is non-NULL, it is used to cache frequently
+used uncompressed block contents. If options.block_cache_compressed
+is non-NULL, it is used to cache frequently used compressed blocks. The
+compressed cache is an alternative to the OS cache, which also holds
+compressed blocks. If the compressed cache is used, the OS cache is
+disabled automatically by setting options.allow_os_buffer to false.
+

+

+  #include "rocksdb/cache.h"
+
+  rocksdb::Options options;
+  options.block_cache = rocksdb::NewLRUCache(100 * 1048576);  // 100MB uncompressed cache
+  options.block_cache_compressed = rocksdb::NewLRUCache(100 * 1048576);  // 100MB compressed cache
+  rocksdb::DB* db;
+  rocksdb::DB::Open(options, name, &db);
+  ... use the db ...
+  delete db;
+  delete options.block_cache;
+  delete options.block_cache_compressed;
+
+

+When performing a bulk read, the application may wish to disable +caching so that the data processed by the bulk read does not end up +displacing most of the cached contents. A per-iterator option can be +used to achieve this: +

+

+  rocksdb::ReadOptions options;
+  options.fill_cache = false;
+  rocksdb::Iterator* it = db->NewIterator(options);
+  for (it->SeekToFirst(); it->Valid(); it->Next()) {
+    ...
+  }
+
+

+You can also disable the block cache by setting options.no_block_cache
+to true.
+

Key Layout

+

+Note that the unit of disk transfer and caching is a block. Adjacent +keys (according to the database sort order) will usually be placed in +the same block. Therefore the application can improve its performance +by placing keys that are accessed together near each other and placing +infrequently used keys in a separate region of the key space. +

+For example, suppose we are implementing a simple file system on top +of rocksdb. The types of entries we might wish to store are: +

+

+   filename -> permission-bits, length, list of file_block_ids
+   file_block_id -> data
+
+We might want to prefix filename keys with one letter (say '/') and the +file_block_id keys with a different letter (say '0') so that scans +over just the metadata do not force us to fetch and cache bulky file +contents. +
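+Sketched as invented helper functions:
+
+  std::string MetadataKey(const std::string& filename) {
+    return "/" + filename;  // '/' keeps file metadata contiguous
+  }
+  std::string BlockKey(const std::string& file_block_id) {
+    return "0" + file_block_id;  // '0' groups bulky block contents elsewhere
+  }
+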

+

Filters

+

+Because of the way rocksdb data is organized on disk, +a single Get() call may involve multiple reads from disk. +The optional FilterPolicy mechanism can be used to reduce +the number of disk reads substantially. +

+   rocksdb::Options options;
+   options.filter_policy = NewBloomFilterPolicy(10);
+   rocksdb::DB* db;
+   rocksdb::DB::Open(options, "/tmp/testdb", &db);
+   ... use the database ...
+   delete db;
+   delete options.filter_policy;
+
+The preceding code associates a
+Bloom filter
+based filtering policy with the database. Bloom filter based
+filtering relies on keeping some number of bits of data in memory per
+key (in this case 10 bits per key since that is the argument we passed
+to NewBloomFilterPolicy). This filter will reduce the number of unnecessary
+disk reads needed for Get() calls by a factor of
+approximately 100. Increasing the bits per key will lead to a
+larger reduction at the cost of more memory usage. We recommend that
+applications whose working set does not fit in memory and that do a
+lot of random reads set a filter policy.
+

+If you are using a custom comparator, you should ensure that the filter
+policy you are using is compatible with your comparator. For example,
+consider a comparator that ignores trailing spaces when comparing keys.
+NewBloomFilterPolicy must not be used with such a comparator.
+Instead, the application should provide a custom filter policy that
+also ignores trailing spaces. For example:
+

+  class CustomFilterPolicy : public rocksdb::FilterPolicy {
+   private:
+    FilterPolicy* builtin_policy_;
+   public:
+    CustomFilterPolicy() : builtin_policy_(NewBloomFilterPolicy(10)) { }
+    ~CustomFilterPolicy() { delete builtin_policy_; }
+
+    const char* Name() const { return "IgnoreTrailingSpacesFilter"; }
+
+    void CreateFilter(const Slice* keys, int n, std::string* dst) const {
+      // Use builtin bloom filter code after removing trailing spaces
+      std::vector<Slice> trimmed(n);
+      for (int i = 0; i < n; i++) {
+        trimmed[i] = RemoveTrailingSpaces(keys[i]);
+      }
+      return builtin_policy_->CreateFilter(&trimmed[0], n, dst);
+    }
+
+    bool KeyMayMatch(const Slice& key, const Slice& filter) const {
+      // Use builtin bloom filter code after removing trailing spaces
+      return builtin_policy_->KeyMayMatch(RemoveTrailingSpaces(key), filter);
+    }
+  };
+
+

+Advanced applications may provide a filter policy that does not use +a bloom filter but uses some other mechanism for summarizing a set +of keys. See rocksdb/filter_policy.h for detail. +

+

Checksums

+

+rocksdb associates checksums with all data it stores in the file system. +There are two separate controls provided over how aggressively these +checksums are verified: +

+

    +
  • ReadOptions::verify_checksums may be set to true to force + checksum verification of all data that is read from the file system on + behalf of a particular read. By default, no such verification is + done. +

    +

  • Options::paranoid_checks may be set to true before opening a + database to make the database implementation raise an error as soon as + it detects an internal corruption. Depending on which portion of the + database has been corrupted, the error may be raised when the database + is opened, or later by another database operation. By default, + paranoid checking is off so that the database can be used even if + parts of its persistent storage have been corrupted. +

    + If a database is corrupted (perhaps it cannot be opened when + paranoid checking is turned on), the rocksdb::RepairDB function + may be used to recover as much of the data as possible. +

    +

+ +
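+A short sketch of the three facilities mentioned above (the path is the
+same example path used earlier):
+
+  rocksdb::ReadOptions read_options;
+  read_options.verify_checksums = true;  // verify everything this read touches
+
+  rocksdb::Options options;
+  options.paranoid_checks = true;        // surface corruption as early as possible
+
+  // If the database is too corrupt to open, salvage what is possible:
+  rocksdb::Status s = rocksdb::RepairDB("/tmp/testdb", rocksdb::Options());
+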

+

Compaction

+

+You can read more on Compactions here: + + Multi-threaded compactions + +

+Here we give an overview of the options that impact the behavior of compactions:
+

    +

    +

  • Options::compaction_style - RocksDB currently supports two +compaction algorithms - Universal style and Level style. This option switches +between the two. Can be kCompactionStyleUniversal or kCompactionStyleLevel. +If this is kCompactionStyleUniversal, then you can configure universal style +parameters with Options::compaction_options_universal. +

    +

  • Options::disable_auto_compactions - Disable automatic compactions. +Manual compactions can still be issued on this database. +

    +

  • Options::compaction_filter - Allows an application to modify/delete
+a key-value during background compaction. The client must provide
+compaction_filter_factory if it requires a new compaction filter to be used
+for different compaction processes. The client should specify only one of
+filter or factory.
+

    +

  • Options::compaction_filter_factory - a factory that provides +compaction filter objects which allow an application to modify/delete a +key-value during background compaction. +
+

+Other options impacting performance of compactions and when they get triggered +are: +

    +

    +

  • Options::access_hint_on_compaction_start - Specify the file access +pattern once a compaction is started. It will be applied to all input files of a compaction. Default: NORMAL +

    +

  • Options::level0_file_num_compaction_trigger - Number of files to trigger level-0 compaction. +A negative value means that level-0 compaction will not be triggered by number of files at all. +

    +

  • Options::max_mem_compaction_level - Maximum level to which a new compacted memtable is pushed if it +does not create overlap. We try to push to level 2 to avoid the relatively expensive level 0=>1 compactions and to avoid some +expensive manifest file operations. We do not push all the way to the largest level since that can generate a lot of wasted disk +space if the same key space is being repeatedly overwritten. +

    +

  • Options::target_file_size_base and Options::target_file_size_multiplier -
+Target file size for compaction. target_file_size_base is the per-file size for level-1.
+The target file size for level L can be calculated by target_file_size_base * (target_file_size_multiplier ^ (L-1)).
+For example, if target_file_size_base is 2MB and target_file_size_multiplier is 10, then each file on level-1 will
+be 2MB, each file on level-2 will be 20MB, and each file on level-3 will be 200MB. The default target_file_size_base is 2MB
+and the default target_file_size_multiplier is 1.
+

    +

  • Options::expanded_compaction_factor - Maximum number of bytes in all compacted files. We avoid expanding
+the lower level file set of a compaction if it would make the total compaction cover more than
+(expanded_compaction_factor * targetFileSizeLevel()) bytes.
+

    +

  • Options::source_compaction_factor - Maximum number of bytes in all source files to be compacted in a
+single compaction run. We avoid picking too many files in the source level so that the total source bytes
+for the compaction do not exceed (source_compaction_factor * targetFileSizeLevel()) bytes.
+Default: 1, i.e. pick one target file size's worth of data as the source of a compaction.
+

    +

  • Options::max_grandparent_overlap_factor - Controls the maximum bytes of overlap with the grandparent level (i.e., level+2) before we
+stop building a single file in a level->level+1 compaction.
+

    +

  • Options::disable_seek_compaction - Disable compaction triggered by seek. +With bloomfilter and fast storage, a miss on one level is very cheap if the file handle is cached in table cache +(which is true if max_open_files is large). +

    +

  • Options::max_background_compactions - Maximum number of concurrent background jobs, submitted to +the default LOW priority thread pool +
+ +

+You can learn more about all of those options in rocksdb/options.h + +
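+As an illustrative starting point (the values below are invented, not
+recommendations):
+
+  rocksdb::Options options;
+  options.compaction_style = rocksdb::kCompactionStyleLevel;
+  options.level0_file_num_compaction_trigger = 4;   // compact L0 after 4 files
+  options.target_file_size_base = 2 * 1024 * 1024;  // 2MB files on level-1
+  options.target_file_size_multiplier = 1;
+  options.max_background_compactions = 2;
+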

Universal style compaction specific settings

+

+If you're using Universal style compaction, there is an object CompactionOptionsUniversal
+that holds all the different options for that compaction. The exact definition is in
+rocksdb/universal_compaction.h and you can set it in Options::compaction_options_universal.
+Here we give a short overview of the options in CompactionOptionsUniversal:
+

    +

    +

  • CompactionOptionsUniversal::size_ratio - Percentage flexibility while comparing file size. If the candidate file(s)
+   size is 1% smaller than the next file's size, then include the next file in
+   this candidate set. Default: 1
+

    +

  • CompactionOptionsUniversal::min_merge_width - The minimum number of files in a single compaction run. Default: 2 +

    +

  • CompactionOptionsUniversal::max_merge_width - The maximum number of files in a single compaction run. Default: UINT_MAX +

    +

  • CompactionOptionsUniversal::max_size_amplification_percent - The size amplification is defined as the amount (in percentage) of
+additional storage needed to store a single byte of data in the database. For example, a size amplification of 2% means that a database that
+contains 100 bytes of user-data may occupy up to 102 bytes of physical storage. By this definition, a fully compacted database has
+a size amplification of 0%. RocksDB uses the following heuristic to calculate size amplification: it assumes that all files excluding
+the earliest file contribute to the size amplification. Default: 200, which means that a 100 byte database could require up to
+300 bytes of storage.
+

    +

  • CompactionOptionsUniversal::compression_size_percent - If this option is set to -1 (the default value), all the output files
+will follow the compression type specified. If this option is not negative, we will try to make sure the compressed
+size is just above this value. In normal cases, at least this percentage
+of data will be compressed.
+When we are compacting to a new file, here is the criterion for whether
+it needs to be compressed: assuming the list of files sorted
+by generation time is [ A1...An B1...Bm C1...Ct ],
+where A1 is the newest and Ct is the oldest, and we are going to compact
+B1...Bm, we calculate the total size of all the files as total_size, as
+well as the total size of C1...Ct as total_C; the compaction output file
+will be compressed iff total_C / total_size < this percentage
+


  • CompactionOptionsUniversal::stop_style - The algorithm used to stop picking files into a single compaction run. Can be kCompactionStopStyleSimilarSize (pick files of similar size) or kCompactionStopStyleTotalSize (total size of picked files > next file). Default: kCompactionStopStyleTotalSize
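
Putting it together, here is a minimal sketch that switches a database to Universal style compaction and sets a few of these fields; the values shown are simply the documented defaults:

  #include "rocksdb/options.h"
  #include "rocksdb/universal_compaction.h"

  rocksdb::Options options;
  options.compaction_style = rocksdb::kCompactionStyleUniversal;
  options.compaction_options_universal.size_ratio = 1;
  options.compaction_options_universal.min_merge_width = 2;
  options.compaction_options_universal.max_size_amplification_percent = 200;
  options.compaction_options_universal.stop_style =
      rocksdb::kCompactionStopStyleTotalSize;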

Thread pools


A thread pool is associated with an Env environment object. The client has to create a thread pool by setting the number of background threads using the method Env::SetBackgroundThreads() defined in rocksdb/env.h. We use the thread pool for compactions and memtable flushes. Since memtable flushes are on the critical code path (a stalled memtable flush can stall writes, increasing p99), we suggest having two thread pools - with priorities HIGH and LOW. Memtable flushes can be set up to be scheduled on the HIGH thread pool. There are two options available for configuring background compactions and flushes:


  • Options::max_background_compactions - Maximum number of concurrent background jobs, submitted to the default LOW priority thread pool.


  • Options::max_background_flushes - Maximum number of concurrent background memtable flush jobs, submitted to the HIGH priority thread pool. By default, all background jobs (major compaction and memtable flush) go to the LOW priority pool. If this option is set to a positive number, memtable flush jobs will be submitted to the HIGH priority pool. This is important when the same Env is shared by multiple db instances. Without a separate pool, long-running major compaction jobs could potentially block memtable flush jobs of other db instances, leading to unnecessary Put stalls.

  #include "rocksdb/env.h"
  #include "rocksdb/db.h"

  auto env = rocksdb::Env::Default();
  env->SetBackgroundThreads(2, rocksdb::Env::LOW);
  env->SetBackgroundThreads(1, rocksdb::Env::HIGH);
  rocksdb::DB* db;
  rocksdb::Options options;
  options.env = env;
  options.max_background_compactions = 2;
  options.max_background_flushes = 1;
  rocksdb::Status status = rocksdb::DB::Open(options, "/tmp/testdb", &db);
  assert(status.ok());
  ...

Approximate Sizes


The GetApproximateSizes method can be used to get the approximate number of bytes of file system space used by one or more key ranges.


   rocksdb::Range ranges[2];
   ranges[0] = rocksdb::Range("a", "c");
   ranges[1] = rocksdb::Range("x", "z");
   uint64_t sizes[2];
   rocksdb::Status s = db->GetApproximateSizes(ranges, 2, sizes);
The preceding call will set sizes[0] to the approximate number of bytes of file system space used by the key range [a..c) and sizes[1] to the approximate number of bytes used by the key range [x..z).


Environment


All file operations (and other operating system calls) issued by the rocksdb implementation are routed through a rocksdb::Env object. Sophisticated clients may wish to provide their own Env implementation to get better control. For example, an application may introduce artificial delays in the file IO paths to limit the impact of rocksdb on other activities in the system.


  class SlowEnv : public rocksdb::Env {
    .. implementation of the Env interface ...
  };

  SlowEnv env;
  rocksdb::Options options;
  options.env = &env;
  rocksdb::Status s = rocksdb::DB::Open(options, ...);

Porting


rocksdb may be ported to a new platform by providing platform-specific implementations of the types/methods/functions exported by rocksdb/port/port.h. See rocksdb/port/port_example.h for more details.

In addition, the new platform may need a new default rocksdb::Env implementation. See rocksdb/util/env_posix.h for an example.

Statistics


To be able to efficiently tune your application, it is always helpful if you have access to usage statistics. You can collect those statistics by setting Options::table_properties_collectors or Options::statistics. For more information, refer to rocksdb/table_properties.h and rocksdb/statistics.h. These should not add significant overhead to your application, and we recommend exporting them to other monitoring tools.
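
As a minimal sketch, statistics collection can be enabled like this; the ToString() dump at the end is shown for illustration:

  #include <string>
  #include "rocksdb/options.h"
  #include "rocksdb/statistics.h"

  rocksdb::Options options;
  options.statistics = rocksdb::CreateDBStatistics();
  // ... open the DB and run the workload ...
  // Dump all collected tickers and histograms as human-readable text:
  std::string stats_dump = options.statistics->ToString();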

Purging WAL files


By default, old write-ahead logs are deleted automatically when they fall out of scope and the application doesn't need them anymore. There are options that enable the user to archive the logs and then delete them lazily, either in TTL fashion or based on a size limit. The options are Options::WAL_ttl_seconds and Options::WAL_size_limit_MB. Here is how they can be used (see the configuration sketch after this list):

  • If both are set to 0, logs will be deleted as soon as possible and will never get into the archive.

  • If WAL_ttl_seconds is 0 and WAL_size_limit_MB is not 0, WAL files will be checked every 10 minutes, and if the total size is greater than WAL_size_limit_MB they will be deleted starting with the earliest until the size limit is met. All empty files will be deleted.

  • If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, WAL files will be checked every WAL_ttl_seconds / 2 and those that are older than WAL_ttl_seconds will be deleted.

  • If both are not 0, WAL files will be checked every 10 minutes and both checks will be performed, with the TTL check first.


Other Information


Details about the rocksdb implementation may be found in the following documents:

diff --git a/doc/log_format.txt b/doc/log_format.txt
new file mode 100644
index 0000000000..3a0414b65a
--- /dev/null
+++ b/doc/log_format.txt
@@ -0,0 +1,75 @@
The log file contents are a sequence of 32KB blocks. The only
exception is that the tail of the file may contain a partial block.

Each block consists of a sequence of records:
   block := record* trailer?
   record :=
      checksum: uint32    // crc32c of type and data[]
      length: uint16
      type: uint8         // One of FULL, FIRST, MIDDLE, LAST
      data: uint8[length]

A record never starts within the last six bytes of a block (since it
won't fit). Any leftover bytes here form the trailer, which must
consist entirely of zero bytes and must be skipped by readers.

Aside: if exactly seven bytes are left in the current block, and a new
non-zero length record is added, the writer must emit a FIRST record
(which contains zero bytes of user data) to fill up the trailing seven
bytes of the block and then emit all of the user data in subsequent
blocks.

More types may be added in the future. Some readers may skip record
types they do not understand, others may report that some data was
skipped.

FULL == 1
FIRST == 2
MIDDLE == 3
LAST == 4

The FULL record contains the contents of an entire user record.

FIRST, MIDDLE, LAST are types used for user records that have been
split into multiple fragments (typically because of block boundaries).
FIRST is the type of the first fragment of a user record, LAST is the
type of the last fragment of a user record, and MIDDLE is the type of
all interior fragments of a user record.

Example: consider a sequence of user records:
   A: length 1000
   B: length 97270
   C: length 8000
A will be stored as a FULL record in the first block.

B will be split into three fragments: the first fragment occupies the
rest of the first block, the second fragment occupies the entirety of
the second block, and the third fragment occupies a prefix of the
third block. This will leave six bytes free in the third block, which
will be left empty as the trailer.

C will be stored as a FULL record in the fourth block.

===================

Some benefits over the recordio format:

(1) We do not need any heuristics for resyncing - just go to the next
block boundary and scan. If there is a corruption, skip to the next
block. As a side-benefit, we do not get confused when part of the
contents of one log file are embedded as a record inside another log
file.

(2) Splitting at approximate boundaries (e.g., for mapreduce) is
simple: find the next block boundary and skip records until we
hit a FULL or FIRST record.

(3) We do not need extra buffering for large records.

Some downsides compared to the recordio format:

(1) No packing of tiny records. This could be fixed by adding a new
record type, so it is a shortcoming of the current implementation,
not necessarily the format.

(2) No compression. Again, this could be fixed by adding new record
types.
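
To make the physical layout above concrete, here is a minimal C++ sketch of the 7-byte record header; this is an illustration assuming a little-endian host, not code from the repository:

  #include <cstdint>
  #include <cstring>

  // checksum (4 bytes) + length (2 bytes) + type (1 byte) = 7 bytes.
  const int kHeaderSize = 7;

  enum RecordType : uint8_t { FULL = 1, FIRST = 2, MIDDLE = 3, LAST = 4 };

  struct RecordHeader {
    uint32_t checksum;  // crc32c of type and data[]
    uint16_t length;    // number of payload bytes that follow the header
    uint8_t type;       // FULL, FIRST, MIDDLE, or LAST
  };

  // Decode a header from a raw 7-byte buffer (little-endian fields).
  RecordHeader DecodeHeader(const char* buf) {
    RecordHeader h;
    std::memcpy(&h.checksum, buf, 4);
    std::memcpy(&h.length, buf + 4, 2);
    h.type = static_cast<uint8_t>(buf[6]);
    return h;
  }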
diff --git a/doc/rockslogo.jpg b/doc/rockslogo.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..363905af5c269839cf2ebeab95ebc6414e97a88d
GIT binary patch
literal 137232
zlfC8k$H$w_XMzTJNPCDEk~Sva@Ui;$F2%Mp&I$UceSwm%3l@!9udjDg! zt~faud=b!f7U$AVXzpbwG{la5v3IpKI?(gPl*~F*Gczv536fcdwss1u*qPg=p7m}f zRHs#+q8gF!wZT`U$Z5hqLwOtd%&Wh}_&Ew*Av<%BA>;PR_yi>Ozc`9jgAclqT?O+r zYc##O^I4UIGpj`TbZY(B0~IvCr&TebJwf=!{v~F=jy6RoCz`*e8`7f zk^vlth|wuc9d?e@^JeO;PVOI-``;x_4`lS+^?RlxL+Ph@4&+nnjDf7q22%{>L1a7T zdihL%#JA$T=~QXd+l{(Qai$;ayCk9u!@_1$n8}3$;n}ydrd+|hwt{$ zMT<6GHhXc?9dI>W|UsZa4-&O*m zukhpa4Q<$SEYx@xjmi)0KjrvxSaSrC%td2r<<5k0;1&<+;rD5b1DmZ~)Rg1;tb$FR z0%G)+Z<4?LODwzYOb*d?8rbCR_JBF11K
  • $E15kc5WxyWN-vTJhPLOxMD0n6MJDl?sR2trsbS`K0Bb2 zK#U35zE3V7wG}^ku-_3FC&gULXnIQ={oM*PhXQKn1>xnxQvA-mapy57AA0GRV6u%s ze&7|=CzpMRS#``W;QEr@rbkdRrmd;$=x#9iYM`9ZPANO{ zS?`gCPH2C2D$1#E<(W1EWsLEXd`^va@+#I^>vd!==C+&saf8wQAtAe)+1aoe%b{Q3sU!toAdVhhGXE$Ef%66P^Khjgu?r%$nPPxOi`g6>O z`3FO+**yU2hITmhHiFZOlVn~JU2Zilv^?O>`cZ07!Kc4t0Tkw|!NC^x49c3~(TcP+ z)olX@3#`+}GtA~wvw4p?;RmZ-1~!y4PI831t(NGzzDWRiGYqiO&CwXWbP28{R}IB> zy`5M_m=sWV{0V@5{%1bYa_%K675eFrID7>I7M|jtW}s0$w|oFFpZ>@Fn_j!sv?alO z%FKtP%yRYCMl7S2pUt^`RiAk7>~i;o|MM3MKGu!W0)%OvxxSmd*?5kUuR{aFSZJ<+ zq!cAZY7?tWmXBCV^4GU-iJQU6UM73j3~+v)*hmJe(bMGo+8~aPB{BxS(|X=N8yikE zFEr52>dZ-*EL3Pj9n6(v9fhowdus`ey4$f~nlma}ht3rOz4+Ugy~>OG#?)P0rfnen zrGV#dRRh)qOus5T*C(8V4$fI}BdH)no4B{^Ya^w7K^c8PNfJV?X9%BGLGfL4OVWjdV*H;&8!n$;K)=Qz&q&;I zTCn3bJHgxcg*|qq8{#-L!9_FymIJL z3b++(*=R$TMSaY}vR|vpS1*#0AC#Pe(iNGR29S}W(v;s*s{DJ;PudsyHbrlgvL@v; z+F#G=E{g`Z57;|Q7vA1ve09TZyoIAj{BbS80onK*aoG@O(j;@xa$887aCdiUlESEm zP2SChO-3eO``(jfgZE3LH->-{h^(cv|M+CD30{k=;q<00|N0@;gp*nfuv^lySEWs;>B!zh4wftMQg4> z>+RVK{VY$?a_E72=tG5IDXuJBTceGaZjP>%OUJxc$oiEy>V`WH$Zp&qp+Dpp!9Xyx z)BIr`fuV@2ZZ(bLZ8>aHmGAi%lG^$g8omas^f*xOb-1g$OvL!bysJU=-#$+uzx|>> zGDAY!x7J-xyUZ{?SvNCj?RDh$y&Kug7%6jiNu{+77c|D%6Cy~|G;<9v4iq%=)!t3F zOtR_u(y>1=QDEDpXk2wk(&5uW&c({y^vdQ}oG(9bemxvw8Rud<=Dp!DygqN%r>&~K zJz=-^JCfzUm%KcwdpRwacdrk}Y*dIISXP#tT-_eH`t9s~;6>qR)~mA154j7zG}~*5 zO_D6P&aci|N(?V`U$L$a*&g*ya5+@g9#Bbs-aOQ*qLGQ_C|SoRxQ+o`byKOS)C}40 z%Kg@qi|0%h@lzW@S_FpfKGhzt{|4e)mWv)3ys^H<-afPJ`|%BpwT;IGA|NB&_ zocqVtLy?kcX`A}?29=oFNi$dMo1_{KME+dg)mH~cQV3DK=Ly`A?ySj}=<1`9Gi%TP zxCpj@oZ2m4byp#<=*W)7uoXZ7+Lu*?0^c(cU5bX@xdMt-1l@XENQ&FQF5(hITQMQG zVeh1wCaiA9wIw=Z9j_$Q12I3l?4Nm5&)cK-@3KF{>n}QyK0H7Q?~0Ndiyccw+;U9d zsa@}L$*6Jl>{7Sl+Q_d_VMp|}Bu??scBi_v8tOfW=&TP_4YnSXvk?-(+X2Z6Kt4od zZat%$^%=^%dI(5gaH=>1%HC%m8^dJ@y>9$1=*Toa9LB`PdXHQ;;dw>bhi)nvy1B=T zCiX9eWp4^x5O33`>+AypoyFw%TP*VmTO4~f_X&b-uB{MqDW#sq>Ak?ai2= z`_WF!rs(o_UII_0Uw4i%VhS*&02r4%3xqo2T?p?mAKrwPefBX3P>lf1^ zajYNWD&1*xr5bIEd2AaEKR+gn&h6a5Ap`XCe46Am;kG&{gXKk8Jm02lGrDRcSMD!_ z2}+HL8CzGzj6Cv%UPR4Q&uyhB!c^C}eN z`GM!4hCm9$@~99jm^4pPS}_upS97{n<+Gqz?6@rk_p$s&s>bW^l>Su%Lp=q zb@EILT4ivMbgP6iO`X7^ZIoYC)V%C}dH+SJXab>SrW7c-xxNW+1ydBHN9xyyU1M;6elq^%DN}37 z+Y&|M%O(z-jdyBt&tO=u^TcwyaET$obqYhvX+~I@^>P?`oa&(lQ`M_LW36|#S`VO| zO_lpV$90|1JtEg<TxcO* z`EP+1c9bdFny-)nUXru9kc*pXukgqT7wXa!Y~~acV8cmQ&0af5&7w*iHQV5>b>tuhDgQcBOXz^aS+!rlpV|7nH@B ztSIxPFZpGP@p^^x0*sj*?#XQ=rPrefsF1tiGN!V{qT1c^>I|9xNQ#A{u+O14q6 zV|FiZU29E2u`*?>TdJxo&kdk{ns^QFO6Wn-1-5|$v@0bFm&)6!Or=txEHK`v$_EN& zYX|88UHGcWs}D52X?;?hkH+}}?H!J1Q}U$u;8e~Y<(`=d%{|lYQS&)u-bU#F+x%2a z86E|-DIKdgu5`rImjr5*Bu%IX@PvB4x5x_7yb+y2Mr|eqdsR5r7uYMxJb&)u z`M+!HCwm)ZqS&;$5>!2NXhu6*k)g$lkdm3U_c*~lZpm;fOO2;PF`J`vS}Awdqu*aP z&2VkL8GZ*819$a{iu!JSz+8WJo-EB(+1lqXq8Gj)IHM0Pp01L6!rJjg8It6`T;+z*C-S0>21B8)-A;;tR5~V5AZEyb2#9N_RNdpmt#a|%o=t=etKF1T- zoJSIl!kPxe-{;eEh;foPr*b0jKh*yQwFjIYF%GA6KI;qB>kC*O z<_ickUvzh|k7Bf`rZbg(`CO_GwajpA^XMU*vVGD<)@Q%%2gUWp@yIYiwr1`go95 zvCwHFXz6a8W1_(=7K1H%aa#xKulD;u%J?{uA#Nr=VU$&*+Em@HVsPH!RbVdJ%#7Eq zYab+1t`JeTua2|r1RZg1;vZQ$Bi%*3zVXgmnt5GRrGCNzqCC7>2cMnUw48?o+c@)w zw2t~0Y!RM%cC#PkAa2mGbGrDrJ3Rgl>g%mM(HCD}=G!w)Z+MwoE)Sc{$<$ZcNy{iX zC`*bMH|ja>Tj|w^+PQEO5}fMqM!K+4j*nxY zrFXIiFP$d6W9DE@TY_`sel~|lvTO=z_a%tOvO36;A;$Nu*eVF*R1}Rdo3*hUN$`O|lRiZwx${+Fbi(16Rbr4-BgL?eePy#lJsA9doiu2hh+ygj3n z4_LBgcm6%Ut}p@iv;O)vhQXNqipP^b&)Oe_ zQ+J0o%$43&4d9Ell2zzt%`8eCELE{MGEA%oQ|l8!tYvI(?a&O4Dty^~3%Q6i+qr<~ z8KK2uE_#Vu&UM?Uo7g)u)s|FuEhUx)y0nSj1T~OS%TsIKI1S-ZqVsrRv9IPEW;+!P zHLR&L8?KnxKDt+E6}Qx#~!DU5dXgvSeS3{~&$QM|HR`UTR(KGa-*D#xU9c3F18p2nn3 z_m@DY0Fsd(DS|Te5aKg88;sT^uyiGnj2F 
zpggB_@c8z5-FOqYU?G=M=5mFqO_hGu#6&;XStpJ#3@LB-0|o-QXt)lh5blM*hKxo4 zdNj-Zw)w|}U#ZneF6T3S3V0cDQ;-*}UrmFEC}G+RUvKF_8Dzzp37Yn6_d!)J^4w{Ihi zHC|s3C|ct6U91w=EsoPrJ@)|!RP>I6j+E_gZj4|5o>7?1UNSg#u|4~(?&~V^Ce|Fw zp9S&*lAZ6s^@L`~?z2MfM=z}EsY+;Xdx3~QEO7jKbyD_W z!=*#fIaCwVr8(x0*y*dqG}8|0eLUjswo2R0lFT`L?O_+vKy=h66*})*_gYDwcY9|s zUV+WebBi8@toJF3Bt5Sw$11Hqgnq*mx6cDrX=*U)P7P{u-&l35Q0hoYsY*B6Wpi<& z)QIT}f@t2!nI7MO+}37yV@ecyRiVjBQJfw1BL4>6iOC&&P4G!{eRN4l;Ioo|36%gI zPfw>8=^?DdQTM{GX#FK0Y#=Kg1DjL5-t|rJ+L1v;9C{DBAuhd-OjqJbS7=(Pm5Z2{ zAj~jGVnYTV8b+TRPOSd~)a(&Qz<*>h|MosL(^86re`dw;o`MwR=Lp5c=biVWd<)2b zYa%lgs~)QvqhcX6?yArPE{8Soko&=@B2aVgY}VH{wCV%6#6>=*4z+(v`Fd60iPWr* z!xNJ}FE(6dKK{<0R{iI>nq+^!@8M|PTZY9m@;R|@pD_B)ijWw#L|AWP##yk4(1q>D z9R2z{_BfGbLB$f{%j@gizNFh4DgJj>if30?=E>=ClFxUEg0~NmR5D-TV-IxI3=PP9 z8Uoaxblf2b<3;D;Bqq2^kKD5jiImB?reo*Q!wOxQ80<@{p<_@EGt&V=@@x+fL~eE0 zYnWfm(Ul)&;nmkEiSIfd*fPu>o*K71yf&B9`iORZG|*(Imfj_Xg;SuRC*4yW%3$^w zDjWC4MXZC>QD34h#tywEsx46~_JszGsG5U!V(Bq;~ zEYJ!VMULPA!oA*ETT+VmN?*){Gn}*ZT;y3<#f{xoz3sfum(o<;)D9NQ$p6BKPgo-bO1Tubk*E&s42qy?}Q zHpH8J1xG?SJ$wezV#lL_zf0m+u@@z{S|4#`niQJOFZ@=vHh3w_ z7jefedPQf?B@|mH|66VO1V9#Y_}AicS52L(CUGxIL#!gC-*Y!gW{I^yC51hq z!k!}Nx@3u6H@qrbRO6cV^BC<~)A_COwL&31cgFVS9Y1MWoX#@p!svLs4EYlRt9=?Upj7AaNk3W2|X+-ch?mB)25 z0Lq>5rvpg;la#>!%4?s(j?%xDqe7(VcbBu+P(faDpI;csMhaXD#ihpyT%gx}DgT6M zRnBqVUh_fJN{gmi=)gWu>NzwLPWmm_VIVYSUxgswPbb8)Q0UIl#y`OVbVk`Uf{=C6 zh6wBOnkJ1i@WhJLza;90ZqvHP(+7}fMa5p6An#(>=fOsv10bqJB#zUl%6txe`==q! zPA-r}CrLZe(ly8OI{V_c1G|=@WrgQ=oONE4Qj=NGcOp|GX-tcp7szZNlbI_!JaUhm zJ-sL196d5?PE5^%&Mn5Nux!_P)2yt!ZUME1S7dOIGqANzHAl;o2VfYsYCn*DdwL_v zVpJ#D>{Xdy>er`dN$*y%cf`)H|%Y)Kl)ZC1<#K@RwS8JO#fltss@94P>)?Az>t4s2N2B-c1`J{S0(K;O<>P6ugjFA@mu2BR zznn8o)qvSoF}pCJW%=XSjTu@mEfecJYdd>qj}oEri0sO%Mn+4?Z*0xFJL}j#?2Ce0 z7RIF+5p6~nagNIvtPb|5rHa}1>#YsrR19}}nfwEtyA|6fA8i|3y)W%H&n5&T_@{&L z)Y;v<)%BfrZ0A#Dj76@BZmdMwoK;>9RHL#MTF$$F(8UWg7kN#^n=LzLwpRk0`drS&+|2gHd$~E2a3VRaRU(iF>FK38qRqPn zR+V|~duHSf4|~@cPO>}n#W{q&CMavIFpGj>AdX!kl+zW0`|>O6-xhn#k34+5p76uB9z6|*`>M# zq}6Hvr+P-{bVGKZ+Xa%C<*hMu4Za-1_WXn^)==xoF??nvPyCqsfkqtk)GtEtNZ1if zFGUO3()1g8NUV2Jh8tERnQr;1O#(Z|i?M6FVb@-Y$O{hXwBUl9)#hvcD19n1wic3x zc?w!)MW^j5dNyF?Ch_hLM?Eogsi$&I4KLHV5F!`9js6BPfd@kucKk$K5(M(Cn89r* z0W(>5D~Q8{!d;hF4v)L#&)-@v+6DarUByFpV>XeyU!7()0f!Gr7X$Ka6hq*Gucv0# z$BM%@MwzTfAhT)Jy**$`=Q?Swhvv>%$-yS$(v~do!m=r*FIDa|h)GthKE&tEU4<|- z)fy^a0FW+;yulRK6w{)rPm^Y^%b`=}@mro7XXqd?0};;;Flrh5=yfKTgr684qs|&II(+H48IG|P;_HyJe6r9}zw?@K zFz?0Gwr{-mLfK0V$j@&u#4tM_gEbI9?%BQ#0dQX`QRm!AbrQ7@?8W+|o!P8KR}E!u zeX9dG(wcaRyCJmPpU(TMroKuRKkD`LM>NhYvl-(|c$2itZv*_Q4aOO`@s*WYL%AQ{ z_loTZMo_?2QNauWcEcIV3ZPB?n!)fF1L6WHsZ2nsoNNxwCDN#o=*&!>wDKDXFX`pP zaz}yxLWkur(<4mO~nj*|yE*PQWWRa)efmqc4k z60^u2+_`hJtC`6SU6P9qS*R!d+@9zqfS1$NP)ka(ht}kFiHv4t_GQl5vgJ4ZtTJBH z(6HqbaUCl=^#G^N>}KMDb13s$Gm2dufHIH5aK|6=3dP`-@uadowJeo$l5ft&`+nYA zzlx(56PD8N7hOSna4R)Ip6_K;7V7W8-e&tl zC-4T;E-Z9(D6yWyTDy3hV=7en{(kp|DI?8`QcQu{_3{ecjp&EC9Y8~qoTp)4BExlz zF6~wN;7#(~0-Jy6mnC3Ny;Bz*67NCdn){prkv*9oLj68`qdv|v7@ZBo)3QY3xI}^a z$Bj{yU!Vax9xVBd+Iw?e7^Y2@_v*)He3qUA)%N8nXlwXp*6|c7-py}b8hO9|MO=_& z4z*e4n?nyC_=@B920--LJZqMDPZCDrr5+-@nwPbyvP+5zxR>zzJN?jL>q=|V-&z`P5=YqJqjyoi8|F_ zLf~=y_uo5(4))+_r0*;rMAv4Zk6~+kKy^n4Y|8g<$MQesI`w-G~_{WpVj zcOes6C7t0sN1mD*X=}Sh>Rt3!)GrXn-Fe8F3$O?;thc}5bg&)2KrtWoz~?o{wfn-m zNJZ2M`!3M_lK?rj%{!953FJ-Aqt87O2nm;_kh3WGzPvN!>^hKRpMD4Jj0OSNorkr^ zAYO3f5SQ7zRFs1SGy&qpVled$d%qYmKhUDhN^T!!bPA`VE-VJ)d+TZtKUi%eyl>Ul zbAk{L`;0OkimTh{Wv@R3Tm^)W*T>)2C~8W??OS>{xAr!eFxA@dTv?a#ReglZ%a?2g zBr(@I>S&Fh;A6$z*>o#x;K^6v=iuY99;W^sb_?lX|PCG@*~hz z%RGW^T%MTD=Y_x6<#%bBI*WaH2FqU{wuN~JV0r;7@D82>ub{;z>9mrcQI*5bS>l1F 
zNN*}4bRw}-luIQZ8&#({M6crx3T~l;t&E=3?s))22`0@t9p3Fy6PG{}*V~kmtm3bX^N! z*)rzMh<=zC@Od|j0dBYe6p_&&@|)aKy%z50gtU}@pNtQgfM((D05ZH$`oq74od36e zy#NvnzjX-qtG_fX1>2LqPb-6?*8_t;&c0ncw zDi_e9qod#w0|1+T9tSewaEGnI{HPDB+Vn3V#2zUnySTYKp`@cXaXX#bou$Tkl zTaRa#hyb=cQw2y=|MfD+92Ql!^S?f~5Hs`)dC_=lQjF^Ch@7;wTGbu);}G^`9CuX?U{9U$jRJs_b*TsIWAb&9DkN~ ztN_Rd2r{<-8JJXXy5l1xIi(E~SwDbPrkRh^dX`!BynS#LDs_X$xL%}CRLQ|_O47T1 zM(s=fV7TJu0MboO&I?$GD=ff(qi}ZEtGKsf6(e4r1x@X8{mOe&h(X7)r90~68Hx>G z7RWj^IW=1~!@1RJ(}MXoc0Aegf27FYi@z0{uN@tBuV`ZJ93jcF=E1Xj!w$!b*%D_h z=-lHnA%l|EGYx#LZx+cUm<}TUaciQ{z!tRs1?nTOhvS2HBU@66B|RnnWo<(6z{&)U z1I%-W8j$^|0p!W8NSt@h79jED1b=~er#OI_;Q9s1{rC&C5;((8cnbgD+1=oOds@*A z;6HpJ=O6Crf9~n{r&a)%5VeQ@vV=j%7Nu^;$`q^`f=@gYj;^wU&oPsA91T#!Sk0kF|C$ZS>ha^T)goGg}l z#HOIIOnl54a?3mVS|gIP>!VGylheNDioh1z&+G3wr#QMXzmpO=!!N% zkd@&&I?OWMiR-(PIOMSfO7)>)|VYFQ_uO<_feOS+R207r=+;o4JDP2Z?F}G{n^^0f5LF=3T=^;h##0@a$hO=sTBM&|V}eozaC`qKFEj#xDB z4G~5I1AdtnhbVB}aq?1wZ5SU-kw}3>fA}I6KB~I8+-AZ#%h^p4lyILV-|lp#WU7t= ztMbn%S{)#g^{;y#XZJf@``=P2OyLXxJ)V4?C8e+@+FXq&-DCM)iW%lDT!|!5ob$T2 z?SJ{&QGVIwlI&3x#@C1=jL1^3mh;jyW#DXgK39x8lz1M`&w z4@lf5*cn4v-#J%AZNyWH)cEIGv9rDpo*Wn2f!ubqaPWn2mX@?D)n_-#KO43pP3)Ig z>!0?2b@GlNt)ROUaG7~_bW6~n-_jY9cJG8Wc@?R)PAFT)19C3#Ct(ovzYK|3WaArv z5u|@`{%>Zl&f?zevsOFx->gJRY~TORqND#GG*th8S^o!1j9Z@{$2+qnZ0Si>ikA;r zVsUl}fRkIkVgoJaQ&n8|`>_S|8)UNhs|r5Y{}c0nIpk!4Kr#oM9P(Yv)(-Hfei*2< zhlsmfRGBW6==oa8{l`keyGZ%94OxE?@p3dD- zMjz6Twf+Ktn^h42WAh(*L()TV`2^A`;HwM=F~&PLdgsXBQ|ehEshq_GzC4ZxlRsSu zUeWk4@WGLhV{Fo9yu|6%gE;%)44E)Hj&nn-a=v)+$1X8affi%7ZsKjA!|cRr<(o6@ zdav^=q^SygezYZ{$c@?i_JCyai-C9 z4b*05PG@{i15D@F4!C%fpxNlxD83|-I;P09PN~<;ZwVY5FT^x8O7+@!bx#~mSg?** z)pL|H`W<8fKr#~y))FA&81w!5cGk$s?gLlb>Cnid&@l7QXyboMY*xE@Hd)m^Oc>~X zAQ+N$ZG1llahQ@;TQ~8rx7Mg}#jR?=UTCF{$EZIT>bHx2^YK-<+!EuO>|=5?)voHNeRUC;6Jc-2P|Kh;^ zH=F~2-A4v6e-81_cXe7fWY6?Az`KcV0%tc3gMgkO`h}t#1nIkHQfen3x5HDy7OrAU zYe^RB!1JztWivOuq=yQHy=isc*}mD~MR{qoO%X#r6Mc9>bXmtQ4q)gHShR+m(x zv-L#}UnhZ*i`wJ67DN@#RL;3WuPf==UsII4HZrO+0vw?h>Hl{106gULIxTfXvpYUV zFiO1YFt3v#k(|$@k&%JOEh0Sje8MyBGUv@BgU7RqLO-p3 z{jD2PK$DDL#AN9$nJ>Bbdt`i|7cs=h*=#_UXn1>K_x4tP3W3WlW^M>sc3bLch%j%p zDU5G7h*>IfSS=P&bozMqWfl2D3~LiyPWh9i0FV^Yyh{L&50Cs`?R^P6 z)a(BLXwxo5Buu43M2fUvS`MOv5RPvdp(qWJ?rEn6hS_%D#?WWQj1@ciCs` zvwr=b>733v#kse8@9*Aw{^y?8t1>f<@AG`0@AKI|@Asq8<*SFPmB7cutKi7m5JfaE zm+nBB9O2_=PO)_7lkQ7N6n(~Vqep1e{v5w_&7o<(+5St=kUUZK=le5N?KvC(K zLyQL#hch556`9NHavYohs1OBG{e%g*IL~Pe86bHJ5p*ch;aBtkWDD6%2zZ|| zV-F5g!D&JeH$?U)%prQDU2HkP0t(GOr@4v2E++voE!2BchkJ-XE`&A%QjVX7QvGSD zImBw=aRPk51Jq%*<3ay$71FSXrBj##3NJiCwpt6+$1zkv9Ysg{{Gp%)CPdsbhnSo| zIdZ(bK=@QoAoIY=0h(vQdd(YXq4`W#{wHr;^!sleUkVN_!huD8^m&n?qLt3SJ_?Z2 z1doDfHSf-+Ovuk*6^t7+!fg&7oL1J#B8su;ZucTfj-wFQ!F(f+w8B9K1PWB!8_* z(GiQIb26?pBy7p5%j0y`$V|=jx#V^wN6dM=!h3H*!nUGC%eOpOE+ac{Kgx{B87&zZ zlFUU)zdG<#EKVyu;%Bi#0k30931-RNjxTbzfzmOD_`$Lp53dK}#JC*vW#=O6r7)92 z7|JTZH(v1)4+PWg+wb~P&d<9ftsej=WCX{p@q|tk^Q^->#P)#h5BYK_L$n%8pj z9y5=pz3McbWN0$AQ&U+`eH8aNiLw42yz*HjwpD>sgTwi_lbUe%jYrFdvlR;!6Ozu1lD7-kBfkEC%3~UToIaK9RWq%g)p&oX9w@*&N(*1$(8nF`Q;I!Ix zb9iNz>Fopj#9oN8kpo@@k~}L48Wc9b<+K=vih?<29mTCtgSz5|lK~nzQS|brGObz% zvVR0hD@%b7eh1{O3$5%l?#?0JU}NVH>otOj#R^+fAHm0r!WtXO7_Wu|HR0p{Ok3|e zsxQW(kRe&~&Zo3Yt%rw7sC90l&Tg^}YM{R2UExGmJsC4={HblK$^5nAb<|pg^$z$R z!Q}m+WUnH@t>d~cELUU%5|uwh=LH`qSZUq@D{ z<#WD0(4S}EEyWerSG_0r+VExHluq&8jt5gNX+1j`Xb|m-nnSqWhk8q{(J^yDPwy)1*%9oEE?iEt*sxDC3LSd<|r#4ytnDB6$J8f&3mV2_7e z2~#hTnjmvdT|s~KYoCoXFgHsL!CfYqcD#n%FO#9d6^V8z&qE=bbhe!2a1XaGp2RCX zc74*VP~CH_;ZekjyYUa=`M88`qJ~IsjP$C!OEvQ~1BEyx&M4b6uXQM91HdtXWwlwYEqhYuRk%~Ok1X;t*^C8dUm~3xyg;~uN9)7bweA56lBH)X7Jtg;Y1Cc 
z`cF>Jva_H5m~2?ln|wIirc!15qtoUySG~4H`PaQ^jG#o(B+8)zlTLoM;)V>j^od&E zB4op?<=9EZhBuxASbpd2h$b(Vw1Dx>lDvz2%~2Yz3 z4v&tc_&zIoC*SvmgKFA-BA-gNk{#oiHIyt7Ti$cK^kgf2FJw|ahp-{ZTQjx>5UH?x5-v-62Ag>2 ztu?bT4Q_lwWhW`xZYrf*R`*xRKNmYAv}6&=1@uap-mJgieDdKvB*rUt7Iw5|{j#4eQ z3cXbKw3t$z0e+b5qRC`)E!C_PDdThxlHQRFXU!UcuGu8pmtS{bvAW3aBtZpeY0^hR zQL+Q7Ye{1ku0%k6b=<=p?nSi+x^EIARF76#BTcN+&B|-<7V4Dg_^jm2!A;vYNhJKX zh?WhfTZXs2`iHJZvHKn|SN33Nyrd9|?VAZ+q@+}Nw07v&wXv~Fd#h~4{42*s-$?-u zUEQCL3siSA@bYp$)9ooOF>#n5U>*Hc5NwTv%WvQ8(%UVz+YIH22_Fe!Y{aq^x2hT3 z7lU^5GNhQ88*p!mk%8yu5c+%E;atoYf7k)@hjxhnL)P_=7YY1lZkz==ZqcAhA~^w- z5gR-FcqQ`?^A(}DE>8sd7!?PMGk+>|7~m}C5ZIctC_pL_Q#Om?m0Q!pu~NVN%EJ=N z{mBuAAIlDV6zuZZD{_JT0Dm{(7~!fURT}S!oDnr6(3E#$QKfcU(bD~{YBpuYvyvR5 z>JJQf>%BBW#}YrnSFVJ6G9wwwMUsNL(hKms&X)<9sT5R~#(GN9P_@+V;zJ5g<)&pBIp(E_%=w-KOXI}*A5EHW{!rMMkQrN$+WYMHQ%c;FHW1p{vO-&X zR%jdXb4H49f#`C9jMTTk?wi#0JHh2QnJ?4=>9L=O{%w5svxh%#n0fbog*Ns(G4*$( z;S{oeKcxO|+;sdffwH59!M*tm(~WT;J`)#i`n z^HGbsG5Ht)XCBWgSc)Q961G9@NLF)Na-`vzRsK&}h4yVDwPf0Q_gK~F_YRrAFY3R( zLp5CME+LVQF9>kQdsMvUOjOG_{UAPqj#F;9TFX>+*ihz(Dkia;=53+4#Oj;TB=Cz` zo7xQ&C^0@+>!4#CTkfn~o@mAZ-NSc)`&pow>if~MRP2Sr;1~#!9dqdTtB*-Rs(u-= z`}1dEh714NN;Z5OYLNp;nG)JLDqD=?NEK7VC^mE&JUjf(q=Rcb{TRo<(cX1O3%Dt` zbyiHn2Y5$~uAl+N973LWn&8|Owlkub3kpoYioo&j#x;~{bInA;^Biba3m0G)D$4%b zKU>_7p@`cvBFww(nU2lSpj|Gckjh*N6&MrxHe)srh8sTszSYEG>NLKA0kVL$co@Y@ z6Yhb9p`7!CBFUBki!(J$V{pnt^d1t6PU z^QsEQtNFM(KRme>EqI719VEooNdyNg@Pnk6uw`croht$aF7JYwPql*?mq7&N3!KwY z#-=&Mg~t$0&laOoxR#oTq&v+aR7~Dkr!(%4pM41EkRGDsUIIOq06U;!@SxxZpFrJ# zM%aw!5Rv0VBSYU2R$6O2vH&%Mcnt<{7*4`}8iD9lnDLX7Cewm?@F!)46yqU?cHZ6q zki8>cw+eyx`z@!{rlg0ib__imFhA6>B$v1G)5M4F8Y*8ZibfBbLmZdE=?j9&fM2{e zsB~sRQ~UI$nVXa;)>Fz8?U&$L1BMo(X~vha?l7#i74$uSE@xgqJ}%*wtX`ks|meEXgH6++{XTQho>bX zyUJ>jzWo@I9ic?B3ilioKCa0!!i2q`5zS|y{yAzs<3`{ibhcVj1@~E0hqHFbdaq%7*CQwKGE0$2R~{< zcPnCBz*Z|pOVstmIpn&_HV(ERF$#3+ zSt38B0{?l81F^ZvzV5JjL}{D-QqUf%m)k@ss}51Ycngy9>s-7Osd!1ZwMXNAWV zB6={K9cHj(URM3Ik-B|mc6R3DcQTbpC--Hiix>N=NIT$)X}&(r$}aJgJle2!gWbXQ z{;{!sZE@n>O!4Y5P|s4?-oVFvw(V`~#gv59m#`?->kVk&qp)wVmbtbyk&$9dE}M2~ zq5Id`UB&;v41z9o@W8|lo~v@3$=CARhIpWyjEqNO7BNSbb68t-3iu@{$8NuOO7E8N zNH!hNHqPWxvd?#RxZYSsI+e}rD=B_zU~vd>#L3GYG2&w*ud;pDok}()lj%-4Pg;&G zBrFdncN96?Ymc%|voO&M-jjYPrPHN<_>IiFCHL2_+faV_vT4mEO>eivGzLV~RP647 ztdtBBH*@PXmVFUOPYnI4Q#rHTjJ6Y7)bVQeF6unyuBL-!SI#k#R(8w&v@tXRIHH+i3IG9YM@@hEO6)d?`G%LyaV7#qv4_ztWVT>Ai{bii5e=6Qkj zn8w7hmWJFjK(KGu{gz%ph}nEe9v?U?tipEa<01~G2seEx#uqdsxPyiS59F7AgaUN09N6tbl5{(4 z%or>Py^Vp}g9cbq=P|qWw^E0>4W!x7m#V^wH-=l(z}Dv#n=K?P1}AcgOCzQM zDUaNPZnV;1_&}rOSZf7ImPXY;jVn(oC@0}MV^Eu%cg!Kq$D^2n0hMSCvIcxo_AWFc zV={+`s3TnPE%wbfa~6kjx3@i38CHFl^+f3qaq?t;G^DqFaoFS2*I&FH){RSguHbp3 zv#!x2<`r1YA3>-%k5qdZsvUpeTi14gG#g)y`r2Eybs;>x4$BNd($e7gbX8or`=IbV zDAX%%3QMCeYlg}lDBzCHEk{i~M~^JUN|m7|q@gYkI>8+`Aqs%EQ8X|k48P0U--+$n zs5j>j?7KlRBpHAL^8m1FQ6o5wYuX)NYXG#0Nf!sx1$fa*6R{fTQNuL=!Qu&ZhK%4%5n+Db)G8W5NgT(J?)23e;)iGy_n= zaD$ZHgE_=w3ujOf#v5obCDQSq8VP7JiCIkmPiE#L&X*DjkGp}YDnXi?HVGcpN2r{S z0=DD%cn#*d2I?9N9>v=Hes~YU3?FFRsRhSP@j`ELoS^px#68vuCU-zx7+fJ8BtKhg z>asu!&MMT4J79GX{Cp4jJs*Vlxaw;Mgx<0PB)^E?3is#NM8gTu`qLG1ZKruG^09VeVyMxXJJe#^~edY^%35g=R8P?cPLYNV$0;_)`m3|!ac)X0^Ed(#6 zCeLU9vFb&mXkh0|J-G+OmCuFpJL39<&ykI6&)`1}P-bFL(8m^_tRiQ1D1MkUpcCi& z9k26oV;klWo56f8U`4x47~U^|ZcGI;sT7;ynArnDiXqUvuobSt@=<^~SeXv5M^cpi z?O-6PcY{W7S0;u1EvGTYE*X>inLQyU-rWS02>xZ+Ig!h!b06%KtIfKRtskG{qmV!e z$=+AoCzJR3E>3);VI=H9)xZZMP8tw$PHCtCOyV8zXfjx>QNgGo4{#zYh+JE_sc6NU z$e}=O(2W62m_ifE*>H{tGzCP#AAH>prGx6;=}QZTs1DHVQsnpsWu_*Ik%OY&A?zB6 zU}{xBpK4$=N#OJny-DyL)C2&m3J(&B36z7Vv*SFsrcWZ>P^(6rW$;G%7m^z!0* 
z2TC|hDpXqtt>3e$c~jb0!PZ;5AekJEZWra!JfXY>2FArMb|@=^WNUA4D;jA2)G*M* z{+gc0c7HP;?t#v|&VstV(^%~gaq-W=Ck`R9dBCSch~yj zy(PfBXFCBPsxA3Ae9saTp~YW;9Wunw4bQXWF3jM+q=!zd?Ar~J2zdXd-JH`O6=W)H zXi`O8RO96%nNZ@{(P@)Wjd{P1>K4M?K~9OWY8VBVfQKlI?oQ>dfus6zqh>=GB795Hk@{=G57piMRtVLESL& zdplJ7x^wUAg3iuP^rWAWNdfYKf+#Zt-eKAs_!8mH9Oz#tmOuy;!stEOD?P(3x|pPc@PG#sNQi;Afb>+hflb1=A^ ziZJOlMcUhuCbbN^2ylL+*ZC~b7#e)>U5}X``8WG^iPYG)?pfm z*J=A^8jfm<*6tYlAht657I3c~0yFzn*t8Zs9dr76 zAEe*ReQ1vWAd}4J{)9{uW&~@LAb$oh=L+WZ%oKximI?(Zt_y+jK#XlZD1M+2V;MEz zn#_&RPGn@qd>^TY^f@{O;ArJ9;OJL6R1Q$*dixG;!^D1L{kMH>{n4Ee;240lK)sYU z^8uQ2TgfEj^I6=4%8W=>x6<@zG|=m-xu-*bzzSjLHkc3%ubw#sbwdq9lTqV0977QX z(ClZ0s2RRwV7UejOXk&4W<;D%FqXv>)(kE43$okI&FE~HMJ3#?S>~x-Uxx;~?LARk zrxv@?>Zq>mPZjlJw=ZTCkd&Xu4?fnO@n?h2%|=uTPF09}-)n^M&YVv(g=V>Ync~QWa6{O*SuPOv$!k?T~HjKA*4Z{aX zxqu~gePVb*ql4Jnz_l$y+RQV(KXYPZ&q4+7pb2rtRv@Q|=sCoZR6IBs=R64O1A#c} zQI5|cFrYK{wmIfe10B5>dK-?T!~m_FOvZ7ddVv>EfUFmzZ3F80G*ls{zy*Eef(Dwz zsBH`RTS;fZto^Q|hP#29ziiQ!&hRmVh?&sfDTO>dVf=Ga{w21di^(OH?0mMUyM> z(R3pjNX<@8VrPD1^$mebSR^i;sb>(_K{FIi{igB{!?8YPk)4ausnj}l%HFB740oxYFe$i#u&=1LKU z^EaYs8Ul6=47vjZui8ylaWFXg=7Y54eOU29sHH$+4uEHxrGVaYe#t-kTrbO!k4h4B@HOq4oo-71$TL8dW_A8`E zSots9z}M*Be+@nwT?NPHxe-8+_%$XR#zcf;#qKZDkY z9(os`qMxwu3YW%B7(V(qrpr5UgmQ3Ati)JkUUr~UbccAgQ48YYBY!LT!K(?;^`!G@ zAI=AMcXu_4gnO*YNh)L0QQq!<1Yw777%VB>CC)JB(U%#nj~9q+p|{l}qdISQO!%z< zJ2J~1_T;9=e|GyR8{p-Gz=4m+UN5w0fM2p|u>0!BrdN-e6Zba&# z%{uGZH_9E6*Omyn5d?dFwBRq>WVSxb`=-I5Q~6+!3R(8w^l<$jxT6(9dC4gp7E$~T-37|CMieV)})s=1pKB~GVaqvw^+D#YK-Ah18| zjB+nNc1>N49xHR%e|X2o@03I`^ed9sQ|}59<*rNYBukC<)pJWObRJE(XPcTym6pU z2j;XdTDV7UqrJx&{dK_s&lpPe6z3rd zaRuK&qS#68y`hrxYK z90lil62Z)Hk6}O+!``{(dseU(&h13`Lx2rt3kdnm!AjRuKmY=~n7quB3C~}xg;w+| z8Ci}3j291~Vwp-%=k7xRZ1ru*Vnys60xI&KLtK`m?S!b-W*v+mOdyV|2Q}dekZ*iP zO)qufZs`lptaX=lx3Fw)84cpmd)l9&>oWP4^wwwDz6XbF%(HiQC+sx6|3L8Z#Yj?? 
zZY?JAX@5V1`$74uHzmXnew{R%yri}ZgngehTrrz+PSR1%s&zR=WM7?(+UZe2%>MRn zAvluSPVRaKOLR%at_H^ zePUYVgziuRSPeBto{@-gAtQ9`r5}Gf{MyY>~2cBmoA-S z9l@EDGbt1mTH6~Kz*8jAu2BwePD-DoINcv6k3yzvObMRy45F!oZ&FPJ=EMH3VWqBN zO=7P-gBuO6ya^7Hw_v8rX?JnlX_tjd$YY^PAGI_?#-_6hdj5@~{M1JV*&)eNeGm4P zaZ?(#ZzYCwDR^k?F{0 z+pj5(UNd7*zfv5z@Mo|~JtCxh79KwZHfWI04=@{w$>5d$<9zH zn+f@fbBcqqJ81O4A!nf1ekRYV<7ayfBQbo?+K*zGibl8joXiVw`$lV_;zIrk$Aq)!dvcPqYM^gcZbk%`ZVA zfj_*1&>6gVW&_2=KGFGjCLL3ZTK>!!Og$IzZCmzC^7zD-XK7ZE(u zt*Dv8&iiPHw^ka+Iz14>J=fw(&wQc2yKDJu$R=#H`dKp_O8*pHeSvD);Q& zin{I<``wzh?YVSdi|ZY>f+grtzZ<5w>SV8EuU0tb`vBqukS_X##mIvWeI4qV#72IG zvVt83fx_C)v}k~4>?^GT^5eIku#j(mp=tl5Yp_rXub4po0cvUKt{J^KZeH4KGRoOa zpq#jR-DjIv{0Ty`_^4bg-?^xCgTi|mJxlyBFWRNZfw&&h@}XcJuiY>)%3Rm~WBbwE zh{+cLD-Vqw+hmiVbaaWy9kG7fAKxUY-KXVV4oL_LO3RBy8u@*a-WRuu`>4$376Xii z(i~zR)gn9(^dQ~=Rl9<{GNg}(%W#(Ou?>mN2O49PN}JVyEu!__c;)X%4e>?wv`i=N}BfrH*eUi;!<_pbXo&moUDe7#La`_ z=c6Z8==-rm2sqIf<(;%ZCfhOiHYU zS6-O(HZInDeEMo&Vu!|wW?hQ*vKSd$m;QUi0$dDT&SRcJ)%0cbQA~h5*rcV;N^eM+T4U zS}V(yxtQHClZ$m~)L_X6=uldq><qia|vB}L_u>BZ)QJ3=$J`>TvT z%3r)F&)9c-N?`f9dc5{im86O)^L|qJDBhfjc5Aw}^@5pkh=oQ#iNa`wr23!%x#NECW&TBDCl`e!_kZ^F*S+MaROPVX2~#$6Y9iua9+1` zZ&r#)8s=%7iMKG0(a&wDD&To2wSwzh@_8MPe7QwV=q>_d&)0Bg8CsA##q+dPK!ZvY zY1-7~3C^GL$UXjSp1=oJ>~NE8{X`t|CfFWVQN2FSYT(4~h8Xz18KVVYUA}GdC1^pN z9r!z-t92h4*S#aX6uE*DR2amQm6{SbZhpvchQsR4#-)Cj6~);){mv3McE7BzFfZMp zM$eXEj4N2W?y(y*Ib*6G;&0fT0-{#s0U6-@c;E=Xl*bMlypFC_-`i)uNL7@pSBt1+ zj|PPaQBXts3Uk6XuEF58hgT@aV4GlpXJA(?KcD(RLB#ao`7?f83R^wc-z^!fy@a@M z(D%*3medHw;c=9k405T%1Nw<^7oWzbaf6R?Q`@2{l|&Pxj_LT-X6i0kcN9N##hX}y zB!KT;*XRU(MRuH_~8`AZ97tYL%QN)w)=hZ2b2%rNP&!sAayeYP;dKvec{~f_(FRG6jBvBr3fT%iSRT!@c*zmuc;O~JWap=n zy{7_pIEy+L`5xO=K=ZVm4~VA4m3Fl^2+c-}+Xr=rj|_m^!avCd@E45YzYD0Qi;1Ql z{%Dc%@lr-chMlKK_%V{$>wF!}^6d{10_HQe@QMvA1eg1a-J92x0J`d#66>DfyXyxX z4WA!!u;aBFJG9xKgPA;_aP=Wnunp?;+XGjp>7v#XOi%JA$+=kFF_Jv_m|bIS#9D!} z<2E1}ljg!+<&Qc#+<73v^C0z=!$H%O$SkU)1S#nqknmHT?Q44bD)%iP&e=x0@i++k zE}CQnW<@xR(h)O1B!jy@<&}sk-k(Nn(v|Mvy^fv$9-qg(POCC-{-n9))6dvuAgv|;1%iNWw05qvJ_=8Hy>l~CM_m6ZE-q^83~&9mNe+)34? 
z^0Ggs1k&io9`|{x@t+Uv9DZZSZ)`r98cDUEk4p_&8ggE0%2+@2herd=Y6?}*$+#>O zqVmn^8+!Kx1*7(lCaa_nm2kNBG{>;-=F%|2eBt8g{EN%upH*Cc7#b5DI@oMWHi7Q3 z>!3VZnB@yUmcbrgfIKE!n|If1KP}j<1$wax?6DgiaG;Vu zaw&QcT9I=Q?+~HwTRfA##x&A&qP_a{eTk#C{Hm2)8&9cA`OBM+gTBgd@$a8Hy%l$z z*%VtPTs;e1g3vIbu6T5%sVV^RS6zKZx0Q<{Jh!9>aWEna(FiK`9ub$@7b6e8sw~nco%RL&>tmzFr+^VYoPk zFvX<@;xaYuayoeRL=rIYWYbP5L;S4PXSyC_tsQm+ zSW9bAdDxz}AGC)U$$n-7ZLeUQ0s&_kwgShc0FovJN1V?mFk~o}J*0I5v-@d|MJ~|} z*s-x@tu(9nrGW=3(sJ%H#hVOr3 zu@jMgrVC-^XclBA`r2oe1{6K>CiG?W`Q$|3jzcb-H=Ez5WtuVh;+n3$rrK z8|eSr+t$*<7NGd;R((9@+DR%|db7<~TlgO7!V#7HR?<2h{s|R?-Oh!;g!{=c_he84 z4#7oAWQ%+4vGHb)qEe%xQj}DZF4^Auf#u{#VKPx)0Bv~ zqaNFroLqZ*!vl-amKP6QJ$4A(R6J~R(VoS`vf^YUw%C!+A*S0Zr7>^k&?yW5uG4aS zTvrLw8_Tjo9!^@FprTi0Lrs}gSnJ}})32RB%5mjZMeUf$37T~xq^1{*WAOz!o2JqH z*z-up;V3h#E@(DeGC6xTu>v&Ekxd2*zH}J><|q3n9tJlZUa<(bsk=T;6n>{~7UR86 zraJRQ%g$w4?7OY6MMM#A9EG!AGC$Ee%1pzZZt5Rw@OvL{>HM8YyCpYdnA&L8C1$}T z2z)}|64-Z#z$H$}lzzE{e3mcOaqKOMXomy8}_pz_b4d{h?& z0>_N6)$)Zt+A|;B2QfC&>GBT1hps8bA={kKkx|}qqUJZlk8|fVm-^Tm^N$HHiIaZ&;y!<; zttayozAU3La#z3lDbpmKw3tx_o9cC&Bz-^A$&+^1N|D8Qw=9lQNw%RdUP@?G^RWKf zRJQb-L*CUTxX?=3zzto-Mb`0gz;+CGe%*5((!>f-Z!kblp^PkA6r}K-` zosh&7`XlS)IR!t%6qo&v^-DTw`>2{DditTtA5As%tYaXl#}%yW)wz-@AZjKHabbdX&;G&`1ICXR6u9a(rLBiX- z>xu70-U=8XZ0S$QJe`vF2$ym9vP@j_I86Oa9VG&v{J>v+7?&%KZ|F5HH>M#F)CWS{ zae7856F2_=J&OkAD(VbI7Bpb%6*76^S0XW(0i!+aw6(xHaXn^^$@z0(%csOu3*VS*v1 zTArG!<-7esb8E5;_~)zR0{@8b5mshLY&Aawye=_S*S*08f`;o~5HP>|)H?7sDm+@V zUw6?S*0+DWRPaL0n7v{o--XLAJPYyf%m6FXQ!1PQlJY#-irZir4oq~s(bG1~n_W{S z!?CW{v5|O$OEbf<7Uw8&OC-W-Izp(?>-kENR{1M6xjaYqtJgQ%hD=2hukQ<{vrJbA;`vbpOXXm00E}|Z36X|^(m^a(+Z3B&*TLu*c0Ej zH2*KWXOA~Y+^w{BjU2ycdk86l?Mk?3{kQjx@b>if=JWRp+Lj!R(~|jraI}`Fhdc0g?QYHU9=&^0z);nds?oa8R=r&G%r7ilQ_s@c{o+V*TT^ z`D_Hj0#_oTY?W-WTSesnBJl9rMfsn3A7H>@ZfI&U+M`h_2)#(4rP4VSg|BlV^W#g>n7*^#402?ty&F9;JK57MU zZ&AALl4tITo81$Y6$Kk^@BFEQtX4Vo97~%?RZK@~9*=8gA)J50i+@wG`9+`v6h{64e)J>P3OtR4I2#>-gptIc3j?)khC z{{h4C*Z03haDMlC$gfr0d&{~m?sHL_mAW3v&0omvRNmBU_d;E9w=^S0A(f-ep5@P; z`R-AE_euJ^|6q^Q))bw%MH7AI8yjIgEu+u?tkX}grJYVM9(-VPrPW1ESpR8#zb^0t z#8~jQKfLq*XPM(qp@CmEUt`E<1nZe!o2MdRAfc!(XYV79cred_blfZt_`Ip=);;>o_O5eg(qx&3q`t#kD-_6u;)*m$+`~xJL$zfm-g4nu94vO~_pM`OTeZlwd_dK;@E!!8|FseRSetPAs;9p9 zXHutun8Wd`z~K2j8DDZyQ*kH$`5Jc9f*7s6V=vY^$Z*bL#8}y^_X{R!f|)>n3~Oq4 zvB@I!WpVndVz{TXZ1%X%uSF*nIwdBD)pc7%A8UIzJKo;Gb^Fc1s8)IN7b+m{@i`-K zB?}SmQ`89l{G7A|pJuV)8=t9u+UcJeGtSD7L@qe=ABe&0Pd~q3b63$3CW(i#!l?nQ z;Dcr7fAKm#zxF=tsV}Nx3nEaF#ayRYyrlELLmXC*Cg^+OmahRp!jnJ!+Wi^!X(ao$ zh$wAW^a`6-ND%d(n5Q`Yh8Q;m_x~qfcKRyQ)NGHH=}^l(OLhC& zTPd1~X7kbszgMkN1F7hGtey$-R-qZ9 z2oYvbk=c}xEJK*dzKm^*G2ds-`Fwu={=Tp8-yg46c|4DKp69;q`@XLC^}gPhyG{;g zwr%-s3jhGy&f5Or3;>%F003gLc_Vn{XGh~H@S6~a$*Db*0t*)#AU4A2au^8p5(cbZMnlfHnVU_#WWE-=M z+Sb-aWw)Wk>yo6RT-*%`_^Q$`J%LeY#=nOE(?mw^-@_Ajn-gUIJ@l{v@$@C^Y_`S&PK9YX*22so++NX-R)U4{PdmjAOQ@V`X_{BQRF|8pAv@IQYE z{4Y=d|GyFP($H9usyDUepCJ4qBx(pIs||dCt_sR~D%FW5Yjyfti%F`uB8|;;bw%b@1QN&qLS@ zewaQ4{1-_im=@~)23+0wFC5V08*bVLyt(u5tMC$CZceyB<$*W1o>@sBmbq1P!p^lw z*AK|s`yWuD3_mFcR5<>NMP`3f(*yK+|8uEGQ5L*p_wOaT7Gwy}yZ@g{B_}t_0~M$L zbIEMSgdO0w<3G^mLW@nH6Zp`I{{S4cn%X}3-_>3P`vI9z|012ttqvRX<*7Gi{Jxux zO94+pP59FW!bPWT@<2!Mf2T<0Zu~AHKcMPSmlR0ZyEc%vOsZ_)ie>zWFa9-#A3!8* zObGfN*y#5@9tr{Pvm_9!0)haH#2to!4ER!xDpw^-@vy!%|3G!JfWvcpaXhA z&QS0e0B#>ro4pkME1+Tyn(*?EopkV=p7jp1TWA?T4=U@I@WkBR_s4xz@HBHbUl#D| z`;QGCJXTZzKAr{S>6yYWo4^n4B2Cz3LilokpBMNm_9s?lWC6gh&f#l@-eDU*KxGs7 z0NASw;QRuB;7`zi&oursj0Xhph0cMt(RiO>30}?gHPs#L1AqsK(t*T(EoJTe=QN7F z4WPd|s}BAO;HRSv-Z)hHNqgxhnHd(~+5P-Y`X>I{fLrt2U;ReX$L+c_ARuH^vE({t zZSXX^C4c~Wn`QLqt-{1@06;ek*Lv*!1p$!*#2zuLV`tQ1z=6tdyA~=8N&i|s^i|D1 
zKUZPScA$Q?!Wy`c;`fWCR^;0Go+)0MLe9m!g7ITgy$?8ceUfZormf3LGdWv`~p!C6jWEdqk~ z%lZH>`43s?*qhdZaoFwPGp`tO8S%z)emGlo;Pl6gt>8@y@7@NoqS>8*o~ohDHa-0Y zr#JUCo*8`<4Mr~I4`nW6+i(y--Uik_Yu3^Qyvf+&2mHLdAC|dMs6CH?03ly(uuzk# zjd?8at%1u!?`5S!oAch5u=ERv!ymvMv|rSMkIB>lm+C)TY=N0^^7h5o2H!GNQn8!C z#Zv}N+>4U?cgyIdkW*#BVPCJ?MxRsy50L!?Z&=KDmz3<&6mJslQ8#b zt~xyi$n=rkG`eLn_kb&x&Sa=%)05LCVc@Hp2EC?0Z^`)8?a0BcelCdJlaL>gz~(*Qlb1GWdCTSyb7H-G0P zS6<=TO7H-E&HaV$&9U+|5osm@kaqwr(eB}}=T&G^Nb&@0(oiiyRN8|u9|8{b zMBIpZ&I|^qdm7R9gP8=OC+^MLX>|m&K-9=e z|1k@o2wYEeuepN9t+>bI;ozG5eo|4MJ^`)}z3BME2ftXqDQtAV&jHt2CY%+GYgqsU z_Mi>WjEs{?fcc&)N2%|@cK`yO7~lZC?lrB@M8{WMZv2GT+aE11M?S*(e;X3_QNHH6hCG$bMmoZzzyg zAJ3M1Bemm&JHhS3(Stzo=RVC7ai-I9nZCGLxppJzJ@wsKAe9^AT&O)zrmk^>g#cN>Mg#XxW!03oe6H%&aT zntb5u6xsG(XSF`uVQ& zNZqP{-+t8U&Dbk)1kf=o*s6j~^}wNgR_qk4Rg{Ny_}cvZTTx-MRTB_ElDxdU$m~8R zoM$b*S3EkcHF{wAD>zBKOjW-wX_z#k-)$16oeO8(tmp&RH(<|veX1aSZto(5{>pR{ zpmz<11-&EEN!~qjWhmbBr~H!4Z%Z41%wQz&awB2R z3^RYl6t@=BQxWLcQ-?vD#;=6eEPSzJ$SGJj={8PYmwp^vf^=Yhl(UmAOy9546et7O zS7P0|4(P?g(`LSWaW=j2(+RWG6Ug}S<5CX|kMF4p^^DwGaqc;ww>92F4mg$P_)SUu6@%{4u9&Z? z_tBxvbO>857dIE5eL>p7MYBzd#zn?AX9OH;-wjP=4XeSO@(ZQj@2DqCx-A7v10*t> zm=qSqD2bR<(0d4u?ZEIKTLdLS*`HWF)d&+ziMd(C5~XMGpdx*8mKHezp-+%111LpD zEplb~+0j1AvxQC39ffdzPL3sVC2n@GQELI-`Yv*!s~js@ium#VZg9-}C(YvUmDP!| zGTCA4^Ts`r~2}*Xp>r!&f<=gC*&DLgFQ$ z_gv&<47jyzUG9P%HNGRz0leE>5S72?RUTI()DK~Td#WLz6~u7gRhXZxSLetUyUT5bMfoK&QUbS(rrTv^qy+mOxp8Q)V-+h9QMt(opD zTenw^(D-T(`xGj84|7a4t{;OVRmknjA9nhdXeGj8i^i9S?x>a~4&2wNeX#}WB16{V z8n8{Cjc+wMMf~d#)2^bOsc)?%o4Et@dcbY^PHfjDE|8G#{>`~9%dHz`|E1>L&bA*s zXCJe2L{!)jf44xBsPfc1Io?`#lGC+RaEpH&?o}04mXAt=>ElM=90Xxj$BK&QjBH<# z@{H3OC~NRc(kpY00d~DGmy6gCXq$veceJL#k>O2)hGH^cf~1e>nkav$(GAFw&V3&A zYQmH2^sP+N`*aT3q}XcH za}FLuA=|?^BsuJaE2^KtmTke6)y=n2olO`T zD(q0$Mm}yIX5`5~&Y=3DtLf6FOhaYR*~;HNW!KeVskQ}>Z|t=i*mgf|+k-JiPWD#+ z-gIZ9==;2e z4H4>U?5~oy{H%{#c16;l39w&_#(o;A^=ntO_gu|t(H0EUd(=&g;ka+C zC?q>nO9sSvpoNX^Z5ixj+GfUxP-`bFjXa{`2bafRSB8H7D!%KX6*pXGu%qq%os71Q zO&M+Mm;H(+GGRKypJyT+pXF4<<&!7-c!NiR)@yoyy&*dYsq-{j#?0Ct?lrGI`!nw6 zPiIry(>RhrRoZ-gbAL-bub?2J6SJr=5}@7cxFMVx;~O)Sq*VgztchOn?1~WfDKl5Ln?jP+5%zeqr#xC`Y=HJ;z)yYCW63b|S zNLEkZ;W;1(PW4^DR8iLj_EtjG#8c{YQ+8IKM3QlMnJ#Zvj^!;^Y{^Wpb+txdFrr>7u zieeX1|6a-Gr?++TIj)<@)B!>zS3DhEd~R^7Zr!WkCgJQ*GctihH-?)U!_6f_s_PeP zR>zAOrhKx?q5`-dAq|vYaUG-q#6xa07duKVE~Tyl?5Pg>8>=!7$HR$_+4{_|$%BiN=I0dv`%TiY;3X6Ozyy+0IcR)w_=%hEQO}*` zT-MGVRHxUEJdaSESm&=Lws}=kDwvd@yg(n zg84o>JAVc*>_hnUOzY=@h>;iy^{XuF-wQr=l-9-7YEocAW!L%X%G>`74-i+L~s`GKX7 zuf-@^XdGgPK8oapj`f|n%^6te)SB$CJ3JJ=y5_B=ORGg}XiTnug+Q+4(Bks9+*63$ zescnOh@bn~DGH^F1o|+U<&gp#%=Bnqsd=`L)9?m?ga-EHz)gO2@U4eltH>^Am2c}< zLswm=#=C6Tla&#W*Fiv6VQrRdxed^Sjt4iwQU*=BM=Q_1-4nEG+SwT7vHIl9KFY@| zB*+t-1#`v+WSAD+;NZFj=X5be&`)zf)zq)LnXKz(#FO{yB z_6+(vY+S?EUVVI5Ao)CQty#6SaQNHYFcR1W-BTSMM$+nR^@l0BHHHV8-3Yr7x1*+ z!(U}iD6R9#gEpEqH)i;j@{UG@<&F~l(Kg2!k}sR^w0D<|JP+bO(>#Zn|4KQ+Nb$1M+&@fVXDgXxhnj(UF+v9k6D54mT9%VGK_po!o|T> zd6Bl^Br9BpX1L8F;XP%NRcEJL|K|HNbP}c#C?wD*JJ{v!IzyH2S=ZYw+_u%D+aJT3_xi?M}Z>NQd5m0VwZZY>jTlXeB79 zdTVXko)OWQl|2@^b@U1UT_+uyfW|LdMwag}b#3e1X^#iUv&I~e6n9iDt7mvQBQN2X z-*K@=`&j<6Lc$!J*0$68#(Csn&?)HBb`0S8=3I&oiSQvSSFN`mdZ%LnJ$)oOL#_Jk z3w)1%M)ntiamB%z7hy;%U>K}ns@#vyvZJK|f=ge0XR}v=cB<*oM^k1OpR$%l9#eeq4f94K1HU$x>~{0VO}2jE* z#QVI1tU^f@eaj8Y_2b9=rNC869_oRp&2QgB{jZF`;9SkN%m^F5gineT2 z9%Wz9TDEwNh(eJb0xNxJ&P!q>4osGc`DtzAAsj;nm#Y3;c+|H|7*HqUFXo%7a@h^zc z9;Ys}_%v|?`YRq$OjxlRZ{R0HT-@5Qzl-cgHu~&aB41!FY0^a~7g1JO^G|pLvEynY<-)QvE~5jxs&g)AxnGwm1 zYjXR}RIzR;SY1Fafu=X?&z?Z5x*I7sLLV69bQ=v*%m&^?qw1rWIS~5q(1i0Kr}O~c z@Oe@6a=TypL$h{tQ!Q3%BEDp-6|M(3AOmD;5EnEQ;eYzue1;T2PZrlO=#RTXyZ$ZLHc 
z%VhSN@2X#Q?w~bx@=DDToV2Qt&@xHx-#R+CGrjEyUMz82Y$gpFT@1@{tMUg!X9+2o zYr(OQ)a_8PTyU!Saqo!FR(^%0^I3AQ5!|w<&A2R-H)@oXp?c(?lYE)Sf;FB(Cv4*9 zF)#RNa8ns*91M)Ee$ehvmtp;GXs^<`hM`?#NYiIR1N2L|y-hVCG%L4P{r$DDTs8PS zZ(U$QA+uIaFTmCUvVWgQ9n_m3HUw7WxYG9!T-iv)5CFsNthIV98 zE85{|rsA<`N5Kb*HY-%=52O5$Fp@var?BaGGD+Ax8;xIUDgLd+&AiNU@>@ea*#uX& zB~hRvMc-}BoTF*QY|?rdoc+DhDF+O~RiTIdmmRG-dial!BJUCXUfteO=m04+Q*3xK zoLX9et*9sQvIWEuqoC7fjG}WUWsW{HZIW26T*0`^Zr_pR+-_@`8%uKp22;Of9e5_) zR_2-)5-?Aindg^hMhq%JmPig59>R}ZSa9A@cN*w}JcBk66RM!5um>8DYs6S~9YXL? z${$prxIX<>C$Fa4{gvKpsOEjU(EaCV2XwwXKCxaS!46+n)1X?ML_2be+ZWw+tkLf_ zApyF*Vj5&cFpFM-KBX1$v)olm`CEUxNYbfnf5|6~*b8*RBcWQhf_`T>HFnRq#rj^nIJs@U7VJUavD8EEMkW?8M>w{WJ@>TmI}UbFnzD*r3ELV?E>)wcD_P$JeqV1N>_?eud{DY+i#p!Mrnm7WI7$qzaU#dc(yB9JGcD?W+hAGh6_Ba zSJN<*I2M{%U3ker=9+=-a<1s%!rXTP4Jul#w3uYAw9Ojh=38(jwM+p!_TU=pvQPZn zhfcJEK|97r!s4viD-(H|w{;`C=5i&v61&Ac9ja3#?iw$+o)nr9ni879&zLn{CWU?u zDefGJg6<>vD>?C&cBf6HE!sX7GJIETIs>nVj58VG5IS56=7c>HON5sq8Cs7`8C-(LLL~|_8G$k z0oeKYFWP8o=$+9S5G7UOYG&~3G%*VKDkJjT1<-7RX39D4)e87Qxz0MCspLlkwsZA# zmofPox{*W72AGi7$>Ln@*66qBHld+6pgNiBtLAumKAB2kRxI& ziD+Um&G%)?5Zi3Bq?vPJz2lRig5^VY-)kqIp7=1#D0sEr72fd!imqUhutzg?{VqDM z1(FqlQ-pxH08f79-N|mKh=eui9rvDzb&Krn!%vq{Hgcj^Rw<&kI*ZdwE&<*Hz3@B3 zfIP@{>;2+T4ha4Ln5V(w!D)`9y~BOiivTmVuMu!R6=hY7ItCN;O#!iHcFYycO zlDRnvd!1n;O=9=N9v`O2Pu2ZaZRyeEc9xx zG1sdqtk>0&BN1^V@yD$vzLvOAtviZPO$~<6;swcTwv|4C(S*w&0jow0^hvN7|}g?9Ca|$F`V4O zxa==+{6-SB;huCGE@y{ki=J+a-vp>^1QXUjOg_mnQ^-;;G*__gC7iR--ozaytF2Ng9LT4AO;u>LtOltW%QUXyLTLZnPY|qwO<0|MPU{ zR{!hh!^>8fa&pDB{9xHt`Vl*Sigv1-^$kbmUkfq4gw&cDizwL-kWdWS0%uc3sB))C zIY%lHKA0Ymf)+t8x#tF0qP?5i2v53w;zEyGCqKbpd}@*j&oNBI@0vI4Dp59-kY_bv zDl(p{L-k|W89Vc)(W`unza?&nD-mh6+O-l}*ywYu8GQ2NnJ(I#F*P8CL%r45N&MR< z2qS5Bx8ls*)y#P@ZBXWz_+ymyizdN-f7XP7b#FZ-1azXe0z3oEM^<@vWZb!ZMrDra zpCaM=#Vuge>x^PvHe#Ha7SVo|40o}o%HyZe80FdELN)n;x{11o*^gxi(fOrj3%-S9 z@=TAhK{h!1DPt}B;!3t$ufS`><0m}19`(lMVl=kY({UgUqu?*%NAGC$4<@L(u+t5T^oeD;s*-t_sAn+00(R?*!y%8}1& z=T|;C8*$fITJoSnYx(q8?`We3j68Z>e`Fo@)L*C_&9PiYGzQ7GScLJw2(gkmhL{V$ z!kSJ`8#kiTIcA#jEU~&5BeXn3_8UXzU5N)y(Om+;V91{Mg)_yo#=Wx{R-&Hq>qIAW zjydD_geb*&4%Rv4BHUe1puM&L9`)%Yr|>hT{$9Mg`WTEmn9q>vpmSzrk+N;t8N9m& zV1%@KGFSJpMrVRN*XK48`N?WdM3ab+dRbQY(BAtFRD%+YEcO(B93_59Rf@S5xI3HO zrude-erwbAC)K&0ZSZKfXPAfOjy%&ps)Br)19k`0)#xxzSK$rIgkx8-CMRlg*G%@8 zA$?LH?U+dm*Q&@SF@gE!;fxHhTOHV#siNtTfcYH=P;^)W9z2rdK2fAV0T<|Zgr>&+ zP-1#$VhLj9bm9x`6u%=f$b$)Ob%b~dktHsSsM?Z>7ZT^k?Ko)}k8a%JRzx4Y5$4%8 zJp~eS(qsLT zVPRaz758dLCdP`jD9uEVXOA=Q>o&ZNWnN^MJ=ZerRqHh!^6pYU>62sEb@U`xn!|c) z*mEWx^QaP`wJ6Y^Q$Ei2ksG|QOZH4yZntJ~N@VLWn9D>lqwa;w*j~*J&01tEwJn|9 zoF&X+`}-vGmY#I~Uj1p}$zuanD8EM2i*qB9J9_f^h*brAeaw2jrE}qJ$RT;39YC_3 zPw}L2PgM_})LjCdt8m1F*@KRhSI2Js-n2gwoxYj=Z_4`%2|#E-O{j9tBUH9t#|%CB7u6uM=!bsa6(gR89AekZ+67^sa% zqjRy{5_hDM<_UH!7EJ8H6bN!33MU1(5bix$w0o7wPt!Bz96goz(O&Zqqc))a_hG2J zzzcR&GjDHUAHK?M+ITs4iVT=fq((9qcX5`Ov4OUu5j-xKYjqkqfBq>RC+^DMGjmsxBFesG=?Rx_`TPRW7=yN;qCJbp%mydeLyC_hf#RWf@8j% zHkER2asgB4an(|5_m%|X$Q*9VbeZNy%|_Hx^4e9kUZb*8Ra2uGzCQJ?+|{mX>Z@#{ zc}u|W0GLU_4!i((pAXUML|&mZM1XKrR?kZykQg8n;e#XwBn3LDbns8&Nr%UzS>2(l zvi0_S{hKa{UkNeLX{8_P4_G&3=>!+b1y(s!*QK81q?IiA(GZC8-g^_JL*NB=NdASJxu@$(zxwtI5n zcf&^XK3(OitZtAdaY!I9w#D*lMlsLb`BI*L**L^~CcsiD$B5+@=oz>>aIex2|Ll1$ zrE!0AUX9g!>jC5mawul8(`|WgUujXUs4dpNA661*9sshIq^Zmk#!~cgCdUo7nJ+YE zT>w2liErhioWm!b>#Wo1S&ooE69saFfZZQX-x@Rzo0#-s^sQ$tlG3KlAvq^DgkQ;Z zNV`$dub+cG*ci{VTp7HDg%`&eQOTuRCxuW;ZoeHzd z;%Y~?#-Yo}maZ>P$u<)nVfNK9YeyP7szbWUpo#_!@!m@J9%}vcJ`6JcV<9b&w>?$v z+z@_95rN`Pv&gl$1Br#3kVDfKIc6YH^Hr23nWvS^)D&}Pjb+B$dbJghj?s8`$)ub+ zpFne$l%Na2VoyEkgvn4`j$6e#rh$d#2-+qhKPbDU-q{!XqBhyEqaPyZNuKx6_{yBB 
z*b6#wx;P}GbGmE>cq>?A998Is4X25*(9GApvd1}3SH}yi(RE=)n!(~pO zYNr~&a^Zy=BIK=yJ$<_%Vt}l5(?hoQbW<}S0-B`sv{31I2Dmog`r95NQu`qoN-5kn zW502uvsMh+TT9+$j8ur}*mGUWpyLk|QHwpFB{8Q7%#oYZ5{&YF)9qGB{uK^T8DCS6 zHngs8np|1x{MY+-qjLnBr`be(X#V7P*MCYuyd>wvt?`Q+wvMiD-P!iUx9%179-L0f zJHmNQF@v`f3zN{^Bf=|tmG%VSnp&B5c_yW^#y111&{ghKuLJq9T}K+ds@!D1gghhg zRVC*5A`h#ZajW|rUS(+ULRvy-h0y7T!HMj&n}OCBgSsrsSrs$SpBk5kR!sLa?25oP z{iw6*$t~YD|DA6-Qfr9K76qp>^r2XEkD*+)e+8FK2<3l0Fr3l7f2MI#gel`iwl<`7 ztIZt$9wI&$1q|4NViCGZLE`Czmb}SRK%LG>C)p}O82nJG(@~`nUM(a9n(X|f2k~5E z7Jev;t~2D7*g<$q>8hJ)5K*aL3#k?3jA}GS8s61FGdc_|mqynYZ_$QbDXcX(c{Q2-!$cK@` zEH}Me?|*!N(=9cUS}*O%RHj&`SI=5IZwe%p#9-0MKGn)T0RxizCNc>@k53pWQPm;m zl!IVK8n%p>rl*CgZzwyMkH-fQV;W>eB#2k=2o_@=nx*ukAl7JT`p=!-h@^<+fjE@1 zpGHaDIJKDC(t5ctXb^&Rw;GyfNJKsUbB^nxh($~5^GlOIm&Brn;GW(PsGWL*bB%w@ zNtP2*49UkFb>5pzH_rEhJ<_NG=oWUzQPg$)gZ6enmR3LEB*g+OHwG35e=W2zxOiUnsXX)n zY}QIRl7f6eIyVT3?Dj&#JL}LywJ_@KiV&?hv@aBAV13fZhKo=w>>2gq9s82HuJALbAy{#BdbnUA#LX6DOcbI_0UnJ-~b=u7j_kMV#!)Of%gurko=gD zh$BF4L+A2*?MY&2ju9pY?!pn~pR77#Gt0Dd8Q$dx>*>K|Wr*KEO0?46M1O)Au9eEFkY$`;Y*ZBREzi>T+jcnj}{)Iq{y=zdk)%8-_q9gGA0HZY%F zji^3biPFLzX!K9~wD2b>AP=UcUJ{~p+~7)UobgV> z#H&w6CR@HH~H-=_)!Jj3o9vzFzyIyGVK_< zWppugRS4e@pi@@A^j)jOGJNoaa+`Xdy&lNU-VBrFE-ZsAsdmcxO_$F^x;|s{7+C$E zw$ArXtVL8gQ^zL=_0ab&N!blyTp>H!wbx)MF0sAN2tTL0{$X`T@$Tmykw4Q)Pd(17 z4$`p@1uj)gTKDAY>xu%XIKkyaglr(!F2QR1?%;LP!WXic7~?h76yOaglK~Q-u)$-8 zM2d6^+wk;p5JU&r76Lk)6*+vM@{nt11=YUpm5tr1q>gI(-I?q3_crI0SoWVS?f zD1@gws)r{I5z-$b0+#%_%8RWbpQ?7<_=`Z6>Mg1~K~2MkC@l|gWtdJqB#y4*59Fh8 z+K&gTp?CwsZhtOIL2>?6qTw&us(THsoiaVN60&{+!NsCfIi4viYXSIS;l5O`l(VjJ z!md3txQ!`V?WXx`sPFd0V=Lor`fY8=6E3ewkEd$*ImO^xY(QPG;RdLrtFBKuhhd+a zKWJ^6jdH5VCCof=vg~XK+F^E5JhR50NtISzIwQs7nrqp&c{!JbRiRZV+?&U<#-flb zqAS+yb+Fzg3cu|oq01>CZqCi;#f*dExr8iGwwJjD6wN1KG#~+rOR5Q&#~Qh?5hHO? z;mlXULugC~3I&Q_PLVYtU9dzoxKh!V<<~9yifyV5>*K%;hY)ub+PtN(nKy0+m?fw9 zqqACKn~^L(VV6N zh50rI;HPxKg0*OT4RdtAgJ5J=dHibGbmwgne!DW=UKvRTu7MS6`=sGTb>P+Y`{4hf zX@$38@#+`s^uW@UTDXqE^RYg}y2dwnCF(Ze11he!w=Gv9z&DI{7ltO3bRX$JADpQ|jfAZ2n2+X*X|wI@=-?4MWc#~F^kIG! zPrcauoBMPcn^veD-NjCdn0)VyS%|Y6t)o@U;jW90JsOOz!BL%xKD?+tUv+lrbt^7Q zd?-I`iG?`FKeI)6-e#$CvIvS6+`2VcOi^M-)Oar~CQH6Lrm+@raZk~?pwALlcj`as z4alm~omUYy+E53q=X@pIor{cZ2$??2WCWvm5n{iUD+9PX%Yc40_0w5x+DO2&HANtA z%K2>8Y*mn}@hNaKCw_-iWo67EwrKupXOgZ47V}#Dg%1L27GUIY210?ae@b?!&r!?6 zdhyvjc{~%M<&1f+fcIKL(tPMn_v1wAvSyRT}+y6_E#FE4k)^j?Ck8&Z+4-#LnGz_WOk z%++@gFG`qw+VDRD#dpvkFw8)FgN%#?;FiAW(e#6Wg7!FLdI-%>4yd27QhIxwHEo0> zV#s|ujWeJmjEckhnjpYbA!ELupFy291~V|9L!+&)o%hAWQsSqEQji_(S-AnZs@PRO zVT!ddMeAVw%8&aKf%@@U_&9^RQ>k<#a1%=y>z#y}o@Jt{`W0Wx;RgoPdjs}?s?3<% z0X>tsJ-QXTfm~*?YJ@L{{Rej9CwQb{)dL}8lM)KV>PwG=L5p)i%EW)pNt$1|zx&13 z@3E&XWjmP3w`4$pvAy!97Xa$(L|HZVcKP4F>vH6QNr}XMe>yN3#})Y~_i0?PQ;GVd zo`JZDk{yE7Q|cZrR1;@SKq_xzpifu_AsEVahG*SlEp)_=;n$8JeQ7NYI%&;iQR~g* zL;lNE3nI@pHd~{wGBm#~{3?aq;m&)<$<^jh$JUq6zjroP(E+vJa80sZC-0EG*;;nm zmcg^3MLT_B@?*snj)hsg$y@(a$C{HSX+64M`eJmz2jkK|DX8e@sL7y~de@xPI^Xx8 zDwxvx+mxgyTA;FPeXhmY2PtbT@|x!Rt`(F;724H=J64~ohu(y3E~>@41p2a)cbL74 zReu+@G{odgApisO_?-Zc1QtrEn{@%#ZU)FEp#Np6dH2#NXt!VAXrB29=kB(8d%~7?* z-yIe9jV;0JD-E>6##~DSX#)-4Fx(qEerkJ62_CY&Q+IBUPVc$y;Jz~GR;+3}B2Hao z5A*bspU4#6QB=v4QWTT}bl^h&SVNs??TVtqb{PRRTVn0fN%GTfbZ;t$AVD*ZgJt2? 
zF7U9FsCeXJpOByHyP*|+wQfw%-Q9Sh`jY;TdstQjzB+X5cEGM=go_m#WF0l#N}Pwx ze*iK5FM0^Chvm&T(67DY=ft&b*y-hj9I*mrP{_rwrAk6Q{4WxAQL39*YMCsCN1}*m zwpG?>g(LsY7FP03qS#QhXuz>>(W(9Ul=@VBM)KXacxkLJ`Z@IH2lj$4c6PaKh<(v< zX*4t&Ylfr+HLbOOE}*tqqVBtybqA_)l7I8DtU@F;vEux$i>4zc3*)FFmPor1-DNr< znnK~;s7w;nw*6&VG?V%a|Z3ZoX^&b(v2BBu4s(v ztgnVzU4WlRSB#lRpAKJ$sm|(~v3Q5T4YDQ-rz}9N<;&UXW2JQT4%xx81lrt76t1Wz?qHraT!%dWFP*1dXV*hEua1x;f>SM58ZN2(= zycSNmIwQ|Y-iwPp_OHDEk@sHzYDOOi$K-+ne;Ht)WXEV3G(C zQt^7tnsr09*F1~nt7C!A2hmDK2D#En$1*N|(z3`L%VtlHlFXW^Lo;z!{o8PwD(fFh z5Im5^XovAM_QS7|R8aJyi|Z z{c2gf*m`wAY2AoGAEvBh2XAf~f8Aa1iNE%0zuF||MkF4R251;qFb%r>fL~jb6H6sF zfB>@XS+bQ!QVTQ@O(K-R1so_oV-tNBZd?=h)a(+wo$#7uF6dYjhRf|{ReikCQDVcZ_*E0`=^vW&-9;-2tTSYVr6 z%CPk@05CXI^v2JtoTd&e4a+6e%oQwm9@u@OKNL10h_-Y;ZENuxD3Fa7@{DQjI<&R< zSoQ=Fl+RX<8ApR+U?dU+647s2k)Xgoq_a;jP#;&kj_Z)WL3DT6+;_hhRbRO;?wPsO zY~10ErC@&5!SOUoi&(pISSr{uoo#c*thYO3=IPGTxZNYEXD=}N=SJRAR{NvYJ==ba z;w$&8UGps_7CWS^A>F)@FY$$-#=q8-qdOrF>hh%d(XQu*uWG2Pz1pvGHRa8TuZ+h5 zfgPq-Z>Njera%j!EMK3TS^2|KKf7y7?53BG_nt|bI{h%Xf4Y8o08F0dHIb6)f7k=f z>2vriGL zz3M2z-`6I+w~kT;?^s!ZtBIgk+cX<<$7gC%7u0JNUCQjX<*IgKwAL}px!JK{I)hq%n8DEq0E$)@Y~EM-0bUT|Q}86aKO0#W!?CH{4hFxA$RSd=8ONM+pI2?YO{xqq?En= zagRFFYse+5<%d!ITxxo4(?VoB5T7*qSc_mesSY8pX%TAEL1+fq(01Yp)3gt_mWoKc zTv3hQw38J`=vKHE9IV|l_X6Pb_! zpX3K0Uo~@2PY3Qx9K0CI%L=5I!OC70>~OMSv!O$ zn<#KAS!ObUertP4U@EfY;YDy?U*MdzGvH7NaA0jD#nCl?N-8i%(i=dP7MwunrX^5B9n7WPlLxPsjNEjwq3l&rr+^dIEe~ z-~8%WaZ=N++k>TeL~_aOhv?MunS0ZEFRI1Ou~#fkBVQN)7842$$pzKj)G;1HT$nsS zctiOL|3=zxG8CqD8Md&`G~ziiOe?9vo_f>eC^?ioyZpsY_hr+N;WBw>u0^#>mbV}# z-TUFwkDGY8_>Hm}wk~Y;RkgG30_azd{%)|dg!3IFNJDPp} z{{2yD=%M0Z@_`l?!&YZmix{wZLE1Lc)Q4EMntKoP7w}j}I;mIF)*+8Px-`S})oEDy zp>@Im)*kfR>W!`~xZT)IbYom;>ec~w#}m6vG&b>4z$RfJ;fbx@&?bMqcf{paLl2wZ zEMNvsC4oH!m+zdz2tPaL)D)-c)|WLr9X?XseV{@uKiELeL&1729PB7LLp|5G@R<~D zbV>{3^5I<|Cx=?YgVaz~;fLZIH|#{Pj5IepoFMN9L$DDv0yb~8(0;0OLD|jg{83PL zpDivwAeQhNBHkGu&1OL8MFtDA#$c;t`xfcqNqBPZf@}!0ZSXo-uHqRq(Gj*$b2Dye zAnZ?&m1&`Q`8J+@@cZL^QHwKUSNdJD!IwyTm*tQY2AkI?fp>-x(GGv2Y`#motf;e= zGj#&0qOrjFHA!T<0i`4DjtRz1AL-cTm`4@~EAUGk*UtA~?+=$YcEF^X5j%2BN!wUv zKGtF9de~ouxiWdj!Jg|^8k$%j;i5U*L`Hi!CO&o)@#XdO&o@2Mp_clmZ7SN{96!JJ z2np{~P)&c9T$|QB8J_Oxs5>-sn~I-ey!m?= z9>XWr@NTc+J8Y`B? z%(vIiwmj%CyrCtPjBYL3#n;YAFg``ZIIM`iT~0<0T?d)QbkSq=k-r{v{+D%j6&RFhTX3)ijrZ{5VTB!a0;$UX z_%NAnj08I+8c|KXgeoZBMHnaTA7h=+Wd6fSzcQ(2mDlwDBO?_Jgg0;NT!S>$NVG|X z&tdy%i!V_xZTM0}Yw$iZwfQ_Fgm^3P=AK5LOxMVJZ9p%W`1~1bk7ITBuSPQU5{dR` zwB)4av5f{GZs>eE{L?e~J2AnhSjZ>5frcvVv$%j9*I0vJAmteeI;v44s|Y?%U%6qI z668!Gw-sCHVFI`DMYE10|04TGyzBENK4QljszneJ*_RJ`v8QrCe-QyzB;Bg*X!MD( z%K7S<+2v=;>E)O+mNKCuF}pJzL0o>^6#V;L5AMTU-XvZADb(|qnSD#bS&42t4=fkN zUaCE+uAzZ7dactgz-T6)>Tp)*ASY` zBf|29VcAF94Q5Xv=d|Fvc-B1N{OLs8+#mbG+J31z3Uq^vO_4PzOyolc}qma!+< zi4fUl21VIr%Q|E?m?f1N$bH4BU{d4-~^!Cp4-p{>U_jO-4-hEUvgqdf%Fm8SC zyx>_qGM>$Hqt{gX=cPcHuRCEVDHD!RnSE%w@Q8&RT^x!CEKi7PDt_*S((LA(e+3Q= zGzrdTr%c7#Cdg)_(T81nyPdU^!W@02N2apej{4vBdA9djdZht^8e+0&1LXm!V zE>2jmSL(7a88C-dxdLpJj4)CFm>0@$rl&te9u@YO>>uz`xyP7^*R!8E zo9*XKg9v=vIAmJ!1F2&! 
zo)`B!`&!MOy49^LCysC2enKDBu)AG|B+qxX^B<^<^(o!!+<&VmTH@5fq@`sdf_19subsud zilE_l>m*%(U;TKdXthys+x&jIzv!pBvcCAx#*Hm43)B7jQ0KVMI3HZDRy}`pF8}>}_sDJH}4qi#~K;97{jcnboWOX}1NZsU?)2M-n_8AuPa|x{?A2s0`?g+8% z38aH6V))&i!54Fa?D;>`oxD>0yBk)=YOI?`aQ)L|L})BL!1xYmikds$j0P)21qBv< z*Y#V-u#B%L?5EqQ(KD^#4vAozBJ7kjpOzbTFYCSI|n%R8DfRq#>diJ(oHk9 zZ7tdI(NmGwTI07opxDjxyHga(oR4R?S$?H{n{Jt#eJV5;J5k?}cYc6A%w!O7hsg&7 zH{ZETkhh)Vlh%D*MG_!P?Tm=%zjqKZG(=*_^5TX%kIU6AMmgW%RK>e?B_qzvrRWvO zt>doF!pQ7FzSq(5P zAf$p67yHNDXD)S*0>^;YepvDBgGTrL8zI#d=ejJh9bR@T_TFm^hPvx*6F)OQ5JLtd zUp8{$XZX;7x6rFjr^_}NZ*&E1trVuZ+S}8|0PmuQ{ZkCT)H~4PHr1h6SaLLg;J;k$ zKD0|m?f$$B3)CTL}u?7E>DYbC#69dK^ zgEX>?*~_1(5S!Pggkmf zB!>?@TuPI8yd%cyIR6~_&wct24fkAtKV4PSu>i({cl$*CuGh1TzqGJ1`;&R2J=61i zGSgZIae2QuO>JHs&>D8??H6muVWk0F?MAmeG+FECIv9hH4>RveD_6@#oE7?!C-`ev z2Vq-uzh90F&lU>oBwE{_f*EJR%XvpN`U{UU03pkqv5-Zrt*G>)wkp!v%VHaTsJBWo zqJe3F9OhqF8i;C?+yUM}e{7cMP?Q$D|3OXUVSn>#r9Yi0n&-Rh#34ogBg3BUfikVa zP;yk%I*yTsHje{iRH{DW8a}VVe0oL@T!{`gJbRWOy7r4)_O4^~+o4DAObgP;>Vr)w z$pSkIjIVm9w5kzhYlx2dE(3yMbeYTX59F{Hu_NckG`4yYVe< ziXA6vy3vQ;%SYR(a%3p&_Rv!F%pd!&1(`4GMdD^E_xRrn(w?8AmF^rA0g_R|@b#ak zWVPC0zLs&UYy!cxTT(LOEECX0;01DWfd_}av9Jn!eJ{Ag)N!3@ECjUy?*5TIa z96UTHA5NgWLk=1x{hHtt#&XB^^OO5>3_I>1CoH2rBf!XG5f#?6D3;A;cE{6gLW-xY zua$Wagy*&hJK?Aw{yF@iS{BUTlf{Xw8&^4$l=9P+*r3JshcBITMy4rmObcg)8JWul z4yA^pXPoBk!spA2!rM-TlK>gghNE^LxW3l6AO2yvRF5Q0`tHsp4;%UC_7Fh%44H6{ z;nt{y(@#LV*tQ(R6`t%~O^U%eKimu>AAv-8f1Lp*#kJ9aWfYu2!`ESh5O!0!w!BSu z$yA@oKVPa+wI@h@PF8HDMXVoRdQ5H->%Xm203Q!)sVbb&K6QnO{Y6&JoPX^MIA4Yo zl3iKL7q%${bX5Y9*@0r6kK=K41QVCp{^=sirn;;^c!YqjW{2i8gj`-FVuR~1a5OZj zhhh_ww~=i~(yG0Sq^FpB7HFl>AAW#*hghcuPdns)1e}zP?hLIC_?EybAK(gD#eq(* z1*H9wY(1#>NF$Tuzf=MD;Q+fAkXbq_*WKFj@T9`I!3JJ`Q+p%1`sDPbg?Jeiol|#1EtAa|J(ghl)i!hzR9IJN+%*20jId@iX^sGJ(^B@{{qWx(rn8 zo&$jWztN-zU%o^TT7w&q)>fk?=FoWALx37b;gl;Nn;qT`uCVZ33+AIkz{-bSmBn*w zoV%)hbfmi3!6PPX#fr{@rXo8UDO@>2Nn~UKsQ&P83!vzshu_?st*EN6f%hoK24`pw zN!*VR_p7+xKcFI}Ui}v#aek&zyR=Jh+f#fH=e!@?Kd@_z92UYCf|dzvJNloWszYIn z+@l(8H`9twp%T4@_&OUSv0ae)gZmpjC0;9uxMl~)k>l5gKfd2(w~Q4&Dn>#1V%$t; z?FPUNd}nu2PqJ%gL3?NtJt(0^)<1>YZIKTf?q&Zw!lHheS)(q)i&kCb6Pw*R^l!L& z#W^f40xSp;o4GW$C+=FE=`k6%yO@=or=4@eAt5%R`45&F6e?^Zc7HTy1$N#P@9CCIqQ1UefWAZ(Avq zgCsjY6$R|;TY~~b262s%b##%-koI#ja!NKU3a4}o4(^*QeC6`fiV%M4o4U`%gW+;# zwmOGRR@K+K{pAD1SVM>y79J`ActK7FBH;6RFigxu=7YSmi5-&swUEO(^6W&1lL zqrgD4PK1A#@gJnkhBD;psHHtnO{nap`#Ia#u=H$?Jph1!97?aZ&A z-N%0J-?T&)JXRr9AYc-kblxS)HWj%x@#CFY=%QclN^M?d-{bymtS%7R0h7Yk?QkNf zhM2rLq>CQ@mG@tEbBdij@Ns}T9)j?ZZ)=CobewhK$F zZ_A*y4W~(uM&COd0wBt-JrCsM*QWT4jF$EDl3*8c9JZ7qZYb=Otp3Qyp)8SZ>wcH- zHi@GvKApNkd6W$gd0pOh-HZj7ach+j5)mk&WCprea->iFlZ5m-F!zQ#U>>icKT1A~ zaHX^80{<>a*kWy|Wu4TAgs8x>GF+;g81`g4qf5%qUpXAR%7BgO;?mgi>dr}d* zb2kvE(R5_$1Qq4I7UI(b*v}luIo~DQ5zcYq`wxRCCz6-=KT-GS-iLLcy{j)xKnB0b1WJWd>Z#q^6u3u(1;eAjw^Umg>~WYt>5Q;=2`#I~ zYvZ<2&BT;y=%eLy+0>C%eF&tnAwd2JYIzxUe-8gBb#K_kY2*bqe3)O((Xw*WHem|s z((K^!HQOMj$R$f?-J5`Itm|xmv5yUYag7^_nv$>h5Qg_(w>yB*w40zP%t-}C_e#j{ z+U_8c$n7%9&wP9;=v}E?p0PQ%xVvPX`*Bo~ri;`wcm7bC)oi+(rxojub6Dp_*H+y^7iy6&t;AxNWf&bW{Vy_?S8nf3j^ezjz6?uY0^hqI!;Vln|)2IkCX8Ga7+io{7|YmTiNDkL^>n>m+OpWMIVB}FBf17eB5yT zHdxa&JaStW>Yh6t{Mo-#_gtmL&#}7RHg1Z!@iz6m@r8Ur4JSZZx~?9=P;**>0cKcI zU>!c(qpfGN)Qj>k)u@9c`$wLoG|LIr4C@l6(NB(5sx*|xurnE2|-EC{O2 z%vH8(!7lWhllN2|Jdp5pR+0P&d(%t&IWy7p zYon2$fE|NdUofxUdJx){H@3>xn<+7Dp1e*Psm%A7+=F46OwS>`#O@`c#xxQaEy?g+ zUXhWi$7MpTZU6T`O`OfLzk--QQc;}HVw$x65|L1`OPN0^8VB7BdPY zNl-qWIKz)l=I?J+rs}(#+KID@#sxY^5TBc+#clh;u>9rbZ;?$e1weVnI>i^SyXQ98 zu)aS3+S+OX*gfcDAz`&LVX})hi%(_`OljJ03+HGbQ@6oPYJyx=aK&u^v{6b;5sTA^ 
zbY{01V|fAG%3yZHp%srI}dq5*w}p6cXX4(+$-wS-IJKnwSd}5dFHc)r7aN2?eC>kG~QUp>i8@;&Ak)EybLz}%+`u?_yO49K4{gt!< zqx3-1S7!HK`2X#9-uSpc=67sw@L)!(OSVMi_Whw|LhTN%Bj_p=Tr`4m$RC-t*vng1 zZ*o5?#40DCODw4@l6@s4kngM3;y--Hdcs@K5?^QoI5a?3=6B5&xL4k-$E3-F3e_*O zU&vJZW>mv3WssHS(?a`zBGHWo|BfVYVUjx+I@|02p46k@-FdUjT{{4M@}^y@oy$f& zN$~|2rk47O^65|GS@xZEGICex>hqDHyxN6R^O1XnF2sC(t8mNprV=UE*Ge>6Uk0VG*Bh z{aQyrBNXs~tj><_lnj@H06Hiw_X^0;#;KfsDClSM2rIu=@;%9@YnU(>!e;tI z;l1;gYM0q((rxr%9uPdXCWnp5X&_=3$e!$a;W5Hq1B6P7)TX^=>#hXU-@Z6``c=`7 zaYU*|cgQ?LXy)aWC4w?^ZA7fl|8!kb%b%Wq=IlWkc0_g zl{?w}qJ4TC4dj>bbZ04s4+W#8)uX((AK+n~vOr|>e_vPT+(TABMyx_?+7*FuKIOGz zAkKN3h@YBcQiW!`DnbQx_jv~5xWI|0fefac+s?dn?21q7F^%EhEI@~BDe`?EdmHzC zF~tV*MIEamYWwe9A(xt`uIQ&vRlB?FRdk#!o-dOFUAjp?v3fLgss;u+i}cTDBl67e zjNi`s36BPL^mWhNOJTip-Lt#}ih8o2p$}L$P|~9EdoA zOk95%wEXm;`^Bkw&z~QkIJT`M+7@laZ3ovSG?L2X=I>0(u}+zH%~Q`0zUQ_|Yafv< z=n9xuH@@H+`s9bjkgTI|0SfLqbf|r^@O$!oyqep|Pbx4LkcxzQ^S;iX>u*@!qF>9x zac2wJfPg`K3}ir(C+d;b8=@V<1lfMQCyRq|24Cw2sIY_ijyTkEE;kwD!r{ zF6Tqo)dfF+*^?XTN@_FQ&xIK8_@2d_p$kg^&}5w9tY`K7(NKuB>qPb0vsOJ$=B{_F z0D|`&*bq09J)YG7q84m z6C4_`nHc37ZJYMyB-E2ZZV?mJF%mHn&;3Yf6!Zbz(gF)1dwY5vL!)x@^6g*kVN% zDD3LhD`p(mJECt|cW|Rc^U_zmqc>ljzYaJC2>>clsi5inE)NDb1-gdku2YvAq$`gy zTEET!8B4J_>Wha)0;#Xngt3K5#wy)fk5>vMd~3J(El; zdjMJvgb~y`YOs{j^$b#QzBXw049>v>y3#gyCJeitSeBqiYtcKsO+ceVo(<9ynVWj0 zL+aCrJ*np(KxG?N4Mg)wb67Sz*MK!5P(5A*J5Pyp1c$D!6N-eVOSXAy}?QJ=zyts7*%ln8Y6BcA$%7FRY zn`dWHrFw)r^^Yxu@!VEGNG#fdWuZa}E>Ga-Z)utSLc_8Z_7kcZWR2}C-KRpF4rn^X za%bE^*!2n;r_jh$r04pU`00U=ZLHho(WTpVr3kHuLYzuN4hSh-1w2;)$bc6VqmIjQ z$3;9Z626aXN`7x>OgK}2&9>9$a?Mv)OytddO)~tesb!rld{1r#bcSvNaAP|pFG$lf z5*H!};rbplmJRRMC-$BjtSMy20)>_$Xltq|!ZO#Vc{uXZ%caO#kYmC9{rS=bF~Ux( zr|>84SBf8}p2+WnEM~{F!_%}|DUes3V!fc{PgxvO?YUWYtxpu zJt?tLmAs(~`k}Kv6)NZG4z5xTl09=M6ICq|5y%v^qNa)0iY@0_JuE*eP|RL7Bz=&q z>C7jwMpIc>4H{&4lKx!dMvL{|?1%$BGi9##Sju}8@&M!vfu70>@JFkM zVAGep&)$P1z`b|X=jGdIJXihdCwKcB!m)tp=Oa-#p&zsKm6Qq6kH+YxM$Y%#;p&1= z{5Mq3!~zIfk@eXB=dttM#n)^-SYIo<%cG-kRejf^@o~Y2@3~>` zmTg-G9gQEU#PYHd3STVW68>1}x`lhjufQewmM6essN}5V+mVG^QfoBZ;)T1I6aU6z z@JH;x9lcxa*=^SX=fMFV6#PpGdEUV^d3lOB`_n9xvbVOQlv}&`k`Z>?;k_2o-j+z* zN)WFkY^Rv5G4AOoOZ!nY7b1h}yS^iT3Y9(5bw(7(nfU(_!mP%cH}`|mK7r_6ET`rj zcth)XUY5Ezgj?v(T?s+!u=KYw#eM}|5PwAqA6m3=hfe$FESgi3_GuBKYX*dJfX01s z$Xw=>GWCRfaB}^6p4sHC-eam_7OzQjPrNAg?BtBAXPJ|ab8qkC?<{1{Dp^CzWIcV} zk3liEq2AR3AkV${x!}nw_Tz=AU#H$4PnW;23QBq%K!{G2ZImk$a+ zUscFA52LFTcrok~f3dSEFrV3W@zs($2mq z{$QTC)jppkbo4+ra~iNKt^1eB!%zLKRGeAJPptt&BDVZAZs`;|<9kl-%Ey8P++HoJ zJD@}uLY$=S*Bf%q+tK5IQ~YY$q>AJ?XYWbJcE2@noOC;@yOD0;dZR1; zB1%VMcoQ%rQ;*)J@5E%u&kZvw*LK2CJWmi>y43;!i-Wjn%=45VXkqxxHKmoH&HET( ze<=oP!Qp2nidBSLnmqeK_t4iy^x7Eum;3|V^$ITY# z)i?GHD3VC_YL7{!hC?;>J7OUmc;_V<$;&`C5WMWXK2|%fJYE@ci*R5zsb{*0N6H_> zG`J&<2U>M?E!?{3kZpS4bL|$`* zY=+C99GBy9s^bceabD9MOV@DiUa7B^6wSQQGI|qquk~nBY~Vxt0a(vi&j7-EMvES3 zoDs^sjl8A9a(%zF4`TlO;*}2wtm53}pXMhzvkzLbwpFn=i4~nc45?K9a%N7?Vfk$H zg1x9+89v>?@z7A!{0k<0Uk;9UWOyR>aX;C20BO1~L?ymVOgD8J0)svR zP)qvHT#hhJwa#`uX`ZUF)7Cub9b!fcLx;kif9*5r=i15M9NLZfI#k2wAwINNcfOL* zKks>9_Zur-{HPViZ_YTkfoe`+hX9ca(d_VAs)n#vLwv7nK`=>$Fjinl>PCAWSP}z` zx-$`jGHlldN#~+)NcsaER(zrFkD%RSw5ZS%J9i`*)uV}0%hI&{;YWdRNbi3zfXXIyT0`51*0-8>54nUhqd*$!Wa9O1ALNz|qY$2?cXH8= zc2ue`#5Rz=80}=E0AXhgarW)+4JZQ%yJV>$8HEQxzBcc(nTaIvd2-YmEn9$9WtHZ@ z4u8GQBZnb751T|Bf*OE?9RJcN+@DkKvZK|Nh%*2^GOh&+j0?k>6+ZC=US8WDx=8iwI)=};PN1eKx#jSrT4Q|U}?tq&wDcXB+fvjPy z%Zp~o!$0kQ@#?$Dy_SOOKty{=z2y(4t&a2P?rl~mFOA0$6BPP^2OnEcQpp{se?{Ge z`bUQq@#6%$6_sH)x(k)!Xi^;D_gDlwu{d6y zpj75+(>|l_vK6^-f&+pul5OWB`h}Hv%!L`3S*N;g83A$(OkkjN+WtuA=?A-MfZZ4D 
zRIuV>e-%+CLqcpfD7;Fa)A8U)70GwJk541s@;*PKb!`MtA4pURYX0;?lcTo9!n1~0ROj{4vP{m(AJWsP3@cSwcy0nOgh95Y~2}a zye>=*L@3U+hDR<<9?nZFp;vNEqrTel$eByfN@Uo~u^^o7< zeX&9mr_1yC*Vzu{q73Y+Yh8;SHkcU8iRaC5dauPJ16s8IJOy}5bGLt$L;0q3vd-Qo zQHf8LMXVe{&ev}ztv8iDZsWk~E99~Q1`r;F+8^S)(xpT2MV#VI13;y(ZWJs$2#e^H z$2ydwO8@rYNB1)rS=5<17m3Grel2`v!OB*=^cyY1MhbT% zm>FxewFMHn0+56*f}J8Z(kwnt9KrZz<-p3uNBPhg;J7+Z%i*1B3lD_$6G+qf*G2-J zbaJ}b@v*63Q^%&OIoLt~^~*RdmxH_^13Gj3UU=(LGl3l^#ETa5I|$k**MTA{UOwG| z#qmk>L0Tp_@MOlqjQiGZN?GFi9QYZc(uelBq{^QJBI_9ei$Px<&dxP5i!w1sx^t8~ z7~3qrejcZTJ%kB!0^MC6b)0X|DVX?-Q3hXPUh`l#%}xRQRdSka)%HF3au{Ghyvq3Djyc^zeACSa)vYyO(5^<=!QMy z#JR6&cIonaIGLYLj7<*H(I!FI^UiqJ(Zadc-%|DuII&Zl+|{%JE?Mi8m>F=Ix+H0XUvB^ zdWOg}>1etkkO+@}L^Sq;lM(rwy&pudw=oWk5ijMp&Ukqmy_IPOJFawP_``OxvnchlfzeYwM?xayOW5Sbl6SA>fviRs zjcotQ?io8$=vzJKu%X^V=T<+sZ1kY#^NBB&1IBu|&_VO!Jdjc#tK)0KHsFXBZ}tbT zm&1wM?{|^tiiB5e3_|@{wfCadJAcmC23(!N1x4tYZn3)LmU8!uya6GaER@H85zJUr zu;caP!O|0Gv8g;QSkjuqYlw>N84zL%Xp>uh$zA)?kHUp-ILNB9?;DAP^&~!(9%*O| zG8`m9k$~Yc3p|7F&N?gJov(fwnchTF8lnQCGvR#M!V~0>41_;@>%;EC%}r{3V31Bl zC!pe>BIEod!5WNMct5CxtyJ$mT){ra{VhD9ZKAJ38Ko^+PgBWAVkYV?&*nPnx`EC9 zs<9>AXP37$PdswkDm~1Y{25LF1n9*TqEX;ahFOxykjnYPVD6!#hRDr1qs`MY#XJhp zLAz-4D~lXdEOL1#O~CPX_-r3rE^UwhfFi96Woo`NR_uzkiifU$DG$}&-R7}hT}x+ZhOdG2 ztOToPIwef$mRvy}_Xe9DT))s>NJ1it;59CUHt{Ic2)=puK-NcQ z^5@f^Q3WkL#fr5L>xx6E6yO97hE=z_^C#jvlipM4VkAZ_>aEO$^1Lj=MH}E?lu+QJ zM}U{|TM*OJYdEILvg}_oC?xZExMxVlfIO+Sd=Lto7Bku@e!9R7RxYtXm*FL7c;M%{ zl=x;!GL0SSKb?w_j)OxUxtg5@Y-&AjT!4(gI)e~-w zPL}l6j~54QZ!g?jysyPo9ey*lu1A{gQ}JyxIKPoTbc^`jZ0&nZnbz+m&Z+&_o(A>S z7R^Iy?Lmh?gB=1RoL(*3OB(NlXhvTNR9!2gk)6bM19wqIYvL`USD& z;|U~w=MaRF)oyfAzP#Xi#RAO(_Hekg&El^{y31I($pO%o`Bf4r-!0sc zxV%XH7rc9Vcxg^GgL2K)vCepI;e}L1b-OFUuKlYs<&(V05>f4gz^%F*^Vx*?Ec6@! z`P}PWQGT@B#CFQ|ryci~VW9!>ti7;b7ub>D2Fs+r?m9)`G;3iM88G;9S**nY8afu> zp4bJ@D_ifqW`g?8I50El*TN zZH=A~miE(43mJ4WTW)&5hs-w5%K<9(-+8%oy5-x)z4F$KD+DR`G5jkc{`Eow` zgAR+;-Z$#vXgk+?Ogr)}Dq1tbIJ-b?gU?^?Pk#FJ4}Ed9j1wYnP8x z8K9sQ8>(~7CxynI#UeSI{&Mt5sC;z=HG2+J(5C

[GIT binary patch data omitted]
diff --git a/examples/.gitignore b/examples/.gitignore
new file mode 100644
index 0000000000..d3c22099a5
--- /dev/null
+++ b/examples/.gitignore
@@ -0,0 +1,2 @@
+column_families_example
+simple_example
diff --git a/examples/Makefile b/examples/Makefile
new file mode 100644
index 0000000000..2567fdf864
--- /dev/null
+++ b/examples/Makefile
@@ -0,0 +1,9 @@
+include ../build_config.mk
+
+all: simple_example column_families_example
+
+simple_example: simple_example.cc
+	$(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++11 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS)
+
+column_families_example: column_families_example.cc
+	$(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++11 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS)
diff --git a/examples/README.md b/examples/README.md
new file mode 100644
index 0000000000..b07b3903a6
--- /dev/null
+++ b/examples/README.md
@@ -0,0 +1 @@
+Compile RocksDB first by executing `make static_lib` in parent dir
diff --git a/examples/column_families_example.cc b/examples/column_families_example.cc
new file mode 100644
index 0000000000..2bdf6ec42a
--- /dev/null
+++ b/examples/column_families_example.cc
@@ -0,0 +1,72 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+#include <cstdio>
+#include <string>
+#include <vector>
+
+#include "rocksdb/db.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/options.h"
+
+using namespace rocksdb;
+
+std::string kDBPath = "/tmp/rocksdb_column_families_example";
+
+int main() {
+  // open DB
+  Options options;
+  options.create_if_missing = true;
+  DB* db;
+  Status s = DB::Open(options, kDBPath, &db);
+  assert(s.ok());
+
+  // create column family
+  ColumnFamilyHandle* cf;
+  s = db->CreateColumnFamily(ColumnFamilyOptions(), "new_cf", &cf);
+  assert(s.ok());
+
+  // close DB
+  delete cf;
+  delete db;
+
+  // open DB with two column families
+  std::vector<ColumnFamilyDescriptor> column_families;
+  // have to open default column family
+  column_families.push_back(ColumnFamilyDescriptor(
+      kDefaultColumnFamilyName, ColumnFamilyOptions()));
+  // open the new one, too
+  column_families.push_back(ColumnFamilyDescriptor(
+      "new_cf", ColumnFamilyOptions()));
+  std::vector<ColumnFamilyHandle*> handles;
+  s = DB::Open(DBOptions(), kDBPath, column_families, &handles, &db);
+  assert(s.ok());
+
+  // put and get from non-default column family
+  s = db->Put(WriteOptions(), handles[1], Slice("key"), Slice("value"));
+  assert(s.ok());
+  std::string value;
+  s = db->Get(ReadOptions(), handles[1], Slice("key"), &value);
+  assert(s.ok());
+
+  // atomic write
+  WriteBatch batch;
+  batch.Put(handles[0], Slice("key2"), Slice("value2"));
+  batch.Put(handles[1], Slice("key3"), Slice("value3"));
+  batch.Delete(handles[0], Slice("key"));
+  s = db->Write(WriteOptions(), &batch);
+  assert(s.ok());
+
+  // drop column family
+  s = db->DropColumnFamily(handles[1]);
+  assert(s.ok());
+
+  // close db
+  for (auto handle : handles) {
+    delete handle;
+  }
+  delete db;
+
+  return 0;
+}
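The example above checks every call with assert(), which compiles away under
NDEBUG. A minimal sketch (not part of the patch) of the explicit error
handling that could replace any of the assert(s.ok()) lines:

    // Report the failure carried by the rocksdb::Status instead of asserting.
    if (!s.ok()) {
      fprintf(stderr, "operation failed: %s\n", s.ToString().c_str());
      return 1;
    }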
diff --git a/examples/simple_example.cc b/examples/simple_example.cc
new file mode 100644
index 0000000000..20e7faa4b0
--- /dev/null
+++ b/examples/simple_example.cc
@@ -0,0 +1,41 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+#include <cstdio>
+#include <string>
+
+#include "rocksdb/db.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/options.h"
+
+using namespace rocksdb;
+
+std::string kDBPath = "/tmp/rocksdb_simple_example";
+
+int main() {
+  DB* db;
+  Options options;
+  // Optimize RocksDB. This is the easiest way to get RocksDB to perform well
+  options.IncreaseParallelism();
+  options.OptimizeLevelStyleCompaction();
+  // create the DB if it's not already present
+  options.create_if_missing = true;
+
+  // open DB
+  Status s = DB::Open(options, kDBPath, &db);
+  assert(s.ok());
+
+  // Put key-value
+  s = db->Put(WriteOptions(), "key", "value");
+  assert(s.ok());
+  std::string value;
+  // get value
+  s = db->Get(ReadOptions(), "key", &value);
+  assert(s.ok());
+  assert(value == "value");
+
+  delete db;
+
+  return 0;
+}
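One wrinkle the simple example glosses over: Get() reports a missing key
through the returned Status, not through the output parameter. A short
sketch (not part of the patch; assumes db is an open rocksdb::DB*):

    std::string value;
    rocksdb::Status s = db->Get(rocksdb::ReadOptions(), "no_such_key", &value);
    if (s.IsNotFound()) {
      // The key is absent; this is not an I/O error.
    } else if (!s.ok()) {
      fprintf(stderr, "Get failed: %s\n", s.ToString().c_str());
    }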
diff --git a/hdfs/README b/hdfs/README
new file mode 100644
index 0000000000..f4f1106e45
--- /dev/null
+++ b/hdfs/README
@@ -0,0 +1,23 @@
+This directory contains the hdfs extensions needed to make rocksdb store
+files in HDFS.
+
+It has been compiled and tested against CDH 4.4 (2.0.0+1475-1.cdh4.4.0.p0.23~precise-cdh4.4.0).
+
+The configuration assumes that packages libhdfs0, libhdfs0-dev are
+installed, which basically means that hdfs.h is in /usr/include and libhdfs in /usr/lib
+
+The env_hdfs.h file defines the rocksdb objects that are needed to talk to an
+underlying filesystem.
+
+If you want to compile rocksdb with hdfs support, please set the following
+environment variables appropriately (also defined in setup.sh for convenience)
+  USE_HDFS=1
+  JAVA_HOME=/usr/local/jdk-6u22-64
+  LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/jdk-6u22-64/jre/lib/amd64/server:/usr/local/jdk-6u22-64/jre/lib/amd64/:./snappy/libs
+  make clean all db_bench
+
+To run db_bench,
+  set CLASSPATH to include your hadoop distribution
+  db_bench --hdfs="hdfs://hbaseudbperf001.snc1.facebook.com:9000"
+
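To make the connection between this README and the header below concrete:
installing HdfsEnv as the database Env is what routes all file I/O to HDFS.
An illustrative sketch only -- it assumes a build with USE_HDFS=1, and the
namenode address and DB path are invented:

    #include "hdfs/env_hdfs.h"

    rocksdb::Options options;
    options.env = new rocksdb::HdfsEnv("hdfs://namenode:9000/");  // assumed host:port
    options.create_if_missing = true;
    rocksdb::DB* db;
    rocksdb::Status s = rocksdb::DB::Open(options, "/rocksdb_test", &db);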
diff --git a/hdfs/env_hdfs.h b/hdfs/env_hdfs.h
new file mode 100644
index 0000000000..5e7de77d30
--- /dev/null
+++ b/hdfs/env_hdfs.h
@@ -0,0 +1,327 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+
+#pragma once
+#include <algorithm>
+#include <stdio.h>
+#include <sys/time.h>
+#include <time.h>
+#include <iostream>
+#include "rocksdb/env.h"
+#include "rocksdb/status.h"
+
+#ifdef USE_HDFS
+#include <hdfs.h>
+
+namespace rocksdb {
+
+// Thrown during execution when there is an issue with the supplied
+// arguments.
+class HdfsUsageException : public std::exception { };
+
+// A simple exception that indicates something went wrong that is not
+// recoverable.  The intention is for the message to be printed (with
+// nothing else) and the process terminate.
+class HdfsFatalException : public std::exception {
+public:
+  explicit HdfsFatalException(const std::string& s) : what_(s) { }
+  virtual ~HdfsFatalException() throw() { }
+  virtual const char* what() const throw() {
+    return what_.c_str();
+  }
+private:
+  const std::string what_;
+};
+
+//
+// The HDFS environment for rocksdb. This class overrides all the
+// file/dir access methods and delegates the thread-mgmt methods to the
+// default posix environment.
+//
+class HdfsEnv : public Env {
+
+ public:
+  explicit HdfsEnv(const std::string& fsname) : fsname_(fsname) {
+    posixEnv = Env::Default();
+    fileSys_ = connectToPath(fsname_);
+  }
+
+  virtual ~HdfsEnv() {
+    fprintf(stderr, "Destroying HdfsEnv::Default()\n");
+    hdfsDisconnect(fileSys_);
+  }
+
+  virtual Status NewSequentialFile(const std::string& fname,
+                                   std::unique_ptr<SequentialFile>* result,
+                                   const EnvOptions& options);
+
+  virtual Status NewRandomAccessFile(const std::string& fname,
+                                     std::unique_ptr<RandomAccessFile>* result,
+                                     const EnvOptions& options);
+
+  virtual Status NewWritableFile(const std::string& fname,
+                                 std::unique_ptr<WritableFile>* result,
+                                 const EnvOptions& options);
+
+  virtual Status NewRandomRWFile(const std::string& fname,
+                                 std::unique_ptr<RandomRWFile>* result,
+                                 const EnvOptions& options);
+
+  virtual Status NewDirectory(const std::string& name,
+                              std::unique_ptr<Directory>* result);
+
+  virtual bool FileExists(const std::string& fname);
+
+  virtual Status GetChildren(const std::string& path,
+                             std::vector<std::string>* result);
+
+  virtual Status DeleteFile(const std::string& fname);
+
+  virtual Status CreateDir(const std::string& name);
+
+  virtual Status CreateDirIfMissing(const std::string& name);
+
+  virtual Status DeleteDir(const std::string& name);
+
+  virtual Status GetFileSize(const std::string& fname, uint64_t* size);
+
+  virtual Status GetFileModificationTime(const std::string& fname,
+                                         uint64_t* file_mtime);
+
+  virtual Status RenameFile(const std::string& src, const std::string& target);
+
+  virtual Status LockFile(const std::string& fname, FileLock** lock);
+
+  virtual Status UnlockFile(FileLock* lock);
+
+  virtual Status NewLogger(const std::string& fname,
+                           std::shared_ptr<Logger>* result);
+
+  virtual void Schedule(void (*function)(void* arg), void* arg,
+                        Priority pri = LOW) {
+    posixEnv->Schedule(function, arg, pri);
+  }
+
+  virtual void StartThread(void (*function)(void* arg), void* arg) {
+    posixEnv->StartThread(function, arg);
+  }
+
+  virtual void WaitForJoin() { posixEnv->WaitForJoin(); }
+
+  virtual unsigned int GetThreadPoolQueueLen(Priority pri = LOW) const
+      override {
+    return posixEnv->GetThreadPoolQueueLen(pri);
+  }
+
+  virtual Status GetTestDirectory(std::string* path) {
+    return posixEnv->GetTestDirectory(path);
+  }
+
+  virtual uint64_t NowMicros() {
+    return posixEnv->NowMicros();
+  }
+
+  virtual void SleepForMicroseconds(int micros) {
+    posixEnv->SleepForMicroseconds(micros);
+  }
+
+  virtual Status GetHostName(char* name, uint64_t len) {
+    return posixEnv->GetHostName(name, len);
+  }
+
+  virtual Status GetCurrentTime(int64_t* unix_time) {
+    return posixEnv->GetCurrentTime(unix_time);
+  }
+
+  virtual Status GetAbsolutePath(const std::string& db_path,
+                                 std::string* output_path) {
+    return posixEnv->GetAbsolutePath(db_path, output_path);
+  }
+
+  virtual void SetBackgroundThreads(int number, Priority pri = LOW) {
+    posixEnv->SetBackgroundThreads(number, pri);
+  }
+
+  virtual std::string TimeToString(uint64_t number) {
+    return posixEnv->TimeToString(number);
+  }
+
+  static uint64_t gettid() {
+    assert(sizeof(pthread_t) <= sizeof(uint64_t));
+    return (uint64_t)pthread_self();
+  }
+
+ private:
+  std::string fsname_;  // string of the form "hdfs://hostname:port/"
+  hdfsFS fileSys_;      // a single FileSystem object for all files
+  Env* posixEnv;        // This object is derived from Env, but not from
+                        // posixEnv. We have posixEnv as an encapsulated
+                        // object here so that we can use posix timers,
+                        // posix threads, etc.
+
+  static const std::string kProto;
+  static const std::string pathsep;
+
+  /**
+   * If the URI is specified of the form hdfs://server:port/path,
+   * then connect to the specified cluster
+   * else connect to default.
+   */
+  hdfsFS connectToPath(const std::string& uri) {
+    if (uri.empty()) {
+      return nullptr;
+    }
+    if (uri.find(kProto) != 0) {
+      // uri doesn't start with hdfs:// -> use default:0, which is special
+      // to libhdfs.
+      return hdfsConnectNewInstance("default", 0);
+    }
+    const std::string hostport = uri.substr(kProto.length());
+
+    std::vector<std::string> parts;
+    split(hostport, ':', parts);
+    if (parts.size() != 2) {
+      throw HdfsFatalException("Bad uri for hdfs " + uri);
+    }
+    // parts[0] = hosts, parts[1] = port/xxx/yyy
+    std::string host(parts[0]);
+    std::string remaining(parts[1]);
+
+    int rem = remaining.find(pathsep);
+    std::string portStr = (rem == 0 ? remaining :
+                           remaining.substr(0, rem));
+
+    tPort port;
+    port = atoi(portStr.c_str());
+    if (port == 0) {
+      throw HdfsFatalException("Bad host-port for hdfs " + uri);
+    }
+    hdfsFS fs = hdfsConnectNewInstance(host.c_str(), port);
+    return fs;
+  }
+
+  void split(const std::string &s, char delim,
+             std::vector<std::string> &elems) {
+    elems.clear();
+    size_t prev = 0;
+    size_t pos = s.find(delim);
+    while (pos != std::string::npos) {
+      elems.push_back(s.substr(prev, pos - prev));
+      prev = pos + 1;
+      pos = s.find(delim, prev);
+    }
+    elems.push_back(s.substr(prev, s.size()));
+  }
+};
+
+}  // namespace rocksdb
+
+#else // USE_HDFS
+
+
+namespace rocksdb {
+
+static const Status notsup;
+
+class HdfsEnv : public Env {
+
+ public:
+  explicit HdfsEnv(const std::string& fsname) {
+    fprintf(stderr, "You have not built rocksdb with HDFS support\n");
+    fprintf(stderr, "Please see hdfs/README for details\n");
+    throw std::exception();
+  }
+
+  virtual ~HdfsEnv() {
+  }
+
+  virtual Status NewSequentialFile(const std::string& fname,
+                                   unique_ptr<SequentialFile>* result,
+                                   const EnvOptions& options);
+
+  virtual Status NewRandomAccessFile(const std::string& fname,
+                                     unique_ptr<RandomAccessFile>* result,
+                                     const EnvOptions& options) {
+    return notsup;
+  }
+
+  virtual Status NewWritableFile(const std::string& fname,
+                                 unique_ptr<WritableFile>* result,
+                                 const EnvOptions& options) {
+    return notsup;
+  }
+
+  virtual Status NewRandomRWFile(const std::string& fname,
+                                 unique_ptr<RandomRWFile>* result,
+                                 const EnvOptions& options) {
+    return notsup;
+  }
+
+  virtual Status NewDirectory(const std::string& name,
+                              unique_ptr<Directory>* result) {
+    return notsup;
+  }
+
+  virtual bool FileExists(const std::string& fname){return false;}
+
+  virtual Status GetChildren(const std::string& path,
+                             std::vector<std::string>* result){return notsup;}
+
+  virtual Status DeleteFile(const std::string& fname){return notsup;}
+
+  virtual Status CreateDir(const std::string& name){return notsup;}
+
+  virtual Status CreateDirIfMissing(const std::string& name){return notsup;}
+
+  virtual Status DeleteDir(const std::string& name){return notsup;}
+
+  virtual Status GetFileSize(const std::string& fname, uint64_t* size){return notsup;}
+
+  virtual Status GetFileModificationTime(const std::string& fname,
+                                         uint64_t* time) {
+    return notsup;
+  }
+
+  virtual Status RenameFile(const std::string& src, const std::string& target){return notsup;}
+
+  virtual Status LockFile(const std::string& fname, FileLock** lock){return notsup;}
+
+  virtual Status UnlockFile(FileLock* lock){return notsup;}
+
+  virtual Status NewLogger(const std::string& fname,
+                           shared_ptr<Logger>* result){return notsup;}
+
+  virtual void Schedule(void (*function)(void* arg), void* arg,
+                        Priority pri = LOW) {}
+  virtual void StartThread(void (*function)(void* arg), void* arg) {}
+
+  virtual void WaitForJoin() {}
+
+  virtual unsigned int GetThreadPoolQueueLen(Priority pri = LOW) const {
+    return 0;
+  }
+
+  virtual Status GetTestDirectory(std::string* path) {return notsup;}
+
+  virtual uint64_t NowMicros() {return 0;}
+
+  virtual void SleepForMicroseconds(int micros) {}
+
+  virtual Status GetHostName(char* name, uint64_t len) {return notsup;}
+
+  virtual Status GetCurrentTime(int64_t* unix_time) {return notsup;}
+
+  virtual Status GetAbsolutePath(const std::string& db_path,
+                                 std::string* outputpath) {return notsup;}
+
+  virtual void SetBackgroundThreads(int number, Priority pri = LOW) {}
+
+  virtual std::string TimeToString(uint64_t number) { return "";}
+};
+}
+
+#endif // USE_HDFS
diff --git a/hdfs/setup.sh b/hdfs/setup.sh
new file mode 100644
index 0000000000..ac69b525df
--- /dev/null
+++ b/hdfs/setup.sh
@@ -0,0 +1,7 @@
+export USE_HDFS=1
+export LD_LIBRARY_PATH=$JAVA_HOME/jre/lib/amd64/server:$JAVA_HOME/jre/lib/amd64:/usr/lib/hadoop/lib/native
+
+export CLASSPATH=
+for f in `find /usr/lib/hadoop-hdfs | grep jar`; do export CLASSPATH=$CLASSPATH:$f; done
+for f in `find /usr/lib/hadoop | grep jar`; do export CLASSPATH=$CLASSPATH:$f; done
+for f in `find /usr/lib/hadoop/client | grep jar`; do export CLASSPATH=$CLASSPATH:$f; done
diff --git a/helpers/memenv/memenv.cc b/helpers/memenv/memenv.cc
new file mode 100644
index 0000000000..185e7d822b
--- /dev/null
+++ b/helpers/memenv/memenv.cc
@@ -0,0 +1,395 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "rocksdb/env.h"
+#include "rocksdb/status.h"
+#include "port/port.h"
+#include "util/mutexlock.h"
+#include <map>
+#include <string.h>
+#include <string>
+#include <vector>
+
+namespace rocksdb {
+
+namespace {
+
+class FileState {
+ public:
+  // FileStates are reference counted. The initial reference count is zero
+  // and the caller must call Ref() at least once.
+  FileState() : refs_(0), size_(0) {}
+
+  // Increase the reference count.
+  void Ref() {
+    MutexLock lock(&refs_mutex_);
+    ++refs_;
+  }
+
+  // Decrease the reference count. Delete if this is the last reference.
+  void Unref() {
+    bool do_delete = false;
+
+    {
+      MutexLock lock(&refs_mutex_);
+      --refs_;
+      assert(refs_ >= 0);
+      if (refs_ <= 0) {
+        do_delete = true;
+      }
+    }
+
+    if (do_delete) {
+      delete this;
+    }
+  }
+
+  uint64_t Size() const { return size_; }
+
+  Status Read(uint64_t offset, size_t n, Slice* result, char* scratch) const {
+    if (offset > size_) {
+      return Status::IOError("Offset greater than file size.");
+    }
+    const uint64_t available = size_ - offset;
+    if (n > available) {
+      n = available;
+    }
+    if (n == 0) {
+      *result = Slice();
+      return Status::OK();
+    }
+
+    size_t block = offset / kBlockSize;
+    size_t block_offset = offset % kBlockSize;
+
+    if (n <= kBlockSize - block_offset) {
+      // The requested bytes are all in the first block.
+      *result = Slice(blocks_[block] + block_offset, n);
+      return Status::OK();
+    }
+
+    size_t bytes_to_copy = n;
+    char* dst = scratch;
+
+    while (bytes_to_copy > 0) {
+      size_t avail = kBlockSize - block_offset;
+      if (avail > bytes_to_copy) {
+        avail = bytes_to_copy;
+      }
+      memcpy(dst, blocks_[block] + block_offset, avail);
+
+      bytes_to_copy -= avail;
+      dst += avail;
+      block++;
+      block_offset = 0;
+    }
+
+    *result = Slice(scratch, n);
+    return Status::OK();
+  }
+
+  Status Append(const Slice& data) {
+    const char* src = data.data();
+    size_t src_len = data.size();
+
+    while (src_len > 0) {
+      size_t avail;
+      size_t offset = size_ % kBlockSize;
+
+      if (offset != 0) {
+        // There is some room in the last block.
+        avail = kBlockSize - offset;
+      } else {
+        // No room in the last block; push new one.
+        blocks_.push_back(new char[kBlockSize]);
+        avail = kBlockSize;
+      }
+
+      if (avail > src_len) {
+        avail = src_len;
+      }
+      memcpy(blocks_.back() + offset, src, avail);
+      src_len -= avail;
+      src += avail;
+      size_ += avail;
+    }
+
+    return Status::OK();
+  }
+
+ private:
+  // Private since only Unref() should be used to delete it.
+  ~FileState() {
+    for (std::vector<char*>::iterator i = blocks_.begin(); i != blocks_.end();
+         ++i) {
+      delete [] *i;
+    }
+  }
+
+  // No copying allowed.
+  FileState(const FileState&);
+  void operator=(const FileState&);
+
+  port::Mutex refs_mutex_;
+  int refs_;  // Protected by refs_mutex_;
+
+  // The following fields are not protected by any mutex. They are only mutable
+  // while the file is being written, and concurrent access is not allowed
+  // to writable files.
+  std::vector<char*> blocks_;
+  uint64_t size_;
+
+  enum { kBlockSize = 8 * 1024 };
+};
+
+class SequentialFileImpl : public SequentialFile {
+ public:
+  explicit SequentialFileImpl(FileState* file) : file_(file), pos_(0) {
+    file_->Ref();
+  }
+
+  ~SequentialFileImpl() {
+    file_->Unref();
+  }
+
+  virtual Status Read(size_t n, Slice* result, char* scratch) {
+    Status s = file_->Read(pos_, n, result, scratch);
+    if (s.ok()) {
+      pos_ += result->size();
+    }
+    return s;
+  }
+
+  virtual Status Skip(uint64_t n) {
+    if (pos_ > file_->Size()) {
+      return Status::IOError("pos_ > file_->Size()");
+    }
+    const size_t available = file_->Size() - pos_;
+    if (n > available) {
+      n = available;
+    }
+    pos_ += n;
+    return Status::OK();
+  }
+
+ private:
+  FileState* file_;
+  size_t pos_;
+};
+
+class RandomAccessFileImpl : public RandomAccessFile {
+ public:
+  explicit RandomAccessFileImpl(FileState* file) : file_(file) {
+    file_->Ref();
+  }
+
+  ~RandomAccessFileImpl() {
+    file_->Unref();
+  }
+
+  virtual Status Read(uint64_t offset, size_t n, Slice* result,
+                      char* scratch) const {
+    return file_->Read(offset, n, result, scratch);
+  }
+
+ private:
+  FileState* file_;
+};
+
+class WritableFileImpl : public WritableFile {
+ public:
+  WritableFileImpl(FileState* file) : file_(file) {
+    file_->Ref();
+  }
+
+  ~WritableFileImpl() {
+    file_->Unref();
+  }
+
+  virtual Status Append(const Slice& data) {
+    return file_->Append(data);
+  }
+
+  virtual Status Close() { return Status::OK(); }
+  virtual Status Flush() { return Status::OK(); }
+  virtual Status Sync() { return Status::OK(); }
+
+ private:
+  FileState* file_;
+};
+
+class InMemoryDirectory : public Directory {
+ public:
+  virtual Status Fsync() { return Status::OK(); }
+};
+
+class InMemoryEnv : public EnvWrapper {
+ public:
+  explicit InMemoryEnv(Env* base_env) : EnvWrapper(base_env) { }
+
+  virtual ~InMemoryEnv() {
+    for (FileSystem::iterator i = file_map_.begin(); i != file_map_.end(); ++i){
+      i->second->Unref();
+    }
+  }
+
+  // Partial implementation of the Env interface.
+  virtual Status NewSequentialFile(const std::string& fname,
+                                   unique_ptr<SequentialFile>* result,
+                                   const EnvOptions& soptions) {
+    MutexLock lock(&mutex_);
+    if (file_map_.find(fname) == file_map_.end()) {
+      *result = NULL;
+      return Status::IOError(fname, "File not found");
+    }
+
+    result->reset(new SequentialFileImpl(file_map_[fname]));
+    return Status::OK();
+  }
+
+  virtual Status NewRandomAccessFile(const std::string& fname,
+                                     unique_ptr<RandomAccessFile>* result,
+                                     const EnvOptions& soptions) {
+    MutexLock lock(&mutex_);
+    if (file_map_.find(fname) == file_map_.end()) {
+      *result = NULL;
+      return Status::IOError(fname, "File not found");
+    }
+
+    result->reset(new RandomAccessFileImpl(file_map_[fname]));
+    return Status::OK();
+  }
+
+  virtual Status NewWritableFile(const std::string& fname,
+                                 unique_ptr<WritableFile>* result,
+                                 const EnvOptions& soptions) {
+    MutexLock lock(&mutex_);
+    if (file_map_.find(fname) != file_map_.end()) {
+      DeleteFileInternal(fname);
+    }
+
+    FileState* file = new FileState();
+    file->Ref();
+    file_map_[fname] = file;
+
+    result->reset(new WritableFileImpl(file));
+    return Status::OK();
+  }
+
+  virtual Status NewDirectory(const std::string& name,
+                              unique_ptr<Directory>* result) {
+    result->reset(new InMemoryDirectory());
+    return Status::OK();
+  }
+
+  virtual bool FileExists(const std::string& fname) {
+    MutexLock lock(&mutex_);
+    return file_map_.find(fname) != file_map_.end();
+  }
+
+  virtual Status GetChildren(const std::string& dir,
+                             std::vector<std::string>* result) {
+    MutexLock lock(&mutex_);
+    result->clear();
+
+    for (FileSystem::iterator i = file_map_.begin(); i != file_map_.end(); ++i){
+      const std::string& filename = i->first;
+
+      if (filename.size() >= dir.size() + 1 && filename[dir.size()] == '/' &&
+          Slice(filename).starts_with(Slice(dir))) {
+        result->push_back(filename.substr(dir.size() + 1));
+      }
+    }
+
+    return Status::OK();
+  }
+
+  void DeleteFileInternal(const std::string& fname) {
+    if (file_map_.find(fname) == file_map_.end()) {
+      return;
+    }
+
+    file_map_[fname]->Unref();
+    file_map_.erase(fname);
+  }
+
+  virtual Status DeleteFile(const std::string& fname) {
+    MutexLock lock(&mutex_);
+    if (file_map_.find(fname) == file_map_.end()) {
+      return Status::IOError(fname, "File not found");
+    }
+
+    DeleteFileInternal(fname);
+    return Status::OK();
+  }
+
+  virtual Status CreateDir(const std::string& dirname) {
+    return Status::OK();
+  }
+
+  virtual Status CreateDirIfMissing(const std::string& dirname) {
+    return Status::OK();
+  }
+
+  virtual Status DeleteDir(const std::string& dirname) {
+    return Status::OK();
+  }
+
+  virtual Status GetFileSize(const std::string& fname, uint64_t* file_size) {
+    MutexLock lock(&mutex_);
+    if (file_map_.find(fname) == file_map_.end()) {
+      return Status::IOError(fname, "File not found");
+    }
+
+    *file_size = file_map_[fname]->Size();
+    return Status::OK();
+  }
+
+  virtual Status GetFileModificationTime(const std::string& fname,
+                                         uint64_t* time) {
+    return Status::NotSupported("getFileMTime", "Not supported in MemEnv");
+  }
+
+  virtual Status RenameFile(const std::string& src,
+                            const std::string& target) {
+    MutexLock lock(&mutex_);
+    if (file_map_.find(src) == file_map_.end()) {
+      return Status::IOError(src, "File not found");
+    }
+
+    DeleteFileInternal(target);
+    file_map_[target] = file_map_[src];
+    file_map_.erase(src);
+    return Status::OK();
+  }
+
+  virtual Status LockFile(const std::string& fname, FileLock** lock) {
+    *lock = new FileLock;
+    return Status::OK();
+  }
+
+  virtual Status UnlockFile(FileLock* lock) {
+    delete lock;
+    return Status::OK();
+  }
+
+  virtual Status GetTestDirectory(std::string* path) {
+    *path = "/test";
+    return Status::OK();
+  }
+
+ private:
+  // Map from filenames to FileState objects, representing a simple file system.
+  typedef std::map<std::string, FileState*> FileSystem;
+  port::Mutex mutex_;
+  FileSystem file_map_;  // Protected by mutex_.
+};
+
+}  // namespace
+
+Env* NewMemEnv(Env* base_env) {
+  return new InMemoryEnv(base_env);
+}
+
+}  // namespace rocksdb
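NewMemEnv() is the only public entry point of this file; everything else
lives in the anonymous namespace. A short usage sketch (not part of the
patch; the memenv_test.cc DBTest below exercises the same pattern):

    // Run a DB entirely in memory by installing the wrapper Env.
    rocksdb::Env* mem_env = rocksdb::NewMemEnv(rocksdb::Env::Default());
    rocksdb::Options options;
    options.create_if_missing = true;
    options.env = mem_env;
    rocksdb::DB* db;
    rocksdb::Status s = rocksdb::DB::Open(options, "/dir/db", &db);
    // ... use db ...
    delete db;
    delete mem_env;  // deleting the wrapper does not touch the wrapped default Env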
diff --git a/helpers/memenv/memenv_test.cc b/helpers/memenv/memenv_test.cc
new file mode 100644
index 0000000000..ea3ed61a03
--- /dev/null
+++ b/helpers/memenv/memenv_test.cc
@@ -0,0 +1,231 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_impl.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "util/testharness.h"
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace rocksdb {
+
+class MemEnvTest {
+ public:
+  Env* env_;
+  const EnvOptions soptions_;
+
+  MemEnvTest()
+      : env_(NewMemEnv(Env::Default())) {
+  }
+  ~MemEnvTest() {
+    delete env_;
+  }
+};
+
+TEST(MemEnvTest, Basics) {
+  uint64_t file_size;
+  unique_ptr<WritableFile> writable_file;
+  std::vector<std::string> children;
+
+  ASSERT_OK(env_->CreateDir("/dir"));
+
+  // Check that the directory is empty.
+  ASSERT_TRUE(!env_->FileExists("/dir/non_existent"));
+  ASSERT_TRUE(!env_->GetFileSize("/dir/non_existent", &file_size).ok());
+  ASSERT_OK(env_->GetChildren("/dir", &children));
+  ASSERT_EQ(0U, children.size());
+
+  // Create a file.
+  ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file, soptions_));
+  writable_file.reset();
+
+  // Check that the file exists.
+  ASSERT_TRUE(env_->FileExists("/dir/f"));
+  ASSERT_OK(env_->GetFileSize("/dir/f", &file_size));
+  ASSERT_EQ(0U, file_size);
+  ASSERT_OK(env_->GetChildren("/dir", &children));
+  ASSERT_EQ(1U, children.size());
+  ASSERT_EQ("f", children[0]);
+
+  // Write to the file.
+  ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file, soptions_));
+  ASSERT_OK(writable_file->Append("abc"));
+  writable_file.reset();
+
+  // Check for expected size.
+  ASSERT_OK(env_->GetFileSize("/dir/f", &file_size));
+  ASSERT_EQ(3U, file_size);
+
+  // Check that renaming works.
+  ASSERT_TRUE(!env_->RenameFile("/dir/non_existent", "/dir/g").ok());
+  ASSERT_OK(env_->RenameFile("/dir/f", "/dir/g"));
+  ASSERT_TRUE(!env_->FileExists("/dir/f"));
+  ASSERT_TRUE(env_->FileExists("/dir/g"));
+  ASSERT_OK(env_->GetFileSize("/dir/g", &file_size));
+  ASSERT_EQ(3U, file_size);
+
+  // Check that opening non-existent file fails.
+  unique_ptr<SequentialFile> seq_file;
+  unique_ptr<RandomAccessFile> rand_file;
+  ASSERT_TRUE(!env_->NewSequentialFile("/dir/non_existent", &seq_file,
+                                       soptions_).ok());
+  ASSERT_TRUE(!seq_file);
+  ASSERT_TRUE(!env_->NewRandomAccessFile("/dir/non_existent", &rand_file,
+                                         soptions_).ok());
+  ASSERT_TRUE(!rand_file);
+
+  // Check that deleting works.
+  ASSERT_TRUE(!env_->DeleteFile("/dir/non_existent").ok());
+  ASSERT_OK(env_->DeleteFile("/dir/g"));
+  ASSERT_TRUE(!env_->FileExists("/dir/g"));
+  ASSERT_OK(env_->GetChildren("/dir", &children));
+  ASSERT_EQ(0U, children.size());
+  ASSERT_OK(env_->DeleteDir("/dir"));
+}
+
+TEST(MemEnvTest, ReadWrite) {
+  unique_ptr<WritableFile> writable_file;
+  unique_ptr<SequentialFile> seq_file;
+  unique_ptr<RandomAccessFile> rand_file;
+  Slice result;
+  char scratch[100];
+
+  ASSERT_OK(env_->CreateDir("/dir"));
+
+  ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file, soptions_));
+  ASSERT_OK(writable_file->Append("hello "));
+  ASSERT_OK(writable_file->Append("world"));
+  writable_file.reset();
+
+  // Read sequentially.
+  ASSERT_OK(env_->NewSequentialFile("/dir/f", &seq_file, soptions_));
+  ASSERT_OK(seq_file->Read(5, &result, scratch));  // Read "hello".
+  ASSERT_EQ(0, result.compare("hello"));
+  ASSERT_OK(seq_file->Skip(1));
+  ASSERT_OK(seq_file->Read(1000, &result, scratch));  // Read "world".
+  ASSERT_EQ(0, result.compare("world"));
+  ASSERT_OK(seq_file->Read(1000, &result, scratch));  // Try reading past EOF.
+  ASSERT_EQ(0U, result.size());
+  ASSERT_OK(seq_file->Skip(100));  // Try to skip past end of file.
+  ASSERT_OK(seq_file->Read(1000, &result, scratch));
+  ASSERT_EQ(0U, result.size());
+
+  // Random reads.
+  ASSERT_OK(env_->NewRandomAccessFile("/dir/f", &rand_file, soptions_));
+  ASSERT_OK(rand_file->Read(6, 5, &result, scratch));  // Read "world".
+  ASSERT_EQ(0, result.compare("world"));
+  ASSERT_OK(rand_file->Read(0, 5, &result, scratch));  // Read "hello".
+  ASSERT_EQ(0, result.compare("hello"));
+  ASSERT_OK(rand_file->Read(10, 100, &result, scratch));  // Read "d".
+  ASSERT_EQ(0, result.compare("d"));
+
+  // Too high offset.
+  ASSERT_TRUE(!rand_file->Read(1000, 5, &result, scratch).ok());
+}
+
+TEST(MemEnvTest, Locks) {
+  FileLock* lock;
+
+  // These are no-ops, but we test they return success.
+  ASSERT_OK(env_->LockFile("some file", &lock));
+  ASSERT_OK(env_->UnlockFile(lock));
+}
+
+TEST(MemEnvTest, Misc) {
+  std::string test_dir;
+  ASSERT_OK(env_->GetTestDirectory(&test_dir));
+  ASSERT_TRUE(!test_dir.empty());
+
+  unique_ptr<WritableFile> writable_file;
+  ASSERT_OK(env_->NewWritableFile("/a/b", &writable_file, soptions_));
+
+  // These are no-ops, but we test they return success.
+  ASSERT_OK(writable_file->Sync());
+  ASSERT_OK(writable_file->Flush());
+  ASSERT_OK(writable_file->Close());
+  writable_file.reset();
+}
+
+TEST(MemEnvTest, LargeWrite) {
+  const size_t kWriteSize = 300 * 1024;
+  char* scratch = new char[kWriteSize * 2];
+
+  std::string write_data;
+  for (size_t i = 0; i < kWriteSize; ++i) {
+    write_data.append(1, static_cast<char>(i));
+  }
+
+  unique_ptr<WritableFile> writable_file;
+  ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file, soptions_));
+  ASSERT_OK(writable_file->Append("foo"));
+  ASSERT_OK(writable_file->Append(write_data));
+  writable_file.reset();
+
+  unique_ptr<SequentialFile> seq_file;
+  Slice result;
+  ASSERT_OK(env_->NewSequentialFile("/dir/f", &seq_file, soptions_));
+  ASSERT_OK(seq_file->Read(3, &result, scratch));  // Read "foo".
+  ASSERT_EQ(0, result.compare("foo"));
+
+  size_t read = 0;
+  std::string read_data;
+  while (read < kWriteSize) {
+    ASSERT_OK(seq_file->Read(kWriteSize - read, &result, scratch));
+    read_data.append(result.data(), result.size());
+    read += result.size();
+  }
+  ASSERT_TRUE(write_data == read_data);
+  delete [] scratch;
+}
+
+TEST(MemEnvTest, DBTest) {
+  Options options;
+  options.create_if_missing = true;
+  options.env = env_;
+  DB* db;
+
+  const Slice keys[] = {Slice("aaa"), Slice("bbb"), Slice("ccc")};
+  const Slice vals[] = {Slice("foo"), Slice("bar"), Slice("baz")};
+
+  ASSERT_OK(DB::Open(options, "/dir/db", &db));
+  for (size_t i = 0; i < 3; ++i) {
+    ASSERT_OK(db->Put(WriteOptions(), keys[i], vals[i]));
+  }
+
+  for (size_t i = 0; i < 3; ++i) {
+    std::string res;
+    ASSERT_OK(db->Get(ReadOptions(), keys[i], &res));
+    ASSERT_TRUE(res == vals[i]);
+  }
+
+  Iterator* iterator = db->NewIterator(ReadOptions());
+  iterator->SeekToFirst();
+  for (size_t i = 0; i < 3; ++i) {
+    ASSERT_TRUE(iterator->Valid());
+    ASSERT_TRUE(keys[i] == iterator->key());
+    ASSERT_TRUE(vals[i] == iterator->value());
+    iterator->Next();
+  }
+  ASSERT_TRUE(!iterator->Valid());
+  delete iterator;
+
+  DBImpl* dbi = reinterpret_cast<DBImpl*>(db);
+  ASSERT_OK(dbi->TEST_FlushMemTable());
+
+  for (size_t i = 0; i < 3; ++i) {
+    std::string res;
+    ASSERT_OK(db->Get(ReadOptions(), keys[i], &res));
+    ASSERT_TRUE(res == vals[i]);
+  }
+
+  delete db;
+}
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  return rocksdb::test::RunAllTests();
+}
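The last file in this section adds the C API. Its header comment spells out
the error convention (errors come back as a malloc()ed string through a
char** errptr); a sketch of the resulting call pattern, assuming options
and path were set up earlier:

    char* err = NULL;
    rocksdb_t* db = rocksdb_open(options, path, &err);
    if (err != NULL) {
      fprintf(stderr, "open failed: %s\n", err);
      free(err);  /* error strings are malloc()ed by the library */
      err = NULL;
    }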
diff --git a/include/rocksdb/c.h b/include/rocksdb/c.h
new file mode 100644
index 0000000000..013ee5d2ab
--- /dev/null
+++ b/include/rocksdb/c.h
@@ -0,0 +1,575 @@
+/* Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+   This source code is licensed under the BSD-style license found in the
+   LICENSE file in the root directory of this source tree. An additional grant
+   of patent rights can be found in the PATENTS file in the same directory.
+   Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+   Use of this source code is governed by a BSD-style license that can be
+   found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+  C bindings for leveldb.  May be useful as a stable ABI that can be
+  used by programs that keep leveldb in a shared library, or for
+  a JNI api.
+
+  Does not support:
+  . getters for the option types
+  . custom comparators that implement key shortening
+  . capturing post-write-snapshot
+  . custom iter, db, env, cache implementations using just the C bindings
+
+  Some conventions:
+
+  (1) We expose just opaque struct pointers and functions to clients.
+  This allows us to change internal representations without having to
+  recompile clients.
+
+  (2) For simplicity, there is no equivalent to the Slice type.  Instead,
+  the caller has to pass the pointer and length as separate
+  arguments.
+
+  (3) Errors are represented by a null-terminated c string.  NULL
+  means no error.  All operations that can raise an error are passed
+  a "char** errptr" as the last argument.  One of the following must
+  be true on entry:
+     *errptr == NULL
+     *errptr points to a malloc()ed null-terminated error message
+  On success, a leveldb routine leaves *errptr unchanged.
+  On failure, leveldb frees the old value of *errptr and
+  sets *errptr to a malloc()ed error message.
+
+  (4) Bools have the type unsigned char (0 == false; rest == true)
+
+  (5) All of the pointer arguments must be non-NULL.
+*/
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_C_H_
+#define STORAGE_ROCKSDB_INCLUDE_C_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdarg.h>
+#include <stddef.h>
+#include <stdint.h>
+
+/* Exported types */
+
+typedef struct rocksdb_t rocksdb_t;
+typedef struct rocksdb_cache_t rocksdb_cache_t;
+typedef struct rocksdb_comparator_t rocksdb_comparator_t;
+typedef struct rocksdb_env_t rocksdb_env_t;
+typedef struct rocksdb_filelock_t rocksdb_filelock_t;
+typedef struct rocksdb_filterpolicy_t rocksdb_filterpolicy_t;
+typedef struct rocksdb_flushoptions_t rocksdb_flushoptions_t;
+typedef struct rocksdb_iterator_t rocksdb_iterator_t;
+typedef struct rocksdb_logger_t rocksdb_logger_t;
+typedef struct rocksdb_mergeoperator_t rocksdb_mergeoperator_t;
+typedef struct rocksdb_options_t rocksdb_options_t;
+typedef struct rocksdb_randomfile_t rocksdb_randomfile_t;
+typedef struct rocksdb_readoptions_t rocksdb_readoptions_t;
+typedef struct rocksdb_seqfile_t rocksdb_seqfile_t;
+typedef struct rocksdb_slicetransform_t rocksdb_slicetransform_t;
+typedef struct rocksdb_snapshot_t rocksdb_snapshot_t;
+typedef struct rocksdb_writablefile_t rocksdb_writablefile_t;
+typedef struct rocksdb_writebatch_t rocksdb_writebatch_t;
+typedef struct rocksdb_writeoptions_t rocksdb_writeoptions_t;
+typedef struct rocksdb_universal_compaction_options_t rocksdb_universal_compaction_options_t;
+typedef struct rocksdb_livefiles_t rocksdb_livefiles_t;
+
+/* DB operations */
+
+extern rocksdb_t* rocksdb_open(
+    const rocksdb_options_t* options,
+    const char* name,
+    char** errptr);
+
+extern rocksdb_t* rocksdb_open_for_read_only(
+    const rocksdb_options_t* options,
+    const char* name,
+    unsigned char error_if_log_file_exist,
+    char** errptr);
+
+extern void rocksdb_close(rocksdb_t* db);
+
+extern void rocksdb_put(
+    rocksdb_t* db,
+    const rocksdb_writeoptions_t* options,
+    const char* key, size_t keylen,
+    const char* val, size_t vallen,
+    char** errptr);
+
+extern void rocksdb_delete(
+    rocksdb_t* db,
+    const rocksdb_writeoptions_t* options,
+    const char* key, size_t keylen,
+    char** errptr);
+
+extern void rocksdb_merge(
+    rocksdb_t* db,
+    const rocksdb_writeoptions_t* options,
+    const char* key, size_t keylen,
+    const char* val, size_t vallen,
+    char** errptr);
+
+extern void rocksdb_write(
+    rocksdb_t* db,
+    const rocksdb_writeoptions_t* options,
+    rocksdb_writebatch_t* batch,
+    char** errptr);
+
+/* Returns NULL if not found.  A malloc()ed array otherwise.
+   Stores the length of the array in *vallen. */
+extern char* rocksdb_get(
+    rocksdb_t* db,
+    const rocksdb_readoptions_t* options,
+    const char* key, size_t keylen,
+    size_t* vallen,
+    char** errptr);
+
+extern rocksdb_iterator_t* rocksdb_create_iterator(
+    rocksdb_t* db,
+    const rocksdb_readoptions_t* options);
+
+extern const rocksdb_snapshot_t* rocksdb_create_snapshot(
+    rocksdb_t* db);
+
+extern void rocksdb_release_snapshot(
+    rocksdb_t* db,
+    const rocksdb_snapshot_t* snapshot);
+
+/* Returns NULL if property name is unknown.
+   Else returns a pointer to a malloc()-ed null-terminated value. */
+extern char* rocksdb_property_value(
+    rocksdb_t* db,
+    const char* propname);
+
+extern void rocksdb_approximate_sizes(
+    rocksdb_t* db,
+    int num_ranges,
+    const char* const* range_start_key, const size_t* range_start_key_len,
+    const char* const* range_limit_key, const size_t* range_limit_key_len,
+    uint64_t* sizes);
+
+extern void rocksdb_compact_range(
+    rocksdb_t* db,
+    const char* start_key, size_t start_key_len,
+    const char* limit_key, size_t limit_key_len);
+
+extern void rocksdb_delete_file(
+    rocksdb_t* db,
+    const char* name);
+
+extern const rocksdb_livefiles_t* rocksdb_livefiles(
+    rocksdb_t* db);
+
+extern void rocksdb_flush(
+    rocksdb_t* db,
+    const rocksdb_flushoptions_t* options,
+    char** errptr);
+
+extern void rocksdb_disable_file_deletions(
+    rocksdb_t* db,
+    char** errptr);
+
+extern void rocksdb_enable_file_deletions(
+    rocksdb_t* db,
+    unsigned char force,
+    char** errptr);
+
+/* Management operations */
+
+extern void rocksdb_destroy_db(
+    const rocksdb_options_t* options,
+    const char* name,
+    char** errptr);
+
+extern void rocksdb_repair_db(
+    const rocksdb_options_t* options,
+    const char* name,
+    char** errptr);
+
+/* Iterator */
+
+extern void rocksdb_iter_destroy(rocksdb_iterator_t*);
+extern unsigned char rocksdb_iter_valid(const rocksdb_iterator_t*);
+extern void rocksdb_iter_seek_to_first(rocksdb_iterator_t*);
+extern void rocksdb_iter_seek_to_last(rocksdb_iterator_t*);
+extern void rocksdb_iter_seek(rocksdb_iterator_t*, const char* k, size_t klen);
+extern void rocksdb_iter_next(rocksdb_iterator_t*);
+extern void rocksdb_iter_prev(rocksdb_iterator_t*);
+extern const char* rocksdb_iter_key(const rocksdb_iterator_t*, size_t* klen);
+extern const char* rocksdb_iter_value(const rocksdb_iterator_t*, size_t* vlen);
+extern void rocksdb_iter_get_error(const rocksdb_iterator_t*, char** errptr);
+
+/* Write batch */
+
+extern rocksdb_writebatch_t* rocksdb_writebatch_create();
+extern void rocksdb_writebatch_destroy(rocksdb_writebatch_t*);
+extern void rocksdb_writebatch_clear(rocksdb_writebatch_t*);
+extern int rocksdb_writebatch_count(rocksdb_writebatch_t*);
+extern void rocksdb_writebatch_put(
+    rocksdb_writebatch_t*,
+    const char* key, size_t klen,
+    const char* val, size_t vlen);
+extern void rocksdb_writebatch_merge(
+    rocksdb_writebatch_t*,
+    const char* key, size_t klen,
+    const char* val, size_t vlen);
+extern void rocksdb_writebatch_delete(
+    rocksdb_writebatch_t*,
+    const char* key, size_t klen);
+extern void rocksdb_writebatch_iterate(
+    rocksdb_writebatch_t*,
+    void* state,
+    void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen),
+    void (*deleted)(void*, const char* k, size_t klen));
+extern const char* rocksdb_writebatch_data(rocksdb_writebatch_t*, size_t *size);
+
+/* Options */
+
+extern rocksdb_options_t* rocksdb_options_create();
+extern void rocksdb_options_destroy(rocksdb_options_t*);
+extern void rocksdb_options_set_comparator(
+    rocksdb_options_t*,
+    rocksdb_comparator_t*);
+extern void rocksdb_options_set_merge_operator(rocksdb_options_t*,
+                                               rocksdb_mergeoperator_t*);
+extern void rocksdb_options_set_compression_per_level(
+    rocksdb_options_t* opt,
+    int* level_values,
+    size_t num_levels);
+extern void rocksdb_options_set_filter_policy(
+    rocksdb_options_t*,
+    rocksdb_filterpolicy_t*);
+extern void rocksdb_options_set_create_if_missing(
+    rocksdb_options_t*, unsigned char);
+extern void rocksdb_options_set_error_if_exists(
+    rocksdb_options_t*, unsigned char);
+extern void rocksdb_options_set_paranoid_checks(
+    rocksdb_options_t*, unsigned char);
+extern void rocksdb_options_set_env(rocksdb_options_t*, rocksdb_env_t*);
+extern void rocksdb_options_set_info_log(rocksdb_options_t*, rocksdb_logger_t*);
+extern void rocksdb_options_set_info_log_level(rocksdb_options_t*, int);
+extern void rocksdb_options_set_write_buffer_size(rocksdb_options_t*, size_t);
+extern void rocksdb_options_set_max_open_files(rocksdb_options_t*, int);
+extern void rocksdb_options_set_cache(rocksdb_options_t*, rocksdb_cache_t*);
+extern void rocksdb_options_set_cache_compressed(rocksdb_options_t*, rocksdb_cache_t*);
+extern void rocksdb_options_set_block_size(rocksdb_options_t*, size_t);
+extern void rocksdb_options_set_block_restart_interval(rocksdb_options_t*, int);
+extern void rocksdb_options_set_compression_options(
+    rocksdb_options_t*, int, int, int);
+extern void rocksdb_options_set_whole_key_filtering(rocksdb_options_t*, unsigned char);
+extern void rocksdb_options_set_prefix_extractor(
+    rocksdb_options_t*, rocksdb_slicetransform_t*);
+extern void rocksdb_options_set_num_levels(rocksdb_options_t*, int);
+extern void rocksdb_options_set_level0_file_num_compaction_trigger(
+    rocksdb_options_t*, int);
+extern void rocksdb_options_set_level0_slowdown_writes_trigger(
+    rocksdb_options_t*, int);
+extern void rocksdb_options_set_level0_stop_writes_trigger(
+    rocksdb_options_t*, int);
+extern void rocksdb_options_set_max_mem_compaction_level(
+    rocksdb_options_t*, int);
+extern void rocksdb_options_set_target_file_size_base(
+    rocksdb_options_t*, uint64_t);
+extern void rocksdb_options_set_target_file_size_multiplier(
+    rocksdb_options_t*, int);
+extern void rocksdb_options_set_max_bytes_for_level_base(
+    rocksdb_options_t*, uint64_t);
+extern void rocksdb_options_set_max_bytes_for_level_multiplier(
+    rocksdb_options_t*, int);
+extern void rocksdb_options_set_expanded_compaction_factor(
+    rocksdb_options_t*, int);
+extern void rocksdb_options_set_max_grandparent_overlap_factor(
+    rocksdb_options_t*, int);
+extern void rocksdb_options_set_max_bytes_for_level_multiplier_additional(
+    rocksdb_options_t*, int* level_values, size_t num_levels);
+extern void rocksdb_options_enable_statistics(rocksdb_options_t*);
+
+extern void rocksdb_options_set_max_write_buffer_number(rocksdb_options_t*, int);
+extern void rocksdb_options_set_min_write_buffer_number_to_merge(rocksdb_options_t*, int);
+extern void rocksdb_options_set_max_background_compactions(rocksdb_options_t*, int);
+extern void rocksdb_options_set_max_background_flushes(rocksdb_options_t*, int);
+extern void rocksdb_options_set_max_log_file_size(rocksdb_options_t*, size_t);
+extern void rocksdb_options_set_log_file_time_to_roll(rocksdb_options_t*, size_t);
+extern void rocksdb_options_set_keep_log_file_num(rocksdb_options_t*, size_t);
+extern void rocksdb_options_set_soft_rate_limit(rocksdb_options_t*, double);
+extern void rocksdb_options_set_hard_rate_limit(rocksdb_options_t*, double);
+extern void rocksdb_options_set_rate_limit_delay_max_milliseconds(
+    rocksdb_options_t*, unsigned int);
+extern void rocksdb_options_set_max_manifest_file_size(
+    rocksdb_options_t*, size_t);
+extern void rocksdb_options_set_no_block_cache(
+    rocksdb_options_t*, unsigned char);
+extern void rocksdb_options_set_table_cache_numshardbits(
+    rocksdb_options_t*, int);
+extern void rocksdb_options_set_table_cache_remove_scan_count_limit(
+    rocksdb_options_t*, int);
+extern void rocksdb_options_set_arena_block_size(
+    rocksdb_options_t*, size_t);
+extern void rocksdb_options_set_use_fsync(
+    rocksdb_options_t*, int);
+extern void rocksdb_options_set_db_stats_log_interval(
+    rocksdb_options_t*, int);
+extern void rocksdb_options_set_db_log_dir(
+    rocksdb_options_t*, const char*);
+extern void rocksdb_options_set_wal_dir(
+    rocksdb_options_t*, const char*);
+extern void rocksdb_options_set_WAL_ttl_seconds(
+    rocksdb_options_t*, uint64_t);
+extern void rocksdb_options_set_WAL_size_limit_MB(
+    rocksdb_options_t*, uint64_t);
+extern void rocksdb_options_set_manifest_preallocation_size(
+    rocksdb_options_t*, size_t);
+extern void rocksdb_options_set_purge_redundant_kvs_while_flush(
+    rocksdb_options_t*, unsigned char);
+extern void rocksdb_options_set_allow_os_buffer(
+    rocksdb_options_t*, unsigned char);
+extern void rocksdb_options_set_allow_mmap_reads(
+    rocksdb_options_t*, unsigned char);
+extern void rocksdb_options_set_allow_mmap_writes(
+    rocksdb_options_t*, unsigned char);
+extern void rocksdb_options_set_is_fd_close_on_exec(
+    rocksdb_options_t*, unsigned char);
+extern void rocksdb_options_set_skip_log_error_on_recovery(
+    rocksdb_options_t*, unsigned char);
+extern void rocksdb_options_set_stats_dump_period_sec(
+    rocksdb_options_t*, unsigned int);
+extern void rocksdb_options_set_block_size_deviation(
+    rocksdb_options_t*, int);
+extern void rocksdb_options_set_advise_random_on_open(
+    rocksdb_options_t*, unsigned char);
+extern void rocksdb_options_set_access_hint_on_compaction_start(
+    rocksdb_options_t*, int);
+extern void rocksdb_options_set_use_adaptive_mutex(
+    rocksdb_options_t*, unsigned char);
+extern void rocksdb_options_set_bytes_per_sync(
+    rocksdb_options_t*, uint64_t);
+extern void rocksdb_options_set_verify_checksums_in_compaction(
+    rocksdb_options_t*, unsigned char);
+extern void rocksdb_options_set_filter_deletes(
+    rocksdb_options_t*, unsigned char);
+extern void rocksdb_options_set_max_sequential_skip_in_iterations(
+    rocksdb_options_t*, uint64_t);
+extern void rocksdb_options_set_disable_data_sync(rocksdb_options_t*, int);
+extern void rocksdb_options_set_disable_auto_compactions(rocksdb_options_t*, int);
+extern void rocksdb_options_set_disable_seek_compaction(rocksdb_options_t*, int);
+extern void rocksdb_options_set_delete_obsolete_files_period_micros(
+    rocksdb_options_t*, uint64_t);
+extern void rocksdb_options_set_source_compaction_factor(rocksdb_options_t*, int);
+extern void rocksdb_options_prepare_for_bulk_load(rocksdb_options_t*);
+extern void rocksdb_options_set_memtable_vector_rep(rocksdb_options_t*);
+extern void rocksdb_options_set_hash_skip_list_rep(rocksdb_options_t*, size_t, int32_t, int32_t);
+extern void rocksdb_options_set_hash_link_list_rep(rocksdb_options_t*, size_t);
+extern void rocksdb_options_set_plain_table_factory(rocksdb_options_t*, uint32_t, int, double, size_t);
+
+extern void rocksdb_options_set_min_level_to_compress(rocksdb_options_t* opt, int level);
+
+extern void rocksdb_options_set_memtable_prefix_bloom_bits(
+    rocksdb_options_t*, uint32_t);
+extern void rocksdb_options_set_memtable_prefix_bloom_probes(
+    rocksdb_options_t*, uint32_t);
+extern void rocksdb_options_set_max_successive_merges(
+    rocksdb_options_t*, size_t);
+extern void rocksdb_options_set_min_partial_merge_operands(
+    rocksdb_options_t*, uint32_t);
+extern void rocksdb_options_set_bloom_locality(
+    rocksdb_options_t*, uint32_t);
+extern void rocksdb_options_set_allow_thread_local(
+    rocksdb_options_t*, unsigned char);
+extern void rocksdb_options_set_inplace_update_support(
+    rocksdb_options_t*, unsigned char);
+extern void rocksdb_options_set_inplace_update_num_locks(
+    rocksdb_options_t*, size_t);
+
+enum {
+  rocksdb_no_compression = 0,
+  rocksdb_snappy_compression = 1,
+  rocksdb_zlib_compression = 2,
+  rocksdb_bz2_compression = 3,
+  rocksdb_lz4_compression = 4,
+  rocksdb_lz4hc_compression = 5
+};
+extern void rocksdb_options_set_compression(rocksdb_options_t*, int);
+
+enum {
+  rocksdb_level_compaction = 0,
+  rocksdb_universal_compaction = 1
+};
+extern void rocksdb_options_set_compaction_style(rocksdb_options_t*, int);
+extern void rocksdb_options_set_universal_compaction_options(rocksdb_options_t*, rocksdb_universal_compaction_options_t*);
+
+/* Comparator */
+
+extern rocksdb_comparator_t* rocksdb_comparator_create(
+    void* state,
+    void (*destructor)(void*),
+    int (*compare)(
+        void*,
+        const char* a, size_t alen,
+        const char* b, size_t blen),
+    const char* (*name)(void*));
+extern void rocksdb_comparator_destroy(rocksdb_comparator_t*);
+
+/* Filter policy */
+
+extern rocksdb_filterpolicy_t* rocksdb_filterpolicy_create(
+    void* state,
+    void (*destructor)(void*),
+    char* (*create_filter)(
+        void*,
+        const char* const* key_array, const size_t* key_length_array,
+        int num_keys,
+        size_t* filter_length),
+    unsigned char (*key_may_match)(
+        void*,
+        const char* key, size_t length,
+        const char* filter, size_t filter_length),
+    void (*delete_filter)(
+        void*,
+        const char* filter, size_t filter_length),
+    const char* (*name)(void*));
+extern void rocksdb_filterpolicy_destroy(rocksdb_filterpolicy_t*);
+
+extern rocksdb_filterpolicy_t* rocksdb_filterpolicy_create_bloom(
+    int bits_per_key);
+
+/* Merge Operator */
+
+extern rocksdb_mergeoperator_t* rocksdb_mergeoperator_create(
+    void* state,
+    void (*destructor)(void*),
+    char* (*full_merge)(
+        void*,
+        const char* key, size_t key_length,
+        const char* existing_value, size_t existing_value_length,
+        const char* const* operands_list, const size_t* operands_list_length,
+        int num_operands,
+        unsigned char* success, size_t* new_value_length),
+    char* (*partial_merge)(
+        void*,
+        const char* key, size_t key_length,
+        const char* const* operands_list, const size_t* operands_list_length,
+        int num_operands,
+        unsigned char* success, size_t* new_value_length),
+    void (*delete_value)(
+        void*,
+        const char* value, size_t value_length),
+    const char* (*name)(void*));
+extern void rocksdb_mergeoperator_destroy(rocksdb_mergeoperator_t*);
+
+/* Read options */
+
+extern rocksdb_readoptions_t* rocksdb_readoptions_create();
+extern void rocksdb_readoptions_destroy(rocksdb_readoptions_t*);
+extern void rocksdb_readoptions_set_verify_checksums(
+    rocksdb_readoptions_t*,
+    unsigned char);
+extern void rocksdb_readoptions_set_fill_cache(
+    rocksdb_readoptions_t*, unsigned char);
+extern void rocksdb_readoptions_set_snapshot(
+    rocksdb_readoptions_t*,
+    const rocksdb_snapshot_t*);
+extern void rocksdb_readoptions_set_read_tier(
+    rocksdb_readoptions_t*, int);
+extern void rocksdb_readoptions_set_tailing(
+    rocksdb_readoptions_t*, unsigned char);
+
+/* Write options */
+
+extern rocksdb_writeoptions_t* rocksdb_writeoptions_create();
+extern void rocksdb_writeoptions_destroy(rocksdb_writeoptions_t*);
+extern void rocksdb_writeoptions_set_sync(
+    rocksdb_writeoptions_t*, unsigned char);
+extern void rocksdb_writeoptions_disable_WAL(rocksdb_writeoptions_t* opt, int disable);
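
The callback-style constructors above are easiest to read with a concrete wiring. A sketch of a custom comparator registered through the C API (the names are hypothetical; the state pointer is unused, so the destructor is a no-op):

    #include <string.h>
    #include "rocksdb/c.h"

    /* Orders keys byte-wise; on a shared prefix, the shorter key sorts first. */
    static int example_compare(void* state, const char* a, size_t alen,
                               const char* b, size_t blen) {
      size_t n = (alen < blen) ? alen : blen;
      int r = memcmp(a, b, n);
      if (r == 0 && alen != blen) r = (alen < blen) ? -1 : 1;
      return r;
    }

    static const char* example_name(void* state) { return "example.bytewise"; }
    static void example_destructor(void* state) { }

    void install_comparator(rocksdb_options_t* options) {
      rocksdb_comparator_t* cmp = rocksdb_comparator_create(
          NULL, example_destructor, example_compare, example_name);
      rocksdb_options_set_comparator(options, cmp);
    }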
+
+/* Flush options */
+
+extern rocksdb_flushoptions_t* rocksdb_flushoptions_create();
+extern void rocksdb_flushoptions_destroy(rocksdb_flushoptions_t*);
+extern void rocksdb_flushoptions_set_wait(
+    rocksdb_flushoptions_t*, unsigned char);
+
+/* Cache */
+
+extern rocksdb_cache_t* rocksdb_cache_create_lru(size_t capacity);
+extern void rocksdb_cache_destroy(rocksdb_cache_t* cache);
+
+/* Env */
+
+extern rocksdb_env_t* rocksdb_create_default_env();
+extern void rocksdb_env_set_background_threads(rocksdb_env_t* env, int n);
+extern void rocksdb_env_set_high_priority_background_threads(rocksdb_env_t* env, int n);
+extern void rocksdb_env_destroy(rocksdb_env_t*);
+
+/* SliceTransform */
+
+extern rocksdb_slicetransform_t* rocksdb_slicetransform_create(
+    void* state,
+    void (*destructor)(void*),
+    char* (*transform)(
+        void*,
+        const char* key, size_t length,
+        size_t* dst_length),
+    unsigned char (*in_domain)(
+        void*,
+        const char* key, size_t length),
+    unsigned char (*in_range)(
+        void*,
+        const char* key, size_t length),
+    const char* (*name)(void*));
+extern rocksdb_slicetransform_t* rocksdb_slicetransform_create_fixed_prefix(size_t);
+extern void rocksdb_slicetransform_destroy(rocksdb_slicetransform_t*);
+
+/* Universal Compaction options */
+
+enum {
+  rocksdb_similar_size_compaction_stop_style = 0,
+  rocksdb_total_size_compaction_stop_style = 1
+};
+
+extern rocksdb_universal_compaction_options_t* rocksdb_universal_compaction_options_create();
+extern void rocksdb_universal_compaction_options_set_size_ratio(
+    rocksdb_universal_compaction_options_t*, int);
+extern void rocksdb_universal_compaction_options_set_min_merge_width(
+    rocksdb_universal_compaction_options_t*, int);
+extern void rocksdb_universal_compaction_options_set_max_merge_width(
+    rocksdb_universal_compaction_options_t*, int);
+extern void rocksdb_universal_compaction_options_set_max_size_amplification_percent(
+    rocksdb_universal_compaction_options_t*, int);
+extern void rocksdb_universal_compaction_options_set_compression_size_percent(
+    rocksdb_universal_compaction_options_t*, int);
+extern void rocksdb_universal_compaction_options_set_stop_style(
+    rocksdb_universal_compaction_options_t*, int);
+extern void rocksdb_universal_compaction_options_destroy(
+    rocksdb_universal_compaction_options_t*);
+
+extern int rocksdb_livefiles_count(
+    const rocksdb_livefiles_t*);
+extern const char* rocksdb_livefiles_name(
+    const rocksdb_livefiles_t*,
+    int index);
+extern int rocksdb_livefiles_level(
+    const rocksdb_livefiles_t*,
+    int index);
+extern size_t rocksdb_livefiles_size(
+    const rocksdb_livefiles_t*,
+    int index);
+extern const char* rocksdb_livefiles_smallestkey(
+    const rocksdb_livefiles_t*,
+    int index,
+    size_t* size);
+extern const char* rocksdb_livefiles_largestkey(
+    const rocksdb_livefiles_t*,
+    int index,
+    size_t* size);
+extern void rocksdb_livefiles_destroy(
+    const rocksdb_livefiles_t*);
+
+#ifdef __cplusplus
+}  /* end extern "C" */
+#endif
+
+#endif  /* STORAGE_ROCKSDB_INCLUDE_C_H_ */
diff --git a/include/rocksdb/cache.h b/include/rocksdb/cache.h
new file mode 100644
index 0000000000..65d44b6cbf
--- /dev/null
+++ b/include/rocksdb/cache.h
@@ -0,0 +1,140 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// A Cache is an interface that maps keys to values.  It has internal
+// synchronization and may be safely accessed concurrently from
+// multiple threads.  It may automatically evict entries to make room
+// for new entries.  Values have a specified charge against the cache
+// capacity.  For example, a cache where the values are variable
+// length strings may use the length of the string as the charge for
+// the string.
+//
+// A builtin cache implementation with a least-recently-used eviction
+// policy is provided.  Clients may use their own implementations if
+// they want something more sophisticated (like scan-resistance, a
+// custom eviction policy, variable cache sizing, etc.)
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_CACHE_H_
+#define STORAGE_ROCKSDB_INCLUDE_CACHE_H_
+
+#include <memory>
+#include <stdint.h>
+#include "rocksdb/slice.h"
+
+namespace rocksdb {
+
+using std::shared_ptr;
+
+class Cache;
+
+// Create a new cache with a fixed size capacity. The cache is sharded
+// to 2^numShardBits shards, by hash of the key. The total capacity
+// is divided and evenly assigned to each shard. Inside each shard,
+// eviction is done in two passes: first try to free space by evicting
+// entries that are among the least-recently-used removeScanCountLimit
+// entries and are not referenced by anything other than the cache itself,
+// in least-used order. If that does not free enough space, further evict
+// entries in least-used order.
+//
+// The functions without the parameter numShardBits and/or
+// removeScanCountLimit use default values. removeScanCountLimit's default
+// value is 0, which means a strict LRU order inside each shard.
+extern shared_ptr<Cache> NewLRUCache(size_t capacity);
+extern shared_ptr<Cache> NewLRUCache(size_t capacity, int numShardBits);
+extern shared_ptr<Cache> NewLRUCache(size_t capacity, int numShardBits,
+                                     int removeScanCountLimit);
+
+class Cache {
+ public:
+  Cache() { }
+
+  // Destroys all existing entries by calling the "deleter"
+  // function that was passed to the constructor.
+  virtual ~Cache();
+
+  // Opaque handle to an entry stored in the cache.
+  struct Handle { };
+
+  // Insert a mapping from key->value into the cache and assign it
+  // the specified charge against the total cache capacity.
+  //
+  // Returns a handle that corresponds to the mapping.  The caller
+  // must call this->Release(handle) when the returned mapping is no
+  // longer needed.
+  //
+  // When the inserted entry is no longer needed, the key and
+  // value will be passed to "deleter".
+  virtual Handle* Insert(const Slice& key, void* value, size_t charge,
+                         void (*deleter)(const Slice& key, void* value)) = 0;
+
+  // If the cache has no mapping for "key", returns nullptr.
+  //
+  // Else return a handle that corresponds to the mapping.  The caller
+  // must call this->Release(handle) when the returned mapping is no
+  // longer needed.
+  virtual Handle* Lookup(const Slice& key) = 0;
+
+  // Release a mapping returned by a previous Lookup().
+  // REQUIRES: handle must not have been released yet.
+  // REQUIRES: handle must have been returned by a method on *this.
+  virtual void Release(Handle* handle) = 0;
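
A short sketch of the Insert/Lookup/Release life cycle (names are hypothetical; the deleter runs when the entry is evicted or erased, and Value() is declared just below):

    #include <string>
    #include "rocksdb/cache.h"

    static void DeleteString(const rocksdb::Slice& key, void* value) {
      delete static_cast<std::string*>(value);
    }

    void CacheSketch() {
      std::shared_ptr<rocksdb::Cache> cache = rocksdb::NewLRUCache(1 << 20);
      // Charge the entry by the size of its payload.
      std::string* payload = new std::string("payload");
      rocksdb::Cache::Handle* h =
          cache->Insert("k", payload, payload->size(), DeleteString);
      cache->Release(h);

      h = cache->Lookup("k");
      if (h != nullptr) {
        std::string* v = static_cast<std::string*>(cache->Value(h));
        (void)v;            // use the value only while the handle is held
        cache->Release(h);  // every successful Lookup() needs a Release()
      }
    }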
+
+  // Return the value encapsulated in a handle returned by a
+  // successful Lookup().
+  // REQUIRES: handle must not have been released yet.
+  // REQUIRES: handle must have been returned by a method on *this.
+  virtual void* Value(Handle* handle) = 0;
+
+  // If the cache contains entry for key, erase it.  Note that the
+  // underlying entry will be kept around until all existing handles
+  // to it have been released.
+  virtual void Erase(const Slice& key) = 0;
+
+  // Return a new numeric id.  May be used by multiple clients who are
+  // sharing the same cache to partition the key space.  Typically the
+  // client will allocate a new id at startup and prepend the id to
+  // its cache keys.
+  virtual uint64_t NewId() = 0;
+
+  // Returns the maximum configured capacity of the cache.
+  virtual size_t GetCapacity() const = 0;
+
+  // Returns the memory size for the entries residing in the cache.
+  virtual size_t GetUsage() const = 0;
+
+  // Call this on shutdown if you want to speed it up. Cache will disown
+  // any underlying data and will not free it on delete. This call will leak
+  // memory - call this only if you're shutting down the process.
+  // Any attempts of using cache after this call will fail terribly.
+  // Always delete the DB object before calling this method!
+  virtual void DisownData() {
+    // default implementation is noop
+  }
+
+  // Apply callback to all entries in the cache.
+  // If thread_safe is true, it will also lock the accesses. Otherwise, it
+  // will access the cache without the lock held.
+  virtual void ApplyToAllCacheEntries(void (*callback)(void*, size_t),
+                                      bool thread_safe) = 0;
+
+ private:
+  void LRU_Remove(Handle* e);
+  void LRU_Append(Handle* e);
+  void Unref(Handle* e);
+
+  struct Rep;
+  Rep* rep_;
+
+  // No copying allowed
+  Cache(const Cache&);
+  void operator=(const Cache&);
+};
+
+}  // namespace rocksdb
+
+#endif  // STORAGE_ROCKSDB_INCLUDE_CACHE_H_
diff --git a/include/rocksdb/compaction_filter.h b/include/rocksdb/compaction_filter.h
new file mode 100644
index 0000000000..59b050923e
--- /dev/null
+++ b/include/rocksdb/compaction_filter.h
@@ -0,0 +1,198 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+// Copyright (c) 2013 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_COMPACTION_FILTER_H_
+#define STORAGE_ROCKSDB_INCLUDE_COMPACTION_FILTER_H_
+
+#include <string>
+#include <vector>
+
+namespace rocksdb {
+
+class Slice;
+class SliceTransform;
+
+// Context information of a compaction run
+struct CompactionFilterContext {
+  // Does this compaction run include all data files
+  bool is_full_compaction;
+  // Is this compaction requested by the client (true),
+  // or is it occurring as an automatic compaction process
+  bool is_manual_compaction;
+};
+
+// CompactionFilter allows an application to modify/delete a key-value at
+// the time of compaction.
+
+class CompactionFilter {
+ public:
+  // Context information of a compaction run
+  struct Context {
+    // Does this compaction run include all data files
+    bool is_full_compaction;
+    // Is this compaction requested by the client (true),
+    // or is it occurring as an automatic compaction process
+    bool is_manual_compaction;
+  };
+
+  virtual ~CompactionFilter() {}
+
+  // The compaction process invokes this method for each kv that is being
+  // compacted.
+  // A return value of false indicates that the kv should be preserved in the
+  // output of this compaction run and a return value of true
+  // indicates that this key-value should be removed from the
+  // output of the compaction.  The application can inspect
+  // the existing value of the key and make a decision based on it.
+  //
+  // When the value is to be preserved, the application has the option
+  // to modify the existing_value and pass it back through new_value.
+  // value_changed needs to be set to true in this case.
+  //
+  // If multithreaded compaction is being used *and* a single CompactionFilter
+  // instance was supplied via Options::compaction_filter, this method may be
+  // called from different threads concurrently.  The application must ensure
+  // that the call is thread-safe.
+  //
+  // If the CompactionFilter was created by a factory, then it will only ever
+  // be used by a single thread that is doing the compaction run, and this
+  // call does not need to be thread-safe.  However, multiple filters may be
+  // in existence and operating concurrently.
+  virtual bool Filter(int level,
+                      const Slice& key,
+                      const Slice& existing_value,
+                      std::string* new_value,
+                      bool* value_changed) const = 0;
+
+  // Returns a name that identifies this compaction filter.
+  // The name will be printed to the LOG file on start up for diagnosis.
+  virtual const char* Name() const = 0;
+};
+
+// CompactionFilterV2 buffers kv pairs sharing the same prefix and lets the
+// application layer make individual decisions for all the kv pairs in the
+// buffer.
+class CompactionFilterV2 {
+ public:
+  virtual ~CompactionFilterV2() {}
+
+  // The compaction process invokes this method for all the kv pairs
+  // sharing the same prefix. It is a "roll-up" version of CompactionFilter.
+  //
+  // Each entry in the return vector indicates if the corresponding kv should
+  // be preserved in the output of this compaction run. The application can
+  // inspect the existing values of the keys and make decisions based on them.
+  //
+  // When a value is to be preserved, the application has the option
+  // to modify the entry in existing_values and pass it back through an entry
+  // in new_values. A corresponding values_changed entry needs to be set to
+  // true in this case. Note that the new_values vector contains only changed
+  // values, i.e. new_values.size() <= values_changed.size().
+  //
+  typedef std::vector<Slice> SliceVector;
+  virtual std::vector<bool> Filter(int level,
+                                   const SliceVector& keys,
+                                   const SliceVector& existing_values,
+                                   std::vector<std::string>* new_values,
+                                   std::vector<bool>* values_changed)
+      const = 0;
+
+  // Returns a name that identifies this compaction filter.
+  // The name will be printed to the LOG file on start up for diagnosis.
+  virtual const char* Name() const = 0;
+};
+
+// Each compaction will create a new CompactionFilter, allowing the
+// application to know about different compactions.
+class CompactionFilterFactory {
+ public:
+  virtual ~CompactionFilterFactory() { }
+
+  virtual std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+      const CompactionFilter::Context& context) = 0;
+
+  // Returns a name that identifies this compaction filter factory.
+  virtual const char* Name() const = 0;
+};
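
A sketch of the factory pattern described above (the filter name and its policy, dropping entries with empty values, are hypothetical):

    #include <memory>
    #include <string>
    #include "rocksdb/compaction_filter.h"
    #include "rocksdb/slice.h"

    class DropEmptyValueFilter : public rocksdb::CompactionFilter {
     public:
      // Returning true removes the kv from the compaction output.
      virtual bool Filter(int level, const rocksdb::Slice& key,
                          const rocksdb::Slice& existing_value,
                          std::string* new_value,
                          bool* value_changed) const override {
        return existing_value.size() == 0;
      }
      virtual const char* Name() const override {
        return "DropEmptyValueFilter";
      }
    };

    class DropEmptyValueFilterFactory : public rocksdb::CompactionFilterFactory {
     public:
      // One fresh filter per compaction run; the call need not be thread-safe.
      virtual std::unique_ptr<rocksdb::CompactionFilter> CreateCompactionFilter(
          const rocksdb::CompactionFilter::Context& context) override {
        return std::unique_ptr<rocksdb::CompactionFilter>(
            new DropEmptyValueFilter());
      }
      virtual const char* Name() const override {
        return "DropEmptyValueFilterFactory";
      }
    };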
+
+// Default implementation of CompactionFilterFactory which does not
+// return any filter.
+class DefaultCompactionFilterFactory : public CompactionFilterFactory {
+ public:
+  virtual std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+      const CompactionFilter::Context& context) override {
+    return std::unique_ptr<CompactionFilter>(nullptr);
+  }
+
+  virtual const char* Name() const override {
+    return "DefaultCompactionFilterFactory";
+  }
+};
+
+// Each compaction will create a new CompactionFilterV2.
+//
+// CompactionFilterFactoryV2 enables the application to specify a prefix and
+// use CompactionFilterV2 to filter kv-pairs in batches. Each batch contains
+// all the kv-pairs sharing the same prefix.
+//
+// This is useful for applications that require grouping kv-pairs in the
+// compaction filter to make a purge/no-purge decision. For example, the key
+// prefix could be a user id while the rest of the key represents the type of
+// value. This batching filter comes in handy if the application's compaction
+// filter requires knowledge of all types of values for any user id.
+//
+class CompactionFilterFactoryV2 {
+ public:
+  // NOTE: CompactionFilterFactoryV2 will not delete prefix_extractor
+  explicit CompactionFilterFactoryV2(const SliceTransform* prefix_extractor)
+      : prefix_extractor_(prefix_extractor) { }
+
+  virtual ~CompactionFilterFactoryV2() { }
+
+  virtual std::unique_ptr<CompactionFilterV2> CreateCompactionFilterV2(
+      const CompactionFilterContext& context) = 0;
+
+  // Returns a name that identifies this compaction filter factory.
+  virtual const char* Name() const = 0;
+
+  const SliceTransform* GetPrefixExtractor() const {
+    return prefix_extractor_;
+  }
+
+  void SetPrefixExtractor(const SliceTransform* prefix_extractor) {
+    prefix_extractor_ = prefix_extractor;
+  }
+
+ private:
+  // Prefix extractor for compaction filter v2.
+  // Keys sharing the same prefix will be buffered internally.
+  // Clients can implement a Filter callback function to operate on the buffer.
+  const SliceTransform* prefix_extractor_;
+};
+
+// Default implementation of CompactionFilterFactoryV2 which does not
+// return any filter.
+class DefaultCompactionFilterFactoryV2 : public CompactionFilterFactoryV2 {
+ public:
+  explicit DefaultCompactionFilterFactoryV2()
+      : CompactionFilterFactoryV2(nullptr) { }
+
+  virtual std::unique_ptr<CompactionFilterV2>
+  CreateCompactionFilterV2(
+      const CompactionFilterContext& context) override {
+    return std::unique_ptr<CompactionFilterV2>(nullptr);
+  }
+
+  virtual const char* Name() const override {
+    return "DefaultCompactionFilterFactoryV2";
+  }
+};
+
+}  // namespace rocksdb
+
+#endif  // STORAGE_ROCKSDB_INCLUDE_COMPACTION_FILTER_H_
diff --git a/include/rocksdb/comparator.h b/include/rocksdb/comparator.h
new file mode 100644
index 0000000000..f3a8499a8f
--- /dev/null
+++ b/include/rocksdb/comparator.h
@@ -0,0 +1,67 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_COMPARATOR_H_
+#define STORAGE_ROCKSDB_INCLUDE_COMPARATOR_H_
+
+#include <string>
+
+namespace rocksdb {
+
+class Slice;
+
+// A Comparator object provides a total order across slices that are
+// used as keys in an sstable or a database.  A Comparator implementation
+// must be thread-safe since rocksdb may invoke its methods concurrently
+// from multiple threads.
+class Comparator {
+ public:
+  virtual ~Comparator();
+
+  // Three-way comparison.  Returns value:
+  //   < 0 iff "a" < "b",
+  //   == 0 iff "a" == "b",
+  //   > 0 iff "a" > "b"
+  virtual int Compare(const Slice& a, const Slice& b) const = 0;
+
+  // The name of the comparator.  Used to check for comparator
+  // mismatches (i.e., a DB created with one comparator is
+  // accessed using a different comparator).
+  //
+  // The client of this package should switch to a new name whenever
+  // the comparator implementation changes in a way that will cause
+  // the relative ordering of any two keys to change.
+  //
+  // Names starting with "rocksdb." are reserved and should not be used
+  // by any clients of this package.
+  virtual const char* Name() const = 0;
+
+  // Advanced functions: these are used to reduce the space requirements
+  // for internal data structures like index blocks.
+
+  // If *start < limit, changes *start to a short string in [start,limit).
+  // Simple comparator implementations may return with *start unchanged,
+  // i.e., an implementation of this method that does nothing is correct.
+  virtual void FindShortestSeparator(
+      std::string* start,
+      const Slice& limit) const = 0;
+
+  // Changes *key to a short string >= *key.
+  // Simple comparator implementations may return with *key unchanged,
+  // i.e., an implementation of this method that does nothing is correct.
+  virtual void FindShortSuccessor(std::string* key) const = 0;
+};
+
+// Return a builtin comparator that uses lexicographic byte-wise
+// ordering.  The result remains the property of this module and
+// must not be deleted.
+extern const Comparator* BytewiseComparator();
+
+}  // namespace rocksdb
+
+#endif  // STORAGE_ROCKSDB_INCLUDE_COMPARATOR_H_
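
A sketch of a user-defined comparator (hypothetical; it inverts the bytewise order and, per the contract above, leaves the advanced hooks as no-ops, which is correct but gives up some index-block compression). Slice::compare() is assumed from rocksdb/slice.h:

    #include <string>
    #include "rocksdb/comparator.h"
    #include "rocksdb/slice.h"

    class ReverseBytewiseComparator : public rocksdb::Comparator {
     public:
      virtual int Compare(const rocksdb::Slice& a,
                          const rocksdb::Slice& b) const override {
        return -a.compare(b);
      }
      virtual const char* Name() const override {
        // Not "rocksdb."-prefixed; that namespace is reserved.
        return "example.ReverseBytewiseComparator";
      }
      virtual void FindShortestSeparator(std::string* start,
                                         const rocksdb::Slice& limit)
          const override {}
      virtual void FindShortSuccessor(std::string* key) const override {}
    };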
diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h
new file mode 100644
index 0000000000..33b443f407
--- /dev/null
+++ b/include/rocksdb/db.h
@@ -0,0 +1,495 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_DB_H_
+#define STORAGE_ROCKSDB_INCLUDE_DB_H_
+
+#include <stdint.h>
+#include <stdio.h>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+#include "rocksdb/version.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/options.h"
+#include "rocksdb/types.h"
+#include "rocksdb/transaction_log.h"
+
+namespace rocksdb {
+
+using std::unique_ptr;
+
+class ColumnFamilyHandle {
+ public:
+  virtual ~ColumnFamilyHandle() {}
+};
+extern const std::string kDefaultColumnFamilyName;
+
+struct ColumnFamilyDescriptor {
+  std::string name;
+  ColumnFamilyOptions options;
+  ColumnFamilyDescriptor()
+      : name(kDefaultColumnFamilyName), options(ColumnFamilyOptions()) {}
+  ColumnFamilyDescriptor(const std::string& _name,
+                         const ColumnFamilyOptions& _options)
+      : name(_name), options(_options) {}
+};
+
+static const int kMajorVersion = __ROCKSDB_MAJOR__;
+static const int kMinorVersion = __ROCKSDB_MINOR__;
+
+struct Options;
+struct ReadOptions;
+struct WriteOptions;
+struct FlushOptions;
+struct TableProperties;
+class WriteBatch;
+class Env;
+
+// Metadata associated with each SST file.
+struct LiveFileMetaData {
+  std::string column_family_name;  // Name of the column family
+  std::string name;                // Name of the file
+  int level;                       // Level at which this file resides.
+  size_t size;                     // File size in bytes.
+  std::string smallestkey;         // Smallest user defined key in the file.
+  std::string largestkey;          // Largest user defined key in the file.
+  SequenceNumber smallest_seqno;   // smallest seqno in file
+  SequenceNumber largest_seqno;    // largest seqno in file
+};
+
+// Abstract handle to particular state of a DB.
+// A Snapshot is an immutable object and can therefore be safely
+// accessed from multiple threads without any external synchronization.
+class Snapshot {
+ protected:
+  virtual ~Snapshot();
+};
+
+// A range of keys
+struct Range {
+  Slice start;          // Included in the range
+  Slice limit;          // Not included in the range
+
+  Range() { }
+  Range(const Slice& s, const Slice& l) : start(s), limit(l) { }
+};
+
+// A collection of table properties objects, where
+//  key: is the table's file name.
+//  value: the table properties object of the given table.
+typedef std::unordered_map<std::string, std::shared_ptr<const TableProperties>>
+    TablePropertiesCollection;
+
+// A DB is a persistent ordered map from keys to values.
+// A DB is safe for concurrent access from multiple threads without
+// any external synchronization.
+class DB {
+ public:
+  // Open the database with the specified "name".
+  // Stores a pointer to a heap-allocated database in *dbptr and returns
+  // OK on success.
+  // Stores nullptr in *dbptr and returns a non-OK status on error.
+  // Caller should delete *dbptr when it is no longer needed.
+  static Status Open(const Options& options,
+                     const std::string& name,
+                     DB** dbptr);
+
+  // Open the database for read only. All DB interfaces
+  // that modify data, like put/delete, will return error.
+  // If the db is opened in read only mode, then no compactions
+  // will happen.
+  static Status OpenForReadOnly(const Options& options,
+                                const std::string& name, DB** dbptr,
+                                bool error_if_log_file_exist = false);
+
+  // Open the database for read only with column families. When opening DB
+  // with read only, you can specify only a subset of column families in the
+  // database that should be opened. However, you always need to specify the
+  // default column family.
+  // The default column family name is 'default' and it's stored
+  // in rocksdb::kDefaultColumnFamilyName.
+  static Status OpenForReadOnly(
+      const DBOptions& db_options, const std::string& name,
+      const std::vector<ColumnFamilyDescriptor>& column_families,
+      std::vector<ColumnFamilyHandle*>* handles, DB** dbptr,
+      bool error_if_log_file_exist = false);
+
+  // Open DB with column families.
+  // db_options specify database specific options.
+  // column_families is the vector of all column families in the database,
+  // containing column family name and options. You need to open ALL column
+  // families in the database. To get the list of column families, you can use
+  // ListColumnFamilies(). Also, you can open only a subset of column families
+  // for read-only access.
+  // The default column family name is 'default' and it's stored
+  // in rocksdb::kDefaultColumnFamilyName.
+  // If everything is OK, handles will on return be the same size
+  // as column_families --- handles[i] will be a handle that you
+  // will use to operate on column family column_family[i]
+  static Status Open(const DBOptions& db_options, const std::string& name,
+                     const std::vector<ColumnFamilyDescriptor>& column_families,
+                     std::vector<ColumnFamilyHandle*>* handles, DB** dbptr);
+
+  // ListColumnFamilies will open the DB specified by argument name
+  // and return the list of all column families in that DB
+  // through column_families argument. The ordering of
+  // column families in column_families is unspecified.
+  static Status ListColumnFamilies(const DBOptions& db_options,
+                                   const std::string& name,
+                                   std::vector<std::string>* column_families);
+
+  DB() { }
+  virtual ~DB();
+
+  // Create a column_family and return the handle of column family
+  // through the argument handle.
+  virtual Status CreateColumnFamily(const ColumnFamilyOptions& options,
+                                    const std::string& column_family_name,
+                                    ColumnFamilyHandle** handle);
+
+  // Drop a column family specified by column_family handle. This call
+  // only records a drop record in the manifest and prevents the column
+  // family from flushing and compacting.
+  virtual Status DropColumnFamily(ColumnFamilyHandle* column_family);
+
+  // Set the database entry for "key" to "value".
+  // Returns OK on success, and a non-OK status on error.
+  // Note: consider setting options.sync = true.
+  virtual Status Put(const WriteOptions& options,
+                     ColumnFamilyHandle* column_family, const Slice& key,
+                     const Slice& value) = 0;
+  virtual Status Put(const WriteOptions& options, const Slice& key,
+                     const Slice& value) {
+    return Put(options, DefaultColumnFamily(), key, value);
+  }
+
+  // Remove the database entry (if any) for "key".  Returns OK on
+  // success, and a non-OK status on error.  It is not an error if "key"
+  // did not exist in the database.
+  // Note: consider setting options.sync = true.
+  virtual Status Delete(const WriteOptions& options,
+                        ColumnFamilyHandle* column_family,
+                        const Slice& key) = 0;
+  virtual Status Delete(const WriteOptions& options, const Slice& key) {
+    return Delete(options, DefaultColumnFamily(), key);
+  }
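
A sketch of the column-family workflow (paths and names are illustrative): the first open creates "cf1" in a running DB; any later open must then pass every existing family to DB::Open:

    #include <string>
    #include <vector>
    #include "rocksdb/db.h"

    rocksdb::Status ColumnFamilySketch() {
      rocksdb::Options options;
      options.create_if_missing = true;
      rocksdb::DB* db;
      rocksdb::Status s =
          rocksdb::DB::Open(options, "/tmp/rocksdb_cf_example", &db);
      if (!s.ok()) return s;

      // Create an additional column family in the running DB.
      rocksdb::ColumnFamilyHandle* cf;
      s = db->CreateColumnFamily(rocksdb::ColumnFamilyOptions(), "cf1", &cf);
      if (s.ok()) {
        s = db->Put(rocksdb::WriteOptions(), cf, "key", "value");
        delete cf;  // releases the handle, not the data
      }
      delete db;
      return s;
    }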
+
+  // Merge the database entry for "key" with "value".  Returns OK on success,
+  // and a non-OK status on error. The semantics of this operation is
+  // determined by the user provided merge_operator when opening DB.
+  // Note: consider setting options.sync = true.
+  virtual Status Merge(const WriteOptions& options,
+                       ColumnFamilyHandle* column_family, const Slice& key,
+                       const Slice& value) = 0;
+  virtual Status Merge(const WriteOptions& options, const Slice& key,
+                       const Slice& value) {
+    return Merge(options, DefaultColumnFamily(), key, value);
+  }
+
+  // Apply the specified updates to the database.
+  // Returns OK on success, non-OK on failure.
+  // Note: consider setting options.sync = true.
+  virtual Status Write(const WriteOptions& options, WriteBatch* updates) = 0;
+
+  // If the database contains an entry for "key" store the
+  // corresponding value in *value and return OK.
+  //
+  // If there is no entry for "key" leave *value unchanged and return
+  // a status for which Status::IsNotFound() returns true.
+  //
+  // May return some other Status on an error.
+  virtual Status Get(const ReadOptions& options,
+                     ColumnFamilyHandle* column_family, const Slice& key,
+                     std::string* value) = 0;
+  virtual Status Get(const ReadOptions& options, const Slice& key,
+                     std::string* value) {
+    return Get(options, DefaultColumnFamily(), key, value);
+  }
+
+  // If keys[i] does not exist in the database, then the i'th returned
+  // status will be one for which Status::IsNotFound() is true, and
+  // (*values)[i] will be set to some arbitrary value (often ""). Otherwise,
+  // the i'th returned status will have Status::ok() true, and (*values)[i]
+  // will store the value associated with keys[i].
+  //
+  // (*values) will always be resized to be the same size as (keys).
+  // Similarly, the number of returned statuses will be the number of keys.
+  // Note: keys will not be "de-duplicated". Duplicate keys will return
+  // duplicate values in order.
+  virtual std::vector<Status> MultiGet(
+      const ReadOptions& options,
+      const std::vector<ColumnFamilyHandle*>& column_family,
+      const std::vector<Slice>& keys, std::vector<std::string>* values) = 0;
+  virtual std::vector<Status> MultiGet(const ReadOptions& options,
+                                       const std::vector<Slice>& keys,
+                                       std::vector<std::string>* values) {
+    return MultiGet(options, std::vector<ColumnFamilyHandle*>(
+                                 keys.size(), DefaultColumnFamily()),
+                    keys, values);
+  }
+
+  // If the key definitely does not exist in the database, then this method
+  // returns false, else true. If the caller wants to obtain value when the
+  // key is found in memory, a bool for 'value_found' must be passed.
+  // 'value_found' will be true on return if value has been set properly.
+  // This check is potentially lighter-weight than invoking DB::Get(). One way
+  // to make this lighter weight is to avoid doing any IOs.
+  // Default implementation here returns true and sets 'value_found' to false.
+  virtual bool KeyMayExist(const ReadOptions& options,
+                           ColumnFamilyHandle* column_family, const Slice& key,
+                           std::string* value, bool* value_found = nullptr) {
+    if (value_found != nullptr) {
+      *value_found = false;
+    }
+    return true;
+  }
+  virtual bool KeyMayExist(const ReadOptions& options, const Slice& key,
+                           std::string* value, bool* value_found = nullptr) {
+    return KeyMayExist(options, DefaultColumnFamily(), key, value,
+                       value_found);
+  }
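
A sketch of a batched lookup against the default column family (keys are illustrative; statuses[i] pairs with values[i]):

    #include <string>
    #include <vector>
    #include "rocksdb/db.h"

    void MultiGetSketch(rocksdb::DB* db) {
      std::vector<rocksdb::Slice> keys;
      keys.push_back("k1");
      keys.push_back("k2");
      std::vector<std::string> values;
      std::vector<rocksdb::Status> statuses =
          db->MultiGet(rocksdb::ReadOptions(), keys, &values);
      for (size_t i = 0; i < statuses.size(); i++) {
        if (statuses[i].IsNotFound()) {
          // values[i] holds an arbitrary value in this case; ignore it.
        }
      }
    }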
+
+  // Return a heap-allocated iterator over the contents of the database.
+  // The result of NewIterator() is initially invalid (caller must
+  // call one of the Seek methods on the iterator before using it).
+  //
+  // Caller should delete the iterator when it is no longer needed.
+  // The returned iterator should be deleted before this db is deleted.
+  virtual Iterator* NewIterator(const ReadOptions& options,
+                                ColumnFamilyHandle* column_family) = 0;
+  virtual Iterator* NewIterator(const ReadOptions& options) {
+    return NewIterator(options, DefaultColumnFamily());
+  }
+  // Returns iterators from a consistent database state across multiple
+  // column families. Iterators are heap allocated and need to be deleted
+  // before the db is deleted.
+  virtual Status NewIterators(
+      const ReadOptions& options,
+      const std::vector<ColumnFamilyHandle*>& column_families,
+      std::vector<Iterator*>* iterators) = 0;
+
+  // Return a handle to the current DB state.  Iterators created with
+  // this handle will all observe a stable snapshot of the current DB
+  // state.  The caller must call ReleaseSnapshot(result) when the
+  // snapshot is no longer needed.
+  //
+  // nullptr will be returned if the DB fails to take a snapshot or does
+  // not support snapshot.
+  virtual const Snapshot* GetSnapshot() = 0;
+
+  // Release a previously acquired snapshot.  The caller must not
+  // use "snapshot" after this call.
+  virtual void ReleaseSnapshot(const Snapshot* snapshot) = 0;
+
+  // DB implementations can export properties about their state
+  // via this method.  If "property" is a valid property understood by this
+  // DB implementation, fills "*value" with its current value and returns
+  // true.  Otherwise returns false.
+  //
+  // Valid property names include:
+  //
+  //  "rocksdb.num-files-at-level<N>" - return the number of files at
+  //     level <N>, where <N> is an ASCII representation of a level number
+  //     (e.g. "0").
+  //  "rocksdb.stats" - returns a multi-line string that describes statistics
+  //     about the internal operation of the DB.
+  //  "rocksdb.sstables" - returns a multi-line string that describes all
+  //     of the sstables that make up the db contents.
+  virtual bool GetProperty(ColumnFamilyHandle* column_family,
+                           const Slice& property, std::string* value) = 0;
+  virtual bool GetProperty(const Slice& property, std::string* value) {
+    return GetProperty(DefaultColumnFamily(), property, value);
+  }
+
+  // For each i in [0,n-1], store in "sizes[i]", the approximate
+  // file system space used by keys in "[range[i].start .. range[i].limit)".
+  //
+  // Note that the returned sizes measure file system space usage, so
+  // if the user data compresses by a factor of ten, the returned
+  // sizes will be one-tenth the size of the corresponding user data size.
+  //
+  // The results may not include the sizes of recently written data.
+  virtual void GetApproximateSizes(ColumnFamilyHandle* column_family,
+                                   const Range* range, int n,
+                                   uint64_t* sizes) = 0;
+  virtual void GetApproximateSizes(const Range* range, int n,
+                                   uint64_t* sizes) {
+    GetApproximateSizes(DefaultColumnFamily(), range, n, sizes);
+  }
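
A sketch combining the two introspection calls above (the property name is taken from the list; the key range is illustrative):

    #include <stdint.h>
    #include <string>
    #include "rocksdb/db.h"

    void IntrospectionSketch(rocksdb::DB* db) {
      std::string stats;
      if (db->GetProperty("rocksdb.stats", &stats)) {
        // stats now holds a multi-line, human-readable report.
      }

      // Approximate on-disk size of the half-open range ["a", "z").
      rocksdb::Range r("a", "z");
      uint64_t size = 0;
      db->GetApproximateSizes(&r, 1, &size);
    }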
+
+  // Compact the underlying storage for the key range [*begin,*end].
+  // The actual compaction interval might be a superset of [*begin, *end].
+  // In particular, deleted and overwritten versions are discarded,
+  // and the data is rearranged to reduce the cost of operations
+  // needed to access the data.  This operation should typically only
+  // be invoked by users who understand the underlying implementation.
+  //
+  // begin==nullptr is treated as a key before all keys in the database.
+  // end==nullptr is treated as a key after all keys in the database.
+  // Therefore the following call will compact the entire database:
+  //    db->CompactRange(nullptr, nullptr);
+  // Note that after the entire database is compacted, all data are pushed
+  // down to the last level containing any data. If the total data size
+  // after compaction is reduced, that level might not be appropriate for
+  // hosting all the files. In this case, the client can set reduce_level
+  // to true, to move the files back to the minimum level capable of holding
+  // the data set or a given level (specified by non-negative target_level).
+  virtual Status CompactRange(ColumnFamilyHandle* column_family,
+                              const Slice* begin, const Slice* end,
+                              bool reduce_level = false,
+                              int target_level = -1) = 0;
+  virtual Status CompactRange(const Slice* begin, const Slice* end,
+                              bool reduce_level = false,
+                              int target_level = -1) {
+    return CompactRange(DefaultColumnFamily(), begin, end, reduce_level,
+                        target_level);
+  }
+
+  // Number of levels used for this DB.
+  virtual int NumberLevels(ColumnFamilyHandle* column_family) = 0;
+  virtual int NumberLevels() { return NumberLevels(DefaultColumnFamily()); }
+
+  // Maximum level to which a new compacted memtable is pushed if it
+  // does not create overlap.
+  virtual int MaxMemCompactionLevel(ColumnFamilyHandle* column_family) = 0;
+  virtual int MaxMemCompactionLevel() {
+    return MaxMemCompactionLevel(DefaultColumnFamily());
+  }
+
+  // Number of files in level-0 that would stop writes.
+  virtual int Level0StopWriteTrigger(ColumnFamilyHandle* column_family) = 0;
+  virtual int Level0StopWriteTrigger() {
+    return Level0StopWriteTrigger(DefaultColumnFamily());
+  }
+
+  // Get DB name -- the exact same name that was provided as an argument to
+  // DB::Open()
+  virtual const std::string& GetName() const = 0;
+
+  // Get Env object from the DB
+  virtual Env* GetEnv() const = 0;
+
+  // Get DB Options that we use
+  virtual const Options& GetOptions(ColumnFamilyHandle* column_family)
+      const = 0;
+  virtual const Options& GetOptions() const {
+    return GetOptions(DefaultColumnFamily());
+  }
+
+  // Flush all mem-table data.
+  virtual Status Flush(const FlushOptions& options,
+                       ColumnFamilyHandle* column_family) = 0;
+  virtual Status Flush(const FlushOptions& options) {
+    return Flush(options, DefaultColumnFamily());
+  }
+
+  // The sequence number of the most recent transaction.
+  virtual SequenceNumber GetLatestSequenceNumber() const = 0;
+
+#ifndef ROCKSDB_LITE
+
+  // Prevent file deletions. Compactions will continue to occur,
+  // but no obsolete files will be deleted. Calling this multiple
+  // times has the same effect as calling it once.
+  virtual Status DisableFileDeletions() = 0;
+
+  // Allow compactions to delete obsolete files.
+  // If force == true, the call to EnableFileDeletions() will guarantee that
+  // file deletions are enabled after the call, even if DisableFileDeletions()
+  // was called multiple times before.
+  // If force == false, EnableFileDeletions will only enable file deletion
+  // after it's been called at least as many times as DisableFileDeletions(),
+  // enabling the two methods to be called by two threads concurrently without
+  // synchronization -- i.e., file deletions will be enabled only after both
+  // threads call EnableFileDeletions()
+  virtual Status EnableFileDeletions(bool force = true) = 0;
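
A sketch of the lossless-backup sequence these methods enable, together with GetLiveFiles() declared just below (copying the files is elided):

    #include <string>
    #include <vector>
    #include "rocksdb/db.h"

    rocksdb::Status BackupSketch(rocksdb::DB* db) {
      rocksdb::Status s = db->DisableFileDeletions();
      if (!s.ok()) return s;

      std::vector<std::string> files;  // paths relative to the db directory
      uint64_t manifest_file_size = 0;
      s = db->GetLiveFiles(files, &manifest_file_size, true /*flush_memtable*/);
      // ... copy `files` elsewhere, truncating the manifest copy to
      // manifest_file_size ...

      // force == false pairs with our single DisableFileDeletions() call.
      db->EnableFileDeletions(false);
      return s;
    }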
+
+  // GetLiveFiles followed by GetSortedWalFiles can generate a lossless
+  // backup.
+
+  // THIS METHOD IS DEPRECATED. Use the GetLiveFilesMetaData to get more
+  // detailed information on the live files.
+  // Retrieve the list of all files in the database. The files are
+  // relative to the dbname and are not absolute paths. The valid size of the
+  // manifest file is returned in manifest_file_size. The manifest file is an
+  // ever growing file, but only the portion specified by manifest_file_size
+  // is valid for this snapshot.
+  // Setting flush_memtable to true does Flush before recording the live
+  // files. Setting flush_memtable to false is useful when we don't want to
+  // wait for flush which may have to wait for compaction to complete taking
+  // an indeterminate time.
+  //
+  // In case you have multiple column families, even if flush_memtable is
+  // true, you still need to call GetSortedWalFiles after GetLiveFiles to
+  // compensate for new data that arrived to already-flushed column families
+  // while other column families were flushing.
+  virtual Status GetLiveFiles(std::vector<std::string>&,
+                              uint64_t* manifest_file_size,
+                              bool flush_memtable = true) = 0;
+
+  // Retrieve the sorted list of all wal files with earliest file first
+  virtual Status GetSortedWalFiles(VectorLogPtr& files) = 0;
+
+  // Sets iter to an iterator that is positioned at a write-batch containing
+  // seq_number. If the sequence number is non-existent, it returns an
+  // iterator at the first available seq_no after the requested seq_no.
+  // Returns Status::OK if iterator is valid.
+  // Must set WAL_ttl_seconds or WAL_size_limit_MB to large values to
+  // use this api, else the WAL files will get
+  // cleared aggressively and the iterator might keep getting invalidated
+  // before an update is read.
+  virtual Status GetUpdatesSince(
+      SequenceNumber seq_number, unique_ptr<TransactionLogIterator>* iter,
+      const TransactionLogIterator::ReadOptions&
+          read_options = TransactionLogIterator::ReadOptions()) = 0;
+
+  // Delete the file name from the db directory and update the internal state
+  // to reflect that. Supports deletion of sst and log files only. 'name' must
+  // be a path relative to the db directory. eg. 000001.sst, /archive/000003.log
+  virtual Status DeleteFile(std::string name) = 0;
+
+  // Returns a list of all table files with their level, start key
+  // and end key
+  virtual void GetLiveFilesMetaData(std::vector<LiveFileMetaData>* metadata) {}
+
+#endif  // ROCKSDB_LITE
+
+  // Sets the globally unique ID created at database creation time by invoking
+  // Env::GenerateUniqueId(), in identity. Returns Status::OK if identity
+  // could be set properly.
+  virtual Status GetDbIdentity(std::string& identity) = 0;
+
+  // Returns default column family handle
+  virtual ColumnFamilyHandle* DefaultColumnFamily() const = 0;
+
+#ifndef ROCKSDB_LITE
+  virtual Status GetPropertiesOfAllTables(ColumnFamilyHandle* column_family,
+                                          TablePropertiesCollection* props) = 0;
+  virtual Status GetPropertiesOfAllTables(TablePropertiesCollection* props) {
+    return GetPropertiesOfAllTables(DefaultColumnFamily(), props);
+  }
+#endif  // ROCKSDB_LITE
+
+ private:
+  // No copying allowed
+  DB(const DB&);
+  void operator=(const DB&);
+};
+
+// Destroy the contents of the specified database.
+// Be very careful using this method.
+Status DestroyDB(const std::string& name, const Options& options);
+
+#ifndef ROCKSDB_LITE
+// If a DB cannot be opened, you may attempt to call this method to
+// resurrect as much of the contents of the database as possible.
+// Some data may be lost, so be careful when calling this function
+// on a database that contains important information.
+Status RepairDB(const std::string& dbname, const Options& options);
+#endif
+
+}  // namespace rocksdb
+
+#endif  // STORAGE_ROCKSDB_INCLUDE_DB_H_
diff --git a/include/rocksdb/env.h b/include/rocksdb/env.h
new file mode 100644
index 0000000000..6a963510f0
--- /dev/null
+++ b/include/rocksdb/env.h
@@ -0,0 +1,772 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// An Env is an interface used by the rocksdb implementation to access
+// operating system functionality like the filesystem etc.  Callers
+// may wish to provide a custom Env object when opening a database to
+// get fine-grained control; e.g., to rate limit file system operations.
+//
+// All Env implementations are safe for concurrent access from
+// multiple threads without any external synchronization.
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_ENV_H_
+#define STORAGE_ROCKSDB_INCLUDE_ENV_H_
+
+#include <cstdarg>
+#include <limits>
+#include <memory>
+#include <string>
+#include <vector>
+#include "rocksdb/status.h"
+
+namespace rocksdb {
+
+class FileLock;
+class Logger;
+class RandomAccessFile;
+class SequentialFile;
+class Slice;
+class WritableFile;
+class RandomRWFile;
+class Directory;
+struct DBOptions;
+
+using std::unique_ptr;
+using std::shared_ptr;
+
+
+// Options while opening a file to read/write
+struct EnvOptions {
+
+  // construct with default Options
+  EnvOptions();
+
+  // construct from Options
+  explicit EnvOptions(const DBOptions& options);
+
+  // If true, then allow caching of data in environment buffers
+  bool use_os_buffer = true;
+
+  // If true, then use mmap to read data
+  bool use_mmap_reads = false;
+
+  // If true, then use mmap to write data
+  bool use_mmap_writes = true;
+
+  // If true, set the FD_CLOEXEC on open fd.
+  bool set_fd_cloexec = true;
+
+  // Allows OS to incrementally sync files to disk while they are being
+  // written, in the background. Issue one request for every bytes_per_sync
+  // written. 0 turns it off.
+  // Default: 0
+  uint64_t bytes_per_sync = 0;
+
+  // If true, we will preallocate the file with FALLOC_FL_KEEP_SIZE flag,
+  // which means that file size won't change as part of preallocation.
+  // If false, preallocation will also change the file size. This option will
+  // improve the performance in workloads where you sync the data on every
+  // write. By default, we set it to true for MANIFEST writes and false for
+  // WAL writes
+  bool fallocate_with_keep_size = true;
+};
+
+class Env {
+ public:
+  Env() { }
+  virtual ~Env();
+
+  // Return a default environment suitable for the current operating
+  // system.  Sophisticated users may wish to provide their own Env
+  // implementation instead of relying on this default environment.
+  //
+  // The result of Default() belongs to rocksdb and must never be deleted.
+  static Env* Default();
+
+  // Create a brand new sequentially-readable file with the specified name.
+  // On success, stores a pointer to the new file in *result and returns OK.
+  // On failure stores nullptr in *result and returns non-OK.  If the file
+  // does not exist, returns a non-OK status.
+  //
+  // The returned file will only be accessed by one thread at a time.
+  virtual Status NewSequentialFile(const std::string& fname,
+                                   unique_ptr<SequentialFile>* result,
+                                   const EnvOptions& options) = 0;
+
+  // Create a brand new random access read-only file with the
+  // specified name.  On success, stores a pointer to the new file in
+  // *result and returns OK.  On failure stores nullptr in *result and
+  // returns non-OK.  If the file does not exist, returns a non-OK
+  // status.
+  //
+  // The returned file may be concurrently accessed by multiple threads.
+  virtual Status NewRandomAccessFile(const std::string& fname,
+                                     unique_ptr<RandomAccessFile>* result,
+                                     const EnvOptions& options) = 0;
+
+  // Create an object that writes to a new file with the specified
+  // name.  Deletes any existing file with the same name and creates a
+  // new file.  On success, stores a pointer to the new file in
+  // *result and returns OK.  On failure stores nullptr in *result and
+  // returns non-OK.
+  //
+  // The returned file will only be accessed by one thread at a time.
+  virtual Status NewWritableFile(const std::string& fname,
+                                 unique_ptr<WritableFile>* result,
+                                 const EnvOptions& options) = 0;
+
+  // Create an object that both reads and writes to a file on
+  // specified offsets (random access). If file already exists,
+  // does not overwrite it. On success, stores a pointer to the
+  // new file in *result and returns OK. On failure stores nullptr
+  // in *result and returns non-OK.
+  virtual Status NewRandomRWFile(const std::string& fname,
+                                 unique_ptr<RandomRWFile>* result,
+                                 const EnvOptions& options) = 0;
+
+  // Create an object that represents a directory. Will fail if directory
+  // doesn't exist. If the directory exists, it will open the directory
+  // and create a new Directory object.
+  //
+  // On success, stores a pointer to the new Directory in
+  // *result and returns OK. On failure stores nullptr in *result and
+  // returns non-OK.
+  virtual Status NewDirectory(const std::string& name,
+                              unique_ptr<Directory>* result) = 0;
+
+  // Returns true iff the named file exists.
+  virtual bool FileExists(const std::string& fname) = 0;
+
+  // Store in *result the names of the children of the specified directory.
+  // The names are relative to "dir".
+  // Original contents of *results are dropped.
+  virtual Status GetChildren(const std::string& dir,
+                             std::vector<std::string>* result) = 0;
+
+  // Delete the named file.
+  virtual Status DeleteFile(const std::string& fname) = 0;
+
+  // Create the specified directory. Returns error if directory exists.
+  virtual Status CreateDir(const std::string& dirname) = 0;
+
+  // Creates directory if missing. Returns OK if it exists, or if it was
+  // successfully created.
+  virtual Status CreateDirIfMissing(const std::string& dirname) = 0;
+
+  // Delete the specified directory.
+  virtual Status DeleteDir(const std::string& dirname) = 0;
+
+  // Store the size of fname in *file_size.
+  virtual Status GetFileSize(const std::string& fname,
+                             uint64_t* file_size) = 0;
+
+  // Store the last modification time of fname in *file_mtime.
+  virtual Status GetFileModificationTime(const std::string& fname,
+                                         uint64_t* file_mtime) = 0;
+  // Rename file src to target.
+  virtual Status RenameFile(const std::string& src,
+                            const std::string& target) = 0;
+
+  // Lock the specified file.  Used to prevent concurrent access to
+  // the same db by multiple processes.  On failure, stores nullptr in
+  // *lock and returns non-OK.
+  //
+  // On success, stores a pointer to the object that represents the
+  // acquired lock in *lock and returns OK.  The caller should call
+  // UnlockFile(*lock) to release the lock.
+
+  enum Priority { LOW, HIGH, TOTAL };
+
+  // Arrange to run "(*function)(arg)" once in a background thread, in
+  // the thread pool specified by pri. By default, jobs go to the 'LOW'
+  // priority thread pool.
+
+  // "function" may run in an unspecified thread.  Multiple functions
+  // added to the same Env may run concurrently in different threads.
+  // I.e., the caller may not assume that background work items are
+  // serialized.
+  virtual void Schedule(
+      void (*function)(void* arg),
+      void* arg,
+      Priority pri = LOW) = 0;
+
+  // Start a new thread, invoking "function(arg)" within the new thread.
+  // When "function(arg)" returns, the thread will be destroyed.
+  virtual void StartThread(void (*function)(void* arg), void* arg) = 0;
+
+  // Wait for all threads started by StartThread to terminate.
+  virtual void WaitForJoin() {}
+
+  // Get thread pool queue length for a specific thread pool.
+  virtual unsigned int GetThreadPoolQueueLen(Priority pri = LOW) const {
+    return 0;
+  }
+
+  // *path is set to a temporary directory that can be used for testing. It may
+  // or may not have just been created. The directory may or may not differ
+  // between runs of the same process, but subsequent calls will return the
+  // same directory.
+  virtual Status GetTestDirectory(std::string* path) = 0;
+
+  // Create and return a log file for storing informational messages.
+  virtual Status NewLogger(const std::string& fname,
+                           shared_ptr<Logger>* result) = 0;
+
+  // Returns the number of micro-seconds since some fixed point in time. Only
+  // useful for computing deltas of time.
+  virtual uint64_t NowMicros() = 0;
+
+  // Returns the number of nano-seconds since some fixed point in time. Only
+  // useful for computing deltas of time in one run.
+  // Default implementation simply relies on NowMicros
+  virtual uint64_t NowNanos() {
+    return NowMicros() * 1000;
+  }
+
+  // Sleep/delay the thread for the prescribed number of micro-seconds.
+  virtual void SleepForMicroseconds(int micros) = 0;
+
+  // Get the current host name.
+  virtual Status GetHostName(char* name, uint64_t len) = 0;
+
+  // Get the number of seconds since the Epoch, 1970-01-01 00:00:00 (UTC).
+  virtual Status GetCurrentTime(int64_t* unix_time) = 0;
+
+  // Get full directory name for this db.
+  virtual Status GetAbsolutePath(const std::string& db_path,
+                                 std::string* output_path) = 0;
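+
+  // Example (illustrative sketch): the clocks above are only meaningful
+  // for deltas, e.g. for timing an operation:
+  //
+  //   uint64_t start = env->NowMicros();
+  //   // ... do some work ...
+  //   uint64_t elapsed_us = env->NowMicros() - start;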
+
+  // The number of background worker threads of a specific thread pool
+  // for this environment. 'LOW' is the default pool.
+  // default number: 1
+  virtual void SetBackgroundThreads(int number, Priority pri = LOW) = 0;
+
+  // Converts seconds-since-Jan-01-1970 to a printable string
+  virtual std::string TimeToString(uint64_t time) = 0;
+
+  // Generates a unique id that can be used to identify a db
+  virtual std::string GenerateUniqueId();
+
+  // OptimizeForLogWrite will create a new EnvOptions object that is a copy of
+  // the EnvOptions in the parameters, but is optimized for writing log files.
+  // Default implementation returns the copy of the same object.
+  virtual EnvOptions OptimizeForLogWrite(const EnvOptions& env_options) const;
+
+  // OptimizeForManifestWrite will create a new EnvOptions object that is a
+  // copy of the EnvOptions in the parameters, but is optimized for writing
+  // manifest files. Default implementation returns the copy of the same
+  // object.
+  virtual EnvOptions OptimizeForManifestWrite(const EnvOptions& env_options)
+      const;
+
+ private:
+  // No copying allowed
+  Env(const Env&);
+  void operator=(const Env&);
+};
+
+// A file abstraction for reading sequentially through a file
+class SequentialFile {
+ public:
+  SequentialFile() { }
+  virtual ~SequentialFile();
+
+  // Read up to "n" bytes from the file.  "scratch[0..n-1]" may be
+  // written by this routine.  Sets "*result" to the data that was
+  // read (including if fewer than "n" bytes were successfully read).
+  // May set "*result" to point at data in "scratch[0..n-1]", so
+  // "scratch[0..n-1]" must be live when "*result" is used.
+  // If an error was encountered, returns a non-OK status.
+  //
+  // REQUIRES: External synchronization
+  virtual Status Read(size_t n, Slice* result, char* scratch) = 0;
+
+  // Skip "n" bytes from the file. This is guaranteed to be no
+  // slower than reading the same data, but may be faster.
+  //
+  // If end of file is reached, skipping will stop at the end of the
+  // file, and Skip will return OK.
+  //
+  // REQUIRES: External synchronization
+  virtual Status Skip(uint64_t n) = 0;
+
+  // Remove any kind of caching of data from the offset to offset+length
+  // of this file. If the length is 0, then it refers to the end of file.
+  // If the system is not caching the file contents, then this is a noop.
+  virtual Status InvalidateCache(size_t offset, size_t length) {
+    return Status::NotSupported("InvalidateCache not supported.");
+  }
+};
+
+// A file abstraction for randomly reading the contents of a file.
+class RandomAccessFile {
+ public:
+  RandomAccessFile() { }
+  virtual ~RandomAccessFile();
+
+  // Read up to "n" bytes from the file starting at "offset".
+  // "scratch[0..n-1]" may be written by this routine.  Sets "*result"
+  // to the data that was read (including if fewer than "n" bytes were
+  // successfully read).  May set "*result" to point at data in
+  // "scratch[0..n-1]", so "scratch[0..n-1]" must be live when
+  // "*result" is used.  If an error was encountered, returns a non-OK
+  // status.
+  //
+  // Safe for concurrent use by multiple threads.
+  virtual Status Read(uint64_t offset, size_t n, Slice* result,
+                      char* scratch) const = 0;
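+
+  // Example (illustrative sketch) of a positional read through this
+  // interface; "env" and "fname" are assumed to exist:
+  //
+  //   unique_ptr<RandomAccessFile> file;
+  //   Status s = env->NewRandomAccessFile(fname, &file, EnvOptions());
+  //   char scratch[4096];
+  //   Slice data;
+  //   if (s.ok()) {
+  //     s = file->Read(/*offset=*/0, sizeof(scratch), &data, scratch);
+  //   }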
+
+  // Tries to get a unique ID for this file that will be the same each time
+  // the file is opened (and will stay the same while the file is open).
+  // Furthermore, it tries to make this ID at most "max_size" bytes. If such an
+  // ID can be created this function returns the length of the ID and places it
+  // in "id"; otherwise, this function returns 0, in which case "id"
+  // may not have been modified.
+  //
+  // This function guarantees, for IDs from a given environment, two unique ids
+  // cannot be made equal to each other by adding arbitrary bytes to one of
+  // them. That is, no unique ID is the prefix of another.
+  //
+  // This function guarantees that the returned ID will not be interpretable as
+  // a single varint.
+  //
+  // Note: these IDs are only valid for the duration of the process.
+  virtual size_t GetUniqueId(char* id, size_t max_size) const {
+    return 0; // Default implementation to prevent issues with backwards
+              // compatibility.
+  }
+
+  enum AccessPattern { NORMAL, RANDOM, SEQUENTIAL, WILLNEED, DONTNEED };
+
+  virtual void Hint(AccessPattern pattern) {}
+
+  // Remove any kind of caching of data from the offset to offset+length
+  // of this file. If the length is 0, then it refers to the end of file.
+  // If the system is not caching the file contents, then this is a noop.
+  virtual Status InvalidateCache(size_t offset, size_t length) {
+    return Status::NotSupported("InvalidateCache not supported.");
+  }
+};
+
+// A file abstraction for sequential writing.  The implementation
+// must provide buffering since callers may append small fragments
+// at a time to the file.
+class WritableFile {
+ public:
+  WritableFile()
+      : last_preallocated_block_(0), preallocation_block_size_(0) {
+  }
+  virtual ~WritableFile();
+
+  virtual Status Append(const Slice& data) = 0;
+  virtual Status Close() = 0;
+  virtual Status Flush() = 0;
+  virtual Status Sync() = 0; // sync data
+
+  /*
+   * Sync data and/or metadata as well.
+   * By default, sync only data.
+   * Override this method for environments where we need to sync
+   * metadata as well.
+   */
+  virtual Status Fsync() {
+    return Sync();
+  }
+
+  /*
+   * Get the size of valid data in the file.
+   */
+  virtual uint64_t GetFileSize() {
+    return 0;
+  }
+
+  /*
+   * Get and set the default pre-allocation block size for writes to
+   * this file.  If non-zero, then Allocate will be used to extend the
+   * underlying storage of a file (generally via fallocate) if the Env
+   * instance supports it.
+   */
+  void SetPreallocationBlockSize(size_t size) {
+    preallocation_block_size_ = size;
+  }
+
+  virtual void GetPreallocationStatus(size_t* block_size,
+                                      size_t* last_allocated_block) {
+    *last_allocated_block = last_preallocated_block_;
+    *block_size = preallocation_block_size_;
+  }
+
+  // For documentation, refer to RandomAccessFile::GetUniqueId()
+  virtual size_t GetUniqueId(char* id, size_t max_size) const {
+    return 0; // Default implementation to prevent issues with backwards
+              // compatibility.
+  }
+
+  // Remove any kind of caching of data from the offset to offset+length
+  // of this file. If the length is 0, then it refers to the end of file.
+  // If the system is not caching the file contents, then this is a noop.
+  // This call has no effect on dirty pages in the cache.
+  virtual Status InvalidateCache(size_t offset, size_t length) {
+    return Status::NotSupported("InvalidateCache not supported.");
+  }
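+
+  // Example (illustrative sketch): appending to a WritableFile and making
+  // the data durable; "env" is assumed to exist:
+  //
+  //   unique_ptr<WritableFile> file;
+  //   Status s = env->NewWritableFile("/tmp/example", &file, EnvOptions());
+  //   if (s.ok()) s = file->Append(Slice("some data"));
+  //   if (s.ok()) s = file->Fsync();  // data plus metadata
+  //   if (s.ok()) s = file->Close();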
+
+ protected:
+  // PrepareWrite performs any necessary preparation for a write
+  // before the write actually occurs.  This allows for pre-allocation
+  // of space on devices where it can result in less file
+  // fragmentation and/or less waste from over-zealous filesystem
+  // pre-allocation.
+  void PrepareWrite(size_t offset, size_t len) {
+    if (preallocation_block_size_ == 0) {
+      return;
+    }
+    // If this write would cross one or more preallocation blocks,
+    // determine what the last preallocation block necessary to
+    // cover this write would be and Allocate to that point.
+    const auto block_size = preallocation_block_size_;
+    size_t new_last_preallocated_block =
+        (offset + len + block_size - 1) / block_size;
+    if (new_last_preallocated_block > last_preallocated_block_) {
+      size_t num_spanned_blocks =
+          new_last_preallocated_block - last_preallocated_block_;
+      Allocate(block_size * last_preallocated_block_,
+               block_size * num_spanned_blocks);
+      last_preallocated_block_ = new_last_preallocated_block;
+    }
+  }
+
+  /*
+   * Pre-allocate space for a file.
+   */
+  virtual Status Allocate(off_t offset, off_t len) {
+    return Status::OK();
+  }
+
+  // Sync a file range with disk.
+  // offset is the starting byte of the file range to be synchronized.
+  // nbytes specifies the length of the range to be synchronized.
+  // This asks the OS to initiate flushing the cached data to disk,
+  // without waiting for completion.
+  // Default implementation does nothing.
+  virtual Status RangeSync(off_t offset, off_t nbytes) {
+    return Status::OK();
+  }
+
+ private:
+  size_t last_preallocated_block_;
+  size_t preallocation_block_size_;
+  // No copying allowed
+  WritableFile(const WritableFile&);
+  void operator=(const WritableFile&);
+};
+
+// A file abstraction for random reading and writing.
+class RandomRWFile {
+ public:
+  RandomRWFile() {}
+  virtual ~RandomRWFile() {}
+
+  // Write data from Slice data to file starting from offset.
+  // Returns IOError on failure, but does not guarantee
+  // atomicity of a write.  Returns OK status on success.
+  //
+  // Safe for concurrent use.
+  virtual Status Write(uint64_t offset, const Slice& data) = 0;
+
+  // Read up to "n" bytes from the file starting at "offset".
+  // "scratch[0..n-1]" may be written by this routine.  Sets "*result"
+  // to the data that was read (including if fewer than "n" bytes were
+  // successfully read).  May set "*result" to point at data in
+  // "scratch[0..n-1]", so "scratch[0..n-1]" must be live when
+  // "*result" is used.  If an error was encountered, returns a non-OK
+  // status.
+  //
+  // Safe for concurrent use by multiple threads.
+  virtual Status Read(uint64_t offset, size_t n, Slice* result,
+                      char* scratch) const = 0;
+
+  virtual Status Close() = 0; // closes the file
+  virtual Status Sync() = 0; // sync data
+
+  /*
+   * Sync data and/or metadata as well.
+   * By default, sync only data.
+   * Override this method for environments where we need to sync
+   * metadata as well.
+   */
+  virtual Status Fsync() {
+    return Sync();
+  }
+
+  /*
+   * Pre-allocate space for a file.
+   */
+  virtual Status Allocate(off_t offset, off_t len) {
+    return Status::OK();
+  }
+
+ private:
+  // No copying allowed
+  RandomRWFile(const RandomRWFile&);
+  void operator=(const RandomRWFile&);
+};
+
+// Directory object represents collection of files and implements
+// filesystem operations that can be executed on directories.
+class Directory {
+ public:
+  virtual ~Directory() {}
+  // Fsync directory
+  virtual Status Fsync() = 0;
+};
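+
+// Example (illustrative sketch): after creating or renaming a file inside a
+// directory, fsync the directory itself so the directory entry is durable;
+// "env" and "dbname" are assumed to exist:
+//
+//   unique_ptr<Directory> dir;
+//   Status s = env->NewDirectory(dbname, &dir);
+//   if (s.ok()) s = dir->Fsync();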
+
+enum InfoLogLevel : unsigned char {
+  DEBUG_LEVEL = 0,
+  INFO_LEVEL,
+  WARN_LEVEL,
+  ERROR_LEVEL,
+  FATAL_LEVEL,
+  NUM_INFO_LOG_LEVELS,
+};
+
+// An interface for writing log messages.
+class Logger {
+ public:
+  enum { DO_NOT_SUPPORT_GET_LOG_FILE_SIZE = -1 };
+  explicit Logger(const InfoLogLevel log_level = InfoLogLevel::INFO_LEVEL)
+      : log_level_(log_level) {}
+  virtual ~Logger();
+
+  // Write an entry to the log file with the specified format.
+  virtual void Logv(const char* format, va_list ap) = 0;
+
+  // Write an entry to the log file with the specified log level
+  // and format.  Any log with level under the internal log level
+  // of *this (see @SetInfoLogLevel and @GetInfoLogLevel) will not be
+  // printed.
+  void Logv(const InfoLogLevel log_level, const char* format, va_list ap) {
+    static const char* kInfoLogLevelNames[5] = {"DEBUG", "INFO", "WARN",
+                                                "ERROR", "FATAL"};
+    if (log_level < log_level_) {
+      return;
+    }
+
+    if (log_level == InfoLogLevel::INFO_LEVEL) {
+      // Doesn't print log level if it is INFO level.
+      // This is to avoid unexpected performance regression after we add
+      // the feature of log level. All the logs before we add the feature
+      // are INFO level. We don't want to add extra costs to those existing
+      // logging.
+      Logv(format, ap);
+    } else {
+      char new_format[500];
+      snprintf(new_format, sizeof(new_format) - 1, "[%s] %s",
+               kInfoLogLevelNames[log_level], format);
+      Logv(new_format, ap);
+    }
+  }
+
+  virtual size_t GetLogFileSize() const {
+    return DO_NOT_SUPPORT_GET_LOG_FILE_SIZE;
+  }
+  // Flush to the OS buffers
+  virtual void Flush() {}
+  virtual InfoLogLevel GetInfoLogLevel() const { return log_level_; }
+  virtual void SetInfoLogLevel(const InfoLogLevel log_level) {
+    log_level_ = log_level;
+  }
+
+ private:
+  // No copying allowed
+  Logger(const Logger&);
+  void operator=(const Logger&);
+  InfoLogLevel log_level_;
+};
+
+
+// Identifies a locked file.
+class FileLock {
+ public:
+  FileLock() { }
+  virtual ~FileLock();
+ private:
+  // No copying allowed
+  FileLock(const FileLock&);
+  void operator=(const FileLock&);
+};
+
+extern void LogFlush(const shared_ptr<Logger>& info_log);
+
+extern void Log(const InfoLogLevel log_level,
+                const shared_ptr<Logger>& info_log, const char* format, ...);
+
+// a set of log functions with different log levels.
+extern void Debug(const shared_ptr<Logger>& info_log, const char* format, ...);
+extern void Info(const shared_ptr<Logger>& info_log, const char* format, ...);
+extern void Warn(const shared_ptr<Logger>& info_log, const char* format, ...);
+extern void Error(const shared_ptr<Logger>& info_log, const char* format, ...);
+extern void Fatal(const shared_ptr<Logger>& info_log, const char* format, ...);
+
+// Log the specified data to *info_log if info_log is non-nullptr.
+// The default info log level is InfoLogLevel::ERROR.
+extern void Log(const shared_ptr<Logger>& info_log, const char* format, ...)
+#   if defined(__GNUC__) || defined(__clang__)
+    __attribute__((__format__ (__printf__, 2, 3)))
+#   endif
+    ;
+
+extern void LogFlush(Logger* info_log);
+
+extern void Log(const InfoLogLevel log_level, Logger* info_log,
+                const char* format, ...);
+
+// The default info log level is InfoLogLevel::ERROR.
+extern void Log(Logger* info_log, const char* format, ...)
+#   if defined(__GNUC__) || defined(__clang__)
+    __attribute__((__format__ (__printf__, 2, 3)))
+#   endif
+    ;
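+
+// Example (illustrative sketch) of the leveled logging calls above,
+// assuming a shared_ptr<Logger> named info_log and an int num_files:
+//
+//   Log(InfoLogLevel::WARN_LEVEL, info_log,
+//       "compaction falling behind: %d pending files", num_files);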
+
+// a set of log functions with different log levels.
+extern void Debug(Logger* info_log, const char* format, ...);
+extern void Info(Logger* info_log, const char* format, ...);
+extern void Warn(Logger* info_log, const char* format, ...);
+extern void Error(Logger* info_log, const char* format, ...);
+extern void Fatal(Logger* info_log, const char* format, ...);
+
+// A utility routine: write "data" to the named file.
+extern Status WriteStringToFile(Env* env, const Slice& data,
+                                const std::string& fname,
+                                bool should_sync = false);
+
+// A utility routine: read contents of named file into *data
+extern Status ReadFileToString(Env* env, const std::string& fname,
+                               std::string* data);
+
+// An implementation of Env that forwards all calls to another Env.
+// May be useful to clients who wish to override just part of the
+// functionality of another Env.
+class EnvWrapper : public Env {
+ public:
+  // Initialize an EnvWrapper that delegates all calls to *t
+  explicit EnvWrapper(Env* t) : target_(t) { }
+  virtual ~EnvWrapper();
+
+  // Return the target to which this Env forwards all calls
+  Env* target() const { return target_; }
+
+  // The following text is boilerplate that forwards all methods to target()
+  Status NewSequentialFile(const std::string& f,
+                           unique_ptr<SequentialFile>* r,
+                           const EnvOptions& options) {
+    return target_->NewSequentialFile(f, r, options);
+  }
+  Status NewRandomAccessFile(const std::string& f,
+                             unique_ptr<RandomAccessFile>* r,
+                             const EnvOptions& options) {
+    return target_->NewRandomAccessFile(f, r, options);
+  }
+  Status NewWritableFile(const std::string& f, unique_ptr<WritableFile>* r,
+                         const EnvOptions& options) {
+    return target_->NewWritableFile(f, r, options);
+  }
+  Status NewRandomRWFile(const std::string& f, unique_ptr<RandomRWFile>* r,
+                         const EnvOptions& options) {
+    return target_->NewRandomRWFile(f, r, options);
+  }
+  virtual Status NewDirectory(const std::string& name,
+                              unique_ptr<Directory>* result) {
+    return target_->NewDirectory(name, result);
+  }
+  bool FileExists(const std::string& f) { return target_->FileExists(f); }
+  Status GetChildren(const std::string& dir, std::vector<std::string>* r) {
+    return target_->GetChildren(dir, r);
+  }
+  Status DeleteFile(const std::string& f) { return target_->DeleteFile(f); }
+  Status CreateDir(const std::string& d) { return target_->CreateDir(d); }
+  Status CreateDirIfMissing(const std::string& d) {
+    return target_->CreateDirIfMissing(d);
+  }
+  Status DeleteDir(const std::string& d) { return target_->DeleteDir(d); }
+  Status GetFileSize(const std::string& f, uint64_t* s) {
+    return target_->GetFileSize(f, s);
+  }
+
+  Status GetFileModificationTime(const std::string& fname,
+                                 uint64_t* file_mtime) {
+    return target_->GetFileModificationTime(fname, file_mtime);
+  }
+
+  Status RenameFile(const std::string& s, const std::string& t) {
+    return target_->RenameFile(s, t);
+  }
+  Status LockFile(const std::string& f, FileLock** l) {
+    return target_->LockFile(f, l);
+  }
+  Status UnlockFile(FileLock* l) { return target_->UnlockFile(l); }
+  void Schedule(void (*f)(void*), void* a, Priority pri) {
+    return target_->Schedule(f, a, pri);
+  }
+  void StartThread(void (*f)(void*), void* a) {
+    return target_->StartThread(f, a);
+  }
+  void WaitForJoin() { return target_->WaitForJoin(); }
+  virtual unsigned int GetThreadPoolQueueLen(Priority pri = LOW) const {
+    return target_->GetThreadPoolQueueLen(pri);
+  }
+  virtual Status GetTestDirectory(std::string* path) {
+    return target_->GetTestDirectory(path);
+  }
+  virtual Status NewLogger(const std::string& fname,
+                           shared_ptr<Logger>* result) {
+    return target_->NewLogger(fname, result);
+  }
+  uint64_t NowMicros() {
+    return target_->NowMicros();
+  }
+  void SleepForMicroseconds(int micros) {
+    target_->SleepForMicroseconds(micros);
+  }
+  Status GetHostName(char* name, uint64_t len) {
+    return target_->GetHostName(name, len);
+  }
+  Status GetCurrentTime(int64_t* unix_time) {
+    return target_->GetCurrentTime(unix_time);
+  }
+  Status GetAbsolutePath(const std::string& db_path,
+                         std::string* output_path) {
+    return target_->GetAbsolutePath(db_path, output_path);
+  }
+  void SetBackgroundThreads(int num, Priority pri) {
+    return target_->SetBackgroundThreads(num, pri);
+  }
+  std::string TimeToString(uint64_t time) {
+    return target_->TimeToString(time);
+  }
+
+ private:
+  Env* target_;
+};
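+
+// Example (illustrative sketch): a custom Env that overrides one call and
+// forwards everything else; the class and counter are hypothetical:
+//
+//   class CountingEnv : public EnvWrapper {
+//    public:
+//     explicit CountingEnv(Env* base) : EnvWrapper(base), deletes_(0) {}
+//     virtual Status DeleteFile(const std::string& f) {
+//       ++deletes_;                        // count every file deletion
+//       return EnvWrapper::DeleteFile(f);  // then forward to the target
+//     }
+//    private:
+//     int deletes_;
+//   };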
+
+// Returns a new environment that stores its data in memory and delegates
+// all non-file-storage tasks to base_env. The caller must delete the result
+// when it is no longer needed.
+// *base_env must remain live while the result is in use.
+Env* NewMemEnv(Env* base_env);
+
+}  // namespace rocksdb
+
+#endif  // STORAGE_ROCKSDB_INCLUDE_ENV_H_
diff --git a/include/rocksdb/filter_policy.h b/include/rocksdb/filter_policy.h
new file mode 100644
index 0000000000..fa44db45ff
--- /dev/null
+++ b/include/rocksdb/filter_policy.h
@@ -0,0 +1,74 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// A database can be configured with a custom FilterPolicy object.
+// This object is responsible for creating a small filter from a set
+// of keys.  These filters are stored in rocksdb and are consulted
+// automatically by rocksdb to decide whether or not to read some
+// information from disk. In many cases, a filter can cut down the
+// number of disk seeks from a handful to a single disk seek per
+// DB::Get() call.
+//
+// Most people will want to use the builtin bloom filter support (see
+// NewBloomFilterPolicy() below).
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_FILTER_POLICY_H_
+#define STORAGE_ROCKSDB_INCLUDE_FILTER_POLICY_H_
+
+#include <string>
+
+namespace rocksdb {
+
+class Slice;
+
+class FilterPolicy {
+ public:
+  virtual ~FilterPolicy();
+
+  // Return the name of this policy.  Note that if the filter encoding
+  // changes in an incompatible way, the name returned by this method
+  // must be changed.  Otherwise, old incompatible filters may be
+  // passed to methods of this type.
+  virtual const char* Name() const = 0;
+
+  // keys[0,n-1] contains a list of keys (potentially with duplicates)
+  // that are ordered according to the user supplied comparator.
+  // Append a filter that summarizes keys[0,n-1] to *dst.
+  //
+  // Warning: do not change the initial contents of *dst.  Instead,
+  // append the newly constructed filter to *dst.
+  virtual void CreateFilter(const Slice* keys, int n, std::string* dst)
+      const = 0;
+
+  // "filter" contains the data appended by a preceding call to
+  // CreateFilter() on this class.  This method must return true if
+  // the key was in the list of keys passed to CreateFilter().
+  // This method may return true or false if the key was not on the
+  // list, but it should aim to return false with a high probability.
+  virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const = 0;
+};
+
+// Return a new filter policy that uses a bloom filter with approximately
+// the specified number of bits per key.  A good value for bits_per_key
+// is 10, which yields a filter with ~ 1% false positive rate.
+//
+// Callers must delete the result after any database that is using the
+// result has been closed.
+//
+// Note: if you are using a custom comparator that ignores some parts
+// of the keys being compared, you must not use NewBloomFilterPolicy()
+// and must provide your own FilterPolicy that also ignores the
+// corresponding parts of the keys.  For example, if the comparator
+// ignores trailing spaces, it would be incorrect to use a
+// FilterPolicy (like NewBloomFilterPolicy) that does not ignore
+// trailing spaces in keys.
+extern const FilterPolicy* NewBloomFilterPolicy(int bits_per_key);
+
+}  // namespace rocksdb
+
+#endif  // STORAGE_ROCKSDB_INCLUDE_FILTER_POLICY_H_
diff --git a/include/rocksdb/flush_block_policy.h b/include/rocksdb/flush_block_policy.h
new file mode 100644
index 0000000000..8340ad616e
--- /dev/null
+++ b/include/rocksdb/flush_block_policy.h
@@ -0,0 +1,58 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+#pragma once
+
+#include <string>
+
+namespace rocksdb {
+
+class Slice;
+class BlockBuilder;
+struct Options;
+
+// FlushBlockPolicy provides a configurable way to determine when to flush a
+// block in the block based tables.
+class FlushBlockPolicy {
+ public:
+  // Keep track of the key/value sequences and return the boolean value to
+  // determine if table builder should flush current data block.
+  virtual bool Update(const Slice& key,
+                      const Slice& value) = 0;
+
+  virtual ~FlushBlockPolicy() { }
+};
+
+class FlushBlockPolicyFactory {
+ public:
+  // Return the name of the flush block policy.
+  virtual const char* Name() const = 0;
+
+  // Return a new block flush policy that flushes data blocks by data size.
+  // FlushBlockPolicy may need to access the metadata of the data block
+  // builder to determine when to flush the blocks.
+  //
+  // Callers must delete the result after any database that is using the
+  // result has been closed.
+  virtual FlushBlockPolicy* NewFlushBlockPolicy(
+      const Options& options, const BlockBuilder& data_block_builder) const = 0;
+
+  virtual ~FlushBlockPolicyFactory() { }
+};
+
+class FlushBlockBySizePolicyFactory : public FlushBlockPolicyFactory {
+ public:
+  FlushBlockBySizePolicyFactory() {}
+
+  virtual const char* Name() const override {
+    return "FlushBlockBySizePolicyFactory";
+  }
+
+  virtual FlushBlockPolicy* NewFlushBlockPolicy(
+      const Options& options,
+      const BlockBuilder& data_block_builder) const override;
+};
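+
+// Example (illustrative sketch): a custom policy that cuts a block after a
+// fixed number of entries; the class is hypothetical, and a matching
+// FlushBlockPolicyFactory would be needed to install it:
+//
+//   class FlushEveryN : public FlushBlockPolicy {
+//    public:
+//     explicit FlushEveryN(int n) : n_(n), count_(0) {}
+//     virtual bool Update(const Slice& key, const Slice& value) {
+//       if (++count_ < n_) {
+//         return false;   // keep filling the current block
+//       }
+//       count_ = 0;
+//       return true;      // ask the table builder to cut the block here
+//     }
+//    private:
+//     const int n_;
+//     int count_;
+//   };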
+
+}  // rocksdb
diff --git a/include/rocksdb/iterator.h b/include/rocksdb/iterator.h
new file mode 100644
index 0000000000..7538e9cfb5
--- /dev/null
+++ b/include/rocksdb/iterator.h
@@ -0,0 +1,106 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// An iterator yields a sequence of key/value pairs from a source.
+// The following class defines the interface.  Multiple implementations
+// are provided by this library.  In particular, iterators are provided
+// to access the contents of a Table or a DB.
+//
+// Multiple threads can invoke const methods on an Iterator without
+// external synchronization, but if any of the threads may call a
+// non-const method, all threads accessing the same Iterator must use
+// external synchronization.
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_ITERATOR_H_
+#define STORAGE_ROCKSDB_INCLUDE_ITERATOR_H_
+
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+
+namespace rocksdb {
+
+class Iterator {
+ public:
+  Iterator();
+  virtual ~Iterator();
+
+  // An iterator is either positioned at a key/value pair, or
+  // not valid.  This method returns true iff the iterator is valid.
+  virtual bool Valid() const = 0;
+
+  // Position at the first key in the source.  The iterator is Valid()
+  // after this call iff the source is not empty.
+  virtual void SeekToFirst() = 0;
+
+  // Position at the last key in the source.  The iterator is
+  // Valid() after this call iff the source is not empty.
+  virtual void SeekToLast() = 0;
+
+  // Position at the first key in the source that is at or past target.
+  // The iterator is Valid() after this call iff the source contains
+  // an entry that comes at or past target.
+  virtual void Seek(const Slice& target) = 0;
+
+  // Moves to the next entry in the source.  After this call, Valid() is
+  // true iff the iterator was not positioned at the last entry in the source.
+  // REQUIRES: Valid()
+  virtual void Next() = 0;
+
+  // Moves to the previous entry in the source.  After this call, Valid() is
+  // true iff the iterator was not positioned at the first entry in source.
+  // REQUIRES: Valid()
+  virtual void Prev() = 0;
+
+  // Return the key for the current entry.  The underlying storage for
+  // the returned slice is valid only until the next modification of
+  // the iterator.
+  // REQUIRES: Valid()
+  virtual Slice key() const = 0;
+
+  // Return the value for the current entry.  The underlying storage for
+  // the returned slice is valid only until the next modification of
+  // the iterator.
+  // REQUIRES: Valid()
+  virtual Slice value() const = 0;
+
+  // If an error has occurred, return it.  Else return an ok status.
+  // If non-blocking IO is requested and this operation cannot be
+  // satisfied without doing some IO, then this returns Status::Incomplete().
+  virtual Status status() const = 0;
+
+  // Clients are allowed to register function/arg1/arg2 triples that
+  // will be invoked when this iterator is destroyed.
+  //
+  // Note that unlike all of the preceding methods, this method is
+  // not abstract and therefore clients should not override it.
+  typedef void (*CleanupFunction)(void* arg1, void* arg2);
+  void RegisterCleanup(CleanupFunction function, void* arg1, void* arg2);
+
+ private:
+  struct Cleanup {
+    CleanupFunction function;
+    void* arg1;
+    void* arg2;
+    Cleanup* next;
+  };
+  Cleanup cleanup_;
+
+  // No copying allowed
+  Iterator(const Iterator&);
+  void operator=(const Iterator&);
+};
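+
+// Example (illustrative sketch): the canonical scan loop over an Iterator,
+// e.g. one obtained from DB::NewIterator():
+//
+//   for (it->SeekToFirst(); it->Valid(); it->Next()) {
+//     // ... use it->key() and it->value() ...
+//   }
+//   assert(it->status().ok());  // check for errors once the loop ends
+//   delete it;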
+
+// Return an empty iterator (yields nothing).
+extern Iterator* NewEmptyIterator();
+
+// Return an empty iterator with the specified status.
+extern Iterator* NewErrorIterator(const Status& status);
+
+}  // namespace rocksdb
+
+#endif  // STORAGE_ROCKSDB_INCLUDE_ITERATOR_H_
diff --git a/include/rocksdb/ldb_tool.h b/include/rocksdb/ldb_tool.h
new file mode 100644
index 0000000000..46bacc8068
--- /dev/null
+++ b/include/rocksdb/ldb_tool.h
@@ -0,0 +1,18 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+#ifndef ROCKSDB_LITE
+#pragma once
+#include "rocksdb/options.h"
+
+namespace rocksdb {
+
+class LDBTool {
+ public:
+  void Run(int argc, char** argv, Options = Options());
+};
+
+}  // namespace rocksdb
+
+#endif  // ROCKSDB_LITE
diff --git a/include/rocksdb/memtablerep.h b/include/rocksdb/memtablerep.h
new file mode 100644
index 0000000000..6134fd1660
--- /dev/null
+++ b/include/rocksdb/memtablerep.h
@@ -0,0 +1,284 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// This file contains the interface that must be implemented by any collection
+// to be used as the backing store for a MemTable. Such a collection must
+// satisfy the following properties:
+//  (1) It does not store duplicate items.
+//  (2) It uses MemTableRep::KeyComparator to compare items for iteration and
+//      equality.
+//  (3) It can be accessed concurrently by multiple readers while a write is
+//      in progress. However, it needn't support multiple concurrent writes.
+//  (4) Items are never deleted.
+// The liberal use of assertions is encouraged to enforce (1).
+//
+// The factory will be passed an Arena object when a new MemTableRep is
+// requested. The API for this object is in rocksdb/arena.h.
+//
+// Users can implement their own memtable representations. We include three
+// types built in:
+//  - SkipListRep: This is the default; it is backed by a skip list.
+//  - HashSkipListRep: The memtable rep that is best used for keys that are
+//    structured like "prefix:suffix" where iteration within a prefix is
+//    common and iteration across different prefixes is rare. It is backed by
+//    a hash map where each bucket is a skip list.
+//  - VectorRep: This is backed by an unordered std::vector. On iteration, the
+//    vector is sorted. It is intelligent about sorting; once MarkReadOnly()
+//    has been called, the vector will only be sorted once. It is optimized
+//    for random-write-heavy workloads.
+//
+// The last two implementations are designed for situations in which
+// iteration over the entire collection is rare since doing so requires all the
+// keys to be copied into a sorted data structure.
+
+#pragma once
+
+#include <memory>
+#include <stdint.h>
+
+namespace rocksdb {
+
+class Arena;
+class LookupKey;
+class Slice;
+class SliceTransform;
+class Logger;
+
+typedef void* KeyHandle;
+
+class MemTableRep {
+ public:
+  // KeyComparator provides a means to compare keys, which are internal keys
+  // concatenated with values.
+  class KeyComparator {
+   public:
+    // Compare a and b. Return a negative value if a is less than b, 0 if they
+    // are equal, and a positive value if a is greater than b.
+    virtual int operator()(const char* prefix_len_key1,
+                           const char* prefix_len_key2) const = 0;
+
+    virtual int operator()(const char* prefix_len_key,
+                           const Slice& key) const = 0;
+
+    virtual ~KeyComparator() { }
+  };
+
+  explicit MemTableRep(Arena* arena) : arena_(arena) {}
+
+  // Allocate a buf of len size for storing key. The idea is that a specific
+  // memtable representation knows its underlying data structure better. By
+  // allowing it to allocate memory, it can possibly put correlated stuff
+  // in consecutive memory area to make processor prefetching more efficient.
+  virtual KeyHandle Allocate(const size_t len, char** buf);
+
+  // Insert key into the collection. (The caller will pack key and value into a
+  // single buffer and pass that in as the parameter to Insert).
+  // REQUIRES: nothing that compares equal to key is currently in the
+  // collection.
+  virtual void Insert(KeyHandle handle) = 0;
+
+  // Returns true iff an entry that compares equal to key is in the collection.
+  virtual bool Contains(const char* key) const = 0;
+
+  // Notify this table rep that it will no longer be added to. By default, does
+  // nothing.
+  virtual void MarkReadOnly() { }
+
+  // Look up key from the mem table: starting with the first entry in the
+  // mem table whose user_key matches the given k, call callback_func(),
+  // with callback_args forwarded as the first parameter and the mem table
+  // entry as the second parameter. If the callback returns false, stop;
+  // otherwise continue with the next entry. Get() may stop once it has
+  // visited every potential entry for k.user_key(), but it is also safe
+  // for it to visit additional entries.
+  //
+  // Default: dynamically construct an iterator, seek to the key, and invoke
+  // the callback for each entry.
+  virtual void Get(const LookupKey& k, void* callback_args,
+                   bool (*callback_func)(void* arg, const char* entry));
+
+  // Report an approximation of how much memory has been used other than memory
+  // that was allocated through the arena.
+  virtual size_t ApproximateMemoryUsage() = 0;
+
+  virtual ~MemTableRep() { }
+
+  // Iteration over the contents of a skip collection
+  class Iterator {
+   public:
+    // Initialize an iterator over the specified collection.
+    // The returned iterator is not valid.
+    // explicit Iterator(const MemTableRep* collection);
+    virtual ~Iterator() {}
+
+    // Returns true iff the iterator is positioned at a valid node.
+    virtual bool Valid() const = 0;
+
+    // Returns the key at the current position.
+    // REQUIRES: Valid()
+    virtual const char* key() const = 0;
+
+    // Advances to the next position.
+    // REQUIRES: Valid()
+    virtual void Next() = 0;
+
+    // Advances to the previous position.
+    // REQUIRES: Valid()
+    virtual void Prev() = 0;
+
+    // Advance to the first entry with a key >= target
+    virtual void Seek(const Slice& internal_key, const char* memtable_key) = 0;
+
+    // Position at the first entry in collection.
+    // Final state of iterator is Valid() iff collection is not empty.
+    virtual void SeekToFirst() = 0;
+
+    // Position at the last entry in collection.
+    // Final state of iterator is Valid() iff collection is not empty.
+    virtual void SeekToLast() = 0;
+  };
+
+  // Return an iterator over the keys in this representation.
+  // arena: If not null, the arena needs to be used to allocate the Iterator.
+  //        When destroying the iterator, the caller will not call "delete"
+  //        but Iterator::~Iterator() directly. The destructor needs to destroy
+  //        all the states but those allocated in arena.
+  virtual Iterator* GetIterator(Arena* arena = nullptr) = 0;
+
+  // Return an iterator over at least the keys with the specified user key. The
+  // iterator may also allow access to other keys, but doesn't have to. Default:
+  // GetIterator().
+  virtual Iterator* GetIterator(const Slice& user_key) {
+    return GetIterator(nullptr);
+  }
+
+  // Return an iterator that has a special Seek semantics. The result of
+  // a Seek might only include keys with the same prefix as the target key.
+  // arena: If not null, the arena needs to be used to allocate the Iterator.
+  //        When destroying the iterator, the caller will not call "delete"
+  //        but Iterator::~Iterator() directly. The destructor needs to destroy
+  //        all the states but those allocated in arena.
+  virtual Iterator* GetDynamicPrefixIterator(Arena* arena = nullptr) {
+    return GetIterator(arena);
+  }
+
+  // Return true if the current MemTableRep supports merge operator.
+  // Default: true
+  virtual bool IsMergeOperatorSupported() const { return true; }
+
+  // Return true if the current MemTableRep supports snapshot
+  // Default: true
+  virtual bool IsSnapshotSupported() const { return true; }
+
+ protected:
+  // When *key is an internal key concatenated with the value, returns the
+  // user key.
+  virtual Slice UserKey(const char* key) const;
+
+  Arena* arena_;
+};
+
+// This is the base class for all factories that are used by RocksDB to create
+// new MemTableRep objects
+class MemTableRepFactory {
+ public:
+  virtual ~MemTableRepFactory() {}
+  virtual MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator&,
+                                         Arena*, const SliceTransform*,
+                                         Logger* logger) = 0;
+  virtual const char* Name() const = 0;
+};
+
+// This uses a skip list to store keys. It is the default.
+class SkipListFactory : public MemTableRepFactory {
+ public:
+  virtual MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator&,
+                                         Arena*, const SliceTransform*,
+                                         Logger* logger) override;
+  virtual const char* Name() const override { return "SkipListFactory"; }
+};
+
+#ifndef ROCKSDB_LITE
+// This creates MemTableReps that are backed by an std::vector. On iteration,
+// the vector is sorted. This is useful for workloads where iteration is very
+// rare and writes are generally not issued after reads begin.
+//
+// Parameters:
+//   count: Passed to the constructor of the underlying std::vector of each
+//     VectorRep. On initialization, the underlying vector will reserve space
+//     for at least count entries.
+class VectorRepFactory : public MemTableRepFactory {
+  const size_t count_;
+
+ public:
+  explicit VectorRepFactory(size_t count = 0) : count_(count) { }
+  virtual MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator&,
+                                         Arena*, const SliceTransform*,
+                                         Logger* logger) override;
+  virtual const char* Name() const override {
+    return "VectorRepFactory";
+  }
+};
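+
+// Example (illustrative sketch): installing one of the factories above via
+// the memtable_factory field of ColumnFamilyOptions (declared in
+// rocksdb/options.h):
+//
+//   rocksdb::Options options;
+//   options.memtable_factory.reset(new rocksdb::VectorRepFactory(1024));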
+
+// This factory creates memtables with a fixed array of buckets, each
+// pointing to a skiplist (null if the bucket is empty).
+// bucket_count: number of fixed array buckets
+// skiplist_height: the max height of the skiplist
+// skiplist_branching_factor: probabilistic size ratio between adjacent
+//                            link lists in the skiplist
+extern MemTableRepFactory* NewHashSkipListRepFactory(
+    size_t bucket_count = 1000000, int32_t skiplist_height = 4,
+    int32_t skiplist_branching_factor = 4);
+
+// The factory is to create memtables with a hashed linked list:
+// it contains a fixed array of buckets, each pointing to a sorted single
+// linked list (null if the bucket is empty).
+// @bucket_count: number of fixed array buckets
+// @huge_page_tlb_size: if <=0, allocate the hash table bytes from malloc.
+//                      Otherwise from huge page TLB. The user needs to
+//                      reserve huge pages for it to be allocated, like:
+//                          sysctl -w vm.nr_hugepages=20
+//                      See linux doc Documentation/vm/hugetlbpage.txt
+extern MemTableRepFactory* NewHashLinkListRepFactory(
+    size_t bucket_count = 50000, size_t huge_page_tlb_size = 0);
+
+// This factory creates a cuckoo-hashing based mem-table representation.
+// Cuckoo-hash is a closed-hash strategy, in which all key/value pairs
+// are stored in the bucket array itself instead of in some data structures
+// external to the bucket array.  In addition, each key in cuckoo hash
+// has a constant number of possible buckets in the bucket array.  These
+// two properties together make cuckoo hash more memory-efficient and
+// give it a constant worst-case read time.  Cuckoo hash is best suited
+// for point-lookup workloads.
+//
+// When inserting a key / value, it first checks whether one of its possible
+// buckets is empty. If so, the key / value will be inserted to that vacant
+// bucket.  Otherwise, one of the keys originally stored in one of these
+// possible buckets will be "kicked out" and moved to one of its possible
+// buckets (and possibly kicks out another victim.)  In the current
+// implementation, such "kick-out" paths are bounded.  If it cannot find a
+// "kick-out" path for a specific key, this key will be stored in a backup
+// structure, and the current memtable will be forced to become immutable.
+//
+// Note that currently this mem-table representation does not support
+// snapshot (i.e., it only queries latest state) and iterators. In addition,
+// MultiGet operation might also lose its atomicity due to the lack of
+// snapshot support.
+//
+// Parameters:
+//   write_buffer_size: the write buffer size in bytes.
+//   average_data_size: the average size of key + value in bytes.  This value
+//     together with write_buffer_size will be used to compute the number
+//     of buckets.
+//   hash_function_count: the number of hash functions that will be used by
+//     the cuckoo-hash.  The number also equals to the number of possible
+//     buckets each key will have.
+extern MemTableRepFactory* NewHashCuckooRepFactory(
+    size_t write_buffer_size, size_t average_data_size = 64,
+    unsigned int hash_function_count = 4);
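+
+// Example (illustrative sketch): the hash-based factories above bucket keys
+// by prefix, so a prefix extractor must be configured as well
+// (NewFixedPrefixTransform is declared in rocksdb/slice_transform.h):
+//
+//   options.prefix_extractor.reset(rocksdb::NewFixedPrefixTransform(8));
+//   options.memtable_factory.reset(rocksdb::NewHashSkipListRepFactory());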
+#endif  // ROCKSDB_LITE
+}  // namespace rocksdb
diff --git a/include/rocksdb/merge_operator.h b/include/rocksdb/merge_operator.h
new file mode 100644
index 0000000000..2ae64c1bc2
--- /dev/null
+++ b/include/rocksdb/merge_operator.h
@@ -0,0 +1,182 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_MERGE_OPERATOR_H_
+#define STORAGE_ROCKSDB_INCLUDE_MERGE_OPERATOR_H_
+
+#include <deque>
+#include <memory>
+#include <string>
+#include "rocksdb/slice.h"
+
+namespace rocksdb {
+
+class Slice;
+class Logger;
+
+// The Merge Operator
+//
+// Essentially, a MergeOperator specifies the SEMANTICS of a merge, which only
+// the client knows. It could be numeric addition, list append, string
+// concatenation, edit data structure, ... , anything.
+// The library, on the other hand, is concerned with the exercise of this
+// interface, at the right time (during get, iteration, compaction...)
+//
+// To use merge, the client needs to provide an object implementing one of
+// the following interfaces:
+//  a) AssociativeMergeOperator - for most simple semantics (always take
+//     two values, and merge them into one value, which is then put back
+//     into rocksdb); numeric addition and string concatenation are examples;
+//
+//  b) MergeOperator - the generic class for all the more abstract / complex
+//     operations; one method (FullMerge) to merge a Put/Delete value with a
+//     merge operand; and another method (PartialMerge) that merges multiple
+//     operands together. This is especially useful if your key values have
+//     complex structures but you would still like to support client-specific
+//     incremental updates.
+//
+// AssociativeMergeOperator is simpler to implement. MergeOperator is simply
+// more powerful.
+//
+// Refer to rocksdb-merge wiki for more details and example implementations.
+//
+class MergeOperator {
+ public:
+  virtual ~MergeOperator() {}
+
+  // Gives the client a way to express the read -> modify -> write semantics
+  // key:      (IN) The key that's associated with this merge operation.
+  //                Client could multiplex the merge operator based on it
+  //                if the key space is partitioned and different subspaces
+  //                refer to different types of data which have different
+  //                merge operation semantics
+  // existing: (IN) null indicates that the key does not exist before this op
+  // operand_list:(IN) the sequence of merge operations to apply, front() first.
+  // new_value:(OUT) Client is responsible for filling the merge result here
+  // logger:   (IN) Client could use this to log errors during merge.
+  //
+  // Return true on success.
+  // All values passed in will be client-specific values. So if this method
+  // returns false, it is because client specified bad data or there was
+  // internal corruption. This will be treated as an error by the library.
+  //
+  // Also make use of the *logger for error messages.
+  virtual bool FullMerge(const Slice& key,
+                         const Slice* existing_value,
+                         const std::deque<std::string>& operand_list,
+                         std::string* new_value,
+                         Logger* logger) const = 0;
+
+  // This function performs merge(left_op, right_op)
+  // when both the operands are themselves merge operation types
+  // that you would have passed to a DB::Merge() call in the same order
+  // (i.e.: DB::Merge(key,left_op), followed by DB::Merge(key,right_op)).
+  //
+  // PartialMerge should combine them into a single merge operation that is
+  // saved into *new_value, and then it should return true.
+  // *new_value should be constructed such that a call to
+  // DB::Merge(key, *new_value) would yield the same result as a call
+  // to DB::Merge(key, left_op) followed by DB::Merge(key, right_op).
+  //
+  // The default implementation of PartialMergeMulti will use this function
+  // as a helper, for backward compatibility.  Any successor class of
+  // MergeOperator should either implement PartialMerge or PartialMergeMulti,
+  // although implementing PartialMergeMulti is suggested as it is in general
+  // more effective to merge multiple operands at a time instead of two
+  // operands at a time.
+  //
+  // If it is impossible or infeasible to combine the two operations,
+  // leave new_value unchanged and return false. The library will
+  // internally keep track of the operations, and apply them in the
+  // correct order once a base-value (a Put/Delete/End-of-Database) is seen.
+  //
+  // TODO: Presently there is no way to differentiate between error/corruption
+  // and simply "return false". For now, the client should simply return
+  // false in any case it cannot perform partial-merge, regardless of reason.
+  // If there is corruption in the data, handle it in the FullMerge() function,
+  // and return false there.  The default implementation of PartialMerge will
+  // always return false.
+  virtual bool PartialMerge(const Slice& key, const Slice& left_operand,
+                            const Slice& right_operand, std::string* new_value,
+                            Logger* logger) const {
+    return false;
+  }
+
+  // This function performs merge when all the operands are themselves merge
+  // operation types that you would have passed to a DB::Merge() call in the
+  // same order (front() first)
+  // (i.e. DB::Merge(key, operand_list[0]), followed by
+  //  DB::Merge(key, operand_list[1]), ...)
+  //
+  // PartialMergeMulti should combine them into a single merge operation that is
+  // saved into *new_value, and then it should return true.  *new_value should
+  // be constructed such that a call to DB::Merge(key, *new_value) would yield
+  // the same result as sequential individual calls to DB::Merge(key, operand)
+  // for each operand in operand_list from front() to back().
+  //
+  // The PartialMergeMulti function will be called only when the list of
+  // operands are long enough. The minimum number of operands that will be
+  // passed to the function is specified by the "min_partial_merge_operands"
+  // option.
+  //
+  // In the default implementation, PartialMergeMulti will invoke PartialMerge
+  // multiple times, where each time it only merges two operands.  Developers
+  // should either implement PartialMergeMulti, or implement PartialMerge which
+  // serves as the helper function of the default PartialMergeMulti.
+  virtual bool PartialMergeMulti(const Slice& key,
+                                 const std::deque<Slice>& operand_list,
+                                 std::string* new_value, Logger* logger) const;
+
+  // The name of the MergeOperator.  Used to check for MergeOperator
+  // mismatches (i.e., a DB created with one MergeOperator is
+  // accessed using a different MergeOperator)
+  // TODO: the name is currently not stored persistently and thus
+  //       no checking is enforced. Client is responsible for providing
+  //       consistent MergeOperator between DB opens.
+  virtual const char* Name() const = 0;
+};
+
+// The simpler, associative merge operator.
+class AssociativeMergeOperator : public MergeOperator {
+ public:
+  virtual ~AssociativeMergeOperator() {}
+
+  // Gives the client a way to express the read -> modify -> write semantics
+  // key:           (IN) The key that's associated with this merge operation.
+  // existing_value:(IN) null indicates the key does not exist before this op
+  // value:         (IN) the value to update/merge the existing_value with
+  // new_value:    (OUT) Client is responsible for filling the merge result
+  //                     here
+  // logger:        (IN) Client could use this to log errors during merge.
+  //
+  // Return true on success.
+  // All values passed in will be client-specific values. So if this method
+  // returns false, it is because client specified bad data or there was
+  // internal corruption. The client should assume that this will be treated
+  // as an error by the library.
+  virtual bool Merge(const Slice& key,
+                     const Slice* existing_value,
+                     const Slice& value,
+                     std::string* new_value,
+                     Logger* logger) const = 0;
+
+
+ private:
+  // Default implementations of the MergeOperator functions
+  virtual bool FullMerge(const Slice& key,
+                         const Slice* existing_value,
+                         const std::deque<std::string>& operand_list,
+                         std::string* new_value,
+                         Logger* logger) const override;
+
+  virtual bool PartialMerge(const Slice& key,
+                            const Slice& left_operand,
+                            const Slice& right_operand,
+                            std::string* new_value,
+                            Logger* logger) const override;
+};
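+
+// Example (illustrative sketch): a 64-bit counter built on the associative
+// interface above. DecodeUint64()/EncodeUint64() are hypothetical helpers:
+//
+//   class UInt64AddOperator : public AssociativeMergeOperator {
+//    public:
+//     virtual bool Merge(const Slice& key, const Slice* existing_value,
+//                        const Slice& value, std::string* new_value,
+//                        Logger* logger) const override {
+//       uint64_t existing = 0;
+//       if (existing_value != nullptr &&
+//           !DecodeUint64(*existing_value, &existing)) {
+//         return false;  // corrupt existing data; treated as an error
+//       }
+//       uint64_t operand;
+//       if (!DecodeUint64(value, &operand)) {
+//         return false;
+//       }
+//       *new_value = EncodeUint64(existing + operand);
+//       return true;
+//     }
+//     virtual const char* Name() const override {
+//       return "UInt64AddOperator";
+//     }
+//   };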
+
+}  // namespace rocksdb
+
+#endif  // STORAGE_ROCKSDB_INCLUDE_MERGE_OPERATOR_H_
diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h
new file mode 100644
index 0000000000..ded76a3ab5
--- /dev/null
+++ b/include/rocksdb/options.h
@@ -0,0 +1,975 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_OPTIONS_H_
+#define STORAGE_ROCKSDB_INCLUDE_OPTIONS_H_
+
+#include <stddef.h>
+#include <string>
+#include <memory>
+#include <vector>
+#include <stdint.h>
+
+#include "rocksdb/version.h"
+#include "rocksdb/universal_compaction.h"
+
+namespace rocksdb {
+
+class Cache;
+class CompactionFilter;
+class CompactionFilterFactory;
+class CompactionFilterFactoryV2;
+class Comparator;
+class Env;
+enum InfoLogLevel : unsigned char;
+class FilterPolicy;
+class Logger;
+class MergeOperator;
+class Snapshot;
+class TableFactory;
+class MemTableRepFactory;
+class TablePropertiesCollectorFactory;
+class Slice;
+class SliceTransform;
+class Statistics;
+class InternalKeyComparator;
+
+using std::shared_ptr;
+
+// DB contents are stored in a set of blocks, each of which holds a
+// sequence of key,value pairs.  Each block may be compressed before
+// being stored in a file. The following enum describes which
+// compression method (if any) is used to compress a block.
+enum CompressionType : char {
+  // NOTE: do not change the values of existing entries, as these are
+  // part of the persistent format on disk.
+  kNoCompression = 0x0,
+  kSnappyCompression = 0x1,
+  kZlibCompression = 0x2,
+  kBZip2Compression = 0x3,
+  kLZ4Compression = 0x4,
+  kLZ4HCCompression = 0x5
+};
+
+enum CompactionStyle : char {
+  kCompactionStyleLevel = 0x0,      // level based compaction style
+  kCompactionStyleUniversal = 0x1,  // Universal compaction style
+  kCompactionStyleFIFO = 0x2,       // FIFO compaction style
+};
+
+struct CompactionOptionsFIFO {
+  // once the total sum of table files reaches this, we will delete the oldest
+  // table file
+  // Default: 1GB
+  uint64_t max_table_files_size;
+
+  CompactionOptionsFIFO() : max_table_files_size(1 * 1024 * 1024 * 1024) {}
+};
+
+// Compression options for different compression algorithms like Zlib
+struct CompressionOptions {
+  int window_bits;
+  int level;
+  int strategy;
+  CompressionOptions() : window_bits(-14), level(-1), strategy(0) {}
+  CompressionOptions(int wbits, int _lev, int _strategy)
+      : window_bits(wbits), level(_lev), strategy(_strategy) {}
+};
+
+enum UpdateStatus {     // Return status For inplace update callback
+  UPDATE_FAILED   = 0,  // Nothing to update
+  UPDATED_INPLACE = 1,  // Value updated inplace
+  UPDATED         = 2,  // No inplace update. Merged value set
+};
+
+struct Options;
+
+struct ColumnFamilyOptions {
+  // Some functions that make it easier to optimize RocksDB
+
+  // Use this if you don't need to keep the data sorted, i.e. you'll never use
+  // an iterator, only Put() and Get() API calls
+  ColumnFamilyOptions* OptimizeForPointLookup();
+
+  // Default values for some parameters in ColumnFamilyOptions are not
+  // optimized for heavy workloads and big datasets, which means you might
+  // observe write stalls under some conditions. As a starting point for tuning
+  // RocksDB options, use the following two functions:
+  // * OptimizeLevelStyleCompaction -- optimizes level style compaction
+  // * OptimizeUniversalStyleCompaction -- optimizes universal style compaction
+  // Universal style compaction is focused on reducing Write Amplification
+  // Factor for big data sets, but increases Space Amplification. You can learn
+  // more about the different styles here:
+  // https://github.com/facebook/rocksdb/wiki/Rocksdb-Architecture-Guide
+  // Make sure to also call IncreaseParallelism(), which will provide the
+  // biggest performance gains.
+  // Note: we might use more memory than memtable_memory_budget during high
+  // write rate period
+  ColumnFamilyOptions* OptimizeLevelStyleCompaction(
+      uint64_t memtable_memory_budget = 512 * 1024 * 1024);
+  ColumnFamilyOptions* OptimizeUniversalStyleCompaction(
+      uint64_t memtable_memory_budget = 512 * 1024 * 1024);
+
+  // -------------------
+  // Parameters that affect behavior
+
+  // Comparator used to define the order of keys in the table.
+  // Default: a comparator that uses lexicographic byte-wise ordering
+  //
+  // REQUIRES: The client must ensure that the comparator supplied
+  // here has the same name and orders keys *exactly* the same as the
+  // comparator provided to previous open calls on the same DB.
+  const Comparator* comparator;
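+
+  // Example (illustrative sketch): a custom comparator that inverts the
+  // default bytewise order; see rocksdb/comparator.h for the full interface.
+  // The class is hypothetical:
+  //
+  //   class ReverseBytewiseComparator : public rocksdb::Comparator {
+  //    public:
+  //     virtual int Compare(const Slice& a, const Slice& b) const {
+  //       return -a.compare(b);  // reverse of bytewise ordering
+  //     }
+  //     virtual const char* Name() const {
+  //       return "ReverseBytewiseComparator";
+  //     }
+  //     virtual void FindShortestSeparator(std::string*,
+  //                                        const Slice&) const {}
+  //     virtual void FindShortSuccessor(std::string*) const {}
+  //   };
+  //
+  //   options.comparator = &reverse_comparator_instance;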
+
+  // A single CompactionFilter instance to call into during compaction.
+  // Allows an application to modify/delete a key-value during background
+  // compaction.
+  //
+  // If the client requires a new compaction filter to be used for different
+  // compaction runs, it can specify compaction_filter_factory instead of this
+  // option. The client should specify only one of the two.
+  // compaction_filter takes precedence over compaction_filter_factory if
+  // client specifies both.
+  //
+  // If multithreaded compaction is being used, the supplied CompactionFilter
+  // instance may be used from different threads concurrently and so should be
+  // thread-safe.
+  //
+  // Default: nullptr
+  const CompactionFilter* compaction_filter;
+
+  // This is a factory that provides compaction filter objects which allow
+  // an application to modify/delete a key-value during background compaction.
+  //
+  // A new filter will be created on each compaction run. If multithreaded
+  // compaction is being used, each created CompactionFilter will only be used
+  // from a single thread and so does not need to be thread-safe.
+  //
+  // Default: a factory that doesn't provide any object
+  std::shared_ptr<CompactionFilterFactory> compaction_filter_factory;
+
+  // Version TWO of the compaction_filter_factory
+  // It supports rolling compaction
+  //
+  // Default: a factory that doesn't provide any object
+  std::shared_ptr<CompactionFilterFactoryV2> compaction_filter_factory_v2;
+
+  // -------------------
+  // Parameters that affect performance
+
+  // Amount of data to build up in memory (backed by an unsorted log
+  // on disk) before converting to a sorted on-disk file.
+  //
+  // Larger values increase performance, especially during bulk loads.
+  // Up to max_write_buffer_number write buffers may be held in memory
+  // at the same time,
+  // so you may wish to adjust this parameter to control memory usage.
+  // Also, a larger write buffer will result in a longer recovery time
+  // the next time the database is opened.
+  //
+  // Default: 4MB
+  size_t write_buffer_size;
+
+  // The maximum number of write buffers that are built up in memory.
+  // The default is 2, so that when 1 write buffer is being flushed to
+  // storage, new writes can continue to the other write buffer.
+  // Default: 2
+  int max_write_buffer_number;
+
+  // The minimum number of write buffers that will be merged together
+  // before writing to storage. If set to 1, then
+  // all write buffers are flushed to L0 as individual files and this increases
+  // read amplification because a get request has to check all of these
+  // files. Also, an in-memory merge may result in writing less
+  // data to storage if there are duplicate records in each of these
+  // individual write buffers.
+  // Default: 1
+  int min_write_buffer_number_to_merge;
+
+  // Control over blocks (user data is stored in a set of blocks, and
+  // a block is the unit of reading from disk).
+
+  // If non-NULL use the specified cache for blocks.
+  // If NULL, rocksdb will automatically create and use an 8MB internal cache.
+  // Default: nullptr
+  shared_ptr<Cache> block_cache;
+
+  // If non-NULL use the specified cache for compressed blocks.
+  // If NULL, rocksdb will not use a compressed block cache.
+  // Default: nullptr
+  shared_ptr<Cache> block_cache_compressed;
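+
+  // Illustrative sketch (not part of the original header): sharing one LRU
+  // block cache across column families or DB instances. NewLRUCache() is
+  // declared in rocksdb/cache.h.
+  //
+  //   #include "rocksdb/cache.h"
+  //   rocksdb::Options options;
+  //   options.block_cache = rocksdb::NewLRUCache(64 << 20);  // 64MB cache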
+
+  // Approximate size of user data packed per block. Note that the
+  // block size specified here corresponds to uncompressed data. The
+  // actual size of the unit read from disk may be smaller if
+  // compression is enabled. This parameter can be changed dynamically.
+  //
+  // Default: 4K
+  size_t block_size;
+
+  // Number of keys between restart points for delta encoding of keys.
+  // This parameter can be changed dynamically. Most clients should
+  // leave this parameter alone.
+  //
+  // Default: 16
+  int block_restart_interval;
+
+  // Compress blocks using the specified compression algorithm. This
+  // parameter can be changed dynamically.
+  //
+  // Default: kSnappyCompression, which gives lightweight but fast
+  // compression.
+  //
+  // Typical speeds of kSnappyCompression on an Intel(R) Core(TM)2 2.4GHz:
+  //    ~200-500MB/s compression
+  //    ~400-800MB/s decompression
+  // Note that these speeds are significantly faster than most
+  // persistent storage speeds, and therefore it is typically never
+  // worth switching to kNoCompression. Even if the input data is
+  // incompressible, the kSnappyCompression implementation will
+  // efficiently detect that and will switch to uncompressed mode.
+  CompressionType compression;
+
+  // Different levels can have different compression policies. There
+  // are cases where most lower levels would like to use a quick compression
+  // algorithm while the higher levels (which have more data) use
+  // compression algorithms that have better compression but could
+  // be slower. This array, if non-nullptr, should have an entry for
+  // each level of the database. This array, if non-nullptr, overrides the
+  // value specified in the previous field 'compression'. The caller is
+  // responsible for allocating memory and initializing the values in it
+  // before invoking Open(). The caller is responsible for freeing this
+  // array and it could be freed anytime after the return from Open().
+  // This could have been a std::vector but that makes the equivalent
+  // java/C api hard to construct.
+  std::vector<CompressionType> compression_per_level;
+
+  // different options for compression algorithms
+  CompressionOptions compression_opts;
+
+  // If non-nullptr, use the specified filter policy to reduce disk reads.
+  // Many applications will benefit from passing the result of
+  // NewBloomFilterPolicy() here.
+  //
+  // Default: nullptr
+  const FilterPolicy* filter_policy;
+
+  // If non-nullptr, use the specified function to determine the
+  // prefixes for keys. These prefixes will be placed in the filter.
+  // Depending on the workload, this can reduce the number of read-IOP
+  // cost for scans when a prefix is passed via ReadOptions to
+  // db.NewIterator(). For prefix filtering to work properly,
+  // "prefix_extractor" and "comparator" must be such that the following
+  // properties hold:
+  //
+  // 1) key.starts_with(prefix(key))
+  // 2) Compare(prefix(key), key) <= 0.
+  // 3) If Compare(k1, k2) <= 0, then Compare(prefix(k1), prefix(k2)) <= 0
+  // 4) prefix(prefix(key)) == prefix(key)
+  //
+  // Default: nullptr
+  std::shared_ptr<const SliceTransform> prefix_extractor;
+
+  // If true, place whole keys in the filter (not just prefixes).
+  // This must generally be true for gets to be efficient.
+  //
+  // Default: true
+  bool whole_key_filtering;
+
+  // Number of levels for this database
+  int num_levels;
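+
+  // Illustrative sketch (not part of the original header): enabling prefix
+  // filtering with the fixed-prefix transform declared in
+  // rocksdb/slice_transform.h (see later in this patch).
+  //
+  //   #include "rocksdb/slice_transform.h"
+  //   options.prefix_extractor.reset(rocksdb::NewFixedPrefixTransform(8));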
+
+  // Number of files to trigger level-0 compaction. A value < 0 means that
+  // level-0 compaction will not be triggered by number of files at all.
+  //
+  // Default: 4
+  int level0_file_num_compaction_trigger;
+
+  // Soft limit on number of level-0 files. We start slowing down writes at
+  // this point. A value < 0 means that no write slowdown will be triggered by
+  // number of files in level-0.
+  int level0_slowdown_writes_trigger;
+
+  // Maximum number of level-0 files. We stop writes at this point.
+  int level0_stop_writes_trigger;
+
+  // Maximum level to which a new compacted memtable is pushed if it
+  // does not create overlap. We try to push to level 2 to avoid the
+  // relatively expensive level 0=>1 compactions and to avoid some
+  // expensive manifest file operations. We do not push all the way to
+  // the largest level since that can generate a lot of wasted disk
+  // space if the same key space is being repeatedly overwritten.
+  int max_mem_compaction_level;
+
+  // Target file size for compaction.
+  // target_file_size_base is per-file size for level-1.
+  // Target file size for level L can be calculated by
+  // target_file_size_base * (target_file_size_multiplier ^ (L-1))
+  // For example, if target_file_size_base is 2MB and
+  // target_file_size_multiplier is 10, then each file on level-1 will
+  // be 2MB, and each file on level-2 will be 20MB,
+  // and each file on level-3 will be 200MB.
+
+  // by default target_file_size_base is 2MB.
+  int target_file_size_base;
+  // by default target_file_size_multiplier is 1, which means
+  // by default files in different levels will have similar size.
+  int target_file_size_multiplier;
+
+  // Control maximum total data size for a level.
+  // max_bytes_for_level_base is the max total for level-1.
+  // Maximum number of bytes for level L can be calculated as
+  // (max_bytes_for_level_base) * (max_bytes_for_level_multiplier ^ (L-1))
+  // For example, if max_bytes_for_level_base is 20MB, and if
+  // max_bytes_for_level_multiplier is 10, total data size for level-1
+  // will be 20MB, total file size for level-2 will be 200MB,
+  // and total file size for level-3 will be 2GB.
+
+  // by default 'max_bytes_for_level_base' is 10MB.
+  uint64_t max_bytes_for_level_base;
+  // by default 'max_bytes_for_level_multiplier' is 10.
+  int max_bytes_for_level_multiplier;
+
+  // Different max-size multipliers for different levels.
+  // These are multiplied by max_bytes_for_level_multiplier to arrive
+  // at the max-size of each level.
+  // Default: 1
+  std::vector<int> max_bytes_for_level_multiplier_additional;
+
+  // Maximum number of bytes in all compacted files. We avoid expanding
+  // the lower level file set of a compaction if it would make the
+  // total compaction cover more than
+  // (expanded_compaction_factor * targetFileSizeLevel()) many bytes.
+  int expanded_compaction_factor;
+
+  // Maximum number of bytes in all source files to be compacted in a
+  // single compaction run. We avoid picking too many files in the
+  // source level so that the total source bytes for the compaction
+  // do not exceed
+  // (source_compaction_factor * targetFileSizeLevel()) many bytes.
+  // Default: 1, i.e. pick one target-file-size's worth of data as the
+  // source of a compaction.
+  int source_compaction_factor;
+
+  // Control maximum bytes of overlaps in grandparent (i.e., level+2) before we
+  // stop building a single file in a level->level+1 compaction.
+  int max_grandparent_overlap_factor;
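+
+  // Worked example (illustrative, follows directly from the formulas above):
+  // with target_file_size_base = 2MB, target_file_size_multiplier = 10,
+  // max_bytes_for_level_base = 10MB and max_bytes_for_level_multiplier = 10:
+  //   level-1: files of ~2MB,   level capped at 10MB total
+  //   level-2: files of ~20MB,  level capped at 100MB total
+  //   level-3: files of ~200MB, level capped at 1GB total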
+
+  // Disable compaction triggered by seek.
+  // With bloomfilter and fast storage, a miss on one level
+  // is very cheap if the file handle is cached in table cache
+  // (which is true if max_open_files is large).
+  // Default: true
+  bool disable_seek_compaction;
+
+  // Puts are delayed 0-1 ms when any level has a compaction score that exceeds
+  // soft_rate_limit. This is ignored when == 0.0.
+  // CONSTRAINT: soft_rate_limit <= hard_rate_limit. If this constraint does
+  // not hold, RocksDB will set soft_rate_limit = hard_rate_limit
+  // Default: 0 (disabled)
+  double soft_rate_limit;
+
+  // Puts are delayed 1ms at a time when any level has a compaction score that
+  // exceeds hard_rate_limit. This is ignored when <= 1.0.
+  // Default: 0 (disabled)
+  double hard_rate_limit;
+
+  // Max time a put will be stalled when hard_rate_limit is enforced. If 0,
+  // then there is no limit.
+  // Default: 1000
+  unsigned int rate_limit_delay_max_milliseconds;
+
+  // Disable block cache. If this is set to true,
+  // then no block cache should be used, and the block_cache should
+  // point to a nullptr object.
+  // Default: false
+  bool no_block_cache;
+
+  // size of one block in arena memory allocation.
+  // If <= 0, a proper value is automatically calculated (usually 1/10 of
+  // write_buffer_size).
+  //
+  // There are two additional restrictions on the specified size:
+  // (1) size should be in the range of [4096, 2 << 30] and
+  // (2) be a multiple of the CPU word size (which helps with the memory
+  // alignment).
+  //
+  // We'll automatically check and adjust the size number to make sure it
+  // conforms to the restrictions.
+  //
+  // Default: 0
+  size_t arena_block_size;
+
+  // Disable automatic compactions. Manual compactions can still
+  // be issued on this column family
+  bool disable_auto_compactions;
+
+  // Purge duplicate/deleted keys when a memtable is flushed to storage.
+  // Default: true
+  bool purge_redundant_kvs_while_flush;
+
+  // This is used to close a block before it reaches the configured
+  // 'block_size'. If the percentage of free space in the current block is less
+  // than this specified number and adding a new record to the block will
+  // exceed the configured block size, then this block will be closed and the
+  // new record will be written to the next block.
+  // Default is 10.
+  int block_size_deviation;
+
+  // The compaction style. Default: kCompactionStyleLevel
+  CompactionStyle compaction_style;
+
+  // If true, compaction will verify checksum on every read that happens
+  // as part of compaction
+  // Default: true
+  bool verify_checksums_in_compaction;
+
+  // The options needed to support Universal Style compactions
+  CompactionOptionsUniversal compaction_options_universal;
+
+  // The options for FIFO compaction style
+  CompactionOptionsFIFO compaction_options_fifo;
+
+  // Use KeyMayExist API to filter deletes when this is true.
+  // If KeyMayExist returns false, i.e. the key definitely does not exist, then
+  // the delete is a noop. KeyMayExist only incurs in-memory look up.
+  // This optimization avoids writing the delete to storage when appropriate.
+  // Default: false
+  bool filter_deletes;
+
+  // An iterator's Next() sequentially skips over keys with the same
+  // user-key unless this option is set. This number specifies the number
+  // of keys (with the same userkey) that will be sequentially
+  // skipped before a reseek is issued.
+  // Default: 8
+  uint64_t max_sequential_skip_in_iterations;
+
+  // This is a factory that provides MemTableRep objects.
+  // Default: a factory that provides a skip-list-based implementation of
+  // MemTableRep.
+  std::shared_ptr<MemTableRepFactory> memtable_factory;
+
+  // This is a factory that provides TableFactory objects.
+  // Default: a factory that provides a default implementation of
+  // Table and TableBuilder.
+  std::shared_ptr<TableFactory> table_factory;
+
+  // This option allows the user to collect their own interested statistics of
+  // the tables.
+  // Default: empty vector -- no user-defined statistics collection will be
+  // performed.
+  typedef std::vector<std::shared_ptr<TablePropertiesCollectorFactory>>
+      TablePropertiesCollectorFactories;
+  TablePropertiesCollectorFactories table_properties_collector_factories;
+
+  // Allows thread-safe inplace updates.
+  // If inplace_callback function is not set,
+  //   Put(key, new_value) will update inplace the existing_value iff
+  //   * key exists in current memtable
+  //   * new sizeof(new_value) <= sizeof(existing_value)
+  //   * existing_value for that key is a put i.e. kTypeValue
+  // If inplace_callback function is set, check doc for inplace_callback.
+  // Default: false.
+  bool inplace_update_support;
+
+  // Number of locks used for inplace update
+  // Default: 10000, if inplace_update_support = true, else 0.
+  size_t inplace_update_num_locks;
+
+  // existing_value - pointer to previous value (from both memtable and sst).
+  //                  nullptr if key doesn't exist
+  // existing_value_size - pointer to the size of existing_value.
+  //                       nullptr if key doesn't exist
+  // delta_value - Delta value to be merged with the existing_value.
+  //               Stored in transaction logs.
+  // merged_value - Set when delta is applied on the previous value.
+
+  // Applicable only when inplace_update_support is true,
+  // this callback function is called at the time of updating the memtable
+  // as part of a Put operation, let's say Put(key, delta_value). It allows the
+  // 'delta_value' specified as part of the Put operation to be merged with
+  // an 'existing_value' of the key in the database.
+
+  // If the merged value is smaller in size than the 'existing_value',
+  // then this function can update the 'existing_value' buffer inplace and
+  // the corresponding 'existing_value'_size pointer, if it wishes to.
+  // The callback should return UpdateStatus::UPDATED_INPLACE.
+  // (In this case, the snapshot-semantics of the rocksdb
+  // Iterator is not atomic anymore).
+
+  // If the merged value is larger in size than the 'existing_value' or the
+  // application does not wish to modify the 'existing_value' buffer inplace,
+  // then the merged value should be returned via *merged_value. It is set by
+  // merging the 'existing_value' and the Put 'delta_value'. The callback
+  // should return UpdateStatus::UPDATED in this case. This merged value will
+  // be added to the memtable.
+
+  // If merging fails or the application does not wish to take any action,
+  // then the callback should return UpdateStatus::UPDATE_FAILED.
+
+  // Please remember that the original call from the application is Put(key,
+  // delta_value). So the transaction log (if enabled) will still contain (key,
+  // delta_value). The 'merged_value' is not stored in the transaction log.
+  // Hence the inplace_callback function should be consistent across db
+  // reopens.
+
+  // Default: nullptr
+  UpdateStatus (*inplace_callback)(char* existing_value,
+                                   uint32_t* existing_value_size,
+                                   Slice delta_value,
+                                   std::string* merged_value);
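+
+  // Illustrative sketch (not part of the original header): a hypothetical
+  // callback that overwrites in place when the delta fits, and otherwise
+  // asks RocksDB to store the delta as the new merged value, matching the
+  // contract described above.
+  //
+  //   rocksdb::UpdateStatus ReplaceInPlace(char* existing_value,
+  //                                        uint32_t* existing_value_size,
+  //                                        rocksdb::Slice delta_value,
+  //                                        std::string* merged_value) {
+  //     if (delta_value.size() <= *existing_value_size) {
+  //       memcpy(existing_value, delta_value.data(), delta_value.size());
+  //       *existing_value_size = static_cast<uint32_t>(delta_value.size());
+  //       return rocksdb::UPDATED_INPLACE;
+  //     }
+  //     merged_value->assign(delta_value.data(), delta_value.size());
+  //     return rocksdb::UPDATED;
+  //   }
+  //   ...
+  //   options.inplace_update_support = true;
+  //   options.inplace_callback = ReplaceInPlace;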
+
+  // if prefix_extractor is set and bloom_bits is not 0, create prefix bloom
+  // for memtable
+  uint32_t memtable_prefix_bloom_bits;
+
+  // number of hash probes per key
+  uint32_t memtable_prefix_bloom_probes;
+
+  // Page size for huge page TLB for bloom in memtable. If <= 0, do not
+  // allocate from huge page TLB but from malloc.
+  // Need to reserve huge pages for it to be allocated. For example:
+  //      sysctl -w vm.nr_hugepages=20
+  // See linux doc Documentation/vm/hugetlbpage.txt
+  size_t memtable_prefix_bloom_huge_page_tlb_size;
+
+  // Control locality of bloom filter probes to improve cache miss rate.
+  // This option only applies to memtable prefix bloom and plaintable
+  // prefix bloom. It essentially limits every bloom checking to one cache
+  // line. This optimization is turned off when set to 0; any positive number
+  // turns it on.
+  // Default: 0
+  uint32_t bloom_locality;
+
+  // Maximum number of successive merge operations on a key in the memtable.
+  //
+  // When a merge operation is added to the memtable and the maximum number of
+  // successive merges is reached, the value of the key will be calculated and
+  // inserted into the memtable instead of the merge operation. This will
+  // ensure that there are never more than max_successive_merges merge
+  // operations in the memtable.
+  //
+  // Default: 0 (disabled)
+  size_t max_successive_merges;
+
+  // The number of partial merge operands to accumulate before partial
+  // merge will be performed. Partial merge will not be called
+  // if the list of values to merge is less than min_partial_merge_operands.
+  //
+  // If min_partial_merge_operands < 2, then it will be treated as 2.
+  //
+  // Default: 2
+  uint32_t min_partial_merge_operands;
+
+  // Create ColumnFamilyOptions with default values for all fields
+  ColumnFamilyOptions();
+  // Create ColumnFamilyOptions from Options
+  explicit ColumnFamilyOptions(const Options& options);
+
+  void Dump(Logger* log) const;
+};
+
+struct DBOptions {
+  // Some functions that make it easier to optimize RocksDB
+
+  // By default, RocksDB uses only one background thread for flush and
+  // compaction. Calling this function will set it up such that a total of
+  // `total_threads` is used. A good value for `total_threads` is the number
+  // of cores. You almost definitely want to call this function if your system
+  // is bottlenecked by RocksDB.
+  DBOptions* IncreaseParallelism(int total_threads = 16);
+
+  // If true, the database will be created if it is missing.
+  // Default: false
+  bool create_if_missing;
+
+  // If true, an error is raised if the database already exists.
+  // Default: false
+  bool error_if_exists;
+
+  // If true, the implementation will do aggressive checking of the
+  // data it is processing and will stop early if it detects any
+  // errors. This may have unforeseen ramifications: for example, a
+  // corruption of one DB entry may cause a large number of entries to
+  // become unreadable or for the entire DB to become unopenable.
+  // If any of the writes to the database fails (Put, Delete, Merge, Write),
+  // the database will switch to read-only mode and fail all other
+  // Write operations.
+  // Default: true
+  bool paranoid_checks;
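+
+  // Illustrative sketch (not part of the original header): a typical
+  // DBOptions setup before opening a database (DB::Open is declared in
+  // rocksdb/db.h).
+  //
+  //   rocksdb::Options options;
+  //   options.create_if_missing = true;
+  //   options.IncreaseParallelism(8);  // 8 background threads total
+  //   rocksdb::DB* db;
+  //   rocksdb::Status s = rocksdb::DB::Open(options, "/path/to/db", &db);
+  //   assert(s.ok());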
+
+  // Use the specified object to interact with the environment,
+  // e.g. to read/write files, schedule background work, etc.
+  // Default: Env::Default()
+  Env* env;
+
+  // Any internal progress/error information generated by the db will
+  // be written to info_log if it is non-nullptr, or to a file stored
+  // in the same directory as the DB contents if info_log is nullptr.
+  // Default: nullptr
+  shared_ptr<Logger> info_log;
+
+  InfoLogLevel info_log_level;
+
+  // Number of open files that can be used by the DB. You may need to
+  // increase this if your database has a large working set. Value -1 means
+  // files opened are always kept open. You can estimate number of files based
+  // on target_file_size_base and target_file_size_multiplier for level-based
+  // compaction. For universal-style compaction, you can usually set it to -1.
+  // Default: 5000
+  int max_open_files;
+
+  // Once write-ahead logs exceed this size, we will start forcing the flush of
+  // column families whose memtables are backed by the oldest live WAL file
+  // (i.e. the ones that are causing all the space amplification). If set to 0
+  // (default), we will dynamically choose the WAL size limit to be
+  // [sum of all write_buffer_size * max_write_buffer_number] * 2
+  // Default: 0
+  uint64_t max_total_wal_size;
+
+  // If non-null, then we should collect metrics about database operations.
+  // Statistics objects should not be shared between DB instances as
+  // they do not use any locks to prevent concurrent updates.
+  shared_ptr<Statistics> statistics;
+
+  // If true, then the contents of data files are not synced
+  // to stable storage. Their contents remain in the OS buffers till the
+  // OS decides to flush them. This option is good for bulk-loading
+  // of data. Once the bulk-loading is complete, please issue a
+  // sync to the OS to flush all dirty buffers to stable storage.
+  // Default: false
+  bool disableDataSync;
+
+  // If true, then every store to stable storage will issue a fsync.
+  // If false, then every store to stable storage will issue a fdatasync.
+  // This parameter should be set to true while storing data to
+  // filesystems like ext3 that can lose files after a reboot.
+  // Default: false
+  bool use_fsync;
+
+  // This number controls how often a new scribe log about
+  // db deploy stats is written out.
+  // -1 indicates no logging at all.
+  // Default value is 1800 (half an hour).
+  int db_stats_log_interval;
+
+  // This specifies the info LOG dir.
+  // If it is empty, the log files will be in the same dir as data.
+  // If it is non empty, the log files will be in the specified dir,
+  // and the db data dir's absolute path will be used as the log file
+  // name's prefix.
+  std::string db_log_dir;
+
+  // This specifies the absolute dir path for write-ahead logs (WAL).
+  // If it is empty, the log files will be in the same dir as data;
+  // dbname is used as the data dir by default.
+  // If it is non empty, the log files will be kept in the specified dir.
+  // When destroying the db,
+  // all log files in wal_dir and the dir itself are deleted.
+  std::string wal_dir;
+
+  // The periodicity when obsolete files get deleted. The default
+  // value is 6 hours. The files that get out of scope by compaction
+  // process will still get automatically deleted on every compaction,
+  // regardless of this setting.
+  uint64_t delete_obsolete_files_period_micros;
+
+  // Maximum number of concurrent background compaction jobs, submitted to
+  // the default LOW priority thread pool.
+  // If you're increasing this, also consider increasing number of threads in
+  // LOW priority thread pool. For more information, see
+  // Env::SetBackgroundThreads
+  // Default: 1
+  int max_background_compactions;
+
+  // Maximum number of concurrent background memtable flush jobs, submitted to
+  // the HIGH priority thread pool.
+  //
+  // By default, all background jobs (major compaction and memtable flush) go
+  // to the LOW priority pool. If this option is set to a positive number,
+  // memtable flush jobs will be submitted to the HIGH priority pool.
+  // It is important when the same Env is shared by multiple db instances.
+  // Without a separate pool, long running major compaction jobs could
+  // potentially block memtable flush jobs of other db instances, leading to
+  // unnecessary Put stalls.
+  //
+  // If you're increasing this, also consider increasing number of threads in
+  // HIGH priority thread pool. For more information, see
+  // Env::SetBackgroundThreads
+  // Default: 1
+  int max_background_flushes;
+
+  // Specify the maximal size of the info log file. If the log file
+  // is larger than `max_log_file_size`, a new info log file will
+  // be created.
+  // If max_log_file_size == 0, all logs will be written to one
+  // log file.
+  size_t max_log_file_size;
+
+  // Time for the info log file to roll (in seconds).
+  // If specified with non-zero value, log file will be rolled
+  // if it has been active longer than `log_file_time_to_roll`.
+  // Default: 0 (disabled)
+  size_t log_file_time_to_roll;
+
+  // Maximal info log files to be kept.
+  // Default: 1000
+  size_t keep_log_file_num;
+
+  // manifest file is rolled over on reaching this limit.
+  // The older manifest file will be deleted.
+  // The default value is MAX_INT so that roll-over does not take place.
+  uint64_t max_manifest_file_size;
+
+  // Number of shards used for table cache.
+  int table_cache_numshardbits;
+
+  // During data eviction of table's LRU cache, it would be inefficient
+  // to strictly follow LRU because this piece of memory will not really
+  // be released unless its refcount falls to zero. Instead, make two
+  // passes: the first pass will release items with refcount = 1, and if
+  // not enough space is released after scanning the number of elements
+  // specified by this parameter, we will remove items in LRU order.
+  int table_cache_remove_scan_count_limit;
+
+  // The following two fields affect how archived logs will be deleted.
+  // 1. If both set to 0, logs will be deleted asap and will not get into
+  //    the archive.
+  // 2. If WAL_ttl_seconds is 0 and WAL_size_limit_MB is not 0,
+  //    WAL files will be checked every 10 min and if total size is greater
+  //    than WAL_size_limit_MB, they will be deleted starting with the
+  //    earliest until size_limit is met. All empty files will be deleted.
+  // 3. If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then
+  //    WAL files will be checked every WAL_ttl_seconds / 2 and those that
+  //    are older than WAL_ttl_seconds will be deleted.
+  // 4. If both are not 0, WAL files will be checked every 10 min and both
+  //    checks will be performed with ttl being first.
+  uint64_t WAL_ttl_seconds;
+  uint64_t WAL_size_limit_MB;
+
+  // Number of bytes to preallocate (via fallocate) the manifest
+  // files. Default is 4mb, which is reasonable to reduce random IO
+  // as well as prevent overallocation for mounts that preallocate
+  // large amounts of data (such as xfs's allocsize option).
+  size_t manifest_preallocation_size;
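+
+  // Illustrative sketch (not part of the original header): raising
+  // compaction/flush parallelism together with the Env thread pools, as the
+  // comments above suggest (Env::SetBackgroundThreads and the LOW/HIGH
+  // priorities are declared in rocksdb/env.h).
+  //
+  //   options.max_background_compactions = 4;
+  //   options.max_background_flushes = 2;
+  //   options.env->SetBackgroundThreads(4, rocksdb::Env::LOW);
+  //   options.env->SetBackgroundThreads(2, rocksdb::Env::HIGH);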
+
+  // Data being read from file storage may be buffered in the OS
+  // Default: true
+  bool allow_os_buffer;
+
+  // Allow the OS to mmap file for reading sst tables. Default: false
+  bool allow_mmap_reads;
+
+  // Allow the OS to mmap file for writing. Default: false
+  bool allow_mmap_writes;
+
+  // Disable child processes from inheriting open files. Default: true
+  bool is_fd_close_on_exec;
+
+  // Skip log corruption error on recovery (If client is ok with
+  // losing most recent changes)
+  // Default: false
+  bool skip_log_error_on_recovery;
+
+  // if not zero, dump rocksdb.stats to LOG every stats_dump_period_sec seconds
+  // Default: 3600 (1 hour)
+  unsigned int stats_dump_period_sec;
+
+  // If set to true, will hint the underlying file system that the file
+  // access pattern is random, when an sst file is opened.
+  // Default: true
+  bool advise_random_on_open;
+
+  // Specify the file access pattern once a compaction is started.
+  // It will be applied to all input files of a compaction.
+  // Default: NORMAL
+  enum {
+    NONE,
+    NORMAL,
+    SEQUENTIAL,
+    WILLNEED
+  } access_hint_on_compaction_start;
+
+  // Use adaptive mutex, which spins in the user space before resorting
+  // to kernel. This could reduce context switch when the mutex is not
+  // heavily contended. However, if the mutex is hot, we could end up
+  // wasting spin time.
+  // Default: false
+  bool use_adaptive_mutex;
+
+  // Allows OS to incrementally sync files to disk while they are being
+  // written, asynchronously, in the background.
+  // Issue one request for every bytes_per_sync written. 0 turns it off.
+  // Default: 0
+  uint64_t bytes_per_sync;
+
+  // Allow RocksDB to use thread local storage to optimize performance.
+  // Default: true
+  bool allow_thread_local;
+
+  // Create DBOptions with default values for all fields
+  DBOptions();
+  // Create DBOptions from Options
+  explicit DBOptions(const Options& options);
+
+  void Dump(Logger* log) const;
+};
+
+// Options to control the behavior of a database (passed to DB::Open)
+struct Options : public DBOptions, public ColumnFamilyOptions {
+  // Create an Options object with default values for all fields.
+  Options() :
+    DBOptions(),
+    ColumnFamilyOptions() {}
+
+  Options(const DBOptions& db_options,
+          const ColumnFamilyOptions& column_family_options)
+      : DBOptions(db_options), ColumnFamilyOptions(column_family_options) {}
+
+  void Dump(Logger* log) const;
+
+  // Set appropriate parameters for bulk loading.
+  // The reason that this is a function that returns "this" instead of a
+  // constructor is to enable chaining of multiple similar calls in the future.
+  //
+  // All data will be in level 0 without any automatic compaction.
+  // It's recommended to manually call CompactRange(NULL, NULL) before reading
+  // from the database, because otherwise the read can be very slow.
+  Options* PrepareForBulkLoad();
+};
+
+//
+// An application can issue a read request (via Get/Iterators) and specify
+// if that read should process data that ALREADY resides on a specified cache
+// level. For example, if an application specifies kBlockCacheTier then the
+// Get call will process data that is already processed in the memtable or
+// the block cache. It will not page in data from the OS cache or data that
+// resides in storage.
+enum ReadTier {
+  kReadAllTier = 0x0,    // data in memtable, block cache, OS cache or storage
+  kBlockCacheTier = 0x1  // data in memtable or block cache
+};
+
+// Options that control read operations
+struct ReadOptions {
+  // If true, all data read from underlying storage will be
+  // verified against corresponding checksums.
+  // Default: true
+  bool verify_checksums;
+
+  // Should the "data block"/"index block"/"filter block" read for this
+  // iteration be cached in memory?
+  // Callers may wish to set this field to false for bulk scans.
+  // Default: true
+  bool fill_cache;
+
+  // If this option is set and memtable implementation allows, Seek
+  // might only return keys with the same prefix as the seek-key
+  //
+  // ! DEPRECATED: prefix_seek is on by default when prefix_extractor
+  // is configured
+  // bool prefix_seek;
+
+  // If "snapshot" is non-nullptr, read as of the supplied snapshot
+  // (which must belong to the DB that is being read and which must
+  // not have been released). If "snapshot" is nullptr, use an implicit
+  // snapshot of the state at the beginning of this read operation.
+  // Default: nullptr
+  const Snapshot* snapshot;
+
+  // If "prefix" is non-nullptr, and ReadOptions is being passed to
+  // db.NewIterator, only return results when the key begins with this
+  // prefix. This field is ignored by other calls (e.g., Get).
+  // Options.prefix_extractor must also be set, and
+  // prefix_extractor.InRange(prefix) must be true. The iterator
+  // returned by NewIterator when this option is set will behave just
+  // as if the underlying store did not contain any non-matching keys,
+  // with two exceptions. Seek() only accepts keys starting with the
+  // prefix, and SeekToLast() is not supported. prefix filter with this
+  // option will sometimes reduce the number of read IOPs.
+  // Default: nullptr
+  //
+  // ! DEPRECATED
+  // const Slice* prefix;
+
+  // Specify if this read request should process data that ALREADY
+  // resides on a particular cache. If the required data is not
+  // found at the specified cache, then Status::Incomplete is returned.
+  // Default: kReadAllTier
+  ReadTier read_tier;
+
+  // Specify to create a tailing iterator -- a special iterator that has a
+  // view of the complete database (i.e. it can also be used to read newly
+  // added data) and is optimized for sequential reads. It will return records
+  // that were inserted into the database after the creation of the iterator.
+  // Default: false
+  // Not supported in ROCKSDB_LITE mode!
+  bool tailing;
+
+  ReadOptions()
+      : verify_checksums(true),
+        fill_cache(true),
+        snapshot(nullptr),
+        read_tier(kReadAllTier),
+        tailing(false) {}
+  ReadOptions(bool cksum, bool cache)
+      : verify_checksums(cksum),
+        fill_cache(cache),
+        snapshot(nullptr),
+        read_tier(kReadAllTier),
+        tailing(false) {}
+};
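+
+// Illustrative sketch (not part of the original header): reading from a
+// consistent snapshot, restricted to data already in memory
+// (GetSnapshot/ReleaseSnapshot are declared in rocksdb/db.h).
+//
+//   const rocksdb::Snapshot* snap = db->GetSnapshot();
+//   rocksdb::ReadOptions ro;
+//   ro.snapshot = snap;
+//   ro.read_tier = rocksdb::kBlockCacheTier;  // may return Status::Incomplete
+//   std::string value;
+//   rocksdb::Status s = db->Get(ro, "key", &value);
+//   db->ReleaseSnapshot(snap);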
+
+// Options that control write operations
+struct WriteOptions {
+  // If true, the write will be flushed from the operating system
+  // buffer cache (by calling WritableFile::Sync()) before the write
+  // is considered complete. If this flag is true, writes will be
+  // slower.
+  //
+  // If this flag is false, and the machine crashes, some recent
+  // writes may be lost. Note that if it is just the process that
+  // crashes (i.e., the machine does not reboot), no writes will be
+  // lost even if sync==false.
+  //
+  // In other words, a DB write with sync==false has similar
+  // crash semantics as the "write()" system call. A DB write
+  // with sync==true has similar crash semantics to a "write()"
+  // system call followed by "fdatasync()".
+  //
+  // Default: false
+  bool sync;
+
+  // If true, writes will not first go to the write ahead log,
+  // and the write may get lost after a crash.
+  bool disableWAL;
+
+  WriteOptions() : sync(false), disableWAL(false) {}
+};
+
+// Options that control flush operations
+struct FlushOptions {
+  // If true, the flush call will wait until the flush is done.
+  // Default: true
+  bool wait;
+
+  FlushOptions() : wait(true) {}
+};
+
+}  // namespace rocksdb
+
+#endif  // STORAGE_ROCKSDB_INCLUDE_OPTIONS_H_
diff --git a/include/rocksdb/perf_context.h b/include/rocksdb/perf_context.h
new file mode 100644
index 0000000000..0704ea2108
--- /dev/null
+++ b/include/rocksdb/perf_context.h
@@ -0,0 +1,75 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_PERF_CONTEXT_H
+#define STORAGE_ROCKSDB_INCLUDE_PERF_CONTEXT_H
+
+#include <stdint.h>
+#include <string>
+
+namespace rocksdb {
+
+enum PerfLevel {
+  kDisable     = 0,  // disable perf stats
+  kEnableCount = 1,  // enable only count stats
+  kEnableTime  = 2   // enable time stats too
+};
+
+// set the perf stats level
+void SetPerfLevel(PerfLevel level);
+
+// A thread local context for gathering performance counters efficiently
+// and transparently.
+
+struct PerfContext {
+
+  void Reset();  // reset all performance counters to zero
+
+  std::string ToString() const;
+
+  uint64_t user_key_comparison_count;  // total number of user key comparisons
+  uint64_t block_cache_hit_count;      // total number of block cache hits
+  uint64_t block_read_count;           // total number of block reads (with IO)
+  uint64_t block_read_byte;            // total number of bytes from block reads
+  uint64_t block_read_time;            // total time spent on block reads
+  uint64_t block_checksum_time;        // total time spent on block checksum
+  uint64_t block_decompress_time;      // total time spent on block decompression
+  // total number of internal keys skipped over during iteration (overwritten
+  // or deleted, to be more specific, hidden by a put or delete of the same key)
+  uint64_t internal_key_skipped_count;
+  // total number of deletes skipped over during iteration
+  uint64_t internal_delete_skipped_count;
+
+  uint64_t get_snapshot_time;        // total time spent on getting snapshot
+  uint64_t get_from_memtable_time;   // total time spent on querying memtables
+  uint64_t get_from_memtable_count;  // number of mem tables queried
+  // total time spent after Get() finds a key
+  uint64_t get_post_process_time;
+  uint64_t get_from_output_files_time;  // total time reading from output files
+  // total time spent on seeking child iters
+  uint64_t seek_child_seek_time;
+  // number of seek issued in child iterators
+  uint64_t seek_child_seek_count;
+  uint64_t seek_min_heap_time;  // total time spent on the merge heap
+  // total time spent on seeking the internal entries
+  uint64_t seek_internal_seek_time;
+  // total time spent on iterating internal entries to find the next user entry
+  uint64_t find_next_user_entry_time;
+  // total time spent on pre or post processing when writing a record
+  uint64_t write_pre_and_post_process_time;
+  uint64_t write_wal_time;  // total time spent on writing to WAL
+  // total time spent on writing to mem tables
+  uint64_t write_memtable_time;
+};
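+
+// Illustrative sketch (not part of the original header): measuring a single
+// Get() with the thread-local perf context declared below.
+//
+//   rocksdb::SetPerfLevel(rocksdb::kEnableTime);
+//   rocksdb::perf_context.Reset();
+//   std::string value;
+//   db->Get(rocksdb::ReadOptions(), "key", &value);
+//   std::string report = rocksdb::perf_context.ToString();
+//   rocksdb::SetPerfLevel(rocksdb::kDisable);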
+
+#if defined(NPERF_CONTEXT) || defined(IOS_CROSS_COMPILE)
+extern PerfContext perf_context;
+#else
+extern __thread PerfContext perf_context;
+#endif
+
+}
+
+#endif
diff --git a/include/rocksdb/slice.h b/include/rocksdb/slice.h
new file mode 100644
index 0000000000..2253715714
--- /dev/null
+++ b/include/rocksdb/slice.h
@@ -0,0 +1,136 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Slice is a simple structure containing a pointer into some external
+// storage and a size. The user of a Slice must ensure that the slice
+// is not used after the corresponding external storage has been
+// deallocated.
+//
+// Multiple threads can invoke const methods on a Slice without
+// external synchronization, but if any of the threads may call a
+// non-const method, all threads accessing the same Slice must use
+// external synchronization.
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_SLICE_H_
+#define STORAGE_ROCKSDB_INCLUDE_SLICE_H_
+
+#include <assert.h>
+#include <stddef.h>
+#include <string.h>
+#include <string>
+
+namespace rocksdb {
+
+class Slice {
+ public:
+  // Create an empty slice.
+  Slice() : data_(""), size_(0) { }
+
+  // Create a slice that refers to d[0,n-1].
+  Slice(const char* d, size_t n) : data_(d), size_(n) { }
+
+  // Create a slice that refers to the contents of "s"
+  /* implicit */
+  Slice(const std::string& s) : data_(s.data()), size_(s.size()) { }
+
+  // Create a slice that refers to s[0,strlen(s)-1]
+  /* implicit */
+  Slice(const char* s) : data_(s), size_(strlen(s)) { }
+
+  // Return a pointer to the beginning of the referenced data
+  const char* data() const { return data_; }
+
+  // Return the length (in bytes) of the referenced data
+  size_t size() const { return size_; }
+
+  // Return true iff the length of the referenced data is zero
+  bool empty() const { return size_ == 0; }
+
+  // Return the ith byte in the referenced data.
+  // REQUIRES: n < size()
+  char operator[](size_t n) const {
+    assert(n < size());
+    return data_[n];
+  }
+
+  // Change this slice to refer to an empty array
+  void clear() { data_ = ""; size_ = 0; }
+
+  // Drop the first "n" bytes from this slice.
+  void remove_prefix(size_t n) {
+    assert(n <= size());
+    data_ += n;
+    size_ -= n;
+  }
+
+  // Return a string that contains a copy of the referenced data.
+  std::string ToString(bool hex = false) const {
+    if (hex) {
+      std::string result;
+      char buf[10];
+      for (size_t i = 0; i < size_; i++) {
+        snprintf(buf, 10, "%02X", (unsigned char)data_[i]);
+        result += buf;
+      }
+      return result;
+    } else {
+      return std::string(data_, size_);
+    }
+  }
+
+  // Three-way comparison. Returns value:
+  //   <  0 iff "*this" <  "b",
+  //   == 0 iff "*this" == "b",
+  //   >  0 iff "*this" >  "b"
+  int compare(const Slice& b) const;
+
+  // Return true iff "x" is a prefix of "*this"
+  bool starts_with(const Slice& x) const {
+    return ((size_ >= x.size_) &&
+            (memcmp(data_, x.data_, x.size_) == 0));
+  }
+
+  // private: make these public for rocksdbjni access
+  const char* data_;
+  size_t size_;
+
+  // Intentionally copyable
+};
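+
+// Illustrative sketch (not part of the original header): Slice points into
+// caller-owned memory and never copies until ToString() is called.
+//
+//   std::string key = "user:1234";
+//   rocksdb::Slice s(key);         // refers to key's buffer
+//   s.remove_prefix(5);            // now refers to "1234"
+//   assert(s.ToString() == "1234");
+//   assert(s.ToString(true) == "31323334");  // hex form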
+
+// A set of Slices that are virtually concatenated together. 'parts' points
+// to an array of Slices. The number of elements in the array is 'num_parts'.
+struct SliceParts {
+  SliceParts(const Slice* _parts, int _num_parts) :
+      parts(_parts), num_parts(_num_parts) { }
+
+  const Slice* parts;
+  int num_parts;
+};
+
+inline bool operator==(const Slice& x, const Slice& y) {
+  return ((x.size() == y.size()) &&
+          (memcmp(x.data(), y.data(), x.size()) == 0));
+}
+
+inline bool operator!=(const Slice& x, const Slice& y) {
+  return !(x == y);
+}
+
+inline int Slice::compare(const Slice& b) const {
+  const int min_len = (size_ < b.size_) ? size_ : b.size_;
+  int r = memcmp(data_, b.data_, min_len);
+  if (r == 0) {
+    if (size_ < b.size_) r = -1;
+    else if (size_ > b.size_) r = +1;
+  }
+  return r;
+}
+
+}  // namespace rocksdb
+
+#endif  // STORAGE_ROCKSDB_INCLUDE_SLICE_H_
diff --git a/include/rocksdb/slice_transform.h b/include/rocksdb/slice_transform.h
new file mode 100644
index 0000000000..a78455001a
--- /dev/null
+++ b/include/rocksdb/slice_transform.h
@@ -0,0 +1,47 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Class for specifying user-defined functions which perform a
+// transformation on a slice. It is not required that every slice
+// belong to the domain and/or range of a function. Subclasses should
+// define InDomain and InRange to determine which slices are in either
+// of these sets respectively.
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_SLICE_TRANSFORM_H_
+#define STORAGE_ROCKSDB_INCLUDE_SLICE_TRANSFORM_H_
+
+#include <string>
+
+namespace rocksdb {
+
+class Slice;
+
+class SliceTransform {
+ public:
+  virtual ~SliceTransform() {};
+
+  // Return the name of this transformation.
+  virtual const char* Name() const = 0;
+
+  // transform a src in the domain to a dst in the range
+  virtual Slice Transform(const Slice& src) const = 0;
+
+  // determine whether this is a valid src that the function can be applied to
+  virtual bool InDomain(const Slice& src) const = 0;
+
+  // determine whether dst=Transform(src) for some src
+  virtual bool InRange(const Slice& dst) const = 0;
+};
+
+extern const SliceTransform* NewFixedPrefixTransform(size_t prefix_len);
+
+extern const SliceTransform* NewNoopTransform();
+
+}
+
+#endif  // STORAGE_ROCKSDB_INCLUDE_SLICE_TRANSFORM_H_
diff --git a/include/rocksdb/statistics.h b/include/rocksdb/statistics.h
new file mode 100644
index 0000000000..dcd82f6637
--- /dev/null
+++ b/include/rocksdb/statistics.h
@@ -0,0 +1,268 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_STATISTICS_H_
+#define STORAGE_ROCKSDB_INCLUDE_STATISTICS_H_
+
+#include <atomic>
+#include <cstddef>
+#include <cstdint>
+#include <string>
+#include <memory>
+#include <vector>
+
+namespace rocksdb {
+
+/**
+ * Keep adding tickers here.
+ *  1. Any ticker should be added before TICKER_ENUM_MAX.
+ *  2. Add a readable string in TickersNameMap below for the newly added ticker.
+ */
+enum Tickers {
+  // total block cache misses
+  // REQUIRES: BLOCK_CACHE_MISS == BLOCK_CACHE_INDEX_MISS +
+  //                               BLOCK_CACHE_FILTER_MISS +
+  //                               BLOCK_CACHE_DATA_MISS;
+  BLOCK_CACHE_MISS,
+  // total block cache hit
+  // REQUIRES: BLOCK_CACHE_HIT == BLOCK_CACHE_INDEX_HIT +
+  //                              BLOCK_CACHE_FILTER_HIT +
+  //                              BLOCK_CACHE_DATA_HIT;
+  BLOCK_CACHE_HIT,
+  // # of blocks added to block cache.
+  BLOCK_CACHE_ADD,
+  // # of times cache miss when accessing index block from block cache.
+  BLOCK_CACHE_INDEX_MISS,
+  // # of times cache hit when accessing index block from block cache.
+  BLOCK_CACHE_INDEX_HIT,
+  // # of times cache miss when accessing filter block from block cache.
+  BLOCK_CACHE_FILTER_MISS,
+  // # of times cache hit when accessing filter block from block cache.
+  BLOCK_CACHE_FILTER_HIT,
+  // # of times cache miss when accessing data block from block cache.
+  BLOCK_CACHE_DATA_MISS,
+  // # of times cache hit when accessing data block from block cache.
+  BLOCK_CACHE_DATA_HIT,
+  // # of times bloom filter has avoided file reads.
+  BLOOM_FILTER_USEFUL,
+
+  // # of memtable hits.
+  MEMTABLE_HIT,
+  // # of memtable misses.
+  MEMTABLE_MISS,
+
+  /**
+   * COMPACTION_KEY_DROP_* count the reasons for key drop during compaction
+   * There are 3 reasons currently.
+   */
+  COMPACTION_KEY_DROP_NEWER_ENTRY,  // key was written with a newer value.
+  COMPACTION_KEY_DROP_OBSOLETE,     // The key is obsolete.
+  COMPACTION_KEY_DROP_USER,  // user compaction function has dropped the key.
+
+  // Number of keys written to the database via the Put and Write calls
+  NUMBER_KEYS_WRITTEN,
+  // Number of keys read
+  NUMBER_KEYS_READ,
+  // Number of keys updated, if inplace update is enabled
+  NUMBER_KEYS_UPDATED,
+  // Bytes written / read
+  BYTES_WRITTEN,
+  BYTES_READ,
+  NO_FILE_CLOSES,
+  NO_FILE_OPENS,
+  NO_FILE_ERRORS,
+  // Time system had to wait to do L0-L1 compactions
+  STALL_L0_SLOWDOWN_MICROS,
+  // Time system had to wait to move memtable to L1.
+  STALL_MEMTABLE_COMPACTION_MICROS,
+  // write throttle because of too many files in L0
+  STALL_L0_NUM_FILES_MICROS,
+  RATE_LIMIT_DELAY_MILLIS,
+  NO_ITERATORS,  // number of iterators currently open
+
+  // Number of MultiGet calls, keys read, and bytes read
+  NUMBER_MULTIGET_CALLS,
+  NUMBER_MULTIGET_KEYS_READ,
+  NUMBER_MULTIGET_BYTES_READ,
+
+  // Number of delete records that were not required to be
+  // written to storage because the key does not exist
+  NUMBER_FILTERED_DELETES,
+  NUMBER_MERGE_FAILURES,
+  SEQUENCE_NUMBER,
+
+  // number of times bloom was checked before creating iterator on a
+  // file, and the number of times the check was useful in avoiding
+  // iterator creation (and thus likely IOPs).
+  BLOOM_FILTER_PREFIX_CHECKED,
+  BLOOM_FILTER_PREFIX_USEFUL,
+
+  // Number of times we had to reseek inside an iteration to skip
+  // over large number of keys with same userkey.
+  NUMBER_OF_RESEEKS_IN_ITERATION,
+
+  // Record the number of calls to GetUpdatesSince. Useful to keep track of
+  // transaction log iterator refreshes
+  GET_UPDATES_SINCE_CALLS,
+  BLOCK_CACHE_COMPRESSED_MISS,  // miss in the compressed block cache
+  BLOCK_CACHE_COMPRESSED_HIT,   // hit in the compressed block cache
+  WAL_FILE_SYNCED,              // Number of times WAL sync is done
+  WAL_FILE_BYTES,               // Number of bytes written to WAL
+
+  // Writes can be processed by requesting thread or by the thread at the
+  // head of the writers queue.
+  WRITE_DONE_BY_SELF,
+  WRITE_DONE_BY_OTHER,
+  WRITE_WITH_WAL,       // Number of Write calls that request WAL
+  COMPACT_READ_BYTES,   // Bytes read during compaction
+  COMPACT_WRITE_BYTES,  // Bytes written during compaction
+
+  // Number of table's properties loaded directly from file, without creating
+  // table reader object.
+  NUMBER_DIRECT_LOAD_TABLE_PROPERTIES,
+  NUMBER_SUPERVERSION_ACQUIRES,
+  NUMBER_SUPERVERSION_RELEASES,
+  NUMBER_SUPERVERSION_CLEANUPS,
+  TICKER_ENUM_MAX
+};
+
+// The order of items listed in Tickers should be the same as
+// the order listed in TickersNameMap
+const std::vector<std::pair<Tickers, std::string>> TickersNameMap = {
+    {BLOCK_CACHE_MISS, "rocksdb.block.cache.miss"},
+    {BLOCK_CACHE_HIT, "rocksdb.block.cache.hit"},
+    {BLOCK_CACHE_ADD, "rocksdb.block.cache.add"},
+    {BLOCK_CACHE_INDEX_MISS, "rocksdb.block.cache.index.miss"},
+    {BLOCK_CACHE_INDEX_HIT, "rocksdb.block.cache.index.hit"},
+    {BLOCK_CACHE_FILTER_MISS, "rocksdb.block.cache.filter.miss"},
+    {BLOCK_CACHE_FILTER_HIT, "rocksdb.block.cache.filter.hit"},
+    {BLOCK_CACHE_DATA_MISS, "rocksdb.block.cache.data.miss"},
+    {BLOCK_CACHE_DATA_HIT, "rocksdb.block.cache.data.hit"},
+    {BLOOM_FILTER_USEFUL, "rocksdb.bloom.filter.useful"},
+    {MEMTABLE_HIT, "rocksdb.memtable.hit"},
+    {MEMTABLE_MISS, "rocksdb.memtable.miss"},
+    {COMPACTION_KEY_DROP_NEWER_ENTRY, "rocksdb.compaction.key.drop.new"},
+    {COMPACTION_KEY_DROP_OBSOLETE, "rocksdb.compaction.key.drop.obsolete"},
+    {COMPACTION_KEY_DROP_USER, "rocksdb.compaction.key.drop.user"},
+    {NUMBER_KEYS_WRITTEN, "rocksdb.number.keys.written"},
+    {NUMBER_KEYS_READ, "rocksdb.number.keys.read"},
+    {NUMBER_KEYS_UPDATED, "rocksdb.number.keys.updated"},
+    {BYTES_WRITTEN, "rocksdb.bytes.written"},
+    {BYTES_READ, "rocksdb.bytes.read"},
+    {NO_FILE_CLOSES, "rocksdb.no.file.closes"},
+    {NO_FILE_OPENS, "rocksdb.no.file.opens"},
+    {NO_FILE_ERRORS, "rocksdb.no.file.errors"},
+    {STALL_L0_SLOWDOWN_MICROS, "rocksdb.l0.slowdown.micros"},
+    {STALL_MEMTABLE_COMPACTION_MICROS, "rocksdb.memtable.compaction.micros"},
+    {STALL_L0_NUM_FILES_MICROS, "rocksdb.l0.num.files.stall.micros"},
+    {RATE_LIMIT_DELAY_MILLIS, "rocksdb.rate.limit.delay.millis"},
+    {NO_ITERATORS, "rocksdb.num.iterators"},
+    {NUMBER_MULTIGET_CALLS, "rocksdb.number.multiget.get"},
+    {NUMBER_MULTIGET_KEYS_READ, "rocksdb.number.multiget.keys.read"},
+    {NUMBER_MULTIGET_BYTES_READ, "rocksdb.number.multiget.bytes.read"},
+    {NUMBER_FILTERED_DELETES, "rocksdb.number.deletes.filtered"},
+    {NUMBER_MERGE_FAILURES, "rocksdb.number.merge.failures"},
+    {SEQUENCE_NUMBER, "rocksdb.sequence.number"},
+    {BLOOM_FILTER_PREFIX_CHECKED, "rocksdb.bloom.filter.prefix.checked"},
+    {BLOOM_FILTER_PREFIX_USEFUL, "rocksdb.bloom.filter.prefix.useful"},
+    {NUMBER_OF_RESEEKS_IN_ITERATION, "rocksdb.number.reseeks.iteration"},
+    {GET_UPDATES_SINCE_CALLS, "rocksdb.getupdatessince.calls"},
+    {BLOCK_CACHE_COMPRESSED_MISS, "rocksdb.block.cachecompressed.miss"},
+    {BLOCK_CACHE_COMPRESSED_HIT, "rocksdb.block.cachecompressed.hit"},
+    {WAL_FILE_SYNCED, "rocksdb.wal.synced"},
+    {WAL_FILE_BYTES, "rocksdb.wal.bytes"},
+    {WRITE_DONE_BY_SELF, "rocksdb.write.self"},
+    {WRITE_DONE_BY_OTHER, "rocksdb.write.other"},
+    {WRITE_WITH_WAL, "rocksdb.write.wal"},
+    {COMPACT_READ_BYTES, "rocksdb.compact.read.bytes"},
+    {COMPACT_WRITE_BYTES, "rocksdb.compact.write.bytes"},
+    {NUMBER_DIRECT_LOAD_TABLE_PROPERTIES,
+     "rocksdb.number.direct.load.table.properties"},
+    {NUMBER_SUPERVERSION_ACQUIRES, "rocksdb.number.superversion_acquires"},
+    {NUMBER_SUPERVERSION_RELEASES, "rocksdb.number.superversion_releases"},
+    {NUMBER_SUPERVERSION_CLEANUPS, "rocksdb.number.superversion_cleanups"},
+};
+
+/**
+ * Keep adding histograms here.
+ * Any histogram should have a value less than HISTOGRAM_ENUM_MAX
+ * Add a new Histogram by assigning it the current value of HISTOGRAM_ENUM_MAX
+ * Add a string representation in HistogramsNameMap below
+ * And increment HISTOGRAM_ENUM_MAX
+ */
+enum Histograms {
+  DB_GET,
+  DB_WRITE,
+  COMPACTION_TIME,
+  TABLE_SYNC_MICROS,
+  COMPACTION_OUTFILE_SYNC_MICROS,
+  WAL_FILE_SYNC_MICROS,
+  MANIFEST_FILE_SYNC_MICROS,
+  // TIME SPENT IN IO DURING TABLE OPEN
+  TABLE_OPEN_IO_MICROS,
+  DB_MULTIGET,
+  READ_BLOCK_COMPACTION_MICROS,
+  READ_BLOCK_GET_MICROS,
+  WRITE_RAW_BLOCK_MICROS,
+
+  STALL_L0_SLOWDOWN_COUNT,
+  STALL_MEMTABLE_COMPACTION_COUNT,
+  STALL_L0_NUM_FILES_COUNT,
+  HARD_RATE_LIMIT_DELAY_COUNT,
+  SOFT_RATE_LIMIT_DELAY_COUNT,
+  NUM_FILES_IN_SINGLE_COMPACTION,
+  HISTOGRAM_ENUM_MAX,
+};
+
+const std::vector<std::pair<Histograms, std::string>> HistogramsNameMap = {
+  { DB_GET, "rocksdb.db.get.micros" },
+  { DB_WRITE, "rocksdb.db.write.micros" },
+  { COMPACTION_TIME, "rocksdb.compaction.times.micros" },
+  { TABLE_SYNC_MICROS, "rocksdb.table.sync.micros" },
+  { COMPACTION_OUTFILE_SYNC_MICROS, "rocksdb.compaction.outfile.sync.micros" },
+  { WAL_FILE_SYNC_MICROS, "rocksdb.wal.file.sync.micros" },
+  { MANIFEST_FILE_SYNC_MICROS, "rocksdb.manifest.file.sync.micros" },
+  { TABLE_OPEN_IO_MICROS, "rocksdb.table.open.io.micros" },
+  { DB_MULTIGET, "rocksdb.db.multiget.micros" },
+  { READ_BLOCK_COMPACTION_MICROS, "rocksdb.read.block.compaction.micros" },
+  { READ_BLOCK_GET_MICROS, "rocksdb.read.block.get.micros" },
+  { WRITE_RAW_BLOCK_MICROS, "rocksdb.write.raw.block.micros" },
+  { STALL_L0_SLOWDOWN_COUNT, "rocksdb.l0.slowdown.count"},
+  { STALL_MEMTABLE_COMPACTION_COUNT, "rocksdb.memtable.compaction.count"},
+  { STALL_L0_NUM_FILES_COUNT, "rocksdb.num.files.stall.count"},
+  { HARD_RATE_LIMIT_DELAY_COUNT, "rocksdb.hard.rate.limit.delay.count"},
+  { SOFT_RATE_LIMIT_DELAY_COUNT, "rocksdb.soft.rate.limit.delay.count"},
+  { NUM_FILES_IN_SINGLE_COMPACTION, "rocksdb.numfiles.in.singlecompaction" },
+};
+
+struct HistogramData {
+  double median;
+  double percentile95;
+  double percentile99;
+  double average;
+  double standard_deviation;
+};
+
+// Analyze the performance of a db
+class Statistics {
+ public:
+  virtual ~Statistics() {}
+
+  virtual long getTickerCount(Tickers tickerType) = 0;
+  virtual void recordTick(Tickers tickerType, uint64_t count = 0) = 0;
+  virtual void setTickerCount(Tickers tickerType, uint64_t count) = 0;
+  virtual void measureTime(Histograms histogramType, uint64_t time) = 0;
+
+  virtual void histogramData(Histograms type, HistogramData* const data) = 0;
+  // String representation of the statistic object.
+  std::string ToString();
+};
+
+// Create a concrete DBStatistics object
+std::shared_ptr<Statistics> CreateDBStatistics();
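+
+// Illustrative sketch (not part of the original header): attaching a
+// statistics object to the options and reading a ticker afterwards.
+//
+//   options.statistics = rocksdb::CreateDBStatistics();
+//   // ... run the workload ...
+//   long misses = options.statistics->getTickerCount(rocksdb::BLOCK_CACHE_MISS);
+//   std::string report = options.statistics->ToString();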
+
+} // namespace rocksdb
+
+#endif // STORAGE_ROCKSDB_INCLUDE_STATISTICS_H_
diff --git a/include/rocksdb/status.h b/include/rocksdb/status.h
new file mode 100644
index 0000000000..0298a28380
--- /dev/null
+++ b/include/rocksdb/status.h
@@ -0,0 +1,145 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// A Status encapsulates the result of an operation. It may indicate success,
+// or it may indicate an error with an associated error message.
+//
+// Multiple threads can invoke const methods on a Status without
+// external synchronization, but if any of the threads may call a
+// non-const method, all threads accessing the same Status must use
+// external synchronization.
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_STATUS_H_
+#define STORAGE_ROCKSDB_INCLUDE_STATUS_H_
+
+#include <string>
+#include "rocksdb/slice.h"
+
+namespace rocksdb {
+
+class Status {
+ public:
+  // Create a success status.
+  Status() : code_(kOk), state_(nullptr) { }
+  ~Status() { delete[] state_; }
+
+  // Copy the specified status.
+  Status(const Status& s);
+  void operator=(const Status& s);
+
+  // Return a success status.
+  static Status OK() { return Status(); }
+
+  // Return error status of an appropriate type.
+  static Status NotFound(const Slice& msg, const Slice& msg2 = Slice()) {
+    return Status(kNotFound, msg, msg2);
+  }
+  // Fast path for not found without malloc.
+  static Status NotFound() {
+    return Status(kNotFound);
+  }
+  static Status Corruption(const Slice& msg, const Slice& msg2 = Slice()) {
+    return Status(kCorruption, msg, msg2);
+  }
+  static Status NotSupported(const Slice& msg, const Slice& msg2 = Slice()) {
+    return Status(kNotSupported, msg, msg2);
+  }
+  static Status InvalidArgument(const Slice& msg, const Slice& msg2 = Slice()) {
+    return Status(kInvalidArgument, msg, msg2);
+  }
+  static Status IOError(const Slice& msg, const Slice& msg2 = Slice()) {
+    return Status(kIOError, msg, msg2);
+  }
+  static Status MergeInProgress(const Slice& msg, const Slice& msg2 = Slice()) {
+    return Status(kMergeInProgress, msg, msg2);
+  }
+  static Status Incomplete(const Slice& msg, const Slice& msg2 = Slice()) {
+    return Status(kIncomplete, msg, msg2);
+  }
+  static Status ShutdownInProgress(const Slice& msg,
+                                   const Slice& msg2 = Slice()) {
+    return Status(kShutdownInProgress, msg, msg2);
+  }
+
+  // Returns true iff the status indicates success.
+  bool ok() const { return code() == kOk; }
+
+  // Returns true iff the status indicates a NotFound error.
+  bool IsNotFound() const { return code() == kNotFound; }
+
+  // Returns true iff the status indicates a Corruption error.
+  bool IsCorruption() const { return code() == kCorruption; }
+
+  // Returns true iff the status indicates a NotSupported error.
+  bool IsNotSupported() const { return code() == kNotSupported; }
+
+  // Returns true iff the status indicates an InvalidArgument error.
+  bool IsInvalidArgument() const { return code() == kInvalidArgument; }
+
+  // Returns true iff the status indicates an IOError.
+  bool IsIOError() const { return code() == kIOError; }
+
+  // Returns true iff the status indicates a MergeInProgress error.
+  bool IsMergeInProgress() const { return code() == kMergeInProgress; }
+
+  // Returns true iff the status indicates Incomplete
+  bool IsIncomplete() const { return code() == kIncomplete; }
+
+  // Returns true iff the status indicates ShutdownInProgress
+  bool IsShutdownInProgress() const { return code() == kShutdownInProgress; }
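+
+  // Illustrative sketch (not part of the original header): typical error
+  // handling with Status.
+  //
+  //   std::string value;
+  //   rocksdb::Status s = db->Get(rocksdb::ReadOptions(), "key", &value);
+  //   if (s.IsNotFound()) {
+  //     // key absent; often not an error
+  //   } else if (!s.ok()) {
+  //     fprintf(stderr, "Get failed: %s\n", s.ToString().c_str());
+  //   }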
+ std::string ToString() const;
+
+ enum Code {
+ kOk = 0,
+ kNotFound = 1,
+ kCorruption = 2,
+ kNotSupported = 3,
+ kInvalidArgument = 4,
+ kIOError = 5,
+ kMergeInProgress = 6,
+ kIncomplete = 7,
+ kShutdownInProgress = 8
+ };
+
+ Code code() const {
+ return code_;
+ }
+ private:
+ // A nullptr state_ (which is always the case for OK) means the message
+ // is empty. Otherwise, state_ is a new[] array of the following form:
+ // state_[0..3] == length of message
+ // state_[4..] == message
+ Code code_;
+ const char* state_;
+
+ explicit Status(Code code) : code_(code), state_(nullptr) { }
+ Status(Code code, const Slice& msg, const Slice& msg2);
+ static const char* CopyState(const char* s);
+};
+
+inline Status::Status(const Status& s) {
+ code_ = s.code_;
+ state_ = (s.state_ == nullptr) ? nullptr : CopyState(s.state_);
+}
+inline void Status::operator=(const Status& s) {
+ // The following condition catches both aliasing (when this == &s),
+ // and the common case where both s and *this are ok.
+ code_ = s.code_;
+ if (state_ != s.state_) {
+ delete[] state_;
+ state_ = (s.state_ == nullptr) ? nullptr : CopyState(s.state_);
+ }
+}
+
+} // namespace rocksdb
+
+#endif // STORAGE_ROCKSDB_INCLUDE_STATUS_H_
diff --git a/include/rocksdb/table.h b/include/rocksdb/table.h
new file mode 100644
index 0000000000..11adfec8cf
--- /dev/null
+++ b/include/rocksdb/table.h
@@ -0,0 +1,206 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Currently we support two types of tables: plain table and block-based table.
+// 1. Block-based table: this is the default table type that we inherited from
+// LevelDB, which was designed for storing data on hard disk or flash
+// devices.
+// 2. Plain table: one of RocksDB's SST file formats, optimized
+// for low query latency on pure-memory or really low-latency media.
+//
+// A tutorial of rocksdb table formats is available here:
+// https://github.com/facebook/rocksdb/wiki/A-Tutorial-of-RocksDB-SST-formats
+//
+// Example code is also available at:
+// https://github.com/facebook/rocksdb/wiki/A-Tutorial-of-RocksDB-SST-formats#wiki-examples
+
+#pragma once
+#include <memory>
+#include <stdint.h>
+#include <string>
+
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/options.h"
+#include "rocksdb/status.h"
+
+namespace rocksdb {
+
+// -- Block-based Table
+class FlushBlockPolicyFactory;
+class RandomAccessFile;
+class TableBuilder;
+class TableReader;
+class WritableFile;
+struct EnvOptions;
+struct Options;
+
+using std::unique_ptr;
+
+enum ChecksumType : char {
+ kNoChecksum = 0x0, // not yet supported. Will fail
+ kCRC32c = 0x1,
+ kxxHash = 0x2,
+};
+
+// For advanced users only
+struct BlockBasedTableOptions {
+ // @flush_block_policy_factory creates the instances of the flush block
+ // policy, which provides a configurable way to determine when to flush a
+ // block in the block based tables. If not set, table builder will use the
+ // default block flush policy, which cuts blocks by block size (please
+ // refer to `FlushBlockBySizePolicy`).
+ std::shared_ptr<FlushBlockPolicyFactory> flush_block_policy_factory;
+
+ // TODO(kailiu) Temporarily disable this feature by making the default value
+ // to be false.
+ //
+ // Indicates whether we'd put index/filter blocks into the block cache.
+ // If not specified, each "table reader" object will pre-load index/filter
+ // block during table initialization.
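+ // An illustrative way to wire these options up (hypothetical caller code;
+ // `options` is a rocksdb::Options instance assumed to exist):
+ //
+ //   rocksdb::BlockBasedTableOptions table_options;
+ //   table_options.cache_index_and_filter_blocks = true;
+ //   table_options.checksum = rocksdb::kCRC32c;
+ //   options.table_factory.reset(
+ //       rocksdb::NewBlockBasedTableFactory(table_options));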
+ bool cache_index_and_filter_blocks = false; + + // The index type that will be used for this table. + enum IndexType : char { + // A space efficient index block that is optimized for + // binary-search-based index. + kBinarySearch, + + // The hash index, if enabled, will do the hash lookup when + // `Options.prefix_extractor` is provided. + kHashSearch, + }; + + IndexType index_type = kBinarySearch; + + // Use the specified checksum type. Newly created table files will be + // protected with this checksum type. Old table files will still be readable, + // even though they have different checksum type. + ChecksumType checksum = kCRC32c; +}; + +// Table Properties that are specific to block-based table properties. +struct BlockBasedTablePropertyNames { + // value of this propertis is a fixed int32 number. + static const std::string kIndexType; +}; + +// Create default block based table factory. +extern TableFactory* NewBlockBasedTableFactory( + const BlockBasedTableOptions& table_options = BlockBasedTableOptions()); + +#ifndef ROCKSDB_LITE +// -- Plain Table with prefix-only seek +// For this factory, you need to set Options.prefix_extrator properly to make it +// work. Look-up will starts with prefix hash lookup for key prefix. Inside the +// hash bucket found, a binary search is executed for hash conflicts. Finally, +// a linear search is used. +// @user_key_len: plain table has optimization for fix-sized keys, which can be +// specified via user_key_len. Alternatively, you can pass +// `kPlainTableVariableLength` if your keys have variable +// lengths. +// @bloom_bits_per_key: the number of bits used for bloom filer per prefix. You +// may disable it by passing a zero. +// @hash_table_ratio: the desired utilization of the hash table used for prefix +// hashing. hash_table_ratio = number of prefixes / #buckets +// in the hash table +// @index_sparseness: inside each prefix, need to build one index record for how +// many keys for binary search inside each hash bucket. +// @huge_page_tlb_size: if <=0, allocate hash indexes and blooms from malloc. +// Otherwise from huge page TLB. The user needs to reserve +// huge pages for it to be allocated, like: +// sysctl -w vm.nr_hugepages=20 +// See linux doc Documentation/vm/hugetlbpage.txt + +const uint32_t kPlainTableVariableLength = 0; +extern TableFactory* NewPlainTableFactory(uint32_t user_key_len = + kPlainTableVariableLength, + int bloom_bits_per_prefix = 10, + double hash_table_ratio = 0.75, + size_t index_sparseness = 16, + size_t huge_page_tlb_size = 0); + +// -- Plain Table +// This factory of plain table ignores Options.prefix_extractor and assumes no +// hashable prefix available to the key structure. Lookup will be based on +// binary search index only. Total order seek() can be issued. +// @user_key_len: plain table has optimization for fix-sized keys, which can be +// specified via user_key_len. Alternatively, you can pass +// `kPlainTableVariableLength` if your keys have variable +// lengths. +// @bloom_bits_per_key: the number of bits used for bloom filer per key. You may +// disable it by passing a zero. +// @index_sparseness: need to build one index record for how many keys for +// binary search. +// @huge_page_tlb_size: if <=0, allocate hash indexes and blooms from malloc. +// Otherwise from huge page TLB. 
The user needs to reserve +// huge pages for it to be allocated, like: +// sysctl -w vm.nr_hugepages=20 +// See linux doc Documentation/vm/hugetlbpage.txt +extern TableFactory* NewTotalOrderPlainTableFactory( + uint32_t user_key_len = kPlainTableVariableLength, + int bloom_bits_per_key = 0, size_t index_sparseness = 16, + size_t huge_page_tlb_size = 0); + +#endif // ROCKSDB_LITE + +// A base class for table factories. +class TableFactory { + public: + virtual ~TableFactory() {} + + // The type of the table. + // + // The client of this package should switch to a new name whenever + // the table format implementation changes. + // + // Names starting with "rocksdb." are reserved and should not be used + // by any clients of this package. + virtual const char* Name() const = 0; + + // Returns a Table object table that can fetch data from file specified + // in parameter file. It's the caller's responsibility to make sure + // file is in the correct format. + // + // NewTableReader() is called in two places: + // (1) TableCache::FindTable() calls the function when table cache miss + // and cache the table object returned. + // (1) SstFileReader (for SST Dump) opens the table and dump the table + // contents using the interator of the table. + // options and soptions are options. options is the general options. + // Multiple configured can be accessed from there, including and not + // limited to block cache and key comparators. + // file is a file handler to handle the file for the table + // file_size is the physical file size of the file + // table_reader is the output table reader + virtual Status NewTableReader( + const Options& options, const EnvOptions& soptions, + const InternalKeyComparator& internal_comparator, + unique_ptr&& file, uint64_t file_size, + unique_ptr* table_reader) const = 0; + + // Return a table builder to write to a file for this table type. + // + // It is called in several places: + // (1) When flushing memtable to a level-0 output file, it creates a table + // builder (In DBImpl::WriteLevel0Table(), by calling BuildTable()) + // (2) During compaction, it gets the builder for writing compaction output + // files in DBImpl::OpenCompactionOutputFile(). + // (3) When recovering from transaction logs, it creates a table builder to + // write to a level-0 output file (In DBImpl::WriteLevel0TableForRecovery, + // by calling BuildTable()) + // (4) When running Repairer, it creates a table builder to convert logs to + // SST files (In Repairer::ConvertLogToTable() by calling BuildTable()) + // + // options is the general options. Multiple configured can be acceseed from + // there, including and not limited to compression options. + // file is a handle of a writable file. It is the caller's responsibility to + // keep the file open and close the file after closing the table builder. + // compression_type is the compression type to use in this table. + virtual TableBuilder* NewTableBuilder( + const Options& options, const InternalKeyComparator& internal_comparator, + WritableFile* file, CompressionType compression_type) const = 0; +}; + +} // namespace rocksdb diff --git a/include/rocksdb/table_properties.h b/include/rocksdb/table_properties.h new file mode 100644 index 0000000000..d6b3f4d7b5 --- /dev/null +++ b/include/rocksdb/table_properties.h @@ -0,0 +1,127 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
See the AUTHORS file for names of contributors.
+#pragma once
+
+#include <map>
+#include <string>
+#include "rocksdb/status.h"
+
+namespace rocksdb {
+
+// -- Table Properties
+// Other than basic table properties, each table may also have user-collected
+// properties.
+// The values of the user-collected properties are encoded as raw bytes --
+// users have to interpret these values by themselves.
+// Note: To do prefix seek/scan in `UserCollectedProperties`, you can do
+// something similar to:
+//
+// UserCollectedProperties props = ...;
+// for (auto pos = props.lower_bound(prefix);
+// pos != props.end() && pos->first.compare(0, prefix.size(), prefix) == 0;
+// ++pos) {
+// ...
+// }
+typedef std::map<std::string, std::string> UserCollectedProperties;
+
+// TableProperties contains a bunch of read-only properties of its associated
+// table.
+struct TableProperties {
+ public:
+ // the total size of all data blocks.
+ uint64_t data_size = 0;
+ // the size of index block.
+ uint64_t index_size = 0;
+ // the size of filter block.
+ uint64_t filter_size = 0;
+ // total raw key size
+ uint64_t raw_key_size = 0;
+ // total raw value size
+ uint64_t raw_value_size = 0;
+ // the number of blocks in this table
+ uint64_t num_data_blocks = 0;
+ // the number of entries in this table
+ uint64_t num_entries = 0;
+ // format version, reserved for backward compatibility
+ uint64_t format_version = 0;
+ // If 0, key is variable length. Otherwise number of bytes for each key.
+ uint64_t fixed_key_len = 0;
+
+ // The name of the filter policy used in this table.
+ // If no filter policy is used, `filter_policy_name` will be an empty string.
+ std::string filter_policy_name;
+
+ // user-collected properties
+ UserCollectedProperties user_collected_properties;
+
+ // convert this object to a human readable form
+ // @prop_delim: delimiter for each property.
+ std::string ToString(const std::string& prop_delim = "; ",
+ const std::string& kv_delim = "=") const;
+};
+
+// table properties' human-readable names in the property block.
+struct TablePropertiesNames {
+ static const std::string kDataSize;
+ static const std::string kIndexSize;
+ static const std::string kFilterSize;
+ static const std::string kRawKeySize;
+ static const std::string kRawValueSize;
+ static const std::string kNumDataBlocks;
+ static const std::string kNumEntries;
+ static const std::string kFormatVersion;
+ static const std::string kFixedKeyLen;
+ static const std::string kFilterPolicy;
+};
+
+extern const std::string kPropertiesBlock;
+
+// `TablePropertiesCollector` provides the mechanism for users to collect
+// the properties they are interested in. This class is essentially a
+// collection of callback functions that will be invoked during table
+// building. It is constructed by a TablePropertiesCollectorFactory. The
+// methods don't need to be thread-safe, as we will create exactly one
+// TablePropertiesCollector object per table and then call it sequentially.
+class TablePropertiesCollector {
+ public:
+ virtual ~TablePropertiesCollector() {}
+
+ // Add() will be called when a new key/value pair is inserted into the table.
+ // @params key the original key that is inserted into the table.
+ // @params value the original value that is inserted into the table.
+ virtual Status Add(const Slice& key, const Slice& value) = 0;
+
+ // Finish() will be called when a table has already been built and is ready
+ // for writing the properties block.
+ // @params properties The user will add their collected statistics to
+ // `properties`.
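+ // A minimal sketch of a custom collector (hypothetical; it just counts
+ // entries and publishes the count as a user-collected property):
+ //
+ //   class CountingCollector : public TablePropertiesCollector {
+ //    public:
+ //     Status Add(const Slice& key, const Slice& value) override {
+ //       ++count_;
+ //       return Status::OK();
+ //     }
+ //     Status Finish(UserCollectedProperties* props) override {
+ //       (*props)["my.count"] = std::to_string(count_);
+ //       return Status::OK();
+ //     }
+ //     UserCollectedProperties GetReadableProperties() const override {
+ //       return UserCollectedProperties();
+ //     }
+ //     const char* Name() const override { return "CountingCollector"; }
+ //    private:
+ //     uint64_t count_ = 0;
+ //   };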
+ virtual Status Finish(UserCollectedProperties* properties) = 0; + + // Return the human-readable properties, where the key is property name and + // the value is the human-readable form of value. + virtual UserCollectedProperties GetReadableProperties() const = 0; + + // The name of the properties collector can be used for debugging purpose. + virtual const char* Name() const = 0; +}; + +// Constructs TablePropertiesCollector. Internals create a new +// TablePropertiesCollector for each new table +class TablePropertiesCollectorFactory { + public: + virtual ~TablePropertiesCollectorFactory() {} + // has to be thread-safe + virtual TablePropertiesCollector* CreateTablePropertiesCollector() = 0; + + // The name of the properties collector can be used for debugging purpose. + virtual const char* Name() const = 0; +}; + +// Extra properties +// Below is a list of non-basic properties that are collected by database +// itself. Especially some properties regarding to the internal keys (which +// is unknown to `table`). +extern uint64_t GetDeletedKeys(const UserCollectedProperties& props); + +} // namespace rocksdb diff --git a/include/rocksdb/transaction_log.h b/include/rocksdb/transaction_log.h new file mode 100644 index 0000000000..30443bba55 --- /dev/null +++ b/include/rocksdb/transaction_log.h @@ -0,0 +1,104 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#ifndef STORAGE_ROCKSDB_INCLUDE_TRANSACTION_LOG_ITERATOR_H_ +#define STORAGE_ROCKSDB_INCLUDE_TRANSACTION_LOG_ITERATOR_H_ + +#include "rocksdb/status.h" +#include "rocksdb/types.h" +#include "rocksdb/write_batch.h" +#include +#include + +namespace rocksdb { + +class LogFile; +typedef std::vector> VectorLogPtr; + +enum WalFileType { + /* Indicates that WAL file is in archive directory. WAL files are moved from + * the main db directory to archive directory once they are not live and stay + * there until cleaned up. Files are cleaned depending on archive size + * (Options::WAL_size_limit_MB) and time since last cleaning + * (Options::WAL_ttl_seconds). + */ + kArchivedLogFile = 0, + + /* Indicates that WAL file is live and resides in the main db directory */ + kAliveLogFile = 1 +} ; + +class LogFile { + public: + LogFile() {} + virtual ~LogFile() {} + + // Returns log file's pathname relative to the main db dir + // Eg. For a live-log-file = /000003.log + // For an archived-log-file = /archive/000003.log + virtual std::string PathName() const = 0; + + + // Primary identifier for log file. + // This is directly proportional to creation time of the log file + virtual uint64_t LogNumber() const = 0; + + // Log file can be either alive or archived + virtual WalFileType Type() const = 0; + + // Starting sequence number of writebatch written in this log file + virtual SequenceNumber StartSequence() const = 0; + + // Size of log file on disk in Bytes + virtual uint64_t SizeFileBytes() const = 0; +}; + +struct BatchResult { + SequenceNumber sequence = 0; + std::unique_ptr writeBatchPtr; +}; + +// A TransactionLogIterator is used to iterate over the transactions in a db. +// One run of the iterator is continuous, i.e. 
the iterator will stop at the +// beginning of any gap in sequences +class TransactionLogIterator { + public: + TransactionLogIterator() {} + virtual ~TransactionLogIterator() {} + + // An iterator is either positioned at a WriteBatch or not valid. + // This method returns true if the iterator is valid. + // Can read data from a valid iterator. + virtual bool Valid() = 0; + + // Moves the iterator to the next WriteBatch. + // REQUIRES: Valid() to be true. + virtual void Next() = 0; + + // Returns ok if the iterator is valid. + // Returns the Error when something has gone wrong. + virtual Status status() = 0; + + // If valid return's the current write_batch and the sequence number of the + // earliest transaction contained in the batch. + // ONLY use if Valid() is true and status() is OK. + virtual BatchResult GetBatch() = 0; + + // The read options for TransactionLogIterator. + struct ReadOptions { + // If true, all data read from underlying storage will be + // verified against corresponding checksums. + // Default: true + bool verify_checksums_; + + ReadOptions() : verify_checksums_(true) {} + + explicit ReadOptions(bool verify_checksums) + : verify_checksums_(verify_checksums) {} + }; +}; +} // namespace rocksdb + +#endif // STORAGE_ROCKSDB_INCLUDE_TRANSACTION_LOG_ITERATOR_H_ diff --git a/include/rocksdb/types.h b/include/rocksdb/types.h new file mode 100644 index 0000000000..f20bf8277f --- /dev/null +++ b/include/rocksdb/types.h @@ -0,0 +1,20 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#ifndef STORAGE_ROCKSDB_INCLUDE_TYPES_H_ +#define STORAGE_ROCKSDB_INCLUDE_TYPES_H_ + +#include + +namespace rocksdb { + +// Define all public custom types here. + +// Represents a sequence number in a WAL file. +typedef uint64_t SequenceNumber; + +} // namespace rocksdb + +#endif // STORAGE_ROCKSDB_INCLUDE_TYPES_H_ diff --git a/include/rocksdb/universal_compaction.h b/include/rocksdb/universal_compaction.h new file mode 100644 index 0000000000..eaf47e5c78 --- /dev/null +++ b/include/rocksdb/universal_compaction.h @@ -0,0 +1,83 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#ifndef STORAGE_ROCKSDB_UNIVERSAL_COMPACTION_OPTIONS_H +#define STORAGE_ROCKSDB_UNIVERSAL_COMPACTION_OPTIONS_H + +#include +#include + +namespace rocksdb { + +// +// Algorithm used to make a compaction request stop picking new files +// into a single compaction run +// +enum CompactionStopStyle { + kCompactionStopStyleSimilarSize, // pick files of similar size + kCompactionStopStyleTotalSize // total size of picked files > next file +}; + +class CompactionOptionsUniversal { + public: + + // Percentage flexibilty while comparing file size. If the candidate file(s) + // size is 1% smaller than the next file's size, then include next file into + // this candidate set. // Default: 1 + unsigned int size_ratio; + + // The minimum number of files in a single compaction run. Default: 2 + unsigned int min_merge_width; + + // The maximum number of files in a single compaction run. 
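+ // An illustrative configuration sketch (values are hypothetical; `options`
+ // is a rocksdb::Options instance set up by the caller):
+ //
+ //   rocksdb::CompactionOptionsUniversal univ_options;
+ //   univ_options.size_ratio = 1;
+ //   univ_options.max_size_amplification_percent = 200;
+ //   options.compaction_options_universal = univ_options;
+ //   options.compaction_style = rocksdb::kCompactionStyleUniversal;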
Default: UINT_MAX + unsigned int max_merge_width; + + // The size amplification is defined as the amount (in percentage) of + // additional storage needed to store a single byte of data in the database. + // For example, a size amplification of 2% means that a database that + // contains 100 bytes of user-data may occupy upto 102 bytes of + // physical storage. By this definition, a fully compacted database has + // a size amplification of 0%. Rocksdb uses the following heuristic + // to calculate size amplification: it assumes that all files excluding + // the earliest file contribute to the size amplification. + // Default: 200, which means that a 100 byte database could require upto + // 300 bytes of storage. + unsigned int max_size_amplification_percent; + + // If this option is set to be -1 (the default value), all the output files + // will follow compression type specified. + // + // If this option is not negative, we will try to make sure compressed + // size is just above this value. In normal cases, at least this percentage + // of data will be compressed. + // When we are compacting to a new file, here is the criteria whether + // it needs to be compressed: assuming here are the list of files sorted + // by generation time: + // A1...An B1...Bm C1...Ct + // where A1 is the newest and Ct is the oldest, and we are going to compact + // B1...Bm, we calculate the total size of all the files as total_size, as + // well as the total size of C1...Ct as total_C, the compaction output file + // will be compressed iff + // total_C / total_size < this percentage + int compression_size_percent; + + // The algorithm used to stop picking files into a single compaction run + // Default: kCompactionStopStyleTotalSize + CompactionStopStyle stop_style; + + // Default set of parameters + CompactionOptionsUniversal() : + size_ratio(1), + min_merge_width(2), + max_merge_width(UINT_MAX), + max_size_amplification_percent(200), + compression_size_percent(-1), + stop_style(kCompactionStopStyleTotalSize) { + } +}; + +} // namespace rocksdb + +#endif // STORAGE_ROCKSDB_UNIVERSAL_COMPACTION_OPTIONS_H diff --git a/include/rocksdb/version.h b/include/rocksdb/version.h new file mode 100644 index 0000000000..2bae1ed6e1 --- /dev/null +++ b/include/rocksdb/version.h @@ -0,0 +1,17 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +#pragma once + +// Also update Makefile if you change these +#define ROCKSDB_MAJOR 3 +#define ROCKSDB_MINOR 2 +#define ROCKSDB_PATCH 0 + +// Do not use these. We made the mistake of declaring macros starting with +// double underscore. Now we have to live with our choice. We'll deprecate these +// at some point +#define __ROCKSDB_MAJOR__ ROCKSDB_MAJOR +#define __ROCKSDB_MINOR__ ROCKSDB_MINOR +#define __ROCKSDB_PATCH__ ROCKSDB_PATCH diff --git a/include/rocksdb/write_batch.h b/include/rocksdb/write_batch.h new file mode 100644 index 0000000000..74ee2ad160 --- /dev/null +++ b/include/rocksdb/write_batch.h @@ -0,0 +1,158 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// WriteBatch holds a collection of updates to apply atomically to a DB. +// +// The updates are applied in the order in which they are added +// to the WriteBatch. For example, the value of "key" will be "v3" +// after the following batch is written: +// +// batch.Put("key", "v1"); +// batch.Delete("key"); +// batch.Put("key", "v2"); +// batch.Put("key", "v3"); +// +// Multiple threads can invoke const methods on a WriteBatch without +// external synchronization, but if any of the threads may call a +// non-const method, all threads accessing the same WriteBatch must use +// external synchronization. + +#ifndef STORAGE_ROCKSDB_INCLUDE_WRITE_BATCH_H_ +#define STORAGE_ROCKSDB_INCLUDE_WRITE_BATCH_H_ + +#include +#include "rocksdb/status.h" + +namespace rocksdb { + +class Slice; +class ColumnFamilyHandle; +struct SliceParts; + +class WriteBatch { + public: + explicit WriteBatch(size_t reserved_bytes = 0); + ~WriteBatch(); + + // Store the mapping "key->value" in the database. + void Put(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value); + void Put(const Slice& key, const Slice& value) { + Put(nullptr, key, value); + } + + // Variant of Put() that gathers output like writev(2). The key and value + // that will be written to the database are concatentations of arrays of + // slices. + void Put(ColumnFamilyHandle* column_family, const SliceParts& key, + const SliceParts& value); + void Put(const SliceParts& key, const SliceParts& value) { + Put(nullptr, key, value); + } + + // Merge "value" with the existing value of "key" in the database. + // "key->merge(existing, value)" + void Merge(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value); + void Merge(const Slice& key, const Slice& value) { + Merge(nullptr, key, value); + } + + // If the database contains a mapping for "key", erase it. Else do nothing. + void Delete(ColumnFamilyHandle* column_family, const Slice& key); + void Delete(const Slice& key) { Delete(nullptr, key); } + + // Append a blob of arbitrary size to the records in this batch. The blob will + // be stored in the transaction log but not in any other file. In particular, + // it will not be persisted to the SST files. When iterating over this + // WriteBatch, WriteBatch::Handler::LogData will be called with the contents + // of the blob as it is encountered. Blobs, puts, deletes, and merges will be + // encountered in the same order in thich they were inserted. The blob will + // NOT consume sequence number(s) and will NOT increase the count of the batch + // + // Example application: add timestamps to the transaction log for use in + // replication. + void PutLogData(const Slice& blob); + + // Clear all updates buffered in this batch. + void Clear(); + + // Support for iterating over the contents of a batch. + class Handler { + public: + virtual ~Handler(); + // default implementation will just call Put without column family for + // backwards compatibility. If the column family is not default, + // the function is noop + virtual Status PutCF(uint32_t column_family_id, const Slice& key, + const Slice& value) { + if (column_family_id == 0) { + // Put() historically doesn't return status. We didn't want to be + // backwards incompatible so we didn't change the return status + // (this is a public API). 
We do an ordinary get and return Status::OK() + Put(key, value); + return Status::OK(); + } + return Status::InvalidArgument( + "non-default column family and PutCF not implemented"); + } + virtual void Put(const Slice& key, const Slice& value); + // Merge and LogData are not pure virtual. Otherwise, we would break + // existing clients of Handler on a source code level. The default + // implementation of Merge simply throws a runtime exception. + virtual Status MergeCF(uint32_t column_family_id, const Slice& key, + const Slice& value) { + if (column_family_id == 0) { + Merge(key, value); + return Status::OK(); + } + return Status::InvalidArgument( + "non-default column family and MergeCF not implemented"); + } + virtual void Merge(const Slice& key, const Slice& value); + // The default implementation of LogData does nothing. + virtual void LogData(const Slice& blob); + virtual Status DeleteCF(uint32_t column_family_id, const Slice& key) { + if (column_family_id == 0) { + Delete(key); + return Status::OK(); + } + return Status::InvalidArgument( + "non-default column family and DeleteCF not implemented"); + } + virtual void Delete(const Slice& key); + // Continue is called by WriteBatch::Iterate. If it returns false, + // iteration is halted. Otherwise, it continues iterating. The default + // implementation always returns true. + virtual bool Continue(); + }; + Status Iterate(Handler* handler) const; + + // Retrieve the serialized version of this batch. + const std::string& Data() const { return rep_; } + + // Retrieve data size of the batch. + size_t GetDataSize() const { return rep_.size(); } + + // Returns the number of updates in the batch + int Count() const; + + // Constructor with a serialized string object + explicit WriteBatch(std::string rep): rep_(rep) {} + + private: + friend class WriteBatchInternal; + + std::string rep_; // See comment in write_batch.cc for the format of rep_ + + // Intentionally copyable +}; + +} // namespace rocksdb + +#endif // STORAGE_ROCKSDB_INCLUDE_WRITE_BATCH_H_ diff --git a/include/utilities/backupable_db.h b/include/utilities/backupable_db.h new file mode 100644 index 0000000000..617fe8aefb --- /dev/null +++ b/include/utilities/backupable_db.h @@ -0,0 +1,251 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#ifndef ROCKSDB_LITE + +#define __STDC_FORMAT_MACROS +#include +#include +#include +#include + +#include "utilities/stackable_db.h" +#include "rocksdb/env.h" +#include "rocksdb/status.h" + +namespace rocksdb { + +struct BackupableDBOptions { + // Where to keep the backup files. Has to be different than dbname_ + // Best to set this to dbname_ + "/backups" + // Required + std::string backup_dir; + + // Backup Env object. It will be used for backup file I/O. If it's + // nullptr, backups will be written out using DBs Env. If it's + // non-nullptr, backup's I/O will be performed using this object. + // If you want to have backups on HDFS, use HDFS Env here! 
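+ // An illustrative setup sketch (the path and the Env are hypothetical;
+ // `custom_env` is an Env* created elsewhere, e.g. an HDFS-backed Env
+ // such as the one in hdfs/env_hdfs.h):
+ //
+ //   rocksdb::BackupableDBOptions backup_options("/path/to/backups");
+ //   backup_options.backup_env = custom_env;
+ //   backup_options.share_table_files = true;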
+ // Default: nullptr + Env* backup_env; + + // If share_table_files == true, backup will assume that table files with + // same name have the same contents. This enables incremental backups and + // avoids unnecessary data copies. + // If share_table_files == false, each backup will be on its own and will + // not share any data with other backups. + // default: true + bool share_table_files; + + // Backup info and error messages will be written to info_log + // if non-nullptr. + // Default: nullptr + Logger* info_log; + + // If sync == true, we can guarantee you'll get consistent backup even + // on a machine crash/reboot. Backup process is slower with sync enabled. + // If sync == false, we don't guarantee anything on machine reboot. However, + // chances are some of the backups are consistent. + // Default: true + bool sync; + + // If true, it will delete whatever backups there are already + // Default: false + bool destroy_old_data; + + // If false, we won't backup log files. This option can be useful for backing + // up in-memory databases where log file are persisted, but table files are in + // memory. + // Default: true + bool backup_log_files; + + // Max bytes that can be transferred in a second during backup. + // If 0, go as fast as you can + // Default: 0 + uint64_t backup_rate_limit; + + // Max bytes that can be transferred in a second during restore. + // If 0, go as fast as you can + // Default: 0 + uint64_t restore_rate_limit; + + // Only used if share_table_files is set to true. If true, will consider that + // backups can come from different databases, hence a sst is not uniquely + // identifed by its name, but by the triple (file name, crc32, file length) + // Default: false + // Note: this is an experimental option, and you'll need to set it manually + // *turn it on only if you know what you're doing* + bool share_files_with_checksum; + + void Dump(Logger* logger) const; + + explicit BackupableDBOptions(const std::string& _backup_dir, + Env* _backup_env = nullptr, + bool _share_table_files = true, + Logger* _info_log = nullptr, bool _sync = true, + bool _destroy_old_data = false, + bool _backup_log_files = true, + uint64_t _backup_rate_limit = 0, + uint64_t _restore_rate_limit = 0) + : backup_dir(_backup_dir), + backup_env(_backup_env), + share_table_files(_share_table_files), + info_log(_info_log), + sync(_sync), + destroy_old_data(_destroy_old_data), + backup_log_files(_backup_log_files), + backup_rate_limit(_backup_rate_limit), + restore_rate_limit(_restore_rate_limit), + share_files_with_checksum(false) { + assert(share_table_files || !share_files_with_checksum); + } +}; + +struct RestoreOptions { + // If true, restore won't overwrite the existing log files in wal_dir. It will + // also move all log files from archive directory to wal_dir. Use this option + // in combination with BackupableDBOptions::backup_log_files = false for + // persisting in-memory databases. 
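+ // For instance (illustrative sketch):
+ //
+ //   rocksdb::RestoreOptions restore_options(true /* keep_log_files */);
+ //   // then pass restore_options to RestoreDBFromLatestBackup(...)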
+ // Default: false + bool keep_log_files; + + explicit RestoreOptions(bool _keep_log_files = false) + : keep_log_files(_keep_log_files) {} +}; + +typedef uint32_t BackupID; + +struct BackupInfo { + BackupID backup_id; + int64_t timestamp; + uint64_t size; + + BackupInfo() {} + BackupInfo(BackupID _backup_id, int64_t _timestamp, uint64_t _size) + : backup_id(_backup_id), timestamp(_timestamp), size(_size) {} +}; + +class BackupEngineReadOnly { + public: + virtual ~BackupEngineReadOnly() {} + + static BackupEngineReadOnly* NewReadOnlyBackupEngine( + Env* db_env, const BackupableDBOptions& options); + + // You can GetBackupInfo safely, even with other BackupEngine performing + // backups on the same directory + virtual void GetBackupInfo(std::vector* backup_info) = 0; + + // Restoring DB from backup is NOT safe when there is another BackupEngine + // running that might call DeleteBackup() or PurgeOldBackups(). It is caller's + // responsibility to synchronize the operation, i.e. don't delete the backup + // when you're restoring from it + virtual Status RestoreDBFromBackup( + BackupID backup_id, const std::string& db_dir, const std::string& wal_dir, + const RestoreOptions& restore_options = RestoreOptions()) = 0; + virtual Status RestoreDBFromLatestBackup( + const std::string& db_dir, const std::string& wal_dir, + const RestoreOptions& restore_options = RestoreOptions()) = 0; +}; + +// Please see the documentation in BackupableDB and RestoreBackupableDB +class BackupEngine { + public: + virtual ~BackupEngine() {} + + static BackupEngine* NewBackupEngine(Env* db_env, + const BackupableDBOptions& options); + + virtual Status CreateNewBackup(DB* db, bool flush_before_backup = false) = 0; + virtual Status PurgeOldBackups(uint32_t num_backups_to_keep) = 0; + virtual Status DeleteBackup(BackupID backup_id) = 0; + virtual void StopBackup() = 0; + + virtual void GetBackupInfo(std::vector* backup_info) = 0; + virtual Status RestoreDBFromBackup( + BackupID backup_id, const std::string& db_dir, const std::string& wal_dir, + const RestoreOptions& restore_options = RestoreOptions()) = 0; + virtual Status RestoreDBFromLatestBackup( + const std::string& db_dir, const std::string& wal_dir, + const RestoreOptions& restore_options = RestoreOptions()) = 0; +}; + +// Stack your DB with BackupableDB to be able to backup the DB +class BackupableDB : public StackableDB { + public: + // BackupableDBOptions have to be the same as the ones used in a previous + // incarnation of the DB + // + // BackupableDB ownes the pointer `DB* db` now. You should not delete it or + // use it after the invocation of BackupableDB + BackupableDB(DB* db, const BackupableDBOptions& options); + virtual ~BackupableDB(); + + // Captures the state of the database in the latest backup + // NOT a thread safe call + Status CreateNewBackup(bool flush_before_backup = false); + // Returns info about backups in backup_info + void GetBackupInfo(std::vector* backup_info); + // deletes old backups, keeping latest num_backups_to_keep alive + Status PurgeOldBackups(uint32_t num_backups_to_keep); + // deletes a specific backup + Status DeleteBackup(BackupID backup_id); + // Call this from another thread if you want to stop the backup + // that is currently happening. It will return immediatelly, will + // not wait for the backup to stop. + // The backup will stop ASAP and the call to CreateNewBackup will + // return Status::Incomplete(). It will not clean up after itself, but + // the state will remain consistent. 
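+ // An end-to-end backup sketch (paths are hypothetical and error checks
+ // are elided for brevity):
+ //
+ //   rocksdb::DB* db;
+ //   rocksdb::DB::Open(rocksdb::Options(), "/path/to/db", &db);
+ //   rocksdb::BackupableDB* backupable_db = new rocksdb::BackupableDB(
+ //       db, rocksdb::BackupableDBOptions("/path/to/backups"));
+ //   backupable_db->CreateNewBackup();
+ //   delete backupable_db;  // also releases `db`, which it owns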
The state will be cleaned up
+ // next time you create BackupableDB or RestoreBackupableDB.
+ void StopBackup();
+
+ private:
+ BackupEngine* backup_engine_;
+};
+
+// Use this class to access information about backups and restore from them.
+class RestoreBackupableDB {
+ public:
+ RestoreBackupableDB(Env* db_env, const BackupableDBOptions& options);
+ ~RestoreBackupableDB();
+
+ // Returns info about backups in backup_info
+ void GetBackupInfo(std::vector<BackupInfo>* backup_info);
+
+ // restore from backup with backup_id
+ // IMPORTANT -- if options_.share_table_files == true and you restore DB
+ // from some backup that is not the latest, and you start creating new
+ // backups from the new DB, they will probably fail
+ //
+ // Example: Let's say you have backups 1, 2, 3, 4, 5 and you restore 3.
+ // If you add new data to the DB and try creating a new backup now, the
+ // database will diverge from backups 4 and 5 and the new backup will fail.
+ // If you want to create a new backup, you will first have to delete backups
+ // 4 and 5.
+ Status RestoreDBFromBackup(BackupID backup_id, const std::string& db_dir,
+ const std::string& wal_dir,
+ const RestoreOptions& restore_options =
+ RestoreOptions());
+
+ // restore from the latest backup
+ Status RestoreDBFromLatestBackup(const std::string& db_dir,
+ const std::string& wal_dir,
+ const RestoreOptions& restore_options =
+ RestoreOptions());
+ // deletes old backups, keeping latest num_backups_to_keep alive
+ Status PurgeOldBackups(uint32_t num_backups_to_keep);
+ // deletes a specific backup
+ Status DeleteBackup(BackupID backup_id);
+
+ private:
+ BackupEngine* backup_engine_;
+};
+
+} // namespace rocksdb
+#endif // ROCKSDB_LITE
diff --git a/include/utilities/db_ttl.h b/include/utilities/db_ttl.h
new file mode 100644
index 0000000000..e99744d8f9
--- /dev/null
+++ b/include/utilities/db_ttl.h
@@ -0,0 +1,68 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include <string>
+#include <vector>
+
+#include "utilities/stackable_db.h"
+#include "rocksdb/db.h"
+
+namespace rocksdb {
+
+// Database with TTL support.
+//
+// USE-CASES:
+// This API should be used to open the db when key-values inserted are
+// meant to be removed from the db in a non-strict 'ttl' amount of time.
+// Therefore, this guarantees that key-values inserted will remain in the
+// db for >= ttl amount of time and the db will make efforts to remove the
+// key-values as soon as possible after ttl seconds of their insertion.
+//
+// BEHAVIOUR:
+// TTL is accepted in seconds.
+// (int32_t)Timestamp(creation) is suffixed to values in Put internally.
+// Expired TTL values are deleted in compaction only:
+// (Timestamp + ttl < time_of_compaction)
+// Get/Iterator may return expired entries (compaction may not have run on
+// them yet).
+// Different TTLs may be used during different Opens.
+// Example: Open1 at t=0 with ttl=4 and insert k1,k2, close at t=2;
+// Open2 at t=3 with ttl=5. Now k1,k2 should be deleted at t>=5.
+// read_only=true opens in the usual read-only mode.
Compactions will not be +// triggered(neither manual nor automatic), so no expired entries removed +// +// CONSTRAINTS: +// Not specifying/passing or non-positive TTL behaves like TTL = infinity +// +// !!!WARNING!!!: +// Calling DB::Open directly to re-open a db created by this API will get +// corrupt values(timestamp suffixed) and no ttl effect will be there +// during the second Open, so use this API consistently to open the db +// Be careful when passing ttl with a small positive value because the +// whole database may be deleted in a small amount of time + +class DBWithTTL : public StackableDB { + public: + virtual Status CreateColumnFamilyWithTtl( + const ColumnFamilyOptions& options, const std::string& column_family_name, + ColumnFamilyHandle** handle, int ttl) = 0; + + static Status Open(const Options& options, const std::string& dbname, + DBWithTTL** dbptr, int32_t ttl = 0, + bool read_only = false); + + static Status Open(const DBOptions& db_options, const std::string& dbname, + const std::vector& column_families, + std::vector* handles, + DBWithTTL** dbptr, std::vector ttls, + bool read_only = false); + + protected: + explicit DBWithTTL(DB* db) : StackableDB(db) {} +}; + +} // namespace rocksdb +#endif // ROCKSDB_LITE diff --git a/include/utilities/geo_db.h b/include/utilities/geo_db.h new file mode 100644 index 0000000000..87ff5e6a01 --- /dev/null +++ b/include/utilities/geo_db.h @@ -0,0 +1,105 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// + +#ifndef ROCKSDB_LITE +#pragma once +#include +#include + +#include "utilities/stackable_db.h" +#include "rocksdb/status.h" + +namespace rocksdb { + +// +// Configurable options needed for setting up a Geo database +// +struct GeoDBOptions { + // Backup info and error messages will be written to info_log + // if non-nullptr. + // Default: nullptr + Logger* info_log; + + explicit GeoDBOptions(Logger* _info_log = nullptr):info_log(_info_log) { } +}; + +// +// A position in the earth's geoid +// +class GeoPosition { + public: + double latitude; + double longitude; + + explicit GeoPosition(double la = 0, double lo = 0) : + latitude(la), longitude(lo) { + } +}; + +// +// Description of an object on the Geoid. It is located by a GPS location, +// and is identified by the id. The value associated with this object is +// an opaque string 'value'. Different objects identified by unique id's +// can have the same gps-location associated with them. +// +class GeoObject { + public: + GeoPosition position; + std::string id; + std::string value; + + GeoObject() {} + + GeoObject(const GeoPosition& pos, const std::string& i, + const std::string& val) : + position(pos), id(i), value(val) { + } +}; + +// +// Stack your DB with GeoDB to be able to get geo-spatial support +// +class GeoDB : public StackableDB { + public: + // GeoDBOptions have to be the same as the ones used in a previous + // incarnation of the DB + // + // GeoDB owns the pointer `DB* db` now. You should not delete it or + // use it after the invocation of GeoDB + // GeoDB(DB* db, const GeoDBOptions& options) : StackableDB(db) {} + GeoDB(DB* db, const GeoDBOptions& options) : StackableDB(db) {} + virtual ~GeoDB() {} + + // Insert a new object into the location database. The object is + // uniquely identified by the id. 
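+ // (Illustrative: a caller might write
+ //    rocksdb::GeoObject obj(rocksdb::GeoPosition(37.7, -122.4),
+ //                           "id1", "value");
+ //    rocksdb::Status s = geo_db->Insert(obj);
+ //  where `geo_db` is a GeoDB* obtained elsewhere and the coordinates
+ //  are hypothetical.)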
If an object with the same id already + // exists in the db, then the old one is overwritten by the new + // object being inserted here. + virtual Status Insert(const GeoObject& object) = 0; + + // Retrieve the value of the object located at the specified GPS + // location and is identified by the 'id'. + virtual Status GetByPosition(const GeoPosition& pos, + const Slice& id, std::string* value) = 0; + + // Retrieve the value of the object identified by the 'id'. This method + // could be potentially slower than GetByPosition + virtual Status GetById(const Slice& id, GeoObject* object) = 0; + + // Delete the specified object + virtual Status Remove(const Slice& id) = 0; + + // Returns a list of all items within a circular radius from the + // specified gps location. If 'number_of_values' is specified, + // then this call returns at most that many number of objects. + // The radius is specified in 'meters'. + virtual Status SearchRadial(const GeoPosition& pos, + double radius, + std::vector* values, + int number_of_values = INT_MAX) = 0; +}; + +} // namespace rocksdb +#endif // ROCKSDB_LITE diff --git a/include/utilities/stackable_db.h b/include/utilities/stackable_db.h new file mode 100644 index 0000000000..7927c2a88f --- /dev/null +++ b/include/utilities/stackable_db.h @@ -0,0 +1,215 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include "rocksdb/db.h" + +namespace rocksdb { + +// This class contains APIs to stack rocksdb wrappers.Eg. Stack TTL over base d +class StackableDB : public DB { + public: + // StackableDB is the owner of db now! + explicit StackableDB(DB* db) : db_(db) {} + + ~StackableDB() { + delete db_; + } + + virtual DB* GetBaseDB() { + return db_; + } + + virtual Status CreateColumnFamily(const ColumnFamilyOptions& options, + const std::string& column_family_name, + ColumnFamilyHandle** handle) { + return db_->CreateColumnFamily(options, column_family_name, handle); + } + + virtual Status DropColumnFamily(ColumnFamilyHandle* column_family) { + return db_->DropColumnFamily(column_family); + } + + using DB::Put; + virtual Status Put(const WriteOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + const Slice& val) override { + return db_->Put(options, column_family, key, val); + } + + using DB::Get; + virtual Status Get(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + std::string* value) override { + return db_->Get(options, column_family, key, value); + } + + using DB::MultiGet; + virtual std::vector MultiGet( + const ReadOptions& options, + const std::vector& column_family, + const std::vector& keys, + std::vector* values) override { + return db_->MultiGet(options, column_family, keys, values); + } + + using DB::KeyMayExist; + virtual bool KeyMayExist(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + std::string* value, + bool* value_found = nullptr) override { + return db_->KeyMayExist(options, column_family, key, value, value_found); + } + + using DB::Delete; + virtual Status Delete(const WriteOptions& wopts, + ColumnFamilyHandle* column_family, + const Slice& key) override { + return db_->Delete(wopts, column_family, key); + } + + using DB::Merge; + virtual Status Merge(const WriteOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value) override { + 
return db_->Merge(options, column_family, key, value); + } + + + virtual Status Write(const WriteOptions& opts, WriteBatch* updates) + override { + return db_->Write(opts, updates); + } + + using DB::NewIterator; + virtual Iterator* NewIterator(const ReadOptions& opts, + ColumnFamilyHandle* column_family) override { + return db_->NewIterator(opts, column_family); + } + + virtual Status NewIterators( + const ReadOptions& options, + const std::vector& column_families, + std::vector* iterators) { + return db_->NewIterators(options, column_families, iterators); + } + + + virtual const Snapshot* GetSnapshot() override { + return db_->GetSnapshot(); + } + + virtual void ReleaseSnapshot(const Snapshot* snapshot) override { + return db_->ReleaseSnapshot(snapshot); + } + + using DB::GetProperty; + virtual bool GetProperty(ColumnFamilyHandle* column_family, + const Slice& property, std::string* value) override { + return db_->GetProperty(column_family, property, value); + } + + using DB::GetApproximateSizes; + virtual void GetApproximateSizes(ColumnFamilyHandle* column_family, + const Range* r, int n, + uint64_t* sizes) override { + return db_->GetApproximateSizes(column_family, r, n, sizes); + } + + using DB::CompactRange; + virtual Status CompactRange(ColumnFamilyHandle* column_family, + const Slice* begin, const Slice* end, + bool reduce_level = false, + int target_level = -1) override { + return db_->CompactRange(column_family, begin, end, reduce_level, + target_level); + } + + using DB::NumberLevels; + virtual int NumberLevels(ColumnFamilyHandle* column_family) override { + return db_->NumberLevels(column_family); + } + + using DB::MaxMemCompactionLevel; + virtual int MaxMemCompactionLevel(ColumnFamilyHandle* column_family) + override { + return db_->MaxMemCompactionLevel(column_family); + } + + using DB::Level0StopWriteTrigger; + virtual int Level0StopWriteTrigger(ColumnFamilyHandle* column_family) + override { + return db_->Level0StopWriteTrigger(column_family); + } + + virtual const std::string& GetName() const override { + return db_->GetName(); + } + + virtual Env* GetEnv() const override { + return db_->GetEnv(); + } + + using DB::GetOptions; + virtual const Options& GetOptions(ColumnFamilyHandle* column_family) const + override { + return db_->GetOptions(column_family); + } + + using DB::Flush; + virtual Status Flush(const FlushOptions& fopts, + ColumnFamilyHandle* column_family) override { + return db_->Flush(fopts, column_family); + } + + virtual Status DisableFileDeletions() override { + return db_->DisableFileDeletions(); + } + + virtual Status EnableFileDeletions(bool force) override { + return db_->EnableFileDeletions(force); + } + + virtual Status GetLiveFiles(std::vector& vec, uint64_t* mfs, + bool flush_memtable = true) override { + return db_->GetLiveFiles(vec, mfs, flush_memtable); + } + + virtual SequenceNumber GetLatestSequenceNumber() const override { + return db_->GetLatestSequenceNumber(); + } + + virtual Status GetSortedWalFiles(VectorLogPtr& files) override { + return db_->GetSortedWalFiles(files); + } + + virtual Status DeleteFile(std::string name) override { + return db_->DeleteFile(name); + } + + virtual Status GetDbIdentity(std::string& identity) { + return db_->GetDbIdentity(identity); + } + + using DB::GetPropertiesOfAllTables; + virtual Status GetPropertiesOfAllTables(ColumnFamilyHandle* column_family, + TablePropertiesCollection* props) { + return db_->GetPropertiesOfAllTables(column_family, props); + } + + virtual Status GetUpdatesSince( + SequenceNumber 
seq_number, unique_ptr* iter, + const TransactionLogIterator::ReadOptions& read_options) override { + return db_->GetUpdatesSince(seq_number, iter, read_options); + } + + virtual ColumnFamilyHandle* DefaultColumnFamily() const override { + return db_->DefaultColumnFamily(); + } + + protected: + DB* db_; +}; + +} // namespace rocksdb diff --git a/include/utilities/utility_db.h b/include/utilities/utility_db.h new file mode 100644 index 0000000000..f2b99cedf0 --- /dev/null +++ b/include/utilities/utility_db.h @@ -0,0 +1,30 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#ifndef ROCKSDB_LITE +#include +#include + +#include "utilities/stackable_db.h" +#include "utilities/db_ttl.h" +#include "rocksdb/db.h" + +namespace rocksdb { + +// Please don't use this class. It's deprecated +class UtilityDB { + public: + // This function is here only for backwards compatibility. Please use the + // functions defined in DBWithTTl (utilities/db_ttl.h) + // (deprecated) + __attribute__((deprecated)) static Status OpenTtlDB(const Options& options, + const std::string& name, + StackableDB** dbptr, + int32_t ttl = 0, + bool read_only = false); +}; + +} // namespace rocksdb +#endif // ROCKSDB_LITE diff --git a/java/Makefile b/java/Makefile new file mode 100644 index 0000000000..feacb83464 --- /dev/null +++ b/java/Makefile @@ -0,0 +1,31 @@ +NATIVE_JAVA_CLASSES = org.rocksdb.RocksDB org.rocksdb.Options org.rocksdb.WriteBatch org.rocksdb.WriteBatchInternal org.rocksdb.WriteBatchTest org.rocksdb.WriteOptions org.rocksdb.BackupableDB org.rocksdb.BackupableDBOptions org.rocksdb.Statistics org.rocksdb.RocksIterator org.rocksdb.VectorMemTableConfig org.rocksdb.SkipListMemTableConfig org.rocksdb.HashLinkedListMemTableConfig org.rocksdb.HashSkipListMemTableConfig org.rocksdb.PlainTableConfig org.rocksdb.ReadOptions org.rocksdb.Filter org.rocksdb.BloomFilter +NATIVE_INCLUDE = ./include +ROCKSDB_JAR = rocksdbjni.jar + +clean: + -find . -name "*.class" -exec rm {} \; + -find . -name "hs*.log" -exec rm {} \; + rm -f $(ROCKSDB_JAR) + +java: + javac org/rocksdb/util/*.java org/rocksdb/*.java + jar -cf $(ROCKSDB_JAR) org/rocksdb/*.class org/rocksdb/util/*.class + javah -d $(NATIVE_INCLUDE) -jni $(NATIVE_JAVA_CLASSES) + +sample: java + javac -cp $(ROCKSDB_JAR) RocksDBSample.java + @rm -rf /tmp/rocksdbjni + @rm -rf /tmp/rocksdbjni_not_found + java -ea -Djava.library.path=.:../ -cp ".:./*" -Xcheck:jni RocksDBSample /tmp/rocksdbjni + @rm -rf /tmp/rocksdbjni + @rm -rf /tmp/rocksdbjni_not_found + +test: java + javac org/rocksdb/test/*.java + java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.WriteBatchTest + java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.BackupableDBTest + java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.OptionsTest + java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.ReadOptionsTest + +db_bench: java + javac org/rocksdb/benchmark/*.java diff --git a/java/RocksDBSample.java b/java/RocksDBSample.java new file mode 100644 index 0000000000..1493977a93 --- /dev/null +++ b/java/RocksDBSample.java @@ -0,0 +1,253 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +import java.util.Arrays; +import java.util.List; +import java.util.Map; +import java.util.ArrayList; +import org.rocksdb.*; +import org.rocksdb.util.SizeUnit; +import java.io.IOException; + +public class RocksDBSample { + static { + RocksDB.loadLibrary(); + } + + public static void main(String[] args) { + if (args.length < 1) { + System.out.println("usage: RocksDBSample db_path"); + return; + } + String db_path = args[0]; + String db_path_not_found = db_path + "_not_found"; + + System.out.println("RocksDBSample"); + RocksDB db = null; + Options options = new Options(); + try { + db = RocksDB.open(options, db_path_not_found); + assert(false); + } catch (RocksDBException e) { + System.out.format("caught the expceted exception -- %s\n", e); + assert(db == null); + } + + Filter filter = new BloomFilter(10); + options.setCreateIfMissing(true) + .createStatistics() + .setWriteBufferSize(8 * SizeUnit.KB) + .setMaxWriteBufferNumber(3) + .setDisableSeekCompaction(true) + .setBlockSize(64 * SizeUnit.KB) + .setMaxBackgroundCompactions(10) + .setFilter(filter); + Statistics stats = options.statisticsPtr(); + + assert(options.createIfMissing() == true); + assert(options.writeBufferSize() == 8 * SizeUnit.KB); + assert(options.maxWriteBufferNumber() == 3); + assert(options.disableSeekCompaction() == true); + assert(options.blockSize() == 64 * SizeUnit.KB); + assert(options.maxBackgroundCompactions() == 10); + + assert(options.memTableFactoryName().equals("SkipListFactory")); + options.setMemTableConfig( + new HashSkipListMemTableConfig() + .setHeight(4) + .setBranchingFactor(4) + .setBucketCount(2000000)); + assert(options.memTableFactoryName().equals("HashSkipListRepFactory")); + + options.setMemTableConfig( + new HashLinkedListMemTableConfig() + .setBucketCount(100000)); + assert(options.memTableFactoryName().equals("HashLinkedListRepFactory")); + + options.setMemTableConfig( + new VectorMemTableConfig().setReservedSize(10000)); + assert(options.memTableFactoryName().equals("VectorRepFactory")); + + options.setMemTableConfig(new SkipListMemTableConfig()); + assert(options.memTableFactoryName().equals("SkipListFactory")); + + options.setTableFormatConfig(new PlainTableConfig()); + assert(options.tableFactoryName().equals("PlainTable")); + + try { + db = RocksDB.open(options, db_path_not_found); + db.put("hello".getBytes(), "world".getBytes()); + byte[] value = db.get("hello".getBytes()); + assert("world".equals(new String(value))); + } catch (RocksDBException e) { + System.out.format("[ERROR] caught the unexpceted exception -- %s\n", e); + assert(db == null); + assert(false); + } + // be sure to release the c++ pointer + db.close(); + + ReadOptions readOptions = new ReadOptions(); + readOptions.setFillCache(false); + + try { + db = RocksDB.open(options, db_path); + db.put("hello".getBytes(), "world".getBytes()); + byte[] value = db.get("hello".getBytes()); + System.out.format("Get('hello') = %s\n", + new String(value)); + + for (int i = 1; i <= 9; ++i) { + for (int j = 1; j <= 9; ++j) { + db.put(String.format("%dx%d", i, j).getBytes(), + String.format("%d", i * j).getBytes()); + } + } + + for (int i = 1; i <= 9; ++i) { + for (int j = 1; j <= 9; ++j) { + System.out.format("%s ", new String(db.get( + String.format("%dx%d", i, j).getBytes()))); + } + System.out.println(""); + } + + value = db.get("1x1".getBytes()); + assert(value != null); + value = db.get("world".getBytes()); + assert(value == null); + 
value = db.get(readOptions, "world".getBytes()); + assert(value == null); + + byte[] testKey = "asdf".getBytes(); + byte[] testValue = + "asdfghjkl;'?><MNBVCXZQWERTYUIOP{+_)(*&^%$#@".getBytes(); + db.put(testKey, testValue); + + byte[] insufficientArray = new byte[10]; + byte[] enoughArray = new byte[50]; + int len; + len = db.get(testKey, insufficientArray); + assert(len > insufficientArray.length); + len = db.get("asdfjkl;".getBytes(), enoughArray); + assert(len == RocksDB.NOT_FOUND); + len = db.get(testKey, enoughArray); + assert(len == testValue.length); + + len = db.get(readOptions, testKey, insufficientArray); + assert(len > insufficientArray.length); + len = db.get(readOptions, "asdfjkl;".getBytes(), enoughArray); + assert(len == RocksDB.NOT_FOUND); + len = db.get(readOptions, testKey, enoughArray); + assert(len == testValue.length); + + db.remove(testKey); + len = db.get(testKey, enoughArray); + assert(len == RocksDB.NOT_FOUND); + + // repeat the test with WriteOptions + WriteOptions writeOpts = new WriteOptions(); + writeOpts.setSync(true); + writeOpts.setDisableWAL(true); + db.put(writeOpts, testKey, testValue); + len = db.get(testKey, enoughArray); + assert(len == testValue.length); + assert(new String(testValue).equals( + new String(enoughArray, 0, len))); + writeOpts.dispose(); + + try { + for (TickerType statsType : TickerType.values()) { + stats.getTickerCount(statsType); + } + System.out.println("getTickerCount() passed."); + } catch (Exception e) { + System.out.println("Failed in call to getTickerCount()"); + assert(false); // Should never reach here. + } + + try { + for (HistogramType histogramType : HistogramType.values()) { + HistogramData data = stats.geHistogramData(histogramType); + } + System.out.println("geHistogramData() passed."); + } catch (Exception e) { + System.out.println("Failed in call to geHistogramData()"); + assert(false); // Should never reach here. + } + + RocksIterator iterator = db.newIterator(); + + boolean seekToFirstPassed = false; + for (iterator.seekToFirst(); iterator.isValid(); iterator.next()) { + iterator.status(); + assert(iterator.key() != null); + assert(iterator.value() != null); + seekToFirstPassed = true; + } + if (seekToFirstPassed) { + System.out.println("iterator seekToFirst tests passed."); + } + + boolean seekToLastPassed = false; + for (iterator.seekToLast(); iterator.isValid(); iterator.prev()) { + iterator.status(); + assert(iterator.key() != null); + assert(iterator.value() != null); + seekToLastPassed = true; + } + + if (seekToLastPassed) { + System.out.println("iterator seekToLast tests passed."); + } + + iterator.seekToFirst(); + iterator.seek(iterator.key()); + assert(iterator.key() != null); + assert(iterator.value() != null); + + System.out.println("iterator seek test passed."); + + iterator.dispose(); + System.out.println("iterator tests passed."); + + iterator = db.newIterator(); + List<byte[]> keys = new ArrayList<byte[]>(); + for (iterator.seekToLast(); iterator.isValid(); iterator.prev()) { + keys.add(iterator.key()); + } + iterator.dispose(); + + Map<byte[], byte[]> values = db.multiGet(keys); + assert(values.size() == keys.size()); + for (byte[] value1 : values.values()) { + assert(value1 != null); + } + + values = db.multiGet(new ReadOptions(), keys); + assert(values.size() == keys.size()); + for (byte[] value1 : values.values()) { + assert(value1 != null); + } + } catch (RocksDBException e) { + System.err.println(e); + } + if (db != null) { + db.close(); + } + // be sure to dispose c++ pointers + options.dispose(); + readOptions.dispose(); + filter.dispose(); + } +} diff --git a/java/jdb_bench.sh b/java/jdb_bench.sh new file mode 100755 index 0000000000..dba7dbd319 --- /dev/null +++ b/java/jdb_bench.sh @@ -0,0 +1 @@ +java -server -d64 -XX:NewSize=4m
-XX:+AggressiveOpts -Djava.library.path=.:../ -cp "rocksdbjni.jar:.:./*" org.rocksdb.benchmark.DbBenchmark $@ diff --git a/java/org/rocksdb/BackupableDB.java b/java/org/rocksdb/BackupableDB.java new file mode 100644 index 0000000000..90d4a2a9af --- /dev/null +++ b/java/org/rocksdb/BackupableDB.java @@ -0,0 +1,80 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +/** + * A subclass of RocksDB which supports backup-related operations. + * + * @see BackupableDBOptions + */ +public class BackupableDB extends RocksDB { + /** + * Open a BackupableDB under the specified path. + * Note that the backup path should be set properly in the + * input BackupableDBOptions. + * + * @param opt options for db. + * @param bopt backup related options. + * @param db_path the db path for storing data. The path for storing + * backup should be specified in the BackupableDBOptions. + * @return reference to the opened BackupableDB. + */ + public static BackupableDB open( + Options opt, BackupableDBOptions bopt, String db_path) + throws RocksDBException { + + RocksDB db = RocksDB.open(opt, db_path); + BackupableDB bdb = new BackupableDB(); + bdb.open(db.nativeHandle_, bopt.nativeHandle_); + + // Prevent the RocksDB object from attempting to delete + // the underlying C++ DB object. + db.disOwnNativeHandle(); + + return bdb; + } + + /** + * Captures the state of the database in the latest backup. + * Note that this function is not thread-safe. + * + * @param flushBeforeBackup if true, then all data will be flushed + * before creating backup. + */ + public void createNewBackup(boolean flushBeforeBackup) { + createNewBackup(nativeHandle_, flushBeforeBackup); + } + + + /** + * Close the BackupableDB instance and release resource. + * + * Internally, BackupableDB owns the rocksdb::DB pointer to its + * associated RocksDB. The release of that RocksDB pointer is + * handled in the destructor of the c++ rocksdb::BackupableDB and + * should be transparent to Java developers. + */ + @Override public synchronized void close() { + if (isInitialized()) { + super.close(); + } + } + + /** + * A protected constructor that will be used in the static factory + * method BackupableDB.open(). + */ + protected BackupableDB() { + super(); + } + + @Override protected void finalize() { + close(); + } + + protected native void open(long rocksDBHandle, long backupDBOptionsHandle); + protected native void createNewBackup(long handle, boolean flag); +} diff --git a/java/org/rocksdb/BackupableDBOptions.java b/java/org/rocksdb/BackupableDBOptions.java new file mode 100644 index 0000000000..f229d88aa6 --- /dev/null +++ b/java/org/rocksdb/BackupableDBOptions.java @@ -0,0 +1,43 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +/** + * BackupableDBOptions to control the behavior of a backupable database. + * It will be used during the creation of a BackupableDB. + * + * Note that dispose() must be called before a BackupableDBOptions instance + * becomes out-of-scope to release the allocated memory in c++.
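+ * + * A minimal lifecycle sketch (illustrative only; the paths below are + * hypothetical): + * BackupableDBOptions bopt = new BackupableDBOptions("/path/to/backups"); + * BackupableDB bdb = BackupableDB.open( + * new Options().setCreateIfMissing(true), bopt, "/path/to/db"); + * bdb.createNewBackup(true); // flush, then capture the latest state + * bdb.close(); + * bopt.dispose();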
+ */ +public class BackupableDBOptions extends RocksObject { + public BackupableDBOptions(String path) { + super(); + newBackupableDBOptions(path); + } + + /** + * Returns the path to the BackupableDB directory. + * + * @return the path to the BackupableDB directory. + */ + public String backupDir() { + assert(isInitialized()); + return backupDir(nativeHandle_); + } + + /** + * Release the memory allocated for the current instance + * in the c++ side. + */ + @Override protected void disposeInternal() { + assert(isInitialized()); + disposeInternal(nativeHandle_); + } + + private native void newBackupableDBOptions(String path); + private native String backupDir(long handle); + private native void disposeInternal(long handle); +} diff --git a/java/org/rocksdb/BloomFilter.java b/java/org/rocksdb/BloomFilter.java new file mode 100644 index 0000000000..9c4913a8c6 --- /dev/null +++ b/java/org/rocksdb/BloomFilter.java @@ -0,0 +1,37 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +/** + * This class creates a new filter policy that uses a bloom filter + * with approximately the specified number of bits per key. + * A good value for bitsPerKey is 10, which yields a filter + * with ~ 1% false positive rate. + * + * Default value of bits per key is 10. + */ +public class BloomFilter extends Filter { + private static final int DEFAULT_BITS_PER_KEY = 10; + private final int bitsPerKey_; + + public BloomFilter() { + this(DEFAULT_BITS_PER_KEY); + } + + public BloomFilter(int bitsPerKey) { + super(); + bitsPerKey_ = bitsPerKey; + + createNewFilter(); + } + + @Override + protected void createNewFilter() { + createNewFilter0(bitsPerKey_); + } + + private native void createNewFilter0(int bitsPerKey); +} diff --git a/java/org/rocksdb/Filter.java b/java/org/rocksdb/Filter.java new file mode 100644 index 0000000000..ce5c41f26d --- /dev/null +++ b/java/org/rocksdb/Filter.java @@ -0,0 +1,31 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +/** + * Filters are stored in rocksdb and are consulted automatically + * by rocksdb to decide whether or not to read some + * information from disk. In many cases, a filter can cut down the + * number of disk seeks from a handful to a single disk seek per + * DB::Get() call. + */ +public abstract class Filter extends RocksObject { + protected abstract void createNewFilter(); + + /** + * Deletes underlying C++ filter pointer. + * + * Note that this function should be called only after all + * RocksDB instances referencing the filter are closed. + * Otherwise undefined behavior will occur.
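+ * + * A dispose-order sketch (illustrative, not prescriptive): + * db.close(); // close every RocksDB that still references the filter + * options.dispose(); + * filter.dispose(); // only now is disposing the filter safe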
+ */ + @Override protected void disposeInternal() { + assert(isInitialized()); + disposeInternal(nativeHandle_); + } + + private native void disposeInternal(long handle); +} diff --git a/java/org/rocksdb/HashLinkedListMemTableConfig.java b/java/org/rocksdb/HashLinkedListMemTableConfig.java new file mode 100644 index 0000000000..24fcd8b52b --- /dev/null +++ b/java/org/rocksdb/HashLinkedListMemTableConfig.java @@ -0,0 +1,52 @@ +package org.rocksdb; + +/** + * The config for hash linked list memtable representation. + * Such a memtable contains a fixed-size array of buckets, where + * each bucket points to a sorted singly-linked + * list (or null if the bucket is empty). + * + * Note that since this mem-table representation relies on the + * key prefix, it is required to invoke one of the usePrefixExtractor + * functions to specify how to extract key prefix given a key. + * If a proper prefix-extractor is not set, then RocksDB will + * use the default memtable representation (SkipList) instead + * and post a warning in the LOG. + */ +public class HashLinkedListMemTableConfig extends MemTableConfig { + public static final long DEFAULT_BUCKET_COUNT = 50000; + + public HashLinkedListMemTableConfig() { + bucketCount_ = DEFAULT_BUCKET_COUNT; + } + + /** + * Set the number of buckets in the fixed-size array used + * in the hash linked-list mem-table. + * + * @param count the number of hash buckets. + * @return the reference to the current HashLinkedListMemTableConfig. + */ + public HashLinkedListMemTableConfig setBucketCount(long count) { + bucketCount_ = count; + return this; + } + + /** + * Returns the number of buckets that will be used in the memtable + * created based on this config. + * + * @return the number of buckets + */ + public long bucketCount() { + return bucketCount_; + } + + @Override protected long newMemTableFactoryHandle() { + return newMemTableFactoryHandle(bucketCount_); + } + + private native long newMemTableFactoryHandle(long bucketCount); + + private long bucketCount_; +} diff --git a/java/org/rocksdb/HashSkipListMemTableConfig.java b/java/org/rocksdb/HashSkipListMemTableConfig.java new file mode 100644 index 0000000000..74fb0dba2d --- /dev/null +++ b/java/org/rocksdb/HashSkipListMemTableConfig.java @@ -0,0 +1,97 @@ +package org.rocksdb; + +/** + * The config for hash skip-list mem-table representation. + * Such mem-table representation contains a fixed-size array of + * buckets, where each bucket points to a skiplist (or null if the + * bucket is empty). + * + * Note that since this mem-table representation relies on the + * key prefix, it is required to invoke one of the usePrefixExtractor + * functions to specify how to extract key prefix given a key. + * If a proper prefix-extractor is not set, then RocksDB will + * use the default memtable representation (SkipList) instead + * and post a warning in the LOG. + */ +public class HashSkipListMemTableConfig extends MemTableConfig { + public static final int DEFAULT_BUCKET_COUNT = 1000000; + public static final int DEFAULT_BRANCHING_FACTOR = 4; + public static final int DEFAULT_HEIGHT = 4; + + public HashSkipListMemTableConfig() { + bucketCount_ = DEFAULT_BUCKET_COUNT; + branchingFactor_ = DEFAULT_BRANCHING_FACTOR; + height_ = DEFAULT_HEIGHT; + } + + /** + * Set the number of hash buckets used in the hash skiplist memtable. + * Default = 1000000. + * + * @param count the number of hash buckets used in the hash + * skiplist memtable. + * @return the reference to the current HashSkipListMemTableConfig.
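+ * + * A configuration sketch (the values are illustrative, not tuning advice; + * a prefix extractor must also be set, as the class comment notes): + * options.useFixedLengthPrefixExtractor(4); + * options.setMemTableConfig(new HashSkipListMemTableConfig() + * .setBucketCount(1000000) + * .setHeight(4) + * .setBranchingFactor(4));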
+ */ + public HashSkipListMemTableConfig setBucketCount(long count) { + bucketCount_ = count; + return this; + } + + /** + * @return the number of hash buckets + */ + public long bucketCount() { + return bucketCount_; + } + + /** + * Set the height of the skip list. Default = 4. + * + * @param height the height of the skip list. + * @return the reference to the current HashSkipListMemTableConfig. + */ + public HashSkipListMemTableConfig setHeight(int height) { + height_ = height; + return this; + } + + /** + * @return the height of the skip list. + */ + public int height() { + return height_; + } + + /** + * Set the branching factor used in the hash skip-list memtable. + * This factor controls the probabilistic size ratio between adjacent + * links in the skip list. + * + * @param bf the probabilistic size ratio between adjacent links + * in the skip list. + * @return the reference to the current HashSkipListMemTableConfig. + */ + public HashSkipListMemTableConfig setBranchingFactor(int bf) { + branchingFactor_ = bf; + return this; + } + + /** + * @return branching factor, the probabilistic size ratio between + * adjacent links in the skip list. + */ + public int branchingFactor() { + return branchingFactor_; + } + + @Override protected long newMemTableFactoryHandle() { + return newMemTableFactoryHandle( + bucketCount_, height_, branchingFactor_); + } + + private native long newMemTableFactoryHandle( + long bucketCount, int height, int branchingFactor); + + private long bucketCount_; + private int branchingFactor_; + private int height_; +} diff --git a/java/org/rocksdb/HistogramData.java b/java/org/rocksdb/HistogramData.java new file mode 100644 index 0000000000..3b2e295997 --- /dev/null +++ b/java/org/rocksdb/HistogramData.java @@ -0,0 +1,43 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +public class HistogramData { + private final double median_; + private final double percentile95_; + private final double percentile99_; + private final double average_; + private final double standardDeviation_; + + public HistogramData(double median, double percentile95, + double percentile99, double average, double standardDeviation) { + median_ = median; + percentile95_ = percentile95; + percentile99_ = percentile99; + average_ = average; + standardDeviation_ = standardDeviation; + } + + public double getMedian() { + return median_; + } + + public double getPercentile95() { + return percentile95_; + } + + public double getPercentile99() { + return percentile99_; + } + + public double getAverage() { + return average_; + } + + public double getStandardDeviation() { + return standardDeviation_; + } +} diff --git a/java/org/rocksdb/HistogramType.java b/java/org/rocksdb/HistogramType.java new file mode 100644 index 0000000000..751c03a111 --- /dev/null +++ b/java/org/rocksdb/HistogramType.java @@ -0,0 +1,39 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory.
+ +package org.rocksdb; + +public enum HistogramType { + DB_GET(0), + DB_WRITE(1), + COMPACTION_TIME(2), + TABLE_SYNC_MICROS(3), + COMPACTION_OUTFILE_SYNC_MICROS(4), + WAL_FILE_SYNC_MICROS(5), + MANIFEST_FILE_SYNC_MICROS(6), + // TIME SPENT IN IO DURING TABLE OPEN + TABLE_OPEN_IO_MICROS(7), + DB_MULTIGET(8), + READ_BLOCK_COMPACTION_MICROS(9), + READ_BLOCK_GET_MICROS(10), + WRITE_RAW_BLOCK_MICROS(11), + + STALL_L0_SLOWDOWN_COUNT(12), + STALL_MEMTABLE_COMPACTION_COUNT(13), + STALL_L0_NUM_FILES_COUNT(14), + HARD_RATE_LIMIT_DELAY_COUNT(15), + SOFT_RATE_LIMIT_DELAY_COUNT(16), + NUM_FILES_IN_SINGLE_COMPACTION(17); + + private final int value_; + + private HistogramType(int value) { + value_ = value; + } + + public int getValue() { + return value_; + } +} diff --git a/java/org/rocksdb/MemTableConfig.java b/java/org/rocksdb/MemTableConfig.java new file mode 100644 index 0000000000..a473c25856 --- /dev/null +++ b/java/org/rocksdb/MemTableConfig.java @@ -0,0 +1,27 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +package org.rocksdb; + +/** + * MemTableConfig is used to configure the internal mem-table of a RocksDB. + * Each memtable representation is required to have such a sub-class to + * allow Java developers to use it. + * + * To make a RocksDB use a specific MemTable format, its associated + * MemTableConfig should be properly set and passed into Options + * via Options.setMemTableConfig() and open the db using that Options. + * + * @see Options + */ +public abstract class MemTableConfig { + /** + * This function should only be called by Options.setMemTableConfig(), + * which will create a c++ shared-pointer to the c++ MemTableRepFactory + * that is associated with the Java MemTableConfig. + * + * @see Options.setMemTableConfig() + */ + abstract protected long newMemTableFactoryHandle(); +} diff --git a/java/org/rocksdb/Options.java b/java/org/rocksdb/Options.java new file mode 100644 index 0000000000..af4b82fabe --- /dev/null +++ b/java/org/rocksdb/Options.java @@ -0,0 +1,2354 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +/** + * Options to control the behavior of a database. It will be used + * during the creation of a RocksDB (i.e., RocksDB.open()). + * + * Note that dispose() must be called before an Options instance + * becomes out-of-scope to release the allocated memory in c++. + */ +public class Options extends RocksObject { + static final long DEFAULT_CACHE_SIZE = 8 << 20; + /** + * Construct options for opening a RocksDB. + * + * This constructor will create (by allocating a block of memory) + * a rocksdb::Options in the c++ side. + */ + public Options() { + super(); + cacheSize_ = DEFAULT_CACHE_SIZE; + newOptions(); + } + + /** + * If this value is set to true, then the database will be created + * if it is missing during RocksDB.open(). + * Default: false + * + * @param flag a flag indicating whether to create the database if the + * specified database is missing during the RocksDB.open() operation. + * @return the instance of the current Options.
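+ * + * For example (an illustrative sketch; the path is hypothetical): + * RocksDB db = RocksDB.open( + * new Options().setCreateIfMissing(true), "/tmp/mydb");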
+ * @see RocksDB.open() + */ + public Options setCreateIfMissing(boolean flag) { + assert(isInitialized()); + setCreateIfMissing(nativeHandle_, flag); + return this; + } + + /** + * Return true if the create_if_missing flag is set to true. + * If true, the database will be created if it is missing. + * + * @return true if the createIfMissing option is set to true. + * @see setCreateIfMissing() + */ + public boolean createIfMissing() { + assert(isInitialized()); + return createIfMissing(nativeHandle_); + } + + /** + * Amount of data to build up in memory (backed by an unsorted log + * on disk) before converting to a sorted on-disk file. + * + * Larger values increase performance, especially during bulk loads. + * Up to max_write_buffer_number write buffers may be held in memory + * at the same time, so you may wish to adjust this parameter + * to control memory usage. + * + * Also, a larger write buffer will result in a longer recovery time + * the next time the database is opened. + * + * Default: 4MB + * @param writeBufferSize the size of write buffer. + * @return the instance of the current Options. + * @see RocksDB.open() + */ + public Options setWriteBufferSize(long writeBufferSize) { + assert(isInitialized()); + setWriteBufferSize(nativeHandle_, writeBufferSize); + return this; + } + + /** + * Returns the size of the write buffer. + * + * @return size of write buffer. + * @see setWriteBufferSize() + */ + public long writeBufferSize() { + assert(isInitialized()); + return writeBufferSize(nativeHandle_); + } + + /** + * The maximum number of write buffers that are built up in memory. + * The default is 2, so that when 1 write buffer is being flushed to + * storage, new writes can continue to the other write buffer. + * Default: 2 + * + * @param maxWriteBufferNumber maximum number of write buffers. + * @return the instance of the current Options. + * @see RocksDB.open() + */ + public Options setMaxWriteBufferNumber(int maxWriteBufferNumber) { + assert(isInitialized()); + setMaxWriteBufferNumber(nativeHandle_, maxWriteBufferNumber); + return this; + } + + /** + * Returns maximum number of write buffers. + * + * @return maximum number of write buffers. + * @see setMaxWriteBufferNumber() + */ + public int maxWriteBufferNumber() { + assert(isInitialized()); + return maxWriteBufferNumber(nativeHandle_); + } + + /** + * Approximate size of user data packed per block. Note that the + * block size specified here corresponds to uncompressed data. The + * actual size of the unit read from disk may be smaller if + * compression is enabled. This parameter can be changed dynamically. + * + * Default: 4K + * + * @param blockSize the size of each block in bytes. + * @return the instance of the current Options. + * @see RocksDB.open() + */ + public Options setBlockSize(long blockSize) { + assert(isInitialized()); + setBlockSize(nativeHandle_, blockSize); + return this; + } + + /** + * Returns the size of a block in bytes. + * + * @return block size. + * @see setBlockSize() + */ + public long blockSize() { + assert(isInitialized()); + return blockSize(nativeHandle_); + } + + /** + * Use the specified filter policy to reduce disk reads. + * + * Note that the caller should not dispose the input filter as + * Options.dispose() will dispose this filter. + * + * @param filter the filter policy java instance. + * @return the instance of the current Options.
+ * @see RocksDB.open() + */ + public Options setFilter(Filter filter) { + assert(isInitialized()); + setFilterHandle(nativeHandle_, filter.nativeHandle_); + filter_ = filter; + return this; + } + private native void setFilterHandle(long optHandle, long filterHandle); + + /** + * Disable compaction triggered by seek. + * With bloomfilter and fast storage, a miss on one level + * is very cheap if the file handle is cached in table cache + * (which is true if max_open_files is large). + * Default: true + * + * @param disableSeekCompaction a boolean value to specify whether + * to disable seek compaction. + * @return the instance of the current Options. + * @see RocksDB.open() + */ + public Options setDisableSeekCompaction(boolean disableSeekCompaction) { + assert(isInitialized()); + setDisableSeekCompaction(nativeHandle_, disableSeekCompaction); + return this; + } + + /** + * Returns true if disable seek compaction is set to true. + * + * @return true if disable seek compaction is set to true. + * @see setDisableSeekCompaction() + */ + public boolean disableSeekCompaction() { + assert(isInitialized()); + return disableSeekCompaction(nativeHandle_); + } + + /** + * Set the amount of cache in bytes that will be used by RocksDB. + * If cacheSize is non-positive, then cache will not be used. + * + * DEFAULT: 8M + * + * @param cacheSize the amount of cache in bytes. + * @return the instance of the current Options. + */ + public Options setCacheSize(long cacheSize) { + cacheSize_ = cacheSize; + return this; + } + + /** + * @return the amount of cache in bytes that will be used by RocksDB. + */ + public long cacheSize() { + return cacheSize_; + } + + /** + * If true, an error will be thrown during RocksDB.open() if the + * database already exists. + * + * @return if true, an error is raised when the specified database + * already exists before open. + */ + public boolean errorIfExists() { + assert(isInitialized()); + return errorIfExists(nativeHandle_); + } + private native boolean errorIfExists(long handle); + + /** + * If true, an error will be thrown during RocksDB.open() if the + * database already exists. + * Default: false + * + * @param errorIfExists if true, an exception will be thrown + * during RocksDB.open() if the database already exists. + * @return the reference to the current option. + * @see RocksDB.open() + */ + public Options setErrorIfExists(boolean errorIfExists) { + assert(isInitialized()); + setErrorIfExists(nativeHandle_, errorIfExists); + return this; + } + private native void setErrorIfExists(long handle, boolean errorIfExists); + + /** + * If true, the implementation will do aggressive checking of the + * data it is processing and will stop early if it detects any + * errors. This may have unforeseen ramifications: for example, a + * corruption of one DB entry may cause a large number of entries to + * become unreadable or for the entire DB to become unopenable. + * If any of the writes to the database fails (Put, Delete, Merge, Write), + * the database will switch to read-only mode and fail all other + * Write operations. + * + * @return a boolean indicating whether paranoid-check is on. + */ + public boolean paranoidChecks() { + assert(isInitialized()); + return paranoidChecks(nativeHandle_); + } + private native boolean paranoidChecks(long handle); + + /** + * If true, the implementation will do aggressive checking of the + * data it is processing and will stop early if it detects any + * errors.
This may have unforeseen ramifications: for example, a + * corruption of one DB entry may cause a large number of entries to + * become unreadable or for the entire DB to become unopenable. + * If any of the writes to the database fails (Put, Delete, Merge, Write), + * the database will switch to read-only mode and fail all other + * Write operations. + * Default: true + * + * @param paranoidChecks a flag to indicate whether paranoid-check + * is on. + * @return the reference to the current option. + */ + public Options setParanoidChecks(boolean paranoidChecks) { + assert(isInitialized()); + setParanoidChecks(nativeHandle_, paranoidChecks); + return this; + } + private native void setParanoidChecks( + long handle, boolean paranoidChecks); + + /** + * Number of open files that can be used by the DB. You may need to + * increase this if your database has a large working set. Value -1 means + * files opened are always kept open. You can estimate number of files based + * on target_file_size_base and target_file_size_multiplier for level-based + * compaction. For universal-style compaction, you can usually set it to -1. + * + * @return the maximum number of open files. + */ + public int maxOpenFiles() { + assert(isInitialized()); + return maxOpenFiles(nativeHandle_); + } + private native int maxOpenFiles(long handle); + + /** + * Number of open files that can be used by the DB. You may need to + * increase this if your database has a large working set. Value -1 means + * files opened are always kept open. You can estimate number of files based + * on target_file_size_base and target_file_size_multiplier for level-based + * compaction. For universal-style compaction, you can usually set it to -1. + * Default: 5000 + * + * @param maxOpenFiles the maximum number of open files. + * @return the reference to the current option. + */ + public Options setMaxOpenFiles(int maxOpenFiles) { + assert(isInitialized()); + setMaxOpenFiles(nativeHandle_, maxOpenFiles); + return this; + } + private native void setMaxOpenFiles(long handle, int maxOpenFiles); + + /** + * If true, then the contents of data files are not synced + * to stable storage. Their contents remain in the OS buffers till the + * OS decides to flush them. This option is good for bulk-loading + * of data. Once the bulk-loading is complete, please issue a + * sync to the OS to flush all dirty buffers to stable storage. + * + * @return if true, then data-sync is disabled. + */ + public boolean disableDataSync() { + assert(isInitialized()); + return disableDataSync(nativeHandle_); + } + private native boolean disableDataSync(long handle); + + /** + * If true, then the contents of data files are not synced + * to stable storage. Their contents remain in the OS buffers till the + * OS decides to flush them. This option is good for bulk-loading + * of data. Once the bulk-loading is complete, please issue a + * sync to the OS to flush all dirty buffers to stable storage. + * Default: false + * + * @param disableDataSync a boolean flag to specify whether to + * disable data sync. + * @return the reference to the current option. + */ + public Options setDisableDataSync(boolean disableDataSync) { + assert(isInitialized()); + setDisableDataSync(nativeHandle_, disableDataSync); + return this; + } + private native void setDisableDataSync(long handle, boolean disableDataSync); + + /** + * If true, then every store to stable storage will issue an fsync. + * If false, then every store to stable storage will issue a fdatasync.
+ * This parameter should be set to true while storing data to + * filesystems like ext3 that can lose files after a reboot. + * + * @return true if fsync is used. + */ + public boolean useFsync() { + assert(isInitialized()); + return useFsync(nativeHandle_); + } + private native boolean useFsync(long handle); + + /** + * If true, then every store to stable storage will issue an fsync. + * If false, then every store to stable storage will issue a fdatasync. + * This parameter should be set to true while storing data to + * filesystems like ext3 that can lose files after a reboot. + * Default: false + * + * @param useFsync a boolean flag to specify whether to use fsync + * @return the reference to the current option. + */ + public Options setUseFsync(boolean useFsync) { + assert(isInitialized()); + setUseFsync(nativeHandle_, useFsync); + return this; + } + private native void setUseFsync(long handle, boolean useFsync); + + /** + * The time interval in seconds between each two consecutive stats logs. + * This number controls how often a new scribe log about + * db deploy stats is written out. + * -1 indicates no logging at all. + * + * @return the time interval in seconds between each two consecutive + * stats logs. + */ + public int dbStatsLogInterval() { + assert(isInitialized()); + return dbStatsLogInterval(nativeHandle_); + } + private native int dbStatsLogInterval(long handle); + + /** + * The time interval in seconds between each two consecutive stats logs. + * This number controls how often a new scribe log about + * db deploy stats is written out. + * -1 indicates no logging at all. + * Default value is 1800 (half an hour). + * + * @param dbStatsLogInterval the time interval in seconds between each + * two consecutive stats logs. + * @return the reference to the current option. + */ + public Options setDbStatsLogInterval(int dbStatsLogInterval) { + assert(isInitialized()); + setDbStatsLogInterval(nativeHandle_, dbStatsLogInterval); + return this; + } + private native void setDbStatsLogInterval( + long handle, int dbStatsLogInterval); + + /** + * Returns the directory of info log. + * + * If it is empty, the log files will be in the same dir as data. + * If it is non-empty, the log files will be in the specified dir, + * and the db data dir's absolute path will be used as the log file + * name's prefix. + * + * @return the path to the info log directory + */ + public String dbLogDir() { + assert(isInitialized()); + return dbLogDir(nativeHandle_); + } + private native String dbLogDir(long handle); + + /** + * This specifies the info LOG dir. + * If it is empty, the log files will be in the same dir as data. + * If it is non-empty, the log files will be in the specified dir, + * and the db data dir's absolute path will be used as the log file + * name's prefix. + * + * @param dbLogDir the path to the info log directory + * @return the reference to the current option. + */ + public Options setDbLogDir(String dbLogDir) { + assert(isInitialized()); + setDbLogDir(nativeHandle_, dbLogDir); + return this; + } + private native void setDbLogDir(long handle, String dbLogDir); + + /** + * Returns the path to the write-ahead-logs (WAL) directory. + * + * If it is empty, the log files will be in the same dir as data, + * dbname is used as the data dir by default. + * If it is non-empty, the log files will be kept in the specified dir. + * When destroying the db, + * all log files in wal_dir and the dir itself are deleted. + * + * @return the path to the write-ahead-logs (WAL) directory.
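+ * + * (Illustrative) placing the WAL on a separate device from the data is a + * common use; the path below is hypothetical: + * options.setWalDir("/wal/disk/mydb");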
+ */ + public String walDir() { + assert(isInitialized()); + return walDir(nativeHandle_); + } + private native String walDir(long handle); + + /** + * This specifies the absolute dir path for write-ahead logs (WAL). + * If it is empty, the log files will be in the same dir as data, + * dbname is used as the data dir by default. + * If it is non-empty, the log files will be kept in the specified dir. + * When destroying the db, + * all log files in wal_dir and the dir itself are deleted. + * + * @param walDir the path to the write-ahead-log directory. + * @return the reference to the current option. + */ + public Options setWalDir(String walDir) { + assert(isInitialized()); + setWalDir(nativeHandle_, walDir); + return this; + } + private native void setWalDir(long handle, String walDir); + + /** + * The periodicity when obsolete files get deleted. The default + * value is 6 hours. The files that get out of scope by the compaction + * process will still get automatically deleted on every compaction, + * regardless of this setting. + * + * @return the time interval in micros when obsolete files will be deleted. + */ + public long deleteObsoleteFilesPeriodMicros() { + assert(isInitialized()); + return deleteObsoleteFilesPeriodMicros(nativeHandle_); + } + private native long deleteObsoleteFilesPeriodMicros(long handle); + + /** + * The periodicity when obsolete files get deleted. The default + * value is 6 hours. The files that get out of scope by the compaction + * process will still get automatically deleted on every compaction, + * regardless of this setting. + * + * @param micros the time interval in micros + * @return the reference to the current option. + */ + public Options setDeleteObsoleteFilesPeriodMicros(long micros) { + assert(isInitialized()); + setDeleteObsoleteFilesPeriodMicros(nativeHandle_, micros); + return this; + } + private native void setDeleteObsoleteFilesPeriodMicros( + long handle, long micros); + + /** + * Returns the maximum number of concurrent background compaction jobs, + * submitted to the default LOW priority thread pool. + * When increasing this number, we may also want to consider increasing + * number of threads in LOW priority thread pool. + * Default: 1 + * + * @return the maximum number of concurrent background compaction jobs. + * @see Env.setBackgroundThreads() + */ + public int maxBackgroundCompactions() { + assert(isInitialized()); + return maxBackgroundCompactions(nativeHandle_); + } + + /** + * Creates statistics object which collects metrics about database operations. + * Statistics objects should not be shared between DB instances as + * they do not use any locks to prevent concurrent updates. + * + * @return the instance of the current Options. + * @see RocksDB.open() + */ + public Options createStatistics() { + assert(isInitialized()); + createStatistics(nativeHandle_); + return this; + } + + /** + * Returns statistics object. Calls createStatistics() if the + * C++ side returns a NULL pointer for statistics. + * + * @return the instance of the statistics object. + * @see createStatistics() + */ + public Statistics statisticsPtr() { + assert(isInitialized()); + + long statsPtr = statisticsPtr(nativeHandle_); + if(statsPtr == 0) { + createStatistics(); + statsPtr = statisticsPtr(nativeHandle_); + } + + return new Statistics(statsPtr); + } + + /** + * Specifies the maximum number of concurrent background compaction jobs, + * submitted to the default LOW priority thread pool.
+ * If you're increasing this, also consider increasing number of threads in + * LOW priority thread pool. For more information, see + * Env.setBackgroundThreads(). + * Default: 1 + * + * @param maxBackgroundCompactions the maximum number of background + * compaction jobs. + * @return the reference to the current option. + * + * @see Env.setBackgroundThreads() + * @see maxBackgroundFlushes() + */ + public Options setMaxBackgroundCompactions(int maxBackgroundCompactions) { + assert(isInitialized()); + setMaxBackgroundCompactions(nativeHandle_, maxBackgroundCompactions); + return this; + } + + /** + * Returns the maximum number of concurrent background flush jobs. + * If you're increasing this, also consider increasing number of threads in + * HIGH priority thread pool. For more information, see + * Env.setBackgroundThreads(). + * Default: 1 + * + * @return the maximum number of concurrent background flush jobs. + * @see Env.setBackgroundThreads() + */ + public int maxBackgroundFlushes() { + assert(isInitialized()); + return maxBackgroundFlushes(nativeHandle_); + } + private native int maxBackgroundFlushes(long handle); + + /** + * Specifies the maximum number of concurrent background flush jobs. + * If you're increasing this, also consider increasing number of threads in + * HIGH priority thread pool. For more information, see + * Env.setBackgroundThreads(). + * Default: 1 + * + * @param maxBackgroundFlushes the maximum number of concurrent background + * flush jobs. + * @return the reference to the current option. + * + * @see Env.setBackgroundThreads() + * @see maxBackgroundCompactions() + */ + public Options setMaxBackgroundFlushes(int maxBackgroundFlushes) { + assert(isInitialized()); + setMaxBackgroundFlushes(nativeHandle_, maxBackgroundFlushes); + return this; + } + private native void setMaxBackgroundFlushes( + long handle, int maxBackgroundFlushes); + + /** + * Returns the maximum size of an info log file. If the current log file + * is larger than this size, a new info log file will be created. + * If 0, all logs will be written to one log file. + * + * @return the maximum size of the info log file. + */ + public long maxLogFileSize() { + assert(isInitialized()); + return maxLogFileSize(nativeHandle_); + } + private native long maxLogFileSize(long handle); + + /** + * Specifies the maximum size of an info log file. If the current log file + * is larger than `max_log_file_size`, a new info log file will + * be created. + * If 0, all logs will be written to one log file. + * + * @param maxLogFileSize the maximum size of an info log file. + * @return the reference to the current option. + */ + public Options setMaxLogFileSize(long maxLogFileSize) { + assert(isInitialized()); + setMaxLogFileSize(nativeHandle_, maxLogFileSize); + return this; + } + private native void setMaxLogFileSize(long handle, long maxLogFileSize); + + /** + * Returns the time interval for the info log file to roll (in seconds). + * If specified with non-zero value, log file will be rolled + * if it has been active longer than `log_file_time_to_roll`. + * Default: 0 (disabled) + * + * @return the time interval in seconds. + */ + public long logFileTimeToRoll() { + assert(isInitialized()); + return logFileTimeToRoll(nativeHandle_); + } + private native long logFileTimeToRoll(long handle); + + /** + * Specifies the time interval for the info log file to roll (in seconds). + * If specified with non-zero value, log file will be rolled + * if it has been active longer than `log_file_time_to_roll`. + * Default: 0 (disabled) + * + * @param logFileTimeToRoll the time interval in seconds. + * @return the reference to the current option.
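+ * + * For instance (illustrative), to roll the info log roughly once a day: + * options.setLogFileTimeToRoll(60 * 60 * 24);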
+ */ + public Options setLogFileTimeToRoll(long logFileTimeToRoll) { + assert(isInitialized()); + setLogFileTimeToRoll(nativeHandle_, logFileTimeToRoll); + return this; + } + private native void setLogFileTimeToRoll( + long handle, long logFileTimeToRoll); + + /** + * Returns the maximum number of info log files to be kept. + * Default: 1000 + * + * @return the maximum number of info log files to be kept. + */ + public long keepLogFileNum() { + assert(isInitialized()); + return keepLogFileNum(nativeHandle_); + } + private native long keepLogFileNum(long handle); + + /** + * Specifies the maximum number of info log files to be kept. + * Default: 1000 + * + * @param keepLogFileNum the maximum number of info log files to be kept. + * @return the reference to the current option. + */ + public Options setKeepLogFileNum(long keepLogFileNum) { + assert(isInitialized()); + setKeepLogFileNum(nativeHandle_, keepLogFileNum); + return this; + } + private native void setKeepLogFileNum(long handle, long keepLogFileNum); + + /** + * Manifest file is rolled over on reaching this limit. + * The older manifest file will be deleted. + * The default value is MAX_INT so that roll-over does not take place. + * + * @return the size limit of a manifest file. + */ + public long maxManifestFileSize() { + assert(isInitialized()); + return maxManifestFileSize(nativeHandle_); + } + private native long maxManifestFileSize(long handle); + + /** + * Manifest file is rolled over on reaching this limit. + * The older manifest file will be deleted. + * The default value is MAX_INT so that roll-over does not take place. + * + * @param maxManifestFileSize the size limit of a manifest file. + * @return the reference to the current option. + */ + public Options setMaxManifestFileSize(long maxManifestFileSize) { + assert(isInitialized()); + setMaxManifestFileSize(nativeHandle_, maxManifestFileSize); + return this; + } + private native void setMaxManifestFileSize( + long handle, long maxManifestFileSize); + + /** + * Number of shards used for table cache. + * + * @return the number of shards used for table cache. + */ + public int tableCacheNumshardbits() { + assert(isInitialized()); + return tableCacheNumshardbits(nativeHandle_); + } + private native int tableCacheNumshardbits(long handle); + + /** + * Number of shards used for table cache. + * + * @param tableCacheNumshardbits the number of shards + * @return the reference to the current option. + */ + public Options setTableCacheNumshardbits(int tableCacheNumshardbits) { + assert(isInitialized()); + setTableCacheNumshardbits(nativeHandle_, tableCacheNumshardbits); + return this; + } + private native void setTableCacheNumshardbits( + long handle, int tableCacheNumshardbits); + + /** + * During data eviction of table's LRU cache, it would be inefficient + * to strictly follow LRU because this piece of memory will not really + * be released unless its refcount falls to zero. Instead, make two + * passes: the first pass will release items with refcount = 1, + * and if not enough space is released after scanning the number of + * elements specified by this parameter, we will remove items in LRU + * order.
+ * + * @return scan count limit + */ + public int tableCacheRemoveScanCountLimit() { + assert(isInitialized()); + return tableCacheRemoveScanCountLimit(nativeHandle_); + } + private native int tableCacheRemoveScanCountLimit(long handle); + + /** + * During data eviction of table's LRU cache, it would be inefficient + * to strictly follow LRU because this piece of memory will not really + * be released unless its refcount falls to zero. Instead, make two + * passes: the first pass will release items with refcount = 1, + * and if not enough space is released after scanning the number of + * elements specified by this parameter, we will remove items in LRU + * order. + * + * @param limit scan count limit + * @return the reference to the current option. + */ + public Options setTableCacheRemoveScanCountLimit(int limit) { + assert(isInitialized()); + setTableCacheRemoveScanCountLimit(nativeHandle_, limit); + return this; + } + private native void setTableCacheRemoveScanCountLimit( + long handle, int limit); + + /** + * WalTtlSeconds() and walSizeLimitMB() affect how archived logs + * will be deleted. + * 1. If both set to 0, logs will be deleted asap and will not get into + * the archive. + * 2. If WAL_ttl_seconds is 0 and WAL_size_limit_MB is not 0, + * WAL files will be checked every 10 min and if total size is greater + * than WAL_size_limit_MB, they will be deleted starting with the + * earliest until size_limit is met. All empty files will be deleted. + * 3. If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then + * WAL files will be checked every WAL_ttl_seconds / 2 and those that + * are older than WAL_ttl_seconds will be deleted. + * 4. If both are not 0, WAL files will be checked every 10 min and both + * checks will be performed with ttl being first. + * + * @return the wal-ttl seconds + * @see walSizeLimitMB() + */ + public long walTtlSeconds() { + assert(isInitialized()); + return walTtlSeconds(nativeHandle_); + } + private native long walTtlSeconds(long handle); + + /** + * WalTtlSeconds() and walSizeLimitMB() affect how archived logs + * will be deleted. + * 1. If both set to 0, logs will be deleted asap and will not get into + * the archive. + * 2. If WAL_ttl_seconds is 0 and WAL_size_limit_MB is not 0, + * WAL files will be checked every 10 min and if total size is greater + * than WAL_size_limit_MB, they will be deleted starting with the + * earliest until size_limit is met. All empty files will be deleted. + * 3. If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then + * WAL files will be checked every WAL_ttl_seconds / 2 and those that + * are older than WAL_ttl_seconds will be deleted. + * 4. If both are not 0, WAL files will be checked every 10 min and both + * checks will be performed with ttl being first. + * + * @param walTtlSeconds the ttl seconds + * @return the reference to the current option. + * @see setWalSizeLimitMB() + */ + public Options setWalTtlSeconds(long walTtlSeconds) { + assert(isInitialized()); + setWalTtlSeconds(nativeHandle_, walTtlSeconds); + return this; + } + private native void setWalTtlSeconds(long handle, long walTtlSeconds); + + /** + * WalTtlSeconds() and walSizeLimitMB() affect how archived logs + * will be deleted. + * 1. If both set to 0, logs will be deleted asap and will not get into + * the archive. + * 2.
If WAL_ttl_seconds is 0 and WAL_size_limit_MB is not 0, + * WAL files will be checked every 10 min and if total size is greater + * than WAL_size_limit_MB, they will be deleted starting with the + * earliest until size_limit is met. All empty files will be deleted. + * 3. If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then + * WAL files will be checked every WAL_ttl_seconds / 2 and those that + * are older than WAL_ttl_seconds will be deleted. + * 4. If both are not 0, WAL files will be checked every 10 min and both + * checks will be performed with ttl being first. + * + * @return size limit in mega-bytes. + * @see walTtlSeconds() + */ + public long walSizeLimitMB() { + assert(isInitialized()); + return walSizeLimitMB(nativeHandle_); + } + private native long walSizeLimitMB(long handle); + + /** + * WalTtlSeconds() and walSizeLimitMB() affect how archived logs + * will be deleted. + * 1. If both set to 0, logs will be deleted asap and will not get into + * the archive. + * 2. If WAL_ttl_seconds is 0 and WAL_size_limit_MB is not 0, + * WAL files will be checked every 10 min and if total size is greater + * than WAL_size_limit_MB, they will be deleted starting with the + * earliest until size_limit is met. All empty files will be deleted. + * 3. If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then + * WAL files will be checked every WAL_ttl_seconds / 2 and those that + * are older than WAL_ttl_seconds will be deleted. + * 4. If both are not 0, WAL files will be checked every 10 min and both + * checks will be performed with ttl being first. + * + * @param sizeLimitMB size limit in mega-bytes. + * @return the reference to the current option. + * @see setWalTtlSeconds() + */ + public Options setWalSizeLimitMB(long sizeLimitMB) { + assert(isInitialized()); + setWalSizeLimitMB(nativeHandle_, sizeLimitMB); + return this; + } + private native void setWalSizeLimitMB(long handle, long sizeLimitMB); + + /** + * Number of bytes to preallocate (via fallocate) the manifest + * files. Default is 4mb, which is reasonable to reduce random IO + * as well as prevent overallocation for mounts that preallocate + * large amounts of data (such as xfs's allocsize option). + * + * @return size in bytes. + */ + public long manifestPreallocationSize() { + assert(isInitialized()); + return manifestPreallocationSize(nativeHandle_); + } + private native long manifestPreallocationSize(long handle); + + /** + * Number of bytes to preallocate (via fallocate) the manifest + * files. Default is 4mb, which is reasonable to reduce random IO + * as well as prevent overallocation for mounts that preallocate + * large amounts of data (such as xfs's allocsize option). + * + * @param size the size in bytes + * @return the reference to the current option. + */ + public Options setManifestPreallocationSize(long size) { + assert(isInitialized()); + setManifestPreallocationSize(nativeHandle_, size); + return this; + } + private native void setManifestPreallocationSize( + long handle, long size); + + /** + * Data being read from file storage may be buffered in the OS. + * Default: true + * + * @return if true, then OS buffering is allowed. + */ + public boolean allowOsBuffer() { + assert(isInitialized()); + return allowOsBuffer(nativeHandle_); + } + private native boolean allowOsBuffer(long handle); + + /** + * Data being read from file storage may be buffered in the OS. + * Default: true + * + * @param allowOsBuffer if true, then OS buffering is allowed. + * @return the reference to the current option.
+ */ + public Options setAllowOsBuffer(boolean allowOsBuffer) { + assert(isInitialized()); + setAllowOsBuffer(nativeHandle_, allowOsBuffer); + return this; + } + private native void setAllowOsBuffer( + long handle, boolean allowOsBuffer); + + /** + * Allow the OS to mmap file for reading sst tables. + * Default: false + * + * @return true if mmap reads are allowed. + */ + public boolean allowMmapReads() { + assert(isInitialized()); + return allowMmapReads(nativeHandle_); + } + private native boolean allowMmapReads(long handle); + + /** + * Allow the OS to mmap file for reading sst tables. + * Default: false + * + * @param allowMmapReads true if mmap reads are allowed. + * @return the reference to the current option. + */ + public Options setAllowMmapReads(boolean allowMmapReads) { + assert(isInitialized()); + setAllowMmapReads(nativeHandle_, allowMmapReads); + return this; + } + private native void setAllowMmapReads( + long handle, boolean allowMmapReads); + + /** + * Allow the OS to mmap file for writing. Default: false + * + * @return true if mmap writes are allowed. + */ + public boolean allowMmapWrites() { + assert(isInitialized()); + return allowMmapWrites(nativeHandle_); + } + private native boolean allowMmapWrites(long handle); + + /** + * Allow the OS to mmap file for writing. Default: false + * + * @param allowMmapWrites true if mmap writes are allowed. + * @return the reference to the current option. + */ + public Options setAllowMmapWrites(boolean allowMmapWrites) { + assert(isInitialized()); + setAllowMmapWrites(nativeHandle_, allowMmapWrites); + return this; + } + private native void setAllowMmapWrites( + long handle, boolean allowMmapWrites); + + /** + * Disable child processes from inheriting open files. Default: true + * + * @return true if child process inheriting open files is disabled. + */ + public boolean isFdCloseOnExec() { + assert(isInitialized()); + return isFdCloseOnExec(nativeHandle_); + } + private native boolean isFdCloseOnExec(long handle); + + /** + * Disable child processes from inheriting open files. Default: true + * + * @param isFdCloseOnExec true if child process inheriting open + * files is disabled. + * @return the reference to the current option. + */ + public Options setIsFdCloseOnExec(boolean isFdCloseOnExec) { + assert(isInitialized()); + setIsFdCloseOnExec(nativeHandle_, isFdCloseOnExec); + return this; + } + private native void setIsFdCloseOnExec( + long handle, boolean isFdCloseOnExec); + + /** + * Skip log corruption error on recovery (If client is ok with + * losing most recent changes) + * Default: false + * + * @return true if log corruption errors are skipped during recovery. + */ + public boolean skipLogErrorOnRecovery() { + assert(isInitialized()); + return skipLogErrorOnRecovery(nativeHandle_); + } + private native boolean skipLogErrorOnRecovery(long handle); + + /** + * Skip log corruption error on recovery (If client is ok with + * losing most recent changes) + * Default: false + * + * @param skip true if log corruption errors are skipped during recovery. + * @return the reference to the current option. + */ + public Options setSkipLogErrorOnRecovery(boolean skip) { + assert(isInitialized()); + setSkipLogErrorOnRecovery(nativeHandle_, skip); + return this; + } + private native void setSkipLogErrorOnRecovery( + long handle, boolean skip); + + /** + * If not zero, dump rocksdb.stats to LOG every stats_dump_period_sec seconds. + * Default: 3600 (1 hour) + * + * @return time interval in seconds.
+ */ + public int statsDumpPeriodSec() { + assert(isInitialized()); + return statsDumpPeriodSec(nativeHandle_); + } + private native int statsDumpPeriodSec(long handle); + + /** + * If not zero, dump rocksdb.stats to LOG every stats_dump_period_sec seconds. + * Default: 3600 (1 hour) + * + * @param statsDumpPeriodSec time interval in seconds. + * @return the reference to the current option. + */ + public Options setStatsDumpPeriodSec(int statsDumpPeriodSec) { + assert(isInitialized()); + setStatsDumpPeriodSec(nativeHandle_, statsDumpPeriodSec); + return this; + } + private native void setStatsDumpPeriodSec( + long handle, int statsDumpPeriodSec); + + /** + * If set true, will hint the underlying file system that the file + * access pattern is random, when an sst file is opened. + * Default: true + * + * @return true if hinting random access is on. + */ + public boolean adviseRandomOnOpen() { + return adviseRandomOnOpen(nativeHandle_); + } + private native boolean adviseRandomOnOpen(long handle); + + /** + * If set true, will hint the underlying file system that the file + * access pattern is random, when an sst file is opened. + * Default: true + * + * @param adviseRandomOnOpen true if hinting random access is on. + * @return the reference to the current option. + */ + public Options setAdviseRandomOnOpen(boolean adviseRandomOnOpen) { + assert(isInitialized()); + setAdviseRandomOnOpen(nativeHandle_, adviseRandomOnOpen); + return this; + } + private native void setAdviseRandomOnOpen( + long handle, boolean adviseRandomOnOpen); + + /** + * Use adaptive mutex, which spins in the user space before resorting + * to the kernel. This could reduce context switch when the mutex is not + * heavily contended. However, if the mutex is hot, we could end up + * wasting spin time. + * Default: false + * + * @return true if adaptive mutex is used. + */ + public boolean useAdaptiveMutex() { + assert(isInitialized()); + return useAdaptiveMutex(nativeHandle_); + } + private native boolean useAdaptiveMutex(long handle); + + /** + * Use adaptive mutex, which spins in the user space before resorting + * to the kernel. This could reduce context switch when the mutex is not + * heavily contended. However, if the mutex is hot, we could end up + * wasting spin time. + * Default: false + * + * @param useAdaptiveMutex true if adaptive mutex is used. + * @return the reference to the current option. + */ + public Options setUseAdaptiveMutex(boolean useAdaptiveMutex) { + assert(isInitialized()); + setUseAdaptiveMutex(nativeHandle_, useAdaptiveMutex); + return this; + } + private native void setUseAdaptiveMutex( + long handle, boolean useAdaptiveMutex); + + /** + * Allows OS to incrementally sync files to disk while they are being + * written, asynchronously, in the background. + * Issue one request for every bytes_per_sync written. 0 turns it off. + * Default: 0 + * + * @return size in bytes + */ + public long bytesPerSync() { + return bytesPerSync(nativeHandle_); + } + private native long bytesPerSync(long handle); + + /** + * Allows OS to incrementally sync files to disk while they are being + * written, asynchronously, in the background. + * Issue one request for every bytes_per_sync written. 0 turns it off. + * Default: 0 + * + * @param bytesPerSync size in bytes + * @return the reference to the current option.
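+ * + * (Illustrative) to issue a background sync roughly once per megabyte + * written: options.setBytesPerSync(1024 * 1024);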
+   */
+  public Options setBytesPerSync(long bytesPerSync) {
+    assert(isInitialized());
+    setBytesPerSync(nativeHandle_, bytesPerSync);
+    return this;
+  }
+  private native void setBytesPerSync(
+      long handle, long bytesPerSync);
+
+  /**
+   * Allow RocksDB to use thread local storage to optimize performance.
+   * Default: true
+   *
+   * @return true if thread-local storage is allowed.
+   */
+  public boolean allowThreadLocal() {
+    assert(isInitialized());
+    return allowThreadLocal(nativeHandle_);
+  }
+  private native boolean allowThreadLocal(long handle);
+
+  /**
+   * Allow RocksDB to use thread local storage to optimize performance.
+   * Default: true
+   *
+   * @param allowThreadLocal true if thread-local storage is allowed.
+   * @return the reference to the current option.
+   */
+  public Options setAllowThreadLocal(boolean allowThreadLocal) {
+    assert(isInitialized());
+    setAllowThreadLocal(nativeHandle_, allowThreadLocal);
+    return this;
+  }
+  private native void setAllowThreadLocal(
+      long handle, boolean allowThreadLocal);
+
+  /**
+   * Set the config for mem-table.
+   *
+   * @param config the mem-table config.
+   * @return the instance of the current Options.
+   */
+  public Options setMemTableConfig(MemTableConfig config) {
+    setMemTableFactory(nativeHandle_, config.newMemTableFactoryHandle());
+    return this;
+  }
+
+  /**
+   * Returns the name of the current mem table representation.
+   * The memtable format can be set using setMemTableConfig.
+   *
+   * @return the name of the currently-used memtable factory.
+   * @see setMemTableConfig()
+   */
+  public String memTableFactoryName() {
+    assert(isInitialized());
+    return memTableFactoryName(nativeHandle_);
+  }
+
+  /**
+   * Set the config for table format.
+   *
+   * @param config the table format config.
+   * @return the reference of the current Options.
+   */
+  public Options setTableFormatConfig(TableFormatConfig config) {
+    setTableFactory(nativeHandle_, config.newTableFactoryHandle());
+    return this;
+  }
+
+  /**
+   * @return the name of the currently used table factory.
+   */
+  public String tableFactoryName() {
+    assert(isInitialized());
+    return tableFactoryName(nativeHandle_);
+  }
+
+  /**
+   * This prefix-extractor uses the first n bytes of a key as its prefix.
+   *
+   * In some hash-based memtable representations such as HashLinkedList
+   * and HashSkipList, prefixes are used to partition the keys into
+   * several buckets. The prefix extractor specifies how to
+   * extract the prefix given a key.
+   *
+   * @param n use the first n bytes of a key as its prefix.
+   * @return the reference to the current option.
+   */
+  public Options useFixedLengthPrefixExtractor(int n) {
+    assert(isInitialized());
+    useFixedLengthPrefixExtractor(nativeHandle_, n);
+    return this;
+  }
+
+  ///////////////////////////////////////////////////////////////////////
+  /**
+   * Number of keys between restart points for delta encoding of keys.
+   * This parameter can be changed dynamically. Most clients should
+   * leave this parameter alone.
+   * Default: 16
+   *
+   * @return the number of keys between restart points.
+   */
+  public int blockRestartInterval() {
+    return blockRestartInterval(nativeHandle_);
+  }
+  private native int blockRestartInterval(long handle);
+
+  /**
+   * Number of keys between restart points for delta encoding of keys.
+   * This parameter can be changed dynamically. Most clients should
+   * leave this parameter alone.
+   * Default: 16
+   *
+   * @param blockRestartInterval the number of keys between restart points.
+   * @return the reference to the current option.
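+   *
+   * <p>Sketch (illustration only): a smaller interval trades block space
+   * for cheaper point lookups inside a block.
+   * <pre>
+   *   Options opts = new Options().setBlockRestartInterval(4);
+   * </pre>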
+   */
+  public Options setBlockRestartInterval(int blockRestartInterval) {
+    setBlockRestartInterval(nativeHandle_, blockRestartInterval);
+    return this;
+  }
+  private native void setBlockRestartInterval(
+      long handle, int blockRestartInterval);
+
+  /**
+   * If true, place whole keys in the filter (not just prefixes).
+   * This must generally be true for gets to be efficient.
+   * Default: true
+   *
+   * @return if true, then whole-key-filtering is on.
+   */
+  public boolean wholeKeyFiltering() {
+    return wholeKeyFiltering(nativeHandle_);
+  }
+  private native boolean wholeKeyFiltering(long handle);
+
+  /**
+   * If true, place whole keys in the filter (not just prefixes).
+   * This must generally be true for gets to be efficient.
+   * Default: true
+   *
+   * @param wholeKeyFiltering if true, then whole-key-filtering is on.
+   * @return the reference to the current option.
+   */
+  public Options setWholeKeyFiltering(boolean wholeKeyFiltering) {
+    setWholeKeyFiltering(nativeHandle_, wholeKeyFiltering);
+    return this;
+  }
+  private native void setWholeKeyFiltering(
+      long handle, boolean wholeKeyFiltering);
+
+  /**
+   * If level-styled compaction is used, then this number determines
+   * the total number of levels.
+   *
+   * @return the number of levels.
+   */
+  public int numLevels() {
+    return numLevels(nativeHandle_);
+  }
+  private native int numLevels(long handle);
+
+  /**
+   * Set the number of levels for this database.
+   * If level-styled compaction is used, then this number determines
+   * the total number of levels.
+   *
+   * @param numLevels the number of levels.
+   * @return the reference to the current option.
+   */
+  public Options setNumLevels(int numLevels) {
+    setNumLevels(nativeHandle_, numLevels);
+    return this;
+  }
+  private native void setNumLevels(
+      long handle, int numLevels);
+
+  /**
+   * The number of files in level 0 to trigger compaction from level-0 to
+   * level-1. A value < 0 means that level-0 compaction will not be
+   * triggered by the number of files at all.
+   * Default: 4
+   *
+   * @return the number of files in level 0 to trigger compaction.
+   */
+  public int levelZeroFileNumCompactionTrigger() {
+    return levelZeroFileNumCompactionTrigger(nativeHandle_);
+  }
+  private native int levelZeroFileNumCompactionTrigger(long handle);
+
+  /**
+   * Number of files to trigger level-0 compaction. A value < 0 means that
+   * level-0 compaction will not be triggered by the number of files at all.
+   * Default: 4
+   *
+   * @param numFiles the number of files in level-0 to trigger compaction.
+   * @return the reference to the current option.
+   */
+  public Options setLevelZeroFileNumCompactionTrigger(
+      int numFiles) {
+    setLevelZeroFileNumCompactionTrigger(
+        nativeHandle_, numFiles);
+    return this;
+  }
+  private native void setLevelZeroFileNumCompactionTrigger(
+      long handle, int numFiles);
+
+  /**
+   * Soft limit on the number of level-0 files. We start slowing down writes
+   * at this point. A value < 0 means that no write slowdown will be
+   * triggered by the number of files in level-0.
+   *
+   * @return the soft limit on the number of level-0 files.
+   */
+  public int levelZeroSlowdownWritesTrigger() {
+    return levelZeroSlowdownWritesTrigger(nativeHandle_);
+  }
+  private native int levelZeroSlowdownWritesTrigger(long handle);
+
+  /**
+   * Soft limit on the number of level-0 files. We start slowing down writes
+   * at this point. A value < 0 means that no write slowdown will be
+   * triggered by the number of files in level-0.
+   *
+   * @param numFiles soft limit on the number of level-0 files.
+   * @return the reference to the current option.
+   */
+  public Options setLevelZeroSlowdownWritesTrigger(
+      int numFiles) {
+    setLevelZeroSlowdownWritesTrigger(nativeHandle_, numFiles);
+    return this;
+  }
+  private native void setLevelZeroSlowdownWritesTrigger(
+      long handle, int numFiles);
+
+  /**
+   * Maximum number of level-0 files. We stop writes at this point.
+   *
+   * @return the hard limit on the number of level-0 files.
+   */
+  public int levelZeroStopWritesTrigger() {
+    return levelZeroStopWritesTrigger(nativeHandle_);
+  }
+  private native int levelZeroStopWritesTrigger(long handle);
+
+  /**
+   * Maximum number of level-0 files. We stop writes at this point.
+   *
+   * @param numFiles the hard limit on the number of level-0 files.
+   * @return the reference to the current option.
+   */
+  public Options setLevelZeroStopWritesTrigger(int numFiles) {
+    setLevelZeroStopWritesTrigger(nativeHandle_, numFiles);
+    return this;
+  }
+  private native void setLevelZeroStopWritesTrigger(
+      long handle, int numFiles);
+
+  /**
+   * The highest level to which a new compacted memtable is pushed if it
+   * does not create overlap. We try to push to level 2 to avoid the
+   * relatively expensive level 0=>1 compactions and to avoid some
+   * expensive manifest file operations. We do not push all the way to
+   * the largest level since that can generate a lot of wasted disk
+   * space if the same key space is being repeatedly overwritten.
+   *
+   * @return the highest level where a new compacted memtable will be pushed.
+   */
+  public int maxMemCompactionLevel() {
+    return maxMemCompactionLevel(nativeHandle_);
+  }
+  private native int maxMemCompactionLevel(long handle);
+
+  /**
+   * The highest level to which a new compacted memtable is pushed if it
+   * does not create overlap. We try to push to level 2 to avoid the
+   * relatively expensive level 0=>1 compactions and to avoid some
+   * expensive manifest file operations. We do not push all the way to
+   * the largest level since that can generate a lot of wasted disk
+   * space if the same key space is being repeatedly overwritten.
+   *
+   * @param maxMemCompactionLevel the highest level to which a new compacted
+   *     mem-table will be pushed.
+   * @return the reference to the current option.
+   */
+  public Options setMaxMemCompactionLevel(int maxMemCompactionLevel) {
+    setMaxMemCompactionLevel(nativeHandle_, maxMemCompactionLevel);
+    return this;
+  }
+  private native void setMaxMemCompactionLevel(
+      long handle, int maxMemCompactionLevel);
+
+  /**
+   * The target file size for compaction.
+   * This targetFileSizeBase determines a level-1 file size.
+   * The target file size for level L can be calculated by
+   * targetFileSizeBase * (targetFileSizeMultiplier ^ (L-1)).
+   * For example, if targetFileSizeBase is 2MB and
+   * target_file_size_multiplier is 10, then each file on level-1 will
+   * be 2MB, each file on level-2 will be 20MB,
+   * and each file on level-3 will be 200MB.
+   * By default targetFileSizeBase is 2MB.
+   *
+   * @return the target size of a level-1 file.
+   *
+   * @see targetFileSizeMultiplier()
+   */
+  public int targetFileSizeBase() {
+    return targetFileSizeBase(nativeHandle_);
+  }
+  private native int targetFileSizeBase(long handle);
+
+  /**
+   * The target file size for compaction.
+   * This targetFileSizeBase determines a level-1 file size.
+   * The target file size for level L can be calculated by
+   * targetFileSizeBase * (targetFileSizeMultiplier ^ (L-1)).
+   * For example, if targetFileSizeBase is 2MB and
+   * target_file_size_multiplier is 10, then each file on level-1 will
+   * be 2MB, each file on level-2 will be 20MB,
+   * and each file on level-3 will be 200MB.
+   * By default targetFileSizeBase is 2MB.
+   *
+   * @param targetFileSizeBase the target size of a level-1 file.
+   * @return the reference to the current option.
+   *
+   * @see setTargetFileSizeMultiplier()
+   */
+  public Options setTargetFileSizeBase(int targetFileSizeBase) {
+    setTargetFileSizeBase(nativeHandle_, targetFileSizeBase);
+    return this;
+  }
+  private native void setTargetFileSizeBase(
+      long handle, int targetFileSizeBase);
+
+  /**
+   * targetFileSizeMultiplier defines the size ratio between a
+   * level-(L+1) file and a level-L file.
+   * By default targetFileSizeMultiplier is 1, meaning
+   * files in different levels have the same target.
+   *
+   * @return the size ratio between a level-(L+1) file and a level-L file.
+   */
+  public int targetFileSizeMultiplier() {
+    return targetFileSizeMultiplier(nativeHandle_);
+  }
+  private native int targetFileSizeMultiplier(long handle);
+
+  /**
+   * targetFileSizeMultiplier defines the size ratio between a
+   * level-(L+1) file and a level-L file.
+   * By default target_file_size_multiplier is 1, meaning
+   * files in different levels have the same target.
+   *
+   * @param multiplier the size ratio between a level-(L+1) file
+   *     and a level-L file.
+   * @return the reference to the current option.
+   */
+  public Options setTargetFileSizeMultiplier(int multiplier) {
+    setTargetFileSizeMultiplier(nativeHandle_, multiplier);
+    return this;
+  }
+  private native void setTargetFileSizeMultiplier(
+      long handle, int multiplier);
+
+  /**
+   * The upper-bound of the total size of level-1 files in bytes.
+   * The maximum number of bytes for level L can be calculated as
+   * (maxBytesForLevelBase) * (maxBytesForLevelMultiplier ^ (L-1)).
+   * For example, if maxBytesForLevelBase is 20MB, and if
+   * max_bytes_for_level_multiplier is 10, the total data size for level-1
+   * will be 20MB, the total file size for level-2 will be 200MB,
+   * and the total file size for level-3 will be 2GB.
+   * By default 'maxBytesForLevelBase' is 10MB.
+   *
+   * @return the upper-bound of the total size of level-1 files in bytes.
+   * @see maxBytesForLevelMultiplier()
+   */
+  public long maxBytesForLevelBase() {
+    return maxBytesForLevelBase(nativeHandle_);
+  }
+  private native long maxBytesForLevelBase(long handle);
+
+  /**
+   * The upper-bound of the total size of level-1 files in bytes.
+   * The maximum number of bytes for level L can be calculated as
+   * (maxBytesForLevelBase) * (maxBytesForLevelMultiplier ^ (L-1)).
+   * For example, if maxBytesForLevelBase is 20MB, and if
+   * max_bytes_for_level_multiplier is 10, the total data size for level-1
+   * will be 20MB, the total file size for level-2 will be 200MB,
+   * and the total file size for level-3 will be 2GB.
+   * By default 'maxBytesForLevelBase' is 10MB.
+   *
+   * @param maxBytesForLevelBase the upper-bound of the total size of
+   *     level-1 files in bytes.
+   * @return the reference to the current option.
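+   *
+   * <p>Worked sketch (illustration only): with a 20MB base and a
+   * multiplier of 10, level-1 holds 20MB, level-2 200MB, level-3 2GB.
+   * <pre>
+   *   Options opts = new Options()
+   *       .setMaxBytesForLevelBase(20 * 1024 * 1024)
+   *       .setMaxBytesForLevelMultiplier(10);
+   * </pre>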
+   * @see setMaxBytesForLevelMultiplier()
+   */
+  public Options setMaxBytesForLevelBase(long maxBytesForLevelBase) {
+    setMaxBytesForLevelBase(nativeHandle_, maxBytesForLevelBase);
+    return this;
+  }
+  private native void setMaxBytesForLevelBase(
+      long handle, long maxBytesForLevelBase);
+
+  /**
+   * The ratio between the total size of level-(L+1) files and the total
+   * size of level-L files for all L.
+   * DEFAULT: 10
+   *
+   * @return the ratio between the total size of level-(L+1) files and
+   *     the total size of level-L files for all L.
+   * @see maxBytesForLevelBase()
+   */
+  public int maxBytesForLevelMultiplier() {
+    return maxBytesForLevelMultiplier(nativeHandle_);
+  }
+  private native int maxBytesForLevelMultiplier(long handle);
+
+  /**
+   * The ratio between the total size of level-(L+1) files and the total
+   * size of level-L files for all L.
+   * DEFAULT: 10
+   *
+   * @param multiplier the ratio between the total size of level-(L+1)
+   *     files and the total size of level-L files for all L.
+   * @return the reference to the current option.
+   * @see setMaxBytesForLevelBase()
+   */
+  public Options setMaxBytesForLevelMultiplier(int multiplier) {
+    setMaxBytesForLevelMultiplier(nativeHandle_, multiplier);
+    return this;
+  }
+  private native void setMaxBytesForLevelMultiplier(
+      long handle, int multiplier);
+
+  /**
+   * Maximum number of bytes in all compacted files. We avoid expanding
+   * the lower level file set of a compaction if it would make the
+   * total compaction cover more than
+   * (expanded_compaction_factor * targetFileSizeLevel()) many bytes.
+   *
+   * @return the maximum number of bytes in all compacted files.
+   * @see sourceCompactionFactor()
+   */
+  public int expandedCompactionFactor() {
+    return expandedCompactionFactor(nativeHandle_);
+  }
+  private native int expandedCompactionFactor(long handle);
+
+  /**
+   * Maximum number of bytes in all compacted files. We avoid expanding
+   * the lower level file set of a compaction if it would make the
+   * total compaction cover more than
+   * (expanded_compaction_factor * targetFileSizeLevel()) many bytes.
+   *
+   * @param expandedCompactionFactor the maximum number of bytes in all
+   *     compacted files.
+   * @return the reference to the current option.
+   * @see setSourceCompactionFactor()
+   */
+  public Options setExpandedCompactionFactor(int expandedCompactionFactor) {
+    setExpandedCompactionFactor(nativeHandle_, expandedCompactionFactor);
+    return this;
+  }
+  private native void setExpandedCompactionFactor(
+      long handle, int expandedCompactionFactor);
+
+  /**
+   * Maximum number of bytes in all source files to be compacted in a
+   * single compaction run. We avoid picking too many files in the
+   * source level so that the total source bytes for the compaction
+   * do not exceed
+   * (source_compaction_factor * targetFileSizeLevel()) many bytes.
+   * Default: 1, i.e. pick maxfilesize amount of data as the source of
+   * a compaction.
+   *
+   * @return the maximum number of bytes in all source files to be compacted.
+   * @see expandedCompactionFactor()
+   */
+  public int sourceCompactionFactor() {
+    return sourceCompactionFactor(nativeHandle_);
+  }
+  private native int sourceCompactionFactor(long handle);
+
+  /**
+   * Maximum number of bytes in all source files to be compacted in a
+   * single compaction run. We avoid picking too many files in the
+   * source level so that the total source bytes for the compaction
+   * do not exceed
+   * (source_compaction_factor * targetFileSizeLevel()) many bytes.
+   * Default: 1, i.e. pick maxfilesize amount of data as the source of
+   * a compaction.
+   *
+   * @param sourceCompactionFactor the maximum number of bytes in all
+   *     source files to be compacted in a single compaction run.
+   * @return the reference to the current option.
+   * @see setExpandedCompactionFactor()
+   */
+  public Options setSourceCompactionFactor(int sourceCompactionFactor) {
+    setSourceCompactionFactor(nativeHandle_, sourceCompactionFactor);
+    return this;
+  }
+  private native void setSourceCompactionFactor(
+      long handle, int sourceCompactionFactor);
+
+  /**
+   * Control the maximum bytes of overlap in the grandparent level (i.e.,
+   * level+2) before we stop building a single file in a level->level+1
+   * compaction.
+   *
+   * @return maximum bytes of overlap in the "grandparent" level.
+   */
+  public int maxGrandparentOverlapFactor() {
+    return maxGrandparentOverlapFactor(nativeHandle_);
+  }
+  private native int maxGrandparentOverlapFactor(long handle);
+
+  /**
+   * Control the maximum bytes of overlap in the grandparent level (i.e.,
+   * level+2) before we stop building a single file in a level->level+1
+   * compaction.
+   *
+   * @param maxGrandparentOverlapFactor maximum bytes of overlap in the
+   *     "grandparent" level.
+   * @return the reference to the current option.
+   */
+  public Options setMaxGrandparentOverlapFactor(
+      int maxGrandparentOverlapFactor) {
+    setMaxGrandparentOverlapFactor(nativeHandle_, maxGrandparentOverlapFactor);
+    return this;
+  }
+  private native void setMaxGrandparentOverlapFactor(
+      long handle, int maxGrandparentOverlapFactor);
+
+  /**
+   * Puts are delayed 0-1 ms when any level has a compaction score that
+   * exceeds soft_rate_limit. This is ignored when == 0.0.
+   * CONSTRAINT: soft_rate_limit <= hard_rate_limit. If this constraint does
+   * not hold, RocksDB will set soft_rate_limit = hard_rate_limit.
+   * Default: 0 (disabled)
+   *
+   * @return the soft-rate-limit for put delay.
+   */
+  public double softRateLimit() {
+    return softRateLimit(nativeHandle_);
+  }
+  private native double softRateLimit(long handle);
+
+  /**
+   * Puts are delayed 0-1 ms when any level has a compaction score that
+   * exceeds soft_rate_limit. This is ignored when == 0.0.
+   * CONSTRAINT: soft_rate_limit <= hard_rate_limit. If this constraint does
+   * not hold, RocksDB will set soft_rate_limit = hard_rate_limit.
+   * Default: 0 (disabled)
+   *
+   * @param softRateLimit the soft-rate-limit of a compaction score
+   *     for put delay.
+   * @return the reference to the current option.
+   */
+  public Options setSoftRateLimit(double softRateLimit) {
+    setSoftRateLimit(nativeHandle_, softRateLimit);
+    return this;
+  }
+  private native void setSoftRateLimit(
+      long handle, double softRateLimit);
+
+  /**
+   * Puts are delayed 1ms at a time when any level has a compaction score
+   * that exceeds hard_rate_limit. This is ignored when <= 1.0.
+   * Default: 0 (disabled)
+   *
+   * @return the hard-rate-limit of a compaction score for put delay.
+   */
+  public double hardRateLimit() {
+    return hardRateLimit(nativeHandle_);
+  }
+  private native double hardRateLimit(long handle);
+
+  /**
+   * Puts are delayed 1ms at a time when any level has a compaction score
+   * that exceeds hard_rate_limit. This is ignored when <= 1.0.
+   * Default: 0 (disabled)
+   *
+   * @param hardRateLimit the hard-rate-limit of a compaction score for put
+   *     delay.
+   * @return the reference to the current option.
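+   *
+   * <p>Sketch (illustration only): a soft limit that starts delaying puts
+   * and a hard limit that stalls them, honoring the constraint
+   * soft_rate_limit <= hard_rate_limit.
+   * <pre>
+   *   Options opts = new Options()
+   *       .setSoftRateLimit(2.0)
+   *       .setHardRateLimit(3.0);
+   * </pre>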
+   */
+  public Options setHardRateLimit(double hardRateLimit) {
+    setHardRateLimit(nativeHandle_, hardRateLimit);
+    return this;
+  }
+  private native void setHardRateLimit(
+      long handle, double hardRateLimit);
+
+  /**
+   * The maximum time interval a put will be stalled when hard_rate_limit
+   * is enforced. If 0, then there is no limit.
+   * Default: 1000
+   *
+   * @return the maximum time interval a put will be stalled when
+   *     hard_rate_limit is enforced.
+   */
+  public int rateLimitDelayMaxMilliseconds() {
+    return rateLimitDelayMaxMilliseconds(nativeHandle_);
+  }
+  private native int rateLimitDelayMaxMilliseconds(long handle);
+
+  /**
+   * The maximum time interval a put will be stalled when hard_rate_limit
+   * is enforced. If 0, then there is no limit.
+   * Default: 1000
+   *
+   * @param rateLimitDelayMaxMilliseconds the maximum time interval a put
+   *     will be stalled.
+   * @return the reference to the current option.
+   */
+  public Options setRateLimitDelayMaxMilliseconds(
+      int rateLimitDelayMaxMilliseconds) {
+    setRateLimitDelayMaxMilliseconds(
+        nativeHandle_, rateLimitDelayMaxMilliseconds);
+    return this;
+  }
+  private native void setRateLimitDelayMaxMilliseconds(
+      long handle, int rateLimitDelayMaxMilliseconds);
+
+  /**
+   * Disable block cache. If this is set to true,
+   * then no block cache should be used, and the block_cache should
+   * point to a nullptr object.
+   * Default: false
+   *
+   * @return true if block cache is disabled.
+   */
+  public boolean noBlockCache() {
+    return noBlockCache(nativeHandle_);
+  }
+  private native boolean noBlockCache(long handle);
+
+  /**
+   * Disable block cache. If this is set to true,
+   * then no block cache should be used, and the block_cache should
+   * point to a nullptr object.
+   * Default: false
+   *
+   * @param noBlockCache true if block-cache is disabled.
+   * @return the reference to the current option.
+   */
+  public Options setNoBlockCache(boolean noBlockCache) {
+    setNoBlockCache(nativeHandle_, noBlockCache);
+    return this;
+  }
+  private native void setNoBlockCache(
+      long handle, boolean noBlockCache);
+
+  /**
+   * The size of one block in arena memory allocation.
+   * If <= 0, a proper value is automatically calculated (usually 1/10 of
+   * write_buffer_size).
+   *
+   * There are two additional restrictions on the specified size:
+   * (1) the size should be in the range of [4096, 2 << 30], and
+   * (2) it should be a multiple of the CPU word size (which helps with
+   * memory alignment).
+   *
+   * We'll automatically check and adjust the size number to make sure it
+   * conforms to the restrictions.
+   * Default: 0
+   *
+   * @return the size of an arena block
+   */
+  public long arenaBlockSize() {
+    return arenaBlockSize(nativeHandle_);
+  }
+  private native long arenaBlockSize(long handle);
+
+  /**
+   * The size of one block in arena memory allocation.
+   * If <= 0, a proper value is automatically calculated (usually 1/10 of
+   * write_buffer_size).
+   *
+   * There are two additional restrictions on the specified size:
+   * (1) the size should be in the range of [4096, 2 << 30], and
+   * (2) it should be a multiple of the CPU word size (which helps with
+   * memory alignment).
+   *
+   * We'll automatically check and adjust the size number to make sure it
+   * conforms to the restrictions.
+   * Default: 0
+   *
+   * @param arenaBlockSize the size of an arena block
+   * @return the reference to the current option.
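+   *
+   * <p>Sketch (illustration only): an explicit 64KB arena block, which
+   * satisfies the [4096, 2 << 30] range restriction described above.
+   * <pre>
+   *   Options opts = new Options().setArenaBlockSize(64 * 1024);
+   * </pre>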
+   */
+  public Options setArenaBlockSize(long arenaBlockSize) {
+    setArenaBlockSize(nativeHandle_, arenaBlockSize);
+    return this;
+  }
+  private native void setArenaBlockSize(
+      long handle, long arenaBlockSize);
+
+  /**
+   * Disable automatic compactions. Manual compactions can still
+   * be issued on this column family.
+   *
+   * @return true if auto-compactions are disabled.
+   */
+  public boolean disableAutoCompactions() {
+    return disableAutoCompactions(nativeHandle_);
+  }
+  private native boolean disableAutoCompactions(long handle);
+
+  /**
+   * Disable automatic compactions. Manual compactions can still
+   * be issued on this column family.
+   *
+   * @param disableAutoCompactions true if auto-compactions are disabled.
+   * @return the reference to the current option.
+   */
+  public Options setDisableAutoCompactions(boolean disableAutoCompactions) {
+    setDisableAutoCompactions(nativeHandle_, disableAutoCompactions);
+    return this;
+  }
+  private native void setDisableAutoCompactions(
+      long handle, boolean disableAutoCompactions);
+
+  /**
+   * Purge duplicate/deleted keys when a memtable is flushed to storage.
+   * Default: true
+   *
+   * @return true if redundant keys are purged during flush.
+   */
+  public boolean purgeRedundantKvsWhileFlush() {
+    return purgeRedundantKvsWhileFlush(nativeHandle_);
+  }
+  private native boolean purgeRedundantKvsWhileFlush(long handle);
+
+  /**
+   * Purge duplicate/deleted keys when a memtable is flushed to storage.
+   * Default: true
+   *
+   * @param purgeRedundantKvsWhileFlush true if redundant keys should be
+   *     purged during flush.
+   * @return the reference to the current option.
+   */
+  public Options setPurgeRedundantKvsWhileFlush(
+      boolean purgeRedundantKvsWhileFlush) {
+    setPurgeRedundantKvsWhileFlush(
+        nativeHandle_, purgeRedundantKvsWhileFlush);
+    return this;
+  }
+  private native void setPurgeRedundantKvsWhileFlush(
+      long handle, boolean purgeRedundantKvsWhileFlush);
+
+  /**
+   * This is used to close a block before it reaches the configured
+   * 'block_size'. If the percentage of free space in the current block is
+   * less than this specified number and adding a new record to the block
+   * will exceed the configured block size, then this block will be closed
+   * and the new record will be written to the next block.
+   * Default is 10.
+   *
+   * @return the block-size deviation, as a percentage of the block size.
+   */
+  public int blockSizeDeviation() {
+    return blockSizeDeviation(nativeHandle_);
+  }
+  private native int blockSizeDeviation(long handle);
+
+  /**
+   * This is used to close a block before it reaches the configured
+   * 'block_size'. If the percentage of free space in the current block is
+   * less than this specified number and adding a new record to the block
+   * will exceed the configured block size, then this block will be closed
+   * and the new record will be written to the next block.
+   * Default is 10.
+   *
+   * @param blockSizeDeviation the block-size deviation, as a percentage
+   *     of the block size.
+   * @return the reference to the current option.
+   */
+  public Options setBlockSizeDeviation(int blockSizeDeviation) {
+    setBlockSizeDeviation(nativeHandle_, blockSizeDeviation);
+    return this;
+  }
+  private native void setBlockSizeDeviation(
+      long handle, int blockSizeDeviation);
+
+  /**
+   * If true, compaction will verify checksums on every read that happens
+   * as part of compaction.
+   * Default: true
+   *
+   * @return true if compaction verifies checksums on every read.
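+   *
+   * <p>Sketch (illustration only): turning verification off for a
+   * throwaway test database where corruption detection is not needed.
+   * <pre>
+   *   Options opts = new Options().setVerifyChecksumsInCompaction(false);
+   * </pre>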
+   */
+  public boolean verifyChecksumsInCompaction() {
+    return verifyChecksumsInCompaction(nativeHandle_);
+  }
+  private native boolean verifyChecksumsInCompaction(long handle);
+
+  /**
+   * If true, compaction will verify checksums on every read that happens
+   * as part of compaction.
+   * Default: true
+   *
+   * @param verifyChecksumsInCompaction true if compaction verifies
+   *     checksums on every read.
+   * @return the reference to the current option.
+   */
+  public Options setVerifyChecksumsInCompaction(
+      boolean verifyChecksumsInCompaction) {
+    setVerifyChecksumsInCompaction(
+        nativeHandle_, verifyChecksumsInCompaction);
+    return this;
+  }
+  private native void setVerifyChecksumsInCompaction(
+      long handle, boolean verifyChecksumsInCompaction);
+
+  /**
+   * Use the KeyMayExist API to filter deletes when this is true.
+   * If KeyMayExist returns false, i.e. the key definitely does not exist,
+   * then the delete is a noop. KeyMayExist only incurs an in-memory lookup.
+   * This optimization avoids writing the delete to storage when appropriate.
+   * Default: false
+   *
+   * @return true if filter-deletes behavior is on.
+   */
+  public boolean filterDeletes() {
+    return filterDeletes(nativeHandle_);
+  }
+  private native boolean filterDeletes(long handle);
+
+  /**
+   * Use the KeyMayExist API to filter deletes when this is true.
+   * If KeyMayExist returns false, i.e. the key definitely does not exist,
+   * then the delete is a noop. KeyMayExist only incurs an in-memory lookup.
+   * This optimization avoids writing the delete to storage when appropriate.
+   * Default: false
+   *
+   * @param filterDeletes true if filter-deletes behavior is on.
+   * @return the reference to the current option.
+   */
+  public Options setFilterDeletes(boolean filterDeletes) {
+    setFilterDeletes(nativeHandle_, filterDeletes);
+    return this;
+  }
+  private native void setFilterDeletes(
+      long handle, boolean filterDeletes);
+
+  /**
+   * A call to iterator->Next() sequentially skips over keys with the same
+   * user-key unless this option is set. This number specifies the number
+   * of keys (with the same user-key) that will be sequentially
+   * skipped before a reseek is issued.
+   * Default: 8
+   *
+   * @return the number of keys that could be skipped in an iteration.
+   */
+  public long maxSequentialSkipInIterations() {
+    return maxSequentialSkipInIterations(nativeHandle_);
+  }
+  private native long maxSequentialSkipInIterations(long handle);
+
+  /**
+   * A call to iterator->Next() sequentially skips over keys with the same
+   * user-key unless this option is set. This number specifies the number
+   * of keys (with the same user-key) that will be sequentially
+   * skipped before a reseek is issued.
+   * Default: 8
+   *
+   * @param maxSequentialSkipInIterations the number of keys that could
+   *     be skipped in an iteration.
+   * @return the reference to the current option.
+   */
+  public Options setMaxSequentialSkipInIterations(
+      long maxSequentialSkipInIterations) {
+    setMaxSequentialSkipInIterations(
+        nativeHandle_, maxSequentialSkipInIterations);
+    return this;
+  }
+  private native void setMaxSequentialSkipInIterations(
+      long handle, long maxSequentialSkipInIterations);
+
+  /**
+   * Allows thread-safe inplace updates.
+   * If the inplace_callback function is not set,
+   * Put(key, new_value) will update the existing_value in place iff
+   * * the key exists in the current memtable,
+   * * sizeof(new_value) <= sizeof(existing_value), and
+   * * the existing_value for that key is a put, i.e. kTypeValue.
+   * If the inplace_callback function is set, check the doc for
+   * inplace_callback.
+   * Default: false.
+   *
+   * @return true if thread-safe inplace updates are allowed.
+   */
+  public boolean inplaceUpdateSupport() {
+    return inplaceUpdateSupport(nativeHandle_);
+  }
+  private native boolean inplaceUpdateSupport(long handle);
+
+  /**
+   * Allows thread-safe inplace updates.
+   * If the inplace_callback function is not set,
+   * Put(key, new_value) will update the existing_value in place iff
+   * * the key exists in the current memtable,
+   * * sizeof(new_value) <= sizeof(existing_value), and
+   * * the existing_value for that key is a put, i.e. kTypeValue.
+   * If the inplace_callback function is set, check the doc for
+   * inplace_callback.
+   * Default: false.
+   *
+   * @param inplaceUpdateSupport true if thread-safe inplace updates
+   *     are allowed.
+   * @return the reference to the current option.
+   */
+  public Options setInplaceUpdateSupport(boolean inplaceUpdateSupport) {
+    setInplaceUpdateSupport(nativeHandle_, inplaceUpdateSupport);
+    return this;
+  }
+  private native void setInplaceUpdateSupport(
+      long handle, boolean inplaceUpdateSupport);
+
+  /**
+   * Number of locks used for inplace update.
+   * Default: 10000 if inplace_update_support = true, else 0.
+   *
+   * @return the number of locks used for inplace update.
+   */
+  public long inplaceUpdateNumLocks() {
+    return inplaceUpdateNumLocks(nativeHandle_);
+  }
+  private native long inplaceUpdateNumLocks(long handle);
+
+  /**
+   * Number of locks used for inplace update.
+   * Default: 10000 if inplace_update_support = true, else 0.
+   *
+   * @param inplaceUpdateNumLocks the number of locks used for
+   *     inplace updates.
+   * @return the reference to the current option.
+   */
+  public Options setInplaceUpdateNumLocks(long inplaceUpdateNumLocks) {
+    setInplaceUpdateNumLocks(nativeHandle_, inplaceUpdateNumLocks);
+    return this;
+  }
+  private native void setInplaceUpdateNumLocks(
+      long handle, long inplaceUpdateNumLocks);
+
+  /**
+   * Returns the number of bits used in the prefix bloom filter.
+   *
+   * This value will be used only when a prefix-extractor is specified.
+   *
+   * @return the number of bloom-bits.
+   * @see useFixedLengthPrefixExtractor()
+   */
+  public int memtablePrefixBloomBits() {
+    return memtablePrefixBloomBits(nativeHandle_);
+  }
+  private native int memtablePrefixBloomBits(long handle);
+
+  /**
+   * Sets the number of bits used in the prefix bloom filter.
+   *
+   * This value will be used only when a prefix-extractor is specified.
+   *
+   * @param memtablePrefixBloomBits the number of bits used in the
+   *     prefix bloom filter.
+   * @return the reference to the current option.
+   */
+  public Options setMemtablePrefixBloomBits(int memtablePrefixBloomBits) {
+    setMemtablePrefixBloomBits(nativeHandle_, memtablePrefixBloomBits);
+    return this;
+  }
+  private native void setMemtablePrefixBloomBits(
+      long handle, int memtablePrefixBloomBits);
+
+  /**
+   * The number of hash probes per key used in the mem-table.
+   *
+   * @return the number of hash probes per key.
+   */
+  public int memtablePrefixBloomProbes() {
+    return memtablePrefixBloomProbes(nativeHandle_);
+  }
+  private native int memtablePrefixBloomProbes(long handle);
+
+  /**
+   * The number of hash probes per key used in the mem-table.
+   *
+   * @param memtablePrefixBloomProbes the number of hash probes per key.
+   * @return the reference to the current option.
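+   *
+   * <p>Sketch (illustration only; the sizes are arbitrary): a prefix bloom
+   * filter only takes effect once a prefix extractor is configured.
+   * <pre>
+   *   Options opts = new Options()
+   *       .useFixedLengthPrefixExtractor(8)
+   *       .setMemtablePrefixBloomBits(8 * 1024 * 1024)
+   *       .setMemtablePrefixBloomProbes(6);
+   * </pre>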
+   */
+  public Options setMemtablePrefixBloomProbes(int memtablePrefixBloomProbes) {
+    setMemtablePrefixBloomProbes(nativeHandle_, memtablePrefixBloomProbes);
+    return this;
+  }
+  private native void setMemtablePrefixBloomProbes(
+      long handle, int memtablePrefixBloomProbes);
+
+  /**
+   * Control the locality of bloom filter probes to improve the cache miss
+   * rate. This option only applies to memtable prefix bloom and plaintable
+   * prefix bloom. It essentially limits the max number of cache lines each
+   * bloom filter check can touch.
+   * This optimization is turned off when set to 0. The number should never
+   * be greater than the number of probes. This option can boost performance
+   * for in-memory workloads but should be used with care since it can cause
+   * a higher false positive rate.
+   * Default: 0
+   *
+   * @return the level of locality of bloom-filter probes.
+   * @see setMemtablePrefixBloomProbes
+   */
+  public int bloomLocality() {
+    return bloomLocality(nativeHandle_);
+  }
+  private native int bloomLocality(long handle);
+
+  /**
+   * Control the locality of bloom filter probes to improve the cache miss
+   * rate. This option only applies to memtable prefix bloom and plaintable
+   * prefix bloom. It essentially limits the max number of cache lines each
+   * bloom filter check can touch.
+   * This optimization is turned off when set to 0. The number should never
+   * be greater than the number of probes. This option can boost performance
+   * for in-memory workloads but should be used with care since it can cause
+   * a higher false positive rate.
+   * Default: 0
+   *
+   * @param bloomLocality the level of locality of bloom-filter probes.
+   * @return the reference to the current option.
+   */
+  public Options setBloomLocality(int bloomLocality) {
+    setBloomLocality(nativeHandle_, bloomLocality);
+    return this;
+  }
+  private native void setBloomLocality(
+      long handle, int bloomLocality);
+
+  /**
+   * Maximum number of successive merge operations on a key in the memtable.
+   *
+   * When a merge operation is added to the memtable and the maximum number
+   * of successive merges is reached, the value of the key will be calculated
+   * and inserted into the memtable instead of the merge operation. This will
+   * ensure that there are never more than max_successive_merges merge
+   * operations in the memtable.
+   *
+   * Default: 0 (disabled)
+   *
+   * @return the maximum number of successive merges.
+   */
+  public long maxSuccessiveMerges() {
+    return maxSuccessiveMerges(nativeHandle_);
+  }
+  private native long maxSuccessiveMerges(long handle);
+
+  /**
+   * Maximum number of successive merge operations on a key in the memtable.
+   *
+   * When a merge operation is added to the memtable and the maximum number
+   * of successive merges is reached, the value of the key will be calculated
+   * and inserted into the memtable instead of the merge operation. This will
+   * ensure that there are never more than max_successive_merges merge
+   * operations in the memtable.
+   *
+   * Default: 0 (disabled)
+   *
+   * @param maxSuccessiveMerges the maximum number of successive merges.
+   * @return the reference to the current option.
+   */
+  public Options setMaxSuccessiveMerges(long maxSuccessiveMerges) {
+    setMaxSuccessiveMerges(nativeHandle_, maxSuccessiveMerges);
+    return this;
+  }
+  private native void setMaxSuccessiveMerges(
+      long handle, long maxSuccessiveMerges);
+
+  /**
+   * The minimum number of write buffers that will be merged together
+   * before writing to storage.
+   * If set to 1, then
+   * all write buffers are flushed to L0 as individual files, and this
+   * increases read amplification because a get request has to check all
+   * of these files. Also, an in-memory merge may result in writing less
+   * data to storage if there are duplicate records in each of these
+   * individual write buffers. Default: 1
+   *
+   * @return the minimum number of write buffers that will be merged together.
+   */
+  public int minWriteBufferNumberToMerge() {
+    return minWriteBufferNumberToMerge(nativeHandle_);
+  }
+  private native int minWriteBufferNumberToMerge(long handle);
+
+  /**
+   * The minimum number of write buffers that will be merged together
+   * before writing to storage. If set to 1, then
+   * all write buffers are flushed to L0 as individual files, and this
+   * increases read amplification because a get request has to check all
+   * of these files. Also, an in-memory merge may result in writing less
+   * data to storage if there are duplicate records in each of these
+   * individual write buffers. Default: 1
+   *
+   * @param minWriteBufferNumberToMerge the minimum number of write buffers
+   *     that will be merged together.
+   * @return the reference to the current option.
+   */
+  public Options setMinWriteBufferNumberToMerge(
+      int minWriteBufferNumberToMerge) {
+    setMinWriteBufferNumberToMerge(nativeHandle_, minWriteBufferNumberToMerge);
+    return this;
+  }
+  private native void setMinWriteBufferNumberToMerge(
+      long handle, int minWriteBufferNumberToMerge);
+
+  /**
+   * The number of partial merge operands to accumulate before partial
+   * merge will be performed. Partial merge will not be called
+   * if the list of values to merge is shorter than
+   * min_partial_merge_operands.
+   *
+   * If min_partial_merge_operands < 2, then it will be treated as 2.
+   *
+   * Default: 2
+   *
+   * @return the number of partial merge operands to accumulate.
+   */
+  public int minPartialMergeOperands() {
+    return minPartialMergeOperands(nativeHandle_);
+  }
+  private native int minPartialMergeOperands(long handle);
+
+  /**
+   * The number of partial merge operands to accumulate before partial
+   * merge will be performed. Partial merge will not be called
+   * if the list of values to merge is shorter than
+   * min_partial_merge_operands.
+   *
+   * If min_partial_merge_operands < 2, then it will be treated as 2.
+   *
+   * Default: 2
+   *
+   * @param minPartialMergeOperands the number of partial merge operands
+   *     to accumulate.
+   * @return the reference to the current option.
+   */
+  public Options setMinPartialMergeOperands(int minPartialMergeOperands) {
+    setMinPartialMergeOperands(nativeHandle_, minPartialMergeOperands);
+    return this;
+  }
+  private native void setMinPartialMergeOperands(
+      long handle, int minPartialMergeOperands);
+
+  /**
+   * Release the memory allocated for the current instance
+   * on the C++ side.
+   */
+  @Override protected void disposeInternal() {
+    assert(isInitialized());
+    disposeInternal(nativeHandle_);
+  }
+
+  static final int DEFAULT_PLAIN_TABLE_BLOOM_BITS_PER_KEY = 10;
+  static final double DEFAULT_PLAIN_TABLE_HASH_TABLE_RATIO = 0.75;
+  static final int DEFAULT_PLAIN_TABLE_INDEX_SPARSENESS = 16;
+
+  private native void newOptions();
+  private native void disposeInternal(long handle);
+  private native void setCreateIfMissing(long handle, boolean flag);
+  private native boolean createIfMissing(long handle);
+  private native void setWriteBufferSize(long handle, long writeBufferSize);
+  private native long writeBufferSize(long handle);
+  private native void setMaxWriteBufferNumber(
+      long handle, int maxWriteBufferNumber);
+  private native int maxWriteBufferNumber(long handle);
+  private native void setBlockSize(long handle, long blockSize);
+  private native long blockSize(long handle);
+  private native void setDisableSeekCompaction(
+      long handle, boolean disableSeekCompaction);
+  private native boolean disableSeekCompaction(long handle);
+  private native void setMaxBackgroundCompactions(
+      long handle, int maxBackgroundCompactions);
+  private native int maxBackgroundCompactions(long handle);
+  private native void createStatistics(long optHandle);
+  private native long statisticsPtr(long optHandle);
+
+  private native void setMemTableFactory(long handle, long factoryHandle);
+  private native String memTableFactoryName(long handle);
+
+  private native void setTableFactory(long handle, long factoryHandle);
+  private native String tableFactoryName(long handle);
+
+  private native void useFixedLengthPrefixExtractor(
+      long handle, int prefixLength);
+
+  long cacheSize_;
+  Filter filter_;
+}
diff --git a/java/org/rocksdb/PlainTableConfig.java b/java/org/rocksdb/PlainTableConfig.java
new file mode 100644
index 0000000000..554ce3840d
--- /dev/null
+++ b/java/org/rocksdb/PlainTableConfig.java
@@ -0,0 +1,123 @@
+// Copyright (c) 2014, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+package org.rocksdb;
+
+/**
+ * The config for the plain table sst format.
+ *
+ * PlainTable is a RocksDB SST file format optimized for low query latency
+ * on pure-memory or really low-latency media. It also supports the prefix
+ * hash feature.
+ */
+public class PlainTableConfig extends TableFormatConfig {
+  public static final int VARIABLE_LENGTH = 0;
+  public static final int DEFAULT_BLOOM_BITS_PER_KEY = 10;
+  public static final double DEFAULT_HASH_TABLE_RATIO = 0.75;
+  public static final int DEFAULT_INDEX_SPARSENESS = 16;
+
+  public PlainTableConfig() {
+    keySize_ = VARIABLE_LENGTH;
+    bloomBitsPerKey_ = DEFAULT_BLOOM_BITS_PER_KEY;
+    hashTableRatio_ = DEFAULT_HASH_TABLE_RATIO;
+    indexSparseness_ = DEFAULT_INDEX_SPARSENESS;
+  }
+
+  /**
+   * Set the length of the user key. If it is set to VARIABLE_LENGTH,
+   * then it indicates the user keys are variable-length. Otherwise,
+   * all the keys need to have the same length in bytes.
+   * DEFAULT: VARIABLE_LENGTH
+   *
+   * @param keySize the length of the user key.
+   * @return the reference to the current config.
+   */
+  public PlainTableConfig setKeySize(int keySize) {
+    keySize_ = keySize;
+    return this;
+  }
+
+  /**
+   * @return the specified size of the user key. If VARIABLE_LENGTH,
+   *     then it indicates a variable-length key.
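+   *
+   * <p>Sketch (illustration only; {@code options} is an assumed Options
+   * instance): a plain-table format for fixed 16-byte keys, leaving the
+   * other knobs at their defaults.
+   * <pre>
+   *   PlainTableConfig cfg = new PlainTableConfig().setKeySize(16);
+   *   options.setTableFormatConfig(cfg);
+   * </pre>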
+   */
+  public int keySize() {
+    return keySize_;
+  }
+
+  /**
+   * Set the number of bits per key used by the internal bloom filter
+   * in the plain table sst format.
+   *
+   * @param bitsPerKey the number of bits per key for the bloom filter.
+   * @return the reference to the current config.
+   */
+  public PlainTableConfig setBloomBitsPerKey(int bitsPerKey) {
+    bloomBitsPerKey_ = bitsPerKey;
+    return this;
+  }
+
+  /**
+   * @return the number of bits per key used for the bloom filter.
+   */
+  public int bloomBitsPerKey() {
+    return bloomBitsPerKey_;
+  }
+
+  /**
+   * hashTableRatio is the desired utilization of the hash table used
+   * for prefix hashing. The ideal ratio would be the number of
+   * prefixes / the number of hash buckets. If this value is set to
+   * zero, then the hash table will not be used.
+   *
+   * @param ratio the hash table ratio.
+   * @return the reference to the current config.
+   */
+  public PlainTableConfig setHashTableRatio(double ratio) {
+    hashTableRatio_ = ratio;
+    return this;
+  }
+
+  /**
+   * @return the hash table ratio.
+   */
+  public double hashTableRatio() {
+    return hashTableRatio_;
+  }
+
+  /**
+   * Index sparseness determines the index interval for keys inside the
+   * same prefix. This number is equal to the maximum number of linear
+   * searches required after the hash and binary search. If it's set to 0,
+   * then each key will be indexed.
+   *
+   * @param sparseness the index sparseness.
+   * @return the reference to the current config.
+   */
+  public PlainTableConfig setIndexSparseness(int sparseness) {
+    indexSparseness_ = sparseness;
+    return this;
+  }
+
+  /**
+   * @return the index sparseness.
+   */
+  public int indexSparseness() {
+    return indexSparseness_;
+  }
+
+  @Override protected long newTableFactoryHandle() {
+    return newTableFactoryHandle(keySize_, bloomBitsPerKey_,
+        hashTableRatio_, indexSparseness_);
+  }
+
+  private native long newTableFactoryHandle(
+      int keySize, int bloomBitsPerKey,
+      double hashTableRatio, int indexSparseness);
+
+  private int keySize_;
+  private int bloomBitsPerKey_;
+  private double hashTableRatio_;
+  private int indexSparseness_;
+}
diff --git a/java/org/rocksdb/ReadOptions.java b/java/org/rocksdb/ReadOptions.java
new file mode 100644
index 0000000000..97c47c7d62
--- /dev/null
+++ b/java/org/rocksdb/ReadOptions.java
@@ -0,0 +1,125 @@
+// Copyright (c) 2014, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+/**
+ * The class that controls read behavior.
+ *
+ * Note that dispose() must be called before a ReadOptions instance
+ * becomes out-of-scope to release the allocated memory in C++.
+ */
+public class ReadOptions extends RocksObject {
+  public ReadOptions() {
+    super();
+    newReadOptions();
+  }
+  private native void newReadOptions();
+
+  /**
+   * If true, all data read from underlying storage will be
+   * verified against corresponding checksums.
+   * Default: true
+   *
+   * @return true if checksum verification is on.
+   */
+  public boolean verifyChecksums() {
+    assert(isInitialized());
+    return verifyChecksums(nativeHandle_);
+  }
+  private native boolean verifyChecksums(long handle);
+
+  /**
+   * If true, all data read from underlying storage will be
+   * verified against corresponding checksums.
+   * Default: true
+   *
+   * @param verifyChecksums if true, then checksum verification
+   *     will be performed on every read.
+   * @return the reference to the current ReadOptions.
+   */
+  public ReadOptions setVerifyChecksums(boolean verifyChecksums) {
+    assert(isInitialized());
+    setVerifyChecksums(nativeHandle_, verifyChecksums);
+    return this;
+  }
+  private native void setVerifyChecksums(
+      long handle, boolean verifyChecksums);
+
+  // TODO(yhchiang): this option seems to be block-based table only.
+  // move this to a better place?
+  /**
+   * Fill the cache when reading a block-based sst formatted db.
+   * Callers may wish to set this field to false for bulk scans.
+   * Default: true
+   *
+   * @return true if the fill-cache behavior is on.
+   */
+  public boolean fillCache() {
+    assert(isInitialized());
+    return fillCache(nativeHandle_);
+  }
+  private native boolean fillCache(long handle);
+
+  /**
+   * Fill the cache when reading a block-based sst formatted db.
+   * Callers may wish to set this field to false for bulk scans.
+   * Default: true
+   *
+   * @param fillCache if true, then fill-cache behavior will be
+   *     performed.
+   * @return the reference to the current ReadOptions.
+   */
+  public ReadOptions setFillCache(boolean fillCache) {
+    assert(isInitialized());
+    setFillCache(nativeHandle_, fillCache);
+    return this;
+  }
+  private native void setFillCache(
+      long handle, boolean fillCache);
+
+  /**
+   * Specify to create a tailing iterator -- a special iterator that has a
+   * view of the complete database (i.e. it can also be used to read newly
+   * added data) and is optimized for sequential reads. It will return
+   * records that were inserted into the database after the creation of the
+   * iterator.
+   * Default: false
+   * Not supported in ROCKSDB_LITE mode!
+   *
+   * @return true if tailing iterator is enabled.
+   */
+  public boolean tailing() {
+    assert(isInitialized());
+    return tailing(nativeHandle_);
+  }
+  private native boolean tailing(long handle);
+
+  /**
+   * Specify to create a tailing iterator -- a special iterator that has a
+   * view of the complete database (i.e. it can also be used to read newly
+   * added data) and is optimized for sequential reads. It will return
+   * records that were inserted into the database after the creation of the
+   * iterator.
+   * Default: false
+   * Not supported in ROCKSDB_LITE mode!
+   *
+   * @param tailing if true, then tailing iterator will be enabled.
+   * @return the reference to the current ReadOptions.
+   */
+  public ReadOptions setTailing(boolean tailing) {
+    assert(isInitialized());
+    setTailing(nativeHandle_, tailing);
+    return this;
+  }
+  private native void setTailing(
+      long handle, boolean tailing);
+
+  @Override protected void disposeInternal() {
+    assert(isInitialized());
+    disposeInternal(nativeHandle_);
+  }
+  private native void disposeInternal(long handle);
+
+}
diff --git a/java/org/rocksdb/RocksDB.java b/java/org/rocksdb/RocksDB.java
new file mode 100644
index 0000000000..1b758e1a2e
--- /dev/null
+++ b/java/org/rocksdb/RocksDB.java
@@ -0,0 +1,370 @@
+// Copyright (c) 2014, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+import java.util.List;
+import java.util.Map;
+import java.util.HashMap;
+import java.io.Closeable;
+import java.io.IOException;
+import org.rocksdb.util.Environment;
+
+/**
+ * A RocksDB is a persistent ordered map from keys to values.
+ * It is safe for concurrent access from multiple threads without any
+ * external synchronization.
+ * All methods of this class could potentially throw RocksDBException, which
+ * indicates that something went wrong on the RocksDB library side and the
+ * call failed.
+ */
+public class RocksDB extends RocksObject {
+  public static final int NOT_FOUND = -1;
+  private static final String[] compressionLibs_ = {
+      "snappy", "zlib", "bzip2", "lz4", "lz4hc"};
+
+  /**
+   * Loads the necessary library files.
+   * Calling this method twice will have no effect.
+   */
+  public static synchronized void loadLibrary() {
+    // loading possibly necessary libraries.
+    for (String lib : compressionLibs_) {
+      try {
+        System.loadLibrary(lib);
+      } catch (UnsatisfiedLinkError e) {
+        // since it may be optional, we ignore its loading failure here.
+      }
+    }
+    // However, rocksdbjni itself is required, so we will see an error
+    // here if it fails to load.
+    System.loadLibrary("rocksdbjni");
+  }
+
+  /**
+   * Tries to load the necessary library files from the given list of
+   * directories.
+   *
+   * @param paths a list of strings where each describes a directory
+   *     of a library.
+   */
+  public static synchronized void loadLibrary(List<String> paths) {
+    for (String lib : compressionLibs_) {
+      for (String path : paths) {
+        try {
+          System.load(path + "/" + Environment.getSharedLibraryName(lib));
+          break;
+        } catch (UnsatisfiedLinkError e) {
+          // since they are optional, we ignore loading failures.
+        }
+      }
+    }
+    boolean success = false;
+    UnsatisfiedLinkError err = null;
+    for (String path : paths) {
+      try {
+        System.load(path + "/" + Environment.getJniLibraryName("rocksdbjni"));
+        success = true;
+        break;
+      } catch (UnsatisfiedLinkError e) {
+        err = e;
+      }
+    }
+    if (!success) {
+      throw err;
+    }
+  }
+
+  /**
+   * The factory constructor of RocksDB that opens a RocksDB instance given
+   * the path to the database using the default options with createIfMissing
+   * set to true.
+   *
+   * @param path the path to the rocksdb.
+   * @return a RocksDB instance on success; a RocksDBException is thrown if
+   *     the specified rocksdb cannot be opened.
+   *
+   * @see Options.setCreateIfMissing()
+   * @see Options.createIfMissing()
+   */
+  public static RocksDB open(String path) throws RocksDBException {
+    RocksDB db = new RocksDB();
+
+    // This allows us to use the rocksjni default Options instead of
+    // the c++ one.
+    Options options = new Options();
+    db.open(options.nativeHandle_, options.cacheSize_, path);
+    db.transferCppRawPointersOwnershipFrom(options);
+    options.dispose();
+    return db;
+  }
+
+  /**
+   * The factory constructor of RocksDB that opens a RocksDB instance given
+   * the path to the database using the specified options and db path.
+   */
+  public static RocksDB open(Options options, String path)
+      throws RocksDBException {
+    // when a non-default Options is used, keeping an Options reference
+    // in RocksDB prevents Java from garbage-collecting the Options during
+    // the life-time of the currently-created RocksDB.
+    RocksDB db = new RocksDB();
+    db.open(options.nativeHandle_, options.cacheSize_, path);
+    db.transferCppRawPointersOwnershipFrom(options);
+    return db;
+  }
+
+  @Override protected void disposeInternal() {
+    assert(isInitialized());
+    disposeInternal(nativeHandle_);
+  }
+
+  /**
+   * Close the RocksDB instance.
+   * This function is equivalent to dispose().
+   */
+  public void close() {
+    dispose();
+  }
+
+  /**
+   * Set the database entry for "key" to "value".
+   *
+   * @param key the specified key to be inserted.
+   * @param value the value associated with the specified key.
+   */
+  public void put(byte[] key, byte[] value) throws RocksDBException {
+    put(nativeHandle_, key, key.length, value, value.length);
+  }
+
+  /**
+   * Set the database entry for "key" to "value".
+   *
+   * @param writeOpts the write options to use.
+   * @param key the specified key to be inserted.
+   * @param value the value associated with the specified key.
+   */
+  public void put(WriteOptions writeOpts, byte[] key, byte[] value)
+      throws RocksDBException {
+    put(nativeHandle_, writeOpts.nativeHandle_,
+        key, key.length, value, value.length);
+  }
+
+  /**
+   * Apply the specified updates to the database.
+   */
+  public void write(WriteOptions writeOpts, WriteBatch updates)
+      throws RocksDBException {
+    write(writeOpts.nativeHandle_, updates.nativeHandle_);
+  }
+
+  /**
+   * Get the value associated with the specified key.
+   *
+   * @param key the key used to retrieve the value.
+   * @param value the out-value to receive the retrieved value.
+   * @return The size of the actual value that matches the specified
+   *     {@code key} in bytes. If the return value is greater than the
+   *     length of {@code value}, then it indicates that the size of the
+   *     input buffer {@code value} is insufficient and a partial result
+   *     will be returned. RocksDB.NOT_FOUND will be returned if the value
+   *     is not found.
+   */
+  public int get(byte[] key, byte[] value) throws RocksDBException {
+    return get(nativeHandle_, key, key.length, value, value.length);
+  }
+
+  /**
+   * Get the value associated with the specified key.
+   *
+   * @param opt the read options to use.
+   * @param key the key used to retrieve the value.
+   * @param value the out-value to receive the retrieved value.
+   * @return The size of the actual value that matches the specified
+   *     {@code key} in bytes. If the return value is greater than the
+   *     length of {@code value}, then it indicates that the size of the
+   *     input buffer {@code value} is insufficient and a partial result
+   *     will be returned. RocksDB.NOT_FOUND will be returned if the value
+   *     is not found.
+   */
+  public int get(ReadOptions opt, byte[] key, byte[] value)
+      throws RocksDBException {
+    return get(nativeHandle_, opt.nativeHandle_,
+        key, key.length, value, value.length);
+  }
+
+  /**
+   * The simplified version of get which returns a new byte array storing
+   * the value associated with the specified input key if any. null will be
+   * returned if the specified key is not found.
+   *
+   * @param key the key used to retrieve the value.
+   * @return a byte array storing the value associated with the input key if
+   *     any. null if it does not find the specified key.
+   *
+   * @see RocksDBException
+   */
+  public byte[] get(byte[] key) throws RocksDBException {
+    return get(nativeHandle_, key, key.length);
+  }
+
+  /**
+   * The simplified version of get which returns a new byte array storing
+   * the value associated with the specified input key if any. null will be
+   * returned if the specified key is not found.
+   *
+   * @param key the key used to retrieve the value.
+   * @param opt the read options to use.
+   * @return a byte array storing the value associated with the input key if
+   *     any. null if it does not find the specified key.
+   *
+   * @see RocksDBException
+   */
+  public byte[] get(ReadOptions opt, byte[] key) throws RocksDBException {
+    return get(nativeHandle_, opt.nativeHandle_, key, key.length);
+  }
+
+  /**
+   * Returns a map of keys for which values were found in the DB.
+   *
+   * @param keys List of keys for which values need to be retrieved.
+   * @return Map where the key of the map is the key passed by the user and
+   *     the value for the map entry is the corresponding value in the DB.
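+   *
+   * <p>Sketch (illustration only; {@code k1} and {@code k2} are assumed
+   * byte-array keys):
+   * <pre>
+   *   List&lt;byte[]&gt; keys = java.util.Arrays.asList(k1, k2);
+   *   Map&lt;byte[], byte[]&gt; found = db.multiGet(keys);
+   * </pre>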
+   *
+   * @see RocksDBException
+   */
+  public Map<byte[], byte[]> multiGet(List<byte[]> keys)
+      throws RocksDBException {
+    assert(keys.size() != 0);
+
+    List<byte[]> values = multiGet(
+        nativeHandle_, keys, keys.size());
+
+    Map<byte[], byte[]> keyValueMap = new HashMap<byte[], byte[]>();
+    for (int i = 0; i < values.size(); i++) {
+      if (values.get(i) == null) {
+        continue;
+      }
+
+      keyValueMap.put(keys.get(i), values.get(i));
+    }
+
+    return keyValueMap;
+  }
+
+  /**
+   * Returns a map of keys for which values were found in DB.
+   *
+   * @param opt Read options.
+   * @param keys List of keys for which values need to be retrieved.
+   * @return a map where the key is the key passed by the user and the
+   *     value is the corresponding value found in DB.
+   *
+   * @see RocksDBException
+   */
+  public Map<byte[], byte[]> multiGet(ReadOptions opt, List<byte[]> keys)
+      throws RocksDBException {
+    assert(keys.size() != 0);
+
+    List<byte[]> values = multiGet(
+        nativeHandle_, opt.nativeHandle_, keys, keys.size());
+
+    Map<byte[], byte[]> keyValueMap = new HashMap<byte[], byte[]>();
+    for (int i = 0; i < values.size(); i++) {
+      if (values.get(i) == null) {
+        continue;
+      }
+
+      keyValueMap.put(keys.get(i), values.get(i));
+    }
+
+    return keyValueMap;
+  }
+
+  /**
+   * Remove the database entry (if any) for "key".  It is not an error
+   * if "key" did not exist in the database; a RocksDBException is thrown
+   * if an error occurs.
+   */
+  public void remove(byte[] key) throws RocksDBException {
+    remove(nativeHandle_, key, key.length);
+  }
+
+  /**
+   * Remove the database entry (if any) for "key".  It is not an error
+   * if "key" did not exist in the database; a RocksDBException is thrown
+   * if an error occurs.
+   */
+  public void remove(WriteOptions writeOpt, byte[] key)
+      throws RocksDBException {
+    remove(nativeHandle_, writeOpt.nativeHandle_, key, key.length);
+  }
+
+  /**
+   * Return a heap-allocated iterator over the contents of the database.
+   * The result of newIterator() is initially invalid (caller must
+   * call one of the Seek methods on the iterator before using it).
+   *
+   * Caller should close the iterator when it is no longer needed.
+   * The returned iterator should be closed before this db is closed.
+   *
+   * @return instance of iterator object.
+   */
+  public RocksIterator newIterator() {
+    return new RocksIterator(iterator0(nativeHandle_));
+  }
+
+  /**
+   * Protected constructor used by the static factory methods.
+   */
+  protected RocksDB() {
+    super();
+  }
+
+  /**
+   * Transfer the ownership of all c++ raw-pointers from Options
+   * to RocksDB to ensure the life-time of those raw-pointers
+   * will be at least as long as the life-time of any RocksDB
+   * that uses these raw-pointers.
+   */
+  protected void transferCppRawPointersOwnershipFrom(Options opt) {
+    filter_ = opt.filter_;
+    opt.filter_ = null;
+  }
+
+  // native methods
+  protected native void open(
+      long optionsHandle, long cacheSize, String path) throws RocksDBException;
+  protected native void put(
+      long handle, byte[] key, int keyLen,
+      byte[] value, int valueLen) throws RocksDBException;
+  protected native void put(
+      long handle, long writeOptHandle,
+      byte[] key, int keyLen,
+      byte[] value, int valueLen) throws RocksDBException;
+  protected native void write(
+      long writeOptHandle, long batchHandle) throws RocksDBException;
+  protected native int get(
+      long handle, byte[] key, int keyLen,
+      byte[] value, int valueLen) throws RocksDBException;
+  protected native int get(
+      long handle, long readOptHandle, byte[] key, int keyLen,
+      byte[] value, int valueLen) throws RocksDBException;
+  protected native List<byte[]> multiGet(
+      long dbHandle, List<byte[]> keys, int keysCount);
+  protected native List<byte[]> multiGet(
+      long dbHandle, long rOptHandle, List<byte[]> keys, int keysCount);
+  protected native byte[] get(
+      long handle, byte[] key, int keyLen) throws RocksDBException;
+  protected native byte[] get(
+      long handle, long readOptHandle,
+      byte[] key, int keyLen) throws RocksDBException;
+  protected native void remove(
+      long handle, byte[] key, int keyLen) throws RocksDBException;
+  protected native void remove(
+      long handle, long writeOptHandle,
+      byte[] key, int keyLen) throws RocksDBException;
+  protected native long iterator0(long optHandle);
+  private native void disposeInternal(long handle);
+
+  protected Filter filter_;
+}
diff --git a/java/org/rocksdb/RocksDBException.java b/java/org/rocksdb/RocksDBException.java
new file mode 100644
index 0000000000..acc93669ee
--- /dev/null
+++ b/java/org/rocksdb/RocksDBException.java
@@ -0,0 +1,23 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+import java.util.*;
+
+/**
+ * A RocksDBException encapsulates the error of an operation.  This exception
+ * type is used to describe an internal error from the c++ rocksdb library.
+ */
+public class RocksDBException extends Exception {
+  /**
+   * Constructs a RocksDBException with the specified error message.
+   *
+   * @param msg the specified error message.
+   */
+  public RocksDBException(String msg) {
+    super(msg);
+  }
+}
diff --git a/java/org/rocksdb/RocksIterator.java b/java/org/rocksdb/RocksIterator.java
new file mode 100644
index 0000000000..9ef2e8c24a
--- /dev/null
+++ b/java/org/rocksdb/RocksIterator.java
@@ -0,0 +1,136 @@
+// Copyright (c) 2014, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+/**
+ * An iterator yields a sequence of key/value pairs from a source.
+ * The following class defines the interface.  Multiple implementations
+ * are provided by this library.  In particular, iterators are provided
+ * to access the contents of a Table or a DB.
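+ *
+ * <p>A minimal scan loop (sketch only; assumes {@code db} is an open
+ * RocksDB instance):
+ * <pre>
+ *   RocksIterator it = db.newIterator();
+ *   for (it.seekToFirst(); it.isValid(); it.next()) {
+ *     byte[] k = it.key();
+ *     byte[] v = it.value();
+ *   }
+ *   it.dispose();  // releases the underlying c++ iterator
+ * </pre>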
+ *
+ * Multiple threads can invoke const methods on a RocksIterator without
+ * external synchronization, but if any of the threads may call a
+ * non-const method, all threads accessing the same RocksIterator must use
+ * external synchronization.
+ */
+public class RocksIterator extends RocksObject {
+  public RocksIterator(long nativeHandle) {
+    super();
+    nativeHandle_ = nativeHandle;
+  }
+
+  /**
+   * An iterator is either positioned at a key/value pair, or
+   * not valid.  This method returns true iff the iterator is valid.
+   *
+   * @return true if iterator is valid.
+   */
+  public boolean isValid() {
+    assert(isInitialized());
+    return isValid0(nativeHandle_);
+  }
+
+  /**
+   * Position at the first key in the source.  The iterator is Valid()
+   * after this call iff the source is not empty.
+   */
+  public void seekToFirst() {
+    assert(isInitialized());
+    seekToFirst0(nativeHandle_);
+  }
+
+  /**
+   * Position at the last key in the source.  The iterator is
+   * Valid() after this call iff the source is not empty.
+   */
+  public void seekToLast() {
+    assert(isInitialized());
+    seekToLast0(nativeHandle_);
+  }
+
+  /**
+   * Moves to the next entry in the source.  After this call, Valid() is
+   * true iff the iterator was not positioned at the last entry in the source.
+   * REQUIRES: Valid()
+   */
+  public void next() {
+    assert(isInitialized());
+    next0(nativeHandle_);
+  }
+
+  /**
+   * Moves to the previous entry in the source.  After this call, Valid() is
+   * true iff the iterator was not positioned at the first entry in source.
+   * REQUIRES: Valid()
+   */
+  public void prev() {
+    assert(isInitialized());
+    prev0(nativeHandle_);
+  }
+
+  /**
+   * Return the key for the current entry.  The underlying storage for
+   * the returned slice is valid only until the next modification of
+   * the iterator.
+   * REQUIRES: Valid()
+   *
+   * @return key for the current entry.
+   */
+  public byte[] key() {
+    assert(isInitialized());
+    return key0(nativeHandle_);
+  }
+
+  /**
+   * Return the value for the current entry.  The underlying storage for
+   * the returned slice is valid only until the next modification of
+   * the iterator.
+   * REQUIRES: !AtEnd() && !AtStart()
+   *
+   * @return value for the current entry.
+   */
+  public byte[] value() {
+    assert(isInitialized());
+    return value0(nativeHandle_);
+  }
+
+  /**
+   * Position at the first key in the source that is at or past target.
+   * The iterator is Valid() after this call iff the source contains
+   * an entry that comes at or past target.
+   */
+  public void seek(byte[] target) {
+    assert(isInitialized());
+    seek0(nativeHandle_, target, target.length);
+  }
+
+  /**
+   * If an error has occurred, throw it as a RocksDBException; otherwise
+   * this call is a no-op.  If non-blocking IO is requested and this
+   * operation cannot be satisfied without doing some IO, the resulting
+   * status is Status::Incomplete().
+   */
+  public void status() throws RocksDBException {
+    assert(isInitialized());
+    status0(nativeHandle_);
+  }
+
+  /**
+   * Deletes underlying C++ iterator pointer.
+   */
+  @Override protected void disposeInternal() {
+    assert(isInitialized());
+    disposeInternal(nativeHandle_);
+  }
+
+  private native boolean isValid0(long handle);
+  private native void disposeInternal(long handle);
+  private native void seekToFirst0(long handle);
+  private native void seekToLast0(long handle);
+  private native void next0(long handle);
+  private native void prev0(long handle);
+  private native byte[] key0(long handle);
+  private native byte[] value0(long handle);
+  private native void seek0(long handle, byte[] target, int targetLen);
+  private native void status0(long handle);
+}
diff --git a/java/org/rocksdb/RocksObject.java b/java/org/rocksdb/RocksObject.java
new file mode 100644
index 0000000000..31c347daa0
--- /dev/null
+++ b/java/org/rocksdb/RocksObject.java
@@ -0,0 +1,72 @@
+// Copyright (c) 2014, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+/**
+ * RocksObject is the base class of all RocksDB classes that hold a pointer
+ * to some c++ rocksdb object.  Although RocksObject will release its c++
+ * resource on its finalize() once it has been garbage-collected, it is
+ * suggested to call dispose() manually to release its c++ resource once an
+ * instance of RocksObject is no longer used.
+ */
+public abstract class RocksObject {
+  protected RocksObject() {
+    nativeHandle_ = 0;
+    owningHandle_ = true;
+  }
+
+  /**
+   * Release the c++ object pointed to by the native handle.
+   *
+   * Note that once an instance of RocksObject has been disposed,
+   * calling any of its functions will lead to undefined behavior.
+   */
+  public final synchronized void dispose() {
+    if (isOwningNativeHandle() && isInitialized()) {
+      disposeInternal();
+    }
+    nativeHandle_ = 0;
+    disOwnNativeHandle();
+  }
+
+  /**
+   * The helper function of dispose() which all subclasses of RocksObject
+   * must implement to release their associated C++ resource.
+   */
+  protected abstract void disposeInternal();
+
+  /**
+   * Revoke ownership of the native object.
+   *
+   * This will prevent the object from attempting to delete the underlying
+   * native object in its finalizer.  This must be used when another object
+   * takes over ownership of the native object, or both will attempt to
+   * delete the underlying object when garbage-collected.
+   *
+   * When disOwnNativeHandle() is called, dispose() will simply set
+   * nativeHandle_ to 0 without releasing its associated C++ resource.
+   * As a result, incorrect use of this function may cause a memory leak.
+   */
+  protected void disOwnNativeHandle() {
+    owningHandle_ = false;
+  }
+
+  protected boolean isOwningNativeHandle() {
+    return owningHandle_;
+  }
+
+  protected boolean isInitialized() {
+    return (nativeHandle_ != 0);
+  }
+
+  @Override protected void finalize() {
+    dispose();
+  }
+
+  protected long nativeHandle_;
+  private boolean owningHandle_;
+}
diff --git a/java/org/rocksdb/SkipListMemTableConfig.java b/java/org/rocksdb/SkipListMemTableConfig.java
new file mode 100644
index 0000000000..7f9f5cb5f4
--- /dev/null
+++ b/java/org/rocksdb/SkipListMemTableConfig.java
@@ -0,0 +1,15 @@
+package org.rocksdb;
+
+/**
+ * The config for skip-list memtable representation.
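+ *
+ * <p>A minimal sketch of how a memtable config is selected, using
+ * Options.setMemTableConfig() as DbBenchmark does later in this patch
+ * (the db path is illustrative):
+ * <pre>
+ *   Options options = new Options();
+ *   options.setMemTableConfig(new SkipListMemTableConfig());
+ *   RocksDB db = RocksDB.open(options, "/path/to/db");
+ * </pre>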
+ */
+public class SkipListMemTableConfig extends MemTableConfig {
+  public SkipListMemTableConfig() {
+  }
+
+  @Override protected long newMemTableFactoryHandle() {
+    return newMemTableFactoryHandle0();
+  }
+
+  private native long newMemTableFactoryHandle0();
+}
diff --git a/java/org/rocksdb/Statistics.java b/java/org/rocksdb/Statistics.java
new file mode 100644
index 0000000000..bed2b88108
--- /dev/null
+++ b/java/org/rocksdb/Statistics.java
@@ -0,0 +1,38 @@
+// Copyright (c) 2014, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+/**
+ * Statistics to analyze the performance of a db.  The pointer for the
+ * statistics object is managed by the Options class.
+ */
+public class Statistics {
+
+  private final long statsHandle_;
+
+  public Statistics(long statsHandle) {
+    statsHandle_ = statsHandle;
+  }
+
+  public long getTickerCount(TickerType tickerType) {
+    assert(isInitialized());
+    return getTickerCount0(tickerType.getValue(), statsHandle_);
+  }
+
+  public HistogramData geHistogramData(HistogramType histogramType) {
+    assert(isInitialized());
+    HistogramData hist = geHistogramData0(
+        histogramType.getValue(), statsHandle_);
+    return hist;
+  }
+
+  private boolean isInitialized() {
+    return (statsHandle_ != 0);
+  }
+
+  private native long getTickerCount0(int tickerType, long handle);
+  private native HistogramData geHistogramData0(int histogramType, long handle);
+}
diff --git a/java/org/rocksdb/TableFormatConfig.java b/java/org/rocksdb/TableFormatConfig.java
new file mode 100644
index 0000000000..e5c63411fd
--- /dev/null
+++ b/java/org/rocksdb/TableFormatConfig.java
@@ -0,0 +1,20 @@
+// Copyright (c) 2014, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+package org.rocksdb;
+
+/**
+ * TableFormatConfig is used to configure the internal Table format of
+ * RocksDB.  To make RocksDB use a specific Table format, set its associated
+ * TableFormatConfig properly, pass it into Options via
+ * Options.setTableFormatConfig(), and open the db using that Options.
+ */
+public abstract class TableFormatConfig {
+  /**
+   * This function should only be called by Options.setTableFormatConfig(),
+   * which will create a c++ shared-pointer to the c++ TableFactory
+   * that is associated with the Java TableFormatConfig.
+   */
+  protected abstract long newTableFactoryHandle();
+}
diff --git a/java/org/rocksdb/TickerType.java b/java/org/rocksdb/TickerType.java
new file mode 100644
index 0000000000..5ad714d309
--- /dev/null
+++ b/java/org/rocksdb/TickerType.java
@@ -0,0 +1,123 @@
+// Copyright (c) 2014, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
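+
+// Usage sketch (assumes a Statistics instance has been obtained from the
+// Options that owns it, as noted in Statistics.java above):
+//
+//   long hits = stats.getTickerCount(TickerType.BLOCK_CACHE_HIT);
+//   long misses = stats.getTickerCount(TickerType.BLOCK_CACHE_MISS);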
+
+package org.rocksdb;
+
+public enum TickerType {
+  // total block cache misses
+  // REQUIRES: BLOCK_CACHE_MISS == BLOCK_CACHE_INDEX_MISS +
+  //                               BLOCK_CACHE_FILTER_MISS +
+  //                               BLOCK_CACHE_DATA_MISS;
+  BLOCK_CACHE_MISS(0),
+  // total block cache hit
+  // REQUIRES: BLOCK_CACHE_HIT == BLOCK_CACHE_INDEX_HIT +
+  //                              BLOCK_CACHE_FILTER_HIT +
+  //                              BLOCK_CACHE_DATA_HIT;
+  BLOCK_CACHE_HIT(1),
+  // # of blocks added to block cache.
+  BLOCK_CACHE_ADD(2),
+  // # of times cache miss when accessing index block from block cache.
+  BLOCK_CACHE_INDEX_MISS(3),
+  // # of times cache hit when accessing index block from block cache.
+  BLOCK_CACHE_INDEX_HIT(4),
+  // # of times cache miss when accessing filter block from block cache.
+  BLOCK_CACHE_FILTER_MISS(5),
+  // # of times cache hit when accessing filter block from block cache.
+  BLOCK_CACHE_FILTER_HIT(6),
+  // # of times cache miss when accessing data block from block cache.
+  BLOCK_CACHE_DATA_MISS(7),
+  // # of times cache hit when accessing data block from block cache.
+  BLOCK_CACHE_DATA_HIT(8),
+  // # of times bloom filter has avoided file reads.
+  BLOOM_FILTER_USEFUL(9),
+
+  // # of memtable hits.
+  MEMTABLE_HIT(10),
+  // # of memtable misses.
+  MEMTABLE_MISS(11),
+
+  /**
+   * COMPACTION_KEY_DROP_* count the reasons for key drop during compaction.
+   * There are 3 reasons currently.
+   */
+  COMPACTION_KEY_DROP_NEWER_ENTRY(12),  // key was written with a newer value.
+  COMPACTION_KEY_DROP_OBSOLETE(13),     // the key is obsolete.
+  COMPACTION_KEY_DROP_USER(14),  // user compaction function has dropped the key.
+
+  // Number of keys written to the database via the Put and Write calls.
+  NUMBER_KEYS_WRITTEN(15),
+  // Number of keys read.
+  NUMBER_KEYS_READ(16),
+  // Number of keys updated, if in-place update is enabled.
+  NUMBER_KEYS_UPDATED(17),
+  // Bytes written / read.
+  BYTES_WRITTEN(18),
+  BYTES_READ(19),
+  NO_FILE_CLOSES(20),
+  NO_FILE_OPENS(21),
+  NO_FILE_ERRORS(22),
+  // Time the system had to wait to do L0-L1 compactions.
+  STALL_L0_SLOWDOWN_MICROS(23),
+  // Time the system had to wait to move memtable to L1.
+  STALL_MEMTABLE_COMPACTION_MICROS(24),
+  // Write throttle because of too many files in L0.
+  STALL_L0_NUM_FILES_MICROS(25),
+  RATE_LIMIT_DELAY_MILLIS(26),
+  NO_ITERATORS(27),  // number of iterators currently open
+
+  // Number of MultiGet calls, keys read, and bytes read.
+  NUMBER_MULTIGET_CALLS(28),
+  NUMBER_MULTIGET_KEYS_READ(29),
+  NUMBER_MULTIGET_BYTES_READ(30),
+
+  // Number of delete records that were not required to be
+  // written to storage because the key did not exist.
+  NUMBER_FILTERED_DELETES(31),
+  NUMBER_MERGE_FAILURES(32),
+  SEQUENCE_NUMBER(33),
+
+  // Number of times bloom was checked before creating iterator on a
+  // file, and the number of times the check was useful in avoiding
+  // iterator creation (and thus likely IOPs).
+  BLOOM_FILTER_PREFIX_CHECKED(34),
+  BLOOM_FILTER_PREFIX_USEFUL(35),
+
+  // Number of times we had to reseek inside an iteration to skip
+  // over a large number of keys with the same userkey.
+  NUMBER_OF_RESEEKS_IN_ITERATION(36),
+
+  // Record the number of calls to GetUpdatesSince.  Useful to keep track
+  // of transaction log iterator refreshes.
+  GET_UPDATES_SINCE_CALLS(37),
+  BLOCK_CACHE_COMPRESSED_MISS(38),  // miss in the compressed block cache
+  BLOCK_CACHE_COMPRESSED_HIT(39),   // hit in the compressed block cache
+  WAL_FILE_SYNCED(40),  // Number of times WAL sync is done
+  WAL_FILE_BYTES(41),   // Number of bytes written to WAL
+
+  // Writes can be processed by the requesting thread or by the thread at
+  // the head of the writers queue.
+  WRITE_DONE_BY_SELF(42),
+  WRITE_DONE_BY_OTHER(43),
+  WRITE_WITH_WAL(44),       // Number of Write calls that request WAL
+  COMPACT_READ_BYTES(45),   // Bytes read during compaction
+  COMPACT_WRITE_BYTES(46),  // Bytes written during compaction
+
+  // Number of table properties loaded directly from file, without creating
+  // a table reader object.
+  NUMBER_DIRECT_LOAD_TABLE_PROPERTIES(47),
+  NUMBER_SUPERVERSION_ACQUIRES(48),
+  NUMBER_SUPERVERSION_RELEASES(49),
+  NUMBER_SUPERVERSION_CLEANUPS(50);
+
+  private final int value_;
+
+  private TickerType(int value) {
+    value_ = value;
+  }
+
+  public int getValue() {
+    return value_;
+  }
+}
diff --git a/java/org/rocksdb/VectorMemTableConfig.java b/java/org/rocksdb/VectorMemTableConfig.java
new file mode 100644
index 0000000000..b7a413f195
--- /dev/null
+++ b/java/org/rocksdb/VectorMemTableConfig.java
@@ -0,0 +1,40 @@
+package org.rocksdb;
+
+/**
+ * The config for vector memtable representation.
+ */
+public class VectorMemTableConfig extends MemTableConfig {
+  public static final int DEFAULT_RESERVED_SIZE = 0;
+
+  public VectorMemTableConfig() {
+    reservedSize_ = DEFAULT_RESERVED_SIZE;
+  }
+
+  /**
+   * Set the initial size of the vector that will be used
+   * by the memtable created based on this config.
+   *
+   * @param size the initial size of the vector.
+   * @return the reference to the current config.
+   */
+  public VectorMemTableConfig setReservedSize(int size) {
+    reservedSize_ = size;
+    return this;
+  }
+
+  /**
+   * Returns the initial size of the vector used by the memtable
+   * created based on this config.
+   *
+   * @return the initial size of the vector.
+   */
+  public int reservedSize() {
+    return reservedSize_;
+  }
+
+  @Override protected long newMemTableFactoryHandle() {
+    return newMemTableFactoryHandle(reservedSize_);
+  }
+
+  private native long newMemTableFactoryHandle(long reservedSize);
+  private int reservedSize_;
+}
diff --git a/java/org/rocksdb/WriteBatch.java b/java/org/rocksdb/WriteBatch.java
new file mode 100644
index 0000000000..f538dc1a0b
--- /dev/null
+++ b/java/org/rocksdb/WriteBatch.java
@@ -0,0 +1,112 @@
+// Copyright (c) 2014, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+import java.util.*;
+
+/**
+ * WriteBatch holds a collection of updates to apply atomically to a DB.
+ *
+ * The updates are applied in the order in which they are added
+ * to the WriteBatch.  For example, the value of "key" will be "v3"
+ * after the following batch is written:
+ *
+ *    batch.put("key", "v1");
+ *    batch.remove("key");
+ *    batch.put("key", "v2");
+ *    batch.put("key", "v3");
+ *
+ * Multiple threads can invoke const methods on a WriteBatch without
+ * external synchronization, but if any of the threads may call a
+ * non-const method, all threads accessing the same WriteBatch must use
+ * external synchronization.
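+ *
+ * <p>A batch is applied atomically through RocksDB.write() (sketch;
+ * assumes {@code db} is an open RocksDB instance):
+ * <pre>
+ *   WriteBatch batch = new WriteBatch();
+ *   batch.put("key".getBytes(), "v1".getBytes());
+ *   batch.remove("key".getBytes());
+ *   WriteOptions writeOpts = new WriteOptions();
+ *   db.write(writeOpts, batch);
+ *   writeOpts.dispose();
+ *   batch.dispose();
+ * </pre>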
+ */
+public class WriteBatch extends RocksObject {
+  public WriteBatch() {
+    super();
+    newWriteBatch(0);
+  }
+
+  public WriteBatch(int reserved_bytes) {
+    super();
+    newWriteBatch(reserved_bytes);
+  }
+
+  /**
+   * Returns the number of updates in the batch.
+   */
+  public native int count();
+
+  /**
+   * Store the mapping "key->value" in the database.
+   */
+  public void put(byte[] key, byte[] value) {
+    put(key, key.length, value, value.length);
+  }
+
+  /**
+   * Merge "value" with the existing value of "key" in the database.
+   * "key->merge(existing, value)"
+   */
+  public void merge(byte[] key, byte[] value) {
+    merge(key, key.length, value, value.length);
+  }
+
+  /**
+   * If the database contains a mapping for "key", erase it.  Else do nothing.
+   */
+  public void remove(byte[] key) {
+    remove(key, key.length);
+  }
+
+  /**
+   * Append a blob of arbitrary size to the records in this batch.  The blob
+   * will be stored in the transaction log but not in any other file.  In
+   * particular, it will not be persisted to the SST files.  When iterating
+   * over this WriteBatch, WriteBatch::Handler::LogData will be called with
+   * the contents of the blob as it is encountered.  Blobs, puts, deletes,
+   * and merges will be encountered in the same order in which they were
+   * inserted.  The blob will NOT consume sequence number(s) and will NOT
+   * increase the count of the batch.
+   *
+   * Example application: add timestamps to the transaction log for use in
+   * replication.
+   */
+  public void putLogData(byte[] blob) {
+    putLogData(blob, blob.length);
+  }
+
+  /**
+   * Clear all updates buffered in this batch.
+   */
+  public native void clear();
+
+  /**
+   * Delete the c++ side pointer.
+   */
+  @Override protected void disposeInternal() {
+    assert(isInitialized());
+    disposeInternal(nativeHandle_);
+  }
+
+  private native void newWriteBatch(int reserved_bytes);
+  private native void put(byte[] key, int keyLen,
+                          byte[] value, int valueLen);
+  private native void merge(byte[] key, int keyLen,
+                            byte[] value, int valueLen);
+  private native void remove(byte[] key, int keyLen);
+  private native void putLogData(byte[] blob, int blobLen);
+  private native void disposeInternal(long handle);
+}
+
+/**
+ * Package-private class which provides a Java API to access
+ * the c++ WriteBatchInternal.
+ */
+class WriteBatchInternal {
+  static native void setSequence(WriteBatch batch, long sn);
+  static native long sequence(WriteBatch batch);
+  static native void append(WriteBatch b1, WriteBatch b2);
+}
diff --git a/java/org/rocksdb/WriteBatchTest.java b/java/org/rocksdb/WriteBatchTest.java
new file mode 100644
index 0000000000..03a8663133
--- /dev/null
+++ b/java/org/rocksdb/WriteBatchTest.java
@@ -0,0 +1,124 @@
+// Copyright (c) 2014, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+package org.rocksdb;
+
+import java.util.*;
+import java.io.UnsupportedEncodingException;
+
+/**
+ * This class mimics the db/write_batch_test.cc in the c++ rocksdb library.
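+ *
+ * <p>The tests below rely on Java assertions, so the JVM must be started
+ * with assertions enabled and with the rocksdbjni native library reachable
+ * (the library path shown is illustrative):
+ * <pre>
+ *   java -ea -Djava.library.path=... org.rocksdb.WriteBatchTest
+ * </pre>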
+ */
+public class WriteBatchTest {
+  static {
+    RocksDB.loadLibrary();
+  }
+
+  public static void main(String args[]) {
+    System.out.println("Testing WriteBatchTest.Empty ===");
+    Empty();
+
+    System.out.println("Testing WriteBatchTest.Multiple ===");
+    Multiple();
+
+    System.out.println("Testing WriteBatchTest.Append ===");
+    Append();
+
+    System.out.println("Testing WriteBatchTest.Blob ===");
+    Blob();
+
+    // The following tests have not yet been ported.
+    // Continue();
+    // PutGatherSlices();
+
+    System.out.println("Passed all WriteBatchTest!");
+  }
+
+  static void Empty() {
+    WriteBatch batch = new WriteBatch();
+    assert(batch.count() == 0);
+  }
+
+  static void Multiple() {
+    try {
+      WriteBatch batch = new WriteBatch();
+      batch.put("foo".getBytes("US-ASCII"), "bar".getBytes("US-ASCII"));
+      batch.remove("box".getBytes("US-ASCII"));
+      batch.put("baz".getBytes("US-ASCII"), "boo".getBytes("US-ASCII"));
+      WriteBatchInternal.setSequence(batch, 100);
+      assert(100 == WriteBatchInternal.sequence(batch));
+      assert(3 == batch.count());
+      assert(new String("Put(baz, boo)@102" +
+                        "Delete(box)@101" +
+                        "Put(foo, bar)@100")
+                 .equals(new String(getContents(batch), "US-ASCII")));
+    } catch (UnsupportedEncodingException e) {
+      System.err.println(e);
+      assert(false);
+    }
+  }
+
+  static void Append() {
+    WriteBatch b1 = new WriteBatch();
+    WriteBatch b2 = new WriteBatch();
+    WriteBatchInternal.setSequence(b1, 200);
+    WriteBatchInternal.setSequence(b2, 300);
+    WriteBatchInternal.append(b1, b2);
+    assert(getContents(b1).length == 0);
+    assert(b1.count() == 0);
+    try {
+      b2.put("a".getBytes("US-ASCII"), "va".getBytes("US-ASCII"));
+      WriteBatchInternal.append(b1, b2);
+      assert("Put(a, va)@200".equals(new String(getContents(b1), "US-ASCII")));
+      assert(1 == b1.count());
+      b2.clear();
+      b2.put("b".getBytes("US-ASCII"), "vb".getBytes("US-ASCII"));
+      WriteBatchInternal.append(b1, b2);
+      assert(new String("Put(a, va)@200" +
+                        "Put(b, vb)@201")
+                 .equals(new String(getContents(b1), "US-ASCII")));
+      assert(2 == b1.count());
+      b2.remove("foo".getBytes("US-ASCII"));
+      WriteBatchInternal.append(b1, b2);
+      assert(new String("Put(a, va)@200" +
+                        "Put(b, vb)@202" +
+                        "Put(b, vb)@201" +
+                        "Delete(foo)@203")
+                 .equals(new String(getContents(b1), "US-ASCII")));
+      assert(4 == b1.count());
+    } catch (UnsupportedEncodingException e) {
+      System.err.println(e);
+      assert(false);
+    }
+  }
+
+  static void Blob() {
+    WriteBatch batch = new WriteBatch();
+    try {
+      batch.put("k1".getBytes("US-ASCII"), "v1".getBytes("US-ASCII"));
+      batch.put("k2".getBytes("US-ASCII"), "v2".getBytes("US-ASCII"));
+      batch.put("k3".getBytes("US-ASCII"), "v3".getBytes("US-ASCII"));
+      batch.putLogData("blob1".getBytes("US-ASCII"));
+      batch.remove("k2".getBytes("US-ASCII"));
+      batch.putLogData("blob2".getBytes("US-ASCII"));
+      batch.merge("foo".getBytes("US-ASCII"), "bar".getBytes("US-ASCII"));
+      assert(5 == batch.count());
+      assert(new String("Merge(foo, bar)@4" +
+                        "Put(k1, v1)@0" +
+                        "Delete(k2)@3" +
+                        "Put(k2, v2)@1" +
+                        "Put(k3, v3)@2")
+                 .equals(new String(getContents(batch), "US-ASCII")));
+    } catch (UnsupportedEncodingException e) {
+      System.err.println(e);
+      assert(false);
+    }
+  }
+
+  static native byte[] getContents(WriteBatch batch);
+}
diff --git a/java/org/rocksdb/WriteOptions.java b/java/org/rocksdb/WriteOptions.java
new file mode 100644
index 0000000000..d26dbb918c
--- /dev/null
+++ b/java/org/rocksdb/WriteOptions.java
@@ -0,0 +1,99 @@
+// Copyright (c) 2014, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+/**
+ * Options that control write operations.
+ *
+ * Note that developers should call WriteOptions.dispose() to release the
+ * c++ side memory before a WriteOptions instance goes out of scope.
+ */
+public class WriteOptions extends RocksObject {
+  public WriteOptions() {
+    super();
+    newWriteOptions();
+  }
+
+  @Override protected void disposeInternal() {
+    assert(isInitialized());
+    disposeInternal(nativeHandle_);
+  }
+
+  /**
+   * If true, the write will be flushed from the operating system
+   * buffer cache (by calling WritableFile::Sync()) before the write
+   * is considered complete.  If this flag is true, writes will be
+   * slower.
+   *
+   * If this flag is false, and the machine crashes, some recent
+   * writes may be lost.  Note that if it is just the process that
+   * crashes (i.e., the machine does not reboot), no writes will be
+   * lost even if sync==false.
+   *
+   * In other words, a DB write with sync==false has similar
+   * crash semantics as the "write()" system call.  A DB write
+   * with sync==true has similar crash semantics to a "write()"
+   * system call followed by "fdatasync()".
+   *
+   * Default: false
+   *
+   * @param flag a boolean flag to indicate whether a write
+   *     should be synchronized.
+   * @return the instance of the current WriteOptions.
+   */
+  public WriteOptions setSync(boolean flag) {
+    setSync(nativeHandle_, flag);
+    return this;
+  }
+
+  /**
+   * If true, the write will be flushed from the operating system
+   * buffer cache (by calling WritableFile::Sync()) before the write
+   * is considered complete.  If this flag is true, writes will be
+   * slower.
+   *
+   * If this flag is false, and the machine crashes, some recent
+   * writes may be lost.  Note that if it is just the process that
+   * crashes (i.e., the machine does not reboot), no writes will be
+   * lost even if sync==false.
+   *
+   * In other words, a DB write with sync==false has similar
+   * crash semantics as the "write()" system call.  A DB write
+   * with sync==true has similar crash semantics to a "write()"
+   * system call followed by "fdatasync()".
+   */
+  public boolean sync() {
+    return sync(nativeHandle_);
+  }
+
+  /**
+   * If true, writes will not first go to the write ahead log,
+   * and the write may be lost after a crash.
+   *
+   * @param flag a boolean flag to specify whether to disable
+   *     write-ahead-log on writes.
+   * @return the instance of the current WriteOptions.
+   */
+  public WriteOptions setDisableWAL(boolean flag) {
+    setDisableWAL(nativeHandle_, flag);
+    return this;
+  }
+
+  /**
+   * If true, writes will not first go to the write ahead log,
+   * and the write may be lost after a crash.
+   */
+  public boolean disableWAL() {
+    return disableWAL(nativeHandle_);
+  }
+
+  private native void newWriteOptions();
+  private native void setSync(long handle, boolean flag);
+  private native boolean sync(long handle);
+  private native void setDisableWAL(long handle, boolean flag);
+  private native boolean disableWAL(long handle);
+  private native void disposeInternal(long handle);
+}
diff --git a/java/org/rocksdb/benchmark/DbBenchmark.java b/java/org/rocksdb/benchmark/DbBenchmark.java
new file mode 100644
index 0000000000..c34ae9b0a4
--- /dev/null
+++ b/java/org/rocksdb/benchmark/DbBenchmark.java
@@ -0,0 +1,1590 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+/**
+ * Copyright (C) 2011 the original author or authors.
+ * See the notice.md file distributed with this work for additional
+ * information regarding copyright ownership.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.rocksdb.benchmark;
+
+import java.lang.Runnable;
+import java.io.File;
+import java.nio.ByteBuffer;
+import java.util.Collection;
+import java.util.Date;
+import java.util.EnumMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Random;
+import java.util.Arrays;
+import java.util.ArrayList;
+import java.util.concurrent.Callable;
+import java.util.concurrent.Executors;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Future;
+import java.util.concurrent.TimeUnit;
+import org.rocksdb.*;
+import org.rocksdb.util.SizeUnit;
+
+class Stats {
+  int id_;
+  long start_;
+  long finish_;
+  double seconds_;
+  long done_;
+  long found_;
+  long lastOpTime_;
+  long nextReport_;
+  long bytes_;
+  StringBuilder message_;
+  boolean excludeFromMerge_;
+
+  // TODO(yhchiang): use the following arguments:
+  // (Long)Flag.stats_interval
+  // (Integer)Flag.stats_per_interval
+
+  Stats(int id) {
+    id_ = id;
+    nextReport_ = 100;
+    done_ = 0;
+    bytes_ = 0;
+    seconds_ = 0;
+    start_ = System.nanoTime();
+    lastOpTime_ = start_;
+    finish_ = start_;
+    found_ = 0;
+    message_ = new StringBuilder("");
+    excludeFromMerge_ = false;
+  }
+
+  void merge(final Stats other) {
+    if (other.excludeFromMerge_) {
+      return;
+    }
+
+    done_ += other.done_;
+    found_ += other.found_;
+    bytes_ += other.bytes_;
+    seconds_ += other.seconds_;
+    if (other.start_ < start_) start_ = other.start_;
+    if (other.finish_ > finish_) finish_ = other.finish_;
+
+    // Just keep the messages from one thread
+    if (message_.length() == 0) {
+      message_ = other.message_;
+    }
+  }
+
+  void stop() {
+    finish_ = System.nanoTime();
+    seconds_ = (finish_ - start_) * 1e-9;  // nanoTime deltas are in ns
+  }
+
+  void addMessage(String msg) {
+    if (message_.length() > 0) {
+      message_.append(" ");
+    }
+    message_.append(msg);
+  }
+
+  void setId(int id) { id_ = id; }
+  void setExcludeFromMerge() { excludeFromMerge_ = true; }
+
+  void finishedSingleOp(int bytes) {
+    done_++;
+    lastOpTime_ = System.nanoTime();
+    bytes_ += bytes;
+    if (done_ >= nextReport_) {
+      if (nextReport_ < 1000) {
+        nextReport_ += 100;
+      } else if (nextReport_ < 5000) {
+        nextReport_ += 500;
+      } else if (nextReport_ < 10000) {
+        nextReport_ += 1000;
+      } else if (nextReport_ < 50000) {
+        nextReport_ += 5000;
+      } else if (nextReport_ < 100000) {
+        nextReport_ += 10000;
+      } else if (nextReport_ < 500000) {
+        nextReport_ += 50000;
+      } else {
+        nextReport_ += 100000;
+      }
+      System.err.printf("... Task %s finished %d ops%30s\r", id_, done_, "");
+    }
+  }
+
+  void report(String name) {
+    // Pretend at least one op was done in case we are running a benchmark
+    // that does not call FinishedSingleOp().
+    if (done_ < 1) done_ = 1;
+
+    StringBuilder extra = new StringBuilder("");
+    if (bytes_ > 0) {
+      // Rate is computed on actual elapsed time, not the sum of per-thread
+      // elapsed times.
+      double elapsed = (finish_ - start_) * 1e-9;  // ns -> seconds
+      extra.append(String.format("%6.1f MB/s", (bytes_ / 1048576.0) / elapsed));
+    }
+    extra.append(message_.toString());
+    double elapsed = (finish_ - start_) * 1e-9;  // ns -> seconds
+    double throughput = (double) done_ / elapsed;
+
+    System.out.format("%-12s : %11.3f micros/op %d ops/sec;%s%s\n",
+        name, elapsed * 1e6 / done_,
+        (long) throughput, (extra.length() == 0 ? "" : " "), extra.toString());
+  }
+}
+
+public class DbBenchmark {
+  enum Order {
+    SEQUENTIAL,
+    RANDOM
+  }
+
+  enum DBState {
+    FRESH,
+    EXISTING
+  }
+
+  enum CompressionType {
+    NONE,
+    SNAPPY,
+    ZLIB,
+    BZIP2,
+    LZ4,
+    LZ4HC
+  }
+
+  static {
+    RocksDB.loadLibrary();
+  }
+
+  abstract class BenchmarkTask implements Callable<Stats> {
+    // TODO(yhchiang): use (Integer)Flag.perf_level.
+    public BenchmarkTask(
+        int tid, long randSeed, long numEntries, long keyRange) {
+      tid_ = tid;
+      rand_ = new Random(randSeed + tid * 1000);
+      numEntries_ = numEntries;
+      keyRange_ = keyRange;
+      stats_ = new Stats(tid);
+    }
+
+    @Override public Stats call() throws RocksDBException {
+      stats_.start_ = System.nanoTime();
+      runTask();
+      stats_.finish_ = System.nanoTime();
+      return stats_;
+    }
+
+    protected abstract void runTask() throws RocksDBException;
+
+    protected int tid_;
+    protected Random rand_;
+    protected long numEntries_;
+    protected long keyRange_;
+    protected Stats stats_;
+
+    protected void getFixedKey(byte[] key, long sn) {
+      generateKeyFromLong(key, sn);
+    }
+
+    protected void getRandomKey(byte[] key, long range) {
+      generateKeyFromLong(key, Math.abs(rand_.nextLong() % range));
+    }
+  }
+
+  abstract class WriteTask extends BenchmarkTask {
+    public WriteTask(
+        int tid, long randSeed, long numEntries, long keyRange,
+        WriteOptions writeOpt, long entriesPerBatch) {
+      super(tid, randSeed, numEntries, keyRange);
+      writeOpt_ = writeOpt;
+      entriesPerBatch_ = entriesPerBatch;
+      maxWritesPerSecond_ = -1;
+    }
+
+    public WriteTask(
+        int tid, long randSeed, long numEntries, long keyRange,
+        WriteOptions writeOpt, long entriesPerBatch, long maxWritesPerSecond) {
+      super(tid, randSeed, numEntries, keyRange);
+      writeOpt_ = writeOpt;
+      entriesPerBatch_ = entriesPerBatch;
+      maxWritesPerSecond_ = maxWritesPerSecond;
+    }
+
+    @Override public void runTask() throws RocksDBException {
+      if (numEntries_ != DbBenchmark.this.num_) {
+        stats_.message_.append(String.format(" (%d ops)", numEntries_));
+      }
+      byte[] key = new byte[keySize_];
+      byte[] value = new byte[valueSize_];
+
+      try {
+        if (entriesPerBatch_ == 1) {
+          for (long i = 0; i < numEntries_; ++i) {
+            getKey(key, i, keyRange_);
+            db_.put(writeOpt_, key, DbBenchmark.this.gen_.generate(valueSize_));
+            stats_.finishedSingleOp(keySize_ + valueSize_);
+            writeRateControl(i);
+            if (isFinished()) {
+              return;
+            }
+          }
+        } else {
+          for (long i = 0; i < numEntries_; i += entriesPerBatch_) {
+            WriteBatch batch = new WriteBatch();
+            for (long j = 0; j < entriesPerBatch_; j++) {
+              getKey(key, i + j, keyRange_);
+              batch.put(key, DbBenchmark.this.gen_.generate(valueSize_));
+              stats_.finishedSingleOp(keySize_ + valueSize_);
+            }
+            db_.write(writeOpt_, batch);
+            batch.dispose();
+            writeRateControl(i);
+            if
(isFinished()) { + return; + } + } + } + } catch (InterruptedException e) { + // thread has been terminated. + } + } + + protected void writeRateControl(long writeCount) + throws InterruptedException { + if (maxWritesPerSecond_ <= 0) return; + long minInterval = + writeCount * TimeUnit.SECONDS.toNanos(1) / maxWritesPerSecond_; + long interval = System.nanoTime() - stats_.start_; + if (minInterval - interval > TimeUnit.MILLISECONDS.toNanos(1)) { + TimeUnit.NANOSECONDS.sleep(minInterval - interval); + } + } + + abstract protected void getKey(byte[] key, long id, long range); + protected WriteOptions writeOpt_; + protected long entriesPerBatch_; + protected long maxWritesPerSecond_; + } + + class WriteSequentialTask extends WriteTask { + public WriteSequentialTask( + int tid, long randSeed, long numEntries, long keyRange, + WriteOptions writeOpt, long entriesPerBatch) { + super(tid, randSeed, numEntries, keyRange, + writeOpt, entriesPerBatch); + } + public WriteSequentialTask( + int tid, long randSeed, long numEntries, long keyRange, + WriteOptions writeOpt, long entriesPerBatch, + long maxWritesPerSecond) { + super(tid, randSeed, numEntries, keyRange, + writeOpt, entriesPerBatch, + maxWritesPerSecond); + } + @Override protected void getKey(byte[] key, long id, long range) { + getFixedKey(key, id); + } + } + + class WriteRandomTask extends WriteTask { + public WriteRandomTask( + int tid, long randSeed, long numEntries, long keyRange, + WriteOptions writeOpt, long entriesPerBatch) { + super(tid, randSeed, numEntries, keyRange, + writeOpt, entriesPerBatch); + } + public WriteRandomTask( + int tid, long randSeed, long numEntries, long keyRange, + WriteOptions writeOpt, long entriesPerBatch, + long maxWritesPerSecond) { + super(tid, randSeed, numEntries, keyRange, + writeOpt, entriesPerBatch, + maxWritesPerSecond); + } + @Override protected void getKey(byte[] key, long id, long range) { + getRandomKey(key, range); + } + } + + class WriteUniqueRandomTask extends WriteTask { + static final int MAX_BUFFER_SIZE = 10000000; + public WriteUniqueRandomTask( + int tid, long randSeed, long numEntries, long keyRange, + WriteOptions writeOpt, long entriesPerBatch) { + super(tid, randSeed, numEntries, keyRange, + writeOpt, entriesPerBatch); + initRandomKeySequence(); + } + public WriteUniqueRandomTask( + int tid, long randSeed, long numEntries, long keyRange, + WriteOptions writeOpt, long entriesPerBatch, + long maxWritesPerSecond) { + super(tid, randSeed, numEntries, keyRange, + writeOpt, entriesPerBatch, + maxWritesPerSecond); + initRandomKeySequence(); + } + @Override protected void getKey(byte[] key, long id, long range) { + generateKeyFromLong(key, nextUniqueRandom()); + } + + protected void initRandomKeySequence() { + bufferSize_ = MAX_BUFFER_SIZE; + if (bufferSize_ > keyRange_) { + bufferSize_ = (int) keyRange_; + } + currentKeyCount_ = bufferSize_; + keyBuffer_ = new long[MAX_BUFFER_SIZE]; + for (int k = 0; k < bufferSize_; ++k) { + keyBuffer_[k] = k; + } + } + + /** + * Semi-randomly return the next unique key. It is guaranteed to be + * fully random if keyRange_ <= MAX_BUFFER_SIZE. + */ + long nextUniqueRandom() { + if (bufferSize_ == 0) { + System.err.println("bufferSize_ == 0."); + return 0; + } + int r = rand_.nextInt(bufferSize_); + // randomly pick one from the keyBuffer + long randKey = keyBuffer_[r]; + if (currentKeyCount_ < keyRange_) { + // if we have not yet inserted all keys, insert next new key to [r]. 
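+        // Invariant: keyBuffer_[0, bufferSize_) always holds keys that have
+        // not yet been returned, so each key in [0, keyRange_) is produced
+        // at most once by nextUniqueRandom().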
+        keyBuffer_[r] = currentKeyCount_++;
+      } else {
+        // move the last element to [r] and decrease the size by 1.
+        keyBuffer_[r] = keyBuffer_[--bufferSize_];
+      }
+      return randKey;
+    }
+
+    int bufferSize_;
+    long currentKeyCount_;
+    long[] keyBuffer_;
+  }
+
+  class ReadRandomTask extends BenchmarkTask {
+    public ReadRandomTask(
+        int tid, long randSeed, long numEntries, long keyRange) {
+      super(tid, randSeed, numEntries, keyRange);
+    }
+
+    @Override public void runTask() throws RocksDBException {
+      byte[] key = new byte[keySize_];
+      byte[] value = new byte[valueSize_];
+      for (long i = 0; i < numEntries_; i++) {
+        getRandomKey(key, keyRange_);
+        int len = db_.get(key, value);
+        if (len != RocksDB.NOT_FOUND) {
+          stats_.found_++;
+          stats_.finishedSingleOp(keySize_ + valueSize_);
+        } else {
+          stats_.finishedSingleOp(keySize_);
+        }
+        if (isFinished()) {
+          return;
+        }
+      }
+    }
+  }
+
+  class ReadSequentialTask extends BenchmarkTask {
+    public ReadSequentialTask(
+        int tid, long randSeed, long numEntries, long keyRange) {
+      super(tid, randSeed, numEntries, keyRange);
+    }
+
+    @Override public void runTask() throws RocksDBException {
+      RocksIterator iter = db_.newIterator();
+      long i;
+      for (iter.seekToFirst(), i = 0;
+           iter.isValid() && i < numEntries_;
+           iter.next(), ++i) {
+        stats_.found_++;
+        stats_.finishedSingleOp(iter.key().length + iter.value().length);
+        if (isFinished()) {
+          return;
+        }
+      }
+    }
+  }
+
+  public DbBenchmark(Map<Flag, Object> flags) throws Exception {
+    benchmarks_ = (List<String>) flags.get(Flag.benchmarks);
+    num_ = (Integer) flags.get(Flag.num);
+    threadNum_ = (Integer) flags.get(Flag.threads);
+    reads_ = (Integer) (flags.get(Flag.reads) == null ?
+        flags.get(Flag.num) : flags.get(Flag.reads));
+    keySize_ = (Integer) flags.get(Flag.key_size);
+    valueSize_ = (Integer) flags.get(Flag.value_size);
+    compressionRatio_ = (Double) flags.get(Flag.compression_ratio);
+    useExisting_ = (Boolean) flags.get(Flag.use_existing_db);
+    randSeed_ = (Long) flags.get(Flag.seed);
+    databaseDir_ = (String) flags.get(Flag.db);
+    writesPerSeconds_ = (Integer) flags.get(Flag.writes_per_second);
+    cacheSize_ = (Long) flags.get(Flag.cache_size);
+    memtable_ = (String) flags.get(Flag.memtablerep);
+    maxWriteBufferNumber_ = (Integer) flags.get(Flag.max_write_buffer_number);
+    prefixSize_ = (Integer) flags.get(Flag.prefix_size);
+    keysPerPrefix_ = (Integer) flags.get(Flag.keys_per_prefix);
+    hashBucketCount_ = (Long) flags.get(Flag.hash_bucket_count);
+    usePlainTable_ = (Boolean) flags.get(Flag.use_plain_table);
+    flags_ = flags;
+    finishLock_ = new Object();
+    // options.setPrefixSize((Integer)flags_.get(Flag.prefix_size));
+    // options.setKeysPerPrefix((Long)flags_.get(Flag.keys_per_prefix));
+    compressionType_ = (String) flags.get(Flag.compression_type);
+    compression_ = CompressionType.NONE;
+    try {
+      if (compressionType_.equals("snappy")) {
+        System.loadLibrary("snappy");
+      } else if (compressionType_.equals("zlib")) {
+        System.loadLibrary("zlib");
+      } else if (compressionType_.equals("bzip2")) {
+        System.loadLibrary("bzip2");
+      } else if (compressionType_.equals("lz4")) {
+        System.loadLibrary("lz4");
+      } else if (compressionType_.equals("lz4hc")) {
+        System.loadLibrary("lz4hc");
+      }
+    } catch (UnsatisfiedLinkError e) {
+      System.err.format("Unable to load %s library: %s%n" +
+          "No compression is used.%n",
+          compressionType_, e.toString());
+      compressionType_ = "none";
+      compressionRatio_ = 1.0;
+    }
+    gen_ = new RandomGenerator(randSeed_, compressionRatio_);
+  }
+
+  private void prepareReadOptions(ReadOptions options) {
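+    // Copy the read-path settings (checksum verification and tailing
+    // iterator mode) from the parsed command-line flags into the native
+    // ReadOptions object.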
options.setVerifyChecksums((Boolean)flags_.get(Flag.verify_checksum)); + options.setTailing((Boolean)flags_.get(Flag.use_tailing_iterator)); + } + + private void prepareWriteOptions(WriteOptions options) { + options.setSync((Boolean)flags_.get(Flag.sync)); + options.setDisableWAL((Boolean)flags_.get(Flag.disable_wal)); + } + + private void prepareOptions(Options options) { + options.setCacheSize(cacheSize_); + if (!useExisting_) { + options.setCreateIfMissing(true); + } else { + options.setCreateIfMissing(false); + } + if (memtable_.equals("skip_list")) { + options.setMemTableConfig(new SkipListMemTableConfig()); + } else if (memtable_.equals("vector")) { + options.setMemTableConfig(new VectorMemTableConfig()); + } else if (memtable_.equals("hash_linkedlist")) { + options.setMemTableConfig( + new HashLinkedListMemTableConfig() + .setBucketCount(hashBucketCount_)); + options.useFixedLengthPrefixExtractor(prefixSize_); + } else if (memtable_.equals("hash_skiplist") || + memtable_.equals("prefix_hash")) { + options.setMemTableConfig( + new HashSkipListMemTableConfig() + .setBucketCount(hashBucketCount_)); + options.useFixedLengthPrefixExtractor(prefixSize_); + } else { + System.err.format( + "unable to detect the specified memtable, " + + "use the default memtable factory %s%n", + options.memTableFactoryName()); + } + if (usePlainTable_) { + options.setTableFormatConfig( + new PlainTableConfig().setKeySize(keySize_)); + } + options.setWriteBufferSize( + (Long)flags_.get(Flag.write_buffer_size)); + options.setMaxWriteBufferNumber( + (Integer)flags_.get(Flag.max_write_buffer_number)); + options.setMaxBackgroundCompactions( + (Integer)flags_.get(Flag.max_background_compactions)); + options.setMaxBackgroundFlushes( + (Integer)flags_.get(Flag.max_background_flushes)); + options.setCacheSize( + (Long)flags_.get(Flag.cache_size)); + options.setBlockSize( + (Long)flags_.get(Flag.block_size)); + options.setMaxOpenFiles( + (Integer)flags_.get(Flag.open_files)); + options.setTableCacheRemoveScanCountLimit( + (Integer)flags_.get(Flag.cache_remove_scan_count_limit)); + options.setDisableDataSync( + (Boolean)flags_.get(Flag.disable_data_sync)); + options.setUseFsync( + (Boolean)flags_.get(Flag.use_fsync)); + options.setWalDir( + (String)flags_.get(Flag.wal_dir)); + options.setDisableSeekCompaction( + (Boolean)flags_.get(Flag.disable_seek_compaction)); + options.setDeleteObsoleteFilesPeriodMicros( + (Integer)flags_.get(Flag.delete_obsolete_files_period_micros)); + options.setTableCacheNumshardbits( + (Integer)flags_.get(Flag.table_cache_numshardbits)); + options.setAllowMmapReads( + (Boolean)flags_.get(Flag.mmap_read)); + options.setAllowMmapWrites( + (Boolean)flags_.get(Flag.mmap_write)); + options.setAdviseRandomOnOpen( + (Boolean)flags_.get(Flag.advise_random_on_open)); + options.setUseAdaptiveMutex( + (Boolean)flags_.get(Flag.use_adaptive_mutex)); + options.setBytesPerSync( + (Long)flags_.get(Flag.bytes_per_sync)); + options.setBloomLocality( + (Integer)flags_.get(Flag.bloom_locality)); + options.setMinWriteBufferNumberToMerge( + (Integer)flags_.get(Flag.min_write_buffer_number_to_merge)); + options.setMemtablePrefixBloomBits( + (Integer)flags_.get(Flag.memtable_bloom_bits)); + options.setNumLevels( + (Integer)flags_.get(Flag.num_levels)); + options.setTargetFileSizeBase( + (Integer)flags_.get(Flag.target_file_size_base)); + options.setTargetFileSizeMultiplier( + (Integer)flags_.get(Flag.target_file_size_multiplier)); + options.setMaxBytesForLevelBase( + 
+        (Integer)flags_.get(Flag.max_bytes_for_level_base));
+    options.setMaxBytesForLevelMultiplier(
+        (Integer)flags_.get(Flag.max_bytes_for_level_multiplier));
+    options.setLevelZeroStopWritesTrigger(
+        (Integer)flags_.get(Flag.level0_stop_writes_trigger));
+    options.setLevelZeroSlowdownWritesTrigger(
+        (Integer)flags_.get(Flag.level0_slowdown_writes_trigger));
+    options.setLevelZeroFileNumCompactionTrigger(
+        (Integer)flags_.get(Flag.level0_file_num_compaction_trigger));
+    options.setSoftRateLimit(
+        (Double)flags_.get(Flag.soft_rate_limit));
+    options.setHardRateLimit(
+        (Double)flags_.get(Flag.hard_rate_limit));
+    options.setRateLimitDelayMaxMilliseconds(
+        (Integer)flags_.get(Flag.rate_limit_delay_max_milliseconds));
+    options.setMaxGrandparentOverlapFactor(
+        (Integer)flags_.get(Flag.max_grandparent_overlap_factor));
+    options.setDisableAutoCompactions(
+        (Boolean)flags_.get(Flag.disable_auto_compactions));
+    options.setSourceCompactionFactor(
+        (Integer)flags_.get(Flag.source_compaction_factor));
+    options.setFilterDeletes(
+        (Boolean)flags_.get(Flag.filter_deletes));
+    options.setMaxSuccessiveMerges(
+        (Integer)flags_.get(Flag.max_successive_merges));
+    options.setWalTtlSeconds((Long)flags_.get(Flag.wal_ttl_seconds));
+    options.setWalSizeLimitMB((Long)flags_.get(Flag.wal_size_limit_MB));
+    int bloomBits = (Integer)flags_.get(Flag.bloom_bits);
+    if (bloomBits > 0) {
+      // Internally, options will keep a reference to this BloomFilter.
+      // This prevents Java from garbage-collecting the BloomFilter.  In
+      // addition, options.dispose() will release the c++ object of this
+      // BloomFilter.  As a result, the caller should not directly call
+      // BloomFilter.dispose().
+      options.setFilter(new BloomFilter(bloomBits));
+    }
+    /* TODO(yhchiang): enable the following parameters
+    options.setCompressionType((String)flags_.get(Flag.compression_type));
+    options.setCompressionLevel((Integer)flags_.get(Flag.compression_level));
+    options.setMinLevelToCompress((Integer)flags_.get(Flag.min_level_to_compress));
+    options.setHdfs((String)flags_.get(Flag.hdfs)); // env
+    options.setCacheNumshardbits((Integer)flags_.get(Flag.cache_numshardbits));
+    options.setStatistics((Boolean)flags_.get(Flag.statistics));
+    options.setUniversalSizeRatio(
+        (Integer)flags_.get(Flag.universal_size_ratio));
+    options.setUniversalMinMergeWidth(
+        (Integer)flags_.get(Flag.universal_min_merge_width));
+    options.setUniversalMaxMergeWidth(
+        (Integer)flags_.get(Flag.universal_max_merge_width));
+    options.setUniversalMaxSizeAmplificationPercent(
+        (Integer)flags_.get(Flag.universal_max_size_amplification_percent));
+    options.setUniversalCompressionSizePercent(
+        (Integer)flags_.get(Flag.universal_compression_size_percent));
+    // TODO(yhchiang): add RocksDB.openForReadOnly() to enable Flag.readonly
+    // TODO(yhchiang): enable Flag.merge_operator by switch
+    options.setAccessHintOnCompactionStart(
+        (String)flags_.get(Flag.compaction_fadvice));
+    // available values of fadvice are "NONE", "NORMAL", "SEQUENTIAL",
+    // "WILLNEED" for fadvice
+    */
+  }
+
+  private void run() throws RocksDBException {
+    if (!useExisting_) {
+      destroyDb();
+    }
+    Options options = new Options();
+    prepareOptions(options);
+    open(options);
+
+    printHeader(options);
+
+    for (String benchmark : benchmarks_) {
+      List<Callable<Stats>> tasks = new ArrayList<Callable<Stats>>();
+      List<Callable<Stats>> bgTasks = new ArrayList<Callable<Stats>>();
+      WriteOptions writeOpt = new WriteOptions();
+      prepareWriteOptions(writeOpt);
+      ReadOptions readOpt = new ReadOptions();
+      prepareReadOptions(readOpt);
+      int currentTaskId = 0;
+      boolean known = true;
+
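+      // Dispatch: map each name from --benchmarks to its task(s);
+      // unrecognized names fall through to the error branch below.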
+      if (benchmark.equals("fillseq")) {
+        tasks.add(new WriteSequentialTask(
+            currentTaskId++, randSeed_, num_, num_, writeOpt, 1));
+      } else if (benchmark.equals("fillbatch")) {
+        tasks.add(new WriteRandomTask(
+            currentTaskId++, randSeed_, num_ / 1000, num_, writeOpt, 1000));
+      } else if (benchmark.equals("fillrandom")) {
+        tasks.add(new WriteRandomTask(
+            currentTaskId++, randSeed_, num_, num_, writeOpt, 1));
+      } else if (benchmark.equals("filluniquerandom")) {
+        tasks.add(new WriteUniqueRandomTask(
+            currentTaskId++, randSeed_, num_, num_, writeOpt, 1));
+      } else if (benchmark.equals("fillsync")) {
+        writeOpt.setSync(true);
+        tasks.add(new WriteRandomTask(
+            currentTaskId++, randSeed_, num_ / 1000, num_ / 1000,
+            writeOpt, 1));
+      } else if (benchmark.equals("readseq")) {
+        for (int t = 0; t < threadNum_; ++t) {
+          tasks.add(new ReadSequentialTask(
+              currentTaskId++, randSeed_, reads_ / threadNum_, num_));
+        }
+      } else if (benchmark.equals("readrandom")) {
+        for (int t = 0; t < threadNum_; ++t) {
+          tasks.add(new ReadRandomTask(
+              currentTaskId++, randSeed_, reads_ / threadNum_, num_));
+        }
+      } else if (benchmark.equals("readwhilewriting")) {
+        WriteTask writeTask = new WriteRandomTask(
+            -1, randSeed_, Long.MAX_VALUE, num_, writeOpt, 1, writesPerSeconds_);
+        writeTask.stats_.setExcludeFromMerge();
+        bgTasks.add(writeTask);
+        for (int t = 0; t < threadNum_; ++t) {
+          tasks.add(new ReadRandomTask(
+              currentTaskId++, randSeed_, reads_ / threadNum_, num_));
+        }
+      } else if (benchmark.equals("readhot")) {
+        for (int t = 0; t < threadNum_; ++t) {
+          tasks.add(new ReadRandomTask(
+              currentTaskId++, randSeed_, reads_ / threadNum_, num_ / 100));
+        }
+      } else if (benchmark.equals("delete")) {
+        destroyDb();
+        open(options);
+      } else {
+        known = false;
+        System.err.println("Unknown benchmark: " + benchmark);
+      }
+      if (known) {
+        ExecutorService executor = Executors.newCachedThreadPool();
+        ExecutorService bgExecutor = Executors.newCachedThreadPool();
+        try {
+          // measure only the main executor time
+          List<Future<Stats>> bgResults = new ArrayList<Future<Stats>>();
+          for (Callable<Stats> bgTask : bgTasks) {
+            bgResults.add(bgExecutor.submit(bgTask));
+          }
+          start();
+          List<Future<Stats>> results = executor.invokeAll(tasks);
+          executor.shutdown();
+          boolean finished = executor.awaitTermination(10, TimeUnit.SECONDS);
+          if (!finished) {
+            System.out.format(
+                "Benchmark %s was not finished before timeout.%n",
+                benchmark);
+            executor.shutdownNow();
+          }
+          setFinished(true);
+          bgExecutor.shutdown();
+          finished = bgExecutor.awaitTermination(10, TimeUnit.SECONDS);
+          if (!finished) {
+            System.out.format(
+                "Benchmark %s was not finished before timeout.%n",
+                benchmark);
+            bgExecutor.shutdownNow();
+          }
+
+          stop(benchmark, results, currentTaskId);
+        } catch (InterruptedException e) {
+          System.err.println(e);
+        }
+      }
+      writeOpt.dispose();
+      readOpt.dispose();
+    }
+    options.dispose();
+    db_.close();
+  }
+
+  private void printHeader(Options options) {
+    int kKeySize = 16;
+    System.out.printf("Keys: %d bytes each\n", kKeySize);
+    System.out.printf("Values: %d bytes each (%d bytes after compression)\n",
+        valueSize_,
+        (int) (valueSize_ * compressionRatio_ + 0.5));
+    System.out.printf("Entries: %d\n", num_);
+    System.out.printf("RawSize: %.1f MB (estimated)\n",
+        ((double)(kKeySize + valueSize_) * num_) / SizeUnit.MB);
+    System.out.printf("FileSize: %.1f MB (estimated)\n",
+        (((kKeySize + valueSize_ * compressionRatio_) * num_) / SizeUnit.MB));
+    System.out.format("Memtable Factory: %s%n", options.memTableFactoryName());
+    System.out.format("Prefix: %d bytes%n", prefixSize_);
+    System.out.format("Compression: %s%n", compressionType_);
+    printWarnings();
+    System.out.printf("------------------------------------------------\n");
+  }
+
+  void printWarnings() {
+    boolean assertsEnabled = false;
+    assert assertsEnabled = true; // Intentional side effect!!!
+    if (assertsEnabled) {
+      System.out.printf(
+          "WARNING: Assertions are enabled; benchmarks unnecessarily slow\n");
+    }
+  }
+
+  private void open(Options options) throws RocksDBException {
+    db_ = RocksDB.open(options, databaseDir_);
+  }
+
+  private void start() {
+    setFinished(false);
+    startTime_ = System.nanoTime();
+  }
+
+  private void stop(
+      String benchmark, List<Future<Stats>> results, int concurrentThreads) {
+    long endTime = System.nanoTime();
+    double elapsedSeconds =
+        1.0d * (endTime - startTime_) / TimeUnit.SECONDS.toNanos(1);
+
+    Stats stats = new Stats(-1);
+    int taskFinishedCount = 0;
+    for (Future<Stats> result : results) {
+      if (result.isDone()) {
+        try {
+          Stats taskStats = result.get(3, TimeUnit.SECONDS);
+          if (!result.isCancelled()) {
+            taskFinishedCount++;
+          }
+          stats.merge(taskStats);
+        } catch (Exception e) {
+          // the task was not successful; the finished count in the
+          // output will indicate this
+        }
+      }
+    }
+    String extra = "";
+    if (benchmark.indexOf("read") >= 0) {
+      extra = String.format(" %d / %d found; ", stats.found_, stats.done_);
+    } else {
+      extra = String.format(" %d ops done; ", stats.done_);
+    }
+
+    System.out.printf(
+        "%-16s : %11.5f micros/op; %6.1f MB/s;%s %d / %d task(s) finished.\n",
+        benchmark, (double) elapsedSeconds / stats.done_ * 1e6,
+        (stats.bytes_ / 1048576.0) / elapsedSeconds, extra,
+        taskFinishedCount, concurrentThreads);
+  }
+
+  public void generateKeyFromLong(byte[] slice, long n) {
+    assert(n >= 0);
+    int startPos = 0;
+
+    if (keysPerPrefix_ > 0) {
+      long numPrefix = (num_ + keysPerPrefix_ - 1) / keysPerPrefix_;
+      long prefix = n % numPrefix;
+      int bytesToFill = Math.min(prefixSize_, 8);
+      for (int i = 0; i < bytesToFill; ++i) {
+        slice[i] = (byte) (prefix % 256);
+        prefix /= 256;
+      }
+      // pad out prefixes longer than eight bytes with '0'
+      for (int i = bytesToFill; i < prefixSize_; ++i) {
+        slice[i] = '0';
+      }
+      startPos = prefixSize_;
+    }
+
+    for (int i = slice.length - 1; i >= startPos; --i) {
+      slice[i] = (byte) ('0' + (n % 10));
+      n /= 10;
+    }
+  }
+
+  private void destroyDb() {
+    if (db_ != null) {
+      db_.close();
+    }
+    // TODO(yhchiang): develop our own FileUtil
+    // FileUtil.deleteDir(databaseDir_);
+  }
+
+  private void printStats() {
+  }
+
+  static void printHelp() {
+    System.out.println("usage:");
+    for (Flag flag : Flag.values()) {
+      System.out.format("  --%s%n\t%s%n",
+          flag.name(),
+          flag.desc());
+      if (flag.getDefaultValue() != null) {
+        System.out.format("\tDEFAULT: %s%n",
+            flag.getDefaultValue().toString());
+      }
+    }
+  }
+
+  public static void main(String[] args) throws Exception {
+    Map<Flag, Object> flags = new EnumMap<Flag, Object>(Flag.class);
+    for (Flag flag : Flag.values()) {
+      if (flag.getDefaultValue() != null) {
+        flags.put(flag, flag.getDefaultValue());
+      }
+    }
+    for (String arg : args) {
+      boolean valid = false;
+      if (arg.equals("--help") || arg.equals("-h")) {
+        printHelp();
+        System.exit(0);
+      }
+      if (arg.startsWith("--")) {
+        try {
+          String[] parts = arg.substring(2).split("=");
+          if (parts.length >= 1) {
+            Flag key = Flag.valueOf(parts[0]);
+            if (key != null) {
+              Object value = null;
+              if (parts.length >= 2) {
+                value = key.parseValue(parts[1]);
+              }
+              flags.put(key, value);
+              valid = true;
+            }
+          }
+        }
+        catch (Exception e) {
+        }
+      }
+      if (!valid) {
+        System.err.println("Invalid argument " + arg);
+        System.exit(1);
+      }
+    }
+    new DbBenchmark(flags).run();
+  }
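+
+  // Flags use the gflags-style "--name=value" syntax; each Flag constant
+  // below converts the raw string with its own parseValue() override.
+  // A minimal invocation sketch (the flag values are made-up examples,
+  // assuming the benchmark class and native library are on the class path):
+  //
+  //   java DbBenchmark --benchmarks=fillseq,readrandom \
+  //       --num=100000 --value_size=100 --bloom_bits=10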
+
+  private enum Flag {
+    benchmarks(
+        Arrays.asList(
+            "fillseq",
+            "readrandom",
+            "fillrandom"),
+        "Comma-separated list of operations to run in the specified order\n" +
+        "\tActual benchmarks:\n" +
+        "\t\tfillseq -- write N values in sequential key order in async mode.\n" +
+        "\t\tfillrandom -- write N values in random key order in async mode.\n" +
+        "\t\tfillbatch -- write N/1000 batch where each batch has 1000 values\n" +
+        "\t\t in random key order in sync mode.\n" +
+        "\t\tfillsync -- write N/100 values in random key order in sync mode.\n" +
+        "\t\tfill100K -- write N/1000 100K values in random order in async mode.\n" +
+        "\t\treadseq -- read N times sequentially.\n" +
+        "\t\treadrandom -- read N times in random order.\n" +
+        "\t\treadhot -- read N times in random order from 1% section of DB.\n" +
+        "\t\treadwhilewriting -- measure the read performance of multiple readers\n" +
+        "\t\t with a single bg writer. The write rate of the bg writer\n" +
+        "\t\t is capped by --writes_per_second.\n" +
+        "\tMeta Operations:\n" +
+        "\t\tdelete -- delete DB") {
+      @Override public Object parseValue(String value) {
+        return new ArrayList<String>(Arrays.asList(value.split(",")));
+      }
+    },
+    compression_ratio(0.5d,
+        "Arrange to generate values that shrink to this fraction of\n" +
+        "\ttheir original size after compression.") {
+      @Override public Object parseValue(String value) {
+        return Double.parseDouble(value);
+      }
+    },
+    use_existing_db(false,
+        "If true, do not destroy the existing database. If you set this\n" +
+        "\tflag and also specify a benchmark that wants a fresh database,\n" +
+        "\tthat benchmark will fail.") {
+      @Override public Object parseValue(String value) {
+        return parseBoolean(value);
+      }
+    },
+    num(1000000,
+        "Number of key/values to place in database.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    threads(1,
+        "Number of concurrent threads to run.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    reads(null,
+        "Number of read operations to do. If negative, do --num reads.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    key_size(16,
+        "The size of each key in bytes.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    value_size(100,
+        "The size of each value in bytes.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    write_buffer_size(4 * SizeUnit.MB,
+        "Number of bytes to buffer in memtable before compacting\n" +
+        "\t(initialized to default value by 'main'.)") {
+      @Override public Object parseValue(String value) {
+        return Long.parseLong(value);
+      }
+    },
+    max_write_buffer_number(2,
+        "The number of in-memory memtables. Each memtable is of size\n" +
+        "\twrite_buffer_size.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    prefix_size(0, "Controls the prefix size for HashSkipList, HashLinkedList,\n" +
+        "\tand plain table.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    keys_per_prefix(0, "Controls the average number of keys generated\n" +
+        "\tper prefix, 0 means no special handling of the prefix,\n" +
+        "\ti.e., use the prefix that comes with the generated random number.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    memtablerep("skip_list",
+        "The memtable format. 
Available options are\n" + + "\tskip_list,\n" + + "\tvector,\n" + + "\thash_linkedlist,\n" + + "\thash_skiplist (prefix_hash.)") { + @Override public Object parseValue(String value) { + return value; + } + }, + hash_bucket_count(SizeUnit.MB, + "The number of hash buckets used in the hash-bucket-based\n" + + "\tmemtables. Memtables that currently support this argument are\n" + + "\thash_linkedlist and hash_skiplist.") { + @Override public Object parseValue(String value) { + return Long.parseLong(value); + } + }, + writes_per_second(10000, + "The write-rate of the background writer used in the\n" + + "\t`readwhilewriting` benchmark. Non-positive number indicates\n" + + "\tusing an unbounded write-rate in `readwhilewriting` benchmark.") { + @Override public Object parseValue(String value) { + return Integer.parseInt(value); + } + }, + use_plain_table(false, + "Use plain-table sst format.") { + @Override public Object parseValue(String value) { + return parseBoolean(value); + } + }, + cache_size(-1L, + "Number of bytes to use as a cache of uncompressed data.\n" + + "\tNegative means use default settings.") { + @Override public Object parseValue(String value) { + return Long.parseLong(value); + } + }, + seed(0L, + "Seed base for random number generators.") { + @Override public Object parseValue(String value) { + return Long.parseLong(value); + } + }, + num_levels(7, + "The total number of levels.") { + @Override public Object parseValue(String value) { + return Integer.parseInt(value); + } + }, + numdistinct(1000, + "Number of distinct keys to use. Used in RandomWithVerify to\n" + + "\tread/write on fewer keys so that gets are more likely to find the\n" + + "\tkey and puts are more likely to update the same key.") { + @Override public Object parseValue(String value) { + return Long.parseLong(value); + } + }, + merge_keys(-1, + "Number of distinct keys to use for MergeRandom and\n" + + "\tReadRandomMergeRandom.\n" + + "\tIf negative, there will be FLAGS_num keys.") { + @Override public Object parseValue(String value) { + return Long.parseLong(value); + } + }, + bloom_locality(0,"Control bloom filter probes locality.") { + @Override public Object parseValue(String value) { + return Integer.parseInt(value); + } + }, + duration(0,"Time in seconds for the random-ops tests to run.\n" + + "\tWhen 0 then num & reads determine the test duration.") { + @Override public Object parseValue(String value) { + return Integer.parseInt(value); + } + }, + num_multi_db(0, + "Number of DBs used in the benchmark. 0 means single DB.") { + @Override public Object parseValue(String value) { + return Integer.parseInt(value); + } + }, + histogram(false,"Print histogram of operation timings.") { + @Override public Object parseValue(String value) { + return parseBoolean(value); + } + }, + min_write_buffer_number_to_merge( + defaultOptions_.minWriteBufferNumberToMerge(), + "The minimum number of write buffers that will be merged together\n" + + "\tbefore writing to storage. This is cheap because it is an\n" + + "\tin-memory merge. If this feature is not enabled, then all these\n" + + "\twrite buffers are flushed to L0 as separate files and this\n" + + "\tincreases read amplification because a get request has to check\n" + + "\tin all of these files. 
Also, an in-memory merge may result in\n" + + "\twriting less data to storage if there are duplicate records\n" + + "\tin each of these individual write buffers.") { + @Override public Object parseValue(String value) { + return Integer.parseInt(value); + } + }, + max_background_compactions( + defaultOptions_.maxBackgroundCompactions(), + "The maximum number of concurrent background compactions\n" + + "\tthat can occur in parallel.") { + @Override public Object parseValue(String value) { + return Integer.parseInt(value); + } + }, + max_background_flushes( + defaultOptions_.maxBackgroundFlushes(), + "The maximum number of concurrent background flushes\n" + + "\tthat can occur in parallel.") { + @Override public Object parseValue(String value) { + return Integer.parseInt(value); + } + }, + /* TODO(yhchiang): enable the following + compaction_style((int32_t) defaultOptions_.compactionStyle(), + "style of compaction: level-based vs universal.") { + @Override public Object parseValue(String value) { + return Integer.parseInt(value); + } + },*/ + universal_size_ratio(0, + "Percentage flexibility while comparing file size\n" + + "\t(for universal compaction only).") { + @Override public Object parseValue(String value) { + return Integer.parseInt(value); + } + }, + universal_min_merge_width(0,"The minimum number of files in a\n" + + "\tsingle compaction run (for universal compaction only).") { + @Override public Object parseValue(String value) { + return Integer.parseInt(value); + } + }, + universal_max_merge_width(0,"The max number of files to compact\n" + + "\tin universal style compaction.") { + @Override public Object parseValue(String value) { + return Integer.parseInt(value); + } + }, + universal_max_size_amplification_percent(0, + "The max size amplification for universal style compaction.") { + @Override public Object parseValue(String value) { + return Integer.parseInt(value); + } + }, + universal_compression_size_percent(-1, + "The percentage of the database to compress for universal\n" + + "\tcompaction. -1 means compress everything.") { + @Override public Object parseValue(String value) { + return Integer.parseInt(value); + } + }, + block_size(defaultOptions_.blockSize(), + "Number of bytes in a block.") { + @Override public Object parseValue(String value) { + return Long.parseLong(value); + } + }, + compressed_cache_size(-1, + "Number of bytes to use as a cache of compressed data.") { + @Override public Object parseValue(String value) { + return Long.parseLong(value); + } + }, + open_files(defaultOptions_.maxOpenFiles(), + "Maximum number of files to keep open at the same time\n" + + "\t(use default if == 0)") { + @Override public Object parseValue(String value) { + return Integer.parseInt(value); + } + }, + bloom_bits(-1,"Bloom filter bits per key. Negative means\n" + + "\tuse default settings.") { + @Override public Object parseValue(String value) { + return Integer.parseInt(value); + } + }, + memtable_bloom_bits(0,"Bloom filter bits per key for memtable.\n" + + "\tNegative means no bloom filter.") { + @Override public Object parseValue(String value) { + return Integer.parseInt(value); + } + }, + cache_numshardbits(-1,"Number of shards for the block cache\n" + + "\tis 2 ** cache_numshardbits. 
Negative means use default settings.\n" + + "\tThis is applied only if FLAGS_cache_size is non-negative.") { + @Override public Object parseValue(String value) { + return Integer.parseInt(value); + } + }, + cache_remove_scan_count_limit(32,"") { + @Override public Object parseValue(String value) { + return Integer.parseInt(value); + } + }, + verify_checksum(false,"Verify checksum for every block read\n" + + "\tfrom storage.") { + @Override public Object parseValue(String value) { + return parseBoolean(value); + } + }, + statistics(false,"Database statistics.") { + @Override public Object parseValue(String value) { + return parseBoolean(value); + } + }, + writes(-1,"Number of write operations to do. If negative, do\n" + + "\t--num reads.") { + @Override public Object parseValue(String value) { + return Long.parseLong(value); + } + }, + sync(false,"Sync all writes to disk.") { + @Override public Object parseValue(String value) { + return parseBoolean(value); + } + }, + disable_data_sync(false,"If true, do not wait until data is\n" + + "\tsynced to disk.") { + @Override public Object parseValue(String value) { + return parseBoolean(value); + } + }, + use_fsync(false,"If true, issue fsync instead of fdatasync.") { + @Override public Object parseValue(String value) { + return parseBoolean(value); + } + }, + disable_wal(false,"If true, do not write WAL for write.") { + @Override public Object parseValue(String value) { + return parseBoolean(value); + } + }, + wal_dir("", "If not empty, use the given dir for WAL.") { + @Override public Object parseValue(String value) { + return value; + } + }, + target_file_size_base(2 * 1048576,"Target file size at level-1") { + @Override public Object parseValue(String value) { + return Integer.parseInt(value); + } + }, + target_file_size_multiplier(1, + "A multiplier to compute target level-N file size (N >= 2)") { + @Override public Object parseValue(String value) { + return Integer.parseInt(value); + } + }, + max_bytes_for_level_base(10 * 1048576, + "Max bytes for level-1") { + @Override public Object parseValue(String value) { + return Integer.parseInt(value); + } + }, + max_bytes_for_level_multiplier(10, + "A multiplier to compute max bytes for level-N (N >= 2)") { + @Override public Object parseValue(String value) { + return Integer.parseInt(value); + } + }, + level0_stop_writes_trigger(12,"Number of files in level-0\n" + + "\tthat will trigger put stop.") { + @Override public Object parseValue(String value) { + return Integer.parseInt(value); + } + }, + level0_slowdown_writes_trigger(8,"Number of files in level-0\n" + + "\tthat will slow down writes.") { + @Override public Object parseValue(String value) { + return Integer.parseInt(value); + } + }, + level0_file_num_compaction_trigger(4,"Number of files in level-0\n" + + "\twhen compactions start.") { + @Override public Object parseValue(String value) { + return Integer.parseInt(value); + } + }, + readwritepercent(90,"Ratio of reads to reads/writes (expressed\n" + + "\tas percentage) for the ReadRandomWriteRandom workload. The\n" + + "\tdefault value 90 means 90% operations out of all reads and writes\n" + + "\toperations are reads. In other words, 9 gets for every 1 put.") { + @Override public Object parseValue(String value) { + return Integer.parseInt(value); + } + }, + mergereadpercent(70,"Ratio of merges to merges&reads (expressed\n" + + "\tas percentage) for the ReadRandomMergeRandom workload. The\n" + + "\tdefault value 70 means 70% out of all read and merge operations\n" + + "\tare merges. 
In other words, 7 merges for every 3 gets.") { + @Override public Object parseValue(String value) { + return Integer.parseInt(value); + } + }, + deletepercent(2,"Percentage of deletes out of reads/writes/\n" + + "\tdeletes (used in RandomWithVerify only). RandomWithVerify\n" + + "\tcalculates writepercent as (100 - FLAGS_readwritepercent -\n" + + "\tdeletepercent), so deletepercent must be smaller than (100 -\n" + + "\tFLAGS_readwritepercent)") { + @Override public Object parseValue(String value) { + return Integer.parseInt(value); + } + }, + disable_seek_compaction(false,"Option to disable compaction\n" + + "\ttriggered by read.") { + @Override public Object parseValue(String value) { + return parseBoolean(value); + } + }, + delete_obsolete_files_period_micros(0,"Option to delete\n" + + "\tobsolete files periodically. 0 means that obsolete files are\n" + + "\tdeleted after every compaction run.") { + @Override public Object parseValue(String value) { + return Integer.parseInt(value); + } + }, + compression_type("snappy", + "Algorithm used to compress the database.") { + @Override public Object parseValue(String value) { + return value; + } + }, + compression_level(-1, + "Compression level. For zlib this should be -1 for the\n" + + "\tdefault level, or between 0 and 9.") { + @Override public Object parseValue(String value) { + return Integer.parseInt(value); + } + }, + min_level_to_compress(-1,"If non-negative, compression starts\n" + + "\tfrom this level. Levels with number < min_level_to_compress are\n" + + "\tnot compressed. Otherwise, apply compression_type to\n" + + "\tall levels.") { + @Override public Object parseValue(String value) { + return Integer.parseInt(value); + } + }, + table_cache_numshardbits(4,"") { + @Override public Object parseValue(String value) { + return Integer.parseInt(value); + } + }, + stats_interval(0,"Stats are reported every N operations when\n" + + "\tthis is greater than zero. 
When 0 the interval grows over time.") {
+      @Override public Object parseValue(String value) {
+        return Long.parseLong(value);
+      }
+    },
+    stats_per_interval(0, "Reports additional stats per interval when\n" +
+        "\tthis is greater than 0.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    perf_level(0, "Level of perf collection.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    soft_rate_limit(0.0, "") {
+      @Override public Object parseValue(String value) {
+        return Double.parseDouble(value);
+      }
+    },
+    hard_rate_limit(0.0, "When not equal to 0 this makes threads\n" +
+        "\tsleep at each stats reporting interval until the compaction\n" +
+        "\tscore for all levels is less than or equal to this value.") {
+      @Override public Object parseValue(String value) {
+        return Double.parseDouble(value);
+      }
+    },
+    rate_limit_delay_max_milliseconds(1000,
+        "When hard_rate_limit is set then this is the max time a put will\n" +
+        "\tbe stalled.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    max_grandparent_overlap_factor(10, "Control maximum bytes of\n" +
+        "\toverlaps in grandparent (i.e., level+2) before we stop building a\n" +
+        "\tsingle file in a level->level+1 compaction.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    readonly(false, "Run read only benchmarks.") {
+      @Override public Object parseValue(String value) {
+        return parseBoolean(value);
+      }
+    },
+    disable_auto_compactions(false, "Do not auto trigger compactions.") {
+      @Override public Object parseValue(String value) {
+        return parseBoolean(value);
+      }
+    },
+    source_compaction_factor(1, "Cap the size of data in level-K for\n" +
+        "\ta compaction run that compacts Level-K with Level-(K+1) (for\n" +
+        "\tK >= 1)") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    wal_ttl_seconds(0L, "Set the TTL for the WAL Files in seconds.") {
+      @Override public Object parseValue(String value) {
+        return Long.parseLong(value);
+      }
+    },
+    wal_size_limit_MB(0L, "Set the size limit for the WAL Files\n" +
+        "\tin MB.") {
+      @Override public Object parseValue(String value) {
+        return Long.parseLong(value);
+      }
+    },
+    /* TODO(yhchiang): enable the following
+    bufferedio(rocksdb::EnvOptions().use_os_buffer,
+        "Allow buffered io using OS buffers.") {
+      @Override public Object parseValue(String value) {
+        return parseBoolean(value);
+      }
+    },
+    */
+    mmap_read(false,
+        "Allow reads to occur via mmap-ing files.") {
+      @Override public Object parseValue(String value) {
+        return parseBoolean(value);
+      }
+    },
+    mmap_write(false,
+        "Allow writes to occur via mmap-ing files.") {
+      @Override public Object parseValue(String value) {
+        return parseBoolean(value);
+      }
+    },
+    advise_random_on_open(defaultOptions_.adviseRandomOnOpen(),
+        "Advise random access on table file open.") {
+      @Override public Object parseValue(String value) {
+        return parseBoolean(value);
+      }
+    },
+    compaction_fadvice("NORMAL",
+        "Access pattern advice when a file is compacted.") {
+      @Override public Object parseValue(String value) {
+        return value;
+      }
+    },
+    use_tailing_iterator(false,
+        "Use tailing iterator to access a series of keys instead of get.") {
+      @Override public Object parseValue(String value) {
+        return parseBoolean(value);
+      }
+    },
+    use_adaptive_mutex(defaultOptions_.useAdaptiveMutex(),
+        "Use adaptive mutex.") {
+      @Override public Object parseValue(String value) {
+        return parseBoolean(value);
+      }
+    },
+    bytes_per_sync(defaultOptions_.bytesPerSync(),
+        "Allows OS to incrementally sync files to disk while they are\n" +
+        "\tbeing written, in the background. Issue one request for every\n" +
+        "\tbytes_per_sync written. 0 turns it off.") {
+      @Override public Object parseValue(String value) {
+        return Long.parseLong(value);
+      }
+    },
+    filter_deletes(false, "If true, deletes use the bloom filter and drop\n" +
+        "\tthe delete if the key is not present.") {
+      @Override public Object parseValue(String value) {
+        return parseBoolean(value);
+      }
+    },
+    max_successive_merges(0, "Maximum number of successive merge\n" +
+        "\toperations on a key in the memtable.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    db("/tmp/rocksdbjni-bench",
+        "Use the db with the following name.") {
+      @Override public Object parseValue(String value) {
+        return value;
+      }
+    };
+
+    private Flag(Object defaultValue, String desc) {
+      defaultValue_ = defaultValue;
+      desc_ = desc;
+    }
+
+    public Object getDefaultValue() {
+      return defaultValue_;
+    }
+
+    public String desc() {
+      return desc_;
+    }
+
+    public boolean parseBoolean(String value) {
+      if (value.equals("1")) {
+        return true;
+      } else if (value.equals("0")) {
+        return false;
+      }
+      return Boolean.parseBoolean(value);
+    }
+
+    protected abstract Object parseValue(String value);
+
+    private final Object defaultValue_;
+    private final String desc_;
+  }
+
+  private static class RandomGenerator {
+    private final byte[] data_;
+    private int dataLength_;
+    private int position_;
+    Random rand_;
+
+    private RandomGenerator(long seed, double compressionRatio) {
+      // We use a limited amount of data over and over again and ensure
+      // that it is larger than the compression window (32KB), and also
+      // large enough to serve all typical value sizes we want to write.
+      rand_ = new Random(seed);
+      dataLength_ = 1048576 + 100;
+      data_ = new byte[dataLength_];
+      // TODO(yhchiang): mimic test::CompressibleString?
+      for (int i = 0; i < dataLength_; ++i) {
+        data_[i] = (byte) (' ' + rand_.nextInt(95));
+      }
+    }
+
+    private byte[] generate(int length) {
+      position_ = rand_.nextInt(data_.length - length);
+      return Arrays.copyOfRange(data_, position_, position_ + length);
+    }
+  }
+
+  boolean isFinished() {
+    synchronized(finishLock_) {
+      return isFinished_;
+    }
+  }
+
+  void setFinished(boolean flag) {
+    synchronized(finishLock_) {
+      isFinished_ = flag;
+    }
+  }
+
+  RocksDB db_;
+  final List<String> benchmarks_;
+  final int num_;
+  final int reads_;
+  final int keySize_;
+  final int valueSize_;
+  final int threadNum_;
+  final int writesPerSeconds_;
+  final long randSeed_;
+  final long cacheSize_;
+  final boolean useExisting_;
+  final String databaseDir_;
+  double compressionRatio_;
+  RandomGenerator gen_;
+  long startTime_;
+
+  // memtable related
+  final int maxWriteBufferNumber_;
+  final int prefixSize_;
+  final int keysPerPrefix_;
+  final String memtable_;
+  final long hashBucketCount_;
+
+  // sst format related
+  boolean usePlainTable_;
+
+  Object finishLock_;
+  boolean isFinished_;
+  Map<Flag, Object> flags_;
+  // as the scope of a static member equals the scope of the program,
+  // we let its c++ pointer be disposed in its finalizer.
+ static Options defaultOptions_ = new Options(); + String compressionType_; + CompressionType compression_; +} diff --git a/java/org/rocksdb/test/BackupableDBTest.java b/java/org/rocksdb/test/BackupableDBTest.java new file mode 100644 index 0000000000..f0fc3d5019 --- /dev/null +++ b/java/org/rocksdb/test/BackupableDBTest.java @@ -0,0 +1,41 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb.test; + +import org.rocksdb.*; + +public class BackupableDBTest { + static final String db_path = "/tmp/backupablejni_db"; + static final String backup_path = "/tmp/backupablejni_db_backup"; + static { + RocksDB.loadLibrary(); + } + public static void main(String[] args) { + + Options opt = new Options(); + opt.setCreateIfMissing(true); + + BackupableDBOptions bopt = new BackupableDBOptions(backup_path); + BackupableDB bdb = null; + + try { + bdb = BackupableDB.open(opt, bopt, db_path); + bdb.put("hello".getBytes(), "BackupableDB".getBytes()); + bdb.createNewBackup(true); + byte[] value = bdb.get("hello".getBytes()); + assert(new String(value).equals("BackupableDB")); + } catch (RocksDBException e) { + System.err.format("[ERROR]: %s%n", e); + e.printStackTrace(); + } finally { + opt.dispose(); + bopt.dispose(); + if (bdb != null) { + bdb.close(); + } + } + } +} diff --git a/java/org/rocksdb/test/OptionsTest.java b/java/org/rocksdb/test/OptionsTest.java new file mode 100644 index 0000000000..e1e0e059e1 --- /dev/null +++ b/java/org/rocksdb/test/OptionsTest.java @@ -0,0 +1,424 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+ +package org.rocksdb.test; + +import java.util.Random; +import org.rocksdb.RocksDB; +import org.rocksdb.Options; + +public class OptionsTest { + static { + RocksDB.loadLibrary(); + } + public static void main(String[] args) { + Options opt = new Options(); + Random rand = new Random(); + { // CreateIfMissing test + boolean boolValue = rand.nextBoolean(); + opt.setCreateIfMissing(boolValue); + assert(opt.createIfMissing() == boolValue); + } + + { // ErrorIfExists test + boolean boolValue = rand.nextBoolean(); + opt.setErrorIfExists(boolValue); + assert(opt.errorIfExists() == boolValue); + } + + { // ParanoidChecks test + boolean boolValue = rand.nextBoolean(); + opt.setParanoidChecks(boolValue); + assert(opt.paranoidChecks() == boolValue); + } + + { // MaxOpenFiles test + int intValue = rand.nextInt(); + opt.setMaxOpenFiles(intValue); + assert(opt.maxOpenFiles() == intValue); + } + + { // DisableDataSync test + boolean boolValue = rand.nextBoolean(); + opt.setDisableDataSync(boolValue); + assert(opt.disableDataSync() == boolValue); + } + + { // UseFsync test + boolean boolValue = rand.nextBoolean(); + opt.setUseFsync(boolValue); + assert(opt.useFsync() == boolValue); + } + + { // DbStatsLogInterval test + int intValue = rand.nextInt(); + opt.setDbStatsLogInterval(intValue); + assert(opt.dbStatsLogInterval() == intValue); + } + + { // DbLogDir test + String str = "path/to/DbLogDir"; + opt.setDbLogDir(str); + assert(opt.dbLogDir().equals(str)); + } + + { // WalDir test + String str = "path/to/WalDir"; + opt.setWalDir(str); + assert(opt.walDir().equals(str)); + } + + { // DeleteObsoleteFilesPeriodMicros test + long longValue = rand.nextLong(); + opt.setDeleteObsoleteFilesPeriodMicros(longValue); + assert(opt.deleteObsoleteFilesPeriodMicros() == longValue); + } + + { // MaxBackgroundCompactions test + int intValue = rand.nextInt(); + opt.setMaxBackgroundCompactions(intValue); + assert(opt.maxBackgroundCompactions() == intValue); + } + + { // MaxBackgroundFlushes test + int intValue = rand.nextInt(); + opt.setMaxBackgroundFlushes(intValue); + assert(opt.maxBackgroundFlushes() == intValue); + } + + { // MaxLogFileSize test + long longValue = rand.nextLong(); + opt.setMaxLogFileSize(longValue); + assert(opt.maxLogFileSize() == longValue); + } + + { // LogFileTimeToRoll test + long longValue = rand.nextLong(); + opt.setLogFileTimeToRoll(longValue); + assert(opt.logFileTimeToRoll() == longValue); + } + + { // KeepLogFileNum test + long longValue = rand.nextLong(); + opt.setKeepLogFileNum(longValue); + assert(opt.keepLogFileNum() == longValue); + } + + { // MaxManifestFileSize test + long longValue = rand.nextLong(); + opt.setMaxManifestFileSize(longValue); + assert(opt.maxManifestFileSize() == longValue); + } + + { // TableCacheNumshardbits test + int intValue = rand.nextInt(); + opt.setTableCacheNumshardbits(intValue); + assert(opt.tableCacheNumshardbits() == intValue); + } + + { // TableCacheRemoveScanCountLimit test + int intValue = rand.nextInt(); + opt.setTableCacheRemoveScanCountLimit(intValue); + assert(opt.tableCacheRemoveScanCountLimit() == intValue); + } + + { // WalTtlSeconds test + long longValue = rand.nextLong(); + opt.setWalTtlSeconds(longValue); + assert(opt.walTtlSeconds() == longValue); + } + + { // ManifestPreallocationSize test + long longValue = rand.nextLong(); + opt.setManifestPreallocationSize(longValue); + assert(opt.manifestPreallocationSize() == longValue); + } + + { // AllowOsBuffer test + boolean boolValue = rand.nextBoolean(); + opt.setAllowOsBuffer(boolValue); + 
assert(opt.allowOsBuffer() == boolValue); + } + + { // AllowMmapReads test + boolean boolValue = rand.nextBoolean(); + opt.setAllowMmapReads(boolValue); + assert(opt.allowMmapReads() == boolValue); + } + + { // AllowMmapWrites test + boolean boolValue = rand.nextBoolean(); + opt.setAllowMmapWrites(boolValue); + assert(opt.allowMmapWrites() == boolValue); + } + + { // IsFdCloseOnExec test + boolean boolValue = rand.nextBoolean(); + opt.setIsFdCloseOnExec(boolValue); + assert(opt.isFdCloseOnExec() == boolValue); + } + + { // SkipLogErrorOnRecovery test + boolean boolValue = rand.nextBoolean(); + opt.setSkipLogErrorOnRecovery(boolValue); + assert(opt.skipLogErrorOnRecovery() == boolValue); + } + + { // StatsDumpPeriodSec test + int intValue = rand.nextInt(); + opt.setStatsDumpPeriodSec(intValue); + assert(opt.statsDumpPeriodSec() == intValue); + } + + { // AdviseRandomOnOpen test + boolean boolValue = rand.nextBoolean(); + opt.setAdviseRandomOnOpen(boolValue); + assert(opt.adviseRandomOnOpen() == boolValue); + } + + { // UseAdaptiveMutex test + boolean boolValue = rand.nextBoolean(); + opt.setUseAdaptiveMutex(boolValue); + assert(opt.useAdaptiveMutex() == boolValue); + } + + { // BytesPerSync test + long longValue = rand.nextLong(); + opt.setBytesPerSync(longValue); + assert(opt.bytesPerSync() == longValue); + } + + { // AllowThreadLocal test + boolean boolValue = rand.nextBoolean(); + opt.setAllowThreadLocal(boolValue); + assert(opt.allowThreadLocal() == boolValue); + } + + { // WriteBufferSize test + long longValue = rand.nextLong(); + opt.setWriteBufferSize(longValue); + assert(opt.writeBufferSize() == longValue); + } + + { // MaxWriteBufferNumber test + int intValue = rand.nextInt(); + opt.setMaxWriteBufferNumber(intValue); + assert(opt.maxWriteBufferNumber() == intValue); + } + + { // MinWriteBufferNumberToMerge test + int intValue = rand.nextInt(); + opt.setMinWriteBufferNumberToMerge(intValue); + assert(opt.minWriteBufferNumberToMerge() == intValue); + } + + { // BlockSize test + long longValue = rand.nextLong(); + opt.setBlockSize(longValue); + assert(opt.blockSize() == longValue); + } + + { // BlockRestartInterval test + int intValue = rand.nextInt(); + opt.setBlockRestartInterval(intValue); + assert(opt.blockRestartInterval() == intValue); + } + + { // WholeKeyFiltering test + boolean boolValue = rand.nextBoolean(); + opt.setWholeKeyFiltering(boolValue); + assert(opt.wholeKeyFiltering() == boolValue); + } + + { // NumLevels test + int intValue = rand.nextInt(); + opt.setNumLevels(intValue); + assert(opt.numLevels() == intValue); + } + + { // LevelFileNumCompactionTrigger test + int intValue = rand.nextInt(); + opt.setLevelZeroFileNumCompactionTrigger(intValue); + assert(opt.levelZeroFileNumCompactionTrigger() == intValue); + } + + { // LevelSlowdownWritesTrigger test + int intValue = rand.nextInt(); + opt.setLevelZeroSlowdownWritesTrigger(intValue); + assert(opt.levelZeroSlowdownWritesTrigger() == intValue); + } + + { // LevelStopWritesTrigger test + int intValue = rand.nextInt(); + opt.setLevelZeroStopWritesTrigger(intValue); + assert(opt.levelZeroStopWritesTrigger() == intValue); + } + + { // MaxMemCompactionLevel test + int intValue = rand.nextInt(); + opt.setMaxMemCompactionLevel(intValue); + assert(opt.maxMemCompactionLevel() == intValue); + } + + { // TargetFileSizeBase test + int intValue = rand.nextInt(); + opt.setTargetFileSizeBase(intValue); + assert(opt.targetFileSizeBase() == intValue); + } + + { // TargetFileSizeMultiplier test + int intValue = rand.nextInt(); + 
opt.setTargetFileSizeMultiplier(intValue); + assert(opt.targetFileSizeMultiplier() == intValue); + } + + { // MaxBytesForLevelBase test + long longValue = rand.nextLong(); + opt.setMaxBytesForLevelBase(longValue); + assert(opt.maxBytesForLevelBase() == longValue); + } + + { // MaxBytesForLevelMultiplier test + int intValue = rand.nextInt(); + opt.setMaxBytesForLevelMultiplier(intValue); + assert(opt.maxBytesForLevelMultiplier() == intValue); + } + + { // ExpandedCompactionFactor test + int intValue = rand.nextInt(); + opt.setExpandedCompactionFactor(intValue); + assert(opt.expandedCompactionFactor() == intValue); + } + + { // SourceCompactionFactor test + int intValue = rand.nextInt(); + opt.setSourceCompactionFactor(intValue); + assert(opt.sourceCompactionFactor() == intValue); + } + + { // MaxGrandparentOverlapFactor test + int intValue = rand.nextInt(); + opt.setMaxGrandparentOverlapFactor(intValue); + assert(opt.maxGrandparentOverlapFactor() == intValue); + } + + { // DisableSeekCompaction test + boolean boolValue = rand.nextBoolean(); + opt.setDisableSeekCompaction(boolValue); + assert(opt.disableSeekCompaction() == boolValue); + } + + { // SoftRateLimit test + double doubleValue = rand.nextDouble(); + opt.setSoftRateLimit(doubleValue); + assert(opt.softRateLimit() == doubleValue); + } + + { // HardRateLimit test + double doubleValue = rand.nextDouble(); + opt.setHardRateLimit(doubleValue); + assert(opt.hardRateLimit() == doubleValue); + } + + { // RateLimitDelayMaxMilliseconds test + int intValue = rand.nextInt(); + opt.setRateLimitDelayMaxMilliseconds(intValue); + assert(opt.rateLimitDelayMaxMilliseconds() == intValue); + } + + { // NoBlockCache test + boolean boolValue = rand.nextBoolean(); + opt.setNoBlockCache(boolValue); + assert(opt.noBlockCache() == boolValue); + } + + { // ArenaBlockSize test + long longValue = rand.nextLong(); + opt.setArenaBlockSize(longValue); + assert(opt.arenaBlockSize() == longValue); + } + + { // DisableAutoCompactions test + boolean boolValue = rand.nextBoolean(); + opt.setDisableAutoCompactions(boolValue); + assert(opt.disableAutoCompactions() == boolValue); + } + + { // PurgeRedundantKvsWhileFlush test + boolean boolValue = rand.nextBoolean(); + opt.setPurgeRedundantKvsWhileFlush(boolValue); + assert(opt.purgeRedundantKvsWhileFlush() == boolValue); + } + + { // BlockSizeDeviation test + int intValue = rand.nextInt(); + opt.setBlockSizeDeviation(intValue); + assert(opt.blockSizeDeviation() == intValue); + } + + { // VerifyChecksumsInCompaction test + boolean boolValue = rand.nextBoolean(); + opt.setVerifyChecksumsInCompaction(boolValue); + assert(opt.verifyChecksumsInCompaction() == boolValue); + } + + { // FilterDeletes test + boolean boolValue = rand.nextBoolean(); + opt.setFilterDeletes(boolValue); + assert(opt.filterDeletes() == boolValue); + } + + { // MaxSequentialSkipInIterations test + long longValue = rand.nextLong(); + opt.setMaxSequentialSkipInIterations(longValue); + assert(opt.maxSequentialSkipInIterations() == longValue); + } + + { // InplaceUpdateSupport test + boolean boolValue = rand.nextBoolean(); + opt.setInplaceUpdateSupport(boolValue); + assert(opt.inplaceUpdateSupport() == boolValue); + } + + { // InplaceUpdateNumLocks test + long longValue = rand.nextLong(); + opt.setInplaceUpdateNumLocks(longValue); + assert(opt.inplaceUpdateNumLocks() == longValue); + } + + { // MemtablePrefixBloomBits test + int intValue = rand.nextInt(); + opt.setMemtablePrefixBloomBits(intValue); + assert(opt.memtablePrefixBloomBits() == intValue); + } + + 
{ // MemtablePrefixBloomProbes test + int intValue = rand.nextInt(); + opt.setMemtablePrefixBloomProbes(intValue); + assert(opt.memtablePrefixBloomProbes() == intValue); + } + + { // BloomLocality test + int intValue = rand.nextInt(); + opt.setBloomLocality(intValue); + assert(opt.bloomLocality() == intValue); + } + + { // MaxSuccessiveMerges test + long longValue = rand.nextLong(); + opt.setMaxSuccessiveMerges(longValue); + assert(opt.maxSuccessiveMerges() == longValue); + } + + { // MinPartialMergeOperands test + int intValue = rand.nextInt(); + opt.setMinPartialMergeOperands(intValue); + assert(opt.minPartialMergeOperands() == intValue); + } + + opt.dispose(); + System.out.println("Passed OptionsTest"); + } +} diff --git a/java/org/rocksdb/test/ReadOptionsTest.java b/java/org/rocksdb/test/ReadOptionsTest.java new file mode 100644 index 0000000000..b3b5b2690c --- /dev/null +++ b/java/org/rocksdb/test/ReadOptionsTest.java @@ -0,0 +1,40 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb.test; + +import java.util.Random; +import org.rocksdb.RocksDB; +import org.rocksdb.ReadOptions; + +public class ReadOptionsTest { + static { + RocksDB.loadLibrary(); + } + public static void main(String[] args) { + ReadOptions opt = new ReadOptions(); + Random rand = new Random(); + { // VerifyChecksums test + boolean boolValue = rand.nextBoolean(); + opt.setVerifyChecksums(boolValue); + assert(opt.verifyChecksums() == boolValue); + } + + { // FillCache test + boolean boolValue = rand.nextBoolean(); + opt.setFillCache(boolValue); + assert(opt.fillCache() == boolValue); + } + + { // Tailing test + boolean boolValue = rand.nextBoolean(); + opt.setTailing(boolValue); + assert(opt.tailing() == boolValue); + } + + opt.dispose(); + System.out.println("Passed ReadOptionsTest"); + } +} diff --git a/java/org/rocksdb/util/Environment.java b/java/org/rocksdb/util/Environment.java new file mode 100644 index 0000000000..c2e3bc088e --- /dev/null +++ b/java/org/rocksdb/util/Environment.java @@ -0,0 +1,37 @@ +package org.rocksdb.util; + +public class Environment { + private static String OS = System.getProperty("os.name").toLowerCase(); + + public static boolean isWindows() { + return (OS.indexOf("win") >= 0); + } + + public static boolean isMac() { + return (OS.indexOf("mac") >= 0); + } + + public static boolean isUnix() { + return (OS.indexOf("nix") >= 0 || + OS.indexOf("nux") >= 0 || + OS.indexOf("aix") >= 0); + } + + public static String getSharedLibraryName(String name) { + if (isUnix()) { + return String.format("lib%s.so", name); + } else if (isMac()) { + return String.format("lib%s.dylib", name); + } + throw new UnsupportedOperationException(); + } + + public static String getJniLibraryName(String name) { + if (isUnix()) { + return String.format("lib%s.so", name); + } else if (isMac()) { + return String.format("lib%s.jnilib", name); + } + throw new UnsupportedOperationException(); + } +} diff --git a/java/org/rocksdb/util/SizeUnit.java b/java/org/rocksdb/util/SizeUnit.java new file mode 100644 index 0000000000..8d50cd10e6 --- /dev/null +++ b/java/org/rocksdb/util/SizeUnit.java @@ -0,0 +1,16 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. 
All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb.util;
+
+public class SizeUnit {
+  public static final long KB = 1024L;
+  public static final long MB = KB * KB;
+  public static final long GB = KB * MB;
+  public static final long TB = KB * GB;
+  public static final long PB = KB * TB;
+
+  private SizeUnit() {}
+}
diff --git a/java/rocksjni/backupablejni.cc b/java/rocksjni/backupablejni.cc
new file mode 100644
index 0000000000..956912ef16
--- /dev/null
+++ b/java/rocksjni/backupablejni.cc
@@ -0,0 +1,97 @@
+// Copyright (c) 2014, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// This file implements the "bridge" between Java and C++ and enables
+// calling c++ rocksdb::DB methods from Java side.
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <jni.h>
+#include <string>
+
+#include "include/org_rocksdb_BackupableDB.h"
+#include "include/org_rocksdb_BackupableDBOptions.h"
+#include "rocksjni/portal.h"
+#include "utilities/backupable_db.h"
+
+/*
+ * Class: org_rocksdb_BackupableDB
+ * Method: open
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_BackupableDB_open(
+    JNIEnv* env, jobject jbdb, jlong jdb_handle, jlong jopt_handle) {
+  auto db = reinterpret_cast<rocksdb::DB*>(jdb_handle);
+  auto opt = reinterpret_cast<rocksdb::BackupableDBOptions*>(jopt_handle);
+  auto bdb = new rocksdb::BackupableDB(db, *opt);
+
+  // as BackupableDB extends RocksDB on the java side, we can reuse
+  // the RocksDB portal here.
+  rocksdb::RocksDBJni::setHandle(env, jbdb, bdb);
+}
+
+/*
+ * Class: org_rocksdb_BackupableDB
+ * Method: createNewBackup
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_BackupableDB_createNewBackup(
+    JNIEnv* env, jobject jbdb, jlong jhandle, jboolean jflag) {
+  reinterpret_cast<rocksdb::BackupableDB*>(jhandle)->CreateNewBackup(jflag);
+}
+
+///////////////////////////////////////////////////////////////////////////
+// BackupDBOptions
+
+/*
+ * Class: org_rocksdb_BackupableDBOptions
+ * Method: newBackupableDBOptions
+ * Signature: (Ljava/lang/String;)V
+ */
+void Java_org_rocksdb_BackupableDBOptions_newBackupableDBOptions(
+    JNIEnv* env, jobject jobj, jstring jpath) {
+  const char* cpath = env->GetStringUTFChars(jpath, 0);
+  auto bopt = new rocksdb::BackupableDBOptions(cpath);
+  env->ReleaseStringUTFChars(jpath, cpath);
+
+  rocksdb::BackupableDBOptionsJni::setHandle(env, jobj, bopt);
+}
+
+/*
+ * Class: org_rocksdb_BackupableDBOptions
+ * Method: backupDir
+ * Signature: (J)Ljava/lang/String;
+ */
+jstring Java_org_rocksdb_BackupableDBOptions_backupDir(
+    JNIEnv* env, jobject jopt, jlong jhandle, jstring jpath) {
+  auto bopt = reinterpret_cast<rocksdb::BackupableDBOptions*>(jhandle);
+  return env->NewStringUTF(bopt->backup_dir.c_str());
+}
+
+/*
+ * Class: org_rocksdb_BackupableDBOptions
+ * Method: disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_BackupableDBOptions_disposeInternal(
+    JNIEnv* env, jobject jopt, jlong jhandle) {
+  auto bopt = reinterpret_cast<rocksdb::BackupableDBOptions*>(jhandle);
+  assert(bopt);
+  delete bopt;
+
+  rocksdb::BackupableDBOptionsJni::setHandle(env, jopt, nullptr);
+}
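+
+// Every bridge in this file follows the same pattern: the Java object
+// stores its native pointer in a long field, and each native method
+// round-trips that handle through jlong. A minimal sketch of the two
+// directions (illustrative only, not a helper this file actually defines):
+//
+//   jlong toHandle(rocksdb::BackupableDB* bdb) {
+//     return reinterpret_cast<jlong>(bdb);
+//   }
+//   rocksdb::BackupableDB* fromHandle(jlong handle) {
+//     return reinterpret_cast<rocksdb::BackupableDB*>(handle);
+//   }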
diff --git a/java/rocksjni/filter.cc b/java/rocksjni/filter.cc
new file mode 100644
index 0000000000..572b4a66d4
--- /dev/null
+++ b/java/rocksjni/filter.cc
@@ -0,0 +1,49 @@
+// Copyright (c) 2014, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// This file implements the "bridge" between Java and C++ for
+// rocksdb::FilterPolicy.
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <jni.h>
+#include <string>
+
+#include "include/org_rocksdb_Filter.h"
+#include "include/org_rocksdb_BloomFilter.h"
+#include "rocksjni/portal.h"
+#include "rocksdb/filter_policy.h"
+
+/*
+ * Class: org_rocksdb_BloomFilter
+ * Method: createNewFilter0
+ * Signature: (I)V
+ */
+void Java_org_rocksdb_BloomFilter_createNewFilter0(
+    JNIEnv* env, jobject jobj, jint bits_per_key) {
+  const rocksdb::FilterPolicy* fp = rocksdb::NewBloomFilterPolicy(bits_per_key);
+  rocksdb::FilterJni::setHandle(env, jobj, fp);
+}
+
+/*
+ * Class: org_rocksdb_Filter
+ * Method: disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_Filter_disposeInternal(
+    JNIEnv* env, jobject jobj, jlong handle) {
+  delete reinterpret_cast<const rocksdb::FilterPolicy*>(handle);
+}
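+
+// Ownership note for the Java side, mirrored from the DbBenchmark comment
+// in this commit: once a BloomFilter is handed to Options.setFilter(), the
+// Options object keeps a reference to it and Options.dispose() releases the
+// native FilterPolicy, so callers should not call BloomFilter.dispose()
+// themselves. A short Java-side sketch:
+//
+//   Options options = new Options();
+//   options.setFilter(new BloomFilter(10));  // 10 bloom bits per key
+//   // ... use the options to open and run a database ...
+//   options.dispose();  // also frees the BloomFilter's native object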
diff --git a/java/rocksjni/iterator.cc b/java/rocksjni/iterator.cc
new file mode 100644
index 0000000000..84b0b31332
--- /dev/null
+++ b/java/rocksjni/iterator.cc
@@ -0,0 +1,156 @@
+// Copyright (c) 2014, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// This file implements the "bridge" between Java and C++ and enables
+// calling c++ rocksdb::Iterator methods from Java side.
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <jni.h>
+
+#include "include/org_rocksdb_RocksIterator.h"
+#include "rocksjni/portal.h"
+#include "rocksdb/iterator.h"
+
+/*
+ * Class: org_rocksdb_RocksIterator
+ * Method: isValid0
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_RocksIterator_isValid0(
+    JNIEnv* env, jobject jobj, jlong handle) {
+  return reinterpret_cast<rocksdb::Iterator*>(handle)->Valid();
+}
+
+/*
+ * Class: org_rocksdb_RocksIterator
+ * Method: seekToFirst0
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_RocksIterator_seekToFirst0(
+    JNIEnv* env, jobject jobj, jlong handle) {
+  reinterpret_cast<rocksdb::Iterator*>(handle)->SeekToFirst();
+}
+
+/*
+ * Class: org_rocksdb_RocksIterator
+ * Method: seekToLast0
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_RocksIterator_seekToLast0(
+    JNIEnv* env, jobject jobj, jlong handle) {
+  reinterpret_cast<rocksdb::Iterator*>(handle)->SeekToLast();
+}
+
+/*
+ * Class: org_rocksdb_RocksIterator
+ * Method: next0
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_RocksIterator_next0(
+    JNIEnv* env, jobject jobj, jlong handle) {
+  reinterpret_cast<rocksdb::Iterator*>(handle)->Next();
+}
+
+/*
+ * Class: org_rocksdb_RocksIterator
+ * Method: prev0
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_RocksIterator_prev0(
+    JNIEnv* env, jobject jobj, jlong handle) {
+  reinterpret_cast<rocksdb::Iterator*>(handle)->Prev();
+}
+
+/*
+ * Class: org_rocksdb_RocksIterator
+ * Method: key0
+ * Signature: (J)[B
+ */
+jbyteArray Java_org_rocksdb_RocksIterator_key0(
+    JNIEnv* env, jobject jobj, jlong handle) {
+  auto it = reinterpret_cast<rocksdb::Iterator*>(handle);
+  rocksdb::Slice key_slice = it->key();
+
+  jbyteArray jkey = env->NewByteArray(key_slice.size());
+  env->SetByteArrayRegion(
+      jkey, 0, key_slice.size(),
+      reinterpret_cast<const jbyte*>(key_slice.data()));
+  return jkey;
+}
+
+/*
+ * Class: org_rocksdb_RocksIterator
+ * Method: value0
+ * Signature: (J)[B
+ */
+jbyteArray Java_org_rocksdb_RocksIterator_value0(
+    JNIEnv* env, jobject jobj, jlong handle) {
+  auto it = reinterpret_cast<rocksdb::Iterator*>(handle);
+  rocksdb::Slice value_slice = it->value();
+
+  jbyteArray jvalue = env->NewByteArray(value_slice.size());
+  env->SetByteArrayRegion(
+      jvalue, 0, value_slice.size(),
+      reinterpret_cast<const jbyte*>(value_slice.data()));
+  return jvalue;
+}
+
+/*
+ * Class: org_rocksdb_RocksIterator
+ * Method: seek0
+ * Signature: (J[BI)V
+ */
+void Java_org_rocksdb_RocksIterator_seek0(
+    JNIEnv* env, jobject jobj, jlong handle,
+    jbyteArray jtarget, jint jtarget_len) {
+  auto it = reinterpret_cast<rocksdb::Iterator*>(handle);
+  jbyte* target = env->GetByteArrayElements(jtarget, 0);
+  rocksdb::Slice target_slice(
+      reinterpret_cast<char*>(target), jtarget_len);
+
+  it->Seek(target_slice);
+
+  env->ReleaseByteArrayElements(jtarget, target, JNI_ABORT);
+}
+
+/*
+ * Class: org_rocksdb_RocksIterator
+ * Method: status0
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_RocksIterator_status0(
+    JNIEnv* env, jobject jobj, jlong handle) {
+  auto it = reinterpret_cast<rocksdb::Iterator*>(handle);
+  rocksdb::Status s = it->status();
+
+  if (s.ok()) {
+    return;
+  }
+
+  rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
+}
+
+/*
+ * Class: org_rocksdb_RocksIterator
+ * Method: disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_RocksIterator_disposeInternal(
+    JNIEnv* env, jobject jobj, jlong handle) {
+  auto it = reinterpret_cast<rocksdb::Iterator*>(handle);
+  delete it;
+}
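+
+// A Java-side usage sketch for the methods bridged above, assuming the
+// public wrappers (seekToFirst(), isValid(), next(), key(), value(),
+// dispose()) declared by org.rocksdb.RocksIterator in this commit:
+//
+//   RocksIterator it = db.newIterator();
+//   for (it.seekToFirst(); it.isValid(); it.next()) {
+//     byte[] key = it.key();
+//     byte[] value = it.value();
+//   }
+//   it.dispose();  // releases the native iterator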
+ +#include "include/org_rocksdb_HashSkipListMemTableConfig.h" +#include "include/org_rocksdb_HashLinkedListMemTableConfig.h" +#include "include/org_rocksdb_VectorMemTableConfig.h" +#include "include/org_rocksdb_SkipListMemTableConfig.h" +#include "rocksdb/memtablerep.h" + +/* + * Class: org_rocksdb_HashSkipListMemTableConfig + * Method: newMemTableFactoryHandle + * Signature: (JII)J + */ +jlong Java_org_rocksdb_HashSkipListMemTableConfig_newMemTableFactoryHandle( + JNIEnv* env, jobject jobj, jlong jbucket_count, + jint jheight, jint jbranching_factor) { + return reinterpret_cast(rocksdb::NewHashSkipListRepFactory( + static_cast(jbucket_count), + static_cast(jheight), + static_cast(jbranching_factor))); +} + +/* + * Class: org_rocksdb_HashLinkedListMemTableConfig + * Method: newMemTableFactoryHandle + * Signature: (J)J + */ +jlong Java_org_rocksdb_HashLinkedListMemTableConfig_newMemTableFactoryHandle( + JNIEnv* env, jobject jobj, jlong jbucket_count) { + return reinterpret_cast(rocksdb::NewHashLinkListRepFactory( + static_cast(jbucket_count))); +} + +/* + * Class: org_rocksdb_VectorMemTableConfig + * Method: newMemTableFactoryHandle + * Signature: (J)J + */ +jlong Java_org_rocksdb_VectorMemTableConfig_newMemTableFactoryHandle( + JNIEnv* env, jobject jobj, jlong jreserved_size) { + return reinterpret_cast(new rocksdb::VectorRepFactory( + static_cast(jreserved_size))); +} + +/* + * Class: org_rocksdb_SkipListMemTableConfig + * Method: newMemTableFactoryHandle0 + * Signature: ()J + */ +jlong Java_org_rocksdb_SkipListMemTableConfig_newMemTableFactoryHandle0( + JNIEnv* env, jobject jobj) { + return reinterpret_cast(new rocksdb::SkipListFactory()); +} diff --git a/java/rocksjni/options.cc b/java/rocksjni/options.cc new file mode 100644 index 0000000000..003d353e61 --- /dev/null +++ b/java/rocksjni/options.cc @@ -0,0 +1,1805 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// This file implements the "bridge" between Java and C++ for rocksdb::Options. 
diff --git a/java/rocksjni/options.cc b/java/rocksjni/options.cc
new file mode 100644
index 0000000000..003d353e61
--- /dev/null
+++ b/java/rocksjni/options.cc
@@ -0,0 +1,1814 @@
+// Copyright (c) 2014, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// This file implements the "bridge" between Java and C++ for rocksdb::Options.
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <jni.h>
+#include <string>
+#include <memory>
+
+#include "include/org_rocksdb_Options.h"
+#include "include/org_rocksdb_WriteOptions.h"
+#include "include/org_rocksdb_ReadOptions.h"
+#include "rocksjni/portal.h"
+#include "rocksdb/db.h"
+#include "rocksdb/options.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/table.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/filter_policy.h"
+
+/*
+ * Class: org_rocksdb_Options
+ * Method: newOptions
+ * Signature: ()V
+ */
+void Java_org_rocksdb_Options_newOptions(JNIEnv* env, jobject jobj) {
+  rocksdb::Options* op = new rocksdb::Options();
+  rocksdb::OptionsJni::setHandle(env, jobj, op);
+}
+
+/*
+ * Class: org_rocksdb_Options
+ * Method: disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_Options_disposeInternal(
+    JNIEnv* env, jobject jobj, jlong handle) {
+  delete reinterpret_cast<rocksdb::Options*>(handle);
+}
+
+/*
+ * Class: org_rocksdb_Options
+ * Method: setCreateIfMissing
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_Options_setCreateIfMissing(
+    JNIEnv* env, jobject jobj, jlong jhandle, jboolean flag) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->create_if_missing = flag;
+}
+
+/*
+ * Class: org_rocksdb_Options
+ * Method: createIfMissing
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_Options_createIfMissing(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(jhandle)->create_if_missing;
+}
+
+/*
+ * Class: org_rocksdb_Options
+ * Method: setWriteBufferSize
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setWriteBufferSize(
+    JNIEnv* env, jobject jobj, jlong jhandle, jlong jwrite_buffer_size) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->write_buffer_size =
+      static_cast<size_t>(jwrite_buffer_size);
+}
+
+
+/*
+ * Class: org_rocksdb_Options
+ * Method: writeBufferSize
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Options_writeBufferSize(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(jhandle)->write_buffer_size;
+}
+
+/*
+ * Class: org_rocksdb_Options
+ * Method: setMaxWriteBufferNumber
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_setMaxWriteBufferNumber(
+    JNIEnv* env, jobject jobj, jlong jhandle, jint jmax_write_buffer_number) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->max_write_buffer_number =
+      jmax_write_buffer_number;
+}
+
+/*
+ * Class: org_rocksdb_Options
+ * Method: createStatistics
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_Options_createStatistics(
+    JNIEnv* env, jobject jobj, jlong jOptHandle) {
+  reinterpret_cast<rocksdb::Options*>(jOptHandle)->statistics =
+      rocksdb::CreateDBStatistics();
+}
+
+/*
+ * Class: org_rocksdb_Options
+ * Method: statisticsPtr
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Options_statisticsPtr(
+    JNIEnv* env, jobject jobj, jlong jOptHandle) {
+  auto st = reinterpret_cast<rocksdb::Options*>(jOptHandle)->statistics.get();
+  return reinterpret_cast<jlong>(st);
+}
+
+/*
+ * Class: org_rocksdb_Options
+ * Method: setFilterHandle
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setFilterHandle(
+    JNIEnv* env, jobject jobj, jlong jopt_handle, jlong jfilter_handle) {
+  reinterpret_cast<rocksdb::Options*>(jopt_handle)->filter_policy =
+      reinterpret_cast<rocksdb::FilterPolicy*>(jfilter_handle);
+}
+
+/*
+ * Class: org_rocksdb_Options
+ * Method: maxWriteBufferNumber
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_Options_maxWriteBufferNumber(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(jhandle)->max_write_buffer_number;
+}
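+
+// The getters and setters below all follow the bridging shape shown above,
+// so each pair can be verified from Java with a simple round-trip, which is
+// exactly what OptionsTest in this commit does:
+//
+//   Options opt = new Options();
+//   opt.setMaxWriteBufferNumber(4);
+//   assert(opt.maxWriteBufferNumber() == 4);
+//   opt.dispose();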
jobject jobj, jlong jhandle, jlong jblock_size) { + reinterpret_cast(jhandle)->block_size = + static_cast(jblock_size); +} + +/* + * Class: org_rocksdb_Options + * Method: blockSize + * Signature: (J)J + */ +jlong Java_org_rocksdb_Options_blockSize( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)->block_size; +} + +/* + * Class: org_rocksdb_Options + * Method: setDisableSeekCompaction + * Signature: (JZ)V + */ +void Java_org_rocksdb_Options_setDisableSeekCompaction( + JNIEnv* env, jobject jobj, jlong jhandle, + jboolean jdisable_seek_compaction) { + reinterpret_cast(jhandle)->disable_seek_compaction = + jdisable_seek_compaction; +} + +/* + * Class: org_rocksdb_Options + * Method: disableSeekCompaction + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_Options_disableSeekCompaction( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)->disable_seek_compaction; +} + +/* + * Class: org_rocksdb_Options + * Method: errorIfExists + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_Options_errorIfExists( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)->error_if_exists; +} + +/* + * Class: org_rocksdb_Options + * Method: setErrorIfExists + * Signature: (JZ)V + */ +void Java_org_rocksdb_Options_setErrorIfExists( + JNIEnv* env, jobject jobj, jlong jhandle, jboolean error_if_exists) { + reinterpret_cast(jhandle)->error_if_exists = + static_cast(error_if_exists); +} + +/* + * Class: org_rocksdb_Options + * Method: paranoidChecks + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_Options_paranoidChecks( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)->paranoid_checks; +} + +/* + * Class: org_rocksdb_Options + * Method: setParanoidChecks + * Signature: (JZ)V + */ +void Java_org_rocksdb_Options_setParanoidChecks( + JNIEnv* env, jobject jobj, jlong jhandle, jboolean paranoid_checks) { + reinterpret_cast(jhandle)->paranoid_checks = + static_cast(paranoid_checks); +} + +/* + * Class: org_rocksdb_Options + * Method: maxOpenFiles + * Signature: (J)I + */ +jint Java_org_rocksdb_Options_maxOpenFiles( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)->max_open_files; +} + +/* + * Class: org_rocksdb_Options + * Method: setMaxOpenFiles + * Signature: (JI)V + */ +void Java_org_rocksdb_Options_setMaxOpenFiles( + JNIEnv* env, jobject jobj, jlong jhandle, jint max_open_files) { + reinterpret_cast(jhandle)->max_open_files = + static_cast(max_open_files); +} + +/* + * Class: org_rocksdb_Options + * Method: disableDataSync + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_Options_disableDataSync( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)->disableDataSync; +} + +/* + * Class: org_rocksdb_Options + * Method: setDisableDataSync + * Signature: (JZ)V + */ +void Java_org_rocksdb_Options_setDisableDataSync( + JNIEnv* env, jobject jobj, jlong jhandle, jboolean disableDataSync) { + reinterpret_cast(jhandle)->disableDataSync = + static_cast(disableDataSync); +} + +/* + * Class: org_rocksdb_Options + * Method: useFsync + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_Options_useFsync( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)->use_fsync; +} + +/* + * Class: org_rocksdb_Options + * Method: setUseFsync + * Signature: (JZ)V + */ +void Java_org_rocksdb_Options_setUseFsync( + JNIEnv* env, jobject jobj, jlong jhandle, jboolean use_fsync) { + reinterpret_cast(jhandle)->use_fsync = + 
static_cast(use_fsync); +} + +/* + * Class: org_rocksdb_Options + * Method: dbStatsLogInterval + * Signature: (J)I + */ +jint Java_org_rocksdb_Options_dbStatsLogInterval( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)->db_stats_log_interval; +} + +/* + * Class: org_rocksdb_Options + * Method: setDbStatsLogInterval + * Signature: (JI)V + */ +void Java_org_rocksdb_Options_setDbStatsLogInterval( + JNIEnv* env, jobject jobj, jlong jhandle, jint db_stats_log_interval) { + reinterpret_cast(jhandle)->db_stats_log_interval = + static_cast(db_stats_log_interval); +} + +/* + * Class: org_rocksdb_Options + * Method: dbLogDir + * Signature: (J)Ljava/lang/String + */ +jstring Java_org_rocksdb_Options_dbLogDir( + JNIEnv* env, jobject jobj, jlong jhandle) { + return env->NewStringUTF( + reinterpret_cast(jhandle)->db_log_dir.c_str()); +} + +/* + * Class: org_rocksdb_Options + * Method: setDbLogDir + * Signature: (JLjava/lang/String)V + */ +void Java_org_rocksdb_Options_setDbLogDir( + JNIEnv* env, jobject jobj, jlong jhandle, jstring jdb_log_dir) { + const char* log_dir = env->GetStringUTFChars(jdb_log_dir, 0); + reinterpret_cast(jhandle)->db_log_dir.assign(log_dir); + env->ReleaseStringUTFChars(jdb_log_dir, log_dir); +} + +/* + * Class: org_rocksdb_Options + * Method: walDir + * Signature: (J)Ljava/lang/String + */ +jstring Java_org_rocksdb_Options_walDir( + JNIEnv* env, jobject jobj, jlong jhandle) { + return env->NewStringUTF( + reinterpret_cast(jhandle)->wal_dir.c_str()); +} + +/* + * Class: org_rocksdb_Options + * Method: setWalDir + * Signature: (JLjava/lang/String)V + */ +void Java_org_rocksdb_Options_setWalDir( + JNIEnv* env, jobject jobj, jlong jhandle, jstring jwal_dir) { + const char* wal_dir = env->GetStringUTFChars(jwal_dir, 0); + reinterpret_cast(jhandle)->wal_dir.assign(wal_dir); + env->ReleaseStringUTFChars(jwal_dir, wal_dir); +} + +/* + * Class: org_rocksdb_Options + * Method: deleteObsoleteFilesPeriodMicros + * Signature: (J)J + */ +jlong Java_org_rocksdb_Options_deleteObsoleteFilesPeriodMicros( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle) + ->delete_obsolete_files_period_micros; +} + +/* + * Class: org_rocksdb_Options + * Method: setDeleteObsoleteFilesPeriodMicros + * Signature: (JJ)V + */ +void Java_org_rocksdb_Options_setDeleteObsoleteFilesPeriodMicros( + JNIEnv* env, jobject jobj, jlong jhandle, jlong micros) { + reinterpret_cast(jhandle) + ->delete_obsolete_files_period_micros = + static_cast(micros); +} + +/* + * Class: org_rocksdb_Options + * Method: maxBackgroundCompactions + * Signature: (J)I + */ +jint Java_org_rocksdb_Options_maxBackgroundCompactions( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast( + jhandle)->max_background_compactions; +} + +/* + * Class: org_rocksdb_Options + * Method: setMaxBackgroundCompactions + * Signature: (JI)V + */ +void Java_org_rocksdb_Options_setMaxBackgroundCompactions( + JNIEnv* env, jobject jobj, jlong jhandle, jint max) { + reinterpret_cast(jhandle) + ->max_background_compactions = static_cast(max); +} + +/* + * Class: org_rocksdb_Options + * Method: maxBackgroundFlushes + * Signature: (J)I + */ +jint Java_org_rocksdb_Options_maxBackgroundFlushes( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)->max_background_flushes; +} + +/* + * Class: org_rocksdb_Options + * Method: setMaxBackgroundFlushes + * Signature: (JI)V + */ +void Java_org_rocksdb_Options_setMaxBackgroundFlushes( + JNIEnv* env, jobject jobj, jlong jhandle, 
jint max_background_flushes) { + reinterpret_cast(jhandle)->max_background_flushes = + static_cast(max_background_flushes); +} + +/* + * Class: org_rocksdb_Options + * Method: maxLogFileSize + * Signature: (J)J + */ +jlong Java_org_rocksdb_Options_maxLogFileSize( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)->max_log_file_size; +} + +/* + * Class: org_rocksdb_Options + * Method: setMaxLogFileSize + * Signature: (JJ)V + */ +void Java_org_rocksdb_Options_setMaxLogFileSize( + JNIEnv* env, jobject jobj, jlong jhandle, jlong max_log_file_size) { + reinterpret_cast(jhandle)->max_log_file_size = + static_cast(max_log_file_size); +} + +/* + * Class: org_rocksdb_Options + * Method: logFileTimeToRoll + * Signature: (J)J + */ +jlong Java_org_rocksdb_Options_logFileTimeToRoll( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)->log_file_time_to_roll; +} + +/* + * Class: org_rocksdb_Options + * Method: setLogFileTimeToRoll + * Signature: (JJ)V + */ +void Java_org_rocksdb_Options_setLogFileTimeToRoll( + JNIEnv* env, jobject jobj, jlong jhandle, jlong log_file_time_to_roll) { + reinterpret_cast(jhandle)->log_file_time_to_roll = + static_cast(log_file_time_to_roll); +} + +/* + * Class: org_rocksdb_Options + * Method: keepLogFileNum + * Signature: (J)J + */ +jlong Java_org_rocksdb_Options_keepLogFileNum( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)->keep_log_file_num; +} + +/* + * Class: org_rocksdb_Options + * Method: setKeepLogFileNum + * Signature: (JJ)V + */ +void Java_org_rocksdb_Options_setKeepLogFileNum( + JNIEnv* env, jobject jobj, jlong jhandle, jlong keep_log_file_num) { + reinterpret_cast(jhandle)->keep_log_file_num = + static_cast(keep_log_file_num); +} + +/* + * Class: org_rocksdb_Options + * Method: maxManifestFileSize + * Signature: (J)J + */ +jlong Java_org_rocksdb_Options_maxManifestFileSize( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)->max_manifest_file_size; +} + +/* + * Method: memTableFactoryName + * Signature: (J)Ljava/lang/String + */ +jstring Java_org_rocksdb_Options_memTableFactoryName( + JNIEnv* env, jobject jobj, jlong jhandle) { + auto opt = reinterpret_cast(jhandle); + rocksdb::MemTableRepFactory* tf = opt->memtable_factory.get(); + + // Should never be nullptr. 
+  // Default memtable factory is SkipListFactory
+  assert(tf);
+
+  // temporary fix for a historical typo
+  if (strcmp(tf->Name(), "HashLinkListRepFactory") == 0) {
+    return env->NewStringUTF("HashLinkedListRepFactory");
+  }
+
+  return env->NewStringUTF(tf->Name());
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setMaxManifestFileSize
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setMaxManifestFileSize(
+    JNIEnv* env, jobject jobj, jlong jhandle, jlong max_manifest_file_size) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->max_manifest_file_size =
+      static_cast<uint64_t>(max_manifest_file_size);
+}
+
+/*
+ * Method:    setMemTableFactory
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setMemTableFactory(
+    JNIEnv* env, jobject jobj, jlong jhandle, jlong jfactory_handle) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->memtable_factory.reset(
+      reinterpret_cast<rocksdb::MemTableRepFactory*>(jfactory_handle));
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    tableCacheNumshardbits
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_Options_tableCacheNumshardbits(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(jhandle)->table_cache_numshardbits;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setTableCacheNumshardbits
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_setTableCacheNumshardbits(
+    JNIEnv* env, jobject jobj, jlong jhandle, jint table_cache_numshardbits) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->table_cache_numshardbits =
+      static_cast<int>(table_cache_numshardbits);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    tableCacheRemoveScanCountLimit
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_Options_tableCacheRemoveScanCountLimit(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(
+      jhandle)->table_cache_remove_scan_count_limit;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setTableCacheRemoveScanCountLimit
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_setTableCacheRemoveScanCountLimit(
+    JNIEnv* env, jobject jobj, jlong jhandle, jint limit) {
+  reinterpret_cast<rocksdb::Options*>(
+      jhandle)->table_cache_remove_scan_count_limit = static_cast<int>(limit);
+}
+
+/*
+ * Method:    useFixedLengthPrefixExtractor
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_useFixedLengthPrefixExtractor(
+    JNIEnv* env, jobject jobj, jlong jhandle, jint jprefix_length) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->prefix_extractor.reset(
+      rocksdb::NewFixedPrefixTransform(static_cast<size_t>(jprefix_length)));
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    walTtlSeconds
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Options_walTtlSeconds(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(jhandle)->WAL_ttl_seconds;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setWalTtlSeconds
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setWalTtlSeconds(
+    JNIEnv* env, jobject jobj, jlong jhandle, jlong WAL_ttl_seconds) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->WAL_ttl_seconds =
+      static_cast<uint64_t>(WAL_ttl_seconds);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    walSizeLimitMB
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Options_walSizeLimitMB(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(jhandle)->WAL_size_limit_MB;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setWalSizeLimitMB
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setWalSizeLimitMB(
+    JNIEnv* env, jobject jobj, jlong jhandle, jlong WAL_size_limit_MB) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->WAL_size_limit_MB =
+      static_cast<uint64_t>(WAL_size_limit_MB);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:
manifestPreallocationSize + * Signature: (J)J + */ +jlong Java_org_rocksdb_Options_manifestPreallocationSize( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle) + ->manifest_preallocation_size; +} + +/* + * Class: org_rocksdb_Options + * Method: setManifestPreallocationSize + * Signature: (JJ)V + */ +void Java_org_rocksdb_Options_setManifestPreallocationSize( + JNIEnv* env, jobject jobj, jlong jhandle, jlong preallocation_size) { + reinterpret_cast(jhandle)->manifest_preallocation_size = + static_cast(preallocation_size); +} + +/* + * Class: org_rocksdb_Options + * Method: allowOsBuffer + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_Options_allowOsBuffer( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)->allow_os_buffer; +} + +/* + * Class: org_rocksdb_Options + * Method: setAllowOsBuffer + * Signature: (JZ)V + */ +void Java_org_rocksdb_Options_setAllowOsBuffer( + JNIEnv* env, jobject jobj, jlong jhandle, jboolean allow_os_buffer) { + reinterpret_cast(jhandle)->allow_os_buffer = + static_cast(allow_os_buffer); +} + +/* + * Method: setTableFactory + * Signature: (JJ)V + */ +void Java_org_rocksdb_Options_setTableFactory( + JNIEnv* env, jobject jobj, jlong jhandle, jlong jfactory_handle) { + reinterpret_cast(jhandle)->table_factory.reset( + reinterpret_cast(jfactory_handle)); +} + +/* + * Class: org_rocksdb_Options + * Method: allowMmapReads + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_Options_allowMmapReads( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)->allow_mmap_reads; +} + +/* + * Class: org_rocksdb_Options + * Method: setAllowMmapReads + * Signature: (JZ)V + */ +void Java_org_rocksdb_Options_setAllowMmapReads( + JNIEnv* env, jobject jobj, jlong jhandle, jboolean allow_mmap_reads) { + reinterpret_cast(jhandle)->allow_mmap_reads = + static_cast(allow_mmap_reads); +} + +/* + * Class: org_rocksdb_Options + * Method: allowMmapWrites + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_Options_allowMmapWrites( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)->allow_mmap_writes; +} + +/* + * Class: org_rocksdb_Options + * Method: setAllowMmapWrites + * Signature: (JZ)V + */ +void Java_org_rocksdb_Options_setAllowMmapWrites( + JNIEnv* env, jobject jobj, jlong jhandle, jboolean allow_mmap_writes) { + reinterpret_cast(jhandle)->allow_mmap_writes = + static_cast(allow_mmap_writes); +} + +/* + * Class: org_rocksdb_Options + * Method: isFdCloseOnExec + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_Options_isFdCloseOnExec( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)->is_fd_close_on_exec; +} + +/* + * Class: org_rocksdb_Options + * Method: setIsFdCloseOnExec + * Signature: (JZ)V + */ +void Java_org_rocksdb_Options_setIsFdCloseOnExec( + JNIEnv* env, jobject jobj, jlong jhandle, jboolean is_fd_close_on_exec) { + reinterpret_cast(jhandle)->is_fd_close_on_exec = + static_cast(is_fd_close_on_exec); +} + +/* + * Class: org_rocksdb_Options + * Method: skipLogErrorOnRecovery + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_Options_skipLogErrorOnRecovery( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle) + ->skip_log_error_on_recovery; +} + +/* + * Class: org_rocksdb_Options + * Method: setSkipLogErrorOnRecovery + * Signature: (JZ)V + */ +void Java_org_rocksdb_Options_setSkipLogErrorOnRecovery( + JNIEnv* env, jobject jobj, jlong jhandle, jboolean skip) { + 
reinterpret_cast(jhandle)->skip_log_error_on_recovery = + static_cast(skip); +} + +/* + * Class: org_rocksdb_Options + * Method: statsDumpPeriodSec + * Signature: (J)I + */ +jint Java_org_rocksdb_Options_statsDumpPeriodSec( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)->stats_dump_period_sec; +} + +/* + * Class: org_rocksdb_Options + * Method: setStatsDumpPeriodSec + * Signature: (JI)V + */ +void Java_org_rocksdb_Options_setStatsDumpPeriodSec( + JNIEnv* env, jobject jobj, jlong jhandle, jint stats_dump_period_sec) { + reinterpret_cast(jhandle)->stats_dump_period_sec = + static_cast(stats_dump_period_sec); +} + +/* + * Class: org_rocksdb_Options + * Method: adviseRandomOnOpen + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_Options_adviseRandomOnOpen( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)->advise_random_on_open; +} + +/* + * Class: org_rocksdb_Options + * Method: setAdviseRandomOnOpen + * Signature: (JZ)V + */ +void Java_org_rocksdb_Options_setAdviseRandomOnOpen( + JNIEnv* env, jobject jobj, jlong jhandle, jboolean advise_random_on_open) { + reinterpret_cast(jhandle)->advise_random_on_open = + static_cast(advise_random_on_open); +} + +/* + * Class: org_rocksdb_Options + * Method: useAdaptiveMutex + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_Options_useAdaptiveMutex( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)->use_adaptive_mutex; +} + +/* + * Class: org_rocksdb_Options + * Method: setUseAdaptiveMutex + * Signature: (JZ)V + */ +void Java_org_rocksdb_Options_setUseAdaptiveMutex( + JNIEnv* env, jobject jobj, jlong jhandle, jboolean use_adaptive_mutex) { + reinterpret_cast(jhandle)->use_adaptive_mutex = + static_cast(use_adaptive_mutex); +} + +/* + * Class: org_rocksdb_Options + * Method: bytesPerSync + * Signature: (J)J + */ +jlong Java_org_rocksdb_Options_bytesPerSync( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)->bytes_per_sync; +} + +/* + * Class: org_rocksdb_Options + * Method: setBytesPerSync + * Signature: (JJ)V + */ +void Java_org_rocksdb_Options_setBytesPerSync( + JNIEnv* env, jobject jobj, jlong jhandle, jlong bytes_per_sync) { + reinterpret_cast(jhandle)->bytes_per_sync = + static_cast(bytes_per_sync); +} + +/* + * Class: org_rocksdb_Options + * Method: allowThreadLocal + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_Options_allowThreadLocal( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)->allow_thread_local; +} + +/* + * Class: org_rocksdb_Options + * Method: setAllowThreadLocal + * Signature: (JZ)V + */ +void Java_org_rocksdb_Options_setAllowThreadLocal( + JNIEnv* env, jobject jobj, jlong jhandle, jboolean allow_thread_local) { + reinterpret_cast(jhandle)->allow_thread_local = + static_cast(allow_thread_local); +} + +/* + * Method: tableFactoryName + * Signature: (J)Ljava/lang/String + */ +jstring Java_org_rocksdb_Options_tableFactoryName( + JNIEnv* env, jobject jobj, jlong jhandle) { + auto opt = reinterpret_cast(jhandle); + rocksdb::TableFactory* tf = opt->table_factory.get(); + + // Should never be nullptr. 
+  // Default table factory is BlockBasedTableFactory.
+  assert(tf);
+
+  return env->NewStringUTF(tf->Name());
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    minWriteBufferNumberToMerge
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_Options_minWriteBufferNumberToMerge(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(
+      jhandle)->min_write_buffer_number_to_merge;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setMinWriteBufferNumberToMerge
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_setMinWriteBufferNumberToMerge(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jint jmin_write_buffer_number_to_merge) {
+  reinterpret_cast<rocksdb::Options*>(
+      jhandle)->min_write_buffer_number_to_merge =
+          static_cast<int>(jmin_write_buffer_number_to_merge);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    blockRestartInterval
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_Options_blockRestartInterval(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(jhandle)->block_restart_interval;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setBlockRestartInterval
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_setBlockRestartInterval(
+    JNIEnv* env, jobject jobj, jlong jhandle, jint jblock_restart_interval) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->block_restart_interval =
+      static_cast<int>(jblock_restart_interval);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    wholeKeyFiltering
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_Options_wholeKeyFiltering(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(jhandle)->whole_key_filtering;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setWholeKeyFiltering
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_Options_setWholeKeyFiltering(
+    JNIEnv* env, jobject jobj, jlong jhandle, jboolean jwhole_key_filtering) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->whole_key_filtering =
+      static_cast<bool>(jwhole_key_filtering);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    numLevels
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_Options_numLevels(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(jhandle)->num_levels;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setNumLevels
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_setNumLevels(
+    JNIEnv* env, jobject jobj, jlong jhandle, jint jnum_levels) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->num_levels =
+      static_cast<int>(jnum_levels);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    levelZeroFileNumCompactionTrigger
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_Options_levelZeroFileNumCompactionTrigger(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(
+      jhandle)->level0_file_num_compaction_trigger;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setLevelZeroFileNumCompactionTrigger
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_setLevelZeroFileNumCompactionTrigger(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jint jlevel0_file_num_compaction_trigger) {
+  reinterpret_cast<rocksdb::Options*>(
+      jhandle)->level0_file_num_compaction_trigger =
+          static_cast<int>(jlevel0_file_num_compaction_trigger);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    levelZeroSlowdownWritesTrigger
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_Options_levelZeroSlowdownWritesTrigger(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(
+      jhandle)->level0_slowdown_writes_trigger;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setLevelZeroSlowdownWritesTrigger
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_setLevelZeroSlowdownWritesTrigger(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jint jlevel0_slowdown_writes_trigger) {
+  reinterpret_cast<rocksdb::Options*>(
+      jhandle)->level0_slowdown_writes_trigger =
+          static_cast<int>(jlevel0_slowdown_writes_trigger);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    levelZeroStopWritesTrigger
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_Options_levelZeroStopWritesTrigger(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(
+      jhandle)->level0_stop_writes_trigger;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setLevelZeroStopWritesTrigger
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_setLevelZeroStopWritesTrigger(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jint jlevel0_stop_writes_trigger) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->level0_stop_writes_trigger =
+      static_cast<int>(jlevel0_stop_writes_trigger);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    maxMemCompactionLevel
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_Options_maxMemCompactionLevel(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(
+      jhandle)->max_mem_compaction_level;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setMaxMemCompactionLevel
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_setMaxMemCompactionLevel(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jint jmax_mem_compaction_level) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->max_mem_compaction_level =
+      static_cast<int>(jmax_mem_compaction_level);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    targetFileSizeBase
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_Options_targetFileSizeBase(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(jhandle)->target_file_size_base;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setTargetFileSizeBase
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_setTargetFileSizeBase(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jint jtarget_file_size_base) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->target_file_size_base =
+      static_cast<int>(jtarget_file_size_base);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    targetFileSizeMultiplier
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_Options_targetFileSizeMultiplier(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(
+      jhandle)->target_file_size_multiplier;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setTargetFileSizeMultiplier
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_setTargetFileSizeMultiplier(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jint jtarget_file_size_multiplier) {
+  reinterpret_cast<rocksdb::Options*>(
+      jhandle)->target_file_size_multiplier =
+          static_cast<int>(jtarget_file_size_multiplier);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    maxBytesForLevelBase
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Options_maxBytesForLevelBase(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(
+      jhandle)->max_bytes_for_level_base;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setMaxBytesForLevelBase
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setMaxBytesForLevelBase(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jlong jmax_bytes_for_level_base) {
+  reinterpret_cast<rocksdb::Options*>(
+      jhandle)->max_bytes_for_level_base =
+          static_cast<uint64_t>(jmax_bytes_for_level_base);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    maxBytesForLevelMultiplier
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_Options_maxBytesForLevelMultiplier(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(
jhandle)->max_bytes_for_level_multiplier; +} + +/* + * Class: org_rocksdb_Options + * Method: setMaxBytesForLevelMultiplier + * Signature: (JI)V + */ +void Java_org_rocksdb_Options_setMaxBytesForLevelMultiplier( + JNIEnv* env, jobject jobj, jlong jhandle, + jint jmax_bytes_for_level_multiplier) { + reinterpret_cast( + jhandle)->max_bytes_for_level_multiplier = + static_cast(jmax_bytes_for_level_multiplier); +} + +/* + * Class: org_rocksdb_Options + * Method: expandedCompactionFactor + * Signature: (J)I + */ +jint Java_org_rocksdb_Options_expandedCompactionFactor( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast( + jhandle)->expanded_compaction_factor; +} + +/* + * Class: org_rocksdb_Options + * Method: setExpandedCompactionFactor + * Signature: (JI)V + */ +void Java_org_rocksdb_Options_setExpandedCompactionFactor( + JNIEnv* env, jobject jobj, jlong jhandle, + jint jexpanded_compaction_factor) { + reinterpret_cast( + jhandle)->expanded_compaction_factor = + static_cast(jexpanded_compaction_factor); +} + +/* + * Class: org_rocksdb_Options + * Method: sourceCompactionFactor + * Signature: (J)I + */ +jint Java_org_rocksdb_Options_sourceCompactionFactor( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast( + jhandle)->source_compaction_factor; +} + +/* + * Class: org_rocksdb_Options + * Method: setSourceCompactionFactor + * Signature: (JI)V + */ +void Java_org_rocksdb_Options_setSourceCompactionFactor( + JNIEnv* env, jobject jobj, jlong jhandle, + jint jsource_compaction_factor) { + reinterpret_cast( + jhandle)->source_compaction_factor = + static_cast(jsource_compaction_factor); +} + +/* + * Class: org_rocksdb_Options + * Method: maxGrandparentOverlapFactor + * Signature: (J)I + */ +jint Java_org_rocksdb_Options_maxGrandparentOverlapFactor( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast( + jhandle)->max_grandparent_overlap_factor; +} + +/* + * Class: org_rocksdb_Options + * Method: setMaxGrandparentOverlapFactor + * Signature: (JI)V + */ +void Java_org_rocksdb_Options_setMaxGrandparentOverlapFactor( + JNIEnv* env, jobject jobj, jlong jhandle, + jint jmax_grandparent_overlap_factor) { + reinterpret_cast( + jhandle)->max_grandparent_overlap_factor = + static_cast(jmax_grandparent_overlap_factor); +} + +/* + * Class: org_rocksdb_Options + * Method: softRateLimit + * Signature: (J)D + */ +jdouble Java_org_rocksdb_Options_softRateLimit( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)->soft_rate_limit; +} + +/* + * Class: org_rocksdb_Options + * Method: setSoftRateLimit + * Signature: (JD)V + */ +void Java_org_rocksdb_Options_setSoftRateLimit( + JNIEnv* env, jobject jobj, jlong jhandle, jdouble jsoft_rate_limit) { + reinterpret_cast(jhandle)->soft_rate_limit = + static_cast(jsoft_rate_limit); +} + +/* + * Class: org_rocksdb_Options + * Method: hardRateLimit + * Signature: (J)D + */ +jdouble Java_org_rocksdb_Options_hardRateLimit( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)->hard_rate_limit; +} + +/* + * Class: org_rocksdb_Options + * Method: setHardRateLimit + * Signature: (JD)V + */ +void Java_org_rocksdb_Options_setHardRateLimit( + JNIEnv* env, jobject jobj, jlong jhandle, jdouble jhard_rate_limit) { + reinterpret_cast(jhandle)->hard_rate_limit = + static_cast(jhard_rate_limit); +} + +/* + * Class: org_rocksdb_Options + * Method: rateLimitDelayMaxMilliseconds + * Signature: (J)I + */ +jint Java_org_rocksdb_Options_rateLimitDelayMaxMilliseconds( + JNIEnv* env, 
jobject jobj, jlong jhandle) { + return reinterpret_cast( + jhandle)->rate_limit_delay_max_milliseconds; +} + +/* + * Class: org_rocksdb_Options + * Method: setRateLimitDelayMaxMilliseconds + * Signature: (JI)V + */ +void Java_org_rocksdb_Options_setRateLimitDelayMaxMilliseconds( + JNIEnv* env, jobject jobj, jlong jhandle, + jint jrate_limit_delay_max_milliseconds) { + reinterpret_cast( + jhandle)->rate_limit_delay_max_milliseconds = + static_cast(jrate_limit_delay_max_milliseconds); +} + +/* + * Class: org_rocksdb_Options + * Method: noBlockCache + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_Options_noBlockCache( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)->no_block_cache; +} + +/* + * Class: org_rocksdb_Options + * Method: setNoBlockCache + * Signature: (JZ)V + */ +void Java_org_rocksdb_Options_setNoBlockCache( + JNIEnv* env, jobject jobj, jlong jhandle, jboolean jno_block_cache) { + reinterpret_cast(jhandle)->no_block_cache = + static_cast(jno_block_cache); +} + +/* + * Class: org_rocksdb_Options + * Method: arenaBlockSize + * Signature: (J)J + */ +jlong Java_org_rocksdb_Options_arenaBlockSize( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)->arena_block_size; +} + +/* + * Class: org_rocksdb_Options + * Method: setArenaBlockSize + * Signature: (JJ)V + */ +void Java_org_rocksdb_Options_setArenaBlockSize( + JNIEnv* env, jobject jobj, jlong jhandle, jlong jarena_block_size) { + reinterpret_cast(jhandle)->arena_block_size = + static_cast(jarena_block_size); +} + +/* + * Class: org_rocksdb_Options + * Method: disableAutoCompactions + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_Options_disableAutoCompactions( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast( + jhandle)->disable_auto_compactions; +} + +/* + * Class: org_rocksdb_Options + * Method: setDisableAutoCompactions + * Signature: (JZ)V + */ +void Java_org_rocksdb_Options_setDisableAutoCompactions( + JNIEnv* env, jobject jobj, jlong jhandle, + jboolean jdisable_auto_compactions) { + reinterpret_cast( + jhandle)->disable_auto_compactions = + static_cast(jdisable_auto_compactions); +} + +/* + * Class: org_rocksdb_Options + * Method: purgeRedundantKvsWhileFlush + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_Options_purgeRedundantKvsWhileFlush( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast( + jhandle)->purge_redundant_kvs_while_flush; +} + +/* + * Class: org_rocksdb_Options + * Method: setPurgeRedundantKvsWhileFlush + * Signature: (JZ)V + */ +void Java_org_rocksdb_Options_setPurgeRedundantKvsWhileFlush( + JNIEnv* env, jobject jobj, jlong jhandle, + jboolean jpurge_redundant_kvs_while_flush) { + reinterpret_cast( + jhandle)->purge_redundant_kvs_while_flush = + static_cast(jpurge_redundant_kvs_while_flush); +} + +/* + * Class: org_rocksdb_Options + * Method: blockSizeDeviation + * Signature: (J)I + */ +jint Java_org_rocksdb_Options_blockSizeDeviation( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)->block_size_deviation; +} + +/* + * Class: org_rocksdb_Options + * Method: setBlockSizeDeviation + * Signature: (JI)V + */ +void Java_org_rocksdb_Options_setBlockSizeDeviation( + JNIEnv* env, jobject jobj, jlong jhandle, + jint jblock_size_deviation) { + reinterpret_cast(jhandle)->block_size_deviation = + static_cast(jblock_size_deviation); +} + +/* + * Class: org_rocksdb_Options + * Method: verifyChecksumsInCompaction + * Signature: (J)Z + */ +jboolean 
Java_org_rocksdb_Options_verifyChecksumsInCompaction( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast( + jhandle)->verify_checksums_in_compaction; +} + +/* + * Class: org_rocksdb_Options + * Method: setVerifyChecksumsInCompaction + * Signature: (JZ)V + */ +void Java_org_rocksdb_Options_setVerifyChecksumsInCompaction( + JNIEnv* env, jobject jobj, jlong jhandle, + jboolean jverify_checksums_in_compaction) { + reinterpret_cast( + jhandle)->verify_checksums_in_compaction = + static_cast(jverify_checksums_in_compaction); +} + +/* + * Class: org_rocksdb_Options + * Method: filterDeletes + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_Options_filterDeletes( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)->filter_deletes; +} + +/* + * Class: org_rocksdb_Options + * Method: setFilterDeletes + * Signature: (JZ)V + */ +void Java_org_rocksdb_Options_setFilterDeletes( + JNIEnv* env, jobject jobj, jlong jhandle, jboolean jfilter_deletes) { + reinterpret_cast(jhandle)->filter_deletes = + static_cast(jfilter_deletes); +} + +/* + * Class: org_rocksdb_Options + * Method: maxSequentialSkipInIterations + * Signature: (J)J + */ +jlong Java_org_rocksdb_Options_maxSequentialSkipInIterations( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast( + jhandle)->max_sequential_skip_in_iterations; +} + +/* + * Class: org_rocksdb_Options + * Method: setMaxSequentialSkipInIterations + * Signature: (JJ)V + */ +void Java_org_rocksdb_Options_setMaxSequentialSkipInIterations( + JNIEnv* env, jobject jobj, jlong jhandle, + jlong jmax_sequential_skip_in_iterations) { + reinterpret_cast( + jhandle)->max_sequential_skip_in_iterations = + static_cast(jmax_sequential_skip_in_iterations); +} + +/* + * Class: org_rocksdb_Options + * Method: inplaceUpdateSupport + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_Options_inplaceUpdateSupport( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast( + jhandle)->inplace_update_support; +} + +/* + * Class: org_rocksdb_Options + * Method: setInplaceUpdateSupport + * Signature: (JZ)V + */ +void Java_org_rocksdb_Options_setInplaceUpdateSupport( + JNIEnv* env, jobject jobj, jlong jhandle, + jboolean jinplace_update_support) { + reinterpret_cast( + jhandle)->inplace_update_support = + static_cast(jinplace_update_support); +} + +/* + * Class: org_rocksdb_Options + * Method: inplaceUpdateNumLocks + * Signature: (J)J + */ +jlong Java_org_rocksdb_Options_inplaceUpdateNumLocks( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast( + jhandle)->inplace_update_num_locks; +} + +/* + * Class: org_rocksdb_Options + * Method: setInplaceUpdateNumLocks + * Signature: (JJ)V + */ +void Java_org_rocksdb_Options_setInplaceUpdateNumLocks( + JNIEnv* env, jobject jobj, jlong jhandle, + jlong jinplace_update_num_locks) { + reinterpret_cast( + jhandle)->inplace_update_num_locks = + static_cast(jinplace_update_num_locks); +} + +/* + * Class: org_rocksdb_Options + * Method: memtablePrefixBloomBits + * Signature: (J)I + */ +jint Java_org_rocksdb_Options_memtablePrefixBloomBits( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast( + jhandle)->memtable_prefix_bloom_bits; +} + +/* + * Class: org_rocksdb_Options + * Method: setMemtablePrefixBloomBits + * Signature: (JI)V + */ +void Java_org_rocksdb_Options_setMemtablePrefixBloomBits( + JNIEnv* env, jobject jobj, jlong jhandle, + jint jmemtable_prefix_bloom_bits) { + reinterpret_cast( + jhandle)->memtable_prefix_bloom_bits = + 
static_cast(jmemtable_prefix_bloom_bits); +} + +/* + * Class: org_rocksdb_Options + * Method: memtablePrefixBloomProbes + * Signature: (J)I + */ +jint Java_org_rocksdb_Options_memtablePrefixBloomProbes( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast( + jhandle)->memtable_prefix_bloom_probes; +} + +/* + * Class: org_rocksdb_Options + * Method: setMemtablePrefixBloomProbes + * Signature: (JI)V + */ +void Java_org_rocksdb_Options_setMemtablePrefixBloomProbes( + JNIEnv* env, jobject jobj, jlong jhandle, + jint jmemtable_prefix_bloom_probes) { + reinterpret_cast( + jhandle)->memtable_prefix_bloom_probes = + static_cast(jmemtable_prefix_bloom_probes); +} + +/* + * Class: org_rocksdb_Options + * Method: bloomLocality + * Signature: (J)I + */ +jint Java_org_rocksdb_Options_bloomLocality( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)->bloom_locality; +} + +/* + * Class: org_rocksdb_Options + * Method: setBloomLocality + * Signature: (JI)V + */ +void Java_org_rocksdb_Options_setBloomLocality( + JNIEnv* env, jobject jobj, jlong jhandle, jint jbloom_locality) { + reinterpret_cast(jhandle)->bloom_locality = + static_cast(jbloom_locality); +} + +/* + * Class: org_rocksdb_Options + * Method: maxSuccessiveMerges + * Signature: (J)J + */ +jlong Java_org_rocksdb_Options_maxSuccessiveMerges( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)->max_successive_merges; +} + +/* + * Class: org_rocksdb_Options + * Method: setMaxSuccessiveMerges + * Signature: (JJ)V + */ +void Java_org_rocksdb_Options_setMaxSuccessiveMerges( + JNIEnv* env, jobject jobj, jlong jhandle, + jlong jmax_successive_merges) { + reinterpret_cast(jhandle)->max_successive_merges = + static_cast(jmax_successive_merges); +} + +/* + * Class: org_rocksdb_Options + * Method: minPartialMergeOperands + * Signature: (J)I + */ +jint Java_org_rocksdb_Options_minPartialMergeOperands( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast( + jhandle)->min_partial_merge_operands; +} + +/* + * Class: org_rocksdb_Options + * Method: setMinPartialMergeOperands + * Signature: (JI)V + */ +void Java_org_rocksdb_Options_setMinPartialMergeOperands( + JNIEnv* env, jobject jobj, jlong jhandle, + jint jmin_partial_merge_operands) { + reinterpret_cast( + jhandle)->min_partial_merge_operands = + static_cast(jmin_partial_merge_operands); +} + +////////////////////////////////////////////////////////////////////////////// +// WriteOptions + +/* + * Class: org_rocksdb_WriteOptions + * Method: newWriteOptions + * Signature: ()V + */ +void Java_org_rocksdb_WriteOptions_newWriteOptions( + JNIEnv* env, jobject jwrite_options) { + rocksdb::WriteOptions* op = new rocksdb::WriteOptions(); + rocksdb::WriteOptionsJni::setHandle(env, jwrite_options, op); +} + +/* + * Class: org_rocksdb_WriteOptions + * Method: disposeInternal + * Signature: ()V + */ +void Java_org_rocksdb_WriteOptions_disposeInternal( + JNIEnv* env, jobject jwrite_options, jlong jhandle) { + auto write_options = reinterpret_cast(jhandle); + delete write_options; + + rocksdb::WriteOptionsJni::setHandle(env, jwrite_options, nullptr); +} + +/* + * Class: org_rocksdb_WriteOptions + * Method: setSync + * Signature: (JZ)V + */ +void Java_org_rocksdb_WriteOptions_setSync( + JNIEnv* env, jobject jwrite_options, jlong jhandle, jboolean jflag) { + reinterpret_cast(jhandle)->sync = jflag; +} + +/* + * Class: org_rocksdb_WriteOptions + * Method: sync + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_WriteOptions_sync( 
+    JNIEnv* env, jobject jwrite_options, jlong jhandle) {
+  return reinterpret_cast<rocksdb::WriteOptions*>(jhandle)->sync;
+}
+
+/*
+ * Class:     org_rocksdb_WriteOptions
+ * Method:    setDisableWAL
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_WriteOptions_setDisableWAL(
+    JNIEnv* env, jobject jwrite_options, jlong jhandle, jboolean jflag) {
+  reinterpret_cast<rocksdb::WriteOptions*>(jhandle)->disableWAL = jflag;
+}
+
+/*
+ * Class:     org_rocksdb_WriteOptions
+ * Method:    disableWAL
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_WriteOptions_disableWAL(
+    JNIEnv* env, jobject jwrite_options, jlong jhandle) {
+  return reinterpret_cast<rocksdb::WriteOptions*>(jhandle)->disableWAL;
+}
+
+/////////////////////////////////////////////////////////////////////
+// rocksdb::ReadOptions
+
+/*
+ * Class:     org_rocksdb_ReadOptions
+ * Method:    newReadOptions
+ * Signature: ()V
+ */
+void Java_org_rocksdb_ReadOptions_newReadOptions(
+    JNIEnv* env, jobject jobj) {
+  auto read_opt = new rocksdb::ReadOptions();
+  rocksdb::ReadOptionsJni::setHandle(env, jobj, read_opt);
+}
+
+/*
+ * Class:     org_rocksdb_ReadOptions
+ * Method:    disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_ReadOptions_disposeInternal(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  delete reinterpret_cast<rocksdb::ReadOptions*>(jhandle);
+  rocksdb::ReadOptionsJni::setHandle(env, jobj, nullptr);
+}
+
+/*
+ * Class:     org_rocksdb_ReadOptions
+ * Method:    verifyChecksums
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_ReadOptions_verifyChecksums(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::ReadOptions*>(
+      jhandle)->verify_checksums;
+}
+
+/*
+ * Class:     org_rocksdb_ReadOptions
+ * Method:    setVerifyChecksums
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_ReadOptions_setVerifyChecksums(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jboolean jverify_checksums) {
+  reinterpret_cast<rocksdb::ReadOptions*>(jhandle)->verify_checksums =
+      static_cast<bool>(jverify_checksums);
+}
+
+/*
+ * Class:     org_rocksdb_ReadOptions
+ * Method:    fillCache
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_ReadOptions_fillCache(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::ReadOptions*>(jhandle)->fill_cache;
+}
+
+/*
+ * Class:     org_rocksdb_ReadOptions
+ * Method:    setFillCache
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_ReadOptions_setFillCache(
+    JNIEnv* env, jobject jobj, jlong jhandle, jboolean jfill_cache) {
+  reinterpret_cast<rocksdb::ReadOptions*>(jhandle)->fill_cache =
+      static_cast<bool>(jfill_cache);
+}
+
+/*
+ * Class:     org_rocksdb_ReadOptions
+ * Method:    tailing
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_ReadOptions_tailing(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::ReadOptions*>(jhandle)->tailing;
+}
+
+/*
+ * Class:     org_rocksdb_ReadOptions
+ * Method:    setTailing
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_ReadOptions_setTailing(
+    JNIEnv* env, jobject jobj, jlong jhandle, jboolean jtailing) {
+  reinterpret_cast<rocksdb::ReadOptions*>(jhandle)->tailing =
+      static_cast<bool>(jtailing);
+}
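The next file, portal.h, caches JNI class, field, and method IDs in C++ function-local statics so each expensive lookup runs once per process. The standalone sketch below shows the same memoization shape without a JVM; expensive_lookup() is a hypothetical stand-in for env->FindClass()/GetFieldID(), not an API from this patch. (One caveat the original header does not address: a jclass returned by FindClass is a local reference, so production JNI code would normally promote it with NewGlobalRef before caching.)

    #include <cstdio>

    static int expensive_lookup() {   // stand-in for env->FindClass()/GetFieldID()
      std::puts("lookup performed");  // printed exactly once per process
      return 42;
    }

    static int get_cached_id() {
      static int id = expensive_lookup();  // initialized on the first call only
      return id;
    }

    int main() {
      std::printf("%d\n", get_cached_id());  // performs the lookup
      std::printf("%d\n", get_cached_id());  // served from the cached value
      return 0;
    }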
diff --git a/java/rocksjni/portal.h b/java/rocksjni/portal.h
new file mode 100644
index 0000000000..bd37bffcf8
--- /dev/null
+++ b/java/rocksjni/portal.h
@@ -0,0 +1,383 @@
+// Copyright (c) 2014, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+// This file is designed for caching frequently used JNI IDs and providing an
+// efficient portal (i.e., a set of static functions) to access Java code
+// from C++.
+
+#ifndef JAVA_ROCKSJNI_PORTAL_H_
+#define JAVA_ROCKSJNI_PORTAL_H_
+
+#include <jni.h>
+#include "rocksdb/db.h"
+#include "rocksdb/filter_policy.h"
+#include "utilities/backupable_db.h"
+
+namespace rocksdb {
+
+// The portal class for org.rocksdb.RocksDB
+class RocksDBJni {
+ public:
+  // Get the java class id of org.rocksdb.RocksDB.
+  static jclass getJClass(JNIEnv* env) {
+    static jclass jclazz = env->FindClass("org/rocksdb/RocksDB");
+    assert(jclazz != nullptr);
+    return jclazz;
+  }
+
+  // Get the field id of the member variable of org.rocksdb.RocksDB
+  // that stores the pointer to rocksdb::DB.
+  static jfieldID getHandleFieldID(JNIEnv* env) {
+    static jfieldID fid = env->GetFieldID(
+        getJClass(env), "nativeHandle_", "J");
+    assert(fid != nullptr);
+    return fid;
+  }
+
+  // Get the pointer to rocksdb::DB of the specified org.rocksdb.RocksDB.
+  static rocksdb::DB* getHandle(JNIEnv* env, jobject jdb) {
+    return reinterpret_cast<rocksdb::DB*>(
+        env->GetLongField(jdb, getHandleFieldID(env)));
+  }
+
+  // Pass the rocksdb::DB pointer to the java side.
+  static void setHandle(JNIEnv* env, jobject jdb, rocksdb::DB* db) {
+    env->SetLongField(
+        jdb, getHandleFieldID(env),
+        reinterpret_cast<jlong>(db));
+  }
+};
+
+// The portal class for org.rocksdb.RocksDBException
+class RocksDBExceptionJni {
+ public:
+  // Get the jclass of org.rocksdb.RocksDBException
+  static jclass getJClass(JNIEnv* env) {
+    static jclass jclazz = env->FindClass("org/rocksdb/RocksDBException");
+    assert(jclazz != nullptr);
+    return jclazz;
+  }
+
+  // Create and throw a java exception by converting the input
+  // Status to a RocksDBException.
+  //
+  // If s.ok() is true, this function will not throw any exception.
+  static void ThrowNew(JNIEnv* env, Status s) {
+    if (s.ok()) {
+      return;
+    }
+    jstring msg = env->NewStringUTF(s.ToString().c_str());
+    // get the constructor id of org.rocksdb.RocksDBException
+    static jmethodID mid = env->GetMethodID(
+        getJClass(env), "<init>", "(Ljava/lang/String;)V");
+    assert(mid != nullptr);
+
+    env->Throw((jthrowable)env->NewObject(getJClass(env), mid, msg));
+  }
+};
+
+class OptionsJni {
+ public:
+  // Get the java class id of org.rocksdb.Options.
+  static jclass getJClass(JNIEnv* env) {
+    static jclass jclazz = env->FindClass("org/rocksdb/Options");
+    assert(jclazz != nullptr);
+    return jclazz;
+  }
+
+  // Get the field id of the member variable of org.rocksdb.Options
+  // that stores the pointer to rocksdb::Options
+  static jfieldID getHandleFieldID(JNIEnv* env) {
+    static jfieldID fid = env->GetFieldID(
+        getJClass(env), "nativeHandle_", "J");
+    assert(fid != nullptr);
+    return fid;
+  }
+
+  // Get the pointer to rocksdb::Options
+  static rocksdb::Options* getHandle(JNIEnv* env, jobject jobj) {
+    return reinterpret_cast<rocksdb::Options*>(
+        env->GetLongField(jobj, getHandleFieldID(env)));
+  }
+
+  // Pass the rocksdb::Options pointer to the java side.
+  static void setHandle(JNIEnv* env, jobject jobj, rocksdb::Options* op) {
+    env->SetLongField(
+        jobj, getHandleFieldID(env),
+        reinterpret_cast<jlong>(op));
+  }
+};
+
+class WriteOptionsJni {
+ public:
+  // Get the java class id of org.rocksdb.WriteOptions.
+ static jclass getJClass(JNIEnv* env) { + static jclass jclazz = env->FindClass("org/rocksdb/WriteOptions"); + assert(jclazz != nullptr); + return jclazz; + } + + // Get the field id of the member variable of org.rocksdb.WriteOptions + // that stores the pointer to rocksdb::WriteOptions + static jfieldID getHandleFieldID(JNIEnv* env) { + static jfieldID fid = env->GetFieldID( + getJClass(env), "nativeHandle_", "J"); + assert(fid != nullptr); + return fid; + } + + // Get the pointer to rocksdb::WriteOptions + static rocksdb::WriteOptions* getHandle(JNIEnv* env, jobject jobj) { + return reinterpret_cast( + env->GetLongField(jobj, getHandleFieldID(env))); + } + + // Pass the rocksdb::WriteOptions pointer to the java side. + static void setHandle(JNIEnv* env, jobject jobj, rocksdb::WriteOptions* op) { + env->SetLongField( + jobj, getHandleFieldID(env), + reinterpret_cast(op)); + } +}; + + +class ReadOptionsJni { + public: + // Get the java class id of org.rocksdb.ReadOptions. + static jclass getJClass(JNIEnv* env) { + static jclass jclazz = env->FindClass("org/rocksdb/ReadOptions"); + assert(jclazz != nullptr); + return jclazz; + } + + // Get the field id of the member variable of org.rocksdb.ReadOptions + // that stores the pointer to rocksdb::ReadOptions + static jfieldID getHandleFieldID(JNIEnv* env) { + static jfieldID fid = env->GetFieldID( + getJClass(env), "nativeHandle_", "J"); + assert(fid != nullptr); + return fid; + } + + // Get the pointer to rocksdb::ReadOptions + static rocksdb::ReadOptions* getHandle(JNIEnv* env, jobject jobj) { + return reinterpret_cast( + env->GetLongField(jobj, getHandleFieldID(env))); + } + + // Pass the rocksdb::ReadOptions pointer to the java side. + static void setHandle(JNIEnv* env, jobject jobj, + rocksdb::ReadOptions* op) { + env->SetLongField( + jobj, getHandleFieldID(env), + reinterpret_cast(op)); + } +}; + + +class WriteBatchJni { + public: + static jclass getJClass(JNIEnv* env) { + static jclass jclazz = env->FindClass("org/rocksdb/WriteBatch"); + assert(jclazz != nullptr); + return jclazz; + } + + static jfieldID getHandleFieldID(JNIEnv* env) { + static jfieldID fid = env->GetFieldID( + getJClass(env), "nativeHandle_", "J"); + assert(fid != nullptr); + return fid; + } + + // Get the pointer to rocksdb::WriteBatch of the specified + // org.rocksdb.WriteBatch. + static rocksdb::WriteBatch* getHandle(JNIEnv* env, jobject jwb) { + return reinterpret_cast( + env->GetLongField(jwb, getHandleFieldID(env))); + } + + // Pass the rocksdb::WriteBatch pointer to the java side. + static void setHandle(JNIEnv* env, jobject jwb, rocksdb::WriteBatch* wb) { + env->SetLongField( + jwb, getHandleFieldID(env), + reinterpret_cast(wb)); + } +}; + +class HistogramDataJni { + public: + static jmethodID getConstructorMethodId(JNIEnv* env, jclass jclazz) { + static jmethodID mid = env->GetMethodID( + jclazz, "", "(DDDDD)V"); + assert(mid != nullptr); + return mid; + } +}; +class BackupableDBOptionsJni { + public: + // Get the java class id of org.rocksdb.BackupableDBOptions. 
+  static jclass getJClass(JNIEnv* env) {
+    static jclass jclazz = env->FindClass("org/rocksdb/BackupableDBOptions");
+    assert(jclazz != nullptr);
+    return jclazz;
+  }
+
+  // Get the field id of the member variable of org.rocksdb.BackupableDBOptions
+  // that stores the pointer to rocksdb::BackupableDBOptions
+  static jfieldID getHandleFieldID(JNIEnv* env) {
+    static jfieldID fid = env->GetFieldID(
+        getJClass(env), "nativeHandle_", "J");
+    assert(fid != nullptr);
+    return fid;
+  }
+
+  // Get the pointer to rocksdb::BackupableDBOptions
+  static rocksdb::BackupableDBOptions* getHandle(JNIEnv* env, jobject jobj) {
+    return reinterpret_cast<rocksdb::BackupableDBOptions*>(
+        env->GetLongField(jobj, getHandleFieldID(env)));
+  }
+
+  // Pass the rocksdb::BackupableDBOptions pointer to the java side.
+  static void setHandle(
+      JNIEnv* env, jobject jobj, rocksdb::BackupableDBOptions* op) {
+    env->SetLongField(
+        jobj, getHandleFieldID(env),
+        reinterpret_cast<jlong>(op));
+  }
+};
+
+class IteratorJni {
+ public:
+  // Get the java class id of org.rocksdb.RocksIterator.
+  static jclass getJClass(JNIEnv* env) {
+    static jclass jclazz = env->FindClass("org/rocksdb/RocksIterator");
+    assert(jclazz != nullptr);
+    return jclazz;
+  }
+
+  // Get the field id of the member variable of org.rocksdb.RocksIterator
+  // that stores the pointer to rocksdb::Iterator.
+  static jfieldID getHandleFieldID(JNIEnv* env) {
+    static jfieldID fid = env->GetFieldID(
+        getJClass(env), "nativeHandle_", "J");
+    assert(fid != nullptr);
+    return fid;
+  }
+
+  // Get the pointer to rocksdb::Iterator.
+  static rocksdb::Iterator* getHandle(JNIEnv* env, jobject jobj) {
+    return reinterpret_cast<rocksdb::Iterator*>(
+        env->GetLongField(jobj, getHandleFieldID(env)));
+  }
+
+  // Pass the rocksdb::Iterator pointer to the java side.
+  static void setHandle(
+      JNIEnv* env, jobject jobj, rocksdb::Iterator* op) {
+    env->SetLongField(
+        jobj, getHandleFieldID(env),
+        reinterpret_cast<jlong>(op));
+  }
+};
+
+class FilterJni {
+ public:
+  // Get the java class id of org.rocksdb.Filter.
+  static jclass getJClass(JNIEnv* env) {
+    static jclass jclazz = env->FindClass("org/rocksdb/Filter");
+    assert(jclazz != nullptr);
+    return jclazz;
+  }
+
+  // Get the field id of the member variable of org.rocksdb.Filter
+  // that stores the pointer to rocksdb::FilterPolicy.
+  static jfieldID getHandleFieldID(JNIEnv* env) {
+    static jfieldID fid = env->GetFieldID(
+        getJClass(env), "nativeHandle_", "J");
+    assert(fid != nullptr);
+    return fid;
+  }
+
+  // Get the pointer to rocksdb::FilterPolicy.
+  static rocksdb::FilterPolicy* getHandle(JNIEnv* env, jobject jobj) {
+    return reinterpret_cast<rocksdb::FilterPolicy*>(
+        env->GetLongField(jobj, getHandleFieldID(env)));
+  }
+
+  // Pass the rocksdb::FilterPolicy pointer to the java side.
+  static void setHandle(
+      JNIEnv* env, jobject jobj, const rocksdb::FilterPolicy* op) {
+    env->SetLongField(
+        jobj, getHandleFieldID(env),
+        reinterpret_cast<jlong>(op));
+  }
+};
+
+class ListJni {
+ public:
+  // Get the java class id of java.util.List.
+  static jclass getListClass(JNIEnv* env) {
+    static jclass jclazz = env->FindClass("java/util/List");
+    assert(jclazz != nullptr);
+    return jclazz;
+  }
+
+  // Get the java class id of java.util.ArrayList.
+  static jclass getArrayListClass(JNIEnv* env) {
+    static jclass jclazz = env->FindClass("java/util/ArrayList");
+    assert(jclazz != nullptr);
+    return jclazz;
+  }
+
+  // Get the java class id of java.util.Iterator.
+  static jclass getIteratorClass(JNIEnv* env) {
+    static jclass jclazz = env->FindClass("java/util/Iterator");
+    assert(jclazz != nullptr);
+    return jclazz;
+  }
+
+  // Get the java method id of java.util.List.iterator().
+  static jmethodID getIteratorMethod(JNIEnv* env) {
+    static jmethodID mid = env->GetMethodID(
+        getListClass(env), "iterator", "()Ljava/util/Iterator;");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  // Get the java method id of java.util.Iterator.hasNext().
+  static jmethodID getHasNextMethod(JNIEnv* env) {
+    static jmethodID mid = env->GetMethodID(
+        getIteratorClass(env), "hasNext", "()Z");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  // Get the java method id of java.util.Iterator.next().
+  static jmethodID getNextMethod(JNIEnv* env) {
+    static jmethodID mid = env->GetMethodID(
+        getIteratorClass(env), "next", "()Ljava/lang/Object;");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  // Get the java method id of the java.util.ArrayList constructor.
+  static jmethodID getArrayListConstructorMethodId(JNIEnv* env, jclass jclazz) {
+    static jmethodID mid = env->GetMethodID(
+        jclazz, "<init>", "(I)V");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  // Get the java method id of java.util.List.add().
+  static jmethodID getListAddMethodId(JNIEnv* env) {
+    static jmethodID mid = env->GetMethodID(
+        getListClass(env), "add", "(Ljava/lang/Object;)Z");
+    assert(mid != nullptr);
+    return mid;
+  }
+};
+}  // namespace rocksdb
+#endif  // JAVA_ROCKSJNI_PORTAL_H_
diff --git a/java/rocksjni/rocksjni.cc b/java/rocksjni/rocksjni.cc
new file mode 100644
index 0000000000..697bd0cef7
--- /dev/null
+++ b/java/rocksjni/rocksjni.cc
@@ -0,0 +1,440 @@
+// Copyright (c) 2014, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// This file implements the "bridge" between Java and C++ and enables
+// calling c++ rocksdb::DB methods from the Java side.
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <jni.h>
+#include <string>
+#include <vector>
+
+#include "include/org_rocksdb_RocksDB.h"
+#include "rocksjni/portal.h"
+#include "rocksdb/db.h"
+#include "rocksdb/cache.h"
+
+//////////////////////////////////////////////////////////////////////////////
+// rocksdb::DB::Open
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    open
+ * Signature: (JJLjava/lang/String;)V
+ */
+void Java_org_rocksdb_RocksDB_open(
+    JNIEnv* env, jobject jdb, jlong jopt_handle,
+    jlong jcache_size, jstring jdb_path) {
+  auto opt = reinterpret_cast<rocksdb::Options*>(jopt_handle);
+  // TODO(yhchiang): should be removed once Java binding for Env is ready.
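+  // Give the Env as many background threads as compactions are allowed to
+  // run in parallel; a positive jcache_size then enables a shared LRU block
+  // cache of that many bytes, while a non-positive value disables the block
+  // cache entirely.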
+ if (opt->max_background_compactions > 1) { + opt->env->SetBackgroundThreads(opt->max_background_compactions); + } + if (jcache_size > 0) { + opt->no_block_cache = false; + opt->block_cache = rocksdb::NewLRUCache(jcache_size); + } else { + opt->no_block_cache = true; + opt->block_cache = nullptr; + } + + rocksdb::DB* db = nullptr; + const char* db_path = env->GetStringUTFChars(jdb_path, 0); + rocksdb::Status s = rocksdb::DB::Open(*opt, db_path, &db); + env->ReleaseStringUTFChars(jdb_path, db_path); + + if (s.ok()) { + rocksdb::RocksDBJni::setHandle(env, jdb, db); + return; + } + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); +} + +////////////////////////////////////////////////////////////////////////////// +// rocksdb::DB::Put + +void rocksdb_put_helper( + JNIEnv* env, rocksdb::DB* db, const rocksdb::WriteOptions& write_options, + jbyteArray jkey, jint jkey_len, + jbyteArray jvalue, jint jvalue_len) { + + jbyte* key = env->GetByteArrayElements(jkey, 0); + jbyte* value = env->GetByteArrayElements(jvalue, 0); + rocksdb::Slice key_slice(reinterpret_cast(key), jkey_len); + rocksdb::Slice value_slice(reinterpret_cast(value), jvalue_len); + + rocksdb::Status s = db->Put(write_options, key_slice, value_slice); + + // trigger java unref on key and value. + // by passing JNI_ABORT, it will simply release the reference without + // copying the result back to the java byte array. + env->ReleaseByteArrayElements(jkey, key, JNI_ABORT); + env->ReleaseByteArrayElements(jvalue, value, JNI_ABORT); + + if (s.ok()) { + return; + } + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); +} + +/* + * Class: org_rocksdb_RocksDB + * Method: put + * Signature: (J[BI[BI)V + */ +void Java_org_rocksdb_RocksDB_put__J_3BI_3BI( + JNIEnv* env, jobject jdb, jlong jdb_handle, + jbyteArray jkey, jint jkey_len, + jbyteArray jvalue, jint jvalue_len) { + auto db = reinterpret_cast(jdb_handle); + static const rocksdb::WriteOptions default_write_options = + rocksdb::WriteOptions(); + + rocksdb_put_helper(env, db, default_write_options, + jkey, jkey_len, + jvalue, jvalue_len); +} + +/* + * Class: org_rocksdb_RocksDB + * Method: put + * Signature: (JJ[BI[BI)V + */ +void Java_org_rocksdb_RocksDB_put__JJ_3BI_3BI( + JNIEnv* env, jobject jdb, + jlong jdb_handle, jlong jwrite_options_handle, + jbyteArray jkey, jint jkey_len, + jbyteArray jvalue, jint jvalue_len) { + auto db = reinterpret_cast(jdb_handle); + auto write_options = reinterpret_cast( + jwrite_options_handle); + + rocksdb_put_helper(env, db, *write_options, + jkey, jkey_len, + jvalue, jvalue_len); +} + +////////////////////////////////////////////////////////////////////////////// +// rocksdb::DB::Write +/* + * Class: org_rocksdb_RocksDB + * Method: write + * Signature: (JJ)V + */ +void Java_org_rocksdb_RocksDB_write( + JNIEnv* env, jobject jdb, + jlong jwrite_options_handle, jlong jbatch_handle) { + rocksdb::DB* db = rocksdb::RocksDBJni::getHandle(env, jdb); + auto write_options = reinterpret_cast( + jwrite_options_handle); + auto batch = reinterpret_cast(jbatch_handle); + + rocksdb::Status s = db->Write(*write_options, batch); + + if (!s.ok()) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + } +} + +////////////////////////////////////////////////////////////////////////////// +// rocksdb::DB::Get + +jbyteArray rocksdb_get_helper( + JNIEnv* env, rocksdb::DB* db, const rocksdb::ReadOptions& read_opt, + jbyteArray jkey, jint jkey_len) { + jboolean isCopy; + jbyte* key = env->GetByteArrayElements(jkey, &isCopy); + rocksdb::Slice key_slice( + reinterpret_cast(key), 
+
+//////////////////////////////////////////////////////////////////////////////
+// rocksdb::DB::Get
+
+jbyteArray rocksdb_get_helper(
+    JNIEnv* env, rocksdb::DB* db, const rocksdb::ReadOptions& read_opt,
+    jbyteArray jkey, jint jkey_len) {
+  jboolean isCopy;
+  jbyte* key = env->GetByteArrayElements(jkey, &isCopy);
+  rocksdb::Slice key_slice(
+      reinterpret_cast<char*>(key), jkey_len);
+
+  std::string value;
+  rocksdb::Status s = db->Get(
+      read_opt, key_slice, &value);
+
+  // trigger java unref on key.
+  // by passing JNI_ABORT, it will simply release the reference without
+  // copying the result back to the java byte array.
+  env->ReleaseByteArrayElements(jkey, key, JNI_ABORT);
+
+  if (s.IsNotFound()) {
+    return nullptr;
+  }
+
+  if (s.ok()) {
+    jbyteArray jvalue = env->NewByteArray(value.size());
+    env->SetByteArrayRegion(
+        jvalue, 0, value.size(),
+        reinterpret_cast<const jbyte*>(value.c_str()));
+    return jvalue;
+  }
+  rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
+
+  return nullptr;
+}
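+
+// Note: two Get() helpers are defined here.  The jbyteArray overload above
+// allocates a fresh Java byte array for each hit, while the jint overload
+// below copies into a caller-supplied buffer and returns the full value
+// size, letting the Java side detect truncation and retry with a larger
+// buffer.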
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    get
+ * Signature: (J[BI)[B
+ */
+jbyteArray Java_org_rocksdb_RocksDB_get__J_3BI(
+    JNIEnv* env, jobject jdb, jlong jdb_handle,
+    jbyteArray jkey, jint jkey_len) {
+  return rocksdb_get_helper(env,
+      reinterpret_cast<rocksdb::DB*>(jdb_handle),
+      rocksdb::ReadOptions(),
+      jkey, jkey_len);
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    get
+ * Signature: (JJ[BI)[B
+ */
+jbyteArray Java_org_rocksdb_RocksDB_get__JJ_3BI(
+    JNIEnv* env, jobject jdb, jlong jdb_handle, jlong jropt_handle,
+    jbyteArray jkey, jint jkey_len) {
+  return rocksdb_get_helper(env,
+      reinterpret_cast<rocksdb::DB*>(jdb_handle),
+      *reinterpret_cast<rocksdb::ReadOptions*>(jropt_handle),
+      jkey, jkey_len);
+}
+
+jint rocksdb_get_helper(
+    JNIEnv* env, rocksdb::DB* db, const rocksdb::ReadOptions& read_options,
+    jbyteArray jkey, jint jkey_len,
+    jbyteArray jvalue, jint jvalue_len) {
+  static const int kNotFound = -1;
+  static const int kStatusError = -2;
+
+  jbyte* key = env->GetByteArrayElements(jkey, 0);
+  rocksdb::Slice key_slice(
+      reinterpret_cast<char*>(key), jkey_len);
+
+  // TODO(yhchiang): we might save one memory allocation here by adding
+  // a DB::Get() function which takes preallocated jbyte* as input.
+  std::string cvalue;
+  rocksdb::Status s = db->Get(
+      read_options, key_slice, &cvalue);
+
+  // trigger java unref on key.
+  // by passing JNI_ABORT, it will simply release the reference without
+  // copying the result back to the java byte array.
+  env->ReleaseByteArrayElements(jkey, key, JNI_ABORT);
+
+  if (s.IsNotFound()) {
+    return kNotFound;
+  } else if (!s.ok()) {
+    // We are throwing a Java exception from the c++ side, but c++ itself
+    // does not know that this call throws; execution does not stop here,
+    // and any code after ThrowNew() would still run.
+    rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
+
+    // Return a dummy const value to avoid compilation error, although
+    // java side might not have a chance to get the return value :)
+    return kStatusError;
+  }
+
+  int cvalue_len = static_cast<int>(cvalue.size());
+  int length = std::min(jvalue_len, cvalue_len);
+
+  env->SetByteArrayRegion(
+      jvalue, 0, length,
+      reinterpret_cast<const jbyte*>(cvalue.c_str()));
+  return cvalue_len;
+}
+
+jobject multi_get_helper(JNIEnv* env, jobject jdb, rocksdb::DB* db,
+    const rocksdb::ReadOptions& rOpt, jobject jkey_list, jint jkeys_count) {
+  std::vector<rocksdb::Slice> keys;
+  std::vector<jbyte*> keys_to_free;
+
+  // get iterator
+  jobject iteratorObj = env->CallObjectMethod(
+      jkey_list, rocksdb::ListJni::getIteratorMethod(env));
+
+  // iterate over keys and convert java byte array to slice
+  while (env->CallBooleanMethod(
+      iteratorObj, rocksdb::ListJni::getHasNextMethod(env)) == JNI_TRUE) {
+    jbyteArray jkey = (jbyteArray) env->CallObjectMethod(
+        iteratorObj, rocksdb::ListJni::getNextMethod(env));
+    jint key_length = env->GetArrayLength(jkey);
+
+    jbyte* key = new jbyte[key_length];
+    env->GetByteArrayRegion(jkey, 0, key_length, key);
+    // store allocated jbyte to free it after multiGet call
+    keys_to_free.push_back(key);
+
+    rocksdb::Slice key_slice(
+        reinterpret_cast<char*>(key), key_length);
+    keys.push_back(key_slice);
+  }
+
+  std::vector<std::string> values;
+  std::vector<rocksdb::Status> s = db->MultiGet(rOpt, keys, &values);
+
+  // Don't reuse class pointer
+  jclass jclazz = env->FindClass("java/util/ArrayList");
+  jmethodID mid = rocksdb::ListJni::getArrayListConstructorMethodId(
+      env, jclazz);
+  jobject jvalue_list = env->NewObject(jclazz, mid, jkeys_count);
+
+  // insert in java list
+  for (std::vector<rocksdb::Status>::size_type i = 0; i != s.size(); i++) {
+    if (s[i].ok()) {
+      jbyteArray jvalue = env->NewByteArray(values[i].size());
+      env->SetByteArrayRegion(
+          jvalue, 0, values[i].size(),
+          reinterpret_cast<const jbyte*>(values[i].c_str()));
+      env->CallBooleanMethod(
+          jvalue_list, rocksdb::ListJni::getListAddMethodId(env), jvalue);
+    } else {
+      env->CallBooleanMethod(
+          jvalue_list, rocksdb::ListJni::getListAddMethodId(env), nullptr);
+    }
+  }
+
+  // free up allocated byte arrays
+  for (std::vector<jbyte*>::size_type i = 0; i != keys_to_free.size(); i++) {
+    delete[] keys_to_free[i];
+  }
+  keys_to_free.clear();
+
+  return jvalue_list;
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    multiGet
+ * Signature: (JLjava/util/List;I)Ljava/util/List;
+ */
+jobject Java_org_rocksdb_RocksDB_multiGet__JLjava_util_List_2I(
+    JNIEnv* env, jobject jdb, jlong jdb_handle,
+    jobject jkey_list, jint jkeys_count) {
+  return multi_get_helper(env, jdb, reinterpret_cast<rocksdb::DB*>(jdb_handle),
+      rocksdb::ReadOptions(), jkey_list, jkeys_count);
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    multiGet
+ * Signature: (JJLjava/util/List;I)Ljava/util/List;
+ */
+jobject Java_org_rocksdb_RocksDB_multiGet__JJLjava_util_List_2I(
+    JNIEnv* env, jobject jdb, jlong jdb_handle,
+    jlong jropt_handle, jobject jkey_list, jint jkeys_count) {
+  return multi_get_helper(env, jdb, reinterpret_cast<rocksdb::DB*>(jdb_handle),
+      *reinterpret_cast<rocksdb::ReadOptions*>(jropt_handle), jkey_list,
+      jkeys_count);
+}
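+
+// Illustrative Java-side usage of multiGet. This is a sketch; it assumes
+// an org.rocksdb.RocksDB#multiGet(List<byte[]>) wrapper backing the natives
+// above.
+//
+//   List<byte[]> keys = Arrays.asList("a".getBytes(), "b".getBytes());
+//   List<byte[]> values = db.multiGet(keys);
+//   // values.get(i) == null where keys.get(i) was not found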
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    get
+ * Signature: (J[BI[BI)I
+ */
+jint Java_org_rocksdb_RocksDB_get__J_3BI_3BI(
+    JNIEnv* env, jobject jdb, jlong jdb_handle,
+    jbyteArray jkey, jint jkey_len,
+    jbyteArray jvalue, jint jvalue_len) {
+  return rocksdb_get_helper(env,
+      reinterpret_cast<rocksdb::DB*>(jdb_handle),
+      rocksdb::ReadOptions(),
+      jkey, jkey_len, jvalue, jvalue_len);
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    get
+ * Signature: (JJ[BI[BI)I
+ */
+jint Java_org_rocksdb_RocksDB_get__JJ_3BI_3BI(
+    JNIEnv* env, jobject jdb, jlong jdb_handle, jlong jropt_handle,
+    jbyteArray jkey, jint jkey_len,
+    jbyteArray jvalue, jint jvalue_len) {
+  return rocksdb_get_helper(env,
+      reinterpret_cast<rocksdb::DB*>(jdb_handle),
+      *reinterpret_cast<rocksdb::ReadOptions*>(jropt_handle),
+      jkey, jkey_len, jvalue, jvalue_len);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// rocksdb::DB::Delete()
+void rocksdb_remove_helper(
+    JNIEnv* env, rocksdb::DB* db, const rocksdb::WriteOptions& write_options,
+    jbyteArray jkey, jint jkey_len) {
+  jbyte* key = env->GetByteArrayElements(jkey, 0);
+  rocksdb::Slice key_slice(reinterpret_cast<char*>(key), jkey_len);
+
+  rocksdb::Status s = db->Delete(write_options, key_slice);
+
+  // trigger java unref on key.
+  // by passing JNI_ABORT, it will simply release the reference without
+  // copying the result back to the java byte array.
+  env->ReleaseByteArrayElements(jkey, key, JNI_ABORT);
+
+  if (!s.ok()) {
+    rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
+  }
+  return;
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    remove
+ * Signature: (J[BI)V
+ */
+void Java_org_rocksdb_RocksDB_remove__J_3BI(
+    JNIEnv* env, jobject jdb, jlong jdb_handle,
+    jbyteArray jkey, jint jkey_len) {
+  auto db = reinterpret_cast<rocksdb::DB*>(jdb_handle);
+  static const rocksdb::WriteOptions default_write_options =
+      rocksdb::WriteOptions();
+
+  rocksdb_remove_helper(env, db, default_write_options, jkey, jkey_len);
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    remove
+ * Signature: (JJ[BI)V
+ */
+void Java_org_rocksdb_RocksDB_remove__JJ_3BI(
+    JNIEnv* env, jobject jdb, jlong jdb_handle,
+    jlong jwrite_options, jbyteArray jkey, jint jkey_len) {
+  auto db = reinterpret_cast<rocksdb::DB*>(jdb_handle);
+  auto write_options =
+      reinterpret_cast<rocksdb::WriteOptions*>(jwrite_options);
+
+  rocksdb_remove_helper(env, db, *write_options, jkey, jkey_len);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// rocksdb::DB::~DB()
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_RocksDB_disposeInternal(
+    JNIEnv* env, jobject java_db, jlong jhandle) {
+  delete reinterpret_cast<rocksdb::DB*>(jhandle);
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    iterator0
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_RocksDB_iterator0(
+    JNIEnv* env, jobject jdb, jlong db_handle) {
+  auto db = reinterpret_cast<rocksdb::DB*>(db_handle);
+  rocksdb::Iterator* iterator = db->NewIterator(rocksdb::ReadOptions());
+  return reinterpret_cast<jlong>(iterator);
+}
diff --git a/java/rocksjni/statistics.cc b/java/rocksjni/statistics.cc
new file mode 100644
index 0000000000..bf170c6de4
--- /dev/null
+++ b/java/rocksjni/statistics.cc
@@ -0,0 +1,50 @@
+// Copyright (c) 2014, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// This file implements the "bridge" between Java and C++ and enables
+// calling c++ rocksdb::Statistics methods from Java side.
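+//
+// Illustrative Java-side usage (a sketch; it assumes the org.rocksdb
+// Statistics wrapper and TickerType enum, whose names may differ from
+// this snippet):
+//
+//   Options options = new Options().createStatistics();
+//   Statistics stats = options.statisticsPtr();
+//   long cacheHits = stats.getTickerCount(TickerType.BLOCK_CACHE_HIT);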
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <jni.h>
+
+#include "include/org_rocksdb_Statistics.h"
+#include "rocksjni/portal.h"
+#include "rocksdb/statistics.h"
+
+/*
+ * Class:     org_rocksdb_Statistics
+ * Method:    getTickerCount0
+ * Signature: (IJ)J
+ */
+jlong Java_org_rocksdb_Statistics_getTickerCount0(
+    JNIEnv* env, jobject jobj, int tickerType, jlong handle) {
+  auto st = reinterpret_cast<rocksdb::Statistics*>(handle);
+  assert(st != nullptr);
+
+  return st->getTickerCount(static_cast<rocksdb::Tickers>(tickerType));
+}
+
+/*
+ * Class:     org_rocksdb_Statistics
+ * Method:    geHistogramData0
+ * Signature: (IJ)Lorg/rocksdb/HistogramData;
+ */
+jobject Java_org_rocksdb_Statistics_geHistogramData0(
+    JNIEnv* env, jobject jobj, int histogramType, jlong handle) {
+  auto st = reinterpret_cast<rocksdb::Statistics*>(handle);
+  assert(st != nullptr);
+
+  rocksdb::HistogramData data;
+  st->histogramData(static_cast<rocksdb::Histograms>(histogramType),
+      &data);
+
+  // Don't reuse class pointer
+  jclass jclazz = env->FindClass("org/rocksdb/HistogramData");
+  jmethodID mid = rocksdb::HistogramDataJni::getConstructorMethodId(
+      env, jclazz);
+  return env->NewObject(jclazz, mid, data.median, data.percentile95,
+      data.percentile99, data.average, data.standard_deviation);
+}
diff --git a/java/rocksjni/table.cc b/java/rocksjni/table.cc
new file mode 100644
index 0000000000..c21501bb4f
--- /dev/null
+++ b/java/rocksjni/table.cc
@@ -0,0 +1,25 @@
+// Copyright (c) 2014, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// This file implements the "bridge" between Java and C++ for rocksdb::Options.
+
+#include <jni.h>
+#include "include/org_rocksdb_PlainTableConfig.h"
+#include "rocksdb/table.h"
+
+/*
+ * Class:     org_rocksdb_PlainTableConfig
+ * Method:    newTableFactoryHandle
+ * Signature: (IIDI)J
+ */
+jlong Java_org_rocksdb_PlainTableConfig_newTableFactoryHandle(
+    JNIEnv* env, jobject jobj, jint jkey_size, jint jbloom_bits_per_key,
+    jdouble jhash_table_ratio, jint jindex_sparseness) {
+  return reinterpret_cast<jlong>(rocksdb::NewPlainTableFactory(
+      static_cast<uint32_t>(jkey_size),
+      static_cast<int>(jbloom_bits_per_key),
+      static_cast<double>(jhash_table_ratio),
+      static_cast<size_t>(jindex_sparseness)));
+}
diff --git a/java/rocksjni/write_batch.cc b/java/rocksjni/write_batch.cc
new file mode 100644
index 0000000000..e8b2456eee
--- /dev/null
+++ b/java/rocksjni/write_batch.cc
@@ -0,0 +1,261 @@
+// Copyright (c) 2014, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// This file implements the "bridge" between Java and C++ and enables
+// calling c++ rocksdb::WriteBatch methods from Java side.
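+//
+// Native handle lifecycle (Java side): the org.rocksdb.WriteBatch
+// constructor is expected to call newWriteBatch() below, which stores the
+// C++ pointer via setHandle(); disposing the Java object later reaches
+// disposeInternal(), which deletes it.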
+#include <memory>
+
+#include "include/org_rocksdb_WriteBatch.h"
+#include "include/org_rocksdb_WriteBatchInternal.h"
+#include "include/org_rocksdb_WriteBatchTest.h"
+#include "rocksjni/portal.h"
+#include "rocksdb/db.h"
+#include "db/memtable.h"
+#include "rocksdb/write_batch.h"
+#include "db/write_batch_internal.h"
+#include "rocksdb/env.h"
+#include "rocksdb/memtablerep.h"
+#include "util/logging.h"
+#include "util/testharness.h"
+
+/*
+ * Class:     org_rocksdb_WriteBatch
+ * Method:    newWriteBatch
+ * Signature: (I)V
+ */
+void Java_org_rocksdb_WriteBatch_newWriteBatch(
+    JNIEnv* env, jobject jobj, jint jreserved_bytes) {
+  rocksdb::WriteBatch* wb = new rocksdb::WriteBatch(
+      static_cast<size_t>(jreserved_bytes));
+
+  rocksdb::WriteBatchJni::setHandle(env, jobj, wb);
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatch
+ * Method:    count
+ * Signature: ()I
+ */
+jint Java_org_rocksdb_WriteBatch_count(JNIEnv* env, jobject jobj) {
+  rocksdb::WriteBatch* wb = rocksdb::WriteBatchJni::getHandle(env, jobj);
+  assert(wb != nullptr);
+
+  return static_cast<jint>(wb->Count());
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatch
+ * Method:    clear
+ * Signature: ()V
+ */
+void Java_org_rocksdb_WriteBatch_clear(JNIEnv* env, jobject jobj) {
+  rocksdb::WriteBatch* wb = rocksdb::WriteBatchJni::getHandle(env, jobj);
+  assert(wb != nullptr);
+
+  wb->Clear();
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatch
+ * Method:    put
+ * Signature: ([BI[BI)V
+ */
+void Java_org_rocksdb_WriteBatch_put(
+    JNIEnv* env, jobject jobj,
+    jbyteArray jkey, jint jkey_len,
+    jbyteArray jvalue, jint jvalue_len) {
+  rocksdb::WriteBatch* wb = rocksdb::WriteBatchJni::getHandle(env, jobj);
+  assert(wb != nullptr);
+
+  jbyte* key = env->GetByteArrayElements(jkey, nullptr);
+  jbyte* value = env->GetByteArrayElements(jvalue, nullptr);
+  rocksdb::Slice key_slice(reinterpret_cast<char*>(key), jkey_len);
+  rocksdb::Slice value_slice(reinterpret_cast<char*>(value), jvalue_len);
+  wb->Put(key_slice, value_slice);
+  env->ReleaseByteArrayElements(jkey, key, JNI_ABORT);
+  env->ReleaseByteArrayElements(jvalue, value, JNI_ABORT);
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatch
+ * Method:    merge
+ * Signature: ([BI[BI)V
+ */
+JNIEXPORT void JNICALL Java_org_rocksdb_WriteBatch_merge(
+    JNIEnv* env, jobject jobj,
+    jbyteArray jkey, jint jkey_len,
+    jbyteArray jvalue, jint jvalue_len) {
+  rocksdb::WriteBatch* wb = rocksdb::WriteBatchJni::getHandle(env, jobj);
+  assert(wb != nullptr);
+
+  jbyte* key = env->GetByteArrayElements(jkey, nullptr);
+  jbyte* value = env->GetByteArrayElements(jvalue, nullptr);
+  rocksdb::Slice key_slice(reinterpret_cast<char*>(key), jkey_len);
+  rocksdb::Slice value_slice(reinterpret_cast<char*>(value), jvalue_len);
+  wb->Merge(key_slice, value_slice);
+  env->ReleaseByteArrayElements(jkey, key, JNI_ABORT);
+  env->ReleaseByteArrayElements(jvalue, value, JNI_ABORT);
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatch
+ * Method:    remove
+ * Signature: ([BI)V
+ */
+JNIEXPORT void JNICALL Java_org_rocksdb_WriteBatch_remove(
+    JNIEnv* env, jobject jobj,
+    jbyteArray jkey, jint jkey_len) {
+  rocksdb::WriteBatch* wb = rocksdb::WriteBatchJni::getHandle(env, jobj);
+  assert(wb != nullptr);
+
+  jbyte* key = env->GetByteArrayElements(jkey, nullptr);
+  rocksdb::Slice key_slice(reinterpret_cast<char*>(key), jkey_len);
+  wb->Delete(key_slice);
+  env->ReleaseByteArrayElements(jkey, key, JNI_ABORT);
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatch
+ * Method:    putLogData
+ * Signature: ([BI)V
+ */
+void Java_org_rocksdb_WriteBatch_putLogData(
+    JNIEnv* env, jobject jobj, jbyteArray jblob, jint jblob_len) {
+  rocksdb::WriteBatch* wb =
+      rocksdb::WriteBatchJni::getHandle(env, jobj);
+  assert(wb != nullptr);
+
+  jbyte* blob = env->GetByteArrayElements(jblob, nullptr);
+  rocksdb::Slice blob_slice(reinterpret_cast<char*>(blob), jblob_len);
+  wb->PutLogData(blob_slice);
+  env->ReleaseByteArrayElements(jblob, blob, JNI_ABORT);
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatch
+ * Method:    disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_WriteBatch_disposeInternal(
+    JNIEnv* env, jobject jobj, jlong handle) {
+  delete reinterpret_cast<rocksdb::WriteBatch*>(handle);
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatchInternal
+ * Method:    setSequence
+ * Signature: (Lorg/rocksdb/WriteBatch;J)V
+ */
+void Java_org_rocksdb_WriteBatchInternal_setSequence(
+    JNIEnv* env, jclass jclazz, jobject jobj, jlong jsn) {
+  rocksdb::WriteBatch* wb = rocksdb::WriteBatchJni::getHandle(env, jobj);
+  assert(wb != nullptr);
+
+  rocksdb::WriteBatchInternal::SetSequence(
+      wb, static_cast<rocksdb::SequenceNumber>(jsn));
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatchInternal
+ * Method:    sequence
+ * Signature: (Lorg/rocksdb/WriteBatch;)J
+ */
+jlong Java_org_rocksdb_WriteBatchInternal_sequence(
+    JNIEnv* env, jclass jclazz, jobject jobj) {
+  rocksdb::WriteBatch* wb = rocksdb::WriteBatchJni::getHandle(env, jobj);
+  assert(wb != nullptr);
+
+  return static_cast<jlong>(rocksdb::WriteBatchInternal::Sequence(wb));
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatchInternal
+ * Method:    append
+ * Signature: (Lorg/rocksdb/WriteBatch;Lorg/rocksdb/WriteBatch;)V
+ */
+void Java_org_rocksdb_WriteBatchInternal_append(
+    JNIEnv* env, jclass jclazz, jobject jwb1, jobject jwb2) {
+  rocksdb::WriteBatch* wb1 = rocksdb::WriteBatchJni::getHandle(env, jwb1);
+  assert(wb1 != nullptr);
+  rocksdb::WriteBatch* wb2 = rocksdb::WriteBatchJni::getHandle(env, jwb2);
+  assert(wb2 != nullptr);
+
+  rocksdb::WriteBatchInternal::Append(wb1, wb2);
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatchTest
+ * Method:    getContents
+ * Signature: (Lorg/rocksdb/WriteBatch;)[B
+ */
+jbyteArray Java_org_rocksdb_WriteBatchTest_getContents(
+    JNIEnv* env, jclass jclazz, jobject jobj) {
+  rocksdb::WriteBatch* b = rocksdb::WriteBatchJni::getHandle(env, jobj);
+  assert(b != nullptr);
+
+  // todo: Currently the following code is directly copied from
+  // db/write_batch_test.cc.  It could be implemented in java once
+  // all the necessary components can be accessed via jni api.
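+  //
+  // The code below replays the batch into a fresh in-memory MemTable and
+  // renders each entry as "Put(k, v)", "Merge(k, v)" or "Delete(k)"
+  // followed by "@sequence", so the Java test can compare the result
+  // against an expected string.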
+
+  rocksdb::InternalKeyComparator cmp(rocksdb::BytewiseComparator());
+  auto factory = std::make_shared<rocksdb::SkipListFactory>();
+  rocksdb::Options options;
+  options.memtable_factory = factory;
+  rocksdb::MemTable* mem = new rocksdb::MemTable(cmp, options);
+  mem->Ref();
+  std::string state;
+  rocksdb::ColumnFamilyMemTablesDefault cf_mems_default(mem, &options);
+  rocksdb::Status s =
+      rocksdb::WriteBatchInternal::InsertInto(b, &cf_mems_default);
+  int count = 0;
+  rocksdb::Iterator* iter = mem->NewIterator(rocksdb::ReadOptions());
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    rocksdb::ParsedInternalKey ikey;
+    memset(reinterpret_cast<void*>(&ikey), 0, sizeof(ikey));
+    ASSERT_TRUE(rocksdb::ParseInternalKey(iter->key(), &ikey));
+    switch (ikey.type) {
+      case rocksdb::kTypeValue:
+        state.append("Put(");
+        state.append(ikey.user_key.ToString());
+        state.append(", ");
+        state.append(iter->value().ToString());
+        state.append(")");
+        count++;
+        break;
+      case rocksdb::kTypeMerge:
+        state.append("Merge(");
+        state.append(ikey.user_key.ToString());
+        state.append(", ");
+        state.append(iter->value().ToString());
+        state.append(")");
+        count++;
+        break;
+      case rocksdb::kTypeDeletion:
+        state.append("Delete(");
+        state.append(ikey.user_key.ToString());
+        state.append(")");
+        count++;
+        break;
+      default:
+        assert(false);
+        break;
+    }
+    state.append("@");
+    state.append(rocksdb::NumberToString(ikey.sequence));
+  }
+  delete iter;
+  if (!s.ok()) {
+    state.append(s.ToString());
+  } else if (count != rocksdb::WriteBatchInternal::Count(b)) {
+    state.append("CountMismatch()");
+  }
+  delete mem->Unref();
+
+  jbyteArray jstate = env->NewByteArray(state.size());
+  env->SetByteArrayRegion(
+      jstate, 0, state.size(),
+      reinterpret_cast<const jbyte*>(state.c_str()));
+
+  return jstate;
+}
diff --git a/linters/__phutil_library_init__.php b/linters/__phutil_library_init__.php
new file mode 100644
index 0000000000..4b8d3d1316
--- /dev/null
+++ b/linters/__phutil_library_init__.php
@@ -0,0 +1,3 @@
+<?php
+
+phutil_register_library('rocksdb', __FILE__);
diff --git a/linters/__phutil_library_map__.php b/linters/__phutil_library_map__.php
new file mode 100644
--- /dev/null
+++ b/linters/__phutil_library_map__.php
@@ -0,0 +1,21 @@
+<?php
+
+phutil_register_library_map(array(
+  'version' => 2,
+  'class' =>
+  array(
+    'FacebookFbcodeLintEngine' => 'lint_engine/FacebookFbcodeLintEngine.php',
+    'FbcodeCppLinter' => 'cpp_linter/FbcodeCppLinter.php',
+    'PfffCppLinter' => 'cpp_linter/PfffCppLinter.php',
+    'ArcanistCpplintLinter' => 'cpp_linter/ArcanistCpplintLinter.php',
+  ),
+  'function' =>
+  array(
+  ),
+  'xmap' =>
+  array(
+    'FacebookFbcodeLintEngine' => 'ArcanistLintEngine',
+    'FbcodeCppLinter' => 'ArcanistLinter',
+    'PfffCppLinter' => 'ArcanistLinter',
+  ),
+));
diff --git a/linters/cpp_linter/ArcanistCpplintLinter.php b/linters/cpp_linter/ArcanistCpplintLinter.php
new file mode 100644
index 0000000000..b9c4137555
--- /dev/null
+++ b/linters/cpp_linter/ArcanistCpplintLinter.php
@@ -0,0 +1,88 @@
+<?php
+
+class ArcanistCpplintLinter extends ArcanistLinter {
+
+  public function getLinterName() {
+    return 'cpplint.py';
+  }
+
+  public function getLintPath() {
+    $bin = 'cpplint.py';
+
+    // Look for cpplint.py in the linter's own directory
+    list($err) = exec_manual('test -x %s/%s', $this->linterDir(), $bin);
+    if (!$err) {
+      return $this->linterDir().'/'.$bin;
+    }
+
+    // Look for globally installed cpplint.py
+    list($err) = exec_manual('which %s', $bin);
+    if ($err) {
+      throw new ArcanistUsageException(
+        "cpplint.py does not appear to be installed on this system. Install ".
+        "it (e.g., with 'wget \"http://google-styleguide.googlecode.com/".
+        "svn/trunk/cpplint/cpplint.py\"') ".
+        "in your .arcconfig to point to the directory where it resides. ".
+ "Also don't forget to chmod a+x cpplint.py!"); + } + + return $bin; + } + + public function lintPath($path) { + $bin = $this->getLintPath(); + $path = $this->rocksdbDir().'/'.$path; + + $f = new ExecFuture("%C $path", $bin); + + list($err, $stdout, $stderr) = $f->resolve(); + + if ($err === 2) { + throw new Exception("cpplint failed to run correctly:\n".$stderr); + } + + $lines = explode("\n", $stderr); + $messages = array(); + foreach ($lines as $line) { + $line = trim($line); + $matches = null; + $regex = '/^[^:]+:(\d+):\s*(.*)\s*\[(.*)\] \[(\d+)\]$/'; + if (!preg_match($regex, $line, $matches)) { + continue; + } + foreach ($matches as $key => $match) { + $matches[$key] = trim($match); + } + $message = new ArcanistLintMessage(); + $message->setPath($path); + $message->setLine($matches[1]); + $message->setCode($matches[3]); + $message->setName($matches[3]); + $message->setDescription($matches[2]); + $message->setSeverity(ArcanistLintSeverity::SEVERITY_WARNING); + $this->addLintMessage($message); + } + } + + // The path of this linter + private function linterDir() { + return dirname(__FILE__); + } + + // TODO(kaili) a quick and dirty way to figure out rocksdb's root dir. + private function rocksdbDir() { + return $this->linterDir()."/../.."; + } +} diff --git a/linters/cpp_linter/FbcodeCppLinter.php b/linters/cpp_linter/FbcodeCppLinter.php new file mode 100644 index 0000000000..e62d3bbe1b --- /dev/null +++ b/linters/cpp_linter/FbcodeCppLinter.php @@ -0,0 +1,99 @@ +getEngine()->getFilePathOnDisk($p); + $lpath_file = file($lpath); + if (preg_match('/\.(c)$/', $lpath) || + preg_match('/-\*-.*Mode: C[; ].*-\*-/', $lpath_file[0]) || + preg_match('/vim(:.*)*:\s*(set\s+)?filetype=c\s*:/', $lpath_file[0]) + ) { + $futures[$p] = new ExecFuture("%s %s %s 2>&1", + $CPP_LINT, self::C_FLAG, + $this->getEngine()->getFilePathOnDisk($p)); + } else { + $futures[$p] = new ExecFuture("%s %s 2>&1", + self::CPPLINT, $this->getEngine()->getFilePathOnDisk($p)); + } + } + + foreach (Futures($futures)->limit(8) as $p => $f) { + $this->rawLintOutput[$p] = $f->resolvex(); + } + } + return; + } + + public function getLinterName() { + return "FBCPP"; + } + + public function lintPath($path) { + $msgs = $this->getCppLintOutput($path); + foreach ($msgs as $m) { + $this->raiseLintAtLine($m['line'], 0, $m['severity'], $m['msg']); + } + } + + public function getLintSeverityMap() { + return array( + self::LINT_WARNING => ArcanistLintSeverity::SEVERITY_WARNING, + self::LINT_ERROR => ArcanistLintSeverity::SEVERITY_ERROR + ); + } + + public function getLintNameMap() { + return array( + self::LINT_WARNING => "CppLint Warning", + self::LINT_ERROR => "CppLint Error" + ); + } + + private function getCppLintOutput($path) { + list($output) = $this->rawLintOutput[$path]; + + $msgs = array(); + $current = null; + foreach (explode("\n", $output) as $line) { + if (preg_match('/[^:]*\((\d+)\):(.*)$/', $line, $matches)) { + if ($current) { + $msgs[] = $current; + } + $line = $matches[1]; + $text = $matches[2]; + $sev = preg_match('/.*Warning.*/', $text) + ? self::LINT_WARNING + : self::LINT_ERROR; + $current = array('line' => $line, + 'msg' => $text, + 'severity' => $sev); + } else if ($current) { + $current['msg'] .= ' ' . 
+  private function getCppLintOutput($path) {
+    list($output) = $this->rawLintOutput[$path];
+
+    $msgs = array();
+    $current = null;
+    foreach (explode("\n", $output) as $line) {
+      if (preg_match('/[^:]*\((\d+)\):(.*)$/', $line, $matches)) {
+        if ($current) {
+          $msgs[] = $current;
+        }
+        $line = $matches[1];
+        $text = $matches[2];
+        $sev = preg_match('/.*Warning.*/', $text)
+               ? self::LINT_WARNING
+               : self::LINT_ERROR;
+        $current = array('line' => $line,
+                         'msg' => $text,
+                         'severity' => $sev);
+      } else if ($current) {
+        $current['msg'] .= ' ' . $line;
+      }
+    }
+    if ($current) {
+      $msgs[] = $current;
+    }
+
+    return $msgs;
+  }
+}
+
diff --git a/linters/cpp_linter/PfffCppLinter.php b/linters/cpp_linter/PfffCppLinter.php
new file mode 100644
index 0000000000..67366143ce
--- /dev/null
+++ b/linters/cpp_linter/PfffCppLinter.php
@@ -0,0 +1,68 @@
+<?php
+// Copyright 2004-present Facebook.  All Rights Reserved.
+
+class PfffCppLinter extends ArcanistLinter {
+  const PROGRAM = "/home/engshare/tools/checkCpp";
+
+  public function getLinterName() {
+    return "checkCpp";
+  }
+
+  public function willLintPaths(array $paths) {
+    $program = false;
+    $ret_value = 0;
+    $last_line = system("which checkCpp", $ret_value);
+    if ($ret_value == 0) {
+      $program = $last_line;
+    } else if (file_exists(self::PROGRAM)) {
+      $program = self::PROGRAM;
+    }
+    if ($program) {
+      $futures = array();
+      foreach ($paths as $p) {
+        $futures[$p] = new ExecFuture("%s %s 2>&1",
+          $program, $this->getEngine()->getFilePathOnDisk($p));
+      }
+      foreach (Futures($futures)->limit(8) as $p => $f) {
+
+        list($stdout, $stderr) = $f->resolvex();
+        $raw = json_decode($stdout, true);
+        if (!is_array($raw)) {
+          throw new Exception(
+            "checkCpp returned invalid JSON!".
+            "Stdout: {$stdout} Stderr: {$stderr}"
+          );
+        }
+        foreach ($raw as $err) {
+          $this->addLintMessage(
+            ArcanistLintMessage::newFromDictionary(
+              array(
+                'path' => $err['file'],
+                'line' => $err['line'],
+                'char' => 0,
+                'name' => $err['name'],
+                'description' => $err['info'],
+                'code' => $this->getLinterName(),
+                'severity' => ArcanistLintSeverity::SEVERITY_WARNING,
+              )
+            )
+          );
+        }
+      }
+    }
+    return;
+  }
+
+  public function lintPath($path) {
+    return;
+  }
+}
+
diff --git a/linters/cpp_linter/cpplint.py b/linters/cpp_linter/cpplint.py
new file mode 100755
index 0000000000..d264b00da0
--- /dev/null
+++ b/linters/cpp_linter/cpplint.py
@@ -0,0 +1,4767 @@
+#!/usr/bin/python
+# Copyright (c) 2013, Facebook, Inc. All rights reserved.
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree. An additional grant
+# of patent rights can be found in the PATENTS file in the same directory.
+# Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file. See the AUTHORS file for names of contributors.
+#
+# Copyright (c) 2009 Google Inc. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#    * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+#    * Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following disclaimer
+# in the documentation and/or other materials provided with the
+# distribution.
+#    * Neither the name of Google Inc. nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+"""Does google-lint on c++ files.
+
+The goal of this script is to identify places in the code that *may*
+be in non-compliance with google style.  It does not attempt to fix
+up these problems -- the point is to educate.
+It also does not attempt to find all problems, or to ensure that
+everything it does find is legitimately a problem.
+
+In particular, we can get very confused by /* and // inside strings!
+We do a small hack, which is to ignore //'s with "'s after them on the
+same line, but it is far from perfect (in either direction).
+"""
+
+import codecs
+import copy
+import getopt
+import math  # for log
+import os
+import re
+import sre_compile
+import string
+import sys
+import unicodedata
+
+
+_USAGE = """
+Syntax: cpplint.py [--verbose=#] [--output=vs7] [--filter=-x,+y,...]
+                   [--counting=total|toplevel|detailed] [--root=subdir]
+                   [--linelength=digits]
+        <file> [file] ...
+
+  The style guidelines this tries to follow are those in
+    http://google-styleguide.googlecode.com/svn/trunk/cppguide.xml
+
+  Every problem is given a confidence score from 1-5, with 5 meaning we are
+  certain of the problem, and 1 meaning it could be a legitimate construct.
+  This will miss some errors, and is not a substitute for a code review.
+
+  To suppress false-positive errors of a certain category, add a
+  'NOLINT(category)' comment to the line.  NOLINT or NOLINT(*)
+  suppresses errors of all categories on that line.
+
+  The files passed in will be linted; at least one file must be provided.
+  Default linted extensions are .cc, .cpp, .cu, .cuh and .h.  Change the
+  extensions with the --extensions flag.
+
+  Flags:
+
+    output=vs7
+      By default, the output is formatted to ease emacs parsing.  Visual Studio
+      compatible output (vs7) may also be used.  Other formats are unsupported.
+
+    verbose=#
+      Specify a number 0-5 to restrict errors to certain verbosity levels.
+
+    filter=-x,+y,...
+      Specify a comma-separated list of category-filters to apply: only
+      error messages whose category names pass the filters will be printed.
+      (Category names are printed with the message and look like
+      "[whitespace/indent]".)  Filters are evaluated left to right.
+      "-FOO" and "FOO" mean "do not print categories that start with FOO".
+      "+FOO" means "do print categories that start with FOO".
+
+      Examples: --filter=-whitespace,+whitespace/braces
+                --filter=whitespace,runtime/printf,+runtime/printf_format
+                --filter=-,+build/include_what_you_use
+
+      To see a list of all the categories used in cpplint, pass no arg:
+         --filter=
+
+    counting=total|toplevel|detailed
+      The total number of errors found is always printed. If
+      'toplevel' is provided, then the count of errors in each of
+      the top-level categories like 'build' and 'whitespace' will
+      also be printed. If 'detailed' is provided, then a count
+      is provided for each category like 'build/class'.
+
+    root=subdir
+      The root directory used for deriving header guard CPP variable.
+      By default, the header guard CPP variable is calculated as the relative
+      path to the directory that contains .git, .hg, or .svn.  When this flag
+      is specified, the relative path is calculated from the specified
+      directory. If the specified directory does not exist, this flag is
+      ignored.
+
+      Examples:
+        Assuming that src/.git exists, the header guard CPP variables for
+        src/chrome/browser/ui/browser.h are:
+
+        No flag => CHROME_BROWSER_UI_BROWSER_H_
+        --root=chrome => BROWSER_UI_BROWSER_H_
+        --root=chrome/browser => UI_BROWSER_H_
+
+    linelength=digits
+      This is the allowed line length for the project. The default value is
+      80 characters.
+
+      Examples:
+        --linelength=120
+
+    extensions=extension,extension,...
+      The allowed file extensions that cpplint will check
+
+      Examples:
+        --extensions=hpp,cpp
+"""
+
+# We categorize each error message we print.  Here are the categories.
+# We want an explicit list so we can list them all in cpplint --filter=.
+# If you add a new error message with a new category, add it to the list
+# here!  cpplint_unittest.py should tell you if you forget to do this.
+_ERROR_CATEGORIES = [
+  'build/class',
+  'build/deprecated',
+  'build/endif_comment',
+  'build/explicit_make_pair',
+  'build/forward_decl',
+  'build/header_guard',
+  'build/include',
+  'build/include_alpha',
+  'build/include_order',
+  'build/include_what_you_use',
+  'build/namespaces',
+  'build/printf_format',
+  'build/storage_class',
+  'legal/copyright',
+  'readability/alt_tokens',
+  'readability/braces',
+  'readability/casting',
+  'readability/check',
+  'readability/constructors',
+  'readability/fn_size',
+  'readability/function',
+  'readability/multiline_comment',
+  'readability/multiline_string',
+  'readability/namespace',
+  'readability/nolint',
+  'readability/nul',
+  'readability/streams',
+  'readability/todo',
+  'readability/utf8',
+  'runtime/arrays',
+  'runtime/casting',
+  'runtime/explicit',
+  'runtime/int',
+  'runtime/init',
+  'runtime/invalid_increment',
+  'runtime/member_string_references',
+  'runtime/memset',
+  'runtime/operator',
+  'runtime/printf',
+  'runtime/printf_format',
+  'runtime/references',
+  'runtime/string',
+  'runtime/threadsafe_fn',
+  'runtime/vlog',
+  'whitespace/blank_line',
+  'whitespace/braces',
+  'whitespace/comma',
+  'whitespace/comments',
+  'whitespace/empty_conditional_body',
+  'whitespace/empty_loop_body',
+  'whitespace/end_of_line',
+  'whitespace/ending_newline',
+  'whitespace/forcolon',
+  'whitespace/indent',
+  'whitespace/line_length',
+  'whitespace/newline',
+  'whitespace/operators',
+  'whitespace/parens',
+  'whitespace/semicolon',
+  'whitespace/tab',
+  'whitespace/todo'
+  ]
+
+# The default state of the category filter. This is overridden by the
+# --filter= flag. By default all errors are on, so only add here categories
+# that should be off by default (i.e., categories that must be enabled by
+# the --filter= flags). All entries here should start with a '-' or '+',
+# as in the --filter= flag.
+_DEFAULT_FILTERS = ['-build/include_alpha']
+
+# We used to check for high-bit characters, but after much discussion we
+# decided those were OK, as long as they were in UTF-8 and didn't represent
+# hard-coded international strings, which belong in a separate i18n file.
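+
+# Note on how the tables below are used: _CPP_HEADERS feeds the
+# include-order checks; an #include found in this set is classified as a
+# C++ system header when deciding whether a file's include blocks appear
+# in the required order.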
+ + +# C++ headers +_CPP_HEADERS = frozenset([ + # Legacy + 'algobase.h', + 'algo.h', + 'alloc.h', + 'builtinbuf.h', + 'bvector.h', + 'complex.h', + 'defalloc.h', + 'deque.h', + 'editbuf.h', + 'fstream.h', + 'function.h', + 'hash_map', + 'hash_map.h', + 'hash_set', + 'hash_set.h', + 'hashtable.h', + 'heap.h', + 'indstream.h', + 'iomanip.h', + 'iostream.h', + 'istream.h', + 'iterator.h', + 'list.h', + 'map.h', + 'multimap.h', + 'multiset.h', + 'ostream.h', + 'pair.h', + 'parsestream.h', + 'pfstream.h', + 'procbuf.h', + 'pthread_alloc', + 'pthread_alloc.h', + 'rope', + 'rope.h', + 'ropeimpl.h', + 'set.h', + 'slist', + 'slist.h', + 'stack.h', + 'stdiostream.h', + 'stl_alloc.h', + 'stl_relops.h', + 'streambuf.h', + 'stream.h', + 'strfile.h', + 'strstream.h', + 'tempbuf.h', + 'tree.h', + 'type_traits.h', + 'vector.h', + # 17.6.1.2 C++ library headers + 'algorithm', + 'array', + 'atomic', + 'bitset', + 'chrono', + 'codecvt', + 'complex', + 'condition_variable', + 'deque', + 'exception', + 'forward_list', + 'fstream', + 'functional', + 'future', + 'initializer_list', + 'iomanip', + 'ios', + 'iosfwd', + 'iostream', + 'istream', + 'iterator', + 'limits', + 'list', + 'locale', + 'map', + 'memory', + 'mutex', + 'new', + 'numeric', + 'ostream', + 'queue', + 'random', + 'ratio', + 'regex', + 'set', + 'sstream', + 'stack', + 'stdexcept', + 'streambuf', + 'string', + 'strstream', + 'system_error', + 'thread', + 'tuple', + 'typeindex', + 'typeinfo', + 'type_traits', + 'unordered_map', + 'unordered_set', + 'utility', + 'valarray', + 'vector', + # 17.6.1.2 C++ headers for C library facilities + 'cassert', + 'ccomplex', + 'cctype', + 'cerrno', + 'cfenv', + 'cfloat', + 'cinttypes', + 'ciso646', + 'climits', + 'clocale', + 'cmath', + 'csetjmp', + 'csignal', + 'cstdalign', + 'cstdarg', + 'cstdbool', + 'cstddef', + 'cstdint', + 'cstdio', + 'cstdlib', + 'cstring', + 'ctgmath', + 'ctime', + 'cuchar', + 'cwchar', + 'cwctype', + ]) + +# Assertion macros. These are defined in base/logging.h and +# testing/base/gunit.h. Note that the _M versions need to come first +# for substring matching to work. +_CHECK_MACROS = [ + 'DCHECK', 'CHECK', + 'EXPECT_TRUE_M', 'EXPECT_TRUE', + 'ASSERT_TRUE_M', 'ASSERT_TRUE', + 'EXPECT_FALSE_M', 'EXPECT_FALSE', + 'ASSERT_FALSE_M', 'ASSERT_FALSE', + ] + +# Replacement macros for CHECK/DCHECK/EXPECT_TRUE/EXPECT_FALSE +_CHECK_REPLACEMENT = dict([(m, {}) for m in _CHECK_MACROS]) + +for op, replacement in [('==', 'EQ'), ('!=', 'NE'), + ('>=', 'GE'), ('>', 'GT'), + ('<=', 'LE'), ('<', 'LT')]: + _CHECK_REPLACEMENT['DCHECK'][op] = 'DCHECK_%s' % replacement + _CHECK_REPLACEMENT['CHECK'][op] = 'CHECK_%s' % replacement + _CHECK_REPLACEMENT['EXPECT_TRUE'][op] = 'EXPECT_%s' % replacement + _CHECK_REPLACEMENT['ASSERT_TRUE'][op] = 'ASSERT_%s' % replacement + _CHECK_REPLACEMENT['EXPECT_TRUE_M'][op] = 'EXPECT_%s_M' % replacement + _CHECK_REPLACEMENT['ASSERT_TRUE_M'][op] = 'ASSERT_%s_M' % replacement + +for op, inv_replacement in [('==', 'NE'), ('!=', 'EQ'), + ('>=', 'LT'), ('>', 'LE'), + ('<=', 'GT'), ('<', 'GE')]: + _CHECK_REPLACEMENT['EXPECT_FALSE'][op] = 'EXPECT_%s' % inv_replacement + _CHECK_REPLACEMENT['ASSERT_FALSE'][op] = 'ASSERT_%s' % inv_replacement + _CHECK_REPLACEMENT['EXPECT_FALSE_M'][op] = 'EXPECT_%s_M' % inv_replacement + _CHECK_REPLACEMENT['ASSERT_FALSE_M'][op] = 'ASSERT_%s_M' % inv_replacement + +# Alternative tokens and their replacements. For full list, see section 2.5 +# Alternative tokens [lex.digraph] in the C++ standard. 
+# +# Digraphs (such as '%:') are not included here since it's a mess to +# match those on a word boundary. +_ALT_TOKEN_REPLACEMENT = { + 'and': '&&', + 'bitor': '|', + 'or': '||', + 'xor': '^', + 'compl': '~', + 'bitand': '&', + 'and_eq': '&=', + 'or_eq': '|=', + 'xor_eq': '^=', + 'not': '!', + 'not_eq': '!=' + } + +# Compile regular expression that matches all the above keywords. The "[ =()]" +# bit is meant to avoid matching these keywords outside of boolean expressions. +# +# False positives include C-style multi-line comments and multi-line strings +# but those have always been troublesome for cpplint. +_ALT_TOKEN_REPLACEMENT_PATTERN = re.compile( + r'[ =()](' + ('|'.join(_ALT_TOKEN_REPLACEMENT.keys())) + r')(?=[ (]|$)') + + +# These constants define types of headers for use with +# _IncludeState.CheckNextIncludeOrder(). +_C_SYS_HEADER = 1 +_CPP_SYS_HEADER = 2 +_LIKELY_MY_HEADER = 3 +_POSSIBLE_MY_HEADER = 4 +_OTHER_HEADER = 5 + +# These constants define the current inline assembly state +_NO_ASM = 0 # Outside of inline assembly block +_INSIDE_ASM = 1 # Inside inline assembly block +_END_ASM = 2 # Last line of inline assembly block +_BLOCK_ASM = 3 # The whole block is an inline assembly block + +# Match start of assembly blocks +_MATCH_ASM = re.compile(r'^\s*(?:asm|_asm|__asm|__asm__)' + r'(?:\s+(volatile|__volatile__))?' + r'\s*[{(]') + + +_regexp_compile_cache = {} + +# Finds occurrences of NOLINT or NOLINT(...). +_RE_SUPPRESSION = re.compile(r'\bNOLINT\b(\([^)]*\))?') + +# {str, set(int)}: a map from error categories to sets of linenumbers +# on which those errors are expected and should be suppressed. +_error_suppressions = {} + +# The root directory used for deriving header guard CPP variable. +# This is set by --root flag. +_root = None + +# The allowed line length of files. +# This is set by --linelength flag. +_line_length = 80 + +# The allowed extensions for file names +# This is set by --extensions flag. +_valid_extensions = set(['cc', 'h', 'cpp', 'cu', 'cuh']) + +def ParseNolintSuppressions(filename, raw_line, linenum, error): + """Updates the global list of error-suppressions. + + Parses any NOLINT comments on the current line, updating the global + error_suppressions store. Reports an error if the NOLINT comment + was malformed. + + Args: + filename: str, the name of the input file. + raw_line: str, the line of input text, with comments. + linenum: int, the number of the current line. + error: function, an error handler. + """ + # FIXME(adonovan): "NOLINT(" is misparsed as NOLINT(*). + matched = _RE_SUPPRESSION.search(raw_line) + if matched: + category = matched.group(1) + if category in (None, '(*)'): # => "suppress all" + _error_suppressions.setdefault(None, set()).add(linenum) + else: + if category.startswith('(') and category.endswith(')'): + category = category[1:-1] + if category in _ERROR_CATEGORIES: + _error_suppressions.setdefault(category, set()).add(linenum) + else: + error(filename, linenum, 'readability/nolint', 5, + 'Unknown NOLINT error category: %s' % category) + + +def ResetNolintSuppressions(): + "Resets the set of NOLINT suppressions to empty." + _error_suppressions.clear() + + +def IsErrorSuppressedByNolint(category, linenum): + """Returns true if the specified error category is suppressed on this line. + + Consults the global error_suppressions map populated by + ParseNolintSuppressions/ResetNolintSuppressions. + + Args: + category: str, the category of the error. + linenum: int, the current line number. 
+ Returns: + bool, True iff the error should be suppressed due to a NOLINT comment. + """ + return (linenum in _error_suppressions.get(category, set()) or + linenum in _error_suppressions.get(None, set())) + +def Match(pattern, s): + """Matches the string with the pattern, caching the compiled regexp.""" + # The regexp compilation caching is inlined in both Match and Search for + # performance reasons; factoring it out into a separate function turns out + # to be noticeably expensive. + if pattern not in _regexp_compile_cache: + _regexp_compile_cache[pattern] = sre_compile.compile(pattern) + return _regexp_compile_cache[pattern].match(s) + + +def ReplaceAll(pattern, rep, s): + """Replaces instances of pattern in a string with a replacement. + + The compiled regex is kept in a cache shared by Match and Search. + + Args: + pattern: regex pattern + rep: replacement text + s: search string + + Returns: + string with replacements made (or original string if no replacements) + """ + if pattern not in _regexp_compile_cache: + _regexp_compile_cache[pattern] = sre_compile.compile(pattern) + return _regexp_compile_cache[pattern].sub(rep, s) + + +def Search(pattern, s): + """Searches the string for the pattern, caching the compiled regexp.""" + if pattern not in _regexp_compile_cache: + _regexp_compile_cache[pattern] = sre_compile.compile(pattern) + return _regexp_compile_cache[pattern].search(s) + + +class _IncludeState(dict): + """Tracks line numbers for includes, and the order in which includes appear. + + As a dict, an _IncludeState object serves as a mapping between include + filename and line number on which that file was included. + + Call CheckNextIncludeOrder() once for each header in the file, passing + in the type constants defined above. Calls in an illegal order will + raise an _IncludeError with an appropriate error message. + + """ + # self._section will move monotonically through this set. If it ever + # needs to move backwards, CheckNextIncludeOrder will raise an error. + _INITIAL_SECTION = 0 + _MY_H_SECTION = 1 + _C_SECTION = 2 + _CPP_SECTION = 3 + _OTHER_H_SECTION = 4 + + _TYPE_NAMES = { + _C_SYS_HEADER: 'C system header', + _CPP_SYS_HEADER: 'C++ system header', + _LIKELY_MY_HEADER: 'header this file implements', + _POSSIBLE_MY_HEADER: 'header this file may implement', + _OTHER_HEADER: 'other header', + } + _SECTION_NAMES = { + _INITIAL_SECTION: "... nothing. (This can't be an error.)", + _MY_H_SECTION: 'a header this file implements', + _C_SECTION: 'C system header', + _CPP_SECTION: 'C++ system header', + _OTHER_H_SECTION: 'other header', + } + + def __init__(self): + dict.__init__(self) + self.ResetSection() + + def ResetSection(self): + # The name of the current section. + self._section = self._INITIAL_SECTION + # The path of last found header. + self._last_header = '' + + def SetLastHeader(self, header_path): + self._last_header = header_path + + def CanonicalizeAlphabeticalOrder(self, header_path): + """Returns a path canonicalized for alphabetical comparison. + + - replaces "-" with "_" so they both cmp the same. + - removes '-inl' since we don't require them to be after the main header. + - lowercase everything, just in case. + + Args: + header_path: Path to be canonicalized. + + Returns: + Canonicalized path. + """ + return header_path.replace('-inl.h', '.h').replace('-', '_').lower() + + def IsInAlphabeticalOrder(self, clean_lines, linenum, header_path): + """Check if a header is in alphabetical order with the previous header. 
+
+    Args:
+      clean_lines: A CleansedLines instance containing the file.
+      linenum: The number of the line to check.
+      header_path: Canonicalized header to be checked.
+
+    Returns:
+      Returns true if the header is in alphabetical order.
+    """
+    # If previous section is different from current section, _last_header will
+    # be reset to empty string, so it's always less than current header.
+    #
+    # If previous line was a blank line, assume that the headers are
+    # intentionally sorted the way they are.
+    if (self._last_header > header_path and
+        not Match(r'^\s*$', clean_lines.elided[linenum - 1])):
+      return False
+    return True
+
+  def CheckNextIncludeOrder(self, header_type):
+    """Returns a non-empty error message if the next header is out of order.
+
+    This function also updates the internal state to be ready to check
+    the next include.
+
+    Args:
+      header_type: One of the _XXX_HEADER constants defined above.
+
+    Returns:
+      The empty string if the header is in the right order, or an
+      error message describing what's wrong.
+
+    """
+    error_message = ('Found %s after %s' %
+                     (self._TYPE_NAMES[header_type],
+                      self._SECTION_NAMES[self._section]))
+
+    last_section = self._section
+
+    if header_type == _C_SYS_HEADER:
+      if self._section <= self._C_SECTION:
+        self._section = self._C_SECTION
+      else:
+        self._last_header = ''
+        return error_message
+    elif header_type == _CPP_SYS_HEADER:
+      if self._section <= self._CPP_SECTION:
+        self._section = self._CPP_SECTION
+      else:
+        self._last_header = ''
+        return error_message
+    elif header_type == _LIKELY_MY_HEADER:
+      if self._section <= self._MY_H_SECTION:
+        self._section = self._MY_H_SECTION
+      else:
+        self._section = self._OTHER_H_SECTION
+    elif header_type == _POSSIBLE_MY_HEADER:
+      if self._section <= self._MY_H_SECTION:
+        self._section = self._MY_H_SECTION
+      else:
+        # This will always be the fallback because we're not sure
+        # enough that the header is associated with this file.
+        self._section = self._OTHER_H_SECTION
+    else:
+      assert header_type == _OTHER_HEADER
+      self._section = self._OTHER_H_SECTION
+
+    if last_section != self._section:
+      self._last_header = ''
+
+    return ''
+
+
+class _CppLintState(object):
+  """Maintains module-wide state."""
+
+  def __init__(self):
+    self.verbose_level = 1  # global setting.
+    self.error_count = 0    # global count of reported errors
+    # filters to apply when emitting error messages
+    self.filters = _DEFAULT_FILTERS[:]
+    self.counting = 'total'  # In what way are we counting errors?
+    self.errors_by_category = {}  # string to int dict storing error counts
+
+    # output format:
+    # "emacs" - format that emacs can parse (default)
+    # "vs7" - format that Microsoft Visual Studio 7 can parse
+    self.output_format = 'emacs'
+
+  def SetOutputFormat(self, output_format):
+    """Sets the output format for errors."""
+    self.output_format = output_format
+
+  def SetVerboseLevel(self, level):
+    """Sets the module's verbosity, and returns the previous setting."""
+    last_verbose_level = self.verbose_level
+    self.verbose_level = level
+    return last_verbose_level
+
+  def SetCountingStyle(self, counting_style):
+    """Sets the module's counting options."""
+    self.counting = counting_style
+
+  def SetFilters(self, filters):
+    """Sets the error-message filters.
+
+    These filters are applied when deciding whether to emit a given
+    error message.
+
+    Args:
+      filters: A string of comma-separated filters (eg "+whitespace/indent").
+               Each filter should start with + or -; else we die.
+ + Raises: + ValueError: The comma-separated filters did not all start with '+' or '-'. + E.g. "-,+whitespace,-whitespace/indent,whitespace/badfilter" + """ + # Default filters always have less priority than the flag ones. + self.filters = _DEFAULT_FILTERS[:] + for filt in filters.split(','): + clean_filt = filt.strip() + if clean_filt: + self.filters.append(clean_filt) + for filt in self.filters: + if not (filt.startswith('+') or filt.startswith('-')): + raise ValueError('Every filter in --filters must start with + or -' + ' (%s does not)' % filt) + + def ResetErrorCounts(self): + """Sets the module's error statistic back to zero.""" + self.error_count = 0 + self.errors_by_category = {} + + def IncrementErrorCount(self, category): + """Bumps the module's error statistic.""" + self.error_count += 1 + if self.counting in ('toplevel', 'detailed'): + if self.counting != 'detailed': + category = category.split('/')[0] + if category not in self.errors_by_category: + self.errors_by_category[category] = 0 + self.errors_by_category[category] += 1 + + def PrintErrorCounts(self): + """Print a summary of errors by category, and the total.""" + for category, count in self.errors_by_category.iteritems(): + sys.stderr.write('Category \'%s\' errors found: %d\n' % + (category, count)) + sys.stderr.write('Total errors found: %d\n' % self.error_count) + +_cpplint_state = _CppLintState() + + +def _OutputFormat(): + """Gets the module's output format.""" + return _cpplint_state.output_format + + +def _SetOutputFormat(output_format): + """Sets the module's output format.""" + _cpplint_state.SetOutputFormat(output_format) + + +def _VerboseLevel(): + """Returns the module's verbosity setting.""" + return _cpplint_state.verbose_level + + +def _SetVerboseLevel(level): + """Sets the module's verbosity, and returns the previous setting.""" + return _cpplint_state.SetVerboseLevel(level) + + +def _SetCountingStyle(level): + """Sets the module's counting options.""" + _cpplint_state.SetCountingStyle(level) + + +def _Filters(): + """Returns the module's list of output filters, as a list.""" + return _cpplint_state.filters + + +def _SetFilters(filters): + """Sets the module's error-message filters. + + These filters are applied when deciding whether to emit a given + error message. + + Args: + filters: A string of comma-separated filters (eg "whitespace/indent"). + Each filter should start with + or -; else we die. + """ + _cpplint_state.SetFilters(filters) + + +class _FunctionState(object): + """Tracks current function name and the number of lines in its body.""" + + _NORMAL_TRIGGER = 250 # for --v=0, 500 for --v=1, etc. + _TEST_TRIGGER = 400 # about 50% more than _NORMAL_TRIGGER. + + def __init__(self): + self.in_a_function = False + self.lines_in_function = 0 + self.current_function = '' + + def Begin(self, function_name): + """Start analyzing function body. + + Args: + function_name: The name of the function being tracked. + """ + self.in_a_function = True + self.lines_in_function = 0 + self.current_function = function_name + + def Count(self): + """Count line in current function body.""" + if self.in_a_function: + self.lines_in_function += 1 + + def Check(self, error, filename, linenum): + """Report if too many lines in function body. + + Args: + error: The function to call with any errors found. + filename: The name of the current file. + linenum: The number of the line to check. 
+ """ + if Match(r'T(EST|est)', self.current_function): + base_trigger = self._TEST_TRIGGER + else: + base_trigger = self._NORMAL_TRIGGER + trigger = base_trigger * 2**_VerboseLevel() + + if self.lines_in_function > trigger: + error_level = int(math.log(self.lines_in_function / base_trigger, 2)) + # 50 => 0, 100 => 1, 200 => 2, 400 => 3, 800 => 4, 1600 => 5, ... + if error_level > 5: + error_level = 5 + error(filename, linenum, 'readability/fn_size', error_level, + 'Small and focused functions are preferred:' + ' %s has %d non-comment lines' + ' (error triggered by exceeding %d lines).' % ( + self.current_function, self.lines_in_function, trigger)) + + def End(self): + """Stop analyzing function body.""" + self.in_a_function = False + + +class _IncludeError(Exception): + """Indicates a problem with the include order in a file.""" + pass + + +class FileInfo: + """Provides utility functions for filenames. + + FileInfo provides easy access to the components of a file's path + relative to the project root. + """ + + def __init__(self, filename): + self._filename = filename + + def FullName(self): + """Make Windows paths like Unix.""" + return os.path.abspath(self._filename).replace('\\', '/') + + def RepositoryName(self): + """FullName after removing the local path to the repository. + + If we have a real absolute path name here we can try to do something smart: + detecting the root of the checkout and truncating /path/to/checkout from + the name so that we get header guards that don't include things like + "C:\Documents and Settings\..." or "/home/username/..." in them and thus + people on different computers who have checked the source out to different + locations won't see bogus errors. + """ + fullname = self.FullName() + + if os.path.exists(fullname): + project_dir = os.path.dirname(fullname) + + if os.path.exists(os.path.join(project_dir, ".svn")): + # If there's a .svn file in the current directory, we recursively look + # up the directory tree for the top of the SVN checkout + root_dir = project_dir + one_up_dir = os.path.dirname(root_dir) + while os.path.exists(os.path.join(one_up_dir, ".svn")): + root_dir = os.path.dirname(root_dir) + one_up_dir = os.path.dirname(one_up_dir) + + prefix = os.path.commonprefix([root_dir, project_dir]) + return fullname[len(prefix) + 1:] + + # Not SVN <= 1.6? Try to find a git, hg, or svn top level directory by + # searching up from the current path. + root_dir = os.path.dirname(fullname) + while (root_dir != os.path.dirname(root_dir) and + not os.path.exists(os.path.join(root_dir, ".git")) and + not os.path.exists(os.path.join(root_dir, ".hg")) and + not os.path.exists(os.path.join(root_dir, ".svn"))): + root_dir = os.path.dirname(root_dir) + + if (os.path.exists(os.path.join(root_dir, ".git")) or + os.path.exists(os.path.join(root_dir, ".hg")) or + os.path.exists(os.path.join(root_dir, ".svn"))): + prefix = os.path.commonprefix([root_dir, project_dir]) + return fullname[len(prefix) + 1:] + + # Don't know what to do; header guard warnings may be wrong... + return fullname + + def Split(self): + """Splits the file into the directory, basename, and extension. + + For 'chrome/browser/browser.cc', Split() would + return ('chrome/browser', 'browser', '.cc') + + Returns: + A tuple of (directory, basename, extension). 
+ """ + + googlename = self.RepositoryName() + project, rest = os.path.split(googlename) + return (project,) + os.path.splitext(rest) + + def BaseName(self): + """File base name - text after the final slash, before the final period.""" + return self.Split()[1] + + def Extension(self): + """File extension - text following the final period.""" + return self.Split()[2] + + def NoExtension(self): + """File has no source file extension.""" + return '/'.join(self.Split()[0:2]) + + def IsSource(self): + """File has a source file extension.""" + return self.Extension()[1:] in ('c', 'cc', 'cpp', 'cxx') + + +def _ShouldPrintError(category, confidence, linenum): + """If confidence >= verbose, category passes filter and is not suppressed.""" + + # There are three ways we might decide not to print an error message: + # a "NOLINT(category)" comment appears in the source, + # the verbosity level isn't high enough, or the filters filter it out. + if IsErrorSuppressedByNolint(category, linenum): + return False + if confidence < _cpplint_state.verbose_level: + return False + + is_filtered = False + for one_filter in _Filters(): + if one_filter.startswith('-'): + if category.startswith(one_filter[1:]): + is_filtered = True + elif one_filter.startswith('+'): + if category.startswith(one_filter[1:]): + is_filtered = False + else: + assert False # should have been checked for in SetFilter. + if is_filtered: + return False + + return True + + +def Error(filename, linenum, category, confidence, message): + """Logs the fact we've found a lint error. + + We log where the error was found, and also our confidence in the error, + that is, how certain we are this is a legitimate style regression, and + not a misidentification or a use that's sometimes justified. + + False positives can be suppressed by the use of + "cpplint(category)" comments on the offending line. These are + parsed into _error_suppressions. + + Args: + filename: The name of the file containing the error. + linenum: The number of the line containing the error. + category: A string used to describe the "category" this bug + falls under: "whitespace", say, or "runtime". Categories + may have a hierarchy separated by slashes: "whitespace/indent". + confidence: A number from 1-5 representing a confidence score for + the error, with 5 meaning that we are certain of the problem, + and 1 meaning that it could be a legitimate construct. + message: The error message. + """ + if _ShouldPrintError(category, confidence, linenum): + _cpplint_state.IncrementErrorCount(category) + if _cpplint_state.output_format == 'vs7': + sys.stderr.write('%s(%s): %s [%s] [%d]\n' % ( + filename, linenum, message, category, confidence)) + elif _cpplint_state.output_format == 'eclipse': + sys.stderr.write('%s:%s: warning: %s [%s] [%d]\n' % ( + filename, linenum, message, category, confidence)) + else: + sys.stderr.write('%s:%s: %s [%s] [%d]\n' % ( + filename, linenum, message, category, confidence)) + + +# Matches standard C++ escape sequences per 2.13.2.3 of the C++ standard. +_RE_PATTERN_CLEANSE_LINE_ESCAPES = re.compile( + r'\\([abfnrtv?"\\\']|\d+|x[0-9a-fA-F]+)') +# Matches strings. Escape codes should already be removed by ESCAPES. +_RE_PATTERN_CLEANSE_LINE_DOUBLE_QUOTES = re.compile(r'"[^"]*"') +# Matches characters. Escape codes should already be removed by ESCAPES. +_RE_PATTERN_CLEANSE_LINE_SINGLE_QUOTES = re.compile(r"'.'") +# Matches multi-line C++ comments. 
+ # This RE is a little bit more complicated than one might expect, because we
+ # also have to take care of stripping the right amount of adjacent
+ # whitespace, so we can handle comments inside statements better.
+ # The current rule is: We only clear spaces from both sides when we're at the
+ # end of the line. Otherwise, we try to remove spaces from the right side,
+ # if that doesn't work we try the left side, but only if there's a non-word
+ # character on the right.
+ _RE_PATTERN_CLEANSE_LINE_C_COMMENTS = re.compile(
+ r"""(\s*/\*.*\*/\s*$|
+ /\*.*\*/\s+|
+ \s+/\*.*\*/(?=\W)|
+ /\*.*\*/)""", re.VERBOSE)
+
+
+ def IsCppString(line):
+ """Does line terminate such that the next symbol is inside a string constant?
+
+ This function does not consider single-line nor multi-line comments.
+
+ Args:
+ line: a partial line of code, from position 0 to n.
+
+ Returns:
+ True, if next character appended to 'line' is inside a
+ string constant.
+ """
+
+ line = line.replace(r'\\', 'XX') # after this, \\" does not match to \"
+ return ((line.count('"') - line.count(r'\"') - line.count("'\"'")) & 1) == 1
+
+
+ def CleanseRawStrings(raw_lines):
+ """Removes C++11 raw strings from lines.
+
+ Before:
+ static const char kData[] = R"(
+ multi-line string
+ )";
+
+ After:
+ static const char kData[] = ""
+ (replaced by blank line)
+ "";
+
+ Args:
+ raw_lines: list of raw lines.
+
+ Returns:
+ list of lines with C++11 raw strings replaced by empty strings.
+ """
+
+ delimiter = None
+ lines_without_raw_strings = []
+ for line in raw_lines:
+ if delimiter:
+ # Inside a raw string, look for the end
+ end = line.find(delimiter)
+ if end >= 0:
+ # Found the end of the string, match leading space for this
+ # line and resume copying the original lines, and also insert
+ # a "" on the last line.
+ leading_space = Match(r'^(\s*)\S', line)
+ line = leading_space.group(1) + '""' + line[end + len(delimiter):]
+ delimiter = None
+ else:
+ # Haven't found the end yet, append a blank line.
+ line = ''
+
+ else:
+ # Look for beginning of a raw string.
+ # See 2.14.15 [lex.string] for syntax.
+ matched = Match(r'^(.*)\b(?:R|u8R|uR|UR|LR)"([^\s\\()]*)\((.*)$', line)
+ if matched:
+ delimiter = ')' + matched.group(2) + '"'
+
+ end = matched.group(3).find(delimiter)
+ if end >= 0:
+ # Raw string ended on same line
+ line = (matched.group(1) + '""' +
+ matched.group(3)[end + len(delimiter):])
+ delimiter = None
+ else:
+ # Start of a multi-line raw string
+ line = matched.group(1) + '""'
+
+ lines_without_raw_strings.append(line)
+
+ # TODO(unknown): if delimiter is not None here, we might want to
+ # emit a warning for unterminated string.
+ return lines_without_raw_strings
+
+
+ def FindNextMultiLineCommentStart(lines, lineix):
+ """Find the beginning marker for a multiline comment."""
+ while lineix < len(lines):
+ if lines[lineix].strip().startswith('/*'):
+ # Only return this marker if the comment goes beyond this line
+ if lines[lineix].strip().find('*/', 2) < 0:
+ return lineix
+ lineix += 1
+ return len(lines)
+
+
+ def FindNextMultiLineCommentEnd(lines, lineix):
+ """We are inside a comment, find the end marker."""
+ while lineix < len(lines):
+ if lines[lineix].strip().endswith('*/'):
+ return lineix
+ lineix += 1
+ return len(lines)
+
+
+ def RemoveMultiLineCommentsFromRange(lines, begin, end):
+ """Clears a range of lines for multi-line comments."""
+ # Having // dummy comments makes the lines non-empty, so we will not get
+ # unnecessary blank line warnings later in the code.
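+ # Example with hypothetical input: the lines
+ # ['int a;', '/* start', 'middle', 'end */', 'int b;']
+ # would come out of RemoveMultiLineComments below as
+ # ['int a;', '// dummy', '// dummy', '// dummy', 'int b;'].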
+ for i in range(begin, end):
+ lines[i] = '// dummy'
+
+
+ def RemoveMultiLineComments(filename, lines, error):
+ """Removes multiline (c-style) comments from lines."""
+ lineix = 0
+ while lineix < len(lines):
+ lineix_begin = FindNextMultiLineCommentStart(lines, lineix)
+ if lineix_begin >= len(lines):
+ return
+ lineix_end = FindNextMultiLineCommentEnd(lines, lineix_begin)
+ if lineix_end >= len(lines):
+ error(filename, lineix_begin + 1, 'readability/multiline_comment', 5,
+ 'Could not find end of multi-line comment')
+ return
+ RemoveMultiLineCommentsFromRange(lines, lineix_begin, lineix_end + 1)
+ lineix = lineix_end + 1
+
+
+ def CleanseComments(line):
+ """Removes //-comments and single-line C-style /* */ comments.
+
+ Args:
+ line: A line of C++ source.
+
+ Returns:
+ The line with single-line comments removed.
+ """
+ commentpos = line.find('//')
+ if commentpos != -1 and not IsCppString(line[:commentpos]):
+ line = line[:commentpos].rstrip()
+ # get rid of /* ... */
+ return _RE_PATTERN_CLEANSE_LINE_C_COMMENTS.sub('', line)
+
+
+ class CleansedLines(object):
+ """Holds 3 copies of all lines with different preprocessing applied to them.
+
+ 1) elided member contains lines without strings and comments,
+ 2) lines member contains lines without comments, and
+ 3) raw_lines member contains all the lines without processing.
+ All these three members are of <type 'list'>, and of the same length.
+ """
+
+ def __init__(self, lines):
+ self.elided = []
+ self.lines = []
+ self.raw_lines = lines
+ self.num_lines = len(lines)
+ self.lines_without_raw_strings = CleanseRawStrings(lines)
+ for linenum in range(len(self.lines_without_raw_strings)):
+ self.lines.append(CleanseComments(
+ self.lines_without_raw_strings[linenum]))
+ elided = self._CollapseStrings(self.lines_without_raw_strings[linenum])
+ self.elided.append(CleanseComments(elided))
+
+ def NumLines(self):
+ """Returns the number of lines represented."""
+ return self.num_lines
+
+ @staticmethod
+ def _CollapseStrings(elided):
+ """Collapses strings and chars on a line to simple "" or '' blocks.
+
+ We nix strings first so we're not fooled by text like '"http://"'
+
+ Args:
+ elided: The line being processed.
+
+ Returns:
+ The line with collapsed strings.
+ """
+ if not _RE_PATTERN_INCLUDE.match(elided):
+ # Remove escaped characters first to make quote/single quote collapsing
+ # basic. Things that look like escaped characters shouldn't occur
+ # outside of strings and chars.
+ elided = _RE_PATTERN_CLEANSE_LINE_ESCAPES.sub('', elided)
+ elided = _RE_PATTERN_CLEANSE_LINE_SINGLE_QUOTES.sub("''", elided)
+ elided = _RE_PATTERN_CLEANSE_LINE_DOUBLE_QUOTES.sub('""', elided)
+ return elided
+
+
+ def FindEndOfExpressionInLine(line, startpos, depth, startchar, endchar):
+ """Find the position just after the matching endchar.
+
+ Args:
+ line: a CleansedLines line.
+ startpos: start searching at this position.
+ depth: nesting level at startpos.
+ startchar: expression opening character.
+ endchar: expression closing character.
+
+ Returns:
+ On finding matching endchar: (index just after matching endchar, 0)
+ Otherwise: (-1, new depth at end of this line)
+ """
+ for i in xrange(startpos, len(line)):
+ if line[i] == startchar:
+ depth += 1
+ elif line[i] == endchar:
+ depth -= 1
+ if depth == 0:
+ return (i + 1, 0)
+ return (-1, depth)
+
+
+ def CloseExpression(clean_lines, linenum, pos):
+ """If input points to ( or { or [ or <, finds the position that closes it.
+ + If lines[linenum][pos] points to a '(' or '{' or '[' or '<', finds the + linenum/pos that correspond to the closing of the expression. + + Args: + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + pos: A position on the line. + + Returns: + A tuple (line, linenum, pos) pointer *past* the closing brace, or + (line, len(lines), -1) if we never find a close. Note we ignore + strings and comments when matching; and the line we return is the + 'cleansed' line at linenum. + """ + + line = clean_lines.elided[linenum] + startchar = line[pos] + if startchar not in '({[<': + return (line, clean_lines.NumLines(), -1) + if startchar == '(': endchar = ')' + if startchar == '[': endchar = ']' + if startchar == '{': endchar = '}' + if startchar == '<': endchar = '>' + + # Check first line + (end_pos, num_open) = FindEndOfExpressionInLine( + line, pos, 0, startchar, endchar) + if end_pos > -1: + return (line, linenum, end_pos) + + # Continue scanning forward + while linenum < clean_lines.NumLines() - 1: + linenum += 1 + line = clean_lines.elided[linenum] + (end_pos, num_open) = FindEndOfExpressionInLine( + line, 0, num_open, startchar, endchar) + if end_pos > -1: + return (line, linenum, end_pos) + + # Did not find endchar before end of file, give up + return (line, clean_lines.NumLines(), -1) + + +def FindStartOfExpressionInLine(line, endpos, depth, startchar, endchar): + """Find position at the matching startchar. + + This is almost the reverse of FindEndOfExpressionInLine, but note + that the input position and returned position differs by 1. + + Args: + line: a CleansedLines line. + endpos: start searching at this position. + depth: nesting level at endpos. + startchar: expression opening character. + endchar: expression closing character. + + Returns: + On finding matching startchar: (index at matching startchar, 0) + Otherwise: (-1, new depth at beginning of this line) + """ + for i in xrange(endpos, -1, -1): + if line[i] == endchar: + depth += 1 + elif line[i] == startchar: + depth -= 1 + if depth == 0: + return (i, 0) + return (-1, depth) + + +def ReverseCloseExpression(clean_lines, linenum, pos): + """If input points to ) or } or ] or >, finds the position that opens it. + + If lines[linenum][pos] points to a ')' or '}' or ']' or '>', finds the + linenum/pos that correspond to the opening of the expression. + + Args: + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + pos: A position on the line. + + Returns: + A tuple (line, linenum, pos) pointer *at* the opening brace, or + (line, 0, -1) if we never find the matching opening brace. Note + we ignore strings and comments when matching; and the line we + return is the 'cleansed' line at linenum. 
+ """ + line = clean_lines.elided[linenum] + endchar = line[pos] + if endchar not in ')}]>': + return (line, 0, -1) + if endchar == ')': startchar = '(' + if endchar == ']': startchar = '[' + if endchar == '}': startchar = '{' + if endchar == '>': startchar = '<' + + # Check last line + (start_pos, num_open) = FindStartOfExpressionInLine( + line, pos, 0, startchar, endchar) + if start_pos > -1: + return (line, linenum, start_pos) + + # Continue scanning backward + while linenum > 0: + linenum -= 1 + line = clean_lines.elided[linenum] + (start_pos, num_open) = FindStartOfExpressionInLine( + line, len(line) - 1, num_open, startchar, endchar) + if start_pos > -1: + return (line, linenum, start_pos) + + # Did not find startchar before beginning of file, give up + return (line, 0, -1) + + +def CheckForCopyright(filename, lines, error): + """Logs an error if no Copyright message appears at the top of the file.""" + + # We'll say it should occur by line 10. Don't forget there's a + # dummy line at the front. + for line in xrange(1, min(len(lines), 11)): + if re.search(r'Copyright', lines[line], re.I): break + else: # means no copyright line was found + error(filename, 0, 'legal/copyright', 5, + 'No copyright message found. ' + 'You should have a line: "Copyright [year] "') + + +def GetHeaderGuardCPPVariable(filename): + """Returns the CPP variable that should be used as a header guard. + + Args: + filename: The name of a C++ header file. + + Returns: + The CPP variable that should be used as a header guard in the + named file. + + """ + + # Restores original filename in case that cpplint is invoked from Emacs's + # flymake. + filename = re.sub(r'_flymake\.h$', '.h', filename) + filename = re.sub(r'/\.flymake/([^/]*)$', r'/\1', filename) + + fileinfo = FileInfo(filename) + file_path_from_root = fileinfo.RepositoryName() + if _root: + file_path_from_root = re.sub('^' + _root + os.sep, '', file_path_from_root) + return re.sub(r'[-./\s]', '_', file_path_from_root).upper() + '_' + + +def CheckForHeaderGuard(filename, lines, error): + """Checks that the file contains a header guard. + + Logs an error if no #ifndef header guard is present. For other + headers, checks that the full pathname is used. + + Args: + filename: The name of the C++ header file. + lines: An array of strings, each representing a line of the file. + error: The function to call with any errors found. + """ + + cppvar = GetHeaderGuardCPPVariable(filename) + + ifndef = None + ifndef_linenum = 0 + define = None + endif = None + endif_linenum = 0 + for linenum, line in enumerate(lines): + # Already been well guarded, no need for further checking. + if line.strip() == "#pragma once": + return + linesplit = line.split() + if len(linesplit) >= 2: + # find the first occurrence of #ifndef and #define, save arg + if not ifndef and linesplit[0] == '#ifndef': + # set ifndef to the header guard presented on the #ifndef line. 
+ ifndef = linesplit[1] + ifndef_linenum = linenum + if not define and linesplit[0] == '#define': + define = linesplit[1] + # find the last occurrence of #endif, save entire line + if line.startswith('#endif'): + endif = line + endif_linenum = linenum + + if not ifndef: + error(filename, 0, 'build/header_guard', 5, + 'No #ifndef header guard found, suggested CPP variable is: %s' % + cppvar) + return + + if not define: + error(filename, 0, 'build/header_guard', 5, + 'No #define header guard found, suggested CPP variable is: %s' % + cppvar) + return + + # The guard should be PATH_FILE_H_, but we also allow PATH_FILE_H__ + # for backward compatibility. + if ifndef != cppvar: + error_level = 0 + if ifndef != cppvar + '_': + error_level = 5 + + ParseNolintSuppressions(filename, lines[ifndef_linenum], ifndef_linenum, + error) + error(filename, ifndef_linenum, 'build/header_guard', error_level, + '#ifndef header guard has wrong style, please use: %s' % cppvar) + + if define != ifndef: + error(filename, 0, 'build/header_guard', 5, + '#ifndef and #define don\'t match, suggested CPP variable is: %s' % + cppvar) + return + + if endif != ('#endif // %s' % cppvar): + error_level = 0 + if endif != ('#endif // %s' % (cppvar + '_')): + error_level = 5 + + ParseNolintSuppressions(filename, lines[endif_linenum], endif_linenum, + error) + error(filename, endif_linenum, 'build/header_guard', error_level, + '#endif line should be "#endif // %s"' % cppvar) + + +def CheckForBadCharacters(filename, lines, error): + """Logs an error for each line containing bad characters. + + Two kinds of bad characters: + + 1. Unicode replacement characters: These indicate that either the file + contained invalid UTF-8 (likely) or Unicode replacement characters (which + it shouldn't). Note that it's possible for this to throw off line + numbering if the invalid UTF-8 occurred adjacent to a newline. + + 2. NUL bytes. These are problematic for some tools. + + Args: + filename: The name of the current file. + lines: An array of strings, each representing a line of the file. + error: The function to call with any errors found. + """ + for linenum, line in enumerate(lines): + if u'\ufffd' in line: + error(filename, linenum, 'readability/utf8', 5, + 'Line contains invalid UTF-8 (or Unicode replacement character).') + if '\0' in line: + error(filename, linenum, 'readability/nul', 5, 'Line contains NUL byte.') + + +def CheckForNewlineAtEOF(filename, lines, error): + """Logs an error if there is no newline char at the end of the file. + + Args: + filename: The name of the current file. + lines: An array of strings, each representing a line of the file. + error: The function to call with any errors found. + """ + + # The array lines() was created by adding two newlines to the + # original file (go figure), then splitting on \n. + # To verify that the file ends in \n, we just have to make sure the + # last-but-two element of lines() exists and is empty. + if len(lines) < 3 or lines[-2]: + error(filename, len(lines) - 2, 'whitespace/ending_newline', 5, + 'Could not find a newline character at the end of the file.') + + +def CheckForMultilineCommentsAndStrings(filename, clean_lines, linenum, error): + """Logs an error if we see /* ... */ or "..." that extend past one line. + + /* ... */ comments are legit inside macros, for one line. + Otherwise, we prefer // comments, so it's ok to warn about the + other. 
Likewise, it's ok for strings to extend across multiple + lines, as long as a line continuation character (backslash) + terminates each line. Although not currently prohibited by the C++ + style guide, it's ugly and unnecessary. We don't do well with either + in this lint program, so we warn about both. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + line = clean_lines.elided[linenum] + + # Remove all \\ (escaped backslashes) from the line. They are OK, and the + # second (escaped) slash may trigger later \" detection erroneously. + line = line.replace('\\\\', '') + + if line.count('/*') > line.count('*/'): + error(filename, linenum, 'readability/multiline_comment', 5, + 'Complex multi-line /*...*/-style comment found. ' + 'Lint may give bogus warnings. ' + 'Consider replacing these with //-style comments, ' + 'with #if 0...#endif, ' + 'or with more clearly structured multi-line comments.') + + if (line.count('"') - line.count('\\"')) % 2: + error(filename, linenum, 'readability/multiline_string', 5, + 'Multi-line string ("...") found. This lint script doesn\'t ' + 'do well with such strings, and may give bogus warnings. ' + 'Use C++11 raw strings or concatenation instead.') + + +threading_list = ( + ('asctime(', 'asctime_r('), + ('ctime(', 'ctime_r('), + ('getgrgid(', 'getgrgid_r('), + ('getgrnam(', 'getgrnam_r('), + ('getlogin(', 'getlogin_r('), + ('getpwnam(', 'getpwnam_r('), + ('getpwuid(', 'getpwuid_r('), + ('gmtime(', 'gmtime_r('), + ('localtime(', 'localtime_r('), + ('rand(', 'rand_r('), + ('strtok(', 'strtok_r('), + ('ttyname(', 'ttyname_r('), + ) + + +def CheckPosixThreading(filename, clean_lines, linenum, error): + """Checks for calls to thread-unsafe functions. + + Much code has been originally written without consideration of + multi-threading. Also, engineers are relying on their old experience; + they have learned posix before threading extensions were added. These + tests guide the engineers to use thread-safe functions (when using + posix directly). + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + line = clean_lines.elided[linenum] + for single_thread_function, multithread_safe_function in threading_list: + ix = line.find(single_thread_function) + # Comparisons made explicit for clarity -- pylint: disable=g-explicit-bool-comparison + if ix >= 0 and (ix == 0 or (not line[ix - 1].isalnum() and + line[ix - 1] not in ('_', '.', '>'))): + error(filename, linenum, 'runtime/threadsafe_fn', 2, + 'Consider using ' + multithread_safe_function + + '...) instead of ' + single_thread_function + + '...) for improved thread safety.') + + +def CheckVlogArguments(filename, clean_lines, linenum, error): + """Checks that VLOG() is only used for defining a logging level. + + For example, VLOG(2) is correct. VLOG(INFO), VLOG(WARNING), VLOG(ERROR), and + VLOG(FATAL) are not. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. 
+ """ + line = clean_lines.elided[linenum] + if Search(r'\bVLOG\((INFO|ERROR|WARNING|DFATAL|FATAL)\)', line): + error(filename, linenum, 'runtime/vlog', 5, + 'VLOG() should be used with numeric verbosity level. ' + 'Use LOG() if you want symbolic severity levels.') + + +# Matches invalid increment: *count++, which moves pointer instead of +# incrementing a value. +_RE_PATTERN_INVALID_INCREMENT = re.compile( + r'^\s*\*\w+(\+\+|--);') + + +def CheckInvalidIncrement(filename, clean_lines, linenum, error): + """Checks for invalid increment *count++. + + For example following function: + void increment_counter(int* count) { + *count++; + } + is invalid, because it effectively does count++, moving pointer, and should + be replaced with ++*count, (*count)++ or *count += 1. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + line = clean_lines.elided[linenum] + if _RE_PATTERN_INVALID_INCREMENT.match(line): + error(filename, linenum, 'runtime/invalid_increment', 5, + 'Changing pointer instead of value (or unused value of operator*).') + + +class _BlockInfo(object): + """Stores information about a generic block of code.""" + + def __init__(self, seen_open_brace): + self.seen_open_brace = seen_open_brace + self.open_parentheses = 0 + self.inline_asm = _NO_ASM + + def CheckBegin(self, filename, clean_lines, linenum, error): + """Run checks that applies to text up to the opening brace. + + This is mostly for checking the text after the class identifier + and the "{", usually where the base class is specified. For other + blocks, there isn't much to check, so we always pass. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + pass + + def CheckEnd(self, filename, clean_lines, linenum, error): + """Run checks that applies to text after the closing brace. + + This is mostly used for checking end of namespace comments. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + pass + + +class _ClassInfo(_BlockInfo): + """Stores information about a class.""" + + def __init__(self, name, class_or_struct, clean_lines, linenum): + _BlockInfo.__init__(self, False) + self.name = name + self.starting_linenum = linenum + self.is_derived = False + if class_or_struct == 'struct': + self.access = 'public' + self.is_struct = True + else: + self.access = 'private' + self.is_struct = False + + # Remember initial indentation level for this class. Using raw_lines here + # instead of elided to account for leading comments. + initial_indent = Match(r'^( *)\S', clean_lines.raw_lines[linenum]) + if initial_indent: + self.class_indent = len(initial_indent.group(1)) + else: + self.class_indent = 0 + + # Try to find the end of the class. This will be confused by things like: + # class A { + # } *x = { ... + # + # But it's still good enough for CheckSectionSpacing. 
+ self.last_line = 0 + depth = 0 + for i in range(linenum, clean_lines.NumLines()): + line = clean_lines.elided[i] + depth += line.count('{') - line.count('}') + if not depth: + self.last_line = i + break + + def CheckBegin(self, filename, clean_lines, linenum, error): + # Look for a bare ':' + if Search('(^|[^:]):($|[^:])', clean_lines.elided[linenum]): + self.is_derived = True + + def CheckEnd(self, filename, clean_lines, linenum, error): + # Check that closing brace is aligned with beginning of the class. + # Only do this if the closing brace is indented by only whitespaces. + # This means we will not check single-line class definitions. + indent = Match(r'^( *)\}', clean_lines.elided[linenum]) + if indent and len(indent.group(1)) != self.class_indent: + if self.is_struct: + parent = 'struct ' + self.name + else: + parent = 'class ' + self.name + error(filename, linenum, 'whitespace/indent', 3, + 'Closing brace should be aligned with beginning of %s' % parent) + + +class _NamespaceInfo(_BlockInfo): + """Stores information about a namespace.""" + + def __init__(self, name, linenum): + _BlockInfo.__init__(self, False) + self.name = name or '' + self.starting_linenum = linenum + + def CheckEnd(self, filename, clean_lines, linenum, error): + """Check end of namespace comments.""" + line = clean_lines.raw_lines[linenum] + + # Check how many lines is enclosed in this namespace. Don't issue + # warning for missing namespace comments if there aren't enough + # lines. However, do apply checks if there is already an end of + # namespace comment and it's incorrect. + # + # TODO(unknown): We always want to check end of namespace comments + # if a namespace is large, but sometimes we also want to apply the + # check if a short namespace contained nontrivial things (something + # other than forward declarations). There is currently no logic on + # deciding what these nontrivial things are, so this check is + # triggered by namespace size only, which works most of the time. + if (linenum - self.starting_linenum < 10 + and not Match(r'};*\s*(//|/\*).*\bnamespace\b', line)): + return + + # Look for matching comment at end of namespace. + # + # Note that we accept C style "/* */" comments for terminating + # namespaces, so that code that terminate namespaces inside + # preprocessor macros can be cpplint clean. + # + # We also accept stuff like "// end of namespace ." with the + # period at the end. + # + # Besides these, we don't accept anything else, otherwise we might + # get false negatives when existing comment is a substring of the + # expected namespace. 
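+ # For a hypothetical "namespace foo", the checks below accept e.g.
+ # } // namespace foo
+ # } /* namespace foo */
+ # } // end of namespace foo.
+ # but flag a bare "} // namespace", since the name is missing.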
+ if self.name: + # Named namespace + if not Match((r'};*\s*(//|/\*).*\bnamespace\s+' + re.escape(self.name) + + r'[\*/\.\\\s]*$'), + line): + error(filename, linenum, 'readability/namespace', 5, + 'Namespace should be terminated with "// namespace %s"' % + self.name) + else: + # Anonymous namespace + if not Match(r'};*\s*(//|/\*).*\bnamespace[\*/\.\\\s]*$', line): + error(filename, linenum, 'readability/namespace', 5, + 'Namespace should be terminated with "// namespace"') + + +class _PreprocessorInfo(object): + """Stores checkpoints of nesting stacks when #if/#else is seen.""" + + def __init__(self, stack_before_if): + # The entire nesting stack before #if + self.stack_before_if = stack_before_if + + # The entire nesting stack up to #else + self.stack_before_else = [] + + # Whether we have already seen #else or #elif + self.seen_else = False + + +class _NestingState(object): + """Holds states related to parsing braces.""" + + def __init__(self): + # Stack for tracking all braces. An object is pushed whenever we + # see a "{", and popped when we see a "}". Only 3 types of + # objects are possible: + # - _ClassInfo: a class or struct. + # - _NamespaceInfo: a namespace. + # - _BlockInfo: some other type of block. + self.stack = [] + + # Stack of _PreprocessorInfo objects. + self.pp_stack = [] + + def SeenOpenBrace(self): + """Check if we have seen the opening brace for the innermost block. + + Returns: + True if we have seen the opening brace, False if the innermost + block is still expecting an opening brace. + """ + return (not self.stack) or self.stack[-1].seen_open_brace + + def InNamespaceBody(self): + """Check if we are currently one level inside a namespace body. + + Returns: + True if top of the stack is a namespace block, False otherwise. + """ + return self.stack and isinstance(self.stack[-1], _NamespaceInfo) + + def UpdatePreprocessor(self, line): + """Update preprocessor stack. + + We need to handle preprocessors due to classes like this: + #ifdef SWIG + struct ResultDetailsPageElementExtensionPoint { + #else + struct ResultDetailsPageElementExtensionPoint : public Extension { + #endif + + We make the following assumptions (good enough for most files): + - Preprocessor condition evaluates to true from #if up to first + #else/#elif/#endif. + + - Preprocessor condition evaluates to false from #else/#elif up + to #endif. We still perform lint checks on these lines, but + these do not affect nesting stack. + + Args: + line: current line to check. + """ + if Match(r'^\s*#\s*(if|ifdef|ifndef)\b', line): + # Beginning of #if block, save the nesting stack here. The saved + # stack will allow us to restore the parsing state in the #else case. + self.pp_stack.append(_PreprocessorInfo(copy.deepcopy(self.stack))) + elif Match(r'^\s*#\s*(else|elif)\b', line): + # Beginning of #else block + if self.pp_stack: + if not self.pp_stack[-1].seen_else: + # This is the first #else or #elif block. Remember the + # whole nesting stack up to this point. This is what we + # keep after the #endif. + self.pp_stack[-1].seen_else = True + self.pp_stack[-1].stack_before_else = copy.deepcopy(self.stack) + + # Restore the stack to how it was before the #if + self.stack = copy.deepcopy(self.pp_stack[-1].stack_before_if) + else: + # TODO(unknown): unexpected #else, issue warning? + pass + elif Match(r'^\s*#\s*endif\b', line): + # End of #if or #else blocks. 
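+ # Sketch with a hypothetical input:
+ # #if A
+ # class X { <- parsed; the stack was saved when #if was seen
+ # #else <- stack snapshotted, then reset to the pre-#if state
+ # class Y { <- parsed against the restored stack
+ # #endif <- reinstates the snapshot taken at #else (handled below)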
+ if self.pp_stack:
+ # If we saw an #else, we will need to restore the nesting
+ # stack to its former state before the #else, otherwise we
+ # will just continue from where we left off.
+ if self.pp_stack[-1].seen_else:
+ # Here we can just use a shallow copy since we are the last
+ # reference to it.
+ self.stack = self.pp_stack[-1].stack_before_else
+ # Drop the corresponding #if
+ self.pp_stack.pop()
+ else:
+ # TODO(unknown): unexpected #endif, issue warning?
+ pass
+
+ def Update(self, filename, clean_lines, linenum, error):
+ """Update nesting state with current line.
+
+ Args:
+ filename: The name of the current file.
+ clean_lines: A CleansedLines instance containing the file.
+ linenum: The number of the line to check.
+ error: The function to call with any errors found.
+ """
+ line = clean_lines.elided[linenum]
+
+ # Update pp_stack first
+ self.UpdatePreprocessor(line)
+
+ # Count parentheses. This is to avoid adding struct arguments to
+ # the nesting stack.
+ if self.stack:
+ inner_block = self.stack[-1]
+ depth_change = line.count('(') - line.count(')')
+ inner_block.open_parentheses += depth_change
+
+ # Also check if we are starting or ending an inline assembly block.
+ if inner_block.inline_asm in (_NO_ASM, _END_ASM):
+ if (depth_change != 0 and
+ inner_block.open_parentheses == 1 and
+ _MATCH_ASM.match(line)):
+ # Enter assembly block
+ inner_block.inline_asm = _INSIDE_ASM
+ else:
+ # Not entering assembly block. If previous line was _END_ASM,
+ # we will now shift to _NO_ASM state.
+ inner_block.inline_asm = _NO_ASM
+ elif (inner_block.inline_asm == _INSIDE_ASM and
+ inner_block.open_parentheses == 0):
+ # Exit assembly block
+ inner_block.inline_asm = _END_ASM
+
+ # Consume namespace declaration at the beginning of the line. Do
+ # this in a loop so that we catch same line declarations like this:
+ # namespace proto2 { namespace bridge { class MessageSet; } }
+ while True:
+ # Match start of namespace. The "\b\s*" below catches namespace
+ # declarations even if it weren't followed by a whitespace, this
+ # is so that we don't confuse our namespace checker. The
+ # missing spaces will be flagged by CheckSpacing.
+ namespace_decl_match = Match(r'^\s*namespace\b\s*([:\w]+)?(.*)$', line)
+ if not namespace_decl_match:
+ break
+
+ new_namespace = _NamespaceInfo(namespace_decl_match.group(1), linenum)
+ self.stack.append(new_namespace)
+
+ line = namespace_decl_match.group(2)
+ if line.find('{') != -1:
+ new_namespace.seen_open_brace = True
+ line = line[line.find('{') + 1:]
+
+ # Look for a class declaration in whatever is left of the line
+ # after parsing namespaces. The regexp accounts for decorated classes
+ # such as in:
+ # class LOCKABLE API Object {
+ # };
+ #
+ # Templates with class arguments may confuse the parser, for example:
+ # template <class T, class Comparator = less<T>,
+ # class Vector = vector<T> >
+ # class HeapQueue {
+ #
+ # Because this parser has no nesting state about templates, by the
+ # time it saw "class Comparator", it may think that it's a new class.
+ # Nested templates have a similar problem:
+ # template <
+ # typename ExportedType,
+ # typename TupleType,
+ # template <typename SingleArgTemplate> class ImplTemplate>
+ #
+ # To avoid these cases, we ignore classes that are followed by '=' or '>'
+ class_decl_match = Match(
+ r'\s*(template\s*<[\w\s<>,:]*>\s*)?'
+ r'(class|struct)\s+([A-Z_]+\s+)*(\w+(?:::\w+)*)'
+ r'(([^=>]|<[^<>]*>|<[^<>]*<[^<>]*>\s*>)*)$', line)
+ if (class_decl_match and
+ (not self.stack or self.stack[-1].open_parentheses == 0)):
+ self.stack.append(_ClassInfo(
+ class_decl_match.group(4), class_decl_match.group(2),
+ clean_lines, linenum))
+ line = class_decl_match.group(5)
+
+ # If we have not yet seen the opening brace for the innermost block,
+ # run checks here.
+ if not self.SeenOpenBrace():
+ self.stack[-1].CheckBegin(filename, clean_lines, linenum, error)
+
+ # Update access control if we are inside a class/struct
+ if self.stack and isinstance(self.stack[-1], _ClassInfo):
+ classinfo = self.stack[-1]
+ access_match = Match(
+ r'^(.*)\b(public|private|protected|signals)(\s+(?:slots\s*)?)?'
+ r':(?:[^:]|$)',
+ line)
+ if access_match:
+ classinfo.access = access_match.group(2)
+
+ # Check that access keywords are indented +1 space. Skip this
+ # check if the keywords are not preceded by whitespaces.
+ indent = access_match.group(1)
+ if (len(indent) != classinfo.class_indent + 1 and
+ Match(r'^\s*$', indent)):
+ if classinfo.is_struct:
+ parent = 'struct ' + classinfo.name
+ else:
+ parent = 'class ' + classinfo.name
+ slots = ''
+ if access_match.group(3):
+ slots = access_match.group(3)
+ error(filename, linenum, 'whitespace/indent', 3,
+ '%s%s: should be indented +1 space inside %s' % (
+ access_match.group(2), slots, parent))
+
+ # Consume braces or semicolons from what's left of the line
+ while True:
+ # Match first brace, semicolon, or closed parenthesis.
+ matched = Match(r'^[^{;)}]*([{;)}])(.*)$', line)
+ if not matched:
+ break
+
+ token = matched.group(1)
+ if token == '{':
+ # If namespace or class hasn't seen an opening brace yet, mark
+ # namespace/class head as complete. Push a new block onto the
+ # stack otherwise.
+ if not self.SeenOpenBrace():
+ self.stack[-1].seen_open_brace = True
+ else:
+ self.stack.append(_BlockInfo(True))
+ if _MATCH_ASM.match(line):
+ self.stack[-1].inline_asm = _BLOCK_ASM
+ elif token == ';' or token == ')':
+ # If we haven't seen an opening brace yet, but we already saw
+ # a semicolon, this is probably a forward declaration. Pop
+ # the stack for these.
+ #
+ # Similarly, if we haven't seen an opening brace yet, but we
+ # already saw a closing parenthesis, then these are probably
+ # function arguments with extra "class" or "struct" keywords.
+ # Also pop the stack for these.
+ if not self.SeenOpenBrace():
+ self.stack.pop()
+ else: # token == '}'
+ # Perform end of block checks and pop the stack.
+ if self.stack:
+ self.stack[-1].CheckEnd(filename, clean_lines, linenum, error)
+ self.stack.pop()
+ line = matched.group(2)
+
+ def InnermostClass(self):
+ """Get class info on the top of the stack.
+
+ Returns:
+ A _ClassInfo object if we are inside a class, or None otherwise.
+ """
+ for i in range(len(self.stack), 0, -1):
+ classinfo = self.stack[i - 1]
+ if isinstance(classinfo, _ClassInfo):
+ return classinfo
+ return None
+
+ def CheckCompletedBlocks(self, filename, error):
+ """Checks that all classes and namespaces have been completely parsed.
+
+ Call this when all lines in a file have been processed.
+ Args:
+ filename: The name of the current file.
+ error: The function to call with any errors found.
+ """
+ # Note: This test can result in false positives if #ifdef constructs
+ # get in the way of brace matching. See the testBuildClass test in
+ # cpplint_unittest.py for an example of this.
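+ # e.g. a hypothetical mismatch such as
+ # class Foo {
+ # #ifdef OS_WIN
+ # };
+ # #endif
+ # can leave Foo unclosed on the stack and trigger the errors below.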
+ for obj in self.stack:
+ if isinstance(obj, _ClassInfo):
+ error(filename, obj.starting_linenum, 'build/class', 5,
+ 'Failed to find complete declaration of class %s' %
+ obj.name)
+ elif isinstance(obj, _NamespaceInfo):
+ error(filename, obj.starting_linenum, 'build/namespaces', 5,
+ 'Failed to find complete declaration of namespace %s' %
+ obj.name)
+
+
+ def CheckForNonStandardConstructs(filename, clean_lines, linenum,
+ nesting_state, error):
+ r"""Logs an error if we see certain non-ANSI constructs ignored by gcc-2.
+
+ Complain about several constructs which gcc-2 accepts, but which are
+ not standard C++. Warning about these in lint is one way to ease the
+ transition to new compilers.
+ - put storage class first (e.g. "static const" instead of "const static").
+ - "%lld" instead of "%qd" in printf-type functions.
+ - "%1$d" is non-standard in printf-type functions.
+ - "\%" is an undefined character escape sequence.
+ - text after #endif is not allowed.
+ - invalid inner-style forward declaration.
+ - >? and <? operators, and their >?= and <?= cousins.
+
+ Additionally, check for constructor/destructor style violations and
+ reference members, as it is very convenient to do so while checking for
+ gcc-2 compliance.
+
+ Args:
+ filename: The name of the current file.
+ clean_lines: A CleansedLines instance containing the file.
+ linenum: The number of the line to check.
+ nesting_state: A _NestingState instance which maintains information about
+ the current stack of nested blocks being parsed.
+ error: A callable to which errors are reported, which takes 4 arguments:
+ filename, line number, error level, and message
+ """
+
+ # Remove comments from the line, but leave in strings for now.
+ line = clean_lines.lines[linenum]
+
+ if Search(r'printf\s*\(.*".*%[-+ ]?\d*q', line):
+ error(filename, linenum, 'runtime/printf_format', 3,
+ '%q in format strings is deprecated. Use %ll instead.')
+
+ if Search(r'printf\s*\(.*".*%\d+\$', line):
+ error(filename, linenum, 'runtime/printf_format', 2,
+ '%N$ formats are unconventional. Try rewriting to avoid them.')
+
+ # Remove escaped backslashes before looking for undefined escape sequences.
+ line = line.replace('\\\\', '')
+
+ if Search(r'("|\').*\\(%|\[|\(|{)', line):
+ error(filename, linenum, 'build/printf_format', 3,
+ '%, [, (, and { are undefined character escapes. Unescape them.')
+
+ # For the rest, work with both comments and strings removed.
+ line = clean_lines.elided[linenum]
+
+ if Search(r'\b(const|volatile|void|char|short|int|long'
+ r'|float|double|signed|unsigned'
+ r'|schar|u?int8|u?int16|u?int32|u?int64)'
+ r'\s+(register|static|extern|typedef)\b',
+ line):
+ error(filename, linenum, 'build/storage_class', 5,
+ 'Storage class (static, extern, typedef, etc) should be first.')
+
+ if Match(r'\s*#\s*endif\s*[^/\s]+', line):
+ error(filename, linenum, 'build/endif_comment', 5,
+ 'Uncommented text after #endif is non-standard. Use a comment.')
+
+ if Match(r'\s*class\s+(\w+\s*::\s*)+\w+\s*;', line):
+ error(filename, linenum, 'build/forward_decl', 5,
+ 'Inner-style forward declarations are invalid. Remove this line.')
+
+ if Search(r'(\w+|[+-]?\d+(\.\d*)?)\s*(<|>)\?=?\s*(\w+|[+-]?\d+)(\.\d*)?',
+ line):
+ error(filename, linenum, 'build/deprecated', 3,
+ '>? and <? (max and min) operators are non-standard and deprecated.')
+
+ if Search(r'^\s*const\s*string\s*&\s*\w+\s*;', line):
+ # TODO(unknown): Could it be expanded safely to arbitrary references,
+ # without triggering too many false positives?
+ # Here's the original regexp, for the reference:
+ # type_name = r'\w+((\s*::\s*\w+)|(\s*<\s*\w+?\s*>))?'
+ # r'\s*const\s*' + type_name + '\s*&\s*\w+\s*;'
+ error(filename, linenum, 'runtime/member_string_references', 2,
+ 'const string& members are dangerous. It is much better to use '
+ 'alternatives, such as pointers or simple constants.')
+
+ # Everything else in this function operates on class declarations.
+ # Return early if the top of the nesting stack is not a class, or if
+ # the class head is not completed yet.
+ classinfo = nesting_state.InnermostClass()
+ if not classinfo or not classinfo.seen_open_brace:
+ return
+
+ # The class may have been declared with namespace or classname qualifiers.
+ # The constructor and destructor will not have those qualifiers.
+ base_classname = classinfo.name.split('::')[-1]
+
+ # Look for single-argument constructors that aren't marked explicit.
+ # Technically a valid construct, but against style.
+ args = Match(r'\s+(?:inline\s+)?%s\s*\(([^,()]+)\)'
+ % re.escape(base_classname),
+ line)
+ if (args and
+ args.group(1) != 'void' and
+ not Match(r'(const\s+)?%s(\s+const)?\s*(?:<\w+>\s*)?&'
+ % re.escape(base_classname), args.group(1).strip())):
+ error(filename, linenum, 'runtime/explicit', 5,
+ 'Single-argument constructors should be marked explicit.')
+
+
+ def CheckSpacingForFunctionCall(filename, line, linenum, error):
+ """Checks for the correctness of various spacing around function calls.
+
+ Args:
+ filename: The name of the current file.
+ line: The text of the line to check.
+ linenum: The number of the line to check.
+ error: The function to call with any errors found.
+ """
+
+ # Since function calls often occur inside if/for/while/switch
+ # expressions - which have their own, more liberal conventions - we
+ # first see if we should be looking inside such an expression for a
+ # function call, to which we can apply more strict standards.
+ fncall = line # if there's no control flow construct, look at whole line
+ for pattern in (r'\bif\s*\((.*)\)\s*{',
+ r'\bfor\s*\((.*)\)\s*{',
+ r'\bwhile\s*\((.*)\)\s*[{;]',
+ r'\bswitch\s*\((.*)\)\s*{'):
+ match = Search(pattern, line)
+ if match:
+ fncall = match.group(1) # look inside the parens for function calls
+ break
+
+ # Except in if/for/while/switch, there should never be space
+ # immediately inside parens (eg "f( 3, 4 )"). We make an exception
+ # for nested parens ( (a+b) + c ). Likewise, there should never be
+ # a space before a ( when it's a function argument.
I assume it's a + # function argument when the char before the whitespace is legal in + # a function name (alnum + _) and we're not starting a macro. Also ignore + # pointers and references to arrays and functions coz they're too tricky: + # we use a very simple way to recognize these: + # " (something)(maybe-something)" or + # " (something)(maybe-something," or + # " (something)[something]" + # Note that we assume the contents of [] to be short enough that + # they'll never need to wrap. + if ( # Ignore control structures. + not Search(r'\b(if|for|while|switch|return|new|delete|catch|sizeof)\b', + fncall) and + # Ignore pointers/references to functions. + not Search(r' \([^)]+\)\([^)]*(\)|,$)', fncall) and + # Ignore pointers/references to arrays. + not Search(r' \([^)]+\)\[[^\]]+\]', fncall)): + if Search(r'\w\s*\(\s(?!\s*\\$)', fncall): # a ( used for a fn call + error(filename, linenum, 'whitespace/parens', 4, + 'Extra space after ( in function call') + elif Search(r'\(\s+(?!(\s*\\)|\()', fncall): + error(filename, linenum, 'whitespace/parens', 2, + 'Extra space after (') + if (Search(r'\w\s+\(', fncall) and + not Search(r'#\s*define|typedef', fncall) and + not Search(r'\w\s+\((\w+::)*\*\w+\)\(', fncall)): + error(filename, linenum, 'whitespace/parens', 4, + 'Extra space before ( in function call') + # If the ) is followed only by a newline or a { + newline, assume it's + # part of a control statement (if/while/etc), and don't complain + if Search(r'[^)]\s+\)\s*[^{\s]', fncall): + # If the closing parenthesis is preceded by only whitespaces, + # try to give a more descriptive error message. + if Search(r'^\s+\)', fncall): + error(filename, linenum, 'whitespace/parens', 2, + 'Closing ) should be moved to the previous line') + else: + error(filename, linenum, 'whitespace/parens', 2, + 'Extra space before )') + + +def IsBlankLine(line): + """Returns true if the given line is blank. + + We consider a line to be blank if the line is empty or consists of + only white spaces. + + Args: + line: A line of a string. + + Returns: + True, if the given line is blank. + """ + return not line or line.isspace() + + +def CheckForFunctionLengths(filename, clean_lines, linenum, + function_state, error): + """Reports for long function bodies. + + For an overview why this is done, see: + http://google-styleguide.googlecode.com/svn/trunk/cppguide.xml#Write_Short_Functions + + Uses a simplistic algorithm assuming other style guidelines + (especially spacing) are followed. + Only checks unindented functions, so class members are unchecked. + Trivial bodies are unchecked, so constructors with huge initializer lists + may be missed. + Blank/comment lines are not counted so as to avoid encouraging the removal + of vertical space and comments just to get through a lint check. + NOLINT *on the last line of a function* disables this check. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + function_state: Current function name and lines in body so far. + error: The function to call with any errors found. + """ + lines = clean_lines.lines + line = lines[linenum] + raw = clean_lines.raw_lines + raw_line = raw[linenum] + joined_line = '' + + starting_func = False + regexp = r'(\w(\w|::|\*|\&|\s)*)\(' # decls * & space::name( ... + match_result = Match(regexp, line) + if match_result: + # If the name is all caps and underscores, figure it's a macro and + # ignore it, unless it's TEST or TEST_F. 
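+ # e.g. a hypothetical "MY_ASSERT(" is treated as a macro and skipped,
+ # while "TEST(SuiteName, CaseName) {" and "void Foo::Bar() {" both
+ # start the per-function line counting handled below.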
+ function_name = match_result.group(1).split()[-1] + if function_name == 'TEST' or function_name == 'TEST_F' or ( + not Match(r'[A-Z_]+$', function_name)): + starting_func = True + + if starting_func: + body_found = False + for start_linenum in xrange(linenum, clean_lines.NumLines()): + start_line = lines[start_linenum] + joined_line += ' ' + start_line.lstrip() + if Search(r'(;|})', start_line): # Declarations and trivial functions + body_found = True + break # ... ignore + elif Search(r'{', start_line): + body_found = True + function = Search(r'((\w|:)*)\(', line).group(1) + if Match(r'TEST', function): # Handle TEST... macros + parameter_regexp = Search(r'(\(.*\))', joined_line) + if parameter_regexp: # Ignore bad syntax + function += parameter_regexp.group(1) + else: + function += '()' + function_state.Begin(function) + break + if not body_found: + # No body for the function (or evidence of a non-function) was found. + error(filename, linenum, 'readability/fn_size', 5, + 'Lint failed to find start of function body.') + elif Match(r'^\}\s*$', line): # function end + function_state.Check(error, filename, linenum) + function_state.End() + elif not Match(r'^\s*$', line): + function_state.Count() # Count non-blank/non-comment lines. + + +_RE_PATTERN_TODO = re.compile(r'^//(\s*)TODO(\(.+?\))?:?(\s|$)?') + + +def CheckComment(comment, filename, linenum, error): + """Checks for common mistakes in TODO comments. + + Args: + comment: The text of the comment from the line in question. + filename: The name of the current file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + match = _RE_PATTERN_TODO.match(comment) + if match: + # One whitespace is correct; zero whitespace is handled elsewhere. + leading_whitespace = match.group(1) + if len(leading_whitespace) > 1: + error(filename, linenum, 'whitespace/todo', 2, + 'Too many spaces before TODO') + + username = match.group(2) + if not username: + error(filename, linenum, 'readability/todo', 2, + 'Missing username in TODO; it should look like ' + '"// TODO(my_username): Stuff."') + + middle_whitespace = match.group(3) + # Comparisons made explicit for correctness -- pylint: disable=g-explicit-bool-comparison + if middle_whitespace != ' ' and middle_whitespace != '': + error(filename, linenum, 'whitespace/todo', 2, + 'TODO(my_username) should be followed by a space') + +def CheckAccess(filename, clean_lines, linenum, nesting_state, error): + """Checks for improper use of DISALLOW* macros. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + nesting_state: A _NestingState instance which maintains information about + the current stack of nested blocks being parsed. + error: The function to call with any errors found. + """ + line = clean_lines.elided[linenum] # get rid of comments and strings + + matched = Match((r'\s*(DISALLOW_COPY_AND_ASSIGN|' + r'DISALLOW_EVIL_CONSTRUCTORS|' + r'DISALLOW_IMPLICIT_CONSTRUCTORS)'), line) + if not matched: + return + if nesting_state.stack and isinstance(nesting_state.stack[-1], _ClassInfo): + if nesting_state.stack[-1].access != 'private': + error(filename, linenum, 'readability/constructors', 3, + '%s must be in the private: section' % matched.group(1)) + + else: + # Found DISALLOW* macro outside a class declaration, or perhaps it + # was used inside a function when it should have been part of the + # class declaration. 
We could issue a warning here, but it
+ # probably resulted in a compiler error already.
+ pass
+
+
+ def FindNextMatchingAngleBracket(clean_lines, linenum, init_suffix):
+ """Find the corresponding > to close a template.
+
+ Args:
+ clean_lines: A CleansedLines instance containing the file.
+ linenum: Current line number.
+ init_suffix: Remainder of the current line after the initial <.
+
+ Returns:
+ True if a matching bracket exists.
+ """
+ line = init_suffix
+ nesting_stack = ['<']
+ while True:
+ # Find the next operator that can tell us whether < is used as an
+ # opening bracket or as a less-than operator. We only want to
+ # warn on the latter case.
+ #
+ # We could also check all other operators and terminate the search
+ # early, e.g. if we got something like this "a<b+c", the "<" is
+ # most likely a less-than operator and not the start of a template
+ # argument list.
+ match = Match(r'^[^<>(),;\[\]]*([<>(),;\[\]])(.*)$', line)
+ if match:
+ # Found an operator, update nesting stack
+ operator = match.group(1)
+ line = match.group(2)
+
+ if nesting_stack[-1] == '<':
+ # Expecting closing angle bracket
+ if operator in ('<', '(', '['):
+ nesting_stack.append(operator)
+ elif operator == '>':
+ nesting_stack.pop()
+ if not nesting_stack:
+ # Found matching angle bracket
+ return True
+ elif operator == ',':
+ # Got a comma after a bracket, this is most likely a template
+ # argument. We have not seen a closing angle bracket yet, but
+ # it's probably a few lines later if we look for it, so just
+ # return early here.
+ return True
+ else:
+ # Got some other operator.
+ return False
+
+ else:
+ # Expecting closing parenthesis or closing bracket
+ if operator in ('<', '(', '['):
+ nesting_stack.append(operator)
+ elif operator in (')', ']'):
+ # We don't bother checking for matching () or []. If we got
+ # something like (] or [), it would have been a syntax error.
+ nesting_stack.pop()
+
+ else:
+ # Scan the next line
+ linenum += 1
+ if linenum >= len(clean_lines.elided):
+ break
+ line = clean_lines.elided[linenum]
+
+ # Exhausted all remaining lines and still no matching angle bracket.
+ # Most likely the input was incomplete, otherwise we should have
+ # seen a semicolon and returned early.
+ return True
+
+
+ def FindPreviousMatchingAngleBracket(clean_lines, linenum, init_prefix):
+ """Find the corresponding < that started a template.
+
+ Args:
+ clean_lines: A CleansedLines instance containing the file.
+ linenum: Current line number.
+ init_prefix: Part of the current line before the initial >.
+
+ Returns:
+ True if a matching bracket exists.
+ """
+ line = init_prefix
+ nesting_stack = ['>']
+ while True:
+ # Find the previous operator
+ match = Search(r'^(.*)([<>(),;\[\]])[^<>(),;\[\]]*$', line)
+ if match:
+ # Found an operator, update nesting stack
+ operator = match.group(2)
+ line = match.group(1)
+
+ if nesting_stack[-1] == '>':
+ # Expecting opening angle bracket
+ if operator in ('>', ')', ']'):
+ nesting_stack.append(operator)
+ elif operator == '<':
+ nesting_stack.pop()
+ if not nesting_stack:
+ # Found matching angle bracket
+ return True
+ elif operator == ',':
+ # Got a comma before a bracket, this is most likely a
+ # template argument. The opening angle bracket is probably
+ # there if we look for it, so just return early here.
+ return True
+ else:
+ # Got some other operator.
+ return False + + else: + # Expecting opening parenthesis or opening bracket + if operator in ('>', ')', ']'): + nesting_stack.append(operator) + elif operator in ('(', '['): + nesting_stack.pop() + + else: + # Scan the previous line + linenum -= 1 + if linenum < 0: + break + line = clean_lines.elided[linenum] + + # Exhausted all earlier lines and still no matching angle bracket. + return False + + +def CheckSpacing(filename, clean_lines, linenum, nesting_state, error): + """Checks for the correctness of various spacing issues in the code. + + Things we check for: spaces around operators, spaces after + if/for/while/switch, no spaces around parens in function calls, two + spaces between code and comment, don't start a block with a blank + line, don't end a function with a blank line, don't add a blank line + after public/protected/private, don't have too many blank lines in a row. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + nesting_state: A _NestingState instance which maintains information about + the current stack of nested blocks being parsed. + error: The function to call with any errors found. + """ + + # Don't use "elided" lines here, otherwise we can't check commented lines. + # Don't want to use "raw" either, because we don't want to check inside C++11 + # raw strings, + raw = clean_lines.lines_without_raw_strings + line = raw[linenum] + + # Before nixing comments, check if the line is blank for no good + # reason. This includes the first line after a block is opened, and + # blank lines at the end of a function (ie, right before a line like '}' + # + # Skip all the blank line checks if we are immediately inside a + # namespace body. In other words, don't issue blank line warnings + # for this block: + # namespace { + # + # } + # + # A warning about missing end of namespace comments will be issued instead. + if IsBlankLine(line) and not nesting_state.InNamespaceBody(): + elided = clean_lines.elided + prev_line = elided[linenum - 1] + prevbrace = prev_line.rfind('{') + # TODO(unknown): Don't complain if line before blank line, and line after, + # both start with alnums and are indented the same amount. + # This ignores whitespace at the start of a namespace block + # because those are not usually indented. + if prevbrace != -1 and prev_line[prevbrace:].find('}') == -1: + # OK, we have a blank line at the start of a code block. Before we + # complain, we check if it is an exception to the rule: The previous + # non-empty line has the parameters of a function header that are indented + # 4 spaces (because they did not fit in a 80 column line when placed on + # the same line as the function name). We also check for the case where + # the previous line is indented 6 spaces, which may happen when the + # initializers of a constructor do not fit into a 80 column line. + exception = False + if Match(r' {6}\w', prev_line): # Initializer list? + # We are looking for the opening column of initializer list, which + # should be indented 4 spaces to cause 6 space indentation afterwards. + search_position = linenum-2 + while (search_position >= 0 + and Match(r' {6}\w', elided[search_position])): + search_position -= 1 + exception = (search_position >= 0 + and elided[search_position][:5] == ' :') + else: + # Search for the function arguments or an initializer list. 
We use a + # simple heuristic here: If the line is indented 4 spaces; and we have a + # closing paren, without the opening paren, followed by an opening brace + # or colon (for initializer lists) we assume that it is the last line of + # a function header. If we have a colon indented 4 spaces, it is an + # initializer list. + exception = (Match(r' {4}\w[^\(]*\)\s*(const\s*)?(\{\s*$|:)', + prev_line) + or Match(r' {4}:', prev_line)) + + if not exception: + error(filename, linenum, 'whitespace/blank_line', 2, + 'Redundant blank line at the start of a code block ' + 'should be deleted.') + # Ignore blank lines at the end of a block in a long if-else + # chain, like this: + # if (condition1) { + # // Something followed by a blank line + # + # } else if (condition2) { + # // Something else + # } + if linenum + 1 < clean_lines.NumLines(): + next_line = raw[linenum + 1] + if (next_line + and Match(r'\s*}', next_line) + and next_line.find('} else ') == -1): + error(filename, linenum, 'whitespace/blank_line', 3, + 'Redundant blank line at the end of a code block ' + 'should be deleted.') + + matched = Match(r'\s*(public|protected|private):', prev_line) + if matched: + error(filename, linenum, 'whitespace/blank_line', 3, + 'Do not leave a blank line after "%s:"' % matched.group(1)) + + # Next, we complain if there's a comment too near the text + commentpos = line.find('//') + if commentpos != -1: + # Check if the // may be in quotes. If so, ignore it + # Comparisons made explicit for clarity -- pylint: disable=g-explicit-bool-comparison + if (line.count('"', 0, commentpos) - + line.count('\\"', 0, commentpos)) % 2 == 0: # not in quotes + # Allow one space for new scopes, two spaces otherwise: + if (not Match(r'^\s*{ //', line) and + ((commentpos >= 1 and + line[commentpos-1] not in string.whitespace) or + (commentpos >= 2 and + line[commentpos-2] not in string.whitespace))): + error(filename, linenum, 'whitespace/comments', 2, + 'At least two spaces is best between code and comments') + # There should always be a space between the // and the comment + commentend = commentpos + 2 + if commentend < len(line) and not line[commentend] == ' ': + # but some lines are exceptions -- e.g. if they're big + # comment delimiters like: + # //---------------------------------------------------------- + # or are an empty C++ style Doxygen comment, like: + # /// + # or C++ style Doxygen comments placed after the variable: + # ///< Header comment + # //!< Header comment + # or they begin with multiple slashes followed by a space: + # //////// Header comment + match = (Search(r'[=/-]{4,}\s*$', line[commentend:]) or + Search(r'^/$', line[commentend:]) or + Search(r'^!< ', line[commentend:]) or + Search(r'^/< ', line[commentend:]) or + Search(r'^/+ ', line[commentend:])) + if not match: + error(filename, linenum, 'whitespace/comments', 4, + 'Should have a space between // and comment') + CheckComment(line[commentpos:], filename, linenum, error) + + line = clean_lines.elided[linenum] # get rid of comments and strings + + # Don't try to do spacing checks for operator methods + line = re.sub(r'operator(==|!=|<|<<|<=|>=|>>|>)\(', 'operator\(', line) + + # We allow no-spaces around = within an if: "if ( (a=Foo()) == 0 )". + # Otherwise not. Note we only check for non-spaces on *both* sides; + # sometimes people put non-spaces on one side when aligning ='s among + # many lines (not that this is behavior that I approve of...) 
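+ # Hypothetical examples: "x=5" is flagged by the check below, while
+ # "if ( (a=Foo()) == 0 )" is not, because of the if/while exemption.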
+ if Search(r'[\w.]=[\w.]', line) and not Search(r'\b(if|while) ', line):
+ error(filename, linenum, 'whitespace/operators', 4,
+ 'Missing spaces around =')
+
+ # It's ok not to have spaces around binary operators like + - * /, but if
+ # there's too little whitespace, we get concerned. It's hard to tell,
+ # though, so we punt on this one for now. TODO.
+
+ # You should always have whitespace around binary operators.
+ #
+ # Check <= and >= first to avoid false positives with < and >, then
+ # check non-include lines for spacing around < and >.
+ match = Search(r'[^<>=!\s](==|!=|<=|>=)[^<>=!\s]', line)
+ if match:
+ error(filename, linenum, 'whitespace/operators', 3,
+ 'Missing spaces around %s' % match.group(1))
+ # We allow no-spaces around << when used like this: 10<<20, but
+ # not otherwise (particularly, not when used as streams)
+ # Also ignore using ns::operator<<;
+ match = Search(r'(operator|\S)(?:L|UL|ULL|l|ul|ull)?<<(\S)', line)
+ if (match and
+ not (match.group(1).isdigit() and match.group(2).isdigit()) and
+ not (match.group(1) == 'operator' and match.group(2) == ';')):
+ error(filename, linenum, 'whitespace/operators', 3,
+ 'Missing spaces around <<')
+ elif not Match(r'#.*include', line):
+ # Avoid false positives on ->
+ reduced_line = line.replace('->', '')
+
+ # Look for < that is not surrounded by spaces. This is only
+ # triggered if both sides are missing spaces, even though
+ # technically we should flag it if at least one side is missing a
+ # space. This is done to avoid some false positives with shifts.
+ match = Search(r'[^\s<]<([^\s=<].*)', reduced_line)
+ if (match and
+ not FindNextMatchingAngleBracket(clean_lines, linenum, match.group(1))):
+ error(filename, linenum, 'whitespace/operators', 3,
+ 'Missing spaces around <')
+
+ # Look for > that is not surrounded by spaces. Similar to the
+ # above, we only trigger if both sides are missing spaces to avoid
+ # false positives with shifts.
+ match = Search(r'^(.*[^\s>])>[^\s=>]', reduced_line)
+ if (match and
+ not FindPreviousMatchingAngleBracket(clean_lines, linenum,
+ match.group(1))):
+ error(filename, linenum, 'whitespace/operators', 3,
+ 'Missing spaces around >')
+
+ # We allow no-spaces around >> for almost anything. This is because
+ # C++11 allows ">>" to close nested templates, which accounts for
+ # most cases when ">>" is not followed by a space.
+ #
+ # We still warn on ">>" followed by alpha character, because that is
+ # likely due to ">>" being used for right shifts, e.g.:
+ # value >> alpha
+ #
+ # When ">>" is used to close templates, the alphanumeric letter that
+ # follows would be part of an identifier, and there should still be
+ # a space separating the template type and the identifier.
+ # type> alpha + match = Search(r'>>[a-zA-Z_]', line) + if match: + error(filename, linenum, 'whitespace/operators', 3, + 'Missing spaces around >>') + + # There shouldn't be space around unary operators + match = Search(r'(!\s|~\s|[\s]--[\s;]|[\s]\+\+[\s;])', line) + if match: + error(filename, linenum, 'whitespace/operators', 4, + 'Extra space for operator %s' % match.group(1)) + + # A pet peeve of mine: no spaces after an if, while, switch, or for + match = Search(r' (if\(|for\(|while\(|switch\()', line) + if match: + error(filename, linenum, 'whitespace/parens', 5, + 'Missing space before ( in %s' % match.group(1)) + + # For if/for/while/switch, the left and right parens should be + # consistent about how many spaces are inside the parens, and + # there should either be zero or one spaces inside the parens. + # We don't want: "if ( foo)" or "if ( foo )". + # Exception: "for ( ; foo; bar)" and "for (foo; bar; )" are allowed. + match = Search(r'\b(if|for|while|switch)\s*' + r'\(([ ]*)(.).*[^ ]+([ ]*)\)\s*{\s*$', + line) + if match: + if len(match.group(2)) != len(match.group(4)): + if not (match.group(3) == ';' and + len(match.group(2)) == 1 + len(match.group(4)) or + not match.group(2) and Search(r'\bfor\s*\(.*; \)', line)): + error(filename, linenum, 'whitespace/parens', 5, + 'Mismatching spaces inside () in %s' % match.group(1)) + if len(match.group(2)) not in [0, 1]: + error(filename, linenum, 'whitespace/parens', 5, + 'Should have zero or one spaces inside ( and ) in %s' % + match.group(1)) + + # You should always have a space after a comma (either as fn arg or operator) + # + # This does not apply when the non-space character following the + # comma is another comma, since the only time when that happens is + # for empty macro arguments. + # + # We run this check in two passes: first pass on elided lines to + # verify that lines contain missing whitespaces, second pass on raw + # lines to confirm that those missing whitespaces are not due to + # elided comments. + if Search(r',[^,\s]', line) and Search(r',[^,\s]', raw[linenum]): + error(filename, linenum, 'whitespace/comma', 3, + 'Missing space after ,') + + # You should always have a space after a semicolon + # except for few corner cases + # TODO(unknown): clarify if 'if (1) { return 1;}' is requires one more + # space after ; + if Search(r';[^\s};\\)/]', line): + error(filename, linenum, 'whitespace/semicolon', 3, + 'Missing space after ;') + + # Next we will look for issues with function calls. + CheckSpacingForFunctionCall(filename, line, linenum, error) + + # Except after an opening paren, or after another opening brace (in case of + # an initializer list, for instance), you should have spaces before your + # braces. And since you should never have braces at the beginning of a line, + # this is an easy test. + match = Match(r'^(.*[^ ({]){', line) + if match: + # Try a bit harder to check for brace initialization. This + # happens in one of the following forms: + # Constructor() : initializer_list_{} { ... } + # Constructor{}.MemberFunction() + # Type variable{}; + # FunctionCall(type{}, ...); + # LastArgument(..., type{}); + # LOG(INFO) << type{} << " ..."; + # map_of_type[{...}] = ...; + # + # We check for the character following the closing brace, and + # silence the warning if it's one of those listed above, i.e. + # "{.;,)<]". + # + # To account for nested initializer list, we allow any number of + # closing braces up to "{;,)<". 
We can't simply silence the + # warning on first sight of closing brace, because that would + # cause false negatives for things that are not initializer lists. + # Silence this: But not this: + # Outer{ if (...) { + # Inner{...} if (...){ // Missing space before { + # }; } + # + # There is a false negative with this approach if people inserted + # spurious semicolons, e.g. "if (cond){};", but we will catch the + # spurious semicolon with a separate check. + (endline, endlinenum, endpos) = CloseExpression( + clean_lines, linenum, len(match.group(1))) + trailing_text = '' + if endpos > -1: + trailing_text = endline[endpos:] + for offset in xrange(endlinenum + 1, + min(endlinenum + 3, clean_lines.NumLines() - 1)): + trailing_text += clean_lines.elided[offset] + if not Match(r'^[\s}]*[{.;,)<\]]', trailing_text): + error(filename, linenum, 'whitespace/braces', 5, + 'Missing space before {') + + # Make sure '} else {' has spaces. + if Search(r'}else', line): + error(filename, linenum, 'whitespace/braces', 5, + 'Missing space before else') + + # You shouldn't have spaces before your brackets, except maybe after + # 'delete []' or 'new char * []'. + if Search(r'\w\s+\[', line) and not Search(r'delete\s+\[', line): + error(filename, linenum, 'whitespace/braces', 5, + 'Extra space before [') + + # You shouldn't have a space before a semicolon at the end of the line. + # There's a special case for "for" since the style guide allows space before + # the semicolon there. + if Search(r':\s*;\s*$', line): + error(filename, linenum, 'whitespace/semicolon', 5, + 'Semicolon defining empty statement. Use {} instead.') + elif Search(r'^\s*;\s*$', line): + error(filename, linenum, 'whitespace/semicolon', 5, + 'Line contains only semicolon. If this should be an empty statement, ' + 'use {} instead.') + elif (Search(r'\s+;\s*$', line) and + not Search(r'\bfor\b', line)): + error(filename, linenum, 'whitespace/semicolon', 5, + 'Extra space before last semicolon. If this should be an empty ' + 'statement, use {} instead.') + + # In range-based for, we wanted spaces before and after the colon, but + # not around "::" tokens that might appear. + if (Search('for *\(.*[^:]:[^: ]', line) or + Search('for *\(.*[^: ]:[^:]', line)): + error(filename, linenum, 'whitespace/forcolon', 2, + 'Missing space around colon in range-based for loop') + + +def CheckSectionSpacing(filename, clean_lines, class_info, linenum, error): + """Checks for additional blank line issues related to sections. + + Currently the only thing checked here is blank line before protected/private. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + class_info: A _ClassInfo objects. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + # Skip checks if the class is small, where small means 25 lines or less. + # 25 lines seems like a good cutoff since that's the usual height of + # terminals, and any class that can't fit in one screen can't really + # be considered "small". + # + # Also skip checks if we are on the first line. This accounts for + # classes that look like + # class Foo { public: ... }; + # + # If we didn't find the end of the class, last_line would be zero, + # and the check will be skipped by the first condition. 
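+  #
+  # For instance (an illustrative example), a one-screen class such as:
+  #   class Point { public: int x, y; };
+  # is skipped twice over: it is well under the 25-line cutoff and sits
+  # entirely on its starting line.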
+ if (class_info.last_line - class_info.starting_linenum <= 24 or + linenum <= class_info.starting_linenum): + return + + matched = Match(r'\s*(public|protected|private):', clean_lines.lines[linenum]) + if matched: + # Issue warning if the line before public/protected/private was + # not a blank line, but don't do this if the previous line contains + # "class" or "struct". This can happen two ways: + # - We are at the beginning of the class. + # - We are forward-declaring an inner class that is semantically + # private, but needed to be public for implementation reasons. + # Also ignores cases where the previous line ends with a backslash as can be + # common when defining classes in C macros. + prev_line = clean_lines.lines[linenum - 1] + if (not IsBlankLine(prev_line) and + not Search(r'\b(class|struct)\b', prev_line) and + not Search(r'\\$', prev_line)): + # Try a bit harder to find the beginning of the class. This is to + # account for multi-line base-specifier lists, e.g.: + # class Derived + # : public Base { + end_class_head = class_info.starting_linenum + for i in range(class_info.starting_linenum, linenum): + if Search(r'\{\s*$', clean_lines.lines[i]): + end_class_head = i + break + if end_class_head < linenum - 1: + error(filename, linenum, 'whitespace/blank_line', 3, + '"%s:" should be preceded by a blank line' % matched.group(1)) + + +def GetPreviousNonBlankLine(clean_lines, linenum): + """Return the most recent non-blank line and its line number. + + Args: + clean_lines: A CleansedLines instance containing the file contents. + linenum: The number of the line to check. + + Returns: + A tuple with two elements. The first element is the contents of the last + non-blank line before the current line, or the empty string if this is the + first non-blank line. The second is the line number of that line, or -1 + if this is the first non-blank line. + """ + + prevlinenum = linenum - 1 + while prevlinenum >= 0: + prevline = clean_lines.elided[prevlinenum] + if not IsBlankLine(prevline): # if not a blank line... + return (prevline, prevlinenum) + prevlinenum -= 1 + return ('', -1) + + +def CheckBraces(filename, clean_lines, linenum, error): + """Looks for misplaced braces (e.g. at the end of line). + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + + line = clean_lines.elided[linenum] # get rid of comments and strings + + if Match(r'\s*{\s*$', line): + # We allow an open brace to start a line in the case where someone is using + # braces in a block to explicitly create a new scope, which is commonly used + # to control the lifetime of stack-allocated variables. Braces are also + # used for brace initializers inside function calls. We don't detect this + # perfectly: we just don't complain if the last non-whitespace character on + # the previous non-blank line is ',', ';', ':', '(', '{', or '}', or if the + # previous line starts a preprocessor block. + prevline = GetPreviousNonBlankLine(clean_lines, linenum)[0] + if (not Search(r'[,;:}{(]\s*$', prevline) and + not Match(r'\s*#', prevline)): + error(filename, linenum, 'whitespace/braces', 4, + '{ should almost always be at the end of the previous line') + + # An else clause should be on the same line as the preceding closing brace. 
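+  # For example (illustrative), flag this:     but not this:
+  #   }                                          } else {
+  #   else {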
+ if Match(r'\s*else\s*', line): + prevline = GetPreviousNonBlankLine(clean_lines, linenum)[0] + if Match(r'\s*}\s*$', prevline): + error(filename, linenum, 'whitespace/newline', 4, + 'An else should appear on the same line as the preceding }') + + # If braces come on one side of an else, they should be on both. + # However, we have to worry about "else if" that spans multiple lines! + if Search(r'}\s*else[^{]*$', line) or Match(r'[^}]*else\s*{', line): + if Search(r'}\s*else if([^{]*)$', line): # could be multi-line if + # find the ( after the if + pos = line.find('else if') + pos = line.find('(', pos) + if pos > 0: + (endline, _, endpos) = CloseExpression(clean_lines, linenum, pos) + if endline[endpos:].find('{') == -1: # must be brace after if + error(filename, linenum, 'readability/braces', 5, + 'If an else has a brace on one side, it should have it on both') + else: # common case: else not followed by a multi-line if + error(filename, linenum, 'readability/braces', 5, + 'If an else has a brace on one side, it should have it on both') + + # Likewise, an else should never have the else clause on the same line + if Search(r'\belse [^\s{]', line) and not Search(r'\belse if\b', line): + error(filename, linenum, 'whitespace/newline', 4, + 'Else clause should never be on same line as else (use 2 lines)') + + # In the same way, a do/while should never be on one line + if Match(r'\s*do [^\s{]', line): + error(filename, linenum, 'whitespace/newline', 4, + 'do/while clauses should not be on a single line') + + # Block bodies should not be followed by a semicolon. Due to C++11 + # brace initialization, there are more places where semicolons are + # required than not, so we use a whitelist approach to check these + # rather than a blacklist. These are the places where "};" should + # be replaced by just "}": + # 1. Some flavor of block following closing parenthesis: + # for (;;) {}; + # while (...) {}; + # switch (...) {}; + # Function(...) {}; + # if (...) {}; + # if (...) else if (...) {}; + # + # 2. else block: + # if (...) else {}; + # + # 3. const member function: + # Function(...) const {}; + # + # 4. Block following some statement: + # x = 42; + # {}; + # + # 5. Block at the beginning of a function: + # Function(...) { + # {}; + # } + # + # Note that naively checking for the preceding "{" will also match + # braces inside multi-dimensional arrays, but this is fine since + # that expression will not contain semicolons. + # + # 6. Block following another block: + # while (true) {} + # {}; + # + # 7. End of namespaces: + # namespace {}; + # + # These semicolons seems far more common than other kinds of + # redundant semicolons, possibly due to people converting classes + # to namespaces. For now we do not warn for this case. + # + # Try matching case 1 first. + match = Match(r'^(.*\)\s*)\{', line) + if match: + # Matched closing parenthesis (case 1). Check the token before the + # matching opening parenthesis, and don't warn if it looks like a + # macro. 
This avoids these false positives: + # - macro that defines a base class + # - multi-line macro that defines a base class + # - macro that defines the whole class-head + # + # But we still issue warnings for macros that we know are safe to + # warn, specifically: + # - TEST, TEST_F, TEST_P, MATCHER, MATCHER_P + # - TYPED_TEST + # - INTERFACE_DEF + # - EXCLUSIVE_LOCKS_REQUIRED, SHARED_LOCKS_REQUIRED, LOCKS_EXCLUDED: + # + # We implement a whitelist of safe macros instead of a blacklist of + # unsafe macros, even though the latter appears less frequently in + # google code and would have been easier to implement. This is because + # the downside for getting the whitelist wrong means some extra + # semicolons, while the downside for getting the blacklist wrong + # would result in compile errors. + # + # In addition to macros, we also don't want to warn on compound + # literals. + closing_brace_pos = match.group(1).rfind(')') + opening_parenthesis = ReverseCloseExpression( + clean_lines, linenum, closing_brace_pos) + if opening_parenthesis[2] > -1: + line_prefix = opening_parenthesis[0][0:opening_parenthesis[2]] + macro = Search(r'\b([A-Z_]+)\s*$', line_prefix) + if ((macro and + macro.group(1) not in ( + 'TEST', 'TEST_F', 'MATCHER', 'MATCHER_P', 'TYPED_TEST', + 'EXCLUSIVE_LOCKS_REQUIRED', 'SHARED_LOCKS_REQUIRED', + 'LOCKS_EXCLUDED', 'INTERFACE_DEF')) or + Search(r'\s+=\s*$', line_prefix)): + match = None + # Whitelist lambda function definition which also requires a ";" after + # closing brace + if match: + if Match(r'^.*\[.*\]\s*(.*\)\s*)\{', line): + match = None + + else: + # Try matching cases 2-3. + match = Match(r'^(.*(?:else|\)\s*const)\s*)\{', line) + if not match: + # Try matching cases 4-6. These are always matched on separate lines. + # + # Note that we can't simply concatenate the previous line to the + # current line and do a single match, otherwise we may output + # duplicate warnings for the blank line case: + # if (cond) { + # // blank line + # } + prevline = GetPreviousNonBlankLine(clean_lines, linenum)[0] + if prevline and Search(r'[;{}]\s*$', prevline): + match = Match(r'^(\s*)\{', line) + + # Check matching closing brace + if match: + (endline, endlinenum, endpos) = CloseExpression( + clean_lines, linenum, len(match.group(1))) + if endpos > -1 and Match(r'^\s*;', endline[endpos:]): + # Current {} pair is eligible for semicolon check, and we have found + # the redundant semicolon, output warning here. + # + # Note: because we are scanning forward for opening braces, and + # outputting warnings for the matching closing brace, if there are + # nested blocks with trailing semicolons, we will get the error + # messages in reversed order. + error(filename, endlinenum, 'readability/braces', 4, + "You don't need a ; after a }") + + +def CheckEmptyBlockBody(filename, clean_lines, linenum, error): + """Look for empty loop/conditional body with only a single semicolon. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + + # Search for loop keywords at the beginning of the line. Because only + # whitespaces are allowed before the keywords, this will also ignore most + # do-while-loops, since those lines should start with closing brace. + # + # We also check "if" blocks here, since an empty conditional block + # is likely an error. 
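+  #
+  # e.g. (hypothetical examples) both of these would be flagged:
+  #   while (GetNext());
+  #   if (done);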
+ line = clean_lines.elided[linenum] + matched = Match(r'\s*(for|while|if)\s*\(', line) + if matched: + # Find the end of the conditional expression + (end_line, end_linenum, end_pos) = CloseExpression( + clean_lines, linenum, line.find('(')) + + # Output warning if what follows the condition expression is a semicolon. + # No warning for all other cases, including whitespace or newline, since we + # have a separate check for semicolons preceded by whitespace. + if end_pos >= 0 and Match(r';', end_line[end_pos:]): + if matched.group(1) == 'if': + error(filename, end_linenum, 'whitespace/empty_conditional_body', 5, + 'Empty conditional bodies should use {}') + else: + error(filename, end_linenum, 'whitespace/empty_loop_body', 5, + 'Empty loop bodies should use {} or continue') + + +def CheckCheck(filename, clean_lines, linenum, error): + """Checks the use of CHECK and EXPECT macros. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + + # Decide the set of replacement macros that should be suggested + lines = clean_lines.elided + check_macro = None + start_pos = -1 + for macro in _CHECK_MACROS: + i = lines[linenum].find(macro) + if i >= 0: + check_macro = macro + + # Find opening parenthesis. Do a regular expression match here + # to make sure that we are matching the expected CHECK macro, as + # opposed to some other macro that happens to contain the CHECK + # substring. + matched = Match(r'^(.*\b' + check_macro + r'\s*)\(', lines[linenum]) + if not matched: + continue + start_pos = len(matched.group(1)) + break + if not check_macro or start_pos < 0: + # Don't waste time here if line doesn't contain 'CHECK' or 'EXPECT' + return + + # Find end of the boolean expression by matching parentheses + (last_line, end_line, end_pos) = CloseExpression( + clean_lines, linenum, start_pos) + if end_pos < 0: + return + if linenum == end_line: + expression = lines[linenum][start_pos + 1:end_pos - 1] + else: + expression = lines[linenum][start_pos + 1:] + for i in xrange(linenum + 1, end_line): + expression += lines[i] + expression += last_line[0:end_pos - 1] + + # Parse expression so that we can take parentheses into account. + # This avoids false positives for inputs like "CHECK((a < 4) == b)", + # which is not replaceable by CHECK_LE. + lhs = '' + rhs = '' + operator = None + while expression: + matched = Match(r'^\s*(<<|<<=|>>|>>=|->\*|->|&&|\|\||' + r'==|!=|>=|>|<=|<|\()(.*)$', expression) + if matched: + token = matched.group(1) + if token == '(': + # Parenthesized operand + expression = matched.group(2) + (end, _) = FindEndOfExpressionInLine(expression, 0, 1, '(', ')') + if end < 0: + return # Unmatched parenthesis + lhs += '(' + expression[0:end] + expression = expression[end:] + elif token in ('&&', '||'): + # Logical and/or operators. This means the expression + # contains more than one term, for example: + # CHECK(42 < a && a < b); + # + # These are not replaceable with CHECK_LE, so bail out early. + return + elif token in ('<<', '<<=', '>>', '>>=', '->*', '->'): + # Non-relational operator + lhs += token + expression = matched.group(2) + else: + # Relational operator + operator = token + rhs = matched.group(2) + break + else: + # Unparenthesized operand. Instead of appending to lhs one character + # at a time, we do another regular expression match to consume several + # characters at once if possible. 
Trivial benchmark shows that this + # is more efficient when the operands are longer than a single + # character, which is generally the case. + matched = Match(r'^([^-=!<>()&|]+)(.*)$', expression) + if not matched: + matched = Match(r'^(\s*\S)(.*)$', expression) + if not matched: + break + lhs += matched.group(1) + expression = matched.group(2) + + # Only apply checks if we got all parts of the boolean expression + if not (lhs and operator and rhs): + return + + # Check that rhs do not contain logical operators. We already know + # that lhs is fine since the loop above parses out && and ||. + if rhs.find('&&') > -1 or rhs.find('||') > -1: + return + + # At least one of the operands must be a constant literal. This is + # to avoid suggesting replacements for unprintable things like + # CHECK(variable != iterator) + # + # The following pattern matches decimal, hex integers, strings, and + # characters (in that order). + lhs = lhs.strip() + rhs = rhs.strip() + match_constant = r'^([-+]?(\d+|0[xX][0-9a-fA-F]+)[lLuU]{0,3}|".*"|\'.*\')$' + if Match(match_constant, lhs) or Match(match_constant, rhs): + # Note: since we know both lhs and rhs, we can provide a more + # descriptive error message like: + # Consider using CHECK_EQ(x, 42) instead of CHECK(x == 42) + # Instead of: + # Consider using CHECK_EQ instead of CHECK(a == b) + # + # We are still keeping the less descriptive message because if lhs + # or rhs gets long, the error message might become unreadable. + error(filename, linenum, 'readability/check', 2, + 'Consider using %s instead of %s(a %s b)' % ( + _CHECK_REPLACEMENT[check_macro][operator], + check_macro, operator)) + + +def CheckAltTokens(filename, clean_lines, linenum, error): + """Check alternative keywords being used in boolean expressions. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + line = clean_lines.elided[linenum] + + # Avoid preprocessor lines + if Match(r'^\s*#', line): + return + + # Last ditch effort to avoid multi-line comments. This will not help + # if the comment started before the current line or ended after the + # current line, but it catches most of the false positives. At least, + # it provides a way to workaround this warning for people who use + # multi-line comments in preprocessor macros. + # + # TODO(unknown): remove this once cpplint has better support for + # multi-line comments. + if line.find('/*') >= 0 or line.find('*/') >= 0: + return + + for match in _ALT_TOKEN_REPLACEMENT_PATTERN.finditer(line): + error(filename, linenum, 'readability/alt_tokens', 2, + 'Use operator %s instead of %s' % ( + _ALT_TOKEN_REPLACEMENT[match.group(1)], match.group(1))) + + +def GetLineWidth(line): + """Determines the width of the line in column positions. + + Args: + line: A string, which may be a Unicode string. + + Returns: + The width of the line in column positions, accounting for Unicode + combining characters and wide characters. + """ + if isinstance(line, unicode): + width = 0 + for uc in unicodedata.normalize('NFC', line): + if unicodedata.east_asian_width(uc) in ('W', 'F'): + width += 2 + elif not unicodedata.combining(uc): + width += 1 + return width + else: + return len(line) + + +def CheckStyle(filename, clean_lines, linenum, file_extension, nesting_state, + error): + """Checks rules from the 'C++ style rules' section of cppguide.html. 
+ + Most of these rules are hard to test (naming, comment style), but we + do what we can. In particular we check for 2-space indents, line lengths, + tab usage, spaces inside code, etc. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + file_extension: The extension (without the dot) of the filename. + nesting_state: A _NestingState instance which maintains information about + the current stack of nested blocks being parsed. + error: The function to call with any errors found. + """ + + # Don't use "elided" lines here, otherwise we can't check commented lines. + # Don't want to use "raw" either, because we don't want to check inside C++11 + # raw strings, + raw_lines = clean_lines.lines_without_raw_strings + line = raw_lines[linenum] + + if line.find('\t') != -1: + error(filename, linenum, 'whitespace/tab', 1, + 'Tab found; better to use spaces') + + # One or three blank spaces at the beginning of the line is weird; it's + # hard to reconcile that with 2-space indents. + # NOTE: here are the conditions rob pike used for his tests. Mine aren't + # as sophisticated, but it may be worth becoming so: RLENGTH==initial_spaces + # if(RLENGTH > 20) complain = 0; + # if(match($0, " +(error|private|public|protected):")) complain = 0; + # if(match(prev, "&& *$")) complain = 0; + # if(match(prev, "\\|\\| *$")) complain = 0; + # if(match(prev, "[\",=><] *$")) complain = 0; + # if(match($0, " <<")) complain = 0; + # if(match(prev, " +for \\(")) complain = 0; + # if(prevodd && match(prevprev, " +for \\(")) complain = 0; + initial_spaces = 0 + cleansed_line = clean_lines.elided[linenum] + while initial_spaces < len(line) and line[initial_spaces] == ' ': + initial_spaces += 1 + if line and line[-1].isspace(): + error(filename, linenum, 'whitespace/end_of_line', 4, + 'Line ends in whitespace. Consider deleting these extra spaces.') + # There are certain situations we allow one space, notably for section labels + elif ((initial_spaces == 1 or initial_spaces == 3) and + not Match(r'\s*\w+\s*:\s*$', cleansed_line)): + error(filename, linenum, 'whitespace/indent', 3, + 'Weird number of spaces at line-start. ' + 'Are you using a 2-space indent?') + + # Check if the line is a header guard. + is_header_guard = False + if file_extension == 'h': + cppvar = GetHeaderGuardCPPVariable(filename) + if (line.startswith('#ifndef %s' % cppvar) or + line.startswith('#define %s' % cppvar) or + line.startswith('#endif // %s' % cppvar)): + is_header_guard = True + # #include lines and header guards can be long, since there's no clean way to + # split them. + # + # URLs can be long too. It's possible to split these, but it makes them + # harder to cut&paste. + # + # The "$Id:...$" comment may also get very long without it being the + # developers fault. + if (not line.startswith('#include') and not is_header_guard and + not Match(r'^\s*//.*http(s?)://\S*$', line) and + not Match(r'^// \$Id:.*#[0-9]+ \$$', line)): + line_width = GetLineWidth(line) + extended_length = int((_line_length * 1.25)) + if line_width > extended_length: + error(filename, linenum, 'whitespace/line_length', 4, + 'Lines should very rarely be longer than %i characters' % + extended_length) + elif line_width > _line_length: + error(filename, linenum, 'whitespace/line_length', 2, + 'Lines should be <= %i characters long' % _line_length) + + if (cleansed_line.count(';') > 1 and + # for loops are allowed two ;'s (and may run over two lines). 
+ cleansed_line.find('for') == -1 and + (GetPreviousNonBlankLine(clean_lines, linenum)[0].find('for') == -1 or + GetPreviousNonBlankLine(clean_lines, linenum)[0].find(';') != -1) and + # It's ok to have many commands in a switch case that fits in 1 line + not ((cleansed_line.find('case ') != -1 or + cleansed_line.find('default:') != -1) and + cleansed_line.find('break;') != -1)): + error(filename, linenum, 'whitespace/newline', 0, + 'More than one command on the same line') + + # Some more style checks + CheckBraces(filename, clean_lines, linenum, error) + CheckEmptyBlockBody(filename, clean_lines, linenum, error) + CheckAccess(filename, clean_lines, linenum, nesting_state, error) + CheckSpacing(filename, clean_lines, linenum, nesting_state, error) + CheckCheck(filename, clean_lines, linenum, error) + CheckAltTokens(filename, clean_lines, linenum, error) + classinfo = nesting_state.InnermostClass() + if classinfo: + CheckSectionSpacing(filename, clean_lines, classinfo, linenum, error) + + +_RE_PATTERN_INCLUDE_NEW_STYLE = re.compile(r'#include +"[^/]+\.h"') +_RE_PATTERN_INCLUDE = re.compile(r'^\s*#\s*include\s*([<"])([^>"]*)[>"].*$') +# Matches the first component of a filename delimited by -s and _s. That is: +# _RE_FIRST_COMPONENT.match('foo').group(0) == 'foo' +# _RE_FIRST_COMPONENT.match('foo.cc').group(0) == 'foo' +# _RE_FIRST_COMPONENT.match('foo-bar_baz.cc').group(0) == 'foo' +# _RE_FIRST_COMPONENT.match('foo_bar-baz.cc').group(0) == 'foo' +_RE_FIRST_COMPONENT = re.compile(r'^[^-_.]+') + + +def _DropCommonSuffixes(filename): + """Drops common suffixes like _test.cc or -inl.h from filename. + + For example: + >>> _DropCommonSuffixes('foo/foo-inl.h') + 'foo/foo' + >>> _DropCommonSuffixes('foo/bar/foo.cc') + 'foo/bar/foo' + >>> _DropCommonSuffixes('foo/foo_internal.h') + 'foo/foo' + >>> _DropCommonSuffixes('foo/foo_unusualinternal.h') + 'foo/foo_unusualinternal' + + Args: + filename: The input filename. + + Returns: + The filename with the common suffix removed. + """ + for suffix in ('test.cc', 'regtest.cc', 'unittest.cc', + 'inl.h', 'impl.h', 'internal.h'): + if (filename.endswith(suffix) and len(filename) > len(suffix) and + filename[-len(suffix) - 1] in ('-', '_')): + return filename[:-len(suffix) - 1] + return os.path.splitext(filename)[0] + + +def _IsTestFilename(filename): + """Determines if the given filename has a suffix that identifies it as a test. + + Args: + filename: The input filename. + + Returns: + True if 'filename' looks like a test, False otherwise. + """ + if (filename.endswith('_test.cc') or + filename.endswith('_unittest.cc') or + filename.endswith('_regtest.cc')): + return True + else: + return False + + +def _ClassifyInclude(fileinfo, include, is_system): + """Figures out what kind of header 'include' is. + + Args: + fileinfo: The current file cpplint is running over. A FileInfo instance. + include: The path to a #included file. + is_system: True if the #include used <> rather than "". + + Returns: + One of the _XXX_HEADER constants. + + For example: + >>> _ClassifyInclude(FileInfo('foo/foo.cc'), 'stdio.h', True) + _C_SYS_HEADER + >>> _ClassifyInclude(FileInfo('foo/foo.cc'), 'string', True) + _CPP_SYS_HEADER + >>> _ClassifyInclude(FileInfo('foo/foo.cc'), 'foo/foo.h', False) + _LIKELY_MY_HEADER + >>> _ClassifyInclude(FileInfo('foo/foo_unknown_extension.cc'), + ... 
'bar/foo_other_ext.h', False) + _POSSIBLE_MY_HEADER + >>> _ClassifyInclude(FileInfo('foo/foo.cc'), 'foo/bar.h', False) + _OTHER_HEADER + """ + # This is a list of all standard c++ header files, except + # those already checked for above. + is_cpp_h = include in _CPP_HEADERS + + if is_system: + if is_cpp_h: + return _CPP_SYS_HEADER + else: + return _C_SYS_HEADER + + # If the target file and the include we're checking share a + # basename when we drop common extensions, and the include + # lives in . , then it's likely to be owned by the target file. + target_dir, target_base = ( + os.path.split(_DropCommonSuffixes(fileinfo.RepositoryName()))) + include_dir, include_base = os.path.split(_DropCommonSuffixes(include)) + if target_base == include_base and ( + include_dir == target_dir or + include_dir == os.path.normpath(target_dir + '/../public')): + return _LIKELY_MY_HEADER + + # If the target and include share some initial basename + # component, it's possible the target is implementing the + # include, so it's allowed to be first, but we'll never + # complain if it's not there. + target_first_component = _RE_FIRST_COMPONENT.match(target_base) + include_first_component = _RE_FIRST_COMPONENT.match(include_base) + if (target_first_component and include_first_component and + target_first_component.group(0) == + include_first_component.group(0)): + return _POSSIBLE_MY_HEADER + + return _OTHER_HEADER + + + +def CheckIncludeLine(filename, clean_lines, linenum, include_state, error): + """Check rules that are applicable to #include lines. + + Strings on #include lines are NOT removed from elided line, to make + certain tasks easier. However, to prevent false positives, checks + applicable to #include lines in CheckLanguage must be put here. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + include_state: An _IncludeState instance in which the headers are inserted. + error: The function to call with any errors found. + """ + fileinfo = FileInfo(filename) + + line = clean_lines.lines[linenum] + + # "include" should use the new style "foo/bar.h" instead of just "bar.h" + if _RE_PATTERN_INCLUDE_NEW_STYLE.search(line): + error(filename, linenum, 'build/include', 4, + 'Include the directory when naming .h files') + + # we shouldn't include a file more than once. actually, there are a + # handful of instances where doing so is okay, but in general it's + # not. + match = _RE_PATTERN_INCLUDE.search(line) + if match: + include = match.group(2) + is_system = (match.group(1) == '<') + if include in include_state: + error(filename, linenum, 'build/include', 4, + '"%s" already included at %s:%s' % + (include, filename, include_state[include])) + else: + include_state[include] = linenum + + # We want to ensure that headers appear in the right order: + # 1) for foo.cc, foo.h (preferred location) + # 2) c system files + # 3) cpp system files + # 4) for foo.cc, foo.h (deprecated location) + # 5) other google headers + # + # We classify each include statement as one of those 5 types + # using a number of techniques. The include_state object keeps + # track of the highest type seen, and complains if we see a + # lower type after that. + error_message = include_state.CheckNextIncludeOrder( + _ClassifyInclude(fileinfo, include, is_system)) + if error_message: + error(filename, linenum, 'build/include_order', 4, + '%s. Should be: %s.h, c system, c++ system, other.' 
+            % (error_message, fileinfo.BaseName()))
+    canonical_include = include_state.CanonicalizeAlphabeticalOrder(include)
+    if not include_state.IsInAlphabeticalOrder(
+        clean_lines, linenum, canonical_include):
+      error(filename, linenum, 'build/include_alpha', 4,
+            'Include "%s" not in alphabetical order' % include)
+    include_state.SetLastHeader(canonical_include)
+
+  # Look for any of the stream classes that are part of standard C++.
+  match = _RE_PATTERN_INCLUDE.match(line)
+  if match:
+    include = match.group(2)
+    if Match(r'(f|ind|io|i|o|parse|pf|stdio|str|)?stream$', include):
+      # Many unit tests use cout, so we exempt them.
+      if not _IsTestFilename(filename):
+        error(filename, linenum, 'readability/streams', 3,
+              'Streams are highly discouraged.')
+
+
+def _GetTextInside(text, start_pattern):
+  r"""Retrieves all the text between matching open and close parentheses.
+
+  Given a string of lines and a regular expression string, retrieve all the text
+  following the expression and between opening punctuation symbols like
+  (, [, or {, and the matching close-punctuation symbol.  This properly handles
+  nested occurrences of the punctuation, so for text like
+    printf(a(), b(c()));
+  a call to _GetTextInside(text, r'printf\(') will return 'a(), b(c())'.
+  start_pattern must match a string that ends with an opening punctuation
+  symbol.
+
+  Args:
+    text: The lines to extract text from.  Its comments and strings must be
+          elided.  It can be a single line and can span multiple lines.
+    start_pattern: The regexp string indicating where to start extracting
+                   the text.
+  Returns:
+    The extracted text.
+    None if either the opening string or ending punctuation could not be found.
+  """
+  # TODO(sugawarayu): Audit cpplint.py to see what places could be profitably
+  # rewritten to use _GetTextInside (and use inferior regexp matching today).
+
+  # Give opening punctuations to get the matching close-punctuations.
+  matching_punctuation = {'(': ')', '{': '}', '[': ']'}
+  closing_punctuation = set(matching_punctuation.itervalues())
+
+  # Find the position to start extracting text.
+  match = re.search(start_pattern, text, re.M)
+  if not match:  # start_pattern not found in text.
+    return None
+  start_position = match.end(0)
+
+  assert start_position > 0, (
+      'start_pattern must end with an opening punctuation.')
+  assert text[start_position - 1] in matching_punctuation, (
+      'start_pattern must end with an opening punctuation.')
+  # Stack of closing punctuations we expect to have in text after position.
+  punctuation_stack = [matching_punctuation[text[start_position - 1]]]
+  position = start_position
+  while punctuation_stack and position < len(text):
+    if text[position] == punctuation_stack[-1]:
+      punctuation_stack.pop()
+    elif text[position] in closing_punctuation:
+      # A closing punctuation without matching opening punctuations.
+      return None
+    elif text[position] in matching_punctuation:
+      punctuation_stack.append(matching_punctuation[text[position]])
+    position += 1
+  if punctuation_stack:
+    # Opening punctuations left without matching close-punctuations.
+    return None
+  # punctuations match.
+  return text[start_position:position - 1]
+
+
+# Patterns for matching call-by-reference parameters.
+#
+# Supports nested templates up to 2 levels deep using this messy pattern:
+#   < (?: < (?: < [^<>]*
+#               >
+#           |   [^<>] )*
+#         >
+#     |   [^<>] )*
+#   >
+_RE_PATTERN_IDENT = r'[_a-zA-Z]\w*'  # =~ [[:alpha:]][[:alnum:]]*
+_RE_PATTERN_TYPE = (
+    r'(?:const\s+)?(?:typename\s+|class\s+|struct\s+|union\s+|enum\s+)?'
+    r'(?:\w|'
+    r'\s*<(?:<(?:<[^<>]*>|[^<>])*>|[^<>])*>|'
+    r'::)+')
+# A call-by-reference parameter ends with '& identifier'.
+_RE_PATTERN_REF_PARAM = re.compile(
+    r'(' + _RE_PATTERN_TYPE + r'(?:\s*(?:\bconst\b|[*]))*\s*'
+    r'&\s*' + _RE_PATTERN_IDENT + r')\s*(?:=[^,()]+)?[,)]')
+# A call-by-const-reference parameter either ends with 'const& identifier'
+# or looks like 'const type& identifier' when 'type' is atomic.
+_RE_PATTERN_CONST_REF_PARAM = (
+    r'(?:.*\s*\bconst\s*&\s*' + _RE_PATTERN_IDENT +
+    r'|const\s+' + _RE_PATTERN_TYPE + r'\s*&\s*' + _RE_PATTERN_IDENT + r')')
+
+
+def CheckLanguage(filename, clean_lines, linenum, file_extension,
+                  include_state, nesting_state, error):
+  """Checks rules from the 'C++ language rules' section of cppguide.html.
+
+  Some of these rules are hard to test (function overloading, using
+  uint32 inappropriately), but we do the best we can.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    file_extension: The extension (without the dot) of the filename.
+    include_state: An _IncludeState instance in which the headers are inserted.
+    nesting_state: A _NestingState instance which maintains information about
+                   the current stack of nested blocks being parsed.
+    error: The function to call with any errors found.
+  """
+  # If the line is empty or consists entirely of a comment, no need to
+  # check it.
+  line = clean_lines.elided[linenum]
+  if not line:
+    return
+
+  match = _RE_PATTERN_INCLUDE.search(line)
+  if match:
+    CheckIncludeLine(filename, clean_lines, linenum, include_state, error)
+    return
+
+  # Reset include state across preprocessor directives.  This is meant
+  # to silence warnings for conditional includes.
+  if Match(r'^\s*#\s*(?:ifdef|elif|else|endif)\b', line):
+    include_state.ResetSection()
+
+  # Make Windows paths like Unix.
+  fullname = os.path.abspath(filename).replace('\\', '/')
+
+  # TODO(unknown): figure out if they're using default arguments in fn proto.
+
+  # Check to see if they're using a conversion function cast.
+  # I just try to capture the most common basic types, though there are more.
+  # Parameterless conversion functions, such as bool(), are allowed as they are
+  # probably a member operator declaration or default constructor.
+  match = Search(
+      r'(\bnew\s+)?\b'  # Grab 'new' operator, if it's there
+      r'(int|float|double|bool|char|int32|uint32|int64|uint64)'
+      r'(\([^)].*)', line)
+  if match:
+    matched_new = match.group(1)
+    matched_type = match.group(2)
+    matched_funcptr = match.group(3)
+
+    # gMock methods are defined using some variant of MOCK_METHODx(name, type)
+    # where type may be float(), int(string), etc.  Without context they are
+    # virtually indistinguishable from int(x) casts.  Likewise, gMock's
+    # MockCallback takes a template parameter of the form return_type(arg_type),
+    # which looks much like the cast we're trying to detect.
+    #
+    # std::function<> wrapper has a similar problem.
+    #
+    # Return types for function pointers also look like casts if they
+    # don't have an extra space.
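+    #
+    # Illustrative (hypothetical) lines this logic must tell apart:
+    #   int(x)                          // deprecated cast: warn below
+    #   new int(7)                      // allocation: matched_new, skip
+    #   MOCK_METHOD1(Foo, int(string))  // gMock declaration: skip
+    #   int(*fp)(int)                   // function pointer: matched_funcptr, skip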
+ if (matched_new is None and # If new operator, then this isn't a cast + not (Match(r'^\s*MOCK_(CONST_)?METHOD\d+(_T)?\(', line) or + Search(r'\bMockCallback<.*>', line) or + Search(r'\bstd::function<.*>', line)) and + not (matched_funcptr and + Match(r'\((?:[^() ]+::\s*\*\s*)?[^() ]+\)\s*\(', + matched_funcptr))): + # Try a bit harder to catch gmock lines: the only place where + # something looks like an old-style cast is where we declare the + # return type of the mocked method, and the only time when we + # are missing context is if MOCK_METHOD was split across + # multiple lines. The missing MOCK_METHOD is usually one or two + # lines back, so scan back one or two lines. + # + # It's not possible for gmock macros to appear in the first 2 + # lines, since the class head + section name takes up 2 lines. + if (linenum < 2 or + not (Match(r'^\s*MOCK_(?:CONST_)?METHOD\d+(?:_T)?\((?:\S+,)?\s*$', + clean_lines.elided[linenum - 1]) or + Match(r'^\s*MOCK_(?:CONST_)?METHOD\d+(?:_T)?\(\s*$', + clean_lines.elided[linenum - 2]))): + error(filename, linenum, 'readability/casting', 4, + 'Using deprecated casting style. ' + 'Use static_cast<%s>(...) instead' % + matched_type) + + CheckCStyleCast(filename, linenum, line, clean_lines.raw_lines[linenum], + 'static_cast', + r'\((int|float|double|bool|char|u?int(16|32|64))\)', error) + + # This doesn't catch all cases. Consider (const char * const)"hello". + # + # (char *) "foo" should always be a const_cast (reinterpret_cast won't + # compile). + if CheckCStyleCast(filename, linenum, line, clean_lines.raw_lines[linenum], + 'const_cast', r'\((char\s?\*+\s?)\)\s*"', error): + pass + else: + # Check pointer casts for other than string constants + CheckCStyleCast(filename, linenum, line, clean_lines.raw_lines[linenum], + 'reinterpret_cast', r'\((\w+\s?\*+\s?)\)', error) + + # In addition, we look for people taking the address of a cast. This + # is dangerous -- casts can assign to temporaries, so the pointer doesn't + # point where you think. + match = Search( + r'(?:&\(([^)]+)\)[\w(])|' + r'(?:&(static|dynamic|down|reinterpret)_cast\b)', line) + if match and match.group(1) != '*': + error(filename, linenum, 'runtime/casting', 4, + ('Are you taking an address of a cast? ' + 'This is dangerous: could be a temp var. ' + 'Take the address before doing the cast, rather than after')) + + # Create an extended_line, which is the concatenation of the current and + # next lines, for more effective checking of code that may span more than one + # line. + if linenum + 1 < clean_lines.NumLines(): + extended_line = line + clean_lines.elided[linenum + 1] + else: + extended_line = line + + # Check for people declaring static/global STL strings at the top level. + # This is dangerous because the C++ language does not guarantee that + # globals with constructors are initialized before the first access. + match = Match( + r'((?:|static +)(?:|const +))string +([a-zA-Z0-9_:]+)\b(.*)', + line) + # Make sure it's not a function. + # Function template specialization looks like: "string foo(...". + # Class template definitions look like: "string Foo::Method(...". + # + # Also ignore things that look like operators. These are matched separately + # because operator names cross non-word boundaries. If we change the pattern + # above, we would decrease the accuracy of matching identifiers. 
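+  # e.g. (hypothetical):
+  #   static const string kName = "foo";  // flagged by the check below
+  #   string Foo::Method(int x) {         // function definition, not flagged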
+ if (match and + not Search(r'\boperator\W', line) and + not Match(r'\s*(<.*>)?(::[a-zA-Z0-9_]+)?\s*\(([^"]|$)', match.group(3))): + error(filename, linenum, 'runtime/string', 4, + 'For a static/global string constant, use a C style string instead: ' + '"%schar %s[]".' % + (match.group(1), match.group(2))) + + if Search(r'\b([A-Za-z0-9_]*_)\(\1\)', line): + error(filename, linenum, 'runtime/init', 4, + 'You seem to be initializing a member variable with itself.') + + if file_extension == 'h': + # TODO(unknown): check that 1-arg constructors are explicit. + # How to tell it's a constructor? + # (handled in CheckForNonStandardConstructs for now) + # TODO(unknown): check that classes have DISALLOW_EVIL_CONSTRUCTORS + # (level 1 error) + pass + + # Check if people are using the verboten C basic types. The only exception + # we regularly allow is "unsigned short port" for port. + if Search(r'\bshort port\b', line): + if not Search(r'\bunsigned short port\b', line): + error(filename, linenum, 'runtime/int', 4, + 'Use "unsigned short" for ports, not "short"') + else: + match = Search(r'\b(short|long(?! +double)|long long)\b', line) + if match: + error(filename, linenum, 'runtime/int', 4, + 'Use int16/int64/etc, rather than the C type %s' % match.group(1)) + + # When snprintf is used, the second argument shouldn't be a literal. + match = Search(r'snprintf\s*\(([^,]*),\s*([0-9]*)\s*,', line) + if match and match.group(2) != '0': + # If 2nd arg is zero, snprintf is used to calculate size. + error(filename, linenum, 'runtime/printf', 3, + 'If you can, use sizeof(%s) instead of %s as the 2nd arg ' + 'to snprintf.' % (match.group(1), match.group(2))) + + # Check if some verboten C functions are being used. + if Search(r'\bsprintf\b', line): + error(filename, linenum, 'runtime/printf', 5, + 'Never use sprintf. Use snprintf instead.') + match = Search(r'\b(strcpy|strcat)\b', line) + if match: + error(filename, linenum, 'runtime/printf', 4, + 'Almost always, snprintf is better than %s' % match.group(1)) + + # Check if some verboten operator overloading is going on + # TODO(unknown): catch out-of-line unary operator&: + # class X {}; + # int operator&(const X& x) { return 42; } // unary operator& + # The trick is it's hard to tell apart from binary operator&: + # class Y { int operator&(const Y& x) { return 23; } }; // binary operator& + if Search(r'\boperator\s*&\s*\(\s*\)', line): + error(filename, linenum, 'runtime/operator', 4, + 'Unary operator& is dangerous. Do not use it.') + + # Check for suspicious usage of "if" like + # } if (a == b) { + if Search(r'\}\s*if\s*\(', line): + error(filename, linenum, 'readability/braces', 4, + 'Did you mean "else if"? If not, start a new line for "if".') + + # Check for potential format string bugs like printf(foo). + # We constrain the pattern not to pick things like DocidForPrintf(foo). + # Not perfect but it can catch printf(foo.c_str()) and printf(foo->c_str()) + # TODO(sugawarayu): Catch the following case. Need to change the calling + # convention of the whole function to process multiple line to handle it. + # printf( + # boy_this_is_a_really_long_variable_that_cannot_fit_on_the_prev_line); + printf_args = _GetTextInside(line, r'(?i)\b(string)?printf\s*\(') + if printf_args: + match = Match(r'([\w.\->()]+)$', printf_args) + if match and match.group(1) != '__VA_ARGS__': + function_name = re.search(r'\b((?:string)?printf)\s*\(', + line, re.I).group(1) + error(filename, linenum, 'runtime/printf', 4, + 'Potential format string bug. Do %s("%%s", %s) instead.' 
+            % (function_name, match.group(1)))
+
+  # Check for potential memset bugs like memset(buf, sizeof(buf), 0).
+  match = Search(r'memset\s*\(([^,]*),\s*([^,]*),\s*0\s*\)', line)
+  if match and not Match(r"^(?:''|-?[0-9]+|0x[0-9A-Fa-f]+)$", match.group(2)):
+    error(filename, linenum, 'runtime/memset', 4,
+          'Did you mean "memset(%s, 0, %s)"?'
+          % (match.group(1), match.group(2)))
+
+  if Search(r'\busing namespace\b', line):
+    error(filename, linenum, 'build/namespaces', 5,
+          'Do not use namespace using-directives.  '
+          'Use using-declarations instead.')
+
+  # Detect variable-length arrays.
+  match = Match(r'\s*(.+::)?(\w+) [a-z]\w*\[(.+)];', line)
+  if (match and match.group(2) != 'return' and match.group(2) != 'delete' and
+      match.group(3).find(']') == -1):
+    # Split the size using space and arithmetic operators as delimiters.
+    # If any of the resulting tokens are not compile time constants then
+    # report the error.
+    tokens = re.split(r'\s|\+|\-|\*|\/|<<|>>', match.group(3))
+    is_const = True
+    skip_next = False
+    for tok in tokens:
+      if skip_next:
+        skip_next = False
+        continue
+
+      if Search(r'sizeof\(.+\)', tok): continue
+      if Search(r'arraysize\(\w+\)', tok): continue
+
+      tok = tok.lstrip('(')
+      tok = tok.rstrip(')')
+      if not tok: continue
+      if Match(r'\d+', tok): continue
+      if Match(r'0[xX][0-9a-fA-F]+', tok): continue
+      if Match(r'k[A-Z0-9]\w*', tok): continue
+      if Match(r'(.+::)?k[A-Z0-9]\w*', tok): continue
+      if Match(r'(.+::)?[A-Z][A-Z0-9_]*', tok): continue
+      # A catch-all for tricky sizeof cases, including 'sizeof expression',
+      # 'sizeof(*type)', 'sizeof(const type)', 'sizeof(struct StructName)';
+      # it requires skipping the next token because we split on ' ' and '*'.
+      if tok.startswith('sizeof'):
+        skip_next = True
+        continue
+      is_const = False
+      break
+    if not is_const:
+      error(filename, linenum, 'runtime/arrays', 1,
+            'Do not use variable-length arrays.  Use an appropriately named '
+            "('k' followed by CamelCase) compile-time constant for the size.")
+
+  # If DISALLOW_EVIL_CONSTRUCTORS, DISALLOW_COPY_AND_ASSIGN, or
+  # DISALLOW_IMPLICIT_CONSTRUCTORS is present, then it should be the last thing
+  # in the class declaration.
+  match = Match(
+      (r'\s*'
+       r'(DISALLOW_(EVIL_CONSTRUCTORS|COPY_AND_ASSIGN|IMPLICIT_CONSTRUCTORS))'
+       r'\(.*\);$'),
+      line)
+  if match and linenum + 1 < clean_lines.NumLines():
+    next_line = clean_lines.elided[linenum + 1]
+    # We allow some, but not all, declarations of variables to be present
+    # in the statement that defines the class.  The [\w\*,\s]* fragment of
+    # the regular expression below allows users to declare instances of
+    # the class or pointers to instances, but not less common types such
+    # as function pointers or arrays.  It's a tradeoff between allowing
+    # reasonable code and avoiding trying to parse more C++ using regexps.
+    if not Search(r'^\s*}[\w\*,\s]*;', next_line):
+      error(filename, linenum, 'readability/constructors', 3,
+            match.group(1) + ' should be the last thing in the class')
+
+  # Check for use of unnamed namespaces in header files.  Registration
+  # macros are typically OK, so we allow use of "namespace {" on lines
+  # that end with backslashes.
+  if (file_extension == 'h'
+      and Search(r'\bnamespace\s*{', line)
+      and line[-1] != '\\'):
+    error(filename, linenum, 'build/namespaces', 4,
+          'Do not use unnamed namespaces in header files.  See '
+          'http://google-styleguide.googlecode.com/svn/trunk/cppguide.xml#Namespaces'
+          ' for more information.')
+
+
+def CheckForNonConstReference(filename, clean_lines, linenum,
+                              nesting_state, error):
+  """Check for non-const references.
+
+  Separate from CheckLanguage since it scans backwards from current
+  line, instead of scanning forward.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    nesting_state: A _NestingState instance which maintains information about
+                   the current stack of nested blocks being parsed.
+    error: The function to call with any errors found.
+  """
+  # Do nothing if there is no '&' on current line.
+  line = clean_lines.elided[linenum]
+  if '&' not in line:
+    return
+
+  # Long type names may be broken across multiple lines, usually in one
+  # of these forms:
+  #   LongType
+  #       ::LongTypeContinued &identifier
+  #   LongType::
+  #       LongTypeContinued &identifier
+  #   LongType<
+  #       ...>::LongTypeContinued &identifier
+  #
+  # If we detected a type split across two lines, join the previous
+  # line to current line so that we can match const references
+  # accordingly.
+  #
+  # Note that this only scans back one line, since scanning back
+  # arbitrary number of lines would be expensive.  If you have a type
+  # that spans more than 2 lines, please use a typedef.
+  if linenum > 1:
+    previous = None
+    if Match(r'\s*::(?:[\w<>]|::)+\s*&\s*\S', line):
+      # previous_line\n + ::current_line
+      previous = Search(r'\b((?:const\s*)?(?:[\w<>]|::)+[\w<>])\s*$',
+                        clean_lines.elided[linenum - 1])
+    elif Match(r'\s*[a-zA-Z_]([\w<>]|::)+\s*&\s*\S', line):
+      # previous_line::\n + current_line
+      previous = Search(r'\b((?:const\s*)?(?:[\w<>]|::)+::)\s*$',
+                        clean_lines.elided[linenum - 1])
+    if previous:
+      line = previous.group(1) + line.lstrip()
+    else:
+      # Check for templated parameter that is split across multiple lines
+      endpos = line.rfind('>')
+      if endpos > -1:
+        (_, startline, startpos) = ReverseCloseExpression(
+            clean_lines, linenum, endpos)
+        if startpos > -1 and startline < linenum:
+          # Found the matching < on an earlier line, collect all
+          # pieces up to current line.
+          line = ''
+          for i in xrange(startline, linenum + 1):
+            line += clean_lines.elided[i].strip()
+
+  # Check for non-const references in function parameters.  A single '&' may
+  # be found in the following places:
+  #   inside expression: binary & for bitwise AND
+  #   inside expression: unary & for taking the address of something
+  #   inside declarators: reference parameter
+  # We will exclude the first two cases by checking that we are not inside a
+  # function body, including one that was just introduced by a trailing '{'.
+  # TODO(unknown): Doesn't account for preprocessor directives.
+  # TODO(unknown): Doesn't account for 'catch(Exception& e)' [rare].
+  check_params = False
+  if not nesting_state.stack:
+    check_params = True  # top level
+  elif (isinstance(nesting_state.stack[-1], _ClassInfo) or
+        isinstance(nesting_state.stack[-1], _NamespaceInfo)):
+    check_params = True  # within class or namespace
+  elif Match(r'.*{\s*$', line):
+    if (len(nesting_state.stack) == 1 or
+        isinstance(nesting_state.stack[-2], _ClassInfo) or
+        isinstance(nesting_state.stack[-2], _NamespaceInfo)):
+      check_params = True  # just opened global/class/namespace block
+  # We allow non-const references in a few standard places, like functions
+  # called "swap()" or iostream operators like "<<" or ">>".  Do not check
+  # those function parameters.
+  #
+  # We also accept & in static_assert, which looks like a function but
+  # it's actually a declaration expression.
+  whitelisted_functions = (r'(?:[sS]wap(?:<[\w:]+>)?|'
+                           r'operator\s*[<>][<>]|'
+                           r'static_assert|COMPILE_ASSERT'
+                           r')\s*\(')
+  if Search(whitelisted_functions, line):
+    check_params = False
+  elif not Search(r'\S+\([^)]*$', line):
+    # Don't see a whitelisted function on this line.  Actually we
+    # didn't see any function name on this line, so this is likely a
+    # multi-line parameter list.  Try a bit harder to catch this case.
+    for i in xrange(2):
+      if (linenum > i and
+          Search(whitelisted_functions, clean_lines.elided[linenum - i - 1])):
+        check_params = False
+        break
+
+  if check_params:
+    decls = ReplaceAll(r'{[^}]*}', ' ', line)  # exclude function body
+    for parameter in re.findall(_RE_PATTERN_REF_PARAM, decls):
+      if not Match(_RE_PATTERN_CONST_REF_PARAM, parameter):
+        error(filename, linenum, 'runtime/references', 2,
+              'Is this a non-const reference? '
+              'If so, make const or use a pointer: ' +
+              ReplaceAll(' *<', '<', parameter))
+
+
+def CheckCStyleCast(filename, linenum, line, raw_line, cast_type, pattern,
+                    error):
+  """Checks for a C-style cast by looking for the pattern.
+
+  Args:
+    filename: The name of the current file.
+    linenum: The number of the line to check.
+    line: The line of code to check.
+    raw_line: The raw line of code to check, with comments.
+    cast_type: The string for the C++ cast to recommend.  This is either
+      reinterpret_cast, static_cast, or const_cast, depending.
+    pattern: The regular expression used to find C-style casts.
+    error: The function to call with any errors found.
+
+  Returns:
+    True if an error was emitted.
+    False otherwise.
+  """
+  match = Search(pattern, line)
+  if not match:
+    return False
+
+  # Exclude lines with sizeof, since sizeof looks like a cast.
+  sizeof_match = Match(r'.*sizeof\s*$', line[0:match.start(1) - 1])
+  if sizeof_match:
+    return False
+
+  # operator++(int) and operator--(int)
+  if (line[0:match.start(1) - 1].endswith(' operator++') or
+      line[0:match.start(1) - 1].endswith(' operator--')):
+    return False
+
+  # A single unnamed argument for a function tends to look like old
+  # style cast.  If we see those, don't issue warnings for deprecated
+  # casts, instead issue warnings for unnamed arguments where
+  # appropriate.
+  #
+  # These are things that we want warnings for, since the style guide
+  # explicitly requires all parameters to be named:
+  #   Function(int);
+  #   Function(int) {
+  #   ConstMember(int) const;
+  #   ConstMember(int) const {
+  #   ExceptionMember(int) throw (...);
+  #   ExceptionMember(int) throw (...) {
+  #   PureVirtual(int) = 0;
+  #
+  # These are functions of some sort, where the compiler would be fine
+  # if they had named parameters, but people often omit those
+  # identifiers to reduce clutter:
+  #   (FunctionPointer)(int);
+  #   (FunctionPointer)(int) = value;
+  #   Function((function_pointer_arg)(int))
+  #   ;
+  #   <(FunctionPointerTemplateArgument)(int)>;
+  remainder = line[match.end(0):]
+  if Match(r'^\s*(?:;|const\b|throw\b|=|>|\{|\))', remainder):
+    # Looks like an unnamed parameter.
+
+    # Don't warn on any kind of template arguments.
+    if Match(r'^\s*>', remainder):
+      return False
+
+    # Don't warn on assignments to function pointers, but keep warnings for
+    # unnamed parameters to pure virtual functions.  Note that this pattern
+    # will also pass on assignments of "0" to function pointers, but the
+    # preferred values for those would be "nullptr" or "NULL".
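+    # e.g. (illustrative):
+    #   (FunctionPointer)(int) = value;  // assignment: skipped below
+    #   virtual void F(int) = 0;         // pure virtual: still warned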
+    matched_zero = Match(r'^\s=\s*(\S+)\s*;', remainder)
+    if matched_zero and matched_zero.group(1) != '0':
+      return False
+
+    # Don't warn on function pointer declarations. For this we need
+    # to check what came before the "(type)" string.
+    if Match(r'.*\)\s*$', line[0:match.start(0)]):
+      return False
+
+    # Don't warn if the parameter is named with block comments, e.g.:
+    #  Function(int /*unused_param*/);
+    if '/*' in raw_line:
+      return False
+
+    # Passed all filters, issue warning here.
+    error(filename, linenum, 'readability/function', 3,
+          'All parameters should be named in a function')
+    return True
+
+  # At this point, all that should be left is actual casts.
+  error(filename, linenum, 'readability/casting', 4,
+        'Using C-style cast. Use %s<%s>(...) instead' %
+        (cast_type, match.group(1)))
+
+  return True
+
+
+_HEADERS_CONTAINING_TEMPLATES = (
+    ('<deque>', ('deque',)),
+    ('<functional>', ('unary_function', 'binary_function',
+                      'plus', 'minus', 'multiplies', 'divides', 'modulus',
+                      'negate',
+                      'equal_to', 'not_equal_to', 'greater', 'less',
+                      'greater_equal', 'less_equal',
+                      'logical_and', 'logical_or', 'logical_not',
+                      'unary_negate', 'not1', 'binary_negate', 'not2',
+                      'bind1st', 'bind2nd',
+                      'pointer_to_unary_function',
+                      'pointer_to_binary_function',
+                      'ptr_fun',
+                      'mem_fun_t', 'mem_fun', 'mem_fun1_t', 'mem_fun1_ref_t',
+                      'mem_fun_ref_t',
+                      'const_mem_fun_t', 'const_mem_fun1_t',
+                      'const_mem_fun_ref_t', 'const_mem_fun1_ref_t',
+                      'mem_fun_ref',
+                     )),
+    ('<limits>', ('numeric_limits',)),
+    ('<list>', ('list',)),
+    ('<map>', ('map', 'multimap',)),
+    ('<memory>', ('allocator',)),
+    ('<queue>', ('queue', 'priority_queue',)),
+    ('<set>', ('set', 'multiset',)),
+    ('<stack>', ('stack',)),
+    ('<string>', ('char_traits', 'basic_string',)),
+    ('<utility>', ('pair',)),
+    ('<vector>', ('vector',)),
+
+    # gcc extensions.
+    # Note: std::hash is their hash, ::hash is our hash
+    ('<hash_map>', ('hash_map', 'hash_multimap',)),
+    ('<hash_set>', ('hash_set', 'hash_multiset',)),
+    ('<slist>', ('slist',)),
+    )
+
+_RE_PATTERN_STRING = re.compile(r'\bstring\b')
+
+_re_pattern_algorithm_header = []
+for _template in ('copy', 'max', 'min', 'min_element', 'sort', 'swap',
+                  'transform'):
+  # Match max<type>(..., ...), max(..., ...), but not foo->max, foo.max or
+  # type::max().
+  _re_pattern_algorithm_header.append(
+      (re.compile(r'[^>.]\b' + _template + r'(<.*?>)?\([^\)]'),
+       _template,
+       '<algorithm>'))
+
+_re_pattern_templates = []
+for _header, _templates in _HEADERS_CONTAINING_TEMPLATES:
+  for _template in _templates:
+    _re_pattern_templates.append(
+        (re.compile(r'(\<|\b)' + _template + r'\s*\<'),
+         _template + '<>',
+         _header))
+
+
+def FilesBelongToSameModule(filename_cc, filename_h):
+  """Check if these two filenames belong to the same module.
+
+  The concept of a 'module' here is as follows:
+  foo.h, foo-inl.h, foo.cc, foo_test.cc and foo_unittest.cc belong to the
+  same 'module' if they are in the same directory.
+  some/path/public/xyzzy and some/path/internal/xyzzy are also considered
+  to belong to the same module here.
+
+  If the filename_cc contains a longer path than the filename_h, for example,
+  '/absolute/path/to/base/sysinfo.cc', and this file would include
+  'base/sysinfo.h', this function also produces the prefix needed to open the
+  header. This is used by the caller of this function to more robustly open the
+  header file. We don't have access to the real include paths in this context,
+  so we need this guesswork here.
+
+  Known bugs: tools/base/bar.cc and base/bar.h belong to the same module
+  according to this implementation. Because of this, this function gives
+  some false positives. This should be sufficiently rare in practice.
+
+  Args:
+    filename_cc: is the path for the .cc file
+    filename_h: is the path for the header path
+
+  Returns:
+    Tuple with a bool and a string:
+    bool: True if filename_cc and filename_h belong to the same module.
+    string: the additional prefix needed to open the header file.
+  """
+
+  if not filename_cc.endswith('.cc'):
+    return (False, '')
+  filename_cc = filename_cc[:-len('.cc')]
+  if filename_cc.endswith('_unittest'):
+    filename_cc = filename_cc[:-len('_unittest')]
+  elif filename_cc.endswith('_test'):
+    filename_cc = filename_cc[:-len('_test')]
+  filename_cc = filename_cc.replace('/public/', '/')
+  filename_cc = filename_cc.replace('/internal/', '/')
+
+  if not filename_h.endswith('.h'):
+    return (False, '')
+  filename_h = filename_h[:-len('.h')]
+  if filename_h.endswith('-inl'):
+    filename_h = filename_h[:-len('-inl')]
+  filename_h = filename_h.replace('/public/', '/')
+  filename_h = filename_h.replace('/internal/', '/')
+
+  files_belong_to_same_module = filename_cc.endswith(filename_h)
+  common_path = ''
+  if files_belong_to_same_module:
+    common_path = filename_cc[:-len(filename_h)]
+  return files_belong_to_same_module, common_path
+
+
+def UpdateIncludeState(filename, include_state, io=codecs):
+  """Fill up the include_state with new includes found from the file.
+
+  Args:
+    filename: the name of the header to read.
+    include_state: an _IncludeState instance in which the headers are inserted.
+    io: The io factory to use to read the file. Provided for testability.
+
+  Returns:
+    True if a header was successfully added. False otherwise.
+  """
+  headerfile = None
+  try:
+    headerfile = io.open(filename, 'r', 'utf8', 'replace')
+  except IOError:
+    return False
+  linenum = 0
+  for line in headerfile:
+    linenum += 1
+    clean_line = CleanseComments(line)
+    match = _RE_PATTERN_INCLUDE.search(clean_line)
+    if match:
+      include = match.group(2)
+      # The value formatting is cute, but not really used right now.
+      # What matters here is that the key is in include_state.
+      include_state.setdefault(include, '%s:%d' % (filename, linenum))
+  return True
+
+
+def CheckForIncludeWhatYouUse(filename, clean_lines, include_state, error,
+                              io=codecs):
+  """Reports for missing stl includes.
+
+  This function will output warnings to make sure you are including the headers
+  necessary for the stl containers and functions that you use. We only give one
+  reason to include a header. For example, if you use both equal_to<> and
+  less<> in a .h file, only one (the latter in the file) of these will be
+  reported as a reason to include the <functional>.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    include_state: An _IncludeState instance.
+    error: The function to call with any errors found.
+    io: The IO factory to use to read the header file. Provided for unittest
+        injection.
+  """
+  required = {}  # A map of header name to linenumber and the template entity.
+                 # Example of required: { '<functional>': (1219, 'less<>') }
+
+  for linenum in xrange(clean_lines.NumLines()):
+    line = clean_lines.elided[linenum]
+    if not line or line[0] == '#':
+      continue
+
+    # String is special -- it is a non-templatized type in STL.
+    matched = _RE_PATTERN_STRING.search(line)
+    if matched:
+      # Don't warn about strings in non-STL namespaces:
+      # (We check only the first match per line; good enough.)
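+      # For example (illustrative), "std::string s;" and a bare
+      # "string s;" both end up requiring <string> below, while
+      # "mylib::string s;" does not, since its prefix ends in '::'
+      # without being 'std::'.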
+      prefix = line[:matched.start()]
+      if prefix.endswith('std::') or not prefix.endswith('::'):
+        required['<string>'] = (linenum, 'string')
+
+    for pattern, template, header in _re_pattern_algorithm_header:
+      if pattern.search(line):
+        required[header] = (linenum, template)
+
+    # The following function is just a speed up, no semantics are changed.
+    if not '<' in line:  # Reduces the cpu time usage by skipping lines.
+      continue
+
+    for pattern, template, header in _re_pattern_templates:
+      if pattern.search(line):
+        required[header] = (linenum, template)
+
+  # The policy is that if you #include something in foo.h you don't need to
+  # include it again in foo.cc. Here, we will look at possible includes.
+  # Let's copy the include_state so it is only messed up within this function.
+  include_state = include_state.copy()
+
+  # Did we find the header for this file (if any) and successfully load it?
+  header_found = False
+
+  # Use the absolute path so that matching works properly.
+  abs_filename = FileInfo(filename).FullName()
+
+  # For Emacs's flymake.
+  # If cpplint is invoked from Emacs's flymake, a temporary file is generated
+  # by flymake and that file name might end with '_flymake.cc'. In that case,
+  # restore original file name here so that the corresponding header file can be
+  # found.
+  # e.g. If the file name is 'foo_flymake.cc', we should search for 'foo.h'
+  # instead of 'foo_flymake.h'
+  abs_filename = re.sub(r'_flymake\.cc$', '.cc', abs_filename)
+
+  # include_state is modified during iteration, so we iterate over a copy of
+  # the keys.
+  header_keys = include_state.keys()
+  for header in header_keys:
+    (same_module, common_path) = FilesBelongToSameModule(abs_filename, header)
+    fullpath = common_path + header
+    if same_module and UpdateIncludeState(fullpath, include_state, io):
+      header_found = True
+
+  # If we can't find the header file for a .cc, assume it's because we don't
+  # know where to look. In that case we'll give up as we're not sure they
+  # didn't include it in the .h file.
+  # TODO(unknown): Do a better job of finding .h files so we are confident that
+  # not having the .h file means there isn't one.
+  if filename.endswith('.cc') and not header_found:
+    return
+
+  # All the lines have been processed, report the errors found.
+  for required_header_unstripped in required:
+    template = required[required_header_unstripped][1]
+    if required_header_unstripped.strip('<>"') not in include_state:
+      error(filename, required[required_header_unstripped][0],
+            'build/include_what_you_use', 4,
+            'Add #include ' + required_header_unstripped + ' for ' + template)
+
+
+_RE_PATTERN_EXPLICIT_MAKEPAIR = re.compile(r'\bmake_pair\s*<')
+
+
+def CheckMakePairUsesDeduction(filename, clean_lines, linenum, error):
+  """Check that make_pair's template arguments are deduced.
+
+  G++ 4.6 in C++0x mode fails badly if make_pair's template arguments are
+  specified explicitly, and such use isn't intended in any case.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    error: The function to call with any errors found.
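+
+  For example (illustrative), 'make_pair<int, int>(1, 2)' is flagged;
+  'make_pair(1, 2)' or 'std::pair<int, int>(1, 2)' both pass.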
+ """ + line = clean_lines.elided[linenum] + match = _RE_PATTERN_EXPLICIT_MAKEPAIR.search(line) + if match: + error(filename, linenum, 'build/explicit_make_pair', + 4, # 4 = high confidence + 'For C++11-compatibility, omit template arguments from make_pair' + ' OR use pair directly OR if appropriate, construct a pair directly') + + +def ProcessLine(filename, file_extension, clean_lines, line, + include_state, function_state, nesting_state, error, + extra_check_functions=[]): + """Processes a single line in the file. + + Args: + filename: Filename of the file that is being processed. + file_extension: The extension (dot not included) of the file. + clean_lines: An array of strings, each representing a line of the file, + with comments stripped. + line: Number of line being processed. + include_state: An _IncludeState instance in which the headers are inserted. + function_state: A _FunctionState instance which counts function lines, etc. + nesting_state: A _NestingState instance which maintains information about + the current stack of nested blocks being parsed. + error: A callable to which errors are reported, which takes 4 arguments: + filename, line number, error level, and message + extra_check_functions: An array of additional check functions that will be + run on each source line. Each function takes 4 + arguments: filename, clean_lines, line, error + """ + raw_lines = clean_lines.raw_lines + ParseNolintSuppressions(filename, raw_lines[line], line, error) + nesting_state.Update(filename, clean_lines, line, error) + if nesting_state.stack and nesting_state.stack[-1].inline_asm != _NO_ASM: + return + CheckForFunctionLengths(filename, clean_lines, line, function_state, error) + CheckForMultilineCommentsAndStrings(filename, clean_lines, line, error) + CheckStyle(filename, clean_lines, line, file_extension, nesting_state, error) + CheckLanguage(filename, clean_lines, line, file_extension, include_state, + nesting_state, error) + CheckForNonConstReference(filename, clean_lines, line, nesting_state, error) + CheckForNonStandardConstructs(filename, clean_lines, line, + nesting_state, error) + CheckVlogArguments(filename, clean_lines, line, error) + CheckPosixThreading(filename, clean_lines, line, error) + CheckInvalidIncrement(filename, clean_lines, line, error) + CheckMakePairUsesDeduction(filename, clean_lines, line, error) + for check_fn in extra_check_functions: + check_fn(filename, clean_lines, line, error) + +def ProcessFileData(filename, file_extension, lines, error, + extra_check_functions=[]): + """Performs lint checks and reports any errors to the given error function. + + Args: + filename: Filename of the file that is being processed. + file_extension: The extension (dot not included) of the file. + lines: An array of strings, each representing a line of the file, with the + last element being empty if the file is terminated with a newline. + error: A callable to which errors are reported, which takes 4 arguments: + filename, line number, error level, and message + extra_check_functions: An array of additional check functions that will be + run on each source line. 
Each function takes 4 + arguments: filename, clean_lines, line, error + """ + lines = (['// marker so line numbers and indices both start at 1'] + lines + + ['// marker so line numbers end in a known way']) + + include_state = _IncludeState() + function_state = _FunctionState() + nesting_state = _NestingState() + + ResetNolintSuppressions() + + CheckForCopyright(filename, lines, error) + + if file_extension == 'h': + CheckForHeaderGuard(filename, lines, error) + + RemoveMultiLineComments(filename, lines, error) + clean_lines = CleansedLines(lines) + for line in xrange(clean_lines.NumLines()): + ProcessLine(filename, file_extension, clean_lines, line, + include_state, function_state, nesting_state, error, + extra_check_functions) + nesting_state.CheckCompletedBlocks(filename, error) + + CheckForIncludeWhatYouUse(filename, clean_lines, include_state, error) + + # We check here rather than inside ProcessLine so that we see raw + # lines rather than "cleaned" lines. + CheckForBadCharacters(filename, lines, error) + + CheckForNewlineAtEOF(filename, lines, error) + +def ProcessFile(filename, vlevel, extra_check_functions=[]): + """Does google-lint on a single file. + + Args: + filename: The name of the file to parse. + + vlevel: The level of errors to report. Every error of confidence + >= verbose_level will be reported. 0 is a good default. + + extra_check_functions: An array of additional check functions that will be + run on each source line. Each function takes 4 + arguments: filename, clean_lines, line, error + """ + + _SetVerboseLevel(vlevel) + + try: + # Support the UNIX convention of using "-" for stdin. Note that + # we are not opening the file with universal newline support + # (which codecs doesn't support anyway), so the resulting lines do + # contain trailing '\r' characters if we are reading a file that + # has CRLF endings. + # If after the split a trailing '\r' is present, it is removed + # below. If it is not expected to be present (i.e. os.linesep != + # '\r\n' as in Windows), a warning is issued below if this file + # is processed. + + if filename == '-': + lines = codecs.StreamReaderWriter(sys.stdin, + codecs.getreader('utf8'), + codecs.getwriter('utf8'), + 'replace').read().split('\n') + else: + lines = codecs.open(filename, 'r', 'utf8', 'replace').read().split('\n') + + carriage_return_found = False + # Remove trailing '\r'. + for linenum in range(len(lines)): + if lines[linenum].endswith('\r'): + lines[linenum] = lines[linenum].rstrip('\r') + carriage_return_found = True + + except IOError: + sys.stderr.write( + "Skipping input '%s': Can't open for reading\n" % filename) + return + + # Note, if no dot is found, this will give the entire filename as the ext. + file_extension = filename[filename.rfind('.') + 1:] + + # When reading from stdin, the extension is unknown, so no cpplint tests + # should rely on the extension. + if filename != '-' and file_extension not in _valid_extensions: + sys.stderr.write('Ignoring %s; not a valid file name ' + '(%s)\n' % (filename, ', '.join(_valid_extensions))) + else: + ProcessFileData(filename, file_extension, lines, Error, + extra_check_functions) + if carriage_return_found and os.linesep != '\r\n': + # Use 0 for linenum since outputting only one error for potentially + # several lines. 
+      Error(filename, 0, 'whitespace/newline', 1,
+            'One or more unexpected \\r (^M) found;'
+            'better to use only a \\n')
+
+  sys.stderr.write('Done processing %s\n' % filename)
+
+
+def PrintUsage(message):
+  """Prints a brief usage string and exits, optionally with an error message.
+
+  Args:
+    message: The optional error message.
+  """
+  sys.stderr.write(_USAGE)
+  if message:
+    sys.exit('\nFATAL ERROR: ' + message)
+  else:
+    sys.exit(1)
+
+
+def PrintCategories():
+  """Prints a list of all the error-categories used by error messages.
+
+  These are the categories used to filter messages via --filter.
+  """
+  sys.stderr.write(''.join('  %s\n' % cat for cat in _ERROR_CATEGORIES))
+  sys.exit(0)
+
+
+def ParseArguments(args):
+  """Parses the command line arguments.
+
+  This may set the output format and verbosity level as side-effects.
+
+  Args:
+    args: The command line arguments:
+
+  Returns:
+    The list of filenames to lint.
+  """
+  try:
+    (opts, filenames) = getopt.getopt(args, '', ['help', 'output=', 'verbose=',
+                                                 'counting=',
+                                                 'filter=',
+                                                 'root=',
+                                                 'linelength=',
+                                                 'extensions='])
+  except getopt.GetoptError:
+    PrintUsage('Invalid arguments.')
+
+  verbosity = _VerboseLevel()
+  output_format = _OutputFormat()
+  filters = ''
+  counting_style = ''
+
+  for (opt, val) in opts:
+    if opt == '--help':
+      PrintUsage(None)
+    elif opt == '--output':
+      if val not in ('emacs', 'vs7', 'eclipse'):
+        PrintUsage('The only allowed output formats are emacs, vs7 and eclipse.')
+      output_format = val
+    elif opt == '--verbose':
+      verbosity = int(val)
+    elif opt == '--filter':
+      filters = val
+      if not filters:
+        PrintCategories()
+    elif opt == '--counting':
+      if val not in ('total', 'toplevel', 'detailed'):
+        PrintUsage('Valid counting options are total, toplevel, and detailed')
+      counting_style = val
+    elif opt == '--root':
+      global _root
+      _root = val
+    elif opt == '--linelength':
+      global _line_length
+      try:
+        _line_length = int(val)
+      except ValueError:
+        PrintUsage('Line length must be digits.')
+    elif opt == '--extensions':
+      global _valid_extensions
+      try:
+        _valid_extensions = set(val.split(','))
+      except ValueError:
+        PrintUsage('Extensions must be comma separated list.')
+
+  if not filenames:
+    PrintUsage('No files were specified.')
+
+  _SetOutputFormat(output_format)
+  _SetVerboseLevel(verbosity)
+  _SetFilters(filters)
+  _SetCountingStyle(counting_style)
+
+  return filenames
+
+
+def main():
+  filenames = ParseArguments(sys.argv[1:])
+
+  # Change stderr to write with replacement characters so we don't die
+  # if we try to print something containing non-ASCII characters.
+  sys.stderr = codecs.StreamReaderWriter(sys.stderr,
+                                         codecs.getreader('utf8'),
+                                         codecs.getwriter('utf8'),
+                                         'replace')
+
+  _cpplint_state.ResetErrorCounts()
+  for filename in filenames:
+    ProcessFile(filename, _cpplint_state.verbose_level)
+  _cpplint_state.PrintErrorCounts()
+
+  sys.exit(_cpplint_state.error_count > 0)
+
+
+if __name__ == '__main__':
+  main()
diff --git a/linters/lint_engine/FacebookFbcodeLintEngine.php b/linters/lint_engine/FacebookFbcodeLintEngine.php
new file mode 100644
index 0000000000..cb9cf9bdba
--- /dev/null
+++ b/linters/lint_engine/FacebookFbcodeLintEngine.php
@@ -0,0 +1,147 @@
+<?php
+
+class FacebookFbcodeLintEngine extends ArcanistLintEngine {
+
+  public function buildLinters() {
+    $linters = array();
+    $paths = $this->getPaths();
+
+    // Remove all deleted files, which are not checked by the
+    // following linters.
+ foreach ($paths as $key => $path) { + if (!Filesystem::pathExists($this->getFilePathOnDisk($path))) { + unset($paths[$key]); + } + } + + $generated_linter = new ArcanistGeneratedLinter(); + $linters[] = $generated_linter; + + $nolint_linter = new ArcanistNoLintLinter(); + $linters[] = $nolint_linter; + + $text_linter = new ArcanistTextLinter(); + $text_linter->setCustomSeverityMap(array( + ArcanistTextLinter::LINT_LINE_WRAP + => ArcanistLintSeverity::SEVERITY_ADVICE, + )); + $linters[] = $text_linter; + + $java_text_linter = new ArcanistTextLinter(); + $java_text_linter->setMaxLineLength(100); + $java_text_linter->setCustomSeverityMap(array( + ArcanistTextLinter::LINT_LINE_WRAP + => ArcanistLintSeverity::SEVERITY_ADVICE, + )); + $linters[] = $java_text_linter; + + $pep8_options = $this->getPEP8WithTextOptions().',E302'; + + $python_linter = new ArcanistPEP8Linter(); + $python_linter->setConfig(array('options' => $pep8_options)); + $linters[] = $python_linter; + + $python_2space_linter = new ArcanistPEP8Linter(); + $python_2space_linter->setConfig(array('options' => $pep8_options.',E111')); + $linters[] = $python_2space_linter; + + // Currently we can't run cpplint in commit hook mode, because it + // depends on having access to the working directory. + if (!$this->getCommitHookMode()) { + $cpp_linters = array(); + $google_linter = new ArcanistCpplintLinter(); + $google_linter->setConfig(array( + 'lint.cpplint.prefix' => '', + 'lint.cpplint.bin' => 'cpplint', + )); + $cpp_linters[] = $linters[] = $google_linter; + $cpp_linters[] = $linters[] = new FbcodeCppLinter(); + $cpp_linters[] = $linters[] = new PfffCppLinter(); + } + + $spelling_linter = new ArcanistSpellingLinter(); + $linters[] = $spelling_linter; + + foreach ($paths as $path) { + $is_text = false; + + $text_extensions = ( + '/\.('. + 'cpp|cxx|c|cc|h|hpp|hxx|tcc|'. + 'py|rb|hs|pl|pm|tw|'. + 'php|phpt|css|js|'. + 'java|'. + 'thrift|'. + 'lua|'. + 'siv|'. + 'txt'. 
+        ')$/'
+      );
+      if (preg_match($text_extensions, $path)) {
+        $is_text = true;
+      }
+      if ($is_text) {
+        $nolint_linter->addPath($path);
+
+        $generated_linter->addPath($path);
+        $generated_linter->addData($path, $this->loadData($path));
+
+        if (preg_match('/\.java$/', $path)) {
+          $java_text_linter->addPath($path);
+          $java_text_linter->addData($path, $this->loadData($path));
+        } else {
+          $text_linter->addPath($path);
+          $text_linter->addData($path, $this->loadData($path));
+        }
+
+        $spelling_linter->addPath($path);
+        $spelling_linter->addData($path, $this->loadData($path));
+      }
+      if (preg_match('/\.(cpp|c|cc|cxx|h|hh|hpp|hxx|tcc)$/', $path)) {
+        foreach ($cpp_linters as &$linter) {
+          $linter->addPath($path);
+          $linter->addData($path, $this->loadData($path));
+        }
+      }
+
+      // Match *.py and contbuild config files
+      if (preg_match('/(\.(py|tw|smcprops)|^contbuild\/configs\/[^\/]*)$/',
+                     $path)) {
+        $space_count = 4;
+        $real_path = $this->getFilePathOnDisk($path);
+        $dir = dirname($real_path);
+        do {
+          if (file_exists($dir.'/.python2space')) {
+            $space_count = 2;
+            break;
+          }
+          $dir = dirname($dir);
+        } while ($dir != '/' && $dir != '.');
+
+        if ($space_count == 4) {
+          $cur_path_linter = $python_linter;
+        } else {
+          $cur_path_linter = $python_2space_linter;
+        }
+        $cur_path_linter->addPath($path);
+        $cur_path_linter->addData($path, $this->loadData($path));
+
+        if (preg_match('/\.tw$/', $path)) {
+          $cur_path_linter->setCustomSeverityMap(array(
+            'E251' => ArcanistLintSeverity::SEVERITY_DISABLED,
+          ));
+        }
+      }
+    }
+
+    $name_linter = new ArcanistFilenameLinter();
+    $linters[] = $name_linter;
+    foreach ($paths as $path) {
+      $name_linter->addPath($path);
+    }
+
+    return $linters;
+  }
+
+}
diff --git a/port/README b/port/README
new file mode 100644
index 0000000000..422563e25c
--- /dev/null
+++ b/port/README
@@ -0,0 +1,10 @@
+This directory contains interfaces and implementations that isolate the
+rest of the package from platform details.
+
+Code in the rest of the package includes "port.h" from this directory.
+"port.h" in turn includes a platform specific "port_<platform>.h" file
+that provides the platform specific implementation.
+
+See port_posix.h for an example of what must be provided in a platform
+specific header file.
+
diff --git a/port/atomic_pointer.h b/port/atomic_pointer.h
new file mode 100644
index 0000000000..db3580bded
--- /dev/null
+++ b/port/atomic_pointer.h
@@ -0,0 +1,157 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+// AtomicPointer provides storage for a lock-free pointer.
+// Platform-dependent implementation of AtomicPointer:
+// - If the platform provides a cheap barrier, we use it with raw pointers
+// - If cstdatomic is present (on newer versions of gcc, it is), we use
+//   a cstdatomic-based AtomicPointer. However we prefer the memory
+//   barrier based version, because at least on a gcc 4.4 32-bit build
+//   on linux, we have encountered a buggy <cstdatomic>
+//   implementation. Also, some <cstdatomic> implementations are much
+//   slower than a memory-barrier based implementation (~16ns for
+//   <cstdatomic> based acquire-load vs.
~1ns for a barrier based
+//   acquire-load).
+// This code is based on atomicops-internals-* in Google's perftools:
+// http://code.google.com/p/google-perftools/source/browse/#svn%2Ftrunk%2Fsrc%2Fbase
+
+#ifndef PORT_ATOMIC_POINTER_H_
+#define PORT_ATOMIC_POINTER_H_
+
+#include <stdint.h>
+#ifdef ROCKSDB_ATOMIC_PRESENT
+#include <atomic>
+#endif
+#ifdef OS_WIN
+#include <windows.h>
+#endif
+#ifdef OS_MACOSX
+#include <libkern/OSAtomic.h>
+#endif
+
+#if defined(_M_X64) || defined(__x86_64__)
+#define ARCH_CPU_X86_FAMILY 1
+#elif defined(_M_IX86) || defined(__i386__) || defined(__i386)
+#define ARCH_CPU_X86_FAMILY 1
+#elif defined(__ARMEL__)
+#define ARCH_CPU_ARM_FAMILY 1
+#endif
+
+namespace rocksdb {
+namespace port {
+
+// Define MemoryBarrier() if available
+// Windows on x86
+#if defined(OS_WIN) && defined(COMPILER_MSVC) && defined(ARCH_CPU_X86_FAMILY)
+// windows.h already provides a MemoryBarrier(void) macro
+// http://msdn.microsoft.com/en-us/library/ms684208(v=vs.85).aspx
+#define ROCKSDB_HAVE_MEMORY_BARRIER
+
+// Gcc on x86
+#elif defined(ARCH_CPU_X86_FAMILY) && defined(__GNUC__)
+inline void MemoryBarrier() {
+  // See http://gcc.gnu.org/ml/gcc/2003-04/msg01180.html for a discussion on
+  // this idiom. Also see http://en.wikipedia.org/wiki/Memory_ordering.
+  __asm__ __volatile__("" : : : "memory");
+}
+#define ROCKSDB_HAVE_MEMORY_BARRIER
+
+// Sun Studio
+#elif defined(ARCH_CPU_X86_FAMILY) && defined(__SUNPRO_CC)
+inline void MemoryBarrier() {
+  // See http://gcc.gnu.org/ml/gcc/2003-04/msg01180.html for a discussion on
+  // this idiom. Also see http://en.wikipedia.org/wiki/Memory_ordering.
+  asm volatile("" : : : "memory");
+}
+#define ROCKSDB_HAVE_MEMORY_BARRIER
+
+// Mac OS
+#elif defined(OS_MACOSX)
+inline void MemoryBarrier() {
+  OSMemoryBarrier();
+}
+#define ROCKSDB_HAVE_MEMORY_BARRIER
+
+// ARM Linux
+#elif defined(ARCH_CPU_ARM_FAMILY) && defined(__linux__)
+typedef void (*LinuxKernelMemoryBarrierFunc)(void);
+// The Linux ARM kernel provides a highly optimized device-specific memory
+// barrier function at a fixed memory address that is mapped in every
+// user-level process.
+//
+// This beats using CPU-specific instructions which are, on single-core
+// devices, un-necessary and very costly (e.g. ARMv7-A "dmb" takes more
+// than 180ns on a Cortex-A8 like the one on a Nexus One). Benchmarking
+// shows that the extra function call cost is completely negligible on
+// multi-core devices.
+//
+inline void MemoryBarrier() {
+  (*(LinuxKernelMemoryBarrierFunc)0xffff0fa0)();
+}
+#define ROCKSDB_HAVE_MEMORY_BARRIER
+
+#endif
+
+// AtomicPointer built using platform-specific MemoryBarrier()
+#if defined(ROCKSDB_HAVE_MEMORY_BARRIER)
+class AtomicPointer {
+ private:
+  void* rep_;
+ public:
+  AtomicPointer() { }
+  explicit AtomicPointer(void* p) : rep_(p) {}
+  inline void* NoBarrier_Load() const { return rep_; }
+  inline void NoBarrier_Store(void* v) { rep_ = v; }
+  inline void* Acquire_Load() const {
+    void* result = rep_;
+    MemoryBarrier();
+    return result;
+  }
+  inline void Release_Store(void* v) {
+    MemoryBarrier();
+    rep_ = v;
+  }
+};
+
+// AtomicPointer based on <cstdatomic>
+#elif defined(ROCKSDB_ATOMIC_PRESENT)
+class AtomicPointer {
+ private:
+  std::atomic<void*> rep_;
+ public:
+  AtomicPointer() { }
+  explicit AtomicPointer(void* v) : rep_(v) { }
+  inline void* Acquire_Load() const {
+    return rep_.load(std::memory_order_acquire);
+  }
+  inline void Release_Store(void* v) {
+    rep_.store(v, std::memory_order_release);
+  }
+  inline void* NoBarrier_Load() const {
+    return rep_.load(std::memory_order_relaxed);
+  }
+  inline void NoBarrier_Store(void* v) {
+    rep_.store(v, std::memory_order_relaxed);
+  }
+};
+
+// We have neither MemoryBarrier(), nor <cstdatomic>
+#else
+#error Please implement AtomicPointer for this platform.
+
+#endif
+
+#undef ROCKSDB_HAVE_MEMORY_BARRIER
+#undef ARCH_CPU_X86_FAMILY
+#undef ARCH_CPU_ARM_FAMILY
+
+}  // namespace port
+}  // namespace rocksdb
+
+#endif  // PORT_ATOMIC_POINTER_H_
diff --git a/port/likely.h b/port/likely.h
new file mode 100644
index 0000000000..ede0df5a15
--- /dev/null
+++ b/port/likely.h
@@ -0,0 +1,21 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef PORT_LIKELY_H_
+#define PORT_LIKELY_H_
+
+#if defined(__GNUC__) && __GNUC__ >= 4
+#define LIKELY(x) (__builtin_expect((x), 1))
+#define UNLIKELY(x) (__builtin_expect((x), 0))
+#else
+#define LIKELY(x) (x)
+#define UNLIKELY(x) (x)
+#endif
+
+#endif  // PORT_LIKELY_H_
diff --git a/port/port.h b/port/port.h
new file mode 100644
index 0000000000..2dc9a0fa64
--- /dev/null
+++ b/port/port.h
@@ -0,0 +1,22 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_PORT_PORT_H_
+#define STORAGE_LEVELDB_PORT_PORT_H_
+
+#include <string.h>
+
+// Include the appropriate platform specific file below. If you are
+// porting to a new platform, see "port_example.h" for documentation
+// of what the new port_<platform>.h file must provide.
+#if defined(ROCKSDB_PLATFORM_POSIX) +# include "port/port_posix.h" +#endif + +#endif // STORAGE_LEVELDB_PORT_PORT_H_ diff --git a/port/port_example.h b/port/port_example.h new file mode 100644 index 0000000000..f124abb068 --- /dev/null +++ b/port/port_example.h @@ -0,0 +1,133 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// This file contains the specification, but not the implementations, +// of the types/operations/etc. that should be defined by a platform +// specific port_.h file. Use this file as a reference for +// how to port this package to a new platform. + +#ifndef STORAGE_LEVELDB_PORT_PORT_EXAMPLE_H_ +#define STORAGE_LEVELDB_PORT_PORT_EXAMPLE_H_ + +namespace rocksdb { +namespace port { + +// TODO(jorlow): Many of these belong more in the environment class rather than +// here. We should try moving them and see if it affects perf. + +// The following boolean constant must be true on a little-endian machine +// and false otherwise. +static const bool kLittleEndian = true /* or some other expression */; + +// ------------------ Threading ------------------- + +// A Mutex represents an exclusive lock. +class Mutex { + public: + Mutex(); + ~Mutex(); + + // Lock the mutex. Waits until other lockers have exited. + // Will deadlock if the mutex is already locked by this thread. + void Lock(); + + // Unlock the mutex. + // REQUIRES: This mutex was locked by this thread. + void Unlock(); + + // Optionally crash if this thread does not hold this mutex. + // The implementation must be fast, especially if NDEBUG is + // defined. The implementation is allowed to skip all checks. + void AssertHeld(); +}; + +class CondVar { + public: + explicit CondVar(Mutex* mu); + ~CondVar(); + + // Atomically release *mu and block on this condition variable until + // either a call to SignalAll(), or a call to Signal() that picks + // this thread to wakeup. + // REQUIRES: this thread holds *mu + void Wait(); + + // If there are some threads waiting, wake up at least one of them. + void Signal(); + + // Wake up all waiting threads. + void SignallAll(); +}; + +// Thread-safe initialization. +// Used as follows: +// static port::OnceType init_control = LEVELDB_ONCE_INIT; +// static void Initializer() { ... do something ...; } +// ... +// port::InitOnce(&init_control, &Initializer); +typedef intptr_t OnceType; +#define LEVELDB_ONCE_INIT 0 +extern void InitOnce(port::OnceType*, void (*initializer)()); + +// A type that holds a pointer that can be read or written atomically +// (i.e., without word-tearing.) +class AtomicPointer { + private: + intptr_t rep_; + public: + // Initialize to arbitrary value + AtomicPointer(); + + // Initialize to hold v + explicit AtomicPointer(void* v) : rep_(v) { } + + // Read and return the stored pointer with the guarantee that no + // later memory access (read or write) by this thread can be + // reordered ahead of this read. + void* Acquire_Load() const; + + // Set v as the stored pointer with the guarantee that no earlier + // memory access (read or write) by this thread can be reordered + // after this store. 
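+  //
+  // (Illustrative) e.g. a writer can publish a fully-initialized object:
+  //      ptr.Release_Store(obj);
+  // and a reader that obtains obj via ptr.Acquire_Load() is then
+  // guaranteed to observe the writer's earlier initialization of *obj.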
+ void Release_Store(void* v); + + // Read the stored pointer with no ordering guarantees. + void* NoBarrier_Load() const; + + // Set va as the stored pointer with no ordering guarantees. + void NoBarrier_Store(void* v); +}; + +// ------------------ Compression ------------------- + +// Store the snappy compression of "input[0,input_length-1]" in *output. +// Returns false if snappy is not supported by this port. +extern bool Snappy_Compress(const char* input, size_t input_length, + std::string* output); + +// If input[0,input_length-1] looks like a valid snappy compressed +// buffer, store the size of the uncompressed data in *result and +// return true. Else return false. +extern bool Snappy_GetUncompressedLength(const char* input, size_t length, + size_t* result); + +// Attempt to snappy uncompress input[0,input_length-1] into *output. +// Returns true if successful, false if the input is invalid lightweight +// compressed data. +// +// REQUIRES: at least the first "n" bytes of output[] must be writable +// where "n" is the result of a successful call to +// Snappy_GetUncompressedLength. +extern bool Snappy_Uncompress(const char* input_data, size_t input_length, + char* output); + +} // namespace port +} // namespace rocksdb + +#endif // STORAGE_LEVELDB_PORT_PORT_EXAMPLE_H_ diff --git a/port/port_posix.cc b/port/port_posix.cc new file mode 100644 index 0000000000..911cebdf2d --- /dev/null +++ b/port/port_posix.cc @@ -0,0 +1,109 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
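+//
+// (Illustrative) the primitives defined here follow the usual monitor
+// pattern, e.g.:
+//      port::Mutex mu;
+//      port::CondVar cv(&mu);
+//      ...
+//      mu.Lock();
+//      while (!done) {
+//        cv.Wait();  // atomically releases mu while waiting, re-acquires it
+//      }
+//      mu.Unlock();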
+ +#include "port/port_posix.h" + +#include +#include +#include +#include +#include "util/logging.h" + +namespace rocksdb { +namespace port { + +static void PthreadCall(const char* label, int result) { + if (result != 0) { + fprintf(stderr, "pthread %s: %s\n", label, strerror(result)); + abort(); + } +} + +Mutex::Mutex(bool adaptive) { +#ifdef OS_LINUX + if (!adaptive) { + PthreadCall("init mutex", pthread_mutex_init(&mu_, NULL)); + } else { + pthread_mutexattr_t mutex_attr; + PthreadCall("init mutex attr", pthread_mutexattr_init(&mutex_attr)); + PthreadCall("set mutex attr", + pthread_mutexattr_settype(&mutex_attr, + PTHREAD_MUTEX_ADAPTIVE_NP)); + PthreadCall("init mutex", pthread_mutex_init(&mu_, &mutex_attr)); + PthreadCall("destroy mutex attr", + pthread_mutexattr_destroy(&mutex_attr)); + } +#else // ignore adaptive for non-linux platform + PthreadCall("init mutex", pthread_mutex_init(&mu_, NULL)); +#endif // OS_LINUX +} + +Mutex::~Mutex() { PthreadCall("destroy mutex", pthread_mutex_destroy(&mu_)); } + +void Mutex::Lock() { + PthreadCall("lock", pthread_mutex_lock(&mu_)); +#ifndef NDEBUG + locked_ = true; +#endif +} + +void Mutex::Unlock() { +#ifndef NDEBUG + locked_ = false; +#endif + PthreadCall("unlock", pthread_mutex_unlock(&mu_)); +} + +void Mutex::AssertHeld() { +#ifndef NDEBUG + assert(locked_); +#endif +} + +CondVar::CondVar(Mutex* mu) + : mu_(mu) { + PthreadCall("init cv", pthread_cond_init(&cv_, NULL)); +} + +CondVar::~CondVar() { PthreadCall("destroy cv", pthread_cond_destroy(&cv_)); } + +void CondVar::Wait() { +#ifndef NDEBUG + mu_->locked_ = false; +#endif + PthreadCall("wait", pthread_cond_wait(&cv_, &mu_->mu_)); +#ifndef NDEBUG + mu_->locked_ = true; +#endif +} + +void CondVar::Signal() { + PthreadCall("signal", pthread_cond_signal(&cv_)); +} + +void CondVar::SignalAll() { + PthreadCall("broadcast", pthread_cond_broadcast(&cv_)); +} + +RWMutex::RWMutex() { PthreadCall("init mutex", pthread_rwlock_init(&mu_, NULL)); } + +RWMutex::~RWMutex() { PthreadCall("destroy mutex", pthread_rwlock_destroy(&mu_)); } + +void RWMutex::ReadLock() { PthreadCall("read lock", pthread_rwlock_rdlock(&mu_)); } + +void RWMutex::WriteLock() { PthreadCall("write lock", pthread_rwlock_wrlock(&mu_)); } + +void RWMutex::Unlock() { PthreadCall("unlock", pthread_rwlock_unlock(&mu_)); } + +void InitOnce(OnceType* once, void (*initializer)()) { + PthreadCall("once", pthread_once(once, initializer)); +} + +} // namespace port +} // namespace rocksdb diff --git a/port/port_posix.h b/port/port_posix.h new file mode 100644 index 0000000000..d20a5dff2c --- /dev/null +++ b/port/port_posix.h @@ -0,0 +1,488 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// See port_example.h for documentation for the following types/functions. 
+
+#ifndef STORAGE_LEVELDB_PORT_PORT_POSIX_H_
+#define STORAGE_LEVELDB_PORT_PORT_POSIX_H_
+
+#undef PLATFORM_IS_LITTLE_ENDIAN
+#if defined(OS_MACOSX)
+  #include <machine/endian.h>
+  #if defined(__DARWIN_LITTLE_ENDIAN) && defined(__DARWIN_BYTE_ORDER)
+    #define PLATFORM_IS_LITTLE_ENDIAN \
+        (__DARWIN_BYTE_ORDER == __DARWIN_LITTLE_ENDIAN)
+  #endif
+#elif defined(OS_SOLARIS)
+  #include <sys/isa_defs.h>
+  #ifdef _LITTLE_ENDIAN
+    #define PLATFORM_IS_LITTLE_ENDIAN true
+  #else
+    #define PLATFORM_IS_LITTLE_ENDIAN false
+  #endif
+#elif defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) ||\
+      defined(OS_DRAGONFLYBSD) || defined(OS_ANDROID)
+  #include <sys/types.h>
+  #include <sys/endian.h>
+#else
+  #include <endian.h>
+#endif
+#include <pthread.h>
+#ifdef SNAPPY
+#include <snappy.h>
+#endif
+
+#ifdef ZLIB
+#include <zlib.h>
+#endif
+
+#ifdef BZIP2
+#include <bzlib.h>
+#endif
+
+#if defined(LZ4)
+#include <lz4.h>
+#include <lz4hc.h>
+#endif
+
+#include <stdint.h>
+#include <string>
+#include <string.h>
+#include "rocksdb/options.h"
+#include "port/atomic_pointer.h"
+
+#ifndef PLATFORM_IS_LITTLE_ENDIAN
+#define PLATFORM_IS_LITTLE_ENDIAN (__BYTE_ORDER == __LITTLE_ENDIAN)
+#endif
+
+#if defined(OS_MACOSX) || defined(OS_SOLARIS) || defined(OS_FREEBSD) ||\
+    defined(OS_NETBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLYBSD) ||\
+    defined(OS_ANDROID)
+// Use fread/fwrite/fflush on platforms without _unlocked variants
+#define fread_unlocked fread
+#define fwrite_unlocked fwrite
+#define fflush_unlocked fflush
+#endif
+
+#if defined(OS_MACOSX) || defined(OS_FREEBSD) ||\
+    defined(OS_OPENBSD) || defined(OS_DRAGONFLYBSD)
+// Use fsync() on platforms without fdatasync()
+#define fdatasync fsync
+#endif
+
+#if defined(OS_ANDROID) && __ANDROID_API__ < 9
+// fdatasync() was only introduced in API level 9 on Android. Use fsync()
+// when targeting older platforms.
+#define fdatasync fsync
+#endif
+
+namespace rocksdb {
+namespace port {
+
+static const bool kLittleEndian = PLATFORM_IS_LITTLE_ENDIAN;
+#undef PLATFORM_IS_LITTLE_ENDIAN
+
+class CondVar;
+
+class Mutex {
+ public:
+  /* implicit */ Mutex(bool adaptive = false);
+  ~Mutex();
+
+  void Lock();
+  void Unlock();
+  // this will assert if the mutex is not locked
+  // it does NOT verify that mutex is held by a calling thread
+  void AssertHeld();
+
+ private:
+  friend class CondVar;
+  pthread_mutex_t mu_;
+#ifndef NDEBUG
+  bool locked_;
+#endif
+
+  // No copying
+  Mutex(const Mutex&);
+  void operator=(const Mutex&);
+};
+
+class RWMutex {
+ public:
+  RWMutex();
+  ~RWMutex();
+
+  void ReadLock();
+  void WriteLock();
+  void Unlock();
+  void AssertHeld() { }
+
+ private:
+  pthread_rwlock_t mu_;  // the underlying platform mutex
+
+  // No copying allowed
+  RWMutex(const RWMutex&);
+  void operator=(const RWMutex&);
+};
+
+class CondVar {
+ public:
+  explicit CondVar(Mutex* mu);
+  ~CondVar();
+  void Wait();
+  void Signal();
+  void SignalAll();
+ private:
+  pthread_cond_t cv_;
+  Mutex* mu_;
+};
+
+typedef pthread_once_t OnceType;
+#define LEVELDB_ONCE_INIT PTHREAD_ONCE_INIT
+extern void InitOnce(OnceType* once, void (*initializer)());
+
+inline bool Snappy_Compress(const CompressionOptions& opts, const char* input,
+                            size_t length, ::std::string* output) {
+#ifdef SNAPPY
+  output->resize(snappy::MaxCompressedLength(length));
+  size_t outlen;
+  snappy::RawCompress(input, length, &(*output)[0], &outlen);
+  output->resize(outlen);
+  return true;
+#endif
+
+  return false;
+}
+
+inline bool Snappy_GetUncompressedLength(const char* input, size_t length,
+                                         size_t* result) {
+#ifdef SNAPPY
+  return snappy::GetUncompressedLength(input, length, result);
+#else
+  return false;
+#endif
+}
+
+inline bool Snappy_Uncompress(const
char* input, size_t length, + char* output) { +#ifdef SNAPPY + return snappy::RawUncompress(input, length, output); +#else + return false; +#endif +} + +inline bool Zlib_Compress(const CompressionOptions& opts, const char* input, + size_t length, ::std::string* output) { +#ifdef ZLIB + // The memLevel parameter specifies how much memory should be allocated for + // the internal compression state. + // memLevel=1 uses minimum memory but is slow and reduces compression ratio. + // memLevel=9 uses maximum memory for optimal speed. + // The default value is 8. See zconf.h for more details. + static const int memLevel = 8; + z_stream _stream; + memset(&_stream, 0, sizeof(z_stream)); + int st = deflateInit2(&_stream, opts.level, Z_DEFLATED, opts.window_bits, + memLevel, opts.strategy); + if (st != Z_OK) { + return false; + } + + // Resize output to be the plain data length. + // This may not be big enough if the compression actually expands data. + output->resize(length); + + // Compress the input, and put compressed data in output. + _stream.next_in = (Bytef *)input; + _stream.avail_in = length; + + // Initialize the output size. + _stream.avail_out = length; + _stream.next_out = (Bytef *)&(*output)[0]; + + int old_sz =0, new_sz =0, new_sz_delta =0; + bool done = false; + while (!done) { + int st = deflate(&_stream, Z_FINISH); + switch (st) { + case Z_STREAM_END: + done = true; + break; + case Z_OK: + // No output space. Increase the output space by 20%. + // (Should we fail the compression since it expands the size?) + old_sz = output->size(); + new_sz_delta = (int)(output->size() * 0.2); + new_sz = output->size() + (new_sz_delta < 10 ? 10 : new_sz_delta); + output->resize(new_sz); + // Set more output. + _stream.next_out = (Bytef *)&(*output)[old_sz]; + _stream.avail_out = new_sz - old_sz; + break; + case Z_BUF_ERROR: + default: + deflateEnd(&_stream); + return false; + } + } + + output->resize(output->size() - _stream.avail_out); + deflateEnd(&_stream); + return true; +#endif + return false; +} + +inline char* Zlib_Uncompress(const char* input_data, size_t input_length, + int* decompress_size, int windowBits = -14) { +#ifdef ZLIB + z_stream _stream; + memset(&_stream, 0, sizeof(z_stream)); + + // For raw inflate, the windowBits should be -8..-15. + // If windowBits is bigger than zero, it will use either zlib + // header or gzip header. Adding 32 to it will do automatic detection. + int st = inflateInit2(&_stream, + windowBits > 0 ? windowBits + 32 : windowBits); + if (st != Z_OK) { + return nullptr; + } + + _stream.next_in = (Bytef *)input_data; + _stream.avail_in = input_length; + + // Assume the decompressed data size will 5x of compressed size. + int output_len = input_length * 5; + char* output = new char[output_len]; + int old_sz = output_len; + + _stream.next_out = (Bytef *)output; + _stream.avail_out = output_len; + + char* tmp = nullptr; + int output_len_delta; + bool done = false; + + //while(_stream.next_in != nullptr && _stream.avail_in != 0) { + while (!done) { + int st = inflate(&_stream, Z_SYNC_FLUSH); + switch (st) { + case Z_STREAM_END: + done = true; + break; + case Z_OK: + // No output space. Increase the output space by 20%. + old_sz = output_len; + output_len_delta = (int)(output_len * 0.2); + output_len += output_len_delta < 10 ? 10 : output_len_delta; + tmp = new char[output_len]; + memcpy(tmp, output, old_sz); + delete[] output; + output = tmp; + + // Set more output. 
+ _stream.next_out = (Bytef *)(output + old_sz); + _stream.avail_out = output_len - old_sz; + break; + case Z_BUF_ERROR: + default: + delete[] output; + inflateEnd(&_stream); + return nullptr; + } + } + + *decompress_size = output_len - _stream.avail_out; + inflateEnd(&_stream); + return output; +#endif + + return nullptr; +} + +inline bool BZip2_Compress(const CompressionOptions& opts, const char* input, + size_t length, ::std::string* output) { +#ifdef BZIP2 + bz_stream _stream; + memset(&_stream, 0, sizeof(bz_stream)); + + // Block size 1 is 100K. + // 0 is for silent. + // 30 is the default workFactor + int st = BZ2_bzCompressInit(&_stream, 1, 0, 30); + if (st != BZ_OK) { + return false; + } + + // Resize output to be the plain data length. + // This may not be big enough if the compression actually expands data. + output->resize(length); + + // Compress the input, and put compressed data in output. + _stream.next_in = (char *)input; + _stream.avail_in = length; + + // Initialize the output size. + _stream.next_out = (char *)&(*output)[0]; + _stream.avail_out = length; + + int old_sz =0, new_sz =0; + while(_stream.next_in != nullptr && _stream.avail_in != 0) { + int st = BZ2_bzCompress(&_stream, BZ_FINISH); + switch (st) { + case BZ_STREAM_END: + break; + case BZ_FINISH_OK: + // No output space. Increase the output space by 20%. + // (Should we fail the compression since it expands the size?) + old_sz = output->size(); + new_sz = (int)(output->size() * 1.2); + output->resize(new_sz); + // Set more output. + _stream.next_out = (char *)&(*output)[old_sz]; + _stream.avail_out = new_sz - old_sz; + break; + case BZ_SEQUENCE_ERROR: + default: + BZ2_bzCompressEnd(&_stream); + return false; + } + } + + output->resize(output->size() - _stream.avail_out); + BZ2_bzCompressEnd(&_stream); + return true; +#endif + return false; +} + +inline char* BZip2_Uncompress(const char* input_data, size_t input_length, + int* decompress_size) { +#ifdef BZIP2 + bz_stream _stream; + memset(&_stream, 0, sizeof(bz_stream)); + + int st = BZ2_bzDecompressInit(&_stream, 0, 0); + if (st != BZ_OK) { + return nullptr; + } + + _stream.next_in = (char *)input_data; + _stream.avail_in = input_length; + + // Assume the decompressed data size will be 5x of compressed size. + int output_len = input_length * 5; + char* output = new char[output_len]; + int old_sz = output_len; + + _stream.next_out = (char *)output; + _stream.avail_out = output_len; + + char* tmp = nullptr; + + while(_stream.next_in != nullptr && _stream.avail_in != 0) { + int st = BZ2_bzDecompress(&_stream); + switch (st) { + case BZ_STREAM_END: + break; + case BZ_OK: + // No output space. Increase the output space by 20%. + old_sz = output_len; + output_len = (int)(output_len * 1.2); + tmp = new char[output_len]; + memcpy(tmp, output, old_sz); + delete[] output; + output = tmp; + + // Set more output. 
+        _stream.next_out = (char *)(output + old_sz);
+        _stream.avail_out = output_len - old_sz;
+        break;
+      default:
+        delete[] output;
+        BZ2_bzDecompressEnd(&_stream);
+        return nullptr;
+    }
+  }
+
+  *decompress_size = output_len - _stream.avail_out;
+  BZ2_bzDecompressEnd(&_stream);
+  return output;
+#endif
+  return nullptr;
+}
+
+inline bool LZ4_Compress(const CompressionOptions &opts, const char *input,
+                         size_t length, ::std::string* output) {
+#ifdef LZ4
+  int compressBound = LZ4_compressBound(length);
+  output->resize(8 + compressBound);
+  char *p = const_cast<char *>(output->c_str());
+  memcpy(p, &length, sizeof(length));
+  size_t outlen;
+  outlen = LZ4_compress_limitedOutput(input, p + 8, length, compressBound);
+  if (outlen == 0) {
+    return false;
+  }
+  output->resize(8 + outlen);
+  return true;
+#endif
+  return false;
+}
+
+inline char* LZ4_Uncompress(const char* input_data, size_t input_length,
+                            int* decompress_size) {
+#ifdef LZ4
+  if (input_length < 8) {
+    return nullptr;
+  }
+  int output_len;
+  memcpy(&output_len, input_data, sizeof(output_len));
+  char *output = new char[output_len];
+  *decompress_size = LZ4_decompress_safe_partial(
+      input_data + 8, output, input_length - 8, output_len, output_len);
+  if (*decompress_size < 0) {
+    delete[] output;
+    return nullptr;
+  }
+  return output;
+#endif
+  return nullptr;
+}
+
+inline bool LZ4HC_Compress(const CompressionOptions &opts, const char* input,
+                           size_t length, ::std::string* output) {
+#ifdef LZ4
+  int compressBound = LZ4_compressBound(length);
+  output->resize(8 + compressBound);
+  char *p = const_cast<char *>(output->c_str());
+  memcpy(p, &length, sizeof(length));
+  size_t outlen;
+#ifdef LZ4_VERSION_MAJOR  // they only started defining this since r113
+  outlen = LZ4_compressHC2_limitedOutput(input, p + 8, length, compressBound,
+                                         opts.level);
+#else
+  outlen = LZ4_compressHC_limitedOutput(input, p + 8, length, compressBound);
+#endif
+  if (outlen == 0) {
+    return false;
+  }
+  output->resize(8 + outlen);
+  return true;
+#endif
+  return false;
+}
+
+#define CACHE_LINE_SIZE 64U
+
+}  // namespace port
+}  // namespace rocksdb
+
+#endif  // STORAGE_LEVELDB_PORT_PORT_POSIX_H_
diff --git a/port/stack_trace.cc b/port/stack_trace.cc
new file mode 100644
index 0000000000..76866e63cc
--- /dev/null
+++ b/port/stack_trace.cc
@@ -0,0 +1,132 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
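+//
+// (Illustrative) typical usage is to install the handler once, near the
+// top of main(), so fatal signals print a symbolized stack before dying:
+//      int main(int argc, char** argv) {
+//        rocksdb::port::InstallStackTraceHandler();
+//        ...
+//      }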
+// +#include "port/stack_trace.h" + +namespace rocksdb { +namespace port { + +#if defined(ROCKSDB_LITE) || !(defined(OS_LINUX) || defined(OS_MACOSX)) + +// noop + +void InstallStackTraceHandler() {} +void PrintStack(int first_frames_to_skip) {} + +#else + +#include +#include +#include +#include +#include +#include +#include + +namespace { + +#ifdef OS_LINUX +const char* GetExecutableName() { + static char name[1024]; + + char link[1024]; + snprintf(link, sizeof(link), "/proc/%d/exe", getpid()); + auto read = readlink(link, name, sizeof(name)); + if (-1 == read) { + return nullptr; + } else { + name[read] = 0; + return name; + } +} + +void PrintStackTraceLine(const char* symbol, void* frame) { + static const char* executable = GetExecutableName(); + if (symbol) { + fprintf(stderr, "%s ", symbol); + } + if (executable) { + // out source to addr2line, for the address translation + const int kLineMax = 256; + char cmd[kLineMax]; + snprintf(cmd, kLineMax, "addr2line %p -e %s -f -C 2>&1", frame, executable); + auto f = popen(cmd, "r"); + if (f) { + char line[kLineMax]; + while (fgets(line, sizeof(line), f)) { + line[strlen(line) - 1] = 0; // remove newline + fprintf(stderr, "%s\t", line); + } + pclose(f); + } + } else { + fprintf(stderr, " %p", frame); + } + + fprintf(stderr, "\n"); +} +#elif OS_MACOSX + +void PrintStackTraceLine(const char* symbol, void* frame) { + static int pid = getpid(); + // out source to atos, for the address translation + const int kLineMax = 256; + char cmd[kLineMax]; + snprintf(cmd, kLineMax, "xcrun atos %p -p %d 2>&1", frame, pid); + auto f = popen(cmd, "r"); + if (f) { + char line[kLineMax]; + while (fgets(line, sizeof(line), f)) { + line[strlen(line) - 1] = 0; // remove newline + fprintf(stderr, "%s\t", line); + } + pclose(f); + } else if (symbol) { + fprintf(stderr, "%s ", symbol); + } + + fprintf(stderr, "\n"); +} + +#endif + +} // namespace + +void PrintStack(int first_frames_to_skip) { + const int kMaxFrames = 100; + void* frames[kMaxFrames]; + + auto num_frames = backtrace(frames, kMaxFrames); + auto symbols = backtrace_symbols(frames, num_frames); + + for (int i = first_frames_to_skip; i < num_frames; ++i) { + fprintf(stderr, "#%-2d ", i - first_frames_to_skip); + PrintStackTraceLine((symbols != nullptr) ? symbols[i] : nullptr, frames[i]); + } +} + +static void StackTraceHandler(int sig) { + // reset to default handler + signal(sig, SIG_DFL); + fprintf(stderr, "Received signal %d (%s)\n", sig, strsignal(sig)); + // skip the top three signal handler related frames + PrintStack(3); + // re-signal to default handler (so we still get core dump if needed...) + raise(sig); +} + +void InstallStackTraceHandler() { + // just use the plain old signal as it's simple and sufficient + // for this use case + signal(SIGILL, StackTraceHandler); + signal(SIGSEGV, StackTraceHandler); + signal(SIGBUS, StackTraceHandler); + signal(SIGABRT, StackTraceHandler); +} + +#endif + +} // namespace port +} // namespace rocksdb diff --git a/port/stack_trace.h b/port/stack_trace.h new file mode 100644 index 0000000000..8bc6c7d2ec --- /dev/null +++ b/port/stack_trace.h @@ -0,0 +1,19 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+//
+#pragma once
+namespace rocksdb {
+namespace port {
+
+// Install a signal handler to print callstack on the following signals:
+// SIGILL SIGSEGV SIGBUS SIGABRT
+// Currently supports linux only. No-op otherwise.
+void InstallStackTraceHandler();
+
+// Prints stack, skips skip_first_frames frames
+void PrintStack(int first_frames_to_skip = 0);
+
+}  // namespace port
+}  // namespace rocksdb
diff --git a/port/win/stdint.h b/port/win/stdint.h
new file mode 100644
index 0000000000..39edd0db13
--- /dev/null
+++ b/port/win/stdint.h
@@ -0,0 +1,24 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+// MSVC didn't ship with this file until the 2010 version.
+
+#ifndef STORAGE_LEVELDB_PORT_WIN_STDINT_H_
+#define STORAGE_LEVELDB_PORT_WIN_STDINT_H_
+
+#if !defined(_MSC_VER)
+#error This file should only be included when compiling with MSVC.
+#endif
+
+// Define C99 equivalent types.
+typedef signed char int8_t;
+typedef signed short int16_t;
+typedef signed int int32_t;
+typedef signed long long int64_t;
+typedef unsigned char uint8_t;
+typedef unsigned short uint16_t;
+typedef unsigned int uint32_t;
+typedef unsigned long long uint64_t;
+
+#endif  // STORAGE_LEVELDB_PORT_WIN_STDINT_H_
diff --git a/table/block.cc b/table/block.cc
new file mode 100644
index 0000000000..6a6751ca75
--- /dev/null
+++ b/table/block.cc
@@ -0,0 +1,307 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Decodes the blocks generated by block_builder.cc.
+
+#include "table/block.h"
+
+#include <algorithm>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "rocksdb/comparator.h"
+#include "table/block_hash_index.h"
+#include "table/format.h"
+#include "util/coding.h"
+#include "util/logging.h"
+
+namespace rocksdb {
+
+uint32_t Block::NumRestarts() const {
+  assert(size_ >= 2*sizeof(uint32_t));
+  return DecodeFixed32(data_ + size_ - sizeof(uint32_t));
+}
+
+Block::Block(const BlockContents& contents)
+    : data_(contents.data.data()),
+      size_(contents.data.size()),
+      owned_(contents.heap_allocated),
+      cachable_(contents.cachable),
+      compression_type_(contents.compression_type) {
+  if (size_ < sizeof(uint32_t)) {
+    size_ = 0;  // Error marker
+  } else {
+    restart_offset_ = size_ - (1 + NumRestarts()) * sizeof(uint32_t);
+    if (restart_offset_ > size_ - sizeof(uint32_t)) {
+      // The size is too small for NumRestarts() and therefore
+      // restart_offset_ wrapped around.
+      size_ = 0;
+    }
+  }
+}
+
+Block::~Block() {
+  if (owned_) {
+    delete[] data_;
+  }
+}
+
+// Helper routine: decode the next block entry starting at "p",
+// storing the number of shared key bytes, non_shared key bytes,
+// and the length of the value in "*shared", "*non_shared", and
+// "*value_length", respectively. Will not dereference past "limit".
+//
+// If any errors are detected, returns nullptr. Otherwise, returns a
+// pointer to the key delta (just past the three decoded values).
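+//
+// (Illustrative) an entry encoded with shared=5, non_shared=3 and
+// value_length=4 is laid out as:
+//      varint32(5) varint32(3) varint32(4) | 3 key-delta bytes | 4 value bytes
+// and its full key is the previous key's first 5 bytes plus the 3 delta bytes.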
+
+// Helper routine: decode the next block entry starting at "p",
+// storing the number of shared key bytes, non_shared key bytes,
+// and the length of the value in "*shared", "*non_shared", and
+// "*value_length", respectively. Will not dereference past "limit".
+//
+// If any errors are detected, returns nullptr. Otherwise, returns a
+// pointer to the key delta (just past the three decoded values).
+static inline const char* DecodeEntry(const char* p, const char* limit,
+                                      uint32_t* shared,
+                                      uint32_t* non_shared,
+                                      uint32_t* value_length) {
+  if (limit - p < 3) return nullptr;
+  *shared = reinterpret_cast<const unsigned char*>(p)[0];
+  *non_shared = reinterpret_cast<const unsigned char*>(p)[1];
+  *value_length = reinterpret_cast<const unsigned char*>(p)[2];
+  if ((*shared | *non_shared | *value_length) < 128) {
+    // Fast path: all three values are encoded in one byte each
+    p += 3;
+  } else {
+    if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr;
+    if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr;
+    if ((p = GetVarint32Ptr(p, limit, value_length)) == nullptr) return nullptr;
+  }
+
+  if (static_cast<uint32_t>(limit - p) < (*non_shared + *value_length)) {
+    return nullptr;
+  }
+  return p;
+}
+
+class Block::Iter : public Iterator {
+ private:
+  const Comparator* const comparator_;
+  const char* const data_;       // underlying block contents
+  uint32_t const restarts_;      // Offset of restart array (list of fixed32)
+  uint32_t const num_restarts_;  // Number of uint32_t entries in restart array
+
+  // current_ is offset in data_ of current entry. >= restarts_ if !Valid
+  uint32_t current_;
+  uint32_t restart_index_;  // Index of restart block in which current_ falls
+  std::string key_;
+  Slice value_;
+  Status status_;
+  BlockHashIndex* hash_index_;
+
+  inline int Compare(const Slice& a, const Slice& b) const {
+    return comparator_->Compare(a, b);
+  }
+
+  // Return the offset in data_ just past the end of the current entry.
+  inline uint32_t NextEntryOffset() const {
+    return (value_.data() + value_.size()) - data_;
+  }
+
+  uint32_t GetRestartPoint(uint32_t index) {
+    assert(index < num_restarts_);
+    return DecodeFixed32(data_ + restarts_ + index * sizeof(uint32_t));
+  }
+
+  void SeekToRestartPoint(uint32_t index) {
+    key_.clear();
+    restart_index_ = index;
+    // current_ will be fixed by ParseNextKey();
+
+    // ParseNextKey() starts at the end of value_, so set value_ accordingly
+    uint32_t offset = GetRestartPoint(index);
+    value_ = Slice(data_ + offset, 0);
+  }
+
+ public:
+  Iter(const Comparator* comparator, const char* data, uint32_t restarts,
+       uint32_t num_restarts, BlockHashIndex* hash_index)
+      : comparator_(comparator),
+        data_(data),
+        restarts_(restarts),
+        num_restarts_(num_restarts),
+        current_(restarts_),
+        restart_index_(num_restarts_),
+        hash_index_(hash_index) {
+    assert(num_restarts_ > 0);
+  }
+
+  virtual bool Valid() const { return current_ < restarts_; }
+  virtual Status status() const { return status_; }
+  virtual Slice key() const {
+    assert(Valid());
+    return key_;
+  }
+  virtual Slice value() const {
+    assert(Valid());
+    return value_;
+  }
+
+  virtual void Next() {
+    assert(Valid());
+    ParseNextKey();
+  }
+
+  virtual void Prev() {
+    assert(Valid());
+
+    // Scan backwards to a restart point before current_
+    const uint32_t original = current_;
+    while (GetRestartPoint(restart_index_) >= original) {
+      if (restart_index_ == 0) {
+        // No more entries
+        current_ = restarts_;
+        restart_index_ = num_restarts_;
+        return;
+      }
+      restart_index_--;
+    }
+
+    SeekToRestartPoint(restart_index_);
+    do {
+      // Loop until end of current entry hits the start of original entry
+    } while (ParseNextKey() && NextEntryOffset() < original);
+  }
+
+  virtual void Seek(const Slice& target) {
+    uint32_t index = 0;
+    bool ok = hash_index_ ?
HashSeek(target, &index) + : BinarySeek(target, 0, num_restarts_ - 1, &index); + + if (!ok) { + return; + } + SeekToRestartPoint(index); + // Linear search (within restart block) for first key >= target + + while (true) { + if (!ParseNextKey() || Compare(key_, target) >= 0) { + return; + } + } + } + virtual void SeekToFirst() { + SeekToRestartPoint(0); + ParseNextKey(); + } + + virtual void SeekToLast() { + SeekToRestartPoint(num_restarts_ - 1); + while (ParseNextKey() && NextEntryOffset() < restarts_) { + // Keep skipping + } + } + + private: + void CorruptionError() { + current_ = restarts_; + restart_index_ = num_restarts_; + status_ = Status::Corruption("bad entry in block"); + key_.clear(); + value_.clear(); + } + + bool ParseNextKey() { + current_ = NextEntryOffset(); + const char* p = data_ + current_; + const char* limit = data_ + restarts_; // Restarts come right after data + if (p >= limit) { + // No more entries to return. Mark as invalid. + current_ = restarts_; + restart_index_ = num_restarts_; + return false; + } + + // Decode next entry + uint32_t shared, non_shared, value_length; + p = DecodeEntry(p, limit, &shared, &non_shared, &value_length); + if (p == nullptr || key_.size() < shared) { + CorruptionError(); + return false; + } else { + key_.resize(shared); + key_.append(p, non_shared); + value_ = Slice(p + non_shared, value_length); + while (restart_index_ + 1 < num_restarts_ && + GetRestartPoint(restart_index_ + 1) < current_) { + ++restart_index_; + } + return true; + } + } + // Binary search in restart array to find the first restart point + // with a key >= target + bool BinarySeek(const Slice& target, uint32_t left, uint32_t right, + uint32_t* index) { + assert(left <= right); + + while (left < right) { + uint32_t mid = (left + right + 1) / 2; + uint32_t region_offset = GetRestartPoint(mid); + uint32_t shared, non_shared, value_length; + const char* key_ptr = + DecodeEntry(data_ + region_offset, data_ + restarts_, &shared, + &non_shared, &value_length); + if (key_ptr == nullptr || (shared != 0)) { + CorruptionError(); + return false; + } + Slice mid_key(key_ptr, non_shared); + if (Compare(mid_key, target) < 0) { + // Key at "mid" is smaller than "target". Therefore all + // blocks before "mid" are uninteresting. + left = mid; + } else { + // Key at "mid" is >= "target". Therefore all blocks at or + // after "mid" are uninteresting. + right = mid - 1; + } + } + + *index = left; + return true; + } + + bool HashSeek(const Slice& target, uint32_t* index) { + assert(hash_index_); + auto restart_index = hash_index_->GetRestartIndex(target); + if (restart_index == nullptr) { + current_ = restarts_; + return 0; + } + + // the elements in restart_array[index : index + num_blocks] + // are all with same prefix. We'll do binary search in that small range. 
+ auto left = restart_index->first_index; + auto right = restart_index->first_index + restart_index->num_blocks - 1; + return BinarySeek(target, left, right, index); + } +}; + +Iterator* Block::NewIterator(const Comparator* cmp) { + if (size_ < 2*sizeof(uint32_t)) { + return NewErrorIterator(Status::Corruption("bad block contents")); + } + const uint32_t num_restarts = NumRestarts(); + if (num_restarts == 0) { + return NewEmptyIterator(); + } else { + return new Iter(cmp, data_, restart_offset_, num_restarts, + hash_index_.get()); + } +} + +void Block::SetBlockHashIndex(BlockHashIndex* hash_index) { + hash_index_.reset(hash_index); +} + +} // namespace rocksdb diff --git a/table/block.h b/table/block.h new file mode 100644 index 0000000000..b363d62fec --- /dev/null +++ b/table/block.h @@ -0,0 +1,61 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include +#include + +#include "rocksdb/iterator.h" +#include "rocksdb/options.h" + +namespace rocksdb { + +struct BlockContents; +class Comparator; +class BlockHashIndex; + +class Block { + public: + // Initialize the block with the specified contents. + explicit Block(const BlockContents& contents); + + ~Block(); + + size_t size() const { return size_; } + const char* data() const { return data_; } + bool cachable() const { return cachable_; } + uint32_t NumRestarts() const; + CompressionType compression_type() const { return compression_type_; } + + // If hash index lookup is enabled and `use_hash_index` is true. This block + // will do hash lookup for the key prefix. + // + // NOTE: for the hash based lookup, if a key prefix doesn't match any key, + // the iterator will simply be set as "invalid", rather than returning + // the key that is just pass the target key. + Iterator* NewIterator(const Comparator* comparator); + void SetBlockHashIndex(BlockHashIndex* hash_index); + + private: + const char* data_; + size_t size_; + uint32_t restart_offset_; // Offset in data_ of restart array + bool owned_; // Block owns data_[] + bool cachable_; + CompressionType compression_type_; + std::unique_ptr hash_index_; + + // No copying allowed + Block(const Block&); + void operator=(const Block&); + + class Iter; +}; + +} // namespace rocksdb diff --git a/table/block_based_table_builder.cc b/table/block_based_table_builder.cc new file mode 100644 index 0000000000..2ec6c11747 --- /dev/null +++ b/table/block_based_table_builder.cc @@ -0,0 +1,804 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
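Before moving into the builder sources, it may help to see the physical layout that Block assumes: entries, then an array of fixed32 restart offsets, then a fixed32 restart count that NumRestarts() reads from the tail. A toy sketch assembling that trailer, with the fixed32 encoder re-sketched (little-endian) rather than taken from util/coding.h:

    // Toy assembly of the layout Block expects: entries, then fixed32 restart
    // offsets, then a fixed32 restart count. Block::NumRestarts() reads the
    // last four bytes; restart_offset_ is size - (1 + num_restarts) * 4.
    #include <cstdint>
    #include <string>
    #include <vector>

    static void PutFixed32(std::string* dst, uint32_t v) {
      for (int i = 0; i < 4; ++i) {
        dst->push_back(static_cast<char>(v >> (8 * i)));  // little-endian
      }
    }

    std::string FinishBlock(const std::string& entries,
                            const std::vector<uint32_t>& restart_offsets) {
      std::string block = entries;
      for (uint32_t offset : restart_offsets) {
        PutFixed32(&block, offset);  // the restart array
      }
      PutFixed32(&block, static_cast<uint32_t>(restart_offsets.size()));
      return block;
    }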
+ +#include "table/block_based_table_builder.h" + +#include +#include +#include + +#include +#include +#include +#include + +#include "db/dbformat.h" + +#include "rocksdb/cache.h" +#include "rocksdb/comparator.h" +#include "rocksdb/env.h" +#include "rocksdb/filter_policy.h" +#include "rocksdb/flush_block_policy.h" +#include "rocksdb/options.h" +#include "rocksdb/table.h" + +#include "table/block.h" +#include "table/block_based_table_reader.h" +#include "table/block_builder.h" +#include "table/filter_block.h" +#include "table/format.h" +#include "table/meta_blocks.h" +#include "table/table_builder.h" + +#include "util/coding.h" +#include "util/crc32c.h" +#include "util/stop_watch.h" +#include "util/xxhash.h" + +namespace rocksdb { + +extern const std::string kHashIndexPrefixesBlock; +extern const std::string kHashIndexPrefixesMetadataBlock; +namespace { + +typedef BlockBasedTableOptions::IndexType IndexType; + +// The interface for building index. +// Instruction for adding a new concrete IndexBuilder: +// 1. Create a subclass instantiated from IndexBuilder. +// 2. Add a new entry associated with that subclass in TableOptions::IndexType. +// 3. Add a create function for the new subclass in CreateIndexBuilder. +// Note: we can devise more advanced design to simplify the process for adding +// new subclass, which will, on the other hand, increase the code complexity and +// catch unwanted attention from readers. Given that we won't add/change +// indexes frequently, it makes sense to just embrace a more straightforward +// design that just works. +class IndexBuilder { + public: + // Index builder will construct a set of blocks which contain: + // 1. One primary index block. + // 2. (Optional) a set of metablocks that contains the metadata of the + // primary index. + struct IndexBlocks { + Slice index_block_contents; + std::unordered_map meta_blocks; + }; + explicit IndexBuilder(const Comparator* comparator) + : comparator_(comparator) {} + + virtual ~IndexBuilder() {} + + // Add a new index entry to index block. + // To allow further optimization, we provide `last_key_in_current_block` and + // `first_key_in_next_block`, based on which the specific implementation can + // determine the best index key to be used for the index block. + // @last_key_in_current_block: this parameter maybe overridden with the value + // "substitute key". + // @first_key_in_next_block: it will be nullptr if the entry being added is + // the last one in the table + // + // REQUIRES: Finish() has not yet been called. + virtual void AddIndexEntry(std::string* last_key_in_current_block, + const Slice* first_key_in_next_block, + const BlockHandle& block_handle) = 0; + + // This method will be called whenever a key is added. The subclasses may + // override OnKeyAdded() if they need to collect additional information. + virtual void OnKeyAdded(const Slice& key) {} + + // Inform the index builder that all entries has been written. Block builder + // may therefore perform any operation required for block finalization. + // + // REQUIRES: Finish() has not yet been called. + virtual Status Finish(IndexBlocks* index_blocks) = 0; + + // Get the estimated size for index block. + virtual size_t EstimatedSize() const = 0; + + protected: + const Comparator* comparator_; +}; + +// This index builder builds space-efficient index block. +// +// Optimizations: +// 1. Made block's `block_restart_interval` to be 1, which will avoid linear +// search when doing index lookup. +// 2. Shorten the key length for index block. 
Rather than simply using the
+// last key in the data block as the index key, we find the shortest
+// substitute key that serves the same function.
+class ShortenedIndexBuilder : public IndexBuilder {
+ public:
+  explicit ShortenedIndexBuilder(const Comparator* comparator)
+      : IndexBuilder(comparator),
+        index_block_builder_(1 /* block_restart_interval == 1 */, comparator) {}
+
+  virtual void AddIndexEntry(std::string* last_key_in_current_block,
+                             const Slice* first_key_in_next_block,
+                             const BlockHandle& block_handle) override {
+    if (first_key_in_next_block != nullptr) {
+      comparator_->FindShortestSeparator(last_key_in_current_block,
+                                         *first_key_in_next_block);
+    } else {
+      comparator_->FindShortSuccessor(last_key_in_current_block);
+    }
+
+    std::string handle_encoding;
+    block_handle.EncodeTo(&handle_encoding);
+    index_block_builder_.Add(*last_key_in_current_block, handle_encoding);
+  }
+
+  virtual Status Finish(IndexBlocks* index_blocks) {
+    index_blocks->index_block_contents = index_block_builder_.Finish();
+    return Status::OK();
+  }
+
+  virtual size_t EstimatedSize() const {
+    return index_block_builder_.CurrentSizeEstimate();
+  }
+
+ private:
+  BlockBuilder index_block_builder_;
+};
+
+// HashIndexBuilder contains a binary-searchable primary index and the
+// metadata for secondary hash index construction.
+// The metadata for the hash index consists of two parts:
+// - a metablock that compactly contains a sequence of prefixes. All prefixes
+//   are stored consecutively, without any per-prefix metadata (such as prefix
+//   sizes); that metadata is kept in the other metablock.
+// - a metablock that contains the metadata of the prefixes, including prefix
+//   size, restart index and the number of blocks it spans. The format looks
+//   like:
+//
+// +-----------------+---------------------------+---------------------+ <=prefix 1
+// | length: 4 bytes | restart interval: 4 bytes | num-blocks: 4 bytes |
+// +-----------------+---------------------------+---------------------+ <=prefix 2
+// | length: 4 bytes | restart interval: 4 bytes | num-blocks: 4 bytes |
+// +-----------------+---------------------------+---------------------+
+// |                                                                   |
+// | ....                                                              |
+// |                                                                   |
+// +-----------------+---------------------------+---------------------+ <=prefix n
+// | length: 4 bytes | restart interval: 4 bytes | num-blocks: 4 bytes |
+// +-----------------+---------------------------+---------------------+
+//
+// The reason for separating these two metablocks is to enable efficient reuse
+// of the first metablock during hash index construction, without unnecessary
+// data copies or small heap allocations for prefixes.
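Each row of the diagram above is one (length, restart index, num-blocks) triple. A sketch of decoding a single triple, using the fixed 4-byte fields as diagrammed and assuming a little-endian host; note that FlushPendingPrefix in the builder below actually emits the triple as varints:

    // Decoding one (length, restart index, num-blocks) triple from the layout
    // diagrammed above, using fixed 4-byte fields for clarity and assuming a
    // little-endian host. `p` points at one triple; returns a pointer past it.
    #include <cstdint>
    #include <cstring>

    struct PrefixMeta {
      uint32_t prefix_length;
      uint32_t restart_index;
      uint32_t num_blocks;
    };

    const char* DecodePrefixMeta(const char* p, PrefixMeta* meta) {
      std::memcpy(&meta->prefix_length, p, 4);
      std::memcpy(&meta->restart_index, p + 4, 4);
      std::memcpy(&meta->num_blocks, p + 8, 4);
      return p + 12;
    }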
+class HashIndexBuilder : public IndexBuilder { + public: + explicit HashIndexBuilder(const Comparator* comparator, + const SliceTransform* hash_key_extractor) + : IndexBuilder(comparator), + primary_index_builder(comparator), + hash_key_extractor_(hash_key_extractor) {} + + virtual void AddIndexEntry(std::string* last_key_in_current_block, + const Slice* first_key_in_next_block, + const BlockHandle& block_handle) override { + ++current_restart_index_; + primary_index_builder.AddIndexEntry(last_key_in_current_block, + first_key_in_next_block, block_handle); + } + + virtual void OnKeyAdded(const Slice& key) override { + auto key_prefix = hash_key_extractor_->Transform(key); + bool is_first_entry = pending_block_num_ == 0; + + // Keys may share the prefix + if (is_first_entry || pending_entry_prefix_ != key_prefix) { + if (!is_first_entry) { + FlushPendingPrefix(); + } + + // need a hard copy otherwise the underlying data changes all the time. + // TODO(kailiu) ToString() is expensive. We may speed up can avoid data + // copy. + pending_entry_prefix_ = key_prefix.ToString(); + pending_block_num_ = 1; + pending_entry_index_ = current_restart_index_; + } else { + // entry number increments when keys share the prefix reside in + // differnt data blocks. + auto last_restart_index = pending_entry_index_ + pending_block_num_ - 1; + assert(last_restart_index <= current_restart_index_); + if (last_restart_index != current_restart_index_) { + ++pending_block_num_; + } + } + } + + virtual Status Finish(IndexBlocks* index_blocks) { + FlushPendingPrefix(); + primary_index_builder.Finish(index_blocks); + index_blocks->meta_blocks.insert( + {kHashIndexPrefixesBlock.c_str(), prefix_block_}); + index_blocks->meta_blocks.insert( + {kHashIndexPrefixesMetadataBlock.c_str(), prefix_meta_block_}); + return Status::OK(); + } + + virtual size_t EstimatedSize() const { + return primary_index_builder.EstimatedSize() + prefix_block_.size() + + prefix_meta_block_.size(); + } + + private: + void FlushPendingPrefix() { + prefix_block_.append(pending_entry_prefix_.data(), + pending_entry_prefix_.size()); + PutVarint32(&prefix_meta_block_, pending_entry_prefix_.size()); + PutVarint32(&prefix_meta_block_, pending_entry_index_); + PutVarint32(&prefix_meta_block_, pending_block_num_); + } + + ShortenedIndexBuilder primary_index_builder; + const SliceTransform* hash_key_extractor_; + + // stores a sequence of prefixes + std::string prefix_block_; + // stores the metadata of prefixes + std::string prefix_meta_block_; + + // The following 3 variables keeps unflushed prefix and its metadata. + // The details of block_num and entry_index can be found in + // "block_hash_index.{h,cc}" + uint32_t pending_block_num_ = 0; + uint32_t pending_entry_index_ = 0; + std::string pending_entry_prefix_; + + uint64_t current_restart_index_ = 0; +}; + +// Create a index builder based on its type. +IndexBuilder* CreateIndexBuilder(IndexType type, const Comparator* comparator, + const SliceTransform* prefix_extractor) { + switch (type) { + case BlockBasedTableOptions::kBinarySearch: { + return new ShortenedIndexBuilder(comparator); + } + case BlockBasedTableOptions::kHashSearch: { + return new HashIndexBuilder(comparator, prefix_extractor); + } + default: { + assert(!"Do not recognize the index type "); + return nullptr; + } + } + // impossible. 
+ assert(false); + return nullptr; +} + +bool GoodCompressionRatio(size_t compressed_size, size_t raw_size) { + // Check to see if compressed less than 12.5% + return compressed_size < raw_size - (raw_size / 8u); +} + +Slice CompressBlock(const Slice& raw, + const CompressionOptions& compression_options, + CompressionType* type, std::string* compressed_output) { + if (*type == kNoCompression) { + return raw; + } + + // Will return compressed block contents if (1) the compression method is + // supported in this platform and (2) the compression rate is "good enough". + switch (*type) { + case kSnappyCompression: + if (port::Snappy_Compress(compression_options, raw.data(), raw.size(), + compressed_output) && + GoodCompressionRatio(compressed_output->size(), raw.size())) { + return *compressed_output; + } + break; // fall back to no compression. + case kZlibCompression: + if (port::Zlib_Compress(compression_options, raw.data(), raw.size(), + compressed_output) && + GoodCompressionRatio(compressed_output->size(), raw.size())) { + return *compressed_output; + } + break; // fall back to no compression. + case kBZip2Compression: + if (port::BZip2_Compress(compression_options, raw.data(), raw.size(), + compressed_output) && + GoodCompressionRatio(compressed_output->size(), raw.size())) { + return *compressed_output; + } + break; // fall back to no compression. + case kLZ4Compression: + if (port::LZ4_Compress(compression_options, raw.data(), raw.size(), + compressed_output) && + GoodCompressionRatio(compressed_output->size(), raw.size())) { + return *compressed_output; + } + break; // fall back to no compression. + case kLZ4HCCompression: + if (port::LZ4HC_Compress(compression_options, raw.data(), raw.size(), + compressed_output) && + GoodCompressionRatio(compressed_output->size(), raw.size())) { + return *compressed_output; + } + break; // fall back to no compression. + default: {} // Do not recognize this compression type + } + + // Compression method is not supported, or not good compression ratio, so just + // fall back to uncompressed form. + *type = kNoCompression; + return raw; +} + +} // anonymous namespace + +// kBlockBasedTableMagicNumber was picked by running +// echo rocksdb.table.block_based | sha1sum +// and taking the leading 64 bits. +// Please note that kBlockBasedTableMagicNumber may also be accessed by +// other .cc files so it have to be explicitly declared with "extern". +extern const uint64_t kBlockBasedTableMagicNumber = 0x88e241b785f4cff7ull; +// We also support reading and writing legacy block based table format (for +// backwards compatibility) +extern const uint64_t kLegacyBlockBasedTableMagicNumber = 0xdb4775248b80fb57ull; + +// A collector that collects properties of interest to block-based table. +// For now this class looks heavy-weight since we only write one additional +// property. +// But in the forseeable future, we will add more and more properties that are +// specific to block-based table. +class BlockBasedTableBuilder::BlockBasedTablePropertiesCollector + : public TablePropertiesCollector { + public: + explicit BlockBasedTablePropertiesCollector( + BlockBasedTableOptions::IndexType index_type) + : index_type_(index_type) {} + + virtual Status Add(const Slice& key, const Slice& value) { + // Intentionally left blank. Have no interest in collecting stats for + // individual key/value pairs. 
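For concreteness, the GoodCompressionRatio gate defined above keeps a compressed block only when it saves at least one eighth (12.5%) of the raw size; otherwise CompressBlock falls back to storing the block uncompressed. A standalone sketch with sample numbers:

    // The 12.5% gate in numbers: a 4096-byte block must compress to under
    // 4096 - 512 = 3584 bytes to be stored compressed.
    #include <cstddef>
    #include <cstdio>

    static bool GoodCompressionRatio(size_t compressed_size, size_t raw_size) {
      return compressed_size < raw_size - (raw_size / 8u);
    }

    int main() {
      std::printf("%d\n", GoodCompressionRatio(3500, 4096));  // 1: kept
      std::printf("%d\n", GoodCompressionRatio(3600, 4096));  // 0: falls back
      return 0;
    }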
+ return Status::OK(); + } + + virtual Status Finish(UserCollectedProperties* properties) { + std::string val; + PutFixed32(&val, static_cast(index_type_)); + properties->insert({BlockBasedTablePropertyNames::kIndexType, val}); + + return Status::OK(); + } + + // The name of the properties collector can be used for debugging purpose. + virtual const char* Name() const { + return "BlockBasedTablePropertiesCollector"; + } + + virtual UserCollectedProperties GetReadableProperties() const { + // Intentionally left blank. + return UserCollectedProperties(); + } + + private: + BlockBasedTableOptions::IndexType index_type_; +}; + +struct BlockBasedTableBuilder::Rep { + Options options; + const InternalKeyComparator& internal_comparator; + WritableFile* file; + uint64_t offset = 0; + Status status; + BlockBuilder data_block; + + InternalKeySliceTransform internal_prefix_transform; + std::unique_ptr index_builder; + + std::string last_key; + CompressionType compression_type; + ChecksumType checksum_type; + TableProperties props; + + bool closed = false; // Either Finish() or Abandon() has been called. + FilterBlockBuilder* filter_block; + char compressed_cache_key_prefix[BlockBasedTable::kMaxCacheKeyPrefixSize]; + size_t compressed_cache_key_prefix_size; + + BlockHandle pending_handle; // Handle to add to index block + + std::string compressed_output; + std::unique_ptr flush_block_policy; + + std::vector> + table_properties_collectors; + + Rep(const Options& opt, const InternalKeyComparator& icomparator, + WritableFile* f, FlushBlockPolicyFactory* flush_block_policy_factory, + CompressionType compression_type, IndexType index_block_type, + ChecksumType checksum_type) + : options(opt), + internal_comparator(icomparator), + file(f), + data_block(options, &internal_comparator), + internal_prefix_transform(options.prefix_extractor.get()), + index_builder(CreateIndexBuilder(index_block_type, &internal_comparator, + &this->internal_prefix_transform)), + compression_type(compression_type), + checksum_type(checksum_type), + filter_block(opt.filter_policy == nullptr + ? 
nullptr + : new FilterBlockBuilder(opt, &internal_comparator)), + flush_block_policy(flush_block_policy_factory->NewFlushBlockPolicy( + options, data_block)) { + for (auto& collector_factories : + options.table_properties_collector_factories) { + table_properties_collectors.emplace_back( + collector_factories->CreateTablePropertiesCollector()); + } + table_properties_collectors.emplace_back( + new BlockBasedTablePropertiesCollector(index_block_type)); + } +}; + +BlockBasedTableBuilder::BlockBasedTableBuilder( + const Options& options, const BlockBasedTableOptions& table_options, + const InternalKeyComparator& internal_comparator, WritableFile* file, + CompressionType compression_type) + : rep_(new Rep(options, internal_comparator, file, + table_options.flush_block_policy_factory.get(), + compression_type, table_options.index_type, + table_options.checksum)) { + if (rep_->filter_block != nullptr) { + rep_->filter_block->StartBlock(0); + } + if (options.block_cache_compressed.get() != nullptr) { + BlockBasedTable::GenerateCachePrefix( + options.block_cache_compressed.get(), file, + &rep_->compressed_cache_key_prefix[0], + &rep_->compressed_cache_key_prefix_size); + } +} + +BlockBasedTableBuilder::~BlockBasedTableBuilder() { + assert(rep_->closed); // Catch errors where caller forgot to call Finish() + delete rep_->filter_block; + delete rep_; +} + +void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) { + Rep* r = rep_; + assert(!r->closed); + if (!ok()) return; + if (r->props.num_entries > 0) { + assert(r->internal_comparator.Compare(key, Slice(r->last_key)) > 0); + } + r->index_builder->OnKeyAdded(key); + auto should_flush = r->flush_block_policy->Update(key, value); + if (should_flush) { + assert(!r->data_block.empty()); + Flush(); + + // Add item to index block. + // We do not emit the index entry for a block until we have seen the + // first key for the next data block. This allows us to use shorter + // keys in the index block. For example, consider a block boundary + // between the keys "the quick brown fox" and "the who". We can use + // "the r" as the key for the index block entry since it is >= all + // entries in the first block and < all entries in subsequent + // blocks. 
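The comparator call that realizes the comment above is FindShortestSeparator. A bytewise version in miniature, shown as a sketch rather than the patch's actual comparator:

    // A bytewise FindShortestSeparator in miniature: shrink *start to the
    // smallest key that is still >= *start and < limit. Sketch only; the
    // builder delegates to whatever Comparator is configured.
    #include <string>

    void FindShortestSeparator(std::string* start, const std::string& limit) {
      size_t min_len = start->size() < limit.size() ? start->size() : limit.size();
      size_t diff = 0;
      while (diff < min_len && (*start)[diff] == limit[diff]) ++diff;
      if (diff >= min_len) return;  // one key is a prefix of the other
      unsigned char byte = static_cast<unsigned char>((*start)[diff]);
      if (byte < 0xff && byte + 1 < static_cast<unsigned char>(limit[diff])) {
        (*start)[diff] = static_cast<char>(byte + 1);
        start->resize(diff + 1);  // "the quick brown fox" / "the who" -> "the r"
      }
    }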
+ if (ok()) { + r->index_builder->AddIndexEntry(&r->last_key, &key, r->pending_handle); + } + } + + if (r->filter_block != nullptr) { + r->filter_block->AddKey(key); + } + + r->last_key.assign(key.data(), key.size()); + r->data_block.Add(key, value); + r->props.num_entries++; + r->props.raw_key_size += key.size(); + r->props.raw_value_size += value.size(); + + NotifyCollectTableCollectorsOnAdd(key, value, r->table_properties_collectors, + r->options.info_log.get()); +} + +void BlockBasedTableBuilder::Flush() { + Rep* r = rep_; + assert(!r->closed); + if (!ok()) return; + if (r->data_block.empty()) return; + WriteBlock(&r->data_block, &r->pending_handle); + if (ok()) { + r->status = r->file->Flush(); + } + if (r->filter_block != nullptr) { + r->filter_block->StartBlock(r->offset); + } + r->props.data_size = r->offset; + ++r->props.num_data_blocks; +} + +void BlockBasedTableBuilder::WriteBlock(BlockBuilder* block, + BlockHandle* handle) { + WriteBlock(block->Finish(), handle); + block->Reset(); +} + +void BlockBasedTableBuilder::WriteBlock(const Slice& raw_block_contents, + BlockHandle* handle) { + // File format contains a sequence of blocks where each block has: + // block_data: uint8[n] + // type: uint8 + // crc: uint32 + assert(ok()); + Rep* r = rep_; + + auto type = r->compression_type; + auto block_contents = + CompressBlock(raw_block_contents, r->options.compression_opts, &type, + &r->compressed_output); + WriteRawBlock(block_contents, type, handle); + r->compressed_output.clear(); +} + +void BlockBasedTableBuilder::WriteRawBlock(const Slice& block_contents, + CompressionType type, + BlockHandle* handle) { + Rep* r = rep_; + StopWatch sw(r->options.env, r->options.statistics.get(), + WRITE_RAW_BLOCK_MICROS); + handle->set_offset(r->offset); + handle->set_size(block_contents.size()); + r->status = r->file->Append(block_contents); + if (r->status.ok()) { + char trailer[kBlockTrailerSize]; + trailer[0] = type; + char* trailer_without_type = trailer + 1; + switch (r->checksum_type) { + case kNoChecksum: + // we don't support no checksum yet + assert(false); + // intentional fallthrough in release binary + case kCRC32c: { + auto crc = crc32c::Value(block_contents.data(), block_contents.size()); + crc = crc32c::Extend(crc, trailer, 1); // Extend to cover block type + EncodeFixed32(trailer_without_type, crc32c::Mask(crc)); + break; + } + case kxxHash: { + void* xxh = XXH32_init(0); + XXH32_update(xxh, block_contents.data(), block_contents.size()); + XXH32_update(xxh, trailer, 1); // Extend to cover block type + EncodeFixed32(trailer_without_type, XXH32_digest(xxh)); + break; + } + } + + r->status = r->file->Append(Slice(trailer, kBlockTrailerSize)); + if (r->status.ok()) { + r->status = InsertBlockInCache(block_contents, type, handle); + } + if (r->status.ok()) { + r->offset += block_contents.size() + kBlockTrailerSize; + } + } +} + +Status BlockBasedTableBuilder::status() const { + return rep_->status; +} + +static void DeleteCachedBlock(const Slice& key, void* value) { + Block* block = reinterpret_cast(value); + delete block; +} + +// +// Make a copy of the block contents and insert into compressed block cache +// +Status BlockBasedTableBuilder::InsertBlockInCache(const Slice& block_contents, + const CompressionType type, + const BlockHandle* handle) { + Rep* r = rep_; + Cache* block_cache_compressed = r->options.block_cache_compressed.get(); + + if (type != kNoCompression && block_cache_compressed != nullptr) { + + Cache::Handle* cache_handle = nullptr; + size_t size = 
block_contents.size(); + + char* ubuf = new char[size]; // make a new copy + memcpy(ubuf, block_contents.data(), size); + + BlockContents results; + Slice sl(ubuf, size); + results.data = sl; + results.cachable = true; // XXX + results.heap_allocated = true; + results.compression_type = type; + + Block* block = new Block(results); + + // make cache key by appending the file offset to the cache prefix id + char* end = EncodeVarint64( + r->compressed_cache_key_prefix + + r->compressed_cache_key_prefix_size, + handle->offset()); + Slice key(r->compressed_cache_key_prefix, static_cast + (end - r->compressed_cache_key_prefix)); + + // Insert into compressed block cache. + cache_handle = block_cache_compressed->Insert(key, block, block->size(), + &DeleteCachedBlock); + block_cache_compressed->Release(cache_handle); + + // Invalidate OS cache. + r->file->InvalidateCache(r->offset, size); + } + return Status::OK(); +} + +Status BlockBasedTableBuilder::Finish() { + Rep* r = rep_; + bool empty_data_block = r->data_block.empty(); + Flush(); + assert(!r->closed); + r->closed = true; + + BlockHandle filter_block_handle, + metaindex_block_handle, + index_block_handle; + + // Write filter block + if (ok() && r->filter_block != nullptr) { + auto filter_contents = r->filter_block->Finish(); + r->props.filter_size = filter_contents.size(); + WriteRawBlock(filter_contents, kNoCompression, &filter_block_handle); + } + + // To make sure properties block is able to keep the accurate size of index + // block, we will finish writing all index entries here and flush them + // to storage after metaindex block is written. + if (ok() && !empty_data_block) { + r->index_builder->AddIndexEntry( + &r->last_key, nullptr /* no next data block */, r->pending_handle); + } + + IndexBuilder::IndexBlocks index_blocks; + auto s = r->index_builder->Finish(&index_blocks); + if (!s.ok()) { + return s; + } + + // Write meta blocks and metaindex block with the following order. + // 1. [meta block: filter] + // 2. [other meta blocks] + // 3. [meta block: properties] + // 4. [metaindex block] + // write meta blocks + MetaIndexBuilder meta_index_builder; + for (const auto& item : index_blocks.meta_blocks) { + BlockHandle block_handle; + WriteBlock(item.second, &block_handle); + meta_index_builder.Add(item.first, block_handle); + } + + if (ok()) { + if (r->filter_block != nullptr) { + // Add mapping from ".Name" to location + // of filter data. + std::string key = BlockBasedTable::kFilterBlockPrefix; + key.append(r->options.filter_policy->Name()); + meta_index_builder.Add(key, filter_block_handle); + } + + // Write properties block. + { + PropertyBlockBuilder property_block_builder; + std::vector failed_user_prop_collectors; + r->props.filter_policy_name = r->options.filter_policy != nullptr ? 
+ r->options.filter_policy->Name() : ""; + r->props.index_size = + r->index_builder->EstimatedSize() + kBlockTrailerSize; + + // Add basic properties + property_block_builder.AddTableProperty(r->props); + + // Add use collected properties + NotifyCollectTableCollectorsOnFinish(r->table_properties_collectors, + r->options.info_log.get(), + &property_block_builder); + + BlockHandle properties_block_handle; + WriteRawBlock( + property_block_builder.Finish(), + kNoCompression, + &properties_block_handle + ); + + meta_index_builder.Add(kPropertiesBlock, properties_block_handle); + } // end of properties block writing + } // meta blocks + + // Write index block + if (ok()) { + // flush the meta index block + WriteRawBlock(meta_index_builder.Finish(), kNoCompression, + &metaindex_block_handle); + WriteBlock(index_blocks.index_block_contents, &index_block_handle); + } + + // Write footer + if (ok()) { + // No need to write out new footer if we're using default checksum. + // We're writing legacy magic number because we want old versions of RocksDB + // be able to read files generated with new release (just in case if + // somebody wants to roll back after an upgrade) + // TODO(icanadi) at some point in the future, when we're absolutely sure + // nobody will roll back to RocksDB 2.x versions, retire the legacy magic + // number and always write new table files with new magic number + bool legacy = (r->checksum_type == kCRC32c); + Footer footer(legacy ? kLegacyBlockBasedTableMagicNumber + : kBlockBasedTableMagicNumber); + footer.set_metaindex_handle(metaindex_block_handle); + footer.set_index_handle(index_block_handle); + footer.set_checksum(r->checksum_type); + std::string footer_encoding; + footer.EncodeTo(&footer_encoding); + r->status = r->file->Append(footer_encoding); + if (r->status.ok()) { + r->offset += footer_encoding.size(); + } + } + + // Print out the table stats + if (ok()) { + // user collected properties + std::string user_collected; + user_collected.reserve(1024); + for (const auto& collector : r->table_properties_collectors) { + for (const auto& prop : collector->GetReadableProperties()) { + user_collected.append(prop.first); + user_collected.append("="); + user_collected.append(prop.second); + user_collected.append("; "); + } + } + + Log( + r->options.info_log, + "Table was constructed:\n" + " [basic properties]: %s\n" + " [user collected properties]: %s", + r->props.ToString().c_str(), + user_collected.c_str() + ); + } + + return r->status; +} + +void BlockBasedTableBuilder::Abandon() { + Rep* r = rep_; + assert(!r->closed); + r->closed = true; +} + +uint64_t BlockBasedTableBuilder::NumEntries() const { + return rep_->props.num_entries; +} + +uint64_t BlockBasedTableBuilder::FileSize() const { + return rep_->offset; +} + +const std::string BlockBasedTable::kFilterBlockPrefix = "filter."; + +} // namespace rocksdb diff --git a/table/block_based_table_builder.h b/table/block_based_table_builder.h new file mode 100644 index 0000000000..1fae6d0692 --- /dev/null +++ b/table/block_based_table_builder.h @@ -0,0 +1,92 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
See the AUTHORS file for names of contributors. + +#pragma once +#include + +#include "rocksdb/flush_block_policy.h" +#include "rocksdb/options.h" +#include "rocksdb/status.h" +#include "table/table_builder.h" + +namespace rocksdb { + +class BlockBuilder; +class BlockHandle; +class WritableFile; +struct BlockBasedTableOptions; + +class BlockBasedTableBuilder : public TableBuilder { + public: + // Create a builder that will store the contents of the table it is + // building in *file. Does not close the file. It is up to the + // caller to close the file after calling Finish(). + BlockBasedTableBuilder(const Options& options, + const BlockBasedTableOptions& table_options, + const InternalKeyComparator& internal_comparator, + WritableFile* file, CompressionType compression_type); + + // REQUIRES: Either Finish() or Abandon() has been called. + ~BlockBasedTableBuilder(); + + // Add key,value to the table being constructed. + // REQUIRES: key is after any previously added key according to comparator. + // REQUIRES: Finish(), Abandon() have not been called + void Add(const Slice& key, const Slice& value) override; + + // Return non-ok iff some error has been detected. + Status status() const override; + + // Finish building the table. Stops using the file passed to the + // constructor after this function returns. + // REQUIRES: Finish(), Abandon() have not been called + Status Finish() override; + + // Indicate that the contents of this builder should be abandoned. Stops + // using the file passed to the constructor after this function returns. + // If the caller is not going to call Finish(), it must call Abandon() + // before destroying this builder. + // REQUIRES: Finish(), Abandon() have not been called + void Abandon() override; + + // Number of calls to Add() so far. + uint64_t NumEntries() const override; + + // Size of the file generated so far. If invoked after a successful + // Finish() call, returns the size of the final generated file. + uint64_t FileSize() const override; + + private: + bool ok() const { return status().ok(); } + // Call block's Finish() method and then write the finalize block contents to + // file. + void WriteBlock(BlockBuilder* block, BlockHandle* handle); + // Directly write block content to the file. + void WriteBlock(const Slice& block_contents, BlockHandle* handle); + void WriteRawBlock(const Slice& data, CompressionType, BlockHandle* handle); + Status InsertBlockInCache(const Slice& block_contents, + const CompressionType type, + const BlockHandle* handle); + struct Rep; + class BlockBasedTablePropertiesCollectorFactory; + class BlockBasedTablePropertiesCollector; + Rep* rep_; + + // Advanced operation: flush any buffered key/value pairs to file. + // Can be used to ensure that two adjacent entries never live in + // the same data block. Most clients should not need to use this method. + // REQUIRES: Finish(), Abandon() have not been called + void Flush(); + + // No copying allowed + BlockBasedTableBuilder(const BlockBasedTableBuilder&) = delete; + void operator=(const BlockBasedTableBuilder&) = delete; +}; + +} // namespace rocksdb diff --git a/table/block_based_table_factory.cc b/table/block_based_table_factory.cc new file mode 100644 index 0000000000..22fd0dd939 --- /dev/null +++ b/table/block_based_table_factory.cc @@ -0,0 +1,63 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + + +#include "table/block_based_table_factory.h" + +#include +#include +#include + +#include "rocksdb/flush_block_policy.h" +#include "table/block_based_table_builder.h" +#include "table/block_based_table_reader.h" +#include "port/port.h" + +namespace rocksdb { + +BlockBasedTableFactory::BlockBasedTableFactory( + const BlockBasedTableOptions& table_options) + : table_options_(table_options) { + if (table_options_.flush_block_policy_factory == nullptr) { + table_options_.flush_block_policy_factory.reset( + new FlushBlockBySizePolicyFactory()); + } +} + +Status BlockBasedTableFactory::NewTableReader( + const Options& options, const EnvOptions& soptions, + const InternalKeyComparator& internal_comparator, + unique_ptr&& file, uint64_t file_size, + unique_ptr* table_reader) const { + return BlockBasedTable::Open(options, soptions, table_options_, + internal_comparator, std::move(file), file_size, + table_reader); +} + +TableBuilder* BlockBasedTableFactory::NewTableBuilder( + const Options& options, const InternalKeyComparator& internal_comparator, + WritableFile* file, CompressionType compression_type) const { + auto table_builder = new BlockBasedTableBuilder( + options, table_options_, internal_comparator, file, compression_type); + + return table_builder; +} + +TableFactory* NewBlockBasedTableFactory( + const BlockBasedTableOptions& table_options) { + return new BlockBasedTableFactory(table_options); +} + +const std::string BlockBasedTablePropertyNames::kIndexType = + "rocksdb.block.based.table.index.type"; +const std::string kHashIndexPrefixesBlock = "rocksdb.hashindex.prefixes"; +const std::string kHashIndexPrefixesMetadataBlock = + "rocksdb.hashindex.metadata"; + +} // namespace rocksdb diff --git a/table/block_based_table_factory.h b/table/block_based_table_factory.h new file mode 100644 index 0000000000..656b531aec --- /dev/null +++ b/table/block_based_table_factory.h @@ -0,0 +1,53 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
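Typical client wiring for the factory above might look like the following sketch; the option values are illustrative, and kHashSearch additionally requires a prefix_extractor to be configured:

    // Sketch of client wiring; not part of this patch.
    #include "rocksdb/options.h"
    #include "rocksdb/table.h"

    rocksdb::Options MakeOptions() {
      rocksdb::BlockBasedTableOptions table_options;
      table_options.index_type = rocksdb::BlockBasedTableOptions::kHashSearch;
      rocksdb::Options options;
      // NewBlockBasedTableFactory copies table_options; the constructor above
      // fills in a default FlushBlockBySizePolicyFactory if none was supplied.
      options.table_factory.reset(
          rocksdb::NewBlockBasedTableFactory(table_options));
      return options;
    }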
+ +#pragma once +#include + +#include +#include + +#include "rocksdb/flush_block_policy.h" +#include "rocksdb/options.h" +#include "rocksdb/table.h" + +namespace rocksdb { + +struct Options; +struct EnvOptions; + +using std::unique_ptr; +class BlockBasedTableBuilder; + +class BlockBasedTableFactory : public TableFactory { + public: + explicit BlockBasedTableFactory( + const BlockBasedTableOptions& table_options = BlockBasedTableOptions()); + + ~BlockBasedTableFactory() {} + + const char* Name() const override { return "BlockBasedTable"; } + + Status NewTableReader(const Options& options, const EnvOptions& soptions, + const InternalKeyComparator& internal_comparator, + unique_ptr&& file, uint64_t file_size, + unique_ptr* table_reader) const override; + + TableBuilder* NewTableBuilder( + const Options& options, const InternalKeyComparator& internal_comparator, + WritableFile* file, CompressionType compression_type) const override; + + private: + BlockBasedTableOptions table_options_; +}; + +extern const std::string kHashIndexPrefixesBlock; +extern const std::string kHashIndexPrefixesMetadataBlock; + +} // namespace rocksdb diff --git a/table/block_based_table_reader.cc b/table/block_based_table_reader.cc new file mode 100644 index 0000000000..71fff659a4 --- /dev/null +++ b/table/block_based_table_reader.cc @@ -0,0 +1,1176 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "table/block_based_table_reader.h" + +#include +#include + +#include "db/dbformat.h" + +#include "rocksdb/cache.h" +#include "rocksdb/comparator.h" +#include "rocksdb/env.h" +#include "rocksdb/filter_policy.h" +#include "rocksdb/iterator.h" +#include "rocksdb/options.h" +#include "rocksdb/statistics.h" +#include "rocksdb/table.h" +#include "rocksdb/table_properties.h" + +#include "table/block.h" +#include "table/filter_block.h" +#include "table/block_hash_index.h" +#include "table/format.h" +#include "table/meta_blocks.h" +#include "table/two_level_iterator.h" + +#include "util/coding.h" +#include "util/perf_context_imp.h" +#include "util/stop_watch.h" + +namespace rocksdb { + +extern const uint64_t kBlockBasedTableMagicNumber; +extern const std::string kHashIndexPrefixesBlock; +extern const std::string kHashIndexPrefixesMetadataBlock; +using std::unique_ptr; + +typedef BlockBasedTable::IndexReader IndexReader; + +namespace { + +// The longest the prefix of the cache key used to identify blocks can be. +// We are using the fact that we know for Posix files the unique ID is three +// varints. +// For some reason, compiling for iOS complains that this variable is unused +const size_t kMaxCacheKeyPrefixSize __attribute__((unused)) = + kMaxVarint64Length * 3 + 1; + +// Read the block identified by "handle" from "file". +// The only relevant option is options.verify_checksums for now. +// Set *didIO to true if didIO is not null. +// On failure return non-OK. 
+// On success fill *result and return OK - caller owns *result +Status ReadBlockFromFile(RandomAccessFile* file, const Footer& footer, + const ReadOptions& options, const BlockHandle& handle, + Block** result, Env* env, bool* didIO = nullptr, + bool do_uncompress = true) { + BlockContents contents; + Status s = ReadBlockContents(file, footer, options, handle, &contents, env, + do_uncompress); + if (s.ok()) { + *result = new Block(contents); + } + + if (didIO != nullptr) { + *didIO = true; + } + return s; +} + +// Delete the resource that is held by the iterator. +template +void DeleteHeldResource(void* arg, void* ignored) { + delete reinterpret_cast(arg); +} + +// Delete the entry resided in the cache. +template +void DeleteCachedEntry(const Slice& key, void* value) { + auto entry = reinterpret_cast(value); + delete entry; +} + +// Release the cached entry and decrement its ref count. +void ReleaseCachedEntry(void* arg, void* h) { + Cache* cache = reinterpret_cast(arg); + Cache::Handle* handle = reinterpret_cast(h); + cache->Release(handle); +} + +Slice GetCacheKey(const char* cache_key_prefix, size_t cache_key_prefix_size, + const BlockHandle& handle, char* cache_key) { + assert(cache_key != nullptr); + assert(cache_key_prefix_size != 0); + assert(cache_key_prefix_size <= kMaxCacheKeyPrefixSize); + memcpy(cache_key, cache_key_prefix, cache_key_prefix_size); + char* end = + EncodeVarint64(cache_key + cache_key_prefix_size, handle.offset()); + return Slice(cache_key, static_cast(end - cache_key)); +} + +Cache::Handle* GetEntryFromCache(Cache* block_cache, const Slice& key, + Tickers block_cache_miss_ticker, + Tickers block_cache_hit_ticker, + Statistics* statistics) { + auto cache_handle = block_cache->Lookup(key); + if (cache_handle != nullptr) { + PERF_COUNTER_ADD(block_cache_hit_count, 1); + // overall cache hit + RecordTick(statistics, BLOCK_CACHE_HIT); + // block-type specific cache hit + RecordTick(statistics, block_cache_hit_ticker); + } else { + // overall cache miss + RecordTick(statistics, BLOCK_CACHE_MISS); + // block-type specific cache miss + RecordTick(statistics, block_cache_miss_ticker); + } + + return cache_handle; +} + +} // namespace + +// -- IndexReader and its subclasses +// IndexReader is the interface that provide the functionality for index access. +class BlockBasedTable::IndexReader { + public: + explicit IndexReader(const Comparator* comparator) + : comparator_(comparator) {} + + virtual ~IndexReader() {} + + // Create an iterator for index access. + virtual Iterator* NewIterator() = 0; + + // The size of the index. + virtual size_t size() const = 0; + + protected: + const Comparator* comparator_; +}; + +// Index that allows binary search lookup for the first key of each block. +// This class can be viewed as a thin wrapper for `Block` class which already +// supports binary search. +class BinarySearchIndexReader : public IndexReader { + public: + // Read index from the file and create an intance for + // `BinarySearchIndexReader`. + // On success, index_reader will be populated; otherwise it will remain + // unmodified. 
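The GetCacheKey helper above concatenates a per-file prefix with the block offset encoded as a varint, which keeps block-cache keys unique across files sharing one cache. The same shape as a standalone sketch, with EncodeVarint64 re-sketched rather than taken from util/coding.h:

    // The shape of the cache keys built by GetCacheKey above: a per-file
    // prefix followed by the block offset as a varint.
    #include <cstdint>
    #include <string>

    static char* EncodeVarint64(char* dst, uint64_t v) {
      while (v >= 128) {
        *dst++ = static_cast<char>(v | 128);
        v >>= 7;
      }
      *dst++ = static_cast<char>(v);
      return dst;
    }

    std::string MakeBlockCacheKey(const std::string& file_prefix,
                                  uint64_t block_offset) {
      char buf[10];  // a varint64 occupies at most 10 bytes
      char* end = EncodeVarint64(buf, block_offset);
      std::string key = file_prefix;
      key.append(buf, static_cast<size_t>(end - buf));
      return key;  // unique per (file, offset) pair
    }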
+ static Status Create(RandomAccessFile* file, const Footer& footer, + const BlockHandle& index_handle, Env* env, + const Comparator* comparator, + IndexReader** index_reader) { + Block* index_block = nullptr; + auto s = ReadBlockFromFile(file, footer, ReadOptions(), index_handle, + &index_block, env); + + if (s.ok()) { + *index_reader = new BinarySearchIndexReader(comparator, index_block); + } + + return s; + } + + virtual Iterator* NewIterator() override { + return index_block_->NewIterator(comparator_); + } + + virtual size_t size() const override { return index_block_->size(); } + + private: + BinarySearchIndexReader(const Comparator* comparator, Block* index_block) + : IndexReader(comparator), index_block_(index_block) { + assert(index_block_ != nullptr); + } + std::unique_ptr index_block_; +}; + +// Index that leverages an internal hash table to quicken the lookup for a given +// key. +class HashIndexReader : public IndexReader { + public: + static Status Create(const SliceTransform* hash_key_extractor, + const Footer& footer, RandomAccessFile* file, Env* env, + const Comparator* comparator, + const BlockHandle& index_handle, + Iterator* meta_index_iter, IndexReader** index_reader) { + Block* index_block = nullptr; + auto s = ReadBlockFromFile(file, footer, ReadOptions(), index_handle, + &index_block, env); + + if (!s.ok()) { + return s; + } + + // Get prefixes block + BlockHandle prefixes_handle; + s = FindMetaBlock(meta_index_iter, kHashIndexPrefixesBlock, + &prefixes_handle); + if (!s.ok()) { + return s; + } + + // Get index metadata block + BlockHandle prefixes_meta_handle; + s = FindMetaBlock(meta_index_iter, kHashIndexPrefixesMetadataBlock, + &prefixes_meta_handle); + if (!s.ok()) { + return s; + } + + // Read contents for the blocks + BlockContents prefixes_contents; + s = ReadBlockContents(file, footer, ReadOptions(), prefixes_handle, + &prefixes_contents, env, true /* do decompression */); + if (!s.ok()) { + return s; + } + BlockContents prefixes_meta_contents; + s = ReadBlockContents(file, footer, ReadOptions(), prefixes_meta_handle, + &prefixes_meta_contents, env, + true /* do decompression */); + if (!s.ok()) { + if (prefixes_contents.heap_allocated) { + delete[] prefixes_contents.data.data(); + } + return s; + } + + auto new_index_reader = + new HashIndexReader(comparator, index_block, prefixes_contents); + BlockHashIndex* hash_index = nullptr; + s = CreateBlockHashIndex(hash_key_extractor, prefixes_contents.data, + prefixes_meta_contents.data, &hash_index); + if (!s.ok()) { + return s; + } + + new_index_reader->index_block_->SetBlockHashIndex(hash_index); + + *index_reader = new_index_reader; + + // release resources + if (prefixes_meta_contents.heap_allocated) { + delete[] prefixes_meta_contents.data.data(); + } + return s; + } + + virtual Iterator* NewIterator() override { + return index_block_->NewIterator(comparator_); + } + + virtual size_t size() const override { return index_block_->size(); } + + private: + HashIndexReader(const Comparator* comparator, Block* index_block, + const BlockContents& prefixes_contents) + : IndexReader(comparator), + index_block_(index_block), + prefixes_contents_(prefixes_contents) { + assert(index_block_ != nullptr); + } + + ~HashIndexReader() { + if (prefixes_contents_.heap_allocated) { + delete[] prefixes_contents_.data.data(); + } + } + + std::unique_ptr index_block_; + BlockContents prefixes_contents_; +}; + + +struct BlockBasedTable::Rep { + Rep(const EnvOptions& storage_options, + const InternalKeyComparator& 
internal_comparator) + : soptions(storage_options), internal_comparator(internal_comparator) {} + + Options options; + const EnvOptions& soptions; + const InternalKeyComparator& internal_comparator; + Status status; + unique_ptr file; + char cache_key_prefix[kMaxCacheKeyPrefixSize]; + size_t cache_key_prefix_size = 0; + char compressed_cache_key_prefix[kMaxCacheKeyPrefixSize]; + size_t compressed_cache_key_prefix_size = 0; + + // Footer contains the fixed table information + Footer footer; + // index_reader and filter will be populated and used only when + // options.block_cache is nullptr; otherwise we will get the index block via + // the block cache. + unique_ptr index_reader; + unique_ptr filter; + + std::shared_ptr table_properties; + BlockBasedTableOptions::IndexType index_type; + // TODO(kailiu) It is very ugly to use internal key in table, since table + // module should not be relying on db module. However to make things easier + // and compatible with existing code, we introduce a wrapper that allows + // block to extract prefix without knowing if a key is internal or not. + unique_ptr internal_prefix_transform; +}; + +BlockBasedTable::~BlockBasedTable() { + delete rep_; +} + +// CachableEntry represents the entries that *may* be fetched from block cache. +// field `value` is the item we want to get. +// field `cache_handle` is the cache handle to the block cache. If the value +// was not read from cache, `cache_handle` will be nullptr. +template +struct BlockBasedTable::CachableEntry { + CachableEntry(TValue* value, Cache::Handle* cache_handle) + : value(value) + , cache_handle(cache_handle) { + } + CachableEntry(): CachableEntry(nullptr, nullptr) { } + void Release(Cache* cache) { + if (cache_handle) { + cache->Release(cache_handle); + value = nullptr; + cache_handle = nullptr; + } + } + + TValue* value = nullptr; + // if the entry is from the cache, cache_handle will be populated. + Cache::Handle* cache_handle = nullptr; +}; + +// Helper function to setup the cache key's prefix for the Table. +void BlockBasedTable::SetupCacheKeyPrefix(Rep* rep) { + assert(kMaxCacheKeyPrefixSize >= 10); + rep->cache_key_prefix_size = 0; + rep->compressed_cache_key_prefix_size = 0; + if (rep->options.block_cache != nullptr) { + GenerateCachePrefix(rep->options.block_cache.get(), rep->file.get(), + &rep->cache_key_prefix[0], + &rep->cache_key_prefix_size); + } + if (rep->options.block_cache_compressed != nullptr) { + GenerateCachePrefix(rep->options.block_cache_compressed.get(), + rep->file.get(), &rep->compressed_cache_key_prefix[0], + &rep->compressed_cache_key_prefix_size); + } +} + +void BlockBasedTable::GenerateCachePrefix(Cache* cc, + RandomAccessFile* file, char* buffer, size_t* size) { + + // generate an id from the file + *size = file->GetUniqueId(buffer, kMaxCacheKeyPrefixSize); + + // If the prefix wasn't generated or was too long, + // create one from the cache. + if (*size == 0) { + char* end = EncodeVarint64(buffer, cc->NewId()); + *size = static_cast(end - buffer); + } +} + +void BlockBasedTable::GenerateCachePrefix(Cache* cc, + WritableFile* file, char* buffer, size_t* size) { + + // generate an id from the file + *size = file->GetUniqueId(buffer, kMaxCacheKeyPrefixSize); + + // If the prefix wasn't generated or was too long, + // create one from the cache. 
+ if (*size == 0) { + char* end = EncodeVarint64(buffer, cc->NewId()); + *size = static_cast(end - buffer); + } +} + +Status BlockBasedTable::Open(const Options& options, const EnvOptions& soptions, + const BlockBasedTableOptions& table_options, + const InternalKeyComparator& internal_comparator, + unique_ptr&& file, + uint64_t file_size, + unique_ptr* table_reader) { + table_reader->reset(); + + Footer footer(kBlockBasedTableMagicNumber); + auto s = ReadFooterFromFile(file.get(), file_size, &footer); + if (!s.ok()) return s; + + // We've successfully read the footer and the index block: we're + // ready to serve requests. + Rep* rep = new BlockBasedTable::Rep(soptions, internal_comparator); + rep->options = options; + rep->file = std::move(file); + rep->footer = footer; + rep->index_type = table_options.index_type; + SetupCacheKeyPrefix(rep); + unique_ptr new_table(new BlockBasedTable(rep)); + + // Read meta index + std::unique_ptr meta; + std::unique_ptr meta_iter; + s = ReadMetaBlock(rep, &meta, &meta_iter); + + // Read the properties + bool found_properties_block = true; + s = SeekToPropertiesBlock(meta_iter.get(), &found_properties_block); + + if (found_properties_block) { + s = meta_iter->status(); + TableProperties* table_properties = nullptr; + if (s.ok()) { + s = ReadProperties(meta_iter->value(), rep->file.get(), rep->footer, + rep->options.env, rep->options.info_log.get(), + &table_properties); + } + + if (!s.ok()) { + auto err_msg = + "[Warning] Encountered error while reading data from properties " + "block " + s.ToString(); + Log(rep->options.info_log, "%s", err_msg.c_str()); + } else { + rep->table_properties.reset(table_properties); + } + } else { + Log(WARN_LEVEL, rep->options.info_log, + "Cannot find Properties block from file."); + } + + // Will use block cache for index/filter blocks access? + if (options.block_cache && table_options.cache_index_and_filter_blocks) { + // Hack: Call NewIndexIterator() to implicitly add index to the block_cache + unique_ptr iter(new_table->NewIndexIterator(ReadOptions())); + s = iter->status(); + + if (s.ok()) { + // Hack: Call GetFilter() to implicitly add filter to the block_cache + auto filter_entry = new_table->GetFilter(); + filter_entry.Release(options.block_cache.get()); + } + } else { + // If we don't use block cache for index/filter blocks access, we'll + // pre-load these blocks, which will kept in member variables in Rep + // and with a same life-time as this table object. 
+    IndexReader* index_reader = nullptr;
+    // TODO: we never really verify checksums for index blocks
+    s = new_table->CreateIndexReader(&index_reader, meta_iter.get());
+
+    if (s.ok()) {
+      rep->index_reader.reset(index_reader);
+
+      // Set filter block
+      if (rep->options.filter_policy) {
+        std::string key = kFilterBlockPrefix;
+        key.append(rep->options.filter_policy->Name());
+        BlockHandle handle;
+        if (FindMetaBlock(meta_iter.get(), key, &handle).ok()) {
+          rep->filter.reset(ReadFilter(handle, rep));
+        }
+      }
+    } else {
+      delete index_reader;
+    }
+  }
+
+  if (s.ok()) {
+    *table_reader = std::move(new_table);
+  }
+
+  return s;
+}
+
+void BlockBasedTable::SetupForCompaction() {
+  switch (rep_->options.access_hint_on_compaction_start) {
+    case Options::NONE:
+      break;
+    case Options::NORMAL:
+      rep_->file->Hint(RandomAccessFile::NORMAL);
+      break;
+    case Options::SEQUENTIAL:
+      rep_->file->Hint(RandomAccessFile::SEQUENTIAL);
+      break;
+    case Options::WILLNEED:
+      rep_->file->Hint(RandomAccessFile::WILLNEED);
+      break;
+    default:
+      assert(false);
+  }
+  compaction_optimized_ = true;
+}
+
+std::shared_ptr<const TableProperties> BlockBasedTable::GetTableProperties()
+    const {
+  return rep_->table_properties;
+}
+
+// Load the meta-block from the file. On success, return the loaded meta block
+// and its iterator.
+Status BlockBasedTable::ReadMetaBlock(
+    Rep* rep,
+    std::unique_ptr<Block>* meta_block,
+    std::unique_ptr<Iterator>* iter) {
+  // TODO(sanjay): Skip this if footer.metaindex_handle() size indicates
+  // it is an empty block.
+  // TODO: we never really verify checksums for the meta index block
+  Block* meta = nullptr;
+  Status s = ReadBlockFromFile(
+      rep->file.get(),
+      rep->footer,
+      ReadOptions(),
+      rep->footer.metaindex_handle(),
+      &meta,
+      rep->options.env);
+
+  if (!s.ok()) {
+    auto err_msg =
+        "[Warning] Encountered error while reading data from meta index "
+        "block " + s.ToString();
+    Log(rep->options.info_log, "%s", err_msg.c_str());
+    delete meta;
+    return s;
+  }
+
+  meta_block->reset(meta);
+  // meta block uses bytewise comparator.
+  iter->reset(meta->NewIterator(BytewiseComparator()));
+  return Status::OK();
+}
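GetDataBlockFromCache() below implements a strict two-tier probe order. A compact sketch of that control flow, under the assumption that both caches are optional (the names here are illustrative, not part of the patch):

    // 1. Probe the uncompressed block cache; a hit costs no decompression.
    // 2. Otherwise probe the compressed block cache; on a hit, uncompress
    //    the block and promote it into the uncompressed cache.
    // 3. Otherwise report a miss so the caller reads the block from the file.
    enum class BlockSource { kUncompressedCache, kCompressedCache, kFile };

    template <typename Probe>
    BlockSource ClassifyBlockLookup(Probe uncompressed_hit,
                                    Probe compressed_hit) {
      if (uncompressed_hit()) return BlockSource::kUncompressedCache;
      if (compressed_hit()) return BlockSource::kCompressedCache;
      return BlockSource::kFile;
    }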
+Status BlockBasedTable::GetDataBlockFromCache(
+    const Slice& block_cache_key, const Slice& compressed_block_cache_key,
+    Cache* block_cache, Cache* block_cache_compressed, Statistics* statistics,
+    const ReadOptions& read_options,
+    BlockBasedTable::CachableEntry<Block>* block) {
+  Status s;
+  Block* compressed_block = nullptr;
+  Cache::Handle* block_cache_compressed_handle = nullptr;
+
+  // Lookup uncompressed cache first
+  if (block_cache != nullptr) {
+    block->cache_handle =
+        GetEntryFromCache(block_cache, block_cache_key, BLOCK_CACHE_DATA_MISS,
+                          BLOCK_CACHE_DATA_HIT, statistics);
+    if (block->cache_handle != nullptr) {
+      block->value =
+          reinterpret_cast<Block*>(block_cache->Value(block->cache_handle));
+      return s;
+    }
+  }
+
+  // If not found, search the compressed block cache.
+  assert(block->cache_handle == nullptr && block->value == nullptr);
+
+  if (block_cache_compressed == nullptr) {
+    return s;
+  }
+
+  assert(!compressed_block_cache_key.empty());
+  block_cache_compressed_handle =
+      block_cache_compressed->Lookup(compressed_block_cache_key);
+  // if we find the block in the compressed cache, uncompress it and insert
+  // it into the uncompressed cache
+  if (block_cache_compressed_handle == nullptr) {
+    RecordTick(statistics, BLOCK_CACHE_COMPRESSED_MISS);
+    return s;
+  }
+
+  // found compressed block
+  RecordTick(statistics, BLOCK_CACHE_COMPRESSED_HIT);
+  compressed_block = reinterpret_cast<Block*>(
+      block_cache_compressed->Value(block_cache_compressed_handle));
+  assert(compressed_block->compression_type() != kNoCompression);
+
+  // Retrieve the uncompressed contents into a new buffer
+  BlockContents contents;
+  s = UncompressBlockContents(compressed_block->data(),
+                              compressed_block->size(), &contents);
+
+  // Insert uncompressed block into block cache
+  if (s.ok()) {
+    block->value = new Block(contents);  // uncompressed block
+    assert(block->value->compression_type() == kNoCompression);
+    if (block_cache != nullptr && block->value->cachable() &&
+        read_options.fill_cache) {
+      block->cache_handle =
+          block_cache->Insert(block_cache_key, block->value,
+                              block->value->size(), &DeleteCachedEntry<Block>);
+      assert(reinterpret_cast<Block*>(
+                 block_cache->Value(block->cache_handle)) == block->value);
+    }
+  }
+
+  // Release hold on compressed cache entry
+  block_cache_compressed->Release(block_cache_compressed_handle);
+  return s;
+}
+
+Status BlockBasedTable::PutDataBlockToCache(
+    const Slice& block_cache_key, const Slice& compressed_block_cache_key,
+    Cache* block_cache, Cache* block_cache_compressed,
+    const ReadOptions& read_options, Statistics* statistics,
+    CachableEntry<Block>* block, Block* raw_block) {
+  assert(raw_block->compression_type() == kNoCompression ||
+         block_cache_compressed != nullptr);
+
+  Status s;
+  // Retrieve the uncompressed contents into a new buffer
+  BlockContents contents;
+  if (raw_block->compression_type() != kNoCompression) {
+    s = UncompressBlockContents(raw_block->data(), raw_block->size(),
+                                &contents);
+  }
+  if (!s.ok()) {
+    delete raw_block;
+    return s;
+  }
+
+  if (raw_block->compression_type() != kNoCompression) {
+    block->value = new Block(contents);  // uncompressed block
+  } else {
+    block->value = raw_block;
+    raw_block = nullptr;
+  }
+
+  // Insert compressed block into compressed block cache.
+  // Release the hold on the compressed cache entry immediately.
+  if (block_cache_compressed != nullptr && raw_block != nullptr &&
+      raw_block->cachable()) {
+    auto cache_handle = block_cache_compressed->Insert(
+        compressed_block_cache_key, raw_block, raw_block->size(),
+        &DeleteCachedEntry<Block>);
+    block_cache_compressed->Release(cache_handle);
+    RecordTick(statistics, BLOCK_CACHE_COMPRESSED_MISS);
+    // Prevent the code below from deleting this now-cached block.
+    raw_block = nullptr;
+  }
+  delete raw_block;
+
+  // insert into uncompressed block cache
+  assert((block->value->compression_type() == kNoCompression));
+  if (block_cache != nullptr && block->value->cachable()) {
+    block->cache_handle =
+        block_cache->Insert(block_cache_key, block->value, block->value->size(),
+                            &DeleteCachedEntry<Block>);
+    RecordTick(statistics, BLOCK_CACHE_ADD);
+    assert(reinterpret_cast<Block*>(block_cache->Value(block->cache_handle)) ==
+           block->value);
+  }
+
+  return s;
+}
+
+FilterBlockReader* BlockBasedTable::ReadFilter(const BlockHandle& filter_handle,
+                                               BlockBasedTable::Rep* rep,
+                                               size_t* filter_size) {
+  // TODO: We might want to unify with ReadBlockFromFile() if we start
+  // requiring checksum verification in Table::Open.
+  ReadOptions opt;
+  BlockContents block;
+  if (!ReadBlockContents(rep->file.get(), rep->footer, opt, filter_handle,
+                         &block, rep->options.env, false).ok()) {
+    return nullptr;
+  }
+
+  if (filter_size) {
+    *filter_size = block.data.size();
+  }
+
+  return new FilterBlockReader(
+      rep->options, block.data, block.heap_allocated);
+}
+
+BlockBasedTable::CachableEntry<FilterBlockReader> BlockBasedTable::GetFilter(
+    bool no_io) const {
+  // filter pre-populated
+  if (rep_->filter != nullptr) {
+    return {rep_->filter.get(), nullptr /* cache handle */};
+  }
+
+  if (rep_->options.filter_policy == nullptr /* do not use filter at all */ ||
+      rep_->options.block_cache == nullptr /* no block cache at all */) {
+    return {nullptr /* filter */, nullptr /* cache handle */};
+  }
+
+  // Fetching from the cache
+  Cache* block_cache = rep_->options.block_cache.get();
+  char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
+  auto key = GetCacheKey(
+      rep_->cache_key_prefix,
+      rep_->cache_key_prefix_size,
+      rep_->footer.metaindex_handle(),
+      cache_key
+  );
+
+  Statistics* statistics = rep_->options.statistics.get();
+  auto cache_handle =
+      GetEntryFromCache(block_cache, key, BLOCK_CACHE_FILTER_MISS,
+                        BLOCK_CACHE_FILTER_HIT, statistics);
+
+  FilterBlockReader* filter = nullptr;
+  if (cache_handle != nullptr) {
+    filter = reinterpret_cast<FilterBlockReader*>(
+        block_cache->Value(cache_handle));
+  } else if (no_io) {
+    // Do not invoke any io.
+    return CachableEntry<FilterBlockReader>();
+  } else {
+    size_t filter_size = 0;
+    std::unique_ptr<Block> meta;
+    std::unique_ptr<Iterator> iter;
+    auto s = ReadMetaBlock(rep_, &meta, &iter);
+
+    if (s.ok()) {
+      std::string filter_block_key = kFilterBlockPrefix;
+      filter_block_key.append(rep_->options.filter_policy->Name());
+      BlockHandle handle;
+      if (FindMetaBlock(iter.get(), filter_block_key, &handle).ok()) {
+        filter = ReadFilter(handle, rep_, &filter_size);
+        assert(filter);
+        assert(filter_size > 0);
+
+        cache_handle = block_cache->Insert(
+            key, filter, filter_size, &DeleteCachedEntry<FilterBlockReader>);
+        RecordTick(statistics, BLOCK_CACHE_ADD);
+      }
+    }
+  }
+
+  return { filter, cache_handle };
+}
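Every caller of GetFilter() has to observe the CachableEntry contract: if the entry came out of the block cache, its handle pins it, and the caller must release the handle (not delete the value) when done. A minimal sketch of that discipline with a hypothetical stand-in struct:

    #include "rocksdb/cache.h"

    // Hypothetical mirror of the CachableEntry fields defined earlier.
    struct CachedValue {
      void* value = nullptr;
      rocksdb::Cache::Handle* handle = nullptr;  // nullptr if not from cache
    };

    void ReleaseIfCached(rocksdb::Cache* cache, CachedValue* v) {
      if (v->handle != nullptr) {
        cache->Release(v->handle);  // unpin; the cache owns deletion
        v->handle = nullptr;
        v->value = nullptr;
      }
    }

Pre-loaded filters (rep_->filter) are returned with a null handle, so Release() is a no-op for them and the table object keeps ownership.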
+Iterator* BlockBasedTable::NewIndexIterator(const ReadOptions& read_options) {
+  // index reader has already been pre-populated.
+  if (rep_->index_reader) {
+    return rep_->index_reader->NewIterator();
+  }
+
+  bool no_io = read_options.read_tier == kBlockCacheTier;
+  Cache* block_cache = rep_->options.block_cache.get();
+  char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
+  auto key = GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size,
+                         rep_->footer.index_handle(), cache_key);
+  Statistics* statistics = rep_->options.statistics.get();
+  auto cache_handle =
+      GetEntryFromCache(block_cache, key, BLOCK_CACHE_INDEX_MISS,
+                        BLOCK_CACHE_INDEX_HIT, statistics);
+
+  if (cache_handle == nullptr && no_io) {
+    return NewErrorIterator(Status::Incomplete("no blocking io"));
+  }
+
+  IndexReader* index_reader = nullptr;
+  if (cache_handle != nullptr) {
+    index_reader =
+        reinterpret_cast<IndexReader*>(block_cache->Value(cache_handle));
+  } else {
+    // Create the index reader and put it in the cache.
+    Status s;
+    s = CreateIndexReader(&index_reader);
+
+    if (!s.ok()) {
+      // make sure if something goes wrong, index_reader shall remain intact.
+      assert(index_reader == nullptr);
+      return NewErrorIterator(s);
+    }
+
+    cache_handle = block_cache->Insert(key, index_reader, index_reader->size(),
+                                       &DeleteCachedEntry<IndexReader>);
+    RecordTick(statistics, BLOCK_CACHE_ADD);
+  }
+
+  assert(cache_handle);
+  auto iter = index_reader->NewIterator();
+  iter->RegisterCleanup(&ReleaseCachedEntry, block_cache, cache_handle);
+
+  return iter;
+}
+
+// Convert an index iterator value (i.e., an encoded BlockHandle)
+// into an iterator over the contents of the corresponding block.
+Iterator* BlockBasedTable::NewDataBlockIterator(Rep* rep,
+    const ReadOptions& ro, bool* didIO, const Slice& index_value) {
+  const bool no_io = (ro.read_tier == kBlockCacheTier);
+  Cache* block_cache = rep->options.block_cache.get();
+  Cache* block_cache_compressed = rep->options.block_cache_compressed.get();
+  CachableEntry<Block> block;
+
+  BlockHandle handle;
+  Slice input = index_value;
+  // We intentionally allow extra stuff in index_value so that we
+  // can add more features in the future.
+  Status s = handle.DecodeFrom(&input);
+
+  if (!s.ok()) {
+    return NewErrorIterator(s);
+  }
+
+  // If either block cache is enabled, we'll try to read from it.
+  if (block_cache != nullptr || block_cache_compressed != nullptr) {
+    Statistics* statistics = rep->options.statistics.get();
+    char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
+    char compressed_cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
+    Slice key, /* key to the block cache */
+        ckey /* key to the compressed block cache */;
+
+    // create key for block cache
+    if (block_cache != nullptr) {
+      key = GetCacheKey(rep->cache_key_prefix,
+                        rep->cache_key_prefix_size, handle, cache_key);
+    }
+
+    if (block_cache_compressed != nullptr) {
+      ckey = GetCacheKey(rep->compressed_cache_key_prefix,
+                         rep->compressed_cache_key_prefix_size, handle,
+                         compressed_cache_key);
+    }
+
+    s = GetDataBlockFromCache(key, ckey, block_cache, block_cache_compressed,
+                              statistics, ro, &block);
+
+    if (block.value == nullptr && !no_io && ro.fill_cache) {
+      Histograms histogram = READ_BLOCK_GET_MICROS;
+      Block* raw_block = nullptr;
+      {
+        StopWatch sw(rep->options.env, statistics, histogram);
+        s = ReadBlockFromFile(rep->file.get(), rep->footer, ro, handle,
+                              &raw_block, rep->options.env, didIO,
+                              block_cache_compressed == nullptr);
+      }
+
+      if (s.ok()) {
+        s = PutDataBlockToCache(key, ckey, block_cache, block_cache_compressed,
+                                ro, statistics, &block, raw_block);
+      }
+    }
+  }
+
+  // Didn't get any data from the block caches.
+  if (block.value == nullptr) {
+    if (no_io) {
+      // Could not read from block_cache and can't do IO
+      return NewErrorIterator(Status::Incomplete("no blocking io"));
+    }
+    s = ReadBlockFromFile(rep->file.get(), rep->footer, ro, handle,
+                          &block.value, rep->options.env, didIO);
+  }
+
+  Iterator* iter;
+  if (block.value != nullptr) {
+    iter = block.value->NewIterator(&rep->internal_comparator);
+    if (block.cache_handle != nullptr) {
+      iter->RegisterCleanup(&ReleaseCachedEntry, block_cache,
+                            block.cache_handle);
+    } else {
+      iter->RegisterCleanup(&DeleteHeldResource<Block>, block.value, nullptr);
+    }
+  } else {
+    iter = NewErrorIterator(s);
+  }
+  return iter;
+}
+
+class BlockBasedTable::BlockEntryIteratorState : public TwoLevelIteratorState {
+ public:
+  BlockEntryIteratorState(BlockBasedTable* table,
+                          const ReadOptions& read_options, bool* did_io)
+      : TwoLevelIteratorState(table->rep_->options.prefix_extractor != nullptr),
+        table_(table), read_options_(read_options), did_io_(did_io) {}
+
+  Iterator* NewSecondaryIterator(const Slice& index_value) override {
+    return NewDataBlockIterator(table_->rep_, read_options_, did_io_,
+                                index_value);
+  }
+
+  bool PrefixMayMatch(const Slice& internal_key) override {
+    return table_->PrefixMayMatch(internal_key);
+  }
+
+ private:
+  // Don't own table_
+  BlockBasedTable* table_;
+  const ReadOptions read_options_;
+  // Don't own did_io_
+  bool* did_io_;
+};
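NewDataBlockIterator() above treats the index entry's value as a serialized BlockHandle and deliberately tolerates trailing bytes. A small decoding sketch under those assumptions (DecodeIndexValue is a hypothetical helper, shown only to illustrate the format):

    #include "rocksdb/slice.h"
    #include "rocksdb/status.h"
    #include "table/format.h"  // BlockHandle

    rocksdb::Status DecodeIndexValue(const rocksdb::Slice& index_value,
                                     uint64_t* offset, uint64_t* size) {
      rocksdb::BlockHandle handle;
      rocksdb::Slice input = index_value;  // extra trailing bytes are allowed
      rocksdb::Status s = handle.DecodeFrom(&input);
      if (s.ok()) {
        *offset = handle.offset();  // where the block starts in the file
        *size = handle.size();      // block size, excluding the trailer
      }
      return s;
    }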
+// This will be broken if the user specifies an unusual implementation
+// of Options.comparator, or if the user specifies an unusual
+// definition of prefixes in Options.filter_policy. In particular, we
+// require the following three properties:
+//
+// 1) key.starts_with(prefix(key))
+// 2) Compare(prefix(key), key) <= 0.
+// 3) If Compare(key1, key2) <= 0, then Compare(prefix(key1), prefix(key2)) <= 0
+//
+// Otherwise, this method guarantees no I/O will be incurred.
+//
+// REQUIRES: this method shouldn't be called while the DB lock is held.
+bool BlockBasedTable::PrefixMayMatch(const Slice& internal_key) {
+  assert(rep_->options.prefix_extractor != nullptr);
+  auto prefix = rep_->options.prefix_extractor->Transform(
+      ExtractUserKey(internal_key));
+  InternalKey internal_key_prefix(prefix, 0, kTypeValue);
+  auto internal_prefix = internal_key_prefix.Encode();
+
+  bool may_match = true;
+  Status s;
+
+  if (!rep_->options.filter_policy) {
+    return true;
+  }
+
+  // To prevent any io operation in this method, we set `read_tier` to make
+  // sure we only read the index or filter when they have already been
+  // loaded into memory.
+  ReadOptions no_io_read_options;
+  no_io_read_options.read_tier = kBlockCacheTier;
+  unique_ptr<Iterator> iiter(NewIndexIterator(no_io_read_options));
+  iiter->Seek(internal_prefix);
+
+  if (!iiter->Valid()) {
+    // we're past end of file
+    // if it's incomplete, it means that we avoided I/O
+    // and we're not really sure that we're past the end
+    // of the file
+    may_match = iiter->status().IsIncomplete();
+  } else if (ExtractUserKey(iiter->key()).starts_with(
+                 ExtractUserKey(internal_prefix))) {
+    // we need to check for this subtle case because our only
+    // guarantee is that "the key is a string >= last key in that data
+    // block" according to the doc/table_format.txt spec.
+    //
+    // Suppose iiter->key() starts with the desired prefix; it is not
+    // necessarily the case that the corresponding data block will
+    // contain the prefix, since iiter->key() need not be in the
+    // block. However, the next data block may contain the prefix, so
+    // we return true to play it safe.
+    may_match = true;
+  } else {
+    // iiter->key() does NOT start with the desired prefix. Because
+    // Seek() finds the first key that is >= the seek target, this
+    // means that iiter->key() > prefix. Thus, any data blocks coming
+    // after the data block corresponding to iiter->key() cannot
+    // possibly contain the key. Thus, the corresponding data block
+    // is the only one which could potentially contain the prefix.
+    Slice handle_value = iiter->value();
+    BlockHandle handle;
+    s = handle.DecodeFrom(&handle_value);
+    assert(s.ok());
+    auto filter_entry = GetFilter(true /* no io */);
+    may_match =
+        filter_entry.value == nullptr ||
+        filter_entry.value->PrefixMayMatch(handle.offset(), internal_prefix);
+    filter_entry.Release(rep_->options.block_cache.get());
+  }
+
+  Statistics* statistics = rep_->options.statistics.get();
+  RecordTick(statistics, BLOOM_FILTER_PREFIX_CHECKED);
+  if (!may_match) {
+    RecordTick(statistics, BLOOM_FILTER_PREFIX_USEFUL);
+  }
+
+  return may_match;
+}
+
+Iterator* BlockBasedTable::NewIterator(const ReadOptions& read_options,
+                                       Arena* arena) {
+  return NewTwoLevelIterator(
+      new BlockEntryIteratorState(this, read_options, nullptr),
+      NewIndexIterator(read_options), arena);
+}
+
+Status BlockBasedTable::Get(
+    const ReadOptions& read_options, const Slice& key, void* handle_context,
+    bool (*result_handler)(void* handle_context, const ParsedInternalKey& k,
+                           const Slice& v, bool didIO),
+    void (*mark_key_may_exist_handler)(void* handle_context)) {
+  Status s;
+  Iterator* iiter = NewIndexIterator(read_options);
+  auto filter_entry = GetFilter(read_options.read_tier == kBlockCacheTier);
+  FilterBlockReader* filter = filter_entry.value;
+  bool done = false;
+  for (iiter->Seek(key); iiter->Valid() && !done; iiter->Next()) {
+    Slice handle_value = iiter->value();
+
+    BlockHandle handle;
+    bool may_not_exist_in_filter =
+        filter != nullptr &&
+        handle.DecodeFrom(&handle_value).ok() &&
+        !filter->KeyMayMatch(handle.offset(), key);
+
+    if (may_not_exist_in_filter) {
+      // Not found
+      // TODO: think about the interaction with Merge. If a user key cannot
+      // cross one data block, we should be fine.
+      RecordTick(rep_->options.statistics.get(), BLOOM_FILTER_USEFUL);
+      break;
+    } else {
+      bool didIO = false;
+      unique_ptr<Iterator> block_iter(
+          NewDataBlockIterator(rep_, read_options, &didIO, iiter->value()));
+
+      if (read_options.read_tier && block_iter->status().IsIncomplete()) {
+        // couldn't get the block from block_cache
+        // Update Saver.state to Found because we are only looking for whether
+        // we can guarantee the key is not there when "no_io" is set
+        (*mark_key_may_exist_handler)(handle_context);
+        break;
+      }
+
+      // Call the *saver function on each entry/block until it returns false
+      for (block_iter->Seek(key); block_iter->Valid(); block_iter->Next()) {
+        ParsedInternalKey parsed_key;
+        if (!ParseInternalKey(block_iter->key(), &parsed_key)) {
+          s = Status::Corruption(Slice());
+        }
+
+        if (!(*result_handler)(handle_context, parsed_key, block_iter->value(),
+                               didIO)) {
+          done = true;
+          break;
+        }
+      }
+      s = block_iter->status();
+    }
+  }
+
+  filter_entry.Release(rep_->options.block_cache.get());
+  if (s.ok()) {
+    s = iiter->status();
+  }
+  delete iiter;
+  return s;
+}
+
+namespace {
+bool SaveDidIO(void* arg, const ParsedInternalKey& key, const Slice& value,
+               bool didIO) {
+  *reinterpret_cast<bool*>(arg) = didIO;
+  return false;
+}
+}  // namespace
+
+bool BlockBasedTable::TEST_KeyInCache(const ReadOptions& options,
+                                      const Slice& key) {
+  // We use Get() as it has the logic that checks whether we read the
+  // block from disk or not.
+  bool didIO = false;
+  Status s = Get(options, key, &didIO, SaveDidIO);
+  assert(s.ok());
+  return !didIO;
+}
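Get() drives a saver-style callback: result_handler is invoked once per entry until it returns false, and SaveDidIO above is the smallest possible example. A slightly fuller sketch of a handler that extracts a value (SaverContext and SaveValue are hypothetical, not part of the patch):

    #include <string>
    #include "db/dbformat.h"

    struct SaverContext {
      rocksdb::Slice user_key;
      std::string value;
      bool found = false;
    };

    static bool SaveValue(void* arg, const rocksdb::ParsedInternalKey& key,
                          const rocksdb::Slice& value, bool /*did_io*/) {
      auto* ctx = static_cast<SaverContext*>(arg);
      if (key.user_key == ctx->user_key && key.type == rocksdb::kTypeValue) {
        ctx->value.assign(value.data(), value.size());
        ctx->found = true;
        return false;  // stop scanning: the newest matching entry wins
      }
      return true;  // keep scanning this and following entries
    }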
+// REQUIRES: The following fields of rep_ should have already been populated:
+//  1. file
+//  2. index_handle,
+//  3. options
+//  4. internal_comparator
+//  5. index_type
+Status BlockBasedTable::CreateIndexReader(IndexReader** index_reader,
+                                          Iterator* preloaded_meta_index_iter) {
+  // Some old versions of block-based tables don't have the index type present
+  // in the table properties. If that's the case we can safely use
+  // kBinarySearch.
+  auto index_type_on_file = BlockBasedTableOptions::kBinarySearch;
+  if (rep_->table_properties) {
+    auto& props = rep_->table_properties->user_collected_properties;
+    auto pos = props.find(BlockBasedTablePropertyNames::kIndexType);
+    if (pos != props.end()) {
+      index_type_on_file = static_cast<BlockBasedTableOptions::IndexType>(
+          DecodeFixed32(pos->second.c_str()));
+    }
+  }
+
+  auto file = rep_->file.get();
+  auto env = rep_->options.env;
+  auto comparator = &rep_->internal_comparator;
+  const Footer& footer = rep_->footer;
+
+  switch (index_type_on_file) {
+    case BlockBasedTableOptions::kBinarySearch: {
+      return BinarySearchIndexReader::Create(
+          file, footer, footer.index_handle(), env, comparator, index_reader);
+    }
+    case BlockBasedTableOptions::kHashSearch: {
+      std::unique_ptr<Block> meta_guard;
+      std::unique_ptr<Iterator> meta_iter_guard;
+      auto meta_index_iter = preloaded_meta_index_iter;
+      if (meta_index_iter == nullptr) {
+        auto s = ReadMetaBlock(rep_, &meta_guard, &meta_iter_guard);
+        if (!s.ok()) {
+          return Status::Corruption("Unable to read the metaindex block");
+        }
+        meta_index_iter = meta_iter_guard.get();
+      }
+
+      // We need to wrap data with internal_prefix_transform to make sure it
+      // can handle prefixes correctly.
+      if (rep_->options.prefix_extractor == nullptr) {
+        return Status::InvalidArgument(
+            "BlockBasedTableOptions::kHashSearch requires "
+            "options.prefix_extractor to be set.");
+      }
+
+      rep_->internal_prefix_transform.reset(
+          new InternalKeySliceTransform(rep_->options.prefix_extractor.get()));
+      return HashIndexReader::Create(
+          rep_->internal_prefix_transform.get(), footer, file, env, comparator,
+          footer.index_handle(), meta_index_iter, index_reader);
+    }
+    default: {
+      std::string error_message =
+          "Unrecognized index type: " +
+          std::to_string(static_cast<int>(rep_->index_type));
+      return Status::InvalidArgument(error_message.c_str());
+    }
+  }
+}
+
+uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key) {
+  unique_ptr<Iterator> index_iter(NewIndexIterator(ReadOptions()));
+
+  index_iter->Seek(key);
+  uint64_t result;
+  if (index_iter->Valid()) {
+    BlockHandle handle;
+    Slice input = index_iter->value();
+    Status s = handle.DecodeFrom(&input);
+    if (s.ok()) {
+      result = handle.offset();
+    } else {
+      // Strange: we can't decode the block handle in the index block.
+      // We'll just return the offset of the metaindex block, which is
+      // close to the whole file size for this case.
+      result = rep_->footer.metaindex_handle().offset();
+    }
+  } else {
+    // key is past the last key in the file. If table_properties is not
+    // available, approximate the offset by returning the offset of the
+    // metaindex block (which is right near the end of the file).
+    result = 0;
+    if (rep_->table_properties) {
+      result = rep_->table_properties->data_size;
+    }
+    // table_properties is not present in the table.
+    if (result == 0) {
+      result = rep_->footer.metaindex_handle().offset();
+    }
+  }
+  return result;
+}
+
+bool BlockBasedTable::TEST_filter_block_preloaded() const {
+  return rep_->filter != nullptr;
+}
+
+bool BlockBasedTable::TEST_index_reader_preloaded() const {
+  return rep_->index_reader != nullptr;
+}
+
+} // namespace rocksdb
diff --git a/table/block_based_table_reader.h b/table/block_based_table_reader.h
new file mode 100644
index 0000000000..ba6a10c3e2
--- /dev/null
+++ b/table/block_based_table_reader.h
@@ -0,0 +1,201 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <stdint.h>
+#include <memory>
+#include <string>
+#include <utility>
+
+#include "rocksdb/statistics.h"
+#include "rocksdb/status.h"
+#include "rocksdb/table.h"
+#include "table/table_reader.h"
+#include "util/coding.h"
+
+namespace rocksdb {
+
+class Block;
+class BlockHandle;
+class Cache;
+class FilterBlockReader;
+class Footer;
+class InternalKeyComparator;
+class Iterator;
+class RandomAccessFile;
+class TableCache;
+class TableReader;
+class WritableFile;
+struct BlockBasedTableOptions;
+struct EnvOptions;
+struct Options;
+struct ReadOptions;
+
+using std::unique_ptr;
+
+// A Table is a sorted map from strings to strings. Tables are
+// immutable and persistent. A Table may be safely accessed from
+// multiple threads without external synchronization.
+class BlockBasedTable : public TableReader {
+ public:
+  static const std::string kFilterBlockPrefix;
+
+  // Attempt to open the table that is stored in bytes [0..file_size)
+  // of "file", and read the metadata entries necessary to allow
+  // retrieving data from the table.
+  //
+  // If successful, returns ok and sets "*table_reader" to the newly opened
+  // table. The client should delete "*table_reader" when no longer needed.
+  // If there was an error while initializing the table, sets "*table_reader"
+  // to nullptr and returns a non-ok status.
+  //
+  // *file must remain live while this Table is in use.
+  static Status Open(const Options& db_options, const EnvOptions& env_options,
+                     const BlockBasedTableOptions& table_options,
+                     const InternalKeyComparator& internal_key_comparator,
+                     unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
+                     unique_ptr<TableReader>* table_reader);
+
+  bool PrefixMayMatch(const Slice& internal_key);
+
+  // Returns a new iterator over the table contents.
+  // The result of NewIterator() is initially invalid (caller must
+  // call one of the Seek methods on the iterator before using it).
+  Iterator* NewIterator(const ReadOptions&, Arena* arena = nullptr) override;
+
+  Status Get(const ReadOptions& readOptions, const Slice& key,
+             void* handle_context,
+             bool (*result_handler)(void* handle_context,
+                                    const ParsedInternalKey& k, const Slice& v,
+                                    bool didIO),
+             void (*mark_key_may_exist_handler)(void* handle_context) =
+                 nullptr) override;
+
+  // Given a key, return an approximate byte offset in the file where
+  // the data for that key begins (or would begin if the key were
+  // present in the file). The returned value is in terms of file
+  // bytes, and so includes effects like compression of the underlying data.
+  // E.g., the approximate offset of the last key in the table will
+  // be close to the file length.
+  uint64_t ApproximateOffsetOf(const Slice& key) override;
+
+  // Returns true if the block for the specified key is in cache.
+  // REQUIRES: key is in this table.
+  bool TEST_KeyInCache(const ReadOptions& options, const Slice& key);
+
+  // Set up the table for compaction. Might change some parameters with
+  // posix_fadvise
+  void SetupForCompaction() override;
+
+  std::shared_ptr<const TableProperties> GetTableProperties() const override;
+
+  ~BlockBasedTable();
+
+  bool TEST_filter_block_preloaded() const;
+  bool TEST_index_reader_preloaded() const;
+  // Implementation of IndexReader will be exposed to internal cc file only.
+  class IndexReader;
+
+ private:
+  template <class TValue>
+  struct CachableEntry;
+
+  struct Rep;
+  Rep* rep_;
+  bool compaction_optimized_;
+
+  class BlockEntryIteratorState;
+  static Iterator* NewDataBlockIterator(Rep* rep, const ReadOptions& ro,
+                                        bool* didIO, const Slice& index_value);
+
+  // For the following two functions:
+  // if `no_io == true`, we will not try to read the filter/index from the sst
+  // file if they are not already present in the cache.
+  CachableEntry<FilterBlockReader> GetFilter(bool no_io = false) const;
+
+  // Get the iterator from the index reader.
+  //
+  // Note: ErrorIterator with Status::Incomplete shall be returned if all the
+  // following conditions are met:
+  //  1. We enabled table_options.cache_index_and_filter_blocks.
+  //  2. index is not present in block cache.
+  //  3. We disallowed any io to be performed, that is, read_options.read_tier
+  //     == kBlockCacheTier
+  Iterator* NewIndexIterator(const ReadOptions& read_options);
+
+  // Read a block from the block caches (if set): block_cache and
+  // block_cache_compressed.
+  // On success, Status::OK will be returned and @block will be populated with
+  // a pointer to the block, as well as its cache handle.
+  static Status GetDataBlockFromCache(
+      const Slice& block_cache_key, const Slice& compressed_block_cache_key,
+      Cache* block_cache, Cache* block_cache_compressed, Statistics* statistics,
+      const ReadOptions& read_options,
+      BlockBasedTable::CachableEntry<Block>* block);
+  // Put a raw block (maybe compressed) into the corresponding block caches.
+  // This method will perform decompression against raw_block if needed and
+  // then populate the block caches.
+  // On success, Status::OK will be returned; also @block will be populated
+  // with the uncompressed block and its cache handle.
+  //
+  // REQUIRES: raw_block is heap-allocated. PutDataBlockToCache() will be
+  // responsible for releasing its memory if an error occurs.
+  static Status PutDataBlockToCache(
+      const Slice& block_cache_key, const Slice& compressed_block_cache_key,
+      Cache* block_cache, Cache* block_cache_compressed,
+      const ReadOptions& read_options, Statistics* statistics,
+      CachableEntry<Block>* block, Block* raw_block);
+
+  // Calls (*handle_result)(arg, ...) repeatedly, starting with the entry found
+  // after a call to Seek(key), until handle_result returns false.
+  // May not make such a call if the filter policy says that the key is not
+  // present.
+  friend class TableCache;
+  friend class BlockBasedTableBuilder;
+
+  void ReadMeta(const Footer& footer);
+
+  // Create an index reader based on the index type stored in the table.
+  // Optionally, a caller can pass a preloaded meta_index_iter for indexes that
+  // need to access extra meta blocks during construction. This parameter
+  // helps avoid re-reading the meta index block if the caller already created
+  // one.
+  Status CreateIndexReader(IndexReader** index_reader,
+                           Iterator* preloaded_meta_index_iter = nullptr);
+
+  // Read the meta block from sst.
+  static Status ReadMetaBlock(
+      Rep* rep,
+      std::unique_ptr<Block>* meta_block,
+      std::unique_ptr<Iterator>* iter);
+
+  // Create the filter from the filter block.
+  static FilterBlockReader* ReadFilter(const BlockHandle& filter_handle,
+                                       Rep* rep, size_t* filter_size = nullptr);
+
+  static void SetupCacheKeyPrefix(Rep* rep);
+
+  explicit BlockBasedTable(Rep* rep)
+      : rep_(rep), compaction_optimized_(false) {}
+
+  // Generate a cache key prefix from the file
+  static void GenerateCachePrefix(Cache* cc,
+      RandomAccessFile* file, char* buffer, size_t* size);
+  static void GenerateCachePrefix(Cache* cc,
+      WritableFile* file, char* buffer, size_t* size);
+
+  // The longest prefix of the cache key used to identify blocks.
+  // For Posix files the unique ID is three varints.
+  static const size_t kMaxCacheKeyPrefixSize = kMaxVarint64Length*3+1;
+
+  // No copying allowed
+  explicit BlockBasedTable(const TableReader&) = delete;
+  void operator=(const TableReader&) = delete;
+};
+
+} // namespace rocksdb
diff --git a/table/block_builder.cc b/table/block_builder.cc
new file mode 100644
index 0000000000..f812dbae74
--- /dev/null
+++ b/table/block_builder.cc
@@ -0,0 +1,134 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// BlockBuilder generates blocks where keys are prefix-compressed:
+//
+// When we store a key, we drop the prefix shared with the previous
+// string. This helps reduce the space requirement significantly.
+// Furthermore, once every K keys, we do not apply the prefix
+// compression and store the entire key. We call this a "restart
+// point". The tail end of the block stores the offsets of all of the
+// restart points, and can be used to do a binary search when looking
+// for a particular key. Values are stored as-is (without compression)
+// immediately following the corresponding key.
+//
+// An entry for a particular key-value pair has the form:
+//     shared_bytes: varint32
+//     unshared_bytes: varint32
+//     value_length: varint32
+//     key_delta: char[unshared_bytes]
+//     value: char[value_length]
+// shared_bytes == 0 for restart points.
+//
+// The trailer of the block has the form:
+//     restarts: uint32[num_restarts]
+//     num_restarts: uint32
+// restarts[i] contains the offset within the block of the ith restart point.
+
+#include "table/block_builder.h"
+
+#include <algorithm>
+#include <assert.h>
+#include "rocksdb/comparator.h"
+#include "db/dbformat.h"
+#include "util/coding.h"
+
+namespace rocksdb {
+
+BlockBuilder::BlockBuilder(int block_restart_interval,
+                           const Comparator* comparator)
+    : block_restart_interval_(block_restart_interval),
+      comparator_(comparator),
+      restarts_(),
+      counter_(0),
+      finished_(false) {
+  assert(block_restart_interval_ >= 1);
+  restarts_.push_back(0);  // First restart point is at offset 0
+}
+
+BlockBuilder::BlockBuilder(const Options& options, const Comparator* comparator)
+    : BlockBuilder(options.block_restart_interval, comparator) {}
+
+void BlockBuilder::Reset() {
+  buffer_.clear();
+  restarts_.clear();
+  restarts_.push_back(0);  // First restart point is at offset 0
+  counter_ = 0;
+  finished_ = false;
+  last_key_.clear();
+}
+
+size_t BlockBuilder::CurrentSizeEstimate() const {
+  return (buffer_.size() +                       // Raw data buffer
+          restarts_.size() * sizeof(uint32_t) +  // Restart array
+          sizeof(uint32_t));                     // Restart array length
+}
+
+size_t BlockBuilder::EstimateSizeAfterKV(const Slice& key, const Slice& value)
+    const {
+  size_t estimate = CurrentSizeEstimate();
+  estimate += key.size() + value.size();
+  if (counter_ >= block_restart_interval_) {
+    estimate += sizeof(uint32_t);  // a new restart entry.
+  }
+
+  estimate += sizeof(int32_t);  // varint for shared prefix length.
+  estimate += VarintLength(key.size());  // varint for key length.
+  estimate += VarintLength(value.size());  // varint for value length.
+
+  return estimate;
+}
+
+Slice BlockBuilder::Finish() {
+  // Append restart array
+  for (size_t i = 0; i < restarts_.size(); i++) {
+    PutFixed32(&buffer_, restarts_[i]);
+  }
+  PutFixed32(&buffer_, restarts_.size());
+  finished_ = true;
+  return Slice(buffer_);
+}
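Add() below implements the entry layout documented at the top of this file. A self-contained sketch of encoding a single entry, assuming the varint helpers from util/coding.h (AppendPrefixCompressedEntry is a hypothetical name, not part of the patch):

    #include <algorithm>
    #include <string>
    #include "rocksdb/slice.h"
    #include "util/coding.h"  // PutVarint32

    void AppendPrefixCompressedEntry(std::string* buffer,
                                     const rocksdb::Slice& prev_key,
                                     const rocksdb::Slice& key,
                                     const rocksdb::Slice& value) {
      size_t shared = 0;
      const size_t min_len = std::min(prev_key.size(), key.size());
      while (shared < min_len && prev_key[shared] == key[shared]) shared++;
      rocksdb::PutVarint32(buffer, static_cast<uint32_t>(shared));
      rocksdb::PutVarint32(buffer, static_cast<uint32_t>(key.size() - shared));
      rocksdb::PutVarint32(buffer, static_cast<uint32_t>(value.size()));
      buffer->append(key.data() + shared, key.size() - shared);  // key delta
      buffer->append(value.data(), value.size());                // value
    }

At a restart point the builder simply encodes with prev_key empty, so shared_bytes is zero and the full key is stored.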
+
+void BlockBuilder::Add(const Slice& key, const Slice& value) {
+  Slice last_key_piece(last_key_);
+  assert(!finished_);
+  assert(counter_ <= block_restart_interval_);
+  assert(buffer_.empty()  // No values yet?
+         || comparator_->Compare(key, last_key_piece) > 0);
+  size_t shared = 0;
+  if (counter_ < block_restart_interval_) {
+    // See how much sharing to do with previous string
+    const size_t min_length = std::min(last_key_piece.size(), key.size());
+    while ((shared < min_length) && (last_key_piece[shared] == key[shared])) {
+      shared++;
+    }
+  } else {
+    // Restart compression
+    restarts_.push_back(buffer_.size());
+    counter_ = 0;
+  }
+  const size_t non_shared = key.size() - shared;
+
+  // Add "<shared><non_shared><value_size>" to buffer_
+  PutVarint32(&buffer_, shared);
+  PutVarint32(&buffer_, non_shared);
+  PutVarint32(&buffer_, value.size());
+
+  // Add string delta to buffer_ followed by value
+  buffer_.append(key.data() + shared, non_shared);
+  buffer_.append(value.data(), value.size());
+
+  // Update state
+  last_key_.resize(shared);
+  last_key_.append(key.data() + shared, non_shared);
+  assert(Slice(last_key_) == key);
+  counter_++;
+}
+
+} // namespace rocksdb
diff --git a/table/block_builder.h b/table/block_builder.h
new file mode 100644
index 0000000000..ed2f290fd2
--- /dev/null
+++ b/table/block_builder.h
@@ -0,0 +1,65 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <vector>
+
+#include <stdint.h>
+#include "rocksdb/slice.h"
+
+namespace rocksdb {
+
+struct Options;
+class Comparator;
+
+class BlockBuilder {
+ public:
+  BlockBuilder(int block_restart_interval, const Comparator* comparator);
+  explicit BlockBuilder(const Options& options, const Comparator* comparator);
+
+  // Reset the contents as if the BlockBuilder was just constructed.
+  void Reset();
+
+  // REQUIRES: Finish() has not been called since the last call to Reset().
+  // REQUIRES: key is larger than any previously added key
+  void Add(const Slice& key, const Slice& value);
+
+  // Finish building the block and return a slice that refers to the
+  // block contents. The returned slice will remain valid for the
+  // lifetime of this builder or until Reset() is called.
+  Slice Finish();
+
+  // Returns an estimate of the current (uncompressed) size of the block
+  // we are building.
+  size_t CurrentSizeEstimate() const;
+
+  // Returns an estimated block size after appending key and value.
+  size_t EstimateSizeAfterKV(const Slice& key, const Slice& value) const;
+
+  // Return true iff no entries have been added since the last Reset()
+  bool empty() const {
+    return buffer_.empty();
+  }
+
+ private:
+  const int block_restart_interval_;
+  const Comparator* comparator_;
+
+  std::string buffer_;              // Destination buffer
+  std::vector<uint32_t> restarts_;  // Restart points
+  int counter_;                     // Number of entries emitted since restart
+  bool finished_;                   // Has Finish() been called?
+  std::string last_key_;
+
+  // No copying allowed
+  BlockBuilder(const BlockBuilder&);
+  void operator=(const BlockBuilder&);
+};
+
+} // namespace rocksdb
diff --git a/table/block_hash_index.cc b/table/block_hash_index.cc
new file mode 100644
index 0000000000..7a6e219a0a
--- /dev/null
+++ b/table/block_hash_index.cc
@@ -0,0 +1,157 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+#include "table/block_hash_index.h"
+
+#include <algorithm>
+
+#include "rocksdb/comparator.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/slice_transform.h"
+#include "util/coding.h"
+
+namespace rocksdb {
+
+Status CreateBlockHashIndex(const SliceTransform* hash_key_extractor,
+                            const Slice& prefixes, const Slice& prefix_meta,
+                            BlockHashIndex** hash_index) {
+  uint64_t pos = 0;
+  auto meta_pos = prefix_meta;
+  Status s;
+  *hash_index = new BlockHashIndex(
+      hash_key_extractor,
+      false /* external module manages memory space for prefixes */);
+
+  while (!meta_pos.empty()) {
+    uint32_t prefix_size = 0;
+    uint32_t entry_index = 0;
+    uint32_t num_blocks = 0;
+    if (!GetVarint32(&meta_pos, &prefix_size) ||
+        !GetVarint32(&meta_pos, &entry_index) ||
+        !GetVarint32(&meta_pos, &num_blocks)) {
+      s = Status::Corruption(
+          "Corrupted prefix meta block: unable to read from it.");
+      break;
+    }
+    Slice prefix(prefixes.data() + pos, prefix_size);
+    (*hash_index)->Add(prefix, entry_index, num_blocks);
+
+    pos += prefix_size;
+  }
+
+  if (s.ok() && pos != prefixes.size()) {
+    s = Status::Corruption("Corrupted prefix meta block");
+  }
+
+  if (!s.ok()) {
+    delete *hash_index;
+  }
+
+  return s;
+}
+
+BlockHashIndex* CreateBlockHashIndexOnTheFly(
+    Iterator* index_iter, Iterator* data_iter, const uint32_t num_restarts,
+    const Comparator* comparator, const SliceTransform* hash_key_extractor) {
+  assert(hash_key_extractor);
+  auto hash_index = new BlockHashIndex(
+      hash_key_extractor,
+      true /* hash_index will copy prefix when Add() is called */);
+  uint64_t current_restart_index = 0;
+
+  std::string pending_entry_prefix;
+  // pending_block_num == 0 also implies there is no entry inserted at all.
+  uint32_t pending_block_num = 0;
+  uint32_t pending_entry_index = 0;
+
+  // scan all the entries and create a hash index based on their prefixes.
+  data_iter->SeekToFirst();
+  for (index_iter->SeekToFirst();
+       index_iter->Valid() && current_restart_index < num_restarts;
+       index_iter->Next()) {
+    Slice last_key_in_block = index_iter->key();
+    assert(data_iter->Valid() && data_iter->status().ok());
+
+    // scan through all entries within a data block.
+    while (data_iter->Valid() &&
+           comparator->Compare(data_iter->key(), last_key_in_block) <= 0) {
+      auto key_prefix = hash_key_extractor->Transform(data_iter->key());
+      bool is_first_entry = pending_block_num == 0;
+
+      // Keys may share the prefix
+      if (is_first_entry || pending_entry_prefix != key_prefix) {
+        if (!is_first_entry) {
+          bool succeeded = hash_index->Add(
+              pending_entry_prefix, pending_entry_index, pending_block_num);
+          if (!succeeded) {
+            delete hash_index;
+            return nullptr;
+          }
+        }
+
+        // Update the pending state. A hard copy is needed here because the
+        // underlying data may change as the iterator advances.
+        pending_entry_prefix = key_prefix.ToString();
+        pending_block_num = 1;
+        pending_entry_index = current_restart_index;
+      } else {
+        // The block count increments when keys sharing the prefix reside in
+        // different data blocks.
+        auto last_restart_index = pending_entry_index + pending_block_num - 1;
+        assert(last_restart_index <= current_restart_index);
+        if (last_restart_index != current_restart_index) {
+          ++pending_block_num;
+        }
+      }
+      data_iter->Next();
+    }
+
+    ++current_restart_index;
+  }
+
+  // make sure all entries have been scanned.
+  assert(!index_iter->Valid());
+  assert(!data_iter->Valid());
+
+  if (pending_block_num > 0) {
+    auto succeeded = hash_index->Add(pending_entry_prefix, pending_entry_index,
+                                     pending_block_num);
+    if (!succeeded) {
+      delete hash_index;
+      return nullptr;
+    }
+  }
+
+  return hash_index;
+}
+
+bool BlockHashIndex::Add(const Slice& prefix, uint32_t restart_index,
+                         uint32_t num_blocks) {
+  auto prefix_to_insert = prefix;
+  if (kOwnPrefixes) {
+    auto prefix_ptr = arena_.Allocate(prefix.size());
+    std::copy(prefix.data() /* begin */,
+              prefix.data() + prefix.size() /* end */,
+              prefix_ptr /* destination */);
+    prefix_to_insert = Slice(prefix_ptr, prefix.size());
+  }
+  auto result = restart_indices_.insert(
+      {prefix_to_insert, RestartIndex(restart_index, num_blocks)});
+  return result.second;
+}
+
+const BlockHashIndex::RestartIndex* BlockHashIndex::GetRestartIndex(
+    const Slice& key) {
+  auto key_prefix = hash_key_extractor_->Transform(key);
+
+  auto pos = restart_indices_.find(key_prefix);
+  if (pos == restart_indices_.end()) {
+    return nullptr;
+  }
+
+  return &pos->second;
+}
+
+} // namespace rocksdb
diff --git a/table/block_hash_index.h b/table/block_hash_index.h
new file mode 100644
index 0000000000..d5603d3660
--- /dev/null
+++ b/table/block_hash_index.h
@@ -0,0 +1,85 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+#pragma once
+
+#include <string>
+#include <unordered_map>
+
+#include "rocksdb/status.h"
+#include "util/arena.h"
+#include "util/murmurhash.h"
+
+namespace rocksdb {
+
+class Comparator;
+class Iterator;
+class Slice;
+class SliceTransform;
+
+// Build a hash-based index to speed up the lookup for "index block".
+// BlockHashIndex accepts a key and, if found, returns its restart index within
+// that index block.
+class BlockHashIndex {
+ public:
+  // Represents a restart index in the index block's restart array.
+  struct RestartIndex {
+    explicit RestartIndex(uint32_t first_index, uint32_t num_blocks = 1)
+        : first_index(first_index), num_blocks(num_blocks) {}
+
+    // For a given prefix, the restart index of the first data block
+    // that contains it.
+    uint32_t first_index = 0;
+
+    // How many data blocks contain this prefix?
+    uint32_t num_blocks = 1;
+  };
+
+  // @params own_prefixes indicates whether this index should own (copy) the
+  //     memory of the `key_prefix` passed to Add()
+  explicit BlockHashIndex(const SliceTransform* hash_key_extractor,
+                          bool own_prefixes)
+      : hash_key_extractor_(hash_key_extractor), kOwnPrefixes(own_prefixes) {}
+
+  // Maps a key to its restart first_index.
+  // Returns nullptr if no restart index is found for the key.
+  const RestartIndex* GetRestartIndex(const Slice& key);
+
+  bool Add(const Slice& key_prefix, uint32_t restart_index,
+           uint32_t num_blocks);
+
+  size_t ApproximateMemoryUsage() const {
+    return arena_.ApproximateMemoryUsage();
+  }
+
+ private:
+  const SliceTransform* hash_key_extractor_;
+  std::unordered_map<Slice, RestartIndex, murmur_hash> restart_indices_;
+
+  Arena arena_;
+  bool kOwnPrefixes;
+};
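A short usage sketch of the class above (the surrounding setup is assumed to exist): a point lookup maps the key's prefix to a narrow range of restart points instead of binary-searching the whole index block. LookupRestartRange is a hypothetical helper, shown only to illustrate the RestartIndex contract:

    #include "rocksdb/slice.h"
    #include "table/block_hash_index.h"

    // Returns true and fills [first, first + count) with the restart-point
    // range that may contain `target`; returns false if no data block holds
    // its prefix.
    bool LookupRestartRange(rocksdb::BlockHashIndex* hash_index,
                            const rocksdb::Slice& target,
                            uint32_t* first, uint32_t* count) {
      const auto* ri = hash_index->GetRestartIndex(target);
      if (ri == nullptr) {
        return false;  // iterator can be marked invalid without any search
      }
      *first = ri->first_index;
      *count = ri->num_blocks;
      return true;
    }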
+
+// Create hash index by reading from the metadata blocks.
+// @params prefixes: a sequence of prefixes.
+// @params prefix_meta: contains the metadata of the prefixes.
+Status CreateBlockHashIndex(const SliceTransform* hash_key_extractor,
+                            const Slice& prefixes, const Slice& prefix_meta,
+                            BlockHashIndex** hash_index);
+
+// Create hash index by scanning the entries in index as well as the whole
+// dataset.
+// @params index_iter: an iterator with the pointer to the first entry in a
+//     block.
+// @params data_iter: an iterator that can scan all the entries residing in a
+//     table.
+// @params num_restarts: used for correctness verification.
+// @params hash_key_extractor: extract the hashable part of a given key.
+// On error, nullptr will be returned.
+BlockHashIndex* CreateBlockHashIndexOnTheFly(
+    Iterator* index_iter, Iterator* data_iter, const uint32_t num_restarts,
+    const Comparator* comparator, const SliceTransform* hash_key_extractor);
+
+} // namespace rocksdb
diff --git a/table/block_hash_index_test.cc b/table/block_hash_index_test.cc
new file mode 100644
index 0000000000..6f7bcb2b76
--- /dev/null
+++ b/table/block_hash_index_test.cc
@@ -0,0 +1,117 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+#include <map>
+#include <string>
+#include <vector>
+
+#include "rocksdb/comparator.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/slice_transform.h"
+#include "table/block_hash_index.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+
+namespace rocksdb {
+
+typedef std::map<std::string, std::string> Data;
+
+class MapIterator : public Iterator {
+ public:
+  explicit MapIterator(const Data& data) : data_(data), pos_(data_.end()) {}
+
+  virtual bool Valid() const { return pos_ != data_.end(); }
+
+  virtual void SeekToFirst() { pos_ = data_.begin(); }
+
+  virtual void SeekToLast() {
+    pos_ = data_.end();
+    --pos_;
+  }
+
+  virtual void Seek(const Slice& target) {
+    pos_ = data_.find(target.ToString());
+  }
+
+  virtual void Next() { ++pos_; }
+
+  virtual void Prev() { --pos_; }
+
+  virtual Slice key() const { return pos_->first; }
+
+  virtual Slice value() const { return pos_->second; }
+
+  virtual Status status() const { return Status::OK(); }
+
+ private:
+  const Data& data_;
+  Data::const_iterator pos_;
+};
+
+class BlockTest {};
+
+TEST(BlockTest, BasicTest) {
+  const size_t keys_per_block = 4;
+  const size_t prefix_size = 2;
+  std::vector<std::string> keys = {/* block 1 */
+                                   "0101", "0102", "0103", "0201",
+                                   /* block 2 */
+                                   "0202", "0203", "0301", "0401",
+                                   /* block 3 */
+                                   "0501", "0601", "0701", "0801",
+                                   /* block 4 */
+                                   "0802", "0803", "0804", "0805",
+                                   /* block 5 */
+                                   "0806", "0807", "0808", "0809", };
+
+  Data data_entries;
+  for (const auto& key : keys) {
+    data_entries.insert({key, key});
+  }
+
+  Data index_entries;
+  for (size_t i = 3; i < keys.size(); i += keys_per_block) {
+    // simply ignore the value part
+    index_entries.insert({keys[i], ""});
+  }
+
+  MapIterator data_iter(data_entries);
+  MapIterator index_iter(index_entries);
+
+  auto prefix_extractor = NewFixedPrefixTransform(prefix_size);
+  std::unique_ptr<BlockHashIndex> block_hash_index(
+      CreateBlockHashIndexOnTheFly(&index_iter, &data_iter,
+                                   index_entries.size(), BytewiseComparator(),
+                                   prefix_extractor));
+
+  std::map<std::string, BlockHashIndex::RestartIndex> expected = {
+      {"01xx", BlockHashIndex::RestartIndex(0, 1)},
+      {"02yy", BlockHashIndex::RestartIndex(0, 2)},
+      {"03zz", BlockHashIndex::RestartIndex(1, 1)},
+      {"04pp", BlockHashIndex::RestartIndex(1, 1)},
+      {"05ww", BlockHashIndex::RestartIndex(2, 1)},
+      {"06xx", BlockHashIndex::RestartIndex(2, 1)},
      {"07pp", BlockHashIndex::RestartIndex(2, 1)},
+      {"08xz", BlockHashIndex::RestartIndex(2, 3)}, };
+
+  const BlockHashIndex::RestartIndex* index = nullptr;
+  // search prefixes that exist
+  for (const auto& item : expected) {
+    index = block_hash_index->GetRestartIndex(item.first);
+    ASSERT_TRUE(index != nullptr);
+    ASSERT_EQ(item.second.first_index, index->first_index);
+    ASSERT_EQ(item.second.num_blocks, index->num_blocks);
+  }
+
+  // search prefixes that do not exist
+  ASSERT_TRUE(!block_hash_index->GetRestartIndex("00xx"));
+  ASSERT_TRUE(!block_hash_index->GetRestartIndex("10yy"));
+  ASSERT_TRUE(!block_hash_index->GetRestartIndex("20zz"));
+
+  delete prefix_extractor;
+}
+
+} // namespace rocksdb
+
+int main(int argc, char** argv) { return rocksdb::test::RunAllTests(); }
diff --git a/table/block_test.cc b/table/block_test.cc
new file mode 100644
index 0000000000..8ef4a5a8d4
--- /dev/null
+++ b/table/block_test.cc
@@ -0,0 +1,242 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+#include <stdio.h>
+#include <string>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "db/memtable.h"
+#include "db/write_batch_internal.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/table.h"
+#include "rocksdb/slice_transform.h"
+#include "table/block.h"
+#include "table/block_builder.h"
+#include "table/format.h"
+#include "table/block_hash_index.h"
+#include "util/random.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+
+namespace rocksdb {
+
+static std::string RandomString(Random* rnd, int len) {
+  std::string r;
+  test::RandomString(rnd, len, &r);
+  return r;
+}
+std::string GenerateKey(int primary_key, int secondary_key, int padding_size,
+                        Random *rnd) {
+  char buf[50];
+  char *p = &buf[0];
+  snprintf(buf, sizeof(buf), "%6d%4d", primary_key, secondary_key);
+  std::string k(p);
+  if (padding_size) {
+    k += RandomString(rnd, padding_size);
+  }
+
+  return k;
+}
+
+// Generate random key-value pairs.
+// The generated keys will be sorted. You can tune the parameters to generate
+// different kinds of test key/value pairs for different scenarios.
+void GenerateRandomKVs(std::vector<std::string> *keys,
+                       std::vector<std::string> *values, const int from,
+                       const int len, const int step = 1,
+                       const int padding_size = 0,
+                       const int keys_share_prefix = 1) {
+  Random rnd(302);
+
+  // generate different prefixes
+  for (int i = from; i < from + len; i += step) {
+    // generate keys that share the prefix
+    for (int j = 0; j < keys_share_prefix; ++j) {
+      keys->emplace_back(GenerateKey(i, j, padding_size, &rnd));
+
+      // 100 byte values
+      values->emplace_back(RandomString(&rnd, 100));
+    }
+  }
+}
+
+class BlockTest {};
+
+// block test
+TEST(BlockTest, SimpleTest) {
+  Random rnd(301);
+  Options options = Options();
+  std::unique_ptr<InternalKeyComparator> ic;
+  ic.reset(new test::PlainInternalKeyComparator(options.comparator));
+
+  std::vector<std::string> keys;
+  std::vector<std::string> values;
+  BlockBuilder builder(options, ic.get());
+  int num_records = 100000;
+
+  GenerateRandomKVs(&keys, &values, 0, num_records);
+  // add a bunch of records to a block
+  for (int i = 0; i < num_records; i++) {
+    builder.Add(keys[i], values[i]);
+  }
+
+  // read serialized contents of the block
+  Slice rawblock = builder.Finish();
+
+  // create block reader
+  BlockContents contents;
+  contents.data = rawblock;
+  contents.cachable = false;
+  contents.heap_allocated = false;
+  Block reader(contents);
+
+  // read contents of block sequentially
+  int count = 0;
+  Iterator* iter = reader.NewIterator(options.comparator);
+  for (iter->SeekToFirst(); iter->Valid(); count++, iter->Next()) {
+    // read kv from block
+    Slice k = iter->key();
+    Slice v = iter->value();
+
+    // compare with lookaside array
+    ASSERT_EQ(k.ToString().compare(keys[count]), 0);
+    ASSERT_EQ(v.ToString().compare(values[count]), 0);
+  }
+  delete iter;
+
+  // read block contents randomly
+  iter = reader.NewIterator(options.comparator);
+  for (int i = 0; i < num_records; i++) {
+    // find a random key in the lookaside array
+    int index = rnd.Uniform(num_records);
+    Slice k(keys[index]);
+
+    // search in block for this key
+    iter->Seek(k);
+    ASSERT_TRUE(iter->Valid());
+    Slice v = iter->value();
+    ASSERT_EQ(v.ToString().compare(values[index]), 0);
+  }
+  delete iter;
+}
+
+// return the block contents
+BlockContents GetBlockContents(std::unique_ptr<BlockBuilder> *builder,
+                               const std::vector<std::string> &keys,
+                               const std::vector<std::string> &values,
+                               const int prefix_group_size = 1) {
+  builder->reset(
+      new BlockBuilder(1 /* restart interval */, BytewiseComparator()));
+
+  // Add all of the keys
+  for (size_t i = 0; i < keys.size(); ++i) {
+    (*builder)->Add(keys[i], values[i]);
+  }
+  Slice rawblock = (*builder)->Finish();
+
+  BlockContents contents;
+  contents.data = rawblock;
+  contents.cachable = false;
+  contents.heap_allocated = false;
+
+  return contents;
+}
+
+void CheckBlockContents(BlockContents contents, const int max_key,
+                        const std::vector<std::string> &keys,
+                        const std::vector<std::string> &values) {
+  const size_t prefix_size = 6;
+  // create block reader
+  Block reader1(contents);
+  Block reader2(contents);
+
+  std::unique_ptr<const SliceTransform> prefix_extractor(
+      NewFixedPrefixTransform(prefix_size));
+
+  {
+    auto iter1 = reader1.NewIterator(nullptr);
+    auto iter2 = reader1.NewIterator(nullptr);
+    reader1.SetBlockHashIndex(CreateBlockHashIndexOnTheFly(
+        iter1, iter2, keys.size(), BytewiseComparator(),
+        prefix_extractor.get()));
+
+    delete iter1;
+    delete iter2;
+  }
+
+  std::unique_ptr<Iterator> hash_iter(
+      reader1.NewIterator(BytewiseComparator()));
+
+  std::unique_ptr<Iterator> regular_iter(
+      reader2.NewIterator(BytewiseComparator()));
+
+  // Seek existent keys
+  for (size_t i = 0; i < keys.size(); i++) {
+    hash_iter->Seek(keys[i]);
    ASSERT_OK(hash_iter->status());
+    ASSERT_TRUE(hash_iter->Valid());
+
+    Slice v = hash_iter->value();
+    ASSERT_EQ(v.ToString().compare(values[i]), 0);
+  }
+
+  // Seek non-existent keys.
+  // For the hash index, if no key with a given prefix is found, the iterator
+  // will simply be set as invalid; whereas the binary-search-based iterator
+  // will return the one that is closest.
+  for (int i = 1; i < max_key - 1; i += 2) {
+    auto key = GenerateKey(i, 0, 0, nullptr);
+    hash_iter->Seek(key);
+    ASSERT_TRUE(!hash_iter->Valid());
+
+    regular_iter->Seek(key);
+    ASSERT_TRUE(regular_iter->Valid());
+  }
+}
+
+// In this test case, no two keys share the same prefix.
+TEST(BlockTest, SimpleIndexHash) {
+  const int kMaxKey = 100000;
+  std::vector<std::string> keys;
+  std::vector<std::string> values;
+  GenerateRandomKVs(&keys, &values, 0 /* first key id */,
+                    kMaxKey /* last key id */, 2 /* step */,
+                    8 /* padding size (8 bytes randomly generated suffix) */);
+
+  std::unique_ptr<BlockBuilder> builder;
+  auto contents = GetBlockContents(&builder, keys, values);
+
+  CheckBlockContents(contents, kMaxKey, keys, values);
+}
+
+TEST(BlockTest, IndexHashWithSharedPrefix) {
+  const int kMaxKey = 100000;
+  // for each prefix, there will be 5 keys starting with it.
+  const int kPrefixGroup = 5;
+  std::vector<std::string> keys;
+  std::vector<std::string> values;
+  // Generate keys with a shared prefix.
+  GenerateRandomKVs(&keys, &values, 0,  // first key id
+                    kMaxKey,            // last key id
+                    2,                  // step
+                    10,                 // padding size,
+                    kPrefixGroup);
+
+  std::unique_ptr<BlockBuilder> builder;
+  auto contents = GetBlockContents(&builder, keys, values, kPrefixGroup);
+
+  CheckBlockContents(contents, kMaxKey, keys, values);
+}
+
+} // namespace rocksdb
+
+int main(int argc, char** argv) {
+  return rocksdb::test::RunAllTests();
+}
diff --git a/table/filter_block.cc b/table/filter_block.cc
new file mode 100644
index 0000000000..3651a7d020
--- /dev/null
+++ b/table/filter_block.cc
@@ -0,0 +1,187 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "table/filter_block.h"
+
+#include "db/dbformat.h"
+#include "rocksdb/filter_policy.h"
+#include "util/coding.h"
+
+namespace rocksdb {
+
+// See doc/table_format.txt for an explanation of the filter block format.
+ +// Generate new filter every 2KB of data +static const size_t kFilterBaseLg = 11; +static const size_t kFilterBase = 1 << kFilterBaseLg; + +FilterBlockBuilder::FilterBlockBuilder(const Options& opt, + const Comparator* internal_comparator) + : policy_(opt.filter_policy), + prefix_extractor_(opt.prefix_extractor.get()), + whole_key_filtering_(opt.whole_key_filtering), + comparator_(internal_comparator) {} + +void FilterBlockBuilder::StartBlock(uint64_t block_offset) { + uint64_t filter_index = (block_offset / kFilterBase); + assert(filter_index >= filter_offsets_.size()); + while (filter_index > filter_offsets_.size()) { + GenerateFilter(); + } +} + +bool FilterBlockBuilder::SamePrefix(const Slice &key1, + const Slice &key2) const { + if (!prefix_extractor_->InDomain(key1) && + !prefix_extractor_->InDomain(key2)) { + return true; + } else if (!prefix_extractor_->InDomain(key1) || + !prefix_extractor_->InDomain(key2)) { + return false; + } else { + return (prefix_extractor_->Transform(key1) == + prefix_extractor_->Transform(key2)); + } +} + +void FilterBlockBuilder::AddKey(const Slice& key) { + // get slice for most recently added entry + Slice prev; + size_t added_to_start = 0; + + // add key to filter if needed + if (whole_key_filtering_) { + start_.push_back(entries_.size()); + ++added_to_start; + entries_.append(key.data(), key.size()); + } + + if (start_.size() > added_to_start) { + size_t prev_start = start_[start_.size() - 1 - added_to_start]; + const char* base = entries_.data() + prev_start; + size_t length = entries_.size() - prev_start; + prev = Slice(base, length); + } + + // add prefix to filter if needed + if (prefix_extractor_ && prefix_extractor_->InDomain(ExtractUserKey(key))) { + // If prefix_extractor_, this filter_block layer assumes we only + // operate on internal keys. + Slice user_key = ExtractUserKey(key); + // this assumes prefix(prefix(key)) == prefix(key), as the last + // entry in entries_ may be either a key or prefix, and we use + // prefix(last entry) to get the prefix of the last key. + if (prev.size() == 0 || + !SamePrefix(user_key, ExtractUserKey(prev))) { + Slice prefix = prefix_extractor_->Transform(user_key); + InternalKey internal_prefix_tmp(prefix, 0, kTypeValue); + Slice internal_prefix = internal_prefix_tmp.Encode(); + start_.push_back(entries_.size()); + entries_.append(internal_prefix.data(), internal_prefix.size()); + } + } +} + +Slice FilterBlockBuilder::Finish() { + if (!start_.empty()) { + GenerateFilter(); + } + + // Append array of per-filter offsets + const uint32_t array_offset = result_.size(); + for (size_t i = 0; i < filter_offsets_.size(); i++) { + PutFixed32(&result_, filter_offsets_[i]); + } + + PutFixed32(&result_, array_offset); + result_.push_back(kFilterBaseLg); // Save encoding parameter in result + return Slice(result_); +} + +void FilterBlockBuilder::GenerateFilter() { + const size_t num_entries = start_.size(); + if (num_entries == 0) { + // Fast path if there are no keys for this filter + filter_offsets_.push_back(result_.size()); + return; + } + + // Make list of keys from flattened key structure + start_.push_back(entries_.size()); // Simplify length computation + tmp_entries_.resize(num_entries); + for (size_t i = 0; i < num_entries; i++) { + const char* base = entries_.data() + start_[i]; + size_t length = start_[i+1] - start_[i]; + tmp_entries_[i] = Slice(base, length); + } + + // Generate filter for current set of keys and append to result_. 
+  filter_offsets_.push_back(result_.size());
+  policy_->CreateFilter(&tmp_entries_[0], num_entries, &result_);
+
+  tmp_entries_.clear();
+  entries_.clear();
+  start_.clear();
+}
+
+FilterBlockReader::FilterBlockReader(
+    const Options& opt, const Slice& contents, bool delete_contents_after_use)
+    : policy_(opt.filter_policy),
+      prefix_extractor_(opt.prefix_extractor.get()),
+      whole_key_filtering_(opt.whole_key_filtering),
+      data_(nullptr),
+      offset_(nullptr),
+      num_(0),
+      base_lg_(0) {
+  size_t n = contents.size();
+  if (n < 5) return;  // 1 byte for base_lg_ and 4 for start of offset array
+  base_lg_ = contents[n-1];
+  uint32_t last_word = DecodeFixed32(contents.data() + n - 5);
+  if (last_word > n - 5) return;
+  data_ = contents.data();
+  offset_ = data_ + last_word;
+  num_ = (n - 5 - last_word) / 4;
+  if (delete_contents_after_use) {
+    filter_data.reset(contents.data());
+  }
+}
+
+bool FilterBlockReader::KeyMayMatch(uint64_t block_offset,
+                                    const Slice& key) {
+  if (!whole_key_filtering_) {
+    return true;
+  }
+  return MayMatch(block_offset, key);
+}
+
+bool FilterBlockReader::PrefixMayMatch(uint64_t block_offset,
+                                       const Slice& prefix) {
+  if (!prefix_extractor_) {
+    return true;
+  }
+  return MayMatch(block_offset, prefix);
+}
+
+bool FilterBlockReader::MayMatch(uint64_t block_offset, const Slice& entry) {
+  uint64_t index = block_offset >> base_lg_;
+  if (index < num_) {
+    uint32_t start = DecodeFixed32(offset_ + index*4);
+    uint32_t limit = DecodeFixed32(offset_ + index*4 + 4);
+    if (start <= limit && limit <= (uint32_t)(offset_ - data_)) {
+      Slice filter = Slice(data_ + start, limit - start);
+      return policy_->KeyMayMatch(entry, filter);
+    } else if (start == limit) {
+      // Empty filters do not match any entries
+      return false;
+    }
+  }
+  return true;  // Errors are treated as potential matches
+}
+
+}  // namespace rocksdb
diff --git a/table/filter_block.h b/table/filter_block.h
new file mode 100644
index 0000000000..05c2bb9430
--- /dev/null
+++ b/table/filter_block.h
@@ -0,0 +1,92 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// A filter block is stored near the end of a Table file.  It contains
+// filters (e.g., bloom filters) for all data blocks in the table combined
+// into a single filter block.
+
+#pragma once
+
+#include <memory>
+#include <stddef.h>
+#include <stdint.h>
+#include <string>
+#include <vector>
+#include "rocksdb/options.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "util/hash.h"
+
+namespace rocksdb {
+
+class FilterPolicy;
+
+// A FilterBlockBuilder is used to construct all of the filters for a
+// particular Table.  It generates a single string which is stored as
+// a special block in the Table.
+//
+// The sequence of calls to FilterBlockBuilder must match the regexp:
+//      (StartBlock AddKey*)* Finish
+class FilterBlockBuilder {
+ public:
+  explicit FilterBlockBuilder(const Options& opt,
+                              const Comparator* internal_comparator);
+
+  void StartBlock(uint64_t block_offset);
+  void AddKey(const Slice& key);
+  Slice Finish();
+
+ private:
+  bool SamePrefix(const Slice &key1, const Slice &key2) const;
+  void GenerateFilter();
+
+  // important: all of these might point to invalid addresses
+  // at the time of destruction of this filter block. destructor
+  // should NOT dereference them.
+  const FilterPolicy* policy_;
+  const SliceTransform* prefix_extractor_;
+  bool whole_key_filtering_;
+  const Comparator* comparator_;
+
+  std::string entries_;             // Flattened entry contents
+  std::vector<size_t> start_;       // Starting index in entries_ of each entry
+  std::string result_;              // Filter data computed so far
+  std::vector<Slice> tmp_entries_;  // policy_->CreateFilter() argument
+  std::vector<uint32_t> filter_offsets_;
+
+  // No copying allowed
+  FilterBlockBuilder(const FilterBlockBuilder&);
+  void operator=(const FilterBlockBuilder&);
+};
+
+class FilterBlockReader {
+ public:
+  // REQUIRES: "contents" and *policy must stay live while *this is live.
+  FilterBlockReader(
+      const Options& opt,
+      const Slice& contents,
+      bool delete_contents_after_use = false);
+  bool KeyMayMatch(uint64_t block_offset, const Slice& key);
+  bool PrefixMayMatch(uint64_t block_offset, const Slice& prefix);
+
+ private:
+  const FilterPolicy* policy_;
+  const SliceTransform* prefix_extractor_;
+  bool whole_key_filtering_;
+  const char* data_;    // Pointer to filter data (at block-start)
+  const char* offset_;  // Pointer to beginning of offset array (at block-end)
+  size_t num_;          // Number of entries in offset array
+  size_t base_lg_;      // Encoding parameter (see kFilterBaseLg in .cc file)
+  std::unique_ptr<const char[]> filter_data;
+
+  bool MayMatch(uint64_t block_offset, const Slice& entry);
+};
+
+}  // namespace rocksdb
diff --git a/table/filter_block_test.cc b/table/filter_block_test.cc
new file mode 100644
index 0000000000..1703d59d17
--- /dev/null
+++ b/table/filter_block_test.cc
@@ -0,0 +1,139 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
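The call pattern documented in the header, (StartBlock AddKey*)* Finish, is
easiest to see end to end. A minimal round-trip sketch, assuming a Bloom
policy from rocksdb/filter_policy.h (NewBloomFilterPolicy) and the default
whole-key filtering; the function name is illustrative:

    #include <cassert>
    #include "rocksdb/filter_policy.h"
    #include "table/filter_block.h"

    void FilterRoundTrip() {
      rocksdb::Options options;
      options.filter_policy = rocksdb::NewBloomFilterPolicy(10);

      rocksdb::FilterBlockBuilder builder(options, options.comparator);
      builder.StartBlock(0);     // data block at file offset 0 -> slot 0
      builder.AddKey("alpha");
      builder.AddKey("beta");
      builder.StartBlock(4096);  // offset 4096 -> slot 2; slot 1 stays empty
      builder.AddKey("gamma");
      rocksdb::Slice block = builder.Finish();

      rocksdb::FilterBlockReader reader(options, block);
      assert(reader.KeyMayMatch(0, "alpha"));      // probes slot 0
      assert(reader.KeyMayMatch(4096, "gamma"));   // probes slot 2
      assert(!reader.KeyMayMatch(2048, "alpha"));  // slot 1 holds no keys
      delete options.filter_policy;
    }

StartBlock(4096) emits the filters for slots 0 and 1 before "gamma" is added,
which is how a range with no keys ends up with a zero-length filter that
rejects everything.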
+ +#include "table/filter_block.h" + +#include "rocksdb/filter_policy.h" +#include "util/coding.h" +#include "util/hash.h" +#include "util/logging.h" +#include "util/testharness.h" +#include "util/testutil.h" + +namespace rocksdb { + +// For testing: emit an array with one hash value per key +class TestHashFilter : public FilterPolicy { + public: + virtual const char* Name() const { + return "TestHashFilter"; + } + + virtual void CreateFilter(const Slice* keys, int n, std::string* dst) const { + for (int i = 0; i < n; i++) { + uint32_t h = Hash(keys[i].data(), keys[i].size(), 1); + PutFixed32(dst, h); + } + } + + virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const { + uint32_t h = Hash(key.data(), key.size(), 1); + for (unsigned int i = 0; i + 4 <= filter.size(); i += 4) { + if (h == DecodeFixed32(filter.data() + i)) { + return true; + } + } + return false; + } +}; + +class FilterBlockTest { + public: + TestHashFilter policy_; + Options options_; + + FilterBlockTest() { + options_ = Options(); + options_.filter_policy = &policy_; + } +}; + +TEST(FilterBlockTest, EmptyBuilder) { + FilterBlockBuilder builder(options_, options_.comparator); + Slice block = builder.Finish(); + ASSERT_EQ("\\x00\\x00\\x00\\x00\\x0b", EscapeString(block)); + FilterBlockReader reader(options_, block); + ASSERT_TRUE(reader.KeyMayMatch(0, "foo")); + ASSERT_TRUE(reader.KeyMayMatch(100000, "foo")); +} + +TEST(FilterBlockTest, SingleChunk) { + FilterBlockBuilder builder(options_, options_.comparator); + builder.StartBlock(100); + builder.AddKey("foo"); + builder.AddKey("bar"); + builder.AddKey("box"); + builder.StartBlock(200); + builder.AddKey("box"); + builder.StartBlock(300); + builder.AddKey("hello"); + Slice block = builder.Finish(); + FilterBlockReader reader(options_, block); + ASSERT_TRUE(reader.KeyMayMatch(100, "foo")); + ASSERT_TRUE(reader.KeyMayMatch(100, "bar")); + ASSERT_TRUE(reader.KeyMayMatch(100, "box")); + ASSERT_TRUE(reader.KeyMayMatch(100, "hello")); + ASSERT_TRUE(reader.KeyMayMatch(100, "foo")); + ASSERT_TRUE(! reader.KeyMayMatch(100, "missing")); + ASSERT_TRUE(! reader.KeyMayMatch(100, "other")); +} + +TEST(FilterBlockTest, MultiChunk) { + FilterBlockBuilder builder(options_, options_.comparator); + + // First filter + builder.StartBlock(0); + builder.AddKey("foo"); + builder.StartBlock(2000); + builder.AddKey("bar"); + + // Second filter + builder.StartBlock(3100); + builder.AddKey("box"); + + // Third filter is empty + + // Last filter + builder.StartBlock(9000); + builder.AddKey("box"); + builder.AddKey("hello"); + + Slice block = builder.Finish(); + FilterBlockReader reader(options_, block); + + // Check first filter + ASSERT_TRUE(reader.KeyMayMatch(0, "foo")); + ASSERT_TRUE(reader.KeyMayMatch(2000, "bar")); + ASSERT_TRUE(! reader.KeyMayMatch(0, "box")); + ASSERT_TRUE(! reader.KeyMayMatch(0, "hello")); + + // Check second filter + ASSERT_TRUE(reader.KeyMayMatch(3100, "box")); + ASSERT_TRUE(! reader.KeyMayMatch(3100, "foo")); + ASSERT_TRUE(! reader.KeyMayMatch(3100, "bar")); + ASSERT_TRUE(! reader.KeyMayMatch(3100, "hello")); + + // Check third filter (empty) + ASSERT_TRUE(! reader.KeyMayMatch(4100, "foo")); + ASSERT_TRUE(! reader.KeyMayMatch(4100, "bar")); + ASSERT_TRUE(! reader.KeyMayMatch(4100, "box")); + ASSERT_TRUE(! reader.KeyMayMatch(4100, "hello")); + + // Check last filter + ASSERT_TRUE(reader.KeyMayMatch(9000, "box")); + ASSERT_TRUE(reader.KeyMayMatch(9000, "hello")); + ASSERT_TRUE(! reader.KeyMayMatch(9000, "foo")); + ASSERT_TRUE(! 
reader.KeyMayMatch(9000, "bar"));
+}
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  return rocksdb::test::RunAllTests();
+}
diff --git a/table/flush_block_policy.cc b/table/flush_block_policy.cc
new file mode 100644
index 0000000000..4e2235205f
--- /dev/null
+++ b/table/flush_block_policy.cc
@@ -0,0 +1,70 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+#include "rocksdb/options.h"
+#include "rocksdb/flush_block_policy.h"
+#include "rocksdb/slice.h"
+#include "table/block_builder.h"
+
+#include <cassert>
+
+namespace rocksdb {
+
+// Flush block by size
+class FlushBlockBySizePolicy : public FlushBlockPolicy {
+ public:
+  // @params block_size:           Approximate size of user data packed per
+  //                               block.
+  // @params block_size_deviation: This is used to close a block before it
+  //                               reaches the configured block_size.
+  FlushBlockBySizePolicy(const uint64_t block_size,
+                         const uint64_t block_size_deviation,
+                         const BlockBuilder& data_block_builder) :
+      block_size_(block_size),
+      block_size_deviation_(block_size_deviation),
+      data_block_builder_(data_block_builder) {
+  }
+
+  virtual bool Update(const Slice& key,
+                      const Slice& value) override {
+    // it makes no sense to flush when the data block is empty
+    if (data_block_builder_.empty()) {
+      return false;
+    }
+
+    auto curr_size = data_block_builder_.CurrentSizeEstimate();
+
+    // Flush if one of the two conditions below is true:
+    // 1) the current estimated size already exceeds the block size,
+    // 2) block_size_deviation is set, the estimated size after appending
+    //    the kv would exceed the block size, and the current size is within
+    //    the deviation.
+    return curr_size >= block_size_ || BlockAlmostFull(key, value);
+  }
+
+ private:
+  bool BlockAlmostFull(const Slice& key, const Slice& value) const {
+    const auto curr_size = data_block_builder_.CurrentSizeEstimate();
+    const auto estimated_size_after =
+        data_block_builder_.EstimateSizeAfterKV(key, value);
+
+    return
+        estimated_size_after > block_size_ &&
+        block_size_deviation_ > 0 &&
+        curr_size * 100 > block_size_ * (100 - block_size_deviation_);
+  }
+
+  const uint64_t block_size_;
+  const uint64_t block_size_deviation_;
+  const BlockBuilder& data_block_builder_;
+};
+
+FlushBlockPolicy* FlushBlockBySizePolicyFactory::NewFlushBlockPolicy(
+    const Options& options, const BlockBuilder& data_block_builder) const {
+  return new FlushBlockBySizePolicy(
+      options.block_size, options.block_size_deviation, data_block_builder);
+}
+
+}  // namespace rocksdb
diff --git a/table/format.cc b/table/format.cc
new file mode 100644
index 0000000000..e9229dcf0f
--- /dev/null
+++ b/table/format.cc
@@ -0,0 +1,371 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
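The deviation condition in FlushBlockBySizePolicy above is easiest to check
with concrete numbers. A small self-contained restatement (the values 4096
and 10 are hypothetical, not defaults taken from the patch):

    #include <cstdint>

    // Close the block early iff the next KV would overflow it AND we are
    // already within block_size_deviation percent of the target size.
    bool WouldFlushEarly(uint64_t curr_size, uint64_t size_after_kv) {
      const uint64_t block_size = 4096;
      const uint64_t block_size_deviation = 10;  // percent
      return size_after_kv > block_size &&
             block_size_deviation > 0 &&
             curr_size * 100 > block_size * (100 - block_size_deviation);
    }

    // WouldFlushEarly(3700, 4200) == true   (3700 * 100 > 4096 * 90)
    // WouldFlushEarly(2000, 4200) == false  (still far from the target)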
+ +#include "table/format.h" + +#include +#include + +#include "port/port.h" +#include "rocksdb/env.h" +#include "table/block.h" +#include "util/coding.h" +#include "util/crc32c.h" +#include "util/perf_context_imp.h" +#include "util/xxhash.h" + +namespace rocksdb { + +extern const uint64_t kLegacyBlockBasedTableMagicNumber; +extern const uint64_t kBlockBasedTableMagicNumber; + +#ifndef ROCKSDB_LITE +extern const uint64_t kLegacyPlainTableMagicNumber; +extern const uint64_t kPlainTableMagicNumber; +#else +// ROCKSDB_LITE doesn't have plain table +const uint64_t kLegacyPlainTableMagicNumber = 0; +const uint64_t kPlainTableMagicNumber = 0; +#endif + +void BlockHandle::EncodeTo(std::string* dst) const { + // Sanity check that all fields have been set + assert(offset_ != ~static_cast(0)); + assert(size_ != ~static_cast(0)); + PutVarint64(dst, offset_); + PutVarint64(dst, size_); +} + +Status BlockHandle::DecodeFrom(Slice* input) { + if (GetVarint64(input, &offset_) && + GetVarint64(input, &size_)) { + return Status::OK(); + } else { + return Status::Corruption("bad block handle"); + } +} +const BlockHandle BlockHandle::kNullBlockHandle(0, 0); + +// legacy footer format: +// metaindex handle (varint64 offset, varint64 size) +// index handle (varint64 offset, varint64 size) +// to make the total size 2 * BlockHandle::kMaxEncodedLength +// table_magic_number (8 bytes) +// new footer format: +// checksum (char, 1 byte) +// metaindex handle (varint64 offset, varint64 size) +// index handle (varint64 offset, varint64 size) +// to make the total size 2 * BlockHandle::kMaxEncodedLength + 1 +// footer version (4 bytes) +// table_magic_number (8 bytes) +void Footer::EncodeTo(std::string* dst) const { + if (version() == kLegacyFooter) { + // has to be default checksum with legacy footer + assert(checksum_ == kCRC32c); + const size_t original_size = dst->size(); + metaindex_handle_.EncodeTo(dst); + index_handle_.EncodeTo(dst); + dst->resize(original_size + 2 * BlockHandle::kMaxEncodedLength); // Padding + PutFixed32(dst, static_cast(table_magic_number() & 0xffffffffu)); + PutFixed32(dst, static_cast(table_magic_number() >> 32)); + assert(dst->size() == original_size + kVersion0EncodedLength); + } else { + const size_t original_size = dst->size(); + dst->push_back(static_cast(checksum_)); + metaindex_handle_.EncodeTo(dst); + index_handle_.EncodeTo(dst); + dst->resize(original_size + kVersion1EncodedLength - 12); // Padding + PutFixed32(dst, kFooterVersion); + PutFixed32(dst, static_cast(table_magic_number() & 0xffffffffu)); + PutFixed32(dst, static_cast(table_magic_number() >> 32)); + assert(dst->size() == original_size + kVersion1EncodedLength); + } +} + +namespace { +inline bool IsLegacyFooterFormat(uint64_t magic_number) { + return magic_number == kLegacyBlockBasedTableMagicNumber || + magic_number == kLegacyPlainTableMagicNumber; +} + +inline uint64_t UpconvertLegacyFooterFormat(uint64_t magic_number) { + if (magic_number == kLegacyBlockBasedTableMagicNumber) { + return kBlockBasedTableMagicNumber; + } + if (magic_number == kLegacyPlainTableMagicNumber) { + return kPlainTableMagicNumber; + } + assert(false); + return 0; +} +} // namespace + +Footer::Footer(uint64_t table_magic_number) + : version_(IsLegacyFooterFormat(table_magic_number) ? 
+                                                        : kFooterVersion),
+      checksum_(kCRC32c),
+      table_magic_number_(table_magic_number) {}
+
+Status Footer::DecodeFrom(Slice* input) {
+  assert(input != nullptr);
+  assert(input->size() >= kMinEncodedLength);
+
+  const char *magic_ptr =
+      input->data() + input->size() - kMagicNumberLengthByte;
+  const uint32_t magic_lo = DecodeFixed32(magic_ptr);
+  const uint32_t magic_hi = DecodeFixed32(magic_ptr + 4);
+  uint64_t magic = ((static_cast<uint64_t>(magic_hi) << 32) |
+                    (static_cast<uint64_t>(magic_lo)));
+
+  // We check for legacy formats here and silently upconvert them
+  bool legacy = IsLegacyFooterFormat(magic);
+  if (legacy) {
+    magic = UpconvertLegacyFooterFormat(magic);
+  }
+  if (HasInitializedTableMagicNumber()) {
+    if (magic != table_magic_number()) {
+      char buffer[80];
+      snprintf(buffer, sizeof(buffer) - 1,
+               "not an sstable (bad magic number --- %lx)",
+               (long)magic);
+      return Status::InvalidArgument(buffer);
+    }
+  } else {
+    set_table_magic_number(magic);
+  }
+
+  if (legacy) {
+    // The size is already asserted to be at least kMinEncodedLength
+    // at the beginning of the function
+    input->remove_prefix(input->size() - kVersion0EncodedLength);
+    version_ = kLegacyFooter;
+    checksum_ = kCRC32c;
+  } else {
+    version_ = DecodeFixed32(magic_ptr - 4);
+    if (version_ != kFooterVersion) {
+      return Status::Corruption("bad footer version");
+    }
+    // Footer version 1 will always occupy exactly this many bytes.
+    // It consists of the checksum type, two block handles, padding,
+    // a version number, and a magic number
+    if (input->size() < kVersion1EncodedLength) {
+      return Status::InvalidArgument("input is too short to be an sstable");
+    } else {
+      input->remove_prefix(input->size() - kVersion1EncodedLength);
+    }
+    uint32_t checksum;
+    if (!GetVarint32(input, &checksum)) {
+      return Status::Corruption("bad checksum type");
+    }
+    checksum_ = static_cast<ChecksumType>(checksum);
+  }
+
+  Status result = metaindex_handle_.DecodeFrom(input);
+  if (result.ok()) {
+    result = index_handle_.DecodeFrom(input);
+  }
+  if (result.ok()) {
+    // We skip over any leftover data (just padding for now) in "input"
+    const char* end = magic_ptr + kMagicNumberLengthByte;
+    *input = Slice(end, input->data() + input->size() - end);
+  }
+  return result;
+}
+
+Status ReadFooterFromFile(RandomAccessFile* file,
+                          uint64_t file_size,
+                          Footer* footer) {
+  if (file_size < Footer::kMinEncodedLength) {
+    return Status::InvalidArgument("file is too short to be an sstable");
+  }
+
+  char footer_space[Footer::kMaxEncodedLength];
+  Slice footer_input;
+  size_t read_offset = (file_size > Footer::kMaxEncodedLength)
+                           ? (file_size - Footer::kMaxEncodedLength)
+                           : 0;
+  Status s = file->Read(read_offset, Footer::kMaxEncodedLength, &footer_input,
+                        footer_space);
+  if (!s.ok()) return s;
+
+  // Check that we actually read the whole footer from the file. It may be
+  // that size isn't correct.
+  if (footer_input.size() < Footer::kMinEncodedLength) {
+    return Status::InvalidArgument("file is too short to be an sstable");
+  }
+
+  return footer->DecodeFrom(&footer_input);
+}
+
+Status ReadBlockContents(RandomAccessFile* file,
+                         const Footer& footer,
+                         const ReadOptions& options,
+                         const BlockHandle& handle,
+                         BlockContents* result,
+                         Env* env,
+                         bool do_uncompress) {
+  result->data = Slice();
+  result->cachable = false;
+  result->heap_allocated = false;
+
+  // Read the block contents as well as the type/crc footer.
+  // See table_builder.cc for the code that built this structure.
+  size_t n = static_cast<size_t>(handle.size());
+  char* buf = new char[n + kBlockTrailerSize];
+  Slice contents;
+
+  PERF_TIMER_AUTO(block_read_time);
+  Status s = file->Read(handle.offset(), n + kBlockTrailerSize, &contents, buf);
+  PERF_TIMER_MEASURE(block_read_time);
+  PERF_COUNTER_ADD(block_read_count, 1);
+  PERF_COUNTER_ADD(block_read_byte, n + kBlockTrailerSize);
+
+  if (!s.ok()) {
+    delete[] buf;
+    return s;
+  }
+  if (contents.size() != n + kBlockTrailerSize) {
+    delete[] buf;
+    return Status::Corruption("truncated block read");
+  }
+
+  // Check the crc of the type and the block contents
+  const char* data = contents.data();  // Pointer to where Read put the data
+  if (options.verify_checksums) {
+    uint32_t value = DecodeFixed32(data + n + 1);
+    uint32_t actual = 0;
+    switch (footer.checksum()) {
+      case kCRC32c:
+        value = crc32c::Unmask(value);
+        actual = crc32c::Value(data, n + 1);
+        break;
+      case kxxHash:
+        actual = XXH32(data, n + 1, 0);
+        break;
+      default:
+        s = Status::Corruption("unknown checksum type");
+    }
+    if (s.ok() && actual != value) {
+      s = Status::Corruption("block checksum mismatch");
+    }
+    if (!s.ok()) {
+      delete[] buf;
+      return s;
+    }
+    PERF_TIMER_MEASURE(block_checksum_time);
+  }
+
+  rocksdb::CompressionType compression_type =
+      static_cast<rocksdb::CompressionType>(data[n]);
+  // If the caller has requested that the block not be uncompressed
+  if (!do_uncompress || compression_type == kNoCompression) {
+    if (data != buf) {
+      // File implementation gave us pointer to some other data.
+      // Use it directly under the assumption that it will be live
+      // while the file is open.
+      delete[] buf;
+      result->data = Slice(data, n);
+      result->heap_allocated = false;
+      result->cachable = false;  // Do not double-cache
+    } else {
+      result->data = Slice(buf, n);
+      result->heap_allocated = true;
+      result->cachable = true;
+    }
+    result->compression_type = compression_type;
+    s = Status::OK();
+  } else {
+    s = UncompressBlockContents(data, n, result);
+    delete[] buf;
+  }
+  PERF_TIMER_STOP(block_decompress_time);
+  return s;
+}
+
+//
+// The 'data' points to the raw block contents that were read in from file.
+// This method allocates a new heap buffer and the raw block
+// contents are uncompressed into this buffer. This
+// buffer is returned via 'result' and it is up to the caller to
+// free this buffer.
+Status UncompressBlockContents(const char* data, size_t n, + BlockContents* result) { + char* ubuf = nullptr; + int decompress_size = 0; + assert(data[n] != kNoCompression); + switch (data[n]) { + case kSnappyCompression: { + size_t ulength = 0; + static char snappy_corrupt_msg[] = + "Snappy not supported or corrupted Snappy compressed block contents"; + if (!port::Snappy_GetUncompressedLength(data, n, &ulength)) { + return Status::Corruption(snappy_corrupt_msg); + } + ubuf = new char[ulength]; + if (!port::Snappy_Uncompress(data, n, ubuf)) { + delete[] ubuf; + return Status::Corruption(snappy_corrupt_msg); + } + result->data = Slice(ubuf, ulength); + result->heap_allocated = true; + result->cachable = true; + break; + } + case kZlibCompression: + ubuf = port::Zlib_Uncompress(data, n, &decompress_size); + static char zlib_corrupt_msg[] = + "Zlib not supported or corrupted Zlib compressed block contents"; + if (!ubuf) { + return Status::Corruption(zlib_corrupt_msg); + } + result->data = Slice(ubuf, decompress_size); + result->heap_allocated = true; + result->cachable = true; + break; + case kBZip2Compression: + ubuf = port::BZip2_Uncompress(data, n, &decompress_size); + static char bzip2_corrupt_msg[] = + "Bzip2 not supported or corrupted Bzip2 compressed block contents"; + if (!ubuf) { + return Status::Corruption(bzip2_corrupt_msg); + } + result->data = Slice(ubuf, decompress_size); + result->heap_allocated = true; + result->cachable = true; + break; + case kLZ4Compression: + ubuf = port::LZ4_Uncompress(data, n, &decompress_size); + static char lz4_corrupt_msg[] = + "LZ4 not supported or corrupted LZ4 compressed block contents"; + if (!ubuf) { + return Status::Corruption(lz4_corrupt_msg); + } + result->data = Slice(ubuf, decompress_size); + result->heap_allocated = true; + result->cachable = true; + break; + case kLZ4HCCompression: + ubuf = port::LZ4_Uncompress(data, n, &decompress_size); + static char lz4hc_corrupt_msg[] = + "LZ4HC not supported or corrupted LZ4HC compressed block contents"; + if (!ubuf) { + return Status::Corruption(lz4hc_corrupt_msg); + } + result->data = Slice(ubuf, decompress_size); + result->heap_allocated = true; + result->cachable = true; + break; + default: + return Status::Corruption("bad block type"); + } + result->compression_type = kNoCompression; // not compressed any more + return Status::OK(); +} + +} // namespace rocksdb diff --git a/table/format.h b/table/format.h new file mode 100644 index 0000000000..a971c1a67c --- /dev/null +++ b/table/format.h @@ -0,0 +1,198 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include +#include +#include "rocksdb/slice.h" +#include "rocksdb/status.h" +#include "rocksdb/options.h" +#include "rocksdb/table.h" + +namespace rocksdb { + +class Block; +class RandomAccessFile; +struct ReadOptions; + +// the length of the magic number in bytes. +const int kMagicNumberLengthByte = 8; + +// BlockHandle is a pointer to the extent of a file that stores a data +// block or a meta block. 
+class BlockHandle { + public: + BlockHandle(); + BlockHandle(uint64_t offset, uint64_t size); + + // The offset of the block in the file. + uint64_t offset() const { return offset_; } + void set_offset(uint64_t offset) { offset_ = offset; } + + // The size of the stored block + uint64_t size() const { return size_; } + void set_size(uint64_t size) { size_ = size; } + + void EncodeTo(std::string* dst) const; + Status DecodeFrom(Slice* input); + + // if the block handle's offset and size are both "0", we will view it + // as a null block handle that points to no where. + bool IsNull() const { + return offset_ == 0 && size_ == 0; + } + + static const BlockHandle& NullBlockHandle() { + return kNullBlockHandle; + } + + // Maximum encoding length of a BlockHandle + enum { kMaxEncodedLength = 10 + 10 }; + + private: + uint64_t offset_ = 0; + uint64_t size_ = 0; + + static const BlockHandle kNullBlockHandle; +}; + +// Footer encapsulates the fixed information stored at the tail +// end of every table file. +class Footer { + public: + // Constructs a footer without specifying its table magic number. + // In such case, the table magic number of such footer should be + // initialized via @ReadFooterFromFile(). + Footer() : Footer(kInvalidTableMagicNumber) {} + + // @table_magic_number serves two purposes: + // 1. Identify different types of the tables. + // 2. Help us to identify if a given file is a valid sst. + explicit Footer(uint64_t table_magic_number); + + // The version of the footer in this file + uint32_t version() const { return version_; } + + // The checksum type used in this file + ChecksumType checksum() const { return checksum_; } + void set_checksum(const ChecksumType c) { checksum_ = c; } + + // The block handle for the metaindex block of the table + const BlockHandle& metaindex_handle() const { return metaindex_handle_; } + void set_metaindex_handle(const BlockHandle& h) { metaindex_handle_ = h; } + + // The block handle for the index block of the table + const BlockHandle& index_handle() const { return index_handle_; } + + void set_index_handle(const BlockHandle& h) { index_handle_ = h; } + + uint64_t table_magic_number() const { return table_magic_number_; } + + // The version of Footer we encode + enum { + kLegacyFooter = 0, + kFooterVersion = 1, + }; + + void EncodeTo(std::string* dst) const; + + // Set the current footer based on the input slice. If table_magic_number_ + // is not set (i.e., HasInitializedTableMagicNumber() is true), then this + // function will also initialize table_magic_number_. Otherwise, this + // function will verify whether the magic number specified in the input + // slice matches table_magic_number_ and update the current footer only + // when the test passes. + Status DecodeFrom(Slice* input); + + // Encoded length of a Footer. Note that the serialization of a Footer will + // always occupy at least kMinEncodedLength bytes. If fields are changed + // the version number should be incremented and kMaxEncodedLength should be + // increased accordingly. + enum { + // Footer version 0 (legacy) will always occupy exactly this many bytes. + // It consists of two block handles, padding, and a magic number. + kVersion0EncodedLength = 2 * BlockHandle::kMaxEncodedLength + 8, + // Footer version 1 will always occupy exactly this many bytes. 
+ // It consists of the checksum type, two block handles, padding, + // a version number, and a magic number + kVersion1EncodedLength = 1 + 2 * BlockHandle::kMaxEncodedLength + 4 + 8, + + kMinEncodedLength = kVersion0EncodedLength, + kMaxEncodedLength = kVersion1EncodedLength + }; + + static const uint64_t kInvalidTableMagicNumber = 0; + + private: + // REQUIRES: magic number wasn't initialized. + void set_table_magic_number(uint64_t magic_number) { + assert(!HasInitializedTableMagicNumber()); + table_magic_number_ = magic_number; + } + + // return true if @table_magic_number_ is set to a value different + // from @kInvalidTableMagicNumber. + bool HasInitializedTableMagicNumber() const { + return (table_magic_number_ != kInvalidTableMagicNumber); + } + + uint32_t version_; + ChecksumType checksum_; + BlockHandle metaindex_handle_; + BlockHandle index_handle_; + uint64_t table_magic_number_ = 0; +}; + +// Read the footer from file +Status ReadFooterFromFile(RandomAccessFile* file, + uint64_t file_size, + Footer* footer); + +// 1-byte type + 32-bit crc +static const size_t kBlockTrailerSize = 5; + +struct BlockContents { + Slice data; // Actual contents of data + bool cachable; // True iff data can be cached + bool heap_allocated; // True iff caller should delete[] data.data() + CompressionType compression_type; +}; + +// Read the block identified by "handle" from "file". On failure +// return non-OK. On success fill *result and return OK. +extern Status ReadBlockContents(RandomAccessFile* file, + const Footer& footer, + const ReadOptions& options, + const BlockHandle& handle, + BlockContents* result, + Env* env, + bool do_uncompress); + +// The 'data' points to the raw block contents read in from file. +// This method allocates a new heap buffer and the raw block +// contents are uncompresed into this buffer. This buffer is +// returned via 'result' and it is upto the caller to +// free this buffer. +extern Status UncompressBlockContents(const char* data, + size_t n, + BlockContents* result); + +// Implementation details follow. Clients should ignore, + +inline BlockHandle::BlockHandle() + : BlockHandle(~static_cast(0), + ~static_cast(0)) { +} + +inline BlockHandle::BlockHandle(uint64_t offset, uint64_t size) + : offset_(offset), + size_(size) { +} + +} // namespace rocksdb diff --git a/table/iter_heap.h b/table/iter_heap.h new file mode 100644 index 0000000000..9569d36389 --- /dev/null +++ b/table/iter_heap.h @@ -0,0 +1,44 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// + +#pragma once +#include + +#include "rocksdb/comparator.h" +#include "table/iterator_wrapper.h" + +namespace rocksdb { + +// Return the max of two keys. +class MaxIteratorComparator { + public: + MaxIteratorComparator(const Comparator* comparator) : + comparator_(comparator) {} + + bool operator()(IteratorWrapper* a, IteratorWrapper* b) { + return comparator_->Compare(a->key(), b->key()) <= 0; + } + private: + const Comparator* comparator_; +}; + +// Return the max of two keys. +class MinIteratorComparator { + public: + // if maxHeap is set comparator returns the max value. + // else returns the min Value. + // Can use to create a minHeap or a maxHeap. 
+ MinIteratorComparator(const Comparator* comparator) : + comparator_(comparator) {} + + bool operator()(IteratorWrapper* a, IteratorWrapper* b) { + return comparator_->Compare(a->key(), b->key()) > 0; + } + private: + const Comparator* comparator_; +}; + +} // namespace rocksdb diff --git a/table/iterator.cc b/table/iterator.cc new file mode 100644 index 0000000000..4c360205a2 --- /dev/null +++ b/table/iterator.cc @@ -0,0 +1,92 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "rocksdb/iterator.h" +#include "table/iterator_wrapper.h" +#include "util/arena.h" + +namespace rocksdb { + +Iterator::Iterator() { + cleanup_.function = nullptr; + cleanup_.next = nullptr; +} + +Iterator::~Iterator() { + if (cleanup_.function != nullptr) { + (*cleanup_.function)(cleanup_.arg1, cleanup_.arg2); + for (Cleanup* c = cleanup_.next; c != nullptr; ) { + (*c->function)(c->arg1, c->arg2); + Cleanup* next = c->next; + delete c; + c = next; + } + } +} + +void Iterator::RegisterCleanup(CleanupFunction func, void* arg1, void* arg2) { + assert(func != nullptr); + Cleanup* c; + if (cleanup_.function == nullptr) { + c = &cleanup_; + } else { + c = new Cleanup; + c->next = cleanup_.next; + cleanup_.next = c; + } + c->function = func; + c->arg1 = arg1; + c->arg2 = arg2; +} + +namespace { +class EmptyIterator : public Iterator { + public: + explicit EmptyIterator(const Status& s) : status_(s) { } + virtual bool Valid() const { return false; } + virtual void Seek(const Slice& target) { } + virtual void SeekToFirst() { } + virtual void SeekToLast() { } + virtual void Next() { assert(false); } + virtual void Prev() { assert(false); } + Slice key() const { assert(false); return Slice(); } + Slice value() const { assert(false); return Slice(); } + virtual Status status() const { return status_; } + private: + Status status_; +}; +} // namespace + +Iterator* NewEmptyIterator() { + return new EmptyIterator(Status::OK()); +} + +Iterator* NewEmptyIterator(Arena* arena) { + if (arena == nullptr) { + return NewEmptyIterator(); + } else { + auto mem = arena->AllocateAligned(sizeof(EmptyIterator)); + return new (mem) EmptyIterator(Status::OK()); + } +} + +Iterator* NewErrorIterator(const Status& status) { + return new EmptyIterator(status); +} + +Iterator* NewErrorIterator(const Status& status, Arena* arena) { + if (arena == nullptr) { + return NewErrorIterator(status); + } else { + auto mem = arena->AllocateAligned(sizeof(EmptyIterator)); + return new (mem) EmptyIterator(status); + } +} + +} // namespace rocksdb diff --git a/table/iterator_wrapper.h b/table/iterator_wrapper.h new file mode 100644 index 0000000000..502cacb3e8 --- /dev/null +++ b/table/iterator_wrapper.h @@ -0,0 +1,81 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include "rocksdb/iterator.h"
+
+namespace rocksdb {
+
+// An internal wrapper class with an interface similar to Iterator that
+// caches the valid() and key() results for an underlying iterator.
+// This can help avoid virtual function calls and also gives better
+// cache locality.
+class IteratorWrapper {
+ public:
+  IteratorWrapper(): iter_(nullptr), valid_(false) { }
+  explicit IteratorWrapper(Iterator* iter): iter_(nullptr) {
+    Set(iter);
+  }
+  ~IteratorWrapper() {}
+  Iterator* iter() const { return iter_; }
+
+  // Takes ownership of "iter" and will delete it when destroyed, or
+  // when Set() is invoked again.
+  void Set(Iterator* iter) {
+    delete iter_;
+    iter_ = iter;
+    if (iter_ == nullptr) {
+      valid_ = false;
+    } else {
+      Update();
+    }
+  }
+
+  void DeleteIter(bool is_arena_mode) {
+    if (!is_arena_mode) {
+      delete iter_;
+    } else {
+      iter_->~Iterator();
+    }
+  }
+
+  // Iterator interface methods
+  bool Valid() const        { return valid_; }
+  Slice key() const         { assert(Valid()); return key_; }
+  Slice value() const       { assert(Valid()); return iter_->value(); }
+  // Methods below require iter() != nullptr
+  Status status() const     { assert(iter_); return iter_->status(); }
+  void Next()               { assert(iter_); iter_->Next();        Update(); }
+  void Prev()               { assert(iter_); iter_->Prev();        Update(); }
+  void Seek(const Slice& k) { assert(iter_); iter_->Seek(k);       Update(); }
+  void SeekToFirst()        { assert(iter_); iter_->SeekToFirst(); Update(); }
+  void SeekToLast()         { assert(iter_); iter_->SeekToLast();  Update(); }
+
+ private:
+  void Update() {
+    valid_ = iter_->Valid();
+    if (valid_) {
+      key_ = iter_->key();
+    }
+  }
+
+  Iterator* iter_;
+  bool valid_;
+  Slice key_;
+};
+
+class Arena;
+// Return an empty iterator (yields nothing) allocated from arena.
+extern Iterator* NewEmptyIterator(Arena* arena);
+
+// Return an empty iterator with the specified status, allocated from arena.
+extern Iterator* NewErrorIterator(const Status& status, Arena* arena);
+
+}  // namespace rocksdb
diff --git a/table/merger.cc b/table/merger.cc
new file mode 100644
index 0000000000..9aab33ed36
--- /dev/null
+++ b/table/merger.cc
@@ -0,0 +1,356 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "table/merger.h"
+
+#include <vector>
+#include <queue>
+
+#include "rocksdb/comparator.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/options.h"
+#include "table/iter_heap.h"
+#include "table/iterator_wrapper.h"
+#include "util/arena.h"
+#include "util/stop_watch.h"
+#include "util/perf_context_imp.h"
+#include "util/autovector.h"
+
+namespace rocksdb {
+namespace {
+typedef std::priority_queue<
+    IteratorWrapper*,
+    std::vector<IteratorWrapper*>,
+    MaxIteratorComparator> MaxIterHeap;
+
+typedef std::priority_queue<
+    IteratorWrapper*,
+    std::vector<IteratorWrapper*>,
+    MinIteratorComparator> MinIterHeap;
+
+// Returns a new max-heap of IteratorWrappers using the provided Comparator.
+MaxIterHeap NewMaxIterHeap(const Comparator* comparator) {
+  return MaxIterHeap(MaxIteratorComparator(comparator));
+}
+
+// Returns a new min-heap of IteratorWrappers using the provided Comparator.
+MinIterHeap NewMinIterHeap(const Comparator* comparator) {
+  return MinIterHeap(MinIteratorComparator(comparator));
+}
+}  // namespace
+
+const size_t kNumIterReserve = 4;
+
+class MergingIterator : public Iterator {
+ public:
+  MergingIterator(const Comparator* comparator, Iterator** children, int n,
+                  bool is_arena_mode)
+      : is_arena_mode_(is_arena_mode),
+        comparator_(comparator),
+        current_(nullptr),
+        use_heap_(true),
+        direction_(kForward),
+        maxHeap_(NewMaxIterHeap(comparator_)),
+        minHeap_(NewMinIterHeap(comparator_)) {
+    children_.resize(n);
+    for (int i = 0; i < n; i++) {
+      children_[i].Set(children[i]);
+    }
+    for (auto& child : children_) {
+      if (child.Valid()) {
+        minHeap_.push(&child);
+      }
+    }
+  }
+
+  virtual void AddIterator(Iterator* iter) {
+    assert(direction_ == kForward);
+    children_.emplace_back(iter);
+    auto& new_wrapper = children_.back();
+    if (new_wrapper.Valid()) {
+      minHeap_.push(&new_wrapper);
+    }
+  }
+
+  virtual ~MergingIterator() {
+    for (auto& child : children_) {
+      child.DeleteIter(is_arena_mode_);
+    }
+  }
+
+  virtual bool Valid() const {
+    return (current_ != nullptr);
+  }
+
+  virtual void SeekToFirst() {
+    ClearHeaps();
+    for (auto& child : children_) {
+      child.SeekToFirst();
+      if (child.Valid()) {
+        minHeap_.push(&child);
+      }
+    }
+    FindSmallest();
+    direction_ = kForward;
+  }
+
+  virtual void SeekToLast() {
+    ClearHeaps();
+    for (auto& child : children_) {
+      child.SeekToLast();
+      if (child.Valid()) {
+        maxHeap_.push(&child);
+      }
+    }
+    FindLargest();
+    direction_ = kReverse;
+  }
+
+  virtual void Seek(const Slice& target) {
+    // Invalidate the heap.
+    use_heap_ = false;
+    IteratorWrapper* first_child = nullptr;
+    PERF_TIMER_DECLARE();
+
+    for (auto& child : children_) {
+      PERF_TIMER_START(seek_child_seek_time);
+      child.Seek(target);
+      PERF_TIMER_STOP(seek_child_seek_time);
+      PERF_COUNTER_ADD(seek_child_seek_count, 1);
+
+      if (child.Valid()) {
+        // This child has a valid key
+        if (!use_heap_) {
+          if (first_child == nullptr) {
+            // It's the first child with a valid key. Only put it into
+            // current_. Now the values in the heap should be invalid.
+            first_child = &child;
+          } else {
+            // We have more than one child with valid keys. Initialize
+            // the heap and put the first child into the heap.
+            PERF_TIMER_START(seek_min_heap_time);
+            ClearHeaps();
+            minHeap_.push(first_child);
+            PERF_TIMER_STOP(seek_min_heap_time);
+          }
+        }
+        if (use_heap_) {
+          PERF_TIMER_START(seek_min_heap_time);
+          minHeap_.push(&child);
+          PERF_TIMER_STOP(seek_min_heap_time);
+        }
+      }
+    }
+    if (use_heap_) {
+      // If the heap is valid, we need to put the smallest key into current_.
+      PERF_TIMER_START(seek_min_heap_time);
+      FindSmallest();
+      PERF_TIMER_STOP(seek_min_heap_time);
+    } else {
+      // The heap is not valid, so current_ is the first valid child,
+      // or null if there is no valid child.
+      current_ = first_child;
+    }
+    direction_ = kForward;
+  }
+
+  virtual void Next() {
+    assert(Valid());
+
+    // Ensure that all children are positioned after key().
+    // If we are moving in the forward direction, it is already
+    // true for all of the non-current_ children since current_ is
+    // the smallest child and key() == current_->key(). Otherwise,
+    // we explicitly position the non-current_ children.
+ if (direction_ != kForward) { + ClearHeaps(); + for (auto& child : children_) { + if (&child != current_) { + child.Seek(key()); + if (child.Valid() && + comparator_->Compare(key(), child.key()) == 0) { + child.Next(); + } + if (child.Valid()) { + minHeap_.push(&child); + } + } + } + direction_ = kForward; + } + + // as the current points to the current record. move the iterator forward. + // and if it is valid add it to the heap. + current_->Next(); + if (use_heap_) { + if (current_->Valid()) { + minHeap_.push(current_); + } + FindSmallest(); + } else if (!current_->Valid()) { + current_ = nullptr; + } + } + + virtual void Prev() { + assert(Valid()); + // Ensure that all children are positioned before key(). + // If we are moving in the reverse direction, it is already + // true for all of the non-current_ children since current_ is + // the largest child and key() == current_->key(). Otherwise, + // we explicitly position the non-current_ children. + if (direction_ != kReverse) { + ClearHeaps(); + for (auto& child : children_) { + if (&child != current_) { + child.Seek(key()); + if (child.Valid()) { + // Child is at first entry >= key(). Step back one to be < key() + child.Prev(); + } else { + // Child has no entries >= key(). Position at last entry. + child.SeekToLast(); + } + if (child.Valid()) { + maxHeap_.push(&child); + } + } + } + direction_ = kReverse; + } + + current_->Prev(); + if (current_->Valid()) { + maxHeap_.push(current_); + } + FindLargest(); + } + + virtual Slice key() const { + assert(Valid()); + return current_->key(); + } + + virtual Slice value() const { + assert(Valid()); + return current_->value(); + } + + virtual Status status() const { + Status status; + for (auto& child : children_) { + status = child.status(); + if (!status.ok()) { + break; + } + } + return status; + } + + private: + void FindSmallest(); + void FindLargest(); + void ClearHeaps(); + + bool is_arena_mode_; + const Comparator* comparator_; + autovector children_; + IteratorWrapper* current_; + // If the value is true, both of iterators in the heap and current_ + // contain valid rows. If it is false, only current_ can possibly contain + // valid rows. + // This flag is always true for reverse direction, as we always use heap for + // the reverse iterating case. + bool use_heap_; + // Which direction is the iterator moving? 
+ enum Direction { + kForward, + kReverse + }; + Direction direction_; + MaxIterHeap maxHeap_; + MinIterHeap minHeap_; +}; + +void MergingIterator::FindSmallest() { + assert(use_heap_); + if (minHeap_.empty()) { + current_ = nullptr; + } else { + current_ = minHeap_.top(); + assert(current_->Valid()); + minHeap_.pop(); + } +} + +void MergingIterator::FindLargest() { + assert(use_heap_); + if (maxHeap_.empty()) { + current_ = nullptr; + } else { + current_ = maxHeap_.top(); + assert(current_->Valid()); + maxHeap_.pop(); + } +} + +void MergingIterator::ClearHeaps() { + use_heap_ = true; + maxHeap_ = NewMaxIterHeap(comparator_); + minHeap_ = NewMinIterHeap(comparator_); +} + +Iterator* NewMergingIterator(const Comparator* cmp, Iterator** list, int n, + Arena* arena) { + assert(n >= 0); + if (n == 0) { + return NewEmptyIterator(arena); + } else if (n == 1) { + return list[0]; + } else { + if (arena == nullptr) { + return new MergingIterator(cmp, list, n, false); + } else { + auto mem = arena->AllocateAligned(sizeof(MergingIterator)); + return new (mem) MergingIterator(cmp, list, n, true); + } + } +} + +MergeIteratorBuilder::MergeIteratorBuilder(const Comparator* comparator, + Arena* a) + : first_iter(nullptr), use_merging_iter(false), arena(a) { + + auto mem = arena->AllocateAligned(sizeof(MergingIterator)); + merge_iter = new (mem) MergingIterator(comparator, nullptr, 0, true); +} + +void MergeIteratorBuilder::AddIterator(Iterator* iter) { + if (!use_merging_iter && first_iter != nullptr) { + merge_iter->AddIterator(first_iter); + use_merging_iter = true; + } + if (use_merging_iter) { + merge_iter->AddIterator(iter); + } else { + first_iter = iter; + } +} + +Iterator* MergeIteratorBuilder::Finish() { + if (!use_merging_iter) { + return first_iter; + } else { + auto ret = merge_iter; + merge_iter = nullptr; + return ret; + } +} + +} // namespace rocksdb diff --git a/table/merger.h b/table/merger.h new file mode 100644 index 0000000000..7dcf2afe78 --- /dev/null +++ b/table/merger.h @@ -0,0 +1,60 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include "rocksdb/types.h" + +namespace rocksdb { + +class Comparator; +class Iterator; +class Env; +class Arena; + +// Return an iterator that provided the union of the data in +// children[0,n-1]. Takes ownership of the child iterators and +// will delete them when the result iterator is deleted. +// +// The result does no duplicate suppression. I.e., if a particular +// key is present in K child iterators, it will be yielded K times. +// +// REQUIRES: n >= 0 +extern Iterator* NewMergingIterator(const Comparator* comparator, + Iterator** children, int n, + Arena* arena = nullptr); + +class MergingIterator; + +// A builder class to build a merging iterator by adding iterators one by one. +class MergeIteratorBuilder { + public: + // comparator: the comparator used in merging comparator + // arena: where the merging iterator needs to be allocated from. 
+ explicit MergeIteratorBuilder(const Comparator* comparator, Arena* arena); + ~MergeIteratorBuilder() {} + + // Add iter to the merging iterator. + void AddIterator(Iterator* iter); + + // Get arena used to build the merging iterator. It is called one a child + // iterator needs to be allocated. + Arena* GetArena() { return arena; } + + // Return the result merging iterator. + Iterator* Finish(); + + private: + MergingIterator* merge_iter; + Iterator* first_iter; + bool use_merging_iter; + Arena* arena; +}; + +} // namespace rocksdb diff --git a/table/meta_blocks.cc b/table/meta_blocks.cc new file mode 100644 index 0000000000..7443eb7315 --- /dev/null +++ b/table/meta_blocks.cc @@ -0,0 +1,276 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +#include "table/meta_blocks.h" + +#include +#include + +#include "rocksdb/table.h" +#include "rocksdb/table_properties.h" +#include "table/block.h" +#include "table/format.h" +#include "util/coding.h" + +namespace rocksdb { + +MetaIndexBuilder::MetaIndexBuilder() + : meta_index_block_( + new BlockBuilder(1 /* restart interval */, BytewiseComparator())) { +} + +void MetaIndexBuilder::Add(const std::string& key, + const BlockHandle& handle) { + std::string handle_encoding; + handle.EncodeTo(&handle_encoding); + meta_block_handles_.insert({key, handle_encoding}); +} + +Slice MetaIndexBuilder::Finish() { + for (const auto& metablock : meta_block_handles_) { + meta_index_block_->Add(metablock.first, metablock.second); + } + return meta_index_block_->Finish(); +} + +PropertyBlockBuilder::PropertyBlockBuilder() + : properties_block_( + new BlockBuilder(1 /* restart interval */, BytewiseComparator())) { +} + +void PropertyBlockBuilder::Add(const std::string& name, + const std::string& val) { + props_.insert({name, val}); +} + +void PropertyBlockBuilder::Add(const std::string& name, uint64_t val) { + assert(props_.find(name) == props_.end()); + + std::string dst; + PutVarint64(&dst, val); + + Add(name, dst); +} + +void PropertyBlockBuilder::Add( + const UserCollectedProperties& user_collected_properties) { + for (const auto& prop : user_collected_properties) { + Add(prop.first, prop.second); + } +} + +void PropertyBlockBuilder::AddTableProperty(const TableProperties& props) { + Add(TablePropertiesNames::kRawKeySize, props.raw_key_size); + Add(TablePropertiesNames::kRawValueSize, props.raw_value_size); + Add(TablePropertiesNames::kDataSize, props.data_size); + Add(TablePropertiesNames::kIndexSize, props.index_size); + Add(TablePropertiesNames::kNumEntries, props.num_entries); + Add(TablePropertiesNames::kNumDataBlocks, props.num_data_blocks); + Add(TablePropertiesNames::kFilterSize, props.filter_size); + Add(TablePropertiesNames::kFormatVersion, props.format_version); + Add(TablePropertiesNames::kFixedKeyLen, props.fixed_key_len); + + if (!props.filter_policy_name.empty()) { + Add(TablePropertiesNames::kFilterPolicy, + props.filter_policy_name); + } +} + +Slice PropertyBlockBuilder::Finish() { + for (const auto& prop : props_) { + properties_block_->Add(prop.first, prop.second); + } + + return properties_block_->Finish(); +} + +void LogPropertiesCollectionError( + Logger* info_log, const std::string& method, const std::string& name) { + assert(method == "Add" || method == "Finish"); + + std::string msg = + "[Warning] 
encountered error when calling TablePropertiesCollector::" +
+      method + "() with collector name: " + name;
+  Log(info_log, "%s", msg.c_str());
+}
+
+bool NotifyCollectTableCollectorsOnAdd(
+    const Slice& key, const Slice& value,
+    const std::vector<std::unique_ptr<TablePropertiesCollector>>& collectors,
+    Logger* info_log) {
+  bool all_succeeded = true;
+  for (auto& collector : collectors) {
+    Status s = collector->Add(key, value);
+    all_succeeded = all_succeeded && s.ok();
+    if (!s.ok()) {
+      LogPropertiesCollectionError(info_log, "Add" /* method */,
+                                   collector->Name());
+    }
+  }
+  return all_succeeded;
+}
+
+bool NotifyCollectTableCollectorsOnFinish(
+    const std::vector<std::unique_ptr<TablePropertiesCollector>>& collectors,
+    Logger* info_log, PropertyBlockBuilder* builder) {
+  bool all_succeeded = true;
+  for (auto& collector : collectors) {
+    UserCollectedProperties user_collected_properties;
+    Status s = collector->Finish(&user_collected_properties);
+
+    all_succeeded = all_succeeded && s.ok();
+    if (!s.ok()) {
+      LogPropertiesCollectionError(info_log, "Finish" /* method */,
+                                   collector->Name());
+    } else {
+      builder->Add(user_collected_properties);
+    }
+  }
+
+  return all_succeeded;
+}
+
+Status ReadProperties(const Slice &handle_value, RandomAccessFile *file,
+                      const Footer &footer, Env *env, Logger *logger,
+                      TableProperties **table_properties) {
+  assert(table_properties);
+
+  Slice v = handle_value;
+  BlockHandle handle;
+  if (!handle.DecodeFrom(&v).ok()) {
+    return Status::InvalidArgument("Failed to decode properties block handle");
+  }
+
+  BlockContents block_contents;
+  ReadOptions read_options;
+  read_options.verify_checksums = false;
+  Status s = ReadBlockContents(file, footer, read_options, handle,
+                               &block_contents, env, false);
+
+  if (!s.ok()) {
+    return s;
+  }
+
+  Block properties_block(block_contents);
+  std::unique_ptr<Iterator> iter(
+      properties_block.NewIterator(BytewiseComparator()));
+
+  auto new_table_properties = new TableProperties();
+  // All pre-defined properties of type uint64_t
+  std::unordered_map<std::string, uint64_t*> predefined_uint64_properties = {
+      {TablePropertiesNames::kDataSize, &new_table_properties->data_size},
+      {TablePropertiesNames::kIndexSize, &new_table_properties->index_size},
+      {TablePropertiesNames::kFilterSize, &new_table_properties->filter_size},
+      {TablePropertiesNames::kRawKeySize, &new_table_properties->raw_key_size},
+      {TablePropertiesNames::kRawValueSize,
+       &new_table_properties->raw_value_size},
+      {TablePropertiesNames::kNumDataBlocks,
+       &new_table_properties->num_data_blocks},
+      {TablePropertiesNames::kNumEntries, &new_table_properties->num_entries},
+      {TablePropertiesNames::kFormatVersion,
+       &new_table_properties->format_version},
+      {TablePropertiesNames::kFixedKeyLen,
+       &new_table_properties->fixed_key_len}, };
+
+  std::string last_key;
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    s = iter->status();
+    if (!s.ok()) {
+      break;
+    }
+
+    auto key = iter->key().ToString();
+    // the properties block is strictly sorted with no duplicate keys.
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    s = iter->status();
+    if (!s.ok()) {
+      break;
+    }
+
+    auto key = iter->key().ToString();
+    // properties block is strictly sorted with no duplicate key.
+    assert(last_key.empty() ||
+           BytewiseComparator()->Compare(key, last_key) > 0);
+    last_key = key;
+
+    auto raw_val = iter->value();
+    auto pos = predefined_uint64_properties.find(key);
+
+    if (pos != predefined_uint64_properties.end()) {
+      // handle predefined rocksdb properties
+      uint64_t val;
+      if (!GetVarint64(&raw_val, &val)) {
+        // skip malformed value
+        auto error_msg =
+            "[Warning] detected malformed value in properties meta-block:"
+            "\tkey: " + key + "\tval: " + raw_val.ToString();
+        Log(logger, "%s", error_msg.c_str());
+        continue;
+      }
+      *(pos->second) = val;
+    } else if (key == TablePropertiesNames::kFilterPolicy) {
+      new_table_properties->filter_policy_name = raw_val.ToString();
+    } else {
+      // handle user-collected properties
+      new_table_properties->user_collected_properties.insert(
+          {key, raw_val.ToString()});
+    }
+  }
+  if (s.ok()) {
+    *table_properties = new_table_properties;
+  } else {
+    delete new_table_properties;
+  }
+
+  return s;
+}
+
+Status ReadTableProperties(RandomAccessFile* file, uint64_t file_size,
+                           uint64_t table_magic_number, Env* env,
+                           Logger* info_log, TableProperties** properties) {
+  // -- Read metaindex block
+  Footer footer(table_magic_number);
+  auto s = ReadFooterFromFile(file, file_size, &footer);
+  if (!s.ok()) {
+    return s;
+  }
+
+  auto metaindex_handle = footer.metaindex_handle();
+  BlockContents metaindex_contents;
+  ReadOptions read_options;
+  read_options.verify_checksums = false;
+  s = ReadBlockContents(file, footer, read_options, metaindex_handle,
+                        &metaindex_contents, env, false);
+  if (!s.ok()) {
+    return s;
+  }
+  Block metaindex_block(metaindex_contents);
+  std::unique_ptr<Iterator> meta_iter(
+      metaindex_block.NewIterator(BytewiseComparator()));
+
+  // -- Read property block
+  bool found_properties_block = true;
+  s = SeekToPropertiesBlock(meta_iter.get(), &found_properties_block);
+  if (!s.ok()) {
+    return s;
+  }
+
+  if (found_properties_block) {
+    s = ReadProperties(meta_iter->value(), file, footer, env, info_log,
+                       properties);
+  } else {
+    s = Status::Corruption("Unable to read the property block.");
+    Log(WARN_LEVEL, info_log, "Cannot find Properties block from file.");
+  }
+
+  return s;
+}
+
+Status FindMetaBlock(Iterator* meta_index_iter,
+                     const std::string& meta_block_name,
+                     BlockHandle* block_handle) {
+  meta_index_iter->Seek(meta_block_name);
+  if (meta_index_iter->status().ok() && meta_index_iter->Valid() &&
+      meta_index_iter->key() == meta_block_name) {
+    Slice v = meta_index_iter->value();
+    return block_handle->DecodeFrom(&v);
+  } else {
+    return Status::Corruption("Cannot find the meta block", meta_block_name);
+  }
+}
+
+}  // namespace rocksdb
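+
+// Usage sketch (illustrative; the variable names below are assumptions, not
+// part of this file): a caller that wants the properties of an sst file can
+// combine the helpers above, and owns the returned object on success.
+//
+//   TableProperties* props = nullptr;
+//   Status s = ReadTableProperties(file.get(), file_size,
+//                                  kPlainTableMagicNumber, env,
+//                                  info_log, &props);
+//   if (s.ok()) {
+//     std::unique_ptr<TableProperties> guard(props);  // take ownership
+//     Log(info_log, "%s", props->ToString().c_str());
+//   }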
diff --git a/table/meta_blocks.h b/table/meta_blocks.h
new file mode 100644
index 0000000000..6cfc0babdf
--- /dev/null
+++ b/table/meta_blocks.h
@@ -0,0 +1,131 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+#pragma once
+
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "db/builder.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/options.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/table_properties.h"
+#include "table/block_builder.h"
+
+namespace rocksdb {
+
+class BlockBuilder;
+class BlockHandle;
+class Env;
+class Footer;
+class Logger;
+class RandomAccessFile;
+struct TableProperties;
+
+// An STL style comparator that does the bytewise comparator comparison
+// internally.
+struct BytewiseLessThan {
+  bool operator()(const std::string& key1, const std::string& key2) const {
+    // smaller entries will be placed in front.
+    // Note: this must be a strict weak ordering ("<", not "<="); otherwise
+    // std::map's invariants are violated.
+    return comparator->Compare(key1, key2) < 0;
+  }
+
+  const Comparator* comparator = BytewiseComparator();
+};
+
+// When writing to a block that requires entries to be sorted by
+// `BytewiseComparator`, we can buffer the content to `BytewiseSortedMap`
+// before writing it out.
+typedef std::map<std::string, std::string, BytewiseLessThan>
+    BytewiseSortedMap;
+
+class MetaIndexBuilder {
+ public:
+  MetaIndexBuilder(const MetaIndexBuilder&) = delete;
+  MetaIndexBuilder& operator=(const MetaIndexBuilder&) = delete;
+
+  MetaIndexBuilder();
+  void Add(const std::string& key, const BlockHandle& handle);
+
+  // Write all the added key/value pairs to the block and return the contents
+  // of the block.
+  Slice Finish();
+
+ private:
+  // store the sorted key/handle of the metablocks.
+  BytewiseSortedMap meta_block_handles_;
+  std::unique_ptr<BlockBuilder> meta_index_block_;
+};
+
+class PropertyBlockBuilder {
+ public:
+  PropertyBlockBuilder(const PropertyBlockBuilder&) = delete;
+  PropertyBlockBuilder& operator=(const PropertyBlockBuilder&) = delete;
+
+  PropertyBlockBuilder();
+
+  void AddTableProperty(const TableProperties& props);
+  void Add(const std::string& key, uint64_t value);
+  void Add(const std::string& key, const std::string& value);
+  void Add(const UserCollectedProperties& user_collected_properties);
+
+  // Write all the added entries to the block and return the block contents
+  Slice Finish();
+
+ private:
+  std::unique_ptr<BlockBuilder> properties_block_;
+  BytewiseSortedMap props_;
+};
+
+// If we encounter an error during user-defined statistics collection, we'll
+// write a warning message to the info log.
+void LogPropertiesCollectionError(
+    Logger* info_log, const std::string& method, const std::string& name);
+
+// Utility functions that help the table builder trigger batch events for
+// user-defined property collectors.
+// The return value indicates whether any error occurred; if so, a warning
+// message will have been logged.
+// NotifyCollectTableCollectorsOnAdd() triggers the `Add` event for all
+// property collectors.
+bool NotifyCollectTableCollectorsOnAdd(
+    const Slice& key, const Slice& value,
+    const std::vector<std::unique_ptr<TablePropertiesCollector>>& collectors,
+    Logger* info_log);
+
+// NotifyCollectTableCollectorsOnFinish() triggers the `Finish` event for all
+// property collectors. The collected properties will be added to `builder`.
+bool NotifyCollectTableCollectorsOnFinish(
+    const std::vector<std::unique_ptr<TablePropertiesCollector>>& collectors,
+    Logger* info_log, PropertyBlockBuilder* builder);
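+
+// A minimal collector sketch (illustrative; the class and property names are
+// assumptions, and the interface is the TablePropertiesCollector declared in
+// rocksdb/table_properties.h): Add() is invoked once per key/value while the
+// table is built, and Finish() once at the end.
+//
+//   class CountingCollector : public TablePropertiesCollector {
+//    public:
+//     Status Add(const Slice& key, const Slice& value) {
+//       ++count_;
+//       return Status::OK();
+//     }
+//     Status Finish(UserCollectedProperties* props) {
+//       props->insert({"example.count", std::to_string(count_)});
+//       return Status::OK();
+//     }
+//     const char* Name() const { return "CountingCollector"; }
+//    private:
+//     uint64_t count_ = 0;
+//   };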
+
+// Read the properties from the table.
+// @returns a status to indicate if the operation succeeded. On success,
+//          *table_properties will point to a heap-allocated TableProperties
+//          object; otherwise the value of `table_properties` will not be
+//          modified.
+Status ReadProperties(const Slice& handle_value, RandomAccessFile* file,
+                      const Footer& footer, Env* env, Logger* logger,
+                      TableProperties** table_properties);
+
+// Directly read the properties from the properties block of a plain table.
+// @returns a status to indicate if the operation succeeded. On success,
+//          *table_properties will point to a heap-allocated TableProperties
+//          object; otherwise the value of `table_properties` will not be
+//          modified.
+Status ReadTableProperties(RandomAccessFile* file, uint64_t file_size,
+                           uint64_t table_magic_number, Env* env,
+                           Logger* info_log, TableProperties** properties);
+
+// Seek to the properties block.
+// If it successfully seeks to the properties block, "is_found" will be
+// set to true.
+extern Status SeekToPropertiesBlock(Iterator* meta_iter, bool* is_found);
+
+// Find the meta block from the meta index block.
+Status FindMetaBlock(Iterator* meta_index_iter,
+                     const std::string& meta_block_name,
+                     BlockHandle* block_handle);
+
+}  // namespace rocksdb
diff --git a/table/plain_table_builder.cc b/table/plain_table_builder.cc
new file mode 100644
index 0000000000..12037cf6a7
--- /dev/null
+++ b/table/plain_table_builder.cc
@@ -0,0 +1,211 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef ROCKSDB_LITE
+#include "table/plain_table_builder.h"
+
+#include <assert.h>
+#include <string>
+
+#include "rocksdb/comparator.h"
+#include "rocksdb/env.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/options.h"
+#include "table/plain_table_factory.h"
+#include "db/dbformat.h"
+#include "table/block_builder.h"
+#include "table/filter_block.h"
+#include "table/format.h"
+#include "table/meta_blocks.h"
+#include "util/coding.h"
+#include "util/crc32c.h"
+#include "util/stop_watch.h"
+
+namespace rocksdb {
+
+namespace {
+
+// A utility that writes the block contents to the file.
+// @offset will advance if @block_contents was successfully written.
+// @block_handle the block handle of this particular block.
+Status WriteBlock(
+    const Slice& block_contents,
+    WritableFile* file,
+    uint64_t* offset,
+    BlockHandle* block_handle) {
+  block_handle->set_offset(*offset);
+  block_handle->set_size(block_contents.size());
+  Status s = file->Append(block_contents);
+
+  if (s.ok()) {
+    *offset += block_contents.size();
+  }
+  return s;
+}
+
+}  // namespace
+
+// kPlainTableMagicNumber was picked by running
+//    echo rocksdb.table.plain | sha1sum
+// and taking the leading 64 bits.
+extern const uint64_t kPlainTableMagicNumber = 0x8242229663bf9564ull;
+extern const uint64_t kLegacyPlainTableMagicNumber = 0x4f3418eb7a8f13b8ull;
+
+PlainTableBuilder::PlainTableBuilder(const Options& options,
+                                     WritableFile* file,
+                                     uint32_t user_key_len) :
+    options_(options), file_(file), user_key_len_(user_key_len) {
+  properties_.fixed_key_len = user_key_len;
+
+  // for plain table, we put all the data in a big chunk.
+  properties_.num_data_blocks = 1;
+  // emphasize that currently plain table doesn't have persistent index or
+  // filter block.
+  properties_.index_size = 0;
+  properties_.filter_size = 0;
+  properties_.format_version = 0;
+
+  for (auto& collector_factory :
+       options.table_properties_collector_factories) {
+    table_properties_collectors_.emplace_back(
+        collector_factory->CreateTablePropertiesCollector());
+  }
+}
+
+PlainTableBuilder::~PlainTableBuilder() {
+}
+
+void PlainTableBuilder::Add(const Slice& key, const Slice& value) {
+  size_t user_key_size = key.size() - 8;
+  assert(user_key_len_ == 0 || user_key_size == user_key_len_);
+
+  if (!IsFixedLength()) {
+    // Write key length
+    char key_size_buf[5];  // tmp buffer for key size as varint32
+    char* ptr = EncodeVarint32(key_size_buf, user_key_size);
+    assert(ptr <= key_size_buf + sizeof(key_size_buf));
+    auto len = ptr - key_size_buf;
+    file_->Append(Slice(key_size_buf, len));
+    offset_ += len;
+  }
+
+  // Write key
+  ParsedInternalKey parsed_key;
+  if (!ParseInternalKey(key, &parsed_key)) {
+    status_ = Status::Corruption(Slice());
+    return;
+  }
+  // For value size as varint32 (up to 5 bytes).
+  // If the row is of value type with seqId 0, flush the special flag together
+  // in this buffer to save one file append call, which takes 1 byte.
+  char value_size_buf[6];
+  size_t value_size_buf_size = 0;
+  if (parsed_key.sequence == 0 && parsed_key.type == kTypeValue) {
+    file_->Append(Slice(key.data(), user_key_size));
+    offset_ += user_key_size;
+    value_size_buf[0] = PlainTableFactory::kValueTypeSeqId0;
+    value_size_buf_size = 1;
+  } else {
+    file_->Append(key);
+    offset_ += key.size();
+  }
+
+  // Write value length
+  int value_size = value.size();
+  char* end_ptr =
+      EncodeVarint32(value_size_buf + value_size_buf_size, value_size);
+  assert(end_ptr <= value_size_buf + sizeof(value_size_buf));
+  value_size_buf_size = end_ptr - value_size_buf;
+  file_->Append(Slice(value_size_buf, value_size_buf_size));
+
+  // Write value
+  file_->Append(value);
+  offset_ += value_size + value_size_buf_size;
+
+  properties_.num_entries++;
+  properties_.raw_key_size += key.size();
+  properties_.raw_value_size += value.size();
+
+  // notify property collectors
+  NotifyCollectTableCollectorsOnAdd(key, value, table_properties_collectors_,
+                                    options_.info_log.get());
+}
+
+Status PlainTableBuilder::status() const { return status_; }
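+
+// Byte-layout sketch of one row produced by Add() above (a worked example
+// with made-up contents: a variable-length user key "abc", value "v", and a
+// nonzero sequence number):
+//
+//   03                          <- varint32 user key size (3)
+//   61 62 63                    <- user key "abc"
+//   xx xx xx xx xx xx xx xx     <- 8-byte internal suffix ((seq << 8) | type)
+//   01                          <- varint32 value size (1)
+//   76                          <- value "v"
+//
+// When sequence == 0 and type == kTypeValue, the 8-byte suffix is replaced
+// by the single marker byte PlainTableFactory::kValueTypeSeqId0, saving
+// 7 bytes for each such row.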
+
+Status PlainTableBuilder::Finish() {
+  assert(!closed_);
+  closed_ = true;
+
+  properties_.data_size = offset_;
+
+  // Write the following blocks
+  //  1. [meta block: properties]
+  //  2. [metaindex block]
+  //  3. [footer]
+  MetaIndexBuilder meta_index_builder;
+
+  PropertyBlockBuilder property_block_builder;
+  // -- Add basic properties
+  property_block_builder.AddTableProperty(properties_);
+
+  // -- Add user collected properties
+  NotifyCollectTableCollectorsOnFinish(table_properties_collectors_,
+                                       options_.info_log.get(),
+                                       &property_block_builder);
+
+  // -- Write property block
+  BlockHandle property_block_handle;
+  auto s = WriteBlock(
+      property_block_builder.Finish(),
+      file_,
+      &offset_,
+      &property_block_handle
+  );
+  if (!s.ok()) {
+    return s;
+  }
+  meta_index_builder.Add(kPropertiesBlock, property_block_handle);
+
+  // -- Write metaindex block
+  BlockHandle metaindex_block_handle;
+  s = WriteBlock(
+      meta_index_builder.Finish(),
+      file_,
+      &offset_,
+      &metaindex_block_handle
+  );
+  if (!s.ok()) {
+    return s;
+  }
+
+  // Write Footer
+  // no need to write out a new footer if we're using the default checksum
+  Footer footer(kLegacyPlainTableMagicNumber);
+  footer.set_metaindex_handle(metaindex_block_handle);
+  footer.set_index_handle(BlockHandle::NullBlockHandle());
+  std::string footer_encoding;
+  footer.EncodeTo(&footer_encoding);
+  s = file_->Append(footer_encoding);
+  if (s.ok()) {
+    offset_ += footer_encoding.size();
+  }
+
+  return s;
+}
+
+void PlainTableBuilder::Abandon() {
+  closed_ = true;
+}
+
+uint64_t PlainTableBuilder::NumEntries() const {
+  return properties_.num_entries;
+}
+
+uint64_t PlainTableBuilder::FileSize() const {
+  return offset_;
+}
+
+}  // namespace rocksdb
+#endif  // ROCKSDB_LITE
diff --git a/table/plain_table_builder.h b/table/plain_table_builder.h
new file mode 100644
index 0000000000..9b0f460805
--- /dev/null
+++ b/table/plain_table_builder.h
@@ -0,0 +1,87 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// IndexedTable is a simple table format for UNIT TEST ONLY. It is not built
+// as production quality.
+
+#pragma once
+#ifndef ROCKSDB_LITE
+#include <stdint.h>
+#include <vector>
+#include "rocksdb/options.h"
+#include "rocksdb/status.h"
+#include "table/table_builder.h"
+#include "rocksdb/table_properties.h"
+
+namespace rocksdb {
+
+class BlockBuilder;
+class BlockHandle;
+class WritableFile;
+class TableBuilder;
+
+class PlainTableBuilder: public TableBuilder {
+ public:
+  // Create a builder that will store the contents of the table it is
+  // building in *file. Does not close the file. It is up to the
+  // caller to close the file after calling Finish().
+  PlainTableBuilder(const Options& options, WritableFile* file,
+                    uint32_t user_key_size);
+
+  // REQUIRES: Either Finish() or Abandon() has been called.
+  ~PlainTableBuilder();
+
+  // Add key,value to the table being constructed.
+  // REQUIRES: key is after any previously added key according to comparator.
+  // REQUIRES: Finish(), Abandon() have not been called
+  void Add(const Slice& key, const Slice& value) override;
+
+  // Return non-ok iff some error has been detected.
+  Status status() const override;
+
+  // Finish building the table. Stops using the file passed to the
+  // constructor after this function returns.
+  // REQUIRES: Finish(), Abandon() have not been called
+  Status Finish() override;
+
+  // Indicate that the contents of this builder should be abandoned. Stops
+  // using the file passed to the constructor after this function returns.
+  // If the caller is not going to call Finish(), it must call Abandon()
+  // before destroying this builder.
+  // REQUIRES: Finish(), Abandon() have not been called
+  void Abandon() override;
+
+  // Number of calls to Add() so far.
+  uint64_t NumEntries() const override;
+
+  // Size of the file generated so far. If invoked after a successful
+  // Finish() call, returns the size of the final generated file.
+  uint64_t FileSize() const override;
+
+ private:
+  Options options_;
+  std::vector<std::unique_ptr<TablePropertiesCollector>>
+      table_properties_collectors_;
+  WritableFile* file_;
+  uint64_t offset_ = 0;
+  Status status_;
+  TableProperties properties_;
+
+  const size_t user_key_len_;
+  bool closed_ = false;  // Either Finish() or Abandon() has been called.
+
+  bool IsFixedLength() const {
+    return user_key_len_ > 0;
+  }
+
+  // No copying allowed
+  PlainTableBuilder(const PlainTableBuilder&) = delete;
+  void operator=(const PlainTableBuilder&) = delete;
+};
+
+}  // namespace rocksdb
+
+#endif  // ROCKSDB_LITE
diff --git a/table/plain_table_factory.cc b/table/plain_table_factory.cc
new file mode 100644
index 0000000000..f9d88e9ef0
--- /dev/null
+++ b/table/plain_table_factory.cc
@@ -0,0 +1,54 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef ROCKSDB_LITE
+#include "table/plain_table_factory.h"
+
+#include <memory>
+#include <stdint.h>
+#include "db/dbformat.h"
+#include "table/plain_table_builder.h"
+#include "table/plain_table_reader.h"
+#include "port/port.h"
+
+namespace rocksdb {
+
+Status PlainTableFactory::NewTableReader(const Options& options,
+                                         const EnvOptions& soptions,
+                                         const InternalKeyComparator& icomp,
+                                         unique_ptr<RandomAccessFile>&& file,
+                                         uint64_t file_size,
+                                         unique_ptr<TableReader>* table) const {
+  return PlainTableReader::Open(options, soptions, icomp, std::move(file),
+                                file_size, table, bloom_bits_per_key_,
+                                hash_table_ratio_, index_sparseness_,
+                                huge_page_tlb_size_);
+}
+
+TableBuilder* PlainTableFactory::NewTableBuilder(
+    const Options& options, const InternalKeyComparator& internal_comparator,
+    WritableFile* file, CompressionType compression_type) const {
+  return new PlainTableBuilder(options, file, user_key_len_);
+}
+
+extern TableFactory* NewPlainTableFactory(uint32_t user_key_len,
+                                          int bloom_bits_per_key,
+                                          double hash_table_ratio,
+                                          size_t index_sparseness,
+                                          size_t huge_page_tlb_size) {
+  return new PlainTableFactory(user_key_len, bloom_bits_per_key,
+                               hash_table_ratio, index_sparseness,
+                               huge_page_tlb_size);
+}
+
+extern TableFactory* NewTotalOrderPlainTableFactory(uint32_t user_key_len,
+                                                    int bloom_bits_per_key,
+                                                    size_t index_sparseness,
+                                                    size_t huge_page_tlb_size) {
+  return new PlainTableFactory(user_key_len, bloom_bits_per_key, 0,
+                               index_sparseness, huge_page_tlb_size);
+}
+
+}  // namespace rocksdb
+#endif  // ROCKSDB_LITE
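+
+// Usage sketch (illustrative only): wiring the factory into Options. Plain
+// tables require mmap reads, and prefix hashing needs a prefix extractor;
+// NewFixedPrefixTransform() here is assumed from rocksdb/slice_transform.h,
+// and the numeric arguments are example values.
+//
+//   Options options;
+//   options.allow_mmap_reads = true;
+//   options.prefix_extractor.reset(NewFixedPrefixTransform(8));
+//   options.table_factory.reset(NewPlainTableFactory(
+//       kPlainTableVariableLength, 10 /* bloom bits per key */,
+//       0.75 /* hash table ratio */, 16 /* index sparseness */,
+//       0 /* huge_page_tlb_size */));
+//   DB* db;
+//   Status s = DB::Open(options, "/tmp/plain_table_example", &db);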
diff --git a/table/plain_table_factory.h b/table/plain_table_factory.h
new file mode 100644
index 0000000000..06ddbf4ea9
--- /dev/null
+++ b/table/plain_table_factory.h
@@ -0,0 +1,94 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+#include <memory>
+#include <stdint.h>
+
+#include "rocksdb/options.h"
+#include "rocksdb/table.h"
+
+namespace rocksdb {
+
+struct Options;
+struct EnvOptions;
+
+using std::unique_ptr;
+class Status;
+class RandomAccessFile;
+class WritableFile;
+class Table;
+class TableBuilder;
+
+// IndexedTable requires a fixed length key, configured as a constructor
+// parameter of the factory class. Output file format:
+// +-------------+-----------------+
+// | version     | user_key_length |
+// +------------++------------------------------+  <= key1 offset
+// | [key_size] | key1        | value_size  |   |
+// +------------+-------------+-------------+   |
+// | value1                                     |
+// |                                            |
+// +----------------------------------------+--+  <= key2 offset
+// | [key_size] | key2        | value_size  |   |
+// +------------+-------------+-------------+   |
+// | value2                                     |
+// |                                            |
+// |          ......                            |
+// +-----------------+--------------------------+
+// If user_key_length = kPlainTableVariableLength, the key is variable
+// length and there will be an extra field for the key size encoded before
+// every key.
+class PlainTableFactory : public TableFactory {
+ public:
+  ~PlainTableFactory() {}
+  // user_key_size is the length of the user key. If it is set to
+  // kPlainTableVariableLength, then it means variable length. Otherwise, all
+  // the keys need to have the fixed length of this value. bloom_bits_per_key
+  // is the number of bits used for the bloom filter per key. hash_table_ratio
+  // is the desired utilization of the hash table used for prefix hashing.
+  // hash_table_ratio = number of prefixes / #buckets in the hash table
+  // hash_table_ratio = 0 means skipping the hash table and relying only on
+  // binary search.
+  // index_sparseness determines the index interval for keys
+  // inside the same prefix. It will be the maximum number of linear search
+  // steps required after the hash and binary search.
+  // index_sparseness = 0 means an index for every key.
+  // huge_page_tlb_size determines whether to allocate hash indexes from huge
+  // page TLB and the page size if allocating from there. See comments of
+  // Arena::AllocateAligned() for details.
+  explicit PlainTableFactory(uint32_t user_key_len = kPlainTableVariableLength,
+                             int bloom_bits_per_key = 0,
+                             double hash_table_ratio = 0.75,
+                             size_t index_sparseness = 16,
+                             size_t huge_page_tlb_size = 0)
+      : user_key_len_(user_key_len),
+        bloom_bits_per_key_(bloom_bits_per_key),
+        hash_table_ratio_(hash_table_ratio),
+        index_sparseness_(index_sparseness),
+        huge_page_tlb_size_(huge_page_tlb_size) {}
+  const char* Name() const override { return "PlainTable"; }
+  Status NewTableReader(const Options& options, const EnvOptions& soptions,
+                        const InternalKeyComparator& internal_comparator,
+                        unique_ptr<RandomAccessFile>&& file,
+                        uint64_t file_size,
+                        unique_ptr<TableReader>* table) const override;
+  TableBuilder* NewTableBuilder(const Options& options,
+                                const InternalKeyComparator& icomparator,
+                                WritableFile* file,
+                                CompressionType compression_type) const
+      override;
+
+  static const char kValueTypeSeqId0 = 0xFF;
+
+ private:
+  uint32_t user_key_len_;
+  int bloom_bits_per_key_;
+  double hash_table_ratio_;
+  size_t index_sparseness_;
+  size_t huge_page_tlb_size_;
+};
+
+}  // namespace rocksdb
+#endif  // ROCKSDB_LITE
diff --git a/table/plain_table_reader.cc b/table/plain_table_reader.cc
new file mode 100644
index 0000000000..22968ef6b7
--- /dev/null
+++ b/table/plain_table_reader.cc
@@ -0,0 +1,776 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef ROCKSDB_LITE
+#include "table/plain_table_reader.h"
+
+#include <string>
+#include <vector>
+
+#include "db/dbformat.h"
+
+#include "rocksdb/cache.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/env.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/options.h"
+#include "rocksdb/statistics.h"
+
+#include "table/block.h"
+#include "table/filter_block.h"
+#include "table/format.h"
+#include "table/meta_blocks.h"
+#include "table/two_level_iterator.h"
+#include "table/plain_table_factory.h"
+
+#include "util/arena.h"
+#include "util/coding.h"
+#include "util/dynamic_bloom.h"
+#include "util/hash.h"
+#include "util/histogram.h"
+#include "util/murmurhash.h"
+#include "util/perf_context_imp.h"
+#include "util/stop_watch.h"
+
+
+namespace rocksdb {
+
+namespace {
+
+inline uint32_t GetSliceHash(const Slice& s) {
+  return Hash(s.data(), s.size(), 397);
+}
+
+inline uint32_t GetBucketIdFromHash(uint32_t hash, uint32_t num_buckets) {
+  return hash % num_buckets;
+}
+
+// Safely get a uint32_t element from a char array, where, starting from
+// `base`, every 4 bytes are considered as a fixed 32-bit integer.
+inline uint32_t GetFixed32Element(const char* base, size_t offset) {
+  return DecodeFixed32(base + offset * sizeof(uint32_t));
+}
+
+}  // namespace
+
+// Iterator to iterate IndexedTable
+class PlainTableIterator : public Iterator {
+ public:
+  explicit PlainTableIterator(PlainTableReader* table, bool use_prefix_seek);
+  ~PlainTableIterator();
+
+  bool Valid() const;
+
+  void SeekToFirst();
+
+  void SeekToLast();
+
+  void Seek(const Slice& target);
+
+  void Next();
+
+  void Prev();
+
+  Slice key() const;
+
+  Slice value() const;
+
+  Status status() const;
+
+ private:
+  PlainTableReader* table_;
+  bool use_prefix_seek_;
+  uint32_t offset_;
+  uint32_t next_offset_;
+  IterKey key_;
+  Slice value_;
+  Status status_;
+  // No copying allowed
+  PlainTableIterator(const PlainTableIterator&) = delete;
+  void operator=(const Iterator&) = delete;
+};
+
+extern const uint64_t kPlainTableMagicNumber;
+
+PlainTableReader::PlainTableReader(
+    const Options& options, unique_ptr<RandomAccessFile>&& file,
+    const EnvOptions& storage_options,
+    const InternalKeyComparator& icomparator, uint64_t file_size,
+    int bloom_bits_per_key, double hash_table_ratio, size_t index_sparseness,
+    const TableProperties* table_properties, size_t huge_page_tlb_size)
+    : options_(options),
+      soptions_(storage_options),
+      file_(std::move(file)),
+      internal_comparator_(icomparator),
+      file_size_(file_size),
+      kHashTableRatio(hash_table_ratio),
+      kBloomBitsPerKey(bloom_bits_per_key),
+      kIndexIntervalForSamePrefixKeys(index_sparseness),
+      table_properties_(nullptr),
+      data_end_offset_(table_properties->data_size),
+      user_key_len_(table_properties->fixed_key_len),
+      huge_page_tlb_size_(huge_page_tlb_size) {
+  assert(kHashTableRatio >= 0.0);
+}
+
+PlainTableReader::~PlainTableReader() {
+}
+
+Status PlainTableReader::Open(const Options& options,
+                              const EnvOptions& soptions,
+                              const InternalKeyComparator& internal_comparator,
+                              unique_ptr<RandomAccessFile>&& file,
+                              uint64_t file_size,
+                              unique_ptr<TableReader>* table_reader,
+                              const int bloom_bits_per_key,
+                              double hash_table_ratio, size_t index_sparseness,
+                              size_t huge_page_tlb_size) {
+  assert(options.allow_mmap_reads);
+
+  if (file_size > kMaxFileSize) {
+    return Status::NotSupported("File is too large for PlainTableReader!");
+  }
+
+  TableProperties* props = nullptr;
+  auto s = ReadTableProperties(file.get(), file_size, kPlainTableMagicNumber,
+                               options.env, options.info_log.get(), &props);
+  if (!s.ok()) {
+    return s;
+  }
+
+  std::unique_ptr<PlainTableReader> new_reader(new PlainTableReader(
+      options, std::move(file), soptions, internal_comparator, file_size,
+      bloom_bits_per_key, hash_table_ratio, index_sparseness, props,
+      huge_page_tlb_size));
+
+  // -- Populate Index
+  s = new_reader->PopulateIndex(props);
+  if (!s.ok()) {
+    return s;
+  }
+
+  *table_reader = std::move(new_reader);
+  return s;
+}
+
+void PlainTableReader::SetupForCompaction() {
+}
+
+Iterator* PlainTableReader::NewIterator(const ReadOptions& options,
+                                        Arena* arena) {
+  if (arena == nullptr) {
+    return new PlainTableIterator(this, options_.prefix_extractor != nullptr);
+  } else {
+    auto mem = arena->AllocateAligned(sizeof(PlainTableIterator));
+    return new (mem)
+        PlainTableIterator(this, options_.prefix_extractor != nullptr);
+  }
+}
+
+struct PlainTableReader::IndexRecord {
+  uint32_t hash;    // hash of the prefix
+  uint32_t offset;  // offset of a row
+  IndexRecord* next;
+};
+
+// Helper class to track all the index records
+class PlainTableReader::IndexRecordList {
+ public:
+  explicit IndexRecordList(size_t num_records_per_group)
+      : kNumRecordsPerGroup(num_records_per_group),
+        current_group_(nullptr),
+        num_records_in_current_group_(num_records_per_group) {}
+
+  ~IndexRecordList() {
+    for (size_t i = 0; i < groups_.size(); i++) {
+      delete[] groups_[i];
+    }
+  }
+
+  void AddRecord(murmur_t hash, uint32_t offset) {
+    if (num_records_in_current_group_ == kNumRecordsPerGroup) {
+      current_group_ = AllocateNewGroup();
+      num_records_in_current_group_ = 0;
+    }
+    auto& new_record = current_group_[num_records_in_current_group_++];
+    new_record.hash = hash;
+    new_record.offset = offset;
+    new_record.next = nullptr;
+  }
+
+  size_t GetNumRecords() const {
+    return (groups_.size() - 1) * kNumRecordsPerGroup +
+           num_records_in_current_group_;
+  }
+  IndexRecord* At(size_t index) {
+    return &(groups_[index / kNumRecordsPerGroup][index % kNumRecordsPerGroup]);
+  }
+
+ private:
+  IndexRecord* AllocateNewGroup() {
+    IndexRecord* result = new IndexRecord[kNumRecordsPerGroup];
+    groups_.push_back(result);
+    return result;
+  }
+
+  // Each group in `groups_` contains a fixed number of records (determined
+  // by kNumRecordsPerGroup), which helps minimize the cost when the list
+  // grows.
+  const size_t kNumRecordsPerGroup;
+  IndexRecord* current_group_;
+  // List of arrays allocated
+  std::vector<IndexRecord*> groups_;
+  size_t num_records_in_current_group_;
+};
+
+Status PlainTableReader::PopulateIndexRecordList(IndexRecordList* record_list,
+                                                 int* num_prefixes) const {
+  Slice prev_key_prefix_slice;
+  uint32_t prev_key_prefix_hash = 0;
+  uint32_t pos = data_start_offset_;
+  int num_keys_per_prefix = 0;
+  bool is_first_record = true;
+  HistogramImpl keys_per_prefix_hist;
+  // Need map to be ordered to make sure sub indexes generated
+  // are in order.
+
+  *num_prefixes = 0;
+  while (pos < data_end_offset_) {
+    uint32_t key_offset = pos;
+    ParsedInternalKey key;
+    Slice value_slice;
+    Status s = Next(&pos, &key, &value_slice);
+    if (!s.ok()) {
+      return s;
+    }
+    if (bloom_) {
+      // In total order mode with the bloom filter enabled, add the hash of
+      // the whole user key.
+      bloom_->AddHash(GetSliceHash(key.user_key));
+    }
+    Slice key_prefix_slice = GetPrefix(key);
+
+    if (is_first_record || prev_key_prefix_slice != key_prefix_slice) {
+      ++(*num_prefixes);
+      if (!is_first_record) {
+        keys_per_prefix_hist.Add(num_keys_per_prefix);
+      }
+      num_keys_per_prefix = 0;
+      prev_key_prefix_slice = key_prefix_slice;
+      prev_key_prefix_hash = GetSliceHash(key_prefix_slice);
+    }
+
+    if (kIndexIntervalForSamePrefixKeys == 0 ||
+        num_keys_per_prefix++ % kIndexIntervalForSamePrefixKeys == 0) {
+      // Add an index key for every kIndexIntervalForSamePrefixKeys keys
+      record_list->AddRecord(prev_key_prefix_hash, key_offset);
+    }
+    is_first_record = false;
+  }
+
+  keys_per_prefix_hist.Add(num_keys_per_prefix);
+  Log(options_.info_log, "Number of Keys per prefix Histogram: %s",
+      keys_per_prefix_hist.ToString().c_str());
+
+  return Status::OK();
+}
+
+void PlainTableReader::AllocateIndexAndBloom(int num_prefixes) {
+  if (options_.prefix_extractor.get() != nullptr) {
+    uint32_t bloom_total_bits = num_prefixes * kBloomBitsPerKey;
+    if (bloom_total_bits > 0) {
+      bloom_.reset(new DynamicBloom(bloom_total_bits, options_.bloom_locality,
+                                    6, nullptr, huge_page_tlb_size_,
+                                    options_.info_log.get()));
+    }
+  }
+
+  if (options_.prefix_extractor.get() == nullptr || kHashTableRatio <= 0) {
+    // Fall back to pure binary search if the user fails to specify a prefix
+    // extractor.
+    index_size_ = 1;
+  } else {
+    double hash_table_size_multiplier = 1.0 / kHashTableRatio;
+    index_size_ = num_prefixes * hash_table_size_multiplier + 1;
+  }
+}
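+
+// Worked example for the sizing above (illustrative numbers): with
+// kHashTableRatio = 0.75 and num_prefixes = 9000, the multiplier is
+// 1 / 0.75, so index_size_ = 9000 * (1 / 0.75) + 1 = 12001 buckets; with
+// kBloomBitsPerKey = 10, the bloom filter gets 9000 * 10 = 90000 bits.
+// A ratio <= 0, or a missing prefix extractor, collapses the hash table to
+// a single bucket, and lookups fall back to pure binary search.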
+
+size_t PlainTableReader::BucketizeIndexesAndFillBloom(
+    IndexRecordList* record_list, std::vector<IndexRecord*>* hash_to_offsets,
+    std::vector<uint32_t>* entries_per_bucket) {
+  bool first = true;
+  uint32_t prev_hash = 0;
+  size_t num_records = record_list->GetNumRecords();
+  for (size_t i = 0; i < num_records; i++) {
+    IndexRecord* index_record = record_list->At(i);
+    uint32_t cur_hash = index_record->hash;
+    if (first || prev_hash != cur_hash) {
+      prev_hash = cur_hash;
+      first = false;
+      if (bloom_ && !IsTotalOrderMode()) {
+        bloom_->AddHash(cur_hash);
+      }
+    }
+    uint32_t bucket = GetBucketIdFromHash(cur_hash, index_size_);
+    IndexRecord* prev_bucket_head = (*hash_to_offsets)[bucket];
+    index_record->next = prev_bucket_head;
+    (*hash_to_offsets)[bucket] = index_record;
+    (*entries_per_bucket)[bucket]++;
+  }
+  size_t sub_index_size = 0;
+  for (auto entry_count : *entries_per_bucket) {
+    if (entry_count <= 1) {
+      continue;
+    }
+    // Only buckets with more than 1 entry will have a subindex.
+    sub_index_size += VarintLength(entry_count);
+    // total bytes needed to store these entries' in-file offsets.
+    sub_index_size += entry_count * kOffsetLen;
+  }
+  return sub_index_size;
+}
+
+void PlainTableReader::FillIndexes(
+    const size_t kSubIndexSize,
+    const std::vector<IndexRecord*>& hash_to_offsets,
+    const std::vector<uint32_t>& entries_per_bucket) {
+  Log(options_.info_log, "Reserving %zu bytes for plain table's sub_index",
+      kSubIndexSize);
+  auto total_allocate_size = sizeof(uint32_t) * index_size_ + kSubIndexSize;
+  char* allocated = arena_.AllocateAligned(
+      total_allocate_size, huge_page_tlb_size_, options_.info_log.get());
+  index_ = reinterpret_cast<uint32_t*>(allocated);
+  sub_index_ = allocated + sizeof(uint32_t) * index_size_;
+
+  size_t sub_index_offset = 0;
+  for (int i = 0; i < index_size_; i++) {
+    uint32_t num_keys_for_bucket = entries_per_bucket[i];
+    switch (num_keys_for_bucket) {
+      case 0:
+        // No key for bucket
+        index_[i] = data_end_offset_;
+        break;
+      case 1:
+        // point directly to the file offset
+        index_[i] = hash_to_offsets[i]->offset;
+        break;
+      default:
+        // point to second level indexes.
+        index_[i] = sub_index_offset | kSubIndexMask;
+        char* prev_ptr = &sub_index_[sub_index_offset];
+        char* cur_ptr = EncodeVarint32(prev_ptr, num_keys_for_bucket);
+        sub_index_offset += (cur_ptr - prev_ptr);
+        char* sub_index_pos = &sub_index_[sub_index_offset];
+        IndexRecord* record = hash_to_offsets[i];
+        int j;
+        for (j = num_keys_for_bucket - 1; j >= 0 && record;
+             j--, record = record->next) {
+          EncodeFixed32(sub_index_pos + j * sizeof(uint32_t), record->offset);
+        }
+        assert(j == -1 && record == nullptr);
+        sub_index_offset += kOffsetLen * num_keys_for_bucket;
+        assert(sub_index_offset <= kSubIndexSize);
+        break;
+    }
+  }
+  assert(sub_index_offset == kSubIndexSize);
+
+  Log(options_.info_log, "hash table size: %d, suffix_map length %zu",
+      index_size_, kSubIndexSize);
+}
+
+Status PlainTableReader::PopulateIndex(TableProperties* props) {
+  assert(props != nullptr);
+  table_properties_.reset(props);
+
+  // options.prefix_extractor is required for a hash-based look-up.
+  if (options_.prefix_extractor.get() == nullptr && kHashTableRatio != 0) {
+    return Status::NotSupported(
+        "PlainTable requires a prefix extractor to enable prefix hash mode.");
+  }
+
+  // Get mmapped memory to file_data_.
+  Status s = file_->Read(0, file_size_, &file_data_, nullptr);
+  if (!s.ok()) {
+    return s;
+  }
+
+  IndexRecordList record_list(kRecordsPerGroup);
+  // First, read the whole file; for every kIndexIntervalForSamePrefixKeys
+  // rows of a prefix (starting from the first one), generate a record of
+  // (hash, offset) and append it to the IndexRecordList, which is a data
+  // structure created to store them.
+  int num_prefixes;
+
+  // Allocate bloom filter here for total order mode.
+  if (IsTotalOrderMode()) {
+    uint32_t num_bloom_bits = table_properties_->num_entries * kBloomBitsPerKey;
+    if (num_bloom_bits > 0) {
+      bloom_.reset(new DynamicBloom(num_bloom_bits, options_.bloom_locality, 6,
+                                    nullptr, huge_page_tlb_size_,
+                                    options_.info_log.get()));
+    }
+  }
+
+  s = PopulateIndexRecordList(&record_list, &num_prefixes);
+  if (!s.ok()) {
+    return s;
+  }
+  // Calculate the hash table and bloom filter sizes and allocate memory for
+  // the indexes and bloom filter based on the number of prefixes.
+  AllocateIndexAndBloom(num_prefixes);
+
+  // Bucketize all the index records into a temp data structure, in which for
+  // each bucket, we generate a linked list of IndexRecord, in reversed order.
+  std::vector<IndexRecord*> hash_to_offsets(index_size_, nullptr);
+  std::vector<uint32_t> entries_per_bucket(index_size_, 0);
+  size_t sub_index_size_needed = BucketizeIndexesAndFillBloom(
+      &record_list, &hash_to_offsets, &entries_per_bucket);
+  // From the temp data structure, populate indexes.
+  FillIndexes(sub_index_size_needed, hash_to_offsets, entries_per_bucket);
+
+  // Fill two table properties.
+  // TODO(sdong): after we have the feature of storing the index in the file,
+  // these properties need to be populated to index_size instead.
+  props->user_collected_properties["plain_table_hash_table_size"] =
+      std::to_string(index_size_ * 4U);
+  props->user_collected_properties["plain_table_sub_index_size"] =
+      std::to_string(sub_index_size_needed);
+
+  return Status::OK();
+}
+
+Status PlainTableReader::GetOffset(const Slice& target, const Slice& prefix,
+                                   uint32_t prefix_hash, bool& prefix_matched,
+                                   uint32_t* offset) const {
+  prefix_matched = false;
+  int bucket = GetBucketIdFromHash(prefix_hash, index_size_);
+  uint32_t bucket_value = index_[bucket];
+  if (bucket_value == data_end_offset_) {
+    *offset = data_end_offset_;
+    return Status::OK();
+  } else if ((bucket_value & kSubIndexMask) == 0) {
+    // point directly to the file
+    *offset = bucket_value;
+    return Status::OK();
+  }
+
+  // point to sub-index, need to do a binary search
+  uint32_t low = 0;
+  uint64_t prefix_index_offset = bucket_value ^ kSubIndexMask;
+
+  const char* index_ptr = &sub_index_[prefix_index_offset];
+  uint32_t upper_bound = 0;
+  const char* base_ptr = GetVarint32Ptr(index_ptr, index_ptr + 4, &upper_bound);
+  uint32_t high = upper_bound;
+  ParsedInternalKey mid_key;
+  ParsedInternalKey parsed_target;
+  if (!ParseInternalKey(target, &parsed_target)) {
+    return Status::Corruption(Slice());
+  }
+
+  // The key is in [low, high). Do a binary search within it.
+  while (high - low > 1) {
+    uint32_t mid = (high + low) / 2;
+    uint32_t file_offset = GetFixed32Element(base_ptr, mid);
+    size_t tmp;
+    Status s = ReadKey(file_data_.data() + file_offset, &mid_key, &tmp);
+    if (!s.ok()) {
+      return s;
+    }
+    int cmp_result = internal_comparator_.Compare(mid_key, parsed_target);
+    if (cmp_result < 0) {
+      low = mid;
+    } else {
+      if (cmp_result == 0) {
+        // We happened to find the exact key, or target is smaller than the
+        // first key after base_offset.
+        prefix_matched = true;
+        *offset = file_offset;
+        return Status::OK();
+      } else {
+        high = mid;
+      }
+    }
+  }
+  // Both the key at position low and the key at low+1 could share the same
+  // prefix as target. We need to rule out one of them to avoid going to the
+  // wrong prefix.
+  ParsedInternalKey low_key;
+  size_t tmp;
+  uint32_t low_key_offset = GetFixed32Element(base_ptr, low);
+  Status s = ReadKey(file_data_.data() + low_key_offset, &low_key, &tmp);
+  if (!s.ok()) {
+    return s;
+  }
+  if (GetPrefix(low_key) == prefix) {
+    prefix_matched = true;
+    *offset = low_key_offset;
+  } else if (low + 1 < upper_bound) {
+    // There may possibly be a next prefix; return it.
+    prefix_matched = false;
+    *offset = GetFixed32Element(base_ptr, low + 1);
+  } else {
+    // target is larger than a key of the last prefix in this bucket
+    // but with a different prefix. Key does not exist.
+    *offset = data_end_offset_;
+  }
+  return Status::OK();
+}
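+
+// Decoding sketch for the bucket values consulted above (a restatement of
+// the branches in GetOffset(), for illustration):
+//
+//   uint32_t bucket_value = index_[bucket];
+//   if (bucket_value == data_end_offset_) {
+//     // empty bucket: no key hashes into it
+//   } else if ((bucket_value & kSubIndexMask) == 0) {
+//     // flag bit 0: bucket_value is the file offset of the only prefix's
+//     // first row
+//   } else {
+//     // flag bit 1: bucket_value ^ kSubIndexMask is an offset into
+//     // sub_index_, holding a varint32 count N followed by N fixed32 file
+//     // offsets in ascending key order, ready for binary search
+//   }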
+
+bool PlainTableReader::MatchBloom(uint32_t hash) const {
+  return bloom_.get() == nullptr || bloom_->MayContainHash(hash);
+}
+
+Slice PlainTableReader::GetPrefix(const ParsedInternalKey& target) const {
+  return GetPrefixFromUserKey(target.user_key);
+}
+
+Status PlainTableReader::ReadKey(const char* start, ParsedInternalKey* key,
+                                 size_t* bytes_read) const {
+  const char* key_ptr = nullptr;
+  *bytes_read = 0;
+  size_t user_key_size = 0;
+  if (IsFixedLength()) {
+    user_key_size = user_key_len_;
+    key_ptr = start;
+  } else {
+    uint32_t tmp_size = 0;
+    key_ptr =
+        GetVarint32Ptr(start, file_data_.data() + data_end_offset_, &tmp_size);
+    if (key_ptr == nullptr) {
+      return Status::Corruption(
+          "Unexpected EOF when reading the next key's size");
+    }
+    user_key_size = (size_t)tmp_size;
+    *bytes_read = key_ptr - start;
+  }
+  if (key_ptr + user_key_size + 1 >= file_data_.data() + data_end_offset_) {
+    return Status::Corruption("Unexpected EOF when reading the next key");
+  }
+
+  if (*(key_ptr + user_key_size) == PlainTableFactory::kValueTypeSeqId0) {
+    // Special encoding for the row with seqID=0
+    key->user_key = Slice(key_ptr, user_key_size);
+    key->sequence = 0;
+    key->type = kTypeValue;
+    *bytes_read += user_key_size + 1;
+  } else {
+    if (start + user_key_size + 8 >= file_data_.data() + data_end_offset_) {
+      return Status::Corruption(
+          "Unexpected EOF when reading internal bytes of the next key");
+    }
+    if (!ParseInternalKey(Slice(key_ptr, user_key_size + 8), key)) {
+      return Status::Corruption(
+          Slice("Incorrect value type found when reading the next key"));
+    }
+    *bytes_read += user_key_size + 8;
+  }
+
+  return Status::OK();
+}
+
+Status PlainTableReader::Next(uint32_t* offset, ParsedInternalKey* key,
+                              Slice* value) const {
+  if (*offset == data_end_offset_) {
+    *offset = data_end_offset_;
+    return Status::OK();
+  }
+
+  if (*offset > data_end_offset_) {
+    return Status::Corruption("Offset is out of file size");
+  }
+
+  const char* start = file_data_.data() + *offset;
+  size_t bytes_for_key;
+  Status s = ReadKey(start, key, &bytes_for_key);
+  if (!s.ok()) {
+    return s;
+  }
+  uint32_t value_size;
+  const char* value_ptr = GetVarint32Ptr(
+      start + bytes_for_key, file_data_.data() + data_end_offset_, &value_size);
+  if (value_ptr == nullptr) {
+    return Status::Corruption(
+        "Unexpected EOF when reading the next value's size.");
+  }
+  *offset = *offset + (value_ptr - start) + value_size;
+  if (*offset > data_end_offset_) {
+    return Status::Corruption("Unexpected EOF when reading the next value.");
+  }
+  *value = Slice(value_ptr, value_size);
+
+  return Status::OK();
+}
+
+Status PlainTableReader::Get(const ReadOptions& ro, const Slice& target,
+                             void* arg,
+                             bool (*saver)(void*, const ParsedInternalKey&,
+                                           const Slice&, bool),
+                             void (*mark_key_may_exist)(void*)) {
+  // Check bloom filter first.
+  Slice prefix_slice;
+  uint32_t prefix_hash;
+  if (IsTotalOrderMode()) {
+    // Match the whole user key for the bloom filter check.
+    if (!MatchBloom(GetSliceHash(GetUserKey(target)))) {
+      return Status::OK();
+    }
+    // In total order mode, there is only one bucket, 0, and we always use the
+    // empty prefix.
+    prefix_slice = Slice();
+    prefix_hash = 0;
+  } else {
+    prefix_slice = GetPrefix(target);
+    prefix_hash = GetSliceHash(prefix_slice);
+    if (!MatchBloom(prefix_hash)) {
+      return Status::OK();
+    }
+  }
+  uint32_t offset;
+  bool prefix_match;
+  Status s =
+      GetOffset(target, prefix_slice, prefix_hash, prefix_match, &offset);
+  if (!s.ok()) {
+    return s;
+  }
+  ParsedInternalKey found_key;
+  ParsedInternalKey parsed_target;
+  if (!ParseInternalKey(target, &parsed_target)) {
+    return Status::Corruption(Slice());
+  }
+
+  Slice found_value;
+  while (offset < data_end_offset_) {
+    Status s = Next(&offset, &found_key, &found_value);
+    if (!s.ok()) {
+      return s;
+    }
+    if (!prefix_match) {
+      // Need to verify the prefix for the first key found if it is not yet
+      // checked.
+      if (GetPrefix(found_key) != prefix_slice) {
+        return Status::OK();
+      }
+      prefix_match = true;
+    }
+    if (internal_comparator_.Compare(found_key, parsed_target) >= 0) {
+      if (!(*saver)(arg, found_key, found_value, true)) {
+        break;
+      }
+    }
+  }
+  return Status::OK();
+}
+
+uint64_t PlainTableReader::ApproximateOffsetOf(const Slice& key) {
+  return 0;
+}
+
+PlainTableIterator::PlainTableIterator(PlainTableReader* table,
+                                       bool use_prefix_seek)
+    : table_(table), use_prefix_seek_(use_prefix_seek) {
+  next_offset_ = offset_ = table_->data_end_offset_;
+}
+
+PlainTableIterator::~PlainTableIterator() {
+}
+
+bool PlainTableIterator::Valid() const {
+  return offset_ < table_->data_end_offset_ &&
+         offset_ >= table_->data_start_offset_;
+}
+
+void PlainTableIterator::SeekToFirst() {
+  next_offset_ = table_->data_start_offset_;
+  if (next_offset_ >= table_->data_end_offset_) {
+    next_offset_ = offset_ = table_->data_end_offset_;
+  } else {
+    Next();
+  }
+}
+
+void PlainTableIterator::SeekToLast() {
+  assert(false);
+  status_ = Status::NotSupported("SeekToLast() is not supported in PlainTable");
+}
+
+void PlainTableIterator::Seek(const Slice& target) {
+  // If the user hasn't set the prefix-seek option and we cannot do a
+  // total-order Seek(), fail with an assertion.
+  if (!use_prefix_seek_ && table_->index_size_ > 1) {
+    assert(false);
+    status_ = Status::NotSupported(
+        "PlainTable cannot issue non-prefix seek unless in total order mode.");
+    offset_ = next_offset_ = table_->data_end_offset_;
+    return;
+  }
+
+  Slice prefix_slice = table_->GetPrefix(target);
+  uint32_t prefix_hash = 0;
+  // The bloom filter is ignored in total-order mode.
+  if (!table_->IsTotalOrderMode()) {
+    prefix_hash = GetSliceHash(prefix_slice);
+    if (!table_->MatchBloom(prefix_hash)) {
+      offset_ = next_offset_ = table_->data_end_offset_;
+      return;
+    }
+  }
+  bool prefix_match;
+  status_ = table_->GetOffset(target, prefix_slice, prefix_hash, prefix_match,
+                              &next_offset_);
+  if (!status_.ok()) {
+    offset_ = next_offset_ = table_->data_end_offset_;
+    return;
+  }
+
+  if (next_offset_ < table_->data_end_offset_) {
+    for (Next(); status_.ok() && Valid(); Next()) {
+      if (!prefix_match) {
+        // Need to verify the first key's prefix
+        if (table_->GetPrefix(key()) != prefix_slice) {
+          offset_ = next_offset_ = table_->data_end_offset_;
+          break;
+        }
+        prefix_match = true;
+      }
+      if (table_->internal_comparator_.Compare(key(), target) >= 0) {
+        break;
+      }
+    }
+  } else {
+    offset_ = table_->data_end_offset_;
+  }
+}
+
+void PlainTableIterator::Next() {
+  offset_ = next_offset_;
+  if (offset_ < table_->data_end_offset_) {
+    Slice tmp_slice;
+    ParsedInternalKey parsed_key;
+    status_ = table_->Next(&next_offset_, &parsed_key, &value_);
+    if (status_.ok()) {
+      // Make a copy in this case. TODO: optimize.
+      key_.SetInternalKey(parsed_key);
+    } else {
+      offset_ = next_offset_ = table_->data_end_offset_;
+    }
+  }
+}
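+
+// Usage sketch (illustrative): a PlainTable iterator only supports forward,
+// prefix-bounded iteration unless the table is in total order mode, so a
+// typical scan seeks to a full internal key and walks forward. `reader` is
+// assumed to be an open PlainTableReader.
+//
+//   unique_ptr<Iterator> iter(reader->NewIterator(ReadOptions()));
+//   for (iter->Seek(target); iter->Valid(); iter->Next()) {
+//     // consume iter->key() / iter->value(); note that SeekToLast() and
+//     // Prev() are unsupported and assert.
+//   }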
+
+void PlainTableIterator::Prev() {
+  assert(false);
+}
+
+Slice PlainTableIterator::key() const {
+  assert(Valid());
+  return key_.GetKey();
+}
+
+Slice PlainTableIterator::value() const {
+  assert(Valid());
+  return value_;
+}
+
+Status PlainTableIterator::status() const {
+  return status_;
+}
+
+}  // namespace rocksdb
+#endif  // ROCKSDB_LITE
diff --git a/table/plain_table_reader.h b/table/plain_table_reader.h
new file mode 100644
index 0000000000..62239beb32
--- /dev/null
+++ b/table/plain_table_reader.h
@@ -0,0 +1,265 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+#include <stdint.h>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/table.h"
+#include "rocksdb/table_properties.h"
+#include "table/table_reader.h"
+#include "table/plain_table_factory.h"
+#include "util/arena.h"
+
+namespace rocksdb {
+
+class Block;
+class BlockHandle;
+class Footer;
+struct Options;
+class RandomAccessFile;
+struct ReadOptions;
+class TableCache;
+class TableReader;
+class DynamicBloom;
+class InternalKeyComparator;
+
+using std::unique_ptr;
+using std::unordered_map;
+extern const uint32_t kPlainTableVariableLength;
+
+// Based on the output file format shown in plain_table_factory.h.
+// When opening the output file, IndexedTableReader creates a hash table
+// from key prefixes to offsets in the output file. For each prefix,
+// IndexedTable decides whether the bucket points to the data offset of the
+// first key with that prefix, or to a binary-searchable index for it: if
+// too many keys share the prefix, it creates a binary-searchable index from
+// the suffix to the offset on disk.
+//
+// The implementation of IndexedTableReader requires the output file to be
+// mmapped.
+class PlainTableReader: public TableReader {
+ public:
+  static Status Open(const Options& options, const EnvOptions& soptions,
+                     const InternalKeyComparator& internal_comparator,
+                     unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
+                     unique_ptr<TableReader>* table,
+                     const int bloom_bits_per_key, double hash_table_ratio,
+                     size_t index_sparseness, size_t huge_page_tlb_size);
+
+  Iterator* NewIterator(const ReadOptions&, Arena* arena = nullptr) override;
+
+  Status Get(const ReadOptions&, const Slice& key, void* arg,
+             bool (*result_handler)(void* arg, const ParsedInternalKey& k,
+                                    const Slice& v, bool),
+             void (*mark_key_may_exist)(void*) = nullptr);
+
+  uint64_t ApproximateOffsetOf(const Slice& key);
+
+  void SetupForCompaction();
+
+  std::shared_ptr<const TableProperties> GetTableProperties() const {
+    return table_properties_;
+  }
+
+  PlainTableReader(const Options& options, unique_ptr<RandomAccessFile>&& file,
+                   const EnvOptions& storage_options,
+                   const InternalKeyComparator& internal_comparator,
+                   uint64_t file_size, int bloom_num_bits,
+                   double hash_table_ratio, size_t index_sparseness,
+                   const TableProperties* table_properties,
+                   size_t huge_page_tlb_size);
+  virtual ~PlainTableReader();
+
+ protected:
+  // Check the bloom filter to see whether it might contain this prefix.
+  // The hash of the prefix is given, since it can be reused for the index
+  // lookup too.
+  virtual bool MatchBloom(uint32_t hash) const;
+
+  // PopulateIndex() builds the index of keys. It must be called before any
+  // query to the table.
+  //
+  // props: the table properties object that needs to be stored. Ownership of
+  //        the object will be passed.
+  //
+  // index_ contains index_size_ buckets, each of which is a 32-bit integer.
+  // The lower 31 bits contain an offset value (explained below) and the
+  // first bit of the integer indicates the type of the offset.
+  //
+  // +--------------+------------------------------------------------------+
+  // | Flag (1 bit) | Offset to binary search buffer or file (31 bits)     +
+  // +--------------+------------------------------------------------------+
+  //
+  // Explanation for the "flag bit":
+  //
+  // 0 indicates that the bucket contains only one prefix (no conflict when
+  //   hashing this prefix), whose first row starts from this offset of the
+  //   file.
+  // 1 indicates that the bucket contains more than one prefix, or there
+  //   are too many rows for one prefix so we need a binary search for it. In
+  //   this case, the offset indicates the offset of sub_index_ holding the
+  //   binary search indexes of keys for those rows. Those binary search
+  //   indexes are organized this way:
+  //
+  // A varint32 first indicates how many records (N) are stored after it.
+  // After it, there are N 32-bit integers, each pointing to an offset in the
+  // file where a row starts. Those offsets are guaranteed to be in ascending
+  // order, so the keys they point to are also in ascending order, which lets
+  // us use them for binary searches. Below is a visual presentation of a
+  // bucket:
+  //
+  // <begin>
+  //   number_of_records:     varint32
+  //   record 1 file offset:  fixedint32
+  //   record 2 file offset:  fixedint32
+  //   ....
+  //   record N file offset:  fixedint32
+  // <end>
+  Status PopulateIndex(TableProperties* props);
+
+ private:
+  struct IndexRecord;
+  class IndexRecordList;
+
+  // Plain table maintains an index and a sub-index.
+  // The index is implemented as a hash table; the sub-index is one big
+  // memory array. For more details about the in-memory index, please refer
+  // to:
+  // https://github.com/facebook/rocksdb/wiki/PlainTable-Format
+  // #wiki-in-memory-index-format
+  uint32_t* index_;
+  int index_size_ = 0;
+  char* sub_index_;
+
+  Options options_;
+  const EnvOptions& soptions_;
+  unique_ptr<RandomAccessFile> file_;
+
+  const InternalKeyComparator internal_comparator_;
+  // represents plain table's current status.
+  Status status_;
+
+  Slice file_data_;
+  uint32_t file_size_;
+
+  const double kHashTableRatio;
+  const int kBloomBitsPerKey;
+  // To speed up the search for keys with the same prefix, we add an index
+  // key for every N keys, where N is determined by
+  // kIndexIntervalForSamePrefixKeys
+  const size_t kIndexIntervalForSamePrefixKeys = 16;
+  // Bloom filter is used to rule out non-existent keys
+  unique_ptr<DynamicBloom> bloom_;
+  Arena arena_;
+
+  std::shared_ptr<const TableProperties> table_properties_;
+  // data_start_offset_ and data_end_offset_ define the range of the
+  // sst file that stores data.
+  const uint32_t data_start_offset_ = 0;
+  const uint32_t data_end_offset_;
+  const size_t user_key_len_;
+  const size_t huge_page_tlb_size_;
+
+  static const size_t kNumInternalBytes = 8;
+  static const uint32_t kSubIndexMask = 0x80000000;
+  static const size_t kOffsetLen = sizeof(uint32_t);
+  static const uint64_t kMaxFileSize = 1u << 31;
+  static const size_t kRecordsPerGroup = 256;
+
+  bool IsFixedLength() const {
+    return user_key_len_ != kPlainTableVariableLength;
+  }
+
+  size_t GetFixedInternalKeyLength() const {
+    return user_key_len_ + kNumInternalBytes;
+  }
+
+  friend class TableCache;
+  friend class PlainTableIterator;
+
+  // Internal helper function to generate an IndexRecordList object from all
+  // the rows, which contains index records as a list.
+  // If bloom_ is not null, all the keys' full-key hashes will be added to
+  // the bloom filter.
+  Status PopulateIndexRecordList(IndexRecordList* record_list,
+                                 int* num_prefixes) const;
+
+  // Internal helper function to allocate memory for indexes and bloom filters
+  void AllocateIndexAndBloom(int num_prefixes);
+
+  // Internal helper function to bucketize the index record list into hash
+  // buckets.
+  // bucket_headers is a vector of size index_size_, with each entry
+  // containing a linked list of IndexRecords hashed to the same bucket, in
+  // reverse order.
+  // entries_per_bucket has size index_size_. Each value is the number of
+  // index records in bucket_headers for that bucket.
+  size_t BucketizeIndexesAndFillBloom(
+      IndexRecordList* record_list, std::vector<IndexRecord*>* bucket_headers,
+      std::vector<uint32_t>* entries_per_bucket);
+
+  // Internal helper function to fill the indexes and bloom filters into
+  // internal data structures. bucket_headers and entries_per_bucket are
+  // bucketized indexes and counts generated by
+  // BucketizeIndexesAndFillBloom().
+  void FillIndexes(const size_t kSubIndexSize,
+                   const std::vector<IndexRecord*>& bucket_headers,
+                   const std::vector<uint32_t>& entries_per_bucket);
+
+  // Read a plain table key from the position `start`. The read content
+  // will be written to `key` and the number of bytes read will be populated
+  // in `bytes_read`.
+  Status ReadKey(const char* start, ParsedInternalKey* key,
+                 size_t* bytes_read) const;
+  // Read the key and value at `offset` into parameters `key` and `value`.
+  // On success, `offset` will be updated to the offset of the next key.
+  Status Next(uint32_t* offset, ParsedInternalKey* key, Slice* value) const;
+  // Get the file offset for the key target.
+  // The output value prefix_matched is set to true if the offset is
+  // confirmed for a key with the same prefix as target.
+  Status GetOffset(const Slice& target, const Slice& prefix,
+                   uint32_t prefix_hash, bool& prefix_matched,
+                   uint32_t* offset) const;
+
+  Slice GetUserKey(const Slice& key) const {
+    return Slice(key.data(), key.size() - 8);
+  }
+
+  Slice GetPrefix(const Slice& target) const {
+    assert(target.size() >= 8);  // target is an internal key
+    return GetPrefixFromUserKey(GetUserKey(target));
+  }
+
+  inline Slice GetPrefix(const ParsedInternalKey& target) const;
+
+  Slice GetPrefixFromUserKey(const Slice& user_key) const {
+    if (!IsTotalOrderMode()) {
+      return options_.prefix_extractor->Transform(user_key);
+    } else {
+      // Use an empty slice as the prefix if prefix_extractor is not set.
+      // In that case, it falls back to pure binary search, and total-order
+      // iterator seek is supported.
+      return Slice();
+    }
+  }
+
+  bool IsTotalOrderMode() const {
+    return (options_.prefix_extractor.get() == nullptr);
+  }
+
+  // No copying allowed
+  explicit PlainTableReader(const TableReader&) = delete;
+  void operator=(const TableReader&) = delete;
+};
+}  // namespace rocksdb
+#endif  // ROCKSDB_LITE
diff --git a/table/table_builder.h b/table/table_builder.h
new file mode 100644
index 0000000000..ee32cff863
--- /dev/null
+++ b/table/table_builder.h
@@ -0,0 +1,55 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <stdint.h>
+
+namespace rocksdb {
+
+class Slice;
+class Status;
+
+// TableBuilder provides the interface used to build a Table
+// (an immutable and sorted map from keys to values).
+//
+// Multiple threads can invoke const methods on a TableBuilder without
+// external synchronization, but if any of the threads may call a
+// non-const method, all threads accessing the same TableBuilder must use
+// external synchronization.
+class TableBuilder {
+ public:
+  // REQUIRES: Either Finish() or Abandon() has been called.
+  virtual ~TableBuilder() {}
+
+  // Add key,value to the table being constructed.
+  // REQUIRES: key is after any previously added key according to comparator.
+  // REQUIRES: Finish(), Abandon() have not been called
+  virtual void Add(const Slice& key, const Slice& value) = 0;
+
+  // Return non-ok iff some error has been detected.
+  virtual Status status() const = 0;
+
+  // Finish building the table.
+  // REQUIRES: Finish(), Abandon() have not been called
+  virtual Status Finish() = 0;
+
+  // Indicate that the contents of this builder should be abandoned.
+  // If the caller is not going to call Finish(), it must call Abandon()
+  // before destroying this builder.
+  // REQUIRES: Finish(), Abandon() have not been called
+  virtual void Abandon() = 0;
+
+  // Number of calls to Add() so far.
+  virtual uint64_t NumEntries() const = 0;
+
+  // Size of the file generated so far. If invoked after a successful
+  // Finish() call, returns the size of the final generated file.
+  virtual uint64_t FileSize() const = 0;
+};
+
+}  // namespace rocksdb
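+
+// Usage sketch of the contract above (illustrative; the helper name and
+// input are assumptions): keys must be added in comparator order, and
+// exactly one of Finish() or Abandon() must be called before destruction.
+//
+//   void BuildExampleTable(
+//       TableBuilder* builder,
+//       const std::vector<std::pair<std::string, std::string>>& sorted_kvs) {
+//     for (const auto& kv : sorted_kvs) {
+//       builder->Add(kv.first, kv.second);
+//     }
+//     if (builder->status().ok()) {
+//       builder->Finish();    // persists the table contents
+//     } else {
+//       builder->Abandon();   // discards them
+//     }
+//   }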
+ +#include "rocksdb/table_properties.h" +#include "rocksdb/iterator.h" +#include "rocksdb/env.h" + +namespace rocksdb { + +namespace { + void AppendProperty( + std::string& props, + const std::string& key, + const std::string& value, + const std::string& prop_delim, + const std::string& kv_delim) { + props.append(key); + props.append(kv_delim); + props.append(value); + props.append(prop_delim); + } + + template + void AppendProperty( + std::string& props, + const std::string& key, + const TValue& value, + const std::string& prop_delim, + const std::string& kv_delim) { + AppendProperty( + props, key, std::to_string(value), prop_delim, kv_delim + ); + } +} + +std::string TableProperties::ToString( + const std::string& prop_delim, + const std::string& kv_delim) const { + std::string result; + result.reserve(1024); + + // Basic Info + AppendProperty(result, "# data blocks", num_data_blocks, prop_delim, + kv_delim); + AppendProperty(result, "# entries", num_entries, prop_delim, kv_delim); + + AppendProperty(result, "raw key size", raw_key_size, prop_delim, kv_delim); + AppendProperty(result, "raw average key size", + num_entries != 0 ? 1.0 * raw_key_size / num_entries : 0.0, + prop_delim, kv_delim); + AppendProperty(result, "raw value size", raw_value_size, prop_delim, + kv_delim); + AppendProperty(result, "raw average value size", + num_entries != 0 ? 1.0 * raw_value_size / num_entries : 0.0, + prop_delim, kv_delim); + + AppendProperty(result, "data block size", data_size, prop_delim, kv_delim); + AppendProperty(result, "index block size", index_size, prop_delim, kv_delim); + AppendProperty(result, "filter block size", filter_size, prop_delim, + kv_delim); + AppendProperty(result, "(estimated) table size", + data_size + index_size + filter_size, prop_delim, kv_delim); + + AppendProperty( + result, "filter policy name", + filter_policy_name.empty() ? std::string("N/A") : filter_policy_name, + prop_delim, kv_delim); + + return result; +} + +const std::string TablePropertiesNames::kDataSize = + "rocksdb.data.size"; +const std::string TablePropertiesNames::kIndexSize = + "rocksdb.index.size"; +const std::string TablePropertiesNames::kFilterSize = + "rocksdb.filter.size"; +const std::string TablePropertiesNames::kRawKeySize = + "rocksdb.raw.key.size"; +const std::string TablePropertiesNames::kRawValueSize = + "rocksdb.raw.value.size"; +const std::string TablePropertiesNames::kNumDataBlocks = + "rocksdb.num.data.blocks"; +const std::string TablePropertiesNames::kNumEntries = + "rocksdb.num.entries"; +const std::string TablePropertiesNames::kFilterPolicy = + "rocksdb.filter.policy"; +const std::string TablePropertiesNames::kFormatVersion = + "rocksdb.format.version"; +const std::string TablePropertiesNames::kFixedKeyLen = + "rocksdb.fixed.key.length"; + +extern const std::string kPropertiesBlock = "rocksdb.properties"; +// Old property block name for backward compatibility +extern const std::string kPropertiesBlockOldName = "rocksdb.stats"; + +// Seek to the properties block. +// Return true if it successfully seeks to the properties block. 
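+// A usage sketch (hedged: the caller shape is an assumption; BlockHandle
+// decoding follows the format conventions used elsewhere in this patch):
+//
+//   bool found = false;
+//   Status s = SeekToPropertiesBlock(meta_iter, &found);
+//   if (s.ok() && found) {
+//     Slice v = meta_iter->value();
+//     BlockHandle handle;
+//     s = handle.DecodeFrom(&v);  // offset/size of the properties block
+//   }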
+Status SeekToPropertiesBlock(Iterator* meta_iter, bool* is_found) { + *is_found = true; + meta_iter->Seek(kPropertiesBlock); + if (meta_iter->status().ok() && + (!meta_iter->Valid() || meta_iter->key() != kPropertiesBlock)) { + meta_iter->Seek(kPropertiesBlockOldName); + if (meta_iter->status().ok() && + (!meta_iter->Valid() || meta_iter->key() != kPropertiesBlockOldName)) { + *is_found = false; + } + } + return meta_iter->status(); +} + +} // namespace rocksdb diff --git a/table/table_reader.h b/table/table_reader.h new file mode 100644 index 0000000000..9238b880c7 --- /dev/null +++ b/table/table_reader.h @@ -0,0 +1,71 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include + +namespace rocksdb { + +class Iterator; +struct ParsedInternalKey; +class Slice; +class Arena; +struct ReadOptions; +struct TableProperties; + +// A Table is a sorted map from strings to strings. Tables are +// immutable and persistent. A Table may be safely accessed from +// multiple threads without external synchronization. +class TableReader { + public: + virtual ~TableReader() {} + + // Returns a new iterator over the table contents. + // The result of NewIterator() is initially invalid (caller must + // call one of the Seek methods on the iterator before using it). + // arena: If not null, the arena needs to be used to allocate the Iterator. + // When destroying the iterator, the caller will not call "delete" + // but Iterator::~Iterator() directly. The destructor needs to destroy + // all the states but those allocated in arena. + virtual Iterator* NewIterator(const ReadOptions&, Arena* arena = nullptr) = 0; + + // Given a key, return an approximate byte offset in the file where + // the data for that key begins (or would begin if the key were + // present in the file). The returned value is in terms of file + // bytes, and so includes effects like compression of the underlying data. + // E.g., the approximate offset of the last key in the table will + // be close to the file length. + virtual uint64_t ApproximateOffsetOf(const Slice& key) = 0; + + // Set up the table for Compaction. Might change some parameters with + // posix_fadvise + virtual void SetupForCompaction() = 0; + + virtual std::shared_ptr GetTableProperties() const = 0; + + // Calls (*result_handler)(handle_context, ...) repeatedly, starting with + // the entry found after a call to Seek(key), until result_handler returns + // false, where k is the actual internal key for a row found and v as the + // value of the key. didIO is true if I/O is involved in the operation. May + // not make such a call if filter policy says that key is not present. + // + // mark_key_may_exist_handler needs to be called when it is configured to be + // memory only and the key is not found in the block cache, with + // the parameter to be handle_context. 
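+  //
+  // An illustrative result_handler (an assumption; it mirrors the saver
+  // callbacks used by callers elsewhere in this patch):
+  //
+  //   static bool SaveValue(void* arg, const ParsedInternalKey& k,
+  //                         const Slice& v, bool didIO) {
+  //     static_cast<std::string*>(arg)->assign(v.data(), v.size());
+  //     return false;  // found the row; stop the scan
+  //   }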
+  //
+  // readOptions is the options for the read.
+  // key is the key to search for.
+  virtual Status Get(
+      const ReadOptions& readOptions, const Slice& key, void* handle_context,
+      bool (*result_handler)(void* arg, const ParsedInternalKey& k,
+                             const Slice& v, bool didIO),
+      void (*mark_key_may_exist_handler)(void* handle_context) = nullptr) = 0;
+};
+
+}  // namespace rocksdb
diff --git a/table/table_reader_bench.cc b/table/table_reader_bench.cc
new file mode 100644
index 0000000000..650849a553
--- /dev/null
+++ b/table/table_reader_bench.cc
@@ -0,0 +1,284 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+#ifndef GFLAGS
+#include <cstdio>
+int main() {
+  fprintf(stderr, "Please install gflags to run rocksdb tools\n");
+  return 1;
+}
+#else
+
+#include <gflags/gflags.h>
+
+#include "rocksdb/db.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/table.h"
+#include "db/db_impl.h"
+#include "db/dbformat.h"
+#include "port/atomic_pointer.h"
+#include "table/block_based_table_factory.h"
+#include "table/plain_table_factory.h"
+#include "table/table_builder.h"
+#include "util/histogram.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+
+using GFLAGS::ParseCommandLineFlags;
+using GFLAGS::SetUsageMessage;
+
+namespace rocksdb {
+
+namespace {
+// Make a key whose first 4 characters are determined by i and whose last 4
+// characters are determined by j.
+static std::string MakeKey(int i, int j, bool through_db) {
+  char buf[100];
+  snprintf(buf, sizeof(buf), "%04d__key___%04d", i, j);
+  if (through_db) {
+    return std::string(buf);
+  }
+  // If we query the table directly, which operates on internal keys
+  // instead of user keys, we need to add 8 bytes of internal
+  // information (row type etc.) to the user key to make an internal
+  // key.
+  InternalKey key(std::string(buf), 0, ValueType::kTypeValue);
+  return key.Encode().ToString();
+}
+
+static bool DummySaveValue(void* arg, const ParsedInternalKey& ikey,
+                           const Slice& v, bool didIO) {
+  return false;
+}
+
+uint64_t Now(Env* env, bool measured_by_nanosecond) {
+  return measured_by_nanosecond ? env->NowNanos() : env->NowMicros();
+}
+}  // namespace
+
+// A very simple query benchmark.
+// Create a table with roughly num_keys1 * num_keys2 keys, where there are
+// num_keys1 prefixes and each prefix has num_keys2 distinct keys, differing
+// in the suffix part.
+// If if_query_empty_keys = false, query the existing keys num_keys1 *
+// num_keys2 times randomly.
+// If if_query_empty_keys = true, query num_keys1 * num_keys2 random empty
+// keys.
+// Print out the total time.
+// If through_db = true, a full DB will be created and queries will run
+// against it. Otherwise, operations will go directly through the table level.
+//
+// If for_iterator = true, instead of querying one key each time, it queries
+// a range sharing the same prefix.
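+//
+// Example invocations (illustrative; the binary name is an assumption, the
+// flags are the DEFINE_* declarations at the bottom of this file):
+//
+//   ./table_reader_bench --num_keys1=4096 --num_keys2=512 --iterator
+//   ./table_reader_bench --plain_table --through_db --time_unit=nanosecond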
+namespace { +void TableReaderBenchmark(Options& opts, EnvOptions& env_options, + ReadOptions& read_options, int num_keys1, + int num_keys2, int num_iter, int prefix_len, + bool if_query_empty_keys, bool for_iterator, + bool through_db, bool measured_by_nanosecond) { + rocksdb::InternalKeyComparator ikc(opts.comparator); + + std::string file_name = test::TmpDir() + + "/rocksdb_table_reader_benchmark"; + std::string dbname = test::TmpDir() + "/rocksdb_table_reader_bench_db"; + WriteOptions wo; + unique_ptr file; + Env* env = Env::Default(); + TableBuilder* tb = nullptr; + DB* db = nullptr; + Status s; + if (!through_db) { + env->NewWritableFile(file_name, &file, env_options); + tb = opts.table_factory->NewTableBuilder(opts, ikc, file.get(), + CompressionType::kNoCompression); + } else { + s = DB::Open(opts, dbname, &db); + ASSERT_OK(s); + ASSERT_TRUE(db != nullptr); + } + // Populate slightly more than 1M keys + for (int i = 0; i < num_keys1; i++) { + for (int j = 0; j < num_keys2; j++) { + std::string key = MakeKey(i * 2, j, through_db); + if (!through_db) { + tb->Add(key, key); + } else { + db->Put(wo, key, key); + } + } + } + if (!through_db) { + tb->Finish(); + file->Close(); + } else { + db->Flush(FlushOptions()); + } + + unique_ptr table_reader; + unique_ptr raf; + if (!through_db) { + Status s = env->NewRandomAccessFile(file_name, &raf, env_options); + uint64_t file_size; + env->GetFileSize(file_name, &file_size); + s = opts.table_factory->NewTableReader( + opts, env_options, ikc, std::move(raf), file_size, &table_reader); + } + + Random rnd(301); + std::string result; + HistogramImpl hist; + + void* arg = nullptr; + for (int it = 0; it < num_iter; it++) { + for (int i = 0; i < num_keys1; i++) { + for (int j = 0; j < num_keys2; j++) { + int r1 = rnd.Uniform(num_keys1) * 2; + int r2 = rnd.Uniform(num_keys2); + if (if_query_empty_keys) { + r1++; + r2 = num_keys2 * 2 - r2; + } + + if (!for_iterator) { + // Query one existing key; + std::string key = MakeKey(r1, r2, through_db); + uint64_t start_time = Now(env, measured_by_nanosecond); + port::MemoryBarrier(); + if (!through_db) { + s = table_reader->Get(read_options, key, arg, DummySaveValue, + nullptr); + } else { + s = db->Get(read_options, key, &result); + } + port::MemoryBarrier(); + hist.Add(Now(env, measured_by_nanosecond) - start_time); + } else { + int r2_len; + if (if_query_empty_keys) { + r2_len = 0; + } else { + r2_len = rnd.Uniform(num_keys2) + 1; + if (r2_len + r2 > num_keys2) { + r2_len = num_keys2 - r2; + } + } + std::string start_key = MakeKey(r1, r2, through_db); + std::string end_key = MakeKey(r1, r2 + r2_len, through_db); + uint64_t total_time = 0; + uint64_t start_time = Now(env, measured_by_nanosecond); + port::MemoryBarrier(); + Iterator* iter; + if (!through_db) { + iter = table_reader->NewIterator(read_options); + } else { + iter = db->NewIterator(read_options); + } + int count = 0; + for(iter->Seek(start_key); iter->Valid(); iter->Next()) { + if (if_query_empty_keys) { + break; + } + // verify key; + port::MemoryBarrier(); + total_time += Now(env, measured_by_nanosecond) - start_time; + assert(Slice(MakeKey(r1, r2 + count, through_db)) == iter->key()); + start_time = Now(env, measured_by_nanosecond); + if (++count >= r2_len) { + break; + } + } + if (count != r2_len) { + fprintf( + stderr, "Iterator cannot iterate expected number of entries. 
" + "Expected %d but got %d\n", r2_len, count); + assert(false); + } + delete iter; + port::MemoryBarrier(); + total_time += Now(env, measured_by_nanosecond) - start_time; + hist.Add(total_time); + } + } + } + } + + fprintf( + stderr, + "===================================================" + "====================================================\n" + "InMemoryTableSimpleBenchmark: %20s num_key1: %5d " + "num_key2: %5d %10s\n" + "===================================================" + "====================================================" + "\nHistogram (unit: %s): \n%s", + opts.table_factory->Name(), num_keys1, num_keys2, + for_iterator ? "iterator" : (if_query_empty_keys ? "empty" : "non_empty"), + measured_by_nanosecond ? "nanosecond" : "microsecond", + hist.ToString().c_str()); + if (!through_db) { + env->DeleteFile(file_name); + } else { + delete db; + db = nullptr; + DestroyDB(dbname, opts); + } +} +} // namespace +} // namespace rocksdb + +DEFINE_bool(query_empty, false, "query non-existing keys instead of existing " + "ones."); +DEFINE_int32(num_keys1, 4096, "number of distinguish prefix of keys"); +DEFINE_int32(num_keys2, 512, "number of distinguish keys for each prefix"); +DEFINE_int32(iter, 3, "query non-existing keys instead of existing ones"); +DEFINE_int32(prefix_len, 16, "Prefix length used for iterators and indexes"); +DEFINE_bool(iterator, false, "For test iterator"); +DEFINE_bool(through_db, false, "If enable, a DB instance will be created and " + "the query will be against DB. Otherwise, will be directly against " + "a table reader."); +DEFINE_bool(plain_table, false, "Use PlainTable"); +DEFINE_string(time_unit, "microsecond", + "The time unit used for measuring performance. User can specify " + "`microsecond` (default) or `nanosecond`"); + +int main(int argc, char** argv) { + SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) + + " [OPTIONS]..."); + ParseCommandLineFlags(&argc, &argv, true); + + rocksdb::TableFactory* tf = new rocksdb::BlockBasedTableFactory(); + rocksdb::Options options; + if (FLAGS_prefix_len < 16) { + options.prefix_extractor.reset(rocksdb::NewFixedPrefixTransform( + FLAGS_prefix_len)); + } + rocksdb::ReadOptions ro; + rocksdb::EnvOptions env_options; + options.create_if_missing = true; + options.compression = rocksdb::CompressionType::kNoCompression; + + if (FLAGS_plain_table) { + options.allow_mmap_reads = true; + env_options.use_mmap_reads = true; + tf = new rocksdb::PlainTableFactory(16, (FLAGS_prefix_len == 16) ? 0 : 8, + 0.75); + options.prefix_extractor.reset(rocksdb::NewFixedPrefixTransform( + FLAGS_prefix_len)); + } else { + tf = new rocksdb::BlockBasedTableFactory(); + } + // if user provides invalid options, just fall back to microsecond. + bool measured_by_nanosecond = FLAGS_time_unit == "nanosecond"; + + options.table_factory = + std::shared_ptr(tf); + rocksdb::TableReaderBenchmark(options, env_options, ro, FLAGS_num_keys1, + FLAGS_num_keys2, FLAGS_iter, FLAGS_prefix_len, + FLAGS_query_empty, FLAGS_iterator, + FLAGS_through_db, measured_by_nanosecond); + delete tf; + return 0; +} + +#endif // GFLAGS diff --git a/table/table_test.cc b/table/table_test.cc new file mode 100644 index 0000000000..dd81baea88 --- /dev/null +++ b/table/table_test.cc @@ -0,0 +1,1805 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include +#include + +#include +#include +#include +#include +#include + +#include "db/dbformat.h" +#include "db/memtable.h" +#include "db/write_batch_internal.h" + +#include "rocksdb/cache.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/iterator.h" +#include "rocksdb/memtablerep.h" +#include "rocksdb/slice_transform.h" +#include "rocksdb/statistics.h" + +#include "table/block.h" +#include "table/block_based_table_builder.h" +#include "table/block_based_table_factory.h" +#include "table/block_based_table_reader.h" +#include "table/block_builder.h" +#include "table/format.h" +#include "table/meta_blocks.h" +#include "table/plain_table_factory.h" + +#include "util/random.h" +#include "util/statistics.h" +#include "util/testharness.h" +#include "util/testutil.h" + +namespace rocksdb { + +extern const uint64_t kLegacyBlockBasedTableMagicNumber; +extern const uint64_t kLegacyPlainTableMagicNumber; +extern const uint64_t kBlockBasedTableMagicNumber; +extern const uint64_t kPlainTableMagicNumber; + +namespace { + +// Return reverse of "key". +// Used to test non-lexicographic comparators. +std::string Reverse(const Slice& key) { + auto rev = key.ToString(); + std::reverse(rev.begin(), rev.end()); + return rev; +} + +class ReverseKeyComparator : public Comparator { + public: + virtual const char* Name() const { + return "rocksdb.ReverseBytewiseComparator"; + } + + virtual int Compare(const Slice& a, const Slice& b) const { + return BytewiseComparator()->Compare(Reverse(a), Reverse(b)); + } + + virtual void FindShortestSeparator( + std::string* start, + const Slice& limit) const { + std::string s = Reverse(*start); + std::string l = Reverse(limit); + BytewiseComparator()->FindShortestSeparator(&s, l); + *start = Reverse(s); + } + + virtual void FindShortSuccessor(std::string* key) const { + std::string s = Reverse(*key); + BytewiseComparator()->FindShortSuccessor(&s); + *key = Reverse(s); + } +}; + +ReverseKeyComparator reverse_key_comparator; + +void Increment(const Comparator* cmp, std::string* key) { + if (cmp == BytewiseComparator()) { + key->push_back('\0'); + } else { + assert(cmp == &reverse_key_comparator); + std::string rev = Reverse(*key); + rev.push_back('\0'); + *key = Reverse(rev); + } +} + +// An STL comparator that uses a Comparator +struct STLLessThan { + const Comparator* cmp; + + STLLessThan() : cmp(BytewiseComparator()) { } + explicit STLLessThan(const Comparator* c) : cmp(c) { } + bool operator()(const std::string& a, const std::string& b) const { + return cmp->Compare(Slice(a), Slice(b)) < 0; + } +}; + +} // namespace + +class StringSink: public WritableFile { + public: + ~StringSink() { } + + const std::string& contents() const { return contents_; } + + virtual Status Close() { return Status::OK(); } + virtual Status Flush() { return Status::OK(); } + virtual Status Sync() { return Status::OK(); } + + virtual Status Append(const Slice& data) { + contents_.append(data.data(), data.size()); + return Status::OK(); + } + + private: + std::string contents_; +}; + + +class StringSource: public RandomAccessFile { + public: + StringSource(const Slice& contents, uint64_t uniq_id, bool mmap) + : contents_(contents.data(), contents.size()), 
uniq_id_(uniq_id), + mmap_(mmap) { + } + + virtual ~StringSource() { } + + uint64_t Size() const { return contents_.size(); } + + virtual Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const { + if (offset > contents_.size()) { + return Status::InvalidArgument("invalid Read offset"); + } + if (offset + n > contents_.size()) { + n = contents_.size() - offset; + } + if (!mmap_) { + memcpy(scratch, &contents_[offset], n); + *result = Slice(scratch, n); + } else { + *result = Slice(&contents_[offset], n); + } + return Status::OK(); + } + + virtual size_t GetUniqueId(char* id, size_t max_size) const { + if (max_size < 20) { + return 0; + } + + char* rid = id; + rid = EncodeVarint64(rid, uniq_id_); + rid = EncodeVarint64(rid, 0); + return static_cast(rid-id); + } + + private: + std::string contents_; + uint64_t uniq_id_; + bool mmap_; +}; + +typedef std::map KVMap; + +// Helper class for tests to unify the interface between +// BlockBuilder/TableBuilder and Block/Table. +class Constructor { + public: + explicit Constructor(const Comparator* cmp) : data_(STLLessThan(cmp)) {} + virtual ~Constructor() { } + + void Add(const std::string& key, const Slice& value) { + data_[key] = value.ToString(); + } + + // Finish constructing the data structure with all the keys that have + // been added so far. Returns the keys in sorted order in "*keys" + // and stores the key/value pairs in "*kvmap" + void Finish(const Options& options, + const InternalKeyComparator& internal_comparator, + std::vector* keys, KVMap* kvmap) { + last_internal_key_ = &internal_comparator; + *kvmap = data_; + keys->clear(); + for (KVMap::const_iterator it = data_.begin(); + it != data_.end(); + ++it) { + keys->push_back(it->first); + } + data_.clear(); + Status s = FinishImpl(options, internal_comparator, *kvmap); + ASSERT_TRUE(s.ok()) << s.ToString(); + } + + // Construct the data structure from the data in "data" + virtual Status FinishImpl(const Options& options, + const InternalKeyComparator& internal_comparator, + const KVMap& data) = 0; + + virtual Iterator* NewIterator() const = 0; + + virtual const KVMap& data() { return data_; } + + virtual DB* db() const { return nullptr; } // Overridden in DBConstructor + + protected: + const InternalKeyComparator* last_internal_key_; + + private: + KVMap data_; +}; + +class BlockConstructor: public Constructor { + public: + explicit BlockConstructor(const Comparator* cmp) + : Constructor(cmp), + comparator_(cmp), + block_(nullptr) { } + ~BlockConstructor() { + delete block_; + } + virtual Status FinishImpl(const Options& options, + const InternalKeyComparator& internal_comparator, + const KVMap& data) { + delete block_; + block_ = nullptr; + BlockBuilder builder(options, &internal_comparator); + + for (KVMap::const_iterator it = data.begin(); + it != data.end(); + ++it) { + builder.Add(it->first, it->second); + } + // Open the block + data_ = builder.Finish().ToString(); + BlockContents contents; + contents.data = data_; + contents.cachable = false; + contents.heap_allocated = false; + block_ = new Block(contents); + return Status::OK(); + } + virtual Iterator* NewIterator() const { + return block_->NewIterator(comparator_); + } + + private: + const Comparator* comparator_; + std::string data_; + Block* block_; + + BlockConstructor(); +}; + +// A helper class that converts internal format keys into user keys +class KeyConvertingIterator: public Iterator { + public: + explicit KeyConvertingIterator(Iterator* iter) : iter_(iter) { } + virtual ~KeyConvertingIterator() 
{ delete iter_; } + virtual bool Valid() const { return iter_->Valid(); } + virtual void Seek(const Slice& target) { + ParsedInternalKey ikey(target, kMaxSequenceNumber, kTypeValue); + std::string encoded; + AppendInternalKey(&encoded, ikey); + iter_->Seek(encoded); + } + virtual void SeekToFirst() { iter_->SeekToFirst(); } + virtual void SeekToLast() { iter_->SeekToLast(); } + virtual void Next() { iter_->Next(); } + virtual void Prev() { iter_->Prev(); } + + virtual Slice key() const { + assert(Valid()); + ParsedInternalKey key; + if (!ParseInternalKey(iter_->key(), &key)) { + status_ = Status::Corruption("malformed internal key"); + return Slice("corrupted key"); + } + return key.user_key; + } + + virtual Slice value() const { return iter_->value(); } + virtual Status status() const { + return status_.ok() ? iter_->status() : status_; + } + + private: + mutable Status status_; + Iterator* iter_; + + // No copying allowed + KeyConvertingIterator(const KeyConvertingIterator&); + void operator=(const KeyConvertingIterator&); +}; + +class TableConstructor: public Constructor { + public: + explicit TableConstructor(const Comparator* cmp, + bool convert_to_internal_key = false) + : Constructor(cmp), + convert_to_internal_key_(convert_to_internal_key) {} + ~TableConstructor() { Reset(); } + + virtual Status FinishImpl(const Options& options, + const InternalKeyComparator& internal_comparator, + const KVMap& data) { + Reset(); + sink_.reset(new StringSink()); + unique_ptr builder; + builder.reset(options.table_factory->NewTableBuilder( + options, internal_comparator, sink_.get(), options.compression)); + + for (KVMap::const_iterator it = data.begin(); + it != data.end(); + ++it) { + if (convert_to_internal_key_) { + ParsedInternalKey ikey(it->first, kMaxSequenceNumber, kTypeValue); + std::string encoded; + AppendInternalKey(&encoded, ikey); + builder->Add(encoded, it->second); + } else { + builder->Add(it->first, it->second); + } + ASSERT_TRUE(builder->status().ok()); + } + Status s = builder->Finish(); + ASSERT_TRUE(s.ok()) << s.ToString(); + + ASSERT_EQ(sink_->contents().size(), builder->FileSize()); + + // Open the table + uniq_id_ = cur_uniq_id_++; + source_.reset(new StringSource(sink_->contents(), uniq_id_, + options.allow_mmap_reads)); + return options.table_factory->NewTableReader( + options, soptions, internal_comparator, std::move(source_), + sink_->contents().size(), &table_reader_); + } + + virtual Iterator* NewIterator() const { + ReadOptions ro; + Iterator* iter = table_reader_->NewIterator(ro); + if (convert_to_internal_key_) { + return new KeyConvertingIterator(iter); + } else { + return iter; + } + } + + uint64_t ApproximateOffsetOf(const Slice& key) const { + return table_reader_->ApproximateOffsetOf(key); + } + + virtual Status Reopen(const Options& options) { + source_.reset( + new StringSource(sink_->contents(), uniq_id_, + options.allow_mmap_reads)); + return options.table_factory->NewTableReader( + options, soptions, *last_internal_key_, std::move(source_), + sink_->contents().size(), &table_reader_); + } + + virtual TableReader* table_reader() { + return table_reader_.get(); + } + + private: + void Reset() { + uniq_id_ = 0; + table_reader_.reset(); + sink_.reset(); + source_.reset(); + } + bool convert_to_internal_key_; + + uint64_t uniq_id_; + unique_ptr sink_; + unique_ptr source_; + unique_ptr table_reader_; + + TableConstructor(); + + static uint64_t cur_uniq_id_; + const EnvOptions soptions; +}; +uint64_t TableConstructor::cur_uniq_id_ = 1; + +class 
MemTableConstructor: public Constructor { + public: + explicit MemTableConstructor(const Comparator* cmp) + : Constructor(cmp), + internal_comparator_(cmp), + table_factory_(new SkipListFactory) { + Options options; + options.memtable_factory = table_factory_; + memtable_ = new MemTable(internal_comparator_, options); + memtable_->Ref(); + } + ~MemTableConstructor() { + delete memtable_->Unref(); + } + virtual Status FinishImpl(const Options& options, + const InternalKeyComparator& internal_comparator, + const KVMap& data) { + delete memtable_->Unref(); + Options memtable_options; + memtable_options.memtable_factory = table_factory_; + memtable_ = new MemTable(internal_comparator_, memtable_options); + memtable_->Ref(); + int seq = 1; + for (KVMap::const_iterator it = data.begin(); + it != data.end(); + ++it) { + memtable_->Add(seq, kTypeValue, it->first, it->second); + seq++; + } + return Status::OK(); + } + virtual Iterator* NewIterator() const { + return new KeyConvertingIterator(memtable_->NewIterator(ReadOptions())); + } + + private: + InternalKeyComparator internal_comparator_; + MemTable* memtable_; + std::shared_ptr table_factory_; +}; + +class DBConstructor: public Constructor { + public: + explicit DBConstructor(const Comparator* cmp) + : Constructor(cmp), + comparator_(cmp) { + db_ = nullptr; + NewDB(); + } + ~DBConstructor() { + delete db_; + } + virtual Status FinishImpl(const Options& options, + const InternalKeyComparator& internal_comparator, + const KVMap& data) { + delete db_; + db_ = nullptr; + NewDB(); + for (KVMap::const_iterator it = data.begin(); + it != data.end(); + ++it) { + WriteBatch batch; + batch.Put(it->first, it->second); + ASSERT_TRUE(db_->Write(WriteOptions(), &batch).ok()); + } + return Status::OK(); + } + virtual Iterator* NewIterator() const { + return db_->NewIterator(ReadOptions()); + } + + virtual DB* db() const { return db_; } + + private: + void NewDB() { + std::string name = test::TmpDir() + "/table_testdb"; + + Options options; + options.comparator = comparator_; + Status status = DestroyDB(name, options); + ASSERT_TRUE(status.ok()) << status.ToString(); + + options.create_if_missing = true; + options.error_if_exists = true; + options.write_buffer_size = 10000; // Something small to force merging + status = DB::Open(options, name, &db_); + ASSERT_TRUE(status.ok()) << status.ToString(); + } + + const Comparator* comparator_; + DB* db_; +}; + +static bool SnappyCompressionSupported() { +#ifdef SNAPPY + std::string out; + Slice in = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; + return port::Snappy_Compress(Options().compression_opts, + in.data(), in.size(), + &out); +#else + return false; +#endif +} + +static bool ZlibCompressionSupported() { +#ifdef ZLIB + std::string out; + Slice in = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; + return port::Zlib_Compress(Options().compression_opts, + in.data(), in.size(), + &out); +#else + return false; +#endif +} + +static bool BZip2CompressionSupported() { +#ifdef BZIP2 + std::string out; + Slice in = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; + return port::BZip2_Compress(Options().compression_opts, + in.data(), in.size(), + &out); +#else + return false; +#endif +} + +static bool LZ4CompressionSupported() { +#ifdef LZ4 + std::string out; + Slice in = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; + return port::LZ4_Compress(Options().compression_opts, in.data(), in.size(), + &out); +#else + return false; +#endif +} + +static bool LZ4HCCompressionSupported() { +#ifdef LZ4 + std::string out; + Slice in = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; + return 
port::LZ4HC_Compress(Options().compression_opts, in.data(), in.size(), + &out); +#else + return false; +#endif +} + +enum TestType { + BLOCK_BASED_TABLE_TEST, + PLAIN_TABLE_SEMI_FIXED_PREFIX, + PLAIN_TABLE_FULL_STR_PREFIX, + PLAIN_TABLE_TOTAL_ORDER, + BLOCK_TEST, + MEMTABLE_TEST, + DB_TEST +}; + +struct TestArgs { + TestType type; + bool reverse_compare; + int restart_interval; + CompressionType compression; +}; + +static std::vector GenerateArgList() { + std::vector test_args; + std::vector test_types = { + BLOCK_BASED_TABLE_TEST, PLAIN_TABLE_SEMI_FIXED_PREFIX, + PLAIN_TABLE_FULL_STR_PREFIX, PLAIN_TABLE_TOTAL_ORDER, + BLOCK_TEST, MEMTABLE_TEST, + DB_TEST}; + std::vector reverse_compare_types = {false, true}; + std::vector restart_intervals = {16, 1, 1024}; + + // Only add compression if it is supported + std::vector compression_types; + compression_types.push_back(kNoCompression); + if (SnappyCompressionSupported()) { + compression_types.push_back(kSnappyCompression); + } + if (ZlibCompressionSupported()) { + compression_types.push_back(kZlibCompression); + } + if (BZip2CompressionSupported()) { + compression_types.push_back(kBZip2Compression); + } + if (LZ4CompressionSupported()) { + compression_types.push_back(kLZ4Compression); + } + if (LZ4HCCompressionSupported()) { + compression_types.push_back(kLZ4HCCompression); + } + + for (auto test_type : test_types) { + for (auto reverse_compare : reverse_compare_types) { + if (test_type == PLAIN_TABLE_SEMI_FIXED_PREFIX || + test_type == PLAIN_TABLE_FULL_STR_PREFIX) { + // Plain table doesn't use restart index or compression. + TestArgs one_arg; + one_arg.type = test_type; + one_arg.reverse_compare = reverse_compare; + one_arg.restart_interval = restart_intervals[0]; + one_arg.compression = compression_types[0]; + test_args.push_back(one_arg); + continue; + } + + for (auto restart_interval : restart_intervals) { + for (auto compression_type : compression_types) { + TestArgs one_arg; + one_arg.type = test_type; + one_arg.reverse_compare = reverse_compare; + one_arg.restart_interval = restart_interval; + one_arg.compression = compression_type; + test_args.push_back(one_arg); + } + } + } + } + return test_args; +} + +// In order to make all tests run for plain table format, including +// those operating on empty keys, create a new prefix transformer which +// return fixed prefix if the slice is not shorter than the prefix length, +// and the full slice if it is shorter. +class FixedOrLessPrefixTransform : public SliceTransform { + private: + const size_t prefix_len_; + + public: + explicit FixedOrLessPrefixTransform(size_t prefix_len) : + prefix_len_(prefix_len) { + } + + virtual const char* Name() const { + return "rocksdb.FixedPrefix"; + } + + virtual Slice Transform(const Slice& src) const { + assert(InDomain(src)); + if (src.size() < prefix_len_) { + return src; + } + return Slice(src.data(), prefix_len_); + } + + virtual bool InDomain(const Slice& src) const { + return true; + } + + virtual bool InRange(const Slice& dst) const { + return (dst.size() <= prefix_len_); + } +}; + +class Harness { + public: + Harness() : constructor_(nullptr) { } + + void Init(const TestArgs& args) { + delete constructor_; + constructor_ = nullptr; + options_ = Options(); + + options_.block_restart_interval = args.restart_interval; + options_.compression = args.compression; + // Use shorter block size for tests to exercise block boundary + // conditions more. 
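+    // (The Options default for block_size in this codebase is 4096 bytes;
+    // 256 keeps even the small test datasets spanning many blocks.)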
+ options_.block_size = 256; + if (args.reverse_compare) { + options_.comparator = &reverse_key_comparator; + } + + internal_comparator_.reset( + new test::PlainInternalKeyComparator(options_.comparator)); + + support_prev_ = true; + only_support_prefix_seek_ = false; + BlockBasedTableOptions table_options; + switch (args.type) { + case BLOCK_BASED_TABLE_TEST: + table_options.flush_block_policy_factory.reset( + new FlushBlockBySizePolicyFactory()); + options_.table_factory.reset(new BlockBasedTableFactory(table_options)); + constructor_ = new TableConstructor(options_.comparator); + break; + case PLAIN_TABLE_SEMI_FIXED_PREFIX: + support_prev_ = false; + only_support_prefix_seek_ = true; + options_.prefix_extractor.reset(new FixedOrLessPrefixTransform(2)); + options_.allow_mmap_reads = true; + options_.table_factory.reset(NewPlainTableFactory()); + constructor_ = new TableConstructor(options_.comparator, true); + internal_comparator_.reset( + new InternalKeyComparator(options_.comparator)); + break; + case PLAIN_TABLE_FULL_STR_PREFIX: + support_prev_ = false; + only_support_prefix_seek_ = true; + options_.prefix_extractor.reset(NewNoopTransform()); + options_.allow_mmap_reads = true; + options_.table_factory.reset(NewPlainTableFactory()); + constructor_ = new TableConstructor(options_.comparator, true); + internal_comparator_.reset( + new InternalKeyComparator(options_.comparator)); + break; + case PLAIN_TABLE_TOTAL_ORDER: + support_prev_ = false; + only_support_prefix_seek_ = false; + options_.prefix_extractor = nullptr; + options_.allow_mmap_reads = true; + options_.table_factory.reset(NewTotalOrderPlainTableFactory()); + constructor_ = new TableConstructor(options_.comparator, true); + internal_comparator_.reset( + new InternalKeyComparator(options_.comparator)); + break; + case BLOCK_TEST: + constructor_ = new BlockConstructor(options_.comparator); + break; + case MEMTABLE_TEST: + constructor_ = new MemTableConstructor(options_.comparator); + break; + case DB_TEST: + constructor_ = new DBConstructor(options_.comparator); + break; + } + } + + ~Harness() { + delete constructor_; + } + + void Add(const std::string& key, const std::string& value) { + constructor_->Add(key, value); + } + + void Test(Random* rnd) { + std::vector keys; + KVMap data; + constructor_->Finish(options_, *internal_comparator_, &keys, &data); + + TestForwardScan(keys, data); + if (support_prev_) { + TestBackwardScan(keys, data); + } + TestRandomAccess(rnd, keys, data); + } + + void TestForwardScan(const std::vector& keys, + const KVMap& data) { + Iterator* iter = constructor_->NewIterator(); + ASSERT_TRUE(!iter->Valid()); + iter->SeekToFirst(); + for (KVMap::const_iterator model_iter = data.begin(); + model_iter != data.end(); + ++model_iter) { + ASSERT_EQ(ToString(data, model_iter), ToString(iter)); + iter->Next(); + } + ASSERT_TRUE(!iter->Valid()); + delete iter; + } + + void TestBackwardScan(const std::vector& keys, + const KVMap& data) { + Iterator* iter = constructor_->NewIterator(); + ASSERT_TRUE(!iter->Valid()); + iter->SeekToLast(); + for (KVMap::const_reverse_iterator model_iter = data.rbegin(); + model_iter != data.rend(); + ++model_iter) { + ASSERT_EQ(ToString(data, model_iter), ToString(iter)); + iter->Prev(); + } + ASSERT_TRUE(!iter->Valid()); + delete iter; + } + + void TestRandomAccess(Random* rnd, + const std::vector& keys, + const KVMap& data) { + static const bool kVerbose = false; + Iterator* iter = constructor_->NewIterator(); + ASSERT_TRUE(!iter->Valid()); + KVMap::const_iterator model_iter = 
data.begin(); + if (kVerbose) fprintf(stderr, "---\n"); + for (int i = 0; i < 200; i++) { + const int toss = rnd->Uniform(support_prev_ ? 5 : 3); + switch (toss) { + case 0: { + if (iter->Valid()) { + if (kVerbose) fprintf(stderr, "Next\n"); + iter->Next(); + ++model_iter; + ASSERT_EQ(ToString(data, model_iter), ToString(iter)); + } + break; + } + + case 1: { + if (kVerbose) fprintf(stderr, "SeekToFirst\n"); + iter->SeekToFirst(); + model_iter = data.begin(); + ASSERT_EQ(ToString(data, model_iter), ToString(iter)); + break; + } + + case 2: { + std::string key = PickRandomKey(rnd, keys); + model_iter = data.lower_bound(key); + if (kVerbose) fprintf(stderr, "Seek '%s'\n", + EscapeString(key).c_str()); + iter->Seek(Slice(key)); + ASSERT_EQ(ToString(data, model_iter), ToString(iter)); + break; + } + + case 3: { + if (iter->Valid()) { + if (kVerbose) fprintf(stderr, "Prev\n"); + iter->Prev(); + if (model_iter == data.begin()) { + model_iter = data.end(); // Wrap around to invalid value + } else { + --model_iter; + } + ASSERT_EQ(ToString(data, model_iter), ToString(iter)); + } + break; + } + + case 4: { + if (kVerbose) fprintf(stderr, "SeekToLast\n"); + iter->SeekToLast(); + if (keys.empty()) { + model_iter = data.end(); + } else { + std::string last = data.rbegin()->first; + model_iter = data.lower_bound(last); + } + ASSERT_EQ(ToString(data, model_iter), ToString(iter)); + break; + } + } + } + delete iter; + } + + std::string ToString(const KVMap& data, const KVMap::const_iterator& it) { + if (it == data.end()) { + return "END"; + } else { + return "'" + it->first + "->" + it->second + "'"; + } + } + + std::string ToString(const KVMap& data, + const KVMap::const_reverse_iterator& it) { + if (it == data.rend()) { + return "END"; + } else { + return "'" + it->first + "->" + it->second + "'"; + } + } + + std::string ToString(const Iterator* it) { + if (!it->Valid()) { + return "END"; + } else { + return "'" + it->key().ToString() + "->" + it->value().ToString() + "'"; + } + } + + std::string PickRandomKey(Random* rnd, const std::vector& keys) { + if (keys.empty()) { + return "foo"; + } else { + const int index = rnd->Uniform(keys.size()); + std::string result = keys[index]; + switch (rnd->Uniform(support_prev_ ? 
3 : 1)) { + case 0: + // Return an existing key + break; + case 1: { + // Attempt to return something smaller than an existing key + if (result.size() > 0 && result[result.size() - 1] > '\0' + && (!only_support_prefix_seek_ + || options_.prefix_extractor->Transform(result).size() + < result.size())) { + result[result.size() - 1]--; + } + break; + } + case 2: { + // Return something larger than an existing key + Increment(options_.comparator, &result); + break; + } + } + return result; + } + } + + // Returns nullptr if not running against a DB + DB* db() const { return constructor_->db(); } + + private: + Options options_ = Options(); + Constructor* constructor_; + bool support_prev_; + bool only_support_prefix_seek_; + shared_ptr internal_comparator_; +}; + +static bool Between(uint64_t val, uint64_t low, uint64_t high) { + bool result = (val >= low) && (val <= high); + if (!result) { + fprintf(stderr, "Value %llu is not in range [%llu, %llu]\n", + (unsigned long long)(val), + (unsigned long long)(low), + (unsigned long long)(high)); + } + return result; +} + +// Tests against all kinds of tables +class TableTest { + public: + const InternalKeyComparator& GetPlainInternalComparator( + const Comparator* comp) { + if (!plain_internal_comparator) { + plain_internal_comparator.reset( + new test::PlainInternalKeyComparator(comp)); + } + return *plain_internal_comparator; + } + + private: + std::unique_ptr plain_internal_comparator; +}; + +class GeneralTableTest : public TableTest {}; +class BlockBasedTableTest : public TableTest {}; +class PlainTableTest : public TableTest {}; +class TablePropertyTest {}; + +// This test serves as the living tutorial for the prefix scan of user collected +// properties. +TEST(TablePropertyTest, PrefixScanTest) { + UserCollectedProperties props{{"num.111.1", "1"}, + {"num.111.2", "2"}, + {"num.111.3", "3"}, + {"num.333.1", "1"}, + {"num.333.2", "2"}, + {"num.333.3", "3"}, + {"num.555.1", "1"}, + {"num.555.2", "2"}, + {"num.555.3", "3"}, }; + + // prefixes that exist + for (const std::string& prefix : {"num.111", "num.333", "num.555"}) { + int num = 0; + for (auto pos = props.lower_bound(prefix); + pos != props.end() && + pos->first.compare(0, prefix.size(), prefix) == 0; + ++pos) { + ++num; + auto key = prefix + "." + std::to_string(num); + ASSERT_EQ(key, pos->first); + ASSERT_EQ(std::to_string(num), pos->second); + } + ASSERT_EQ(3, num); + } + + // prefixes that don't exist + for (const std::string& prefix : + {"num.000", "num.222", "num.444", "num.666"}) { + auto pos = props.lower_bound(prefix); + ASSERT_TRUE(pos == props.end() || + pos->first.compare(0, prefix.size(), prefix) != 0); + } +} + +// This test include all the basic checks except those for index size and block +// size, which will be conducted in separated unit tests. 
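+// (Each key below is two bytes ("a1" ... "j9") and each value four bytes
+// ("val1" ... "val9"), which is what the raw_key_size / raw_value_size
+// arithmetic inside the test relies on.)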
+TEST(BlockBasedTableTest, BasicBlockBasedTableProperties) { + TableConstructor c(BytewiseComparator()); + + c.Add("a1", "val1"); + c.Add("b2", "val2"); + c.Add("c3", "val3"); + c.Add("d4", "val4"); + c.Add("e5", "val5"); + c.Add("f6", "val6"); + c.Add("g7", "val7"); + c.Add("h8", "val8"); + c.Add("j9", "val9"); + + std::vector keys; + KVMap kvmap; + Options options; + options.compression = kNoCompression; + options.block_restart_interval = 1; + + c.Finish(options, GetPlainInternalComparator(options.comparator), &keys, + &kvmap); + + auto& props = *c.table_reader()->GetTableProperties(); + ASSERT_EQ(kvmap.size(), props.num_entries); + + auto raw_key_size = kvmap.size() * 2ul; + auto raw_value_size = kvmap.size() * 4ul; + + ASSERT_EQ(raw_key_size, props.raw_key_size); + ASSERT_EQ(raw_value_size, props.raw_value_size); + ASSERT_EQ(1ul, props.num_data_blocks); + ASSERT_EQ("", props.filter_policy_name); // no filter policy is used + + // Verify data size. + BlockBuilder block_builder(options, options.comparator); + for (const auto& item : kvmap) { + block_builder.Add(item.first, item.second); + } + Slice content = block_builder.Finish(); + ASSERT_EQ(content.size() + kBlockTrailerSize, props.data_size); +} + +TEST(BlockBasedTableTest, FilterPolicyNameProperties) { + TableConstructor c(BytewiseComparator()); + c.Add("a1", "val1"); + std::vector keys; + KVMap kvmap; + Options options; + std::unique_ptr filter_policy(NewBloomFilterPolicy(10)); + options.filter_policy = filter_policy.get(); + + c.Finish(options, GetPlainInternalComparator(options.comparator), &keys, + &kvmap); + auto& props = *c.table_reader()->GetTableProperties(); + ASSERT_EQ("rocksdb.BuiltinBloomFilter", props.filter_policy_name); +} + +static std::string RandomString(Random* rnd, int len) { + std::string r; + test::RandomString(rnd, len, &r); + return r; +} + +void AddInternalKey(TableConstructor* c, const std::string prefix, + int suffix_len = 800) { + static Random rnd(1023); + InternalKey k(prefix + RandomString(&rnd, 800), 0, kTypeValue); + c->Add(k.Encode().ToString(), "v"); +} + +TEST(TableTest, HashIndexTest) { + TableConstructor c(BytewiseComparator()); + + // keys with prefix length 3, make sure the key/value is big enough to fill + // one block + AddInternalKey(&c, "0015"); + AddInternalKey(&c, "0035"); + + AddInternalKey(&c, "0054"); + AddInternalKey(&c, "0055"); + + AddInternalKey(&c, "0056"); + AddInternalKey(&c, "0057"); + + AddInternalKey(&c, "0058"); + AddInternalKey(&c, "0075"); + + AddInternalKey(&c, "0076"); + AddInternalKey(&c, "0095"); + + std::vector keys; + KVMap kvmap; + Options options; + BlockBasedTableOptions table_options; + table_options.index_type = BlockBasedTableOptions::kHashSearch; + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + + options.prefix_extractor.reset(NewFixedPrefixTransform(3)); + options.block_cache = NewLRUCache(1024); + options.block_size = 1700; + + std::unique_ptr comparator( + new InternalKeyComparator(BytewiseComparator())); + c.Finish(options, *comparator, &keys, &kvmap); + auto reader = c.table_reader(); + + auto props = c.table_reader()->GetTableProperties(); + ASSERT_EQ(5u, props->num_data_blocks); + + std::unique_ptr hash_iter(reader->NewIterator(ReadOptions())); + + // -- Find keys do not exist, but have common prefix. 
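+  // (With index_type = kHashSearch, a Seek() hashes the target's prefix to
+  // locate its bucket, so seeking to a missing key that shares a prefix with
+  // stored keys still lands on the lowest key of that prefix.)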
+  std::vector<std::string> prefixes = {"001", "003", "005", "007", "009"};
+  std::vector<std::string> lower_bound = {keys[0], keys[1], keys[2],
+                                          keys[7], keys[9], };
+
+  // find the lower bound of the prefix
+  for (size_t i = 0; i < prefixes.size(); ++i) {
+    hash_iter->Seek(InternalKey(prefixes[i], 0, kTypeValue).Encode());
+    ASSERT_OK(hash_iter->status());
+    ASSERT_TRUE(hash_iter->Valid());
+
+    // seek to the first element in the block
+    ASSERT_EQ(lower_bound[i], hash_iter->key().ToString());
+    ASSERT_EQ("v", hash_iter->value().ToString());
+  }
+
+  // find the upper bound of prefixes
+  std::vector<std::string> upper_bound = {keys[1], keys[2], keys[7],
+                                          keys[9], };
+
+  // find existing keys
+  for (const auto& item : kvmap) {
+    auto ukey = ExtractUserKey(item.first).ToString();
+    hash_iter->Seek(ukey);
+
+    // ASSERT_OK(regular_iter->status());
+    ASSERT_OK(hash_iter->status());
+
+    // ASSERT_TRUE(regular_iter->Valid());
+    ASSERT_TRUE(hash_iter->Valid());
+
+    ASSERT_EQ(item.first, hash_iter->key().ToString());
+    ASSERT_EQ(item.second, hash_iter->value().ToString());
+  }
+
+  for (size_t i = 0; i < prefixes.size(); ++i) {
+    // the key is greater than any existing key with the same prefix.
+    auto key = prefixes[i] + "9";
+    hash_iter->Seek(InternalKey(key, 0, kTypeValue).Encode());
+
+    ASSERT_OK(hash_iter->status());
+    if (i == prefixes.size() - 1) {
+      // last key
+      ASSERT_TRUE(!hash_iter->Valid());
+    } else {
+      ASSERT_TRUE(hash_iter->Valid());
+      // seek to the first element in the next block
+      ASSERT_EQ(upper_bound[i], hash_iter->key().ToString());
+      ASSERT_EQ("v", hash_iter->value().ToString());
+    }
+  }
+
+  // find keys whose prefix doesn't match any of the existing prefixes.
+  std::vector<std::string> non_exist_prefixes = {"002", "004", "006", "008"};
+  for (const auto& prefix : non_exist_prefixes) {
+    hash_iter->Seek(InternalKey(prefix, 0, kTypeValue).Encode());
+    // regular_iter->Seek(prefix);
+
+    ASSERT_OK(hash_iter->status());
+    ASSERT_TRUE(!hash_iter->Valid());
+  }
+}
+
+// It's very hard to compute the size of a table's index block accurately.
+// Instead of checking an exact size, we just make sure that as the number of
+// keys grows, the index block size also grows.
+TEST(BlockBasedTableTest, IndexSizeStat) {
+  uint64_t last_index_size = 0;
+
+  // we need to use random keys, since purely human-readable text
+  // may compress well, resulting in an insignificant change of the
+  // index block size.
+  Random rnd(test::RandomSeed());
+  std::vector<std::string> keys;
+
+  for (int i = 0; i < 100; ++i) {
+    keys.push_back(RandomString(&rnd, 10000));
+  }
+
+  // Each time we load one more key into the table, the table's index block
+  // size is expected to be larger than last time's.
+  for (size_t i = 1; i < keys.size(); ++i) {
+    TableConstructor c(BytewiseComparator());
+    for (size_t j = 0; j < i; ++j) {
+      c.Add(keys[j], "val");
+    }
+
+    std::vector<std::string> ks;
+    KVMap kvmap;
+    Options options;
+    options.compression = kNoCompression;
+    options.block_restart_interval = 1;
+
+    c.Finish(options, GetPlainInternalComparator(options.comparator), &ks,
+             &kvmap);
+    auto index_size = c.table_reader()->GetTableProperties()->index_size;
+    ASSERT_GT(index_size, last_index_size);
+    last_index_size = index_size;
+  }
+}
+
+TEST(BlockBasedTableTest, NumBlockStat) {
+  Random rnd(test::RandomSeed());
+  TableConstructor c(BytewiseComparator());
+  Options options;
+  options.compression = kNoCompression;
+  options.block_restart_interval = 1;
+  options.block_size = 1000;
+
+  for (int i = 0; i < 10; ++i) {
+    // the key/value pairs are slightly smaller than the block size, so that
+    // each block holds roughly one key/value pair.
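+    // (A 900-byte key plus its value crosses the 1000-byte block_size target
+    // on every Add, so the builder cuts roughly one block per entry.)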
+ c.Add(RandomString(&rnd, 900), "val"); + } + + std::vector ks; + KVMap kvmap; + c.Finish(options, GetPlainInternalComparator(options.comparator), &ks, + &kvmap); + ASSERT_EQ(kvmap.size(), + c.table_reader()->GetTableProperties()->num_data_blocks); +} + +// A simple tool that takes the snapshot of block cache statistics. +class BlockCachePropertiesSnapshot { + public: + explicit BlockCachePropertiesSnapshot(Statistics* statistics) { + block_cache_miss = statistics->getTickerCount(BLOCK_CACHE_MISS); + block_cache_hit = statistics->getTickerCount(BLOCK_CACHE_HIT); + index_block_cache_miss = statistics->getTickerCount(BLOCK_CACHE_INDEX_MISS); + index_block_cache_hit = statistics->getTickerCount(BLOCK_CACHE_INDEX_HIT); + data_block_cache_miss = statistics->getTickerCount(BLOCK_CACHE_DATA_MISS); + data_block_cache_hit = statistics->getTickerCount(BLOCK_CACHE_DATA_HIT); + filter_block_cache_miss = + statistics->getTickerCount(BLOCK_CACHE_FILTER_MISS); + filter_block_cache_hit = statistics->getTickerCount(BLOCK_CACHE_FILTER_HIT); + } + + void AssertIndexBlockStat(int64_t index_block_cache_miss, + int64_t index_block_cache_hit) { + ASSERT_EQ(index_block_cache_miss, this->index_block_cache_miss); + ASSERT_EQ(index_block_cache_hit, this->index_block_cache_hit); + } + + void AssertFilterBlockStat(int64_t filter_block_cache_miss, + int64_t filter_block_cache_hit) { + ASSERT_EQ(filter_block_cache_miss, this->filter_block_cache_miss); + ASSERT_EQ(filter_block_cache_hit, this->filter_block_cache_hit); + } + + // Check if the fetched props matches the expected ones. + // TODO(kailiu) Use this only when you disabled filter policy! + void AssertEqual(int64_t index_block_cache_miss, + int64_t index_block_cache_hit, int64_t data_block_cache_miss, + int64_t data_block_cache_hit) const { + ASSERT_EQ(index_block_cache_miss, this->index_block_cache_miss); + ASSERT_EQ(index_block_cache_hit, this->index_block_cache_hit); + ASSERT_EQ(data_block_cache_miss, this->data_block_cache_miss); + ASSERT_EQ(data_block_cache_hit, this->data_block_cache_hit); + ASSERT_EQ(index_block_cache_miss + data_block_cache_miss, + this->block_cache_miss); + ASSERT_EQ(index_block_cache_hit + data_block_cache_hit, + this->block_cache_hit); + } + + private: + int64_t block_cache_miss = 0; + int64_t block_cache_hit = 0; + int64_t index_block_cache_miss = 0; + int64_t index_block_cache_hit = 0; + int64_t data_block_cache_miss = 0; + int64_t data_block_cache_hit = 0; + int64_t filter_block_cache_miss = 0; + int64_t filter_block_cache_hit = 0; +}; + +// Make sure, by default, index/filter blocks were pre-loaded (meaning we won't +// use block cache to store them). +TEST(BlockBasedTableTest, BlockCacheDisabledTest) { + Options options; + options.create_if_missing = true; + options.statistics = CreateDBStatistics(); + options.block_cache = NewLRUCache(1024); + std::unique_ptr filter_policy(NewBloomFilterPolicy(10)); + options.filter_policy = filter_policy.get(); + BlockBasedTableOptions table_options; + // Intentionally commented out: table_options.cache_index_and_filter_blocks = + // true; + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + std::vector keys; + KVMap kvmap; + + TableConstructor c(BytewiseComparator()); + c.Add("key", "value"); + c.Finish(options, GetPlainInternalComparator(options.comparator), &keys, + &kvmap); + + // preloading filter/index blocks is enabled. 
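+  // (TEST_filter_block_preloaded() / TEST_index_reader_preloaded() are
+  // test-only hooks on BlockBasedTable that report whether those readers
+  // were loaded eagerly at open time instead of through the block cache.)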
+  auto reader = dynamic_cast<BlockBasedTable*>(c.table_reader());
+  ASSERT_TRUE(reader->TEST_filter_block_preloaded());
+  ASSERT_TRUE(reader->TEST_index_reader_preloaded());
+
+  {
+    // nothing happens in the beginning
+    BlockCachePropertiesSnapshot props(options.statistics.get());
+    props.AssertIndexBlockStat(0, 0);
+    props.AssertFilterBlockStat(0, 0);
+  }
+
+  {
+    // a hack just to trigger BlockBasedTable::GetFilter.
+    reader->Get(ReadOptions(), "non-exist-key", nullptr, nullptr, nullptr);
+    BlockCachePropertiesSnapshot props(options.statistics.get());
+    props.AssertIndexBlockStat(0, 0);
+    props.AssertFilterBlockStat(0, 0);
+  }
+}
+
+// Due to the difficulties of the interaction between statistics, this test
+// only covers the case when the index block is put in the block cache.
+TEST(BlockBasedTableTest, FilterBlockInBlockCache) {
+  // -- Table construction
+  Options options;
+  options.create_if_missing = true;
+  options.statistics = CreateDBStatistics();
+  options.block_cache = NewLRUCache(1024);
+
+  // Enable the cache for index/filter blocks
+  BlockBasedTableOptions table_options;
+  table_options.cache_index_and_filter_blocks = true;
+  options.table_factory.reset(new BlockBasedTableFactory(table_options));
+  std::vector<std::string> keys;
+  KVMap kvmap;
+
+  TableConstructor c(BytewiseComparator());
+  c.Add("key", "value");
+  c.Finish(options, GetPlainInternalComparator(options.comparator), &keys,
+           &kvmap);
+  // preloading filter/index blocks is prohibited.
+  auto reader = dynamic_cast<BlockBasedTable*>(c.table_reader());
+  ASSERT_TRUE(!reader->TEST_filter_block_preloaded());
+  ASSERT_TRUE(!reader->TEST_index_reader_preloaded());
+
+  // -- PART 1: Open with regular block cache.
+  // Since the block cache is enabled and cache_index_and_filter_blocks is
+  // set, index and data block reads all go through the cache.
+  unique_ptr<Iterator> iter;
+
+  // At first, only the table open has touched any block.
+  {
+    BlockCachePropertiesSnapshot props(options.statistics.get());
+    // index will be added to block cache.
+    props.AssertEqual(1,  // index block miss
+                      0, 0, 0);
+  }
+
+  // Only index block will be accessed
+  {
+    iter.reset(c.NewIterator());
+    BlockCachePropertiesSnapshot props(options.statistics.get());
+    // NOTE: to better highlight the delta of each ticker, a changed value is
+    // written as <old> + <increment>; other numbers remain the same.
+    props.AssertEqual(1, 0 + 1,  // index block hit
+                      0, 0);
+  }
+
+  // Only data block will be accessed
+  {
+    iter->SeekToFirst();
+    BlockCachePropertiesSnapshot props(options.statistics.get());
+    props.AssertEqual(1, 1, 0 + 1,  // data block miss
+                      0);
+  }
+
+  // Data block will be in cache
+  {
+    iter.reset(c.NewIterator());
+    iter->SeekToFirst();
+    BlockCachePropertiesSnapshot props(options.statistics.get());
+    props.AssertEqual(1, 1 + 1, /* index block hit */
+                      1, 0 + 1 /* data block hit */);
+  }
+  // release the iterator so that the block cache can reset correctly.
+  iter.reset();
+
+  // -- PART 2: Open without block cache
+  options.block_cache.reset();
+  options.statistics = CreateDBStatistics();  // reset the stats
+  c.Reopen(options);
+
+  {
+    iter.reset(c.NewIterator());
+    iter->SeekToFirst();
+    ASSERT_EQ("key", iter->key().ToString());
+    BlockCachePropertiesSnapshot props(options.statistics.get());
+    // Nothing is affected at all
+    props.AssertEqual(0, 0, 0, 0);
+  }
+
+  // -- PART 3: Open with very small block cache
+  // In this test, no block will ever get hit since the block cache is
+  // too small to fit even one entry.
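+  // (NewLRUCache(1) caps the cache at one byte of charged capacity, so every
+  // block insertion immediately evicts anything cached before it.)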
+ options.block_cache = NewLRUCache(1); + c.Reopen(options); + { + BlockCachePropertiesSnapshot props(options.statistics.get()); + props.AssertEqual(1, // index block miss + 0, 0, 0); + } + + + { + // Both index and data block get accessed. + // It first cache index block then data block. But since the cache size + // is only 1, index block will be purged after data block is inserted. + iter.reset(c.NewIterator()); + BlockCachePropertiesSnapshot props(options.statistics.get()); + props.AssertEqual(1 + 1, // index block miss + 0, 0, // data block miss + 0); + } + + { + // SeekToFirst() accesses data block. With similar reason, we expect data + // block's cache miss. + iter->SeekToFirst(); + BlockCachePropertiesSnapshot props(options.statistics.get()); + props.AssertEqual(2, 0, 0 + 1, // data block miss + 0); + } +} + +TEST(BlockBasedTableTest, BlockCacheLeak) { + // Check that when we reopen a table we don't lose access to blocks already + // in the cache. This test checks whether the Table actually makes use of the + // unique ID from the file. + + Options opt; + unique_ptr ikc; + ikc.reset(new test::PlainInternalKeyComparator(opt.comparator)); + opt.block_size = 1024; + opt.compression = kNoCompression; + opt.block_cache = + NewLRUCache(16 * 1024 * 1024); // big enough so we don't ever + // lose cached values. + + TableConstructor c(BytewiseComparator()); + c.Add("k01", "hello"); + c.Add("k02", "hello2"); + c.Add("k03", std::string(10000, 'x')); + c.Add("k04", std::string(200000, 'x')); + c.Add("k05", std::string(300000, 'x')); + c.Add("k06", "hello3"); + c.Add("k07", std::string(100000, 'x')); + std::vector keys; + KVMap kvmap; + c.Finish(opt, *ikc, &keys, &kvmap); + + unique_ptr iter(c.NewIterator()); + iter->SeekToFirst(); + while (iter->Valid()) { + iter->key(); + iter->value(); + iter->Next(); + } + ASSERT_OK(iter->status()); + + ASSERT_OK(c.Reopen(opt)); + auto table_reader = dynamic_cast(c.table_reader()); + for (const std::string& key : keys) { + ASSERT_TRUE(table_reader->TEST_KeyInCache(ReadOptions(), key)); + } +} + +TEST(PlainTableTest, BasicPlainTableProperties) { + PlainTableFactory factory(8, 8, 0); + StringSink sink; + Options options; + InternalKeyComparator ikc(options.comparator); + std::unique_ptr builder( + factory.NewTableBuilder(options, ikc, &sink, kNoCompression)); + + for (char c = 'a'; c <= 'z'; ++c) { + std::string key(8, c); + key.append("\1 "); // PlainTable expects internal key structure + std::string value(28, c + 42); + builder->Add(key, value); + } + ASSERT_OK(builder->Finish()); + + StringSource source(sink.contents(), 72242, true); + + TableProperties* props = nullptr; + auto s = ReadTableProperties(&source, sink.contents().size(), + kPlainTableMagicNumber, Env::Default(), nullptr, + &props); + std::unique_ptr props_guard(props); + ASSERT_OK(s); + + ASSERT_EQ(0ul, props->index_size); + ASSERT_EQ(0ul, props->filter_size); + ASSERT_EQ(16ul * 26, props->raw_key_size); + ASSERT_EQ(28ul * 26, props->raw_value_size); + ASSERT_EQ(26ul, props->num_entries); + ASSERT_EQ(1ul, props->num_data_blocks); +} + +TEST(GeneralTableTest, ApproximateOffsetOfPlain) { + TableConstructor c(BytewiseComparator()); + c.Add("k01", "hello"); + c.Add("k02", "hello2"); + c.Add("k03", std::string(10000, 'x')); + c.Add("k04", std::string(200000, 'x')); + c.Add("k05", std::string(300000, 'x')); + c.Add("k06", "hello3"); + c.Add("k07", std::string(100000, 'x')); + std::vector keys; + KVMap kvmap; + Options options; + test::PlainInternalKeyComparator 
internal_comparator(options.comparator); + options.block_size = 1024; + options.compression = kNoCompression; + c.Finish(options, internal_comparator, &keys, &kvmap); + + ASSERT_TRUE(Between(c.ApproximateOffsetOf("abc"), 0, 0)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01"), 0, 0)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01a"), 0, 0)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k02"), 0, 0)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k03"), 0, 0)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04"), 10000, 11000)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04a"), 210000, 211000)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k05"), 210000, 211000)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k06"), 510000, 511000)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k07"), 510000, 511000)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"), 610000, 612000)); +} + +static void DoCompressionTest(CompressionType comp) { + Random rnd(301); + TableConstructor c(BytewiseComparator()); + std::string tmp; + c.Add("k01", "hello"); + c.Add("k02", test::CompressibleString(&rnd, 0.25, 10000, &tmp)); + c.Add("k03", "hello3"); + c.Add("k04", test::CompressibleString(&rnd, 0.25, 10000, &tmp)); + std::vector keys; + KVMap kvmap; + Options options; + test::PlainInternalKeyComparator ikc(options.comparator); + options.block_size = 1024; + options.compression = comp; + c.Finish(options, ikc, &keys, &kvmap); + + ASSERT_TRUE(Between(c.ApproximateOffsetOf("abc"), 0, 0)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01"), 0, 0)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k02"), 0, 0)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k03"), 2000, 3000)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04"), 2000, 3000)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"), 4000, 6100)); +} + +TEST(GeneralTableTest, ApproximateOffsetOfCompressed) { + std::vector compression_state; + if (!SnappyCompressionSupported()) { + fprintf(stderr, "skipping snappy compression tests\n"); + } else { + compression_state.push_back(kSnappyCompression); + } + + if (!ZlibCompressionSupported()) { + fprintf(stderr, "skipping zlib compression tests\n"); + } else { + compression_state.push_back(kZlibCompression); + } + + // TODO(kailiu) DoCompressionTest() doesn't work with BZip2. + /* + if (!BZip2CompressionSupported()) { + fprintf(stderr, "skipping bzip2 compression tests\n"); + } else { + compression_state.push_back(kBZip2Compression); + } + */ + + if (!LZ4CompressionSupported()) { + fprintf(stderr, "skipping lz4 compression tests\n"); + } else { + compression_state.push_back(kLZ4Compression); + } + + if (!LZ4HCCompressionSupported()) { + fprintf(stderr, "skipping lz4hc compression tests\n"); + } else { + compression_state.push_back(kLZ4HCCompression); + } + + for (auto state : compression_state) { + DoCompressionTest(state); + } +} + +TEST(Harness, Randomized) { + std::vector args = GenerateArgList(); + for (unsigned int i = 0; i < args.size(); i++) { + Init(args[i]); + Random rnd(test::RandomSeed() + 5); + for (int num_entries = 0; num_entries < 2000; + num_entries += (num_entries < 50 ? 
1 : 200)) { + if ((num_entries % 10) == 0) { + fprintf(stderr, "case %d of %d: num_entries = %d\n", (i + 1), + static_cast(args.size()), num_entries); + } + for (int e = 0; e < num_entries; e++) { + std::string v; + Add(test::RandomKey(&rnd, rnd.Skewed(4)), + test::RandomString(&rnd, rnd.Skewed(5), &v).ToString()); + } + Test(&rnd); + } + } +} + +TEST(Harness, RandomizedLongDB) { + Random rnd(test::RandomSeed()); + TestArgs args = { DB_TEST, false, 16, kNoCompression }; + Init(args); + int num_entries = 100000; + for (int e = 0; e < num_entries; e++) { + std::string v; + Add(test::RandomKey(&rnd, rnd.Skewed(4)), + test::RandomString(&rnd, rnd.Skewed(5), &v).ToString()); + } + Test(&rnd); + + // We must have created enough data to force merging + int files = 0; + for (int level = 0; level < db()->NumberLevels(); level++) { + std::string value; + char name[100]; + snprintf(name, sizeof(name), "rocksdb.num-files-at-level%d", level); + ASSERT_TRUE(db()->GetProperty(name, &value)); + files += atoi(value.c_str()); + } + ASSERT_GT(files, 0); +} + +class MemTableTest { }; + +TEST(MemTableTest, Simple) { + InternalKeyComparator cmp(BytewiseComparator()); + auto table_factory = std::make_shared(); + Options options; + options.memtable_factory = table_factory; + MemTable* memtable = new MemTable(cmp, options); + memtable->Ref(); + WriteBatch batch; + WriteBatchInternal::SetSequence(&batch, 100); + batch.Put(std::string("k1"), std::string("v1")); + batch.Put(std::string("k2"), std::string("v2")); + batch.Put(std::string("k3"), std::string("v3")); + batch.Put(std::string("largekey"), std::string("vlarge")); + ColumnFamilyMemTablesDefault cf_mems_default(memtable, &options); + ASSERT_TRUE(WriteBatchInternal::InsertInto(&batch, &cf_mems_default).ok()); + + Iterator* iter = memtable->NewIterator(ReadOptions()); + iter->SeekToFirst(); + while (iter->Valid()) { + fprintf(stderr, "key: '%s' -> '%s'\n", + iter->key().ToString().c_str(), + iter->value().ToString().c_str()); + iter->Next(); + } + + delete iter; + delete memtable->Unref(); +} + +// Test the empty key +TEST(Harness, SimpleEmptyKey) { + auto args = GenerateArgList(); + for (const auto& arg : args) { + Init(arg); + Random rnd(test::RandomSeed() + 1); + Add("", "v"); + Test(&rnd); + } +} + +TEST(Harness, SimpleSingle) { + auto args = GenerateArgList(); + for (const auto& arg : args) { + Init(arg); + Random rnd(test::RandomSeed() + 2); + Add("abc", "v"); + Test(&rnd); + } +} + +TEST(Harness, SimpleMulti) { + auto args = GenerateArgList(); + for (const auto& arg : args) { + Init(arg); + Random rnd(test::RandomSeed() + 3); + Add("abc", "v"); + Add("abcd", "v"); + Add("ac", "v2"); + Test(&rnd); + } +} + +TEST(Harness, SimpleSpecialKey) { + auto args = GenerateArgList(); + for (const auto& arg : args) { + Init(arg); + Random rnd(test::RandomSeed() + 4); + Add("\xff\xff", "v3"); + Test(&rnd); + } +} + +TEST(Harness, FooterTests) { + { + // upconvert legacy block based + std::string encoded; + Footer footer(kLegacyBlockBasedTableMagicNumber); + BlockHandle meta_index(10, 5), index(20, 15); + footer.set_metaindex_handle(meta_index); + footer.set_index_handle(index); + footer.EncodeTo(&encoded); + Footer decoded_footer; + Slice encoded_slice(encoded); + decoded_footer.DecodeFrom(&encoded_slice); + ASSERT_EQ(decoded_footer.table_magic_number(), kBlockBasedTableMagicNumber); + ASSERT_EQ(decoded_footer.checksum(), kCRC32c); + ASSERT_EQ(decoded_footer.metaindex_handle().offset(), meta_index.offset()); + ASSERT_EQ(decoded_footer.metaindex_handle().size(), 
meta_index.size());
+    ASSERT_EQ(decoded_footer.index_handle().offset(), index.offset());
+    ASSERT_EQ(decoded_footer.index_handle().size(), index.size());
+  }
+  {
+    // xxhash block based
+    std::string encoded;
+    Footer footer(kBlockBasedTableMagicNumber);
+    BlockHandle meta_index(10, 5), index(20, 15);
+    footer.set_metaindex_handle(meta_index);
+    footer.set_index_handle(index);
+    footer.set_checksum(kxxHash);
+    footer.EncodeTo(&encoded);
+    Footer decoded_footer;
+    Slice encoded_slice(encoded);
+    decoded_footer.DecodeFrom(&encoded_slice);
+    ASSERT_EQ(decoded_footer.table_magic_number(), kBlockBasedTableMagicNumber);
+    ASSERT_EQ(decoded_footer.checksum(), kxxHash);
+    ASSERT_EQ(decoded_footer.metaindex_handle().offset(), meta_index.offset());
+    ASSERT_EQ(decoded_footer.metaindex_handle().size(), meta_index.size());
+    ASSERT_EQ(decoded_footer.index_handle().offset(), index.offset());
+    ASSERT_EQ(decoded_footer.index_handle().size(), index.size());
+  }
+  {
+    // upconvert legacy plain table
+    std::string encoded;
+    Footer footer(kLegacyPlainTableMagicNumber);
+    BlockHandle meta_index(10, 5), index(20, 15);
+    footer.set_metaindex_handle(meta_index);
+    footer.set_index_handle(index);
+    footer.EncodeTo(&encoded);
+    Footer decoded_footer;
+    Slice encoded_slice(encoded);
+    decoded_footer.DecodeFrom(&encoded_slice);
+    ASSERT_EQ(decoded_footer.table_magic_number(), kPlainTableMagicNumber);
+    ASSERT_EQ(decoded_footer.checksum(), kCRC32c);
+    ASSERT_EQ(decoded_footer.metaindex_handle().offset(), meta_index.offset());
+    ASSERT_EQ(decoded_footer.metaindex_handle().size(), meta_index.size());
+    ASSERT_EQ(decoded_footer.index_handle().offset(), index.offset());
+    ASSERT_EQ(decoded_footer.index_handle().size(), index.size());
+  }
+  {
+    // xxhash plain table
+    std::string encoded;
+    Footer footer(kPlainTableMagicNumber);
+    BlockHandle meta_index(10, 5), index(20, 15);
+    footer.set_metaindex_handle(meta_index);
+    footer.set_index_handle(index);
+    footer.set_checksum(kxxHash);
+    footer.EncodeTo(&encoded);
+    Footer decoded_footer;
+    Slice encoded_slice(encoded);
+    decoded_footer.DecodeFrom(&encoded_slice);
+    ASSERT_EQ(decoded_footer.table_magic_number(), kPlainTableMagicNumber);
+    ASSERT_EQ(decoded_footer.checksum(), kxxHash);
+    ASSERT_EQ(decoded_footer.metaindex_handle().offset(), meta_index.offset());
+    ASSERT_EQ(decoded_footer.metaindex_handle().size(), meta_index.size());
+    ASSERT_EQ(decoded_footer.index_handle().offset(), index.offset());
+    ASSERT_EQ(decoded_footer.index_handle().size(), index.size());
+  }
+}
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  return rocksdb::test::RunAllTests();
+}
diff --git a/table/two_level_iterator.cc b/table/two_level_iterator.cc
new file mode 100644
index 0000000000..6af48f58ce
--- /dev/null
+++ b/table/two_level_iterator.cc
@@ -0,0 +1,199 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+ +#include "table/two_level_iterator.h" + +#include "rocksdb/options.h" +#include "rocksdb/table.h" +#include "table/block.h" +#include "table/format.h" +#include "util/arena.h" + +namespace rocksdb { + +namespace { + +class TwoLevelIterator: public Iterator { + public: + explicit TwoLevelIterator(TwoLevelIteratorState* state, + Iterator* first_level_iter); + + virtual ~TwoLevelIterator() { + first_level_iter_.DeleteIter(false); + second_level_iter_.DeleteIter(false); + } + + virtual void Seek(const Slice& target); + virtual void SeekToFirst(); + virtual void SeekToLast(); + virtual void Next(); + virtual void Prev(); + + virtual bool Valid() const { + return second_level_iter_.Valid(); + } + virtual Slice key() const { + assert(Valid()); + return second_level_iter_.key(); + } + virtual Slice value() const { + assert(Valid()); + return second_level_iter_.value(); + } + virtual Status status() const { + // It'd be nice if status() returned a const Status& instead of a Status + if (!first_level_iter_.status().ok()) { + return first_level_iter_.status(); + } else if (second_level_iter_.iter() != nullptr && + !second_level_iter_.status().ok()) { + return second_level_iter_.status(); + } else { + return status_; + } + } + + private: + void SaveError(const Status& s) { + if (status_.ok() && !s.ok()) status_ = s; + } + void SkipEmptyDataBlocksForward(); + void SkipEmptyDataBlocksBackward(); + void SetSecondLevelIterator(Iterator* iter); + void InitDataBlock(); + + std::unique_ptr state_; + IteratorWrapper first_level_iter_; + IteratorWrapper second_level_iter_; // May be nullptr + Status status_; + // If second_level_iter is non-nullptr, then "data_block_handle_" holds the + // "index_value" passed to block_function_ to create the second_level_iter. + std::string data_block_handle_; +}; + +TwoLevelIterator::TwoLevelIterator(TwoLevelIteratorState* state, + Iterator* first_level_iter) + : state_(state), first_level_iter_(first_level_iter) {} + +void TwoLevelIterator::Seek(const Slice& target) { + if (state_->check_prefix_may_match && + !state_->PrefixMayMatch(target)) { + SetSecondLevelIterator(nullptr); + return; + } + first_level_iter_.Seek(target); + + InitDataBlock(); + if (second_level_iter_.iter() != nullptr) { + second_level_iter_.Seek(target); + } + SkipEmptyDataBlocksForward(); +} + +void TwoLevelIterator::SeekToFirst() { + first_level_iter_.SeekToFirst(); + InitDataBlock(); + if (second_level_iter_.iter() != nullptr) { + second_level_iter_.SeekToFirst(); + } + SkipEmptyDataBlocksForward(); +} + +void TwoLevelIterator::SeekToLast() { + first_level_iter_.SeekToLast(); + InitDataBlock(); + if (second_level_iter_.iter() != nullptr) { + second_level_iter_.SeekToLast(); + } + SkipEmptyDataBlocksBackward(); +} + +void TwoLevelIterator::Next() { + assert(Valid()); + second_level_iter_.Next(); + SkipEmptyDataBlocksForward(); +} + +void TwoLevelIterator::Prev() { + assert(Valid()); + second_level_iter_.Prev(); + SkipEmptyDataBlocksBackward(); +} + + +void TwoLevelIterator::SkipEmptyDataBlocksForward() { + while (second_level_iter_.iter() == nullptr || + (!second_level_iter_.Valid() && + !second_level_iter_.status().IsIncomplete())) { + // Move to next block + if (!first_level_iter_.Valid()) { + SetSecondLevelIterator(nullptr); + return; + } + first_level_iter_.Next(); + InitDataBlock(); + if (second_level_iter_.iter() != nullptr) { + second_level_iter_.SeekToFirst(); + } + } +} + +void TwoLevelIterator::SkipEmptyDataBlocksBackward() { + while (second_level_iter_.iter() == nullptr || + 
(!second_level_iter_.Valid() &&
+          !second_level_iter_.status().IsIncomplete())) {
+    // Move to the previous block
+    if (!first_level_iter_.Valid()) {
+      SetSecondLevelIterator(nullptr);
+      return;
+    }
+    first_level_iter_.Prev();
+    InitDataBlock();
+    if (second_level_iter_.iter() != nullptr) {
+      second_level_iter_.SeekToLast();
+    }
+  }
+}
+
+void TwoLevelIterator::SetSecondLevelIterator(Iterator* iter) {
+  if (second_level_iter_.iter() != nullptr) {
+    SaveError(second_level_iter_.status());
+  }
+  second_level_iter_.Set(iter);
+}
+
+void TwoLevelIterator::InitDataBlock() {
+  if (!first_level_iter_.Valid()) {
+    SetSecondLevelIterator(nullptr);
+  } else {
+    Slice handle = first_level_iter_.value();
+    if (second_level_iter_.iter() != nullptr
+        && handle.compare(data_block_handle_) == 0) {
+      // second_level_iter is already constructed with this iterator, so
+      // no need to change anything
+    } else {
+      Iterator* iter = state_->NewSecondaryIterator(handle);
+      data_block_handle_.assign(handle.data(), handle.size());
+      SetSecondLevelIterator(iter);
+    }
+  }
+}
+
+}  // namespace
+
+Iterator* NewTwoLevelIterator(TwoLevelIteratorState* state,
+                              Iterator* first_level_iter, Arena* arena) {
+  if (arena == nullptr) {
+    return new TwoLevelIterator(state, first_level_iter);
+  } else {
+    auto mem = arena->AllocateAligned(sizeof(TwoLevelIterator));
+    return new (mem) TwoLevelIterator(state, first_level_iter);
+  }
+}
+
+}  // namespace rocksdb
diff --git a/table/two_level_iterator.h b/table/two_level_iterator.h
new file mode 100644
index 0000000000..d955dd7631
--- /dev/null
+++ b/table/two_level_iterator.h
@@ -0,0 +1,50 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include "rocksdb/iterator.h"
+#include "rocksdb/env.h"
+#include "table/iterator_wrapper.h"
+
+namespace rocksdb {
+
+struct ReadOptions;
+class InternalKeyComparator;
+class Arena;
+
+struct TwoLevelIteratorState {
+  explicit TwoLevelIteratorState(bool check_prefix_may_match)
+      : check_prefix_may_match(check_prefix_may_match) {}
+
+  virtual ~TwoLevelIteratorState() {}
+  virtual Iterator* NewSecondaryIterator(const Slice& handle) = 0;
+  virtual bool PrefixMayMatch(const Slice& internal_key) = 0;
+
+  // Whether Seek() should first consult PrefixMayMatch()
+  bool check_prefix_may_match;
+};
+
+
+// Return a new two level iterator. A two-level iterator contains an
+// index iterator whose values point to a sequence of blocks where
+// each block is itself a sequence of key/value pairs. The returned
+// two-level iterator yields the concatenation of all key/value pairs
+// in the sequence of blocks. Takes ownership of "first_level_iter" and
+// will delete it when no longer needed.
+//
+// Uses the state's NewSecondaryIterator() to convert a first_level_iter
+// value into an iterator over the contents of the corresponding block.
+// arena: If not null, the arena is used to allocate the Iterator.
+//        When destroying the iterator, the destructor will destroy
+//        all the state except what was allocated in the arena.
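+//
+// A minimal usage sketch (editorial illustration only; "IndexedBlocks" and
+// "OpenBlock" are hypothetical names, not part of this patch):
+//
+//   struct IndexedBlocks : public TwoLevelIteratorState {
+//     IndexedBlocks() : TwoLevelIteratorState(false /* no prefix check */) {}
+//     Iterator* NewSecondaryIterator(const Slice& handle) override {
+//       // Decode "handle" and return an iterator over that block.
+//       return OpenBlock(handle);  // hypothetical helper
+//     }
+//     bool PrefixMayMatch(const Slice& internal_key) override { return true; }
+//   };
+//   Iterator* it = NewTwoLevelIterator(new IndexedBlocks, index_iter);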
+
+extern Iterator* NewTwoLevelIterator(TwoLevelIteratorState* state,
+                                     Iterator* first_level_iter,
+                                     Arena* arena = nullptr);
+
+}  // namespace rocksdb
diff --git a/third-party/rapidjson/document.h b/third-party/rapidjson/document.h
new file mode 100644
index 0000000000..83d95a33d0
--- /dev/null
+++ b/third-party/rapidjson/document.h
@@ -0,0 +1,821 @@
+#ifndef RAPIDJSON_DOCUMENT_H_
+#define RAPIDJSON_DOCUMENT_H_
+
+#include "reader.h"
+#include "internal/strfunc.h"
+#include <new> // placement new
+
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable : 4127) // conditional expression is constant
+#endif
+
+namespace rapidjson {
+
+///////////////////////////////////////////////////////////////////////////////
+// GenericValue
+
+//! Represents a JSON value. Use Value for UTF8 encoding and default allocator.
+/*!
+    A JSON value can be one of 7 types. This class is a variant type supporting
+    these types.
+
+    Use Value if UTF8 encoding and the default allocator suffice.
+
+    \tparam Encoding Encoding of the value. (Even non-string values need to have the same encoding in a document)
+    \tparam Allocator Allocator type for allocating memory of object, array and string.
+*/
+#pragma pack (push, 4)
+template <typename Encoding, typename Allocator = MemoryPoolAllocator<> >
+class GenericValue {
+public:
+    //! Name-value pair in an object.
+    struct Member {
+        GenericValue name;  //!< name of member (must be a string)
+        GenericValue value; //!< value of member.
+    };
+
+    typedef Encoding EncodingType;                  //!< Encoding type from template parameter.
+    typedef Allocator AllocatorType;                //!< Allocator type from template parameter.
+    typedef typename Encoding::Ch Ch;               //!< Character type derived from Encoding.
+    typedef Member* MemberIterator;                 //!< Member iterator for iterating in object.
+    typedef const Member* ConstMemberIterator;      //!< Constant member iterator for iterating in object.
+    typedef GenericValue* ValueIterator;            //!< Value iterator for iterating in array.
+    typedef const GenericValue* ConstValueIterator; //!< Constant value iterator for iterating in array.
+
+    //!@name Constructors and destructor.
+    //@{
+
+    //! Default constructor creates a null value.
+    GenericValue() : flags_(kNullFlag) {}
+
+    //! Copy constructor is not permitted.
+private:
+    GenericValue(const GenericValue& rhs);
+
+public:
+
+    //! Constructor with JSON value type.
+    /*! This creates a Value of the specified type with default content.
+        \param type Type of the value.
+        \note Default content for a number is zero.
+    */
+    GenericValue(Type type) {
+        static const unsigned defaultFlags[7] = {
+            kNullFlag, kFalseFlag, kTrueFlag, kObjectFlag, kArrayFlag, kConstStringFlag,
+            kNumberFlag | kIntFlag | kUintFlag | kInt64Flag | kUint64Flag | kDoubleFlag
+        };
+        RAPIDJSON_ASSERT(type <= kNumberType);
+        flags_ = defaultFlags[type];
+        memset(&data_, 0, sizeof(data_));
+    }
+
+    //! Constructor for boolean value.
+    GenericValue(bool b) : flags_(b ? kTrueFlag : kFalseFlag) {}
+
+    //! Constructor for int value.
+    GenericValue(int i) : flags_(kNumberIntFlag) {
+        data_.n.i64 = i;
+        if (i >= 0)
+            flags_ |= kUintFlag | kUint64Flag;
+    }
+
+    //! Constructor for unsigned value.
+    GenericValue(unsigned u) : flags_(kNumberUintFlag) {
+        data_.n.u64 = u;
+        if (!(u & 0x80000000))
+            flags_ |= kIntFlag | kInt64Flag;
+    }
+
+    //! Constructor for int64_t value.
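+    //! \note The bit tests below also record every narrower integer type the
+    //! value fits in, so the matching GetInt()/GetUint()/GetUint64() accessors
+    //! remain valid without any conversion.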
+ GenericValue(int64_t i64) : flags_(kNumberInt64Flag) { + data_.n.i64 = i64; + if (i64 >= 0) { + flags_ |= kNumberUint64Flag; + if (!(i64 & 0xFFFFFFFF00000000LL)) + flags_ |= kUintFlag; + if (!(i64 & 0xFFFFFFFF80000000LL)) + flags_ |= kIntFlag; + } + else if (i64 >= -2147483648LL) + flags_ |= kIntFlag; + } + + //! Constructor for uint64_t value. + GenericValue(uint64_t u64) : flags_(kNumberUint64Flag) { + data_.n.u64 = u64; + if (!(u64 & 0x8000000000000000ULL)) + flags_ |= kInt64Flag; + if (!(u64 & 0xFFFFFFFF00000000ULL)) + flags_ |= kUintFlag; + if (!(u64 & 0xFFFFFFFF80000000ULL)) + flags_ |= kIntFlag; + } + + //! Constructor for double value. + GenericValue(double d) : flags_(kNumberDoubleFlag) { data_.n.d = d; } + + //! Constructor for constant string (i.e. do not make a copy of string) + GenericValue(const Ch* s, SizeType length) { + RAPIDJSON_ASSERT(s != NULL); + flags_ = kConstStringFlag; + data_.s.str = s; + data_.s.length = length; + } + + //! Constructor for constant string (i.e. do not make a copy of string) + GenericValue(const Ch* s) { SetStringRaw(s, internal::StrLen(s)); } + + //! Constructor for copy-string (i.e. do make a copy of string) + GenericValue(const Ch* s, SizeType length, Allocator& allocator) { SetStringRaw(s, length, allocator); } + + //! Constructor for copy-string (i.e. do make a copy of string) + GenericValue(const Ch*s, Allocator& allocator) { SetStringRaw(s, internal::StrLen(s), allocator); } + + //! Destructor. + /*! Need to destruct elements of array, members of object, or copy-string. + */ + ~GenericValue() { + if (Allocator::kNeedFree) { // Shortcut by Allocator's trait + switch(flags_) { + case kArrayFlag: + for (GenericValue* v = data_.a.elements; v != data_.a.elements + data_.a.size; ++v) + v->~GenericValue(); + Allocator::Free(data_.a.elements); + break; + + case kObjectFlag: + for (Member* m = data_.o.members; m != data_.o.members + data_.o.size; ++m) { + m->name.~GenericValue(); + m->value.~GenericValue(); + } + Allocator::Free(data_.o.members); + break; + + case kCopyStringFlag: + Allocator::Free(const_cast(data_.s.str)); + break; + } + } + } + + //@} + + //!@name Assignment operators + //@{ + + //! Assignment with move semantics. + /*! \param rhs Source of the assignment. It will become a null value after assignment. + */ + GenericValue& operator=(GenericValue& rhs) { + RAPIDJSON_ASSERT(this != &rhs); + this->~GenericValue(); + memcpy(this, &rhs, sizeof(GenericValue)); + rhs.flags_ = kNullFlag; + return *this; + } + + //! Assignment with primitive types. + /*! \tparam T Either Type, int, unsigned, int64_t, uint64_t, const Ch* + \param value The value to be assigned. 
+ */ + template + GenericValue& operator=(T value) { + this->~GenericValue(); + new (this) GenericValue(value); + return *this; + } + //@} + + //!@name Type + //@{ + + Type GetType() const { return static_cast(flags_ & kTypeMask); } + bool IsNull() const { return flags_ == kNullFlag; } + bool IsFalse() const { return flags_ == kFalseFlag; } + bool IsTrue() const { return flags_ == kTrueFlag; } + bool IsBool() const { return (flags_ & kBoolFlag) != 0; } + bool IsObject() const { return flags_ == kObjectFlag; } + bool IsArray() const { return flags_ == kArrayFlag; } + bool IsNumber() const { return (flags_ & kNumberFlag) != 0; } + bool IsInt() const { return (flags_ & kIntFlag) != 0; } + bool IsUint() const { return (flags_ & kUintFlag) != 0; } + bool IsInt64() const { return (flags_ & kInt64Flag) != 0; } + bool IsUint64() const { return (flags_ & kUint64Flag) != 0; } + bool IsDouble() const { return (flags_ & kDoubleFlag) != 0; } + bool IsString() const { return (flags_ & kStringFlag) != 0; } + + //@} + + //!@name Null + //@{ + + GenericValue& SetNull() { this->~GenericValue(); new (this) GenericValue(); return *this; } + + //@} + + //!@name Bool + //@{ + + bool GetBool() const { RAPIDJSON_ASSERT(IsBool()); return flags_ == kTrueFlag; } + GenericValue& SetBool(bool b) { this->~GenericValue(); new (this) GenericValue(b); return *this; } + + //@} + + //!@name Object + //@{ + + //! Set this value as an empty object. + GenericValue& SetObject() { this->~GenericValue(); new (this) GenericValue(kObjectType); return *this; } + + //! Get the value associated with the object's name. + GenericValue& operator[](const Ch* name) { + if (Member* member = FindMember(name)) + return member->value; + else { + static GenericValue NullValue; + return NullValue; + } + } + const GenericValue& operator[](const Ch* name) const { return const_cast(*this)[name]; } + + //! Member iterators. + ConstMemberIterator MemberBegin() const { RAPIDJSON_ASSERT(IsObject()); return data_.o.members; } + ConstMemberIterator MemberEnd() const { RAPIDJSON_ASSERT(IsObject()); return data_.o.members + data_.o.size; } + MemberIterator MemberBegin() { RAPIDJSON_ASSERT(IsObject()); return data_.o.members; } + MemberIterator MemberEnd() { RAPIDJSON_ASSERT(IsObject()); return data_.o.members + data_.o.size; } + + //! Check whether a member exists in the object. + bool HasMember(const Ch* name) const { return FindMember(name) != 0; } + + //! Add a member (name-value pair) to the object. + /*! \param name A string value as name of member. + \param value Value of any type. + \param allocator Allocator for reallocating memory. + \return The value itself for fluent API. + \note The ownership of name and value will be transfered to this object if success. 
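+        \note The member array grows by doubling its capacity, so a sequence
+        of AddMember() calls is amortized O(1) per member.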
+ */ + GenericValue& AddMember(GenericValue& name, GenericValue& value, Allocator& allocator) { + RAPIDJSON_ASSERT(IsObject()); + RAPIDJSON_ASSERT(name.IsString()); + Object& o = data_.o; + if (o.size >= o.capacity) { + if (o.capacity == 0) { + o.capacity = kDefaultObjectCapacity; + o.members = (Member*)allocator.Malloc(o.capacity * sizeof(Member)); + } + else { + SizeType oldCapacity = o.capacity; + o.capacity *= 2; + o.members = (Member*)allocator.Realloc(o.members, oldCapacity * sizeof(Member), o.capacity * sizeof(Member)); + } + } + o.members[o.size].name.RawAssign(name); + o.members[o.size].value.RawAssign(value); + o.size++; + return *this; + } + + GenericValue& AddMember(const Ch* name, Allocator& nameAllocator, GenericValue& value, Allocator& allocator) { + GenericValue n(name, internal::StrLen(name), nameAllocator); + return AddMember(n, value, allocator); + } + + GenericValue& AddMember(const Ch* name, GenericValue& value, Allocator& allocator) { + GenericValue n(name, internal::StrLen(name)); + return AddMember(n, value, allocator); + } + + template + GenericValue& AddMember(const Ch* name, T value, Allocator& allocator) { + GenericValue n(name, internal::StrLen(name)); + GenericValue v(value); + return AddMember(n, v, allocator); + } + + //! Remove a member in object by its name. + /*! \param name Name of member to be removed. + \return Whether the member existed. + \note Removing member is implemented by moving the last member. So the ordering of members is changed. + */ + bool RemoveMember(const Ch* name) { + RAPIDJSON_ASSERT(IsObject()); + if (Member* m = FindMember(name)) { + RAPIDJSON_ASSERT(data_.o.size > 0); + RAPIDJSON_ASSERT(data_.o.members != 0); + + Member* last = data_.o.members + (data_.o.size - 1); + if (data_.o.size > 1 && m != last) { + // Move the last one to this place + m->name = last->name; + m->value = last->value; + } + else { + // Only one left, just destroy + m->name.~GenericValue(); + m->value.~GenericValue(); + } + --data_.o.size; + return true; + } + return false; + } + + //@} + + //!@name Array + //@{ + + //! Set this value as an empty array. + GenericValue& SetArray() { this->~GenericValue(); new (this) GenericValue(kArrayType); return *this; } + + //! Get the number of elements in array. + SizeType Size() const { RAPIDJSON_ASSERT(IsArray()); return data_.a.size; } + + //! Get the capacity of array. + SizeType Capacity() const { RAPIDJSON_ASSERT(IsArray()); return data_.a.capacity; } + + //! Check whether the array is empty. + bool Empty() const { RAPIDJSON_ASSERT(IsArray()); return data_.a.size == 0; } + + //! Remove all elements in the array. + /*! This function do not deallocate memory in the array, i.e. the capacity is unchanged. + */ + void Clear() { + RAPIDJSON_ASSERT(IsArray()); + for (SizeType i = 0; i < data_.a.size; ++i) + data_.a.elements[i].~GenericValue(); + data_.a.size = 0; + } + + //! Get an element from array by index. + /*! \param index Zero-based index of element. + \note +\code +Value a(kArrayType); +a.PushBack(123); +int x = a[0].GetInt(); // Error: operator[ is ambiguous, as 0 also mean a null pointer of const char* type. +int y = a[SizeType(0)].GetInt(); // Cast to SizeType will work. +int z = a[0u].GetInt(); // This works too. +\endcode + */ + GenericValue& operator[](SizeType index) { + RAPIDJSON_ASSERT(IsArray()); + RAPIDJSON_ASSERT(index < data_.a.size); + return data_.a.elements[index]; + } + const GenericValue& operator[](SizeType index) const { return const_cast(*this)[index]; } + + //! 
Element iterator + ValueIterator Begin() { RAPIDJSON_ASSERT(IsArray()); return data_.a.elements; } + ValueIterator End() { RAPIDJSON_ASSERT(IsArray()); return data_.a.elements + data_.a.size; } + ConstValueIterator Begin() const { return const_cast(*this).Begin(); } + ConstValueIterator End() const { return const_cast(*this).End(); } + + //! Request the array to have enough capacity to store elements. + /*! \param newCapacity The capacity that the array at least need to have. + \param allocator The allocator for allocating memory. It must be the same one use previously. + \return The value itself for fluent API. + */ + GenericValue& Reserve(SizeType newCapacity, Allocator &allocator) { + RAPIDJSON_ASSERT(IsArray()); + if (newCapacity > data_.a.capacity) { + data_.a.elements = (GenericValue*)allocator.Realloc(data_.a.elements, data_.a.capacity * sizeof(GenericValue), newCapacity * sizeof(GenericValue)); + data_.a.capacity = newCapacity; + } + return *this; + } + + //! Append a value at the end of the array. + /*! \param value The value to be appended. + \param allocator The allocator for allocating memory. It must be the same one use previously. + \return The value itself for fluent API. + \note The ownership of the value will be transfered to this object if success. + \note If the number of elements to be appended is known, calls Reserve() once first may be more efficient. + */ + GenericValue& PushBack(GenericValue& value, Allocator& allocator) { + RAPIDJSON_ASSERT(IsArray()); + if (data_.a.size >= data_.a.capacity) + Reserve(data_.a.capacity == 0 ? kDefaultArrayCapacity : data_.a.capacity * 2, allocator); + data_.a.elements[data_.a.size++].RawAssign(value); + return *this; + } + + template + GenericValue& PushBack(T value, Allocator& allocator) { + GenericValue v(value); + return PushBack(v, allocator); + } + + //! Remove the last element in the array. + GenericValue& PopBack() { + RAPIDJSON_ASSERT(IsArray()); + RAPIDJSON_ASSERT(!Empty()); + data_.a.elements[--data_.a.size].~GenericValue(); + return *this; + } + //@} + + //!@name Number + //@{ + + int GetInt() const { RAPIDJSON_ASSERT(flags_ & kIntFlag); return data_.n.i.i; } + unsigned GetUint() const { RAPIDJSON_ASSERT(flags_ & kUintFlag); return data_.n.u.u; } + int64_t GetInt64() const { RAPIDJSON_ASSERT(flags_ & kInt64Flag); return data_.n.i64; } + uint64_t GetUint64() const { RAPIDJSON_ASSERT(flags_ & kUint64Flag); return data_.n.u64; } + + double GetDouble() const { + RAPIDJSON_ASSERT(IsNumber()); + if ((flags_ & kDoubleFlag) != 0) return data_.n.d; // exact type, no conversion. 
+ if ((flags_ & kIntFlag) != 0) return data_.n.i.i; // int -> double + if ((flags_ & kUintFlag) != 0) return data_.n.u.u; // unsigned -> double + if ((flags_ & kInt64Flag) != 0) return (double)data_.n.i64; // int64_t -> double (may lose precision) + RAPIDJSON_ASSERT((flags_ & kUint64Flag) != 0); return (double)data_.n.u64; // uint64_t -> double (may lose precision) + } + + GenericValue& SetInt(int i) { this->~GenericValue(); new (this) GenericValue(i); return *this; } + GenericValue& SetUint(unsigned u) { this->~GenericValue(); new (this) GenericValue(u); return *this; } + GenericValue& SetInt64(int64_t i64) { this->~GenericValue(); new (this) GenericValue(i64); return *this; } + GenericValue& SetUint64(uint64_t u64) { this->~GenericValue(); new (this) GenericValue(u64); return *this; } + GenericValue& SetDouble(double d) { this->~GenericValue(); new (this) GenericValue(d); return *this; } + + //@} + + //!@name String + //@{ + + const Ch* GetString() const { RAPIDJSON_ASSERT(IsString()); return data_.s.str; } + + //! Get the length of string. + /*! Since rapidjson permits "\u0000" in the json string, strlen(v.GetString()) may not equal to v.GetStringLength(). + */ + SizeType GetStringLength() const { RAPIDJSON_ASSERT(IsString()); return data_.s.length; } + + //! Set this value as a string without copying source string. + /*! This version has better performance with supplied length, and also support string containing null character. + \param s source string pointer. + \param length The length of source string, excluding the trailing null terminator. + \return The value itself for fluent API. + */ + GenericValue& SetString(const Ch* s, SizeType length) { this->~GenericValue(); SetStringRaw(s, length); return *this; } + + //! Set this value as a string without copying source string. + /*! \param s source string pointer. + \return The value itself for fluent API. + */ + GenericValue& SetString(const Ch* s) { return SetString(s, internal::StrLen(s)); } + + //! Set this value as a string by copying from source string. + /*! This version has better performance with supplied length, and also support string containing null character. + \param s source string. + \param length The length of source string, excluding the trailing null terminator. + \param allocator Allocator for allocating copied buffer. Commonly use document.GetAllocator(). + \return The value itself for fluent API. + */ + GenericValue& SetString(const Ch* s, SizeType length, Allocator& allocator) { this->~GenericValue(); SetStringRaw(s, length, allocator); return *this; } + + //! Set this value as a string by copying from source string. + /*! \param s source string. + \param allocator Allocator for allocating copied buffer. Commonly use document.GetAllocator(). + \return The value itself for fluent API. + */ + GenericValue& SetString(const Ch* s, Allocator& allocator) { SetString(s, internal::StrLen(s), allocator); return *this; } + + //@} + + //! Generate events of this value to a Handler. + /*! This function adopts the GoF visitor pattern. + Typical usage is to output this JSON value as JSON text via Writer, which is a Handler. + It can also be used to deep clone this value via GenericDocument, which is also a Handler. + \tparam Handler type of handler. + \param handler An object implementing concept Handler. 
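+        \note Events are emitted depth-first: StartObject()/EndObject() and
+        StartArray()/EndArray() bracket the members and elements they contain,
+        mirroring the recursive switch in the function body.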
+ */ + template + const GenericValue& Accept(Handler& handler) const { + switch(GetType()) { + case kNullType: handler.Null(); break; + case kFalseType: handler.Bool(false); break; + case kTrueType: handler.Bool(true); break; + + case kObjectType: + handler.StartObject(); + for (Member* m = data_.o.members; m != data_.o.members + data_.o.size; ++m) { + handler.String(m->name.data_.s.str, m->name.data_.s.length, false); + m->value.Accept(handler); + } + handler.EndObject(data_.o.size); + break; + + case kArrayType: + handler.StartArray(); + for (GenericValue* v = data_.a.elements; v != data_.a.elements + data_.a.size; ++v) + v->Accept(handler); + handler.EndArray(data_.a.size); + break; + + case kStringType: + handler.String(data_.s.str, data_.s.length, false); + break; + + case kNumberType: + if (IsInt()) handler.Int(data_.n.i.i); + else if (IsUint()) handler.Uint(data_.n.u.u); + else if (IsInt64()) handler.Int64(data_.n.i64); + else if (IsUint64()) handler.Uint64(data_.n.u64); + else handler.Double(data_.n.d); + break; + } + return *this; + } + +private: + template + friend class GenericDocument; + + enum { + kBoolFlag = 0x100, + kNumberFlag = 0x200, + kIntFlag = 0x400, + kUintFlag = 0x800, + kInt64Flag = 0x1000, + kUint64Flag = 0x2000, + kDoubleFlag = 0x4000, + kStringFlag = 0x100000, + kCopyFlag = 0x200000, + + // Initial flags of different types. + kNullFlag = kNullType, + kTrueFlag = kTrueType | kBoolFlag, + kFalseFlag = kFalseType | kBoolFlag, + kNumberIntFlag = kNumberType | kNumberFlag | kIntFlag | kInt64Flag, + kNumberUintFlag = kNumberType | kNumberFlag | kUintFlag | kUint64Flag | kInt64Flag, + kNumberInt64Flag = kNumberType | kNumberFlag | kInt64Flag, + kNumberUint64Flag = kNumberType | kNumberFlag | kUint64Flag, + kNumberDoubleFlag = kNumberType | kNumberFlag | kDoubleFlag, + kConstStringFlag = kStringType | kStringFlag, + kCopyStringFlag = kStringType | kStringFlag | kCopyFlag, + kObjectFlag = kObjectType, + kArrayFlag = kArrayType, + + kTypeMask = 0xFF // bitwise-and with mask of 0xFF can be optimized by compiler + }; + + static const SizeType kDefaultArrayCapacity = 16; + static const SizeType kDefaultObjectCapacity = 16; + + struct String { + const Ch* str; + SizeType length; + unsigned hashcode; //!< reserved + }; // 12 bytes in 32-bit mode, 16 bytes in 64-bit mode + + // By using proper binary layout, retrieval of different integer types do not need conversions. + union Number { +#if RAPIDJSON_ENDIAN == RAPIDJSON_LITTLEENDIAN + struct I { + int i; + char padding[4]; + }i; + struct U { + unsigned u; + char padding2[4]; + }u; +#else + struct I { + char padding[4]; + int i; + }i; + struct U { + char padding2[4]; + unsigned u; + }u; +#endif + int64_t i64; + uint64_t u64; + double d; + }; // 8 bytes + + struct Object { + Member* members; + SizeType size; + SizeType capacity; + }; // 12 bytes in 32-bit mode, 16 bytes in 64-bit mode + + struct Array { + GenericValue* elements; + SizeType size; + SizeType capacity; + }; // 12 bytes in 32-bit mode, 16 bytes in 64-bit mode + + union Data { + String s; + Number n; + Object o; + Array a; + }; // 12 bytes in 32-bit mode, 16 bytes in 64-bit mode + + //! Find member by name. 
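+    //! \note Lookup is a linear scan over the member array, i.e. O(n) in the
+    //! number of members.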
+ Member* FindMember(const Ch* name) { + RAPIDJSON_ASSERT(name); + RAPIDJSON_ASSERT(IsObject()); + + SizeType length = internal::StrLen(name); + + Object& o = data_.o; + for (Member* member = o.members; member != data_.o.members + data_.o.size; ++member) + if (length == member->name.data_.s.length && memcmp(member->name.data_.s.str, name, length * sizeof(Ch)) == 0) + return member; + + return 0; + } + const Member* FindMember(const Ch* name) const { return const_cast(*this).FindMember(name); } + + // Initialize this value as array with initial data, without calling destructor. + void SetArrayRaw(GenericValue* values, SizeType count, Allocator& alloctaor) { + flags_ = kArrayFlag; + data_.a.elements = (GenericValue*)alloctaor.Malloc(count * sizeof(GenericValue)); + memcpy(data_.a.elements, values, count * sizeof(GenericValue)); + data_.a.size = data_.a.capacity = count; + } + + //! Initialize this value as object with initial data, without calling destructor. + void SetObjectRaw(Member* members, SizeType count, Allocator& alloctaor) { + flags_ = kObjectFlag; + data_.o.members = (Member*)alloctaor.Malloc(count * sizeof(Member)); + memcpy(data_.o.members, members, count * sizeof(Member)); + data_.o.size = data_.o.capacity = count; + } + + //! Initialize this value as constant string, without calling destructor. + void SetStringRaw(const Ch* s, SizeType length) { + RAPIDJSON_ASSERT(s != NULL); + flags_ = kConstStringFlag; + data_.s.str = s; + data_.s.length = length; + } + + //! Initialize this value as copy string with initial data, without calling destructor. + void SetStringRaw(const Ch* s, SizeType length, Allocator& allocator) { + RAPIDJSON_ASSERT(s != NULL); + flags_ = kCopyStringFlag; + data_.s.str = (Ch *)allocator.Malloc((length + 1) * sizeof(Ch)); + data_.s.length = length; + memcpy(const_cast(data_.s.str), s, length * sizeof(Ch)); + const_cast(data_.s.str)[length] = '\0'; + } + + //! Assignment without calling destructor + void RawAssign(GenericValue& rhs) { + memcpy(this, &rhs, sizeof(GenericValue)); + rhs.flags_ = kNullFlag; + } + + Data data_; + unsigned flags_; +}; +#pragma pack (pop) + +//! Value with UTF8 encoding. +typedef GenericValue > Value; + +/////////////////////////////////////////////////////////////////////////////// +// GenericDocument + +//! A document for parsing JSON text as DOM. +/*! + \implements Handler + \tparam Encoding encoding for both parsing and string storage. + \tparam Alloactor allocator for allocating memory for the DOM, and the stack during parsing. +*/ +template > +class GenericDocument : public GenericValue { +public: + typedef typename Encoding::Ch Ch; //!< Character type derived from Encoding. + typedef GenericValue ValueType; //!< Value type of the document. + typedef Allocator AllocatorType; //!< Allocator type from template parameter. + + //! Constructor + /*! \param allocator Optional allocator for allocating stack memory. + \param stackCapacity Initial capacity of stack in bytes. + */ + GenericDocument(Allocator* allocator = 0, size_t stackCapacity = kDefaultStackCapacity) : stack_(allocator, stackCapacity), parseError_(0), errorOffset_(0) {} + + //! Parse JSON text from an input stream. + /*! \tparam parseFlags Combination of ParseFlag. + \param stream Input stream to be parsed. + \return The document itself for fluent API. 
+    */
+    template <unsigned parseFlags, typename Stream>
+    GenericDocument& ParseStream(Stream& stream) {
+        ValueType::SetNull(); // Remove the existing root if one exists
+        GenericReader<Encoding, Allocator> reader;
+        if (reader.template Parse<parseFlags>(stream, *this)) {
+            RAPIDJSON_ASSERT(stack_.GetSize() == sizeof(ValueType)); // Got one and only one root object
+            this->RawAssign(*stack_.template Pop<ValueType>(1)); // Add this-> to prevent issue 13.
+            parseError_ = 0;
+            errorOffset_ = 0;
+        }
+        else {
+            parseError_ = reader.GetParseError();
+            errorOffset_ = reader.GetErrorOffset();
+            ClearStack();
+        }
+        return *this;
+    }
+
+    //! Parse JSON text from a mutable string.
+    /*! \tparam parseFlags Combination of ParseFlag.
+        \param str Mutable zero-terminated string to be parsed.
+        \return The document itself for fluent API.
+    */
+    template <unsigned parseFlags>
+    GenericDocument& ParseInsitu(Ch* str) {
+        GenericInsituStringStream<Encoding> s(str);
+        return ParseStream<parseFlags | kParseInsituFlag>(s);
+    }
+
+    //! Parse JSON text from a read-only string.
+    /*! \tparam parseFlags Combination of ParseFlag (must not contain kParseInsituFlag).
+        \param str Read-only zero-terminated string to be parsed.
+    */
+    template <unsigned parseFlags>
+    GenericDocument& Parse(const Ch* str) {
+        RAPIDJSON_ASSERT(!(parseFlags & kParseInsituFlag));
+        GenericStringStream<Encoding> s(str);
+        return ParseStream<parseFlags>(s);
+    }
+
+    //! Whether a parse error occurred in the last parsing.
+    bool HasParseError() const { return parseError_ != 0; }
+
+    //! Get the message of the parsing error.
+    const char* GetParseError() const { return parseError_; }
+
+    //! Get the offset in characters of the parsing error.
+    size_t GetErrorOffset() const { return errorOffset_; }
+
+    //! Get the allocator of this document.
+    Allocator& GetAllocator() { return stack_.GetAllocator(); }
+
+    //! Get the capacity of the stack in bytes.
+    size_t GetStackCapacity() const { return stack_.GetCapacity(); }
+
+private:
+    // Prohibit assignment
+    GenericDocument& operator=(const GenericDocument&);
+
+    friend class GenericReader<Encoding, Allocator>; // for Reader to call the following private handler functions
+
+    // Implementation of Handler
+    void Null() { new (stack_.template Push<ValueType>()) ValueType(); }
+    void Bool(bool b) { new (stack_.template Push<ValueType>()) ValueType(b); }
+    void Int(int i) { new (stack_.template Push<ValueType>()) ValueType(i); }
+    void Uint(unsigned i) { new (stack_.template Push<ValueType>()) ValueType(i); }
+    void Int64(int64_t i) { new (stack_.template Push<ValueType>()) ValueType(i); }
+    void Uint64(uint64_t i) { new (stack_.template Push<ValueType>()) ValueType(i); }
+    void Double(double d) { new (stack_.template Push<ValueType>()) ValueType(d); }
+
+    void String(const Ch* str, SizeType length, bool copy) {
+        if (copy)
+            new (stack_.template Push<ValueType>()) ValueType(str, length, GetAllocator());
+        else
+            new (stack_.template Push<ValueType>()) ValueType(str, length);
+    }
+
+    void StartObject() { new (stack_.template Push<ValueType>()) ValueType(kObjectType); }
+
+    void EndObject(SizeType memberCount) {
+        typename ValueType::Member* members = stack_.template Pop<typename ValueType::Member>(memberCount);
+        stack_.template Top<ValueType>()->SetObjectRaw(members, (SizeType)memberCount, GetAllocator());
+    }
+
+    void StartArray() { new (stack_.template Push<ValueType>()) ValueType(kArrayType); }
+
+    void EndArray(SizeType elementCount) {
+        ValueType* elements = stack_.template Pop<ValueType>(elementCount);
+        stack_.template Top<ValueType>()->SetArrayRaw(elements, elementCount, GetAllocator());
+    }
+
+    void ClearStack() {
+        if (Allocator::kNeedFree)
+            while (stack_.GetSize() > 0) // Here assumes all elements in the stack array are GenericValue (Member is actually 2 GenericValue objects)
+                (stack_.template Pop<ValueType>(1))->~ValueType();
+        else
+            stack_.Clear();
+    }
+
+    static const size_t kDefaultStackCapacity =
1024; + internal::Stack stack_; + const char* parseError_; + size_t errorOffset_; +}; + +typedef GenericDocument > Document; + +} // namespace rapidjson + +#ifdef _MSC_VER +#pragma warning(pop) +#endif + +#endif // RAPIDJSON_DOCUMENT_H_ diff --git a/third-party/rapidjson/filestream.h b/third-party/rapidjson/filestream.h new file mode 100644 index 0000000000..885894963f --- /dev/null +++ b/third-party/rapidjson/filestream.h @@ -0,0 +1,46 @@ +#ifndef RAPIDJSON_FILESTREAM_H_ +#define RAPIDJSON_FILESTREAM_H_ + +#include + +namespace rapidjson { + +//! Wrapper of C file stream for input or output. +/*! + This simple wrapper does not check the validity of the stream. + \implements Stream +*/ +class FileStream { +public: + typedef char Ch; //!< Character type. Only support char. + + FileStream(FILE* fp) : fp_(fp), count_(0) { Read(); } + char Peek() const { return current_; } + char Take() { char c = current_; Read(); return c; } + size_t Tell() const { return count_; } + void Put(char c) { fputc(c, fp_); } + + // Not implemented + char* PutBegin() { return 0; } + size_t PutEnd(char*) { return 0; } + +private: + void Read() { + RAPIDJSON_ASSERT(fp_ != 0); + int c = fgetc(fp_); + if (c != EOF) { + current_ = (char)c; + count_++; + } + else + current_ = '\0'; + } + + FILE* fp_; + char current_; + size_t count_; +}; + +} // namespace rapidjson + +#endif // RAPIDJSON_FILESTREAM_H_ diff --git a/third-party/rapidjson/internal/pow10.h b/third-party/rapidjson/internal/pow10.h new file mode 100644 index 0000000000..bf3a9afb04 --- /dev/null +++ b/third-party/rapidjson/internal/pow10.h @@ -0,0 +1,54 @@ +#ifndef RAPIDJSON_POW10_ +#define RAPIDJSON_POW10_ + +namespace rapidjson { +namespace internal { + +//! Computes integer powers of 10 in double (10.0^n). +/*! This function uses lookup table for fast and accurate results. + \param n positive/negative exponent. Must <= 308. 
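+    \note Exponents below -308 underflow to 0.0; see the range check in the
+    function body.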
+ \return 10.0^n +*/ +inline double Pow10(int n) { + static const double e[] = { // 1e-308...1e308: 617 * 8 bytes = 4936 bytes + 1e-308,1e-307,1e-306,1e-305,1e-304,1e-303,1e-302,1e-301,1e-300, + 1e-299,1e-298,1e-297,1e-296,1e-295,1e-294,1e-293,1e-292,1e-291,1e-290,1e-289,1e-288,1e-287,1e-286,1e-285,1e-284,1e-283,1e-282,1e-281,1e-280, + 1e-279,1e-278,1e-277,1e-276,1e-275,1e-274,1e-273,1e-272,1e-271,1e-270,1e-269,1e-268,1e-267,1e-266,1e-265,1e-264,1e-263,1e-262,1e-261,1e-260, + 1e-259,1e-258,1e-257,1e-256,1e-255,1e-254,1e-253,1e-252,1e-251,1e-250,1e-249,1e-248,1e-247,1e-246,1e-245,1e-244,1e-243,1e-242,1e-241,1e-240, + 1e-239,1e-238,1e-237,1e-236,1e-235,1e-234,1e-233,1e-232,1e-231,1e-230,1e-229,1e-228,1e-227,1e-226,1e-225,1e-224,1e-223,1e-222,1e-221,1e-220, + 1e-219,1e-218,1e-217,1e-216,1e-215,1e-214,1e-213,1e-212,1e-211,1e-210,1e-209,1e-208,1e-207,1e-206,1e-205,1e-204,1e-203,1e-202,1e-201,1e-200, + 1e-199,1e-198,1e-197,1e-196,1e-195,1e-194,1e-193,1e-192,1e-191,1e-190,1e-189,1e-188,1e-187,1e-186,1e-185,1e-184,1e-183,1e-182,1e-181,1e-180, + 1e-179,1e-178,1e-177,1e-176,1e-175,1e-174,1e-173,1e-172,1e-171,1e-170,1e-169,1e-168,1e-167,1e-166,1e-165,1e-164,1e-163,1e-162,1e-161,1e-160, + 1e-159,1e-158,1e-157,1e-156,1e-155,1e-154,1e-153,1e-152,1e-151,1e-150,1e-149,1e-148,1e-147,1e-146,1e-145,1e-144,1e-143,1e-142,1e-141,1e-140, + 1e-139,1e-138,1e-137,1e-136,1e-135,1e-134,1e-133,1e-132,1e-131,1e-130,1e-129,1e-128,1e-127,1e-126,1e-125,1e-124,1e-123,1e-122,1e-121,1e-120, + 1e-119,1e-118,1e-117,1e-116,1e-115,1e-114,1e-113,1e-112,1e-111,1e-110,1e-109,1e-108,1e-107,1e-106,1e-105,1e-104,1e-103,1e-102,1e-101,1e-100, + 1e-99, 1e-98, 1e-97, 1e-96, 1e-95, 1e-94, 1e-93, 1e-92, 1e-91, 1e-90, 1e-89, 1e-88, 1e-87, 1e-86, 1e-85, 1e-84, 1e-83, 1e-82, 1e-81, 1e-80, + 1e-79, 1e-78, 1e-77, 1e-76, 1e-75, 1e-74, 1e-73, 1e-72, 1e-71, 1e-70, 1e-69, 1e-68, 1e-67, 1e-66, 1e-65, 1e-64, 1e-63, 1e-62, 1e-61, 1e-60, + 1e-59, 1e-58, 1e-57, 1e-56, 1e-55, 1e-54, 1e-53, 1e-52, 1e-51, 1e-50, 1e-49, 1e-48, 1e-47, 1e-46, 1e-45, 1e-44, 1e-43, 1e-42, 1e-41, 1e-40, + 1e-39, 1e-38, 1e-37, 1e-36, 1e-35, 1e-34, 1e-33, 1e-32, 1e-31, 1e-30, 1e-29, 1e-28, 1e-27, 1e-26, 1e-25, 1e-24, 1e-23, 1e-22, 1e-21, 1e-20, + 1e-19, 1e-18, 1e-17, 1e-16, 1e-15, 1e-14, 1e-13, 1e-12, 1e-11, 1e-10, 1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e+0, + 1e+1, 1e+2, 1e+3, 1e+4, 1e+5, 1e+6, 1e+7, 1e+8, 1e+9, 1e+10, 1e+11, 1e+12, 1e+13, 1e+14, 1e+15, 1e+16, 1e+17, 1e+18, 1e+19, 1e+20, + 1e+21, 1e+22, 1e+23, 1e+24, 1e+25, 1e+26, 1e+27, 1e+28, 1e+29, 1e+30, 1e+31, 1e+32, 1e+33, 1e+34, 1e+35, 1e+36, 1e+37, 1e+38, 1e+39, 1e+40, + 1e+41, 1e+42, 1e+43, 1e+44, 1e+45, 1e+46, 1e+47, 1e+48, 1e+49, 1e+50, 1e+51, 1e+52, 1e+53, 1e+54, 1e+55, 1e+56, 1e+57, 1e+58, 1e+59, 1e+60, + 1e+61, 1e+62, 1e+63, 1e+64, 1e+65, 1e+66, 1e+67, 1e+68, 1e+69, 1e+70, 1e+71, 1e+72, 1e+73, 1e+74, 1e+75, 1e+76, 1e+77, 1e+78, 1e+79, 1e+80, + 1e+81, 1e+82, 1e+83, 1e+84, 1e+85, 1e+86, 1e+87, 1e+88, 1e+89, 1e+90, 1e+91, 1e+92, 1e+93, 1e+94, 1e+95, 1e+96, 1e+97, 1e+98, 1e+99, 1e+100, + 1e+101,1e+102,1e+103,1e+104,1e+105,1e+106,1e+107,1e+108,1e+109,1e+110,1e+111,1e+112,1e+113,1e+114,1e+115,1e+116,1e+117,1e+118,1e+119,1e+120, + 1e+121,1e+122,1e+123,1e+124,1e+125,1e+126,1e+127,1e+128,1e+129,1e+130,1e+131,1e+132,1e+133,1e+134,1e+135,1e+136,1e+137,1e+138,1e+139,1e+140, + 1e+141,1e+142,1e+143,1e+144,1e+145,1e+146,1e+147,1e+148,1e+149,1e+150,1e+151,1e+152,1e+153,1e+154,1e+155,1e+156,1e+157,1e+158,1e+159,1e+160, + 
1e+161,1e+162,1e+163,1e+164,1e+165,1e+166,1e+167,1e+168,1e+169,1e+170,1e+171,1e+172,1e+173,1e+174,1e+175,1e+176,1e+177,1e+178,1e+179,1e+180, + 1e+181,1e+182,1e+183,1e+184,1e+185,1e+186,1e+187,1e+188,1e+189,1e+190,1e+191,1e+192,1e+193,1e+194,1e+195,1e+196,1e+197,1e+198,1e+199,1e+200, + 1e+201,1e+202,1e+203,1e+204,1e+205,1e+206,1e+207,1e+208,1e+209,1e+210,1e+211,1e+212,1e+213,1e+214,1e+215,1e+216,1e+217,1e+218,1e+219,1e+220, + 1e+221,1e+222,1e+223,1e+224,1e+225,1e+226,1e+227,1e+228,1e+229,1e+230,1e+231,1e+232,1e+233,1e+234,1e+235,1e+236,1e+237,1e+238,1e+239,1e+240, + 1e+241,1e+242,1e+243,1e+244,1e+245,1e+246,1e+247,1e+248,1e+249,1e+250,1e+251,1e+252,1e+253,1e+254,1e+255,1e+256,1e+257,1e+258,1e+259,1e+260, + 1e+261,1e+262,1e+263,1e+264,1e+265,1e+266,1e+267,1e+268,1e+269,1e+270,1e+271,1e+272,1e+273,1e+274,1e+275,1e+276,1e+277,1e+278,1e+279,1e+280, + 1e+281,1e+282,1e+283,1e+284,1e+285,1e+286,1e+287,1e+288,1e+289,1e+290,1e+291,1e+292,1e+293,1e+294,1e+295,1e+296,1e+297,1e+298,1e+299,1e+300, + 1e+301,1e+302,1e+303,1e+304,1e+305,1e+306,1e+307,1e+308 + }; + RAPIDJSON_ASSERT(n <= 308); + return n < -308 ? 0.0 : e[n + 308]; +} + +} // namespace internal +} // namespace rapidjson + +#endif // RAPIDJSON_POW10_ diff --git a/third-party/rapidjson/internal/stack.h b/third-party/rapidjson/internal/stack.h new file mode 100644 index 0000000000..966893b3fc --- /dev/null +++ b/third-party/rapidjson/internal/stack.h @@ -0,0 +1,82 @@ +#ifndef RAPIDJSON_INTERNAL_STACK_H_ +#define RAPIDJSON_INTERNAL_STACK_H_ + +namespace rapidjson { +namespace internal { + +/////////////////////////////////////////////////////////////////////////////// +// Stack + +//! A type-unsafe stack for storing different types of data. +/*! \tparam Allocator Allocator for allocating stack memory. +*/ +template +class Stack { +public: + Stack(Allocator* allocator, size_t stack_capacity) : allocator_(allocator), own_allocator_(0), stack_(0), stack_top_(0), stack_end_(0), stack_capacity_(stack_capacity) { + RAPIDJSON_ASSERT(stack_capacity_ > 0); + if (!allocator_) + own_allocator_ = allocator_ = new Allocator(); + stack_top_ = stack_ = (char*)allocator_->Malloc(stack_capacity_); + stack_end_ = stack_ + stack_capacity_; + } + + ~Stack() { + Allocator::Free(stack_); + delete own_allocator_; // Only delete if it is owned by the stack + } + + void Clear() { /*stack_top_ = 0;*/ stack_top_ = stack_; } + + template + T* Push(size_t count = 1) { + // Expand the stack if needed + if (stack_top_ + sizeof(T) * count >= stack_end_) { + size_t new_capacity = stack_capacity_ * 2; + size_t size = GetSize(); + size_t new_size = GetSize() + sizeof(T) * count; + if (new_capacity < new_size) + new_capacity = new_size; + stack_ = (char*)allocator_->Realloc(stack_, stack_capacity_, new_capacity); + stack_capacity_ = new_capacity; + stack_top_ = stack_ + size; + stack_end_ = stack_ + stack_capacity_; + } + T* ret = (T*)stack_top_; + stack_top_ += sizeof(T) * count; + return ret; + } + + template + T* Pop(size_t count) { + RAPIDJSON_ASSERT(GetSize() >= count * sizeof(T)); + stack_top_ -= count * sizeof(T); + return (T*)stack_top_; + } + + template + T* Top() { + RAPIDJSON_ASSERT(GetSize() >= sizeof(T)); + return (T*)(stack_top_ - sizeof(T)); + } + + template + T* Bottom() { return (T*)stack_; } + + Allocator& GetAllocator() { return *allocator_; } + size_t GetSize() const { return stack_top_ - stack_; } + size_t GetCapacity() const { return stack_capacity_; } + +private: + Allocator* allocator_; + Allocator* own_allocator_; + char *stack_; + char *stack_top_; + char 
*stack_end_; + size_t stack_capacity_; +}; + +} // namespace internal +} // namespace rapidjson + +#endif // RAPIDJSON_STACK_H_ diff --git a/third-party/rapidjson/internal/strfunc.h b/third-party/rapidjson/internal/strfunc.h new file mode 100644 index 0000000000..bbf444fe6d --- /dev/null +++ b/third-party/rapidjson/internal/strfunc.h @@ -0,0 +1,24 @@ +#ifndef RAPIDJSON_INTERNAL_STRFUNC_H_ +#define RAPIDJSON_INTERNAL_STRFUNC_H_ + +namespace rapidjson { +namespace internal { + +//! Custom strlen() which works on different character types. +/*! \tparam Ch Character type (e.g. char, wchar_t, short) + \param s Null-terminated input string. + \return Number of characters in the string. + \note This has the same semantics as strlen(), the return value is not number of Unicode codepoints. +*/ +template +inline SizeType StrLen(const Ch* s) { + const Ch* p = s; + while (*p != '\0') + ++p; + return SizeType(p - s); +} + +} // namespace internal +} // namespace rapidjson + +#endif // RAPIDJSON_INTERNAL_STRFUNC_H_ diff --git a/third-party/rapidjson/license.txt b/third-party/rapidjson/license.txt new file mode 100644 index 0000000000..03d97d163e --- /dev/null +++ b/third-party/rapidjson/license.txt @@ -0,0 +1,19 @@ +Copyright (C) 2011 Milo Yip + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. \ No newline at end of file diff --git a/third-party/rapidjson/prettywriter.h b/third-party/rapidjson/prettywriter.h new file mode 100644 index 0000000000..238ff5ff62 --- /dev/null +++ b/third-party/rapidjson/prettywriter.h @@ -0,0 +1,156 @@ +#ifndef RAPIDJSON_PRETTYWRITER_H_ +#define RAPIDJSON_PRETTYWRITER_H_ + +#include "writer.h" + +namespace rapidjson { + +//! Writer with indentation and spacing. +/*! + \tparam Stream Type of ouptut stream. + \tparam Encoding Encoding of both source strings and output. + \tparam Allocator Type of allocator for allocating memory of stack. +*/ +template, typename Allocator = MemoryPoolAllocator<> > +class PrettyWriter : public Writer { +public: + typedef Writer Base; + typedef typename Base::Ch Ch; + + //! Constructor + /*! \param stream Output stream. + \param allocator User supplied allocator. If it is null, it will create a private one. + \param levelDepth Initial capacity of + */ + PrettyWriter(Stream& stream, Allocator* allocator = 0, size_t levelDepth = Base::kDefaultLevelDepth) : + Base(stream, allocator, levelDepth), indentChar_(' '), indentCharCount_(4) {} + + //! Set custom indentation. + /*! \param indentChar Character for indentation. Must be whitespace character (' ', '\t', '\n', '\r'). 
+ \param indentCharCount Number of indent characters for each indentation level. + \note The default indentation is 4 spaces. + */ + PrettyWriter& SetIndent(Ch indentChar, unsigned indentCharCount) { + RAPIDJSON_ASSERT(indentChar == ' ' || indentChar == '\t' || indentChar == '\n' || indentChar == '\r'); + indentChar_ = indentChar; + indentCharCount_ = indentCharCount; + return *this; + } + + //@name Implementation of Handler. + //@{ + + PrettyWriter& Null() { PrettyPrefix(kNullType); Base::WriteNull(); return *this; } + PrettyWriter& Bool(bool b) { PrettyPrefix(b ? kTrueType : kFalseType); Base::WriteBool(b); return *this; } + PrettyWriter& Int(int i) { PrettyPrefix(kNumberType); Base::WriteInt(i); return *this; } + PrettyWriter& Uint(unsigned u) { PrettyPrefix(kNumberType); Base::WriteUint(u); return *this; } + PrettyWriter& Int64(int64_t i64) { PrettyPrefix(kNumberType); Base::WriteInt64(i64); return *this; } + PrettyWriter& Uint64(uint64_t u64) { PrettyPrefix(kNumberType); Base::WriteUint64(u64); return *this; } + PrettyWriter& Double(double d) { PrettyPrefix(kNumberType); Base::WriteDouble(d); return *this; } + + PrettyWriter& String(const Ch* str, SizeType length, bool copy = false) { + (void)copy; + PrettyPrefix(kStringType); + Base::WriteString(str, length); + return *this; + } + + PrettyWriter& StartObject() { + PrettyPrefix(kObjectType); + new (Base::level_stack_.template Push()) typename Base::Level(false); + Base::WriteStartObject(); + return *this; + } + + PrettyWriter& EndObject(SizeType memberCount = 0) { + (void)memberCount; + RAPIDJSON_ASSERT(Base::level_stack_.GetSize() >= sizeof(typename Base::Level)); + RAPIDJSON_ASSERT(!Base::level_stack_.template Top()->inArray); + bool empty = Base::level_stack_.template Pop(1)->valueCount == 0; + + if (!empty) { + Base::stream_.Put('\n'); + WriteIndent(); + } + Base::WriteEndObject(); + return *this; + } + + PrettyWriter& StartArray() { + PrettyPrefix(kArrayType); + new (Base::level_stack_.template Push()) typename Base::Level(true); + Base::WriteStartArray(); + return *this; + } + + PrettyWriter& EndArray(SizeType memberCount = 0) { + (void)memberCount; + RAPIDJSON_ASSERT(Base::level_stack_.GetSize() >= sizeof(typename Base::Level)); + RAPIDJSON_ASSERT(Base::level_stack_.template Top()->inArray); + bool empty = Base::level_stack_.template Pop(1)->valueCount == 0; + + if (!empty) { + Base::stream_.Put('\n'); + WriteIndent(); + } + Base::WriteEndArray(); + return *this; + } + + //@} + + //! Simpler but slower overload. 
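+ // A minimal usage sketch, added here for illustration (not part of the
+ // original library); StringBuffer comes from stringbuffer.h later in this
+ // patch:
+ // \code
+ // StringBuffer sb;
+ // PrettyWriter<StringBuffer> writer(sb);
+ // writer.SetIndent(' ', 2);
+ // writer.StartObject();
+ // writer.String("key"); writer.Uint(42);
+ // writer.EndObject();
+ // // sb.GetString() now holds "{\n  \"key\": 42\n}"
+ // \endcode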
+ PrettyWriter& String(const Ch* str) { return String(str, internal::StrLen(str)); } + +protected: + void PrettyPrefix(Type type) { + (void)type; + if (Base::level_stack_.GetSize() != 0) { // this value is not at root + typename Base::Level* level = Base::level_stack_.template Top(); + + if (level->inArray) { + if (level->valueCount > 0) { + Base::stream_.Put(','); // add comma if it is not the first element in array + Base::stream_.Put('\n'); + } + else + Base::stream_.Put('\n'); + WriteIndent(); + } + else { // in object + if (level->valueCount > 0) { + if (level->valueCount % 2 == 0) { + Base::stream_.Put(','); + Base::stream_.Put('\n'); + } + else { + Base::stream_.Put(':'); + Base::stream_.Put(' '); + } + } + else + Base::stream_.Put('\n'); + + if (level->valueCount % 2 == 0) + WriteIndent(); + } + if (!level->inArray && level->valueCount % 2 == 0) + RAPIDJSON_ASSERT(type == kStringType); // if it's in object, then even number should be a name + level->valueCount++; + } + else + RAPIDJSON_ASSERT(type == kObjectType || type == kArrayType); + } + + void WriteIndent() { + size_t count = (Base::level_stack_.GetSize() / sizeof(typename Base::Level)) * indentCharCount_; + PutN(Base::stream_, indentChar_, count); + } + + Ch indentChar_; + unsigned indentCharCount_; +}; + +} // namespace rapidjson + +#endif // RAPIDJSON_RAPIDJSON_H_ diff --git a/third-party/rapidjson/rapidjson.h b/third-party/rapidjson/rapidjson.h new file mode 100644 index 0000000000..7acb2aa4fd --- /dev/null +++ b/third-party/rapidjson/rapidjson.h @@ -0,0 +1,525 @@ +#ifndef RAPIDJSON_RAPIDJSON_H_ +#define RAPIDJSON_RAPIDJSON_H_ + +// Copyright (c) 2011-2012 Milo Yip (miloyip@gmail.com) +// Version 0.11 + +#include // malloc(), realloc(), free() +#include // memcpy() + +/////////////////////////////////////////////////////////////////////////////// +// RAPIDJSON_NO_INT64DEFINE + +// Here defines int64_t and uint64_t types in global namespace. +// If user have their own definition, can define RAPIDJSON_NO_INT64DEFINE to disable this. +#ifndef RAPIDJSON_NO_INT64DEFINE +#ifdef _MSC_VER +typedef __int64 int64_t; +typedef unsigned __int64 uint64_t; +#else +#include +#endif +#endif // RAPIDJSON_NO_INT64TYPEDEF + +/////////////////////////////////////////////////////////////////////////////// +// RAPIDJSON_ENDIAN +#define RAPIDJSON_LITTLEENDIAN 0 //!< Little endian machine +#define RAPIDJSON_BIGENDIAN 1 //!< Big endian machine + +//! Endianness of the machine. +/*! GCC provided macro for detecting endianness of the target machine. But other + compilers may not have this. User can define RAPIDJSON_ENDIAN to either + RAPIDJSON_LITTLEENDIAN or RAPIDJSON_BIGENDIAN. +*/ +#ifndef RAPIDJSON_ENDIAN +#ifdef __BYTE_ORDER__ +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +#define RAPIDJSON_ENDIAN RAPIDJSON_LITTLEENDIAN +#else +#define RAPIDJSON_ENDIAN RAPIDJSON_BIGENDIAN +#endif // __BYTE_ORDER__ +#else +#define RAPIDJSON_ENDIAN RAPIDJSON_LITTLEENDIAN // Assumes little endian otherwise. +#endif +#endif // RAPIDJSON_ENDIAN + +/////////////////////////////////////////////////////////////////////////////// +// RAPIDJSON_SSE2/RAPIDJSON_SSE42/RAPIDJSON_SIMD + +// Enable SSE2 optimization. +//#define RAPIDJSON_SSE2 + +// Enable SSE4.2 optimization. +//#define RAPIDJSON_SSE42 + +#if defined(RAPIDJSON_SSE2) || defined(RAPIDJSON_SSE42) +#define RAPIDJSON_SIMD +#endif + +/////////////////////////////////////////////////////////////////////////////// +// RAPIDJSON_NO_SIZETYPEDEFINE + +#ifndef RAPIDJSON_NO_SIZETYPEDEFINE +namespace rapidjson { +//! 
Use 32-bit array/string indices even for 64-bit platform, instead of using size_t. +/*! User may override the SizeType by defining RAPIDJSON_NO_SIZETYPEDEFINE. +*/ +typedef unsigned SizeType; +} // namespace rapidjson +#endif + +/////////////////////////////////////////////////////////////////////////////// +// RAPIDJSON_ASSERT + +//! Assertion. +/*! By default, rapidjson uses C assert() for assertion. + User can override it by defining RAPIDJSON_ASSERT(x) macro. +*/ +#ifndef RAPIDJSON_ASSERT +#include +#define RAPIDJSON_ASSERT(x) assert(x) +#endif // RAPIDJSON_ASSERT + +/////////////////////////////////////////////////////////////////////////////// +// Helpers + +#define RAPIDJSON_MULTILINEMACRO_BEGIN do { +#define RAPIDJSON_MULTILINEMACRO_END \ +} while((void)0, 0) + +namespace rapidjson { + +/////////////////////////////////////////////////////////////////////////////// +// Allocator + +/*! \class rapidjson::Allocator + \brief Concept for allocating, resizing and freeing memory block. + + Note that Malloc() and Realloc() are non-static but Free() is static. + + So if an allocator need to support Free(), it needs to put its pointer in + the header of memory block. + +\code +concept Allocator { + static const bool kNeedFree; //!< Whether this allocator needs to call Free(). + + // Allocate a memory block. + // \param size of the memory block in bytes. + // \returns pointer to the memory block. + void* Malloc(size_t size); + + // Resize a memory block. + // \param originalPtr The pointer to current memory block. Null pointer is permitted. + // \param originalSize The current size in bytes. (Design issue: since some allocator may not book-keep this, explicitly pass to it can save memory.) + // \param newSize the new size in bytes. + void* Realloc(void* originalPtr, size_t originalSize, size_t newSize); + + // Free a memory block. + // \param pointer to the memory block. Null pointer is permitted. + static void Free(void *ptr); +}; +\endcode +*/ + +/////////////////////////////////////////////////////////////////////////////// +// CrtAllocator + +//! C-runtime library allocator. +/*! This class is just wrapper for standard C library memory routines. + \implements Allocator +*/ +class CrtAllocator { +public: + static const bool kNeedFree = true; + void* Malloc(size_t size) { return malloc(size); } + void* Realloc(void* originalPtr, size_t originalSize, size_t newSize) { (void)originalSize; return realloc(originalPtr, newSize); } + static void Free(void *ptr) { free(ptr); } +}; + +/////////////////////////////////////////////////////////////////////////////// +// MemoryPoolAllocator + +//! Default memory allocator used by the parser and DOM. +/*! This allocator allocate memory blocks from pre-allocated memory chunks. + + It does not free memory blocks. And Realloc() only allocate new memory. + + The memory chunks are allocated by BaseAllocator, which is CrtAllocator by default. + + User may also supply a buffer as the first chunk. + + If the user-buffer is full then additional chunks are allocated by BaseAllocator. + + The user-buffer is not deallocated by this allocator. + + \tparam BaseAllocator the allocator type for allocating memory chunks. Default is CrtAllocator. + \implements Allocator +*/ +template +class MemoryPoolAllocator { +public: + static const bool kNeedFree = false; //!< Tell users that no need to call Free() with this allocator. (concept Allocator) + + //! Constructor with chunkSize. + /*! \param chunkSize The size of memory chunk. The default is kDefaultChunkSize. 
+ \param baseAllocator The allocator for allocating memory chunks. + */ + MemoryPoolAllocator(size_t chunkSize = kDefaultChunkCapacity, BaseAllocator* baseAllocator = 0) : + chunkHead_(0), chunk_capacity_(chunkSize), userBuffer_(0), baseAllocator_(baseAllocator), ownBaseAllocator_(0) + { + if (!baseAllocator_) + ownBaseAllocator_ = baseAllocator_ = new BaseAllocator(); + AddChunk(chunk_capacity_); + } + + //! Constructor with user-supplied buffer. + /*! The user buffer will be used firstly. When it is full, memory pool allocates new chunk with chunk size. + + The user buffer will not be deallocated when this allocator is destructed. + + \param buffer User supplied buffer. + \param size Size of the buffer in bytes. It must at least larger than sizeof(ChunkHeader). + \param chunkSize The size of memory chunk. The default is kDefaultChunkSize. + \param baseAllocator The allocator for allocating memory chunks. + */ + MemoryPoolAllocator(char *buffer, size_t size, size_t chunkSize = kDefaultChunkCapacity, BaseAllocator* baseAllocator = 0) : + chunkHead_(0), chunk_capacity_(chunkSize), userBuffer_(buffer), baseAllocator_(baseAllocator), ownBaseAllocator_(0) + { + RAPIDJSON_ASSERT(buffer != 0); + RAPIDJSON_ASSERT(size > sizeof(ChunkHeader)); + chunkHead_ = (ChunkHeader*)buffer; + chunkHead_->capacity = size - sizeof(ChunkHeader); + chunkHead_->size = 0; + chunkHead_->next = 0; + } + + //! Destructor. + /*! This deallocates all memory chunks, excluding the user-supplied buffer. + */ + ~MemoryPoolAllocator() { + Clear(); + delete ownBaseAllocator_; + } + + //! Deallocates all memory chunks, excluding the user-supplied buffer. + void Clear() { + while(chunkHead_ != 0 && chunkHead_ != (ChunkHeader *)userBuffer_) { + ChunkHeader* next = chunkHead_->next; + baseAllocator_->Free(chunkHead_); + chunkHead_ = next; + } + } + + //! Computes the total capacity of allocated memory chunks. + /*! \return total capacity in bytes. + */ + size_t Capacity() { + size_t capacity = 0; + for (ChunkHeader* c = chunkHead_; c != 0; c = c->next) + capacity += c->capacity; + return capacity; + } + + //! Computes the memory blocks allocated. + /*! \return total used bytes. + */ + size_t Size() { + size_t size = 0; + for (ChunkHeader* c = chunkHead_; c != 0; c = c->next) + size += c->size; + return size; + } + + //! Allocates a memory block. (concept Allocator) + void* Malloc(size_t size) { + size = (size + 3) & ~3; // Force aligning size to 4 + + if (chunkHead_->size + size > chunkHead_->capacity) + AddChunk(chunk_capacity_ > size ? chunk_capacity_ : size); + + char *buffer = (char *)(chunkHead_ + 1) + chunkHead_->size; + RAPIDJSON_ASSERT(((uintptr_t)buffer & 3) == 0); // returned buffer is aligned to 4 + chunkHead_->size += size; + + return buffer; + } + + //! 
Resizes a memory block (concept Allocator) + void* Realloc(void* originalPtr, size_t originalSize, size_t newSize) { + if (originalPtr == 0) + return Malloc(newSize); + + // Do not shrink if new size is smaller than original + if (originalSize >= newSize) + return originalPtr; + + // Simply expand it if it is the last allocation and there is sufficient space + if (originalPtr == (char *)(chunkHead_ + 1) + chunkHead_->size - originalSize) { + size_t increment = newSize - originalSize; + increment = (increment + 3) & ~3; // Force aligning size to 4 + if (chunkHead_->size + increment <= chunkHead_->capacity) { + chunkHead_->size += increment; + RAPIDJSON_ASSERT(((uintptr_t)originalPtr & 3) == 0); // returned buffer is aligned to 4 + return originalPtr; + } + } + + // Realloc process: allocate and copy memory, do not free original buffer. + void* newBuffer = Malloc(newSize); + RAPIDJSON_ASSERT(newBuffer != 0); // Do not handle out-of-memory explicitly. + return memcpy(newBuffer, originalPtr, originalSize); + } + + //! Frees a memory block (concept Allocator) + static void Free(void *) {} // Do nothing + +private: + //! Creates a new chunk. + /*! \param capacity Capacity of the chunk in bytes. + */ + void AddChunk(size_t capacity) { + ChunkHeader* chunk = (ChunkHeader*)baseAllocator_->Malloc(sizeof(ChunkHeader) + capacity); + chunk->capacity = capacity; + chunk->size = 0; + chunk->next = chunkHead_; + chunkHead_ = chunk; + } + + static const int kDefaultChunkCapacity = 64 * 1024; //!< Default chunk capacity. + + //! Chunk header for perpending to each chunk. + /*! Chunks are stored as a singly linked list. + */ + struct ChunkHeader { + size_t capacity; //!< Capacity of the chunk in bytes (excluding the header itself). + size_t size; //!< Current size of allocated memory in bytes. + ChunkHeader *next; //!< Next chunk in the linked list. + }; + + ChunkHeader *chunkHead_; //!< Head of the chunk linked-list. Only the head chunk serves allocation. + size_t chunk_capacity_; //!< The minimum capacity of chunk when they are allocated. + char *userBuffer_; //!< User supplied buffer. + BaseAllocator* baseAllocator_; //!< base allocator for allocating memory chunks. + BaseAllocator* ownBaseAllocator_; //!< base allocator created by this object. +}; + +/////////////////////////////////////////////////////////////////////////////// +// Encoding + +/*! \class rapidjson::Encoding + \brief Concept for encoding of Unicode characters. + +\code +concept Encoding { + typename Ch; //! Type of character. + + //! \brief Encode a Unicode codepoint to a buffer. + //! \param buffer pointer to destination buffer to store the result. It should have sufficient size of encoding one character. + //! \param codepoint An unicode codepoint, ranging from 0x0 to 0x10FFFF inclusively. + //! \returns the pointer to the next character after the encoded data. + static Ch* Encode(Ch *buffer, unsigned codepoint); +}; +\endcode +*/ + +/////////////////////////////////////////////////////////////////////////////// +// UTF8 + +//! UTF-8 encoding. +/*! http://en.wikipedia.org/wiki/UTF-8 + \tparam CharType Type for storing 8-bit UTF-8 data. Default is char. 
+ \implements Encoding +*/ +template +struct UTF8 { + typedef CharType Ch; + + static Ch* Encode(Ch *buffer, unsigned codepoint) { + if (codepoint <= 0x7F) + *buffer++ = codepoint & 0xFF; + else if (codepoint <= 0x7FF) { + *buffer++ = 0xC0 | ((codepoint >> 6) & 0xFF); + *buffer++ = 0x80 | ((codepoint & 0x3F)); + } + else if (codepoint <= 0xFFFF) { + *buffer++ = 0xE0 | ((codepoint >> 12) & 0xFF); + *buffer++ = 0x80 | ((codepoint >> 6) & 0x3F); + *buffer++ = 0x80 | (codepoint & 0x3F); + } + else { + RAPIDJSON_ASSERT(codepoint <= 0x10FFFF); + *buffer++ = 0xF0 | ((codepoint >> 18) & 0xFF); + *buffer++ = 0x80 | ((codepoint >> 12) & 0x3F); + *buffer++ = 0x80 | ((codepoint >> 6) & 0x3F); + *buffer++ = 0x80 | (codepoint & 0x3F); + } + return buffer; + } +}; + +/////////////////////////////////////////////////////////////////////////////// +// UTF16 + +//! UTF-16 encoding. +/*! http://en.wikipedia.org/wiki/UTF-16 + \tparam CharType Type for storing 16-bit UTF-16 data. Default is wchar_t. C++11 may use char16_t instead. + \implements Encoding +*/ +template +struct UTF16 { + typedef CharType Ch; + + static Ch* Encode(Ch* buffer, unsigned codepoint) { + if (codepoint <= 0xFFFF) { + RAPIDJSON_ASSERT(codepoint < 0xD800 || codepoint > 0xDFFF); // Code point itself cannot be surrogate pair + *buffer++ = static_cast(codepoint); + } + else { + RAPIDJSON_ASSERT(codepoint <= 0x10FFFF); + unsigned v = codepoint - 0x10000; + *buffer++ = static_cast((v >> 10) + 0xD800); + *buffer++ = (v & 0x3FF) + 0xDC00; + } + return buffer; + } +}; + +/////////////////////////////////////////////////////////////////////////////// +// UTF32 + +//! UTF-32 encoding. +/*! http://en.wikipedia.org/wiki/UTF-32 + \tparam Ch Type for storing 32-bit UTF-32 data. Default is unsigned. C++11 may use char32_t instead. + \implements Encoding +*/ +template +struct UTF32 { + typedef CharType Ch; + + static Ch *Encode(Ch* buffer, unsigned codepoint) { + RAPIDJSON_ASSERT(codepoint <= 0x10FFFF); + *buffer++ = codepoint; + return buffer; + } +}; + +/////////////////////////////////////////////////////////////////////////////// +// Stream + +/*! \class rapidjson::Stream + \brief Concept for reading and writing characters. + + For read-only stream, no need to implement PutBegin(), Put() and PutEnd(). + + For write-only stream, only need to implement Put(). + +\code +concept Stream { + typename Ch; //!< Character type of the stream. + + //! Read the current character from stream without moving the read cursor. + Ch Peek() const; + + //! Read the current character from stream and moving the read cursor to next character. + Ch Take(); + + //! Get the current read cursor. + //! \return Number of characters read from start. + size_t Tell(); + + //! Begin writing operation at the current read pointer. + //! \return The begin writer pointer. + Ch* PutBegin(); + + //! Write a character. + void Put(Ch c); + + //! End the writing operation. + //! \param begin The begin write pointer returned by PutBegin(). + //! \return Number of characters written. + size_t PutEnd(Ch* begin); +} +\endcode +*/ + +//! Put N copies of a character to a stream. +template +inline void PutN(Stream& stream, Ch c, size_t n) { + for (size_t i = 0; i < n; i++) + stream.Put(c); +} + +/////////////////////////////////////////////////////////////////////////////// +// StringStream + +//! Read-only string stream. +/*! 
\implements Stream
+*/
+template <typename Encoding>
+struct GenericStringStream {
+    typedef typename Encoding::Ch Ch;
+
+    GenericStringStream(const Ch *src) : src_(src), head_(src) {}
+
+    Ch Peek() const { return *src_; }
+    Ch Take() { return *src_++; }
+    size_t Tell() const { return src_ - head_; }
+
+    Ch* PutBegin() { RAPIDJSON_ASSERT(false); return 0; }
+    void Put(Ch) { RAPIDJSON_ASSERT(false); }
+    size_t PutEnd(Ch*) { RAPIDJSON_ASSERT(false); return 0; }
+
+    const Ch* src_;  //!< Current read position.
+    const Ch* head_; //!< Original head of the string.
+};
+
+typedef GenericStringStream<UTF8<> > StringStream;
+
+///////////////////////////////////////////////////////////////////////////////
+// InsituStringStream
+
+//! A read-write string stream.
+/*! This string stream is particularly designed for in-situ parsing.
+    \implements Stream
+*/
+template <typename Encoding>
+struct GenericInsituStringStream {
+    typedef typename Encoding::Ch Ch;
+
+    GenericInsituStringStream(Ch *src) : src_(src), dst_(0), head_(src) {}
+
+    // Read
+    Ch Peek() { return *src_; }
+    Ch Take() { return *src_++; }
+    size_t Tell() { return src_ - head_; }
+
+    // Write
+    Ch* PutBegin() { return dst_ = src_; }
+    void Put(Ch c) { RAPIDJSON_ASSERT(dst_ != 0); *dst_++ = c; }
+    size_t PutEnd(Ch* begin) { return dst_ - begin; }
+
+    Ch* src_;
+    Ch* dst_;
+    Ch* head_;
+};
+
+typedef GenericInsituStringStream<UTF8<> > InsituStringStream;
+
+///////////////////////////////////////////////////////////////////////////////
+// Type
+
+//! Type of JSON value
+enum Type {
+    kNullType = 0,   //!< null
+    kFalseType = 1,  //!< false
+    kTrueType = 2,   //!< true
+    kObjectType = 3, //!< object
+    kArrayType = 4,  //!< array
+    kStringType = 5, //!< string
+    kNumberType = 6  //!< number
+};
+
+} // namespace rapidjson
+
+#endif // RAPIDJSON_RAPIDJSON_H_
diff --git a/third-party/rapidjson/reader.h b/third-party/rapidjson/reader.h
new file mode 100644
index 0000000000..96bbc6eb59
--- /dev/null
+++ b/third-party/rapidjson/reader.h
@@ -0,0 +1,683 @@
+#ifndef RAPIDJSON_READER_H_
+#define RAPIDJSON_READER_H_
+
+// Copyright (c) 2011 Milo Yip (miloyip@gmail.com)
+// Version 0.1
+
+#include "rapidjson.h"
+#include "internal/pow10.h"
+#include "internal/stack.h"
+#include <csetjmp>
+
+#ifdef RAPIDJSON_SSE42
+#include <nmmintrin.h>
+#elif defined(RAPIDJSON_SSE2)
+#include <emmintrin.h>
+#endif
+
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable : 4127) // conditional expression is constant
+#endif
+
+#ifndef RAPIDJSON_PARSE_ERROR
+#define RAPIDJSON_PARSE_ERROR(msg, offset) \
+    RAPIDJSON_MULTILINEMACRO_BEGIN \
+    parseError_ = msg; \
+    errorOffset_ = offset; \
+    longjmp(jmpbuf_, 1); \
+    RAPIDJSON_MULTILINEMACRO_END
+#endif
+
+namespace rapidjson {
+
+///////////////////////////////////////////////////////////////////////////////
+// ParseFlag
+
+//! Combination of parse flags
+enum ParseFlag {
+    kParseDefaultFlags = 0, //!< Default parse flags. Non-destructive parsing. Text strings are decoded into an allocated buffer.
+    kParseInsituFlag = 1    //!< In-situ (destructive) parsing.
+};
+
+///////////////////////////////////////////////////////////////////////////////
+// Handler
+
+/*! \class rapidjson::Handler
+    \brief Concept for receiving events from GenericReader upon parsing.
+\code
+concept Handler {
+    typename Ch;
+
+    void Null();
+    void Bool(bool b);
+    void Int(int i);
+    void Uint(unsigned i);
+    void Int64(int64_t i);
+    void Uint64(uint64_t i);
+    void Double(double d);
+    void String(const Ch* str, SizeType length, bool copy);
+    void StartObject();
+    void EndObject(SizeType memberCount);
+    void StartArray();
+    void EndArray(SizeType elementCount);
+};
+\endcode
+*/
+///////////////////////////////////////////////////////////////////////////////
+// BaseReaderHandler
+
+//! Default implementation of Handler.
+/*! This can be used as a base class of any reader handler.
+    \implements Handler
+*/
+template<typename Encoding = UTF8<> >
+struct BaseReaderHandler {
+    typedef typename Encoding::Ch Ch;
+
+    void Default() {}
+    void Null() { Default(); }
+    void Bool(bool) { Default(); }
+    void Int(int) { Default(); }
+    void Uint(unsigned) { Default(); }
+    void Int64(int64_t) { Default(); }
+    void Uint64(uint64_t) { Default(); }
+    void Double(double) { Default(); }
+    void String(const Ch*, SizeType, bool) { Default(); }
+    void StartObject() { Default(); }
+    void EndObject(SizeType) { Default(); }
+    void StartArray() { Default(); }
+    void EndArray(SizeType) { Default(); }
+};
+
+///////////////////////////////////////////////////////////////////////////////
+// SkipWhitespace
+
+//! Skip the JSON white spaces in a stream.
+/*! \param stream An input stream for skipping white spaces.
+    \note This function has SSE2/SSE4.2 specializations.
+*/
+template<typename Stream>
+void SkipWhitespace(Stream& stream) {
+    Stream s = stream; // Use a local copy for optimization
+    while (s.Peek() == ' ' || s.Peek() == '\n' || s.Peek() == '\r' || s.Peek() == '\t')
+        s.Take();
+    stream = s;
+}
+
+#ifdef RAPIDJSON_SSE42
+//! Skip whitespace with the SSE 4.2 pcmpistrm instruction, testing 16 bytes at once.
+inline const char *SkipWhitespace_SIMD(const char* p) {
+    static const char whitespace[16] = " \n\r\t";
+    __m128i w = _mm_loadu_si128((const __m128i *)&whitespace[0]);
+
+    for (;;) {
+        __m128i s = _mm_loadu_si128((const __m128i *)p);
+        unsigned r = _mm_cvtsi128_si32(_mm_cmpistrm(w, s, _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_BIT_MASK | _SIDD_NEGATIVE_POLARITY));
+        if (r == 0) // all 16 characters are whitespace
+            p += 16;
+        else { // some of the characters may be non-whitespace
+#ifdef _MSC_VER // Find the index of the first non-whitespace
+            unsigned long offset;
+            if (_BitScanForward(&offset, r))
+                return p + offset;
+#else
+            if (r != 0)
+                return p + __builtin_ffs(r) - 1;
+#endif
+        }
+    }
+}
+
+#elif defined(RAPIDJSON_SSE2)
+
+//! Skip whitespace with SSE2 instructions, testing 16 bytes at once.
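+// (How it works: the 16-byte chunk at p is compared byte-wise against four
+// vectors splatted with ' ', '\n', '\r' and '\t'; OR-ing the comparison masks
+// marks the whitespace bytes, and ~_mm_movemask_epi8() yields a bitmask whose
+// lowest set bit -- located with _BitScanForward() or __builtin_ffs() -- is
+// the first non-whitespace byte. Note that both SIMD paths read 16 bytes at a
+// time and may read past the terminating '\0'.)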
+inline const char *SkipWhitespace_SIMD(const char* p) { + static const char whitespaces[4][17] = { + " ", + "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r\r", + "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t"}; + + __m128i w0 = _mm_loadu_si128((const __m128i *)&whitespaces[0][0]); + __m128i w1 = _mm_loadu_si128((const __m128i *)&whitespaces[1][0]); + __m128i w2 = _mm_loadu_si128((const __m128i *)&whitespaces[2][0]); + __m128i w3 = _mm_loadu_si128((const __m128i *)&whitespaces[3][0]); + + for (;;) { + __m128i s = _mm_loadu_si128((const __m128i *)p); + __m128i x = _mm_cmpeq_epi8(s, w0); + x = _mm_or_si128(x, _mm_cmpeq_epi8(s, w1)); + x = _mm_or_si128(x, _mm_cmpeq_epi8(s, w2)); + x = _mm_or_si128(x, _mm_cmpeq_epi8(s, w3)); + unsigned short r = ~_mm_movemask_epi8(x); + if (r == 0) // all 16 characters are whitespace + p += 16; + else { // some of characters may be non-whitespace +#ifdef _MSC_VER // Find the index of first non-whitespace + unsigned long offset; + if (_BitScanForward(&offset, r)) + return p + offset; +#else + if (r != 0) + return p + __builtin_ffs(r) - 1; +#endif + } + } +} + +#endif // RAPIDJSON_SSE2 + +#ifdef RAPIDJSON_SIMD +//! Template function specialization for InsituStringStream +template<> inline void SkipWhitespace(InsituStringStream& stream) { + stream.src_ = const_cast(SkipWhitespace_SIMD(stream.src_)); +} + +//! Template function specialization for StringStream +template<> inline void SkipWhitespace(StringStream& stream) { + stream.src_ = SkipWhitespace_SIMD(stream.src_); +} +#endif // RAPIDJSON_SIMD + +/////////////////////////////////////////////////////////////////////////////// +// GenericReader + +//! SAX-style JSON parser. Use Reader for UTF8 encoding and default allocator. +/*! GenericReader parses JSON text from a stream, and send events synchronously to an + object implementing Handler concept. + + It needs to allocate a stack for storing a single decoded string during + non-destructive parsing. + + For in-situ parsing, the decoded string is directly written to the source + text string, no temporary buffer is required. + + A GenericReader object can be reused for parsing multiple JSON text. + + \tparam Encoding Encoding of both the stream and the parse output. + \tparam Allocator Allocator type for stack. +*/ +template > +class GenericReader { +public: + typedef typename Encoding::Ch Ch; + + //! Constructor. + /*! \param allocator Optional allocator for allocating stack memory. (Only use for non-destructive parsing) + \param stackCapacity stack capacity in bytes for storing a single decoded string. (Only use for non-destructive parsing) + */ + GenericReader(Allocator* allocator = 0, size_t stackCapacity = kDefaultStackCapacity) : stack_(allocator, stackCapacity), parseError_(0), errorOffset_(0) {} + + //! Parse JSON text. + /*! \tparam parseFlags Combination of ParseFlag. + \tparam Stream Type of input stream. + \tparam Handler Type of handler which must implement Handler concept. + \param stream Input stream to be parsed. + \param handler The handler to receive events. + \return Whether the parsing is successful. 
+ */ + template + bool Parse(Stream& stream, Handler& handler) { + parseError_ = 0; + errorOffset_ = 0; + +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable : 4611) // interaction between '_setjmp' and C++ object destruction is non-portable +#endif + if (setjmp(jmpbuf_)) { +#ifdef _MSC_VER +#pragma warning(pop) +#endif + stack_.Clear(); + return false; + } + + SkipWhitespace(stream); + + if (stream.Peek() == '\0') + RAPIDJSON_PARSE_ERROR("Text only contains white space(s)", stream.Tell()); + else { + switch (stream.Peek()) { + case '{': ParseObject(stream, handler); break; + case '[': ParseArray(stream, handler); break; + default: RAPIDJSON_PARSE_ERROR("Expect either an object or array at root", stream.Tell()); + } + SkipWhitespace(stream); + + if (stream.Peek() != '\0') + RAPIDJSON_PARSE_ERROR("Nothing should follow the root object or array.", stream.Tell()); + } + + return true; + } + + bool HasParseError() const { return parseError_ != 0; } + const char* GetParseError() const { return parseError_; } + size_t GetErrorOffset() const { return errorOffset_; } + +private: + // Parse object: { string : value, ... } + template + void ParseObject(Stream& stream, Handler& handler) { + RAPIDJSON_ASSERT(stream.Peek() == '{'); + stream.Take(); // Skip '{' + handler.StartObject(); + SkipWhitespace(stream); + + if (stream.Peek() == '}') { + stream.Take(); + handler.EndObject(0); // empty object + return; + } + + for (SizeType memberCount = 0;;) { + if (stream.Peek() != '"') { + RAPIDJSON_PARSE_ERROR("Name of an object member must be a string", stream.Tell()); + break; + } + + ParseString(stream, handler); + SkipWhitespace(stream); + + if (stream.Take() != ':') { + RAPIDJSON_PARSE_ERROR("There must be a colon after the name of object member", stream.Tell()); + break; + } + SkipWhitespace(stream); + + ParseValue(stream, handler); + SkipWhitespace(stream); + + ++memberCount; + + switch(stream.Take()) { + case ',': SkipWhitespace(stream); break; + case '}': handler.EndObject(memberCount); return; + default: RAPIDJSON_PARSE_ERROR("Must be a comma or '}' after an object member", stream.Tell()); + } + } + } + + // Parse array: [ value, ... 
] + template + void ParseArray(Stream& stream, Handler& handler) { + RAPIDJSON_ASSERT(stream.Peek() == '['); + stream.Take(); // Skip '[' + handler.StartArray(); + SkipWhitespace(stream); + + if (stream.Peek() == ']') { + stream.Take(); + handler.EndArray(0); // empty array + return; + } + + for (SizeType elementCount = 0;;) { + ParseValue(stream, handler); + ++elementCount; + SkipWhitespace(stream); + + switch (stream.Take()) { + case ',': SkipWhitespace(stream); break; + case ']': handler.EndArray(elementCount); return; + default: RAPIDJSON_PARSE_ERROR("Must be a comma or ']' after an array element.", stream.Tell()); + } + } + } + + template + void ParseNull(Stream& stream, Handler& handler) { + RAPIDJSON_ASSERT(stream.Peek() == 'n'); + stream.Take(); + + if (stream.Take() == 'u' && stream.Take() == 'l' && stream.Take() == 'l') + handler.Null(); + else + RAPIDJSON_PARSE_ERROR("Invalid value", stream.Tell() - 1); + } + + template + void ParseTrue(Stream& stream, Handler& handler) { + RAPIDJSON_ASSERT(stream.Peek() == 't'); + stream.Take(); + + if (stream.Take() == 'r' && stream.Take() == 'u' && stream.Take() == 'e') + handler.Bool(true); + else + RAPIDJSON_PARSE_ERROR("Invalid value", stream.Tell()); + } + + template + void ParseFalse(Stream& stream, Handler& handler) { + RAPIDJSON_ASSERT(stream.Peek() == 'f'); + stream.Take(); + + if (stream.Take() == 'a' && stream.Take() == 'l' && stream.Take() == 's' && stream.Take() == 'e') + handler.Bool(false); + else + RAPIDJSON_PARSE_ERROR("Invalid value", stream.Tell() - 1); + } + + // Helper function to parse four hexidecimal digits in \uXXXX in ParseString(). + template + unsigned ParseHex4(Stream& stream) { + Stream s = stream; // Use a local copy for optimization + unsigned codepoint = 0; + for (int i = 0; i < 4; i++) { + Ch c = s.Take(); + codepoint <<= 4; + codepoint += c; + if (c >= '0' && c <= '9') + codepoint -= '0'; + else if (c >= 'A' && c <= 'F') + codepoint -= 'A' - 10; + else if (c >= 'a' && c <= 'f') + codepoint -= 'a' - 10; + else + RAPIDJSON_PARSE_ERROR("Incorrect hex digit after \\u escape", s.Tell() - 1); + } + stream = s; // Restore stream + return codepoint; + } + + // Parse string, handling the prefix and suffix double quotes and escaping. 
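+ // The 256-entry escape table below maps the character following a backslash
+ // to its unescaped value ('b' -> '\b', 'n' -> '\n', ...); a zero entry marks
+ // an invalid escape. \uXXXX escapes are decoded with ParseHex4(), and a lead
+ // surrogate (0xD800-0xDBFF) must be followed by a second \uXXXX trail
+ // surrogate (0xDC00-0xDFFF) before the pair is merged into one codepoint and
+ // re-encoded with Encoding::Encode().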
+ template + void ParseString(Stream& stream, Handler& handler) { +#define Z16 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 + static const Ch escape[256] = { + Z16, Z16, 0, 0,'\"', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,'/', + Z16, Z16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,'\\', 0, 0, 0, + 0, 0,'\b', 0, 0, 0,'\f', 0, 0, 0, 0, 0, 0, 0,'\n', 0, + 0, 0,'\r', 0,'\t', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + Z16, Z16, Z16, Z16, Z16, Z16, Z16, Z16 + }; +#undef Z16 + + Stream s = stream; // Use a local copy for optimization + RAPIDJSON_ASSERT(s.Peek() == '\"'); + s.Take(); // Skip '\"' + Ch *head; + SizeType len; + if (parseFlags & kParseInsituFlag) + head = s.PutBegin(); + else + len = 0; + +#define RAPIDJSON_PUT(x) \ + do { \ + if (parseFlags & kParseInsituFlag) \ + s.Put(x); \ + else { \ + *stack_.template Push() = x; \ + ++len; \ + } \ + } while(false) + + for (;;) { + Ch c = s.Take(); + if (c == '\\') { // Escape + Ch e = s.Take(); + if ((sizeof(Ch) == 1 || e < 256) && escape[(unsigned char)e]) + RAPIDJSON_PUT(escape[(unsigned char)e]); + else if (e == 'u') { // Unicode + unsigned codepoint = ParseHex4(s); + if (codepoint >= 0xD800 && codepoint <= 0xDBFF) { // Handle UTF-16 surrogate pair + if (s.Take() != '\\' || s.Take() != 'u') { + RAPIDJSON_PARSE_ERROR("Missing the second \\u in surrogate pair", s.Tell() - 2); + return; + } + unsigned codepoint2 = ParseHex4(s); + if (codepoint2 < 0xDC00 || codepoint2 > 0xDFFF) { + RAPIDJSON_PARSE_ERROR("The second \\u in surrogate pair is invalid", s.Tell() - 2); + return; + } + codepoint = (((codepoint - 0xD800) << 10) | (codepoint2 - 0xDC00)) + 0x10000; + } + + Ch buffer[4]; + SizeType count = SizeType(Encoding::Encode(buffer, codepoint) - &buffer[0]); + + if (parseFlags & kParseInsituFlag) + for (SizeType i = 0; i < count; i++) + s.Put(buffer[i]); + else { + memcpy(stack_.template Push(count), buffer, count * sizeof(Ch)); + len += count; + } + } + else { + RAPIDJSON_PARSE_ERROR("Unknown escape character", stream.Tell() - 1); + return; + } + } + else if (c == '"') { // Closing double quote + if (parseFlags & kParseInsituFlag) { + size_t length = s.PutEnd(head); + RAPIDJSON_ASSERT(length <= 0xFFFFFFFF); + RAPIDJSON_PUT('\0'); // null-terminate the string + handler.String(head, SizeType(length), false); + } + else { + RAPIDJSON_PUT('\0'); + handler.String(stack_.template Pop(len), len - 1, true); + } + stream = s; // restore stream + return; + } + else if (c == '\0') { + RAPIDJSON_PARSE_ERROR("lacks ending quotation before the end of string", stream.Tell() - 1); + return; + } + else if ((unsigned)c < 0x20) { // RFC 4627: unescaped = %x20-21 / %x23-5B / %x5D-10FFFF + RAPIDJSON_PARSE_ERROR("Incorrect unescaped character in string", stream.Tell() - 1); + return; + } + else + RAPIDJSON_PUT(c); // Normal character, just copy + } +#undef RAPIDJSON_PUT + } + + template + void ParseNumber(Stream& stream, Handler& handler) { + Stream s = stream; // Local copy for optimization + // Parse minus + bool minus = false; + if (s.Peek() == '-') { + minus = true; + s.Take(); + } + + // Parse int: zero / ( digit1-9 *DIGIT ) + unsigned i; + bool try64bit = false; + if (s.Peek() == '0') { + i = 0; + s.Take(); + } + else if (s.Peek() >= '1' && s.Peek() <= '9') { + i = s.Take() - '0'; + + if (minus) + while (s.Peek() >= '0' && s.Peek() <= '9') { + if (i >= 214748364) { // 2^31 = 2147483648 + if (i != 214748364 || s.Peek() > '8') { + try64bit = true; + break; + } + } + i = i * 10 + (s.Take() - '0'); + } + else + while (s.Peek() >= '0' && s.Peek() <= '9') { + if (i >= 429496729) { // 2^32 - 1 = 
4294967295 + if (i != 429496729 || s.Peek() > '5') { + try64bit = true; + break; + } + } + i = i * 10 + (s.Take() - '0'); + } + } + else { + RAPIDJSON_PARSE_ERROR("Expect a value here.", stream.Tell()); + return; + } + + // Parse 64bit int + uint64_t i64 = 0; + bool useDouble = false; + if (try64bit) { + i64 = i; + if (minus) + while (s.Peek() >= '0' && s.Peek() <= '9') { + if (i64 >= 922337203685477580uLL) // 2^63 = 9223372036854775808 + if (i64 != 922337203685477580uLL || s.Peek() > '8') { + useDouble = true; + break; + } + i64 = i64 * 10 + (s.Take() - '0'); + } + else + while (s.Peek() >= '0' && s.Peek() <= '9') { + if (i64 >= 1844674407370955161uLL) // 2^64 - 1 = 18446744073709551615 + if (i64 != 1844674407370955161uLL || s.Peek() > '5') { + useDouble = true; + break; + } + i64 = i64 * 10 + (s.Take() - '0'); + } + } + + // Force double for big integer + double d = 0.0; + if (useDouble) { + d = (double)i64; + while (s.Peek() >= '0' && s.Peek() <= '9') { + if (d >= 1E307) { + RAPIDJSON_PARSE_ERROR("Number too big to store in double", stream.Tell()); + return; + } + d = d * 10 + (s.Take() - '0'); + } + } + + // Parse frac = decimal-point 1*DIGIT + int expFrac = 0; + if (s.Peek() == '.') { + if (!useDouble) { + d = try64bit ? (double)i64 : (double)i; + useDouble = true; + } + s.Take(); + + if (s.Peek() >= '0' && s.Peek() <= '9') { + d = d * 10 + (s.Take() - '0'); + --expFrac; + } + else { + RAPIDJSON_PARSE_ERROR("At least one digit in fraction part", stream.Tell()); + return; + } + + while (s.Peek() >= '0' && s.Peek() <= '9') { + if (expFrac > -16) { + d = d * 10 + (s.Peek() - '0'); + --expFrac; + } + s.Take(); + } + } + + // Parse exp = e [ minus / plus ] 1*DIGIT + int exp = 0; + if (s.Peek() == 'e' || s.Peek() == 'E') { + if (!useDouble) { + d = try64bit ? (double)i64 : (double)i; + useDouble = true; + } + s.Take(); + + bool expMinus = false; + if (s.Peek() == '+') + s.Take(); + else if (s.Peek() == '-') { + s.Take(); + expMinus = true; + } + + if (s.Peek() >= '0' && s.Peek() <= '9') { + exp = s.Take() - '0'; + while (s.Peek() >= '0' && s.Peek() <= '9') { + exp = exp * 10 + (s.Take() - '0'); + if (exp > 308) { + RAPIDJSON_PARSE_ERROR("Number too big to store in double", stream.Tell()); + return; + } + } + } + else { + RAPIDJSON_PARSE_ERROR("At least one digit in exponent", s.Tell()); + return; + } + + if (expMinus) + exp = -exp; + } + + // Finish parsing, call event according to the type of number. + if (useDouble) { + d *= internal::Pow10(exp + expFrac); + handler.Double(minus ? -d : d); + } + else { + if (try64bit) { + if (minus) + handler.Int64(-(int64_t)i64); + else + handler.Uint64(i64); + } + else { + if (minus) + handler.Int(-(int)i); + else + handler.Uint(i); + } + } + + stream = s; // restore stream + } + + // Parse any JSON value + template + void ParseValue(Stream& stream, Handler& handler) { + switch (stream.Peek()) { + case 'n': ParseNull (stream, handler); break; + case 't': ParseTrue (stream, handler); break; + case 'f': ParseFalse (stream, handler); break; + case '"': ParseString(stream, handler); break; + case '{': ParseObject(stream, handler); break; + case '[': ParseArray (stream, handler); break; + default : ParseNumber(stream, handler); + } + } + + static const size_t kDefaultStackCapacity = 256; //!< Default stack capacity in bytes for storing a single decoded string. + internal::Stack stack_; //!< A stack for storing decoded string temporarily during non-destructive parsing. + jmp_buf jmpbuf_; //!< setjmp buffer for fast exit from nested parsing function calls. 
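+ // Error state reported by HasParseError(), GetParseError() and
+ // GetErrorOffset(); RAPIDJSON_PARSE_ERROR fills in these two members just
+ // before longjmp() unwinds the nested Parse*() calls back to Parse().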
+ const char* parseError_; + size_t errorOffset_; +}; // class GenericReader + +//! Reader with UTF8 encoding and default allocator. +typedef GenericReader > Reader; + +} // namespace rapidjson + +#ifdef _MSC_VER +#pragma warning(pop) +#endif + +#endif // RAPIDJSON_READER_H_ diff --git a/third-party/rapidjson/stringbuffer.h b/third-party/rapidjson/stringbuffer.h new file mode 100644 index 0000000000..269ae10761 --- /dev/null +++ b/third-party/rapidjson/stringbuffer.h @@ -0,0 +1,49 @@ +#ifndef RAPIDJSON_STRINGBUFFER_H_ +#define RAPIDJSON_STRINGBUFFER_H_ + +#include "rapidjson.h" +#include "internal/stack.h" + +namespace rapidjson { + +//! Represents an in-memory output stream. +/*! + \tparam Encoding Encoding of the stream. + \tparam Allocator type for allocating memory buffer. + \implements Stream +*/ +template +struct GenericStringBuffer { + typedef typename Encoding::Ch Ch; + + GenericStringBuffer(Allocator* allocator = 0, size_t capacity = kDefaultCapacity) : stack_(allocator, capacity) {} + + void Put(Ch c) { *stack_.template Push() = c; } + + void Clear() { stack_.Clear(); } + + const char* GetString() const { + // Push and pop a null terminator. This is safe. + *stack_.template Push() = '\0'; + stack_.template Pop(1); + + return stack_.template Bottom(); + } + + size_t Size() const { return stack_.GetSize(); } + + static const size_t kDefaultCapacity = 256; + mutable internal::Stack stack_; +}; + +typedef GenericStringBuffer > StringBuffer; + +//! Implement specialized version of PutN() with memset() for better performance. +template<> +inline void PutN(GenericStringBuffer >& stream, char c, size_t n) { + memset(stream.stack_.Push(n), c, n * sizeof(c)); +} + +} // namespace rapidjson + +#endif // RAPIDJSON_STRINGBUFFER_H_ diff --git a/third-party/rapidjson/writer.h b/third-party/rapidjson/writer.h new file mode 100644 index 0000000000..d96f2081a9 --- /dev/null +++ b/third-party/rapidjson/writer.h @@ -0,0 +1,241 @@ +#ifndef RAPIDJSON_WRITER_H_ +#define RAPIDJSON_WRITER_H_ + +#include "rapidjson.h" +#include "internal/stack.h" +#include "internal/strfunc.h" +#include // snprintf() or _sprintf_s() +#include // placement new + +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable : 4127) // conditional expression is constant +#endif + +namespace rapidjson { + +//! JSON writer +/*! Writer implements the concept Handler. + It generates JSON text by events to an output stream. + + User may programmatically calls the functions of a writer to generate JSON text. + + On the other side, a writer can also be passed to objects that generates events, + + for example Reader::Parse() and Document::Accept(). + + \tparam Stream Type of ouptut stream. + \tparam Encoding Encoding of both source strings and output. + \implements Handler +*/ +template, typename Allocator = MemoryPoolAllocator<> > +class Writer { +public: + typedef typename Encoding::Ch Ch; + + Writer(Stream& stream, Allocator* allocator = 0, size_t levelDepth = kDefaultLevelDepth) : + stream_(stream), level_stack_(allocator, levelDepth * sizeof(Level)) {} + + //@name Implementation of Handler + //@{ + Writer& Null() { Prefix(kNullType); WriteNull(); return *this; } + Writer& Bool(bool b) { Prefix(b ? 
kTrueType : kFalseType); WriteBool(b); return *this; } + Writer& Int(int i) { Prefix(kNumberType); WriteInt(i); return *this; } + Writer& Uint(unsigned u) { Prefix(kNumberType); WriteUint(u); return *this; } + Writer& Int64(int64_t i64) { Prefix(kNumberType); WriteInt64(i64); return *this; } + Writer& Uint64(uint64_t u64) { Prefix(kNumberType); WriteUint64(u64); return *this; } + Writer& Double(double d) { Prefix(kNumberType); WriteDouble(d); return *this; } + + Writer& String(const Ch* str, SizeType length, bool copy = false) { + (void)copy; + Prefix(kStringType); + WriteString(str, length); + return *this; + } + + Writer& StartObject() { + Prefix(kObjectType); + new (level_stack_.template Push()) Level(false); + WriteStartObject(); + return *this; + } + + Writer& EndObject(SizeType memberCount = 0) { + (void)memberCount; + RAPIDJSON_ASSERT(level_stack_.GetSize() >= sizeof(Level)); + RAPIDJSON_ASSERT(!level_stack_.template Top()->inArray); + level_stack_.template Pop(1); + WriteEndObject(); + return *this; + } + + Writer& StartArray() { + Prefix(kArrayType); + new (level_stack_.template Push()) Level(true); + WriteStartArray(); + return *this; + } + + Writer& EndArray(SizeType elementCount = 0) { + (void)elementCount; + RAPIDJSON_ASSERT(level_stack_.GetSize() >= sizeof(Level)); + RAPIDJSON_ASSERT(level_stack_.template Top()->inArray); + level_stack_.template Pop(1); + WriteEndArray(); + return *this; + } + //@} + + //! Simpler but slower overload. + Writer& String(const Ch* str) { return String(str, internal::StrLen(str)); } + +protected: + //! Information for each nested level + struct Level { + Level(bool inArray_) : inArray(inArray_), valueCount(0) {} + bool inArray; //!< true if in array, otherwise in object + size_t valueCount; //!< number of values in this level + }; + + static const size_t kDefaultLevelDepth = 32; + + void WriteNull() { + stream_.Put('n'); stream_.Put('u'); stream_.Put('l'); stream_.Put('l'); + } + + void WriteBool(bool b) { + if (b) { + stream_.Put('t'); stream_.Put('r'); stream_.Put('u'); stream_.Put('e'); + } + else { + stream_.Put('f'); stream_.Put('a'); stream_.Put('l'); stream_.Put('s'); stream_.Put('e'); + } + } + + void WriteInt(int i) { + if (i < 0) { + stream_.Put('-'); + i = -i; + } + WriteUint((unsigned)i); + } + + void WriteUint(unsigned u) { + char buffer[10]; + char *p = buffer; + do { + *p++ = (u % 10) + '0'; + u /= 10; + } while (u > 0); + + do { + --p; + stream_.Put(*p); + } while (p != buffer); + } + + void WriteInt64(int64_t i64) { + if (i64 < 0) { + stream_.Put('-'); + i64 = -i64; + } + WriteUint64((uint64_t)i64); + } + + void WriteUint64(uint64_t u64) { + char buffer[20]; + char *p = buffer; + do { + *p++ = char(u64 % 10) + '0'; + u64 /= 10; + } while (u64 > 0); + + do { + --p; + stream_.Put(*p); + } while (p != buffer); + } + + //! \todo Optimization with custom double-to-string converter. 
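+ // Note that "%g" emits at most 6 significant digits by default, so doubles
+ // written here do not round-trip exactly; "%.17g" would preserve the value
+ // at the cost of longer output.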
+ void WriteDouble(double d) { + char buffer[100]; +#if _MSC_VER + int ret = sprintf_s(buffer, sizeof(buffer), "%g", d); +#else + int ret = snprintf(buffer, sizeof(buffer), "%g", d); +#endif + RAPIDJSON_ASSERT(ret >= 1); + for (int i = 0; i < ret; i++) + stream_.Put(buffer[i]); + } + + void WriteString(const Ch* str, SizeType length) { + static const char hexDigits[] = "0123456789ABCDEF"; + static const char escape[256] = { +#define Z16 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 + //0 1 2 3 4 5 6 7 8 9 A B C D E F + 'u', 'u', 'u', 'u', 'u', 'u', 'u', 'u', 'b', 't', 'n', 'u', 'f', 'r', 'u', 'u', // 00 + 'u', 'u', 'u', 'u', 'u', 'u', 'u', 'u', 'u', 'u', 'u', 'u', 'u', 'u', 'u', 'u', // 10 + 0, 0, '"', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 20 + Z16, Z16, // 30~4F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,'\\', 0, 0, 0, // 50 + Z16, Z16, Z16, Z16, Z16, Z16, Z16, Z16, Z16, Z16 // 60~FF +#undef Z16 + }; + + stream_.Put('\"'); + for (const Ch* p = str; p != str + length; ++p) { + if ((sizeof(Ch) == 1 || *p < 256) && escape[(unsigned char)*p]) { + stream_.Put('\\'); + stream_.Put(escape[(unsigned char)*p]); + if (escape[(unsigned char)*p] == 'u') { + stream_.Put('0'); + stream_.Put('0'); + stream_.Put(hexDigits[(*p) >> 4]); + stream_.Put(hexDigits[(*p) & 0xF]); + } + } + else + stream_.Put(*p); + } + stream_.Put('\"'); + } + + void WriteStartObject() { stream_.Put('{'); } + void WriteEndObject() { stream_.Put('}'); } + void WriteStartArray() { stream_.Put('['); } + void WriteEndArray() { stream_.Put(']'); } + + void Prefix(Type type) { + (void)type; + if (level_stack_.GetSize() != 0) { // this value is not at root + Level* level = level_stack_.template Top(); + if (level->valueCount > 0) { + if (level->inArray) + stream_.Put(','); // add comma if it is not the first element in array + else // in object + stream_.Put((level->valueCount % 2 == 0) ? ',' : ':'); + } + if (!level->inArray && level->valueCount % 2 == 0) + RAPIDJSON_ASSERT(type == kStringType); // if it's in object, then even number should be a name + level->valueCount++; + } + else + RAPIDJSON_ASSERT(type == kObjectType || type == kArrayType); + } + + Stream& stream_; + internal::Stack level_stack_; + +private: + // Prohibit assignment for VC C4512 warning + Writer& operator=(const Writer& w); +}; + +} // namespace rapidjson + +#ifdef _MSC_VER +#pragma warning(pop) +#endif + +#endif // RAPIDJSON_RAPIDJSON_H_ diff --git a/tools/auto_sanity_test.sh b/tools/auto_sanity_test.sh new file mode 100755 index 0000000000..2d63c0a85f --- /dev/null +++ b/tools/auto_sanity_test.sh @@ -0,0 +1,71 @@ +TMP_DIR="/tmp/rocksdb-sanity-test" + +if [ "$#" -lt 2 ]; then + echo "usage: ./auto_sanity_test.sh [new_commit] [old_commit]" + echo "Missing either [new_commit] or [old_commit], perform sanity check with the latest and 10th latest commits." + recent_commits=`git log | grep -e "^commit [a-z0-9]\+$"| head -n10 | sed -e 's/commit //g'` + commit_new=`echo "$recent_commits" | head -n1` + commit_old=`echo "$recent_commits" | tail -n1` + echo "the most recent commits are:" + echo "$recent_commits" +else + commit_new=$1 + commit_old=$2 +fi + +if [ ! -d $TMP_DIR ]; then + mkdir $TMP_DIR +fi +dir_new="${TMP_DIR}/${commit_new}" +dir_old="${TMP_DIR}/${commit_old}" + +function makestuff() { + echo "make clean" + make clean > /dev/null + echo "make db_sanity_test -j32" + make db_sanity_test -j32 > /dev/null + if [ $? 
-ne 0 ]; then + echo "[ERROR] Failed to perform 'make db_sanity_test'" + exit 1 + fi +} + +rm -r -f $dir_new +rm -r -f $dir_old + +echo "Running db sanity check with commits $commit_new and $commit_old." + +echo "=============================================================" +echo "Making build $commit_new" +makestuff +mv db_sanity_test new_db_sanity_test +echo "Creating db based on the new commit --- $commit_new" +./new_db_sanity_test $dir_new create + +echo "=============================================================" +echo "Making build $commit_old" +makestuff +mv db_sanity_test old_db_sanity_test +echo "Creating db based on the old commit --- $commit_old" +./old_db_sanity_test $dir_old create + +echo "=============================================================" +echo "Verifying new db $dir_new using the old commit --- $commit_old" +./old_db_sanity_test $dir_new verify +if [ $? -ne 0 ]; then + echo "[ERROR] Verification of $dir_new using commit $commit_old failed." + exit 2 +fi + +echo "=============================================================" +echo "Verifying old db $dir_old using the new commit --- $commit_new" +./new_db_sanity_test $dir_old verify +if [ $? -ne 0 ]; then + echo "[ERROR] Verification of $dir_old using commit $commit_new failed." + exit 2 +fi + +rm old_db_sanity_test +rm new_db_sanity_test + +echo "Auto sanity test passed!" diff --git a/tools/blob_store_bench.cc b/tools/blob_store_bench.cc new file mode 100644 index 0000000000..60a0b84a63 --- /dev/null +++ b/tools/blob_store_bench.cc @@ -0,0 +1,280 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#include +#include +#include + +#include "rocksdb/env.h" +#include "util/blob_store.h" +#include "util/testutil.h" + +#define KB 1024LL +#define MB 1024*1024LL +// BlobStore does costly asserts to make sure it's running correctly, which +// significantly impacts benchmark runtime. +// NDEBUG will compile out those asserts. 
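+// (Caveat: a #define appearing here only affects assert() uses preprocessed
+// after this point in this translation unit; asserts inside blob_store.cc are
+// compiled separately and follow that build's own flags.)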
+#ifndef NDEBUG +#define NDEBUG +#endif + +using namespace rocksdb; +using namespace std; + +// used by all threads +uint64_t timeout_sec; +Env *env; +BlobStore* bs; + +namespace { +std::string RandomString(Random* rnd, uint64_t len) { + std::string r; + test::RandomString(rnd, len, &r); + return r; +} +} // namespace + +struct Result { + uint32_t writes; + uint32_t reads; + uint32_t deletes; + uint64_t data_written; + uint64_t data_read; + + void print() { + printf("Total writes = %u\n", writes); + printf("Total reads = %u\n", reads); + printf("Total deletes = %u\n", deletes); + printf("Write throughput = %lf MB/s\n", + (double)data_written / (1024*1024.0) / timeout_sec); + printf("Read throughput = %lf MB/s\n", + (double)data_read / (1024*1024.0) / timeout_sec); + printf("Total throughput = %lf MB/s\n", + (double)(data_read + data_written) / (1024*1024.0) / timeout_sec); + } + + Result() { + writes = reads = deletes = data_read = data_written = 0; + } + + Result (uint32_t writes, uint32_t reads, uint32_t deletes, + uint64_t data_written, uint64_t data_read) : + writes(writes), reads(reads), deletes(deletes), + data_written(data_written), data_read(data_read) {} + +}; + +namespace { +Result operator + (const Result &a, const Result &b) { + return Result(a.writes + b.writes, a.reads + b.reads, + a.deletes + b.deletes, a.data_written + b.data_written, + a.data_read + b.data_read); +} +} // namespace + +struct WorkerThread { + uint64_t data_size_from, data_size_to; + double read_ratio; + uint64_t working_set_size; // start deleting once you reach this + Result result; + atomic stopped; + + WorkerThread(uint64_t data_size_from, uint64_t data_size_to, + double read_ratio, uint64_t working_set_size) : + data_size_from(data_size_from), data_size_to(data_size_to), + read_ratio(read_ratio), working_set_size(working_set_size), + stopped(false) {} + + WorkerThread(const WorkerThread& wt) : + data_size_from(wt.data_size_from), data_size_to(wt.data_size_to), + read_ratio(wt.read_ratio), working_set_size(wt.working_set_size), + stopped(false) {} +}; + +static void WorkerThreadBody(void* arg) { + WorkerThread* t = reinterpret_cast(arg); + Random rnd(5); + string buf; + vector> blobs; + vector random_strings; + + for (int i = 0; i < 10; ++i) { + random_strings.push_back(RandomString(&rnd, t->data_size_to)); + } + + uint64_t total_size = 0; + + uint64_t start_micros = env->NowMicros(); + while (env->NowMicros() - start_micros < timeout_sec * 1000 * 1000) { + if (blobs.size() && rand() < RAND_MAX * t->read_ratio) { + // read + int bi = rand() % blobs.size(); + Status s = bs->Get(blobs[bi].first, &buf); + assert(s.ok()); + t->result.data_read += buf.size(); + t->result.reads++; + } else { + // write + uint64_t size = rand() % (t->data_size_to - t->data_size_from) + + t->data_size_from; + total_size += size; + string put_str = random_strings[rand() % random_strings.size()]; + blobs.push_back(make_pair(Blob(), size)); + Status s = bs->Put(Slice(put_str.data(), size), &blobs.back().first); + assert(s.ok()); + t->result.data_written += size; + t->result.writes++; + } + + while (total_size >= t->working_set_size) { + // delete random + int bi = rand() % blobs.size(); + total_size -= blobs[bi].second; + bs->Delete(blobs[bi].first); + blobs.erase(blobs.begin() + bi); + t->result.deletes++; + } + } + t->stopped.store(true); +} + +namespace { +Result StartBenchmark(vector& config) { + for (auto w : config) { + env->StartThread(WorkerThreadBody, w); + } + + Result result; + + for (auto w : config) { + while 
(!w->stopped.load()); + result = result + w->result; + } + + for (auto w : config) { + delete w; + } + + delete bs; + + return result; +} + +vector SetupBenchmarkBalanced() { + string test_path; + env->GetTestDirectory(&test_path); + test_path.append("/blob_store"); + + // config start + uint32_t block_size = 16*KB; + uint32_t file_size = 1*MB; + double read_write_ratio = 0.5; + uint64_t data_read_from = 16*KB; + uint64_t data_read_to = 32*KB; + int number_of_threads = 10; + uint64_t working_set_size = 5*MB; + timeout_sec = 5; + // config end + + bs = new BlobStore(test_path, block_size, file_size / block_size, 10000, env); + + vector config; + + for (int i = 0; i < number_of_threads; ++i) { + config.push_back(new WorkerThread(data_read_from, + data_read_to, + read_write_ratio, + working_set_size)); + }; + + return config; +} + +vector SetupBenchmarkWriteHeavy() { + string test_path; + env->GetTestDirectory(&test_path); + test_path.append("/blob_store"); + + // config start + uint32_t block_size = 16*KB; + uint32_t file_size = 1*MB; + double read_write_ratio = 0.1; + uint64_t data_read_from = 16*KB; + uint64_t data_read_to = 32*KB; + int number_of_threads = 10; + uint64_t working_set_size = 5*MB; + timeout_sec = 5; + // config end + + bs = new BlobStore(test_path, block_size, file_size / block_size, 10000, env); + + vector config; + + for (int i = 0; i < number_of_threads; ++i) { + config.push_back(new WorkerThread(data_read_from, + data_read_to, + read_write_ratio, + working_set_size)); + }; + + return config; +} + +vector SetupBenchmarkReadHeavy() { + string test_path; + env->GetTestDirectory(&test_path); + test_path.append("/blob_store"); + + // config start + uint32_t block_size = 16*KB; + uint32_t file_size = 1*MB; + double read_write_ratio = 0.9; + uint64_t data_read_from = 16*KB; + uint64_t data_read_to = 32*KB; + int number_of_threads = 10; + uint64_t working_set_size = 5*MB; + timeout_sec = 5; + // config end + + bs = new BlobStore(test_path, block_size, file_size / block_size, 10000, env); + + vector config; + + for (int i = 0; i < number_of_threads; ++i) { + config.push_back(new WorkerThread(data_read_from, + data_read_to, + read_write_ratio, + working_set_size)); + }; + + return config; +} +} // namespace + +int main(int argc, const char** argv) { + srand(33); + env = Env::Default(); + + { + printf("--- Balanced read/write benchmark ---\n"); + vector config = SetupBenchmarkBalanced(); + Result r = StartBenchmark(config); + r.print(); + } + { + printf("--- Write heavy benchmark ---\n"); + vector config = SetupBenchmarkWriteHeavy(); + Result r = StartBenchmark(config); + r.print(); + } + { + printf("--- Read heavy benchmark ---\n"); + vector config = SetupBenchmarkReadHeavy(); + Result r = StartBenchmark(config); + r.print(); + } + + return 0; +} diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py new file mode 100644 index 0000000000..3c93eca36b --- /dev/null +++ b/tools/db_crashtest.py @@ -0,0 +1,150 @@ +#! /usr/bin/env python +import os +import re +import sys +import time +import random +import getopt +import logging +import tempfile +import subprocess +import shutil + +# This script runs and kills db_stress multiple times. It checks consistency +# in case of unsafe crashes in RocksDB. 
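+# Example invocation (values are illustrative, not defaults):
+#   python tools/db_crashtest.py -d 3600 -t 16 -i 60 -o 100000000 -b 4194304
+# i.e. run for one hour total, killing and restarting db_stress every 60
+# seconds with 16 threads and a 4 MB write buffer.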
+
+def main(argv):
+    try:
+        opts, args = getopt.getopt(argv, "hd:t:i:o:b:")
+    except getopt.GetoptError:
+        print("db_crashtest.py -d <duration> -t <#threads> "
+              "-i <interval> -o <ops_per_thread> "
+              "-b <write_buffer_size>\n")
+        sys.exit(2)
+
+    # default values, will be overridden by cmdline args
+    interval = 120  # time for one db_stress instance to run
+    duration = 6000  # total time for this script to test db_stress
+    threads = 32
+    # since we will be killing anyway, use large value for ops_per_thread
+    ops_per_thread = 100000000
+    write_buf_size = 4 * 1024 * 1024
+
+    for opt, arg in opts:
+        if opt == '-h':
+            print("db_crashtest.py -d <duration>"
+                  " -t <#threads> -i <interval>"
+                  " -o <ops_per_thread> -b <write_buffer_size>\n")
+            sys.exit()
+        elif opt == "-d":
+            duration = int(arg)
+        elif opt == "-t":
+            threads = int(arg)
+        elif opt == "-i":
+            interval = int(arg)
+        elif opt == "-o":
+            ops_per_thread = int(arg)
+        elif opt == "-b":
+            write_buf_size = int(arg)
+        else:
+            print("db_crashtest.py -d <duration>"
+                  " -t <#threads> -i <interval>"
+                  " -o <ops_per_thread> -b <write_buffer_size>\n")
+            sys.exit(2)
+
+    exit_time = time.time() + duration
+
+    print("Running blackbox-crash-test with \ninterval_between_crash="
+          + str(interval) + "\ntotal-duration=" + str(duration)
+          + "\nthreads=" + str(threads) + "\nops_per_thread="
+          + str(ops_per_thread) + "\nwrite_buffer_size="
+          + str(write_buf_size) + "\n")
+
+    dbname = tempfile.mkdtemp(prefix='rocksdb_crashtest_')
+
+    while time.time() < exit_time:
+        run_had_errors = False
+        killtime = time.time() + interval
+
+        cmd = re.sub('\s+', ' ', """
+            ./db_stress
+            --test_batches_snapshots=1
+            --ops_per_thread=%s
+            --threads=%s
+            --write_buffer_size=%s
+            --destroy_db_initially=0
+            --reopen=20
+            --readpercent=45
+            --prefixpercent=5
+            --writepercent=35
+            --delpercent=5
+            --iterpercent=10
+            --db=%s
+            --max_key=100000000
+            --disable_seek_compaction=%s
+            --mmap_read=%s
+            --block_size=16384
+            --cache_size=1048576
+            --open_files=500000
+            --verify_checksum=1
+            --sync=0
+            --progress_reports=0
+            --disable_wal=0
+            --disable_data_sync=1
+            --target_file_size_base=2097152
+            --target_file_size_multiplier=2
+            --max_write_buffer_number=3
+            --max_background_compactions=20
+            --max_bytes_for_level_base=10485760
+            --filter_deletes=%s
+            --memtablerep=prefix_hash
+            --prefix_size=7
+            """ % (ops_per_thread,
+                   threads,
+                   write_buf_size,
+                   dbname,
+                   random.randint(0, 1),
+                   random.randint(0, 1),
+                   random.randint(0, 1)))
+
+        child = subprocess.Popen([cmd],
+                                 stderr=subprocess.PIPE, shell=True)
+        print("Running db_stress with pid=%d: %s\n\n"
+              % (child.pid, cmd))
+
+        stop_early = False
+        while time.time() < killtime:
+            if child.poll() is not None:
+                print("WARNING: db_stress ended before kill: exitcode=%d\n"
+                      % child.returncode)
+                stop_early = True
+                break
+            time.sleep(1)
+
+        if not stop_early:
+            if child.poll() is not None:
+                print("WARNING: db_stress ended before kill: exitcode=%d\n"
+                      % child.returncode)
+            else:
+                child.kill()
+                print("KILLED %d\n" % child.pid)
+                time.sleep(1)  # time to stabilize after a kill
+
+        while True:
+            line = child.stderr.readline().strip()
+            if line != '':
+                run_had_errors = True
+                print('***' + line + '^')
+            else:
+                break
+
+        if run_had_errors:
+            sys.exit(2)
+
+        time.sleep(1)  # time to stabilize before the next run
+
+    # we need to clean up after ourselves -- only do this on test success
+    shutil.rmtree(dbname, True)
+
+if __name__ == "__main__":
+    sys.exit(main(sys.argv[1:]))
diff --git a/tools/db_crashtest2.py b/tools/db_crashtest2.py
new file mode 100644
index 0000000000..3ef383afcd
--- /dev/null
+++ b/tools/db_crashtest2.py
@@ -0,0 +1,174 @@
+#! /usr/bin/env python
+import os
+import re
+import sys
+import time
+import random
+import getopt
+import logging
+import tempfile
+import subprocess
+import shutil
+
+# This python script runs db_stress multiple times. Some runs use
+# kill_random_test, which causes rocksdb to crash at various points in code.
+
+def main(argv):
+    try:
+        opts, args = getopt.getopt(argv, "hd:t:k:o:b:")
+    except getopt.GetoptError as err:
+        print str(err)
+        print "db_crashtest2.py -d <duration> -t <#threads> " \
+              "-k <kill_random_test> -o <ops_per_thread> "\
+              "-b <write_buffer_size>\n"
+        sys.exit(2)
+
+    # default values, will be overridden by cmdline args
+    kill_random_test = 97  # kill with probability 1/97 by default
+    duration = 10000  # total time for this script to test db_stress
+    threads = 32
+    ops_per_thread = 200000
+    write_buf_size = 4 * 1024 * 1024
+
+    for opt, arg in opts:
+        if opt == '-h':
+            print "db_crashtest2.py -d <duration> -t <#threads> " \
+                  "-k <kill_random_test> -o <ops_per_thread> " \
+                  "-b <write_buffer_size>\n"
+            sys.exit()
+        elif opt == "-d":
+            duration = int(arg)
+        elif opt == "-t":
+            threads = int(arg)
+        elif opt == "-k":
+            kill_random_test = int(arg)
+        elif opt == "-o":
+            ops_per_thread = int(arg)
+        elif opt == "-b":
+            write_buf_size = int(arg)
+        else:
+            print "unrecognized option " + str(opt) + "\n"
+            print "db_crashtest2.py -d <duration> -t <#threads> " \
+                  "-k <kill_random_test> -o <ops_per_thread> " \
+                  "-b <write_buffer_size>\n"
+            sys.exit(2)
+
+    exit_time = time.time() + duration
+
+    print "Running whitebox-crash-test with \ntotal-duration=" + str(duration) \
+          + "\nthreads=" + str(threads) + "\nops_per_thread=" \
+          + str(ops_per_thread) + "\nwrite_buffer_size=" \
+          + str(write_buf_size) + "\n"
+
+    total_check_mode = 4
+    check_mode = 0
+
+    while time.time() < exit_time:
+        killoption = ""
+        if check_mode == 0:
+            # run with kill_random_test
+            killoption = " --kill_random_test=" + str(kill_random_test)
+            # use large ops per thread since we will kill it anyway
+            additional_opts = "--ops_per_thread=" + \
+                              str(100 * ops_per_thread) + killoption
+        elif check_mode == 1:
+            # normal run with universal compaction mode
+            additional_opts = "--ops_per_thread=" + str(ops_per_thread) + \
+                              " --compaction_style=1"
+        elif check_mode == 2:
+            # normal run with FIFO compaction mode
+            # ops_per_thread is divided by 5 because FIFO compaction
+            # style is quite a bit slower on reads with lot of files
+            additional_opts = "--ops_per_thread=" + str(ops_per_thread / 5) + \
+                              " --compaction_style=2"
+        else:
+            # normal run
+            additional_opts = "--ops_per_thread=" + str(ops_per_thread)
+
+        dbname = tempfile.mkdtemp(prefix='rocksdb_crashtest_')
+        cmd = re.sub('\s+', ' ', """
+            ./db_stress
+            --test_batches_snapshots=%s
+            --threads=%s
+            --write_buffer_size=%s
+            --destroy_db_initially=0
+            --reopen=20
+            --readpercent=45
+            --prefixpercent=5
+            --writepercent=35
+            --delpercent=5
+            --iterpercent=10
+            --db=%s
+            --max_key=100000000
+            --disable_seek_compaction=%s
+            --mmap_read=%s
+            --block_size=16384
+            --cache_size=1048576
+            --open_files=500000
+            --verify_checksum=1
+            --sync=0
+            --progress_reports=0
+            --disable_wal=0
+            --disable_data_sync=1
+            --target_file_size_base=2097152
+            --target_file_size_multiplier=2
+            --max_write_buffer_number=3
+            --max_background_compactions=20
+            --max_bytes_for_level_base=10485760
+            --filter_deletes=%s
+            --memtablerep=prefix_hash
+            --prefix_size=7
+            %s
+            """ % (random.randint(0, 1),
+                   threads,
+                   write_buf_size,
+                   dbname,
+                   random.randint(0, 1),
+                   random.randint(0, 1),
+                   random.randint(0, 1),
+                   additional_opts))
+
+        print "Running:" + cmd + "\n"
+
+        popen = subprocess.Popen([cmd], stdout=subprocess.PIPE,
+                                 stderr=subprocess.STDOUT,
+                                 shell=True)
+        stdoutdata, stderrdata = popen.communicate()
+        retncode = popen.returncode
+        msg = ("check_mode={0}, kill option={1}, exitcode={2}\n".format(
+               check_mode, killoption, retncode))
+        print msg
+        print stdoutdata
+
+        expected = False
+        if (killoption == '') and (retncode == 0):
+            # we expect zero retncode if no kill option
+            expected = True
+        elif killoption != '' and retncode < 0:
+            # we expect negative retncode if kill option was given
+            expected = True
+
+        if not expected:
+            print "TEST FAILED. See kill option and exit code above!!!\n"
+            sys.exit(1)
+
+        stdoutdata = stdoutdata.lower()
+        errorcount = (stdoutdata.count('error') -
+                      stdoutdata.count('got errors 0 times'))
+        print "#times error occurred in output is " + str(errorcount) + "\n"
+
+        if (errorcount > 0):
+            print "TEST FAILED. Output has 'error'!!!\n"
+            sys.exit(2)
+        if (stdoutdata.find('fail') >= 0):
+            print "TEST FAILED. Output has 'fail'!!!\n"
+            sys.exit(2)
+        # we need to clean up after ourselves -- only do this on test success
+        shutil.rmtree(dbname, True)
+
+        check_mode = (check_mode + 1) % total_check_mode
+
+        time.sleep(1)  # time to stabilize after a kill
+
+if __name__ == "__main__":
+    sys.exit(main(sys.argv[1:]))
diff --git a/tools/db_repl_stress.cc b/tools/db_repl_stress.cc
new file mode 100644
index 0000000000..5970bb684e
--- /dev/null
+++ b/tools/db_repl_stress.cc
@@ -0,0 +1,147 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+#ifndef GFLAGS
+#include <cstdio>
+int main() {
+  fprintf(stderr, "Please install gflags to run rocksdb tools\n");
+  return 1;
+}
+#else
+
+#include <cstdio>
+
+#include <gflags/gflags.h>
+
+#include "db/write_batch_internal.h"
+#include "rocksdb/db.h"
+#include "rocksdb/types.h"
+#include "port/atomic_pointer.h"
+#include "util/testutil.h"
+
+// Run a thread to perform Put's.
+// Another thread uses GetUpdatesSince API to keep getting the updates.
+// options :
+// --num_inserts = the num of inserts the first thread should perform.
+// --wal_ttl = the wal ttl for the run.
+
+using namespace rocksdb;
+
+using GFLAGS::ParseCommandLineFlags;
+using GFLAGS::SetUsageMessage;
+
+struct DataPumpThread {
+  size_t no_records;
+  DB* db; // Assumption DB is Open'ed already.
+};
+
+static std::string RandomString(Random* rnd, int len) {
+  std::string r;
+  test::RandomString(rnd, len, &r);
+  return r;
+}
+
+static void DataPumpThreadBody(void* arg) {
+  DataPumpThread* t = reinterpret_cast<DataPumpThread*>(arg);
+  DB* db = t->db;
+  Random rnd(301);
+  size_t i = 0;
+  while (i++ < t->no_records) {
+    if (!db->Put(WriteOptions(), Slice(RandomString(&rnd, 500)),
+                 Slice(RandomString(&rnd, 500))).ok()) {
+      fprintf(stderr, "Error in put\n");
+      exit(1);
+    }
+  }
+}
+
+struct ReplicationThread {
+  port::AtomicPointer stop;
+  DB* db;
+  volatile size_t no_read;
+};
+
+static void ReplicationThreadBody(void* arg) {
+  ReplicationThread* t = reinterpret_cast<ReplicationThread*>(arg);
+  DB* db = t->db;
+  unique_ptr<TransactionLogIterator> iter;
+  SequenceNumber currentSeqNum = 1;
+  while (t->stop.Acquire_Load() != nullptr) {
+    iter.reset();
+    Status s;
+    while (!db->GetUpdatesSince(currentSeqNum, &iter).ok()) {
+      if (t->stop.Acquire_Load() == nullptr) {
+        return;
+      }
+    }
+    fprintf(stderr, "Refreshing iterator\n");
+    for (; iter->Valid(); iter->Next(), t->no_read++, currentSeqNum++) {
+      BatchResult res = iter->GetBatch();
+      if (res.sequence != currentSeqNum) {
+        fprintf(stderr,
+                "Missed a seq no. b/w %ld and %ld\n",
+                (long)currentSeqNum,
+                (long)res.sequence);
+        exit(1);
+      }
+    }
+  }
+}
+
+DEFINE_uint64(num_inserts, 1000, "the num of inserts the first thread should"
+              " perform.");
+DEFINE_uint64(wal_ttl_seconds, 1000, "the wal ttl for the run(in seconds)");
+DEFINE_uint64(wal_size_limit_MB, 10, "the wal size limit for the run"
+              "(in MB)");
+
+int main(int argc, const char** argv) {
+  SetUsageMessage(
+      std::string("\nUSAGE:\n") + std::string(argv[0]) +
+      " --num_inserts=<num_inserts> --wal_ttl_seconds=<WAL_ttl_seconds>" +
+      " --wal_size_limit_MB=<WAL_size_limit_MB>");
+  ParseCommandLineFlags(&argc, const_cast<char***>(&argv), true);
+
+  Env* env = Env::Default();
+  std::string default_db_path;
+  env->GetTestDirectory(&default_db_path);
+  default_db_path += "db_repl_stress";
+  Options options;
+  options.create_if_missing = true;
+  options.WAL_ttl_seconds = FLAGS_wal_ttl_seconds;
+  options.WAL_size_limit_MB = FLAGS_wal_size_limit_MB;
+  DB* db;
+  DestroyDB(default_db_path, options);
+
+  Status s = DB::Open(options, default_db_path, &db);
+
+  if (!s.ok()) {
+    fprintf(stderr, "Could not open DB due to %s\n", s.ToString().c_str());
+    exit(1);
+  }
+
+  DataPumpThread dataPump;
+  dataPump.no_records = FLAGS_num_inserts;
+  dataPump.db = db;
+  env->StartThread(DataPumpThreadBody, &dataPump);
+
+  ReplicationThread replThread;
+  replThread.db = db;
+  replThread.no_read = 0;
+  replThread.stop.Release_Store(env); // store something to make it non-null.
+
+  env->StartThread(ReplicationThreadBody, &replThread);
+  while (replThread.no_read < FLAGS_num_inserts);
+  replThread.stop.Release_Store(nullptr);
+  if (replThread.no_read < dataPump.no_records) {
+    // no. read should be >= no. written.
+    fprintf(stderr, "No. of Records written and read not same\nRead : %zu"
+            " Written : %zu\n", replThread.no_read, dataPump.no_records);
+    exit(1);
+  }
+  fprintf(stderr, "Successful!\n");
+  exit(0);
+}
+
+#endif  // GFLAGS
diff --git a/tools/db_sanity_test.cc b/tools/db_sanity_test.cc
new file mode 100644
index 0000000000..4ae120c21e
--- /dev/null
+++ b/tools/db_sanity_test.cc
@@ -0,0 +1,204 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
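+//
+// This tool populates a DB under several different option configurations
+// ("create"), then re-opens it under the same configurations and checks
+// that all the data is still readable ("verify"). A typical session looks
+// like this (the path below is only illustrative; any writable directory
+// works):
+//
+//   ./db_sanity_test /tmp/rocksdb_sanity create
+//   ./db_sanity_test /tmp/rocksdb_sanity verify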
+
+#include <cstdio>
+#include <cstdlib>
+#include <vector>
+#include <memory>
+
+#include "include/rocksdb/db.h"
+#include "include/rocksdb/options.h"
+#include "include/rocksdb/env.h"
+#include "include/rocksdb/slice.h"
+#include "include/rocksdb/status.h"
+#include "include/rocksdb/comparator.h"
+#include "include/rocksdb/table.h"
+#include "include/rocksdb/slice_transform.h"
+
+namespace rocksdb {
+
+class SanityTest {
+ public:
+  explicit SanityTest(const std::string& path)
+      : env_(Env::Default()), path_(path) {
+    env_->CreateDirIfMissing(path);
+  }
+  virtual ~SanityTest() {}
+
+  virtual std::string Name() const = 0;
+  virtual Options GetOptions() const = 0;
+
+  Status Create() {
+    Options options = GetOptions();
+    options.create_if_missing = true;
+    std::string dbname = path_ + Name();
+    DestroyDB(dbname, options);
+    DB* db;
+    Status s = DB::Open(options, dbname, &db);
+    std::unique_ptr<DB> db_guard(db);
+    if (!s.ok()) {
+      return s;
+    }
+    for (int i = 0; i < 1000000; ++i) {
+      std::string k = "key" + std::to_string(i);
+      std::string v = "value" + std::to_string(i);
+      s = db->Put(WriteOptions(), Slice(k), Slice(v));
+      if (!s.ok()) {
+        return s;
+      }
+    }
+    return Status::OK();
+  }
+  Status Verify() {
+    DB* db;
+    std::string dbname = path_ + Name();
+    Status s = DB::Open(GetOptions(), dbname, &db);
+    std::unique_ptr<DB> db_guard(db);
+    if (!s.ok()) {
+      return s;
+    }
+    for (int i = 0; i < 1000000; ++i) {
+      std::string k = "key" + std::to_string(i);
+      std::string v = "value" + std::to_string(i);
+      std::string result;
+      s = db->Get(ReadOptions(), Slice(k), &result);
+      if (!s.ok()) {
+        return s;
+      }
+      if (result != v) {
+        return Status::Corruption("Unexpected value for key " + k);
+      }
+    }
+    return Status::OK();
+  }
+
+ private:
+  Env* env_;
+  std::string const path_;
+};
+
+class SanityTestBasic : public SanityTest {
+ public:
+  explicit SanityTestBasic(const std::string& path) : SanityTest(path) {}
+  virtual Options GetOptions() const {
+    Options options;
+    options.create_if_missing = true;
+    return options;
+  }
+  virtual std::string Name() const { return "Basic"; }
+};
+
+class SanityTestSpecialComparator : public SanityTest {
+ public:
+  explicit SanityTestSpecialComparator(const std::string& path)
+      : SanityTest(path) {
+    options_.comparator = new NewComparator();
+  }
+  ~SanityTestSpecialComparator() { delete options_.comparator; }
+  virtual Options GetOptions() const { return options_; }
+  virtual std::string Name() const { return "SpecialComparator"; }
+
+ private:
+  class NewComparator : public Comparator {
+   public:
+    virtual const char* Name() const { return "rocksdb.NewComparator"; }
+    virtual int Compare(const Slice& a, const Slice& b) const {
+      return BytewiseComparator()->Compare(a, b);
+    }
+    virtual void FindShortestSeparator(std::string* s, const Slice& l) const {
+      BytewiseComparator()->FindShortestSeparator(s, l);
+    }
+    virtual void FindShortSuccessor(std::string* key) const {
+      BytewiseComparator()->FindShortSuccessor(key);
+    }
+  };
+  Options options_;
+};
+
+class SanityTestZlibCompression : public SanityTest {
+ public:
+  explicit SanityTestZlibCompression(const std::string& path)
+      : SanityTest(path) {
+    options_.compression = kZlibCompression;
+  }
+  virtual Options GetOptions() const { return options_; }
+  virtual std::string Name() const { return "ZlibCompression"; }
+
+ private:
+  Options options_;
+};
+
+class SanityTestPlainTableFactory : public SanityTest {
+ public:
+  explicit SanityTestPlainTableFactory(const std::string& path)
+      : SanityTest(path) {
+    options_.table_factory.reset(NewPlainTableFactory());
+    options_.prefix_extractor.reset(NewFixedPrefixTransform(2));
+    options_.allow_mmap_reads = true;
+  }
+  ~SanityTestPlainTableFactory() {}
+  virtual Options GetOptions() const { return options_; }
+  virtual std::string Name() const { return "PlainTable"; }
+
+ private:
+  Options options_;
+};
+
+namespace {
+bool RunSanityTests(const std::string& command, const std::string& path) {
+  std::vector<SanityTest*> sanity_tests = {
+      new SanityTestBasic(path),
+      new SanityTestSpecialComparator(path),
+      new SanityTestZlibCompression(path),
+      new SanityTestPlainTableFactory(path)};
+
+  if (command == "create") {
+    fprintf(stderr, "Creating...\n");
+  } else {
+    fprintf(stderr, "Verifying...\n");
+  }
+  for (auto sanity_test : sanity_tests) {
+    Status s;
+    fprintf(stderr, "%s -- ", sanity_test->Name().c_str());
+    if (command == "create") {
+      s = sanity_test->Create();
+    } else {
+      assert(command == "verify");
+      s = sanity_test->Verify();
+    }
+    fprintf(stderr, "%s\n", s.ToString().c_str());
+    if (!s.ok()) {
+      fprintf(stderr, "FAIL\n");
+      return false;
+    }
+
+    delete sanity_test;
+  }
+  return true;
+}
+} // namespace
+
+} // namespace rocksdb
+
+int main(int argc, char** argv) {
+  std::string path, command;
+  bool ok = (argc == 3);
+  if (ok) {
+    path = std::string(argv[1]);
+    command = std::string(argv[2]);
+    ok = (command == "create" || command == "verify");
+  }
+  if (!ok) {
+    fprintf(stderr, "Usage: %s <path> [create|verify]\n", argv[0]);
+    exit(1);
+  }
+  if (path.back() != '/') {
+    path += "/";
+  }
+
+  bool sanity_ok = rocksdb::RunSanityTests(command, path);
+
+  return sanity_ok ? 0 : 1;
+}
diff --git a/tools/db_stress.cc b/tools/db_stress.cc
new file mode 100644
index 0000000000..7e3101c07a
--- /dev/null
+++ b/tools/db_stress.cc
@@ -0,0 +1,1808 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// The test uses an array to compare against values written to the database.
+// Keys written to the array are in 1:1 correspondence to the actual values in
+// the database according to the formula in the function GenerateValue.
+
+// Space is reserved in the array from 0 to FLAGS_max_key and values are
+// randomly written/deleted/read from those positions. During verification we
+// compare all the positions in the array. To shorten/elongate the running
+// time, you could change the settings: FLAGS_max_key, FLAGS_ops_per_thread,
+// (sometimes also FLAGS_threads).
+//
+// NOTE that if FLAGS_test_batches_snapshots is set, the test will have
+// different behavior. See comment of the flag for details.
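+//
+// As a simplified sketch of the scheme described above (not a verbatim
+// excerpt from this file, though the names match the helpers defined below),
+// a write path looks like:
+//
+//   uint32_t value_base = thread->rand.Next();
+//   thread->shared->Put(cf, key, value_base);    // remember in the array
+//   size_t sz = GenerateValue(value_base, value, sizeof(value));
+//   db_->Put(write_opts, column_family, Key(key), Slice(value, sz));
+//
+// and verification later recomputes GenerateValue() from the array entry and
+// compares it byte-for-byte with what a Get() on the same key returns.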
+
+#ifndef GFLAGS
+#include <cstdio>
+int main() {
+  fprintf(stderr, "Please install gflags to run rocksdb tools\n");
+  return 1;
+}
+#else
+
+#include <sys/types.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <gflags/gflags.h>
+#include "db/db_impl.h"
+#include "db/version_set.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/cache.h"
+#include "utilities/db_ttl.h"
+#include "rocksdb/env.h"
+#include "rocksdb/write_batch.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/statistics.h"
+#include "port/port.h"
+#include "util/coding.h"
+#include "util/crc32c.h"
+#include "util/histogram.h"
+#include "util/mutexlock.h"
+#include "util/random.h"
+#include "util/testutil.h"
+#include "util/logging.h"
+#include "hdfs/env_hdfs.h"
+#include "utilities/merge_operators.h"
+
+using GFLAGS::ParseCommandLineFlags;
+using GFLAGS::RegisterFlagValidator;
+using GFLAGS::SetUsageMessage;
+
+static const long KB = 1024;
+
+static bool ValidateUint32Range(const char* flagname, uint64_t value) {
+  if (value > std::numeric_limits<uint32_t>::max()) {
+    fprintf(stderr,
+            "Invalid value for --%s: %lu, overflow\n",
+            flagname,
+            (unsigned long)value);
+    return false;
+  }
+  return true;
+}
+
+DEFINE_uint64(seed, 2341234, "Seed for PRNG");
+static const bool FLAGS_seed_dummy __attribute__((unused)) =
+    RegisterFlagValidator(&FLAGS_seed, &ValidateUint32Range);
+
+DEFINE_int64(max_key, 1 * KB * KB,
+             "Max number of key/values to place in database");
+
+DEFINE_int32(column_families, 10, "Number of column families");
+
+DEFINE_bool(test_batches_snapshots, false,
+            "If set, the test uses MultiGet(), MultiPut() and MultiDelete()"
+            " which read/write/delete multiple keys in a batch. In this mode,"
+            " we do not verify db content by comparing the content with the "
+            "pre-allocated array. Instead, we do partial verification inside"
+            " MultiGet() by checking various values in a batch. Benefit of"
+            " this mode:\n"
+            "\t(a) No need to acquire mutexes during writes (less cache "
+            "flushes in multi-core leading to speed up)\n"
+            "\t(b) No long validation at the end (more speed up)\n"
+            "\t(c) Test snapshot and atomicity of batch writes");
+
+DEFINE_int32(threads, 32, "Number of concurrent threads to run.");
+
+DEFINE_int32(ttl, -1,
+             "Opens the db with this ttl value if this is not -1. "
+             "Carefully specify a large value such that verifications on "
+             "deleted values don't fail");
+
+DEFINE_int32(value_size_mult, 8,
+             "Size of value will be this number times rand_int(1,3) bytes");
+
+DEFINE_bool(verify_before_write, false, "Verify before write");
+
+DEFINE_bool(histogram, false, "Print histogram of operation timings");
+
+DEFINE_bool(destroy_db_initially, true,
+            "Destroys the database dir before start if this is true");
+
+DEFINE_bool(verbose, false, "Verbose");
+
+DEFINE_bool(progress_reports, true,
+            "If true, db_stress will report number of finished operations");
+
+DEFINE_int32(write_buffer_size, rocksdb::Options().write_buffer_size,
+             "Number of bytes to buffer in memtable before compacting");
+
+DEFINE_int32(max_write_buffer_number,
+             rocksdb::Options().max_write_buffer_number,
+             "The number of in-memory memtables. "
+             "Each memtable is of size FLAGS_write_buffer_size.");
+
+DEFINE_int32(min_write_buffer_number_to_merge,
+             rocksdb::Options().min_write_buffer_number_to_merge,
+             "The minimum number of write buffers that will be merged together "
+             "before writing to storage. This is cheap because it is an "
+             "in-memory merge. If this feature is not enabled, then all these "
+             "write buffers are flushed to L0 as separate files and this "
+             "increases read amplification because a get request has to check "
+             "in all of these files. Also, an in-memory merge may result in "
+             "writing less data to storage if there are duplicate records in"
+             " each of these individual write buffers.");
+
+DEFINE_int32(open_files, rocksdb::Options().max_open_files,
+             "Maximum number of files to keep open at the same time "
+             "(use default if == 0)");
+
+DEFINE_int64(compressed_cache_size, -1,
+             "Number of bytes to use as a cache of compressed data."
+             " Negative means use default settings.");
+
+DEFINE_int32(compaction_style, rocksdb::Options().compaction_style, "");
+
+DEFINE_int32(level0_file_num_compaction_trigger,
+             rocksdb::Options().level0_file_num_compaction_trigger,
+             "Level0 compaction start trigger");
+
+DEFINE_int32(level0_slowdown_writes_trigger,
+             rocksdb::Options().level0_slowdown_writes_trigger,
+             "Number of files in level-0 that will slow down writes");
+
+DEFINE_int32(level0_stop_writes_trigger,
+             rocksdb::Options().level0_stop_writes_trigger,
+             "Number of files in level-0 that will trigger put stop.");
+
+DEFINE_int32(block_size, rocksdb::Options().block_size,
+             "Number of bytes in a block.");
+
+DEFINE_int32(max_background_compactions,
+             rocksdb::Options().max_background_compactions,
+             "The maximum number of concurrent background compactions "
+             "that can occur in parallel.");
+
+DEFINE_int32(compaction_thread_pool_adjust_interval, 0,
+             "The interval (in milliseconds) to adjust compaction thread pool "
+             "size. Don't change it periodically if the value is 0.");
+
+DEFINE_int32(compaction_thread_pool_varations, 2,
+             "Range of background thread pool size variations when adjusted "
+             "periodically.");
+
+DEFINE_int32(max_background_flushes, rocksdb::Options().max_background_flushes,
+             "The maximum number of concurrent background flushes "
+             "that can occur in parallel.");
+
+DEFINE_int32(universal_size_ratio, 0, "The ratio of file sizes that trigger"
+             " compaction in universal style");
+
+DEFINE_int32(universal_min_merge_width, 0, "The minimum number of files to "
+             "compact in universal style compaction");
+
+DEFINE_int32(universal_max_merge_width, 0, "The max number of files to compact"
+             " in universal style compaction");
+
+DEFINE_int32(universal_max_size_amplification_percent, 0,
+             "The max size amplification for universal style compaction");
+
+DEFINE_int32(clear_column_family_one_in, 1000000,
+             "With a chance of 1/N, delete a column family and then recreate "
+             "it again. If N == 0, never drop/create column families. "
+             "When test_batches_snapshots is true, this flag has no effect");
+
+DEFINE_int64(cache_size, 2 * KB * KB * KB,
+             "Number of bytes to use as a cache of uncompressed data.");
+
+static bool ValidateInt32Positive(const char* flagname, int32_t value) {
+  if (value < 0) {
+    fprintf(stderr, "Invalid value for --%s: %d, must be >=0\n",
+            flagname, value);
+    return false;
+  }
+  return true;
+}
+DEFINE_int32(reopen, 10, "Number of times database reopens");
+static const bool FLAGS_reopen_dummy __attribute__((unused)) =
+    RegisterFlagValidator(&FLAGS_reopen, &ValidateInt32Positive);
+
+DEFINE_int32(bloom_bits, 10, "Bloom filter bits per key. "
+             "Negative means use default settings.");
+
+DEFINE_string(db, "", "Use the db with the following name.");
+
+DEFINE_bool(verify_checksum, false,
+            "Verify checksum for every block read from storage");
+
+DEFINE_bool(mmap_read, rocksdb::EnvOptions().use_mmap_reads,
+            "Allow reads to occur via mmap-ing files");
+
+// Database statistics
+static std::shared_ptr<rocksdb::Statistics> dbstats;
+DEFINE_bool(statistics, false, "Create database statistics");
+
+DEFINE_bool(sync, false, "Sync all writes to disk");
+
+DEFINE_bool(disable_data_sync, false,
+            "If true, do not wait until data is synced to disk.");
+
+DEFINE_bool(use_fsync, false, "If true, issue fsync instead of fdatasync");
+
+DEFINE_int32(kill_random_test, 0,
+             "If non-zero, kill at various points in source code with "
+             "probability 1/this");
+static const bool FLAGS_kill_random_test_dummy __attribute__((unused)) =
+    RegisterFlagValidator(&FLAGS_kill_random_test, &ValidateInt32Positive);
+extern int rocksdb_kill_odds;
+
+DEFINE_bool(disable_wal, false, "If true, do not write WAL for write.");
+
+DEFINE_int32(target_file_size_base, 64 * KB,
+             "Target level-1 file size for compaction");
+
+DEFINE_int32(target_file_size_multiplier, 1,
+             "A multiplier to compute target level-N file size (N >= 2)");
+
+DEFINE_uint64(max_bytes_for_level_base, 256 * KB, "Max bytes for level-1");
+
+DEFINE_int32(max_bytes_for_level_multiplier, 2,
+             "A multiplier to compute max bytes for level-N (N >= 2)");
+
+static bool ValidateInt32Percent(const char* flagname, int32_t value) {
+  if (value < 0 || value > 100) {
+    fprintf(stderr, "Invalid value for --%s: %d, 0<= pct <=100 \n",
+            flagname, value);
+    return false;
+  }
+  return true;
+}
+DEFINE_int32(readpercent, 10,
+             "Ratio of reads to total workload (expressed as a percentage)");
+static const bool FLAGS_readpercent_dummy __attribute__((unused)) =
+    RegisterFlagValidator(&FLAGS_readpercent, &ValidateInt32Percent);
+
+DEFINE_int32(prefixpercent, 20,
+             "Ratio of prefix iterators to total workload (expressed as a"
+             " percentage)");
+static const bool FLAGS_prefixpercent_dummy __attribute__((unused)) =
+    RegisterFlagValidator(&FLAGS_prefixpercent, &ValidateInt32Percent);
+
+DEFINE_int32(writepercent, 45,
+             "Ratio of writes to total workload (expressed as a percentage)");
+static const bool FLAGS_writepercent_dummy __attribute__((unused)) =
+    RegisterFlagValidator(&FLAGS_writepercent, &ValidateInt32Percent);
+
+DEFINE_int32(delpercent, 15,
+             "Ratio of deletes to total workload (expressed as a percentage)");
+static const bool FLAGS_delpercent_dummy __attribute__((unused)) =
+    RegisterFlagValidator(&FLAGS_delpercent, &ValidateInt32Percent);
+
+DEFINE_int32(iterpercent, 10, "Ratio of iterations to total workload"
+             " (expressed as a percentage)");
+static const bool FLAGS_iterpercent_dummy __attribute__((unused)) =
+    RegisterFlagValidator(&FLAGS_iterpercent, &ValidateInt32Percent);
+
+DEFINE_uint64(num_iterations, 10, "Number of iterations per MultiIterate run");
+static const bool FLAGS_num_iterations_dummy __attribute__((unused)) =
+    RegisterFlagValidator(&FLAGS_num_iterations, &ValidateUint32Range);
+
+DEFINE_bool(disable_seek_compaction, false,
+            "Option to disable compaction triggered by read.");
+
+namespace {
+enum rocksdb::CompressionType StringToCompressionType(const char* ctype) {
+  assert(ctype);
+
+  if (!strcasecmp(ctype, "none"))
+    return rocksdb::kNoCompression;
+  else if (!strcasecmp(ctype, "snappy"))
+    return rocksdb::kSnappyCompression;
+  else if (!strcasecmp(ctype, "zlib"))
+    return rocksdb::kZlibCompression;
+  else if (!strcasecmp(ctype, "bzip2"))
+    return rocksdb::kBZip2Compression;
+  else if (!strcasecmp(ctype, "lz4"))
+    return rocksdb::kLZ4Compression;
+  else if (!strcasecmp(ctype, "lz4hc"))
+    return rocksdb::kLZ4HCCompression;
+
+  fprintf(stdout, "Cannot parse compression type '%s'\n", ctype);
+  return rocksdb::kSnappyCompression; // default value
+}
+} // namespace
+
+DEFINE_string(compression_type, "snappy",
+              "Algorithm to use to compress the database");
+static enum rocksdb::CompressionType FLAGS_compression_type_e =
+    rocksdb::kSnappyCompression;
+
+DEFINE_string(hdfs, "", "Name of hdfs environment");
+// posix or hdfs environment
+static rocksdb::Env* FLAGS_env = rocksdb::Env::Default();
+
+DEFINE_uint64(ops_per_thread, 1200000, "Number of operations per thread.");
+static const bool FLAGS_ops_per_thread_dummy __attribute__((unused)) =
+    RegisterFlagValidator(&FLAGS_ops_per_thread, &ValidateUint32Range);
+
+DEFINE_uint64(log2_keys_per_lock, 2, "Log2 of number of keys per lock");
+static const bool FLAGS_log2_keys_per_lock_dummy __attribute__((unused)) =
+    RegisterFlagValidator(&FLAGS_log2_keys_per_lock, &ValidateUint32Range);
+
+DEFINE_int32(purge_redundant_percent, 50,
+             "Percentage of times we want to purge redundant keys in memory "
+             "before flushing");
+static const bool FLAGS_purge_redundant_percent_dummy __attribute__((unused)) =
+    RegisterFlagValidator(&FLAGS_purge_redundant_percent,
+                          &ValidateInt32Percent);
+
+DEFINE_bool(filter_deletes, false, "On true, deletes use KeyMayExist to drop"
+            " the delete if key not present");
+
+enum RepFactory {
+  kSkipList,
+  kHashSkipList,
+  kVectorRep
+};
+
+namespace {
+enum RepFactory StringToRepFactory(const char* ctype) {
+  assert(ctype);
+
+  if (!strcasecmp(ctype, "skip_list"))
+    return kSkipList;
+  else if (!strcasecmp(ctype, "prefix_hash"))
+    return kHashSkipList;
+  else if (!strcasecmp(ctype, "vector"))
+    return kVectorRep;
+
+  fprintf(stdout, "Cannot parse memtablerep %s\n", ctype);
+  return kSkipList;
+}
+} // namespace
+
+static enum RepFactory FLAGS_rep_factory;
+DEFINE_string(memtablerep, "prefix_hash", "");
+
+static bool ValidatePrefixSize(const char* flagname, int32_t value) {
+  if (value < 0 || value > 8) {
+    fprintf(stderr, "Invalid value for --%s: %d. 0 <= PrefixSize <= 8\n",
+            flagname, value);
+    return false;
+  }
+  return true;
+}
+DEFINE_int32(prefix_size, 7, "Control the prefix size for HashSkipListRep");
+static const bool FLAGS_prefix_size_dummy =
+    RegisterFlagValidator(&FLAGS_prefix_size, &ValidatePrefixSize);
+
+DEFINE_bool(use_merge, false, "On true, replaces all writes with a Merge "
+            "that behaves like a Put");
+
+
+namespace rocksdb {
+
+// convert long to a big-endian slice key
+static std::string Key(long val) {
+  std::string little_endian_key;
+  std::string big_endian_key;
+  PutFixed64(&little_endian_key, val);
+  assert(little_endian_key.size() == sizeof(val));
+  big_endian_key.resize(sizeof(val));
+  for (int i=0; i<(int)sizeof(val); i++) {
+    big_endian_key[i] = little_endian_key[sizeof(val) - 1 - i];
+  }
+  return big_endian_key;
+}
+
+static std::string StringToHex(const std::string& str) {
+  std::string result = "0x";
+  char buf[10];
+  for (size_t i = 0; i < str.length(); i++) {
+    snprintf(buf, 10, "%02X", (unsigned char)str[i]);
+    result += buf;
+  }
+  return result;
+}
+
+
+class StressTest;
+namespace {
+
+class Stats {
+ private:
+  double start_;
+  double finish_;
+  double seconds_;
+  long done_;
+  long gets_;
+  long prefixes_;
+  long writes_;
+  long deletes_;
+  long iterator_size_sums_;
+  long founds_;
+  long iterations_;
+  long errors_;
+  int next_report_;
+  size_t bytes_;
+  double last_op_finish_;
+  HistogramImpl hist_;
+
+ public:
+  Stats() { }
+
+  void Start() {
+    next_report_ = 100;
+    hist_.Clear();
+    done_ = 0;
+    gets_ = 0;
+    prefixes_ = 0;
+    writes_ = 0;
+    deletes_ = 0;
+    iterator_size_sums_ = 0;
+    founds_ = 0;
+    iterations_ = 0;
+    errors_ = 0;
+    bytes_ = 0;
+    seconds_ = 0;
+    start_ = FLAGS_env->NowMicros();
+    last_op_finish_ = start_;
+    finish_ = start_;
+  }
+
+  void Merge(const Stats& other) {
+    hist_.Merge(other.hist_);
+    done_ += other.done_;
+    gets_ += other.gets_;
+    prefixes_ += other.prefixes_;
+    writes_ += other.writes_;
+    deletes_ += other.deletes_;
+    iterator_size_sums_ += other.iterator_size_sums_;
+    founds_ += other.founds_;
+    iterations_ += other.iterations_;
+    errors_ += other.errors_;
+    bytes_ += other.bytes_;
+    seconds_ += other.seconds_;
+    if (other.start_ < start_) start_ = other.start_;
+    if (other.finish_ > finish_) finish_ = other.finish_;
+  }
+
+  void Stop() {
+    finish_ = FLAGS_env->NowMicros();
+    seconds_ = (finish_ - start_) * 1e-6;
+  }
+
+  void FinishedSingleOp() {
+    if (FLAGS_histogram) {
+      double now = FLAGS_env->NowMicros();
+      double micros = now - last_op_finish_;
+      hist_.Add(micros);
+      if (micros > 20000) {
+        fprintf(stdout, "long op: %.1f micros%30s\r", micros, "");
+      }
+      last_op_finish_ = now;
+    }
+
+    done_++;
+    if (FLAGS_progress_reports) {
+      if (done_ >= next_report_) {
+        if (next_report_ < 1000) next_report_ += 100;
+        else if (next_report_ < 5000) next_report_ += 500;
+        else if (next_report_ < 10000) next_report_ += 1000;
+        else if (next_report_ < 50000) next_report_ += 5000;
+        else if (next_report_ < 100000) next_report_ += 10000;
+        else if (next_report_ < 500000) next_report_ += 50000;
+        else next_report_ += 100000;
+        fprintf(stdout, "... finished %ld ops%30s\r", done_, "");
+      }
+    }
+  }
+
+  void AddBytesForWrites(int nwrites, size_t nbytes) {
+    writes_ += nwrites;
+    bytes_ += nbytes;
+  }
+
+  void AddGets(int ngets, int nfounds) {
+    founds_ += nfounds;
+    gets_ += ngets;
+  }
+
+  void AddPrefixes(int nprefixes, int count) {
+    prefixes_ += nprefixes;
+    iterator_size_sums_ += count;
+  }
+
+  void AddIterations(int n) {
+    iterations_ += n;
+  }
+
+  void AddDeletes(int n) {
+    deletes_ += n;
+  }
+
+  void AddErrors(int n) {
+    errors_ += n;
+  }
+
+  void Report(const char* name) {
+    std::string extra;
+    if (bytes_ < 1 || done_ < 1) {
+      fprintf(stderr, "No writes or ops?\n");
+      return;
+    }
+
+    double elapsed = (finish_ - start_) * 1e-6;
+    double bytes_mb = bytes_ / 1048576.0;
+    double rate = bytes_mb / elapsed;
+    double throughput = (double)done_/elapsed;
+
+    fprintf(stdout, "%-12s: ", name);
+    fprintf(stdout, "%.3f micros/op %ld ops/sec\n",
+            seconds_ * 1e6 / done_, (long)throughput);
+    fprintf(stdout, "%-12s: Wrote %.2f MB (%.2f MB/sec) (%ld%% of %ld ops)\n",
+            "", bytes_mb, rate, (100*writes_)/done_, done_);
+    fprintf(stdout, "%-12s: Wrote %ld times\n", "", writes_);
+    fprintf(stdout, "%-12s: Deleted %ld times\n", "", deletes_);
+    fprintf(stdout, "%-12s: %ld read and %ld found the key\n", "",
+            gets_, founds_);
+    fprintf(stdout, "%-12s: Prefix scanned %ld times\n", "", prefixes_);
+    fprintf(stdout, "%-12s: Iterator size sum is %ld\n", "",
+            iterator_size_sums_);
+    fprintf(stdout, "%-12s: Iterated %ld times\n", "", iterations_);
+    fprintf(stdout, "%-12s: Got errors %ld times\n", "", errors_);
+
+    if (FLAGS_histogram) {
+      fprintf(stdout, "Microseconds per op:\n%s\n", hist_.ToString().c_str());
+    }
+    fflush(stdout);
+  }
+};
+
+// State shared by all concurrent executions of the same benchmark.
+class SharedState {
+ public:
+  static const uint32_t SENTINEL;
+
+  explicit SharedState(StressTest* stress_test)
+      : cv_(&mu_),
+        seed_(FLAGS_seed),
+        max_key_(FLAGS_max_key),
+        log2_keys_per_lock_(FLAGS_log2_keys_per_lock),
+        num_threads_(FLAGS_threads),
+        num_initialized_(0),
+        num_populated_(0),
+        vote_reopen_(0),
+        num_done_(0),
+        start_(false),
+        start_verify_(false),
+        should_stop_bg_thread_(false),
+        bg_thread_finished_(false),
+        stress_test_(stress_test),
+        verification_failure_(false) {
+    if (FLAGS_test_batches_snapshots) {
+      fprintf(stdout, "No lock creation because test_batches_snapshots set\n");
+      return;
+    }
+    values_.resize(FLAGS_column_families);
+
+    for (int i = 0; i < FLAGS_column_families; ++i) {
+      values_[i] = std::vector<uint32_t>(max_key_, SENTINEL);
+    }
+
+    long num_locks = (max_key_ >> log2_keys_per_lock_);
+    if (max_key_ & ((1 << log2_keys_per_lock_) - 1)) {
+      num_locks++;
+    }
+    fprintf(stdout, "Creating %ld locks\n", num_locks * FLAGS_column_families);
+    key_locks_.resize(FLAGS_column_families);
+    for (int i = 0; i < FLAGS_column_families; ++i) {
+      key_locks_[i] = std::vector<port::Mutex>(num_locks);
+    }
+  }
+
+  ~SharedState() {}
+
+  port::Mutex* GetMutex() {
+    return &mu_;
+  }
+
+  port::CondVar* GetCondVar() {
+    return &cv_;
+  }
+
+  StressTest* GetStressTest() const {
+    return stress_test_;
+  }
+
+  long GetMaxKey() const {
+    return max_key_;
+  }
+
+  uint32_t GetNumThreads() const {
+    return num_threads_;
+  }
+
+  void IncInitialized() {
+    num_initialized_++;
+  }
+
+  void IncOperated() {
+    num_populated_++;
+  }
+
+  void IncDone() {
+    num_done_++;
+  }
+
+  void IncVotedReopen() {
+    vote_reopen_ = (vote_reopen_ + 1) % num_threads_;
+  }
+
+  bool AllInitialized() const {
+    return num_initialized_ >= num_threads_;
+  }
+
+  bool AllOperated() const {
+    return num_populated_ >= num_threads_;
+  }
+
+  bool AllDone() const {
+    return num_done_ >= num_threads_;
+  }
+
+  bool AllVotedReopen() {
+    return (vote_reopen_ == 0);
+  }
+
+  void SetStart() {
+    start_ = true;
+  }
+
+  void SetStartVerify() {
+    start_verify_ = true;
+  }
+
+  bool Started() const {
+    return start_;
+  }
+
+  bool VerifyStarted() const {
+    return start_verify_;
+  }
+
+  void SetVerificationFailure() { verification_failure_.store(true); }
+
+  bool HasVerificationFailedYet() { return verification_failure_.load(); }
+
+  port::Mutex* GetMutexForKey(int cf, long key) {
+    return &key_locks_[cf][key >> log2_keys_per_lock_];
+  }
+
+  void LockColumnFamily(int cf) {
+    for (auto& mutex : key_locks_[cf]) {
+      mutex.Lock();
+    }
+  }
+
+  void UnlockColumnFamily(int cf) {
+    for (auto& mutex : key_locks_[cf]) {
+      mutex.Unlock();
+    }
+  }
+
+  void ClearColumnFamily(int cf) {
+    std::fill(values_[cf].begin(), values_[cf].end(), SENTINEL);
+  }
+
+  void Put(int cf, long key, uint32_t value_base) {
+    values_[cf][key] = value_base;
+  }
+
+  uint32_t Get(int cf, long key) const { return values_[cf][key]; }
+
+  void Delete(int cf, long key) { values_[cf][key] = SENTINEL; }
+
+  uint32_t GetSeed() const { return seed_; }
+
+  void SetShouldStopBgThread() { should_stop_bg_thread_ = true; }
+
+  bool ShouldStopBgThread() { return should_stop_bg_thread_; }
+
+  void SetBgThreadFinish() { bg_thread_finished_ = true; }
+
+  bool BgThreadFinished() const { return bg_thread_finished_; }
+
+ private:
+  port::Mutex mu_;
+  port::CondVar cv_;
+  const uint32_t seed_;
+  const long max_key_;
+  const uint32_t log2_keys_per_lock_;
+  const int num_threads_;
+  long num_initialized_;
+  long num_populated_;
+  long vote_reopen_;
+  long num_done_;
+  bool start_;
+  bool start_verify_;
+  bool should_stop_bg_thread_;
+  bool bg_thread_finished_;
+  StressTest* stress_test_;
+  std::atomic<bool> verification_failure_;
+
+  std::vector<std::vector<uint32_t>> values_;
+  std::vector<std::vector<port::Mutex>> key_locks_;
+};
+
+const uint32_t SharedState::SENTINEL = 0xffffffff;
+
+// Per-thread state for concurrent executions of the same benchmark.
+struct ThreadState {
+  uint32_t tid; // 0..n-1
+  Random rand; // Has different seeds for different threads
+  SharedState* shared;
+  Stats stats;
+
+  ThreadState(uint32_t index, SharedState *shared)
+      : tid(index),
+        rand(1000 + index + shared->GetSeed()),
+        shared(shared) {
+  }
+};
+
+} // namespace
+
+class StressTest {
+ public:
+  StressTest()
+      : cache_(NewLRUCache(FLAGS_cache_size)),
+        compressed_cache_(FLAGS_compressed_cache_size >= 0
+                              ? NewLRUCache(FLAGS_compressed_cache_size)
+                              : nullptr),
+        filter_policy_(FLAGS_bloom_bits >= 0
+                           ? NewBloomFilterPolicy(FLAGS_bloom_bits)
+                           : nullptr),
+        db_(nullptr),
+        new_column_family_name_(0),
+        num_times_reopened_(0) {
+    if (FLAGS_destroy_db_initially) {
+      std::vector<std::string> files;
+      FLAGS_env->GetChildren(FLAGS_db, &files);
+      for (unsigned int i = 0; i < files.size(); i++) {
+        if (Slice(files[i]).starts_with("heap-")) {
+          FLAGS_env->DeleteFile(FLAGS_db + "/" + files[i]);
+        }
+      }
+      DestroyDB(FLAGS_db, Options());
+    }
+  }
+
+  ~StressTest() {
+    for (auto cf : column_families_) {
+      delete cf;
+    }
+    column_families_.clear();
+    delete db_;
+    delete filter_policy_;
+  }
+
+  bool Run() {
+    PrintEnv();
+    Open();
+    SharedState shared(this);
+    uint32_t n = shared.GetNumThreads();
+
+    std::vector<ThreadState*> threads(n);
+    for (uint32_t i = 0; i < n; i++) {
+      threads[i] = new ThreadState(i, &shared);
+      FLAGS_env->StartThread(ThreadBody, threads[i]);
+    }
+    ThreadState bg_thread(0, &shared);
+    if (FLAGS_compaction_thread_pool_adjust_interval > 0) {
+      FLAGS_env->StartThread(PoolSizeChangeThread, &bg_thread);
+    }
+
+    // Each thread goes through the following states:
+    // initializing -> wait for others to init -> read/populate/depopulate
+    // wait for others to operate -> verify -> done
+
+    {
+      MutexLock l(shared.GetMutex());
+      while (!shared.AllInitialized()) {
+        shared.GetCondVar()->Wait();
+      }
+
+      double now = FLAGS_env->NowMicros();
+      fprintf(stdout, "%s Starting database operations\n",
+              FLAGS_env->TimeToString((uint64_t) now/1000000).c_str());
+
+      shared.SetStart();
+      shared.GetCondVar()->SignalAll();
+      while (!shared.AllOperated()) {
+        shared.GetCondVar()->Wait();
+      }
+
+      now = FLAGS_env->NowMicros();
+      if (FLAGS_test_batches_snapshots) {
+        fprintf(stdout, "%s Limited verification already done during gets\n",
+                FLAGS_env->TimeToString((uint64_t) now/1000000).c_str());
+      } else {
+        fprintf(stdout, "%s Starting verification\n",
+                FLAGS_env->TimeToString((uint64_t) now/1000000).c_str());
+      }
+
+      shared.SetStartVerify();
+      shared.GetCondVar()->SignalAll();
+      while (!shared.AllDone()) {
+        shared.GetCondVar()->Wait();
+      }
+    }
+
+    for (unsigned int i = 1; i < n; i++) {
+      threads[0]->stats.Merge(threads[i]->stats);
+    }
+    threads[0]->stats.Report("Stress Test");
+
+    for (unsigned int i = 0; i < n; i++) {
+      delete threads[i];
+      threads[i] = nullptr;
+    }
+    double now = FLAGS_env->NowMicros();
+    if (!FLAGS_test_batches_snapshots) {
+      fprintf(stdout, "%s Verification successful\n",
+              FLAGS_env->TimeToString((uint64_t) now/1000000).c_str());
+    }
+    PrintStatistics();
+
+    if (FLAGS_compaction_thread_pool_adjust_interval > 0) {
+      MutexLock l(shared.GetMutex());
+      shared.SetShouldStopBgThread();
+      while (!shared.BgThreadFinished()) {
+        shared.GetCondVar()->Wait();
+      }
+    }
+
+    if (shared.HasVerificationFailedYet()) {
+      printf("Verification failed :(\n");
+      return false;
+    }
+    return true;
+  }
+
+ private:
+
+  static void ThreadBody(void* v) {
+    ThreadState* thread = reinterpret_cast<ThreadState*>(v);
+    SharedState* shared = thread->shared;
+
+    {
+      MutexLock l(shared->GetMutex());
+      shared->IncInitialized();
+      if (shared->AllInitialized()) {
+        shared->GetCondVar()->SignalAll();
+      }
+      while (!shared->Started()) {
+        shared->GetCondVar()->Wait();
+      }
+    }
+    thread->shared->GetStressTest()->OperateDb(thread);
+
+    {
+      MutexLock l(shared->GetMutex());
+      shared->IncOperated();
+      if (shared->AllOperated()) {
+        shared->GetCondVar()->SignalAll();
+      }
+      while (!shared->VerifyStarted()) {
+        shared->GetCondVar()->Wait();
+      }
+    }
+
+    if (!FLAGS_test_batches_snapshots) {
+      thread->shared->GetStressTest()->VerifyDb(thread);
+    }
+
+    {
+      MutexLock l(shared->GetMutex());
+      shared->IncDone();
+      if (shared->AllDone()) {
+        shared->GetCondVar()->SignalAll();
+      }
+    }
+
+  }
+
+  static void PoolSizeChangeThread(void* v) {
+    assert(FLAGS_compaction_thread_pool_adjust_interval > 0);
+    ThreadState* thread = reinterpret_cast<ThreadState*>(v);
+    SharedState* shared = thread->shared;
+
+    while (true) {
+      {
+        MutexLock l(shared->GetMutex());
+        if (shared->ShouldStopBgThread()) {
+          shared->SetBgThreadFinish();
+          shared->GetCondVar()->SignalAll();
+          return;
+        }
+      }
+
+      auto thread_pool_size_base = FLAGS_max_background_compactions;
+      auto thread_pool_size_var = FLAGS_compaction_thread_pool_varations;
+      int new_thread_pool_size =
+          thread_pool_size_base - thread_pool_size_var +
+          thread->rand.Next() % (thread_pool_size_var * 2 + 1);
+      if (new_thread_pool_size < 1) {
+        new_thread_pool_size = 1;
+      }
+      FLAGS_env->SetBackgroundThreads(new_thread_pool_size);
+      // Sleep up to 3 seconds
+      FLAGS_env->SleepForMicroseconds(
+          thread->rand.Next() % FLAGS_compaction_thread_pool_adjust_interval *
+              1000 +
+          1);
+    }
+  }
+
+  // Given a key K and value V, this puts ("0"+K, "0"+V), ("1"+K, "1"+V), ...
+  // ("9"+K, "9"+V) in DB atomically i.e in a single batch.
+  // Also refer MultiGet.
+  Status MultiPut(ThreadState* thread, const WriteOptions& writeoptions,
+                  ColumnFamilyHandle* column_family, const Slice& key,
+                  const Slice& value, size_t sz) {
+    std::string keys[10] = {"9", "8", "7", "6", "5",
+                            "4", "3", "2", "1", "0"};
+    std::string values[10] = {"9", "8", "7", "6", "5",
+                              "4", "3", "2", "1", "0"};
+    Slice value_slices[10];
+    WriteBatch batch;
+    Status s;
+    for (int i = 0; i < 10; i++) {
+      keys[i] += key.ToString();
+      values[i] += value.ToString();
+      value_slices[i] = values[i];
+      if (FLAGS_use_merge) {
+        batch.Merge(column_family, keys[i], value_slices[i]);
+      } else {
+        batch.Put(column_family, keys[i], value_slices[i]);
+      }
+    }
+
+    s = db_->Write(writeoptions, &batch);
+    if (!s.ok()) {
+      fprintf(stderr, "multiput error: %s\n", s.ToString().c_str());
+      thread->stats.AddErrors(1);
+    } else {
+      // we did 10 writes each of size sz + 1
+      thread->stats.AddBytesForWrites(10, (sz + 1) * 10);
+    }
+
+    return s;
+  }
+
+  // Given a key K, this deletes ("0"+K), ("1"+K),... ("9"+K)
+  // in DB atomically i.e in a single batch. Also refer MultiGet.
+  Status MultiDelete(ThreadState* thread, const WriteOptions& writeoptions,
+                     ColumnFamilyHandle* column_family, const Slice& key) {
+    std::string keys[10] = {"9", "7", "5", "3", "1",
+                            "8", "6", "4", "2", "0"};
+
+    WriteBatch batch;
+    Status s;
+    for (int i = 0; i < 10; i++) {
+      keys[i] += key.ToString();
+      batch.Delete(column_family, keys[i]);
+    }
+
+    s = db_->Write(writeoptions, &batch);
+    if (!s.ok()) {
+      fprintf(stderr, "multidelete error: %s\n", s.ToString().c_str());
+      thread->stats.AddErrors(1);
+    } else {
+      thread->stats.AddDeletes(10);
+    }
+
+    return s;
+  }
+
+  // Given a key K, this gets values for "0"+K, "1"+K,..."9"+K
+  // in the same snapshot, and verifies that all the values are of the form
+  // "0"+V, "1"+V,..."9"+V.
+  // ASSUMES that MultiPut was used to put (K, V) into the DB.
+  Status MultiGet(ThreadState* thread, const ReadOptions& readoptions,
+                  ColumnFamilyHandle* column_family, const Slice& key,
+                  std::string* value) {
+    std::string keys[10] = {"0", "1", "2", "3", "4", "5", "6", "7", "8", "9"};
+    Slice key_slices[10];
+    std::string values[10];
+    ReadOptions readoptionscopy = readoptions;
+    readoptionscopy.snapshot = db_->GetSnapshot();
+    Status s;
+    for (int i = 0; i < 10; i++) {
+      keys[i] += key.ToString();
+      key_slices[i] = keys[i];
+      s = db_->Get(readoptionscopy, column_family, key_slices[i], value);
+      if (!s.ok() && !s.IsNotFound()) {
+        fprintf(stderr, "get error: %s\n", s.ToString().c_str());
+        values[i] = "";
+        thread->stats.AddErrors(1);
+        // we continue after error rather than exiting so that we can
+        // find more errors if any
+      } else if (s.IsNotFound()) {
+        values[i] = "";
+        thread->stats.AddGets(1, 0);
+      } else {
+        values[i] = *value;
+
+        char expected_prefix = (keys[i])[0];
+        char actual_prefix = (values[i])[0];
+        if (actual_prefix != expected_prefix) {
+          fprintf(stderr, "error expected prefix = %c actual = %c\n",
+                  expected_prefix, actual_prefix);
+        }
+        (values[i])[0] = ' '; // blank out the differing character
+        thread->stats.AddGets(1, 1);
+      }
+    }
+    db_->ReleaseSnapshot(readoptionscopy.snapshot);
+
+    // Now that we retrieved all values, check that they all match
+    for (int i = 1; i < 10; i++) {
+      if (values[i] != values[0]) {
+        fprintf(stderr, "error : inconsistent values for key %s: %s, %s\n",
+                key.ToString(true).c_str(), StringToHex(values[0]).c_str(),
+                StringToHex(values[i]).c_str());
+        // we continue after error rather than exiting so that we can
+        // find more errors if any
+      }
+    }
+
+    return s;
+  }
+
+  // Given a key, this does prefix scans for "0"+P, "1"+P,..."9"+P
+  // in the same snapshot where P is the first FLAGS_prefix_size - 1 bytes
+  // of the key. Each of these 10 scans returns a series of values;
+  // each series should be the same length, and it is verified for each
+  // index i that all the i'th values are of the form "0"+V, "1"+V,..."9"+V.
+  // ASSUMES that MultiPut was used to put (K, V)
+  Status MultiPrefixScan(ThreadState* thread, const ReadOptions& readoptions,
+                         ColumnFamilyHandle* column_family,
+                         const Slice& key) {
+    std::string prefixes[10] = {"0", "1", "2", "3", "4",
+                                "5", "6", "7", "8", "9"};
+    Slice prefix_slices[10];
+    ReadOptions readoptionscopy[10];
+    const Snapshot* snapshot = db_->GetSnapshot();
+    Iterator* iters[10];
+    Status s = Status::OK();
+    for (int i = 0; i < 10; i++) {
+      prefixes[i] += key.ToString();
+      prefixes[i].resize(FLAGS_prefix_size);
+      prefix_slices[i] = Slice(prefixes[i]);
+      readoptionscopy[i] = readoptions;
+      readoptionscopy[i].snapshot = snapshot;
+      iters[i] = db_->NewIterator(readoptionscopy[i], column_family);
+      iters[i]->Seek(prefix_slices[i]);
+    }
+
+    int count = 0;
+    while (iters[0]->Valid() && iters[0]->key().starts_with(prefix_slices[0])) {
+      count++;
+      std::string values[10];
+      // get list of all values for this iteration
+      for (int i = 0; i < 10; i++) {
+        // no iterator should finish before the first one
+        assert(iters[i]->Valid() &&
+               iters[i]->key().starts_with(prefix_slices[i]));
+        values[i] = iters[i]->value().ToString();
+
+        char expected_first = (prefixes[i])[0];
+        char actual_first = (values[i])[0];
+
+        if (actual_first != expected_first) {
+          fprintf(stderr, "error expected first = %c actual = %c\n",
+                  expected_first, actual_first);
+        }
+        (values[i])[0] = ' '; // blank out the differing character
+      }
+      // make sure all values are equivalent
+      for (int i = 0; i < 10; i++) {
+        if (values[i] != values[0]) {
+          fprintf(stderr, "error : %d, inconsistent values for prefix %s: %s, %s\n",
+                  i, prefixes[i].c_str(), StringToHex(values[0]).c_str(),
+                  StringToHex(values[i]).c_str());
+          // we continue after error rather than exiting so that we can
+          // find more errors if any
+        }
+        iters[i]->Next();
+      }
+    }
+
+    // cleanup iterators and snapshot
+    for (int i = 0; i < 10; i++) {
+      // if the first iterator finished, they should have all finished
+      assert(!iters[i]->Valid() ||
+             !iters[i]->key().starts_with(prefix_slices[i]));
+      assert(iters[i]->status().ok());
+      delete iters[i];
+    }
+    db_->ReleaseSnapshot(snapshot);
+
+    if (s.ok()) {
+      thread->stats.AddPrefixes(1, count);
+    } else {
+      thread->stats.AddErrors(1);
+    }
+
+    return s;
+  }
+
+  // Given a key K, this creates an iterator which scans to K and then
+  // does a random sequence of Next/Prev operations.
+ Status MultiIterate(ThreadState* thread, const ReadOptions& readoptions, + ColumnFamilyHandle* column_family, const Slice& key) { + Status s; + const Snapshot* snapshot = db_->GetSnapshot(); + ReadOptions readoptionscopy = readoptions; + readoptionscopy.snapshot = snapshot; + unique_ptr iter(db_->NewIterator(readoptionscopy, column_family)); + + iter->Seek(key); + for (uint64_t i = 0; i < FLAGS_num_iterations && iter->Valid(); i++) { + if (thread->rand.OneIn(2)) { + iter->Next(); + } else { + iter->Prev(); + } + } + + if (s.ok()) { + thread->stats.AddIterations(1); + } else { + thread->stats.AddErrors(1); + } + + db_->ReleaseSnapshot(snapshot); + + return s; + } + + void OperateDb(ThreadState* thread) { + ReadOptions read_opts(FLAGS_verify_checksum, true); + WriteOptions write_opts; + char value[100]; + long max_key = thread->shared->GetMaxKey(); + std::string from_db; + if (FLAGS_sync) { + write_opts.sync = true; + } + write_opts.disableWAL = FLAGS_disable_wal; + const int prefixBound = (int)FLAGS_readpercent + (int)FLAGS_prefixpercent; + const int writeBound = prefixBound + (int)FLAGS_writepercent; + const int delBound = writeBound + (int)FLAGS_delpercent; + + thread->stats.Start(); + for (uint64_t i = 0; i < FLAGS_ops_per_thread; i++) { + if (thread->shared->HasVerificationFailedYet()) { + break; + } + if (i != 0 && (i % (FLAGS_ops_per_thread / (FLAGS_reopen + 1))) == 0) { + { + thread->stats.FinishedSingleOp(); + MutexLock l(thread->shared->GetMutex()); + thread->shared->IncVotedReopen(); + if (thread->shared->AllVotedReopen()) { + thread->shared->GetStressTest()->Reopen(); + thread->shared->GetCondVar()->SignalAll(); + } + else { + thread->shared->GetCondVar()->Wait(); + } + // Commenting this out as we don't want to reset stats on each open. + // thread->stats.Start(); + } + } + + if (!FLAGS_test_batches_snapshots && + FLAGS_clear_column_family_one_in != 0) { + if (thread->rand.OneIn(FLAGS_clear_column_family_one_in)) { + // drop column family and then create it again (can't drop default) + int cf = thread->rand.Next() % (FLAGS_column_families - 1) + 1; + std::string new_name = + std::to_string(new_column_family_name_.fetch_add(1)); + { + MutexLock l(thread->shared->GetMutex()); + fprintf( + stdout, + "[CF %d] Dropping and recreating column family. 
new name: %s\n", + cf, new_name.c_str()); + } + thread->shared->LockColumnFamily(cf); + Status s __attribute__((unused)); + s = db_->DropColumnFamily(column_families_[cf]); + delete column_families_[cf]; + assert(s.ok()); + s = db_->CreateColumnFamily(ColumnFamilyOptions(options_), new_name, + &column_families_[cf]); + column_family_names_[cf] = new_name; + thread->shared->ClearColumnFamily(cf); + assert(s.ok()); + thread->shared->UnlockColumnFamily(cf); + } + } + + long rand_key = thread->rand.Next() % max_key; + int rand_column_family = thread->rand.Next() % FLAGS_column_families; + std::string keystr = Key(rand_key); + Slice key = keystr; + int prob_op = thread->rand.Uniform(100); + std::unique_ptr l; + if (!FLAGS_test_batches_snapshots) { + l.reset(new MutexLock( + thread->shared->GetMutexForKey(rand_column_family, rand_key))); + } + auto column_family = column_families_[rand_column_family]; + + if (prob_op >= 0 && prob_op < (int)FLAGS_readpercent) { + // OPERATION read + if (!FLAGS_test_batches_snapshots) { + Status s = db_->Get(read_opts, column_family, key, &from_db); + if (s.ok()) { + // found case + thread->stats.AddGets(1, 1); + } else if (s.IsNotFound()) { + // not found case + thread->stats.AddGets(1, 0); + } else { + // errors case + thread->stats.AddErrors(1); + } + } else { + MultiGet(thread, read_opts, column_family, key, &from_db); + } + } else if ((int)FLAGS_readpercent <= prob_op && prob_op < prefixBound) { + // OPERATION prefix scan + // keys are 8 bytes long, prefix size is FLAGS_prefix_size. There are + // (8 - FLAGS_prefix_size) bytes besides the prefix. So there will + // be 2 ^ ((8 - FLAGS_prefix_size) * 8) possible keys with the same + // prefix + if (!FLAGS_test_batches_snapshots) { + Slice prefix = Slice(key.data(), FLAGS_prefix_size); + Iterator* iter = db_->NewIterator(read_opts, column_family); + int64_t count = 0; + for (iter->Seek(prefix); + iter->Valid() && iter->key().starts_with(prefix); iter->Next()) { + ++count; + } + assert(count <= + (static_cast(1) << ((8 - FLAGS_prefix_size) * 8))); + if (iter->status().ok()) { + thread->stats.AddPrefixes(1, count); + } else { + thread->stats.AddErrors(1); + } + delete iter; + } else { + MultiPrefixScan(thread, read_opts, column_family, key); + } + } else if (prefixBound <= prob_op && prob_op < writeBound) { + // OPERATION write + uint32_t value_base = thread->rand.Next(); + size_t sz = GenerateValue(value_base, value, sizeof(value)); + Slice v(value, sz); + if (!FLAGS_test_batches_snapshots) { + if (FLAGS_verify_before_write) { + std::string keystr2 = Key(rand_key); + Slice k = keystr2; + Status s = db_->Get(read_opts, column_family, k, &from_db); + if (VerifyValue(rand_column_family, rand_key, read_opts, + thread->shared, from_db, s, true) == false) { + break; + } + } + thread->shared->Put(rand_column_family, rand_key, value_base); + if (FLAGS_use_merge) { + db_->Merge(write_opts, column_family, key, v); + } else { + db_->Put(write_opts, column_family, key, v); + } + thread->stats.AddBytesForWrites(1, sz); + } else { + MultiPut(thread, write_opts, column_family, key, v, sz); + } + PrintKeyValue(rand_column_family, rand_key, value, sz); + } else if (writeBound <= prob_op && prob_op < delBound) { + // OPERATION delete + if (!FLAGS_test_batches_snapshots) { + thread->shared->Delete(rand_column_family, rand_key); + db_->Delete(write_opts, column_family, key); + thread->stats.AddDeletes(1); + } else { + MultiDelete(thread, write_opts, column_family, key); + } + } else { + // OPERATION iterate + MultiIterate(thread, 
read_opts, column_family, key); + } + thread->stats.FinishedSingleOp(); + } + + thread->stats.Stop(); + } + + void VerifyDb(ThreadState* thread) const { + ReadOptions options(FLAGS_verify_checksum, true); + auto shared = thread->shared; + static const long max_key = shared->GetMaxKey(); + static const long keys_per_thread = max_key / shared->GetNumThreads(); + long start = keys_per_thread * thread->tid; + long end = start + keys_per_thread; + if (thread->tid == shared->GetNumThreads() - 1) { + end = max_key; + } + for (size_t cf = 0; cf < column_families_.size(); ++cf) { + if (thread->shared->HasVerificationFailedYet()) { + break; + } + if (!thread->rand.OneIn(2)) { + // Use iterator to verify this range + unique_ptr iter( + db_->NewIterator(options, column_families_[cf])); + iter->Seek(Key(start)); + for (long i = start; i < end; i++) { + if (thread->shared->HasVerificationFailedYet()) { + break; + } + // TODO(ljin): update "long" to uint64_t + // Reseek when the prefix changes + if (i % (static_cast(1) << 8 * (8 - FLAGS_prefix_size)) == + 0) { + iter->Seek(Key(i)); + } + std::string from_db; + std::string keystr = Key(i); + Slice k = keystr; + Status s = iter->status(); + if (iter->Valid()) { + if (iter->key().compare(k) > 0) { + s = Status::NotFound(Slice()); + } else if (iter->key().compare(k) == 0) { + from_db = iter->value().ToString(); + iter->Next(); + } else if (iter->key().compare(k) < 0) { + VerificationAbort(shared, "An out of range key was found", cf, i); + } + } else { + // The iterator found no value for the key in question, so do not + // move to the next item in the iterator + s = Status::NotFound(Slice()); + } + VerifyValue(cf, i, options, shared, from_db, s, true); + if (from_db.length()) { + PrintKeyValue(cf, i, from_db.data(), from_db.length()); + } + } + } else { + // Use Get to verify this range + for (long i = start; i < end; i++) { + if (thread->shared->HasVerificationFailedYet()) { + break; + } + std::string from_db; + std::string keystr = Key(i); + Slice k = keystr; + Status s = db_->Get(options, column_families_[cf], k, &from_db); + VerifyValue(cf, i, options, shared, from_db, s, true); + if (from_db.length()) { + PrintKeyValue(cf, i, from_db.data(), from_db.length()); + } + } + } + } + } + + void VerificationAbort(SharedState* shared, std::string msg, int cf, + long key) const { + printf("Verification failed for column family %d key %ld: %s\n", cf, key, + msg.c_str()); + shared->SetVerificationFailure(); + } + + bool VerifyValue(int cf, long key, const ReadOptions& opts, + SharedState* shared, const std::string& value_from_db, + Status s, bool strict = false) const { + if (shared->HasVerificationFailedYet()) { + return false; + } + // compare value_from_db with the value in the shared state + char value[100]; + uint32_t value_base = shared->Get(cf, key); + if (value_base == SharedState::SENTINEL && !strict) { + return true; + } + + if (s.ok()) { + if (value_base == SharedState::SENTINEL) { + VerificationAbort(shared, "Unexpected value found", cf, key); + return false; + } + size_t sz = GenerateValue(value_base, value, sizeof(value)); + if (value_from_db.length() != sz) { + VerificationAbort(shared, "Length of value read is not equal", cf, key); + return false; + } + if (memcmp(value_from_db.data(), value, sz) != 0) { + VerificationAbort(shared, "Contents of value read don't match", cf, + key); + return false; + } + } else { + if (value_base != SharedState::SENTINEL) { + VerificationAbort(shared, "Value not found: " + s.ToString(), cf, key); + return false; + } 
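+      // Key absent from the DB while the shared state holds SENTINEL:
+      // both sides agree the key was deleted, so this is consistent.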
+    }
+    return true;
+  }
+
+  static void PrintKeyValue(int cf, uint32_t key, const char* value,
+                            size_t sz) {
+    if (!FLAGS_verbose) {
+      return;
+    }
+    fprintf(stdout, "[CF %d] %u ==> (%u) ", cf, key, (unsigned int)sz);
+    for (size_t i = 0; i < sz; i++) {
+      fprintf(stdout, "%X", value[i]);
+    }
+    fprintf(stdout, "\n");
+  }
+
+  static size_t GenerateValue(uint32_t rand, char *v, size_t max_sz) {
+    size_t value_sz = ((rand % 3) + 1) * FLAGS_value_size_mult;
+    // +1 so that the trailing '\0' written below stays within max_sz.
+    assert(value_sz + 1 <= max_sz && value_sz >= sizeof(uint32_t));
+    *((uint32_t*)v) = rand;
+    for (size_t i = sizeof(uint32_t); i < value_sz; i++) {
+      v[i] = (char)(rand ^ i);
+    }
+    v[value_sz] = '\0';
+    return value_sz;  // the size of the value set
+  }
+
+  void PrintEnv() const {
+    fprintf(stdout, "RocksDB version : %d.%d\n", kMajorVersion,
+            kMinorVersion);
+    fprintf(stdout, "Column families : %d\n", FLAGS_column_families);
+    if (!FLAGS_test_batches_snapshots) {
+      fprintf(stdout, "Clear CFs one in : %d\n",
+              FLAGS_clear_column_family_one_in);
+    }
+    fprintf(stdout, "Number of threads : %d\n", FLAGS_threads);
+    fprintf(stdout,
+            "Ops per thread : %lu\n",
+            (unsigned long)FLAGS_ops_per_thread);
+    std::string ttl_state("unused");
+    if (FLAGS_ttl > 0) {
+      ttl_state = NumberToString(FLAGS_ttl);
+    }
+    fprintf(stdout, "Time to live(sec) : %s\n", ttl_state.c_str());
+    fprintf(stdout, "Read percentage : %d%%\n", FLAGS_readpercent);
+    fprintf(stdout, "Prefix percentage : %d%%\n", FLAGS_prefixpercent);
+    fprintf(stdout, "Write percentage : %d%%\n", FLAGS_writepercent);
+    fprintf(stdout, "Delete percentage : %d%%\n", FLAGS_delpercent);
+    fprintf(stdout, "Iterate percentage : %d%%\n", FLAGS_iterpercent);
+    fprintf(stdout, "Write-buffer-size : %d\n", FLAGS_write_buffer_size);
+    fprintf(stdout,
+            "Iterations : %lu\n",
+            (unsigned long)FLAGS_num_iterations);
+    fprintf(stdout,
+            "Max key : %lu\n",
+            (unsigned long)FLAGS_max_key);
+    fprintf(stdout, "Ratio #ops/#keys : %f\n",
+            (1.0 * FLAGS_ops_per_thread * FLAGS_threads) / FLAGS_max_key);
+    fprintf(stdout, "Num times DB reopens: %d\n", FLAGS_reopen);
+    fprintf(stdout, "Batches/snapshots : %d\n",
+            FLAGS_test_batches_snapshots);
+    fprintf(stdout, "Purge redundant %% : %d\n",
+            FLAGS_purge_redundant_percent);
+    fprintf(stdout, "Deletes use filter : %d\n",
+            FLAGS_filter_deletes);
+    fprintf(stdout, "Num keys per lock : %d\n",
+            1 << FLAGS_log2_keys_per_lock);
+
+    const char* compression = "";
+    switch (FLAGS_compression_type_e) {
+      case rocksdb::kNoCompression:
+        compression = "none";
+        break;
+      case rocksdb::kSnappyCompression:
+        compression = "snappy";
+        break;
+      case rocksdb::kZlibCompression:
+        compression = "zlib";
+        break;
+      case rocksdb::kBZip2Compression:
+        compression = "bzip2";
+        break;
+      case rocksdb::kLZ4Compression:
+        compression = "lz4";
+        break;
+      case rocksdb::kLZ4HCCompression:
+        compression = "lz4hc";
+        break;
+    }
+
+    fprintf(stdout, "Compression : %s\n", compression);
+
+    const char* memtablerep = "";
+    switch (FLAGS_rep_factory) {
+      case kSkipList:
+        memtablerep = "skip_list";
+        break;
+      case kHashSkipList:
+        memtablerep = "prefix_hash";
+        break;
+      case kVectorRep:
+        memtablerep = "vector";
+        break;
+    }
+
+    fprintf(stdout, "Memtablerep : %s\n", memtablerep);
+
+    fprintf(stdout, "------------------------------------------------\n");
+  }
+
+  void Open() {
+    assert(db_ == nullptr);
+    options_.block_cache = cache_;
+    options_.block_cache_compressed = compressed_cache_;
+    options_.write_buffer_size = FLAGS_write_buffer_size;
+    options_.max_write_buffer_number = FLAGS_max_write_buffer_number;
+
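+    // Illustrative note: writes stall only once every memtable is full, so
+    // with e.g. write_buffer_size = 4 MB and max_write_buffer_number = 4 the
+    // stress test can buffer roughly 16 MB of writes before flushes must
+    // catch up (example numbers, not the test defaults).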
+    options_.min_write_buffer_number_to_merge =
+        FLAGS_min_write_buffer_number_to_merge;
+    options_.max_background_compactions = FLAGS_max_background_compactions;
+    options_.max_background_flushes = FLAGS_max_background_flushes;
+    options_.compaction_style =
+        static_cast<rocksdb::CompactionStyle>(FLAGS_compaction_style);
+    options_.block_size = FLAGS_block_size;
+    options_.filter_policy = filter_policy_;
+    options_.prefix_extractor.reset(NewFixedPrefixTransform(FLAGS_prefix_size));
+    options_.max_open_files = FLAGS_open_files;
+    options_.statistics = dbstats;
+    options_.env = FLAGS_env;
+    options_.disableDataSync = FLAGS_disable_data_sync;
+    options_.use_fsync = FLAGS_use_fsync;
+    options_.allow_mmap_reads = FLAGS_mmap_read;
+    rocksdb_kill_odds = FLAGS_kill_random_test;
+    options_.target_file_size_base = FLAGS_target_file_size_base;
+    options_.target_file_size_multiplier = FLAGS_target_file_size_multiplier;
+    options_.max_bytes_for_level_base = FLAGS_max_bytes_for_level_base;
+    options_.max_bytes_for_level_multiplier =
+        FLAGS_max_bytes_for_level_multiplier;
+    options_.level0_stop_writes_trigger = FLAGS_level0_stop_writes_trigger;
+    options_.level0_slowdown_writes_trigger =
+        FLAGS_level0_slowdown_writes_trigger;
+    options_.level0_file_num_compaction_trigger =
+        FLAGS_level0_file_num_compaction_trigger;
+    options_.compression = FLAGS_compression_type_e;
+    options_.create_if_missing = true;
+    options_.disable_seek_compaction = FLAGS_disable_seek_compaction;
+    options_.max_manifest_file_size = 10 * 1024;
+    options_.filter_deletes = FLAGS_filter_deletes;
+    if ((FLAGS_prefix_size == 0) == (FLAGS_rep_factory == kHashSkipList)) {
+      fprintf(stderr,
+              "prefix_size should be non-zero iff memtablerep == prefix_hash\n");
+      exit(1);
+    }
+    switch (FLAGS_rep_factory) {
+      case kHashSkipList:
+        options_.memtable_factory.reset(NewHashSkipListRepFactory());
+        break;
+      case kSkipList:
+        // no need to do anything
+        break;
+      case kVectorRep:
+        options_.memtable_factory.reset(new VectorRepFactory());
+        break;
+    }
+    static Random purge_percent(1000);  // no benefit from non-determinism here
+    if (static_cast<int>(purge_percent.Uniform(100)) <
+        FLAGS_purge_redundant_percent - 1) {
+      options_.purge_redundant_kvs_while_flush = false;
+    }
+
+    if (FLAGS_use_merge) {
+      options_.merge_operator = MergeOperators::CreatePutOperator();
+    }
+
+    // set universal style compaction configurations, if applicable
+    if (FLAGS_universal_size_ratio != 0) {
+      options_.compaction_options_universal.size_ratio =
+          FLAGS_universal_size_ratio;
+    }
+    if (FLAGS_universal_min_merge_width != 0) {
+      options_.compaction_options_universal.min_merge_width =
+          FLAGS_universal_min_merge_width;
+    }
+    if (FLAGS_universal_max_merge_width != 0) {
+      options_.compaction_options_universal.max_merge_width =
+          FLAGS_universal_max_merge_width;
+    }
+    if (FLAGS_universal_max_size_amplification_percent != 0) {
+      options_.compaction_options_universal.max_size_amplification_percent =
+          FLAGS_universal_max_size_amplification_percent;
+    }
+
+    fprintf(stdout, "DB path: [%s]\n", FLAGS_db.c_str());
+
+    Status s;
+    if (FLAGS_ttl == -1) {
+      std::vector<std::string> existing_column_families;
+      s = DB::ListColumnFamilies(DBOptions(options_), FLAGS_db,
+                                 &existing_column_families);  // ignore errors
+      if (!s.ok()) {
+        // DB doesn't exist
+        assert(existing_column_families.empty());
+        assert(column_family_names_.empty());
+        column_family_names_.push_back(kDefaultColumnFamilyName);
+      } else if (column_family_names_.empty()) {
+        // this is the first call to the function Open()
+        column_family_names_ = existing_column_families;
+      } else {
+        // this is a reopen. just assert that existing column_family_names are
+        // equivalent to what we remember
+        auto sorted_cfn = column_family_names_;
+        sort(sorted_cfn.begin(), sorted_cfn.end());
+        sort(existing_column_families.begin(), existing_column_families.end());
+        if (sorted_cfn != existing_column_families) {
+          fprintf(stderr,
+                  "Expected column families differ from the existing:\n");
+          printf("Expected: {");
+          for (auto cf : sorted_cfn) {
+            printf("%s ", cf.c_str());
+          }
+          printf("}\n");
+          printf("Existing: {");
+          for (auto cf : existing_column_families) {
+            printf("%s ", cf.c_str());
+          }
+          printf("}\n");
+        }
+        assert(sorted_cfn == existing_column_families);
+      }
+      std::vector<ColumnFamilyDescriptor> cf_descriptors;
+      for (auto name : column_family_names_) {
+        if (name != kDefaultColumnFamilyName) {
+          new_column_family_name_ =
+              std::max(new_column_family_name_.load(), std::stoi(name) + 1);
+        }
+        cf_descriptors.emplace_back(name, ColumnFamilyOptions(options_));
+      }
+      s = DB::Open(DBOptions(options_), FLAGS_db, cf_descriptors,
+                   &column_families_, &db_);
+      if (s.ok()) {
+        while (s.ok() &&
+               column_families_.size() < (size_t)FLAGS_column_families) {
+          ColumnFamilyHandle* cf = nullptr;
+          std::string name = std::to_string(new_column_family_name_.load());
+          new_column_family_name_++;
+          s = db_->CreateColumnFamily(ColumnFamilyOptions(options_), name, &cf);
+          column_families_.push_back(cf);
+          column_family_names_.push_back(name);
+        }
+      }
+      assert(!s.ok() || column_families_.size() ==
+                            static_cast<size_t>(FLAGS_column_families));
+    } else {
+      DBWithTTL* db_with_ttl;
+      s = DBWithTTL::Open(options_, FLAGS_db, &db_with_ttl, FLAGS_ttl);
+      db_ = db_with_ttl;
+    }
+    if (!s.ok()) {
+      fprintf(stderr, "open error: %s\n", s.ToString().c_str());
+      exit(1);
+    }
+  }
+
+  void Reopen() {
+    for (auto cf : column_families_) {
+      delete cf;
+    }
+    column_families_.clear();
+    delete db_;
+    db_ = nullptr;
+
+    num_times_reopened_++;
+    double now = FLAGS_env->NowMicros();
+    fprintf(stdout, "%s Reopening database for the %dth time\n",
+            FLAGS_env->TimeToString((uint64_t) now/1000000).c_str(),
+            num_times_reopened_);
+    Open();
+  }
+
+  void PrintStatistics() {
+    if (dbstats) {
+      fprintf(stdout, "STATISTICS:\n%s\n", dbstats->ToString().c_str());
+    }
+  }
+
+ private:
+  shared_ptr<Cache> cache_;
+  shared_ptr<Cache> compressed_cache_;
+  const FilterPolicy* filter_policy_;
+  DB* db_;
+  Options options_;
+  std::vector<ColumnFamilyHandle*> column_families_;
+  std::vector<std::string> column_family_names_;
+  std::atomic<int64_t> new_column_family_name_;
+  int num_times_reopened_;
+};
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) +
+                  " [OPTIONS]...");
+  ParseCommandLineFlags(&argc, &argv, true);
+
+  if (FLAGS_statistics) {
+    dbstats = rocksdb::CreateDBStatistics();
+  }
+  FLAGS_compression_type_e =
+      StringToCompressionType(FLAGS_compression_type.c_str());
+  if (!FLAGS_hdfs.empty()) {
+    FLAGS_env = new rocksdb::HdfsEnv(FLAGS_hdfs);
+  }
+  FLAGS_rep_factory = StringToRepFactory(FLAGS_memtablerep.c_str());
+
+  // The number of background threads should be at least as large as the
+  // maximum number of concurrent compactions.
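+  // Illustrative scenario: with max_background_compactions = 4 but only the
+  // default single thread in the pool, scheduled compactions would queue up
+  // behind one another and level-0 files would pile up, eventually tripping
+  // the slowdown/stop triggers (example figures, not the defaults).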
+ FLAGS_env->SetBackgroundThreads(FLAGS_max_background_compactions); + + if (FLAGS_prefixpercent > 0 && FLAGS_prefix_size <= 0) { + fprintf(stderr, + "Error: prefixpercent is non-zero while prefix_size is " + "not positive!\n"); + exit(1); + } + if (FLAGS_test_batches_snapshots && FLAGS_prefix_size <= 0) { + fprintf(stderr, + "Error: please specify prefix_size for " + "test_batches_snapshots test!\n"); + exit(1); + } + if ((FLAGS_readpercent + FLAGS_prefixpercent + + FLAGS_writepercent + FLAGS_delpercent + FLAGS_iterpercent) != 100) { + fprintf(stderr, + "Error: Read+Prefix+Write+Delete+Iterate percents != 100!\n"); + exit(1); + } + if (FLAGS_disable_wal == 1 && FLAGS_reopen > 0) { + fprintf(stderr, "Error: Db cannot reopen safely with disable_wal set!\n"); + exit(1); + } + if ((unsigned)FLAGS_reopen >= FLAGS_ops_per_thread) { + fprintf(stderr, + "Error: #DB-reopens should be < ops_per_thread\n" + "Provided reopens = %d and ops_per_thread = %lu\n", + FLAGS_reopen, + (unsigned long)FLAGS_ops_per_thread); + exit(1); + } + + // Choose a location for the test database if none given with --db= + if (FLAGS_db.empty()) { + std::string default_db_path; + rocksdb::Env::Default()->GetTestDirectory(&default_db_path); + default_db_path += "/dbstress"; + FLAGS_db = default_db_path; + } + + rocksdb::StressTest stress; + if (stress.Run()) { + return 0; + } else { + return 1; + } +} + +#endif // GFLAGS diff --git a/tools/ldb.cc b/tools/ldb.cc new file mode 100644 index 0000000000..4581b8011a --- /dev/null +++ b/tools/ldb.cc @@ -0,0 +1,13 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// + +#include "rocksdb/ldb_tool.h" + +int main(int argc, char** argv) { + rocksdb::LDBTool tool; + tool.Run(argc, argv); + return 0; +} diff --git a/tools/ldb_test.py b/tools/ldb_test.py new file mode 100644 index 0000000000..b4ef5221f4 --- /dev/null +++ b/tools/ldb_test.py @@ -0,0 +1,383 @@ +import os +import os.path +import shutil +import subprocess +import time +import unittest +import tempfile + +def my_check_output(*popenargs, **kwargs): + """ + If we had python 2.7, we should simply use subprocess.check_output. + This is a stop-gap solution for python 2.6 + """ + if 'stdout' in kwargs: + raise ValueError('stdout argument not allowed, it will be overridden.') + process = subprocess.Popen(stderr=subprocess.PIPE, stdout=subprocess.PIPE, + *popenargs, **kwargs) + output, unused_err = process.communicate() + retcode = process.poll() + if retcode: + cmd = kwargs.get("args") + if cmd is None: + cmd = popenargs[0] + raise Exception("Exit code is not 0. It is %d. Command: %s" % + (retcode, cmd)) + return output + +def run_err_null(cmd): + return os.system(cmd + " 2>/dev/null ") + +class LDBTestCase(unittest.TestCase): + def setUp(self): + self.TMP_DIR = tempfile.mkdtemp(prefix="ldb_test_") + self.DB_NAME = "testdb" + + def tearDown(self): + assert(self.TMP_DIR.strip() != "/" + and self.TMP_DIR.strip() != "/tmp" + and self.TMP_DIR.strip() != "/tmp/") #Just some paranoia + + shutil.rmtree(self.TMP_DIR) + + def dbParam(self, dbName): + return "--db=%s" % os.path.join(self.TMP_DIR, dbName) + + def assertRunOKFull(self, params, expectedOutput, unexpected=False): + """ + All command-line params must be specified. + Allows full flexibility in testing; for example: missing db param. 
+ + """ + + output = my_check_output("./ldb %s |grep -v \"Created bg thread\"" % + params, shell=True) + if not unexpected: + self.assertEqual(output.strip(), expectedOutput.strip()) + else: + self.assertNotEqual(output.strip(), expectedOutput.strip()) + + def assertRunFAILFull(self, params): + """ + All command-line params must be specified. + Allows full flexibility in testing; for example: missing db param. + + """ + try: + + my_check_output("./ldb %s >/dev/null 2>&1 |grep -v \"Created bg \ + thread\"" % params, shell=True) + except Exception, e: + return + self.fail( + "Exception should have been raised for command with params: %s" % + params) + + def assertRunOK(self, params, expectedOutput, unexpected=False): + """ + Uses the default test db. + + """ + self.assertRunOKFull("%s %s" % (self.dbParam(self.DB_NAME), params), + expectedOutput, unexpected) + + def assertRunFAIL(self, params): + """ + Uses the default test db. + """ + self.assertRunFAILFull("%s %s" % (self.dbParam(self.DB_NAME), params)) + + def testSimpleStringPutGet(self): + print "Running testSimpleStringPutGet..." + self.assertRunFAIL("put x1 y1") + self.assertRunOK("put --create_if_missing x1 y1", "OK") + self.assertRunOK("get x1", "y1") + self.assertRunFAIL("get x2") + + self.assertRunOK("put x2 y2", "OK") + self.assertRunOK("get x1", "y1") + self.assertRunOK("get x2", "y2") + self.assertRunFAIL("get x3") + + self.assertRunOK("scan --from=x1 --to=z", "x1 : y1\nx2 : y2") + self.assertRunOK("put x3 y3", "OK") + + self.assertRunOK("scan --from=x1 --to=z", "x1 : y1\nx2 : y2\nx3 : y3") + self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3") + self.assertRunOK("scan --from=x", "x1 : y1\nx2 : y2\nx3 : y3") + + self.assertRunOK("scan --to=x2", "x1 : y1") + self.assertRunOK("scan --from=x1 --to=z --max_keys=1", "x1 : y1") + self.assertRunOK("scan --from=x1 --to=z --max_keys=2", + "x1 : y1\nx2 : y2") + + self.assertRunOK("scan --from=x1 --to=z --max_keys=3", + "x1 : y1\nx2 : y2\nx3 : y3") + self.assertRunOK("scan --from=x1 --to=z --max_keys=4", + "x1 : y1\nx2 : y2\nx3 : y3") + self.assertRunOK("scan --from=x1 --to=x2", "x1 : y1") + self.assertRunOK("scan --from=x2 --to=x4", "x2 : y2\nx3 : y3") + self.assertRunFAIL("scan --from=x4 --to=z") # No results => FAIL + self.assertRunFAIL("scan --from=x1 --to=z --max_keys=foo") + + self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3") + + self.assertRunOK("delete x1", "OK") + self.assertRunOK("scan", "x2 : y2\nx3 : y3") + + self.assertRunOK("delete NonExistentKey", "OK") + # It is weird that GET and SCAN raise exception for + # non-existent key, while delete does not + + self.assertRunOK("checkconsistency", "OK") + + def dumpDb(self, params, dumpFile): + return 0 == run_err_null("./ldb dump %s > %s" % (params, dumpFile)) + + def loadDb(self, params, dumpFile): + return 0 == run_err_null("cat %s | ./ldb load %s" % (dumpFile, params)) + + def testStringBatchPut(self): + print "Running testStringBatchPut..." + self.assertRunOK("batchput x1 y1 --create_if_missing", "OK") + self.assertRunOK("scan", "x1 : y1") + self.assertRunOK("batchput x2 y2 x3 y3 \"x4 abc\" \"y4 xyz\"", "OK") + self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 abc : y4 xyz") + self.assertRunFAIL("batchput") + self.assertRunFAIL("batchput k1") + self.assertRunFAIL("batchput k1 v1 k2") + + def testCountDelimDump(self): + print "Running testCountDelimDump..." 
+ self.assertRunOK("batchput x.1 x1 --create_if_missing", "OK") + self.assertRunOK("batchput y.abc abc y.2 2 z.13c pqr", "OK") + self.assertRunOK("dump --count_delim", "x => count:1\tsize:5\ny => count:2\tsize:12\nz => count:1\tsize:8") + self.assertRunOK("dump --count_delim=\".\"", "x => count:1\tsize:5\ny => count:2\tsize:12\nz => count:1\tsize:8") + self.assertRunOK("batchput x,2 x2 x,abc xabc", "OK") + self.assertRunOK("dump --count_delim=\",\"", "x => count:2\tsize:14\nx.1 => count:1\tsize:5\ny.2 => count:1\tsize:4\ny.abc => count:1\tsize:8\nz.13c => count:1\tsize:8") + + def testCountDelimIDump(self): + print "Running testCountDelimIDump..." + self.assertRunOK("batchput x.1 x1 --create_if_missing", "OK") + self.assertRunOK("batchput y.abc abc y.2 2 z.13c pqr", "OK") + self.assertRunOK("dump --count_delim", "x => count:1\tsize:5\ny => count:2\tsize:12\nz => count:1\tsize:8") + self.assertRunOK("dump --count_delim=\".\"", "x => count:1\tsize:5\ny => count:2\tsize:12\nz => count:1\tsize:8") + self.assertRunOK("batchput x,2 x2 x,abc xabc", "OK") + self.assertRunOK("dump --count_delim=\",\"", "x => count:2\tsize:14\nx.1 => count:1\tsize:5\ny.2 => count:1\tsize:4\ny.abc => count:1\tsize:8\nz.13c => count:1\tsize:8") + + def testInvalidCmdLines(self): + print "Running testInvalidCmdLines..." + # db not specified + self.assertRunFAILFull("put 0x6133 0x6233 --hex --create_if_missing") + # No param called he + self.assertRunFAIL("put 0x6133 0x6233 --he --create_if_missing") + # max_keys is not applicable for put + self.assertRunFAIL("put 0x6133 0x6233 --max_keys=1 --create_if_missing") + # hex has invalid boolean value + + def testHexPutGet(self): + print "Running testHexPutGet..." + self.assertRunOK("put a1 b1 --create_if_missing", "OK") + self.assertRunOK("scan", "a1 : b1") + self.assertRunOK("scan --hex", "0x6131 : 0x6231") + self.assertRunFAIL("put --hex 6132 6232") + self.assertRunOK("put --hex 0x6132 0x6232", "OK") + self.assertRunOK("scan --hex", "0x6131 : 0x6231\n0x6132 : 0x6232") + self.assertRunOK("scan", "a1 : b1\na2 : b2") + self.assertRunOK("get a1", "b1") + self.assertRunOK("get --hex 0x6131", "0x6231") + self.assertRunOK("get a2", "b2") + self.assertRunOK("get --hex 0x6132", "0x6232") + self.assertRunOK("get --key_hex 0x6132", "b2") + self.assertRunOK("get --key_hex --value_hex 0x6132", "0x6232") + self.assertRunOK("get --value_hex a2", "0x6232") + self.assertRunOK("scan --key_hex --value_hex", + "0x6131 : 0x6231\n0x6132 : 0x6232") + self.assertRunOK("scan --hex --from=0x6131 --to=0x6133", + "0x6131 : 0x6231\n0x6132 : 0x6232") + self.assertRunOK("scan --hex --from=0x6131 --to=0x6132", + "0x6131 : 0x6231") + self.assertRunOK("scan --key_hex", "0x6131 : b1\n0x6132 : b2") + self.assertRunOK("scan --value_hex", "a1 : 0x6231\na2 : 0x6232") + self.assertRunOK("batchput --hex 0x6133 0x6233 0x6134 0x6234", "OK") + self.assertRunOK("scan", "a1 : b1\na2 : b2\na3 : b3\na4 : b4") + self.assertRunOK("delete --hex 0x6133", "OK") + self.assertRunOK("scan", "a1 : b1\na2 : b2\na4 : b4") + self.assertRunOK("checkconsistency", "OK") + + def testTtlPutGet(self): + print "Running testTtlPutGet..." 
+ self.assertRunOK("put a1 b1 --ttl --create_if_missing", "OK") + self.assertRunOK("scan --hex", "0x6131 : 0x6231", True) + self.assertRunOK("dump --ttl ", "a1 ==> b1", True) + self.assertRunOK("dump --hex --ttl ", + "0x6131 ==> 0x6231\nKeys in range: 1") + self.assertRunOK("scan --hex --ttl", "0x6131 : 0x6231") + self.assertRunOK("get --value_hex a1", "0x6231", True) + self.assertRunOK("get --ttl a1", "b1") + self.assertRunOK("put a3 b3 --create_if_missing", "OK") + # fails because timstamp's length is greater than value's + self.assertRunFAIL("get --ttl a3") + self.assertRunOK("checkconsistency", "OK") + + def testInvalidCmdLines(self): + print "Running testInvalidCmdLines..." + # db not specified + self.assertRunFAILFull("put 0x6133 0x6233 --hex --create_if_missing") + # No param called he + self.assertRunFAIL("put 0x6133 0x6233 --he --create_if_missing") + # max_keys is not applicable for put + self.assertRunFAIL("put 0x6133 0x6233 --max_keys=1 --create_if_missing") + # hex has invalid boolean value + self.assertRunFAIL("put 0x6133 0x6233 --hex=Boo --create_if_missing") + + def testDumpLoad(self): + print "Running testDumpLoad..." + self.assertRunOK("batchput --create_if_missing x1 y1 x2 y2 x3 y3 x4 y4", + "OK") + self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4") + origDbPath = os.path.join(self.TMP_DIR, self.DB_NAME) + + # Dump and load without any additional params specified + dumpFilePath = os.path.join(self.TMP_DIR, "dump1") + loadedDbPath = os.path.join(self.TMP_DIR, "loaded_from_dump1") + self.assertTrue(self.dumpDb("--db=%s" % origDbPath, dumpFilePath)) + self.assertTrue(self.loadDb( + "--db=%s --create_if_missing" % loadedDbPath, dumpFilePath)) + self.assertRunOKFull("scan --db=%s" % loadedDbPath, + "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4") + + # Dump and load in hex + dumpFilePath = os.path.join(self.TMP_DIR, "dump2") + loadedDbPath = os.path.join(self.TMP_DIR, "loaded_from_dump2") + self.assertTrue(self.dumpDb("--db=%s --hex" % origDbPath, dumpFilePath)) + self.assertTrue(self.loadDb( + "--db=%s --hex --create_if_missing" % loadedDbPath, dumpFilePath)) + self.assertRunOKFull("scan --db=%s" % loadedDbPath, + "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4") + + # Dump only a portion of the key range + dumpFilePath = os.path.join(self.TMP_DIR, "dump3") + loadedDbPath = os.path.join(self.TMP_DIR, "loaded_from_dump3") + self.assertTrue(self.dumpDb( + "--db=%s --from=x1 --to=x3" % origDbPath, dumpFilePath)) + self.assertTrue(self.loadDb( + "--db=%s --create_if_missing" % loadedDbPath, dumpFilePath)) + self.assertRunOKFull("scan --db=%s" % loadedDbPath, "x1 : y1\nx2 : y2") + + # Dump upto max_keys rows + dumpFilePath = os.path.join(self.TMP_DIR, "dump4") + loadedDbPath = os.path.join(self.TMP_DIR, "loaded_from_dump4") + self.assertTrue(self.dumpDb( + "--db=%s --max_keys=3" % origDbPath, dumpFilePath)) + self.assertTrue(self.loadDb( + "--db=%s --create_if_missing" % loadedDbPath, dumpFilePath)) + self.assertRunOKFull("scan --db=%s" % loadedDbPath, + "x1 : y1\nx2 : y2\nx3 : y3") + + # Load into an existing db, create_if_missing is not specified + self.assertTrue(self.dumpDb("--db=%s" % origDbPath, dumpFilePath)) + self.assertTrue(self.loadDb("--db=%s" % loadedDbPath, dumpFilePath)) + self.assertRunOKFull("scan --db=%s" % loadedDbPath, + "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4") + + # Dump and load with WAL disabled + dumpFilePath = os.path.join(self.TMP_DIR, "dump5") + loadedDbPath = os.path.join(self.TMP_DIR, "loaded_from_dump5") + self.assertTrue(self.dumpDb("--db=%s" % origDbPath, 
dumpFilePath)) + self.assertTrue(self.loadDb( + "--db=%s --disable_wal --create_if_missing" % loadedDbPath, + dumpFilePath)) + self.assertRunOKFull("scan --db=%s" % loadedDbPath, + "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4") + + # Dump and load with lots of extra params specified + extraParams = " ".join(["--bloom_bits=14", "--compression_type=bzip2", + "--block_size=1024", "--auto_compaction=true", + "--write_buffer_size=4194304", + "--file_size=2097152"]) + dumpFilePath = os.path.join(self.TMP_DIR, "dump6") + loadedDbPath = os.path.join(self.TMP_DIR, "loaded_from_dump6") + self.assertTrue(self.dumpDb( + "--db=%s %s" % (origDbPath, extraParams), dumpFilePath)) + self.assertTrue(self.loadDb( + "--db=%s %s --create_if_missing" % (loadedDbPath, extraParams), + dumpFilePath)) + self.assertRunOKFull("scan --db=%s" % loadedDbPath, + "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4") + + # Dump with count_only + dumpFilePath = os.path.join(self.TMP_DIR, "dump7") + loadedDbPath = os.path.join(self.TMP_DIR, "loaded_from_dump7") + self.assertTrue(self.dumpDb( + "--db=%s --count_only" % origDbPath, dumpFilePath)) + self.assertTrue(self.loadDb( + "--db=%s --create_if_missing" % loadedDbPath, dumpFilePath)) + # DB should have atleast one value for scan to work + self.assertRunOKFull("put --db=%s k1 v1" % loadedDbPath, "OK") + self.assertRunOKFull("scan --db=%s" % loadedDbPath, "k1 : v1") + + # Dump command fails because of typo in params + dumpFilePath = os.path.join(self.TMP_DIR, "dump8") + self.assertFalse(self.dumpDb( + "--db=%s --create_if_missing" % origDbPath, dumpFilePath)) + + def testMiscAdminTask(self): + print "Running testMiscAdminTask..." + # These tests need to be improved; for example with asserts about + # whether compaction or level reduction actually took place. + self.assertRunOK("batchput --create_if_missing x1 y1 x2 y2 x3 y3 x4 y4", + "OK") + self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4") + origDbPath = os.path.join(self.TMP_DIR, self.DB_NAME) + + self.assertTrue(0 == run_err_null( + "./ldb compact --db=%s" % origDbPath)) + self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4") + + self.assertTrue(0 == run_err_null( + "./ldb reduce_levels --db=%s --new_levels=2" % origDbPath)) + self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4") + + self.assertTrue(0 == run_err_null( + "./ldb reduce_levels --db=%s --new_levels=3" % origDbPath)) + self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4") + + self.assertTrue(0 == run_err_null( + "./ldb compact --db=%s --from=x1 --to=x3" % origDbPath)) + self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4") + + self.assertTrue(0 == run_err_null( + "./ldb compact --db=%s --hex --from=0x6131 --to=0x6134" + % origDbPath)) + self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4") + + #TODO(dilip): Not sure what should be passed to WAL.Currently corrupted. + self.assertTrue(0 == run_err_null( + "./ldb dump_wal --db=%s --walfile=%s --header" % ( + origDbPath, os.path.join(origDbPath, "LOG")))) + self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4") + + def testCheckConsistency(self): + print "Running testCheckConsistency..." 
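+        # checkconsistency verifies that every file recorded in the DB's
+        # MANIFEST actually exists with the expected size, so both the
+        # corruption and the deletion of the .sst file below should fail it.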
+
+        dbPath = os.path.join(self.TMP_DIR, self.DB_NAME)
+        self.assertRunOK("put x1 y1 --create_if_missing", "OK")
+        self.assertRunOK("put x2 y2", "OK")
+        self.assertRunOK("get x1", "y1")
+        self.assertRunOK("checkconsistency", "OK")
+
+        sstFilePath = my_check_output("ls %s" % os.path.join(dbPath, "*.sst"),
+                                      shell=True)
+
+        # Modify the file
+        my_check_output("echo 'evil' > %s" % sstFilePath, shell=True)
+        self.assertRunFAIL("checkconsistency")
+
+        # Delete the file
+        my_check_output("rm -f %s" % sstFilePath, shell=True)
+        self.assertRunFAIL("checkconsistency")
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tools/reduce_levels_test.cc b/tools/reduce_levels_test.cc
new file mode 100644
index 0000000000..b588b52d2a
--- /dev/null
+++ b/tools/reduce_levels_test.cc
@@ -0,0 +1,197 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+#include "rocksdb/db.h"
+#include "db/db_impl.h"
+#include "db/version_set.h"
+#include "util/logging.h"
+#include "util/testutil.h"
+#include "util/testharness.h"
+#include "util/ldb_cmd.h"
+
+namespace rocksdb {
+
+class ReduceLevelTest {
+public:
+  ReduceLevelTest() {
+    dbname_ = test::TmpDir() + "/db_reduce_levels_test";
+    DestroyDB(dbname_, Options());
+    db_ = nullptr;
+  }
+
+  Status OpenDB(bool create_if_missing, int levels,
+                int mem_table_compact_level);
+
+  Status Put(const std::string& k, const std::string& v) {
+    return db_->Put(WriteOptions(), k, v);
+  }
+
+  std::string Get(const std::string& k) {
+    ReadOptions options;
+    std::string result;
+    Status s = db_->Get(options, k, &result);
+    if (s.IsNotFound()) {
+      result = "NOT_FOUND";
+    } else if (!s.ok()) {
+      result = s.ToString();
+    }
+    return result;
+  }
+
+  Status CompactMemTable() {
+    if (db_ == nullptr) {
+      return Status::InvalidArgument("DB not opened.");
+    }
+    DBImpl* db_impl = reinterpret_cast<DBImpl*>(db_);
+    return db_impl->TEST_FlushMemTable();
+  }
+
+  void CloseDB() {
+    if (db_ != nullptr) {
+      delete db_;
+      db_ = nullptr;
+    }
+  }
+
+  bool ReduceLevels(int target_level);
+
+  int FilesOnLevel(int level) {
+    std::string property;
+    ASSERT_TRUE(
+        db_->GetProperty("rocksdb.num-files-at-level" + NumberToString(level),
+                         &property));
+    return atoi(property.c_str());
+  }
+
+private:
+  std::string dbname_;
+  DB* db_;
+};
+
+Status ReduceLevelTest::OpenDB(bool create_if_missing, int num_levels,
+                               int mem_table_compact_level) {
+  rocksdb::Options opt;
+  opt.num_levels = num_levels;
+  opt.create_if_missing = create_if_missing;
+  opt.max_mem_compaction_level = mem_table_compact_level;
+  rocksdb::Status st = rocksdb::DB::Open(opt, dbname_, &db_);
+  if (!st.ok()) {
+    fprintf(stderr, "Can't open the db:%s\n", st.ToString().c_str());
+  }
+  return st;
+}
+
+bool ReduceLevelTest::ReduceLevels(int target_level) {
+  std::vector<std::string> args = rocksdb::ReduceDBLevelsCommand::PrepareArgs(
+      dbname_, target_level, false);
+  LDBCommand* level_reducer = LDBCommand::InitFromCmdLineArgs(args);
+  level_reducer->Run();
+  bool is_succeed = level_reducer->GetExecuteState().IsSucceed();
+  delete level_reducer;
+  return is_succeed;
+}
+
+TEST(ReduceLevelTest, Last_Level) {
+  // create files on all levels;
+  ASSERT_OK(OpenDB(true, 4, 3));
+  ASSERT_OK(Put("aaaa", "11111"));
+  ASSERT_OK(CompactMemTable());
+  ASSERT_EQ(FilesOnLevel(3), 1);
+  CloseDB();
+
+  ASSERT_TRUE(ReduceLevels(3));
+
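+  // After reducing to 3 levels (0..2), the file that sat on level 3 must now
+  // live on the new last level, so the reopen below expects it on level 2.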
ASSERT_OK(OpenDB(true, 3, 1)); + ASSERT_EQ(FilesOnLevel(2), 1); + CloseDB(); + + ASSERT_TRUE(ReduceLevels(2)); + ASSERT_OK(OpenDB(true, 2, 1)); + ASSERT_EQ(FilesOnLevel(1), 1); + CloseDB(); +} + +TEST(ReduceLevelTest, Top_Level) { + // create files on all levels; + ASSERT_OK(OpenDB(true, 5, 0)); + ASSERT_OK(Put("aaaa", "11111")); + ASSERT_OK(CompactMemTable()); + ASSERT_EQ(FilesOnLevel(0), 1); + CloseDB(); + + ASSERT_TRUE(ReduceLevels(4)); + ASSERT_OK(OpenDB(true, 4, 0)); + CloseDB(); + + ASSERT_TRUE(ReduceLevels(3)); + ASSERT_OK(OpenDB(true, 3, 0)); + CloseDB(); + + ASSERT_TRUE(ReduceLevels(2)); + ASSERT_OK(OpenDB(true, 2, 0)); + CloseDB(); +} + +TEST(ReduceLevelTest, All_Levels) { + // create files on all levels; + ASSERT_OK(OpenDB(true, 5, 1)); + ASSERT_OK(Put("a", "a11111")); + ASSERT_OK(CompactMemTable()); + ASSERT_EQ(FilesOnLevel(1), 1); + CloseDB(); + + ASSERT_OK(OpenDB(true, 5, 2)); + ASSERT_OK(Put("b", "b11111")); + ASSERT_OK(CompactMemTable()); + ASSERT_EQ(FilesOnLevel(1), 1); + ASSERT_EQ(FilesOnLevel(2), 1); + CloseDB(); + + ASSERT_OK(OpenDB(true, 5, 3)); + ASSERT_OK(Put("c", "c11111")); + ASSERT_OK(CompactMemTable()); + ASSERT_EQ(FilesOnLevel(1), 1); + ASSERT_EQ(FilesOnLevel(2), 1); + ASSERT_EQ(FilesOnLevel(3), 1); + CloseDB(); + + ASSERT_OK(OpenDB(true, 5, 4)); + ASSERT_OK(Put("d", "d11111")); + ASSERT_OK(CompactMemTable()); + ASSERT_EQ(FilesOnLevel(1), 1); + ASSERT_EQ(FilesOnLevel(2), 1); + ASSERT_EQ(FilesOnLevel(3), 1); + ASSERT_EQ(FilesOnLevel(4), 1); + CloseDB(); + + ASSERT_TRUE(ReduceLevels(4)); + ASSERT_OK(OpenDB(true, 4, 0)); + ASSERT_EQ("a11111", Get("a")); + ASSERT_EQ("b11111", Get("b")); + ASSERT_EQ("c11111", Get("c")); + ASSERT_EQ("d11111", Get("d")); + CloseDB(); + + ASSERT_TRUE(ReduceLevels(3)); + ASSERT_OK(OpenDB(true, 3, 0)); + ASSERT_EQ("a11111", Get("a")); + ASSERT_EQ("b11111", Get("b")); + ASSERT_EQ("c11111", Get("c")); + ASSERT_EQ("d11111", Get("d")); + CloseDB(); + + ASSERT_TRUE(ReduceLevels(2)); + ASSERT_OK(OpenDB(true, 2, 0)); + ASSERT_EQ("a11111", Get("a")); + ASSERT_EQ("b11111", Get("b")); + ASSERT_EQ("c11111", Get("c")); + ASSERT_EQ("d11111", Get("d")); + CloseDB(); +} + +} + +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +} diff --git a/tools/sst_dump.cc b/tools/sst_dump.cc new file mode 100644 index 0000000000..9a144bb0b3 --- /dev/null +++ b/tools/sst_dump.cc @@ -0,0 +1,367 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
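+// sst_dump walks one .sst file, or every .sst file under a directory, and
+// prints key/value pairs and/or table properties; see print_help() below
+// for the accepted flags.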
+// + +#include +#include +#include +#include + +#include "db/dbformat.h" +#include "db/memtable.h" +#include "db/write_batch_internal.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/iterator.h" +#include "rocksdb/slice_transform.h" +#include "rocksdb/table.h" +#include "rocksdb/table_properties.h" +#include "table/block_based_table_factory.h" +#include "table/plain_table_factory.h" +#include "table/meta_blocks.h" +#include "table/block.h" +#include "table/block_builder.h" +#include "table/format.h" +#include "util/ldb_cmd.h" +#include "util/random.h" +#include "util/testharness.h" +#include "util/testutil.h" + +namespace rocksdb { + +class SstFileReader { + public: + explicit SstFileReader(const std::string& file_name, + bool verify_checksum, + bool output_hex); + + Status ReadSequential(bool print_kv, + uint64_t read_num, + bool has_from, + const std::string& from_key, + bool has_to, + const std::string& to_key); + + Status ReadTableProperties( + std::shared_ptr* table_properties); + uint64_t GetReadNumber() { return read_num_; } + + private: + Status NewTableReader(const std::string& file_path); + Status SetTableOptionsByMagicNumber(uint64_t table_magic_number, + RandomAccessFile* file, + uint64_t file_size); + + std::string file_name_; + uint64_t read_num_; + bool verify_checksum_; + bool output_hex_; + EnvOptions soptions_; + + Status init_result_; + unique_ptr table_reader_; + unique_ptr file_; + // options_ and internal_comparator_ will also be used in + // ReadSequential internally (specifically, seek-related operations) + Options options_; + InternalKeyComparator internal_comparator_; +}; + +SstFileReader::SstFileReader(const std::string& file_path, + bool verify_checksum, + bool output_hex) + :file_name_(file_path), read_num_(0), verify_checksum_(verify_checksum), + output_hex_(output_hex), internal_comparator_(BytewiseComparator()) { + fprintf(stdout, "Process %s\n", file_path.c_str()); + + init_result_ = NewTableReader(file_name_); +} + +extern uint64_t kBlockBasedTableMagicNumber; +extern uint64_t kPlainTableMagicNumber; + +Status SstFileReader::NewTableReader(const std::string& file_path) { + uint64_t magic_number; + + // read table magic number + Footer footer; + + unique_ptr file; + uint64_t file_size; + Status s = options_.env->NewRandomAccessFile(file_path, &file_, soptions_); + if (s.ok()) { + s = options_.env->GetFileSize(file_path, &file_size); + } + if (s.ok()) { + s = ReadFooterFromFile(file_.get(), file_size, &footer); + } + if (s.ok()) { + magic_number = footer.table_magic_number(); + } + + if (s.ok()) { + if (magic_number == kPlainTableMagicNumber) { + soptions_.use_mmap_reads = true; + } + options_.comparator = &internal_comparator_; + s = SetTableOptionsByMagicNumber(magic_number, file_.get(), file_size); + } + + if (s.ok()) { + s = options_.table_factory->NewTableReader( + options_, soptions_, internal_comparator_, std::move(file_), file_size, + &table_reader_); + } + return s; +} + +Status SstFileReader::SetTableOptionsByMagicNumber(uint64_t table_magic_number, + RandomAccessFile* file, + uint64_t file_size) { + TableProperties* table_properties; + Status s = rocksdb::ReadTableProperties(file, file_size, table_magic_number, + options_.env, options_.info_log.get(), + &table_properties); + if (!s.ok()) { + return s; + } + std::unique_ptr props_guard(table_properties); + + if (table_magic_number == kBlockBasedTableMagicNumber) { + options_.table_factory = std::make_shared(); + fprintf(stdout, "Sst file format: block-based\n"); + } else if 
(table_magic_number == kPlainTableMagicNumber) { + options_.allow_mmap_reads = true; + options_.table_factory = std::make_shared( + table_properties->fixed_key_len, 2, 0.8); + options_.prefix_extractor.reset(NewNoopTransform()); + fprintf(stdout, "Sst file format: plain table\n"); + } else { + char error_msg_buffer[80]; + snprintf(error_msg_buffer, sizeof(error_msg_buffer) - 1, + "Unsupported table magic number --- %lx", + (long)table_magic_number); + return Status::InvalidArgument(error_msg_buffer); + } + + return Status::OK(); +} + +Status SstFileReader::ReadSequential(bool print_kv, + uint64_t read_num, + bool has_from, + const std::string& from_key, + bool has_to, + const std::string& to_key) { + if (!table_reader_) { + return init_result_; + } + + Iterator* iter = table_reader_->NewIterator(ReadOptions(verify_checksum_, + false)); + uint64_t i = 0; + if (has_from) { + InternalKey ikey(from_key, kMaxSequenceNumber, kValueTypeForSeek); + iter->Seek(ikey.Encode()); + } else { + iter->SeekToFirst(); + } + for (; iter->Valid(); iter->Next()) { + Slice key = iter->key(); + Slice value = iter->value(); + ++i; + if (read_num > 0 && i > read_num) + break; + + ParsedInternalKey ikey; + if (!ParseInternalKey(key, &ikey)) { + std::cerr << "Internal Key [" + << key.ToString(true /* in hex*/) + << "] parse error!\n"; + continue; + } + + // If end marker was specified, we stop before it + if (has_to && BytewiseComparator()->Compare(ikey.user_key, to_key) >= 0) { + break; + } + + if (print_kv) { + fprintf(stdout, "%s => %s\n", + ikey.DebugString(output_hex_).c_str(), + value.ToString(output_hex_).c_str()); + } + } + + read_num_ += i; + + Status ret = iter->status(); + delete iter; + return ret; +} + +Status SstFileReader::ReadTableProperties( + std::shared_ptr* table_properties) { + if (!table_reader_) { + return init_result_; + } + + *table_properties = table_reader_->GetTableProperties(); + return init_result_; +} + +} // namespace rocksdb + +static void print_help() { + fprintf(stderr, + "sst_dump [--command=check|scan] [--verify_checksum] " + "--file=data_dir_OR_sst_file" + " [--output_hex]" + " [--input_key_hex]" + " [--from=]" + " [--to=]" + " [--read_num=NUM]" + " [--show_properties]\n"); +} + +namespace { +string HexToString(const string& str) { + string parsed; + if (str[0] != '0' || str[1] != 'x') { + fprintf(stderr, "Invalid hex input %s. 
Must start with 0x\n", + str.c_str()); + throw "Invalid hex input"; + } + + for (unsigned int i = 2; i < str.length();) { + int c; + sscanf(str.c_str() + i, "%2X", &c); + parsed.push_back(c); + i += 2; + } + return parsed; +} +} // namespace + +int main(int argc, char** argv) { + const char* dir_or_file = nullptr; + uint64_t read_num = -1; + std::string command; + + char junk; + uint64_t n; + bool verify_checksum = false; + bool output_hex = false; + bool input_key_hex = false; + bool has_from = false; + bool has_to = false; + bool show_properties = false; + std::string from_key; + std::string to_key; + for (int i = 1; i < argc; i++) { + if (strncmp(argv[i], "--file=", 7) == 0) { + dir_or_file = argv[i] + 7; + } else if (strcmp(argv[i], "--output_hex") == 0) { + output_hex = true; + } else if (strcmp(argv[i], "--input_key_hex") == 0) { + input_key_hex = true; + } else if (sscanf(argv[i], + "--read_num=%lu%c", + (unsigned long*)&n, &junk) == 1) { + read_num = n; + } else if (strcmp(argv[i], "--verify_checksum") == 0) { + verify_checksum = true; + } else if (strncmp(argv[i], "--command=", 10) == 0) { + command = argv[i] + 10; + } else if (strncmp(argv[i], "--from=", 7) == 0) { + from_key = argv[i] + 7; + has_from = true; + } else if (strncmp(argv[i], "--to=", 5) == 0) { + to_key = argv[i] + 5; + has_to = true; + } else if (strcmp(argv[i], "--show_properties") == 0) { + show_properties = true; + } else { + print_help(); + exit(1); + } + } + + + if (input_key_hex) { + if (has_from) { + from_key = HexToString(from_key); + } + if (has_to) { + to_key = HexToString(to_key); + } + } + + if (dir_or_file == nullptr) { + print_help(); + exit(1); + } + + std::vector filenames; + rocksdb::Env* env = rocksdb::Env::Default(); + rocksdb::Status st = env->GetChildren(dir_or_file, &filenames); + bool dir = true; + if (!st.ok()) { + filenames.clear(); + filenames.push_back(dir_or_file); + dir = false; + } + + fprintf(stdout, "from [%s] to [%s]\n", + rocksdb::Slice(from_key).ToString(true).c_str(), + rocksdb::Slice(to_key).ToString(true).c_str()); + + uint64_t total_read = 0; + for (size_t i = 0; i < filenames.size(); i++) { + std::string filename = filenames.at(i); + if (filename.length() <= 4 || + filename.rfind(".sst") != filename.length() - 4) { + // ignore + continue; + } + if (dir) { + filename = std::string(dir_or_file) + "/" + filename; + } + rocksdb::SstFileReader reader(filename, verify_checksum, + output_hex); + rocksdb::Status st; + // scan all files in give file path. + if (command == "" || command == "scan" || command == "check") { + st = reader.ReadSequential(command != "check", + read_num > 0 ? 
(read_num - total_read) : + read_num, + has_from, from_key, has_to, to_key); + if (!st.ok()) { + fprintf(stderr, "%s: %s\n", filename.c_str(), + st.ToString().c_str()); + } + total_read += reader.GetReadNumber(); + if (read_num > 0 && total_read > read_num) { + break; + } + } + if (show_properties) { + std::shared_ptr table_properties; + st = reader.ReadTableProperties(&table_properties); + if (!st.ok()) { + fprintf(stderr, "%s: %s\n", filename.c_str(), st.ToString().c_str()); + } else { + fprintf(stdout, + "Table Properties:\n" + "------------------------------\n" + " %s", + table_properties->ToString("\n ", ": ").c_str()); + fprintf(stdout, "# deleted keys: %zd\n", + rocksdb::GetDeletedKeys( + table_properties->user_collected_properties)); + } + } + } +} diff --git a/util/arena.cc b/util/arena.cc new file mode 100644 index 0000000000..60a01c2453 --- /dev/null +++ b/util/arena.cc @@ -0,0 +1,130 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "util/arena.h" +#include +#include +#include "rocksdb/env.h" + +namespace rocksdb { + +const size_t Arena::kInlineSize; +const size_t Arena::kMinBlockSize = 4096; +const size_t Arena::kMaxBlockSize = 2 << 30; +static const int kAlignUnit = sizeof(void*); + +size_t OptimizeBlockSize(size_t block_size) { + // Make sure block_size is in optimal range + block_size = std::max(Arena::kMinBlockSize, block_size); + block_size = std::min(Arena::kMaxBlockSize, block_size); + + // make sure block_size is the multiple of kAlignUnit + if (block_size % kAlignUnit != 0) { + block_size = (1 + block_size / kAlignUnit) * kAlignUnit; + } + + return block_size; +} + +Arena::Arena(size_t block_size) : kBlockSize(OptimizeBlockSize(block_size)) { + assert(kBlockSize >= kMinBlockSize && kBlockSize <= kMaxBlockSize && + kBlockSize % kAlignUnit == 0); + alloc_bytes_remaining_ = sizeof(inline_block_); + blocks_memory_ += alloc_bytes_remaining_; + aligned_alloc_ptr_ = inline_block_; + unaligned_alloc_ptr_ = inline_block_ + alloc_bytes_remaining_; +} + +Arena::~Arena() { + for (const auto& block : blocks_) { + delete[] block; + } + for (const auto& mmap_info : huge_blocks_) { + auto ret = munmap(mmap_info.addr_, mmap_info.length_); + if (ret != 0) { + // TODO(sdong): Better handling + } + } +} + +char* Arena::AllocateFallback(size_t bytes, bool aligned) { + if (bytes > kBlockSize / 4) { + ++irregular_block_num; + // Object is more than a quarter of our block size. Allocate it separately + // to avoid wasting too much space in leftover bytes. + return AllocateNewBlock(bytes); + } + + // We waste the remaining space in the current block. 
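+  // Worked example (illustrative numbers): with kBlockSize = 4096 and only
+  // 100 bytes left in the active block, a 200-byte request is not
+  // "irregular" (200 <= 4096 / 4), so the 100 leftover bytes are abandoned
+  // and a fresh 4096-byte block becomes the active one.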
+  auto block_head = AllocateNewBlock(kBlockSize);
+  alloc_bytes_remaining_ = kBlockSize - bytes;
+
+  if (aligned) {
+    aligned_alloc_ptr_ = block_head + bytes;
+    unaligned_alloc_ptr_ = block_head + kBlockSize;
+    return block_head;
+  } else {
+    aligned_alloc_ptr_ = block_head;
+    unaligned_alloc_ptr_ = block_head + kBlockSize - bytes;
+    return unaligned_alloc_ptr_;
+  }
+}
+
+char* Arena::AllocateAligned(size_t bytes, size_t huge_page_size,
+                             Logger* logger) {
+  assert((kAlignUnit & (kAlignUnit - 1)) ==
+         0);  // Pointer size should be a power of 2
+
+#ifdef MAP_HUGETLB
+  if (huge_page_size > 0 && bytes > 0) {
+    // Allocate from a huge page TLB.
+    assert(logger != nullptr);  // logger needs to be passed in.
+    size_t reserved_size =
+        ((bytes - 1U) / huge_page_size + 1U) * huge_page_size;
+    assert(reserved_size >= bytes);
+    void* addr = mmap(nullptr, reserved_size, (PROT_READ | PROT_WRITE),
+                      (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB), 0, 0);
+
+    if (addr == MAP_FAILED) {
+      Warn(logger, "AllocateAligned failed to allocate huge TLB pages: %s",
+           strerror(errno));
+      // fall back to malloc
+    } else {
+      blocks_memory_ += reserved_size;
+      huge_blocks_.push_back(MmapInfo(addr, reserved_size));
+      return reinterpret_cast<char*>(addr);
+    }
+  }
+#endif
+
+  size_t current_mod =
+      reinterpret_cast<uintptr_t>(aligned_alloc_ptr_) & (kAlignUnit - 1);
+  size_t slop = (current_mod == 0 ? 0 : kAlignUnit - current_mod);
+  size_t needed = bytes + slop;
+  char* result;
+  if (needed <= alloc_bytes_remaining_) {
+    result = aligned_alloc_ptr_ + slop;
+    aligned_alloc_ptr_ += needed;
+    alloc_bytes_remaining_ -= needed;
+  } else {
+    // AllocateFallback always returns aligned memory
+    result = AllocateFallback(bytes, true /* aligned */);
+  }
+  assert((reinterpret_cast<uintptr_t>(result) & (kAlignUnit - 1)) == 0);
+  return result;
+}
+
+char* Arena::AllocateNewBlock(size_t block_bytes) {
+  char* block = new char[block_bytes];
+  blocks_memory_ += block_bytes;
+  blocks_.push_back(block);
+  return block;
+}
+
+}  // namespace rocksdb
diff --git a/util/arena.h b/util/arena.h
new file mode 100644
index 0000000000..0855c205c4
--- /dev/null
+++ b/util/arena.h
@@ -0,0 +1,128 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+// Arena allocates memory in blocks. For a small request, it hands out a
+// piece of a block with a pre-defined block size; for a big request, it
+// allocates a separate block sized to the request.
+
+#pragma once
+#include
+#include
+#include
+#include
+#include
+#include "util/arena.h"
+
+namespace rocksdb {
+
+class Logger;
+
+const size_t kInlineSize = 2048;
+
+class Arena {
+ public:
+  // No copying allowed
+  Arena(const Arena&) = delete;
+  void operator=(const Arena&) = delete;
+
+  static const size_t kInlineSize = 2048;
+  static const size_t kMinBlockSize;
+  static const size_t kMaxBlockSize;
+
+  explicit Arena(size_t block_size = kMinBlockSize);
+  ~Arena();
+
+  char* Allocate(size_t bytes);
+
+  // huge_page_size: if >0, will try to allocate from the huge page TLB.
+  // The argument is the size of a huge page. Bytes will be rounded up to a
+  // multiple of the page size to allocate through mmap's anonymous option
+  // with huge pages on. The extra space allocated will be wasted. If the
+  // allocation fails, it will fall back to the normal case. To enable it,
+  // huge pages need to be reserved first, e.g.:
+  //     sysctl -w vm.nr_hugepages=20
+  // See the Linux doc Documentation/vm/hugetlbpage.txt for details.
+  // Failures are logged to logger, so when calling with huge_page_size > 0
+  // we strongly recommend passing in a logger. Otherwise, the error message
+  // will be printed out to stderr directly.
+  char* AllocateAligned(size_t bytes, size_t huge_page_size = 0,
+                        Logger* logger = nullptr);
+
+  // Returns an estimate of the total memory usage of data allocated
+  // by the arena (excluding the space allocated but not yet used for future
+  // allocations).
+  size_t ApproximateMemoryUsage() const {
+    return blocks_memory_ + blocks_.capacity() * sizeof(char*) -
+           alloc_bytes_remaining_;
+  }
+
+  size_t MemoryAllocatedBytes() const { return blocks_memory_; }
+
+  size_t AllocatedAndUnused() const { return alloc_bytes_remaining_; }
+
+  // If an allocation is too big, we'll allocate an irregular block with the
+  // same size as that allocation.
+  virtual size_t IrregularBlockNum() const { return irregular_block_num; }
+
+  size_t BlockSize() const { return kBlockSize; }
+
+ private:
+  char inline_block_[kInlineSize];
+  // Number of bytes allocated in one block
+  const size_t kBlockSize;
+  // Array of new[] allocated memory blocks
+  typedef std::vector<char*> Blocks;
+  Blocks blocks_;
+
+  struct MmapInfo {
+    void* addr_;
+    size_t length_;
+
+    MmapInfo(void* addr, size_t length) : addr_(addr), length_(length) {}
+  };
+  std::vector<MmapInfo> huge_blocks_;
+  size_t irregular_block_num = 0;
+
+  // Stats for current active block.
+  // For each block, we allocate aligned memory chunks from one end and
+  // allocate unaligned memory chunks from the other end. Otherwise the
+  // memory waste for alignment will be higher if we allocate both types of
+  // memory from one direction.
+  char* unaligned_alloc_ptr_ = nullptr;
+  char* aligned_alloc_ptr_ = nullptr;
+  // How many bytes are left in the currently active block?
+  size_t alloc_bytes_remaining_ = 0;
+
+  char* AllocateFallback(size_t bytes, bool aligned);
+  char* AllocateNewBlock(size_t block_bytes);
+
+  // Bytes of memory in blocks allocated so far
+  size_t blocks_memory_ = 0;
+};
+
+inline char* Arena::Allocate(size_t bytes) {
+  // The semantics of what to return are a bit messy if we allow
+  // 0-byte allocations, so we disallow them here (we don't need
+  // them for our internal use).
+  assert(bytes > 0);
+  if (bytes <= alloc_bytes_remaining_) {
+    unaligned_alloc_ptr_ -= bytes;
+    alloc_bytes_remaining_ -= bytes;
+    return unaligned_alloc_ptr_;
+  }
+  return AllocateFallback(bytes, false /* unaligned */);
+}
+
+// check and adjust the block_size so that the return value is
+// 1. in the range of [kMinBlockSize, kMaxBlockSize];
+// 2. a multiple of the align unit.
+extern size_t OptimizeBlockSize(size_t block_size);
+
+}  // namespace rocksdb
diff --git a/util/arena_test.cc b/util/arena_test.cc
new file mode 100644
index 0000000000..7b6cfd0af1
--- /dev/null
+++ b/util/arena_test.cc
@@ -0,0 +1,142 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "util/arena.h" +#include "util/random.h" +#include "util/testharness.h" + +namespace rocksdb { + +class ArenaTest {}; + +TEST(ArenaTest, Empty) { Arena arena0; } + +TEST(ArenaTest, MemoryAllocatedBytes) { + const int N = 17; + size_t req_sz; // requested size + size_t bsz = 8192; // block size + size_t expected_memory_allocated; + + Arena arena(bsz); + + // requested size > quarter of a block: + // allocate requested size separately + req_sz = 3001; + for (int i = 0; i < N; i++) { + arena.Allocate(req_sz); + } + expected_memory_allocated = req_sz * N + Arena::kInlineSize; + ASSERT_EQ(arena.MemoryAllocatedBytes(), expected_memory_allocated); + + arena.Allocate(Arena::kInlineSize - 1); + + // requested size < quarter of a block: + // allocate a block with the default size, then try to use unused part + // of the block. So one new block will be allocated for the first + // Allocate(99) call. All the remaining calls won't lead to new allocation. + req_sz = 99; + for (int i = 0; i < N; i++) { + arena.Allocate(req_sz); + } + expected_memory_allocated += bsz; + ASSERT_EQ(arena.MemoryAllocatedBytes(), expected_memory_allocated); + + // requested size > quarter of a block: + // allocate requested size separately + req_sz = 99999999; + for (int i = 0; i < N; i++) { + arena.Allocate(req_sz); + } + expected_memory_allocated += req_sz * N; + ASSERT_EQ(arena.MemoryAllocatedBytes(), expected_memory_allocated); +} + +// Make sure we didn't count the allocate but not used memory space in +// Arena::ApproximateMemoryUsage() +TEST(ArenaTest, ApproximateMemoryUsageTest) { + const size_t kBlockSize = 4096; + const size_t kEntrySize = kBlockSize / 8; + const size_t kZero = 0; + Arena arena(kBlockSize); + ASSERT_EQ(kZero, arena.ApproximateMemoryUsage()); + + // allocate inline bytes + arena.AllocateAligned(8); + arena.AllocateAligned(Arena::kInlineSize / 2 - 16); + arena.AllocateAligned(Arena::kInlineSize / 2); + ASSERT_EQ(arena.ApproximateMemoryUsage(), Arena::kInlineSize - 8); + ASSERT_EQ(arena.MemoryAllocatedBytes(), Arena::kInlineSize); + + auto num_blocks = kBlockSize / kEntrySize; + + // first allocation + arena.AllocateAligned(kEntrySize); + auto mem_usage = arena.MemoryAllocatedBytes(); + ASSERT_EQ(mem_usage, kBlockSize + Arena::kInlineSize); + auto usage = arena.ApproximateMemoryUsage(); + ASSERT_LT(usage, mem_usage); + for (size_t i = 1; i < num_blocks; ++i) { + arena.AllocateAligned(kEntrySize); + ASSERT_EQ(mem_usage, arena.MemoryAllocatedBytes()); + ASSERT_EQ(arena.ApproximateMemoryUsage(), usage + kEntrySize); + usage = arena.ApproximateMemoryUsage(); + } + ASSERT_GT(usage, mem_usage); +} + +TEST(ArenaTest, Simple) { + std::vector> allocated; + Arena arena; + const int N = 100000; + size_t bytes = 0; + Random rnd(301); + for (int i = 0; i < N; i++) { + size_t s; + if (i % (N / 10) == 0) { + s = i; + } else { + s = rnd.OneIn(4000) + ? rnd.Uniform(6000) + : (rnd.OneIn(10) ? rnd.Uniform(100) : rnd.Uniform(20)); + } + if (s == 0) { + // Our arena disallows size 0 allocations. 
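+      // Allocate() asserts bytes > 0, so bump zero-sized requests to one
+      // byte to keep the assert from firing.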
+ s = 1; + } + char* r; + if (rnd.OneIn(10)) { + r = arena.AllocateAligned(s); + } else { + r = arena.Allocate(s); + } + + for (unsigned int b = 0; b < s; b++) { + // Fill the "i"th allocation with a known bit pattern + r[b] = i % 256; + } + bytes += s; + allocated.push_back(std::make_pair(s, r)); + ASSERT_GE(arena.ApproximateMemoryUsage(), bytes); + if (i > N / 10) { + ASSERT_LE(arena.ApproximateMemoryUsage(), bytes * 1.10); + } + } + for (unsigned int i = 0; i < allocated.size(); i++) { + size_t num_bytes = allocated[i].first; + const char* p = allocated[i].second; + for (unsigned int b = 0; b < num_bytes; b++) { + // Check the "i"th allocation for the known bit pattern + ASSERT_EQ(int(p[b]) & 0xff, (int)(i % 256)); + } + } +} + +} // namespace rocksdb + +int main(int argc, char** argv) { return rocksdb::test::RunAllTests(); } diff --git a/util/auto_roll_logger.cc b/util/auto_roll_logger.cc new file mode 100644 index 0000000000..19c2b8ca3b --- /dev/null +++ b/util/auto_roll_logger.cc @@ -0,0 +1,116 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +#include "util/auto_roll_logger.h" +#include "util/mutexlock.h" + +using namespace std; + +namespace rocksdb { + +// -- AutoRollLogger +Status AutoRollLogger::ResetLogger() { + status_ = env_->NewLogger(log_fname_, &logger_); + + if (!status_.ok()) { + return status_; + } + + if (logger_->GetLogFileSize() == + (size_t)Logger::DO_NOT_SUPPORT_GET_LOG_FILE_SIZE) { + status_ = Status::NotSupported( + "The underlying logger doesn't support GetLogFileSize()"); + } + if (status_.ok()) { + cached_now = static_cast(env_->NowMicros() * 1e-6); + ctime_ = cached_now; + cached_now_access_count = 0; + } + + return status_; +} + +void AutoRollLogger::RollLogFile() { + std::string old_fname = OldInfoLogFileName( + dbname_, env_->NowMicros(), db_absolute_path_, db_log_dir_); + env_->RenameFile(log_fname_, old_fname); +} + +void AutoRollLogger::Logv(const char* format, va_list ap) { + assert(GetStatus().ok()); + + std::shared_ptr logger; + { + MutexLock l(&mutex_); + if ((kLogFileTimeToRoll > 0 && LogExpired()) || + (kMaxLogFileSize > 0 && logger_->GetLogFileSize() >= kMaxLogFileSize)) { + RollLogFile(); + Status s = ResetLogger(); + if (!s.ok()) { + // can't really log the error if creating a new LOG file failed + return; + } + } + + // pin down the current logger_ instance before releasing the mutex. + logger = logger_; + } + + // Another thread could have put a new Logger instance into logger_ by now. + // However, since logger is still hanging on to the previous instance + // (reference count is not zero), we don't have to worry about it being + // deleted while we are accessing it. + // Note that logv itself is not mutex protected to allow maximum concurrency, + // as thread safety should have been handled by the underlying logger. 
+ logger->Logv(format, ap); +} + +bool AutoRollLogger::LogExpired() { + if (cached_now_access_count >= call_NowMicros_every_N_records_) { + cached_now = static_cast(env_->NowMicros() * 1e-6); + cached_now_access_count = 0; + } + + ++cached_now_access_count; + return cached_now >= ctime_ + kLogFileTimeToRoll; +} + +Status CreateLoggerFromOptions( + const std::string& dbname, + const std::string& db_log_dir, + Env* env, + const DBOptions& options, + std::shared_ptr* logger) { + std::string db_absolute_path; + env->GetAbsolutePath(dbname, &db_absolute_path); + std::string fname = InfoLogFileName(dbname, db_absolute_path, db_log_dir); + + // Currently we only support roll by time-to-roll and log size + if (options.log_file_time_to_roll > 0 || options.max_log_file_size > 0) { + AutoRollLogger* result = new AutoRollLogger( + env, dbname, db_log_dir, + options.max_log_file_size, + options.log_file_time_to_roll, options.info_log_level); + Status s = result->GetStatus(); + if (!s.ok()) { + delete result; + } else { + logger->reset(result); + } + return s; + } else { + // Open a log file in the same directory as the db + env->CreateDir(dbname); // In case it does not exist + env->RenameFile(fname, OldInfoLogFileName(dbname, env->NowMicros(), + db_absolute_path, db_log_dir)); + auto s = env->NewLogger(fname, logger); + if (logger->get() != nullptr) { + (*logger)->SetInfoLogLevel(options.info_log_level); + } + return s; + } +} + +} // namespace rocksdb diff --git a/util/auto_roll_logger.h b/util/auto_roll_logger.h new file mode 100644 index 0000000000..c592d79ce8 --- /dev/null +++ b/util/auto_roll_logger.h @@ -0,0 +1,91 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Logger implementation that can be shared by all environments +// where enough posix functionality is available. + +#pragma once +#include "db/filename.h" +#include "port/port.h" +#include "util/posix_logger.h" + +namespace rocksdb { + +// Rolls the log file by size and/or time +class AutoRollLogger : public Logger { + public: + AutoRollLogger(Env* env, const std::string& dbname, + const std::string& db_log_dir, size_t log_max_size, + size_t log_file_time_to_roll, + const InfoLogLevel log_level = InfoLogLevel::INFO_LEVEL) + : Logger(log_level), + dbname_(dbname), + db_log_dir_(db_log_dir), + env_(env), + status_(Status::OK()), + kMaxLogFileSize(log_max_size), + kLogFileTimeToRoll(log_file_time_to_roll), + cached_now(static_cast(env_->NowMicros() * 1e-6)), + ctime_(cached_now), + cached_now_access_count(0), + call_NowMicros_every_N_records_(100), + mutex_() { + env->GetAbsolutePath(dbname, &db_absolute_path_); + log_fname_ = InfoLogFileName(dbname_, db_absolute_path_, db_log_dir_); + RollLogFile(); + ResetLogger(); + } + + void Logv(const char* format, va_list ap); + + // check if the logger has encountered any problem. + Status GetStatus() { + return status_; + } + + size_t GetLogFileSize() const { + return logger_->GetLogFileSize(); + } + + virtual ~AutoRollLogger() { + } + + void SetCallNowMicrosEveryNRecords(uint64_t call_NowMicros_every_N_records) { + call_NowMicros_every_N_records_ = call_NowMicros_every_N_records; + } + + private: + + bool LogExpired(); + Status ResetLogger(); + void RollLogFile(); + + std::string log_fname_; // Current active info log's file name. 
+  std::string dbname_;
+  std::string db_log_dir_;
+  std::string db_absolute_path_;
+  Env* env_;
+  std::shared_ptr<Logger> logger_;
+  // current status of the logger
+  Status status_;
+  const size_t kMaxLogFileSize;
+  const size_t kLogFileTimeToRoll;
+  // to avoid frequent env->NowMicros() calls, we cache the current time
+  uint64_t cached_now;
+  uint64_t ctime_;
+  uint64_t cached_now_access_count;
+  uint64_t call_NowMicros_every_N_records_;
+  port::Mutex mutex_;
+};
+
+// Facade to create a logger automatically
+Status CreateLoggerFromOptions(
+    const std::string& dbname,
+    const std::string& db_log_dir,
+    Env* env,
+    const DBOptions& options,
+    std::shared_ptr<Logger>* logger);
+
+} // namespace rocksdb
diff --git a/util/auto_roll_logger_test.cc b/util/auto_roll_logger_test.cc
new file mode 100755
index 0000000000..c49894f596
--- /dev/null
+++ b/util/auto_roll_logger_test.cc
@@ -0,0 +1,292 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+#include <string>
+#include <fstream>
+#include <iostream>
+#include <iterator>
+#include <algorithm>
+#include <vector>
+#include "util/testharness.h"
+#include "util/auto_roll_logger.h"
+#include "rocksdb/db.h"
+#include <sys/stat.h>
+#include <errno.h>
+
+using namespace std;
+
+namespace rocksdb {
+
+class AutoRollLoggerTest {
+ public:
+  static void InitTestDb() {
+    string deleteCmd = "rm -rf " + kTestDir;
+    ASSERT_TRUE(system(deleteCmd.c_str()) == 0);
+    Env::Default()->CreateDir(kTestDir);
+  }
+
+  void RollLogFileBySizeTest(AutoRollLogger* logger,
+                             size_t log_max_size,
+                             const string& log_message);
+  uint64_t RollLogFileByTimeTest(AutoRollLogger* logger,
+                                 size_t time,
+                                 const string& log_message);
+
+  static const string kSampleMessage;
+  static const string kTestDir;
+  static const string kLogFile;
+  static Env* env;
+};
+
+const string AutoRollLoggerTest::kSampleMessage(
+    "this is the message to be written to the log file!!");
+const string AutoRollLoggerTest::kTestDir(test::TmpDir() + "/db_log_test");
+const string AutoRollLoggerTest::kLogFile(test::TmpDir() + "/db_log_test/LOG");
+Env* AutoRollLoggerTest::env = Env::Default();
+
+// In this test we only want to Log some simple log message with
+// no format. LogMessage() provides such a simple interface and
+// avoids the [format-security] warning which occurs when you
+// call Log(logger, log_message) directly.
+namespace {
+void LogMessage(Logger* logger, const char* message) {
+  Log(logger, "%s", message);
+}
+
+void LogMessage(const InfoLogLevel log_level, Logger* logger,
+                const char* message) {
+  Log(log_level, logger, "%s", message);
+}
+}  // namespace
+
+namespace {
+void GetFileCreateTime(const std::string& fname, uint64_t* file_ctime) {
+  struct stat s;
+  if (stat(fname.c_str(), &s) != 0) {
+    // Leave 0 as the "unknown" marker rather than reading an
+    // uninitialized stat buffer below.
+    *file_ctime = (uint64_t)0;
+    return;
+  }
+  *file_ctime = static_cast<uint64_t>(s.st_ctime);
+}
+}  // namespace
+
+void AutoRollLoggerTest::RollLogFileBySizeTest(AutoRollLogger* logger,
+                                               size_t log_max_size,
+                                               const string& log_message) {
+  logger->SetInfoLogLevel(InfoLogLevel::INFO_LEVEL);
+  // measure the size of each message, which is supposed
+  // to be equal to or greater than log_message.size()
+  LogMessage(logger, log_message.c_str());
+  size_t message_size = logger->GetLogFileSize();
+  size_t current_log_size = message_size;
+
+  // Test the case where the log file should not yet be rolled.
+  while (current_log_size + message_size < log_max_size) {
+    LogMessage(logger, log_message.c_str());
+    current_log_size += message_size;
+    ASSERT_EQ(current_log_size, logger->GetLogFileSize());
+  }
+
+  // The next message pushes the file to (or past) log_max_size, so the log
+  // is now due to be rolled.
+  LogMessage(logger, log_message.c_str());
+  // Since rotation is checked before actual logging, we need to log one
+  // more message to trigger the roll.
+  LogMessage(logger, log_message.c_str());
+
+  ASSERT_EQ(message_size, logger->GetLogFileSize());
+}
+
+uint64_t AutoRollLoggerTest::RollLogFileByTimeTest(
+    AutoRollLogger* logger, size_t time, const string& log_message) {
+  uint64_t expected_create_time;
+  uint64_t actual_create_time;
+  uint64_t total_log_size;
+  ASSERT_OK(env->GetFileSize(kLogFile, &total_log_size));
+  GetFileCreateTime(kLogFile, &expected_create_time);
+  logger->SetCallNowMicrosEveryNRecords(0);
+
+  // -- Write to the log several times; this should finish well before the
+  // roll interval of `time` seconds elapses.
+  for (int i = 0; i < 10; ++i) {
+    LogMessage(logger, log_message.c_str());
+    ASSERT_OK(logger->GetStatus());
+    // Make sure we always write to the same log file (by
+    // checking the creation time).
+    GetFileCreateTime(kLogFile, &actual_create_time);
+
+    // Also make sure the log size is increasing.
+    ASSERT_EQ(expected_create_time, actual_create_time);
+    ASSERT_GT(logger->GetLogFileSize(), total_log_size);
+    total_log_size = logger->GetLogFileSize();
+  }
+
+  // -- Let the log file expire
+  sleep(time);
+  LogMessage(logger, log_message.c_str());
+
+  // By now a new log file should have been created.
+  GetFileCreateTime(kLogFile, &actual_create_time);
+  ASSERT_GT(actual_create_time, expected_create_time);
+  ASSERT_LT(logger->GetLogFileSize(), total_log_size);
+  expected_create_time = actual_create_time;
+
+  return expected_create_time;
+}
+
+TEST(AutoRollLoggerTest, RollLogFileBySize) {
+  InitTestDb();
+  size_t log_max_size = 1024 * 5;
+
+  AutoRollLogger logger(Env::Default(), kTestDir, "", log_max_size, 0);
+
+  RollLogFileBySizeTest(&logger, log_max_size,
+                        kSampleMessage + ":RollLogFileBySize");
+}
+
+TEST(AutoRollLoggerTest, RollLogFileByTime) {
+  size_t time = 1;
+  size_t log_size = 1024 * 5;
+
+  InitTestDb();
+  // -- Verify that constructing the logger creates the log file, as it
+  // would on a server restart.
+  ASSERT_TRUE(!env->FileExists(kLogFile));
+  AutoRollLogger logger(Env::Default(), kTestDir, "", log_size, 1);
+  ASSERT_TRUE(env->FileExists(kLogFile));
+
+  RollLogFileByTimeTest(&logger, time, kSampleMessage + ":RollLogFileByTime");
+}
+
+TEST(AutoRollLoggerTest,
+     OpenLogFilesMultipleTimesWithOptionLog_max_size) {
+  // If only the 'log_max_size' option is specified, then every time
+  // rocksdb is restarted, a new empty log file will be created.
+  InitTestDb();
+  // WORKAROUND:
+  // avoid the compiler's complaint of "comparison between signed
+  // and unsigned integer expressions", because the literal 0 is
+  // treated as "signed".
+  size_t kZero = 0;
+  size_t log_size = 1024;
+
+  AutoRollLogger* logger = new AutoRollLogger(
+      Env::Default(), kTestDir, "", log_size, 0);
+
+  LogMessage(logger, kSampleMessage.c_str());
+  ASSERT_GT(logger->GetLogFileSize(), kZero);
+  delete logger;
+
+  // Reopening should create a new, empty log file.
+ logger = new AutoRollLogger( + Env::Default(), kTestDir, "", log_size, 0); + ASSERT_EQ(logger->GetLogFileSize(), kZero); + delete logger; +} + +TEST(AutoRollLoggerTest, CompositeRollByTimeAndSizeLogger) { + size_t time = 1, log_max_size = 1024 * 5; + + InitTestDb(); + + AutoRollLogger logger(Env::Default(), kTestDir, "", log_max_size, time); + + // Test the ability to roll by size + RollLogFileBySizeTest( + &logger, log_max_size, + kSampleMessage + ":CompositeRollByTimeAndSizeLogger"); + + // Test the ability to roll by Time + RollLogFileByTimeTest( &logger, time, + kSampleMessage + ":CompositeRollByTimeAndSizeLogger"); +} + +TEST(AutoRollLoggerTest, CreateLoggerFromOptions) { + DBOptions options; + shared_ptr logger; + + // Normal logger + ASSERT_OK(CreateLoggerFromOptions(kTestDir, "", env, options, &logger)); + ASSERT_TRUE(dynamic_cast(logger.get())); + + // Only roll by size + InitTestDb(); + options.max_log_file_size = 1024; + ASSERT_OK(CreateLoggerFromOptions(kTestDir, "", env, options, &logger)); + AutoRollLogger* auto_roll_logger = + dynamic_cast(logger.get()); + ASSERT_TRUE(auto_roll_logger); + RollLogFileBySizeTest( + auto_roll_logger, options.max_log_file_size, + kSampleMessage + ":CreateLoggerFromOptions - size"); + + // Only roll by Time + InitTestDb(); + options.max_log_file_size = 0; + options.log_file_time_to_roll = 1; + ASSERT_OK(CreateLoggerFromOptions(kTestDir, "", env, options, &logger)); + auto_roll_logger = + dynamic_cast(logger.get()); + RollLogFileByTimeTest( + auto_roll_logger, options.log_file_time_to_roll, + kSampleMessage + ":CreateLoggerFromOptions - time"); + + // roll by both Time and size + InitTestDb(); + options.max_log_file_size = 1024 * 5; + options.log_file_time_to_roll = 1; + ASSERT_OK(CreateLoggerFromOptions(kTestDir, "", env, options, &logger)); + auto_roll_logger = + dynamic_cast(logger.get()); + RollLogFileBySizeTest( + auto_roll_logger, options.max_log_file_size, + kSampleMessage + ":CreateLoggerFromOptions - both"); + RollLogFileByTimeTest( + auto_roll_logger, options.log_file_time_to_roll, + kSampleMessage + ":CreateLoggerFromOptions - both"); +} + +TEST(AutoRollLoggerTest, InfoLogLevel) { + InitTestDb(); + + size_t log_size = 8192; + size_t log_lines = 0; + // an extra-scope to force the AutoRollLogger to flush the log file when it + // becomes out of scope. + { + AutoRollLogger logger(Env::Default(), kTestDir, "", log_size, 0); + for (int log_level = InfoLogLevel::FATAL_LEVEL; + log_level >= InfoLogLevel::DEBUG_LEVEL; log_level--) { + logger.SetInfoLogLevel((InfoLogLevel)log_level); + for (int log_type = InfoLogLevel::DEBUG_LEVEL; + log_type <= InfoLogLevel::FATAL_LEVEL; log_type++) { + // log messages with log level smaller than log_level will not be + // logged. + LogMessage((InfoLogLevel)log_type, &logger, kSampleMessage.c_str()); + } + log_lines += InfoLogLevel::FATAL_LEVEL - log_level + 1; + } + for (int log_level = InfoLogLevel::FATAL_LEVEL; + log_level >= InfoLogLevel::DEBUG_LEVEL; log_level--) { + logger.SetInfoLogLevel((InfoLogLevel)log_level); + + // again, messages with level smaller than log_level will not be logged. 
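+      // Each helper below logs at its namesake severity, so with the level
+      // filter in place only (FATAL_LEVEL - log_level + 1) of the five
+      // calls actually reach the file; this matches the bookkeeping below.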
+ Debug(&logger, "%s", kSampleMessage.c_str()); + Info(&logger, "%s", kSampleMessage.c_str()); + Warn(&logger, "%s", kSampleMessage.c_str()); + Error(&logger, "%s", kSampleMessage.c_str()); + Fatal(&logger, "%s", kSampleMessage.c_str()); + log_lines += InfoLogLevel::FATAL_LEVEL - log_level + 1; + } + } + std::ifstream inFile(AutoRollLoggerTest::kLogFile.c_str()); + size_t lines = std::count(std::istreambuf_iterator(inFile), + std::istreambuf_iterator(), '\n'); + ASSERT_EQ(log_lines, lines); + inFile.close(); +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +} diff --git a/util/autovector.h b/util/autovector.h new file mode 100644 index 0000000000..b57cedfc1b --- /dev/null +++ b/util/autovector.h @@ -0,0 +1,319 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +#pragma once + +#include +#include +#include +#include +#include + +namespace rocksdb { + +#ifdef ROCKSDB_LITE +template +class autovector : public std::vector {}; +#else +// A vector that leverages pre-allocated stack-based array to achieve better +// performance for array with small amount of items. +// +// The interface resembles that of vector, but with less features since we aim +// to solve the problem that we have in hand, rather than implementing a +// full-fledged generic container. +// +// Currently we don't support: +// * reserve()/shrink_to_fit() +// If used correctly, in most cases, people should not touch the +// underlying vector at all. +// * random insert()/erase(), please only use push_back()/pop_back(). +// * No move/swap operations. Each autovector instance has a +// stack-allocated array and if we want support move/swap operations, we +// need to copy the arrays other than just swapping the pointers. In this +// case we'll just explicitly forbid these operations since they may +// lead users to make false assumption by thinking they are inexpensive +// operations. +// +// Naming style of public methods almost follows that of the STL's. +template +class autovector { + public: + // General STL-style container member types. 
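+  // (These mirror std::vector's nested types so that autovector works with
+  // standard algorithms and range-based for loops.)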
+ typedef T value_type; + typedef typename std::vector::difference_type difference_type; + typedef typename std::vector::size_type size_type; + typedef value_type& reference; + typedef const value_type& const_reference; + typedef value_type* pointer; + typedef const value_type* const_pointer; + + // This class is the base for regular/const iterator + template + class iterator_impl { + public: + // -- iterator traits + typedef iterator_impl self_type; + typedef TValueType value_type; + typedef TValueType& reference; + typedef TValueType* pointer; + typedef typename TAutoVector::difference_type difference_type; + typedef std::random_access_iterator_tag iterator_category; + + iterator_impl(TAutoVector* vect, size_t index) + : vect_(vect), index_(index) {}; + iterator_impl(const iterator_impl&) = default; + ~iterator_impl() {} + iterator_impl& operator=(const iterator_impl&) = default; + + // -- Advancement + // iterator++ + self_type& operator++() { + ++index_; + return *this; + } + + // ++iterator + self_type operator++(int) { + auto old = *this; + ++index_; + return old; + } + + // iterator-- + self_type& operator--() { + --index_; + return *this; + } + + // --iterator + self_type operator--(int) { + auto old = *this; + --index_; + return old; + } + + self_type operator-(difference_type len) { + return self_type(vect_, index_ - len); + } + + difference_type operator-(const self_type& other) { + assert(vect_ == other.vect_); + return index_ - other.index_; + } + + self_type operator+(difference_type len) { + return self_type(vect_, index_ + len); + } + + self_type& operator+=(difference_type len) { + index_ += len; + return *this; + } + + self_type& operator-=(difference_type len) { + index_ -= len; + return *this; + } + + // -- Reference + reference operator*() { + assert(vect_->size() >= index_); + return (*vect_)[index_]; + } + pointer operator->() { + assert(vect_->size() >= index_); + return &(*vect_)[index_]; + } + + // -- Logical Operators + bool operator==(const self_type& other) const { + assert(vect_ == other.vect_); + return index_ == other.index_; + } + + bool operator!=(const self_type& other) const { return !(*this == other); } + + bool operator>(const self_type& other) const { + assert(vect_ == other.vect_); + return index_ > other.index_; + } + + bool operator<(const self_type& other) const { + assert(vect_ == other.vect_); + return index_ < other.index_; + } + + bool operator>=(const self_type& other) const { + assert(vect_ == other.vect_); + return index_ >= other.index_; + } + + bool operator<=(const self_type& other) const { + assert(vect_ == other.vect_); + return index_ <= other.index_; + } + + private: + TAutoVector* vect_ = nullptr; + size_t index_ = 0; + }; + + typedef iterator_impl iterator; + typedef iterator_impl const_iterator; + typedef std::reverse_iterator reverse_iterator; + typedef std::reverse_iterator const_reverse_iterator; + + autovector() = default; + ~autovector() = default; + + // -- Immutable operations + // Indicate if all data resides in in-stack data structure. + bool only_in_stack() const { + // If no element was inserted at all, the vector's capacity will be `0`. 
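+    // (std::vector acquires capacity only on first insertion, and clear()
+    // does not release it, so a zero capacity means nothing has ever
+    // spilled to the heap.)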
+    return vect_.capacity() == 0;
+  }
+
+  size_type size() const { return num_stack_items_ + vect_.size(); }
+
+  // resize() does not guarantee anything about the contents of the newly
+  // available elements
+  void resize(size_type n) {
+    if (n > kSize) {
+      vect_.resize(n - kSize);
+      num_stack_items_ = kSize;
+    } else {
+      vect_.clear();
+      num_stack_items_ = n;
+    }
+  }
+
+  bool empty() const { return size() == 0; }
+
+  // will not check the boundary
+  const_reference operator[](size_type n) const {
+    return n < kSize ? values_[n] : vect_[n - kSize];
+  }
+
+  reference operator[](size_type n) {
+    return n < kSize ? values_[n] : vect_[n - kSize];
+  }
+
+  // will check the boundary
+  const_reference at(size_type n) const {
+    if (n >= size()) {
+      throw std::out_of_range("autovector: index out of range");
+    }
+    return (*this)[n];
+  }
+
+  reference at(size_type n) {
+    if (n >= size()) {
+      throw std::out_of_range("autovector: index out of range");
+    }
+    return (*this)[n];
+  }
+
+  reference front() {
+    assert(!empty());
+    return *begin();
+  }
+
+  const_reference front() const {
+    assert(!empty());
+    return *begin();
+  }
+
+  reference back() {
+    assert(!empty());
+    return *(end() - 1);
+  }
+
+  const_reference back() const {
+    assert(!empty());
+    return *(end() - 1);
+  }
+
+  // -- Mutable Operations
+  void push_back(T&& item) {
+    if (num_stack_items_ < kSize) {
+      values_[num_stack_items_++] = std::move(item);
+    } else {
+      vect_.push_back(item);
+    }
+  }
+
+  void push_back(const T& item) { push_back(value_type(item)); }
+
+  template <class... Args>
+  void emplace_back(Args&&... args) {
+    push_back(value_type(args...));
+  }
+
+  void pop_back() {
+    assert(!empty());
+    if (!vect_.empty()) {
+      vect_.pop_back();
+    } else {
+      --num_stack_items_;
+    }
+  }
+
+  void clear() {
+    num_stack_items_ = 0;
+    vect_.clear();
+  }
+
+  // -- Copy and Assignment
+  autovector& assign(const autovector& other);
+
+  autovector(const autovector& other) { assign(other); }
+
+  autovector& operator=(const autovector& other) { return assign(other); }
+
+  // Move operations are disallowed, since it is very hard to make sure both
+  // autovectors are allocated from the same function stack.
+  autovector& operator=(autovector&& other) = delete;
+  autovector(autovector&& other) = delete;
+
+  // -- Iterator Operations
+  iterator begin() { return iterator(this, 0); }
+
+  const_iterator begin() const { return const_iterator(this, 0); }
+
+  iterator end() { return iterator(this, this->size()); }
+
+  const_iterator end() const { return const_iterator(this, this->size()); }
+
+  reverse_iterator rbegin() { return reverse_iterator(end()); }
+
+  const_reverse_iterator rbegin() const {
+    return const_reverse_iterator(end());
+  }
+
+  reverse_iterator rend() { return reverse_iterator(begin()); }
+
+  const_reverse_iterator rend() const {
+    return const_reverse_iterator(begin());
+  }
+
+ private:
+  size_type num_stack_items_ = 0;  // current number of items
+  value_type values_[kSize];       // the first `kSize` items
+  // used only if there are more than `kSize` items.
+ std::vector vect_; +}; + +template +autovector& autovector::assign(const autovector& other) { + // copy the internal vector + vect_.assign(other.vect_.begin(), other.vect_.end()); + + // copy array + num_stack_items_ = other.num_stack_items_; + std::copy(other.values_, other.values_ + num_stack_items_, values_); + + return *this; +} +#endif // ROCKSDB_LITE +} // namespace rocksdb diff --git a/util/autovector_test.cc b/util/autovector_test.cc new file mode 100644 index 0000000000..25ebaa24b5 --- /dev/null +++ b/util/autovector_test.cc @@ -0,0 +1,316 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#include +#include + +#include "rocksdb/env.h" +#include "util/autovector.h" +#include "util/testharness.h" +#include "util/testutil.h" + +namespace rocksdb { + +using namespace std; + +class AutoVectorTest { }; + +const unsigned long kSize = 8; +TEST(AutoVectorTest, PushBackAndPopBack) { + autovector vec; + ASSERT_TRUE(vec.empty()); + ASSERT_EQ(0ul, vec.size()); + + for (size_t i = 0; i < 1000 * kSize; ++i) { + vec.push_back(i); + ASSERT_TRUE(!vec.empty()); + if (i < kSize) { + ASSERT_TRUE(vec.only_in_stack()); + } else { + ASSERT_TRUE(!vec.only_in_stack()); + } + ASSERT_EQ(i + 1, vec.size()); + ASSERT_EQ(i, vec[i]); + ASSERT_EQ(i, vec.at(i)); + } + + size_t size = vec.size(); + while (size != 0) { + vec.pop_back(); + // will always be in heap + ASSERT_TRUE(!vec.only_in_stack()); + ASSERT_EQ(--size, vec.size()); + } + + ASSERT_TRUE(vec.empty()); +} + +TEST(AutoVectorTest, EmplaceBack) { + typedef std::pair ValueType; + autovector vec; + + for (size_t i = 0; i < 1000 * kSize; ++i) { + vec.emplace_back(i, std::to_string(i + 123)); + ASSERT_TRUE(!vec.empty()); + if (i < kSize) { + ASSERT_TRUE(vec.only_in_stack()); + } else { + ASSERT_TRUE(!vec.only_in_stack()); + } + + ASSERT_EQ(i + 1, vec.size()); + ASSERT_EQ(i, vec[i].first); + ASSERT_EQ(std::to_string(i + 123), vec[i].second); + } + + vec.clear(); + ASSERT_TRUE(vec.empty()); + ASSERT_TRUE(!vec.only_in_stack()); +} + +TEST(AutoVectorTest, Resize) { + autovector vec; + + vec.resize(kSize); + ASSERT_TRUE(vec.only_in_stack()); + for (size_t i = 0; i < kSize; ++i) { + vec[i] = i; + } + + vec.resize(kSize * 2); + ASSERT_TRUE(!vec.only_in_stack()); + for (size_t i = 0; i < kSize; ++i) { + ASSERT_EQ(vec[i], i); + } + for (size_t i = 0; i < kSize; ++i) { + vec[i + kSize] = i; + } + + vec.resize(1); + ASSERT_EQ(1U, vec.size()); +} + +namespace { +void AssertEqual( + const autovector& a, const autovector& b) { + ASSERT_EQ(a.size(), b.size()); + ASSERT_EQ(a.empty(), b.empty()); + ASSERT_EQ(a.only_in_stack(), b.only_in_stack()); + for (size_t i = 0; i < a.size(); ++i) { + ASSERT_EQ(a[i], b[i]); + } +} +} // namespace + +TEST(AutoVectorTest, CopyAndAssignment) { + // Test both heap-allocated and stack-allocated cases. 
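+  // kSize / 2 stays entirely within the in-stack array, while kSize * 1000
+  // forces a spill into the heap-backed vector.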
+  for (auto size : { kSize / 2, kSize * 1000 }) {
+    autovector<size_t, kSize> vec;
+    for (size_t i = 0; i < size; ++i) {
+      vec.push_back(i);
+    }
+
+    {
+      autovector<size_t, kSize> other;
+      other = vec;
+      AssertEqual(other, vec);
+    }
+
+    {
+      autovector<size_t, kSize> other(vec);
+      AssertEqual(other, vec);
+    }
+  }
+}
+
+TEST(AutoVectorTest, Iterators) {
+  autovector<string, kSize> vec;
+  for (size_t i = 0; i < kSize * 1000; ++i) {
+    vec.push_back(std::to_string(i));
+  }
+
+  // basic operator test
+  ASSERT_EQ(vec.front(), *vec.begin());
+  ASSERT_EQ(vec.back(), *(vec.end() - 1));
+  ASSERT_TRUE(vec.begin() < vec.end());
+
+  // non-const iterator
+  size_t index = 0;
+  for (const auto& item : vec) {
+    ASSERT_EQ(vec[index++], item);
+  }
+
+  index = vec.size() - 1;
+  for (auto pos = vec.rbegin(); pos != vec.rend(); ++pos) {
+    ASSERT_EQ(vec[index--], *pos);
+  }
+
+  // const iterator
+  const auto& cvec = vec;
+  index = 0;
+  for (const auto& item : cvec) {
+    ASSERT_EQ(cvec[index++], item);
+  }
+
+  index = vec.size() - 1;
+  for (auto pos = cvec.rbegin(); pos != cvec.rend(); ++pos) {
+    ASSERT_EQ(cvec[index--], *pos);
+  }
+
+  // forward and backward
+  auto pos = vec.begin();
+  while (pos != vec.end()) {
+    auto old_val = *pos;
+    auto old = pos++;
+    // HACK: make sure -> works
+    ASSERT_TRUE(!old->empty());
+    ASSERT_EQ(old_val, *old);
+    ASSERT_TRUE(pos == vec.end() || old_val != *pos);
+  }
+
+  pos = vec.begin();
+  for (size_t i = 0; i < vec.size(); i += 2) {
+    // Cannot use ASSERT_EQ, since that macro depends on iostream
+    // serialization
+    ASSERT_TRUE(pos + 2 - 2 == pos);
+    pos += 2;
+    ASSERT_TRUE(pos >= vec.begin());
+    ASSERT_TRUE(pos <= vec.end());
+
+    size_t diff = static_cast<size_t>(pos - vec.begin());
+    ASSERT_EQ(i + 2, diff);
+  }
+}
+
+namespace {
+vector<string> GetTestKeys(size_t size) {
+  vector<string> keys;
+  keys.resize(size);
+
+  int index = 0;
+  for (auto& key : keys) {
+    key = "item-" + to_string(index++);
+  }
+  return keys;
+}
+}  // namespace
+
+template <class TVector>
+void BenchmarkVectorCreationAndInsertion(
+    string name, size_t ops, size_t item_size,
+    const std::vector<typename TVector::value_type>& items) {
+  auto env = Env::Default();
+
+  int index = 0;
+  auto start_time = env->NowNanos();
+  auto ops_remaining = ops;
+  while (ops_remaining--) {
+    TVector v;
+    for (size_t i = 0; i < item_size; ++i) {
+      v.push_back(items[index++]);
+    }
+  }
+  auto elapsed = env->NowNanos() - start_time;
+  cout << "created " << ops << " " << name << " instances:\n\t"
+       << "each was inserted with " << item_size << " elements\n\t"
+       << "total time elapsed: " << elapsed << " (ns)" << endl;
+}
+
+template <class TVector>
+size_t BenchmarkSequenceAccess(string name, size_t ops, size_t elem_size) {
+  TVector v;
+  for (const auto& item : GetTestKeys(elem_size)) {
+    v.push_back(item);
+  }
+  auto env = Env::Default();
+
+  auto ops_remaining = ops;
+  auto start_time = env->NowNanos();
+  size_t total = 0;
+  while (ops_remaining--) {
+    auto end = v.end();
+    for (auto pos = v.begin(); pos != end; ++pos) {
+      total += pos->size();
+    }
+  }
+  auto elapsed = env->NowNanos() - start_time;
+  cout << "performed " << ops << " sequence accesses against " << name
+       << "\n\t"
+       << "size: " << elem_size << "\n\t"
+       << "total time elapsed: " << elapsed << " (ns)" << endl;
+  // HACK: return total so the compiler cannot optimize the loop away
+  return total;
+}
+
+// This test case only compares the performance of std::vector and
+// autovector. We chose string as the value type because in most of
+// our use cases we use std::vector<string>.
+TEST(AutoVectorTest, PerfBench) {
+  // We run the same operations kOps times in order to get a fairer result.
+  size_t kOps = 100000;
+
+  // Creation and insertion test
+  // Test the case when there is:
+  //  * no element inserted: the internal array of std::vector may not
+  //    actually be initialized.
+  //  * one element inserted: the internal array of std::vector must have
+  //    been initialized.
+  //  * kSize elements inserted. This shows the most time we'll spend if we
+  //    keep everything in the stack.
+  //  * 2 * kSize elements inserted. The internal vector of
+  //    autovector must have been initialized.
+  cout << "=====================================================" << endl;
+  cout << "Creation and Insertion Test (value type: std::string)" << endl;
+  cout << "=====================================================" << endl;
+
+  // pre-generated unique keys
+  auto string_keys = GetTestKeys(kOps * 2 * kSize);
+  for (auto insertions : { 0ul, 1ul, kSize / 2, kSize, 2 * kSize }) {
+    BenchmarkVectorCreationAndInsertion<vector<string>>(
+        "vector", kOps, insertions, string_keys);
+    BenchmarkVectorCreationAndInsertion<autovector<string, kSize>>(
+        "autovector", kOps, insertions, string_keys);
+    cout << "-----------------------------------" << endl;
+  }
+
+  cout << "=====================================================" << endl;
+  cout << "Creation and Insertion Test (value type: uint64_t)" << endl;
+  cout << "=====================================================" << endl;
+
+  // pre-generated unique keys
+  vector<uint64_t> int_keys(kOps * 2 * kSize);
+  for (size_t i = 0; i < kOps * 2 * kSize; ++i) {
+    int_keys[i] = i;
+  }
+  for (auto insertions : { 0ul, 1ul, kSize / 2, kSize, 2 * kSize }) {
+    BenchmarkVectorCreationAndInsertion<vector<uint64_t>>(
+        "vector", kOps, insertions, int_keys);
+    BenchmarkVectorCreationAndInsertion<autovector<uint64_t, kSize>>(
+        "autovector", kOps, insertions, int_keys);
+    cout << "-----------------------------------" << endl;
+  }
+
+  // Sequence Access Test
+  cout << "=====================================================" << endl;
+  cout << "Sequence Access Test" << endl;
+  cout << "=====================================================" << endl;
+  for (auto elem_size : { kSize / 2, kSize, 2 * kSize }) {
+    BenchmarkSequenceAccess<vector<string>>("vector", kOps, elem_size);
+    BenchmarkSequenceAccess<autovector<string, kSize>>(
+        "autovector", kOps, elem_size);
+    cout << "-----------------------------------" << endl;
+  }
+}
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  return rocksdb::test::RunAllTests();
+}
diff --git a/util/benchharness.cc b/util/benchharness.cc
new file mode 100644
index 0000000000..8cd37007b4
--- /dev/null
+++ b/util/benchharness.cc
@@ -0,0 +1,398 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+// +// This code is derived from Benchmark.cpp implemented in Folly, the opensourced +// Facebook C++ library available at https://github.com/facebook/folly +// The code has removed any dependence on other folly and boost libraries + +#include "util/benchharness.h" + +#include +#include +#include +#include +#include +#include +#include + +using std::function; +using std::get; +using std::make_pair; +using std::max; +using std::min; +using std::pair; +using std::sort; +using std::string; +using std::tuple; +using std::vector; + +DEFINE_bool(benchmark, false, "Run benchmarks."); + +DEFINE_int64(bm_min_usec, 100, + "Minimum # of microseconds we'll accept for each benchmark."); + +DEFINE_int64(bm_min_iters, 1, + "Minimum # of iterations we'll try for each benchmark."); + +DEFINE_int32(bm_max_secs, 1, + "Maximum # of seconds we'll spend on each benchmark."); + + +namespace rocksdb { +namespace benchmark { + +BenchmarkSuspender::NanosecondsSpent BenchmarkSuspender::nsSpent; + +typedef function BenchmarkFun; +static vector> benchmarks; + +// Add the global baseline +BENCHMARK(globalBenchmarkBaseline) { + asm volatile(""); +} + +void detail::AddBenchmarkImpl(const char* file, const char* name, + BenchmarkFun fun) { + benchmarks.emplace_back(file, name, std::move(fun)); +} + +/** + * Given a point, gives density at that point as a number 0.0 < x <= + * 1.0. The result is 1.0 if all samples are equal to where, and + * decreases near 0 if all points are far away from it. The density is + * computed with the help of a radial basis function. + */ +static double Density(const double * begin, const double *const end, + const double where, const double bandwidth) { + assert(begin < end); + assert(bandwidth > 0.0); + double sum = 0.0; + for (auto i = begin; i < end; i++) { + auto d = (*i - where) / bandwidth; + sum += exp(- d * d); + } + return sum / (end - begin); +} + +/** + * Computes mean and variance for a bunch of data points. Note that + * mean is currently not being used. + */ +static pair +MeanVariance(const double * begin, const double *const end) { + assert(begin < end); + double sum = 0.0, sum2 = 0.0; + for (auto i = begin; i < end; i++) { + sum += *i; + sum2 += *i * *i; + } + auto const n = end - begin; + return make_pair(sum / n, sqrt((sum2 - sum * sum / n) / n)); +} + +/** + * Computes the mode of a sample set through brute force. Assumes + * input is sorted. + */ +static double Mode(const double * begin, const double *const end) { + assert(begin < end); + // Lower bound and upper bound for result and their respective + // densities. + auto + result = 0.0, + bestDensity = 0.0; + + // Get the variance so we pass it down to Density() + auto const sigma = MeanVariance(begin, end).second; + if (!sigma) { + // No variance means constant signal + return *begin; + } + + for (auto i = begin; i < end; i++) { + assert(i == begin || *i >= i[-1]); + auto candidate = Density(begin, end, *i, sigma * sqrt(2.0)); + if (candidate > bestDensity) { + // Found a new best + bestDensity = candidate; + result = *i; + } else { + // Density is decreasing... we could break here if we definitely + // knew this is unimodal. + } + } + + return result; +} + +/** + * Given a bunch of benchmark samples, estimate the actual run time. + */ +static double EstimateTime(double * begin, double * end) { + assert(begin < end); + + // Current state of the art: get the minimum. After some + // experimentation, it seems taking the minimum is the best. 
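+  // The minimum is attractive because timing noise is one-sided: context
+  // switches, cache misses, and other interference only make a run slower,
+  // never faster, so the fastest sample is the closest to the true cost.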
+ + return *std::min_element(begin, end); + + // What follows after estimates the time as the mode of the + // distribution. + + // Select the awesomest (i.e. most frequent) result. We do this by + // sorting and then computing the longest run length. + sort(begin, end); + + // Eliminate outliers. A time much larger than the minimum time is + // considered an outlier. + while (end[-1] > 2.0 * *begin) { + --end; + if (begin == end) { +// LOG(INFO) << *begin; + } + assert(begin < end); + } + + double result = 0; + + /* Code used just for comparison purposes */ { + unsigned bestFrequency = 0; + unsigned candidateFrequency = 1; + double candidateValue = *begin; + for (auto current = begin + 1; ; ++current) { + if (current == end || *current != candidateValue) { + // Done with the current run, see if it was best + if (candidateFrequency > bestFrequency) { + bestFrequency = candidateFrequency; + result = candidateValue; + } + if (current == end) { + break; + } + // Start a new run + candidateValue = *current; + candidateFrequency = 1; + } else { + // Cool, inside a run, increase the frequency + ++candidateFrequency; + } + } + } + + result = Mode(begin, end); + + return result; +} + +static double RunBenchmarkGetNSPerIteration(const BenchmarkFun& fun, + const double globalBaseline) { + // They key here is accuracy; too low numbers means the accuracy was + // coarse. We up the ante until we get to at least minNanoseconds + // timings. + static const auto minNanoseconds = FLAGS_bm_min_usec * 1000UL; + + // We do measurements in several epochs and take the minimum, to + // account for jitter. + static const unsigned int epochs = 1000; + // We establish a total time budget as we don't want a measurement + // to take too long. This will curtail the number of actual epochs. + const uint64_t timeBudgetInNs = FLAGS_bm_max_secs * 1000000000; + auto env = Env::Default(); + uint64_t global = env->NowNanos(); + + double epochResults[epochs] = { 0 }; + size_t actualEpochs = 0; + + for (; actualEpochs < epochs; ++actualEpochs) { + for (unsigned int n = FLAGS_bm_min_iters; n < (1UL << 30); n *= 2) { + auto const nsecs = fun(n); + if (nsecs < minNanoseconds) { + continue; + } + // We got an accurate enough timing, done. But only save if + // smaller than the current result. + epochResults[actualEpochs] = max(0.0, + static_cast(nsecs) / n - globalBaseline); + // Done with the current epoch, we got a meaningful timing. + break; + } + uint64_t now = env->NowNanos(); + if ((now - global) >= timeBudgetInNs) { + // No more time budget available. + ++actualEpochs; + break; + } + } + + // If the benchmark was basically drowned in baseline noise, it's + // possible it became negative. 
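+  // Clamp to zero rather than report a nonsensical negative duration.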
+ return max(0.0, EstimateTime(epochResults, epochResults + actualEpochs)); +} + +struct ScaleInfo { + double boundary; + const char* suffix; +}; + +static const ScaleInfo kTimeSuffixes[] { + { 365.25 * 24 * 3600, "years" }, + { 24 * 3600, "days" }, + { 3600, "hr" }, + { 60, "min" }, + { 1, "s" }, + { 1E-3, "ms" }, + { 1E-6, "us" }, + { 1E-9, "ns" }, + { 1E-12, "ps" }, + { 1E-15, "fs" }, + { 0, nullptr }, +}; + +static const ScaleInfo kMetricSuffixes[] { + { 1E24, "Y" }, // yotta + { 1E21, "Z" }, // zetta + { 1E18, "X" }, // "exa" written with suffix 'X' so as to not create + // confusion with scientific notation + { 1E15, "P" }, // peta + { 1E12, "T" }, // terra + { 1E9, "G" }, // giga + { 1E6, "M" }, // mega + { 1E3, "K" }, // kilo + { 1, "" }, + { 1E-3, "m" }, // milli + { 1E-6, "u" }, // micro + { 1E-9, "n" }, // nano + { 1E-12, "p" }, // pico + { 1E-15, "f" }, // femto + { 1E-18, "a" }, // atto + { 1E-21, "z" }, // zepto + { 1E-24, "y" }, // yocto + { 0, nullptr }, +}; + +static string HumanReadable(double n, unsigned int decimals, + const ScaleInfo* scales) { + if (std::isinf(n) || std::isnan(n)) { + return std::to_string(n); + } + + const double absValue = fabs(n); + const ScaleInfo* scale = scales; + while (absValue < scale[0].boundary && scale[1].suffix != nullptr) { + ++scale; + } + + const double scaledValue = n / scale->boundary; + char a[80]; + snprintf(a, sizeof(a), "%.*f%s", decimals, scaledValue, scale->suffix); + return a; +} + +static string ReadableTime(double n, unsigned int decimals) { + return HumanReadable(n, decimals, kTimeSuffixes); +} + +static string MetricReadable(double n, unsigned int decimals) { + return HumanReadable(n, decimals, kMetricSuffixes); +} + +static void PrintBenchmarkResultsAsTable( + const vector >& data) { + // Width available + static const uint columns = 76; + + // Compute the longest benchmark name + size_t longestName = 0; + for (size_t i = 1; i < benchmarks.size(); i++) { + longestName = max(longestName, strlen(get<1>(benchmarks[i]))); + } + + // Print a horizontal rule + auto separator = [&](char pad) { + puts(string(columns, pad).c_str()); + }; + + // Print header for a file + auto header = [&](const char* file) { + separator('='); + printf("%-*srelative time/iter iters/s\n", + columns - 28, file); + separator('='); + }; + + double baselineNsPerIter = std::numeric_limits::max(); + const char* lastFile = ""; + + for (auto& datum : data) { + auto file = get<0>(datum); + if (strcmp(file, lastFile)) { + // New file starting + header(file); + lastFile = file; + } + + string s = get<1>(datum); + if (s == "-") { + separator('-'); + continue; + } + bool useBaseline /* = void */; + if (s[0] == '%') { + s.erase(0, 1); + useBaseline = true; + } else { + baselineNsPerIter = get<2>(datum); + useBaseline = false; + } + s.resize(columns - 29, ' '); + auto nsPerIter = get<2>(datum); + auto secPerIter = nsPerIter / 1E9; + auto itersPerSec = 1 / secPerIter; + if (!useBaseline) { + // Print without baseline + printf("%*s %9s %7s\n", + static_cast(s.size()), s.c_str(), + ReadableTime(secPerIter, 2).c_str(), + MetricReadable(itersPerSec, 2).c_str()); + } else { + // Print with baseline + auto rel = baselineNsPerIter / nsPerIter * 100.0; + printf("%*s %7.2f%% %9s %7s\n", + static_cast(s.size()), s.c_str(), + rel, + ReadableTime(secPerIter, 2).c_str(), + MetricReadable(itersPerSec, 2).c_str()); + } + } + separator('='); +} + +void RunBenchmarks() { + ASSERT_TRUE(!benchmarks.empty()); + + vector> results; + results.reserve(benchmarks.size() - 1); + + // PLEASE 
KEEP QUIET. MEASUREMENTS IN PROGRESS. + + auto const globalBaseline = RunBenchmarkGetNSPerIteration( + get<2>(benchmarks.front()), 0); + for (size_t i = 1; i < benchmarks.size(); i++) { + double elapsed = 0.0; + if (strcmp(get<1>(benchmarks[i]), "-") != 0) { // skip separators + elapsed = RunBenchmarkGetNSPerIteration(get<2>(benchmarks[i]), + globalBaseline); + } + results.emplace_back(get<0>(benchmarks[i]), + get<1>(benchmarks[i]), elapsed); + } + + // PLEASE MAKE NOISE. MEASUREMENTS DONE. + + PrintBenchmarkResultsAsTable(results); +} + +} // namespace benchmark +} // namespace rocksdb diff --git a/util/benchharness.h b/util/benchharness.h new file mode 100644 index 0000000000..4fdef520c8 --- /dev/null +++ b/util/benchharness.h @@ -0,0 +1,357 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// This code is derived from Benchmark.h implemented in Folly, the opensourced +// Facebook C++ library available at https://github.com/facebook/folly +// The code has removed any dependence on other folly and boost libraries + +#pragma once + +#include + +#include +#include +#include + +#include "util/testharness.h" +#include "rocksdb/env.h" + +namespace rocksdb { +namespace benchmark { + +/** + * Runs all benchmarks defined. Usually put in main(). + */ +void RunBenchmarks(); + +namespace detail { + +/** + * Adds a benchmark wrapped in a std::function. Only used + * internally. Pass by value is intentional. + */ +void AddBenchmarkImpl(const char* file, + const char* name, + std::function); + +} // namespace detail + + +/** + * Supporting type for BENCHMARK_SUSPEND defined below. + */ +struct BenchmarkSuspender { + BenchmarkSuspender() { start_ = Env::Default()->NowNanos(); } + + BenchmarkSuspender(const BenchmarkSuspender&) = delete; + BenchmarkSuspender(BenchmarkSuspender && rhs) { + start_ = rhs.start_; + rhs.start_ = 0; + } + + BenchmarkSuspender& operator=(const BenchmarkSuspender &) = delete; + BenchmarkSuspender& operator=(BenchmarkSuspender && rhs) { + if (start_ > 0) { + tally(); + } + start_ = rhs.start_; + rhs.start_ = 0; + return *this; + } + + ~BenchmarkSuspender() { + if (start_ > 0) { + tally(); + } + } + + void Dismiss() { + assert(start_ > 0); + tally(); + start_ = 0; + } + + void Rehire() { start_ = Env::Default()->NowNanos(); } + + /** + * This helps the macro definition. To get around the dangers of + * operator bool, returns a pointer to member (which allows no + * arithmetic). + */ + /* implicit */ + operator int BenchmarkSuspender::*() const { return nullptr; } + + /** + * Accumulates nanoseconds spent outside benchmark. + */ + typedef uint64_t NanosecondsSpent; + static NanosecondsSpent nsSpent; + + private: + void tally() { + uint64_t end = Env::Default()->NowNanos(); + nsSpent += start_ - end; + start_ = end; + } + + uint64_t start_; +}; + +/** + * Adds a benchmark. Usually not called directly but instead through + * the macro BENCHMARK defined below. The lambda function involved + * must take exactly one parameter of type unsigned, and the benchmark + * uses it with counter semantics (iteration occurs inside the + * function). 
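+ *
+ * A minimal sketch of a direct registration (normally this is done through
+ * the BENCHMARK_N macro below; the name "sumLoop" is illustrative):
+ *
+ *   AddBenchmark_n(__FILE__, "sumLoop", [](unsigned int n) {
+ *     volatile uint64_t sum = 0;  // volatile keeps the loop from being
+ *     for (unsigned int i = 0; i < n; ++i) { sum += i; }  // optimized away
+ *   });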
+ */ +template +void +AddBenchmark_n(const char* file, const char* name, Lambda&& lambda) { + auto execute = [=](unsigned int times) -> uint64_t { + BenchmarkSuspender::nsSpent = 0; + uint64_t start, end; + auto env = Env::Default(); + + // CORE MEASUREMENT STARTS + start = env->NowNanos(); + lambda(times); + end = env->NowNanos(); + // CORE MEASUREMENT ENDS + return (end - start) - BenchmarkSuspender::nsSpent; + }; + + detail::AddBenchmarkImpl(file, name, + std::function(execute)); +} + +/** + * Adds a benchmark. Usually not called directly but instead through + * the macro BENCHMARK defined below. The lambda function involved + * must take zero parameters, and the benchmark calls it repeatedly + * (iteration occurs outside the function). + */ +template +void +AddBenchmark(const char* file, const char* name, Lambda&& lambda) { + AddBenchmark_n(file, name, [=](unsigned int times) { + while (times-- > 0) { + lambda(); + } + }); +} + +} // namespace benchmark +} // namespace rocksdb + +/** + * FB_ONE_OR_NONE(hello, world) expands to hello and + * FB_ONE_OR_NONE(hello) expands to nothing. This macro is used to + * insert or eliminate text based on the presence of another argument. + */ +#define FB_ONE_OR_NONE(a, ...) FB_THIRD(a, ## __VA_ARGS__, a) +#define FB_THIRD(a, b, ...) __VA_ARGS__ + +#define FB_CONCATENATE_IMPL(s1, s2) s1##s2 +#define FB_CONCATENATE(s1, s2) FB_CONCATENATE_IMPL(s1, s2) + +#define FB_ANONYMOUS_VARIABLE(str) FB_CONCATENATE(str, __LINE__) + +#define FB_STRINGIZE(x) #x + +/** + * Introduces a benchmark function. Used internally, see BENCHMARK and + * friends below. + */ +#define BENCHMARK_IMPL_N(funName, stringName, paramType, paramName) \ + static void funName(paramType); \ + static bool FB_ANONYMOUS_VARIABLE(rocksdbBenchmarkUnused) = ( \ + ::rocksdb::benchmark::AddBenchmark_n(__FILE__, stringName, \ + [](paramType paramName) { funName(paramName); }), \ + true); \ + static void funName(paramType paramName) + +#define BENCHMARK_IMPL(funName, stringName) \ + static void funName(); \ + static bool FB_ANONYMOUS_VARIABLE(rocksdbBenchmarkUnused) = ( \ + ::rocksdb::benchmark::AddBenchmark(__FILE__, stringName, \ + []() { funName(); }), \ + true); \ + static void funName() + +/** + * Introduces a benchmark function. Use with either one one or two + * arguments. The first is the name of the benchmark. Use something + * descriptive, such as insertVectorBegin. The second argument may be + * missing, or could be a symbolic counter. The counter dictates how + * many internal iteration the benchmark does. Example: + * + * BENCHMARK(vectorPushBack) { + * vector v; + * v.push_back(42); + * } + * + * BENCHMARK_N(insertVectorBegin, n) { + * vector v; + * FOR_EACH_RANGE (i, 0, n) { + * v.insert(v.begin(), 42); + * } + * } + */ +#define BENCHMARK_N(name, ...) \ + BENCHMARK_IMPL_N( \ + name, \ + FB_STRINGIZE(name), \ + FB_ONE_OR_NONE(unsigned, ## __VA_ARGS__), \ + __VA_ARGS__) + +#define BENCHMARK(name) \ + BENCHMARK_IMPL( \ + name, \ + FB_STRINGIZE(name)) + +/** + * Defines a benchmark that passes a parameter to another one. This is + * common for benchmarks that need a "problem size" in addition to + * "number of iterations". 
Consider: + * + * void pushBack(uint n, size_t initialSize) { + * vector v; + * BENCHMARK_SUSPEND { + * v.resize(initialSize); + * } + * FOR_EACH_RANGE (i, 0, n) { + * v.push_back(i); + * } + * } + * BENCHMARK_PARAM(pushBack, 0) + * BENCHMARK_PARAM(pushBack, 1000) + * BENCHMARK_PARAM(pushBack, 1000000) + * + * The benchmark above estimates the speed of push_back at different + * initial sizes of the vector. The framework will pass 0, 1000, and + * 1000000 for initialSize, and the iteration count for n. + */ +#define BENCHMARK_PARAM(name, param) \ + BENCHMARK_NAMED_PARAM(name, param, param) + +/* + * Like BENCHMARK_PARAM(), but allows a custom name to be specified for each + * parameter, rather than using the parameter value. + * + * Useful when the parameter value is not a valid token for string pasting, + * of when you want to specify multiple parameter arguments. + * + * For example: + * + * void addValue(uint n, int64_t bucketSize, int64_t min, int64_t max) { + * Histogram hist(bucketSize, min, max); + * int64_t num = min; + * FOR_EACH_RANGE (i, 0, n) { + * hist.addValue(num); + * ++num; + * if (num > max) { num = min; } + * } + * } + * + * BENCHMARK_NAMED_PARAM(addValue, 0_to_100, 1, 0, 100) + * BENCHMARK_NAMED_PARAM(addValue, 0_to_1000, 10, 0, 1000) + * BENCHMARK_NAMED_PARAM(addValue, 5k_to_20k, 250, 5000, 20000) + */ +#define BENCHMARK_NAMED_PARAM(name, param_name, ...) \ + BENCHMARK_IMPL( \ + FB_CONCATENATE(name, FB_CONCATENATE(_, param_name)), \ + FB_STRINGIZE(name) "(" FB_STRINGIZE(param_name) ")") { \ + name(__VA_ARGS__); \ + } + +#define BENCHMARK_NAMED_PARAM_N(name, param_name, ...) \ + BENCHMARK_IMPL_N( \ + FB_CONCATENATE(name, FB_CONCATENATE(_, param_name)), \ + FB_STRINGIZE(name) "(" FB_STRINGIZE(param_name) ")", \ + unsigned, \ + iters) { \ + name(iters, ## __VA_ARGS__); \ + } + +/** + * Just like BENCHMARK, but prints the time relative to a + * baseline. The baseline is the most recent BENCHMARK() seen in + * lexical order. Example: + * + * // This is the baseline + * BENCHMARK_N(insertVectorBegin, n) { + * vector v; + * FOR_EACH_RANGE (i, 0, n) { + * v.insert(v.begin(), 42); + * } + * } + * + * BENCHMARK_RELATIVE_N(insertListBegin, n) { + * list s; + * FOR_EACH_RANGE (i, 0, n) { + * s.insert(s.begin(), 42); + * } + * } + * + * Any number of relative benchmark can be associated with a + * baseline. Another BENCHMARK() occurrence effectively establishes a + * new baseline. + */ +#define BENCHMARK_RELATIVE_N(name, ...) \ + BENCHMARK_IMPL_N( \ + name, \ + "%" FB_STRINGIZE(name), \ + FB_ONE_OR_NONE(unsigned, ## __VA_ARGS__), \ + __VA_ARGS__) + +#define BENCHMARK_RELATIVE(name) \ + BENCHMARK_IMPL( \ + name, \ + "%" FB_STRINGIZE(name)) + +/** + * A combination of BENCHMARK_RELATIVE and BENCHMARK_PARAM. + */ +#define BENCHMARK_RELATIVE_PARAM(name, param) \ + BENCHMARK_RELATIVE_NAMED_PARAM(name, param, param) + +/** + * A combination of BENCHMARK_RELATIVE and BENCHMARK_NAMED_PARAM. + */ +#define BENCHMARK_RELATIVE_NAMED_PARAM(name, param_name, ...) \ + BENCHMARK_IMPL_N( \ + FB_CONCATENATE(name, FB_CONCATENATE(_, param_name)), \ + "%" FB_STRINGIZE(name) "(" FB_STRINGIZE(param_name) ")", \ + unsigned, \ + iters) { \ + name(iters, ## __VA_ARGS__); \ + } + +/** + * Draws a line of dashes. + */ +#define BENCHMARK_DRAW_LINE() \ + static bool FB_ANONYMOUS_VARIABLE(rocksdbBenchmarkUnused) = ( \ + ::rocksdb::benchmark::AddBenchmark(__FILE__, "-", []() { }), \ + true); + +/** + * Allows execution of code that doesn't count torward the benchmark's + * time budget. 
Example: + * + * BENCHMARK_START_GROUP(insertVectorBegin, n) { + * vector v; + * BENCHMARK_SUSPEND { + * v.reserve(n); + * } + * FOR_EACH_RANGE (i, 0, n) { + * v.insert(v.begin(), 42); + * } + * } + */ +#define BENCHMARK_SUSPEND \ + if (auto FB_ANONYMOUS_VARIABLE(BENCHMARK_SUSPEND) = \ + ::rocksdb::benchmark::BenchmarkSuspender()) {} \ + else diff --git a/util/benchharness_test.cc b/util/benchharness_test.cc new file mode 100644 index 0000000000..75ff658929 --- /dev/null +++ b/util/benchharness_test.cc @@ -0,0 +1,67 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// + +#include "util/benchharness.h" +#include + +namespace rocksdb { + +BENCHMARK(insertFrontVector) { + std::vector v; + for (int i = 0; i < 100; i++) { + v.insert(v.begin(), i); + } +} + +BENCHMARK_RELATIVE(insertBackVector) { + std::vector v; + for (size_t i = 0; i < 100; i++) { + v.insert(v.end(), i); + } +} + +BENCHMARK_N(insertFrontVector_n, n) { + std::vector v; + for (size_t i = 0; i < n; i++) { + v.insert(v.begin(), i); + } +} + +BENCHMARK_RELATIVE_N(insertBackVector_n, n) { + std::vector v; + for (size_t i = 0; i < n; i++) { + v.insert(v.end(), i); + } +} + +BENCHMARK_N(insertFrontEnd_n, n) { + std::vector v; + for (size_t i = 0; i < n; i++) { + v.insert(v.begin(), i); + } + for (size_t i = 0; i < n; i++) { + v.insert(v.end(), i); + } +} + +BENCHMARK_RELATIVE_N(insertFrontEndSuspend_n, n) { + std::vector v; + for (size_t i = 0; i < n; i++) { + v.insert(v.begin(), i); + } + BENCHMARK_SUSPEND { + for (size_t i = 0; i < n; i++) { + v.insert(v.end(), i); + } + } +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + rocksdb::benchmark::RunBenchmarks(); + return 0; +} diff --git a/util/blob_store.cc b/util/blob_store.cc new file mode 100644 index 0000000000..daaf4bc02b --- /dev/null +++ b/util/blob_store.cc @@ -0,0 +1,270 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef ROCKSDB_LITE +#include "util/blob_store.h" + +namespace rocksdb { + +using namespace std; + +// BlobChunk +bool BlobChunk::ImmediatelyBefore(const BlobChunk& chunk) const { + // overlapping!? 
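+  // Chunks in a well-formed free list must never overlap; if this assert
+  // fires, the free-list bookkeeping has been corrupted.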
+ assert(!Overlap(chunk)); + // size == 0 is a marker, not a block + return size != 0 && + bucket_id == chunk.bucket_id && + offset + size == chunk.offset; +} + +bool BlobChunk::Overlap(const BlobChunk &chunk) const { + return size != 0 && chunk.size != 0 && bucket_id == chunk.bucket_id && + ((offset >= chunk.offset && offset < chunk.offset + chunk.size) || + (chunk.offset >= offset && chunk.offset < offset + size)); +} + +// Blob +string Blob::ToString() const { + string ret; + for (auto chunk : chunks) { + PutFixed32(&ret, chunk.bucket_id); + PutFixed32(&ret, chunk.offset); + PutFixed32(&ret, chunk.size); + } + return ret; +} + +Blob::Blob(const std::string& blob) { + for (uint32_t i = 0; i < blob.size(); ) { + uint32_t t[3] = {0}; + for (int j = 0; j < 3 && i + sizeof(uint32_t) - 1 < blob.size(); + ++j, i += sizeof(uint32_t)) { + t[j] = DecodeFixed32(blob.data() + i); + } + chunks.push_back(BlobChunk(t[0], t[1], t[2])); + } +} + +// FreeList +Status FreeList::Free(const Blob& blob) { + // add it back to the free list + for (auto chunk : blob.chunks) { + free_blocks_ += chunk.size; + if (fifo_free_chunks_.size() && + fifo_free_chunks_.back().ImmediatelyBefore(chunk)) { + fifo_free_chunks_.back().size += chunk.size; + } else { + fifo_free_chunks_.push_back(chunk); + } + } + + return Status::OK(); +} + +Status FreeList::Allocate(uint32_t blocks, Blob* blob) { + if (free_blocks_ < blocks) { + return Status::Incomplete(""); + } + + blob->chunks.clear(); + free_blocks_ -= blocks; + + while (blocks > 0) { + assert(fifo_free_chunks_.size() > 0); + auto& front = fifo_free_chunks_.front(); + if (front.size > blocks) { + blob->chunks.push_back(BlobChunk(front.bucket_id, front.offset, blocks)); + front.offset += blocks; + front.size -= blocks; + blocks = 0; + } else { + blob->chunks.push_back(front); + blocks -= front.size; + fifo_free_chunks_.pop_front(); + } + } + assert(blocks == 0); + + return Status::OK(); +} + +bool FreeList::Overlap(const Blob &blob) const { + for (auto chunk : blob.chunks) { + for (auto itr = fifo_free_chunks_.begin(); + itr != fifo_free_chunks_.end(); + ++itr) { + if (itr->Overlap(chunk)) { + return true; + } + } + } + return false; +} + +// BlobStore +BlobStore::BlobStore(const string& directory, + uint64_t block_size, + uint32_t blocks_per_bucket, + uint32_t max_buckets, + Env* env) : + directory_(directory), + block_size_(block_size), + blocks_per_bucket_(blocks_per_bucket), + env_(env), + max_buckets_(max_buckets) { + env_->CreateDirIfMissing(directory_); + + storage_options_.use_mmap_writes = false; + storage_options_.use_mmap_reads = false; + + buckets_size_ = 0; + buckets_ = new unique_ptr[max_buckets_]; + + CreateNewBucket(); +} + +BlobStore::~BlobStore() { + // TODO we don't care about recovery for now + delete [] buckets_; +} + +Status BlobStore::Put(const Slice& value, Blob* blob) { + // convert size to number of blocks + Status s = Allocate((value.size() + block_size_ - 1) / block_size_, blob); + if (!s.ok()) { + return s; + } + auto size_left = (uint64_t) value.size(); + + uint64_t offset = 0; // in bytes, not blocks + for (auto chunk : blob->chunks) { + uint64_t write_size = min(chunk.size * block_size_, size_left); + assert(chunk.bucket_id < buckets_size_); + s = buckets_[chunk.bucket_id].get()->Write(chunk.offset * block_size_, + Slice(value.data() + offset, + write_size)); + if (!s.ok()) { + Delete(*blob); + return s; + } + offset += write_size; + size_left -= write_size; + if (write_size < chunk.size * block_size_) { + // if we have any space left in the 
block, fill it up with zeros + string zero_string(chunk.size * block_size_ - write_size, 0); + s = buckets_[chunk.bucket_id].get()->Write(chunk.offset * block_size_ + + write_size, + Slice(zero_string)); + } + } + + if (size_left > 0) { + Delete(*blob); + return Status::Corruption("Tried to write more data than fits in the blob"); + } + + return Status::OK(); +} + +Status BlobStore::Get(const Blob& blob, + string* value) const { + { + // assert that it doesn't overlap with free list + // it will get compiled out for release + MutexLock l(&free_list_mutex_); + assert(!free_list_.Overlap(blob)); + } + + value->resize(blob.Size() * block_size_); + + uint64_t offset = 0; // in bytes, not blocks + for (auto chunk : blob.chunks) { + Slice result; + assert(chunk.bucket_id < buckets_size_); + Status s; + s = buckets_[chunk.bucket_id].get()->Read(chunk.offset * block_size_, + chunk.size * block_size_, + &result, + &value->at(offset)); + if (!s.ok()) { + value->clear(); + return s; + } + if (result.size() < chunk.size * block_size_) { + value->clear(); + return Status::Corruption("Could not read in from file"); + } + offset += chunk.size * block_size_; + } + + // remove the '\0's at the end of the string + value->erase(find(value->begin(), value->end(), '\0'), value->end()); + + return Status::OK(); +} + +Status BlobStore::Delete(const Blob& blob) { + MutexLock l(&free_list_mutex_); + return free_list_.Free(blob); +} + +Status BlobStore::Sync() { + for (size_t i = 0; i < buckets_size_; ++i) { + Status s = buckets_[i].get()->Sync(); + if (!s.ok()) { + return s; + } + } + return Status::OK(); +} + +Status BlobStore::Allocate(uint32_t blocks, Blob* blob) { + MutexLock l(&free_list_mutex_); + Status s; + + s = free_list_.Allocate(blocks, blob); + if (!s.ok()) { + s = CreateNewBucket(); + if (!s.ok()) { + return s; + } + s = free_list_.Allocate(blocks, blob); + } + + return s; +} + +// called with free_list_mutex_ held +Status BlobStore::CreateNewBucket() { + MutexLock l(&buckets_mutex_); + + if (buckets_size_ >= max_buckets_) { + return Status::NotSupported("Max size exceeded\n"); + } + + int new_bucket_id = buckets_size_; + + char fname[200]; + sprintf(fname, "%s/%d.bs", directory_.c_str(), new_bucket_id); + + Status s = env_->NewRandomRWFile(string(fname), + &buckets_[new_bucket_id], + storage_options_); + if (!s.ok()) { + return s; + } + + // whether Allocate succeeds or not, does not affect the overall correctness + // of this function - calling Allocate is really optional + // (also, tmpfs does not support allocate) + buckets_[new_bucket_id].get()->Allocate(0, block_size_ * blocks_per_bucket_); + + buckets_size_ = new_bucket_id + 1; + + return free_list_.Free(Blob(new_bucket_id, 0, blocks_per_bucket_)); +} + +} // namespace rocksdb +#endif // ROCKSDB_LITE diff --git a/util/blob_store.h b/util/blob_store.h new file mode 100644 index 0000000000..ce8633740c --- /dev/null +++ b/util/blob_store.h @@ -0,0 +1,163 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
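+
+// Rough layout (an illustrative summary): values are stored as runs of
+// fixed-size blocks ("chunks") inside bucket files named "<bucket_id>.bs"
+// under the given directory; a Blob is simply the list of chunks that
+// addresses one stored value.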
+
+#ifndef ROCKSDB_LITE
+#pragma once
+#include "rocksdb/env.h"
+#include "rocksdb/status.h"
+#include "port/port.h"
+#include "util/mutexlock.h"
+#include "util/coding.h"
+
+#include <string>
+#include <vector>
+#include <deque>
+#include <memory>
+#include <cstdint>
+
+namespace rocksdb {
+
+struct BlobChunk {
+  uint32_t bucket_id;
+  uint32_t offset;  // in blocks
+  uint32_t size;    // in blocks
+  BlobChunk() {}
+  BlobChunk(uint32_t bucket_id, uint32_t offset, uint32_t size) :
+    bucket_id(bucket_id), offset(offset), size(size) {}
+
+  // returns true if this chunk is immediately before 'chunk'
+  bool ImmediatelyBefore(const BlobChunk& chunk) const;
+  // returns true if the chunks overlap
+  bool Overlap(const BlobChunk &chunk) const;
+};
+
+// We represent each Blob as a string in this format:
+// bucket_id offset size|bucket_id offset size...
+// The string can be used to reference the Blob stored on an external
+// device/file
+// Not thread-safe!
+struct Blob {
+  // Generates the string
+  std::string ToString() const;
+  // Parses the previously generated string
+  explicit Blob(const std::string& blob);
+  // Creates an unfragmented Blob
+  Blob(uint32_t bucket_id, uint32_t offset, uint32_t size) {
+    SetOneChunk(bucket_id, offset, size);
+  }
+  Blob() {}
+
+  void SetOneChunk(uint32_t bucket_id, uint32_t offset, uint32_t size) {
+    chunks.clear();
+    chunks.push_back(BlobChunk(bucket_id, offset, size));
+  }
+
+  uint32_t Size() const {  // in blocks
+    uint32_t ret = 0;
+    for (auto chunk : chunks) {
+      ret += chunk.size;
+    }
+    assert(ret > 0);
+    return ret;
+  }
+
+  // bucket_id, offset, size
+  std::vector<BlobChunk> chunks;
+};
+
+// Keeps a list of free chunks
+// NOT thread-safe. Externally synchronized
+class FreeList {
+ public:
+  FreeList() :
+    free_blocks_(0) {}
+  ~FreeList() {}
+
+  // Allocates a blob of the given number of blocks. Stores the allocated
+  // blob in 'blob'. Returns non-OK status if it failed to allocate.
+  Status Allocate(uint32_t blocks, Blob* blob);
+  // Frees the blob for reuse
+  Status Free(const Blob& blob);
+
+  // returns true if blob overlaps with any of the
+  // chunks stored in the free list
+  bool Overlap(const Blob &blob) const;
+
+ private:
+  std::deque<BlobChunk> fifo_free_chunks_;
+  uint32_t free_blocks_;
+  mutable port::Mutex mutex_;
+};
+
+// thread-safe
+class BlobStore {
+ public:
+  // directory - wherever the blobs should be stored. It will be created
+  //   if missing
+  // block_size - self explanatory
+  // blocks_per_bucket - how many blocks we want to keep in one bucket.
+  //   A bucket is a device or a file that we use to store the blobs.
+  //   If we don't have enough blocks to allocate a new blob, we will
+  //   try to create a new file or device.
+  // max_buckets - maximum number of buckets BlobStore will create.
+  //   BlobStore's max size in bytes is
+  //   max_buckets * blocks_per_bucket * block_size
+  // env - env for creating new files
+  BlobStore(const std::string& directory,
+            uint64_t block_size,
+            uint32_t blocks_per_bucket,
+            uint32_t max_buckets,
+            Env* env);
+  ~BlobStore();
+
+  // Allocates space for value.size bytes (rounded up to a multiple of
+  // block size) and writes value.size bytes from value.data to the backing
+  // store. Sets 'blob', which can then be used for addressing the
+  // stored value. Returns non-OK status on error.
+  Status Put(const Slice& value, Blob* blob);
+  // Reads the value addressed by 'blob' into 'value' (the string is
+  // resized as needed). This function is thread-safe!
+  Status Get(const Blob& blob, std::string* value) const;
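+  //
+  // Typical round trip (a sketch; test_dir and the parameter values are
+  // illustrative only):
+  //   BlobStore bs(test_dir, 10 /* block_size */, 20 /* blocks_per_bucket */,
+  //                1000 /* max_buckets */, Env::Default());
+  //   Blob blob;
+  //   bs.Put(Slice("hello"), &blob);   // rounds up to one 10-byte block
+  //   std::string out;
+  //   bs.Get(blob, &out);              // out == "hello", zero padding stripped
+  //   bs.Delete(blob);                 // blocks return to the free list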
+  // Frees the blob for reuse, but does not delete the data
+  // on the backing store.
+  Status Delete(const Blob& blob);
+  // Sync all opened files that were modified
+  Status Sync();
+
+ private:
+  const std::string directory_;
+  // block_size_ is uint64_t because when we multiply it by
+  // blocks_per_bucket_ we want the result to be uint64_t or
+  // we risk overflowing
+  const uint64_t block_size_;
+  const uint32_t blocks_per_bucket_;
+  Env* env_;
+  EnvOptions storage_options_;
+  // protected by free_list_mutex_
+  FreeList free_list_;
+  // free_list_mutex_ is locked BEFORE buckets_mutex_
+  mutable port::Mutex free_list_mutex_;
+  // protected by buckets_mutex_
+  // array of buckets
+  unique_ptr<RandomRWFile>* buckets_;
+  // number of buckets in the array
+  uint32_t buckets_size_;
+  uint32_t max_buckets_;
+  mutable port::Mutex buckets_mutex_;
+
+  // Calls FreeList::Allocate. If the free list can't allocate
+  // a new blob, creates a new bucket and tries again.
+  // Thread-safe
+  Status Allocate(uint32_t blocks, Blob* blob);
+
+  // Creates a new backing store and adds all the blocks
+  // from the new backing store to the free list
+  Status CreateNewBucket();
+};
+
+}  // namespace rocksdb
+#endif  // ROCKSDB_LITE
diff --git a/util/blob_store_test.cc b/util/blob_store_test.cc
new file mode 100644
index 0000000000..f199f5ddd0
--- /dev/null
+++ b/util/blob_store_test.cc
@@ -0,0 +1,200 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "util/blob_store.h"
+
+#include "util/testharness.h"
+#include "util/testutil.h"
+#include "util/random.h"
+
+#include <cstdlib>
+#include <string>
+#include <vector>
+
+namespace rocksdb {
+
+using namespace std;
+
+class BlobStoreTest { };
+
+TEST(BlobStoreTest, RangeParseTest) {
+  Blob e;
+  for (int i = 0; i < 5; ++i) {
+    e.chunks.push_back(BlobChunk(rand(), rand(), rand()));
+  }
+  string x = e.ToString();
+  Blob nx(x);
+
+  ASSERT_EQ(nx.ToString(), x);
+}
+
+// make sure we're reusing the freed space
+TEST(BlobStoreTest, SanityTest) {
+  const uint64_t block_size = 10;
+  const uint32_t blocks_per_file = 20;
+  Random random(5);
+
+  BlobStore blob_store(test::TmpDir() + "/blob_store_test",
+                       block_size,
+                       blocks_per_file,
+                       1000,
+                       Env::Default());
+
+  string buf;
+
+  // put a string of size 170
+  test::RandomString(&random, 170, &buf);
+  Blob r1;
+  ASSERT_OK(blob_store.Put(Slice(buf), &r1));
+  // use the first file
+  for (size_t i = 0; i < r1.chunks.size(); ++i) {
+    ASSERT_EQ(r1.chunks[i].bucket_id, 0u);
+  }
+
+  // put a string of size 30
+  test::RandomString(&random, 30, &buf);
+  Blob r2;
+  ASSERT_OK(blob_store.Put(Slice(buf), &r2));
+  // use the first file
+  for (size_t i = 0; i < r2.chunks.size(); ++i) {
+    ASSERT_EQ(r2.chunks[i].bucket_id, 0u);
+  }
+
+  // delete the blob of size 170
+  ASSERT_OK(blob_store.Delete(r1));
+
+  // put a string of size 100
+  test::RandomString(&random, 100, &buf);
+  Blob r3;
+  ASSERT_OK(blob_store.Put(Slice(buf), &r3));
+  // use the first file
+  for (size_t i = 0; i < r3.chunks.size(); ++i) {
+    ASSERT_EQ(r3.chunks[i].bucket_id, 0u);
+  }
+
+  // put a string of size 70
+  test::RandomString(&random, 70, &buf);
+  Blob r4;
+  ASSERT_OK(blob_store.Put(Slice(buf), &r4));
+  // use the first file
+  for (size_t i = 0; i < r4.chunks.size(); ++i) {
+    ASSERT_EQ(r4.chunks[i].bucket_id, 0u);
+  }
+
+  // put a string of size 5
+  test::RandomString(&random, 5, &buf);
+  Blob r5;
+  ASSERT_OK(blob_store.Put(Slice(buf), &r5));
+  // now we have to use the second file
+  for (size_t i = 0; i < r5.chunks.size(); ++i) {
+    ASSERT_EQ(r5.chunks[i].bucket_id, 1u);
+  }
+}
+
+TEST(BlobStoreTest, FragmentedChunksTest) {
+  const uint64_t block_size = 10;
+  const uint32_t blocks_per_file = 20;
+  Random random(5);
+
+  BlobStore blob_store(test::TmpDir() + "/blob_store_test",
+                       block_size,
+                       blocks_per_file,
+                       1000,
+                       Env::Default());
+
+  string buf;
+
+  vector<Blob> r(4);
+
+  // put 4 strings of size 50
+  for (int k = 0; k < 4; ++k) {
+    test::RandomString(&random, 50, &buf);
+    ASSERT_OK(blob_store.Put(Slice(buf), &r[k]));
+    // use the first file
+    for (size_t i = 0; i < r[k].chunks.size(); ++i) {
+      ASSERT_EQ(r[k].chunks[i].bucket_id, 0u);
+    }
+  }
+
+  // delete the first and the third
+  ASSERT_OK(blob_store.Delete(r[0]));
+  ASSERT_OK(blob_store.Delete(r[2]));
+
+  // put a string of size 100. it should reuse the space that we freed
+  // by deleting the first and the third strings of size 50
+  test::RandomString(&random, 100, &buf);
+  Blob r2;
+  ASSERT_OK(blob_store.Put(Slice(buf), &r2));
+  // use the first file
+  for (size_t i = 0; i < r2.chunks.size(); ++i) {
+    ASSERT_EQ(r2.chunks[i].bucket_id, 0u);
+  }
+}
+
+TEST(BlobStoreTest, CreateAndStoreTest) {
+  const uint64_t block_size = 10;
+  const uint32_t blocks_per_file = 1000;
+  const int max_blurb_size = 300;
+  Random random(5);
+
+  BlobStore blob_store(test::TmpDir() + "/blob_store_test",
+                       block_size,
+                       blocks_per_file,
+                       10000,
+                       Env::Default());
+  vector<pair<Blob, string>> ranges;
+
+  for (int i = 0; i < 2000; ++i) {
+    int decision = rand() % 5;
+    if (decision <= 2 || ranges.size() == 0) {
+      string buf;
+      int size_blocks = (rand() % max_blurb_size + 1);
+      int string_size = size_blocks * block_size - (rand() % block_size);
+      test::RandomString(&random, string_size, &buf);
+      Blob r;
+      ASSERT_OK(blob_store.Put(Slice(buf), &r));
+      ranges.push_back(make_pair(r, buf));
+    } else if (decision == 3) {
+      int ti = rand() % ranges.size();
+      string out_buf;
+      ASSERT_OK(blob_store.Get(ranges[ti].first, &out_buf));
+      ASSERT_EQ(ranges[ti].second, out_buf);
+    } else {
+      int ti = rand() % ranges.size();
+      ASSERT_OK(blob_store.Delete(ranges[ti].first));
+      ranges.erase(ranges.begin() + ti);
+    }
+  }
+  ASSERT_OK(blob_store.Sync());
+}
+
+TEST(BlobStoreTest, MaxSizeTest) {
+  const uint64_t block_size = 10;
+  const uint32_t blocks_per_file = 100;
+  const int max_buckets = 10;
+  Random random(5);
+
+  BlobStore blob_store(test::TmpDir() + "/blob_store_test",
+                       block_size,
+                       blocks_per_file,
+                       max_buckets,
+                       Env::Default());
+  string buf;
+  for (int i = 0; i < max_buckets; ++i) {
+    test::RandomString(&random, 1000, &buf);
+    Blob r;
+    ASSERT_OK(blob_store.Put(Slice(buf), &r));
+  }
+
+  test::RandomString(&random, 1000, &buf);
+  Blob r;
+  // should fail because the max size was reached
+  Status s = blob_store.Put(Slice(buf), &r);
+  ASSERT_EQ(s.ok(), false);
+}
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  return rocksdb::test::RunAllTests();
+}
diff --git a/util/bloom.cc b/util/bloom.cc
new file mode 100644
index 0000000000..78ae04a266
--- /dev/null
+++ b/util/bloom.cc
@@ -0,0 +1,111 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
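+
+// Sizing intuition: with the default 10 bits per key, initialize() below
+// picks k = floor(10 * 0.69) = 6 probes per key, which in theory yields a
+// false positive rate of roughly (1/2)^6 ~= 1.6%; bloom_test.cc checks that
+// the observed rate stays under 2%.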
+
+#include "rocksdb/filter_policy.h"
+
+#include "rocksdb/slice.h"
+#include "util/hash.h"
+
+namespace rocksdb {
+
+namespace {
+static uint32_t BloomHash(const Slice& key) {
+  return Hash(key.data(), key.size(), 0xbc9f1d34);
+}
+
+class BloomFilterPolicy : public FilterPolicy {
+ private:
+  size_t bits_per_key_;
+  size_t k_;
+  uint32_t (*hash_func_)(const Slice& key);
+
+  void initialize() {
+    // We intentionally round down to reduce probing cost a little bit
+    k_ = static_cast<size_t>(bits_per_key_ * 0.69);  // 0.69 =~ ln(2)
+    if (k_ < 1) k_ = 1;
+    if (k_ > 30) k_ = 30;
+  }
+
+ public:
+  explicit BloomFilterPolicy(int bits_per_key,
+                             uint32_t (*hash_func)(const Slice& key))
+      : bits_per_key_(bits_per_key), hash_func_(hash_func) {
+    initialize();
+  }
+  explicit BloomFilterPolicy(int bits_per_key)
+      : bits_per_key_(bits_per_key) {
+    hash_func_ = BloomHash;
+    initialize();
+  }
+
+  virtual const char* Name() const {
+    return "rocksdb.BuiltinBloomFilter";
+  }
+
+  virtual void CreateFilter(const Slice* keys, int n, std::string* dst) const {
+    // Compute bloom filter size (in both bits and bytes)
+    size_t bits = n * bits_per_key_;
+
+    // For small n, we can see a very high false positive rate. Fix it
+    // by enforcing a minimum bloom filter length.
+    if (bits < 64) bits = 64;
+
+    size_t bytes = (bits + 7) / 8;
+    bits = bytes * 8;
+
+    const size_t init_size = dst->size();
+    dst->resize(init_size + bytes, 0);
+    dst->push_back(static_cast<char>(k_));  // Remember # of probes in filter
+    char* array = &(*dst)[init_size];
+    for (size_t i = 0; i < (size_t)n; i++) {
+      // Use double-hashing to generate a sequence of hash values.
+      // See analysis in [Kirsch,Mitzenmacher 2006].
+      uint32_t h = hash_func_(keys[i]);
+      const uint32_t delta = (h >> 17) | (h << 15);  // Rotate right 17 bits
+      for (size_t j = 0; j < k_; j++) {
+        const uint32_t bitpos = h % bits;
+        array[bitpos/8] |= (1 << (bitpos % 8));
+        h += delta;
+      }
+    }
+  }
+
+  virtual bool KeyMayMatch(const Slice& key, const Slice& bloom_filter) const {
+    const size_t len = bloom_filter.size();
+    if (len < 2) return false;
+
+    const char* array = bloom_filter.data();
+    const size_t bits = (len - 1) * 8;
+
+    // Use the encoded k so that we can read filters generated by
+    // bloom filters created using different parameters.
+    const size_t k = array[len-1];
+    if (k > 30) {
+      // Reserved for potentially new encodings for short bloom filters.
+      // Consider it a match.
+      return true;
+    }
+
+    uint32_t h = hash_func_(key);
+    const uint32_t delta = (h >> 17) | (h << 15);  // Rotate right 17 bits
+    for (size_t j = 0; j < k; j++) {
+      const uint32_t bitpos = h % bits;
+      if ((array[bitpos/8] & (1 << (bitpos % 8))) == 0) return false;
+      h += delta;
+    }
+    return true;
+  }
+};
+}  // namespace
+
+const FilterPolicy* NewBloomFilterPolicy(int bits_per_key) {
+  return new BloomFilterPolicy(bits_per_key);
+}
+
+}  // namespace rocksdb
diff --git a/util/bloom_test.cc b/util/bloom_test.cc
new file mode 100644
index 0000000000..881e3b0f59
--- /dev/null
+++ b/util/bloom_test.cc
@@ -0,0 +1,182 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef GFLAGS
+#include <cstdio>
+int main() {
+  fprintf(stderr, "Please install gflags to run rocksdb tools\n");
+  return 1;
+}
+#else
+
+#include <gflags/gflags.h>
+
+#include "rocksdb/filter_policy.h"
+
+#include "util/logging.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+
+using GFLAGS::ParseCommandLineFlags;
+
+DEFINE_int32(bits_per_key, 10, "");
+
+namespace rocksdb {
+
+static const int kVerbose = 1;
+
+static Slice Key(int i, char* buffer) {
+  memcpy(buffer, &i, sizeof(i));
+  return Slice(buffer, sizeof(i));
+}
+
+class BloomTest {
+ private:
+  const FilterPolicy* policy_;
+  std::string filter_;
+  std::vector<std::string> keys_;
+
+ public:
+  BloomTest() : policy_(NewBloomFilterPolicy(FLAGS_bits_per_key)) { }
+
+  ~BloomTest() {
+    delete policy_;
+  }
+
+  void Reset() {
+    keys_.clear();
+    filter_.clear();
+  }
+
+  void Add(const Slice& s) {
+    keys_.push_back(s.ToString());
+  }
+
+  void Build() {
+    std::vector<Slice> key_slices;
+    for (size_t i = 0; i < keys_.size(); i++) {
+      key_slices.push_back(Slice(keys_[i]));
+    }
+    filter_.clear();
+    policy_->CreateFilter(&key_slices[0], key_slices.size(), &filter_);
+    keys_.clear();
+    if (kVerbose >= 2) DumpFilter();
+  }
+
+  size_t FilterSize() const {
+    return filter_.size();
+  }
+
+  void DumpFilter() {
+    fprintf(stderr, "F(");
+    for (size_t i = 0; i+1 < filter_.size(); i++) {
+      const unsigned int c = static_cast<unsigned int>(filter_[i]);
+      for (int j = 0; j < 8; j++) {
+        fprintf(stderr, "%c", (c & (1 << j)) ? '1' : '.');
+      }
+    }
+    fprintf(stderr, ")\n");
+  }
+
+  bool Matches(const Slice& s) {
+    if (!keys_.empty()) {
+      Build();
+    }
+    return policy_->KeyMayMatch(s, filter_);
+  }
+
+  double FalsePositiveRate() {
+    char buffer[sizeof(int)];
+    int result = 0;
+    for (int i = 0; i < 10000; i++) {
+      if (Matches(Key(i + 1000000000, buffer))) {
+        result++;
+      }
+    }
+    return result / 10000.0;
+  }
+};
+
+TEST(BloomTest, EmptyFilter) {
+  ASSERT_TRUE(!Matches("hello"));
+  ASSERT_TRUE(!Matches("world"));
+}
+
+TEST(BloomTest, Small) {
+  Add("hello");
+  Add("world");
+  ASSERT_TRUE(Matches("hello"));
+  ASSERT_TRUE(Matches("world"));
+  ASSERT_TRUE(!Matches("x"));
+  ASSERT_TRUE(!Matches("foo"));
+}
+
+static int NextLength(int length) {
+  if (length < 10) {
+    length += 1;
+  } else if (length < 100) {
+    length += 10;
+  } else if (length < 1000) {
+    length += 100;
+  } else {
+    length += 1000;
+  }
+  return length;
+}
+
+TEST(BloomTest, VaryingLengths) {
+  char buffer[sizeof(int)];
+
+  // Count number of filters that significantly exceed the false positive rate
+  int mediocre_filters = 0;
+  int good_filters = 0;
+
+  for (int length = 1; length <= 10000; length = NextLength(length)) {
+    Reset();
+    for (int i = 0; i < length; i++) {
+      Add(Key(i, buffer));
+    }
+    Build();
+
+    ASSERT_LE(FilterSize(), (size_t)((length * 10 / 8) + 40)) << length;
+
+    // All added keys must match
+    for (int i = 0; i < length; i++) {
+      ASSERT_TRUE(Matches(Key(i, buffer)))
+          << "Length " << length << "; key " << i;
+    }
+
+    // Check false positive rate
+    double rate = FalsePositiveRate();
+    if (kVerbose >= 1) {
+      fprintf(stderr, "False positives: %5.2f%% @ length = %6d ; bytes = %6d\n",
+              rate*100.0, length, static_cast<int>(FilterSize()));
+    }
+    ASSERT_LE(rate, 0.02);  // Must not be over 2%
+    if (rate > 0.0125) mediocre_filters++;  // Allowed, but not too often
+    else good_filters++;
+  }
+  if (kVerbose >= 1) {
+    fprintf(stderr, "Filters: %d good, %d mediocre\n",
+            good_filters, mediocre_filters);
+  }
+  ASSERT_LE(mediocre_filters, good_filters/5);
+}
+
+// Different bits-per-byte
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  ParseCommandLineFlags(&argc, &argv, true);
+
+  return rocksdb::test::RunAllTests();
+}
+
+#endif  // GFLAGS
diff --git a/util/build_version.h b/util/build_version.h
new file mode 100644
index 0000000000..2035a7898f
--- /dev/null
+++ b/util/build_version.h
@@ -0,0 +1,16 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+#pragma once
+#if !defined(IOS_CROSS_COMPILE)
+// if we compile with Xcode, we don't run build_detect_version, so we don't
+// generate these variables
+// these variables tell us about the git config and time
+extern const char* rocksdb_build_git_sha;
+
+// these variables tell us when the compilation occurred
+extern const char* rocksdb_build_compile_time;
+extern const char* rocksdb_build_compile_date;
+#endif
diff --git a/util/cache.cc b/util/cache.cc
new file mode 100644
index 0000000000..f1c48a8299
--- /dev/null
+++ b/util/cache.cc
@@ -0,0 +1,481 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "rocksdb/cache.h"
+#include "port/port.h"
+#include "util/autovector.h"
+#include "util/hash.h"
+#include "util/mutexlock.h"
+
+namespace rocksdb {
+
+Cache::~Cache() {
+}
+
+namespace {
+
+// LRU cache implementation
+
+// An entry is a variable length heap-allocated structure. Entries
+// are kept in a circular doubly linked list ordered by access time.
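+// For example, with entries A (oldest), B and C (newest), the list is
+//   lru_ -> A -> B -> C -> lru_   (following the next pointers)
+// so eviction walks forward from lru_.next while LRU_Append() links new or
+// freshly accessed entries just before lru_.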
+struct LRUHandle {
+  void* value;
+  void (*deleter)(const Slice&, void* value);
+  LRUHandle* next_hash;
+  LRUHandle* next;
+  LRUHandle* prev;
+  size_t charge;      // TODO(opt): Only allow uint32_t?
+  size_t key_length;
+  uint32_t refs;
+  uint32_t hash;      // Hash of key(); used for fast sharding and comparisons
+  char key_data[1];   // Beginning of key
+
+  Slice key() const {
+    // For cheaper lookups, we allow a temporary Handle object
+    // to store a pointer to a key in "value".
+    if (next == this) {
+      return *(reinterpret_cast<Slice*>(value));
+    } else {
+      return Slice(key_data, key_length);
+    }
+  }
+};
+
+// We provide our own simple hash table since it removes a whole bunch
+// of porting hacks and is also faster than some of the built-in hash
+// table implementations in some of the compiler/runtime combinations
+// we have tested.  E.g., readrandom speeds up by ~5% over the g++
+// 4.4.3's builtin hashtable.
+class HandleTable {
+ public:
+  HandleTable() : length_(0), elems_(0), list_(nullptr) { Resize(); }
+  ~HandleTable() { delete[] list_; }
+
+  LRUHandle* Lookup(const Slice& key, uint32_t hash) {
+    return *FindPointer(key, hash);
+  }
+
+  LRUHandle* Insert(LRUHandle* h) {
+    LRUHandle** ptr = FindPointer(h->key(), h->hash);
+    LRUHandle* old = *ptr;
+    h->next_hash = (old == nullptr ? nullptr : old->next_hash);
+    *ptr = h;
+    if (old == nullptr) {
+      ++elems_;
+      if (elems_ > length_) {
+        // Since each cache entry is fairly large, we aim for a small
+        // average linked list length (<= 1).
+        Resize();
+      }
+    }
+    return old;
+  }
+
+  LRUHandle* Remove(const Slice& key, uint32_t hash) {
+    LRUHandle** ptr = FindPointer(key, hash);
+    LRUHandle* result = *ptr;
+    if (result != nullptr) {
+      *ptr = result->next_hash;
+      --elems_;
+    }
+    return result;
+  }
+
+ private:
+  // The table consists of an array of buckets where each bucket is
+  // a linked list of cache entries that hash into the bucket.
+  uint32_t length_;
+  uint32_t elems_;
+  LRUHandle** list_;
+
+  // Return a pointer to slot that points to a cache entry that
+  // matches key/hash.  If there is no such cache entry, return a
+  // pointer to the trailing slot in the corresponding linked list.
+  LRUHandle** FindPointer(const Slice& key, uint32_t hash) {
+    LRUHandle** ptr = &list_[hash & (length_ - 1)];
+    while (*ptr != nullptr &&
+           ((*ptr)->hash != hash || key != (*ptr)->key())) {
+      ptr = &(*ptr)->next_hash;
+    }
+    return ptr;
+  }
+
+  void Resize() {
+    uint32_t new_length = 16;
+    while (new_length < elems_ * 1.5) {
+      new_length *= 2;
+    }
+    LRUHandle** new_list = new LRUHandle*[new_length];
+    memset(new_list, 0, sizeof(new_list[0]) * new_length);
+    uint32_t count = 0;
+    for (uint32_t i = 0; i < length_; i++) {
+      LRUHandle* h = list_[i];
+      while (h != nullptr) {
+        LRUHandle* next = h->next_hash;
+        uint32_t hash = h->hash;
+        LRUHandle** ptr = &new_list[hash & (new_length - 1)];
+        h->next_hash = *ptr;
+        *ptr = h;
+        h = next;
+        count++;
+      }
+    }
+    assert(elems_ == count);
+    delete[] list_;
+    list_ = new_list;
+    length_ = new_length;
+  }
+};
+
+// A single shard of sharded cache.
+class LRUCache {
+ public:
+  LRUCache();
+  ~LRUCache();
+
+  // Separate from constructor so caller can easily make an array of LRUCache
+  void SetCapacity(size_t capacity) { capacity_ = capacity; }
+  void SetRemoveScanCountLimit(size_t remove_scan_count_limit) {
+    remove_scan_count_limit_ = remove_scan_count_limit;
+  }
+
+  // Like Cache methods, but with an extra "hash" parameter.
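+  // The hash is computed once by the sharding wrapper (ShardedLRUCache,
+  // below): its top bits select the shard, and it is passed down so the
+  // shard's hash table can reuse it instead of rehashing the key.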
+  Cache::Handle* Insert(const Slice& key, uint32_t hash,
+                        void* value, size_t charge,
+                        void (*deleter)(const Slice& key, void* value));
+  Cache::Handle* Lookup(const Slice& key, uint32_t hash);
+  void Release(Cache::Handle* handle);
+  void Erase(const Slice& key, uint32_t hash);
+  // Although on some platforms the update of size_t is atomic, to make sure
+  // GetUsage() works correctly on any platform, we protect this
+  // function with a mutex.
+  size_t GetUsage() const {
+    MutexLock l(&mutex_);
+    return usage_;
+  }
+
+  void ApplyToAllCacheEntries(void (*callback)(void*, size_t),
+                              bool thread_safe);
+
+ private:
+  void LRU_Remove(LRUHandle* e);
+  void LRU_Append(LRUHandle* e);
+  // Just reduce the reference count by 1.
+  // Return true if it was the last reference
+  bool Unref(LRUHandle* e);
+  // Call deleter and free
+  void FreeEntry(LRUHandle* e);
+
+  // Initialized before use.
+  size_t capacity_;
+  uint32_t remove_scan_count_limit_;
+
+  // mutex_ protects the following state.
+  // We don't count mutex_ as the cache's internal state so semantically we
+  // don't mind mutex_ invoking the non-const actions.
+  mutable port::Mutex mutex_;
+  size_t usage_;
+
+  // Dummy head of LRU list.
+  // lru.prev is the newest entry, lru.next is the oldest entry.
+  LRUHandle lru_;
+
+  HandleTable table_;
+};
+
+LRUCache::LRUCache()
+    : usage_(0) {
+  // Make empty circular linked list
+  lru_.next = &lru_;
+  lru_.prev = &lru_;
+}
+
+LRUCache::~LRUCache() {
+  for (LRUHandle* e = lru_.next; e != &lru_; ) {
+    LRUHandle* next = e->next;
+    assert(e->refs == 1);  // Error if caller has an unreleased handle
+    if (Unref(e)) {
+      FreeEntry(e);
+    }
+    e = next;
+  }
+}
+
+bool LRUCache::Unref(LRUHandle* e) {
+  assert(e->refs > 0);
+  e->refs--;
+  return e->refs == 0;
+}
+
+void LRUCache::FreeEntry(LRUHandle* e) {
+  assert(e->refs == 0);
+  (*e->deleter)(e->key(), e->value);
+  free(e);
+}
+
+void LRUCache::ApplyToAllCacheEntries(void (*callback)(void*, size_t),
+                                      bool thread_safe) {
+  if (thread_safe) {
+    mutex_.Lock();
+  }
+  for (auto e = lru_.next; e != &lru_; e = e->next) {
+    callback(e->value, e->charge);
+  }
+  if (thread_safe) {
+    mutex_.Unlock();
+  }
+}
+
+void LRUCache::LRU_Remove(LRUHandle* e) {
+  e->next->prev = e->prev;
+  e->prev->next = e->next;
+  usage_ -= e->charge;
+}
+
+void LRUCache::LRU_Append(LRUHandle* e) {
+  // Make "e" newest entry by inserting just before lru_
+  e->next = &lru_;
+  e->prev = lru_.prev;
+  e->prev->next = e;
+  e->next->prev = e;
+  usage_ += e->charge;
+}
+
+Cache::Handle* LRUCache::Lookup(const Slice& key, uint32_t hash) {
+  MutexLock l(&mutex_);
+  LRUHandle* e = table_.Lookup(key, hash);
+  if (e != nullptr) {
+    e->refs++;
+    LRU_Remove(e);
+    LRU_Append(e);
+  }
+  return reinterpret_cast<Cache::Handle*>(e);
+}
+
+void LRUCache::Release(Cache::Handle* handle) {
+  LRUHandle* e = reinterpret_cast<LRUHandle*>(handle);
+  bool last_reference = false;
+  {
+    MutexLock l(&mutex_);
+    last_reference = Unref(e);
+  }
+  if (last_reference) {
+    FreeEntry(e);
+  }
+}
+
+Cache::Handle* LRUCache::Insert(
+    const Slice& key, uint32_t hash, void* value, size_t charge,
+    void (*deleter)(const Slice& key, void* value)) {
+
+  LRUHandle* e = reinterpret_cast<LRUHandle*>(
+      malloc(sizeof(LRUHandle) - 1 + key.size()));
+  autovector<LRUHandle*> last_reference_list;
+
+  e->value = value;
+  e->deleter = deleter;
+  e->charge = charge;
+  e->key_length = key.size();
+  e->hash = hash;
+  e->refs = 2;  // One from LRUCache, one for the returned handle
+  memcpy(e->key_data, key.data(), key.size());
+
+  {
+    MutexLock l(&mutex_);
+
+    LRU_Append(e);
+
+    LRUHandle* old = table_.Insert(e);
+    if
(old != nullptr) { + LRU_Remove(old); + if (Unref(old)) { + last_reference_list.push_back(old); + } + } + + if (remove_scan_count_limit_ > 0) { + // Try to free the space by evicting the entries that are only + // referenced by the cache first. + LRUHandle* cur = lru_.next; + for (unsigned int scanCount = 0; + usage_ > capacity_ && cur != &lru_ + && scanCount < remove_scan_count_limit_; scanCount++) { + LRUHandle* next = cur->next; + if (cur->refs <= 1) { + LRU_Remove(cur); + table_.Remove(cur->key(), cur->hash); + if (Unref(cur)) { + last_reference_list.push_back(cur); + } + } + cur = next; + } + } + + // Free the space following strict LRU policy until enough space + // is freed. + while (usage_ > capacity_ && lru_.next != &lru_) { + LRUHandle* old = lru_.next; + LRU_Remove(old); + table_.Remove(old->key(), old->hash); + if (Unref(old)) { + last_reference_list.push_back(old); + } + } + } + + // we free the entries here outside of mutex for + // performance reasons + for (auto entry : last_reference_list) { + FreeEntry(entry); + } + + return reinterpret_cast(e); +} + +void LRUCache::Erase(const Slice& key, uint32_t hash) { + LRUHandle* e; + bool last_reference = false; + { + MutexLock l(&mutex_); + e = table_.Remove(key, hash); + if (e != nullptr) { + LRU_Remove(e); + last_reference = Unref(e); + } + } + // mutex not held here + // last_reference will only be true if e != nullptr + if (last_reference) { + FreeEntry(e); + } +} + +static int kNumShardBits = 4; // default values, can be overridden +static int kRemoveScanCountLimit = 0; // default values, can be overridden + +class ShardedLRUCache : public Cache { + private: + LRUCache* shards_; + port::Mutex id_mutex_; + uint64_t last_id_; + int num_shard_bits_; + size_t capacity_; + + static inline uint32_t HashSlice(const Slice& s) { + return Hash(s.data(), s.size(), 0); + } + + uint32_t Shard(uint32_t hash) { + // Note, hash >> 32 yields hash in gcc, not the zero we expect! + return (num_shard_bits_ > 0) ? 
(hash >> (32 - num_shard_bits_)) : 0;
+  }
+
+  void init(size_t capacity, int numbits, int removeScanCountLimit) {
+    num_shard_bits_ = numbits;
+    capacity_ = capacity;
+    int num_shards = 1 << num_shard_bits_;
+    shards_ = new LRUCache[num_shards];
+    const size_t per_shard = (capacity + (num_shards - 1)) / num_shards;
+    for (int s = 0; s < num_shards; s++) {
+      shards_[s].SetCapacity(per_shard);
+      shards_[s].SetRemoveScanCountLimit(removeScanCountLimit);
+    }
+  }
+
+ public:
+  explicit ShardedLRUCache(size_t capacity)
+      : last_id_(0) {
+    init(capacity, kNumShardBits, kRemoveScanCountLimit);
+  }
+  ShardedLRUCache(size_t capacity, int num_shard_bits,
+                  int removeScanCountLimit)
+      : last_id_(0) {
+    init(capacity, num_shard_bits, removeScanCountLimit);
+  }
+  virtual ~ShardedLRUCache() {
+    delete[] shards_;
+  }
+  virtual Handle* Insert(const Slice& key, void* value, size_t charge,
+                         void (*deleter)(const Slice& key, void* value)) {
+    const uint32_t hash = HashSlice(key);
+    return shards_[Shard(hash)].Insert(key, hash, value, charge, deleter);
+  }
+  virtual Handle* Lookup(const Slice& key) {
+    const uint32_t hash = HashSlice(key);
+    return shards_[Shard(hash)].Lookup(key, hash);
+  }
+  virtual void Release(Handle* handle) {
+    LRUHandle* h = reinterpret_cast<LRUHandle*>(handle);
+    shards_[Shard(h->hash)].Release(handle);
+  }
+  virtual void Erase(const Slice& key) {
+    const uint32_t hash = HashSlice(key);
+    shards_[Shard(hash)].Erase(key, hash);
+  }
+  virtual void* Value(Handle* handle) {
+    return reinterpret_cast<LRUHandle*>(handle)->value;
+  }
+  virtual uint64_t NewId() {
+    MutexLock l(&id_mutex_);
+    return ++(last_id_);
+  }
+  virtual size_t GetCapacity() const {
+    return capacity_;
+  }
+
+  virtual size_t GetUsage() const {
+    // We will not lock the cache when getting the usage from shards.
+    int num_shards = 1 << num_shard_bits_;
+    size_t usage = 0;
+    for (int s = 0; s < num_shards; s++) {
+      usage += shards_[s].GetUsage();
+    }
+    return usage;
+  }
+
+  virtual void DisownData() {
+    shards_ = nullptr;
+  }
+
+  virtual void ApplyToAllCacheEntries(void (*callback)(void*, size_t),
+                                      bool thread_safe) override {
+    int num_shards = 1 << num_shard_bits_;
+    for (int s = 0; s < num_shards; s++) {
+      shards_[s].ApplyToAllCacheEntries(callback, thread_safe);
+    }
+  }
+};
+
+}  // end anonymous namespace
+
+shared_ptr<Cache> NewLRUCache(size_t capacity) {
+  return NewLRUCache(capacity, kNumShardBits);
+}
+
+shared_ptr<Cache> NewLRUCache(size_t capacity, int num_shard_bits) {
+  return NewLRUCache(capacity, num_shard_bits, kRemoveScanCountLimit);
+}
+
+shared_ptr<Cache> NewLRUCache(size_t capacity, int num_shard_bits,
+                              int removeScanCountLimit) {
+  if (num_shard_bits >= 20) {
+    return nullptr;  // the cache cannot be sharded into too many fine pieces
+  }
+  return std::make_shared<ShardedLRUCache>(capacity,
+                                           num_shard_bits,
+                                           removeScanCountLimit);
+}
+
+}  // namespace rocksdb
diff --git a/util/cache_test.cc b/util/cache_test.cc
new file mode 100644
index 0000000000..c12cdb7e1b
--- /dev/null
+++ b/util/cache_test.cc
@@ -0,0 +1,449 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "rocksdb/cache.h"
+
+#include <algorithm>
+#include <iostream>
+#include <string>
+#include <vector>
+#include "util/coding.h"
+#include "util/testharness.h"
+
+namespace rocksdb {
+
+// Conversions between numeric keys/values and the types expected by Cache.
+static std::string EncodeKey(int k) {
+  std::string result;
+  PutFixed32(&result, k);
+  return result;
+}
+static int DecodeKey(const Slice& k) {
+  assert(k.size() == 4);
+  return DecodeFixed32(k.data());
+}
+static void* EncodeValue(uintptr_t v) { return reinterpret_cast<void*>(v); }
+static int DecodeValue(void* v) { return reinterpret_cast<uintptr_t>(v); }
+
+class CacheTest {
+ public:
+  static CacheTest* current_;
+
+  static void Deleter(const Slice& key, void* v) {
+    current_->deleted_keys_.push_back(DecodeKey(key));
+    current_->deleted_values_.push_back(DecodeValue(v));
+  }
+
+  static const int kCacheSize = 1000;
+  static const int kNumShardBits = 4;
+  static const int kRemoveScanCountLimit = 16;
+
+  static const int kCacheSize2 = 100;
+  static const int kNumShardBits2 = 2;
+  static const int kRemoveScanCountLimit2 = 200;
+
+  std::vector<int> deleted_keys_;
+  std::vector<int> deleted_values_;
+  shared_ptr<Cache> cache_;
+  shared_ptr<Cache> cache2_;
+
+  CacheTest() :
+      cache_(NewLRUCache(kCacheSize, kNumShardBits, kRemoveScanCountLimit)),
+      cache2_(NewLRUCache(kCacheSize2, kNumShardBits2,
+                          kRemoveScanCountLimit2)) {
+    current_ = this;
+  }
+
+  ~CacheTest() {
+  }
+
+  int Lookup(shared_ptr<Cache> cache, int key) {
+    Cache::Handle* handle = cache->Lookup(EncodeKey(key));
+    const int r = (handle == nullptr) ? -1 : DecodeValue(cache->Value(handle));
+    if (handle != nullptr) {
+      cache->Release(handle);
+    }
+    return r;
+  }
+
+  void Insert(shared_ptr<Cache> cache, int key, int value, int charge = 1) {
+    cache->Release(cache->Insert(EncodeKey(key), EncodeValue(value), charge,
+                                 &CacheTest::Deleter));
+  }
+
+  void Erase(shared_ptr<Cache> cache, int key) {
+    cache->Erase(EncodeKey(key));
+  }
+
+  int Lookup(int key) {
+    return Lookup(cache_, key);
+  }
+
+  void Insert(int key, int value, int charge = 1) {
+    Insert(cache_, key, value, charge);
+  }
+
+  void Erase(int key) {
+    Erase(cache_, key);
+  }
+
+  int Lookup2(int key) {
+    return Lookup(cache2_, key);
+  }
+
+  void Insert2(int key, int value, int charge = 1) {
+    Insert(cache2_, key, value, charge);
+  }
+
+  void Erase2(int key) {
+    Erase(cache2_, key);
+  }
+};
+CacheTest* CacheTest::current_;
+
+namespace {
+void dumbDeleter(const Slice& key, void* value) { }
+}  // namespace
+
+TEST(CacheTest, UsageTest) {
+  // cache is shared_ptr and will be automatically cleaned up.
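+  // Each insert below is charged key.size() + 5 bytes, so while everything
+  // still fits the reported usage equals the running sum of charges; e.g.
+  // the 3-byte key "aaa" accounts for 3 + 5 = 8 bytes of usage.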
+ const uint64_t kCapacity = 100000; + auto cache = NewLRUCache(kCapacity, 8, 200); + + size_t usage = 0; + const char* value = "abcdef"; + // make sure everything will be cached + for (int i = 1; i < 100; ++i) { + std::string key(i, 'a'); + auto kv_size = key.size() + 5; + cache->Release( + cache->Insert(key, (void*)value, kv_size, dumbDeleter) + ); + usage += kv_size; + ASSERT_EQ(usage, cache->GetUsage()); + } + + // make sure the cache will be overloaded + for (uint64_t i = 1; i < kCapacity; ++i) { + auto key = std::to_string(i); + cache->Release( + cache->Insert(key, (void*)value, key.size() + 5, dumbDeleter) + ); + } + + // the usage should be close to the capacity + ASSERT_GT(kCapacity, cache->GetUsage()); + ASSERT_LT(kCapacity * 0.95, cache->GetUsage()); +} + +TEST(CacheTest, HitAndMiss) { + ASSERT_EQ(-1, Lookup(100)); + + Insert(100, 101); + ASSERT_EQ(101, Lookup(100)); + ASSERT_EQ(-1, Lookup(200)); + ASSERT_EQ(-1, Lookup(300)); + + Insert(200, 201); + ASSERT_EQ(101, Lookup(100)); + ASSERT_EQ(201, Lookup(200)); + ASSERT_EQ(-1, Lookup(300)); + + Insert(100, 102); + ASSERT_EQ(102, Lookup(100)); + ASSERT_EQ(201, Lookup(200)); + ASSERT_EQ(-1, Lookup(300)); + + ASSERT_EQ(1U, deleted_keys_.size()); + ASSERT_EQ(100, deleted_keys_[0]); + ASSERT_EQ(101, deleted_values_[0]); +} + +TEST(CacheTest, Erase) { + Erase(200); + ASSERT_EQ(0U, deleted_keys_.size()); + + Insert(100, 101); + Insert(200, 201); + Erase(100); + ASSERT_EQ(-1, Lookup(100)); + ASSERT_EQ(201, Lookup(200)); + ASSERT_EQ(1U, deleted_keys_.size()); + ASSERT_EQ(100, deleted_keys_[0]); + ASSERT_EQ(101, deleted_values_[0]); + + Erase(100); + ASSERT_EQ(-1, Lookup(100)); + ASSERT_EQ(201, Lookup(200)); + ASSERT_EQ(1U, deleted_keys_.size()); +} + +TEST(CacheTest, EntriesArePinned) { + Insert(100, 101); + Cache::Handle* h1 = cache_->Lookup(EncodeKey(100)); + ASSERT_EQ(101, DecodeValue(cache_->Value(h1))); + + Insert(100, 102); + Cache::Handle* h2 = cache_->Lookup(EncodeKey(100)); + ASSERT_EQ(102, DecodeValue(cache_->Value(h2))); + ASSERT_EQ(0U, deleted_keys_.size()); + + cache_->Release(h1); + ASSERT_EQ(1U, deleted_keys_.size()); + ASSERT_EQ(100, deleted_keys_[0]); + ASSERT_EQ(101, deleted_values_[0]); + + Erase(100); + ASSERT_EQ(-1, Lookup(100)); + ASSERT_EQ(1U, deleted_keys_.size()); + + cache_->Release(h2); + ASSERT_EQ(2U, deleted_keys_.size()); + ASSERT_EQ(100, deleted_keys_[1]); + ASSERT_EQ(102, deleted_values_[1]); +} + +TEST(CacheTest, EvictionPolicy) { + Insert(100, 101); + Insert(200, 201); + + // Frequently used entry must be kept around + for (int i = 0; i < kCacheSize + 100; i++) { + Insert(1000+i, 2000+i); + ASSERT_EQ(2000+i, Lookup(1000+i)); + ASSERT_EQ(101, Lookup(100)); + } + ASSERT_EQ(101, Lookup(100)); + ASSERT_EQ(-1, Lookup(200)); +} + +TEST(CacheTest, EvictionPolicyRef) { + Insert(100, 101); + Insert(101, 102); + Insert(102, 103); + Insert(103, 104); + Insert(200, 101); + Insert(201, 102); + Insert(202, 103); + Insert(203, 104); + Cache::Handle* h201 = cache_->Lookup(EncodeKey(200)); + Cache::Handle* h202 = cache_->Lookup(EncodeKey(201)); + Cache::Handle* h203 = cache_->Lookup(EncodeKey(202)); + Cache::Handle* h204 = cache_->Lookup(EncodeKey(203)); + Insert(300, 101); + Insert(301, 102); + Insert(302, 103); + Insert(303, 104); + + // Insert entries much more than Cache capacity + for (int i = 0; i < kCacheSize + 100; i++) { + Insert(1000 + i, 2000 + i); + } + + // Check whether the entries inserted in the beginning + // are evicted. Ones without extra ref are evicted and + // those with are not. 
+  ASSERT_EQ(-1, Lookup(100));
+  ASSERT_EQ(-1, Lookup(101));
+  ASSERT_EQ(-1, Lookup(102));
+  ASSERT_EQ(-1, Lookup(103));
+
+  ASSERT_EQ(-1, Lookup(300));
+  ASSERT_EQ(-1, Lookup(301));
+  ASSERT_EQ(-1, Lookup(302));
+  ASSERT_EQ(-1, Lookup(303));
+
+  ASSERT_EQ(101, Lookup(200));
+  ASSERT_EQ(102, Lookup(201));
+  ASSERT_EQ(103, Lookup(202));
+  ASSERT_EQ(104, Lookup(203));
+
+  // Clean up all the handles
+  cache_->Release(h201);
+  cache_->Release(h202);
+  cache_->Release(h203);
+  cache_->Release(h204);
+}
+
+TEST(CacheTest, EvictionPolicyRef2) {
+  std::vector<Cache::Handle*> handles;
+
+  Insert(100, 101);
+  // Insert entries much more than Cache capacity
+  for (int i = 0; i < kCacheSize + 100; i++) {
+    Insert(1000 + i, 2000 + i);
+    if (i < kCacheSize) {
+      handles.push_back(cache_->Lookup(EncodeKey(1000 + i)));
+    }
+  }
+
+  // Referenced keys can also be deleted when there are not
+  // enough non-referenced keys to evict
+  for (int i = 0; i < 5; i++) {
+    ASSERT_EQ(-1, Lookup(1000 + i));
+  }
+
+  for (int i = kCacheSize; i < kCacheSize + 100; i++) {
+    ASSERT_EQ(2000 + i, Lookup(1000 + i));
+  }
+  ASSERT_EQ(-1, Lookup(100));
+
+  // Clean up all the handles
+  while (handles.size() > 0) {
+    cache_->Release(handles.back());
+    handles.pop_back();
+  }
+}
+
+TEST(CacheTest, EvictionPolicyRefLargeScanLimit) {
+  std::vector<Cache::Handle*> handles2;
+
+  // Cache2 has a RemoveScanCountLimit higher than its cache size,
+  // so it triggers a boundary condition.
+
+  // Populate the cache with 10 more keys than its size.
+  // Reference all keys except one close to the end.
+  for (int i = 0; i < kCacheSize2 + 10; i++) {
+    Insert2(1000 + i, 2000 + i);
+    if (i != kCacheSize2) {
+      handles2.push_back(cache2_->Lookup(EncodeKey(1000 + i)));
+    }
+  }
+
+  // Referenced keys can also be deleted when there are not
+  // enough non-referenced keys to evict
+  for (int i = 0; i < 3; i++) {
+    ASSERT_EQ(-1, Lookup2(1000 + i));
+  }
+  // The non-referenced value is deleted even if it was accessed
+  // recently.
+  ASSERT_EQ(-1, Lookup2(1000 + kCacheSize2));
+  // Other recently accessed values are not deleted since they
+  // are referenced.
+  for (int i = kCacheSize2 - 10; i < kCacheSize2 + 10; i++) {
+    if (i != kCacheSize2) {
+      ASSERT_EQ(2000 + i, Lookup2(1000 + i));
+    }
+  }
+
+  // Clean up all the handles
+  while (handles2.size() > 0) {
+    cache2_->Release(handles2.back());
+    handles2.pop_back();
+  }
+}
+
+TEST(CacheTest, HeavyEntries) {
+  // Add a bunch of light and heavy entries and then count the combined
+  // size of items still in the cache, which must be approximately the
+  // same as the total capacity.
+  const int kLight = 1;
+  const int kHeavy = 10;
+  int added = 0;
+  int index = 0;
+  while (added < 2 * kCacheSize) {
+    const int weight = (index & 1) ? kLight : kHeavy;
+    Insert(index, 1000 + index, weight);
+    added += weight;
+    index++;
+  }
+
+  int cached_weight = 0;
+  for (int i = 0; i < index; i++) {
+    const int weight = (i & 1 ? kLight : kHeavy);
+    int r = Lookup(i);
+    if (r >= 0) {
+      cached_weight += weight;
+      ASSERT_EQ(1000 + i, r);
+    }
+  }
+  ASSERT_LE(cached_weight, kCacheSize + kCacheSize / 10);
+}
+
+TEST(CacheTest, NewId) {
+  uint64_t a = cache_->NewId();
+  uint64_t b = cache_->NewId();
+  ASSERT_NE(a, b);
+}
+
+class Value {
+ private:
+  int v_;
+ public:
+  explicit Value(int v) : v_(v) { }
+
+  ~Value() { std::cout << v_ << " is destructed\n"; }
+};
+
+namespace {
+void deleter(const Slice& key, void* value) {
+  delete (Value *)value;
+}
+}  // namespace
+
+TEST(CacheTest, BadEviction) {
+  int n = 10;
+
+  // an LRUCache with n entries and one shard only
+  std::shared_ptr<Cache> cache = NewLRUCache(n, 0);
+
+  std::vector<Cache::Handle*> handles(n + 1);
+
+  // Insert n+1 entries, but don't release the handles.
+  for (int i = 0; i < n + 1; i++) {
+    std::string key = std::to_string(i + 1);
+    handles[i] = cache->Insert(key, new Value(i + 1), 1, &deleter);
+  }
+
+  // Guess what's in the cache now?
+  for (int i = 0; i < n + 1; i++) {
+    std::string key = std::to_string(i + 1);
+    auto h = cache->Lookup(key);
+    std::cout << key << (h ? " found\n" : " not found\n");
+    // Only the first entry should be missing
+    ASSERT_TRUE(h || i == 0);
+    if (h) cache->Release(h);
+  }
+
+  for (int i = 0; i < n + 1; i++) {
+    cache->Release(handles[i]);
+  }
+  std::cout << "Poor entries\n";
+}
+
+namespace {
+std::vector<std::pair<int, int>> callback_state;
+void callback(void* entry, size_t charge) {
+  callback_state.push_back({DecodeValue(entry), static_cast<int>(charge)});
+}
+}  // namespace
+
+TEST(CacheTest, ApplyToAllCacheEntriesTest) {
+  std::vector<std::pair<int, int>> inserted;
+  callback_state.clear();
+
+  for (int i = 0; i < 10; ++i) {
+    Insert(i, i * 2, i + 1);
+    inserted.push_back({i * 2, i + 1});
+  }
+  cache_->ApplyToAllCacheEntries(callback, true);
+
+  sort(inserted.begin(), inserted.end());
+  sort(callback_state.begin(), callback_state.end());
+  ASSERT_TRUE(inserted == callback_state);
+}
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  return rocksdb::test::RunAllTests();
+}
diff --git a/util/coding.cc b/util/coding.cc
new file mode 100644
index 0000000000..31ae0e356c
--- /dev/null
+++ b/util/coding.cc
@@ -0,0 +1,169 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
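+//
+// Varint example: 300 (binary 100101100) encodes to two bytes, least
+// significant 7-bit group first, with the high bit marking continuation:
+// 0xAC (0b0101100 | 0x80) followed by 0x02 (0b10).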
+
+#include "util/coding.h"
+
+#include <algorithm>
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+
+namespace rocksdb {
+
+char* EncodeVarint32(char* dst, uint32_t v) {
+  // Operate on characters as unsigneds
+  unsigned char* ptr = reinterpret_cast<unsigned char*>(dst);
+  static const int B = 128;
+  if (v < (1 << 7)) {
+    *(ptr++) = v;
+  } else if (v < (1 << 14)) {
+    *(ptr++) = v | B;
+    *(ptr++) = v >> 7;
+  } else if (v < (1 << 21)) {
+    *(ptr++) = v | B;
+    *(ptr++) = (v >> 7) | B;
+    *(ptr++) = v >> 14;
+  } else if (v < (1 << 28)) {
+    *(ptr++) = v | B;
+    *(ptr++) = (v >> 7) | B;
+    *(ptr++) = (v >> 14) | B;
+    *(ptr++) = v >> 21;
+  } else {
+    *(ptr++) = v | B;
+    *(ptr++) = (v >> 7) | B;
+    *(ptr++) = (v >> 14) | B;
+    *(ptr++) = (v >> 21) | B;
+    *(ptr++) = v >> 28;
+  }
+  return reinterpret_cast<char*>(ptr);
+}
+
+const char* GetVarint32PtrFallback(const char* p, const char* limit,
+                                   uint32_t* value) {
+  uint32_t result = 0;
+  for (uint32_t shift = 0; shift <= 28 && p < limit; shift += 7) {
+    uint32_t byte = *(reinterpret_cast<const unsigned char*>(p));
+    p++;
+    if (byte & 128) {
+      // More bytes are present
+      result |= ((byte & 127) << shift);
+    } else {
+      result |= (byte << shift);
+      *value = result;
+      return reinterpret_cast<const char*>(p);
+    }
+  }
+  return nullptr;
+}
+
+const char* GetVarint64Ptr(const char* p, const char* limit, uint64_t* value) {
+  uint64_t result = 0;
+  for (uint32_t shift = 0; shift <= 63 && p < limit; shift += 7) {
+    uint64_t byte = *(reinterpret_cast<const unsigned char*>(p));
+    p++;
+    if (byte & 128) {
+      // More bytes are present
+      result |= ((byte & 127) << shift);
+    } else {
+      result |= (byte << shift);
+      *value = result;
+      return reinterpret_cast<const char*>(p);
+    }
+  }
+  return nullptr;
+}
+
+void BitStreamPutInt(char* dst, size_t dstlen, size_t offset,
+                     uint32_t bits, uint64_t value) {
+  assert((offset + bits + 7)/8 <= dstlen);
+  assert(bits <= 64);
+
+  unsigned char* ptr = reinterpret_cast<unsigned char*>(dst);
+
+  size_t byteOffset = offset / 8;
+  size_t bitOffset = offset % 8;
+
+  // This prevents unused variable warnings when compiling.
+#ifndef NDEBUG
+  // Store truncated value.
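+  // e.g. with bits = 4 and value = 0x13 only 0x3 is kept, matching what
+  // the write loop below actually stores, so the assert can verify it.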
+  uint64_t origValue = (bits < 64) ? (value & (((uint64_t)1 << bits) - 1))
+                                   : value;
+  uint32_t origBits = bits;
+#endif
+
+  while (bits > 0) {
+    size_t bitsToGet = std::min<size_t>(bits, 8 - bitOffset);
+    unsigned char mask = ((1 << bitsToGet) - 1);
+
+    ptr[byteOffset] = (ptr[byteOffset] & ~(mask << bitOffset)) +
+                      ((value & mask) << bitOffset);
+
+    value >>= bitsToGet;
+    byteOffset += 1;
+    bitOffset = 0;
+    bits -= bitsToGet;
+  }
+
+  assert(origValue == BitStreamGetInt(dst, dstlen, offset, origBits));
+}
+
+uint64_t BitStreamGetInt(const char* src, size_t srclen, size_t offset,
+                         uint32_t bits) {
+  assert((offset + bits + 7)/8 <= srclen);
+  assert(bits <= 64);
+
+  const unsigned char* ptr = reinterpret_cast<const unsigned char*>(src);
+
+  uint64_t result = 0;
+
+  size_t byteOffset = offset / 8;
+  size_t bitOffset = offset % 8;
+  size_t shift = 0;
+
+  while (bits > 0) {
+    size_t bitsToGet = std::min<size_t>(bits, 8 - bitOffset);
+    unsigned char mask = ((1 << bitsToGet) - 1);
+
+    result += (uint64_t)((ptr[byteOffset] >> bitOffset) & mask) << shift;
+
+    shift += bitsToGet;
+    byteOffset += 1;
+    bitOffset = 0;
+    bits -= bitsToGet;
+  }
+
+  return result;
+}
+
+void BitStreamPutInt(std::string* dst, size_t offset, uint32_t bits,
+                     uint64_t value) {
+  assert((offset + bits + 7)/8 <= dst->size());
+
+  const size_t kTmpBufLen = sizeof(value) + 1;
+  char tmpBuf[kTmpBufLen];
+
+  // Number of bytes of tmpBuf being used
+  const size_t kUsedBytes = (offset % 8 + bits) / 8;
+
+  // Copy relevant parts of dst to tmpBuf
+  for (size_t idx = 0; idx <= kUsedBytes; ++idx) {
+    tmpBuf[idx] = (*dst)[offset / 8 + idx];
+  }
+
+  BitStreamPutInt(tmpBuf, kTmpBufLen, offset % 8, bits, value);
+
+  // Copy tmpBuf back to dst
+  for (size_t idx = 0; idx <= kUsedBytes; ++idx) {
+    (*dst)[offset / 8 + idx] = tmpBuf[idx];
+  }
+
+  // Do the check here too as we are working with a buffer.
+  assert(((bits < 64) ? (value & (((uint64_t)1 << bits) - 1)) : value) ==
+         BitStreamGetInt(dst, offset, bits));
+}
+
+}  // namespace rocksdb
diff --git a/util/coding.h b/util/coding.h
new file mode 100644
index 0000000000..8ffba51cbc
--- /dev/null
+++ b/util/coding.h
@@ -0,0 +1,294 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Endian-neutral encoding:
+// * Fixed-length numbers are encoded with least-significant byte first
+// * In addition we support variable length "varint" encoding
+// * Strings are encoded prefixed by their length in varint format
+
+#pragma once
+#include <algorithm>
+#include <stdint.h>
+#include <string.h>
+#include <string>
+
+#include "rocksdb/write_batch.h"
+#include "port/port.h"
+
+namespace rocksdb {
+
+// The maximum length of a varint in bytes for 32 and 64 bits respectively.
+const unsigned int kMaxVarint32Length = 5;
+const unsigned int kMaxVarint64Length = 10;
+
+// Standard Put... routines append to a string
+extern void PutFixed32(std::string* dst, uint32_t value);
+extern void PutFixed64(std::string* dst, uint64_t value);
+extern void PutVarint32(std::string* dst, uint32_t value);
+extern void PutVarint64(std::string* dst, uint64_t value);
+extern void PutLengthPrefixedSlice(std::string* dst, const Slice& value);
+extern void PutLengthPrefixedSliceParts(std::string* dst,
+                                        const SliceParts& slice_parts);
+
+// Standard Get... routines parse a value from the beginning of a Slice
+// and advance the slice past the parsed value.
+extern bool GetVarint32(Slice* input, uint32_t* value);
+extern bool GetVarint64(Slice* input, uint64_t* value);
+extern bool GetLengthPrefixedSlice(Slice* input, Slice* result);
+// This function assumes data is well-formed.
+extern Slice GetLengthPrefixedSlice(const char* data);
+
+extern Slice GetSliceUntil(Slice* slice, char delimiter);
+
+// Pointer-based variants of GetVarint... These either store a value
+// in *v and return a pointer just past the parsed value, or return
+// nullptr on error. These routines only look at bytes in the range
+// [p..limit-1]
+extern const char* GetVarint32Ptr(const char* p, const char* limit,
+                                  uint32_t* v);
+extern const char* GetVarint64Ptr(const char* p, const char* limit,
+                                  uint64_t* v);
+
+// Returns the length of the varint32 or varint64 encoding of "v"
+extern int VarintLength(uint64_t v);
+
+// Lower-level versions of Put... that write directly into a character buffer
+// REQUIRES: dst has enough space for the value being written
+extern void EncodeFixed32(char* dst, uint32_t value);
+extern void EncodeFixed64(char* dst, uint64_t value);
+
+// Lower-level versions of Put... that write directly into a character buffer
+// and return a pointer just past the last byte written.
+// REQUIRES: dst has enough space for the value being written
+extern char* EncodeVarint32(char* dst, uint32_t value);
+extern char* EncodeVarint64(char* dst, uint64_t value);
+
+// Lower-level versions of Get... that read directly from a character buffer
+// without any bounds checking.
+
+inline uint32_t DecodeFixed32(const char* ptr) {
+  if (port::kLittleEndian) {
+    // Load the raw bytes
+    uint32_t result;
+    memcpy(&result, ptr, sizeof(result));  // gcc optimizes this to a plain load
+    return result;
+  } else {
+    return ((static_cast<uint32_t>(static_cast<unsigned char>(ptr[0])))
+            | (static_cast<uint32_t>(static_cast<unsigned char>(ptr[1])) << 8)
+            | (static_cast<uint32_t>(static_cast<unsigned char>(ptr[2])) << 16)
+            | (static_cast<uint32_t>(static_cast<unsigned char>(ptr[3])) << 24));
+  }
+}
+
+inline uint64_t DecodeFixed64(const char* ptr) {
+  if (port::kLittleEndian) {
+    // Load the raw bytes
+    uint64_t result;
+    memcpy(&result, ptr, sizeof(result));  // gcc optimizes this to a plain load
+    return result;
+  } else {
+    uint64_t lo = DecodeFixed32(ptr);
+    uint64_t hi = DecodeFixed32(ptr + 4);
+    return (hi << 32) | lo;
+  }
+}
+
+// Internal routine for use by fallback path of GetVarint32Ptr
+extern const char* GetVarint32PtrFallback(const char* p,
+                                          const char* limit,
+                                          uint32_t* value);
+inline const char* GetVarint32Ptr(const char* p,
+                                  const char* limit,
+                                  uint32_t* value) {
+  if (p < limit) {
+    uint32_t result = *(reinterpret_cast<const unsigned char*>(p));
+    if ((result & 128) == 0) {
+      *value = result;
+      return p + 1;
+    }
+  }
+  return GetVarint32PtrFallback(p, limit, value);
+}
+
+// Writes an unsigned integer with bits number of bits with its least
+// significant bit at offset.
+// Bits are numbered from 0 to 7 in the first byte, 8 to 15 in the second and
+// so on.
+// value is truncated to the bits number of least significant bits.
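+// e.g. BitStreamPutInt(buf, buflen, 3, 5, v) writes the 5 least significant
+// bits of v into bit positions 3..7 of the first byte of buf.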
+// REQUIRES: (offset+bits+7)/8 <= dstlen
+// REQUIRES: bits <= 64
+extern void BitStreamPutInt(char* dst, size_t dstlen, size_t offset,
+                            uint32_t bits, uint64_t value);
+
+// Reads an unsigned integer with bits number of bits with its least
+// significant bit at offset.
+// Bits are numbered in the same way as in BitStreamPutInt().
+// REQUIRES: (offset+bits+7)/8 <= srclen
+// REQUIRES: bits <= 64
+extern uint64_t BitStreamGetInt(const char* src, size_t srclen, size_t offset,
+                                uint32_t bits);
+
+// Convenience functions
+extern void BitStreamPutInt(std::string* dst, size_t offset, uint32_t bits,
+                            uint64_t value);
+extern uint64_t BitStreamGetInt(const std::string* src, size_t offset,
+                                uint32_t bits);
+extern uint64_t BitStreamGetInt(const Slice* src, size_t offset,
+                                uint32_t bits);
+
+// -- Implementation of the functions declared above
+inline void EncodeFixed32(char* buf, uint32_t value) {
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+  memcpy(buf, &value, sizeof(value));
+#else
+  buf[0] = value & 0xff;
+  buf[1] = (value >> 8) & 0xff;
+  buf[2] = (value >> 16) & 0xff;
+  buf[3] = (value >> 24) & 0xff;
+#endif
+}
+
+inline void EncodeFixed64(char* buf, uint64_t value) {
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+  memcpy(buf, &value, sizeof(value));
+#else
+  buf[0] = value & 0xff;
+  buf[1] = (value >> 8) & 0xff;
+  buf[2] = (value >> 16) & 0xff;
+  buf[3] = (value >> 24) & 0xff;
+  buf[4] = (value >> 32) & 0xff;
+  buf[5] = (value >> 40) & 0xff;
+  buf[6] = (value >> 48) & 0xff;
+  buf[7] = (value >> 56) & 0xff;
+#endif
+}
+
+inline void PutFixed32(std::string* dst, uint32_t value) {
+  char buf[sizeof(value)];
+  EncodeFixed32(buf, value);
+  dst->append(buf, sizeof(buf));
+}
+
+inline void PutFixed64(std::string* dst, uint64_t value) {
+  char buf[sizeof(value)];
+  EncodeFixed64(buf, value);
+  dst->append(buf, sizeof(buf));
+}
+
+inline void PutVarint32(std::string* dst, uint32_t v) {
+  char buf[5];
+  char* ptr = EncodeVarint32(buf, v);
+  dst->append(buf, ptr - buf);
+}
+
+inline char* EncodeVarint64(char* dst, uint64_t v) {
+  static const unsigned int B = 128;
+  unsigned char* ptr = reinterpret_cast<unsigned char*>(dst);
+  while (v >= B) {
+    *(ptr++) = (v & (B - 1)) | B;
+    v >>= 7;
+  }
+  *(ptr++) = static_cast<unsigned char>(v);
+  return reinterpret_cast<char*>(ptr);
+}
+
+inline void PutVarint64(std::string* dst, uint64_t v) {
+  char buf[10];
+  char* ptr = EncodeVarint64(buf, v);
+  dst->append(buf, ptr - buf);
+}
+
+inline void PutLengthPrefixedSlice(std::string* dst, const Slice& value) {
+  PutVarint32(dst, value.size());
+  dst->append(value.data(), value.size());
+}
+
+inline void PutLengthPrefixedSliceParts(std::string* dst,
+                                        const SliceParts& slice_parts) {
+  uint32_t total_bytes = 0;
+  for (int i = 0; i < slice_parts.num_parts; ++i) {
+    total_bytes += slice_parts.parts[i].size();
+  }
+  PutVarint32(dst, total_bytes);
+  for (int i = 0; i < slice_parts.num_parts; ++i) {
+    dst->append(slice_parts.parts[i].data(), slice_parts.parts[i].size());
+  }
+}
+
+inline int VarintLength(uint64_t v) {
+  int len = 1;
+  while (v >= 128) {
+    v >>= 7;
+    len++;
+  }
+  return len;
+}
+
+inline bool GetVarint32(Slice* input, uint32_t* value) {
+  const char* p = input->data();
+  const char* limit = p + input->size();
+  const char* q = GetVarint32Ptr(p, limit, value);
+  if (q == nullptr) {
+    return false;
+  } else {
+    *input = Slice(q, limit - q);
+    return true;
+  }
+}
+
+inline bool GetVarint64(Slice* input, uint64_t* value) {
+  const char* p = input->data();
+  const char* limit = p + input->size();
+  const char* q = GetVarint64Ptr(p, limit, value);
+  if (q ==
nullptr) { + return false; + } else { + *input = Slice(q, limit - q); + return true; + } +} + +inline bool GetLengthPrefixedSlice(Slice* input, Slice* result) { + uint32_t len = 0; + if (GetVarint32(input, &len) && input->size() >= len) { + *result = Slice(input->data(), len); + input->remove_prefix(len); + return true; + } else { + return false; + } +} + +inline Slice GetLengthPrefixedSlice(const char* data) { + uint32_t len = 0; + // +5: we assume "data" is not corrupted + auto p = GetVarint32Ptr(data, data + 5 /* limit */, &len); + return Slice(p, len); +} + +inline Slice GetSliceUntil(Slice* slice, char delimiter) { + uint32_t len = 0; + for (len = 0; len < slice->size() && slice->data()[len] != delimiter; ++len) { + // nothing + } + + Slice ret(slice->data(), len); + slice->remove_prefix(len + ((len < slice->size()) ? 1 : 0)); + return ret; +} + +inline uint64_t BitStreamGetInt(const std::string* src, size_t offset, + uint32_t bits) { + return BitStreamGetInt(src->data(), src->size(), offset, bits); +} + +inline uint64_t BitStreamGetInt(const Slice* src, size_t offset, + uint32_t bits) { + return BitStreamGetInt(src->data(), src->size(), offset, bits); +} + +} // namespace rocksdb diff --git a/util/coding_test.cc b/util/coding_test.cc new file mode 100644 index 0000000000..ed542d6bf8 --- /dev/null +++ b/util/coding_test.cc @@ -0,0 +1,296 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
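The test file below exercises the helpers declared in util/coding.h. As a quick orientation, here is a minimal round-trip sketch of the Put/Get varint pairs (not part of the patch; it assumes util/coding.h and the Slice type from this tree):

    #include <cassert>
    #include <stdint.h>
    #include <string>

    #include "util/coding.h"

    int main() {
      std::string buf;
      rocksdb::PutVarint32(&buf, 300);         // 300 encodes as two bytes: 0xAC 0x02
      rocksdb::PutVarint64(&buf, 1ull << 40);  // 41 significant bits -> 6 bytes
      assert(buf.size() == 2 + 6);

      rocksdb::Slice in(buf);
      uint32_t v32 = 0;
      uint64_t v64 = 0;
      // Each Get... advances the slice past the value it parsed.
      assert(rocksdb::GetVarint32(&in, &v32) && v32 == 300);
      assert(rocksdb::GetVarint64(&in, &v64) && v64 == (1ull << 40));
      assert(in.empty());  // both values consumed
      return 0;
    }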
+ +#include "util/coding.h" + +#include "util/testharness.h" + +namespace rocksdb { + +class Coding { }; + +TEST(Coding, Fixed32) { + std::string s; + for (uint32_t v = 0; v < 100000; v++) { + PutFixed32(&s, v); + } + + const char* p = s.data(); + for (uint32_t v = 0; v < 100000; v++) { + uint32_t actual = DecodeFixed32(p); + ASSERT_EQ(v, actual); + p += sizeof(uint32_t); + } +} + +TEST(Coding, Fixed64) { + std::string s; + for (int power = 0; power <= 63; power++) { + uint64_t v = static_cast(1) << power; + PutFixed64(&s, v - 1); + PutFixed64(&s, v + 0); + PutFixed64(&s, v + 1); + } + + const char* p = s.data(); + for (int power = 0; power <= 63; power++) { + uint64_t v = static_cast(1) << power; + uint64_t actual = 0; + actual = DecodeFixed64(p); + ASSERT_EQ(v-1, actual); + p += sizeof(uint64_t); + + actual = DecodeFixed64(p); + ASSERT_EQ(v+0, actual); + p += sizeof(uint64_t); + + actual = DecodeFixed64(p); + ASSERT_EQ(v+1, actual); + p += sizeof(uint64_t); + } +} + +// Test that encoding routines generate little-endian encodings +TEST(Coding, EncodingOutput) { + std::string dst; + PutFixed32(&dst, 0x04030201); + ASSERT_EQ(4U, dst.size()); + ASSERT_EQ(0x01, static_cast(dst[0])); + ASSERT_EQ(0x02, static_cast(dst[1])); + ASSERT_EQ(0x03, static_cast(dst[2])); + ASSERT_EQ(0x04, static_cast(dst[3])); + + dst.clear(); + PutFixed64(&dst, 0x0807060504030201ull); + ASSERT_EQ(8U, dst.size()); + ASSERT_EQ(0x01, static_cast(dst[0])); + ASSERT_EQ(0x02, static_cast(dst[1])); + ASSERT_EQ(0x03, static_cast(dst[2])); + ASSERT_EQ(0x04, static_cast(dst[3])); + ASSERT_EQ(0x05, static_cast(dst[4])); + ASSERT_EQ(0x06, static_cast(dst[5])); + ASSERT_EQ(0x07, static_cast(dst[6])); + ASSERT_EQ(0x08, static_cast(dst[7])); +} + +TEST(Coding, Varint32) { + std::string s; + for (uint32_t i = 0; i < (32 * 32); i++) { + uint32_t v = (i / 32) << (i % 32); + PutVarint32(&s, v); + } + + const char* p = s.data(); + const char* limit = p + s.size(); + for (uint32_t i = 0; i < (32 * 32); i++) { + uint32_t expected = (i / 32) << (i % 32); + uint32_t actual = 0; + const char* start = p; + p = GetVarint32Ptr(p, limit, &actual); + ASSERT_TRUE(p != nullptr); + ASSERT_EQ(expected, actual); + ASSERT_EQ(VarintLength(actual), p - start); + } + ASSERT_EQ(p, s.data() + s.size()); +} + +TEST(Coding, Varint64) { + // Construct the list of values to check + std::vector values; + // Some special values + values.push_back(0); + values.push_back(100); + values.push_back(~static_cast(0)); + values.push_back(~static_cast(0) - 1); + for (uint32_t k = 0; k < 64; k++) { + // Test values near powers of two + const uint64_t power = 1ull << k; + values.push_back(power); + values.push_back(power-1); + values.push_back(power+1); + }; + + std::string s; + for (unsigned int i = 0; i < values.size(); i++) { + PutVarint64(&s, values[i]); + } + + const char* p = s.data(); + const char* limit = p + s.size(); + for (unsigned int i = 0; i < values.size(); i++) { + ASSERT_TRUE(p < limit); + uint64_t actual = 0; + const char* start = p; + p = GetVarint64Ptr(p, limit, &actual); + ASSERT_TRUE(p != nullptr); + ASSERT_EQ(values[i], actual); + ASSERT_EQ(VarintLength(actual), p - start); + } + ASSERT_EQ(p, limit); + +} + +TEST(Coding, Varint32Overflow) { + uint32_t result; + std::string input("\x81\x82\x83\x84\x85\x11"); + ASSERT_TRUE(GetVarint32Ptr(input.data(), input.data() + input.size(), &result) + == nullptr); +} + +TEST(Coding, Varint32Truncation) { + uint32_t large_value = (1u << 31) + 100; + std::string s; + PutVarint32(&s, large_value); + uint32_t result; 
+ for (unsigned int len = 0; len < s.size() - 1; len++) { + ASSERT_TRUE(GetVarint32Ptr(s.data(), s.data() + len, &result) == nullptr); + } + ASSERT_TRUE( + GetVarint32Ptr(s.data(), s.data() + s.size(), &result) != nullptr); + ASSERT_EQ(large_value, result); +} + +TEST(Coding, Varint64Overflow) { + uint64_t result; + std::string input("\x81\x82\x83\x84\x85\x81\x82\x83\x84\x85\x11"); + ASSERT_TRUE(GetVarint64Ptr(input.data(), input.data() + input.size(), &result) + == nullptr); +} + +TEST(Coding, Varint64Truncation) { + uint64_t large_value = (1ull << 63) + 100ull; + std::string s; + PutVarint64(&s, large_value); + uint64_t result; + for (unsigned int len = 0; len < s.size() - 1; len++) { + ASSERT_TRUE(GetVarint64Ptr(s.data(), s.data() + len, &result) == nullptr); + } + ASSERT_TRUE( + GetVarint64Ptr(s.data(), s.data() + s.size(), &result) != nullptr); + ASSERT_EQ(large_value, result); +} + +TEST(Coding, Strings) { + std::string s; + PutLengthPrefixedSlice(&s, Slice("")); + PutLengthPrefixedSlice(&s, Slice("foo")); + PutLengthPrefixedSlice(&s, Slice("bar")); + PutLengthPrefixedSlice(&s, Slice(std::string(200, 'x'))); + + Slice input(s); + Slice v; + ASSERT_TRUE(GetLengthPrefixedSlice(&input, &v)); + ASSERT_EQ("", v.ToString()); + ASSERT_TRUE(GetLengthPrefixedSlice(&input, &v)); + ASSERT_EQ("foo", v.ToString()); + ASSERT_TRUE(GetLengthPrefixedSlice(&input, &v)); + ASSERT_EQ("bar", v.ToString()); + ASSERT_TRUE(GetLengthPrefixedSlice(&input, &v)); + ASSERT_EQ(std::string(200, 'x'), v.ToString()); + ASSERT_EQ("", input.ToString()); +} + +TEST(Coding, BitStream) { + const int kNumBytes = 10; + char bytes[kNumBytes+1]; + for (int i = 0; i < kNumBytes + 1; ++i) { + bytes[i] = '\0'; + } + + // Simple byte aligned test. + for (int i = 0; i < kNumBytes; ++i) { + BitStreamPutInt(bytes, kNumBytes, i*8, 8, 255-i); + + ASSERT_EQ((unsigned char)bytes[i], (unsigned char)(255-i)); + } + for (int i = 0; i < kNumBytes; ++i) { + ASSERT_EQ(BitStreamGetInt(bytes, kNumBytes, i*8, 8), (uint32_t)(255-i)); + } + ASSERT_EQ(bytes[kNumBytes], '\0'); + + // Write and read back at strange offsets + for (int i = 0; i < kNumBytes + 1; ++i) { + bytes[i] = '\0'; + } + for (int i = 0; i < kNumBytes; ++i) { + BitStreamPutInt(bytes, kNumBytes, i*5+1, 4, (i * 7) % (1 << 4)); + } + for (int i = 0; i < kNumBytes; ++i) { + ASSERT_EQ(BitStreamGetInt(bytes, kNumBytes, i*5+1, 4), + (uint32_t)((i * 7) % (1 << 4))); + } + ASSERT_EQ(bytes[kNumBytes], '\0'); + + // Create 11011011 as a bit pattern + for (int i = 0; i < kNumBytes + 1; ++i) { + bytes[i] = '\0'; + } + for (int i = 0; i < kNumBytes; ++i) { + BitStreamPutInt(bytes, kNumBytes, i*8, 2, 3); + BitStreamPutInt(bytes, kNumBytes, i*8+3, 2, 3); + BitStreamPutInt(bytes, kNumBytes, i*8+6, 2, 3); + + ASSERT_EQ((unsigned char)bytes[i], + (unsigned char)(3 + (3 << 3) + (3 << 6))); + } + ASSERT_EQ(bytes[kNumBytes], '\0'); + + + // Test large values + for (int i = 0; i < kNumBytes + 1; ++i) { + bytes[i] = '\0'; + } + BitStreamPutInt(bytes, kNumBytes, 0, 64, (uint64_t)(-1)); + for (int i = 0; i < 64/8; ++i) { + ASSERT_EQ((unsigned char)bytes[i], + (unsigned char)(255)); + } + ASSERT_EQ(bytes[64/8], '\0'); + + +} + +TEST(Coding, BitStreamConvenienceFuncs) { + std::string bytes(1, '\0'); + + // Check that independent changes to byte are preserved. 
+  BitStreamPutInt(&bytes, 0, 2, 3);
+  BitStreamPutInt(&bytes, 3, 2, 3);
+  BitStreamPutInt(&bytes, 6, 2, 3);
+  ASSERT_EQ((unsigned char)bytes[0], (unsigned char)(3 + (3 << 3) + (3 << 6)));
+  ASSERT_EQ(BitStreamGetInt(&bytes, 0, 2), 3u);
+  ASSERT_EQ(BitStreamGetInt(&bytes, 3, 2), 3u);
+  ASSERT_EQ(BitStreamGetInt(&bytes, 6, 2), 3u);
+  Slice slice(bytes);
+  ASSERT_EQ(BitStreamGetInt(&slice, 0, 2), 3u);
+  ASSERT_EQ(BitStreamGetInt(&slice, 3, 2), 3u);
+  ASSERT_EQ(BitStreamGetInt(&slice, 6, 2), 3u);
+
+  // Test overlapping crossing over byte boundaries
+  bytes = std::string(2, '\0');
+  BitStreamPutInt(&bytes, 6, 4, 15);
+  ASSERT_EQ((unsigned char)bytes[0], 3 << 6);
+  ASSERT_EQ((unsigned char)bytes[1], 3);
+  ASSERT_EQ(BitStreamGetInt(&bytes, 6, 4), 15u);
+  slice = Slice(bytes);
+  ASSERT_EQ(BitStreamGetInt(&slice, 6, 4), 15u);
+
+  // Test 64-bit number
+  bytes = std::string(64/8, '\0');
+  BitStreamPutInt(&bytes, 0, 64, (uint64_t)(-1));
+  ASSERT_EQ(BitStreamGetInt(&bytes, 0, 64), (uint64_t)(-1));
+  slice = Slice(bytes);
+  ASSERT_EQ(BitStreamGetInt(&slice, 0, 64), (uint64_t)(-1));
+}
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  return rocksdb::test::RunAllTests();
+}
diff --git a/util/comparator.cc b/util/comparator.cc
new file mode 100644
index 0000000000..adeacac0ac
--- /dev/null
+++ b/util/comparator.cc
@@ -0,0 +1,86 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <algorithm>
+#include <stdint.h>
+#include "rocksdb/comparator.h"
+#include "rocksdb/slice.h"
+#include "port/port.h"
+#include "util/logging.h"
+
+namespace rocksdb {
+
+Comparator::~Comparator() { }
+
+namespace {
+class BytewiseComparatorImpl : public Comparator {
+ public:
+  BytewiseComparatorImpl() { }
+
+  virtual const char* Name() const {
+    return "leveldb.BytewiseComparator";
+  }
+
+  virtual int Compare(const Slice& a, const Slice& b) const {
+    return a.compare(b);
+  }
+
+  virtual void FindShortestSeparator(
+      std::string* start,
+      const Slice& limit) const {
+    // Find length of common prefix
+    size_t min_length = std::min(start->size(), limit.size());
+    size_t diff_index = 0;
+    while ((diff_index < min_length) &&
+           ((*start)[diff_index] == limit[diff_index])) {
+      diff_index++;
+    }
+
+    if (diff_index >= min_length) {
+      // Do not shorten if one string is a prefix of the other
+    } else {
+      uint8_t diff_byte = static_cast<uint8_t>((*start)[diff_index]);
+      if (diff_byte < static_cast<uint8_t>(0xff) &&
+          diff_byte + 1 < static_cast<uint8_t>(limit[diff_index])) {
+        (*start)[diff_index]++;
+        start->resize(diff_index + 1);
+        assert(Compare(*start, limit) < 0);
+      }
+    }
+  }
+
+  virtual void FindShortSuccessor(std::string* key) const {
+    // Find first character that can be incremented
+    size_t n = key->size();
+    for (size_t i = 0; i < n; i++) {
+      const uint8_t byte = (*key)[i];
+      if (byte != static_cast<uint8_t>(0xff)) {
+        (*key)[i] = byte + 1;
+        key->resize(i+1);
+        return;
+      }
+    }
+    // *key is a run of 0xffs. Leave it alone.
+  }
+};
+}  // namespace
+
+static port::OnceType once = LEVELDB_ONCE_INIT;
+static const Comparator* bytewise;
+
+static void InitModule() {
+  bytewise = new BytewiseComparatorImpl;
+}
+
+const Comparator* BytewiseComparator() {
+  port::InitOnce(&once, InitModule);
+  return bytewise;
+}
+
+}  // namespace rocksdb
diff --git a/util/crc32c.cc b/util/crc32c.cc
new file mode 100644
index 0000000000..d27fb4be98
--- /dev/null
+++ b/util/crc32c.cc
@@ -0,0 +1,393 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// A portable implementation of crc32c, optimized to handle
+// four bytes at a time.
+
+#include "util/crc32c.h"
+
+#include <stdint.h>
+#ifdef __SSE4_2__
+#include <nmmintrin.h>
+#endif
+#include "util/coding.h"
+
+namespace rocksdb {
+namespace crc32c {
+
+static const uint32_t table0_[256] = {
+  0x00000000, 0xf26b8303, 0xe13b70f7, 0x1350f3f4,
+  0xc79a971f, 0x35f1141c, 0x26a1e7e8, 0xd4ca64eb,
+  0x8ad958cf, 0x78b2dbcc, 0x6be22838, 0x9989ab3b,
+  0x4d43cfd0, 0xbf284cd3, 0xac78bf27, 0x5e133c24,
+  0x105ec76f, 0xe235446c, 0xf165b798, 0x030e349b,
+  0xd7c45070, 0x25afd373, 0x36ff2087, 0xc494a384,
+  0x9a879fa0, 0x68ec1ca3, 0x7bbcef57, 0x89d76c54,
+  0x5d1d08bf, 0xaf768bbc, 0xbc267848, 0x4e4dfb4b,
+  0x20bd8ede, 0xd2d60ddd, 0xc186fe29, 0x33ed7d2a,
+  0xe72719c1, 0x154c9ac2, 0x061c6936, 0xf477ea35,
+  0xaa64d611, 0x580f5512, 0x4b5fa6e6, 0xb93425e5,
+  0x6dfe410e, 0x9f95c20d, 0x8cc531f9, 0x7eaeb2fa,
+  0x30e349b1, 0xc288cab2, 0xd1d83946, 0x23b3ba45,
+  0xf779deae, 0x05125dad, 0x1642ae59, 0xe4292d5a,
+  0xba3a117e, 0x4851927d, 0x5b016189, 0xa96ae28a,
+  0x7da08661, 0x8fcb0562, 0x9c9bf696, 0x6ef07595,
+  0x417b1dbc, 0xb3109ebf, 0xa0406d4b, 0x522bee48,
+  0x86e18aa3, 0x748a09a0, 0x67dafa54, 0x95b17957,
+  0xcba24573, 0x39c9c670, 0x2a993584, 0xd8f2b687,
+  0x0c38d26c, 0xfe53516f, 0xed03a29b, 0x1f682198,
+  0x5125dad3, 0xa34e59d0, 0xb01eaa24, 0x42752927,
+  0x96bf4dcc, 0x64d4cecf, 0x77843d3b, 0x85efbe38,
+  0xdbfc821c, 0x2997011f, 0x3ac7f2eb, 0xc8ac71e8,
+  0x1c661503, 0xee0d9600, 0xfd5d65f4, 0x0f36e6f7,
+  0x61c69362, 0x93ad1061, 0x80fde395, 0x72966096,
+  0xa65c047d, 0x5437877e, 0x4767748a, 0xb50cf789,
+  0xeb1fcbad, 0x197448ae, 0x0a24bb5a, 0xf84f3859,
+  0x2c855cb2, 0xdeeedfb1, 0xcdbe2c45, 0x3fd5af46,
+  0x7198540d, 0x83f3d70e, 0x90a324fa, 0x62c8a7f9,
+  0xb602c312, 0x44694011, 0x5739b3e5, 0xa55230e6,
+  0xfb410cc2, 0x092a8fc1, 0x1a7a7c35, 0xe811ff36,
+  0x3cdb9bdd, 0xceb018de, 0xdde0eb2a, 0x2f8b6829,
+  0x82f63b78, 0x709db87b, 0x63cd4b8f, 0x91a6c88c,
+  0x456cac67, 0xb7072f64, 0xa457dc90, 0x563c5f93,
+  0x082f63b7, 0xfa44e0b4, 0xe9141340, 0x1b7f9043,
+  0xcfb5f4a8, 0x3dde77ab, 0x2e8e845f, 0xdce5075c,
+  0x92a8fc17, 0x60c37f14, 0x73938ce0, 0x81f80fe3,
+  0x55326b08, 0xa759e80b, 0xb4091bff, 0x466298fc,
+  0x1871a4d8, 0xea1a27db, 0xf94ad42f, 0x0b21572c,
+  0xdfeb33c7, 0x2d80b0c4, 0x3ed04330, 0xccbbc033,
+  0xa24bb5a6, 0x502036a5, 0x4370c551, 0xb11b4652,
+  0x65d122b9, 0x97baa1ba, 0x84ea524e, 0x7681d14d,
+  0x2892ed69, 0xdaf96e6a, 0xc9a99d9e, 0x3bc21e9d,
+  0xef087a76, 0x1d63f975, 0x0e330a81, 0xfc588982,
+  0xb21572c9, 0x407ef1ca, 0x532e023e, 0xa145813d,
+  0x758fe5d6, 0x87e466d5, 0x94b49521, 0x66df1622,
+ 0x38cc2a06, 0xcaa7a905, 0xd9f75af1, 0x2b9cd9f2, + 0xff56bd19, 0x0d3d3e1a, 0x1e6dcdee, 0xec064eed, + 0xc38d26c4, 0x31e6a5c7, 0x22b65633, 0xd0ddd530, + 0x0417b1db, 0xf67c32d8, 0xe52cc12c, 0x1747422f, + 0x49547e0b, 0xbb3ffd08, 0xa86f0efc, 0x5a048dff, + 0x8ecee914, 0x7ca56a17, 0x6ff599e3, 0x9d9e1ae0, + 0xd3d3e1ab, 0x21b862a8, 0x32e8915c, 0xc083125f, + 0x144976b4, 0xe622f5b7, 0xf5720643, 0x07198540, + 0x590ab964, 0xab613a67, 0xb831c993, 0x4a5a4a90, + 0x9e902e7b, 0x6cfbad78, 0x7fab5e8c, 0x8dc0dd8f, + 0xe330a81a, 0x115b2b19, 0x020bd8ed, 0xf0605bee, + 0x24aa3f05, 0xd6c1bc06, 0xc5914ff2, 0x37faccf1, + 0x69e9f0d5, 0x9b8273d6, 0x88d28022, 0x7ab90321, + 0xae7367ca, 0x5c18e4c9, 0x4f48173d, 0xbd23943e, + 0xf36e6f75, 0x0105ec76, 0x12551f82, 0xe03e9c81, + 0x34f4f86a, 0xc69f7b69, 0xd5cf889d, 0x27a40b9e, + 0x79b737ba, 0x8bdcb4b9, 0x988c474d, 0x6ae7c44e, + 0xbe2da0a5, 0x4c4623a6, 0x5f16d052, 0xad7d5351 +}; +static const uint32_t table1_[256] = { + 0x00000000, 0x13a29877, 0x274530ee, 0x34e7a899, + 0x4e8a61dc, 0x5d28f9ab, 0x69cf5132, 0x7a6dc945, + 0x9d14c3b8, 0x8eb65bcf, 0xba51f356, 0xa9f36b21, + 0xd39ea264, 0xc03c3a13, 0xf4db928a, 0xe7790afd, + 0x3fc5f181, 0x2c6769f6, 0x1880c16f, 0x0b225918, + 0x714f905d, 0x62ed082a, 0x560aa0b3, 0x45a838c4, + 0xa2d13239, 0xb173aa4e, 0x859402d7, 0x96369aa0, + 0xec5b53e5, 0xfff9cb92, 0xcb1e630b, 0xd8bcfb7c, + 0x7f8be302, 0x6c297b75, 0x58ced3ec, 0x4b6c4b9b, + 0x310182de, 0x22a31aa9, 0x1644b230, 0x05e62a47, + 0xe29f20ba, 0xf13db8cd, 0xc5da1054, 0xd6788823, + 0xac154166, 0xbfb7d911, 0x8b507188, 0x98f2e9ff, + 0x404e1283, 0x53ec8af4, 0x670b226d, 0x74a9ba1a, + 0x0ec4735f, 0x1d66eb28, 0x298143b1, 0x3a23dbc6, + 0xdd5ad13b, 0xcef8494c, 0xfa1fe1d5, 0xe9bd79a2, + 0x93d0b0e7, 0x80722890, 0xb4958009, 0xa737187e, + 0xff17c604, 0xecb55e73, 0xd852f6ea, 0xcbf06e9d, + 0xb19da7d8, 0xa23f3faf, 0x96d89736, 0x857a0f41, + 0x620305bc, 0x71a19dcb, 0x45463552, 0x56e4ad25, + 0x2c896460, 0x3f2bfc17, 0x0bcc548e, 0x186eccf9, + 0xc0d23785, 0xd370aff2, 0xe797076b, 0xf4359f1c, + 0x8e585659, 0x9dface2e, 0xa91d66b7, 0xbabffec0, + 0x5dc6f43d, 0x4e646c4a, 0x7a83c4d3, 0x69215ca4, + 0x134c95e1, 0x00ee0d96, 0x3409a50f, 0x27ab3d78, + 0x809c2506, 0x933ebd71, 0xa7d915e8, 0xb47b8d9f, + 0xce1644da, 0xddb4dcad, 0xe9537434, 0xfaf1ec43, + 0x1d88e6be, 0x0e2a7ec9, 0x3acdd650, 0x296f4e27, + 0x53028762, 0x40a01f15, 0x7447b78c, 0x67e52ffb, + 0xbf59d487, 0xacfb4cf0, 0x981ce469, 0x8bbe7c1e, + 0xf1d3b55b, 0xe2712d2c, 0xd69685b5, 0xc5341dc2, + 0x224d173f, 0x31ef8f48, 0x050827d1, 0x16aabfa6, + 0x6cc776e3, 0x7f65ee94, 0x4b82460d, 0x5820de7a, + 0xfbc3faf9, 0xe861628e, 0xdc86ca17, 0xcf245260, + 0xb5499b25, 0xa6eb0352, 0x920cabcb, 0x81ae33bc, + 0x66d73941, 0x7575a136, 0x419209af, 0x523091d8, + 0x285d589d, 0x3bffc0ea, 0x0f186873, 0x1cbaf004, + 0xc4060b78, 0xd7a4930f, 0xe3433b96, 0xf0e1a3e1, + 0x8a8c6aa4, 0x992ef2d3, 0xadc95a4a, 0xbe6bc23d, + 0x5912c8c0, 0x4ab050b7, 0x7e57f82e, 0x6df56059, + 0x1798a91c, 0x043a316b, 0x30dd99f2, 0x237f0185, + 0x844819fb, 0x97ea818c, 0xa30d2915, 0xb0afb162, + 0xcac27827, 0xd960e050, 0xed8748c9, 0xfe25d0be, + 0x195cda43, 0x0afe4234, 0x3e19eaad, 0x2dbb72da, + 0x57d6bb9f, 0x447423e8, 0x70938b71, 0x63311306, + 0xbb8de87a, 0xa82f700d, 0x9cc8d894, 0x8f6a40e3, + 0xf50789a6, 0xe6a511d1, 0xd242b948, 0xc1e0213f, + 0x26992bc2, 0x353bb3b5, 0x01dc1b2c, 0x127e835b, + 0x68134a1e, 0x7bb1d269, 0x4f567af0, 0x5cf4e287, + 0x04d43cfd, 0x1776a48a, 0x23910c13, 0x30339464, + 0x4a5e5d21, 0x59fcc556, 0x6d1b6dcf, 0x7eb9f5b8, + 0x99c0ff45, 0x8a626732, 0xbe85cfab, 0xad2757dc, + 0xd74a9e99, 0xc4e806ee, 0xf00fae77, 0xe3ad3600, + 
0x3b11cd7c, 0x28b3550b, 0x1c54fd92, 0x0ff665e5, + 0x759baca0, 0x663934d7, 0x52de9c4e, 0x417c0439, + 0xa6050ec4, 0xb5a796b3, 0x81403e2a, 0x92e2a65d, + 0xe88f6f18, 0xfb2df76f, 0xcfca5ff6, 0xdc68c781, + 0x7b5fdfff, 0x68fd4788, 0x5c1aef11, 0x4fb87766, + 0x35d5be23, 0x26772654, 0x12908ecd, 0x013216ba, + 0xe64b1c47, 0xf5e98430, 0xc10e2ca9, 0xd2acb4de, + 0xa8c17d9b, 0xbb63e5ec, 0x8f844d75, 0x9c26d502, + 0x449a2e7e, 0x5738b609, 0x63df1e90, 0x707d86e7, + 0x0a104fa2, 0x19b2d7d5, 0x2d557f4c, 0x3ef7e73b, + 0xd98eedc6, 0xca2c75b1, 0xfecbdd28, 0xed69455f, + 0x97048c1a, 0x84a6146d, 0xb041bcf4, 0xa3e32483 +}; +static const uint32_t table2_[256] = { + 0x00000000, 0xa541927e, 0x4f6f520d, 0xea2ec073, + 0x9edea41a, 0x3b9f3664, 0xd1b1f617, 0x74f06469, + 0x38513ec5, 0x9d10acbb, 0x773e6cc8, 0xd27ffeb6, + 0xa68f9adf, 0x03ce08a1, 0xe9e0c8d2, 0x4ca15aac, + 0x70a27d8a, 0xd5e3eff4, 0x3fcd2f87, 0x9a8cbdf9, + 0xee7cd990, 0x4b3d4bee, 0xa1138b9d, 0x045219e3, + 0x48f3434f, 0xedb2d131, 0x079c1142, 0xa2dd833c, + 0xd62de755, 0x736c752b, 0x9942b558, 0x3c032726, + 0xe144fb14, 0x4405696a, 0xae2ba919, 0x0b6a3b67, + 0x7f9a5f0e, 0xdadbcd70, 0x30f50d03, 0x95b49f7d, + 0xd915c5d1, 0x7c5457af, 0x967a97dc, 0x333b05a2, + 0x47cb61cb, 0xe28af3b5, 0x08a433c6, 0xade5a1b8, + 0x91e6869e, 0x34a714e0, 0xde89d493, 0x7bc846ed, + 0x0f382284, 0xaa79b0fa, 0x40577089, 0xe516e2f7, + 0xa9b7b85b, 0x0cf62a25, 0xe6d8ea56, 0x43997828, + 0x37691c41, 0x92288e3f, 0x78064e4c, 0xdd47dc32, + 0xc76580d9, 0x622412a7, 0x880ad2d4, 0x2d4b40aa, + 0x59bb24c3, 0xfcfab6bd, 0x16d476ce, 0xb395e4b0, + 0xff34be1c, 0x5a752c62, 0xb05bec11, 0x151a7e6f, + 0x61ea1a06, 0xc4ab8878, 0x2e85480b, 0x8bc4da75, + 0xb7c7fd53, 0x12866f2d, 0xf8a8af5e, 0x5de93d20, + 0x29195949, 0x8c58cb37, 0x66760b44, 0xc337993a, + 0x8f96c396, 0x2ad751e8, 0xc0f9919b, 0x65b803e5, + 0x1148678c, 0xb409f5f2, 0x5e273581, 0xfb66a7ff, + 0x26217bcd, 0x8360e9b3, 0x694e29c0, 0xcc0fbbbe, + 0xb8ffdfd7, 0x1dbe4da9, 0xf7908dda, 0x52d11fa4, + 0x1e704508, 0xbb31d776, 0x511f1705, 0xf45e857b, + 0x80aee112, 0x25ef736c, 0xcfc1b31f, 0x6a802161, + 0x56830647, 0xf3c29439, 0x19ec544a, 0xbcadc634, + 0xc85da25d, 0x6d1c3023, 0x8732f050, 0x2273622e, + 0x6ed23882, 0xcb93aafc, 0x21bd6a8f, 0x84fcf8f1, + 0xf00c9c98, 0x554d0ee6, 0xbf63ce95, 0x1a225ceb, + 0x8b277743, 0x2e66e53d, 0xc448254e, 0x6109b730, + 0x15f9d359, 0xb0b84127, 0x5a968154, 0xffd7132a, + 0xb3764986, 0x1637dbf8, 0xfc191b8b, 0x595889f5, + 0x2da8ed9c, 0x88e97fe2, 0x62c7bf91, 0xc7862def, + 0xfb850ac9, 0x5ec498b7, 0xb4ea58c4, 0x11abcaba, + 0x655baed3, 0xc01a3cad, 0x2a34fcde, 0x8f756ea0, + 0xc3d4340c, 0x6695a672, 0x8cbb6601, 0x29faf47f, + 0x5d0a9016, 0xf84b0268, 0x1265c21b, 0xb7245065, + 0x6a638c57, 0xcf221e29, 0x250cde5a, 0x804d4c24, + 0xf4bd284d, 0x51fcba33, 0xbbd27a40, 0x1e93e83e, + 0x5232b292, 0xf77320ec, 0x1d5de09f, 0xb81c72e1, + 0xccec1688, 0x69ad84f6, 0x83834485, 0x26c2d6fb, + 0x1ac1f1dd, 0xbf8063a3, 0x55aea3d0, 0xf0ef31ae, + 0x841f55c7, 0x215ec7b9, 0xcb7007ca, 0x6e3195b4, + 0x2290cf18, 0x87d15d66, 0x6dff9d15, 0xc8be0f6b, + 0xbc4e6b02, 0x190ff97c, 0xf321390f, 0x5660ab71, + 0x4c42f79a, 0xe90365e4, 0x032da597, 0xa66c37e9, + 0xd29c5380, 0x77ddc1fe, 0x9df3018d, 0x38b293f3, + 0x7413c95f, 0xd1525b21, 0x3b7c9b52, 0x9e3d092c, + 0xeacd6d45, 0x4f8cff3b, 0xa5a23f48, 0x00e3ad36, + 0x3ce08a10, 0x99a1186e, 0x738fd81d, 0xd6ce4a63, + 0xa23e2e0a, 0x077fbc74, 0xed517c07, 0x4810ee79, + 0x04b1b4d5, 0xa1f026ab, 0x4bdee6d8, 0xee9f74a6, + 0x9a6f10cf, 0x3f2e82b1, 0xd50042c2, 0x7041d0bc, + 0xad060c8e, 0x08479ef0, 0xe2695e83, 0x4728ccfd, + 0x33d8a894, 0x96993aea, 0x7cb7fa99, 0xd9f668e7, + 0x9557324b, 
0x3016a035, 0xda386046, 0x7f79f238, + 0x0b899651, 0xaec8042f, 0x44e6c45c, 0xe1a75622, + 0xdda47104, 0x78e5e37a, 0x92cb2309, 0x378ab177, + 0x437ad51e, 0xe63b4760, 0x0c158713, 0xa954156d, + 0xe5f54fc1, 0x40b4ddbf, 0xaa9a1dcc, 0x0fdb8fb2, + 0x7b2bebdb, 0xde6a79a5, 0x3444b9d6, 0x91052ba8 +}; +static const uint32_t table3_[256] = { + 0x00000000, 0xdd45aab8, 0xbf672381, 0x62228939, + 0x7b2231f3, 0xa6679b4b, 0xc4451272, 0x1900b8ca, + 0xf64463e6, 0x2b01c95e, 0x49234067, 0x9466eadf, + 0x8d665215, 0x5023f8ad, 0x32017194, 0xef44db2c, + 0xe964b13d, 0x34211b85, 0x560392bc, 0x8b463804, + 0x924680ce, 0x4f032a76, 0x2d21a34f, 0xf06409f7, + 0x1f20d2db, 0xc2657863, 0xa047f15a, 0x7d025be2, + 0x6402e328, 0xb9474990, 0xdb65c0a9, 0x06206a11, + 0xd725148b, 0x0a60be33, 0x6842370a, 0xb5079db2, + 0xac072578, 0x71428fc0, 0x136006f9, 0xce25ac41, + 0x2161776d, 0xfc24ddd5, 0x9e0654ec, 0x4343fe54, + 0x5a43469e, 0x8706ec26, 0xe524651f, 0x3861cfa7, + 0x3e41a5b6, 0xe3040f0e, 0x81268637, 0x5c632c8f, + 0x45639445, 0x98263efd, 0xfa04b7c4, 0x27411d7c, + 0xc805c650, 0x15406ce8, 0x7762e5d1, 0xaa274f69, + 0xb327f7a3, 0x6e625d1b, 0x0c40d422, 0xd1057e9a, + 0xaba65fe7, 0x76e3f55f, 0x14c17c66, 0xc984d6de, + 0xd0846e14, 0x0dc1c4ac, 0x6fe34d95, 0xb2a6e72d, + 0x5de23c01, 0x80a796b9, 0xe2851f80, 0x3fc0b538, + 0x26c00df2, 0xfb85a74a, 0x99a72e73, 0x44e284cb, + 0x42c2eeda, 0x9f874462, 0xfda5cd5b, 0x20e067e3, + 0x39e0df29, 0xe4a57591, 0x8687fca8, 0x5bc25610, + 0xb4868d3c, 0x69c32784, 0x0be1aebd, 0xd6a40405, + 0xcfa4bccf, 0x12e11677, 0x70c39f4e, 0xad8635f6, + 0x7c834b6c, 0xa1c6e1d4, 0xc3e468ed, 0x1ea1c255, + 0x07a17a9f, 0xdae4d027, 0xb8c6591e, 0x6583f3a6, + 0x8ac7288a, 0x57828232, 0x35a00b0b, 0xe8e5a1b3, + 0xf1e51979, 0x2ca0b3c1, 0x4e823af8, 0x93c79040, + 0x95e7fa51, 0x48a250e9, 0x2a80d9d0, 0xf7c57368, + 0xeec5cba2, 0x3380611a, 0x51a2e823, 0x8ce7429b, + 0x63a399b7, 0xbee6330f, 0xdcc4ba36, 0x0181108e, + 0x1881a844, 0xc5c402fc, 0xa7e68bc5, 0x7aa3217d, + 0x52a0c93f, 0x8fe56387, 0xedc7eabe, 0x30824006, + 0x2982f8cc, 0xf4c75274, 0x96e5db4d, 0x4ba071f5, + 0xa4e4aad9, 0x79a10061, 0x1b838958, 0xc6c623e0, + 0xdfc69b2a, 0x02833192, 0x60a1b8ab, 0xbde41213, + 0xbbc47802, 0x6681d2ba, 0x04a35b83, 0xd9e6f13b, + 0xc0e649f1, 0x1da3e349, 0x7f816a70, 0xa2c4c0c8, + 0x4d801be4, 0x90c5b15c, 0xf2e73865, 0x2fa292dd, + 0x36a22a17, 0xebe780af, 0x89c50996, 0x5480a32e, + 0x8585ddb4, 0x58c0770c, 0x3ae2fe35, 0xe7a7548d, + 0xfea7ec47, 0x23e246ff, 0x41c0cfc6, 0x9c85657e, + 0x73c1be52, 0xae8414ea, 0xcca69dd3, 0x11e3376b, + 0x08e38fa1, 0xd5a62519, 0xb784ac20, 0x6ac10698, + 0x6ce16c89, 0xb1a4c631, 0xd3864f08, 0x0ec3e5b0, + 0x17c35d7a, 0xca86f7c2, 0xa8a47efb, 0x75e1d443, + 0x9aa50f6f, 0x47e0a5d7, 0x25c22cee, 0xf8878656, + 0xe1873e9c, 0x3cc29424, 0x5ee01d1d, 0x83a5b7a5, + 0xf90696d8, 0x24433c60, 0x4661b559, 0x9b241fe1, + 0x8224a72b, 0x5f610d93, 0x3d4384aa, 0xe0062e12, + 0x0f42f53e, 0xd2075f86, 0xb025d6bf, 0x6d607c07, + 0x7460c4cd, 0xa9256e75, 0xcb07e74c, 0x16424df4, + 0x106227e5, 0xcd278d5d, 0xaf050464, 0x7240aedc, + 0x6b401616, 0xb605bcae, 0xd4273597, 0x09629f2f, + 0xe6264403, 0x3b63eebb, 0x59416782, 0x8404cd3a, + 0x9d0475f0, 0x4041df48, 0x22635671, 0xff26fcc9, + 0x2e238253, 0xf36628eb, 0x9144a1d2, 0x4c010b6a, + 0x5501b3a0, 0x88441918, 0xea669021, 0x37233a99, + 0xd867e1b5, 0x05224b0d, 0x6700c234, 0xba45688c, + 0xa345d046, 0x7e007afe, 0x1c22f3c7, 0xc167597f, + 0xc747336e, 0x1a0299d6, 0x782010ef, 0xa565ba57, + 0xbc65029d, 0x6120a825, 0x0302211c, 0xde478ba4, + 0x31035088, 0xec46fa30, 0x8e647309, 0x5321d9b1, + 0x4a21617b, 0x9764cbc3, 0xf54642fa, 0x2803e842 +}; + +// Used to fetch a 
naturally-aligned 32-bit word in little endian byte-order
+static inline uint32_t LE_LOAD32(const uint8_t *p) {
+  return DecodeFixed32(reinterpret_cast<const char*>(p));
+}
+
+#ifdef __SSE4_2__
+static inline uint64_t LE_LOAD64(const uint8_t *p) {
+  return DecodeFixed64(reinterpret_cast<const char*>(p));
+}
+#endif
+
+static inline void Slow_CRC32(uint64_t* l, uint8_t const **p) {
+  uint32_t c = *l ^ LE_LOAD32(*p);
+  *p += 4;
+  *l = table3_[c & 0xff] ^
+       table2_[(c >> 8) & 0xff] ^
+       table1_[(c >> 16) & 0xff] ^
+       table0_[c >> 24];
+  // Do it twice.
+  c = *l ^ LE_LOAD32(*p);
+  *p += 4;
+  *l = table3_[c & 0xff] ^
+       table2_[(c >> 8) & 0xff] ^
+       table1_[(c >> 16) & 0xff] ^
+       table0_[c >> 24];
+}
+
+static inline void Fast_CRC32(uint64_t* l, uint8_t const **p) {
+#ifdef __SSE4_2__
+  *l = _mm_crc32_u64(*l, LE_LOAD64(*p));
+  *p += 8;
+#else
+  Slow_CRC32(l, p);
+#endif
+}
+
+template <void (*CRC32)(uint64_t*, uint8_t const**)>
+uint32_t ExtendImpl(uint32_t crc, const char* buf, size_t size) {
+  const uint8_t *p = reinterpret_cast<const uint8_t *>(buf);
+  const uint8_t *e = p + size;
+  uint64_t l = crc ^ 0xffffffffu;
+
+// Align n to (1 << m) byte boundary
+#define ALIGN(n, m) ((n + ((1 << m) - 1)) & ~((1 << m) - 1))
+
+#define STEP1 do {                              \
+    int c = (l & 0xff) ^ *p++;                  \
+    l = table0_[c] ^ (l >> 8);                  \
+} while (0)
+
+  // Point x at first 16-byte aligned byte in string. This might be
+  // just past the end of the string.
+  const uintptr_t pval = reinterpret_cast<uintptr_t>(p);
+  const uint8_t* x = reinterpret_cast<const uint8_t*>(ALIGN(pval, 4));
+  if (x <= e) {
+    // Process bytes until finished or p is 16-byte aligned
+    while (p != x) {
+      STEP1;
+    }
+  }
+  // Process bytes 16 at a time
+  while ((e-p) >= 16) {
+    CRC32(&l, &p);
+    CRC32(&l, &p);
+  }
+  // Process bytes 8 at a time
+  while ((e-p) >= 8) {
+    CRC32(&l, &p);
+  }
+  // Process the last few bytes
+  while (p != e) {
+    STEP1;
+  }
+#undef STEP1
+#undef ALIGN
+  return l ^ 0xffffffffu;
+}
+
+// Detect whether SSE4.2 is available.
+static bool isSSE42() {
+#if defined(__GNUC__) && defined(__x86_64__) && !defined(IOS_CROSS_COMPILE)
+  uint32_t c_;
+  uint32_t d_;
+  __asm__("cpuid" : "=c"(c_), "=d"(d_) : "a"(1) : "ebx");
+  return c_ & (1U << 20);  // copied from CpuId.h in Folly.
+#else
+  return false;
+#endif
+}
+
+typedef uint32_t (*Function)(uint32_t, const char*, size_t);
+
+static inline Function Choose_Extend() {
+  return isSSE42() ? ExtendImpl<Fast_CRC32> : ExtendImpl<Slow_CRC32>;
+}
+
+Function ChosenExtend = Choose_Extend();
+
+uint32_t Extend(uint32_t crc, const char* buf, size_t size) {
+  return ChosenExtend(crc, buf, size);
+}
+
+}  // namespace crc32c
+}  // namespace rocksdb
diff --git a/util/crc32c.h b/util/crc32c.h
new file mode 100644
index 0000000000..e5e6e143e2
--- /dev/null
+++ b/util/crc32c.h
@@ -0,0 +1,46 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <stddef.h>
+#include <stdint.h>
+
+namespace rocksdb {
+namespace crc32c {
+
+// Return the crc32c of concat(A, data[0,n-1]) where init_crc is the
+// crc32c of some string A. Extend() is often used to maintain the
+// crc32c of a stream of data.
+extern uint32_t Extend(uint32_t init_crc, const char* data, size_t n);
+
+// Return the crc32c of data[0,n-1]
+inline uint32_t Value(const char* data, size_t n) {
+  return Extend(0, data, n);
+}
+
+static const uint32_t kMaskDelta = 0xa282ead8ul;
+
+// Return a masked representation of crc.
+//
+// Motivation: it is problematic to compute the CRC of a string that
+// contains embedded CRCs. Therefore we recommend that CRCs stored
+// somewhere (e.g., in files) should be masked before being stored.
+inline uint32_t Mask(uint32_t crc) {
+  // Rotate right by 15 bits and add a constant.
+  return ((crc >> 15) | (crc << 17)) + kMaskDelta;
+}
+
+// Return the crc whose masked representation is masked_crc.
+inline uint32_t Unmask(uint32_t masked_crc) {
+  uint32_t rot = masked_crc - kMaskDelta;
+  return ((rot >> 17) | (rot << 15));
+}
+
+}  // namespace crc32c
+}  // namespace rocksdb
diff --git a/util/crc32c_test.cc b/util/crc32c_test.cc
new file mode 100644
index 0000000000..300c9d3c72
--- /dev/null
+++ b/util/crc32c_test.cc
@@ -0,0 +1,77 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "util/crc32c.h"
+#include "util/testharness.h"
+
+namespace rocksdb {
+namespace crc32c {
+
+class CRC { };
+
+TEST(CRC, StandardResults) {
+  // From rfc3720 section B.4.
+  char buf[32];
+
+  memset(buf, 0, sizeof(buf));
+  ASSERT_EQ(0x8a9136aaU, Value(buf, sizeof(buf)));
+
+  memset(buf, 0xff, sizeof(buf));
+  ASSERT_EQ(0x62a8ab43U, Value(buf, sizeof(buf)));
+
+  for (int i = 0; i < 32; i++) {
+    buf[i] = i;
+  }
+  ASSERT_EQ(0x46dd794eU, Value(buf, sizeof(buf)));
+
+  for (int i = 0; i < 32; i++) {
+    buf[i] = 31 - i;
+  }
+  ASSERT_EQ(0x113fdb5cU, Value(buf, sizeof(buf)));
+
+  unsigned char data[48] = {
+    0x01, 0xc0, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00,
+    0x14, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x04, 0x00,
+    0x00, 0x00, 0x00, 0x14,
+    0x00, 0x00, 0x00, 0x18,
+    0x28, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00,
+    0x02, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00,
+  };
+  ASSERT_EQ(0xd9963a56, Value(reinterpret_cast<char*>(data), sizeof(data)));
+}
+
+TEST(CRC, Values) {
+  ASSERT_NE(Value("a", 1), Value("foo", 3));
+}
+
+TEST(CRC, Extend) {
+  ASSERT_EQ(Value("hello world", 11),
+            Extend(Value("hello ", 6), "world", 5));
+}
+
+TEST(CRC, Mask) {
+  uint32_t crc = Value("foo", 3);
+  ASSERT_NE(crc, Mask(crc));
+  ASSERT_NE(crc, Mask(Mask(crc)));
+  ASSERT_EQ(crc, Unmask(Mask(crc)));
+  ASSERT_EQ(crc, Unmask(Unmask(Mask(Mask(crc)))));
+}
+
+}  // namespace crc32c
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  return rocksdb::test::RunAllTests();
+}
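Mask() and Unmask() above exist because a CRC computed over data that itself embeds CRCs (as log records do) degrades; the rotate-and-add keeps the stored form distinct from the raw CRC while remaining invertible, which the Mask test above verifies. A minimal sketch of the intended round trip, assuming util/crc32c.h from this tree:

    #include <cassert>
    #include <stdint.h>

    #include "util/crc32c.h"

    int main() {
      const char data[] = "some block payload";
      const size_t n = sizeof(data) - 1;
      uint32_t crc = rocksdb::crc32c::Value(data, n);

      // The masked form is what gets persisted next to the data...
      uint32_t stored = rocksdb::crc32c::Mask(crc);
      // ...and readers invert the mask before comparing.
      assert(rocksdb::crc32c::Unmask(stored) == crc);

      // Extend() builds the same checksum incrementally.
      uint32_t partial = rocksdb::crc32c::Value(data, 4);
      assert(rocksdb::crc32c::Extend(partial, data + 4, n - 4) == crc);
      return 0;
    }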
+ +#include "dynamic_bloom.h" + +#include + +#include "port/port.h" +#include "rocksdb/slice.h" +#include "util/hash.h" + +namespace rocksdb { + +namespace { +static uint32_t BloomHash(const Slice& key) { + return Hash(key.data(), key.size(), 0xbc9f1d34); +} + +uint32_t GetNumBlocks(uint32_t total_bits) { + uint32_t num_blocks = (total_bits + CACHE_LINE_SIZE * 8 - 1) / + (CACHE_LINE_SIZE * 8) * (CACHE_LINE_SIZE * 8); + // Make num_blocks an odd number to make sure more bits are involved + // when determining which block. + if (num_blocks % 2 == 0) { + num_blocks++; + } + return num_blocks; +} +} + +DynamicBloom::DynamicBloom(uint32_t total_bits, uint32_t locality, + uint32_t num_probes, + uint32_t (*hash_func)(const Slice& key), + size_t huge_page_tlb_size, Logger* logger) + : kTotalBits(((locality > 0) ? GetNumBlocks(total_bits) : total_bits + 7) / + 8 * 8), + kNumBlocks((locality > 0) ? kTotalBits / (CACHE_LINE_SIZE * 8) : 0), + kNumProbes(num_probes), + hash_func_(hash_func == nullptr ? &BloomHash : hash_func) { + assert(kNumBlocks > 0 || kTotalBits > 0); + assert(kNumProbes > 0); + + uint32_t sz = kTotalBits / 8; + if (kNumBlocks > 0) { + sz += CACHE_LINE_SIZE - 1; + } + raw_ = reinterpret_cast( + arena_.AllocateAligned(sz, huge_page_tlb_size, logger)); + memset(raw_, 0, sz); + if (kNumBlocks > 0 && (reinterpret_cast(raw_) % CACHE_LINE_SIZE)) { + data_ = raw_ + CACHE_LINE_SIZE - + reinterpret_cast(raw_) % CACHE_LINE_SIZE; + } else { + data_ = raw_; + } +} + +} // rocksdb diff --git a/util/dynamic_bloom.h b/util/dynamic_bloom.h new file mode 100644 index 0000000000..e591345910 --- /dev/null +++ b/util/dynamic_bloom.h @@ -0,0 +1,119 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#pragma once + +#include +#include + +#include "port/port.h" +#include + +namespace rocksdb { + +class Slice; +class Logger; + +class DynamicBloom { + public: + // total_bits: fixed total bits for the bloom + // num_probes: number of hash probes for a single key + // locality: If positive, optimize for cache line locality, 0 otherwise. + // hash_func: customized hash function + // huge_page_tlb_size: if >0, try to allocate bloom bytes from huge page TLB + // withi this page size. Need to reserve huge pages for + // it to be allocated, like: + // sysctl -w vm.nr_hugepages=20 + // See linux doc Documentation/vm/hugetlbpage.txt + explicit DynamicBloom(uint32_t total_bits, uint32_t locality = 0, + uint32_t num_probes = 6, + uint32_t (*hash_func)(const Slice& key) = nullptr, + size_t huge_page_tlb_size = 0, + Logger* logger = nullptr); + + ~DynamicBloom() {} + + // Assuming single threaded access to this function. + void Add(const Slice& key); + + // Assuming single threaded access to this function. 
+ void AddHash(uint32_t hash); + + // Multithreaded access to this function is OK + bool MayContain(const Slice& key); + + // Multithreaded access to this function is OK + bool MayContainHash(uint32_t hash); + + private: + const uint32_t kTotalBits; + const uint32_t kNumBlocks; + const uint32_t kNumProbes; + + uint32_t (*hash_func_)(const Slice& key); + unsigned char* data_; + unsigned char* raw_; + + Arena arena_; +}; + +inline void DynamicBloom::Add(const Slice& key) { AddHash(hash_func_(key)); } + +inline bool DynamicBloom::MayContain(const Slice& key) { + return (MayContainHash(hash_func_(key))); +} + +inline bool DynamicBloom::MayContainHash(uint32_t h) { + const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits + if (kNumBlocks != 0) { + uint32_t b = ((h >> 11 | (h << 21)) % kNumBlocks) * (CACHE_LINE_SIZE * 8); + for (uint32_t i = 0; i < kNumProbes; ++i) { + // Since CACHE_LINE_SIZE is defined as 2^n, this line will be optimized + // to a simple and operation by compiler. + const uint32_t bitpos = b + (h % (CACHE_LINE_SIZE * 8)); + if (((data_[bitpos / 8]) & (1 << (bitpos % 8))) == 0) { + return false; + } + // Rotate h so that we don't reuse the same bytes. + h = h / (CACHE_LINE_SIZE * 8) + + (h % (CACHE_LINE_SIZE * 8)) * (0x20000000U / CACHE_LINE_SIZE); + h += delta; + } + } else { + for (uint32_t i = 0; i < kNumProbes; ++i) { + const uint32_t bitpos = h % kTotalBits; + if (((data_[bitpos / 8]) & (1 << (bitpos % 8))) == 0) { + return false; + } + h += delta; + } + } + return true; +} + +inline void DynamicBloom::AddHash(uint32_t h) { + const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits + if (kNumBlocks != 0) { + uint32_t b = ((h >> 11 | (h << 21)) % kNumBlocks) * (CACHE_LINE_SIZE * 8); + for (uint32_t i = 0; i < kNumProbes; ++i) { + // Since CACHE_LINE_SIZE is defined as 2^n, this line will be optimized + // to a simple and operation by compiler. + const uint32_t bitpos = b + (h % (CACHE_LINE_SIZE * 8)); + data_[bitpos / 8] |= (1 << (bitpos % 8)); + // Rotate h so that we don't reuse the same bytes. + h = h / (CACHE_LINE_SIZE * 8) + + (h % (CACHE_LINE_SIZE * 8)) * (0x20000000U / CACHE_LINE_SIZE); + h += delta; + } + } else { + for (uint32_t i = 0; i < kNumProbes; ++i) { + const uint32_t bitpos = h % kTotalBits; + data_[bitpos / 8] |= (1 << (bitpos % 8)); + h += delta; + } + } +} + +} // rocksdb diff --git a/util/dynamic_bloom_test.cc b/util/dynamic_bloom_test.cc new file mode 100644 index 0000000000..d345addbac --- /dev/null +++ b/util/dynamic_bloom_test.cc @@ -0,0 +1,215 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
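DynamicBloom above is a set-membership filter: Add()/AddHash() only ever set bits, so MayContain() can return false positives but never false negatives, which is exactly what the tests below check. A small usage sketch under the constructor defaults shown above (locality off, 6 probes), assuming util/dynamic_bloom.h from this tree:

    #include <cassert>

    #include "rocksdb/slice.h"
    #include "util/dynamic_bloom.h"

    int main() {
      // Roughly 10 bits per key for ~1000 keys keeps false positives rare.
      rocksdb::DynamicBloom bloom(/*total_bits=*/10000);

      bloom.Add(rocksdb::Slice("key1"));
      bloom.Add(rocksdb::Slice("key2"));

      // Keys that were added always match.
      assert(bloom.MayContain(rocksdb::Slice("key1")));
      assert(bloom.MayContain(rocksdb::Slice("key2")));

      // An absent key usually misses; a hit here would be one of the
      // false positives the filter is allowed to return.
      bool maybe = bloom.MayContain(rocksdb::Slice("absent"));
      (void)maybe;
      return 0;
    }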
+
+#ifndef GFLAGS
+#include <cstdio>
+int main() {
+  fprintf(stderr, "Please install gflags to run rocksdb tools\n");
+  return 1;
+}
+#else
+
+#define __STDC_FORMAT_MACROS
+#include <inttypes.h>
+#include <algorithm>
+#include <gflags/gflags.h>
+
+#include "dynamic_bloom.h"
+#include "port/port.h"
+#include "util/logging.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+#include "util/stop_watch.h"
+
+using GFLAGS::ParseCommandLineFlags;
+
+DEFINE_int32(bits_per_key, 10, "");
+DEFINE_int32(num_probes, 6, "");
+DEFINE_bool(enable_perf, false, "");
+
+namespace rocksdb {
+
+static Slice Key(uint64_t i, char* buffer) {
+  memcpy(buffer, &i, sizeof(i));
+  return Slice(buffer, sizeof(i));
+}
+
+class DynamicBloomTest {
+};
+
+TEST(DynamicBloomTest, EmptyFilter) {
+  DynamicBloom bloom1(100, 0, 2);
+  ASSERT_TRUE(!bloom1.MayContain("hello"));
+  ASSERT_TRUE(!bloom1.MayContain("world"));
+
+  DynamicBloom bloom2(CACHE_LINE_SIZE * 8 * 2 - 1, 1, 2);
+  ASSERT_TRUE(!bloom2.MayContain("hello"));
+  ASSERT_TRUE(!bloom2.MayContain("world"));
+}
+
+TEST(DynamicBloomTest, Small) {
+  DynamicBloom bloom1(100, 0, 2);
+  bloom1.Add("hello");
+  bloom1.Add("world");
+  ASSERT_TRUE(bloom1.MayContain("hello"));
+  ASSERT_TRUE(bloom1.MayContain("world"));
+  ASSERT_TRUE(!bloom1.MayContain("x"));
+  ASSERT_TRUE(!bloom1.MayContain("foo"));
+
+  DynamicBloom bloom2(CACHE_LINE_SIZE * 8 * 2 - 1, 1, 2);
+  bloom2.Add("hello");
+  bloom2.Add("world");
+  ASSERT_TRUE(bloom2.MayContain("hello"));
+  ASSERT_TRUE(bloom2.MayContain("world"));
+  ASSERT_TRUE(!bloom2.MayContain("x"));
+  ASSERT_TRUE(!bloom2.MayContain("foo"));
+}
+
+static uint32_t NextNum(uint32_t num) {
+  if (num < 10) {
+    num += 1;
+  } else if (num < 100) {
+    num += 10;
+  } else if (num < 1000) {
+    num += 100;
+  } else {
+    num += 1000;
+  }
+  return num;
+}
+
+TEST(DynamicBloomTest, VaryingLengths) {
+  char buffer[sizeof(uint64_t)];
+
+  // Count number of filters that significantly exceed the false positive rate
+  int mediocre_filters = 0;
+  int good_filters = 0;
+  uint32_t num_probes = static_cast<uint32_t>(FLAGS_num_probes);
+
+  fprintf(stderr, "bits_per_key: %d num_probes: %d\n",
+          FLAGS_bits_per_key, num_probes);
+
+  for (uint32_t enable_locality = 0; enable_locality < 2; ++enable_locality) {
+    for (uint32_t num = 1; num <= 10000; num = NextNum(num)) {
+      uint32_t bloom_bits = 0;
+      if (enable_locality == 0) {
+        bloom_bits = std::max(num * FLAGS_bits_per_key, 64U);
+      } else {
+        bloom_bits = std::max(num * FLAGS_bits_per_key,
+                              enable_locality * CACHE_LINE_SIZE * 8);
+      }
+      DynamicBloom bloom(bloom_bits, enable_locality, num_probes);
+      for (uint64_t i = 0; i < num; i++) {
+        bloom.Add(Key(i, buffer));
+        ASSERT_TRUE(bloom.MayContain(Key(i, buffer)));
+      }
+
+      // All added keys must match
+      for (uint64_t i = 0; i < num; i++) {
+        ASSERT_TRUE(bloom.MayContain(Key(i, buffer)))
+          << "Num " << num << "; key " << i;
+      }
+
+      // Check false positive rate
+
+      int result = 0;
+      for (uint64_t i = 0; i < 10000; i++) {
+        if (bloom.MayContain(Key(i + 1000000000, buffer))) {
+          result++;
+        }
+      }
+      double rate = result / 10000.0;
+
+      fprintf(stderr,
+              "False positives: %5.2f%% @ num = %6u, bloom_bits = %6u, "
+              "enable locality?%u\n",
+              rate * 100.0, num, bloom_bits, enable_locality);
+
+      if (rate > 0.0125)
+        mediocre_filters++;  // Allowed, but not too often
+      else
+        good_filters++;
+    }
+
+    fprintf(stderr, "Filters: %d good, %d mediocre\n",
+            good_filters, mediocre_filters);
+    ASSERT_LE(mediocre_filters, good_filters/5);
+  }
+}
+
+TEST(DynamicBloomTest, perf) {
+  StopWatchNano timer(Env::Default());
+  uint32_t num_probes = static_cast<uint32_t>(FLAGS_num_probes);
+
+  if (!FLAGS_enable_perf) {
+    return;
+  }
+
+  for (uint64_t m = 1; m <= 8; ++m) {
+    const uint64_t num_keys = m * 8 * 1024 * 1024;
+    fprintf(stderr, "testing %" PRIu64 "M keys\n", m * 8);
+
+    DynamicBloom std_bloom(num_keys * 10, 0, num_probes);
+
+    timer.Start();
+    for (uint64_t i = 1; i <= num_keys; ++i) {
+      std_bloom.Add(Slice(reinterpret_cast<const char*>(&i), 8));
+    }
+
+    uint64_t elapsed = timer.ElapsedNanos();
+    fprintf(stderr, "standard bloom, avg add latency %" PRIu64 "\n",
+            elapsed / num_keys);
+
+    uint64_t count = 0;
+    timer.Start();
+    for (uint64_t i = 1; i <= num_keys; ++i) {
+      if (std_bloom.MayContain(Slice(reinterpret_cast<const char*>(&i), 8))) {
+        ++count;
+      }
+    }
+    elapsed = timer.ElapsedNanos();
+    fprintf(stderr, "standard bloom, avg query latency %" PRIu64 "\n",
+            elapsed / count);
+    ASSERT_TRUE(count == num_keys);
+
+    // Locality enabled version
+    DynamicBloom blocked_bloom(num_keys * 10, 1, num_probes);
+
+    timer.Start();
+    for (uint64_t i = 1; i <= num_keys; ++i) {
+      blocked_bloom.Add(Slice(reinterpret_cast<const char*>(&i), 8));
+    }
+
+    elapsed = timer.ElapsedNanos();
+    fprintf(stderr,
+            "blocked bloom(enable locality), avg add latency %" PRIu64 "\n",
+            elapsed / num_keys);
+
+    count = 0;
+    timer.Start();
+    for (uint64_t i = 1; i <= num_keys; ++i) {
+      if (blocked_bloom.MayContain(
+            Slice(reinterpret_cast<const char*>(&i), 8))) {
+        ++count;
+      }
+    }
+
+    elapsed = timer.ElapsedNanos();
+    fprintf(stderr,
+            "blocked bloom(enable locality), avg query latency %" PRIu64 "\n",
+            elapsed / count);
+    ASSERT_TRUE(count == num_keys);
+  }
+}
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  ParseCommandLineFlags(&argc, &argv, true);
+
+  return rocksdb::test::RunAllTests();
+}
+
+#endif  // GFLAGS
diff --git a/util/env.cc b/util/env.cc
new file mode 100644
index 0000000000..1c0cae4c34
--- /dev/null
+++ b/util/env.cc
@@ -0,0 +1,251 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "rocksdb/env.h"
+
+#include <sys/time.h>
+#include "rocksdb/options.h"
+#include "util/arena.h"
+#include "util/autovector.h"
+
+namespace rocksdb {
+
+Env::~Env() {
+}
+
+SequentialFile::~SequentialFile() {
+}
+
+RandomAccessFile::~RandomAccessFile() {
+}
+
+WritableFile::~WritableFile() {
+}
+
+Logger::~Logger() {
+}
+
+FileLock::~FileLock() {
+}
+
+void LogFlush(Logger *info_log) {
+  if (info_log) {
+    info_log->Flush();
+  }
+}
+
+void Log(Logger* info_log, const char* format, ...) {
+  if (info_log) {
+    va_list ap;
+    va_start(ap, format);
+    info_log->Logv(InfoLogLevel::INFO_LEVEL, format, ap);
+    va_end(ap);
+  }
+}
+
+void Log(const InfoLogLevel log_level, Logger* info_log, const char* format,
+         ...) {
+  if (info_log) {
+    va_list ap;
+    va_start(ap, format);
+    info_log->Logv(log_level, format, ap);
+    va_end(ap);
+  }
+}
+
+void Debug(Logger* info_log, const char* format, ...) {
+  if (info_log) {
+    va_list ap;
+    va_start(ap, format);
+    info_log->Logv(InfoLogLevel::DEBUG_LEVEL, format, ap);
+    va_end(ap);
+  }
+}
+
+void Info(Logger* info_log, const char* format, ...) {
+  if (info_log) {
+    va_list ap;
+    va_start(ap, format);
+    info_log->Logv(InfoLogLevel::INFO_LEVEL, format, ap);
+    va_end(ap);
+  }
+}
+
+void Warn(Logger* info_log, const char* format, ...) {
+  if (info_log) {
+    va_list ap;
+    va_start(ap, format);
+    info_log->Logv(InfoLogLevel::WARN_LEVEL, format, ap);
+    va_end(ap);
+  }
+}
+void Error(Logger* info_log, const char* format, ...) {
+  if (info_log) {
+    va_list ap;
+    va_start(ap, format);
+    info_log->Logv(InfoLogLevel::ERROR_LEVEL, format, ap);
+    va_end(ap);
+  }
+}
+void Fatal(Logger* info_log, const char* format, ...) {
+  if (info_log) {
+    va_list ap;
+    va_start(ap, format);
+    info_log->Logv(InfoLogLevel::FATAL_LEVEL, format, ap);
+    va_end(ap);
+  }
+}
+
+void LogFlush(const shared_ptr<Logger>& info_log) {
+  if (info_log) {
+    info_log->Flush();
+  }
+}
+
+void Log(const InfoLogLevel log_level, const shared_ptr<Logger>& info_log,
+         const char* format, ...) {
+  if (info_log) {
+    va_list ap;
+    va_start(ap, format);
+    info_log->Logv(log_level, format, ap);
+    va_end(ap);
+  }
+}
+
+void Debug(const shared_ptr<Logger>& info_log, const char* format, ...) {
+  if (info_log) {
+    va_list ap;
+    va_start(ap, format);
+    info_log->Logv(InfoLogLevel::DEBUG_LEVEL, format, ap);
+    va_end(ap);
+  }
+}
+
+void Info(const shared_ptr<Logger>& info_log, const char* format, ...) {
+  if (info_log) {
+    va_list ap;
+    va_start(ap, format);
+    info_log->Logv(InfoLogLevel::INFO_LEVEL, format, ap);
+    va_end(ap);
+  }
+}
+
+void Warn(const shared_ptr<Logger>& info_log, const char* format, ...) {
+  if (info_log) {
+    va_list ap;
+    va_start(ap, format);
+    info_log->Logv(InfoLogLevel::WARN_LEVEL, format, ap);
+    va_end(ap);
+  }
+}
+
+void Error(const shared_ptr<Logger>& info_log, const char* format, ...) {
+  if (info_log) {
+    va_list ap;
+    va_start(ap, format);
+    info_log->Logv(InfoLogLevel::ERROR_LEVEL, format, ap);
+    va_end(ap);
+  }
+}
+
+void Fatal(const shared_ptr<Logger>& info_log, const char* format, ...) {
+  if (info_log) {
+    va_list ap;
+    va_start(ap, format);
+    info_log->Logv(InfoLogLevel::FATAL_LEVEL, format, ap);
+    va_end(ap);
+  }
+}
+
+void Log(const shared_ptr<Logger>& info_log, const char* format, ...) {
+  if (info_log) {
+    va_list ap;
+    va_start(ap, format);
+    info_log->Logv(InfoLogLevel::INFO_LEVEL, format, ap);
+    va_end(ap);
+  }
+}
+
+Status WriteStringToFile(Env* env, const Slice& data, const std::string& fname,
+                         bool should_sync) {
+  unique_ptr<WritableFile> file;
+  EnvOptions soptions;
+  Status s = env->NewWritableFile(fname, &file, soptions);
+  if (!s.ok()) {
+    return s;
+  }
+  s = file->Append(data);
+  if (s.ok() && should_sync) {
+    s = file->Sync();
+  }
+  if (!s.ok()) {
+    env->DeleteFile(fname);
+  }
+  return s;
+}
+
+Status ReadFileToString(Env* env, const std::string& fname, std::string* data) {
+  EnvOptions soptions;
+  data->clear();
+  unique_ptr<SequentialFile> file;
+  Status s = env->NewSequentialFile(fname, &file, soptions);
+  if (!s.ok()) {
+    return s;
+  }
+  static const int kBufferSize = 8192;
+  char* space = new char[kBufferSize];
+  while (true) {
+    Slice fragment;
+    s = file->Read(kBufferSize, &fragment, space);
+    if (!s.ok()) {
+      break;
+    }
+    data->append(fragment.data(), fragment.size());
+    if (fragment.empty()) {
+      break;
+    }
+  }
+  delete[] space;
+  return s;
+}
+
+EnvWrapper::~EnvWrapper() {
+}
+
+namespace {  // anonymous namespace
+
+void AssignEnvOptions(EnvOptions* env_options, const DBOptions& options) {
+  env_options->use_os_buffer = options.allow_os_buffer;
+  env_options->use_mmap_reads = options.allow_mmap_reads;
+  env_options->use_mmap_writes = options.allow_mmap_writes;
+  env_options->set_fd_cloexec = options.is_fd_close_on_exec;
+  env_options->bytes_per_sync = options.bytes_per_sync;
+}
+
+}
+
+EnvOptions Env::OptimizeForLogWrite(const EnvOptions& env_options) const {
+  return env_options;
+}
+
+EnvOptions Env::OptimizeForManifestWrite(const EnvOptions& env_options) const {
+  return env_options;
+}
+
+EnvOptions::EnvOptions(const DBOptions& options) {
+  AssignEnvOptions(this, options);
+}
+
+EnvOptions::EnvOptions() {
+  DBOptions options;
+  AssignEnvOptions(this, options);
+}
+
+}  // namespace rocksdb
diff --git a/util/env_hdfs.cc b/util/env_hdfs.cc
new file mode 100644
index 0000000000..1618e54685
--- /dev/null
+++ b/util/env_hdfs.cc
@@ -0,0 +1,591 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+#ifdef USE_HDFS
+#ifndef ROCKSDB_HDFS_FILE_C
+#define ROCKSDB_HDFS_FILE_C
+
+#include <algorithm>
+#include <stdio.h>
+#include <sys/time.h>
+#include <time.h>
+#include <iostream>
+#include <sstream>
+#include "rocksdb/env.h"
+#include "rocksdb/status.h"
+#include "hdfs/env_hdfs.h"
+
+#define HDFS_EXISTS 0
+#define HDFS_DOESNT_EXIST -1
+#define HDFS_SUCCESS 0
+
+//
+// This file defines an HDFS environment for rocksdb. It uses the libhdfs
+// api to access HDFS. All HDFS files created by one instance of rocksdb
+// will reside on the same HDFS cluster.
+//
+
+namespace rocksdb {
+
+namespace {
+
+// Log error message
+static Status IOError(const std::string& context, int err_number) {
+  return Status::IOError(context, strerror(err_number));
+}
+
+// assume that there is one global logger for now. It is not thread-safe,
+// but need not be because the logger is initialized at db-open time.
+static Logger* mylog = nullptr;
+
+// Used for reading a file from HDFS. It implements both sequential-read
+// access methods as well as random read access methods.
+class HdfsReadableFile : virtual public SequentialFile, + virtual public RandomAccessFile { + private: + hdfsFS fileSys_; + std::string filename_; + hdfsFile hfile_; + + public: + HdfsReadableFile(hdfsFS fileSys, const std::string& fname) + : fileSys_(fileSys), filename_(fname), hfile_(nullptr) { + Log(mylog, "[hdfs] HdfsReadableFile opening file %s\n", + filename_.c_str()); + hfile_ = hdfsOpenFile(fileSys_, filename_.c_str(), O_RDONLY, 0, 0, 0); + Log(mylog, "[hdfs] HdfsReadableFile opened file %s hfile_=0x%p\n", + filename_.c_str(), hfile_); + } + + virtual ~HdfsReadableFile() { + Log(mylog, "[hdfs] HdfsReadableFile closing file %s\n", + filename_.c_str()); + hdfsCloseFile(fileSys_, hfile_); + Log(mylog, "[hdfs] HdfsReadableFile closed file %s\n", + filename_.c_str()); + hfile_ = nullptr; + } + + bool isValid() { + return hfile_ != nullptr; + } + + // sequential access, read data at current offset in file + virtual Status Read(size_t n, Slice* result, char* scratch) { + Status s; + Log(mylog, "[hdfs] HdfsReadableFile reading %s %ld\n", + filename_.c_str(), n); + + char* buffer = scratch; + size_t total_bytes_read = 0; + tSize bytes_read = 0; + tSize remaining_bytes = (tSize)n; + + // Read a total of n bytes repeatedly until we hit error or eof + while (remaining_bytes > 0) { + bytes_read = hdfsRead(fileSys_, hfile_, buffer, remaining_bytes); + if (bytes_read <= 0) { + break; + } + assert(bytes_read <= remaining_bytes); + + total_bytes_read += bytes_read; + remaining_bytes -= bytes_read; + buffer += bytes_read; + } + assert(total_bytes_read <= n); + + Log(mylog, "[hdfs] HdfsReadableFile read %s\n", filename_.c_str()); + + if (bytes_read < 0) { + s = IOError(filename_, errno); + } else { + *result = Slice(scratch, total_bytes_read); + } + + return s; + } + + // random access, read data from specified offset in file + virtual Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const { + Status s; + Log(mylog, "[hdfs] HdfsReadableFile preading %s\n", filename_.c_str()); + ssize_t bytes_read = hdfsPread(fileSys_, hfile_, offset, + (void*)scratch, (tSize)n); + Log(mylog, "[hdfs] HdfsReadableFile pread %s\n", filename_.c_str()); + *result = Slice(scratch, (bytes_read < 0) ? 0 : bytes_read); + if (bytes_read < 0) { + // An error: return a non-ok status + s = IOError(filename_, errno); + } + return s; + } + + virtual Status Skip(uint64_t n) { + Log(mylog, "[hdfs] HdfsReadableFile skip %s\n", filename_.c_str()); + // get current offset from file + tOffset current = hdfsTell(fileSys_, hfile_); + if (current < 0) { + return IOError(filename_, errno); + } + // seek to new offset in file + tOffset newoffset = current + n; + int val = hdfsSeek(fileSys_, hfile_, newoffset); + if (val < 0) { + return IOError(filename_, errno); + } + return Status::OK(); + } + + private: + + // returns true if we are at the end of file, false otherwise + bool feof() { + Log(mylog, "[hdfs] HdfsReadableFile feof %s\n", filename_.c_str()); + if (hdfsTell(fileSys_, hfile_) == fileSize()) { + return true; + } + return false; + } + + // the current size of the file + tOffset fileSize() { + Log(mylog, "[hdfs] HdfsReadableFile fileSize %s\n", filename_.c_str()); + hdfsFileInfo* pFileInfo = hdfsGetPathInfo(fileSys_, filename_.c_str()); + tOffset size = 0L; + if (pFileInfo != nullptr) { + size = pFileInfo->mSize; + hdfsFreeFileInfo(pFileInfo, 1); + } else { + throw HdfsFatalException("fileSize on unknown file " + filename_); + } + return size; + } +}; + +// Appends to an existing file in HDFS. 
+class HdfsWritableFile: public WritableFile { + private: + hdfsFS fileSys_; + std::string filename_; + hdfsFile hfile_; + + public: + HdfsWritableFile(hdfsFS fileSys, const std::string& fname) + : fileSys_(fileSys), filename_(fname) , hfile_(nullptr) { + Log(mylog, "[hdfs] HdfsWritableFile opening %s\n", filename_.c_str()); + hfile_ = hdfsOpenFile(fileSys_, filename_.c_str(), O_WRONLY, 0, 0, 0); + Log(mylog, "[hdfs] HdfsWritableFile opened %s\n", filename_.c_str()); + assert(hfile_ != nullptr); + } + virtual ~HdfsWritableFile() { + if (hfile_ != nullptr) { + Log(mylog, "[hdfs] HdfsWritableFile closing %s\n", filename_.c_str()); + hdfsCloseFile(fileSys_, hfile_); + Log(mylog, "[hdfs] HdfsWritableFile closed %s\n", filename_.c_str()); + hfile_ = nullptr; + } + } + + // If the file was successfully created, then this returns true. + // Otherwise returns false. + bool isValid() { + return hfile_ != nullptr; + } + + // The name of the file, mostly needed for debug logging. + const std::string& getName() { + return filename_; + } + + virtual Status Append(const Slice& data) { + Log(mylog, "[hdfs] HdfsWritableFile Append %s\n", filename_.c_str()); + const char* src = data.data(); + size_t left = data.size(); + size_t ret = hdfsWrite(fileSys_, hfile_, src, left); + Log(mylog, "[hdfs] HdfsWritableFile Appended %s\n", filename_.c_str()); + if (ret != left) { + return IOError(filename_, errno); + } + return Status::OK(); + } + + virtual Status Flush() { + return Status::OK(); + } + + virtual Status Sync() { + Status s; + Log(mylog, "[hdfs] HdfsWritableFile Sync %s\n", filename_.c_str()); + if (hdfsFlush(fileSys_, hfile_) == -1) { + return IOError(filename_, errno); + } + if (hdfsHSync(fileSys_, hfile_) == -1) { + return IOError(filename_, errno); + } + Log(mylog, "[hdfs] HdfsWritableFile Synced %s\n", filename_.c_str()); + return Status::OK(); + } + + // This is used by HdfsLogger to write data to the debug log file + virtual Status Append(const char* src, size_t size) { + if (hdfsWrite(fileSys_, hfile_, src, size) != (tSize)size) { + return IOError(filename_, errno); + } + return Status::OK(); + } + + virtual Status Close() { + Log(mylog, "[hdfs] HdfsWritableFile closing %s\n", filename_.c_str()); + if (hdfsCloseFile(fileSys_, hfile_) != 0) { + return IOError(filename_, errno); + } + Log(mylog, "[hdfs] HdfsWritableFile closed %s\n", filename_.c_str()); + hfile_ = nullptr; + return Status::OK(); + } +}; + +// The object that implements the debug logs to reside in HDFS. +class HdfsLogger : public Logger { + private: + HdfsWritableFile* file_; + uint64_t (*gettid_)(); // Return the thread id for the current thread + + public: + HdfsLogger(HdfsWritableFile* f, uint64_t (*gettid)()) + : file_(f), gettid_(gettid) { + Log(mylog, "[hdfs] HdfsLogger opened %s\n", + file_->getName().c_str()); + } + + virtual ~HdfsLogger() { + Log(mylog, "[hdfs] HdfsLogger closed %s\n", + file_->getName().c_str()); + delete file_; + if (mylog != nullptr && mylog == this) { + mylog = nullptr; + } + } + + virtual void Logv(const char* format, va_list ap) { + const uint64_t thread_id = (*gettid_)(); + + // We try twice: the first time with a fixed-size stack allocated buffer, + // and the second time with a much larger dynamically allocated buffer. 
+    char buffer[500];
+    for (int iter = 0; iter < 2; iter++) {
+      char* base;
+      int bufsize;
+      if (iter == 0) {
+        bufsize = sizeof(buffer);
+        base = buffer;
+      } else {
+        bufsize = 30000;
+        base = new char[bufsize];
+      }
+      char* p = base;
+      char* limit = base + bufsize;
+
+      struct timeval now_tv;
+      gettimeofday(&now_tv, nullptr);
+      const time_t seconds = now_tv.tv_sec;
+      struct tm t;
+      localtime_r(&seconds, &t);
+      p += snprintf(p, limit - p,
+                    "%04d/%02d/%02d-%02d:%02d:%02d.%06d %llx ",
+                    t.tm_year + 1900,
+                    t.tm_mon + 1,
+                    t.tm_mday,
+                    t.tm_hour,
+                    t.tm_min,
+                    t.tm_sec,
+                    static_cast<int>(now_tv.tv_usec),
+                    static_cast<long long unsigned int>(thread_id));
+
+      // Print the message
+      if (p < limit) {
+        va_list backup_ap;
+        va_copy(backup_ap, ap);
+        p += vsnprintf(p, limit - p, format, backup_ap);
+        va_end(backup_ap);
+      }
+
+      // Truncate to available space if necessary
+      if (p >= limit) {
+        if (iter == 0) {
+          continue;       // Try again with larger buffer
+        } else {
+          p = limit - 1;
+        }
+      }
+
+      // Add newline if necessary
+      if (p == base || p[-1] != '\n') {
+        *p++ = '\n';
+      }
+
+      assert(p <= limit);
+      file_->Append(base, p-base);
+      file_->Flush();
+      if (base != buffer) {
+        delete[] base;
+      }
+      break;
+    }
+  }
+};
+
+}  // namespace
+
+// Finally, the hdfs environment
+
+const std::string HdfsEnv::kProto = "hdfs://";
+const std::string HdfsEnv::pathsep = "/";
+
+// open a file for sequential reading
+Status HdfsEnv::NewSequentialFile(const std::string& fname,
+                                  unique_ptr<SequentialFile>* result,
+                                  const EnvOptions& options) {
+  result->reset();
+  HdfsReadableFile* f = new HdfsReadableFile(fileSys_, fname);
+  if (f == nullptr || !f->isValid()) {
+    delete f;
+    *result = nullptr;
+    return IOError(fname, errno);
+  }
+  result->reset(dynamic_cast<SequentialFile*>(f));
+  return Status::OK();
+}
+
+// open a file for random reading
+Status HdfsEnv::NewRandomAccessFile(const std::string& fname,
+                                    unique_ptr<RandomAccessFile>* result,
+                                    const EnvOptions& options) {
+  result->reset();
+  HdfsReadableFile* f = new HdfsReadableFile(fileSys_, fname);
+  if (f == nullptr || !f->isValid()) {
+    delete f;
+    *result = nullptr;
+    return IOError(fname, errno);
+  }
+  result->reset(dynamic_cast<RandomAccessFile*>(f));
+  return Status::OK();
+}
+
+// create a new file for writing
+Status HdfsEnv::NewWritableFile(const std::string& fname,
+                                unique_ptr<WritableFile>* result,
+                                const EnvOptions& options) {
+  result->reset();
+  Status s;
+  HdfsWritableFile* f = new HdfsWritableFile(fileSys_, fname);
+  if (f == nullptr || !f->isValid()) {
+    delete f;
+    *result = nullptr;
+    return IOError(fname, errno);
+  }
+  result->reset(dynamic_cast<WritableFile*>(f));
+  return Status::OK();
+}
+
+Status HdfsEnv::NewRandomRWFile(const std::string& fname,
+                                unique_ptr<RandomRWFile>* result,
+                                const EnvOptions& options) {
+  return Status::NotSupported("NewRandomRWFile not supported on HdfsEnv");
+}
+
+class HdfsDirectory : public Directory {
+ public:
+  explicit HdfsDirectory(int fd) : fd_(fd) {}
+  ~HdfsDirectory() {}
+
+  virtual Status Fsync() { return Status::OK(); }
+
+ private:
+  int fd_;
+};
+
+Status HdfsEnv::NewDirectory(const std::string& name,
+                             unique_ptr<Directory>* result) {
+  int value = hdfsExists(fileSys_, name.c_str());
+  switch (value) {
+    case HDFS_EXISTS:
+      result->reset(new HdfsDirectory(0));
+      return Status::OK();
+    default:  // fail if the directory doesn't exist
+      Log(mylog, "NewDirectory hdfsExists call failed");
+      throw HdfsFatalException("hdfsExists call failed with error " +
+                               std::to_string(value) + " on path " + name +
+                               ".\n");
+  }
+}
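+
+// Caller-side sketch for the factories above (illustrative only; the path
+// and the already-constructed HdfsEnv* env are hypothetical). Ownership of
+// the file moves into the unique_ptr, and failures surface as Status:
+//
+//   unique_ptr<SequentialFile> file;
+//   Status s = env->NewSequentialFile("/logs/LOG", &file, EnvOptions());
+//   if (s.ok()) {
+//     char scratch[4096];
+//     Slice chunk;
+//     s = file->Read(sizeof(scratch), &chunk, scratch);  // reads up to 4 KB
+//   }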
+
+bool HdfsEnv::FileExists(const std::string& fname) {
+  int value = hdfsExists(fileSys_, fname.c_str());
+  switch (value) {
+    case HDFS_EXISTS:
+      return true;
+    case HDFS_DOESNT_EXIST:
+      return false;
+    default:  // anything else should be an error
+      Log(mylog, "FileExists hdfsExists call failed");
+      throw HdfsFatalException("hdfsExists call failed with error " +
+                               std::to_string(value) + " on path " + fname +
+                               ".\n");
+  }
+}
+
+Status HdfsEnv::GetChildren(const std::string& path,
+                            std::vector<std::string>* result) {
+  int value = hdfsExists(fileSys_, path.c_str());
+  switch (value) {
+    case HDFS_EXISTS: {  // directory exists
+      int numEntries = 0;
+      hdfsFileInfo* pHdfsFileInfo = 0;
+      pHdfsFileInfo = hdfsListDirectory(fileSys_, path.c_str(), &numEntries);
+      if (numEntries >= 0) {
+        for (int i = 0; i < numEntries; i++) {
+          char* pathname = pHdfsFileInfo[i].mName;
+          char* filename = rindex(pathname, '/');
+          if (filename != nullptr) {
+            result->push_back(filename+1);
+          }
+        }
+        if (pHdfsFileInfo != nullptr) {
+          hdfsFreeFileInfo(pHdfsFileInfo, numEntries);
+        }
+      } else {
+        // numEntries < 0 indicates error
+        Log(mylog, "hdfsListDirectory call failed with error ");
+        throw HdfsFatalException(
+            "hdfsListDirectory call failed with negative numEntries.\n");
+      }
+      break;
+    }
+    case HDFS_DOESNT_EXIST:  // directory does not exist, exit
+      break;
+    default:  // anything else should be an error
+      Log(mylog, "GetChildren hdfsExists call failed");
+      throw HdfsFatalException("hdfsExists call failed with error " +
+                               std::to_string(value) + ".\n");
+  }
+  return Status::OK();
+}
+
+Status HdfsEnv::DeleteFile(const std::string& fname) {
+  if (hdfsDelete(fileSys_, fname.c_str(), 1) == 0) {
+    return Status::OK();
+  }
+  return IOError(fname, errno);
+};
+
+Status HdfsEnv::CreateDir(const std::string& name) {
+  if (hdfsCreateDirectory(fileSys_, name.c_str()) == 0) {
+    return Status::OK();
+  }
+  return IOError(name, errno);
+};
+
+Status HdfsEnv::CreateDirIfMissing(const std::string& name) {
+  const int value = hdfsExists(fileSys_, name.c_str());
+  // Not atomic. State might change between hdfsExists and CreateDir.
+  switch (value) {
+    case HDFS_EXISTS:
+      return Status::OK();
+    case HDFS_DOESNT_EXIST:
+      return CreateDir(name);
+    default:  // anything else should be an error
+      Log(mylog, "CreateDirIfMissing hdfsExists call failed");
+      throw HdfsFatalException("hdfsExists call failed with error " +
+                               std::to_string(value) + ".\n");
+  }
+};
+
+Status HdfsEnv::DeleteDir(const std::string& name) {
+  return DeleteFile(name);
+};
+
+Status HdfsEnv::GetFileSize(const std::string& fname, uint64_t* size) {
+  *size = 0L;
+  hdfsFileInfo* pFileInfo = hdfsGetPathInfo(fileSys_, fname.c_str());
+  if (pFileInfo != nullptr) {
+    *size = pFileInfo->mSize;
+    hdfsFreeFileInfo(pFileInfo, 1);
+    return Status::OK();
+  }
+  return IOError(fname, errno);
+}
+
+Status HdfsEnv::GetFileModificationTime(const std::string& fname,
+                                        uint64_t* time) {
+  hdfsFileInfo* pFileInfo = hdfsGetPathInfo(fileSys_, fname.c_str());
+  if (pFileInfo != nullptr) {
+    *time = static_cast<uint64_t>(pFileInfo->mLastMod);
+    hdfsFreeFileInfo(pFileInfo, 1);
+    return Status::OK();
+  }
+  return IOError(fname, errno);
+}
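+
+// Directory-walk sketch built on GetChildren/GetFileSize above (illustrative
+// only; "/db" is a hypothetical path and error handling is elided). Note
+// that GetChildren returns bare names, so callers re-join them with the
+// directory before asking for sizes:
+//
+//   std::vector<std::string> children;
+//   if (env->GetChildren("/db", &children).ok()) {
+//     for (const std::string& child : children) {
+//       uint64_t size = 0;
+//       env->GetFileSize("/db" + HdfsEnv::pathsep + child, &size);
+//     }
+//   }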
+
+// The rename is not atomic. HDFS does not allow a renaming if the
+// target already exists. So, we delete the target before attempting the
+// rename.
+Status HdfsEnv::RenameFile(const std::string& src, const std::string& target) {
+  hdfsDelete(fileSys_, target.c_str(), 1);
+  if (hdfsRename(fileSys_, src.c_str(), target.c_str()) == 0) {
+    return Status::OK();
+  }
+  return IOError(src, errno);
+}
+
+Status HdfsEnv::LockFile(const std::string& fname, FileLock** lock) {
+  // there isn't a very good way to atomically check and create
+  // a file via libhdfs
+  *lock = nullptr;
+  return Status::OK();
+}
+
+Status HdfsEnv::UnlockFile(FileLock* lock) {
+  return Status::OK();
+}
+
+Status HdfsEnv::NewLogger(const std::string& fname,
+                          shared_ptr<Logger>* result) {
+  HdfsWritableFile* f = new HdfsWritableFile(fileSys_, fname);
+  if (f == nullptr || !f->isValid()) {
+    delete f;
+    *result = nullptr;
+    return IOError(fname, errno);
+  }
+  HdfsLogger* h = new HdfsLogger(f, &HdfsEnv::gettid);
+  result->reset(h);
+  if (mylog == nullptr) {
+    // mylog = h; // uncomment this for detailed logging
+  }
+  return Status::OK();
+}
+
+}  // namespace rocksdb
+
+#endif // ROCKSDB_HDFS_FILE_C
+
+#else // USE_HDFS
+
+// dummy placeholders used when HDFS is not available
+#include "rocksdb/env.h"
+#include "hdfs/env_hdfs.h"
+namespace rocksdb {
+  Status HdfsEnv::NewSequentialFile(const std::string& fname,
+                                    unique_ptr<SequentialFile>* result,
+                                    const EnvOptions& options) {
+    return Status::NotSupported("Not compiled with hdfs support");
+  }
+}
+
+#endif
diff --git a/util/env_posix.cc b/util/env_posix.cc
new file mode 100644
index 0000000000..2679586068
--- /dev/null
+++ b/util/env_posix.cc
@@ -0,0 +1,1726 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <deque>
+#include <set>
+#include <dirent.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#ifdef OS_LINUX
+#include <sys/statfs.h>
+#endif
+#include <sys/time.h>
+#include <sys/types.h>
+#include <time.h>
+#include <unistd.h>
+#if defined(OS_LINUX)
+#include <linux/fs.h>
+#include <fcntl.h>
+#endif
+#if defined(LEVELDB_PLATFORM_ANDROID)
+#include <sys/stat.h>
+#endif
+#include "rocksdb/env.h"
+#include "rocksdb/slice.h"
+#include "port/port.h"
+#include "util/coding.h"
+#include "util/logging.h"
+#include "util/posix_logger.h"
+#include "util/random.h"
+#include <signal.h>
+
+// Get nano time for mach systems
+#ifdef __MACH__
+#include <mach/clock.h>
+#include <mach/mach.h>
+#endif
+
+#if !defined(TMPFS_MAGIC)
+#define TMPFS_MAGIC 0x01021994
+#endif
+#if !defined(XFS_SUPER_MAGIC)
+#define XFS_SUPER_MAGIC 0x58465342
+#endif
+#if !defined(EXT4_SUPER_MAGIC)
+#define EXT4_SUPER_MAGIC 0xEF53
+#endif
+
+// For non-Linux platforms, the following macros are used only as
+// placeholders.
+#ifndef OS_LINUX
+#define POSIX_FADV_NORMAL 0     /* [MC1] no further special treatment */
+#define POSIX_FADV_RANDOM 1     /* [MC1] expect random page refs */
+#define POSIX_FADV_SEQUENTIAL 2 /* [MC1] expect sequential page refs */
+#define POSIX_FADV_WILLNEED 3   /* [MC1] will need these pages */
+#define POSIX_FADV_DONTNEED 4   /* [MC1] dont need these pages */
+#endif
+
+// This is only set from db_stress.cc and for testing only.
+// If non-zero, kill at various points in source code with probability 1/this
+int rocksdb_kill_odds = 0;
+
+namespace rocksdb {
+
+namespace {
+
+// A wrapper for fadvise; if the platform doesn't support fadvise,
+// it simply does nothing.
+int Fadvise(int fd, off_t offset, size_t len, int advice) {
+#ifdef OS_LINUX
+  return posix_fadvise(fd, offset, len, advice);
+#else
+  return 0;  // simply do nothing.
+#endif
+}
+
+// list of pathnames that are locked
+static std::set<std::string> lockedFiles;
+static port::Mutex mutex_lockedFiles;
+
+static Status IOError(const std::string& context, int err_number) {
+  return Status::IOError(context, strerror(err_number));
+}
+
+#ifdef NDEBUG
+// empty in release build
+#define TEST_KILL_RANDOM(rocksdb_kill_odds)
+#else
+
+// Kill the process with probability 1/odds for testing.
+static void TestKillRandom(int odds, const std::string& srcfile,
+                           int srcline) {
+  time_t curtime = time(nullptr);
+  Random r((uint32_t)curtime);
+
+  assert(odds > 0);
+  bool crash = r.OneIn(odds);
+  if (crash) {
+    fprintf(stdout, "Crashing at %s:%d\n", srcfile.c_str(), srcline);
+    fflush(stdout);
+    kill(getpid(), SIGTERM);
+  }
+}
+
+// To avoid crashing always at some frequently executed codepaths (during
+// kill random test), use this factor to reduce odds
+#define REDUCE_ODDS 2
+#define REDUCE_ODDS2 4
+
+#define TEST_KILL_RANDOM(rocksdb_kill_odds) {                   \
+  if (rocksdb_kill_odds > 0) {                                  \
+    TestKillRandom(rocksdb_kill_odds, __FILE__, __LINE__);      \
+  }                                                             \
+}
+
+#endif
+
+#if defined(OS_LINUX)
+namespace {
+  static size_t GetUniqueIdFromFile(int fd, char* id, size_t max_size) {
+    if (max_size < kMaxVarint64Length*3) {
+      return 0;
+    }
+
+    struct stat buf;
+    int result = fstat(fd, &buf);
+    if (result == -1) {
+      return 0;
+    }
+
+    long version = 0;
+    result = ioctl(fd, FS_IOC_GETVERSION, &version);
+    if (result == -1) {
+      return 0;
+    }
+    uint64_t uversion = (uint64_t)version;
+
+    char* rid = id;
+    rid = EncodeVarint64(rid, buf.st_dev);
+    rid = EncodeVarint64(rid, buf.st_ino);
+    rid = EncodeVarint64(rid, uversion);
+    assert(rid >= id);
+    return static_cast<size_t>(rid-id);
+  }
+}
+#endif
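+
+// The id emitted above is just three varint64s (device, inode, file
+// version) concatenated. A sketch of the matching decoder, using the same
+// helpers from util/coding.h (illustrative; no such decoder exists in this
+// patch):
+//
+//   bool DecodeUniqueId(Slice id, uint64_t* dev, uint64_t* ino,
+//                       uint64_t* version) {
+//     return GetVarint64(&id, dev) && GetVarint64(&id, ino) &&
+//            GetVarint64(&id, version);
+//   }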
+
+class PosixSequentialFile: public SequentialFile {
+ private:
+  std::string filename_;
+  FILE* file_;
+  int fd_;
+  bool use_os_buffer_;
+
+ public:
+  PosixSequentialFile(const std::string& fname, FILE* f,
+                      const EnvOptions& options)
+      : filename_(fname), file_(f), fd_(fileno(f)),
+        use_os_buffer_(options.use_os_buffer) {
+  }
+  virtual ~PosixSequentialFile() { fclose(file_); }
+
+  virtual Status Read(size_t n, Slice* result, char* scratch) {
+    Status s;
+    size_t r = 0;
+    do {
+      r = fread_unlocked(scratch, 1, n, file_);
+    } while (r == 0 && ferror(file_) && errno == EINTR);
+    *result = Slice(scratch, r);
+    if (r < n) {
+      if (feof(file_)) {
+        // We leave status as ok if we hit the end of the file.
+        // We also clear the error so that the reads can continue
+        // if new data is written to the file.
+        clearerr(file_);
+      } else {
+        // A partial read with an error: return a non-ok status
+        s = IOError(filename_, errno);
+      }
+    }
+    if (!use_os_buffer_) {
+      // we need to fadvise away the entire range of pages because
+      // we do not want readahead pages to be cached.
+      Fadvise(fd_, 0, 0, POSIX_FADV_DONTNEED); // free OS pages
+    }
+    return s;
+  }
+
+  virtual Status Skip(uint64_t n) {
+    if (fseek(file_, n, SEEK_CUR)) {
+      return IOError(filename_, errno);
+    }
+    return Status::OK();
+  }
+
+  virtual Status InvalidateCache(size_t offset, size_t length) {
+#ifndef OS_LINUX
+    return Status::OK();
+#else
+    // free OS pages
+    int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED);
+    if (ret == 0) {
+      return Status::OK();
+    }
+    return IOError(filename_, errno);
+#endif
+  }
+};
+
+// pread() based random-access
+class PosixRandomAccessFile: public RandomAccessFile {
+ private:
+  std::string filename_;
+  int fd_;
+  bool use_os_buffer_;
+
+ public:
+  PosixRandomAccessFile(const std::string& fname, int fd,
+                        const EnvOptions& options)
+      : filename_(fname), fd_(fd), use_os_buffer_(options.use_os_buffer) {
+    assert(!options.use_mmap_reads);
+  }
+  virtual ~PosixRandomAccessFile() { close(fd_); }
+
+  virtual Status Read(uint64_t offset, size_t n, Slice* result,
+                      char* scratch) const {
+    Status s;
+    ssize_t r = -1;
+    do {
+      r = pread(fd_, scratch, n, static_cast<off_t>(offset));
+    } while (r < 0 && errno == EINTR);
+    *result = Slice(scratch, (r < 0) ? 0 : r);
+    if (r < 0) {
+      // An error: return a non-ok status
+      s = IOError(filename_, errno);
+    }
+    if (!use_os_buffer_) {
+      // we need to fadvise away the entire range of pages because
+      // we do not want readahead pages to be cached.
+      Fadvise(fd_, 0, 0, POSIX_FADV_DONTNEED); // free OS pages
+    }
+    return s;
+  }
+
+#ifdef OS_LINUX
+  virtual size_t GetUniqueId(char* id, size_t max_size) const {
+    return GetUniqueIdFromFile(fd_, id, max_size);
+  }
+#endif
+
+  virtual void Hint(AccessPattern pattern) {
+    switch(pattern) {
+      case NORMAL:
+        Fadvise(fd_, 0, 0, POSIX_FADV_NORMAL);
+        break;
+      case RANDOM:
+        Fadvise(fd_, 0, 0, POSIX_FADV_RANDOM);
+        break;
+      case SEQUENTIAL:
+        Fadvise(fd_, 0, 0, POSIX_FADV_SEQUENTIAL);
+        break;
+      case WILLNEED:
+        Fadvise(fd_, 0, 0, POSIX_FADV_WILLNEED);
+        break;
+      case DONTNEED:
+        Fadvise(fd_, 0, 0, POSIX_FADV_DONTNEED);
+        break;
+      default:
+        assert(false);
+        break;
+    }
+  }
+
+  virtual Status InvalidateCache(size_t offset, size_t length) {
+#ifndef OS_LINUX
+    return Status::OK();
+#else
+    // free OS pages
+    int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED);
+    if (ret == 0) {
+      return Status::OK();
+    }
+    return IOError(filename_, errno);
+#endif
+  }
+};
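+
+// Access-pattern sketch (illustrative; env, options, file_size and the path
+// are assumed from the caller): passing RANDOM up front lets the fadvise
+// call above reach the kernel before any positional reads are issued.
+//
+//   unique_ptr<RandomAccessFile> file;
+//   if (env->NewRandomAccessFile("/db/000123.sst", &file, options).ok()) {
+//     file->Hint(RandomAccessFile::RANDOM);
+//     char scratch[512];
+//     Slice footer;
+//     file->Read(file_size - 512, 512, &footer, scratch);  // read the tail
+//   }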
+
+// mmap() based random-access
+class PosixMmapReadableFile: public RandomAccessFile {
+ private:
+  int fd_;
+  std::string filename_;
+  void* mmapped_region_;
+  size_t length_;
+
+ public:
+  // base[0,length-1] contains the mmapped contents of the file.
+  PosixMmapReadableFile(const int fd, const std::string& fname,
+                        void* base, size_t length,
+                        const EnvOptions& options)
+      : fd_(fd), filename_(fname), mmapped_region_(base), length_(length) {
+    fd_ = fd_ + 0;  // suppress the warning for used variables
+    assert(options.use_mmap_reads);
+    assert(options.use_os_buffer);
+  }
+  virtual ~PosixMmapReadableFile() {
+    int ret = munmap(mmapped_region_, length_);
+    if (ret != 0) {
+      fprintf(stdout, "failed to munmap %p length %zu \n",
+              mmapped_region_, length_);
+    }
+  }
+
+  virtual Status Read(uint64_t offset, size_t n, Slice* result,
+                      char* scratch) const {
+    Status s;
+    if (offset + n > length_) {
+      *result = Slice();
+      s = IOError(filename_, EINVAL);
+    } else {
+      *result = Slice(reinterpret_cast<char*>(mmapped_region_) + offset, n);
+    }
+    return s;
+  }
+  virtual Status InvalidateCache(size_t offset, size_t length) {
+#ifndef OS_LINUX
+    return Status::OK();
+#else
+    // free OS pages
+    int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED);
+    if (ret == 0) {
+      return Status::OK();
+    }
+    return IOError(filename_, errno);
+#endif
+  }
+};
+
+// We preallocate up to an extra megabyte and use memcpy to append new
+// data to the file. This is safe since we either properly close the
+// file before reading from it, or for log files, the reading code
+// knows enough to skip zero suffixes.
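+//
+// A minimal standalone sketch of the grow-then-memcpy cycle the class below
+// implements (Linux-flavored, illustrative only; fd, data and n come from
+// the caller, and ftruncate stands in for the fallocate path used below):
+//
+//   size_t map_size = 65536;
+//   off_t file_offset = 0;
+//   ftruncate(fd, file_offset + map_size);             // reserve space
+//   char* base = static_cast<char*>(
+//       mmap(nullptr, map_size, PROT_READ | PROT_WRITE, MAP_SHARED,
+//            fd, file_offset));
+//   memcpy(base, data, n);                             // append == memcpy
+//   munmap(base, map_size);                            // unmap this window
+//   file_offset += map_size;                           // move to next window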
+class PosixMmapFile : public WritableFile {
+ private:
+  std::string filename_;
+  int fd_;
+  size_t page_size_;
+  size_t map_size_;       // How much extra memory to map at a time
+  char* base_;            // The mapped region
+  char* limit_;           // Limit of the mapped region
+  char* dst_;             // Where to write next  (in range [base_,limit_])
+  char* last_sync_;       // Where have we synced up to
+  uint64_t file_offset_;  // Offset of base_ in file
+  // Have we done an munmap of unsynced data?
+  bool pending_sync_;
+#ifdef ROCKSDB_FALLOCATE_PRESENT
+  bool fallocate_with_keep_size_;
+#endif
+
+  // Roundup x to a multiple of y
+  static size_t Roundup(size_t x, size_t y) {
+    return ((x + y - 1) / y) * y;
+  }
+
+  size_t TruncateToPageBoundary(size_t s) {
+    s -= (s & (page_size_ - 1));
+    assert((s % page_size_) == 0);
+    return s;
+  }
+
+  bool UnmapCurrentRegion() {
+    bool result = true;
+    TEST_KILL_RANDOM(rocksdb_kill_odds);
+    if (base_ != nullptr) {
+      if (last_sync_ < limit_) {
+        // Defer syncing this data until next Sync() call, if any
+        pending_sync_ = true;
+      }
+      if (munmap(base_, limit_ - base_) != 0) {
+        result = false;
+      }
+      file_offset_ += limit_ - base_;
+      base_ = nullptr;
+      limit_ = nullptr;
+      last_sync_ = nullptr;
+      dst_ = nullptr;
+
+      // Increase the amount we map the next time, but capped at 1MB
+      if (map_size_ < (1<<20)) {
+        map_size_ *= 2;
+      }
+    }
+    return result;
+  }
+
+  Status MapNewRegion() {
+#ifdef ROCKSDB_FALLOCATE_PRESENT
+    assert(base_ == nullptr);
+
+    TEST_KILL_RANDOM(rocksdb_kill_odds);
+    // we can't fallocate with FALLOC_FL_KEEP_SIZE here
+    int alloc_status = fallocate(fd_, 0, file_offset_, map_size_);
+    if (alloc_status != 0) {
+      // fallback to posix_fallocate
+      alloc_status = posix_fallocate(fd_, file_offset_, map_size_);
+    }
+    if (alloc_status != 0) {
+      return Status::IOError("Error allocating space to file : " + filename_ +
+                             "Error : " + strerror(alloc_status));
+    }
+
+    TEST_KILL_RANDOM(rocksdb_kill_odds);
+    void* ptr = mmap(nullptr, map_size_, PROT_READ | PROT_WRITE, MAP_SHARED,
+                     fd_, file_offset_);
+    if (ptr == MAP_FAILED) {
+      return Status::IOError("MMap failed on " + filename_);
+    }
+
+    TEST_KILL_RANDOM(rocksdb_kill_odds);
+
+    base_ = reinterpret_cast<char*>(ptr);
+    limit_ = base_ + map_size_;
+    dst_ = base_;
+    last_sync_ = base_;
+    return Status::OK();
+#else
+    return Status::NotSupported("This platform doesn't support fallocate()");
+#endif
+  }
+
+ public:
+  PosixMmapFile(const std::string& fname, int fd, size_t page_size,
+                const EnvOptions& options)
+      : filename_(fname),
+        fd_(fd),
+        page_size_(page_size),
+        map_size_(Roundup(65536, page_size)),
+        base_(nullptr),
+        limit_(nullptr),
+        dst_(nullptr),
+        last_sync_(nullptr),
+        file_offset_(0),
+        pending_sync_(false) {
+#ifdef ROCKSDB_FALLOCATE_PRESENT
+    fallocate_with_keep_size_ = options.fallocate_with_keep_size;
+#endif
+    assert((page_size & (page_size - 1)) == 0);
+    assert(options.use_mmap_writes);
+  }
+
+  ~PosixMmapFile() {
+    if (fd_ >= 0) {
+      PosixMmapFile::Close();
+    }
+  }
+
+  virtual Status Append(const Slice& data) {
+    const char* src = data.data();
+    size_t left = data.size();
+    TEST_KILL_RANDOM(rocksdb_kill_odds * REDUCE_ODDS);
+    PrepareWrite(GetFileSize(), left);
+    while (left > 0) {
+      assert(base_ <= dst_);
+      assert(dst_ <= limit_);
+      size_t avail = limit_ - dst_;
+      if (avail == 0) {
+        if (UnmapCurrentRegion()) {
+          Status s = MapNewRegion();
+          if (!s.ok()) {
+            return s;
+          }
+          TEST_KILL_RANDOM(rocksdb_kill_odds);
+        }
+      }
+
+      size_t n = (left <= avail) ?
left : avail; + memcpy(dst_, src, n); + dst_ += n; + src += n; + left -= n; + } + TEST_KILL_RANDOM(rocksdb_kill_odds); + return Status::OK(); + } + + virtual Status Close() { + Status s; + size_t unused = limit_ - dst_; + + TEST_KILL_RANDOM(rocksdb_kill_odds); + + if (!UnmapCurrentRegion()) { + s = IOError(filename_, errno); + } else if (unused > 0) { + // Trim the extra space at the end of the file + if (ftruncate(fd_, file_offset_ - unused) < 0) { + s = IOError(filename_, errno); + } + } + + TEST_KILL_RANDOM(rocksdb_kill_odds); + + if (close(fd_) < 0) { + if (s.ok()) { + s = IOError(filename_, errno); + } + } + + fd_ = -1; + base_ = nullptr; + limit_ = nullptr; + return s; + } + + virtual Status Flush() { + TEST_KILL_RANDOM(rocksdb_kill_odds); + return Status::OK(); + } + + virtual Status Sync() { + Status s; + + if (pending_sync_) { + // Some unmapped data was not synced + TEST_KILL_RANDOM(rocksdb_kill_odds); + pending_sync_ = false; + if (fdatasync(fd_) < 0) { + s = IOError(filename_, errno); + } + TEST_KILL_RANDOM(rocksdb_kill_odds * REDUCE_ODDS); + } + + if (dst_ > last_sync_) { + // Find the beginnings of the pages that contain the first and last + // bytes to be synced. + size_t p1 = TruncateToPageBoundary(last_sync_ - base_); + size_t p2 = TruncateToPageBoundary(dst_ - base_ - 1); + last_sync_ = dst_; + TEST_KILL_RANDOM(rocksdb_kill_odds); + if (msync(base_ + p1, p2 - p1 + page_size_, MS_SYNC) < 0) { + s = IOError(filename_, errno); + } + TEST_KILL_RANDOM(rocksdb_kill_odds); + } + + return s; + } + + /** + * Flush data as well as metadata to stable storage. + */ + virtual Status Fsync() { + if (pending_sync_) { + // Some unmapped data was not synced + TEST_KILL_RANDOM(rocksdb_kill_odds); + pending_sync_ = false; + if (fsync(fd_) < 0) { + return IOError(filename_, errno); + } + TEST_KILL_RANDOM(rocksdb_kill_odds); + } + // This invocation to Sync will not issue the call to + // fdatasync because pending_sync_ has already been cleared. + return Sync(); + } + + /** + * Get the size of valid data in the file. This will not match the + * size that is returned from the filesystem because we use mmap + * to extend file by map_size every time. + */ + virtual uint64_t GetFileSize() { + size_t used = dst_ - base_; + return file_offset_ + used; + } + + virtual Status InvalidateCache(size_t offset, size_t length) { +#ifndef OS_LINUX + return Status::OK(); +#else + // free OS pages + int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED); + if (ret == 0) { + return Status::OK(); + } + return IOError(filename_, errno); +#endif + } + +#ifdef ROCKSDB_FALLOCATE_PRESENT + virtual Status Allocate(off_t offset, off_t len) { + TEST_KILL_RANDOM(rocksdb_kill_odds); + int alloc_status = fallocate( + fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0, offset, len); + if (alloc_status == 0) { + return Status::OK(); + } else { + return IOError(filename_, errno); + } + } +#endif +}; + +// Use posix write to write data to a file. 
+class PosixWritableFile : public WritableFile {
+ private:
+  const std::string filename_;
+  int fd_;
+  size_t cursize_;           // current size of cached data in buf_
+  size_t capacity_;          // max size of buf_
+  unique_ptr<char[]> buf_;   // a buffer to cache writes
+  uint64_t filesize_;
+  bool pending_sync_;
+  bool pending_fsync_;
+  uint64_t last_sync_size_;
+  uint64_t bytes_per_sync_;
+#ifdef ROCKSDB_FALLOCATE_PRESENT
+  bool fallocate_with_keep_size_;
+#endif
+
+ public:
+  PosixWritableFile(const std::string& fname, int fd, size_t capacity,
+                    const EnvOptions& options)
+      : filename_(fname),
+        fd_(fd),
+        cursize_(0),
+        capacity_(capacity),
+        buf_(new char[capacity]),
+        filesize_(0),
+        pending_sync_(false),
+        pending_fsync_(false),
+        last_sync_size_(0),
+        bytes_per_sync_(options.bytes_per_sync) {
+#ifdef ROCKSDB_FALLOCATE_PRESENT
+    fallocate_with_keep_size_ = options.fallocate_with_keep_size;
+#endif
+    assert(!options.use_mmap_writes);
+  }
+
+  ~PosixWritableFile() {
+    if (fd_ >= 0) {
+      PosixWritableFile::Close();
+    }
+  }
+
+  virtual Status Append(const Slice& data) {
+    const char* src = data.data();
+    size_t left = data.size();
+    Status s;
+    pending_sync_ = true;
+    pending_fsync_ = true;
+
+    TEST_KILL_RANDOM(rocksdb_kill_odds * REDUCE_ODDS2);
+
+    PrepareWrite(GetFileSize(), left);
+    // if there is no space in the cache, then flush
+    if (cursize_ + left > capacity_) {
+      s = Flush();
+      if (!s.ok()) {
+        return s;
+      }
+      // Increase the buffer size, but capped at 1MB
+      if (capacity_ < (1<<20)) {
+        capacity_ *= 2;
+        buf_.reset(new char[capacity_]);
+      }
+      assert(cursize_ == 0);
+    }
+
+    // if the write fits into the cache, then write to cache
+    // otherwise do a write() syscall to write to OS buffers.
+    if (cursize_ + left <= capacity_) {
+      memcpy(buf_.get()+cursize_, src, left);
+      cursize_ += left;
+    } else {
+      while (left != 0) {
+        ssize_t done = write(fd_, src, left);
+        if (done < 0) {
+          if (errno == EINTR) {
+            continue;
+          }
+          return IOError(filename_, errno);
+        }
+        TEST_KILL_RANDOM(rocksdb_kill_odds);
+
+        left -= done;
+        src += done;
+      }
+    }
+    filesize_ += data.size();
+    return Status::OK();
+  }
+
+  virtual Status Close() {
+    Status s;
+    s = Flush();  // flush cache to OS
+    if (!s.ok()) {
+      return s;
+    }
+
+    TEST_KILL_RANDOM(rocksdb_kill_odds);
+
+    size_t block_size;
+    size_t last_allocated_block;
+    GetPreallocationStatus(&block_size, &last_allocated_block);
+    if (last_allocated_block > 0) {
+      // trim the extra space preallocated at the end of the file
+      int dummy __attribute__((unused));
+      dummy = ftruncate(fd_, filesize_);  // ignore errors
+    }
+
+    if (close(fd_) < 0) {
+      if (s.ok()) {
+        s = IOError(filename_, errno);
+      }
+    }
+    fd_ = -1;
+    return s;
+  }
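+
+  // The Append() above and Flush() below form a simple user-space write
+  // buffer: writes accumulate in buf_, and the buffer doubles (capped at
+  // 1MB) whenever it overflows. A condensed sketch of that policy
+  // (FlushBufferToFd/WriteDirectlyToFd are hypothetical helpers):
+  //
+  //   if (cursize + incoming > capacity) {
+  //     FlushBufferToFd();                     // drain the cached bytes
+  //     if (capacity < (1 << 20)) {
+  //       capacity *= 2;                       // amortize future flushes
+  //       buf.reset(new char[capacity]);
+  //     }
+  //   }
+  //   if (cursize + incoming <= capacity) {
+  //     memcpy(buf.get() + cursize, src, incoming);  // cheap buffered path
+  //     cursize += incoming;
+  //   } else {
+  //     WriteDirectlyToFd(src, incoming);      // oversized writes bypass buf
+  //   }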
+
+  // write out the cached data to the OS cache
+  virtual Status Flush() {
+    TEST_KILL_RANDOM(rocksdb_kill_odds * REDUCE_ODDS2);
+    size_t left = cursize_;
+    char* src = buf_.get();
+    while (left != 0) {
+      ssize_t done = write(fd_, src, left);
+      if (done < 0) {
+        if (errno == EINTR) {
+          continue;
+        }
+        return IOError(filename_, errno);
+      }
+      TEST_KILL_RANDOM(rocksdb_kill_odds * REDUCE_ODDS2);
+      left -= done;
+      src += done;
+    }
+    cursize_ = 0;
+
+    // sync OS cache to disk for every bytes_per_sync_
+    // TODO: give log file and sst file different options (log
+    // files could be potentially cached in OS for their whole
+    // life time, thus we might not want to flush at all).
+    if (bytes_per_sync_ &&
+        filesize_ - last_sync_size_ >= bytes_per_sync_) {
+      RangeSync(last_sync_size_, filesize_ - last_sync_size_);
+      last_sync_size_ = filesize_;
+    }
+
+    return Status::OK();
+  }
+
+  virtual Status Sync() {
+    Status s = Flush();
+    if (!s.ok()) {
+      return s;
+    }
+    TEST_KILL_RANDOM(rocksdb_kill_odds);
+    if (pending_sync_ && fdatasync(fd_) < 0) {
+      return IOError(filename_, errno);
+    }
+    TEST_KILL_RANDOM(rocksdb_kill_odds);
+    pending_sync_ = false;
+    return Status::OK();
+  }
+
+  virtual Status Fsync() {
+    Status s = Flush();
+    if (!s.ok()) {
+      return s;
+    }
+    TEST_KILL_RANDOM(rocksdb_kill_odds);
+    if (pending_fsync_ && fsync(fd_) < 0) {
+      return IOError(filename_, errno);
+    }
+    TEST_KILL_RANDOM(rocksdb_kill_odds);
+    pending_fsync_ = false;
+    pending_sync_ = false;
+    return Status::OK();
+  }
+
+  virtual uint64_t GetFileSize() {
+    return filesize_;
+  }
+
+  virtual Status InvalidateCache(size_t offset, size_t length) {
+#ifndef OS_LINUX
+    return Status::OK();
+#else
+    // free OS pages
+    int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED);
+    if (ret == 0) {
+      return Status::OK();
+    }
+    return IOError(filename_, errno);
+#endif
+  }
+
+#ifdef ROCKSDB_FALLOCATE_PRESENT
+  virtual Status Allocate(off_t offset, off_t len) {
+    TEST_KILL_RANDOM(rocksdb_kill_odds);
+    int alloc_status = fallocate(
+        fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0, offset, len);
+    if (alloc_status == 0) {
+      return Status::OK();
+    } else {
+      return IOError(filename_, errno);
+    }
+  }
+
+  virtual Status RangeSync(off64_t offset, off64_t nbytes) {
+    if (sync_file_range(fd_, offset, nbytes, SYNC_FILE_RANGE_WRITE) == 0) {
+      return Status::OK();
+    } else {
+      return IOError(filename_, errno);
+    }
+  }
+  virtual size_t GetUniqueId(char* id, size_t max_size) const {
+    return GetUniqueIdFromFile(fd_, id, max_size);
+  }
+#endif
+};
+
+class PosixRandomRWFile : public RandomRWFile {
+ private:
+  const std::string filename_;
+  int fd_;
+  bool pending_sync_;
+  bool pending_fsync_;
+#ifdef ROCKSDB_FALLOCATE_PRESENT
+  bool fallocate_with_keep_size_;
+#endif
+
+ public:
+  PosixRandomRWFile(const std::string& fname, int fd, const EnvOptions& options)
+      : filename_(fname),
+        fd_(fd),
+        pending_sync_(false),
+        pending_fsync_(false) {
+#ifdef ROCKSDB_FALLOCATE_PRESENT
+    fallocate_with_keep_size_ = options.fallocate_with_keep_size;
+#endif
+    assert(!options.use_mmap_writes && !options.use_mmap_reads);
+  }
+
+  ~PosixRandomRWFile() {
+    if (fd_ >= 0) {
+      Close();
+    }
+  }
+
+  virtual Status Write(uint64_t offset, const Slice& data) {
+    const char* src = data.data();
+    size_t left = data.size();
+    Status s;
+    pending_sync_ = true;
+    pending_fsync_ = true;
+
+    while (left != 0) {
+      ssize_t done = pwrite(fd_, src, left, offset);
+      if (done < 0) {
+        if (errno == EINTR) {
+          continue;
+        }
+        return IOError(filename_, errno);
+      }
+
+      left -= done;
+      src += done;
+      offset += done;
+    }
+
+    return Status::OK();
+  }
+
+  virtual Status Read(uint64_t offset, size_t n, Slice* result,
+                      char* scratch) const {
+    Status s;
+    ssize_t r = pread(fd_, scratch, n, static_cast<off_t>(offset));
+    *result = Slice(scratch, (r < 0) ?
0 : r); + if (r < 0) { + s = IOError(filename_, errno); + } + return s; + } + + virtual Status Close() { + Status s = Status::OK(); + if (fd_ >= 0 && close(fd_) < 0) { + s = IOError(filename_, errno); + } + fd_ = -1; + return s; + } + + virtual Status Sync() { + if (pending_sync_ && fdatasync(fd_) < 0) { + return IOError(filename_, errno); + } + pending_sync_ = false; + return Status::OK(); + } + + virtual Status Fsync() { + if (pending_fsync_ && fsync(fd_) < 0) { + return IOError(filename_, errno); + } + pending_fsync_ = false; + pending_sync_ = false; + return Status::OK(); + } + +#ifdef ROCKSDB_FALLOCATE_PRESENT + virtual Status Allocate(off_t offset, off_t len) { + TEST_KILL_RANDOM(rocksdb_kill_odds); + int alloc_status = fallocate( + fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0, offset, len); + if (alloc_status == 0) { + return Status::OK(); + } else { + return IOError(filename_, errno); + } + } +#endif +}; + +class PosixDirectory : public Directory { + public: + explicit PosixDirectory(int fd) : fd_(fd) {} + ~PosixDirectory() { + close(fd_); + } + + virtual Status Fsync() { + if (fsync(fd_) == -1) { + return IOError("directory", errno); + } + return Status::OK(); + } + + private: + int fd_; +}; + +static int LockOrUnlock(const std::string& fname, int fd, bool lock) { + mutex_lockedFiles.Lock(); + if (lock) { + // If it already exists in the lockedFiles set, then it is already locked, + // and fail this lock attempt. Otherwise, insert it into lockedFiles. + // This check is needed because fcntl() does not detect lock conflict + // if the fcntl is issued by the same thread that earlier acquired + // this lock. + if (lockedFiles.insert(fname).second == false) { + mutex_lockedFiles.Unlock(); + errno = ENOLCK; + return -1; + } + } else { + // If we are unlocking, then verify that we had locked it earlier, + // it should already exist in lockedFiles. Remove it from lockedFiles. + if (lockedFiles.erase(fname) != 1) { + mutex_lockedFiles.Unlock(); + errno = ENOLCK; + return -1; + } + } + errno = 0; + struct flock f; + memset(&f, 0, sizeof(f)); + f.l_type = (lock ? 
F_WRLCK : F_UNLCK);
+  f.l_whence = SEEK_SET;
+  f.l_start = 0;
+  f.l_len = 0;        // Lock/unlock entire file
+  int value = fcntl(fd, F_SETLK, &f);
+  if (value == -1 && lock) {
+    // if there is an error in locking, then remove the pathname from lockedFiles
+    lockedFiles.erase(fname);
+  }
+  mutex_lockedFiles.Unlock();
+  return value;
+}
+
+class PosixFileLock : public FileLock {
+ public:
+  int fd_;
+  std::string filename;
+};
+
+namespace {
+void PthreadCall(const char* label, int result) {
+  if (result != 0) {
+    fprintf(stderr, "pthread %s: %s\n", label, strerror(result));
+    exit(1);
+  }
+}
+}
+
+class PosixEnv : public Env {
+ public:
+  PosixEnv();
+
+  virtual ~PosixEnv() {
+    for (const auto tid : threads_to_join_) {
+      pthread_join(tid, nullptr);
+    }
+  }
+
+  void SetFD_CLOEXEC(int fd, const EnvOptions* options) {
+    if ((options == nullptr || options->set_fd_cloexec) && fd > 0) {
+      fcntl(fd, F_SETFD, fcntl(fd, F_GETFD) | FD_CLOEXEC);
+    }
+  }
+
+  virtual Status NewSequentialFile(const std::string& fname,
+                                   unique_ptr<SequentialFile>* result,
+                                   const EnvOptions& options) {
+    result->reset();
+    FILE* f = nullptr;
+    do {
+      f = fopen(fname.c_str(), "r");
+    } while (f == nullptr && errno == EINTR);
+    if (f == nullptr) {
+      *result = nullptr;
+      return IOError(fname, errno);
+    } else {
+      int fd = fileno(f);
+      SetFD_CLOEXEC(fd, &options);
+      result->reset(new PosixSequentialFile(fname, f, options));
+      return Status::OK();
+    }
+  }
+
+  virtual Status NewRandomAccessFile(const std::string& fname,
+                                     unique_ptr<RandomAccessFile>* result,
+                                     const EnvOptions& options) {
+    result->reset();
+    Status s;
+    int fd = open(fname.c_str(), O_RDONLY);
+    SetFD_CLOEXEC(fd, &options);
+    if (fd < 0) {
+      s = IOError(fname, errno);
+    } else if (options.use_mmap_reads && sizeof(void*) >= 8) {
+      // Use of mmap for random reads has been removed because it
+      // kills performance when storage is fast.
+      // Use mmap when virtual address-space is plentiful.
+      uint64_t size;
+      s = GetFileSize(fname, &size);
+      if (s.ok()) {
+        void* base = mmap(nullptr, size, PROT_READ, MAP_SHARED, fd, 0);
+        if (base != MAP_FAILED) {
+          result->reset(new PosixMmapReadableFile(fd, fname, base,
+                                                  size, options));
+        } else {
+          s = IOError(fname, errno);
+        }
+      }
+      close(fd);
+    } else {
+      result->reset(new PosixRandomAccessFile(fname, fd, options));
+    }
+    return s;
+  }
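+
+  // Opening-path sketch: mmap-based reads are opt-in through EnvOptions and,
+  // per the check above, only honored when the virtual address space is
+  // 64-bit. Illustrative usage (hypothetical file name):
+  //
+  //   EnvOptions opts;
+  //   opts.use_mmap_reads = true;      // request PosixMmapReadableFile
+  //   unique_ptr<RandomAccessFile> file;
+  //   Status s = Env::Default()->NewRandomAccessFile("/db/MANIFEST-000001",
+  //                                                  &file, opts);
+  //   // on 32-bit builds the same call silently returns the pread-based
+  //   // PosixRandomAccessFile instead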
+
+  virtual Status NewWritableFile(const std::string& fname,
+                                 unique_ptr<WritableFile>* result,
+                                 const EnvOptions& options) {
+    result->reset();
+    Status s;
+    int fd = -1;
+    do {
+      fd = open(fname.c_str(), O_CREAT | O_RDWR | O_TRUNC, 0644);
+    } while (fd < 0 && errno == EINTR);
+    if (fd < 0) {
+      s = IOError(fname, errno);
+    } else {
+      SetFD_CLOEXEC(fd, &options);
+      if (options.use_mmap_writes) {
+        if (!checkedDiskForMmap_) {
+          // this will be executed once in the program's lifetime.
+          // do not use mmapWrite on non ext-3/xfs/tmpfs systems.
+          if (!SupportsFastAllocate(fname)) {
+            forceMmapOff = true;
+          }
+          checkedDiskForMmap_ = true;
+        }
+      }
+      if (options.use_mmap_writes && !forceMmapOff) {
+        result->reset(new PosixMmapFile(fname, fd, page_size_, options));
+      } else {
+        // disable mmap writes
+        EnvOptions no_mmap_writes_options = options;
+        no_mmap_writes_options.use_mmap_writes = false;
+
+        result->reset(
+            new PosixWritableFile(fname, fd, 65536, no_mmap_writes_options)
+        );
+      }
+    }
+    return s;
+  }
+
+  virtual Status NewRandomRWFile(const std::string& fname,
+                                 unique_ptr<RandomRWFile>* result,
+                                 const EnvOptions& options) {
+    result->reset();
+    // no support for mmap yet
+    if (options.use_mmap_writes || options.use_mmap_reads) {
+      return Status::NotSupported("No support for mmap read/write yet");
+    }
+    Status s;
+    const int fd = open(fname.c_str(), O_CREAT | O_RDWR, 0644);
+    if (fd < 0) {
+      s = IOError(fname, errno);
+    } else {
+      SetFD_CLOEXEC(fd, &options);
+      result->reset(new PosixRandomRWFile(fname, fd, options));
+    }
+    return s;
+  }
+
+  virtual Status NewDirectory(const std::string& name,
+                              unique_ptr<Directory>* result) {
+    result->reset();
+    const int fd = open(name.c_str(), 0);
+    if (fd < 0) {
+      return IOError(name, errno);
+    } else {
+      result->reset(new PosixDirectory(fd));
+    }
+    return Status::OK();
+  }
+
+  virtual bool FileExists(const std::string& fname) {
+    return access(fname.c_str(), F_OK) == 0;
+  }
+
+  virtual Status GetChildren(const std::string& dir,
+                             std::vector<std::string>* result) {
+    result->clear();
+    DIR* d = opendir(dir.c_str());
+    if (d == nullptr) {
+      return IOError(dir, errno);
+    }
+    struct dirent* entry;
+    while ((entry = readdir(d)) != nullptr) {
+      result->push_back(entry->d_name);
+    }
+    closedir(d);
+    return Status::OK();
+  }
+
+  virtual Status DeleteFile(const std::string& fname) {
+    Status result;
+    if (unlink(fname.c_str()) != 0) {
+      result = IOError(fname, errno);
+    }
+    return result;
+  };
+
+  virtual Status CreateDir(const std::string& name) {
+    Status result;
+    if (mkdir(name.c_str(), 0755) != 0) {
+      result = IOError(name, errno);
+    }
+    return result;
+  };
+
+  virtual Status CreateDirIfMissing(const std::string& name) {
+    Status result;
+    if (mkdir(name.c_str(), 0755) != 0) {
+      if (errno != EEXIST) {
+        result = IOError(name, errno);
+      } else if (!DirExists(name)) {  // Check that name is actually a
+                                      // directory.
+        // Message is taken from mkdir
+        result = Status::IOError("`"+name+"' exists but is not a directory");
+      }
+    }
+    return result;
+  };
+
+  virtual Status DeleteDir(const std::string& name) {
+    Status result;
+    if (rmdir(name.c_str()) != 0) {
+      result = IOError(name, errno);
+    }
+    return result;
+  };
+
+  virtual Status GetFileSize(const std::string& fname, uint64_t* size) {
+    Status s;
+    struct stat sbuf;
+    if (stat(fname.c_str(), &sbuf) != 0) {
+      *size = 0;
+      s = IOError(fname, errno);
+    } else {
+      *size = sbuf.st_size;
+    }
+    return s;
+  }
+
+  virtual Status GetFileModificationTime(const std::string& fname,
+                                         uint64_t* file_mtime) {
+    struct stat s;
+    if (stat(fname.c_str(), &s) != 0) {
+      return IOError(fname, errno);
+    }
+    *file_mtime = static_cast<uint64_t>(s.st_mtime);
+    return Status::OK();
+  }
+  virtual Status RenameFile(const std::string& src, const std::string& target) {
+    Status result;
+    if (rename(src.c_str(), target.c_str()) != 0) {
+      result = IOError(src, errno);
+    }
+    return result;
+  }
+
+  virtual Status LockFile(const std::string& fname, FileLock** lock) {
+    *lock = nullptr;
+    Status result;
+    int fd = open(fname.c_str(), O_RDWR | O_CREAT, 0644);
+    if (fd < 0) {
+      result = IOError(fname, errno);
+    } else if (LockOrUnlock(fname, fd, true) == -1) {
+      result = IOError("lock " + fname, errno);
+      close(fd);
+    } else {
+      SetFD_CLOEXEC(fd, nullptr);
+      PosixFileLock* my_lock = new PosixFileLock;
+      my_lock->fd_ = fd;
+      my_lock->filename = fname;
+      *lock = my_lock;
+    }
+    return result;
+  }
+
+  virtual Status UnlockFile(FileLock* lock) {
+    PosixFileLock* my_lock = reinterpret_cast<PosixFileLock*>(lock);
+    Status result;
+    if (LockOrUnlock(my_lock->filename, my_lock->fd_, false) == -1) {
+      result = IOError("unlock", errno);
+    }
+    close(my_lock->fd_);
+    delete my_lock;
+    return result;
+  }
+
+  virtual void Schedule(void (*function)(void*), void* arg, Priority pri = LOW);
+
+  virtual void StartThread(void (*function)(void* arg), void* arg);
+
+  virtual void WaitForJoin();
+
+  virtual unsigned int GetThreadPoolQueueLen(Priority pri = LOW) const override;
+
+  virtual Status GetTestDirectory(std::string* result) {
+    const char* env = getenv("TEST_TMPDIR");
+    if (env && env[0] != '\0') {
+      *result = env;
+    } else {
+      char buf[100];
+      snprintf(buf, sizeof(buf), "/tmp/rocksdbtest-%d", int(geteuid()));
+      *result = buf;
+    }
+    // Directory may already exist
+    CreateDir(*result);
+    return Status::OK();
+  }
+
+  static uint64_t gettid(pthread_t tid) {
+    uint64_t thread_id = 0;
+    memcpy(&thread_id, &tid, std::min(sizeof(thread_id), sizeof(tid)));
+    return thread_id;
+  }
+
+  static uint64_t gettid() {
+    pthread_t tid = pthread_self();
+    return gettid(tid);
+  }
+
+  virtual Status NewLogger(const std::string& fname,
+                           shared_ptr<Logger>* result) {
+    FILE* f = fopen(fname.c_str(), "w");
+    if (f == nullptr) {
+      result->reset();
+      return IOError(fname, errno);
+    } else {
+      int fd = fileno(f);
+      SetFD_CLOEXEC(fd, nullptr);
+      result->reset(new PosixLogger(f, &PosixEnv::gettid, this));
+      return Status::OK();
+    }
+  }
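+
+  // Locking sketch: LockFile() above is the database-wide mutual exclusion
+  // primitive (one LOCK file per DB directory). Illustrative usage with a
+  // hypothetical path; error handling elided:
+  //
+  //   FileLock* lock = nullptr;
+  //   if (env->LockFile("/db/LOCK", &lock).ok()) {
+  //     // ... exclusive access to the database directory ...
+  //     env->UnlockFile(lock);         // releases the fcntl lock and the fd
+  //   }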
+
+  virtual uint64_t NowMicros() {
+    struct timeval tv;
+    // TODO(kailiu) MAC DON'T HAVE THIS
+    gettimeofday(&tv, nullptr);
+    return static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
+  }
+
+  virtual uint64_t NowNanos() {
+#ifdef OS_LINUX
+    struct timespec ts;
+    clock_gettime(CLOCK_MONOTONIC, &ts);
+    return static_cast<uint64_t>(ts.tv_sec) * 1000000000 + ts.tv_nsec;
+#elif __MACH__
+    clock_serv_t cclock;
+    mach_timespec_t ts;
+    host_get_clock_service(mach_host_self(), CALENDAR_CLOCK, &cclock);
+    clock_get_time(cclock, &ts);
+    mach_port_deallocate(mach_task_self(), cclock);
+#endif
+    return static_cast<uint64_t>(ts.tv_sec) * 1000000000 + ts.tv_nsec;
+  }
+
+  virtual void SleepForMicroseconds(int micros) {
+    usleep(micros);
+  }
+
+  virtual Status GetHostName(char* name, uint64_t len) {
+    int ret = gethostname(name, len);
+    if (ret < 0) {
+      if (errno == EFAULT || errno == EINVAL)
+        return Status::InvalidArgument(strerror(errno));
+      else
+        return IOError("GetHostName", errno);
+    }
+    return Status::OK();
+  }
+
+  virtual Status GetCurrentTime(int64_t* unix_time) {
+    time_t ret = time(nullptr);
+    if (ret == (time_t) -1) {
+      return IOError("GetCurrentTime", errno);
+    }
+    *unix_time = (int64_t) ret;
+    return Status::OK();
+  }
+
+  virtual Status GetAbsolutePath(const std::string& db_path,
+                                 std::string* output_path) {
+    if (db_path.find('/') == 0) {
+      *output_path = db_path;
+      return Status::OK();
+    }
+
+    char the_path[256];
+    char* ret = getcwd(the_path, 256);
+    if (ret == nullptr) {
+      return Status::IOError(strerror(errno));
+    }
+
+    *output_path = ret;
+    return Status::OK();
+  }
+
+  // Allow increasing the number of worker threads.
+  virtual void SetBackgroundThreads(int num, Priority pri) {
+    assert(pri >= Priority::LOW && pri <= Priority::HIGH);
+    thread_pools_[pri].SetBackgroundThreads(num);
+  }
+
+  virtual std::string TimeToString(uint64_t secondsSince1970) {
+    const time_t seconds = (time_t)secondsSince1970;
+    struct tm t;
+    int maxsize = 64;
+    std::string dummy;
+    dummy.reserve(maxsize);
+    dummy.resize(maxsize);
+    char* p = &dummy[0];
+    localtime_r(&seconds, &t);
+    snprintf(p, maxsize,
+             "%04d/%02d/%02d-%02d:%02d:%02d ",
+             t.tm_year + 1900,
+             t.tm_mon + 1,
+             t.tm_mday,
+             t.tm_hour,
+             t.tm_min,
+             t.tm_sec);
+    return dummy;
+  }
+
+  EnvOptions OptimizeForLogWrite(const EnvOptions& env_options) const {
+    EnvOptions optimized = env_options;
+    optimized.use_mmap_writes = false;
+    // TODO(icanadi) it's faster if fallocate_with_keep_size is false, but it
+    // breaks TransactionLogIteratorStallAtLastRecord unit test. Fix the unit
+    // test and make this false
+    optimized.fallocate_with_keep_size = true;
+    return optimized;
+  }
+
+  EnvOptions OptimizeForManifestWrite(const EnvOptions& env_options) const {
+    EnvOptions optimized = env_options;
+    optimized.use_mmap_writes = false;
+    optimized.fallocate_with_keep_size = true;
+    return optimized;
+  }
+
+ private:
+  bool checkedDiskForMmap_;
+  bool forceMmapOff;  // do we override Env options?
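+
+  // Timing sketch: NowNanos() above is monotonic on Linux (CLOCK_MONOTONIC)
+  // and calendar-clock based on Mach, so it is best suited to measuring
+  // intervals. Illustrative usage (DoSomeWork is a hypothetical workload):
+  //
+  //   Env* env = Env::Default();
+  //   uint64_t start = env->NowNanos();
+  //   DoSomeWork();
+  //   uint64_t elapsed_ns = env->NowNanos() - start;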
+
+  // Returns true iff the named directory exists and is a directory.
+  virtual bool DirExists(const std::string& dname) {
+    struct stat statbuf;
+    if (stat(dname.c_str(), &statbuf) == 0) {
+      return S_ISDIR(statbuf.st_mode);
+    }
+    return false;  // stat() failed, return false
+  }
+
+  bool SupportsFastAllocate(const std::string& path) {
+#ifdef ROCKSDB_FALLOCATE_PRESENT
+    struct statfs s;
+    if (statfs(path.c_str(), &s)) {
+      return false;
+    }
+    switch (s.f_type) {
+      case EXT4_SUPER_MAGIC:
+        return true;
+      case XFS_SUPER_MAGIC:
+        return true;
+      case TMPFS_MAGIC:
+        return true;
+      default:
+        return false;
+    }
+#else
+    return false;
+#endif
+  }
+
+  size_t page_size_;
+
+  class ThreadPool {
+   public:
+    ThreadPool()
+        : total_threads_limit_(1),
+          bgthreads_(0),
+          queue_(),
+          queue_len_(0),
+          exit_all_threads_(false) {
+      PthreadCall("mutex_init", pthread_mutex_init(&mu_, nullptr));
+      PthreadCall("cvar_init", pthread_cond_init(&bgsignal_, nullptr));
+    }
+
+    ~ThreadPool() {
+      PthreadCall("lock", pthread_mutex_lock(&mu_));
+      assert(!exit_all_threads_);
+      exit_all_threads_ = true;
+      PthreadCall("signalall", pthread_cond_broadcast(&bgsignal_));
+      PthreadCall("unlock", pthread_mutex_unlock(&mu_));
+      for (const auto tid : bgthreads_) {
+        pthread_join(tid, nullptr);
+      }
+    }
+
+    // Return true if there is at least one thread that needs to terminate.
+    bool HasExcessiveThread() {
+      return static_cast<int>(bgthreads_.size()) > total_threads_limit_;
+    }
+
+    // Return true iff the current thread is the excessive thread to terminate.
+    // Always terminate the running thread that was added last, even if there
+    // is more than one thread to terminate.
+    bool IsLastExcessiveThread(size_t thread_id) {
+      return HasExcessiveThread() && thread_id == bgthreads_.size() - 1;
+    }
+
+    // Whether this thread is one of the threads to terminate.
+    bool IsExcessiveThread(size_t thread_id) {
+      return static_cast<int>(thread_id) >= total_threads_limit_;
+    }
+
+    void BGThread(size_t thread_id) {
+      while (true) {
+        // Wait until there is an item that is ready to run
+        PthreadCall("lock", pthread_mutex_lock(&mu_));
+        // Stop waiting if the thread needs to do work or needs to terminate.
+        while (!exit_all_threads_ && !IsLastExcessiveThread(thread_id) &&
+               (queue_.empty() || IsExcessiveThread(thread_id))) {
+          PthreadCall("wait", pthread_cond_wait(&bgsignal_, &mu_));
+        }
+        if (exit_all_threads_) {  // mechanism to let BG threads exit safely
+          PthreadCall("unlock", pthread_mutex_unlock(&mu_));
+          break;
+        }
+        if (IsLastExcessiveThread(thread_id)) {
+          // Current thread is the last generated one and is excessive.
+          // We always terminate excessive threads in the reverse order of
+          // generation time.
+          auto terminating_thread = bgthreads_.back();
+          pthread_detach(terminating_thread);
+          bgthreads_.pop_back();
+          if (HasExcessiveThread()) {
+            // There is still at least one more excessive thread to terminate.
+            WakeUpAllThreads();
+          }
+          PthreadCall("unlock", pthread_mutex_unlock(&mu_));
+          // TODO(sdong): temp logging. Need to help debugging. Remove it when
+          // the feature is proved to be stable.
+          fprintf(stdout, "Bg thread %zu terminates %llx\n", thread_id,
+                  static_cast<long long unsigned int>(gettid()));
+          break;
+        }
+        void (*function)(void*) = queue_.front().function;
+        void* arg = queue_.front().arg;
+        queue_.pop_front();
+        queue_len_.store(queue_.size(), std::memory_order_relaxed);
+
+        PthreadCall("unlock", pthread_mutex_unlock(&mu_));
+        (*function)(arg);
+      }
+    }
+
+    // Helper struct for passing arguments when creating threads.
+    struct BGThreadMetadata {
+      ThreadPool* thread_pool_;
+      size_t thread_id_;  // Id of the thread within the pool.
+      explicit BGThreadMetadata(ThreadPool* thread_pool, size_t thread_id)
+          : thread_pool_(thread_pool), thread_id_(thread_id) {}
+    };
+
+    static void* BGThreadWrapper(void* arg) {
+      BGThreadMetadata* meta = reinterpret_cast<BGThreadMetadata*>(arg);
+      size_t thread_id = meta->thread_id_;
+      ThreadPool* tp = meta->thread_pool_;
+      delete meta;
+      tp->BGThread(thread_id);
+      return nullptr;
+    }
+
+    void WakeUpAllThreads() {
+      PthreadCall("signalall", pthread_cond_broadcast(&bgsignal_));
+    }
+
+    void SetBackgroundThreads(int num) {
+      PthreadCall("lock", pthread_mutex_lock(&mu_));
+      if (exit_all_threads_) {
+        PthreadCall("unlock", pthread_mutex_unlock(&mu_));
+        return;
+      }
+      if (num != total_threads_limit_) {
+        total_threads_limit_ = num;
+        WakeUpAllThreads();
+        StartBGThreads();
+      }
+      assert(total_threads_limit_ > 0);
+      PthreadCall("unlock", pthread_mutex_unlock(&mu_));
+    }
+
+    void StartBGThreads() {
+      // Start background thread if necessary
+      while ((int)bgthreads_.size() < total_threads_limit_) {
+        pthread_t t;
+        PthreadCall(
+            "create thread",
+            pthread_create(&t, nullptr, &ThreadPool::BGThreadWrapper,
+                           new BGThreadMetadata(this, bgthreads_.size())));
+
+        // Set the thread name to aid debugging
+#if defined(_GNU_SOURCE) && defined(__GLIBC_PREREQ)
+#if __GLIBC_PREREQ(2, 12)
+        char name_buf[16];
+        snprintf(name_buf, sizeof name_buf, "rocksdb:bg%zu", bgthreads_.size());
+        name_buf[sizeof name_buf - 1] = '\0';
+        pthread_setname_np(t, name_buf);
+#endif
+#endif
+
+        bgthreads_.push_back(t);
+      }
+    }
+
+    void Schedule(void (*function)(void*), void* arg) {
+      PthreadCall("lock", pthread_mutex_lock(&mu_));
+
+      if (exit_all_threads_) {
+        PthreadCall("unlock", pthread_mutex_unlock(&mu_));
+        return;
+      }
+
+      StartBGThreads();
+
+      // Add to priority queue
+      queue_.push_back(BGItem());
+      queue_.back().function = function;
+      queue_.back().arg = arg;
+      queue_len_.store(queue_.size(), std::memory_order_relaxed);
+
+      if (!HasExcessiveThread()) {
+        // Wake up at least one waiting thread.
+        PthreadCall("signal", pthread_cond_signal(&bgsignal_));
+      } else {
+        // Need to wake up all threads to make sure the one woken
+        // up is not the one to terminate.
+        WakeUpAllThreads();
+      }
+
+      PthreadCall("unlock", pthread_mutex_unlock(&mu_));
+    }
+
+    unsigned int GetQueueLen() const {
+      return queue_len_.load(std::memory_order_relaxed);
+    }
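+
+    // Scheduling sketch: a work item is a plain function pointer plus a
+    // void* argument, matching the BGItem queue below. Illustrative usage
+    // through the public Env API (CompactSomething is hypothetical):
+    //
+    //   static void CompactSomething(void* arg) { /* ... */ }
+    //   Env::Default()->SetBackgroundThreads(4, Env::Priority::LOW);
+    //   Env::Default()->Schedule(&CompactSomething, nullptr,
+    //                            Env::Priority::LOW);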
+
+   private:
+    // Entry per Schedule() call
+    struct BGItem { void* arg; void (*function)(void*); };
+    typedef std::deque<BGItem> BGQueue;
+
+    pthread_mutex_t mu_;
+    pthread_cond_t bgsignal_;
+    int total_threads_limit_;
+    std::vector<pthread_t> bgthreads_;
+    BGQueue queue_;
+    std::atomic_uint queue_len_;  // Queue length. Used for stats reporting.
+    bool exit_all_threads_;
+  };
+
+  std::vector<ThreadPool> thread_pools_;
+
+  pthread_mutex_t mu_;
+  std::vector<pthread_t> threads_to_join_;
+
+};
+
+PosixEnv::PosixEnv() : checkedDiskForMmap_(false),
+                       forceMmapOff(false),
+                       page_size_(getpagesize()),
+                       thread_pools_(Priority::TOTAL) {
+  PthreadCall("mutex_init", pthread_mutex_init(&mu_, nullptr));
+}
+
+void PosixEnv::Schedule(void (*function)(void*), void* arg, Priority pri) {
+  assert(pri >= Priority::LOW && pri <= Priority::HIGH);
+  thread_pools_[pri].Schedule(function, arg);
+}
+
+unsigned int PosixEnv::GetThreadPoolQueueLen(Priority pri) const {
+  assert(pri >= Priority::LOW && pri <= Priority::HIGH);
+  return thread_pools_[pri].GetQueueLen();
+}
+
+namespace {
+struct StartThreadState {
+  void (*user_function)(void*);
+  void* arg;
+};
+}
+static void* StartThreadWrapper(void* arg) {
+  StartThreadState* state = reinterpret_cast<StartThreadState*>(arg);
+  state->user_function(state->arg);
+  delete state;
+  return nullptr;
+}
+
+void PosixEnv::StartThread(void (*function)(void* arg), void* arg) {
+  pthread_t t;
+  StartThreadState* state = new StartThreadState;
+  state->user_function = function;
+  state->arg = arg;
+  PthreadCall("start thread",
+              pthread_create(&t, nullptr, &StartThreadWrapper, state));
+  PthreadCall("lock", pthread_mutex_lock(&mu_));
+  threads_to_join_.push_back(t);
+  PthreadCall("unlock", pthread_mutex_unlock(&mu_));
+}
+
+void PosixEnv::WaitForJoin() {
+  for (const auto tid : threads_to_join_) {
+    pthread_join(tid, nullptr);
+  }
+  threads_to_join_.clear();
+}
+
+}  // namespace
+
+std::string Env::GenerateUniqueId() {
+  std::string uuid_file = "/proc/sys/kernel/random/uuid";
+  if (FileExists(uuid_file)) {
+    std::string uuid;
+    Status s = ReadFileToString(this, uuid_file, &uuid);
+    if (s.ok()) {
+      return uuid;
+    }
+  }
+  // Could not read uuid_file - generate uuid using "nanos-random"
+  Random64 r(time(nullptr));
+  uint64_t random_uuid_portion =
+      r.Uniform(std::numeric_limits<uint64_t>::max());
+  uint64_t nanos_uuid_portion = NowNanos();
+  char uuid2[200];
+  snprintf(uuid2,
+           200,
+           "%lx-%lx",
+           (unsigned long)nanos_uuid_portion,
+           (unsigned long)random_uuid_portion);
+  return uuid2;
+}
+
+Env* Env::Default() {
+  static PosixEnv default_env;
+  return &default_env;
+}
+
+}  // namespace rocksdb
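+
+// Id-generation sketch for GenerateUniqueId() above: it prefers the kernel
+// uuid source and degrades to a "nanos-random" pair. Illustrative usage:
+//
+//   std::string id = Env::Default()->GenerateUniqueId();
+//   // e.g. a kernel uuid from /proc, or "<nanos>-<random>" as a fallback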
diff --git a/util/env_test.cc b/util/env_test.cc
new file mode 100644
index 0000000000..c0d00ce94d
--- /dev/null
+++ b/util/env_test.cc
@@ -0,0 +1,741 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <sys/types.h>
+
+#include <iostream>
+#include <unordered_set>
+
+#ifdef OS_LINUX
+#include <linux/fs.h>
+#include <sys/ioctl.h>
+#endif
+
+#include "rocksdb/env.h"
+#include "port/port.h"
+#include "util/coding.h"
+#include "util/log_buffer.h"
+#include "util/mutexlock.h"
+#include "util/testharness.h"
+
+namespace rocksdb {
+
+static const int kDelayMicros = 100000;
+
+class EnvPosixTest {
+ private:
+  port::Mutex mu_;
+  std::string events_;
+
+ public:
+  Env* env_;
+  EnvPosixTest() : env_(Env::Default()) { }
+};
+
+static void SetBool(void* ptr) {
+  reinterpret_cast<port::AtomicPointer*>(ptr)->NoBarrier_Store(ptr);
+}
+
+TEST(EnvPosixTest, RunImmediately) {
+  port::AtomicPointer called (nullptr);
+  env_->Schedule(&SetBool, &called);
+  Env::Default()->SleepForMicroseconds(kDelayMicros);
+  ASSERT_TRUE(called.NoBarrier_Load() != nullptr);
+}
+
+TEST(EnvPosixTest, RunMany) {
+  port::AtomicPointer last_id (nullptr);
+
+  struct CB {
+    port::AtomicPointer* last_id_ptr;  // Pointer to shared slot
+    uintptr_t id;                      // Order# for the execution of this callback
+
+    CB(port::AtomicPointer* p, int i) : last_id_ptr(p), id(i) { }
+
+    static void Run(void* v) {
+      CB* cb = reinterpret_cast<CB*>(v);
+      void* cur = cb->last_id_ptr->NoBarrier_Load();
+      ASSERT_EQ(cb->id-1, reinterpret_cast<uintptr_t>(cur));
+      cb->last_id_ptr->Release_Store(reinterpret_cast<void*>(cb->id));
+    }
+  };
+
+  // Schedule in different order than start time
+  CB cb1(&last_id, 1);
+  CB cb2(&last_id, 2);
+  CB cb3(&last_id, 3);
+  CB cb4(&last_id, 4);
+  env_->Schedule(&CB::Run, &cb1);
+  env_->Schedule(&CB::Run, &cb2);
+  env_->Schedule(&CB::Run, &cb3);
+  env_->Schedule(&CB::Run, &cb4);
+
+  Env::Default()->SleepForMicroseconds(kDelayMicros);
+  void* cur = last_id.Acquire_Load();
+  ASSERT_EQ(4U, reinterpret_cast<uintptr_t>(cur));
+}
+
+struct State {
+  port::Mutex mu;
+  int val;
+  int num_running;
+};
+
+static void ThreadBody(void* arg) {
+  State* s = reinterpret_cast<State*>(arg);
+  s->mu.Lock();
+  s->val += 1;
+  s->num_running -= 1;
+  s->mu.Unlock();
+}
+
+TEST(EnvPosixTest, StartThread) {
+  State state;
+  state.val = 0;
+  state.num_running = 3;
+  for (int i = 0; i < 3; i++) {
+    env_->StartThread(&ThreadBody, &state);
+  }
+  while (true) {
+    state.mu.Lock();
+    int num = state.num_running;
+    state.mu.Unlock();
+    if (num == 0) {
+      break;
+    }
+    Env::Default()->SleepForMicroseconds(kDelayMicros);
+  }
+  ASSERT_EQ(state.val, 3);
+}
+
+TEST(EnvPosixTest, TwoPools) {
+
+  class CB {
+   public:
+    CB(const std::string& pool_name, int pool_size)
+        : mu_(),
+          num_running_(0),
+          num_finished_(0),
+          pool_size_(pool_size),
+          pool_name_(pool_name) { }
+
+    static void Run(void* v) {
+      CB* cb = reinterpret_cast<CB*>(v);
+      cb->Run();
+    }
+
+    void Run() {
+      {
+        MutexLock l(&mu_);
+        num_running_++;
+        std::cout << "Pool " << pool_name_ << ": "
+                  << num_running_ << " running threads.\n";
+        // make sure we don't have more than pool_size_ jobs running.
+        ASSERT_LE(num_running_, pool_size_);
+      }
+
+      // sleep for 1 sec
+      Env::Default()->SleepForMicroseconds(1000000);
+
+      {
+        MutexLock l(&mu_);
+        num_running_--;
+        num_finished_++;
+      }
+    }
+
+    int NumFinished() {
+      MutexLock l(&mu_);
+      return num_finished_;
+    }
+
+   private:
+    port::Mutex mu_;
+    int num_running_;
+    int num_finished_;
+    int pool_size_;
+    std::string pool_name_;
+  };
+
+  const int kLowPoolSize = 2;
+  const int kHighPoolSize = 4;
+  const int kJobs = 8;
+
+  CB low_pool_job("low", kLowPoolSize);
+  CB high_pool_job("high", kHighPoolSize);
+
+  env_->SetBackgroundThreads(kLowPoolSize);
+  env_->SetBackgroundThreads(kHighPoolSize, Env::Priority::HIGH);
+
+  ASSERT_EQ(0U, env_->GetThreadPoolQueueLen(Env::Priority::LOW));
+  ASSERT_EQ(0U, env_->GetThreadPoolQueueLen(Env::Priority::HIGH));
+
+  // schedule same number of jobs in each pool
+  for (int i = 0; i < kJobs; i++) {
+    env_->Schedule(&CB::Run, &low_pool_job);
+    env_->Schedule(&CB::Run, &high_pool_job, Env::Priority::HIGH);
+  }
+  // Wait a short while for the jobs to be dispatched.
+  Env::Default()->SleepForMicroseconds(kDelayMicros);
+  ASSERT_EQ((unsigned int)(kJobs - kLowPoolSize),
+            env_->GetThreadPoolQueueLen());
+  ASSERT_EQ((unsigned int)(kJobs - kLowPoolSize),
+            env_->GetThreadPoolQueueLen(Env::Priority::LOW));
+  ASSERT_EQ((unsigned int)(kJobs - kHighPoolSize),
+            env_->GetThreadPoolQueueLen(Env::Priority::HIGH));
+
+  // wait for all jobs to finish
+  while (low_pool_job.NumFinished() < kJobs ||
+         high_pool_job.NumFinished() < kJobs) {
+    env_->SleepForMicroseconds(kDelayMicros);
+  }
+
+  ASSERT_EQ(0U, env_->GetThreadPoolQueueLen(Env::Priority::LOW));
+  ASSERT_EQ(0U, env_->GetThreadPoolQueueLen(Env::Priority::HIGH));
+}
+
+TEST(EnvPosixTest, DecreaseNumBgThreads) {
+  class SleepingBackgroundTask {
+   public:
+    explicit SleepingBackgroundTask()
+        : bg_cv_(&mutex_), should_sleep_(true), sleeping_(false) {}
+    void DoSleep() {
+      MutexLock l(&mutex_);
+      sleeping_ = true;
+      while (should_sleep_) {
+        bg_cv_.Wait();
+      }
+      sleeping_ = false;
+      bg_cv_.SignalAll();
+    }
+
+    void WakeUp() {
+      MutexLock l(&mutex_);
+      should_sleep_ = false;
+      bg_cv_.SignalAll();
+
+      while (sleeping_) {
+        bg_cv_.Wait();
+      }
+    }
+
+    bool IsSleeping() {
+      MutexLock l(&mutex_);
+      return sleeping_;
+    }
+
+    static void DoSleepTask(void* arg) {
+      reinterpret_cast<SleepingBackgroundTask*>(arg)->DoSleep();
+    }
+
+   private:
+    port::Mutex mutex_;
+    port::CondVar bg_cv_;  // Signalled when background work finishes
+    bool should_sleep_;
+    bool sleeping_;
+  };
+
+  std::vector<SleepingBackgroundTask> tasks(10);
+
+  // Set the number of threads to 1 first.
+  env_->SetBackgroundThreads(1, Env::Priority::HIGH);
+  Env::Default()->SleepForMicroseconds(kDelayMicros);
+
+  // Schedule 3 tasks. Task 0 running; tasks 1, 2 waiting.
+  for (size_t i = 0; i < 3; i++) {
+    env_->Schedule(&SleepingBackgroundTask::DoSleepTask, &tasks[i],
+                   Env::Priority::HIGH);
+    Env::Default()->SleepForMicroseconds(kDelayMicros);
+  }
+  ASSERT_EQ(2U, env_->GetThreadPoolQueueLen(Env::Priority::HIGH));
+  ASSERT_TRUE(tasks[0].IsSleeping());
+  ASSERT_TRUE(!tasks[1].IsSleeping());
+  ASSERT_TRUE(!tasks[2].IsSleeping());
+
+  // Increase to 2 threads. Tasks 0, 1 running; task 2 waiting.
+  env_->SetBackgroundThreads(2, Env::Priority::HIGH);
+  Env::Default()->SleepForMicroseconds(kDelayMicros);
+  ASSERT_EQ(1U, env_->GetThreadPoolQueueLen(Env::Priority::HIGH));
+  ASSERT_TRUE(tasks[0].IsSleeping());
+  ASSERT_TRUE(tasks[1].IsSleeping());
+  ASSERT_TRUE(!tasks[2].IsSleeping());
+
+  // Shrink back to 1 thread.
+  env_->SetBackgroundThreads(1, Env::Priority::HIGH);
+  Env::Default()->SleepForMicroseconds(kDelayMicros);
+  ASSERT_EQ(1U, env_->GetThreadPoolQueueLen(Env::Priority::HIGH));
+  ASSERT_TRUE(tasks[0].IsSleeping());
+  ASSERT_TRUE(tasks[1].IsSleeping());
+  ASSERT_TRUE(!tasks[2].IsSleeping());
+
+  // Task 1 finishes. Task 0 running, 2 waiting.
+  tasks[1].WakeUp();
+  Env::Default()->SleepForMicroseconds(kDelayMicros);
+  ASSERT_EQ(1U, env_->GetThreadPoolQueueLen(Env::Priority::HIGH));
+  ASSERT_TRUE(tasks[0].IsSleeping());
+  ASSERT_TRUE(!tasks[1].IsSleeping());
+  ASSERT_TRUE(!tasks[2].IsSleeping());
+
+  // Increase to 5 threads. Tasks 0 and 2 running.
+  env_->SetBackgroundThreads(5, Env::Priority::HIGH);
+  Env::Default()->SleepForMicroseconds(kDelayMicros);
+  ASSERT_EQ(0U, env_->GetThreadPoolQueueLen(Env::Priority::HIGH));
+  ASSERT_TRUE(tasks[0].IsSleeping());
+  ASSERT_TRUE(tasks[2].IsSleeping());
+
+  // Change the number of threads a couple of times while there are not
+  // enough tasks to keep them all busy.
+  env_->SetBackgroundThreads(7, Env::Priority::HIGH);
+  Env::Default()->SleepForMicroseconds(kDelayMicros);
+  tasks[2].WakeUp();
+  ASSERT_EQ(0U, env_->GetThreadPoolQueueLen(Env::Priority::HIGH));
+  env_->SetBackgroundThreads(3, Env::Priority::HIGH);
+  Env::Default()->SleepForMicroseconds(kDelayMicros);
+  ASSERT_EQ(0U, env_->GetThreadPoolQueueLen(Env::Priority::HIGH));
+  env_->SetBackgroundThreads(4, Env::Priority::HIGH);
+  Env::Default()->SleepForMicroseconds(kDelayMicros);
+  ASSERT_EQ(0U, env_->GetThreadPoolQueueLen(Env::Priority::HIGH));
+  env_->SetBackgroundThreads(5, Env::Priority::HIGH);
+  Env::Default()->SleepForMicroseconds(kDelayMicros);
+  ASSERT_EQ(0U, env_->GetThreadPoolQueueLen(Env::Priority::HIGH));
+  env_->SetBackgroundThreads(4, Env::Priority::HIGH);
+  Env::Default()->SleepForMicroseconds(kDelayMicros);
+  ASSERT_EQ(0U, env_->GetThreadPoolQueueLen(Env::Priority::HIGH));
+
+  Env::Default()->SleepForMicroseconds(kDelayMicros * 50);
+
+  // Enqueue 5 more tasks. The thread pool size is now 4.
+  // Tasks 0, 3, 4, 5 running; 6, 7 waiting.
+  for (size_t i = 3; i < 8; i++) {
+    env_->Schedule(&SleepingBackgroundTask::DoSleepTask, &tasks[i],
+                   Env::Priority::HIGH);
+  }
+  Env::Default()->SleepForMicroseconds(kDelayMicros);
+  ASSERT_EQ(2U, env_->GetThreadPoolQueueLen(Env::Priority::HIGH));
+  ASSERT_TRUE(tasks[3].IsSleeping());
+  ASSERT_TRUE(tasks[4].IsSleeping());
+  ASSERT_TRUE(tasks[5].IsSleeping());
+  ASSERT_TRUE(!tasks[6].IsSleeping());
+  ASSERT_TRUE(!tasks[7].IsSleeping());
+
+  // Wake up tasks 0, 3 and 4. Tasks 5, 6, 7 running.
+  tasks[0].WakeUp();
+  tasks[3].WakeUp();
+  tasks[4].WakeUp();
+
+  Env::Default()->SleepForMicroseconds(kDelayMicros);
+  ASSERT_EQ(0U, env_->GetThreadPoolQueueLen(Env::Priority::HIGH));
+  for (size_t i = 5; i < 8; i++) {
+    ASSERT_TRUE(tasks[i].IsSleeping());
+  }
+
+  // Shrink back to 1 thread. Still tasks 5, 6, 7 running.
+  env_->SetBackgroundThreads(1, Env::Priority::HIGH);
+  Env::Default()->SleepForMicroseconds(kDelayMicros);
+  ASSERT_TRUE(tasks[5].IsSleeping());
+  ASSERT_TRUE(tasks[6].IsSleeping());
+  ASSERT_TRUE(tasks[7].IsSleeping());
+
+  // Wake up task 6. Tasks 5, 7 running.
+  tasks[6].WakeUp();
+  Env::Default()->SleepForMicroseconds(kDelayMicros);
+  ASSERT_TRUE(tasks[5].IsSleeping());
+  ASSERT_TRUE(!tasks[6].IsSleeping());
+  ASSERT_TRUE(tasks[7].IsSleeping());
+
+  // Wake up task 7. Task 5 running.
+  tasks[7].WakeUp();
+  Env::Default()->SleepForMicroseconds(kDelayMicros);
+  ASSERT_TRUE(!tasks[7].IsSleeping());
+
+  // Enqueue tasks 8 and 9. Task 5 running; one of 8, 9 might be running.
+  env_->Schedule(&SleepingBackgroundTask::DoSleepTask, &tasks[8],
+                 Env::Priority::HIGH);
+  env_->Schedule(&SleepingBackgroundTask::DoSleepTask, &tasks[9],
+                 Env::Priority::HIGH);
+  Env::Default()->SleepForMicroseconds(kDelayMicros);
+  ASSERT_GT(env_->GetThreadPoolQueueLen(Env::Priority::HIGH), 0U);
+  ASSERT_TRUE(!tasks[8].IsSleeping() || !tasks[9].IsSleeping());
+
+  // Increase to 4 threads. Tasks 5, 8, 9 running.
+  env_->SetBackgroundThreads(4, Env::Priority::HIGH);
+  Env::Default()->SleepForMicroseconds(kDelayMicros);
+  ASSERT_EQ(0U, env_->GetThreadPoolQueueLen(Env::Priority::HIGH));
+  ASSERT_TRUE(tasks[8].IsSleeping());
+  ASSERT_TRUE(tasks[9].IsSleeping());
+
+  // Shrink to 1 thread.
+  env_->SetBackgroundThreads(1, Env::Priority::HIGH);
+
+  // Wake up task 9.
+  tasks[9].WakeUp();
+  Env::Default()->SleepForMicroseconds(kDelayMicros);
+  ASSERT_TRUE(!tasks[9].IsSleeping());
+  ASSERT_TRUE(tasks[8].IsSleeping());
+
+  // Wake up task 8.
+  tasks[8].WakeUp();
+  Env::Default()->SleepForMicroseconds(kDelayMicros);
+  ASSERT_TRUE(!tasks[8].IsSleeping());
+
+  // Wake up the last task.
+  tasks[5].WakeUp();
+
+  Env::Default()->SleepForMicroseconds(kDelayMicros);
+  ASSERT_TRUE(!tasks[5].IsSleeping());
+}
+
+#ifdef OS_LINUX
+// To make sure the Env::GetUniqueId() related tests work correctly, the files
+// should be stored on regular storage such as a hard disk or a flash device.
+// Otherwise we cannot obtain a correct id.
+//
+// The following function acts as a replacement for test::TmpDir(), which may
+// be customized by the user to point at storage that doesn't work with
+// GetUniqueId().
+//
+// TODO(kailiu) This function still assumes /tmp/ resides on a regular
+// storage system.
+namespace {
+bool IsSingleVarint(const std::string& s) {
+  Slice slice(s);
+
+  uint64_t v;
+  if (!GetVarint64(&slice, &v)) {
+    return false;
+  }
+
+  return slice.size() == 0;
+}
+
+bool IsUniqueIDValid(const std::string& s) {
+  return !s.empty() && !IsSingleVarint(s);
+}
+
+const size_t MAX_ID_SIZE = 100;
+char temp_id[MAX_ID_SIZE];
+
+std::string GetOnDiskTestDir() {
+  char base[100];
+  snprintf(base, sizeof(base), "/tmp/rocksdbtest-%d",
+           static_cast<int>(geteuid()));
+  // Directory may already exist
+  Env::Default()->CreateDirIfMissing(base);
+
+  return base;
+}
+}  // namespace
+
+// Only works on Linux platforms
+TEST(EnvPosixTest, RandomAccessUniqueID) {
+  // Create file.
+  const EnvOptions soptions;
+  std::string fname = GetOnDiskTestDir() + "/" + "testfile";
+  unique_ptr<WritableFile> wfile;
+  ASSERT_OK(env_->NewWritableFile(fname, &wfile, soptions));
+
+  unique_ptr<RandomAccessFile> file;
+
+  // Get Unique ID
+  ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions));
+  size_t id_size = file->GetUniqueId(temp_id, MAX_ID_SIZE);
+  ASSERT_TRUE(id_size > 0);
+  std::string unique_id1(temp_id, id_size);
+  ASSERT_TRUE(IsUniqueIDValid(unique_id1));
+
+  // Get Unique ID again
+  ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions));
+  id_size = file->GetUniqueId(temp_id, MAX_ID_SIZE);
+  ASSERT_TRUE(id_size > 0);
+  std::string unique_id2(temp_id, id_size);
+  ASSERT_TRUE(IsUniqueIDValid(unique_id2));
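+
+  // (For context: the table reader uses this ID as part of a stable block
+  // cache key for an open file, so it must be identical across opens. A
+  // minimal sketch of such a consumer, reusing this test's MAX_ID_SIZE:
+  //
+  //   char buf[MAX_ID_SIZE];
+  //   size_t len = file->GetUniqueId(buf, sizeof(buf));
+  //   std::string cache_key(buf, len);  // same file => same key across opens
+  // )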
+
+  // Get Unique ID again after waiting some time.
+  env_->SleepForMicroseconds(1000000);
+  ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions));
+  id_size = file->GetUniqueId(temp_id, MAX_ID_SIZE);
+  ASSERT_TRUE(id_size > 0);
+  std::string unique_id3(temp_id, id_size);
+  ASSERT_TRUE(IsUniqueIDValid(unique_id3));
+
+  // Check that the IDs are the same.
+  ASSERT_EQ(unique_id1, unique_id2);
+  ASSERT_EQ(unique_id2, unique_id3);
+
+  // Delete the file
+  env_->DeleteFile(fname);
+}
+
+// Only works on Linux platforms
+#ifdef ROCKSDB_FALLOCATE_PRESENT
+TEST(EnvPosixTest, AllocateTest) {
+  std::string fname = GetOnDiskTestDir() + "/preallocate_testfile";
+  EnvOptions soptions;
+  soptions.use_mmap_writes = false;
+  unique_ptr<WritableFile> wfile;
+  ASSERT_OK(env_->NewWritableFile(fname, &wfile, soptions));
+
+  // allocate 100 MB
+  size_t kPreallocateSize = 100 * 1024 * 1024;
+  size_t kBlockSize = 512;
+  std::string data = "test";
+  wfile->SetPreallocationBlockSize(kPreallocateSize);
+  ASSERT_OK(wfile->Append(Slice(data)));
+  ASSERT_OK(wfile->Flush());
+
+  struct stat f_stat;
+  stat(fname.c_str(), &f_stat);
+  ASSERT_EQ((unsigned int)data.size(), f_stat.st_size);
+  // Verify that the blocks are preallocated.
+  // Note that we don't check the exact number of preallocated blocks -- we
+  // only require that the number of allocated blocks is at least what we
+  // expect. Some filesystems give us more blocks than we asked for; that's
+  // fine, though it might be worth investigating further.
+  auto st_blocks = f_stat.st_blocks;
+  ASSERT_LE((unsigned int)(kPreallocateSize / kBlockSize), st_blocks);
+
+  // close the file, should deallocate the blocks
+  wfile.reset();
+
+  stat(fname.c_str(), &f_stat);
+  ASSERT_EQ((unsigned int)data.size(), f_stat.st_size);
+  // verify that the preallocated blocks were deallocated on file close
+  ASSERT_GT(st_blocks, f_stat.st_blocks);
+}
+#endif
+
+// Returns true if any of the strings in ss is a prefix of another string.
+bool HasPrefix(const std::unordered_set<std::string>& ss) {
+  for (const std::string& s : ss) {
+    if (s.empty()) {
+      return true;
+    }
+    for (size_t i = 1; i < s.size(); ++i) {
+      if (ss.count(s.substr(0, i)) != 0) {
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
+// Only works on Linux platforms
+TEST(EnvPosixTest, RandomAccessUniqueIDConcurrent) {
+  // Check whether a bunch of concurrently existing files have unique IDs.
+  const EnvOptions soptions;
+
+  // Create the files
+  std::vector<std::string> fnames;
+  for (int i = 0; i < 1000; ++i) {
+    fnames.push_back(GetOnDiskTestDir() + "/" + "testfile" + std::to_string(i));
+
+    // Create file.
+    unique_ptr<WritableFile> wfile;
+    ASSERT_OK(env_->NewWritableFile(fnames[i], &wfile, soptions));
+  }
+
+  // Collect the IDs and check that they are unique.
+  std::unordered_set<std::string> ids;
+  for (const std::string& fname : fnames) {
+    unique_ptr<RandomAccessFile> file;
+    std::string unique_id;
+    ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions));
+    size_t id_size = file->GetUniqueId(temp_id, MAX_ID_SIZE);
+    ASSERT_TRUE(id_size > 0);
+    unique_id = std::string(temp_id, id_size);
+    ASSERT_TRUE(IsUniqueIDValid(unique_id));
+
+    ASSERT_TRUE(ids.count(unique_id) == 0);
+    ids.insert(unique_id);
+  }
+
+  // Delete the files
+  for (const std::string& fname : fnames) {
+    ASSERT_OK(env_->DeleteFile(fname));
+  }
+
+  ASSERT_TRUE(!HasPrefix(ids));
+}
+
+// Only works on Linux platforms
+TEST(EnvPosixTest, RandomAccessUniqueIDDeletes) {
+  const EnvOptions soptions;
+
+  std::string fname = GetOnDiskTestDir() + "/" + "testfile";
+
+  // Check that after a file is deleted we don't get the same ID again for a
+  // new file.
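+  // (The IDs are also checked for prefix-freeness at the end of this test:
+  // because an ID is embedded into longer strings when composing cache keys,
+  // one ID being a prefix of another could make two distinct files collide
+  // after concatenation, even though the raw IDs differ.)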
+  std::unordered_set<std::string> ids;
+  for (int i = 0; i < 1000; ++i) {
+    // Create file.
+    {
+      unique_ptr<WritableFile> wfile;
+      ASSERT_OK(env_->NewWritableFile(fname, &wfile, soptions));
+    }
+
+    // Get Unique ID
+    std::string unique_id;
+    {
+      unique_ptr<RandomAccessFile> file;
+      ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions));
+      size_t id_size = file->GetUniqueId(temp_id, MAX_ID_SIZE);
+      ASSERT_TRUE(id_size > 0);
+      unique_id = std::string(temp_id, id_size);
+    }
+
+    ASSERT_TRUE(IsUniqueIDValid(unique_id));
+    ASSERT_TRUE(ids.count(unique_id) == 0);
+    ids.insert(unique_id);
+
+    // Delete the file
+    ASSERT_OK(env_->DeleteFile(fname));
+  }
+
+  ASSERT_TRUE(!HasPrefix(ids));
+}
+
+// Only works on Linux platforms
+TEST(EnvPosixTest, InvalidateCache) {
+  const EnvOptions soptions;
+  std::string fname = test::TmpDir() + "/" + "testfile";
+
+  // Create file.
+  {
+    unique_ptr<WritableFile> wfile;
+    ASSERT_OK(env_->NewWritableFile(fname, &wfile, soptions));
+    ASSERT_OK(wfile.get()->Append(Slice("Hello world")));
+    ASSERT_OK(wfile.get()->InvalidateCache(0, 0));
+    ASSERT_OK(wfile.get()->Close());
+  }
+
+  // Random Read
+  {
+    unique_ptr<RandomAccessFile> file;
+    char scratch[100];
+    Slice result;
+    ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions));
+    ASSERT_OK(file.get()->Read(0, 11, &result, scratch));
+    ASSERT_EQ(memcmp(scratch, "Hello world", 11), 0);
+    ASSERT_OK(file.get()->InvalidateCache(0, 11));
+    ASSERT_OK(file.get()->InvalidateCache(0, 0));
+  }
+
+  // Sequential Read
+  {
+    unique_ptr<SequentialFile> file;
+    char scratch[100];
+    Slice result;
+    ASSERT_OK(env_->NewSequentialFile(fname, &file, soptions));
+    ASSERT_OK(file.get()->Read(11, &result, scratch));
+    ASSERT_EQ(memcmp(scratch, "Hello world", 11), 0);
+    ASSERT_OK(file.get()->InvalidateCache(0, 11));
+    ASSERT_OK(file.get()->InvalidateCache(0, 0));
+  }
+  // Delete the file
+  ASSERT_OK(env_->DeleteFile(fname));
+}
+#endif
+
+TEST(EnvPosixTest, PosixRandomRWFileTest) {
+  EnvOptions soptions;
+  soptions.use_mmap_writes = soptions.use_mmap_reads = false;
+  std::string fname = test::TmpDir() + "/" + "testfile";
+
+  unique_ptr<RandomRWFile> file;
+  ASSERT_OK(env_->NewRandomRWFile(fname, &file, soptions));
+  // If you run this unit test on tmpfs, tmpfs might not support fallocate.
+  // It is still better to exercise that code path than to eliminate it
+  // entirely.
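+  // (On Linux this Allocate() call is expected to map to fallocate() with
+  // FALLOC_FL_KEEP_SIZE, which reserves blocks without changing the reported
+  // file size -- the same mechanism the AllocateTest above relies on. On
+  // filesystems without fallocate support the call may simply fail, which is
+  // why its status is intentionally not checked here.)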
+  file.get()->Allocate(0, 10 * 1024 * 1024);
+  ASSERT_OK(file.get()->Write(100, Slice("Hello world")));
+  ASSERT_OK(file.get()->Write(105, Slice("Hello world")));
+  ASSERT_OK(file.get()->Sync());
+  ASSERT_OK(file.get()->Fsync());
+  char scratch[100];
+  Slice result;
+  ASSERT_OK(file.get()->Read(100, 16, &result, scratch));
+  ASSERT_EQ(result.compare("HelloHello world"), 0);
+  ASSERT_OK(file.get()->Close());
+}
+
+class TestLogger : public Logger {
+ public:
+  virtual void Logv(const char* format, va_list ap) override {
+    log_count++;
+
+    char new_format[550];
+    std::fill_n(new_format, sizeof(new_format), '2');
+    {
+      va_list backup_ap;
+      va_copy(backup_ap, ap);
+      int n = vsnprintf(new_format, sizeof(new_format) - 1, format, backup_ap);
+      // 48 bytes for extra information + bytes allocated
+
+      if (new_format[0] == '[') {
+        // "[DEBUG] "
+        ASSERT_TRUE(n <= 56 + (512 - static_cast<int>(sizeof(struct timeval))));
+      } else {
+        ASSERT_TRUE(n <= 48 + (512 - static_cast<int>(sizeof(struct timeval))));
+      }
+      va_end(backup_ap);
+    }
+
+    for (size_t i = 0; i < sizeof(new_format); i++) {
+      if (new_format[i] == 'x') {
+        char_x_count++;
+      } else if (new_format[i] == '\0') {
+        char_0_count++;
+      }
+    }
+  }
+  int log_count;
+  int char_x_count;
+  int char_0_count;
+};
+
+TEST(EnvPosixTest, LogBufferTest) {
+  TestLogger test_logger;
+  test_logger.SetInfoLogLevel(InfoLogLevel::INFO_LEVEL);
+  test_logger.log_count = 0;
+  test_logger.char_x_count = 0;
+  test_logger.char_0_count = 0;
+  LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, &test_logger);
+  LogBuffer log_buffer_debug(DEBUG_LEVEL, &test_logger);
+
+  char bytes200[200];
+  std::fill_n(bytes200, sizeof(bytes200), '1');
+  bytes200[sizeof(bytes200) - 1] = '\0';
+  char bytes600[600];
+  std::fill_n(bytes600, sizeof(bytes600), '1');
+  bytes600[sizeof(bytes600) - 1] = '\0';
+  char bytes9000[9000];
+  std::fill_n(bytes9000, sizeof(bytes9000), '1');
+  bytes9000[sizeof(bytes9000) - 1] = '\0';
+
+  LogToBuffer(&log_buffer, "x%sx", bytes200);
+  LogToBuffer(&log_buffer, "x%sx", bytes600);
+  LogToBuffer(&log_buffer, "x%sx%sx%sx", bytes200, bytes200, bytes200);
+  LogToBuffer(&log_buffer, "x%sx%sx", bytes200, bytes600);
+  LogToBuffer(&log_buffer, "x%sx%sx", bytes600, bytes9000);
+
+  LogToBuffer(&log_buffer_debug, "x%sx", bytes200);
+  test_logger.SetInfoLogLevel(DEBUG_LEVEL);
+  LogToBuffer(&log_buffer_debug, "x%sx%sx%sx", bytes600, bytes9000, bytes200);
+
+  ASSERT_EQ(0, test_logger.log_count);
+  log_buffer.FlushBufferToLog();
+  log_buffer_debug.FlushBufferToLog();
+  ASSERT_EQ(6, test_logger.log_count);
+  ASSERT_EQ(6, test_logger.char_0_count);
+  ASSERT_EQ(10, test_logger.char_x_count);
+}
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  return rocksdb::test::RunAllTests();
+}
diff --git a/util/filelock_test.cc b/util/filelock_test.cc
new file mode 100644
index 0000000000..a9e30a5d3b
--- /dev/null
+++ b/util/filelock_test.cc
@@ -0,0 +1,58 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+#include "rocksdb/status.h"
+#include "rocksdb/env.h"
+
+#include <vector>
+#include "util/coding.h"
+#include "util/testharness.h"
+
+namespace rocksdb {
+
+class LockTest {
+ public:
+  static LockTest* current_;
+  std::string file_;
+  rocksdb::Env* env_;
+
+  LockTest() : file_(test::TmpDir() + "/db_testlock_file"),
+               env_(rocksdb::Env::Default()) {
+    current_ = this;
+  }
+
+  ~LockTest() {
+  }
+
+  Status LockFile(FileLock** db_lock) {
+    return env_->LockFile(file_, db_lock);
+  }
+
+  Status UnlockFile(FileLock* db_lock) {
+    return env_->UnlockFile(db_lock);
+  }
+};
+LockTest* LockTest::current_;
+
+TEST(LockTest, LockBySameThread) {
+  FileLock* lock1;
+  FileLock* lock2;
+
+  // acquire a lock on a file
+  ASSERT_OK(LockFile(&lock1));
+
+  // re-acquiring the lock on the same file should fail
+  ASSERT_TRUE(LockFile(&lock2).IsIOError());
+
+  // release the lock
+  ASSERT_OK(UnlockFile(lock1));
+}
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  return rocksdb::test::RunAllTests();
+}
diff --git a/util/filter_policy.cc b/util/filter_policy.cc
new file mode 100644
index 0000000000..e950b75f7e
--- /dev/null
+++ b/util/filter_policy.cc
@@ -0,0 +1,16 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "rocksdb/filter_policy.h"
+
+namespace rocksdb {
+
+FilterPolicy::~FilterPolicy() { }
+
+}  // namespace rocksdb
diff --git a/util/hash.cc b/util/hash.cc
new file mode 100644
index 0000000000..e38c186c3b
--- /dev/null
+++ b/util/hash.cc
@@ -0,0 +1,49 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <string.h>
+#include "util/coding.h"
+#include "util/hash.h"
+
+namespace rocksdb {
+
+uint32_t Hash(const char* data, size_t n, uint32_t seed) {
+  // Similar to murmur hash
+  const uint32_t m = 0xc6a4a793;
+  const uint32_t r = 24;
+  const char* limit = data + n;
+  uint32_t h = seed ^ (n * m);
+
+  // Pick up four bytes at a time
+  while (data + 4 <= limit) {
+    uint32_t w = DecodeFixed32(data);
+    data += 4;
+    h += w;
+    h *= m;
+    h ^= (h >> 16);
+  }
+
+  // Pick up remaining bytes
+  switch (limit - data) {
+    case 3:
+      h += data[2] << 16;
+      // fall through
+    case 2:
+      h += data[1] << 8;
+      // fall through
+    case 1:
+      h += data[0];
+      h *= m;
+      h ^= (h >> r);
+      break;
+  }
+  return h;
+}
+
+}  // namespace rocksdb
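+
+// Example (illustrative, not exercised by this file): the seed parameter
+// turns this into a family of hash functions, so callers can derive several
+// independent hashes of the same key, e.g.
+//
+//   uint32_t h1 = Hash(s.data(), s.size(), 0);
+//   uint32_t h2 = Hash(s.data(), s.size(), 0xbc9f1d34);  // arbitrary seed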
diff --git a/util/hash.h b/util/hash.h
new file mode 100644
index 0000000000..c9eb659ab8
--- /dev/null
+++ b/util/hash.h
@@ -0,0 +1,20 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Simple hash function used for internal data structures
+
+#pragma once
+#include <stddef.h>
+#include <stdint.h>
+
+namespace rocksdb {
+
+extern uint32_t Hash(const char* data, size_t n, uint32_t seed);
+
+}
diff --git a/util/hash_cuckoo_rep.cc b/util/hash_cuckoo_rep.cc
new file mode 100644
index 0000000000..e2d2c38e66
--- /dev/null
+++ b/util/hash_cuckoo_rep.cc
@@ -0,0 +1,636 @@
+
+// Copyright (c) 2014, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+
+#ifndef ROCKSDB_LITE
+#include "util/hash_cuckoo_rep.h"
+
+#include <algorithm>
+#include <atomic>
+#include <limits>
+#include <memory>
+#include <queue>
+#include <string>
+#include <vector>
+
+#include "rocksdb/memtablerep.h"
+#include "util/murmurhash.h"
+#include "db/memtable.h"
+#include "db/skiplist.h"
+#include "util/stl_wrappers.h"
+
+namespace rocksdb {
+namespace {
+
+// the default maximum size of the cuckoo path searching queue
+static const int kCuckooPathMaxSearchSteps = 100;
+
+struct CuckooStep {
+  static const int kNullStep = -1;
+  // the bucket id in the cuckoo array.
+  int bucket_id_;
+  // index into the cuckoo-step array pointing to this step's predecessor,
+  // or kNullStep (-1) if this is the first step of a path.
+  int prev_step_id_;
+  // the depth of the current step.
+  unsigned int depth_;
+
+  CuckooStep() : bucket_id_(-1), prev_step_id_(kNullStep), depth_(1) {}
+
+  CuckooStep(CuckooStep&&) = default;
+  CuckooStep& operator=(CuckooStep&&) = default;
+
+  CuckooStep(const CuckooStep&) = delete;
+  CuckooStep& operator=(const CuckooStep&) = delete;
+
+  CuckooStep(int bucket_id, int prev_step_id, int depth)
+      : bucket_id_(bucket_id), prev_step_id_(prev_step_id), depth_(depth) {}
+};
+
+class HashCuckooRep : public MemTableRep {
+ public:
+  explicit HashCuckooRep(const MemTableRep::KeyComparator& compare,
+                         Arena* arena, const size_t bucket_count,
+                         const unsigned int hash_func_count)
+      : MemTableRep(arena),
+        compare_(compare),
+        arena_(arena),
+        bucket_count_(bucket_count),
+        cuckoo_path_max_depth_(kDefaultCuckooPathMaxDepth),
+        occupied_count_(0),
+        hash_function_count_(hash_func_count),
+        backup_table_(nullptr) {
+    char* mem = reinterpret_cast<char*>(
+        arena_->Allocate(sizeof(std::atomic<const char*>) * bucket_count_));
+    cuckoo_array_ = new (mem) std::atomic<const char*>[bucket_count_];
+    for (unsigned int bid = 0; bid < bucket_count_; ++bid) {
+      cuckoo_array_[bid].store(nullptr, std::memory_order_relaxed);
+    }
+
+    cuckoo_path_ = reinterpret_cast<int*>(
+        arena_->Allocate(sizeof(int) * (cuckoo_path_max_depth_ + 1)));
+    is_nearly_full_ = false;
+  }
+
+  // return false, indicating HashCuckooRep does not support merge operator.
+  virtual bool IsMergeOperatorSupported() const override { return false; }
+
+  // return false, indicating HashCuckooRep does not support snapshot.
+  virtual bool IsSnapshotSupported() const override { return false; }
+
+  // Returns true iff an entry that compares equal to key is in the collection.
+  virtual bool Contains(const char* internal_key) const override;
+
+  virtual ~HashCuckooRep() override {}
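+
+  // The core cuckoo-hashing invariant used throughout this class: a key may
+  // only ever be stored in one of its hash_function_count_ candidate buckets,
+  // so a point lookup probes at most that many slots. A sketch of the probe
+  // loop (essentially what Get() and Contains() below do):
+  //
+  //   for (unsigned int hid = 0; hid < hash_function_count_; ++hid) {
+  //     const char* entry = cuckoo_array_[GetHash(user_key, hid)].load(
+  //         std::memory_order_acquire);
+  //     if (entry != nullptr && user_key == UserKey(entry)) { /* hit */ }
+  //   }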
+
+  // Insert the specified key (internal_key) into the mem-table. The
+  // implementation asserts that the mem-table does not already contain the
+  // specified key.
+  virtual void Insert(KeyHandle handle) override;
+
+  // This function returns std::numeric_limits<size_t>::max() in the following
+  // two cases to disallow further write operations:
+  // 1. when the fullness reaches kMaxFullness.
+  // 2. when the backup_table_ is used.
+  //
+  // otherwise, this function will always return 0.
+  virtual size_t ApproximateMemoryUsage() override {
+    if (is_nearly_full_) {
+      return std::numeric_limits<size_t>::max();
+    }
+    return 0;
+  }
+
+  virtual void Get(const LookupKey& k, void* callback_args,
+                   bool (*callback_func)(void* arg,
+                                         const char* entry)) override;
+
+  class Iterator : public MemTableRep::Iterator {
+    std::shared_ptr<std::vector<const char*>> bucket_;
+    mutable std::vector<const char*>::const_iterator cit_;
+    const KeyComparator& compare_;
+    std::string tmp_;  // For passing to EncodeKey
+    mutable bool sorted_;
+    void DoSort() const;
+
+   public:
+    explicit Iterator(std::shared_ptr<std::vector<const char*>> bucket,
+                      const KeyComparator& compare);
+
+    // Initialize an iterator over the specified collection.
+    // The returned iterator is not valid.
+    // explicit Iterator(const MemTableRep* collection);
+    virtual ~Iterator() override {}
+
+    // Returns true iff the iterator is positioned at a valid node.
+    virtual bool Valid() const override;
+
+    // Returns the key at the current position.
+    // REQUIRES: Valid()
+    virtual const char* key() const override;
+
+    // Advances to the next position.
+    // REQUIRES: Valid()
+    virtual void Next() override;
+
+    // Advances to the previous position.
+    // REQUIRES: Valid()
+    virtual void Prev() override;
+
+    // Advance to the first entry with a key >= target
+    virtual void Seek(const Slice& user_key, const char* memtable_key) override;
+
+    // Position at the first entry in collection.
+    // Final state of iterator is Valid() iff collection is not empty.
+    virtual void SeekToFirst() override;
+
+    // Position at the last entry in collection.
+    // Final state of iterator is Valid() iff collection is not empty.
+    virtual void SeekToLast() override;
+  };
+
+  struct CuckooStepBuffer {
+    CuckooStepBuffer() : write_index_(0), read_index_(0) {}
+    ~CuckooStepBuffer() {}
+
+    int write_index_;
+    int read_index_;
+    CuckooStep steps_[kCuckooPathMaxSearchSteps];
+
+    CuckooStep& NextWriteBuffer() { return steps_[write_index_++]; }
+
+    inline const CuckooStep& ReadNext() { return steps_[read_index_++]; }
+
+    inline bool HasNewWrite() { return write_index_ > read_index_; }
+
+    inline void reset() {
+      write_index_ = 0;
+      read_index_ = 0;
+    }
+
+    inline bool IsFull() { return write_index_ >= kCuckooPathMaxSearchSteps; }
+
+    // returns the number of steps that have been read
+    inline int ReadCount() { return read_index_; }
+
+    // returns the number of steps that have been written to the buffer.
+    inline int WriteCount() { return write_index_; }
+  };
+
+ private:
+  const MemTableRep::KeyComparator& compare_;
+  // the pointer to the Arena used to allocate memory, immutable after
+  // construction.
+  Arena* const arena_;
+  // the number of hash buckets in the hash table.
+  const size_t bucket_count_;
+  // the maximum depth of a cuckoo path.
+  const unsigned int cuckoo_path_max_depth_;
+  // the current number of entries in cuckoo_array_ which have been occupied.
+  size_t occupied_count_;
+  // the current number of hash functions used in the cuckoo hash.
+  unsigned int hash_function_count_;
+  // the backup MemTableRep to handle the case where cuckoo hash cannot find
+  // a vacant bucket for inserting the key of a put request.
+  std::shared_ptr<MemTableRep> backup_table_;
+  // the array to store pointers, pointing to the actual data.
+  std::atomic<const char*>* cuckoo_array_;
+  // a buffer to store the cuckoo path
+  int* cuckoo_path_;
+  // a boolean flag indicating whether the fullness of the bucket array
+  // has reached the point at which the current memtable should be made
+  // immutable.
+  bool is_nearly_full_;
+
+  // the default maximum depth of the cuckoo path.
+  static const unsigned int kDefaultCuckooPathMaxDepth = 10;
+
+  CuckooStepBuffer step_buffer_;
+
+  // returns the bucket id associated with the input slice, derived from the
+  // hash function identified by hash_func_id.
+  unsigned int GetHash(const Slice& slice, const int hash_func_id) const {
+    // the seeds used in the Murmur hash to produce different hash functions.
+    static const int kMurmurHashSeeds[HashCuckooRepFactory::kMaxHashCount] = {
+        545609244,  1769731426, 763324157,  13099088,   592422103,
+        1899789565, 248369300,  1984183468, 1613664382, 1491157517};
+    return MurmurHash(slice.data(), slice.size(),
+                      kMurmurHashSeeds[hash_func_id]) %
+           bucket_count_;
+  }
+
+  // A cuckoo path is a sequence of bucket ids, where each id points to a
+  // location in cuckoo_array_. This path describes the displacement sequence
+  // of entries in order to store the desired data specified by the input user
+  // key. The path starts from one of the locations associated with the
+  // specified user key and ends at a vacant space in the cuckoo array. This
+  // function will update the cuckoo_path.
+  //
+  // @return true if it found a cuckoo path.
+  bool FindCuckooPath(const char* internal_key, const Slice& user_key,
+                      int* cuckoo_path, size_t* cuckoo_path_length,
+                      int initial_hash_id = 0);
+
+  // Perform a quick insert by checking whether there is a vacant bucket in
+  // one of the possible locations of the input key. If so, the function
+  // returns true and the key is stored in that vacant bucket.
+  //
+  // This function is a helper function of FindCuckooPath that discovers the
+  // first possible steps of a cuckoo path. It begins by computing the
+  // possible locations of the input key (and stores them in bucket_ids.)
+  // Then, if one of those locations is vacant, the input key is stored in
+  // that vacant space and the function returns true. Otherwise, the function
+  // returns false, indicating that a complete cuckoo-path search is needed.
+  bool QuickInsert(const char* internal_key, const Slice& user_key,
+                   int bucket_ids[], const int initial_hash_id);
+
+  // Unhide default implementations of GetIterator
+  using MemTableRep::GetIterator;
+  // Returns a pointer to an internal iterator over the buckets, where the
+  // buckets are sorted according to the user-specified KeyComparator. Note
+  // that any insert after this function call may affect the sorted nature of
+  // the returned iterator.
+  virtual MemTableRep::Iterator* GetIterator(Arena* arena) override {
+    std::vector<const char*> compact_buckets;
+    for (unsigned int bid = 0; bid < bucket_count_; ++bid) {
+      const char* bucket = cuckoo_array_[bid].load(std::memory_order_relaxed);
+      if (bucket != nullptr) {
+        compact_buckets.push_back(bucket);
+      }
+    }
+    MemTableRep* backup_table = backup_table_.get();
+    if (backup_table != nullptr) {
+      std::unique_ptr<MemTableRep::Iterator> iter(backup_table->GetIterator());
+      for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+        compact_buckets.push_back(iter->key());
+      }
+    }
+    if (arena == nullptr) {
+      return new Iterator(
+          std::shared_ptr<std::vector<const char*>>(
+              new std::vector<const char*>(std::move(compact_buckets))),
+          compare_);
+    } else {
+      auto mem = arena->AllocateAligned(sizeof(Iterator));
+      return new (mem) Iterator(
+          std::shared_ptr<std::vector<const char*>>(
+              new std::vector<const char*>(std::move(compact_buckets))),
+          compare_);
+    }
+  }
+};
+
+void HashCuckooRep::Get(const LookupKey& key, void* callback_args,
+                        bool (*callback_func)(void* arg, const char* entry)) {
+  Slice user_key = key.user_key();
+  for (unsigned int hid = 0; hid < hash_function_count_; ++hid) {
+    const char* bucket =
+        cuckoo_array_[GetHash(user_key, hid)].load(std::memory_order_acquire);
+    if (bucket != nullptr) {
+      auto bucket_user_key = UserKey(bucket);
+      if (user_key.compare(bucket_user_key) == 0) {
+        callback_func(callback_args, bucket);
+        break;
+      }
+    } else {
+      // as Put() always stores at the vacant bucket located by the
+      // hash function with the smallest possible id, when we first
+      // find a vacant bucket in Get(), that means a miss.
+      break;
+    }
+  }
+  MemTableRep* backup_table = backup_table_.get();
+  if (backup_table != nullptr) {
+    backup_table->Get(key, callback_args, callback_func);
+  }
+}
+
+void HashCuckooRep::Insert(KeyHandle handle) {
+  static const float kMaxFullness = 0.90;
+
+  auto* key = static_cast<char*>(handle);
+  int initial_hash_id = 0;
+  size_t cuckoo_path_length = 0;
+  auto user_key = UserKey(key);
+  // find cuckoo path
+  if (FindCuckooPath(key, user_key, cuckoo_path_, &cuckoo_path_length,
+                     initial_hash_id) == false) {
+    // FindCuckooPath() returning false means we cannot find a vacant bucket
+    // for this key even though we have used up all the hash functions. In
+    // that case, use a backup memtable to store the key, which additionally
+    // makes this mem-table immutable.
+    if (backup_table_.get() == nullptr) {
+      VectorRepFactory factory(10);
+      backup_table_.reset(
+          factory.CreateMemTableRep(compare_, arena_, nullptr, nullptr));
+      is_nearly_full_ = true;
+    }
+    backup_table_->Insert(key);
+    return;
+  }
+  // reaching this point means the insert can be done successfully.
+  occupied_count_++;
+  if (occupied_count_ >= bucket_count_ * kMaxFullness) {
+    is_nearly_full_ = true;
+  }
+
+  // perform the kickout process when the cuckoo path is non-trivial.
+  if (cuckoo_path_length == 0) return;
+
+  // the cuckoo path stores the kickout path in reverse order,
+  // so the kickout or displacement is actually performed
+  // in reverse order, which avoids false negatives on read
+  // by moving each key involved in the cuckoo path to its new
+  // location before replacing it.
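+  // Concretely, for a path [b2, b1, b0] where b0 is vacant: the key in b1 is
+  // copied into b0 first, then the key in b2 into b1, and only then is the
+  // new key stored into b2. At every intermediate moment each displaced key
+  // is visible in at least one of its candidate buckets, so a concurrent
+  // reader never misses it.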
+  for (size_t i = 1; i < cuckoo_path_length; ++i) {
+    int kicked_out_bid = cuckoo_path_[i - 1];
+    int current_bid = cuckoo_path_[i];
+    // since we only allow one writer at a time, it is safe to do a relaxed
+    // read here.
+    cuckoo_array_[kicked_out_bid]
+        .store(cuckoo_array_[current_bid].load(std::memory_order_relaxed),
+               std::memory_order_release);
+  }
+  int insert_key_bid = cuckoo_path_[cuckoo_path_length - 1];
+  cuckoo_array_[insert_key_bid].store(key, std::memory_order_release);
+}
+
+bool HashCuckooRep::Contains(const char* internal_key) const {
+  auto user_key = UserKey(internal_key);
+  for (unsigned int hid = 0; hid < hash_function_count_; ++hid) {
+    const char* stored_key =
+        cuckoo_array_[GetHash(user_key, hid)].load(std::memory_order_acquire);
+    if (stored_key != nullptr) {
+      if (compare_(internal_key, stored_key) == 0) {
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
+bool HashCuckooRep::QuickInsert(const char* internal_key, const Slice& user_key,
+                                int bucket_ids[], const int initial_hash_id) {
+  int cuckoo_bucket_id = -1;
+
+  // The code below does the following:
+  // 0. Calculate all possible locations of the input key.
+  // 1. Check whether there is a bucket having the same user_key as the input.
+  // 2. If such a bucket exists, then replace its contents by the newly
+  //    inserted data and return. This step also performs duplication checking.
+  // 3. If no such bucket exists but a vacant bucket does, then insert the
+  //    input data into it.
+  // 4. If steps 1 to 3 all fail, then return false.
+  for (unsigned int hid = initial_hash_id; hid < hash_function_count_; ++hid) {
+    bucket_ids[hid] = GetHash(user_key, hid);
+    // since only one PUT is allowed at a time, and this is part of a PUT
+    // operation, we can safely perform a relaxed load.
+    const char* stored_key =
+        cuckoo_array_[bucket_ids[hid]].load(std::memory_order_relaxed);
+    if (stored_key == nullptr) {
+      if (cuckoo_bucket_id == -1) {
+        cuckoo_bucket_id = bucket_ids[hid];
+      }
+    } else {
+      const auto bucket_user_key = UserKey(stored_key);
+      if (bucket_user_key.compare(user_key) == 0) {
+        cuckoo_bucket_id = bucket_ids[hid];
+        break;
+      }
+    }
+  }
+
+  if (cuckoo_bucket_id != -1) {
+    cuckoo_array_[cuckoo_bucket_id]
+        .store(internal_key, std::memory_order_release);
+    return true;
+  }
+
+  return false;
+}
+
+// Perform a pre-check and find the shortest cuckoo path. A cuckoo path
+// is a displacement sequence for inserting the specified input key.
+//
+// @return true if it successfully found a vacant space or a cuckoo path.
+//    If the return value is true but the length of cuckoo_path is zero,
+//    then a vacant bucket, or a bucket whose user key matches the input,
+//    has been found and a quick insertion has been done.
+bool HashCuckooRep::FindCuckooPath(const char* internal_key,
+                                   const Slice& user_key, int* cuckoo_path,
+                                   size_t* cuckoo_path_length,
+                                   const int initial_hash_id) {
+  int bucket_ids[HashCuckooRepFactory::kMaxHashCount];
+  *cuckoo_path_length = 0;
+
+  if (QuickInsert(internal_key, user_key, bucket_ids, initial_hash_id)) {
+    return true;
+  }
+  // If this step is reached, then it means:
+  // 1. no vacant bucket exists in any of the possible locations of the
+  //    input key.
+  // 2. none of the possible locations of the input key has the same user
+  //    key as the input `internal_key`.
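+  //
+  // What follows is a BFS over candidate displacements: the frontier is
+  // seeded with the key's own candidate buckets (already computed into
+  // bucket_ids by QuickInsert above), and the search is bounded both by
+  // cuckoo_path_max_depth_ and by the kCuckooPathMaxSearchSteps-entry step
+  // buffer, so a failed insert gives up after a small, bounded amount of
+  // work.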
+
+  // the front and back indices for the step_queue_
+  step_buffer_.reset();
+
+  for (unsigned int hid = initial_hash_id; hid < hash_function_count_; ++hid) {
+    /// CuckooStep& current_step = step_queue_[front_pos++];
+    CuckooStep& current_step = step_buffer_.NextWriteBuffer();
+    current_step.bucket_id_ = bucket_ids[hid];
+    current_step.prev_step_id_ = CuckooStep::kNullStep;
+    current_step.depth_ = 1;
+  }
+
+  while (step_buffer_.HasNewWrite()) {
+    int step_id = step_buffer_.read_index_;
+    const CuckooStep& step = step_buffer_.ReadNext();
+    // Since this is a BFS process, the first step whose depth exceeds the
+    // maximum allowed depth indicates that all remaining steps in the step
+    // buffer queue will exceed the maximum depth as well. Return false
+    // immediately, indicating we cannot find a vacant bucket for the input
+    // key within the maximum allowed depth.
+    if (step.depth_ >= cuckoo_path_max_depth_) {
+      return false;
+    }
+    // again, we can perform a no-barrier load safely here as the current
+    // thread is the only writer.
+    auto bucket_user_key =
+        UserKey(cuckoo_array_[step.bucket_id_].load(std::memory_order_relaxed));
+    if (step.prev_step_id_ != CuckooStep::kNullStep) {
+      if (bucket_user_key.compare(user_key) == 0) {
+        // there is a loop in the current path; stop exploring this path.
+        continue;
+      }
+    }
+    // if the current bucket stores the key at its nth location, then we only
+    // consider its mth locations where m > n. This property makes sure that
+    // no read will miss if we do have data associated with the query key.
+    //
+    // The n and m in the above statement are the start_hid and hid in the
+    // code.
+    unsigned int start_hid = hash_function_count_;
+    for (unsigned int hid = 0; hid < hash_function_count_; ++hid) {
+      bucket_ids[hid] = GetHash(bucket_user_key, hid);
+      if (step.bucket_id_ == bucket_ids[hid]) {
+        start_hid = hid;
+      }
+    }
+    // we must have found a bucket which is the key's current "home".
+    assert(start_hid != hash_function_count_);
+
+    // explore all possible next steps from the current step.
+    for (unsigned int hid = start_hid + 1; hid < hash_function_count_; ++hid) {
+      CuckooStep& next_step = step_buffer_.NextWriteBuffer();
+      next_step.bucket_id_ = bucket_ids[hid];
+      next_step.prev_step_id_ = step_id;
+      next_step.depth_ = step.depth_ + 1;
+      // once a vacant bucket is found, trace back all its previous steps
+      // to generate a cuckoo path.
+      if (cuckoo_array_[next_step.bucket_id_].load(std::memory_order_relaxed) ==
+          nullptr) {
+        // store the last step in the cuckoo path. Note that cuckoo_path
+        // stores steps in reverse order. This allows us to move keys along
+        // the cuckoo path by storing each key in its new place before
+        // removing it from the old place. This property ensures reads will
+        // not miss due to keys moving along the cuckoo path.
+        cuckoo_path[(*cuckoo_path_length)++] = next_step.bucket_id_;
+        int depth;
+        for (depth = step.depth_; depth > 0 && step_id != CuckooStep::kNullStep;
+             depth--) {
+          const CuckooStep& prev_step = step_buffer_.steps_[step_id];
+          cuckoo_path[(*cuckoo_path_length)++] = prev_step.bucket_id_;
+          step_id = prev_step.prev_step_id_;
+        }
+        assert(depth == 0 && step_id == CuckooStep::kNullStep);
+        return true;
+      }
+      if (step_buffer_.IsFull()) {
+        // if true, then we have reached the maximum number of cuckoo search
+        // steps.
+        return false;
+      }
+    }
+  }
+
+  // tried all possible paths but was still unable to find a cuckoo path
+  // that leads to a vacant bucket.
+  return false;
+}
+
+HashCuckooRep::Iterator::Iterator(
+    std::shared_ptr<std::vector<const char*>> bucket,
+    const KeyComparator& compare)
+    : bucket_(bucket),
+      cit_(bucket_->end()),
+      compare_(compare),
+      sorted_(false) {}
+
+void HashCuckooRep::Iterator::DoSort() const {
+  if (!sorted_) {
+    std::sort(bucket_->begin(), bucket_->end(),
+              stl_wrappers::Compare(compare_));
+    cit_ = bucket_->begin();
+    sorted_ = true;
+  }
+}
+
+// Returns true iff the iterator is positioned at a valid node.
+bool HashCuckooRep::Iterator::Valid() const {
+  DoSort();
+  return cit_ != bucket_->end();
+}
+
+// Returns the key at the current position.
+// REQUIRES: Valid()
+const char* HashCuckooRep::Iterator::key() const {
+  assert(Valid());
+  return *cit_;
+}
+
+// Advances to the next position.
+// REQUIRES: Valid()
+void HashCuckooRep::Iterator::Next() {
+  assert(Valid());
+  if (cit_ == bucket_->end()) {
+    return;
+  }
+  ++cit_;
+}
+
+// Advances to the previous position.
+// REQUIRES: Valid()
+void HashCuckooRep::Iterator::Prev() {
+  assert(Valid());
+  if (cit_ == bucket_->begin()) {
+    // If you try to go back from the first element, the iterator should be
+    // invalidated. So we set it to past-the-end. This means that you can
+    // treat the container circularly.
+    cit_ = bucket_->end();
+  } else {
+    --cit_;
+  }
+}
+
+// Advance to the first entry with a key >= target
+void HashCuckooRep::Iterator::Seek(const Slice& user_key,
+                                   const char* memtable_key) {
+  DoSort();
+  // Do a binary search to find the first value not less than the target
+  const char* encoded_key =
+      (memtable_key != nullptr) ? memtable_key : EncodeKey(&tmp_, user_key);
+  cit_ = std::equal_range(bucket_->begin(), bucket_->end(), encoded_key,
+                          [this](const char* a, const char* b) {
+                            return compare_(a, b) < 0;
+                          }).first;
+}
+
+// Position at the first entry in collection.
+// Final state of iterator is Valid() iff collection is not empty.
+void HashCuckooRep::Iterator::SeekToFirst() {
+  DoSort();
+  cit_ = bucket_->begin();
+}
+
+// Position at the last entry in collection.
+// Final state of iterator is Valid() iff collection is not empty.
+void HashCuckooRep::Iterator::SeekToLast() {
+  DoSort();
+  cit_ = bucket_->end();
+  if (bucket_->size() != 0) {
+    --cit_;
+  }
+}
+
+}  // anonymous namespace
+
+MemTableRep* HashCuckooRepFactory::CreateMemTableRep(
+    const MemTableRep::KeyComparator& compare, Arena* arena,
+    const SliceTransform* transform, Logger* logger) {
+  // The estimated average fullness. The write performance of any closed
+  // hashing scheme degrades as the fullness of the mem-table increases.
+  // Setting kFullness to a value around 0.7 avoids most of that write
+  // performance degradation while still keeping memory usage efficient.
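+  // Worked example of the sizing below (illustrative numbers): with
+  // write_buffer_size_ = 64 MB, average_data_size_ = 64 bytes and an 8-byte
+  // pointer per slot, bucket_count is roughly (64 MB / 72 B) / 0.7 + 1, i.e.
+  // about 1.33 million slots.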
+  static const float kFullness = 0.7;
+  size_t pointer_size = sizeof(std::atomic<const char*>);
+  assert(write_buffer_size_ >= (average_data_size_ + pointer_size));
+  size_t bucket_count =
+      (write_buffer_size_ / (average_data_size_ + pointer_size)) / kFullness +
+      1;
+  unsigned int hash_function_count = hash_function_count_;
+  if (hash_function_count < 2) {
+    hash_function_count = 2;
+  }
+  if (hash_function_count > kMaxHashCount) {
+    hash_function_count = kMaxHashCount;
+  }
+  return new HashCuckooRep(compare, arena, bucket_count, hash_function_count);
+}
+
+MemTableRepFactory* NewHashCuckooRepFactory(size_t write_buffer_size,
+                                            size_t average_data_size,
+                                            unsigned int hash_function_count) {
+  return new HashCuckooRepFactory(write_buffer_size, average_data_size,
+                                  hash_function_count);
+}
+
+}  // namespace rocksdb
+#endif  // ROCKSDB_LITE
diff --git a/util/hash_cuckoo_rep.h b/util/hash_cuckoo_rep.h
new file mode 100644
index 0000000000..669b6b7d42
--- /dev/null
+++ b/util/hash_cuckoo_rep.h
@@ -0,0 +1,42 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef ROCKSDB_LITE
+#pragma once
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/memtablerep.h"
+
+namespace rocksdb {
+
+class HashCuckooRepFactory : public MemTableRepFactory {
+ public:
+  // maximum number of hash functions used in the cuckoo hash.
+  static const unsigned int kMaxHashCount = 10;
+
+  explicit HashCuckooRepFactory(size_t write_buffer_size,
+                                size_t average_data_size,
+                                unsigned int hash_function_count)
+      : write_buffer_size_(write_buffer_size),
+        average_data_size_(average_data_size),
+        hash_function_count_(hash_function_count) {}
+
+  virtual ~HashCuckooRepFactory() {}
+
+  virtual MemTableRep* CreateMemTableRep(
+      const MemTableRep::KeyComparator& compare, Arena* arena,
+      const SliceTransform* transform, Logger* logger) override;
+
+  virtual const char* Name() const override { return "HashCuckooRepFactory"; }
+
+ private:
+  size_t write_buffer_size_;
+  size_t average_data_size_;
+  const unsigned int hash_function_count_;
+};
+}  // namespace rocksdb
+#endif  // ROCKSDB_LITE
diff --git a/util/hash_linklist_rep.cc b/util/hash_linklist_rep.cc
new file mode 100644
index 0000000000..60f245b5ff
--- /dev/null
+++ b/util/hash_linklist_rep.cc
@@ -0,0 +1,495 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+
+#ifndef ROCKSDB_LITE
+#include "util/hash_linklist_rep.h"
+
+#include "rocksdb/memtablerep.h"
+#include "util/arena.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "port/port.h"
+#include "port/atomic_pointer.h"
+#include "util/murmurhash.h"
+#include "db/memtable.h"
+#include "db/skiplist.h"
+
+namespace rocksdb {
+namespace {
+
+typedef const char* Key;
+
+struct Node {
+  // Accessors/mutators for links. Wrapped in methods so we can
+  // add the appropriate barriers as necessary.
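+  //
+  // The publication pattern used by this list (and by the buckets_ array
+  // below): a writer fully constructs a Node, then Release_Stores the
+  // pointer; a reader Acquire_Loads it and is therefore guaranteed to
+  // observe the node's key bytes. In std::atomic terms (for illustration
+  // only -- this file uses port::AtomicPointer):
+  //
+  //   std::atomic<Node*> next;
+  //   next.store(n, std::memory_order_release);        // publish
+  //   Node* p = next.load(std::memory_order_acquire);  // observe n fully built
+  //
+  // The Node header is also followed in memory by its variable-length,
+  // length-prefixed key: Allocate() below reserves sizeof(Node) + len bytes
+  // and hands the key[] region back to the caller.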
+  Node* Next() {
+    // Use an 'acquire load' so that we observe a fully initialized
+    // version of the returned Node.
+    return reinterpret_cast<Node*>(next_.Acquire_Load());
+  }
+  void SetNext(Node* x) {
+    // Use a 'release store' so that anybody who reads through this
+    // pointer observes a fully initialized version of the inserted node.
+    next_.Release_Store(x);
+  }
+  // No-barrier variants that can be safely used in a few locations.
+  Node* NoBarrier_Next() {
+    return reinterpret_cast<Node*>(next_.NoBarrier_Load());
+  }
+
+  void NoBarrier_SetNext(Node* x) {
+    next_.NoBarrier_Store(x);
+  }
+
+ private:
+  port::AtomicPointer next_;
+ public:
+  char key[0];
+};
+
+class HashLinkListRep : public MemTableRep {
+ public:
+  HashLinkListRep(const MemTableRep::KeyComparator& compare, Arena* arena,
+                  const SliceTransform* transform, size_t bucket_size,
+                  size_t huge_page_tlb_size, Logger* logger);
+
+  virtual KeyHandle Allocate(const size_t len, char** buf) override;
+
+  virtual void Insert(KeyHandle handle) override;
+
+  virtual bool Contains(const char* key) const override;
+
+  virtual size_t ApproximateMemoryUsage() override;
+
+  virtual void Get(const LookupKey& k, void* callback_args,
+                   bool (*callback_func)(void* arg,
+                                         const char* entry)) override;
+
+  virtual ~HashLinkListRep();
+
+  virtual MemTableRep::Iterator* GetIterator(Arena* arena = nullptr) override;
+
+  virtual MemTableRep::Iterator* GetIterator(const Slice& slice) override;
+
+  virtual MemTableRep::Iterator* GetDynamicPrefixIterator(
+      Arena* arena = nullptr) override;
+
+ private:
+  friend class DynamicIterator;
+  typedef SkipList<const char*, const MemTableRep::KeyComparator&> FullList;
+
+  size_t bucket_size_;
+
+  // Maps slices (which are transformed user keys) to buckets of keys sharing
+  // the same transform.
+  port::AtomicPointer* buckets_;
+
+  // The user-supplied transform whose domain is the user keys.
+  const SliceTransform* transform_;
+
+  const MemTableRep::KeyComparator& compare_;
+
+  bool BucketContains(Node* head, const Slice& key) const;
+
+  Slice GetPrefix(const Slice& internal_key) const {
+    return transform_->Transform(ExtractUserKey(internal_key));
+  }
+
+  size_t GetHash(const Slice& slice) const {
+    return MurmurHash(slice.data(), slice.size(), 0) % bucket_size_;
+  }
+
+  Node* GetBucket(size_t i) const {
+    return static_cast<Node*>(buckets_[i].Acquire_Load());
+  }
+
+  Node* GetBucket(const Slice& slice) const {
+    return GetBucket(GetHash(slice));
+  }
+
+  bool Equal(const Slice& a, const Key& b) const {
+    return (compare_(b, a) == 0);
+  }
+
+  bool Equal(const Key& a, const Key& b) const { return (compare_(a, b) == 0); }
+
+  bool KeyIsAfterNode(const Slice& internal_key, const Node* n) const {
+    // nullptr n is considered infinite
+    return (n != nullptr) && (compare_(n->key, internal_key) < 0);
+  }
+
+  bool KeyIsAfterNode(const Key& key, const Node* n) const {
+    // nullptr n is considered infinite
+    return (n != nullptr) && (compare_(n->key, key) < 0);
+  }
+
+  Node* FindGreaterOrEqualInBucket(Node* head, const Slice& key) const;
+
+  class FullListIterator : public MemTableRep::Iterator {
+   public:
+    explicit FullListIterator(FullList* list, Arena* arena)
+        : iter_(list), full_list_(list), arena_(arena) {}
+
+    virtual ~FullListIterator() {
+    }
+
+    // Returns true iff the iterator is positioned at a valid node.
+    virtual bool Valid() const {
+      return iter_.Valid();
+    }
+
+    // Returns the key at the current position.
+    // REQUIRES: Valid()
+    virtual const char* key() const {
+      assert(Valid());
+      return iter_.key();
+    }
+
+    // Advances to the next position.
+    // REQUIRES: Valid()
+    virtual void Next() {
+      assert(Valid());
+      iter_.Next();
+    }
+
+    // Advances to the previous position.
+    // REQUIRES: Valid()
+    virtual void Prev() {
+      assert(Valid());
+      iter_.Prev();
+    }
+
+    // Advance to the first entry with a key >= target
+    virtual void Seek(const Slice& internal_key, const char* memtable_key) {
+      const char* encoded_key =
+          (memtable_key != nullptr) ?
+              memtable_key : EncodeKey(&tmp_, internal_key);
+      iter_.Seek(encoded_key);
+    }
+
+    // Position at the first entry in collection.
+    // Final state of iterator is Valid() iff collection is not empty.
+    virtual void SeekToFirst() {
+      iter_.SeekToFirst();
+    }
+
+    // Position at the last entry in collection.
+    // Final state of iterator is Valid() iff collection is not empty.
+    virtual void SeekToLast() {
+      iter_.SeekToLast();
+    }
+   private:
+    FullList::Iterator iter_;
+    // To destruct with the iterator.
+    std::unique_ptr<FullList> full_list_;
+    std::unique_ptr<Arena> arena_;
+    std::string tmp_;  // For passing to EncodeKey
+  };
+
+  class Iterator : public MemTableRep::Iterator {
+   public:
+    explicit Iterator(const HashLinkListRep* const hash_link_list_rep,
+                      Node* head) :
+        hash_link_list_rep_(hash_link_list_rep), head_(head), node_(nullptr) {
+    }
+
+    virtual ~Iterator() {
+    }
+
+    // Returns true iff the iterator is positioned at a valid node.
+    virtual bool Valid() const {
+      return node_ != nullptr;
+    }
+
+    // Returns the key at the current position.
+    // REQUIRES: Valid()
+    virtual const char* key() const {
+      assert(Valid());
+      return node_->key;
+    }
+
+    // Advances to the next position.
+    // REQUIRES: Valid()
+    virtual void Next() {
+      assert(Valid());
+      node_ = node_->Next();
+    }
+
+    // Advances to the previous position.
+    // REQUIRES: Valid()
+    virtual void Prev() {
+      // Prefix iterator does not support total order.
+      // We simply set the iterator to the invalid state.
+      Reset(nullptr);
+    }
+
+    // Advance to the first entry with a key >= target
+    virtual void Seek(const Slice& internal_key, const char* memtable_key) {
+      node_ = hash_link_list_rep_->FindGreaterOrEqualInBucket(head_,
+                                                              internal_key);
+    }
+
+    // Position at the first entry in collection.
+    // Final state of iterator is Valid() iff collection is not empty.
+    virtual void SeekToFirst() {
+      // Prefix iterator does not support total order.
+      // We simply set the iterator to the invalid state.
+      Reset(nullptr);
+    }
+
+    // Position at the last entry in collection.
+    // Final state of iterator is Valid() iff collection is not empty.
+    virtual void SeekToLast() {
+      // Prefix iterator does not support total order.
+      // We simply set the iterator to the invalid state.
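+      // (Bucket iterators only see keys that share one prefix, and keys are
+      // ordered only within a bucket; there is no cheap way to find a global
+      // first or last entry. Callers that need a total-order scan go through
+      // GetIterator(), which copies every bucket into a single sorted
+      // skiplist -- see HashLinkListRep::GetIterator below.)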
+      Reset(nullptr);
+    }
+
+   protected:
+    void Reset(Node* head) {
+      head_ = head;
+      node_ = nullptr;
+    }
+   private:
+    friend class HashLinkListRep;
+    const HashLinkListRep* const hash_link_list_rep_;
+    Node* head_;
+    Node* node_;
+
+    virtual void SeekToHead() {
+      node_ = head_;
+    }
+  };
+
+  class DynamicIterator : public HashLinkListRep::Iterator {
+   public:
+    explicit DynamicIterator(HashLinkListRep& memtable_rep)
+        : HashLinkListRep::Iterator(&memtable_rep, nullptr),
+          memtable_rep_(memtable_rep) {}
+
+    // Advance to the first entry with a key >= target
+    virtual void Seek(const Slice& k, const char* memtable_key) {
+      auto transformed = memtable_rep_.GetPrefix(k);
+      Reset(memtable_rep_.GetBucket(transformed));
+      HashLinkListRep::Iterator::Seek(k, memtable_key);
+    }
+
+   private:
+    // the underlying memtable
+    const HashLinkListRep& memtable_rep_;
+  };
+
+  class EmptyIterator : public MemTableRep::Iterator {
+    // This is used when there wasn't a bucket. It is cheaper than
+    // instantiating an empty bucket over which to iterate.
+   public:
+    EmptyIterator() { }
+    virtual bool Valid() const {
+      return false;
+    }
+    virtual const char* key() const {
+      assert(false);
+      return nullptr;
+    }
+    virtual void Next() { }
+    virtual void Prev() { }
+    virtual void Seek(const Slice& user_key, const char* memtable_key) { }
+    virtual void SeekToFirst() { }
+    virtual void SeekToLast() { }
+   private:
+  };
+};
+
+HashLinkListRep::HashLinkListRep(const MemTableRep::KeyComparator& compare,
+                                 Arena* arena, const SliceTransform* transform,
+                                 size_t bucket_size, size_t huge_page_tlb_size,
+                                 Logger* logger)
+    : MemTableRep(arena),
+      bucket_size_(bucket_size),
+      transform_(transform),
+      compare_(compare) {
+  char* mem = arena_->AllocateAligned(sizeof(port::AtomicPointer) * bucket_size,
+                                      huge_page_tlb_size, logger);
+
+  buckets_ = new (mem) port::AtomicPointer[bucket_size];
+
+  for (size_t i = 0; i < bucket_size_; ++i) {
+    buckets_[i].NoBarrier_Store(nullptr);
+  }
+}
+
+HashLinkListRep::~HashLinkListRep() {
+}
+
+KeyHandle HashLinkListRep::Allocate(const size_t len, char** buf) {
+  char* mem = arena_->AllocateAligned(sizeof(Node) + len);
+  Node* x = new (mem) Node();
+  *buf = x->key;
+  return static_cast<KeyHandle>(x);
+}
+
+void HashLinkListRep::Insert(KeyHandle handle) {
+  Node* x = static_cast<Node*>(handle);
+  assert(!Contains(x->key));
+  Slice internal_key = GetLengthPrefixedSlice(x->key);
+  auto transformed = GetPrefix(internal_key);
+  auto& bucket = buckets_[GetHash(transformed)];
+  Node* head = static_cast<Node*>(bucket.Acquire_Load());
+
+  if (!head) {
+    // NoBarrier_SetNext() suffices since we will add a barrier when
+    // we publish a pointer to "x" in prev[i].
+    x->NoBarrier_SetNext(nullptr);
+    bucket.Release_Store(static_cast<void*>(x));
+    return;
+  }
+
+  Node* cur = head;
+  Node* prev = nullptr;
+  while (true) {
+    if (cur == nullptr) {
+      break;
+    }
+    Node* next = cur->Next();
+    // Make sure the lists are sorted.
+    // If x points to head_ or next points to nullptr, it is trivially
+    // satisfied.
+    assert((cur == head) || (next == nullptr) ||
+           KeyIsAfterNode(next->key, cur));
+    if (KeyIsAfterNode(internal_key, cur)) {
+      // Keep searching in this list
+      prev = cur;
+      cur = next;
+    } else {
+      break;
+    }
+  }
+
+  // Our data structure does not allow duplicate insertion
+  assert(cur == nullptr || !Equal(x->key, cur->key));
+
+  // NoBarrier_SetNext() suffices since we will add a barrier when
+  // we publish a pointer to "x" in prev[i].
+  x->NoBarrier_SetNext(cur);
+
+  if (prev) {
+    prev->SetNext(x);
+  } else {
+    bucket.Release_Store(static_cast<void*>(x));
+  }
+}
+
+bool HashLinkListRep::Contains(const char* key) const {
+  Slice internal_key = GetLengthPrefixedSlice(key);
+
+  auto transformed = GetPrefix(internal_key);
+  auto bucket = GetBucket(transformed);
+  if (bucket == nullptr) {
+    return false;
+  }
+  return BucketContains(bucket, internal_key);
+}
+
+size_t HashLinkListRep::ApproximateMemoryUsage() {
+  // Memory is always allocated from the arena.
+  return 0;
+}
+
+void HashLinkListRep::Get(const LookupKey& k, void* callback_args,
+                          bool (*callback_func)(void* arg, const char* entry)) {
+  auto transformed = transform_->Transform(k.user_key());
+  auto bucket = GetBucket(transformed);
+  if (bucket != nullptr) {
+    Iterator iter(this, bucket);
+    for (iter.Seek(k.internal_key(), nullptr);
+         iter.Valid() && callback_func(callback_args, iter.key());
+         iter.Next()) {
+    }
+  }
+}
+
+MemTableRep::Iterator* HashLinkListRep::GetIterator(Arena* alloc_arena) {
+  // allocate a new arena of similar size to the one currently in use
+  Arena* new_arena = new Arena(arena_->BlockSize());
+  auto list = new FullList(compare_, new_arena);
+  for (size_t i = 0; i < bucket_size_; ++i) {
+    auto bucket = GetBucket(i);
+    if (bucket != nullptr) {
+      Iterator itr(this, bucket);
+      for (itr.SeekToHead(); itr.Valid(); itr.Next()) {
+        list->Insert(itr.key());
+      }
+    }
+  }
+  if (alloc_arena == nullptr) {
+    return new FullListIterator(list, new_arena);
+  } else {
+    auto mem = alloc_arena->AllocateAligned(sizeof(FullListIterator));
+    return new (mem) FullListIterator(list, new_arena);
+  }
+}
+
+MemTableRep::Iterator* HashLinkListRep::GetIterator(const Slice& slice) {
+  auto bucket = GetBucket(transform_->Transform(slice));
+  if (bucket == nullptr) {
+    return new EmptyIterator();
+  }
+  return new Iterator(this, bucket);
+}
+
+MemTableRep::Iterator* HashLinkListRep::GetDynamicPrefixIterator(
+    Arena* alloc_arena) {
+  if (alloc_arena == nullptr) {
+    return new DynamicIterator(*this);
+  } else {
+    auto mem = alloc_arena->AllocateAligned(sizeof(DynamicIterator));
+    return new (mem) DynamicIterator(*this);
+  }
+}
+
+bool HashLinkListRep::BucketContains(Node* head, const Slice& user_key) const {
+  Node* x = FindGreaterOrEqualInBucket(head, user_key);
+  return (x != nullptr && Equal(user_key, x->key));
+}
+
+Node* HashLinkListRep::FindGreaterOrEqualInBucket(Node* head,
+                                                  const Slice& key) const {
+  Node* x = head;
+  while (true) {
+    if (x == nullptr) {
+      return x;
+    }
+    Node* next = x->Next();
+    // Make sure the lists are sorted.
+    // If x points to head_ or next points to nullptr, it is trivially
+    // satisfied.
+ assert((x == head) || (next == nullptr) || KeyIsAfterNode(next->key, x)); + if (KeyIsAfterNode(key, x)) { + // Keep searching in this list + x = next; + } else { + break; + } + } + return x; +} + +} // anon namespace + +MemTableRep* HashLinkListRepFactory::CreateMemTableRep( + const MemTableRep::KeyComparator& compare, Arena* arena, + const SliceTransform* transform, Logger* logger) { + return new HashLinkListRep(compare, arena, transform, bucket_count_, + huge_page_tlb_size_, logger); +} + +MemTableRepFactory* NewHashLinkListRepFactory(size_t bucket_count, + size_t huge_page_tlb_size) { + return new HashLinkListRepFactory(bucket_count, huge_page_tlb_size); +} + +} // namespace rocksdb +#endif // ROCKSDB_LITE diff --git a/util/hash_linklist_rep.h b/util/hash_linklist_rep.h new file mode 100644 index 0000000000..bf96e8b0e5 --- /dev/null +++ b/util/hash_linklist_rep.h @@ -0,0 +1,38 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef ROCKSDB_LITE +#pragma once +#include "rocksdb/slice_transform.h" +#include "rocksdb/memtablerep.h" + +namespace rocksdb { + +class HashLinkListRepFactory : public MemTableRepFactory { + public: + explicit HashLinkListRepFactory(size_t bucket_count, + size_t huge_page_tlb_size) + : bucket_count_(bucket_count), huge_page_tlb_size_(huge_page_tlb_size) {} + + virtual ~HashLinkListRepFactory() {} + + virtual MemTableRep* CreateMemTableRep( + const MemTableRep::KeyComparator& compare, Arena* arena, + const SliceTransform* transform, Logger* logger) override; + + virtual const char* Name() const override { + return "HashLinkListRepFactory"; + } + + private: + const size_t bucket_count_; + const size_t huge_page_tlb_size_; +}; + +} +#endif // ROCKSDB_LITE diff --git a/util/hash_skiplist_rep.cc b/util/hash_skiplist_rep.cc new file mode 100644 index 0000000000..baee12ad53 --- /dev/null +++ b/util/hash_skiplist_rep.cc @@ -0,0 +1,347 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
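+//
+// HashSkipListRep hashes transformed (prefix) keys into buckets, and each
+// bucket is a full skip list.  A minimal wiring sketch: the factory call
+// matches NewHashSkipListRepFactory() defined below, while the option values
+// and the exact Options field types are illustrative and may vary by
+// rocksdb version:
+//
+//   rocksdb::Options options;
+//   options.prefix_extractor.reset(rocksdb::NewFixedPrefixTransform(8));
+//   options.memtable_factory.reset(rocksdb::NewHashSkipListRepFactory(
+//       1000000 /* bucket_count */, 4 /* skiplist_height */,
+//       4 /* skiplist_branching_factor */));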
+//
+
+#ifndef ROCKSDB_LITE
+#include "util/hash_skiplist_rep.h"
+
+#include "rocksdb/memtablerep.h"
+#include "util/arena.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "port/port.h"
+#include "port/atomic_pointer.h"
+#include "util/murmurhash.h"
+#include "db/memtable.h"
+#include "db/skiplist.h"
+
+namespace rocksdb {
+namespace {
+
+class HashSkipListRep : public MemTableRep {
+ public:
+  HashSkipListRep(const MemTableRep::KeyComparator& compare, Arena* arena,
+                  const SliceTransform* transform, size_t bucket_size,
+                  int32_t skiplist_height, int32_t skiplist_branching_factor);
+
+  virtual void Insert(KeyHandle handle) override;
+
+  virtual bool Contains(const char* key) const override;
+
+  virtual size_t ApproximateMemoryUsage() override;
+
+  virtual void Get(const LookupKey& k, void* callback_args,
+                   bool (*callback_func)(void* arg,
+                                         const char* entry)) override;
+
+  virtual ~HashSkipListRep();
+
+  virtual MemTableRep::Iterator* GetIterator(Arena* arena = nullptr) override;
+
+  virtual MemTableRep::Iterator* GetIterator(const Slice& slice) override;
+
+  virtual MemTableRep::Iterator* GetDynamicPrefixIterator(
+      Arena* arena = nullptr) override;
+
+ private:
+  friend class DynamicIterator;
+  typedef SkipList<const char*, const MemTableRep::KeyComparator&> Bucket;
+
+  size_t bucket_size_;
+
+  const int32_t skiplist_height_;
+  const int32_t skiplist_branching_factor_;
+
+  // Maps slices (which are transformed user keys) to buckets of keys sharing
+  // the same transform.
+  port::AtomicPointer* buckets_;
+
+  // The user-supplied transform whose domain is the user keys.
+  const SliceTransform* transform_;
+
+  const MemTableRep::KeyComparator& compare_;
+  // immutable after construction
+  Arena* const arena_;
+
+  inline size_t GetHash(const Slice& slice) const {
+    return MurmurHash(slice.data(), slice.size(), 0) % bucket_size_;
+  }
+  inline Bucket* GetBucket(size_t i) const {
+    return static_cast<Bucket*>(buckets_[i].Acquire_Load());
+  }
+  inline Bucket* GetBucket(const Slice& slice) const {
+    return GetBucket(GetHash(slice));
+  }
+  // Get a bucket from buckets_. If the bucket hasn't been initialized yet,
+  // initialize it before returning.
+  Bucket* GetInitializedBucket(const Slice& transformed);
+
+  class Iterator : public MemTableRep::Iterator {
+   public:
+    explicit Iterator(Bucket* list, bool own_list = true,
+                      Arena* arena = nullptr)
+        : list_(list), iter_(list), own_list_(own_list), arena_(arena) {}
+
+    virtual ~Iterator() {
+      // if we own the list, we should also delete it
+      if (own_list_) {
+        assert(list_ != nullptr);
+        delete list_;
+      }
+    }
+
+    // Returns true iff the iterator is positioned at a valid node.
+    virtual bool Valid() const {
+      return list_ != nullptr && iter_.Valid();
+    }
+
+    // Returns the key at the current position.
+    // REQUIRES: Valid()
+    virtual const char* key() const {
+      assert(Valid());
+      return iter_.key();
+    }
+
+    // Advances to the next position.
+    // REQUIRES: Valid()
+    virtual void Next() {
+      assert(Valid());
+      iter_.Next();
+    }
+
+    // Advances to the previous position.
+    // REQUIRES: Valid()
+    virtual void Prev() {
+      assert(Valid());
+      iter_.Prev();
+    }
+
+    // Advance to the first entry with a key >= target
+    virtual void Seek(const Slice& internal_key, const char* memtable_key) {
+      if (list_ != nullptr) {
+        const char* encoded_key =
+            (memtable_key != nullptr) ?
+                memtable_key : EncodeKey(&tmp_, internal_key);
+        iter_.Seek(encoded_key);
+      }
+    }
+
+    // Position at the first entry in collection.
+    // Final state of iterator is Valid() iff collection is not empty.
+ virtual void SeekToFirst() { + if (list_ != nullptr) { + iter_.SeekToFirst(); + } + } + + // Position at the last entry in collection. + // Final state of iterator is Valid() iff collection is not empty. + virtual void SeekToLast() { + if (list_ != nullptr) { + iter_.SeekToLast(); + } + } + protected: + void Reset(Bucket* list) { + if (own_list_) { + assert(list_ != nullptr); + delete list_; + } + list_ = list; + iter_.SetList(list); + own_list_ = false; + } + private: + // if list_ is nullptr, we should NEVER call any methods on iter_ + // if list_ is nullptr, this Iterator is not Valid() + Bucket* list_; + Bucket::Iterator iter_; + // here we track if we own list_. If we own it, we are also + // responsible for it's cleaning. This is a poor man's shared_ptr + bool own_list_; + std::unique_ptr arena_; + std::string tmp_; // For passing to EncodeKey + }; + + class DynamicIterator : public HashSkipListRep::Iterator { + public: + explicit DynamicIterator(const HashSkipListRep& memtable_rep) + : HashSkipListRep::Iterator(nullptr, false), + memtable_rep_(memtable_rep) {} + + // Advance to the first entry with a key >= target + virtual void Seek(const Slice& k, const char* memtable_key) { + auto transformed = memtable_rep_.transform_->Transform(ExtractUserKey(k)); + Reset(memtable_rep_.GetBucket(transformed)); + HashSkipListRep::Iterator::Seek(k, memtable_key); + } + + // Position at the first entry in collection. + // Final state of iterator is Valid() iff collection is not empty. + virtual void SeekToFirst() { + // Prefix iterator does not support total order. + // We simply set the iterator to invalid state + Reset(nullptr); + } + + // Position at the last entry in collection. + // Final state of iterator is Valid() iff collection is not empty. + virtual void SeekToLast() { + // Prefix iterator does not support total order. + // We simply set the iterator to invalid state + Reset(nullptr); + } + private: + // the underlying memtable + const HashSkipListRep& memtable_rep_; + }; + + class EmptyIterator : public MemTableRep::Iterator { + // This is used when there wasn't a bucket. It is cheaper than + // instantiating an empty bucket over which to iterate. 
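+  // (GetIterator(const Slice& slice) below returns an EmptyIterator when
+  // the requested prefix has no bucket yet.)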
+ public: + EmptyIterator() { } + virtual bool Valid() const { + return false; + } + virtual const char* key() const { + assert(false); + return nullptr; + } + virtual void Next() { } + virtual void Prev() { } + virtual void Seek(const Slice& internal_key, + const char* memtable_key) { } + virtual void SeekToFirst() { } + virtual void SeekToLast() { } + private: + }; +}; + +HashSkipListRep::HashSkipListRep(const MemTableRep::KeyComparator& compare, + Arena* arena, const SliceTransform* transform, + size_t bucket_size, int32_t skiplist_height, + int32_t skiplist_branching_factor) + : MemTableRep(arena), + bucket_size_(bucket_size), + skiplist_height_(skiplist_height), + skiplist_branching_factor_(skiplist_branching_factor), + transform_(transform), + compare_(compare), + arena_(arena) { + buckets_ = new port::AtomicPointer[bucket_size]; + + for (size_t i = 0; i < bucket_size_; ++i) { + buckets_[i].NoBarrier_Store(nullptr); + } +} + +HashSkipListRep::~HashSkipListRep() { + delete[] buckets_; +} + +HashSkipListRep::Bucket* HashSkipListRep::GetInitializedBucket( + const Slice& transformed) { + size_t hash = GetHash(transformed); + auto bucket = GetBucket(hash); + if (bucket == nullptr) { + auto addr = arena_->AllocateAligned(sizeof(Bucket)); + bucket = new (addr) Bucket(compare_, arena_, skiplist_height_, + skiplist_branching_factor_); + buckets_[hash].Release_Store(static_cast(bucket)); + } + return bucket; +} + +void HashSkipListRep::Insert(KeyHandle handle) { + auto* key = static_cast(handle); + assert(!Contains(key)); + auto transformed = transform_->Transform(UserKey(key)); + auto bucket = GetInitializedBucket(transformed); + bucket->Insert(key); +} + +bool HashSkipListRep::Contains(const char* key) const { + auto transformed = transform_->Transform(UserKey(key)); + auto bucket = GetBucket(transformed); + if (bucket == nullptr) { + return false; + } + return bucket->Contains(key); +} + +size_t HashSkipListRep::ApproximateMemoryUsage() { + return sizeof(buckets_); +} + +void HashSkipListRep::Get(const LookupKey& k, void* callback_args, + bool (*callback_func)(void* arg, const char* entry)) { + auto transformed = transform_->Transform(k.user_key()); + auto bucket = GetBucket(transformed); + if (bucket != nullptr) { + Bucket::Iterator iter(bucket); + for (iter.Seek(k.memtable_key().data()); + iter.Valid() && callback_func(callback_args, iter.key()); + iter.Next()) { + } + } +} + +MemTableRep::Iterator* HashSkipListRep::GetIterator(Arena* arena) { + // allocate a new arena of similar size to the one currently in use + Arena* new_arena = new Arena(arena_->BlockSize()); + auto list = new Bucket(compare_, new_arena); + for (size_t i = 0; i < bucket_size_; ++i) { + auto bucket = GetBucket(i); + if (bucket != nullptr) { + Bucket::Iterator itr(bucket); + for (itr.SeekToFirst(); itr.Valid(); itr.Next()) { + list->Insert(itr.key()); + } + } + } + if (arena == nullptr) { + return new Iterator(list, true, new_arena); + } else { + auto mem = arena->AllocateAligned(sizeof(Iterator)); + return new (mem) Iterator(list, true, new_arena); + } +} + +MemTableRep::Iterator* HashSkipListRep::GetIterator(const Slice& slice) { + auto bucket = GetBucket(transform_->Transform(slice)); + if (bucket == nullptr) { + return new EmptyIterator(); + } + return new Iterator(bucket, false); +} + +MemTableRep::Iterator* HashSkipListRep::GetDynamicPrefixIterator(Arena* arena) { + if (arena == nullptr) { + return new DynamicIterator(*this); + } else { + auto mem = arena->AllocateAligned(sizeof(DynamicIterator)); + return new 
(mem) DynamicIterator(*this);
+  }
+}
+
+}  // anon namespace
+
+MemTableRep* HashSkipListRepFactory::CreateMemTableRep(
+    const MemTableRep::KeyComparator& compare, Arena* arena,
+    const SliceTransform* transform, Logger* logger) {
+  return new HashSkipListRep(compare, arena, transform, bucket_count_,
+                             skiplist_height_, skiplist_branching_factor_);
+}
+
+MemTableRepFactory* NewHashSkipListRepFactory(
+    size_t bucket_count, int32_t skiplist_height,
+    int32_t skiplist_branching_factor) {
+  return new HashSkipListRepFactory(bucket_count, skiplist_height,
+                                    skiplist_branching_factor);
+}
+
+}  // namespace rocksdb
+#endif  // ROCKSDB_LITE
diff --git a/util/hash_skiplist_rep.h b/util/hash_skiplist_rep.h
new file mode 100644
index 0000000000..6fec60a47a
--- /dev/null
+++ b/util/hash_skiplist_rep.h
@@ -0,0 +1,43 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef ROCKSDB_LITE
+#pragma once
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/memtablerep.h"
+
+namespace rocksdb {
+
+class HashSkipListRepFactory : public MemTableRepFactory {
+ public:
+  explicit HashSkipListRepFactory(
+      size_t bucket_count,
+      int32_t skiplist_height,
+      int32_t skiplist_branching_factor)
+      : bucket_count_(bucket_count),
+        skiplist_height_(skiplist_height),
+        skiplist_branching_factor_(skiplist_branching_factor) { }
+
+  virtual ~HashSkipListRepFactory() {}
+
+  virtual MemTableRep* CreateMemTableRep(
+      const MemTableRep::KeyComparator& compare, Arena* arena,
+      const SliceTransform* transform, Logger* logger) override;
+
+  virtual const char* Name() const override {
+    return "HashSkipListRepFactory";
+  }
+
+ private:
+  const size_t bucket_count_;
+  const int32_t skiplist_height_;
+  const int32_t skiplist_branching_factor_;
+};
+
+}  // namespace rocksdb
+#endif  // ROCKSDB_LITE
diff --git a/util/histogram.cc b/util/histogram.cc
new file mode 100644
index 0000000000..968769cef5
--- /dev/null
+++ b/util/histogram.cc
@@ -0,0 +1,198 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "util/histogram.h"
+
+#include <cassert>
+#include <math.h>
+#include <stdio.h>
+#include "port/port.h"
+
+namespace rocksdb {
+
+HistogramBucketMapper::HistogramBucketMapper()
+    :
+      // Add newer bucket index here.
+      // Should always be added in sorted order.
+ // If you change this, you also need to change + // size of array buckets_ in HistogramImpl + bucketValues_( + {1, 2, 3, 4, 5, 6, + 7, 8, 9, 10, 12, 14, + 16, 18, 20, 25, 30, 35, + 40, 45, 50, 60, 70, 80, + 90, 100, 120, 140, 160, 180, + 200, 250, 300, 350, 400, 450, + 500, 600, 700, 800, 900, 1000, + 1200, 1400, 1600, 1800, 2000, 2500, + 3000, 3500, 4000, 4500, 5000, 6000, + 7000, 8000, 9000, 10000, 12000, 14000, + 16000, 18000, 20000, 25000, 30000, 35000, + 40000, 45000, 50000, 60000, 70000, 80000, + 90000, 100000, 120000, 140000, 160000, 180000, + 200000, 250000, 300000, 350000, 400000, 450000, + 500000, 600000, 700000, 800000, 900000, 1000000, + 1200000, 1400000, 1600000, 1800000, 2000000, 2500000, + 3000000, 3500000, 4000000, 4500000, 5000000, 6000000, + 7000000, 8000000, 9000000, 10000000, 12000000, 14000000, + 16000000, 18000000, 20000000, 25000000, 30000000, 35000000, + 40000000, 45000000, 50000000, 60000000, 70000000, 80000000, + 90000000, 100000000, 120000000, 140000000, 160000000, 180000000, + 200000000, 250000000, 300000000, 350000000, 400000000, 450000000, + 500000000, 600000000, 700000000, 800000000, 900000000, 1000000000}), + maxBucketValue_(bucketValues_.back()), + minBucketValue_(bucketValues_.front()) { + for (size_t i =0; i < bucketValues_.size(); ++i) { + valueIndexMap_[bucketValues_[i]] = i; + } +} + +const size_t HistogramBucketMapper::IndexForValue(const uint64_t value) const { + if (value >= maxBucketValue_) { + return bucketValues_.size() - 1; + } else if ( value >= minBucketValue_ ) { + std::map::const_iterator lowerBound = + valueIndexMap_.lower_bound(value); + if (lowerBound != valueIndexMap_.end()) { + return lowerBound->second; + } else { + return 0; + } + } else { + return 0; + } +} + +namespace { + const HistogramBucketMapper bucketMapper; +} + +void HistogramImpl::Clear() { + min_ = bucketMapper.LastValue(); + max_ = 0; + num_ = 0; + sum_ = 0; + sum_squares_ = 0; + memset(buckets_, 0, sizeof buckets_); +} + +bool HistogramImpl::Empty() { return sum_squares_ == 0; } + +void HistogramImpl::Add(uint64_t value) { + const size_t index = bucketMapper.IndexForValue(value); + buckets_[index] += 1; + if (min_ > value) min_ = value; + if (max_ < value) max_ = value; + num_++; + sum_ += value; + sum_squares_ += (value * value); +} + +void HistogramImpl::Merge(const HistogramImpl& other) { + if (other.min_ < min_) min_ = other.min_; + if (other.max_ > max_) max_ = other.max_; + num_ += other.num_; + sum_ += other.sum_; + sum_squares_ += other.sum_squares_; + for (unsigned int b = 0; b < bucketMapper.BucketCount(); b++) { + buckets_[b] += other.buckets_[b]; + } +} + +double HistogramImpl::Median() const { + return Percentile(50.0); +} + +double HistogramImpl::Percentile(double p) const { + double threshold = num_ * (p / 100.0); + double sum = 0; + for (unsigned int b = 0; b < bucketMapper.BucketCount(); b++) { + sum += buckets_[b]; + if (sum >= threshold) { + // Scale linearly within this bucket + double left_point = (b == 0) ? 
0 : bucketMapper.BucketLimit(b-1);
+      double right_point = bucketMapper.BucketLimit(b);
+      double left_sum = sum - buckets_[b];
+      double right_sum = sum;
+      double pos = 0;
+      double right_left_diff = right_sum - left_sum;
+      if (right_left_diff != 0) {
+        pos = (threshold - left_sum) / (right_sum - left_sum);
+      }
+      double r = left_point + (right_point - left_point) * pos;
+      if (r < min_) r = min_;
+      if (r > max_) r = max_;
+      return r;
+    }
+  }
+  return max_;
+}
+
+double HistogramImpl::Average() const {
+  if (num_ == 0.0) return 0;
+  return sum_ / num_;
+}
+
+double HistogramImpl::StandardDeviation() const {
+  if (num_ == 0.0) return 0;
+  double variance = (sum_squares_ * num_ - sum_ * sum_) / (num_ * num_);
+  return sqrt(variance);
+}
+
+std::string HistogramImpl::ToString() const {
+  std::string r;
+  char buf[200];
+  snprintf(buf, sizeof(buf),
+           "Count: %.0f Average: %.4f StdDev: %.2f\n",
+           num_, Average(), StandardDeviation());
+  r.append(buf);
+  snprintf(buf, sizeof(buf),
+           "Min: %.4f Median: %.4f Max: %.4f\n",
+           (num_ == 0.0 ? 0.0 : min_), Median(), max_);
+  r.append(buf);
+  snprintf(buf, sizeof(buf),
+           "Percentiles: "
+           "P50: %.2f P75: %.2f P99: %.2f P99.9: %.2f P99.99: %.2f\n",
+           Percentile(50), Percentile(75), Percentile(99), Percentile(99.9),
+           Percentile(99.99));
+  r.append(buf);
+  r.append("------------------------------------------------------\n");
+  const double mult = 100.0 / num_;
+  double sum = 0;
+  for (unsigned int b = 0; b < bucketMapper.BucketCount(); b++) {
+    if (buckets_[b] <= 0.0) continue;
+    sum += buckets_[b];
+    snprintf(buf, sizeof(buf),
+             "[ %7lu, %7lu ) %8lu %7.3f%% %7.3f%% ",
+             // left
+             (unsigned long)((b == 0) ? 0 : bucketMapper.BucketLimit(b-1)),
+             (unsigned long)bucketMapper.BucketLimit(b),  // right
+             (unsigned long)buckets_[b],                  // count
+             (mult * buckets_[b]),                        // percentage
+             (mult * sum));                               // cumulative percentage
+    r.append(buf);
+
+    // Add hash marks based on percentage; 20 marks for 100%.
+    int marks = static_cast<int>(20 * (buckets_[b] / num_) + 0.5);
+    r.append(marks, '#');
+    r.push_back('\n');
+  }
+  return r;
+}
+
+void HistogramImpl::Data(HistogramData* const data) const {
+  assert(data);
+  data->median = Median();
+  data->percentile95 = Percentile(95);
+  data->percentile99 = Percentile(99);
+  data->average = Average();
+  data->standard_deviation = StandardDeviation();
+}
+
+}  // namespace rocksdb
diff --git a/util/histogram.h b/util/histogram.h
new file mode 100644
index 0000000000..d95588dc2d
--- /dev/null
+++ b/util/histogram.h
@@ -0,0 +1,79 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include "rocksdb/statistics.h"
+
+#include <cassert>
+#include <string>
+#include <vector>
+#include <map>
+
+namespace rocksdb {
+
+class HistogramBucketMapper {
+ public:
+
+  HistogramBucketMapper();
+
+  // converts a value to the bucket index.
+  const size_t IndexForValue(const uint64_t value) const;
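+  // For example, given the bucketValues_ table in histogram.cc:
+  //   IndexForValue(1)          == 0                 (first bucket, limit 1)
+  //   IndexForValue(11)         == 10                (lower_bound rounds up to limit 12)
+  //   IndexForValue(2000000000) == BucketCount() - 1 (clamped to the last bucket)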
+  // number of buckets required.
+
+  const size_t BucketCount() const {
+    return bucketValues_.size();
+  }
+
+  uint64_t LastValue() const {
+    return maxBucketValue_;
+  }
+
+  uint64_t FirstValue() const {
+    return minBucketValue_;
+  }
+
+  uint64_t BucketLimit(const uint64_t bucketNumber) const {
+    assert(bucketNumber < BucketCount());
+    return bucketValues_[bucketNumber];
+  }
+
+ private:
+  const std::vector<uint64_t> bucketValues_;
+  const uint64_t maxBucketValue_;
+  const uint64_t minBucketValue_;
+  std::map<uint64_t, uint64_t> valueIndexMap_;
+};
+
+class HistogramImpl {
+ public:
+  virtual void Clear();
+  virtual bool Empty();
+  virtual void Add(uint64_t value);
+  void Merge(const HistogramImpl& other);
+
+  virtual std::string ToString() const;
+
+  virtual double Median() const;
+  virtual double Percentile(double p) const;
+  virtual double Average() const;
+  virtual double StandardDeviation() const;
+  virtual void Data(HistogramData* const data) const;
+
+ private:
+  // To be able to use HistogramImpl as a thread-local variable, its
+  // initialization has to be static. That's why the values from
+  // HistogramBucketMapper are repeated manually here.
+  double min_ = 1000000000;      // this is BucketMapper::LastValue()
+  double max_ = 0;
+  double num_ = 0;
+  double sum_ = 0;
+  double sum_squares_ = 0;
+  uint64_t buckets_[138] = {0};  // this is BucketMapper::BucketCount()
+};
+
+}  // namespace rocksdb
diff --git a/util/histogram_test.cc b/util/histogram_test.cc
new file mode 100644
index 0000000000..065f9579ab
--- /dev/null
+++ b/util/histogram_test.cc
@@ -0,0 +1,62 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+#include "util/histogram.h"
+
+#include "util/testharness.h"
+
+namespace rocksdb {
+
+class HistogramTest { };
+
+TEST(HistogramTest, BasicOperation) {
+
+  HistogramImpl histogram;
+  for (uint64_t i = 1; i <= 100; i++) {
+    histogram.Add(i);
+  }
+
+  {
+    double median = histogram.Median();
+    // ASSERT_LE(median, 50);
+    ASSERT_GT(median, 0);
+  }
+
+  {
+    double percentile100 = histogram.Percentile(100.0);
+    ASSERT_LE(percentile100, 100.0);
+    ASSERT_GT(percentile100, 0.0);
+    double percentile99 = histogram.Percentile(99.0);
+    double percentile85 = histogram.Percentile(85.0);
+    ASSERT_LE(percentile99, 99.0);
+    ASSERT_TRUE(percentile99 >= percentile85);
+  }
+
+  ASSERT_EQ(histogram.Average(), 50.5);  // avg is accurately calculated.
+}
+
+TEST(HistogramTest, EmptyHistogram) {
+  HistogramImpl histogram;
+  ASSERT_EQ(histogram.Median(), 0.0);
+  ASSERT_EQ(histogram.Percentile(85.0), 0.0);
+  ASSERT_EQ(histogram.Average(), 0.0);
+}
+
+TEST(HistogramTest, ClearHistogram) {
+  HistogramImpl histogram;
+  for (uint64_t i = 1; i <= 100; i++) {
+    histogram.Add(i);
+  }
+  histogram.Clear();
+  ASSERT_EQ(histogram.Median(), 0);
+  ASSERT_EQ(histogram.Percentile(85.0), 0);
+  ASSERT_EQ(histogram.Average(), 0);
+}
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  return rocksdb::test::RunAllTests();
+}
diff --git a/util/ldb_cmd.cc b/util/ldb_cmd.cc
new file mode 100644
index 0000000000..597179fd94
--- /dev/null
+++ b/util/ldb_cmd.cc
@@ -0,0 +1,1839 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
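+//
+// Example invocations (hypothetical database path and keys; the option and
+// command names follow the grammar parsed by InitFromCmdLineArgs below):
+//
+//   ./ldb --db=/tmp/test_db put key1 value1
+//   ./ldb --db=/tmp/test_db get key1 --hex
+//   ./ldb --db=/tmp/test_db scan --from=key1 --to=key9 --max_keys=10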
+//
+#ifndef ROCKSDB_LITE
+#include "util/ldb_cmd.h"
+
+#include "db/dbformat.h"
+#include "db/db_impl.h"
+#include "db/log_reader.h"
+#include "db/filename.h"
+#include "db/write_batch_internal.h"
+#include "rocksdb/write_batch.h"
+#include "rocksdb/cache.h"
+#include "util/coding.h"
+#include "utilities/ttl/db_ttl_impl.h"
+
+#include <ctime>
+#include <dirent.h>
+#include <sstream>
+#include <string>
+#include <stdexcept>
+
+namespace rocksdb {
+
+using namespace std;
+
+const string LDBCommand::ARG_DB = "db";
+const string LDBCommand::ARG_HEX = "hex";
+const string LDBCommand::ARG_KEY_HEX = "key_hex";
+const string LDBCommand::ARG_VALUE_HEX = "value_hex";
+const string LDBCommand::ARG_TTL = "ttl";
+const string LDBCommand::ARG_TTL_START = "start_time";
+const string LDBCommand::ARG_TTL_END = "end_time";
+const string LDBCommand::ARG_TIMESTAMP = "timestamp";
+const string LDBCommand::ARG_FROM = "from";
+const string LDBCommand::ARG_TO = "to";
+const string LDBCommand::ARG_MAX_KEYS = "max_keys";
+const string LDBCommand::ARG_BLOOM_BITS = "bloom_bits";
+const string LDBCommand::ARG_COMPRESSION_TYPE = "compression_type";
+const string LDBCommand::ARG_BLOCK_SIZE = "block_size";
+const string LDBCommand::ARG_AUTO_COMPACTION = "auto_compaction";
+const string LDBCommand::ARG_WRITE_BUFFER_SIZE = "write_buffer_size";
+const string LDBCommand::ARG_FILE_SIZE = "file_size";
+const string LDBCommand::ARG_CREATE_IF_MISSING = "create_if_missing";
+
+const char* LDBCommand::DELIM = " ==> ";
+
+LDBCommand* LDBCommand::InitFromCmdLineArgs(
+    int argc,
+    char** argv,
+    const Options& options
+) {
+  vector<string> args;
+  for (int i = 1; i < argc; i++) {
+    args.push_back(argv[i]);
+  }
+  return InitFromCmdLineArgs(args, options);
+}
+
+/**
+ * Parse the command-line arguments and create the appropriate LDBCommand
+ * instance.
+ * The command line arguments must be in the following format:
+ * ./ldb --db=PATH_TO_DB [--commonOpt1=commonOpt1Val] ..
+ *       COMMAND ... [-cmdSpecificOpt1=cmdSpecificOpt1Val] ..
+ * This is similar to the command line format used by HBaseClientTool.
+ * Command name is not included in args.
+ * Returns nullptr if the command-line cannot be parsed.
+ */
+LDBCommand* LDBCommand::InitFromCmdLineArgs(
+    const vector<string>& args,
+    const Options& options
+) {
+  // --x=y command line arguments are added as x->y map entries.
+  map<string, string> option_map;
+
+  // Command-line arguments of the form --hex end up in this array as hex
+  vector<string> flags;
+
+  // Everything other than option_map and flags. Represents commands and
+  // their parameters. For example, "put key1 value1" goes into this vector.
+ vector cmdTokens; + + const string OPTION_PREFIX = "--"; + + for (const auto& arg : args) { + if (arg[0] == '-' && arg[1] == '-'){ + vector splits = stringSplit(arg, '='); + if (splits.size() == 2) { + string optionKey = splits[0].substr(OPTION_PREFIX.size()); + option_map[optionKey] = splits[1]; + } else { + string optionKey = splits[0].substr(OPTION_PREFIX.size()); + flags.push_back(optionKey); + } + } else { + cmdTokens.push_back(arg); + } + } + + if (cmdTokens.size() < 1) { + fprintf(stderr, "Command not specified!"); + return nullptr; + } + + string cmd = cmdTokens[0]; + vector cmdParams(cmdTokens.begin()+1, cmdTokens.end()); + LDBCommand* command = LDBCommand::SelectCommand( + cmd, + cmdParams, + option_map, + flags + ); + + if (command) { + command->SetOptions(options); + } + return command; +} + +LDBCommand* LDBCommand::SelectCommand( + const std::string& cmd, + const vector& cmdParams, + const map& option_map, + const vector& flags + ) { + + if (cmd == GetCommand::Name()) { + return new GetCommand(cmdParams, option_map, flags); + } else if (cmd == PutCommand::Name()) { + return new PutCommand(cmdParams, option_map, flags); + } else if (cmd == BatchPutCommand::Name()) { + return new BatchPutCommand(cmdParams, option_map, flags); + } else if (cmd == ScanCommand::Name()) { + return new ScanCommand(cmdParams, option_map, flags); + } else if (cmd == DeleteCommand::Name()) { + return new DeleteCommand(cmdParams, option_map, flags); + } else if (cmd == ApproxSizeCommand::Name()) { + return new ApproxSizeCommand(cmdParams, option_map, flags); + } else if (cmd == DBQuerierCommand::Name()) { + return new DBQuerierCommand(cmdParams, option_map, flags); + } else if (cmd == CompactorCommand::Name()) { + return new CompactorCommand(cmdParams, option_map, flags); + } else if (cmd == WALDumperCommand::Name()) { + return new WALDumperCommand(cmdParams, option_map, flags); + } else if (cmd == ReduceDBLevelsCommand::Name()) { + return new ReduceDBLevelsCommand(cmdParams, option_map, flags); + } else if (cmd == ChangeCompactionStyleCommand::Name()) { + return new ChangeCompactionStyleCommand(cmdParams, option_map, flags); + } else if (cmd == DBDumperCommand::Name()) { + return new DBDumperCommand(cmdParams, option_map, flags); + } else if (cmd == DBLoaderCommand::Name()) { + return new DBLoaderCommand(cmdParams, option_map, flags); + } else if (cmd == ManifestDumpCommand::Name()) { + return new ManifestDumpCommand(cmdParams, option_map, flags); + } else if (cmd == ListColumnFamiliesCommand::Name()) { + return new ListColumnFamiliesCommand(cmdParams, option_map, flags); + } else if (cmd == InternalDumpCommand::Name()) { + return new InternalDumpCommand(cmdParams, option_map, flags); + } else if (cmd == CheckConsistencyCommand::Name()) { + return new CheckConsistencyCommand(cmdParams, option_map, flags); + } + return nullptr; +} + + +/** + * Parses the specific integer option and fills in the value. + * Returns true if the option is found. + * Returns false if the option is not found or if there is an error parsing the + * value. If there is an error, the specified exec_state is also + * updated. 
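+ *
+ * For example, with "--block_size=4096" in option_map_:
+ *   int n;
+ *   ParseIntOption(option_map_, ARG_BLOCK_SIZE, n, exec_state_);
+ * sets n to 4096 and returns true, while "--block_size=abc" sets exec_state
+ * to failed and returns false.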
+ */ +bool LDBCommand::ParseIntOption(const map& options, + const string& option, int& value, + LDBCommandExecuteResult& exec_state) { + + map::const_iterator itr = option_map_.find(option); + if (itr != option_map_.end()) { + try { + value = stoi(itr->second); + return true; + } catch(const invalid_argument&) { + exec_state = LDBCommandExecuteResult::FAILED(option + + " has an invalid value."); + } catch(const out_of_range&) { + exec_state = LDBCommandExecuteResult::FAILED(option + + " has a value out-of-range."); + } + } + return false; +} + +/** + * Parses the specified option and fills in the value. + * Returns true if the option is found. + * Returns false otherwise. + */ +bool LDBCommand::ParseStringOption(const map& options, + const string& option, string* value) { + auto itr = option_map_.find(option); + if (itr != option_map_.end()) { + *value = itr->second; + return true; + } + return false; +} + +Options LDBCommand::PrepareOptionsForOpenDB() { + + Options opt = options_; + opt.create_if_missing = false; + + map::const_iterator itr; + + int bits; + if (ParseIntOption(option_map_, ARG_BLOOM_BITS, bits, exec_state_)) { + if (bits > 0) { + opt.filter_policy = NewBloomFilterPolicy(bits); + } else { + exec_state_ = LDBCommandExecuteResult::FAILED(ARG_BLOOM_BITS + + " must be > 0."); + } + } + + int block_size; + if (ParseIntOption(option_map_, ARG_BLOCK_SIZE, block_size, exec_state_)) { + if (block_size > 0) { + opt.block_size = block_size; + } else { + exec_state_ = LDBCommandExecuteResult::FAILED(ARG_BLOCK_SIZE + + " must be > 0."); + } + } + + itr = option_map_.find(ARG_AUTO_COMPACTION); + if (itr != option_map_.end()) { + opt.disable_auto_compactions = ! StringToBool(itr->second); + } + + itr = option_map_.find(ARG_COMPRESSION_TYPE); + if (itr != option_map_.end()) { + string comp = itr->second; + if (comp == "no") { + opt.compression = kNoCompression; + } else if (comp == "snappy") { + opt.compression = kSnappyCompression; + } else if (comp == "zlib") { + opt.compression = kZlibCompression; + } else if (comp == "bzip2") { + opt.compression = kBZip2Compression; + } else if (comp == "lz4") { + opt.compression = kLZ4Compression; + } else if (comp == "lz4hc") { + opt.compression = kLZ4HCCompression; + } else { + // Unknown compression. + exec_state_ = LDBCommandExecuteResult::FAILED( + "Unknown compression level: " + comp); + } + } + + int write_buffer_size; + if (ParseIntOption(option_map_, ARG_WRITE_BUFFER_SIZE, write_buffer_size, + exec_state_)) { + if (write_buffer_size > 0) { + opt.write_buffer_size = write_buffer_size; + } else { + exec_state_ = LDBCommandExecuteResult::FAILED(ARG_WRITE_BUFFER_SIZE + + " must be > 0."); + } + } + + int file_size; + if (ParseIntOption(option_map_, ARG_FILE_SIZE, file_size, exec_state_)) { + if (file_size > 0) { + opt.target_file_size_base = file_size; + } else { + exec_state_ = LDBCommandExecuteResult::FAILED(ARG_FILE_SIZE + + " must be > 0."); + } + } + + return opt; +} + +bool LDBCommand::ParseKeyValue(const string& line, string* key, string* value, + bool is_key_hex, bool is_value_hex) { + size_t pos = line.find(DELIM); + if (pos != string::npos) { + *key = line.substr(0, pos); + *value = line.substr(pos + strlen(DELIM)); + if (is_key_hex) { + *key = HexToString(*key); + } + if (is_value_hex) { + *value = HexToString(*value); + } + return true; + } else { + return false; + } +} + +/** + * Make sure that ONLY the command-line options and flags expected by this + * command are specified on the command-line. 
Extraneous options are usually + * the result of user error. + * Returns true if all checks pass. Else returns false, and prints an + * appropriate error msg to stderr. + */ +bool LDBCommand::ValidateCmdLineOptions() { + + for (map::const_iterator itr = option_map_.begin(); + itr != option_map_.end(); itr++) { + if (find(valid_cmd_line_options_.begin(), + valid_cmd_line_options_.end(), itr->first) == + valid_cmd_line_options_.end()) { + fprintf(stderr, "Invalid command-line option %s\n", itr->first.c_str()); + return false; + } + } + + for (vector::const_iterator itr = flags_.begin(); + itr != flags_.end(); itr++) { + if (find(valid_cmd_line_options_.begin(), + valid_cmd_line_options_.end(), *itr) == + valid_cmd_line_options_.end()) { + fprintf(stderr, "Invalid command-line flag %s\n", itr->c_str()); + return false; + } + } + + if (!NoDBOpen() && option_map_.find(ARG_DB) == option_map_.end()) { + fprintf(stderr, "%s must be specified\n", ARG_DB.c_str()); + return false; + } + + return true; +} + +CompactorCommand::CompactorCommand(const vector& params, + const map& options, const vector& flags) : + LDBCommand(options, flags, false, + BuildCmdLineOptions({ARG_FROM, ARG_TO, ARG_HEX, ARG_KEY_HEX, + ARG_VALUE_HEX, ARG_TTL})), + null_from_(true), null_to_(true) { + + map::const_iterator itr = options.find(ARG_FROM); + if (itr != options.end()) { + null_from_ = false; + from_ = itr->second; + } + + itr = options.find(ARG_TO); + if (itr != options.end()) { + null_to_ = false; + to_ = itr->second; + } + + if (is_key_hex_) { + if (!null_from_) { + from_ = HexToString(from_); + } + if (!null_to_) { + to_ = HexToString(to_); + } + } +} + +void CompactorCommand::Help(string& ret) { + ret.append(" "); + ret.append(CompactorCommand::Name()); + ret.append(HelpRangeCmdArgs()); + ret.append("\n"); +} + +void CompactorCommand::DoCommand() { + + Slice* begin = nullptr; + Slice* end = nullptr; + if (!null_from_) { + begin = new Slice(from_); + } + if (!null_to_) { + end = new Slice(to_); + } + + db_->CompactRange(begin, end); + exec_state_ = LDBCommandExecuteResult::SUCCEED(""); + + delete begin; + delete end; +} + +const string DBLoaderCommand::ARG_DISABLE_WAL = "disable_wal"; +const string DBLoaderCommand::ARG_BULK_LOAD = "bulk_load"; +const string DBLoaderCommand::ARG_COMPACT = "compact"; + +DBLoaderCommand::DBLoaderCommand(const vector& params, + const map& options, const vector& flags) : + LDBCommand(options, flags, false, + BuildCmdLineOptions({ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX, + ARG_FROM, ARG_TO, ARG_CREATE_IF_MISSING, + ARG_DISABLE_WAL, ARG_BULK_LOAD, + ARG_COMPACT})), + create_if_missing_(false), disable_wal_(false), bulk_load_(false), + compact_(false) { + + create_if_missing_ = IsFlagPresent(flags, ARG_CREATE_IF_MISSING); + disable_wal_ = IsFlagPresent(flags, ARG_DISABLE_WAL); + bulk_load_ = IsFlagPresent(flags, ARG_BULK_LOAD); + compact_ = IsFlagPresent(flags, ARG_COMPACT); +} + +void DBLoaderCommand::Help(string& ret) { + ret.append(" "); + ret.append(DBLoaderCommand::Name()); + ret.append(" [--" + ARG_CREATE_IF_MISSING + "]"); + ret.append(" [--" + ARG_DISABLE_WAL + "]"); + ret.append(" [--" + ARG_BULK_LOAD + "]"); + ret.append(" [--" + ARG_COMPACT + "]"); + ret.append("\n"); +} + +Options DBLoaderCommand::PrepareOptionsForOpenDB() { + Options opt = LDBCommand::PrepareOptionsForOpenDB(); + opt.create_if_missing = create_if_missing_; + if (bulk_load_) { + opt.PrepareForBulkLoad(); + } + return opt; +} + +void DBLoaderCommand::DoCommand() { + if (!db_) { + return; + } + + WriteOptions 
write_options; + if (disable_wal_) { + write_options.disableWAL = true; + } + + int bad_lines = 0; + string line; + while (getline(cin, line, '\n')) { + string key; + string value; + if (ParseKeyValue(line, &key, &value, is_key_hex_, is_value_hex_)) { + db_->Put(write_options, Slice(key), Slice(value)); + } else if (0 == line.find("Keys in range:")) { + // ignore this line + } else if (0 == line.find("Created bg thread 0x")) { + // ignore this line + } else { + bad_lines ++; + } + } + + if (bad_lines > 0) { + cout << "Warning: " << bad_lines << " bad lines ignored." << endl; + } + if (compact_) { + db_->CompactRange(nullptr, nullptr); + } +} + +// ---------------------------------------------------------------------------- + +const string ManifestDumpCommand::ARG_VERBOSE = "verbose"; +const string ManifestDumpCommand::ARG_PATH = "path"; + +void ManifestDumpCommand::Help(string& ret) { + ret.append(" "); + ret.append(ManifestDumpCommand::Name()); + ret.append(" [--" + ARG_VERBOSE + "]"); + ret.append(" [--" + ARG_PATH + "=]"); + ret.append("\n"); +} + +ManifestDumpCommand::ManifestDumpCommand(const vector& params, + const map& options, const vector& flags) : + LDBCommand(options, flags, false, + BuildCmdLineOptions({ARG_VERBOSE, ARG_PATH, ARG_HEX})), + verbose_(false), + path_("") +{ + verbose_ = IsFlagPresent(flags, ARG_VERBOSE); + + map::const_iterator itr = options.find(ARG_PATH); + if (itr != options.end()) { + path_ = itr->second; + if (path_.empty()) { + exec_state_ = LDBCommandExecuteResult::FAILED("--path: missing pathname"); + } + } +} + +void ManifestDumpCommand::DoCommand() { + + std::string manifestfile; + + if (!path_.empty()) { + manifestfile = path_; + } else { + bool found = false; + // We need to find the manifest file by searching the directory + // containing the db for files of the form MANIFEST_[0-9]+ + DIR* d = opendir(db_path_.c_str()); + if (d == nullptr) { + exec_state_ = LDBCommandExecuteResult::FAILED( + db_path_ + " is not a directory"); + return; + } + struct dirent* entry; + while ((entry = readdir(d)) != nullptr) { + unsigned int match; + unsigned long long num; + if (sscanf(entry->d_name, + "MANIFEST-%ln%ln", + (unsigned long*)&num, + (unsigned long*)&match) + && match == strlen(entry->d_name)) { + if (!found) { + manifestfile = db_path_ + "/" + std::string(entry->d_name); + found = true; + } else { + exec_state_ = LDBCommandExecuteResult::FAILED( + "Multiple MANIFEST files found; use --path to select one"); + return; + } + } + } + closedir(d); + } + + if (verbose_) { + printf("Processing Manifest file %s\n", manifestfile.c_str()); + } + + Options options; + EnvOptions sopt; + std::string file(manifestfile); + std::string dbname("dummy"); + std::shared_ptr tc(NewLRUCache( + options.max_open_files - 10, options.table_cache_numshardbits, + options.table_cache_remove_scan_count_limit)); + VersionSet* versions = new VersionSet(dbname, &options, sopt, tc.get()); + Status s = versions->DumpManifest(options, file, verbose_, is_key_hex_); + if (!s.ok()) { + printf("Error in processing file %s %s\n", manifestfile.c_str(), + s.ToString().c_str()); + } + if (verbose_) { + printf("Processing Manifest file %s done\n", manifestfile.c_str()); + } +} + +// ---------------------------------------------------------------------------- + +void ListColumnFamiliesCommand::Help(string& ret) { + ret.append(" "); + ret.append(ListColumnFamiliesCommand::Name()); + ret.append(" full_path_to_db_directory "); + ret.append("\n"); +} + +ListColumnFamiliesCommand::ListColumnFamiliesCommand( 
+ const vector& params, const map& options, + const vector& flags) + : LDBCommand(options, flags, false, {}) { + + if (params.size() != 1) { + exec_state_ = LDBCommandExecuteResult::FAILED( + "dbname must be specified for the list_column_families command"); + } else { + dbname_ = params[0]; + } +} + +void ListColumnFamiliesCommand::DoCommand() { + vector column_families; + Status s = DB::ListColumnFamilies(DBOptions(), dbname_, &column_families); + if (!s.ok()) { + printf("Error in processing db %s %s\n", dbname_.c_str(), + s.ToString().c_str()); + } else { + printf("Column families in %s: \n{", dbname_.c_str()); + bool first = true; + for (auto cf : column_families) { + if (!first) { + printf(", "); + } + first = false; + printf("%s", cf.c_str()); + } + printf("}\n"); + } +} + +// ---------------------------------------------------------------------------- + +namespace { + +string ReadableTime(int unixtime) { + char time_buffer [80]; + time_t rawtime = unixtime; + struct tm * timeinfo = localtime(&rawtime); + strftime(time_buffer, 80, "%c", timeinfo); + return string(time_buffer); +} + +// This function only called when it's the sane case of >1 buckets in time-range +// Also called only when timekv falls between ttl_start and ttl_end provided +void IncBucketCounts(vector& bucket_counts, int ttl_start, + int time_range, int bucket_size, int timekv, int num_buckets) { + assert(time_range > 0 && timekv >= ttl_start && bucket_size > 0 && + timekv < (ttl_start + time_range) && num_buckets > 1); + int bucket = (timekv - ttl_start) / bucket_size; + bucket_counts[bucket]++; +} + +void PrintBucketCounts(const vector& bucket_counts, int ttl_start, + int ttl_end, int bucket_size, int num_buckets) { + int time_point = ttl_start; + for(int i = 0; i < num_buckets - 1; i++, time_point += bucket_size) { + fprintf(stdout, "Keys in range %s to %s : %lu\n", + ReadableTime(time_point).c_str(), + ReadableTime(time_point + bucket_size).c_str(), + (unsigned long)bucket_counts[i]); + } + fprintf(stdout, "Keys in range %s to %s : %lu\n", + ReadableTime(time_point).c_str(), + ReadableTime(ttl_end).c_str(), + (unsigned long)bucket_counts[num_buckets - 1]); +} + +} // namespace + +const string InternalDumpCommand::ARG_COUNT_ONLY = "count_only"; +const string InternalDumpCommand::ARG_COUNT_DELIM = "count_delim"; +const string InternalDumpCommand::ARG_STATS = "stats"; +const string InternalDumpCommand::ARG_INPUT_KEY_HEX = "input_key_hex"; + +InternalDumpCommand::InternalDumpCommand(const vector& params, + const map& options, + const vector& flags) : + LDBCommand(options, flags, true, + BuildCmdLineOptions({ ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX, + ARG_FROM, ARG_TO, ARG_MAX_KEYS, + ARG_COUNT_ONLY, ARG_COUNT_DELIM, ARG_STATS, + ARG_INPUT_KEY_HEX})), + has_from_(false), + has_to_(false), + max_keys_(-1), + delim_("."), + count_only_(false), + count_delim_(false), + print_stats_(false), + is_input_key_hex_(false) { + + has_from_ = ParseStringOption(options, ARG_FROM, &from_); + has_to_ = ParseStringOption(options, ARG_TO, &to_); + + ParseIntOption(options, ARG_MAX_KEYS, max_keys_, exec_state_); + map::const_iterator itr = options.find(ARG_COUNT_DELIM); + if (itr != options.end()) { + delim_ = itr->second; + count_delim_ = true; + // fprintf(stdout,"delim = %c\n",delim_[0]); + } else { + count_delim_ = IsFlagPresent(flags, ARG_COUNT_DELIM); + delim_="."; + } + + print_stats_ = IsFlagPresent(flags, ARG_STATS); + count_only_ = IsFlagPresent(flags, ARG_COUNT_ONLY); + is_input_key_hex_ = IsFlagPresent(flags, ARG_INPUT_KEY_HEX); 
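+  // --from/--to may be passed in hex form; decode them up front so the
+  // internal-key seek in DoCommand() below compares raw bytes.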
+ + if (is_input_key_hex_) { + if (has_from_) { + from_ = HexToString(from_); + } + if (has_to_) { + to_ = HexToString(to_); + } + } +} + +void InternalDumpCommand::Help(string& ret) { + ret.append(" "); + ret.append(InternalDumpCommand::Name()); + ret.append(HelpRangeCmdArgs()); + ret.append(" [--" + ARG_INPUT_KEY_HEX + "]"); + ret.append(" [--" + ARG_MAX_KEYS + "=]"); + ret.append(" [--" + ARG_COUNT_ONLY + "]"); + ret.append(" [--" + ARG_COUNT_DELIM + "=]"); + ret.append(" [--" + ARG_STATS + "]"); + ret.append("\n"); +} + +void InternalDumpCommand::DoCommand() { + if (!db_) { + return; + } + + if (print_stats_) { + string stats; + if (db_->GetProperty("rocksdb.stats", &stats)) { + fprintf(stdout, "%s\n", stats.c_str()); + } + } + + // Cast as DBImpl to get internal iterator + DBImpl* idb = dynamic_cast(db_); + if (!idb) { + exec_state_ = LDBCommandExecuteResult::FAILED("DB is not DBImpl"); + return; + } + string rtype1,rtype2,row,val; + rtype2 = ""; + uint64_t c=0; + uint64_t s1=0,s2=0; + // Setup internal key iterator + auto iter = unique_ptr(idb->TEST_NewInternalIterator()); + Status st = iter->status(); + if (!st.ok()) { + exec_state_ = LDBCommandExecuteResult::FAILED("Iterator error:" + + st.ToString()); + } + + if (has_from_) { + InternalKey ikey(from_, kMaxSequenceNumber, kValueTypeForSeek); + iter->Seek(ikey.Encode()); + } else { + iter->SeekToFirst(); + } + + long long count = 0; + for (; iter->Valid(); iter->Next()) { + ParsedInternalKey ikey; + if (!ParseInternalKey(iter->key(), &ikey)) { + fprintf(stderr, "Internal Key [%s] parse error!\n", + iter->key().ToString(true /* in hex*/).data()); + // TODO: add error counter + continue; + } + + // If end marker was specified, we stop before it + if (has_to_ && options_.comparator->Compare(ikey.user_key, to_) >= 0) { + break; + } + + ++count; + int k; + if (count_delim_) { + rtype1 = ""; + s1=0; + row = iter->key().ToString(); + val = iter->value().ToString(); + for(k=0;row[k]!='\x01' && row[k]!='\0';k++) + s1++; + for(k=0;val[k]!='\x01' && val[k]!='\0';k++) + s1++; + for(int j=0;row[j]!=delim_[0] && row[j]!='\0' && row[j]!='\x01';j++) + rtype1+=row[j]; + if(rtype2.compare("") && rtype2.compare(rtype1)!=0) { + fprintf(stdout,"%s => count:%lld\tsize:%lld\n",rtype2.c_str(), + (long long)c,(long long)s2); + c=1; + s2=s1; + rtype2 = rtype1; + } else { + c++; + s2+=s1; + rtype2=rtype1; + } + } + + if (!count_only_ && !count_delim_) { + string key = ikey.DebugString(is_key_hex_); + string value = iter->value().ToString(is_value_hex_); + std::cout << key << " => " << value << "\n"; + } + + // Terminate if maximum number of keys have been dumped + if (max_keys_ > 0 && count >= max_keys_) break; + } + if(count_delim_) { + fprintf(stdout,"%s => count:%lld\tsize:%lld\n", rtype2.c_str(), + (long long)c,(long long)s2); + } else + fprintf(stdout, "Internal keys in range: %lld\n", (long long) count); +} + + +const string DBDumperCommand::ARG_COUNT_ONLY = "count_only"; +const string DBDumperCommand::ARG_COUNT_DELIM = "count_delim"; +const string DBDumperCommand::ARG_STATS = "stats"; +const string DBDumperCommand::ARG_TTL_BUCKET = "bucket"; + +DBDumperCommand::DBDumperCommand(const vector& params, + const map& options, const vector& flags) : + LDBCommand(options, flags, true, + BuildCmdLineOptions({ARG_TTL, ARG_HEX, ARG_KEY_HEX, + ARG_VALUE_HEX, ARG_FROM, ARG_TO, + ARG_MAX_KEYS, ARG_COUNT_ONLY, + ARG_COUNT_DELIM, ARG_STATS, ARG_TTL_START, + ARG_TTL_END, ARG_TTL_BUCKET, + ARG_TIMESTAMP})), + null_from_(true), + null_to_(true), + max_keys_(-1), + 
count_only_(false), + count_delim_(false), + print_stats_(false) { + + map::const_iterator itr = options.find(ARG_FROM); + if (itr != options.end()) { + null_from_ = false; + from_ = itr->second; + } + + itr = options.find(ARG_TO); + if (itr != options.end()) { + null_to_ = false; + to_ = itr->second; + } + + itr = options.find(ARG_MAX_KEYS); + if (itr != options.end()) { + try { + max_keys_ = stoi(itr->second); + } catch(const invalid_argument&) { + exec_state_ = LDBCommandExecuteResult::FAILED(ARG_MAX_KEYS + + " has an invalid value"); + } catch(const out_of_range&) { + exec_state_ = LDBCommandExecuteResult::FAILED(ARG_MAX_KEYS + + " has a value out-of-range"); + } + } + itr = options.find(ARG_COUNT_DELIM); + if (itr != options.end()) { + delim_ = itr->second; + count_delim_ = true; + } else { + count_delim_ = IsFlagPresent(flags, ARG_COUNT_DELIM); + delim_="."; + } + + print_stats_ = IsFlagPresent(flags, ARG_STATS); + count_only_ = IsFlagPresent(flags, ARG_COUNT_ONLY); + + if (is_key_hex_) { + if (!null_from_) { + from_ = HexToString(from_); + } + if (!null_to_) { + to_ = HexToString(to_); + } + } +} + +void DBDumperCommand::Help(string& ret) { + ret.append(" "); + ret.append(DBDumperCommand::Name()); + ret.append(HelpRangeCmdArgs()); + ret.append(" [--" + ARG_TTL + "]"); + ret.append(" [--" + ARG_MAX_KEYS + "=]"); + ret.append(" [--" + ARG_TIMESTAMP + "]"); + ret.append(" [--" + ARG_COUNT_ONLY + "]"); + ret.append(" [--" + ARG_COUNT_DELIM + "=]"); + ret.append(" [--" + ARG_STATS + "]"); + ret.append(" [--" + ARG_TTL_BUCKET + "=]"); + ret.append(" [--" + ARG_TTL_START + "=:- is inclusive]"); + ret.append(" [--" + ARG_TTL_END + "=:- is exclusive]"); + ret.append("\n"); +} + +void DBDumperCommand::DoCommand() { + if (!db_) { + return; + } + // Parse command line args + uint64_t count = 0; + if (print_stats_) { + string stats; + if (db_->GetProperty("rocksdb.stats", &stats)) { + fprintf(stdout, "%s\n", stats.c_str()); + } + } + + // Setup key iterator + Iterator* iter = db_->NewIterator(ReadOptions()); + Status st = iter->status(); + if (!st.ok()) { + exec_state_ = LDBCommandExecuteResult::FAILED("Iterator error." + + st.ToString()); + } + + if (!null_from_) { + iter->Seek(from_); + } else { + iter->SeekToFirst(); + } + + int max_keys = max_keys_; + int ttl_start; + if (!ParseIntOption(option_map_, ARG_TTL_START, ttl_start, exec_state_)) { + ttl_start = DBWithTTLImpl::kMinTimestamp; // TTL introduction time + } + int ttl_end; + if (!ParseIntOption(option_map_, ARG_TTL_END, ttl_end, exec_state_)) { + ttl_end = DBWithTTLImpl::kMaxTimestamp; // Max time allowed by TTL feature + } + if (ttl_end < ttl_start) { + fprintf(stderr, "Error: End time can't be less than start time\n"); + delete iter; + return; + } + int time_range = ttl_end - ttl_start; + int bucket_size; + if (!ParseIntOption(option_map_, ARG_TTL_BUCKET, bucket_size, exec_state_) || + bucket_size <= 0) { + bucket_size = time_range; // Will have just 1 bucket by default + } + //cretaing variables for row count of each type + string rtype1,rtype2,row,val; + rtype2 = ""; + uint64_t c=0; + uint64_t s1=0,s2=0; + + // At this point, bucket_size=0 => time_range=0 + uint64_t num_buckets = (bucket_size >= time_range) ? 
1 : + ((time_range + bucket_size - 1) / bucket_size); + vector bucket_counts(num_buckets, 0); + if (is_db_ttl_ && !count_only_ && timestamp_ && !count_delim_) { + fprintf(stdout, "Dumping key-values from %s to %s\n", + ReadableTime(ttl_start).c_str(), ReadableTime(ttl_end).c_str()); + } + + for (; iter->Valid(); iter->Next()) { + int rawtime = 0; + // If end marker was specified, we stop before it + if (!null_to_ && (iter->key().ToString() >= to_)) + break; + // Terminate if maximum number of keys have been dumped + if (max_keys == 0) + break; + if (is_db_ttl_) { + TtlIterator* it_ttl = dynamic_cast(iter); + assert(it_ttl); + rawtime = it_ttl->timestamp(); + if (rawtime < ttl_start || rawtime >= ttl_end) { + continue; + } + } + if (max_keys > 0) { + --max_keys; + } + if (is_db_ttl_ && num_buckets > 1) { + IncBucketCounts(bucket_counts, ttl_start, time_range, bucket_size, + rawtime, num_buckets); + } + ++count; + if (count_delim_) { + rtype1 = ""; + row = iter->key().ToString(); + val = iter->value().ToString(); + s1 = row.size()+val.size(); + for(int j=0;row[j]!=delim_[0] && row[j]!='\0';j++) + rtype1+=row[j]; + if(rtype2.compare("") && rtype2.compare(rtype1)!=0) { + fprintf(stdout,"%s => count:%lld\tsize:%lld\n",rtype2.c_str(), + (long long )c,(long long)s2); + c=1; + s2=s1; + rtype2 = rtype1; + } else { + c++; + s2+=s1; + rtype2=rtype1; + } + + } + + + + if (!count_only_ && !count_delim_) { + if (is_db_ttl_ && timestamp_) { + fprintf(stdout, "%s ", ReadableTime(rawtime).c_str()); + } + string str = PrintKeyValue(iter->key().ToString(), + iter->value().ToString(), is_key_hex_, + is_value_hex_); + fprintf(stdout, "%s\n", str.c_str()); + } + } + + if (num_buckets > 1 && is_db_ttl_) { + PrintBucketCounts(bucket_counts, ttl_start, ttl_end, bucket_size, + num_buckets); + } else if(count_delim_) { + fprintf(stdout,"%s => count:%lld\tsize:%lld\n",rtype2.c_str(), + (long long )c,(long long)s2); + } else { + fprintf(stdout, "Keys in range: %lld\n", (long long) count); + } + // Clean up + delete iter; +} + +const string ReduceDBLevelsCommand::ARG_NEW_LEVELS = "new_levels"; +const string ReduceDBLevelsCommand::ARG_PRINT_OLD_LEVELS = "print_old_levels"; + +ReduceDBLevelsCommand::ReduceDBLevelsCommand(const vector& params, + const map& options, const vector& flags) : + LDBCommand(options, flags, false, + BuildCmdLineOptions({ARG_NEW_LEVELS, ARG_PRINT_OLD_LEVELS})), + old_levels_(1 << 16), + new_levels_(-1), + print_old_levels_(false) { + + + ParseIntOption(option_map_, ARG_NEW_LEVELS, new_levels_, exec_state_); + print_old_levels_ = IsFlagPresent(flags, ARG_PRINT_OLD_LEVELS); + + if(new_levels_ <= 0) { + exec_state_ = LDBCommandExecuteResult::FAILED( + " Use --" + ARG_NEW_LEVELS + " to specify a new level number\n"); + } +} + +vector ReduceDBLevelsCommand::PrepareArgs(const string& db_path, + int new_levels, bool print_old_level) { + vector ret; + ret.push_back("reduce_levels"); + ret.push_back("--" + ARG_DB + "=" + db_path); + ret.push_back("--" + ARG_NEW_LEVELS + "=" + to_string(new_levels)); + if(print_old_level) { + ret.push_back("--" + ARG_PRINT_OLD_LEVELS); + } + return ret; +} + +void ReduceDBLevelsCommand::Help(string& ret) { + ret.append(" "); + ret.append(ReduceDBLevelsCommand::Name()); + ret.append(" --" + ARG_NEW_LEVELS + "="); + ret.append(" [--" + ARG_PRINT_OLD_LEVELS + "]"); + ret.append("\n"); +} + +Options ReduceDBLevelsCommand::PrepareOptionsForOpenDB() { + Options opt = LDBCommand::PrepareOptionsForOpenDB(); + opt.num_levels = old_levels_; + 
opt.max_bytes_for_level_multiplier_additional.resize(opt.num_levels, 1); + // Disable size compaction + opt.max_bytes_for_level_base = 1ULL << 50; + opt.max_bytes_for_level_multiplier = 1; + opt.max_mem_compaction_level = 0; + return opt; +} + +Status ReduceDBLevelsCommand::GetOldNumOfLevels(Options& opt, + int* levels) { + EnvOptions soptions; + std::shared_ptr tc( + NewLRUCache(opt.max_open_files - 10, opt.table_cache_numshardbits, + opt.table_cache_remove_scan_count_limit)); + const InternalKeyComparator cmp(opt.comparator); + VersionSet versions(db_path_, &opt, soptions, tc.get()); + std::vector dummy; + ColumnFamilyDescriptor dummy_descriptor(kDefaultColumnFamilyName, + ColumnFamilyOptions(opt)); + dummy.push_back(dummy_descriptor); + // We rely the VersionSet::Recover to tell us the internal data structures + // in the db. And the Recover() should never do any change + // (like LogAndApply) to the manifest file. + Status st = versions.Recover(dummy); + if (!st.ok()) { + return st; + } + int max = -1; + auto default_cfd = versions.GetColumnFamilySet()->GetDefault(); + for (int i = 0; i < default_cfd->NumberLevels(); i++) { + if (default_cfd->current()->NumLevelFiles(i)) { + max = i; + } + } + + *levels = max + 1; + return st; +} + +void ReduceDBLevelsCommand::DoCommand() { + if (new_levels_ <= 1) { + exec_state_ = LDBCommandExecuteResult::FAILED( + "Invalid number of levels.\n"); + return; + } + + Status st; + Options opt = PrepareOptionsForOpenDB(); + int old_level_num = -1; + st = GetOldNumOfLevels(opt, &old_level_num); + if (!st.ok()) { + exec_state_ = LDBCommandExecuteResult::FAILED(st.ToString()); + return; + } + + if (print_old_levels_) { + fprintf(stdout, "The old number of levels in use is %d\n", old_level_num); + } + + if (old_level_num <= new_levels_) { + return; + } + + old_levels_ = old_level_num; + + OpenDB(); + if (!db_) { + return; + } + // Compact the whole DB to put all files to the highest level. + fprintf(stdout, "Compacting the db...\n"); + db_->CompactRange(nullptr, nullptr); + CloseDB(); + + EnvOptions soptions; + st = VersionSet::ReduceNumberOfLevels(db_path_, &opt, soptions, new_levels_); + if (!st.ok()) { + exec_state_ = LDBCommandExecuteResult::FAILED(st.ToString()); + return; + } +} + +const string ChangeCompactionStyleCommand::ARG_OLD_COMPACTION_STYLE = + "old_compaction_style"; +const string ChangeCompactionStyleCommand::ARG_NEW_COMPACTION_STYLE = + "new_compaction_style"; + +ChangeCompactionStyleCommand::ChangeCompactionStyleCommand( + const vector& params, const map& options, + const vector& flags) : + LDBCommand(options, flags, false, + BuildCmdLineOptions({ARG_OLD_COMPACTION_STYLE, + ARG_NEW_COMPACTION_STYLE})), + old_compaction_style_(-1), + new_compaction_style_(-1) { + + ParseIntOption(option_map_, ARG_OLD_COMPACTION_STYLE, old_compaction_style_, + exec_state_); + if (old_compaction_style_ != kCompactionStyleLevel && + old_compaction_style_ != kCompactionStyleUniversal) { + exec_state_ = LDBCommandExecuteResult::FAILED( + "Use --" + ARG_OLD_COMPACTION_STYLE + " to specify old compaction " + + "style. Check ldb help for proper compaction style value.\n"); + return; + } + + ParseIntOption(option_map_, ARG_NEW_COMPACTION_STYLE, new_compaction_style_, + exec_state_); + if (new_compaction_style_ != kCompactionStyleLevel && + new_compaction_style_ != kCompactionStyleUniversal) { + exec_state_ = LDBCommandExecuteResult::FAILED( + "Use --" + ARG_NEW_COMPACTION_STYLE + " to specify new compaction " + + "style. 
Check ldb help for proper compaction style value.\n"); + return; + } + + if (new_compaction_style_ == old_compaction_style_) { + exec_state_ = LDBCommandExecuteResult::FAILED( + "Old compaction style is the same as new compaction style. " + "Nothing to do.\n"); + return; + } + + if (old_compaction_style_ == kCompactionStyleUniversal && + new_compaction_style_ == kCompactionStyleLevel) { + exec_state_ = LDBCommandExecuteResult::FAILED( + "Convert from universal compaction to level compaction. " + "Nothing to do.\n"); + return; + } +} + +void ChangeCompactionStyleCommand::Help(string& ret) { + ret.append(" "); + ret.append(ChangeCompactionStyleCommand::Name()); + ret.append(" --" + ARG_OLD_COMPACTION_STYLE + "="); + ret.append(" --" + ARG_NEW_COMPACTION_STYLE + "="); + ret.append("\n"); +} + +Options ChangeCompactionStyleCommand::PrepareOptionsForOpenDB() { + Options opt = LDBCommand::PrepareOptionsForOpenDB(); + + if (old_compaction_style_ == kCompactionStyleLevel && + new_compaction_style_ == kCompactionStyleUniversal) { + // In order to convert from level compaction to universal compaction, we + // need to compact all data into a single file and move it to level 0. + opt.disable_auto_compactions = true; + opt.target_file_size_base = INT_MAX; + opt.target_file_size_multiplier = 1; + opt.max_bytes_for_level_base = INT_MAX; + opt.max_bytes_for_level_multiplier = 1; + } + + return opt; +} + +void ChangeCompactionStyleCommand::DoCommand() { + // print db stats before we have made any change + std::string property; + std::string files_per_level; + for (int i = 0; i < db_->NumberLevels(); i++) { + db_->GetProperty("rocksdb.num-files-at-level" + NumberToString(i), + &property); + + // format print string + char buf[100]; + snprintf(buf, sizeof(buf), "%s%s", (i ? "," : ""), property.c_str()); + files_per_level += buf; + } + fprintf(stdout, "files per level before compaction: %s\n", + files_per_level.c_str()); + + // manual compact into a single file and move the file to level 0 + db_->CompactRange(nullptr, nullptr, + true /* reduce level */, + 0 /* reduce to level 0 */); + + // verify compaction result + files_per_level = ""; + int num_files = 0; + for (int i = 0; i < db_->NumberLevels(); i++) { + db_->GetProperty("rocksdb.num-files-at-level" + NumberToString(i), + &property); + + // format print string + char buf[100]; + snprintf(buf, sizeof(buf), "%s%s", (i ? 
"," : ""), property.c_str()); + files_per_level += buf; + + num_files = atoi(property.c_str()); + + // level 0 should have only 1 file + if (i == 0 && num_files != 1) { + exec_state_ = LDBCommandExecuteResult::FAILED("Number of db files at " + "level 0 after compaction is " + std::to_string(num_files) + + ", not 1.\n"); + return; + } + // other levels should have no file + if (i > 0 && num_files != 0) { + exec_state_ = LDBCommandExecuteResult::FAILED("Number of db files at " + "level " + std::to_string(i) + " after compaction is " + + std::to_string(num_files) + ", not 0.\n"); + return; + } + } + + fprintf(stdout, "files per level after compaction: %s\n", + files_per_level.c_str()); +} + +class InMemoryHandler : public WriteBatch::Handler { + public: + InMemoryHandler(stringstream& row, bool print_values) : Handler(),row_(row) { + print_values_ = print_values; + } + + void commonPutMerge(const Slice& key, const Slice& value) { + string k = LDBCommand::StringToHex(key.ToString()); + if (print_values_) { + string v = LDBCommand::StringToHex(value.ToString()); + row_ << k << " : "; + row_ << v << " "; + } else { + row_ << k << " "; + } + } + + virtual void Put(const Slice& key, const Slice& value) { + row_ << "PUT : "; + commonPutMerge(key, value); + } + + virtual void Merge(const Slice& key, const Slice& value) { + row_ << "MERGE : "; + commonPutMerge(key, value); + } + + virtual void Delete(const Slice& key) { + row_ <<",DELETE : "; + row_ << LDBCommand::StringToHex(key.ToString()) << " "; + } + + virtual ~InMemoryHandler() { }; + + private: + stringstream & row_; + bool print_values_; +}; + +const string WALDumperCommand::ARG_WAL_FILE = "walfile"; +const string WALDumperCommand::ARG_PRINT_VALUE = "print_value"; +const string WALDumperCommand::ARG_PRINT_HEADER = "header"; + +WALDumperCommand::WALDumperCommand(const vector& params, + const map& options, const vector& flags) : + LDBCommand(options, flags, true, + BuildCmdLineOptions( + {ARG_WAL_FILE, ARG_PRINT_HEADER, ARG_PRINT_VALUE})), + print_header_(false), print_values_(false) { + + wal_file_.clear(); + + map::const_iterator itr = options.find(ARG_WAL_FILE); + if (itr != options.end()) { + wal_file_ = itr->second; + } + + + print_header_ = IsFlagPresent(flags, ARG_PRINT_HEADER); + print_values_ = IsFlagPresent(flags, ARG_PRINT_VALUE); + if (wal_file_.empty()) { + exec_state_ = LDBCommandExecuteResult::FAILED( + "Argument " + ARG_WAL_FILE + " must be specified."); + } +} + +void WALDumperCommand::Help(string& ret) { + ret.append(" "); + ret.append(WALDumperCommand::Name()); + ret.append(" --" + ARG_WAL_FILE + "="); + ret.append(" [--" + ARG_PRINT_HEADER + "] "); + ret.append(" [--" + ARG_PRINT_VALUE + "] "); + ret.append("\n"); +} + +void WALDumperCommand::DoCommand() { + struct StdErrReporter : public log::Reader::Reporter { + virtual void Corruption(size_t bytes, const Status& s) { + cerr<<"Corruption detected in log file "< file; + Env* env_ = Env::Default(); + EnvOptions soptions; + Status status = env_->NewSequentialFile(wal_file_, &file, soptions); + if (!status.ok()) { + exec_state_ = LDBCommandExecuteResult::FAILED("Failed to open WAL file " + + status.ToString()); + } else { + StdErrReporter reporter; + log::Reader reader(move(file), &reporter, true, 0); + string scratch; + WriteBatch batch; + Slice record; + stringstream row; + if (print_header_) { + cout<<"Sequence,Count,ByteSize,Physical Offset,Key(s)"; + if (print_values_) { + cout << " : value "; + } + cout << "\n"; + } + while(reader.ReadRecord(&record, &scratch)) { + 
row.str(""); + if (record.size() < 12) { + reporter.Corruption( + record.size(), Status::Corruption("log record too small")); + } else { + WriteBatchInternal::SetContents(&batch, record); + row<& params, + const map& options, const vector& flags) : + LDBCommand(options, flags, true, BuildCmdLineOptions({ARG_TTL, ARG_HEX, + ARG_KEY_HEX, + ARG_VALUE_HEX})) { + + if (params.size() != 1) { + exec_state_ = LDBCommandExecuteResult::FAILED( + " must be specified for the get command"); + } else { + key_ = params.at(0); + } + + if (is_key_hex_) { + key_ = HexToString(key_); + } +} + +void GetCommand::Help(string& ret) { + ret.append(" "); + ret.append(GetCommand::Name()); + ret.append(" "); + ret.append(" [--" + ARG_TTL + "]"); + ret.append("\n"); +} + +void GetCommand::DoCommand() { + string value; + Status st = db_->Get(ReadOptions(), key_, &value); + if (st.ok()) { + fprintf(stdout, "%s\n", + (is_value_hex_ ? StringToHex(value) : value).c_str()); + } else { + exec_state_ = LDBCommandExecuteResult::FAILED(st.ToString()); + } +} + + +ApproxSizeCommand::ApproxSizeCommand(const vector& params, + const map& options, const vector& flags) : + LDBCommand(options, flags, true, + BuildCmdLineOptions({ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX, + ARG_FROM, ARG_TO})) { + + if (options.find(ARG_FROM) != options.end()) { + start_key_ = options.find(ARG_FROM)->second; + } else { + exec_state_ = LDBCommandExecuteResult::FAILED(ARG_FROM + + " must be specified for approxsize command"); + return; + } + + if (options.find(ARG_TO) != options.end()) { + end_key_ = options.find(ARG_TO)->second; + } else { + exec_state_ = LDBCommandExecuteResult::FAILED(ARG_TO + + " must be specified for approxsize command"); + return; + } + + if (is_key_hex_) { + start_key_ = HexToString(start_key_); + end_key_ = HexToString(end_key_); + } +} + +void ApproxSizeCommand::Help(string& ret) { + ret.append(" "); + ret.append(ApproxSizeCommand::Name()); + ret.append(HelpRangeCmdArgs()); + ret.append("\n"); +} + +void ApproxSizeCommand::DoCommand() { + + Range ranges[1]; + ranges[0] = Range(start_key_, end_key_); + uint64_t sizes[1]; + db_->GetApproximateSizes(ranges, 1, sizes); + fprintf(stdout, "%lu\n", (unsigned long)sizes[0]); + /* Weird that GetApproximateSizes() returns void, although documentation + * says that it returns a Status object. + if (!st.ok()) { + exec_state_ = LDBCommandExecuteResult::FAILED(st.ToString()); + } + */ +} + + +BatchPutCommand::BatchPutCommand(const vector& params, + const map& options, const vector& flags) : + LDBCommand(options, flags, false, + BuildCmdLineOptions({ARG_TTL, ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX, + ARG_CREATE_IF_MISSING})) { + + if (params.size() < 2) { + exec_state_ = LDBCommandExecuteResult::FAILED( + "At least one pair must be specified batchput."); + } else if (params.size() % 2 != 0) { + exec_state_ = LDBCommandExecuteResult::FAILED( + "Equal number of s and s must be specified for batchput."); + } else { + for (size_t i = 0; i < params.size(); i += 2) { + string key = params.at(i); + string value = params.at(i+1); + key_values_.push_back(pair( + is_key_hex_ ? HexToString(key) : key, + is_value_hex_ ? 
HexToString(value) : value)); + } + } +} + +void BatchPutCommand::Help(string& ret) { + ret.append(" "); + ret.append(BatchPutCommand::Name()); + ret.append(" [ ] [..]"); + ret.append(" [--" + ARG_TTL + "]"); + ret.append("\n"); +} + +void BatchPutCommand::DoCommand() { + WriteBatch batch; + + for (vector>::const_iterator itr + = key_values_.begin(); itr != key_values_.end(); itr++) { + batch.Put(itr->first, itr->second); + } + Status st = db_->Write(WriteOptions(), &batch); + if (st.ok()) { + fprintf(stdout, "OK\n"); + } else { + exec_state_ = LDBCommandExecuteResult::FAILED(st.ToString()); + } +} + +Options BatchPutCommand::PrepareOptionsForOpenDB() { + Options opt = LDBCommand::PrepareOptionsForOpenDB(); + opt.create_if_missing = IsFlagPresent(flags_, ARG_CREATE_IF_MISSING); + return opt; +} + + +ScanCommand::ScanCommand(const vector& params, + const map& options, const vector& flags) : + LDBCommand(options, flags, true, + BuildCmdLineOptions({ARG_TTL, ARG_HEX, ARG_KEY_HEX, ARG_TO, + ARG_VALUE_HEX, ARG_FROM, ARG_TIMESTAMP, + ARG_MAX_KEYS, ARG_TTL_START, ARG_TTL_END})), + start_key_specified_(false), + end_key_specified_(false), + max_keys_scanned_(-1) { + + map::const_iterator itr = options.find(ARG_FROM); + if (itr != options.end()) { + start_key_ = itr->second; + if (is_key_hex_) { + start_key_ = HexToString(start_key_); + } + start_key_specified_ = true; + } + itr = options.find(ARG_TO); + if (itr != options.end()) { + end_key_ = itr->second; + if (is_key_hex_) { + end_key_ = HexToString(end_key_); + } + end_key_specified_ = true; + } + + itr = options.find(ARG_MAX_KEYS); + if (itr != options.end()) { + try { + max_keys_scanned_ = stoi(itr->second); + } catch(const invalid_argument&) { + exec_state_ = LDBCommandExecuteResult::FAILED(ARG_MAX_KEYS + + " has an invalid value"); + } catch(const out_of_range&) { + exec_state_ = LDBCommandExecuteResult::FAILED(ARG_MAX_KEYS + + " has a value out-of-range"); + } + } +} + +void ScanCommand::Help(string& ret) { + ret.append(" "); + ret.append(ScanCommand::Name()); + ret.append(HelpRangeCmdArgs()); + ret.append(" [--" + ARG_TTL + "]"); + ret.append(" [--" + ARG_TIMESTAMP + "]"); + ret.append(" [--" + ARG_MAX_KEYS + "=q] "); + ret.append(" [--" + ARG_TTL_START + "=:- is inclusive]"); + ret.append(" [--" + ARG_TTL_END + "=:- is exclusive]"); + ret.append("\n"); +} + +void ScanCommand::DoCommand() { + + int num_keys_scanned = 0; + Iterator* it = db_->NewIterator(ReadOptions()); + if (start_key_specified_) { + it->Seek(start_key_); + } else { + it->SeekToFirst(); + } + int ttl_start; + if (!ParseIntOption(option_map_, ARG_TTL_START, ttl_start, exec_state_)) { + ttl_start = DBWithTTLImpl::kMinTimestamp; // TTL introduction time + } + int ttl_end; + if (!ParseIntOption(option_map_, ARG_TTL_END, ttl_end, exec_state_)) { + ttl_end = DBWithTTLImpl::kMaxTimestamp; // Max time allowed by TTL feature + } + if (ttl_end < ttl_start) { + fprintf(stderr, "Error: End time can't be less than start time\n"); + delete it; + return; + } + if (is_db_ttl_ && timestamp_) { + fprintf(stdout, "Scanning key-values from %s to %s\n", + ReadableTime(ttl_start).c_str(), ReadableTime(ttl_end).c_str()); + } + for ( ; + it->Valid() && (!end_key_specified_ || it->key().ToString() < end_key_); + it->Next()) { + string key = it->key().ToString(); + if (is_db_ttl_) { + TtlIterator* it_ttl = dynamic_cast(it); + assert(it_ttl); + int rawtime = it_ttl->timestamp(); + if (rawtime < ttl_start || rawtime >= ttl_end) { + continue; + } + if (timestamp_) { + fprintf(stdout, "%s ", 
ReadableTime(rawtime).c_str()); + } + } + string value = it->value().ToString(); + fprintf(stdout, "%s : %s\n", + (is_key_hex_ ? StringToHex(key) : key).c_str(), + (is_value_hex_ ? StringToHex(value) : value).c_str() + ); + num_keys_scanned++; + if (max_keys_scanned_ >= 0 && num_keys_scanned >= max_keys_scanned_) { + break; + } + } + if (!it->status().ok()) { // Check for any errors found during the scan + exec_state_ = LDBCommandExecuteResult::FAILED(it->status().ToString()); + } + delete it; +} + + +DeleteCommand::DeleteCommand(const vector& params, + const map& options, const vector& flags) : + LDBCommand(options, flags, false, + BuildCmdLineOptions({ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX})) { + + if (params.size() != 1) { + exec_state_ = LDBCommandExecuteResult::FAILED( + "KEY must be specified for the delete command"); + } else { + key_ = params.at(0); + if (is_key_hex_) { + key_ = HexToString(key_); + } + } +} + +void DeleteCommand::Help(string& ret) { + ret.append(" "); + ret.append(DeleteCommand::Name() + " "); + ret.append("\n"); +} + +void DeleteCommand::DoCommand() { + Status st = db_->Delete(WriteOptions(), key_); + if (st.ok()) { + fprintf(stdout, "OK\n"); + } else { + exec_state_ = LDBCommandExecuteResult::FAILED(st.ToString()); + } +} + + +PutCommand::PutCommand(const vector& params, + const map& options, const vector& flags) : + LDBCommand(options, flags, false, + BuildCmdLineOptions({ARG_TTL, ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX, + ARG_CREATE_IF_MISSING})) { + + if (params.size() != 2) { + exec_state_ = LDBCommandExecuteResult::FAILED( + " and must be specified for the put command"); + } else { + key_ = params.at(0); + value_ = params.at(1); + } + + if (is_key_hex_) { + key_ = HexToString(key_); + } + + if (is_value_hex_) { + value_ = HexToString(value_); + } +} + +void PutCommand::Help(string& ret) { + ret.append(" "); + ret.append(PutCommand::Name()); + ret.append(" "); + ret.append(" [--" + ARG_TTL + "]"); + ret.append("\n"); +} + +void PutCommand::DoCommand() { + Status st = db_->Put(WriteOptions(), key_, value_); + if (st.ok()) { + fprintf(stdout, "OK\n"); + } else { + exec_state_ = LDBCommandExecuteResult::FAILED(st.ToString()); + } +} + +Options PutCommand::PrepareOptionsForOpenDB() { + Options opt = LDBCommand::PrepareOptionsForOpenDB(); + opt.create_if_missing = IsFlagPresent(flags_, ARG_CREATE_IF_MISSING); + return opt; +} + + +const char* DBQuerierCommand::HELP_CMD = "help"; +const char* DBQuerierCommand::GET_CMD = "get"; +const char* DBQuerierCommand::PUT_CMD = "put"; +const char* DBQuerierCommand::DELETE_CMD = "delete"; + +DBQuerierCommand::DBQuerierCommand(const vector& params, + const map& options, const vector& flags) : + LDBCommand(options, flags, false, + BuildCmdLineOptions({ARG_TTL, ARG_HEX, ARG_KEY_HEX, + ARG_VALUE_HEX})) { + +} + +void DBQuerierCommand::Help(string& ret) { + ret.append(" "); + ret.append(DBQuerierCommand::Name()); + ret.append(" [--" + ARG_TTL + "]"); + ret.append("\n"); + ret.append(" Starts a REPL shell. 
Type help for list of available " + "commands."); + ret.append("\n"); +} + +void DBQuerierCommand::DoCommand() { + if (!db_) { + return; + } + + ReadOptions read_options; + WriteOptions write_options; + + string line; + string key; + string value; + while (getline(cin, line, '\n')) { + + // Parse line into vector + vector tokens; + size_t pos = 0; + while (true) { + size_t pos2 = line.find(' ', pos); + if (pos2 == string::npos) { + break; + } + tokens.push_back(line.substr(pos, pos2-pos)); + pos = pos2 + 1; + } + tokens.push_back(line.substr(pos)); + + const string& cmd = tokens[0]; + + if (cmd == HELP_CMD) { + fprintf(stdout, + "get \n" + "put \n" + "delete \n"); + } else if (cmd == DELETE_CMD && tokens.size() == 2) { + key = (is_key_hex_ ? HexToString(tokens[1]) : tokens[1]); + db_->Delete(write_options, Slice(key)); + fprintf(stdout, "Successfully deleted %s\n", tokens[1].c_str()); + } else if (cmd == PUT_CMD && tokens.size() == 3) { + key = (is_key_hex_ ? HexToString(tokens[1]) : tokens[1]); + value = (is_value_hex_ ? HexToString(tokens[2]) : tokens[2]); + db_->Put(write_options, Slice(key), Slice(value)); + fprintf(stdout, "Successfully put %s %s\n", + tokens[1].c_str(), tokens[2].c_str()); + } else if (cmd == GET_CMD && tokens.size() == 2) { + key = (is_key_hex_ ? HexToString(tokens[1]) : tokens[1]); + if (db_->Get(read_options, Slice(key), &value).ok()) { + fprintf(stdout, "%s\n", PrintKeyValue(key, value, + is_key_hex_, is_value_hex_).c_str()); + } else { + fprintf(stdout, "Not found %s\n", tokens[1].c_str()); + } + } else { + fprintf(stdout, "Unknown command %s\n", line.c_str()); + } + } +} + +CheckConsistencyCommand::CheckConsistencyCommand(const vector& params, + const map& options, const vector& flags) : + LDBCommand(options, flags, false, + BuildCmdLineOptions({})) { +} + +void CheckConsistencyCommand::Help(string& ret) { + ret.append(" "); + ret.append(CheckConsistencyCommand::Name()); + ret.append("\n"); +} + +void CheckConsistencyCommand::DoCommand() { + Options opt = PrepareOptionsForOpenDB(); + opt.paranoid_checks = true; + if (!exec_state_.IsNotStarted()) { + return; + } + DB* db; + Status st = DB::OpenForReadOnly(opt, db_path_, &db, false); + delete db; + if (st.ok()) { + fprintf(stdout, "OK\n"); + } else { + exec_state_ = LDBCommandExecuteResult::FAILED(st.ToString()); + } +} + +} // namespace rocksdb +#endif // ROCKSDB_LITE diff --git a/util/ldb_cmd.h b/util/ldb_cmd.h new file mode 100644 index 0000000000..4f760e0ceb --- /dev/null +++ b/util/ldb_cmd.h @@ -0,0 +1,722 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
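The query REPL above parses each input line by splitting on single spaces and dispatching on the first token. A minimal standalone sketch of that parsing step (the helper name SplitOnSpace is illustrative, not part of the patch):

#include <string>
#include <vector>

// Hypothetical helper mirroring DBQuerierCommand::DoCommand's parsing:
// split on single spaces; the text after the last space is always kept
// as a final token, even when it is empty.
std::vector<std::string> SplitOnSpace(const std::string& line) {
  std::vector<std::string> tokens;
  size_t pos = 0;
  while (true) {
    size_t pos2 = line.find(' ', pos);
    if (pos2 == std::string::npos) {
      break;
    }
    tokens.push_back(line.substr(pos, pos2 - pos));
    pos = pos2 + 1;
  }
  tokens.push_back(line.substr(pos));
  return tokens;
}

SplitOnSpace("put k1 v1") yields {"put", "k1", "v1"}; note that "get  k" (two spaces) yields an empty middle token, so such input falls through to the "Unknown command" branch.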
+// +#pragma once +#include +#include +#include +#include +#include +#include + +#include "db/version_set.h" +#include "rocksdb/env.h" +#include "rocksdb/options.h" +#include "rocksdb/iterator.h" +#include "rocksdb/slice.h" +#include "util/logging.h" +#include "util/ldb_cmd_execute_result.h" +#include "util/string_util.h" +#include "utilities/db_ttl.h" +#include "utilities/ttl/db_ttl_impl.h" + +using std::string; +using std::map; +using std::vector; +using std::ostringstream; + +namespace rocksdb { + +class LDBCommand { +public: + + // Command-line arguments + static const string ARG_DB; + static const string ARG_HEX; + static const string ARG_KEY_HEX; + static const string ARG_VALUE_HEX; + static const string ARG_TTL; + static const string ARG_TTL_START; + static const string ARG_TTL_END; + static const string ARG_TIMESTAMP; + static const string ARG_FROM; + static const string ARG_TO; + static const string ARG_MAX_KEYS; + static const string ARG_BLOOM_BITS; + static const string ARG_COMPRESSION_TYPE; + static const string ARG_BLOCK_SIZE; + static const string ARG_AUTO_COMPACTION; + static const string ARG_WRITE_BUFFER_SIZE; + static const string ARG_FILE_SIZE; + static const string ARG_CREATE_IF_MISSING; + + static LDBCommand* InitFromCmdLineArgs( + const vector& args, + const Options& options = Options() + ); + + static LDBCommand* InitFromCmdLineArgs( + int argc, + char** argv, + const Options& options = Options() + ); + + bool ValidateCmdLineOptions(); + + virtual Options PrepareOptionsForOpenDB(); + + virtual void SetOptions(Options options) { + options_ = options; + } + + virtual bool NoDBOpen() { + return false; + } + + virtual ~LDBCommand() { + if (db_ != nullptr) { + delete db_; + db_ = nullptr; + } + } + + /* Run the command, and return the execute result. */ + void Run() { + if (!exec_state_.IsNotStarted()) { + return; + } + + if (db_ == nullptr && !NoDBOpen()) { + OpenDB(); + if (!exec_state_.IsNotStarted()) { + return; + } + } + + DoCommand(); + if (exec_state_.IsNotStarted()) { + exec_state_ = LDBCommandExecuteResult::SUCCEED(""); + } + + if (db_ != nullptr) { + CloseDB (); + } + } + + virtual void DoCommand() = 0; + + LDBCommandExecuteResult GetExecuteState() { + return exec_state_; + } + + void ClearPreviousRunState() { + exec_state_.Reset(); + } + + static string HexToString(const string& str) { + string parsed; + if (str[0] != '0' || str[1] != 'x') { + fprintf(stderr, "Invalid hex input %s. Must start with 0x\n", + str.c_str()); + throw "Invalid hex input"; + } + + for (unsigned int i = 2; i < str.length();) { + int c; + sscanf(str.c_str() + i, "%2X", &c); + parsed.push_back(c); + i += 2; + } + return parsed; + } + + static string StringToHex(const string& str) { + string result = "0x"; + char buf[10]; + for (size_t i = 0; i < str.length(); i++) { + snprintf(buf, 10, "%02X", (unsigned char)str[i]); + result += buf; + } + return result; + } + + static const char* DELIM; + +protected: + + LDBCommandExecuteResult exec_state_; + string db_path_; + DB* db_; + DBWithTTL* db_ttl_; + + /** + * true implies that this command can work if the db is opened in read-only + * mode. + */ + bool is_read_only_; + + /** If true, the key is input/output as hex in get/put/scan/delete etc. */ + bool is_key_hex_; + + /** If true, the value is input/output as hex in get/put/scan/delete etc. 
*/ + bool is_value_hex_; + + /** If true, the value is treated as timestamp suffixed */ + bool is_db_ttl_; + + // If true, the kvs are output with their insert/modify timestamp in a ttl db + bool timestamp_; + + /** + * Map of options passed on the command-line. + */ + const map option_map_; + + /** + * Flags passed on the command-line. + */ + const vector flags_; + + /** List of command-line options valid for this command */ + const vector valid_cmd_line_options_; + + bool ParseKeyValue(const string& line, string* key, string* value, + bool is_key_hex, bool is_value_hex); + + LDBCommand(const map& options, const vector& flags, + bool is_read_only, const vector& valid_cmd_line_options) : + db_(nullptr), + is_read_only_(is_read_only), + is_key_hex_(false), + is_value_hex_(false), + is_db_ttl_(false), + timestamp_(false), + option_map_(options), + flags_(flags), + valid_cmd_line_options_(valid_cmd_line_options) { + + map::const_iterator itr = options.find(ARG_DB); + if (itr != options.end()) { + db_path_ = itr->second; + } + + is_key_hex_ = IsKeyHex(options, flags); + is_value_hex_ = IsValueHex(options, flags); + is_db_ttl_ = IsFlagPresent(flags, ARG_TTL); + timestamp_ = IsFlagPresent(flags, ARG_TIMESTAMP); + } + + void OpenDB() { + Options opt = PrepareOptionsForOpenDB(); + if (!exec_state_.IsNotStarted()) { + return; + } + // Open the DB. + Status st; + if (is_db_ttl_) { + if (is_read_only_) { + st = DBWithTTL::Open(opt, db_path_, &db_ttl_, 0, true); + } else { + st = DBWithTTL::Open(opt, db_path_, &db_ttl_); + } + db_ = db_ttl_; + } else if (is_read_only_) { + st = DB::OpenForReadOnly(opt, db_path_, &db_); + } else { + st = DB::Open(opt, db_path_, &db_); + } + if (!st.ok()) { + string msg = st.ToString(); + exec_state_ = LDBCommandExecuteResult::FAILED(msg); + } + + options_ = opt; + } + + void CloseDB () { + if (db_ != nullptr) { + delete db_; + db_ = nullptr; + } + } + + static string PrintKeyValue(const string& key, const string& value, + bool is_key_hex, bool is_value_hex) { + string result; + result.append(is_key_hex ? StringToHex(key) : key); + result.append(DELIM); + result.append(is_value_hex ? StringToHex(value) : value); + return result; + } + + static string PrintKeyValue(const string& key, const string& value, + bool is_hex) { + return PrintKeyValue(key, value, is_hex, is_hex); + } + + /** + * Return true if the specified flag is present in the specified flags vector + */ + static bool IsFlagPresent(const vector& flags, const string& flag) { + return (std::find(flags.begin(), flags.end(), flag) != flags.end()); + } + + static string HelpRangeCmdArgs() { + ostringstream str_stream; + str_stream << " "; + str_stream << "[--" << ARG_FROM << "] "; + str_stream << "[--" << ARG_TO << "] "; + return str_stream.str(); + } + + /** + * A helper function that returns a list of command line options + * used by this command. It includes the common options and the ones + * passed in. 
+ */ + vector BuildCmdLineOptions(vector options) { + vector ret = {ARG_DB, ARG_BLOOM_BITS, ARG_BLOCK_SIZE, + ARG_AUTO_COMPACTION, ARG_COMPRESSION_TYPE, + ARG_WRITE_BUFFER_SIZE, ARG_FILE_SIZE}; + ret.insert(ret.end(), options.begin(), options.end()); + return ret; + } + + bool ParseIntOption(const map& options, const string& option, + int& value, LDBCommandExecuteResult& exec_state); + + bool ParseStringOption(const map& options, + const string& option, string* value); + + Options options_; + +private: + + /** + * Interpret command line options and flags to determine if the key + * should be input/output in hex. + */ + bool IsKeyHex(const map& options, + const vector& flags) { + return (IsFlagPresent(flags, ARG_HEX) || + IsFlagPresent(flags, ARG_KEY_HEX) || + ParseBooleanOption(options, ARG_HEX, false) || + ParseBooleanOption(options, ARG_KEY_HEX, false)); + } + + /** + * Interpret command line options and flags to determine if the value + * should be input/output in hex. + */ + bool IsValueHex(const map& options, + const vector& flags) { + return (IsFlagPresent(flags, ARG_HEX) || + IsFlagPresent(flags, ARG_VALUE_HEX) || + ParseBooleanOption(options, ARG_HEX, false) || + ParseBooleanOption(options, ARG_VALUE_HEX, false)); + } + + /** + * Returns the value of the specified option as a boolean. + * default_val is used if the option is not found in options. + * Throws an exception if the value of the option is not + * "true" or "false" (case insensitive). + */ + bool ParseBooleanOption(const map& options, + const string& option, bool default_val) { + + map::const_iterator itr = options.find(option); + if (itr != options.end()) { + string option_val = itr->second; + return StringToBool(itr->second); + } + return default_val; + } + + /** + * Converts val to a boolean. + * val must be either true or false (case insensitive). + * Otherwise an exception is thrown. 
+ */ + bool StringToBool(string val) { + std::transform(val.begin(), val.end(), val.begin(), ::tolower); + if (val == "true") { + return true; + } else if (val == "false") { + return false; + } else { + throw "Invalid value for boolean argument"; + } + } + + static LDBCommand* SelectCommand( + const string& cmd, + const vector& cmdParams, + const map& option_map, + const vector& flags + ); + +}; + +class CompactorCommand: public LDBCommand { +public: + static string Name() { return "compact"; } + + CompactorCommand(const vector& params, + const map& options, const vector& flags); + + static void Help(string& ret); + + virtual void DoCommand(); + +private: + bool null_from_; + string from_; + bool null_to_; + string to_; +}; + +class DBDumperCommand: public LDBCommand { +public: + static string Name() { return "dump"; } + + DBDumperCommand(const vector& params, + const map& options, const vector& flags); + + static void Help(string& ret); + + virtual void DoCommand(); + +private: + bool null_from_; + string from_; + bool null_to_; + string to_; + int max_keys_; + string delim_; + bool count_only_; + bool count_delim_; + bool print_stats_; + + static const string ARG_COUNT_ONLY; + static const string ARG_COUNT_DELIM; + static const string ARG_STATS; + static const string ARG_TTL_BUCKET; +}; + +class InternalDumpCommand: public LDBCommand { +public: + static string Name() { return "idump"; } + + InternalDumpCommand(const vector& params, + const map& options, + const vector& flags); + + static void Help(string& ret); + + virtual void DoCommand(); + +private: + bool has_from_; + string from_; + bool has_to_; + string to_; + int max_keys_; + string delim_; + bool count_only_; + bool count_delim_; + bool print_stats_; + bool is_input_key_hex_; + + static const string ARG_DELIM; + static const string ARG_COUNT_ONLY; + static const string ARG_COUNT_DELIM; + static const string ARG_STATS; + static const string ARG_INPUT_KEY_HEX; +}; + +class DBLoaderCommand: public LDBCommand { +public: + static string Name() { return "load"; } + + DBLoaderCommand(string& db_name, vector& args); + + DBLoaderCommand(const vector& params, + const map& options, const vector& flags); + + static void Help(string& ret); + virtual void DoCommand(); + + virtual Options PrepareOptionsForOpenDB(); + +private: + bool create_if_missing_; + bool disable_wal_; + bool bulk_load_; + bool compact_; + + static const string ARG_DISABLE_WAL; + static const string ARG_BULK_LOAD; + static const string ARG_COMPACT; +}; + +class ManifestDumpCommand: public LDBCommand { +public: + static string Name() { return "manifest_dump"; } + + ManifestDumpCommand(const vector& params, + const map& options, const vector& flags); + + static void Help(string& ret); + virtual void DoCommand(); + + virtual bool NoDBOpen() { + return true; + } + +private: + bool verbose_; + string path_; + + static const string ARG_VERBOSE; + static const string ARG_PATH; +}; + +class ListColumnFamiliesCommand : public LDBCommand { + public: + static string Name() { return "list_column_families"; } + + ListColumnFamiliesCommand(const vector& params, + const map& options, + const vector& flags); + + static void Help(string& ret); + virtual void DoCommand(); + + virtual bool NoDBOpen() { return true; } + + private: + string dbname_; +}; + +class ReduceDBLevelsCommand : public LDBCommand { +public: + static string Name() { return "reduce_levels"; } + + ReduceDBLevelsCommand(const vector& params, + const map& options, const vector& flags); + + virtual Options 
PrepareOptionsForOpenDB(); + + virtual void DoCommand(); + + virtual bool NoDBOpen() { + return true; + } + + static void Help(string& msg); + + static vector PrepareArgs(const string& db_path, int new_levels, + bool print_old_level = false); + +private: + int old_levels_; + int new_levels_; + bool print_old_levels_; + + static const string ARG_NEW_LEVELS; + static const string ARG_PRINT_OLD_LEVELS; + + Status GetOldNumOfLevels(Options& opt, int* levels); +}; + +class ChangeCompactionStyleCommand : public LDBCommand { +public: + static string Name() { return "change_compaction_style"; } + + ChangeCompactionStyleCommand(const vector& params, + const map& options, const vector& flags); + + virtual Options PrepareOptionsForOpenDB(); + + virtual void DoCommand(); + + static void Help(string& msg); + +private: + int old_compaction_style_; + int new_compaction_style_; + + static const string ARG_OLD_COMPACTION_STYLE; + static const string ARG_NEW_COMPACTION_STYLE; +}; + +class WALDumperCommand : public LDBCommand { +public: + static string Name() { return "dump_wal"; } + + WALDumperCommand(const vector& params, + const map& options, const vector& flags); + + virtual bool NoDBOpen() { + return true; + } + + static void Help(string& ret); + virtual void DoCommand(); + +private: + bool print_header_; + string wal_file_; + bool print_values_; + + static const string ARG_WAL_FILE; + static const string ARG_PRINT_HEADER; + static const string ARG_PRINT_VALUE; +}; + + +class GetCommand : public LDBCommand { +public: + static string Name() { return "get"; } + + GetCommand(const vector& params, const map& options, + const vector& flags); + + virtual void DoCommand(); + + static void Help(string& ret); + +private: + string key_; +}; + +class ApproxSizeCommand : public LDBCommand { +public: + static string Name() { return "approxsize"; } + + ApproxSizeCommand(const vector& params, + const map& options, const vector& flags); + + virtual void DoCommand(); + + static void Help(string& ret); + +private: + string start_key_; + string end_key_; +}; + +class BatchPutCommand : public LDBCommand { +public: + static string Name() { return "batchput"; } + + BatchPutCommand(const vector& params, + const map& options, const vector& flags); + + virtual void DoCommand(); + + static void Help(string& ret); + + virtual Options PrepareOptionsForOpenDB(); + +private: + /** + * The key-values to be inserted. + */ + vector> key_values_; +}; + +class ScanCommand : public LDBCommand { +public: + static string Name() { return "scan"; } + + ScanCommand(const vector& params, const map& options, + const vector& flags); + + virtual void DoCommand(); + + static void Help(string& ret); + +private: + string start_key_; + string end_key_; + bool start_key_specified_; + bool end_key_specified_; + int max_keys_scanned_; +}; + +class DeleteCommand : public LDBCommand { +public: + static string Name() { return "delete"; } + + DeleteCommand(const vector& params, + const map& options, const vector& flags); + + virtual void DoCommand(); + + static void Help(string& ret); + +private: + string key_; +}; + +class PutCommand : public LDBCommand { +public: + static string Name() { return "put"; } + + PutCommand(const vector& params, const map& options, + const vector& flags); + + virtual void DoCommand(); + + static void Help(string& ret); + + virtual Options PrepareOptionsForOpenDB(); + +private: + string key_; + string value_; +}; + +/** + * Command that starts up a REPL shell that allows + * get/put/delete. 
+ */ +class DBQuerierCommand: public LDBCommand { +public: + static string Name() { return "query"; } + + DBQuerierCommand(const vector& params, + const map& options, const vector& flags); + + static void Help(string& ret); + + virtual void DoCommand(); + +private: + static const char* HELP_CMD; + static const char* GET_CMD; + static const char* PUT_CMD; + static const char* DELETE_CMD; +}; + +class CheckConsistencyCommand : public LDBCommand { +public: + static string Name() { return "checkconsistency"; } + + CheckConsistencyCommand(const vector& params, + const map& options, const vector& flags); + + virtual void DoCommand(); + + virtual bool NoDBOpen() { + return true; + } + + static void Help(string& ret); +}; + +} // namespace rocksdb diff --git a/util/ldb_cmd_execute_result.h b/util/ldb_cmd_execute_result.h new file mode 100644 index 0000000000..b9121b2b0a --- /dev/null +++ b/util/ldb_cmd_execute_result.h @@ -0,0 +1,76 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +#pragma once + +namespace rocksdb { + +class LDBCommandExecuteResult { +public: + enum State { + EXEC_NOT_STARTED = 0, EXEC_SUCCEED = 1, EXEC_FAILED = 2, + }; + + LDBCommandExecuteResult() { + state_ = EXEC_NOT_STARTED; + message_ = ""; + } + + LDBCommandExecuteResult(State state, std::string& msg) { + state_ = state; + message_ = msg; + } + + std::string ToString() { + std::string ret; + switch (state_) { + case EXEC_SUCCEED: + break; + case EXEC_FAILED: + ret.append("Failed: "); + break; + case EXEC_NOT_STARTED: + ret.append("Not started: "); + } + if (!message_.empty()) { + ret.append(message_); + } + return ret; + } + + void Reset() { + state_ = EXEC_NOT_STARTED; + message_ = ""; + } + + bool IsSucceed() { + return state_ == EXEC_SUCCEED; + } + + bool IsNotStarted() { + return state_ == EXEC_NOT_STARTED; + } + + bool IsFailed() { + return state_ == EXEC_FAILED; + } + + static LDBCommandExecuteResult SUCCEED(std::string msg) { + return LDBCommandExecuteResult(EXEC_SUCCEED, msg); + } + + static LDBCommandExecuteResult FAILED(std::string msg) { + return LDBCommandExecuteResult(EXEC_FAILED, msg); + } + +private: + State state_; + std::string message_; + + bool operator==(const LDBCommandExecuteResult&); + bool operator!=(const LDBCommandExecuteResult&); +}; + +} diff --git a/util/ldb_tool.cc b/util/ldb_tool.cc new file mode 100644 index 0000000000..8439b63f90 --- /dev/null +++ b/util/ldb_tool.cc @@ -0,0 +1,107 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
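To make the result object's state machine concrete, here is a minimal sketch of how a runner consumes it (the function name ReportExample is illustrative only):

#include <cstdio>
#include "util/ldb_cmd_execute_result.h"

// A fresh result starts NOT_STARTED; a command records failure via
// FAILED(), and ToString() prefixes "Failed: " to the message.
void ReportExample() {
  rocksdb::LDBCommandExecuteResult res;  // EXEC_NOT_STARTED
  if (res.IsNotStarted()) {
    res = rocksdb::LDBCommandExecuteResult::FAILED("db path missing");
  }
  fprintf(stderr, "%s\n", res.ToString().c_str());  // "Failed: db path missing"
}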
+// +#ifndef ROCKSDB_LITE +#include "rocksdb/ldb_tool.h" +#include "util/ldb_cmd.h" + +namespace rocksdb { + +class LDBCommandRunner { +public: + + static void PrintHelp(const char* exec_name) { + string ret; + + ret.append("ldb - LevelDB Tool"); + ret.append("\n\n"); + ret.append("commands MUST specify --" + LDBCommand::ARG_DB + + "= when necessary\n"); + ret.append("\n"); + ret.append("The following optional parameters control if keys/values are " + "input/output as hex or as plain strings:\n"); + ret.append(" --" + LDBCommand::ARG_KEY_HEX + + " : Keys are input/output as hex\n"); + ret.append(" --" + LDBCommand::ARG_VALUE_HEX + + " : Values are input/output as hex\n"); + ret.append(" --" + LDBCommand::ARG_HEX + + " : Both keys and values are input/output as hex\n"); + ret.append("\n"); + + ret.append("The following optional parameters control the database " + "internals:\n"); + ret.append(" --" + LDBCommand::ARG_TTL + + " with 'put','get','scan','dump','query','batchput'" + " : DB supports ttl and value is internally timestamp-suffixed\n"); + ret.append(" --" + LDBCommand::ARG_BLOOM_BITS + "=\n"); + ret.append(" --" + LDBCommand::ARG_COMPRESSION_TYPE + + "=\n"); + ret.append(" --" + LDBCommand::ARG_BLOCK_SIZE + + "=\n"); + ret.append(" --" + LDBCommand::ARG_AUTO_COMPACTION + "=\n"); + ret.append(" --" + LDBCommand::ARG_WRITE_BUFFER_SIZE + + "=\n"); + ret.append(" --" + LDBCommand::ARG_FILE_SIZE + "=\n"); + + ret.append("\n\n"); + ret.append("Data Access Commands:\n"); + PutCommand::Help(ret); + GetCommand::Help(ret); + BatchPutCommand::Help(ret); + ScanCommand::Help(ret); + DeleteCommand::Help(ret); + DBQuerierCommand::Help(ret); + ApproxSizeCommand::Help(ret); + CheckConsistencyCommand::Help(ret); + + ret.append("\n\n"); + ret.append("Admin Commands:\n"); + WALDumperCommand::Help(ret); + CompactorCommand::Help(ret); + ReduceDBLevelsCommand::Help(ret); + ChangeCompactionStyleCommand::Help(ret); + DBDumperCommand::Help(ret); + DBLoaderCommand::Help(ret); + ManifestDumpCommand::Help(ret); + ListColumnFamiliesCommand::Help(ret); + InternalDumpCommand::Help(ret); + + fprintf(stderr, "%s\n", ret.c_str()); + } + + static void RunCommand(int argc, char** argv, Options options) { + if (argc <= 2) { + PrintHelp(argv[0]); + exit(1); + } + + LDBCommand* cmdObj = LDBCommand::InitFromCmdLineArgs(argc, argv, options); + if (cmdObj == nullptr) { + fprintf(stderr, "Unknown command\n"); + PrintHelp(argv[0]); + exit(1); + } + + if (!cmdObj->ValidateCmdLineOptions()) { + exit(1); + } + + cmdObj->Run(); + LDBCommandExecuteResult ret = cmdObj->GetExecuteState(); + fprintf(stderr, "%s\n", ret.ToString().c_str()); + delete cmdObj; + + exit(ret.IsFailed()); + } + +}; + + +void LDBTool::Run(int argc, char** argv, Options options) { + LDBCommandRunner::RunCommand(argc, argv, options); +} +} // namespace rocksdb + +#endif // ROCKSDB_LITE diff --git a/util/log_buffer.cc b/util/log_buffer.cc new file mode 100644 index 0000000000..726c01442b --- /dev/null +++ b/util/log_buffer.cc @@ -0,0 +1,73 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
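LDBTool::Run is the entire public surface of the tool, so embedding ldb in a custom binary takes only a few lines. A minimal driver sketch (assuming the target links against this library; any Options customizations are passed through to every command):

#include "rocksdb/ldb_tool.h"
#include "rocksdb/options.h"

int main(int argc, char** argv) {
  rocksdb::Options options;       // customize defaults here if desired
  rocksdb::LDBTool tool;
  tool.Run(argc, argv, options);  // parses argv and exits with the command's status
  return 0;                       // not reached; RunCommand calls exit()
}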
+ +#include "util/log_buffer.h" + +#include + +namespace rocksdb { + +LogBuffer::LogBuffer(const InfoLogLevel log_level, + Logger*info_log) + : log_level_(log_level), info_log_(info_log) {} + +void LogBuffer::AddLogToBuffer(const char* format, va_list ap) { + if (!info_log_ || log_level_ < info_log_->GetInfoLogLevel()) { + // Skip the level because of its level. + return; + } + + const size_t kLogSizeLimit = 512; + char* alloc_mem = arena_.AllocateAligned(kLogSizeLimit); + BufferedLog* buffered_log = new (alloc_mem) BufferedLog(); + char* p = buffered_log->message; + char* limit = alloc_mem + kLogSizeLimit - 1; + + // store the time + gettimeofday(&(buffered_log->now_tv), nullptr); + + // Print the message + if (p < limit) { + va_list backup_ap; + va_copy(backup_ap, ap); + auto n = vsnprintf(p, limit - p, format, backup_ap); + assert(n >= 0); + p += n; + va_end(backup_ap); + } + + if (p > limit) { + p = limit; + } + + // Add '\0' to the end + *p = '\0'; + + logs_.push_back(buffered_log); +} + +void LogBuffer::FlushBufferToLog() { + for (BufferedLog* log : logs_) { + const time_t seconds = log->now_tv.tv_sec; + struct tm t; + localtime_r(&seconds, &t); + Log(log_level_, info_log_, + "(Original Log Time %04d/%02d/%02d-%02d:%02d:%02d.%06d) %s", + t.tm_year + 1900, t.tm_mon + 1, t.tm_mday, t.tm_hour, t.tm_min, + t.tm_sec, static_cast(log->now_tv.tv_usec), log->message); + } + logs_.clear(); +} + +void LogToBuffer(LogBuffer* log_buffer, const char* format, ...) { + if (log_buffer != nullptr) { + va_list ap; + va_start(ap, format); + log_buffer->AddLogToBuffer(format, ap); + va_end(ap); + } +} + +} // namespace rocksdb diff --git a/util/log_buffer.h b/util/log_buffer.h new file mode 100644 index 0000000000..2a24bf854c --- /dev/null +++ b/util/log_buffer.h @@ -0,0 +1,49 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#pragma once + +#include "rocksdb/env.h" +#include "util/arena.h" +#include "util/autovector.h" +#include + +namespace rocksdb { + +class Logger; + +// A class to buffer info log entries and flush them in the end. +class LogBuffer { + public: + // log_level: the log level for all the logs + // info_log: logger to write the logs to + LogBuffer(const InfoLogLevel log_level, Logger* info_log); + + // Add a log entry to the buffer. + void AddLogToBuffer(const char* format, va_list ap); + + size_t IsEmpty() const { return logs_.empty(); } + + // Flush all buffered log to the info log. + void FlushBufferToLog(); + + private: + // One log entry with its timestamp + struct BufferedLog { + struct timeval now_tv; // Timestamp of the log + char message[1]; // Beginning of log message + }; + + const InfoLogLevel log_level_; + Logger* info_log_; + Arena arena_; + autovector logs_; +}; + +// Add log to the LogBuffer for a delayed info logging. It can be used when +// we want to add some logs inside a mutex. +extern void LogToBuffer(LogBuffer* log_buffer, const char* format, ...); + +} // namespace rocksdb diff --git a/util/log_write_bench.cc b/util/log_write_bench.cc new file mode 100644 index 0000000000..16e7af7e2c --- /dev/null +++ b/util/log_write_bench.cc @@ -0,0 +1,82 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. 
+// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#ifndef GFLAGS +#include +int main() { + fprintf(stderr, "Please install gflags to run rocksdb tools\n"); + return 1; +} +#else + +#include + +#include "rocksdb/env.h" +#include "util/histogram.h" +#include "util/testharness.h" +#include "util/testutil.h" + +using GFLAGS::ParseCommandLineFlags; +using GFLAGS::SetUsageMessage; + +// A simple benchmark to simulate transactional logs + +DEFINE_int32(num_records, 6000, "Number of records."); +DEFINE_int32(record_size, 249, "Size of each record."); +DEFINE_int32(record_interval, 10000, "Interval between records (microSec)"); +DEFINE_int32(bytes_per_sync, 0, "bytes_per_sync parameter in EnvOptions"); +DEFINE_bool(enable_sync, false, "sync after each write."); + +namespace rocksdb { +void RunBenchmark() { + std::string file_name = test::TmpDir() + "/log_write_benchmark.log"; + Env* env = Env::Default(); + EnvOptions env_options; + env_options.use_mmap_writes = false; + env_options.bytes_per_sync = FLAGS_bytes_per_sync; + unique_ptr file; + env->NewWritableFile(file_name, &file, env_options); + + std::string record; + record.assign('X', FLAGS_record_size); + + HistogramImpl hist; + + uint64_t start_time = env->NowMicros(); + for (int i = 0; i < FLAGS_num_records; i++) { + uint64_t start_nanos = env->NowNanos(); + file->Append(record); + file->Flush(); + if (FLAGS_enable_sync) { + file->Sync(); + } + hist.Add(env->NowNanos() - start_nanos); + + if (i % 1000 == 1) { + fprintf(stderr, "Wrote %d records...\n", i); + } + + int time_to_sleep = + (i + 1) * FLAGS_record_interval - (env->NowMicros() - start_time); + if (time_to_sleep > 0) { + env->SleepForMicroseconds(time_to_sleep); + } + } + + fprintf(stderr, "Distribution of latency of append+flush: \n%s", + hist.ToString().c_str()); +} +} // namespace rocksdb + +int main(int argc, char** argv) { + SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) + + " [OPTIONS]..."); + ParseCommandLineFlags(&argc, &argv, true); + + rocksdb::RunBenchmark(); + return 0; +} + +#endif // GFLAGS diff --git a/util/logging.cc b/util/logging.cc new file mode 100644 index 0000000000..1b5549d731 --- /dev/null +++ b/util/logging.cc @@ -0,0 +1,99 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "util/logging.h" + +#define __STDC_FORMAT_MACROS +#include +#include +#include +#include +#include +#include "rocksdb/env.h" +#include "rocksdb/slice.h" + +namespace rocksdb { + + +// for sizes >=10TB, print "XXTB" +// for sizes >=10GB, print "XXGB" +// etc. 
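// (values are truncated, not rounded: 12,345,678,901,234 bytes is
// about 11.2 * 2^40, so it prints as "11TB")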
+// append file size summary to output and return the len +int AppendHumanBytes(uint64_t bytes, char* output, int len) { + const uint64_t ull10 = 10; + if (bytes >= ull10 << 40) { + return snprintf(output, len, "%" PRIu64 "TB", bytes >> 40); + } else if (bytes >= ull10 << 30) { + return snprintf(output, len, "%" PRIu64 "GB", bytes >> 30); + } else if (bytes >= ull10 << 20) { + return snprintf(output, len, "%" PRIu64 "MB", bytes >> 20); + } else if (bytes >= ull10 << 10) { + return snprintf(output, len, "%" PRIu64 "KB", bytes >> 10); + } else { + return snprintf(output, len, "%" PRIu64 "B", bytes); + } +} + +void AppendNumberTo(std::string* str, uint64_t num) { + char buf[30]; + snprintf(buf, sizeof(buf), "%llu", (unsigned long long) num); + str->append(buf); +} + +void AppendEscapedStringTo(std::string* str, const Slice& value) { + for (size_t i = 0; i < value.size(); i++) { + char c = value[i]; + if (c >= ' ' && c <= '~') { + str->push_back(c); + } else { + char buf[10]; + snprintf(buf, sizeof(buf), "\\x%02x", + static_cast(c) & 0xff); + str->append(buf); + } + } +} + +std::string NumberToString(uint64_t num) { + std::string r; + AppendNumberTo(&r, num); + return r; +} + +std::string EscapeString(const Slice& value) { + std::string r; + AppendEscapedStringTo(&r, value); + return r; +} + +bool ConsumeDecimalNumber(Slice* in, uint64_t* val) { + uint64_t v = 0; + int digits = 0; + while (!in->empty()) { + char c = (*in)[0]; + if (c >= '0' && c <= '9') { + ++digits; + const unsigned int delta = (c - '0'); + static const uint64_t kMaxUint64 = ~static_cast(0); + if (v > kMaxUint64/10 || + (v == kMaxUint64/10 && delta > kMaxUint64%10)) { + // Overflow + return false; + } + v = (v * 10) + delta; + in->remove_prefix(1); + } else { + break; + } + } + *val = v; + return (digits > 0); +} + +} // namespace rocksdb diff --git a/util/logging.h b/util/logging.h new file mode 100644 index 0000000000..ce02697268 --- /dev/null +++ b/util/logging.h @@ -0,0 +1,47 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Must not be included from any .h files to avoid polluting the namespace +// with macros. + +#pragma once +#include +#include +#include +#include "port/port.h" + +namespace rocksdb { + +class Slice; +class WritableFile; + +// Append a human-readable size in bytes +int AppendHumanBytes(uint64_t bytes, char* output, int len); + +// Append a human-readable printout of "num" to *str +extern void AppendNumberTo(std::string* str, uint64_t num); + +// Append a human-readable printout of "value" to *str. +// Escapes any non-printable characters found in "value". +extern void AppendEscapedStringTo(std::string* str, const Slice& value); + +// Return a human-readable printout of "num" +extern std::string NumberToString(uint64_t num); + +// Return a human-readable version of "value". +// Escapes any non-printable characters found in "value". +extern std::string EscapeString(const Slice& value); + +// Parse a human-readable number from "*in" into *value. 
On success, +// advances "*in" past the consumed number and sets "*val" to the +// numeric value. Otherwise, returns false and leaves *in in an +// unspecified state. +extern bool ConsumeDecimalNumber(Slice* in, uint64_t* val); + +} // namespace rocksdb diff --git a/util/manual_compaction_test.cc b/util/manual_compaction_test.cc new file mode 100644 index 0000000000..dd615f0570 --- /dev/null +++ b/util/manual_compaction_test.cc @@ -0,0 +1,156 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Test for issue 178: a manual compaction causes deleted data to reappear. +#include +#include +#include + +#include "rocksdb/db.h" +#include "rocksdb/compaction_filter.h" +#include "rocksdb/slice.h" +#include "rocksdb/write_batch.h" +#include "util/testharness.h" + +using namespace rocksdb; + +namespace { + +const int kNumKeys = 1100000; + +std::string Key1(int i) { + char buf[100]; + snprintf(buf, sizeof(buf), "my_key_%d", i); + return buf; +} + +std::string Key2(int i) { + return Key1(i) + "_xxx"; +} + +class ManualCompactionTest { + public: + ManualCompactionTest() { + // Get rid of any state from an old run. + dbname_ = rocksdb::test::TmpDir() + "/rocksdb_cbug_test"; + DestroyDB(dbname_, rocksdb::Options()); + } + + std::string dbname_; +}; + +class DestroyAllCompactionFilter : public CompactionFilter { + public: + DestroyAllCompactionFilter() {} + + virtual bool Filter(int level, + const Slice& key, + const Slice& existing_value, + std::string* new_value, + bool* value_changed) const { + return existing_value.ToString() == "destroy"; + } + + virtual const char* Name() const { + return "DestroyAllCompactionFilter"; + } +}; + +TEST(ManualCompactionTest, CompactTouchesAllKeys) { + for (int iter = 0; iter < 2; ++iter) { + DB* db; + Options options; + if (iter == 0) { // level compaction + options.num_levels = 3; + options.compaction_style = kCompactionStyleLevel; + } else { // universal compaction + options.compaction_style = kCompactionStyleUniversal; + } + options.create_if_missing = true; + options.compression = rocksdb::kNoCompression; + options.compaction_filter = new DestroyAllCompactionFilter(); + ASSERT_OK(DB::Open(options, dbname_, &db)); + + db->Put(WriteOptions(), Slice("key1"), Slice("destroy")); + db->Put(WriteOptions(), Slice("key2"), Slice("destroy")); + db->Put(WriteOptions(), Slice("key3"), Slice("value3")); + db->Put(WriteOptions(), Slice("key4"), Slice("destroy")); + + Slice key4("key4"); + db->CompactRange(nullptr, &key4); + Iterator* itr = db->NewIterator(ReadOptions()); + itr->SeekToFirst(); + ASSERT_TRUE(itr->Valid()); + ASSERT_EQ("key3", itr->key().ToString()); + itr->Next(); + ASSERT_TRUE(!itr->Valid()); + delete itr; + + delete options.compaction_filter; + delete db; + DestroyDB(dbname_, options); + } +} + +TEST(ManualCompactionTest, Test) { + + // Open database. Disable compression since it affects the creation + // of layers and the code below is trying to test against a very + // specific scenario. 
+ rocksdb::DB* db; + rocksdb::Options db_options; + db_options.create_if_missing = true; + db_options.compression = rocksdb::kNoCompression; + ASSERT_OK(rocksdb::DB::Open(db_options, dbname_, &db)); + + // create first key range + rocksdb::WriteBatch batch; + for (int i = 0; i < kNumKeys; i++) { + batch.Put(Key1(i), "value for range 1 key"); + } + ASSERT_OK(db->Write(rocksdb::WriteOptions(), &batch)); + + // create second key range + batch.Clear(); + for (int i = 0; i < kNumKeys; i++) { + batch.Put(Key2(i), "value for range 2 key"); + } + ASSERT_OK(db->Write(rocksdb::WriteOptions(), &batch)); + + // delete second key range + batch.Clear(); + for (int i = 0; i < kNumKeys; i++) { + batch.Delete(Key2(i)); + } + ASSERT_OK(db->Write(rocksdb::WriteOptions(), &batch)); + + // compact database + std::string start_key = Key1(0); + std::string end_key = Key1(kNumKeys - 1); + rocksdb::Slice least(start_key.data(), start_key.size()); + rocksdb::Slice greatest(end_key.data(), end_key.size()); + + // commenting out the line below causes the example to work correctly + db->CompactRange(&least, &greatest); + + // count the keys + rocksdb::Iterator* iter = db->NewIterator(rocksdb::ReadOptions()); + int num_keys = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + num_keys++; + } + delete iter; + ASSERT_EQ(kNumKeys, num_keys) << "Bad number of keys"; + + // close database + delete db; + DestroyDB(dbname_, rocksdb::Options()); +} + +} // anonymous namespace + +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +} diff --git a/util/murmurhash.cc b/util/murmurhash.cc new file mode 100644 index 0000000000..d9d8b70617 --- /dev/null +++ b/util/murmurhash.cc @@ -0,0 +1,183 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +/* + Murmurhash from http://sites.google.com/site/murmurhash/ + + All code is released to the public domain. For business purposes, Murmurhash is + under the MIT license. +*/ +#include "murmurhash.h" + +#if defined(__x86_64__) + +// ------------------------------------------------------------------- +// +// The same caveats as 32-bit MurmurHash2 apply here - beware of alignment +// and endian-ness issues if used across multiple platforms. +// +// 64-bit hash for 64-bit platforms + +uint64_t MurmurHash64A ( const void * key, int len, unsigned int seed ) +{ + const uint64_t m = 0xc6a4a7935bd1e995; + const int r = 47; + + uint64_t h = seed ^ (len * m); + + const uint64_t * data = (const uint64_t *)key; + const uint64_t * end = data + (len/8); + + while(data != end) + { + uint64_t k = *data++; + + k *= m; + k ^= k >> r; + k *= m; + + h ^= k; + h *= m; + } + + const unsigned char * data2 = (const unsigned char*)data; + + switch(len & 7) + { + case 7: h ^= ((uint64_t)data2[6]) << 48; + case 6: h ^= ((uint64_t)data2[5]) << 40; + case 5: h ^= ((uint64_t)data2[4]) << 32; + case 4: h ^= ((uint64_t)data2[3]) << 24; + case 3: h ^= ((uint64_t)data2[2]) << 16; + case 2: h ^= ((uint64_t)data2[1]) << 8; + case 1: h ^= ((uint64_t)data2[0]); + h *= m; + }; + + h ^= h >> r; + h *= m; + h ^= h >> r; + + return h; +} + +#elif defined(__i386__) + +// ------------------------------------------------------------------- +// +// Note - This code makes a few assumptions about how your machine behaves - +// +// 1. 
We can read a 4-byte value from any address without crashing +// 2. sizeof(int) == 4 +// +// And it has a few limitations - +// +// 1. It will not work incrementally. +// 2. It will not produce the same results on little-endian and big-endian +// machines. + +unsigned int MurmurHash2 ( const void * key, int len, unsigned int seed ) +{ + // 'm' and 'r' are mixing constants generated offline. + // They're not really 'magic', they just happen to work well. + + const unsigned int m = 0x5bd1e995; + const int r = 24; + + // Initialize the hash to a 'random' value + + unsigned int h = seed ^ len; + + // Mix 4 bytes at a time into the hash + + const unsigned char * data = (const unsigned char *)key; + + while(len >= 4) + { + unsigned int k = *(unsigned int *)data; + + k *= m; + k ^= k >> r; + k *= m; + + h *= m; + h ^= k; + + data += 4; + len -= 4; + } + + // Handle the last few bytes of the input array + + switch(len) + { + case 3: h ^= data[2] << 16; + case 2: h ^= data[1] << 8; + case 1: h ^= data[0]; + h *= m; + }; + + // Do a few final mixes of the hash to ensure the last few + // bytes are well-incorporated. + + h ^= h >> 13; + h *= m; + h ^= h >> 15; + + return h; +} + +#else + +// ------------------------------------------------------------------- +// +// Same as MurmurHash2, but endian- and alignment-neutral. +// Half the speed though, alas. + +unsigned int MurmurHashNeutral2 ( const void * key, int len, unsigned int seed ) +{ + const unsigned int m = 0x5bd1e995; + const int r = 24; + + unsigned int h = seed ^ len; + + const unsigned char * data = (const unsigned char *)key; + + while(len >= 4) + { + unsigned int k; + + k = data[0]; + k |= data[1] << 8; + k |= data[2] << 16; + k |= data[3] << 24; + + k *= m; + k ^= k >> r; + k *= m; + + h *= m; + h ^= k; + + data += 4; + len -= 4; + } + + switch(len) + { + case 3: h ^= data[2] << 16; + case 2: h ^= data[1] << 8; + case 1: h ^= data[0]; + h *= m; + }; + + h ^= h >> 13; + h *= m; + h ^= h >> 15; + + return h; +} + +#endif diff --git a/util/murmurhash.h b/util/murmurhash.h new file mode 100644 index 0000000000..faa86556d2 --- /dev/null +++ b/util/murmurhash.h @@ -0,0 +1,42 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +/* + Murmurhash from http://sites.google.com/site/murmurhash/ + + All code is released to the public domain. For business purposes, Murmurhash is + under the MIT license. +*/ +#pragma once +#include +#include "rocksdb/slice.h" + +#if defined(__x86_64__) +#define MURMUR_HASH MurmurHash64A +uint64_t MurmurHash64A ( const void * key, int len, unsigned int seed ); +#define MurmurHash MurmurHash64A +typedef uint64_t murmur_t; + +#elif defined(__i386__) +#define MURMUR_HASH MurmurHash2 +unsigned int MurmurHash2 ( const void * key, int len, unsigned int seed ); +#define MurmurHash MurmurHash2 +typedef unsigned int murmur_t; + +#else +#define MURMUR_HASH MurmurHashNeutral2 +unsigned int MurmurHashNeutral2 ( const void * key, int len, unsigned int seed ); +#define MurmurHash MurmurHashNeutral2 +typedef unsigned int murmur_t; +#endif + +// Allow slice to be hashable by murmur hash. 
diff --git a/util/mutexlock.h b/util/mutexlock.h
new file mode 100644
index 0000000000..0f4e5c8b7b
--- /dev/null
+++ b/util/mutexlock.h
@@ -0,0 +1,78 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include "port/port.h"
+
+namespace rocksdb {
+
+// Helper class that locks a mutex on construction and unlocks the mutex when
+// the destructor of the MutexLock object is invoked.
+//
+// Typical usage:
+//
+//   void MyClass::MyMethod() {
+//     MutexLock l(&mu_);       // mu_ is an instance variable
+//     ... some complex code, possibly with multiple return paths ...
+//   }
+
+class MutexLock {
+ public:
+  explicit MutexLock(port::Mutex *mu) : mu_(mu) {
+    this->mu_->Lock();
+  }
+  ~MutexLock() { this->mu_->Unlock(); }
+
+ private:
+  port::Mutex *const mu_;
+  // No copying allowed
+  MutexLock(const MutexLock&);
+  void operator=(const MutexLock&);
+};
+
+//
+// Acquire a ReadLock on the specified RWMutex.
+// The Lock will be automatically released when the
+// object goes out of scope.
+//
+class ReadLock {
+ public:
+  explicit ReadLock(port::RWMutex *mu) : mu_(mu) {
+    this->mu_->ReadLock();
+  }
+  ~ReadLock() { this->mu_->Unlock(); }
+
+ private:
+  port::RWMutex *const mu_;
+  // No copying allowed
+  ReadLock(const ReadLock&);
+  void operator=(const ReadLock&);
+};
+
+
+//
+// Acquire a WriteLock on the specified RWMutex.
+// The Lock will be automatically released when the
+// object goes out of scope.
+//
+class WriteLock {
+ public:
+  explicit WriteLock(port::RWMutex *mu) : mu_(mu) {
+    this->mu_->WriteLock();
+  }
+  ~WriteLock() { this->mu_->Unlock(); }
+
+ private:
+  port::RWMutex *const mu_;
+  // No copying allowed
+  WriteLock(const WriteLock&);
+  void operator=(const WriteLock&);
+};
+
+}  // namespace rocksdb
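An editorial sketch of the scoped-locking idiom the header documents; the Counter class and its members are hypothetical:

#include <stdint.h>
#include "util/mutexlock.h"

class Counter {
 public:
  void Increment() {
    rocksdb::MutexLock l(&mu_);  // locks mu_ here
    ++count_;
  }                              // mu_ unlocked when l leaves scope
 private:
  rocksdb::port::Mutex mu_;
  uint64_t count_ = 0;
};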
diff --git a/util/options.cc b/util/options.cc
new file mode 100644
index 0000000000..c4d3e5e986
--- /dev/null
+++ b/util/options.cc
@@ -0,0 +1,553 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "rocksdb/options.h"
+
+#define __STDC_FORMAT_MACROS
+#include <inttypes.h>
+#include <limits>
+
+#include "rocksdb/cache.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/env.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/table.h"
+#include "rocksdb/table_properties.h"
+#include "table/block_based_table_factory.h"
+
+namespace rocksdb {
+
+ColumnFamilyOptions::ColumnFamilyOptions()
+    : comparator(BytewiseComparator()),
+      merge_operator(nullptr),
+      compaction_filter(nullptr),
+      compaction_filter_factory(std::shared_ptr<CompactionFilterFactory>(
+          new DefaultCompactionFilterFactory())),
+      compaction_filter_factory_v2(new DefaultCompactionFilterFactoryV2()),
+      write_buffer_size(4 << 20),
+      max_write_buffer_number(2),
+      min_write_buffer_number_to_merge(1),
+      block_cache(nullptr),
+      block_cache_compressed(nullptr),
+      block_size(4096),
+      block_restart_interval(16),
+      compression(kSnappyCompression),
+      filter_policy(nullptr),
+      prefix_extractor(nullptr),
+      whole_key_filtering(true),
+      num_levels(7),
+      level0_file_num_compaction_trigger(4),
+      level0_slowdown_writes_trigger(20),
+      level0_stop_writes_trigger(24),
+      max_mem_compaction_level(2),
+      target_file_size_base(2 * 1048576),
+      target_file_size_multiplier(1),
+      max_bytes_for_level_base(10 * 1048576),
+      max_bytes_for_level_multiplier(10),
+      max_bytes_for_level_multiplier_additional(num_levels, 1),
+      expanded_compaction_factor(25),
+      source_compaction_factor(1),
+      max_grandparent_overlap_factor(10),
+      disable_seek_compaction(true),
+      soft_rate_limit(0.0),
+      hard_rate_limit(0.0),
+      rate_limit_delay_max_milliseconds(1000),
+      no_block_cache(false),
+      arena_block_size(0),
+      disable_auto_compactions(false),
+      purge_redundant_kvs_while_flush(true),
+      block_size_deviation(10),
+      compaction_style(kCompactionStyleLevel),
+      verify_checksums_in_compaction(true),
+      filter_deletes(false),
+      max_sequential_skip_in_iterations(8),
+      memtable_factory(
+          std::shared_ptr<MemTableRepFactory>(new SkipListFactory)),
+      table_factory(
+          std::shared_ptr<TableFactory>(new BlockBasedTableFactory())),
+      inplace_update_support(false),
+      inplace_update_num_locks(10000),
+      inplace_callback(nullptr),
+      memtable_prefix_bloom_bits(0),
+      memtable_prefix_bloom_probes(6),
+      memtable_prefix_bloom_huge_page_tlb_size(0),
+      bloom_locality(0),
+      max_successive_merges(0),
+      min_partial_merge_operands(2) {
+  assert(memtable_factory.get() != nullptr);
+}
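An editorial sketch, not part of the patch: DB-wide and per-column-family settings can be tuned separately and recombined. The two-argument Options constructor used below is assumed from this tree's include/rocksdb/options.h; the values are illustrative only.

#include "rocksdb/options.h"

rocksdb::Options MakeTunedOptions() {
  rocksdb::DBOptions db_opts;            // DB-wide settings
  db_opts.create_if_missing = true;
  rocksdb::ColumnFamilyOptions cf_opts;  // per-column-family settings
  cf_opts.write_buffer_size = 8 << 20;   // 8 MB memtables instead of 4 MB
  return rocksdb::Options(db_opts, cf_opts);
}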
+
+ColumnFamilyOptions::ColumnFamilyOptions(const Options& options)
+    : comparator(options.comparator),
+      merge_operator(options.merge_operator),
+      compaction_filter(options.compaction_filter),
+      compaction_filter_factory(options.compaction_filter_factory),
+      compaction_filter_factory_v2(options.compaction_filter_factory_v2),
+      write_buffer_size(options.write_buffer_size),
+      max_write_buffer_number(options.max_write_buffer_number),
+      min_write_buffer_number_to_merge(
+          options.min_write_buffer_number_to_merge),
+      block_cache(options.block_cache),
+      block_cache_compressed(options.block_cache_compressed),
+      block_size(options.block_size),
+      block_restart_interval(options.block_restart_interval),
+      compression(options.compression),
+      compression_per_level(options.compression_per_level),
+      compression_opts(options.compression_opts),
+      filter_policy(options.filter_policy),
+      prefix_extractor(options.prefix_extractor),
+      whole_key_filtering(options.whole_key_filtering),
+      num_levels(options.num_levels),
+      level0_file_num_compaction_trigger(
+          options.level0_file_num_compaction_trigger),
+      level0_slowdown_writes_trigger(options.level0_slowdown_writes_trigger),
+      level0_stop_writes_trigger(options.level0_stop_writes_trigger),
+      max_mem_compaction_level(options.max_mem_compaction_level),
+      target_file_size_base(options.target_file_size_base),
+      target_file_size_multiplier(options.target_file_size_multiplier),
+      max_bytes_for_level_base(options.max_bytes_for_level_base),
+      max_bytes_for_level_multiplier(options.max_bytes_for_level_multiplier),
+      max_bytes_for_level_multiplier_additional(
+          options.max_bytes_for_level_multiplier_additional),
+      expanded_compaction_factor(options.expanded_compaction_factor),
+      source_compaction_factor(options.source_compaction_factor),
+      max_grandparent_overlap_factor(options.max_grandparent_overlap_factor),
+      disable_seek_compaction(options.disable_seek_compaction),
+      soft_rate_limit(options.soft_rate_limit),
+      hard_rate_limit(options.hard_rate_limit),
+      rate_limit_delay_max_milliseconds(
+          options.rate_limit_delay_max_milliseconds),
+      no_block_cache(options.no_block_cache),
+      arena_block_size(options.arena_block_size),
+      disable_auto_compactions(options.disable_auto_compactions),
+      purge_redundant_kvs_while_flush(options.purge_redundant_kvs_while_flush),
+      block_size_deviation(options.block_size_deviation),
+      compaction_style(options.compaction_style),
+      verify_checksums_in_compaction(options.verify_checksums_in_compaction),
+      compaction_options_universal(options.compaction_options_universal),
+      compaction_options_fifo(options.compaction_options_fifo),
+      filter_deletes(options.filter_deletes),
+      max_sequential_skip_in_iterations(
+          options.max_sequential_skip_in_iterations),
+      memtable_factory(options.memtable_factory),
+      table_factory(options.table_factory),
+      table_properties_collector_factories(
+          options.table_properties_collector_factories),
+      inplace_update_support(options.inplace_update_support),
+      inplace_update_num_locks(options.inplace_update_num_locks),
+      inplace_callback(options.inplace_callback),
+      memtable_prefix_bloom_bits(options.memtable_prefix_bloom_bits),
+      memtable_prefix_bloom_probes(options.memtable_prefix_bloom_probes),
+      memtable_prefix_bloom_huge_page_tlb_size(
+          options.memtable_prefix_bloom_huge_page_tlb_size),
+      bloom_locality(options.bloom_locality),
+      max_successive_merges(options.max_successive_merges),
+      min_partial_merge_operands(options.min_partial_merge_operands) {
+  assert(memtable_factory.get() != nullptr);
+}
+
+DBOptions::DBOptions()
+    : create_if_missing(false),
+      error_if_exists(false),
+      paranoid_checks(true),
+      env(Env::Default()),
+      info_log(nullptr),
+      info_log_level(INFO_LEVEL),
+      max_open_files(5000),
+      max_total_wal_size(0),
+      statistics(nullptr),
+      disableDataSync(false),
+      use_fsync(false),
+      db_stats_log_interval(1800),
+      db_log_dir(""),
+      wal_dir(""),
+      delete_obsolete_files_period_micros(6 * 60 * 60 * 1000000UL),
+      max_background_compactions(1),
+      max_background_flushes(1),
+      max_log_file_size(0),
+      log_file_time_to_roll(0),
+      keep_log_file_num(1000),
+      max_manifest_file_size(std::numeric_limits<uint64_t>::max()),
+      table_cache_numshardbits(4),
+      table_cache_remove_scan_count_limit(16),
+      WAL_ttl_seconds(0),
+      WAL_size_limit_MB(0),
+      manifest_preallocation_size(4 * 1024 * 1024),
+      allow_os_buffer(true),
+      allow_mmap_reads(false),
+      allow_mmap_writes(false),
+      is_fd_close_on_exec(true),
+      skip_log_error_on_recovery(false),
+      stats_dump_period_sec(3600),
+      advise_random_on_open(true),
access_hint_on_compaction_start(NORMAL), + use_adaptive_mutex(false), + bytes_per_sync(0), + allow_thread_local(true) {} + +DBOptions::DBOptions(const Options& options) + : create_if_missing(options.create_if_missing), + error_if_exists(options.error_if_exists), + paranoid_checks(options.paranoid_checks), + env(options.env), + info_log(options.info_log), + info_log_level(options.info_log_level), + max_open_files(options.max_open_files), + max_total_wal_size(options.max_total_wal_size), + statistics(options.statistics), + disableDataSync(options.disableDataSync), + use_fsync(options.use_fsync), + db_stats_log_interval(options.db_stats_log_interval), + db_log_dir(options.db_log_dir), + wal_dir(options.wal_dir), + delete_obsolete_files_period_micros( + options.delete_obsolete_files_period_micros), + max_background_compactions(options.max_background_compactions), + max_background_flushes(options.max_background_flushes), + max_log_file_size(options.max_log_file_size), + log_file_time_to_roll(options.log_file_time_to_roll), + keep_log_file_num(options.keep_log_file_num), + max_manifest_file_size(options.max_manifest_file_size), + table_cache_numshardbits(options.table_cache_numshardbits), + table_cache_remove_scan_count_limit( + options.table_cache_remove_scan_count_limit), + WAL_ttl_seconds(options.WAL_ttl_seconds), + WAL_size_limit_MB(options.WAL_size_limit_MB), + manifest_preallocation_size(options.manifest_preallocation_size), + allow_os_buffer(options.allow_os_buffer), + allow_mmap_reads(options.allow_mmap_reads), + allow_mmap_writes(options.allow_mmap_writes), + is_fd_close_on_exec(options.is_fd_close_on_exec), + skip_log_error_on_recovery(options.skip_log_error_on_recovery), + stats_dump_period_sec(options.stats_dump_period_sec), + advise_random_on_open(options.advise_random_on_open), + access_hint_on_compaction_start(options.access_hint_on_compaction_start), + use_adaptive_mutex(options.use_adaptive_mutex), + bytes_per_sync(options.bytes_per_sync), + allow_thread_local(options.allow_thread_local) {} + +static const char* const access_hints[] = { + "NONE", "NORMAL", "SEQUENTIAL", "WILLNEED" +}; + +void DBOptions::Dump(Logger* log) const { + Log(log," Options.error_if_exists: %d", error_if_exists); + Log(log," Options.create_if_missing: %d", create_if_missing); + Log(log," Options.paranoid_checks: %d", paranoid_checks); + Log(log," Options.env: %p", env); + Log(log," Options.info_log: %p", info_log.get()); + Log(log," Options.max_open_files: %d", max_open_files); + Log(log," Options.max_total_wal_size: %" PRIu64, max_total_wal_size); + Log(log, " Options.disableDataSync: %d", disableDataSync); + Log(log, " Options.use_fsync: %d", use_fsync); + Log(log, " Options.max_log_file_size: %zu", max_log_file_size); + Log(log, "Options.max_manifest_file_size: %lu", + (unsigned long)max_manifest_file_size); + Log(log, " Options.log_file_time_to_roll: %zu", log_file_time_to_roll); + Log(log, " Options.keep_log_file_num: %zu", keep_log_file_num); + Log(log, " Options.db_stats_log_interval: %d", db_stats_log_interval); + Log(log, " Options.allow_os_buffer: %d", allow_os_buffer); + Log(log, " Options.allow_mmap_reads: %d", allow_mmap_reads); + Log(log, " Options.allow_mmap_writes: %d", allow_mmap_writes); + Log(log, " Options.db_log_dir: %s", + db_log_dir.c_str()); + Log(log, " Options.wal_dir: %s", + wal_dir.c_str()); + Log(log, " Options.table_cache_numshardbits: %d", + table_cache_numshardbits); + Log(log, " Options.table_cache_remove_scan_count_limit: %d", + table_cache_remove_scan_count_limit); + 
Log(log, " Options.delete_obsolete_files_period_micros: %lu", + (unsigned long)delete_obsolete_files_period_micros); + Log(log, " Options.max_background_compactions: %d", + max_background_compactions); + Log(log, " Options.max_background_flushes: %d", + max_background_flushes); + Log(log, " Options.WAL_ttl_seconds: %lu", + (unsigned long)WAL_ttl_seconds); + Log(log, " Options.WAL_size_limit_MB: %lu", + (unsigned long)WAL_size_limit_MB); + Log(log, " Options.manifest_preallocation_size: %zu", + manifest_preallocation_size); + Log(log, " Options.allow_os_buffer: %d", + allow_os_buffer); + Log(log, " Options.allow_mmap_reads: %d", + allow_mmap_reads); + Log(log, " Options.allow_mmap_writes: %d", + allow_mmap_writes); + Log(log, " Options.is_fd_close_on_exec: %d", + is_fd_close_on_exec); + Log(log, " Options.skip_log_error_on_recovery: %d", + skip_log_error_on_recovery); + Log(log, " Options.stats_dump_period_sec: %u", + stats_dump_period_sec); + Log(log, " Options.advise_random_on_open: %d", + advise_random_on_open); + Log(log, " Options.access_hint_on_compaction_start: %s", + access_hints[access_hint_on_compaction_start]); + Log(log, " Options.use_adaptive_mutex: %d", + use_adaptive_mutex); + Log(log, " Options.bytes_per_sync: %lu", + (unsigned long)bytes_per_sync); +} // DBOptions::Dump + +void ColumnFamilyOptions::Dump(Logger* log) const { + Log(log, " Options.comparator: %s", comparator->Name()); + Log(log, " Options.merge_operator: %s", + merge_operator ? merge_operator->Name() : "None"); + Log(log, " Options.compaction_filter_factory: %s", + compaction_filter_factory->Name()); + Log(log, " Options.compaction_filter_factory_v2: %s", + compaction_filter_factory_v2->Name()); + Log(log, " Options.memtable_factory: %s", memtable_factory->Name()); + Log(log, " Options.table_factory: %s", table_factory->Name()); + Log(log, " Options.write_buffer_size: %zd", write_buffer_size); + Log(log, " Options.max_write_buffer_number: %d", max_write_buffer_number); + Log(log," Options.block_cache: %p", block_cache.get()); + Log(log," Options.block_cache_compressed: %p", + block_cache_compressed.get()); + if (block_cache) { + Log(log," Options.block_cache_size: %zd", + block_cache->GetCapacity()); + } + if (block_cache_compressed) { + Log(log,"Options.block_cache_compressed_size: %zd", + block_cache_compressed->GetCapacity()); + } + Log(log," Options.block_size: %zd", block_size); + Log(log," Options.block_restart_interval: %d", block_restart_interval); + if (!compression_per_level.empty()) { + for (unsigned int i = 0; i < compression_per_level.size(); i++) { + Log(log," Options.compression[%d]: %d", + i, compression_per_level[i]); + } + } else { + Log(log," Options.compression: %d", compression); + } + Log(log," Options.filter_policy: %s", + filter_policy == nullptr ? "nullptr" : filter_policy->Name()); + Log(log," Options.prefix_extractor: %s", + prefix_extractor == nullptr ? 
"nullptr" : prefix_extractor->Name()); + Log(log," Options.whole_key_filtering: %d", whole_key_filtering); + Log(log," Options.num_levels: %d", num_levels); + Log(log," Options.min_write_buffer_number_to_merge: %d", + min_write_buffer_number_to_merge); + Log(log," Options.purge_redundant_kvs_while_flush: %d", + purge_redundant_kvs_while_flush); + Log(log," Options.compression_opts.window_bits: %d", + compression_opts.window_bits); + Log(log," Options.compression_opts.level: %d", + compression_opts.level); + Log(log," Options.compression_opts.strategy: %d", + compression_opts.strategy); + Log(log," Options.level0_file_num_compaction_trigger: %d", + level0_file_num_compaction_trigger); + Log(log," Options.level0_slowdown_writes_trigger: %d", + level0_slowdown_writes_trigger); + Log(log," Options.level0_stop_writes_trigger: %d", + level0_stop_writes_trigger); + Log(log," Options.max_mem_compaction_level: %d", + max_mem_compaction_level); + Log(log," Options.target_file_size_base: %d", + target_file_size_base); + Log(log," Options.target_file_size_multiplier: %d", + target_file_size_multiplier); + Log(log," Options.max_bytes_for_level_base: %lu", + (unsigned long)max_bytes_for_level_base); + Log(log," Options.max_bytes_for_level_multiplier: %d", + max_bytes_for_level_multiplier); + for (int i = 0; i < num_levels; i++) { + Log(log,"Options.max_bytes_for_level_multiplier_addtl[%d]: %d", + i, max_bytes_for_level_multiplier_additional[i]); + } + Log(log," Options.max_sequential_skip_in_iterations: %lu", + (unsigned long)max_sequential_skip_in_iterations); + Log(log," Options.expanded_compaction_factor: %d", + expanded_compaction_factor); + Log(log," Options.source_compaction_factor: %d", + source_compaction_factor); + Log(log," Options.max_grandparent_overlap_factor: %d", + max_grandparent_overlap_factor); + Log(log," Options.disable_seek_compaction: %d", + disable_seek_compaction); + Log(log," Options.no_block_cache: %d", + no_block_cache); + Log(log," Options.arena_block_size: %zu", + arena_block_size); + Log(log," Options.soft_rate_limit: %.2f", + soft_rate_limit); + Log(log," Options.hard_rate_limit: %.2f", + hard_rate_limit); + Log(log," Options.rate_limit_delay_max_milliseconds: %u", + rate_limit_delay_max_milliseconds); + Log(log," Options.disable_auto_compactions: %d", + disable_auto_compactions); + Log(log," Options.purge_redundant_kvs_while_flush: %d", + purge_redundant_kvs_while_flush); + Log(log," Options.block_size_deviation: %d", + block_size_deviation); + Log(log," Options.filter_deletes: %d", + filter_deletes); + Log(log, " Options.verify_checksums_in_compaction: %d", + verify_checksums_in_compaction); + Log(log," Options.compaction_style: %d", + compaction_style); + Log(log," Options.compaction_options_universal.size_ratio: %u", + compaction_options_universal.size_ratio); + Log(log,"Options.compaction_options_universal.min_merge_width: %u", + compaction_options_universal.min_merge_width); + Log(log,"Options.compaction_options_universal.max_merge_width: %u", + compaction_options_universal.max_merge_width); + Log(log,"Options.compaction_options_universal." 
+ "max_size_amplification_percent: %u", + compaction_options_universal.max_size_amplification_percent); + Log(log, + "Options.compaction_options_universal.compression_size_percent: %u", + compaction_options_universal.compression_size_percent); + Log(log, "Options.compaction_options_fifo.max_table_files_size: %" PRIu64, + compaction_options_fifo.max_table_files_size); + std::string collector_names; + for (const auto& collector_factory : table_properties_collector_factories) { + collector_names.append(collector_factory->Name()); + collector_names.append("; "); + } + Log(log, " Options.table_properties_collectors: %s", + collector_names.c_str()); + Log(log, " Options.inplace_update_support: %d", + inplace_update_support); + Log(log, " Options.inplace_update_num_locks: %zd", + inplace_update_num_locks); + Log(log, " Options.min_partial_merge_operands: %u", + min_partial_merge_operands); + // TODO: easier config for bloom (maybe based on avg key/value size) + Log(log, " Options.memtable_prefix_bloom_bits: %d", + memtable_prefix_bloom_bits); + Log(log, " Options.memtable_prefix_bloom_probes: %d", + memtable_prefix_bloom_probes); + Log(log, " Options.memtable_prefix_bloom_huge_page_tlb_size: %zu", + memtable_prefix_bloom_huge_page_tlb_size); + Log(log, " Options.bloom_locality: %d", + bloom_locality); + Log(log, " Options.max_successive_merges: %zd", + max_successive_merges); +} // ColumnFamilyOptions::Dump + +void Options::Dump(Logger* log) const { + DBOptions::Dump(log); + ColumnFamilyOptions::Dump(log); +} // Options::Dump + +// +// The goal of this method is to create a configuration that +// allows an application to write all files into L0 and +// then do a single compaction to output all files into L1. +Options* +Options::PrepareForBulkLoad() +{ + // never slowdown ingest. + level0_file_num_compaction_trigger = (1<<30); + level0_slowdown_writes_trigger = (1<<30); + level0_stop_writes_trigger = (1<<30); + + // no auto compactions please. The application should issue a + // manual compaction after all data is loaded into L0. + disable_auto_compactions = true; + disable_seek_compaction = true; + disableDataSync = true; + + // A manual compaction run should pick all files in L0 in + // a single compaction run. + source_compaction_factor = (1<<30); + + // It is better to have only 2 levels, otherwise a manual + // compaction would compact at every possible level, thereby + // increasing the total time needed for compactions. + num_levels = 2; + + // Prevent a memtable flush to automatically promote files + // to L1. This is helpful so that all files that are + // input to the manual compaction are all at L0. + max_background_compactions = 2; + + // The compaction would create large files in L1. 
+ target_file_size_base = 256 * 1024 * 1024; + return this; +} + +// Optimization functions +ColumnFamilyOptions* ColumnFamilyOptions::OptimizeForPointLookup() { + prefix_extractor.reset(NewNoopTransform()); + BlockBasedTableOptions block_based_options; + block_based_options.index_type = BlockBasedTableOptions::kBinarySearch; + table_factory.reset(new BlockBasedTableFactory(block_based_options)); +#ifndef ROCKSDB_LITE + memtable_factory.reset(NewHashLinkListRepFactory()); +#endif + return this; +} + +ColumnFamilyOptions* ColumnFamilyOptions::OptimizeLevelStyleCompaction( + uint64_t memtable_memory_budget) { + write_buffer_size = memtable_memory_budget / 4; + // merge two memtables when flushing to L0 + min_write_buffer_number_to_merge = 2; + // this means we'll use 50% extra memory in the worst case, but will reduce + // write stalls. + max_write_buffer_number = 6; + // start flushing L0->L1 as soon as possible. each file on level0 is + // (memtable_memory_budget / 2). This will flush level 0 when it's bigger than + // memtable_memory_budget. + level0_file_num_compaction_trigger = 2; + // doesn't really matter much, but we don't want to create too many files + target_file_size_base = memtable_memory_budget / 8; + // make Level1 size equal to Level0 size, so that L0->L1 compactions are fast + max_bytes_for_level_base = memtable_memory_budget; + + // level style compaction + compaction_style = kCompactionStyleLevel; + + // only compress levels >= 2 + compression_per_level.resize(num_levels); + for (int i = 0; i < num_levels; ++i) { + if (i < 2) { + compression_per_level[i] = kNoCompression; + } else { + compression_per_level[i] = kSnappyCompression; + } + } + return this; +} + +ColumnFamilyOptions* ColumnFamilyOptions::OptimizeUniversalStyleCompaction( + uint64_t memtable_memory_budget) { + write_buffer_size = memtable_memory_budget / 4; + // merge two memtables when flushing to L0 + min_write_buffer_number_to_merge = 2; + // this means we'll use 50% extra memory in the worst case, but will reduce + // write stalls. + max_write_buffer_number = 6; + // universal style compaction + compaction_style = kCompactionStyleUniversal; + compaction_options_universal.compression_size_percent = 80; + return this; +} + +DBOptions* DBOptions::IncreaseParallelism(int total_threads) { + max_background_compactions = total_threads - 1; + max_background_flushes = 1; + env->SetBackgroundThreads(total_threads, Env::LOW); + env->SetBackgroundThreads(1, Env::HIGH); + return this; +} + +} // namespace rocksdb diff --git a/util/perf_context.cc b/util/perf_context.cc new file mode 100644 index 0000000000..264b10d738 --- /dev/null +++ b/util/perf_context.cc @@ -0,0 +1,86 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+//
+
+#include <sstream>
+#include "util/perf_context_imp.h"
+
+namespace rocksdb {
+
+#if defined(NPERF_CONTEXT) || defined(IOS_CROSS_COMPILE)
+PerfLevel perf_level = kEnableCount;
+// This is a dummy variable since some place references it
+PerfContext perf_context;
+#else
+__thread PerfLevel perf_level = kEnableCount;
+__thread PerfContext perf_context;
+#endif
+
+void SetPerfLevel(PerfLevel level) {
+  perf_level = level;
+}
+
+void PerfContext::Reset() {
+#if !defined(NPERF_CONTEXT) && !defined(IOS_CROSS_COMPILE)
+  user_key_comparison_count = 0;
+  block_cache_hit_count = 0;
+  block_read_count = 0;
+  block_read_byte = 0;
+  block_read_time = 0;
+  block_checksum_time = 0;
+  block_decompress_time = 0;
+  internal_key_skipped_count = 0;
+  internal_delete_skipped_count = 0;
+  write_wal_time = 0;
+
+  get_snapshot_time = 0;
+  get_from_memtable_time = 0;
+  get_from_memtable_count = 0;
+  get_post_process_time = 0;
+  get_from_output_files_time = 0;
+  seek_child_seek_time = 0;
+  seek_child_seek_count = 0;
+  seek_min_heap_time = 0;
+  seek_internal_seek_time = 0;
+  find_next_user_entry_time = 0;
+  write_pre_and_post_process_time = 0;
+  write_memtable_time = 0;
+#endif
+}
+
+#define OUTPUT(counter) #counter << " = " << counter << ", "
+
+std::string PerfContext::ToString() const {
+#if defined(NPERF_CONTEXT) || defined(IOS_CROSS_COMPILE)
+  return "";
+#else
+  std::ostringstream ss;
+  ss << OUTPUT(user_key_comparison_count)
+     << OUTPUT(block_cache_hit_count)
+     << OUTPUT(block_read_count)
+     << OUTPUT(block_read_byte)
+     << OUTPUT(block_read_time)
+     << OUTPUT(block_checksum_time)
+     << OUTPUT(block_decompress_time)
+     << OUTPUT(internal_key_skipped_count)
+     << OUTPUT(internal_delete_skipped_count)
+     << OUTPUT(write_wal_time)
+     << OUTPUT(get_snapshot_time)
+     << OUTPUT(get_from_memtable_time)
+     << OUTPUT(get_from_memtable_count)
+     << OUTPUT(get_post_process_time)
+     << OUTPUT(get_from_output_files_time)
+     << OUTPUT(seek_child_seek_time)
+     << OUTPUT(seek_child_seek_count)
+     << OUTPUT(seek_min_heap_time)
+     << OUTPUT(seek_internal_seek_time)
+     << OUTPUT(find_next_user_entry_time)
+     << OUTPUT(write_pre_and_post_process_time)
+     << OUTPUT(write_memtable_time);
+  return ss.str();
+#endif
+}
+
+}
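An editorial sketch, not part of the patch: gathering per-operation counters around a read. SetPerfLevel, kEnableTime, and the thread-local perf_context are assumed from this tree's include/rocksdb/perf_context.h; the key is illustrative and the Get status is deliberately ignored.

#include <string>
#include "rocksdb/db.h"
#include "rocksdb/perf_context.h"

void ProfileGet(rocksdb::DB* db) {
  rocksdb::SetPerfLevel(rocksdb::kEnableTime);  // enable counts and timings
  rocksdb::perf_context.Reset();                // zero this thread's counters
  std::string value;
  db->Get(rocksdb::ReadOptions(), "some-key", &value);
  std::string report = rocksdb::perf_context.ToString();
  // report now lists user_key_comparison_count, get_from_memtable_time, ...
}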
diff --git a/util/perf_context_imp.h b/util/perf_context_imp.h
new file mode 100644
index 0000000000..dc4ae95e5c
--- /dev/null
+++ b/util/perf_context_imp.h
@@ -0,0 +1,88 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+#pragma once
+#include "rocksdb/perf_context.h"
+#include "util/stop_watch.h"
+
+namespace rocksdb {
+
+#if defined(NPERF_CONTEXT) || defined(IOS_CROSS_COMPILE)
+
+#define PERF_TIMER_DECLARE()
+#define PERF_TIMER_START(metric)
+#define PERF_TIMER_AUTO(metric)
+#define PERF_TIMER_MEASURE(metric)
+#define PERF_TIMER_STOP(metric)
+#define PERF_COUNTER_ADD(metric, value)
+
+#else
+
+extern __thread PerfLevel perf_level;
+
+class PerfStepTimer {
+ public:
+  PerfStepTimer()
+    : enabled_(perf_level >= PerfLevel::kEnableTime),
+      env_(enabled_ ? Env::Default() : nullptr),
+      start_(0) {
+  }
+
+  void Start() {
+    if (enabled_) {
+      start_ = env_->NowNanos();
+    }
+  }
+
+  void Measure(uint64_t* metric) {
+    if (start_) {
+      uint64_t now = env_->NowNanos();
+      *metric += now - start_;
+      start_ = now;
+    }
+  }
+
+  void Stop(uint64_t* metric) {
+    if (start_) {
+      *metric += env_->NowNanos() - start_;
+      start_ = 0;
+    }
+  }
+
+ private:
+  const bool enabled_;
+  Env* const env_;
+  uint64_t start_;
+};
+
+// Declare the local timer object to be used later on
+#define PERF_TIMER_DECLARE()     \
+  PerfStepTimer perf_step_timer;
+
+// Set start time of the timer
+#define PERF_TIMER_START(metric) \
+  perf_step_timer.Start();
+
+// Declare and set start time of the timer
+#define PERF_TIMER_AUTO(metric)  \
+  PerfStepTimer perf_step_timer; \
+  perf_step_timer.Start();
+
+// Update metric with time elapsed since last START. start time is reset
+// to current timestamp.
+#define PERF_TIMER_MEASURE(metric) \
+  perf_step_timer.Measure(&(perf_context.metric));
+
+// Update metric with time elapsed since last START. But start time is not set.
+#define PERF_TIMER_STOP(metric) \
+  perf_step_timer.Stop(&(perf_context.metric));
+
+// Increase metric value
+#define PERF_COUNTER_ADD(metric, value) \
+  perf_context.metric += value;
+
+#endif
+
+}
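An editorial sketch of the intended macro usage inside the engine, not part of the patch: time a scope and charge the elapsed nanoseconds to one of the PerfContext fields (write_wal_time is one of the fields Reset() zeroes above; the function itself is hypothetical).

#include "util/perf_context_imp.h"

void TimedSection() {
  PERF_TIMER_AUTO(write_wal_time);  // declares perf_step_timer and starts it
  // ... the work being measured would go here ...
  PERF_TIMER_STOP(write_wal_time);  // adds elapsed nanos to perf_context
}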
diff --git a/util/posix_logger.h b/util/posix_logger.h
new file mode 100644
index 0000000000..6aba769f1a
--- /dev/null
+++ b/util/posix_logger.h
@@ -0,0 +1,161 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Logger implementation that can be shared by all environments
+// where enough posix functionality is available.
+
+#pragma once
+#include <algorithm>
+#include <stdio.h>
+#include <sys/time.h>
+#include <time.h>
+#include <fcntl.h>
+#ifdef OS_LINUX
+#include <linux/falloc.h>
+#endif
+#include "rocksdb/env.h"
+#include <atomic>
+
+namespace rocksdb {
+
+const int kDebugLogChunkSize = 128 * 1024;
+
+class PosixLogger : public Logger {
+ private:
+  FILE* file_;
+  uint64_t (*gettid_)();  // Return the thread id for the current thread
+  std::atomic_size_t log_size_;
+  int fd_;
+  const static uint64_t flush_every_seconds_ = 5;
+  std::atomic_uint_fast64_t last_flush_micros_;
+  Env* env_;
+  bool flush_pending_;
+ public:
+  PosixLogger(FILE* f, uint64_t (*gettid)(), Env* env,
+              const InfoLogLevel log_level = InfoLogLevel::ERROR_LEVEL)
+      : Logger(log_level),
+        file_(f),
+        gettid_(gettid),
+        log_size_(0),
+        fd_(fileno(f)),
+        last_flush_micros_(0),
+        env_(env),
+        flush_pending_(false) {}
+  virtual ~PosixLogger() {
+    fclose(file_);
+  }
+  virtual void Flush() {
+    if (flush_pending_) {
+      flush_pending_ = false;
+      fflush(file_);
+    }
+    last_flush_micros_ = env_->NowMicros();
+  }
+  virtual void Logv(const char* format, va_list ap) {
+    const uint64_t thread_id = (*gettid_)();
+
+    // We try twice: the first time with a fixed-size stack allocated buffer,
+    // and the second time with a much larger dynamically allocated buffer.
+    char buffer[500];
+    for (int iter = 0; iter < 2; iter++) {
+      char* base;
+      int bufsize;
+      if (iter == 0) {
+        bufsize = sizeof(buffer);
+        base = buffer;
+      } else {
+        bufsize = 30000;
+        base = new char[bufsize];
+      }
+      char* p = base;
+      char* limit = base + bufsize;
+
+      struct timeval now_tv;
+      gettimeofday(&now_tv, nullptr);
+      const time_t seconds = now_tv.tv_sec;
+      struct tm t;
+      localtime_r(&seconds, &t);
+      p += snprintf(p, limit - p,
+                    "%04d/%02d/%02d-%02d:%02d:%02d.%06d %llx ",
+                    t.tm_year + 1900,
+                    t.tm_mon + 1,
+                    t.tm_mday,
+                    t.tm_hour,
+                    t.tm_min,
+                    t.tm_sec,
+                    static_cast<int>(now_tv.tv_usec),
+                    static_cast<long long unsigned int>(thread_id));
+
+      // Print the message
+      if (p < limit) {
+        va_list backup_ap;
+        va_copy(backup_ap, ap);
+        p += vsnprintf(p, limit - p, format, backup_ap);
+        va_end(backup_ap);
+      }
+
+      // Truncate to available space if necessary
+      if (p >= limit) {
+        if (iter == 0) {
+          continue;       // Try again with larger buffer
+        } else {
+          p = limit - 1;
+        }
+      }
+
+      // Add newline if necessary
+      if (p == base || p[-1] != '\n') {
+        *p++ = '\n';
+      }
+
+      assert(p <= limit);
+      const size_t write_size = p - base;
+
+#ifdef ROCKSDB_FALLOCATE_PRESENT
+      // If this write would cross a boundary of kDebugLogChunkSize
+      // space, pre-allocate more space to avoid overly large
+      // allocations from filesystem allocsize options.
+      const size_t log_size = log_size_;
+      const int last_allocation_chunk =
+        ((kDebugLogChunkSize - 1 + log_size) / kDebugLogChunkSize);
+      const int desired_allocation_chunk =
+        ((kDebugLogChunkSize - 1 + log_size + write_size) /
+           kDebugLogChunkSize);
+      if (last_allocation_chunk != desired_allocation_chunk) {
+        fallocate(fd_, FALLOC_FL_KEEP_SIZE, 0,
+                  desired_allocation_chunk * kDebugLogChunkSize);
+      }
+#endif
+
+      size_t sz = fwrite(base, 1, write_size, file_);
+      flush_pending_ = true;
+      assert(sz == write_size);
+      if (sz > 0) {
+        log_size_ += write_size;
+      }
+      uint64_t now_micros = static_cast<uint64_t>(now_tv.tv_sec) * 1000000 +
+        now_tv.tv_usec;
+      if (now_micros - last_flush_micros_ >= flush_every_seconds_ * 1000000) {
+        flush_pending_ = false;
+        fflush(file_);
+        last_flush_micros_ = now_micros;
+      }
+      if (base != buffer) {
+        delete[] base;
+      }
+      break;
+    }
+  }
+  size_t GetLogFileSize() const {
+    return log_size_;
+  }
+};
+
+}  // namespace rocksdb
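An editorial sketch, not part of the patch: applications normally obtain a logger through the Env rather than constructing PosixLogger directly; on POSIX builds Env::Default()->NewLogger is assumed to return a PosixLogger like the one above. The path is illustrative.

#include <memory>
#include "rocksdb/env.h"

void WriteInfoLog() {
  std::shared_ptr<rocksdb::Logger> logger;
  rocksdb::Status s =
      rocksdb::Env::Default()->NewLogger("/tmp/rocksdb_info.log", &logger);
  if (s.ok()) {
    // Log() formats like printf and routes through Logger::Logv.
    rocksdb::Log(logger.get(), "hello from %s", "the example");
  }
}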
diff --git a/util/random.h b/util/random.h
new file mode 100644
index 0000000000..e5b331500c
--- /dev/null
+++ b/util/random.h
@@ -0,0 +1,90 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <random>
+#include <stdint.h>
+
+namespace rocksdb {
+
+// A very simple random number generator.  Not especially good at
+// generating truly random bits, but good enough for our needs in this
+// package.
+class Random {
+ private:
+  uint32_t seed_;
+ public:
+  explicit Random(uint32_t s) : seed_(s & 0x7fffffffu) { }
+  uint32_t Next() {
+    static const uint32_t M = 2147483647L;   // 2^31-1
+    static const uint64_t A = 16807;  // bits 14, 8, 7, 5, 2, 1, 0
+    // We are computing
+    //       seed_ = (seed_ * A) % M,    where M = 2^31-1
+    //
+    // seed_ must not be zero or M, or else all subsequent computed values
+    // will be zero or M respectively.  For all other values, seed_ will end
+    // up cycling through every number in [1,M-1]
+    uint64_t product = seed_ * A;
+
+    // Compute (product % M) using the fact that ((x << 31) % M) == x.
+    seed_ = static_cast<uint32_t>((product >> 31) + (product & M));
+    // The first reduction may overflow by 1 bit, so we may need to
+    // repeat.  mod == M is not possible; using > allows the faster
+    // sign-bit-based test.
+    if (seed_ > M) {
+      seed_ -= M;
+    }
+    return seed_;
+  }
+  // Returns a uniformly distributed value in the range [0..n-1]
+  // REQUIRES: n > 0
+  uint32_t Uniform(int n) { return Next() % n; }
+
+  // Randomly returns true ~"1/n" of the time, and false otherwise.
+  // REQUIRES: n > 0
+  bool OneIn(int n) { return (Next() % n) == 0; }
+
+  // Skewed: pick "base" uniformly from range [0,max_log] and then
+  // return "base" random bits.  The effect is to pick a number in the
+  // range [0,2^max_log-1] with exponential bias towards smaller numbers.
+  uint32_t Skewed(int max_log) {
+    return Uniform(1 << Uniform(max_log + 1));
+  }
+};
+
+// A simple 64bit random number generator based on std::mt19937_64
+class Random64 {
+ private:
+  std::mt19937_64 generator_;
+
+ public:
+  explicit Random64(uint64_t s) : generator_(s) { }
+
+  // Generates the next random number
+  uint64_t Next() { return generator_(); }
+
+  // Returns a uniformly distributed value in the range [0..n-1]
+  // REQUIRES: n > 0
+  uint64_t Uniform(uint64_t n) {
+    return std::uniform_int_distribution<uint64_t>(0, n - 1)(generator_);
+  }
+
+  // Randomly returns true ~"1/n" of the time, and false otherwise.
+  // REQUIRES: n > 0
+  bool OneIn(uint64_t n) { return Uniform(n) == 0; }
+
+  // Skewed: pick "base" uniformly from range [0,max_log] and then
+  // return "base" random bits.  The effect is to pick a number in the
+  // range [0,2^max_log-1] with exponential bias towards smaller numbers.
+  uint64_t Skewed(int max_log) {
+    return Uniform(1 << Uniform(max_log + 1));
+  }
+};
+
+}  // namespace rocksdb
diff --git a/util/signal_test.cc b/util/signal_test.cc
new file mode 100644
index 0000000000..f51fa548ef
--- /dev/null
+++ b/util/signal_test.cc
@@ -0,0 +1,34 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+#include "port/stack_trace.h"
+#include <assert.h>
+
+namespace {
+void f0() {
+  char *p = nullptr;
+  *p = 10;  /* SIGSEGV here!! */
+}
+
+void f1() {
+  f0();
+}
+
+void f2() {
+  f1();
+}
+
+void f3() {
+  f2();
+}
+}  // namespace
+
+int main() {
+  rocksdb::port::InstallStackTraceHandler();
+
+  f3();
+
+  return 0;
+}
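An editorial sketch, not part of the patch: the tests seed Random deterministically (RandomSeed() in util/testharness.cc defaults to 301), which keeps failures reproducible.

#include "util/random.h"

void DrawSomeNumbers() {
  rocksdb::Random rnd(301);
  uint32_t die = rnd.Uniform(6);     // uniform in [0..5]
  bool rare = rnd.OneIn(1000);       // true roughly 0.1% of the time
  uint32_t small = rnd.Skewed(10);   // < 2^10, biased toward small values
  (void)die; (void)rare; (void)small;
}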
diff --git a/util/skiplistrep.cc b/util/skiplistrep.cc
new file mode 100644
index 0000000000..895343001b
--- /dev/null
+++ b/util/skiplistrep.cc
@@ -0,0 +1,129 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+#include "rocksdb/memtablerep.h"
+#include "db/memtable.h"
+#include "db/skiplist.h"
+#include "util/arena.h"
+
+namespace rocksdb {
+namespace {
+class SkipListRep : public MemTableRep {
+  SkipList<const char*, const MemTableRep::KeyComparator&> skip_list_;
+public:
+  explicit SkipListRep(const MemTableRep::KeyComparator& compare, Arena* arena)
+    : MemTableRep(arena), skip_list_(compare, arena) {
+  }
+
+  // Insert key into the list.
+  // REQUIRES: nothing that compares equal to key is currently in the list.
+  virtual void Insert(KeyHandle handle) override {
+    skip_list_.Insert(static_cast<char*>(handle));
+  }
+
+  // Returns true iff an entry that compares equal to key is in the list.
+  virtual bool Contains(const char* key) const override {
+    return skip_list_.Contains(key);
+  }
+
+  virtual size_t ApproximateMemoryUsage() override {
+    // All memory is allocated through arena; nothing to report here
+    return 0;
+  }
+
+  virtual void Get(const LookupKey& k, void* callback_args,
+                   bool (*callback_func)(void* arg,
+                                         const char* entry)) override {
+    SkipListRep::Iterator iter(&skip_list_);
+    Slice dummy_slice;
+    for (iter.Seek(dummy_slice, k.memtable_key().data());
+         iter.Valid() && callback_func(callback_args, iter.key());
+         iter.Next()) {
+    }
+  }
+
+  virtual ~SkipListRep() override { }
+
+  // Iteration over the contents of a skip list
+  class Iterator : public MemTableRep::Iterator {
+    SkipList<const char*, const MemTableRep::KeyComparator&>::Iterator iter_;
+   public:
+    // Initialize an iterator over the specified list.
+    // The returned iterator is not valid.
+    explicit Iterator(
+      const SkipList<const char*, const MemTableRep::KeyComparator&>* list
+    ) : iter_(list) { }
+
+    virtual ~Iterator() override { }
+
+    // Returns true iff the iterator is positioned at a valid node.
+    virtual bool Valid() const override {
+      return iter_.Valid();
+    }
+
+    // Returns the key at the current position.
+    // REQUIRES: Valid()
+    virtual const char* key() const override {
+      return iter_.key();
+    }
+
+    // Advances to the next position.
+    // REQUIRES: Valid()
+    virtual void Next() override {
+      iter_.Next();
+    }
+
+    // Advances to the previous position.
+    // REQUIRES: Valid()
+    virtual void Prev() override {
+      iter_.Prev();
+    }
+
+    // Advance to the first entry with a key >= target
+    virtual void Seek(const Slice& user_key, const char* memtable_key)
+        override {
+      if (memtable_key != nullptr) {
+        iter_.Seek(memtable_key);
+      } else {
+        iter_.Seek(EncodeKey(&tmp_, user_key));
+      }
+    }
+
+    // Position at the first entry in list.
+    // Final state of iterator is Valid() iff list is not empty.
+    virtual void SeekToFirst() override {
+      iter_.SeekToFirst();
+    }
+
+    // Position at the last entry in list.
+    // Final state of iterator is Valid() iff list is not empty.
+ virtual void SeekToLast() override { + iter_.SeekToLast(); + } + protected: + std::string tmp_; // For passing to EncodeKey + }; + + // Unhide default implementations of GetIterator + using MemTableRep::GetIterator; + + virtual MemTableRep::Iterator* GetIterator(Arena* arena = nullptr) override { + if (arena == nullptr) { + return new SkipListRep::Iterator(&skip_list_); + } else { + auto mem = arena->AllocateAligned(sizeof(SkipListRep::Iterator)); + return new (mem) SkipListRep::Iterator(&skip_list_); + } + } +}; +} + +MemTableRep* SkipListFactory::CreateMemTableRep( + const MemTableRep::KeyComparator& compare, Arena* arena, + const SliceTransform*, Logger* logger) { + return new SkipListRep(compare, arena); +} + +} // namespace rocksdb diff --git a/util/slice.cc b/util/slice.cc new file mode 100644 index 0000000000..5a1f4f10e9 --- /dev/null +++ b/util/slice.cc @@ -0,0 +1,74 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "rocksdb/slice_transform.h" +#include "rocksdb/slice.h" + +namespace rocksdb { + +namespace { + +class FixedPrefixTransform : public SliceTransform { + private: + size_t prefix_len_; + std::string name_; + + public: + explicit FixedPrefixTransform(size_t prefix_len) + : prefix_len_(prefix_len), + name_("rocksdb.FixedPrefix." + std::to_string(prefix_len_)) {} + + virtual const char* Name() const { return name_.c_str(); } + + virtual Slice Transform(const Slice& src) const { + assert(InDomain(src)); + return Slice(src.data(), prefix_len_); + } + + virtual bool InDomain(const Slice& src) const { + return (src.size() >= prefix_len_); + } + + virtual bool InRange(const Slice& dst) const { + return (dst.size() == prefix_len_); + } +}; + +class NoopTransform : public SliceTransform { + public: + explicit NoopTransform() { } + + virtual const char* Name() const { + return "rocksdb.Noop"; + } + + virtual Slice Transform(const Slice& src) const { + return src; + } + + virtual bool InDomain(const Slice& src) const { + return true; + } + + virtual bool InRange(const Slice& dst) const { + return true; + } +}; + +} + +const SliceTransform* NewFixedPrefixTransform(size_t prefix_len) { + return new FixedPrefixTransform(prefix_len); +} + +const SliceTransform* NewNoopTransform() { + return new NoopTransform; +} + +} // namespace rocksdb diff --git a/util/statistics.cc b/util/statistics.cc new file mode 100644 index 0000000000..4fc2400185 --- /dev/null +++ b/util/statistics.cc @@ -0,0 +1,94 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+//
+#include "util/statistics.h"
+#include "rocksdb/statistics.h"
+#include <cassert>
+#include <cstdio>
+
+namespace rocksdb {
+
+std::shared_ptr<Statistics> CreateDBStatistics() {
+  return std::make_shared<StatisticsImpl>();
+}
+
+StatisticsImpl::StatisticsImpl() {}
+
+StatisticsImpl::~StatisticsImpl() {}
+
+long StatisticsImpl::getTickerCount(Tickers tickerType) {
+  assert(tickerType < TICKER_ENUM_MAX);
+  return tickers_[tickerType].value;
+}
+
+void StatisticsImpl::setTickerCount(Tickers tickerType, uint64_t count) {
+  assert(tickerType < TICKER_ENUM_MAX);
+  tickers_[tickerType].value = count;
+}
+
+void StatisticsImpl::recordTick(Tickers tickerType, uint64_t count) {
+  assert(tickerType < TICKER_ENUM_MAX);
+  tickers_[tickerType].value += count;
+}
+
+void StatisticsImpl::measureTime(Histograms histogramType, uint64_t value) {
+  assert(histogramType < HISTOGRAM_ENUM_MAX);
+  histograms_[histogramType].Add(value);
+}
+
+void StatisticsImpl::histogramData(Histograms histogramType,
+                                   HistogramData* const data) {
+  assert(histogramType < HISTOGRAM_ENUM_MAX);
+  histograms_[histogramType].Data(data);
+}
+
+namespace {
+
+// a buffer size used for temp string buffers
+const int kBufferSize = 200;
+
+std::string HistogramToString (
+    Statistics* dbstats,
+    const Histograms& histogram_type,
+    const std::string& name) {
+
+  char buffer[kBufferSize];
+  HistogramData histogramData;
+  dbstats->histogramData(histogram_type, &histogramData);
+  snprintf(
+      buffer,
+      kBufferSize,
+      "%s statistics Percentiles :=> 50 : %f 95 : %f 99 : %f\n",
+      name.c_str(),
+      histogramData.median,
+      histogramData.percentile95,
+      histogramData.percentile99
+  );
+  return std::string(buffer);
+};
+
+std::string TickerToString(Statistics* dbstats, const Tickers& ticker,
+                           const std::string& name) {
+  char buffer[kBufferSize];
+  snprintf(buffer, kBufferSize, "%s COUNT : %ld\n",
+           name.c_str(), dbstats->getTickerCount(ticker));
+  return std::string(buffer);
+};
+}  // namespace
+
+std::string Statistics::ToString() {
+  std::string res;
+  res.reserve(20000);
+  for (const auto& t : TickersNameMap) {
+    res.append(TickerToString(this, t.first, t.second));
+  }
+  for (const auto& h : HistogramsNameMap) {
+    res.append(HistogramToString(this, h.first, h.second));
+  }
+  res.shrink_to_fit();
+  return res;
+}
+
+}  // namespace rocksdb
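An editorial sketch, not part of the patch: a statistics object is attached to a DB through Options and read back by the application. BLOCK_CACHE_MISS is assumed to be one of the Tickers enumerated in include/rocksdb/statistics.h.

#include <string>
#include "rocksdb/options.h"
#include "rocksdb/statistics.h"

void InspectStats(rocksdb::Options* options) {
  options->statistics = rocksdb::CreateDBStatistics();
  // ... open a DB with *options and run a workload ...
  long misses = options->statistics->getTickerCount(rocksdb::BLOCK_CACHE_MISS);
  std::string all = options->statistics->ToString();  // every ticker/histogram
  (void)misses; (void)all;
}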
diff --git a/util/statistics.h b/util/statistics.h
new file mode 100644
index 0000000000..d57a1dd4b3
--- /dev/null
+++ b/util/statistics.h
@@ -0,0 +1,66 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+#pragma once
+#include "rocksdb/statistics.h"
+#include "util/histogram.h"
+#include "util/mutexlock.h"
+#include "port/likely.h"
+
+#include <vector>
+#include <atomic>
+
+
+namespace rocksdb {
+
+class StatisticsImpl : public Statistics {
+ public:
+  StatisticsImpl();
+  virtual ~StatisticsImpl();
+
+  virtual long getTickerCount(Tickers tickerType);
+  virtual void setTickerCount(Tickers tickerType, uint64_t count);
+  virtual void recordTick(Tickers tickerType, uint64_t count);
+  virtual void measureTime(Histograms histogramType, uint64_t value);
+  virtual void histogramData(Histograms histogramType,
+                             HistogramData* const data);
+
+ private:
+  struct Ticker {
+    Ticker() : value(uint_fast64_t()) {}
+
+    std::atomic_uint_fast64_t value;
+    // Pad the structure to make it size of 64 bytes. A plain array of
+    // std::atomic_uint_fast64_t results in huge performance degradation
+    // due to false sharing.
+    char padding[64 - sizeof(std::atomic_uint_fast64_t)];
+  };
+
+  Ticker tickers_[TICKER_ENUM_MAX] __attribute__((aligned(64)));
+  HistogramImpl histograms_[HISTOGRAM_ENUM_MAX] __attribute__((aligned(64)));
+};
+
+// Utility functions
+inline void MeasureTime(Statistics* statistics, Histograms histogramType,
+                        uint64_t value) {
+  if (statistics) {
+    statistics->measureTime(histogramType, value);
+  }
+}
+
+inline void RecordTick(Statistics* statistics, Tickers ticker,
+                       uint64_t count = 1) {
+  if (statistics) {
+    statistics->recordTick(ticker, count);
+  }
+}
+
+inline void SetTickerCount(Statistics* statistics, Tickers ticker,
+                           uint64_t count) {
+  if (statistics) {
+    statistics->setTickerCount(ticker, count);
+  }
+}
+}
diff --git a/util/stats_logger.h b/util/stats_logger.h
new file mode 100644
index 0000000000..f0b45404e1
--- /dev/null
+++ b/util/stats_logger.h
@@ -0,0 +1,26 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+#pragma once
+
+namespace rocksdb {
+
+class StatsLogger {
+
+ public:
+
+  virtual void Log_Deploy_Stats(const std::string& db_version,
+                                const std::string& machine_info,
+                                const std::string& data_dir,
+                                const uint64_t data_size,
+                                const uint32_t file_number,
+                                const std::string& data_size_per_level,
+                                const std::string& file_number_per_level,
+                                const int64_t& ts_unix) = 0;
+  virtual ~StatsLogger() {}
+
+};
+
+}
diff --git a/util/status.cc b/util/status.cc
new file mode 100644
index 0000000000..2a5f05a4b8
--- /dev/null
+++ b/util/status.cc
@@ -0,0 +1,86 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <stdio.h>
+#include "port/port.h"
+#include "rocksdb/status.h"
+
+namespace rocksdb {
+
+const char* Status::CopyState(const char* state) {
+  uint32_t size;
+  memcpy(&size, state, sizeof(size));
+  char* result = new char[size + 4];
+  memcpy(result, state, size + 4);
+  return result;
+}
+
+Status::Status(Code code, const Slice& msg, const Slice& msg2) :
+    code_(code) {
+  assert(code != kOk);
+  const uint32_t len1 = msg.size();
+  const uint32_t len2 = msg2.size();
+  const uint32_t size = len1 + (len2 ?
(2 + len2) : 0); + char* result = new char[size + 4]; + memcpy(result, &size, sizeof(size)); + memcpy(result + 4, msg.data(), len1); + if (len2) { + result[4 + len1] = ':'; + result[5 + len1] = ' '; + memcpy(result + 6 + len1, msg2.data(), len2); + } + state_ = result; +} + +std::string Status::ToString() const { + char tmp[30]; + const char* type; + switch (code_) { + case kOk: + return "OK"; + case kNotFound: + type = "NotFound: "; + break; + case kCorruption: + type = "Corruption: "; + break; + case kNotSupported: + type = "Not implemented: "; + break; + case kInvalidArgument: + type = "Invalid argument: "; + break; + case kIOError: + type = "IO error: "; + break; + case kMergeInProgress: + type = "Merge in progress: "; + break; + case kIncomplete: + type = "Result incomplete: "; + break; + case kShutdownInProgress: + type = "Shutdown in progress: "; + break; + default: + snprintf(tmp, sizeof(tmp), "Unknown code(%d): ", + static_cast(code())); + type = tmp; + break; + } + std::string result(type); + if (state_ != nullptr) { + uint32_t length; + memcpy(&length, state_, sizeof(length)); + result.append(state_ + 4, length); + } + return result; +} + +} // namespace rocksdb diff --git a/util/stl_wrappers.h b/util/stl_wrappers.h new file mode 100644 index 0000000000..b4c14b4ba3 --- /dev/null +++ b/util/stl_wrappers.h @@ -0,0 +1,32 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +#pragma once + +#include "util/murmurhash.h" +#include "util/coding.h" + +#include "rocksdb/memtablerep.h" +#include "rocksdb/slice.h" + +namespace rocksdb { +namespace stl_wrappers { + class Base { + protected: + const MemTableRep::KeyComparator& compare_; + explicit Base(const MemTableRep::KeyComparator& compare) + : compare_(compare) { } + }; + + struct Compare : private Base { + explicit Compare(const MemTableRep::KeyComparator& compare) + : Base(compare) { } + inline bool operator()(const char* a, const char* b) const { + return compare_(a, b) < 0; + } + }; + +} +} diff --git a/util/stop_watch.h b/util/stop_watch.h new file mode 100644 index 0000000000..48e1b01c25 --- /dev/null +++ b/util/stop_watch.h @@ -0,0 +1,67 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +#pragma once +#include "rocksdb/env.h" +#include "util/statistics.h" + +namespace rocksdb { +// Auto-scoped. +// Records the statistic into the corresponding histogram. +class StopWatch { + public: + explicit StopWatch( + Env * const env, + Statistics* statistics = nullptr, + const Histograms histogram_name = DB_GET, + bool auto_start = true) : + env_(env), + start_time_((!auto_start && !statistics) ? 
0 : env->NowMicros()),
+      statistics_(statistics),
+      histogram_name_(histogram_name) {}
+
+
+
+  uint64_t ElapsedMicros() {
+    return env_->NowMicros() - start_time_;
+  }
+
+  ~StopWatch() { MeasureTime(statistics_, histogram_name_, ElapsedMicros()); }
+
+ private:
+  Env* const env_;
+  const uint64_t start_time_;
+  Statistics* statistics_;
+  const Histograms histogram_name_;
+
+};
+
+// a nano second precision stopwatch
+class StopWatchNano {
+ public:
+  explicit StopWatchNano(Env* const env, bool auto_start = false)
+      : env_(env), start_(0) {
+    if (auto_start) {
+      Start();
+    }
+  }
+
+  void Start() { start_ = env_->NowNanos(); }
+
+  uint64_t ElapsedNanos(bool reset = false) {
+    auto now = env_->NowNanos();
+    auto elapsed = now - start_;
+    if (reset) {
+      start_ = now;
+    }
+    return elapsed;
+  }
+
+ private:
+  Env* const env_;
+  uint64_t start_;
+};
+
+}  // namespace rocksdb
diff --git a/util/string_util.cc b/util/string_util.cc
new file mode 100644
index 0000000000..97b7f9de96
--- /dev/null
+++ b/util/string_util.cc
@@ -0,0 +1,23 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+#include <sstream>
+#include <string>
+#include <vector>
+#include "util/string_util.h"
+
+namespace rocksdb {
+
+std::vector<std::string> stringSplit(std::string arg, char delim) {
+  std::vector<std::string> splits;
+  std::stringstream ss(arg);
+  std::string item;
+  while (std::getline(ss, item, delim)) {
+    splits.push_back(item);
+  }
+  return splits;
+}
+
+}  // namespace rocksdb
diff --git a/util/string_util.h b/util/string_util.h
new file mode 100644
index 0000000000..676f4aae81
--- /dev/null
+++ b/util/string_util.h
@@ -0,0 +1,15 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+#include <sstream>
+#include <string>
+#include <vector>
+
+#pragma once
+namespace rocksdb {
+
+extern std::vector<std::string> stringSplit(std::string arg, char delim);
+
+}  // namespace rocksdb
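An editorial sketch, not part of the patch: splitting a delimited option string with the helper above.

#include <string>
#include <vector>
#include "util/string_util.h"

void SplitExample() {
  std::vector<std::string> parts = rocksdb::stringSplit("a,b,c", ',');
  // parts now holds {"a", "b", "c"}
}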
diff --git a/util/sync_point.cc b/util/sync_point.cc
new file mode 100644
index 0000000000..4e4c46a1fa
--- /dev/null
+++ b/util/sync_point.cc
@@ -0,0 +1,64 @@
+// Copyright (c) 2014, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+#include "util/sync_point.h"
+
+#ifndef NDEBUG
+namespace rocksdb {
+
+SyncPoint* SyncPoint::GetInstance() {
+  static SyncPoint sync_point;
+  return &sync_point;
+}
+
+void SyncPoint::LoadDependency(const std::vector<Dependency>& dependencies) {
+  successors_.clear();
+  predecessors_.clear();
+  cleared_points_.clear();
+  for (const auto& dependency : dependencies) {
+    successors_[dependency.predecessor].push_back(dependency.successor);
+    predecessors_[dependency.successor].push_back(dependency.predecessor);
+  }
+}
+
+bool SyncPoint::PredecessorsAllCleared(const std::string& point) {
+  for (const auto& pred : predecessors_[point]) {
+    if (cleared_points_.count(pred) == 0) {
+      return false;
+    }
+  }
+  return true;
+}
+
+void SyncPoint::EnableProcessing() {
+  std::unique_lock<std::mutex> lock(mutex_);
+  enabled_ = true;
+}
+
+void SyncPoint::DisableProcessing() {
+  std::unique_lock<std::mutex> lock(mutex_);
+  enabled_ = false;
+}
+
+void SyncPoint::ClearTrace() {
+  std::unique_lock<std::mutex> lock(mutex_);
+  cleared_points_.clear();
+}
+
+void SyncPoint::Process(const std::string& point) {
+  std::unique_lock<std::mutex> lock(mutex_);
+
+  if (!enabled_) return;
+
+  while (!PredecessorsAllCleared(point)) {
+    cv_.wait(lock);
+  }
+
+  cleared_points_.insert(point);
+  cv_.notify_all();
+}
+
+}  // namespace rocksdb
+#endif  // NDEBUG
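An editorial sketch, not part of the patch: forcing one thread's sync point to happen before another's in a debug-build test. The point names are arbitrary strings chosen by the test.

#include "util/sync_point.h"

void SetUpRace() {
  rocksdb::SyncPoint::GetInstance()->LoadDependency(
      {{"Test:PointB", "Test:PointA"}});  // predecessor, successor
  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
  // In the code under test:
  //   TEST_SYNC_POINT("Test:PointA");  // blocks until PointB has executed
  //   TEST_SYNC_POINT("Test:PointB");
}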
+
+ private:
+  bool PredecessorsAllCleared(const std::string& point);
+
+  // successor/predecessor map loaded from LoadDependency
+  std::unordered_map<std::string, std::vector<std::string>> successors_;
+  std::unordered_map<std::string, std::vector<std::string>> predecessors_;
+
+  std::mutex mutex_;
+  std::condition_variable cv_;
+  // sync points that have been passed through
+  std::unordered_set<std::string> cleared_points_;
+  bool enabled_ = false;
+};
+
+} // namespace rocksdb
+
+// Use TEST_SYNC_POINT to specify sync points inside the code base.
+// Sync points can have a happens-after dependency on other sync points,
+// configured at runtime via SyncPoint::LoadDependency. This can be
+// utilized to reproduce race conditions between threads.
+// See TransactionLogIteratorRace in db_test.cc for an example use case.
+// TEST_SYNC_POINT is a no-op in release builds.
+#define TEST_SYNC_POINT(x) rocksdb::SyncPoint::GetInstance()->Process(x)
+#endif // NDEBUG
diff --git a/util/testharness.cc b/util/testharness.cc
new file mode 100644
index 0000000000..4208d2c46a
--- /dev/null
+++ b/util/testharness.cc
@@ -0,0 +1,84 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "util/testharness.h"
+#include <stdlib.h>
+#include <string.h>
+#include <string>
+#include <vector>
+#include "port/stack_trace.h"
+
+namespace rocksdb {
+namespace test {
+
+namespace {
+struct Test {
+  const char* base;
+  const char* name;
+  void (*func)();
+};
+std::vector<Test>* tests;
+}
+
+bool RegisterTest(const char* base, const char* name, void (*func)()) {
+  if (tests == nullptr) {
+    tests = new std::vector<Test>;
+  }
+  Test t;
+  t.base = base;
+  t.name = name;
+  t.func = func;
+  tests->push_back(t);
+  return true;
+}
+
+int RunAllTests() {
+  port::InstallStackTraceHandler();
+
+  const char* matcher = getenv("ROCKSDB_TESTS");
+
+  int num = 0;
+  if (tests != nullptr) {
+    for (unsigned int i = 0; i < tests->size(); i++) {
+      const Test& t = (*tests)[i];
+      if (matcher != nullptr) {
+        std::string name = t.base;
+        name.push_back('.');
+        name.append(t.name);
+        if (strstr(name.c_str(), matcher) == nullptr) {
+          continue;
+        }
+      }
+      fprintf(stderr, "==== Test %s.%s\n", t.base, t.name);
+      (*t.func)();
+      ++num;
+    }
+  }
+  fprintf(stderr, "==== PASSED %d tests\n", num);
+  return 0;
+}
+
+std::string TmpDir() {
+  std::string dir;
+  Status s = Env::Default()->GetTestDirectory(&dir);
+  ASSERT_TRUE(s.ok()) << s.ToString();
+  return dir;
+}
+
+int RandomSeed() {
+  const char* env = getenv("TEST_RANDOM_SEED");
+  int result = (env != nullptr ? atoi(env) : 301);
+  if (result <= 0) {
+    result = 301;
+  }
+  return result;
+}
+
+} // namespace test
+} // namespace rocksdb
diff --git a/util/testharness.h b/util/testharness.h
new file mode 100644
index 0000000000..52c29848d9
--- /dev/null
+++ b/util/testharness.h
@@ -0,0 +1,142 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include +#include +#include +#include "port/stack_trace.h" +#include "rocksdb/env.h" +#include "rocksdb/slice.h" +#include "util/random.h" + +namespace rocksdb { +namespace test { + +// Run some of the tests registered by the TEST() macro. If the +// environment variable "ROCKSDB_TESTS" is not set, runs all tests. +// Otherwise, runs only the tests whose name contains the value of +// "ROCKSDB_TESTS" as a substring. E.g., suppose the tests are: +// TEST(Foo, Hello) { ... } +// TEST(Foo, World) { ... } +// ROCKSDB_TESTS=Hello will run the first test +// ROCKSDB_TESTS=o will run both tests +// ROCKSDB_TESTS=Junk will run no tests +// +// Returns 0 if all tests pass. +// Dies or returns a non-zero value if some test fails. +extern int RunAllTests(); + +// Return the directory to use for temporary storage. +extern std::string TmpDir(); + +// Return a randomization seed for this run. Typically returns the +// same number on repeated invocations of this binary, but automated +// runs may be able to vary the seed. +extern int RandomSeed(); + +// An instance of Tester is allocated to hold temporary state during +// the execution of an assertion. +class Tester { + private: + bool ok_; + const char* fname_; + int line_; + std::stringstream ss_; + + public: + Tester(const char* f, int l) + : ok_(true), fname_(f), line_(l) { + } + + ~Tester() { + if (!ok_) { + fprintf(stderr, "%s:%d:%s\n", fname_, line_, ss_.str().c_str()); + port::PrintStack(2); + exit(1); + } + } + + Tester& Is(bool b, const char* msg) { + if (!b) { + ss_ << " Assertion failure " << msg; + ok_ = false; + } + return *this; + } + + Tester& IsOk(const Status& s) { + if (!s.ok()) { + ss_ << " " << s.ToString(); + ok_ = false; + } + return *this; + } + +#define BINARY_OP(name,op) \ + template \ + Tester& name(const X& x, const Y& y) { \ + if (! (x op y)) { \ + ss_ << " failed: " << x << (" " #op " ") << y; \ + ok_ = false; \ + } \ + return *this; \ + } + + BINARY_OP(IsEq, ==) + BINARY_OP(IsNe, !=) + BINARY_OP(IsGe, >=) + BINARY_OP(IsGt, >) + BINARY_OP(IsLe, <=) + BINARY_OP(IsLt, <) +#undef BINARY_OP + + // Attach the specified value to the error message if an error has occurred + template + Tester& operator<<(const V& value) { + if (!ok_) { + ss_ << " " << value; + } + return *this; + } +}; + +#define ASSERT_TRUE(c) ::rocksdb::test::Tester(__FILE__, __LINE__).Is((c), #c) +#define ASSERT_OK(s) ::rocksdb::test::Tester(__FILE__, __LINE__).IsOk((s)) +#define ASSERT_EQ(a,b) ::rocksdb::test::Tester(__FILE__, __LINE__).IsEq((a),(b)) +#define ASSERT_NE(a,b) ::rocksdb::test::Tester(__FILE__, __LINE__).IsNe((a),(b)) +#define ASSERT_GE(a,b) ::rocksdb::test::Tester(__FILE__, __LINE__).IsGe((a),(b)) +#define ASSERT_GT(a,b) ::rocksdb::test::Tester(__FILE__, __LINE__).IsGt((a),(b)) +#define ASSERT_LE(a,b) ::rocksdb::test::Tester(__FILE__, __LINE__).IsLe((a),(b)) +#define ASSERT_LT(a,b) ::rocksdb::test::Tester(__FILE__, __LINE__).IsLt((a),(b)) + +#define TCONCAT(a,b) TCONCAT1(a,b) +#define TCONCAT1(a,b) a##b + +#define TEST(base,name) \ +class TCONCAT(_Test_,name) : public base { \ + public: \ + void _Run(); \ + static void _RunIt() { \ + TCONCAT(_Test_,name) t; \ + t._Run(); \ + } \ +}; \ +bool TCONCAT(_Test_ignored_,name) = \ + ::rocksdb::test::RegisterTest(#base, #name, &TCONCAT(_Test_,name)::_RunIt); \ +void TCONCAT(_Test_,name)::_Run() + +// Register the specified test. 
Typically not used directly, but +// invoked via the macro expansion of TEST. +extern bool RegisterTest(const char* base, const char* name, void (*func)()); + + +} // namespace test +} // namespace rocksdb diff --git a/util/testutil.cc b/util/testutil.cc new file mode 100644 index 0000000000..13e781e646 --- /dev/null +++ b/util/testutil.cc @@ -0,0 +1,56 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "util/testutil.h" + +#include "util/random.h" + +namespace rocksdb { +namespace test { + +Slice RandomString(Random* rnd, int len, std::string* dst) { + dst->resize(len); + for (int i = 0; i < len; i++) { + (*dst)[i] = static_cast(' ' + rnd->Uniform(95)); // ' ' .. '~' + } + return Slice(*dst); +} + +std::string RandomKey(Random* rnd, int len) { + // Make sure to generate a wide variety of characters so we + // test the boundary conditions for short-key optimizations. + static const char kTestChars[] = { + '\0', '\1', 'a', 'b', 'c', 'd', 'e', '\xfd', '\xfe', '\xff' + }; + std::string result; + for (int i = 0; i < len; i++) { + result += kTestChars[rnd->Uniform(sizeof(kTestChars))]; + } + return result; +} + + +extern Slice CompressibleString(Random* rnd, double compressed_fraction, + int len, std::string* dst) { + int raw = static_cast(len * compressed_fraction); + if (raw < 1) raw = 1; + std::string raw_data; + RandomString(rnd, raw, &raw_data); + + // Duplicate the random data until we have filled "len" bytes + dst->clear(); + while (dst->size() < (unsigned int)len) { + dst->append(raw_data); + } + dst->resize(len); + return Slice(*dst); +} + +} // namespace test +} // namespace rocksdb diff --git a/util/testutil.h b/util/testutil.h new file mode 100644 index 0000000000..4fc8c0f5b3 --- /dev/null +++ b/util/testutil.h @@ -0,0 +1,80 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include +#include "db/dbformat.h" +#include "rocksdb/env.h" +#include "rocksdb/slice.h" +#include "util/random.h" + +namespace rocksdb { +namespace test { + +// Store in *dst a random string of length "len" and return a Slice that +// references the generated data. +extern Slice RandomString(Random* rnd, int len, std::string* dst); + +// Return a random key with the specified length that may contain interesting +// characters (e.g. \x00, \xff, etc.). +extern std::string RandomKey(Random* rnd, int len); + +// Store in *dst a string of length "len" that will compress to +// "N*compressed_fraction" bytes and return a Slice that references +// the generated data. 
+extern Slice CompressibleString(Random* rnd, double compressed_fraction, + int len, std::string* dst); + +// A wrapper that allows injection of errors. +class ErrorEnv : public EnvWrapper { + public: + bool writable_file_error_; + int num_writable_file_errors_; + + ErrorEnv() : EnvWrapper(Env::Default()), + writable_file_error_(false), + num_writable_file_errors_(0) { } + + virtual Status NewWritableFile(const std::string& fname, + unique_ptr* result, + const EnvOptions& soptions) { + result->reset(); + if (writable_file_error_) { + ++num_writable_file_errors_; + return Status::IOError(fname, "fake error"); + } + return target()->NewWritableFile(fname, result, soptions); + } +}; + +// An internal comparator that just forward comparing results from the +// user comparator in it. Can be used to test entities that have no dependency +// on internal key structure but consumes InternalKeyComparator, like +// BlockBasedTable. +class PlainInternalKeyComparator : public InternalKeyComparator { + public: + explicit PlainInternalKeyComparator(const Comparator* c) + : InternalKeyComparator(c) {} + + virtual ~PlainInternalKeyComparator() {} + + virtual int Compare(const Slice& a, const Slice& b) const override { + return user_comparator()->Compare(a, b); + } + virtual void FindShortestSeparator(std::string* start, + const Slice& limit) const override { + user_comparator()->FindShortestSeparator(start, limit); + } + virtual void FindShortSuccessor(std::string* key) const override { + user_comparator()->FindShortSuccessor(key); + } +}; + +} // namespace test +} // namespace rocksdb diff --git a/util/thread_local.cc b/util/thread_local.cc new file mode 100644 index 0000000000..bc8a4c7d28 --- /dev/null +++ b/util/thread_local.cc @@ -0,0 +1,243 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
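+//
+// A minimal ThreadLocalPtr usage sketch (hypothetical type name "Widget";
+// ThreadLocalPtr itself is declared in util/thread_local.h):
+//
+//   static void UnrefWidget(void* ptr) { delete static_cast<Widget*>(ptr); }
+//   ThreadLocalPtr tls(&UnrefWidget);
+//   if (tls.Get() == nullptr) {
+//     tls.Reset(new Widget());  // visible only to the calling thread
+//   }
+//   auto* w = static_cast<Widget*>(tls.Get());
+//
+// UnrefWidget runs for each thread's stored pointer when that thread exits
+// or when the ThreadLocalPtr instance itself is destroyed.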
+
+#include "util/thread_local.h"
+#include "util/mutexlock.h"
+#include "port/likely.h"
+
+namespace rocksdb {
+
+port::Mutex ThreadLocalPtr::StaticMeta::mutex_;
+#if !defined(OS_MACOSX)
+__thread ThreadLocalPtr::ThreadData* ThreadLocalPtr::StaticMeta::tls_ = nullptr;
+#endif
+
+ThreadLocalPtr::StaticMeta* ThreadLocalPtr::Instance() {
+  static ThreadLocalPtr::StaticMeta inst;
+  return &inst;
+}
+
+void ThreadLocalPtr::StaticMeta::OnThreadExit(void* ptr) {
+  auto* tls = static_cast<ThreadData*>(ptr);
+  assert(tls != nullptr);
+
+  auto* inst = Instance();
+  pthread_setspecific(inst->pthread_key_, nullptr);
+
+  MutexLock l(&mutex_);
+  inst->RemoveThreadData(tls);
+  // Unref stored pointers of current thread from all instances
+  uint32_t id = 0;
+  for (auto& e : tls->entries) {
+    void* raw = e.ptr.load(std::memory_order_relaxed);
+    if (raw != nullptr) {
+      auto unref = inst->GetHandler(id);
+      if (unref != nullptr) {
+        unref(raw);
+      }
+    }
+    ++id;
+  }
+  // Delete the thread-local structure regardless of platform
+  delete tls;
+}
+
+ThreadLocalPtr::StaticMeta::StaticMeta() : next_instance_id_(0) {
+  if (pthread_key_create(&pthread_key_, &OnThreadExit) != 0) {
+    throw std::runtime_error("pthread_key_create failed");
+  }
+  head_.next = &head_;
+  head_.prev = &head_;
+}
+
+void ThreadLocalPtr::StaticMeta::AddThreadData(ThreadLocalPtr::ThreadData* d) {
+  mutex_.AssertHeld();
+  d->next = &head_;
+  d->prev = head_.prev;
+  head_.prev->next = d;
+  head_.prev = d;
+}
+
+void ThreadLocalPtr::StaticMeta::RemoveThreadData(
+    ThreadLocalPtr::ThreadData* d) {
+  mutex_.AssertHeld();
+  d->next->prev = d->prev;
+  d->prev->next = d->next;
+  d->next = d->prev = d;
+}
+
+ThreadLocalPtr::ThreadData* ThreadLocalPtr::StaticMeta::GetThreadLocal() {
+#if defined(OS_MACOSX)
+  // Make this local variable name look like a member variable so that we
+  // can share all the code below
+  ThreadData* tls_ =
+      static_cast<ThreadData*>(pthread_getspecific(Instance()->pthread_key_));
+#endif
+
+  if (UNLIKELY(tls_ == nullptr)) {
+    auto* inst = Instance();
+    tls_ = new ThreadData();
+    {
+      // Register it in the global chain; needs to be done before thread exit
+      // handler registration
+      MutexLock l(&mutex_);
+      inst->AddThreadData(tls_);
+    }
+    // Even if it is not OS_MACOSX, we need to register a value for
+    // pthread_key_ so that its exit handler will be triggered.
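+    // (OnThreadExit above is that handler: on thread exit it unrefs this
+    // thread's stored pointers and deletes the ThreadData.)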
+ if (pthread_setspecific(inst->pthread_key_, tls_) != 0) { + { + MutexLock l(&mutex_); + inst->RemoveThreadData(tls_); + } + delete tls_; + throw std::runtime_error("pthread_setspecific failed"); + } + } + return tls_; +} + +void* ThreadLocalPtr::StaticMeta::Get(uint32_t id) const { + auto* tls = GetThreadLocal(); + if (UNLIKELY(id >= tls->entries.size())) { + return nullptr; + } + return tls->entries[id].ptr.load(std::memory_order_relaxed); +} + +void ThreadLocalPtr::StaticMeta::Reset(uint32_t id, void* ptr) { + auto* tls = GetThreadLocal(); + if (UNLIKELY(id >= tls->entries.size())) { + // Need mutex to protect entries access within ReclaimId + MutexLock l(&mutex_); + tls->entries.resize(id + 1); + } + tls->entries[id].ptr.store(ptr, std::memory_order_relaxed); +} + +void* ThreadLocalPtr::StaticMeta::Swap(uint32_t id, void* ptr) { + auto* tls = GetThreadLocal(); + if (UNLIKELY(id >= tls->entries.size())) { + // Need mutex to protect entries access within ReclaimId + MutexLock l(&mutex_); + tls->entries.resize(id + 1); + } + return tls->entries[id].ptr.exchange(ptr, std::memory_order_relaxed); +} + +bool ThreadLocalPtr::StaticMeta::CompareAndSwap(uint32_t id, void* ptr, + void*& expected) { + auto* tls = GetThreadLocal(); + if (UNLIKELY(id >= tls->entries.size())) { + // Need mutex to protect entries access within ReclaimId + MutexLock l(&mutex_); + tls->entries.resize(id + 1); + } + return tls->entries[id].ptr.compare_exchange_strong(expected, ptr, + std::memory_order_relaxed, std::memory_order_relaxed); +} + +void ThreadLocalPtr::StaticMeta::Scrape(uint32_t id, autovector* ptrs, + void* const replacement) { + MutexLock l(&mutex_); + for (ThreadData* t = head_.next; t != &head_; t = t->next) { + if (id < t->entries.size()) { + void* ptr = + t->entries[id].ptr.exchange(replacement, std::memory_order_relaxed); + if (ptr != nullptr) { + ptrs->push_back(ptr); + } + } + } +} + +void ThreadLocalPtr::StaticMeta::SetHandler(uint32_t id, UnrefHandler handler) { + MutexLock l(&mutex_); + handler_map_[id] = handler; +} + +UnrefHandler ThreadLocalPtr::StaticMeta::GetHandler(uint32_t id) { + mutex_.AssertHeld(); + auto iter = handler_map_.find(id); + if (iter == handler_map_.end()) { + return nullptr; + } + return iter->second; +} + +uint32_t ThreadLocalPtr::StaticMeta::GetId() { + MutexLock l(&mutex_); + if (free_instance_ids_.empty()) { + return next_instance_id_++; + } + + uint32_t id = free_instance_ids_.back(); + free_instance_ids_.pop_back(); + return id; +} + +uint32_t ThreadLocalPtr::StaticMeta::PeekId() const { + MutexLock l(&mutex_); + if (!free_instance_ids_.empty()) { + return free_instance_ids_.back(); + } + return next_instance_id_; +} + +void ThreadLocalPtr::StaticMeta::ReclaimId(uint32_t id) { + // This id is not used, go through all thread local data and release + // corresponding value + MutexLock l(&mutex_); + auto unref = GetHandler(id); + for (ThreadData* t = head_.next; t != &head_; t = t->next) { + if (id < t->entries.size()) { + void* ptr = + t->entries[id].ptr.exchange(nullptr, std::memory_order_relaxed); + if (ptr != nullptr && unref != nullptr) { + unref(ptr); + } + } + } + handler_map_[id] = nullptr; + free_instance_ids_.push_back(id); +} + +ThreadLocalPtr::ThreadLocalPtr(UnrefHandler handler) + : id_(Instance()->GetId()) { + if (handler != nullptr) { + Instance()->SetHandler(id_, handler); + } +} + +ThreadLocalPtr::~ThreadLocalPtr() { + Instance()->ReclaimId(id_); +} + +void* ThreadLocalPtr::Get() const { + return Instance()->Get(id_); +} + +void 
ThreadLocalPtr::Reset(void* ptr) {
+  Instance()->Reset(id_, ptr);
+}
+
+void* ThreadLocalPtr::Swap(void* ptr) {
+  return Instance()->Swap(id_, ptr);
+}
+
+bool ThreadLocalPtr::CompareAndSwap(void* ptr, void*& expected) {
+  return Instance()->CompareAndSwap(id_, ptr, expected);
+}
+
+void ThreadLocalPtr::Scrape(autovector<void*>* ptrs, void* const replacement) {
+  Instance()->Scrape(id_, ptrs, replacement);
+}
+
+} // namespace rocksdb
diff --git a/util/thread_local.h b/util/thread_local.h
new file mode 100644
index 0000000000..a037a9ceb9
--- /dev/null
+++ b/util/thread_local.h
@@ -0,0 +1,166 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <atomic>
+#include <memory>
+#include <unordered_map>
+#include <vector>
+
+#include "util/autovector.h"
+#include "port/port_posix.h"
+#include "util/thread_local.h"
+
+namespace rocksdb {
+
+// Cleanup function that will be called for a stored thread local
+// pointer (if not NULL) when one of the following happens:
+// (1) a thread terminates
+// (2) a ThreadLocalPtr is destroyed
+typedef void (*UnrefHandler)(void* ptr);
+
+// Thread-local storage that only stores values of pointer type. The storage
+// distinguishes data coming from different threads and different
+// ThreadLocalPtr instances. For example, if a regular thread_local variable
+// A were declared in DBImpl, two DBImpl objects would share the same A.
+// ThreadLocalPtr avoids that conflict. The total storage size equals
+// # of threads * # of ThreadLocalPtr instances. It is not efficient in terms
+// of space, but it should serve most of our use cases well and keeps the
+// code simple.
+class ThreadLocalPtr {
+ public:
+  explicit ThreadLocalPtr(UnrefHandler handler = nullptr);
+
+  ~ThreadLocalPtr();
+
+  // Return the current pointer stored in thread local
+  void* Get() const;
+
+  // Set a new pointer value to the thread local storage.
+  void Reset(void* ptr);
+
+  // Atomically swap the supplied ptr and return the previous value
+  void* Swap(void* ptr);
+
+  // Atomically compare the stored value with expected. Set the new
+  // pointer value to thread local only if the comparison is true.
+  // Otherwise, expected is updated with the stored value.
+  // Return true on success, false on failure
+  bool CompareAndSwap(void* ptr, void*& expected);
+
+  // Reset all thread local data to replacement, and return non-nullptr
+  // data for all existing threads
+  void Scrape(autovector<void*>* ptrs, void* const replacement);
+
+ protected:
+  struct Entry {
+    Entry() : ptr(nullptr) {}
+    Entry(const Entry& e) : ptr(e.ptr.load(std::memory_order_relaxed)) {}
+    std::atomic<void*> ptr;
+  };
+
+  // This is the structure that is declared as "thread_local" storage.
+  // The vector keeps a list of atomic pointers for all instances for the
+  // "current" thread. The vector is indexed by an Id that is unique in the
+  // process and associated with one ThreadLocalPtr instance. The Id is
+  // assigned by a global StaticMeta singleton.
// So if we instantiated 3 ThreadLocalPtr
+  // instances, each thread will have a ThreadData with a vector of size 3:
+  //     ---------------------------------------------------
+  //     |          | instance 1 | instance 2 | instance 3 |
+  //     ---------------------------------------------------
+  //     | thread 1 |    void*   |    void*   |    void*   | <- ThreadData
+  //     ---------------------------------------------------
+  //     | thread 2 |    void*   |    void*   |    void*   | <- ThreadData
+  //     ---------------------------------------------------
+  //     | thread 3 |    void*   |    void*   |    void*   | <- ThreadData
+  //     ---------------------------------------------------
+  struct ThreadData {
+    ThreadData() : entries() {}
+    std::vector<Entry> entries;
+    ThreadData* next;
+    ThreadData* prev;
+  };
+
+  class StaticMeta {
+   public:
+    StaticMeta();
+
+    // Return the next available Id
+    uint32_t GetId();
+    // Return the next available Id without claiming it
+    uint32_t PeekId() const;
+    // Return the given Id back to the free pool. This also triggers
+    // UnrefHandler for associated pointer value (if not NULL) for all threads.
+    void ReclaimId(uint32_t id);
+
+    // Return the pointer value for the given id for the current thread.
+    void* Get(uint32_t id) const;
+    // Reset the pointer value for the given id for the current thread.
+    // It triggers UnrefHandler if the id has an existing pointer value.
+    void Reset(uint32_t id, void* ptr);
+    // Atomically swap the supplied ptr and return the previous value
+    void* Swap(uint32_t id, void* ptr);
+    // Atomically compare and swap the provided value only if it equals
+    // the expected value.
+    bool CompareAndSwap(uint32_t id, void* ptr, void*& expected);
+    // Reset all thread local data to replacement, and return non-nullptr
+    // data for all existing threads
+    void Scrape(uint32_t id, autovector<void*>* ptrs, void* const replacement);
+
+    // Register the UnrefHandler for id
+    void SetHandler(uint32_t id, UnrefHandler handler);
+
+   private:
+    // Get the UnrefHandler for id.
+    // REQUIRES: mutex locked
+    UnrefHandler GetHandler(uint32_t id);
+
+    // Triggered before a thread terminates
+    static void OnThreadExit(void* ptr);
+
+    // Add current thread's ThreadData to the global chain
+    // REQUIRES: mutex locked
+    void AddThreadData(ThreadData* d);
+
+    // Remove current thread's ThreadData from the global chain
+    // REQUIRES: mutex locked
+    void RemoveThreadData(ThreadData* d);
+
+    static ThreadData* GetThreadLocal();
+
+    uint32_t next_instance_id_;
+    // Used to recycle Ids in case ThreadLocalPtr is instantiated and destroyed
+    // frequently. This also prevents it from blowing up the vector space.
+    autovector<uint32_t> free_instance_ids_;
+    // Chain all thread local structures together. This is necessary since
+    // when one ThreadLocalPtr gets destroyed, we need to loop over each
+    // thread's version of pointer corresponding to that instance and
+    // call UnrefHandler for it.
+    ThreadData head_;
+
+    std::unordered_map<uint32_t, UnrefHandler> handler_map_;
+
+    // protect inst, next_instance_id_, free_instance_ids_, head_,
+    // ThreadData.entries
+    static port::Mutex mutex_;
+#if !defined(OS_MACOSX)
+    // Thread local storage
+    static __thread ThreadData* tls_;
+#endif
+    // Used to make thread exit trigger possible if !defined(OS_MACOSX).
+    // Otherwise, used to retrieve thread data.
+ pthread_key_t pthread_key_; + }; + + static StaticMeta* Instance(); + + const uint32_t id_; +}; + +} // namespace rocksdb diff --git a/util/thread_local_test.cc b/util/thread_local_test.cc new file mode 100644 index 0000000000..70dfa956ea --- /dev/null +++ b/util/thread_local_test.cc @@ -0,0 +1,472 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#include + +#include "rocksdb/env.h" +#include "port/port_posix.h" +#include "util/autovector.h" +#include "util/thread_local.h" +#include "util/testharness.h" +#include "util/testutil.h" + +namespace rocksdb { + +class ThreadLocalTest { + public: + ThreadLocalTest() : env_(Env::Default()) {} + + Env* env_; +}; + +namespace { + +struct Params { + Params(port::Mutex* m, port::CondVar* c, int* unref, int n, + UnrefHandler handler = nullptr) + : mu(m), + cv(c), + unref(unref), + total(n), + started(0), + completed(0), + doWrite(false), + tls1(handler), + tls2(nullptr) {} + + port::Mutex* mu; + port::CondVar* cv; + int* unref; + int total; + int started; + int completed; + bool doWrite; + ThreadLocalPtr tls1; + ThreadLocalPtr* tls2; +}; + +class IDChecker : public ThreadLocalPtr { + public: + static uint32_t PeekId() { return Instance()->PeekId(); } +}; + +} // anonymous namespace + +TEST(ThreadLocalTest, UniqueIdTest) { + port::Mutex mu; + port::CondVar cv(&mu); + + ASSERT_EQ(IDChecker::PeekId(), 0u); + // New ThreadLocal instance bumps id by 1 + { + // Id used 0 + Params p1(&mu, &cv, nullptr, 1u); + ASSERT_EQ(IDChecker::PeekId(), 1u); + // Id used 1 + Params p2(&mu, &cv, nullptr, 1u); + ASSERT_EQ(IDChecker::PeekId(), 2u); + // Id used 2 + Params p3(&mu, &cv, nullptr, 1u); + ASSERT_EQ(IDChecker::PeekId(), 3u); + // Id used 3 + Params p4(&mu, &cv, nullptr, 1u); + ASSERT_EQ(IDChecker::PeekId(), 4u); + } + // id 3, 2, 1, 0 are in the free queue in order + ASSERT_EQ(IDChecker::PeekId(), 0u); + + // pick up 0 + Params p1(&mu, &cv, nullptr, 1u); + ASSERT_EQ(IDChecker::PeekId(), 1u); + // pick up 1 + Params* p2 = new Params(&mu, &cv, nullptr, 1u); + ASSERT_EQ(IDChecker::PeekId(), 2u); + // pick up 2 + Params p3(&mu, &cv, nullptr, 1u); + ASSERT_EQ(IDChecker::PeekId(), 3u); + // return up 1 + delete p2; + ASSERT_EQ(IDChecker::PeekId(), 1u); + // Now we have 3, 1 in queue + // pick up 1 + Params p4(&mu, &cv, nullptr, 1u); + ASSERT_EQ(IDChecker::PeekId(), 3u); + // pick up 3 + Params p5(&mu, &cv, nullptr, 1u); + // next new id + ASSERT_EQ(IDChecker::PeekId(), 4u); + // After exit, id sequence in queue: + // 3, 1, 2, 0 +} + +TEST(ThreadLocalTest, SequentialReadWriteTest) { + // global id list carries over 3, 1, 2, 0 + ASSERT_EQ(IDChecker::PeekId(), 0u); + + port::Mutex mu; + port::CondVar cv(&mu); + Params p(&mu, &cv, nullptr, 1); + ThreadLocalPtr tls2; + p.tls2 = &tls2; + + auto func = [](void* ptr) { + auto& p = *static_cast(ptr); + + ASSERT_TRUE(p.tls1.Get() == nullptr); + p.tls1.Reset(reinterpret_cast(1)); + ASSERT_TRUE(p.tls1.Get() == reinterpret_cast(1)); + p.tls1.Reset(reinterpret_cast(2)); + ASSERT_TRUE(p.tls1.Get() == reinterpret_cast(2)); + + ASSERT_TRUE(p.tls2->Get() == nullptr); + p.tls2->Reset(reinterpret_cast(1)); + ASSERT_TRUE(p.tls2->Get() == reinterpret_cast(1)); + p.tls2->Reset(reinterpret_cast(2)); + ASSERT_TRUE(p.tls2->Get() == reinterpret_cast(2)); + + p.mu->Lock(); + ++(p.completed); + 
p.cv->SignalAll(); + p.mu->Unlock(); + }; + + for (int iter = 0; iter < 1024; ++iter) { + ASSERT_EQ(IDChecker::PeekId(), 1u); + // Another new thread, read/write should not see value from previous thread + env_->StartThread(func, static_cast(&p)); + mu.Lock(); + while (p.completed != iter + 1) { + cv.Wait(); + } + mu.Unlock(); + ASSERT_EQ(IDChecker::PeekId(), 1u); + } +} + +TEST(ThreadLocalTest, ConcurrentReadWriteTest) { + // global id list carries over 3, 1, 2, 0 + ASSERT_EQ(IDChecker::PeekId(), 0u); + + ThreadLocalPtr tls2; + port::Mutex mu1; + port::CondVar cv1(&mu1); + Params p1(&mu1, &cv1, nullptr, 16); + p1.tls2 = &tls2; + + port::Mutex mu2; + port::CondVar cv2(&mu2); + Params p2(&mu2, &cv2, nullptr, 16); + p2.doWrite = true; + p2.tls2 = &tls2; + + auto func = [](void* ptr) { + auto& p = *static_cast(ptr); + + p.mu->Lock(); + int own = ++(p.started); + p.cv->SignalAll(); + while (p.started != p.total) { + p.cv->Wait(); + } + p.mu->Unlock(); + + // Let write threads write a different value from the read threads + if (p.doWrite) { + own += 8192; + } + + ASSERT_TRUE(p.tls1.Get() == nullptr); + ASSERT_TRUE(p.tls2->Get() == nullptr); + + auto* env = Env::Default(); + auto start = env->NowMicros(); + + p.tls1.Reset(reinterpret_cast(own)); + p.tls2->Reset(reinterpret_cast(own + 1)); + // Loop for 1 second + while (env->NowMicros() - start < 1000 * 1000) { + for (int iter = 0; iter < 100000; ++iter) { + ASSERT_TRUE(p.tls1.Get() == reinterpret_cast(own)); + ASSERT_TRUE(p.tls2->Get() == reinterpret_cast(own + 1)); + if (p.doWrite) { + p.tls1.Reset(reinterpret_cast(own)); + p.tls2->Reset(reinterpret_cast(own + 1)); + } + } + } + + p.mu->Lock(); + ++(p.completed); + p.cv->SignalAll(); + p.mu->Unlock(); + }; + + // Initiate 2 instnaces: one keeps writing and one keeps reading. + // The read instance should not see data from the write instance. + // Each thread local copy of the value are also different from each + // other. 
+ for (int th = 0; th < p1.total; ++th) { + env_->StartThread(func, static_cast(&p1)); + } + for (int th = 0; th < p2.total; ++th) { + env_->StartThread(func, static_cast(&p2)); + } + + mu1.Lock(); + while (p1.completed != p1.total) { + cv1.Wait(); + } + mu1.Unlock(); + + mu2.Lock(); + while (p2.completed != p2.total) { + cv2.Wait(); + } + mu2.Unlock(); + + ASSERT_EQ(IDChecker::PeekId(), 3u); +} + +TEST(ThreadLocalTest, Unref) { + ASSERT_EQ(IDChecker::PeekId(), 0u); + + auto unref = [](void* ptr) { + auto& p = *static_cast(ptr); + p.mu->Lock(); + ++(*p.unref); + p.mu->Unlock(); + }; + + // Case 0: no unref triggered if ThreadLocalPtr is never accessed + auto func0 = [](void* ptr) { + auto& p = *static_cast(ptr); + + p.mu->Lock(); + ++(p.started); + p.cv->SignalAll(); + while (p.started != p.total) { + p.cv->Wait(); + } + p.mu->Unlock(); + }; + + for (int th = 1; th <= 128; th += th) { + port::Mutex mu; + port::CondVar cv(&mu); + int unref_count = 0; + Params p(&mu, &cv, &unref_count, th, unref); + + for (int i = 0; i < p.total; ++i) { + env_->StartThread(func0, static_cast(&p)); + } + env_->WaitForJoin(); + ASSERT_EQ(unref_count, 0); + } + + // Case 1: unref triggered by thread exit + auto func1 = [](void* ptr) { + auto& p = *static_cast(ptr); + + p.mu->Lock(); + ++(p.started); + p.cv->SignalAll(); + while (p.started != p.total) { + p.cv->Wait(); + } + p.mu->Unlock(); + + ASSERT_TRUE(p.tls1.Get() == nullptr); + ASSERT_TRUE(p.tls2->Get() == nullptr); + + p.tls1.Reset(ptr); + p.tls2->Reset(ptr); + + p.tls1.Reset(ptr); + p.tls2->Reset(ptr); + }; + + for (int th = 1; th <= 128; th += th) { + port::Mutex mu; + port::CondVar cv(&mu); + int unref_count = 0; + ThreadLocalPtr tls2(unref); + Params p(&mu, &cv, &unref_count, th, unref); + p.tls2 = &tls2; + + for (int i = 0; i < p.total; ++i) { + env_->StartThread(func1, static_cast(&p)); + } + + env_->WaitForJoin(); + + // N threads x 2 ThreadLocal instance cleanup on thread exit + ASSERT_EQ(unref_count, 2 * p.total); + } + + // Case 2: unref triggered by ThreadLocal instance destruction + auto func2 = [](void* ptr) { + auto& p = *static_cast(ptr); + + p.mu->Lock(); + ++(p.started); + p.cv->SignalAll(); + while (p.started != p.total) { + p.cv->Wait(); + } + p.mu->Unlock(); + + ASSERT_TRUE(p.tls1.Get() == nullptr); + ASSERT_TRUE(p.tls2->Get() == nullptr); + + p.tls1.Reset(ptr); + p.tls2->Reset(ptr); + + p.tls1.Reset(ptr); + p.tls2->Reset(ptr); + + p.mu->Lock(); + ++(p.completed); + p.cv->SignalAll(); + + // Waiting for instruction to exit thread + while (p.completed != 0) { + p.cv->Wait(); + } + p.mu->Unlock(); + }; + + for (int th = 1; th <= 128; th += th) { + port::Mutex mu; + port::CondVar cv(&mu); + int unref_count = 0; + Params p(&mu, &cv, &unref_count, th, unref); + p.tls2 = new ThreadLocalPtr(unref); + + for (int i = 0; i < p.total; ++i) { + env_->StartThread(func2, static_cast(&p)); + } + + // Wait for all threads to finish using Params + mu.Lock(); + while (p.completed != p.total) { + cv.Wait(); + } + mu.Unlock(); + + // Now destroy one ThreadLocal instance + delete p.tls2; + p.tls2 = nullptr; + // instance destroy for N threads + ASSERT_EQ(unref_count, p.total); + + // Signal to exit + mu.Lock(); + p.completed = 0; + cv.SignalAll(); + mu.Unlock(); + env_->WaitForJoin(); + // additional N threads exit unref for the left instance + ASSERT_EQ(unref_count, 2 * p.total); + } +} + +TEST(ThreadLocalTest, Swap) { + ThreadLocalPtr tls; + tls.Reset(reinterpret_cast(1)); + ASSERT_EQ(reinterpret_cast(tls.Swap(nullptr)), 1); + 
ASSERT_TRUE(tls.Swap(reinterpret_cast(2)) == nullptr); + ASSERT_EQ(reinterpret_cast(tls.Get()), 2); + ASSERT_EQ(reinterpret_cast(tls.Swap(reinterpret_cast(3))), 2); +} + +TEST(ThreadLocalTest, Scrape) { + auto unref = [](void* ptr) { + auto& p = *static_cast(ptr); + p.mu->Lock(); + ++(*p.unref); + p.mu->Unlock(); + }; + + auto func = [](void* ptr) { + auto& p = *static_cast(ptr); + + ASSERT_TRUE(p.tls1.Get() == nullptr); + ASSERT_TRUE(p.tls2->Get() == nullptr); + + p.tls1.Reset(ptr); + p.tls2->Reset(ptr); + + p.tls1.Reset(ptr); + p.tls2->Reset(ptr); + + p.mu->Lock(); + ++(p.completed); + p.cv->SignalAll(); + + // Waiting for instruction to exit thread + while (p.completed != 0) { + p.cv->Wait(); + } + p.mu->Unlock(); + }; + + for (int th = 1; th <= 128; th += th) { + port::Mutex mu; + port::CondVar cv(&mu); + int unref_count = 0; + Params p(&mu, &cv, &unref_count, th, unref); + p.tls2 = new ThreadLocalPtr(unref); + + for (int i = 0; i < p.total; ++i) { + env_->StartThread(func, static_cast(&p)); + } + + // Wait for all threads to finish using Params + mu.Lock(); + while (p.completed != p.total) { + cv.Wait(); + } + mu.Unlock(); + + ASSERT_EQ(unref_count, 0); + + // Scrape all thread local data. No unref at thread + // exit or ThreadLocalPtr destruction + autovector ptrs; + p.tls1.Scrape(&ptrs, nullptr); + p.tls2->Scrape(&ptrs, nullptr); + delete p.tls2; + // Signal to exit + mu.Lock(); + p.completed = 0; + cv.SignalAll(); + mu.Unlock(); + env_->WaitForJoin(); + + ASSERT_EQ(unref_count, 0); + } +} + +TEST(ThreadLocalTest, CompareAndSwap) { + ThreadLocalPtr tls; + ASSERT_TRUE(tls.Swap(reinterpret_cast(1)) == nullptr); + void* expected = reinterpret_cast(1); + // Swap in 2 + ASSERT_TRUE(tls.CompareAndSwap(reinterpret_cast(2), expected)); + expected = reinterpret_cast(100); + // Fail Swap, still 2 + ASSERT_TRUE(!tls.CompareAndSwap(reinterpret_cast(2), expected)); + ASSERT_EQ(expected, reinterpret_cast(2)); + // Swap in 3 + expected = reinterpret_cast(2); + ASSERT_TRUE(tls.CompareAndSwap(reinterpret_cast(3), expected)); + ASSERT_EQ(tls.Get(), reinterpret_cast(3)); +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +} diff --git a/util/vectorrep.cc b/util/vectorrep.cc new file mode 100644 index 0000000000..cf8bad5c4d --- /dev/null +++ b/util/vectorrep.cc @@ -0,0 +1,294 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +#ifndef ROCKSDB_LITE +#include "rocksdb/memtablerep.h" + +#include +#include +#include +#include +#include + +#include "util/arena.h" +#include "db/memtable.h" +#include "port/port.h" +#include "util/mutexlock.h" +#include "util/stl_wrappers.h" + +namespace rocksdb { +namespace { + +using namespace stl_wrappers; + +class VectorRep : public MemTableRep { + public: + VectorRep(const KeyComparator& compare, Arena* arena, size_t count); + + // Insert key into the collection. (The caller will pack key and value into a + // single buffer and pass that in as the parameter to Insert) + // REQUIRES: nothing that compares equal to key is currently in the + // collection. + virtual void Insert(KeyHandle handle) override; + + // Returns true iff an entry that compares equal to key is in the collection. 
+ virtual bool Contains(const char* key) const override; + + virtual void MarkReadOnly() override; + + virtual size_t ApproximateMemoryUsage() override; + + virtual void Get(const LookupKey& k, void* callback_args, + bool (*callback_func)(void* arg, + const char* entry)) override; + + virtual ~VectorRep() override { } + + class Iterator : public MemTableRep::Iterator { + class VectorRep* vrep_; + std::shared_ptr> bucket_; + typename std::vector::const_iterator mutable cit_; + const KeyComparator& compare_; + std::string tmp_; // For passing to EncodeKey + bool mutable sorted_; + void DoSort() const; + public: + explicit Iterator(class VectorRep* vrep, + std::shared_ptr> bucket, + const KeyComparator& compare); + + // Initialize an iterator over the specified collection. + // The returned iterator is not valid. + // explicit Iterator(const MemTableRep* collection); + virtual ~Iterator() override { }; + + // Returns true iff the iterator is positioned at a valid node. + virtual bool Valid() const override; + + // Returns the key at the current position. + // REQUIRES: Valid() + virtual const char* key() const override; + + // Advances to the next position. + // REQUIRES: Valid() + virtual void Next() override; + + // Advances to the previous position. + // REQUIRES: Valid() + virtual void Prev() override; + + // Advance to the first entry with a key >= target + virtual void Seek(const Slice& user_key, const char* memtable_key) override; + + // Position at the first entry in collection. + // Final state of iterator is Valid() iff collection is not empty. + virtual void SeekToFirst() override; + + // Position at the last entry in collection. + // Final state of iterator is Valid() iff collection is not empty. + virtual void SeekToLast() override; + }; + + // Unhide default implementations of GetIterator() + using MemTableRep::GetIterator; + + // Return an iterator over the keys in this representation. + virtual MemTableRep::Iterator* GetIterator(Arena* arena) override; + + private: + friend class Iterator; + typedef std::vector Bucket; + std::shared_ptr bucket_; + mutable port::RWMutex rwlock_; + bool immutable_; + bool sorted_; + const KeyComparator& compare_; +}; + +void VectorRep::Insert(KeyHandle handle) { + auto* key = static_cast(handle); + assert(!Contains(key)); + WriteLock l(&rwlock_); + assert(!immutable_); + bucket_->push_back(key); +} + +// Returns true iff an entry that compares equal to key is in the collection. 
+bool VectorRep::Contains(const char* key) const { + ReadLock l(&rwlock_); + return std::find(bucket_->begin(), bucket_->end(), key) != bucket_->end(); +} + +void VectorRep::MarkReadOnly() { + WriteLock l(&rwlock_); + immutable_ = true; +} + +size_t VectorRep::ApproximateMemoryUsage() { + return + sizeof(bucket_) + sizeof(*bucket_) + + bucket_->size() * + sizeof( + std::remove_reference::type::value_type + ); +} + +VectorRep::VectorRep(const KeyComparator& compare, Arena* arena, size_t count) + : MemTableRep(arena), + bucket_(new Bucket()), + immutable_(false), + sorted_(false), + compare_(compare) { bucket_.get()->reserve(count); } + +VectorRep::Iterator::Iterator(class VectorRep* vrep, + std::shared_ptr> bucket, + const KeyComparator& compare) +: vrep_(vrep), + bucket_(bucket), + cit_(bucket_->end()), + compare_(compare), + sorted_(false) { } + +void VectorRep::Iterator::DoSort() const { + // vrep is non-null means that we are working on an immutable memtable + if (!sorted_ && vrep_ != nullptr) { + WriteLock l(&vrep_->rwlock_); + if (!vrep_->sorted_) { + std::sort(bucket_->begin(), bucket_->end(), Compare(compare_)); + cit_ = bucket_->begin(); + vrep_->sorted_ = true; + } + sorted_ = true; + } + if (!sorted_) { + std::sort(bucket_->begin(), bucket_->end(), Compare(compare_)); + cit_ = bucket_->begin(); + sorted_ = true; + } + assert(sorted_); + assert(vrep_ == nullptr || vrep_->sorted_); +} + +// Returns true iff the iterator is positioned at a valid node. +bool VectorRep::Iterator::Valid() const { + DoSort(); + return cit_ != bucket_->end(); +} + +// Returns the key at the current position. +// REQUIRES: Valid() +const char* VectorRep::Iterator::key() const { + assert(Valid()); + return *cit_; +} + +// Advances to the next position. +// REQUIRES: Valid() +void VectorRep::Iterator::Next() { + assert(Valid()); + if (cit_ == bucket_->end()) { + return; + } + ++cit_; +} + +// Advances to the previous position. +// REQUIRES: Valid() +void VectorRep::Iterator::Prev() { + assert(Valid()); + if (cit_ == bucket_->begin()) { + // If you try to go back from the first element, the iterator should be + // invalidated. So we set it to past-the-end. This means that you can + // treat the container circularly. + cit_ = bucket_->end(); + } else { + --cit_; + } +} + +// Advance to the first entry with a key >= target +void VectorRep::Iterator::Seek(const Slice& user_key, + const char* memtable_key) { + DoSort(); + // Do binary search to find first value not less than the target + const char* encoded_key = + (memtable_key != nullptr) ? memtable_key : EncodeKey(&tmp_, user_key); + cit_ = std::equal_range(bucket_->begin(), + bucket_->end(), + encoded_key, + [this] (const char* a, const char* b) { + return compare_(a, b) < 0; + }).first; +} + +// Position at the first entry in collection. +// Final state of iterator is Valid() iff collection is not empty. +void VectorRep::Iterator::SeekToFirst() { + DoSort(); + cit_ = bucket_->begin(); +} + +// Position at the last entry in collection. +// Final state of iterator is Valid() iff collection is not empty. 
+void VectorRep::Iterator::SeekToLast() { + DoSort(); + cit_ = bucket_->end(); + if (bucket_->size() != 0) { + --cit_; + } +} + +void VectorRep::Get(const LookupKey& k, void* callback_args, + bool (*callback_func)(void* arg, const char* entry)) { + rwlock_.ReadLock(); + VectorRep* vector_rep; + std::shared_ptr bucket; + if (immutable_) { + vector_rep = this; + } else { + vector_rep = nullptr; + bucket.reset(new Bucket(*bucket_)); // make a copy + } + VectorRep::Iterator iter(vector_rep, immutable_ ? bucket_ : bucket, compare_); + rwlock_.Unlock(); + + for (iter.Seek(k.user_key(), k.memtable_key().data()); + iter.Valid() && callback_func(callback_args, iter.key()); iter.Next()) { + } +} + +MemTableRep::Iterator* VectorRep::GetIterator(Arena* arena) { + char* mem = nullptr; + if (arena != nullptr) { + mem = arena->AllocateAligned(sizeof(Iterator)); + } + ReadLock l(&rwlock_); + // Do not sort here. The sorting would be done the first time + // a Seek is performed on the iterator. + if (immutable_) { + if (arena == nullptr) { + return new Iterator(this, bucket_, compare_); + } else { + return new (mem) Iterator(this, bucket_, compare_); + } + } else { + std::shared_ptr tmp; + tmp.reset(new Bucket(*bucket_)); // make a copy + if (arena == nullptr) { + return new Iterator(nullptr, tmp, compare_); + } else { + return new (mem) Iterator(nullptr, tmp, compare_); + } + } +} +} // anon namespace + +MemTableRep* VectorRepFactory::CreateMemTableRep( + const MemTableRep::KeyComparator& compare, Arena* arena, + const SliceTransform*, Logger* logger) { + return new VectorRep(compare, arena, count_); +} +} // namespace rocksdb +#endif // ROCKSDB_LITE diff --git a/util/xxhash.cc b/util/xxhash.cc new file mode 100644 index 0000000000..6dfd4b283f --- /dev/null +++ b/util/xxhash.cc @@ -0,0 +1,475 @@ +/* +xxHash - Fast Hash algorithm +Copyright (C) 2012-2014, Yann Collet. +BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +You can contact the author at : +- xxHash source repository : http://code.google.com/p/xxhash/ +*/ + + +//************************************** +// Tuning parameters +//************************************** +// Unaligned memory access is automatically enabled for "common" CPU, such as x86. 
+// For other CPUs, the compiler will be more cautious and insert extra code to ensure aligned access is respected.
+// If you know your target CPU supports unaligned memory access, you may want to force this option manually to improve performance.
+// You can also enable this parameter if you know your input data will always be aligned (boundaries of 4, for U32).
+#if defined(__ARM_FEATURE_UNALIGNED) || defined(__i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64)
+# define XXH_USE_UNALIGNED_ACCESS 1
+#endif
+
+// XXH_ACCEPT_NULL_INPUT_POINTER :
+// If the input pointer is a null pointer, xxHash default behavior is to trigger a memory access error, since it is a bad pointer.
+// When this option is enabled, xxHash output for null input pointers will be the same as a null-length input.
+// This option has a very small performance cost (only measurable on small inputs).
+// By default, this option is disabled. To enable it, uncomment the define below :
+//#define XXH_ACCEPT_NULL_INPUT_POINTER 1
+
+// XXH_FORCE_NATIVE_FORMAT :
+// By default, the xxHash library provides endian-independent hash values, based on little-endian convention.
+// Results are therefore identical for little-endian and big-endian CPUs.
+// This comes at a performance cost for big-endian CPUs, since some swapping is required to emulate little-endian format.
+// Should endian-independence be of no importance for your application, you may set the #define below to 1.
+// It will improve speed for big-endian CPUs.
+// This option has no impact on little-endian CPUs.
+#define XXH_FORCE_NATIVE_FORMAT 0
+
+
+//**************************************
+// Compiler Specific Options
+//**************************************
+// Disable some Visual warning messages
+#ifdef _MSC_VER  // Visual Studio
+#  pragma warning(disable : 4127)      // disable: C4127: conditional expression is constant
+#endif
+
+#ifdef _MSC_VER    // Visual Studio
+#  define FORCE_INLINE static __forceinline
+#else
+#  ifdef __GNUC__
+#    define FORCE_INLINE static inline __attribute__((always_inline))
+#  else
+#    define FORCE_INLINE static inline
+#  endif
+#endif
+
+
+//**************************************
+// Includes & Memory related functions
+//**************************************
+#include "xxhash.h"
+// Modify the local functions below should you wish to use some other memory related routines
+// for malloc(), free()
+#include <stdlib.h>
+FORCE_INLINE void* XXH_malloc(size_t s) { return malloc(s); }
+FORCE_INLINE void  XXH_free (void* p)  { free(p); }
+// for memcpy()
+#include <string.h>
+FORCE_INLINE void* XXH_memcpy(void* dest, const void* src, size_t size) { return memcpy(dest,src,size); }
+
+
+//**************************************
+// Basic Types
+//**************************************
+#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   // C99
+# include <stdint.h>
+  typedef uint8_t  BYTE;
+  typedef uint16_t U16;
+  typedef uint32_t U32;
+  typedef  int32_t S32;
+  typedef uint64_t U64;
+#else
+  typedef unsigned char      BYTE;
+  typedef unsigned short     U16;
+  typedef unsigned int       U32;
+  typedef   signed int       S32;
+  typedef unsigned long long U64;
+#endif
+
+#if defined(__GNUC__) && !defined(XXH_USE_UNALIGNED_ACCESS)
+#  define _PACKED __attribute__ ((packed))
+#else
+#  define _PACKED
+#endif
+
+#if !defined(XXH_USE_UNALIGNED_ACCESS) && !defined(__GNUC__)
+#  ifdef __IBMC__
+#    pragma pack(1)
+#  else
+#    pragma pack(push, 1)
+#  endif
+#endif
+
+typedef struct _U32_S { U32 v; } _PACKED U32_S;
+
+#if !defined(XXH_USE_UNALIGNED_ACCESS) && !defined(__GNUC__)
+#  pragma pack(pop)
+#endif
+
+#define
A32(x) (((U32_S *)(x))->v) + + +//*************************************** +// Compiler-specific Functions and Macros +//*************************************** +#define GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) + +// Note : although _rotl exists for minGW (GCC under windows), performance seems poor +#if defined(_MSC_VER) +# define XXH_rotl32(x,r) _rotl(x,r) +#else +# define XXH_rotl32(x,r) ((x << r) | (x >> (32 - r))) +#endif + +#if defined(_MSC_VER) // Visual Studio +# define XXH_swap32 _byteswap_ulong +#elif GCC_VERSION >= 403 +# define XXH_swap32 __builtin_bswap32 +#else +static inline U32 XXH_swap32 (U32 x) { + return ((x << 24) & 0xff000000 ) | + ((x << 8) & 0x00ff0000 ) | + ((x >> 8) & 0x0000ff00 ) | + ((x >> 24) & 0x000000ff );} +#endif + + +//************************************** +// Constants +//************************************** +#define PRIME32_1 2654435761U +#define PRIME32_2 2246822519U +#define PRIME32_3 3266489917U +#define PRIME32_4 668265263U +#define PRIME32_5 374761393U + + +//************************************** +// Architecture Macros +//************************************** +typedef enum { XXH_bigEndian=0, XXH_littleEndian=1 } XXH_endianess; +#ifndef XXH_CPU_LITTLE_ENDIAN // It is possible to define XXH_CPU_LITTLE_ENDIAN externally, for example using a compiler switch + static const int one = 1; +# define XXH_CPU_LITTLE_ENDIAN (*(char*)(&one)) +#endif + + +//************************************** +// Macros +//************************************** +#define XXH_STATIC_ASSERT(c) { enum { XXH_static_assert = 1/(!!(c)) }; } // use only *after* variable declarations + + +//**************************** +// Memory reads +//**************************** +typedef enum { XXH_aligned, XXH_unaligned } XXH_alignment; + +FORCE_INLINE U32 XXH_readLE32_align(const U32* ptr, XXH_endianess endian, XXH_alignment align) +{ + if (align==XXH_unaligned) + return endian==XXH_littleEndian ? A32(ptr) : XXH_swap32(A32(ptr)); + else + return endian==XXH_littleEndian ? 
*ptr : XXH_swap32(*ptr);
+}
+
+FORCE_INLINE U32 XXH_readLE32(const U32* ptr, XXH_endianess endian) { return XXH_readLE32_align(ptr, endian, XXH_unaligned); }
+
+
+//****************************
+// Simple Hash Functions
+//****************************
+FORCE_INLINE U32 XXH32_endian_align(const void* input, int len, U32 seed, XXH_endianess endian, XXH_alignment align)
+{
+    const BYTE* p = (const BYTE*)input;
+    const BYTE* const bEnd = p + len;
+    U32 h32;
+
+#ifdef XXH_ACCEPT_NULL_INPUT_POINTER
+    if (p==NULL) { len=0; p=(const BYTE*)(size_t)16; }
+#endif
+
+    if (len>=16)
+    {
+        const BYTE* const limit = bEnd - 16;
+        U32 v1 = seed + PRIME32_1 + PRIME32_2;
+        U32 v2 = seed + PRIME32_2;
+        U32 v3 = seed + 0;
+        U32 v4 = seed - PRIME32_1;
+
+        do
+        {
+            v1 += XXH_readLE32_align((const U32*)p, endian, align) * PRIME32_2; v1 = XXH_rotl32(v1, 13); v1 *= PRIME32_1; p+=4;
+            v2 += XXH_readLE32_align((const U32*)p, endian, align) * PRIME32_2; v2 = XXH_rotl32(v2, 13); v2 *= PRIME32_1; p+=4;
+            v3 += XXH_readLE32_align((const U32*)p, endian, align) * PRIME32_2; v3 = XXH_rotl32(v3, 13); v3 *= PRIME32_1; p+=4;
+            v4 += XXH_readLE32_align((const U32*)p, endian, align) * PRIME32_2; v4 = XXH_rotl32(v4, 13); v4 *= PRIME32_1; p+=4;
+        } while (p<=limit);
+
+        h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18);
+    }
+    else
+    {
+        h32 = seed + PRIME32_5;
+    }
+
+    h32 += (U32) len;
+
+    while (p<=bEnd-4)
+    {
+        h32 += XXH_readLE32_align((const U32*)p, endian, align) * PRIME32_3;
+        h32 = XXH_rotl32(h32, 17) * PRIME32_4 ;
+        p+=4;
+    }
+
+    while (p<bEnd)
+    {
+        h32 += (*p) * PRIME32_5;
+        h32 = XXH_rotl32(h32, 11) * PRIME32_1 ;
+        p++;
+    }
+
+    h32 ^= h32 >> 15;
+    h32 *= PRIME32_2;
+    h32 ^= h32 >> 13;
+    h32 *= PRIME32_3;
+    h32 ^= h32 >> 16;
+
+    return h32;
+}
+
+
+U32 XXH32(const void* input, int len, U32 seed)
+{
+#if 0
+    // Simple version, good for code maintenance, but unfortunately slow for small inputs
+    void* state = XXH32_init(seed);
+    XXH32_update(state, input, len);
+    return XXH32_digest(state);
+#else
+    XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
+
+#  if !defined(XXH_USE_UNALIGNED_ACCESS)
+    if ((((size_t)input) & 3) == 0)   // Input is aligned, let's leverage the speed advantage
+    {
+        if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
+            return XXH32_endian_align(input, len, seed, XXH_littleEndian, XXH_aligned);
+        else
+            return XXH32_endian_align(input, len, seed, XXH_bigEndian, XXH_aligned);
+    }
+#  endif
+
+    if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
+        return XXH32_endian_align(input, len, seed, XXH_littleEndian, XXH_unaligned);
+    else
+        return XXH32_endian_align(input, len, seed, XXH_bigEndian, XXH_unaligned);
+#endif
+}
+
+
+//****************************
+// Advanced Hash Functions
+//****************************
+
+struct XXH_state32_t
+{
+    U64 total_len;
+    U32 seed;
+    U32 v1;
+    U32 v2;
+    U32 v3;
+    U32 v4;
+    int memsize;
+    char memory[16];
+};
+
+
+int XXH32_sizeofState()
+{
+    XXH_STATIC_ASSERT(XXH32_SIZEOFSTATE >= sizeof(struct XXH_state32_t));   // A compilation error here means XXH32_SIZEOFSTATE is not large enough
+    return sizeof(struct XXH_state32_t);
+}
+
+
+XXH_errorcode XXH32_resetState(void* state_in, U32 seed)
+{
+    struct XXH_state32_t * state = (struct XXH_state32_t *) state_in;
+    state->seed = seed;
+    state->v1 = seed + PRIME32_1 + PRIME32_2;
+    state->v2 = seed + PRIME32_2;
+    state->v3 = seed + 0;
+    state->v4 = seed - PRIME32_1;
+    state->total_len = 0;
+    state->memsize = 0;
+    return XXH_OK;
+}
+
+
+void* XXH32_init (U32 seed)
+{
+    void* state = XXH_malloc (sizeof(struct XXH_state32_t));
+    XXH32_resetState(state, seed);
return state;
+}
+
+
+FORCE_INLINE XXH_errorcode XXH32_update_endian (void* state_in, const void* input, int len, XXH_endianess endian)
+{
+    struct XXH_state32_t * state = (struct XXH_state32_t *) state_in;
+    const BYTE* p = (const BYTE*)input;
+    const BYTE* const bEnd = p + len;
+
+#ifdef XXH_ACCEPT_NULL_INPUT_POINTER
+    if (input==NULL) return XXH_ERROR;
+#endif
+
+    state->total_len += len;
+
+    if (state->memsize + len < 16)   // fill in tmp buffer
+    {
+        XXH_memcpy(state->memory + state->memsize, input, len);
+        state->memsize += len;
+        return XXH_OK;
+    }
+
+    if (state->memsize)   // some data left from previous update
+    {
+        XXH_memcpy(state->memory + state->memsize, input, 16-state->memsize);
+        {
+            const U32* p32 = (const U32*)state->memory;
+            state->v1 += XXH_readLE32(p32, endian) * PRIME32_2; state->v1 = XXH_rotl32(state->v1, 13); state->v1 *= PRIME32_1; p32++;
+            state->v2 += XXH_readLE32(p32, endian) * PRIME32_2; state->v2 = XXH_rotl32(state->v2, 13); state->v2 *= PRIME32_1; p32++;
+            state->v3 += XXH_readLE32(p32, endian) * PRIME32_2; state->v3 = XXH_rotl32(state->v3, 13); state->v3 *= PRIME32_1; p32++;
+            state->v4 += XXH_readLE32(p32, endian) * PRIME32_2; state->v4 = XXH_rotl32(state->v4, 13); state->v4 *= PRIME32_1; p32++;
+        }
+        p += 16-state->memsize;
+        state->memsize = 0;
+    }
+
+    if (p <= bEnd-16)
+    {
+        const BYTE* const limit = bEnd - 16;
+        U32 v1 = state->v1;
+        U32 v2 = state->v2;
+        U32 v3 = state->v3;
+        U32 v4 = state->v4;
+
+        do
+        {
+            v1 += XXH_readLE32((const U32*)p, endian) * PRIME32_2; v1 = XXH_rotl32(v1, 13); v1 *= PRIME32_1; p+=4;
+            v2 += XXH_readLE32((const U32*)p, endian) * PRIME32_2; v2 = XXH_rotl32(v2, 13); v2 *= PRIME32_1; p+=4;
+            v3 += XXH_readLE32((const U32*)p, endian) * PRIME32_2; v3 = XXH_rotl32(v3, 13); v3 *= PRIME32_1; p+=4;
+            v4 += XXH_readLE32((const U32*)p, endian) * PRIME32_2; v4 = XXH_rotl32(v4, 13); v4 *= PRIME32_1; p+=4;
+        } while (p<=limit);
+
+        state->v1 = v1;
+        state->v2 = v2;
+        state->v3 = v3;
+        state->v4 = v4;
+    }
+
+    if (p < bEnd)
+    {
+        XXH_memcpy(state->memory, p, bEnd-p);
+        state->memsize = (int)(bEnd-p);
+    }
+
+    return XXH_OK;
+}
+
+XXH_errorcode XXH32_update (void* state_in, const void* input, int len)
+{
+    XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
+
+    if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
+        return XXH32_update_endian(state_in, input, len, XXH_littleEndian);
+    else
+        return XXH32_update_endian(state_in, input, len, XXH_bigEndian);
+}
+
+
+
+FORCE_INLINE U32 XXH32_intermediateDigest_endian (void* state_in, XXH_endianess endian)
+{
+    struct XXH_state32_t * state = (struct XXH_state32_t *) state_in;
+    const BYTE * p = (const BYTE*)state->memory;
+    BYTE* bEnd = (BYTE*)state->memory + state->memsize;
+    U32 h32;
+
+    if (state->total_len >= 16)
+    {
+        h32 = XXH_rotl32(state->v1, 1) + XXH_rotl32(state->v2, 7) + XXH_rotl32(state->v3, 12) + XXH_rotl32(state->v4, 18);
+    }
+    else
+    {
+        h32 = state->seed + PRIME32_5;
+    }
+
+    h32 += (U32) state->total_len;
+
+    while (p<=bEnd-4)
+    {
+        h32 += XXH_readLE32((const U32*)p, endian) * PRIME32_3;
+        h32 = XXH_rotl32(h32, 17) * PRIME32_4;
+        p+=4;
+    }
+
+    while (p<bEnd)
+    {
+        h32 += (*p) * PRIME32_5;
+        h32 = XXH_rotl32(h32, 11) * PRIME32_1;
+        p++;
+    }
+
+    h32 ^= h32 >> 15;
+    h32 *= PRIME32_2;
+    h32 ^= h32 >> 13;
+    h32 *= PRIME32_3;
+    h32 ^= h32 >> 16;
+
+    return h32;
+}
+
+
+U32 XXH32_intermediateDigest (void* state_in)
+{
+    XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
+
+    if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
+        return XXH32_intermediateDigest_endian(state_in, XXH_littleEndian);
+    else
+        return
+
+
+U32 XXH32_digest (void* state_in)
+{
+    U32 h32 = XXH32_intermediateDigest(state_in);
+
+    XXH_free(state_in);
+
+    return h32;
+}
diff --git a/util/xxhash.h b/util/xxhash.h
new file mode 100644
index 0000000000..ceff06677b
--- /dev/null
+++ b/util/xxhash.h
@@ -0,0 +1,164 @@
+/*
+   xxHash - Fast Hash algorithm
+   Header File
+   Copyright (C) 2012-2014, Yann Collet.
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - xxHash source repository : http://code.google.com/p/xxhash/
+*/
+
+/* Notice extracted from xxHash homepage :
+
+xxHash is an extremely fast Hash algorithm, running at RAM speed limits.
+It also successfully passes all tests from the SMHasher suite.
+
+Comparison (single thread, Windows Seven 32 bits, using SMHasher on a Core 2 Duo @3GHz)
+
+Name            Speed       Q.Score   Author
+xxHash          5.4 GB/s     10
+CrapWow         3.2 GB/s      2       Andrew
+MurmurHash 3a   2.7 GB/s     10       Austin Appleby
+SpookyHash      2.0 GB/s     10       Bob Jenkins
+SBox            1.4 GB/s      9       Bret Mulvey
+Lookup3         1.2 GB/s      9       Bob Jenkins
+SuperFastHash   1.2 GB/s      1       Paul Hsieh
+CityHash64      1.05 GB/s    10       Pike & Alakuijala
+FNV             0.55 GB/s     5       Fowler, Noll, Vo
+CRC32           0.43 GB/s     9
+MD5-32          0.33 GB/s    10       Ronald L. Rivest
+SHA1-32         0.28 GB/s    10
+
+Q.Score is a measure of quality of the hash function.
+It depends on successfully passing SMHasher test set.
+10 is a perfect score.
+*/
+
+#pragma once
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+//****************************
+// Type
+//****************************
+typedef enum { XXH_OK=0, XXH_ERROR } XXH_errorcode;
+
+
+
+//****************************
+// Simple Hash Functions
+//****************************
+
+unsigned int XXH32 (const void* input, int len, unsigned int seed);
+
+/*
+XXH32() :
+    Calculate the 32-bits hash of sequence of length "len" stored at memory address "input".
+    The memory between input & input+len must be valid (allocated and read-accessible).
+    "seed" can be used to alter the result predictably.
+    This function successfully passes all SMHasher tests.
+    Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark) : 5.4 GB/s
+    Note that "len" is type "int", which means it is limited to 2^31-1.
+    If your data is larger, use the advanced functions below.
+*/
+
+
+
+//****************************
+// Advanced Hash Functions
+//****************************
+
+void* XXH32_init (unsigned int seed);
+XXH_errorcode XXH32_update (void* state, const void* input, int len);
+unsigned int XXH32_digest (void* state);
+
+/*
+These functions calculate the xxhash of an input provided in several small packets,
+as opposed to an input provided as a single block.
+
+It must be started with :
+void* XXH32_init()
+The function returns a pointer which holds the state of calculation.
+
+This pointer must be provided as "void* state" parameter for XXH32_update().
+XXH32_update() can be called as many times as necessary.
+The user must provide a valid (allocated) input.
+The function returns an error code, with 0 meaning OK, and any other value meaning there is an error.
+Note that "len" is type "int", which means it is limited to 2^31-1.
+If your data is larger, it is recommended to chunk your data into blocks
+of size for example 2^30 (1GB) to avoid any "int" overflow issue.
+
+Finally, you can end the calculation anytime, by using XXH32_digest().
+This function returns the final 32-bits hash.
+You must provide the same "void* state" parameter created by XXH32_init().
+Memory will be freed by XXH32_digest().
+*/
+
+
+int XXH32_sizeofState();
+XXH_errorcode XXH32_resetState(void* state, unsigned int seed);
+
+#define XXH32_SIZEOFSTATE 48
+typedef struct { long long ll[(XXH32_SIZEOFSTATE+(sizeof(long long)-1))/sizeof(long long)]; } XXH32_stateSpace_t;
+/*
+These functions allow user application to make its own allocation for state.
+
+XXH32_sizeofState() is used to know how much space must be allocated for the xxHash 32-bits state.
+Note that the state must be aligned to access 'long long' fields. Memory must be allocated and referenced by a pointer.
+This pointer must then be provided as 'state' into XXH32_resetState(), which initializes the state.
+
+For static allocation purposes (such as allocation on stack, or freestanding systems without malloc()),
+use the structure XXH32_stateSpace_t, which will ensure that memory space is large enough and correctly aligned to access 'long long' fields.
+*/
+
+
+unsigned int XXH32_intermediateDigest (void* state);
+/*
+This function does the same as XXH32_digest(), generating a 32-bit hash,
+but preserves the memory context.
+This way, it becomes possible to generate intermediate hashes, and then continue feeding data with XXH32_update().
+To free the memory context, use XXH32_digest(), or free().
+*/
+
+
+
+//****************************
+// Deprecated function names
+//****************************
+// The following translations are provided to ease code transition.
+// You are encouraged to no longer use these function names.
+#define XXH32_feed   XXH32_update
+#define XXH32_result XXH32_digest
+#define XXH32_getIntermediateResult XXH32_intermediateDigest
+
+
+
+#if defined (__cplusplus)
+}
+#endif
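Since "len" is a 32-bit int, the advanced-functions comment above recommends
feeding oversized inputs through the streaming API in bounded chunks. A
minimal sketch of that advice, combined with the stack-allocated
XXH32_stateSpace_t (the function name and 1GB chunk size are illustrative):

    #include "util/xxhash.h"

    unsigned int HashLargeBuffer(const char* data, unsigned long long n) {
      XXH32_stateSpace_t space;               // stack state, no malloc()/free()
      XXH32_resetState(&space, 0 /* seed */);
      const unsigned long long kChunk = 1ULL << 30;  // 1GB, well under 2^31-1
      while (n > 0) {
        int len = (n > kChunk) ? (int)kChunk : (int)n;
        XXH32_update(&space, data, len);
        data += len;
        n -= (unsigned long long)len;
      }
      // XXH32_digest() would free() the state, which must not happen for a
      // stack-allocated state, so use the non-freeing variant instead.
      return XXH32_intermediateDigest(&space);
    }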
diff --git a/utilities/backupable/backupable_db.cc b/utilities/backupable/backupable_db.cc
new file mode 100644
index 0000000000..3ac1d90a1a
--- /dev/null
+++ b/utilities/backupable/backupable_db.cc
@@ -0,0 +1,1306 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef ROCKSDB_LITE
+
+#include "utilities/backupable_db.h"
+#include "db/filename.h"
+#include "util/coding.h"
+#include "util/crc32c.h"
+#include "rocksdb/transaction_log.h"
+
+#define __STDC_FORMAT_MACROS
+
+#include <inttypes.h>
+#include <algorithm>
+#include <vector>
+#include <map>
+#include <string>
+#include <limits>
+#include <atomic>
+#include <unordered_map>
+
+namespace rocksdb {
+
+namespace {
+class RateLimiter {
+ public:
+  RateLimiter(Env* env, uint64_t max_bytes_per_second, uint64_t bytes_per_check)
+      : env_(env),
+        max_bytes_per_second_(max_bytes_per_second),
+        bytes_per_check_(bytes_per_check),
+        micros_start_time_(env->NowMicros()),
+        bytes_since_start_(0) {}
+
+  void ReportAndWait(uint64_t bytes_since_last_call) {
+    bytes_since_start_ += bytes_since_last_call;
+    if (bytes_since_start_ < bytes_per_check_) {
+      // not enough bytes to be rate-limited
+      return;
+    }
+
+    uint64_t now = env_->NowMicros();
+    uint64_t interval = now - micros_start_time_;
+    uint64_t should_take_micros =
+        (bytes_since_start_ * kMicrosInSecond) / max_bytes_per_second_;
+
+    if (should_take_micros > interval) {
+      env_->SleepForMicroseconds(should_take_micros - interval);
+      now = env_->NowMicros();
+    }
+    // reset interval
+    micros_start_time_ = now;
+    bytes_since_start_ = 0;
+  }
+
+ private:
+  Env* env_;
+  uint64_t max_bytes_per_second_;
+  uint64_t bytes_per_check_;
+  uint64_t micros_start_time_;
+  uint64_t bytes_since_start_;
+  static const uint64_t kMicrosInSecond = 1000 * 1000LL;
+};
+}  // namespace
+
+void BackupableDBOptions::Dump(Logger* logger) const {
+  Log(logger, "        Options.backup_dir: %s", backup_dir.c_str());
+  Log(logger, "        Options.backup_env: %p", backup_env);
+  Log(logger, " Options.share_table_files: %d",
+      static_cast<int>(share_table_files));
+  Log(logger, "          Options.info_log: %p", info_log);
+  Log(logger, "              Options.sync: %d", static_cast<int>(sync));
+  Log(logger, "  Options.destroy_old_data: %d",
+      static_cast<int>(destroy_old_data));
+  Log(logger, "  Options.backup_log_files: %d",
+      static_cast<int>(backup_log_files));
+  Log(logger, " Options.backup_rate_limit: %" PRIu64, backup_rate_limit);
+  Log(logger, "Options.restore_rate_limit: %" PRIu64, restore_rate_limit);
+}
+
+// -------- BackupEngineImpl class ---------
+class BackupEngineImpl : public BackupEngine {
+ public:
+  BackupEngineImpl(Env* db_env, const BackupableDBOptions& options,
+                   bool read_only = false);
+  ~BackupEngineImpl();
+  Status CreateNewBackup(DB* db, bool flush_before_backup = false);
+  Status PurgeOldBackups(uint32_t num_backups_to_keep);
+  Status DeleteBackup(BackupID backup_id);
+  void StopBackup() {
+    stop_backup_.store(true, std::memory_order_release);
+  }
+
+  void GetBackupInfo(std::vector<BackupInfo>* backup_info);
+  Status RestoreDBFromBackup(BackupID backup_id, const std::string& db_dir,
+                             const std::string& wal_dir,
+                             const RestoreOptions& restore_options =
+                                 RestoreOptions());
+  Status RestoreDBFromLatestBackup(const std::string& db_dir,
+                                   const std::string& wal_dir,
+                                   const RestoreOptions& restore_options =
+                                       RestoreOptions()) {
+    return RestoreDBFromBackup(latest_backup_id_, db_dir, wal_dir,
+                               restore_options);
+  }
+
+ private:
+  void DeleteChildren(const std::string& dir, uint32_t file_type_filter = 0);
+
+  struct FileInfo {
+    FileInfo(const std::string& fname, uint64_t sz, uint32_t checksum)
+      : refs(0), filename(fname), size(sz), checksum_value(checksum) {}
+
+    int refs;
+    const std::string filename;
+    const uint64_t size;
+    uint32_t checksum_value;
+  };
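// A sketch of how FileInfo::refs evolves when two backups share one table
// file (file name and sequence are illustrative):
//   backup 1: AddFile("shared/00010.sst")  -> refs == 1  (inserted)
//   backup 2: AddFile("shared/00010.sst")  -> refs == 2  (reused, not copied)
//   backup 1 deleted                       -> refs == 1
//   backup 2 deleted                       -> refs == 0; GarbageCollection()
//                                             may now unlink the file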
+
+  class BackupMeta {
+   public:
+    BackupMeta(const std::string& meta_filename,
+               std::unordered_map<std::string, FileInfo>* file_infos, Env* env)
+      : timestamp_(0), size_(0), meta_filename_(meta_filename),
+        file_infos_(file_infos), env_(env) {}
+
+    ~BackupMeta() {}
+
+    void RecordTimestamp() {
+      env_->GetCurrentTime(&timestamp_);
+    }
+    int64_t GetTimestamp() const {
+      return timestamp_;
+    }
+    uint64_t GetSize() const {
+      return size_;
+    }
+    void SetSequenceNumber(uint64_t sequence_number) {
+      sequence_number_ = sequence_number;
+    }
+    uint64_t GetSequenceNumber() {
+      return sequence_number_;
+    }
+
+    Status AddFile(const FileInfo& file_info);
+
+    void Delete(bool delete_meta = true);
+
+    bool Empty() {
+      return files_.empty();
+    }
+
+    const std::vector<std::string>& GetFiles() {
+      return files_;
+    }
+
+    Status LoadFromFile(const std::string& backup_dir);
+    Status StoreToFile(bool sync);
+
+   private:
+    int64_t timestamp_;
+    // sequence number is only approximate, should not be used
+    // by clients
+    uint64_t sequence_number_;
+    uint64_t size_;
+    std::string const meta_filename_;
+    // files with relative paths (without "/" prefix!!)
+    std::vector<std::string> files_;
+    std::unordered_map<std::string, FileInfo>* file_infos_;
+    Env* env_;
+
+    static const size_t max_backup_meta_file_size_ = 10 * 1024 * 1024;  // 10MB
+  };  // BackupMeta
+
+  inline std::string GetAbsolutePath(
+      const std::string &relative_path = "") const {
+    assert(relative_path.size() == 0 || relative_path[0] != '/');
+    return options_.backup_dir + "/" + relative_path;
+  }
+  inline std::string GetPrivateDirRel() const {
+    return "private";
+  }
+  inline std::string GetSharedChecksumDirRel() const {
+    return "shared_checksum";
+  }
+  inline std::string GetPrivateFileRel(BackupID backup_id,
+                                       bool tmp = false,
+                                       const std::string& file = "") const {
+    assert(file.size() == 0 || file[0] != '/');
+    return GetPrivateDirRel() + "/" + std::to_string(backup_id) +
+           (tmp ? ".tmp" : "") + "/" + file;
+  }
+  inline std::string GetSharedFileRel(const std::string& file = "",
+                                      bool tmp = false) const {
+    assert(file.size() == 0 || file[0] != '/');
+    return "shared/" + file + (tmp ? ".tmp" : "");
+  }
+  inline std::string GetSharedFileWithChecksumRel(const std::string& file = "",
+                                                  bool tmp = false) const {
+    assert(file.size() == 0 || file[0] != '/');
+    return GetSharedChecksumDirRel() + "/" + file + (tmp ? ".tmp" : "");
+  }
+  inline std::string GetSharedFileWithChecksum(const std::string& file,
+                                               const uint32_t checksum_value,
+                                               const uint64_t file_size) const {
+    assert(file.size() == 0 || file[0] != '/');
+    std::string file_copy = file;
+    return file_copy.insert(file_copy.find_last_of('.'),
+                            "_" + std::to_string(checksum_value) +
+                            "_" + std::to_string(file_size));
+  }
+  inline std::string GetFileFromChecksumFile(const std::string& file) const {
+    assert(file.size() == 0 || file[0] != '/');
+    std::string file_copy = file;
+    size_t first_underscore = file_copy.find_first_of('_');
+    return file_copy.erase(first_underscore,
+                           file_copy.find_last_of('.') - first_underscore);
+  }
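// A worked example of the shared_checksum naming scheme above (checksum and
// size values are illustrative):
//   GetSharedFileWithChecksum("00010.sst", 3905966371, 92834)
//       -> "00010_3905966371_92834.sst"   (inserted before the extension)
//   GetFileFromChecksumFile("00010_3905966371_92834.sst")
//       -> "00010.sst"                    (erases first '_' through last '.')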
+  inline std::string GetLatestBackupFile(bool tmp = false) const {
+    return GetAbsolutePath(std::string("LATEST_BACKUP") + (tmp ? ".tmp" : ""));
+  }
+  inline std::string GetBackupMetaDir() const {
+    return GetAbsolutePath("meta");
+  }
+  inline std::string GetBackupMetaFile(BackupID backup_id) const {
+    return GetBackupMetaDir() + "/" + std::to_string(backup_id);
+  }
+
+  Status GetLatestBackupFileContents(uint32_t* latest_backup);
+  Status PutLatestBackupFileContents(uint32_t latest_backup);
+  // if size_limit == 0, there is no size limit, copy everything
+  Status CopyFile(const std::string& src,
+                  const std::string& dst,
+                  Env* src_env,
+                  Env* dst_env,
+                  bool sync,
+                  RateLimiter* rate_limiter,
+                  uint64_t* size = nullptr,
+                  uint32_t* checksum_value = nullptr,
+                  uint64_t size_limit = 0);
+  // if size_limit == 0, there is no size limit, copy everything
+  Status BackupFile(BackupID backup_id,
+                    BackupMeta* backup,
+                    bool shared,
+                    const std::string& src_dir,
+                    const std::string& src_fname,  // starts with "/"
+                    RateLimiter* rate_limiter,
+                    uint64_t size_limit = 0,
+                    bool shared_checksum = false);
+
+  Status CalculateChecksum(const std::string& src,
+                           Env* src_env,
+                           uint64_t size_limit,
+                           uint32_t* checksum_value);
+
+  // Will delete all the files we don't need anymore
+  // If full_scan == true, it will do the full scan of files/ directory
+  // and delete all the files that are not referenced from backuped_file_infos_
+  void GarbageCollection(bool full_scan);
+
+  // backup state data
+  BackupID latest_backup_id_;
+  std::map<BackupID, BackupMeta> backups_;
+  std::unordered_map<std::string, FileInfo> backuped_file_infos_;
+  std::vector<BackupID> obsolete_backups_;
+  std::atomic<bool> stop_backup_;
+
+  // options data
+  BackupableDBOptions options_;
+  Env* db_env_;
+  Env* backup_env_;
+
+  // directories
+  unique_ptr<Directory> backup_directory_;
+  unique_ptr<Directory> shared_directory_;
+  unique_ptr<Directory> meta_directory_;
+  unique_ptr<Directory> private_directory_;
+
+  static const size_t kDefaultCopyFileBufferSize = 5 * 1024 * 1024LL;  // 5MB
+  size_t copy_file_buffer_size_;
+  bool read_only_;
+};
+
+BackupEngine* BackupEngine::NewBackupEngine(
+    Env* db_env, const BackupableDBOptions& options) {
+  return new BackupEngineImpl(db_env, options);
+}
+
+BackupEngineImpl::BackupEngineImpl(Env* db_env,
+                                   const BackupableDBOptions& options,
+                                   bool read_only)
+    : stop_backup_(false),
+      options_(options),
+      db_env_(db_env),
+      backup_env_(options.backup_env != nullptr ? options.backup_env
+                                                : db_env_),
+      copy_file_buffer_size_(kDefaultCopyFileBufferSize),
+      read_only_(read_only) {
+  if (read_only_) {
+    Log(options_.info_log, "Starting read_only backup engine");
+  }
+  options_.Dump(options_.info_log);
+
+  if (!read_only_) {
+    // create all the dirs we need
+    backup_env_->CreateDirIfMissing(GetAbsolutePath());
+    backup_env_->NewDirectory(GetAbsolutePath(), &backup_directory_);
+    if (options_.share_table_files) {
+      if (options_.share_files_with_checksum) {
+        backup_env_->CreateDirIfMissing(GetAbsolutePath(
+            GetSharedChecksumDirRel()));
+        backup_env_->NewDirectory(GetAbsolutePath(
+            GetSharedChecksumDirRel()), &shared_directory_);
+      } else {
+        backup_env_->CreateDirIfMissing(GetAbsolutePath(GetSharedFileRel()));
+        backup_env_->NewDirectory(GetAbsolutePath(GetSharedFileRel()),
+                                  &shared_directory_);
+      }
+    }
+    backup_env_->CreateDirIfMissing(GetAbsolutePath(GetPrivateDirRel()));
+    backup_env_->NewDirectory(GetAbsolutePath(GetPrivateDirRel()),
+                              &private_directory_);
+    backup_env_->CreateDirIfMissing(GetBackupMetaDir());
+    backup_env_->NewDirectory(GetBackupMetaDir(), &meta_directory_);
+  }
+
+  std::vector<std::string> backup_meta_files;
+  backup_env_->GetChildren(GetBackupMetaDir(), &backup_meta_files);
+  // create backups_ structure
+  for (auto& file : backup_meta_files) {
+    BackupID backup_id = 0;
+    sscanf(file.c_str(), "%u", &backup_id);
+    if (backup_id == 0 || file != std::to_string(backup_id)) {
+      if (!read_only_) {
+        // invalid file name, delete that
+        backup_env_->DeleteFile(GetBackupMetaDir() + "/" + file);
+      }
+      continue;
+    }
+    assert(backups_.find(backup_id) == backups_.end());
+    backups_.insert(std::make_pair(
+        backup_id, BackupMeta(GetBackupMetaFile(backup_id),
+                              &backuped_file_infos_, backup_env_)));
+  }
+
+  if (options_.destroy_old_data) {  // Destroy old data
+    assert(!read_only_);
+    for (auto& backup : backups_) {
+      backup.second.Delete();
+      obsolete_backups_.push_back(backup.first);
+    }
+    backups_.clear();
+    // start from beginning
+    latest_backup_id_ = 0;
+    // GarbageCollection() will do the actual deletion
+  } else {  // Load data from storage
+    // load the backups if any
+    for (auto& backup : backups_) {
+      Status s = backup.second.LoadFromFile(options_.backup_dir);
+      if (!s.ok()) {
+        Log(options_.info_log, "Backup %u corrupted -- %s", backup.first,
+            s.ToString().c_str());
+        if (!read_only_) {
+          Log(options_.info_log, "-> Deleting backup %u", backup.first);
+        }
+        backup.second.Delete(!read_only_);
+        obsolete_backups_.push_back(backup.first);
+      }
+    }
+    // delete obsolete backups from the structure
+    for (auto ob : obsolete_backups_) {
+      backups_.erase(ob);
+    }
+
+    Status s = GetLatestBackupFileContents(&latest_backup_id_);
+
+    // If latest backup file is corrupted or non-existent
+    // set latest backup as the biggest backup we have
+    // or 0 if we have no backups
+    if (!s.ok() ||
+        backups_.find(latest_backup_id_) == backups_.end()) {
+      auto itr = backups_.end();
+      latest_backup_id_ = (itr == backups_.begin()) ? 0 : (--itr)->first;
+    }
+  }
+
+  // delete any backups that claim to be later than latest
+  for (auto itr = backups_.upper_bound(latest_backup_id_);
+       itr != backups_.end();) {
+    itr->second.Delete();
+    obsolete_backups_.push_back(itr->first);
+    itr = backups_.erase(itr);
+  }
+
+  if (!read_only_) {
+    PutLatestBackupFileContents(latest_backup_id_);  // Ignore errors
+    GarbageCollection(true);
+  }
+  Log(options_.info_log, "Initialized BackupEngine, the latest backup is %u.",
+      latest_backup_id_);
+}
+
+BackupEngineImpl::~BackupEngineImpl() { LogFlush(options_.info_log); }
+
+Status BackupEngineImpl::CreateNewBackup(DB* db, bool flush_before_backup) {
+  assert(!read_only_);
+  Status s;
+  std::vector<std::string> live_files;
+  VectorLogPtr live_wal_files;
+  uint64_t manifest_file_size = 0;
+  uint64_t sequence_number = db->GetLatestSequenceNumber();
+
+  s = db->DisableFileDeletions();
+  if (s.ok()) {
+    // this will return live_files prefixed with "/"
+    s = db->GetLiveFiles(live_files, &manifest_file_size, flush_before_backup);
+  }
+  // if we didn't flush before backup, we need to also get WAL files
+  if (s.ok() && !flush_before_backup && options_.backup_log_files) {
+    // returns file names prefixed with "/"
+    s = db->GetSortedWalFiles(live_wal_files);
+  }
+  if (!s.ok()) {
+    db->EnableFileDeletions(false);
+    return s;
+  }
+
+  BackupID new_backup_id = latest_backup_id_ + 1;
+  assert(backups_.find(new_backup_id) == backups_.end());
+  auto ret = backups_.insert(std::make_pair(
+      new_backup_id, BackupMeta(GetBackupMetaFile(new_backup_id),
+                                &backuped_file_infos_, backup_env_)));
+  assert(ret.second == true);
+  auto& new_backup = ret.first->second;
+  new_backup.RecordTimestamp();
+  new_backup.SetSequenceNumber(sequence_number);
+
+  Log(options_.info_log, "Started the backup process -- creating backup %u",
+      new_backup_id);
+
+  // create temporary private dir
+  s = backup_env_->CreateDir(
+      GetAbsolutePath(GetPrivateFileRel(new_backup_id, true)));
+
+  unique_ptr<RateLimiter> rate_limiter;
+  if (options_.backup_rate_limit > 0) {
+    copy_file_buffer_size_ = options_.backup_rate_limit / 10;
+    rate_limiter.reset(new RateLimiter(db_env_, options_.backup_rate_limit,
+                                       copy_file_buffer_size_));
+  }
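// With a backup rate limit configured, the copy buffer is sized to a tenth
// of the per-second byte budget, so the limiter is consulted roughly ten
// times per second. Illustrative numbers: backup_rate_limit = 10 MB/s gives
// copy_file_buffer_size_ = 1 MB; after each 1 MB chunk, ReportAndWait()
// compares elapsed micros against
// bytes_since_start_ * 1000000 / max_bytes_per_second_
// and sleeps off any surplus.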
+
+  // copy live_files
+  for (size_t i = 0; s.ok() && i < live_files.size(); ++i) {
+    uint64_t number;
+    FileType type;
+    bool ok = ParseFileName(live_files[i], &number, &type);
+    if (!ok) {
+      assert(false);
+      return Status::Corruption("Can't parse file name. This is very bad");
+    }
+    // we should only get sst, manifest and current files here
+    assert(type == kTableFile || type == kDescriptorFile ||
+           type == kCurrentFile);
+
+    // rules:
+    // * if it's kTableFile, then it's shared
+    // * if it's kDescriptorFile, limit the size to manifest_file_size
+    s = BackupFile(new_backup_id,
+                   &new_backup,
+                   options_.share_table_files && type == kTableFile,
+                   db->GetName(),  /* src_dir */
+                   live_files[i],  /* src_fname */
+                   rate_limiter.get(),
+                   (type == kDescriptorFile) ? manifest_file_size : 0,
+                   options_.share_files_with_checksum && type == kTableFile);
+  }
+
+  // copy WAL files
+  for (size_t i = 0; s.ok() && i < live_wal_files.size(); ++i) {
+    if (live_wal_files[i]->Type() == kAliveLogFile) {
+      // we only care about live log files
+      // copy the file into backup_dir/files/<new backup>/
+      s = BackupFile(new_backup_id,
+                     &new_backup,
+                     false, /* not shared */
+                     db->GetOptions().wal_dir,
+                     live_wal_files[i]->PathName(),
+                     rate_limiter.get());
+    }
+  }
+
+  // we copied all the files, enable file deletions
+  db->EnableFileDeletions(false);
+
+  if (s.ok()) {
+    // move tmp private backup to real backup folder
+    s = backup_env_->RenameFile(
+        GetAbsolutePath(GetPrivateFileRel(new_backup_id, true)),  // tmp
+        GetAbsolutePath(GetPrivateFileRel(new_backup_id, false)));
+  }
+
+  if (s.ok()) {
+    // persist the backup metadata on the disk
+    s = new_backup.StoreToFile(options_.sync);
+  }
+  if (s.ok()) {
+    // install the newly created backup meta! (atomic)
+    s = PutLatestBackupFileContents(new_backup_id);
+  }
+  if (s.ok() && options_.sync) {
+    unique_ptr<Directory> backup_private_directory;
+    backup_env_->NewDirectory(
+        GetAbsolutePath(GetPrivateFileRel(new_backup_id, false)),
+        &backup_private_directory);
+    if (backup_private_directory != nullptr) {
+      backup_private_directory->Fsync();
+    }
+    if (private_directory_ != nullptr) {
+      private_directory_->Fsync();
+    }
+    if (meta_directory_ != nullptr) {
+      meta_directory_->Fsync();
+    }
+    if (shared_directory_ != nullptr) {
+      shared_directory_->Fsync();
+    }
+    if (backup_directory_ != nullptr) {
+      backup_directory_->Fsync();
+    }
+  }
+
+  if (!s.ok()) {
+    // clean all the files we might have created
+    Log(options_.info_log, "Backup failed -- %s", s.ToString().c_str());
+    backups_.erase(new_backup_id);
+    GarbageCollection(true);
+    return s;
+  }
+
+  // here we know that we succeeded and installed the new backup
+  // in the LATEST_BACKUP file
+  latest_backup_id_ = new_backup_id;
+  Log(options_.info_log, "Backup DONE. All is good");
+  return s;
+}
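// A minimal usage sketch of the flow above, driven through the public
// BackupableDB wrapper defined at the bottom of this file (paths and options
// are illustrative, error handling trimmed):
//
//     DB* db = nullptr;
//     Options options;
//     options.create_if_missing = true;
//     Status s = DB::Open(options, "/tmp/db", &db);
//     assert(s.ok());
//     // BackupableDB takes over ownership of db
//     BackupableDB bdb(db, BackupableDBOptions("/tmp/db_backups"));
//     s = bdb.CreateNewBackup(true /* flush_before_backup */);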
All is good"); + return s; +} + +Status BackupEngineImpl::PurgeOldBackups(uint32_t num_backups_to_keep) { + assert(!read_only_); + Log(options_.info_log, "Purging old backups, keeping %u", + num_backups_to_keep); + while (num_backups_to_keep < backups_.size()) { + Log(options_.info_log, "Deleting backup %u", backups_.begin()->first); + backups_.begin()->second.Delete(); + obsolete_backups_.push_back(backups_.begin()->first); + backups_.erase(backups_.begin()); + } + GarbageCollection(false); + return Status::OK(); +} + +Status BackupEngineImpl::DeleteBackup(BackupID backup_id) { + assert(!read_only_); + Log(options_.info_log, "Deleting backup %u", backup_id); + auto backup = backups_.find(backup_id); + if (backup == backups_.end()) { + return Status::NotFound("Backup not found"); + } + backup->second.Delete(); + obsolete_backups_.push_back(backup_id); + backups_.erase(backup); + GarbageCollection(false); + return Status::OK(); +} + +void BackupEngineImpl::GetBackupInfo(std::vector* backup_info) { + backup_info->reserve(backups_.size()); + for (auto& backup : backups_) { + if (!backup.second.Empty()) { + backup_info->push_back(BackupInfo( + backup.first, backup.second.GetTimestamp(), backup.second.GetSize())); + } + } +} + +Status BackupEngineImpl::RestoreDBFromBackup( + BackupID backup_id, const std::string& db_dir, const std::string& wal_dir, + const RestoreOptions& restore_options) { + auto backup_itr = backups_.find(backup_id); + if (backup_itr == backups_.end()) { + return Status::NotFound("Backup not found"); + } + auto& backup = backup_itr->second; + if (backup.Empty()) { + return Status::NotFound("Backup not found"); + } + + Log(options_.info_log, "Restoring backup id %u\n", backup_id); + Log(options_.info_log, "keep_log_files: %d\n", + static_cast(restore_options.keep_log_files)); + + // just in case. Ignore errors + db_env_->CreateDirIfMissing(db_dir); + db_env_->CreateDirIfMissing(wal_dir); + + if (restore_options.keep_log_files) { + // delete files in db_dir, but keep all the log files + DeleteChildren(db_dir, 1 << kLogFile); + // move all the files from archive dir to wal_dir + std::string archive_dir = ArchivalDirectory(wal_dir); + std::vector archive_files; + db_env_->GetChildren(archive_dir, &archive_files); // ignore errors + for (const auto& f : archive_files) { + uint64_t number; + FileType type; + bool ok = ParseFileName(f, &number, &type); + if (ok && type == kLogFile) { + Log(options_.info_log, "Moving log file from archive/ to wal_dir: %s", + f.c_str()); + Status s = + db_env_->RenameFile(archive_dir + "/" + f, wal_dir + "/" + f); + if (!s.ok()) { + // if we can't move log file from archive_dir to wal_dir, + // we should fail, since it might mean data loss + return s; + } + } + } + } else { + DeleteChildren(wal_dir); + DeleteChildren(ArchivalDirectory(wal_dir)); + DeleteChildren(db_dir); + } + + unique_ptr rate_limiter; + if (options_.restore_rate_limit > 0) { + copy_file_buffer_size_ = options_.restore_rate_limit / 10; + rate_limiter.reset(new RateLimiter(db_env_, options_.restore_rate_limit, + copy_file_buffer_size_)); + } + Status s; + for (auto& file : backup.GetFiles()) { + std::string dst; + // 1. extract the filename + size_t slash = file.find_last_of('/'); + // file will either be shared/, shared_checksum/ + // or private// + assert(slash != std::string::npos); + dst = file.substr(slash + 1); + + // if the file was in shared_checksum, extract the real file name + // in this case the file is __. 
+    if (file.substr(0, slash) == GetSharedChecksumDirRel()) {
+      dst = GetFileFromChecksumFile(dst);
+    }
+
+    // 2. find the filetype
+    uint64_t number;
+    FileType type;
+    bool ok = ParseFileName(dst, &number, &type);
+    if (!ok) {
+      return Status::Corruption("Backup corrupted");
+    }
+    // 3. Construct the final path
+    // kLogFile lives in wal_dir and all the rest live in db_dir
+    dst = ((type == kLogFile) ? wal_dir : db_dir) +
+          "/" + dst;
+
+    Log(options_.info_log, "Restoring %s to %s\n", file.c_str(), dst.c_str());
+    uint32_t checksum_value;
+    s = CopyFile(GetAbsolutePath(file), dst, backup_env_, db_env_, false,
+                 rate_limiter.get(), nullptr /* size */, &checksum_value);
+    if (!s.ok()) {
+      break;
+    }
+
+    const auto iter = backuped_file_infos_.find(file);
+    assert(iter != backuped_file_infos_.end());
+    if (iter->second.checksum_value != checksum_value) {
+      s = Status::Corruption("Checksum check failed");
+      break;
+    }
+  }
+
+  Log(options_.info_log, "Restoring done -- %s\n", s.ToString().c_str());
+  return s;
+}
+
+// the LATEST_BACKUP file contains an ASCII representation of the latest
+// backup id
+Status BackupEngineImpl::GetLatestBackupFileContents(uint32_t* latest_backup) {
+  Status s;
+  unique_ptr<SequentialFile> file;
+  s = backup_env_->NewSequentialFile(GetLatestBackupFile(),
+                                     &file,
+                                     EnvOptions());
+  if (!s.ok()) {
+    return s;
+  }
+
+  char buf[11];
+  Slice data;
+  s = file->Read(10, &data, buf);
+  if (!s.ok() || data.size() == 0) {
+    return s.ok() ? Status::Corruption("Latest backup file corrupted") : s;
+  }
+  buf[data.size()] = 0;
+
+  *latest_backup = 0;
+  sscanf(data.data(), "%u", latest_backup);
+  if (backup_env_->FileExists(GetBackupMetaFile(*latest_backup)) == false) {
+    s = Status::Corruption("Latest backup file corrupted");
+  }
+  return s;
+}
+
+// this operation HAS to be atomic
+// writing 4 bytes to the file is atomic alright, but we should *never*
+// do something like 1. delete file, 2. write new file
+// We write to a tmp file and then atomically rename
+Status BackupEngineImpl::PutLatestBackupFileContents(uint32_t latest_backup) {
+  assert(!read_only_);
+  Status s;
+  unique_ptr<WritableFile> file;
+  EnvOptions env_options;
+  env_options.use_mmap_writes = false;
+  s = backup_env_->NewWritableFile(GetLatestBackupFile(true),
+                                   &file,
+                                   env_options);
+  if (!s.ok()) {
+    backup_env_->DeleteFile(GetLatestBackupFile(true));
+    return s;
+  }
+
+  char file_contents[10];
+  int len = sprintf(file_contents, "%u\n", latest_backup);
+  s = file->Append(Slice(file_contents, len));
+  if (s.ok() && options_.sync) {
+    file->Sync();
+  }
+  if (s.ok()) {
+    s = file->Close();
+  }
+  if (s.ok()) {
+    // atomically replace real file with new tmp
+    s = backup_env_->RenameFile(GetLatestBackupFile(true),
+                                GetLatestBackupFile(false));
+  }
+  return s;
+}
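// The write-to-tmp-then-rename idiom above generalizes to any small metadata
// file; a minimal sketch against the Env interface (function and file names
// are illustrative):
//
//     Status AtomicallyWriteFile(Env* env, const std::string& fname,
//                                const Slice& contents) {
//       unique_ptr<WritableFile> f;
//       EnvOptions env_options;
//       env_options.use_mmap_writes = false;
//       Status s = env->NewWritableFile(fname + ".tmp", &f, env_options);
//       if (!s.ok()) return s;
//       s = f->Append(contents);
//       if (s.ok()) s = f->Sync();  // tmp must be durable before it goes live
//       if (s.ok()) s = f->Close();
//       if (s.ok()) s = env->RenameFile(fname + ".tmp", fname);  // atomic swap
//       return s;
//     }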
+
+Status BackupEngineImpl::CopyFile(const std::string& src,
+                                  const std::string& dst, Env* src_env,
+                                  Env* dst_env, bool sync,
+                                  RateLimiter* rate_limiter, uint64_t* size,
+                                  uint32_t* checksum_value,
+                                  uint64_t size_limit) {
+  Status s;
+  unique_ptr<WritableFile> dst_file;
+  unique_ptr<SequentialFile> src_file;
+  EnvOptions env_options;
+  env_options.use_mmap_writes = false;
+  env_options.use_os_buffer = false;
+  if (size != nullptr) {
+    *size = 0;
+  }
+  if (checksum_value != nullptr) {
+    *checksum_value = 0;
+  }
+
+  // Check if size limit is set. If not, set it to a very big number
+  if (size_limit == 0) {
+    size_limit = std::numeric_limits<uint64_t>::max();
+  }
+
+  s = src_env->NewSequentialFile(src, &src_file, env_options);
+  if (s.ok()) {
+    s = dst_env->NewWritableFile(dst, &dst_file, env_options);
+  }
+  if (!s.ok()) {
+    return s;
+  }
+
+  unique_ptr<char[]> buf(new char[copy_file_buffer_size_]);
+  Slice data;
+
+  do {
+    if (stop_backup_.load(std::memory_order_acquire)) {
+      return Status::Incomplete("Backup stopped");
+    }
+    size_t buffer_to_read = (copy_file_buffer_size_ < size_limit) ?
+        copy_file_buffer_size_ : size_limit;
+    s = src_file->Read(buffer_to_read, &data, buf.get());
+    size_limit -= data.size();
+
+    if (!s.ok()) {
+      return s;
+    }
+
+    if (size != nullptr) {
+      *size += data.size();
+    }
+    if (checksum_value != nullptr) {
+      *checksum_value = crc32c::Extend(*checksum_value, data.data(),
+                                       data.size());
+    }
+    s = dst_file->Append(data);
+    if (rate_limiter != nullptr) {
+      rate_limiter->ReportAndWait(data.size());
+    }
+  } while (s.ok() && data.size() > 0 && size_limit > 0);
+
+  if (s.ok() && sync) {
+    s = dst_file->Sync();
+  }
+
+  return s;
+}
+
+// src_fname will always start with "/"
+Status BackupEngineImpl::BackupFile(BackupID backup_id, BackupMeta* backup,
+                                    bool shared, const std::string& src_dir,
+                                    const std::string& src_fname,
+                                    RateLimiter* rate_limiter,
+                                    uint64_t size_limit,
+                                    bool shared_checksum) {
+
+  assert(src_fname.size() > 0 && src_fname[0] == '/');
+  std::string dst_relative = src_fname.substr(1);
+  std::string dst_relative_tmp;
+  Status s;
+  uint64_t size;
+  uint32_t checksum_value = 0;
+
+  if (shared && shared_checksum) {
+    // add checksum and file length to the file name
+    s = CalculateChecksum(src_dir + src_fname,
+                          db_env_,
+                          size_limit,
+                          &checksum_value);
+    if (s.ok()) {
+      s = db_env_->GetFileSize(src_dir + src_fname, &size);
+    }
+    if (!s.ok()) {
+      return s;
+    }
+    dst_relative = GetSharedFileWithChecksum(dst_relative, checksum_value,
+                                             size);
+    dst_relative_tmp = GetSharedFileWithChecksumRel(dst_relative, true);
+    dst_relative = GetSharedFileWithChecksumRel(dst_relative, false);
+  } else if (shared) {
+    dst_relative_tmp = GetSharedFileRel(dst_relative, true);
+    dst_relative = GetSharedFileRel(dst_relative, false);
+  } else {
+    dst_relative_tmp = GetPrivateFileRel(backup_id, true, dst_relative);
+    dst_relative = GetPrivateFileRel(backup_id, false, dst_relative);
+  }
+  std::string dst_path = GetAbsolutePath(dst_relative);
+  std::string dst_path_tmp = GetAbsolutePath(dst_relative_tmp);
+
+  // if it's shared, we also need to check if it exists -- if it does,
+  // no need to copy it again
+  if (shared && backup_env_->FileExists(dst_path)) {
+    if (shared_checksum) {
+      Log(options_.info_log,
+          "%s already present, with checksum %u and size %" PRIu64,
+          src_fname.c_str(), checksum_value, size);
+    } else {
+      backup_env_->GetFileSize(dst_path, &size);  // Ignore error
+      Log(options_.info_log, "%s already present, calculate checksum",
+          src_fname.c_str());
+      s = CalculateChecksum(src_dir + src_fname,
+                            db_env_,
+                            size_limit,
+                            &checksum_value);
+    }
+  } else {
+    Log(options_.info_log, "Copying %s", src_fname.c_str());
+    s = CopyFile(src_dir + src_fname,
+                 dst_path_tmp,
+                 db_env_,
+                 backup_env_,
+                 options_.sync,
+                 rate_limiter,
+                 &size,
+                 &checksum_value,
+                 size_limit);
+    if (s.ok() && shared) {
+      s = backup_env_->RenameFile(dst_path_tmp, dst_path);
+    }
+  }
+  if (s.ok()) {
+    s = backup->AddFile(FileInfo(dst_relative, size, checksum_value));
+  }
+  return s;
+}
+
+Status BackupEngineImpl::CalculateChecksum(const std::string& src,
+                                           Env* src_env,
+                                           uint64_t size_limit,
+                                           uint32_t* checksum_value) {
+  *checksum_value = 0;
+  if (size_limit == 0) {
+    size_limit = std::numeric_limits<uint64_t>::max();
+  }
+
+  EnvOptions env_options;
+  env_options.use_mmap_writes = false;
+  env_options.use_os_buffer = false;
+
+  std::unique_ptr<SequentialFile> src_file;
+  Status s = src_env->NewSequentialFile(src, &src_file, env_options);
+  if (!s.ok()) {
+    return s;
+  }
+
+  std::unique_ptr<char[]> buf(new char[copy_file_buffer_size_]);
+  Slice data;
+
+  do {
+    if (stop_backup_.load(std::memory_order_acquire)) {
+      return Status::Incomplete("Backup stopped");
+    }
+    size_t buffer_to_read = (copy_file_buffer_size_ < size_limit) ?
+        copy_file_buffer_size_ : size_limit;
+    s = src_file->Read(buffer_to_read, &data, buf.get());
+
+    if (!s.ok()) {
+      return s;
+    }
+
+    size_limit -= data.size();
+    *checksum_value = crc32c::Extend(*checksum_value, data.data(), data.size());
+  } while (data.size() > 0 && size_limit > 0);
+
+  return s;
+}
+
+void BackupEngineImpl::DeleteChildren(const std::string& dir,
+                                      uint32_t file_type_filter) {
+  std::vector<std::string> children;
+  db_env_->GetChildren(dir, &children);  // ignore errors
+
+  for (const auto& f : children) {
+    uint64_t number;
+    FileType type;
+    bool ok = ParseFileName(f, &number, &type);
+    if (ok && (file_type_filter & (1 << type))) {
+      // don't delete this file
+      continue;
+    }
+    db_env_->DeleteFile(dir + "/" + f);  // ignore errors
+  }
+}
+
+void BackupEngineImpl::GarbageCollection(bool full_scan) {
+  assert(!read_only_);
+  Log(options_.info_log, "Starting garbage collection");
+  std::vector<std::string> to_delete;
+  for (auto& itr : backuped_file_infos_) {
+    if (itr.second.refs == 0) {
+      Status s = backup_env_->DeleteFile(GetAbsolutePath(itr.first));
+      Log(options_.info_log, "Deleting %s -- %s", itr.first.c_str(),
+          s.ToString().c_str());
+      to_delete.push_back(itr.first);
+    }
+  }
+  for (auto& td : to_delete) {
+    backuped_file_infos_.erase(td);
+  }
+  if (!full_scan) {
+    // take care of private dirs -- if full_scan == true, then full_scan will
+    // take care of them
+    for (auto backup_id : obsolete_backups_) {
+      std::string private_dir = GetPrivateFileRel(backup_id);
+      Status s = backup_env_->DeleteDir(GetAbsolutePath(private_dir));
+      Log(options_.info_log, "Deleting private dir %s -- %s",
+          private_dir.c_str(), s.ToString().c_str());
+    }
+  }
+  obsolete_backups_.clear();
+
+  if (full_scan) {
+    Log(options_.info_log, "Starting full scan garbage collection");
+    // delete obsolete shared files
+    std::vector<std::string> shared_children;
+    backup_env_->GetChildren(GetAbsolutePath(GetSharedFileRel()),
+                             &shared_children);
+    for (auto& child : shared_children) {
+      std::string rel_fname = GetSharedFileRel(child);
+      // if it's not refcounted, delete it
+      if (backuped_file_infos_.find(rel_fname) == backuped_file_infos_.end()) {
+        // this might be a directory, but DeleteFile will just fail in that
+        // case, so we're good
+        Status s = backup_env_->DeleteFile(GetAbsolutePath(rel_fname));
+        if (s.ok()) {
+          Log(options_.info_log, "Deleted %s", rel_fname.c_str());
+        }
+      }
+    }
+
+    // delete obsolete private files
+    std::vector<std::string> private_children;
+    backup_env_->GetChildren(GetAbsolutePath(GetPrivateDirRel()),
+                             &private_children);
+    for (auto& child : private_children) {
+      BackupID backup_id = 0;
+      bool tmp_dir = child.find(".tmp") != std::string::npos;
+      sscanf(child.c_str(), "%u", &backup_id);
+      if (!tmp_dir &&  // if it's tmp_dir, delete it
+          (backup_id == 0 || backups_.find(backup_id) != backups_.end())) {
+        // it's either not a number or it's still alive. continue
+        continue;
+      }
+      // here we have to delete the dir and all its children
+      std::string full_private_path =
+          GetAbsolutePath(GetPrivateFileRel(backup_id, tmp_dir));
+      std::vector<std::string> subchildren;
+      backup_env_->GetChildren(full_private_path, &subchildren);
+      for (auto& subchild : subchildren) {
+        Status s = backup_env_->DeleteFile(full_private_path + subchild);
+        if (s.ok()) {
+          Log(options_.info_log, "Deleted %s",
+              (full_private_path + subchild).c_str());
+        }
+      }
+      // finally delete the private dir
+      Status s = backup_env_->DeleteDir(full_private_path);
+      Log(options_.info_log, "Deleted dir %s -- %s", full_private_path.c_str(),
+          s.ToString().c_str());
+    }
+  }
+}
+
+// ------- BackupMeta class --------
+
+Status BackupEngineImpl::BackupMeta::AddFile(const FileInfo& file_info) {
+  size_ += file_info.size;
+  files_.push_back(file_info.filename);
+
+  auto itr = file_infos_->find(file_info.filename);
+  if (itr == file_infos_->end()) {
+    auto ret = file_infos_->insert({file_info.filename, file_info});
+    if (ret.second) {
+      ret.first->second.refs = 1;
+    } else {
+      // if this happens, something is seriously wrong
+      return Status::Corruption("In memory metadata insertion error");
+    }
+  } else {
+    if (itr->second.checksum_value != file_info.checksum_value) {
+      return Status::Corruption("Checksum mismatch for existing backup file");
+    }
+    ++itr->second.refs;  // increase refcount if already present
+  }
+
+  return Status::OK();
+}
+
+void BackupEngineImpl::BackupMeta::Delete(bool delete_meta) {
+  for (const auto& file : files_) {
+    auto itr = file_infos_->find(file);
+    assert(itr != file_infos_->end());
+    --(itr->second.refs);  // decrease refcount
+  }
+  files_.clear();
+  // delete meta file
+  if (delete_meta) {
+    env_->DeleteFile(meta_filename_);
+  }
+  timestamp_ = 0;
+}
+
+// each backup meta file is of the format:
+// <timestamp>
+// <seq number>
+// <number of files>
+// <file1> <crc32(literal string)> <crc32 value>
+// <file2> <crc32(literal string)> <crc32 value>
+// ...
+Status BackupEngineImpl::BackupMeta::LoadFromFile(
+    const std::string& backup_dir) {
+  assert(Empty());
+  Status s;
+  unique_ptr<SequentialFile> backup_meta_file;
+  s = env_->NewSequentialFile(meta_filename_, &backup_meta_file, EnvOptions());
+  if (!s.ok()) {
+    return s;
+  }
+
+  unique_ptr<char[]> buf(new char[max_backup_meta_file_size_ + 1]);
+  Slice data;
+  s = backup_meta_file->Read(max_backup_meta_file_size_, &data, buf.get());
+
+  if (!s.ok() || data.size() == max_backup_meta_file_size_) {
+    return s.ok() ? Status::Corruption("File size too big") : s;
+  }
Status::Corruption("File size too big") : s; + } + buf[data.size()] = 0; + + uint32_t num_files = 0; + int bytes_read = 0; + sscanf(data.data(), "%" PRId64 "%n", ×tamp_, &bytes_read); + data.remove_prefix(bytes_read + 1); // +1 for '\n' + sscanf(data.data(), "%" PRIu64 "%n", &sequence_number_, &bytes_read); + data.remove_prefix(bytes_read + 1); // +1 for '\n' + sscanf(data.data(), "%u%n", &num_files, &bytes_read); + data.remove_prefix(bytes_read + 1); // +1 for '\n' + + std::vector files; + + for (uint32_t i = 0; s.ok() && i < num_files; ++i) { + auto line = GetSliceUntil(&data, '\n'); + std::string filename = GetSliceUntil(&line, ' ').ToString(); + + uint64_t size; + s = env_->GetFileSize(backup_dir + "/" + filename, &size); + if (!s.ok()) { + return s; + } + + if (line.empty()) { + return Status::Corruption("File checksum is missing"); + } + + uint32_t checksum_value = 0; + if (line.starts_with("crc32 ")) { + line.remove_prefix(6); + sscanf(line.data(), "%u", &checksum_value); + if (memcmp(line.data(), std::to_string(checksum_value).c_str(), + line.size() - 1) != 0) { + return Status::Corruption("Invalid checksum value"); + } + } else { + return Status::Corruption("Unknown checksum type"); + } + + files.emplace_back(filename, size, checksum_value); + } + + if (s.ok() && data.size() > 0) { + // file has to be read completely. if not, we count it as corruption + s = Status::Corruption("Tailing data in backup meta file"); + } + + if (s.ok()) { + for (const auto& file_info : files) { + s = AddFile(file_info); + if (!s.ok()) { + break; + } + } + } + + return s; +} + +Status BackupEngineImpl::BackupMeta::StoreToFile(bool sync) { + Status s; + unique_ptr backup_meta_file; + EnvOptions env_options; + env_options.use_mmap_writes = false; + s = env_->NewWritableFile(meta_filename_ + ".tmp", &backup_meta_file, + env_options); + if (!s.ok()) { + return s; + } + + unique_ptr buf(new char[max_backup_meta_file_size_]); + int len = 0, buf_size = max_backup_meta_file_size_; + len += snprintf(buf.get(), buf_size, "%" PRId64 "\n", timestamp_); + len += snprintf(buf.get() + len, buf_size - len, "%" PRIu64 "\n", + sequence_number_); + len += snprintf(buf.get() + len, buf_size - len, "%zu\n", files_.size()); + for (const auto& file : files_) { + const auto& iter = file_infos_->find(file); + + assert(iter != file_infos_->end()); + // use crc32 for now, switch to something else if needed + len += snprintf(buf.get() + len, buf_size - len, "%s crc32 %u\n", + file.c_str(), iter->second.checksum_value); + } + + s = backup_meta_file->Append(Slice(buf.get(), (size_t)len)); + if (s.ok() && sync) { + s = backup_meta_file->Sync(); + } + if (s.ok()) { + s = backup_meta_file->Close(); + } + if (s.ok()) { + s = env_->RenameFile(meta_filename_ + ".tmp", meta_filename_); + } + return s; +} + +// -------- BackupEngineReadOnlyImpl --------- +class BackupEngineReadOnlyImpl : public BackupEngineReadOnly { + public: + BackupEngineReadOnlyImpl(Env* db_env, const BackupableDBOptions& options) + : backup_engine_(new BackupEngineImpl(db_env, options, true)) {} + + virtual ~BackupEngineReadOnlyImpl() {} + + virtual void GetBackupInfo(std::vector* backup_info) { + backup_engine_->GetBackupInfo(backup_info); + } + + virtual Status RestoreDBFromBackup( + BackupID backup_id, const std::string& db_dir, const std::string& wal_dir, + const RestoreOptions& restore_options = RestoreOptions()) { + return backup_engine_->RestoreDBFromBackup(backup_id, db_dir, wal_dir, + restore_options); + } + + virtual Status RestoreDBFromLatestBackup( + const 
+
+// -------- BackupEngineReadOnlyImpl ---------
+class BackupEngineReadOnlyImpl : public BackupEngineReadOnly {
+ public:
+  BackupEngineReadOnlyImpl(Env* db_env, const BackupableDBOptions& options)
+      : backup_engine_(new BackupEngineImpl(db_env, options, true)) {}
+
+  virtual ~BackupEngineReadOnlyImpl() {}
+
+  virtual void GetBackupInfo(std::vector<BackupInfo>* backup_info) {
+    backup_engine_->GetBackupInfo(backup_info);
+  }
+
+  virtual Status RestoreDBFromBackup(
+      BackupID backup_id, const std::string& db_dir, const std::string& wal_dir,
+      const RestoreOptions& restore_options = RestoreOptions()) {
+    return backup_engine_->RestoreDBFromBackup(backup_id, db_dir, wal_dir,
+                                               restore_options);
+  }
+
+  virtual Status RestoreDBFromLatestBackup(
+      const std::string& db_dir, const std::string& wal_dir,
+      const RestoreOptions& restore_options = RestoreOptions()) {
+    return backup_engine_->RestoreDBFromLatestBackup(db_dir, wal_dir,
+                                                     restore_options);
+  }
+
+ private:
+  std::unique_ptr<BackupEngineImpl> backup_engine_;
+};
+
+BackupEngineReadOnly* BackupEngineReadOnly::NewReadOnlyBackupEngine(
+    Env* db_env, const BackupableDBOptions& options) {
+  if (options.destroy_old_data) {
+    assert(false);
+    return nullptr;
+  }
+  return new BackupEngineReadOnlyImpl(db_env, options);
+}
+
+// --- BackupableDB methods --------
+
+BackupableDB::BackupableDB(DB* db, const BackupableDBOptions& options)
+    : StackableDB(db),
+      backup_engine_(new BackupEngineImpl(db->GetEnv(), options)) {}
+
+BackupableDB::~BackupableDB() {
+  delete backup_engine_;
+}
+
+Status BackupableDB::CreateNewBackup(bool flush_before_backup) {
+  return backup_engine_->CreateNewBackup(this, flush_before_backup);
+}
+
+void BackupableDB::GetBackupInfo(std::vector<BackupInfo>* backup_info) {
+  backup_engine_->GetBackupInfo(backup_info);
+}
+
+Status BackupableDB::PurgeOldBackups(uint32_t num_backups_to_keep) {
+  return backup_engine_->PurgeOldBackups(num_backups_to_keep);
+}
+
+Status BackupableDB::DeleteBackup(BackupID backup_id) {
+  return backup_engine_->DeleteBackup(backup_id);
+}
+
+void BackupableDB::StopBackup() {
+  backup_engine_->StopBackup();
+}
+
+// --- RestoreBackupableDB methods ------
+
+RestoreBackupableDB::RestoreBackupableDB(Env* db_env,
+                                         const BackupableDBOptions& options)
+    : backup_engine_(new BackupEngineImpl(db_env, options)) {}
+
+RestoreBackupableDB::~RestoreBackupableDB() {
+  delete backup_engine_;
+}
+
+void
+RestoreBackupableDB::GetBackupInfo(std::vector<BackupInfo>* backup_info) {
+  backup_engine_->GetBackupInfo(backup_info);
+}
+
+Status RestoreBackupableDB::RestoreDBFromBackup(
+    BackupID backup_id, const std::string& db_dir, const std::string& wal_dir,
+    const RestoreOptions& restore_options) {
+  return backup_engine_->RestoreDBFromBackup(backup_id, db_dir, wal_dir,
+                                             restore_options);
+}
+
+Status RestoreBackupableDB::RestoreDBFromLatestBackup(
+    const std::string& db_dir, const std::string& wal_dir,
+    const RestoreOptions& restore_options) {
+  return backup_engine_->RestoreDBFromLatestBackup(db_dir, wal_dir,
+                                                   restore_options);
+}
+
+Status RestoreBackupableDB::PurgeOldBackups(uint32_t num_backups_to_keep) {
+  return backup_engine_->PurgeOldBackups(num_backups_to_keep);
+}
+
+Status RestoreBackupableDB::DeleteBackup(BackupID backup_id) {
+  return backup_engine_->DeleteBackup(backup_id);
+}
+
+}  // namespace rocksdb
+
+#endif  // ROCKSDB_LITE
diff --git a/utilities/backupable/backupable_db_test.cc b/utilities/backupable/backupable_db_test.cc
new file mode 100644
index 0000000000..b68f1c65bc
--- /dev/null
+++ b/utilities/backupable/backupable_db_test.cc
@@ -0,0 +1,974 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <string>
+#include <algorithm>
+#include <iostream>
+
+#include "port/port.h"
+#include "rocksdb/types.h"
+#include "rocksdb/transaction_log.h"
+#include "utilities/utility_db.h"
+#include "utilities/backupable_db.h"
+#include "util/testharness.h"
+#include "util/random.h"
+#include "util/mutexlock.h"
+#include "util/testutil.h"
+#include "util/auto_roll_logger.h"
+
+namespace rocksdb {
+
+namespace {
+
+using std::unique_ptr;
+
+class DummyDB : public StackableDB {
+ public:
+  /* implicit */
+  DummyDB(const Options& options, const std::string& dbname)
+     : StackableDB(nullptr), options_(options), dbname_(dbname),
+       deletions_enabled_(true), sequence_number_(0) {}
+
+  virtual SequenceNumber GetLatestSequenceNumber() const {
+    return ++sequence_number_;
+  }
+
+  virtual const std::string& GetName() const override {
+    return dbname_;
+  }
+
+  virtual Env* GetEnv() const override {
+    return options_.env;
+  }
+
+  using DB::GetOptions;
+  virtual const Options& GetOptions(ColumnFamilyHandle* column_family) const
+      override {
+    return options_;
+  }
+
+  virtual Status EnableFileDeletions(bool force) override {
+    ASSERT_TRUE(!deletions_enabled_);
+    deletions_enabled_ = true;
+    return Status::OK();
+  }
+
+  virtual Status DisableFileDeletions() override {
+    ASSERT_TRUE(deletions_enabled_);
+    deletions_enabled_ = false;
+    return Status::OK();
+  }
+
+  virtual Status GetLiveFiles(std::vector<std::string>& vec, uint64_t* mfs,
+                              bool flush_memtable = true) override {
+    ASSERT_TRUE(!deletions_enabled_);
+    vec = live_files_;
+    *mfs = 100;
+    return Status::OK();
+  }
+
+  virtual ColumnFamilyHandle* DefaultColumnFamily() const override {
+    return nullptr;
+  }
+
+  class DummyLogFile : public LogFile {
+   public:
+    /* implicit */
+    DummyLogFile(const std::string& path, bool alive = true)
+        : path_(path), alive_(alive) {}
+
+    virtual std::string PathName() const override {
+      return path_;
+    }
+
+    virtual uint64_t LogNumber() const {
+      // what business do you have calling this method?
+      ASSERT_TRUE(false);
+      return 0;
+    }
+
+    virtual WalFileType Type() const override {
+      return alive_ ? kAliveLogFile : kArchivedLogFile;
+    }
+
+    virtual SequenceNumber StartSequence() const {
+      // backupabledb should not need this method
+      ASSERT_TRUE(false);
+      return 0;
+    }
+
+    virtual uint64_t SizeFileBytes() const {
+      // backupabledb should not need this method
+      ASSERT_TRUE(false);
+      return 0;
+    }
+
+   private:
+    std::string path_;
+    bool alive_;
+  };  // DummyLogFile
+
+  virtual Status GetSortedWalFiles(VectorLogPtr& files) override {
+    ASSERT_TRUE(!deletions_enabled_);
+    files.resize(wal_files_.size());
+    for (size_t i = 0; i < files.size(); ++i) {
+      files[i].reset(
+          new DummyLogFile(wal_files_[i].first, wal_files_[i].second));
+    }
+    return Status::OK();
+  }
+
+  std::vector<std::string> live_files_;
+  // pair<filename, alive?>
+  std::vector<std::pair<std::string, bool>> wal_files_;
+ private:
+  Options options_;
+  std::string dbname_;
+  bool deletions_enabled_;
+  mutable SequenceNumber sequence_number_;
+};  // DummyDB
+
+class TestEnv : public EnvWrapper {
+ public:
+  explicit TestEnv(Env* t) : EnvWrapper(t) {}
+
+  class DummySequentialFile : public SequentialFile {
+   public:
+    DummySequentialFile() : SequentialFile(), rnd_(5) {}
+    virtual Status Read(size_t n, Slice* result, char* scratch) {
+      size_t read_size = (n > size_left) ? size_left : n;
+      for (size_t i = 0; i < read_size; ++i) {
+        scratch[i] = rnd_.Next() & 255;
+      }
+      *result = Slice(scratch, read_size);
+      size_left -= read_size;
+      return Status::OK();
+    }
+
+    virtual Status Skip(uint64_t n) {
+      size_left = (n > size_left) ? 0 : size_left - n;
+      return Status::OK();
+    }
+   private:
+    size_t size_left = 200;
+    Random rnd_;
+  };
+
+  Status NewSequentialFile(const std::string& f,
+                           unique_ptr<SequentialFile>* r,
+                           const EnvOptions& options) {
+    MutexLock l(&mutex_);
+    if (dummy_sequential_file_) {
+      r->reset(new TestEnv::DummySequentialFile());
+      return Status::OK();
+    } else {
+      return EnvWrapper::NewSequentialFile(f, r, options);
+    }
+  }
+
+  Status NewWritableFile(const std::string& f, unique_ptr<WritableFile>* r,
+                         const EnvOptions& options) {
+    MutexLock l(&mutex_);
+    written_files_.push_back(f);
+    if (limit_written_files_ <= 0) {
+      return Status::NotSupported("Sorry, can't do this");
+    }
+    limit_written_files_--;
+    return EnvWrapper::NewWritableFile(f, r, options);
+  }
+
+  virtual Status DeleteFile(const std::string& fname) override {
+    MutexLock l(&mutex_);
+    ASSERT_GT(limit_delete_files_, 0U);
+    limit_delete_files_--;
+    return EnvWrapper::DeleteFile(fname);
+  }
+
+  void AssertWrittenFiles(std::vector<std::string>& should_have_written) {
+    MutexLock l(&mutex_);
+    sort(should_have_written.begin(), should_have_written.end());
+    sort(written_files_.begin(), written_files_.end());
+    ASSERT_TRUE(written_files_ == should_have_written);
+  }
+
+  void ClearWrittenFiles() {
+    MutexLock l(&mutex_);
+    written_files_.clear();
+  }
+
+  void SetLimitWrittenFiles(uint64_t limit) {
+    MutexLock l(&mutex_);
+    limit_written_files_ = limit;
+  }
+
+  void SetLimitDeleteFiles(uint64_t limit) {
+    MutexLock l(&mutex_);
+    limit_delete_files_ = limit;
+  }
+
+  void SetDummySequentialFile(bool dummy_sequential_file) {
+    MutexLock l(&mutex_);
+    dummy_sequential_file_ = dummy_sequential_file;
+  }
+
+ private:
+  port::Mutex mutex_;
+  bool dummy_sequential_file_ = false;
+  std::vector<std::string> written_files_;
+  uint64_t limit_written_files_ = 1000000;
+  uint64_t limit_delete_files_ = 1000000;
+};  // TestEnv
+
+class FileManager : public EnvWrapper {
+ public:
+  explicit FileManager(Env* t) : EnvWrapper(t), rnd_(5) {}
+
+  Status DeleteRandomFileInDir(const std::string dir) {
+    std::vector<std::string> children;
+    GetChildren(dir, &children);
+    if (children.size() <= 2) {  // . and ..
+      return Status::NotFound("");
+    }
+    while (true) {
+      int i = rnd_.Next() % children.size();
+      if (children[i] != "." && children[i] != "..") {
+        return DeleteFile(dir + "/" + children[i]);
+      }
+    }
+    // should never get here
+    assert(false);
+    return Status::NotFound("");
+  }
&& children[i] != "..") { + return DeleteFile(dir + "/" + children[i]); + } + } + // should never get here + assert(false); + return Status::NotFound(""); + } + + Status CorruptFile(const std::string& fname, uint64_t bytes_to_corrupt) { + uint64_t size; + Status s = GetFileSize(fname, &size); + if (!s.ok()) { + return s; + } + unique_ptr file; + EnvOptions env_options; + env_options.use_mmap_writes = false; + s = NewRandomRWFile(fname, &file, env_options); + if (!s.ok()) { + return s; + } + + for (uint64_t i = 0; s.ok() && i < bytes_to_corrupt; ++i) { + std::string tmp; + // write one random byte to a random position + s = file->Write(rnd_.Next() % size, test::RandomString(&rnd_, 1, &tmp)); + } + return s; + } + + Status CorruptChecksum(const std::string& fname, bool appear_valid) { + std::string metadata; + Status s = ReadFileToString(this, fname, &metadata); + if (!s.ok()) { + return s; + } + s = DeleteFile(fname); + if (!s.ok()) { + return s; + } + + auto pos = metadata.find("private"); + if (pos == std::string::npos) { + return Status::Corruption("private file is expected"); + } + pos = metadata.find(" crc32 ", pos + 6); + if (pos == std::string::npos) { + return Status::Corruption("checksum not found"); + } + + if (metadata.size() < pos + 7) { + return Status::Corruption("bad CRC32 checksum value"); + } + + if (appear_valid) { + if (metadata[pos + 8] == '\n') { + // single digit value, safe to insert one more digit + metadata.insert(pos + 8, 1, '0'); + } else { + metadata.erase(pos + 8, 1); + } + } else { + metadata[pos + 7] = 'a'; + } + + return WriteToFile(fname, metadata); + } + + Status WriteToFile(const std::string& fname, const std::string& data) { + unique_ptr file; + EnvOptions env_options; + env_options.use_mmap_writes = false; + Status s = EnvWrapper::NewWritableFile(fname, &file, env_options); + if (!s.ok()) { + return s; + } + return file->Append(Slice(data)); + } + + private: + Random rnd_; +}; // FileManager + +// utility functions +static size_t FillDB(DB* db, int from, int to) { + size_t bytes_written = 0; + for (int i = from; i < to; ++i) { + std::string key = "testkey" + std::to_string(i); + std::string value = "testvalue" + std::to_string(i); + bytes_written += key.size() + value.size(); + + ASSERT_OK(db->Put(WriteOptions(), Slice(key), Slice(value))); + } + return bytes_written; +} + +static void AssertExists(DB* db, int from, int to) { + for (int i = from; i < to; ++i) { + std::string key = "testkey" + std::to_string(i); + std::string value; + Status s = db->Get(ReadOptions(), Slice(key), &value); + ASSERT_EQ(value, "testvalue" + std::to_string(i)); + } +} + +static void AssertEmpty(DB* db, int from, int to) { + for (int i = from; i < to; ++i) { + std::string key = "testkey" + std::to_string(i); + std::string value = "testvalue" + std::to_string(i); + + Status s = db->Get(ReadOptions(), Slice(key), &value); + ASSERT_TRUE(s.IsNotFound()); + } +} + +class BackupableDBTest { + public: + BackupableDBTest() { + // set up files + dbname_ = test::TmpDir() + "/backupable_db"; + backupdir_ = test::TmpDir() + "/backupable_db_backup"; + + // set up envs + env_ = Env::Default(); + test_db_env_.reset(new TestEnv(env_)); + test_backup_env_.reset(new TestEnv(env_)); + file_manager_.reset(new FileManager(env_)); + + // set up db options + options_.create_if_missing = true; + options_.paranoid_checks = true; + options_.write_buffer_size = 1 << 17; // 128KB + options_.env = test_db_env_.get(); + options_.wal_dir = dbname_; + // set up backup db options + 
+    CreateLoggerFromOptions(dbname_, backupdir_, env_,
+                            DBOptions(), &logger_);
+    backupable_options_.reset(new BackupableDBOptions(
+        backupdir_, test_backup_env_.get(), true, logger_.get(), true));
+
+    // delete old files in db
+    DestroyDB(dbname_, Options());
+  }
+
+  DB* OpenDB() {
+    DB* db;
+    ASSERT_OK(DB::Open(options_, dbname_, &db));
+    return db;
+  }
+
+  void OpenBackupableDB(bool destroy_old_data = false, bool dummy = false,
+                        bool share_table_files = true,
+                        bool share_with_checksums = false) {
+    // reset all the defaults
+    test_backup_env_->SetLimitWrittenFiles(1000000);
+    test_db_env_->SetLimitWrittenFiles(1000000);
+    test_db_env_->SetDummySequentialFile(dummy);
+
+    DB* db;
+    if (dummy) {
+      dummy_db_ = new DummyDB(options_, dbname_);
+      db = dummy_db_;
+    } else {
+      ASSERT_OK(DB::Open(options_, dbname_, &db));
+    }
+    backupable_options_->destroy_old_data = destroy_old_data;
+    backupable_options_->share_table_files = share_table_files;
+    backupable_options_->share_files_with_checksum = share_with_checksums;
+    db_.reset(new BackupableDB(db, *backupable_options_));
+  }
+
+  void CloseBackupableDB() {
+    db_.reset(nullptr);
+  }
+
+  void OpenRestoreDB() {
+    backupable_options_->destroy_old_data = false;
+    restore_db_.reset(
+        new RestoreBackupableDB(test_db_env_.get(), *backupable_options_));
+  }
+
+  void CloseRestoreDB() {
+    restore_db_.reset(nullptr);
+  }
+
+  // restores backup backup_id and asserts the existence of
+  // [start_exist, end_exist> and not-existence of
+  // [end_exist, end>
+  //
+  // if backup_id == 0, it means restore from latest
+  // if end == 0, don't check AssertEmpty
+  void AssertBackupConsistency(BackupID backup_id, uint32_t start_exist,
+                               uint32_t end_exist, uint32_t end = 0,
+                               bool keep_log_files = false) {
+    RestoreOptions restore_options(keep_log_files);
+    bool opened_restore = false;
+    if (restore_db_.get() == nullptr) {
+      opened_restore = true;
+      OpenRestoreDB();
+    }
+    if (backup_id > 0) {
+      ASSERT_OK(restore_db_->RestoreDBFromBackup(backup_id, dbname_, dbname_,
+                                                 restore_options));
+    } else {
+      ASSERT_OK(restore_db_->RestoreDBFromLatestBackup(dbname_, dbname_,
+                                                       restore_options));
+    }
+    DB* db = OpenDB();
+    AssertExists(db, start_exist, end_exist);
+    if (end != 0) {
+      AssertEmpty(db, end_exist, end);
+    }
+    delete db;
+    if (opened_restore) {
+      CloseRestoreDB();
+    }
+  }
+
+  void DeleteLogFiles() {
+    std::vector<std::string> delete_logs;
+    env_->GetChildren(dbname_, &delete_logs);
+    for (auto f : delete_logs) {
+      uint64_t number;
+      FileType type;
+      bool ok = ParseFileName(f, &number, &type);
+      if (ok && type == kLogFile) {
+        env_->DeleteFile(dbname_ + "/" + f);
+      }
+    }
+  }
+
+  // files
+  std::string dbname_;
+  std::string backupdir_;
+
+  // envs
+  Env* env_;
+  unique_ptr<TestEnv> test_db_env_;
+  unique_ptr<TestEnv> test_backup_env_;
+  unique_ptr<FileManager> file_manager_;
+
+  // all the dbs!
+ DummyDB* dummy_db_; // BackupableDB owns dummy_db_ + unique_ptr db_; + unique_ptr restore_db_; + + // options + Options options_; + unique_ptr backupable_options_; + std::shared_ptr logger_; +}; // BackupableDBTest + +void AppendPath(const std::string& path, std::vector& v) { + for (auto& f : v) { + f = path + f; + } +} + +// this will make sure that backup does not copy the same file twice +TEST(BackupableDBTest, NoDoubleCopy) { + OpenBackupableDB(true, true); + + // should write 5 DB files + LATEST_BACKUP + one meta file + test_backup_env_->SetLimitWrittenFiles(7); + test_backup_env_->ClearWrittenFiles(); + test_db_env_->SetLimitWrittenFiles(0); + dummy_db_->live_files_ = { "/00010.sst", "/00011.sst", + "/CURRENT", "/MANIFEST-01" }; + dummy_db_->wal_files_ = {{"/00011.log", true}, {"/00012.log", false}}; + ASSERT_OK(db_->CreateNewBackup(false)); + std::vector should_have_written = { + "/shared/00010.sst.tmp", + "/shared/00011.sst.tmp", + "/private/1.tmp/CURRENT", + "/private/1.tmp/MANIFEST-01", + "/private/1.tmp/00011.log", + "/meta/1.tmp", + "/LATEST_BACKUP.tmp" + }; + AppendPath(dbname_ + "_backup", should_have_written); + test_backup_env_->AssertWrittenFiles(should_have_written); + + // should write 4 new DB files + LATEST_BACKUP + one meta file + // should not write/copy 00010.sst, since it's already there! + test_backup_env_->SetLimitWrittenFiles(6); + test_backup_env_->ClearWrittenFiles(); + dummy_db_->live_files_ = { "/00010.sst", "/00015.sst", + "/CURRENT", "/MANIFEST-01" }; + dummy_db_->wal_files_ = {{"/00011.log", true}, {"/00012.log", false}}; + ASSERT_OK(db_->CreateNewBackup(false)); + // should not open 00010.sst - it's already there + should_have_written = { + "/shared/00015.sst.tmp", + "/private/2.tmp/CURRENT", + "/private/2.tmp/MANIFEST-01", + "/private/2.tmp/00011.log", + "/meta/2.tmp", + "/LATEST_BACKUP.tmp" + }; + AppendPath(dbname_ + "_backup", should_have_written); + test_backup_env_->AssertWrittenFiles(should_have_written); + + ASSERT_OK(db_->DeleteBackup(1)); + ASSERT_EQ(true, + test_backup_env_->FileExists(backupdir_ + "/shared/00010.sst")); + // 00011.sst was only in backup 1, should be deleted + ASSERT_EQ(false, + test_backup_env_->FileExists(backupdir_ + "/shared/00011.sst")); + ASSERT_EQ(true, + test_backup_env_->FileExists(backupdir_ + "/shared/00015.sst")); + + // MANIFEST file size should be only 100 + uint64_t size; + test_backup_env_->GetFileSize(backupdir_ + "/private/2/MANIFEST-01", &size); + ASSERT_EQ(100UL, size); + test_backup_env_->GetFileSize(backupdir_ + "/shared/00015.sst", &size); + ASSERT_EQ(200UL, size); + + CloseBackupableDB(); +} + +// test various kind of corruptions that may happen: +// 1. Not able to write a file for backup - that backup should fail, +// everything else should work +// 2. Corrupted/deleted LATEST_BACKUP - everything should work fine +// 3. Corrupted backup meta file or missing backuped file - we should +// not be able to open that backup, but all other backups should be +// fine +// 4. Corrupted checksum value - if the checksum is not a valid uint32_t, +// db open should fail, otherwise, it aborts during the restore process. +TEST(BackupableDBTest, CorruptionsTest) { + const int keys_iteration = 5000; + Random rnd(6); + Status s; + + OpenBackupableDB(true); + // create five backups + for (int i = 0; i < 5; ++i) { + FillDB(db_.get(), keys_iteration * i, keys_iteration * (i + 1)); + ASSERT_OK(db_->CreateNewBackup(!!(rnd.Next() % 2))); + } + + // ---------- case 1. 
- fail a write ----------- + // try creating backup 6, but fail a write + FillDB(db_.get(), keys_iteration * 5, keys_iteration * 6); + test_backup_env_->SetLimitWrittenFiles(2); + // should fail + s = db_->CreateNewBackup(!!(rnd.Next() % 2)); + ASSERT_TRUE(!s.ok()); + test_backup_env_->SetLimitWrittenFiles(1000000); + // latest backup should have all the keys + CloseBackupableDB(); + AssertBackupConsistency(0, 0, keys_iteration * 5, keys_iteration * 6); + + // ---------- case 2. - corrupt/delete latest backup ----------- + ASSERT_OK(file_manager_->CorruptFile(backupdir_ + "/LATEST_BACKUP", 2)); + AssertBackupConsistency(0, 0, keys_iteration * 5); + ASSERT_OK(file_manager_->DeleteFile(backupdir_ + "/LATEST_BACKUP")); + AssertBackupConsistency(0, 0, keys_iteration * 5); + // create backup 6, point LATEST_BACKUP to 5 + OpenBackupableDB(); + FillDB(db_.get(), keys_iteration * 5, keys_iteration * 6); + ASSERT_OK(db_->CreateNewBackup(false)); + CloseBackupableDB(); + ASSERT_OK(file_manager_->WriteToFile(backupdir_ + "/LATEST_BACKUP", "5")); + AssertBackupConsistency(0, 0, keys_iteration * 5, keys_iteration * 6); + // assert that all 6 data is gone! + ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/meta/6") == false); + ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/private/6") == false); + + // --------- case 3. corrupted backup meta or missing backuped file ---- + ASSERT_OK(file_manager_->CorruptFile(backupdir_ + "/meta/5", 3)); + // since 5 meta is now corrupted, latest backup should be 4 + AssertBackupConsistency(0, 0, keys_iteration * 4, keys_iteration * 5); + OpenRestoreDB(); + s = restore_db_->RestoreDBFromBackup(5, dbname_, dbname_); + ASSERT_TRUE(!s.ok()); + CloseRestoreDB(); + ASSERT_OK(file_manager_->DeleteRandomFileInDir(backupdir_ + "/private/4")); + // 4 is corrupted, 3 is the latest backup now + AssertBackupConsistency(0, 0, keys_iteration * 3, keys_iteration * 5); + OpenRestoreDB(); + s = restore_db_->RestoreDBFromBackup(4, dbname_, dbname_); + CloseRestoreDB(); + ASSERT_TRUE(!s.ok()); + + // --------- case 4. corrupted checksum value ---- + ASSERT_OK(file_manager_->CorruptChecksum(backupdir_ + "/meta/3", false)); + // checksum of backup 3 is an invalid value, this can be detected at + // db open time, and it reverts to the previous backup automatically + AssertBackupConsistency(0, 0, keys_iteration * 2, keys_iteration * 5); + // checksum of the backup 2 appears to be valid, this can cause checksum + // mismatch and abort restore process + ASSERT_OK(file_manager_->CorruptChecksum(backupdir_ + "/meta/2", true)); + ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/meta/2")); + OpenRestoreDB(); + ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/meta/2")); + s = restore_db_->RestoreDBFromBackup(2, dbname_, dbname_); + ASSERT_TRUE(!s.ok()); + ASSERT_OK(restore_db_->DeleteBackup(2)); + CloseRestoreDB(); + AssertBackupConsistency(0, 0, keys_iteration * 1, keys_iteration * 5); + + // new backup should be 2! 
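+  // (backups 2..5 were all invalidated above -- 2 deleted, 3/4/5 corrupted --
+  // so backup 1 is the only survivor and the next CreateNewBackup() below is
+  // assigned id 2, which AssertBackupConsistency(2, ...) then verifies)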
+ OpenBackupableDB(); + FillDB(db_.get(), keys_iteration * 1, keys_iteration * 2); + ASSERT_OK(db_->CreateNewBackup(!!(rnd.Next() % 2))); + CloseBackupableDB(); + AssertBackupConsistency(2, 0, keys_iteration * 2, keys_iteration * 5); +} + +// open DB, write, close DB, backup, restore, repeat +TEST(BackupableDBTest, OfflineIntegrationTest) { + // has to be a big number, so that it triggers the memtable flush + const int keys_iteration = 5000; + const int max_key = keys_iteration * 4 + 10; + // first iter -- flush before backup + // second iter -- don't flush before backup + for (int iter = 0; iter < 2; ++iter) { + // delete old data + DestroyDB(dbname_, Options()); + bool destroy_data = true; + + // every iteration -- + // 1. insert new data in the DB + // 2. backup the DB + // 3. destroy the db + // 4. restore the db, check everything is still there + for (int i = 0; i < 5; ++i) { + // in last iteration, put smaller amount of data, + int fill_up_to = std::min(keys_iteration * (i + 1), max_key); + // ---- insert new data and back up ---- + OpenBackupableDB(destroy_data); + destroy_data = false; + FillDB(db_.get(), keys_iteration * i, fill_up_to); + ASSERT_OK(db_->CreateNewBackup(iter == 0)); + CloseBackupableDB(); + DestroyDB(dbname_, Options()); + + // ---- make sure it's empty ---- + DB* db = OpenDB(); + AssertEmpty(db, 0, fill_up_to); + delete db; + + // ---- restore the DB ---- + OpenRestoreDB(); + if (i >= 3) { // test purge old backups + // when i == 4, purge to only 1 backup + // when i == 3, purge to 2 backups + ASSERT_OK(restore_db_->PurgeOldBackups(5 - i)); + } + // ---- make sure the data is there --- + AssertBackupConsistency(0, 0, fill_up_to, max_key); + CloseRestoreDB(); + } + } +} + +// open DB, write, backup, write, backup, close, restore +TEST(BackupableDBTest, OnlineIntegrationTest) { + // has to be a big number, so that it triggers the memtable flush + const int keys_iteration = 5000; + const int max_key = keys_iteration * 4 + 10; + Random rnd(7); + // delete old data + DestroyDB(dbname_, Options()); + + OpenBackupableDB(true); + // write some data, backup, repeat + for (int i = 0; i < 5; ++i) { + if (i == 4) { + // delete backup number 2, online delete! 
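+      // ("online" = the BackupableDB stays open against the same backup
+      // directory while a separate restore engine performs the delete)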
+ OpenRestoreDB(); + ASSERT_OK(restore_db_->DeleteBackup(2)); + CloseRestoreDB(); + } + // in last iteration, put smaller amount of data, + // so that backups can share sst files + int fill_up_to = std::min(keys_iteration * (i + 1), max_key); + FillDB(db_.get(), keys_iteration * i, fill_up_to); + // we should get consistent results with flush_before_backup + // set to both true and false + ASSERT_OK(db_->CreateNewBackup(!!(rnd.Next() % 2))); + } + // close and destroy + CloseBackupableDB(); + DestroyDB(dbname_, Options()); + + // ---- make sure it's empty ---- + DB* db = OpenDB(); + AssertEmpty(db, 0, max_key); + delete db; + + // ---- restore every backup and verify all the data is there ---- + OpenRestoreDB(); + for (int i = 1; i <= 5; ++i) { + if (i == 2) { + // we deleted backup 2 + Status s = restore_db_->RestoreDBFromBackup(2, dbname_, dbname_); + ASSERT_TRUE(!s.ok()); + } else { + int fill_up_to = std::min(keys_iteration * i, max_key); + AssertBackupConsistency(i, 0, fill_up_to, max_key); + } + } + + // delete some backups -- this should leave only backups 3 and 5 alive + ASSERT_OK(restore_db_->DeleteBackup(4)); + ASSERT_OK(restore_db_->PurgeOldBackups(2)); + + std::vector backup_info; + restore_db_->GetBackupInfo(&backup_info); + ASSERT_EQ(2UL, backup_info.size()); + + // check backup 3 + AssertBackupConsistency(3, 0, 3 * keys_iteration, max_key); + // check backup 5 + AssertBackupConsistency(5, 0, max_key); + + CloseRestoreDB(); +} + +TEST(BackupableDBTest, FailOverwritingBackups) { + options_.write_buffer_size = 1024 * 1024 * 1024; // 1GB + // create backups 1, 2, 3, 4, 5 + OpenBackupableDB(true); + for (int i = 0; i < 5; ++i) { + CloseBackupableDB(); + DeleteLogFiles(); + OpenBackupableDB(false); + FillDB(db_.get(), 100 * i, 100 * (i + 1)); + ASSERT_OK(db_->CreateNewBackup(true)); + } + CloseBackupableDB(); + + // restore 3 + OpenRestoreDB(); + ASSERT_OK(restore_db_->RestoreDBFromBackup(3, dbname_, dbname_)); + CloseRestoreDB(); + + OpenBackupableDB(false); + FillDB(db_.get(), 0, 300); + Status s = db_->CreateNewBackup(true); + // the new backup fails because new table files + // clash with old table files from backups 4 and 5 + // (since write_buffer_size is huge, we can be sure that + // each backup will generate only one sst file and that + // a file generated by a new backup is the same as + // sst file generated by backup 4) + ASSERT_TRUE(s.IsCorruption()); + ASSERT_OK(db_->DeleteBackup(4)); + ASSERT_OK(db_->DeleteBackup(5)); + // now, the backup can succeed + ASSERT_OK(db_->CreateNewBackup(true)); + CloseBackupableDB(); +} + +TEST(BackupableDBTest, NoShareTableFiles) { + const int keys_iteration = 5000; + OpenBackupableDB(true, false, false); + for (int i = 0; i < 5; ++i) { + FillDB(db_.get(), keys_iteration * i, keys_iteration * (i + 1)); + ASSERT_OK(db_->CreateNewBackup(!!(i % 2))); + } + CloseBackupableDB(); + + for (int i = 0; i < 5; ++i) { + AssertBackupConsistency(i + 1, 0, keys_iteration * (i + 1), + keys_iteration * 6); + } +} + +// Verify that you can backup and restore with share_files_with_checksum on +TEST(BackupableDBTest, ShareTableFilesWithChecksums) { + const int keys_iteration = 5000; + OpenBackupableDB(true, false, true, true); + for (int i = 0; i < 5; ++i) { + FillDB(db_.get(), keys_iteration * i, keys_iteration * (i + 1)); + ASSERT_OK(db_->CreateNewBackup(!!(i % 2))); + } + CloseBackupableDB(); + + for (int i = 0; i < 5; ++i) { + AssertBackupConsistency(i + 1, 0, keys_iteration * (i + 1), + keys_iteration * 6); + } +} + +// Verify that you can backup 
and restore using share_files_with_checksum set to +// false and then transition this option to true +TEST(BackupableDBTest, ShareTableFilesWithChecksumsTransition) { + const int keys_iteration = 5000; + // set share_files_with_checksum to false + OpenBackupableDB(true, false, true, false); + for (int i = 0; i < 5; ++i) { + FillDB(db_.get(), keys_iteration * i, keys_iteration * (i + 1)); + ASSERT_OK(db_->CreateNewBackup(true)); + } + CloseBackupableDB(); + + for (int i = 0; i < 5; ++i) { + AssertBackupConsistency(i + 1, 0, keys_iteration * (i + 1), + keys_iteration * 6); + } + + // set share_files_with_checksum to true and do some more backups + OpenBackupableDB(true, false, true, true); + for (int i = 5; i < 10; ++i) { + FillDB(db_.get(), keys_iteration * i, keys_iteration * (i + 1)); + ASSERT_OK(db_->CreateNewBackup(true)); + } + CloseBackupableDB(); + + for (int i = 0; i < 5; ++i) { + AssertBackupConsistency(i + 1, 0, keys_iteration * (i + 5 + 1), + keys_iteration * 11); + } +} + +TEST(BackupableDBTest, DeleteTmpFiles) { + OpenBackupableDB(); + CloseBackupableDB(); + std::string shared_tmp = backupdir_ + "/shared/00006.sst.tmp"; + std::string private_tmp_dir = backupdir_ + "/private/10.tmp"; + std::string private_tmp_file = private_tmp_dir + "/00003.sst"; + file_manager_->WriteToFile(shared_tmp, "tmp"); + file_manager_->CreateDir(private_tmp_dir); + file_manager_->WriteToFile(private_tmp_file, "tmp"); + ASSERT_EQ(true, file_manager_->FileExists(private_tmp_dir)); + OpenBackupableDB(); + CloseBackupableDB(); + ASSERT_EQ(false, file_manager_->FileExists(shared_tmp)); + ASSERT_EQ(false, file_manager_->FileExists(private_tmp_file)); + ASSERT_EQ(false, file_manager_->FileExists(private_tmp_dir)); +} + +TEST(BackupableDBTest, KeepLogFiles) { + backupable_options_->backup_log_files = false; + // basically infinite + options_.WAL_ttl_seconds = 24 * 60 * 60; + OpenBackupableDB(true); + FillDB(db_.get(), 0, 100); + ASSERT_OK(db_->Flush(FlushOptions())); + FillDB(db_.get(), 100, 200); + ASSERT_OK(db_->CreateNewBackup(false)); + FillDB(db_.get(), 200, 300); + ASSERT_OK(db_->Flush(FlushOptions())); + FillDB(db_.get(), 300, 400); + ASSERT_OK(db_->Flush(FlushOptions())); + FillDB(db_.get(), 400, 500); + ASSERT_OK(db_->Flush(FlushOptions())); + CloseBackupableDB(); + + // all data should be there if we call with keep_log_files = true + AssertBackupConsistency(0, 0, 500, 600, true); +} + +TEST(BackupableDBTest, RateLimiting) { + uint64_t const KB = 1024 * 1024; + size_t const kMicrosPerSec = 1000 * 1000LL; + + std::vector> limits( + {{KB, 5 * KB}, {2 * KB, 3 * KB}}); + + for (const auto& limit : limits) { + // destroy old data + DestroyDB(dbname_, Options()); + + backupable_options_->backup_rate_limit = limit.first; + backupable_options_->restore_rate_limit = limit.second; + options_.compression = kNoCompression; + OpenBackupableDB(true); + size_t bytes_written = FillDB(db_.get(), 0, 100000); + + auto start_backup = env_->NowMicros(); + ASSERT_OK(db_->CreateNewBackup(false)); + auto backup_time = env_->NowMicros() - start_backup; + auto rate_limited_backup_time = (bytes_written * kMicrosPerSec) / + backupable_options_->backup_rate_limit; + ASSERT_GT(backup_time, 0.9 * rate_limited_backup_time); + + CloseBackupableDB(); + + OpenRestoreDB(); + auto start_restore = env_->NowMicros(); + ASSERT_OK(restore_db_->RestoreDBFromLatestBackup(dbname_, dbname_)); + auto restore_time = env_->NowMicros() - start_restore; + CloseRestoreDB(); + auto rate_limited_restore_time = (bytes_written * kMicrosPerSec) / + 
backupable_options_->restore_rate_limit; + ASSERT_GT(restore_time, 0.9 * rate_limited_restore_time); + + AssertBackupConsistency(0, 0, 100000, 100010); + } +} + +TEST(BackupableDBTest, ReadOnlyBackupEngine) { + DestroyDB(dbname_, Options()); + OpenBackupableDB(true); + FillDB(db_.get(), 0, 100); + ASSERT_OK(db_->CreateNewBackup(true)); + FillDB(db_.get(), 100, 200); + ASSERT_OK(db_->CreateNewBackup(true)); + CloseBackupableDB(); + DestroyDB(dbname_, Options()); + + backupable_options_->destroy_old_data = false; + test_backup_env_->ClearWrittenFiles(); + test_backup_env_->SetLimitDeleteFiles(0); + auto read_only_backup_engine = + BackupEngineReadOnly::NewReadOnlyBackupEngine(env_, *backupable_options_); + std::vector backup_info; + read_only_backup_engine->GetBackupInfo(&backup_info); + ASSERT_EQ(backup_info.size(), 2U); + + RestoreOptions restore_options(false); + ASSERT_OK(read_only_backup_engine->RestoreDBFromLatestBackup( + dbname_, dbname_, restore_options)); + delete read_only_backup_engine; + std::vector should_have_written; + test_backup_env_->AssertWrittenFiles(should_have_written); + + DB* db = OpenDB(); + AssertExists(db, 0, 200); + delete db; +} + +} // anon namespace + +} // namespace rocksdb + +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +} diff --git a/utilities/geodb/geodb_impl.cc b/utilities/geodb/geodb_impl.cc new file mode 100644 index 0000000000..065e5ca35e --- /dev/null +++ b/utilities/geodb/geodb_impl.cc @@ -0,0 +1,431 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +#ifndef ROCKSDB_LITE + +#include "utilities/geodb/geodb_impl.h" + +#define __STDC_FORMAT_MACROS + +#include +#include +#include +#include +#include "db/filename.h" +#include "util/coding.h" + +// +// There are two types of keys. The first type of key-values +// maps a geo location to the set of object ids and their values. +// Table 1 +// key : p + : + $quadkey + : + $id + +// : + $latitude + : + $longitude +// value : value of the object +// This table can be used to find all objects that reside near +// a specified geolocation. +// +// Table 2 +// key : 'k' + : + $id +// value: $quadkey + +namespace rocksdb { + +GeoDBImpl::GeoDBImpl(DB* db, const GeoDBOptions& options) : + GeoDB(db, options), db_(db), options_(options) { +} + +GeoDBImpl::~GeoDBImpl() { +} + +Status GeoDBImpl::Insert(const GeoObject& obj) { + WriteBatch batch; + + // It is possible that this id is already associated with + // with a different position. We first have to remove that + // association before we can insert the new one. + + // remove existing object, if it exists + GeoObject old; + Status status = GetById(obj.id, &old); + if (status.ok()) { + assert(obj.id.compare(old.id) == 0); + std::string quadkey = PositionToQuad(old.position, Detail); + std::string key1 = MakeKey1(old.position, old.id, quadkey); + std::string key2 = MakeKey2(old.id); + batch.Delete(Slice(key1)); + batch.Delete(Slice(key2)); + } else if (status.IsNotFound()) { + // What if another thread is trying to insert the same ID concurrently? 
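+    // Note: this read-modify-write sequence is not atomic. GetById() above
+    // and the batch commit below are separate steps, so two concurrent
+    // Insert() calls with the same id can race and leave a stale Table 1
+    // row behind; callers that share ids need external synchronization.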
+ } else { + return status; + } + + // insert new object + std::string quadkey = PositionToQuad(obj.position, Detail); + std::string key1 = MakeKey1(obj.position, obj.id, quadkey); + std::string key2 = MakeKey2(obj.id); + batch.Put(Slice(key1), Slice(obj.value)); + batch.Put(Slice(key2), Slice(quadkey)); + return db_->Write(woptions_, &batch); +} + +Status GeoDBImpl::GetByPosition(const GeoPosition& pos, + const Slice& id, + std::string* value) { + std::string quadkey = PositionToQuad(pos, Detail); + std::string key1 = MakeKey1(pos, id, quadkey); + return db_->Get(roptions_, Slice(key1), value); +} + +Status GeoDBImpl::GetById(const Slice& id, GeoObject* object) { + Status status; + Slice quadkey; + + // create an iterator so that we can get a consistent picture + // of the database. + Iterator* iter = db_->NewIterator(roptions_); + + // create key for table2 + std::string kt = MakeKey2(id); + Slice key2(kt); + + iter->Seek(key2); + if (iter->Valid() && iter->status().ok()) { + if (iter->key().compare(key2) == 0) { + quadkey = iter->value(); + } + } + if (quadkey.size() == 0) { + delete iter; + return Status::NotFound(key2); + } + + // + // Seek to the quadkey + id prefix + // + std::string prefix = MakeKey1Prefix(quadkey.ToString(), id); + iter->Seek(Slice(prefix)); + assert(iter->Valid()); + if (!iter->Valid() || !iter->status().ok()) { + delete iter; + return Status::NotFound(); + } + + // split the key into p + quadkey + id + lat + lon + std::vector parts; + Slice key = iter->key(); + StringSplit(&parts, key.ToString(), ':'); + assert(parts.size() == 5); + assert(parts[0] == "p"); + assert(parts[1] == quadkey); + assert(parts[2] == id); + + // fill up output parameters + object->position.latitude = atof(parts[3].c_str()); + object->position.longitude = atof(parts[4].c_str()); + object->id = id.ToString(); // this is redundant + object->value = iter->value().ToString(); + delete iter; + return Status::OK(); +} + + +Status GeoDBImpl::Remove(const Slice& id) { + // Read the object from the database + GeoObject obj; + Status status = GetById(id, &obj); + if (!status.ok()) { + return status; + } + + // remove the object by atomically deleting it from both tables + std::string quadkey = PositionToQuad(obj.position, Detail); + std::string key1 = MakeKey1(obj.position, obj.id, quadkey); + std::string key2 = MakeKey2(obj.id); + WriteBatch batch; + batch.Delete(Slice(key1)); + batch.Delete(Slice(key2)); + return db_->Write(woptions_, &batch); +} + +Status GeoDBImpl::SearchRadial(const GeoPosition& pos, + double radius, + std::vector* values, + int number_of_values) { + // Gather all bounding quadkeys + std::vector qids; + Status s = searchQuadIds(pos, radius, &qids); + if (!s.ok()) { + return s; + } + + // create an iterator + Iterator* iter = db_->NewIterator(ReadOptions()); + + // Process each prospective quadkey + for (std::string qid : qids) { + // The user is interested in only these many objects. 
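+    // (number_of_values counts down as matches are appended below; once it
+    // hits zero, the remaining candidate quadkeys are skipped entirely)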
+    if (number_of_values == 0) {
+      break;
+    }
+
+    // convert quadkey to db key prefix
+    std::string dbkey = MakeQuadKeyPrefix(qid);
+
+    for (iter->Seek(dbkey);
+         number_of_values > 0 && iter->Valid() && iter->status().ok();
+         iter->Next()) {
+      // split the key into p + quadkey + id + lat + lon
+      std::vector<std::string> parts;
+      Slice key = iter->key();
+      StringSplit(&parts, key.ToString(), ':');
+      assert(parts.size() == 5);
+      assert(parts[0] == "p");
+      std::string* quadkey = &parts[1];
+
+      // If the key we are looking for is a prefix of the key
+      // we found from the database, then this is one of the keys
+      // we are looking for.
+      auto res = std::mismatch(qid.begin(), qid.end(), quadkey->begin());
+      if (res.first == qid.end()) {
+        GeoPosition pos(atof(parts[3].c_str()), atof(parts[4].c_str()));
+        // parts[2] is the $id component of the key (parts[4] is $longitude)
+        GeoObject obj(pos, parts[2], iter->value().ToString());
+        values->push_back(obj);
+        number_of_values--;
+      } else {
+        break;
+      }
+    }
+  }
+  delete iter;
+  return Status::OK();
+}
+
+std::string GeoDBImpl::MakeKey1(const GeoPosition& pos, Slice id,
+                                std::string quadkey) {
+  std::string lat = std::to_string(pos.latitude);
+  std::string lon = std::to_string(pos.longitude);
+  std::string key = "p:";
+  key.reserve(5 + quadkey.size() + id.size() + lat.size() + lon.size());
+  key.append(quadkey);
+  key.append(":");
+  key.append(id.ToString());
+  key.append(":");
+  key.append(lat);
+  key.append(":");
+  key.append(lon);
+  return key;
+}
+
+std::string GeoDBImpl::MakeKey2(Slice id) {
+  std::string key = "k:";
+  key.append(id.ToString());
+  return key;
+}
+
+std::string GeoDBImpl::MakeKey1Prefix(std::string quadkey,
+                                      Slice id) {
+  std::string key = "p:";
+  key.reserve(3 + quadkey.size() + id.size());
+  key.append(quadkey);
+  key.append(":");
+  key.append(id.ToString());
+  return key;
+}
+
+std::string GeoDBImpl::MakeQuadKeyPrefix(std::string quadkey) {
+  std::string key = "p:";
+  key.append(quadkey);
+  return key;
+}
+
+void GeoDBImpl::StringSplit(std::vector<std::string>* tokens,
+                            const std::string& text, char sep) {
+  std::size_t start = 0, end = 0;
+  while ((end = text.find(sep, start)) != std::string::npos) {
+    tokens->push_back(text.substr(start, end - start));
+    start = end + 1;
+  }
+  tokens->push_back(text.substr(start));
+}
+
+// convert degrees to radians
+double GeoDBImpl::radians(double x) {
+  return (x * PI) / 180;
+}
+
+// convert radians to degrees
+double GeoDBImpl::degrees(double x) {
+  return (x * 180) / PI;
+}
+
+// convert a gps location to quad coordinate
+std::string GeoDBImpl::PositionToQuad(const GeoPosition& pos,
+                                      int levelOfDetail) {
+  Pixel p = PositionToPixel(pos, levelOfDetail);
+  Tile tile = PixelToTile(p);
+  return TileToQuadKey(tile, levelOfDetail);
+}
+
+GeoPosition GeoDBImpl::displaceLatLon(double lat, double lon,
+                                      double deltay, double deltax) {
+  double dLat = deltay / EarthRadius;
+  double dLon = deltax / (EarthRadius * cos(radians(lat)));
+  return GeoPosition(lat + degrees(dLat),
+                     lon + degrees(dLon));
+}
+
+//
+// Return the distance between two positions on the earth
+//
+double GeoDBImpl::distance(double lat1, double lon1,
+                           double lat2, double lon2) {
+  double lon = radians(lon2 - lon1);
+  double lat = radians(lat2 - lat1);
+
+  double a = (sin(lat / 2) * sin(lat / 2)) +
+             cos(radians(lat1)) * cos(radians(lat2)) *
+             (sin(lon / 2) * sin(lon / 2));
+  double angle = 2 * atan2(sqrt(a), sqrt(1 - a));
+  return angle * EarthRadius;
+}
+
+//
+// Returns all the quadkeys inside the search range
+//
+Status GeoDBImpl::searchQuadIds(const GeoPosition& position,
+                                double radius,
+                                std::vector<std::string>*
quadKeys) { + // get the outline of the search square + GeoPosition topLeftPos = boundingTopLeft(position, radius); + GeoPosition bottomRightPos = boundingBottomRight(position, radius); + + Pixel topLeft = PositionToPixel(topLeftPos, Detail); + Pixel bottomRight = PositionToPixel(bottomRightPos, Detail); + + // how many level of details to look for + int numberOfTilesAtMaxDepth = floor((bottomRight.x - topLeft.x) / 256); + int zoomLevelsToRise = floor(log(numberOfTilesAtMaxDepth) / log(2)); + zoomLevelsToRise++; + int levels = std::max(0, Detail - zoomLevelsToRise); + + quadKeys->push_back(PositionToQuad(GeoPosition(topLeftPos.latitude, + topLeftPos.longitude), + levels)); + quadKeys->push_back(PositionToQuad(GeoPosition(topLeftPos.latitude, + bottomRightPos.longitude), + levels)); + quadKeys->push_back(PositionToQuad(GeoPosition(bottomRightPos.latitude, + topLeftPos.longitude), + levels)); + quadKeys->push_back(PositionToQuad(GeoPosition(bottomRightPos.latitude, + bottomRightPos.longitude), + levels)); + return Status::OK(); +} + +// Determines the ground resolution (in meters per pixel) at a specified +// latitude and level of detail. +// Latitude (in degrees) at which to measure the ground resolution. +// Level of detail, from 1 (lowest detail) to 23 (highest detail). +// Returns the ground resolution, in meters per pixel. +double GeoDBImpl::GroundResolution(double latitude, int levelOfDetail) { + latitude = clip(latitude, MinLatitude, MaxLatitude); + return cos(latitude * PI / 180) * 2 * PI * EarthRadius / + MapSize(levelOfDetail); +} + +// Converts a point from latitude/longitude WGS-84 coordinates (in degrees) +// into pixel XY coordinates at a specified level of detail. +GeoDBImpl::Pixel GeoDBImpl::PositionToPixel(const GeoPosition& pos, + int levelOfDetail) { + double latitude = clip(pos.latitude, MinLatitude, MaxLatitude); + double x = (pos.longitude + 180) / 360; + double sinLatitude = sin(latitude * PI / 180); + double y = 0.5 - log((1 + sinLatitude) / (1 - sinLatitude)) / (4 * PI); + double mapSize = MapSize(levelOfDetail); + double X = floor(clip(x * mapSize + 0.5, 0, mapSize - 1)); + double Y = floor(clip(y * mapSize + 0.5, 0, mapSize - 1)); + return Pixel((unsigned int)X, (unsigned int)Y); +} + +GeoPosition GeoDBImpl::PixelToPosition(const Pixel& pixel, int levelOfDetail) { + double mapSize = MapSize(levelOfDetail); + double x = (clip(pixel.x, 0, mapSize - 1) / mapSize) - 0.5; + double y = 0.5 - (clip(pixel.y, 0, mapSize - 1) / mapSize); + double latitude = 90 - 360 * atan(exp(-y * 2 * PI)) / PI; + double longitude = 360 * x; + return GeoPosition(latitude, longitude); +} + +// Converts a Pixel to a Tile +GeoDBImpl::Tile GeoDBImpl::PixelToTile(const Pixel& pixel) { + unsigned int tileX = floor(pixel.x / 256); + unsigned int tileY = floor(pixel.y / 256); + return Tile(tileX, tileY); +} + +GeoDBImpl::Pixel GeoDBImpl::TileToPixel(const Tile& tile) { + unsigned int pixelX = tile.x * 256; + unsigned int pixelY = tile.y * 256; + return Pixel(pixelX, pixelY); +} + +// Convert a Tile to a quadkey +std::string GeoDBImpl::TileToQuadKey(const Tile& tile, int levelOfDetail) { + std::stringstream quadKey; + for (int i = levelOfDetail; i > 0; i--) { + char digit = '0'; + int mask = 1 << (i - 1); + if ((tile.x & mask) != 0) { + digit++; + } + if ((tile.y & mask) != 0) { + digit++; + digit++; + } + quadKey << digit; + } + return quadKey.str(); +} + +// +// Convert a quadkey to a tile and its level of detail +// +void GeoDBImpl::QuadKeyToTile(std::string quadkey, Tile* tile, + int 
*levelOfDetail) {
+  tile->x = tile->y = 0;
+  *levelOfDetail = quadkey.size();
+  const char* key = reinterpret_cast<const char*>(quadkey.c_str());
+  for (int i = *levelOfDetail; i > 0; i--) {
+    int mask = 1 << (i - 1);
+    switch (key[*levelOfDetail - i]) {
+      case '0':
+        break;
+
+      case '1':
+        tile->x |= mask;
+        break;
+
+      case '2':
+        tile->y |= mask;
+        break;
+
+      case '3':
+        tile->x |= mask;
+        tile->y |= mask;
+        break;
+
+      default:
+        std::stringstream msg;
+        msg << quadkey;
+        msg << " Invalid QuadKey.";
+        throw std::runtime_error(msg.str());
+    }
+  }
+}
+}  // namespace rocksdb
+
+#endif  // ROCKSDB_LITE
diff --git a/utilities/geodb/geodb_impl.h b/utilities/geodb/geodb_impl.h
new file mode 100644
index 0000000000..4ee42ad29e
--- /dev/null
+++ b/utilities/geodb/geodb_impl.h
@@ -0,0 +1,191 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+
+#ifndef ROCKSDB_LITE
+
+#pragma once
+#include <algorithm>
+#include <cmath>
+#include <string>
+#include <sstream>
+#include <stdexcept>
+#include <vector>
+
+#include "utilities/geo_db.h"
+#include "utilities/stackable_db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/status.h"
+
+namespace rocksdb {
+
+// A specific implementation of GeoDB
+
+class GeoDBImpl : public GeoDB {
+ public:
+  GeoDBImpl(DB* db, const GeoDBOptions& options);
+  ~GeoDBImpl();
+
+  // Associate the GPS location with the object identified by 'id'. The value
+  // is a blob that is associated with this object.
+  virtual Status Insert(const GeoObject& object);
+
+  // Retrieve the value of the object located at the specified GPS
+  // location and identified by the 'id'.
+  virtual Status GetByPosition(const GeoPosition& pos,
+                               const Slice& id,
+                               std::string* value);
+
+  // Retrieve the value of the object identified by the 'id'.
This method + // could be potentially slower than GetByPosition + virtual Status GetById(const Slice& id, GeoObject* object); + + // Delete the specified object + virtual Status Remove(const Slice& id); + + // Returns a list of all items within a circular radius from the + // specified gps location + virtual Status SearchRadial(const GeoPosition& pos, + double radius, + std::vector* values, + int number_of_values); + + private: + DB* db_; + const GeoDBOptions options_; + const WriteOptions woptions_; + const ReadOptions roptions_; + + // The value of PI + static constexpr double PI = 3.141592653589793; + + // convert degrees to radians + static double radians(double x); + + // convert radians to degrees + static double degrees(double x); + + // A pixel class that captures X and Y coordinates + class Pixel { + public: + unsigned int x; + unsigned int y; + Pixel(unsigned int a, unsigned int b) : + x(a), y(b) { + } + }; + + // A Tile in the geoid + class Tile { + public: + unsigned int x; + unsigned int y; + Tile(unsigned int a, unsigned int b) : + x(a), y(b) { + } + }; + + // convert a gps location to quad coordinate + static std::string PositionToQuad(const GeoPosition& pos, int levelOfDetail); + + // arbitrary constant use for WGS84 via + // http://en.wikipedia.org/wiki/World_Geodetic_System + // http://mathforum.org/library/drmath/view/51832.html + // http://msdn.microsoft.com/en-us/library/bb259689.aspx + // http://www.tuicool.com/articles/NBrE73 + // + const int Detail = 23; + static constexpr double EarthRadius = 6378137; + static constexpr double MinLatitude = -85.05112878; + static constexpr double MaxLatitude = 85.05112878; + static constexpr double MinLongitude = -180; + static constexpr double MaxLongitude = 180; + + // clips a number to the specified minimum and maximum values. + static double clip(double n, double minValue, double maxValue) { + return fmin(fmax(n, minValue), maxValue); + } + + // Determines the map width and height (in pixels) at a specified level + // of detail, from 1 (lowest detail) to 23 (highest detail). + // Returns the map width and height in pixels. + static unsigned int MapSize(int levelOfDetail) { + return (unsigned int)(256 << levelOfDetail); + } + + // Determines the ground resolution (in meters per pixel) at a specified + // latitude and level of detail. + // Latitude (in degrees) at which to measure the ground resolution. + // Level of detail, from 1 (lowest detail) to 23 (highest detail). + // Returns the ground resolution, in meters per pixel. + static double GroundResolution(double latitude, int levelOfDetail); + + // Converts a point from latitude/longitude WGS-84 coordinates (in degrees) + // into pixel XY coordinates at a specified level of detail. 
+ static Pixel PositionToPixel(const GeoPosition& pos, int levelOfDetail); + + static GeoPosition PixelToPosition(const Pixel& pixel, int levelOfDetail); + + // Converts a Pixel to a Tile + static Tile PixelToTile(const Pixel& pixel); + + static Pixel TileToPixel(const Tile& tile); + + // Convert a Tile to a quadkey + static std::string TileToQuadKey(const Tile& tile, int levelOfDetail); + + // Convert a quadkey to a tile and its level of detail + static void QuadKeyToTile(std::string quadkey, Tile* tile, + int *levelOfDetail); + + // Return the distance between two positions on the earth + static double distance(double lat1, double lon1, + double lat2, double lon2); + static GeoPosition displaceLatLon(double lat, double lon, + double deltay, double deltax); + + // + // Returns the top left position after applying the delta to + // the specified position + // + static GeoPosition boundingTopLeft(const GeoPosition& in, double radius) { + return displaceLatLon(in.latitude, in.longitude, -radius, -radius); + } + + // + // Returns the bottom right position after applying the delta to + // the specified position + static GeoPosition boundingBottomRight(const GeoPosition& in, + double radius) { + return displaceLatLon(in.latitude, in.longitude, radius, radius); + } + + // + // Get all quadkeys within a radius of a specified position + // + Status searchQuadIds(const GeoPosition& position, + double radius, + std::vector* quadKeys); + + // splits a string into its components + static void StringSplit(std::vector* tokens, + const std::string &text, + char sep); + + // + // Create keys for accessing rocksdb table(s) + // + static std::string MakeKey1(const GeoPosition& pos, + Slice id, + std::string quadkey); + static std::string MakeKey2(Slice id); + static std::string MakeKey1Prefix(std::string quadkey, + Slice id); + static std::string MakeQuadKeyPrefix(std::string quadkey); +}; + +} // namespace rocksdb + +#endif // ROCKSDB_LITE diff --git a/utilities/geodb/geodb_test.cc b/utilities/geodb/geodb_test.cc new file mode 100644 index 0000000000..1a42e32477 --- /dev/null +++ b/utilities/geodb/geodb_test.cc @@ -0,0 +1,123 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
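+//
+// A worked example of the quadkey encoding exercised below: TileToQuadKey()
+// emits one base-4 digit per zoom level, with bit 0 of each digit taken from
+// tile.x and bit 1 from tile.y. For tile (x=3, y=5) at levelOfDetail 3:
+// level 3 sets only the y bit -> '2', level 2 sets only the x bit -> '1',
+// level 1 sets both -> '3', giving the quadkey "213".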
+// +// +#include "utilities/geodb/geodb_impl.h" + +#include +#include "util/testharness.h" + +namespace rocksdb { + +class GeoDBTest { + public: + static const std::string kDefaultDbName; + static Options options; + DB* db; + GeoDB* geodb; + + GeoDBTest() { + GeoDBOptions geodb_options; + ASSERT_OK(DestroyDB(kDefaultDbName, options)); + options.create_if_missing = true; + Status status = DB::Open(options, kDefaultDbName, &db); + geodb = new GeoDBImpl(db, geodb_options); + } + + ~GeoDBTest() { + delete geodb; + } + + GeoDB* getdb() { + return geodb; + } +}; + +const std::string GeoDBTest::kDefaultDbName = "/tmp/geodefault"; +Options GeoDBTest::options = Options(); + +// Insert, Get and Remove +TEST(GeoDBTest, SimpleTest) { + GeoPosition pos1(100, 101); + std::string id1("id1"); + std::string value1("value1"); + + // insert first object into database + GeoObject obj1(pos1, id1, value1); + Status status = getdb()->Insert(obj1); + ASSERT_TRUE(status.ok()); + + // insert second object into database + GeoPosition pos2(200, 201); + std::string id2("id2"); + std::string value2 = "value2"; + GeoObject obj2(pos2, id2, value2); + status = getdb()->Insert(obj2); + ASSERT_TRUE(status.ok()); + + // retrieve first object using position + std::string value; + status = getdb()->GetByPosition(pos1, Slice(id1), &value); + ASSERT_TRUE(status.ok()); + ASSERT_EQ(value, value1); + + // retrieve first object using id + GeoObject obj; + status = getdb()->GetById(Slice(id1), &obj); + ASSERT_TRUE(status.ok()); + ASSERT_EQ(obj.position.latitude, 100); + ASSERT_EQ(obj.position.longitude, 101); + ASSERT_EQ(obj.id.compare(id1), 0); + ASSERT_EQ(obj.value, value1); + + // delete first object + status = getdb()->Remove(Slice(id1)); + ASSERT_TRUE(status.ok()); + status = getdb()->GetByPosition(pos1, Slice(id1), &value); + ASSERT_TRUE(status.IsNotFound()); + status = getdb()->GetById(id1, &obj); + ASSERT_TRUE(status.IsNotFound()); + + // check that we can still find second object + status = getdb()->GetByPosition(pos2, id2, &value); + ASSERT_TRUE(status.ok()); + ASSERT_EQ(value, value2); + status = getdb()->GetById(id2, &obj); + ASSERT_TRUE(status.ok()); +} + +// Search. +// Verify distances via http://www.stevemorse.org/nearest/distance.php +TEST(GeoDBTest, Search) { + GeoPosition pos1(45, 45); + std::string id1("mid1"); + std::string value1 = "midvalue1"; + + // insert object at 45 degree latitude + GeoObject obj1(pos1, id1, value1); + Status status = getdb()->Insert(obj1); + ASSERT_TRUE(status.ok()); + + // search all objects centered at 46 degree latitude with + // a radius of 200 kilometers. We should find the one object that + // we inserted earlier. + std::vector values; + status = getdb()->SearchRadial(GeoPosition(46, 46), 200000, &values); + ASSERT_TRUE(status.ok()); + ASSERT_EQ(values.size(), 1U); + + // search all objects centered at 46 degree latitude with + // a radius of 2 kilometers. There should be none. + values.clear(); + status = getdb()->SearchRadial(GeoPosition(46, 46), 2, &values); + ASSERT_TRUE(status.ok()); + ASSERT_EQ(values.size(), 0U); +} + +} // namespace rocksdb + +int main(int argc, char* argv[]) { + return rocksdb::test::RunAllTests(); +} diff --git a/utilities/merge_operators.h b/utilities/merge_operators.h new file mode 100644 index 0000000000..fdf06645fc --- /dev/null +++ b/utilities/merge_operators.h @@ -0,0 +1,45 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. 
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+#ifndef MERGE_OPERATORS_H
+#define MERGE_OPERATORS_H
+
+#include <memory>
+#include <string>
+
+#include "rocksdb/merge_operator.h"
+
+namespace rocksdb {
+
+class MergeOperators {
+ public:
+  static std::shared_ptr<MergeOperator> CreatePutOperator();
+  static std::shared_ptr<MergeOperator> CreateUInt64AddOperator();
+  static std::shared_ptr<MergeOperator> CreateStringAppendOperator();
+  static std::shared_ptr<MergeOperator> CreateStringAppendTESTOperator();
+
+  // Will return a different merge operator depending on the string.
+  // TODO: Hook the "name" up to the actual Name() of the MergeOperators?
+  static std::shared_ptr<MergeOperator> CreateFromStringId(
+      const std::string& name) {
+    if (name == "put") {
+      return CreatePutOperator();
+    } else if (name == "uint64add") {
+      return CreateUInt64AddOperator();
+    } else if (name == "stringappend") {
+      return CreateStringAppendOperator();
+    } else if (name == "stringappendtest") {
+      return CreateStringAppendTESTOperator();
+    } else {
+      // Empty or unknown, just return nullptr
+      return nullptr;
+    }
+  }
+
+};
+
+}  // namespace rocksdb
+
+#endif
diff --git a/utilities/merge_operators/put.cc b/utilities/merge_operators/put.cc
new file mode 100644
index 0000000000..3330843130
--- /dev/null
+++ b/utilities/merge_operators/put.cc
@@ -0,0 +1,68 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+#include <memory>
+#include "rocksdb/slice.h"
+#include "rocksdb/merge_operator.h"
+#include "utilities/merge_operators.h"
+
+using namespace rocksdb;
+
+namespace {  // anonymous namespace
+
+// A merge operator that mimics Put semantics
+// Since this merge-operator will not be used in production,
+// it is implemented as a non-associative merge operator to illustrate the
+// new interface and for testing purposes. (That is, we inherit from
+// the MergeOperator class rather than the AssociativeMergeOperator
+// which would be simpler in this case).
+//
+// From the client-perspective, semantics are the same.
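+//
+// Illustrative semantics: with this operator installed, the sequence
+// Merge(key, "a"); Merge(key, "b"); Merge(key, "c"); followed by Get(key)
+// yields "c", exactly as three Put() calls would -- FullMerge() below keeps
+// only operand_sequence.back() and ignores the existing value.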
+class PutOperator : public MergeOperator {
+ public:
+  virtual bool FullMerge(const Slice& key,
+                         const Slice* existing_value,
+                         const std::deque<std::string>& operand_sequence,
+                         std::string* new_value,
+                         Logger* logger) const override {
+    // Put basically only looks at the current/latest value
+    assert(!operand_sequence.empty());
+    assert(new_value != nullptr);
+    new_value->assign(operand_sequence.back());
+    return true;
+  }
+
+  virtual bool PartialMerge(const Slice& key,
+                            const Slice& left_operand,
+                            const Slice& right_operand,
+                            std::string* new_value,
+                            Logger* logger) const override {
+    new_value->assign(right_operand.data(), right_operand.size());
+    return true;
+  }
+
+  using MergeOperator::PartialMergeMulti;
+  virtual bool PartialMergeMulti(const Slice& key,
+                                 const std::deque<Slice>& operand_list,
+                                 std::string* new_value, Logger* logger) const
+      override {
+    new_value->assign(operand_list.back().data(), operand_list.back().size());
+    return true;
+  }
+
+  virtual const char* Name() const override {
+    return "PutOperator";
+  }
+};
+
+}  // end of anonymous namespace
+
+namespace rocksdb {
+
+std::shared_ptr<MergeOperator> MergeOperators::CreatePutOperator() {
+  return std::make_shared<PutOperator>();
+}
+
+}
diff --git a/utilities/merge_operators/string_append/stringappend.cc b/utilities/merge_operators/string_append/stringappend.cc
new file mode 100644
index 0000000000..38cd22eb4a
--- /dev/null
+++ b/utilities/merge_operators/string_append/stringappend.cc
@@ -0,0 +1,60 @@
+/**
+ * A MergeOperator for rocksdb that implements string append.
+ * @author Deon Nicholas (dnicholas@fb.com)
+ * Copyright 2013 Facebook
+ */
+
+#include "stringappend.h"
+
+#include <memory>
+#include <assert.h>
+
+#include "rocksdb/slice.h"
+#include "rocksdb/merge_operator.h"
+#include "utilities/merge_operators.h"
+
+namespace rocksdb {
+
+// Constructor: also specify the delimiter character.
+StringAppendOperator::StringAppendOperator(char delim_char)
+    : delim_(delim_char) {
+}
+
+// Implementation for the merge operation (concatenates two strings)
+bool StringAppendOperator::Merge(const Slice& key,
+                                 const Slice* existing_value,
+                                 const Slice& value,
+                                 std::string* new_value,
+                                 Logger* logger) const {
+
+  // Clear the *new_value for writing.
+  assert(new_value);
+  new_value->clear();
+
+  if (!existing_value) {
+    // No existing_value. Set *new_value = value
+    new_value->assign(value.data(), value.size());
+  } else {
+    // Generic append (existing_value != null).
+    // Reserve *new_value to correct size, and apply concatenation.
+    new_value->reserve(existing_value->size() + 1 + value.size());
+    new_value->assign(existing_value->data(), existing_value->size());
+    new_value->append(1, delim_);
+    new_value->append(value.data(), value.size());
+  }
+
+  return true;
+}
+
+const char* StringAppendOperator::Name() const {
+  return "StringAppendOperator";
+}
+
+std::shared_ptr<MergeOperator> MergeOperators::CreateStringAppendOperator() {
+  return std::make_shared<StringAppendOperator>(',');
+}
+
+}  // namespace rocksdb
+
diff --git a/utilities/merge_operators/string_append/stringappend.h b/utilities/merge_operators/string_append/stringappend.h
new file mode 100644
index 0000000000..ca5b97ec99
--- /dev/null
+++ b/utilities/merge_operators/string_append/stringappend.h
@@ -0,0 +1,31 @@
+/**
+ * A MergeOperator for rocksdb that implements string append.
+ * @author Deon Nicholas (dnicholas@fb.com) + * Copyright 2013 Facebook + */ + +#pragma once +#include "rocksdb/merge_operator.h" +#include "rocksdb/slice.h" + +namespace rocksdb { + +class StringAppendOperator : public AssociativeMergeOperator { + public: + StringAppendOperator(char delim_char); /// Constructor: specify delimiter + + virtual bool Merge(const Slice& key, + const Slice* existing_value, + const Slice& value, + std::string* new_value, + Logger* logger) const override; + + virtual const char* Name() const override; + + private: + char delim_; // The delimiter is inserted between elements + +}; + +} // namespace rocksdb + diff --git a/utilities/merge_operators/string_append/stringappend2.cc b/utilities/merge_operators/string_append/stringappend2.cc new file mode 100644 index 0000000000..b2e03588f7 --- /dev/null +++ b/utilities/merge_operators/string_append/stringappend2.cc @@ -0,0 +1,113 @@ +/** + * @author Deon Nicholas (dnicholas@fb.com) + * Copyright 2013 Facebook + */ + +#include "stringappend2.h" + +#include +#include +#include + +#include "rocksdb/slice.h" +#include "rocksdb/merge_operator.h" +#include "utilities/merge_operators.h" + +namespace rocksdb { + +// Constructor: also specify the delimiter character. +StringAppendTESTOperator::StringAppendTESTOperator(char delim_char) + : delim_(delim_char) { +} + +// Implementation for the merge operation (concatenates two strings) +bool StringAppendTESTOperator::FullMerge( + const Slice& key, + const Slice* existing_value, + const std::deque& operands, + std::string* new_value, + Logger* logger) const { + + // Clear the *new_value for writing. + assert(new_value); + new_value->clear(); + + // Compute the space needed for the final result. + int numBytes = 0; + for(auto it = operands.begin(); it != operands.end(); ++it) { + numBytes += it->size() + 1; // Plus 1 for the delimiter + } + + // Only print the delimiter after the first entry has been printed + bool printDelim = false; + + // Prepend the *existing_value if one exists. + if (existing_value) { + new_value->reserve(numBytes + existing_value->size()); + new_value->append(existing_value->data(), existing_value->size()); + printDelim = true; + } else if (numBytes) { + new_value->reserve(numBytes-1); // Minus 1 since we have one less delimiter + } + + // Concatenate the sequence of strings (and add a delimiter between each) + for(auto it = operands.begin(); it != operands.end(); ++it) { + if (printDelim) { + new_value->append(1,delim_); + } + new_value->append(*it); + printDelim = true; + } + + return true; +} + +bool StringAppendTESTOperator::PartialMergeMulti( + const Slice& key, const std::deque& operand_list, + std::string* new_value, Logger* logger) const { + return false; +} + +// A version of PartialMerge that actually performs "partial merging". +// Use this to simulate the exact behaviour of the StringAppendOperator. +bool StringAppendTESTOperator::_AssocPartialMergeMulti( + const Slice& key, const std::deque& operand_list, + std::string* new_value, Logger* logger) const { + // Clear the *new_value for writing + assert(new_value); + new_value->clear(); + assert(operand_list.size() >= 2); + + // Generic append + // Determine and reserve correct size for *new_value. 
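+  // (sum of the operand sizes plus operand_list.size() - 1 delimiters;
+  // e.g. operands {"a", "bc", "d"} with delim ',' reserve 6 bytes, "a,bc,d")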
+ size_t size = 0; + for (const auto& operand : operand_list) { + size += operand.size(); + } + size += operand_list.size() - 1; // Delimiters + new_value->reserve(size); + + // Apply concatenation + new_value->assign(operand_list.front().data(), operand_list.front().size()); + + for (std::deque::const_iterator it = operand_list.begin() + 1; + it != operand_list.end(); ++it) { + new_value->append(1, delim_); + new_value->append(it->data(), it->size()); + } + + return true; +} + +const char* StringAppendTESTOperator::Name() const { + return "StringAppendTESTOperator"; +} + + +std::shared_ptr +MergeOperators::CreateStringAppendTESTOperator() { + return std::make_shared(','); +} + +} // namespace rocksdb + diff --git a/utilities/merge_operators/string_append/stringappend2.h b/utilities/merge_operators/string_append/stringappend2.h new file mode 100644 index 0000000000..5e506ef8fe --- /dev/null +++ b/utilities/merge_operators/string_append/stringappend2.h @@ -0,0 +1,51 @@ +/** + * A TEST MergeOperator for rocksdb that implements string append. + * It is built using the MergeOperator interface rather than the simpler + * AssociativeMergeOperator interface. This is useful for testing/benchmarking. + * While the two operators are semantically the same, all production code + * should use the StringAppendOperator defined in stringappend.{h,cc}. The + * operator defined in the present file is primarily for testing. + * + * @author Deon Nicholas (dnicholas@fb.com) + * Copyright 2013 Facebook + */ + +#pragma once +#include +#include + +#include "rocksdb/merge_operator.h" +#include "rocksdb/slice.h" + +namespace rocksdb { + +class StringAppendTESTOperator : public MergeOperator { + public: + // Constructor with delimiter + explicit StringAppendTESTOperator(char delim_char); + + virtual bool FullMerge(const Slice& key, + const Slice* existing_value, + const std::deque& operand_sequence, + std::string* new_value, + Logger* logger) const override; + + virtual bool PartialMergeMulti(const Slice& key, + const std::deque& operand_list, + std::string* new_value, Logger* logger) const + override; + + virtual const char* Name() const override; + + private: + // A version of PartialMerge that actually performs "partial merging". + // Use this to simulate the exact behaviour of the StringAppendOperator. + bool _AssocPartialMergeMulti(const Slice& key, + const std::deque& operand_list, + std::string* new_value, Logger* logger) const; + + char delim_; // The delimiter is inserted between elements + +}; + +} // namespace rocksdb diff --git a/utilities/merge_operators/string_append/stringappend_test.cc b/utilities/merge_operators/string_append/stringappend_test.cc new file mode 100644 index 0000000000..a68186a3af --- /dev/null +++ b/utilities/merge_operators/string_append/stringappend_test.cc @@ -0,0 +1,595 @@ +/** + * An persistent map : key -> (list of strings), using rocksdb merge. + * This file is a test-harness / use-case for the StringAppendOperator. + * + * @author Deon Nicholas (dnicholas@fb.com) + * Copyright 2013 Facebook, Inc. 
+*/ + +#include +#include + +#include "rocksdb/db.h" +#include "rocksdb/merge_operator.h" +#include "utilities/merge_operators.h" +#include "utilities/merge_operators/string_append/stringappend.h" +#include "utilities/merge_operators/string_append/stringappend2.h" +#include "utilities/db_ttl.h" +#include "util/testharness.h" +#include "util/random.h" + +using namespace rocksdb; + +namespace rocksdb { + +// Path to the database on file system +const std::string kDbName = "/tmp/mergetestdb"; + +namespace { +// OpenDb opens a (possibly new) rocksdb database with a StringAppendOperator +std::shared_ptr OpenNormalDb(char delim_char) { + DB* db; + Options options; + options.create_if_missing = true; + options.merge_operator.reset(new StringAppendOperator(delim_char)); + ASSERT_OK(DB::Open(options, kDbName, &db)); + return std::shared_ptr(db); +} + +// Open a TtlDB with a non-associative StringAppendTESTOperator +std::shared_ptr OpenTtlDb(char delim_char) { + DBWithTTL* db; + Options options; + options.create_if_missing = true; + options.merge_operator.reset(new StringAppendTESTOperator(delim_char)); + ASSERT_OK(DBWithTTL::Open(options, kDbName, &db, 123456)); + return std::shared_ptr(db); +} +} // namespace + +/// StringLists represents a set of string-lists, each with a key-index. +/// Supports Append(list, string) and Get(list) +class StringLists { + public: + + //Constructor: specifies the rocksdb db + /* implicit */ + StringLists(std::shared_ptr db) + : db_(db), + merge_option_(), + get_option_() { + assert(db); + } + + // Append string val onto the list defined by key; return true on success + bool Append(const std::string& key, const std::string& val){ + Slice valSlice(val.data(), val.size()); + auto s = db_->Merge(merge_option_, key, valSlice); + + if (s.ok()) { + return true; + } else { + std::cerr << "ERROR " << s.ToString() << std::endl; + return false; + } + } + + // Returns the list of strings associated with key (or "" if does not exist) + bool Get(const std::string& key, std::string* const result){ + assert(result != nullptr); // we should have a place to store the result + auto s = db_->Get(get_option_, key, result); + + if (s.ok()) { + return true; + } + + // Either key does not exist, or there is some error. + *result = ""; // Always return empty string (just for convention) + + //NotFound is okay; just return empty (similar to std::map) + //But network or db errors, etc, should fail the test (or at least yell) + if (!s.IsNotFound()) { + std::cerr << "ERROR " << s.ToString() << std::endl; + } + + // Always return false if s.ok() was not true + return false; + } + + + private: + std::shared_ptr db_; + WriteOptions merge_option_; + ReadOptions get_option_; + +}; + + +// The class for unit-testing +class StringAppendOperatorTest { + public: + StringAppendOperatorTest() { + DestroyDB(kDbName, Options()); // Start each test with a fresh DB + } + + typedef std::shared_ptr (* OpenFuncPtr)(char); + + // Allows user to open databases with different configurations. + // e.g.: Can open a DB or a TtlDB, etc. 
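+  // (for example, a test driver might call SetOpenDbFunction(&OpenNormalDb)
+  // before running the suite, then run it again with &OpenTtlDb)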
+ static void SetOpenDbFunction(OpenFuncPtr func) { + OpenDb = func; + } + + protected: + static OpenFuncPtr OpenDb; +}; +StringAppendOperatorTest::OpenFuncPtr StringAppendOperatorTest::OpenDb = nullptr; + +// THE TEST CASES BEGIN HERE + +TEST(StringAppendOperatorTest, IteratorTest) { + auto db_ = OpenDb(','); + StringLists slists(db_); + + slists.Append("k1", "v1"); + slists.Append("k1", "v2"); + slists.Append("k1", "v3"); + + slists.Append("k2", "a1"); + slists.Append("k2", "a2"); + slists.Append("k2", "a3"); + + std::string res; + std::unique_ptr it(db_->NewIterator(ReadOptions())); + std::string k1("k1"); + std::string k2("k2"); + bool first = true; + for (it->Seek(k1); it->Valid(); it->Next()) { + res = it->value().ToString(); + if (first) { + ASSERT_EQ(res, "v1,v2,v3"); + first = false; + } else { + ASSERT_EQ(res, "a1,a2,a3"); + } + } + slists.Append("k2", "a4"); + slists.Append("k1", "v4"); + + // Snapshot should still be the same. Should ignore a4 and v4. + first = true; + for (it->Seek(k1); it->Valid(); it->Next()) { + res = it->value().ToString(); + if (first) { + ASSERT_EQ(res, "v1,v2,v3"); + first = false; + } else { + ASSERT_EQ(res, "a1,a2,a3"); + } + } + + + // Should release the snapshot and be aware of the new stuff now + it.reset(db_->NewIterator(ReadOptions())); + first = true; + for (it->Seek(k1); it->Valid(); it->Next()) { + res = it->value().ToString(); + if (first) { + ASSERT_EQ(res, "v1,v2,v3,v4"); + first = false; + } else { + ASSERT_EQ(res, "a1,a2,a3,a4"); + } + } + + // start from k2 this time. + for (it->Seek(k2); it->Valid(); it->Next()) { + res = it->value().ToString(); + if (first) { + ASSERT_EQ(res, "v1,v2,v3,v4"); + first = false; + } else { + ASSERT_EQ(res, "a1,a2,a3,a4"); + } + } + + slists.Append("k3", "g1"); + + it.reset(db_->NewIterator(ReadOptions())); + first = true; + std::string k3("k3"); + for(it->Seek(k2); it->Valid(); it->Next()) { + res = it->value().ToString(); + if (first) { + ASSERT_EQ(res, "a1,a2,a3,a4"); + first = false; + } else { + ASSERT_EQ(res, "g1"); + } + } + for(it->Seek(k3); it->Valid(); it->Next()) { + res = it->value().ToString(); + if (first) { + // should not be hit + ASSERT_EQ(res, "a1,a2,a3,a4"); + first = false; + } else { + ASSERT_EQ(res, "g1"); + } + } + +} + +TEST(StringAppendOperatorTest, SimpleTest) { + auto db = OpenDb(','); + StringLists slists(db); + + slists.Append("k1", "v1"); + slists.Append("k1", "v2"); + slists.Append("k1", "v3"); + + std::string res; + bool status = slists.Get("k1", &res); + + ASSERT_TRUE(status); + ASSERT_EQ(res, "v1,v2,v3"); +} + +TEST(StringAppendOperatorTest, SimpleDelimiterTest) { + auto db = OpenDb('|'); + StringLists slists(db); + + slists.Append("k1", "v1"); + slists.Append("k1", "v2"); + slists.Append("k1", "v3"); + + std::string res; + slists.Get("k1", &res); + ASSERT_EQ(res, "v1|v2|v3"); +} + +TEST(StringAppendOperatorTest, OneValueNoDelimiterTest) { + auto db = OpenDb('!'); + StringLists slists(db); + + slists.Append("random_key", "single_val"); + + std::string res; + slists.Get("random_key", &res); + ASSERT_EQ(res, "single_val"); +} + +TEST(StringAppendOperatorTest, VariousKeys) { + auto db = OpenDb('\n'); + StringLists slists(db); + + slists.Append("c", "asdasd"); + slists.Append("a", "x"); + slists.Append("b", "y"); + slists.Append("a", "t"); + slists.Append("a", "r"); + slists.Append("b", "2"); + slists.Append("c", "asdasd"); + + std::string a, b, c; + bool sa, sb, sc; + sa = slists.Get("a", &a); + sb = slists.Get("b", &b); + sc = slists.Get("c", &c); + + ASSERT_TRUE(sa && sb && 
+
+TEST(StringAppendOperatorTest, VariousKeys) {
+  auto db = OpenDb('\n');
+  StringLists slists(db);
+
+  slists.Append("c", "asdasd");
+  slists.Append("a", "x");
+  slists.Append("b", "y");
+  slists.Append("a", "t");
+  slists.Append("a", "r");
+  slists.Append("b", "2");
+  slists.Append("c", "asdasd");
+
+  std::string a, b, c;
+  bool sa, sb, sc;
+  sa = slists.Get("a", &a);
+  sb = slists.Get("b", &b);
+  sc = slists.Get("c", &c);
+
+  ASSERT_TRUE(sa && sb && sc);  // All three keys should have been found
+
+  ASSERT_EQ(a, "x\nt\nr");
+  ASSERT_EQ(b, "y\n2");
+  ASSERT_EQ(c, "asdasd\nasdasd");
+}
+
+// Generate semi random keys/words from a small distribution.
+TEST(StringAppendOperatorTest, RandomMixGetAppend) {
+  auto db = OpenDb(' ');
+  StringLists slists(db);
+
+  // Generate a list of random keys and values
+  const int kWordCount = 15;
+  std::string words[] = {"sdasd", "triejf", "fnjsdfn", "dfjisdfsf", "342839",
+                         "dsuha", "mabuais", "sadajsid", "jf9834hf", "2d9j89",
+                         "dj9823jd", "a", "dk02ed2dh", "$(jd4h984$(*", "mabz"};
+  const int kKeyCount = 6;
+  std::string keys[] = {"dhaiusdhu", "denidw", "daisda", "keykey", "muki",
+                        "shzassdianmd"};
+
+  // Will store a local copy of all data in order to verify correctness
+  std::map<std::string, std::string> parallel_copy;
+
+  // Generate a bunch of random queries (Append and Get)!
+  enum query_t { APPEND_OP, GET_OP, NUM_OPS };
+  Random randomGen(1337);  // deterministic seed; always get same results!
+
+  const int kNumQueries = 30;
+  for (int q = 0; q < kNumQueries; ++q) {
+    // Generate a random query (Append or Get) and random parameters
+    query_t query = (query_t)randomGen.Uniform((int)NUM_OPS);
+    std::string key = keys[randomGen.Uniform((int)kKeyCount)];
+    std::string word = words[randomGen.Uniform((int)kWordCount)];
+
+    // Apply the query and any checks.
+    if (query == APPEND_OP) {
+      // Apply the rocksdb test-harness Append defined above
+      slists.Append(key, word);  // apply the rocksdb append
+
+      // Apply the similar "Append" to the parallel copy
+      if (parallel_copy[key].size() > 0) {
+        parallel_copy[key] += " " + word;
+      } else {
+        parallel_copy[key] = word;
+      }
+
+    } else if (query == GET_OP) {
+      // Assumes that a non-existent key just returns <empty>
+      std::string res;
+      slists.Get(key, &res);
+      ASSERT_EQ(res, parallel_copy[key]);
+    }
+
+  }
+
+}
+
+TEST(StringAppendOperatorTest, BIGRandomMixGetAppend) {
+  auto db = OpenDb(' ');
+  StringLists slists(db);
+
+  // Generate a list of random keys and values
+  const int kWordCount = 15;
+  std::string words[] = {"sdasd", "triejf", "fnjsdfn", "dfjisdfsf", "342839",
+                         "dsuha", "mabuais", "sadajsid", "jf9834hf", "2d9j89",
+                         "dj9823jd", "a", "dk02ed2dh", "$(jd4h984$(*", "mabz"};
+  const int kKeyCount = 6;
+  std::string keys[] = {"dhaiusdhu", "denidw", "daisda", "keykey", "muki",
+                        "shzassdianmd"};
+
+  // Will store a local copy of all data in order to verify correctness
+  std::map<std::string, std::string> parallel_copy;
+
+  // Generate a bunch of random queries (Append and Get)!
+  enum query_t { APPEND_OP, GET_OP, NUM_OPS };
+  Random randomGen(9138204);  // deterministic seed
+
+  const int kNumQueries = 1000;
+  for (int q = 0; q < kNumQueries; ++q) {
+    // Generate a random query (Append or Get) and random parameters
+    query_t query = (query_t)randomGen.Uniform((int)NUM_OPS);
+    std::string key = keys[randomGen.Uniform((int)kKeyCount)];
+    std::string word = words[randomGen.Uniform((int)kWordCount)];
+
+    // Apply the query and any checks.
+    if (query == APPEND_OP) {
+      // Apply the rocksdb test-harness Append defined above
+      slists.Append(key, word);  // apply the rocksdb append
+
+      // Apply the similar "Append" to the parallel copy
+      if (parallel_copy[key].size() > 0) {
+        parallel_copy[key] += " " + word;
+      } else {
+        parallel_copy[key] = word;
+      }
+
+    } else if (query == GET_OP) {
+      // Assumes that a non-existent key just returns <empty>
+      std::string res;
+      slists.Get(key, &res);
+      ASSERT_EQ(res, parallel_copy[key]);
+    }
+
+  }
+
+}
+
+
+TEST(StringAppendOperatorTest, PersistentVariousKeys) {
+  // Perform the following operations in limited scope
+  {
+    auto db = OpenDb('\n');
+    StringLists slists(db);
+
+    slists.Append("c", "asdasd");
+    slists.Append("a", "x");
+    slists.Append("b", "y");
+    slists.Append("a", "t");
+    slists.Append("a", "r");
+    slists.Append("b", "2");
+    slists.Append("c", "asdasd");
+
+    std::string a, b, c;
+    slists.Get("a", &a);
+    slists.Get("b", &b);
+    slists.Get("c", &c);
+
+    ASSERT_EQ(a, "x\nt\nr");
+    ASSERT_EQ(b, "y\n2");
+    ASSERT_EQ(c, "asdasd\nasdasd");
+  }
+
+  // Reopen the database (the previous changes should persist / be remembered)
+  {
+    auto db = OpenDb('\n');
+    StringLists slists(db);
+
+    slists.Append("c", "bbnagnagsx");
+    slists.Append("a", "sa");
+    slists.Append("b", "df");
+    slists.Append("a", "gh");
+    slists.Append("a", "jk");
+    slists.Append("b", "l;");
+    slists.Append("c", "rogosh");
+
+    // The previous changes should be on disk (L0)
+    // The most recent changes should be in memory (MemTable)
+    // Hence, this will test both Get() paths.
+ std::string a, b, c; + slists.Get("a", &a); + slists.Get("b", &b); + slists.Get("c", &c); + + ASSERT_EQ(a, "x\nt\nr\nsa\ngh\njk"); + ASSERT_EQ(b, "y\n2\ndf\nl;"); + ASSERT_EQ(c, "asdasd\nasdasd\nbbnagnagsx\nrogosh"); + } + + // Reopen the database (the previous changes should persist / be remembered) + { + auto db = OpenDb('\n'); + StringLists slists(db); + + // All changes should be on disk. This will test VersionSet Get() + std::string a, b, c; + slists.Get("a", &a); + slists.Get("b", &b); + slists.Get("c", &c); + + ASSERT_EQ(a, "x\nt\nr\nsa\ngh\njk"); + ASSERT_EQ(b, "y\n2\ndf\nl;"); + ASSERT_EQ(c, "asdasd\nasdasd\nbbnagnagsx\nrogosh"); + } +} + +TEST(StringAppendOperatorTest, PersistentFlushAndCompaction) { + // Perform the following operations in limited scope + { + auto db = OpenDb('\n'); + StringLists slists(db); + std::string a, b, c; + bool success; + + // Append, Flush, Get + slists.Append("c", "asdasd"); + db->Flush(rocksdb::FlushOptions()); + success = slists.Get("c", &c); + ASSERT_TRUE(success); + ASSERT_EQ(c, "asdasd"); + + // Append, Flush, Append, Get + slists.Append("a", "x"); + slists.Append("b", "y"); + db->Flush(rocksdb::FlushOptions()); + slists.Append("a", "t"); + slists.Append("a", "r"); + slists.Append("b", "2"); + + success = slists.Get("a", &a); + assert(success == true); + ASSERT_EQ(a, "x\nt\nr"); + + success = slists.Get("b", &b); + assert(success == true); + ASSERT_EQ(b, "y\n2"); + + // Append, Get + success = slists.Append("c", "asdasd"); + assert(success); + success = slists.Append("b", "monkey"); + assert(success); + + // I omit the "assert(success)" checks here. + slists.Get("a", &a); + slists.Get("b", &b); + slists.Get("c", &c); + + ASSERT_EQ(a, "x\nt\nr"); + ASSERT_EQ(b, "y\n2\nmonkey"); + ASSERT_EQ(c, "asdasd\nasdasd"); + } + + // Reopen the database (the previous changes should persist / be remembered) + { + auto db = OpenDb('\n'); + StringLists slists(db); + std::string a, b, c; + + // Get (Quick check for persistence of previous database) + slists.Get("a", &a); + ASSERT_EQ(a, "x\nt\nr"); + + //Append, Compact, Get + slists.Append("c", "bbnagnagsx"); + slists.Append("a", "sa"); + slists.Append("b", "df"); + db->CompactRange(nullptr, nullptr); + slists.Get("a", &a); + slists.Get("b", &b); + slists.Get("c", &c); + ASSERT_EQ(a, "x\nt\nr\nsa"); + ASSERT_EQ(b, "y\n2\nmonkey\ndf"); + ASSERT_EQ(c, "asdasd\nasdasd\nbbnagnagsx"); + + // Append, Get + slists.Append("a", "gh"); + slists.Append("a", "jk"); + slists.Append("b", "l;"); + slists.Append("c", "rogosh"); + slists.Get("a", &a); + slists.Get("b", &b); + slists.Get("c", &c); + ASSERT_EQ(a, "x\nt\nr\nsa\ngh\njk"); + ASSERT_EQ(b, "y\n2\nmonkey\ndf\nl;"); + ASSERT_EQ(c, "asdasd\nasdasd\nbbnagnagsx\nrogosh"); + + // Compact, Get + db->CompactRange(nullptr, nullptr); + ASSERT_EQ(a, "x\nt\nr\nsa\ngh\njk"); + ASSERT_EQ(b, "y\n2\nmonkey\ndf\nl;"); + ASSERT_EQ(c, "asdasd\nasdasd\nbbnagnagsx\nrogosh"); + + // Append, Flush, Compact, Get + slists.Append("b", "afcg"); + db->Flush(rocksdb::FlushOptions()); + db->CompactRange(nullptr, nullptr); + slists.Get("b", &b); + ASSERT_EQ(b, "y\n2\nmonkey\ndf\nl;\nafcg"); + } +} + +TEST(StringAppendOperatorTest, SimpleTestNullDelimiter) { + auto db = OpenDb('\0'); + StringLists slists(db); + + slists.Append("k1", "v1"); + slists.Append("k1", "v2"); + slists.Append("k1", "v3"); + + std::string res; + bool status = slists.Get("k1", &res); + ASSERT_TRUE(status); + + // Construct the desired string. Default constructor doesn't like '\0' chars. 
+  std::string checker("v1,v2,v3");    // Verify that the string is right size.
+  checker[2] = '\0';                  // Use null delimiter instead of comma.
+  checker[5] = '\0';
+  assert(checker.size() == 8);        // Verify it is still the correct size
+
+  // Check that the rocksdb result string matches the desired string
+  assert(res.size() == checker.size());
+  ASSERT_EQ(res, checker);
+}
+
+} // namespace rocksdb
+
+int main(int argc, char** argv) {
+  // Run with regular database
+  {
+    fprintf(stderr, "Running tests with regular db and operator.\n");
+    StringAppendOperatorTest::SetOpenDbFunction(&OpenNormalDb);
+    rocksdb::test::RunAllTests();
+  }
+
+  // Run with TTL
+  {
+    fprintf(stderr, "Running tests with ttl db and generic operator.\n");
+    StringAppendOperatorTest::SetOpenDbFunction(&OpenTtlDb);
+    rocksdb::test::RunAllTests();
+  }
+
+  return 0;
+}
diff --git a/utilities/merge_operators/uint64add.cc b/utilities/merge_operators/uint64add.cc
new file mode 100644
index 0000000000..9d78651ec8
--- /dev/null
+++ b/utilities/merge_operators/uint64add.cc
@@ -0,0 +1,65 @@
+#include <memory>
+#include "rocksdb/env.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/slice.h"
+#include "util/coding.h"
+#include "utilities/merge_operators.h"
+
+using namespace rocksdb;
+
+namespace { // anonymous namespace
+
+// A 'model' merge operator with uint64 addition semantics
+// Implemented as an AssociativeMergeOperator for simplicity and example.
+class UInt64AddOperator : public AssociativeMergeOperator {
+ public:
+  virtual bool Merge(const Slice& key,
+                     const Slice* existing_value,
+                     const Slice& value,
+                     std::string* new_value,
+                     Logger* logger) const override {
+    uint64_t orig_value = 0;
+    if (existing_value) {
+      orig_value = DecodeInteger(*existing_value, logger);
+    }
+    uint64_t operand = DecodeInteger(value, logger);
+
+    assert(new_value);
+    new_value->clear();
+    PutFixed64(new_value, orig_value + operand);
+
+    return true;  // Return true always since corruption will be treated as 0
+  }
+
+  virtual const char* Name() const override {
+    return "UInt64AddOperator";
+  }
+
+ private:
+  // Takes the string and decodes it into a uint64_t
+  // On error, prints a message and returns 0
+  uint64_t DecodeInteger(const Slice& value, Logger* logger) const {
+    uint64_t result = 0;
+
+    if (value.size() == sizeof(uint64_t)) {
+      result = DecodeFixed64(value.data());
+    } else if (logger != nullptr) {
+      // If value is corrupted, treat it as 0
+      Log(logger, "uint64 value corruption, size: %zu > %zu",
+          value.size(), sizeof(uint64_t));
+    }
+
+    return result;
+  }
+
+};
+
+}
+
+namespace rocksdb {
+
+std::shared_ptr<MergeOperator> MergeOperators::CreateUInt64AddOperator() {
+  return std::make_shared<UInt64AddOperator>();
+}
+
+}
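+
+// Usage sketch (illustrative only; not part of the library API): wires the
+// operator into a DB and uses Merge() as a read-modify-write counter. The
+// function name and the path "/tmp/uint64add_example" are assumptions made
+// for this sketch.
+#include <cassert>
+#include "rocksdb/db.h"
+
+namespace {
+
+void UInt64AddOperatorExample() {
+  Options options;
+  options.create_if_missing = true;
+  options.merge_operator = MergeOperators::CreateUInt64AddOperator();
+
+  DB* db;
+  Status s = DB::Open(options, "/tmp/uint64add_example", &db);
+  assert(s.ok());
+
+  // Each operand is a fixed64-encoded addend; Merge() adds it to the stored
+  // value, and a missing key is treated as 0.
+  std::string one;
+  PutFixed64(&one, 1);
+  db->Merge(WriteOptions(), "counter", one);  // counter: 0 + 1
+  db->Merge(WriteOptions(), "counter", one);  // counter: 1 + 1
+
+  // The stored result is itself a fixed64-encoded uint64.
+  std::string raw;
+  db->Get(ReadOptions(), "counter", &raw);
+  assert(DecodeFixed64(raw.data()) == 2);
+
+  delete db;
+}
+
+}  // anonymous namespace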
diff --git a/utilities/redis/README b/utilities/redis/README
new file mode 100644
index 0000000000..8b17bc05a6
--- /dev/null
+++ b/utilities/redis/README
@@ -0,0 +1,14 @@
+This folder defines a REDIS-style interface for Rocksdb.
+Right now it is written as a simple tag-on in the rocksdb::RedisLists class.
+It implements Redis Lists, and supports only the "non-blocking operations".
+
+Internally, the set of lists is stored in a rocksdb database, mapping keys to
+values. Each "value" is the list itself, storing a sequence of "elements".
+Each element is stored as a 32-bit-integer, followed by a sequence of bytes.
+The 32-bit-integer represents the length of the element (that is, the number
+of bytes that follow). And then that many bytes follow.
+
+
+NOTE: This README file may be old. See the actual redis_lists.cc file for
+definitive details on the implementation. There should be a header at the top
+of that file, explaining a bit of the implementation details.
diff --git a/utilities/redis/redis_list_exception.h b/utilities/redis/redis_list_exception.h
new file mode 100644
index 0000000000..0b0f376167
--- /dev/null
+++ b/utilities/redis/redis_list_exception.h
@@ -0,0 +1,22 @@
+/**
+ * A simple structure for exceptions in RedisLists.
+ *
+ * @author Deon Nicholas (dnicholas@fb.com)
+ * Copyright 2013 Facebook
+ */
+
+#ifndef ROCKSDB_LITE
+#pragma once
+#include <exception>
+
+namespace rocksdb {
+
+class RedisListException: public std::exception {
+ public:
+  const char* what() const throw() {
+    return "Invalid operation or corrupt data in Redis List.";
+  }
+};
+
+} // namespace rocksdb
+#endif
diff --git a/utilities/redis/redis_list_iterator.h b/utilities/redis/redis_list_iterator.h
new file mode 100644
index 0000000000..b776ada24f
--- /dev/null
+++ b/utilities/redis/redis_list_iterator.h
@@ -0,0 +1,310 @@
+// Copyright 2013 Facebook
+/**
+ * RedisListIterator:
+ * An abstraction over the "list" concept (e.g.: for redis lists).
+ * Provides functionality to read, traverse, edit, and write these lists.
+ *
+ * Upon construction, the RedisListIterator is given a block of list data.
+ * Internally, it stores a pointer to the data and a pointer to current item.
+ * It also stores a "result" list that will be mutated over time.
+ *
+ * Traversal and mutation are done by "forward iteration".
+ * The Push() and Skip() methods will advance the iterator to the next item.
+ * However, Push() will also "write the current item to the result".
+ * Skip() will simply move to next item, causing current item to be dropped.
+ *
+ * Upon completion, the result (accessible by WriteResult()) will be saved.
+ * All "skipped" items will be gone; all "pushed" items will remain.
+ *
+ * @throws Any of the operations may throw a RedisListException if an invalid
+ *         operation is performed or if the data is found to be corrupt.
+ *
+ * @notes By default, if WriteResult() is called part-way through iteration,
+ *        it will automatically advance the iterator to the end, and Keep()
+ *        all items that haven't been traversed yet. This may be subject
+ *        to review.
+ *
+ * @notes Can access the "current" item via GetCurrent(), and other
+ *        list-specific information such as Length().
+ *
+ * @notes The internal representation is due to change at any time. Presently,
+ *        the list is represented as follows:
+ *          - 32-bit integer header: the number of items in the list
+ *          - For each item:
+ *              - 32-bit int (n): the number of bytes representing this item
+ *              - n bytes of data: the actual data.
+ *
+ * @author Deon Nicholas (dnicholas@fb.com)
+ */
+
+#ifndef ROCKSDB_LITE
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "redis_list_exception.h"
+#include "rocksdb/slice.h"
+#include "util/coding.h"
+
+namespace rocksdb {
+
+/// An abstraction over the "list" concept.
+/// All operations may throw a RedisListException
+class RedisListIterator {
+ public:
+  /// Construct a redis-list-iterator based on data.
+  /// If the data is non-empty, it must be formatted according to @notes above.
+  ///
+  /// If the data is valid, we can assume the following invariant(s):
+  ///  a) length_, num_bytes_ are set correctly.
+  ///  b) cur_byte_ always refers to the start of the current element,
+  ///       just before the bytes that specify element length.
+  ///  c) cur_elem_ is always the index of the current element.
+ /// d) cur_elem_length_ is always the number of bytes in current element, + /// excluding the 4-byte header itself. + /// e) result_ will always contain data_[0..cur_byte_) and a header + /// f) Whenever corrupt data is encountered or an invalid operation is + /// attempted, a RedisListException will immediately be thrown. + RedisListIterator(const std::string& list_data) + : data_(list_data.data()), + num_bytes_(list_data.size()), + cur_byte_(0), + cur_elem_(0), + cur_elem_length_(0), + length_(0), + result_() { + + // Initialize the result_ (reserve enough space for header) + InitializeResult(); + + // Parse the data only if it is not empty. + if (num_bytes_ == 0) { + return; + } + + // If non-empty, but less than 4 bytes, data must be corrupt + if (num_bytes_ < sizeof(length_)) { + ThrowError("Corrupt header."); // Will break control flow + } + + // Good. The first bytes specify the number of elements + length_ = DecodeFixed32(data_); + cur_byte_ = sizeof(length_); + + // If we have at least one element, point to that element. + // Also, read the first integer of the element (specifying the size), + // if possible. + if (length_ > 0) { + if (cur_byte_ + sizeof(cur_elem_length_) <= num_bytes_) { + cur_elem_length_ = DecodeFixed32(data_+cur_byte_); + } else { + ThrowError("Corrupt data for first element."); + } + } + + // At this point, we are fully set-up. + // The invariants described in the header should now be true. + } + + /// Reserve some space for the result_. + /// Equivalent to result_.reserve(bytes). + void Reserve(int bytes) { + result_.reserve(bytes); + } + + /// Go to next element in data file. + /// Also writes the current element to result_. + RedisListIterator& Push() { + WriteCurrentElement(); + MoveNext(); + return *this; + } + + /// Go to next element in data file. + /// Drops/skips the current element. It will not be written to result_. + RedisListIterator& Skip() { + MoveNext(); + --length_; // One less item + --cur_elem_; // We moved one forward, but index did not change + return *this; + } + + /// Insert elem into the result_ (just BEFORE the current element / byte) + /// Note: if Done() (i.e.: iterator points to end), this will append elem. + void InsertElement(const Slice& elem) { + // Ensure we are in a valid state + CheckErrors(); + + const int kOrigSize = result_.size(); + result_.resize(kOrigSize + SizeOf(elem)); + EncodeFixed32(result_.data() + kOrigSize, elem.size()); + memcpy(result_.data() + kOrigSize + sizeof(uint32_t), + elem.data(), + elem.size()); + ++length_; + ++cur_elem_; + } + + /// Access the current element, and save the result into *curElem + void GetCurrent(Slice* curElem) { + // Ensure we are in a valid state + CheckErrors(); + + // Ensure that we are not past the last element. + if (Done()) { + ThrowError("Invalid dereferencing."); + } + + // Dereference the element + *curElem = Slice(data_+cur_byte_+sizeof(cur_elem_length_), + cur_elem_length_); + } + + // Number of elements + int Length() const { + return length_; + } + + // Number of bytes in the final representation (i.e: WriteResult().size()) + int Size() const { + // result_ holds the currently written data + // data_[cur_byte..num_bytes-1] is the remainder of the data + return result_.size() + (num_bytes_ - cur_byte_); + } + + // Reached the end? + bool Done() const { + return cur_byte_ >= num_bytes_ || cur_elem_ >= length_; + } + + /// Returns a string representing the final, edited, data. 
+ /// Assumes that all bytes of data_ in the range [0,cur_byte_) have been read + /// and that result_ contains this data. + /// The rest of the data must still be written. + /// So, this method ADVANCES THE ITERATOR TO THE END before writing. + Slice WriteResult() { + CheckErrors(); + + // The header should currently be filled with dummy data (0's) + // Correctly update the header. + // Note, this is safe since result_ is a vector (guaranteed contiguous) + EncodeFixed32(&result_[0],length_); + + // Append the remainder of the data to the result. + result_.insert(result_.end(),data_+cur_byte_, data_ +num_bytes_); + + // Seek to end of file + cur_byte_ = num_bytes_; + cur_elem_ = length_; + cur_elem_length_ = 0; + + // Return the result + return Slice(result_.data(),result_.size()); + } + + public: // Static public functions + + /// An upper-bound on the amount of bytes needed to store this element. + /// This is used to hide representation information from the client. + /// E.G. This can be used to compute the bytes we want to Reserve(). + static uint32_t SizeOf(const Slice& elem) { + // [Integer Length . Data] + return sizeof(uint32_t) + elem.size(); + } + + private: // Private functions + + /// Initializes the result_ string. + /// It will fill the first few bytes with 0's so that there is + /// enough space for header information when we need to write later. + /// Currently, "header information" means: the length (number of elements) + /// Assumes that result_ is empty to begin with + void InitializeResult() { + assert(result_.empty()); // Should always be true. + result_.resize(sizeof(uint32_t),0); // Put a block of 0's as the header + } + + /// Go to the next element (used in Push() and Skip()) + void MoveNext() { + CheckErrors(); + + // Check to make sure we are not already in a finished state + if (Done()) { + ThrowError("Attempting to iterate past end of list."); + } + + // Move forward one element. + cur_byte_ += sizeof(cur_elem_length_) + cur_elem_length_; + ++cur_elem_; + + // If we are at the end, finish + if (Done()) { + cur_elem_length_ = 0; + return; + } + + // Otherwise, we should be able to read the new element's length + if (cur_byte_ + sizeof(cur_elem_length_) > num_bytes_) { + ThrowError("Corrupt element data."); + } + + // Set the new element's length + cur_elem_length_ = DecodeFixed32(data_+cur_byte_); + + return; + } + + /// Append the current element (pointed to by cur_byte_) to result_ + /// Assumes result_ has already been reserved appropriately. + void WriteCurrentElement() { + // First verify that the iterator is still valid. + CheckErrors(); + if (Done()) { + ThrowError("Attempting to write invalid element."); + } + + // Append the cur element. + result_.insert(result_.end(), + data_+cur_byte_, + data_+cur_byte_+ sizeof(uint32_t) + cur_elem_length_); + } + + /// Will ThrowError() if neccessary. + /// Checks for common/ubiquitous errors that can arise after most operations. + /// This method should be called before any reading operation. + /// If this function succeeds, then we are guaranteed to be in a valid state. + /// Other member functions should check for errors and ThrowError() also + /// if an error occurs that is specific to it even while in a valid state. 
+  void CheckErrors() {
+    // Check if any crazy thing has happened recently
+    if ((cur_elem_ > length_) ||                              // Bad index
+        (cur_byte_ > num_bytes_) ||                           // No more bytes
+        (cur_byte_ + cur_elem_length_ > num_bytes_) ||        // Item too large
+        (cur_byte_ == num_bytes_ && cur_elem_ != length_) ||  // Too many items
+        (cur_elem_ == length_ && cur_byte_ != num_bytes_)) {  // Too many bytes
+      ThrowError("Corrupt data.");
+    }
+  }
+
+  /// Will throw an exception based on the passed-in message.
+  /// This function is guaranteed to STOP THE CONTROL-FLOW.
+  /// (i.e.: you do not have to call "return" after calling ThrowError)
+  void ThrowError(const char* const msg = NULL) {
+    // TODO: For now we ignore the msg parameter. This can be expanded later.
+    throw RedisListException();
+  }
+
+ private:
+  const char* const data_;    // A pointer to the data (the first byte)
+  const uint32_t num_bytes_;  // The number of bytes in this list
+
+  uint32_t cur_byte_;         // The current byte being read
+  uint32_t cur_elem_;         // The current element being read
+  uint32_t cur_elem_length_;  // The number of bytes in current element
+
+  uint32_t length_;           // The number of elements in this list
+  std::vector<char> result_;  // The output data
+};
+
+} // namespace rocksdb
+#endif // ROCKSDB_LITE
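+
+// Usage sketch (illustrative only; not part of this header): drop the second
+// element of an encoded list and persist the edited encoding. "raw" stands in
+// for a value read from the database, assumed to hold at least two elements.
+//
+//   std::string raw = ...;            // encoded list, e.g. read via DB::Get()
+//   RedisListIterator it(raw);
+//   it.Reserve(it.Size());            // upper bound: nothing will be inserted
+//   it.Push();                        // keep element 0 in the result
+//   it.Skip();                        // drop element 1
+//   Slice edited = it.WriteResult();  // advances to the end, keeps the rest
+//   // "edited" can now be written back, e.g. via DB::Put().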
diff --git a/utilities/redis/redis_lists.cc b/utilities/redis/redis_lists.cc
new file mode 100644
index 0000000000..2b38a2da4b
--- /dev/null
+++ b/utilities/redis/redis_lists.cc
@@ -0,0 +1,552 @@
+// Copyright 2013 Facebook
+/**
+ * A (persistent) Redis API built using the rocksdb backend.
+ * Implements Redis Lists as described on: http://redis.io/commands#list
+ *
+ * @throws All functions may throw a RedisListException on error/corruption.
+ *
+ * @notes Internally, the set of lists is stored in a rocksdb database,
+ *        mapping keys to values. Each "value" is the list itself, storing
+ *        some kind of internal representation of the data. All the
+ *        representation details are handled by the RedisListIterator class.
+ *        The present file should be oblivious to the representation details,
+ *        handling only the client (Redis) API, and the calls to rocksdb.
+ *
+ * @TODO  Presently, all operations take at least O(NV) time where
+ *        N is the number of elements in the list, and V is the average
+ *        number of bytes per value in the list. So maybe, with merge operator
+ *        we can improve this to an optimal O(V) amortized time, since we
+ *        wouldn't have to read and re-write the entire list.
+ *
+ * @author Deon Nicholas (dnicholas@fb.com)
+ */
+
+#ifndef ROCKSDB_LITE
+#include "redis_lists.h"
+
+#include <iostream>
+#include <memory>
+#include <algorithm>
+
+#include "rocksdb/slice.h"
+#include "util/coding.h"
+
+namespace rocksdb
+{
+
+/// Constructors
+
+RedisLists::RedisLists(const std::string& db_path,
+                       Options options, bool destructive)
+    : put_option_(),
+      get_option_() {
+
+  // Store the name of the database
+  db_name_ = db_path;
+
+  // If destructive, destroy the DB before re-opening it.
+  if (destructive) {
+    DestroyDB(db_name_, Options());
+  }
+
+  // Now open and deal with the db
+  DB* db;
+  Status s = DB::Open(options, db_name_, &db);
+  if (!s.ok()) {
+    std::cerr << "ERROR " << s.ToString() << std::endl;
+    assert(false);
+  }
+
+  db_ = std::unique_ptr<DB>(db);
+}
+
+
+/// Accessors
+
+// Number of elements in the list associated with key
+//   : throws RedisListException
+int RedisLists::Length(const std::string& key) {
+  // Extract the string data representing the list.
+  std::string data;
+  db_->Get(get_option_, key, &data);
+
+  // Return the length
+  RedisListIterator it(data);
+  return it.Length();
+}
+
+// Get the element at the specified index in the (list: key)
+// Returns <empty> ("") on out-of-bounds
+//   : throws RedisListException
+bool RedisLists::Index(const std::string& key, int32_t index,
+                       std::string* result) {
+  // Extract the string data representing the list.
+  std::string data;
+  db_->Get(get_option_, key, &data);
+
+  // Handle REDIS negative indices (from the end); fast iff Length() takes O(1)
+  if (index < 0) {
+    index = Length(key) - (-index);  // replace (-i) with (N-i).
+  }
+
+  // Iterate through the list until the desired index is found.
+  int curIndex = 0;
+  RedisListIterator it(data);
+  while (curIndex < index && !it.Done()) {
+    ++curIndex;
+    it.Skip();
+  }
+
+  // If we actually found the index
+  if (curIndex == index && !it.Done()) {
+    Slice elem;
+    it.GetCurrent(&elem);
+    if (result != NULL) {
+      *result = elem.ToString();
+    }
+
+    return true;
+  } else {
+    return false;
+  }
+}
+
+// Return a truncated version of the list.
+// First, negative values for first/last are interpreted as "end of list".
+// So, if first == -1, then it is re-set to index: (Length(key) - 1)
+// Then, return exactly those indices i such that first <= i <= last.
+//   : throws RedisListException
+std::vector<std::string> RedisLists::Range(const std::string& key,
+                                           int32_t first, int32_t last) {
+  // Extract the string data representing the list.
+  std::string data;
+  db_->Get(get_option_, key, &data);
+
+  // Handle negative bounds (-1 means last element, etc.)
+  int listLen = Length(key);
+  if (first < 0) {
+    first = listLen - (-first);  // Replace (-x) with (N-x)
+  }
+  if (last < 0) {
+    last = listLen - (-last);
+  }
+
+  // Verify bounds (and truncate the range so that it is valid)
+  first = std::max(first, 0);
+  last = std::min(last, listLen - 1);
+  int len = std::max(last - first + 1, 0);
+
+  // Initialize the resulting list
+  std::vector<std::string> result(len);
+
+  // Traverse the list and update the vector
+  int curIdx = 0;
+  Slice elem;
+  for (RedisListIterator it(data); !it.Done() && curIdx <= last; it.Skip()) {
+    if (first <= curIdx && curIdx <= last) {
+      it.GetCurrent(&elem);
+      result[curIdx - first].assign(elem.data(), elem.size());
+    }
+
+    ++curIdx;
+  }
+
+  // Return the result. Might be empty
+  return result;
+}
+
+// Print the (list: key) out to stdout. For debugging mostly. Public for now.
+void RedisLists::Print(const std::string& key) {
+  // Extract the string data representing the list.
+  std::string data;
+  db_->Get(get_option_, key, &data);
+
+  // Iterate through the list and print the items
+  Slice elem;
+  for (RedisListIterator it(data); !it.Done(); it.Skip()) {
+    it.GetCurrent(&elem);
+    std::cout << "ITEM " << elem.ToString() << std::endl;
+  }
+
+  // Now print the byte data
+  RedisListIterator it(data);
+  std::cout << "==Printing data==" << std::endl;
+  std::cout << data.size() << std::endl;
+  std::cout << it.Size() << " " << it.Length() << std::endl;
+  Slice result = it.WriteResult();
+  std::cout << result.data() << std::endl;
+  if (true) {
+    std::cout << "size: " << result.size() << std::endl;
+    const char* val = result.data();
+    for (int i = 0; i < (int)result.size(); ++i) {
+      std::cout << (int)val[i] << " " << (val[i] >= 32 ? val[i] : ' ') << std::endl;
+    }
+    std::cout << std::endl;
+  }
+}
+
+/// Insert/Update Functions
+/// Note: The "real" insert function is private. See below.
+
+// InsertBefore and InsertAfter are simply wrappers around the Insert function.
+int RedisLists::InsertBefore(const std::string& key, const std::string& pivot, + const std::string& value) { + return Insert(key, pivot, value, false); +} + +int RedisLists::InsertAfter(const std::string& key, const std::string& pivot, + const std::string& value) { + return Insert(key, pivot, value, true); +} + +// Prepend value onto beginning of (list: key) +// : throws RedisListException +int RedisLists::PushLeft(const std::string& key, const std::string& value) { + // Get the original list data + std::string data; + db_->Get(get_option_, key, &data); + + // Construct the result + RedisListIterator it(data); + it.Reserve(it.Size() + it.SizeOf(value)); + it.InsertElement(value); + + // Push the data back to the db and return the length + db_->Put(put_option_, key, it.WriteResult()); + return it.Length(); +} + +// Append value onto end of (list: key) +// TODO: Make this O(1) time. Might require MergeOperator. +// : throws RedisListException +int RedisLists::PushRight(const std::string& key, const std::string& value) { + // Get the original list data + std::string data; + db_->Get(get_option_, key, &data); + + // Create an iterator to the data and seek to the end. + RedisListIterator it(data); + it.Reserve(it.Size() + it.SizeOf(value)); + while (!it.Done()) { + it.Push(); // Write each element as we go + } + + // Insert the new element at the current position (the end) + it.InsertElement(value); + + // Push it back to the db, and return length + db_->Put(put_option_, key, it.WriteResult()); + return it.Length(); +} + +// Set (list: key)[idx] = val. Return true on success, false on fail. +// : throws RedisListException +bool RedisLists::Set(const std::string& key, int32_t index, + const std::string& value) { + // Get the original list data + std::string data; + db_->Get(get_option_, key, &data); + + // Handle negative index for REDIS (meaning -index from end of list) + if (index < 0) { + index = Length(key) - (-index); + } + + // Iterate through the list until we find the element we want + int curIndex = 0; + RedisListIterator it(data); + it.Reserve(it.Size() + it.SizeOf(value)); // Over-estimate is fine + while(curIndex < index && !it.Done()) { + it.Push(); + ++curIndex; + } + + // If not found, return false (this occurs when index was invalid) + if (it.Done() || curIndex != index) { + return false; + } + + // Write the new element value, and drop the previous element value + it.InsertElement(value); + it.Skip(); + + // Write the data to the database + // Check status, since it needs to return true/false guarantee + Status s = db_->Put(put_option_, key, it.WriteResult()); + + // Success + return s.ok(); +} + +/// Delete / Remove / Pop functions + +// Trim (list: key) so that it will only contain the indices from start..stop +// Invalid indices will not generate an error, just empty, +// or the portion of the list that fits in this interval +// : throws RedisListException +bool RedisLists::Trim(const std::string& key, int32_t start, int32_t stop) { + // Get the original list data + std::string data; + db_->Get(get_option_, key, &data); + + // Handle negative indices in REDIS + int listLen = Length(key); + if (start < 0) { + start = listLen - (-start); + } + if (stop < 0) { + stop = listLen - (-stop); + } + + // Truncate bounds to only fit in the list + start = std::max(start, 0); + stop = std::min(stop, listLen-1); + + // Construct an iterator for the list. Drop all undesired elements. 
+ int curIndex = 0; + RedisListIterator it(data); + it.Reserve(it.Size()); // Over-estimate + while(!it.Done()) { + // If not within the range, just skip the item (drop it). + // Otherwise, continue as usual. + if (start <= curIndex && curIndex <= stop) { + it.Push(); + } else { + it.Skip(); + } + + // Increment the current index + ++curIndex; + } + + // Write the (possibly empty) result to the database + Status s = db_->Put(put_option_, key, it.WriteResult()); + + // Return true as long as the write succeeded + return s.ok(); +} + +// Return and remove the first element in the list (or "" if empty) +// : throws RedisListException +bool RedisLists::PopLeft(const std::string& key, std::string* result) { + // Get the original list data + std::string data; + db_->Get(get_option_, key, &data); + + // Point to first element in the list (if it exists), and get its value/size + RedisListIterator it(data); + if (it.Length() > 0) { // Proceed only if list is non-empty + Slice elem; + it.GetCurrent(&elem); // Store the value of the first element + it.Reserve(it.Size() - it.SizeOf(elem)); + it.Skip(); // DROP the first item and move to next + + // Update the db + db_->Put(put_option_, key, it.WriteResult()); + + // Return the value + if (result != NULL) { + *result = elem.ToString(); + } + return true; + } else { + return false; + } +} + +// Remove and return the last element in the list (or "" if empty) +// TODO: Make this O(1). Might require MergeOperator. +// : throws RedisListException +bool RedisLists::PopRight(const std::string& key, std::string* result) { + // Extract the original list data + std::string data; + db_->Get(get_option_, key, &data); + + // Construct an iterator to the data and move to last element + RedisListIterator it(data); + it.Reserve(it.Size()); + int len = it.Length(); + int curIndex = 0; + while(curIndex < (len-1) && !it.Done()) { + it.Push(); + ++curIndex; + } + + // Extract and drop/skip the last element + if (curIndex == len-1) { + assert(!it.Done()); // Sanity check. Should not have ended here. + + // Extract and pop the element + Slice elem; + it.GetCurrent(&elem); // Save value of element. + it.Skip(); // Skip the element + + // Write the result to the database + db_->Put(put_option_, key, it.WriteResult()); + + // Return the value + if (result != NULL) { + *result = elem.ToString(); + } + return true; + } else { + // Must have been an empty list + assert(it.Done() && len==0 && curIndex == 0); + return false; + } +} + +// Remove the (first or last) "num" occurrences of value in (list: key) +// : throws RedisListException +int RedisLists::Remove(const std::string& key, int32_t num, + const std::string& value) { + // Negative num ==> RemoveLast; Positive num ==> Remove First + if (num < 0) { + return RemoveLast(key, -num, value); + } else if (num > 0) { + return RemoveFirst(key, num, value); + } else { + return RemoveFirst(key, Length(key), value); + } +} + +// Remove the first "num" occurrences of value in (list: key). 
+// : throws RedisListException +int RedisLists::RemoveFirst(const std::string& key, int32_t num, + const std::string& value) { + // Ensure that the number is positive + assert(num >= 0); + + // Extract the original list data + std::string data; + db_->Get(get_option_, key, &data); + + // Traverse the list, appending all but the desired occurrences of value + int numSkipped = 0; // Keep track of the number of times value is seen + Slice elem; + RedisListIterator it(data); + it.Reserve(it.Size()); + while (!it.Done()) { + it.GetCurrent(&elem); + + if (elem == value && numSkipped < num) { + // Drop this item if desired + it.Skip(); + ++numSkipped; + } else { + // Otherwise keep the item and proceed as normal + it.Push(); + } + } + + // Put the result back to the database + db_->Put(put_option_, key, it.WriteResult()); + + // Return the number of elements removed + return numSkipped; +} + + +// Remove the last "num" occurrences of value in (list: key). +// TODO: I traverse the list 2x. Make faster. Might require MergeOperator. +// : throws RedisListException +int RedisLists::RemoveLast(const std::string& key, int32_t num, + const std::string& value) { + // Ensure that the number is positive + assert(num >= 0); + + // Extract the original list data + std::string data; + db_->Get(get_option_, key, &data); + + // Temporary variable to hold the "current element" in the blocks below + Slice elem; + + // Count the total number of occurrences of value + int totalOccs = 0; + for (RedisListIterator it(data); !it.Done(); it.Skip()) { + it.GetCurrent(&elem); + if (elem == value) { + ++totalOccs; + } + } + + // Construct an iterator to the data. Reserve enough space for the result. + RedisListIterator it(data); + int bytesRemoved = std::min(num,totalOccs)*it.SizeOf(value); + it.Reserve(it.Size() - bytesRemoved); + + // Traverse the list, appending all but the desired occurrences of value. + // Note: "Drop the last k occurrences" is equivalent to + // "keep only the first n-k occurrences", where n is total occurrences. + int numKept = 0; // Keep track of the number of times value is kept + while(!it.Done()) { + it.GetCurrent(&elem); + + // If we are within the deletion range and equal to value, drop it. + // Otherwise, append/keep/push it. + if (elem == value) { + if (numKept < totalOccs - num) { + it.Push(); + ++numKept; + } else { + it.Skip(); + } + } else { + // Always append the others + it.Push(); + } + } + + // Put the result back to the database + db_->Put(put_option_, key, it.WriteResult()); + + // Return the number of elements removed + return totalOccs - numKept; +} + +/// Private functions + +// Insert element value into (list: key), right before/after +// the first occurrence of pivot +// : throws RedisListException +int RedisLists::Insert(const std::string& key, const std::string& pivot, + const std::string& value, bool insert_after) { + // Get the original list data + std::string data; + db_->Get(get_option_, key, &data); + + // Construct an iterator to the data and reserve enough space for result. + RedisListIterator it(data); + it.Reserve(it.Size() + it.SizeOf(value)); + + // Iterate through the list until we find the element we want + Slice elem; + bool found = false; + while(!it.Done() && !found) { + it.GetCurrent(&elem); + + // When we find the element, insert the element and mark found + if (elem == pivot) { // Found it! 
+      found = true;
+      if (insert_after == true) {  // Skip one more, if inserting after it
+        it.Push();
+      }
+      it.InsertElement(value);
+    } else {
+      it.Push();
+    }
+
+  }
+
+  // Put the data (string) into the database
+  if (found) {
+    db_->Put(put_option_, key, it.WriteResult());
+  }
+
+  // Returns the new (possibly unchanged) length of the list
+  return it.Length();
+}
+
+} // namespace rocksdb
+#endif // ROCKSDB_LITE
diff --git a/utilities/redis/redis_lists.h b/utilities/redis/redis_lists.h
new file mode 100644
index 0000000000..6c8b9551ea
--- /dev/null
+++ b/utilities/redis/redis_lists.h
@@ -0,0 +1,108 @@
+/**
+ * A (persistent) Redis API built using the rocksdb backend.
+ * Implements Redis Lists as described on: http://redis.io/commands#list
+ *
+ * @throws All functions may throw a RedisListException
+ *
+ * @author Deon Nicholas (dnicholas@fb.com)
+ * Copyright 2013 Facebook
+ */
+
+#ifndef ROCKSDB_LITE
+#pragma once
+
+#include <string>
+#include "rocksdb/db.h"
+#include "redis_list_iterator.h"
+#include "redis_list_exception.h"
+
+namespace rocksdb {
+
+/// The Redis functionality (see http://redis.io/commands#list)
+/// All functions may THROW a RedisListException
+class RedisLists {
+ public: // Constructors / Destructors
+  /// Construct a new RedisLists database, with name/path of db.
+  /// Will clear the database on open iff destructive is true (default false).
+  /// Otherwise, it will restore saved changes.
+  /// May throw RedisListException
+  RedisLists(const std::string& db_path,
+             Options options, bool destructive = false);
+
+ public: // Accessors
+  /// The number of items in (list: key)
+  int Length(const std::string& key);
+
+  /// Search the list for the (index)'th item (0-based) in (list:key)
+  /// A negative index indicates: "from end-of-list"
+  /// If index is within range: return true, and return the value in *result.
+  /// If (index < -length OR index >= length), then index is out of range:
+  ///   return false (and *result is left unchanged)
+  /// May throw RedisListException
+  bool Index(const std::string& key, int32_t index,
+             std::string* result);
+
+  /// Return (list: key)[first..last] (inclusive)
+  /// May throw RedisListException
+  std::vector<std::string> Range(const std::string& key,
+                                 int32_t first, int32_t last);
+
+  /// Prints the entire (list: key), for debugging.
+  void Print(const std::string& key);
+
+ public: // Insert/Update
+  /// Insert value before/after pivot in (list: key). Return the length.
+  /// May throw RedisListException
+  int InsertBefore(const std::string& key, const std::string& pivot,
+                   const std::string& value);
+  int InsertAfter(const std::string& key, const std::string& pivot,
+                  const std::string& value);
+
+  /// Push / Insert value at beginning/end of the list. Return the length.
+  /// May throw RedisListException
+  int PushLeft(const std::string& key, const std::string& value);
+  int PushRight(const std::string& key, const std::string& value);
+
+  /// Set (list: key)[idx] = val. Return true on success, false on fail
+  /// May throw RedisListException
+  bool Set(const std::string& key, int32_t index, const std::string& value);
+
+ public: // Delete / Remove / Pop / Trim
+  /// Trim (list: key) so that it will only contain the indices from start..stop
+  /// Returns true on success
+  /// May throw RedisListException
+  bool Trim(const std::string& key, int32_t start, int32_t stop);
+
+  /// If list is empty, return false and leave *result unchanged.
+  /// Else, remove the first/last elem, store it in *result, and return true
+  bool PopLeft(const std::string& key, std::string* result);   // First
+  bool PopRight(const std::string& key, std::string* result);  // Last
+
+  /// Remove the first (or last) num occurrences of value from the list (key)
+  /// Return the number of elements removed.
+  /// May throw RedisListException
+  int Remove(const std::string& key, int32_t num,
+             const std::string& value);
+  int RemoveFirst(const std::string& key, int32_t num,
+                  const std::string& value);
+  int RemoveLast(const std::string& key, int32_t num,
+                 const std::string& value);
+
+ private: // Private Functions
+  /// Calls InsertBefore or InsertAfter
+  int Insert(const std::string& key, const std::string& pivot,
+             const std::string& value, bool insert_after);
+
+ private:
+  std::string db_name_;       // The actual database name/path
+  WriteOptions put_option_;
+  ReadOptions get_option_;
+
+  /// The backend rocksdb database.
+  /// Map : key --> list
+  ///   where a list is a sequence of elements
+  ///   and an element is a 4-byte integer (n), followed by n bytes of data
+  std::unique_ptr<DB> db_;
+};
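+
+// Usage sketch (illustrative only; not part of this header). Everything shown
+// here is exercised for real in redis_lists_test.cc below; the path
+// "/tmp/redisexampledb" is an assumption made for this sketch.
+//
+//   RedisLists redis("/tmp/redisexampledb", Options(), true /*destructive*/);
+//   redis.PushRight("mylist", "a");    // ["a"],           returns 1
+//   redis.PushRight("mylist", "b");    // ["a", "b"],      returns 2
+//   redis.PushLeft("mylist", "c");     // ["c", "a", "b"], returns 3
+//
+//   std::string val;
+//   redis.Index("mylist", -1, &val);   // val == "b" (negative = from the end)
+//   std::vector<std::string> all = redis.Range("mylist", 0, -1);  // whole list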
+
+} // namespace rocksdb
+#endif // ROCKSDB_LITE
diff --git a/utilities/redis/redis_lists_test.cc b/utilities/redis/redis_lists_test.cc
new file mode 100644
index 0000000000..b05c6c798d
--- /dev/null
+++ b/utilities/redis/redis_lists_test.cc
@@ -0,0 +1,884 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+/**
+ * A test harness for the Redis API built on rocksdb.
+ *
+ * USAGE: Build with: "make redis_test" (in rocksdb directory).
+ *        Run unit tests with: "./redis_test"
+ *        Manual/Interactive user testing: "./redis_test -m"
+ *        Manual user testing + restart database: "./redis_test -m -d"
+ *
+ * TODO: Add LARGE random test cases to verify efficiency and scalability
+ *
+ * @author Deon Nicholas (dnicholas@fb.com)
+ */
+
+
+#include <iostream>
+#include <cctype>
+
+#include "redis_lists.h"
+#include "util/testharness.h"
+#include "util/random.h"
+
+using namespace rocksdb;
+using namespace std;
+
+namespace rocksdb {
+
+class RedisListsTest {
+ public:
+  static const string kDefaultDbName;
+  static Options options;
+
+  RedisListsTest() {
+    options.create_if_missing = true;
+  }
+};
+
+const string RedisListsTest::kDefaultDbName = "/tmp/redisdefaultdb/";
+Options RedisListsTest::options = Options();
+
+// operator== and operator<< are defined below for vectors (lists)
+// Needed for ASSERT_EQ
+
+namespace {
+void AssertListEq(const std::vector<std::string>& result,
+                  const std::vector<std::string>& expected_result) {
+  ASSERT_EQ(result.size(), expected_result.size());
+  for (size_t i = 0; i < result.size(); ++i) {
+    ASSERT_EQ(result[i], expected_result[i]);
+  }
+}
+}  // namespace
+
+// PushRight, Length, Index, Range
+TEST(RedisListsTest, SimpleTest) {
+  RedisLists redis(kDefaultDbName, options, true);   // Destructive
+
+  string tempv;  // Used below for all Index(), PopRight(), PopLeft()
+
+  // Simple PushRight (should return the new length each time)
+  ASSERT_EQ(redis.PushRight("k1", "v1"), 1);
+  ASSERT_EQ(redis.PushRight("k1", "v2"), 2);
+  ASSERT_EQ(redis.PushRight("k1", "v3"), 3);
+
+  // Check Length and Index() functions
+  ASSERT_EQ(redis.Length("k1"), 3);          // Check length
+  ASSERT_TRUE(redis.Index("k1", 0, &tempv));
+  ASSERT_EQ(tempv, "v1");                    // Check valid indices
+  ASSERT_TRUE(redis.Index("k1", 1, &tempv));
+  ASSERT_EQ(tempv, "v2");
+  ASSERT_TRUE(redis.Index("k1", 2, &tempv));
+  ASSERT_EQ(tempv, "v3");
+
+  // Check range function and vectors
+  std::vector<std::string> result = redis.Range("k1", 0, 2);   // Get the list
+  std::vector<std::string> expected_result(3);
+  expected_result[0] = "v1";
+  expected_result[1] = "v2";
+  expected_result[2] = "v3";
+  AssertListEq(result, expected_result);
+}
+
+// PushLeft, Length, Index, Range
+TEST(RedisListsTest, SimpleTest2) {
+  RedisLists redis(kDefaultDbName, options, true);   // Destructive
+
+  string tempv;  // Used below for all Index(), PopRight(), PopLeft()
+
+  // Simple PushRight
+  ASSERT_EQ(redis.PushLeft("k1", "v3"), 1);
+  ASSERT_EQ(redis.PushLeft("k1", "v2"), 2);
+  ASSERT_EQ(redis.PushLeft("k1", "v1"), 3);
+
+  // Check Length and Index() functions
+  ASSERT_EQ(redis.Length("k1"), 3);          // Check length
+  ASSERT_TRUE(redis.Index("k1", 0, &tempv));
+  ASSERT_EQ(tempv, "v1");                    // Check valid indices
+  ASSERT_TRUE(redis.Index("k1", 1, &tempv));
+  ASSERT_EQ(tempv, "v2");
+  ASSERT_TRUE(redis.Index("k1", 2, &tempv));
+  ASSERT_EQ(tempv, "v3");
+
+  // Check range function and vectors
+  std::vector<std::string> result = redis.Range("k1", 0, 2);   // Get the list
+  std::vector<std::string> expected_result(3);
+  expected_result[0] = "v1";
+  expected_result[1] = "v2";
+  expected_result[2] = "v3";
+  AssertListEq(result, expected_result);
+}
+
+// Exhaustive test of the Index() function
+TEST(RedisListsTest, IndexTest) {
+  RedisLists redis(kDefaultDbName, options, true);   // Destructive
+
+  string tempv;  // Used below for all Index(), PopRight(), PopLeft()
+
+  // Empty Index check (return empty and should not crash or edit tempv)
+  tempv = "yo";
+  ASSERT_TRUE(!redis.Index("k1", 0, &tempv));
+  ASSERT_EQ(tempv, "yo");
+  ASSERT_TRUE(!redis.Index("fda", 3, &tempv));
+  ASSERT_EQ(tempv, "yo");
ASSERT_TRUE(!redis.Index("random", -12391, &tempv)); + ASSERT_EQ(tempv, "yo"); + + // Simple Pushes (will yield: [v6, v4, v4, v1, v2, v3] + redis.PushRight("k1", "v1"); + redis.PushRight("k1", "v2"); + redis.PushRight("k1", "v3"); + redis.PushLeft("k1", "v4"); + redis.PushLeft("k1", "v4"); + redis.PushLeft("k1", "v6"); + + // Simple, non-negative indices + ASSERT_TRUE(redis.Index("k1", 0, &tempv)); + ASSERT_EQ(tempv, "v6"); + ASSERT_TRUE(redis.Index("k1", 1, &tempv)); + ASSERT_EQ(tempv, "v4"); + ASSERT_TRUE(redis.Index("k1", 2, &tempv)); + ASSERT_EQ(tempv, "v4"); + ASSERT_TRUE(redis.Index("k1", 3, &tempv)); + ASSERT_EQ(tempv, "v1"); + ASSERT_TRUE(redis.Index("k1", 4, &tempv)); + ASSERT_EQ(tempv, "v2"); + ASSERT_TRUE(redis.Index("k1", 5, &tempv)); + ASSERT_EQ(tempv, "v3"); + + // Negative indices + ASSERT_TRUE(redis.Index("k1", -6, &tempv)); + ASSERT_EQ(tempv, "v6"); + ASSERT_TRUE(redis.Index("k1", -5, &tempv)); + ASSERT_EQ(tempv, "v4"); + ASSERT_TRUE(redis.Index("k1", -4, &tempv)); + ASSERT_EQ(tempv, "v4"); + ASSERT_TRUE(redis.Index("k1", -3, &tempv)); + ASSERT_EQ(tempv, "v1"); + ASSERT_TRUE(redis.Index("k1", -2, &tempv)); + ASSERT_EQ(tempv, "v2"); + ASSERT_TRUE(redis.Index("k1", -1, &tempv)); + ASSERT_EQ(tempv, "v3"); + + // Out of bounds (return empty, no crash) + ASSERT_TRUE(!redis.Index("k1", 6, &tempv)); + ASSERT_TRUE(!redis.Index("k1", 123219, &tempv)); + ASSERT_TRUE(!redis.Index("k1", -7, &tempv)); + ASSERT_TRUE(!redis.Index("k1", -129, &tempv)); +} + + +// Exhaustive test of the Range() function +TEST(RedisListsTest, RangeTest) { + RedisLists redis(kDefaultDbName, options, true); // Destructive + + string tempv; // Used below for all Index(), PopRight(), PopLeft() + + // Simple Pushes (will yield: [v6, v4, v4, v1, v2, v3]) + redis.PushRight("k1", "v1"); + redis.PushRight("k1", "v2"); + redis.PushRight("k1", "v3"); + redis.PushLeft("k1", "v4"); + redis.PushLeft("k1", "v4"); + redis.PushLeft("k1", "v6"); + + // Sanity check (check the length; make sure it's 6) + ASSERT_EQ(redis.Length("k1"), 6); + + // Simple range + std::vector res = redis.Range("k1", 1, 4); + ASSERT_EQ((int)res.size(), 4); + ASSERT_EQ(res[0], "v4"); + ASSERT_EQ(res[1], "v4"); + ASSERT_EQ(res[2], "v1"); + ASSERT_EQ(res[3], "v2"); + + // Negative indices (i.e.: measured from the end) + res = redis.Range("k1", 2, -1); + ASSERT_EQ((int)res.size(), 4); + ASSERT_EQ(res[0], "v4"); + ASSERT_EQ(res[1], "v1"); + ASSERT_EQ(res[2], "v2"); + ASSERT_EQ(res[3], "v3"); + + res = redis.Range("k1", -6, -4); + ASSERT_EQ((int)res.size(), 3); + ASSERT_EQ(res[0], "v6"); + ASSERT_EQ(res[1], "v4"); + ASSERT_EQ(res[2], "v4"); + + res = redis.Range("k1", -1, 5); + ASSERT_EQ((int)res.size(), 1); + ASSERT_EQ(res[0], "v3"); + + // Partial / Broken indices + res = redis.Range("k1", -3, 1000000); + ASSERT_EQ((int)res.size(), 3); + ASSERT_EQ(res[0], "v1"); + ASSERT_EQ(res[1], "v2"); + ASSERT_EQ(res[2], "v3"); + + res = redis.Range("k1", -1000000, 1); + ASSERT_EQ((int)res.size(), 2); + ASSERT_EQ(res[0], "v6"); + ASSERT_EQ(res[1], "v4"); + + // Invalid indices + res = redis.Range("k1", 7, 9); + ASSERT_EQ((int)res.size(), 0); + + res = redis.Range("k1", -8, -7); + ASSERT_EQ((int)res.size(), 0); + + res = redis.Range("k1", 3, 2); + ASSERT_EQ((int)res.size(), 0); + + res = redis.Range("k1", 5, -2); + ASSERT_EQ((int)res.size(), 0); + + // Range matches Index + res = redis.Range("k1", -6, -4); + ASSERT_TRUE(redis.Index("k1", -6, &tempv)); + ASSERT_EQ(tempv, res[0]); + ASSERT_TRUE(redis.Index("k1", -5, &tempv)); + ASSERT_EQ(tempv, res[1]); + 
ASSERT_TRUE(redis.Index("k1", -4, &tempv)); + ASSERT_EQ(tempv, res[2]); + + // Last check + res = redis.Range("k1", 0, -6); + ASSERT_EQ((int)res.size(), 1); + ASSERT_EQ(res[0], "v6"); +} + +// Exhaustive test for InsertBefore(), and InsertAfter() +TEST(RedisListsTest, InsertTest) { + RedisLists redis(kDefaultDbName, options, true); + + string tempv; // Used below for all Index(), PopRight(), PopLeft() + + // Insert on empty list (return 0, and do not crash) + ASSERT_EQ(redis.InsertBefore("k1", "non-exist", "a"), 0); + ASSERT_EQ(redis.InsertAfter("k1", "other-non-exist", "c"), 0); + ASSERT_EQ(redis.Length("k1"), 0); + + // Push some preliminary stuff [g, f, e, d, c, b, a] + redis.PushLeft("k1", "a"); + redis.PushLeft("k1", "b"); + redis.PushLeft("k1", "c"); + redis.PushLeft("k1", "d"); + redis.PushLeft("k1", "e"); + redis.PushLeft("k1", "f"); + redis.PushLeft("k1", "g"); + ASSERT_EQ(redis.Length("k1"), 7); + + // Test InsertBefore + int newLength = redis.InsertBefore("k1", "e", "hello"); + ASSERT_EQ(newLength, 8); + ASSERT_EQ(redis.Length("k1"), newLength); + ASSERT_TRUE(redis.Index("k1", 1, &tempv)); + ASSERT_EQ(tempv, "f"); + ASSERT_TRUE(redis.Index("k1", 3, &tempv)); + ASSERT_EQ(tempv, "e"); + ASSERT_TRUE(redis.Index("k1", 2, &tempv)); + ASSERT_EQ(tempv, "hello"); + + // Test InsertAfter + newLength = redis.InsertAfter("k1", "c", "bye"); + ASSERT_EQ(newLength, 9); + ASSERT_EQ(redis.Length("k1"), newLength); + ASSERT_TRUE(redis.Index("k1", 6, &tempv)); + ASSERT_EQ(tempv, "bye"); + + // Test bad value on InsertBefore + newLength = redis.InsertBefore("k1", "yo", "x"); + ASSERT_EQ(newLength, 9); + ASSERT_EQ(redis.Length("k1"), newLength); + + // Test bad value on InsertAfter + newLength = redis.InsertAfter("k1", "xxxx", "y"); + ASSERT_EQ(newLength, 9); + ASSERT_EQ(redis.Length("k1"), newLength); + + // Test InsertBefore beginning + newLength = redis.InsertBefore("k1", "g", "begggggggggggggggg"); + ASSERT_EQ(newLength, 10); + ASSERT_EQ(redis.Length("k1"), newLength); + + // Test InsertAfter end + newLength = redis.InsertAfter("k1", "a", "enddd"); + ASSERT_EQ(newLength, 11); + ASSERT_EQ(redis.Length("k1"), newLength); + + // Make sure nothing weird happened. 
+ ASSERT_TRUE(redis.Index("k1", 0, &tempv)); + ASSERT_EQ(tempv, "begggggggggggggggg"); + ASSERT_TRUE(redis.Index("k1", 1, &tempv)); + ASSERT_EQ(tempv, "g"); + ASSERT_TRUE(redis.Index("k1", 2, &tempv)); + ASSERT_EQ(tempv, "f"); + ASSERT_TRUE(redis.Index("k1", 3, &tempv)); + ASSERT_EQ(tempv, "hello"); + ASSERT_TRUE(redis.Index("k1", 4, &tempv)); + ASSERT_EQ(tempv, "e"); + ASSERT_TRUE(redis.Index("k1", 5, &tempv)); + ASSERT_EQ(tempv, "d"); + ASSERT_TRUE(redis.Index("k1", 6, &tempv)); + ASSERT_EQ(tempv, "c"); + ASSERT_TRUE(redis.Index("k1", 7, &tempv)); + ASSERT_EQ(tempv, "bye"); + ASSERT_TRUE(redis.Index("k1", 8, &tempv)); + ASSERT_EQ(tempv, "b"); + ASSERT_TRUE(redis.Index("k1", 9, &tempv)); + ASSERT_EQ(tempv, "a"); + ASSERT_TRUE(redis.Index("k1", 10, &tempv)); + ASSERT_EQ(tempv, "enddd"); +} + +// Exhaustive test of Set function +TEST(RedisListsTest, SetTest) { + RedisLists redis(kDefaultDbName, options, true); + + string tempv; // Used below for all Index(), PopRight(), PopLeft() + + // Set on empty list (return false, and do not crash) + ASSERT_EQ(redis.Set("k1", 7, "a"), false); + ASSERT_EQ(redis.Set("k1", 0, "a"), false); + ASSERT_EQ(redis.Set("k1", -49, "cx"), false); + ASSERT_EQ(redis.Length("k1"), 0); + + // Push some preliminary stuff [g, f, e, d, c, b, a] + redis.PushLeft("k1", "a"); + redis.PushLeft("k1", "b"); + redis.PushLeft("k1", "c"); + redis.PushLeft("k1", "d"); + redis.PushLeft("k1", "e"); + redis.PushLeft("k1", "f"); + redis.PushLeft("k1", "g"); + ASSERT_EQ(redis.Length("k1"), 7); + + // Test Regular Set + ASSERT_TRUE(redis.Set("k1", 0, "0")); + ASSERT_TRUE(redis.Set("k1", 3, "3")); + ASSERT_TRUE(redis.Set("k1", 6, "6")); + ASSERT_TRUE(redis.Set("k1", 2, "2")); + ASSERT_TRUE(redis.Set("k1", 5, "5")); + ASSERT_TRUE(redis.Set("k1", 1, "1")); + ASSERT_TRUE(redis.Set("k1", 4, "4")); + + ASSERT_EQ(redis.Length("k1"), 7); // Size should not change + ASSERT_TRUE(redis.Index("k1", 0, &tempv)); + ASSERT_EQ(tempv, "0"); + ASSERT_TRUE(redis.Index("k1", 1, &tempv)); + ASSERT_EQ(tempv, "1"); + ASSERT_TRUE(redis.Index("k1", 2, &tempv)); + ASSERT_EQ(tempv, "2"); + ASSERT_TRUE(redis.Index("k1", 3, &tempv)); + ASSERT_EQ(tempv, "3"); + ASSERT_TRUE(redis.Index("k1", 4, &tempv)); + ASSERT_EQ(tempv, "4"); + ASSERT_TRUE(redis.Index("k1", 5, &tempv)); + ASSERT_EQ(tempv, "5"); + ASSERT_TRUE(redis.Index("k1", 6, &tempv)); + ASSERT_EQ(tempv, "6"); + + // Set with negative indices + ASSERT_TRUE(redis.Set("k1", -7, "a")); + ASSERT_TRUE(redis.Set("k1", -4, "d")); + ASSERT_TRUE(redis.Set("k1", -1, "g")); + ASSERT_TRUE(redis.Set("k1", -5, "c")); + ASSERT_TRUE(redis.Set("k1", -2, "f")); + ASSERT_TRUE(redis.Set("k1", -6, "b")); + ASSERT_TRUE(redis.Set("k1", -3, "e")); + + ASSERT_EQ(redis.Length("k1"), 7); // Size should not change + ASSERT_TRUE(redis.Index("k1", 0, &tempv)); + ASSERT_EQ(tempv, "a"); + ASSERT_TRUE(redis.Index("k1", 1, &tempv)); + ASSERT_EQ(tempv, "b"); + ASSERT_TRUE(redis.Index("k1", 2, &tempv)); + ASSERT_EQ(tempv, "c"); + ASSERT_TRUE(redis.Index("k1", 3, &tempv)); + ASSERT_EQ(tempv, "d"); + ASSERT_TRUE(redis.Index("k1", 4, &tempv)); + ASSERT_EQ(tempv, "e"); + ASSERT_TRUE(redis.Index("k1", 5, &tempv)); + ASSERT_EQ(tempv, "f"); + ASSERT_TRUE(redis.Index("k1", 6, &tempv)); + ASSERT_EQ(tempv, "g"); + + // Bad indices (just out-of-bounds / off-by-one check) + ASSERT_EQ(redis.Set("k1", -8, "off-by-one in negative index"), false); + ASSERT_EQ(redis.Set("k1", 7, "off-by-one-error in positive index"), false); + ASSERT_EQ(redis.Set("k1", 43892, "big random index should fail"), false); + 
ASSERT_EQ(redis.Set("k1", -21391, "large negative index should fail"), false); + + // One last check (to make sure nothing weird happened) + ASSERT_EQ(redis.Length("k1"), 7); // Size should not change + ASSERT_TRUE(redis.Index("k1", 0, &tempv)); + ASSERT_EQ(tempv, "a"); + ASSERT_TRUE(redis.Index("k1", 1, &tempv)); + ASSERT_EQ(tempv, "b"); + ASSERT_TRUE(redis.Index("k1", 2, &tempv)); + ASSERT_EQ(tempv, "c"); + ASSERT_TRUE(redis.Index("k1", 3, &tempv)); + ASSERT_EQ(tempv, "d"); + ASSERT_TRUE(redis.Index("k1", 4, &tempv)); + ASSERT_EQ(tempv, "e"); + ASSERT_TRUE(redis.Index("k1", 5, &tempv)); + ASSERT_EQ(tempv, "f"); + ASSERT_TRUE(redis.Index("k1", 6, &tempv)); + ASSERT_EQ(tempv, "g"); +} + +// Testing Insert, Push, and Set, in a mixed environment +TEST(RedisListsTest, InsertPushSetTest) { + RedisLists redis(kDefaultDbName, options, true); // Destructive + + string tempv; // Used below for all Index(), PopRight(), PopLeft() + + // A series of pushes and insertions + // Will result in [newbegin, z, a, aftera, x, newend] + // Also, check the return value sometimes (should return length) + int lengthCheck; + lengthCheck = redis.PushLeft("k1", "a"); + ASSERT_EQ(lengthCheck, 1); + redis.PushLeft("k1", "z"); + redis.PushRight("k1", "x"); + lengthCheck = redis.InsertAfter("k1", "a", "aftera"); + ASSERT_EQ(lengthCheck , 4); + redis.InsertBefore("k1", "z", "newbegin"); // InsertBefore beginning of list + redis.InsertAfter("k1", "x", "newend"); // InsertAfter end of list + + // Check + std::vector res = redis.Range("k1", 0, -1); // Get the list + ASSERT_EQ((int)res.size(), 6); + ASSERT_EQ(res[0], "newbegin"); + ASSERT_EQ(res[5], "newend"); + ASSERT_EQ(res[3], "aftera"); + + // Testing duplicate values/pivots (multiple occurrences of 'a') + ASSERT_TRUE(redis.Set("k1", 0, "a")); // [a, z, a, aftera, x, newend] + redis.InsertAfter("k1", "a", "happy"); // [a, happy, z, a, aftera, ...] + ASSERT_TRUE(redis.Index("k1", 1, &tempv)); + ASSERT_EQ(tempv, "happy"); + redis.InsertBefore("k1", "a", "sad"); // [sad, a, happy, z, a, aftera, ...] + ASSERT_TRUE(redis.Index("k1", 0, &tempv)); + ASSERT_EQ(tempv, "sad"); + ASSERT_TRUE(redis.Index("k1", 2, &tempv)); + ASSERT_EQ(tempv, "happy"); + ASSERT_TRUE(redis.Index("k1", 5, &tempv)); + ASSERT_EQ(tempv, "aftera"); + redis.InsertAfter("k1", "a", "zz"); // [sad, a, zz, happy, z, a, aftera, ...] + ASSERT_TRUE(redis.Index("k1", 2, &tempv)); + ASSERT_EQ(tempv, "zz"); + ASSERT_TRUE(redis.Index("k1", 6, &tempv)); + ASSERT_EQ(tempv, "aftera"); + ASSERT_TRUE(redis.Set("k1", 1, "nota")); // [sad, nota, zz, happy, z, a, ...] + redis.InsertBefore("k1", "a", "ba"); // [sad, nota, zz, happy, z, ba, a, ...] 
+ ASSERT_TRUE(redis.Index("k1", 4, &tempv)); + ASSERT_EQ(tempv, "z"); + ASSERT_TRUE(redis.Index("k1", 5, &tempv)); + ASSERT_EQ(tempv, "ba"); + ASSERT_TRUE(redis.Index("k1", 6, &tempv)); + ASSERT_EQ(tempv, "a"); + + // We currently have: [sad, nota, zz, happy, z, ba, a, aftera, x, newend] + // redis.Print("k1"); // manually check + + // Test Inserting before/after non-existent values + lengthCheck = redis.Length("k1"); // Ensure that the length doesn't change + ASSERT_EQ(lengthCheck, 10); + ASSERT_EQ(redis.InsertBefore("k1", "non-exist", "randval"), lengthCheck); + ASSERT_EQ(redis.InsertAfter("k1", "nothing", "a"), lengthCheck); + ASSERT_EQ(redis.InsertAfter("randKey", "randVal", "ranValue"), 0); // Empty + ASSERT_EQ(redis.Length("k1"), lengthCheck); // The length should not change + + // Simply Test the Set() function + redis.Set("k1", 5, "ba2"); + redis.InsertBefore("k1", "ba2", "beforeba2"); + ASSERT_TRUE(redis.Index("k1", 4, &tempv)); + ASSERT_EQ(tempv, "z"); + ASSERT_TRUE(redis.Index("k1", 5, &tempv)); + ASSERT_EQ(tempv, "beforeba2"); + ASSERT_TRUE(redis.Index("k1", 6, &tempv)); + ASSERT_EQ(tempv, "ba2"); + ASSERT_TRUE(redis.Index("k1", 7, &tempv)); + ASSERT_EQ(tempv, "a"); + + // We have: [sad, nota, zz, happy, z, beforeba2, ba2, a, aftera, x, newend] + + // Set() with negative indices + redis.Set("k1", -1, "endprank"); + ASSERT_TRUE(!redis.Index("k1", 11, &tempv)); + ASSERT_TRUE(redis.Index("k1", 10, &tempv)); + ASSERT_EQ(tempv, "endprank"); // Ensure Set worked correctly + redis.Set("k1", -11, "t"); + ASSERT_TRUE(redis.Index("k1", 0, &tempv)); + ASSERT_EQ(tempv, "t"); + + // Test out of bounds Set + ASSERT_EQ(redis.Set("k1", -12, "ssd"), false); + ASSERT_EQ(redis.Set("k1", 11, "sasd"), false); + ASSERT_EQ(redis.Set("k1", 1200, "big"), false); +} + +// Testing Trim, Pop +TEST(RedisListsTest, TrimPopTest) { + RedisLists redis(kDefaultDbName, options, true); // Destructive + + string tempv; // Used below for all Index(), PopRight(), PopLeft() + + // A series of pushes and insertions + // Will result in [newbegin, z, a, aftera, x, newend] + redis.PushLeft("k1", "a"); + redis.PushLeft("k1", "z"); + redis.PushRight("k1", "x"); + redis.InsertBefore("k1", "z", "newbegin"); // InsertBefore start of list + redis.InsertAfter("k1", "x", "newend"); // InsertAfter end of list + redis.InsertAfter("k1", "a", "aftera"); + + // Simple PopLeft/Right test + ASSERT_TRUE(redis.PopLeft("k1", &tempv)); + ASSERT_EQ(tempv, "newbegin"); + ASSERT_EQ(redis.Length("k1"), 5); + ASSERT_TRUE(redis.Index("k1", 0, &tempv)); + ASSERT_EQ(tempv, "z"); + ASSERT_TRUE(redis.PopRight("k1", &tempv)); + ASSERT_EQ(tempv, "newend"); + ASSERT_EQ(redis.Length("k1"), 4); + ASSERT_TRUE(redis.Index("k1", -1, &tempv)); + ASSERT_EQ(tempv, "x"); + + // Now have: [z, a, aftera, x] + + // Test Trim + ASSERT_TRUE(redis.Trim("k1", 0, -1)); // [z, a, aftera, x] (do nothing) + ASSERT_EQ(redis.Length("k1"), 4); + ASSERT_TRUE(redis.Trim("k1", 0, 2)); // [z, a, aftera] + ASSERT_EQ(redis.Length("k1"), 3); + ASSERT_TRUE(redis.Index("k1", -1, &tempv)); + ASSERT_EQ(tempv, "aftera"); + ASSERT_TRUE(redis.Trim("k1", 1, 1)); // [a] + ASSERT_EQ(redis.Length("k1"), 1); + ASSERT_TRUE(redis.Index("k1", 0, &tempv)); + ASSERT_EQ(tempv, "a"); + + // Test out of bounds (empty) trim + ASSERT_TRUE(redis.Trim("k1", 1, 0)); + ASSERT_EQ(redis.Length("k1"), 0); + + // Popping with empty list (return empty without error) + ASSERT_TRUE(!redis.PopLeft("k1", &tempv)); + ASSERT_TRUE(!redis.PopRight("k1", &tempv)); + ASSERT_TRUE(redis.Trim("k1", 0, 5)); + + // Exhaustive 
Trim test (negative and invalid indices) + // Will start in [newbegin, z, a, aftera, x, newend] + redis.PushLeft("k1", "a"); + redis.PushLeft("k1", "z"); + redis.PushRight("k1", "x"); + redis.InsertBefore("k1", "z", "newbegin"); // InsertBefore start of list + redis.InsertAfter("k1", "x", "newend"); // InsertAfter end of list + redis.InsertAfter("k1", "a", "aftera"); + ASSERT_TRUE(redis.Trim("k1", -6, -1)); // Should do nothing + ASSERT_EQ(redis.Length("k1"), 6); + ASSERT_TRUE(redis.Trim("k1", 1, -2)); + ASSERT_TRUE(redis.Index("k1", 0, &tempv)); + ASSERT_EQ(tempv, "z"); + ASSERT_TRUE(redis.Index("k1", 3, &tempv)); + ASSERT_EQ(tempv, "x"); + ASSERT_EQ(redis.Length("k1"), 4); + ASSERT_TRUE(redis.Trim("k1", -3, -2)); + ASSERT_EQ(redis.Length("k1"), 2); +} + +// Testing Remove, RemoveFirst, RemoveLast +TEST(RedisListsTest, RemoveTest) { + RedisLists redis(kDefaultDbName, options, true); // Destructive + + string tempv; // Used below for all Index(), PopRight(), PopLeft() + + // A series of pushes and insertions + // Will result in [newbegin, z, a, aftera, x, newend, a, a] + redis.PushLeft("k1", "a"); + redis.PushLeft("k1", "z"); + redis.PushRight("k1", "x"); + redis.InsertBefore("k1", "z", "newbegin"); // InsertBefore start of list + redis.InsertAfter("k1", "x", "newend"); // InsertAfter end of list + redis.InsertAfter("k1", "a", "aftera"); + redis.PushRight("k1", "a"); + redis.PushRight("k1", "a"); + + // Verify + ASSERT_TRUE(redis.Index("k1", 0, &tempv)); + ASSERT_EQ(tempv, "newbegin"); + ASSERT_TRUE(redis.Index("k1", -1, &tempv)); + ASSERT_EQ(tempv, "a"); + + // Check RemoveFirst (Remove the first two 'a') + // Results in [newbegin, z, aftera, x, newend, a] + int numRemoved = redis.Remove("k1", 2, "a"); + ASSERT_EQ(numRemoved, 2); + ASSERT_TRUE(redis.Index("k1", 0, &tempv)); + ASSERT_EQ(tempv, "newbegin"); + ASSERT_TRUE(redis.Index("k1", 1, &tempv)); + ASSERT_EQ(tempv, "z"); + ASSERT_TRUE(redis.Index("k1", 4, &tempv)); + ASSERT_EQ(tempv, "newend"); + ASSERT_TRUE(redis.Index("k1", 5, &tempv)); + ASSERT_EQ(tempv, "a"); + ASSERT_EQ(redis.Length("k1"), 6); + + // Repopulate some stuff + // Results in: [x, x, x, x, x, newbegin, z, x, aftera, x, newend, a, x] + redis.PushLeft("k1", "x"); + redis.PushLeft("k1", "x"); + redis.PushLeft("k1", "x"); + redis.PushLeft("k1", "x"); + redis.PushLeft("k1", "x"); + redis.PushRight("k1", "x"); + redis.InsertAfter("k1", "z", "x"); + + // Test removal from end + numRemoved = redis.Remove("k1", -2, "x"); + ASSERT_EQ(numRemoved, 2); + ASSERT_TRUE(redis.Index("k1", 8, &tempv)); + ASSERT_EQ(tempv, "aftera"); + ASSERT_TRUE(redis.Index("k1", 9, &tempv)); + ASSERT_EQ(tempv, "newend"); + ASSERT_TRUE(redis.Index("k1", 10, &tempv)); + ASSERT_EQ(tempv, "a"); + ASSERT_TRUE(!redis.Index("k1", 11, &tempv)); + numRemoved = redis.Remove("k1", -2, "x"); + ASSERT_EQ(numRemoved, 2); + ASSERT_TRUE(redis.Index("k1", 4, &tempv)); + ASSERT_EQ(tempv, "newbegin"); + ASSERT_TRUE(redis.Index("k1", 6, &tempv)); + ASSERT_EQ(tempv, "aftera"); + + // We now have: [x, x, x, x, newbegin, z, aftera, newend, a] + ASSERT_EQ(redis.Length("k1"), 9); + ASSERT_TRUE(redis.Index("k1", -1, &tempv)); + ASSERT_EQ(tempv, "a"); + ASSERT_TRUE(redis.Index("k1", 0, &tempv)); + ASSERT_EQ(tempv, "x"); + + // Test over-shooting (removing more than there exists) + numRemoved = redis.Remove("k1", -9000, "x"); + ASSERT_EQ(numRemoved , 4); // Only really removed 4 + ASSERT_EQ(redis.Length("k1"), 5); + ASSERT_TRUE(redis.Index("k1", 0, &tempv)); + ASSERT_EQ(tempv, "newbegin"); + numRemoved = redis.Remove("k1", 1, 
"x"); + ASSERT_EQ(numRemoved, 0); + + // Try removing ALL! + numRemoved = redis.Remove("k1", 0, "newbegin"); // REMOVE 0 will remove all! + ASSERT_EQ(numRemoved, 1); + + // Removal from an empty-list + ASSERT_TRUE(redis.Trim("k1", 1, 0)); + numRemoved = redis.Remove("k1", 1, "z"); + ASSERT_EQ(numRemoved, 0); +} + + +// Test Multiple keys and Persistence +TEST(RedisListsTest, PersistenceMultiKeyTest) { + + string tempv; // Used below for all Index(), PopRight(), PopLeft() + + // Block one: populate a single key in the database + { + RedisLists redis(kDefaultDbName, options, true); // Destructive + + // A series of pushes and insertions + // Will result in [newbegin, z, a, aftera, x, newend, a, a] + redis.PushLeft("k1", "a"); + redis.PushLeft("k1", "z"); + redis.PushRight("k1", "x"); + redis.InsertBefore("k1", "z", "newbegin"); // InsertBefore start of list + redis.InsertAfter("k1", "x", "newend"); // InsertAfter end of list + redis.InsertAfter("k1", "a", "aftera"); + redis.PushRight("k1", "a"); + redis.PushRight("k1", "a"); + + ASSERT_TRUE(redis.Index("k1", 3, &tempv)); + ASSERT_EQ(tempv, "aftera"); + } + + // Block two: make sure changes were saved and add some other key + { + RedisLists redis(kDefaultDbName, options, false); // Persistent, non-destructive + + // Check + ASSERT_EQ(redis.Length("k1"), 8); + ASSERT_TRUE(redis.Index("k1", 3, &tempv)); + ASSERT_EQ(tempv, "aftera"); + + redis.PushRight("k2", "randomkey"); + redis.PushLeft("k2", "sas"); + + redis.PopLeft("k1", &tempv); + } + + // Block three: Verify the changes from block 2 + { + RedisLists redis(kDefaultDbName, options, false); // Persistent, non-destructive + + // Check + ASSERT_EQ(redis.Length("k1"), 7); + ASSERT_EQ(redis.Length("k2"), 2); + ASSERT_TRUE(redis.Index("k1", 0, &tempv)); + ASSERT_EQ(tempv, "z"); + ASSERT_TRUE(redis.Index("k2", -2, &tempv)); + ASSERT_EQ(tempv, "sas"); + } +} + +/// THE manual REDIS TEST begins here +/// THIS WILL ONLY OCCUR IF YOU RUN: ./redis_test -m + +namespace { +void MakeUpper(std::string* const s) { + int len = s->length(); + for(int i=0; i + } +} + +/// Allows the user to enter in REDIS commands into the command-line. +/// This is useful for manual / interacticve testing / debugging. +/// Use destructive=true to clean the database before use. +/// Use destructive=false to remember the previous state (i.e.: persistent) +/// Should be called from main function. +int manual_redis_test(bool destructive){ + RedisLists redis(RedisListsTest::kDefaultDbName, + RedisListsTest::options, + destructive); + + // TODO: Right now, please use spaces to separate each word. 
+ + std::string command; + while(true) { + cin >> command; + MakeUpper(&command); + + if (command == "LINSERT") { + std::string k, t, p, v; + cin >> k >> t >> p >> v; + MakeUpper(&t); + if (t=="BEFORE") { + std::cout << redis.InsertBefore(k, p, v) << std::endl; + } else if (t=="AFTER") { + std::cout << redis.InsertAfter(k, p, v) << std::endl; + } + } else if (command == "LPUSH") { + std::string k, v; + std::cin >> k >> v; + redis.PushLeft(k, v); + } else if (command == "RPUSH") { + std::string k, v; + std::cin >> k >> v; + redis.PushRight(k, v); + } else if (command == "LPOP") { + std::string k; + std::cin >> k; + string res; + redis.PopLeft(k, &res); + std::cout << res << std::endl; + } else if (command == "RPOP") { + std::string k; + std::cin >> k; + string res; + redis.PopRight(k, &res); + std::cout << res << std::endl; + } else if (command == "LREM") { + std::string k; + int amt; + std::string v; + + std::cin >> k >> amt >> v; + std::cout << redis.Remove(k, amt, v) << std::endl; + } else if (command == "LLEN") { + std::string k; + std::cin >> k; + std::cout << redis.Length(k) << std::endl; + } else if (command == "LRANGE") { + std::string k; + int i, j; + std::cin >> k >> i >> j; + std::vector<std::string> res = redis.Range(k, i, j); + for (auto it = res.begin(); it != res.end(); ++it) { + std::cout << " " << (*it); + } + std::cout << std::endl; + } else if (command == "LTRIM") { + std::string k; + int i, j; + std::cin >> k >> i >> j; + redis.Trim(k, i, j); + } else if (command == "LSET") { + std::string k; + int idx; + std::string v; + cin >> k >> idx >> v; + redis.Set(k, idx, v); + } else if (command == "LINDEX") { + std::string k; + int idx; + std::cin >> k >> idx; + string res; + redis.Index(k, idx, &res); + std::cout << res << std::endl; + } else if (command == "PRINT") { // Added by Deon + std::string k; + cin >> k; + redis.Print(k); + } else if (command == "QUIT") { + return 0; + } else { + std::cout << "unknown command: " << command << std::endl; + } + } +} +} // namespace + +} // namespace rocksdb + + +// USAGE: "./redis_test" for default (unit tests) +// "./redis_test -m" for manual testing (redis command api) +// "./redis_test -m -d" for destructive manual test (erase db before use) + + +namespace { +// Check for "want" argument in the argument list +bool found_arg(int argc, char* argv[], const char* want){ + for(int i=1; i<argc; ++i){ + if (strcmp(argv[i], want) == 0) { + return true; + } + } + return false; +} +} // namespace + +int main(int argc, char* argv[]) { + if (found_arg(argc, argv, "-m")) { + bool destructive = found_arg(argc, argv, "-d"); + return rocksdb::manual_redis_test(destructive); + } + return rocksdb::test::RunAllTests(); +} diff --git a/utilities/ttl/db_ttl_impl.cc b/utilities/ttl/db_ttl_impl.cc new file mode 100644 --- /dev/null +++ b/utilities/ttl/db_ttl_impl.cc +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef ROCKSDB_LITE + +#include "utilities/ttl/db_ttl_impl.h" + +#include "db/write_batch_internal.h" +#include "util/coding.h" + +namespace rocksdb { + +void DBWithTTLImpl::SanitizeOptions(int32_t ttl, ColumnFamilyOptions* options, + Env* env) { + if (options->compaction_filter) { + options->compaction_filter = + new TtlCompactionFilter(ttl, env, options->compaction_filter); + } else { + options->compaction_filter_factory = + std::shared_ptr<CompactionFilterFactory>(new TtlCompactionFilterFactory( + ttl, env, options->compaction_filter_factory)); + } + + if (options->merge_operator) { + options->merge_operator.reset( + new TtlMergeOperator(options->merge_operator, env)); + } +}
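+ +// Net effect (editor's sketch, not in the original patch): given user options +// carrying a compaction filter F and a merge operator M, after +// SanitizeOptions(ttl, &opts, env) we have +// opts.compaction_filter == TtlCompactionFilter(ttl, env, F) +// opts.merge_operator == TtlMergeOperator(M, env) +// so TTL expiry and timestamp stripping happen before any user callback +// sees a value (see the class definitions in db_ttl_impl.h).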
+ +// Open the db inside DBWithTTLImpl because options needs pointer to its ttl +DBWithTTLImpl::DBWithTTLImpl(DB* db) : DBWithTTL(db) {} + +DBWithTTLImpl::~DBWithTTLImpl() { delete GetOptions().compaction_filter; } + +Status UtilityDB::OpenTtlDB(const Options& options, const std::string& dbname, + StackableDB** dbptr, int32_t ttl, bool read_only) { + DBWithTTL* db; + Status s = DBWithTTL::Open(options, dbname, &db, ttl, read_only); + if (s.ok()) { + *dbptr = db; + } else { + *dbptr = nullptr; + } + return s; +} + +Status DBWithTTL::Open(const Options& options, const std::string& dbname, + DBWithTTL** dbptr, int32_t ttl, bool read_only) { + + DBOptions db_options(options); + ColumnFamilyOptions cf_options(options); + std::vector<ColumnFamilyDescriptor> column_families; + column_families.push_back( + ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options)); + std::vector<ColumnFamilyHandle*> handles; + Status s = DBWithTTL::Open(db_options, dbname, column_families, &handles, + dbptr, {ttl}, read_only); + if (s.ok()) { + assert(handles.size() == 1); + // We can delete the handle since DBImpl is always holding a reference to + // the default column family + delete handles[0]; + } + return s; +} + +Status DBWithTTL::Open( + const DBOptions& db_options, const std::string& dbname, + const std::vector<ColumnFamilyDescriptor>& column_families, + std::vector<ColumnFamilyHandle*>* handles, DBWithTTL** dbptr, + std::vector<int32_t> ttls, bool read_only) { + + if (ttls.size() != column_families.size()) { + return Status::InvalidArgument( + "ttls size has to be the same as number of column families"); + } + + std::vector<ColumnFamilyDescriptor> column_families_sanitized = + column_families; + for (size_t i = 0; i < column_families_sanitized.size(); ++i) { + DBWithTTLImpl::SanitizeOptions( + ttls[i], &column_families_sanitized[i].options, + db_options.env == nullptr ? Env::Default() : db_options.env); + } + DB* db; + + Status st; + if (read_only) { + st = DB::OpenForReadOnly(db_options, dbname, column_families_sanitized, + handles, &db); + } else { + st = DB::Open(db_options, dbname, column_families_sanitized, handles, &db); + } + if (st.ok()) { + *dbptr = new DBWithTTLImpl(db); + } else { + *dbptr = nullptr; + } + return st; +} + +Status DBWithTTLImpl::CreateColumnFamilyWithTtl( + const ColumnFamilyOptions& options, const std::string& column_family_name, + ColumnFamilyHandle** handle, int ttl) { + ColumnFamilyOptions sanitized_options = options; + DBWithTTLImpl::SanitizeOptions(ttl, &sanitized_options, GetEnv()); + + return DBWithTTL::CreateColumnFamily(sanitized_options, column_family_name, + handle); +} + +Status DBWithTTLImpl::CreateColumnFamily(const ColumnFamilyOptions& options, + const std::string& column_family_name, + ColumnFamilyHandle** handle) { + return CreateColumnFamilyWithTtl(options, column_family_name, handle, 0); +}
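+ +// Value layout used throughout this file (editor's sketch; mirrors the +// AppendTS/StripTS pair below): +// stored value = user value bytes | EncodeFixed32(write time) +// e.g. Put("key", "v") at unix time 1400000000 persists the 5-byte value +// "v" + fixed32(1400000000), and Get() strips the 4-byte suffix again once +// SanityCheckTimestamp() accepts it.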
+ +// Appends the current timestamp to the string. +// Returns failure if the current time cannot be obtained; on success the +// timestamp is appended and OK is returned. +Status DBWithTTLImpl::AppendTS(const Slice& val, std::string* val_with_ts, + Env* env) { + val_with_ts->reserve(kTSLength + val.size()); + char ts_string[kTSLength]; + int64_t curtime; + Status st = env->GetCurrentTime(&curtime); + if (!st.ok()) { + return st; + } + EncodeFixed32(ts_string, (int32_t)curtime); + val_with_ts->append(val.data(), val.size()); + val_with_ts->append(ts_string, kTSLength); + return st; +} + +// Returns corruption if the length of the string is less than the timestamp's, +// or if the timestamp refers to a time before the ttl-feature release time +Status DBWithTTLImpl::SanityCheckTimestamp(const Slice& str) { + if (str.size() < kTSLength) { + return Status::Corruption("Error: value's length less than timestamp's\n"); + } + // Checks that TS is not less than kMinTimestamp + // Guards against corruption & normal database opened incorrectly in ttl mode + int32_t timestamp_value = DecodeFixed32(str.data() + str.size() - kTSLength); + if (timestamp_value < kMinTimestamp) { + return Status::Corruption("Error: Timestamp < ttl feature release time!\n"); + } + return Status::OK(); +} + +// Checks if the string is stale or not according to the TTL provided +bool DBWithTTLImpl::IsStale(const Slice& value, int32_t ttl, Env* env) { + if (ttl <= 0) { // Data is fresh if TTL is non-positive + return false; + } + int64_t curtime; + if (!env->GetCurrentTime(&curtime).ok()) { + return false; // Treat the data as fresh if current time cannot be obtained + } + int32_t timestamp_value = + DecodeFixed32(value.data() + value.size() - kTSLength); + return (timestamp_value + ttl) < curtime; +} + +// Strips the TS from the end of the string +Status DBWithTTLImpl::StripTS(std::string* str) { + Status st; + if (str->length() < kTSLength) { + return Status::Corruption("Bad timestamp in key-value"); + } + // Erasing characters which hold the TS + str->erase(str->length() - kTSLength, kTSLength); + return st; +} + +Status DBWithTTLImpl::Put(const WriteOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + const Slice& val) { + WriteBatch batch; + batch.Put(column_family, key, val); + return Write(options, &batch); +} + +Status DBWithTTLImpl::Get(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + std::string* value) { + Status st = db_->Get(options, column_family, key, value); + if (!st.ok()) { + return st; + } + st = SanityCheckTimestamp(*value); + if (!st.ok()) { + return st; + } + return StripTS(value); +} + +std::vector<Status> DBWithTTLImpl::MultiGet( + const ReadOptions& options, + const std::vector<ColumnFamilyHandle*>& column_family, + const std::vector<Slice>& keys, std::vector<std::string>* values) { + return std::vector<Status>( + keys.size(), Status::NotSupported("MultiGet not supported with TTL")); +} + +bool DBWithTTLImpl::KeyMayExist(const ReadOptions& options, + ColumnFamilyHandle* column_family, + const Slice& key, std::string* value, + bool* value_found) { + bool ret = db_->KeyMayExist(options, column_family, key, value, value_found); + if (ret && value != nullptr && value_found != nullptr && *value_found) { + if (!SanityCheckTimestamp(*value).ok() || !StripTS(value).ok()) { + return false; + } + } + return ret; +} + +Status DBWithTTLImpl::Merge(const WriteOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value) { + WriteBatch batch; + batch.Merge(column_family, key, value); + return Write(options, &batch); +} + +Status DBWithTTLImpl::Write(const WriteOptions& opts, WriteBatch*
updates) { + class Handler : public WriteBatch::Handler { + public: + explicit Handler(Env* env) : env_(env) {} + WriteBatch updates_ttl; + Status batch_rewrite_status; + virtual Status PutCF(uint32_t column_family_id, const Slice& key, + const Slice& value) { + std::string value_with_ts; + Status st = AppendTS(value, &value_with_ts, env_); + if (!st.ok()) { + batch_rewrite_status = st; + } else { + WriteBatchInternal::Put(&updates_ttl, column_family_id, key, + value_with_ts); + } + return Status::OK(); + } + virtual Status MergeCF(uint32_t column_family_id, const Slice& key, + const Slice& value) { + std::string value_with_ts; + Status st = AppendTS(value, &value_with_ts, env_); + if (!st.ok()) { + batch_rewrite_status = st; + } else { + WriteBatchInternal::Merge(&updates_ttl, column_family_id, key, + value_with_ts); + } + return Status::OK(); + } + virtual Status DeleteCF(uint32_t column_family_id, const Slice& key) { + WriteBatchInternal::Delete(&updates_ttl, column_family_id, key); + return Status::OK(); + } + virtual void LogData(const Slice& blob) { updates_ttl.PutLogData(blob); } + + private: + Env* env_; + }; + Handler handler(GetEnv()); + updates->Iterate(&handler); + if (!handler.batch_rewrite_status.ok()) { + return handler.batch_rewrite_status; + } else { + return db_->Write(opts, &(handler.updates_ttl)); + } +} + +Iterator* DBWithTTLImpl::NewIterator(const ReadOptions& opts, + ColumnFamilyHandle* column_family) { + return new TtlIterator(db_->NewIterator(opts, column_family)); +} + +} // namespace rocksdb +#endif // ROCKSDB_LITE diff --git a/utilities/ttl/db_ttl_impl.h b/utilities/ttl/db_ttl_impl.h new file mode 100644 index 0000000000..a5c8fc8cac --- /dev/null +++ b/utilities/ttl/db_ttl_impl.h @@ -0,0 +1,314 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#pragma once + +#ifndef ROCKSDB_LITE +#include <deque> +#include <string> +#include <vector> + +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/compaction_filter.h" +#include "rocksdb/merge_operator.h" +#include "utilities/utility_db.h" +#include "utilities/db_ttl.h" +#include "db/db_impl.h" + +namespace rocksdb { + +class DBWithTTLImpl : public DBWithTTL { + public: + static void SanitizeOptions(int32_t ttl, ColumnFamilyOptions* options, + Env* env); + + explicit DBWithTTLImpl(DB* db); + + virtual ~DBWithTTLImpl(); + + Status CreateColumnFamilyWithTtl(const ColumnFamilyOptions& options, + const std::string& column_family_name, + ColumnFamilyHandle** handle, + int ttl) override; + + Status CreateColumnFamily(const ColumnFamilyOptions& options, + const std::string& column_family_name, + ColumnFamilyHandle** handle) override; + + using StackableDB::Put; + virtual Status Put(const WriteOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + const Slice& val) override; + + using StackableDB::Get; + virtual Status Get(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + std::string* value) override; + + using StackableDB::MultiGet; + virtual std::vector<Status> MultiGet( + const ReadOptions& options, + const std::vector<ColumnFamilyHandle*>& column_family, + const std::vector<Slice>& keys, + std::vector<std::string>* values) override; + + using StackableDB::KeyMayExist; + virtual bool KeyMayExist(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + std::string* value, + bool* value_found = nullptr) override; + + using StackableDB::Merge; + virtual Status Merge(const WriteOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value) override; + + virtual Status Write(const WriteOptions& opts, WriteBatch* updates) override; + + using StackableDB::NewIterator; + virtual Iterator* NewIterator(const ReadOptions& opts, + ColumnFamilyHandle* column_family) override; + + virtual DB* GetBaseDB() { return db_; } + + static bool IsStale(const Slice& value, int32_t ttl, Env* env); + + static Status AppendTS(const Slice& val, std::string* val_with_ts, Env* env); + + static Status SanityCheckTimestamp(const Slice& str); + + static Status StripTS(std::string* str); + + static const uint32_t kTSLength = sizeof(int32_t); // size of timestamp + + static const int32_t kMinTimestamp = 1368146402; // 05/09/2013:5:40PM GMT-8 + + static const int32_t kMaxTimestamp = 2147483647; // 01/18/2038:7:14PM GMT-8 +}; + +class TtlIterator : public Iterator { + + public: + explicit TtlIterator(Iterator* iter) : iter_(iter) { assert(iter_); } + + ~TtlIterator() { delete iter_; } + + bool Valid() const { return iter_->Valid(); } + + void SeekToFirst() { iter_->SeekToFirst(); } + + void SeekToLast() { iter_->SeekToLast(); } + + void Seek(const Slice& target) { iter_->Seek(target); } + + void Next() { iter_->Next(); } + + void Prev() { iter_->Prev(); } + + Slice key() const { return iter_->key(); } + + int32_t timestamp() const { + return DecodeFixed32(iter_->value().data() + iter_->value().size() - + DBWithTTLImpl::kTSLength); + } + + Slice value() const { + // TODO: handle timestamp corruption like in general iterator semantics + assert(DBWithTTLImpl::SanityCheckTimestamp(iter_->value()).ok()); + Slice trimmed_value = iter_->value(); + trimmed_value.size_ -= DBWithTTLImpl::kTSLength; + return trimmed_value; + } + + Status status() const { return iter_->status(); } + + private: + Iterator* iter_; +};
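+ +// Reading back through TtlIterator (editor's sketch, not in the original +// patch): if "mykey" was stored as "abc" + 4 timestamp bytes, then +// it->value() == "abc" // suffix trimmed by value() above +// it->timestamp() // the 32-bit write time, for callers that want it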
+ +class TtlCompactionFilter : public CompactionFilter { + public: + TtlCompactionFilter( + int32_t ttl, Env* env, const CompactionFilter* user_comp_filter, + std::unique_ptr<const CompactionFilter> user_comp_filter_from_factory = + nullptr) + : ttl_(ttl), + env_(env), + user_comp_filter_(user_comp_filter), + user_comp_filter_from_factory_( + std::move(user_comp_filter_from_factory)) { + // Unlike the merge operator, a compaction filter is necessary for TTL, + // hence this is constructed even if the user doesn't specify any + // compaction filter + if (!user_comp_filter_) { + user_comp_filter_ = user_comp_filter_from_factory_.get(); + } + } + + virtual bool Filter(int level, const Slice& key, const Slice& old_val, + std::string* new_val, bool* value_changed) const + override { + if (DBWithTTLImpl::IsStale(old_val, ttl_, env_)) { + return true; + } + if (user_comp_filter_ == nullptr) { + return false; + } + assert(old_val.size() >= DBWithTTLImpl::kTSLength); + Slice old_val_without_ts(old_val.data(), + old_val.size() - DBWithTTLImpl::kTSLength); + if (user_comp_filter_->Filter(level, key, old_val_without_ts, new_val, + value_changed)) { + return true; + } + if (*value_changed) { + new_val->append( + old_val.data() + old_val.size() - DBWithTTLImpl::kTSLength, + DBWithTTLImpl::kTSLength); + } + return false; + } + + virtual const char* Name() const override { return "Delete By TTL"; } + + private: + int32_t ttl_; + Env* env_; + const CompactionFilter* user_comp_filter_; + std::unique_ptr<const CompactionFilter> user_comp_filter_from_factory_; +}; + +class TtlCompactionFilterFactory : public CompactionFilterFactory { + public: + TtlCompactionFilterFactory( + int32_t ttl, Env* env, + std::shared_ptr<CompactionFilterFactory> comp_filter_factory) + : ttl_(ttl), env_(env), user_comp_filter_factory_(comp_filter_factory) {} + + virtual std::unique_ptr<CompactionFilter> CreateCompactionFilter( + const CompactionFilter::Context& context) { + return std::unique_ptr<CompactionFilter>(new TtlCompactionFilter( + ttl_, env_, nullptr, + std::move(user_comp_filter_factory_->CreateCompactionFilter(context)))); + } + + virtual const char* Name() const override { + return "TtlCompactionFilterFactory"; + } + + private: + int32_t ttl_; + Env* env_; + std::shared_ptr<CompactionFilterFactory> user_comp_filter_factory_; +};
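+ +// TtlMergeOperator below wraps a user merge operator M. Sketch of a merge +// with two operands (editor's illustration, not in the original patch): +// existing: "1" + TS(t0) operands: "+2" + TS(t1), "+3" + TS(t2) +// -> strip all three 4-byte suffixes, call M on ("1", {"+2", "+3"}), +// -> append a fresh TS(now) to M's result before it is stored.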
+ +class TtlMergeOperator : public MergeOperator { + + public: + explicit TtlMergeOperator(const std::shared_ptr<MergeOperator> merge_op, + Env* env) + : user_merge_op_(merge_op), env_(env) { + assert(merge_op); + assert(env); + } + + virtual bool FullMerge(const Slice& key, const Slice* existing_value, + const std::deque<std::string>& operands, + std::string* new_value, Logger* logger) const + override { + const uint32_t ts_len = DBWithTTLImpl::kTSLength; + if (existing_value && existing_value->size() < ts_len) { + Log(logger, "Error: Could not remove timestamp from existing value."); + return false; + } + + // Strip the time-stamp from each operand to be passed to user_merge_op_ + std::deque<std::string> operands_without_ts; + for (const auto& operand : operands) { + if (operand.size() < ts_len) { + Log(logger, "Error: Could not remove timestamp from operand value."); + return false; + } + operands_without_ts.push_back(operand.substr(0, operand.size() - ts_len)); + } + + // Apply the user merge operator (store result in *new_value) + bool good = true; + if (existing_value) { + Slice existing_value_without_ts(existing_value->data(), + existing_value->size() - ts_len); + good = user_merge_op_->FullMerge(key, &existing_value_without_ts, + operands_without_ts, new_value, logger); + } else { + good = user_merge_op_->FullMerge(key, nullptr, operands_without_ts, + new_value, logger); + } + + // Return false if the user merge operator returned false + if (!good) { + return false; + } + + // Augment the *new_value with the ttl time-stamp + int64_t curtime; + if (!env_->GetCurrentTime(&curtime).ok()) { + Log(logger, + "Error: Could not get current time to be attached internally " + "to the new value."); + return false; + } else { + char ts_string[ts_len]; + EncodeFixed32(ts_string, (int32_t)curtime); + new_value->append(ts_string, ts_len); + return true; + } + } + + virtual bool PartialMergeMulti(const Slice& key, + const std::deque<Slice>& operand_list, + std::string* new_value, Logger* logger) const + override { + const uint32_t ts_len = DBWithTTLImpl::kTSLength; + std::deque<Slice> operands_without_ts; + + for (const auto& operand : operand_list) { + if (operand.size() < ts_len) { + Log(logger, "Error: Could not remove timestamp from value."); + return false; + } + + operands_without_ts.push_back( + Slice(operand.data(), operand.size() - ts_len)); + } + + // Apply the user partial-merge operator (store result in *new_value) + assert(new_value); + if (!user_merge_op_->PartialMergeMulti(key, operands_without_ts, new_value, + logger)) { + return false; + } + + // Augment the *new_value with the ttl time-stamp + int64_t curtime; + if (!env_->GetCurrentTime(&curtime).ok()) { + Log(logger, + "Error: Could not get current time to be attached internally " + "to the new value."); + return false; + } else { + char ts_string[ts_len]; + EncodeFixed32(ts_string, (int32_t)curtime); + new_value->append(ts_string, ts_len); + return true; + } + } + + virtual const char* Name() const override { return "Merge By TTL"; } + + private: + std::shared_ptr<MergeOperator> user_merge_op_; + Env* env_; +}; +} // namespace rocksdb +#endif // ROCKSDB_LITE diff --git a/utilities/ttl/ttl_test.cc b/utilities/ttl/ttl_test.cc new file mode 100644 index 0000000000..4791a2a770 --- /dev/null +++ b/utilities/ttl/ttl_test.cc @@ -0,0 +1,595 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors.
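+ +// Note for readers (editor's sketch, not in the original patch): the tests +// below do not really sleep. SpecialTimeEnv, defined next, serves +// GetCurrentTime() from a counter and Sleep(n) merely adds n to it, e.g. +// env_->Sleep(2); // "two seconds" pass instantly +// which makes data written under ttl=1 look stale to the compaction filter.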
+ +#include <memory> +#include "rocksdb/compaction_filter.h" +#include "utilities/db_ttl.h" +#include "util/testharness.h" +#include "util/logging.h" +#include <map> +#include <unistd.h> + +namespace rocksdb { + +namespace { + +typedef std::map<std::string, std::string> KVMap; + +enum BatchOperation { + PUT = 0, + DELETE = 1 +}; +} + +class SpecialTimeEnv : public EnvWrapper { + public: + explicit SpecialTimeEnv(Env* base) : EnvWrapper(base) { + base->GetCurrentTime(&current_time_); + } + + void Sleep(int64_t sleep_time) { current_time_ += sleep_time; } + virtual Status GetCurrentTime(int64_t* current_time) { + *current_time = current_time_; + return Status::OK(); + } + + private: + int64_t current_time_; +}; + +class TtlTest { + public: + TtlTest() { + env_.reset(new SpecialTimeEnv(Env::Default())); + dbname_ = test::TmpDir() + "/db_ttl"; + options_.create_if_missing = true; + options_.env = env_.get(); + // ensure that compaction is kicked in to always strip timestamp from kvs + options_.max_grandparent_overlap_factor = 0; + // compaction should take place always from level0 for determinism + options_.max_mem_compaction_level = 0; + db_ttl_ = nullptr; + DestroyDB(dbname_, Options()); + } + + ~TtlTest() { + CloseTtl(); + DestroyDB(dbname_, Options()); + } + + // Open database with TTL support; no TTL provided (defaults to infinity) + void OpenTtl() { + ASSERT_TRUE(db_ttl_ == + nullptr); // db should be closed before opening again + ASSERT_OK(DBWithTTL::Open(options_, dbname_, &db_ttl_)); + } + + // Open database with TTL support with the given TTL + void OpenTtl(int32_t ttl) { + ASSERT_TRUE(db_ttl_ == nullptr); + ASSERT_OK(DBWithTTL::Open(options_, dbname_, &db_ttl_, ttl)); + } + + // Open with TestFilter compaction filter + void OpenTtlWithTestCompaction(int32_t ttl) { + options_.compaction_filter_factory = + std::shared_ptr<CompactionFilterFactory>( + new TestFilterFactory(kSampleSize_, kNewValue_)); + OpenTtl(ttl); + } + + // Open database with TTL support in read_only mode + void OpenReadOnlyTtl(int32_t ttl) { + ASSERT_TRUE(db_ttl_ == nullptr); + ASSERT_OK(DBWithTTL::Open(options_, dbname_, &db_ttl_, ttl, true)); + } + + void CloseTtl() { + delete db_ttl_; + db_ttl_ = nullptr; + } + + // Populates kvmap_ with num_entries key-value pairs + void MakeKVMap(int64_t num_entries) { + kvmap_.clear(); + int digits = 1; + for (int dummy = num_entries; dummy /= 10 ; ++digits); + int digits_in_i = 1; + for (int64_t i = 0; i < num_entries; i++) { + std::string key = "key"; + std::string value = "value"; + if (i % 10 == 0) { + digits_in_i++; + } + for(int j = digits_in_i; j < digits; j++) { + key.append("0"); + value.append("0"); + } + AppendNumberTo(&key, i); + AppendNumberTo(&value, i); + kvmap_[key] = value; + } + ASSERT_EQ((int)kvmap_.size(), num_entries); // check all insertions done + } + + // Makes a write-batch with key-vals from kvmap_ and Write()s it + void MakePutWriteBatch(const BatchOperation* batch_ops, int num_ops) { + ASSERT_LE(num_ops, (int)kvmap_.size()); + static WriteOptions wopts; + static FlushOptions flush_opts; + WriteBatch batch; + kv_it_ = kvmap_.begin(); + for (int i = 0; i < num_ops && kv_it_ != kvmap_.end(); i++, kv_it_++) { + switch (batch_ops[i]) { + case PUT: + batch.Put(kv_it_->first, kv_it_->second); + break; + case DELETE: + batch.Delete(kv_it_->first); + break; + default: + ASSERT_TRUE(false); + } + } + db_ttl_->Write(wopts, &batch); + db_ttl_->Flush(flush_opts); + } + + // Puts num_entries starting from start_pos_map from kvmap_ into the database + void PutValues(int start_pos_map, int num_entries, bool flush = true, + ColumnFamilyHandle*
cf = nullptr) { + ASSERT_TRUE(db_ttl_); + ASSERT_LE(start_pos_map + num_entries, (int)kvmap_.size()); + static WriteOptions wopts; + static FlushOptions flush_opts; + kv_it_ = kvmap_.begin(); + advance(kv_it_, start_pos_map); + for (int i = 0; kv_it_ != kvmap_.end() && i < num_entries; i++, kv_it_++) { + ASSERT_OK(cf == nullptr + ? db_ttl_->Put(wopts, kv_it_->first, kv_it_->second) + : db_ttl_->Put(wopts, cf, kv_it_->first, kv_it_->second)); + } + // Put a mock kv at the end because CompactionFilter doesn't delete last key + ASSERT_OK(cf == nullptr ? db_ttl_->Put(wopts, "keymock", "valuemock") + : db_ttl_->Put(wopts, cf, "keymock", "valuemock")); + if (flush) { + if (cf == nullptr) { + db_ttl_->Flush(flush_opts); + } else { + db_ttl_->Flush(flush_opts, cf); + } + } + } + + // Runs a manual compaction + void ManualCompact(ColumnFamilyHandle* cf = nullptr) { + if (cf == nullptr) { + db_ttl_->CompactRange(nullptr, nullptr); + } else { + db_ttl_->CompactRange(cf, nullptr, nullptr); + } + } + + // checks the whole kvmap_ to return correct values using KeyMayExist + void SimpleKeyMayExistCheck() { + static ReadOptions ropts; + bool value_found; + std::string val; + for(auto &kv : kvmap_) { + bool ret = db_ttl_->KeyMayExist(ropts, kv.first, &val, &value_found); + if (ret == false || value_found == false) { + fprintf(stderr, "KeyMayExist could not find key=%s in the database but" + " should have\n", kv.first.c_str()); + ASSERT_TRUE(false); + } else if (val.compare(kv.second) != 0) { + fprintf(stderr, " value for key=%s present in database is %s but" + " should be %s\n", kv.first.c_str(), val.c_str(), + kv.second.c_str()); + ASSERT_TRUE(false); + } + } + } + + // Sleeps for slp_tim then runs a manual compaction + // Checks span starting from st_pos from kvmap_ in the db and + // Gets should return true if check is true and false otherwise + // Also checks that value that we got is the same as inserted; and =kNewValue + // if test_compaction_change is true + void SleepCompactCheck(int slp_tim, int st_pos, int span, bool check = true, + bool test_compaction_change = false, + ColumnFamilyHandle* cf = nullptr) { + ASSERT_TRUE(db_ttl_); + + env_->Sleep(slp_tim); + ManualCompact(cf); + static ReadOptions ropts; + kv_it_ = kvmap_.begin(); + advance(kv_it_, st_pos); + std::string v; + for (int i = 0; kv_it_ != kvmap_.end() && i < span; i++, kv_it_++) { + Status s = (cf == nullptr) ? 
db_ttl_->Get(ropts, kv_it_->first, &v) + : db_ttl_->Get(ropts, cf, kv_it_->first, &v); + if (s.ok() != check) { + fprintf(stderr, "key=%s ", kv_it_->first.c_str()); + if (!s.ok()) { + fprintf(stderr, "is absent from db but was expected to be present\n"); + } else { + fprintf(stderr, "is present in db but was expected to be absent\n"); + } + ASSERT_TRUE(false); + } else if (s.ok()) { + if (test_compaction_change && v.compare(kNewValue_) != 0) { + fprintf(stderr, " value for key=%s present in database is %s but " + " should be %s\n", kv_it_->first.c_str(), v.c_str(), + kNewValue_.c_str()); + ASSERT_TRUE(false); + } else if (!test_compaction_change && v.compare(kv_it_->second) != 0) { + fprintf(stderr, " value for key=%s present in database is %s but " + " should be %s\n", kv_it_->first.c_str(), v.c_str(), + kv_it_->second.c_str()); + ASSERT_TRUE(false); + } + } + } + } + + // Similar to SleepCompactCheck but uses TtlIterator to read from db + void SleepCompactCheckIter(int slp, int st_pos, int span, bool check=true) { + ASSERT_TRUE(db_ttl_); + env_->Sleep(slp); + ManualCompact(); + static ReadOptions ropts; + Iterator *dbiter = db_ttl_->NewIterator(ropts); + kv_it_ = kvmap_.begin(); + advance(kv_it_, st_pos); + + dbiter->Seek(kv_it_->first); + if (!check) { + if (dbiter->Valid()) { + ASSERT_NE(dbiter->value().compare(kv_it_->second), 0); + } + } else { // dbiter should have found out kvmap_[st_pos] + for (int i = st_pos; + kv_it_ != kvmap_.end() && i < st_pos + span; + i++, kv_it_++) { + ASSERT_TRUE(dbiter->Valid()); + ASSERT_EQ(dbiter->value().compare(kv_it_->second), 0); + dbiter->Next(); + } + } + delete dbiter; + } + + class TestFilter : public CompactionFilter { + public: + TestFilter(const int64_t kSampleSize, const std::string kNewValue) + : kSampleSize_(kSampleSize), + kNewValue_(kNewValue) { + } + + // Works on keys of the form "key<number>" + // Drops key if number at the end of key is in [0, kSampleSize_/3), + // Keeps key if it is in [kSampleSize_/3, 2*kSampleSize_/3), + // Change value if it is in [2*kSampleSize_/3, kSampleSize_) + // Eg. kSampleSize_=6. Drop:key0-1...Keep:key2-3...Change:key4-5...
+ virtual bool Filter(int level, const Slice& key, + const Slice& value, std::string* new_value, + bool* value_changed) const override { + assert(new_value != nullptr); + + std::string search_str = "0123456789"; + std::string key_string = key.ToString(); + size_t pos = key_string.find_first_of(search_str); + int num_key_end; + if (pos != std::string::npos) { + num_key_end = stoi(key_string.substr(pos, key.size() - pos)); + } else { + return false; // Keep keys not matching the format "key<number>" + } + + int partition = kSampleSize_ / 3; + if (num_key_end < partition) { + return true; + } else if (num_key_end < partition * 2) { + return false; + } else { + *new_value = kNewValue_; + *value_changed = true; + return false; + } + } + + virtual const char* Name() const override { + return "TestFilter"; + } + + private: + const int64_t kSampleSize_; + const std::string kNewValue_; + }; + + class TestFilterFactory : public CompactionFilterFactory { + public: + TestFilterFactory(const int64_t kSampleSize, const std::string kNewValue) + : kSampleSize_(kSampleSize), + kNewValue_(kNewValue) { + } + + virtual std::unique_ptr<CompactionFilter> CreateCompactionFilter( + const CompactionFilter::Context& context) override { + return std::unique_ptr<CompactionFilter>( + new TestFilter(kSampleSize_, kNewValue_)); + } + + virtual const char* Name() const override { + return "TestFilterFactory"; + } + + private: + const int64_t kSampleSize_; + const std::string kNewValue_; + }; + + + // Choose carefully so that Put, Gets & Compaction complete in 1 second buffer + const int64_t kSampleSize_ = 100; + std::string dbname_; + DBWithTTL* db_ttl_; + unique_ptr<SpecialTimeEnv> env_; + + private: + Options options_; + KVMap kvmap_; + KVMap::iterator kv_it_; + const std::string kNewValue_ = "new_value"; + unique_ptr<CompactionFilter> test_comp_filter_; +}; // class TtlTest
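+ +// Convention in the tests below (editor's note, not in the original patch): +// "T=n" in a comment is the fake clock value n seconds after the test began; +// e.g. with OpenTtl(2), keys put at T=0 become eligible for deletion at T=2, +// and SleepCompactCheck(1, ...) advances the clock by 1 before checking.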
+ +// If TTL is non-positive or not provided, the behaviour is TTL = infinity +// This test opens the db 3 times with such default behavior and inserts a +// bunch of kvs each time. All kvs should accumulate in the db till the end +// Partitions the sample-size provided into 3 sets over boundary1 and boundary2 +TEST(TtlTest, NoEffect) { + MakeKVMap(kSampleSize_); + int boundary1 = kSampleSize_ / 3; + int boundary2 = 2 * boundary1; + + OpenTtl(); + PutValues(0, boundary1); //T=0: Set1 never deleted + SleepCompactCheck(1, 0, boundary1); //T=1: Set1 still there + CloseTtl(); + + OpenTtl(0); + PutValues(boundary1, boundary2 - boundary1); //T=1: Set2 never deleted + SleepCompactCheck(1, 0, boundary2); //T=2: Sets1 & 2 still there + CloseTtl(); + + OpenTtl(-1); + PutValues(boundary2, kSampleSize_ - boundary2); //T=3: Set3 never deleted + SleepCompactCheck(1, 0, kSampleSize_, true); //T=4: Sets 1,2,3 still there + CloseTtl(); +} + +// Puts a set of values and checks its presence using Get during ttl +TEST(TtlTest, PresentDuringTTL) { + MakeKVMap(kSampleSize_); + + OpenTtl(2); // T=0:Open the db with ttl = 2 + PutValues(0, kSampleSize_); // T=0:Insert Set1. Delete at t=2 + SleepCompactCheck(1, 0, kSampleSize_, true); // T=1:Set1 should still be there + CloseTtl(); +} + +// Puts a set of values and checks its absence using Get after ttl +TEST(TtlTest, AbsentAfterTTL) { + MakeKVMap(kSampleSize_); + + OpenTtl(1); // T=0:Open the db with ttl = 1 + PutValues(0, kSampleSize_); // T=0:Insert Set1. Delete at t=1 + SleepCompactCheck(2, 0, kSampleSize_, false); // T=2:Set1 should not be there + CloseTtl(); +} + +// Resets the timestamp of a set of kvs by updating them and checks that they +// are not deleted according to the old timestamp +TEST(TtlTest, ResetTimestamp) { + MakeKVMap(kSampleSize_); + + OpenTtl(3); + PutValues(0, kSampleSize_); // T=0: Insert Set1. Delete at t=3 + env_->Sleep(2); // T=2 + PutValues(0, kSampleSize_); // T=2: Insert Set1. Delete at t=5 + SleepCompactCheck(2, 0, kSampleSize_); // T=4: Set1 should still be there + CloseTtl(); +} + +// Similar to PresentDuringTTL but uses Iterator +TEST(TtlTest, IterPresentDuringTTL) { + MakeKVMap(kSampleSize_); + + OpenTtl(2); + PutValues(0, kSampleSize_); // T=0: Insert. Delete at t=2 + SleepCompactCheckIter(1, 0, kSampleSize_); // T=1: Set should be there + CloseTtl(); +} + +// Similar to AbsentAfterTTL but uses Iterator +TEST(TtlTest, IterAbsentAfterTTL) { + MakeKVMap(kSampleSize_); + + OpenTtl(1); + PutValues(0, kSampleSize_); // T=0: Insert. Delete at t=1 + SleepCompactCheckIter(2, 0, kSampleSize_, false); // T=2: Should not be there + CloseTtl(); +} + +// Checks presence while opening the same db more than once with the same ttl +// Note: The second open will open the same db +TEST(TtlTest, MultiOpenSamePresent) { + MakeKVMap(kSampleSize_); + + OpenTtl(2); + PutValues(0, kSampleSize_); // T=0: Insert. Delete at t=2 + CloseTtl(); + + OpenTtl(2); // T=0. Delete at t=2 + SleepCompactCheck(1, 0, kSampleSize_); // T=1: Set should be there + CloseTtl(); +} + +// Checks absence while opening the same db more than once with the same ttl +// Note: The second open will open the same db +TEST(TtlTest, MultiOpenSameAbsent) { + MakeKVMap(kSampleSize_); + + OpenTtl(1); + PutValues(0, kSampleSize_); // T=0: Insert. Delete at t=1 + CloseTtl(); + + OpenTtl(1); // T=0. Delete at t=1 + SleepCompactCheck(2, 0, kSampleSize_, false); // T=2: Set should not be there + CloseTtl(); +} + +// Checks presence while opening the same db more than once with bigger ttl +TEST(TtlTest, MultiOpenDifferent) { + MakeKVMap(kSampleSize_); + + OpenTtl(1); + PutValues(0, kSampleSize_); // T=0: Insert. Delete at t=1 + CloseTtl(); + + OpenTtl(3); // T=0: Set deleted at t=3 + SleepCompactCheck(2, 0, kSampleSize_); // T=2: Set should be there + CloseTtl(); +} + +// Checks presence during ttl in read_only mode +TEST(TtlTest, ReadOnlyPresentForever) { + MakeKVMap(kSampleSize_); + + OpenTtl(1); // T=0:Open the db normally + PutValues(0, kSampleSize_); // T=0:Insert Set1. Delete at t=1 + CloseTtl(); + + OpenReadOnlyTtl(1); + SleepCompactCheck(2, 0, kSampleSize_); // T=2:Set1 should still be there + CloseTtl(); +} + +// Checks whether WriteBatch works well with TTL +// Puts all kvs in kvmap_ in a batch and writes first, then deletes first half +TEST(TtlTest, WriteBatchTest) { + MakeKVMap(kSampleSize_); + BatchOperation batch_ops[kSampleSize_]; + for (int i = 0; i < kSampleSize_; i++) { + batch_ops[i] = PUT; + } + + OpenTtl(2); + MakePutWriteBatch(batch_ops, kSampleSize_); + for (int i = 0; i < kSampleSize_ / 2; i++) { + batch_ops[i] = DELETE; + } + MakePutWriteBatch(batch_ops, kSampleSize_ / 2); + SleepCompactCheck(0, 0, kSampleSize_ / 2, false); + SleepCompactCheck(0, kSampleSize_ / 2, kSampleSize_ - kSampleSize_ / 2); + CloseTtl(); +}
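+ +// Worked example for the test that follows (editor's note, not in the +// original patch): with kSampleSize_ = 100 the TestFilter partition is 33, +// so keys whose trailing number is 0..32 are dropped, 33..65 are kept as-is, +// and 66..99 have their value rewritten to kNewValue_.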
+ +// Checks user's compaction filter for correctness with TTL logic +TEST(TtlTest, CompactionFilter) { + MakeKVMap(kSampleSize_); + + OpenTtlWithTestCompaction(1); + PutValues(0, kSampleSize_); // T=0:Insert Set1. Delete at t=1 + // T=2: TTL logic takes precedence over TestFilter:-Set1 should not be there + SleepCompactCheck(2, 0, kSampleSize_, false); + CloseTtl(); + + OpenTtlWithTestCompaction(3); + PutValues(0, kSampleSize_); // T=0:Insert Set1. + int partition = kSampleSize_ / 3; + SleepCompactCheck(1, 0, partition, false); // Part dropped + SleepCompactCheck(0, partition, partition); // Part kept + SleepCompactCheck(0, 2 * partition, partition, true, true); // Part changed + CloseTtl(); +} + +// Inserts some key-values which KeyMayExist should be able to get, and checks +// that the values returned are correct +TEST(TtlTest, KeyMayExist) { + MakeKVMap(kSampleSize_); + + OpenTtl(); + PutValues(0, kSampleSize_, false); + + SimpleKeyMayExistCheck(); + + CloseTtl(); +} + +TEST(TtlTest, ColumnFamiliesTest) { + DB* db; + Options options; + options.create_if_missing = true; + options.env = env_.get(); + + DB::Open(options, dbname_, &db); + ColumnFamilyHandle* handle; + ASSERT_OK(db->CreateColumnFamily(ColumnFamilyOptions(options), + "ttl_column_family", &handle)); + + delete handle; + delete db; + + std::vector<ColumnFamilyDescriptor> column_families; + column_families.push_back(ColumnFamilyDescriptor( + kDefaultColumnFamilyName, ColumnFamilyOptions(options))); + column_families.push_back(ColumnFamilyDescriptor( + "ttl_column_family", ColumnFamilyOptions(options))); + + std::vector<ColumnFamilyHandle*> handles; + + ASSERT_OK(DBWithTTL::Open(DBOptions(options), dbname_, column_families, + &handles, &db_ttl_, {3, 5}, false)); + ASSERT_EQ(handles.size(), 2U); + ColumnFamilyHandle* new_handle; + ASSERT_OK(db_ttl_->CreateColumnFamilyWithTtl(options, "ttl_column_family_2", + &new_handle, 2)); + handles.push_back(new_handle); + + MakeKVMap(kSampleSize_); + PutValues(0, kSampleSize_, false, handles[0]); + PutValues(0, kSampleSize_, false, handles[1]); + PutValues(0, kSampleSize_, false, handles[2]); + + // everything should be there after 1 second + SleepCompactCheck(1, 0, kSampleSize_, true, false, handles[0]); + SleepCompactCheck(0, 0, kSampleSize_, true, false, handles[1]); + SleepCompactCheck(0, 0, kSampleSize_, true, false, handles[2]); + + // only column family 1 should be alive after 4 seconds + SleepCompactCheck(3, 0, kSampleSize_, false, false, handles[0]); + SleepCompactCheck(0, 0, kSampleSize_, true, false, handles[1]); + SleepCompactCheck(0, 0, kSampleSize_, false, false, handles[2]); + + // nothing should be there after 6 seconds + SleepCompactCheck(2, 0, kSampleSize_, false, false, handles[0]); + SleepCompactCheck(0, 0, kSampleSize_, false, false, handles[1]); + SleepCompactCheck(0, 0, kSampleSize_, false, false, handles[2]); + + for (auto h : handles) { + delete h; + } + delete db_ttl_; + db_ttl_ = nullptr; +} + +} // namespace rocksdb + +// A black-box test for the ttl wrapper around rocksdb +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +}