mirror of
https://github.com/XRPLF/rippled.git
synced 2025-11-29 15:35:50 +00:00
Squashed 'src/rocksdb/' content from commit 457bae6
git-subtree-dir: src/rocksdb
git-subtree-split: 457bae6911
This commit is contained in:
10
.arcconfig
Normal file
10
.arcconfig
Normal file
@@ -0,0 +1,10 @@
|
||||
{
|
||||
"project_id" : "rocksdb",
|
||||
"conduit_uri" : "https://reviews.facebook.net/",
|
||||
"copyright_holder" : "Facebook",
|
||||
"load" : [
|
||||
"linters"
|
||||
],
|
||||
"lint.engine" : "FacebookFbcodeLintEngine",
|
||||
"lint.engine.single.linter" : "FbcodeCppLinter"
|
||||
}
|
||||
5
.clang-format
Normal file
5
.clang-format
Normal file
@@ -0,0 +1,5 @@
|
||||
# Complete list of style options can be found at:
|
||||
# http://clang.llvm.org/docs/ClangFormatStyleOptions.html
|
||||
---
|
||||
BasedOnStyle: Google
|
||||
...
|
||||
34
.gitignore
vendored
Normal file
34
.gitignore
vendored
Normal file
@@ -0,0 +1,34 @@
|
||||
TARGETS
|
||||
build_config.mk
|
||||
|
||||
*.a
|
||||
*.arc
|
||||
*.d
|
||||
*.dylib*
|
||||
*.gcda
|
||||
*.gcno
|
||||
*.o
|
||||
*.so
|
||||
*.so.*
|
||||
*_test
|
||||
*_bench
|
||||
*_stress
|
||||
*.out
|
||||
*.class
|
||||
*.jar
|
||||
*.*jnilib*
|
||||
*.d-e
|
||||
*.o-*
|
||||
*.swp
|
||||
|
||||
ldb
|
||||
manifest_dump
|
||||
sst_dump
|
||||
util/build_version.cc
|
||||
build_tools/VALGRIND_LOGS/
|
||||
coverage/COVERAGE_REPORT
|
||||
.gdbhistory
|
||||
.phutil_module_cache
|
||||
tags
|
||||
java/*.log
|
||||
java/include/org_rocksdb_*.h
|
||||
20
.travis.yml
Normal file
20
.travis.yml
Normal file
@@ -0,0 +1,20 @@
|
||||
language: cpp
|
||||
compiler: gcc
|
||||
before_install:
|
||||
# As of this writing (10 May 2014) the Travis build environment is Ubuntu 12.04,
|
||||
# which needs the following ugly dependency incantations to build RocksDB:
|
||||
- sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test
|
||||
- sudo apt-get update -qq
|
||||
- sudo apt-get install -y -qq gcc-4.8 g++-4.8 zlib1g-dev libbz2-dev libsnappy-dev libjemalloc-dev
|
||||
- sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-4.8 50
|
||||
- sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-4.8 50
|
||||
- wget https://gflags.googlecode.com/files/libgflags0_2.0-1_amd64.deb
|
||||
- sudo dpkg -i libgflags0_2.0-1_amd64.deb
|
||||
- wget https://gflags.googlecode.com/files/libgflags-dev_2.0-1_amd64.deb
|
||||
- sudo dpkg -i libgflags-dev_2.0-1_amd64.deb
|
||||
# Lousy hack to disable use and testing of fallocate, which doesn't behave quite
|
||||
# as EnvPosixTest::AllocateTest expects within the Travis OpenVZ environment.
|
||||
- sed -i "s/fallocate(/HACK_NO_fallocate(/" build_tools/build_detect_platform
|
||||
script: make check -j8
|
||||
notifications:
|
||||
email: false
|
||||
20
CONTRIBUTING.md
Normal file
20
CONTRIBUTING.md
Normal file
@@ -0,0 +1,20 @@
|
||||
# Contributing to RocksDB
|
||||
|
||||
## Contributor License Agreement ("CLA")
|
||||
|
||||
In order to accept your pull request, we need you to submit a CLA. You
|
||||
only need to do this once, so if you've done this for another Facebook
|
||||
open source project, you're good to go. If you are submitting a pull
|
||||
request for the first time, just let us know that you have completed
|
||||
the CLA and we can cross-check with your GitHub username.
|
||||
|
||||
Complete your CLA here: <https://code.facebook.com/cla>
|
||||
|
||||
If you don't have a Facebook account, we can send you a PDF that you can
|
||||
sign offline. Send us an e-mail or create a new github issue to
|
||||
request the CLA in PDF format.
|
||||
|
||||
## License
|
||||
|
||||
By contributing to RocksDB, you agree that your contributions will be
|
||||
licensed under the [BSD License](LICENSE).
|
||||
89
HISTORY.md
Normal file
89
HISTORY.md
Normal file
@@ -0,0 +1,89 @@
|
||||
# Rocksdb Change Log
|
||||
|
||||
## 3.1.0 (05/21/2014)
|
||||
|
||||
### Public API changes
|
||||
* Replaced ColumnFamilyOptions::table_properties_collectors with ColumnFamilyOptions::table_properties_collector_factories
|
||||
|
||||
### New Features
|
||||
* Hash index for block-based table will be materialized and reconstructed more efficiently. Previously hash index is constructed by scanning the whole table during every table open.
|
||||
* FIFO compaction style
|
||||
|
||||
## 3.0.0 (05/05/2014)
|
||||
|
||||
### Public API changes
|
||||
* Added _LEVEL to all InfoLogLevel enums
|
||||
* Deprecated ReadOptions.prefix and ReadOptions.prefix_seek. Seek() defaults to prefix-based seek when Options.prefix_extractor is supplied. More detail is documented in https://github.com/facebook/rocksdb/wiki/Prefix-Seek-API-Changes
|
||||
* MemTableRepFactory::CreateMemTableRep() takes info logger as an extra parameter.
|
||||
|
||||
### New Features
|
||||
* Column family support
|
||||
* Added an option to use different checksum functions in BlockBasedTableOptions
|
||||
* Added ApplyToAllCacheEntries() function to Cache
|
||||
|
||||
## 2.8.0 (04/04/2014)
|
||||
|
||||
* Removed arena.h from public header files.
|
||||
* By default, checksums are verified on every read from database
|
||||
* Change default value of several options, including: paranoid_checks=true, max_open_files=5000, level0_slowdown_writes_trigger=20, level0_stop_writes_trigger=24, disable_seek_compaction=true, max_background_flushes=1 and allow_mmap_writes=false
|
||||
* Added is_manual_compaction to CompactionFilter::Context
|
||||
* Added "virtual void WaitForJoin()" in class Env. Default operation is no-op.
|
||||
* Removed BackupEngine::DeleteBackupsNewerThan() function
|
||||
* Added new option -- verify_checksums_in_compaction
|
||||
* Changed Options.prefix_extractor from raw pointer to shared_ptr (take ownership)
|
||||
Changed HashSkipListRepFactory and HashLinkListRepFactory constructor to not take SliceTransform object (use Options.prefix_extractor implicitly)
|
||||
* Added Env::GetThreadPoolQueueLen(), which returns the waiting queue length of thread pools
|
||||
* Added a command "checkconsistency" in ldb tool, which checks
|
||||
if file system state matches DB state (file existence and file sizes)
|
||||
* Separate options related to block based table to a new struct BlockBasedTableOptions.
|
||||
* WriteBatch has a new function Count() to return total size in the batch, and Data() now returns a reference instead of a copy
|
||||
* Add more counters to perf context.
|
||||
* Supports several more DB properties: compaction-pending, background-errors and cur-size-active-mem-table.
|
||||
|
||||
### New Features
|
||||
* If we find one truncated record at the end of the MANIFEST or WAL files,
|
||||
we will ignore it. We assume that writers of these records were interrupted
|
||||
and that we can safely ignore it.
|
||||
* A new SST format "PlainTable" is added, which is optimized for memory-only workloads. It can be created through NewPlainTableFactory() or NewTotalOrderPlainTableFactory().
|
||||
* A new mem table implementation hash linked list optimizing for the case that there are only few keys for each prefix, which can be created through NewHashLinkListRepFactory().
|
||||
* Merge operator supports a new function PartialMergeMulti() to allow users to do partial merges against multiple operands.
|
||||
* Now compaction filter has a V2 interface. It buffers the kv-pairs sharing the same key prefix, process them in batches, and return the batched results back to DB. The new interface uses a new structure CompactionFilterContext for the same purpose as CompactionFilter::Context in V1.
|
||||
* Geo-spatial support for locations and radial-search.
|
||||
|
||||
## 2.7.0 (01/28/2014)
|
||||
|
||||
### Public API changes
|
||||
|
||||
* Renamed `StackableDB::GetRawDB()` to `StackableDB::GetBaseDB()`.
|
||||
* Renamed `WriteBatch::Data()` `const std::string& Data() const`.
|
||||
* Renamed class `TableStats` to `TableProperties`.
|
||||
* Deleted class `PrefixHashRepFactory`. Please use `NewHashSkipListRepFactory()` instead.
|
||||
* Supported multi-threaded `EnableFileDeletions()` and `DisableFileDeletions()`.
|
||||
* Added `DB::GetOptions()`.
|
||||
* Added `DB::GetDbIdentity()`.
|
||||
|
||||
### New Features
|
||||
|
||||
* Added [BackupableDB](https://github.com/facebook/rocksdb/wiki/How-to-backup-RocksDB%3F)
|
||||
* Implemented [TailingIterator](https://github.com/facebook/rocksdb/wiki/Tailing-Iterator), a special type of iterator that
|
||||
doesn't create a snapshot (can be used to read newly inserted data)
|
||||
and is optimized for doing sequential reads.
|
||||
* Added property block for table, which allows (1) a table to store
|
||||
its metadata and (2) end user to collect and store properties they
|
||||
are interested in.
|
||||
* Enabled caching index and filter block in block cache (turned off by default).
|
||||
* Supported error report when doing manual compaction.
|
||||
* Supported additional Linux platform flavors and Mac OS.
|
||||
* Put with `SliceParts` - Variant of `Put()` that gathers output like `writev(2)`
|
||||
* Bug fixes and code refactor for compatibility with upcoming Column
|
||||
Family feature.
|
||||
|
||||
### Performance Improvements
|
||||
|
||||
* Huge benchmark performance improvements by multiple efforts. For example, increase in readonly QPS from about 530k in 2.6 release to 1.1 million in 2.7 [1]
|
||||
* Speeding up a way RocksDB deleted obsolete files - no longer listing the whole directory under a lock -- decrease in p99
|
||||
* Use raw pointer instead of shared pointer for statistics: [5b825d](https://github.com/facebook/rocksdb/commit/5b825d6964e26ec3b4bb6faa708ebb1787f1d7bd) -- huge increase in performance -- shared pointers are slow
|
||||
* Optimized locking for `Get()` -- [1fdb3f](https://github.com/facebook/rocksdb/commit/1fdb3f7dc60e96394e3e5b69a46ede5d67fb976c) -- 1.5x QPS increase for some workloads
|
||||
* Cache speedup - [e8d40c3](https://github.com/facebook/rocksdb/commit/e8d40c31b3cca0c3e1ae9abe9b9003b1288026a9)
|
||||
* Implemented autovector, which allocates first N elements on stack. Most of vectors in RocksDB are small. Also, we never want to allocate heap objects while holding a mutex. -- [c01676e4](https://github.com/facebook/rocksdb/commit/c01676e46d3be08c3c140361ef1f5884f47d3b3c)
|
||||
* Lots of efforts to move malloc, memcpy and IO outside of locks
|
||||
84
INSTALL.md
Normal file
84
INSTALL.md
Normal file
@@ -0,0 +1,84 @@
|
||||
## Compilation
|
||||
|
||||
RocksDB's library should be able to compile without any dependency installed,
|
||||
although we recommend installing some compression libraries (see below).
|
||||
We do depend on newer gcc with C++11 support.
|
||||
|
||||
There are few options when compiling RocksDB:
|
||||
|
||||
* [recommended] `make static_lib` will compile librocksdb.a, RocksDB static library.
|
||||
|
||||
* `make shared_lib` will compile librocksdb.so, RocksDB shared library.
|
||||
|
||||
* `make check` will compile and run all the unit tests
|
||||
|
||||
* `make all` will compile our static library, and all our tools and unit tests. Our tools
|
||||
depend on gflags. You will need to have gflags installed to run `make all`.
|
||||
|
||||
## Dependencies
|
||||
|
||||
* You can link RocksDB with following compression libraries:
|
||||
- [zlib](http://www.zlib.net/) - a library for data compression.
|
||||
- [bzip2](http://www.bzip.org/) - a library for data compression.
|
||||
- [snappy](https://code.google.com/p/snappy/) - a library for fast
|
||||
data compression.
|
||||
|
||||
* All our tools depend on:
|
||||
- [gflags](https://code.google.com/p/gflags/) - a library that handles
|
||||
command line flags processing. You can compile rocksdb library even
|
||||
if you don't have gflags installed.
|
||||
|
||||
## Supported platforms
|
||||
|
||||
* **Linux - Ubuntu**
|
||||
* Upgrade your gcc to version at least 4.7 to get C++11 support.
|
||||
* Install gflags. First, try: `sudo apt-get install libgflags-dev`
|
||||
If this doesn't work and you're using Ubuntu, here's a nice tutorial:
|
||||
(http://askubuntu.com/questions/312173/installing-gflags-12-04)
|
||||
* Install snappy. This is usually as easy as:
|
||||
`sudo apt-get install libsnappy-dev`.
|
||||
* Install zlib. Try: `sudo apt-get install zlib1g-dev`.
|
||||
* Install bzip2: `sudo apt-get install libbz2-dev`.
|
||||
* **Linux - CentOS**
|
||||
* Upgrade your gcc to version at least 4.7 to get C++11 support:
|
||||
`yum install gcc47-c++`
|
||||
* Install gflags:
|
||||
|
||||
wget https://gflags.googlecode.com/files/gflags-2.0-no-svn-files.tar.gz
|
||||
tar -xzvf gflags-2.0-no-svn-files.tar.gz
|
||||
cd gflags-2.0
|
||||
./configure && make && sudo make install
|
||||
|
||||
* Install snappy:
|
||||
|
||||
wget https://snappy.googlecode.com/files/snappy-1.1.1.tar.gz
|
||||
tar -xzvf snappy-1.1.1.tar.gz
|
||||
cd snappy-1.1.1
|
||||
./configure && make && sudo make install
|
||||
|
||||
* Install zlib:
|
||||
|
||||
sudo yum install zlib
|
||||
sudo yum install zlib-devel
|
||||
|
||||
* Install bzip2:
|
||||
|
||||
sudo yum install bzip2
|
||||
sudo yum install bzip2-devel
|
||||
|
||||
* **OS X**:
|
||||
* Install latest C++ compiler that supports C++ 11:
|
||||
* Update XCode: run `xcode-select --install` (or install it from XCode App's settting).
|
||||
* Install via [homebrew](http://brew.sh/).
|
||||
* If you're first time developer in MacOS, you still need to run: `xcode-select --install` in your command line.
|
||||
* run `brew tap homebrew/dupes; brew install gcc47 --use-llvm` to install gcc 4.7 (or higher).
|
||||
* Install zlib, bzip2 and snappy libraries for compression.
|
||||
* Install gflags. We have included a script
|
||||
`build_tools/mac-install-gflags.sh`, which should automatically install it.
|
||||
If you installed gflags by other means (for example, `brew install gflags`),
|
||||
please set `LIBRARY_PATH` and `CPATH` accordingly.
|
||||
* Please note that some of the optimizations/features are disabled in OSX.
|
||||
We did not run any production workloads on it.
|
||||
|
||||
* **iOS**:
|
||||
* Run: `TARGET_OS=IOS make static_lib`
|
||||
523
Makefile
Normal file
523
Makefile
Normal file
@@ -0,0 +1,523 @@
|
||||
# Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
# Use of this source code is governed by a BSD-style license that can be
|
||||
# found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
# Inherit some settings from environment variables, if available
|
||||
INSTALL_PATH ?= $(CURDIR)
|
||||
|
||||
#-----------------------------------------------
|
||||
|
||||
ifneq ($(MAKECMDGOALS),dbg)
|
||||
OPT += -O2 -fno-omit-frame-pointer -momit-leaf-frame-pointer
|
||||
else
|
||||
# intentionally left blank
|
||||
endif
|
||||
|
||||
ifeq ($(MAKECMDGOALS),shared_lib)
|
||||
OPT += -DNDEBUG
|
||||
endif
|
||||
|
||||
ifeq ($(MAKECMDGOALS),static_lib)
|
||||
OPT += -DNDEBUG
|
||||
endif
|
||||
|
||||
#-----------------------------------------------
|
||||
|
||||
# detect what platform we're building on
|
||||
$(shell (export ROCKSDB_ROOT="$(CURDIR)"; "$(CURDIR)/build_tools/build_detect_platform" "$(CURDIR)/build_config.mk"))
|
||||
# this file is generated by the previous line to set build flags and sources
|
||||
include build_config.mk
|
||||
|
||||
ifneq ($(PLATFORM), IOS)
|
||||
CFLAGS += -g
|
||||
CXXFLAGS += -g
|
||||
else
|
||||
# no debug info for IOS, that will make our library big
|
||||
OPT += -DNDEBUG
|
||||
endif
|
||||
|
||||
# ASAN doesn't work well with jemalloc. If we're compiling with ASAN, we should use regular malloc.
|
||||
ifdef COMPILE_WITH_ASAN
|
||||
# ASAN compile flags
|
||||
EXEC_LDFLAGS += -fsanitize=address
|
||||
PLATFORM_CCFLAGS += -fsanitize=address
|
||||
PLATFORM_CXXFLAGS += -fsanitize=address
|
||||
else
|
||||
# if we're not compiling with ASAN, use jemalloc
|
||||
EXEC_LDFLAGS := $(JEMALLOC_LIB) $(EXEC_LDFLAGS)
|
||||
PLATFORM_CXXFLAGS += $(JEMALLOC_INCLUDE) -DHAVE_JEMALLOC
|
||||
PLATFORM_CCFLAGS += $(JEMALLOC_INCLUDE) -DHAVE_JEMALLOC
|
||||
endif
|
||||
|
||||
WARNING_FLAGS = -Wall -Werror
|
||||
CFLAGS += $(WARNING_FLAGS) -I. -I./include $(PLATFORM_CCFLAGS) $(OPT)
|
||||
CXXFLAGS += $(WARNING_FLAGS) -I. -I./include $(PLATFORM_CXXFLAGS) $(OPT) -Woverloaded-virtual
|
||||
|
||||
LDFLAGS += $(PLATFORM_LDFLAGS)
|
||||
|
||||
LIBOBJECTS = $(SOURCES:.cc=.o)
|
||||
LIBOBJECTS += $(SOURCESCPP:.cpp=.o)
|
||||
MEMENVOBJECTS = $(MEMENV_SOURCES:.cc=.o)
|
||||
|
||||
TESTUTIL = ./util/testutil.o
|
||||
TESTHARNESS = ./util/testharness.o $(TESTUTIL)
|
||||
BENCHHARNESS = ./util/benchharness.o
|
||||
VALGRIND_ERROR = 2
|
||||
VALGRIND_DIR = build_tools/VALGRIND_LOGS
|
||||
VALGRIND_VER := $(join $(VALGRIND_VER),valgrind)
|
||||
VALGRIND_OPTS = --error-exitcode=$(VALGRIND_ERROR) --leak-check=full
|
||||
|
||||
TESTS = \
|
||||
db_test \
|
||||
block_hash_index_test \
|
||||
autovector_test \
|
||||
column_family_test \
|
||||
table_properties_collector_test \
|
||||
arena_test \
|
||||
auto_roll_logger_test \
|
||||
benchharness_test \
|
||||
block_test \
|
||||
bloom_test \
|
||||
dynamic_bloom_test \
|
||||
c_test \
|
||||
cache_test \
|
||||
coding_test \
|
||||
corruption_test \
|
||||
crc32c_test \
|
||||
dbformat_test \
|
||||
env_test \
|
||||
blob_store_test \
|
||||
filelock_test \
|
||||
filename_test \
|
||||
filter_block_test \
|
||||
histogram_test \
|
||||
log_test \
|
||||
manual_compaction_test \
|
||||
memenv_test \
|
||||
merge_test \
|
||||
redis_test \
|
||||
reduce_levels_test \
|
||||
plain_table_db_test \
|
||||
prefix_test \
|
||||
simple_table_db_test \
|
||||
skiplist_test \
|
||||
stringappend_test \
|
||||
ttl_test \
|
||||
backupable_db_test \
|
||||
version_edit_test \
|
||||
version_set_test \
|
||||
file_indexer_test \
|
||||
write_batch_test\
|
||||
deletefile_test \
|
||||
table_test \
|
||||
thread_local_test \
|
||||
geodb_test
|
||||
|
||||
TOOLS = \
|
||||
sst_dump \
|
||||
db_sanity_test \
|
||||
db_stress \
|
||||
ldb \
|
||||
db_repl_stress \
|
||||
blob_store_bench
|
||||
|
||||
PROGRAMS = db_bench signal_test table_reader_bench log_and_apply_bench $(TOOLS)
|
||||
|
||||
# The library name is configurable since we are maintaining libraries of both
|
||||
# debug/release mode.
|
||||
ifeq ($(LIBNAME),)
|
||||
LIBNAME=librocksdb
|
||||
endif
|
||||
LIBRARY = ${LIBNAME}.a
|
||||
MEMENVLIBRARY = libmemenv.a
|
||||
|
||||
default: all
|
||||
|
||||
#-----------------------------------------------
|
||||
# Create platform independent shared libraries.
|
||||
#-----------------------------------------------
|
||||
ifneq ($(PLATFORM_SHARED_EXT),)
|
||||
|
||||
ifneq ($(PLATFORM_SHARED_VERSIONED),true)
|
||||
SHARED1 = ${LIBNAME}.$(PLATFORM_SHARED_EXT)
|
||||
SHARED2 = $(SHARED1)
|
||||
SHARED3 = $(SHARED1)
|
||||
SHARED = $(SHARED1)
|
||||
else
|
||||
# Update db.h if you change these.
|
||||
SHARED_MAJOR = 3
|
||||
SHARED_MINOR = 2
|
||||
SHARED1 = ${LIBNAME}.$(PLATFORM_SHARED_EXT)
|
||||
SHARED2 = $(SHARED1).$(SHARED_MAJOR)
|
||||
SHARED3 = $(SHARED1).$(SHARED_MAJOR).$(SHARED_MINOR)
|
||||
SHARED = $(SHARED1) $(SHARED2) $(SHARED3)
|
||||
$(SHARED1): $(SHARED3)
|
||||
ln -fs $(SHARED3) $(SHARED1)
|
||||
$(SHARED2): $(SHARED3)
|
||||
ln -fs $(SHARED3) $(SHARED2)
|
||||
endif
|
||||
|
||||
$(SHARED3):
|
||||
$(CXX) $(PLATFORM_SHARED_LDFLAGS)$(SHARED2) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) $(SOURCES) $(LDFLAGS) -o $@
|
||||
|
||||
endif # PLATFORM_SHARED_EXT
|
||||
|
||||
.PHONY: blackbox_crash_test check clean coverage crash_test ldb_tests \
|
||||
release tags valgrind_check whitebox_crash_test format static_lib shared_lib all \
|
||||
dbg
|
||||
|
||||
all: $(LIBRARY) $(PROGRAMS) $(TESTS)
|
||||
|
||||
static_lib: $(LIBRARY)
|
||||
|
||||
shared_lib: $(SHARED)
|
||||
|
||||
dbg: $(LIBRARY) $(PROGRAMS) $(TESTS)
|
||||
|
||||
# creates static library and programs
|
||||
release:
|
||||
$(MAKE) clean
|
||||
OPT="-DNDEBUG -O2" $(MAKE) static_lib $(PROGRAMS) -j32
|
||||
|
||||
coverage:
|
||||
$(MAKE) clean
|
||||
COVERAGEFLAGS="-fprofile-arcs -ftest-coverage" LDFLAGS+="-lgcov" $(MAKE) all check -j32
|
||||
(cd coverage; ./coverage_test.sh)
|
||||
# Delete intermediate files
|
||||
find . -type f -regex ".*\.\(\(gcda\)\|\(gcno\)\)" -exec rm {} \;
|
||||
|
||||
check: $(TESTS) ldb
|
||||
for t in $(TESTS); do echo "***** Running $$t"; ./$$t || exit 1; done
|
||||
python tools/ldb_test.py
|
||||
|
||||
ldb_tests: ldb
|
||||
python tools/ldb_test.py
|
||||
|
||||
crash_test: whitebox_crash_test blackbox_crash_test
|
||||
|
||||
blackbox_crash_test: db_stress
|
||||
python -u tools/db_crashtest.py
|
||||
|
||||
whitebox_crash_test: db_stress
|
||||
python -u tools/db_crashtest2.py
|
||||
|
||||
asan_check:
|
||||
$(MAKE) clean
|
||||
COMPILE_WITH_ASAN=1 $(MAKE) check -j32
|
||||
$(MAKE) clean
|
||||
|
||||
asan_crash_test:
|
||||
$(MAKE) clean
|
||||
COMPILE_WITH_ASAN=1 $(MAKE) crash_test
|
||||
$(MAKE) clean
|
||||
|
||||
valgrind_check: all $(PROGRAMS) $(TESTS)
|
||||
mkdir -p $(VALGRIND_DIR)
|
||||
echo TESTS THAT HAVE VALGRIND ERRORS > $(VALGRIND_DIR)/valgrind_failed_tests; \
|
||||
echo TIMES in seconds TAKEN BY TESTS ON VALGRIND > $(VALGRIND_DIR)/valgrind_tests_times; \
|
||||
for t in $(filter-out skiplist_test,$(TESTS)); do \
|
||||
stime=`date '+%s'`; \
|
||||
$(VALGRIND_VER) $(VALGRIND_OPTS) ./$$t; \
|
||||
if [ $$? -eq $(VALGRIND_ERROR) ] ; then \
|
||||
echo $$t >> $(VALGRIND_DIR)/valgrind_failed_tests; \
|
||||
fi; \
|
||||
etime=`date '+%s'`; \
|
||||
echo $$t $$((etime - stime)) >> $(VALGRIND_DIR)/valgrind_tests_times; \
|
||||
done
|
||||
|
||||
clean:
|
||||
-rm -f $(PROGRAMS) $(TESTS) $(LIBRARY) $(SHARED) $(MEMENVLIBRARY) build_config.mk
|
||||
-rm -rf ios-x86/* ios-arm/*
|
||||
-find . -name "*.[od]" -exec rm {} \;
|
||||
-find . -type f -regex ".*\.\(\(gcda\)\|\(gcno\)\)" -exec rm {} \;
|
||||
tags:
|
||||
ctags * -R
|
||||
cscope -b `find . -name '*.cc'` `find . -name '*.h'`
|
||||
|
||||
format:
|
||||
build_tools/format-diff.sh
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Unit tests and tools
|
||||
# ---------------------------------------------------------------------------
|
||||
$(LIBRARY): $(LIBOBJECTS)
|
||||
rm -f $@
|
||||
$(AR) -rs $@ $(LIBOBJECTS)
|
||||
|
||||
db_bench: db/db_bench.o $(LIBOBJECTS) $(TESTUTIL)
|
||||
$(CXX) db/db_bench.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
|
||||
|
||||
block_hash_index_test: table/block_hash_index_test.o $(LIBOBJECTS) $(TESTHARNESS)
|
||||
$(CXX) table/block_hash_index_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
|
||||
|
||||
db_stress: tools/db_stress.o $(LIBOBJECTS) $(TESTUTIL)
|
||||
$(CXX) tools/db_stress.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
|
||||
|
||||
db_sanity_test: tools/db_sanity_test.o $(LIBOBJECTS) $(TESTUTIL)
|
||||
$(CXX) tools/db_sanity_test.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
|
||||
|
||||
db_repl_stress: tools/db_repl_stress.o $(LIBOBJECTS) $(TESTUTIL)
|
||||
$(CXX) tools/db_repl_stress.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
|
||||
|
||||
blob_store_bench: tools/blob_store_bench.o $(LIBOBJECTS) $(TESTUTIL)
|
||||
$(CXX) tools/blob_store_bench.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
|
||||
|
||||
signal_test: util/signal_test.o $(LIBOBJECTS)
|
||||
$(CXX) util/signal_test.o $(LIBOBJECTS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
|
||||
|
||||
arena_test: util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS)
|
||||
$(CXX) util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
|
||||
|
||||
autovector_test: util/autovector_test.o $(LIBOBJECTS) $(TESTHARNESS)
|
||||
$(CXX) util/autovector_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
|
||||
|
||||
column_family_test: db/column_family_test.o $(LIBOBJECTS) $(TESTHARNESS)
|
||||
$(CXX) db/column_family_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
|
||||
|
||||
table_properties_collector_test: db/table_properties_collector_test.o $(LIBOBJECTS) $(TESTHARNESS)
|
||||
$(CXX) db/table_properties_collector_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
|
||||
|
||||
bloom_test: util/bloom_test.o $(LIBOBJECTS) $(TESTHARNESS)
|
||||
$(CXX) util/bloom_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
|
||||
|
||||
dynamic_bloom_test: util/dynamic_bloom_test.o $(LIBOBJECTS) $(TESTHARNESS)
|
||||
$(CXX) util/dynamic_bloom_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
|
||||
|
||||
c_test: db/c_test.o $(LIBOBJECTS) $(TESTHARNESS)
|
||||
$(CXX) db/c_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
|
||||
|
||||
cache_test: util/cache_test.o $(LIBOBJECTS) $(TESTHARNESS)
|
||||
$(CXX) util/cache_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
|
||||
|
||||
coding_test: util/coding_test.o $(LIBOBJECTS) $(TESTHARNESS)
|
||||
$(CXX) util/coding_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
|
||||
|
||||
blob_store_test: util/blob_store_test.o $(LIBOBJECTS) $(TESTHARNESS) $(TESTUTIL)
|
||||
$(CXX) util/blob_store_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o$@ $(LDFLAGS) $(COVERAGEFLAGS)
|
||||
|
||||
stringappend_test: utilities/merge_operators/string_append/stringappend_test.o $(LIBOBJECTS) $(TESTHARNESS)
|
||||
$(CXX) utilities/merge_operators/string_append/stringappend_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
|
||||
|
||||
redis_test: utilities/redis/redis_lists_test.o $(LIBOBJECTS) $(TESTHARNESS)
|
||||
$(CXX) utilities/redis/redis_lists_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
|
||||
|
||||
benchharness_test: util/benchharness_test.o $(LIBOBJECTS) $(TESTHARNESS) $(BENCHHARNESS)
|
||||
$(CXX) util/benchharness_test.o $(LIBOBJECTS) $(TESTHARNESS) $(BENCHHARNESS) $(EXEC_LDFLAGS) -o$@ $(LDFLAGS) $(COVERAGEFLAGS)
|
||||
|
||||
histogram_test: util/histogram_test.o $(LIBOBJECTS) $(TESTHARNESS)
|
||||
$(CXX) util/histogram_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o$@ $(LDFLAGS) $(COVERAGEFLAGS)
|
||||
|
||||
thread_local_test: util/thread_local_test.o $(LIBOBJECTS) $(TESTHARNESS)
|
||||
$(CXX) util/thread_local_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
|
||||
|
||||
corruption_test: db/corruption_test.o $(LIBOBJECTS) $(TESTHARNESS)
|
||||
$(CXX) db/corruption_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
|
||||
|
||||
crc32c_test: util/crc32c_test.o $(LIBOBJECTS) $(TESTHARNESS)
|
||||
$(CXX) util/crc32c_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
|
||||
|
||||
db_test: db/db_test.o $(LIBOBJECTS) $(TESTHARNESS)
|
||||
$(CXX) db/db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
|
||||
|
||||
log_write_bench: util/log_write_bench.o $(LIBOBJECTS) $(TESTHARNESS)
|
||||
$(CXX) util/log_write_bench.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) -pg
|
||||
|
||||
plain_table_db_test: db/plain_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS)
|
||||
$(CXX) db/plain_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
|
||||
|
||||
simple_table_db_test: db/simple_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS)
|
||||
$(CXX) db/simple_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
|
||||
|
||||
table_reader_bench: table/table_reader_bench.o $(LIBOBJECTS) $(TESTHARNESS)
|
||||
$(CXX) table/table_reader_bench.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) -pg
|
||||
|
||||
log_and_apply_bench: db/log_and_apply_bench.o $(LIBOBJECTS) $(TESTHARNESS) $(BENCHHARNESS)
|
||||
$(CXX) db/log_and_apply_bench.o $(LIBOBJECTS) $(TESTHARNESS) $(BENCHHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) -pg
|
||||
|
||||
perf_context_test: db/perf_context_test.o $(LIBOBJECTS) $(TESTHARNESS)
|
||||
$(CXX) db/perf_context_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS)
|
||||
|
||||
prefix_test: db/prefix_test.o $(LIBOBJECTS) $(TESTHARNESS)
|
||||
$(CXX) db/prefix_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS)
|
||||
|
||||
backupable_db_test: utilities/backupable/backupable_db_test.o $(LIBOBJECTS) $(TESTHARNESS)
|
||||
$(CXX) utilities/backupable/backupable_db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
|
||||
|
||||
ttl_test: utilities/ttl/ttl_test.o $(LIBOBJECTS) $(TESTHARNESS)
|
||||
$(CXX) utilities/ttl/ttl_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
|
||||
|
||||
dbformat_test: db/dbformat_test.o $(LIBOBJECTS) $(TESTHARNESS)
|
||||
$(CXX) db/dbformat_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
|
||||
|
||||
env_test: util/env_test.o $(LIBOBJECTS) $(TESTHARNESS)
|
||||
$(CXX) util/env_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
|
||||
|
||||
filename_test: db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS)
|
||||
$(CXX) db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
|
||||
|
||||
filter_block_test: table/filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS)
|
||||
$(CXX) table/filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
|
||||
|
||||
log_test: db/log_test.o $(LIBOBJECTS) $(TESTHARNESS)
|
||||
$(CXX) db/log_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
|
||||
|
||||
table_test: table/table_test.o $(LIBOBJECTS) $(TESTHARNESS)
|
||||
$(CXX) table/table_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
|
||||
|
||||
block_test: table/block_test.o $(LIBOBJECTS) $(TESTHARNESS)
|
||||
$(CXX) table/block_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
|
||||
|
||||
skiplist_test: db/skiplist_test.o $(LIBOBJECTS) $(TESTHARNESS)
|
||||
$(CXX) db/skiplist_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
|
||||
|
||||
version_edit_test: db/version_edit_test.o $(LIBOBJECTS) $(TESTHARNESS)
|
||||
$(CXX) db/version_edit_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
|
||||
|
||||
version_set_test: db/version_set_test.o $(LIBOBJECTS) $(TESTHARNESS)
|
||||
$(CXX) db/version_set_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
|
||||
|
||||
file_indexer_test : db/file_indexer_test.o $(LIBOBJECTS) $(TESTHARNESS)
|
||||
$(CXX) db/file_indexer_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
|
||||
|
||||
reduce_levels_test: tools/reduce_levels_test.o $(LIBOBJECTS) $(TESTHARNESS)
|
||||
$(CXX) tools/reduce_levels_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
|
||||
|
||||
write_batch_test: db/write_batch_test.o $(LIBOBJECTS) $(TESTHARNESS)
|
||||
$(CXX) db/write_batch_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
|
||||
|
||||
merge_test: db/merge_test.o $(LIBOBJECTS) $(TESTHARNESS)
|
||||
$(CXX) db/merge_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
|
||||
|
||||
deletefile_test: db/deletefile_test.o $(LIBOBJECTS) $(TESTHARNESS)
|
||||
$(CXX) db/deletefile_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS)
|
||||
|
||||
geodb_test: utilities/geodb/geodb_test.o $(LIBOBJECTS) $(TESTHARNESS)
|
||||
$(CXX) utilities/geodb/geodb_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
|
||||
|
||||
$(MEMENVLIBRARY) : $(MEMENVOBJECTS)
|
||||
rm -f $@
|
||||
$(AR) -rs $@ $(MEMENVOBJECTS)
|
||||
|
||||
memenv_test : helpers/memenv/memenv_test.o $(MEMENVOBJECTS) $(LIBOBJECTS) $(TESTHARNESS)
|
||||
$(CXX) helpers/memenv/memenv_test.o $(MEMENVOBJECTS) $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
|
||||
|
||||
manual_compaction_test: util/manual_compaction_test.o $(LIBOBJECTS) $(TESTHARNESS)
|
||||
$(CXX) util/manual_compaction_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
|
||||
|
||||
rocksdb_shell: tools/shell/ShellContext.o tools/shell/ShellState.o tools/shell/LeveldbShell.o tools/shell/DBClientProxy.o tools/shell/ShellContext.h tools/shell/ShellState.h tools/shell/DBClientProxy.h $(LIBOBJECTS)
|
||||
$(CXX) tools/shell/ShellContext.o tools/shell/ShellState.o tools/shell/LeveldbShell.o tools/shell/DBClientProxy.o $(LIBOBJECTS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
|
||||
|
||||
DBClientProxy_test: tools/shell/test/DBClientProxyTest.o tools/shell/DBClientProxy.o $(LIBRARY)
|
||||
$(CXX) tools/shell/test/DBClientProxyTest.o tools/shell/DBClientProxy.o $(LIBRARY) $(EXEC_LDFLAGS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
|
||||
|
||||
filelock_test: util/filelock_test.o $(LIBOBJECTS) $(TESTHARNESS)
|
||||
$(CXX) util/filelock_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
|
||||
|
||||
auto_roll_logger_test: util/auto_roll_logger_test.o $(LIBOBJECTS) $(TESTHARNESS)
|
||||
$(CXX) util/auto_roll_logger_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
|
||||
|
||||
sst_dump: tools/sst_dump.o $(LIBOBJECTS)
|
||||
$(CXX) tools/sst_dump.o $(LIBOBJECTS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
|
||||
|
||||
ldb: tools/ldb.o $(LIBOBJECTS)
|
||||
$(CXX) tools/ldb.o $(LIBOBJECTS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Jni stuff
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
JNI_NATIVE_SOURCES = ./java/rocksjni/*.cc
|
||||
JAVA_INCLUDE = -I$(JAVA_HOME)/include/ -I$(JAVA_HOME)/include/linux
|
||||
ROCKSDBJNILIB = ./java/librocksdbjni.so
|
||||
|
||||
ifeq ($(PLATFORM), OS_MACOSX)
|
||||
ROCKSDBJNILIB = ./java/librocksdbjni.jnilib
|
||||
JAVA_INCLUDE = -I/System/Library/Frameworks/JavaVM.framework/Headers/
|
||||
endif
|
||||
|
||||
rocksdbjava: clean
|
||||
OPT="-fPIC -DNDEBUG -O2" $(MAKE) $(LIBRARY) -j32
|
||||
cd java;$(MAKE) java;
|
||||
rm -f $(ROCKSDBJNILIB)
|
||||
$(CXX) $(CXXFLAGS) -I./java/. $(JAVA_INCLUDE) -shared -fPIC -o $(ROCKSDBJNILIB) $(JNI_NATIVE_SOURCES) $(LIBOBJECTS) $(LDFLAGS) $(COVERAGEFLAGS)
|
||||
|
||||
jclean:
|
||||
cd java;$(MAKE) clean;
|
||||
rm -f $(ROCKSDBJNILIB)
|
||||
|
||||
jtest:
|
||||
cd java;$(MAKE) sample;$(MAKE) test;
|
||||
|
||||
jdb_bench:
|
||||
cd java;$(MAKE) db_bench;
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Platform-specific compilation
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
ifeq ($(PLATFORM), IOS)
|
||||
# For iOS, create universal object files to be used on both the simulator and
|
||||
# a device.
|
||||
PLATFORMSROOT=/Applications/Xcode.app/Contents/Developer/Platforms
|
||||
SIMULATORROOT=$(PLATFORMSROOT)/iPhoneSimulator.platform/Developer
|
||||
DEVICEROOT=$(PLATFORMSROOT)/iPhoneOS.platform/Developer
|
||||
IOSVERSION=$(shell defaults read $(PLATFORMSROOT)/iPhoneOS.platform/version CFBundleShortVersionString)
|
||||
|
||||
.cc.o:
|
||||
mkdir -p ios-x86/$(dir $@)
|
||||
$(CXX) $(CXXFLAGS) -isysroot $(SIMULATORROOT)/SDKs/iPhoneSimulator$(IOSVERSION).sdk -arch i686 -arch x86_64 -c $< -o ios-x86/$@
|
||||
mkdir -p ios-arm/$(dir $@)
|
||||
xcrun -sdk iphoneos $(CXX) $(CXXFLAGS) -isysroot $(DEVICEROOT)/SDKs/iPhoneOS$(IOSVERSION).sdk -arch armv6 -arch armv7 -arch armv7s -arch arm64 -c $< -o ios-arm/$@
|
||||
lipo ios-x86/$@ ios-arm/$@ -create -output $@
|
||||
|
||||
.c.o:
|
||||
mkdir -p ios-x86/$(dir $@)
|
||||
$(CC) $(CFLAGS) -isysroot $(SIMULATORROOT)/SDKs/iPhoneSimulator$(IOSVERSION).sdk -arch i686 -arch x86_64 -c $< -o ios-x86/$@
|
||||
mkdir -p ios-arm/$(dir $@)
|
||||
xcrun -sdk iphoneos $(CC) $(CFLAGS) -isysroot $(DEVICEROOT)/SDKs/iPhoneOS$(IOSVERSION).sdk -arch armv6 -arch armv7 -arch armv7s -arch arm64 -c $< -o ios-arm/$@
|
||||
lipo ios-x86/$@ ios-arm/$@ -create -output $@
|
||||
|
||||
else
|
||||
.cc.o:
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@ $(COVERAGEFLAGS)
|
||||
|
||||
.c.o:
|
||||
$(CC) $(CFLAGS) -c $< -o $@
|
||||
endif
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Source files dependencies detection
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Add proper dependency support so changing a .h file forces a .cc file to
|
||||
# rebuild.
|
||||
|
||||
# The .d file indicates .cc file's dependencies on .h files. We generate such
|
||||
# dependency by g++'s -MM option, whose output is a make dependency rule.
|
||||
# The sed command makes sure the "target" file in the generated .d file has
|
||||
# the correct path prefix.
|
||||
%.d: %.cc
|
||||
$(CXX) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) -MM $< -o $@
|
||||
ifeq ($(PLATFORM), OS_MACOSX)
|
||||
@sed -i '' -e 's,.*:,$*.o:,' $@
|
||||
else
|
||||
@sed -i -e 's,.*:,$*.o:,' $@
|
||||
endif
|
||||
|
||||
DEPFILES = $(filter-out util/build_version.d,$(SOURCES:.cc=.d))
|
||||
|
||||
depend: $(DEPFILES)
|
||||
|
||||
# if the make goal is either "clean" or "format", we shouldn't
|
||||
# try to import the *.d files.
|
||||
# TODO(kailiu) The unfamiliarity of Make's conditions leads to the ugly
|
||||
# working solution.
|
||||
ifneq ($(MAKECMDGOALS),clean)
|
||||
ifneq ($(MAKECMDGOALS),format)
|
||||
ifneq ($(MAKECMDGOALS),jclean)
|
||||
ifneq ($(MAKECMDGOALS),jtest)
|
||||
-include $(DEPFILES)
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
23
PATENTS
Normal file
23
PATENTS
Normal file
@@ -0,0 +1,23 @@
|
||||
Additional Grant of Patent Rights
|
||||
|
||||
“Software” means the rocksdb software distributed by Facebook, Inc.
|
||||
|
||||
Facebook hereby grants you a perpetual, worldwide, royalty-free,
|
||||
non-exclusive, irrevocable (subject to the termination provision below)
|
||||
license under any rights in any patent claims owned by Facebook, to make,
|
||||
have made, use, sell, offer to sell, import, and otherwise transfer the
|
||||
Software. For avoidance of doubt, no license is granted under Facebook’s
|
||||
rights in any patent claims that are infringed by (i) modifications to the
|
||||
Software made by you or a third party, or (ii) the Software in combination
|
||||
with any software or other technology provided by you or a third party.
|
||||
|
||||
The license granted hereunder will terminate, automatically and without
|
||||
notice, for anyone that makes any claim (including by filing any lawsuit,
|
||||
assertion or other action) alleging (a) direct, indirect, or contributory
|
||||
infringement or inducement to infringe any patent: (i) by Facebook or any
|
||||
of its subsidiaries or affiliates, whether or not such claim is related
|
||||
to the Software, (ii) by any party if such claim arises in whole or in
|
||||
part from any software, product or service of Facebook or any of its
|
||||
subsidiaries or affiliates, whether or not such claim is related to the
|
||||
Software, or (iii) by any party relating to the Software; or (b) that
|
||||
any right in any patent claim of Facebook is invalid or unenforceable.
|
||||
26
README.md
Normal file
26
README.md
Normal file
@@ -0,0 +1,26 @@
|
||||
## RocksDB: A Persistent Key-Value Store for Flash and RAM Storage
|
||||
|
||||
[](https://travis-ci.org/facebook/rocksdb)
|
||||
|
||||
RocksDB is developed and maintained by Facebook Database Engineering Team.
|
||||
It is built on on earlier work on LevelDB by Sanjay Ghemawat (sanjay@google.com)
|
||||
and Jeff Dean (jeff@google.com)
|
||||
|
||||
This code is a library that forms the core building block for a fast
|
||||
key value server, especially suited for storing data on flash drives.
|
||||
It has an Log-Structured-Merge-Database (LSM) design with flexible tradeoffs
|
||||
between Write-Amplification-Factor (WAF), Read-Amplification-Factor (RAF)
|
||||
and Space-Amplification-Factor (SAF). It has multi-threaded compactions,
|
||||
making it specially suitable for storing multiple terabytes of data in a
|
||||
single database.
|
||||
|
||||
Start with example usage here: https://github.com/facebook/rocksdb/tree/master/examples
|
||||
|
||||
See [doc/index.html](https://github.com/facebook/rocksdb/blob/master/doc/index.html) and
|
||||
[github wiki](https://github.com/facebook/rocksdb/wiki) for more explanation.
|
||||
|
||||
The public interface is in `include/`. Callers should not include or
|
||||
rely on the details of any other header files in this package. Those
|
||||
internal APIs may be changed without warning.
|
||||
|
||||
Design discussions are conducted in https://www.facebook.com/groups/rocksdb.dev/
|
||||
20
ROCKSDB_LITE.md
Normal file
20
ROCKSDB_LITE.md
Normal file
@@ -0,0 +1,20 @@
|
||||
# RocksDBLite
|
||||
|
||||
RocksDBLite is a project focused on mobile use cases, which don't need a lot of fancy things we've built for server workloads and they are very sensitive to binary size. For that reason, we added a compile flag ROCKSDB_LITE that comments out a lot of the nonessential code and keeps the binary lean.
|
||||
|
||||
Some examples of the features disabled by ROCKSDB_LITE:
|
||||
* compiled-in support for LDB tool
|
||||
* No backupable DB
|
||||
* No support for replication (which we provide in form of TrasactionalIterator)
|
||||
* No advanced monitoring tools
|
||||
* No special-purpose memtables that are highly optimized for specific use cases
|
||||
|
||||
When adding a new big feature to RocksDB, please add ROCKSDB_LITE compile guard if:
|
||||
* Nobody from mobile really needs your feature,
|
||||
* Your feature is adding a lot of weight to the binary.
|
||||
|
||||
Don't add ROCKSDB_LITE compile guard if:
|
||||
* It would introduce a lot of code complexity. Compile guards make code harder to read. It's a trade-off.
|
||||
* Your feature is not adding a lot of weight.
|
||||
|
||||
If unsure, ask. :)
|
||||
320
build_tools/build_detect_platform
Executable file
320
build_tools/build_detect_platform
Executable file
@@ -0,0 +1,320 @@
|
||||
#!/bin/sh
|
||||
#
|
||||
# Detects OS we're compiling on and outputs a file specified by the first
|
||||
# argument, which in turn gets read while processing Makefile.
|
||||
#
|
||||
# The output will set the following variables:
|
||||
# CC C Compiler path
|
||||
# CXX C++ Compiler path
|
||||
# PLATFORM_LDFLAGS Linker flags
|
||||
# PLATFORM_SHARED_EXT Extension for shared libraries
|
||||
# PLATFORM_SHARED_LDFLAGS Flags for building shared library
|
||||
# PLATFORM_SHARED_CFLAGS Flags for compiling objects for shared library
|
||||
# PLATFORM_CCFLAGS C compiler flags
|
||||
# PLATFORM_CXXFLAGS C++ compiler flags. Will contain:
|
||||
# PLATFORM_SHARED_VERSIONED Set to 'true' if platform supports versioned
|
||||
# shared libraries, empty otherwise.
|
||||
#
|
||||
# The PLATFORM_CCFLAGS and PLATFORM_CXXFLAGS might include the following:
|
||||
#
|
||||
# -DLEVELDB_PLATFORM_POSIX if cstdatomic is present
|
||||
# -DLEVELDB_PLATFORM_NOATOMIC if it is not
|
||||
# -DSNAPPY if the Snappy library is present
|
||||
# -DLZ4 if the LZ4 library is present
|
||||
#
|
||||
# Using gflags in rocksdb:
|
||||
# Our project depends on gflags, which requires users to take some extra steps
|
||||
# before they can compile the whole repository:
|
||||
# 1. Install gflags. You may download it from here:
|
||||
# https://code.google.com/p/gflags/
|
||||
# 2. Once install, add the include path/lib path for gflags to CPATH and
|
||||
# LIBRARY_PATH respectively. If installed with default mode, the
|
||||
# lib and include path will be /usr/local/lib and /usr/local/include
|
||||
# Mac user can do this by running build_tools/mac-install-gflags.sh
|
||||
|
||||
OUTPUT=$1
|
||||
if test -z "$OUTPUT"; then
|
||||
echo "usage: $0 <output-filename>" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# we depend on C++11
|
||||
PLATFORM_CXXFLAGS="-std=c++11"
|
||||
# we currently depend on POSIX platform
|
||||
COMMON_FLAGS="-DROCKSDB_PLATFORM_POSIX"
|
||||
|
||||
# Default to fbcode gcc on internal fb machines
|
||||
if [ -d /mnt/gvfs/third-party -a -z "$CXX" ]; then
|
||||
FBCODE_BUILD="true"
|
||||
if [ -z "$USE_CLANG" ]; then
|
||||
CENTOS_VERSION=`rpm -q --qf "%{VERSION}" \
|
||||
$(rpm -q --whatprovides redhat-release)`
|
||||
if [ "$CENTOS_VERSION" = "6" ]; then
|
||||
source "$PWD/build_tools/fbcode.gcc481.sh"
|
||||
else
|
||||
source "$PWD/build_tools/fbcode.gcc471.sh"
|
||||
fi
|
||||
else
|
||||
source "$PWD/build_tools/fbcode.clang31.sh"
|
||||
fi
|
||||
fi
|
||||
|
||||
# Delete existing output, if it exists
|
||||
rm -f "$OUTPUT"
|
||||
touch "$OUTPUT"
|
||||
|
||||
if test -z "$CC"; then
|
||||
CC=cc
|
||||
fi
|
||||
|
||||
if test -z "$CXX"; then
|
||||
CXX=g++
|
||||
fi
|
||||
|
||||
# Detect OS
|
||||
if test -z "$TARGET_OS"; then
|
||||
TARGET_OS=`uname -s`
|
||||
fi
|
||||
|
||||
COMMON_FLAGS="$COMMON_FLAGS ${CFLAGS}"
|
||||
CROSS_COMPILE=
|
||||
PLATFORM_CCFLAGS=
|
||||
PLATFORM_CXXFLAGS="$PLATFORM_CXXFLAGS ${CXXFLAGS}"
|
||||
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS"
|
||||
PLATFORM_SHARED_EXT="so"
|
||||
PLATFORM_SHARED_LDFLAGS="-shared -Wl,-soname -Wl,"
|
||||
PLATFORM_SHARED_CFLAGS="-fPIC"
|
||||
PLATFORM_SHARED_VERSIONED=false
|
||||
|
||||
# generic port files (working on all platform by #ifdef) go directly in /port
|
||||
GENERIC_PORT_FILES=`cd "$ROCKSDB_ROOT"; find port -name '*.cc' | tr "\n" " "`
|
||||
|
||||
# On GCC, we pick libc's memcmp over GCC's memcmp via -fno-builtin-memcmp
|
||||
case "$TARGET_OS" in
|
||||
Darwin)
|
||||
PLATFORM=OS_MACOSX
|
||||
COMMON_FLAGS="$COMMON_FLAGS -DOS_MACOSX"
|
||||
PLATFORM_SHARED_EXT=dylib
|
||||
PLATFORM_SHARED_LDFLAGS="-dynamiclib -install_name "
|
||||
# PORT_FILES=port/darwin/darwin_specific.cc
|
||||
;;
|
||||
IOS)
|
||||
PLATFORM=IOS
|
||||
COMMON_FLAGS="$COMMON_FLAGS -DOS_MACOSX -DIOS_CROSS_COMPILE -DROCKSDB_LITE"
|
||||
PLATFORM_SHARED_EXT=dylib
|
||||
PLATFORM_SHARED_LDFLAGS="-dynamiclib -install_name "
|
||||
CROSS_COMPILE=true
|
||||
;;
|
||||
Linux)
|
||||
PLATFORM=OS_LINUX
|
||||
COMMON_FLAGS="$COMMON_FLAGS -DOS_LINUX"
|
||||
if [ -z "$USE_CLANG" ]; then
|
||||
COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp"
|
||||
fi
|
||||
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread -lrt"
|
||||
# PORT_FILES=port/linux/linux_specific.cc
|
||||
;;
|
||||
SunOS)
|
||||
PLATFORM=OS_SOLARIS
|
||||
COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_SOLARIS"
|
||||
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread -lrt"
|
||||
# PORT_FILES=port/sunos/sunos_specific.cc
|
||||
;;
|
||||
FreeBSD)
|
||||
PLATFORM=OS_FREEBSD
|
||||
COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_FREEBSD"
|
||||
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread"
|
||||
# PORT_FILES=port/freebsd/freebsd_specific.cc
|
||||
;;
|
||||
NetBSD)
|
||||
PLATFORM=OS_NETBSD
|
||||
COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_NETBSD"
|
||||
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread -lgcc_s"
|
||||
# PORT_FILES=port/netbsd/netbsd_specific.cc
|
||||
;;
|
||||
OpenBSD)
|
||||
PLATFORM=OS_OPENBSD
|
||||
COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_OPENBSD"
|
||||
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -pthread"
|
||||
# PORT_FILES=port/openbsd/openbsd_specific.cc
|
||||
;;
|
||||
DragonFly)
|
||||
PLATFORM=OS_DRAGONFLYBSD
|
||||
COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_DRAGONFLYBSD"
|
||||
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread"
|
||||
# PORT_FILES=port/dragonfly/dragonfly_specific.cc
|
||||
;;
|
||||
OS_ANDROID_CROSSCOMPILE)
|
||||
PLATFORM=OS_ANDROID
|
||||
COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_ANDROID -DLEVELDB_PLATFORM_POSIX"
|
||||
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS " # All pthread features are in the Android C library
|
||||
# PORT_FILES=port/android/android.cc
|
||||
CROSS_COMPILE=true
|
||||
;;
|
||||
*)
|
||||
echo "Unknown platform!" >&2
|
||||
exit 1
|
||||
esac
|
||||
|
||||
if test -z "$DO_NOT_RUN_BUILD_DETECT_VERSION"; then
|
||||
"$PWD/build_tools/build_detect_version"
|
||||
fi
|
||||
|
||||
# We want to make a list of all cc files within util, db, table, and helpers
|
||||
# except for the test and benchmark files. By default, find will output a list
|
||||
# of all files matching either rule, so we need to append -print to make the
|
||||
# prune take effect.
|
||||
DIRS="util db table utilities"
|
||||
|
||||
set -f # temporarily disable globbing so that our patterns arent expanded
|
||||
PRUNE_TEST="-name *test*.cc -prune"
|
||||
PRUNE_BENCH="-name *bench*.cc -prune"
|
||||
PORTABLE_FILES=`cd "$ROCKSDB_ROOT"; find $DIRS $PRUNE_TEST -o $PRUNE_BENCH -o -name '*.cc' -print | sort | tr "\n" " "`
|
||||
PORTABLE_CPP=`cd "$ROCKSDB_ROOT"; find $DIRS $PRUNE_TEST -o $PRUNE_BENCH -o -name '*.cpp' -print | sort | tr "\n" " "`
|
||||
set +f # re-enable globbing
|
||||
|
||||
# The sources consist of the portable files, plus the platform-specific port
|
||||
# file.
|
||||
echo "SOURCES=$PORTABLE_FILES $GENERIC_PORT_FILES $PORT_FILES" >> "$OUTPUT"
|
||||
echo "SOURCESCPP=$PORTABLE_CPP" >> "$OUTPUT"
|
||||
echo "MEMENV_SOURCES=helpers/memenv/memenv.cc" >> "$OUTPUT"
|
||||
|
||||
if [ "$CROSS_COMPILE" = "true" -o "$FBCODE_BUILD" = "true" ]; then
|
||||
# Cross-compiling; do not try any compilation tests.
|
||||
# Also don't need any compilation tests if compiling on fbcode
|
||||
true
|
||||
else
|
||||
# If -std=c++0x works, use <atomic>. Otherwise use port_posix.h.
|
||||
$CXX $CFLAGS -std=c++0x -x c++ - -o /dev/null 2>/dev/null <<EOF
|
||||
#include <atomic>
|
||||
int main() {}
|
||||
EOF
|
||||
if [ "$?" = 0 ]; then
|
||||
COMMON_FLAGS="$COMMON_FLAGS -DROCKSDB_ATOMIC_PRESENT"
|
||||
fi
|
||||
|
||||
# Test whether fallocate is available
|
||||
$CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null <<EOF
|
||||
#include <fcntl.h>
|
||||
int main() {
|
||||
int fd = open("/dev/null", 0);
|
||||
fallocate(fd, 0, 0, 1024);
|
||||
}
|
||||
EOF
|
||||
if [ "$?" = 0 ]; then
|
||||
COMMON_FLAGS="$COMMON_FLAGS -DROCKSDB_FALLOCATE_PRESENT"
|
||||
fi
|
||||
|
||||
# Test whether Snappy library is installed
|
||||
# http://code.google.com/p/snappy/
|
||||
$CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null <<EOF
|
||||
#include <snappy.h>
|
||||
int main() {}
|
||||
EOF
|
||||
if [ "$?" = 0 ]; then
|
||||
COMMON_FLAGS="$COMMON_FLAGS -DSNAPPY"
|
||||
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lsnappy"
|
||||
fi
|
||||
|
||||
|
||||
# Test whether gflags library is installed
|
||||
# http://code.google.com/p/gflags/
|
||||
# check if the namespace is gflags
|
||||
$CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null <<EOF
|
||||
#include <gflags/gflags.h>
|
||||
using namespace gflags;
|
||||
int main() {}
|
||||
EOF
|
||||
if [ "$?" = 0 ]; then
|
||||
COMMON_FLAGS="$COMMON_FLAGS -DGFLAGS=gflags"
|
||||
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lgflags"
|
||||
fi
|
||||
|
||||
# check if namespace is google
|
||||
$CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null <<EOF
|
||||
#include <gflags/gflags.h>
|
||||
using namespace google;
|
||||
int main() {}
|
||||
EOF
|
||||
if [ "$?" = 0 ]; then
|
||||
COMMON_FLAGS="$COMMON_FLAGS -DGFLAGS=google"
|
||||
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lgflags"
|
||||
fi
|
||||
|
||||
# Test whether zlib library is installed
|
||||
$CXX $CFLAGS $COMMON_FLAGS -x c++ - -o /dev/null 2>/dev/null <<EOF
|
||||
#include <zlib.h>
|
||||
int main() {}
|
||||
EOF
|
||||
if [ "$?" = 0 ]; then
|
||||
COMMON_FLAGS="$COMMON_FLAGS -DZLIB"
|
||||
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lz"
|
||||
fi
|
||||
|
||||
# Test whether bzip library is installed
|
||||
$CXX $CFLAGS $COMMON_FLAGS -x c++ - -o /dev/null 2>/dev/null <<EOF
|
||||
#include <bzlib.h>
|
||||
int main() {}
|
||||
EOF
|
||||
if [ "$?" = 0 ]; then
|
||||
COMMON_FLAGS="$COMMON_FLAGS -DBZIP2"
|
||||
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lbz2"
|
||||
fi
|
||||
|
||||
# Test whether lz4 library is installed
|
||||
$CXX $CFLAGS $COMMON_FLAGS -x c++ - -o /dev/null 2>/dev/null <<EOF
|
||||
#include <lz4.h>
|
||||
#include <lz4hc.h>
|
||||
int main() {}
|
||||
EOF
|
||||
if [ "$?" = 0 ]; then
|
||||
COMMON_FLAGS="$COMMON_FLAGS -DLZ4"
|
||||
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -llz4"
|
||||
fi
|
||||
|
||||
# Test whether tcmalloc is available
|
||||
$CXX $CFLAGS -x c++ - -o /dev/null -ltcmalloc 2>/dev/null <<EOF
|
||||
int main() {}
|
||||
EOF
|
||||
if [ "$?" = 0 ]; then
|
||||
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -ltcmalloc"
|
||||
fi
|
||||
fi
|
||||
|
||||
# shall we use HDFS?
|
||||
|
||||
if test "$USE_HDFS"; then
|
||||
if test -z "$JAVA_HOME"; then
|
||||
echo "JAVA_HOME has to be set for HDFS usage."
|
||||
exit 1
|
||||
fi
|
||||
HDFS_CCFLAGS="$HDFS_CCFLAGS -I$JAVA_HOME/include -I$JAVA_HOME/include/linux -DUSE_HDFS"
|
||||
HDFS_LDFLAGS="$HDFS_LDFLAGS -Wl,--no-whole-archive -lhdfs -L$JAVA_HOME/jre/lib/amd64"
|
||||
HDFS_LDFLAGS="$HDFS_LDFLAGS -L$JAVA_HOME/jre/lib/amd64/server -L$GLIBC_RUNTIME_PATH/lib"
|
||||
HDFS_LDFLAGS="$HDFS_LDFLAGS -ldl -lverify -ljava -ljvm"
|
||||
COMMON_FLAGS="$COMMON_FLAGS $HDFS_CCFLAGS"
|
||||
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS $HDFS_LDFLAGS"
|
||||
fi
|
||||
|
||||
# if Intel SSE instruction set is supported, set USE_SSE=" -msse -msse4.2 "
|
||||
COMMON_FLAGS="$COMMON_FLAGS $USE_SSE"
|
||||
|
||||
PLATFORM_CCFLAGS="$PLATFORM_CCFLAGS $COMMON_FLAGS"
|
||||
PLATFORM_CXXFLAGS="$PLATFORM_CXXFLAGS $COMMON_FLAGS"
|
||||
|
||||
VALGRIND_VER="$VALGRIND_VER"
|
||||
|
||||
echo "CC=$CC" >> "$OUTPUT"
|
||||
echo "CXX=$CXX" >> "$OUTPUT"
|
||||
echo "PLATFORM=$PLATFORM" >> "$OUTPUT"
|
||||
echo "PLATFORM_LDFLAGS=$PLATFORM_LDFLAGS" >> "$OUTPUT"
|
||||
echo "VALGRIND_VER=$VALGRIND_VER" >> "$OUTPUT"
|
||||
echo "PLATFORM_CCFLAGS=$PLATFORM_CCFLAGS" >> "$OUTPUT"
|
||||
echo "PLATFORM_CXXFLAGS=$PLATFORM_CXXFLAGS" >> "$OUTPUT"
|
||||
echo "PLATFORM_SHARED_CFLAGS=$PLATFORM_SHARED_CFLAGS" >> "$OUTPUT"
|
||||
echo "PLATFORM_SHARED_EXT=$PLATFORM_SHARED_EXT" >> "$OUTPUT"
|
||||
echo "PLATFORM_SHARED_LDFLAGS=$PLATFORM_SHARED_LDFLAGS" >> "$OUTPUT"
|
||||
echo "PLATFORM_SHARED_VERSIONED=$PLATFORM_SHARED_VERSIONED" >> "$OUTPUT"
|
||||
echo "EXEC_LDFLAGS=$EXEC_LDFLAGS" >> "$OUTPUT"
|
||||
echo "JEMALLOC_INCLUDE=$JEMALLOC_INCLUDE" >> "$OUTPUT"
|
||||
echo "JEMALLOC_LIB=$JEMALLOC_LIB" >> "$OUTPUT"
|
||||
22
build_tools/build_detect_version
Executable file
22
build_tools/build_detect_version
Executable file
@@ -0,0 +1,22 @@
|
||||
#!/bin/sh
|
||||
#
|
||||
# Record the version of the source that we are compiling.
|
||||
# We keep a record of the git revision in util/version.cc. This source file
|
||||
# is then built as a regular source file as part of the compilation process.
|
||||
# One can run "strings executable_filename | grep _build_" to find the version of
|
||||
# the source that we used to build the executable file.
|
||||
|
||||
OUTFILE="$PWD/util/build_version.cc"
|
||||
|
||||
GIT_SHA=""
|
||||
if command -v git >/dev/null 2>&1; then
|
||||
GIT_SHA=$(git rev-parse HEAD 2>/dev/null)
|
||||
fi
|
||||
|
||||
cat > "${OUTFILE}" <<EOF
|
||||
#include "build_version.h"
|
||||
const char* rocksdb_build_git_sha = "rocksdb_build_git_sha:${GIT_SHA}";
|
||||
const char* rocksdb_build_git_datetime = "rocksdb_build_git_datetime:$(date)";
|
||||
const char* rocksdb_build_compile_date = __DATE__;
|
||||
const char* rocksdb_build_compile_time = __TIME__;
|
||||
EOF
|
||||
74
build_tools/fbcode.clang31.sh
Normal file
74
build_tools/fbcode.clang31.sh
Normal file
@@ -0,0 +1,74 @@
|
||||
#!/bin/sh
|
||||
#
|
||||
# Set environment variables so that we can compile leveldb using
|
||||
# fbcode settings. It uses the latest g++ compiler and also
|
||||
# uses jemalloc
|
||||
|
||||
TOOLCHAIN_REV=fbe3b095a4cc4a3713730050d182b7b4a80c342f
|
||||
TOOLCHAIN_EXECUTABLES="/mnt/gvfs/third-party/$TOOLCHAIN_REV/centos5.2-native"
|
||||
TOOLCHAIN_LIB_BASE="/mnt/gvfs/third-party/$TOOLCHAIN_REV/gcc-4.7.1-glibc-2.14.1"
|
||||
TOOL_JEMALLOC=jemalloc-3.3.1/9202ce3
|
||||
GLIBC_RUNTIME_PATH=/usr/local/fbcode/gcc-4.7.1-glibc-2.14.1
|
||||
|
||||
# location of libgcc
|
||||
LIBGCC_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.7.1/afc21dc/include"
|
||||
LIBGCC_LIBS=" -L $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.7.1/afc21dc/libs"
|
||||
|
||||
# location of glibc
|
||||
GLIBC_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/glibc/glibc-2.14.1/99df8fc/include"
|
||||
GLIBC_LIBS=" -L $TOOLCHAIN_LIB_BASE/glibc/glibc-2.14.1/99df8fc/lib"
|
||||
|
||||
# location of snappy headers and libraries
|
||||
SNAPPY_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/snappy/snappy-1.0.3/7518bbe/include"
|
||||
SNAPPY_LIBS=" $TOOLCHAIN_LIB_BASE/snappy/snappy-1.0.3/7518bbe/lib/libsnappy.a"
|
||||
|
||||
# location of zlib headers and libraries
|
||||
ZLIB_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/zlib/zlib-1.2.5/91ddd43/include"
|
||||
ZLIB_LIBS=" $TOOLCHAIN_LIB_BASE/zlib/zlib-1.2.5/91ddd43/lib/libz.a"
|
||||
|
||||
# location of gflags headers and libraries
|
||||
GFLAGS_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/gflags/gflags-1.6/91ddd43/include"
|
||||
GFLAGS_LIBS=" $TOOLCHAIN_LIB_BASE/gflags/gflags-1.6/91ddd43/lib/libgflags.a"
|
||||
|
||||
# location of bzip headers and libraries
|
||||
BZIP_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/bzip2/bzip2-1.0.6/91ddd43/include"
|
||||
BZIP_LIBS=" $TOOLCHAIN_LIB_BASE/bzip2/bzip2-1.0.6/91ddd43/lib/libbz2.a"
|
||||
|
||||
# location of gflags headers and libraries
|
||||
GFLAGS_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/gflags/gflags-1.6/91ddd43/include"
|
||||
GFLAGS_LIBS=" $TOOLCHAIN_LIB_BASE/gflags/gflags-1.6/91ddd43/lib/libgflags.a"
|
||||
|
||||
# use Intel SSE support for checksum calculations
|
||||
export USE_SSE=" -msse -msse4.2 "
|
||||
|
||||
CC="$TOOLCHAIN_EXECUTABLES/clang/clang-3.2/0b7c69d/bin/clang $CLANG_INCLUDES"
|
||||
CXX="$TOOLCHAIN_EXECUTABLES/clang/clang-3.2/0b7c69d/bin/clang++ $CLANG_INCLUDES $JINCLUDE $SNAPPY_INCLUDE $ZLIB_INCLUDE $BZIP_INCLUDE $GFLAGS_INCLUDE"
|
||||
AR=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ar
|
||||
RANLIB=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ranlib
|
||||
|
||||
CFLAGS="-B$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin -nostdlib "
|
||||
CFLAGS+=" -nostdinc -isystem $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.7.1/afc21dc/include/c++/4.7.1 "
|
||||
CFLAGS+=" -isystem $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.7.1/afc21dc/include/c++/4.7.1/x86_64-facebook-linux "
|
||||
CFLAGS+=" -isystem $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.7.1/afc21dc/include/c++/4.7.1/backward "
|
||||
CFLAGS+=" -isystem $TOOLCHAIN_LIB_BASE/glibc/glibc-2.14.1/99df8fc/include "
|
||||
CFLAGS+=" -isystem $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.7.1/afc21dc/include "
|
||||
CFLAGS+=" -isystem $TOOLCHAIN_LIB_BASE/clang/clang-3.2/0b7c69d/lib/clang/3.2/include "
|
||||
CFLAGS+=" -isystem $TOOLCHAIN_LIB_BASE/kernel-headers/kernel-headers-3.2.18_70_fbk11_00129_gc8882d0/da39a3e/include/linux "
|
||||
CFLAGS+=" -isystem $TOOLCHAIN_LIB_BASE/kernel-headers/kernel-headers-3.2.18_70_fbk11_00129_gc8882d0/da39a3e/include "
|
||||
CFLAGS+=" -Wall -Wno-sign-compare -Wno-unused-variable -Winvalid-pch -Wno-deprecated -Woverloaded-virtual"
|
||||
CFLAGS+=" $LIBGCC_INCLUDE $GLIBC_INCLUDE"
|
||||
CXXFLAGS="$CFLAGS -nostdinc++"
|
||||
|
||||
CFLAGS+=" -I $TOOLCHAIN_LIB_BASE/jemalloc/$TOOL_JEMALLOC/include -DHAVE_JEMALLOC"
|
||||
|
||||
EXEC_LDFLAGS=" -Wl,--whole-archive $TOOLCHAIN_LIB_BASE/jemalloc/$TOOL_JEMALLOC/lib/libjemalloc.a"
|
||||
EXEC_LDFLAGS+=" -Wl,--no-whole-archive $TOOLCHAIN_LIB_BASE/libunwind/libunwind-1.0.1/350336c/lib/libunwind.a"
|
||||
EXEC_LDFLAGS+=" $HDFSLIB $SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $GFLAGS_LIBS"
|
||||
EXEC_LDFLAGS+=" -Wl,--dynamic-linker,$GLIBC_RUNTIME_PATH/lib/ld-linux-x86-64.so.2"
|
||||
EXEC_LDFLAGS+=" -B$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin"
|
||||
|
||||
PLATFORM_LDFLAGS="$LIBGCC_LIBS $GLIBC_LIBS "
|
||||
|
||||
EXEC_LDFLAGS_SHARED="$SNAPPY_LIBS $ZLIB_LIBS $GFLAGS_LIBS"
|
||||
|
||||
export CC CXX AR RANLIB CFLAGS EXEC_LDFLAGS EXEC_LDFLAGS_SHARED
|
||||
70
build_tools/fbcode.gcc471.sh
Normal file
70
build_tools/fbcode.gcc471.sh
Normal file
@@ -0,0 +1,70 @@
|
||||
#!/bin/sh
|
||||
#
|
||||
# Set environment variables so that we can compile leveldb using
|
||||
# fbcode settings. It uses the latest g++ compiler and also
|
||||
# uses jemalloc
|
||||
|
||||
TOOLCHAIN_REV=fbe3b095a4cc4a3713730050d182b7b4a80c342f
|
||||
TOOLCHAIN_EXECUTABLES="/mnt/gvfs/third-party/$TOOLCHAIN_REV/centos5.2-native"
|
||||
TOOLCHAIN_LIB_BASE="/mnt/gvfs/third-party/$TOOLCHAIN_REV/gcc-4.7.1-glibc-2.14.1"
|
||||
TOOL_JEMALLOC=jemalloc-3.3.1/9202ce3
|
||||
|
||||
# location of libhdfs libraries
|
||||
if test "$USE_HDFS"; then
|
||||
JAVA_HOME="/usr/local/jdk-6u22-64"
|
||||
JINCLUDE="-I$JAVA_HOME/include -I$JAVA_HOME/include/linux"
|
||||
GLIBC_RUNTIME_PATH="/usr/local/fbcode/gcc-4.7.1-glibc-2.14.1"
|
||||
HDFSLIB=" -Wl,--no-whole-archive hdfs/libhdfs.a -L$JAVA_HOME/jre/lib/amd64 "
|
||||
HDFSLIB+=" -L$JAVA_HOME/jre/lib/amd64/server -L$GLIBC_RUNTIME_PATH/lib "
|
||||
HDFSLIB+=" -ldl -lverify -ljava -ljvm "
|
||||
fi
|
||||
|
||||
# location of libgcc
|
||||
LIBGCC_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.7.1/afc21dc/include"
|
||||
LIBGCC_LIBS=" -L $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.7.1/afc21dc/libs"
|
||||
|
||||
# location of glibc
|
||||
GLIBC_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/glibc/glibc-2.14.1/99df8fc/include"
|
||||
GLIBC_LIBS=" -L $TOOLCHAIN_LIB_BASE/glibc/glibc-2.14.1/99df8fc/lib"
|
||||
|
||||
# location of snappy headers and libraries
|
||||
SNAPPY_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/snappy/snappy-1.0.3/7518bbe/include"
|
||||
SNAPPY_LIBS=" $TOOLCHAIN_LIB_BASE/snappy/snappy-1.0.3/7518bbe/lib/libsnappy.a"
|
||||
|
||||
# location of zlib headers and libraries
|
||||
ZLIB_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/zlib/zlib-1.2.5/91ddd43/include"
|
||||
ZLIB_LIBS=" $TOOLCHAIN_LIB_BASE/zlib/zlib-1.2.5/91ddd43/lib/libz.a"
|
||||
|
||||
# location of bzip headers and libraries
|
||||
BZIP_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/bzip2/bzip2-1.0.6/91ddd43/include"
|
||||
BZIP_LIBS=" $TOOLCHAIN_LIB_BASE/bzip2/bzip2-1.0.6/91ddd43/lib/libbz2.a"
|
||||
|
||||
# location of gflags headers and libraries
|
||||
GFLAGS_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/gflags/gflags-1.6/91ddd43/include"
|
||||
GFLAGS_LIBS=" $TOOLCHAIN_LIB_BASE/gflags/gflags-1.6/91ddd43/lib/libgflags.a"
|
||||
|
||||
# use Intel SSE support for checksum calculations
|
||||
export USE_SSE=" -msse -msse4.2 "
|
||||
|
||||
CC="$TOOLCHAIN_EXECUTABLES/gcc/gcc-4.7.1-glibc-2.14.1/bin/gcc"
|
||||
CXX="$TOOLCHAIN_EXECUTABLES/gcc/gcc-4.7.1-glibc-2.14.1/bin/g++ $JINCLUDE $SNAPPY_INCLUDE $ZLIB_INCLUDE $BZIP_INCLUDE $GFLAGS_INCLUDE"
|
||||
AR=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ar
|
||||
RANLIB=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ranlib
|
||||
|
||||
CFLAGS="-B$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/gold -m64 -mtune=generic"
|
||||
CFLAGS+=" -I $TOOLCHAIN_LIB_BASE/jemalloc/$TOOL_JEMALLOC/include -DHAVE_JEMALLOC"
|
||||
CFLAGS+=" $LIBGCC_INCLUDE $GLIBC_INCLUDE"
|
||||
CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_ATOMIC_PRESENT -DROCKSDB_FALLOCATE_PRESENT"
|
||||
CFLAGS+=" -DSNAPPY -DGFLAGS=google -DZLIB -DBZIP2"
|
||||
|
||||
EXEC_LDFLAGS=" -Wl,--whole-archive $TOOLCHAIN_LIB_BASE/jemalloc/$TOOL_JEMALLOC/lib/libjemalloc.a"
|
||||
EXEC_LDFLAGS+=" -Wl,--no-whole-archive $TOOLCHAIN_LIB_BASE/libunwind/libunwind-1.0.1/350336c/lib/libunwind.a"
|
||||
EXEC_LDFLAGS+=" $HDFSLIB $SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $GFLAGS_LIBS"
|
||||
|
||||
PLATFORM_LDFLAGS="$LIBGCC_LIBS $GLIBC_LIBS "
|
||||
|
||||
EXEC_LDFLAGS_SHARED="$SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $GFLAGS_LIBS"
|
||||
|
||||
VALGRIND_VER="$TOOLCHAIN_LIB_BASE/valgrind/valgrind-3.8.1/91ddd43/bin/"
|
||||
|
||||
export CC CXX AR RANLIB CFLAGS EXEC_LDFLAGS EXEC_LDFLAGS_SHARED VALGRIND_VER
|
||||
81
build_tools/fbcode.gcc481.sh
Normal file
81
build_tools/fbcode.gcc481.sh
Normal file
@@ -0,0 +1,81 @@
|
||||
#!/bin/sh
|
||||
#
|
||||
# Set environment variables so that we can compile rocksdb using
|
||||
# fbcode settings. It uses the latest g++ compiler and also
|
||||
# uses jemalloc
|
||||
|
||||
TOOLCHAIN_REV=53dc1fe83f84e9145b9ffb81b81aa7f6a49c87cc
|
||||
CENTOS_VERSION=`rpm -q --qf "%{VERSION}" $(rpm -q --whatprovides redhat-release)`
|
||||
if [ "$CENTOS_VERSION" = "6" ]; then
|
||||
TOOLCHAIN_EXECUTABLES="/mnt/gvfs/third-party/$TOOLCHAIN_REV/centos6-native"
|
||||
else
|
||||
TOOLCHAIN_EXECUTABLES="/mnt/gvfs/third-party/$TOOLCHAIN_REV/centos5.2-native"
|
||||
fi
|
||||
TOOLCHAIN_LIB_BASE="/mnt/gvfs/third-party/$TOOLCHAIN_REV/gcc-4.8.1-glibc-2.17"
|
||||
|
||||
# location of libhdfs libraries
|
||||
if test "$USE_HDFS"; then
|
||||
JAVA_HOME="/usr/local/jdk-6u22-64"
|
||||
JINCLUDE="-I$JAVA_HOME/include -I$JAVA_HOME/include/linux"
|
||||
GLIBC_RUNTIME_PATH="/usr/local/fbcode/gcc-4.8.1-glibc-2.17"
|
||||
HDFSLIB=" -Wl,--no-whole-archive hdfs/libhdfs.a -L$JAVA_HOME/jre/lib/amd64 "
|
||||
HDFSLIB+=" -L$JAVA_HOME/jre/lib/amd64/server -L$GLIBC_RUNTIME_PATH/lib "
|
||||
HDFSLIB+=" -ldl -lverify -ljava -ljvm "
|
||||
fi
|
||||
|
||||
# location of libgcc
|
||||
LIBGCC_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.8.1/8aac7fc/include"
|
||||
LIBGCC_LIBS=" -L $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.8.1/8aac7fc/libs"
|
||||
|
||||
# location of glibc
|
||||
GLIBC_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/glibc/glibc-2.17/99df8fc/include"
|
||||
GLIBC_LIBS=" -L $TOOLCHAIN_LIB_BASE/glibc/glibc-2.17/99df8fc/lib"
|
||||
|
||||
# location of snappy headers and libraries
|
||||
SNAPPY_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/snappy/snappy-1.0.3/43d84e2/include"
|
||||
SNAPPY_LIBS=" $TOOLCHAIN_LIB_BASE/snappy/snappy-1.0.3/43d84e2/lib/libsnappy.a"
|
||||
|
||||
# location of zlib headers and libraries
|
||||
ZLIB_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/zlib/zlib-1.2.5/c3f970a/include"
|
||||
ZLIB_LIBS=" $TOOLCHAIN_LIB_BASE/zlib/zlib-1.2.5/c3f970a/lib/libz.a"
|
||||
|
||||
# location of bzip headers and libraries
|
||||
BZIP_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/bzip2/bzip2-1.0.6/c3f970a/include"
|
||||
BZIP_LIBS=" $TOOLCHAIN_LIB_BASE/bzip2/bzip2-1.0.6/c3f970a/lib/libbz2.a"
|
||||
|
||||
LZ4_REV=065ec7e38fe83329031f6668c43bef83eff5808b
|
||||
LZ4_INCLUDE=" -I /mnt/gvfs/third-party2/lz4/$LZ4_REV/r108/gcc-4.8.1-glibc-2.17/c3f970a/include"
|
||||
LZ4_LIBS=" /mnt/gvfs/third-party2/lz4/$LZ4_REV/r108/gcc-4.8.1-glibc-2.17/c3f970a/lib/liblz4.a"
|
||||
|
||||
# location of gflags headers and libraries
|
||||
GFLAGS_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/gflags/gflags-1.6/c3f970a/include"
|
||||
GFLAGS_LIBS=" $TOOLCHAIN_LIB_BASE/gflags/gflags-1.6/c3f970a/lib/libgflags.a"
|
||||
|
||||
# location of jemalloc
|
||||
JEMALLOC_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/jemalloc/jemalloc-3.4.1/4d53c6f/include/"
|
||||
JEMALLOC_LIB=" -Wl,--whole-archive $TOOLCHAIN_LIB_BASE/jemalloc/jemalloc-3.4.1/4d53c6f/lib/libjemalloc.a"
|
||||
|
||||
# use Intel SSE support for checksum calculations
|
||||
export USE_SSE=" -msse -msse4.2 "
|
||||
|
||||
CC="$TOOLCHAIN_EXECUTABLES/gcc/gcc-4.8.1/cc6c9dc/bin/gcc"
|
||||
CXX="$TOOLCHAIN_EXECUTABLES/gcc/gcc-4.8.1/cc6c9dc/bin/g++ $JINCLUDE $SNAPPY_INCLUDE $ZLIB_INCLUDE $BZIP_INCLUDE $LZ4_INCLUDE $GFLAGS_INCLUDE"
|
||||
AR=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ar
|
||||
RANLIB=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ranlib
|
||||
|
||||
CFLAGS="-B$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/gold -m64 -mtune=generic"
|
||||
CFLAGS+=" $LIBGCC_INCLUDE $GLIBC_INCLUDE"
|
||||
CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_ATOMIC_PRESENT -DROCKSDB_FALLOCATE_PRESENT"
|
||||
CFLAGS+=" -DSNAPPY -DGFLAGS=google -DZLIB -DBZIP2 -DLZ4"
|
||||
|
||||
EXEC_LDFLAGS="-Wl,--dynamic-linker,/usr/local/fbcode/gcc-4.8.1-glibc-2.17/lib/ld.so"
|
||||
EXEC_LDFLAGS+=" -Wl,--no-whole-archive $TOOLCHAIN_LIB_BASE/libunwind/libunwind-1.0.1/675d945/lib/libunwind.a"
|
||||
EXEC_LDFLAGS+=" $HDFSLIB $SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $GFLAGS_LIBS"
|
||||
|
||||
PLATFORM_LDFLAGS="$LIBGCC_LIBS $GLIBC_LIBS "
|
||||
|
||||
EXEC_LDFLAGS_SHARED="$SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $GFLAGS_LIBS"
|
||||
|
||||
VALGRIND_VER="$TOOLCHAIN_LIB_BASE/valgrind/valgrind-3.8.1/c3f970a/bin/"
|
||||
|
||||
export CC CXX AR RANLIB CFLAGS EXEC_LDFLAGS EXEC_LDFLAGS_SHARED VALGRIND_VER JEMALLOC_LIB JEMALLOC_INCLUDE
|
||||
107
build_tools/format-diff.sh
Executable file
107
build_tools/format-diff.sh
Executable file
@@ -0,0 +1,107 @@
|
||||
#!/bin/bash
|
||||
# If clang_format_diff.py command is not specfied, we assume we are able to
|
||||
# access directly without any path.
|
||||
if [ -z $CLANG_FORMAT_DIFF ]
|
||||
then
|
||||
CLANG_FORMAT_DIFF="clang-format-diff.py"
|
||||
fi
|
||||
|
||||
# Check clang-format-diff.py
|
||||
if ! which $CLANG_FORMAT_DIFF &> /dev/null
|
||||
then
|
||||
echo "You didn't have clang-format-diff.py available in your computer!"
|
||||
echo "You can download it by running: "
|
||||
echo " curl http://goo.gl/iUW1u2"
|
||||
exit 128
|
||||
fi
|
||||
|
||||
# Check argparse, a library that clang-format-diff.py requires.
|
||||
python 2>/dev/null << EOF
|
||||
import argparse
|
||||
EOF
|
||||
|
||||
if [ "$?" != 0 ]
|
||||
then
|
||||
echo "To run clang-format-diff.py, we'll need the library "argparse" to be"
|
||||
echo "installed. You can try either of the follow ways to install it:"
|
||||
echo " 1. Manually download argparse: https://pypi.python.org/pypi/argparse"
|
||||
echo " 2. easy_install argparse (if you have easy_install)"
|
||||
echo " 3. pip install argparse (if you have pip)"
|
||||
exit 129
|
||||
fi
|
||||
|
||||
# TODO(kailiu) following work is not complete since we still need to figure
|
||||
# out how to add the modified files done pre-commit hook to git's commit index.
|
||||
#
|
||||
# Check if this script has already been added to pre-commit hook.
|
||||
# Will suggest user to add this script to pre-commit hook if their pre-commit
|
||||
# is empty.
|
||||
# PRE_COMMIT_SCRIPT_PATH="`git rev-parse --show-toplevel`/.git/hooks/pre-commit"
|
||||
# if ! ls $PRE_COMMIT_SCRIPT_PATH &> /dev/null
|
||||
# then
|
||||
# echo "Would you like to add this script to pre-commit hook, which will do "
|
||||
# echo -n "the format check for all the affected lines before you check in (y/n):"
|
||||
# read add_to_hook
|
||||
# if [ "$add_to_hook" == "y" ]
|
||||
# then
|
||||
# ln -s `git rev-parse --show-toplevel`/build_tools/format-diff.sh $PRE_COMMIT_SCRIPT_PATH
|
||||
# fi
|
||||
# fi
|
||||
set -e
|
||||
|
||||
uncommitted_code=`git diff HEAD`
|
||||
|
||||
# If there's no uncommitted changes, we assume user are doing post-commit
|
||||
# format check, in which case we'll check the modified lines from latest commit.
|
||||
# Otherwise, we'll check format of the uncommitted code only.
|
||||
if [ -z "$uncommitted_code" ]
|
||||
then
|
||||
# Check the format of last commit
|
||||
diffs=$(git diff -U0 HEAD^ | $CLANG_FORMAT_DIFF -p 1)
|
||||
else
|
||||
# Check the format of uncommitted lines,
|
||||
diffs=$(git diff -U0 HEAD | $CLANG_FORMAT_DIFF -p 1)
|
||||
fi
|
||||
|
||||
if [ -z "$diffs" ]
|
||||
then
|
||||
echo "Nothing needs to be reformatted!"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# Highlight the insertion/deletion from the clang-format-diff.py's output
|
||||
COLOR_END="\033[0m"
|
||||
COLOR_RED="\033[0;31m"
|
||||
COLOR_GREEN="\033[0;32m"
|
||||
|
||||
echo -e "Detect lines that doesn't follow the format rules:\r"
|
||||
# Add the color to the diff. lines added will be green; lines removed will be red.
|
||||
echo "$diffs" |
|
||||
sed -e "s/\(^-.*$\)/`echo -e \"$COLOR_RED\1$COLOR_END\"`/" |
|
||||
sed -e "s/\(^+.*$\)/`echo -e \"$COLOR_GREEN\1$COLOR_END\"`/"
|
||||
echo -e "Would you like to fix the format automatically (y/n): \c"
|
||||
|
||||
# Make sure under any mode, we can read user input.
|
||||
exec < /dev/tty
|
||||
read to_fix
|
||||
|
||||
if [ "$to_fix" != "y" ]
|
||||
then
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Do in-place format adjustment.
|
||||
git diff -U0 HEAD^ | $CLANG_FORMAT_DIFF -i -p 1
|
||||
echo "Files reformatted!"
|
||||
|
||||
# Amend to last commit if user do the post-commit format check
|
||||
if [ -z "$uncommitted_code" ]; then
|
||||
echo -e "Would you like to amend the changes to last commit (`git log HEAD --oneline | head -1`)? (y/n): \c"
|
||||
read to_amend
|
||||
|
||||
if [ "$to_amend" == "y" ]
|
||||
then
|
||||
git commit -a --amend --reuse-message HEAD
|
||||
echo "Amended to last commit"
|
||||
fi
|
||||
fi
|
||||
25
build_tools/mac-install-gflags.sh
Executable file
25
build_tools/mac-install-gflags.sh
Executable file
@@ -0,0 +1,25 @@
|
||||
#!/bin/sh
|
||||
# Install gflags for mac developers.
|
||||
|
||||
set -e
|
||||
|
||||
DIR=`mktemp -d /tmp/rocksdb_gflags_XXXX`
|
||||
|
||||
cd $DIR
|
||||
wget https://gflags.googlecode.com/files/gflags-2.0.tar.gz
|
||||
tar xvfz gflags-2.0.tar.gz
|
||||
cd gflags-2.0
|
||||
|
||||
./configure
|
||||
make
|
||||
make install
|
||||
|
||||
# Add include/lib path for g++
|
||||
echo 'export LIBRARY_PATH+=":/usr/local/lib"' >> ~/.bash_profile
|
||||
echo 'export CPATH+=":/usr/local/include"' >> ~/.bash_profile
|
||||
|
||||
echo ""
|
||||
echo "-----------------------------------------------------------------------------"
|
||||
echo "| Installation Completed |"
|
||||
echo "-----------------------------------------------------------------------------"
|
||||
echo "Please run `. ~/bash_profile` to be able to compile with gflags"
|
||||
46
build_tools/make_new_version.sh
Executable file
46
build_tools/make_new_version.sh
Executable file
@@ -0,0 +1,46 @@
|
||||
#!/bin/bash
|
||||
# Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
# This source code is licensed under the BSD-style license found in the
|
||||
# LICENSE file in the root directory of this source tree. An additional grant
|
||||
# of patent rights can be found in the PATENTS file in the same directory.
|
||||
|
||||
set -e
|
||||
if [ -z "$GIT" ]
|
||||
then
|
||||
GIT="git"
|
||||
fi
|
||||
|
||||
# Print out the colored progress info so that it can be brainlessly
|
||||
# distinguished by users.
|
||||
function title() {
|
||||
echo -e "\033[1;32m$*\033[0m"
|
||||
}
|
||||
|
||||
usage="Create new RocksDB version and prepare it for the release process\n"
|
||||
usage+="USAGE: ./make_new_version.sh <version>"
|
||||
|
||||
# -- Pre-check
|
||||
if [[ $# < 1 ]]; then
|
||||
echo -e $usage
|
||||
exit 1
|
||||
fi
|
||||
|
||||
ROCKSDB_VERSION=$1
|
||||
|
||||
GIT_BRANCH=`git rev-parse --abbrev-ref HEAD`
|
||||
echo $GIT_BRANCH
|
||||
|
||||
if [ $GIT_BRANCH != "master" ]; then
|
||||
echo "Error: Current branch is '$GIT_BRANCH', Please switch to master branch."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
title "Adding new tag for this release ..."
|
||||
BRANCH="$ROCKSDB_VERSION.fb"
|
||||
$GIT co -b $BRANCH
|
||||
|
||||
# Setting up the proxy for remote repo access
|
||||
title "Pushing new branch to remote repo ..."
|
||||
git push origin --set-upstream $BRANCH
|
||||
|
||||
title "Branch $BRANCH is pushed to github;"
|
||||
330
build_tools/regression_build_test.sh
Executable file
330
build_tools/regression_build_test.sh
Executable file
@@ -0,0 +1,330 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -e
|
||||
|
||||
NUM=10000000
|
||||
|
||||
if [ $# -eq 1 ];then
|
||||
DATA_DIR=$1
|
||||
elif [ $# -eq 2 ];then
|
||||
DATA_DIR=$1
|
||||
STAT_FILE=$2
|
||||
fi
|
||||
|
||||
# On the production build servers, set data and stat
|
||||
# files/directories not in /tmp or else the tempdir cleaning
|
||||
# scripts will make you very unhappy.
|
||||
DATA_DIR=${DATA_DIR:-$(mktemp -t -d rocksdb_XXXX)}
|
||||
STAT_FILE=${STAT_FILE:-$(mktemp -t -u rocksdb_test_stats_XXXX)}
|
||||
|
||||
function cleanup {
|
||||
rm -rf $DATA_DIR
|
||||
rm -f $STAT_FILE.fillseq
|
||||
rm -f $STAT_FILE.readrandom
|
||||
rm -f $STAT_FILE.overwrite
|
||||
rm -f $STAT_FILE.memtablefillreadrandom
|
||||
}
|
||||
|
||||
trap cleanup EXIT
|
||||
|
||||
if [ -z $GIT_BRANCH ]; then
|
||||
git_br=`git rev-parse --abbrev-ref HEAD`
|
||||
else
|
||||
git_br=$(basename $GIT_BRANCH)
|
||||
fi
|
||||
|
||||
if [ $git_br == "master" ]; then
|
||||
git_br=""
|
||||
else
|
||||
git_br="."$git_br
|
||||
fi
|
||||
|
||||
make release
|
||||
|
||||
# measure fillseq + fill up the DB for overwrite benchmark
|
||||
./db_bench \
|
||||
--benchmarks=fillseq \
|
||||
--db=$DATA_DIR \
|
||||
--use_existing_db=0 \
|
||||
--bloom_bits=10 \
|
||||
--num=$NUM \
|
||||
--writes=$NUM \
|
||||
--cache_size=6442450944 \
|
||||
--cache_numshardbits=6 \
|
||||
--table_cache_numshardbits=4 \
|
||||
--open_files=55000 \
|
||||
--statistics=1 \
|
||||
--histogram=1 \
|
||||
--disable_data_sync=1 \
|
||||
--disable_wal=1 \
|
||||
--sync=0 > ${STAT_FILE}.fillseq
|
||||
|
||||
# measure overwrite performance
|
||||
./db_bench \
|
||||
--benchmarks=overwrite \
|
||||
--db=$DATA_DIR \
|
||||
--use_existing_db=1 \
|
||||
--bloom_bits=10 \
|
||||
--num=$NUM \
|
||||
--writes=$((NUM / 10)) \
|
||||
--cache_size=6442450944 \
|
||||
--cache_numshardbits=6 \
|
||||
--table_cache_numshardbits=4 \
|
||||
--open_files=55000 \
|
||||
--statistics=1 \
|
||||
--histogram=1 \
|
||||
--disable_data_sync=1 \
|
||||
--disable_wal=1 \
|
||||
--sync=0 \
|
||||
--threads=8 > ${STAT_FILE}.overwrite
|
||||
|
||||
# fill up the db for readrandom benchmark (1GB total size)
|
||||
./db_bench \
|
||||
--benchmarks=fillseq \
|
||||
--db=$DATA_DIR \
|
||||
--use_existing_db=0 \
|
||||
--bloom_bits=10 \
|
||||
--num=$NUM \
|
||||
--writes=$NUM \
|
||||
--cache_size=6442450944 \
|
||||
--cache_numshardbits=6 \
|
||||
--table_cache_numshardbits=4 \
|
||||
--open_files=55000 \
|
||||
--statistics=1 \
|
||||
--histogram=1 \
|
||||
--disable_data_sync=1 \
|
||||
--disable_wal=1 \
|
||||
--sync=0 \
|
||||
--threads=1 > /dev/null
|
||||
|
||||
# measure readrandom with 6GB block cache
|
||||
./db_bench \
|
||||
--benchmarks=readrandom \
|
||||
--db=$DATA_DIR \
|
||||
--use_existing_db=1 \
|
||||
--bloom_bits=10 \
|
||||
--num=$NUM \
|
||||
--reads=$((NUM / 5)) \
|
||||
--cache_size=6442450944 \
|
||||
--cache_numshardbits=6 \
|
||||
--table_cache_numshardbits=4 \
|
||||
--open_files=55000 \
|
||||
--disable_seek_compaction=1 \
|
||||
--statistics=1 \
|
||||
--histogram=1 \
|
||||
--disable_data_sync=1 \
|
||||
--disable_wal=1 \
|
||||
--sync=0 \
|
||||
--threads=16 > ${STAT_FILE}.readrandom
|
||||
|
||||
# measure readrandom with 6GB block cache and tailing iterator
|
||||
./db_bench \
|
||||
--benchmarks=readrandom \
|
||||
--db=$DATA_DIR \
|
||||
--use_existing_db=1 \
|
||||
--bloom_bits=10 \
|
||||
--num=$NUM \
|
||||
--reads=$((NUM / 5)) \
|
||||
--cache_size=6442450944 \
|
||||
--cache_numshardbits=6 \
|
||||
--table_cache_numshardbits=4 \
|
||||
--open_files=55000 \
|
||||
--disable_seek_compaction=1 \
|
||||
--use_tailing_iterator=1 \
|
||||
--statistics=1 \
|
||||
--histogram=1 \
|
||||
--disable_data_sync=1 \
|
||||
--disable_wal=1 \
|
||||
--sync=0 \
|
||||
--threads=16 > ${STAT_FILE}.readrandomtailing
|
||||
|
||||
# measure readrandom with 100MB block cache
|
||||
./db_bench \
|
||||
--benchmarks=readrandom \
|
||||
--db=$DATA_DIR \
|
||||
--use_existing_db=1 \
|
||||
--bloom_bits=10 \
|
||||
--num=$NUM \
|
||||
--reads=$((NUM / 5)) \
|
||||
--cache_size=104857600 \
|
||||
--cache_numshardbits=6 \
|
||||
--table_cache_numshardbits=4 \
|
||||
--open_files=55000 \
|
||||
--disable_seek_compaction=1 \
|
||||
--statistics=1 \
|
||||
--histogram=1 \
|
||||
--disable_data_sync=1 \
|
||||
--disable_wal=1 \
|
||||
--sync=0 \
|
||||
--threads=16 > ${STAT_FILE}.readrandomsmallblockcache
|
||||
|
||||
# measure readrandom with 8k data in memtable
|
||||
./db_bench \
|
||||
--benchmarks=overwrite,readrandom \
|
||||
--db=$DATA_DIR \
|
||||
--use_existing_db=1 \
|
||||
--bloom_bits=10 \
|
||||
--num=$NUM \
|
||||
--reads=$((NUM / 5)) \
|
||||
--writes=512 \
|
||||
--cache_size=6442450944 \
|
||||
--cache_numshardbits=6 \
|
||||
--table_cache_numshardbits=4 \
|
||||
--write_buffer_size=1000000000 \
|
||||
--open_files=55000 \
|
||||
--disable_seek_compaction=1 \
|
||||
--statistics=1 \
|
||||
--histogram=1 \
|
||||
--disable_data_sync=1 \
|
||||
--disable_wal=1 \
|
||||
--sync=0 \
|
||||
--threads=16 > ${STAT_FILE}.readrandom_mem_sst
|
||||
|
||||
|
||||
# fill up the db for readrandom benchmark with filluniquerandom (1GB total size)
|
||||
./db_bench \
|
||||
--benchmarks=filluniquerandom \
|
||||
--db=$DATA_DIR \
|
||||
--use_existing_db=0 \
|
||||
--bloom_bits=10 \
|
||||
--num=$((NUM / 4)) \
|
||||
--writes=$((NUM / 4)) \
|
||||
--cache_size=6442450944 \
|
||||
--cache_numshardbits=6 \
|
||||
--table_cache_numshardbits=4 \
|
||||
--open_files=55000 \
|
||||
--statistics=1 \
|
||||
--histogram=1 \
|
||||
--disable_data_sync=1 \
|
||||
--disable_wal=1 \
|
||||
--sync=0 \
|
||||
--threads=1 > /dev/null
|
||||
|
||||
# dummy test just to compact the data
|
||||
./db_bench \
|
||||
--benchmarks=readrandom \
|
||||
--db=$DATA_DIR \
|
||||
--use_existing_db=1 \
|
||||
--bloom_bits=10 \
|
||||
--num=$((NUM / 1000)) \
|
||||
--reads=$((NUM / 1000)) \
|
||||
--cache_size=6442450944 \
|
||||
--cache_numshardbits=6 \
|
||||
--table_cache_numshardbits=4 \
|
||||
--open_files=55000 \
|
||||
--statistics=1 \
|
||||
--histogram=1 \
|
||||
--disable_data_sync=1 \
|
||||
--disable_wal=1 \
|
||||
--sync=0 \
|
||||
--threads=16 > /dev/null
|
||||
|
||||
# measure readrandom after load with filluniquerandom with 6GB block cache
|
||||
./db_bench \
|
||||
--benchmarks=readrandom \
|
||||
--db=$DATA_DIR \
|
||||
--use_existing_db=1 \
|
||||
--bloom_bits=10 \
|
||||
--num=$((NUM / 4)) \
|
||||
--reads=$((NUM / 4)) \
|
||||
--cache_size=6442450944 \
|
||||
--cache_numshardbits=6 \
|
||||
--table_cache_numshardbits=4 \
|
||||
--open_files=55000 \
|
||||
--disable_seek_compaction=1 \
|
||||
--disable_auto_compactions=1 \
|
||||
--statistics=1 \
|
||||
--histogram=1 \
|
||||
--disable_data_sync=1 \
|
||||
--disable_wal=1 \
|
||||
--sync=0 \
|
||||
--threads=16 > ${STAT_FILE}.readrandom_filluniquerandom
|
||||
|
||||
# measure readwhilewriting after load with filluniquerandom with 6GB block cache
|
||||
./db_bench \
|
||||
--benchmarks=readwhilewriting \
|
||||
--db=$DATA_DIR \
|
||||
--use_existing_db=1 \
|
||||
--bloom_bits=10 \
|
||||
--num=$((NUM / 4)) \
|
||||
--reads=$((NUM / 4)) \
|
||||
--writes_per_second=1000 \
|
||||
--write_buffer_size=100000000 \
|
||||
--cache_size=6442450944 \
|
||||
--cache_numshardbits=6 \
|
||||
--table_cache_numshardbits=4 \
|
||||
--open_files=55000 \
|
||||
--disable_seek_compaction=1 \
|
||||
--statistics=1 \
|
||||
--histogram=1 \
|
||||
--disable_data_sync=1 \
|
||||
--disable_wal=1 \
|
||||
--sync=0 \
|
||||
--threads=16 > ${STAT_FILE}.readwhilewriting
|
||||
|
||||
# measure memtable performance -- none of the data gets flushed to disk
|
||||
./db_bench \
|
||||
--benchmarks=fillrandom,readrandom, \
|
||||
--db=$DATA_DIR \
|
||||
--use_existing_db=0 \
|
||||
--num=$((NUM / 10)) \
|
||||
--reads=$NUM \
|
||||
--cache_size=6442450944 \
|
||||
--cache_numshardbits=6 \
|
||||
--table_cache_numshardbits=4 \
|
||||
--write_buffer_size=1000000000 \
|
||||
--open_files=55000 \
|
||||
--disable_seek_compaction=1 \
|
||||
--statistics=1 \
|
||||
--histogram=1 \
|
||||
--disable_data_sync=1 \
|
||||
--disable_wal=1 \
|
||||
--sync=0 \
|
||||
--value_size=10 \
|
||||
--threads=16 > ${STAT_FILE}.memtablefillreadrandom
|
||||
|
||||
# send data to ods
|
||||
function send_to_ods {
|
||||
key="$1"
|
||||
value="$2"
|
||||
|
||||
if [ -z $JENKINS_HOME ]; then
|
||||
# running on devbox, just print out the values
|
||||
echo $1 $2
|
||||
return
|
||||
fi
|
||||
|
||||
if [ -z "$value" ];then
|
||||
echo >&2 "ERROR: Key $key doesn't have a value."
|
||||
return
|
||||
fi
|
||||
curl -s "https://www.intern.facebook.com/intern/agent/ods_set.php?entity=rocksdb_build$git_br&key=$key&value=$value" \
|
||||
--connect-timeout 60
|
||||
}
|
||||
|
||||
function send_benchmark_to_ods {
|
||||
bench="$1"
|
||||
bench_key="$2"
|
||||
file="$3"
|
||||
|
||||
QPS=$(grep $bench $file | awk '{print $5}')
|
||||
P50_MICROS=$(grep $bench $file -A 6 | grep "Percentiles" | awk '{print $3}' )
|
||||
P75_MICROS=$(grep $bench $file -A 6 | grep "Percentiles" | awk '{print $5}' )
|
||||
P99_MICROS=$(grep $bench $file -A 6 | grep "Percentiles" | awk '{print $7}' )
|
||||
|
||||
send_to_ods rocksdb.build.$bench_key.qps $QPS
|
||||
send_to_ods rocksdb.build.$bench_key.p50_micros $P50_MICROS
|
||||
send_to_ods rocksdb.build.$bench_key.p75_micros $P75_MICROS
|
||||
send_to_ods rocksdb.build.$bench_key.p99_micros $P99_MICROS
|
||||
}
|
||||
|
||||
send_benchmark_to_ods overwrite overwrite $STAT_FILE.overwrite
|
||||
send_benchmark_to_ods fillseq fillseq $STAT_FILE.fillseq
|
||||
send_benchmark_to_ods readrandom readrandom $STAT_FILE.readrandom
|
||||
send_benchmark_to_ods readrandom readrandom_tailing $STAT_FILE.readrandomtailing
|
||||
send_benchmark_to_ods readrandom readrandom_smallblockcache $STAT_FILE.readrandomsmallblockcache
|
||||
send_benchmark_to_ods readrandom readrandom_memtable_sst $STAT_FILE.readrandom_mem_sst
|
||||
send_benchmark_to_ods readrandom readrandom_fillunique_random $STAT_FILE.readrandom_filluniquerandom
|
||||
send_benchmark_to_ods fillrandom memtablefillrandom $STAT_FILE.memtablefillreadrandom
|
||||
send_benchmark_to_ods readrandom memtablereadrandom $STAT_FILE.memtablefillreadrandom
|
||||
send_benchmark_to_ods readwhilewriting readwhilewriting $STAT_FILE.readwhilewriting
|
||||
15
build_tools/valgrind_test.sh
Executable file
15
build_tools/valgrind_test.sh
Executable file
@@ -0,0 +1,15 @@
|
||||
#!/bin/bash
|
||||
#A shell script for Jenknis to run valgrind on rocksdb tests
|
||||
#Returns 0 on success when there are no failed tests
|
||||
|
||||
VALGRIND_DIR=build_tools/VALGRIND_LOGS
|
||||
make clean
|
||||
make -j$(nproc) valgrind_check
|
||||
NUM_FAILED_TESTS=$((`wc -l $VALGRIND_DIR/valgrind_failed_tests | awk '{print $1}'` - 1))
|
||||
if [ $NUM_FAILED_TESTS -lt 1 ]; then
|
||||
echo No tests have valgrind errors
|
||||
exit 0
|
||||
else
|
||||
cat $VALGRIND_DIR/valgrind_failed_tests
|
||||
exit 1
|
||||
fi
|
||||
78
coverage/coverage_test.sh
Executable file
78
coverage/coverage_test.sh
Executable file
@@ -0,0 +1,78 @@
|
||||
#!/bin/bash
|
||||
|
||||
# Exit on error.
|
||||
set -e
|
||||
|
||||
if [ -n "$USE_CLANG" ]; then
|
||||
echo "Error: Coverage test is supported only for gcc."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
ROOT=".."
|
||||
# Fetch right version of gcov
|
||||
if [ -d /mnt/gvfs/third-party -a -z "$CXX" ]; then
|
||||
source $ROOT/build_tools/fbcode.gcc471.sh
|
||||
GCOV=$TOOLCHAIN_EXECUTABLES/gcc/gcc-4.7.1/cc6c9dc/bin/gcov
|
||||
else
|
||||
GCOV=$(which gcov)
|
||||
fi
|
||||
|
||||
COVERAGE_DIR="$PWD/COVERAGE_REPORT"
|
||||
mkdir -p $COVERAGE_DIR
|
||||
|
||||
# Find all gcno files to generate the coverage report
|
||||
|
||||
GCNO_FILES=`find $ROOT -name "*.gcno"`
|
||||
$GCOV --preserve-paths --relative-only --no-output $GCNO_FILES 2>/dev/null |
|
||||
# Parse the raw gcov report to more human readable form.
|
||||
python $ROOT/coverage/parse_gcov_output.py |
|
||||
# Write the output to both stdout and report file.
|
||||
tee $COVERAGE_DIR/coverage_report_all.txt &&
|
||||
echo -e "Generated coverage report for all files: $COVERAGE_DIR/coverage_report_all.txt\n"
|
||||
|
||||
# TODO: we also need to get the files of the latest commits.
|
||||
# Get the most recently committed files.
|
||||
LATEST_FILES=`
|
||||
git show --pretty="format:" --name-only HEAD |
|
||||
grep -v "^$" |
|
||||
paste -s -d,`
|
||||
RECENT_REPORT=$COVERAGE_DIR/coverage_report_recent.txt
|
||||
|
||||
echo -e "Recently updated files: $LATEST_FILES\n" > $RECENT_REPORT
|
||||
$GCOV --preserve-paths --relative-only --no-output $GCNO_FILES 2>/dev/null |
|
||||
python $ROOT/coverage/parse_gcov_output.py -interested-files $LATEST_FILES |
|
||||
tee -a $RECENT_REPORT &&
|
||||
echo -e "Generated coverage report for recently updated files: $RECENT_REPORT\n"
|
||||
|
||||
# Unless otherwise specified, we'll not generate html report by default
|
||||
if [ -z "$HTML" ]; then
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# Generate the html report. If we cannot find lcov in this machine, we'll simply
|
||||
# skip this step.
|
||||
echo "Generating the html coverage report..."
|
||||
|
||||
LCOV=$(which lcov || true 2>/dev/null)
|
||||
if [ -z $LCOV ]
|
||||
then
|
||||
echo "Skip: Cannot find lcov to generate the html report."
|
||||
exit 0
|
||||
fi
|
||||
|
||||
LCOV_VERSION=$(lcov -v | grep 1.1 || true)
|
||||
if [ $LCOV_VERSION ]
|
||||
then
|
||||
echo "Not supported lcov version. Expect lcov 1.1."
|
||||
exit 0
|
||||
fi
|
||||
|
||||
(cd $ROOT; lcov --no-external \
|
||||
--capture \
|
||||
--directory $PWD \
|
||||
--gcov-tool $GCOV \
|
||||
--output-file $COVERAGE_DIR/coverage.info)
|
||||
|
||||
genhtml $COVERAGE_DIR/coverage.info -o $COVERAGE_DIR
|
||||
|
||||
echo "HTML Coverage report is generated in $COVERAGE_DIR"
|
||||
118
coverage/parse_gcov_output.py
Normal file
118
coverage/parse_gcov_output.py
Normal file
@@ -0,0 +1,118 @@
|
||||
import optparse
|
||||
import re
|
||||
import sys
|
||||
|
||||
from optparse import OptionParser
|
||||
|
||||
# the gcov report follows certain pattern. Each file will have two lines
|
||||
# of report, from which we can extract the file name, total lines and coverage
|
||||
# percentage.
|
||||
def parse_gcov_report(gcov_input):
|
||||
per_file_coverage = {}
|
||||
total_coverage = None
|
||||
|
||||
for line in sys.stdin:
|
||||
line = line.strip()
|
||||
|
||||
# --First line of the coverage report (with file name in it)?
|
||||
match_obj = re.match("^File '(.*)'$", line)
|
||||
if match_obj:
|
||||
# fetch the file name from the first line of the report.
|
||||
current_file = match_obj.group(1)
|
||||
continue
|
||||
|
||||
# -- Second line of the file report (with coverage percentage)
|
||||
match_obj = re.match("^Lines executed:(.*)% of (.*)", line)
|
||||
|
||||
if match_obj:
|
||||
coverage = float(match_obj.group(1))
|
||||
lines = int(match_obj.group(2))
|
||||
|
||||
if current_file is not None:
|
||||
per_file_coverage[current_file] = (coverage, lines)
|
||||
current_file = None
|
||||
else:
|
||||
# If current_file is not set, we reach the last line of report,
|
||||
# which contains the summarized coverage percentage.
|
||||
total_coverage = (coverage, lines)
|
||||
continue
|
||||
|
||||
# If the line's pattern doesn't fall into the above categories. We
|
||||
# can simply ignore them since they're either empty line or doesn't
|
||||
# find executable lines of the given file.
|
||||
current_file = None
|
||||
|
||||
return per_file_coverage, total_coverage
|
||||
|
||||
def get_option_parser():
|
||||
usage = "Parse the gcov output and generate more human-readable code " +\
|
||||
"coverage report."
|
||||
parser = OptionParser(usage)
|
||||
|
||||
parser.add_option(
|
||||
"--interested-files", "-i",
|
||||
dest="filenames",
|
||||
help="Comma separated files names. if specified, we will display " +
|
||||
"the coverage report only for interested source files. " +
|
||||
"Otherwise we will display the coverage report for all " +
|
||||
"source files."
|
||||
)
|
||||
return parser
|
||||
|
||||
def display_file_coverage(per_file_coverage, total_coverage):
|
||||
# To print out auto-adjustable column, we need to know the longest
|
||||
# length of file names.
|
||||
max_file_name_length = max(
|
||||
len(fname) for fname in per_file_coverage.keys()
|
||||
)
|
||||
|
||||
# -- Print header
|
||||
# size of separator is determined by 3 column sizes:
|
||||
# file name, coverage percentage and lines.
|
||||
header_template = \
|
||||
"%" + str(max_file_name_length) + "s\t%s\t%s"
|
||||
separator = "-" * (max_file_name_length + 10 + 20)
|
||||
print header_template % ("Filename", "Coverage", "Lines")
|
||||
print separator
|
||||
|
||||
# -- Print body
|
||||
# template for printing coverage report for each file.
|
||||
record_template = "%" + str(max_file_name_length) + "s\t%5.2f%%\t%10d"
|
||||
|
||||
for fname, coverage_info in per_file_coverage.items():
|
||||
coverage, lines = coverage_info
|
||||
print record_template % (fname, coverage, lines)
|
||||
|
||||
# -- Print footer
|
||||
if total_coverage:
|
||||
print separator
|
||||
print record_template % ("Total", total_coverage[0], total_coverage[1])
|
||||
|
||||
def report_coverage():
|
||||
parser = get_option_parser()
|
||||
(options, args) = parser.parse_args()
|
||||
|
||||
interested_files = set()
|
||||
if options.filenames is not None:
|
||||
interested_files = set(f.strip() for f in options.filenames.split(','))
|
||||
|
||||
# To make things simple, right now we only read gcov report from the input
|
||||
per_file_coverage, total_coverage = parse_gcov_report(sys.stdin)
|
||||
|
||||
# Check if we need to display coverage info for interested files.
|
||||
if len(interested_files):
|
||||
per_file_coverage = dict(
|
||||
(fname, per_file_coverage[fname]) for fname in interested_files
|
||||
if fname in per_file_coverage
|
||||
)
|
||||
# If we only interested in several files, it makes no sense to report
|
||||
# the total_coverage
|
||||
total_coverage = None
|
||||
|
||||
if not len(per_file_coverage):
|
||||
print >> sys.stderr, "Cannot find coverage info for the given files."
|
||||
return
|
||||
display_file_coverage(per_file_coverage, total_coverage)
|
||||
|
||||
if __name__ == "__main__":
|
||||
report_coverage()
|
||||
224
db/builder.cc
Normal file
224
db/builder.cc
Normal file
@@ -0,0 +1,224 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include "db/builder.h"
|
||||
|
||||
#include "db/dbformat.h"
|
||||
#include "db/filename.h"
|
||||
#include "db/merge_helper.h"
|
||||
#include "db/table_cache.h"
|
||||
#include "db/version_edit.h"
|
||||
#include "rocksdb/db.h"
|
||||
#include "rocksdb/env.h"
|
||||
#include "rocksdb/iterator.h"
|
||||
#include "rocksdb/options.h"
|
||||
#include "rocksdb/table.h"
|
||||
#include "table/block_based_table_builder.h"
|
||||
#include "util/stop_watch.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
class TableFactory;
|
||||
|
||||
TableBuilder* NewTableBuilder(const Options& options,
|
||||
const InternalKeyComparator& internal_comparator,
|
||||
WritableFile* file,
|
||||
CompressionType compression_type) {
|
||||
return options.table_factory->NewTableBuilder(options, internal_comparator,
|
||||
file, compression_type);
|
||||
}
|
||||
|
||||
Status BuildTable(const std::string& dbname, Env* env, const Options& options,
|
||||
const EnvOptions& soptions, TableCache* table_cache,
|
||||
Iterator* iter, FileMetaData* meta,
|
||||
const InternalKeyComparator& internal_comparator,
|
||||
const SequenceNumber newest_snapshot,
|
||||
const SequenceNumber earliest_seqno_in_memtable,
|
||||
const CompressionType compression) {
|
||||
Status s;
|
||||
meta->file_size = 0;
|
||||
meta->smallest_seqno = meta->largest_seqno = 0;
|
||||
iter->SeekToFirst();
|
||||
|
||||
// If the sequence number of the smallest entry in the memtable is
|
||||
// smaller than the most recent snapshot, then we do not trigger
|
||||
// removal of duplicate/deleted keys as part of this builder.
|
||||
bool purge = options.purge_redundant_kvs_while_flush;
|
||||
if (earliest_seqno_in_memtable <= newest_snapshot) {
|
||||
purge = false;
|
||||
}
|
||||
|
||||
std::string fname = TableFileName(dbname, meta->number);
|
||||
if (iter->Valid()) {
|
||||
unique_ptr<WritableFile> file;
|
||||
s = env->NewWritableFile(fname, &file, soptions);
|
||||
if (!s.ok()) {
|
||||
return s;
|
||||
}
|
||||
|
||||
TableBuilder* builder =
|
||||
NewTableBuilder(options, internal_comparator, file.get(), compression);
|
||||
|
||||
// the first key is the smallest key
|
||||
Slice key = iter->key();
|
||||
meta->smallest.DecodeFrom(key);
|
||||
meta->smallest_seqno = GetInternalKeySeqno(key);
|
||||
meta->largest_seqno = meta->smallest_seqno;
|
||||
|
||||
MergeHelper merge(internal_comparator.user_comparator(),
|
||||
options.merge_operator.get(), options.info_log.get(),
|
||||
options.min_partial_merge_operands,
|
||||
true /* internal key corruption is not ok */);
|
||||
|
||||
if (purge) {
|
||||
// Ugly walkaround to avoid compiler error for release build
|
||||
bool ok __attribute__((unused)) = true;
|
||||
|
||||
// Will write to builder if current key != prev key
|
||||
ParsedInternalKey prev_ikey;
|
||||
std::string prev_key;
|
||||
bool is_first_key = true; // Also write if this is the very first key
|
||||
|
||||
while (iter->Valid()) {
|
||||
bool iterator_at_next = false;
|
||||
|
||||
// Get current key
|
||||
ParsedInternalKey this_ikey;
|
||||
Slice key = iter->key();
|
||||
Slice value = iter->value();
|
||||
|
||||
// In-memory key corruption is not ok;
|
||||
// TODO: find a clean way to treat in memory key corruption
|
||||
ok = ParseInternalKey(key, &this_ikey);
|
||||
assert(ok);
|
||||
assert(this_ikey.sequence >= earliest_seqno_in_memtable);
|
||||
|
||||
// If the key is the same as the previous key (and it is not the
|
||||
// first key), then we skip it, since it is an older version.
|
||||
// Otherwise we output the key and mark it as the "new" previous key.
|
||||
if (!is_first_key && !internal_comparator.user_comparator()->Compare(
|
||||
prev_ikey.user_key, this_ikey.user_key)) {
|
||||
// seqno within the same key are in decreasing order
|
||||
assert(this_ikey.sequence < prev_ikey.sequence);
|
||||
} else {
|
||||
is_first_key = false;
|
||||
|
||||
if (this_ikey.type == kTypeMerge) {
|
||||
// Handle merge-type keys using the MergeHelper
|
||||
// TODO: pass statistics to MergeUntil
|
||||
merge.MergeUntil(iter, 0 /* don't worry about snapshot */);
|
||||
iterator_at_next = true;
|
||||
if (merge.IsSuccess()) {
|
||||
// Merge completed correctly.
|
||||
// Add the resulting merge key/value and continue to next
|
||||
builder->Add(merge.key(), merge.value());
|
||||
prev_key.assign(merge.key().data(), merge.key().size());
|
||||
ok = ParseInternalKey(Slice(prev_key), &prev_ikey);
|
||||
assert(ok);
|
||||
} else {
|
||||
// Merge did not find a Put/Delete.
|
||||
// Can not compact these merges into a kValueType.
|
||||
// Write them out one-by-one. (Proceed back() to front())
|
||||
const std::deque<std::string>& keys = merge.keys();
|
||||
const std::deque<std::string>& values = merge.values();
|
||||
assert(keys.size() == values.size() && keys.size() >= 1);
|
||||
std::deque<std::string>::const_reverse_iterator key_iter;
|
||||
std::deque<std::string>::const_reverse_iterator value_iter;
|
||||
for (key_iter=keys.rbegin(), value_iter = values.rbegin();
|
||||
key_iter != keys.rend() && value_iter != values.rend();
|
||||
++key_iter, ++value_iter) {
|
||||
|
||||
builder->Add(Slice(*key_iter), Slice(*value_iter));
|
||||
}
|
||||
|
||||
// Sanity check. Both iterators should end at the same time
|
||||
assert(key_iter == keys.rend() && value_iter == values.rend());
|
||||
|
||||
prev_key.assign(keys.front());
|
||||
ok = ParseInternalKey(Slice(prev_key), &prev_ikey);
|
||||
assert(ok);
|
||||
}
|
||||
} else {
|
||||
// Handle Put/Delete-type keys by simply writing them
|
||||
builder->Add(key, value);
|
||||
prev_key.assign(key.data(), key.size());
|
||||
ok = ParseInternalKey(Slice(prev_key), &prev_ikey);
|
||||
assert(ok);
|
||||
}
|
||||
}
|
||||
|
||||
if (!iterator_at_next) iter->Next();
|
||||
}
|
||||
|
||||
// The last key is the largest key
|
||||
meta->largest.DecodeFrom(Slice(prev_key));
|
||||
SequenceNumber seqno = GetInternalKeySeqno(Slice(prev_key));
|
||||
meta->smallest_seqno = std::min(meta->smallest_seqno, seqno);
|
||||
meta->largest_seqno = std::max(meta->largest_seqno, seqno);
|
||||
|
||||
} else {
|
||||
for (; iter->Valid(); iter->Next()) {
|
||||
Slice key = iter->key();
|
||||
meta->largest.DecodeFrom(key);
|
||||
builder->Add(key, iter->value());
|
||||
SequenceNumber seqno = GetInternalKeySeqno(key);
|
||||
meta->smallest_seqno = std::min(meta->smallest_seqno, seqno);
|
||||
meta->largest_seqno = std::max(meta->largest_seqno, seqno);
|
||||
}
|
||||
}
|
||||
|
||||
// Finish and check for builder errors
|
||||
if (s.ok()) {
|
||||
s = builder->Finish();
|
||||
if (s.ok()) {
|
||||
meta->file_size = builder->FileSize();
|
||||
assert(meta->file_size > 0);
|
||||
}
|
||||
} else {
|
||||
builder->Abandon();
|
||||
}
|
||||
delete builder;
|
||||
|
||||
// Finish and check for file errors
|
||||
if (s.ok() && !options.disableDataSync) {
|
||||
if (options.use_fsync) {
|
||||
StopWatch sw(env, options.statistics.get(), TABLE_SYNC_MICROS);
|
||||
s = file->Fsync();
|
||||
} else {
|
||||
StopWatch sw(env, options.statistics.get(), TABLE_SYNC_MICROS);
|
||||
s = file->Sync();
|
||||
}
|
||||
}
|
||||
if (s.ok()) {
|
||||
s = file->Close();
|
||||
}
|
||||
|
||||
if (s.ok()) {
|
||||
// Verify that the table is usable
|
||||
Iterator* it = table_cache->NewIterator(ReadOptions(), soptions,
|
||||
internal_comparator, *meta);
|
||||
s = it->status();
|
||||
delete it;
|
||||
}
|
||||
}
|
||||
|
||||
// Check for input iterator errors
|
||||
if (!iter->status().ok()) {
|
||||
s = iter->status();
|
||||
}
|
||||
|
||||
if (s.ok() && meta->file_size > 0) {
|
||||
// Keep it
|
||||
} else {
|
||||
env->DeleteFile(fname);
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
} // namespace rocksdb
|
||||
45
db/builder.h
Normal file
45
db/builder.h
Normal file
@@ -0,0 +1,45 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
#pragma once
|
||||
#include "rocksdb/comparator.h"
|
||||
#include "rocksdb/status.h"
|
||||
#include "rocksdb/types.h"
|
||||
#include "rocksdb/options.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
struct Options;
|
||||
struct FileMetaData;
|
||||
|
||||
class Env;
|
||||
struct EnvOptions;
|
||||
class Iterator;
|
||||
class TableCache;
|
||||
class VersionEdit;
|
||||
class TableBuilder;
|
||||
class WritableFile;
|
||||
|
||||
extern TableBuilder* NewTableBuilder(
|
||||
const Options& options, const InternalKeyComparator& internal_comparator,
|
||||
WritableFile* file, CompressionType compression_type);
|
||||
|
||||
// Build a Table file from the contents of *iter. The generated file
|
||||
// will be named according to meta->number. On success, the rest of
|
||||
// *meta will be filled with metadata about the generated table.
|
||||
// If no data is present in *iter, meta->file_size will be set to
|
||||
// zero, and no Table file will be produced.
|
||||
extern Status BuildTable(const std::string& dbname, Env* env,
|
||||
const Options& options, const EnvOptions& soptions,
|
||||
TableCache* table_cache, Iterator* iter,
|
||||
FileMetaData* meta,
|
||||
const InternalKeyComparator& internal_comparator,
|
||||
const SequenceNumber newest_snapshot,
|
||||
const SequenceNumber earliest_seqno_in_memtable,
|
||||
const CompressionType compression);
|
||||
|
||||
} // namespace rocksdb
|
||||
494
db/c_test.c
Normal file
494
db/c_test.c
Normal file
@@ -0,0 +1,494 @@
|
||||
/* Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
Use of this source code is governed by a BSD-style license that can be
|
||||
found in the LICENSE file. See the AUTHORS file for names of contributors. */
|
||||
|
||||
#include "rocksdb/c.h"
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <sys/types.h>
|
||||
#include <unistd.h>
|
||||
|
||||
const char* phase = "";
|
||||
static char dbname[200];
|
||||
|
||||
static void StartPhase(const char* name) {
|
||||
fprintf(stderr, "=== Test %s\n", name);
|
||||
phase = name;
|
||||
}
|
||||
|
||||
static const char* GetTempDir(void) {
|
||||
const char* ret = getenv("TEST_TMPDIR");
|
||||
if (ret == NULL || ret[0] == '\0')
|
||||
ret = "/tmp";
|
||||
return ret;
|
||||
}
|
||||
|
||||
#define CheckNoError(err) \
|
||||
if ((err) != NULL) { \
|
||||
fprintf(stderr, "%s:%d: %s: %s\n", __FILE__, __LINE__, phase, (err)); \
|
||||
abort(); \
|
||||
}
|
||||
|
||||
#define CheckCondition(cond) \
|
||||
if (!(cond)) { \
|
||||
fprintf(stderr, "%s:%d: %s: %s\n", __FILE__, __LINE__, phase, #cond); \
|
||||
abort(); \
|
||||
}
|
||||
|
||||
static void CheckEqual(const char* expected, const char* v, size_t n) {
|
||||
if (expected == NULL && v == NULL) {
|
||||
// ok
|
||||
} else if (expected != NULL && v != NULL && n == strlen(expected) &&
|
||||
memcmp(expected, v, n) == 0) {
|
||||
// ok
|
||||
return;
|
||||
} else {
|
||||
fprintf(stderr, "%s: expected '%s', got '%s'\n",
|
||||
phase,
|
||||
(expected ? expected : "(null)"),
|
||||
(v ? v : "(null"));
|
||||
abort();
|
||||
}
|
||||
}
|
||||
|
||||
static void Free(char** ptr) {
|
||||
if (*ptr) {
|
||||
free(*ptr);
|
||||
*ptr = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
static void CheckGet(
|
||||
rocksdb_t* db,
|
||||
const rocksdb_readoptions_t* options,
|
||||
const char* key,
|
||||
const char* expected) {
|
||||
char* err = NULL;
|
||||
size_t val_len;
|
||||
char* val;
|
||||
val = rocksdb_get(db, options, key, strlen(key), &val_len, &err);
|
||||
CheckNoError(err);
|
||||
CheckEqual(expected, val, val_len);
|
||||
Free(&val);
|
||||
}
|
||||
|
||||
static void CheckIter(rocksdb_iterator_t* iter,
|
||||
const char* key, const char* val) {
|
||||
size_t len;
|
||||
const char* str;
|
||||
str = rocksdb_iter_key(iter, &len);
|
||||
CheckEqual(key, str, len);
|
||||
str = rocksdb_iter_value(iter, &len);
|
||||
CheckEqual(val, str, len);
|
||||
}
|
||||
|
||||
// Callback from rocksdb_writebatch_iterate()
|
||||
static void CheckPut(void* ptr,
|
||||
const char* k, size_t klen,
|
||||
const char* v, size_t vlen) {
|
||||
int* state = (int*) ptr;
|
||||
CheckCondition(*state < 2);
|
||||
switch (*state) {
|
||||
case 0:
|
||||
CheckEqual("bar", k, klen);
|
||||
CheckEqual("b", v, vlen);
|
||||
break;
|
||||
case 1:
|
||||
CheckEqual("box", k, klen);
|
||||
CheckEqual("c", v, vlen);
|
||||
break;
|
||||
}
|
||||
(*state)++;
|
||||
}
|
||||
|
||||
// Callback from rocksdb_writebatch_iterate()
|
||||
static void CheckDel(void* ptr, const char* k, size_t klen) {
|
||||
int* state = (int*) ptr;
|
||||
CheckCondition(*state == 2);
|
||||
CheckEqual("bar", k, klen);
|
||||
(*state)++;
|
||||
}
|
||||
|
||||
static void CmpDestroy(void* arg) { }
|
||||
|
||||
static int CmpCompare(void* arg, const char* a, size_t alen,
|
||||
const char* b, size_t blen) {
|
||||
int n = (alen < blen) ? alen : blen;
|
||||
int r = memcmp(a, b, n);
|
||||
if (r == 0) {
|
||||
if (alen < blen) r = -1;
|
||||
else if (alen > blen) r = +1;
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
||||
static const char* CmpName(void* arg) {
|
||||
return "foo";
|
||||
}
|
||||
|
||||
// Custom filter policy
|
||||
static unsigned char fake_filter_result = 1;
|
||||
static void FilterDestroy(void* arg) { }
|
||||
static const char* FilterName(void* arg) {
|
||||
return "TestFilter";
|
||||
}
|
||||
static char* FilterCreate(
|
||||
void* arg,
|
||||
const char* const* key_array, const size_t* key_length_array,
|
||||
int num_keys,
|
||||
size_t* filter_length) {
|
||||
*filter_length = 4;
|
||||
char* result = malloc(4);
|
||||
memcpy(result, "fake", 4);
|
||||
return result;
|
||||
}
|
||||
static unsigned char FilterKeyMatch(
|
||||
void* arg,
|
||||
const char* key, size_t length,
|
||||
const char* filter, size_t filter_length) {
|
||||
CheckCondition(filter_length == 4);
|
||||
CheckCondition(memcmp(filter, "fake", 4) == 0);
|
||||
return fake_filter_result;
|
||||
}
|
||||
|
||||
// Custom merge operator
|
||||
static void MergeOperatorDestroy(void* arg) { }
|
||||
static const char* MergeOperatorName(void* arg) {
|
||||
return "TestMergeOperator";
|
||||
}
|
||||
static char* MergeOperatorFullMerge(
|
||||
void* arg,
|
||||
const char* key, size_t key_length,
|
||||
const char* existing_value, size_t existing_value_length,
|
||||
const char* const* operands_list, const size_t* operands_list_length,
|
||||
int num_operands,
|
||||
unsigned char* success, size_t* new_value_length) {
|
||||
*new_value_length = 4;
|
||||
*success = 1;
|
||||
char* result = malloc(4);
|
||||
memcpy(result, "fake", 4);
|
||||
return result;
|
||||
}
|
||||
static char* MergeOperatorPartialMerge(
|
||||
void* arg,
|
||||
const char* key, size_t key_length,
|
||||
const char* const* operands_list, const size_t* operands_list_length,
|
||||
int num_operands,
|
||||
unsigned char* success, size_t* new_value_length) {
|
||||
*new_value_length = 4;
|
||||
*success = 1;
|
||||
char* result = malloc(4);
|
||||
memcpy(result, "fake", 4);
|
||||
return result;
|
||||
}
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
rocksdb_t* db;
|
||||
rocksdb_comparator_t* cmp;
|
||||
rocksdb_cache_t* cache;
|
||||
rocksdb_env_t* env;
|
||||
rocksdb_options_t* options;
|
||||
rocksdb_readoptions_t* roptions;
|
||||
rocksdb_writeoptions_t* woptions;
|
||||
char* err = NULL;
|
||||
int run = -1;
|
||||
|
||||
snprintf(dbname, sizeof(dbname),
|
||||
"%s/rocksdb_c_test-%d",
|
||||
GetTempDir(),
|
||||
((int) geteuid()));
|
||||
|
||||
StartPhase("create_objects");
|
||||
cmp = rocksdb_comparator_create(NULL, CmpDestroy, CmpCompare, CmpName);
|
||||
env = rocksdb_create_default_env();
|
||||
cache = rocksdb_cache_create_lru(100000);
|
||||
|
||||
options = rocksdb_options_create();
|
||||
rocksdb_options_set_comparator(options, cmp);
|
||||
rocksdb_options_set_error_if_exists(options, 1);
|
||||
rocksdb_options_set_cache(options, cache);
|
||||
rocksdb_options_set_env(options, env);
|
||||
rocksdb_options_set_info_log(options, NULL);
|
||||
rocksdb_options_set_write_buffer_size(options, 100000);
|
||||
rocksdb_options_set_paranoid_checks(options, 1);
|
||||
rocksdb_options_set_max_open_files(options, 10);
|
||||
rocksdb_options_set_block_size(options, 1024);
|
||||
rocksdb_options_set_block_restart_interval(options, 8);
|
||||
rocksdb_options_set_compression(options, rocksdb_no_compression);
|
||||
rocksdb_options_set_compression_options(options, -14, -1, 0);
|
||||
int compression_levels[] = {rocksdb_no_compression, rocksdb_no_compression,
|
||||
rocksdb_no_compression, rocksdb_no_compression};
|
||||
rocksdb_options_set_compression_per_level(options, compression_levels, 4);
|
||||
|
||||
roptions = rocksdb_readoptions_create();
|
||||
rocksdb_readoptions_set_verify_checksums(roptions, 1);
|
||||
rocksdb_readoptions_set_fill_cache(roptions, 0);
|
||||
|
||||
woptions = rocksdb_writeoptions_create();
|
||||
rocksdb_writeoptions_set_sync(woptions, 1);
|
||||
|
||||
StartPhase("destroy");
|
||||
rocksdb_destroy_db(options, dbname, &err);
|
||||
Free(&err);
|
||||
|
||||
StartPhase("open_error");
|
||||
db = rocksdb_open(options, dbname, &err);
|
||||
CheckCondition(err != NULL);
|
||||
Free(&err);
|
||||
|
||||
StartPhase("open");
|
||||
rocksdb_options_set_create_if_missing(options, 1);
|
||||
db = rocksdb_open(options, dbname, &err);
|
||||
CheckNoError(err);
|
||||
CheckGet(db, roptions, "foo", NULL);
|
||||
|
||||
StartPhase("put");
|
||||
rocksdb_put(db, woptions, "foo", 3, "hello", 5, &err);
|
||||
CheckNoError(err);
|
||||
CheckGet(db, roptions, "foo", "hello");
|
||||
|
||||
StartPhase("compactall");
|
||||
rocksdb_compact_range(db, NULL, 0, NULL, 0);
|
||||
CheckGet(db, roptions, "foo", "hello");
|
||||
|
||||
StartPhase("compactrange");
|
||||
rocksdb_compact_range(db, "a", 1, "z", 1);
|
||||
CheckGet(db, roptions, "foo", "hello");
|
||||
|
||||
StartPhase("writebatch");
|
||||
{
|
||||
rocksdb_writebatch_t* wb = rocksdb_writebatch_create();
|
||||
rocksdb_writebatch_put(wb, "foo", 3, "a", 1);
|
||||
rocksdb_writebatch_clear(wb);
|
||||
rocksdb_writebatch_put(wb, "bar", 3, "b", 1);
|
||||
rocksdb_writebatch_put(wb, "box", 3, "c", 1);
|
||||
rocksdb_writebatch_delete(wb, "bar", 3);
|
||||
rocksdb_write(db, woptions, wb, &err);
|
||||
CheckNoError(err);
|
||||
CheckGet(db, roptions, "foo", "hello");
|
||||
CheckGet(db, roptions, "bar", NULL);
|
||||
CheckGet(db, roptions, "box", "c");
|
||||
int pos = 0;
|
||||
rocksdb_writebatch_iterate(wb, &pos, CheckPut, CheckDel);
|
||||
CheckCondition(pos == 3);
|
||||
rocksdb_writebatch_destroy(wb);
|
||||
}
|
||||
|
||||
StartPhase("iter");
|
||||
{
|
||||
rocksdb_iterator_t* iter = rocksdb_create_iterator(db, roptions);
|
||||
CheckCondition(!rocksdb_iter_valid(iter));
|
||||
rocksdb_iter_seek_to_first(iter);
|
||||
CheckCondition(rocksdb_iter_valid(iter));
|
||||
CheckIter(iter, "box", "c");
|
||||
rocksdb_iter_next(iter);
|
||||
CheckIter(iter, "foo", "hello");
|
||||
rocksdb_iter_prev(iter);
|
||||
CheckIter(iter, "box", "c");
|
||||
rocksdb_iter_prev(iter);
|
||||
CheckCondition(!rocksdb_iter_valid(iter));
|
||||
rocksdb_iter_seek_to_last(iter);
|
||||
CheckIter(iter, "foo", "hello");
|
||||
rocksdb_iter_seek(iter, "b", 1);
|
||||
CheckIter(iter, "box", "c");
|
||||
rocksdb_iter_get_error(iter, &err);
|
||||
CheckNoError(err);
|
||||
rocksdb_iter_destroy(iter);
|
||||
}
|
||||
|
||||
StartPhase("approximate_sizes");
|
||||
{
|
||||
int i;
|
||||
int n = 20000;
|
||||
char keybuf[100];
|
||||
char valbuf[100];
|
||||
uint64_t sizes[2];
|
||||
const char* start[2] = { "a", "k00000000000000010000" };
|
||||
size_t start_len[2] = { 1, 21 };
|
||||
const char* limit[2] = { "k00000000000000010000", "z" };
|
||||
size_t limit_len[2] = { 21, 1 };
|
||||
rocksdb_writeoptions_set_sync(woptions, 0);
|
||||
for (i = 0; i < n; i++) {
|
||||
snprintf(keybuf, sizeof(keybuf), "k%020d", i);
|
||||
snprintf(valbuf, sizeof(valbuf), "v%020d", i);
|
||||
rocksdb_put(db, woptions, keybuf, strlen(keybuf), valbuf, strlen(valbuf),
|
||||
&err);
|
||||
CheckNoError(err);
|
||||
}
|
||||
rocksdb_approximate_sizes(db, 2, start, start_len, limit, limit_len, sizes);
|
||||
CheckCondition(sizes[0] > 0);
|
||||
CheckCondition(sizes[1] > 0);
|
||||
}
|
||||
|
||||
StartPhase("property");
|
||||
{
|
||||
char* prop = rocksdb_property_value(db, "nosuchprop");
|
||||
CheckCondition(prop == NULL);
|
||||
prop = rocksdb_property_value(db, "rocksdb.stats");
|
||||
CheckCondition(prop != NULL);
|
||||
Free(&prop);
|
||||
}
|
||||
|
||||
StartPhase("snapshot");
|
||||
{
|
||||
const rocksdb_snapshot_t* snap;
|
||||
snap = rocksdb_create_snapshot(db);
|
||||
rocksdb_delete(db, woptions, "foo", 3, &err);
|
||||
CheckNoError(err);
|
||||
rocksdb_readoptions_set_snapshot(roptions, snap);
|
||||
CheckGet(db, roptions, "foo", "hello");
|
||||
rocksdb_readoptions_set_snapshot(roptions, NULL);
|
||||
CheckGet(db, roptions, "foo", NULL);
|
||||
rocksdb_release_snapshot(db, snap);
|
||||
}
|
||||
|
||||
StartPhase("repair");
|
||||
{
|
||||
// If we do not compact here, then the lazy deletion of
|
||||
// files (https://reviews.facebook.net/D6123) would leave
|
||||
// around deleted files and the repair process will find
|
||||
// those files and put them back into the database.
|
||||
rocksdb_compact_range(db, NULL, 0, NULL, 0);
|
||||
rocksdb_close(db);
|
||||
rocksdb_options_set_create_if_missing(options, 0);
|
||||
rocksdb_options_set_error_if_exists(options, 0);
|
||||
rocksdb_repair_db(options, dbname, &err);
|
||||
CheckNoError(err);
|
||||
db = rocksdb_open(options, dbname, &err);
|
||||
CheckNoError(err);
|
||||
CheckGet(db, roptions, "foo", NULL);
|
||||
CheckGet(db, roptions, "bar", NULL);
|
||||
CheckGet(db, roptions, "box", "c");
|
||||
rocksdb_options_set_create_if_missing(options, 1);
|
||||
rocksdb_options_set_error_if_exists(options, 1);
|
||||
}
|
||||
|
||||
StartPhase("filter");
|
||||
for (run = 0; run < 2; run++) {
|
||||
// First run uses custom filter, second run uses bloom filter
|
||||
CheckNoError(err);
|
||||
rocksdb_filterpolicy_t* policy;
|
||||
if (run == 0) {
|
||||
policy = rocksdb_filterpolicy_create(
|
||||
NULL, FilterDestroy, FilterCreate, FilterKeyMatch, NULL, FilterName);
|
||||
} else {
|
||||
policy = rocksdb_filterpolicy_create_bloom(10);
|
||||
}
|
||||
|
||||
// Create new database
|
||||
rocksdb_close(db);
|
||||
rocksdb_destroy_db(options, dbname, &err);
|
||||
rocksdb_options_set_filter_policy(options, policy);
|
||||
db = rocksdb_open(options, dbname, &err);
|
||||
CheckNoError(err);
|
||||
rocksdb_put(db, woptions, "foo", 3, "foovalue", 8, &err);
|
||||
CheckNoError(err);
|
||||
rocksdb_put(db, woptions, "bar", 3, "barvalue", 8, &err);
|
||||
CheckNoError(err);
|
||||
rocksdb_compact_range(db, NULL, 0, NULL, 0);
|
||||
|
||||
fake_filter_result = 1;
|
||||
CheckGet(db, roptions, "foo", "foovalue");
|
||||
CheckGet(db, roptions, "bar", "barvalue");
|
||||
if (phase == 0) {
|
||||
// Must not find value when custom filter returns false
|
||||
fake_filter_result = 0;
|
||||
CheckGet(db, roptions, "foo", NULL);
|
||||
CheckGet(db, roptions, "bar", NULL);
|
||||
fake_filter_result = 1;
|
||||
|
||||
CheckGet(db, roptions, "foo", "foovalue");
|
||||
CheckGet(db, roptions, "bar", "barvalue");
|
||||
}
|
||||
rocksdb_options_set_filter_policy(options, NULL);
|
||||
rocksdb_filterpolicy_destroy(policy);
|
||||
}
|
||||
|
||||
StartPhase("merge_operator");
|
||||
{
|
||||
rocksdb_mergeoperator_t* merge_operator;
|
||||
merge_operator = rocksdb_mergeoperator_create(
|
||||
NULL, MergeOperatorDestroy, MergeOperatorFullMerge,
|
||||
MergeOperatorPartialMerge, NULL, MergeOperatorName);
|
||||
// Create new database
|
||||
rocksdb_close(db);
|
||||
rocksdb_destroy_db(options, dbname, &err);
|
||||
rocksdb_options_set_merge_operator(options, merge_operator);
|
||||
db = rocksdb_open(options, dbname, &err);
|
||||
CheckNoError(err);
|
||||
rocksdb_put(db, woptions, "foo", 3, "foovalue", 8, &err);
|
||||
CheckNoError(err);
|
||||
CheckGet(db, roptions, "foo", "foovalue");
|
||||
rocksdb_merge(db, woptions, "foo", 3, "barvalue", 8, &err);
|
||||
CheckNoError(err);
|
||||
CheckGet(db, roptions, "foo", "fake");
|
||||
|
||||
// Merge of a non-existing value
|
||||
rocksdb_merge(db, woptions, "bar", 3, "barvalue", 8, &err);
|
||||
CheckNoError(err);
|
||||
CheckGet(db, roptions, "bar", "fake");
|
||||
|
||||
}
|
||||
|
||||
StartPhase("prefix");
|
||||
{
|
||||
// Create new database
|
||||
rocksdb_close(db);
|
||||
rocksdb_destroy_db(options, dbname, &err);
|
||||
|
||||
rocksdb_filterpolicy_t* policy = rocksdb_filterpolicy_create_bloom(10);
|
||||
rocksdb_options_set_filter_policy(options, policy);
|
||||
rocksdb_options_set_prefix_extractor(options, rocksdb_slicetransform_create_fixed_prefix(3));
|
||||
rocksdb_options_set_hash_skip_list_rep(options, 50000, 4, 4);
|
||||
rocksdb_options_set_plain_table_factory(options, 4, 10, 0.75, 16);
|
||||
|
||||
db = rocksdb_open(options, dbname, &err);
|
||||
CheckNoError(err);
|
||||
|
||||
rocksdb_put(db, woptions, "foo1", 4, "foo", 3, &err);
|
||||
CheckNoError(err);
|
||||
rocksdb_put(db, woptions, "foo2", 4, "foo", 3, &err);
|
||||
CheckNoError(err);
|
||||
rocksdb_put(db, woptions, "foo3", 4, "foo", 3, &err);
|
||||
CheckNoError(err);
|
||||
rocksdb_put(db, woptions, "bar1", 4, "bar", 3, &err);
|
||||
CheckNoError(err);
|
||||
rocksdb_put(db, woptions, "bar2", 4, "bar", 3, &err);
|
||||
CheckNoError(err);
|
||||
rocksdb_put(db, woptions, "bar3", 4, "bar", 3, &err);
|
||||
CheckNoError(err);
|
||||
|
||||
rocksdb_iterator_t* iter = rocksdb_create_iterator(db, roptions);
|
||||
CheckCondition(!rocksdb_iter_valid(iter));
|
||||
|
||||
rocksdb_iter_seek(iter, "bar", 3);
|
||||
rocksdb_iter_get_error(iter, &err);
|
||||
CheckNoError(err);
|
||||
CheckCondition(rocksdb_iter_valid(iter));
|
||||
|
||||
CheckIter(iter, "bar1", "bar");
|
||||
rocksdb_iter_next(iter);
|
||||
CheckIter(iter, "bar2", "bar");
|
||||
rocksdb_iter_next(iter);
|
||||
CheckIter(iter, "bar3", "bar");
|
||||
rocksdb_iter_get_error(iter, &err);
|
||||
CheckNoError(err);
|
||||
rocksdb_iter_destroy(iter);
|
||||
rocksdb_filterpolicy_destroy(policy);
|
||||
}
|
||||
|
||||
StartPhase("cleanup");
|
||||
rocksdb_close(db);
|
||||
rocksdb_options_destroy(options);
|
||||
rocksdb_readoptions_destroy(roptions);
|
||||
rocksdb_writeoptions_destroy(woptions);
|
||||
rocksdb_cache_destroy(cache);
|
||||
rocksdb_comparator_destroy(cmp);
|
||||
rocksdb_env_destroy(env);
|
||||
|
||||
fprintf(stderr, "PASS\n");
|
||||
return 0;
|
||||
}
|
||||
604
db/column_family.cc
Normal file
604
db/column_family.cc
Normal file
@@ -0,0 +1,604 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include "db/column_family.h"
|
||||
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <algorithm>
|
||||
#include <limits>
|
||||
|
||||
#include "db/db_impl.h"
|
||||
#include "db/version_set.h"
|
||||
#include "db/internal_stats.h"
|
||||
#include "db/compaction_picker.h"
|
||||
#include "db/table_properties_collector.h"
|
||||
#include "util/autovector.h"
|
||||
#include "util/hash_skiplist_rep.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
ColumnFamilyHandleImpl::ColumnFamilyHandleImpl(ColumnFamilyData* cfd,
|
||||
DBImpl* db, port::Mutex* mutex)
|
||||
: cfd_(cfd), db_(db), mutex_(mutex) {
|
||||
if (cfd_ != nullptr) {
|
||||
cfd_->Ref();
|
||||
}
|
||||
}
|
||||
|
||||
ColumnFamilyHandleImpl::~ColumnFamilyHandleImpl() {
|
||||
if (cfd_ != nullptr) {
|
||||
DBImpl::DeletionState deletion_state;
|
||||
mutex_->Lock();
|
||||
if (cfd_->Unref()) {
|
||||
delete cfd_;
|
||||
}
|
||||
db_->FindObsoleteFiles(deletion_state, false, true);
|
||||
mutex_->Unlock();
|
||||
if (deletion_state.HaveSomethingToDelete()) {
|
||||
db_->PurgeObsoleteFiles(deletion_state);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
uint32_t ColumnFamilyHandleImpl::GetID() const { return cfd()->GetID(); }
|
||||
|
||||
namespace {
|
||||
// Fix user-supplied options to be reasonable
|
||||
template <class T, class V>
|
||||
static void ClipToRange(T* ptr, V minvalue, V maxvalue) {
|
||||
if (static_cast<V>(*ptr) > maxvalue) *ptr = maxvalue;
|
||||
if (static_cast<V>(*ptr) < minvalue) *ptr = minvalue;
|
||||
}
|
||||
} // anonymous namespace
|
||||
|
||||
ColumnFamilyOptions SanitizeOptions(const InternalKeyComparator* icmp,
|
||||
const InternalFilterPolicy* ipolicy,
|
||||
const ColumnFamilyOptions& src) {
|
||||
ColumnFamilyOptions result = src;
|
||||
result.comparator = icmp;
|
||||
result.filter_policy = (src.filter_policy != nullptr) ? ipolicy : nullptr;
|
||||
#ifdef OS_MACOSX
|
||||
// TODO(icanadi) make write_buffer_size uint64_t instead of size_t
|
||||
ClipToRange(&result.write_buffer_size, ((size_t)64) << 10, ((size_t)1) << 30);
|
||||
#else
|
||||
ClipToRange(&result.write_buffer_size,
|
||||
((size_t)64) << 10, ((size_t)64) << 30);
|
||||
#endif
|
||||
// if user sets arena_block_size, we trust user to use this value. Otherwise,
|
||||
// calculate a proper value from writer_buffer_size;
|
||||
if (result.arena_block_size <= 0) {
|
||||
result.arena_block_size = result.write_buffer_size / 10;
|
||||
}
|
||||
result.min_write_buffer_number_to_merge =
|
||||
std::min(result.min_write_buffer_number_to_merge,
|
||||
result.max_write_buffer_number - 1);
|
||||
if (result.block_cache == nullptr && !result.no_block_cache) {
|
||||
result.block_cache = NewLRUCache(8 << 20);
|
||||
}
|
||||
result.compression_per_level = src.compression_per_level;
|
||||
if (result.block_size_deviation < 0 || result.block_size_deviation > 100) {
|
||||
result.block_size_deviation = 0;
|
||||
}
|
||||
if (result.max_mem_compaction_level >= result.num_levels) {
|
||||
result.max_mem_compaction_level = result.num_levels - 1;
|
||||
}
|
||||
if (result.soft_rate_limit > result.hard_rate_limit) {
|
||||
result.soft_rate_limit = result.hard_rate_limit;
|
||||
}
|
||||
if (!result.prefix_extractor) {
|
||||
assert(result.memtable_factory);
|
||||
Slice name = result.memtable_factory->Name();
|
||||
if (name.compare("HashSkipListRepFactory") == 0 ||
|
||||
name.compare("HashLinkListRepFactory") == 0) {
|
||||
result.memtable_factory = std::make_shared<SkipListFactory>();
|
||||
}
|
||||
}
|
||||
|
||||
// -- Sanitize the table properties collector
|
||||
// All user defined properties collectors will be wrapped by
|
||||
// UserKeyTablePropertiesCollector since for them they only have the
|
||||
// knowledge of the user keys; internal keys are invisible to them.
|
||||
auto& collector_factories = result.table_properties_collector_factories;
|
||||
for (size_t i = 0; i < result.table_properties_collector_factories.size();
|
||||
++i) {
|
||||
assert(collector_factories[i]);
|
||||
collector_factories[i] =
|
||||
std::make_shared<UserKeyTablePropertiesCollectorFactory>(
|
||||
collector_factories[i]);
|
||||
}
|
||||
// Add collector to collect internal key statistics
|
||||
collector_factories.push_back(
|
||||
std::make_shared<InternalKeyPropertiesCollectorFactory>());
|
||||
|
||||
if (result.compaction_style == kCompactionStyleFIFO) {
|
||||
result.num_levels = 1;
|
||||
// since we delete level0 files in FIFO compaction when there are too many
|
||||
// of them, these options don't really mean anything
|
||||
result.level0_file_num_compaction_trigger = std::numeric_limits<int>::max();
|
||||
result.level0_slowdown_writes_trigger = std::numeric_limits<int>::max();
|
||||
result.level0_stop_writes_trigger = std::numeric_limits<int>::max();
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
int SuperVersion::dummy = 0;
|
||||
void* const SuperVersion::kSVInUse = &SuperVersion::dummy;
|
||||
void* const SuperVersion::kSVObsolete = nullptr;
|
||||
|
||||
SuperVersion::~SuperVersion() {
|
||||
for (auto td : to_delete) {
|
||||
delete td;
|
||||
}
|
||||
}
|
||||
|
||||
SuperVersion* SuperVersion::Ref() {
|
||||
refs.fetch_add(1, std::memory_order_relaxed);
|
||||
return this;
|
||||
}
|
||||
|
||||
bool SuperVersion::Unref() {
|
||||
// fetch_sub returns the previous value of ref
|
||||
uint32_t previous_refs = refs.fetch_sub(1, std::memory_order_relaxed);
|
||||
assert(previous_refs > 0);
|
||||
return previous_refs == 1;
|
||||
}
|
||||
|
||||
void SuperVersion::Cleanup() {
|
||||
assert(refs.load(std::memory_order_relaxed) == 0);
|
||||
imm->Unref(&to_delete);
|
||||
MemTable* m = mem->Unref();
|
||||
if (m != nullptr) {
|
||||
to_delete.push_back(m);
|
||||
}
|
||||
current->Unref();
|
||||
}
|
||||
|
||||
// Points this SuperVersion at the given memtable, immutable memtable list
// and version, taking one reference on each, and initializes its own
// refcount to 1 (the reference owned by the installer).
void SuperVersion::Init(MemTable* new_mem, MemTableListVersion* new_imm,
                        Version* new_current) {
  mem = new_mem;
  imm = new_imm;
  current = new_current;
  mem->Ref();
  imm->Ref();
  current->Ref();
  refs.store(1, std::memory_order_relaxed);
}
|
||||
|
||||
namespace {
// Destructor hook for the thread-local SuperVersion cache.
void SuperVersionUnrefHandle(void* ptr) {
  // UnrefHandle is called when a thread exits or a ThreadLocalPtr gets
  // destroyed. When former happens, the thread shouldn't see kSVInUse.
  // When latter happens, we are in ~ColumnFamilyData(), no get should happen as
  // well.
  SuperVersion* sv = static_cast<SuperVersion*>(ptr);
  if (sv->Unref()) {
    // Last reference dropped: Cleanup() requires the DB mutex, so take it
    // around the unref bookkeeping; the delete itself is safe outside it.
    sv->db_mutex->Lock();
    sv->Cleanup();
    sv->db_mutex->Unlock();
    delete sv;
  }
}
}  // anonymous namespace
|
||||
|
||||
// Constructs the per-column-family state. A nullptr dummy_versions marks the
// dummy CFD that heads ColumnFamilySet's circular list; for it, none of the
// heavyweight members (stats, table cache, compaction picker) are created.
ColumnFamilyData::ColumnFamilyData(const std::string& dbname, uint32_t id,
                                   const std::string& name,
                                   Version* dummy_versions, Cache* table_cache,
                                   const ColumnFamilyOptions& options,
                                   const DBOptions* db_options,
                                   const EnvOptions& storage_options,
                                   ColumnFamilySet* column_family_set)
    : id_(id),
      name_(name),
      dummy_versions_(dummy_versions),
      current_(nullptr),
      refs_(0),
      dropped_(false),
      internal_comparator_(options.comparator),
      internal_filter_policy_(options.filter_policy),
      // options_ is built from the sanitized CF options; later initializers
      // (e.g. imm_) read options_, which is fine since it is declared first.
      options_(*db_options, SanitizeOptions(&internal_comparator_,
                                            &internal_filter_policy_, options)),
      mem_(nullptr),
      imm_(options_.min_write_buffer_number_to_merge),
      super_version_(nullptr),
      super_version_number_(0),
      local_sv_(new ThreadLocalPtr(&SuperVersionUnrefHandle)),
      next_(nullptr),
      prev_(nullptr),
      log_number_(0),
      need_slowdown_for_num_level0_files_(false),
      column_family_set_(column_family_set) {
  // Start with one reference, held on behalf of the creator.
  Ref();

  // if dummy_versions is nullptr, then this is a dummy column family.
  if (dummy_versions != nullptr) {
    internal_stats_.reset(new InternalStats(
        options_.num_levels, db_options->env, db_options->statistics.get()));
    table_cache_.reset(
        new TableCache(dbname, &options_, storage_options, table_cache));
    // Pick the compaction strategy matching the configured style.
    if (options_.compaction_style == kCompactionStyleUniversal) {
      compaction_picker_.reset(
          new UniversalCompactionPicker(&options_, &internal_comparator_));
    } else if (options_.compaction_style == kCompactionStyleLevel) {
      compaction_picker_.reset(
          new LevelCompactionPicker(&options_, &internal_comparator_));
    } else {
      assert(options_.compaction_style == kCompactionStyleFIFO);
      compaction_picker_.reset(
          new FIFOCompactionPicker(&options_, &internal_comparator_));
    }

    Log(options_.info_log, "Options for column family \"%s\":\n",
        name.c_str());
    const ColumnFamilyOptions* cf_options = &options_;
    cf_options->Dump(options_.info_log.get());
  }
}
|
||||
|
||||
// DB mutex held
ColumnFamilyData::~ColumnFamilyData() {
  assert(refs_ == 0);
  // remove from linked list
  auto prev = prev_;
  auto next = next_;
  prev->next_ = next;
  next->prev_ = prev;

  // it's nullptr for dummy CFD
  if (column_family_set_ != nullptr) {
    // remove from column_family_set
    column_family_set_->RemoveColumnFamily(this);
  }

  if (current_ != nullptr) {
    current_->Unref();
  }

  if (super_version_ != nullptr) {
    // Release SuperVersion reference kept in ThreadLocalPtr.
    // This must be done outside of mutex_ since unref handler can lock mutex.
    // (SuperVersionUnrefHandle takes db_mutex when it drops the last ref.)
    super_version_->db_mutex->Unlock();
    local_sv_.reset();
    super_version_->db_mutex->Lock();

    bool is_last_reference __attribute__((unused));
    is_last_reference = super_version_->Unref();
    assert(is_last_reference);
    super_version_->Cleanup();
    delete super_version_;
    super_version_ = nullptr;
  }

  if (dummy_versions_ != nullptr) {
    // List must be empty
    assert(dummy_versions_->next_ == dummy_versions_);
    delete dummy_versions_;
  }

  if (mem_ != nullptr) {
    // Unref() returns the memtable when the last reference is dropped.
    delete mem_->Unref();
  }
  // Drop the reference to the current immutable-memtable list and free any
  // memtables whose last reference that was.
  autovector<MemTable*> to_delete;
  imm_.current()->Unref(&to_delete);
  for (MemTable* m : to_delete) {
    delete m;
  }
}
|
||||
|
||||
// Returns the storage (Env) options shared by the owning ColumnFamilySet.
const EnvOptions* ColumnFamilyData::soptions() const {
  return &(column_family_set_->storage_options_);
}
|
||||
|
||||
// Installs a new current Version and recomputes the write-slowdown flag
// based on the number of level-0 files versus the configured trigger.
// Note: does not take a reference on `current`; the caller manages that.
void ColumnFamilyData::SetCurrent(Version* current) {
  current_ = current;
  // A negative level0_slowdown_writes_trigger disables the slowdown.
  need_slowdown_for_num_level0_files_ =
      (options_.level0_slowdown_writes_trigger >= 0 &&
       current_->NumLevelFiles(0) >= options_.level0_slowdown_writes_trigger);
}
|
||||
|
||||
// Replaces the active memtable with a fresh one, dropping this CFD's
// reference to the old memtable (and deleting it if that was the last one).
void ColumnFamilyData::CreateNewMemtable() {
  assert(current_ != nullptr);
  if (mem_ != nullptr) {
    // Unref() returns the memtable only when the last reference is dropped,
    // so this deletes either the old memtable or nullptr (a no-op).
    delete mem_->Unref();
  }
  mem_ = new MemTable(internal_comparator_, options_);
  mem_->Ref();
}
|
||||
|
||||
// Delegates to the style-specific picker chosen in the constructor.
// See documentation in compaction_picker.h.
Compaction* ColumnFamilyData::PickCompaction(LogBuffer* log_buffer) {
  return compaction_picker_->PickCompaction(current_, log_buffer);
}
|
||||
|
||||
// Delegates a manual range compaction over [begin, end] to the picker for
// the current version. See documentation in compaction_picker.h.
Compaction* ColumnFamilyData::CompactRange(int input_level, int output_level,
                                           const InternalKey* begin,
                                           const InternalKey* end,
                                           InternalKey** compaction_end) {
  return compaction_picker_->CompactRange(current_, input_level, output_level,
                                          begin, end, compaction_end);
}
|
||||
|
||||
// Returns a SuperVersion with one reference owned by the caller.
SuperVersion* ColumnFamilyData::GetReferencedSuperVersion(
    port::Mutex* db_mutex) {
  SuperVersion* sv = nullptr;
  if (LIKELY(column_family_set_->db_options_->allow_thread_local)) {
    // Fast path: borrow the thread-local cached SuperVersion, add the
    // caller's reference, then try to park the borrowed reference back in
    // thread-local storage. If the slot was scraped meanwhile, drop the
    // borrowed reference here instead.
    sv = GetThreadLocalSuperVersion(db_mutex);
    sv->Ref();
    if (!ReturnThreadLocalSuperVersion(sv)) {
      sv->Unref();
    }
  } else {
    // Slow path: take a reference on the current SuperVersion under the
    // DB mutex.
    db_mutex->Lock();
    sv = super_version_->Ref();
    db_mutex->Unlock();
  }
  return sv;
}
|
||||
|
||||
// Returns the SuperVersion cached in this thread's local slot, refreshing it
// from super_version_ (under db_mutex) when absent or stale. The returned
// pointer carries the reference previously parked in thread-local storage;
// callers pair this with ReturnThreadLocalSuperVersion().
SuperVersion* ColumnFamilyData::GetThreadLocalSuperVersion(
    port::Mutex* db_mutex) {
  SuperVersion* sv = nullptr;
  // The SuperVersion is cached in thread local storage to avoid acquiring
  // mutex when SuperVersion does not change since the last use. When a new
  // SuperVersion is installed, the compaction or flush thread cleans up
  // cached SuperVersion in all existing thread local storage. To avoid
  // acquiring mutex for this operation, we use atomic Swap() on the thread
  // local pointer to guarantee exclusive access. If the thread local pointer
  // is being used while a new SuperVersion is installed, the cached
  // SuperVersion can become stale. In that case, the background thread would
  // have swapped in kSVObsolete. We re-check the value at when returning
  // SuperVersion back to thread local, with an atomic compare and swap.
  // The superversion will need to be released if detected to be stale.
  void* ptr = local_sv_->Swap(SuperVersion::kSVInUse);
  // Invariant:
  // (1) Scrape (always) installs kSVObsolete in ThreadLocal storage
  // (2) the Swap above (always) installs kSVInUse, ThreadLocal storage
  // should only keep kSVInUse before ReturnThreadLocalSuperVersion call
  // (if no Scrape happens).
  assert(ptr != SuperVersion::kSVInUse);
  sv = static_cast<SuperVersion*>(ptr);
  if (sv == SuperVersion::kSVObsolete ||
      sv->version_number != super_version_number_.load()) {
    // Cache miss: the slot was empty/obsoleted, or the cached SuperVersion
    // lags behind the currently installed one.
    RecordTick(options_.statistics.get(), NUMBER_SUPERVERSION_ACQUIRES);
    SuperVersion* sv_to_delete = nullptr;

    if (sv && sv->Unref()) {
      RecordTick(options_.statistics.get(), NUMBER_SUPERVERSION_CLEANUPS);
      db_mutex->Lock();
      // NOTE: underlying resources held by superversion (sst files) might
      // not be released until the next background job.
      sv->Cleanup();
      sv_to_delete = sv;
    } else {
      db_mutex->Lock();
    }
    // Take a fresh reference on the current SuperVersion under the mutex.
    sv = super_version_->Ref();
    db_mutex->Unlock();

    // Free the stale SuperVersion outside the mutex.
    delete sv_to_delete;
  }
  assert(sv != nullptr);
  return sv;
}
|
||||
|
||||
// Attempts to park `sv` (and the reference it carries) back into this
// thread's local slot. Returns true on success; returns false when a Scrape
// obsoleted the slot in the meantime, in which case the caller must drop
// the reference itself.
bool ColumnFamilyData::ReturnThreadLocalSuperVersion(SuperVersion* sv) {
  assert(sv != nullptr);
  // Put the SuperVersion back
  void* expected = SuperVersion::kSVInUse;
  if (local_sv_->CompareAndSwap(static_cast<void*>(sv), expected)) {
    // When we see kSVInUse in the ThreadLocal, we are sure ThreadLocal
    // storage has not been altered and no Scrape has happend. The
    // SuperVersion is still current.
    return true;
  } else {
    // ThreadLocal scrape happened in the process of this GetImpl call (after
    // thread local Swap() at the beginning and before CompareAndSwap()).
    // This means the SuperVersion it holds is obsolete.
    assert(expected == SuperVersion::kSVObsolete);
  }
  return false;
}
|
||||
|
||||
// Installs `new_superversion` (allocated by the caller, outside the mutex)
// as the current SuperVersion, pointing it at the current memtable state.
// Returns the previous SuperVersion if its refcount dropped to zero so the
// caller can delete it outside the mutex, or nullptr otherwise.
SuperVersion* ColumnFamilyData::InstallSuperVersion(
    SuperVersion* new_superversion, port::Mutex* db_mutex) {
  new_superversion->db_mutex = db_mutex;
  new_superversion->Init(mem_, imm_.current(), current_);
  SuperVersion* old_superversion = super_version_;
  super_version_ = new_superversion;
  ++super_version_number_;
  super_version_->version_number = super_version_number_;
  // Reset SuperVersions cached in thread local storage
  if (column_family_set_->db_options_->allow_thread_local) {
    ResetThreadLocalSuperVersions();
  }
  if (old_superversion != nullptr && old_superversion->Unref()) {
    old_superversion->Cleanup();
    return old_superversion;  // will let caller delete outside of mutex
  }
  return nullptr;
}
|
||||
|
||||
// Invalidates every thread's cached SuperVersion by atomically swapping
// kSVObsolete into all thread-local slots and releasing the references the
// slots held.
void ColumnFamilyData::ResetThreadLocalSuperVersions() {
  autovector<void*> sv_ptrs;
  local_sv_->Scrape(&sv_ptrs, SuperVersion::kSVObsolete);
  for (auto ptr : sv_ptrs) {
    assert(ptr);
    if (ptr == SuperVersion::kSVInUse) {
      // That thread is actively using its SuperVersion; its
      // ReturnThreadLocalSuperVersion() will see kSVObsolete in the slot and
      // release the reference itself.
      continue;
    }
    auto sv = static_cast<SuperVersion*>(ptr);
    if (sv->Unref()) {
      sv->Cleanup();
      delete sv;
    }
  }
}
|
||||
|
||||
// Builds the set with a dummy ColumnFamilyData heading the circular
// doubly-linked list of column families (the dummy is identified by its
// nullptr dummy_versions).
ColumnFamilySet::ColumnFamilySet(const std::string& dbname,
                                 const DBOptions* db_options,
                                 const EnvOptions& storage_options,
                                 Cache* table_cache)
    : max_column_family_(0),
      // Pass the constructor parameter `storage_options`, NOT the member
      // `storage_options_`: the member is declared after dummy_cfd_ and is
      // not yet constructed at this point in the initializer list, so
      // reading it here would reference an uninitialized member.
      dummy_cfd_(new ColumnFamilyData(dbname, 0, "", nullptr, nullptr,
                                      ColumnFamilyOptions(), db_options,
                                      storage_options, nullptr)),
      default_cfd_cache_(nullptr),
      db_name_(dbname),
      db_options_(db_options),
      storage_options_(storage_options),
      table_cache_(table_cache),
      spin_lock_(ATOMIC_FLAG_INIT) {
  // initialize linked list: the dummy points at itself when the set is empty
  dummy_cfd_->prev_ = dummy_cfd_;
  dummy_cfd_->next_ = dummy_cfd_;
}
|
||||
|
||||
// Tears down all remaining column families, then the dummy list head.
ColumnFamilySet::~ColumnFamilySet() {
  while (column_family_data_.size() > 0) {
    // cfd destructor will delete itself from column_family_data_
    // (via RemoveColumnFamily), which is why this loops on size() instead
    // of iterating the map.
    auto cfd = column_family_data_.begin()->second;
    cfd->Unref();
    delete cfd;
  }
  dummy_cfd_->Unref();
  delete dummy_cfd_;
}
|
||||
|
||||
// Thread-safe: returns the cached default column family (ID 0), which is
// populated by CreateColumnFamily() and always exists after DB open.
ColumnFamilyData* ColumnFamilySet::GetDefault() const {
  assert(default_cfd_cache_ != nullptr);
  return default_cfd_cache_;
}
|
||||
|
||||
// Looks up a column family by numeric ID; returns nullptr when unknown.
// Caller must hold the DB mutex or bracket the call with Lock()/Unlock().
ColumnFamilyData* ColumnFamilySet::GetColumnFamily(uint32_t id) const {
  auto found = column_family_data_.find(id);
  return (found == column_family_data_.end()) ? nullptr : found->second;
}
|
||||
|
||||
// Looks up a column family by name; returns nullptr when unknown.
// Resolves the name to an ID and reuses the ID-based lookup.
ColumnFamilyData* ColumnFamilySet::GetColumnFamily(const std::string& name)
    const {
  auto id_iter = column_families_.find(name);
  if (id_iter == column_families_.end()) {
    return nullptr;
  }
  ColumnFamilyData* cfd = GetColumnFamily(id_iter->second);
  // The two maps are kept in sync under Lock(), so the ID must resolve.
  assert(cfd != nullptr);
  return cfd;
}
|
||||
|
||||
// Allocates and returns a fresh column family ID, strictly greater than any
// ID handed out so far. REQUIRES: DB mutex held (see column_family.h).
uint32_t ColumnFamilySet::GetNextColumnFamilyID() {
  return ++max_column_family_;
}
|
||||
|
||||
// Returns the highest column family ID seen. REQUIRES: DB mutex held.
uint32_t ColumnFamilySet::GetMaxColumnFamily() { return max_column_family_; }
|
||||
|
||||
// Raises the high-water mark for column family IDs (never lowers it); used
// when replaying state that recorded a larger maximum. REQUIRES: DB mutex.
void ColumnFamilySet::UpdateMaxColumnFamily(uint32_t new_max_column_family) {
  max_column_family_ = std::max(new_max_column_family, max_column_family_);
}
|
||||
|
||||
// Returns the number of live column families. REQUIRES: DB mutex held.
size_t ColumnFamilySet::NumberOfColumnFamilies() const {
  return column_families_.size();
}
|
||||
|
||||
// under a DB mutex
ColumnFamilyData* ColumnFamilySet::CreateColumnFamily(
    const std::string& name, uint32_t id, Version* dummy_versions,
    const ColumnFamilyOptions& options) {
  assert(column_families_.find(name) == column_families_.end());
  ColumnFamilyData* new_cfd =
      new ColumnFamilyData(db_name_, id, name, dummy_versions, table_cache_,
                           options, db_options_, storage_options_, this);
  // The two lookup maps are additionally guarded by the spinlock so readers
  // that don't hold the DB mutex (see column_family.h) see them consistently.
  Lock();
  column_families_.insert({name, id});
  column_family_data_.insert({id, new_cfd});
  Unlock();
  max_column_family_ = std::max(max_column_family_, id);
  // add to linked list: splice the new node in just before the dummy head,
  // i.e. at the tail of the circular list
  new_cfd->next_ = dummy_cfd_;
  auto prev = dummy_cfd_->prev_;
  new_cfd->prev_ = prev;
  prev->next_ = new_cfd;
  dummy_cfd_->prev_ = new_cfd;
  if (id == 0) {
    // cache the default column family for the common-case fast path
    default_cfd_cache_ = new_cfd;
  }
  return new_cfd;
}
|
||||
|
||||
// Acquires the spinlock guarding column_families_/column_family_data_.
// Busy-waits; critical sections under it are short map operations.
void ColumnFamilySet::Lock() {
  // spin lock
  while (spin_lock_.test_and_set(std::memory_order_acquire)) {
  }
}
|
||||
|
||||
// Releases the spinlock acquired by Lock().
void ColumnFamilySet::Unlock() { spin_lock_.clear(std::memory_order_release); }
|
||||
|
||||
// REQUIRES: DB mutex held
|
||||
void ColumnFamilySet::FreeDeadColumnFamilies() {
|
||||
autovector<ColumnFamilyData*> to_delete;
|
||||
for (auto cfd = dummy_cfd_->next_; cfd != dummy_cfd_; cfd = cfd->next_) {
|
||||
if (cfd->refs_ == 0) {
|
||||
to_delete.push_back(cfd);
|
||||
}
|
||||
}
|
||||
for (auto cfd : to_delete) {
|
||||
// this is very rare, so it's not a problem that we do it under a mutex
|
||||
delete cfd;
|
||||
}
|
||||
}
|
||||
|
||||
// under a DB mutex
// Called from ~ColumnFamilyData: erases the dying column family from both
// lookup maps, under the spinlock so lock-free-of-DB-mutex readers stay safe.
void ColumnFamilySet::RemoveColumnFamily(ColumnFamilyData* cfd) {
  auto cfd_iter = column_family_data_.find(cfd->GetID());
  assert(cfd_iter != column_family_data_.end());
  Lock();
  column_family_data_.erase(cfd_iter);
  column_families_.erase(cfd->GetName());
  Unlock();
}
|
||||
|
||||
// Selects the column family identified by `column_family_id` as current_.
// Returns false when no such column family exists.
bool ColumnFamilyMemTablesImpl::Seek(uint32_t column_family_id) {
  if (column_family_id != 0) {
    // maybe outside of db mutex, should lock
    column_family_set_->Lock();
    current_ = column_family_set_->GetColumnFamily(column_family_id);
    column_family_set_->Unlock();
  } else {
    // optimization for common case: the default CF lookup is lock-free
    current_ = column_family_set_->GetDefault();
  }
  handle_.SetCFD(current_);
  return current_ != nullptr;
}
|
||||
|
||||
// Returns the selected column family's earliest log number.
// REQUIRES: a successful Seek() first.
uint64_t ColumnFamilyMemTablesImpl::GetLogNumber() const {
  assert(current_ != nullptr);
  return current_->GetLogNumber();
}
|
||||
|
||||
// Returns the selected column family's active memtable.
// REQUIRES: a successful Seek() first.
MemTable* ColumnFamilyMemTablesImpl::GetMemTable() const {
  assert(current_ != nullptr);
  return current_->mem();
}
|
||||
|
||||
// Returns the selected column family's options.
// REQUIRES: a successful Seek() first.
const Options* ColumnFamilyMemTablesImpl::GetOptions() const {
  assert(current_ != nullptr);
  return current_->options();
}
|
||||
|
||||
// Returns a handle wrapping the selected column family. The handle is owned
// by this object and is re-pointed by each Seek(); it does not add a ref.
// REQUIRES: a successful Seek() first.
ColumnFamilyHandle* ColumnFamilyMemTablesImpl::GetColumnFamilyHandle() {
  assert(current_ != nullptr);
  return &handle_;
}
|
||||
|
||||
} // namespace rocksdb
|
||||
419
db/column_family.h
Normal file
419
db/column_family.h
Normal file
@@ -0,0 +1,419 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <unordered_map>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <atomic>
|
||||
|
||||
#include "rocksdb/options.h"
|
||||
#include "rocksdb/db.h"
|
||||
#include "rocksdb/env.h"
|
||||
#include "db/memtable_list.h"
|
||||
#include "db/write_batch_internal.h"
|
||||
#include "db/table_cache.h"
|
||||
#include "util/thread_local.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
class Version;
|
||||
class VersionSet;
|
||||
class MemTable;
|
||||
class MemTableListVersion;
|
||||
class CompactionPicker;
|
||||
class Compaction;
|
||||
class InternalKey;
|
||||
class InternalStats;
|
||||
class ColumnFamilyData;
|
||||
class DBImpl;
|
||||
class LogBuffer;
|
||||
|
||||
// ColumnFamilyHandleImpl is the class that clients use to access different
// column families. It has non-trivial destructor, which gets called when client
// is done using the column family
class ColumnFamilyHandleImpl : public ColumnFamilyHandle {
 public:
  // create while holding the mutex
  ColumnFamilyHandleImpl(ColumnFamilyData* cfd, DBImpl* db, port::Mutex* mutex);
  // destroy without mutex
  virtual ~ColumnFamilyHandleImpl();
  virtual ColumnFamilyData* cfd() const { return cfd_; }

  virtual uint32_t GetID() const;

 private:
  ColumnFamilyData* cfd_;    // the column family this handle refers to
  DBImpl* db_;               // owning DB, used during destruction
  port::Mutex* mutex_;       // DB mutex, taken by the destructor
};
|
||||
|
||||
// Does not ref-count ColumnFamilyData
// We use this dummy ColumnFamilyHandleImpl because sometimes MemTableInserter
// calls DBImpl methods. When this happens, MemTableInserter need access to
// ColumnFamilyHandle (same as the client would need). In that case, we feed
// MemTableInserter dummy ColumnFamilyHandle and enable it to call DBImpl
// methods
class ColumnFamilyHandleInternal : public ColumnFamilyHandleImpl {
 public:
  ColumnFamilyHandleInternal()
      : ColumnFamilyHandleImpl(nullptr, nullptr, nullptr) {}

  // Re-points the handle; overrides what the base class would return.
  void SetCFD(ColumnFamilyData* cfd) { internal_cfd_ = cfd; }
  virtual ColumnFamilyData* cfd() const override { return internal_cfd_; }

 private:
  ColumnFamilyData* internal_cfd_;
};
|
||||
|
||||
// holds references to memtable, all immutable memtables and version
struct SuperVersion {
  MemTable* mem;
  MemTableListVersion* imm;
  Version* current;
  std::atomic<uint32_t> refs;
  // We need to_delete because during Cleanup(), imm->Unref() returns
  // all memtables that we need to free through this vector. We then
  // delete all those memtables outside of mutex, during destruction
  autovector<MemTable*> to_delete;
  // Version number of the current SuperVersion
  uint64_t version_number;
  // DB mutex; stashed here so the thread-local unref handler can lock it
  // when it has to run Cleanup()
  port::Mutex* db_mutex;

  // should be called outside the mutex
  SuperVersion() = default;
  ~SuperVersion();
  SuperVersion* Ref();

  // Returns true iff the last reference was dropped.
  bool Unref();

  // call these two methods with db mutex held
  // Cleanup unrefs mem, imm and current. Also, it stores all memtables
  // that needs to be deleted in to_delete vector. Unrefing those
  // objects needs to be done in the mutex
  void Cleanup();
  void Init(MemTable* new_mem, MemTableListVersion* new_imm,
            Version* new_current);

  // The value of dummy is not actually used. kSVInUse takes its address as a
  // mark in the thread local storage to indicate the SuperVersion is in use
  // by thread. This way, the value of kSVInUse is guaranteed to have no
  // conflict with SuperVersion object address and portable on different
  // platform.
  static int dummy;
  static void* const kSVInUse;
  static void* const kSVObsolete;
};
|
||||
|
||||
extern ColumnFamilyOptions SanitizeOptions(const InternalKeyComparator* icmp,
|
||||
const InternalFilterPolicy* ipolicy,
|
||||
const ColumnFamilyOptions& src);
|
||||
|
||||
class ColumnFamilySet;
|
||||
|
||||
// This class keeps all the data that a column family needs. It's mostly dumb
// and used just to provide access to metadata.
// Most methods require DB mutex held, unless otherwise noted
class ColumnFamilyData {
 public:
  ~ColumnFamilyData();

  // thread-safe
  uint32_t GetID() const { return id_; }
  // thread-safe
  const std::string& GetName() const { return name_; }

  // Not thread-safe by itself; per the class comment, call under DB mutex.
  void Ref() { ++refs_; }
  // will just decrease reference count to 0, but will not delete it. returns
  // true if the ref count was decreased to zero. in that case, it can be
  // deleted by the caller immediately, or later, by calling
  // FreeDeadColumnFamilies()
  bool Unref() {
    assert(refs_ > 0);
    return --refs_ == 0;
  }

  // This can only be called from single-threaded VersionSet::LogAndApply()
  // After dropping column family no other operation on that column family
  // will be executed. All the files and memory will be, however, kept around
  // until client drops the column family handle. That way, client can still
  // access data from dropped column family.
  // Column family can be dropped and still alive. In that state:
  // *) Column family is not included in the iteration.
  // *) Compaction and flush is not executed on the dropped column family.
  // *) Client can continue writing and reading from column family. However, all
  // writes stay in the current memtable.
  // When the dropped column family is unreferenced, then we:
  // *) delete all memory associated with that column family
  // *) delete all the files associated with that column family
  void SetDropped() {
    // can't drop default CF
    assert(id_ != 0);
    dropped_ = true;
  }
  bool IsDropped() const { return dropped_; }

  // thread-safe
  int NumberLevels() const { return options_.num_levels; }

  void SetLogNumber(uint64_t log_number) { log_number_ = log_number; }
  uint64_t GetLogNumber() const { return log_number_; }

  // thread-safe
  const Options* options() const { return &options_; }
  const EnvOptions* soptions() const;

  InternalStats* internal_stats() { return internal_stats_.get(); }

  MemTableList* imm() { return &imm_; }
  MemTable* mem() { return mem_; }
  Version* current() { return current_; }
  Version* dummy_versions() { return dummy_versions_; }
  void SetMemtable(MemTable* new_mem) { mem_ = new_mem; }
  void SetCurrent(Version* current);
  void CreateNewMemtable();

  TableCache* table_cache() const { return table_cache_.get(); }

  // See documentation in compaction_picker.h
  Compaction* PickCompaction(LogBuffer* log_buffer);
  Compaction* CompactRange(int input_level, int output_level,
                           const InternalKey* begin, const InternalKey* end,
                           InternalKey** compaction_end);

  CompactionPicker* compaction_picker() { return compaction_picker_.get(); }
  // thread-safe
  const Comparator* user_comparator() const {
    return internal_comparator_.user_comparator();
  }
  // thread-safe
  const InternalKeyComparator& internal_comparator() const {
    return internal_comparator_;
  }

  SuperVersion* GetSuperVersion() { return super_version_; }
  // thread-safe
  // Return an already referenced SuperVersion to be used safely.
  SuperVersion* GetReferencedSuperVersion(port::Mutex* db_mutex);
  // thread-safe
  // Get SuperVersion stored in thread local storage. If it does not exist,
  // get a reference from a current SuperVersion.
  SuperVersion* GetThreadLocalSuperVersion(port::Mutex* db_mutex);
  // Try to return SuperVersion back to thread local storage. Return true on
  // success and false on failure. It fails when the thread local storage
  // contains anything other than SuperVersion::kSVInUse flag.
  bool ReturnThreadLocalSuperVersion(SuperVersion* sv);
  // thread-safe
  uint64_t GetSuperVersionNumber() const {
    return super_version_number_.load();
  }
  // will return a pointer to the previous SuperVersion
  // if its reference count is zero and needs deletion or nullptr if not
  // As argument takes a pointer to allocated SuperVersion to enable
  // the clients to allocate SuperVersion outside of mutex.
  SuperVersion* InstallSuperVersion(SuperVersion* new_superversion,
                                    port::Mutex* db_mutex);

  void ResetThreadLocalSuperVersions();

  // A flag indicating whether write needs to slowdown because there are
  // too many number of level0 files.
  bool NeedSlowdownForNumLevel0Files() const {
    return need_slowdown_for_num_level0_files_;
  }

 private:
  friend class ColumnFamilySet;
  ColumnFamilyData(const std::string& dbname, uint32_t id,
                   const std::string& name, Version* dummy_versions,
                   Cache* table_cache, const ColumnFamilyOptions& options,
                   const DBOptions* db_options,
                   const EnvOptions& storage_options,
                   ColumnFamilySet* column_family_set);

  uint32_t id_;
  const std::string name_;
  Version* dummy_versions_;  // Head of circular doubly-linked list of versions.
  Version* current_;         // == dummy_versions->prev_

  int refs_;      // outstanding references to ColumnFamilyData
  bool dropped_;  // true if client dropped it

  const InternalKeyComparator internal_comparator_;
  const InternalFilterPolicy internal_filter_policy_;

  Options const options_;

  std::unique_ptr<TableCache> table_cache_;

  std::unique_ptr<InternalStats> internal_stats_;

  MemTable* mem_;
  MemTableList imm_;
  SuperVersion* super_version_;

  // An ordinal representing the current SuperVersion. Updated by
  // InstallSuperVersion(), i.e. incremented every time super_version_
  // changes.
  std::atomic<uint64_t> super_version_number_;

  // Thread's local copy of SuperVersion pointer
  // This needs to be destructed before mutex_
  std::unique_ptr<ThreadLocalPtr> local_sv_;

  // pointers for a circular linked list. we use it to support iterations
  // that can be concurrent with writes
  ColumnFamilyData* next_;
  ColumnFamilyData* prev_;

  // This is the earliest log file number that contains data from this
  // Column Family. All earlier log files must be ignored and not
  // recovered from
  uint64_t log_number_;

  // A flag indicating whether we should delay writes because
  // we have too many level 0 files
  bool need_slowdown_for_num_level0_files_;

  // An object that keeps all the compaction stats
  // and picks the next compaction
  std::unique_ptr<CompactionPicker> compaction_picker_;

  ColumnFamilySet* column_family_set_;
};
|
||||
|
||||
// ColumnFamilySet has interesting thread-safety requirements
// * CreateColumnFamily() or RemoveColumnFamily() -- need to protect by DB
// mutex. Inside, column_family_data_ and column_families_ will be protected
// by Lock() and Unlock(). CreateColumnFamily() should ONLY be called from
// VersionSet::LogAndApply() in the normal runtime. It is also called
// during Recovery and in DumpManifest(). RemoveColumnFamily() is called
// from ColumnFamilyData destructor
// * Iteration -- hold DB mutex, but you can release it in the body of
// iteration. If you release DB mutex in body, reference the column
// family before the mutex and unreference after you unlock, since the column
// family might get dropped when the DB mutex is released
// * GetDefault() -- thread safe
// * GetColumnFamily() -- either inside of DB mutex or call Lock() <-> Unlock()
// * GetNextColumnFamilyID(), GetMaxColumnFamily(), UpdateMaxColumnFamily(),
// NumberOfColumnFamilies -- inside of DB mutex
class ColumnFamilySet {
 public:
  // ColumnFamilySet supports iteration
  class iterator {
   public:
    explicit iterator(ColumnFamilyData* cfd)
        : current_(cfd) {}
    // Advances past dead (refcount 0) and dropped column families.
    iterator& operator++() {
      // dummy is never dead or dropped, so this will never be infinite
      do {
        current_ = current_->next_;
      } while (current_->refs_ == 0 || current_->IsDropped());
      return *this;
    }
    bool operator!=(const iterator& other) {
      return this->current_ != other.current_;
    }
    ColumnFamilyData* operator*() { return current_; }

   private:
    ColumnFamilyData* current_;
  };

  ColumnFamilySet(const std::string& dbname, const DBOptions* db_options,
                  const EnvOptions& storage_options, Cache* table_cache);
  ~ColumnFamilySet();

  ColumnFamilyData* GetDefault() const;
  // GetColumnFamily() calls return nullptr if column family is not found
  ColumnFamilyData* GetColumnFamily(uint32_t id) const;
  ColumnFamilyData* GetColumnFamily(const std::string& name) const;
  // this call will return the next available column family ID. it guarantees
  // that there is no column family with id greater than or equal to the
  // returned value in the current running instance or anytime in RocksDB
  // instance history.
  uint32_t GetNextColumnFamilyID();
  uint32_t GetMaxColumnFamily();
  void UpdateMaxColumnFamily(uint32_t new_max_column_family);
  size_t NumberOfColumnFamilies() const;

  ColumnFamilyData* CreateColumnFamily(const std::string& name, uint32_t id,
                                       Version* dummy_version,
                                       const ColumnFamilyOptions& options);

  // begin() skips nothing: the first real node follows the dummy head.
  iterator begin() { return iterator(dummy_cfd_->next_); }
  iterator end() { return iterator(dummy_cfd_); }

  void Lock();
  void Unlock();

  // REQUIRES: DB mutex held
  // Don't call while iterating over ColumnFamilySet
  void FreeDeadColumnFamilies();

 private:
  friend class ColumnFamilyData;
  // helper function that gets called from cfd destructor
  // REQUIRES: DB mutex held
  void RemoveColumnFamily(ColumnFamilyData* cfd);

  // column_families_ and column_family_data_ need to be protected:
  // * when mutating: 1. DB mutex locked first, 2. spinlock locked second
  // * when reading, either: 1. lock DB mutex, or 2. lock spinlock
  // (if both, respect the ordering to avoid deadlock!)
  std::unordered_map<std::string, uint32_t> column_families_;
  std::unordered_map<uint32_t, ColumnFamilyData*> column_family_data_;

  uint32_t max_column_family_;
  // Dummy head of the circular doubly-linked list of column families.
  ColumnFamilyData* dummy_cfd_;
  // We don't hold the refcount here, since default column family always exists
  // We are also not responsible for cleaning up default_cfd_cache_. This is
  // just a cache that makes common case (accessing default column family)
  // faster
  ColumnFamilyData* default_cfd_cache_;

  const std::string db_name_;
  const DBOptions* const db_options_;
  const EnvOptions storage_options_;
  Cache* table_cache_;
  // Spinlock implementing Lock()/Unlock() for the two maps above.
  std::atomic_flag spin_lock_;
};
|
||||
|
||||
// We use ColumnFamilyMemTablesImpl to provide WriteBatch a way to access
|
||||
// memtables of different column families (specified by ID in the write batch)
|
||||
class ColumnFamilyMemTablesImpl : public ColumnFamilyMemTables {
|
||||
public:
|
||||
explicit ColumnFamilyMemTablesImpl(ColumnFamilySet* column_family_set)
|
||||
: column_family_set_(column_family_set), current_(nullptr) {}
|
||||
|
||||
// sets current_ to ColumnFamilyData with column_family_id
|
||||
// returns false if column family doesn't exist
|
||||
bool Seek(uint32_t column_family_id) override;
|
||||
|
||||
// Returns log number of the selected column family
|
||||
uint64_t GetLogNumber() const override;
|
||||
|
||||
// REQUIRES: Seek() called first
|
||||
virtual MemTable* GetMemTable() const override;
|
||||
|
||||
// Returns options for selected column family
|
||||
// REQUIRES: Seek() called first
|
||||
virtual const Options* GetOptions() const override;
|
||||
|
||||
// Returns column family handle for the selected column family
|
||||
virtual ColumnFamilyHandle* GetColumnFamilyHandle() override;
|
||||
|
||||
private:
|
||||
ColumnFamilySet* column_family_set_;
|
||||
ColumnFamilyData* current_;
|
||||
ColumnFamilyHandleInternal handle_;
|
||||
};
|
||||
|
||||
} // namespace rocksdb
|
||||
977
db/column_family_test.cc
Normal file
977
db/column_family_test.cc
Normal file
@@ -0,0 +1,977 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include <algorithm>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
|
||||
#include "db/db_impl.h"
|
||||
#include "rocksdb/env.h"
|
||||
#include "rocksdb/db.h"
|
||||
#include "util/testharness.h"
|
||||
#include "util/testutil.h"
|
||||
#include "util/coding.h"
|
||||
#include "utilities/merge_operators.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
namespace {
|
||||
std::string RandomString(Random* rnd, int len) {
|
||||
std::string r;
|
||||
test::RandomString(rnd, len, &r);
|
||||
return r;
|
||||
}
|
||||
} // anonymous namespace
|
||||
|
||||
// counts how many operations were performed
|
||||
class EnvCounter : public EnvWrapper {
|
||||
public:
|
||||
explicit EnvCounter(Env* base)
|
||||
: EnvWrapper(base), num_new_writable_file_(0) {}
|
||||
int GetNumberOfNewWritableFileCalls() {
|
||||
return num_new_writable_file_;
|
||||
}
|
||||
Status NewWritableFile(const std::string& f, unique_ptr<WritableFile>* r,
|
||||
const EnvOptions& soptions) {
|
||||
++num_new_writable_file_;
|
||||
return EnvWrapper::NewWritableFile(f, r, soptions);
|
||||
}
|
||||
|
||||
private:
|
||||
int num_new_writable_file_;
|
||||
};
|
||||
|
||||
class ColumnFamilyTest {
|
||||
public:
|
||||
ColumnFamilyTest() : rnd_(139) {
|
||||
env_ = new EnvCounter(Env::Default());
|
||||
dbname_ = test::TmpDir() + "/column_family_test";
|
||||
db_options_.create_if_missing = true;
|
||||
db_options_.env = env_;
|
||||
DestroyDB(dbname_, Options(db_options_, column_family_options_));
|
||||
}
|
||||
|
||||
~ColumnFamilyTest() {
|
||||
delete env_;
|
||||
}
|
||||
|
||||
void Close() {
|
||||
for (auto h : handles_) {
|
||||
delete h;
|
||||
}
|
||||
handles_.clear();
|
||||
names_.clear();
|
||||
delete db_;
|
||||
db_ = nullptr;
|
||||
}
|
||||
|
||||
Status TryOpen(std::vector<std::string> cf,
|
||||
std::vector<ColumnFamilyOptions> options = {}) {
|
||||
std::vector<ColumnFamilyDescriptor> column_families;
|
||||
names_.clear();
|
||||
for (size_t i = 0; i < cf.size(); ++i) {
|
||||
column_families.push_back(ColumnFamilyDescriptor(
|
||||
cf[i], options.size() == 0 ? column_family_options_ : options[i]));
|
||||
names_.push_back(cf[i]);
|
||||
}
|
||||
return DB::Open(db_options_, dbname_, column_families, &handles_, &db_);
|
||||
}
|
||||
|
||||
Status OpenReadOnly(std::vector<std::string> cf,
|
||||
std::vector<ColumnFamilyOptions> options = {}) {
|
||||
std::vector<ColumnFamilyDescriptor> column_families;
|
||||
names_.clear();
|
||||
for (size_t i = 0; i < cf.size(); ++i) {
|
||||
column_families.push_back(ColumnFamilyDescriptor(
|
||||
cf[i], options.size() == 0 ? column_family_options_ : options[i]));
|
||||
names_.push_back(cf[i]);
|
||||
}
|
||||
return DB::OpenForReadOnly(db_options_, dbname_, column_families, &handles_,
|
||||
&db_);
|
||||
}
|
||||
|
||||
void AssertOpenReadOnly(std::vector<std::string> cf,
|
||||
std::vector<ColumnFamilyOptions> options = {}) {
|
||||
ASSERT_OK(OpenReadOnly(cf, options));
|
||||
}
|
||||
|
||||
|
||||
void Open(std::vector<std::string> cf,
|
||||
std::vector<ColumnFamilyOptions> options = {}) {
|
||||
ASSERT_OK(TryOpen(cf, options));
|
||||
}
|
||||
|
||||
void Open() {
|
||||
Open({"default"});
|
||||
}
|
||||
|
||||
DBImpl* dbfull() { return reinterpret_cast<DBImpl*>(db_); }
|
||||
|
||||
int GetProperty(int cf, std::string property) {
|
||||
std::string value;
|
||||
ASSERT_TRUE(dbfull()->GetProperty(handles_[cf], property, &value));
|
||||
return std::stoi(value);
|
||||
}
|
||||
|
||||
void Destroy() {
|
||||
for (auto h : handles_) {
|
||||
delete h;
|
||||
}
|
||||
handles_.clear();
|
||||
names_.clear();
|
||||
delete db_;
|
||||
db_ = nullptr;
|
||||
ASSERT_OK(DestroyDB(dbname_, Options(db_options_, column_family_options_)));
|
||||
}
|
||||
|
||||
void CreateColumnFamilies(
|
||||
const std::vector<std::string>& cfs,
|
||||
const std::vector<ColumnFamilyOptions> options = {}) {
|
||||
int cfi = handles_.size();
|
||||
handles_.resize(cfi + cfs.size());
|
||||
names_.resize(cfi + cfs.size());
|
||||
for (size_t i = 0; i < cfs.size(); ++i) {
|
||||
ASSERT_OK(db_->CreateColumnFamily(
|
||||
options.size() == 0 ? column_family_options_ : options[i], cfs[i],
|
||||
&handles_[cfi]));
|
||||
names_[cfi] = cfs[i];
|
||||
cfi++;
|
||||
}
|
||||
}
|
||||
|
||||
void Reopen(const std::vector<ColumnFamilyOptions> options = {}) {
|
||||
std::vector<std::string> names;
|
||||
for (auto name : names_) {
|
||||
if (name != "") {
|
||||
names.push_back(name);
|
||||
}
|
||||
}
|
||||
Close();
|
||||
assert(options.size() == 0 || names.size() == options.size());
|
||||
Open(names, options);
|
||||
}
|
||||
|
||||
void CreateColumnFamiliesAndReopen(const std::vector<std::string>& cfs) {
|
||||
CreateColumnFamilies(cfs);
|
||||
Reopen();
|
||||
}
|
||||
|
||||
void DropColumnFamilies(const std::vector<int>& cfs) {
|
||||
for (auto cf : cfs) {
|
||||
ASSERT_OK(db_->DropColumnFamily(handles_[cf]));
|
||||
delete handles_[cf];
|
||||
handles_[cf] = nullptr;
|
||||
names_[cf] = "";
|
||||
}
|
||||
}
|
||||
|
||||
void PutRandomData(int cf, int num, int key_value_size) {
|
||||
for (int i = 0; i < num; ++i) {
|
||||
// 10 bytes for key, rest is value
|
||||
ASSERT_OK(Put(cf, test::RandomKey(&rnd_, 10),
|
||||
RandomString(&rnd_, key_value_size - 10)));
|
||||
}
|
||||
}
|
||||
|
||||
void WaitForFlush(int cf) {
|
||||
ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[cf]));
|
||||
}
|
||||
|
||||
void WaitForCompaction() { ASSERT_OK(dbfull()->TEST_WaitForCompact()); }
|
||||
|
||||
Status Put(int cf, const std::string& key, const std::string& value) {
|
||||
return db_->Put(WriteOptions(), handles_[cf], Slice(key), Slice(value));
|
||||
}
|
||||
Status Merge(int cf, const std::string& key, const std::string& value) {
|
||||
return db_->Merge(WriteOptions(), handles_[cf], Slice(key), Slice(value));
|
||||
}
|
||||
Status Flush(int cf) {
|
||||
return db_->Flush(FlushOptions(), handles_[cf]);
|
||||
}
|
||||
|
||||
std::string Get(int cf, const std::string& key) {
|
||||
ReadOptions options;
|
||||
options.verify_checksums = true;
|
||||
std::string result;
|
||||
Status s = db_->Get(options, handles_[cf], Slice(key), &result);
|
||||
if (s.IsNotFound()) {
|
||||
result = "NOT_FOUND";
|
||||
} else if (!s.ok()) {
|
||||
result = s.ToString();
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
void CompactAll(int cf) {
|
||||
ASSERT_OK(db_->CompactRange(handles_[cf], nullptr, nullptr));
|
||||
}
|
||||
|
||||
void Compact(int cf, const Slice& start, const Slice& limit) {
|
||||
ASSERT_OK(db_->CompactRange(handles_[cf], &start, &limit));
|
||||
}
|
||||
|
||||
int NumTableFilesAtLevel(int level, int cf) {
|
||||
return GetProperty(cf,
|
||||
"rocksdb.num-files-at-level" + std::to_string(level));
|
||||
}
|
||||
|
||||
// Return spread of files per level
|
||||
std::string FilesPerLevel(int cf) {
|
||||
std::string result;
|
||||
int last_non_zero_offset = 0;
|
||||
for (int level = 0; level < dbfull()->NumberLevels(handles_[cf]); level++) {
|
||||
int f = NumTableFilesAtLevel(level, cf);
|
||||
char buf[100];
|
||||
snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f);
|
||||
result += buf;
|
||||
if (f > 0) {
|
||||
last_non_zero_offset = result.size();
|
||||
}
|
||||
}
|
||||
result.resize(last_non_zero_offset);
|
||||
return result;
|
||||
}
|
||||
|
||||
int CountLiveFiles() {
|
||||
std::vector<LiveFileMetaData> metadata;
|
||||
db_->GetLiveFilesMetaData(&metadata);
|
||||
return static_cast<int>(metadata.size());
|
||||
}
|
||||
|
||||
// Do n memtable flushes, each of which produces an sstable
|
||||
// covering the range [small,large].
|
||||
void MakeTables(int cf, int n, const std::string& small,
|
||||
const std::string& large) {
|
||||
for (int i = 0; i < n; i++) {
|
||||
ASSERT_OK(Put(cf, small, "begin"));
|
||||
ASSERT_OK(Put(cf, large, "end"));
|
||||
ASSERT_OK(db_->Flush(FlushOptions(), handles_[cf]));
|
||||
}
|
||||
}
|
||||
|
||||
int CountLiveLogFiles() {
|
||||
int micros_wait_for_log_deletion = 20000;
|
||||
env_->SleepForMicroseconds(micros_wait_for_log_deletion);
|
||||
int ret = 0;
|
||||
VectorLogPtr wal_files;
|
||||
Status s;
|
||||
// GetSortedWalFiles is a flakey function -- it gets all the wal_dir
|
||||
// children files and then later checks for their existance. if some of the
|
||||
// log files doesn't exist anymore, it reports an error. it does all of this
|
||||
// without DB mutex held, so if a background process deletes the log file
|
||||
// while the function is being executed, it returns an error. We retry the
|
||||
// function 10 times to avoid the error failing the test
|
||||
for (int retries = 0; retries < 10; ++retries) {
|
||||
wal_files.clear();
|
||||
s = db_->GetSortedWalFiles(wal_files);
|
||||
if (s.ok()) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
ASSERT_OK(s);
|
||||
for (const auto& wal : wal_files) {
|
||||
if (wal->Type() == kAliveLogFile) {
|
||||
++ret;
|
||||
}
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
void AssertNumberOfImmutableMemtables(std::vector<int> num_per_cf) {
|
||||
assert(num_per_cf.size() == handles_.size());
|
||||
|
||||
for (size_t i = 0; i < num_per_cf.size(); ++i) {
|
||||
ASSERT_EQ(num_per_cf[i],
|
||||
GetProperty(i, "rocksdb.num-immutable-mem-table"));
|
||||
}
|
||||
}
|
||||
|
||||
void CopyFile(const std::string& source, const std::string& destination,
|
||||
uint64_t size = 0) {
|
||||
const EnvOptions soptions;
|
||||
unique_ptr<SequentialFile> srcfile;
|
||||
ASSERT_OK(env_->NewSequentialFile(source, &srcfile, soptions));
|
||||
unique_ptr<WritableFile> destfile;
|
||||
ASSERT_OK(env_->NewWritableFile(destination, &destfile, soptions));
|
||||
|
||||
if (size == 0) {
|
||||
// default argument means copy everything
|
||||
ASSERT_OK(env_->GetFileSize(source, &size));
|
||||
}
|
||||
|
||||
char buffer[4096];
|
||||
Slice slice;
|
||||
while (size > 0) {
|
||||
uint64_t one = std::min(uint64_t(sizeof(buffer)), size);
|
||||
ASSERT_OK(srcfile->Read(one, &slice, buffer));
|
||||
ASSERT_OK(destfile->Append(slice));
|
||||
size -= slice.size();
|
||||
}
|
||||
ASSERT_OK(destfile->Close());
|
||||
}
|
||||
|
||||
std::vector<ColumnFamilyHandle*> handles_;
|
||||
std::vector<std::string> names_;
|
||||
ColumnFamilyOptions column_family_options_;
|
||||
DBOptions db_options_;
|
||||
std::string dbname_;
|
||||
DB* db_ = nullptr;
|
||||
EnvCounter* env_;
|
||||
Random rnd_;
|
||||
};
|
||||
|
||||
TEST(ColumnFamilyTest, DontReuseColumnFamilyID) {
|
||||
for (int iter = 0; iter < 3; ++iter) {
|
||||
Open();
|
||||
CreateColumnFamilies({"one", "two", "three"});
|
||||
for (size_t i = 0; i < handles_.size(); ++i) {
|
||||
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(handles_[i]);
|
||||
ASSERT_EQ(i, cfh->GetID());
|
||||
}
|
||||
if (iter == 1) {
|
||||
Reopen();
|
||||
}
|
||||
DropColumnFamilies({3});
|
||||
Reopen();
|
||||
if (iter == 2) {
|
||||
// this tests if max_column_family is correctly persisted with
|
||||
// WriteSnapshot()
|
||||
Reopen();
|
||||
}
|
||||
CreateColumnFamilies({"three2"});
|
||||
// ID 3 that was used for dropped column family "three" should not be reused
|
||||
auto cfh3 = reinterpret_cast<ColumnFamilyHandleImpl*>(handles_[3]);
|
||||
ASSERT_EQ(4U, cfh3->GetID());
|
||||
Close();
|
||||
Destroy();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
TEST(ColumnFamilyTest, AddDrop) {
|
||||
Open();
|
||||
CreateColumnFamilies({"one", "two", "three"});
|
||||
ASSERT_EQ("NOT_FOUND", Get(1, "fodor"));
|
||||
ASSERT_EQ("NOT_FOUND", Get(2, "fodor"));
|
||||
DropColumnFamilies({2});
|
||||
ASSERT_EQ("NOT_FOUND", Get(1, "fodor"));
|
||||
CreateColumnFamilies({"four"});
|
||||
ASSERT_EQ("NOT_FOUND", Get(3, "fodor"));
|
||||
ASSERT_OK(Put(1, "fodor", "mirko"));
|
||||
ASSERT_EQ("mirko", Get(1, "fodor"));
|
||||
ASSERT_EQ("NOT_FOUND", Get(3, "fodor"));
|
||||
Close();
|
||||
ASSERT_TRUE(TryOpen({"default"}).IsInvalidArgument());
|
||||
Open({"default", "one", "three", "four"});
|
||||
DropColumnFamilies({1});
|
||||
Reopen();
|
||||
Close();
|
||||
|
||||
std::vector<std::string> families;
|
||||
ASSERT_OK(DB::ListColumnFamilies(db_options_, dbname_, &families));
|
||||
sort(families.begin(), families.end());
|
||||
ASSERT_TRUE(families ==
|
||||
std::vector<std::string>({"default", "four", "three"}));
|
||||
}
|
||||
|
||||
TEST(ColumnFamilyTest, DropTest) {
|
||||
// first iteration - dont reopen DB before dropping
|
||||
// second iteration - reopen DB before dropping
|
||||
for (int iter = 0; iter < 2; ++iter) {
|
||||
Open({"default"});
|
||||
CreateColumnFamiliesAndReopen({"pikachu"});
|
||||
for (int i = 0; i < 100; ++i) {
|
||||
ASSERT_OK(Put(1, std::to_string(i), "bar" + std::to_string(i)));
|
||||
}
|
||||
ASSERT_OK(Flush(1));
|
||||
|
||||
if (iter == 1) {
|
||||
Reopen();
|
||||
}
|
||||
ASSERT_EQ("bar1", Get(1, "1"));
|
||||
|
||||
ASSERT_EQ(CountLiveFiles(), 1);
|
||||
DropColumnFamilies({1});
|
||||
// make sure that all files are deleted when we drop the column family
|
||||
ASSERT_EQ(CountLiveFiles(), 0);
|
||||
Destroy();
|
||||
}
|
||||
}
|
||||
|
||||
TEST(ColumnFamilyTest, WriteBatchFailure) {
|
||||
Open();
|
||||
CreateColumnFamiliesAndReopen({"one", "two"});
|
||||
WriteBatch batch;
|
||||
batch.Put(handles_[1], Slice("non-existing"), Slice("column-family"));
|
||||
ASSERT_OK(db_->Write(WriteOptions(), &batch));
|
||||
DropColumnFamilies({1});
|
||||
Status s = db_->Write(WriteOptions(), &batch);
|
||||
ASSERT_TRUE(s.IsInvalidArgument());
|
||||
Close();
|
||||
}
|
||||
|
||||
TEST(ColumnFamilyTest, ReadWrite) {
|
||||
Open();
|
||||
CreateColumnFamiliesAndReopen({"one", "two"});
|
||||
ASSERT_OK(Put(0, "foo", "v1"));
|
||||
ASSERT_OK(Put(0, "bar", "v2"));
|
||||
ASSERT_OK(Put(1, "mirko", "v3"));
|
||||
ASSERT_OK(Put(0, "foo", "v2"));
|
||||
ASSERT_OK(Put(2, "fodor", "v5"));
|
||||
|
||||
for (int iter = 0; iter <= 3; ++iter) {
|
||||
ASSERT_EQ("v2", Get(0, "foo"));
|
||||
ASSERT_EQ("v2", Get(0, "bar"));
|
||||
ASSERT_EQ("v3", Get(1, "mirko"));
|
||||
ASSERT_EQ("v5", Get(2, "fodor"));
|
||||
ASSERT_EQ("NOT_FOUND", Get(0, "fodor"));
|
||||
ASSERT_EQ("NOT_FOUND", Get(1, "fodor"));
|
||||
ASSERT_EQ("NOT_FOUND", Get(2, "foo"));
|
||||
if (iter <= 1) {
|
||||
Reopen();
|
||||
}
|
||||
}
|
||||
Close();
|
||||
}
|
||||
|
||||
TEST(ColumnFamilyTest, IgnoreRecoveredLog) {
|
||||
std::string backup_logs = dbname_ + "/backup_logs";
|
||||
|
||||
// delete old files in backup_logs directory
|
||||
ASSERT_OK(env_->CreateDirIfMissing(dbname_));
|
||||
ASSERT_OK(env_->CreateDirIfMissing(backup_logs));
|
||||
std::vector<std::string> old_files;
|
||||
env_->GetChildren(backup_logs, &old_files);
|
||||
for (auto& file : old_files) {
|
||||
if (file != "." && file != "..") {
|
||||
env_->DeleteFile(backup_logs + "/" + file);
|
||||
}
|
||||
}
|
||||
|
||||
column_family_options_.merge_operator =
|
||||
MergeOperators::CreateUInt64AddOperator();
|
||||
db_options_.wal_dir = dbname_ + "/logs";
|
||||
Destroy();
|
||||
Open();
|
||||
CreateColumnFamilies({"cf1", "cf2"});
|
||||
|
||||
// fill up the DB
|
||||
std::string one, two, three;
|
||||
PutFixed64(&one, 1);
|
||||
PutFixed64(&two, 2);
|
||||
PutFixed64(&three, 3);
|
||||
ASSERT_OK(Merge(0, "foo", one));
|
||||
ASSERT_OK(Merge(1, "mirko", one));
|
||||
ASSERT_OK(Merge(0, "foo", one));
|
||||
ASSERT_OK(Merge(2, "bla", one));
|
||||
ASSERT_OK(Merge(2, "fodor", one));
|
||||
ASSERT_OK(Merge(0, "bar", one));
|
||||
ASSERT_OK(Merge(2, "bla", one));
|
||||
ASSERT_OK(Merge(1, "mirko", two));
|
||||
ASSERT_OK(Merge(1, "franjo", one));
|
||||
|
||||
// copy the logs to backup
|
||||
std::vector<std::string> logs;
|
||||
env_->GetChildren(db_options_.wal_dir, &logs);
|
||||
for (auto& log : logs) {
|
||||
if (log != ".." && log != ".") {
|
||||
CopyFile(db_options_.wal_dir + "/" + log, backup_logs + "/" + log);
|
||||
}
|
||||
}
|
||||
|
||||
// recover the DB
|
||||
Close();
|
||||
|
||||
// 1. check consistency
|
||||
// 2. copy the logs from backup back to WAL dir. if the recovery happens
|
||||
// again on the same log files, this should lead to incorrect results
|
||||
// due to applying merge operator twice
|
||||
// 3. check consistency
|
||||
for (int iter = 0; iter < 2; ++iter) {
|
||||
// assert consistency
|
||||
Open({"default", "cf1", "cf2"});
|
||||
ASSERT_EQ(two, Get(0, "foo"));
|
||||
ASSERT_EQ(one, Get(0, "bar"));
|
||||
ASSERT_EQ(three, Get(1, "mirko"));
|
||||
ASSERT_EQ(one, Get(1, "franjo"));
|
||||
ASSERT_EQ(one, Get(2, "fodor"));
|
||||
ASSERT_EQ(two, Get(2, "bla"));
|
||||
Close();
|
||||
|
||||
if (iter == 0) {
|
||||
// copy the logs from backup back to wal dir
|
||||
for (auto& log : logs) {
|
||||
if (log != ".." && log != ".") {
|
||||
CopyFile(backup_logs + "/" + log, db_options_.wal_dir + "/" + log);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
TEST(ColumnFamilyTest, FlushTest) {
|
||||
Open();
|
||||
CreateColumnFamiliesAndReopen({"one", "two"});
|
||||
ASSERT_OK(Put(0, "foo", "v1"));
|
||||
ASSERT_OK(Put(0, "bar", "v2"));
|
||||
ASSERT_OK(Put(1, "mirko", "v3"));
|
||||
ASSERT_OK(Put(0, "foo", "v2"));
|
||||
ASSERT_OK(Put(2, "fodor", "v5"));
|
||||
for (int i = 0; i < 3; ++i) {
|
||||
Flush(i);
|
||||
}
|
||||
Reopen();
|
||||
|
||||
for (int iter = 0; iter <= 2; ++iter) {
|
||||
ASSERT_EQ("v2", Get(0, "foo"));
|
||||
ASSERT_EQ("v2", Get(0, "bar"));
|
||||
ASSERT_EQ("v3", Get(1, "mirko"));
|
||||
ASSERT_EQ("v5", Get(2, "fodor"));
|
||||
ASSERT_EQ("NOT_FOUND", Get(0, "fodor"));
|
||||
ASSERT_EQ("NOT_FOUND", Get(1, "fodor"));
|
||||
ASSERT_EQ("NOT_FOUND", Get(2, "foo"));
|
||||
if (iter <= 1) {
|
||||
Reopen();
|
||||
}
|
||||
}
|
||||
Close();
|
||||
}
|
||||
|
||||
// Makes sure that obsolete log files get deleted
|
||||
TEST(ColumnFamilyTest, LogDeletionTest) {
|
||||
db_options_.max_total_wal_size = std::numeric_limits<uint64_t>::max();
|
||||
column_family_options_.write_buffer_size = 100000; // 100KB
|
||||
Open();
|
||||
CreateColumnFamilies({"one", "two", "three", "four"});
|
||||
// Each bracket is one log file. if number is in (), it means
|
||||
// we don't need it anymore (it's been flushed)
|
||||
// []
|
||||
ASSERT_EQ(CountLiveLogFiles(), 0);
|
||||
PutRandomData(0, 1, 100);
|
||||
// [0]
|
||||
PutRandomData(1, 1, 100);
|
||||
// [0, 1]
|
||||
PutRandomData(1, 1000, 100);
|
||||
WaitForFlush(1);
|
||||
// [0, (1)] [1]
|
||||
ASSERT_EQ(CountLiveLogFiles(), 2);
|
||||
PutRandomData(0, 1, 100);
|
||||
// [0, (1)] [0, 1]
|
||||
ASSERT_EQ(CountLiveLogFiles(), 2);
|
||||
PutRandomData(2, 1, 100);
|
||||
// [0, (1)] [0, 1, 2]
|
||||
PutRandomData(2, 1000, 100);
|
||||
WaitForFlush(2);
|
||||
// [0, (1)] [0, 1, (2)] [2]
|
||||
ASSERT_EQ(CountLiveLogFiles(), 3);
|
||||
PutRandomData(2, 1000, 100);
|
||||
WaitForFlush(2);
|
||||
// [0, (1)] [0, 1, (2)] [(2)] [2]
|
||||
ASSERT_EQ(CountLiveLogFiles(), 4);
|
||||
PutRandomData(3, 1, 100);
|
||||
// [0, (1)] [0, 1, (2)] [(2)] [2, 3]
|
||||
PutRandomData(1, 1, 100);
|
||||
// [0, (1)] [0, 1, (2)] [(2)] [1, 2, 3]
|
||||
ASSERT_EQ(CountLiveLogFiles(), 4);
|
||||
PutRandomData(1, 1000, 100);
|
||||
WaitForFlush(1);
|
||||
// [0, (1)] [0, (1), (2)] [(2)] [(1), 2, 3] [1]
|
||||
ASSERT_EQ(CountLiveLogFiles(), 5);
|
||||
PutRandomData(0, 1000, 100);
|
||||
WaitForFlush(0);
|
||||
// [(0), (1)] [(0), (1), (2)] [(2)] [(1), 2, 3] [1, (0)] [0]
|
||||
// delete obsolete logs -->
|
||||
// [(1), 2, 3] [1, (0)] [0]
|
||||
ASSERT_EQ(CountLiveLogFiles(), 3);
|
||||
PutRandomData(0, 1000, 100);
|
||||
WaitForFlush(0);
|
||||
// [(1), 2, 3] [1, (0)], [(0)] [0]
|
||||
ASSERT_EQ(CountLiveLogFiles(), 4);
|
||||
PutRandomData(1, 1000, 100);
|
||||
WaitForFlush(1);
|
||||
// [(1), 2, 3] [(1), (0)] [(0)] [0, (1)] [1]
|
||||
ASSERT_EQ(CountLiveLogFiles(), 5);
|
||||
PutRandomData(2, 1000, 100);
|
||||
WaitForFlush(2);
|
||||
// [(1), (2), 3] [(1), (0)] [(0)] [0, (1)] [1, (2)], [2]
|
||||
ASSERT_EQ(CountLiveLogFiles(), 6);
|
||||
PutRandomData(3, 1000, 100);
|
||||
WaitForFlush(3);
|
||||
// [(1), (2), (3)] [(1), (0)] [(0)] [0, (1)] [1, (2)], [2, (3)] [3]
|
||||
// delete obsolete logs -->
|
||||
// [0, (1)] [1, (2)], [2, (3)] [3]
|
||||
ASSERT_EQ(CountLiveLogFiles(), 4);
|
||||
Close();
|
||||
}
|
||||
|
||||
// Makes sure that obsolete log files get deleted
|
||||
TEST(ColumnFamilyTest, DifferentWriteBufferSizes) {
|
||||
// disable flushing stale column families
|
||||
db_options_.max_total_wal_size = std::numeric_limits<uint64_t>::max();
|
||||
Open();
|
||||
CreateColumnFamilies({"one", "two", "three"});
|
||||
ColumnFamilyOptions default_cf, one, two, three;
|
||||
// setup options. all column families have max_write_buffer_number setup to 10
|
||||
// "default" -> 100KB memtable, start flushing immediatelly
|
||||
// "one" -> 200KB memtable, start flushing with two immutable memtables
|
||||
// "two" -> 1MB memtable, start flushing with three immutable memtables
|
||||
// "three" -> 90KB memtable, start flushing with four immutable memtables
|
||||
default_cf.write_buffer_size = 100000;
|
||||
default_cf.max_write_buffer_number = 10;
|
||||
default_cf.min_write_buffer_number_to_merge = 1;
|
||||
one.write_buffer_size = 200000;
|
||||
one.max_write_buffer_number = 10;
|
||||
one.min_write_buffer_number_to_merge = 2;
|
||||
two.write_buffer_size = 1000000;
|
||||
two.max_write_buffer_number = 10;
|
||||
two.min_write_buffer_number_to_merge = 3;
|
||||
three.write_buffer_size = 90000;
|
||||
three.max_write_buffer_number = 10;
|
||||
three.min_write_buffer_number_to_merge = 4;
|
||||
|
||||
Reopen({default_cf, one, two, three});
|
||||
|
||||
int micros_wait_for_flush = 10000;
|
||||
PutRandomData(0, 100, 1000);
|
||||
WaitForFlush(0);
|
||||
AssertNumberOfImmutableMemtables({0, 0, 0, 0});
|
||||
ASSERT_EQ(CountLiveLogFiles(), 1);
|
||||
PutRandomData(1, 200, 1000);
|
||||
env_->SleepForMicroseconds(micros_wait_for_flush);
|
||||
AssertNumberOfImmutableMemtables({0, 1, 0, 0});
|
||||
ASSERT_EQ(CountLiveLogFiles(), 2);
|
||||
PutRandomData(2, 1000, 1000);
|
||||
env_->SleepForMicroseconds(micros_wait_for_flush);
|
||||
AssertNumberOfImmutableMemtables({0, 1, 1, 0});
|
||||
ASSERT_EQ(CountLiveLogFiles(), 3);
|
||||
PutRandomData(2, 1000, 1000);
|
||||
env_->SleepForMicroseconds(micros_wait_for_flush);
|
||||
AssertNumberOfImmutableMemtables({0, 1, 2, 0});
|
||||
ASSERT_EQ(CountLiveLogFiles(), 4);
|
||||
PutRandomData(3, 90, 1000);
|
||||
env_->SleepForMicroseconds(micros_wait_for_flush);
|
||||
AssertNumberOfImmutableMemtables({0, 1, 2, 1});
|
||||
ASSERT_EQ(CountLiveLogFiles(), 5);
|
||||
PutRandomData(3, 90, 1000);
|
||||
env_->SleepForMicroseconds(micros_wait_for_flush);
|
||||
AssertNumberOfImmutableMemtables({0, 1, 2, 2});
|
||||
ASSERT_EQ(CountLiveLogFiles(), 6);
|
||||
PutRandomData(3, 90, 1000);
|
||||
env_->SleepForMicroseconds(micros_wait_for_flush);
|
||||
AssertNumberOfImmutableMemtables({0, 1, 2, 3});
|
||||
ASSERT_EQ(CountLiveLogFiles(), 7);
|
||||
PutRandomData(0, 100, 1000);
|
||||
WaitForFlush(0);
|
||||
AssertNumberOfImmutableMemtables({0, 1, 2, 3});
|
||||
ASSERT_EQ(CountLiveLogFiles(), 8);
|
||||
PutRandomData(2, 100, 10000);
|
||||
WaitForFlush(2);
|
||||
AssertNumberOfImmutableMemtables({0, 1, 0, 3});
|
||||
ASSERT_EQ(CountLiveLogFiles(), 9);
|
||||
PutRandomData(3, 90, 1000);
|
||||
WaitForFlush(3);
|
||||
AssertNumberOfImmutableMemtables({0, 1, 0, 0});
|
||||
ASSERT_EQ(CountLiveLogFiles(), 10);
|
||||
PutRandomData(3, 90, 1000);
|
||||
env_->SleepForMicroseconds(micros_wait_for_flush);
|
||||
AssertNumberOfImmutableMemtables({0, 1, 0, 1});
|
||||
ASSERT_EQ(CountLiveLogFiles(), 11);
|
||||
PutRandomData(1, 200, 1000);
|
||||
WaitForFlush(1);
|
||||
AssertNumberOfImmutableMemtables({0, 0, 0, 1});
|
||||
ASSERT_EQ(CountLiveLogFiles(), 5);
|
||||
PutRandomData(3, 90*6, 1000);
|
||||
WaitForFlush(3);
|
||||
AssertNumberOfImmutableMemtables({0, 0, 0, 0});
|
||||
ASSERT_EQ(CountLiveLogFiles(), 12);
|
||||
PutRandomData(0, 100, 1000);
|
||||
WaitForFlush(0);
|
||||
AssertNumberOfImmutableMemtables({0, 0, 0, 0});
|
||||
ASSERT_EQ(CountLiveLogFiles(), 12);
|
||||
PutRandomData(2, 3*100, 10000);
|
||||
WaitForFlush(2);
|
||||
AssertNumberOfImmutableMemtables({0, 0, 0, 0});
|
||||
ASSERT_EQ(CountLiveLogFiles(), 12);
|
||||
PutRandomData(1, 2*200, 1000);
|
||||
WaitForFlush(1);
|
||||
AssertNumberOfImmutableMemtables({0, 0, 0, 0});
|
||||
ASSERT_EQ(CountLiveLogFiles(), 7);
|
||||
Close();
|
||||
}
|
||||
|
||||
TEST(ColumnFamilyTest, DifferentMergeOperators) {
|
||||
Open();
|
||||
CreateColumnFamilies({"first", "second"});
|
||||
ColumnFamilyOptions default_cf, first, second;
|
||||
first.merge_operator = MergeOperators::CreateUInt64AddOperator();
|
||||
second.merge_operator = MergeOperators::CreateStringAppendOperator();
|
||||
Reopen({default_cf, first, second});
|
||||
|
||||
std::string one, two, three;
|
||||
PutFixed64(&one, 1);
|
||||
PutFixed64(&two, 2);
|
||||
PutFixed64(&three, 3);
|
||||
|
||||
ASSERT_OK(Put(0, "foo", two));
|
||||
ASSERT_OK(Put(0, "foo", one));
|
||||
ASSERT_TRUE(Merge(0, "foo", two).IsNotSupported());
|
||||
ASSERT_EQ(Get(0, "foo"), one);
|
||||
|
||||
ASSERT_OK(Put(1, "foo", two));
|
||||
ASSERT_OK(Put(1, "foo", one));
|
||||
ASSERT_OK(Merge(1, "foo", two));
|
||||
ASSERT_EQ(Get(1, "foo"), three);
|
||||
|
||||
ASSERT_OK(Put(2, "foo", two));
|
||||
ASSERT_OK(Put(2, "foo", one));
|
||||
ASSERT_OK(Merge(2, "foo", two));
|
||||
ASSERT_EQ(Get(2, "foo"), one + "," + two);
|
||||
Close();
|
||||
}
|
||||
|
||||
TEST(ColumnFamilyTest, DifferentCompactionStyles) {
|
||||
Open();
|
||||
CreateColumnFamilies({"one", "two"});
|
||||
ColumnFamilyOptions default_cf, one, two;
|
||||
db_options_.max_open_files = 20; // only 10 files in file cache
|
||||
db_options_.disableDataSync = true;
|
||||
|
||||
default_cf.compaction_style = kCompactionStyleLevel;
|
||||
default_cf.num_levels = 3;
|
||||
default_cf.write_buffer_size = 64 << 10; // 64KB
|
||||
default_cf.target_file_size_base = 30 << 10;
|
||||
default_cf.filter_policy = nullptr;
|
||||
default_cf.no_block_cache = true;
|
||||
default_cf.source_compaction_factor = 100;
|
||||
default_cf.disable_seek_compaction = false;
|
||||
|
||||
one.compaction_style = kCompactionStyleUniversal;
|
||||
// trigger compaction if there are >= 4 files
|
||||
one.level0_file_num_compaction_trigger = 4;
|
||||
one.write_buffer_size = 100000;
|
||||
|
||||
two.compaction_style = kCompactionStyleLevel;
|
||||
two.num_levels = 4;
|
||||
two.max_mem_compaction_level = 0;
|
||||
two.level0_file_num_compaction_trigger = 3;
|
||||
two.write_buffer_size = 100000;
|
||||
|
||||
Reopen({default_cf, one, two});
|
||||
|
||||
// SETUP column family "default" - test read compaction
|
||||
ASSERT_EQ("", FilesPerLevel(0));
|
||||
PutRandomData(0, 1, 4096);
|
||||
ASSERT_OK(Flush(0));
|
||||
ASSERT_EQ("0,0,1", FilesPerLevel(0));
|
||||
// write 8MB
|
||||
PutRandomData(0, 2000, 4096);
|
||||
ASSERT_OK(Flush(0));
|
||||
// clear levels 0 and 1
|
||||
dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[0]);
|
||||
dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[0]);
|
||||
ASSERT_EQ(NumTableFilesAtLevel(0, 0), 0);
|
||||
ASSERT_EQ(NumTableFilesAtLevel(1, 0), 0);
|
||||
// write some new keys into level 0 and 1
|
||||
PutRandomData(0, 1024, 512);
|
||||
ASSERT_OK(Flush(0));
|
||||
WaitForCompaction();
|
||||
PutRandomData(0, 10, 512);
|
||||
ASSERT_OK(Flush(0));
|
||||
// remember number of files in each level
|
||||
int l1 = NumTableFilesAtLevel(0, 0);
|
||||
int l2 = NumTableFilesAtLevel(1, 0);
|
||||
int l3 = NumTableFilesAtLevel(2, 0);
|
||||
ASSERT_NE(l1, 0);
|
||||
ASSERT_NE(l2, 0);
|
||||
ASSERT_NE(l3, 0);
|
||||
|
||||
// SETUP column family "one" -- universal style
|
||||
for (int i = 0; i < one.level0_file_num_compaction_trigger - 1; ++i) {
|
||||
PutRandomData(1, 11, 10000);
|
||||
WaitForFlush(1);
|
||||
ASSERT_EQ(std::to_string(i + 1), FilesPerLevel(1));
|
||||
}
|
||||
|
||||
// SETUP column family "two" -- level style with 4 levels
|
||||
for (int i = 0; i < two.level0_file_num_compaction_trigger - 1; ++i) {
|
||||
PutRandomData(2, 15, 10000);
|
||||
WaitForFlush(2);
|
||||
ASSERT_EQ(std::to_string(i + 1), FilesPerLevel(2));
|
||||
}
|
||||
|
||||
// TRIGGER compaction "default"
|
||||
// read a bunch of times, trigger read compaction
|
||||
for (int i = 0; i < 200000; ++i) {
|
||||
Get(0, std::to_string(i));
|
||||
}
|
||||
|
||||
// TRIGGER compaction "one"
|
||||
PutRandomData(1, 12, 10000);
|
||||
|
||||
// TRIGGER compaction "two"
|
||||
PutRandomData(2, 10, 10000);
|
||||
|
||||
// WAIT for compactions
|
||||
WaitForCompaction();
|
||||
|
||||
// VERIFY compaction "default"
|
||||
// verify that the number of files have decreased
|
||||
// in some level, indicating that there was a compaction
|
||||
ASSERT_TRUE(NumTableFilesAtLevel(0, 0) < l1 ||
|
||||
NumTableFilesAtLevel(1, 0) < l2 ||
|
||||
NumTableFilesAtLevel(2, 0) < l3);
|
||||
|
||||
// VERIFY compaction "one"
|
||||
ASSERT_EQ("1", FilesPerLevel(1));
|
||||
|
||||
// VERIFY compaction "two"
|
||||
ASSERT_EQ("0,1", FilesPerLevel(2));
|
||||
CompactAll(2);
|
||||
ASSERT_EQ("0,1", FilesPerLevel(2));
|
||||
|
||||
Close();
|
||||
}
|
||||
|
||||
namespace {
|
||||
std::string IterStatus(Iterator* iter) {
|
||||
std::string result;
|
||||
if (iter->Valid()) {
|
||||
result = iter->key().ToString() + "->" + iter->value().ToString();
|
||||
} else {
|
||||
result = "(invalid)";
|
||||
}
|
||||
return result;
|
||||
}
|
||||
} // anonymous namespace
|
||||
|
||||
// Verifies NewIterators() returns one consistent iterator per column family,
// and that only tailing iterators observe writes made after creation.
TEST(ColumnFamilyTest, NewIteratorsTest) {
  // iter == 0 -- no tailing
  // iter == 1 -- tailing
  for (int iter = 0; iter < 2; ++iter) {
    Open();
    CreateColumnFamiliesAndReopen({"one", "two"});
    ASSERT_OK(Put(0, "a", "b"));
    ASSERT_OK(Put(1, "b", "a"));
    ASSERT_OK(Put(2, "c", "m"));
    ASSERT_OK(Put(2, "v", "t"));
    std::vector<Iterator*> iterators;
    ReadOptions options;
    options.tailing = (iter == 1);
    ASSERT_OK(db_->NewIterators(options, handles_, &iterators));

    for (auto it : iterators) {
      it->SeekToFirst();
    }
    ASSERT_EQ(IterStatus(iterators[0]), "a->b");
    ASSERT_EQ(IterStatus(iterators[1]), "b->a");
    ASSERT_EQ(IterStatus(iterators[2]), "c->m");

    // Write to CF 1 after the iterators were created.
    ASSERT_OK(Put(1, "x", "x"));

    for (auto it : iterators) {
      it->Next();
    }

    ASSERT_EQ(IterStatus(iterators[0]), "(invalid)");
    if (iter == 0) {
      // no tailing: the post-creation write must be invisible
      ASSERT_EQ(IterStatus(iterators[1]), "(invalid)");
    } else {
      // tailing: the post-creation write must be visible
      ASSERT_EQ(IterStatus(iterators[1]), "x->x");
    }
    ASSERT_EQ(IterStatus(iterators[2]), "v->t");

    for (auto it : iterators) {
      delete it;
    }
    Destroy();
  }
}
|
||||
|
||||
// Verifies read-only open: a subset of column families can be opened, reads
// against them succeed, and opens that name a dropped CF or omit "default"
// must fail.
TEST(ColumnFamilyTest, ReadOnlyDBTest) {
  Open();
  CreateColumnFamiliesAndReopen({"one", "two", "three", "four"});
  ASSERT_OK(Put(1, "foo", "bla"));
  ASSERT_OK(Put(2, "foo", "blabla"));
  ASSERT_OK(Put(3, "foo", "blablabla"));
  ASSERT_OK(Put(4, "foo", "blablablabla"));

  DropColumnFamilies({2});
  Close();
  // open only a subset of column families
  // NOTE: after this open, handle index 2 refers to CF "four".
  AssertOpenReadOnly({"default", "one", "four"});
  ASSERT_EQ("NOT_FOUND", Get(0, "foo"));
  ASSERT_EQ("bla", Get(1, "foo"));
  ASSERT_EQ("blablablabla", Get(2, "foo"));

  Close();
  // can't open dropped column family ("two" was dropped above)
  Status s = OpenReadOnly({"default", "one", "two"});
  ASSERT_TRUE(!s.ok());

  // Can't open without specifying default column family
  s = OpenReadOnly({"one", "four"});
  ASSERT_TRUE(!s.ok());
}
|
||||
|
||||
// Verifies that flushing all column families creates exactly one new
// writable file per CF (the SST) plus one shared log -- i.e. no extra WAL
// files are rolled for empty logs.
TEST(ColumnFamilyTest, DontRollEmptyLogs) {
  Open();
  CreateColumnFamiliesAndReopen({"one", "two", "three", "four"});

  // Put a little data into every column family (including default).
  for (size_t i = 0; i < handles_.size(); ++i) {
    PutRandomData(i, 10, 100);
  }
  int num_writable_file_start = env_->GetNumberOfNewWritableFileCalls();
  // this will trigger the flushes
  ASSERT_OK(db_->Write(WriteOptions(), nullptr));

  for (int i = 0; i < 4; ++i) {
    dbfull()->TEST_WaitForFlushMemTable(handles_[i]);
  }
  int total_new_writable_files =
      env_->GetNumberOfNewWritableFileCalls() - num_writable_file_start;
  // One SST per column family plus a single new log file.
  ASSERT_EQ(static_cast<size_t>(total_new_writable_files), handles_.size() + 1);
  Close();
}
|
||||
|
||||
// Verifies that once max_total_wal_size is exceeded, a column family with
// old (stale) unflushed data is force-flushed, while an empty column family
// is not flushed at all.
TEST(ColumnFamilyTest, FlushStaleColumnFamilies) {
  Open();
  CreateColumnFamilies({"one", "two"});
  ColumnFamilyOptions default_cf, one, two;
  default_cf.write_buffer_size = 100000;  // small write buffer size
  default_cf.disable_auto_compactions = true;
  one.disable_auto_compactions = true;
  two.disable_auto_compactions = true;
  // Cap total WAL size so stale CFs get flushed when logs roll.
  db_options_.max_total_wal_size = 210000;

  Reopen({default_cf, one, two});

  PutRandomData(2, 1, 10);  // 10 bytes
  for (int i = 0; i < 2; ++i) {
    PutRandomData(0, 100, 1000);  // flush
    WaitForFlush(0);
    ASSERT_EQ(i + 1, CountLiveFiles());
  }
  // third flush. now, CF [two] should be detected as stale and flushed
  // column family 1 should not be flushed since it's empty
  PutRandomData(0, 100, 1000);  // flush
  WaitForFlush(0);
  WaitForFlush(2);
  // 3 files for default column families, 1 file for column family [two], zero
  // files for column family [one], because it's empty
  ASSERT_EQ(4, CountLiveFiles());
  Close();
}
|
||||
|
||||
} // namespace rocksdb
|
||||
|
||||
int main(int argc, char** argv) {
  // Run all registered rocksdb unit tests; the return value is the
  // process exit code.
  return rocksdb::test::RunAllTests();
}
|
||||
253
db/compaction.cc
Normal file
253
db/compaction.cc
Normal file
@@ -0,0 +1,253 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include "db/compaction.h"
|
||||
|
||||
#define __STDC_FORMAT_MACROS
|
||||
#include <inttypes.h>
|
||||
#include <vector>
|
||||
|
||||
#include "db/column_family.h"
|
||||
#include "util/logging.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
// Sum file_size over the leading non-null entries of `files`; a null
// pointer acts as a terminator and stops the accumulation.
static uint64_t TotalFileSize(const std::vector<FileMetaData*>& files) {
  uint64_t total = 0;
  for (const auto* file : files) {
    if (file == nullptr) {
      break;  // null entry terminates the list
    }
    total += file->file_size;
  }
  return total;
}
|
||||
|
||||
// Construct a compaction reading from `level` and writing to `out_level` of
// `input_version`.  Takes a reference on both the version and its column
// family so they stay alive for the duration of the compaction; the matching
// Unref()s happen in ReleaseInputs() or the destructor.
Compaction::Compaction(Version* input_version, int level, int out_level,
                       uint64_t target_file_size,
                       uint64_t max_grandparent_overlap_bytes,
                       bool seek_compaction, bool enable_compression,
                       bool deletion_compaction)
    : level_(level),
      out_level_(out_level),
      max_output_file_size_(target_file_size),
      max_grandparent_overlap_bytes_(max_grandparent_overlap_bytes),
      input_version_(input_version),
      number_levels_(input_version_->NumberLevels()),
      cfd_(input_version_->cfd_),
      seek_compaction_(seek_compaction),
      enable_compression_(enable_compression),
      deletion_compaction_(deletion_compaction),
      grandparent_index_(0),
      seen_key_(false),
      overlapped_bytes_(0),
      base_index_(-1),
      parent_index_(-1),
      score_(0),
      bottommost_level_(false),
      is_full_compaction_(false),
      is_manual_compaction_(false),
      level_ptrs_(std::vector<size_t>(number_levels_)) {

  // Pin the column family and version for the lifetime of this compaction.
  cfd_->Ref();
  input_version_->Ref();
  edit_ = new VersionEdit();
  edit_->SetColumnFamily(cfd_->GetID());
  // Start the IsBaseLevelForKey() cursors at the beginning of every level.
  for (int i = 0; i < number_levels_; i++) {
    level_ptrs_[i] = 0;
  }
}
|
||||
|
||||
// Release everything the compaction pinned at construction time: the edit,
// the reference on the input version, and the reference on the column
// family (deleting the latter when this was its last reference).
Compaction::~Compaction() {
  delete edit_;
  if (input_version_ != nullptr) {
    input_version_->Unref();
  }
  if (cfd_ != nullptr && cfd_->Unref()) {
    delete cfd_;
  }
}
|
||||
|
||||
// A trivial move relocates a single input file to the output level without
// reading or rewriting it.
bool Compaction::IsTrivialMove() const {
  // Avoid a move if there is lots of overlapping grandparent data.
  // Otherwise, the move could create a parent file that will require
  // a very expensive merge later on.
  // If level_ == out_level_, the purpose is to force compaction filter to be
  // applied to that level, and thus it cannot be a trivial move.
  return (level_ != out_level_ &&
          num_input_files(0) == 1 &&
          num_input_files(1) == 0 &&
          TotalFileSize(grandparents_) <= max_grandparent_overlap_bytes_);
}
|
||||
|
||||
// True when the compaction simply deletes every file in inputs_[0]
// instead of merging them.
bool Compaction::IsDeletionCompaction() const { return deletion_compaction_; }
|
||||
|
||||
// Record every input file (from both input levels) as deleted in *edit.
void Compaction::AddInputDeletions(VersionEdit* edit) {
  for (int which = 0; which < 2; which++) {
    for (const auto* file : inputs_[which]) {
      edit->DeleteFile(level_ + which, file->number);
    }
  }
}
|
||||
|
||||
// Returns true if no level below the output can contain user_key, i.e. the
// output level is effectively the "base" level for this key.  Relies on
// level_ptrs_ cursors that only move forward, which is valid because the
// compaction iterates keys in sorted order.
bool Compaction::IsBaseLevelForKey(const Slice& user_key) {
  assert(cfd_->options()->compaction_style != kCompactionStyleFIFO);
  if (cfd_->options()->compaction_style == kCompactionStyleUniversal) {
    // Universal style: precomputed at compaction-pick time.
    return bottommost_level_;
  }
  // Maybe use binary search to find right entry instead of linear search?
  const Comparator* user_cmp = cfd_->user_comparator();
  for (int lvl = level_ + 2; lvl < number_levels_; lvl++) {
    const std::vector<FileMetaData*>& files = input_version_->files_[lvl];
    // Advance this level's cursor until the current file's largest key is
    // not smaller than user_key.
    for (; level_ptrs_[lvl] < files.size(); ) {
      FileMetaData* f = files[level_ptrs_[lvl]];
      if (user_cmp->Compare(user_key, f->largest.user_key()) <= 0) {
        // We've advanced far enough
        if (user_cmp->Compare(user_key, f->smallest.user_key()) >= 0) {
          // Key falls in this file's range, so definitely not base level
          return false;
        }
        break;
      }
      level_ptrs_[lvl]++;
    }
  }
  return true;
}
|
||||
|
||||
// Returns true when the current output file should be closed before writing
// internal_key, to limit how many grandparent-level bytes a single output
// file overlaps.  Maintains a forward-only cursor over grandparents_, so
// keys must be presented in sorted order.
bool Compaction::ShouldStopBefore(const Slice& internal_key) {
  // Scan to find earliest grandparent file that contains key.
  const InternalKeyComparator* icmp = &cfd_->internal_comparator();
  while (grandparent_index_ < grandparents_.size() &&
      icmp->Compare(internal_key,
                    grandparents_[grandparent_index_]->largest.Encode()) > 0) {
    if (seen_key_) {
      // Only count overlap once we have emitted at least one key.
      overlapped_bytes_ += grandparents_[grandparent_index_]->file_size;
    }
    // Grandparent files must be sorted and non-overlapping.
    assert(grandparent_index_ + 1 >= grandparents_.size() ||
           icmp->Compare(grandparents_[grandparent_index_]->largest.Encode(),
                         grandparents_[grandparent_index_+1]->smallest.Encode())
                    < 0);
    grandparent_index_++;
  }
  seen_key_ = true;

  if (overlapped_bytes_ > max_grandparent_overlap_bytes_) {
    // Too much overlap for current output; start new output
    overlapped_bytes_ = 0;
    return true;
  } else {
    return false;
  }
}
|
||||
|
||||
// Mark (or clear) each file that is being compacted
|
||||
void Compaction::MarkFilesBeingCompacted(bool value) {
|
||||
for (int i = 0; i < 2; i++) {
|
||||
std::vector<FileMetaData*> v = inputs_[i];
|
||||
for (unsigned int j = 0; j < inputs_[i].size(); j++) {
|
||||
assert(value ? !inputs_[i][j]->being_compacted :
|
||||
inputs_[i][j]->being_compacted);
|
||||
inputs_[i][j]->being_compacted = value;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Is this compaction producing files at the bottommost level?
void Compaction::SetupBottomMostLevel(bool isManual) {
  assert(cfd_->options()->compaction_style != kCompactionStyleFIFO);
  if (cfd_->options()->compaction_style == kCompactionStyleUniversal) {
    // If universal compaction style is used and manual
    // compaction is occurring, then we are guaranteed that
    // all files will be picked in a single compaction
    // run. We can safely set bottommost_level_ = true.
    // If it is not manual compaction, then bottommost_level_
    // is already set when the Compaction was created.
    if (isManual) {
      bottommost_level_ = true;
    }
    return;
  }
  // Level style: bottommost iff every level below the output is empty.
  bottommost_level_ = true;
  for (int i = output_level() + 1; i < number_levels_; i++) {
    if (input_version_->NumLevelFiles(i) > 0) {
      bottommost_level_ = false;
      break;
    }
  }
}
|
||||
|
||||
void Compaction::ReleaseInputs() {
|
||||
if (input_version_ != nullptr) {
|
||||
input_version_->Unref();
|
||||
input_version_ = nullptr;
|
||||
}
|
||||
if (cfd_ != nullptr) {
|
||||
if (cfd_->Unref()) {
|
||||
delete cfd_;
|
||||
}
|
||||
cfd_ = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
// Forward to the column family's compaction picker, which clears the
// being-compacted flags and removes this compaction from its running set.
void Compaction::ReleaseCompactionFiles(Status status) {
  cfd_->compaction_picker()->ReleaseCompactionFiles(this, status);
}
|
||||
|
||||
// On compaction failure, rewind the input version's next-compaction cursor
// for this level so the same files can be picked again.
void Compaction::ResetNextCompactionIndex() {
  input_version_->ResetNextCompactionIndex(level_);
}
|
||||
|
||||
namespace {
|
||||
int InputSummary(const std::vector<FileMetaData*>& files, char* output,
|
||||
int len) {
|
||||
*output = '\0';
|
||||
int write = 0;
|
||||
for (unsigned int i = 0; i < files.size(); i++) {
|
||||
int sz = len - write;
|
||||
int ret;
|
||||
char sztxt[16];
|
||||
AppendHumanBytes(files.at(i)->file_size, sztxt, 16);
|
||||
ret = snprintf(output + write, sz, "%" PRIu64 "(%s) ", files.at(i)->number,
|
||||
sztxt);
|
||||
if (ret < 0 || ret >= sz) break;
|
||||
write += ret;
|
||||
}
|
||||
// if files.size() is non-zero, overwrite the last space
|
||||
return write - !!files.size();
|
||||
}
|
||||
} // namespace
|
||||
|
||||
// Write a one-line human-readable description of this compaction (base
// version, level, and both input file lists) into output[0..len-1].
// Output is silently truncated if len is too small.
void Compaction::Summary(char* output, int len) {
  int write =
      snprintf(output, len, "Base version %" PRIu64
               " Base level %d, seek compaction:%d, inputs: [",
               input_version_->GetVersionNumber(), level_, seek_compaction_);
  if (write < 0 || write >= len) {
    return;  // truncated; stop appending
  }

  write += InputSummary(inputs_[0], output + write, len - write);
  if (write < 0 || write >= len) {
    return;
  }

  write += snprintf(output + write, len - write, "], [");
  if (write < 0 || write >= len) {
    return;
  }

  write += InputSummary(inputs_[1], output + write, len - write);
  if (write < 0 || write >= len) {
    return;
  }

  snprintf(output + write, len - write, "]");
}
|
||||
|
||||
} // namespace rocksdb
|
||||
158
db/compaction.h
Normal file
158
db/compaction.h
Normal file
@@ -0,0 +1,158 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#pragma once
|
||||
#include "db/version_set.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
class Version;
|
||||
class ColumnFamilyData;
|
||||
|
||||
// A Compaction encapsulates information about a compaction: the input and
// output levels, the participating files, and the bookkeeping state used
// while the compaction runs.  Instances are created only by the compaction
// pickers (see friend declarations below).
class Compaction {
 public:
  ~Compaction();

  // Return the level that is being compacted. Inputs from "level"
  // will be merged.
  int level() const { return level_; }

  // Outputs will go to this level
  int output_level() const { return out_level_; }

  // Return the object that holds the edits to the descriptor done
  // by this compaction.
  VersionEdit* edit() { return edit_; }

  // "which" must be either 0 or 1
  int num_input_files(int which) const { return inputs_[which].size(); }

  // Returns input version of the compaction
  Version* input_version() const { return input_version_; }

  // Column family this compaction operates on.
  ColumnFamilyData* column_family_data() const { return cfd_; }

  // Return the ith input file at "level()+which" ("which" must be 0 or 1).
  FileMetaData* input(int which, int i) const { return inputs_[which][i]; }

  // Mutable access to one of the two input file lists.
  std::vector<FileMetaData*>* inputs(int which) { return &inputs_[which]; }

  // Maximum size of files to build during this compaction.
  uint64_t MaxOutputFileSize() const { return max_output_file_size_; }

  // Whether compression will be enabled for compaction outputs
  bool enable_compression() const { return enable_compression_; }

  // Is this a trivial compaction that can be implemented by just
  // moving a single input file to the next level (no merging or splitting)
  bool IsTrivialMove() const;

  // If true, just delete all files in inputs_[0]
  bool IsDeletionCompaction() const;

  // Add all inputs to this compaction as delete operations to *edit.
  void AddInputDeletions(VersionEdit* edit);

  // Returns true if the information we have available guarantees that
  // the compaction is producing data in "level+1" for which no data exists
  // in levels greater than "level+1".
  bool IsBaseLevelForKey(const Slice& user_key);

  // Returns true iff we should stop building the current output
  // before processing "internal_key".
  bool ShouldStopBefore(const Slice& internal_key);

  // Release the input version for the compaction, once the compaction
  // is successful.
  void ReleaseInputs();

  // Clear all files to indicate that they are not being compacted
  // Delete this compaction from the list of running compactions.
  void ReleaseCompactionFiles(Status status);

  // Write a human-readable description of this compaction into
  // output[0..len-1] (best effort; truncates if len is too small).
  void Summary(char* output, int len);

  // Return the score that was used to pick this compaction run.
  double score() const { return score_; }

  // Is this compaction creating a file in the bottom most level?
  bool BottomMostLevel() { return bottommost_level_; }

  // Does this compaction include all sst files?
  bool IsFullCompaction() { return is_full_compaction_; }

  // Was this compaction triggered manually by the client?
  bool IsManualCompaction() { return is_manual_compaction_; }

 private:
  // Compactions are created only by the pickers.
  friend class CompactionPicker;
  friend class UniversalCompactionPicker;
  friend class FIFOCompactionPicker;
  friend class LevelCompactionPicker;

  Compaction(Version* input_version, int level, int out_level,
             uint64_t target_file_size, uint64_t max_grandparent_overlap_bytes,
             bool seek_compaction = false, bool enable_compression = true,
             bool deletion_compaction = false);

  int level_;
  int out_level_;  // levels to which output files are stored
  uint64_t max_output_file_size_;
  uint64_t max_grandparent_overlap_bytes_;
  Version* input_version_;
  VersionEdit* edit_;
  int number_levels_;
  ColumnFamilyData* cfd_;

  bool seek_compaction_;
  bool enable_compression_;
  // if true, just delete files in inputs_[0]
  bool deletion_compaction_;

  // Each compaction reads inputs from "level_" and "level_+1"
  std::vector<FileMetaData*> inputs_[2];  // The two sets of inputs

  // State used to check for number of overlapping grandparent files
  // (parent == level_ + 1, grandparent == level_ + 2)
  std::vector<FileMetaData*> grandparents_;
  size_t grandparent_index_;  // Index in grandparent_starts_
  bool seen_key_;             // Some output key has been seen
  uint64_t overlapped_bytes_; // Bytes of overlap between current output
                              // and grandparent files
  int base_index_;   // index of the file in files_[level_]
  int parent_index_; // index of some file with same range in files_[level_+1]
  double score_;     // score that was used to pick this compaction.

  // Is this compaction creating a file in the bottom most level?
  bool bottommost_level_;
  // Does this compaction include all sst files?
  bool is_full_compaction_;

  // Is this compaction requested by the client?
  bool is_manual_compaction_;

  // level_ptrs_ holds indices into input_version_->levels_: our state
  // is that we are positioned at one of the file ranges for each
  // higher level than the ones involved in this compaction (i.e. for
  // all L >= level_ + 2).
  std::vector<size_t> level_ptrs_;

  // mark (or clear) all files that are being compacted
  void MarkFilesBeingCompacted(bool);

  // Initialize whether compaction producing files at the bottommost level
  void SetupBottomMostLevel(bool isManual);

  // In case of compaction error, reset the nextIndex that is used
  // to pick up the next file to be compacted from files_by_size_
  void ResetNextCompactionIndex();
};
|
||||
|
||||
} // namespace rocksdb
|
||||
960
db/compaction_picker.cc
Normal file
960
db/compaction_picker.cc
Normal file
@@ -0,0 +1,960 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include "db/compaction_picker.h"
|
||||
|
||||
#define __STDC_FORMAT_MACROS
|
||||
#include <inttypes.h>
|
||||
#include <limits>
|
||||
#include "util/log_buffer.h"
|
||||
#include "util/statistics.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
namespace {
|
||||
|
||||
// Sum file_size over the leading non-null entries of `files`; a null
// pointer terminates the accumulation early.
uint64_t TotalFileSize(const std::vector<FileMetaData*>& files) {
  uint64_t total = 0;
  for (const auto* file : files) {
    if (file == nullptr) {
      break;  // null entry acts as a sentinel
    }
    total += file->file_size;
  }
  return total;
}
|
||||
|
||||
// Multiply two operands.  If op2 is not a positive multiplier or the
// product would overflow uint64_t, return op1 unchanged.
uint64_t MultiplyCheckOverflow(uint64_t op1, int op2) {
  if (op1 == 0) {
    return 0;
  }
  if (op2 <= 0) {
    return op1;
  }
  const uint64_t factor = static_cast<uint64_t>(op2);
  const bool would_overflow =
      std::numeric_limits<uint64_t>::max() / op1 < factor;
  return would_overflow ? op1 : op1 * factor;
}
|
||||
|
||||
} // anonymous namespace
|
||||
|
||||
// Precompute, per level, the target file size and the max total bytes,
// growing both geometrically from the base values via the configured
// multipliers (with overflow clamping in MultiplyCheckOverflow).
CompactionPicker::CompactionPicker(const Options* options,
                                   const InternalKeyComparator* icmp)
    : compactions_in_progress_(options->num_levels),
      options_(options),
      num_levels_(options->num_levels),
      icmp_(icmp) {

  max_file_size_.reset(new uint64_t[NumberLevels()]);
  level_max_bytes_.reset(new uint64_t[NumberLevels()]);
  int target_file_size_multiplier = options_->target_file_size_multiplier;
  int max_bytes_multiplier = options_->max_bytes_for_level_multiplier;
  for (int i = 0; i < NumberLevels(); i++) {
    if (i == 0 && options_->compaction_style == kCompactionStyleUniversal) {
      // Universal style keeps everything in level 0: no file size cap.
      max_file_size_[i] = ULLONG_MAX;
      level_max_bytes_[i] = options_->max_bytes_for_level_base;
    } else if (i > 1) {
      // Derive level i limits from level i-1's, clamping on overflow.
      max_file_size_[i] = MultiplyCheckOverflow(max_file_size_[i - 1],
                                                target_file_size_multiplier);
      level_max_bytes_[i] = MultiplyCheckOverflow(
          MultiplyCheckOverflow(level_max_bytes_[i - 1], max_bytes_multiplier),
          options_->max_bytes_for_level_multiplier_additional[i - 1]);
    } else {
      // Levels 0 and 1 use the configured base values.
      max_file_size_[i] = options_->target_file_size_base;
      level_max_bytes_[i] = options_->max_bytes_for_level_base;
    }
  }
}
|
||||
|
||||
// Nothing to release explicitly; members clean themselves up.
CompactionPicker::~CompactionPicker() = default;
|
||||
|
||||
// For every level below the last, fill sizes[level] with the total bytes of
// level-`level` input files that belong to compactions currently running.
// `sizes` must already have NumberLevels() - 1 (or more) entries.
void CompactionPicker::SizeBeingCompacted(std::vector<uint64_t>& sizes) {
  for (int level = 0; level < NumberLevels() - 1; level++) {
    uint64_t total = 0;
    for (auto c : compactions_in_progress_[level]) {
      assert(c->level() == level);
      // Only the first input set (files at `level`) is counted here.
      for (int i = 0; i < c->num_input_files(0); i++) {
        total += c->input(0,i)->file_size;
      }
    }
    sizes[level] = total;
  }
}
|
||||
|
||||
// Clear all files to indicate that they are not being compacted
// Delete this compaction from the list of running compactions.
// On failure, also rewind the level's next-compaction cursor so the same
// files may be re-picked.
void CompactionPicker::ReleaseCompactionFiles(Compaction* c, Status status) {
  c->MarkFilesBeingCompacted(false);
  compactions_in_progress_[c->level()].erase(c);
  if (!status.ok()) {
    c->ResetNextCompactionIndex();
  }
}
|
||||
|
||||
// Target output file size for `level`, from the table precomputed in the
// constructor.
uint64_t CompactionPicker::MaxFileSizeForLevel(int level) const {
  assert(level >= 0);
  assert(level < NumberLevels());
  return max_file_size_[level];
}
|
||||
|
||||
uint64_t CompactionPicker::MaxGrandParentOverlapBytes(int level) {
|
||||
uint64_t result = MaxFileSizeForLevel(level);
|
||||
result *= options_->max_grandparent_overlap_factor;
|
||||
return result;
|
||||
}
|
||||
|
||||
// Maximum total bytes allowed at `level`, from the table precomputed in
// the constructor.
double CompactionPicker::MaxBytesForLevel(int level) {
  // Note: the result for level zero is not really used since we set
  // the level-0 compaction threshold based on number of files.
  assert(level >= 0);
  assert(level < NumberLevels());
  return level_max_bytes_[level];
}
|
||||
|
||||
// Compute the smallest and largest internal keys spanned by `inputs`
// (which must be non-empty), using the internal key comparator.
void CompactionPicker::GetRange(const std::vector<FileMetaData*>& inputs,
                                InternalKey* smallest, InternalKey* largest) {
  assert(!inputs.empty());
  smallest->Clear();
  largest->Clear();
  bool first = true;
  for (const auto* f : inputs) {
    if (first) {
      // Seed the range with the first file's bounds.
      *smallest = f->smallest;
      *largest = f->largest;
      first = false;
      continue;
    }
    if (icmp_->Compare(f->smallest, *smallest) < 0) {
      *smallest = f->smallest;
    }
    if (icmp_->Compare(f->largest, *largest) > 0) {
      *largest = f->largest;
    }
  }
}
|
||||
|
||||
// Compute the combined key range covered by the union of the two input
// file lists (at least one of which must be non-empty).
void CompactionPicker::GetRange(const std::vector<FileMetaData*>& inputs1,
                                const std::vector<FileMetaData*>& inputs2,
                                InternalKey* smallest, InternalKey* largest) {
  std::vector<FileMetaData*> combined;
  combined.reserve(inputs1.size() + inputs2.size());
  combined.insert(combined.end(), inputs1.begin(), inputs1.end());
  combined.insert(combined.end(), inputs2.begin(), inputs2.end());
  GetRange(combined, smallest, largest);
}
|
||||
|
||||
// Grow c->inputs_[0] until it forms a "clean cut" against neighboring files
// at the same level (no key straddles the boundary).  Returns false -- and
// clears the compaction's inputs -- if the expanded set is empty, already
// being compacted, or its parent range is under compaction.
bool CompactionPicker::ExpandWhileOverlapping(Compaction* c) {
  // If inputs are empty then there is nothing to expand.
  if (!c || c->inputs_[0].empty()) {
    return true;
  }

  // GetOverlappingInputs will always do the right thing for level-0.
  // So we don't need to do any expansion if level == 0.
  if (c->level() == 0) {
    return true;
  }

  const int level = c->level();
  InternalKey smallest, largest;

  // Keep expanding c->inputs_[0] until we are sure that there is a
  // "clean cut" boundary between the files in input and the surrounding files.
  // This will ensure that no parts of a key are lost during compaction.
  // Fixed-point iteration: stop once an expansion pass adds no files.
  int hint_index = -1;
  size_t old_size;
  do {
    old_size = c->inputs_[0].size();
    GetRange(c->inputs_[0], &smallest, &largest);
    c->inputs_[0].clear();
    c->input_version_->GetOverlappingInputs(
        level, &smallest, &largest, &c->inputs_[0], hint_index, &hint_index);
  } while(c->inputs_[0].size() > old_size);

  // Get the new range
  GetRange(c->inputs_[0], &smallest, &largest);

  // If, after the expansion, there are files that are already under
  // compaction, then we must drop/cancel this compaction.
  int parent_index = -1;
  if (c->inputs_[0].empty()) {
    Log(options_->info_log,
        "[%s] ExpandWhileOverlapping() failure because zero input files",
        c->column_family_data()->GetName().c_str());
  }
  if (c->inputs_[0].empty() || FilesInCompaction(c->inputs_[0]) ||
      (c->level() != c->output_level() &&
       ParentRangeInCompaction(c->input_version_, &smallest, &largest, level,
                               &parent_index))) {
    // Cancel: wipe both input sets so the compaction is a no-op.
    c->inputs_[0].clear();
    c->inputs_[1].clear();
    return false;
  }
  return true;
}
|
||||
|
||||
uint64_t CompactionPicker::ExpandedCompactionByteSizeLimit(int level) {
|
||||
uint64_t result = MaxFileSizeForLevel(level);
|
||||
result *= options_->expanded_compaction_factor;
|
||||
return result;
|
||||
}
|
||||
|
||||
// Returns true if any one of specified files are being compacted
|
||||
bool CompactionPicker::FilesInCompaction(std::vector<FileMetaData*>& files) {
|
||||
for (unsigned int i = 0; i < files.size(); i++) {
|
||||
if (files[i]->being_compacted) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// Returns true if any one of the parent files are being compacted
// (i.e. any file at level+1 overlapping [smallest, largest]).
// *parent_index is used as a search hint and updated in place.
bool CompactionPicker::ParentRangeInCompaction(Version* version,
                                               const InternalKey* smallest,
                                               const InternalKey* largest,
                                               int level, int* parent_index) {
  std::vector<FileMetaData*> inputs;
  assert(level + 1 < NumberLevels());

  version->GetOverlappingInputs(level + 1, smallest, largest, &inputs,
                                *parent_index, parent_index);
  return FilesInCompaction(inputs);
}
|
||||
|
||||
// Populates the set of inputs from "level+1" that overlap with "level".
// Will also attempt to expand "level" if that doesn't expand "level+1"
// or cause "level" to include a file for compaction that has an overlapping
// user-key with another file.
// Finally records the grandparent (level+2) files overlapping the result,
// used later by ShouldStopBefore().
void CompactionPicker::SetupOtherInputs(Compaction* c) {
  // If inputs are empty, then there is nothing to expand.
  // If both input and output levels are the same, no need to consider
  // files at level "level+1"
  if (c->inputs_[0].empty() || c->level() == c->output_level()) {
    return;
  }

  const int level = c->level();
  InternalKey smallest, largest;

  // Get the range one last time.
  GetRange(c->inputs_[0], &smallest, &largest);

  // Populate the set of next-level files (inputs_[1]) to include in compaction
  c->input_version_->GetOverlappingInputs(level + 1, &smallest, &largest,
                                          &c->inputs_[1], c->parent_index_,
                                          &c->parent_index_);

  // Get entire range covered by compaction
  InternalKey all_start, all_limit;
  GetRange(c->inputs_[0], c->inputs_[1], &all_start, &all_limit);

  // See if we can further grow the number of inputs in "level" without
  // changing the number of "level+1" files we pick up. We also choose NOT
  // to expand if this would cause "level" to include some entries for some
  // user key, while excluding other entries for the same user key. This
  // can happen when one user key spans multiple files.
  if (!c->inputs_[1].empty()) {
    std::vector<FileMetaData*> expanded0;
    c->input_version_->GetOverlappingInputs(
        level, &all_start, &all_limit, &expanded0, c->base_index_, nullptr);
    const uint64_t inputs0_size = TotalFileSize(c->inputs_[0]);
    const uint64_t inputs1_size = TotalFileSize(c->inputs_[1]);
    const uint64_t expanded0_size = TotalFileSize(expanded0);
    uint64_t limit = ExpandedCompactionByteSizeLimit(level);
    // Only expand if: it actually adds files, stays under the byte limit,
    // adds no file already under compaction, and splits no user key.
    if (expanded0.size() > c->inputs_[0].size() &&
        inputs1_size + expanded0_size < limit &&
        !FilesInCompaction(expanded0) &&
        !c->input_version_->HasOverlappingUserKey(&expanded0, level)) {
      InternalKey new_start, new_limit;
      GetRange(expanded0, &new_start, &new_limit);
      std::vector<FileMetaData*> expanded1;
      c->input_version_->GetOverlappingInputs(level + 1, &new_start, &new_limit,
                                              &expanded1, c->parent_index_,
                                              &c->parent_index_);
      // Accept the expansion only if the level+1 file set is unchanged.
      if (expanded1.size() == c->inputs_[1].size() &&
          !FilesInCompaction(expanded1)) {
        Log(options_->info_log,
            "[%s] Expanding@%lu %lu+%lu (%lu+%lu bytes) to %lu+%lu (%lu+%lu "
            "bytes)\n",
            c->column_family_data()->GetName().c_str(), (unsigned long)level,
            (unsigned long)(c->inputs_[0].size()),
            (unsigned long)(c->inputs_[1].size()), (unsigned long)inputs0_size,
            (unsigned long)inputs1_size, (unsigned long)(expanded0.size()),
            (unsigned long)(expanded1.size()), (unsigned long)expanded0_size,
            (unsigned long)inputs1_size);
        smallest = new_start;
        largest = new_limit;
        c->inputs_[0] = expanded0;
        c->inputs_[1] = expanded1;
        GetRange(c->inputs_[0], c->inputs_[1], &all_start, &all_limit);
      }
    }
  }

  // Compute the set of grandparent files that overlap this compaction
  // (parent == level+1; grandparent == level+2)
  if (level + 2 < NumberLevels()) {
    c->input_version_->GetOverlappingInputs(level + 2, &all_start, &all_limit,
                                            &c->grandparents_);
  }
}
|
||||
|
||||
|
||||
// Build a manual compaction over the key range [begin, end] on input_level,
// writing output to output_level.
//
// Returns a heap-allocated Compaction the caller must delete, or nullptr if
// nothing on input_level overlaps the range (or input expansion fails).
// On return, *compaction_end is set to the first key NOT covered by this
// compaction (when the range was truncated), or to nullptr when the whole
// requested range is covered.  The caller owns the InternalKey that
// *compaction_end points to on entry.
Compaction* CompactionPicker::CompactRange(Version* version, int input_level,
                                           int output_level,
                                           const InternalKey* begin,
                                           const InternalKey* end,
                                           InternalKey** compaction_end) {
  // CompactionPickerFIFO has its own implementation of compact range
  assert(options_->compaction_style != kCompactionStyleFIFO);

  std::vector<FileMetaData*> inputs;
  bool covering_the_whole_range = true;

  // All files are 'overlapping' in universal style compaction.
  // We have to compact the entire range in one shot.
  if (options_->compaction_style == kCompactionStyleUniversal) {
    begin = nullptr;
    end = nullptr;
  }
  version->GetOverlappingInputs(input_level, begin, end, &inputs);
  if (inputs.empty()) {
    return nullptr;
  }

  // Avoid compacting too much in one shot in case the range is large.
  // But we cannot do this for level-0 since level-0 files can overlap
  // and we must not pick one file and drop another older file if the
  // two files overlap.
  if (input_level > 0) {
    const uint64_t limit =
        MaxFileSizeForLevel(input_level) * options_->source_compaction_factor;
    uint64_t total = 0;
    for (size_t i = 0; i + 1 < inputs.size(); ++i) {
      uint64_t s = inputs[i]->file_size;
      total += s;
      if (total >= limit) {
        // Truncate: resume point for the caller is the smallest key of the
        // first file we are leaving out.
        **compaction_end = inputs[i + 1]->smallest;
        covering_the_whole_range = false;
        inputs.resize(i + 1);
        break;
      }
    }
  }
  Compaction* c = new Compaction(version, input_level, output_level,
                                 MaxFileSizeForLevel(output_level),
                                 MaxGrandParentOverlapBytes(input_level));

  c->inputs_[0] = inputs;
  if (ExpandWhileOverlapping(c) == false) {
    delete c;
    Log(options_->info_log,
        "[%s] Could not compact due to expansion failure.\n",
        version->cfd_->GetName().c_str());
    return nullptr;
  }

  SetupOtherInputs(c);

  if (covering_the_whole_range) {
    *compaction_end = nullptr;
  }

  // These files that are to be manually compacted do not trample
  // upon other files because manual compactions are processed when
  // the system has a max of 1 background compaction thread.
  c->MarkFilesBeingCompacted(true);

  // Is this compaction creating a file at the bottommost level
  c->SetupBottomMostLevel(true);

  c->is_manual_compaction_ = true;

  return c;
}
|
||||
|
||||
// Pick the next level-style compaction for "version".
//
// Preference order: (1) the level with the highest compaction score >= 1
// (too much data in the level), (2) a single file flagged for seek-triggered
// compaction.  Returns a heap-allocated Compaction the caller must delete,
// or nullptr if there is nothing to do.  On success the chosen files are
// marked as being compacted and the compaction is registered in
// compactions_in_progress_.
Compaction* LevelCompactionPicker::PickCompaction(Version* version,
                                                  LogBuffer* log_buffer) {
  Compaction* c = nullptr;
  int level = -1;

  // Compute the compactions needed. It is better to do it here
  // and also in LogAndApply(), otherwise the values could be stale.
  std::vector<uint64_t> size_being_compacted(NumberLevels() - 1);
  SizeBeingCompacted(size_being_compacted);
  version->ComputeCompactionScore(size_being_compacted);

  // We prefer compactions triggered by too much data in a level over
  // the compactions triggered by seeks.
  //
  // Find the compactions by size on all levels.
  for (int i = 0; i < NumberLevels() - 1; i++) {
    // Scores come back sorted in decreasing order.
    assert(i == 0 ||
           version->compaction_score_[i] <= version->compaction_score_[i - 1]);
    level = version->compaction_level_[i];
    if ((version->compaction_score_[i] >= 1)) {
      c = PickCompactionBySize(version, level, version->compaction_score_[i]);
      if (ExpandWhileOverlapping(c) == false) {
        delete c;
        c = nullptr;
      } else {
        break;
      }
    }
  }

  // Find compactions needed by seeks
  FileMetaData* f = version->file_to_compact_;
  if (c == nullptr && f != nullptr && !f->being_compacted) {

    level = version->file_to_compact_level_;
    int parent_index = -1;

    // Only allow one level 0 compaction at a time.
    // Do not pick this file if its parents at level+1 are being compacted.
    if (level != 0 || compactions_in_progress_[0].empty()) {
      if (!ParentRangeInCompaction(version, &f->smallest, &f->largest, level,
                                   &parent_index)) {
        c = new Compaction(version, level, level + 1,
                           MaxFileSizeForLevel(level + 1),
                           MaxGrandParentOverlapBytes(level), true);
        c->inputs_[0].push_back(f);
        c->parent_index_ = parent_index;
        c->input_version_->file_to_compact_ = nullptr;
        if (ExpandWhileOverlapping(c) == false) {
          // BUGFIX: the original returned nullptr here without releasing
          // "c", leaking the Compaction.  Delete it and fall through to the
          // common nullptr return below (same observable result, no leak).
          delete c;
          c = nullptr;
        }
      }
    }
  }

  if (c == nullptr) {
    return nullptr;
  }

  // Two level 0 compaction won't run at the same time, so don't need to worry
  // about files on level 0 being compacted.
  if (level == 0) {
    assert(compactions_in_progress_[0].empty());
    InternalKey smallest, largest;
    GetRange(c->inputs_[0], &smallest, &largest);
    // Note that the next call will discard the file we placed in
    // c->inputs_[0] earlier and replace it with an overlapping set
    // which will include the picked file.
    c->inputs_[0].clear();
    c->input_version_->GetOverlappingInputs(0, &smallest, &largest,
                                            &c->inputs_[0]);

    // If we include more L0 files in the same compaction run it can
    // cause the 'smallest' and 'largest' key to get extended to a
    // larger range. So, re-invoke GetRange to get the new key range
    GetRange(c->inputs_[0], &smallest, &largest);
    if (ParentRangeInCompaction(c->input_version_, &smallest, &largest, level,
                                &c->parent_index_)) {
      delete c;
      return nullptr;
    }
    assert(!c->inputs_[0].empty());
  }

  // Setup "level+1" files (inputs_[1])
  SetupOtherInputs(c);

  // mark all the files that are being compacted
  c->MarkFilesBeingCompacted(true);

  // Is this compaction creating a file at the bottommost level
  c->SetupBottomMostLevel(false);

  // remember this currently undergoing compaction
  compactions_in_progress_[level].insert(c);

  return c;
}
|
||||
|
||||
// Pick a size-triggered compaction on "level": choose the largest file on
// that level that is not already being compacted and whose level+1 parents
// are not being compacted.  Returns a heap-allocated Compaction (caller
// deletes) with exactly one file in inputs_[0], or nullptr if no eligible
// file exists or a level-0 compaction is already running.
// Side effect: updates version->next_file_to_compact_by_size_[level] so the
// next call resumes the scan from the first not-yet-compacted file.
Compaction* LevelCompactionPicker::PickCompactionBySize(Version* version,
                                                        int level,
                                                        double score) {
  Compaction* c = nullptr;

  // level 0 files are overlapping. So we cannot pick more
  // than one concurrent compactions at this level. This
  // could be made better by looking at key-ranges that are
  // being compacted at level 0.
  if (level == 0 && compactions_in_progress_[level].size() == 1) {
    return nullptr;
  }

  assert(level >= 0);
  assert(level + 1 < NumberLevels());
  c = new Compaction(version, level, level + 1, MaxFileSizeForLevel(level + 1),
                     MaxGrandParentOverlapBytes(level));
  c->score_ = score;

  // Pick the largest file in this level that is not already
  // being compacted.  files_by_size_ holds indices into files_[level],
  // sorted by decreasing file size (for the first
  // Version::number_of_files_to_sort_ entries).
  std::vector<int>& file_size = c->input_version_->files_by_size_[level];

  // record the first file that is not yet compacted
  int nextIndex = -1;

  for (unsigned int i = c->input_version_->next_file_to_compact_by_size_[level];
       i < file_size.size(); i++) {
    int index = file_size[i];
    FileMetaData* f = c->input_version_->files_[level][index];

    // check to verify files are arranged in descending size
    assert((i == file_size.size() - 1) ||
           (i >= Version::number_of_files_to_sort_ - 1) ||
           (f->file_size >=
            c->input_version_->files_[level][file_size[i + 1]]->file_size));

    // do not pick a file to compact if it is being compacted
    // from n-1 level.
    if (f->being_compacted) {
      continue;
    }

    // remember the startIndex for the next call to PickCompaction
    if (nextIndex == -1) {
      nextIndex = i;
    }

    // Do not pick this file if its parents at level+1 are being compacted.
    // Maybe we can avoid redoing this work in SetupOtherInputs
    int parent_index = -1;
    if (ParentRangeInCompaction(c->input_version_, &f->smallest, &f->largest,
                                level, &parent_index)) {
      continue;
    }
    c->inputs_[0].push_back(f);
    c->base_index_ = index;
    c->parent_index_ = parent_index;
    break;
  }

  if (c->inputs_[0].empty()) {
    delete c;
    c = nullptr;
  }

  // store where to start the iteration in the next call to PickCompaction
  // NOTE(review): nextIndex may still be -1 here (all files compacting);
  // presumably Version treats a negative value as "start over" — confirm.
  version->next_file_to_compact_by_size_[level] = nextIndex;

  return c;
}
|
||||
|
||||
// Universal style of compaction. Pick files that are contiguous in
|
||||
// time-range to compact.
|
||||
//
|
||||
// Universal style of compaction. Pick files that are contiguous in
// time-range to compact.
//
// Strategy order: (1) size-amplification compaction, (2) read-amplification
// compaction respecting file-size ratios, (3) forced read-amplification
// compaction to bring the file count below
// level0_file_num_compaction_trigger.  Returns a heap-allocated Compaction
// (caller deletes) or nullptr if nothing needs compacting.  On success the
// chosen files are marked as being compacted and the compaction is
// registered in compactions_in_progress_[0].
Compaction* UniversalCompactionPicker::PickCompaction(Version* version,
                                                      LogBuffer* log_buffer) {
  int level = 0;
  double score = version->compaction_score_[0];

  // Not enough level-0 files to bother.
  if ((version->files_[level].size() <
       (unsigned int)options_->level0_file_num_compaction_trigger)) {
    LogToBuffer(log_buffer, "[%s] Universal: nothing to do\n",
                version->cfd_->GetName().c_str());
    return nullptr;
  }
  Version::FileSummaryStorage tmp;
  LogToBuffer(log_buffer, "[%s] Universal: candidate files(%zu): %s\n",
              version->cfd_->GetName().c_str(), version->files_[level].size(),
              version->LevelFileSummary(&tmp, 0));

  // Check for size amplification first.
  Compaction* c;
  if ((c = PickCompactionUniversalSizeAmp(version, score, log_buffer)) !=
      nullptr) {
    LogToBuffer(log_buffer, "[%s] Universal: compacting for size amp\n",
                version->cfd_->GetName().c_str());
  } else {
    // Size amplification is within limits. Try reducing read
    // amplification while maintaining file size ratios.
    unsigned int ratio = options_->compaction_options_universal.size_ratio;

    if ((c = PickCompactionUniversalReadAmp(version, score, ratio, UINT_MAX,
                                            log_buffer)) != nullptr) {
      LogToBuffer(log_buffer, "[%s] Universal: compacting for size ratio\n",
                  version->cfd_->GetName().c_str());
    } else {
      // Size amplification and file size ratios are within configured limits.
      // If max read amplification is exceeding configured limits, then force
      // compaction without looking at filesize ratios and try to reduce
      // the number of files to fewer than level0_file_num_compaction_trigger.
      // (The guard above guarantees this subtraction cannot underflow.)
      unsigned int num_files = version->files_[level].size() -
                               options_->level0_file_num_compaction_trigger;
      if ((c = PickCompactionUniversalReadAmp(
               version, score, UINT_MAX, num_files, log_buffer)) != nullptr) {
        LogToBuffer(log_buffer, "[%s] Universal: compacting for file num\n",
                    version->cfd_->GetName().c_str());
      }
    }
  }
  if (c == nullptr) {
    return nullptr;
  }
  assert(c->inputs_[0].size() > 1);

  // validate that all the chosen files are non overlapping in time
  FileMetaData* newerfile __attribute__((unused)) = nullptr;
  for (unsigned int i = 0; i < c->inputs_[0].size(); i++) {
    FileMetaData* f = c->inputs_[0][i];
    assert (f->smallest_seqno <= f->largest_seqno);
    assert(newerfile == nullptr ||
           newerfile->smallest_seqno > f->largest_seqno);
    newerfile = f;
  }

  // The files are sorted from newest first to oldest last.
  std::vector<int>& file_by_time = c->input_version_->files_by_size_[level];

  // Is the earliest file part of this compaction?
  int last_index = file_by_time[file_by_time.size()-1];
  FileMetaData* last_file = c->input_version_->files_[level][last_index];
  if (c->inputs_[0][c->inputs_[0].size()-1] == last_file) {
    c->bottommost_level_ = true;
  }

  // update statistics
  MeasureTime(options_->statistics.get(), NUM_FILES_IN_SINGLE_COMPACTION,
              c->inputs_[0].size());

  // mark all the files that are being compacted
  c->MarkFilesBeingCompacted(true);

  // remember this currently undergoing compaction
  compactions_in_progress_[level].insert(c);

  // Record whether this compaction includes all sst files.
  // For now, it is only relevant in universal compaction mode.
  c->is_full_compaction_ =
      (c->inputs_[0].size() == c->input_version_->files_[0].size());

  return c;
}
|
||||
|
||||
//
|
||||
// Consider compaction files based on their size differences with
|
||||
// the next file in time order.
|
||||
//
|
||||
//
// Consider compaction files based on their size differences with
// the next file in time order.
//
// Scans the level-0 files (newest to oldest) looking for a run of at least
// min_merge_width consecutive files that satisfy the configured size-ratio
// stopping rule, capped at min(max_merge_width,
// max_number_of_files_to_compact) files.  Returns a heap-allocated
// Compaction (caller deletes) over that run, or nullptr if no qualifying
// run exists.  Compression for the output is disabled when the files older
// than the run already account for compression_size_percent of the level.
Compaction* UniversalCompactionPicker::PickCompactionUniversalReadAmp(
    Version* version, double score, unsigned int ratio,
    unsigned int max_number_of_files_to_compact, LogBuffer* log_buffer) {
  int level = 0;

  unsigned int min_merge_width =
      options_->compaction_options_universal.min_merge_width;
  unsigned int max_merge_width =
      options_->compaction_options_universal.max_merge_width;

  // The files are sorted from newest first to oldest last.
  std::vector<int>& file_by_time = version->files_by_size_[level];
  FileMetaData* f = nullptr;
  bool done = false;
  int start_index = 0;
  unsigned int candidate_count = 0;
  assert(file_by_time.size() == version->files_[level].size());

  unsigned int max_files_to_compact = std::min(max_merge_width,
                                       max_number_of_files_to_compact);
  min_merge_width = std::max(min_merge_width, 2U);

  // Considers a candidate file only if it is smaller than the
  // total size accumulated so far.
  // NOTE: the inner skip loop below advances the same "loop" variable, so
  // each outer iteration resumes past the files already examined.
  for (unsigned int loop = 0; loop < file_by_time.size(); loop++) {

    candidate_count = 0;

    // Skip files that are already being compacted
    for (f = nullptr; loop < file_by_time.size(); loop++) {
      int index = file_by_time[loop];
      f = version->files_[level][index];

      if (!f->being_compacted) {
        candidate_count = 1;
        break;
      }
      LogToBuffer(
          log_buffer, "[%s] Universal: file %lu[%d] being compacted, skipping",
          version->cfd_->GetName().c_str(), (unsigned long)f->number, loop);
      f = nullptr;
    }

    // This file is not being compacted. Consider it as the
    // first candidate to be compacted.
    uint64_t candidate_size = f != nullptr? f->file_size : 0;
    if (f != nullptr) {
      LogToBuffer(
          log_buffer, "[%s] Universal: Possible candidate file %lu[%d].",
          version->cfd_->GetName().c_str(), (unsigned long)f->number, loop);
    }

    // Check if the succeeding files need compaction.
    for (unsigned int i = loop+1;
         candidate_count < max_files_to_compact && i < file_by_time.size();
         i++) {
      int index = file_by_time[i];
      FileMetaData* f = version->files_[level][index];
      if (f->being_compacted) {
        break;
      }
      // Pick files if the total/last candidate file size (increased by the
      // specified ratio) is still larger than the next candidate file.
      // candidate_size is the total size of files picked so far with the
      // default kCompactionStopStyleTotalSize; with
      // kCompactionStopStyleSimilarSize, it's simply the size of the last
      // picked file.
      uint64_t sz = (candidate_size * (100L + ratio)) /100;
      if (sz < f->file_size) {
        break;
      }
      if (options_->compaction_options_universal.stop_style == kCompactionStopStyleSimilarSize) {
        // Similar-size stopping rule: also check the last picked file isn't
        // far larger than the next candidate file.
        sz = (f->file_size * (100L + ratio)) / 100;
        if (sz < candidate_size) {
          // If the small file we've encountered begins a run of similar-size
          // files, we'll pick them up on a future iteration of the outer
          // loop. If it's some lonely straggler, it'll eventually get picked
          // by the last-resort read amp strategy which disregards size ratios.
          break;
        }
        candidate_size = f->file_size;
      } else { // default kCompactionStopStyleTotalSize
        candidate_size += f->file_size;
      }
      candidate_count++;
    }

    // Found a series of consecutive files that need compaction.
    if (candidate_count >= (unsigned int)min_merge_width) {
      start_index = loop;
      done = true;
      break;
    } else {
      // Run too short: log the files we are passing over before resuming
      // the scan after them.
      for (unsigned int i = loop;
           i < loop + candidate_count && i < file_by_time.size(); i++) {
        int index = file_by_time[i];
        FileMetaData* f = version->files_[level][index];
        LogToBuffer(log_buffer,
                    "[%s] Universal: Skipping file %lu[%d] with size %lu %d\n",
                    version->cfd_->GetName().c_str(), (unsigned long)f->number,
                    i, (unsigned long)f->file_size, f->being_compacted);
      }
    }
  }
  if (!done || candidate_count <= 1) {
    return nullptr;
  }
  unsigned int first_index_after = start_index + candidate_count;
  // Compression is enabled if files compacted earlier already reached
  // size ratio of compression.
  bool enable_compression = true;
  int ratio_to_compress =
      options_->compaction_options_universal.compression_size_percent;
  if (ratio_to_compress >= 0) {
    uint64_t total_size = version->NumLevelBytes(level);
    uint64_t older_file_size = 0;
    // Walk from the oldest file toward the chosen run, accumulating the
    // size of files older than it.  first_index_after >= 2 here (run has
    // at least two files), so this unsigned countdown cannot wrap.
    for (unsigned int i = file_by_time.size() - 1; i >= first_index_after;
         i--) {
      older_file_size += version->files_[level][file_by_time[i]]->file_size;
      if (older_file_size * 100L >= total_size * (long) ratio_to_compress) {
        enable_compression = false;
        break;
      }
    }
  }
  Compaction* c =
      new Compaction(version, level, level, MaxFileSizeForLevel(level),
                     LLONG_MAX, false, enable_compression);
  c->score_ = score;

  for (unsigned int i = start_index; i < first_index_after; i++) {
    int index = file_by_time[i];
    FileMetaData* f = c->input_version_->files_[level][index];
    c->inputs_[0].push_back(f);
    LogToBuffer(log_buffer,
                "[%s] Universal: Picking file %lu[%d] with size %lu\n",
                version->cfd_->GetName().c_str(), (unsigned long)f->number, i,
                (unsigned long)f->file_size);
  }
  return c;
}
|
||||
|
||||
// Look at overall size amplification. If size amplification
|
||||
// exceeeds the configured value, then do a compaction
|
||||
// of the candidate files all the way upto the earliest
|
||||
// base file (overrides configured values of file-size ratios,
|
||||
// min_merge_width and max_merge_width).
|
||||
//
|
||||
// Look at overall size amplification. If size amplification
// exceeds the configured value, then do a compaction
// of the candidate files all the way upto the earliest
// base file (overrides configured values of file-size ratios,
// min_merge_width and max_merge_width).
//
// Size amplification is estimated as (total size of all files newer than
// the earliest file) / (size of the earliest file).  Returns a
// heap-allocated Compaction (caller deletes) covering every file from the
// first not-being-compacted one through the oldest, or nullptr when
// amplification is within max_size_amplification_percent or any needed
// file is already being compacted.
Compaction* UniversalCompactionPicker::PickCompactionUniversalSizeAmp(
    Version* version, double score, LogBuffer* log_buffer) {
  int level = 0;

  // percentage flexibility while reducing size amplification
  uint64_t ratio = options_->compaction_options_universal.
      max_size_amplification_percent;

  // The files are sorted from newest first to oldest last.
  std::vector<int>& file_by_time = version->files_by_size_[level];
  assert(file_by_time.size() == version->files_[level].size());

  unsigned int candidate_count = 0;
  uint64_t candidate_size = 0;
  unsigned int start_index = 0;
  FileMetaData* f = nullptr;

  // Skip files that are already being compacted
  for (unsigned int loop = 0; loop < file_by_time.size() - 1; loop++) {
    int index = file_by_time[loop];
    f = version->files_[level][index];
    if (!f->being_compacted) {
      start_index = loop; // Consider this as the first candidate.
      break;
    }
    LogToBuffer(log_buffer,
                "[%s] Universal: skipping file %lu[%d] compacted %s",
                version->cfd_->GetName().c_str(), (unsigned long)f->number,
                loop, " cannot be a candidate to reduce size amp.\n");
    f = nullptr;
  }
  if (f == nullptr) {
    return nullptr; // no candidate files
  }

  LogToBuffer(log_buffer, "[%s] Universal: First candidate file %lu[%d] %s",
              version->cfd_->GetName().c_str(), (unsigned long)f->number,
              start_index, " to reduce size amp.\n");

  // keep adding up all the remaining files
  // (all but the earliest file; that one is the "base" we compare against)
  for (unsigned int loop = start_index; loop < file_by_time.size() - 1;
       loop++) {
    int index = file_by_time[loop];
    f = version->files_[level][index];
    if (f->being_compacted) {
      LogToBuffer(
          log_buffer, "[%s] Universal: Possible candidate file %lu[%d] %s.",
          version->cfd_->GetName().c_str(), (unsigned long)f->number, loop,
          " is already being compacted. No size amp reduction possible.\n");
      return nullptr;
    }
    candidate_size += f->file_size;
    candidate_count++;
  }
  if (candidate_count == 0) {
    return nullptr;
  }

  // size of earliest file
  int index = file_by_time[file_by_time.size() - 1];
  uint64_t earliest_file_size = version->files_[level][index]->file_size;

  // size amplification = percentage of additional size
  if (candidate_size * 100 < ratio * earliest_file_size) {
    LogToBuffer(
        log_buffer,
        "[%s] Universal: size amp not needed. newer-files-total-size %lu "
        "earliest-file-size %lu",
        version->cfd_->GetName().c_str(), (unsigned long)candidate_size,
        (unsigned long)earliest_file_size);
    return nullptr;
  } else {
    LogToBuffer(log_buffer,
                "[%s] Universal: size amp needed. newer-files-total-size %lu "
                "earliest-file-size %lu",
                version->cfd_->GetName().c_str(), (unsigned long)candidate_size,
                (unsigned long)earliest_file_size);
  }
  // NOTE(review): start_index is unsigned, so the >= 0 half of this assert
  // is vacuously true.
  assert(start_index >= 0 && start_index < file_by_time.size() - 1);

  // create a compaction request
  // We always compact all the files, so always compress.
  Compaction* c =
      new Compaction(version, level, level, MaxFileSizeForLevel(level),
                     LLONG_MAX, false, true);
  c->score_ = score;
  for (unsigned int loop = start_index; loop < file_by_time.size(); loop++) {
    int index = file_by_time[loop];
    f = c->input_version_->files_[level][index];
    c->inputs_[0].push_back(f);
    LogToBuffer(log_buffer,
                "[%s] Universal: size amp picking file %lu[%d] with size %lu",
                version->cfd_->GetName().c_str(), (unsigned long)f->number,
                index, (unsigned long)f->file_size);
  }
  return c;
}
|
||||
|
||||
// FIFO compaction: when the total size of all table files exceeds
// compaction_options_fifo.max_table_files_size, pick the oldest files for
// deletion (no merging/rewriting) until the total drops back under the
// limit.  Returns a heap-allocated deletion Compaction (caller deletes), or
// nullptr when under the limit or another FIFO compaction is already
// running.  On success the chosen files are marked as being compacted and
// the compaction is registered in compactions_in_progress_[0].
Compaction* FIFOCompactionPicker::PickCompaction(Version* version,
                                                 LogBuffer* log_buffer) {
  // FIFO keeps everything in a single level.
  assert(version->NumberLevels() == 1);
  uint64_t total_size = 0;
  for (const auto& file : version->files_[0]) {
    total_size += file->file_size;
  }

  if (total_size <= options_->compaction_options_fifo.max_table_files_size ||
      version->files_[0].size() == 0) {
    // total size not exceeded
    LogToBuffer(log_buffer,
                "[%s] FIFO compaction: nothing to do. Total size %" PRIu64
                ", max size %" PRIu64 "\n",
                version->cfd_->GetName().c_str(), total_size,
                options_->compaction_options_fifo.max_table_files_size);
    return nullptr;
  }

  if (compactions_in_progress_[0].size() > 0) {
    LogToBuffer(log_buffer,
                "[%s] FIFO compaction: Already executing compaction. No need "
                "to run parallel compactions since compactions are very fast",
                version->cfd_->GetName().c_str());
    return nullptr;
  }

  Compaction* c = new Compaction(version, 0, 0, 0, 0, false, false,
                                 true /* is deletion compaction */);
  // delete old files (FIFO): iterate in reverse so the oldest files are
  // considered first.
  for (auto ritr = version->files_[0].rbegin();
       ritr != version->files_[0].rend(); ++ritr) {
    auto f = *ritr;
    total_size -= f->file_size;
    c->inputs_[0].push_back(f);
    char tmp_fsize[16];
    AppendHumanBytes(f->file_size, tmp_fsize, sizeof(tmp_fsize));
    LogToBuffer(log_buffer, "[%s] FIFO compaction: picking file %" PRIu64
                " with size %s for deletion",
                version->cfd_->GetName().c_str(), f->number, tmp_fsize);
    if (total_size <= options_->compaction_options_fifo.max_table_files_size) {
      break;
    }
  }

  c->MarkFilesBeingCompacted(true);
  compactions_in_progress_[0].insert(c);

  return c;
}
|
||||
|
||||
// FIFO ignores the requested key range: a FIFO "compaction" only ever
// deletes the oldest files, so a manual range compaction simply delegates
// to PickCompaction() over the whole (single) level.  Always reports full
// coverage via *compaction_end = nullptr.
Compaction* FIFOCompactionPicker::CompactRange(Version* version,
                                               int input_level,
                                               int output_level,
                                               const InternalKey* begin,
                                               const InternalKey* end,
                                               InternalKey** compaction_end) {
  // FIFO stores everything in level 0; no other levels exist.
  assert(input_level == 0);
  assert(output_level == 0);
  *compaction_end = nullptr;
  // PickCompaction logs through a buffer; flush it before returning so the
  // messages reach the info log immediately.
  LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, options_->info_log.get());
  Compaction* result = PickCompaction(version, &log_buffer);
  log_buffer.FlushBufferToLog();
  return result;
}
|
||||
|
||||
} // namespace rocksdb
|
||||
181
db/compaction_picker.h
Normal file
181
db/compaction_picker.h
Normal file
@@ -0,0 +1,181 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#pragma once
|
||||
#include "db/version_set.h"
|
||||
#include "db/compaction.h"
|
||||
#include "rocksdb/status.h"
|
||||
#include "rocksdb/options.h"
|
||||
#include "rocksdb/env.h"
|
||||
|
||||
#include <vector>
|
||||
#include <memory>
|
||||
#include <set>
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
class LogBuffer;
|
||||
class Compaction;
|
||||
class Version;
|
||||
|
||||
// Base class for compaction-picking strategies (level, universal, FIFO).
// A picker decides which files participate in the next compaction and
// tracks all in-flight compactions per level.
class CompactionPicker {
 public:
  CompactionPicker(const Options* options, const InternalKeyComparator* icmp);
  virtual ~CompactionPicker();

  // Pick level and inputs for a new compaction.
  // Returns nullptr if there is no compaction to be done.
  // Otherwise returns a pointer to a heap-allocated object that
  // describes the compaction. Caller should delete the result.
  virtual Compaction* PickCompaction(Version* version,
                                     LogBuffer* log_buffer) = 0;

  // Return a compaction object for compacting the range [begin,end] in
  // the specified level. Returns nullptr if there is nothing in that
  // level that overlaps the specified range. Caller should delete
  // the result.
  //
  // The returned Compaction might not include the whole requested range.
  // In that case, compaction_end will be set to the next key that needs
  // compacting. In case the compaction will compact the whole range,
  // compaction_end will be set to nullptr.
  // Client is responsible for compaction_end storage -- when called,
  // *compaction_end should point to valid InternalKey!
  virtual Compaction* CompactRange(Version* version, int input_level,
                                   int output_level, const InternalKey* begin,
                                   const InternalKey* end,
                                   InternalKey** compaction_end);

  // Free up the files that participated in a compaction
  void ReleaseCompactionFiles(Compaction* c, Status status);

  // Return the total amount of data that is undergoing
  // compactions per level
  void SizeBeingCompacted(std::vector<uint64_t>& sizes);

  // Returns maximum total overlap bytes with grandparent
  // level (i.e., level+2) before we stop building a single
  // file in level->level+1 compaction.
  uint64_t MaxGrandParentOverlapBytes(int level);

  // Returns maximum total bytes of data on a given level.
  double MaxBytesForLevel(int level);

  // Get the max file size in a given level.
  uint64_t MaxFileSizeForLevel(int level) const;

 protected:
  int NumberLevels() const { return num_levels_; }

  // Stores the minimal range that covers all entries in inputs in
  // *smallest, *largest.
  // REQUIRES: inputs is not empty
  void GetRange(const std::vector<FileMetaData*>& inputs, InternalKey* smallest,
                InternalKey* largest);

  // Stores the minimal range that covers all entries in inputs1 and inputs2
  // in *smallest, *largest.
  // REQUIRES: inputs is not empty
  void GetRange(const std::vector<FileMetaData*>& inputs1,
                const std::vector<FileMetaData*>& inputs2,
                InternalKey* smallest, InternalKey* largest);

  // Add more files to the inputs on "level" to make sure that
  // no newer version of a key is compacted to "level+1" while leaving an older
  // version in a "level". Otherwise, any Get() will search "level" first,
  // and will likely return an old/stale value for the key, since it always
  // searches in increasing order of level to find the value. This could
  // also scramble the order of merge operands. This function should be
  // called any time a new Compaction is created, and its inputs_[0] are
  // populated.
  //
  // Will return false if it is impossible to apply this compaction.
  bool ExpandWhileOverlapping(Compaction* c);

  // Upper bound on total bytes when expanding a compaction's inputs on
  // "level" (used when deciding whether to grow inputs_[0]).
  uint64_t ExpandedCompactionByteSizeLimit(int level);

  // Returns true if any one of the specified files are being compacted
  bool FilesInCompaction(std::vector<FileMetaData*>& files);

  // Returns true if any one of the parent files are being compacted
  bool ParentRangeInCompaction(Version* version, const InternalKey* smallest,
                               const InternalKey* largest, int level,
                               int* index);

  // Populate c->inputs_[1] (the level+1 files overlapping inputs_[0]) and
  // related bookkeeping for a compaction whose inputs_[0] is already set.
  void SetupOtherInputs(Compaction* c);

  // record all the ongoing compactions for all levels
  std::vector<std::set<Compaction*>> compactions_in_progress_;

  // Per-level target file size.
  std::unique_ptr<uint64_t[]> max_file_size_;

  // Per-level max bytes
  std::unique_ptr<uint64_t[]> level_max_bytes_;

  // Not owned; must outlive this picker.
  const Options* const options_;

 private:
  int num_levels_;

  // Not owned; comparator for internal keys.
  const InternalKeyComparator* const icmp_;
};
|
||||
|
||||
// Picker for kCompactionStyleUniversal: all files live in level 0 and are
// compacted in time order to bound read and space amplification.
class UniversalCompactionPicker : public CompactionPicker {
 public:
  UniversalCompactionPicker(const Options* options,
                            const InternalKeyComparator* icmp)
      : CompactionPicker(options, icmp) {}
  virtual Compaction* PickCompaction(Version* version,
                                     LogBuffer* log_buffer) override;

 private:
  // Pick Universal compaction to limit read amplification
  Compaction* PickCompactionUniversalReadAmp(Version* version, double score,
                                             unsigned int ratio,
                                             unsigned int num_files,
                                             LogBuffer* log_buffer);

  // Pick Universal compaction to limit space amplification.
  Compaction* PickCompactionUniversalSizeAmp(Version* version, double score,
                                             LogBuffer* log_buffer);
};
|
||||
|
||||
class LevelCompactionPicker : public CompactionPicker {
|
||||
public:
|
||||
LevelCompactionPicker(const Options* options,
|
||||
const InternalKeyComparator* icmp)
|
||||
: CompactionPicker(options, icmp) {}
|
||||
virtual Compaction* PickCompaction(Version* version,
|
||||
LogBuffer* log_buffer) override;
|
||||
|
||||
private:
|
||||
// For the specfied level, pick a compaction.
|
||||
// Returns nullptr if there is no compaction to be done.
|
||||
// If level is 0 and there is already a compaction on that level, this
|
||||
// function will return nullptr.
|
||||
Compaction* PickCompactionBySize(Version* version, int level, double score);
|
||||
};
|
||||
|
||||
class FIFOCompactionPicker : public CompactionPicker {
|
||||
public:
|
||||
FIFOCompactionPicker(const Options* options,
|
||||
const InternalKeyComparator* icmp)
|
||||
: CompactionPicker(options, icmp) {}
|
||||
|
||||
virtual Compaction* PickCompaction(Version* version,
|
||||
LogBuffer* log_buffer) override;
|
||||
|
||||
virtual Compaction* CompactRange(Version* version, int input_level,
|
||||
int output_level, const InternalKey* begin,
|
||||
const InternalKey* end,
|
||||
InternalKey** compaction_end) override;
|
||||
};
|
||||
|
||||
} // namespace rocksdb
|
||||
440
db/corruption_test.cc
Normal file
440
db/corruption_test.cc
Normal file
@@ -0,0 +1,440 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include "rocksdb/db.h"
|
||||
|
||||
#include <errno.h>
|
||||
#include <fcntl.h>
|
||||
#include <sys/stat.h>
|
||||
#include <sys/types.h>
|
||||
#include "rocksdb/cache.h"
|
||||
#include "rocksdb/env.h"
|
||||
#include "rocksdb/table.h"
|
||||
#include "rocksdb/write_batch.h"
|
||||
#include "db/db_impl.h"
|
||||
#include "db/filename.h"
|
||||
#include "db/log_format.h"
|
||||
#include "db/version_set.h"
|
||||
#include "util/logging.h"
|
||||
#include "util/testharness.h"
|
||||
#include "util/testutil.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
static const int kValueSize = 1000;
|
||||
|
||||
class CorruptionTest {
|
||||
public:
|
||||
test::ErrorEnv env_;
|
||||
std::string dbname_;
|
||||
shared_ptr<Cache> tiny_cache_;
|
||||
Options options_;
|
||||
DB* db_;
|
||||
|
||||
CorruptionTest() {
|
||||
tiny_cache_ = NewLRUCache(100);
|
||||
options_.env = &env_;
|
||||
dbname_ = test::TmpDir() + "/corruption_test";
|
||||
DestroyDB(dbname_, options_);
|
||||
|
||||
db_ = nullptr;
|
||||
options_.create_if_missing = true;
|
||||
options_.block_size_deviation = 0; // make unit test pass for now
|
||||
Reopen();
|
||||
options_.create_if_missing = false;
|
||||
}
|
||||
|
||||
~CorruptionTest() {
|
||||
delete db_;
|
||||
DestroyDB(dbname_, Options());
|
||||
}
|
||||
|
||||
Status TryReopen(Options* options = nullptr) {
|
||||
delete db_;
|
||||
db_ = nullptr;
|
||||
Options opt = (options ? *options : options_);
|
||||
opt.env = &env_;
|
||||
opt.block_cache = tiny_cache_;
|
||||
opt.block_size_deviation = 0;
|
||||
opt.arena_block_size = 4096;
|
||||
return DB::Open(opt, dbname_, &db_);
|
||||
}
|
||||
|
||||
void Reopen(Options* options = nullptr) {
|
||||
ASSERT_OK(TryReopen(options));
|
||||
}
|
||||
|
||||
void RepairDB() {
|
||||
delete db_;
|
||||
db_ = nullptr;
|
||||
ASSERT_OK(::rocksdb::RepairDB(dbname_, options_));
|
||||
}
|
||||
|
||||
void Build(int n) {
|
||||
std::string key_space, value_space;
|
||||
WriteBatch batch;
|
||||
for (int i = 0; i < n; i++) {
|
||||
//if ((i % 100) == 0) fprintf(stderr, "@ %d of %d\n", i, n);
|
||||
Slice key = Key(i, &key_space);
|
||||
batch.Clear();
|
||||
batch.Put(key, Value(i, &value_space));
|
||||
ASSERT_OK(db_->Write(WriteOptions(), &batch));
|
||||
}
|
||||
}
|
||||
|
||||
void Check(int min_expected, int max_expected) {
|
||||
unsigned int next_expected = 0;
|
||||
int missed = 0;
|
||||
int bad_keys = 0;
|
||||
int bad_values = 0;
|
||||
int correct = 0;
|
||||
std::string value_space;
|
||||
// Do not verify checksums. If we verify checksums then the
|
||||
// db itself will raise errors because data is corrupted.
|
||||
// Instead, we want the reads to be successful and this test
|
||||
// will detect whether the appropriate corruptions have
|
||||
// occured.
|
||||
Iterator* iter = db_->NewIterator(ReadOptions(false, true));
|
||||
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
|
||||
uint64_t key;
|
||||
Slice in(iter->key());
|
||||
if (!ConsumeDecimalNumber(&in, &key) ||
|
||||
!in.empty() ||
|
||||
key < next_expected) {
|
||||
bad_keys++;
|
||||
continue;
|
||||
}
|
||||
missed += (key - next_expected);
|
||||
next_expected = key + 1;
|
||||
if (iter->value() != Value(key, &value_space)) {
|
||||
bad_values++;
|
||||
} else {
|
||||
correct++;
|
||||
}
|
||||
}
|
||||
delete iter;
|
||||
|
||||
fprintf(stderr,
|
||||
"expected=%d..%d; got=%d; bad_keys=%d; bad_values=%d; missed=%d\n",
|
||||
min_expected, max_expected, correct, bad_keys, bad_values, missed);
|
||||
ASSERT_LE(min_expected, correct);
|
||||
ASSERT_GE(max_expected, correct);
|
||||
}
|
||||
|
||||
void CorruptFile(const std::string fname, int offset, int bytes_to_corrupt) {
|
||||
struct stat sbuf;
|
||||
if (stat(fname.c_str(), &sbuf) != 0) {
|
||||
const char* msg = strerror(errno);
|
||||
ASSERT_TRUE(false) << fname << ": " << msg;
|
||||
}
|
||||
|
||||
if (offset < 0) {
|
||||
// Relative to end of file; make it absolute
|
||||
if (-offset > sbuf.st_size) {
|
||||
offset = 0;
|
||||
} else {
|
||||
offset = sbuf.st_size + offset;
|
||||
}
|
||||
}
|
||||
if (offset > sbuf.st_size) {
|
||||
offset = sbuf.st_size;
|
||||
}
|
||||
if (offset + bytes_to_corrupt > sbuf.st_size) {
|
||||
bytes_to_corrupt = sbuf.st_size - offset;
|
||||
}
|
||||
|
||||
// Do it
|
||||
std::string contents;
|
||||
Status s = ReadFileToString(Env::Default(), fname, &contents);
|
||||
ASSERT_TRUE(s.ok()) << s.ToString();
|
||||
for (int i = 0; i < bytes_to_corrupt; i++) {
|
||||
contents[i + offset] ^= 0x80;
|
||||
}
|
||||
s = WriteStringToFile(Env::Default(), contents, fname);
|
||||
ASSERT_TRUE(s.ok()) << s.ToString();
|
||||
}
|
||||
|
||||
void Corrupt(FileType filetype, int offset, int bytes_to_corrupt) {
|
||||
// Pick file to corrupt
|
||||
std::vector<std::string> filenames;
|
||||
ASSERT_OK(env_.GetChildren(dbname_, &filenames));
|
||||
uint64_t number;
|
||||
FileType type;
|
||||
std::string fname;
|
||||
int picked_number = -1;
|
||||
for (unsigned int i = 0; i < filenames.size(); i++) {
|
||||
if (ParseFileName(filenames[i], &number, &type) &&
|
||||
type == filetype &&
|
||||
static_cast<int>(number) > picked_number) { // Pick latest file
|
||||
fname = dbname_ + "/" + filenames[i];
|
||||
picked_number = number;
|
||||
}
|
||||
}
|
||||
ASSERT_TRUE(!fname.empty()) << filetype;
|
||||
|
||||
CorruptFile(fname, offset, bytes_to_corrupt);
|
||||
}
|
||||
|
||||
// corrupts exactly one file at level `level`. if no file found at level,
|
||||
// asserts
|
||||
void CorruptTableFileAtLevel(int level, int offset, int bytes_to_corrupt) {
|
||||
std::vector<LiveFileMetaData> metadata;
|
||||
db_->GetLiveFilesMetaData(&metadata);
|
||||
for (const auto& m : metadata) {
|
||||
if (m.level == level) {
|
||||
CorruptFile(dbname_ + "/" + m.name, offset, bytes_to_corrupt);
|
||||
return;
|
||||
}
|
||||
}
|
||||
ASSERT_TRUE(false) << "no file found at level";
|
||||
}
|
||||
|
||||
|
||||
int Property(const std::string& name) {
|
||||
std::string property;
|
||||
int result;
|
||||
if (db_->GetProperty(name, &property) &&
|
||||
sscanf(property.c_str(), "%d", &result) == 1) {
|
||||
return result;
|
||||
} else {
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
// Return the ith key
|
||||
Slice Key(int i, std::string* storage) {
|
||||
char buf[100];
|
||||
snprintf(buf, sizeof(buf), "%016d", i);
|
||||
storage->assign(buf, strlen(buf));
|
||||
return Slice(*storage);
|
||||
}
|
||||
|
||||
// Return the value to associate with the specified key
|
||||
Slice Value(int k, std::string* storage) {
|
||||
Random r(k);
|
||||
return test::RandomString(&r, kValueSize, storage);
|
||||
}
|
||||
};
|
||||
|
||||
TEST(CorruptionTest, Recovery) {
|
||||
Build(100);
|
||||
Check(100, 100);
|
||||
Corrupt(kLogFile, 19, 1); // WriteBatch tag for first record
|
||||
Corrupt(kLogFile, log::kBlockSize + 1000, 1); // Somewhere in second block
|
||||
Reopen();
|
||||
|
||||
// The 64 records in the first two log blocks are completely lost.
|
||||
Check(36, 36);
|
||||
}
|
||||
|
||||
TEST(CorruptionTest, RecoverWriteError) {
|
||||
env_.writable_file_error_ = true;
|
||||
Status s = TryReopen();
|
||||
ASSERT_TRUE(!s.ok());
|
||||
}
|
||||
|
||||
TEST(CorruptionTest, NewFileErrorDuringWrite) {
|
||||
// Do enough writing to force minor compaction
|
||||
env_.writable_file_error_ = true;
|
||||
const int num = 3 + (Options().write_buffer_size / kValueSize);
|
||||
std::string value_storage;
|
||||
Status s;
|
||||
bool failed = false;
|
||||
for (int i = 0; i < num; i++) {
|
||||
WriteBatch batch;
|
||||
batch.Put("a", Value(100, &value_storage));
|
||||
s = db_->Write(WriteOptions(), &batch);
|
||||
if (!s.ok()) {
|
||||
failed = true;
|
||||
}
|
||||
ASSERT_TRUE(!failed || !s.ok());
|
||||
}
|
||||
ASSERT_TRUE(!s.ok());
|
||||
ASSERT_GE(env_.num_writable_file_errors_, 1);
|
||||
env_.writable_file_error_ = false;
|
||||
Reopen();
|
||||
}
|
||||
|
||||
TEST(CorruptionTest, TableFile) {
|
||||
Build(100);
|
||||
DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
|
||||
dbi->TEST_FlushMemTable();
|
||||
dbi->TEST_CompactRange(0, nullptr, nullptr);
|
||||
dbi->TEST_CompactRange(1, nullptr, nullptr);
|
||||
|
||||
Corrupt(kTableFile, 100, 1);
|
||||
Check(99, 99);
|
||||
}
|
||||
|
||||
TEST(CorruptionTest, TableFileIndexData) {
|
||||
Build(10000); // Enough to build multiple Tables
|
||||
DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
|
||||
dbi->TEST_FlushMemTable();
|
||||
|
||||
Corrupt(kTableFile, -2000, 500);
|
||||
Reopen();
|
||||
Check(5000, 9999);
|
||||
}
|
||||
|
||||
TEST(CorruptionTest, MissingDescriptor) {
|
||||
Build(1000);
|
||||
RepairDB();
|
||||
Reopen();
|
||||
Check(1000, 1000);
|
||||
}
|
||||
|
||||
TEST(CorruptionTest, SequenceNumberRecovery) {
|
||||
ASSERT_OK(db_->Put(WriteOptions(), "foo", "v1"));
|
||||
ASSERT_OK(db_->Put(WriteOptions(), "foo", "v2"));
|
||||
ASSERT_OK(db_->Put(WriteOptions(), "foo", "v3"));
|
||||
ASSERT_OK(db_->Put(WriteOptions(), "foo", "v4"));
|
||||
ASSERT_OK(db_->Put(WriteOptions(), "foo", "v5"));
|
||||
RepairDB();
|
||||
Reopen();
|
||||
std::string v;
|
||||
ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
|
||||
ASSERT_EQ("v5", v);
|
||||
// Write something. If sequence number was not recovered properly,
|
||||
// it will be hidden by an earlier write.
|
||||
ASSERT_OK(db_->Put(WriteOptions(), "foo", "v6"));
|
||||
ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
|
||||
ASSERT_EQ("v6", v);
|
||||
Reopen();
|
||||
ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
|
||||
ASSERT_EQ("v6", v);
|
||||
}
|
||||
|
||||
TEST(CorruptionTest, CorruptedDescriptor) {
|
||||
ASSERT_OK(db_->Put(WriteOptions(), "foo", "hello"));
|
||||
DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
|
||||
dbi->TEST_FlushMemTable();
|
||||
dbi->TEST_CompactRange(0, nullptr, nullptr);
|
||||
|
||||
Corrupt(kDescriptorFile, 0, 1000);
|
||||
Status s = TryReopen();
|
||||
ASSERT_TRUE(!s.ok());
|
||||
|
||||
RepairDB();
|
||||
Reopen();
|
||||
std::string v;
|
||||
ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
|
||||
ASSERT_EQ("hello", v);
|
||||
}
|
||||
|
||||
TEST(CorruptionTest, CompactionInputError) {
|
||||
Build(10);
|
||||
DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
|
||||
dbi->TEST_FlushMemTable();
|
||||
const int last = dbi->MaxMemCompactionLevel();
|
||||
ASSERT_EQ(1, Property("rocksdb.num-files-at-level" + NumberToString(last)));
|
||||
|
||||
Corrupt(kTableFile, 100, 1);
|
||||
Check(9, 9);
|
||||
|
||||
// Force compactions by writing lots of values
|
||||
Build(10000);
|
||||
Check(10000, 10000);
|
||||
}
|
||||
|
||||
TEST(CorruptionTest, CompactionInputErrorParanoid) {
|
||||
Options options;
|
||||
options.paranoid_checks = true;
|
||||
options.write_buffer_size = 131072;
|
||||
options.max_write_buffer_number = 2;
|
||||
Reopen(&options);
|
||||
DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
|
||||
|
||||
// Fill levels >= 1 so memtable flush outputs to level 0
|
||||
for (int level = 1; level < dbi->NumberLevels(); level++) {
|
||||
dbi->Put(WriteOptions(), "", "begin");
|
||||
dbi->Put(WriteOptions(), "~", "end");
|
||||
dbi->TEST_FlushMemTable();
|
||||
}
|
||||
|
||||
options.max_mem_compaction_level = 0;
|
||||
Reopen(&options);
|
||||
|
||||
dbi = reinterpret_cast<DBImpl*>(db_);
|
||||
Build(10);
|
||||
dbi->TEST_FlushMemTable();
|
||||
dbi->TEST_WaitForCompact();
|
||||
ASSERT_EQ(1, Property("rocksdb.num-files-at-level0"));
|
||||
|
||||
CorruptTableFileAtLevel(0, 100, 1);
|
||||
Check(9, 9);
|
||||
|
||||
// Write must eventually fail because of corrupted table
|
||||
Status s;
|
||||
std::string tmp1, tmp2;
|
||||
bool failed = false;
|
||||
for (int i = 0; i < 10000; i++) {
|
||||
s = db_->Put(WriteOptions(), Key(i, &tmp1), Value(i, &tmp2));
|
||||
if (!s.ok()) {
|
||||
failed = true;
|
||||
}
|
||||
// if one write failed, every subsequent write must fail, too
|
||||
ASSERT_TRUE(!failed || !s.ok()) << "write did not fail in a corrupted db";
|
||||
}
|
||||
ASSERT_TRUE(!s.ok()) << "write did not fail in corrupted paranoid db";
|
||||
}
|
||||
|
||||
TEST(CorruptionTest, UnrelatedKeys) {
|
||||
Build(10);
|
||||
DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
|
||||
dbi->TEST_FlushMemTable();
|
||||
Corrupt(kTableFile, 100, 1);
|
||||
|
||||
std::string tmp1, tmp2;
|
||||
ASSERT_OK(db_->Put(WriteOptions(), Key(1000, &tmp1), Value(1000, &tmp2)));
|
||||
std::string v;
|
||||
ASSERT_OK(db_->Get(ReadOptions(), Key(1000, &tmp1), &v));
|
||||
ASSERT_EQ(Value(1000, &tmp2).ToString(), v);
|
||||
dbi->TEST_FlushMemTable();
|
||||
ASSERT_OK(db_->Get(ReadOptions(), Key(1000, &tmp1), &v));
|
||||
ASSERT_EQ(Value(1000, &tmp2).ToString(), v);
|
||||
}
|
||||
|
||||
TEST(CorruptionTest, FileSystemStateCorrupted) {
|
||||
for (int iter = 0; iter < 2; ++iter) {
|
||||
Options options;
|
||||
options.paranoid_checks = true;
|
||||
options.create_if_missing = true;
|
||||
Reopen(&options);
|
||||
Build(10);
|
||||
ASSERT_OK(db_->Flush(FlushOptions()));
|
||||
DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
|
||||
std::vector<LiveFileMetaData> metadata;
|
||||
dbi->GetLiveFilesMetaData(&metadata);
|
||||
ASSERT_GT(metadata.size(), size_t(0));
|
||||
std::string filename = dbname_ + metadata[0].name;
|
||||
|
||||
delete db_;
|
||||
db_ = nullptr;
|
||||
|
||||
if (iter == 0) { // corrupt file size
|
||||
unique_ptr<WritableFile> file;
|
||||
env_.NewWritableFile(filename, &file, EnvOptions());
|
||||
file->Append(Slice("corrupted sst"));
|
||||
file.reset();
|
||||
} else { // delete the file
|
||||
env_.DeleteFile(filename);
|
||||
}
|
||||
|
||||
Status x = TryReopen(&options);
|
||||
ASSERT_TRUE(x.IsCorruption());
|
||||
DestroyDB(dbname_, options_);
|
||||
Reopen(&options);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace rocksdb
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
return rocksdb::test::RunAllTests();
|
||||
}
|
||||
2642
db/db_bench.cc
Normal file
2642
db/db_bench.cc
Normal file
File diff suppressed because it is too large
Load Diff
179
db/db_filesnapshot.cc
Normal file
179
db/db_filesnapshot.cc
Normal file
@@ -0,0 +1,179 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2012 Facebook.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
#ifndef ROCKSDB_LITE
|
||||
|
||||
#define __STDC_FORMAT_MACROS
|
||||
#include <inttypes.h>
|
||||
#include <algorithm>
|
||||
#include <string>
|
||||
#include <stdint.h>
|
||||
#include "db/db_impl.h"
|
||||
#include "db/filename.h"
|
||||
#include "db/version_set.h"
|
||||
#include "rocksdb/db.h"
|
||||
#include "rocksdb/env.h"
|
||||
#include "port/port.h"
|
||||
#include "util/mutexlock.h"
|
||||
#include "util/sync_point.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
Status DBImpl::DisableFileDeletions() {
|
||||
MutexLock l(&mutex_);
|
||||
++disable_delete_obsolete_files_;
|
||||
if (disable_delete_obsolete_files_ == 1) {
|
||||
Log(options_.info_log, "File Deletions Disabled");
|
||||
} else {
|
||||
Log(options_.info_log,
|
||||
"File Deletions Disabled, but already disabled. Counter: %d",
|
||||
disable_delete_obsolete_files_);
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status DBImpl::EnableFileDeletions(bool force) {
|
||||
DeletionState deletion_state;
|
||||
bool should_purge_files = false;
|
||||
{
|
||||
MutexLock l(&mutex_);
|
||||
if (force) {
|
||||
// if force, we need to enable file deletions right away
|
||||
disable_delete_obsolete_files_ = 0;
|
||||
} else if (disable_delete_obsolete_files_ > 0) {
|
||||
--disable_delete_obsolete_files_;
|
||||
}
|
||||
if (disable_delete_obsolete_files_ == 0) {
|
||||
Log(options_.info_log, "File Deletions Enabled");
|
||||
should_purge_files = true;
|
||||
FindObsoleteFiles(deletion_state, true);
|
||||
} else {
|
||||
Log(options_.info_log,
|
||||
"File Deletions Enable, but not really enabled. Counter: %d",
|
||||
disable_delete_obsolete_files_);
|
||||
}
|
||||
}
|
||||
if (should_purge_files) {
|
||||
PurgeObsoleteFiles(deletion_state);
|
||||
}
|
||||
LogFlush(options_.info_log);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status DBImpl::GetLiveFiles(std::vector<std::string>& ret,
|
||||
uint64_t* manifest_file_size,
|
||||
bool flush_memtable) {
|
||||
|
||||
*manifest_file_size = 0;
|
||||
|
||||
mutex_.Lock();
|
||||
|
||||
if (flush_memtable) {
|
||||
// flush all dirty data to disk.
|
||||
Status status;
|
||||
for (auto cfd : *versions_->GetColumnFamilySet()) {
|
||||
cfd->Ref();
|
||||
mutex_.Unlock();
|
||||
status = FlushMemTable(cfd, FlushOptions());
|
||||
mutex_.Lock();
|
||||
cfd->Unref();
|
||||
if (!status.ok()) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
versions_->GetColumnFamilySet()->FreeDeadColumnFamilies();
|
||||
|
||||
if (!status.ok()) {
|
||||
mutex_.Unlock();
|
||||
Log(options_.info_log, "Cannot Flush data %s\n",
|
||||
status.ToString().c_str());
|
||||
return status;
|
||||
}
|
||||
}
|
||||
|
||||
// Make a set of all of the live *.sst files
|
||||
std::set<uint64_t> live;
|
||||
for (auto cfd : *versions_->GetColumnFamilySet()) {
|
||||
cfd->current()->AddLiveFiles(&live);
|
||||
}
|
||||
|
||||
ret.clear();
|
||||
ret.reserve(live.size() + 2); //*.sst + CURRENT + MANIFEST
|
||||
|
||||
// create names of the live files. The names are not absolute
|
||||
// paths, instead they are relative to dbname_;
|
||||
for (auto live_file : live) {
|
||||
ret.push_back(TableFileName("", live_file));
|
||||
}
|
||||
|
||||
ret.push_back(CurrentFileName(""));
|
||||
ret.push_back(DescriptorFileName("", versions_->ManifestFileNumber()));
|
||||
|
||||
// find length of manifest file while holding the mutex lock
|
||||
*manifest_file_size = versions_->ManifestFileSize();
|
||||
|
||||
mutex_.Unlock();
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status DBImpl::GetSortedWalFiles(VectorLogPtr& files) {
|
||||
// First get sorted files in db dir, then get sorted files from archived
|
||||
// dir, to avoid a race condition where a log file is moved to archived
|
||||
// dir in between.
|
||||
Status s;
|
||||
// list wal files in main db dir.
|
||||
VectorLogPtr logs;
|
||||
s = GetSortedWalsOfType(options_.wal_dir, logs, kAliveLogFile);
|
||||
if (!s.ok()) {
|
||||
return s;
|
||||
}
|
||||
|
||||
// Reproduce the race condition where a log file is moved
|
||||
// to archived dir, between these two sync points, used in
|
||||
// (DBTest,TransactionLogIteratorRace)
|
||||
TEST_SYNC_POINT("DBImpl::GetSortedWalFiles:1");
|
||||
TEST_SYNC_POINT("DBImpl::GetSortedWalFiles:2");
|
||||
|
||||
files.clear();
|
||||
// list wal files in archive dir.
|
||||
std::string archivedir = ArchivalDirectory(options_.wal_dir);
|
||||
if (env_->FileExists(archivedir)) {
|
||||
s = GetSortedWalsOfType(archivedir, files, kArchivedLogFile);
|
||||
if (!s.ok()) {
|
||||
return s;
|
||||
}
|
||||
}
|
||||
|
||||
uint64_t latest_archived_log_number = 0;
|
||||
if (!files.empty()) {
|
||||
latest_archived_log_number = files.back()->LogNumber();
|
||||
Log(options_.info_log, "Latest Archived log: %" PRIu64,
|
||||
latest_archived_log_number);
|
||||
}
|
||||
|
||||
files.reserve(files.size() + logs.size());
|
||||
for (auto& log : logs) {
|
||||
if (log->LogNumber() > latest_archived_log_number) {
|
||||
files.push_back(std::move(log));
|
||||
} else {
|
||||
// When the race condition happens, we could see the
|
||||
// same log in both db dir and archived dir. Simply
|
||||
// ignore the one in db dir. Note that, if we read
|
||||
// archived dir first, we would have missed the log file.
|
||||
Log(options_.info_log, "%s already moved to archive",
|
||||
log->PathName().c_str());
|
||||
}
|
||||
}
|
||||
|
||||
return s;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
#endif // ROCKSDB_LITE
|
||||
4703
db/db_impl.cc
Normal file
4703
db/db_impl.cc
Normal file
File diff suppressed because it is too large
Load Diff
635
db/db_impl.h
Normal file
635
db/db_impl.h
Normal file
@@ -0,0 +1,635 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
#pragma once
|
||||
|
||||
#include <atomic>
|
||||
#include <deque>
|
||||
#include <set>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
|
||||
#include "db/dbformat.h"
|
||||
#include "db/log_writer.h"
|
||||
#include "db/snapshot.h"
|
||||
#include "db/column_family.h"
|
||||
#include "db/version_edit.h"
|
||||
#include "memtable_list.h"
|
||||
#include "port/port.h"
|
||||
#include "rocksdb/db.h"
|
||||
#include "rocksdb/env.h"
|
||||
#include "rocksdb/memtablerep.h"
|
||||
#include "rocksdb/transaction_log.h"
|
||||
#include "util/autovector.h"
|
||||
#include "util/stats_logger.h"
|
||||
#include "util/thread_local.h"
|
||||
#include "db/internal_stats.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
class MemTable;
|
||||
class TableCache;
|
||||
class Version;
|
||||
class VersionEdit;
|
||||
class VersionSet;
|
||||
class CompactionFilterV2;
|
||||
class Arena;
|
||||
|
||||
class DBImpl : public DB {
|
||||
public:
|
||||
DBImpl(const DBOptions& options, const std::string& dbname);
|
||||
virtual ~DBImpl();
|
||||
|
||||
// Implementations of the DB interface
|
||||
using DB::Put;
|
||||
virtual Status Put(const WriteOptions& options,
|
||||
ColumnFamilyHandle* column_family, const Slice& key,
|
||||
const Slice& value);
|
||||
using DB::Merge;
|
||||
virtual Status Merge(const WriteOptions& options,
|
||||
ColumnFamilyHandle* column_family, const Slice& key,
|
||||
const Slice& value);
|
||||
using DB::Delete;
|
||||
virtual Status Delete(const WriteOptions& options,
|
||||
ColumnFamilyHandle* column_family, const Slice& key);
|
||||
using DB::Write;
|
||||
virtual Status Write(const WriteOptions& options, WriteBatch* updates);
|
||||
using DB::Get;
|
||||
virtual Status Get(const ReadOptions& options,
|
||||
ColumnFamilyHandle* column_family, const Slice& key,
|
||||
std::string* value);
|
||||
using DB::MultiGet;
|
||||
virtual std::vector<Status> MultiGet(
|
||||
const ReadOptions& options,
|
||||
const std::vector<ColumnFamilyHandle*>& column_family,
|
||||
const std::vector<Slice>& keys, std::vector<std::string>* values);
|
||||
|
||||
virtual Status CreateColumnFamily(const ColumnFamilyOptions& options,
|
||||
const std::string& column_family,
|
||||
ColumnFamilyHandle** handle);
|
||||
virtual Status DropColumnFamily(ColumnFamilyHandle* column_family);
|
||||
|
||||
// Returns false if key doesn't exist in the database and true if it may.
|
||||
// If value_found is not passed in as null, then return the value if found in
|
||||
// memory. On return, if value was found, then value_found will be set to true
|
||||
// , otherwise false.
|
||||
using DB::KeyMayExist;
|
||||
virtual bool KeyMayExist(const ReadOptions& options,
|
||||
ColumnFamilyHandle* column_family, const Slice& key,
|
||||
std::string* value, bool* value_found = nullptr);
|
||||
using DB::NewIterator;
|
||||
virtual Iterator* NewIterator(const ReadOptions& options,
|
||||
ColumnFamilyHandle* column_family);
|
||||
virtual Status NewIterators(
|
||||
const ReadOptions& options,
|
||||
const std::vector<ColumnFamilyHandle*>& column_families,
|
||||
std::vector<Iterator*>* iterators);
|
||||
virtual const Snapshot* GetSnapshot();
|
||||
virtual void ReleaseSnapshot(const Snapshot* snapshot);
|
||||
using DB::GetProperty;
|
||||
virtual bool GetProperty(ColumnFamilyHandle* column_family,
|
||||
const Slice& property, std::string* value);
|
||||
using DB::GetApproximateSizes;
|
||||
virtual void GetApproximateSizes(ColumnFamilyHandle* column_family,
|
||||
const Range* range, int n, uint64_t* sizes);
|
||||
using DB::CompactRange;
|
||||
virtual Status CompactRange(ColumnFamilyHandle* column_family,
|
||||
const Slice* begin, const Slice* end,
|
||||
bool reduce_level = false, int target_level = -1);
|
||||
|
||||
using DB::NumberLevels;
|
||||
virtual int NumberLevels(ColumnFamilyHandle* column_family);
|
||||
using DB::MaxMemCompactionLevel;
|
||||
virtual int MaxMemCompactionLevel(ColumnFamilyHandle* column_family);
|
||||
using DB::Level0StopWriteTrigger;
|
||||
virtual int Level0StopWriteTrigger(ColumnFamilyHandle* column_family);
|
||||
virtual const std::string& GetName() const;
|
||||
virtual Env* GetEnv() const;
|
||||
using DB::GetOptions;
|
||||
virtual const Options& GetOptions(ColumnFamilyHandle* column_family) const;
|
||||
using DB::Flush;
|
||||
virtual Status Flush(const FlushOptions& options,
|
||||
ColumnFamilyHandle* column_family);
|
||||
|
||||
virtual SequenceNumber GetLatestSequenceNumber() const;
|
||||
|
||||
#ifndef ROCKSDB_LITE
|
||||
virtual Status DisableFileDeletions();
|
||||
virtual Status EnableFileDeletions(bool force);
|
||||
// All the returned filenames start with "/"
|
||||
virtual Status GetLiveFiles(std::vector<std::string>&,
|
||||
uint64_t* manifest_file_size,
|
||||
bool flush_memtable = true);
|
||||
virtual Status GetSortedWalFiles(VectorLogPtr& files);
|
||||
|
||||
virtual Status GetUpdatesSince(
|
||||
SequenceNumber seq_number, unique_ptr<TransactionLogIterator>* iter,
|
||||
const TransactionLogIterator::ReadOptions&
|
||||
read_options = TransactionLogIterator::ReadOptions());
|
||||
virtual Status DeleteFile(std::string name);
|
||||
|
||||
virtual void GetLiveFilesMetaData(std::vector<LiveFileMetaData>* metadata);
|
||||
#endif // ROCKSDB_LITE
|
||||
|
||||
// checks if all live files exist on file system and that their file sizes
|
||||
// match to our in-memory records
|
||||
virtual Status CheckConsistency();
|
||||
|
||||
virtual Status GetDbIdentity(std::string& identity);
|
||||
|
||||
Status RunManualCompaction(ColumnFamilyData* cfd, int input_level,
|
||||
int output_level, const Slice* begin,
|
||||
const Slice* end);
|
||||
|
||||
#ifndef ROCKSDB_LITE
|
||||
// Extra methods (for testing) that are not in the public DB interface
|
||||
// Implemented in db_impl_debug.cc
|
||||
|
||||
// Compact any files in the named level that overlap [*begin, *end]
|
||||
Status TEST_CompactRange(int level, const Slice* begin, const Slice* end,
|
||||
ColumnFamilyHandle* column_family = nullptr);
|
||||
|
||||
// Force current memtable contents to be flushed.
|
||||
Status TEST_FlushMemTable(bool wait = true);
|
||||
|
||||
// Wait for memtable compaction
|
||||
Status TEST_WaitForFlushMemTable(ColumnFamilyHandle* column_family = nullptr);
|
||||
|
||||
// Wait for any compaction
|
||||
Status TEST_WaitForCompact();
|
||||
|
||||
// Return an internal iterator over the current state of the database.
|
||||
// The keys of this iterator are internal keys (see format.h).
|
||||
// The returned iterator should be deleted when no longer needed.
|
||||
Iterator* TEST_NewInternalIterator(ColumnFamilyHandle* column_family =
|
||||
nullptr);
|
||||
|
||||
// Return the maximum overlapping data (in bytes) at next level for any
|
||||
// file at a level >= 1.
|
||||
int64_t TEST_MaxNextLevelOverlappingBytes(ColumnFamilyHandle* column_family =
|
||||
nullptr);
|
||||
|
||||
// Return the current manifest file no.
|
||||
uint64_t TEST_Current_Manifest_FileNo();
|
||||
|
||||
// Trigger's a background call for testing.
|
||||
void TEST_PurgeObsoleteteWAL();
|
||||
|
||||
// get total level0 file size. Only for testing.
|
||||
uint64_t TEST_GetLevel0TotalSize();
|
||||
|
||||
void TEST_SetDefaultTimeToCheck(uint64_t default_interval_to_delete_obsolete_WAL)
|
||||
{
|
||||
default_interval_to_delete_obsolete_WAL_ = default_interval_to_delete_obsolete_WAL;
|
||||
}
|
||||
|
||||
void TEST_GetFilesMetaData(ColumnFamilyHandle* column_family,
|
||||
std::vector<std::vector<FileMetaData>>* metadata);
|
||||
|
||||
Status TEST_ReadFirstRecord(const WalFileType type, const uint64_t number,
|
||||
SequenceNumber* sequence);
|
||||
|
||||
Status TEST_ReadFirstLine(const std::string& fname, SequenceNumber* sequence);
|
||||
#endif // NDEBUG
|
||||
|
||||
// needed for CleanupIteratorState
|
||||
struct DeletionState {
|
||||
inline bool HaveSomethingToDelete() const {
|
||||
return candidate_files.size() ||
|
||||
sst_delete_files.size() ||
|
||||
log_delete_files.size();
|
||||
}
|
||||
|
||||
// a list of all files that we'll consider deleting
|
||||
// (every once in a while this is filled up with all files
|
||||
// in the DB directory)
|
||||
std::vector<std::string> candidate_files;
|
||||
|
||||
// the list of all live sst files that cannot be deleted
|
||||
std::vector<uint64_t> sst_live;
|
||||
|
||||
// a list of sst files that we need to delete
|
||||
std::vector<FileMetaData*> sst_delete_files;
|
||||
|
||||
// a list of log files that we need to delete
|
||||
std::vector<uint64_t> log_delete_files;
|
||||
|
||||
// a list of memtables to be free
|
||||
autovector<MemTable*> memtables_to_free;
|
||||
|
||||
autovector<SuperVersion*> superversions_to_free;
|
||||
|
||||
SuperVersion* new_superversion; // if nullptr no new superversion
|
||||
|
||||
// the current manifest_file_number, log_number and prev_log_number
|
||||
// that corresponds to the set of files in 'live'.
|
||||
uint64_t manifest_file_number, pending_manifest_file_number, log_number,
|
||||
prev_log_number;
|
||||
|
||||
explicit DeletionState(bool create_superversion = false) {
|
||||
manifest_file_number = 0;
|
||||
pending_manifest_file_number = 0;
|
||||
log_number = 0;
|
||||
prev_log_number = 0;
|
||||
new_superversion = create_superversion ? new SuperVersion() : nullptr;
|
||||
}
|
||||
|
||||
~DeletionState() {
|
||||
// free pending memtables
|
||||
for (auto m : memtables_to_free) {
|
||||
delete m;
|
||||
}
|
||||
// free superversions
|
||||
for (auto s : superversions_to_free) {
|
||||
delete s;
|
||||
}
|
||||
// if new_superversion was not used, it will be non-nullptr and needs
|
||||
// to be freed here
|
||||
delete new_superversion;
|
||||
}
|
||||
};
|
||||
|
||||
// Returns the list of live files in 'live' and the list
|
||||
// of all files in the filesystem in 'candidate_files'.
|
||||
// If force == false and the last call was less than
|
||||
// options_.delete_obsolete_files_period_micros microseconds ago,
|
||||
// it will not fill up the deletion_state
|
||||
void FindObsoleteFiles(DeletionState& deletion_state,
|
||||
bool force,
|
||||
bool no_full_scan = false);
|
||||
|
||||
// Diffs the files listed in filenames and those that do not
|
||||
// belong to live files are posibly removed. Also, removes all the
|
||||
// files in sst_delete_files and log_delete_files.
|
||||
// It is not necessary to hold the mutex when invoking this method.
|
||||
void PurgeObsoleteFiles(DeletionState& deletion_state);
|
||||
|
||||
ColumnFamilyHandle* DefaultColumnFamily() const;
|
||||
|
||||
protected:
|
||||
Env* const env_;
|
||||
const std::string dbname_;
|
||||
unique_ptr<VersionSet> versions_;
|
||||
const DBOptions options_;
|
||||
|
||||
Iterator* NewInternalIterator(const ReadOptions&, ColumnFamilyData* cfd,
|
||||
SuperVersion* super_version,
|
||||
Arena* arena = nullptr);
|
||||
|
||||
private:
|
||||
friend class DB;
|
||||
friend class InternalStats;
|
||||
#ifndef ROCKSDB_LITE
|
||||
friend class TailingIterator;
|
||||
friend class ForwardIterator;
|
||||
#endif
|
||||
friend struct SuperVersion;
|
||||
struct CompactionState;
|
||||
struct Writer;
|
||||
|
||||
Status NewDB();
|
||||
|
||||
// Recover the descriptor from persistent storage. May do a significant
|
||||
// amount of work to recover recently logged updates. Any changes to
|
||||
// be made to the descriptor are added to *edit.
|
||||
Status Recover(const std::vector<ColumnFamilyDescriptor>& column_families,
|
||||
bool read_only = false, bool error_if_log_file_exist = false);
|
||||
|
||||
void MaybeIgnoreError(Status* s) const;
|
||||
|
||||
const Status CreateArchivalDirectory();
|
||||
|
||||
// Delete any unneeded files and stale in-memory entries.
|
||||
void DeleteObsoleteFiles();
|
||||
|
||||
// Flush the in-memory write buffer to storage. Switches to a new
|
||||
// log-file/memtable and writes a new descriptor iff successful.
|
||||
Status FlushMemTableToOutputFile(ColumnFamilyData* cfd, bool* madeProgress,
|
||||
DeletionState& deletion_state,
|
||||
LogBuffer* log_buffer);
|
||||
|
||||
Status RecoverLogFile(uint64_t log_number, SequenceNumber* max_sequence,
|
||||
bool read_only);
|
||||
|
||||
// The following two methods are used to flush a memtable to
|
||||
// storage. The first one is used atdatabase RecoveryTime (when the
|
||||
// database is opened) and is heavyweight because it holds the mutex
|
||||
// for the entire period. The second method WriteLevel0Table supports
|
||||
// concurrent flush memtables to storage.
|
||||
Status WriteLevel0TableForRecovery(ColumnFamilyData* cfd, MemTable* mem,
|
||||
VersionEdit* edit);
|
||||
Status WriteLevel0Table(ColumnFamilyData* cfd, autovector<MemTable*>& mems,
|
||||
VersionEdit* edit, uint64_t* filenumber,
|
||||
LogBuffer* log_buffer);
|
||||
|
||||
uint64_t SlowdownAmount(int n, double bottom, double top);
|
||||
|
||||
// TODO(icanadi) free superversion_to_free and old_log outside of mutex
|
||||
Status MakeRoomForWrite(ColumnFamilyData* cfd,
|
||||
bool force /* flush even if there is room? */,
|
||||
autovector<SuperVersion*>* superversions_to_free,
|
||||
autovector<log::Writer*>* logs_to_free);
|
||||
|
||||
void BuildBatchGroup(Writer** last_writer,
|
||||
autovector<WriteBatch*>* write_batch_group);
|
||||
|
||||
// Force current memtable contents to be flushed.
|
||||
Status FlushMemTable(ColumnFamilyData* cfd, const FlushOptions& options);
|
||||
|
||||
// Wait for memtable flushed
|
||||
Status WaitForFlushMemTable(ColumnFamilyData* cfd);
|
||||
|
||||
void MaybeScheduleLogDBDeployStats();
|
||||
|
||||
#ifndef ROCKSDB_LITE
|
||||
static void BGLogDBDeployStats(void* db);
|
||||
void LogDBDeployStats();
|
||||
#endif // ROCKSDB_LITE
|
||||
|
||||
void MaybeScheduleFlushOrCompaction();
|
||||
static void BGWorkCompaction(void* db);
|
||||
static void BGWorkFlush(void* db);
|
||||
void BackgroundCallCompaction();
|
||||
void BackgroundCallFlush();
|
||||
Status BackgroundCompaction(bool* madeProgress, DeletionState& deletion_state,
|
||||
LogBuffer* log_buffer);
|
||||
Status BackgroundFlush(bool* madeProgress, DeletionState& deletion_state,
|
||||
LogBuffer* log_buffer);
|
||||
void CleanupCompaction(CompactionState* compact, Status status);
|
||||
Status DoCompactionWork(CompactionState* compact,
|
||||
DeletionState& deletion_state,
|
||||
LogBuffer* log_buffer);
|
||||
|
||||
// This function is called as part of compaction. It enables Flush process to
|
||||
// preempt compaction, since it's higher prioirty
|
||||
// Returns: micros spent executing
|
||||
uint64_t CallFlushDuringCompaction(ColumnFamilyData* cfd,
|
||||
DeletionState& deletion_state,
|
||||
LogBuffer* log_buffer);
|
||||
|
||||
// Call compaction filter if is_compaction_v2 is not true. Then iterate
|
||||
// through input and compact the kv-pairs
|
||||
Status ProcessKeyValueCompaction(
|
||||
SequenceNumber visible_at_tip,
|
||||
SequenceNumber earliest_snapshot,
|
||||
SequenceNumber latest_snapshot,
|
||||
DeletionState& deletion_state,
|
||||
bool bottommost_level,
|
||||
int64_t& imm_micros,
|
||||
Iterator* input,
|
||||
CompactionState* compact,
|
||||
bool is_compaction_v2,
|
||||
LogBuffer* log_buffer);
|
||||
|
||||
// Call compaction_filter_v2->Filter() on kv-pairs in compact
|
||||
void CallCompactionFilterV2(CompactionState* compact,
|
||||
CompactionFilterV2* compaction_filter_v2);
|
||||
|
||||
Status OpenCompactionOutputFile(CompactionState* compact);
|
||||
Status FinishCompactionOutputFile(CompactionState* compact, Iterator* input);
|
||||
Status InstallCompactionResults(CompactionState* compact,
|
||||
LogBuffer* log_buffer);
|
||||
void AllocateCompactionOutputFileNumbers(CompactionState* compact);
|
||||
void ReleaseCompactionUnusedFileNumbers(CompactionState* compact);
|
||||
|
||||
#ifdef ROCKSDB_LITE
|
||||
void PurgeObsoleteWALFiles() {
|
||||
// this function is used for archiving WAL files. we don't need this in
|
||||
// ROCKSDB_LITE
|
||||
}
|
||||
#else
|
||||
void PurgeObsoleteWALFiles();
|
||||
|
||||
Status GetSortedWalsOfType(const std::string& path,
|
||||
VectorLogPtr& log_files,
|
||||
WalFileType type);
|
||||
|
||||
// Requires: all_logs should be sorted with earliest log file first
|
||||
// Retains all log files in all_logs which contain updates with seq no.
|
||||
// Greater Than or Equal to the requested SequenceNumber.
|
||||
Status RetainProbableWalFiles(VectorLogPtr& all_logs,
|
||||
const SequenceNumber target);
|
||||
|
||||
Status ReadFirstRecord(const WalFileType type, const uint64_t number,
|
||||
SequenceNumber* sequence);
|
||||
|
||||
Status ReadFirstLine(const std::string& fname, SequenceNumber* sequence);
|
||||
#endif // ROCKSDB_LITE
|
||||
|
||||
void PrintStatistics();
|
||||
|
||||
// dump rocksdb.stats to LOG
|
||||
void MaybeDumpStats();
|
||||
|
||||
// Return true if the current db supports snapshot. If the current
|
||||
// DB does not support snapshot, then calling GetSnapshot() will always
|
||||
// return nullptr.
|
||||
//
|
||||
// @see GetSnapshot()
|
||||
virtual bool IsSnapshotSupported() const;
|
||||
|
||||
// Return the minimum empty level that could hold the total data in the
|
||||
// input level. Return the input level, if such level could not be found.
|
||||
int FindMinimumEmptyLevelFitting(ColumnFamilyData* cfd, int level);
|
||||
|
||||
// Move the files in the input level to the target level.
|
||||
// If target_level < 0, automatically calculate the minimum level that could
|
||||
// hold the data set.
|
||||
Status ReFitLevel(ColumnFamilyData* cfd, int level, int target_level = -1);
|
||||
|
||||
// table_cache_ provides its own synchronization
|
||||
std::shared_ptr<Cache> table_cache_;
|
||||
|
||||
// Lock over the persistent DB state. Non-nullptr iff successfully acquired.
|
||||
FileLock* db_lock_;
|
||||
|
||||
// State below is protected by mutex_
|
||||
port::Mutex mutex_;
|
||||
port::AtomicPointer shutting_down_;
|
||||
// This condition variable is signaled on these conditions:
|
||||
// * whenever bg_compaction_scheduled_ goes down to 0
|
||||
// * if bg_manual_only_ > 0, whenever a compaction finishes, even if it hasn't
|
||||
// made any progress
|
||||
// * whenever a compaction made any progress
|
||||
// * whenever bg_flush_scheduled_ value decreases (i.e. whenever a flush is
|
||||
// done, even if it didn't make any progress)
|
||||
// * whenever there is an error in background flush or compaction
|
||||
// * whenever bg_logstats_scheduled_ turns to false
|
||||
port::CondVar bg_cv_;
|
||||
uint64_t logfile_number_;
|
||||
unique_ptr<log::Writer> log_;
|
||||
bool log_empty_;
|
||||
ColumnFamilyHandleImpl* default_cf_handle_;
|
||||
unique_ptr<ColumnFamilyMemTablesImpl> column_family_memtables_;
|
||||
struct LogFileNumberSize {
|
||||
explicit LogFileNumberSize(uint64_t _number)
|
||||
: number(_number), size(0), getting_flushed(false) {}
|
||||
void AddSize(uint64_t new_size) { size += new_size; }
|
||||
uint64_t number;
|
||||
uint64_t size;
|
||||
bool getting_flushed;
|
||||
};
|
||||
std::deque<LogFileNumberSize> alive_log_files_;
|
||||
uint64_t total_log_size_;
|
||||
// only used for dynamically adjusting max_total_wal_size. it is a sum of
|
||||
// [write_buffer_size * max_write_buffer_number] over all column families
|
||||
uint64_t max_total_in_memory_state_;
|
||||
|
||||
std::string host_name_;
|
||||
|
||||
std::unique_ptr<Directory> db_directory_;
|
||||
|
||||
// Queue of writers.
|
||||
std::deque<Writer*> writers_;
|
||||
WriteBatch tmp_batch_;
|
||||
|
||||
SnapshotList snapshots_;
|
||||
|
||||
// cache for ReadFirstRecord() calls
|
||||
std::unordered_map<uint64_t, SequenceNumber> read_first_record_cache_;
|
||||
port::Mutex read_first_record_cache_mutex_;
|
||||
|
||||
// Set of table files to protect from deletion because they are
|
||||
// part of ongoing compactions.
|
||||
std::set<uint64_t> pending_outputs_;
|
||||
|
||||
// At least one compaction or flush job is pending but not yet scheduled
|
||||
// because of the max background thread limit.
|
||||
bool bg_schedule_needed_;
|
||||
|
||||
// count how many background compactions are running or have been scheduled
|
||||
int bg_compaction_scheduled_;
|
||||
|
||||
// If non-zero, MaybeScheduleFlushOrCompaction() will only schedule manual
|
||||
// compactions (if manual_compaction_ is not null). This mechanism enables
|
||||
// manual compactions to wait until all other compactions are finished.
|
||||
int bg_manual_only_;
|
||||
|
||||
// number of background memtable flush jobs, submitted to the HIGH pool
|
||||
int bg_flush_scheduled_;
|
||||
|
||||
// Has a background stats log thread scheduled?
|
||||
bool bg_logstats_scheduled_;
|
||||
|
||||
// Information for a manual compaction
|
||||
struct ManualCompaction {
|
||||
ColumnFamilyData* cfd;
|
||||
int input_level;
|
||||
int output_level;
|
||||
bool done;
|
||||
Status status;
|
||||
bool in_progress; // compaction request being processed?
|
||||
const InternalKey* begin; // nullptr means beginning of key range
|
||||
const InternalKey* end; // nullptr means end of key range
|
||||
InternalKey tmp_storage; // Used to keep track of compaction progress
|
||||
};
|
||||
ManualCompaction* manual_compaction_;
|
||||
|
||||
// Have we encountered a background error in paranoid mode?
|
||||
Status bg_error_;
|
||||
|
||||
std::unique_ptr<StatsLogger> logger_;
|
||||
|
||||
int64_t volatile last_log_ts;
|
||||
|
||||
// shall we disable deletion of obsolete files
|
||||
// if 0 the deletion is enabled.
|
||||
// if non-zero, files will not be getting deleted
|
||||
// This enables two different threads to call
|
||||
// EnableFileDeletions() and DisableFileDeletions()
|
||||
// without any synchronization
|
||||
int disable_delete_obsolete_files_;
|
||||
|
||||
// last time when DeleteObsoleteFiles was invoked
|
||||
uint64_t delete_obsolete_files_last_run_;
|
||||
|
||||
// last time when PurgeObsoleteWALFiles ran.
|
||||
uint64_t purge_wal_files_last_run_;
|
||||
|
||||
// last time stats were dumped to LOG
|
||||
std::atomic<uint64_t> last_stats_dump_time_microsec_;
|
||||
|
||||
// obsolete files will be deleted every this seconds if ttl deletion is
|
||||
// enabled and archive size_limit is disabled.
|
||||
uint64_t default_interval_to_delete_obsolete_WAL_;
|
||||
|
||||
bool flush_on_destroy_; // Used when disableWAL is true.
|
||||
|
||||
static const int KEEP_LOG_FILE_NUM = 1000;
|
||||
std::string db_absolute_path_;
|
||||
|
||||
// count of the number of contiguous delaying writes
|
||||
int delayed_writes_;
|
||||
|
||||
// The options to access storage files
|
||||
const EnvOptions storage_options_;
|
||||
|
||||
// A value of true temporarily disables scheduling of background work
|
||||
bool bg_work_gate_closed_;
|
||||
|
||||
// Guard against multiple concurrent refitting
|
||||
bool refitting_level_;
|
||||
|
||||
// Indicate DB was opened successfully
|
||||
bool opened_successfully_;
|
||||
|
||||
// No copying allowed
|
||||
DBImpl(const DBImpl&);
|
||||
void operator=(const DBImpl&);
|
||||
|
||||
// dump the delayed_writes_ to the log file and reset counter.
|
||||
void DelayLoggingAndReset();
|
||||
|
||||
// Return the earliest snapshot where seqno is visible.
|
||||
// Store the snapshot right before that, if any, in prev_snapshot
|
||||
inline SequenceNumber findEarliestVisibleSnapshot(
|
||||
SequenceNumber in,
|
||||
std::vector<SequenceNumber>& snapshots,
|
||||
SequenceNumber* prev_snapshot);
|
||||
|
||||
// Background threads call this function, which is just a wrapper around
|
||||
// the cfd->InstallSuperVersion() function. Background threads carry
|
||||
// deletion_state which can have new_superversion already allocated.
|
||||
void InstallSuperVersion(ColumnFamilyData* cfd,
|
||||
DeletionState& deletion_state);
|
||||
|
||||
#ifndef ROCKSDB_LITE
|
||||
using DB::GetPropertiesOfAllTables;
|
||||
virtual Status GetPropertiesOfAllTables(ColumnFamilyHandle* column_family,
|
||||
TablePropertiesCollection* props)
|
||||
override;
|
||||
#endif // ROCKSDB_LITE
|
||||
|
||||
// Function that Get and KeyMayExist call with no_io true or false
|
||||
// Note: 'value_found' from KeyMayExist propagates here
|
||||
Status GetImpl(const ReadOptions& options, ColumnFamilyHandle* column_family,
|
||||
const Slice& key, std::string* value,
|
||||
bool* value_found = nullptr);
|
||||
};
|
||||
|
||||
// Sanitize db options. The caller should delete result.info_log if
|
||||
// it is not equal to src.info_log.
|
||||
extern Options SanitizeOptions(const std::string& db,
|
||||
const InternalKeyComparator* icmp,
|
||||
const InternalFilterPolicy* ipolicy,
|
||||
const Options& src);
|
||||
extern DBOptions SanitizeOptions(const std::string& db, const DBOptions& src);
|
||||
|
||||
// Determine compression type, based on user options, level of the output
|
||||
// file and whether compression is disabled.
|
||||
// If enable_compression is false, then compression is always disabled no
|
||||
// matter what the values of the other two parameters are.
|
||||
// Otherwise, the compression type is determined based on options and level.
|
||||
CompressionType GetCompressionType(const Options& options, int level,
|
||||
const bool enable_compression);
|
||||
|
||||
// Determine compression type for L0 file written by memtable flush.
|
||||
CompressionType GetCompressionFlush(const Options& options);
|
||||
|
||||
} // namespace rocksdb
|
||||
133
db/db_impl_debug.cc
Normal file
133
db/db_impl_debug.cc
Normal file
@@ -0,0 +1,133 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#ifndef ROCKSDB_LITE
|
||||
|
||||
#include "db/db_impl.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
void DBImpl::TEST_PurgeObsoleteteWAL() { PurgeObsoleteWALFiles(); }
|
||||
|
||||
uint64_t DBImpl::TEST_GetLevel0TotalSize() {
|
||||
MutexLock l(&mutex_);
|
||||
return default_cf_handle_->cfd()->current()->NumLevelBytes(0);
|
||||
}
|
||||
|
||||
Iterator* DBImpl::TEST_NewInternalIterator(ColumnFamilyHandle* column_family) {
|
||||
ColumnFamilyData* cfd;
|
||||
if (column_family == nullptr) {
|
||||
cfd = default_cf_handle_->cfd();
|
||||
} else {
|
||||
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
|
||||
cfd = cfh->cfd();
|
||||
}
|
||||
|
||||
mutex_.Lock();
|
||||
SuperVersion* super_version = cfd->GetSuperVersion()->Ref();
|
||||
mutex_.Unlock();
|
||||
ReadOptions roptions;
|
||||
return NewInternalIterator(roptions, cfd, super_version);
|
||||
}
|
||||
|
||||
int64_t DBImpl::TEST_MaxNextLevelOverlappingBytes(
|
||||
ColumnFamilyHandle* column_family) {
|
||||
ColumnFamilyData* cfd;
|
||||
if (column_family == nullptr) {
|
||||
cfd = default_cf_handle_->cfd();
|
||||
} else {
|
||||
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
|
||||
cfd = cfh->cfd();
|
||||
}
|
||||
MutexLock l(&mutex_);
|
||||
return cfd->current()->MaxNextLevelOverlappingBytes();
|
||||
}
|
||||
|
||||
void DBImpl::TEST_GetFilesMetaData(
|
||||
ColumnFamilyHandle* column_family,
|
||||
std::vector<std::vector<FileMetaData>>* metadata) {
|
||||
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
|
||||
auto cfd = cfh->cfd();
|
||||
MutexLock l(&mutex_);
|
||||
metadata->resize(NumberLevels());
|
||||
for (int level = 0; level < NumberLevels(); level++) {
|
||||
const std::vector<FileMetaData*>& files = cfd->current()->files_[level];
|
||||
|
||||
(*metadata)[level].clear();
|
||||
for (const auto& f : files) {
|
||||
(*metadata)[level].push_back(*f);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
uint64_t DBImpl::TEST_Current_Manifest_FileNo() {
|
||||
return versions_->ManifestFileNumber();
|
||||
}
|
||||
|
||||
Status DBImpl::TEST_CompactRange(int level, const Slice* begin,
|
||||
const Slice* end,
|
||||
ColumnFamilyHandle* column_family) {
|
||||
ColumnFamilyData* cfd;
|
||||
if (column_family == nullptr) {
|
||||
cfd = default_cf_handle_->cfd();
|
||||
} else {
|
||||
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
|
||||
cfd = cfh->cfd();
|
||||
}
|
||||
int output_level =
|
||||
(cfd->options()->compaction_style == kCompactionStyleUniversal ||
|
||||
cfd->options()->compaction_style == kCompactionStyleFIFO)
|
||||
? level
|
||||
: level + 1;
|
||||
return RunManualCompaction(cfd, level, output_level, begin, end);
|
||||
}
|
||||
|
||||
Status DBImpl::TEST_FlushMemTable(bool wait) {
|
||||
FlushOptions fo;
|
||||
fo.wait = wait;
|
||||
return FlushMemTable(default_cf_handle_->cfd(), fo);
|
||||
}
|
||||
|
||||
Status DBImpl::TEST_WaitForFlushMemTable(ColumnFamilyHandle* column_family) {
|
||||
ColumnFamilyData* cfd;
|
||||
if (column_family == nullptr) {
|
||||
cfd = default_cf_handle_->cfd();
|
||||
} else {
|
||||
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
|
||||
cfd = cfh->cfd();
|
||||
}
|
||||
return WaitForFlushMemTable(cfd);
|
||||
}
|
||||
|
||||
Status DBImpl::TEST_WaitForCompact() {
|
||||
// Wait until the compaction completes
|
||||
|
||||
// TODO: a bug here. This function actually does not necessarily
|
||||
// wait for compact. It actually waits for scheduled compaction
|
||||
// OR flush to finish.
|
||||
|
||||
MutexLock l(&mutex_);
|
||||
while ((bg_compaction_scheduled_ || bg_flush_scheduled_) && bg_error_.ok()) {
|
||||
bg_cv_.Wait();
|
||||
}
|
||||
return bg_error_;
|
||||
}
|
||||
|
||||
Status DBImpl::TEST_ReadFirstRecord(const WalFileType type,
|
||||
const uint64_t number,
|
||||
SequenceNumber* sequence) {
|
||||
return ReadFirstRecord(type, number, sequence);
|
||||
}
|
||||
|
||||
Status DBImpl::TEST_ReadFirstLine(const std::string& fname,
|
||||
SequenceNumber* sequence) {
|
||||
return ReadFirstLine(fname, sequence);
|
||||
}
|
||||
} // namespace rocksdb
|
||||
#endif // ROCKSDB_LITE
|
||||
154
db/db_impl_readonly.cc
Normal file
154
db/db_impl_readonly.cc
Normal file
@@ -0,0 +1,154 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2012 Facebook. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
#include "db/db_impl_readonly.h"
|
||||
#include "db/db_impl.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <set>
|
||||
#include <string>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
#include "db/db_iter.h"
|
||||
#include "db/dbformat.h"
|
||||
#include "db/filename.h"
|
||||
#include "db/log_reader.h"
|
||||
#include "db/log_writer.h"
|
||||
#include "db/memtable.h"
|
||||
#include "db/merge_context.h"
|
||||
#include "db/table_cache.h"
|
||||
#include "db/version_set.h"
|
||||
#include "db/write_batch_internal.h"
|
||||
#include "rocksdb/db.h"
|
||||
#include "rocksdb/env.h"
|
||||
#include "rocksdb/status.h"
|
||||
#include "rocksdb/table.h"
|
||||
#include "rocksdb/merge_operator.h"
|
||||
#include "port/port.h"
|
||||
#include "table/block.h"
|
||||
#include "table/merger.h"
|
||||
#include "table/two_level_iterator.h"
|
||||
#include "util/coding.h"
|
||||
#include "util/logging.h"
|
||||
#include "util/build_version.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
DBImplReadOnly::DBImplReadOnly(const DBOptions& options,
|
||||
const std::string& dbname)
|
||||
: DBImpl(options, dbname) {
|
||||
Log(options_.info_log, "Opening the db in read only mode");
|
||||
}
|
||||
|
||||
DBImplReadOnly::~DBImplReadOnly() {
|
||||
}
|
||||
|
||||
// Implementations of the DB interface
|
||||
Status DBImplReadOnly::Get(const ReadOptions& options,
|
||||
ColumnFamilyHandle* column_family, const Slice& key,
|
||||
std::string* value) {
|
||||
Status s;
|
||||
SequenceNumber snapshot = versions_->LastSequence();
|
||||
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
|
||||
auto cfd = cfh->cfd();
|
||||
SuperVersion* super_version = cfd->GetSuperVersion();
|
||||
MergeContext merge_context;
|
||||
LookupKey lkey(key, snapshot);
|
||||
if (super_version->mem->Get(lkey, value, &s, merge_context,
|
||||
*cfd->options())) {
|
||||
} else {
|
||||
Version::GetStats stats;
|
||||
super_version->current->Get(options, lkey, value, &s, &merge_context,
|
||||
&stats);
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
Iterator* DBImplReadOnly::NewIterator(const ReadOptions& options,
|
||||
ColumnFamilyHandle* column_family) {
|
||||
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
|
||||
auto cfd = cfh->cfd();
|
||||
SuperVersion* super_version = cfd->GetSuperVersion()->Ref();
|
||||
SequenceNumber latest_snapshot = versions_->LastSequence();
|
||||
Iterator* internal_iter = NewInternalIterator(options, cfd, super_version);
|
||||
return NewDBIterator(
|
||||
env_, *cfd->options(), cfd->user_comparator(), internal_iter,
|
||||
(options.snapshot != nullptr
|
||||
? reinterpret_cast<const SnapshotImpl*>(options.snapshot)->number_
|
||||
: latest_snapshot));
|
||||
}
|
||||
|
||||
Status DB::OpenForReadOnly(const Options& options, const std::string& dbname,
|
||||
DB** dbptr, bool error_if_log_file_exist) {
|
||||
*dbptr = nullptr;
|
||||
|
||||
DBOptions db_options(options);
|
||||
ColumnFamilyOptions cf_options(options);
|
||||
std::vector<ColumnFamilyDescriptor> column_families;
|
||||
column_families.push_back(
|
||||
ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options));
|
||||
std::vector<ColumnFamilyHandle*> handles;
|
||||
|
||||
Status s =
|
||||
DB::OpenForReadOnly(db_options, dbname, column_families, &handles, dbptr);
|
||||
if (s.ok()) {
|
||||
assert(handles.size() == 1);
|
||||
// i can delete the handle since DBImpl is always holding a
|
||||
// reference to default column family
|
||||
delete handles[0];
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
Status DB::OpenForReadOnly(
|
||||
const DBOptions& db_options, const std::string& dbname,
|
||||
const std::vector<ColumnFamilyDescriptor>& column_families,
|
||||
std::vector<ColumnFamilyHandle*>* handles, DB** dbptr,
|
||||
bool error_if_log_file_exist) {
|
||||
*dbptr = nullptr;
|
||||
handles->clear();
|
||||
|
||||
DBImplReadOnly* impl = new DBImplReadOnly(db_options, dbname);
|
||||
impl->mutex_.Lock();
|
||||
Status s = impl->Recover(column_families, true /* read only */,
|
||||
error_if_log_file_exist);
|
||||
if (s.ok()) {
|
||||
// set column family handles
|
||||
for (auto cf : column_families) {
|
||||
auto cfd =
|
||||
impl->versions_->GetColumnFamilySet()->GetColumnFamily(cf.name);
|
||||
if (cfd == nullptr) {
|
||||
s = Status::InvalidArgument("Column family not found: ", cf.name);
|
||||
break;
|
||||
}
|
||||
handles->push_back(new ColumnFamilyHandleImpl(cfd, impl, &impl->mutex_));
|
||||
}
|
||||
}
|
||||
if (s.ok()) {
|
||||
for (auto cfd : *impl->versions_->GetColumnFamilySet()) {
|
||||
delete cfd->InstallSuperVersion(new SuperVersion(), &impl->mutex_);
|
||||
}
|
||||
}
|
||||
impl->mutex_.Unlock();
|
||||
if (s.ok()) {
|
||||
*dbptr = impl;
|
||||
} else {
|
||||
for (auto h : *handles) {
|
||||
delete h;
|
||||
}
|
||||
handles->clear();
|
||||
delete impl;
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
|
||||
} // namespace rocksdb
|
||||
103
db/db_impl_readonly.h
Normal file
103
db/db_impl_readonly.h
Normal file
@@ -0,0 +1,103 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2012 Facebook. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
#pragma once
|
||||
#include "db/db_impl.h"
|
||||
|
||||
#include <deque>
|
||||
#include <set>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include "db/dbformat.h"
|
||||
#include "db/log_writer.h"
|
||||
#include "db/snapshot.h"
|
||||
#include "rocksdb/db.h"
|
||||
#include "rocksdb/env.h"
|
||||
#include "port/port.h"
|
||||
#include "util/stats_logger.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
class DBImplReadOnly : public DBImpl {
|
||||
public:
|
||||
DBImplReadOnly(const DBOptions& options, const std::string& dbname);
|
||||
virtual ~DBImplReadOnly();
|
||||
|
||||
// Implementations of the DB interface
|
||||
using DB::Get;
|
||||
virtual Status Get(const ReadOptions& options,
|
||||
ColumnFamilyHandle* column_family, const Slice& key,
|
||||
std::string* value);
|
||||
|
||||
// TODO: Implement ReadOnly MultiGet?
|
||||
|
||||
using DBImpl::NewIterator;
|
||||
virtual Iterator* NewIterator(const ReadOptions&,
|
||||
ColumnFamilyHandle* column_family);
|
||||
|
||||
virtual Status NewIterators(
|
||||
const ReadOptions& options,
|
||||
const std::vector<ColumnFamilyHandle*>& column_family,
|
||||
std::vector<Iterator*>* iterators) {
|
||||
// TODO
|
||||
return Status::NotSupported("Not supported yet.");
|
||||
}
|
||||
|
||||
using DBImpl::Put;
|
||||
virtual Status Put(const WriteOptions& options,
|
||||
ColumnFamilyHandle* column_family, const Slice& key,
|
||||
const Slice& value) {
|
||||
return Status::NotSupported("Not supported operation in read only mode.");
|
||||
}
|
||||
using DBImpl::Merge;
|
||||
virtual Status Merge(const WriteOptions& options,
|
||||
ColumnFamilyHandle* column_family, const Slice& key,
|
||||
const Slice& value) {
|
||||
return Status::NotSupported("Not supported operation in read only mode.");
|
||||
}
|
||||
using DBImpl::Delete;
|
||||
virtual Status Delete(const WriteOptions& options,
|
||||
ColumnFamilyHandle* column_family, const Slice& key) {
|
||||
return Status::NotSupported("Not supported operation in read only mode.");
|
||||
}
|
||||
virtual Status Write(const WriteOptions& options, WriteBatch* updates) {
|
||||
return Status::NotSupported("Not supported operation in read only mode.");
|
||||
}
|
||||
using DBImpl::CompactRange;
|
||||
virtual Status CompactRange(ColumnFamilyHandle* column_family,
|
||||
const Slice* begin, const Slice* end,
|
||||
bool reduce_level = false,
|
||||
int target_level = -1) {
|
||||
return Status::NotSupported("Not supported operation in read only mode.");
|
||||
}
|
||||
virtual Status DisableFileDeletions() {
|
||||
return Status::NotSupported("Not supported operation in read only mode.");
|
||||
}
|
||||
virtual Status EnableFileDeletions(bool force) {
|
||||
return Status::NotSupported("Not supported operation in read only mode.");
|
||||
}
|
||||
virtual Status GetLiveFiles(std::vector<std::string>&,
|
||||
uint64_t* manifest_file_size,
|
||||
bool flush_memtable = true) {
|
||||
return Status::NotSupported("Not supported operation in read only mode.");
|
||||
}
|
||||
using DBImpl::Flush;
|
||||
virtual Status Flush(const FlushOptions& options,
|
||||
ColumnFamilyHandle* column_family) {
|
||||
return Status::NotSupported("Not supported operation in read only mode.");
|
||||
}
|
||||
|
||||
private:
|
||||
friend class DB;
|
||||
|
||||
// No copying allowed
|
||||
DBImplReadOnly(const DBImplReadOnly&);
|
||||
void operator=(const DBImplReadOnly&);
|
||||
};
|
||||
}
|
||||
517
db/db_iter.cc
Normal file
517
db/db_iter.cc
Normal file
@@ -0,0 +1,517 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include "db/db_iter.h"
|
||||
#include <stdexcept>
|
||||
#include <deque>
|
||||
|
||||
#include "db/filename.h"
|
||||
#include "db/dbformat.h"
|
||||
#include "rocksdb/env.h"
|
||||
#include "rocksdb/options.h"
|
||||
#include "rocksdb/iterator.h"
|
||||
#include "rocksdb/merge_operator.h"
|
||||
#include "port/port.h"
|
||||
#include "util/arena.h"
|
||||
#include "util/logging.h"
|
||||
#include "util/mutexlock.h"
|
||||
#include "util/perf_context_imp.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
#if 0
|
||||
static void DumpInternalIter(Iterator* iter) {
|
||||
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
|
||||
ParsedInternalKey k;
|
||||
if (!ParseInternalKey(iter->key(), &k)) {
|
||||
fprintf(stderr, "Corrupt '%s'\n", EscapeString(iter->key()).c_str());
|
||||
} else {
|
||||
fprintf(stderr, "@ '%s'\n", k.DebugString().c_str());
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
// Memtables and sstables that make the DB representation contain
|
||||
// (userkey,seq,type) => uservalue entries. DBIter
|
||||
// combines multiple entries for the same userkey found in the DB
|
||||
// representation into a single entry while accounting for sequence
|
||||
// numbers, deletion markers, overwrites, etc.
|
||||
class DBIter: public Iterator {
|
||||
public:
|
||||
// The following is grossly complicated. TODO: clean it up
|
||||
// Which direction is the iterator currently moving?
|
||||
// (1) When moving forward, the internal iterator is positioned at
|
||||
// the exact entry that yields this->key(), this->value()
|
||||
// (2) When moving backwards, the internal iterator is positioned
|
||||
// just before all entries whose user key == this->key().
|
||||
enum Direction {
|
||||
kForward,
|
||||
kReverse
|
||||
};
|
||||
|
||||
DBIter(Env* env, const Options& options, const Comparator* cmp,
|
||||
Iterator* iter, SequenceNumber s, bool arena_mode)
|
||||
: arena_mode_(arena_mode),
|
||||
env_(env),
|
||||
logger_(options.info_log.get()),
|
||||
user_comparator_(cmp),
|
||||
user_merge_operator_(options.merge_operator.get()),
|
||||
iter_(iter),
|
||||
sequence_(s),
|
||||
direction_(kForward),
|
||||
valid_(false),
|
||||
current_entry_is_merged_(false),
|
||||
statistics_(options.statistics.get()) {
|
||||
RecordTick(statistics_, NO_ITERATORS, 1);
|
||||
max_skip_ = options.max_sequential_skip_in_iterations;
|
||||
}
|
||||
virtual ~DBIter() {
|
||||
RecordTick(statistics_, NO_ITERATORS, -1);
|
||||
if (!arena_mode_) {
|
||||
delete iter_;
|
||||
} else {
|
||||
iter_->~Iterator();
|
||||
}
|
||||
}
|
||||
virtual void SetIter(Iterator* iter) {
|
||||
assert(iter_ == nullptr);
|
||||
iter_ = iter;
|
||||
}
|
||||
virtual bool Valid() const { return valid_; }
|
||||
virtual Slice key() const {
|
||||
assert(valid_);
|
||||
return saved_key_.GetKey();
|
||||
}
|
||||
virtual Slice value() const {
|
||||
assert(valid_);
|
||||
return (direction_ == kForward && !current_entry_is_merged_) ?
|
||||
iter_->value() : saved_value_;
|
||||
}
|
||||
virtual Status status() const {
|
||||
if (status_.ok()) {
|
||||
return iter_->status();
|
||||
} else {
|
||||
return status_;
|
||||
}
|
||||
}
|
||||
|
||||
virtual void Next();
|
||||
virtual void Prev();
|
||||
virtual void Seek(const Slice& target);
|
||||
virtual void SeekToFirst();
|
||||
virtual void SeekToLast();
|
||||
|
||||
private:
|
||||
inline void FindNextUserEntry(bool skipping);
|
||||
void FindNextUserEntryInternal(bool skipping);
|
||||
void FindPrevUserEntry();
|
||||
bool ParseKey(ParsedInternalKey* key);
|
||||
void MergeValuesNewToOld();
|
||||
|
||||
inline void ClearSavedValue() {
|
||||
if (saved_value_.capacity() > 1048576) {
|
||||
std::string empty;
|
||||
swap(empty, saved_value_);
|
||||
} else {
|
||||
saved_value_.clear();
|
||||
}
|
||||
}
|
||||
|
||||
bool arena_mode_;
|
||||
Env* const env_;
|
||||
Logger* logger_;
|
||||
const Comparator* const user_comparator_;
|
||||
const MergeOperator* const user_merge_operator_;
|
||||
Iterator* iter_;
|
||||
SequenceNumber const sequence_;
|
||||
|
||||
Status status_;
|
||||
IterKey saved_key_; // == current key when direction_==kReverse
|
||||
std::string saved_value_; // == current raw value when direction_==kReverse
|
||||
Direction direction_;
|
||||
bool valid_;
|
||||
bool current_entry_is_merged_;
|
||||
Statistics* statistics_;
|
||||
uint64_t max_skip_;
|
||||
|
||||
// No copying allowed
|
||||
DBIter(const DBIter&);
|
||||
void operator=(const DBIter&);
|
||||
};
|
||||
|
||||
inline bool DBIter::ParseKey(ParsedInternalKey* ikey) {
|
||||
if (!ParseInternalKey(iter_->key(), ikey)) {
|
||||
status_ = Status::Corruption("corrupted internal key in DBIter");
|
||||
Log(logger_, "corrupted internal key in DBIter: %s",
|
||||
iter_->key().ToString(true).c_str());
|
||||
return false;
|
||||
} else {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
void DBIter::Next() {
|
||||
assert(valid_);
|
||||
|
||||
if (direction_ == kReverse) { // Switch directions?
|
||||
direction_ = kForward;
|
||||
// iter_ is pointing just before the entries for this->key(),
|
||||
// so advance into the range of entries for this->key() and then
|
||||
// use the normal skipping code below.
|
||||
if (!iter_->Valid()) {
|
||||
iter_->SeekToFirst();
|
||||
} else {
|
||||
iter_->Next();
|
||||
}
|
||||
if (!iter_->Valid()) {
|
||||
valid_ = false;
|
||||
saved_key_.Clear();
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
// If the current value is merged, we might already hit end of iter_
|
||||
if (!iter_->Valid()) {
|
||||
valid_ = false;
|
||||
return;
|
||||
}
|
||||
FindNextUserEntry(true /* skipping the current user key */);
|
||||
}
|
||||
|
||||
|
||||
// PRE: saved_key_ has the current user key if skipping
|
||||
// POST: saved_key_ should have the next user key if valid_,
|
||||
// if the current entry is a result of merge
|
||||
// current_entry_is_merged_ => true
|
||||
// saved_value_ => the merged value
|
||||
//
|
||||
// NOTE: In between, saved_key_ can point to a user key that has
|
||||
// a delete marker
|
||||
inline void DBIter::FindNextUserEntry(bool skipping) {
|
||||
PERF_TIMER_AUTO(find_next_user_entry_time);
|
||||
FindNextUserEntryInternal(skipping);
|
||||
PERF_TIMER_STOP(find_next_user_entry_time);
|
||||
}
|
||||
|
||||
// Actual implementation of DBIter::FindNextUserEntry()
|
||||
void DBIter::FindNextUserEntryInternal(bool skipping) {
|
||||
// Loop until we hit an acceptable entry to yield
|
||||
assert(iter_->Valid());
|
||||
assert(direction_ == kForward);
|
||||
current_entry_is_merged_ = false;
|
||||
uint64_t num_skipped = 0;
|
||||
do {
|
||||
ParsedInternalKey ikey;
|
||||
if (ParseKey(&ikey) && ikey.sequence <= sequence_) {
|
||||
if (skipping &&
|
||||
user_comparator_->Compare(ikey.user_key, saved_key_.GetKey()) <= 0) {
|
||||
num_skipped++; // skip this entry
|
||||
PERF_COUNTER_ADD(internal_key_skipped_count, 1);
|
||||
} else {
|
||||
skipping = false;
|
||||
switch (ikey.type) {
|
||||
case kTypeDeletion:
|
||||
// Arrange to skip all upcoming entries for this key since
|
||||
// they are hidden by this deletion.
|
||||
saved_key_.SetKey(ikey.user_key);
|
||||
skipping = true;
|
||||
num_skipped = 0;
|
||||
PERF_COUNTER_ADD(internal_delete_skipped_count, 1);
|
||||
break;
|
||||
case kTypeValue:
|
||||
valid_ = true;
|
||||
saved_key_.SetKey(ikey.user_key);
|
||||
return;
|
||||
case kTypeMerge:
|
||||
// By now, we are sure the current ikey is going to yield a value
|
||||
saved_key_.SetKey(ikey.user_key);
|
||||
current_entry_is_merged_ = true;
|
||||
valid_ = true;
|
||||
MergeValuesNewToOld(); // Go to a different state machine
|
||||
return;
|
||||
default:
|
||||
assert(false);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
// If we have sequentially iterated via numerous keys and still not
|
||||
// found the next user-key, then it is better to seek so that we can
|
||||
// avoid too many key comparisons. We seek to the last occurence of
|
||||
// our current key by looking for sequence number 0.
|
||||
if (skipping && num_skipped > max_skip_) {
|
||||
num_skipped = 0;
|
||||
std::string last_key;
|
||||
AppendInternalKey(&last_key, ParsedInternalKey(saved_key_.GetKey(), 0,
|
||||
kValueTypeForSeek));
|
||||
iter_->Seek(last_key);
|
||||
RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);
|
||||
} else {
|
||||
iter_->Next();
|
||||
}
|
||||
} while (iter_->Valid());
|
||||
valid_ = false;
|
||||
}
|
||||
|
||||
// Merge values of the same user key starting from the current iter_ position
|
||||
// Scan from the newer entries to older entries.
|
||||
// PRE: iter_->key() points to the first merge type entry
|
||||
// saved_key_ stores the user key
|
||||
// POST: saved_value_ has the merged value for the user key
|
||||
// iter_ points to the next entry (or invalid)
|
||||
void DBIter::MergeValuesNewToOld() {
|
||||
if (!user_merge_operator_) {
|
||||
Log(logger_, "Options::merge_operator is null.");
|
||||
throw std::logic_error("DBIter::MergeValuesNewToOld() with"
|
||||
" Options::merge_operator null");
|
||||
}
|
||||
|
||||
// Start the merge process by pushing the first operand
|
||||
std::deque<std::string> operands;
|
||||
operands.push_front(iter_->value().ToString());
|
||||
|
||||
std::string merge_result; // Temporary string to hold merge result later
|
||||
ParsedInternalKey ikey;
|
||||
for (iter_->Next(); iter_->Valid(); iter_->Next()) {
|
||||
if (!ParseKey(&ikey)) {
|
||||
// skip corrupted key
|
||||
continue;
|
||||
}
|
||||
|
||||
if (user_comparator_->Compare(ikey.user_key, saved_key_.GetKey()) != 0) {
|
||||
// hit the next user key, stop right here
|
||||
break;
|
||||
}
|
||||
|
||||
if (kTypeDeletion == ikey.type) {
|
||||
// hit a delete with the same user key, stop right here
|
||||
// iter_ is positioned after delete
|
||||
iter_->Next();
|
||||
break;
|
||||
}
|
||||
|
||||
if (kTypeValue == ikey.type) {
|
||||
// hit a put, merge the put value with operands and store the
|
||||
// final result in saved_value_. We are done!
|
||||
// ignore corruption if there is any.
|
||||
const Slice value = iter_->value();
|
||||
user_merge_operator_->FullMerge(ikey.user_key, &value, operands,
|
||||
&saved_value_, logger_);
|
||||
// iter_ is positioned after put
|
||||
iter_->Next();
|
||||
return;
|
||||
}
|
||||
|
||||
if (kTypeMerge == ikey.type) {
|
||||
// hit a merge, add the value as an operand and run associative merge.
|
||||
// when complete, add result to operands and continue.
|
||||
const Slice& value = iter_->value();
|
||||
operands.push_front(value.ToString());
|
||||
}
|
||||
}
|
||||
|
||||
// we either exhausted all internal keys under this user key, or hit
|
||||
// a deletion marker.
|
||||
// feed null as the existing value to the merge operator, such that
|
||||
// client can differentiate this scenario and do things accordingly.
|
||||
user_merge_operator_->FullMerge(saved_key_.GetKey(), nullptr, operands,
|
||||
&saved_value_, logger_);
|
||||
}
|
||||
|
||||
void DBIter::Prev() {
|
||||
assert(valid_);
|
||||
|
||||
// Throw an exception now if merge_operator is provided
|
||||
// TODO: support backward iteration
|
||||
if (user_merge_operator_) {
|
||||
Log(logger_, "Prev not supported yet if merge_operator is provided");
|
||||
throw std::logic_error("DBIter::Prev backward iteration not supported"
|
||||
" if merge_operator is provided");
|
||||
}
|
||||
|
||||
if (direction_ == kForward) { // Switch directions?
|
||||
// iter_ is pointing at the current entry. Scan backwards until
|
||||
// the key changes so we can use the normal reverse scanning code.
|
||||
assert(iter_->Valid()); // Otherwise valid_ would have been false
|
||||
saved_key_.SetKey(ExtractUserKey(iter_->key()));
|
||||
while (true) {
|
||||
iter_->Prev();
|
||||
if (!iter_->Valid()) {
|
||||
valid_ = false;
|
||||
saved_key_.Clear();
|
||||
ClearSavedValue();
|
||||
return;
|
||||
}
|
||||
if (user_comparator_->Compare(ExtractUserKey(iter_->key()),
|
||||
saved_key_.GetKey()) < 0) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
direction_ = kReverse;
|
||||
}
|
||||
|
||||
FindPrevUserEntry();
|
||||
}
|
||||
|
||||
void DBIter::FindPrevUserEntry() {
|
||||
assert(direction_ == kReverse);
|
||||
uint64_t num_skipped = 0;
|
||||
|
||||
ValueType value_type = kTypeDeletion;
|
||||
bool saved_key_valid = true;
|
||||
if (iter_->Valid()) {
|
||||
do {
|
||||
ParsedInternalKey ikey;
|
||||
if (ParseKey(&ikey) && ikey.sequence <= sequence_) {
|
||||
if ((value_type != kTypeDeletion) &&
|
||||
user_comparator_->Compare(ikey.user_key, saved_key_.GetKey()) < 0) {
|
||||
// We encountered a non-deleted value in entries for previous keys,
|
||||
break;
|
||||
}
|
||||
value_type = ikey.type;
|
||||
if (value_type == kTypeDeletion) {
|
||||
saved_key_.Clear();
|
||||
ClearSavedValue();
|
||||
saved_key_valid = false;
|
||||
} else {
|
||||
Slice raw_value = iter_->value();
|
||||
if (saved_value_.capacity() > raw_value.size() + 1048576) {
|
||||
std::string empty;
|
||||
swap(empty, saved_value_);
|
||||
}
|
||||
saved_key_.SetKey(ExtractUserKey(iter_->key()));
|
||||
saved_value_.assign(raw_value.data(), raw_value.size());
|
||||
}
|
||||
} else {
|
||||
// In the case of ikey.sequence > sequence_, we might have already
|
||||
// iterated to a different user key.
|
||||
saved_key_valid = false;
|
||||
}
|
||||
num_skipped++;
|
||||
// If we have sequentially iterated via numerous keys and still not
|
||||
// found the prev user-key, then it is better to seek so that we can
|
||||
// avoid too many key comparisons. We seek to the first occurence of
|
||||
// our current key by looking for max sequence number.
|
||||
if (saved_key_valid && num_skipped > max_skip_) {
|
||||
num_skipped = 0;
|
||||
std::string last_key;
|
||||
AppendInternalKey(&last_key, ParsedInternalKey(saved_key_.GetKey(),
|
||||
kMaxSequenceNumber,
|
||||
kValueTypeForSeek));
|
||||
iter_->Seek(last_key);
|
||||
RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);
|
||||
} else {
|
||||
iter_->Prev();
|
||||
}
|
||||
} while (iter_->Valid());
|
||||
}
|
||||
|
||||
if (value_type == kTypeDeletion) {
|
||||
// End
|
||||
valid_ = false;
|
||||
saved_key_.Clear();
|
||||
ClearSavedValue();
|
||||
direction_ = kForward;
|
||||
} else {
|
||||
valid_ = true;
|
||||
}
|
||||
}
|
||||
|
||||
void DBIter::Seek(const Slice& target) {
|
||||
saved_key_.Clear();
|
||||
// now savved_key is used to store internal key.
|
||||
saved_key_.SetInternalKey(target, sequence_);
|
||||
PERF_TIMER_AUTO(seek_internal_seek_time);
|
||||
iter_->Seek(saved_key_.GetKey());
|
||||
PERF_TIMER_STOP(seek_internal_seek_time);
|
||||
if (iter_->Valid()) {
|
||||
direction_ = kForward;
|
||||
ClearSavedValue();
|
||||
FindNextUserEntry(false /*not skipping */);
|
||||
} else {
|
||||
valid_ = false;
|
||||
}
|
||||
}
|
||||
|
||||
void DBIter::SeekToFirst() {
|
||||
direction_ = kForward;
|
||||
ClearSavedValue();
|
||||
PERF_TIMER_AUTO(seek_internal_seek_time);
|
||||
iter_->SeekToFirst();
|
||||
PERF_TIMER_STOP(seek_internal_seek_time);
|
||||
if (iter_->Valid()) {
|
||||
FindNextUserEntry(false /* not skipping */);
|
||||
} else {
|
||||
valid_ = false;
|
||||
}
|
||||
}
|
||||
|
||||
void DBIter::SeekToLast() {
|
||||
// Throw an exception for now if merge_operator is provided
|
||||
// TODO: support backward iteration
|
||||
if (user_merge_operator_) {
|
||||
Log(logger_, "SeekToLast not supported yet if merge_operator is provided");
|
||||
throw std::logic_error("DBIter::SeekToLast: backward iteration not"
|
||||
" supported if merge_operator is provided");
|
||||
}
|
||||
|
||||
direction_ = kReverse;
|
||||
ClearSavedValue();
|
||||
PERF_TIMER_AUTO(seek_internal_seek_time);
|
||||
iter_->SeekToLast();
|
||||
PERF_TIMER_STOP(seek_internal_seek_time);
|
||||
FindPrevUserEntry();
|
||||
}
|
||||
|
||||
Iterator* NewDBIterator(Env* env, const Options& options,
|
||||
const Comparator* user_key_comparator,
|
||||
Iterator* internal_iter,
|
||||
const SequenceNumber& sequence) {
|
||||
return new DBIter(env, options, user_key_comparator, internal_iter, sequence,
|
||||
false);
|
||||
}
|
||||
|
||||
ArenaWrappedDBIter::~ArenaWrappedDBIter() { db_iter_->~DBIter(); }
|
||||
|
||||
void ArenaWrappedDBIter::SetDBIter(DBIter* iter) { db_iter_ = iter; }
|
||||
|
||||
void ArenaWrappedDBIter::SetIterUnderDBIter(Iterator* iter) {
|
||||
static_cast<DBIter*>(db_iter_)->SetIter(iter);
|
||||
}
|
||||
|
||||
inline bool ArenaWrappedDBIter::Valid() const { return db_iter_->Valid(); }
|
||||
inline void ArenaWrappedDBIter::SeekToFirst() { db_iter_->SeekToFirst(); }
|
||||
inline void ArenaWrappedDBIter::SeekToLast() { db_iter_->SeekToLast(); }
|
||||
inline void ArenaWrappedDBIter::Seek(const Slice& target) {
|
||||
db_iter_->Seek(target);
|
||||
}
|
||||
inline void ArenaWrappedDBIter::Next() { db_iter_->Next(); }
|
||||
inline void ArenaWrappedDBIter::Prev() { db_iter_->Prev(); }
|
||||
inline Slice ArenaWrappedDBIter::key() const { return db_iter_->key(); }
|
||||
inline Slice ArenaWrappedDBIter::value() const { return db_iter_->value(); }
|
||||
inline Status ArenaWrappedDBIter::status() const { return db_iter_->status(); }
|
||||
void ArenaWrappedDBIter::RegisterCleanup(CleanupFunction function, void* arg1,
|
||||
void* arg2) {
|
||||
db_iter_->RegisterCleanup(function, arg1, arg2);
|
||||
}
|
||||
|
||||
ArenaWrappedDBIter* NewArenaWrappedDbIterator(
|
||||
Env* env, const Options& options, const Comparator* user_key_comparator,
|
||||
const SequenceNumber& sequence) {
|
||||
ArenaWrappedDBIter* iter = new ArenaWrappedDBIter();
|
||||
Arena* arena = iter->GetArena();
|
||||
auto mem = arena->AllocateAligned(sizeof(DBIter));
|
||||
DBIter* db_iter = new (mem)
|
||||
DBIter(env, options, user_key_comparator, nullptr, sequence, true);
|
||||
iter->SetDBIter(db_iter);
|
||||
return iter;
|
||||
}
|
||||
|
||||
} // namespace rocksdb
|
||||
73
db/db_iter.h
Normal file
73
db/db_iter.h
Normal file
@@ -0,0 +1,73 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#pragma once
|
||||
#include <stdint.h>
|
||||
#include "rocksdb/db.h"
|
||||
#include "db/dbformat.h"
|
||||
#include "util/arena.h"
|
||||
#include "util/autovector.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
class Arena;
|
||||
class DBIter;
|
||||
|
||||
// Return a new iterator that converts internal keys (yielded by
|
||||
// "*internal_iter") that were live at the specified "sequence" number
|
||||
// into appropriate user keys.
|
||||
extern Iterator* NewDBIterator(
|
||||
Env* env,
|
||||
const Options& options,
|
||||
const Comparator *user_key_comparator,
|
||||
Iterator* internal_iter,
|
||||
const SequenceNumber& sequence);
|
||||
|
||||
// A wrapper iterator which wraps DB Iterator and the arena, with which the DB
|
||||
// iterator is supposed be allocated. This class is used as an entry point of
|
||||
// a iterator hierarchy whose memory can be allocated inline. In that way,
|
||||
// accessing the iterator tree can be more cache friendly. It is also faster
|
||||
// to allocate.
|
||||
class ArenaWrappedDBIter : public Iterator {
|
||||
public:
|
||||
virtual ~ArenaWrappedDBIter();
|
||||
|
||||
// Get the arena to be used to allocate memory for DBIter to be wrapped,
|
||||
// as well as child iterators in it.
|
||||
virtual Arena* GetArena() { return &arena_; }
|
||||
|
||||
// Set the DB Iterator to be wrapped
|
||||
|
||||
virtual void SetDBIter(DBIter* iter);
|
||||
|
||||
// Set the internal iterator wrapped inside the DB Iterator. Usually it is
|
||||
// a merging iterator.
|
||||
virtual void SetIterUnderDBIter(Iterator* iter);
|
||||
virtual bool Valid() const override;
|
||||
virtual void SeekToFirst() override;
|
||||
virtual void SeekToLast() override;
|
||||
virtual void Seek(const Slice& target) override;
|
||||
virtual void Next() override;
|
||||
virtual void Prev() override;
|
||||
virtual Slice key() const override;
|
||||
virtual Slice value() const override;
|
||||
virtual Status status() const override;
|
||||
void RegisterCleanup(CleanupFunction function, void* arg1, void* arg2);
|
||||
|
||||
private:
|
||||
DBIter* db_iter_;
|
||||
Arena arena_;
|
||||
};
|
||||
|
||||
// Generate the arena wrapped iterator class.
|
||||
extern ArenaWrappedDBIter* NewArenaWrappedDbIterator(
|
||||
Env* env, const Options& options, const Comparator* user_key_comparator,
|
||||
const SequenceNumber& sequence);
|
||||
|
||||
} // namespace rocksdb
|
||||
95
db/db_stats_logger.cc
Normal file
95
db/db_stats_logger.cc
Normal file
@@ -0,0 +1,95 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include "db/db_impl.h"
|
||||
#include <string>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include "db/version_set.h"
|
||||
#include "rocksdb/db.h"
|
||||
#include "rocksdb/env.h"
|
||||
#include "port/port.h"
|
||||
#include "util/mutexlock.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
void DBImpl::MaybeScheduleLogDBDeployStats() {
|
||||
// we did say maybe
|
||||
#ifndef ROCKSDB_LITE
|
||||
// There is a lock in the actual logger.
|
||||
if (!logger_ || options_.db_stats_log_interval < 0
|
||||
|| host_name_.empty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
if(bg_logstats_scheduled_ || shutting_down_.Acquire_Load()) {
|
||||
// Already scheduled
|
||||
} else {
|
||||
int64_t current_ts = 0;
|
||||
Status st = env_->GetCurrentTime(¤t_ts);
|
||||
if (!st.ok()) {
|
||||
return;
|
||||
}
|
||||
if ((current_ts - last_log_ts) < options_.db_stats_log_interval) {
|
||||
return;
|
||||
}
|
||||
last_log_ts = current_ts;
|
||||
bg_logstats_scheduled_ = true;
|
||||
env_->Schedule(&DBImpl::BGLogDBDeployStats, this);
|
||||
}
|
||||
}
|
||||
|
||||
void DBImpl::BGLogDBDeployStats(void* db) {
|
||||
DBImpl* db_inst = reinterpret_cast<DBImpl*>(db);
|
||||
db_inst->LogDBDeployStats();
|
||||
}
|
||||
|
||||
void DBImpl::LogDBDeployStats() {
|
||||
mutex_.Lock();
|
||||
|
||||
if (shutting_down_.Acquire_Load()) {
|
||||
bg_logstats_scheduled_ = false;
|
||||
bg_cv_.SignalAll();
|
||||
mutex_.Unlock();
|
||||
return;
|
||||
}
|
||||
|
||||
char tmp_ver[100];
|
||||
sprintf(tmp_ver, "%d.%d", kMajorVersion, kMinorVersion);
|
||||
std::string version_info(tmp_ver);
|
||||
|
||||
uint64_t file_total_size = 0;
|
||||
uint32_t file_total_num = 0;
|
||||
Version* current = default_cf_handle_->cfd()->current();
|
||||
for (int i = 0; i < current->NumberLevels(); i++) {
|
||||
file_total_num += current->NumLevelFiles(i);
|
||||
file_total_size += current->NumLevelBytes(i);
|
||||
}
|
||||
|
||||
Version::LevelSummaryStorage scratch;
|
||||
const char* file_num_summary = current->LevelSummary(&scratch);
|
||||
std::string file_num_per_level(file_num_summary);
|
||||
std::string data_size_per_level(file_num_summary);
|
||||
|
||||
mutex_.Unlock();
|
||||
|
||||
int64_t unix_ts;
|
||||
env_->GetCurrentTime(&unix_ts);
|
||||
|
||||
logger_->Log_Deploy_Stats(version_info, host_name_,
|
||||
db_absolute_path_, file_total_size, file_total_num, file_num_per_level,
|
||||
data_size_per_level, unix_ts);
|
||||
|
||||
mutex_.Lock();
|
||||
bg_logstats_scheduled_ = false;
|
||||
bg_cv_.SignalAll();
|
||||
mutex_.Unlock();
|
||||
#endif
|
||||
}
|
||||
}
|
||||
6852
db/db_test.cc
Normal file
6852
db/db_test.cc
Normal file
File diff suppressed because it is too large
Load Diff
169
db/dbformat.cc
Normal file
169
db/dbformat.cc
Normal file
@@ -0,0 +1,169 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
#include "db/dbformat.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include "port/port.h"
|
||||
#include "util/coding.h"
|
||||
#include "util/perf_context_imp.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
uint64_t PackSequenceAndType(uint64_t seq, ValueType t) {
|
||||
assert(seq <= kMaxSequenceNumber);
|
||||
assert(t <= kValueTypeForSeek);
|
||||
return (seq << 8) | t;
|
||||
}
|
||||
|
||||
void AppendInternalKey(std::string* result, const ParsedInternalKey& key) {
|
||||
result->append(key.user_key.data(), key.user_key.size());
|
||||
PutFixed64(result, PackSequenceAndType(key.sequence, key.type));
|
||||
}
|
||||
|
||||
std::string ParsedInternalKey::DebugString(bool hex) const {
|
||||
char buf[50];
|
||||
snprintf(buf, sizeof(buf), "' @ %llu : %d",
|
||||
(unsigned long long) sequence,
|
||||
int(type));
|
||||
std::string result = "'";
|
||||
result += user_key.ToString(hex);
|
||||
result += buf;
|
||||
return result;
|
||||
}
|
||||
|
||||
std::string InternalKey::DebugString(bool hex) const {
|
||||
std::string result;
|
||||
ParsedInternalKey parsed;
|
||||
if (ParseInternalKey(rep_, &parsed)) {
|
||||
result = parsed.DebugString(hex);
|
||||
} else {
|
||||
result = "(bad)";
|
||||
result.append(EscapeString(rep_));
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
const char* InternalKeyComparator::Name() const {
|
||||
return name_.c_str();
|
||||
}
|
||||
|
||||
int InternalKeyComparator::Compare(const Slice& akey, const Slice& bkey) const {
|
||||
// Order by:
|
||||
// increasing user key (according to user-supplied comparator)
|
||||
// decreasing sequence number
|
||||
// decreasing type (though sequence# should be enough to disambiguate)
|
||||
int r = user_comparator_->Compare(ExtractUserKey(akey), ExtractUserKey(bkey));
|
||||
PERF_COUNTER_ADD(user_key_comparison_count, 1);
|
||||
if (r == 0) {
|
||||
const uint64_t anum = DecodeFixed64(akey.data() + akey.size() - 8);
|
||||
const uint64_t bnum = DecodeFixed64(bkey.data() + bkey.size() - 8);
|
||||
if (anum > bnum) {
|
||||
r = -1;
|
||||
} else if (anum < bnum) {
|
||||
r = +1;
|
||||
}
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
||||
int InternalKeyComparator::Compare(const ParsedInternalKey& a,
|
||||
const ParsedInternalKey& b) const {
|
||||
// Order by:
|
||||
// increasing user key (according to user-supplied comparator)
|
||||
// decreasing sequence number
|
||||
// decreasing type (though sequence# should be enough to disambiguate)
|
||||
int r = user_comparator_->Compare(a.user_key, b.user_key);
|
||||
PERF_COUNTER_ADD(user_key_comparison_count, 1);
|
||||
if (r == 0) {
|
||||
if (a.sequence > b.sequence) {
|
||||
r = -1;
|
||||
} else if (a.sequence < b.sequence) {
|
||||
r = +1;
|
||||
} else if (a.type > b.type) {
|
||||
r = -1;
|
||||
} else if (a.type < b.type) {
|
||||
r = +1;
|
||||
}
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
||||
void InternalKeyComparator::FindShortestSeparator(
|
||||
std::string* start,
|
||||
const Slice& limit) const {
|
||||
// Attempt to shorten the user portion of the key
|
||||
Slice user_start = ExtractUserKey(*start);
|
||||
Slice user_limit = ExtractUserKey(limit);
|
||||
std::string tmp(user_start.data(), user_start.size());
|
||||
user_comparator_->FindShortestSeparator(&tmp, user_limit);
|
||||
if (tmp.size() < user_start.size() &&
|
||||
user_comparator_->Compare(user_start, tmp) < 0) {
|
||||
// User key has become shorter physically, but larger logically.
|
||||
// Tack on the earliest possible number to the shortened user key.
|
||||
PutFixed64(&tmp, PackSequenceAndType(kMaxSequenceNumber,kValueTypeForSeek));
|
||||
assert(this->Compare(*start, tmp) < 0);
|
||||
assert(this->Compare(tmp, limit) < 0);
|
||||
start->swap(tmp);
|
||||
}
|
||||
}
|
||||
|
||||
void InternalKeyComparator::FindShortSuccessor(std::string* key) const {
|
||||
Slice user_key = ExtractUserKey(*key);
|
||||
std::string tmp(user_key.data(), user_key.size());
|
||||
user_comparator_->FindShortSuccessor(&tmp);
|
||||
if (tmp.size() < user_key.size() &&
|
||||
user_comparator_->Compare(user_key, tmp) < 0) {
|
||||
// User key has become shorter physically, but larger logically.
|
||||
// Tack on the earliest possible number to the shortened user key.
|
||||
PutFixed64(&tmp, PackSequenceAndType(kMaxSequenceNumber,kValueTypeForSeek));
|
||||
assert(this->Compare(*key, tmp) < 0);
|
||||
key->swap(tmp);
|
||||
}
|
||||
}
|
||||
|
||||
const char* InternalFilterPolicy::Name() const {
|
||||
return user_policy_->Name();
|
||||
}
|
||||
|
||||
void InternalFilterPolicy::CreateFilter(const Slice* keys, int n,
|
||||
std::string* dst) const {
|
||||
// We rely on the fact that the code in table.cc does not mind us
|
||||
// adjusting keys[].
|
||||
Slice* mkey = const_cast<Slice*>(keys);
|
||||
for (int i = 0; i < n; i++) {
|
||||
mkey[i] = ExtractUserKey(keys[i]);
|
||||
// TODO(sanjay): Suppress dups?
|
||||
}
|
||||
user_policy_->CreateFilter(keys, n, dst);
|
||||
}
|
||||
|
||||
bool InternalFilterPolicy::KeyMayMatch(const Slice& key, const Slice& f) const {
|
||||
return user_policy_->KeyMayMatch(ExtractUserKey(key), f);
|
||||
}
|
||||
|
||||
LookupKey::LookupKey(const Slice& user_key, SequenceNumber s) {
|
||||
size_t usize = user_key.size();
|
||||
size_t needed = usize + 13; // A conservative estimate
|
||||
char* dst;
|
||||
if (needed <= sizeof(space_)) {
|
||||
dst = space_;
|
||||
} else {
|
||||
dst = new char[needed];
|
||||
}
|
||||
start_ = dst;
|
||||
dst = EncodeVarint32(dst, usize + 8);
|
||||
kstart_ = dst;
|
||||
memcpy(dst, user_key.data(), usize);
|
||||
dst += usize;
|
||||
EncodeFixed64(dst, PackSequenceAndType(s, kValueTypeForSeek));
|
||||
dst += 8;
|
||||
end_ = dst;
|
||||
}
|
||||
|
||||
} // namespace rocksdb
|
||||
345
db/dbformat.h
Normal file
345
db/dbformat.h
Normal file
@@ -0,0 +1,345 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#pragma once
|
||||
#include <stdio.h>
|
||||
#include "rocksdb/comparator.h"
|
||||
#include "rocksdb/db.h"
|
||||
#include "rocksdb/filter_policy.h"
|
||||
#include "rocksdb/slice.h"
|
||||
#include "rocksdb/slice_transform.h"
|
||||
#include "rocksdb/table.h"
|
||||
#include "rocksdb/types.h"
|
||||
#include "util/coding.h"
|
||||
#include "util/logging.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
class InternalKey;
|
||||
|
||||
// Value types encoded as the last component of internal keys.
|
||||
// DO NOT CHANGE THESE ENUM VALUES: they are embedded in the on-disk
|
||||
// data structures.
|
||||
// The highest bit of the value type needs to be reserved to SST tables
|
||||
// for them to do more flexible encoding.
|
||||
enum ValueType : unsigned char {
|
||||
kTypeDeletion = 0x0,
|
||||
kTypeValue = 0x1,
|
||||
kTypeMerge = 0x2,
|
||||
// Following types are used only in write ahead logs. They are not used in
|
||||
// memtables or sst files:
|
||||
kTypeLogData = 0x3,
|
||||
kTypeColumnFamilyDeletion = 0x4,
|
||||
kTypeColumnFamilyValue = 0x5,
|
||||
kTypeColumnFamilyMerge = 0x6,
|
||||
kMaxValue = 0x7F
|
||||
};
|
||||
|
||||
// kValueTypeForSeek defines the ValueType that should be passed when
|
||||
// constructing a ParsedInternalKey object for seeking to a particular
|
||||
// sequence number (since we sort sequence numbers in decreasing order
|
||||
// and the value type is embedded as the low 8 bits in the sequence
|
||||
// number in internal keys, we need to use the highest-numbered
|
||||
// ValueType, not the lowest).
|
||||
static const ValueType kValueTypeForSeek = kTypeMerge;
|
||||
|
||||
// We leave eight bits empty at the bottom so a type and sequence#
|
||||
// can be packed together into 64-bits.
|
||||
static const SequenceNumber kMaxSequenceNumber =
|
||||
((0x1ull << 56) - 1);
|
||||
|
||||
// Decoded form of an internal key: the user key plus the (sequence, type)
// pair that the encoded form packs into its trailing 8 bytes.
struct ParsedInternalKey {
  Slice user_key;           // Points into externally owned memory.
  SequenceNumber sequence;  // Write/snapshot sequence number.
  ValueType type;           // Kind of entry (value, deletion, merge, ...).

  ParsedInternalKey() { }  // Intentionally left uninitialized (for speed)
  ParsedInternalKey(const Slice& u, const SequenceNumber& seq, ValueType t)
      : user_key(u), sequence(seq), type(t) { }
  // Human-readable rendering; hex presumably switches the key bytes to a
  // hexadecimal form — confirm in dbformat.cc.
  std::string DebugString(bool hex = false) const;
};
|
||||
|
||||
// Return the length of the encoding of "key".
|
||||
inline size_t InternalKeyEncodingLength(const ParsedInternalKey& key) {
|
||||
return key.user_key.size() + 8;
|
||||
}
|
||||
|
||||
extern uint64_t PackSequenceAndType(uint64_t seq, ValueType t);
|
||||
|
||||
// Append the serialization of "key" to *result.
|
||||
extern void AppendInternalKey(std::string* result,
|
||||
const ParsedInternalKey& key);
|
||||
|
||||
// Attempt to parse an internal key from "internal_key". On success,
|
||||
// stores the parsed data in "*result", and returns true.
|
||||
//
|
||||
// On error, returns false, leaves "*result" in an undefined state.
|
||||
extern bool ParseInternalKey(const Slice& internal_key,
|
||||
ParsedInternalKey* result);
|
||||
|
||||
// Returns the user key portion of an internal key.
|
||||
// Returns the user-key prefix of an encoded internal key, i.e. everything
// before the trailing 8-byte (sequence, type) tag.  Requires size >= 8.
inline Slice ExtractUserKey(const Slice& internal_key) {
  assert(internal_key.size() >= 8);
  const size_t user_len = internal_key.size() - 8;
  return Slice(internal_key.data(), user_len);
}
|
||||
|
||||
// Pulls the ValueType out of the low byte of an internal key's trailing
// 8-byte tag.  Requires size >= 8.
inline ValueType ExtractValueType(const Slice& internal_key) {
  assert(internal_key.size() >= 8);
  const char* tag_ptr = internal_key.data() + internal_key.size() - 8;
  const uint64_t packed = DecodeFixed64(tag_ptr);
  return static_cast<ValueType>(packed & 0xff);
}
|
||||
|
||||
// A comparator for internal keys that uses a specified comparator for
|
||||
// the user key portion and breaks ties by decreasing sequence number.
|
||||
// A comparator for internal keys that uses a specified comparator for
// the user key portion and breaks ties by decreasing sequence number.
class InternalKeyComparator : public Comparator {
 private:
  const Comparator* user_comparator_;  // Not owned.
  // "rocksdb.InternalKeyComparator:" + the user comparator's name.
  std::string name_;
 public:
  explicit InternalKeyComparator(const Comparator* c) : user_comparator_(c),
    name_("rocksdb.InternalKeyComparator:" +
          std::string(user_comparator_->Name())) {
  }
  virtual ~InternalKeyComparator() {}

  // Comparator interface; implemented out of line.
  virtual const char* Name() const;
  virtual int Compare(const Slice& a, const Slice& b) const;
  virtual void FindShortestSeparator(
      std::string* start,
      const Slice& limit) const;
  virtual void FindShortSuccessor(std::string* key) const;

  // The wrapped user-key comparator (not owned).
  const Comparator* user_comparator() const { return user_comparator_; }

  // Convenience overloads for already-wrapped / already-parsed keys.
  int Compare(const InternalKey& a, const InternalKey& b) const;
  int Compare(const ParsedInternalKey& a, const ParsedInternalKey& b) const;
};
|
||||
|
||||
// Filter policy wrapper that converts from internal keys to user keys
|
||||
// Filter policy wrapper that converts from internal keys to user keys
// before delegating to the wrapped user-level policy.
class InternalFilterPolicy : public FilterPolicy {
 private:
  const FilterPolicy* const user_policy_;  // Not owned.
 public:
  explicit InternalFilterPolicy(const FilterPolicy* p) : user_policy_(p) { }
  virtual const char* Name() const;
  // NOTE(review): implementations presumably strip the 8-byte tag before
  // forwarding to user_policy_ — confirm in dbformat.cc.
  virtual void CreateFilter(const Slice* keys, int n, std::string* dst) const;
  virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const;
};
|
||||
|
||||
// Modules in this directory should keep internal keys wrapped inside
|
||||
// the following class instead of plain strings so that we do not
|
||||
// incorrectly use string comparisons instead of an InternalKeyComparator.
|
||||
// Modules in this directory should keep internal keys wrapped inside
// the following class instead of plain strings so that we do not
// incorrectly use string comparisons instead of an InternalKeyComparator.
class InternalKey {
 private:
  std::string rep_;  // Encoded form: user key followed by the 8-byte tag.
 public:
  InternalKey() { }  // Leave rep_ as empty to indicate it is invalid
  InternalKey(const Slice& user_key, SequenceNumber s, ValueType t) {
    AppendInternalKey(&rep_, ParsedInternalKey(user_key, s, t));
  }

  // True iff rep_ holds a well-formed internal key (see ParseInternalKey).
  bool Valid() const {
    ParsedInternalKey parsed;
    return ParseInternalKey(Slice(rep_), &parsed);
  }

  // Replace the contents with an already-encoded internal key.
  void DecodeFrom(const Slice& s) { rep_.assign(s.data(), s.size()); }
  // Encoded form; must not be called on an invalid (empty) key.
  Slice Encode() const {
    assert(!rep_.empty());
    return rep_;
  }

  // The user-key prefix (everything but the trailing 8-byte tag).
  Slice user_key() const { return ExtractUserKey(rep_); }

  // Re-encode from a parsed key, discarding the previous contents.
  void SetFrom(const ParsedInternalKey& p) {
    rep_.clear();
    AppendInternalKey(&rep_, p);
  }

  // Reset to the invalid (empty) state.
  void Clear() { rep_.clear(); }

  std::string DebugString(bool hex = false) const;
};
|
||||
|
||||
inline int InternalKeyComparator::Compare(
|
||||
const InternalKey& a, const InternalKey& b) const {
|
||||
return Compare(a.Encode(), b.Encode());
|
||||
}
|
||||
|
||||
// Decode an encoded internal key into *result.  Returns false (leaving
// *result partially filled / undefined) when the input is shorter than the
// 8-byte tag or carries a type byte above kValueTypeForSeek.
inline bool ParseInternalKey(const Slice& internal_key,
                             ParsedInternalKey* result) {
  const size_t n = internal_key.size();
  if (n < 8) return false;
  // The last 8 bytes pack (sequence << 8) | type.
  uint64_t num = DecodeFixed64(internal_key.data() + n - 8);
  unsigned char c = num & 0xff;
  result->sequence = num >> 8;
  result->type = static_cast<ValueType>(c);
  assert(result->type <= ValueType::kMaxValue);
  result->user_key = Slice(internal_key.data(), n - 8);
  // Types stored in memtables/SSTs never exceed kValueTypeForSeek
  // (kTypeMerge); larger values are rejected as malformed here.
  return (c <= static_cast<unsigned char>(kValueTypeForSeek));
}
|
||||
|
||||
// Update the sequence number in the internal key
|
||||
inline void UpdateInternalKey(char* internal_key,
|
||||
const size_t internal_key_size,
|
||||
uint64_t seq, ValueType t) {
|
||||
assert(internal_key_size >= 8);
|
||||
char* seqtype = internal_key + internal_key_size - 8;
|
||||
uint64_t newval = (seq << 8) | t;
|
||||
EncodeFixed64(seqtype, newval);
|
||||
}
|
||||
|
||||
// Get the sequence number from the internal key
|
||||
inline uint64_t GetInternalKeySeqno(const Slice& internal_key) {
|
||||
const size_t n = internal_key.size();
|
||||
assert(n >= 8);
|
||||
uint64_t num = DecodeFixed64(internal_key.data() + n - 8);
|
||||
return num >> 8;
|
||||
}
|
||||
|
||||
|
||||
// A helper class useful for DBImpl::Get()
|
||||
// A helper class useful for DBImpl::Get().  Encodes "user_key @ sequence"
// once into a single buffer so memtable lookups and internal-iterator
// seeks can share the same bytes.
class LookupKey {
 public:
  // Initialize *this for looking up user_key at a snapshot with
  // the specified sequence number.
  LookupKey(const Slice& user_key, SequenceNumber sequence);

  ~LookupKey();

  // Return a key suitable for lookup in a MemTable
  // (length-prefixed internal key).
  Slice memtable_key() const { return Slice(start_, end_ - start_); }

  // Return an internal key (suitable for passing to an internal iterator)
  Slice internal_key() const { return Slice(kstart_, end_ - kstart_); }

  // Return the user key (internal key minus the trailing 8-byte tag).
  Slice user_key() const { return Slice(kstart_, end_ - kstart_ - 8); }

 private:
  // We construct a char array of the form:
  //    klength  varint32               <-- start_
  //    userkey  char[klength]          <-- kstart_
  //    tag      uint64
  //                                    <-- end_
  // The array is a suitable MemTable key.
  // The suffix starting with "userkey" can be used as an InternalKey.
  const char* start_;
  const char* kstart_;
  const char* end_;
  char space_[200];  // Avoid allocation for short keys

  // No copying allowed
  LookupKey(const LookupKey&);
  void operator=(const LookupKey&);
};
|
||||
|
||||
// Release the heap buffer only when the constructor did not use the
// inline space_ array.
inline LookupKey::~LookupKey() {
  if (start_ != space_) {
    delete[] start_;
  }
}
|
||||
|
||||
// Reusable key buffer for iterators: holds either a verbatim key (SetKey)
// or an internal key assembled from (user_key, sequence, type).  Starts on
// a small inline buffer and switches to the heap only for larger keys.
class IterKey {
 public:
  IterKey() : key_(space_), buf_size_(sizeof(space_)), key_size_(0) {}

  ~IterKey() { ResetBuffer(); }

  Slice GetKey() const { return Slice(key_, key_size_); }

  // Logically empty the key; buffer capacity is retained for reuse.
  void Clear() { key_size_ = 0; }

  // Copy `key` verbatim into the buffer.
  void SetKey(const Slice& key) {
    size_t size = key.size();
    EnlargeBufferIfNeeded(size);
    memcpy(key_, key.data(), size);
    key_size_ = size;
  }

  // Build an internal key: user key followed by the packed 8-byte tag.
  void SetInternalKey(const Slice& user_key, SequenceNumber s,
                      ValueType value_type = kValueTypeForSeek) {
    size_t usize = user_key.size();
    EnlargeBufferIfNeeded(usize + sizeof(uint64_t));
    memcpy(key_, user_key.data(), usize);
    EncodeFixed64(key_ + usize, PackSequenceAndType(s, value_type));
    key_size_ = usize + sizeof(uint64_t);
  }

  void SetInternalKey(const ParsedInternalKey& parsed_key) {
    SetInternalKey(parsed_key.user_key, parsed_key.sequence, parsed_key.type);
  }

 private:
  char* key_;        // Current buffer: either space_ or heap-allocated.
  size_t buf_size_;  // Capacity of key_.
  size_t key_size_;  // Bytes of key_ currently in use.
  char space_[32];   // Avoid allocation for short keys

  // Free any heap buffer and fall back to the inline one.
  void ResetBuffer() {
    if (key_ != nullptr && key_ != space_) {
      delete[] key_;
    }
    key_ = space_;
    buf_size_ = sizeof(space_);
    key_size_ = 0;
  }

  // Enlarge the buffer size if needed based on key_size.
  // By default, static allocated buffer is used. Once there is a key
  // larger than the static allocated buffer, another buffer is dynamically
  // allocated, until a larger key buffer is requested. In that case, we
  // reallocate buffer and delete the old one.
  // Note: existing contents are NOT preserved across enlargement; callers
  // always rewrite the whole buffer afterwards.
  void EnlargeBufferIfNeeded(size_t key_size) {
    // If size is smaller than buffer size, continue using current buffer,
    // or the static allocated one, as default
    if (key_size > buf_size_) {
      // Need to enlarge the buffer.
      ResetBuffer();
      key_ = new char[key_size];
      buf_size_ = key_size;
    }
  }

  // No copying allowed
  IterKey(const IterKey&) = delete;
  void operator=(const IterKey&) = delete;
};
|
||||
|
||||
// SliceTransform adapter: applies a user-key prefix extractor to internal
// keys by first stripping the trailing 8-byte tag.
class InternalKeySliceTransform : public SliceTransform {
 public:
  explicit InternalKeySliceTransform(const SliceTransform* transform)
      : transform_(transform) {}

  virtual const char* Name() const { return transform_->Name(); }

  // Extract the prefix of the user-key portion of src.
  virtual Slice Transform(const Slice& src) const {
    auto user_key = ExtractUserKey(src);
    return transform_->Transform(user_key);
  }

  virtual bool InDomain(const Slice& src) const {
    auto user_key = ExtractUserKey(src);
    return transform_->InDomain(user_key);
  }

  virtual bool InRange(const Slice& dst) const {
    auto user_key = ExtractUserKey(dst);
    return transform_->InRange(user_key);
  }

  const SliceTransform* user_prefix_extractor() const { return transform_; }

 private:
  // Like comparator, InternalKeySliceTransform will not take care of the
  // deletion of transform_
  const SliceTransform* const transform_;
};
|
||||
|
||||
} // namespace rocksdb
|
||||
117
db/dbformat_test.cc
Normal file
117
db/dbformat_test.cc
Normal file
@@ -0,0 +1,117 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include "db/dbformat.h"
|
||||
#include "util/logging.h"
|
||||
#include "util/testharness.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
// Encode (user_key, seq, vt) into a fresh internal-key string.
static std::string IKey(const std::string& user_key,
                        uint64_t seq,
                        ValueType vt) {
  std::string encoded;
  AppendInternalKey(&encoded, ParsedInternalKey(user_key, seq, vt));
  return encoded;
}

// Run FindShortestSeparator(s, l) under the bytewise internal comparator
// and return the (possibly shortened) result.
static std::string Shorten(const std::string& s, const std::string& l) {
  std::string result = s;
  InternalKeyComparator(BytewiseComparator()).FindShortestSeparator(&result, l);
  return result;
}

// Run FindShortSuccessor(s) under the bytewise internal comparator.
static std::string ShortSuccessor(const std::string& s) {
  std::string result = s;
  InternalKeyComparator(BytewiseComparator()).FindShortSuccessor(&result);
  return result;
}

// Round-trip check: encode (key, seq, vt), decode it back, and verify all
// three components survive.  Also checks that a too-short input ("bar",
// 3 bytes < the 8-byte tag) fails to parse.
static void TestKey(const std::string& key,
                    uint64_t seq,
                    ValueType vt) {
  std::string encoded = IKey(key, seq, vt);

  Slice in(encoded);
  ParsedInternalKey decoded("", 0, kTypeValue);

  ASSERT_TRUE(ParseInternalKey(in, &decoded));
  ASSERT_EQ(key, decoded.user_key.ToString());
  ASSERT_EQ(seq, decoded.sequence);
  ASSERT_EQ(vt, decoded.type);

  ASSERT_TRUE(!ParseInternalKey(Slice("bar"), &decoded));
}
|
||||
|
||||
// Empty fixture class; the old-style TEST macro keys tests off its name.
class FormatTest { };

// Round-trip encode/decode across several key lengths and sequence numbers
// chosen to straddle byte boundaries of the packed 64-bit tag.
TEST(FormatTest, InternalKey_EncodeDecode) {
  const char* keys[] = { "", "k", "hello", "longggggggggggggggggggggg" };
  const uint64_t seq[] = {
    1, 2, 3,
    (1ull << 8) - 1, 1ull << 8, (1ull << 8) + 1,
    (1ull << 16) - 1, 1ull << 16, (1ull << 16) + 1,
    (1ull << 32) - 1, 1ull << 32, (1ull << 32) + 1
  };
  for (unsigned int k = 0; k < sizeof(keys) / sizeof(keys[0]); k++) {
    for (unsigned int s = 0; s < sizeof(seq) / sizeof(seq[0]); s++) {
      TestKey(keys[k], seq[s], kTypeValue);
      TestKey("hello", 1, kTypeDeletion);
    }
  }
}

// FindShortestSeparator must return a key in [start, limit) — shortened
// when the user keys allow it, unchanged otherwise.
TEST(FormatTest, InternalKeyShortSeparator) {
  // When user keys are same
  ASSERT_EQ(IKey("foo", 100, kTypeValue),
            Shorten(IKey("foo", 100, kTypeValue),
                    IKey("foo", 99, kTypeValue)));
  ASSERT_EQ(IKey("foo", 100, kTypeValue),
            Shorten(IKey("foo", 100, kTypeValue),
                    IKey("foo", 101, kTypeValue)));
  ASSERT_EQ(IKey("foo", 100, kTypeValue),
            Shorten(IKey("foo", 100, kTypeValue),
                    IKey("foo", 100, kTypeValue)));
  ASSERT_EQ(IKey("foo", 100, kTypeValue),
            Shorten(IKey("foo", 100, kTypeValue),
                    IKey("foo", 100, kTypeDeletion)));

  // When user keys are misordered
  ASSERT_EQ(IKey("foo", 100, kTypeValue),
            Shorten(IKey("foo", 100, kTypeValue),
                    IKey("bar", 99, kTypeValue)));

  // When user keys are different, but correctly ordered
  ASSERT_EQ(IKey("g", kMaxSequenceNumber, kValueTypeForSeek),
            Shorten(IKey("foo", 100, kTypeValue),
                    IKey("hello", 200, kTypeValue)));

  // When start user key is prefix of limit user key
  ASSERT_EQ(IKey("foo", 100, kTypeValue),
            Shorten(IKey("foo", 100, kTypeValue),
                    IKey("foobar", 200, kTypeValue)));

  // When limit user key is prefix of start user key
  ASSERT_EQ(IKey("foobar", 100, kTypeValue),
            Shorten(IKey("foobar", 100, kTypeValue),
                    IKey("foo", 200, kTypeValue)));
}

// FindShortSuccessor bumps the first incrementable byte; an all-0xff key
// cannot be shortened and must come back unchanged.
TEST(FormatTest, InternalKeyShortestSuccessor) {
  ASSERT_EQ(IKey("g", kMaxSequenceNumber, kValueTypeForSeek),
            ShortSuccessor(IKey("foo", 100, kTypeValue)));
  ASSERT_EQ(IKey("\xff\xff", 100, kTypeValue),
            ShortSuccessor(IKey("\xff\xff", 100, kTypeValue)));
}
|
||||
|
||||
} // namespace rocksdb
|
||||
|
||||
// Test-harness entry point: runs every TEST registered in this binary.
int main(int argc, char** argv) {
  return rocksdb::test::RunAllTests();
}
|
||||
295
db/deletefile_test.cc
Normal file
295
db/deletefile_test.cc
Normal file
@@ -0,0 +1,295 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include "rocksdb/db.h"
|
||||
#include "db/db_impl.h"
|
||||
#include "db/filename.h"
|
||||
#include "db/version_set.h"
|
||||
#include "db/write_batch_internal.h"
|
||||
#include "util/testharness.h"
|
||||
#include "util/testutil.h"
|
||||
#include "rocksdb/env.h"
|
||||
#include "rocksdb/transaction_log.h"
|
||||
#include <vector>
|
||||
#include <stdlib.h>
|
||||
#include <map>
|
||||
#include <string>
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
// Test fixture: opens a fresh DB (with a separate WAL directory) and
// provides helpers to add keys, force flushes into levels, and count
// files on disk by type.
class DeleteFileTest {
 public:
  std::string dbname_;
  Options options_;
  DB* db_;
  Env* env_;
  int numlevels_;

  DeleteFileTest() {
    db_ = nullptr;
    env_ = Env::Default();
    // Oversized buffers/level limits so flushes and compactions happen
    // only when the tests trigger them explicitly.
    options_.write_buffer_size = 1024*1024*1000;
    options_.target_file_size_base = 1024*1024*1000;
    options_.max_bytes_for_level_base = 1024*1024*1000;
    options_.WAL_ttl_seconds = 300; // Used to test log files
    options_.WAL_size_limit_MB = 1024; // Used to test log files
    dbname_ = test::TmpDir() + "/deletefile_test";
    options_.wal_dir = dbname_ + "/wal_files";

    // clean up all the files that might have been there before
    std::vector<std::string> old_files;
    env_->GetChildren(dbname_, &old_files);
    for (auto file : old_files) {
      env_->DeleteFile(dbname_ + "/" + file);
    }
    env_->GetChildren(options_.wal_dir, &old_files);
    for (auto file : old_files) {
      env_->DeleteFile(options_.wal_dir + "/" + file);
    }

    DestroyDB(dbname_, options_);
    numlevels_ = 7;
    ASSERT_OK(ReopenDB(true));
  }

  // Close the DB (if open) and reopen it; create==true wipes it first.
  Status ReopenDB(bool create) {
    delete db_;
    if (create) {
      DestroyDB(dbname_, options_);
    }
    db_ = nullptr;
    options_.create_if_missing = create;
    return DB::Open(options_, dbname_, &db_);
  }

  void CloseDB() {
    delete db_;
  }

  // Put numkeys consecutive integer keys starting at startkey (key==value).
  void AddKeys(int numkeys, int startkey = 0) {
    WriteOptions options;
    options.sync = false;
    ReadOptions roptions;
    for (int i = startkey; i < (numkeys + startkey) ; i++) {
      std::string temp = std::to_string(i);
      Slice key(temp);
      Slice value(temp);
      ASSERT_OK(db_->Put(options, key, value));
    }
  }

  // Sum the key counts implied by each file's [smallestkey, largestkey]
  // range (keys are decimal integers); optionally tallies per-level counts
  // into *keysperlevel.  Logs each file's metadata to stderr.
  int numKeysInLevels(
    std::vector<LiveFileMetaData> &metadata,
    std::vector<int> *keysperlevel = nullptr) {

    if (keysperlevel != nullptr) {
      keysperlevel->resize(numlevels_);
    }

    int numKeys = 0;
    for (size_t i = 0; i < metadata.size(); i++) {
      int startkey = atoi(metadata[i].smallestkey.c_str());
      int endkey = atoi(metadata[i].largestkey.c_str());
      int numkeysinfile = (endkey - startkey + 1);
      numKeys += numkeysinfile;
      if (keysperlevel != nullptr) {
        (*keysperlevel)[(int)metadata[i].level] += numkeysinfile;
      }
      fprintf(stderr, "level %d name %s smallest %s largest %s\n",
              metadata[i].level, metadata[i].name.c_str(),
              metadata[i].smallestkey.c_str(),
              metadata[i].largestkey.c_str());
    }
    return numKeys;
  }

  // Write and flush the same 50K-key range twice; the tests expect the
  // resulting compaction state to leave one file on each of two levels.
  void CreateTwoLevels() {
    AddKeys(50000, 10000);
    DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
    ASSERT_OK(dbi->TEST_FlushMemTable());
    ASSERT_OK(dbi->TEST_WaitForFlushMemTable());

    AddKeys(50000, 10000);
    ASSERT_OK(dbi->TEST_FlushMemTable());
    ASSERT_OK(dbi->TEST_WaitForFlushMemTable());
  }

  // Assert that `dir` holds exactly the given number of WAL, SST and
  // MANIFEST files (other file types are ignored).
  void CheckFileTypeCounts(std::string& dir,
                            int required_log,
                            int required_sst,
                            int required_manifest) {
    std::vector<std::string> filenames;
    env_->GetChildren(dir, &filenames);

    int log_cnt = 0, sst_cnt = 0, manifest_cnt = 0;
    for (auto file : filenames) {
      uint64_t number;
      FileType type;
      if (ParseFileName(file, &number, &type)) {
        log_cnt += (type == kLogFile);
        sst_cnt += (type == kTableFile);
        manifest_cnt += (type == kDescriptorFile);
      }
    }
    ASSERT_EQ(required_log, log_cnt);
    ASSERT_EQ(required_sst, sst_cnt);
    ASSERT_EQ(required_manifest, manifest_cnt);
  }

};
|
||||
|
||||
// DeleteFile must reject bogus names and files on intermediate levels,
// and accept files on the lowest populated level.
TEST(DeleteFileTest, AddKeysAndQueryLevels) {
  CreateTwoLevels();
  std::vector<LiveFileMetaData> metadata;
  std::vector<int> keysinlevel;
  db_->GetLiveFilesMetaData(&metadata);

  std::string level1file = "";
  int level1keycount = 0;
  std::string level2file = "";
  int level2keycount = 0;
  int level1index = 0;
  int level2index = 1;

  // Metadata order is not guaranteed; identify which entry is which level.
  ASSERT_EQ((int)metadata.size(), 2);
  if (metadata[0].level == 2) {
    level1index = 1;
    level2index = 0;
  }

  level1file = metadata[level1index].name;
  int startkey = atoi(metadata[level1index].smallestkey.c_str());
  int endkey = atoi(metadata[level1index].largestkey.c_str());
  level1keycount = (endkey - startkey + 1);
  level2file = metadata[level2index].name;
  startkey = atoi(metadata[level2index].smallestkey.c_str());
  endkey = atoi(metadata[level2index].largestkey.c_str());
  level2keycount = (endkey - startkey + 1);

  // Controlled setup. Levels 1 and 2 should both have 50K keys.
  // This is a little fragile as it depends on the current
  // compaction heuristics.
  ASSERT_EQ(level1keycount, 50000);
  ASSERT_EQ(level2keycount, 50000);

  // A file name that doesn't exist in the DB must be rejected.
  Status status = db_->DeleteFile("0.sst");
  ASSERT_TRUE(status.IsInvalidArgument());

  // intermediate level files cannot be deleted.
  status = db_->DeleteFile(level1file);
  ASSERT_TRUE(status.IsInvalidArgument());

  // Lowest level file deletion should succeed.
  ASSERT_OK(db_->DeleteFile(level2file));

  CloseDB();
}

// Obsolete SSTs must be purged after compaction, but only once no live
// iterator still pins them.
TEST(DeleteFileTest, PurgeObsoleteFilesTest) {
  CreateTwoLevels();
  // there should be only one (empty) log file because CreateTwoLevels()
  // flushes the memtables to disk
  CheckFileTypeCounts(options_.wal_dir, 1, 0, 0);
  // 2 ssts, 1 manifest
  CheckFileTypeCounts(dbname_, 0, 2, 1);
  std::string first("0"), last("999999");
  Slice first_slice(first), last_slice(last);
  db_->CompactRange(&first_slice, &last_slice, true, 2);
  // 1 sst after compaction
  CheckFileTypeCounts(dbname_, 0, 1, 1);

  // this time, we keep an iterator alive
  ReopenDB(true);
  Iterator *itr = 0;
  CreateTwoLevels();
  itr = db_->NewIterator(ReadOptions());
  db_->CompactRange(&first_slice, &last_slice, true, 2);
  // 3 sst after compaction with live iterator
  CheckFileTypeCounts(dbname_, 0, 3, 1);
  delete itr;
  // 1 sst after iterator deletion
  CheckFileTypeCounts(dbname_, 0, 1, 1);

  CloseDB();
}

// A live iterator opened before DeleteFile must still see every key.
TEST(DeleteFileTest, DeleteFileWithIterator) {
  CreateTwoLevels();
  ReadOptions options;
  Iterator* it = db_->NewIterator(options);
  std::vector<LiveFileMetaData> metadata;
  db_->GetLiveFilesMetaData(&metadata);

  std::string level2file = "";

  // Pick whichever of the two files sits on the lower level.
  ASSERT_EQ((int)metadata.size(), 2);
  if (metadata[0].level == 1) {
    level2file = metadata[1].name;
  } else {
    level2file = metadata[0].name;
  }

  Status status = db_->DeleteFile(level2file);
  fprintf(stdout, "Deletion status %s: %s\n",
          level2file.c_str(), status.ToString().c_str());
  ASSERT_TRUE(status.ok());
  it->SeekToFirst();
  int numKeysIterated = 0;
  while(it->Valid()) {
    numKeysIterated++;
    it->Next();
  }
  ASSERT_EQ(numKeysIterated, 50000);
  delete it;
  CloseDB();
}

// Alive WAL files must be protected from DeleteFile; archived ones must
// be deletable.
TEST(DeleteFileTest, DeleteLogFiles) {
  AddKeys(10, 0);
  VectorLogPtr logfiles;
  db_->GetSortedWalFiles(logfiles);
  ASSERT_GT(logfiles.size(), 0UL);
  // Take the last log file which is expected to be alive and try to delete it
  // Should not succeed because live logs are not allowed to be deleted
  std::unique_ptr<LogFile> alive_log = std::move(logfiles.back());
  ASSERT_EQ(alive_log->Type(), kAliveLogFile);
  ASSERT_TRUE(env_->FileExists(options_.wal_dir + "/" + alive_log->PathName()));
  fprintf(stdout, "Deleting alive log file %s\n",
          alive_log->PathName().c_str());
  ASSERT_TRUE(!db_->DeleteFile(alive_log->PathName()).ok());
  ASSERT_TRUE(env_->FileExists(options_.wal_dir + "/" + alive_log->PathName()));
  logfiles.clear();

  // Call Flush to bring about a new working log file and add more keys
  // Call Flush again to flush out memtable and move alive log to archived log
  // and try to delete the archived log file
  FlushOptions fopts;
  db_->Flush(fopts);
  AddKeys(10, 0);
  db_->Flush(fopts);
  db_->GetSortedWalFiles(logfiles);
  ASSERT_GT(logfiles.size(), 0UL);
  std::unique_ptr<LogFile> archived_log = std::move(logfiles.front());
  ASSERT_EQ(archived_log->Type(), kArchivedLogFile);
  ASSERT_TRUE(env_->FileExists(options_.wal_dir + "/" +
        archived_log->PathName()));
  fprintf(stdout, "Deleting archived log file %s\n",
          archived_log->PathName().c_str());
  ASSERT_OK(db_->DeleteFile(archived_log->PathName()));
  ASSERT_TRUE(!env_->FileExists(options_.wal_dir + "/" +
        archived_log->PathName()));
  CloseDB();
}
|
||||
|
||||
} //namespace rocksdb
|
||||
|
||||
// Test-harness entry point: runs every TEST registered in this binary.
int main(int argc, char** argv) {
  return rocksdb::test::RunAllTests();
}
|
||||
|
||||
202
db/file_indexer.cc
Normal file
202
db/file_indexer.cc
Normal file
@@ -0,0 +1,202 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include "db/file_indexer.h"
|
||||
#include <algorithm>
|
||||
#include "rocksdb/comparator.h"
|
||||
#include "db/version_edit.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
// Build an indexer over num_levels levels.  ucmp is the user-key
// comparator (not owned).  level_rb_ entries start at -1, meaning
// "no files indexed yet for that level".
FileIndexer::FileIndexer(const uint32_t num_levels,
                         const Comparator* ucmp)
    : num_levels_(num_levels),
      ucmp_(ucmp),
      next_level_index_(num_levels),
      level_rb_(num_levels, -1) {
}


// Number of per-level index vectors (one per level).
uint32_t FileIndexer::NumLevelIndex() {
  return next_level_index_.size();
}

// Number of indexed files at the given level.
uint32_t FileIndexer::LevelIndexSize(uint32_t level) {
  return next_level_index_[level].size();
}
|
||||
|
||||
// Given a file (level, file_index) and the comparison results of the search
// key against that file's smallest key (cmp_smallest) and largest key
// (cmp_largest), produce the binary-search window [*left_bound,
// *right_bound] on the next level down.  An empty window comes back as
// left > right (e.g. 0 / -1).
void FileIndexer::GetNextLevelIndex(
    const uint32_t level, const uint32_t file_index, const int cmp_smallest,
    const int cmp_largest, int32_t* left_bound, int32_t* right_bound) {
  assert(level > 0);

  // Last level, no hint
  if (level == num_levels_ - 1) {
    *left_bound = 0;
    *right_bound = -1;
    return;
  }

  assert(level < num_levels_ - 1);
  assert(static_cast<int32_t>(file_index) <= level_rb_[level]);

  const auto& index = next_level_index_[level][file_index];

  if (cmp_smallest < 0) {
    // Key sorts before this file's smallest: the left edge comes from the
    // previous file's largest_lb (0 when there is no previous file).
    *left_bound = (level > 0 && file_index > 0) ?
      next_level_index_[level][file_index - 1].largest_lb : 0;
    *right_bound = index.smallest_rb;
  } else if (cmp_smallest == 0) {
    // Key equals this file's smallest key.
    *left_bound = index.smallest_lb;
    *right_bound = index.smallest_rb;
  } else if (cmp_smallest > 0 && cmp_largest < 0) {
    // Key falls strictly inside this file's key range.
    *left_bound = index.smallest_lb;
    *right_bound = index.largest_rb;
  } else if (cmp_largest == 0) {
    // Key equals this file's largest key.
    *left_bound = index.largest_lb;
    *right_bound = index.largest_rb;
  } else if (cmp_largest > 0) {
    // Key sorts after this file: search through the end of the next level.
    *left_bound = index.largest_lb;
    *right_bound = level_rb_[level + 1];
  } else {
    assert(false);
  }

  // Window sanity: non-negative left edge, possibly-empty but well-formed
  // window, right edge within the next level's file count.
  assert(*left_bound >= 0);
  assert(*left_bound <= *right_bound + 1);
  assert(*right_bound <= level_rb_[level + 1]);
}
|
||||
|
||||
void FileIndexer::ClearIndex() {
|
||||
for (uint32_t level = 1; level < num_levels_; ++level) {
|
||||
next_level_index_[level].clear();
|
||||
}
|
||||
}
|
||||
|
||||
// Rebuild the per-file index: for each file on levels 1..n-2, compute the
// four bounds (smallest_lb/largest_lb via CalculateLB, smallest_rb/
// largest_rb via CalculateRB) against the files one level down, and record
// each level's last file index in level_rb_.  `files` is an array of
// per-level sorted file vectors.
void FileIndexer::UpdateIndex(std::vector<FileMetaData*>* const files) {
  if (files == nullptr) {
    return;
  }

  // L1 - Ln-1
  for (uint32_t level = 1; level < num_levels_ - 1; ++level) {
    const auto& upper_files = files[level];
    const int32_t upper_size = upper_files.size();
    const auto& lower_files = files[level + 1];
    level_rb_[level] = upper_files.size() - 1;
    if (upper_size == 0) {
      continue;
    }
    auto& index = next_level_index_[level];
    index.resize(upper_size);

    // smallest_lb: first lower file whose largest key is >= this file's
    // smallest key.
    CalculateLB(upper_files, lower_files, &index,
        [this](const FileMetaData* a, const FileMetaData* b) -> int {
          return ucmp_->Compare(a->smallest.user_key(), b->largest.user_key());
        },
        [](IndexUnit* index, int32_t f_idx) {
          index->smallest_lb = f_idx;
        });
    // largest_lb: first lower file whose largest key is >= this file's
    // largest key.
    CalculateLB(upper_files, lower_files, &index,
        [this](const FileMetaData* a, const FileMetaData* b) -> int {
          return ucmp_->Compare(a->largest.user_key(), b->largest.user_key());
        },
        [](IndexUnit* index, int32_t f_idx) {
          index->largest_lb = f_idx;
        });
    // smallest_rb: last lower file whose smallest key is <= this file's
    // smallest key.
    CalculateRB(upper_files, lower_files, &index,
        [this](const FileMetaData* a, const FileMetaData* b) -> int {
          return ucmp_->Compare(a->smallest.user_key(), b->smallest.user_key());
        },
        [](IndexUnit* index, int32_t f_idx) {
          index->smallest_rb = f_idx;
        });
    // largest_rb: last lower file whose smallest key is <= this file's
    // largest key.
    CalculateRB(upper_files, lower_files, &index,
        [this](const FileMetaData* a, const FileMetaData* b) -> int {
          return ucmp_->Compare(a->largest.user_key(), b->smallest.user_key());
        },
        [](IndexUnit* index, int32_t f_idx) {
          index->largest_rb = f_idx;
        });
  }
  // The last level has no level below it; only record its size.
  level_rb_[num_levels_ - 1] = files[num_levels_ - 1].size() - 1;
}
|
||||
|
||||
// Single merge-style pass over two sorted file lists: for each upper file,
// record (via set_index) the index of the first lower file for which
// cmp_op(upper, lower) <= 0.  Upper files beyond every lower file get
// lower_size as their bound.
void FileIndexer::CalculateLB(const std::vector<FileMetaData*>& upper_files,
    const std::vector<FileMetaData*>& lower_files,
    std::vector<IndexUnit>* index,
    std::function<int(const FileMetaData*, const FileMetaData*)> cmp_op,
    std::function<void(IndexUnit*, int32_t)> set_index) {
  const int32_t upper_size = upper_files.size();
  const int32_t lower_size = lower_files.size();
  int32_t upper_idx = 0;
  int32_t lower_idx = 0;
  while (upper_idx < upper_size && lower_idx < lower_size) {
    int cmp = cmp_op(upper_files[upper_idx], lower_files[lower_idx]);

    if (cmp == 0) {
      // Exact boundary match: record it and advance both cursors.
      set_index(&(*index)[upper_idx], lower_idx);
      ++upper_idx;
      ++lower_idx;
    } else if (cmp > 0) {
      // Lower level's file (largest) is smaller, a key won't hit in that
      // file. Move to next lower file
      ++lower_idx;
    } else {
      // Lower level's file becomes larger, update the index, and
      // move to the next upper file
      set_index(&(*index)[upper_idx], lower_idx);
      ++upper_idx;
    }
  }

  while (upper_idx < upper_size) {
    // Lower files are exhausted, that means the remaining upper files are
    // greater than any lower files. Set the index to be the lower level size.
    set_index(&(*index)[upper_idx], lower_size);
    ++upper_idx;
  }
}
|
||||
|
||||
// Compute a right bound in lower_files for every file in upper_files.
// Mirror image of CalculateLB: both sorted lists are swept back-to-front.
// cmp_op defines which key pair is compared; set_index stores the resulting
// lower-level file position into the upper file's IndexUnit.
void FileIndexer::CalculateRB(const std::vector<FileMetaData*>& upper_files,
    const std::vector<FileMetaData*>& lower_files,
    std::vector<IndexUnit>* index,
    std::function<int(const FileMetaData*, const FileMetaData*)> cmp_op,
    std::function<void(IndexUnit*, int32_t)> set_index) {
  const int32_t upper_size = upper_files.size();
  const int32_t lower_size = lower_files.size();
  int32_t u = upper_size - 1;
  int32_t l = lower_size - 1;

  while (u >= 0 && l >= 0) {
    const int cmp = cmp_op(upper_files[u], lower_files[l]);
    if (cmp < 0) {
      // The lower file compares larger: a key in the upper file can never
      // land in it, so step back to the previous lower file.
      --l;
    } else {
      // First lower file (from the right) that does not compare larger:
      // record it as the right bound and step the upper cursor back. On an
      // exact match the lower cursor steps back as well.
      set_index(&(*index)[u], l);
      --u;
      if (cmp == 0) {
        --l;
      }
    }
  }

  // Remaining upper files compare smaller than every lower file; mark their
  // right bound as -1 (nothing to the left to search).
  while (u >= 0) {
    set_index(&(*index)[u], -1);
    --u;
  }
}
|
||||
|
||||
} // namespace rocksdb
|
||||
129
db/file_indexer.h
Normal file
129
db/file_indexer.h
Normal file
@@ -0,0 +1,129 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#pragma once
|
||||
#include <cstdint>
|
||||
#include <functional>
|
||||
#include <limits>
|
||||
#include <vector>
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
class Comparator;
|
||||
struct FileMetaData;
|
||||
|
||||
// The file tree structure in Version is prebuilt and the range of each file
|
||||
// is known. On Version::Get(), it uses binary search to find a potential file
|
||||
// and then check if a target key can be found in the file by comparing the key
|
||||
// to each file's smallest and largest key. The results of these comparisions
|
||||
// can be reused beyond checking if a key falls into a file's range.
|
||||
// With some pre-calculated knowledge, each key comparision that has been done
|
||||
// can serve as a hint to narrow down further searches: if a key compared to
|
||||
// be smaller than a file's smallest or largest, that comparison can be used
|
||||
// to find out the right bound of next binary search. Similarly, if a key
|
||||
// compared to be larger than a file's smallest or largest, it can be utilized
|
||||
// to find out the left bound of next binary search.
|
||||
// With these hints: it can greatly reduce the range of binary search,
|
||||
// especially for bottom levels, given that one file most likely overlaps with
|
||||
// only N files from level below (where N is max_bytes_for_level_multiplier).
|
||||
// So on level L, we will only look at ~N files instead of N^L files on the
|
||||
// naive approach.
|
||||
class FileIndexer {
 public:
  // num_levels: number of LSM levels to index; ucmp: user-key comparator
  // used for all smallest/largest key comparisons.
  FileIndexer(const uint32_t num_levels, const Comparator* ucmp);

  // Number of levels that carry a next-level index.
  uint32_t NumLevelIndex();

  // Number of IndexUnit entries built for `level`.
  uint32_t LevelIndexSize(uint32_t level);

  // Return a file index range in the next level to search for a key based on
  // smallest and largest key comparision for the current file specified by
  // level and file_index. When *left_index < *right_index, both index should
  // be valid and fit in the vector size.
  // cmp_smallest / cmp_largest carry the sign of comparing the search key
  // against the current file's smallest / largest key (tests pass -1/0/1).
  void GetNextLevelIndex(
    const uint32_t level, const uint32_t file_index, const int cmp_smallest,
    const int cmp_largest, int32_t* left_bound, int32_t* right_bound);

  // Drop all precalculated index data.
  void ClearIndex();

  // Rebuild the index from the per-level file lists
  // (files[0] .. files[num_levels - 1]).
  void UpdateIndex(std::vector<FileMetaData*>* const files);

  enum {
    // Largest representable index; presumably used as an unbounded right
    // bound — confirm against the .cc implementation.
    kLevelMaxIndex = std::numeric_limits<int32_t>::max()
  };

 private:
  const uint32_t num_levels_;
  const Comparator* ucmp_;  // user-key comparator (see CalculateLB/RB)

  // Precomputed search bounds for one upper-level file, pointing into the
  // file list of the level below.
  struct IndexUnit {
    IndexUnit()
      : smallest_lb(0), largest_lb(0), smallest_rb(-1), largest_rb(-1) {}
    // During file search, a key is compared against smallest and largest
    // from a FileMetaData. It can have 3 possible outcomes:
    // (1) key is smaller than smallest, implying it is also smaller than
    //     larger. Precalculated index based on "smallest < smallest" can
    //     be used to provide right bound.
    // (2) key is in between smallest and largest.
    //     Precalculated index based on "smallest > greatest" can be used to
    //     provide left bound.
    //     Precalculated index based on "largest < smallest" can be used to
    //     provide right bound.
    // (3) key is larger than largest, implying it is also larger than
    //     smallest. Precalculated index based on "largest > largest" can be
    //     used to provide left bound.
    //
    // As a result, we will need to do:
    // Compare smallest (<=) and largest keys from upper level file with
    // smallest key from lower level to get a right bound.
    // Compare smallest (>=) and largest keys from upper level file with
    // largest key from lower level to get a left bound.
    //
    // Example:
    //    level 1:                     [50 - 60]
    //    level 2:        [1 - 40], [45 - 55], [58 - 80]
    // A key 35, compared to be less than 50, 3rd file on level 2 can be
    // skipped according to rule (1). LB = 0, RB = 1.
    // A key 53, sits in the middle 50 and 60. 1st file on level 2 can be
    // skipped according to rule (2)-a, but the 3rd file cannot be skipped
    // because 60 is greater than 58. LB = 1, RB = 2.
    // A key 70, compared to be larger than 60. 1st and 2nd file can be
    // skipped according to rule (3). LB = 2, RB = 2.
    //
    // Point to a left most file in a lower level that may contain a key,
    // which compares greater than smallest of a FileMetaData (upper level)
    int32_t smallest_lb;
    // Point to a left most file in a lower level that may contain a key,
    // which compares greater than largest of a FileMetaData (upper level)
    int32_t largest_lb;
    // Point to a right most file in a lower level that may contain a key,
    // which compares smaller than smallest of a FileMetaData (upper level)
    int32_t smallest_rb;
    // Point to a right most file in a lower level that may contain a key,
    // which compares smaller than largest of a FileMetaData (upper level)
    int32_t largest_rb;
  };

  // Helpers for UpdateIndex: sweep two sorted file lists and fill one bound
  // field of each IndexUnit (cmp_op selects the keys, set_index the field).
  void CalculateLB(const std::vector<FileMetaData*>& upper_files,
      const std::vector<FileMetaData*>& lower_files,
      std::vector<IndexUnit>* index,
      std::function<int(const FileMetaData*, const FileMetaData*)> cmp_op,
      std::function<void(IndexUnit*, int32_t)> set_index);

  void CalculateRB(const std::vector<FileMetaData*>& upper_files,
      const std::vector<FileMetaData*>& lower_files,
      std::vector<IndexUnit>* index,
      std::function<int(const FileMetaData*, const FileMetaData*)> cmp_op,
      std::function<void(IndexUnit*, int32_t)> set_index);

  // next_level_index_[level][i] holds the bounds for file i of `level`
  // pointing into level + 1.
  std::vector<std::vector<IndexUnit>> next_level_index_;
  // Per-level right-most valid file index (size - 1 of that level).
  std::vector<int32_t> level_rb_;
};
|
||||
|
||||
} // namespace rocksdb
|
||||
330
db/file_indexer_test.cc
Normal file
330
db/file_indexer_test.cc
Normal file
@@ -0,0 +1,330 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include <string>
|
||||
#include "db/file_indexer.h"
|
||||
#include "db/dbformat.h"
|
||||
#include "db/version_edit.h"
|
||||
#include "rocksdb/comparator.h"
|
||||
#include "util/testharness.h"
|
||||
#include "util/testutil.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
class IntComparator : public Comparator {
|
||||
public:
|
||||
int Compare(const Slice& a, const Slice& b) const {
|
||||
assert(a.size() == 8);
|
||||
assert(b.size() == 8);
|
||||
return *reinterpret_cast<const int64_t*>(a.data()) -
|
||||
*reinterpret_cast<const int64_t*>(b.data());
|
||||
}
|
||||
|
||||
const char* Name() const {
|
||||
return "IntComparator";
|
||||
}
|
||||
|
||||
void FindShortestSeparator(std::string* start, const Slice& limit) const {}
|
||||
|
||||
void FindShortSuccessor(std::string* key) const {}
|
||||
};
|
||||
|
||||
|
||||
// Fixture: owns a FileIndexer over kNumLevels levels plus the per-level
// FileMetaData lists fed into it. Files are heap-allocated and reclaimed in
// Reset()/the destructor.
struct FileIndexerTest {
 public:
  FileIndexerTest() :
      kNumLevels(4), indexer(kNumLevels, &ucmp),
      files(new std::vector<FileMetaData*>[kNumLevels]) {
  }

  ~FileIndexerTest() {
    Reset();  // frees every FileMetaData before the array itself
    delete[] files;
  }

  // Append a file with the given [smallest, largest] user-key range (encoded
  // as 8-byte ints) to `level`.
  void AddFile(int level, int64_t smallest, int64_t largest) {
    auto* f = new FileMetaData();
    f->smallest = IntKey(smallest);
    f->largest = IntKey(largest);
    files[level].push_back(f);
  }

  // Build an InternalKey whose user key is the raw 8 bytes of v
  // (matches IntComparator's decoding).
  InternalKey IntKey(int64_t v) {
    return InternalKey(Slice(reinterpret_cast<char*>(&v), 8), 0, kTypeValue);
  }

  // Delete all files, empty every level, and drop the indexer's state.
  void Reset() {
    for (uint32_t i = 0; i < kNumLevels; ++i) {
      for (auto* f : files[i]) {
        delete f;
      }
      files[i].clear();
    }
    indexer.ClearIndex();
  }

  // Wrapper that pre-poisons the outputs with 100 so a test can tell whether
  // the indexer actually wrote both bounds.
  void GetNextLevelIndex(const uint32_t level, const uint32_t file_index,
      const int cmp_smallest, const int cmp_largest, int32_t* left_index,
      int32_t* right_index) {
    *left_index = 100;
    *right_index = 100;
    indexer.GetNextLevelIndex(level, file_index, cmp_smallest, cmp_largest,
        left_index, right_index);
  }

  const uint32_t kNumLevels;
  IntComparator ucmp;
  FileIndexer indexer;

  // Array of kNumLevels vectors; files[level] owns its FileMetaData pointers.
  std::vector<FileMetaData*>* files;
};
|
||||
|
||||
// Exercises GetNextLevelIndex over four file layouts. For every probe,
// (cmp_smallest, cmp_largest) encodes how the search key compared against
// the current file's smallest/largest key; the test checks the returned
// [left, right] search window in the next level.
TEST(FileIndexerTest, next_level_hint) {
  for (uint32_t i = 0; i < kNumLevels; ++i) {
    ASSERT_EQ(0U, indexer.LevelIndexSize(i));
  }

  // Case 1: no overlap, files are on the left of next level files
  // level 1
  AddFile(1, 100, 200);
  AddFile(1, 300, 400);
  AddFile(1, 500, 600);
  // level 2
  AddFile(2, 1500, 1600);
  AddFile(2, 1601, 1699);
  AddFile(2, 1700, 1800);
  // level 3
  AddFile(3, 2500, 2600);
  AddFile(3, 2601, 2699);
  AddFile(3, 2700, 2800);
  indexer.UpdateIndex(files);
  int32_t left = 100;
  int32_t right = 100;
  for (uint32_t level = 1; level < 3; ++level) {
    for (uint32_t f = 0; f < 3; ++f) {
      // Key below/inside the file range: nothing below can match -> empty
      // window (left > right).
      GetNextLevelIndex(level, f, -1, -1, &left, &right);
      ASSERT_EQ(0, left);
      ASSERT_EQ(-1, right);
      GetNextLevelIndex(level, f, 0, -1, &left, &right);
      ASSERT_EQ(0, left);
      ASSERT_EQ(-1, right);
      GetNextLevelIndex(level, f, 1, -1, &left, &right);
      ASSERT_EQ(0, left);
      ASSERT_EQ(-1, right);
      GetNextLevelIndex(level, f, 1, 0, &left, &right);
      ASSERT_EQ(0, left);
      ASSERT_EQ(-1, right);
      // Key above the file: the whole next level remains a candidate.
      GetNextLevelIndex(level, f, 1, 1, &left, &right);
      ASSERT_EQ(0, left);
      ASSERT_EQ(2, right);
    }
  }

  // Case 2: no overlap, files are on the right of next level files
  Reset();
  for (uint32_t i = 1; i < kNumLevels; ++i) {
    ASSERT_EQ(0U, indexer.LevelIndexSize(i));
  }
  // level 1
  AddFile(1, 2100, 2200);
  AddFile(1, 2300, 2400);
  AddFile(1, 2500, 2600);
  // level 2
  AddFile(2, 1500, 1600);
  AddFile(2, 1501, 1699);
  AddFile(2, 1700, 1800);
  // level 3
  AddFile(3, 500, 600);
  AddFile(3, 501, 699);
  AddFile(3, 700, 800);
  indexer.UpdateIndex(files);
  for (uint32_t level = 1; level < 3; ++level) {
    for (uint32_t f = 0; f < 3; ++f) {
      GetNextLevelIndex(level, f, -1, -1, &left, &right);
      ASSERT_EQ(f == 0 ? 0 : 3, left);
      ASSERT_EQ(2, right);
      GetNextLevelIndex(level, f, 0, -1, &left, &right);
      ASSERT_EQ(3, left);
      ASSERT_EQ(2, right);
      GetNextLevelIndex(level, f, 1, -1, &left, &right);
      ASSERT_EQ(3, left);
      ASSERT_EQ(2, right);
      // NOTE(review): duplicate of the previous probe; kept as-is.
      GetNextLevelIndex(level, f, 1, -1, &left, &right);
      ASSERT_EQ(3, left);
      ASSERT_EQ(2, right);
      GetNextLevelIndex(level, f, 1, 0, &left, &right);
      ASSERT_EQ(3, left);
      ASSERT_EQ(2, right);
      GetNextLevelIndex(level, f, 1, 1, &left, &right);
      ASSERT_EQ(3, left);
      ASSERT_EQ(2, right);
    }
  }

  // Case 3: empty L2
  Reset();
  for (uint32_t i = 1; i < kNumLevels; ++i) {
    ASSERT_EQ(0U, indexer.LevelIndexSize(i));
  }
  // level 1
  AddFile(1, 2100, 2200);
  AddFile(1, 2300, 2400);
  AddFile(1, 2500, 2600);
  // level 3
  AddFile(3, 500, 600);
  AddFile(3, 501, 699);
  AddFile(3, 700, 800);
  indexer.UpdateIndex(files);
  for (uint32_t f = 0; f < 3; ++f) {
    // Level 2 is empty, so every probe yields an empty window.
    GetNextLevelIndex(1, f, -1, -1, &left, &right);
    ASSERT_EQ(0, left);
    ASSERT_EQ(-1, right);
    GetNextLevelIndex(1, f, 0, -1, &left, &right);
    ASSERT_EQ(0, left);
    ASSERT_EQ(-1, right);
    GetNextLevelIndex(1, f, 1, -1, &left, &right);
    ASSERT_EQ(0, left);
    ASSERT_EQ(-1, right);
    // NOTE(review): duplicate of the previous probe; kept as-is.
    GetNextLevelIndex(1, f, 1, -1, &left, &right);
    ASSERT_EQ(0, left);
    ASSERT_EQ(-1, right);
    GetNextLevelIndex(1, f, 1, 0, &left, &right);
    ASSERT_EQ(0, left);
    ASSERT_EQ(-1, right);
    GetNextLevelIndex(1, f, 1, 1, &left, &right);
    ASSERT_EQ(0, left);
    ASSERT_EQ(-1, right);
  }


  // Case 4: mixed
  Reset();
  for (uint32_t i = 1; i < kNumLevels; ++i) {
    ASSERT_EQ(0U, indexer.LevelIndexSize(i));
  }
  // level 1
  AddFile(1, 100, 200);
  AddFile(1, 250, 400);
  AddFile(1, 450, 500);
  // level 2
  AddFile(2, 100, 150);  // 0
  AddFile(2, 200, 250);  // 1
  AddFile(2, 251, 300);  // 2
  AddFile(2, 301, 350);  // 3
  AddFile(2, 500, 600);  // 4
  // level 3
  AddFile(3, 0, 50);
  AddFile(3, 100, 200);
  AddFile(3, 201, 250);
  indexer.UpdateIndex(files);
  // level 1, 0
  GetNextLevelIndex(1, 0, -1, -1, &left, &right);
  ASSERT_EQ(0, left);
  ASSERT_EQ(0, right);
  GetNextLevelIndex(1, 0, 0, -1, &left, &right);
  ASSERT_EQ(0, left);
  ASSERT_EQ(0, right);
  GetNextLevelIndex(1, 0, 1, -1, &left, &right);
  ASSERT_EQ(0, left);
  ASSERT_EQ(1, right);
  GetNextLevelIndex(1, 0, 1, 0, &left, &right);
  ASSERT_EQ(1, left);
  ASSERT_EQ(1, right);
  GetNextLevelIndex(1, 0, 1, 1, &left, &right);
  ASSERT_EQ(1, left);
  ASSERT_EQ(4, right);
  // level 1, 1
  GetNextLevelIndex(1, 1, -1, -1, &left, &right);
  ASSERT_EQ(1, left);
  ASSERT_EQ(1, right);
  GetNextLevelIndex(1, 1, 0, -1, &left, &right);
  ASSERT_EQ(1, left);
  ASSERT_EQ(1, right);
  GetNextLevelIndex(1, 1, 1, -1, &left, &right);
  ASSERT_EQ(1, left);
  ASSERT_EQ(3, right);
  GetNextLevelIndex(1, 1, 1, 0, &left, &right);
  ASSERT_EQ(4, left);
  ASSERT_EQ(3, right);
  GetNextLevelIndex(1, 1, 1, 1, &left, &right);
  ASSERT_EQ(4, left);
  ASSERT_EQ(4, right);
  // level 1, 2
  GetNextLevelIndex(1, 2, -1, -1, &left, &right);
  ASSERT_EQ(4, left);
  ASSERT_EQ(3, right);
  GetNextLevelIndex(1, 2, 0, -1, &left, &right);
  ASSERT_EQ(4, left);
  ASSERT_EQ(3, right);
  GetNextLevelIndex(1, 2, 1, -1, &left, &right);
  ASSERT_EQ(4, left);
  ASSERT_EQ(4, right);
  GetNextLevelIndex(1, 2, 1, 0, &left, &right);
  ASSERT_EQ(4, left);
  ASSERT_EQ(4, right);
  GetNextLevelIndex(1, 2, 1, 1, &left, &right);
  ASSERT_EQ(4, left);
  ASSERT_EQ(4, right);
  // level 2, 0
  GetNextLevelIndex(2, 0, -1, -1, &left, &right);
  ASSERT_EQ(0, left);
  ASSERT_EQ(1, right);
  GetNextLevelIndex(2, 0, 0, -1, &left, &right);
  ASSERT_EQ(1, left);
  ASSERT_EQ(1, right);
  GetNextLevelIndex(2, 0, 1, -1, &left, &right);
  ASSERT_EQ(1, left);
  ASSERT_EQ(1, right);
  GetNextLevelIndex(2, 0, 1, 0, &left, &right);
  ASSERT_EQ(1, left);
  ASSERT_EQ(1, right);
  GetNextLevelIndex(2, 0, 1, 1, &left, &right);
  ASSERT_EQ(1, left);
  ASSERT_EQ(2, right);
  // level 2, 1
  GetNextLevelIndex(2, 1, -1, -1, &left, &right);
  ASSERT_EQ(1, left);
  ASSERT_EQ(1, right);
  GetNextLevelIndex(2, 1, 0, -1, &left, &right);
  ASSERT_EQ(1, left);
  ASSERT_EQ(1, right);
  GetNextLevelIndex(2, 1, 1, -1, &left, &right);
  ASSERT_EQ(1, left);
  ASSERT_EQ(2, right);
  GetNextLevelIndex(2, 1, 1, 0, &left, &right);
  ASSERT_EQ(2, left);
  ASSERT_EQ(2, right);
  GetNextLevelIndex(2, 1, 1, 1, &left, &right);
  ASSERT_EQ(2, left);
  ASSERT_EQ(2, right);
  // level 2, [2 - 4], no overlap
  for (uint32_t f = 2; f <= 4; ++f) {
    GetNextLevelIndex(2, f, -1, -1, &left, &right);
    ASSERT_EQ(f == 2 ? 2 : 3, left);
    ASSERT_EQ(2, right);
    GetNextLevelIndex(2, f, 0, -1, &left, &right);
    ASSERT_EQ(3, left);
    ASSERT_EQ(2, right);
    GetNextLevelIndex(2, f, 1, -1, &left, &right);
    ASSERT_EQ(3, left);
    ASSERT_EQ(2, right);
    GetNextLevelIndex(2, f, 1, 0, &left, &right);
    ASSERT_EQ(3, left);
    ASSERT_EQ(2, right);
    GetNextLevelIndex(2, f, 1, 1, &left, &right);
    ASSERT_EQ(3, left);
    ASSERT_EQ(2, right);
  }
}
|
||||
|
||||
} // namespace rocksdb
|
||||
|
||||
// Entry point: run every TEST(...) registered with util/testharness.
int main(int argc, char** argv) {
  return rocksdb::test::RunAllTests();
}
|
||||
266
db/filename.cc
Normal file
266
db/filename.cc
Normal file
@@ -0,0 +1,266 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include "db/filename.h"
|
||||
|
||||
#include <ctype.h>
|
||||
#include <stdio.h>
|
||||
#include "db/dbformat.h"
|
||||
#include "rocksdb/env.h"
|
||||
#include "util/logging.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
// Given a path, flatten the path name by replacing all chars not in
|
||||
// {[0-9,a-z,A-Z,-,_,.]} with _. And append '\0' at the end.
|
||||
// Return the number of chars stored in dest not including the trailing '\0'.
|
||||
// Given a path, flatten the path name by replacing all chars not in
// {[0-9,a-z,A-Z,-,_,.]} with _. And append '\0' at the end.
// A disallowed leading character (e.g. the '/' of an absolute path) is
// dropped rather than replaced.
// Return the number of chars stored in dest not including the trailing '\0'.
static int FlattenPath(const std::string& path, char* dest, int len) {
  // Guard: with no room for even the terminating '\0', writing dest[0]
  // below would run past a zero-length buffer.
  if (len <= 0) {
    return 0;
  }
  int write_idx = 0;
  int i = 0;
  int src_len = path.size();

  while (i < src_len && write_idx < len - 1) {
    if ((path[i] >= 'a' && path[i] <= 'z') ||
        (path[i] >= '0' && path[i] <= '9') ||
        (path[i] >= 'A' && path[i] <= 'Z') ||
        path[i] == '-' ||
        path[i] == '.' ||
        path[i] == '_'){
      dest[write_idx++] = path[i];
    } else {
      if (i > 0)
        dest[write_idx++] = '_';
    }
    i++;
  }

  dest[write_idx] = '\0';
  return write_idx;
}
|
||||
|
||||
// Build "<name>/NNNNNN.<suffix>" where the number is rendered in decimal and
// zero-padded to at least six digits.
static std::string MakeFileName(const std::string& name, uint64_t number,
                                const char* suffix) {
  char formatted[100];
  const auto num = static_cast<unsigned long long>(number);
  snprintf(formatted, sizeof(formatted), "/%06llu.%s", num, suffix);
  std::string result(name);
  result.append(formatted);
  return result;
}
|
||||
|
||||
// Path of write-ahead log file NNNNNN.log directly under `name`.
// Number 0 is reserved (never a valid log number).
std::string LogFileName(const std::string& name, uint64_t number) {
  assert(number > 0);
  return MakeFileName(name, number, "log");
}
|
||||
|
||||
// Path of the fixed "archive" subdirectory (ARCHIVAL_DIR) under `dir`, where
// archived WAL files are kept.
std::string ArchivalDirectory(const std::string& dir) {
  return dir + "/" + ARCHIVAL_DIR;
}
|
||||
// Same NNNNNN.log naming as LogFileName, but placed under the archive
// subdirectory of `name`.
std::string ArchivedLogFileName(const std::string& name, uint64_t number) {
  assert(number > 0);
  return MakeFileName(name + "/" + ARCHIVAL_DIR, number, "log");
}
|
||||
|
||||
// Path of sstable file NNNNNN.sst directly under `name`.
// Number 0 is reserved (never a valid table number).
std::string TableFileName(const std::string& name, uint64_t number) {
  assert(number > 0);
  return MakeFileName(name, number, "sst");
}
|
||||
|
||||
// Path of manifest file MANIFEST-NNNNNN (number zero-padded to at least six
// digits) directly under `dbname`. Number 0 is reserved.
std::string DescriptorFileName(const std::string& dbname, uint64_t number) {
  assert(number > 0);
  char suffix[100];
  snprintf(suffix, sizeof(suffix), "/MANIFEST-%06llu",
           static_cast<unsigned long long>(number));
  std::string result(dbname);
  result += suffix;
  return result;
}
|
||||
|
||||
// Path of the CURRENT file (which names the active manifest), directly under
// the db directory.
std::string CurrentFileName(const std::string& dbname) {
  std::string result(dbname);
  result.append("/CURRENT");
  return result;
}
|
||||
|
||||
// Path of the db's LOCK file, directly under the db directory.
std::string LockFileName(const std::string& dbname) {
  std::string result(dbname);
  result.append("/LOCK");
  return result;
}
|
||||
|
||||
// Path of temporary file NNNNNN.dbtmp directly under `dbname`.
// Unlike the log/sst helpers there is no assert(number > 0): number 0 is
// deliberately used for the temporary IDENTITY file (see SetIdentityFile).
std::string TempFileName(const std::string& dbname, uint64_t number) {
  return MakeFileName(dbname, number, "dbtmp");
}
|
||||
|
||||
// Path of the current info log. With no log_dir it is dbname/LOG; otherwise
// it is placed in log_dir, prefixed with the flattened db_path so databases
// sharing one log_dir get distinct file names.
std::string InfoLogFileName(const std::string& dbname,
    const std::string& db_path, const std::string& log_dir) {
  if (log_dir.empty())
    return dbname + "/LOG";

  char flatten_db_path[256];
  FlattenPath(db_path, flatten_db_path, 256);
  return log_dir + "/" + flatten_db_path + "_LOG";
}
|
||||
|
||||
// Return the name of the old info log file for "dbname".
|
||||
std::string OldInfoLogFileName(const std::string& dbname, uint64_t ts,
    const std::string& db_path, const std::string& log_dir) {
  // ts is rendered in decimal and appended as the ".old.<ts>" suffix.
  char buf[50];
  snprintf(buf, sizeof(buf), "%llu", static_cast<unsigned long long>(ts));

  // Same placement rules as InfoLogFileName: dbname/LOG.old.<ts> by default,
  // otherwise <log_dir>/<flattened db_path>_LOG.old.<ts>.
  if (log_dir.empty())
    return dbname + "/LOG.old." + buf;

  char flatten_db_path[256];
  FlattenPath(db_path, flatten_db_path, 256);
  return log_dir + "/" + flatten_db_path + "_LOG.old." + buf;
}
|
||||
|
||||
// Path of metadatabase METADB-<number> directly under `dbname`. Unlike
// MANIFEST numbers, the value is not zero-padded and 0 is accepted.
std::string MetaDatabaseName(const std::string& dbname, uint64_t number) {
  char suffix[100];
  snprintf(suffix, sizeof(suffix), "/METADB-%llu",
           static_cast<unsigned long long>(number));
  std::string result(dbname);
  result += suffix;
  return result;
}
|
||||
|
||||
// Path of the IDENTITY file, directly under the db directory.
std::string IdentityFileName(const std::string& dbname) {
  std::string result(dbname);
  result.append("/IDENTITY");
  return result;
}
|
||||
|
||||
// Owned filenames have the form:
|
||||
// dbname/IDENTITY
|
||||
// dbname/CURRENT
|
||||
// dbname/LOCK
|
||||
// dbname/LOG
|
||||
// dbname/LOG.old.[0-9]+
|
||||
// dbname/MANIFEST-[0-9]+
|
||||
// dbname/[0-9]+.(log|sst)
|
||||
// dbname/METADB-[0-9]+
|
||||
// Disregards / at the beginning
|
||||
// Classify `fname` into a FileType (and optionally a WalFileType) and decode
// its embedded number; returns false for anything unrecognized. Fixed names
// (IDENTITY/CURRENT/LOCK/LOG) report number 0.
bool ParseFileName(const std::string& fname,
                   uint64_t* number,
                   FileType* type,
                   WalFileType* log_type) {
  Slice rest(fname);
  // Tolerate a single leading '/' so "name" and "/name" parse identically.
  if (fname.length() > 1 && fname[0] == '/') {
    rest.remove_prefix(1);
  }
  if (rest == "IDENTITY") {
    *number = 0;
    *type = kIdentityFile;
  } else if (rest == "CURRENT") {
    *number = 0;
    *type = kCurrentFile;
  } else if (rest == "LOCK") {
    *number = 0;
    *type = kDBLockFile;
  } else if (rest == "LOG" || rest == "LOG.old") {
    *number = 0;
    *type = kInfoLogFile;
  } else if (rest.starts_with("LOG.old.")) {
    // Rotated info log: the timestamp suffix becomes the reported number.
    uint64_t ts_suffix;
    // sizeof also counts the trailing '\0'.
    rest.remove_prefix(sizeof("LOG.old.") - 1);
    if (!ConsumeDecimalNumber(&rest, &ts_suffix)) {
      return false;
    }
    *number = ts_suffix;
    *type = kInfoLogFile;
  } else if (rest.starts_with("MANIFEST-")) {
    rest.remove_prefix(strlen("MANIFEST-"));
    uint64_t num;
    if (!ConsumeDecimalNumber(&rest, &num)) {
      return false;
    }
    // Anything after the number makes the name invalid.
    if (!rest.empty()) {
      return false;
    }
    *type = kDescriptorFile;
    *number = num;
  } else if (rest.starts_with("METADB-")) {
    rest.remove_prefix(strlen("METADB-"));
    uint64_t num;
    if (!ConsumeDecimalNumber(&rest, &num)) {
      return false;
    }
    if (!rest.empty()) {
      return false;
    }
    *type = kMetaDatabase;
    *number = num;
  } else {
    // Numbered files: [archive/]<num>.(log|sst|dbtmp)
    // Avoid strtoull() to keep filename format independent of the
    // current locale
    bool archive_dir_found = false;
    if (rest.starts_with(ARCHIVAL_DIR)) {
      if (rest.size() <= ARCHIVAL_DIR.size()) {
        return false;
      }
      rest.remove_prefix(ARCHIVAL_DIR.size() + 1); // Add 1 to remove / also
      if (log_type) {
        *log_type = kArchivedLogFile;
      }
      archive_dir_found = true;
    }
    uint64_t num;
    if (!ConsumeDecimalNumber(&rest, &num)) {
      return false;
    }
    Slice suffix = rest;
    if (suffix == Slice(".log")) {
      *type = kLogFile;
      if (log_type && !archive_dir_found) {
        *log_type = kAliveLogFile;
      }
    } else if (archive_dir_found) {
      return false; // Archive dir can contain only log files
    } else if (suffix == Slice(".sst")) {
      *type = kTableFile;
    } else if (suffix == Slice(".dbtmp")) {
      *type = kTempFile;
    } else {
      return false;
    }
    *number = num;
  }
  return true;
}
|
||||
|
||||
// Point the CURRENT file at MANIFEST-<descriptor_number>. The content is
// written to a temp file first and then renamed over CURRENT; on failure the
// temp file is removed (best effort) and the error status returned.
Status SetCurrentFile(Env* env, const std::string& dbname,
                      uint64_t descriptor_number,
                      Directory* directory_to_fsync) {
  // Remove leading "dbname/" and add newline to manifest file name
  std::string manifest = DescriptorFileName(dbname, descriptor_number);
  Slice contents = manifest;
  assert(contents.starts_with(dbname + "/"));
  contents.remove_prefix(dbname.size() + 1);
  std::string tmp = TempFileName(dbname, descriptor_number);
  Status s = WriteStringToFile(env, contents.ToString() + "\n", tmp, true);
  if (s.ok()) {
    s = env->RenameFile(tmp, CurrentFileName(dbname));
  }
  if (s.ok()) {
    // Persist the rename itself by fsyncing the containing directory.
    if (directory_to_fsync != nullptr) {
      directory_to_fsync->Fsync();
    }
  } else {
    env->DeleteFile(tmp);
  }
  return s;
}
|
||||
|
||||
// Write a freshly generated unique id into dbname/IDENTITY, going through a
// temp file plus rename; the temp file is deleted (best effort) on failure.
Status SetIdentityFile(Env* env, const std::string& dbname) {
  std::string id = env->GenerateUniqueId();
  assert(!id.empty());
  // Reserve the filename dbname/000000.dbtmp for the temporary identity file
  std::string tmp = TempFileName(dbname, 0);
  Status s = WriteStringToFile(env, id, tmp, true);
  if (s.ok()) {
    s = env->RenameFile(tmp, IdentityFileName(dbname));
  }
  if (!s.ok()) {
    env->DeleteFile(tmp);
  }
  return s;
}
|
||||
|
||||
} // namespace rocksdb
|
||||
110
db/filename.h
Normal file
110
db/filename.h
Normal file
@@ -0,0 +1,110 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
//
|
||||
// File names used by DB code
|
||||
|
||||
#pragma once
|
||||
#include <stdint.h>
|
||||
#include <string>
|
||||
#include "rocksdb/slice.h"
|
||||
#include "rocksdb/status.h"
|
||||
#include "rocksdb/transaction_log.h"
|
||||
#include "port/port.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
class Env;
|
||||
class Directory;
|
||||
|
||||
// Kinds of files a db directory may contain; see ParseFileName for the
// corresponding name patterns.
enum FileType {
  kLogFile,        // [0-9]+.log (write-ahead log, possibly under archive/)
  kDBLockFile,     // LOCK
  kTableFile,      // [0-9]+.sst
  kDescriptorFile, // MANIFEST-[0-9]+
  kCurrentFile,    // CURRENT
  kTempFile,       // [0-9]+.dbtmp
  kInfoLogFile,    // Either the current one, or an old one
  kMetaDatabase,   // METADB-[0-9]+
  kIdentityFile    // IDENTITY
};
|
||||
|
||||
// Return the name of the log file with the specified number
|
||||
// in the db named by "dbname". The result will be prefixed with
|
||||
// "dbname".
|
||||
extern std::string LogFileName(const std::string& dbname, uint64_t number);
|
||||
|
||||
static const std::string ARCHIVAL_DIR = "archive";
|
||||
|
||||
extern std::string ArchivalDirectory(const std::string& dbname);
|
||||
|
||||
// Return the name of the archived log file with the specified number
|
||||
// in the db named by "dbname". The result will be prefixed with "dbname".
|
||||
extern std::string ArchivedLogFileName(const std::string& dbname,
|
||||
uint64_t num);
|
||||
|
||||
// Return the name of the sstable with the specified number
|
||||
// in the db named by "dbname". The result will be prefixed with
|
||||
// "dbname".
|
||||
extern std::string TableFileName(const std::string& dbname, uint64_t number);
|
||||
|
||||
// Return the name of the descriptor file for the db named by
|
||||
// "dbname" and the specified incarnation number. The result will be
|
||||
// prefixed with "dbname".
|
||||
extern std::string DescriptorFileName(const std::string& dbname,
|
||||
uint64_t number);
|
||||
|
||||
// Return the name of the current file. This file contains the name
|
||||
// of the current manifest file. The result will be prefixed with
|
||||
// "dbname".
|
||||
extern std::string CurrentFileName(const std::string& dbname);
|
||||
|
||||
// Return the name of the lock file for the db named by
|
||||
// "dbname". The result will be prefixed with "dbname".
|
||||
extern std::string LockFileName(const std::string& dbname);
|
||||
|
||||
// Return the name of a temporary file owned by the db named "dbname".
|
||||
// The result will be prefixed with "dbname".
|
||||
extern std::string TempFileName(const std::string& dbname, uint64_t number);
|
||||
|
||||
// Return the name of the info log file for "dbname".
|
||||
extern std::string InfoLogFileName(const std::string& dbname,
|
||||
const std::string& db_path="", const std::string& log_dir="");
|
||||
|
||||
// Return the name of the old info log file for "dbname".
|
||||
extern std::string OldInfoLogFileName(const std::string& dbname, uint64_t ts,
|
||||
const std::string& db_path="", const std::string& log_dir="");
|
||||
|
||||
// Return the name to use for a metadatabase. The result will be prefixed with
|
||||
// "dbname".
|
||||
extern std::string MetaDatabaseName(const std::string& dbname,
|
||||
uint64_t number);
|
||||
|
||||
// Return the name of the Identity file which stores a unique number for the db
|
||||
// that will get regenerated if the db loses all its data and is recreated fresh
|
||||
// either from a backup-image or empty
|
||||
extern std::string IdentityFileName(const std::string& dbname);
|
||||
|
||||
// If filename is a rocksdb file, store the type of the file in *type.
|
||||
// The number encoded in the filename is stored in *number. If the
|
||||
// filename was successfully parsed, returns true. Else return false.
|
||||
extern bool ParseFileName(const std::string& filename,
|
||||
uint64_t* number,
|
||||
FileType* type,
|
||||
WalFileType* log_type = nullptr);
|
||||
|
||||
// Make the CURRENT file point to the descriptor file with the
|
||||
// specified number.
|
||||
extern Status SetCurrentFile(Env* env, const std::string& dbname,
|
||||
uint64_t descriptor_number,
|
||||
Directory* directory_to_fsync);
|
||||
|
||||
// Make the IDENTITY file for the db
|
||||
extern Status SetIdentityFile(Env* env, const std::string& dbname);
|
||||
|
||||
} // namespace rocksdb
|
||||
140
db/filename_test.cc
Normal file
140
db/filename_test.cc
Normal file
@@ -0,0 +1,140 @@
|
||||
// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#include "db/filename.h"

#include "db/dbformat.h"
#include "port/port.h"
#include "util/logging.h"
#include "util/testharness.h"

namespace rocksdb {

class FileNameTest { };

TEST(FileNameTest, Parse) {
  // FIX: removed unused local `Slice db;`.
  FileType type;
  uint64_t number;

  // Successful parses: each name must yield the expected number and type.
  static const struct {
    const char* fname;
    uint64_t number;
    FileType type;
  } kCases[] = {
      { "100.log", 100, kLogFile },
      { "0.log", 0, kLogFile },
      { "0.sst", 0, kTableFile },
      { "CURRENT", 0, kCurrentFile },
      { "LOCK", 0, kDBLockFile },
      { "MANIFEST-2", 2, kDescriptorFile },
      { "MANIFEST-7", 7, kDescriptorFile },
      { "METADB-2", 2, kMetaDatabase },
      { "METADB-7", 7, kMetaDatabase },
      { "LOG", 0, kInfoLogFile },
      { "LOG.old", 0, kInfoLogFile },
      { "18446744073709551615.log", 18446744073709551615ull, kLogFile },
  };
  for (const auto& c : kCases) {
    std::string f = c.fname;
    ASSERT_TRUE(ParseFileName(f, &number, &type)) << f;
    ASSERT_EQ(c.type, type) << f;
    ASSERT_EQ(c.number, number) << f;
  }

  // Errors: none of these should parse.
  static const char* kErrors[] = {
      "",
      "foo",
      "foo-dx-100.log",
      ".log",
      "",
      "manifest",
      "CURREN",
      "CURRENTX",
      "MANIFES",
      "MANIFEST",
      "MANIFEST-",
      "XMANIFEST-3",
      "MANIFEST-3x",
      "META",
      "METADB",
      "METADB-",
      "XMETADB-3",
      "METADB-3x",
      "LOC",
      "LOCKx",
      "LO",
      "LOGx",
      "18446744073709551616.log",
      "184467440737095516150.log",
      "100",
      "100.",
      "100.lop"
  };
  for (const char* e : kErrors) {
    std::string f = e;
    ASSERT_TRUE(!ParseFileName(f, &number, &type)) << f;
  }  // FIX: removed stray ';' that followed this loop.
}

TEST(FileNameTest, Construction) {
  uint64_t number;
  FileType type;
  std::string fname;

  // Every constructed name must be prefixed with the db directory and
  // round-trip through ParseFileName.
  fname = CurrentFileName("foo");
  ASSERT_EQ("foo/", std::string(fname.data(), 4));
  ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
  ASSERT_EQ(0U, number);
  ASSERT_EQ(kCurrentFile, type);

  fname = LockFileName("foo");
  ASSERT_EQ("foo/", std::string(fname.data(), 4));
  ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
  ASSERT_EQ(0U, number);
  ASSERT_EQ(kDBLockFile, type);

  fname = LogFileName("foo", 192);
  ASSERT_EQ("foo/", std::string(fname.data(), 4));
  ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
  ASSERT_EQ(192U, number);
  ASSERT_EQ(kLogFile, type);

  fname = TableFileName("bar", 200);
  ASSERT_EQ("bar/", std::string(fname.data(), 4));
  ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
  ASSERT_EQ(200U, number);
  ASSERT_EQ(kTableFile, type);

  fname = DescriptorFileName("bar", 100);
  ASSERT_EQ("bar/", std::string(fname.data(), 4));
  ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
  ASSERT_EQ(100U, number);
  ASSERT_EQ(kDescriptorFile, type);

  fname = TempFileName("tmp", 999);
  ASSERT_EQ("tmp/", std::string(fname.data(), 4));
  ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
  ASSERT_EQ(999U, number);
  ASSERT_EQ(kTempFile, type);

  fname = MetaDatabaseName("met", 100);
  ASSERT_EQ("met/", std::string(fname.data(), 4));
  ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
  ASSERT_EQ(100U, number);
  ASSERT_EQ(kMetaDatabase, type);
}

}  // namespace rocksdb

int main(int argc, char** argv) {
  return rocksdb::test::RunAllTests();
}
|
||||
383
db/forward_iterator.cc
Normal file
383
db/forward_iterator.cc
Normal file
@@ -0,0 +1,383 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
|
||||
#ifndef ROCKSDB_LITE
|
||||
#include "db/forward_iterator.h"
|
||||
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <limits>
|
||||
#include "db/db_impl.h"
|
||||
#include "db/db_iter.h"
|
||||
#include "db/column_family.h"
|
||||
#include "rocksdb/env.h"
|
||||
#include "rocksdb/slice.h"
|
||||
#include "rocksdb/slice_transform.h"
|
||||
#include "table/merger.h"
|
||||
#include "db/dbformat.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
// Usage:
|
||||
// LevelIterator iter;
|
||||
// iter.SetFileIndex(file_index);
|
||||
// iter.Seek(target);
|
||||
// iter.Next()
|
||||
class LevelIterator : public Iterator {
|
||||
public:
|
||||
LevelIterator(const ColumnFamilyData* const cfd,
|
||||
const ReadOptions& read_options,
|
||||
const std::vector<FileMetaData*>& files)
|
||||
: cfd_(cfd), read_options_(read_options), files_(files), valid_(false),
|
||||
file_index_(std::numeric_limits<uint32_t>::max()) {}
|
||||
|
||||
void SetFileIndex(uint32_t file_index) {
|
||||
assert(file_index < files_.size());
|
||||
if (file_index != file_index_) {
|
||||
file_index_ = file_index;
|
||||
file_iter_.reset(cfd_->table_cache()->NewIterator(
|
||||
read_options_, *(cfd_->soptions()), cfd_->internal_comparator(),
|
||||
*(files_[file_index_]), nullptr /* table_reader_ptr */, false));
|
||||
}
|
||||
valid_ = false;
|
||||
}
|
||||
void SeekToLast() override {
|
||||
status_ = Status::NotSupported("LevelIterator::SeekToLast()");
|
||||
valid_ = false;
|
||||
}
|
||||
void Prev() {
|
||||
status_ = Status::NotSupported("LevelIterator::Prev()");
|
||||
valid_ = false;
|
||||
}
|
||||
bool Valid() const override {
|
||||
return valid_;
|
||||
}
|
||||
void SeekToFirst() override {
|
||||
SetFileIndex(0);
|
||||
file_iter_->SeekToFirst();
|
||||
valid_ = file_iter_->Valid();
|
||||
}
|
||||
void Seek(const Slice& internal_key) override {
|
||||
assert(file_iter_ != nullptr);
|
||||
file_iter_->Seek(internal_key);
|
||||
valid_ = file_iter_->Valid();
|
||||
assert(valid_);
|
||||
}
|
||||
void Next() override {
|
||||
assert(valid_);
|
||||
file_iter_->Next();
|
||||
while (!file_iter_->Valid()) {
|
||||
if (file_index_ + 1 >= files_.size()) {
|
||||
valid_ = false;
|
||||
return;
|
||||
}
|
||||
SetFileIndex(file_index_ + 1);
|
||||
file_iter_->SeekToFirst();
|
||||
}
|
||||
valid_ = file_iter_->Valid();
|
||||
}
|
||||
Slice key() const override {
|
||||
assert(valid_);
|
||||
return file_iter_->key();
|
||||
}
|
||||
Slice value() const override {
|
||||
assert(valid_);
|
||||
return file_iter_->value();
|
||||
}
|
||||
Status status() const override {
|
||||
return status_;
|
||||
}
|
||||
|
||||
private:
|
||||
const ColumnFamilyData* const cfd_;
|
||||
const ReadOptions& read_options_;
|
||||
const std::vector<FileMetaData*>& files_;
|
||||
|
||||
bool valid_;
|
||||
uint32_t file_index_;
|
||||
Status status_;
|
||||
std::unique_ptr<Iterator> file_iter_;
|
||||
};
|
||||
|
||||
// Construct an iterator over the given column family. No super version is
// pinned yet; that happens lazily on the first Seek()/SeekToFirst().
ForwardIterator::ForwardIterator(DBImpl* db, const ReadOptions& read_options,
                                 ColumnFamilyData* cfd)
    : db_(db),
      read_options_(read_options),
      cfd_(cfd),
      prefix_extractor_(cfd->options()->prefix_extractor.get()),
      user_comparator_(cfd->user_comparator()),
      immutable_min_heap_(MinIterComparator(&cfd_->internal_comparator())),
      sv_(nullptr),
      mutable_iter_(nullptr),
      current_(nullptr),
      valid_(false),
      is_prev_set_(false) {}

ForwardIterator::~ForwardIterator() { Cleanup(); }
|
||||
|
||||
void ForwardIterator::Cleanup() {
|
||||
delete mutable_iter_;
|
||||
for (auto* m : imm_iters_) {
|
||||
delete m;
|
||||
}
|
||||
imm_iters_.clear();
|
||||
for (auto* f : l0_iters_) {
|
||||
delete f;
|
||||
}
|
||||
l0_iters_.clear();
|
||||
for (auto* l : level_iters_) {
|
||||
delete l;
|
||||
}
|
||||
level_iters_.clear();
|
||||
|
||||
if (sv_ != nullptr && sv_->Unref()) {
|
||||
DBImpl::DeletionState deletion_state;
|
||||
db_->mutex_.Lock();
|
||||
sv_->Cleanup();
|
||||
db_->FindObsoleteFiles(deletion_state, false, true);
|
||||
db_->mutex_.Unlock();
|
||||
delete sv_;
|
||||
if (deletion_state.HaveSomethingToDelete()) {
|
||||
db_->PurgeObsoleteFiles(deletion_state);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
bool ForwardIterator::Valid() const {
|
||||
return valid_;
|
||||
}
|
||||
|
||||
void ForwardIterator::SeekToFirst() {
|
||||
if (sv_ == nullptr ||
|
||||
sv_ ->version_number != cfd_->GetSuperVersionNumber()) {
|
||||
RebuildIterators();
|
||||
}
|
||||
SeekInternal(Slice(), true);
|
||||
}
|
||||
|
||||
void ForwardIterator::Seek(const Slice& internal_key) {
|
||||
if (sv_ == nullptr ||
|
||||
sv_ ->version_number != cfd_->GetSuperVersionNumber()) {
|
||||
RebuildIterators();
|
||||
}
|
||||
SeekInternal(internal_key, false);
|
||||
}
|
||||
|
||||
void ForwardIterator::SeekInternal(const Slice& internal_key,
|
||||
bool seek_to_first) {
|
||||
// mutable
|
||||
seek_to_first ? mutable_iter_->SeekToFirst() :
|
||||
mutable_iter_->Seek(internal_key);
|
||||
|
||||
// immutable
|
||||
// TODO(ljin): NeedToSeekImmutable has negative impact on performance
|
||||
// if it turns to need to seek immutable often. We probably want to have
|
||||
// an option to turn it off.
|
||||
if (seek_to_first || NeedToSeekImmutable(internal_key)) {
|
||||
{
|
||||
auto tmp = MinIterHeap(MinIterComparator(&cfd_->internal_comparator()));
|
||||
immutable_min_heap_.swap(tmp);
|
||||
}
|
||||
for (auto* m : imm_iters_) {
|
||||
seek_to_first ? m->SeekToFirst() : m->Seek(internal_key);
|
||||
if (m->Valid()) {
|
||||
immutable_min_heap_.push(m);
|
||||
}
|
||||
}
|
||||
|
||||
auto* files = sv_->current->files_;
|
||||
for (uint32_t i = 0; i < files[0].size(); ++i) {
|
||||
if (seek_to_first) {
|
||||
l0_iters_[i]->SeekToFirst();
|
||||
} else {
|
||||
// If the target key passes over the larget key, we are sure Next()
|
||||
// won't go over this file.
|
||||
if (user_comparator_->Compare(ExtractUserKey(internal_key),
|
||||
files[0][i]->largest.user_key()) > 0) {
|
||||
continue;
|
||||
}
|
||||
l0_iters_[i]->Seek(internal_key);
|
||||
}
|
||||
if (l0_iters_[i]->Valid()) {
|
||||
immutable_min_heap_.push(l0_iters_[i]);
|
||||
}
|
||||
}
|
||||
for (int32_t level = 1; level < sv_->current->NumberLevels(); ++level) {
|
||||
if (files[level].empty()) {
|
||||
continue;
|
||||
}
|
||||
assert(level_iters_[level - 1] != nullptr);
|
||||
uint32_t f_idx = 0;
|
||||
if (!seek_to_first) {
|
||||
f_idx = FindFileInRange(
|
||||
files[level], internal_key, 0, files[level].size());
|
||||
}
|
||||
if (f_idx < files[level].size()) {
|
||||
level_iters_[level - 1]->SetFileIndex(f_idx);
|
||||
seek_to_first ? level_iters_[level - 1]->SeekToFirst() :
|
||||
level_iters_[level - 1]->Seek(internal_key);
|
||||
if (level_iters_[level - 1]->Valid()) {
|
||||
immutable_min_heap_.push(level_iters_[level - 1]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (seek_to_first || immutable_min_heap_.empty()) {
|
||||
is_prev_set_ = false;
|
||||
} else {
|
||||
prev_key_.SetKey(internal_key);
|
||||
is_prev_set_ = true;
|
||||
}
|
||||
}
|
||||
|
||||
UpdateCurrent();
|
||||
}
|
||||
|
||||
void ForwardIterator::Next() {
|
||||
assert(valid_);
|
||||
|
||||
if (sv_ == nullptr ||
|
||||
sv_ ->version_number != cfd_->GetSuperVersionNumber()) {
|
||||
std::string current_key = key().ToString();
|
||||
Slice old_key(current_key.data(), current_key.size());
|
||||
|
||||
RebuildIterators();
|
||||
SeekInternal(old_key, false);
|
||||
if (!valid_ || key().compare(old_key) != 0) {
|
||||
return;
|
||||
}
|
||||
} else if (current_ != mutable_iter_) {
|
||||
// It is going to advance immutable iterator
|
||||
prev_key_.SetKey(current_->key());
|
||||
is_prev_set_ = true;
|
||||
}
|
||||
|
||||
current_->Next();
|
||||
if (current_->Valid() && current_ != mutable_iter_) {
|
||||
immutable_min_heap_.push(current_);
|
||||
}
|
||||
UpdateCurrent();
|
||||
}
|
||||
|
||||
Slice ForwardIterator::key() const {
|
||||
assert(valid_);
|
||||
return current_->key();
|
||||
}
|
||||
|
||||
Slice ForwardIterator::value() const {
|
||||
assert(valid_);
|
||||
return current_->value();
|
||||
}
|
||||
|
||||
Status ForwardIterator::status() const {
|
||||
if (!status_.ok()) {
|
||||
return status_;
|
||||
} else if (!mutable_iter_->status().ok()) {
|
||||
return mutable_iter_->status();
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
void ForwardIterator::RebuildIterators() {
|
||||
// Clean up
|
||||
Cleanup();
|
||||
// New
|
||||
sv_ = cfd_->GetReferencedSuperVersion(&(db_->mutex_));
|
||||
mutable_iter_ = sv_->mem->NewIterator(read_options_);
|
||||
sv_->imm->AddIterators(read_options_, &imm_iters_);
|
||||
const auto& l0_files = sv_->current->files_[0];
|
||||
l0_iters_.reserve(l0_files.size());
|
||||
for (const auto* l0 : l0_files) {
|
||||
l0_iters_.push_back(cfd_->table_cache()->NewIterator(
|
||||
read_options_, *cfd_->soptions(), cfd_->internal_comparator(), *l0));
|
||||
}
|
||||
level_iters_.reserve(sv_->current->NumberLevels() - 1);
|
||||
for (int32_t level = 1; level < sv_->current->NumberLevels(); ++level) {
|
||||
if (sv_->current->files_[level].empty()) {
|
||||
level_iters_.push_back(nullptr);
|
||||
} else {
|
||||
level_iters_.push_back(new LevelIterator(cfd_, read_options_,
|
||||
sv_->current->files_[level]));
|
||||
}
|
||||
}
|
||||
|
||||
current_ = nullptr;
|
||||
is_prev_set_ = false;
|
||||
}
|
||||
|
||||
void ForwardIterator::UpdateCurrent() {
|
||||
if (immutable_min_heap_.empty() && !mutable_iter_->Valid()) {
|
||||
current_ = nullptr;
|
||||
} else if (immutable_min_heap_.empty()) {
|
||||
current_ = mutable_iter_;
|
||||
} else if (!mutable_iter_->Valid()) {
|
||||
current_ = immutable_min_heap_.top();
|
||||
immutable_min_heap_.pop();
|
||||
} else {
|
||||
current_ = immutable_min_heap_.top();
|
||||
assert(current_ != nullptr);
|
||||
assert(current_->Valid());
|
||||
int cmp = cfd_->internal_comparator().InternalKeyComparator::Compare(
|
||||
mutable_iter_->key(), current_->key()) > 0;
|
||||
assert(cmp != 0);
|
||||
if (cmp > 0) {
|
||||
immutable_min_heap_.pop();
|
||||
} else {
|
||||
current_ = mutable_iter_;
|
||||
}
|
||||
}
|
||||
valid_ = (current_ != nullptr);
|
||||
if (!status_.ok()) {
|
||||
status_ = Status::OK();
|
||||
}
|
||||
}
|
||||
|
||||
bool ForwardIterator::NeedToSeekImmutable(const Slice& target) {
|
||||
if (!is_prev_set_) {
|
||||
return true;
|
||||
}
|
||||
Slice prev_key = prev_key_.GetKey();
|
||||
if (prefix_extractor_ && prefix_extractor_->Transform(target).compare(
|
||||
prefix_extractor_->Transform(prev_key)) != 0) {
|
||||
return true;
|
||||
}
|
||||
if (cfd_->internal_comparator().InternalKeyComparator::Compare(
|
||||
prev_key, target) >= 0) {
|
||||
return true;
|
||||
}
|
||||
if (immutable_min_heap_.empty() ||
|
||||
cfd_->internal_comparator().InternalKeyComparator::Compare(
|
||||
target, current_ == mutable_iter_ ? immutable_min_heap_.top()->key()
|
||||
: current_->key()) > 0) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
uint32_t ForwardIterator::FindFileInRange(
|
||||
const std::vector<FileMetaData*>& files, const Slice& internal_key,
|
||||
uint32_t left, uint32_t right) {
|
||||
while (left < right) {
|
||||
uint32_t mid = (left + right) / 2;
|
||||
const FileMetaData* f = files[mid];
|
||||
if (cfd_->internal_comparator().InternalKeyComparator::Compare(
|
||||
f->largest.Encode(), internal_key) < 0) {
|
||||
// Key at "mid.largest" is < "target". Therefore all
|
||||
// files at or before "mid" are uninteresting.
|
||||
left = mid + 1;
|
||||
} else {
|
||||
// Key at "mid.largest" is >= "target". Therefore all files
|
||||
// after "mid" are uninteresting.
|
||||
right = mid;
|
||||
}
|
||||
}
|
||||
return right;
|
||||
}
|
||||
|
||||
} // namespace rocksdb
|
||||
|
||||
#endif // ROCKSDB_LITE
|
||||
105
db/forward_iterator.h
Normal file
105
db/forward_iterator.h
Normal file
@@ -0,0 +1,105 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
#pragma once
|
||||
|
||||
#ifndef ROCKSDB_LITE
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <queue>
|
||||
|
||||
#include "rocksdb/db.h"
|
||||
#include "rocksdb/iterator.h"
|
||||
#include "rocksdb/options.h"
|
||||
#include "db/dbformat.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
class DBImpl;
|
||||
class Env;
|
||||
struct SuperVersion;
|
||||
class ColumnFamilyData;
|
||||
class LevelIterator;
|
||||
struct FileMetaData;
|
||||
|
||||
class MinIterComparator {
|
||||
public:
|
||||
explicit MinIterComparator(const Comparator* comparator) :
|
||||
comparator_(comparator) {}
|
||||
|
||||
bool operator()(Iterator* a, Iterator* b) {
|
||||
return comparator_->Compare(a->key(), b->key()) > 0;
|
||||
}
|
||||
private:
|
||||
const Comparator* comparator_;
|
||||
};
|
||||
|
||||
typedef std::priority_queue<Iterator*,
|
||||
std::vector<Iterator*>,
|
||||
MinIterComparator> MinIterHeap;
|
||||
|
||||
/**
|
||||
* ForwardIterator is a special type of iterator that only supports Seek()
|
||||
* and Next(). It is expected to perform better than TailingIterator by
|
||||
* removing the encapsulation and making all information accessible within
|
||||
* the iterator. At the current implementation, snapshot is taken at the
|
||||
* time Seek() is called. The Next() followed do not see new values after.
|
||||
*/
|
||||
class ForwardIterator : public Iterator {
|
||||
public:
|
||||
ForwardIterator(DBImpl* db, const ReadOptions& read_options,
|
||||
ColumnFamilyData* cfd);
|
||||
virtual ~ForwardIterator();
|
||||
|
||||
void SeekToLast() override {
|
||||
status_ = Status::NotSupported("ForwardIterator::SeekToLast()");
|
||||
valid_ = false;
|
||||
}
|
||||
void Prev() {
|
||||
status_ = Status::NotSupported("ForwardIterator::Prev");
|
||||
valid_ = false;
|
||||
}
|
||||
|
||||
virtual bool Valid() const override;
|
||||
void SeekToFirst() override;
|
||||
virtual void Seek(const Slice& target) override;
|
||||
virtual void Next() override;
|
||||
virtual Slice key() const override;
|
||||
virtual Slice value() const override;
|
||||
virtual Status status() const override;
|
||||
|
||||
private:
|
||||
void Cleanup();
|
||||
void RebuildIterators();
|
||||
void SeekInternal(const Slice& internal_key, bool seek_to_first);
|
||||
void UpdateCurrent();
|
||||
bool NeedToSeekImmutable(const Slice& internal_key);
|
||||
uint32_t FindFileInRange(
|
||||
const std::vector<FileMetaData*>& files, const Slice& internal_key,
|
||||
uint32_t left, uint32_t right);
|
||||
|
||||
DBImpl* const db_;
|
||||
const ReadOptions read_options_;
|
||||
ColumnFamilyData* const cfd_;
|
||||
const SliceTransform* const prefix_extractor_;
|
||||
const Comparator* user_comparator_;
|
||||
MinIterHeap immutable_min_heap_;
|
||||
|
||||
SuperVersion* sv_;
|
||||
Iterator* mutable_iter_;
|
||||
std::vector<Iterator*> imm_iters_;
|
||||
std::vector<Iterator*> l0_iters_;
|
||||
std::vector<LevelIterator*> level_iters_;
|
||||
Iterator* current_;
|
||||
// internal iterator status
|
||||
Status status_;
|
||||
bool valid_;
|
||||
|
||||
IterKey prev_key_;
|
||||
bool is_prev_set_;
|
||||
};
|
||||
|
||||
} // namespace rocksdb
|
||||
#endif // ROCKSDB_LITE
|
||||
369
db/internal_stats.cc
Normal file
369
db/internal_stats.cc
Normal file
@@ -0,0 +1,369 @@
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include "db/internal_stats.h"
|
||||
#include "db/column_family.h"
|
||||
|
||||
#include <vector>
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
// Map a "rocksdb."-prefixed property name onto its DBPropertyType;
// unrecognized names (or names without the prefix) yield kUnknown.
DBPropertyType GetPropertyType(const Slice& property) {
  Slice in = property;
  Slice prefix("rocksdb.");
  if (!in.starts_with(prefix)) {
    return kUnknown;
  }
  in.remove_prefix(prefix.size());

  if (in.starts_with("num-files-at-level")) {
    return kNumFilesAtLevel;
  }
  if (in == "levelstats") {
    return kLevelStats;
  }
  if (in == "stats") {
    return kStats;
  }
  if (in == "sstables") {
    return kSsTables;
  }
  if (in == "num-immutable-mem-table") {
    return kNumImmutableMemTable;
  }
  if (in == "mem-table-flush-pending") {
    return kMemtableFlushPending;
  }
  if (in == "compaction-pending") {
    return kCompactionPending;
  }
  if (in == "background-errors") {
    return kBackgroundErrors;
  }
  if (in == "cur-size-active-mem-table") {
    return kCurSizeActiveMemTable;
  }
  if (in == "num-entries-active-mem-table") {
    return kNumEntriesInMutableMemtable;
  }
  if (in == "num-entries-imm-mem-tables") {
    return kNumEntriesInImmutableMemtable;
  }
  return kUnknown;
}
|
||||
|
||||
bool InternalStats::GetProperty(DBPropertyType property_type,
|
||||
const Slice& property, std::string* value,
|
||||
ColumnFamilyData* cfd) {
|
||||
Version* current = cfd->current();
|
||||
Slice in = property;
|
||||
|
||||
switch (property_type) {
|
||||
case kNumFilesAtLevel: {
|
||||
in.remove_prefix(strlen("rocksdb.num-files-at-level"));
|
||||
uint64_t level;
|
||||
bool ok = ConsumeDecimalNumber(&in, &level) && in.empty();
|
||||
if (!ok || (int)level >= number_levels_) {
|
||||
return false;
|
||||
} else {
|
||||
char buf[100];
|
||||
snprintf(buf, sizeof(buf), "%d",
|
||||
current->NumLevelFiles(static_cast<int>(level)));
|
||||
*value = buf;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
case kLevelStats: {
|
||||
char buf[1000];
|
||||
snprintf(buf, sizeof(buf),
|
||||
"Level Files Size(MB)\n"
|
||||
"--------------------\n");
|
||||
value->append(buf);
|
||||
|
||||
for (int level = 0; level < number_levels_; level++) {
|
||||
snprintf(buf, sizeof(buf), "%3d %8d %8.0f\n", level,
|
||||
current->NumLevelFiles(level),
|
||||
current->NumLevelBytes(level) / 1048576.0);
|
||||
value->append(buf);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
case kStats: {
|
||||
char buf[1000];
|
||||
|
||||
uint64_t wal_bytes = 0;
|
||||
uint64_t wal_synced = 0;
|
||||
uint64_t user_bytes_written = 0;
|
||||
uint64_t write_other = 0;
|
||||
uint64_t write_self = 0;
|
||||
uint64_t write_with_wal = 0;
|
||||
uint64_t total_bytes_written = 0;
|
||||
uint64_t total_bytes_read = 0;
|
||||
uint64_t micros_up = env_->NowMicros() - started_at_;
|
||||
// Add "+1" to make sure seconds_up is > 0 and avoid NaN later
|
||||
double seconds_up = (micros_up + 1) / 1000000.0;
|
||||
uint64_t total_slowdown = 0;
|
||||
uint64_t total_slowdown_count = 0;
|
||||
uint64_t interval_bytes_written = 0;
|
||||
uint64_t interval_bytes_read = 0;
|
||||
uint64_t interval_bytes_new = 0;
|
||||
double interval_seconds_up = 0;
|
||||
|
||||
if (statistics_) {
|
||||
wal_bytes = statistics_->getTickerCount(WAL_FILE_BYTES);
|
||||
wal_synced = statistics_->getTickerCount(WAL_FILE_SYNCED);
|
||||
user_bytes_written = statistics_->getTickerCount(BYTES_WRITTEN);
|
||||
write_other = statistics_->getTickerCount(WRITE_DONE_BY_OTHER);
|
||||
write_self = statistics_->getTickerCount(WRITE_DONE_BY_SELF);
|
||||
write_with_wal = statistics_->getTickerCount(WRITE_WITH_WAL);
|
||||
}
|
||||
|
||||
snprintf(
|
||||
buf, sizeof(buf),
|
||||
" Compactions\n"
|
||||
"Level Files Size(MB) Score Time(sec) Read(MB) Write(MB) Rn(MB) "
|
||||
" "
|
||||
"Rnp1(MB) Wnew(MB) RW-Amplify Read(MB/s) Write(MB/s) Rn "
|
||||
"Rnp1 "
|
||||
" Wnp1 NewW Count msComp msStall Ln-stall Stall-cnt\n"
|
||||
"--------------------------------------------------------------------"
|
||||
"--"
|
||||
"--------------------------------------------------------------------"
|
||||
"--"
|
||||
"----------------------------------------------------------------\n");
|
||||
value->append(buf);
|
||||
for (int level = 0; level < number_levels_; level++) {
|
||||
int files = current->NumLevelFiles(level);
|
||||
if (compaction_stats_[level].micros > 0 || files > 0) {
|
||||
int64_t bytes_read = compaction_stats_[level].bytes_readn +
|
||||
compaction_stats_[level].bytes_readnp1;
|
||||
int64_t bytes_new = compaction_stats_[level].bytes_written -
|
||||
compaction_stats_[level].bytes_readnp1;
|
||||
double amplify =
|
||||
(compaction_stats_[level].bytes_readn == 0)
|
||||
? 0.0
|
||||
: (compaction_stats_[level].bytes_written +
|
||||
compaction_stats_[level].bytes_readnp1 +
|
||||
compaction_stats_[level].bytes_readn) /
|
||||
(double)compaction_stats_[level].bytes_readn;
|
||||
|
||||
total_bytes_read += bytes_read;
|
||||
total_bytes_written += compaction_stats_[level].bytes_written;
|
||||
|
||||
uint64_t stalls = level == 0 ? (stall_counts_[LEVEL0_SLOWDOWN] +
|
||||
stall_counts_[LEVEL0_NUM_FILES] +
|
||||
stall_counts_[MEMTABLE_COMPACTION])
|
||||
: stall_leveln_slowdown_count_[level];
|
||||
|
||||
double stall_us = level == 0 ? (stall_micros_[LEVEL0_SLOWDOWN] +
|
||||
stall_micros_[LEVEL0_NUM_FILES] +
|
||||
stall_micros_[MEMTABLE_COMPACTION])
|
||||
: stall_leveln_slowdown_[level];
|
||||
|
||||
snprintf(buf, sizeof(buf),
|
||||
"%3d %8d %8.0f %5.1f %9.0f %9.0f %9.0f %9.0f %9.0f %9.0f "
|
||||
"%10.1f %9.1f %11.1f %8d %8d %8d %8d %8d %8d %9.1f %9.1f "
|
||||
"%9lu\n",
|
||||
level, files, current->NumLevelBytes(level) / 1048576.0,
|
||||
current->NumLevelBytes(level) /
|
||||
cfd->compaction_picker()->MaxBytesForLevel(level),
|
||||
compaction_stats_[level].micros / 1e6,
|
||||
bytes_read / 1048576.0,
|
||||
compaction_stats_[level].bytes_written / 1048576.0,
|
||||
compaction_stats_[level].bytes_readn / 1048576.0,
|
||||
compaction_stats_[level].bytes_readnp1 / 1048576.0,
|
||||
bytes_new / 1048576.0, amplify,
|
||||
// +1 to avoid division by 0
|
||||
(bytes_read / 1048576.0) /
|
||||
((compaction_stats_[level].micros + 1) / 1000000.0),
|
||||
(compaction_stats_[level].bytes_written / 1048576.0) /
|
||||
((compaction_stats_[level].micros + 1) / 1000000.0),
|
||||
compaction_stats_[level].files_in_leveln,
|
||||
compaction_stats_[level].files_in_levelnp1,
|
||||
compaction_stats_[level].files_out_levelnp1,
|
||||
compaction_stats_[level].files_out_levelnp1 -
|
||||
compaction_stats_[level].files_in_levelnp1,
|
||||
compaction_stats_[level].count,
|
||||
(int)((double)compaction_stats_[level].micros / 1000.0 /
|
||||
(compaction_stats_[level].count + 1)),
|
||||
(double)stall_us / 1000.0 / (stalls + 1),
|
||||
stall_us / 1000000.0, (unsigned long)stalls);
|
||||
total_slowdown += stall_leveln_slowdown_[level];
|
||||
total_slowdown_count += stall_leveln_slowdown_count_[level];
|
||||
value->append(buf);
|
||||
}
|
||||
}
|
||||
|
||||
interval_bytes_new = user_bytes_written - last_stats_.ingest_bytes_;
|
||||
interval_bytes_read =
|
||||
total_bytes_read - last_stats_.compaction_bytes_read_;
|
||||
interval_bytes_written =
|
||||
total_bytes_written - last_stats_.compaction_bytes_written_;
|
||||
interval_seconds_up = seconds_up - last_stats_.seconds_up_;
|
||||
|
||||
snprintf(buf, sizeof(buf), "Uptime(secs): %.1f total, %.1f interval\n",
|
||||
seconds_up, interval_seconds_up);
|
||||
value->append(buf);
|
||||
|
||||
snprintf(buf, sizeof(buf),
|
||||
"Writes cumulative: %llu total, %llu batches, "
|
||||
"%.1f per batch, %.2f ingest GB\n",
|
||||
(unsigned long long)(write_other + write_self),
|
||||
(unsigned long long)write_self,
|
||||
(write_other + write_self) / (double)(write_self + 1),
|
||||
user_bytes_written / (1048576.0 * 1024));
|
||||
value->append(buf);
|
||||
|
||||
snprintf(buf, sizeof(buf),
|
||||
"WAL cumulative: %llu WAL writes, %llu WAL syncs, "
|
||||
"%.2f writes per sync, %.2f GB written\n",
|
||||
(unsigned long long)write_with_wal,
|
||||
(unsigned long long)wal_synced,
|
||||
write_with_wal / (double)(wal_synced + 1),
|
||||
wal_bytes / (1048576.0 * 1024));
|
||||
value->append(buf);
|
||||
|
||||
snprintf(buf, sizeof(buf),
|
||||
"Compaction IO cumulative (GB): "
|
||||
"%.2f new, %.2f read, %.2f write, %.2f read+write\n",
|
||||
user_bytes_written / (1048576.0 * 1024),
|
||||
total_bytes_read / (1048576.0 * 1024),
|
||||
total_bytes_written / (1048576.0 * 1024),
|
||||
(total_bytes_read + total_bytes_written) / (1048576.0 * 1024));
|
||||
value->append(buf);
|
||||
|
||||
snprintf(
|
||||
buf, sizeof(buf),
|
||||
"Compaction IO cumulative (MB/sec): "
|
||||
"%.1f new, %.1f read, %.1f write, %.1f read+write\n",
|
||||
user_bytes_written / 1048576.0 / seconds_up,
|
||||
total_bytes_read / 1048576.0 / seconds_up,
|
||||
total_bytes_written / 1048576.0 / seconds_up,
|
||||
(total_bytes_read + total_bytes_written) / 1048576.0 / seconds_up);
|
||||
value->append(buf);
|
||||
|
||||
// +1 to avoid divide by 0 and NaN
|
||||
snprintf(
|
||||
buf, sizeof(buf),
|
||||
"Amplification cumulative: %.1f write, %.1f compaction\n",
|
||||
(double)(total_bytes_written + wal_bytes) / (user_bytes_written + 1),
|
||||
(double)(total_bytes_written + total_bytes_read + wal_bytes) /
|
||||
(user_bytes_written + 1));
|
||||
value->append(buf);
|
||||
|
||||
uint64_t interval_write_other = write_other - last_stats_.write_other_;
|
||||
uint64_t interval_write_self = write_self - last_stats_.write_self_;
|
||||
|
||||
snprintf(buf, sizeof(buf),
|
||||
"Writes interval: %llu total, %llu batches, "
|
||||
"%.1f per batch, %.1f ingest MB\n",
|
||||
(unsigned long long)(interval_write_other + interval_write_self),
|
||||
(unsigned long long)interval_write_self,
|
||||
(double)(interval_write_other + interval_write_self) /
|
||||
(interval_write_self + 1),
|
||||
(user_bytes_written - last_stats_.ingest_bytes_) / 1048576.0);
|
||||
value->append(buf);
|
||||
|
||||
uint64_t interval_write_with_wal =
|
||||
write_with_wal - last_stats_.write_with_wal_;
|
||||
|
||||
uint64_t interval_wal_synced = wal_synced - last_stats_.wal_synced_;
|
||||
uint64_t interval_wal_bytes = wal_bytes - last_stats_.wal_bytes_;
|
||||
|
||||
snprintf(buf, sizeof(buf),
|
||||
"WAL interval: %llu WAL writes, %llu WAL syncs, "
|
||||
"%.2f writes per sync, %.2f MB written\n",
|
||||
(unsigned long long)interval_write_with_wal,
|
||||
(unsigned long long)interval_wal_synced,
|
||||
interval_write_with_wal / (double)(interval_wal_synced + 1),
|
||||
interval_wal_bytes / (1048576.0 * 1024));
|
||||
value->append(buf);
|
||||
|
||||
snprintf(buf, sizeof(buf),
|
||||
"Compaction IO interval (MB): "
|
||||
"%.2f new, %.2f read, %.2f write, %.2f read+write\n",
|
||||
interval_bytes_new / 1048576.0, interval_bytes_read / 1048576.0,
|
||||
interval_bytes_written / 1048576.0,
|
||||
(interval_bytes_read + interval_bytes_written) / 1048576.0);
|
||||
value->append(buf);
|
||||
|
||||
snprintf(buf, sizeof(buf),
|
||||
"Compaction IO interval (MB/sec): "
|
||||
"%.1f new, %.1f read, %.1f write, %.1f read+write\n",
|
||||
interval_bytes_new / 1048576.0 / interval_seconds_up,
|
||||
interval_bytes_read / 1048576.0 / interval_seconds_up,
|
||||
interval_bytes_written / 1048576.0 / interval_seconds_up,
|
||||
(interval_bytes_read + interval_bytes_written) / 1048576.0 /
|
||||
interval_seconds_up);
|
||||
value->append(buf);
|
||||
|
||||
// +1 to avoid divide by 0 and NaN
|
||||
snprintf(
|
||||
buf, sizeof(buf),
|
||||
"Amplification interval: %.1f write, %.1f compaction\n",
|
||||
(double)(interval_bytes_written + wal_bytes) /
|
||||
(interval_bytes_new + 1),
|
||||
(double)(interval_bytes_written + interval_bytes_read + wal_bytes) /
|
||||
(interval_bytes_new + 1));
|
||||
value->append(buf);
|
||||
|
||||
snprintf(buf, sizeof(buf),
|
||||
"Stalls(secs): %.3f level0_slowdown, %.3f level0_numfiles, "
|
||||
"%.3f memtable_compaction, %.3f leveln_slowdown\n",
|
||||
stall_micros_[LEVEL0_SLOWDOWN] / 1000000.0,
|
||||
stall_micros_[LEVEL0_NUM_FILES] / 1000000.0,
|
||||
stall_micros_[MEMTABLE_COMPACTION] / 1000000.0,
|
||||
total_slowdown / 1000000.0);
|
||||
value->append(buf);
|
||||
|
||||
snprintf(buf, sizeof(buf),
|
||||
"Stalls(count): %lu level0_slowdown, %lu level0_numfiles, "
|
||||
"%lu memtable_compaction, %lu leveln_slowdown\n",
|
||||
(unsigned long)stall_counts_[LEVEL0_SLOWDOWN],
|
||||
(unsigned long)stall_counts_[LEVEL0_NUM_FILES],
|
||||
(unsigned long)stall_counts_[MEMTABLE_COMPACTION],
|
||||
(unsigned long)total_slowdown_count);
|
||||
value->append(buf);
|
||||
|
||||
last_stats_.compaction_bytes_read_ = total_bytes_read;
|
||||
last_stats_.compaction_bytes_written_ = total_bytes_written;
|
||||
last_stats_.ingest_bytes_ = user_bytes_written;
|
||||
last_stats_.seconds_up_ = seconds_up;
|
||||
last_stats_.wal_bytes_ = wal_bytes;
|
||||
last_stats_.wal_synced_ = wal_synced;
|
||||
last_stats_.write_with_wal_ = write_with_wal;
|
||||
last_stats_.write_other_ = write_other;
|
||||
last_stats_.write_self_ = write_self;
|
||||
|
||||
return true;
|
||||
}
|
||||
case kSsTables:
|
||||
*value = current->DebugString();
|
||||
return true;
|
||||
case kNumImmutableMemTable:
|
||||
*value = std::to_string(cfd->imm()->size());
|
||||
return true;
|
||||
case kMemtableFlushPending:
|
||||
// Return number of mem tables that are ready to flush (made immutable)
|
||||
*value = std::to_string(cfd->imm()->IsFlushPending() ? 1 : 0);
|
||||
return true;
|
||||
case kCompactionPending:
|
||||
// 1 if the system already determines at least one compacdtion is needed.
|
||||
// 0 otherwise,
|
||||
*value = std::to_string(current->NeedsCompaction() ? 1 : 0);
|
||||
return true;
|
||||
case kBackgroundErrors:
|
||||
// Accumulated number of errors in background flushes or compactions.
|
||||
*value = std::to_string(GetBackgroundErrorCount());
|
||||
return true;
|
||||
case kCurSizeActiveMemTable:
|
||||
// Current size of the active memtable
|
||||
*value = std::to_string(cfd->mem()->ApproximateMemoryUsage());
|
||||
return true;
|
||||
case kNumEntriesInMutableMemtable:
|
||||
// Current size of the active memtable
|
||||
*value = std::to_string(cfd->mem()->GetNumEntries());
|
||||
return true;
|
||||
case kNumEntriesInImmutableMemtable:
|
||||
// Current size of the active memtable
|
||||
*value = std::to_string(cfd->imm()->current()->GetTotalNumEntries());
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace rocksdb
|
||||
187
db/internal_stats.h
Normal file
187
db/internal_stats.h
Normal file
@@ -0,0 +1,187 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
//
|
||||
|
||||
#pragma once
|
||||
#include "rocksdb/statistics.h"
|
||||
#include "util/statistics.h"
|
||||
#include "db/version_set.h"
|
||||
|
||||
#include <vector>
|
||||
#include <string>
|
||||
|
||||
class ColumnFamilyData;
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
class MemTableList;
|
||||
class DBImpl;
|
||||
|
||||
enum DBPropertyType {
|
||||
kNumFilesAtLevel, // Number of files at a specific level
|
||||
kLevelStats, // Return number of files and total sizes of each level
|
||||
kStats, // Return general statitistics of DB
|
||||
kSsTables, // Return a human readable string of current SST files
|
||||
kNumImmutableMemTable, // Return number of immutable mem tables
|
||||
kMemtableFlushPending, // Return 1 if mem table flushing is pending,
|
||||
// otherwise 0.
|
||||
kCompactionPending, // Return 1 if a compaction is pending. Otherwise 0.
|
||||
kBackgroundErrors, // Return accumulated background errors encountered.
|
||||
kCurSizeActiveMemTable, // Return current size of the active memtable
|
||||
kNumEntriesInMutableMemtable, // Return number of entries in the mutable
|
||||
// memtable.
|
||||
kNumEntriesInImmutableMemtable, // Return sum of number of entries in all
|
||||
// the immutable mem tables.
|
||||
kUnknown,
|
||||
};
|
||||
|
||||
extern DBPropertyType GetPropertyType(const Slice& property);
|
||||
|
||||
class InternalStats {
|
||||
public:
|
||||
enum WriteStallType {
|
||||
LEVEL0_SLOWDOWN,
|
||||
MEMTABLE_COMPACTION,
|
||||
LEVEL0_NUM_FILES,
|
||||
WRITE_STALLS_ENUM_MAX,
|
||||
};
|
||||
|
||||
InternalStats(int num_levels, Env* env, Statistics* statistics)
|
||||
: compaction_stats_(num_levels),
|
||||
stall_micros_(WRITE_STALLS_ENUM_MAX, 0),
|
||||
stall_counts_(WRITE_STALLS_ENUM_MAX, 0),
|
||||
stall_leveln_slowdown_(num_levels, 0),
|
||||
stall_leveln_slowdown_count_(num_levels, 0),
|
||||
bg_error_count_(0),
|
||||
number_levels_(num_levels),
|
||||
statistics_(statistics),
|
||||
env_(env),
|
||||
started_at_(env->NowMicros()) {}
|
||||
|
||||
// Per level compaction stats. compaction_stats_[level] stores the stats for
|
||||
// compactions that produced data for the specified "level".
|
||||
struct CompactionStats {
|
||||
uint64_t micros;
|
||||
|
||||
// Bytes read from level N during compaction between levels N and N+1
|
||||
int64_t bytes_readn;
|
||||
|
||||
// Bytes read from level N+1 during compaction between levels N and N+1
|
||||
int64_t bytes_readnp1;
|
||||
|
||||
// Total bytes written during compaction between levels N and N+1
|
||||
int64_t bytes_written;
|
||||
|
||||
// Files read from level N during compaction between levels N and N+1
|
||||
int files_in_leveln;
|
||||
|
||||
// Files read from level N+1 during compaction between levels N and N+1
|
||||
int files_in_levelnp1;
|
||||
|
||||
// Files written during compaction between levels N and N+1
|
||||
int files_out_levelnp1;
|
||||
|
||||
// Number of compactions done
|
||||
int count;
|
||||
|
||||
CompactionStats()
|
||||
: micros(0),
|
||||
bytes_readn(0),
|
||||
bytes_readnp1(0),
|
||||
bytes_written(0),
|
||||
files_in_leveln(0),
|
||||
files_in_levelnp1(0),
|
||||
files_out_levelnp1(0),
|
||||
count(0) {}
|
||||
|
||||
void Add(const CompactionStats& c) {
|
||||
this->micros += c.micros;
|
||||
this->bytes_readn += c.bytes_readn;
|
||||
this->bytes_readnp1 += c.bytes_readnp1;
|
||||
this->bytes_written += c.bytes_written;
|
||||
this->files_in_leveln += c.files_in_leveln;
|
||||
this->files_in_levelnp1 += c.files_in_levelnp1;
|
||||
this->files_out_levelnp1 += c.files_out_levelnp1;
|
||||
this->count += 1;
|
||||
}
|
||||
};
|
||||
|
||||
void AddCompactionStats(int level, const CompactionStats& stats) {
|
||||
compaction_stats_[level].Add(stats);
|
||||
}
|
||||
|
||||
void RecordWriteStall(WriteStallType write_stall_type, uint64_t micros) {
|
||||
stall_micros_[write_stall_type] += micros;
|
||||
stall_counts_[write_stall_type]++;
|
||||
}
|
||||
|
||||
void RecordLevelNSlowdown(int level, uint64_t micros) {
|
||||
stall_leveln_slowdown_[level] += micros;
|
||||
stall_leveln_slowdown_count_[level] += micros;
|
||||
}
|
||||
|
||||
uint64_t GetBackgroundErrorCount() const { return bg_error_count_; }
|
||||
|
||||
uint64_t BumpAndGetBackgroundErrorCount() { return ++bg_error_count_; }
|
||||
|
||||
bool GetProperty(DBPropertyType property_type, const Slice& property,
|
||||
std::string* value, ColumnFamilyData* cfd);
|
||||
|
||||
private:
|
||||
std::vector<CompactionStats> compaction_stats_;
|
||||
|
||||
// Used to compute per-interval statistics
|
||||
struct StatsSnapshot {
|
||||
uint64_t compaction_bytes_read_; // Bytes read by compaction
|
||||
uint64_t compaction_bytes_written_; // Bytes written by compaction
|
||||
uint64_t ingest_bytes_; // Bytes written by user
|
||||
uint64_t wal_bytes_; // Bytes written to WAL
|
||||
uint64_t wal_synced_; // Number of times WAL is synced
|
||||
uint64_t write_with_wal_; // Number of writes that request WAL
|
||||
// These count the number of writes processed by the calling thread or
|
||||
// another thread.
|
||||
uint64_t write_other_;
|
||||
uint64_t write_self_;
|
||||
double seconds_up_;
|
||||
|
||||
StatsSnapshot()
|
||||
: compaction_bytes_read_(0),
|
||||
compaction_bytes_written_(0),
|
||||
ingest_bytes_(0),
|
||||
wal_bytes_(0),
|
||||
wal_synced_(0),
|
||||
write_with_wal_(0),
|
||||
write_other_(0),
|
||||
write_self_(0),
|
||||
seconds_up_(0) {}
|
||||
};
|
||||
|
||||
// Counters from the previous time per-interval stats were computed
|
||||
StatsSnapshot last_stats_;
|
||||
|
||||
// These count the number of microseconds for which MakeRoomForWrite stalls.
|
||||
std::vector<uint64_t> stall_micros_;
|
||||
std::vector<uint64_t> stall_counts_;
|
||||
std::vector<uint64_t> stall_leveln_slowdown_;
|
||||
std::vector<uint64_t> stall_leveln_slowdown_count_;
|
||||
|
||||
// Total number of background errors encountered. Every time a flush task
|
||||
// or compaction task fails, this counter is incremented. The failure can
|
||||
// be caused by any possible reason, including file system errors, out of
|
||||
// resources, or input file corruption. Failing when retrying the same flush
|
||||
// or compaction will cause the counter to increase too.
|
||||
uint64_t bg_error_count_;
|
||||
|
||||
int number_levels_;
|
||||
Statistics* statistics_;
|
||||
Env* env_;
|
||||
uint64_t started_at_;
|
||||
};
|
||||
|
||||
} // namespace rocksdb
|
||||
79
db/log_and_apply_bench.cc
Normal file
79
db/log_and_apply_bench.cc
Normal file
@@ -0,0 +1,79 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "util/testharness.h"
|
||||
#include "util/benchharness.h"
|
||||
#include "db/version_set.h"
|
||||
#include "util/mutexlock.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
std::string MakeKey(unsigned int num) {
|
||||
char buf[30];
|
||||
snprintf(buf, sizeof(buf), "%016u", num);
|
||||
return std::string(buf);
|
||||
}
|
||||
|
||||
void BM_LogAndApply(int iters, int num_base_files) {
|
||||
VersionSet* vset;
|
||||
ColumnFamilyData* default_cfd;
|
||||
uint64_t fnum = 1;
|
||||
port::Mutex mu;
|
||||
MutexLock l(&mu);
|
||||
|
||||
BENCHMARK_SUSPEND {
|
||||
std::string dbname = test::TmpDir() + "/rocksdb_test_benchmark";
|
||||
ASSERT_OK(DestroyDB(dbname, Options()));
|
||||
|
||||
DB* db = nullptr;
|
||||
Options opts;
|
||||
opts.create_if_missing = true;
|
||||
Status s = DB::Open(opts, dbname, &db);
|
||||
ASSERT_OK(s);
|
||||
ASSERT_TRUE(db != nullptr);
|
||||
|
||||
delete db;
|
||||
db = nullptr;
|
||||
|
||||
Options options;
|
||||
EnvOptions sopt;
|
||||
vset = new VersionSet(dbname, &options, sopt, nullptr);
|
||||
std::vector<ColumnFamilyDescriptor> dummy;
|
||||
dummy.push_back(ColumnFamilyDescriptor());
|
||||
ASSERT_OK(vset->Recover(dummy));
|
||||
default_cfd = vset->GetColumnFamilySet()->GetDefault();
|
||||
VersionEdit vbase;
|
||||
for (int i = 0; i < num_base_files; i++) {
|
||||
InternalKey start(MakeKey(2 * fnum), 1, kTypeValue);
|
||||
InternalKey limit(MakeKey(2 * fnum + 1), 1, kTypeDeletion);
|
||||
vbase.AddFile(2, ++fnum, 1 /* file size */, start, limit, 1, 1);
|
||||
}
|
||||
ASSERT_OK(vset->LogAndApply(default_cfd, &vbase, &mu));
|
||||
}
|
||||
|
||||
for (int i = 0; i < iters; i++) {
|
||||
VersionEdit vedit;
|
||||
vedit.DeleteFile(2, fnum);
|
||||
InternalKey start(MakeKey(2 * fnum), 1, kTypeValue);
|
||||
InternalKey limit(MakeKey(2 * fnum + 1), 1, kTypeDeletion);
|
||||
vedit.AddFile(2, ++fnum, 1 /* file size */, start, limit, 1, 1);
|
||||
vset->LogAndApply(default_cfd, &vedit, &mu);
|
||||
}
|
||||
}
|
||||
|
||||
BENCHMARK_NAMED_PARAM(BM_LogAndApply, 1000_iters_1_file, 1000, 1)
|
||||
BENCHMARK_NAMED_PARAM(BM_LogAndApply, 1000_iters_100_files, 1000, 100)
|
||||
BENCHMARK_NAMED_PARAM(BM_LogAndApply, 1000_iters_10000_files, 1000, 10000)
|
||||
BENCHMARK_NAMED_PARAM(BM_LogAndApply, 100_iters_100000_files, 100, 100000)
|
||||
|
||||
} // namespace rocksdb
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
rocksdb::benchmark::RunBenchmarks();
|
||||
return 0;
|
||||
}
|
||||
35
db/log_format.h
Normal file
35
db/log_format.h
Normal file
@@ -0,0 +1,35 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
//
|
||||
// Log format information shared by reader and writer.
|
||||
// See ../doc/log_format.txt for more detail.
|
||||
|
||||
#pragma once
|
||||
namespace rocksdb {
|
||||
namespace log {
|
||||
|
||||
enum RecordType {
|
||||
// Zero is reserved for preallocated files
|
||||
kZeroType = 0,
|
||||
kFullType = 1,
|
||||
|
||||
// For fragments
|
||||
kFirstType = 2,
|
||||
kMiddleType = 3,
|
||||
kLastType = 4
|
||||
};
|
||||
static const int kMaxRecordType = kLastType;
|
||||
|
||||
static const unsigned int kBlockSize = 32768;
|
||||
|
||||
// Header is checksum (4 bytes), type (1 byte), length (2 bytes).
|
||||
static const int kHeaderSize = 4 + 1 + 2;
|
||||
|
||||
} // namespace log
|
||||
} // namespace rocksdb
|
||||
339
db/log_reader.cc
Normal file
339
db/log_reader.cc
Normal file
@@ -0,0 +1,339 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include "db/log_reader.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include "rocksdb/env.h"
|
||||
#include "util/coding.h"
|
||||
#include "util/crc32c.h"
|
||||
|
||||
namespace rocksdb {
|
||||
namespace log {
|
||||
|
||||
Reader::Reporter::~Reporter() {
|
||||
}
|
||||
|
||||
Reader::Reader(unique_ptr<SequentialFile>&& file, Reporter* reporter,
|
||||
bool checksum, uint64_t initial_offset)
|
||||
: file_(std::move(file)),
|
||||
reporter_(reporter),
|
||||
checksum_(checksum),
|
||||
backing_store_(new char[kBlockSize]),
|
||||
buffer_(),
|
||||
eof_(false),
|
||||
read_error_(false),
|
||||
eof_offset_(0),
|
||||
last_record_offset_(0),
|
||||
end_of_buffer_offset_(0),
|
||||
initial_offset_(initial_offset) {
|
||||
}
|
||||
|
||||
Reader::~Reader() {
|
||||
delete[] backing_store_;
|
||||
}
|
||||
|
||||
bool Reader::SkipToInitialBlock() {
|
||||
size_t offset_in_block = initial_offset_ % kBlockSize;
|
||||
uint64_t block_start_location = initial_offset_ - offset_in_block;
|
||||
|
||||
// Don't search a block if we'd be in the trailer
|
||||
if (offset_in_block > kBlockSize - 6) {
|
||||
offset_in_block = 0;
|
||||
block_start_location += kBlockSize;
|
||||
}
|
||||
|
||||
end_of_buffer_offset_ = block_start_location;
|
||||
|
||||
// Skip to start of first block that can contain the initial record
|
||||
if (block_start_location > 0) {
|
||||
Status skip_status = file_->Skip(block_start_location);
|
||||
if (!skip_status.ok()) {
|
||||
ReportDrop(block_start_location, skip_status);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool Reader::ReadRecord(Slice* record, std::string* scratch) {
|
||||
if (last_record_offset_ < initial_offset_) {
|
||||
if (!SkipToInitialBlock()) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
scratch->clear();
|
||||
record->clear();
|
||||
bool in_fragmented_record = false;
|
||||
// Record offset of the logical record that we're reading
|
||||
// 0 is a dummy value to make compilers happy
|
||||
uint64_t prospective_record_offset = 0;
|
||||
|
||||
Slice fragment;
|
||||
while (true) {
|
||||
uint64_t physical_record_offset = end_of_buffer_offset_ - buffer_.size();
|
||||
const unsigned int record_type = ReadPhysicalRecord(&fragment);
|
||||
switch (record_type) {
|
||||
case kFullType:
|
||||
if (in_fragmented_record) {
|
||||
// Handle bug in earlier versions of log::Writer where
|
||||
// it could emit an empty kFirstType record at the tail end
|
||||
// of a block followed by a kFullType or kFirstType record
|
||||
// at the beginning of the next block.
|
||||
if (scratch->empty()) {
|
||||
in_fragmented_record = false;
|
||||
} else {
|
||||
ReportCorruption(scratch->size(), "partial record without end(1)");
|
||||
}
|
||||
}
|
||||
prospective_record_offset = physical_record_offset;
|
||||
scratch->clear();
|
||||
*record = fragment;
|
||||
last_record_offset_ = prospective_record_offset;
|
||||
return true;
|
||||
|
||||
case kFirstType:
|
||||
if (in_fragmented_record) {
|
||||
// Handle bug in earlier versions of log::Writer where
|
||||
// it could emit an empty kFirstType record at the tail end
|
||||
// of a block followed by a kFullType or kFirstType record
|
||||
// at the beginning of the next block.
|
||||
if (scratch->empty()) {
|
||||
in_fragmented_record = false;
|
||||
} else {
|
||||
ReportCorruption(scratch->size(), "partial record without end(2)");
|
||||
}
|
||||
}
|
||||
prospective_record_offset = physical_record_offset;
|
||||
scratch->assign(fragment.data(), fragment.size());
|
||||
in_fragmented_record = true;
|
||||
break;
|
||||
|
||||
case kMiddleType:
|
||||
if (!in_fragmented_record) {
|
||||
ReportCorruption(fragment.size(),
|
||||
"missing start of fragmented record(1)");
|
||||
} else {
|
||||
scratch->append(fragment.data(), fragment.size());
|
||||
}
|
||||
break;
|
||||
|
||||
case kLastType:
|
||||
if (!in_fragmented_record) {
|
||||
ReportCorruption(fragment.size(),
|
||||
"missing start of fragmented record(2)");
|
||||
} else {
|
||||
scratch->append(fragment.data(), fragment.size());
|
||||
*record = Slice(*scratch);
|
||||
last_record_offset_ = prospective_record_offset;
|
||||
return true;
|
||||
}
|
||||
break;
|
||||
|
||||
case kEof:
|
||||
if (in_fragmented_record) {
|
||||
// This can be caused by the writer dying immediately after
|
||||
// writing a physical record but before completing the next; don't
|
||||
// treat it as a corruption, just ignore the entire logical record.
|
||||
scratch->clear();
|
||||
}
|
||||
return false;
|
||||
|
||||
case kBadRecord:
|
||||
if (in_fragmented_record) {
|
||||
ReportCorruption(scratch->size(), "error in middle of record");
|
||||
in_fragmented_record = false;
|
||||
scratch->clear();
|
||||
}
|
||||
break;
|
||||
|
||||
default: {
|
||||
char buf[40];
|
||||
snprintf(buf, sizeof(buf), "unknown record type %u", record_type);
|
||||
ReportCorruption(
|
||||
(fragment.size() + (in_fragmented_record ? scratch->size() : 0)),
|
||||
buf);
|
||||
in_fragmented_record = false;
|
||||
scratch->clear();
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
uint64_t Reader::LastRecordOffset() {
|
||||
return last_record_offset_;
|
||||
}
|
||||
|
||||
void Reader::UnmarkEOF() {
|
||||
if (read_error_) {
|
||||
return;
|
||||
}
|
||||
|
||||
eof_ = false;
|
||||
|
||||
if (eof_offset_ == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
// If the EOF was in the middle of a block (a partial block was read) we have
|
||||
// to read the rest of the block as ReadPhysicalRecord can only read full
|
||||
// blocks and expects the file position indicator to be aligned to the start
|
||||
// of a block.
|
||||
//
|
||||
// consumed_bytes + buffer_size() + remaining == kBlockSize
|
||||
|
||||
size_t consumed_bytes = eof_offset_ - buffer_.size();
|
||||
size_t remaining = kBlockSize - eof_offset_;
|
||||
|
||||
// backing_store_ is used to concatenate what is left in buffer_ and
|
||||
// the remainder of the block. If buffer_ already uses backing_store_,
|
||||
// we just append the new data.
|
||||
if (buffer_.data() != backing_store_ + consumed_bytes) {
|
||||
// Buffer_ does not use backing_store_ for storage.
|
||||
// Copy what is left in buffer_ to backing_store.
|
||||
memmove(backing_store_ + consumed_bytes, buffer_.data(), buffer_.size());
|
||||
}
|
||||
|
||||
Slice read_buffer;
|
||||
Status status = file_->Read(remaining, &read_buffer,
|
||||
backing_store_ + eof_offset_);
|
||||
|
||||
size_t added = read_buffer.size();
|
||||
end_of_buffer_offset_ += added;
|
||||
|
||||
if (!status.ok()) {
|
||||
if (added > 0) {
|
||||
ReportDrop(added, status);
|
||||
}
|
||||
|
||||
read_error_ = true;
|
||||
return;
|
||||
}
|
||||
|
||||
if (read_buffer.data() != backing_store_ + eof_offset_) {
|
||||
// Read did not write to backing_store_
|
||||
memmove(backing_store_ + eof_offset_, read_buffer.data(),
|
||||
read_buffer.size());
|
||||
}
|
||||
|
||||
buffer_ = Slice(backing_store_ + consumed_bytes,
|
||||
eof_offset_ + added - consumed_bytes);
|
||||
|
||||
if (added < remaining) {
|
||||
eof_ = true;
|
||||
eof_offset_ += added;
|
||||
} else {
|
||||
eof_offset_ = 0;
|
||||
}
|
||||
}
|
||||
|
||||
void Reader::ReportCorruption(size_t bytes, const char* reason) {
|
||||
ReportDrop(bytes, Status::Corruption(reason));
|
||||
}
|
||||
|
||||
void Reader::ReportDrop(size_t bytes, const Status& reason) {
|
||||
if (reporter_ != nullptr &&
|
||||
end_of_buffer_offset_ - buffer_.size() - bytes >= initial_offset_) {
|
||||
reporter_->Corruption(bytes, reason);
|
||||
}
|
||||
}
|
||||
|
||||
unsigned int Reader::ReadPhysicalRecord(Slice* result) {
|
||||
while (true) {
|
||||
if (buffer_.size() < (size_t)kHeaderSize) {
|
||||
if (!eof_ && !read_error_) {
|
||||
// Last read was a full read, so this is a trailer to skip
|
||||
buffer_.clear();
|
||||
Status status = file_->Read(kBlockSize, &buffer_, backing_store_);
|
||||
end_of_buffer_offset_ += buffer_.size();
|
||||
if (!status.ok()) {
|
||||
buffer_.clear();
|
||||
ReportDrop(kBlockSize, status);
|
||||
read_error_ = true;
|
||||
return kEof;
|
||||
} else if (buffer_.size() < (size_t)kBlockSize) {
|
||||
eof_ = true;
|
||||
eof_offset_ = buffer_.size();
|
||||
}
|
||||
continue;
|
||||
} else {
|
||||
// Note that if buffer_ is non-empty, we have a truncated header at the
|
||||
// end of the file, which can be caused by the writer crashing in the
|
||||
// middle of writing the header. Instead of considering this an error,
|
||||
// just report EOF.
|
||||
buffer_.clear();
|
||||
return kEof;
|
||||
}
|
||||
}
|
||||
|
||||
// Parse the header
|
||||
const char* header = buffer_.data();
|
||||
const uint32_t a = static_cast<uint32_t>(header[4]) & 0xff;
|
||||
const uint32_t b = static_cast<uint32_t>(header[5]) & 0xff;
|
||||
const unsigned int type = header[6];
|
||||
const uint32_t length = a | (b << 8);
|
||||
if (kHeaderSize + length > buffer_.size()) {
|
||||
size_t drop_size = buffer_.size();
|
||||
buffer_.clear();
|
||||
if (!eof_) {
|
||||
ReportCorruption(drop_size, "bad record length");
|
||||
return kBadRecord;
|
||||
}
|
||||
// If the end of the file has been reached without reading |length| bytes
|
||||
// of payload, assume the writer died in the middle of writing the record.
|
||||
// Don't report a corruption.
|
||||
return kEof;
|
||||
}
|
||||
|
||||
if (type == kZeroType && length == 0) {
|
||||
// Skip zero length record without reporting any drops since
|
||||
// such records are produced by the mmap based writing code in
|
||||
// env_posix.cc that preallocates file regions.
|
||||
// NOTE: this should never happen in DB written by new RocksDB versions,
|
||||
// since we turn off mmap writes to manifest and log files
|
||||
buffer_.clear();
|
||||
return kBadRecord;
|
||||
}
|
||||
|
||||
// Check crc
|
||||
if (checksum_) {
|
||||
uint32_t expected_crc = crc32c::Unmask(DecodeFixed32(header));
|
||||
uint32_t actual_crc = crc32c::Value(header + 6, 1 + length);
|
||||
if (actual_crc != expected_crc) {
|
||||
// Drop the rest of the buffer since "length" itself may have
|
||||
// been corrupted and if we trust it, we could find some
|
||||
// fragment of a real log record that just happens to look
|
||||
// like a valid log record.
|
||||
size_t drop_size = buffer_.size();
|
||||
buffer_.clear();
|
||||
ReportCorruption(drop_size, "checksum mismatch");
|
||||
return kBadRecord;
|
||||
}
|
||||
}
|
||||
|
||||
buffer_.remove_prefix(kHeaderSize + length);
|
||||
|
||||
// Skip physical record that started before initial_offset_
|
||||
if (end_of_buffer_offset_ - buffer_.size() - kHeaderSize - length <
|
||||
initial_offset_) {
|
||||
result->clear();
|
||||
return kBadRecord;
|
||||
}
|
||||
|
||||
*result = Slice(header + kHeaderSize, length);
|
||||
return type;
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace log
|
||||
} // namespace rocksdb
|
||||
130
db/log_reader.h
Normal file
130
db/log_reader.h
Normal file
@@ -0,0 +1,130 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#pragma once
|
||||
#include <memory>
|
||||
#include <stdint.h>
|
||||
|
||||
#include "db/log_format.h"
|
||||
#include "rocksdb/slice.h"
|
||||
#include "rocksdb/status.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
class SequentialFile;
|
||||
using std::unique_ptr;
|
||||
|
||||
namespace log {
|
||||
|
||||
class Reader {
|
||||
public:
|
||||
// Interface for reporting errors.
|
||||
class Reporter {
|
||||
public:
|
||||
virtual ~Reporter();
|
||||
|
||||
// Some corruption was detected. "size" is the approximate number
|
||||
// of bytes dropped due to the corruption.
|
||||
virtual void Corruption(size_t bytes, const Status& status) = 0;
|
||||
};
|
||||
|
||||
// Create a reader that will return log records from "*file".
|
||||
// "*file" must remain live while this Reader is in use.
|
||||
//
|
||||
// If "reporter" is non-nullptr, it is notified whenever some data is
|
||||
// dropped due to a detected corruption. "*reporter" must remain
|
||||
// live while this Reader is in use.
|
||||
//
|
||||
// If "checksum" is true, verify checksums if available.
|
||||
//
|
||||
// The Reader will start reading at the first record located at physical
|
||||
// position >= initial_offset within the file.
|
||||
Reader(unique_ptr<SequentialFile>&& file, Reporter* reporter,
|
||||
bool checksum, uint64_t initial_offset);
|
||||
|
||||
~Reader();
|
||||
|
||||
// Read the next record into *record. Returns true if read
|
||||
// successfully, false if we hit end of the input. May use
|
||||
// "*scratch" as temporary storage. The contents filled in *record
|
||||
// will only be valid until the next mutating operation on this
|
||||
// reader or the next mutation to *scratch.
|
||||
bool ReadRecord(Slice* record, std::string* scratch);
|
||||
|
||||
// Returns the physical offset of the last record returned by ReadRecord.
|
||||
//
|
||||
// Undefined before the first call to ReadRecord.
|
||||
uint64_t LastRecordOffset();
|
||||
|
||||
// returns true if the reader has encountered an eof condition.
|
||||
bool IsEOF() {
|
||||
return eof_;
|
||||
}
|
||||
|
||||
// when we know more data has been written to the file. we can use this
|
||||
// function to force the reader to look again in the file.
|
||||
// Also aligns the file position indicator to the start of the next block
|
||||
// by reading the rest of the data from the EOF position to the end of the
|
||||
// block that was partially read.
|
||||
void UnmarkEOF();
|
||||
|
||||
SequentialFile* file() { return file_.get(); }
|
||||
|
||||
private:
|
||||
const unique_ptr<SequentialFile> file_;
|
||||
Reporter* const reporter_;
|
||||
bool const checksum_;
|
||||
char* const backing_store_;
|
||||
Slice buffer_;
|
||||
bool eof_; // Last Read() indicated EOF by returning < kBlockSize
|
||||
bool read_error_; // Error occurred while reading from file
|
||||
|
||||
// Offset of the file position indicator within the last block when an
|
||||
// EOF was detected.
|
||||
size_t eof_offset_;
|
||||
|
||||
// Offset of the last record returned by ReadRecord.
|
||||
uint64_t last_record_offset_;
|
||||
// Offset of the first location past the end of buffer_.
|
||||
uint64_t end_of_buffer_offset_;
|
||||
|
||||
// Offset at which to start looking for the first record to return
|
||||
uint64_t const initial_offset_;
|
||||
|
||||
// Extend record types with the following special values
|
||||
enum {
|
||||
kEof = kMaxRecordType + 1,
|
||||
// Returned whenever we find an invalid physical record.
|
||||
// Currently there are three situations in which this happens:
|
||||
// * The record has an invalid CRC (ReadPhysicalRecord reports a drop)
|
||||
// * The record is a 0-length record (No drop is reported)
|
||||
// * The record is below constructor's initial_offset (No drop is reported)
|
||||
kBadRecord = kMaxRecordType + 2
|
||||
};
|
||||
|
||||
// Skips all blocks that are completely before "initial_offset_".
|
||||
//
|
||||
// Returns true on success. Handles reporting.
|
||||
bool SkipToInitialBlock();
|
||||
|
||||
// Return type, or one of the preceding special values
|
||||
unsigned int ReadPhysicalRecord(Slice* result);
|
||||
|
||||
// Reports dropped bytes to the reporter.
|
||||
// buffer_ must be updated to remove the dropped bytes prior to invocation.
|
||||
void ReportCorruption(size_t bytes, const char* reason);
|
||||
void ReportDrop(size_t bytes, const Status& reason);
|
||||
|
||||
// No copying allowed
|
||||
Reader(const Reader&);
|
||||
void operator=(const Reader&);
|
||||
};
|
||||
|
||||
} // namespace log
|
||||
} // namespace rocksdb
|
||||
689
db/log_test.cc
Normal file
689
db/log_test.cc
Normal file
@@ -0,0 +1,689 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include "db/log_reader.h"
|
||||
#include "db/log_writer.h"
|
||||
#include "rocksdb/env.h"
|
||||
#include "util/coding.h"
|
||||
#include "util/crc32c.h"
|
||||
#include "util/random.h"
|
||||
#include "util/testharness.h"
|
||||
|
||||
namespace rocksdb {
|
||||
namespace log {
|
||||
|
||||
// Construct a string of the specified length made out of the supplied
|
||||
// partial string.
|
||||
// Build a string of exactly `n` bytes by repeating `partial_string` and
// truncating the final (possibly partial) repetition.
// Precondition: `partial_string` is non-empty when n > 0 (otherwise the
// fill loop cannot terminate, same as the original formulation).
static std::string BigString(const std::string& partial_string, size_t n) {
  std::string result;
  for (size_t filled = 0; filled < n; filled += partial_string.size()) {
    result += partial_string;
  }
  result.resize(n);  // trim the overshoot from the last repetition
  return result;
}
|
||||
|
||||
// Construct a string from a number
|
||||
// Render `n` in decimal followed by a trailing '.' (e.g. 42 -> "42.").
static std::string NumberString(int n) {
  return std::to_string(n) + ".";
}
|
||||
|
||||
// Return a skewed potentially long string
|
||||
static std::string RandomSkewedString(int i, Random* rnd) {
|
||||
return BigString(NumberString(i), rnd->Skewed(17));
|
||||
}
|
||||
|
||||
class LogTest {
|
||||
private:
|
||||
class StringDest : public WritableFile {
|
||||
public:
|
||||
std::string contents_;
|
||||
|
||||
explicit StringDest(Slice& reader_contents) :
|
||||
WritableFile(),
|
||||
contents_(""),
|
||||
reader_contents_(reader_contents),
|
||||
last_flush_(0) {
|
||||
reader_contents_ = Slice(contents_.data(), 0);
|
||||
};
|
||||
|
||||
virtual Status Close() { return Status::OK(); }
|
||||
virtual Status Flush() {
|
||||
ASSERT_TRUE(reader_contents_.size() <= last_flush_);
|
||||
size_t offset = last_flush_ - reader_contents_.size();
|
||||
reader_contents_ = Slice(
|
||||
contents_.data() + offset,
|
||||
contents_.size() - offset);
|
||||
last_flush_ = contents_.size();
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
virtual Status Sync() { return Status::OK(); }
|
||||
virtual Status Append(const Slice& slice) {
|
||||
contents_.append(slice.data(), slice.size());
|
||||
return Status::OK();
|
||||
}
|
||||
void Drop(size_t bytes) {
|
||||
contents_.resize(contents_.size() - bytes);
|
||||
reader_contents_ = Slice(
|
||||
reader_contents_.data(), reader_contents_.size() - bytes);
|
||||
last_flush_ = contents_.size();
|
||||
}
|
||||
|
||||
private:
|
||||
Slice& reader_contents_;
|
||||
size_t last_flush_;
|
||||
};
|
||||
|
||||
class StringSource : public SequentialFile {
|
||||
public:
|
||||
Slice& contents_;
|
||||
bool force_error_;
|
||||
size_t force_error_position_;
|
||||
bool force_eof_;
|
||||
size_t force_eof_position_;
|
||||
bool returned_partial_;
|
||||
explicit StringSource(Slice& contents) :
|
||||
contents_(contents),
|
||||
force_error_(false),
|
||||
force_error_position_(0),
|
||||
force_eof_(false),
|
||||
force_eof_position_(0),
|
||||
returned_partial_(false) { }
|
||||
|
||||
virtual Status Read(size_t n, Slice* result, char* scratch) {
|
||||
ASSERT_TRUE(!returned_partial_) << "must not Read() after eof/error";
|
||||
|
||||
if (force_error_) {
|
||||
if (force_error_position_ >= n) {
|
||||
force_error_position_ -= n;
|
||||
} else {
|
||||
*result = Slice(contents_.data(), force_error_position_);
|
||||
contents_.remove_prefix(force_error_position_);
|
||||
force_error_ = false;
|
||||
returned_partial_ = true;
|
||||
return Status::Corruption("read error");
|
||||
}
|
||||
}
|
||||
|
||||
if (contents_.size() < n) {
|
||||
n = contents_.size();
|
||||
returned_partial_ = true;
|
||||
}
|
||||
|
||||
if (force_eof_) {
|
||||
if (force_eof_position_ >= n) {
|
||||
force_eof_position_ -= n;
|
||||
} else {
|
||||
force_eof_ = false;
|
||||
n = force_eof_position_;
|
||||
returned_partial_ = true;
|
||||
}
|
||||
}
|
||||
|
||||
// By using scratch we ensure that caller has control over the
|
||||
// lifetime of result.data()
|
||||
memcpy(scratch, contents_.data(), n);
|
||||
*result = Slice(scratch, n);
|
||||
|
||||
contents_.remove_prefix(n);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
virtual Status Skip(uint64_t n) {
|
||||
if (n > contents_.size()) {
|
||||
contents_.clear();
|
||||
return Status::NotFound("in-memory file skipepd past end");
|
||||
}
|
||||
|
||||
contents_.remove_prefix(n);
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
};
|
||||
|
||||
class ReportCollector : public Reader::Reporter {
|
||||
public:
|
||||
size_t dropped_bytes_;
|
||||
std::string message_;
|
||||
|
||||
ReportCollector() : dropped_bytes_(0) { }
|
||||
virtual void Corruption(size_t bytes, const Status& status) {
|
||||
dropped_bytes_ += bytes;
|
||||
message_.append(status.ToString());
|
||||
}
|
||||
};
|
||||
|
||||
std::string& dest_contents() {
|
||||
auto dest = dynamic_cast<StringDest*>(writer_.file());
|
||||
assert(dest);
|
||||
return dest->contents_;
|
||||
}
|
||||
|
||||
const std::string& dest_contents() const {
|
||||
auto dest = dynamic_cast<const StringDest*>(writer_.file());
|
||||
assert(dest);
|
||||
return dest->contents_;
|
||||
}
|
||||
|
||||
void reset_source_contents() {
|
||||
auto src = dynamic_cast<StringSource*>(reader_.file());
|
||||
assert(src);
|
||||
src->contents_ = dest_contents();
|
||||
}
|
||||
|
||||
Slice reader_contents_;
|
||||
unique_ptr<StringDest> dest_holder_;
|
||||
unique_ptr<StringSource> source_holder_;
|
||||
ReportCollector report_;
|
||||
Writer writer_;
|
||||
Reader reader_;
|
||||
|
||||
// Record metadata for testing initial offset functionality
|
||||
static size_t initial_offset_record_sizes_[];
|
||||
static uint64_t initial_offset_last_record_offsets_[];
|
||||
|
||||
public:
|
||||
LogTest() : reader_contents_(),
|
||||
dest_holder_(new StringDest(reader_contents_)),
|
||||
source_holder_(new StringSource(reader_contents_)),
|
||||
writer_(std::move(dest_holder_)),
|
||||
reader_(std::move(source_holder_), &report_, true/*checksum*/,
|
||||
0/*initial_offset*/) {
|
||||
}
|
||||
|
||||
void Write(const std::string& msg) {
|
||||
writer_.AddRecord(Slice(msg));
|
||||
}
|
||||
|
||||
size_t WrittenBytes() const {
|
||||
return dest_contents().size();
|
||||
}
|
||||
|
||||
std::string Read() {
|
||||
std::string scratch;
|
||||
Slice record;
|
||||
if (reader_.ReadRecord(&record, &scratch)) {
|
||||
return record.ToString();
|
||||
} else {
|
||||
return "EOF";
|
||||
}
|
||||
}
|
||||
|
||||
void IncrementByte(int offset, int delta) {
|
||||
dest_contents()[offset] += delta;
|
||||
}
|
||||
|
||||
void SetByte(int offset, char new_byte) {
|
||||
dest_contents()[offset] = new_byte;
|
||||
}
|
||||
|
||||
void ShrinkSize(int bytes) {
|
||||
auto dest = dynamic_cast<StringDest*>(writer_.file());
|
||||
assert(dest);
|
||||
dest->Drop(bytes);
|
||||
}
|
||||
|
||||
void FixChecksum(int header_offset, int len) {
|
||||
// Compute crc of type/len/data
|
||||
uint32_t crc = crc32c::Value(&dest_contents()[header_offset+6], 1 + len);
|
||||
crc = crc32c::Mask(crc);
|
||||
EncodeFixed32(&dest_contents()[header_offset], crc);
|
||||
}
|
||||
|
||||
void ForceError(size_t position = 0) {
|
||||
auto src = dynamic_cast<StringSource*>(reader_.file());
|
||||
src->force_error_ = true;
|
||||
src->force_error_position_ = position;
|
||||
}
|
||||
|
||||
size_t DroppedBytes() const {
|
||||
return report_.dropped_bytes_;
|
||||
}
|
||||
|
||||
std::string ReportMessage() const {
|
||||
return report_.message_;
|
||||
}
|
||||
|
||||
void ForceEOF(size_t position = 0) {
|
||||
auto src = dynamic_cast<StringSource*>(reader_.file());
|
||||
src->force_eof_ = true;
|
||||
src->force_eof_position_ = position;
|
||||
}
|
||||
|
||||
void UnmarkEOF() {
|
||||
auto src = dynamic_cast<StringSource*>(reader_.file());
|
||||
src->returned_partial_ = false;
|
||||
reader_.UnmarkEOF();
|
||||
}
|
||||
|
||||
bool IsEOF() {
|
||||
return reader_.IsEOF();
|
||||
}
|
||||
|
||||
// Returns OK iff recorded error message contains "msg"
|
||||
std::string MatchError(const std::string& msg) const {
|
||||
if (report_.message_.find(msg) == std::string::npos) {
|
||||
return report_.message_;
|
||||
} else {
|
||||
return "OK";
|
||||
}
|
||||
}
|
||||
|
||||
void WriteInitialOffsetLog() {
|
||||
for (int i = 0; i < 4; i++) {
|
||||
std::string record(initial_offset_record_sizes_[i],
|
||||
static_cast<char>('a' + i));
|
||||
Write(record);
|
||||
}
|
||||
}
|
||||
|
||||
void CheckOffsetPastEndReturnsNoRecords(uint64_t offset_past_end) {
|
||||
WriteInitialOffsetLog();
|
||||
unique_ptr<StringSource> source(new StringSource(reader_contents_));
|
||||
unique_ptr<Reader> offset_reader(
|
||||
new Reader(std::move(source), &report_, true/*checksum*/,
|
||||
WrittenBytes() + offset_past_end));
|
||||
Slice record;
|
||||
std::string scratch;
|
||||
ASSERT_TRUE(!offset_reader->ReadRecord(&record, &scratch));
|
||||
}
|
||||
|
||||
void CheckInitialOffsetRecord(uint64_t initial_offset,
|
||||
int expected_record_offset) {
|
||||
WriteInitialOffsetLog();
|
||||
unique_ptr<StringSource> source(new StringSource(reader_contents_));
|
||||
unique_ptr<Reader> offset_reader(
|
||||
new Reader(std::move(source), &report_, true/*checksum*/,
|
||||
initial_offset));
|
||||
Slice record;
|
||||
std::string scratch;
|
||||
ASSERT_TRUE(offset_reader->ReadRecord(&record, &scratch));
|
||||
ASSERT_EQ(initial_offset_record_sizes_[expected_record_offset],
|
||||
record.size());
|
||||
ASSERT_EQ(initial_offset_last_record_offsets_[expected_record_offset],
|
||||
offset_reader->LastRecordOffset());
|
||||
ASSERT_EQ((char)('a' + expected_record_offset), record.data()[0]);
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
// Sizes of the four records written by WriteInitialOffsetLog().
size_t LogTest::initial_offset_record_sizes_[] =
    {10000,  // Two sizable records in first block
     10000,
     2 * log::kBlockSize - 1000,  // Span three blocks
     1};
|
||||
|
||||
// Expected LastRecordOffset() for each record above, accounting for the
// per-fragment headers accumulated as earlier records are written.
uint64_t LogTest::initial_offset_last_record_offsets_[] =
    {0,
     kHeaderSize + 10000,
     2 * (kHeaderSize + 10000),
     2 * (kHeaderSize + 10000) +
         (2 * log::kBlockSize - 1000) + 3 * kHeaderSize};
|
||||
|
||||
|
||||
// ---- Round-trip tests: records written come back verbatim. ----

TEST(LogTest, Empty) {
  ASSERT_EQ("EOF", Read());
}

TEST(LogTest, ReadWrite) {
  Write("foo");
  Write("bar");
  Write("");
  Write("xxxx");
  ASSERT_EQ("foo", Read());
  ASSERT_EQ("bar", Read());
  ASSERT_EQ("", Read());
  ASSERT_EQ("xxxx", Read());
  ASSERT_EQ("EOF", Read());
  ASSERT_EQ("EOF", Read());  // Make sure reads at eof work
}

TEST(LogTest, ManyBlocks) {
  for (int i = 0; i < 100000; i++) {
    Write(NumberString(i));
  }
  for (int i = 0; i < 100000; i++) {
    ASSERT_EQ(NumberString(i), Read());
  }
  ASSERT_EQ("EOF", Read());
}

TEST(LogTest, Fragmentation) {
  Write("small");
  Write(BigString("medium", 50000));
  Write(BigString("large", 100000));
  ASSERT_EQ("small", Read());
  ASSERT_EQ(BigString("medium", 50000), Read());
  ASSERT_EQ(BigString("large", 100000), Read());
  ASSERT_EQ("EOF", Read());
}

// ---- Block-boundary edge cases. ----

TEST(LogTest, MarginalTrailer) {
  // Make a trailer that is exactly the same length as an empty record.
  const int n = kBlockSize - 2*kHeaderSize;
  Write(BigString("foo", n));
  ASSERT_EQ((unsigned int)(kBlockSize - kHeaderSize), WrittenBytes());
  Write("");
  Write("bar");
  ASSERT_EQ(BigString("foo", n), Read());
  ASSERT_EQ("", Read());
  ASSERT_EQ("bar", Read());
  ASSERT_EQ("EOF", Read());
}

TEST(LogTest, MarginalTrailer2) {
  // Make a trailer that is exactly the same length as an empty record.
  const int n = kBlockSize - 2*kHeaderSize;
  Write(BigString("foo", n));
  ASSERT_EQ((unsigned int)(kBlockSize - kHeaderSize), WrittenBytes());
  Write("bar");
  ASSERT_EQ(BigString("foo", n), Read());
  ASSERT_EQ("bar", Read());
  ASSERT_EQ("EOF", Read());
  ASSERT_EQ(0U, DroppedBytes());
  ASSERT_EQ("", ReportMessage());
}

TEST(LogTest, ShortTrailer) {
  const int n = kBlockSize - 2*kHeaderSize + 4;
  Write(BigString("foo", n));
  ASSERT_EQ((unsigned int)(kBlockSize - kHeaderSize + 4), WrittenBytes());
  Write("");
  Write("bar");
  ASSERT_EQ(BigString("foo", n), Read());
  ASSERT_EQ("", Read());
  ASSERT_EQ("bar", Read());
  ASSERT_EQ("EOF", Read());
}

TEST(LogTest, AlignedEof) {
  const int n = kBlockSize - 2*kHeaderSize + 4;
  Write(BigString("foo", n));
  ASSERT_EQ((unsigned int)(kBlockSize - kHeaderSize + 4), WrittenBytes());
  ASSERT_EQ(BigString("foo", n), Read());
  ASSERT_EQ("EOF", Read());
}

TEST(LogTest, RandomRead) {
  const int N = 500;
  Random write_rnd(301);
  for (int i = 0; i < N; i++) {
    Write(RandomSkewedString(i, &write_rnd));
  }
  // Same seed regenerates the same expected strings on the read side.
  Random read_rnd(301);
  for (int i = 0; i < N; i++) {
    ASSERT_EQ(RandomSkewedString(i, &read_rnd), Read());
  }
  ASSERT_EQ("EOF", Read());
}

// Tests of all the error paths in log_reader.cc follow:

TEST(LogTest, ReadError) {
  Write("foo");
  ForceError();
  ASSERT_EQ("EOF", Read());
  ASSERT_EQ((unsigned int)kBlockSize, DroppedBytes());
  ASSERT_EQ("OK", MatchError("read error"));
}

TEST(LogTest, BadRecordType) {
  Write("foo");
  // Type is stored in header[6]
  IncrementByte(6, 100);
  FixChecksum(0, 3);
  ASSERT_EQ("EOF", Read());
  ASSERT_EQ(3U, DroppedBytes());
  ASSERT_EQ("OK", MatchError("unknown record type"));
}

TEST(LogTest, TruncatedTrailingRecordIsIgnored) {
  Write("foo");
  ShrinkSize(4);   // Drop all payload as well as a header byte
  ASSERT_EQ("EOF", Read());
  // Truncated last record is ignored, not treated as an error
  ASSERT_EQ(0U, DroppedBytes());
  ASSERT_EQ("", ReportMessage());
}

TEST(LogTest, BadLength) {
  const int kPayloadSize = kBlockSize - kHeaderSize;
  Write(BigString("bar", kPayloadSize));
  Write("foo");
  // Least significant size byte is stored in header[4].
  IncrementByte(4, 1);
  ASSERT_EQ("foo", Read());
  ASSERT_EQ(kBlockSize, DroppedBytes());
  ASSERT_EQ("OK", MatchError("bad record length"));
}

TEST(LogTest, BadLengthAtEndIsIgnored) {
  Write("foo");
  ShrinkSize(1);
  ASSERT_EQ("EOF", Read());
  ASSERT_EQ(0U, DroppedBytes());
  ASSERT_EQ("", ReportMessage());
}

TEST(LogTest, ChecksumMismatch) {
  Write("foo");
  IncrementByte(0, 10);
  ASSERT_EQ("EOF", Read());
  ASSERT_EQ(10U, DroppedBytes());
  ASSERT_EQ("OK", MatchError("checksum mismatch"));
}

// The next four tests forge fragment types to exercise reassembly errors.

TEST(LogTest, UnexpectedMiddleType) {
  Write("foo");
  SetByte(6, kMiddleType);
  FixChecksum(0, 3);
  ASSERT_EQ("EOF", Read());
  ASSERT_EQ(3U, DroppedBytes());
  ASSERT_EQ("OK", MatchError("missing start"));
}

TEST(LogTest, UnexpectedLastType) {
  Write("foo");
  SetByte(6, kLastType);
  FixChecksum(0, 3);
  ASSERT_EQ("EOF", Read());
  ASSERT_EQ(3U, DroppedBytes());
  ASSERT_EQ("OK", MatchError("missing start"));
}

TEST(LogTest, UnexpectedFullType) {
  Write("foo");
  Write("bar");
  SetByte(6, kFirstType);
  FixChecksum(0, 3);
  ASSERT_EQ("bar", Read());
  ASSERT_EQ("EOF", Read());
  ASSERT_EQ(3U, DroppedBytes());
  ASSERT_EQ("OK", MatchError("partial record without end"));
}

TEST(LogTest, UnexpectedFirstType) {
  Write("foo");
  Write(BigString("bar", 100000));
  SetByte(6, kFirstType);
  FixChecksum(0, 3);
  ASSERT_EQ(BigString("bar", 100000), Read());
  ASSERT_EQ("EOF", Read());
  ASSERT_EQ(3U, DroppedBytes());
  ASSERT_EQ("OK", MatchError("partial record without end"));
}

TEST(LogTest, MissingLastIsIgnored) {
  Write(BigString("bar", kBlockSize));
  // Remove the LAST block, including header.
  ShrinkSize(14);
  ASSERT_EQ("EOF", Read());
  ASSERT_EQ("", ReportMessage());
  ASSERT_EQ(0U, DroppedBytes());
}

TEST(LogTest, PartialLastIsIgnored) {
  Write(BigString("bar", kBlockSize));
  // Cause a bad record length in the LAST block.
  ShrinkSize(1);
  ASSERT_EQ("EOF", Read());
  ASSERT_EQ("", ReportMessage());
  ASSERT_EQ(0U, DroppedBytes());
}

TEST(LogTest, ErrorJoinsRecords) {
  // Consider two fragmented records:
  //    first(R1) last(R1) first(R2) last(R2)
  // where the middle two fragments disappear.  We do not want
  // first(R1),last(R2) to get joined and returned as a valid record.

  // Write records that span two blocks
  Write(BigString("foo", kBlockSize));
  Write(BigString("bar", kBlockSize));
  Write("correct");

  // Wipe the middle block
  for (unsigned int offset = kBlockSize; offset < 2*kBlockSize; offset++) {
    SetByte(offset, 'x');
  }

  ASSERT_EQ("correct", Read());
  ASSERT_EQ("EOF", Read());
  const unsigned int dropped = DroppedBytes();
  ASSERT_LE(dropped, 2*kBlockSize + 100);
  ASSERT_GE(dropped, 2*kBlockSize);
}

// ---- initial_offset positioning (see CheckInitialOffsetRecord). ----

TEST(LogTest, ReadStart) {
  CheckInitialOffsetRecord(0, 0);
}

TEST(LogTest, ReadSecondOneOff) {
  CheckInitialOffsetRecord(1, 1);
}

TEST(LogTest, ReadSecondTenThousand) {
  CheckInitialOffsetRecord(10000, 1);
}

TEST(LogTest, ReadSecondStart) {
  CheckInitialOffsetRecord(10007, 1);
}

TEST(LogTest, ReadThirdOneOff) {
  CheckInitialOffsetRecord(10008, 2);
}

TEST(LogTest, ReadThirdStart) {
  CheckInitialOffsetRecord(20014, 2);
}

TEST(LogTest, ReadFourthOneOff) {
  CheckInitialOffsetRecord(20015, 3);
}

TEST(LogTest, ReadFourthFirstBlockTrailer) {
  CheckInitialOffsetRecord(log::kBlockSize - 4, 3);
}

TEST(LogTest, ReadFourthMiddleBlock) {
  CheckInitialOffsetRecord(log::kBlockSize + 1, 3);
}

TEST(LogTest, ReadFourthLastBlock) {
  CheckInitialOffsetRecord(2 * log::kBlockSize + 1, 3);
}

TEST(LogTest, ReadFourthStart) {
  CheckInitialOffsetRecord(
      2 * (kHeaderSize + 1000) + (2 * log::kBlockSize - 1000) + 3 * kHeaderSize,
      3);
}

TEST(LogTest, ReadEnd) {
  CheckOffsetPastEndReturnsNoRecords(0);
}

TEST(LogTest, ReadPastEnd) {
  CheckOffsetPastEndReturnsNoRecords(5);
}

// ---- UnmarkEOF: resume reading after an EOF once more data arrives. ----

TEST(LogTest, ClearEofSingleBlock) {
  Write("foo");
  Write("bar");
  ForceEOF(3 + kHeaderSize + 2);
  ASSERT_EQ("foo", Read());
  UnmarkEOF();
  ASSERT_EQ("bar", Read());
  ASSERT_TRUE(IsEOF());
  ASSERT_EQ("EOF", Read());
  Write("xxx");
  UnmarkEOF();
  ASSERT_EQ("xxx", Read());
  ASSERT_TRUE(IsEOF());
}

TEST(LogTest, ClearEofMultiBlock) {
  size_t num_full_blocks = 5;
  size_t n = (kBlockSize - kHeaderSize) * num_full_blocks + 25;
  Write(BigString("foo", n));
  Write(BigString("bar", n));
  ForceEOF(n + num_full_blocks * kHeaderSize + 10);
  ASSERT_EQ(BigString("foo", n), Read());
  ASSERT_TRUE(IsEOF());
  UnmarkEOF();
  ASSERT_EQ(BigString("bar", n), Read());
  ASSERT_TRUE(IsEOF());
  Write(BigString("xxx", n));
  UnmarkEOF();
  ASSERT_EQ(BigString("xxx", n), Read());
  ASSERT_TRUE(IsEOF());
}

TEST(LogTest, ClearEofError) {
  // If an error occurs during Read() in UnmarkEOF(), the records contained
  // in the buffer should be returned on subsequent calls of ReadRecord()
  // until no more full records are left, whereafter ReadRecord() should return
  // false to indicate that it cannot read any further.

  Write("foo");
  Write("bar");
  UnmarkEOF();
  ASSERT_EQ("foo", Read());
  ASSERT_TRUE(IsEOF());
  Write("xxx");
  ForceError(0);
  UnmarkEOF();
  ASSERT_EQ("bar", Read());
  ASSERT_EQ("EOF", Read());
}

TEST(LogTest, ClearEofError2) {
  Write("foo");
  Write("bar");
  UnmarkEOF();
  ASSERT_EQ("foo", Read());
  Write("xxx");
  ForceError(3);
  UnmarkEOF();
  ASSERT_EQ("bar", Read());
  ASSERT_EQ("EOF", Read());
  ASSERT_EQ(3U, DroppedBytes());
  ASSERT_EQ("OK", MatchError("read error"));
}

}  // namespace log
}  // namespace rocksdb

int main(int argc, char** argv) {
  return rocksdb::test::RunAllTests();
}
|
||||
108
db/log_writer.cc
Normal file
108
db/log_writer.cc
Normal file
@@ -0,0 +1,108 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include "db/log_writer.h"
|
||||
|
||||
#include <stdint.h>
|
||||
#include "rocksdb/env.h"
|
||||
#include "util/coding.h"
|
||||
#include "util/crc32c.h"
|
||||
|
||||
namespace rocksdb {
|
||||
namespace log {
|
||||
|
||||
// Construct a Writer over `dest` and precompute the crc32c of every
// record-type byte, so per-record checksumming can start from a cached
// seed instead of hashing the type byte on each AddRecord call.
Writer::Writer(unique_ptr<WritableFile>&& dest)
    : dest_(std::move(dest)), block_offset_(0) {
  for (int rt = 0; rt <= kMaxRecordType; ++rt) {
    const char type_byte = static_cast<char>(rt);
    type_crc_[rt] = crc32c::Value(&type_byte, 1);
  }
}
|
||||
|
||||
// Nothing to release explicitly; dest_ closes via its own destructor.
Writer::~Writer() = default;
|
||||
|
||||
// Append `slice` as one logical record, splitting it into physical
// records (kFullType, or kFirstType/kMiddleType/kLastType) so that no
// fragment straddles a block boundary.  Returns the first non-OK status
// from the underlying writes, stopping further fragments on failure.
Status Writer::AddRecord(const Slice& slice) {
  const char* ptr = slice.data();
  size_t left = slice.size();  // bytes of the payload not yet emitted

  // Fragment the record if necessary and emit it.  Note that if slice
  // is empty, we still want to iterate once to emit a single
  // zero-length record
  Status s;
  bool begin = true;
  do {
    const int leftover = kBlockSize - block_offset_;
    assert(leftover >= 0);
    if (leftover < kHeaderSize) {
      // Switch to a new block
      if (leftover > 0) {
        // Fill the trailer (literal below relies on kHeaderSize being 7)
        assert(kHeaderSize == 7);
        dest_->Append(Slice("\x00\x00\x00\x00\x00\x00", leftover));
      }
      block_offset_ = 0;
    }

    // Invariant: we never leave < kHeaderSize bytes in a block.
    assert(kBlockSize - block_offset_ - kHeaderSize >= 0);

    // Payload room left in the current block after the 7-byte header.
    const size_t avail = kBlockSize - block_offset_ - kHeaderSize;
    const size_t fragment_length = (left < avail) ? left : avail;

    // Choose the type from whether this fragment is the first and/or
    // the last piece of the logical record.
    RecordType type;
    const bool end = (left == fragment_length);
    if (begin && end) {
      type = kFullType;
    } else if (begin) {
      type = kFirstType;
    } else if (end) {
      type = kLastType;
    } else {
      type = kMiddleType;
    }

    s = EmitPhysicalRecord(type, ptr, fragment_length);
    ptr += fragment_length;
    left -= fragment_length;
    begin = false;
  } while (s.ok() && left > 0);
  return s;
}
|
||||
|
||||
// Write one physical record: a 7-byte header -- crc32c in bytes [0,4),
// little-endian payload length in bytes [4,6), type in byte [6] --
// followed by the `n` payload bytes, then Flush() the file.
Status Writer::EmitPhysicalRecord(RecordType t, const char* ptr, size_t n) {
  assert(n <= 0xffff);  // Must fit in two bytes
  assert(block_offset_ + kHeaderSize + n <= kBlockSize);

  // Format the header
  char buf[kHeaderSize];
  buf[4] = static_cast<char>(n & 0xff);  // low byte of length
  buf[5] = static_cast<char>(n >> 8);    // high byte of length
  buf[6] = static_cast<char>(t);         // record type

  // Compute the crc of the record type and the payload.
  // type_crc_[t] is the precomputed crc of the single type byte.
  uint32_t crc = crc32c::Extend(type_crc_[t], ptr, n);
  crc = crc32c::Mask(crc);                 // Adjust for storage
  EncodeFixed32(buf, crc);                 // fills header bytes [0, 4)

  // Write the header and the payload
  Status s = dest_->Append(Slice(buf, kHeaderSize));
  if (s.ok()) {
    s = dest_->Append(Slice(ptr, n));
    if (s.ok()) {
      s = dest_->Flush();
    }
  }
  // NOTE(review): block_offset_ advances even when a write above failed;
  // callers stop emitting further fragments on a non-OK status.
  block_offset_ += kHeaderSize + n;
  return s;
}
|
||||
|
||||
} // namespace log
|
||||
} // namespace rocksdb
|
||||
55
db/log_writer.h
Normal file
55
db/log_writer.h
Normal file
@@ -0,0 +1,55 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#pragma once
|
||||
#include <memory>
|
||||
#include <stdint.h>
|
||||
#include "db/log_format.h"
|
||||
#include "rocksdb/slice.h"
|
||||
#include "rocksdb/status.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
class WritableFile;
|
||||
|
||||
using std::unique_ptr;
|
||||
|
||||
namespace log {
|
||||
|
||||
// Appends log records to a WritableFile, one fixed-size block at a time,
// each record carrying a crc32c/length/type header (see db/log_format.h).
class Writer {
 public:
  // Create a writer that will append data to "*dest".
  // "*dest" must be initially empty.
  // "*dest" must remain live while this Writer is in use.
  explicit Writer(unique_ptr<WritableFile>&& dest);
  ~Writer();

  // Append one logical record; it may be split into several physical
  // records if it does not fit in the current block.
  Status AddRecord(const Slice& slice);

  // Non-owning accessors to the underlying file.
  WritableFile* file() { return dest_.get(); }
  const WritableFile* file() const { return dest_.get(); }

 private:
  unique_ptr<WritableFile> dest_;
  int block_offset_;       // Current offset in block

  // crc32c values for all supported record types.  These are
  // pre-computed to reduce the overhead of computing the crc of the
  // record type stored in the header.
  uint32_t type_crc_[kMaxRecordType + 1];

  // Write a single header+payload fragment at the current block offset.
  Status EmitPhysicalRecord(RecordType type, const char* ptr, size_t length);

  // No copying allowed
  Writer(const Writer&);
  void operator=(const Writer&);
};
|
||||
|
||||
} // namespace log
|
||||
} // namespace rocksdb
|
||||
620
db/memtable.cc
Normal file
620
db/memtable.cc
Normal file
@@ -0,0 +1,620 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include "db/memtable.h"
|
||||
|
||||
#include <memory>
|
||||
#include <algorithm>
|
||||
#include <limits>
|
||||
|
||||
#include "db/dbformat.h"
|
||||
#include "db/merge_context.h"
|
||||
#include "rocksdb/comparator.h"
|
||||
#include "rocksdb/env.h"
|
||||
#include "rocksdb/iterator.h"
|
||||
#include "rocksdb/merge_operator.h"
|
||||
#include "rocksdb/slice_transform.h"
|
||||
#include "table/merger.h"
|
||||
#include "util/arena.h"
|
||||
#include "util/coding.h"
|
||||
#include "util/murmurhash.h"
|
||||
#include "util/mutexlock.h"
|
||||
#include "util/perf_context_imp.h"
|
||||
#include "util/statistics.h"
|
||||
#include "util/stop_watch.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
// Construct an empty memtable: an arena-backed rep created by the
// configured memtable factory, with reference count zero and no entries.
MemTable::MemTable(const InternalKeyComparator& cmp, const Options& options)
    : comparator_(cmp),
      refs_(0),
      kArenaBlockSize(OptimizeBlockSize(options.arena_block_size)),
      kWriteBufferSize(options.write_buffer_size),
      arena_(options.arena_block_size),
      table_(options.memtable_factory->CreateMemTableRep(
          comparator_, &arena_, options.prefix_extractor.get(),
          options.info_log.get())),
      num_entries_(0),
      flush_in_progress_(false),
      flush_completed_(false),
      file_number_(0),
      first_seqno_(0),
      mem_next_logfile_number_(0),
      // Per-key locks are only needed when in-place updates are enabled.
      locks_(options.inplace_update_support ? options.inplace_update_num_locks
                                            : 0),
      prefix_extractor_(options.prefix_extractor.get()),
      should_flush_(ShouldFlushNow()) {
  // if should_flush_ == true without an entry inserted, something must have
  // gone wrong already.
  assert(!should_flush_);
  // Optional bloom filter over key prefixes, created only when both a
  // prefix extractor and a positive bloom-bits setting are configured.
  if (prefix_extractor_ && options.memtable_prefix_bloom_bits > 0) {
    prefix_bloom_.reset(new DynamicBloom(
        options.memtable_prefix_bloom_bits, options.bloom_locality,
        options.memtable_prefix_bloom_probes, nullptr,
        options.memtable_prefix_bloom_huge_page_tlb_size,
        options.info_log.get()));
  }
}
||||
|
||||
// Destruction is only legal once the reference count has dropped to zero;
// a nonzero count here indicates a lifetime bug in the caller.
MemTable::~MemTable() {
  assert(refs_ == 0);
}
|
||||
|
||||
size_t MemTable::ApproximateMemoryUsage() {
|
||||
size_t arena_usage = arena_.ApproximateMemoryUsage();
|
||||
size_t table_usage = table_->ApproximateMemoryUsage();
|
||||
// let MAX_USAGE = std::numeric_limits<size_t>::max()
|
||||
// then if arena_usage + total_usage >= MAX_USAGE, return MAX_USAGE.
|
||||
// the following variation is to avoid numeric overflow.
|
||||
if (arena_usage >= std::numeric_limits<size_t>::max() - table_usage) {
|
||||
return std::numeric_limits<size_t>::max();
|
||||
}
|
||||
// otherwise, return the actual usage
|
||||
return arena_usage + table_usage;
|
||||
}
|
||||
|
||||
// Heuristic that decides, after each insert, whether this memtable should
// stop accepting writes and be flushed. Cached into should_flush_ by the
// callers (Add/UpdateCallback).
bool MemTable::ShouldFlushNow() const {
  // In a lot of times, we cannot allocate arena blocks that exactly matches
  // the buffer size. Thus we have to decide if we should over-allocate or
  // under-allocate.
  // This constant variable can be interpreted as: if we still have more than
  // "kAllowOverAllocationRatio * kArenaBlockSize" space left, we'd try to over
  // allocate one more block.
  const double kAllowOverAllocationRatio = 0.6;

  // If arena still have room for new block allocation, we can safely say it
  // shouldn't flush.
  auto allocated_memory =
      table_->ApproximateMemoryUsage() + arena_.MemoryAllocatedBytes();

  // if we can still allocate one more block without exceeding the
  // over-allocation ratio, then we should not flush.
  if (allocated_memory + kArenaBlockSize <
      kWriteBufferSize + kArenaBlockSize * kAllowOverAllocationRatio) {
    return false;
  }

  // if user keeps adding entries that exceeds kWriteBufferSize, we need to
  // flush earlier even though we still have much available memory left.
  if (allocated_memory >
      kWriteBufferSize + kArenaBlockSize * kAllowOverAllocationRatio) {
    return true;
  }

  // In this code path, Arena has already allocated its "last block", which
  // means the total allocated memory size is either:
  //  (1) "moderately" over allocated the memory (no more than `0.6 * arena
  // block size`. Or,
  //  (2) the allocated memory is less than write buffer size, but we'll stop
  // here since if we allocate a new arena block, we'll over allocate too much
  // more (half of the arena block size) memory.
  //
  // In either case, to avoid over-allocate, the last block will stop
  // allocation when its usage reaches a certain ratio, which we carefully
  // choose "0.75 full" as the stop condition because it addresses the
  // following issue with great simplicity: What if the next inserted entry's
  // size is bigger than AllocatedAndUnused()?
  //
  // The answer is: if the entry size is also bigger than 0.25 *
  // kArenaBlockSize, a dedicated block will be allocated for it; otherwise
  // arena will anyway skip the AllocatedAndUnused() and allocate a new, empty
  // and regular block. In either case, we *overly* over-allocated.
  //
  // Therefore, setting the last block to be at most "0.75 full" avoids both
  // cases.
  //
  // NOTE: the average percentage of waste space of this approach can be
  // counted as: "arena block size * 0.25 / write buffer size". User who
  // specify a small write buffer size and/or big arena block size may suffer.
  return arena_.AllocatedAndUnused() < kArenaBlockSize / 4;
}
|
||||
|
||||
int MemTable::KeyComparator::operator()(const char* prefix_len_key1,
|
||||
const char* prefix_len_key2) const {
|
||||
// Internal keys are encoded as length-prefixed strings.
|
||||
Slice k1 = GetLengthPrefixedSlice(prefix_len_key1);
|
||||
Slice k2 = GetLengthPrefixedSlice(prefix_len_key2);
|
||||
return comparator.Compare(k1, k2);
|
||||
}
|
||||
|
||||
int MemTable::KeyComparator::operator()(const char* prefix_len_key,
|
||||
const Slice& key)
|
||||
const {
|
||||
// Internal keys are encoded as length-prefixed strings.
|
||||
Slice a = GetLengthPrefixedSlice(prefix_len_key);
|
||||
return comparator.Compare(a, key);
|
||||
}
|
||||
|
||||
// Extracts the user key from a length-prefixed internal key: the internal
// key is the user key followed by an 8-byte tag, so drop the last 8 bytes.
Slice MemTableRep::UserKey(const char* key) const {
  const Slice internal_key = GetLengthPrefixedSlice(key);
  return Slice(internal_key.data(), internal_key.size() - 8);
}
|
||||
|
||||
// Carves `len` bytes out of the arena. The buffer pointer doubles as the
// returned key handle; *buf receives the same address for the caller to
// fill in.
KeyHandle MemTableRep::Allocate(const size_t len, char** buf) {
  char* mem = arena_->Allocate(len);
  *buf = mem;
  return static_cast<KeyHandle>(mem);
}
|
||||
|
||||
// Encode a suitable internal key target for "target" and return it.
|
||||
// Uses *scratch as scratch space, and the returned pointer will point
|
||||
// into this scratch space.
|
||||
const char* EncodeKey(std::string* scratch, const Slice& target) {
|
||||
scratch->clear();
|
||||
PutVarint32(scratch, target.size());
|
||||
scratch->append(target.data(), target.size());
|
||||
return scratch->data();
|
||||
}
|
||||
|
||||
// Iterator over a memtable's entries, exposed via the public Iterator
// interface. When the memtable has a prefix bloom filter and total order is
// not enforced, Seek() consults the bloom filter first and skips the
// underlying seek on a definite miss.
class MemTableIterator: public Iterator {
 public:
  // If `arena` is non-null, the underlying rep iterator is allocated from it
  // and only its destructor is run on teardown (the arena owns the memory).
  MemTableIterator(const MemTable& mem, const ReadOptions& options,
                   bool enforce_total_order, Arena* arena)
      : bloom_(nullptr),
        prefix_extractor_(mem.prefix_extractor_),
        valid_(false),
        arena_mode_(arena != nullptr) {
    if (prefix_extractor_ != nullptr && !enforce_total_order) {
      bloom_ = mem.prefix_bloom_.get();
      iter_ = mem.table_->GetDynamicPrefixIterator(arena);
    } else {
      iter_ = mem.table_->GetIterator(arena);
    }
  }

  ~MemTableIterator() {
    if (arena_mode_) {
      // Arena-allocated: destroy in place, never free (arena owns memory).
      iter_->~Iterator();
    } else {
      delete iter_;
    }
  }

  virtual bool Valid() const { return valid_; }
  virtual void Seek(const Slice& k) {
    // A bloom-filter miss proves the prefix is absent from this memtable,
    // so the (potentially expensive) seek can be skipped entirely.
    if (bloom_ != nullptr &&
        !bloom_->MayContain(prefix_extractor_->Transform(ExtractUserKey(k)))) {
      valid_ = false;
      return;
    }
    iter_->Seek(k, nullptr);
    valid_ = iter_->Valid();
  }
  virtual void SeekToFirst() {
    iter_->SeekToFirst();
    valid_ = iter_->Valid();
  }
  virtual void SeekToLast() {
    iter_->SeekToLast();
    valid_ = iter_->Valid();
  }
  virtual void Next() {
    assert(Valid());
    iter_->Next();
    valid_ = iter_->Valid();
  }
  virtual void Prev() {
    assert(Valid());
    iter_->Prev();
    valid_ = iter_->Valid();
  }
  // Entries are stored length-prefixed: decode the internal key in place.
  virtual Slice key() const {
    assert(Valid());
    return GetLengthPrefixedSlice(iter_->key());
  }
  // The value is a second length-prefixed slice immediately after the key.
  virtual Slice value() const {
    assert(Valid());
    Slice key_slice = GetLengthPrefixedSlice(iter_->key());
    return GetLengthPrefixedSlice(key_slice.data() + key_slice.size());
  }

  virtual Status status() const { return Status::OK(); }

 private:
  DynamicBloom* bloom_;  // not owned; may be null (no prefix bloom)
  const SliceTransform* const prefix_extractor_;  // not owned
  MemTableRep::Iterator* iter_;  // owned iff !arena_mode_
  bool valid_;
  bool arena_mode_;  // true => iter_ lives in an arena (see dtor)

  // No copying allowed
  MemTableIterator(const MemTableIterator&);
  void operator=(const MemTableIterator&);
};
|
||||
|
||||
// Creates an iterator over this memtable. When an arena is supplied the
// iterator is placement-constructed in arena memory (freed with the arena);
// otherwise it is heap-allocated and owned by the caller.
Iterator* MemTable::NewIterator(const ReadOptions& options,
                                bool enforce_total_order, Arena* arena) {
  if (arena != nullptr) {
    auto* buf = arena->AllocateAligned(sizeof(MemTableIterator));
    return new (buf)
        MemTableIterator(*this, options, enforce_total_order, arena);
  }
  return new MemTableIterator(*this, options, enforce_total_order, nullptr);
}
|
||||
|
||||
// Maps a user key onto one of the striped read-write locks used to guard
// in-place value updates. `static` gives a single, stateless hasher shared
// by all calls.
port::RWMutex* MemTable::GetLock(const Slice& key) {
  static murmur_hash hash;
  return &locks_[hash(key) % locks_.size()];
}
|
||||
|
||||
// Inserts a new entry mapping `key` to `value` at sequence number `s` with
// the given type (value/deletion/merge).
void MemTable::Add(SequenceNumber s, ValueType type,
                   const Slice& key, /* user key */
                   const Slice& value) {
  // Format of an entry is concatenation of:
  //  key_size     : varint32 of internal_key.size()
  //  key bytes    : char[internal_key.size()]
  //  value_size   : varint32 of value.size()
  //  value bytes  : char[value.size()]
  size_t key_size = key.size();
  size_t val_size = value.size();
  size_t internal_key_size = key_size + 8;  // user key + 8-byte (seqno,type) tag
  const size_t encoded_len =
      VarintLength(internal_key_size) + internal_key_size +
      VarintLength(val_size) + val_size;
  char* buf = nullptr;
  KeyHandle handle = table_->Allocate(encoded_len, &buf);
  assert(buf != nullptr);
  char* p = EncodeVarint32(buf, internal_key_size);
  memcpy(p, key.data(), key_size);
  p += key_size;
  // Tag packs the sequence number in the high 56 bits and the value type in
  // the low 8 bits.
  EncodeFixed64(p, (s << 8) | type);
  p += 8;
  p = EncodeVarint32(p, val_size);
  memcpy(p, value.data(), val_size);
  assert((unsigned)(p + val_size - buf) == (unsigned)encoded_len);
  table_->Insert(handle);
  num_entries_++;

  // Keep the prefix bloom filter in sync with inserted keys.
  if (prefix_bloom_) {
    assert(prefix_extractor_);
    prefix_bloom_->Add(prefix_extractor_->Transform(key));
  }

  // The first sequence number inserted into the memtable
  assert(first_seqno_ == 0 || s > first_seqno_);
  if (first_seqno_ == 0) {
    first_seqno_ = s;
  }

  // Re-evaluate the flush heuristic now that this entry is accounted for.
  should_flush_ = ShouldFlushNow();
}
|
||||
|
||||
// Callback from MemTable::Get()
namespace {

// Bag of in/out state threaded through the SaveValue() callback via its
// void* argument.
struct Saver {
  Status* status;            // out: lookup result status
  const LookupKey* key;      // in: key being looked up
  bool* found_final_value;   // Is value set correctly? Used by KeyMayExist
  bool* merge_in_progress;   // in/out: true while merge operands accumulate
  std::string* value;        // out: resolved value bytes
  const MergeOperator* merge_operator;  // used to fold merge operands
  // the merge operations encountered;
  MergeContext* merge_context;
  MemTable* mem;             // owning memtable (for comparator and locks)
  Logger* logger;
  Statistics* statistics;
  bool inplace_update_support;  // take the per-key lock before reading values
};
}  // namespace
|
||||
|
||||
static bool SaveValue(void* arg, const char* entry) {
|
||||
Saver* s = reinterpret_cast<Saver*>(arg);
|
||||
MergeContext* merge_context = s->merge_context;
|
||||
const MergeOperator* merge_operator = s->merge_operator;
|
||||
|
||||
assert(s != nullptr && merge_context != nullptr);
|
||||
|
||||
// entry format is:
|
||||
// klength varint32
|
||||
// userkey char[klength-8]
|
||||
// tag uint64
|
||||
// vlength varint32
|
||||
// value char[vlength]
|
||||
// Check that it belongs to same user key. We do not check the
|
||||
// sequence number since the Seek() call above should have skipped
|
||||
// all entries with overly large sequence numbers.
|
||||
uint32_t key_length;
|
||||
const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
|
||||
if (s->mem->GetInternalKeyComparator().user_comparator()->Compare(
|
||||
Slice(key_ptr, key_length - 8), s->key->user_key()) == 0) {
|
||||
// Correct user key
|
||||
const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8);
|
||||
switch (static_cast<ValueType>(tag & 0xff)) {
|
||||
case kTypeValue: {
|
||||
if (s->inplace_update_support) {
|
||||
s->mem->GetLock(s->key->user_key())->ReadLock();
|
||||
}
|
||||
Slice v = GetLengthPrefixedSlice(key_ptr + key_length);
|
||||
*(s->status) = Status::OK();
|
||||
if (*(s->merge_in_progress)) {
|
||||
assert(merge_operator);
|
||||
if (!merge_operator->FullMerge(s->key->user_key(), &v,
|
||||
merge_context->GetOperands(), s->value,
|
||||
s->logger)) {
|
||||
RecordTick(s->statistics, NUMBER_MERGE_FAILURES);
|
||||
*(s->status) =
|
||||
Status::Corruption("Error: Could not perform merge.");
|
||||
}
|
||||
} else {
|
||||
s->value->assign(v.data(), v.size());
|
||||
}
|
||||
if (s->inplace_update_support) {
|
||||
s->mem->GetLock(s->key->user_key())->Unlock();
|
||||
}
|
||||
*(s->found_final_value) = true;
|
||||
return false;
|
||||
}
|
||||
case kTypeDeletion: {
|
||||
if (*(s->merge_in_progress)) {
|
||||
assert(merge_operator);
|
||||
*(s->status) = Status::OK();
|
||||
if (!merge_operator->FullMerge(s->key->user_key(), nullptr,
|
||||
merge_context->GetOperands(), s->value,
|
||||
s->logger)) {
|
||||
RecordTick(s->statistics, NUMBER_MERGE_FAILURES);
|
||||
*(s->status) =
|
||||
Status::Corruption("Error: Could not perform merge.");
|
||||
}
|
||||
} else {
|
||||
*(s->status) = Status::NotFound();
|
||||
}
|
||||
*(s->found_final_value) = true;
|
||||
return false;
|
||||
}
|
||||
case kTypeMerge: {
|
||||
std::string merge_result; // temporary area for merge results later
|
||||
Slice v = GetLengthPrefixedSlice(key_ptr + key_length);
|
||||
*(s->merge_in_progress) = true;
|
||||
merge_context->PushOperand(v);
|
||||
return true;
|
||||
}
|
||||
default:
|
||||
assert(false);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
// s->state could be Corrupt, merge or notfound
|
||||
return false;
|
||||
}
|
||||
|
||||
bool MemTable::Get(const LookupKey& key, std::string* value, Status* s,
|
||||
MergeContext& merge_context, const Options& options) {
|
||||
PERF_TIMER_AUTO(get_from_memtable_time);
|
||||
|
||||
Slice user_key = key.user_key();
|
||||
bool found_final_value = false;
|
||||
bool merge_in_progress = s->IsMergeInProgress();
|
||||
|
||||
if (prefix_bloom_ &&
|
||||
!prefix_bloom_->MayContain(prefix_extractor_->Transform(user_key))) {
|
||||
// iter is null if prefix bloom says the key does not exist
|
||||
} else {
|
||||
Saver saver;
|
||||
saver.status = s;
|
||||
saver.found_final_value = &found_final_value;
|
||||
saver.merge_in_progress = &merge_in_progress;
|
||||
saver.key = &key;
|
||||
saver.value = value;
|
||||
saver.status = s;
|
||||
saver.mem = this;
|
||||
saver.merge_context = &merge_context;
|
||||
saver.merge_operator = options.merge_operator.get();
|
||||
saver.logger = options.info_log.get();
|
||||
saver.inplace_update_support = options.inplace_update_support;
|
||||
saver.statistics = options.statistics.get();
|
||||
table_->Get(key, &saver, SaveValue);
|
||||
}
|
||||
|
||||
// No change to value, since we have not yet found a Put/Delete
|
||||
if (!found_final_value && merge_in_progress) {
|
||||
*s = Status::MergeInProgress("");
|
||||
}
|
||||
PERF_TIMER_STOP(get_from_memtable_time);
|
||||
PERF_COUNTER_ADD(get_from_memtable_count, 1);
|
||||
return found_final_value;
|
||||
}
|
||||
|
||||
// Updates `key` to `value`: overwrites in place when the newest entry for
// the key is a plain value at least as large as the new one, otherwise
// falls back to a regular Add().
void MemTable::Update(SequenceNumber seq,
                      const Slice& key,
                      const Slice& value) {
  LookupKey lkey(key, seq);
  Slice mem_key = lkey.memtable_key();

  // A user-key-scoped iterator suffices; we only need the newest entry for
  // this exact user key.
  std::unique_ptr<MemTableRep::Iterator> iter(
      table_->GetIterator(lkey.user_key()));
  iter->Seek(lkey.internal_key(), mem_key.data());

  if (iter->Valid()) {
    // entry format is:
    //    key_length  varint32
    //    userkey  char[klength-8]
    //    tag      uint64
    //    vlength  varint32
    //    value    char[vlength]
    // Check that it belongs to same user key.  We do not check the
    // sequence number since the Seek() call above should have skipped
    // all entries with overly large sequence numbers.
    const char* entry = iter->key();
    uint32_t key_length = 0;
    const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
    if (comparator_.comparator.user_comparator()->Compare(
        Slice(key_ptr, key_length - 8), lkey.user_key()) == 0) {
      // Correct user key
      const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8);
      switch (static_cast<ValueType>(tag & 0xff)) {
        case kTypeValue: {
          Slice prev_value = GetLengthPrefixedSlice(key_ptr + key_length);
          uint32_t prev_size = prev_value.size();
          uint32_t new_size = value.size();

          // Update value, if new value size  <= previous value size
          if (new_size <= prev_size ) {
            char* p = EncodeVarint32(const_cast<char*>(key_ptr) + key_length,
                                     new_size);
            // Writers of this slot are serialized by the striped lock.
            WriteLock wl(GetLock(lkey.user_key()));
            memcpy(p, value.data(), value.size());
            assert((unsigned)((p + value.size()) - entry) ==
                   (unsigned)(VarintLength(key_length) + key_length +
                              VarintLength(value.size()) + value.size()));
            return;
          }
          // Intentional fall-through to `default` when the new value is
          // larger than the previous one and cannot be updated in place.
        }
        default:
          // If the latest value is kTypeDeletion, kTypeMerge or kTypeLogData
          // we don't have enough space for update inplace
          Add(seq, kTypeValue, key, value);
          return;
      }
    }
  }

  // key doesn't exist
  Add(seq, kTypeValue, key, value);
}
|
||||
|
||||
// Applies `delta` to the existing value for `key` via the user-supplied
// options.inplace_callback. Returns true if the callback handled the update
// (in place, via a fresh Add, or by declining); false if the newest entry
// for the key is not a plain value or the key is absent.
bool MemTable::UpdateCallback(SequenceNumber seq,
                              const Slice& key,
                              const Slice& delta,
                              const Options& options) {
  LookupKey lkey(key, seq);
  Slice memkey = lkey.memtable_key();

  // A user-key-scoped iterator suffices; we only need the newest entry for
  // this exact user key.
  std::unique_ptr<MemTableRep::Iterator> iter(
      table_->GetIterator(lkey.user_key()));
  iter->Seek(lkey.internal_key(), memkey.data());

  if (iter->Valid()) {
    // entry format is:
    //    key_length  varint32
    //    userkey  char[klength-8]
    //    tag      uint64
    //    vlength  varint32
    //    value    char[vlength]
    // Check that it belongs to same user key.  We do not check the
    // sequence number since the Seek() call above should have skipped
    // all entries with overly large sequence numbers.
    const char* entry = iter->key();
    uint32_t key_length = 0;
    const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
    if (comparator_.comparator.user_comparator()->Compare(
        Slice(key_ptr, key_length - 8), lkey.user_key()) == 0) {
      // Correct user key
      const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8);
      switch (static_cast<ValueType>(tag & 0xff)) {
        case kTypeValue: {
          Slice prev_value = GetLengthPrefixedSlice(key_ptr + key_length);
          uint32_t prev_size = prev_value.size();

          char* prev_buffer = const_cast<char*>(prev_value.data());
          uint32_t new_prev_size = prev_size;

          std::string str_value;
          // Serialize concurrent in-place updates of this slot.
          WriteLock wl(GetLock(lkey.user_key()));
          // The callback may mutate prev_buffer in place (shrinking it) or
          // produce a whole new value in str_value.
          auto status = options.inplace_callback(prev_buffer, &new_prev_size,
                                                 delta, &str_value);
          if (status == UpdateStatus::UPDATED_INPLACE) {
            // Value already updated by callback.
            assert(new_prev_size <= prev_size);
            if (new_prev_size < prev_size) {
              // overwrite the new prev_size
              char* p = EncodeVarint32(const_cast<char*>(key_ptr) + key_length,
                                       new_prev_size);
              if (VarintLength(new_prev_size) < VarintLength(prev_size)) {
                // shift the value buffer as well.
                memcpy(p, prev_buffer, new_prev_size);
              }
            }
            RecordTick(options.statistics.get(), NUMBER_KEYS_UPDATED);
            should_flush_ = ShouldFlushNow();
            return true;
          } else if (status == UpdateStatus::UPDATED) {
            // Callback produced a replacement value: insert it normally.
            Add(seq, kTypeValue, key, Slice(str_value));
            RecordTick(options.statistics.get(), NUMBER_KEYS_WRITTEN);
            should_flush_ = ShouldFlushNow();
            return true;
          } else if (status == UpdateStatus::UPDATE_FAILED) {
            // No action required. Return.
            should_flush_ = ShouldFlushNow();
            return true;
          }
          // Falls through to `default` only on an unrecognized status.
        }
        default:
          break;
      }
    }
  }
  // If the latest value is not kTypeValue
  // or key doesn't exist
  return false;
}
|
||||
|
||||
// Counts consecutive kTypeMerge entries for `key`, starting at the newest
// entry and stopping at the first non-merge entry or a different user key.
size_t MemTable::CountSuccessiveMergeEntries(const LookupKey& key) {
  Slice memkey = key.memtable_key();

  // A total ordered iterator is costly for some memtablerep (prefix aware
  // reps). By passing in the user key, we allow efficient iterator creation.
  // The iterator only needs to be ordered within the same user key.
  std::unique_ptr<MemTableRep::Iterator> iter(
      table_->GetIterator(key.user_key()));
  iter->Seek(key.internal_key(), memkey.data());

  size_t num_successive_merges = 0;

  for (; iter->Valid(); iter->Next()) {
    const char* entry = iter->key();
    uint32_t key_length = 0;
    const char* iter_key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
    // Stop at the first entry belonging to a different user key.
    if (comparator_.comparator.user_comparator()->Compare(
            Slice(iter_key_ptr, key_length - 8), key.user_key()) != 0) {
      break;
    }

    // Stop at the first non-merge entry (a value or deletion).
    const uint64_t tag = DecodeFixed64(iter_key_ptr + key_length - 8);
    if (static_cast<ValueType>(tag & 0xff) != kTypeMerge) {
      break;
    }

    ++num_successive_merges;
  }

  return num_successive_merges;
}
|
||||
|
||||
void MemTableRep::Get(const LookupKey& k, void* callback_args,
|
||||
bool (*callback_func)(void* arg, const char* entry)) {
|
||||
auto iter = GetIterator(k.user_key());
|
||||
for (iter->Seek(k.internal_key(), k.memtable_key().data());
|
||||
iter->Valid() && callback_func(callback_args, iter->key());
|
||||
iter->Next()) {
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace rocksdb
|
||||
222
db/memtable.h
Normal file
222
db/memtable.h
Normal file
@@ -0,0 +1,222 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#pragma once
|
||||
#include <string>
|
||||
#include <memory>
|
||||
#include <deque>
|
||||
#include "db/dbformat.h"
|
||||
#include "db/skiplist.h"
|
||||
#include "db/version_edit.h"
|
||||
#include "rocksdb/db.h"
|
||||
#include "rocksdb/memtablerep.h"
|
||||
#include "util/arena.h"
|
||||
#include "util/dynamic_bloom.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
class Arena;
|
||||
class Mutex;
|
||||
class MemTableIterator;
|
||||
class MergeContext;
|
||||
|
||||
class MemTable {
|
||||
public:
|
||||
struct KeyComparator : public MemTableRep::KeyComparator {
|
||||
const InternalKeyComparator comparator;
|
||||
explicit KeyComparator(const InternalKeyComparator& c) : comparator(c) { }
|
||||
virtual int operator()(const char* prefix_len_key1,
|
||||
const char* prefix_len_key2) const;
|
||||
virtual int operator()(const char* prefix_len_key,
|
||||
const Slice& key) const override;
|
||||
};
|
||||
|
||||
// MemTables are reference counted. The initial reference count
|
||||
// is zero and the caller must call Ref() at least once.
|
||||
explicit MemTable(const InternalKeyComparator& comparator,
|
||||
const Options& options);
|
||||
|
||||
~MemTable();
|
||||
|
||||
// Increase reference count.
|
||||
void Ref() { ++refs_; }
|
||||
|
||||
// Drop reference count.
|
||||
// If the refcount goes to zero return this memtable, otherwise return null
|
||||
MemTable* Unref() {
|
||||
--refs_;
|
||||
assert(refs_ >= 0);
|
||||
if (refs_ <= 0) {
|
||||
return this;
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// Returns an estimate of the number of bytes of data in use by this
|
||||
// data structure.
|
||||
//
|
||||
// REQUIRES: external synchronization to prevent simultaneous
|
||||
// operations on the same MemTable.
|
||||
size_t ApproximateMemoryUsage();
|
||||
|
||||
// This method heuristically determines if the memtable should continue to
|
||||
// host more data.
|
||||
bool ShouldFlush() const { return should_flush_; }
|
||||
|
||||
// Return an iterator that yields the contents of the memtable.
|
||||
//
|
||||
// The caller must ensure that the underlying MemTable remains live
|
||||
// while the returned iterator is live. The keys returned by this
|
||||
// iterator are internal keys encoded by AppendInternalKey in the
|
||||
// db/dbformat.{h,cc} module.
|
||||
//
|
||||
// By default, it returns an iterator for prefix seek if prefix_extractor
|
||||
// is configured in Options.
|
||||
// arena: If not null, the arena needs to be used to allocate the Iterator.
|
||||
// Calling ~Iterator of the iterator will destroy all the states but
|
||||
// those allocated in arena.
|
||||
Iterator* NewIterator(const ReadOptions& options,
|
||||
bool enforce_total_order = false,
|
||||
Arena* arena = nullptr);
|
||||
|
||||
// Add an entry into memtable that maps key to value at the
|
||||
// specified sequence number and with the specified type.
|
||||
// Typically value will be empty if type==kTypeDeletion.
|
||||
void Add(SequenceNumber seq, ValueType type,
|
||||
const Slice& key,
|
||||
const Slice& value);
|
||||
|
||||
// If memtable contains a value for key, store it in *value and return true.
|
||||
// If memtable contains a deletion for key, store a NotFound() error
|
||||
// in *status and return true.
|
||||
// If memtable contains Merge operation as the most recent entry for a key,
|
||||
// and the merge process does not stop (not reaching a value or delete),
|
||||
// prepend the current merge operand to *operands.
|
||||
// store MergeInProgress in s, and return false.
|
||||
// Else, return false.
|
||||
bool Get(const LookupKey& key, std::string* value, Status* s,
|
||||
MergeContext& merge_context, const Options& options);
|
||||
|
||||
// Attempts to update the new_value inplace, else does normal Add
|
||||
// Pseudocode
|
||||
// if key exists in current memtable && prev_value is of type kTypeValue
|
||||
// if new sizeof(new_value) <= sizeof(prev_value)
|
||||
// update inplace
|
||||
// else add(key, new_value)
|
||||
// else add(key, new_value)
|
||||
void Update(SequenceNumber seq,
|
||||
const Slice& key,
|
||||
const Slice& value);
|
||||
|
||||
// If prev_value for key exits, attempts to update it inplace.
|
||||
// else returns false
|
||||
// Pseudocode
|
||||
// if key exists in current memtable && prev_value is of type kTypeValue
|
||||
// new_value = delta(prev_value)
|
||||
// if sizeof(new_value) <= sizeof(prev_value)
|
||||
// update inplace
|
||||
// else add(key, new_value)
|
||||
// else return false
|
||||
bool UpdateCallback(SequenceNumber seq,
|
||||
const Slice& key,
|
||||
const Slice& delta,
|
||||
const Options& options);
|
||||
|
||||
// Returns the number of successive merge entries starting from the newest
|
||||
// entry for the key up to the last non-merge entry or last entry for the
|
||||
// key in the memtable.
|
||||
size_t CountSuccessiveMergeEntries(const LookupKey& key);
|
||||
|
||||
// Get total number of entries in the mem table.
|
||||
uint64_t GetNumEntries() const { return num_entries_; }
|
||||
|
||||
// Returns the edits area that is needed for flushing the memtable
|
||||
VersionEdit* GetEdits() { return &edit_; }
|
||||
|
||||
// Returns the sequence number of the first element that was inserted
|
||||
// into the memtable
|
||||
SequenceNumber GetFirstSequenceNumber() { return first_seqno_; }
|
||||
|
||||
// Returns the next active logfile number when this memtable is about to
|
||||
// be flushed to storage
|
||||
uint64_t GetNextLogNumber() { return mem_next_logfile_number_; }
|
||||
|
||||
// Sets the next active logfile number when this memtable is about to
|
||||
// be flushed to storage
|
||||
void SetNextLogNumber(uint64_t num) { mem_next_logfile_number_ = num; }
|
||||
|
||||
// Notify the underlying storage that no more items will be added
|
||||
void MarkImmutable() { table_->MarkReadOnly(); }
|
||||
|
||||
// return true if the current MemTableRep supports merge operator.
|
||||
bool IsMergeOperatorSupported() const {
|
||||
return table_->IsMergeOperatorSupported();
|
||||
}
|
||||
|
||||
// return true if the current MemTableRep supports snapshots.
|
||||
bool IsSnapshotSupported() const { return table_->IsSnapshotSupported(); }
|
||||
|
||||
// Get the lock associated for the key
|
||||
port::RWMutex* GetLock(const Slice& key);
|
||||
|
||||
const InternalKeyComparator& GetInternalKeyComparator() const {
|
||||
return comparator_.comparator;
|
||||
}
|
||||
|
||||
const Arena& TEST_GetArena() const { return arena_; }
|
||||
|
||||
private:
|
||||
// Dynamically check if we can add more incoming entries.
|
||||
bool ShouldFlushNow() const;
|
||||
|
||||
friend class MemTableIterator;
|
||||
friend class MemTableBackwardIterator;
|
||||
friend class MemTableList;
|
||||
|
||||
KeyComparator comparator_;
|
||||
int refs_;
|
||||
const size_t kArenaBlockSize;
|
||||
const size_t kWriteBufferSize;
|
||||
Arena arena_;
|
||||
unique_ptr<MemTableRep> table_;
|
||||
|
||||
uint64_t num_entries_;
|
||||
|
||||
// These are used to manage memtable flushes to storage
|
||||
bool flush_in_progress_; // started the flush
|
||||
bool flush_completed_; // finished the flush
|
||||
uint64_t file_number_; // filled up after flush is complete
|
||||
|
||||
// The updates to be applied to the transaction log when this
|
||||
// memtable is flushed to storage.
|
||||
VersionEdit edit_;
|
||||
|
||||
// The sequence number of the kv that was inserted first
|
||||
SequenceNumber first_seqno_;
|
||||
|
||||
// The log files earlier than this number can be deleted.
|
||||
uint64_t mem_next_logfile_number_;
|
||||
|
||||
// rw locks for inplace updates
|
||||
std::vector<port::RWMutex> locks_;
|
||||
|
||||
// No copying allowed
|
||||
MemTable(const MemTable&);
|
||||
void operator=(const MemTable&);
|
||||
|
||||
const SliceTransform* const prefix_extractor_;
|
||||
std::unique_ptr<DynamicBloom> prefix_bloom_;
|
||||
|
||||
// a flag indicating if a memtable has met the criteria to flush
|
||||
bool should_flush_;
|
||||
};
|
||||
|
||||
extern const char* EncodeKey(std::string* scratch, const Slice& target);
|
||||
|
||||
} // namespace rocksdb
|
||||
286
db/memtable_list.cc
Normal file
286
db/memtable_list.cc
Normal file
@@ -0,0 +1,286 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
#include "db/memtable_list.h"
|
||||
|
||||
#include <string>
|
||||
#include "rocksdb/db.h"
|
||||
#include "db/memtable.h"
|
||||
#include "db/version_set.h"
|
||||
#include "rocksdb/env.h"
|
||||
#include "rocksdb/iterator.h"
|
||||
#include "table/merger.h"
|
||||
#include "util/coding.h"
|
||||
#include "util/log_buffer.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
class InternalKeyComparator;
|
||||
class Mutex;
|
||||
class VersionSet;
|
||||
|
||||
// Copy-constructs a new version from `old` (when non-null): shares the same
// memtable list and takes a reference on every memtable so they outlive the
// old version.
MemTableListVersion::MemTableListVersion(MemTableListVersion* old) {
  if (old != nullptr) {
    memlist_ = old->memlist_;
    size_ = old->size_;
    for (auto& m : memlist_) {
      m->Ref();
    }
  }
}
|
||||
|
||||
// Increase the reference count of this list version.
void MemTableListVersion::Ref() { ++refs_; }
|
||||
|
||||
// Drops one reference. When the count reaches zero, releases this version's
// reference on every memtable — any memtable whose own refcount drops to
// zero is pushed onto *to_delete for the caller to destroy — and deletes
// this version object.
void MemTableListVersion::Unref(autovector<MemTable*>* to_delete) {
  assert(refs_ >= 1);
  --refs_;
  if (refs_ == 0) {
    // if to_delete is equal to nullptr it means we're confident
    // that refs_ will not be zero
    assert(to_delete != nullptr);
    for (const auto& m : memlist_) {
      MemTable* x = m->Unref();
      if (x != nullptr) {
        to_delete->push_back(x);
      }
    }
    delete this;
  }
}
|
||||
|
||||
// Number of memtables in this list version.
int MemTableListVersion::size() const { return size_; }
|
||||
|
||||
// Returns the total number of memtables in the list
int MemTableList::size() const {
  // num_flush_not_started_ counts a subset of the memtables in the current
  // version, so it can never exceed the version's size.
  assert(num_flush_not_started_ <= current_->size_);
  return current_->size_;
}
|
||||
|
||||
// Search all the memtables starting from the most recent one.
|
||||
// Return the most recent value found, if any.
|
||||
// Operands stores the list of merge operations to apply, so far.
|
||||
bool MemTableListVersion::Get(const LookupKey& key, std::string* value,
|
||||
Status* s, MergeContext& merge_context,
|
||||
const Options& options) {
|
||||
for (auto& memtable : memlist_) {
|
||||
if (memtable->Get(key, value, s, merge_context, options)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
void MemTableListVersion::AddIterators(const ReadOptions& options,
|
||||
std::vector<Iterator*>* iterator_list) {
|
||||
for (auto& m : memlist_) {
|
||||
iterator_list->push_back(m->NewIterator(options));
|
||||
}
|
||||
}
|
||||
|
||||
void MemTableListVersion::AddIterators(
|
||||
const ReadOptions& options, MergeIteratorBuilder* merge_iter_builder) {
|
||||
for (auto& m : memlist_) {
|
||||
merge_iter_builder->AddIterator(
|
||||
m->NewIterator(options, merge_iter_builder->GetArena()));
|
||||
}
|
||||
}
|
||||
|
||||
// Sum of the entry counts of every memtable in this version.
uint64_t MemTableListVersion::GetTotalNumEntries() const {
  uint64_t total = 0;
  for (auto it = memlist_.begin(); it != memlist_.end(); ++it) {
    total += (*it)->GetNumEntries();
  }
  return total;
}
|
||||
|
||||
// caller is responsible for referencing m
|
||||
void MemTableListVersion::Add(MemTable* m) {
|
||||
assert(refs_ == 1); // only when refs_ == 1 is MemTableListVersion mutable
|
||||
memlist_.push_front(m);
|
||||
++size_;
|
||||
}
|
||||
|
||||
// caller is responsible for unreferencing m
|
||||
void MemTableListVersion::Remove(MemTable* m) {
|
||||
assert(refs_ == 1); // only when refs_ == 1 is MemTableListVersion mutable
|
||||
memlist_.remove(m);
|
||||
--size_;
|
||||
}
|
||||
|
||||
// Returns true if there is at least one memtable on which flush has
|
||||
// not yet started.
|
||||
bool MemTableList::IsFlushPending() const {
|
||||
if ((flush_requested_ && num_flush_not_started_ >= 1) ||
|
||||
(num_flush_not_started_ >= min_write_buffer_number_to_merge_)) {
|
||||
assert(imm_flush_needed.NoBarrier_Load() != nullptr);
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// Returns the memtables that need to be flushed.
// Scans oldest-to-newest and selects every memtable whose flush has not
// started yet, marking each as in-progress. Appends them to *ret in
// ascending creation-time order. Clears the "flush needed" signal once no
// unstarted memtable remains, and consumes any pending start-flush request.
void MemTableList::PickMemtablesToFlush(autovector<MemTable*>* ret) {
  const auto& memlist = current_->memlist_;
  // memlist_ is newest-first; iterate in reverse to pick oldest first.
  for (auto it = memlist.rbegin(); it != memlist.rend(); ++it) {
    MemTable* m = *it;
    if (!m->flush_in_progress_) {
      assert(!m->flush_completed_);
      num_flush_not_started_--;
      if (num_flush_not_started_ == 0) {
        // Nothing left to start: clear the signal background threads poll.
        imm_flush_needed.Release_Store(nullptr);
      }
      m->flush_in_progress_ = true;  // flushing will start very soon
      ret->push_back(m);
    }
  }
  flush_requested_ = false;  // start-flush request is complete
}
|
||||
|
||||
// Undo a failed flush attempt: return every memtable in `mems` to the
// "not yet flushed" state, drop the reserved output file number from
// pending_outputs, and re-raise the flush-needed signal so the flush is
// retried.
void MemTableList::RollbackMemtableFlush(const autovector<MemTable*>& mems,
                                         uint64_t file_number,
                                         std::set<uint64_t>* pending_outputs) {
  assert(!mems.empty());

  // If the flush was not successful, then just reset state.
  // Maybe a succeeding attempt to flush will be successful.
  for (MemTable* m : mems) {
    assert(m->flush_in_progress_);
    // file_number_ is only assigned on successful flush, so it must be 0.
    assert(m->file_number_ == 0);

    m->flush_in_progress_ = false;
    m->flush_completed_ = false;
    m->edit_.Clear();
    num_flush_not_started_++;
  }
  pending_outputs->erase(file_number);
  // Signal background threads that there is flush work again.
  imm_flush_needed.Release_Store(reinterpret_cast<void *>(1));
}
|
||||
|
||||
// Record a successful flush in the manifest file
|
||||
Status MemTableList::InstallMemtableFlushResults(
|
||||
ColumnFamilyData* cfd, const autovector<MemTable*>& mems, VersionSet* vset,
|
||||
port::Mutex* mu, Logger* info_log, uint64_t file_number,
|
||||
std::set<uint64_t>& pending_outputs, autovector<MemTable*>* to_delete,
|
||||
Directory* db_directory, LogBuffer* log_buffer) {
|
||||
mu->AssertHeld();
|
||||
|
||||
// flush was sucessful
|
||||
for (size_t i = 0; i < mems.size(); ++i) {
|
||||
// All the edits are associated with the first memtable of this batch.
|
||||
assert(i == 0 || mems[i]->GetEdits()->NumEntries() == 0);
|
||||
|
||||
mems[i]->flush_completed_ = true;
|
||||
mems[i]->file_number_ = file_number;
|
||||
}
|
||||
|
||||
// if some other thread is already commiting, then return
|
||||
Status s;
|
||||
if (commit_in_progress_) {
|
||||
return s;
|
||||
}
|
||||
|
||||
// Only a single thread can be executing this piece of code
|
||||
commit_in_progress_ = true;
|
||||
|
||||
// scan all memtables from the earliest, and commit those
|
||||
// (in that order) that have finished flushing. Memetables
|
||||
// are always committed in the order that they were created.
|
||||
while (!current_->memlist_.empty() && s.ok()) {
|
||||
MemTable* m = current_->memlist_.back(); // get the last element
|
||||
if (!m->flush_completed_) {
|
||||
break;
|
||||
}
|
||||
|
||||
LogToBuffer(log_buffer, "[%s] Level-0 commit table #%lu started",
|
||||
cfd->GetName().c_str(), (unsigned long)m->file_number_);
|
||||
|
||||
// this can release and reacquire the mutex.
|
||||
s = vset->LogAndApply(cfd, &m->edit_, mu, db_directory);
|
||||
|
||||
// we will be changing the version in the next code path,
|
||||
// so we better create a new one, since versions are immutable
|
||||
InstallNewVersion();
|
||||
|
||||
// All the later memtables that have the same filenum
|
||||
// are part of the same batch. They can be committed now.
|
||||
uint64_t mem_id = 1; // how many memtables has been flushed.
|
||||
do {
|
||||
if (s.ok()) { // commit new state
|
||||
LogToBuffer(log_buffer,
|
||||
"[%s] Level-0 commit table #%lu: memtable #%lu done",
|
||||
cfd->GetName().c_str(), (unsigned long)m->file_number_,
|
||||
(unsigned long)mem_id);
|
||||
current_->Remove(m);
|
||||
assert(m->file_number_ > 0);
|
||||
|
||||
// pending_outputs can be cleared only after the newly created file
|
||||
// has been written to a committed version so that other concurrently
|
||||
// executing compaction threads do not mistakenly assume that this
|
||||
// file is not live.
|
||||
pending_outputs.erase(m->file_number_);
|
||||
if (m->Unref() != nullptr) {
|
||||
to_delete->push_back(m);
|
||||
}
|
||||
} else {
|
||||
//commit failed. setup state so that we can flush again.
|
||||
Log(info_log,
|
||||
"Level-0 commit table #%lu: memtable #%lu failed",
|
||||
(unsigned long)m->file_number_,
|
||||
(unsigned long)mem_id);
|
||||
m->flush_completed_ = false;
|
||||
m->flush_in_progress_ = false;
|
||||
m->edit_.Clear();
|
||||
num_flush_not_started_++;
|
||||
pending_outputs.erase(m->file_number_);
|
||||
m->file_number_ = 0;
|
||||
imm_flush_needed.Release_Store((void *)1);
|
||||
}
|
||||
++mem_id;
|
||||
} while (!current_->memlist_.empty() && (m = current_->memlist_.back()) &&
|
||||
m->file_number_ == file_number);
|
||||
}
|
||||
commit_in_progress_ = false;
|
||||
return s;
|
||||
}
|
||||
|
||||
// New memtables are inserted at the front of the list.
|
||||
void MemTableList::Add(MemTable* m) {
|
||||
assert(current_->size_ >= num_flush_not_started_);
|
||||
InstallNewVersion();
|
||||
// this method is used to move mutable memtable into an immutable list.
|
||||
// since mutable memtable is already refcounted by the DBImpl,
|
||||
// and when moving to the imutable list we don't unref it,
|
||||
// we don't have to ref the memtable here. we just take over the
|
||||
// reference from the DBImpl.
|
||||
current_->Add(m);
|
||||
m->MarkImmutable();
|
||||
num_flush_not_started_++;
|
||||
if (num_flush_not_started_ == 1) {
|
||||
imm_flush_needed.Release_Store((void *)1);
|
||||
}
|
||||
}
|
||||
|
||||
// Returns an estimate of the number of bytes of data in use.
|
||||
size_t MemTableList::ApproximateMemoryUsage() {
|
||||
size_t size = 0;
|
||||
for (auto& memtable : current_->memlist_) {
|
||||
size += memtable->ApproximateMemoryUsage();
|
||||
}
|
||||
return size;
|
||||
}
|
||||
|
||||
void MemTableList::InstallNewVersion() {
|
||||
if (current_->refs_ == 1) {
|
||||
// we're the only one using the version, just keep using it
|
||||
} else {
|
||||
// somebody else holds the current version, we need to create new one
|
||||
MemTableListVersion* version = current_;
|
||||
current_ = new MemTableListVersion(current_);
|
||||
current_->Ref();
|
||||
version->Unref();
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace rocksdb
|
||||
156
db/memtable_list.h
Normal file
156
db/memtable_list.h
Normal file
@@ -0,0 +1,156 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
#pragma once
|
||||
|
||||
#include <string>
|
||||
#include <list>
|
||||
#include <vector>
|
||||
#include <set>
|
||||
#include <deque>
|
||||
#include "rocksdb/db.h"
|
||||
#include "rocksdb/options.h"
|
||||
#include "rocksdb/iterator.h"
|
||||
|
||||
#include "db/dbformat.h"
|
||||
#include "db/skiplist.h"
|
||||
#include "db/memtable.h"
|
||||
#include "rocksdb/db.h"
|
||||
#include "rocksdb/iterator.h"
|
||||
#include "rocksdb/options.h"
|
||||
#include "util/autovector.h"
|
||||
#include "util/log_buffer.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
class ColumnFamilyData;
|
||||
class InternalKeyComparator;
|
||||
class Mutex;
|
||||
class MergeIteratorBuilder;
|
||||
|
||||
// keeps a list of immutable memtables in a vector. the list is immutable
// if refcount is bigger than one. It is used as a state for Get() and
// Iterator code paths
//
// Memtables are ordered newest-first in memlist_. Each version holds one
// reference on every memtable it contains; the reference is dropped when
// the version itself is destroyed via Unref().
class MemTableListVersion {
 public:
  // Constructs either an empty version or (when old != nullptr) a copy of
  // `old` that takes its own reference on every contained memtable.
  explicit MemTableListVersion(MemTableListVersion* old = nullptr);

  // Reference counting for the version object itself.
  // Unref() appends memtables whose refcount drops to zero to *to_delete;
  // passing nullptr asserts that refs_ will not reach zero.
  void Ref();
  void Unref(autovector<MemTable*>* to_delete = nullptr);

  // Number of memtables in this version.
  int size() const;

  // Search all the memtables starting from the most recent one.
  // Return the most recent value found, if any.
  bool Get(const LookupKey& key, std::string* value, Status* s,
           MergeContext& merge_context, const Options& options);

  // Appends one newly allocated iterator per memtable (newest first);
  // caller owns the iterators.
  void AddIterators(const ReadOptions& options,
                    std::vector<Iterator*>* iterator_list);

  // Same, but registers arena-allocated iterators with the builder.
  void AddIterators(const ReadOptions& options,
                    MergeIteratorBuilder* merge_iter_builder);

  // Sum of entry counts across all memtables.
  uint64_t GetTotalNumEntries() const;

 private:
  // REQUIRE: m is mutable memtable
  void Add(MemTable* m);
  // REQUIRE: m is mutable memtable
  void Remove(MemTable* m);

  friend class MemTableList;
  // Newest memtable at the front.
  std::list<MemTable*> memlist_;
  int size_ = 0;
  int refs_ = 0;
};
|
||||
|
||||
// This class stores references to all the immutable memtables.
// The memtables are flushed to L0 as soon as possible and in
// any order. If there are more than one immutable memtable, their
// flushes can occur concurrently. However, they are 'committed'
// to the manifest in FIFO order to maintain correctness and
// recoverability from a crash.
class MemTableList {
 public:
  // A list of memtables. `min_write_buffer_number_to_merge` is the number
  // of unstarted memtables required before IsFlushPending() reports work
  // (unless a flush was explicitly requested).
  explicit MemTableList(int min_write_buffer_number_to_merge)
      : min_write_buffer_number_to_merge_(min_write_buffer_number_to_merge),
        current_(new MemTableListVersion()),
        num_flush_not_started_(0),
        commit_in_progress_(false),
        flush_requested_(false) {
    imm_flush_needed.Release_Store(nullptr);
    current_->Ref();
  }
  ~MemTableList() {}

  // The current (mutable-by-owner) version of the list.
  MemTableListVersion* current() { return current_; }

  // so that background threads can detect non-nullptr pointer to
  // determine whether there is anything more to start flushing.
  port::AtomicPointer imm_flush_needed;

  // Returns the total number of memtables in the list
  int size() const;

  // Returns true if there is at least one memtable on which flush has
  // not yet started.
  bool IsFlushPending() const;

  // Returns the earliest memtables that needs to be flushed. The returned
  // memtables are guaranteed to be in the ascending order of created time.
  void PickMemtablesToFlush(autovector<MemTable*>* mems);

  // Reset status of the given memtable list back to pending state so that
  // they can get picked up again on the next round of flush.
  void RollbackMemtableFlush(const autovector<MemTable*>& mems,
                             uint64_t file_number,
                             std::set<uint64_t>* pending_outputs);

  // Commit a successful flush in the manifest file.
  // REQUIRES: *mu held; may release and reacquire it.
  Status InstallMemtableFlushResults(ColumnFamilyData* cfd,
                                     const autovector<MemTable*>& m,
                                     VersionSet* vset, port::Mutex* mu,
                                     Logger* info_log, uint64_t file_number,
                                     std::set<uint64_t>& pending_outputs,
                                     autovector<MemTable*>* to_delete,
                                     Directory* db_directory,
                                     LogBuffer* log_buffer);

  // New memtables are inserted at the front of the list.
  // Takes ownership of the reference held on *m by the caller of Add().
  void Add(MemTable* m);

  // Returns an estimate of the number of bytes of data in use.
  size_t ApproximateMemoryUsage();

  // Request a flush of all existing memtables to storage
  void FlushRequested() { flush_requested_ = true; }

  // Copying allowed
  // MemTableList(const MemTableList&);
  // void operator=(const MemTableList&);
  // NOTE(review): the copy ctor/assignment above are commented out, so the
  // implicit (shallow) copies are currently allowed — confirm whether
  // copying should instead be disallowed for this class.

 private:
  // DB mutex held
  void InstallNewVersion();

  int min_write_buffer_number_to_merge_;

  MemTableListVersion* current_;

  // the number of elements that still need flushing
  int num_flush_not_started_;

  // committing in progress
  bool commit_in_progress_;

  // Requested a flush of all memtables to storage
  bool flush_requested_;

};
|
||||
|
||||
} // namespace rocksdb
|
||||
69
db/merge_context.h
Normal file
69
db/merge_context.h
Normal file
@@ -0,0 +1,69 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
#pragma once
|
||||
#include "db/dbformat.h"
|
||||
#include "rocksdb/slice.h"
|
||||
#include <string>
|
||||
#include <deque>
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
// Shared empty result returned by MergeContext::GetOperands() when no
// operand has been pushed. NOTE(review): a const namespace-scope object in
// a header has internal linkage, so each translation unit gets its own
// copy — harmless, but worth confirming this placement is intentional.
const std::deque<std::string> empty_operand_list;
|
||||
|
||||
// The merge context for merging a user key.
// When doing a Get(), DB will create such a class and pass it when
// issuing Get() operation to memtables and version_set. The operands
// will be fetched from the context when issuing partial of full merge.
//
// The operand deque is allocated lazily (on the first PushOperand /
// PushPartialMergeResult) so that the common no-merge Get() path pays
// nothing. Operands are pushed to the FRONT, so index 0 is the most
// recently pushed operand.
class MergeContext {
 public:
  // Clear all the operands
  void Clear() {
    if (operand_list) {
      operand_list->clear();
    }
  }
  // Replace all operands with merge_result, which are expected to be the
  // merge result of them.
  // NOTE: merge_result is moved from; caller's string is left unspecified.
  void PushPartialMergeResult(std::string& merge_result) {
    assert (operand_list);
    operand_list->clear();
    operand_list->push_front(std::move(merge_result));
  }
  // Push a merge operand
  void PushOperand(const Slice& operand_slice) {
    Initialize();
    operand_list->push_front(operand_slice.ToString());
  }
  // return total number of operands in the list
  size_t GetNumOperands() const {
    if (!operand_list) {
      return 0;
    }
    return operand_list->size();
  }
  // Get the operand at the index.
  // REQUIRES: at least one operand has been pushed and index is in range.
  Slice GetOperand(int index) const {
    assert (operand_list);
    return (*operand_list)[index];
  }
  // Return all the operands.
  const std::deque<std::string>& GetOperands() const {
    if (!operand_list) {
      return empty_operand_list;
    }
    return *operand_list;
  }
 private:
  // Allocate the operand deque on first use.
  void Initialize() {
    if (!operand_list) {
      operand_list.reset(new std::deque<std::string>());
    }
  }
  // Lazily-created operand storage; nullptr until the first push.
  std::unique_ptr<std::deque<std::string>> operand_list;
};
|
||||
|
||||
} // namespace rocksdb
|
||||
|
||||
209
db/merge_helper.cc
Normal file
209
db/merge_helper.cc
Normal file
@@ -0,0 +1,209 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
#include "merge_helper.h"
|
||||
#include "db/dbformat.h"
|
||||
#include "rocksdb/comparator.h"
|
||||
#include "rocksdb/db.h"
|
||||
#include "rocksdb/merge_operator.h"
|
||||
#include "util/statistics.h"
|
||||
#include <string>
|
||||
#include <stdio.h>
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
// PRE: iter points to the first merge type entry
// POST: iter points to the first entry beyond the merge process (or the end)
// keys_, operands_ are updated to reflect the merge result.
// keys_ stores the list of keys encountered while merging.
// operands_ stores the list of merge operands encountered while merging.
// keys_[i] corresponds to operands_[i] for each i.
//
// Iteration stops at: a corrupted key, a Put/Delete, a different user key,
// an entry visible to an earlier snapshot (sequence <= stop_before), or the
// end of iteration. success_ records whether a full merge was completed.
void MergeHelper::MergeUntil(Iterator* iter, SequenceNumber stop_before,
                             bool at_bottom, Statistics* stats, int* steps) {
  // Get a copy of the internal key, before it's invalidated by iter->Next()
  // Also maintain the list of merge operands seen.
  keys_.clear();
  operands_.clear();
  keys_.push_front(iter->key().ToString());
  operands_.push_front(iter->value().ToString());

  success_ = false;   // Will become true if we hit Put/Delete or bottom

  // We need to parse the internal key again as the parsed key is
  // backed by the internal key!
  // Assume no internal key corruption as it has been successfully parsed
  // by the caller.
  // Invariant: keys_.back() will not change. Hence, orig_ikey is always valid.
  ParsedInternalKey orig_ikey;
  ParseInternalKey(keys_.back(), &orig_ikey);

  bool hit_the_next_user_key = false;
  std::string merge_result;  // Temporary value for merge results
  if (steps) {
    ++(*steps);
  }
  for (iter->Next(); iter->Valid(); iter->Next()) {
    ParsedInternalKey ikey;
    assert(operands_.size() >= 1);        // Should be invariants!
    assert(keys_.size() == operands_.size());

    if (!ParseInternalKey(iter->key(), &ikey)) {
      // stop at corrupted key
      if (assert_valid_internal_key_) {
        assert(!"corrupted internal key is not expected");
      }
      break;
    }

    if (user_comparator_->Compare(ikey.user_key, orig_ikey.user_key) != 0) {
      // hit a different user key, stop right here
      hit_the_next_user_key = true;
      break;
    }

    if (stop_before && ikey.sequence <= stop_before) {
      // hit an entry that's visible by the previous snapshot, can't touch that
      break;
    }

    // At this point we are guaranteed that we need to process this key.

    if (kTypeDeletion == ikey.type) {
      // hit a delete
      //   => merge nullptr with operands_
      //   => store result in operands_.back() (and update keys_.back())
      //   => change the entry type to kTypeValue for keys_.back()
      // We are done! Return a success if the merge passes.
      success_ = user_merge_operator_->FullMerge(ikey.user_key, nullptr,
                                                 operands_, &merge_result,
                                                 logger_);

      // We store the result in keys_.back() and operands_.back()
      // if nothing went wrong (i.e.: no operand corruption on disk)
      if (success_) {
        std::string& key = keys_.back();  // The original key encountered
        orig_ikey.type = kTypeValue;
        // Rewrite the stored internal key in place: same sequence, new type.
        UpdateInternalKey(&key[0], key.size(),
                          orig_ikey.sequence, orig_ikey.type);
        swap(operands_.back(), merge_result);
      } else {
        RecordTick(stats, NUMBER_MERGE_FAILURES);
      }

      // move iter to the next entry (before doing anything else)
      iter->Next();
      if (steps) {
        ++(*steps);
      }
      return;
    }

    if (kTypeValue == ikey.type) {
      // hit a put
      //   => merge the put value with operands_
      //   => store result in operands_.back() (and update keys_.back())
      //   => change the entry type to kTypeValue for keys_.back()
      // We are done! Success!
      const Slice value = iter->value();
      success_ = user_merge_operator_->FullMerge(ikey.user_key, &value,
                                                 operands_, &merge_result,
                                                 logger_);

      // We store the result in keys_.back() and operands_.back()
      // if nothing went wrong (i.e.: no operand corruption on disk)
      if (success_) {
        std::string& key = keys_.back();  // The original key encountered
        orig_ikey.type = kTypeValue;
        // Rewrite the stored internal key in place: same sequence, new type.
        UpdateInternalKey(&key[0], key.size(),
                          orig_ikey.sequence, orig_ikey.type);
        swap(operands_.back(), merge_result);
      } else {
        RecordTick(stats, NUMBER_MERGE_FAILURES);
      }

      // move iter to the next entry
      iter->Next();
      if (steps) {
        ++(*steps);
      }
      return;
    }

    if (kTypeMerge == ikey.type) {
      // hit a merge
      //   => merge the operand into the front of the operands_ list
      //   => use the user's associative merge function to determine how.
      //   => then continue because we haven't yet seen a Put/Delete.
      assert(!operands_.empty()); // Should have at least one element in it

      // keep queuing keys and operands until we either meet a put / delete
      // request or later did a partial merge.
      keys_.push_front(iter->key().ToString());
      operands_.push_front(iter->value().ToString());
      if (steps) {
        ++(*steps);
      }
    }
  }

  // We are sure we have seen this key's entire history if we are at the
  // last level and exhausted all internal keys of this user key.
  // NOTE: !iter->Valid() does not necessarily mean we hit the
  // beginning of a user key, as versions of a user key might be
  // split into multiple files (even files on the same level)
  // and some files might not be included in the compaction/merge.
  //
  // There are also cases where we have seen the root of history of this
  // key without being sure of it. Then, we simply miss the opportunity
  // to combine the keys. Since VersionSet::SetupOtherInputs() always makes
  // sure that all merge-operands on the same level get compacted together,
  // this will simply lead to these merge operands moving to the next level.
  //
  // So, we only perform the following logic (to merge all operands together
  // without a Put/Delete) if we are certain that we have seen the end of key.
  bool surely_seen_the_beginning = hit_the_next_user_key && at_bottom;
  if (surely_seen_the_beginning) {
    // do a final merge with nullptr as the existing value and say
    // bye to the merge type (it's now converted to a Put)
    assert(kTypeMerge == orig_ikey.type);
    assert(operands_.size() >= 1);
    assert(operands_.size() == keys_.size());
    success_ = user_merge_operator_->FullMerge(orig_ikey.user_key, nullptr,
                                               operands_, &merge_result,
                                               logger_);

    if (success_) {
      std::string& key = keys_.back();  // The original key encountered
      orig_ikey.type = kTypeValue;
      UpdateInternalKey(&key[0], key.size(),
                        orig_ikey.sequence, orig_ikey.type);

      // The final value() is always stored in operands_.back()
      swap(operands_.back(),merge_result);
    } else {
      RecordTick(stats, NUMBER_MERGE_FAILURES);
      // Do nothing if not success_. Leave keys() and operands() as they are.
    }
  } else {
    // We haven't seen the beginning of the key nor a Put/Delete.
    // Attempt to use the user's associative merge function to
    // merge the stacked merge operands into a single operand.

    if (operands_.size() >= 2 &&
        operands_.size() >= min_partial_merge_operands_ &&
        user_merge_operator_->PartialMergeMulti(
            orig_ikey.user_key,
            std::deque<Slice>(operands_.begin(), operands_.end()),
            &merge_result, logger_)) {
      // Merging of operands (associative merge) was successful.
      // Replace operands with the merge result; keep only the most recent
      // key (keys_.front()'s slot collapses onto the last remaining entry).
      operands_.clear();
      operands_.push_front(std::move(merge_result));
      keys_.erase(keys_.begin(), keys_.end() - 1);
    }
  }
}
|
||||
|
||||
} // namespace rocksdb
|
||||
105
db/merge_helper.h
Normal file
105
db/merge_helper.h
Normal file
@@ -0,0 +1,105 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
#ifndef MERGE_HELPER_H
|
||||
#define MERGE_HELPER_H
|
||||
|
||||
#include "db/dbformat.h"
|
||||
#include "rocksdb/slice.h"
|
||||
#include <string>
|
||||
#include <deque>
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
class Comparator;
|
||||
class Iterator;
|
||||
class Logger;
|
||||
class MergeOperator;
|
||||
class Statistics;
|
||||
|
||||
// Drives the merge-operand resolution loop used by Get() and compaction.
// Holds the scratch state (keys_/operands_) produced by MergeUntil().
class MergeHelper {
 public:
  // user_comparator:           compares user keys to detect key boundaries
  // user_merge_operator:       the client's merge operator
  // min_partial_merge_operands: minimum stacked operands before attempting
  //                             a partial (associative) merge
  // assert_valid_internal_key: assert (instead of silently stopping) on a
  //                             corrupted internal key
  MergeHelper(const Comparator* user_comparator,
              const MergeOperator* user_merge_operator, Logger* logger,
              unsigned min_partial_merge_operands,
              bool assert_valid_internal_key)
      : user_comparator_(user_comparator),
        user_merge_operator_(user_merge_operator),
        logger_(logger),
        min_partial_merge_operands_(min_partial_merge_operands),
        assert_valid_internal_key_(assert_valid_internal_key),
        keys_(),
        operands_(),
        success_(false) {}

  // Merge entries until we hit
  //     - a corrupted key
  //     - a Put/Delete,
  //     - a different user key,
  //     - a specific sequence number (snapshot boundary),
  //  or - the end of iteration
  // iter:        (IN)  points to the first merge type entry
  //              (OUT) points to the first entry not included in the merge process
  // stop_before: (IN)  a sequence number that merge should not cross.
  //                    0 means no restriction
  // at_bottom:   (IN)  true if the iterator covers the bottom level, which means
  //                    we could reach the start of the history of this user key.
  void MergeUntil(Iterator* iter, SequenceNumber stop_before = 0,
                  bool at_bottom = false, Statistics* stats = nullptr,
                  int* steps = nullptr);

  // Query the merge result
  // These are valid until the next MergeUntil call
  // If the merging was successful:
  //   - IsSuccess() will be true
  //   - key() will have the latest sequence number of the merges.
  //           The type will be Put or Merge. See IMPORTANT 1 note, below.
  //   - value() will be the result of merging all the operands together
  //   - The user should ignore keys() and values().
  //
  //   IMPORTANT 1: the key type could change after the MergeUntil call.
  //        Put/Delete + Merge + ... + Merge => Put
  //        Merge + ... + Merge => Merge
  //
  // If the merge operator is not associative, and if a Put/Delete is not found
  // then the merging will be unsuccessful. In this case:
  //   - IsSuccess() will be false
  //   - keys() contains the list of internal keys seen in order of iteration.
  //   - values() contains the list of values (merges) seen in the same order.
  //              values() is parallel to keys() so that the first entry in
  //              keys() is the key associated with the first entry in values()
  //              and so on. These lists will be the same length.
  //              All of these pairs will be merges over the same user key.
  //              See IMPORTANT 2 note below.
  //   - The user should ignore key() and value().
  //
  //   IMPORTANT 2: The entries were traversed in order from BACK to FRONT.
  //                So keys().back() was the first key seen by iterator.
  // TODO: Re-style this comment to be like the first one
  bool IsSuccess() { return success_; }
  Slice key() { assert(success_); return Slice(keys_.back()); }
  Slice value() { assert(success_); return Slice(operands_.back()); }
  const std::deque<std::string>& keys() { assert(!success_); return keys_; }
  const std::deque<std::string>& values() {
    assert(!success_); return operands_;
  }

 private:
  const Comparator* user_comparator_;
  const MergeOperator* user_merge_operator_;
  Logger* logger_;
  unsigned min_partial_merge_operands_;
  bool assert_valid_internal_key_; // enforce no internal key corruption?

  // the scratch area that holds the result of MergeUntil
  // valid up to the next MergeUntil call
  std::deque<std::string> keys_;    // Keeps track of the sequence of keys seen
  std::deque<std::string> operands_;  // Parallel with keys_; stores the values
  bool success_;
};
|
||||
|
||||
} // namespace rocksdb
|
||||
|
||||
#endif
|
||||
77
db/merge_operator.cc
Normal file
77
db/merge_operator.cc
Normal file
@@ -0,0 +1,77 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
/**
|
||||
* Back-end implementation details specific to the Merge Operator.
|
||||
*/
|
||||
|
||||
#include "rocksdb/merge_operator.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
// The default implementation of PartialMergeMulti, which invokes
// PartialMerge multiple times internally and merges two operands at
// a time. Returns false (leaving *new_value unspecified) as soon as any
// pairwise PartialMerge fails.
bool MergeOperator::PartialMergeMulti(const Slice& key,
                                      const std::deque<Slice>& operand_list,
                                      std::string* new_value,
                                      Logger* logger) const {
  assert(operand_list.size() >= 2);
  // Simply loop through the operands
  std::string temp_value;
  Slice temp_slice(operand_list[0]);

  for (size_t i = 1; i < operand_list.size(); ++i) {
    auto& operand = operand_list[i];
    if (!PartialMerge(key, temp_slice, operand, &temp_value, logger)) {
      return false;
    }
    // Move the intermediate result into *new_value and re-point temp_slice
    // at it; PartialMerge writes into temp_value, so the slice it reads
    // from is never the buffer being written.
    swap(temp_value, *new_value);
    temp_slice = Slice(*new_value);
  }

  // The result will be in *new_value. All merges succeeded.
  return true;
}
|
||||
|
||||
// Given a "real" merge from the library, call the user's
// associative merge function one-by-one on each of the operands.
// NOTE: It is assumed that the client's merge-operator will handle any errors.
// Returns false (leaving *new_value unspecified) as soon as any Merge fails.
bool AssociativeMergeOperator::FullMerge(
    const Slice& key,
    const Slice* existing_value,
    const std::deque<std::string>& operand_list,
    std::string* new_value,
    Logger* logger) const {

  // Simply loop through the operands
  Slice temp_existing;
  std::string temp_value;
  for (const auto& operand : operand_list) {
    Slice value(operand);
    if (!Merge(key, existing_value, value, &temp_value, logger)) {
      return false;
    }
    // Fold the intermediate result into *new_value and treat it as the
    // existing value for the next round; Merge writes into temp_value,
    // so it never writes to the buffer existing_value points into.
    swap(temp_value, *new_value);
    temp_existing = Slice(*new_value);
    existing_value = &temp_existing;
  }

  // The result will be in *new_value. All merges succeeded.
  return true;
}
|
||||
|
||||
// Call the user defined simple merge on the operands;
// NOTE: It is assumed that the client's merge-operator will handle any errors.
bool AssociativeMergeOperator::PartialMerge(
    const Slice& key,
    const Slice& left_operand,
    const Slice& right_operand,
    std::string* new_value,
    Logger* logger) const {
  // For an associative operator, merging two operands is the same as
  // merging an operand into an existing value.
  return Merge(key, &left_operand, right_operand, new_value, logger);
}
|
||||
|
||||
} // namespace rocksdb
|
||||
472
db/merge_test.cc
Normal file
472
db/merge_test.cc
Normal file
@@ -0,0 +1,472 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
#include <assert.h>
|
||||
#include <memory>
|
||||
#include <iostream>
|
||||
|
||||
#include "rocksdb/cache.h"
|
||||
#include "rocksdb/comparator.h"
|
||||
#include "rocksdb/db.h"
|
||||
#include "rocksdb/env.h"
|
||||
#include "rocksdb/merge_operator.h"
|
||||
#include "db/dbformat.h"
|
||||
#include "db/db_impl.h"
|
||||
#include "db/write_batch_internal.h"
|
||||
#include "utilities/merge_operators.h"
|
||||
#include "util/testharness.h"
|
||||
#include "utilities/db_ttl.h"
|
||||
|
||||
using namespace std;
|
||||
using namespace rocksdb;
|
||||
|
||||
namespace {
// Number of times CountMergeOperator::Merge has run since the last reset.
// The tests use this to verify exactly when the DB applies merges.
int numMergeOperatorCalls;
void resetNumMergeOperatorCalls() {
  numMergeOperatorCalls = 0;
}

// Number of times CountMergeOperator::PartialMergeMulti has run since the
// last reset.
int num_partial_merge_calls;
void resetNumPartialMergeCalls() {
  num_partial_merge_calls = 0;
}
}
|
||||
|
||||
class CountMergeOperator : public AssociativeMergeOperator {
|
||||
public:
|
||||
CountMergeOperator() {
|
||||
mergeOperator_ = MergeOperators::CreateUInt64AddOperator();
|
||||
}
|
||||
|
||||
virtual bool Merge(const Slice& key,
|
||||
const Slice* existing_value,
|
||||
const Slice& value,
|
||||
std::string* new_value,
|
||||
Logger* logger) const override {
|
||||
++numMergeOperatorCalls;
|
||||
if (existing_value == nullptr) {
|
||||
new_value->assign(value.data(), value.size());
|
||||
return true;
|
||||
}
|
||||
|
||||
return mergeOperator_->PartialMerge(
|
||||
key,
|
||||
*existing_value,
|
||||
value,
|
||||
new_value,
|
||||
logger);
|
||||
}
|
||||
|
||||
virtual bool PartialMergeMulti(const Slice& key,
|
||||
const std::deque<Slice>& operand_list,
|
||||
std::string* new_value, Logger* logger) const {
|
||||
++num_partial_merge_calls;
|
||||
return mergeOperator_->PartialMergeMulti(key, operand_list, new_value,
|
||||
logger);
|
||||
}
|
||||
|
||||
virtual const char* Name() const override {
|
||||
return "UInt64AddOperator";
|
||||
}
|
||||
|
||||
private:
|
||||
std::shared_ptr<MergeOperator> mergeOperator_;
|
||||
};
|
||||
|
||||
namespace {
// Open (creating if necessary) a fresh DB at `dbname`, configured with the
// counting merge operator. Any pre-existing DB at that path is destroyed
// first. Aborts (assert) on open failure.
std::shared_ptr<DB> OpenDb(const string& dbname, const bool ttl = false,
                           const size_t max_successive_merges = 0,
                           const uint32_t min_partial_merge_operands = 2) {
  DB* db;
  Options options;
  options.create_if_missing = true;
  options.merge_operator = std::make_shared<CountMergeOperator>();
  options.max_successive_merges = max_successive_merges;
  options.min_partial_merge_operands = min_partial_merge_operands;
  Status s;
  // Start from a clean slate so counter expectations are deterministic.
  DestroyDB(dbname, Options());
  if (ttl) {
    cout << "Opening database with TTL\n";
    DBWithTTL* db_with_ttl;
    s = DBWithTTL::Open(options, dbname, &db_with_ttl);
    db = db_with_ttl;
  } else {
    s = DB::Open(options, dbname, &db);
  }
  if (!s.ok()) {
    cerr << s.ToString() << endl;
    assert(false);
  }
  return std::shared_ptr<DB>(db);
}
}  // namespace
|
||||
|
||||
// Imagine we are maintaining a set of uint64 counters.
|
||||
// Each counter has a distinct name. And we would like
|
||||
// to support four high level operations:
|
||||
// set, add, get and remove
|
||||
// This is a quick implementation without a Merge operation.
|
||||
class Counters {
|
||||
|
||||
protected:
|
||||
std::shared_ptr<DB> db_;
|
||||
|
||||
WriteOptions put_option_;
|
||||
ReadOptions get_option_;
|
||||
WriteOptions delete_option_;
|
||||
|
||||
uint64_t default_;
|
||||
|
||||
public:
|
||||
explicit Counters(std::shared_ptr<DB> db, uint64_t defaultCount = 0)
|
||||
: db_(db),
|
||||
put_option_(),
|
||||
get_option_(),
|
||||
delete_option_(),
|
||||
default_(defaultCount) {
|
||||
assert(db_);
|
||||
}
|
||||
|
||||
virtual ~Counters() {}
|
||||
|
||||
// public interface of Counters.
|
||||
// All four functions return false
|
||||
// if the underlying level db operation failed.
|
||||
|
||||
// mapped to a levedb Put
|
||||
bool set(const string& key, uint64_t value) {
|
||||
// just treat the internal rep of int64 as the string
|
||||
Slice slice((char *)&value, sizeof(value));
|
||||
auto s = db_->Put(put_option_, key, slice);
|
||||
|
||||
if (s.ok()) {
|
||||
return true;
|
||||
} else {
|
||||
cerr << s.ToString() << endl;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// mapped to a rocksdb Delete
|
||||
bool remove(const string& key) {
|
||||
auto s = db_->Delete(delete_option_, key);
|
||||
|
||||
if (s.ok()) {
|
||||
return true;
|
||||
} else {
|
||||
cerr << s.ToString() << std::endl;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// mapped to a rocksdb Get
|
||||
bool get(const string& key, uint64_t *value) {
|
||||
string str;
|
||||
auto s = db_->Get(get_option_, key, &str);
|
||||
|
||||
if (s.IsNotFound()) {
|
||||
// return default value if not found;
|
||||
*value = default_;
|
||||
return true;
|
||||
} else if (s.ok()) {
|
||||
// deserialization
|
||||
if (str.size() != sizeof(uint64_t)) {
|
||||
cerr << "value corruption\n";
|
||||
return false;
|
||||
}
|
||||
*value = DecodeFixed64(&str[0]);
|
||||
return true;
|
||||
} else {
|
||||
cerr << s.ToString() << std::endl;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// 'add' is implemented as get -> modify -> set
|
||||
// An alternative is a single merge operation, see MergeBasedCounters
|
||||
virtual bool add(const string& key, uint64_t value) {
|
||||
uint64_t base = default_;
|
||||
return get(key, &base) && set(key, base + value);
|
||||
}
|
||||
|
||||
|
||||
// convenience functions for testing
|
||||
void assert_set(const string& key, uint64_t value) {
|
||||
assert(set(key, value));
|
||||
}
|
||||
|
||||
void assert_remove(const string& key) {
|
||||
assert(remove(key));
|
||||
}
|
||||
|
||||
uint64_t assert_get(const string& key) {
|
||||
uint64_t value = default_;
|
||||
int result = get(key, &value);
|
||||
assert(result);
|
||||
if (result == 0) exit(1); // Disable unused variable warning.
|
||||
return value;
|
||||
}
|
||||
|
||||
void assert_add(const string& key, uint64_t value) {
|
||||
int result = add(key, value);
|
||||
assert(result);
|
||||
if (result == 0) exit(1); // Disable unused variable warning.
|
||||
}
|
||||
};
|
||||
|
||||
// Implement 'add' directly with the new Merge operation
class MergeBasedCounters : public Counters {
 private:
  WriteOptions merge_option_; // for merge

 public:
  explicit MergeBasedCounters(std::shared_ptr<DB> db, uint64_t defaultCount = 0)
      : Counters(db, defaultCount),
        merge_option_() {
  }

  // mapped to a rocksdb Merge operation
  virtual bool add(const string& key, uint64_t value) override {
    // Encode the delta in the fixed64 little-endian format that the
    // UInt64Add merge operator expects.
    char encoded[sizeof(uint64_t)];
    EncodeFixed64(encoded, value);
    Slice slice(encoded, sizeof(uint64_t));
    auto s = db_->Merge(merge_option_, key, slice);

    if (s.ok()) {
      return true;
    } else {
      cerr << s.ToString() << endl;
      return false;
    }
  }
};
|
||||
|
||||
namespace {
|
||||
// Print every key/value pair in the database to stdout, decoding each
// value as a fixed64 counter.
void dumpDb(DB* db) {
  unique_ptr<Iterator> iter(db->NewIterator(ReadOptions()));
  iter->SeekToFirst();
  while (iter->Valid()) {
    const uint64_t counter = DecodeFixed64(iter->value().data());
    cout << iter->key().ToString() << ": " << counter << endl;
    iter->Next();
  }
  assert(iter->status().ok());  // Check for any errors found during the scan
}
|
||||
|
||||
// Exercise the full Counters interface (set/get/add/remove) against `db`,
// optionally flushing/compacting between steps so merges are exercised in
// SST files as well as in the memtable.
void testCounters(Counters& counters, DB* db, bool test_compaction) {

  FlushOptions o;
  o.wait = true;

  counters.assert_set("a", 1);

  if (test_compaction) db->Flush(o);

  assert(counters.assert_get("a") == 1);

  counters.assert_remove("b");

  // default value is 0 if non-existent
  assert(counters.assert_get("b") == 0);

  counters.assert_add("a", 2);

  if (test_compaction) db->Flush(o);

  // 1+2 = 3
  assert(counters.assert_get("a")== 3);

  dumpDb(db);

  std::cout << "1\n";

  // 1+...+49 = ?
  uint64_t sum = 0;
  for (int i = 1; i < 50; i++) {
    counters.assert_add("b", i);
    sum += i;
  }
  assert(counters.assert_get("b") == sum);

  std::cout << "2\n";
  dumpDb(db);

  std::cout << "3\n";

  if (test_compaction) {
    db->Flush(o);

    cout << "Compaction started ...\n";
    db->CompactRange(nullptr, nullptr);
    cout << "Compaction ended\n";

    dumpDb(db);

    // Values must survive the compaction unchanged.
    assert(counters.assert_get("a")== 3);
    assert(counters.assert_get("b") == sum);
  }
}
|
||||
|
||||
// Verify in-memtable successive merging: the merge operator fires only
// once a chain of max_num_merges operands has accumulated (folding the
// whole chain), and Get() performs just the remaining unfolded merges.
void testSuccessiveMerge(
    Counters& counters, int max_num_merges, int num_merges) {

  counters.assert_remove("z");
  uint64_t sum = 0;

  for (int i = 1; i <= num_merges; ++i) {
    resetNumMergeOperatorCalls();
    counters.assert_add("z", i);
    sum += i;

    if (i % (max_num_merges + 1) == 0) {
      // The cap was just reached: the entire chain collapses now.
      assert(numMergeOperatorCalls == max_num_merges + 1);
    } else {
      assert(numMergeOperatorCalls == 0);
    }

    resetNumMergeOperatorCalls();
    assert(counters.assert_get("z") == sum);
    // Get() merges only what is still unfolded since the last collapse.
    assert(numMergeOperatorCalls == i % (max_num_merges + 1));
  }
}
|
||||
|
||||
// Verify when compaction uses PartialMergeMulti: it should run once when
// the number of stacked merge operands is in [min_merge, max_merge], and
// never when a Put terminates the operand stack (FullMerge applies then).
void testPartialMerge(Counters* counters, DB* db, int max_merge, int min_merge,
                      int count) {
  FlushOptions o;
  o.wait = true;

  // Test case 1: partial merge should be called when the number of merge
  // operands exceeds the threshold.
  uint64_t tmp_sum = 0;
  resetNumPartialMergeCalls();
  for (int i = 1; i <= count; i++) {
    counters->assert_add("b", i);
    tmp_sum += i;
  }
  db->Flush(o);
  db->CompactRange(nullptr, nullptr);
  ASSERT_EQ(tmp_sum, counters->assert_get("b"));
  if (count > max_merge) {
    // in this case, FullMerge should be called instead.
    ASSERT_EQ(num_partial_merge_calls, 0);
  } else {
    // if count >= min_merge, then partial merge should be called once.
    ASSERT_EQ((count >= min_merge), (num_partial_merge_calls == 1));
  }

  // Test case 2: partial merge should not be called when a put is found.
  resetNumPartialMergeCalls();
  tmp_sum = 0;
  db->Put(rocksdb::WriteOptions(), "c", "10");
  for (int i = 1; i <= count; i++) {
    counters->assert_add("c", i);
    tmp_sum += i;
  }
  db->Flush(o);
  db->CompactRange(nullptr, nullptr);
  ASSERT_EQ(tmp_sum, counters->assert_get("c"));
  ASSERT_EQ(num_partial_merge_calls, 0);
}
|
||||
|
||||
// Verify that merge operands queued in a single WriteBatch are folded
// eagerly while the batch is applied to the memtable (once max_num_merges
// successive merges accumulate), and that Get() performs only the
// leftover merges.
void testSingleBatchSuccessiveMerge(
    DB* db,
    int max_num_merges,
    int num_merges) {
  assert(num_merges > max_num_merges);

  Slice key("BatchSuccessiveMerge");
  uint64_t merge_value = 1;
  // Fix: encode with the fixed little-endian layout that DecodeFixed64
  // expects below. The old raw (char*)&merge_value cast exposed the
  // host's native byte order, breaking the round-trip on big-endian.
  char encoded[sizeof(uint64_t)];
  EncodeFixed64(encoded, merge_value);
  Slice merge_value_slice(encoded, sizeof(encoded));

  // Create the batch
  WriteBatch batch;
  for (int i = 0; i < num_merges; ++i) {
    batch.Merge(key, merge_value_slice);
  }

  // Apply to memtable and count the number of merges
  resetNumMergeOperatorCalls();
  {
    Status s = db->Write(WriteOptions(), &batch);
    assert(s.ok());
  }
  // Every full chain of (max_num_merges + 1) operands collapses eagerly.
  assert(numMergeOperatorCalls ==
         num_merges - (num_merges % (max_num_merges + 1)));

  // Get the value
  resetNumMergeOperatorCalls();
  string get_value_str;
  {
    Status s = db->Get(ReadOptions(), key, &get_value_str);
    assert(s.ok());
  }
  assert(get_value_str.size() == sizeof(uint64_t));
  uint64_t get_value = DecodeFixed64(&get_value_str[0]);
  ASSERT_EQ(get_value, num_merges * merge_value);
  // Get() merges only the operands left unfolded by the batch write.
  ASSERT_EQ(numMergeOperatorCalls, (num_merges % (max_num_merges + 1)));
}
|
||||
|
||||
// Top-level driver: exercises read-modify-write counters, merge-based
// counters, in-memtable successive merging, and partial merging against a
// database at `dbname` (optionally wrapped with TTL). Passing any extra
// CLI argument (argc > 1) turns compaction testing on.
void runTest(int argc, const string& dbname, const bool use_ttl = false) {
  auto db = OpenDb(dbname, use_ttl);

  {
    cout << "Test read-modify-write counters... \n";
    Counters counters(db, 0);
    testCounters(counters, db.get(), true);
  }

  bool compact = false;
  if (argc > 1) {
    compact = true;
    cout << "Turn on Compaction\n";
  }

  {
    cout << "Test merge-based counters... \n";
    MergeBasedCounters counters(db, 0);
    testCounters(counters, db.get(), compact);
  }

  // Close and wipe the DB before re-opening with different merge options.
  DestroyDB(dbname, Options());
  db.reset();

  {
    cout << "Test merge in memtable... \n";
    size_t max_merge = 5;
    auto db = OpenDb(dbname, use_ttl, max_merge);
    MergeBasedCounters counters(db, 0);
    testCounters(counters, db.get(), compact);
    testSuccessiveMerge(counters, max_merge, max_merge * 2);
    testSingleBatchSuccessiveMerge(db.get(), 5, 7);
    DestroyDB(dbname, Options());
  }

  {
    cout << "Test Partial-Merge\n";
    size_t max_merge = 100;
    // Sweep min_merge and operand counts just below/at/above the threshold.
    for (uint32_t min_merge = 5; min_merge < 25; min_merge += 5) {
      for (uint32_t count = min_merge - 1; count <= min_merge + 1; count++) {
        auto db = OpenDb(dbname, use_ttl, max_merge, min_merge);
        MergeBasedCounters counters(db, 0);
        testPartialMerge(&counters, db.get(), max_merge, min_merge, count);
        DestroyDB(dbname, Options());
      }
      {
        // Well above the threshold (but still <= max_merge).
        auto db = OpenDb(dbname, use_ttl, max_merge, min_merge);
        MergeBasedCounters counters(db, 0);
        testPartialMerge(&counters, db.get(), max_merge, min_merge,
                         min_merge * 10);
        DestroyDB(dbname, Options());
      }
    }
  }
}
|
||||
} // namespace
|
||||
|
||||
int main(int argc, char *argv[]) {
  //TODO: Make this test like a general rocksdb unit-test
  // Run the whole suite once against a plain DB and once against a
  // TTL-wrapped DB; any failed assertion aborts before success is printed.
  runTest(argc, test::TmpDir() + "/merge_testdb");
  runTest(argc, test::TmpDir() + "/merge_testdbttl", true); // Run test on TTL database
  printf("Passed all tests!\n");
  return 0;
}
|
||||
358
db/perf_context_test.cc
Normal file
358
db/perf_context_test.cc
Normal file
@@ -0,0 +1,358 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
#include <algorithm>
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include "/usr/include/valgrind/callgrind.h"
|
||||
|
||||
#include "rocksdb/db.h"
|
||||
#include "rocksdb/perf_context.h"
|
||||
#include "rocksdb/slice_transform.h"
|
||||
#include "rocksdb/memtablerep.h"
|
||||
#include "util/histogram.h"
|
||||
#include "util/stop_watch.h"
|
||||
#include "util/testharness.h"
|
||||
|
||||
|
||||
// Command-line configurable knobs (parsed by hand in main() below).
bool FLAGS_random_key = false;               // shuffle insertion order
bool FLAGS_use_set_based_memetable = false;  // use hash-skiplist memtable
int FLAGS_total_keys = 100;                  // number of key/value pairs
int FLAGS_write_buffer_size = 1000000000;    // memtable size in bytes
int FLAGS_max_write_buffer_number = 8;
int FLAGS_min_write_buffer_number_to_merge = 7;

// Path to the database on file system
const std::string kDbName = rocksdb::test::TmpDir() + "/perf_context_test";
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
// Open (creating if necessary) the test DB at kDbName, applying the
// FLAGS_* knobs above. Optionally swaps in the hash-skiplist memtable.
std::shared_ptr<DB> OpenDb() {
    DB* db;
    Options options;
    options.create_if_missing = true;
    options.write_buffer_size = FLAGS_write_buffer_size;
    options.max_write_buffer_number = FLAGS_max_write_buffer_number;
    options.min_write_buffer_number_to_merge =
      FLAGS_min_write_buffer_number_to_merge;

    if (FLAGS_use_set_based_memetable) {
      // Prefix length 0 puts every key in one bucket of the hash skiplist.
      auto prefix_extractor = rocksdb::NewFixedPrefixTransform(0);
      options.memtable_factory.reset(
          NewHashSkipListRepFactory(prefix_extractor));
    }

    Status s = DB::Open(options, kDbName,  &db);
    ASSERT_OK(s);
    return std::shared_ptr<DB>(db);
}
|
||||
|
||||
// Empty fixture; exists only to name the TEST() suite below.
class PerfContextTest { };
|
||||
|
||||
// Measure key-comparison counts and wall time for Get/Seek/Next over a DB
// where almost every key has been deleted, so reads must skip tombstones.
TEST(PerfContextTest, SeekIntoDeletion) {
  DestroyDB(kDbName, Options());
  auto db = OpenDb();
  WriteOptions write_options;
  ReadOptions read_options;

  // Insert all keys...
  for (int i = 0; i < FLAGS_total_keys; ++i) {
    std::string key = "k" + std::to_string(i);
    std::string value = "v" + std::to_string(i);

    db->Put(write_options, key, value);
  }

  // ...then delete all but the last one, leaving tombstones behind.
  for (int i = 0; i < FLAGS_total_keys -1 ; ++i) {
    std::string key = "k" + std::to_string(i);
    db->Delete(write_options, key);
  }

  HistogramImpl hist_get;
  HistogramImpl hist_get_time;
  for (int i = 0; i < FLAGS_total_keys - 1; ++i) {
    std::string key = "k" + std::to_string(i);
    std::string value;

    perf_context.Reset();
    StopWatchNano timer(Env::Default(), true);
    auto status = db->Get(read_options, key, &value);
    auto elapsed_nanos = timer.ElapsedNanos();
    ASSERT_TRUE(status.IsNotFound());
    hist_get.Add(perf_context.user_key_comparison_count);
    hist_get_time.Add(elapsed_nanos);
  }

  // NOTE(review): "uesr" is a typo for "user" in these output strings;
  // left untouched here since it is runtime output, not a comment.
  std::cout << "Get uesr key comparison: \n" << hist_get.ToString()
            << "Get time: \n" << hist_get_time.ToString();

  // SeekToFirst must skip over every tombstone to find the surviving key.
  HistogramImpl hist_seek_to_first;
  std::unique_ptr<Iterator> iter(db->NewIterator(read_options));

  perf_context.Reset();
  StopWatchNano timer(Env::Default(), true);
  iter->SeekToFirst();
  hist_seek_to_first.Add(perf_context.user_key_comparison_count);
  auto elapsed_nanos = timer.ElapsedNanos();

  std::cout << "SeekToFirst uesr key comparison: \n" << hist_seek_to_first.ToString()
            << "ikey skipped: " << perf_context.internal_key_skipped_count << "\n"
            << "idelete skipped: " << perf_context.internal_delete_skipped_count << "\n"
            << "elapsed: " << elapsed_nanos << "\n";

  HistogramImpl hist_seek;
  for (int i = 0; i < FLAGS_total_keys; ++i) {
    std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
    std::string key = "k" + std::to_string(i);

    perf_context.Reset();
    StopWatchNano timer(Env::Default(), true);
    iter->Seek(key);
    auto elapsed_nanos = timer.ElapsedNanos();
    hist_seek.Add(perf_context.user_key_comparison_count);
    std::cout << "seek cmp: " << perf_context.user_key_comparison_count
              << " ikey skipped " << perf_context.internal_key_skipped_count
              << " idelete skipped " << perf_context.internal_delete_skipped_count
              << " elapsed: " << elapsed_nanos << "ns\n";

    // Each Seek lands on the last (undeleted) key, so Next() is valid too.
    perf_context.Reset();
    ASSERT_TRUE(iter->Valid());
    StopWatchNano timer2(Env::Default(), true);
    iter->Next();
    auto elapsed_nanos2 = timer2.ElapsedNanos();
    std::cout << "next cmp: " << perf_context.user_key_comparison_count
              << "elapsed: " << elapsed_nanos2 << "ns\n";
  }

  std::cout << "Seek uesr key comparison: \n" << hist_seek.ToString();
}
|
||||
|
||||
// Measure the per-call overhead of StopWatchNano itself by timing
// back-to-back ElapsedNanos(reset) calls and histogramming the deltas.
TEST(PerfContextTest, StopWatchNanoOverhead) {
  // profile the timer cost by itself!
  const int kTotalIterations = 1000000;
  std::vector<uint64_t> timings(kTotalIterations);

  StopWatchNano timer(Env::Default(), true);
  for (auto& timing : timings) {
    timing = timer.ElapsedNanos(true /* reset */);
  }

  HistogramImpl histogram;
  for (const auto timing : timings) {
    histogram.Add(timing);
  }

  std::cout << histogram.ToString();
}
|
||||
|
||||
// Same as above but for the microsecond StopWatch; ElapsedMicros() does
// not reset, so consecutive readings are differenced by hand.
TEST(PerfContextTest, StopWatchOverhead) {
  // profile the timer cost by itself!
  const int kTotalIterations = 1000000;
  std::vector<uint64_t> timings(kTotalIterations);

  StopWatch timer(Env::Default());
  for (auto& timing : timings) {
    timing = timer.ElapsedMicros();
  }

  HistogramImpl histogram;
  uint64_t prev_timing = 0;
  for (const auto timing : timings) {
    histogram.Add(timing - prev_timing);
    prev_timing = timing;
  }

  std::cout << histogram.ToString();
}
|
||||
|
||||
// Put then Get every key while histogramming the perf_context counters
// (key comparisons, WAL/memtable write time, snapshot/memtable read time).
// Shared by the KeyComparisonCount test at several perf levels.
void ProfileKeyComparison() {
  DestroyDB(kDbName, Options());    // Start this test with a fresh DB

  auto db = OpenDb();

  WriteOptions write_options;
  ReadOptions read_options;

  HistogramImpl hist_put;
  HistogramImpl hist_get;
  HistogramImpl hist_get_snapshot;
  HistogramImpl hist_get_memtable;
  HistogramImpl hist_get_post_process;
  HistogramImpl hist_num_memtable_checked;
  HistogramImpl hist_write_pre_post;
  HistogramImpl hist_write_wal_time;
  HistogramImpl hist_write_memtable_time;

  std::cout << "Inserting " << FLAGS_total_keys << " key/value pairs\n...\n";

  std::vector<int> keys;
  for (int i = 0; i < FLAGS_total_keys; ++i) {
    keys.push_back(i);
  }

  if (FLAGS_random_key) {
    // NOTE(review): std::random_shuffle is deprecated in C++14 and removed
    // in C++17; std::shuffle would be the modern replacement.
    std::random_shuffle(keys.begin(), keys.end());
  }

  for (const int i : keys) {
    std::string key = "k" + std::to_string(i);
    std::string value = "v" + std::to_string(i);

    perf_context.Reset();
    db->Put(write_options, key, value);
    hist_write_pre_post.Add(perf_context.write_pre_and_post_process_time);
    hist_write_wal_time.Add(perf_context.write_wal_time);
    hist_write_memtable_time.Add(perf_context.write_memtable_time);
    hist_put.Add(perf_context.user_key_comparison_count);

    perf_context.Reset();
    db->Get(read_options, key, &value);
    hist_get_snapshot.Add(perf_context.get_snapshot_time);
    hist_get_memtable.Add(perf_context.get_from_memtable_time);
    hist_num_memtable_checked.Add(perf_context.get_from_memtable_count);
    hist_get_post_process.Add(perf_context.get_post_process_time);
    hist_get.Add(perf_context.user_key_comparison_count);
  }

  // NOTE(review): "uesr" below is a typo for "user" in runtime output.
  std::cout << "Put uesr key comparison: \n" << hist_put.ToString()
            << "Get uesr key comparison: \n" << hist_get.ToString();
  std::cout << "Put(): Pre and Post Process Time: \n"
            << hist_write_pre_post.ToString()
            << " Writing WAL time: \n"
            << hist_write_wal_time.ToString() << "\n"
            << " Writing Mem Table time: \n"
            << hist_write_memtable_time.ToString() << "\n";

  std::cout << "Get(): Time to get snapshot: \n"
            << hist_get_snapshot.ToString()
            << " Time to get value from memtables: \n"
            << hist_get_memtable.ToString() << "\n"
            << " Number of memtables checked: \n"
            << hist_num_memtable_checked.ToString() << "\n"
            << " Time to post process: \n"
            << hist_get_post_process.ToString() << "\n";
}
|
||||
|
||||
// Run the profiling pass at each perf level to compare counter/timer
// collection overhead: counts only, disabled, and counts + timings.
TEST(PerfContextTest, KeyComparisonCount) {
  SetPerfLevel(kEnableCount);
  ProfileKeyComparison();

  SetPerfLevel(kDisable);
  ProfileKeyComparison();

  SetPerfLevel(kEnableTime);
  ProfileKeyComparison();
}
|
||||
|
||||
// make perf_context_test
// export ROCKSDB_TESTS=PerfContextTest.SeekKeyComparison
// For one memtable:
// ./perf_context_test --write_buffer_size=500000 --total_keys=10000
// For two memtables:
// ./perf_context_test --write_buffer_size=250000 --total_keys=10000
// Specify --random_key=1 to shuffle the key before insertion
// Results show that, for sequential insertion, worst-case Seek Key comparison
// is close to the total number of keys (linear), when there is only one
// memtable. When there are two memtables, even the avg Seek Key comparison
// starts to become linear to the input size.

TEST(PerfContextTest, SeekKeyComparison) {
  DestroyDB(kDbName, Options());
  auto db = OpenDb();
  WriteOptions write_options;
  ReadOptions read_options;

  std::cout << "Inserting " << FLAGS_total_keys << " key/value pairs\n...\n";

  std::vector<int> keys;
  for (int i = 0; i < FLAGS_total_keys; ++i) {
    keys.push_back(i);
  }

  if (FLAGS_random_key) {
    std::random_shuffle(keys.begin(), keys.end());
  }

  HistogramImpl hist_put_time;
  HistogramImpl hist_wal_time;
  HistogramImpl hist_time_diff;

  // Time each Put and split out the WAL-write portion reported by
  // perf_context.
  SetPerfLevel(kEnableTime);
  StopWatchNano timer(Env::Default());
  for (const int i : keys) {
    std::string key = "k" + std::to_string(i);
    std::string value = "v" + std::to_string(i);

    perf_context.Reset();
    timer.Start();
    db->Put(write_options, key, value);
    auto put_time = timer.ElapsedNanos();
    hist_put_time.Add(put_time);
    hist_wal_time.Add(perf_context.write_wal_time);
    hist_time_diff.Add(put_time - perf_context.write_wal_time);
  }

  std::cout << "Put time:\n" << hist_put_time.ToString()
            << "WAL time:\n" << hist_wal_time.ToString()
            << "time diff:\n" << hist_time_diff.ToString();

  HistogramImpl hist_seek;
  HistogramImpl hist_next;

  // Count key comparisons for a fresh Seek to every key...
  for (int i = 0; i < FLAGS_total_keys; ++i) {
    std::string key = "k" + std::to_string(i);
    std::string value = "v" + std::to_string(i);

    std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
    perf_context.Reset();
    iter->Seek(key);
    ASSERT_TRUE(iter->Valid());
    ASSERT_EQ(iter->value().ToString(), value);
    hist_seek.Add(perf_context.user_key_comparison_count);
  }

  // ...and for a full sequential Next() scan.
  std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
  for (iter->SeekToFirst(); iter->Valid();) {
    perf_context.Reset();
    iter->Next();
    hist_next.Add(perf_context.user_key_comparison_count);
  }

  std::cout << "Seek:\n" << hist_seek.ToString()
            << "Next:\n" << hist_next.ToString();
}
|
||||
|
||||
}
|
||||
|
||||
// Hand-rolled flag parsing (this test predates a shared flags library),
// then run the registered TEST() cases.
int main(int argc, char** argv) {

  for (int i = 1; i < argc; i++) {
    int n;
    char junk;  // sscanf == 1 (not 2) guarantees no trailing garbage

    if (sscanf(argv[i], "--write_buffer_size=%d%c", &n, &junk) == 1) {
      FLAGS_write_buffer_size = n;
    }

    if (sscanf(argv[i], "--total_keys=%d%c", &n, &junk) == 1) {
      FLAGS_total_keys = n;
    }

    if (sscanf(argv[i], "--random_key=%d%c", &n, &junk) == 1 &&
        (n == 0 || n == 1)) {
      FLAGS_random_key = n;
    }

    if (sscanf(argv[i], "--use_set_based_memetable=%d%c", &n, &junk) == 1 &&
        (n == 0 || n == 1)) {
      FLAGS_use_set_based_memetable = n;
    }

  }

  std::cout << kDbName << "\n";

  rocksdb::test::RunAllTests();
  return 0;
}
|
||||
853
db/plain_table_db_test.cc
Normal file
853
db/plain_table_db_test.cc
Normal file
@@ -0,0 +1,853 @@
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
#include <algorithm>
|
||||
#include <set>
|
||||
|
||||
#include "db/db_impl.h"
|
||||
#include "db/filename.h"
|
||||
#include "db/version_set.h"
|
||||
#include "db/write_batch_internal.h"
|
||||
#include "rocksdb/cache.h"
|
||||
#include "rocksdb/compaction_filter.h"
|
||||
#include "rocksdb/db.h"
|
||||
#include "rocksdb/env.h"
|
||||
#include "rocksdb/filter_policy.h"
|
||||
#include "rocksdb/slice_transform.h"
|
||||
#include "rocksdb/table.h"
|
||||
#include "table/meta_blocks.h"
|
||||
#include "table/plain_table_factory.h"
|
||||
#include "table/plain_table_reader.h"
|
||||
#include "util/hash.h"
|
||||
#include "util/logging.h"
|
||||
#include "util/mutexlock.h"
|
||||
#include "util/testharness.h"
|
||||
#include "util/testutil.h"
|
||||
#include "utilities/merge_operators.h"
|
||||
|
||||
using std::unique_ptr;
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
class PlainTableDBTest {
|
||||
protected:
|
||||
private:
|
||||
std::string dbname_;
|
||||
Env* env_;
|
||||
DB* db_;
|
||||
|
||||
Options last_options_;
|
||||
|
||||
public:
|
||||
PlainTableDBTest() : env_(Env::Default()) {
|
||||
dbname_ = test::TmpDir() + "/plain_table_db_test";
|
||||
ASSERT_OK(DestroyDB(dbname_, Options()));
|
||||
db_ = nullptr;
|
||||
Reopen();
|
||||
}
|
||||
|
||||
~PlainTableDBTest() {
|
||||
delete db_;
|
||||
ASSERT_OK(DestroyDB(dbname_, Options()));
|
||||
}
|
||||
|
||||
// Return the current option configuration.
|
||||
Options CurrentOptions() {
|
||||
Options options;
|
||||
options.table_factory.reset(NewPlainTableFactory(16, 2, 0.8, 3));
|
||||
options.prefix_extractor.reset(NewFixedPrefixTransform(8));
|
||||
options.allow_mmap_reads = true;
|
||||
return options;
|
||||
}
|
||||
|
||||
DBImpl* dbfull() {
|
||||
return reinterpret_cast<DBImpl*>(db_);
|
||||
}
|
||||
|
||||
void Reopen(Options* options = nullptr) {
|
||||
ASSERT_OK(TryReopen(options));
|
||||
}
|
||||
|
||||
void Close() {
|
||||
delete db_;
|
||||
db_ = nullptr;
|
||||
}
|
||||
|
||||
void DestroyAndReopen(Options* options = nullptr) {
|
||||
//Destroy using last options
|
||||
Destroy(&last_options_);
|
||||
ASSERT_OK(TryReopen(options));
|
||||
}
|
||||
|
||||
void Destroy(Options* options) {
|
||||
delete db_;
|
||||
db_ = nullptr;
|
||||
ASSERT_OK(DestroyDB(dbname_, *options));
|
||||
}
|
||||
|
||||
Status PureReopen(Options* options, DB** db) {
|
||||
return DB::Open(*options, dbname_, db);
|
||||
}
|
||||
|
||||
Status TryReopen(Options* options = nullptr) {
|
||||
delete db_;
|
||||
db_ = nullptr;
|
||||
Options opts;
|
||||
if (options != nullptr) {
|
||||
opts = *options;
|
||||
} else {
|
||||
opts = CurrentOptions();
|
||||
opts.create_if_missing = true;
|
||||
}
|
||||
last_options_ = opts;
|
||||
|
||||
return DB::Open(opts, dbname_, &db_);
|
||||
}
|
||||
|
||||
Status Put(const Slice& k, const Slice& v) {
|
||||
return db_->Put(WriteOptions(), k, v);
|
||||
}
|
||||
|
||||
Status Delete(const std::string& k) {
|
||||
return db_->Delete(WriteOptions(), k);
|
||||
}
|
||||
|
||||
std::string Get(const std::string& k, const Snapshot* snapshot = nullptr) {
|
||||
ReadOptions options;
|
||||
options.snapshot = snapshot;
|
||||
std::string result;
|
||||
Status s = db_->Get(options, k, &result);
|
||||
if (s.IsNotFound()) {
|
||||
result = "NOT_FOUND";
|
||||
} else if (!s.ok()) {
|
||||
result = s.ToString();
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
int NumTableFilesAtLevel(int level) {
|
||||
std::string property;
|
||||
ASSERT_TRUE(
|
||||
db_->GetProperty("rocksdb.num-files-at-level" + NumberToString(level),
|
||||
&property));
|
||||
return atoi(property.c_str());
|
||||
}
|
||||
|
||||
// Return spread of files per level
|
||||
std::string FilesPerLevel() {
|
||||
std::string result;
|
||||
int last_non_zero_offset = 0;
|
||||
for (int level = 0; level < db_->NumberLevels(); level++) {
|
||||
int f = NumTableFilesAtLevel(level);
|
||||
char buf[100];
|
||||
snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f);
|
||||
result += buf;
|
||||
if (f > 0) {
|
||||
last_non_zero_offset = result.size();
|
||||
}
|
||||
}
|
||||
result.resize(last_non_zero_offset);
|
||||
return result;
|
||||
}
|
||||
|
||||
// Render an iterator's current position as "key->value", or "(invalid)"
// when the iterator is exhausted or unpositioned.
std::string IterStatus(Iterator* iter) {
  if (!iter->Valid()) {
    return "(invalid)";
  }
  return iter->key().ToString() + "->" + iter->value().ToString();
}
|
||||
};
|
||||
|
||||
// A freshly created DB opens successfully and misses on any lookup.
TEST(PlainTableDBTest, Empty) {
  ASSERT_TRUE(dbfull() != nullptr);
  ASSERT_EQ("NOT_FOUND", Get("0000000000000foo"));
}
|
||||
|
||||
// A PlainTableReader instrumented for bloom-filter testing: while
// *expect_bloom_not_match_ is true, every MatchBloom() call must report
// a miss (i.e. no false positives for the keys the test probes).
class TestPlainTableReader : public PlainTableReader {
 public:
  TestPlainTableReader(const EnvOptions& storage_options,
                       const InternalKeyComparator& icomparator,
                       uint64_t file_size, int bloom_bits_per_key,
                       double hash_table_ratio, size_t index_sparseness,
                       const TableProperties* table_properties,
                       unique_ptr<RandomAccessFile>&& file,
                       const Options& options, bool* expect_bloom_not_match)
      : PlainTableReader(options, std::move(file), storage_options, icomparator,
                         file_size, bloom_bits_per_key, hash_table_ratio,
                         index_sparseness, table_properties, 2 * 1024 * 1024),
        expect_bloom_not_match_(expect_bloom_not_match) {
    // Build the in-memory index eagerly so failures surface at
    // construction time rather than on first lookup.
    Status s = PopulateIndex(const_cast<TableProperties*>(table_properties));
    ASSERT_TRUE(s.ok());
  }

  virtual ~TestPlainTableReader() {}

 private:
  virtual bool MatchBloom(uint32_t hash) const override {
    bool ret = PlainTableReader::MatchBloom(hash);
    // When the test has announced that a bloom miss is expected, the
    // filter must not claim a match.
    ASSERT_TRUE(!*expect_bloom_not_match_ || !ret);
    return ret;
  }
  // Owned by the test body; toggled around lookups of absent keys.
  bool* expect_bloom_not_match_;
};
|
||||
|
||||
extern const uint64_t kPlainTableMagicNumber;
|
||||
// Factory that produces TestPlainTableReader instances so tests can
// observe bloom-filter behavior through expect_bloom_not_match.
class TestPlainTableFactory : public PlainTableFactory {
 public:
  explicit TestPlainTableFactory(bool* expect_bloom_not_match,
                                 uint32_t user_key_len, int bloom_bits_per_key,
                                 double hash_table_ratio,
                                 size_t index_sparseness,
                                 size_t huge_page_tlb_size)
      : PlainTableFactory(user_key_len, user_key_len, hash_table_ratio,
                          index_sparseness, huge_page_tlb_size),
        bloom_bits_per_key_(bloom_bits_per_key),
        hash_table_ratio_(hash_table_ratio),
        index_sparseness_(index_sparseness),
        expect_bloom_not_match_(expect_bloom_not_match) {}

  Status NewTableReader(const Options& options, const EnvOptions& soptions,
                        const InternalKeyComparator& internal_comparator,
                        unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
                        unique_ptr<TableReader>* table) const override {
    // Read the table's properties block first; the reader needs them to
    // populate its index.
    TableProperties* props = nullptr;
    auto s = ReadTableProperties(file.get(), file_size, kPlainTableMagicNumber,
                                 options.env, options.info_log.get(), &props);
    ASSERT_TRUE(s.ok());

    std::unique_ptr<PlainTableReader> new_reader(new TestPlainTableReader(
        soptions, internal_comparator, file_size, bloom_bits_per_key_,
        hash_table_ratio_, index_sparseness_, props, std::move(file), options,
        expect_bloom_not_match_));

    *table = std::move(new_reader);
    return s;
  }

 private:
  int bloom_bits_per_key_;
  double hash_table_ratio_;
  size_t index_sparseness_;
  // Forwarded to every reader this factory creates; owned by the test.
  bool* expect_bloom_not_match_;
};
|
||||
|
||||
// Flush a small memtable under every combination of: huge-page TLB
// off/on, bloom filter off/on, and hash-index vs total-order mode; then
// verify the written table's plain-table properties and readback values.
TEST(PlainTableDBTest, Flush) {
  for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024;
       huge_page_tlb_size += 2 * 1024 * 1024) {
    for (int bloom_bits = 0; bloom_bits <= 117; bloom_bits += 117) {
      for (int total_order = 0; total_order <= 1; total_order++) {
        Options options = CurrentOptions();
        options.create_if_missing = true;
        // Set only one bucket to force bucket conflict.
        // Test index interval for the same prefix to be 1, 2 and 4
        if (total_order) {
          options.table_factory.reset(NewTotalOrderPlainTableFactory(
              16, bloom_bits, 2, huge_page_tlb_size));
        } else {
          options.table_factory.reset(NewPlainTableFactory(
              16, bloom_bits, 0.75, 16, huge_page_tlb_size));
        }
        DestroyAndReopen(&options);

        // Two keys, with "foo" written twice so the later value wins.
        ASSERT_OK(Put("1000000000000foo", "v1"));
        ASSERT_OK(Put("0000000000000bar", "v2"));
        ASSERT_OK(Put("1000000000000foo", "v3"));
        dbfull()->TEST_FlushMemTable();

        // Exactly one table file should exist, and its plain-table
        // index sizing properties must reflect the chosen index mode.
        TablePropertiesCollection ptc;
        reinterpret_cast<DB*>(dbfull())->GetPropertiesOfAllTables(&ptc);
        ASSERT_EQ(1U, ptc.size());
        auto row = ptc.begin();
        auto tp = row->second;
        ASSERT_EQ(total_order ? "4" : "12", (tp->user_collected_properties).at(
                                                "plain_table_hash_table_size"));
        ASSERT_EQ(total_order ? "9" : "0", (tp->user_collected_properties).at(
                                               "plain_table_sub_index_size"));

        ASSERT_EQ("v3", Get("1000000000000foo"));
        ASSERT_EQ("v2", Get("0000000000000bar"));
      }
    }
  }
}
|
||||
|
||||
// Exercise repeated flushes (multiple table files), overwrite, delete,
// and — when bloom filters are enabled — assert via TestPlainTableFactory
// that lookups of absent keys are rejected by the bloom filter.
TEST(PlainTableDBTest, Flush2) {
  for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024;
       huge_page_tlb_size += 2 * 1024 * 1024) {
    for (int bloom_bits = 0; bloom_bits <= 117; bloom_bits += 117) {
      for (int total_order = 0; total_order <= 1; total_order++) {
        // Toggled by the test; checked inside TestPlainTableReader.
        bool expect_bloom_not_match = false;
        Options options = CurrentOptions();
        options.create_if_missing = true;
        // Set only one bucket to force bucket conflict.
        // Test index interval for the same prefix to be 1, 2 and 4
        if (total_order) {
          options.prefix_extractor = nullptr;
          options.table_factory.reset(
              new TestPlainTableFactory(&expect_bloom_not_match, 16, bloom_bits,
                                        0, 2, huge_page_tlb_size));
        } else {
          options.table_factory.reset(
              new TestPlainTableFactory(&expect_bloom_not_match, 16, bloom_bits,
                                        0.75, 16, huge_page_tlb_size));
        }
        DestroyAndReopen(&options);
        ASSERT_OK(Put("0000000000000bar", "b"));
        ASSERT_OK(Put("1000000000000foo", "v1"));
        dbfull()->TEST_FlushMemTable();

        // Overwrite in a second file shadows the first file's value.
        ASSERT_OK(Put("1000000000000foo", "v2"));
        dbfull()->TEST_FlushMemTable();
        ASSERT_EQ("v2", Get("1000000000000foo"));

        ASSERT_OK(Put("0000000000000eee", "v3"));
        dbfull()->TEST_FlushMemTable();
        ASSERT_EQ("v3", Get("0000000000000eee"));

        // A flushed tombstone must hide the older value.
        ASSERT_OK(Delete("0000000000000bar"));
        dbfull()->TEST_FlushMemTable();
        ASSERT_EQ("NOT_FOUND", Get("0000000000000bar"));

        ASSERT_OK(Put("0000000000000eee", "v5"));
        ASSERT_OK(Put("9000000000000eee", "v5"));
        dbfull()->TEST_FlushMemTable();
        ASSERT_EQ("v5", Get("0000000000000eee"));

        // Test Bloom Filter
        if (bloom_bits > 0) {
          // Neither key nor value should exist.
          expect_bloom_not_match = true;
          ASSERT_EQ("NOT_FOUND", Get("5_not00000000bar"));

          // Key doesn't exist any more but prefix exists.
          if (total_order) {
            ASSERT_EQ("NOT_FOUND", Get("1000000000000not"));
            ASSERT_EQ("NOT_FOUND", Get("0000000000000not"));
          }
          expect_bloom_not_match = false;
        }
      }
    }
  }
}
|
||||
|
||||
// Verify forward iteration and Seek semantics across the same option
// sweep as Flush2: seeks land on the first key >= target, and (in
// prefix/hash mode) a seek past all keys of a prefix moves to the next
// prefix's first key. With bloom filters on, seeks/gets for absent
// prefixes must be rejected by the filter.
TEST(PlainTableDBTest, Iterator) {
  for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024;
       huge_page_tlb_size += 2 * 1024 * 1024) {
    for (int bloom_bits = 0; bloom_bits <= 117; bloom_bits += 117) {
      for (int total_order = 0; total_order <= 1; total_order++) {
        bool expect_bloom_not_match = false;
        Options options = CurrentOptions();
        options.create_if_missing = true;
        // Set only one bucket to force bucket conflict.
        // Test index interval for the same prefix to be 1, 2 and 4
        if (total_order) {
          options.prefix_extractor = nullptr;
          options.table_factory.reset(
              new TestPlainTableFactory(&expect_bloom_not_match, 16, bloom_bits,
                                        0, 2, huge_page_tlb_size));
        } else {
          options.table_factory.reset(
              new TestPlainTableFactory(&expect_bloom_not_match, 16, bloom_bits,
                                        0.75, 16, huge_page_tlb_size));
        }
        DestroyAndReopen(&options);

        // Keys under two prefixes ("1000000000" and the two bar keys),
        // inserted out of order; note foo006 is deliberately absent.
        ASSERT_OK(Put("1000000000foo002", "v_2"));
        ASSERT_OK(Put("0000000000000bar", "random"));
        ASSERT_OK(Put("1000000000foo001", "v1"));
        ASSERT_OK(Put("3000000000000bar", "bar_v"));
        ASSERT_OK(Put("1000000000foo003", "v__3"));
        ASSERT_OK(Put("1000000000foo004", "v__4"));
        ASSERT_OK(Put("1000000000foo005", "v__5"));
        ASSERT_OK(Put("1000000000foo007", "v__7"));
        ASSERT_OK(Put("1000000000foo008", "v__8"));
        dbfull()->TEST_FlushMemTable();
        ASSERT_EQ("v1", Get("1000000000foo001"));
        ASSERT_EQ("v__3", Get("1000000000foo003"));
        Iterator* iter = dbfull()->NewIterator(ReadOptions());
        // Seek before the first key of the prefix lands on foo001.
        iter->Seek("1000000000foo000");
        ASSERT_TRUE(iter->Valid());
        ASSERT_EQ("1000000000foo001", iter->key().ToString());
        ASSERT_EQ("v1", iter->value().ToString());

        iter->Next();
        ASSERT_TRUE(iter->Valid());
        ASSERT_EQ("1000000000foo002", iter->key().ToString());
        ASSERT_EQ("v_2", iter->value().ToString());

        iter->Next();
        ASSERT_TRUE(iter->Valid());
        ASSERT_EQ("1000000000foo003", iter->key().ToString());
        ASSERT_EQ("v__3", iter->value().ToString());

        iter->Next();
        ASSERT_TRUE(iter->Valid());
        ASSERT_EQ("1000000000foo004", iter->key().ToString());
        ASSERT_EQ("v__4", iter->value().ToString());

        // Exact-match seek.
        iter->Seek("3000000000000bar");
        ASSERT_TRUE(iter->Valid());
        ASSERT_EQ("3000000000000bar", iter->key().ToString());
        ASSERT_EQ("bar_v", iter->value().ToString());

        iter->Seek("1000000000foo000");
        ASSERT_TRUE(iter->Valid());
        ASSERT_EQ("1000000000foo001", iter->key().ToString());
        ASSERT_EQ("v1", iter->value().ToString());

        iter->Seek("1000000000foo005");
        ASSERT_TRUE(iter->Valid());
        ASSERT_EQ("1000000000foo005", iter->key().ToString());
        ASSERT_EQ("v__5", iter->value().ToString());

        // foo006 is absent: the seek skips to foo007.
        iter->Seek("1000000000foo006");
        ASSERT_TRUE(iter->Valid());
        ASSERT_EQ("1000000000foo007", iter->key().ToString());
        ASSERT_EQ("v__7", iter->value().ToString());

        iter->Seek("1000000000foo008");
        ASSERT_TRUE(iter->Valid());
        ASSERT_EQ("1000000000foo008", iter->key().ToString());
        ASSERT_EQ("v__8", iter->value().ToString());

        if (total_order == 0) {
          // In prefix mode, seeking past the last key of the prefix
          // continues into the next prefix's first key.
          iter->Seek("1000000000foo009");
          ASSERT_TRUE(iter->Valid());
          ASSERT_EQ("3000000000000bar", iter->key().ToString());
        }

        // Test Bloom Filter
        if (bloom_bits > 0) {
          if (!total_order) {
            // Neither key nor value should exist.
            expect_bloom_not_match = true;
            iter->Seek("2not000000000bar");
            ASSERT_TRUE(!iter->Valid());
            ASSERT_EQ("NOT_FOUND", Get("2not000000000bar"));
            expect_bloom_not_match = false;
          } else {
            expect_bloom_not_match = true;
            ASSERT_EQ("NOT_FOUND", Get("2not000000000bar"));
            expect_bloom_not_match = false;
          }
        }

        delete iter;
      }
    }
  }
}
|
||||
|
||||
namespace {
// Build a key consisting of `length` copies of the character `c`.
std::string MakeLongKey(size_t length, char c) {
  std::string key;
  key.assign(length, c);
  return key;
}
}  // namespace
|
||||
|
||||
// Keys of widely varying lengths must round-trip through a total-order
// plain table and iterate back in sorted order.
TEST(PlainTableDBTest, IteratorLargeKeys) {
  Options options = CurrentOptions();
  // NOTE(review): first factory argument is the key length; 0 presumably
  // means variable-length keys — confirm against the factory's docs.
  options.table_factory.reset(NewTotalOrderPlainTableFactory(0, 0, 16));
  options.create_if_missing = true;
  options.prefix_extractor.reset();
  DestroyAndReopen(&options);

  // Each key is a single-character fill of '0'..'6', so lexicographic
  // order matches array order regardless of the differing lengths.
  std::string key_list[] = {
      MakeLongKey(30, '0'),
      MakeLongKey(16, '1'),
      MakeLongKey(32, '2'),
      MakeLongKey(60, '3'),
      MakeLongKey(90, '4'),
      MakeLongKey(50, '5'),
      MakeLongKey(26, '6')
  };

  for (size_t i = 0; i < 7; i++) {
    ASSERT_OK(Put(key_list[i], std::to_string(i)));
  }

  dbfull()->TEST_FlushMemTable();

  Iterator* iter = dbfull()->NewIterator(ReadOptions());
  iter->Seek(key_list[0]);

  for (size_t i = 0; i < 7; i++) {
    ASSERT_TRUE(iter->Valid());
    ASSERT_EQ(key_list[i], iter->key().ToString());
    ASSERT_EQ(std::to_string(i), iter->value().ToString());
    iter->Next();
  }

  // The iterator must be exhausted after the last key.
  ASSERT_TRUE(!iter->Valid());

  delete iter;
}
|
||||
|
||||
// A test comparator which compare two strings in this way:
|
||||
// (1) first compare prefix of 8 bytes in alphabet order,
|
||||
// (2) if two strings share the same prefix, sort the other part of the string
|
||||
// in the reverse alphabet order.
|
||||
class SimpleSuffixReverseComparator : public Comparator {
|
||||
public:
|
||||
SimpleSuffixReverseComparator() {}
|
||||
|
||||
virtual const char* Name() const { return "SimpleSuffixReverseComparator"; }
|
||||
|
||||
virtual int Compare(const Slice& a, const Slice& b) const {
|
||||
Slice prefix_a = Slice(a.data(), 8);
|
||||
Slice prefix_b = Slice(b.data(), 8);
|
||||
int prefix_comp = prefix_a.compare(prefix_b);
|
||||
if (prefix_comp != 0) {
|
||||
return prefix_comp;
|
||||
} else {
|
||||
Slice suffix_a = Slice(a.data() + 8, a.size() - 8);
|
||||
Slice suffix_b = Slice(b.data() + 8, b.size() - 8);
|
||||
return -(suffix_a.compare(suffix_b));
|
||||
}
|
||||
}
|
||||
virtual void FindShortestSeparator(std::string* start,
|
||||
const Slice& limit) const {}
|
||||
|
||||
virtual void FindShortSuccessor(std::string* key) const {}
|
||||
};
|
||||
|
||||
// Iteration under SimpleSuffixReverseComparator: within a shared 8-byte
// prefix, keys come back in reverse suffix order (foo008 before foo007,
// etc.), while distinct prefixes remain in ascending order.
TEST(PlainTableDBTest, IteratorReverseSuffixComparator) {
  Options options = CurrentOptions();
  options.create_if_missing = true;
  // Set only one bucket to force bucket conflict.
  // Test index interval for the same prefix to be 1, 2 and 4
  SimpleSuffixReverseComparator comp;
  options.comparator = &comp;
  DestroyAndReopen(&options);

  ASSERT_OK(Put("1000000000foo002", "v_2"));
  ASSERT_OK(Put("0000000000000bar", "random"));
  ASSERT_OK(Put("1000000000foo001", "v1"));
  ASSERT_OK(Put("3000000000000bar", "bar_v"));
  ASSERT_OK(Put("1000000000foo003", "v__3"));
  ASSERT_OK(Put("1000000000foo004", "v__4"));
  ASSERT_OK(Put("1000000000foo005", "v__5"));
  ASSERT_OK(Put("1000000000foo007", "v__7"));
  ASSERT_OK(Put("1000000000foo008", "v__8"));
  dbfull()->TEST_FlushMemTable();
  ASSERT_EQ("v1", Get("1000000000foo001"));
  ASSERT_EQ("v__3", Get("1000000000foo003"));
  Iterator* iter = dbfull()->NewIterator(ReadOptions());
  // foo009 sorts BEFORE foo008 under the reverse-suffix order, so the
  // seek lands on foo008 and walks the suffixes downward.
  iter->Seek("1000000000foo009");
  ASSERT_TRUE(iter->Valid());
  ASSERT_EQ("1000000000foo008", iter->key().ToString());
  ASSERT_EQ("v__8", iter->value().ToString());

  iter->Next();
  ASSERT_TRUE(iter->Valid());
  ASSERT_EQ("1000000000foo007", iter->key().ToString());
  ASSERT_EQ("v__7", iter->value().ToString());

  iter->Next();
  ASSERT_TRUE(iter->Valid());
  ASSERT_EQ("1000000000foo005", iter->key().ToString());
  ASSERT_EQ("v__5", iter->value().ToString());

  iter->Next();
  ASSERT_TRUE(iter->Valid());
  ASSERT_EQ("1000000000foo004", iter->key().ToString());
  ASSERT_EQ("v__4", iter->value().ToString());

  iter->Seek("3000000000000bar");
  ASSERT_TRUE(iter->Valid());
  ASSERT_EQ("3000000000000bar", iter->key().ToString());
  ASSERT_EQ("bar_v", iter->value().ToString());

  iter->Seek("1000000000foo005");
  ASSERT_TRUE(iter->Valid());
  ASSERT_EQ("1000000000foo005", iter->key().ToString());
  ASSERT_EQ("v__5", iter->value().ToString());

  // foo006 is absent; under reverse-suffix order the next key >= foo006
  // is foo005.
  iter->Seek("1000000000foo006");
  ASSERT_TRUE(iter->Valid());
  ASSERT_EQ("1000000000foo005", iter->key().ToString());
  ASSERT_EQ("v__5", iter->value().ToString());

  iter->Seek("1000000000foo008");
  ASSERT_TRUE(iter->Valid());
  ASSERT_EQ("1000000000foo008", iter->key().ToString());
  ASSERT_EQ("v__8", iter->value().ToString());

  // foo000 sorts after every foo key in its prefix, so the seek falls
  // through to the next prefix.
  iter->Seek("1000000000foo000");
  ASSERT_TRUE(iter->Valid());
  ASSERT_EQ("3000000000000bar", iter->key().ToString());

  delete iter;
}
|
||||
|
||||
// Point lookups and seeks must still work when multiple prefixes hash
// into the same (single) bucket of a total-order plain table.
TEST(PlainTableDBTest, HashBucketConflict) {
  for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024;
       huge_page_tlb_size += 2 * 1024 * 1024) {
    for (unsigned char i = 1; i <= 3; i++) {
      Options options = CurrentOptions();
      options.create_if_missing = true;
      // Set only one bucket to force bucket conflict.
      // Test index interval for the same prefix to be 1, 2 and 4
      // NOTE(review): "2 ^ i" is bitwise XOR (yielding 3, 0, 1 for
      // i = 1..3), not exponentiation; the comment above suggests index
      // intervals 1, 2 and 4 were intended — confirm.
      options.table_factory.reset(
          NewTotalOrderPlainTableFactory(16, 0, 2 ^ i, huge_page_tlb_size));
      DestroyAndReopen(&options);
      ASSERT_OK(Put("5000000000000fo0", "v1"));
      ASSERT_OK(Put("5000000000000fo1", "v2"));
      ASSERT_OK(Put("5000000000000fo2", "v"));
      ASSERT_OK(Put("2000000000000fo0", "v3"));
      ASSERT_OK(Put("2000000000000fo1", "v4"));
      ASSERT_OK(Put("2000000000000fo2", "v"));
      ASSERT_OK(Put("2000000000000fo3", "v"));

      dbfull()->TEST_FlushMemTable();

      ASSERT_EQ("v1", Get("5000000000000fo0"));
      ASSERT_EQ("v2", Get("5000000000000fo1"));
      ASSERT_EQ("v3", Get("2000000000000fo0"));
      ASSERT_EQ("v4", Get("2000000000000fo1"));

      // Absent keys in colliding buckets must still miss cleanly.
      ASSERT_EQ("NOT_FOUND", Get("5000000000000bar"));
      ASSERT_EQ("NOT_FOUND", Get("2000000000000bar"));
      ASSERT_EQ("NOT_FOUND", Get("5000000000000fo8"));
      ASSERT_EQ("NOT_FOUND", Get("2000000000000fo8"));

      ReadOptions ro;
      Iterator* iter = dbfull()->NewIterator(ro);

      iter->Seek("5000000000000fo0");
      ASSERT_TRUE(iter->Valid());
      ASSERT_EQ("5000000000000fo0", iter->key().ToString());
      iter->Next();
      ASSERT_TRUE(iter->Valid());
      ASSERT_EQ("5000000000000fo1", iter->key().ToString());

      iter->Seek("5000000000000fo1");
      ASSERT_TRUE(iter->Valid());
      ASSERT_EQ("5000000000000fo1", iter->key().ToString());

      iter->Seek("2000000000000fo0");
      ASSERT_TRUE(iter->Valid());
      ASSERT_EQ("2000000000000fo0", iter->key().ToString());
      iter->Next();
      ASSERT_TRUE(iter->Valid());
      ASSERT_EQ("2000000000000fo1", iter->key().ToString());

      iter->Seek("2000000000000fo1");
      ASSERT_TRUE(iter->Valid());
      ASSERT_EQ("2000000000000fo1", iter->key().ToString());

      // Seeking an absent key lands on the first key of its prefix.
      iter->Seek("2000000000000bar");
      ASSERT_TRUE(iter->Valid());
      ASSERT_EQ("2000000000000fo0", iter->key().ToString());

      iter->Seek("5000000000000bar");
      ASSERT_TRUE(iter->Valid());
      ASSERT_EQ("5000000000000fo0", iter->key().ToString());

      iter->Seek("2000000000000fo8");
      ASSERT_TRUE(!iter->Valid() ||
                  options.comparator->Compare(iter->key(), "20000001") > 0);

      iter->Seek("5000000000000fo8");
      ASSERT_TRUE(!iter->Valid());

      // Seeks for prefixes with no data at all must come back invalid.
      iter->Seek("1000000000000fo2");
      ASSERT_TRUE(!iter->Valid());

      iter->Seek("3000000000000fo2");
      ASSERT_TRUE(!iter->Valid());

      iter->Seek("8000000000000fo2");
      ASSERT_TRUE(!iter->Valid());

      delete iter;
    }
  }
}
|
||||
|
||||
// Same bucket-collision scenario as HashBucketConflict, but under the
// reverse-suffix comparator: within a prefix, iteration and seek results
// come back with suffixes in descending order.
TEST(PlainTableDBTest, HashBucketConflictReverseSuffixComparator) {
  for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024;
       huge_page_tlb_size += 2 * 1024 * 1024) {
    for (unsigned char i = 1; i <= 3; i++) {
      Options options = CurrentOptions();
      options.create_if_missing = true;
      SimpleSuffixReverseComparator comp;
      options.comparator = &comp;
      // Set only one bucket to force bucket conflict.
      // Test index interval for the same prefix to be 1, 2 and 4
      // NOTE(review): "2 ^ i" is bitwise XOR (yielding 3, 0, 1 for
      // i = 1..3), not exponentiation; the comment above suggests index
      // intervals 1, 2 and 4 were intended — confirm.
      options.table_factory.reset(
          NewTotalOrderPlainTableFactory(16, 0, 2 ^ i, huge_page_tlb_size));
      DestroyAndReopen(&options);
      ASSERT_OK(Put("5000000000000fo0", "v1"));
      ASSERT_OK(Put("5000000000000fo1", "v2"));
      ASSERT_OK(Put("5000000000000fo2", "v"));
      ASSERT_OK(Put("2000000000000fo0", "v3"));
      ASSERT_OK(Put("2000000000000fo1", "v4"));
      ASSERT_OK(Put("2000000000000fo2", "v"));
      ASSERT_OK(Put("2000000000000fo3", "v"));

      dbfull()->TEST_FlushMemTable();

      ASSERT_EQ("v1", Get("5000000000000fo0"));
      ASSERT_EQ("v2", Get("5000000000000fo1"));
      ASSERT_EQ("v3", Get("2000000000000fo0"));
      ASSERT_EQ("v4", Get("2000000000000fo1"));

      ASSERT_EQ("NOT_FOUND", Get("5000000000000bar"));
      ASSERT_EQ("NOT_FOUND", Get("2000000000000bar"));
      ASSERT_EQ("NOT_FOUND", Get("5000000000000fo8"));
      ASSERT_EQ("NOT_FOUND", Get("2000000000000fo8"));

      ReadOptions ro;
      Iterator* iter = dbfull()->NewIterator(ro);

      // Reverse suffix order: fo1 precedes fo0 within a prefix.
      iter->Seek("5000000000000fo1");
      ASSERT_TRUE(iter->Valid());
      ASSERT_EQ("5000000000000fo1", iter->key().ToString());
      iter->Next();
      ASSERT_TRUE(iter->Valid());
      ASSERT_EQ("5000000000000fo0", iter->key().ToString());

      iter->Seek("5000000000000fo1");
      ASSERT_TRUE(iter->Valid());
      ASSERT_EQ("5000000000000fo1", iter->key().ToString());

      iter->Seek("2000000000000fo1");
      ASSERT_TRUE(iter->Valid());
      ASSERT_EQ("2000000000000fo1", iter->key().ToString());
      iter->Next();
      ASSERT_TRUE(iter->Valid());
      ASSERT_EQ("2000000000000fo0", iter->key().ToString());

      iter->Seek("2000000000000fo1");
      ASSERT_TRUE(iter->Valid());
      ASSERT_EQ("2000000000000fo1", iter->key().ToString());

      // "var" sorts before every "foN" suffix in reverse order, so the
      // seek lands on the largest suffix of the prefix.
      iter->Seek("2000000000000var");
      ASSERT_TRUE(iter->Valid());
      ASSERT_EQ("2000000000000fo3", iter->key().ToString());

      iter->Seek("5000000000000var");
      ASSERT_TRUE(iter->Valid());
      ASSERT_EQ("5000000000000fo2", iter->key().ToString());

      // A seek for an absent key must not land inside the same prefix.
      std::string seek_key = "2000000000000bar";
      iter->Seek(seek_key);
      ASSERT_TRUE(!iter->Valid() ||
                  options.prefix_extractor->Transform(iter->key()) !=
                      options.prefix_extractor->Transform(seek_key));

      iter->Seek("1000000000000fo2");
      ASSERT_TRUE(!iter->Valid());

      iter->Seek("3000000000000fo2");
      ASSERT_TRUE(!iter->Valid());

      iter->Seek("8000000000000fo2");
      ASSERT_TRUE(!iter->Valid());

      delete iter;
    }
  }
}
|
||||
|
||||
// Lookups and seeks for keys that hash into a non-empty bucket but do
// not exist must miss cleanly rather than return a neighbor's value.
TEST(PlainTableDBTest, NonExistingKeyToNonEmptyBucket) {
  Options options = CurrentOptions();
  options.create_if_missing = true;
  // Total-order table with index sparseness 5 so several keys share one
  // index entry.
  options.table_factory.reset(NewTotalOrderPlainTableFactory(16, 0, 5));
  DestroyAndReopen(&options);
  ASSERT_OK(Put("5000000000000fo0", "v1"));
  ASSERT_OK(Put("5000000000000fo1", "v2"));
  ASSERT_OK(Put("5000000000000fo2", "v3"));

  dbfull()->TEST_FlushMemTable();

  ASSERT_EQ("v1", Get("5000000000000fo0"));
  ASSERT_EQ("v2", Get("5000000000000fo1"));
  ASSERT_EQ("v3", Get("5000000000000fo2"));

  // Keys hashing to occupied buckets but not stored must miss.
  ASSERT_EQ("NOT_FOUND", Get("8000000000000bar"));
  ASSERT_EQ("NOT_FOUND", Get("1000000000000bar"));

  Iterator* iter = dbfull()->NewIterator(ReadOptions());

  // Seek below the stored range lands on the first stored key.
  iter->Seek("5000000000000bar");
  ASSERT_TRUE(iter->Valid());
  ASSERT_EQ("5000000000000fo0", iter->key().ToString());

  // Seeks past or outside the stored range come back invalid.
  iter->Seek("5000000000000fo8");
  ASSERT_TRUE(!iter->Valid());

  iter->Seek("1000000000000fo2");
  ASSERT_TRUE(!iter->Valid());

  iter->Seek("8000000000000fo2");
  ASSERT_TRUE(!iter->Valid());

  delete iter;
}
|
||||
|
||||
// Build a fixed-width test key: the literal "key_______" followed by a
// zero-padded six-digit sequence number, so keys sort numerically.
static std::string Key(int i) {
  char formatted[100];
  snprintf(formatted, sizeof(formatted), "key_______%06d", i);
  return formatted;
}
|
||||
|
||||
// Produce `len` random bytes as a std::string using the shared test
// helper and the caller's RNG.
static std::string RandomString(Random* rnd, int len) {
  std::string r;
  test::RandomString(rnd, len, &r);
  return r;
}
|
||||
|
||||
// Writing level0_file_num_compaction_trigger files to level-0 must kick
// off a compaction that merges them into a single level-1 file.
TEST(PlainTableDBTest, CompactionTrigger) {
  Options options = CurrentOptions();
  options.write_buffer_size = 100 << 10;  // 100KB
  options.num_levels = 3;
  options.max_mem_compaction_level = 0;
  options.level0_file_num_compaction_trigger = 3;
  Reopen(&options);

  Random rnd(301);

  // Fill level-0 with trigger-1 files; no compaction should fire yet.
  for (int num = 0; num < options.level0_file_num_compaction_trigger - 1;
       num++) {
    std::vector<std::string> values;
    // Write 120KB (12 values, each 10K)
    for (int i = 0; i < 12; i++) {
      values.push_back(RandomString(&rnd, 10000));
      ASSERT_OK(Put(Key(i), values[i]));
    }
    dbfull()->TEST_WaitForFlushMemTable();
    ASSERT_EQ(NumTableFilesAtLevel(0), num + 1);
  }

  // generate one more file in level-0, and should trigger level-0 compaction
  std::vector<std::string> values;
  for (int i = 0; i < 12; i++) {
    values.push_back(RandomString(&rnd, 10000));
    ASSERT_OK(Put(Key(i), values[i]));
  }
  dbfull()->TEST_WaitForCompact();

  // All level-0 files should now be merged into a single level-1 file.
  ASSERT_EQ(NumTableFilesAtLevel(0), 0);
  ASSERT_EQ(NumTableFilesAtLevel(1), 1);
}
|
||||
|
||||
} // namespace rocksdb
|
||||
|
||||
// Run every test registered in this file via the project's harness.
int main(int argc, char** argv) {
  return rocksdb::test::RunAllTests();
}
|
||||
499
db/prefix_test.cc
Normal file
499
db/prefix_test.cc
Normal file
@@ -0,0 +1,499 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
|
||||
#ifndef GFLAGS
|
||||
#include <cstdio>
|
||||
int main() {
|
||||
fprintf(stderr, "Please install gflags to run rocksdb tools\n");
|
||||
return 1;
|
||||
}
|
||||
#else
|
||||
|
||||
#include <algorithm>
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
|
||||
#include <gflags/gflags.h>
|
||||
#include "rocksdb/comparator.h"
|
||||
#include "rocksdb/db.h"
|
||||
#include "rocksdb/perf_context.h"
|
||||
#include "rocksdb/slice_transform.h"
|
||||
#include "rocksdb/memtablerep.h"
|
||||
#include "util/histogram.h"
|
||||
#include "util/stop_watch.h"
|
||||
#include "util/testharness.h"
|
||||
|
||||
using GFLAGS::ParseCommandLineFlags;
|
||||
|
||||
DEFINE_bool(trigger_deadlock, false,
|
||||
"issue delete in range scan to trigger PrefixHashMap deadlock");
|
||||
DEFINE_uint64(bucket_count, 100000, "number of buckets");
|
||||
DEFINE_uint64(num_locks, 10001, "number of locks");
|
||||
DEFINE_bool(random_prefix, false, "randomize prefix");
|
||||
DEFINE_uint64(total_prefixes, 100000, "total number of prefixes");
|
||||
DEFINE_uint64(items_per_prefix, 1, "total number of values per prefix");
|
||||
DEFINE_int64(write_buffer_size, 33554432, "");
|
||||
DEFINE_int64(max_write_buffer_number, 2, "");
|
||||
DEFINE_int64(min_write_buffer_number_to_merge, 1, "");
|
||||
DEFINE_int32(skiplist_height, 4, "");
|
||||
DEFINE_int32(memtable_prefix_bloom_bits, 10000000, "");
|
||||
DEFINE_int32(memtable_prefix_bloom_probes, 10, "");
|
||||
DEFINE_int32(memtable_prefix_bloom_huge_page_tlb_size, 2 * 1024 * 1024, "");
|
||||
DEFINE_int32(value_size, 40, "");
|
||||
|
||||
// Path to the database on file system
|
||||
const std::string kDbName = rocksdb::test::TmpDir() + "/prefix_test";
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
// A two-part 16-byte test key: an 8-byte `prefix` (compared first by
// TestKeyComparator) and an 8-byte `sorted` suffix ordered within it.
struct TestKey {
  uint64_t prefix;
  uint64_t sorted;

  TestKey(uint64_t prefix, uint64_t sorted) : prefix(prefix), sorted(sorted) {}
};
|
||||
|
||||
// return a slice backed by test_key
|
||||
inline Slice TestKeyToSlice(const TestKey& test_key) {
|
||||
return Slice((const char*)&test_key, sizeof(test_key));
|
||||
}
|
||||
|
||||
// Reinterpret a slice's bytes as a TestKey. The slice must reference
// bytes produced by TestKeyToSlice (or at least a leading uint64_t for
// prefix-only keys). (reinterpret_cast replaces the C-style cast.)
inline const TestKey* SliceToTestKey(const Slice& slice) {
  return reinterpret_cast<const TestKey*>(slice.data());
}
|
||||
|
||||
// Orders TestKeys by (prefix, sorted), while also accepting "prefix
// only" keys (just the leading uint64_t): a prefix-only key sorts
// before any full key sharing that prefix and equal to another
// prefix-only key with the same prefix.
class TestKeyComparator : public Comparator {
 public:

  // Compare needs to be aware of the possibility of a and/or b is
  // prefix only
  virtual int Compare(const Slice& a, const Slice& b) const {
    const TestKey* key_a = SliceToTestKey(a);
    const TestKey* key_b = SliceToTestKey(b);
    if (key_a->prefix != key_b->prefix) {
      if (key_a->prefix < key_b->prefix) return -1;
      if (key_a->prefix > key_b->prefix) return 1;
    } else {
      ASSERT_TRUE(key_a->prefix == key_b->prefix);
      // note, both a and b could be prefix only
      if (a.size() != b.size()) {
        // one of them is prefix
        ASSERT_TRUE(
            (a.size() == sizeof(uint64_t) && b.size() == sizeof(TestKey)) ||
            (b.size() == sizeof(uint64_t) && a.size() == sizeof(TestKey)));
        // The prefix-only (shorter) key sorts first.
        if (a.size() < b.size()) return -1;
        if (a.size() > b.size()) return 1;
      } else {
        // both a and b are prefix
        if (a.size() == sizeof(uint64_t)) {
          return 0;
        }

        // both a and b are whole key
        ASSERT_TRUE(a.size() == sizeof(TestKey) && b.size() == sizeof(TestKey));
        if (key_a->sorted < key_b->sorted) return -1;
        if (key_a->sorted > key_b->sorted) return 1;
        // NOTE(review): this equality branch is redundant — the final
        // `return 0` below already covers it.
        if (key_a->sorted == key_b->sorted) return 0;
      }
    }
    return 0;
  }

  virtual const char* Name() const override {
    return "TestKeyComparator";
  }

  // Key shortening is intentionally a no-op for this fixed-layout key.
  virtual void FindShortestSeparator(
      std::string* start,
      const Slice& limit) const {
  }

  virtual void FindShortSuccessor(std::string* key) const {}

};
|
||||
|
||||
namespace {
// Write (prefix, suffix) -> value into db, asserting success.
void PutKey(DB* db, WriteOptions write_options, uint64_t prefix,
            uint64_t suffix, const Slice& value) {
  TestKey test_key(prefix, suffix);
  Slice key = TestKeyToSlice(test_key);
  ASSERT_OK(db->Put(write_options, key, value));
}

// Position iter at the first entry >= (prefix, suffix).
void SeekIterator(Iterator* iter, uint64_t prefix, uint64_t suffix) {
  TestKey test_key(prefix, suffix);
  Slice key = TestKeyToSlice(test_key);
  iter->Seek(key);
}

// Sentinel returned by Get() below when a key is absent.
const std::string kNotFoundResult = "NOT_FOUND";

// Read (prefix, suffix) from db. Returns the stored value,
// kNotFoundResult for a missing key, or the status string on any other
// error.
std::string Get(DB* db, const ReadOptions& read_options, uint64_t prefix,
                uint64_t suffix) {
  TestKey test_key(prefix, suffix);
  Slice key = TestKeyToSlice(test_key);

  std::string result;
  Status s = db->Get(read_options, key, &result);
  if (s.IsNotFound()) {
    result = kNotFoundResult;
  } else if (!s.ok()) {
    result = s.ToString();
  }
  return result;
}
}  // namespace
|
||||
|
||||
class PrefixTest {
|
||||
public:
|
||||
std::shared_ptr<DB> OpenDb() {
|
||||
DB* db;
|
||||
|
||||
options.create_if_missing = true;
|
||||
options.write_buffer_size = FLAGS_write_buffer_size;
|
||||
options.max_write_buffer_number = FLAGS_max_write_buffer_number;
|
||||
options.min_write_buffer_number_to_merge =
|
||||
FLAGS_min_write_buffer_number_to_merge;
|
||||
|
||||
options.memtable_prefix_bloom_bits = FLAGS_memtable_prefix_bloom_bits;
|
||||
options.memtable_prefix_bloom_probes = FLAGS_memtable_prefix_bloom_probes;
|
||||
options.memtable_prefix_bloom_huge_page_tlb_size =
|
||||
FLAGS_memtable_prefix_bloom_huge_page_tlb_size;
|
||||
|
||||
Status s = DB::Open(options, kDbName, &db);
|
||||
ASSERT_OK(s);
|
||||
return std::shared_ptr<DB>(db);
|
||||
}
|
||||
|
||||
void FirstOption() {
|
||||
option_config_ = kBegin;
|
||||
}
|
||||
|
||||
bool NextOptions(int bucket_count) {
|
||||
// skip some options
|
||||
option_config_++;
|
||||
if (option_config_ < kEnd) {
|
||||
options.prefix_extractor.reset(NewFixedPrefixTransform(8));
|
||||
switch(option_config_) {
|
||||
case kHashSkipList:
|
||||
options.memtable_factory.reset(
|
||||
NewHashSkipListRepFactory(bucket_count, FLAGS_skiplist_height));
|
||||
return true;
|
||||
case kHashLinkList:
|
||||
options.memtable_factory.reset(
|
||||
NewHashLinkListRepFactory(bucket_count));
|
||||
return true;
|
||||
case kHashLinkListHugePageTlb:
|
||||
options.memtable_factory.reset(
|
||||
NewHashLinkListRepFactory(bucket_count, 2 * 1024 * 1024));
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
PrefixTest() : option_config_(kBegin) {
|
||||
options.comparator = new TestKeyComparator();
|
||||
}
|
||||
~PrefixTest() {
|
||||
delete options.comparator;
|
||||
}
|
||||
protected:
|
||||
enum OptionConfig {
|
||||
kBegin,
|
||||
kHashSkipList,
|
||||
kHashLinkList,
|
||||
kHashLinkListHugePageTlb,
|
||||
kEnd
|
||||
};
|
||||
int option_config_;
|
||||
Options options;
|
||||
};
|
||||
|
||||
TEST(PrefixTest, TestResult) {
|
||||
for (int num_buckets = 1; num_buckets <= 2; num_buckets++) {
|
||||
FirstOption();
|
||||
while (NextOptions(num_buckets)) {
|
||||
std::cout << "*** Mem table: " << options.memtable_factory->Name()
|
||||
<< " number of buckets: " << num_buckets
|
||||
<< std::endl;
|
||||
DestroyDB(kDbName, Options());
|
||||
auto db = OpenDb();
|
||||
WriteOptions write_options;
|
||||
ReadOptions read_options;
|
||||
|
||||
// 1. Insert one row.
|
||||
Slice v16("v16");
|
||||
PutKey(db.get(), write_options, 1, 6, v16);
|
||||
std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
|
||||
SeekIterator(iter.get(), 1, 6);
|
||||
ASSERT_TRUE(iter->Valid());
|
||||
ASSERT_TRUE(v16 == iter->value());
|
||||
SeekIterator(iter.get(), 1, 5);
|
||||
ASSERT_TRUE(iter->Valid());
|
||||
ASSERT_TRUE(v16 == iter->value());
|
||||
SeekIterator(iter.get(), 1, 5);
|
||||
ASSERT_TRUE(iter->Valid());
|
||||
ASSERT_TRUE(v16 == iter->value());
|
||||
iter->Next();
|
||||
ASSERT_TRUE(!iter->Valid());
|
||||
|
||||
SeekIterator(iter.get(), 2, 0);
|
||||
ASSERT_TRUE(!iter->Valid());
|
||||
|
||||
ASSERT_EQ(v16.ToString(), Get(db.get(), read_options, 1, 6));
|
||||
ASSERT_EQ(kNotFoundResult, Get(db.get(), read_options, 1, 5));
|
||||
ASSERT_EQ(kNotFoundResult, Get(db.get(), read_options, 1, 7));
|
||||
ASSERT_EQ(kNotFoundResult, Get(db.get(), read_options, 0, 6));
|
||||
ASSERT_EQ(kNotFoundResult, Get(db.get(), read_options, 2, 6));
|
||||
|
||||
// 2. Insert an entry for the same prefix as the last entry in the bucket.
|
||||
Slice v17("v17");
|
||||
PutKey(db.get(), write_options, 1, 7, v17);
|
||||
iter.reset(db->NewIterator(read_options));
|
||||
SeekIterator(iter.get(), 1, 7);
|
||||
ASSERT_TRUE(iter->Valid());
|
||||
ASSERT_TRUE(v17 == iter->value());
|
||||
|
||||
SeekIterator(iter.get(), 1, 6);
|
||||
ASSERT_TRUE(iter->Valid());
|
||||
ASSERT_TRUE(v16 == iter->value());
|
||||
iter->Next();
|
||||
ASSERT_TRUE(iter->Valid());
|
||||
ASSERT_TRUE(v17 == iter->value());
|
||||
iter->Next();
|
||||
ASSERT_TRUE(!iter->Valid());
|
||||
|
||||
SeekIterator(iter.get(), 2, 0);
|
||||
ASSERT_TRUE(!iter->Valid());
|
||||
|
||||
// 3. Insert an entry for the same prefix as the head of the bucket.
|
||||
Slice v15("v15");
|
||||
PutKey(db.get(), write_options, 1, 5, v15);
|
||||
iter.reset(db->NewIterator(read_options));
|
||||
|
||||
SeekIterator(iter.get(), 1, 7);
|
||||
ASSERT_TRUE(iter->Valid());
|
||||
ASSERT_TRUE(v17 == iter->value());
|
||||
|
||||
SeekIterator(iter.get(), 1, 5);
|
||||
ASSERT_TRUE(iter->Valid());
|
||||
ASSERT_TRUE(v15 == iter->value());
|
||||
iter->Next();
|
||||
ASSERT_TRUE(iter->Valid());
|
||||
ASSERT_TRUE(v16 == iter->value());
|
||||
iter->Next();
|
||||
ASSERT_TRUE(iter->Valid());
|
||||
ASSERT_TRUE(v17 == iter->value());
|
||||
|
||||
SeekIterator(iter.get(), 1, 5);
|
||||
ASSERT_TRUE(iter->Valid());
|
||||
ASSERT_TRUE(v15 == iter->value());
|
||||
|
||||
ASSERT_EQ(v15.ToString(), Get(db.get(), read_options, 1, 5));
|
||||
ASSERT_EQ(v16.ToString(), Get(db.get(), read_options, 1, 6));
|
||||
ASSERT_EQ(v17.ToString(), Get(db.get(), read_options, 1, 7));
|
||||
|
||||
// 4. Insert an entry with a larger prefix
|
||||
Slice v22("v22");
|
||||
PutKey(db.get(), write_options, 2, 2, v22);
|
||||
iter.reset(db->NewIterator(read_options));
|
||||
|
||||
SeekIterator(iter.get(), 2, 2);
|
||||
ASSERT_TRUE(iter->Valid());
|
||||
ASSERT_TRUE(v22 == iter->value());
|
||||
SeekIterator(iter.get(), 2, 0);
|
||||
ASSERT_TRUE(iter->Valid());
|
||||
ASSERT_TRUE(v22 == iter->value());
|
||||
|
||||
SeekIterator(iter.get(), 1, 5);
|
||||
ASSERT_TRUE(iter->Valid());
|
||||
ASSERT_TRUE(v15 == iter->value());
|
||||
|
||||
SeekIterator(iter.get(), 1, 7);
|
||||
ASSERT_TRUE(iter->Valid());
|
||||
ASSERT_TRUE(v17 == iter->value());
|
||||
|
||||
// 5. Insert an entry with a smaller prefix
|
||||
Slice v02("v02");
|
||||
PutKey(db.get(), write_options, 0, 2, v02);
|
||||
iter.reset(db->NewIterator(read_options));
|
||||
|
||||
SeekIterator(iter.get(), 0, 2);
|
||||
ASSERT_TRUE(iter->Valid());
|
||||
ASSERT_TRUE(v02 == iter->value());
|
||||
SeekIterator(iter.get(), 0, 0);
|
||||
ASSERT_TRUE(iter->Valid());
|
||||
ASSERT_TRUE(v02 == iter->value());
|
||||
|
||||
SeekIterator(iter.get(), 2, 0);
|
||||
ASSERT_TRUE(iter->Valid());
|
||||
ASSERT_TRUE(v22 == iter->value());
|
||||
|
||||
SeekIterator(iter.get(), 1, 5);
|
||||
ASSERT_TRUE(iter->Valid());
|
||||
ASSERT_TRUE(v15 == iter->value());
|
||||
|
||||
SeekIterator(iter.get(), 1, 7);
|
||||
ASSERT_TRUE(iter->Valid());
|
||||
ASSERT_TRUE(v17 == iter->value());
|
||||
|
||||
// 6. Insert to the beginning and the end of the first prefix
|
||||
Slice v13("v13");
|
||||
Slice v18("v18");
|
||||
PutKey(db.get(), write_options, 1, 3, v13);
|
||||
PutKey(db.get(), write_options, 1, 8, v18);
|
||||
iter.reset(db->NewIterator(read_options));
|
||||
SeekIterator(iter.get(), 1, 7);
|
||||
ASSERT_TRUE(iter->Valid());
|
||||
ASSERT_TRUE(v17 == iter->value());
|
||||
|
||||
SeekIterator(iter.get(), 1, 3);
|
||||
ASSERT_TRUE(iter->Valid());
|
||||
ASSERT_TRUE(v13 == iter->value());
|
||||
iter->Next();
|
||||
ASSERT_TRUE(iter->Valid());
|
||||
ASSERT_TRUE(v15 == iter->value());
|
||||
iter->Next();
|
||||
ASSERT_TRUE(iter->Valid());
|
||||
ASSERT_TRUE(v16 == iter->value());
|
||||
iter->Next();
|
||||
ASSERT_TRUE(iter->Valid());
|
||||
ASSERT_TRUE(v17 == iter->value());
|
||||
iter->Next();
|
||||
ASSERT_TRUE(iter->Valid());
|
||||
ASSERT_TRUE(v18 == iter->value());
|
||||
|
||||
SeekIterator(iter.get(), 0, 0);
|
||||
ASSERT_TRUE(iter->Valid());
|
||||
ASSERT_TRUE(v02 == iter->value());
|
||||
|
||||
SeekIterator(iter.get(), 2, 0);
|
||||
ASSERT_TRUE(iter->Valid());
|
||||
ASSERT_TRUE(v22 == iter->value());
|
||||
|
||||
ASSERT_EQ(v22.ToString(), Get(db.get(), read_options, 2, 2));
|
||||
ASSERT_EQ(v02.ToString(), Get(db.get(), read_options, 0, 2));
|
||||
ASSERT_EQ(v13.ToString(), Get(db.get(), read_options, 1, 3));
|
||||
ASSERT_EQ(v15.ToString(), Get(db.get(), read_options, 1, 5));
|
||||
ASSERT_EQ(v16.ToString(), Get(db.get(), read_options, 1, 6));
|
||||
ASSERT_EQ(v17.ToString(), Get(db.get(), read_options, 1, 7));
|
||||
ASSERT_EQ(v18.ToString(), Get(db.get(), read_options, 1, 8));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
TEST(PrefixTest, DynamicPrefixIterator) {
|
||||
while (NextOptions(FLAGS_bucket_count)) {
|
||||
std::cout << "*** Mem table: " << options.memtable_factory->Name()
|
||||
<< std::endl;
|
||||
DestroyDB(kDbName, Options());
|
||||
auto db = OpenDb();
|
||||
WriteOptions write_options;
|
||||
ReadOptions read_options;
|
||||
|
||||
std::vector<uint64_t> prefixes;
|
||||
for (uint64_t i = 0; i < FLAGS_total_prefixes; ++i) {
|
||||
prefixes.push_back(i);
|
||||
}
|
||||
|
||||
if (FLAGS_random_prefix) {
|
||||
std::random_shuffle(prefixes.begin(), prefixes.end());
|
||||
}
|
||||
|
||||
HistogramImpl hist_put_time;
|
||||
HistogramImpl hist_put_comparison;
|
||||
|
||||
// insert x random prefix, each with y continuous element.
|
||||
for (auto prefix : prefixes) {
|
||||
for (uint64_t sorted = 0; sorted < FLAGS_items_per_prefix; sorted++) {
|
||||
TestKey test_key(prefix, sorted);
|
||||
|
||||
Slice key = TestKeyToSlice(test_key);
|
||||
std::string value(FLAGS_value_size, 0);
|
||||
|
||||
perf_context.Reset();
|
||||
StopWatchNano timer(Env::Default(), true);
|
||||
ASSERT_OK(db->Put(write_options, key, value));
|
||||
hist_put_time.Add(timer.ElapsedNanos());
|
||||
hist_put_comparison.Add(perf_context.user_key_comparison_count);
|
||||
}
|
||||
}
|
||||
|
||||
std::cout << "Put key comparison: \n" << hist_put_comparison.ToString()
|
||||
<< "Put time: \n" << hist_put_time.ToString();
|
||||
|
||||
// test seek existing keys
|
||||
HistogramImpl hist_seek_time;
|
||||
HistogramImpl hist_seek_comparison;
|
||||
|
||||
std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
|
||||
|
||||
for (auto prefix : prefixes) {
|
||||
TestKey test_key(prefix, FLAGS_items_per_prefix / 2);
|
||||
Slice key = TestKeyToSlice(test_key);
|
||||
std::string value = "v" + std::to_string(0);
|
||||
|
||||
perf_context.Reset();
|
||||
StopWatchNano timer(Env::Default(), true);
|
||||
auto key_prefix = options.prefix_extractor->Transform(key);
|
||||
uint64_t total_keys = 0;
|
||||
for (iter->Seek(key);
|
||||
iter->Valid() && iter->key().starts_with(key_prefix);
|
||||
iter->Next()) {
|
||||
if (FLAGS_trigger_deadlock) {
|
||||
std::cout << "Behold the deadlock!\n";
|
||||
db->Delete(write_options, iter->key());
|
||||
}
|
||||
total_keys++;
|
||||
}
|
||||
hist_seek_time.Add(timer.ElapsedNanos());
|
||||
hist_seek_comparison.Add(perf_context.user_key_comparison_count);
|
||||
ASSERT_EQ(total_keys, FLAGS_items_per_prefix - FLAGS_items_per_prefix/2);
|
||||
}
|
||||
|
||||
std::cout << "Seek key comparison: \n"
|
||||
<< hist_seek_comparison.ToString()
|
||||
<< "Seek time: \n"
|
||||
<< hist_seek_time.ToString();
|
||||
|
||||
// test non-existing keys
|
||||
HistogramImpl hist_no_seek_time;
|
||||
HistogramImpl hist_no_seek_comparison;
|
||||
|
||||
for (auto prefix = FLAGS_total_prefixes;
|
||||
prefix < FLAGS_total_prefixes + 10000;
|
||||
prefix++) {
|
||||
TestKey test_key(prefix, 0);
|
||||
Slice key = TestKeyToSlice(test_key);
|
||||
|
||||
perf_context.Reset();
|
||||
StopWatchNano timer(Env::Default(), true);
|
||||
iter->Seek(key);
|
||||
hist_no_seek_time.Add(timer.ElapsedNanos());
|
||||
hist_no_seek_comparison.Add(perf_context.user_key_comparison_count);
|
||||
ASSERT_TRUE(!iter->Valid());
|
||||
}
|
||||
|
||||
std::cout << "non-existing Seek key comparison: \n"
|
||||
<< hist_no_seek_comparison.ToString()
|
||||
<< "non-existing Seek time: \n"
|
||||
<< hist_no_seek_time.ToString();
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
ParseCommandLineFlags(&argc, &argv, true);
|
||||
std::cout << kDbName << "\n";
|
||||
|
||||
rocksdb::test::RunAllTests();
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif // GFLAGS
|
||||
403
db/repair.cc
Normal file
403
db/repair.cc
Normal file
@@ -0,0 +1,403 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
//
|
||||
// We recover the contents of the descriptor from the other files we find.
|
||||
// (1) Any log files are first converted to tables
|
||||
// (2) We scan every table to compute
|
||||
// (a) smallest/largest for the table
|
||||
// (b) largest sequence number in the table
|
||||
// (3) We generate descriptor contents:
|
||||
// - log number is set to zero
|
||||
// - next-file-number is set to 1 + largest file number we found
|
||||
// - last-sequence-number is set to largest sequence# found across
|
||||
// all tables (see 2c)
|
||||
// - compaction pointers are cleared
|
||||
// - every table file is added at level 0
|
||||
//
|
||||
// Possible optimization 1:
|
||||
// (a) Compute total size and use to pick appropriate max-level M
|
||||
// (b) Sort tables by largest sequence# in the table
|
||||
// (c) For each table: if it overlaps earlier table, place in level-0,
|
||||
// else place in level-M.
|
||||
// Possible optimization 2:
|
||||
// Store per-table metadata (smallest, largest, largest-seq#, ...)
|
||||
// in the table's meta section to speed up ScanTable.
|
||||
|
||||
#ifndef ROCKSDB_LITE
|
||||
|
||||
#include "db/builder.h"
|
||||
#include "db/db_impl.h"
|
||||
#include "db/dbformat.h"
|
||||
#include "db/filename.h"
|
||||
#include "db/log_reader.h"
|
||||
#include "db/log_writer.h"
|
||||
#include "db/memtable.h"
|
||||
#include "db/table_cache.h"
|
||||
#include "db/version_edit.h"
|
||||
#include "db/write_batch_internal.h"
|
||||
#include "rocksdb/comparator.h"
|
||||
#include "rocksdb/db.h"
|
||||
#include "rocksdb/env.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
namespace {
|
||||
|
||||
class Repairer {
|
||||
public:
|
||||
Repairer(const std::string& dbname, const Options& options)
|
||||
: dbname_(dbname),
|
||||
env_(options.env),
|
||||
icmp_(options.comparator),
|
||||
ipolicy_(options.filter_policy),
|
||||
options_(SanitizeOptions(dbname, &icmp_, &ipolicy_, options)),
|
||||
raw_table_cache_(
|
||||
// TableCache can be small since we expect each table to be opened
|
||||
// once.
|
||||
NewLRUCache(10, options_.table_cache_numshardbits,
|
||||
options_.table_cache_remove_scan_count_limit)),
|
||||
next_file_number_(1) {
|
||||
table_cache_ = new TableCache(dbname_, &options_, storage_options_,
|
||||
raw_table_cache_.get());
|
||||
edit_ = new VersionEdit();
|
||||
}
|
||||
|
||||
~Repairer() {
|
||||
delete table_cache_;
|
||||
raw_table_cache_.reset();
|
||||
delete edit_;
|
||||
}
|
||||
|
||||
Status Run() {
|
||||
Status status = FindFiles();
|
||||
if (status.ok()) {
|
||||
ConvertLogFilesToTables();
|
||||
ExtractMetaData();
|
||||
status = WriteDescriptor();
|
||||
}
|
||||
if (status.ok()) {
|
||||
unsigned long long bytes = 0;
|
||||
for (size_t i = 0; i < tables_.size(); i++) {
|
||||
bytes += tables_[i].meta.file_size;
|
||||
}
|
||||
Log(options_.info_log,
|
||||
"**** Repaired rocksdb %s; "
|
||||
"recovered %d files; %llu bytes. "
|
||||
"Some data may have been lost. "
|
||||
"****",
|
||||
dbname_.c_str(),
|
||||
static_cast<int>(tables_.size()),
|
||||
bytes);
|
||||
}
|
||||
return status;
|
||||
}
|
||||
|
||||
private:
|
||||
struct TableInfo {
|
||||
FileMetaData meta;
|
||||
SequenceNumber min_sequence;
|
||||
SequenceNumber max_sequence;
|
||||
};
|
||||
|
||||
std::string const dbname_;
|
||||
Env* const env_;
|
||||
InternalKeyComparator const icmp_;
|
||||
InternalFilterPolicy const ipolicy_;
|
||||
Options const options_;
|
||||
std::shared_ptr<Cache> raw_table_cache_;
|
||||
TableCache* table_cache_;
|
||||
VersionEdit* edit_;
|
||||
|
||||
std::vector<std::string> manifests_;
|
||||
std::vector<uint64_t> table_numbers_;
|
||||
std::vector<uint64_t> logs_;
|
||||
std::vector<TableInfo> tables_;
|
||||
uint64_t next_file_number_;
|
||||
const EnvOptions storage_options_;
|
||||
|
||||
Status FindFiles() {
|
||||
std::vector<std::string> filenames;
|
||||
Status status = env_->GetChildren(dbname_, &filenames);
|
||||
if (!status.ok()) {
|
||||
return status;
|
||||
}
|
||||
if (filenames.empty()) {
|
||||
return Status::Corruption(dbname_, "repair found no files");
|
||||
}
|
||||
|
||||
uint64_t number;
|
||||
FileType type;
|
||||
for (size_t i = 0; i < filenames.size(); i++) {
|
||||
if (ParseFileName(filenames[i], &number, &type)) {
|
||||
if (type == kDescriptorFile) {
|
||||
manifests_.push_back(filenames[i]);
|
||||
} else {
|
||||
if (number + 1 > next_file_number_) {
|
||||
next_file_number_ = number + 1;
|
||||
}
|
||||
if (type == kLogFile) {
|
||||
logs_.push_back(number);
|
||||
} else if (type == kTableFile) {
|
||||
table_numbers_.push_back(number);
|
||||
} else {
|
||||
// Ignore other files
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return status;
|
||||
}
|
||||
|
||||
void ConvertLogFilesToTables() {
|
||||
for (size_t i = 0; i < logs_.size(); i++) {
|
||||
std::string logname = LogFileName(dbname_, logs_[i]);
|
||||
Status status = ConvertLogToTable(logs_[i]);
|
||||
if (!status.ok()) {
|
||||
Log(options_.info_log, "Log #%llu: ignoring conversion error: %s",
|
||||
(unsigned long long) logs_[i],
|
||||
status.ToString().c_str());
|
||||
}
|
||||
ArchiveFile(logname);
|
||||
}
|
||||
}
|
||||
|
||||
Status ConvertLogToTable(uint64_t log) {
|
||||
struct LogReporter : public log::Reader::Reporter {
|
||||
Env* env;
|
||||
std::shared_ptr<Logger> info_log;
|
||||
uint64_t lognum;
|
||||
virtual void Corruption(size_t bytes, const Status& s) {
|
||||
// We print error messages for corruption, but continue repairing.
|
||||
Log(info_log, "Log #%llu: dropping %d bytes; %s",
|
||||
(unsigned long long) lognum,
|
||||
static_cast<int>(bytes),
|
||||
s.ToString().c_str());
|
||||
}
|
||||
};
|
||||
|
||||
// Open the log file
|
||||
std::string logname = LogFileName(dbname_, log);
|
||||
unique_ptr<SequentialFile> lfile;
|
||||
Status status = env_->NewSequentialFile(logname, &lfile, storage_options_);
|
||||
if (!status.ok()) {
|
||||
return status;
|
||||
}
|
||||
|
||||
// Create the log reader.
|
||||
LogReporter reporter;
|
||||
reporter.env = env_;
|
||||
reporter.info_log = options_.info_log;
|
||||
reporter.lognum = log;
|
||||
// We intentially make log::Reader do checksumming so that
|
||||
// corruptions cause entire commits to be skipped instead of
|
||||
// propagating bad information (like overly large sequence
|
||||
// numbers).
|
||||
log::Reader reader(std::move(lfile), &reporter, false/*do not checksum*/,
|
||||
0/*initial_offset*/);
|
||||
|
||||
// Read all the records and add to a memtable
|
||||
std::string scratch;
|
||||
Slice record;
|
||||
WriteBatch batch;
|
||||
MemTable* mem = new MemTable(icmp_, options_);
|
||||
auto cf_mems_default = new ColumnFamilyMemTablesDefault(mem, &options_);
|
||||
mem->Ref();
|
||||
int counter = 0;
|
||||
while (reader.ReadRecord(&record, &scratch)) {
|
||||
if (record.size() < 12) {
|
||||
reporter.Corruption(
|
||||
record.size(), Status::Corruption("log record too small"));
|
||||
continue;
|
||||
}
|
||||
WriteBatchInternal::SetContents(&batch, record);
|
||||
status = WriteBatchInternal::InsertInto(&batch, cf_mems_default);
|
||||
if (status.ok()) {
|
||||
counter += WriteBatchInternal::Count(&batch);
|
||||
} else {
|
||||
Log(options_.info_log, "Log #%llu: ignoring %s",
|
||||
(unsigned long long) log,
|
||||
status.ToString().c_str());
|
||||
status = Status::OK(); // Keep going with rest of file
|
||||
}
|
||||
}
|
||||
|
||||
// Do not record a version edit for this conversion to a Table
|
||||
// since ExtractMetaData() will also generate edits.
|
||||
FileMetaData meta;
|
||||
meta.number = next_file_number_++;
|
||||
ReadOptions ro;
|
||||
Iterator* iter = mem->NewIterator(ro, true /* enforce_total_order */);
|
||||
status = BuildTable(dbname_, env_, options_, storage_options_, table_cache_,
|
||||
iter, &meta, icmp_, 0, 0, kNoCompression);
|
||||
delete iter;
|
||||
delete mem->Unref();
|
||||
delete cf_mems_default;
|
||||
mem = nullptr;
|
||||
if (status.ok()) {
|
||||
if (meta.file_size > 0) {
|
||||
table_numbers_.push_back(meta.number);
|
||||
}
|
||||
}
|
||||
Log(options_.info_log, "Log #%llu: %d ops saved to Table #%llu %s",
|
||||
(unsigned long long) log,
|
||||
counter,
|
||||
(unsigned long long) meta.number,
|
||||
status.ToString().c_str());
|
||||
return status;
|
||||
}
|
||||
|
||||
void ExtractMetaData() {
|
||||
for (size_t i = 0; i < table_numbers_.size(); i++) {
|
||||
TableInfo t;
|
||||
t.meta.number = table_numbers_[i];
|
||||
Status status = ScanTable(&t);
|
||||
if (!status.ok()) {
|
||||
std::string fname = TableFileName(dbname_, table_numbers_[i]);
|
||||
Log(options_.info_log, "Table #%llu: ignoring %s",
|
||||
(unsigned long long) table_numbers_[i],
|
||||
status.ToString().c_str());
|
||||
ArchiveFile(fname);
|
||||
} else {
|
||||
tables_.push_back(t);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Status ScanTable(TableInfo* t) {
|
||||
std::string fname = TableFileName(dbname_, t->meta.number);
|
||||
int counter = 0;
|
||||
Status status = env_->GetFileSize(fname, &t->meta.file_size);
|
||||
if (status.ok()) {
|
||||
FileMetaData dummy_meta(t->meta.number, t->meta.file_size);
|
||||
Iterator* iter = table_cache_->NewIterator(
|
||||
ReadOptions(), storage_options_, icmp_, dummy_meta);
|
||||
bool empty = true;
|
||||
ParsedInternalKey parsed;
|
||||
t->min_sequence = 0;
|
||||
t->max_sequence = 0;
|
||||
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
|
||||
Slice key = iter->key();
|
||||
if (!ParseInternalKey(key, &parsed)) {
|
||||
Log(options_.info_log, "Table #%llu: unparsable key %s",
|
||||
(unsigned long long) t->meta.number,
|
||||
EscapeString(key).c_str());
|
||||
continue;
|
||||
}
|
||||
|
||||
counter++;
|
||||
if (empty) {
|
||||
empty = false;
|
||||
t->meta.smallest.DecodeFrom(key);
|
||||
}
|
||||
t->meta.largest.DecodeFrom(key);
|
||||
if (parsed.sequence < t->min_sequence) {
|
||||
t->min_sequence = parsed.sequence;
|
||||
}
|
||||
if (parsed.sequence > t->max_sequence) {
|
||||
t->max_sequence = parsed.sequence;
|
||||
}
|
||||
}
|
||||
if (!iter->status().ok()) {
|
||||
status = iter->status();
|
||||
}
|
||||
delete iter;
|
||||
}
|
||||
Log(options_.info_log, "Table #%llu: %d entries %s",
|
||||
(unsigned long long) t->meta.number,
|
||||
counter,
|
||||
status.ToString().c_str());
|
||||
return status;
|
||||
}
|
||||
|
||||
Status WriteDescriptor() {
|
||||
std::string tmp = TempFileName(dbname_, 1);
|
||||
unique_ptr<WritableFile> file;
|
||||
Status status = env_->NewWritableFile(
|
||||
tmp, &file, env_->OptimizeForManifestWrite(storage_options_));
|
||||
if (!status.ok()) {
|
||||
return status;
|
||||
}
|
||||
|
||||
SequenceNumber max_sequence = 0;
|
||||
for (size_t i = 0; i < tables_.size(); i++) {
|
||||
if (max_sequence < tables_[i].max_sequence) {
|
||||
max_sequence = tables_[i].max_sequence;
|
||||
}
|
||||
}
|
||||
|
||||
edit_->SetComparatorName(icmp_.user_comparator()->Name());
|
||||
edit_->SetLogNumber(0);
|
||||
edit_->SetNextFile(next_file_number_);
|
||||
edit_->SetLastSequence(max_sequence);
|
||||
|
||||
for (size_t i = 0; i < tables_.size(); i++) {
|
||||
// TODO(opt): separate out into multiple levels
|
||||
const TableInfo& t = tables_[i];
|
||||
edit_->AddFile(0, t.meta.number, t.meta.file_size,
|
||||
t.meta.smallest, t.meta.largest,
|
||||
t.min_sequence, t.max_sequence);
|
||||
}
|
||||
|
||||
//fprintf(stderr, "NewDescriptor:\n%s\n", edit_.DebugString().c_str());
|
||||
{
|
||||
log::Writer log(std::move(file));
|
||||
std::string record;
|
||||
edit_->EncodeTo(&record);
|
||||
status = log.AddRecord(record);
|
||||
}
|
||||
|
||||
if (!status.ok()) {
|
||||
env_->DeleteFile(tmp);
|
||||
} else {
|
||||
// Discard older manifests
|
||||
for (size_t i = 0; i < manifests_.size(); i++) {
|
||||
ArchiveFile(dbname_ + "/" + manifests_[i]);
|
||||
}
|
||||
|
||||
// Install new manifest
|
||||
status = env_->RenameFile(tmp, DescriptorFileName(dbname_, 1));
|
||||
if (status.ok()) {
|
||||
status = SetCurrentFile(env_, dbname_, 1, nullptr);
|
||||
} else {
|
||||
env_->DeleteFile(tmp);
|
||||
}
|
||||
}
|
||||
return status;
|
||||
}
|
||||
|
||||
void ArchiveFile(const std::string& fname) {
|
||||
// Move into another directory. E.g., for
|
||||
// dir/foo
|
||||
// rename to
|
||||
// dir/lost/foo
|
||||
const char* slash = strrchr(fname.c_str(), '/');
|
||||
std::string new_dir;
|
||||
if (slash != nullptr) {
|
||||
new_dir.assign(fname.data(), slash - fname.data());
|
||||
}
|
||||
new_dir.append("/lost");
|
||||
env_->CreateDir(new_dir); // Ignore error
|
||||
std::string new_file = new_dir;
|
||||
new_file.append("/");
|
||||
new_file.append((slash == nullptr) ? fname.c_str() : slash + 1);
|
||||
Status s = env_->RenameFile(fname, new_file);
|
||||
Log(options_.info_log, "Archiving %s: %s\n",
|
||||
fname.c_str(), s.ToString().c_str());
|
||||
}
|
||||
};
|
||||
} // namespace
|
||||
|
||||
Status RepairDB(const std::string& dbname, const Options& options) {
|
||||
Repairer repairer(dbname, options);
|
||||
return repairer.Run();
|
||||
}
|
||||
|
||||
} // namespace rocksdb
|
||||
|
||||
#endif // ROCKSDB_LITE
|
||||
800
db/simple_table_db_test.cc
Normal file
800
db/simple_table_db_test.cc
Normal file
@@ -0,0 +1,800 @@
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
#include <algorithm>
|
||||
#include <set>
|
||||
|
||||
#include "rocksdb/db.h"
|
||||
#include "rocksdb/filter_policy.h"
|
||||
#include "db/db_impl.h"
|
||||
#include "db/filename.h"
|
||||
#include "db/version_set.h"
|
||||
#include "db/write_batch_internal.h"
|
||||
#include "rocksdb/statistics.h"
|
||||
#include "rocksdb/cache.h"
|
||||
#include "rocksdb/compaction_filter.h"
|
||||
#include "rocksdb/env.h"
|
||||
#include "rocksdb/table.h"
|
||||
#include "rocksdb/table_properties.h"
|
||||
#include "table/table_builder.h"
|
||||
#include "util/hash.h"
|
||||
#include "util/logging.h"
|
||||
#include "util/mutexlock.h"
|
||||
#include "util/testharness.h"
|
||||
#include "util/testutil.h"
|
||||
#include "utilities/merge_operators.h"
|
||||
|
||||
using std::unique_ptr;
|
||||
|
||||
// IS THIS FILE STILL NEEDED?
|
||||
namespace rocksdb {
|
||||
|
||||
// SimpleTable is a simple table format for UNIT TEST ONLY. It is not built
|
||||
// as production quality.
|
||||
// SimpleTable requires the input key size to be fixed 16 bytes, value cannot
|
||||
// be longer than 150000 bytes and stored data on disk in this format:
|
||||
// +--------------------------------------------+ <= key1 offset
|
||||
// | key1 | value_size (4 bytes) | |
|
||||
// +----------------------------------------+ |
|
||||
// | value1 |
|
||||
// | |
|
||||
// +----------------------------------------+---+ <= key2 offset
|
||||
// | key2 | value_size (4 bytes) | |
|
||||
// +----------------------------------------+ |
|
||||
// | value2 |
|
||||
// | |
|
||||
// | ...... |
|
||||
// +-----------------+--------------------------+ <= index_block_offset
|
||||
// | key1 | key1 offset (8 bytes) |
|
||||
// +-----------------+--------------------------+
|
||||
// | key2 | key2 offset (8 bytes) |
|
||||
// +-----------------+--------------------------+
|
||||
// | key3 | key3 offset (8 bytes) |
|
||||
// +-----------------+--------------------------+
|
||||
// | ...... |
|
||||
// +-----------------+------------+-------------+
|
||||
// | index_block_offset (8 bytes) |
|
||||
// +------------------------------+
|
||||
|
||||
// SimpleTable is a simple table format for UNIT TEST ONLY. It is not built
|
||||
// as production quality.
|
||||
class SimpleTableReader: public TableReader {
|
||||
public:
|
||||
// Attempt to open the table that is stored in bytes [0..file_size)
|
||||
// of "file", and read the metadata entries necessary to allow
|
||||
// retrieving data from the table.
|
||||
//
|
||||
// If successful, returns ok and sets "*table" to the newly opened
|
||||
// table. The client should delete "*table" when no longer needed.
|
||||
// If there was an error while initializing the table, sets "*table"
|
||||
// to nullptr and returns a non-ok status. Does not take ownership of
|
||||
// "*source", but the client must ensure that "source" remains live
|
||||
// for the duration of the returned table's lifetime.
|
||||
//
|
||||
// *file must remain live while this Table is in use.
|
||||
static Status Open(const Options& options, const EnvOptions& soptions,
|
||||
unique_ptr<RandomAccessFile> && file, uint64_t file_size,
|
||||
unique_ptr<TableReader>* table_reader);
|
||||
|
||||
Iterator* NewIterator(const ReadOptions&, Arena* arena) override;
|
||||
|
||||
Status Get(const ReadOptions&, const Slice& key, void* arg,
|
||||
bool (*handle_result)(void* arg, const ParsedInternalKey& k,
|
||||
const Slice& v, bool),
|
||||
void (*mark_key_may_exist)(void*) = nullptr) override;
|
||||
|
||||
uint64_t ApproximateOffsetOf(const Slice& key) override;
|
||||
|
||||
void SetupForCompaction() override;
|
||||
|
||||
std::shared_ptr<const TableProperties> GetTableProperties() const override;
|
||||
|
||||
~SimpleTableReader();
|
||||
|
||||
private:
|
||||
struct Rep;
|
||||
Rep* rep_;
|
||||
|
||||
explicit SimpleTableReader(Rep* rep) {
|
||||
rep_ = rep;
|
||||
}
|
||||
friend class TableCache;
|
||||
friend class SimpleTableIterator;
|
||||
|
||||
Status GetOffset(const Slice& target, uint64_t* offset);
|
||||
|
||||
// No copying allowed
|
||||
explicit SimpleTableReader(const TableReader&) = delete;
|
||||
void operator=(const TableReader&) = delete;
|
||||
};
|
||||
|
||||
// Iterator to iterate SimpleTable
|
||||
class SimpleTableIterator: public Iterator {
|
||||
public:
|
||||
explicit SimpleTableIterator(SimpleTableReader* table);
|
||||
~SimpleTableIterator();
|
||||
|
||||
bool Valid() const;
|
||||
|
||||
void SeekToFirst();
|
||||
|
||||
void SeekToLast();
|
||||
|
||||
void Seek(const Slice& target);
|
||||
|
||||
void Next();
|
||||
|
||||
void Prev();
|
||||
|
||||
Slice key() const;
|
||||
|
||||
Slice value() const;
|
||||
|
||||
Status status() const;
|
||||
|
||||
private:
|
||||
SimpleTableReader* table_;
|
||||
uint64_t offset_;
|
||||
uint64_t next_offset_;
|
||||
Slice key_;
|
||||
Slice value_;
|
||||
char tmp_str_[4];
|
||||
char* key_str_;
|
||||
char* value_str_;
|
||||
int value_str_len_;
|
||||
Status status_;
|
||||
// No copying allowed
|
||||
SimpleTableIterator(const SimpleTableIterator&) = delete;
|
||||
void operator=(const Iterator&) = delete;
|
||||
};
|
||||
|
||||
// Internal state of a SimpleTableReader, shared with its iterators.
struct SimpleTableReader::Rep {
  ~Rep() {
  }
  Rep(const EnvOptions& storage_options, uint64_t index_start_offset,
      int num_entries) :
      soptions(storage_options), index_start_offset(index_start_offset),
      num_entries(num_entries) {
  }

  Options options;
  // NOTE(review): reference member -- the EnvOptions passed to the
  // constructor must outlive this Rep; confirm at the call sites.
  const EnvOptions& soptions;
  Status status;
  unique_ptr<RandomAccessFile> file;
  uint64_t index_start_offset;  // file offset where the index block begins
  int num_entries;              // number of index entries in the table
  std::shared_ptr<TableProperties> table_properties;

  // Fixed on-disk layout: 16-byte user key + 8-byte internal-key footer,
  // with 8-byte offsets stored in the index.
  const static int user_key_size = 16;
  const static int offset_length = 8;
  const static int key_footer_len = 8;

  // Total serialized size of one internal key.
  static int GetInternalKeyLength() {
    return user_key_size + key_footer_len;
  }
};
|
||||
|
||||
// The reader owns its Rep (and, through it, the open file handle).
SimpleTableReader::~SimpleTableReader() {
  delete rep_;
}
|
||||
|
||||
// Opens a SimpleTable file: reads the fixed64 footer at the end of the
// file (the offset where the index block starts), derives the entry count
// from the index size, and hands back a SimpleTableReader via
// *table_reader.  Returns the first I/O error encountered.
Status SimpleTableReader::Open(const Options& options,
                               const EnvOptions& soptions,
                               unique_ptr<RandomAccessFile> && file,
                               uint64_t size,
                               unique_ptr<TableReader>* table_reader) {
  // Guard against files too short to hold a footer; without this the
  // subtraction below would wrap around.
  if (size < Rep::offset_length) {
    return Status::Corruption("file is too short to be a SimpleTable");
  }
  char footer_space[Rep::offset_length];
  Slice footer_input;
  Status s = file->Read(size - Rep::offset_length, Rep::offset_length,
                        &footer_input, footer_space);
  if (s.ok()) {
    // Decode from the Slice that Read() returned, not the scratch buffer:
    // Read() is allowed to point the result at its own memory (e.g. an
    // mmapped region) without copying into the scratch.
    uint64_t index_start_offset = DecodeFixed64(footer_input.data());

    int num_entries = (size - Rep::offset_length - index_start_offset)
        / (Rep::GetInternalKeyLength() + Rep::offset_length);
    SimpleTableReader::Rep* rep = new SimpleTableReader::Rep(soptions,
                                                             index_start_offset,
                                                             num_entries);

    rep->file = std::move(file);
    rep->options = options;
    table_reader->reset(new SimpleTableReader(rep));
  }
  return s;
}
|
||||
|
||||
// Intentionally a no-op: SimpleTable has no compaction-specific read
// optimizations to enable.
void SimpleTableReader::SetupForCompaction() {
}
|
||||
|
||||
// Returns the cached table properties.  NOTE(review): nothing in this
// file ever populates rep_->table_properties, so callers will receive a
// null shared_ptr -- confirm whether that is intended.
std::shared_ptr<const TableProperties> SimpleTableReader::GetTableProperties()
    const {
  return rep_->table_properties;
}
|
||||
|
||||
// Creates an iterator over this table.  When an arena is supplied the
// iterator is placement-constructed in arena-owned storage (freed with the
// arena); otherwise it is heap-allocated and owned by the caller.
Iterator* SimpleTableReader::NewIterator(const ReadOptions& options,
                                         Arena* arena) {
  if (arena != nullptr) {
    auto* storage = arena->AllocateAligned(sizeof(SimpleTableIterator));
    return new (storage) SimpleTableIterator(this);
  }
  return new SimpleTableIterator(this);
}
|
||||
|
||||
// Binary-searches the on-disk index for the first entry whose internal key
// is >= target and stores that entry's data offset in *offset.  If every
// key is smaller than target (or the table is empty), *offset is set to
// the start of the index block, i.e. "past the end" of the data area.
Status SimpleTableReader::GetOffset(const Slice& target, uint64_t* offset) {
  // Empty table: `num_entries - 1` below would wrap the unsigned `right`
  // to UINT32_MAX and the search would read garbage.
  if (rep_->num_entries <= 0) {
    *offset = rep_->index_start_offset;
    return Status::OK();
  }
  uint32_t left = 0;
  uint32_t right = rep_->num_entries - 1;
  // Sized from static constants so this is a real array, not a VLA
  // compiler extension.
  char key_chars[Rep::user_key_size + Rep::key_footer_len];
  Slice tmp_slice;

  // The comparator is loop-invariant; construct it once rather than on
  // every iteration.
  InternalKeyComparator ikc(rep_->options.comparator);

  uint32_t target_offset = 0;
  while (left <= right) {
    uint32_t mid = (left + right + 1) / 2;

    uint64_t offset_to_read = rep_->index_start_offset
        + (Rep::GetInternalKeyLength() + Rep::offset_length) * mid;
    Status s = rep_->file->Read(offset_to_read, Rep::GetInternalKeyLength(),
                                &tmp_slice, key_chars);
    if (!s.ok()) {
      return s;
    }

    int compare_result = ikc.Compare(tmp_slice, target);

    if (compare_result < 0) {
      if (left == right) {
        // Everything up to `right` is < target.
        target_offset = right + 1;
        break;
      }
      left = mid;
    } else {
      if (left == right) {
        target_offset = left;
        break;
      }
      right = mid - 1;
    }
  }

  if (target_offset >= (uint32_t) rep_->num_entries) {
    // All keys are < target: report the end of the data area.
    *offset = rep_->index_start_offset;
    return Status::OK();
  }

  char value_offset_chars[Rep::offset_length];

  int64_t offset_for_value_offset = rep_->index_start_offset
      + (Rep::GetInternalKeyLength() + Rep::offset_length) * target_offset
      + Rep::GetInternalKeyLength();
  Status s = rep_->file->Read(offset_for_value_offset, Rep::offset_length,
                              &tmp_slice, value_offset_chars);
  if (s.ok()) {
    // Decode from the returned Slice, not the scratch: Read() may point
    // the result at file-owned memory without filling the scratch buffer.
    *offset = DecodeFixed64(tmp_slice.data());
  }
  return s;
}
|
||||
|
||||
// Looks up key k: seeks an iterator to k and feeds each following entry to
// the caller's `saver` callback until it returns false.  Returns Corruption
// if an internal key fails to parse, otherwise the iterator's status.
// (mark_key_may_exist is unused by this implementation.)
Status SimpleTableReader::Get(const ReadOptions& options, const Slice& k,
                              void* arg,
                              bool (*saver)(void*, const ParsedInternalKey&,
                                            const Slice&, bool),
                              void (*mark_key_may_exist)(void*)) {
  // Stack-allocate the iterator so it is released on every return path;
  // the previous heap-allocated version leaked when a corrupt key forced
  // the early return before the explicit delete.
  SimpleTableIterator iter(this);
  for (iter.Seek(k); iter.Valid(); iter.Next()) {
    ParsedInternalKey parsed_key;
    if (!ParseInternalKey(iter.key(), &parsed_key)) {
      return Status::Corruption(Slice());
    }

    // Stop as soon as the saver indicates it has what it needs.
    if (!(*saver)(arg, parsed_key, iter.value(), true)) {
      break;
    }
  }
  return iter.status();
}
|
||||
|
||||
// Stub: SimpleTable does not support approximate-offset queries, so every
// key reports offset 0.
uint64_t SimpleTableReader::ApproximateOffsetOf(const Slice& key) {
  return 0;
}
|
||||
|
||||
// Allocates the fixed-size key buffer and positions the iterator at the
// first entry of the table.
SimpleTableIterator::SimpleTableIterator(SimpleTableReader* table) :
    table_(table) {
  key_str_ = new char[SimpleTableReader::Rep::GetInternalKeyLength()];
  // The value buffer is grown lazily by Next(); initialize the pointer so
  // it is never read uninitialized, and use -1 to mark "unallocated" for
  // the destructor.
  value_str_ = nullptr;
  value_str_len_ = -1;
  SeekToFirst();
}
|
||||
|
||||
// Frees the owned key buffer and, if it was ever allocated (length >= 0),
// the lazily-grown value buffer.
SimpleTableIterator::~SimpleTableIterator() {
  delete[] key_str_;
  if (value_str_len_ >= 0) {
    delete[] value_str_;
  }
}
|
||||
|
||||
// The iterator is valid while the current offset is inside the data area,
// i.e. before the start of the index block.
bool SimpleTableIterator::Valid() const {
  return offset_ < table_->rep_->index_start_offset;
}
|
||||
|
||||
// Data records start at file offset 0; Next() loads the first entry.
void SimpleTableIterator::SeekToFirst() {
  next_offset_ = 0;
  Next();
}
|
||||
|
||||
// Backward positioning is not supported by SimpleTable.
void SimpleTableIterator::SeekToLast() {
  assert(false);
}
|
||||
|
||||
void SimpleTableIterator::Seek(const Slice& target) {
|
||||
Status s = table_->GetOffset(target, &next_offset_);
|
||||
if (!s.ok()) {
|
||||
status_ = s;
|
||||
}
|
||||
Next();
|
||||
}
|
||||
|
||||
void SimpleTableIterator::Next() {
|
||||
offset_ = next_offset_;
|
||||
if (offset_ >= table_->rep_->index_start_offset) {
|
||||
return;
|
||||
}
|
||||
Slice result;
|
||||
int internal_key_size = SimpleTableReader::Rep::GetInternalKeyLength();
|
||||
|
||||
Status s = table_->rep_->file->Read(next_offset_, internal_key_size, &result,
|
||||
key_str_);
|
||||
next_offset_ += internal_key_size;
|
||||
key_ = result;
|
||||
|
||||
Slice value_size_slice;
|
||||
s = table_->rep_->file->Read(next_offset_, 4, &value_size_slice, tmp_str_);
|
||||
next_offset_ += 4;
|
||||
uint32_t value_size = DecodeFixed32(tmp_str_);
|
||||
|
||||
Slice value_slice;
|
||||
if ((int) value_size > value_str_len_) {
|
||||
if (value_str_len_ >= 0) {
|
||||
delete[] value_str_;
|
||||
}
|
||||
value_str_ = new char[value_size];
|
||||
value_str_len_ = value_size;
|
||||
}
|
||||
s = table_->rep_->file->Read(next_offset_, value_size, &value_slice,
|
||||
value_str_);
|
||||
next_offset_ += value_size;
|
||||
value_ = value_slice;
|
||||
}
|
||||
|
||||
// Backward iteration is not supported by SimpleTable.
void SimpleTableIterator::Prev() {
  assert(false);
}
|
||||
|
||||
// Returns the current internal key.  Valid only while Valid() is true.
Slice SimpleTableIterator::key() const {
  // Removed leftover debug statement: Log(..., "key!!!!") wrote to the
  // info log on every single key access.
  return key_;
}
|
||||
|
||||
// Returns the current value.  Valid only while Valid() is true; the Slice
// points into this iterator's internal buffer and is invalidated by Next().
Slice SimpleTableIterator::value() const {
  return value_;
}
|
||||
|
||||
// Returns the first error recorded during iteration, OK otherwise.
Status SimpleTableIterator::status() const {
  return status_;
}
|
||||
|
||||
// Builder that writes the SimpleTable format: key/value records followed
// by an index block and a fixed64 footer pointing at the index.
class SimpleTableBuilder: public TableBuilder {
 public:
  // Create a builder that will store the contents of the table it is
  // building in *file.  Does not close the file.  It is up to the
  // caller to close the file after calling Finish(). The output file
  // will be part of level specified by 'level'.  A value of -1 means
  // that the caller does not know which level the output file will reside.
  SimpleTableBuilder(const Options& options, WritableFile* file,
                     CompressionType compression_type);

  // REQUIRES: Either Finish() or Abandon() has been called.
  ~SimpleTableBuilder();

  // Add key,value to the table being constructed.
  // REQUIRES: key is after any previously added key according to comparator.
  // REQUIRES: Finish(), Abandon() have not been called
  void Add(const Slice& key, const Slice& value) override;

  // Return non-ok iff some error has been detected.
  Status status() const override;

  // Finish building the table.  Stops using the file passed to the
  // constructor after this function returns.
  // REQUIRES: Finish(), Abandon() have not been called
  Status Finish() override;

  // Indicate that the contents of this builder should be abandoned.  Stops
  // using the file passed to the constructor after this function returns.
  // If the caller is not going to call Finish(), it must call Abandon()
  // before destroying this builder.
  // REQUIRES: Finish(), Abandon() have not been called
  void Abandon() override;

  // Number of calls to Add() so far.
  uint64_t NumEntries() const override;

  // Size of the file generated so far.  If invoked after a successful
  // Finish() call, returns the size of the final generated file.
  uint64_t FileSize() const override;

 private:
  struct Rep;
  Rep* rep_;  // owned

  // No copying allowed
  SimpleTableBuilder(const SimpleTableBuilder&) = delete;
  void operator=(const SimpleTableBuilder&) = delete;
};
|
||||
|
||||
// Internal state of a SimpleTableBuilder.
struct SimpleTableBuilder::Rep {
  Options options;
  WritableFile* file;   // not owned; caller closes it after Finish()
  uint64_t offset = 0;  // bytes written so far == current file size
  Status status;        // first error encountered while writing

  uint64_t num_entries = 0;

  bool closed = false;  // Either Finish() or Abandon() has been called.

  // Fixed on-disk layout, mirroring SimpleTableReader::Rep: 16-byte user
  // key + 8-byte internal-key footer, 8-byte offsets in the index.
  const static int user_key_size = 16;
  const static int offset_length = 8;
  const static int key_footer_len = 8;

  static int GetInternalKeyLength() {
    return user_key_size + key_footer_len;
  }

  // In-memory index (key + fixed64 offset per entry), flushed by Finish().
  std::string index;

  Rep(const Options& opt, WritableFile* f) :
      options(opt), file(f) {
  }
  ~Rep() {
  }
};
|
||||
|
||||
// Builds a SimpleTable into 'file'.  NOTE(review): compression_type is
// accepted for interface compatibility but never used -- SimpleTable
// output is written uncompressed.
SimpleTableBuilder::SimpleTableBuilder(const Options& options,
                                       WritableFile* file,
                                       CompressionType compression_type) :
    rep_(new SimpleTableBuilder::Rep(options, file)) {
}
|
||||
|
||||
// Releases the builder's internal representation; the output file itself
// is owned and closed by the caller.
SimpleTableBuilder::~SimpleTableBuilder() {
  delete rep_;
}
|
||||
|
||||
void SimpleTableBuilder::Add(const Slice& key, const Slice& value) {
|
||||
assert((int ) key.size() == Rep::GetInternalKeyLength());
|
||||
|
||||
// Update index
|
||||
rep_->index.append(key.data(), key.size());
|
||||
PutFixed64(&(rep_->index), rep_->offset);
|
||||
|
||||
// Write key-value pair
|
||||
rep_->file->Append(key);
|
||||
rep_->offset += Rep::GetInternalKeyLength();
|
||||
|
||||
std::string size;
|
||||
int value_size = value.size();
|
||||
PutFixed32(&size, value_size);
|
||||
Slice sizeSlice(size);
|
||||
rep_->file->Append(sizeSlice);
|
||||
rep_->file->Append(value);
|
||||
rep_->offset += value_size + 4;
|
||||
|
||||
rep_->num_entries++;
|
||||
}
|
||||
|
||||
// Report the first error recorded while writing.  Previously this always
// returned OK, hiding any Append() failure from callers.
Status SimpleTableBuilder::status() const {
  return rep_->status;
}
|
||||
|
||||
// Finalizes the table: appends the in-memory index block, then the fixed64
// footer recording where the index starts.  Returns the first write error,
// including any error previously recorded by Add().
Status SimpleTableBuilder::Finish() {
  Rep* r = rep_;
  assert(!r->closed);
  r->closed = true;

  uint64_t index_offset = r->offset;
  Slice index_slice(r->index);
  Status s = r->file->Append(index_slice);
  if (!s.ok()) {
    // Previously Append() failures here were silently ignored.
    return s;
  }
  r->offset += index_slice.size();

  std::string index_offset_str;
  PutFixed64(&index_offset_str, index_offset);
  Slice foot_slice(index_offset_str);
  s = r->file->Append(foot_slice);
  if (!s.ok()) {
    return s;
  }
  r->offset += foot_slice.size();

  // Surface any error recorded earlier by Add().
  return r->status;
}
|
||||
|
||||
// Marks the builder closed without writing the index or footer; the
// partially-written file is left for the caller to discard.
void SimpleTableBuilder::Abandon() {
  rep_->closed = true;
}
|
||||
|
||||
// Number of Add() calls so far.
uint64_t SimpleTableBuilder::NumEntries() const {
  return rep_->num_entries;
}
|
||||
|
||||
// Bytes written so far (final file size after a successful Finish()).
uint64_t SimpleTableBuilder::FileSize() const {
  return rep_->offset;
}
|
||||
|
||||
class SimpleTableFactory: public TableFactory {
|
||||
public:
|
||||
~SimpleTableFactory() {
|
||||
}
|
||||
SimpleTableFactory() {
|
||||
}
|
||||
const char* Name() const override {
|
||||
return "SimpleTable";
|
||||
}
|
||||
Status NewTableReader(const Options& options, const EnvOptions& soptions,
|
||||
const InternalKeyComparator& internal_key,
|
||||
unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
|
||||
unique_ptr<TableReader>* table_reader) const;
|
||||
|
||||
TableBuilder* NewTableBuilder(const Options& options,
|
||||
const InternalKeyComparator& internal_key,
|
||||
WritableFile* file,
|
||||
CompressionType compression_type) const;
|
||||
};
|
||||
|
||||
// Delegates straight to SimpleTableReader::Open; the comparator argument
// is unused because SimpleTable reads the comparator from 'options'.
Status SimpleTableFactory::NewTableReader(
    const Options& options, const EnvOptions& soptions,
    const InternalKeyComparator& internal_key,
    unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
    unique_ptr<TableReader>* table_reader) const {

  return SimpleTableReader::Open(options, soptions, std::move(file), file_size,
                                 table_reader);
}
|
||||
|
||||
// Heap-allocates a SimpleTableBuilder; ownership passes to the caller.
TableBuilder* SimpleTableFactory::NewTableBuilder(
    const Options& options, const InternalKeyComparator& internal_key,
    WritableFile* file, CompressionType compression_type) const {
  return new SimpleTableBuilder(options, file, compression_type);
}
|
||||
|
||||
class SimpleTableDBTest {
|
||||
protected:
|
||||
public:
|
||||
std::string dbname_;
|
||||
Env* env_;
|
||||
DB* db_;
|
||||
|
||||
Options last_options_;
|
||||
|
||||
SimpleTableDBTest() :
|
||||
env_(Env::Default()) {
|
||||
dbname_ = test::TmpDir() + "/simple_table_db_test";
|
||||
ASSERT_OK(DestroyDB(dbname_, Options()));
|
||||
db_ = nullptr;
|
||||
Reopen();
|
||||
}
|
||||
|
||||
~SimpleTableDBTest() {
|
||||
delete db_;
|
||||
ASSERT_OK(DestroyDB(dbname_, Options()));
|
||||
}
|
||||
|
||||
// Return the current option configuration.
|
||||
Options CurrentOptions() {
|
||||
Options options;
|
||||
options.table_factory.reset(new SimpleTableFactory());
|
||||
return options;
|
||||
}
|
||||
|
||||
DBImpl* dbfull() {
|
||||
return reinterpret_cast<DBImpl*>(db_);
|
||||
}
|
||||
|
||||
void Reopen(Options* options = nullptr) {
|
||||
ASSERT_OK(TryReopen(options));
|
||||
}
|
||||
|
||||
void Close() {
|
||||
delete db_;
|
||||
db_ = nullptr;
|
||||
}
|
||||
|
||||
void DestroyAndReopen(Options* options = nullptr) {
|
||||
//Destroy using last options
|
||||
Destroy(&last_options_);
|
||||
ASSERT_OK(TryReopen(options));
|
||||
}
|
||||
|
||||
void Destroy(Options* options) {
|
||||
delete db_;
|
||||
db_ = nullptr;
|
||||
ASSERT_OK(DestroyDB(dbname_, *options));
|
||||
}
|
||||
|
||||
Status PureReopen(Options* options, DB** db) {
|
||||
return DB::Open(*options, dbname_, db);
|
||||
}
|
||||
|
||||
Status TryReopen(Options* options = nullptr) {
|
||||
delete db_;
|
||||
db_ = nullptr;
|
||||
Options opts;
|
||||
if (options != nullptr) {
|
||||
opts = *options;
|
||||
} else {
|
||||
opts = CurrentOptions();
|
||||
opts.create_if_missing = true;
|
||||
}
|
||||
last_options_ = opts;
|
||||
|
||||
return DB::Open(opts, dbname_, &db_);
|
||||
}
|
||||
|
||||
Status Put(const Slice& k, const Slice& v) {
|
||||
return db_->Put(WriteOptions(), k, v);
|
||||
}
|
||||
|
||||
Status Delete(const std::string& k) {
|
||||
return db_->Delete(WriteOptions(), k);
|
||||
}
|
||||
|
||||
std::string Get(const std::string& k, const Snapshot* snapshot = nullptr) {
|
||||
ReadOptions options;
|
||||
options.snapshot = snapshot;
|
||||
std::string result;
|
||||
Status s = db_->Get(options, k, &result);
|
||||
if (s.IsNotFound()) {
|
||||
result = "NOT_FOUND";
|
||||
} else if (!s.ok()) {
|
||||
result = s.ToString();
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
int NumTableFilesAtLevel(int level) {
|
||||
std::string property;
|
||||
ASSERT_TRUE(
|
||||
db_->GetProperty("rocksdb.num-files-at-level" + NumberToString(level),
|
||||
&property));
|
||||
return atoi(property.c_str());
|
||||
}
|
||||
|
||||
// Return spread of files per level
|
||||
std::string FilesPerLevel() {
|
||||
std::string result;
|
||||
int last_non_zero_offset = 0;
|
||||
for (int level = 0; level < db_->NumberLevels(); level++) {
|
||||
int f = NumTableFilesAtLevel(level);
|
||||
char buf[100];
|
||||
snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f);
|
||||
result += buf;
|
||||
if (f > 0) {
|
||||
last_non_zero_offset = result.size();
|
||||
}
|
||||
}
|
||||
result.resize(last_non_zero_offset);
|
||||
return result;
|
||||
}
|
||||
|
||||
std::string IterStatus(Iterator* iter) {
|
||||
std::string result;
|
||||
if (iter->Valid()) {
|
||||
result = iter->key().ToString() + "->" + iter->value().ToString();
|
||||
} else {
|
||||
result = "(invalid)";
|
||||
}
|
||||
return result;
|
||||
}
|
||||
};
|
||||
|
||||
// A freshly created DB opens successfully and misses on any lookup.
TEST(SimpleTableDBTest, Empty) {
  ASSERT_TRUE(db_ != nullptr);
  ASSERT_EQ("NOT_FOUND", Get("0000000000000foo"));
}
|
||||
|
||||
// Basic put/get, including overwriting an existing key.  Keys are padded
// to 16 bytes to match SimpleTable's fixed user-key size.
TEST(SimpleTableDBTest, ReadWrite) {
  ASSERT_OK(Put("0000000000000foo", "v1"));
  ASSERT_EQ("v1", Get("0000000000000foo"));
  ASSERT_OK(Put("0000000000000bar", "v2"));
  ASSERT_OK(Put("0000000000000foo", "v3"));
  ASSERT_EQ("v3", Get("0000000000000foo"));
  ASSERT_EQ("v2", Get("0000000000000bar"));
}
|
||||
|
||||
// Values written before a memtable flush are still readable from the
// resulting SimpleTable file.
TEST(SimpleTableDBTest, Flush) {
  ASSERT_OK(Put("0000000000000foo", "v1"));
  ASSERT_OK(Put("0000000000000bar", "v2"));
  ASSERT_OK(Put("0000000000000foo", "v3"));
  dbfull()->TEST_FlushMemTable();
  ASSERT_EQ("v3", Get("0000000000000foo"));
  ASSERT_EQ("v2", Get("0000000000000bar"));
}
|
||||
|
||||
// Exercises reads across multiple flushed SimpleTable files: overwrites,
// new keys, and deletes each land in a separate file.
TEST(SimpleTableDBTest, Flush2) {
  ASSERT_OK(Put("0000000000000bar", "b"));
  ASSERT_OK(Put("0000000000000foo", "v1"));
  dbfull()->TEST_FlushMemTable();

  ASSERT_OK(Put("0000000000000foo", "v2"));
  dbfull()->TEST_FlushMemTable();
  ASSERT_EQ("v2", Get("0000000000000foo"));

  ASSERT_OK(Put("0000000000000eee", "v3"));
  dbfull()->TEST_FlushMemTable();
  ASSERT_EQ("v3", Get("0000000000000eee"));

  ASSERT_OK(Delete("0000000000000bar"));
  dbfull()->TEST_FlushMemTable();
  ASSERT_EQ("NOT_FOUND", Get("0000000000000bar"));

  ASSERT_OK(Put("0000000000000eee", "v5"));
  dbfull()->TEST_FlushMemTable();
  ASSERT_EQ("v5", Get("0000000000000eee"));
}
|
||||
|
||||
// Builds the fixed-width 16-byte test key for index i,
// e.g. Key(7) == "key_______000007".
static std::string Key(int i) {
  char formatted[100];
  int written = snprintf(formatted, sizeof(formatted), "key_______%06d", i);
  return std::string(formatted, written);
}
|
||||
|
||||
// Test helper: produce a pseudo-random string of the requested length
// using the shared test RNG.
static std::string RandomString(Random* rnd, int len) {
  std::string r;
  test::RandomString(rnd, len, &r);
  return r;
}
|
||||
|
||||
// Fills level-0 with flushed files up to the compaction trigger and
// verifies that the final flush causes a level-0 -> level-1 compaction.
TEST(SimpleTableDBTest, CompactionTrigger) {
  Options options = CurrentOptions();
  options.write_buffer_size = 100 << 10;  // 100KB
  options.num_levels = 3;
  options.max_mem_compaction_level = 0;
  options.level0_file_num_compaction_trigger = 3;
  Reopen(&options);

  Random rnd(301);

  // Create trigger-1 files; each loop writes more than the write buffer
  // holds, so each produces exactly one flushed level-0 file.
  for (int num = 0; num < options.level0_file_num_compaction_trigger - 1;
      num++) {
    std::vector<std::string> values;
    // Write 120KB (12 values, each 10K)
    for (int i = 0; i < 12; i++) {
      values.push_back(RandomString(&rnd, 10000));
      ASSERT_OK(Put(Key(i), values[i]));
    }
    dbfull()->TEST_WaitForFlushMemTable();
    ASSERT_EQ(NumTableFilesAtLevel(0), num + 1);
  }

  // generate one more file in level-0, and should trigger level-0 compaction
  std::vector<std::string> values;
  for (int i = 0; i < 12; i++) {
    values.push_back(RandomString(&rnd, 10000));
    ASSERT_OK(Put(Key(i), values[i]));
  }
  dbfull()->TEST_WaitForCompact();

  ASSERT_EQ(NumTableFilesAtLevel(0), 0);
  ASSERT_EQ(NumTableFilesAtLevel(1), 1);
}
|
||||
|
||||
} // namespace rocksdb
|
||||
|
||||
// Test entry point: runs every TEST registered in this file.
int main(int argc, char** argv) {
  return rocksdb::test::RunAllTests();
}
|
||||
429
db/skiplist.h
Normal file
429
db/skiplist.h
Normal file
@@ -0,0 +1,429 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
//
|
||||
// Thread safety
|
||||
// -------------
|
||||
//
|
||||
// Writes require external synchronization, most likely a mutex.
|
||||
// Reads require a guarantee that the SkipList will not be destroyed
|
||||
// while the read is in progress. Apart from that, reads progress
|
||||
// without any internal locking or synchronization.
|
||||
//
|
||||
// Invariants:
|
||||
//
|
||||
// (1) Allocated nodes are never deleted until the SkipList is
|
||||
// destroyed. This is trivially guaranteed by the code since we
|
||||
// never delete any skip list nodes.
|
||||
//
|
||||
// (2) The contents of a Node except for the next/prev pointers are
|
||||
// immutable after the Node has been linked into the SkipList.
|
||||
// Only Insert() modifies the list, and it is careful to initialize
|
||||
// a node and use release-stores to publish the nodes in one or
|
||||
// more lists.
|
||||
//
|
||||
// ... prev vs. next pointer ordering ...
|
||||
//
|
||||
|
||||
#pragma once
|
||||
#include <assert.h>
|
||||
#include <stdlib.h>
|
||||
#include "util/arena.h"
|
||||
#include "port/port.h"
|
||||
#include "util/arena.h"
|
||||
#include "util/random.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
// Lock-free-for-readers skip list keyed by Key with a user-supplied
// Comparator; see the thread-safety notes at the top of this file.
template<typename Key, class Comparator>
class SkipList {
 private:
  struct Node;

 public:
  // Create a new SkipList object that will use "cmp" for comparing keys,
  // and will allocate memory using "*arena".  Objects allocated in the arena
  // must remain allocated for the lifetime of the skiplist object.
  explicit SkipList(Comparator cmp, Arena* arena,
                    int32_t max_height = 12, int32_t branching_factor = 4);

  // Insert key into the list.
  // REQUIRES: nothing that compares equal to key is currently in the list.
  void Insert(const Key& key);

  // Returns true iff an entry that compares equal to key is in the list.
  bool Contains(const Key& key) const;

  // Iteration over the contents of a skip list
  class Iterator {
   public:
    // Initialize an iterator over the specified list.
    // The returned iterator is not valid.
    explicit Iterator(const SkipList* list);

    // Change the underlying skiplist used for this iterator
    // This enables us not changing the iterator without deallocating
    // an old one and then allocating a new one
    void SetList(const SkipList* list);

    // Returns true iff the iterator is positioned at a valid node.
    bool Valid() const;

    // Returns the key at the current position.
    // REQUIRES: Valid()
    const Key& key() const;

    // Advances to the next position.
    // REQUIRES: Valid()
    void Next();

    // Advances to the previous position.
    // REQUIRES: Valid()
    void Prev();

    // Advance to the first entry with a key >= target
    void Seek(const Key& target);

    // Position at the first entry in list.
    // Final state of iterator is Valid() iff list is not empty.
    void SeekToFirst();

    // Position at the last entry in list.
    // Final state of iterator is Valid() iff list is not empty.
    void SeekToLast();

   private:
    const SkipList* list_;
    Node* node_;
    // Intentionally copyable
  };

 private:
  // Tower-height limit and inverse probability of growing a level,
  // fixed at construction.
  const int32_t kMaxHeight_;
  const int32_t kBranching_;

  // Immutable after construction
  Comparator const compare_;
  Arena* const arena_;  // Arena used for allocations of nodes

  Node* const head_;

  // Modified only by Insert().  Read racily by readers, but stale
  // values are ok.
  port::AtomicPointer max_height_;  // Height of the entire list

  // Used for optimizing sequential insert patterns
  Node** prev_;
  int32_t prev_height_;

  inline int GetMaxHeight() const {
    return static_cast<int>(
        reinterpret_cast<intptr_t>(max_height_.NoBarrier_Load()));
  }

  // Read/written only by Insert().
  Random rnd_;

  Node* NewNode(const Key& key, int height);
  int RandomHeight();
  bool Equal(const Key& a, const Key& b) const { return (compare_(a, b) == 0); }

  // Return true if key is greater than the data stored in "n"
  bool KeyIsAfterNode(const Key& key, Node* n) const;

  // Return the earliest node that comes at or after key.
  // Return nullptr if there is no such node.
  //
  // If prev is non-nullptr, fills prev[level] with pointer to previous
  // node at "level" for every level in [0..max_height_-1].
  Node* FindGreaterOrEqual(const Key& key, Node** prev) const;

  // Return the latest node with a key < key.
  // Return head_ if there is no such node.
  Node* FindLessThan(const Key& key) const;

  // Return the last node in the list.
  // Return head_ if list is empty.
  Node* FindLast() const;

  // No copying allowed
  SkipList(const SkipList&);
  void operator=(const SkipList&);
};
|
||||
|
||||
// Implementation details follow
|
||||
// Implementation details follow
// A node's forward links; next_[1] is a flexible-array-style tail that
// NewNode() over-allocates to the node's actual height.
template<typename Key, class Comparator>
struct SkipList<Key, Comparator>::Node {
  explicit Node(const Key& k) : key(k) { }

  Key const key;

  // Accessors/mutators for links.  Wrapped in methods so we can
  // add the appropriate barriers as necessary.
  Node* Next(int n) {
    assert(n >= 0);
    // Use an 'acquire load' so that we observe a fully initialized
    // version of the returned Node.
    return reinterpret_cast<Node*>(next_[n].Acquire_Load());
  }
  void SetNext(int n, Node* x) {
    assert(n >= 0);
    // Use a 'release store' so that anybody who reads through this
    // pointer observes a fully initialized version of the inserted node.
    next_[n].Release_Store(x);
  }

  // No-barrier variants that can be safely used in a few locations.
  Node* NoBarrier_Next(int n) {
    assert(n >= 0);
    return reinterpret_cast<Node*>(next_[n].NoBarrier_Load());
  }
  void NoBarrier_SetNext(int n, Node* x) {
    assert(n >= 0);
    next_[n].NoBarrier_Store(x);
  }

 private:
  // Array of length equal to the node height.  next_[0] is lowest level link.
  port::AtomicPointer next_[1];
};
|
||||
|
||||
// Arena-allocates a node of the given height: one AtomicPointer already
// lives inside Node, so only height-1 extra link slots are appended.
template<typename Key, class Comparator>
typename SkipList<Key, Comparator>::Node*
SkipList<Key, Comparator>::NewNode(const Key& key, int height) {
  char* mem = arena_->AllocateAligned(
      sizeof(Node) + sizeof(port::AtomicPointer) * (height - 1));
  return new (mem) Node(key);
}
|
||||
|
||||
// Starts invalid; the caller must Seek/SeekToFirst/SeekToLast before use.
template<typename Key, class Comparator>
inline SkipList<Key, Comparator>::Iterator::Iterator(const SkipList* list) {
  SetList(list);
}
|
||||
|
||||
// Retargets the iterator at a (possibly different) list and invalidates
// the current position.
template<typename Key, class Comparator>
inline void SkipList<Key, Comparator>::Iterator::SetList(const SkipList* list) {
  list_ = list;
  node_ = nullptr;
}
|
||||
|
||||
// Valid whenever the iterator points at a real node (never head_).
template<typename Key, class Comparator>
inline bool SkipList<Key, Comparator>::Iterator::Valid() const {
  return node_ != nullptr;
}
|
||||
|
||||
// Key at the current position.  REQUIRES: Valid().
template<typename Key, class Comparator>
inline const Key& SkipList<Key, Comparator>::Iterator::key() const {
  assert(Valid());
  return node_->key;
}
|
||||
|
||||
// Follow the level-0 link to the next node.  REQUIRES: Valid().
template<typename Key, class Comparator>
inline void SkipList<Key, Comparator>::Iterator::Next() {
  assert(Valid());
  node_ = node_->Next(0);
}
|
||||
|
||||
// O(log n) backward step: nodes carry no back links, so re-search.
template<typename Key, class Comparator>
inline void SkipList<Key, Comparator>::Iterator::Prev() {
  // Instead of using explicit "prev" links, we just search for the
  // last node that falls before key.
  assert(Valid());
  node_ = list_->FindLessThan(node_->key);
  if (node_ == list_->head_) {
    // Stepped before the first entry: iterator becomes invalid.
    node_ = nullptr;
  }
}
|
||||
|
||||
// Position at the first node with key >= target (invalid if none).
template<typename Key, class Comparator>
inline void SkipList<Key, Comparator>::Iterator::Seek(const Key& target) {
  node_ = list_->FindGreaterOrEqual(target, nullptr);
}
|
||||
|
||||
// First real node is head_'s level-0 successor (nullptr if list empty).
template<typename Key, class Comparator>
inline void SkipList<Key, Comparator>::Iterator::SeekToFirst() {
  node_ = list_->head_->Next(0);
}
|
||||
|
||||
// Position at the last node; invalid if the list is empty (FindLast()
// returns head_ in that case).
template<typename Key, class Comparator>
inline void SkipList<Key, Comparator>::Iterator::SeekToLast() {
  node_ = list_->FindLast();
  if (node_ == list_->head_) {
    node_ = nullptr;
  }
}
|
||||
|
||||
// Draws a tower height with geometric distribution: each extra level has
// probability 1/kBranching_, capped at kMaxHeight_.
template<typename Key, class Comparator>
int SkipList<Key, Comparator>::RandomHeight() {
  // Increase height with probability 1 in kBranching
  int height = 1;
  while (height < kMaxHeight_ && ((rnd_.Next() % kBranching_) == 0)) {
    height++;
  }
  assert(height > 0);
  assert(height <= kMaxHeight_);
  return height;
}
|
||||
|
||||
// True iff key sorts strictly after node n's key.
template<typename Key, class Comparator>
bool SkipList<Key, Comparator>::KeyIsAfterNode(const Key& key, Node* n) const {
  // nullptr n is considered infinite
  return (n != nullptr) && (compare_(n->key, key) < 0);
}
|
||||
|
||||
// Returns the earliest node with key >= key (nullptr if none).  When prev
// is non-null it is filled with the per-level predecessors for Insert().
template<typename Key, class Comparator>
typename SkipList<Key, Comparator>::Node* SkipList<Key, Comparator>::
  FindGreaterOrEqual(const Key& key, Node** prev) const {
  // Use prev as an optimization hint and fallback to slow path
  // (prev holds the splice from the previous Insert(); sequential inserts
  // land right after it, skipping the full top-down search).
  if (prev && !KeyIsAfterNode(key, prev[0]->Next(0))) {
    Node* x = prev[0];
    Node* next = x->Next(0);
    if ((x == head_) || KeyIsAfterNode(key, x)) {
      // Adjust all relevant insertion points to the previous entry
      for (int i = 1; i < prev_height_; i++) {
        prev[i] = x;
      }
      return next;
    }
  }
  // Normal lookup: descend from the top level, moving right while the
  // target is still after `next`.
  Node* x = head_;
  int level = GetMaxHeight() - 1;
  while (true) {
    Node* next = x->Next(level);
    // Make sure the lists are sorted.
    // If x points to head_ or next points nullptr, it is trivially satisfied.
    assert((x == head_) || (next == nullptr) || KeyIsAfterNode(next->key, x));
    if (KeyIsAfterNode(key, next)) {
      // Keep searching in this list
      x = next;
    } else {
      if (prev != nullptr) prev[level] = x;
      if (level == 0) {
        return next;
      } else {
        // Switch to next list
        level--;
      }
    }
  }
}
|
||||
|
||||
// Returns the latest node with key < key, or head_ if there is none.
template<typename Key, class Comparator>
typename SkipList<Key, Comparator>::Node*
SkipList<Key, Comparator>::FindLessThan(const Key& key) const {
  Node* x = head_;
  int level = GetMaxHeight() - 1;
  while (true) {
    assert(x == head_ || compare_(x->key, key) < 0);
    Node* next = x->Next(level);
    if (next == nullptr || compare_(next->key, key) >= 0) {
      if (level == 0) {
        return x;
      } else {
        // Switch to next list
        level--;
      }
    } else {
      x = next;
    }
  }
}
|
||||
|
||||
// Returns the last node in the list, or head_ if the list is empty, by
// riding each level rightwards until it ends.
template<typename Key, class Comparator>
typename SkipList<Key, Comparator>::Node* SkipList<Key, Comparator>::FindLast()
    const {
  Node* x = head_;
  int level = GetMaxHeight() - 1;
  while (true) {
    Node* next = x->Next(level);
    if (next == nullptr) {
      if (level == 0) {
        return x;
      } else {
        // Switch to next list
        level--;
      }
    } else {
      x = next;
    }
  }
}
|
||||
|
||||
// Constructs an empty skip list.  "cmp" defines the key ordering.  All nodes
// (including the sentinel head_) are allocated from "arena"; the arena must
// outlive this list and nothing here is individually freed.  max_height and
// branching_factor tune the tower geometry (height distribution).
template<typename Key, class Comparator>
SkipList<Key, Comparator>::SkipList(const Comparator cmp, Arena* arena,
                                    int32_t max_height,
                                    int32_t branching_factor)
    : kMaxHeight_(max_height),
      kBranching_(branching_factor),
      compare_(cmp),
      arena_(arena),
      // Sentinel node: its key is never compared, so 0 is fine.
      head_(NewNode(0 /* any key will do */, max_height)),
      // max_height_ is an AtomicPointer storing the current height as an
      // integer-in-pointer; the list starts with a single level.
      max_height_(reinterpret_cast<void*>(1)),
      prev_height_(1),
      rnd_(0xdeadbeef) {
  assert(kMaxHeight_ > 0);
  assert(kBranching_ > 0);
  // Allocate the prev_ Node* array, directly from the passed-in arena.
  // prev_ does not need to be freed, as its life cycle is tied up with
  // the arena as a whole.
  prev_ = (Node**) arena_->AllocateAligned(sizeof(Node*) * kMaxHeight_);
  for (int i = 0; i < kMaxHeight_; i++) {
    head_->SetNext(i, nullptr);
    prev_[i] = head_;  // empty list: every level's insertion hint is head_
  }
}
|
||||
|
||||
// Inserts "key" into the list.
// REQUIRES: external synchronization among writers (the asserts and the
// prev_/prev_height_ hint cache assume a single concurrent writer), and that
// no equal key is already present.
template<typename Key, class Comparator>
void SkipList<Key, Comparator>::Insert(const Key& key) {
  // TODO(opt): We can use a barrier-free variant of FindGreaterOrEqual()
  // here since Insert() is externally synchronized.
  Node* x = FindGreaterOrEqual(key, prev_);

  // Our data structure does not allow duplicate insertion
  assert(x == nullptr || !Equal(key, x->key));

  int height = RandomHeight();
  if (height > GetMaxHeight()) {
    // New levels have no predecessor other than the sentinel head.
    for (int i = GetMaxHeight(); i < height; i++) {
      prev_[i] = head_;
    }
    //fprintf(stderr, "Change height from %d to %d\n", max_height_, height);

    // It is ok to mutate max_height_ without any synchronization
    // with concurrent readers. A concurrent reader that observes
    // the new value of max_height_ will see either the old value of
    // new level pointers from head_ (nullptr), or a new value set in
    // the loop below. In the former case the reader will
    // immediately drop to the next level since nullptr sorts after all
    // keys. In the latter case the reader will use the new node.
    max_height_.NoBarrier_Store(reinterpret_cast<void*>(height));
  }

  x = NewNode(key, height);
  for (int i = 0; i < height; i++) {
    // NoBarrier_SetNext() suffices since we will add a barrier when
    // we publish a pointer to "x" in prev[i].
    x->NoBarrier_SetNext(i, prev_[i]->NoBarrier_Next(i));
    prev_[i]->SetNext(i, x);
  }
  // Cache the just-inserted node as the next insertion hint (used by the
  // fast path in FindGreaterOrEqual).
  prev_[0] = x;
  prev_height_ = height;
}
|
||||
|
||||
template<typename Key, class Comparator>
|
||||
bool SkipList<Key, Comparator>::Contains(const Key& key) const {
|
||||
Node* x = FindGreaterOrEqual(key, nullptr);
|
||||
if (x != nullptr && Equal(key, x->key)) {
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace rocksdb
|
||||
383
db/skiplist_test.cc
Normal file
383
db/skiplist_test.cc
Normal file
@@ -0,0 +1,383 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include "db/skiplist.h"
|
||||
#include <set>
|
||||
#include "rocksdb/env.h"
|
||||
#include "util/arena.h"
|
||||
#include "util/hash.h"
|
||||
#include "util/random.h"
|
||||
#include "util/testharness.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
typedef uint64_t Key;

// Three-way comparator over integer keys: negative, zero, or positive,
// memcmp-style.
struct TestComparator {
  int operator()(const Key& a, const Key& b) const {
    if (a == b) {
      return 0;
    }
    return (a < b) ? -1 : +1;
  }
};
|
||||
|
||||
// Empty fixture type; the TEST() macros below key off this class name.
class SkipTest { };
|
||||
|
||||
// An empty list contains nothing and its iterator is invalid from every
// starting position (first, last, arbitrary seek).
TEST(SkipTest, Empty) {
  Arena arena;
  TestComparator cmp;
  SkipList<Key, TestComparator> list(cmp, &arena);
  ASSERT_TRUE(!list.Contains(10));

  SkipList<Key, TestComparator>::Iterator iter(&list);
  ASSERT_TRUE(!iter.Valid());
  iter.SeekToFirst();
  ASSERT_TRUE(!iter.Valid());
  iter.Seek(100);
  ASSERT_TRUE(!iter.Valid());
  iter.SeekToLast();
  ASSERT_TRUE(!iter.Valid());
}
|
||||
|
||||
// Inserts N random keys (deduplicated via a std::set model), then checks
// Contains(), and forward/backward iteration against the model set.
TEST(SkipTest, InsertAndLookup) {
  const int N = 2000;
  const int R = 5000;  // key space; larger than N so some probes miss
  Random rnd(1000);
  std::set<Key> keys;  // model: the keys that should be in the list
  Arena arena;
  TestComparator cmp;
  SkipList<Key, TestComparator> list(cmp, &arena);
  for (int i = 0; i < N; i++) {
    Key key = rnd.Next() % R;
    if (keys.insert(key).second) {
      list.Insert(key);
    }
  }

  // Contains() must agree with the model for every key in the space.
  for (int i = 0; i < R; i++) {
    if (list.Contains(i)) {
      ASSERT_EQ(keys.count(i), 1U);
    } else {
      ASSERT_EQ(keys.count(i), 0U);
    }
  }

  // Simple iterator tests
  {
    SkipList<Key, TestComparator>::Iterator iter(&list);
    ASSERT_TRUE(!iter.Valid());

    iter.Seek(0);
    ASSERT_TRUE(iter.Valid());
    ASSERT_EQ(*(keys.begin()), iter.key());

    iter.SeekToFirst();
    ASSERT_TRUE(iter.Valid());
    ASSERT_EQ(*(keys.begin()), iter.key());

    iter.SeekToLast();
    ASSERT_TRUE(iter.Valid());
    ASSERT_EQ(*(keys.rbegin()), iter.key());
  }

  // Forward iteration test: from each possible seek target, the next few
  // entries must match the model's lower_bound sequence.
  for (int i = 0; i < R; i++) {
    SkipList<Key, TestComparator>::Iterator iter(&list);
    iter.Seek(i);

    // Compare against model iterator
    std::set<Key>::iterator model_iter = keys.lower_bound(i);
    for (int j = 0; j < 3; j++) {
      if (model_iter == keys.end()) {
        ASSERT_TRUE(!iter.Valid());
        break;
      } else {
        ASSERT_TRUE(iter.Valid());
        ASSERT_EQ(*model_iter, iter.key());
        ++model_iter;
        iter.Next();
      }
    }
  }

  // Backward iteration test
  {
    SkipList<Key, TestComparator>::Iterator iter(&list);
    iter.SeekToLast();

    // Compare against model iterator
    for (std::set<Key>::reverse_iterator model_iter = keys.rbegin();
         model_iter != keys.rend();
         ++model_iter) {
      ASSERT_TRUE(iter.Valid());
      ASSERT_EQ(*model_iter, iter.key());
      iter.Prev();
    }
    ASSERT_TRUE(!iter.Valid());
  }
}
|
||||
|
||||
// We want to make sure that with a single writer and multiple
|
||||
// concurrent readers (with no synchronization other than when a
|
||||
// reader's iterator is created), the reader always observes all the
|
||||
// data that was present in the skip list when the iterator was
|
||||
// constructor. Because insertions are happening concurrently, we may
|
||||
// also observe new values that were inserted since the iterator was
|
||||
// constructed, but we should never miss any values that were present
|
||||
// at iterator construction time.
|
||||
//
|
||||
// We generate multi-part keys:
|
||||
// <key,gen,hash>
|
||||
// where:
|
||||
// key is in range [0..K-1]
|
||||
// gen is a generation number for key
|
||||
// hash is hash(key,gen)
|
||||
//
|
||||
// The insertion code picks a random key, sets gen to be 1 + the last
|
||||
// generation number inserted for that key, and sets hash to Hash(key,gen).
|
||||
//
|
||||
// At the beginning of a read, we snapshot the last inserted
|
||||
// generation number for each key. We then iterate, including random
|
||||
// calls to Next() and Seek(). For every key we encounter, we
|
||||
// check that it is either expected given the initial snapshot or has
|
||||
// been concurrently added since the iterator started.
|
||||
// Scaffolding for the single-writer / multi-reader skip-list test described
// in the comment block above.  Keys pack <key, gen, hash> into one uint64:
// bits [63:40] = key index, [39:8] = generation, [7:0] = hash(key, gen),
// as shown by the shifts in key()/gen()/hash()/MakeKey() below.
class ConcurrentTest {
 private:
  static const uint32_t K = 4;  // number of distinct key indices

  // Field extractors for the packed key layout.
  static uint64_t key(Key key) { return (key >> 40); }
  static uint64_t gen(Key key) { return (key >> 8) & 0xffffffffu; }
  static uint64_t hash(Key key) { return key & 0xff; }

  static uint64_t HashNumbers(uint64_t k, uint64_t g) {
    uint64_t data[2] = { k, g };
    return Hash(reinterpret_cast<char*>(data), sizeof(data), 0);
  }

  static Key MakeKey(uint64_t k, uint64_t g) {
    assert(sizeof(Key) == sizeof(uint64_t));
    assert(k <= K);  // We sometimes pass K to seek to the end of the skiplist
    assert(g <= 0xffffffffu);
    return ((k << 40) | (g << 8) | (HashNumbers(k, g) & 0xff));
  }

  // A key is valid iff its embedded hash matches hash(key, gen); used to
  // detect torn/corrupt reads.
  static bool IsValidKey(Key k) {
    return hash(k) == (HashNumbers(key(k), gen(k)) & 0xff);
  }

  static Key RandomTarget(Random* rnd) {
    switch (rnd->Next() % 10) {
      case 0:
        // Seek to beginning
        return MakeKey(0, 0);
      case 1:
        // Seek to end
        return MakeKey(K, 0);
      default:
        // Seek to middle
        return MakeKey(rnd->Next() % K, 0);
    }
  }

  // Per-key generation
  struct State {
    port::AtomicPointer generation[K];
    void Set(int k, intptr_t v) {
      generation[k].Release_Store(reinterpret_cast<void*>(v));
    }
    intptr_t Get(int k) {
      return reinterpret_cast<intptr_t>(generation[k].Acquire_Load());
    }

    State() {
      for (unsigned int k = 0; k < K; k++) {
        Set(k, 0);
      }
    }
  };

  // Current state of the test
  State current_;

  Arena arena_;

  // SkipList is not protected by mu_.  We just use a single writer
  // thread to modify it.
  SkipList<Key, TestComparator> list_;

 public:
  ConcurrentTest() : list_(TestComparator(), &arena_) {}

  // REQUIRES: External synchronization
  void WriteStep(Random* rnd) {
    const uint32_t k = rnd->Next() % K;
    const intptr_t g = current_.Get(k) + 1;
    const Key key = MakeKey(k, g);
    list_.Insert(key);
    // Publish the new generation only after the insert is visible.
    current_.Set(k, g);
  }

  // Safe to call concurrently with WriteStep (readers need no locking).
  void ReadStep(Random* rnd) {
    // Remember the initial committed state of the skiplist.
    State initial_state;
    for (unsigned int k = 0; k < K; k++) {
      initial_state.Set(k, current_.Get(k));
    }

    Key pos = RandomTarget(rnd);
    SkipList<Key, TestComparator>::Iterator iter(&list_);
    iter.Seek(pos);
    while (true) {
      Key current;
      if (!iter.Valid()) {
        current = MakeKey(K, 0);  // sentinel past the last valid key
      } else {
        current = iter.key();
        ASSERT_TRUE(IsValidKey(current)) << current;
      }
      ASSERT_LE(pos, current) << "should not go backwards";

      // Verify that everything in [pos,current) was not present in
      // initial_state.
      while (pos < current) {
        ASSERT_LT(key(pos), K) << pos;

        // Note that generation 0 is never inserted, so it is ok if
        // <*,0,*> is missing.
        ASSERT_TRUE((gen(pos) == 0U) ||
                    (gen(pos) > (uint64_t)initial_state.Get(key(pos)))
                    ) << "key: " << key(pos)
                      << "; gen: " << gen(pos)
                      << "; initgen: "
                      << initial_state.Get(key(pos));

        // Advance to next key in the valid key space
        if (key(pos) < key(current)) {
          pos = MakeKey(key(pos) + 1, 0);
        } else {
          pos = MakeKey(key(pos), gen(pos) + 1);
        }
      }

      if (!iter.Valid()) {
        break;
      }

      // Randomly either step forward or re-seek to a later random target.
      if (rnd->Next() % 2) {
        iter.Next();
        pos = MakeKey(key(pos), gen(pos) + 1);
      } else {
        Key new_target = RandomTarget(rnd);
        if (new_target > pos) {
          pos = new_target;
          iter.Seek(new_target);
        }
      }
    }
  }
};
|
||||
// Out-of-class definition required (pre-C++17) because K is ODR-used above.
const uint32_t ConcurrentTest::K;
|
||||
|
||||
// Simple test that does single-threaded testing of the ConcurrentTest
|
||||
// scaffolding.
|
||||
// Simple test that does single-threaded testing of the ConcurrentTest
// scaffolding: interleave reads and writes on one thread to validate the
// invariant checks themselves.
TEST(SkipTest, ConcurrentWithoutThreads) {
  ConcurrentTest test;
  Random rnd(test::RandomSeed());
  for (int i = 0; i < 10000; i++) {
    test.ReadStep(&rnd);
    test.WriteStep(&rnd);
  }
}
|
||||
|
||||
// Shared state between the writer (main thread) and a background reader:
// holds the ConcurrentTest under test, the reader's RNG seed, a quit flag,
// and a mutex/condvar-protected reader lifecycle state machine.
class TestState {
 public:
  ConcurrentTest t_;
  int seed_;
  // Non-null once the writer wants the reader to stop; read with
  // Acquire_Load in the reader loop.
  port::AtomicPointer quit_flag_;

  enum ReaderState {
    STARTING,
    RUNNING,
    DONE
  };

  explicit TestState(int s)
      : seed_(s),
        quit_flag_(nullptr),
        state_(STARTING),
        state_cv_(&mu_) {}

  // Blocks until the reader announces state "s".
  void Wait(ReaderState s) {
    mu_.Lock();
    while (state_ != s) {
      state_cv_.Wait();
    }
    mu_.Unlock();
  }

  // Announces state "s" and wakes the waiter.
  void Change(ReaderState s) {
    mu_.Lock();
    state_ = s;
    state_cv_.Signal();
    mu_.Unlock();
  }

 private:
  port::Mutex mu_;
  ReaderState state_;  // guarded by mu_
  port::CondVar state_cv_;
};
|
||||
|
||||
static void ConcurrentReader(void* arg) {
|
||||
TestState* state = reinterpret_cast<TestState*>(arg);
|
||||
Random rnd(state->seed_);
|
||||
int64_t reads = 0;
|
||||
state->Change(TestState::RUNNING);
|
||||
while (!state->quit_flag_.Acquire_Load()) {
|
||||
state->t_.ReadStep(&rnd);
|
||||
++reads;
|
||||
}
|
||||
state->Change(TestState::DONE);
|
||||
}
|
||||
|
||||
static void RunConcurrent(int run) {
|
||||
const int seed = test::RandomSeed() + (run * 100);
|
||||
Random rnd(seed);
|
||||
const int N = 1000;
|
||||
const int kSize = 1000;
|
||||
for (int i = 0; i < N; i++) {
|
||||
if ((i % 100) == 0) {
|
||||
fprintf(stderr, "Run %d of %d\n", i, N);
|
||||
}
|
||||
TestState state(seed + 1);
|
||||
Env::Default()->Schedule(ConcurrentReader, &state);
|
||||
state.Wait(TestState::RUNNING);
|
||||
for (int i = 0; i < kSize; i++) {
|
||||
state.t_.WriteStep(&rnd);
|
||||
}
|
||||
state.quit_flag_.Release_Store(&state); // Any non-nullptr arg will do
|
||||
state.Wait(TestState::DONE);
|
||||
}
|
||||
}
|
||||
|
||||
// Five independently-seeded multi-threaded runs (seed offset = run * 100).
TEST(SkipTest, Concurrent1) { RunConcurrent(1); }
TEST(SkipTest, Concurrent2) { RunConcurrent(2); }
TEST(SkipTest, Concurrent3) { RunConcurrent(3); }
TEST(SkipTest, Concurrent4) { RunConcurrent(4); }
TEST(SkipTest, Concurrent5) { RunConcurrent(5); }
|
||||
|
||||
} // namespace rocksdb
|
||||
|
||||
// Test entry point: runs all TEST() cases registered in this file.
int main(int argc, char** argv) {
  return rocksdb::test::RunAllTests();
}
|
||||
86
db/snapshot.h
Normal file
86
db/snapshot.h
Normal file
@@ -0,0 +1,86 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#pragma once
|
||||
#include "rocksdb/db.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
class SnapshotList;
|
||||
|
||||
// Snapshots are kept in a doubly-linked list in the DB.
|
||||
// Each SnapshotImpl corresponds to a particular sequence number.
|
||||
class SnapshotImpl : public Snapshot {
 public:
  SequenceNumber number_;  // const after creation

 private:
  friend class SnapshotList;

  // SnapshotImpl is kept in a doubly-linked circular list
  // (links are owned and maintained exclusively by SnapshotList).
  SnapshotImpl* prev_;
  SnapshotImpl* next_;

  SnapshotList* list_;                 // just for sanity checks
};
|
||||
|
||||
class SnapshotList {
|
||||
public:
|
||||
SnapshotList() {
|
||||
list_.prev_ = &list_;
|
||||
list_.next_ = &list_;
|
||||
list_.number_ = 0xFFFFFFFFL; // placeholder marker, for debugging
|
||||
}
|
||||
|
||||
bool empty() const { return list_.next_ == &list_; }
|
||||
SnapshotImpl* oldest() const { assert(!empty()); return list_.next_; }
|
||||
SnapshotImpl* newest() const { assert(!empty()); return list_.prev_; }
|
||||
|
||||
const SnapshotImpl* New(SequenceNumber seq) {
|
||||
SnapshotImpl* s = new SnapshotImpl;
|
||||
s->number_ = seq;
|
||||
s->list_ = this;
|
||||
s->next_ = &list_;
|
||||
s->prev_ = list_.prev_;
|
||||
s->prev_->next_ = s;
|
||||
s->next_->prev_ = s;
|
||||
return s;
|
||||
}
|
||||
|
||||
void Delete(const SnapshotImpl* s) {
|
||||
assert(s->list_ == this);
|
||||
s->prev_->next_ = s->next_;
|
||||
s->next_->prev_ = s->prev_;
|
||||
delete s;
|
||||
}
|
||||
|
||||
// retrieve all snapshot numbers. They are sorted in ascending order.
|
||||
void getAll(std::vector<SequenceNumber>& ret) {
|
||||
if (empty()) return;
|
||||
SnapshotImpl* s = &list_;
|
||||
while (s->next_ != &list_) {
|
||||
ret.push_back(s->next_->number_);
|
||||
s = s ->next_;
|
||||
}
|
||||
}
|
||||
|
||||
// get the sequence number of the most recent snapshot
|
||||
const SequenceNumber GetNewest() {
|
||||
if (empty()) {
|
||||
return 0;
|
||||
}
|
||||
return newest()->number_;
|
||||
}
|
||||
|
||||
private:
|
||||
// Dummy head of doubly-linked list of snapshots
|
||||
SnapshotImpl list_;
|
||||
};
|
||||
|
||||
} // namespace rocksdb
|
||||
198
db/table_cache.cc
Normal file
198
db/table_cache.cc
Normal file
@@ -0,0 +1,198 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include "db/table_cache.h"
|
||||
|
||||
#include "db/filename.h"
|
||||
#include "db/version_edit.h"
|
||||
|
||||
#include "rocksdb/statistics.h"
|
||||
#include "table/iterator_wrapper.h"
|
||||
#include "table/table_reader.h"
|
||||
#include "util/coding.h"
|
||||
#include "util/stop_watch.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
// Cache deleter: frees the TableReader stored as a cache value when its
// entry is evicted or erased.
static void DeleteEntry(const Slice& key, void* value) {
  TableReader* table_reader = reinterpret_cast<TableReader*>(value);
  delete table_reader;
}
|
||||
|
||||
// Iterator cleanup callback: releases the cache handle (arg2) against the
// cache (arg1) when the iterator that pinned it is destroyed.
static void UnrefEntry(void* arg1, void* arg2) {
  Cache* cache = reinterpret_cast<Cache*>(arg1);
  Cache::Handle* h = reinterpret_cast<Cache::Handle*>(arg2);
  cache->Release(h);
}
|
||||
|
||||
// Builds the cache key for a file number: the raw bytes of the uint64.
// NOTE: the Slice aliases *file_number, so the pointee must stay alive for
// as long as the Slice is used (all call sites here pass a local).
static Slice GetSliceForFileNumber(uint64_t* file_number) {
  return Slice(reinterpret_cast<const char*>(file_number),
               sizeof(*file_number));
}
|
||||
|
||||
// The TableCache does not own "options", "storage_options" or "cache";
// all must outlive this object.
TableCache::TableCache(const std::string& dbname, const Options* options,
                       const EnvOptions& storage_options, Cache* const cache)
    : env_(options->env),
      dbname_(dbname),
      options_(options),
      storage_options_(storage_options),
      cache_(cache) {}
|
||||
|
||||
// Nothing to release: cached TableReaders are destroyed by the cache via
// DeleteEntry, and cache_ itself is externally owned.
TableCache::~TableCache() {
}
|
||||
|
||||
// Extracts the TableReader stored as the value of a cache handle.  The
// returned pointer is valid only while the handle remains un-released.
TableReader* TableCache::GetTableReaderFromHandle(Cache::Handle* handle) {
  return reinterpret_cast<TableReader*>(cache_->Value(handle));
}
|
||||
|
||||
// Drops a reference previously obtained from FindTable().
void TableCache::ReleaseHandle(Cache::Handle* handle) {
  cache_->Release(handle);
}
|
||||
|
||||
// Looks up the TableReader for "file_number" in the cache, opening and
// inserting it on a miss (unless no_io is set, in which case
// Status::Incomplete is returned).  On success *handle pins the entry; the
// caller must ReleaseHandle() it.  *table_io, if non-null, is set to true
// only when storage I/O was actually performed.
Status TableCache::FindTable(const EnvOptions& toptions,
                             const InternalKeyComparator& internal_comparator,
                             uint64_t file_number, uint64_t file_size,
                             Cache::Handle** handle, bool* table_io,
                             const bool no_io) {
  Status s;
  Slice key = GetSliceForFileNumber(&file_number);
  *handle = cache_->Lookup(key);
  if (*handle == nullptr) {
    if (no_io) { // Dont do IO and return a not-found status
      return Status::Incomplete("Table not found in table_cache, no_io is set");
    }
    if (table_io != nullptr) {
      *table_io = true;    // we had to do IO from storage
    }
    std::string fname = TableFileName(dbname_, file_number);
    unique_ptr<RandomAccessFile> file;
    unique_ptr<TableReader> table_reader;
    s = env_->NewRandomAccessFile(fname, &file, toptions);
    RecordTick(options_->statistics.get(), NO_FILE_OPENS);
    if (s.ok()) {
      if (options_->advise_random_on_open) {
        file->Hint(RandomAccessFile::RANDOM);
      }
      // Time the table open under the TABLE_OPEN_IO_MICROS stopwatch.
      StopWatch sw(env_, options_->statistics.get(), TABLE_OPEN_IO_MICROS);
      s = options_->table_factory->NewTableReader(
          *options_, toptions, internal_comparator, std::move(file), file_size,
          &table_reader);
    }

    if (!s.ok()) {
      assert(table_reader == nullptr);
      RecordTick(options_->statistics.get(), NO_FILE_ERRORS);
      // We do not cache error results so that if the error is transient,
      // or somebody repairs the file, we recover automatically.
    } else {
      // NewTableReader took ownership of the file; the cache now owns the
      // reader (charge = 1, freed by DeleteEntry on eviction).
      assert(file.get() == nullptr);
      *handle = cache_->Insert(key, table_reader.release(), 1, &DeleteEntry);
    }
  }
  return s;
}
|
||||
|
||||
// Returns an iterator over the given file.  If the FileMetaData already
// carries a pre-loaded TableReader it is used directly; otherwise the reader
// is fetched (possibly opened) through FindTable and the resulting cache
// handle is released when the iterator is destroyed.  On lookup failure an
// error iterator is returned instead of a null pointer.
Iterator* TableCache::NewIterator(const ReadOptions& options,
                                  const EnvOptions& toptions,
                                  const InternalKeyComparator& icomparator,
                                  const FileMetaData& file_meta,
                                  TableReader** table_reader_ptr,
                                  bool for_compaction, Arena* arena) {
  if (table_reader_ptr != nullptr) {
    *table_reader_ptr = nullptr;
  }
  TableReader* table_reader = file_meta.table_reader;
  Cache::Handle* handle = nullptr;
  Status s;
  if (table_reader == nullptr) {
    // kBlockCacheTier means "no I/O allowed", which FindTable honors.
    s = FindTable(toptions, icomparator, file_meta.number, file_meta.file_size,
                  &handle, nullptr, options.read_tier == kBlockCacheTier);
    if (!s.ok()) {
      return NewErrorIterator(s, arena);
    }
    table_reader = GetTableReaderFromHandle(handle);
  }

  Iterator* result = table_reader->NewIterator(options, arena);
  if (handle != nullptr) {
    // Keep the cache entry pinned for the iterator's lifetime.
    result->RegisterCleanup(&UnrefEntry, cache_, handle);
  }
  if (table_reader_ptr != nullptr) {
    *table_reader_ptr = table_reader;
  }

  if (for_compaction) {
    table_reader->SetupForCompaction();
  }

  return result;
}
|
||||
|
||||
// Point lookup of internal key "k" in the given file.  Uses the pre-loaded
// TableReader if available, else fetches one via FindTable.  If the table is
// absent from the cache and the read tier forbids I/O, the key is flagged
// "may exist" via mark_key_may_exist and OK is returned (conservative
// answer rather than an error).
Status TableCache::Get(const ReadOptions& options,
                       const InternalKeyComparator& internal_comparator,
                       const FileMetaData& file_meta, const Slice& k, void* arg,
                       bool (*saver)(void*, const ParsedInternalKey&,
                                     const Slice&, bool),
                       bool* table_io, void (*mark_key_may_exist)(void*)) {
  TableReader* t = file_meta.table_reader;
  Status s;
  Cache::Handle* handle = nullptr;
  if (!t) {
    s = FindTable(storage_options_, internal_comparator, file_meta.number,
                  file_meta.file_size, &handle, table_io,
                  options.read_tier == kBlockCacheTier);
    if (s.ok()) {
      t = GetTableReaderFromHandle(handle);
    }
  }
  if (s.ok()) {
    s = t->Get(options, k, arg, saver, mark_key_may_exist);
    if (handle != nullptr) {
      ReleaseHandle(handle);
    }
  } else if (options.read_tier && s.IsIncomplete()) {
    // Couldnt find Table in cache but treat as kFound if no_io set
    (*mark_key_may_exist)(arg);
    return Status::OK();
  }
  return s;
}
|
||||
// Fetches the TableProperties of the given file, using the pre-loaded
// reader when present, else going through FindTable (which may return
// Status::Incomplete when no_io is set and the table is not cached).
Status TableCache::GetTableProperties(
    const EnvOptions& toptions,
    const InternalKeyComparator& internal_comparator,
    const FileMetaData& file_meta,
    std::shared_ptr<const TableProperties>* properties, bool no_io) {
  Status s;
  auto table_reader = file_meta.table_reader;
  // table already been pre-loaded?
  if (table_reader) {
    *properties = table_reader->GetTableProperties();

    return s;
  }

  bool table_io;
  Cache::Handle* table_handle = nullptr;
  s = FindTable(toptions, internal_comparator, file_meta.number,
                file_meta.file_size, &table_handle, &table_io, no_io);
  if (!s.ok()) {
    return s;
  }
  assert(table_handle);
  auto table = GetTableReaderFromHandle(table_handle);
  *properties = table->GetTableProperties();
  // Properties are held by shared_ptr, so it is safe to release the handle.
  ReleaseHandle(table_handle);
  return s;
}
|
||||
|
||||
// Removes the entry for "file_number" from "cache" (e.g. after the file is
// deleted).  Static: operates on the passed-in cache, not a member.
void TableCache::Evict(Cache* cache, uint64_t file_number) {
  cache->Erase(GetSliceForFileNumber(&file_number));
}
|
||||
|
||||
} // namespace rocksdb
|
||||
95
db/table_cache.h
Normal file
95
db/table_cache.h
Normal file
@@ -0,0 +1,95 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
//
|
||||
// Thread-safe (provides internal synchronization)
|
||||
|
||||
#pragma once
|
||||
#include <string>
|
||||
#include <stdint.h>
|
||||
|
||||
#include "db/dbformat.h"
|
||||
#include "port/port.h"
|
||||
#include "rocksdb/cache.h"
|
||||
#include "rocksdb/env.h"
|
||||
#include "rocksdb/table.h"
|
||||
#include "table/table_reader.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
class Env;
|
||||
class Arena;
|
||||
struct FileMetaData;
|
||||
|
||||
// TODO(sdong): try to come up with a better API to pass the file information
|
||||
// other than simply passing FileMetaData.
|
||||
class TableCache {
 public:
  TableCache(const std::string& dbname, const Options* options,
             const EnvOptions& storage_options, Cache* cache);
  ~TableCache();

  // Return an iterator for the specified file number (the corresponding
  // file length must be exactly "file_size" bytes).  If "tableptr" is
  // non-nullptr, also sets "*tableptr" to point to the Table object
  // underlying the returned iterator, or nullptr if no Table object underlies
  // the returned iterator.  The returned "*tableptr" object is owned by
  // the cache and should not be deleted, and is valid for as long as the
  // returned iterator is live.
  Iterator* NewIterator(const ReadOptions& options, const EnvOptions& toptions,
                        const InternalKeyComparator& internal_comparator,
                        const FileMetaData& file_meta,
                        TableReader** table_reader_ptr = nullptr,
                        bool for_compaction = false, Arena* arena = nullptr);

  // If a seek to internal key "k" in specified file finds an entry,
  // call (*handle_result)(arg, found_key, found_value) repeatedly until
  // it returns false.
  Status Get(const ReadOptions& options,
             const InternalKeyComparator& internal_comparator,
             const FileMetaData& file_meta, const Slice& k, void* arg,
             bool (*handle_result)(void*, const ParsedInternalKey&,
                                   const Slice&, bool),
             bool* table_io, void (*mark_key_may_exist)(void*) = nullptr);

  // Evict any entry for the specified file number
  static void Evict(Cache* cache, uint64_t file_number);

  // Find table reader: cache lookup, opening the file on a miss unless
  // no_io is set.  On success the out-handle must later be released.
  Status FindTable(const EnvOptions& toptions,
                   const InternalKeyComparator& internal_comparator,
                   uint64_t file_number, uint64_t file_size, Cache::Handle**,
                   bool* table_io = nullptr, const bool no_io = false);

  // Get TableReader from a cache handle.
  TableReader* GetTableReaderFromHandle(Cache::Handle* handle);

  // Get the table properties of a given table.
  // @no_io: indicates if we should load table to the cache if it is not present
  //         in table cache yet.
  // @returns: `properties` will be reset on success. Please note that we will
  //  return Status::Incomplete() if table is not present in cache and
  //  we set `no_io` to be true.
  Status GetTableProperties(const EnvOptions& toptions,
                            const InternalKeyComparator& internal_comparator,
                            const FileMetaData& file_meta,
                            std::shared_ptr<const TableProperties>* properties,
                            bool no_io = false);

  // Release the handle from a cache
  void ReleaseHandle(Cache::Handle* handle);

 private:
  // None of these members are owned by the TableCache except dbname_.
  Env* const env_;
  const std::string dbname_;
  const Options* options_;
  const EnvOptions& storage_options_;
  Cache* const cache_;
};
|
||||
|
||||
} // namespace rocksdb
|
||||
83
db/table_properties_collector.cc
Normal file
83
db/table_properties_collector.cc
Normal file
@@ -0,0 +1,83 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
|
||||
#include "db/table_properties_collector.h"
|
||||
|
||||
#include "db/dbformat.h"
|
||||
#include "util/coding.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
// Called once per internal key/value added to a table under construction.
// Counts deletion entries; the value payload is ignored.
Status InternalKeyPropertiesCollector::Add(
    const Slice& key, const Slice& value) {
  ParsedInternalKey ikey;
  if (!ParseInternalKey(key, &ikey)) {
    return Status::InvalidArgument("Invalid internal key");
  }

  if (ikey.type == ValueType::kTypeDeletion) {
    ++deleted_keys_;
  }

  return Status::OK();
}
||||
|
||||
// Called when table building completes: stores the deleted-key count into
// the user-collected properties map, varint64-encoded under kDeletedKeys.
Status InternalKeyPropertiesCollector::Finish(
    UserCollectedProperties* properties) {
  assert(properties);
  // The key must not already be present; this collector runs once per table.
  assert(properties->find(
        InternalKeyTablePropertiesNames::kDeletedKeys) == properties->end());
  std::string val;

  PutVarint64(&val, deleted_keys_);
  properties->insert({ InternalKeyTablePropertiesNames::kDeletedKeys, val });

  return Status::OK();
}
|
||||
|
||||
// Human-readable view of the collected stats (decimal, not varint-encoded).
UserCollectedProperties
InternalKeyPropertiesCollector::GetReadableProperties() const {
  return {
    { "kDeletedKeys", std::to_string(deleted_keys_) }
  };
}
|
||||
|
||||
|
||||
// Adapter: strips the internal-key envelope and forwards only the user key
// (plus the value) to the wrapped user-supplied collector.
Status UserKeyTablePropertiesCollector::Add(
    const Slice& key, const Slice& value) {
  ParsedInternalKey ikey;
  if (!ParseInternalKey(key, &ikey)) {
    return Status::InvalidArgument("Invalid internal key");
  }

  return collector_->Add(ikey.user_key, value);
}
|
||||
|
||||
// Forwards table-completion to the wrapped collector unchanged.
Status UserKeyTablePropertiesCollector::Finish(
    UserCollectedProperties* properties) {
  return collector_->Finish(properties);
}
|
||||
|
||||
// Forwards the readable-properties query to the wrapped collector.
UserCollectedProperties
UserKeyTablePropertiesCollector::GetReadableProperties() const {
  return collector_->GetReadableProperties();
}
|
||||
|
||||
|
||||
// Property-map key under which the deleted-entry count is stored.
const std::string InternalKeyTablePropertiesNames::kDeletedKeys
  = "rocksdb.deleted.keys";
|
||||
|
||||
// Decodes the deleted-key count from a properties map.  Returns 0 when the
// property is absent or its varint encoding cannot be parsed.
uint64_t GetDeletedKeys(
    const UserCollectedProperties& props) {
  auto pos = props.find(InternalKeyTablePropertiesNames::kDeletedKeys);
  if (pos == props.end()) {
    return 0;
  }
  Slice raw = pos->second;
  uint64_t val = 0;
  return GetVarint64(&raw, &val) ? val : 0;
}
|
||||
|
||||
} // namespace rocksdb
|
||||
95
db/table_properties_collector.h
Normal file
95
db/table_properties_collector.h
Normal file
@@ -0,0 +1,95 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// This file defines a collection of statistics collectors.
|
||||
#pragma once
|
||||
|
||||
#include "rocksdb/table_properties.h"
|
||||
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
struct InternalKeyTablePropertiesNames {
|
||||
static const std::string kDeletedKeys;
|
||||
};
|
||||
|
||||
// Collecting the statistics for internal keys. Visible only by internal
|
||||
// rocksdb modules.
|
||||
class InternalKeyPropertiesCollector : public TablePropertiesCollector {
|
||||
public:
|
||||
virtual Status Add(const Slice& key, const Slice& value) override;
|
||||
|
||||
virtual Status Finish(UserCollectedProperties* properties) override;
|
||||
|
||||
virtual const char* Name() const override {
|
||||
return "InternalKeyPropertiesCollector";
|
||||
}
|
||||
|
||||
UserCollectedProperties GetReadableProperties() const override;
|
||||
|
||||
private:
|
||||
uint64_t deleted_keys_ = 0;
|
||||
};
|
||||
|
||||
class InternalKeyPropertiesCollectorFactory
|
||||
: public TablePropertiesCollectorFactory {
|
||||
public:
|
||||
virtual TablePropertiesCollector* CreateTablePropertiesCollector() {
|
||||
return new InternalKeyPropertiesCollector();
|
||||
}
|
||||
|
||||
virtual const char* Name() const override {
|
||||
return "InternalKeyPropertiesCollectorFactory";
|
||||
}
|
||||
};
|
||||
|
||||
// When rocksdb creates a new table, it will encode all "user keys" into
|
||||
// "internal keys", which contains meta information of a given entry.
|
||||
//
|
||||
// This class extracts user key from the encoded internal key when Add() is
|
||||
// invoked.
|
||||
class UserKeyTablePropertiesCollector : public TablePropertiesCollector {
|
||||
public:
|
||||
// transfer of ownership
|
||||
explicit UserKeyTablePropertiesCollector(TablePropertiesCollector* collector)
|
||||
: collector_(collector) {}
|
||||
|
||||
virtual ~UserKeyTablePropertiesCollector() {}
|
||||
|
||||
virtual Status Add(const Slice& key, const Slice& value) override;
|
||||
|
||||
virtual Status Finish(UserCollectedProperties* properties) override;
|
||||
|
||||
virtual const char* Name() const override { return collector_->Name(); }
|
||||
|
||||
UserCollectedProperties GetReadableProperties() const override;
|
||||
|
||||
protected:
|
||||
std::unique_ptr<TablePropertiesCollector> collector_;
|
||||
};
|
||||
|
||||
class UserKeyTablePropertiesCollectorFactory
|
||||
: public TablePropertiesCollectorFactory {
|
||||
public:
|
||||
explicit UserKeyTablePropertiesCollectorFactory(
|
||||
std::shared_ptr<TablePropertiesCollectorFactory> user_collector_factory)
|
||||
: user_collector_factory_(user_collector_factory) {}
|
||||
virtual TablePropertiesCollector* CreateTablePropertiesCollector() {
|
||||
return new UserKeyTablePropertiesCollector(
|
||||
user_collector_factory_->CreateTablePropertiesCollector());
|
||||
}
|
||||
|
||||
virtual const char* Name() const override {
|
||||
return user_collector_factory_->Name();
|
||||
}
|
||||
|
||||
private:
|
||||
std::shared_ptr<TablePropertiesCollectorFactory> user_collector_factory_;
|
||||
};
|
||||
|
||||
} // namespace rocksdb
|
||||
313
db/table_properties_collector_test.cc
Normal file
313
db/table_properties_collector_test.cc
Normal file
@@ -0,0 +1,313 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
|
||||
#include <map>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
|
||||
#include "db/db_impl.h"
|
||||
#include "db/dbformat.h"
|
||||
#include "db/table_properties_collector.h"
|
||||
#include "rocksdb/table.h"
|
||||
#include "table/block_based_table_factory.h"
|
||||
#include "table/meta_blocks.h"
|
||||
#include "table/plain_table_factory.h"
|
||||
#include "table/table_builder.h"
|
||||
#include "util/coding.h"
|
||||
#include "util/testharness.h"
|
||||
#include "util/testutil.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
class TablePropertiesTest {
|
||||
};
|
||||
|
||||
// TODO(kailiu) the following classes should be moved to some more general
|
||||
// places, so that other tests can also make use of them.
|
||||
// `FakeWritableFile` and `FakeRandomeAccessFile` bypass the real file system
|
||||
// and therefore enable us to quickly setup the tests.
|
||||
class FakeWritableFile : public WritableFile {
|
||||
public:
|
||||
~FakeWritableFile() { }
|
||||
|
||||
const std::string& contents() const { return contents_; }
|
||||
|
||||
virtual Status Close() { return Status::OK(); }
|
||||
virtual Status Flush() { return Status::OK(); }
|
||||
virtual Status Sync() { return Status::OK(); }
|
||||
|
||||
virtual Status Append(const Slice& data) {
|
||||
contents_.append(data.data(), data.size());
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
private:
|
||||
std::string contents_;
|
||||
};
|
||||
|
||||
|
||||
class FakeRandomeAccessFile : public RandomAccessFile {
|
||||
public:
|
||||
explicit FakeRandomeAccessFile(const Slice& contents)
|
||||
: contents_(contents.data(), contents.size()) {
|
||||
}
|
||||
|
||||
virtual ~FakeRandomeAccessFile() { }
|
||||
|
||||
uint64_t Size() const { return contents_.size(); }
|
||||
|
||||
virtual Status Read(uint64_t offset, size_t n, Slice* result,
|
||||
char* scratch) const {
|
||||
if (offset > contents_.size()) {
|
||||
return Status::InvalidArgument("invalid Read offset");
|
||||
}
|
||||
if (offset + n > contents_.size()) {
|
||||
n = contents_.size() - offset;
|
||||
}
|
||||
memcpy(scratch, &contents_[offset], n);
|
||||
*result = Slice(scratch, n);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
private:
|
||||
std::string contents_;
|
||||
};
|
||||
|
||||
|
||||
class DumbLogger : public Logger {
|
||||
public:
|
||||
virtual void Logv(const char* format, va_list ap) { }
|
||||
virtual size_t GetLogFileSize() const { return 0; }
|
||||
};
|
||||
|
||||
// Utilities test functions
|
||||
namespace {
|
||||
void MakeBuilder(const Options& options,
|
||||
const InternalKeyComparator& internal_comparator,
|
||||
std::unique_ptr<FakeWritableFile>* writable,
|
||||
std::unique_ptr<TableBuilder>* builder) {
|
||||
writable->reset(new FakeWritableFile);
|
||||
builder->reset(options.table_factory->NewTableBuilder(
|
||||
options, internal_comparator, writable->get(), options.compression));
|
||||
}
|
||||
} // namespace
|
||||
|
||||
// Collects keys that starts with "A" in a table.
|
||||
class RegularKeysStartWithA: public TablePropertiesCollector {
|
||||
public:
|
||||
const char* Name() const { return "RegularKeysStartWithA"; }
|
||||
|
||||
Status Finish(UserCollectedProperties* properties) {
|
||||
std::string encoded;
|
||||
PutVarint32(&encoded, count_);
|
||||
*properties = UserCollectedProperties {
|
||||
{ "TablePropertiesTest", "Rocksdb" },
|
||||
{ "Count", encoded }
|
||||
};
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status Add(const Slice& user_key, const Slice& value) {
|
||||
// simply asssume all user keys are not empty.
|
||||
if (user_key.data()[0] == 'A') {
|
||||
++count_;
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
virtual UserCollectedProperties GetReadableProperties() const {
|
||||
return UserCollectedProperties{};
|
||||
}
|
||||
|
||||
private:
|
||||
uint32_t count_ = 0;
|
||||
};
|
||||
|
||||
class RegularKeysStartWithAFactory : public TablePropertiesCollectorFactory {
|
||||
public:
|
||||
virtual TablePropertiesCollector* CreateTablePropertiesCollector() {
|
||||
return new RegularKeysStartWithA();
|
||||
}
|
||||
const char* Name() const { return "RegularKeysStartWithA"; }
|
||||
};
|
||||
|
||||
extern uint64_t kBlockBasedTableMagicNumber;
|
||||
extern uint64_t kPlainTableMagicNumber;
|
||||
namespace {
|
||||
void TestCustomizedTablePropertiesCollector(
|
||||
uint64_t magic_number, bool encode_as_internal, const Options& options,
|
||||
const InternalKeyComparator& internal_comparator) {
|
||||
// make sure the entries will be inserted with order.
|
||||
std::map<std::string, std::string> kvs = {
|
||||
{"About ", "val5"}, // starts with 'A'
|
||||
{"Abstract", "val2"}, // starts with 'A'
|
||||
{"Around ", "val7"}, // starts with 'A'
|
||||
{"Beyond ", "val3"},
|
||||
{"Builder ", "val1"},
|
||||
{"Cancel ", "val4"},
|
||||
{"Find ", "val6"},
|
||||
};
|
||||
|
||||
// -- Step 1: build table
|
||||
std::unique_ptr<TableBuilder> builder;
|
||||
std::unique_ptr<FakeWritableFile> writable;
|
||||
MakeBuilder(options, internal_comparator, &writable, &builder);
|
||||
|
||||
for (const auto& kv : kvs) {
|
||||
if (encode_as_internal) {
|
||||
InternalKey ikey(kv.first, 0, ValueType::kTypeValue);
|
||||
builder->Add(ikey.Encode(), kv.second);
|
||||
} else {
|
||||
builder->Add(kv.first, kv.second);
|
||||
}
|
||||
}
|
||||
ASSERT_OK(builder->Finish());
|
||||
|
||||
// -- Step 2: Read properties
|
||||
FakeRandomeAccessFile readable(writable->contents());
|
||||
TableProperties* props;
|
||||
Status s = ReadTableProperties(
|
||||
&readable,
|
||||
writable->contents().size(),
|
||||
magic_number,
|
||||
Env::Default(),
|
||||
nullptr,
|
||||
&props
|
||||
);
|
||||
std::unique_ptr<TableProperties> props_guard(props);
|
||||
ASSERT_OK(s);
|
||||
|
||||
auto user_collected = props->user_collected_properties;
|
||||
|
||||
ASSERT_EQ("Rocksdb", user_collected.at("TablePropertiesTest"));
|
||||
|
||||
uint32_t starts_with_A = 0;
|
||||
Slice key(user_collected.at("Count"));
|
||||
ASSERT_TRUE(GetVarint32(&key, &starts_with_A));
|
||||
ASSERT_EQ(3u, starts_with_A);
|
||||
}
|
||||
} // namespace
|
||||
|
||||
TEST(TablePropertiesTest, CustomizedTablePropertiesCollector) {
|
||||
// Test properties collectors with internal keys or regular keys
|
||||
// for block based table
|
||||
for (bool encode_as_internal : { true, false }) {
|
||||
Options options;
|
||||
std::shared_ptr<TablePropertiesCollectorFactory> collector_factory(
|
||||
new RegularKeysStartWithAFactory());
|
||||
if (encode_as_internal) {
|
||||
options.table_properties_collector_factories.emplace_back(
|
||||
new UserKeyTablePropertiesCollectorFactory(collector_factory));
|
||||
} else {
|
||||
options.table_properties_collector_factories.resize(1);
|
||||
options.table_properties_collector_factories[0] = collector_factory;
|
||||
}
|
||||
test::PlainInternalKeyComparator ikc(options.comparator);
|
||||
TestCustomizedTablePropertiesCollector(kBlockBasedTableMagicNumber,
|
||||
encode_as_internal, options, ikc);
|
||||
}
|
||||
|
||||
// test plain table
|
||||
Options options;
|
||||
options.table_properties_collector_factories.emplace_back(
|
||||
new RegularKeysStartWithAFactory());
|
||||
options.table_factory = std::make_shared<PlainTableFactory>(8, 8, 0);
|
||||
test::PlainInternalKeyComparator ikc(options.comparator);
|
||||
TestCustomizedTablePropertiesCollector(kPlainTableMagicNumber, true, options,
|
||||
ikc);
|
||||
}
|
||||
|
||||
namespace {
|
||||
void TestInternalKeyPropertiesCollector(
|
||||
uint64_t magic_number,
|
||||
bool sanitized,
|
||||
std::shared_ptr<TableFactory> table_factory) {
|
||||
InternalKey keys[] = {
|
||||
InternalKey("A ", 0, ValueType::kTypeValue),
|
||||
InternalKey("B ", 0, ValueType::kTypeValue),
|
||||
InternalKey("C ", 0, ValueType::kTypeValue),
|
||||
InternalKey("W ", 0, ValueType::kTypeDeletion),
|
||||
InternalKey("X ", 0, ValueType::kTypeDeletion),
|
||||
InternalKey("Y ", 0, ValueType::kTypeDeletion),
|
||||
InternalKey("Z ", 0, ValueType::kTypeDeletion),
|
||||
};
|
||||
|
||||
std::unique_ptr<TableBuilder> builder;
|
||||
std::unique_ptr<FakeWritableFile> writable;
|
||||
Options options;
|
||||
test::PlainInternalKeyComparator pikc(options.comparator);
|
||||
|
||||
options.table_factory = table_factory;
|
||||
if (sanitized) {
|
||||
options.table_properties_collector_factories.emplace_back(
|
||||
new RegularKeysStartWithAFactory());
|
||||
// with sanitization, even regular properties collector will be able to
|
||||
// handle internal keys.
|
||||
auto comparator = options.comparator;
|
||||
// HACK: Set options.info_log to avoid writing log in
|
||||
// SanitizeOptions().
|
||||
options.info_log = std::make_shared<DumbLogger>();
|
||||
options = SanitizeOptions("db", // just a place holder
|
||||
&pikc, nullptr, // don't care filter policy
|
||||
options);
|
||||
options.comparator = comparator;
|
||||
} else {
|
||||
options.table_properties_collector_factories = {
|
||||
std::make_shared<InternalKeyPropertiesCollectorFactory>()};
|
||||
}
|
||||
|
||||
for (int iter = 0; iter < 2; ++iter) {
|
||||
MakeBuilder(options, pikc, &writable, &builder);
|
||||
for (const auto& k : keys) {
|
||||
builder->Add(k.Encode(), "val");
|
||||
}
|
||||
|
||||
ASSERT_OK(builder->Finish());
|
||||
|
||||
FakeRandomeAccessFile readable(writable->contents());
|
||||
TableProperties* props;
|
||||
Status s =
|
||||
ReadTableProperties(&readable, writable->contents().size(),
|
||||
magic_number, Env::Default(), nullptr, &props);
|
||||
ASSERT_OK(s);
|
||||
|
||||
std::unique_ptr<TableProperties> props_guard(props);
|
||||
auto user_collected = props->user_collected_properties;
|
||||
uint64_t deleted = GetDeletedKeys(user_collected);
|
||||
ASSERT_EQ(4u, deleted);
|
||||
|
||||
if (sanitized) {
|
||||
uint32_t starts_with_A = 0;
|
||||
Slice key(user_collected.at("Count"));
|
||||
ASSERT_TRUE(GetVarint32(&key, &starts_with_A));
|
||||
ASSERT_EQ(1u, starts_with_A);
|
||||
}
|
||||
}
|
||||
}
|
||||
} // namespace
|
||||
|
||||
TEST(TablePropertiesTest, InternalKeyPropertiesCollector) {
|
||||
TestInternalKeyPropertiesCollector(
|
||||
kBlockBasedTableMagicNumber,
|
||||
true /* sanitize */,
|
||||
std::make_shared<BlockBasedTableFactory>()
|
||||
);
|
||||
TestInternalKeyPropertiesCollector(
|
||||
kBlockBasedTableMagicNumber,
|
||||
true /* not sanitize */,
|
||||
std::make_shared<BlockBasedTableFactory>()
|
||||
);
|
||||
TestInternalKeyPropertiesCollector(
|
||||
kPlainTableMagicNumber,
|
||||
false /* not sanitize */,
|
||||
std::make_shared<PlainTableFactory>(8, 8, 0)
|
||||
);
|
||||
}
|
||||
|
||||
} // namespace rocksdb
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
return rocksdb::test::RunAllTests();
|
||||
}
|
||||
221
db/tailing_iter.cc
Normal file
221
db/tailing_iter.cc
Normal file
@@ -0,0 +1,221 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
|
||||
#ifndef ROCKSDB_LITE
|
||||
#include "db/tailing_iter.h"
|
||||
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
#include "db/db_impl.h"
|
||||
#include "db/db_iter.h"
|
||||
#include "db/column_family.h"
|
||||
#include "rocksdb/env.h"
|
||||
#include "rocksdb/slice.h"
|
||||
#include "rocksdb/slice_transform.h"
|
||||
#include "table/merger.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
TailingIterator::TailingIterator(Env* const env, DBImpl* db,
|
||||
const ReadOptions& read_options, ColumnFamilyData* cfd)
|
||||
: env_(env),
|
||||
db_(db),
|
||||
read_options_(read_options),
|
||||
cfd_(cfd),
|
||||
super_version_(nullptr),
|
||||
current_(nullptr),
|
||||
status_(Status::InvalidArgument("Seek() not called on this iterator")) {}
|
||||
|
||||
TailingIterator::~TailingIterator() {
|
||||
Cleanup();
|
||||
}
|
||||
|
||||
bool TailingIterator::Valid() const {
|
||||
return current_ != nullptr;
|
||||
}
|
||||
|
||||
void TailingIterator::SeekToFirst() {
|
||||
if (!IsCurrentVersion()) {
|
||||
CreateIterators();
|
||||
}
|
||||
|
||||
mutable_->SeekToFirst();
|
||||
immutable_->SeekToFirst();
|
||||
UpdateCurrent();
|
||||
}
|
||||
|
||||
void TailingIterator::Seek(const Slice& target) {
|
||||
if (!IsCurrentVersion()) {
|
||||
CreateIterators();
|
||||
}
|
||||
|
||||
mutable_->Seek(target);
|
||||
|
||||
// We maintain the interval (prev_key_, immutable_->key()] such that there
|
||||
// are no records with keys within that range in immutable_ other than
|
||||
// immutable_->key(). Since immutable_ can't change in this version, we don't
|
||||
// need to do a seek if 'target' belongs to that interval (i.e. immutable_ is
|
||||
// already at the correct position)!
|
||||
//
|
||||
// If prefix seek is used and immutable_ is not valid, seek if target has a
|
||||
// different prefix than prev_key.
|
||||
//
|
||||
// prev_key_ is updated by Next(). SeekImmutable() sets prev_key_ to
|
||||
// 'target' -- in this case, prev_key_ is included in the interval, so
|
||||
// prev_inclusive_ has to be set.
|
||||
|
||||
const Comparator* cmp = cfd_->user_comparator();
|
||||
if (!is_prev_set_ || cmp->Compare(prev_key_, target) >= !is_prev_inclusive_ ||
|
||||
(immutable_->Valid() && cmp->Compare(target, immutable_->key()) > 0) ||
|
||||
(cfd_->options()->prefix_extractor != nullptr && !IsSamePrefix(target))) {
|
||||
SeekImmutable(target);
|
||||
}
|
||||
|
||||
UpdateCurrent();
|
||||
}
|
||||
|
||||
void TailingIterator::Next() {
|
||||
assert(Valid());
|
||||
|
||||
if (!IsCurrentVersion()) {
|
||||
// save the current key, create new iterators and then seek
|
||||
std::string current_key = key().ToString();
|
||||
Slice key_slice(current_key.data(), current_key.size());
|
||||
|
||||
CreateIterators();
|
||||
Seek(key_slice);
|
||||
|
||||
if (!Valid() || key().compare(key_slice) != 0) {
|
||||
// record with current_key no longer exists
|
||||
return;
|
||||
}
|
||||
|
||||
} else if (current_ == immutable_.get()) {
|
||||
// immutable iterator is advanced -- update prev_key_
|
||||
prev_key_ = key().ToString();
|
||||
is_prev_inclusive_ = false;
|
||||
is_prev_set_ = true;
|
||||
}
|
||||
|
||||
current_->Next();
|
||||
UpdateCurrent();
|
||||
}
|
||||
|
||||
Slice TailingIterator::key() const {
|
||||
assert(Valid());
|
||||
return current_->key();
|
||||
}
|
||||
|
||||
Slice TailingIterator::value() const {
|
||||
assert(Valid());
|
||||
return current_->value();
|
||||
}
|
||||
|
||||
Status TailingIterator::status() const {
|
||||
if (!status_.ok()) {
|
||||
return status_;
|
||||
} else if (!mutable_->status().ok()) {
|
||||
return mutable_->status();
|
||||
} else {
|
||||
return immutable_->status();
|
||||
}
|
||||
}
|
||||
|
||||
void TailingIterator::Prev() {
|
||||
status_ = Status::NotSupported("This iterator doesn't support Prev()");
|
||||
}
|
||||
|
||||
void TailingIterator::SeekToLast() {
|
||||
status_ = Status::NotSupported("This iterator doesn't support SeekToLast()");
|
||||
}
|
||||
|
||||
void TailingIterator::Cleanup() {
|
||||
// Release old super version if necessary
|
||||
mutable_.reset();
|
||||
immutable_.reset();
|
||||
if (super_version_ != nullptr && super_version_->Unref()) {
|
||||
DBImpl::DeletionState deletion_state;
|
||||
db_->mutex_.Lock();
|
||||
super_version_->Cleanup();
|
||||
db_->FindObsoleteFiles(deletion_state, false, true);
|
||||
db_->mutex_.Unlock();
|
||||
delete super_version_;
|
||||
if (deletion_state.HaveSomethingToDelete()) {
|
||||
db_->PurgeObsoleteFiles(deletion_state);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void TailingIterator::CreateIterators() {
|
||||
Cleanup();
|
||||
super_version_= cfd_->GetReferencedSuperVersion(&(db_->mutex_));
|
||||
|
||||
Iterator* mutable_iter = super_version_->mem->NewIterator(read_options_);
|
||||
// create a DBIter that only uses memtable content; see NewIterator()
|
||||
mutable_.reset(
|
||||
NewDBIterator(env_, *cfd_->options(), cfd_->user_comparator(),
|
||||
mutable_iter, kMaxSequenceNumber));
|
||||
|
||||
std::vector<Iterator*> list;
|
||||
super_version_->imm->AddIterators(read_options_, &list);
|
||||
super_version_->current->AddIterators(
|
||||
read_options_, *cfd_->soptions(), &list);
|
||||
Iterator* immutable_iter =
|
||||
NewMergingIterator(&cfd_->internal_comparator(), &list[0], list.size());
|
||||
|
||||
// create a DBIter that only uses memtable content; see NewIterator()
|
||||
immutable_.reset(
|
||||
NewDBIterator(env_, *cfd_->options(), cfd_->user_comparator(),
|
||||
immutable_iter, kMaxSequenceNumber));
|
||||
|
||||
current_ = nullptr;
|
||||
is_prev_set_ = false;
|
||||
}
|
||||
|
||||
void TailingIterator::UpdateCurrent() {
|
||||
current_ = nullptr;
|
||||
|
||||
if (mutable_->Valid()) {
|
||||
current_ = mutable_.get();
|
||||
}
|
||||
const Comparator* cmp = cfd_->user_comparator();
|
||||
if (immutable_->Valid() &&
|
||||
(current_ == nullptr ||
|
||||
cmp->Compare(immutable_->key(), current_->key()) < 0)) {
|
||||
current_ = immutable_.get();
|
||||
}
|
||||
|
||||
if (!status_.ok()) {
|
||||
// reset status that was set by Prev() or SeekToLast()
|
||||
status_ = Status::OK();
|
||||
}
|
||||
}
|
||||
|
||||
bool TailingIterator::IsCurrentVersion() const {
|
||||
return super_version_ != nullptr &&
|
||||
super_version_->version_number == cfd_->GetSuperVersionNumber();
|
||||
}
|
||||
|
||||
bool TailingIterator::IsSamePrefix(const Slice& target) const {
|
||||
const SliceTransform* extractor = cfd_->options()->prefix_extractor.get();
|
||||
|
||||
assert(extractor);
|
||||
assert(is_prev_set_);
|
||||
|
||||
return extractor->Transform(target)
|
||||
.compare(extractor->Transform(prev_key_)) == 0;
|
||||
}
|
||||
|
||||
void TailingIterator::SeekImmutable(const Slice& target) {
|
||||
prev_key_ = target.ToString();
|
||||
is_prev_inclusive_ = true;
|
||||
is_prev_set_ = true;
|
||||
|
||||
immutable_->Seek(target);
|
||||
}
|
||||
|
||||
} // namespace rocksdb
|
||||
#endif // ROCKSDB_LITE
|
||||
97
db/tailing_iter.h
Normal file
97
db/tailing_iter.h
Normal file
@@ -0,0 +1,97 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
#pragma once
|
||||
|
||||
#ifndef ROCKSDB_LITE
|
||||
|
||||
#include <string>
|
||||
|
||||
#include "rocksdb/db.h"
|
||||
#include "rocksdb/iterator.h"
|
||||
#include "rocksdb/options.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
class DBImpl;
|
||||
class Env;
|
||||
struct SuperVersion;
|
||||
class ColumnFamilyData;
|
||||
|
||||
/**
|
||||
* TailingIterator is a special type of iterator that doesn't use an (implicit)
|
||||
* snapshot. In other words, it can be used to read data that was added to the
|
||||
* db after the iterator had been created.
|
||||
*
|
||||
* TailingIterator is optimized for sequential reading. It doesn't support
|
||||
* Prev() and SeekToLast() operations.
|
||||
*/
|
||||
class TailingIterator : public Iterator {
|
||||
public:
|
||||
TailingIterator(Env* const env, DBImpl* db, const ReadOptions& read_options,
|
||||
ColumnFamilyData* cfd);
|
||||
virtual ~TailingIterator();
|
||||
|
||||
virtual bool Valid() const override;
|
||||
virtual void SeekToFirst() override;
|
||||
virtual void SeekToLast() override;
|
||||
virtual void Seek(const Slice& target) override;
|
||||
virtual void Next() override;
|
||||
virtual void Prev() override;
|
||||
virtual Slice key() const override;
|
||||
virtual Slice value() const override;
|
||||
virtual Status status() const override;
|
||||
|
||||
private:
|
||||
void Cleanup();
|
||||
|
||||
Env* const env_;
|
||||
DBImpl* const db_;
|
||||
const ReadOptions read_options_;
|
||||
ColumnFamilyData* const cfd_;
|
||||
SuperVersion* super_version_;
|
||||
|
||||
// TailingIterator merges the contents of the two iterators below (one using
|
||||
// mutable memtable contents only, other over SSTs and immutable memtables).
|
||||
// See DBIter::GetTailingIteratorPair().
|
||||
std::unique_ptr<Iterator> mutable_;
|
||||
std::unique_ptr<Iterator> immutable_;
|
||||
|
||||
// points to either mutable_ or immutable_
|
||||
Iterator* current_;
|
||||
|
||||
// key that precedes immutable iterator's current key
|
||||
std::string prev_key_;
|
||||
|
||||
// unless prev_set is true, prev_key/prev_head is not valid and shouldn't be
|
||||
// used; reset by createIterators()
|
||||
bool is_prev_set_;
|
||||
|
||||
// prev_key_ was set by SeekImmutable(), which means that the interval of
|
||||
// keys covered by immutable_ is [prev_key_, current], i.e. it includes the
|
||||
// left endpoint
|
||||
bool is_prev_inclusive_;
|
||||
|
||||
// internal iterator status
|
||||
Status status_;
|
||||
|
||||
// check if this iterator's version matches DB's version
|
||||
bool IsCurrentVersion() const;
|
||||
|
||||
// check if SeekImmutable() is needed due to target having a different prefix
|
||||
// than prev_key_ (used when in prefix seek mode)
|
||||
bool IsSamePrefix(const Slice& target) const;
|
||||
|
||||
// creates mutable_ and immutable_ iterators and updates version_number_
|
||||
void CreateIterators();
|
||||
|
||||
// set current_ to be one of the iterators with the smallest key
|
||||
void UpdateCurrent();
|
||||
|
||||
// seek on immutable_ and update prev_key
|
||||
void SeekImmutable(const Slice& target);
|
||||
};
|
||||
|
||||
} // namespace rocksdb
|
||||
#endif // ROCKSDB_LITE
|
||||
262
db/transaction_log_impl.cc
Normal file
262
db/transaction_log_impl.cc
Normal file
@@ -0,0 +1,262 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
|
||||
#ifndef ROCKSDB_LITE
|
||||
#include "db/transaction_log_impl.h"
|
||||
#include "db/write_batch_internal.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
TransactionLogIteratorImpl::TransactionLogIteratorImpl(
|
||||
const std::string& dir, const DBOptions* options,
|
||||
const TransactionLogIterator::ReadOptions& read_options,
|
||||
const EnvOptions& soptions, const SequenceNumber seq,
|
||||
std::unique_ptr<VectorLogPtr> files, DBImpl const* const dbimpl)
|
||||
: dir_(dir),
|
||||
options_(options),
|
||||
read_options_(read_options),
|
||||
soptions_(soptions),
|
||||
startingSequenceNumber_(seq),
|
||||
files_(std::move(files)),
|
||||
started_(false),
|
||||
isValid_(false),
|
||||
currentFileIndex_(0),
|
||||
currentBatchSeq_(0),
|
||||
currentLastSeq_(0),
|
||||
dbimpl_(dbimpl) {
|
||||
assert(files_ != nullptr);
|
||||
assert(dbimpl_ != nullptr);
|
||||
|
||||
reporter_.env = options_->env;
|
||||
reporter_.info_log = options_->info_log.get();
|
||||
SeekToStartSequence(); // Seek till starting sequence
|
||||
}
|
||||
|
||||
Status TransactionLogIteratorImpl::OpenLogFile(
|
||||
const LogFile* logFile,
|
||||
unique_ptr<SequentialFile>* file) {
|
||||
Env* env = options_->env;
|
||||
if (logFile->Type() == kArchivedLogFile) {
|
||||
std::string fname = ArchivedLogFileName(dir_, logFile->LogNumber());
|
||||
return env->NewSequentialFile(fname, file, soptions_);
|
||||
} else {
|
||||
std::string fname = LogFileName(dir_, logFile->LogNumber());
|
||||
Status status = env->NewSequentialFile(fname, file, soptions_);
|
||||
if (!status.ok()) {
|
||||
// If cannot open file in DB directory.
|
||||
// Try the archive dir, as it could have moved in the meanwhile.
|
||||
fname = ArchivedLogFileName(dir_, logFile->LogNumber());
|
||||
status = env->NewSequentialFile(fname, file, soptions_);
|
||||
}
|
||||
return status;
|
||||
}
|
||||
}
|
||||
|
||||
BatchResult TransactionLogIteratorImpl::GetBatch() {
|
||||
assert(isValid_); // cannot call in a non valid state.
|
||||
BatchResult result;
|
||||
result.sequence = currentBatchSeq_;
|
||||
result.writeBatchPtr = std::move(currentBatch_);
|
||||
return result;
|
||||
}
|
||||
|
||||
Status TransactionLogIteratorImpl::status() {
|
||||
return currentStatus_;
|
||||
}
|
||||
|
||||
bool TransactionLogIteratorImpl::Valid() {
|
||||
return started_ && isValid_;
|
||||
}
|
||||
|
||||
bool TransactionLogIteratorImpl::RestrictedRead(
|
||||
Slice* record,
|
||||
std::string* scratch) {
|
||||
// Don't read if no more complete entries to read from logs
|
||||
if (currentLastSeq_ >= dbimpl_->GetLatestSequenceNumber()) {
|
||||
return false;
|
||||
}
|
||||
return currentLogReader_->ReadRecord(record, scratch);
|
||||
}
|
||||
|
||||
void TransactionLogIteratorImpl::SeekToStartSequence(
|
||||
uint64_t startFileIndex,
|
||||
bool strict) {
|
||||
std::string scratch;
|
||||
Slice record;
|
||||
started_ = false;
|
||||
isValid_ = false;
|
||||
if (files_->size() <= startFileIndex) {
|
||||
return;
|
||||
}
|
||||
Status s = OpenLogReader(files_->at(startFileIndex).get());
|
||||
if (!s.ok()) {
|
||||
currentStatus_ = s;
|
||||
reporter_.Info(currentStatus_.ToString().c_str());
|
||||
return;
|
||||
}
|
||||
while (RestrictedRead(&record, &scratch)) {
|
||||
if (record.size() < 12) {
|
||||
reporter_.Corruption(
|
||||
record.size(), Status::Corruption("very small log record"));
|
||||
continue;
|
||||
}
|
||||
UpdateCurrentWriteBatch(record);
|
||||
if (currentLastSeq_ >= startingSequenceNumber_) {
|
||||
if (strict && currentBatchSeq_ != startingSequenceNumber_) {
|
||||
currentStatus_ = Status::Corruption("Gap in sequence number. Could not "
|
||||
"seek to required sequence number");
|
||||
reporter_.Info(currentStatus_.ToString().c_str());
|
||||
return;
|
||||
} else if (strict) {
|
||||
reporter_.Info("Could seek required sequence number. Iterator will "
|
||||
"continue.");
|
||||
}
|
||||
isValid_ = true;
|
||||
started_ = true; // set started_ as we could seek till starting sequence
|
||||
return;
|
||||
} else {
|
||||
isValid_ = false;
|
||||
}
|
||||
}
|
||||
|
||||
// Could not find start sequence in first file. Normally this must be the
|
||||
// only file. Otherwise log the error and let the iterator return next entry
|
||||
// If strict is set, we want to seek exactly till the start sequence and it
|
||||
// should have been present in the file we scanned above
|
||||
if (strict) {
|
||||
currentStatus_ = Status::Corruption("Gap in sequence number. Could not "
|
||||
"seek to required sequence number");
|
||||
reporter_.Info(currentStatus_.ToString().c_str());
|
||||
} else if (files_->size() != 1) {
|
||||
currentStatus_ = Status::Corruption("Start sequence was not found, "
|
||||
"skipping to the next available");
|
||||
reporter_.Info(currentStatus_.ToString().c_str());
|
||||
// Let NextImpl find the next available entry. started_ remains false
|
||||
// because we don't want to check for gaps while moving to start sequence
|
||||
NextImpl(true);
|
||||
}
|
||||
}
|
||||
|
||||
void TransactionLogIteratorImpl::Next() {
|
||||
return NextImpl(false);
|
||||
}
|
||||
|
||||
// Advances the iterator to the next WriteBatch record across the WAL files.
// internal==true means the call originates from SeekToStartSequence(), which
// reuses this routine to skip forward without the continuity checks that
// apply once the iterator has properly started.
void TransactionLogIteratorImpl::NextImpl(bool internal) {
  std::string scratch;
  Slice record;
  isValid_ = false;
  if (!internal && !started_) {
    // Runs every time until we can seek to the start sequence
    return SeekToStartSequence();
  }
  while(true) {
    assert(currentLogReader_);
    if (currentLogReader_->IsEOF()) {
      // The live WAL may have grown since we last hit EOF; clear the marker
      // so the reader can pick up newly appended records.
      currentLogReader_->UnmarkEOF();
    }
    while (RestrictedRead(&record, &scratch)) {
      if (record.size() < 12) {
        // Too small to hold a serialized WriteBatch header (presumably the
        // 8-byte sequence + 4-byte count prefix — confirm against
        // WriteBatchInternal); treat as corruption and skip the record.
        reporter_.Corruption(
          record.size(), Status::Corruption("very small log record"));
        continue;
      } else {
        // started_ should be true if called by application
        assert(internal || started_);
        // started_ should be false if called internally
        assert(!internal || !started_);
        UpdateCurrentWriteBatch(record);
        if (internal && !started_) {
          started_ = true;
        }
        return;
      }
    }

    // Open the next file
    if (currentFileIndex_ < files_->size() - 1) {
      ++currentFileIndex_;
      Status status = OpenLogReader(files_->at(currentFileIndex_).get());
      if (!status.ok()) {
        isValid_ = false;
        currentStatus_ = status;
        return;
      }
    } else {
      // No files left. Report OK if we have consumed everything the DB has
      // sequenced so far; otherwise the WAL ended before reaching the
      // latest sequence number, which is corruption.
      isValid_ = false;
      if (currentLastSeq_ == dbimpl_->GetLatestSequenceNumber()) {
        currentStatus_ = Status::OK();
      } else {
        currentStatus_ = Status::Corruption("NO MORE DATA LEFT");
      }
      return;
    }
  }
}
|
||||
|
||||
// Returns true iff `batch` starts exactly at `expectedSeq`. On a mismatch,
// logs the discontinuity via the reporter and returns false so the caller
// can reseek to the expected sequence number.
bool TransactionLogIteratorImpl::IsBatchExpected(
    const WriteBatch* batch,
    const SequenceNumber expectedSeq) {
  assert(batch);
  SequenceNumber batchSeq = WriteBatchInternal::Sequence(batch);
  if (batchSeq != expectedSeq) {
    char buf[200];
    snprintf(buf, sizeof(buf),
             "Discontinuity in log records. Got seq=%lu, Expected seq=%lu, "
             "Last flushed seq=%lu.Log iterator will reseek the correct "
             "batch.",
             (unsigned long)batchSeq,
             (unsigned long)expectedSeq,
             (unsigned long)dbimpl_->GetLatestSequenceNumber());
    reporter_.Info(buf);
    return false;
  }
  return true;
}
|
||||
|
||||
// Deserializes `record` into a WriteBatch and installs it as the current
// batch, updating currentBatchSeq_/currentLastSeq_. If the iterator has
// started and the batch does not continue the sequence (a gap), reseeks to
// the expected sequence number instead of accepting the batch.
void TransactionLogIteratorImpl::UpdateCurrentWriteBatch(const Slice& record) {
  std::unique_ptr<WriteBatch> batch(new WriteBatch());
  WriteBatchInternal::SetContents(batch.get(), record);

  SequenceNumber expectedSeq = currentLastSeq_ + 1;
  // If the iterator has started, then confirm that we get continuous batches
  if (started_ && !IsBatchExpected(batch.get(), expectedSeq)) {
    // Seek to the batch having expected sequence number
    if (expectedSeq < files_->at(currentFileIndex_)->StartSequence()) {
      // Expected batch must lie in the previous log file
      // Avoid underflow.
      if (currentFileIndex_ != 0) {
        currentFileIndex_--;
      }
    }
    startingSequenceNumber_ = expectedSeq;
    // currentStatus_ will be set to Ok if reseek succeeds
    currentStatus_ = Status::NotFound("Gap in sequence numbers");
    // strict=true: the reseek must land exactly on expectedSeq.
    return SeekToStartSequence(currentFileIndex_, true);
  }

  currentBatchSeq_ = WriteBatchInternal::Sequence(batch.get());
  // Last sequence covered by this batch = first sequence + record count - 1.
  currentLastSeq_ = currentBatchSeq_ +
                    WriteBatchInternal::Count(batch.get()) - 1;
  // currentBatchSeq_ can only change here
  assert(currentLastSeq_ <= dbimpl_->GetLatestSequenceNumber());

  currentBatch_ = move(batch);
  isValid_ = true;
  currentStatus_ = Status::OK();
}
|
||||
|
||||
// Opens `logFile` and replaces currentLogReader_ with a log::Reader over it.
// Returns a non-OK status (and leaves the reader untouched) if the file
// cannot be opened.
Status TransactionLogIteratorImpl::OpenLogReader(const LogFile* logFile) {
  unique_ptr<SequentialFile> walFile;
  const Status openStatus = OpenLogFile(logFile, &walFile);
  if (!openStatus.ok()) {
    return openStatus;
  }
  assert(walFile);
  currentLogReader_.reset(new log::Reader(
      std::move(walFile), &reporter_, read_options_.verify_checksums_, 0));
  return Status::OK();
}
|
||||
} // namespace rocksdb
|
||||
#endif // ROCKSDB_LITE
|
||||
120
db/transaction_log_impl.h
Normal file
120
db/transaction_log_impl.h
Normal file
@@ -0,0 +1,120 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
|
||||
#ifndef ROCKSDB_LITE
|
||||
#pragma once
|
||||
#include <vector>
|
||||
|
||||
#include "rocksdb/env.h"
|
||||
#include "rocksdb/options.h"
|
||||
#include "rocksdb/types.h"
|
||||
#include "rocksdb/transaction_log.h"
|
||||
#include "db/db_impl.h"
|
||||
#include "db/log_reader.h"
|
||||
#include "db/filename.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
// Forwards corruption events and informational messages from the WAL
// log::Reader to the DB's info log.
struct LogReporter : public log::Reader::Reporter {
  Env* env;  // NOTE(review): not referenced in this file — confirm whether callers rely on it
  Logger* info_log;
  // Called by log::Reader when `bytes` of the WAL cannot be parsed.
  virtual void Corruption(size_t bytes, const Status& s) {
    Log(info_log, "dropping %zu bytes; %s", bytes, s.ToString().c_str());
  }
  // Logs an informational message verbatim.
  virtual void Info(const char* s) {
    Log(info_log, "%s", s);
  }
};
|
||||
|
||||
// Concrete LogFile describing a single WAL file (live or archived):
// immutable metadata captured at construction time.
class LogFileImpl : public LogFile {
 public:
  LogFileImpl(uint64_t logNum, WalFileType logType, SequenceNumber startSeq,
              uint64_t sizeBytes) :
    logNumber_(logNum),
    type_(logType),
    startSequence_(startSeq),
    sizeFileBytes_(sizeBytes) {
  }

  // Path relative to the DB directory; archived files resolve under the
  // archive naming scheme.
  std::string PathName() const {
    if (type_ == kArchivedLogFile) {
      return ArchivedLogFileName("", logNumber_);
    }
    return LogFileName("", logNumber_);
  }

  uint64_t LogNumber() const { return logNumber_; }

  WalFileType Type() const { return type_; }

  // First sequence number recorded for this file.
  SequenceNumber StartSequence() const { return startSequence_; }

  uint64_t SizeFileBytes() const { return sizeFileBytes_; }

  // Orders log files by log number, i.e. by age.
  bool operator < (const LogFile& that) const {
    return LogNumber() < that.LogNumber();
  }

 private:
  uint64_t logNumber_;
  WalFileType type_;
  SequenceNumber startSequence_;
  uint64_t sizeFileBytes_;

};
|
||||
|
||||
// TransactionLogIterator implementation that replays WriteBatches from a
// sorted list of WAL files, starting at a caller-supplied sequence number.
// Once started, it guarantees that consecutive batches are contiguous in
// sequence numbers, reseeking on detected gaps.
class TransactionLogIteratorImpl : public TransactionLogIterator {
 public:
  TransactionLogIteratorImpl(
      const std::string& dir, const DBOptions* options,
      const TransactionLogIterator::ReadOptions& read_options,
      const EnvOptions& soptions, const SequenceNumber seqNum,
      std::unique_ptr<VectorLogPtr> files, DBImpl const* const dbimpl);

  virtual bool Valid();

  virtual void Next();

  virtual Status status();

  virtual BatchResult GetBatch();

 private:
  const std::string& dir_;
  const DBOptions* options_;
  const TransactionLogIterator::ReadOptions read_options_;
  const EnvOptions& soptions_;
  SequenceNumber startingSequenceNumber_;  // first sequence the caller wants
  std::unique_ptr<VectorLogPtr> files_;    // WAL files to iterate, in order
  bool started_;
  bool isValid_;  // not valid until the start sequence has been reached
  Status currentStatus_;
  size_t currentFileIndex_;  // index into files_ of the file being read
  std::unique_ptr<WriteBatch> currentBatch_;
  unique_ptr<log::Reader> currentLogReader_;
  Status OpenLogFile(const LogFile* logFile, unique_ptr<SequentialFile>* file);
  LogReporter reporter_;
  SequenceNumber currentBatchSeq_; // sequence number at start of current batch
  SequenceNumber currentLastSeq_; // last sequence in the current batch
  DBImpl const * const dbimpl_; // The db on whose log files this iterates

  // Reads from transaction log only if the writebatch record has been written
  bool RestrictedRead(Slice* record, std::string* scratch);
  // Seeks to startingSequenceNumber reading from startFileIndex in files_.
  // If strict is set, then it must get a batch starting exactly with
  // startingSequenceNumber.
  void SeekToStartSequence(uint64_t startFileIndex = 0, bool strict = false);
  // Implementation of Next. SeekToStartSequence calls it internally with
  // internal=true to let it find next entry even if it has to jump gaps because
  // the iterator may start off from the first available entry but promises to
  // be continuous after that
  void NextImpl(bool internal = false);
  // Check if batch is expected, else return false
  bool IsBatchExpected(const WriteBatch* batch, SequenceNumber expectedSeq);
  // Update current batch if a continuous batch is found, else reseek
  void UpdateCurrentWriteBatch(const Slice& record);
  // Replaces currentLogReader_ with a reader over `file`.
  Status OpenLogReader(const LogFile* file);
};
|
||||
} // namespace rocksdb
|
||||
#endif // ROCKSDB_LITE
|
||||
364
db/version_edit.cc
Normal file
364
db/version_edit.cc
Normal file
@@ -0,0 +1,364 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include "db/version_edit.h"
|
||||
|
||||
#include "db/version_set.h"
|
||||
#include "util/coding.h"
|
||||
#include "rocksdb/slice.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
// Tag numbers for serialized VersionEdit. These numbers are written to
|
||||
// disk and should not be changed.
|
||||
// Tag numbers for serialized VersionEdit. These numbers are written to
// disk and should not be changed.
enum Tag {
  kComparator = 1,
  kLogNumber = 2,
  kNextFileNumber = 3,
  kLastSequence = 4,
  kCompactPointer = 5,  // obsolete: tolerated by DecodeFrom, never encoded
  kDeletedFile = 6,
  kNewFile = 7,         // legacy file record without the seqno range
  // 8 was used for large value refs
  kPrevLogNumber = 9,

  // these are new formats divergent from open source leveldb
  kNewFile2 = 100,  // store smallest & largest seqno

  kColumnFamily = 200,  // specify column family for version edit
  kColumnFamilyAdd = 201,
  kColumnFamilyDrop = 202,
  kMaxColumnFamily = 203,
};
|
||||
|
||||
// Resets every field of the edit to its default/empty state, grouped by
// category: numeric metadata, presence flags, containers, and the
// column-family bookkeeping.
void VersionEdit::Clear() {
  // Numeric metadata.
  max_level_ = 0;
  log_number_ = 0;
  prev_log_number_ = 0;
  last_sequence_ = 0;
  next_file_number_ = 0;
  max_column_family_ = 0;
  // Presence flags.
  has_comparator_ = false;
  has_log_number_ = false;
  has_prev_log_number_ = false;
  has_next_file_number_ = false;
  has_last_sequence_ = false;
  has_max_column_family_ = false;
  // Containers.
  comparator_.clear();
  deleted_files_.clear();
  new_files_.clear();
  // Column-family bookkeeping.
  column_family_ = 0;
  is_column_family_add_ = 0;
  is_column_family_drop_ = 0;
  column_family_name_.clear();
}
|
||||
|
||||
// Serializes this edit into *dst as a sequence of (varint tag, payload)
// pairs. Only fields whose has_* flag is set (or whose value differs from
// the default) are emitted; see the Tag enum for the stable on-disk codes.
// New files are always written with kNewFile2 (includes the seqno range).
void VersionEdit::EncodeTo(std::string* dst) const {
  if (has_comparator_) {
    PutVarint32(dst, kComparator);
    PutLengthPrefixedSlice(dst, comparator_);
  }
  if (has_log_number_) {
    PutVarint32(dst, kLogNumber);
    PutVarint64(dst, log_number_);
  }
  if (has_prev_log_number_) {
    PutVarint32(dst, kPrevLogNumber);
    PutVarint64(dst, prev_log_number_);
  }
  if (has_next_file_number_) {
    PutVarint32(dst, kNextFileNumber);
    PutVarint64(dst, next_file_number_);
  }
  if (has_last_sequence_) {
    PutVarint32(dst, kLastSequence);
    PutVarint64(dst, last_sequence_);
  }
  if (has_max_column_family_) {
    PutVarint32(dst, kMaxColumnFamily);
    PutVarint32(dst, max_column_family_);
  }

  for (const auto& deleted : deleted_files_) {
    PutVarint32(dst, kDeletedFile);
    PutVarint32(dst, deleted.first /* level */);
    PutVarint64(dst, deleted.second /* file number */);
  }

  for (size_t i = 0; i < new_files_.size(); i++) {
    const FileMetaData& f = new_files_[i].second;
    PutVarint32(dst, kNewFile2);
    PutVarint32(dst, new_files_[i].first);  // level
    PutVarint64(dst, f.number);
    PutVarint64(dst, f.file_size);
    PutLengthPrefixedSlice(dst, f.smallest.Encode());
    PutLengthPrefixedSlice(dst, f.largest.Encode());
    PutVarint64(dst, f.smallest_seqno);
    PutVarint64(dst, f.largest_seqno);
  }

  // 0 is default and does not need to be explicitly written
  if (column_family_ != 0) {
    PutVarint32(dst, kColumnFamily);
    PutVarint32(dst, column_family_);
  }

  if (is_column_family_add_) {
    PutVarint32(dst, kColumnFamilyAdd);
    PutLengthPrefixedSlice(dst, Slice(column_family_name_));
  }

  if (is_column_family_drop_) {
    // Drop carries no payload; the column family id above identifies it.
    PutVarint32(dst, kColumnFamilyDrop);
  }
}
|
||||
|
||||
// Parses a length-prefixed InternalKey from *input into *dst.
// Returns true only if the slice parses and the decoded key is valid.
static bool GetInternalKey(Slice* input, InternalKey* dst) {
  Slice encoded;
  if (!GetLengthPrefixedSlice(input, &encoded)) {
    return false;
  }
  dst->DecodeFrom(encoded);
  return dst->Valid();
}
|
||||
|
||||
// Parses a varint32 level from *input into *level, tracking the maximum
// level seen so far in max_level_. Returns false on parse failure; *msg is
// not modified here (callers assign an error message themselves).
bool VersionEdit::GetLevel(Slice* input, int* level, const char** msg) {
  uint32_t v;
  if (GetVarint32(input, &v)) {
    *level = v;
    if (max_level_ < *level) {
      max_level_ = *level;
    }
    return true;
  } else {
    return false;
  }
}
|
||||
|
||||
// Parses a serialized VersionEdit (the inverse of EncodeTo). Reads
// (tag, payload) pairs until the input is exhausted or a malformed field is
// hit; on failure returns Status::Corruption naming the field that failed.
// Unknown tags and trailing bytes are also reported as corruption.
Status VersionEdit::DecodeFrom(const Slice& src) {
  Clear();
  Slice input = src;
  const char* msg = nullptr;  // first error encountered, or nullptr
  uint32_t tag;

  // Temporary storage for parsing
  int level;
  uint64_t number;
  FileMetaData f;
  Slice str;
  InternalKey key;

  while (msg == nullptr && GetVarint32(&input, &tag)) {
    switch (tag) {
      case kComparator:
        if (GetLengthPrefixedSlice(&input, &str)) {
          comparator_ = str.ToString();
          has_comparator_ = true;
        } else {
          msg = "comparator name";
        }
        break;

      case kLogNumber:
        if (GetVarint64(&input, &log_number_)) {
          has_log_number_ = true;
        } else {
          msg = "log number";
        }
        break;

      case kPrevLogNumber:
        if (GetVarint64(&input, &prev_log_number_)) {
          has_prev_log_number_ = true;
        } else {
          msg = "previous log number";
        }
        break;

      case kNextFileNumber:
        if (GetVarint64(&input, &next_file_number_)) {
          has_next_file_number_ = true;
        } else {
          msg = "next file number";
        }
        break;

      case kLastSequence:
        if (GetVarint64(&input, &last_sequence_)) {
          has_last_sequence_ = true;
        } else {
          msg = "last sequence number";
        }
        break;

      case kMaxColumnFamily:
        if (GetVarint32(&input, &max_column_family_)) {
          has_max_column_family_ = true;
        } else {
          msg = "max column family";
        }
        break;

      case kCompactPointer:
        if (GetLevel(&input, &level, &msg) &&
            GetInternalKey(&input, &key)) {
          // we don't use compact pointers anymore,
          // but we should not fail if they are still
          // in manifest
        } else {
          if (!msg) {
            msg = "compaction pointer";
          }
        }
        break;

      case kDeletedFile:
        if (GetLevel(&input, &level, &msg) &&
            GetVarint64(&input, &number)) {
          deleted_files_.insert(std::make_pair(level, number));
        } else {
          if (!msg) {
            msg = "deleted file";
          }
        }
        break;

      case kNewFile:
        // Legacy record: no smallest/largest seqno fields.
        if (GetLevel(&input, &level, &msg) &&
            GetVarint64(&input, &f.number) &&
            GetVarint64(&input, &f.file_size) &&
            GetInternalKey(&input, &f.smallest) &&
            GetInternalKey(&input, &f.largest)) {
          new_files_.push_back(std::make_pair(level, f));
        } else {
          if (!msg) {
            msg = "new-file entry";
          }
        }
        break;

      case kNewFile2:
        if (GetLevel(&input, &level, &msg) &&
            GetVarint64(&input, &f.number) &&
            GetVarint64(&input, &f.file_size) &&
            GetInternalKey(&input, &f.smallest) &&
            GetInternalKey(&input, &f.largest) &&
            GetVarint64(&input, &f.smallest_seqno) &&
            GetVarint64(&input, &f.largest_seqno) ) {
          new_files_.push_back(std::make_pair(level, f));
        } else {
          if (!msg) {
            msg = "new-file2 entry";
          }
        }
        break;

      case kColumnFamily:
        if (!GetVarint32(&input, &column_family_)) {
          if (!msg) {
            msg = "set column family id";
          }
        }
        break;

      case kColumnFamilyAdd:
        if (GetLengthPrefixedSlice(&input, &str)) {
          is_column_family_add_ = true;
          column_family_name_ = str.ToString();
        } else {
          if (!msg) {
            msg = "column family add";
          }
        }
        break;

      case kColumnFamilyDrop:
        is_column_family_drop_ = true;
        break;

      default:
        msg = "unknown tag";
        break;
    }
  }

  // Leftover bytes after the last complete tag also indicate corruption.
  if (msg == nullptr && !input.empty()) {
    msg = "invalid tag";
  }

  Status result;
  if (msg != nullptr) {
    result = Status::Corruption("VersionEdit", msg);
  }
  return result;
}
|
||||
|
||||
// Renders a human-readable multi-line summary of the edit for logs/tools.
// hex_key controls whether internal keys are printed as hex.
std::string VersionEdit::DebugString(bool hex_key) const {
  std::string r;
  r.append("VersionEdit {");
  if (has_comparator_) {
    r.append("\n  Comparator: ");
    r.append(comparator_);
  }
  if (has_log_number_) {
    r.append("\n  LogNumber: ");
    AppendNumberTo(&r, log_number_);
  }
  if (has_prev_log_number_) {
    r.append("\n  PrevLogNumber: ");
    AppendNumberTo(&r, prev_log_number_);
  }
  if (has_next_file_number_) {
    r.append("\n  NextFile: ");
    AppendNumberTo(&r, next_file_number_);
  }
  if (has_last_sequence_) {
    r.append("\n  LastSeq: ");
    AppendNumberTo(&r, last_sequence_);
  }
  // One line per deleted file: "level number".
  for (DeletedFileSet::const_iterator iter = deleted_files_.begin();
       iter != deleted_files_.end();
       ++iter) {
    r.append("\n  DeleteFile: ");
    AppendNumberTo(&r, iter->first);
    r.append(" ");
    AppendNumberTo(&r, iter->second);
  }
  // One line per added file: "level number size smallest .. largest".
  for (size_t i = 0; i < new_files_.size(); i++) {
    const FileMetaData& f = new_files_[i].second;
    r.append("\n  AddFile: ");
    AppendNumberTo(&r, new_files_[i].first);
    r.append(" ");
    AppendNumberTo(&r, f.number);
    r.append(" ");
    AppendNumberTo(&r, f.file_size);
    r.append(" ");
    r.append(f.smallest.DebugString(hex_key));
    r.append(" .. ");
    r.append(f.largest.DebugString(hex_key));
  }
  r.append("\n  ColumnFamily: ");
  AppendNumberTo(&r, column_family_);
  if (is_column_family_add_) {
    r.append("\n  ColumnFamilyAdd: ");
    r.append(column_family_name_);
  }
  if (is_column_family_drop_) {
    r.append("\n  ColumnFamilyDrop");
  }
  if (has_max_column_family_) {
    r.append("\n  MaxColumnFamily: ");
    AppendNumberTo(&r, max_column_family_);
  }
  r.append("\n}\n");
  return r;
}
|
||||
|
||||
} // namespace rocksdb
|
||||
176
db/version_edit.h
Normal file
176
db/version_edit.h
Normal file
@@ -0,0 +1,176 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#pragma once
|
||||
#include <set>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include "rocksdb/cache.h"
|
||||
#include "db/dbformat.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
class VersionSet;
|
||||
|
||||
struct FileMetaData {
|
||||
int refs;
|
||||
int allowed_seeks; // Seeks allowed until compaction
|
||||
uint64_t number;
|
||||
uint64_t file_size; // File size in bytes
|
||||
InternalKey smallest; // Smallest internal key served by table
|
||||
InternalKey largest; // Largest internal key served by table
|
||||
bool being_compacted; // Is this file undergoing compaction?
|
||||
SequenceNumber smallest_seqno;// The smallest seqno in this file
|
||||
SequenceNumber largest_seqno; // The largest seqno in this file
|
||||
|
||||
// Needs to be disposed when refs becomes 0.
|
||||
Cache::Handle* table_reader_handle;
|
||||
// Table reader in table_reader_handle
|
||||
TableReader* table_reader;
|
||||
|
||||
FileMetaData(uint64_t number, uint64_t file_size)
|
||||
: refs(0),
|
||||
allowed_seeks(1 << 30),
|
||||
number(number),
|
||||
file_size(file_size),
|
||||
being_compacted(false),
|
||||
table_reader_handle(nullptr),
|
||||
table_reader(nullptr) {}
|
||||
FileMetaData() : FileMetaData(0, 0) {}
|
||||
};
|
||||
|
||||
// A VersionEdit records one atomic change to the LSM state: files added or
// deleted per level, metadata updates (log/file numbers, last sequence),
// and column-family add/drop operations. It serializes to and from the
// MANIFEST via EncodeTo/DecodeFrom.
class VersionEdit {
 public:
  VersionEdit() { Clear(); }
  ~VersionEdit() { }

  // Resets all fields to defaults (also invoked by the constructor).
  void Clear();

  void SetComparatorName(const Slice& name) {
    has_comparator_ = true;
    comparator_ = name.ToString();
  }
  void SetLogNumber(uint64_t num) {
    has_log_number_ = true;
    log_number_ = num;
  }
  void SetPrevLogNumber(uint64_t num) {
    has_prev_log_number_ = true;
    prev_log_number_ = num;
  }
  void SetNextFile(uint64_t num) {
    has_next_file_number_ = true;
    next_file_number_ = num;
  }
  void SetLastSequence(SequenceNumber seq) {
    has_last_sequence_ = true;
    last_sequence_ = seq;
  }
  void SetMaxColumnFamily(uint32_t max_column_family) {
    has_max_column_family_ = true;
    max_column_family_ = max_column_family;
  }

  // Add the specified file at the specified number.
  // REQUIRES: This version has not been saved (see VersionSet::SaveTo)
  // REQUIRES: "smallest" and "largest" are smallest and largest keys in file
  void AddFile(int level, uint64_t file,
               uint64_t file_size,
               const InternalKey& smallest,
               const InternalKey& largest,
               const SequenceNumber& smallest_seqno,
               const SequenceNumber& largest_seqno) {
    assert(smallest_seqno <= largest_seqno);
    FileMetaData f;
    f.number = file;
    f.file_size = file_size;
    f.smallest = smallest;
    f.largest = largest;
    f.smallest_seqno = smallest_seqno;
    f.largest_seqno = largest_seqno;
    new_files_.push_back(std::make_pair(level, f));
  }

  // Delete the specified "file" from the specified "level".
  void DeleteFile(int level, uint64_t file) {
    deleted_files_.insert({level, file});
  }

  // Number of edits (added plus deleted files)
  int NumEntries() {
    return new_files_.size() + deleted_files_.size();
  }

  // True iff this edit adds or drops a column family (such edits must not
  // carry file changes — see the asserts below).
  bool IsColumnFamilyManipulation() {
    return is_column_family_add_ || is_column_family_drop_;
  }

  void SetColumnFamily(uint32_t column_family_id) {
    column_family_ = column_family_id;
  }

  // set column family ID by calling SetColumnFamily()
  void AddColumnFamily(const std::string& name) {
    assert(!is_column_family_drop_);
    assert(!is_column_family_add_);
    assert(NumEntries() == 0);
    is_column_family_add_ = true;
    column_family_name_ = name;
  }

  // set column family ID by calling SetColumnFamily()
  void DropColumnFamily() {
    assert(!is_column_family_drop_);
    assert(!is_column_family_add_);
    assert(NumEntries() == 0);
    is_column_family_drop_ = true;
  }

  // MANIFEST (de)serialization; see version_edit.cc for the wire format.
  void EncodeTo(std::string* dst) const;
  Status DecodeFrom(const Slice& src);

  std::string DebugString(bool hex_key = false) const;

 private:
  friend class VersionSet;

  // (level, file number) pairs.
  typedef std::set< std::pair<int, uint64_t>> DeletedFileSet;

  bool GetLevel(Slice* input, int* level, const char** msg);

  int max_level_;  // highest level seen while decoding
  std::string comparator_;
  uint64_t log_number_;
  uint64_t prev_log_number_;
  uint64_t next_file_number_;
  uint32_t max_column_family_;
  SequenceNumber last_sequence_;
  // has_* flags track which optional fields are set (and thus encoded).
  bool has_comparator_;
  bool has_log_number_;
  bool has_prev_log_number_;
  bool has_next_file_number_;
  bool has_last_sequence_;
  bool has_max_column_family_;

  DeletedFileSet deleted_files_;
  std::vector<std::pair<int, FileMetaData>> new_files_;

  // Each version edit record should have column_family_id set
  // If it's not set, it is default (0)
  uint32_t column_family_;
  // a version edit can be either column_family add or
  // column_family drop. If it's column family add,
  // it also includes column family name.
  bool is_column_family_drop_;
  bool is_column_family_add_;
  std::string column_family_name_;
};
|
||||
|
||||
} // namespace rocksdb
|
||||
65
db/version_edit_test.cc
Normal file
65
db/version_edit_test.cc
Normal file
@@ -0,0 +1,65 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include "db/version_edit.h"
|
||||
#include "util/testharness.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
// Round-trips `edit` through EncodeTo/DecodeFrom and verifies that
// re-encoding the parsed copy yields byte-identical output.
static void TestEncodeDecode(const VersionEdit& edit) {
  std::string encoded, encoded2;
  edit.EncodeTo(&encoded);
  VersionEdit parsed;
  Status s = parsed.DecodeFrom(encoded);
  ASSERT_TRUE(s.ok()) << s.ToString();
  parsed.EncodeTo(&encoded2);
  ASSERT_EQ(encoded, encoded2);
}
|
||||
|
||||
class VersionEditTest { };  // Empty fixture class referenced by the TEST macros below.
|
||||
|
||||
// Round-trip an edit that accumulates added/deleted files across iterations
// plus the scalar metadata fields; uses large (2^50-scale) numbers to
// exercise multi-byte varint encoding.
TEST(VersionEditTest, EncodeDecode) {
  static const uint64_t kBig = 1ull << 50;

  VersionEdit edit;
  for (int i = 0; i < 4; i++) {
    TestEncodeDecode(edit);
    edit.AddFile(3, kBig + 300 + i, kBig + 400 + i,
                 InternalKey("foo", kBig + 500 + i, kTypeValue),
                 InternalKey("zoo", kBig + 600 + i, kTypeDeletion),
                 kBig + 500 + i,
                 kBig + 600 + i);
    edit.DeleteFile(4, kBig + 700 + i);
  }

  edit.SetComparatorName("foo");
  edit.SetLogNumber(kBig + 100);
  edit.SetNextFile(kBig + 200);
  edit.SetLastSequence(kBig + 1000);
  TestEncodeDecode(edit);
}
|
||||
|
||||
// Round-trip the column-family manipulation records: an add (with name and
// max-column-family) and, after Clear(), a drop.
TEST(VersionEditTest, ColumnFamilyTest) {
  VersionEdit edit;
  edit.SetColumnFamily(2);
  edit.AddColumnFamily("column_family");
  edit.SetMaxColumnFamily(5);
  TestEncodeDecode(edit);

  edit.Clear();
  edit.SetColumnFamily(3);
  edit.DropColumnFamily();
  TestEncodeDecode(edit);
}
|
||||
|
||||
} // namespace rocksdb
|
||||
|
||||
// Test entry point: runs all TESTs registered above via the test harness.
int main(int argc, char** argv) {
  return rocksdb::test::RunAllTests();
}
|
||||
2822
db/version_set.cc
Normal file
2822
db/version_set.cc
Normal file
File diff suppressed because it is too large
Load Diff
499
db/version_set.h
Normal file
499
db/version_set.h
Normal file
@@ -0,0 +1,499 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
//
|
||||
// The representation of a DBImpl consists of a set of Versions. The
|
||||
// newest version is called "current". Older versions may be kept
|
||||
// around to provide a consistent view to live iterators.
|
||||
//
|
||||
// Each Version keeps track of a set of Table files per level. The
|
||||
// entire set of versions is maintained in a VersionSet.
|
||||
//
|
||||
// Version,VersionSet are thread-compatible, but require external
|
||||
// synchronization on all accesses.
|
||||
|
||||
#pragma once
|
||||
#include <map>
|
||||
#include <memory>
|
||||
#include <set>
|
||||
#include <vector>
|
||||
#include <deque>
|
||||
#include <atomic>
|
||||
#include <limits>
|
||||
#include "db/dbformat.h"
|
||||
#include "db/version_edit.h"
|
||||
#include "port/port.h"
|
||||
#include "db/table_cache.h"
|
||||
#include "db/compaction.h"
|
||||
#include "db/compaction_picker.h"
|
||||
#include "db/column_family.h"
|
||||
#include "db/log_reader.h"
|
||||
#include "db/file_indexer.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
namespace log { class Writer; }
|
||||
|
||||
class Compaction;
|
||||
class CompactionPicker;
|
||||
class Iterator;
|
||||
class LogBuffer;
|
||||
class LookupKey;
|
||||
class MemTable;
|
||||
class Version;
|
||||
class VersionSet;
|
||||
class MergeContext;
|
||||
class ColumnFamilyData;
|
||||
class ColumnFamilySet;
|
||||
class TableCache;
|
||||
class MergeIteratorBuilder;
|
||||
|
||||
// Return the smallest index i such that files[i]->largest >= key.
|
||||
// Return files.size() if there is no such file.
|
||||
// REQUIRES: "files" contains a sorted list of non-overlapping files.
|
||||
extern int FindFile(const InternalKeyComparator& icmp,
|
||||
const std::vector<FileMetaData*>& files,
|
||||
const Slice& key);
|
||||
|
||||
// Returns true iff some file in "files" overlaps the user key range
|
||||
// [*smallest,*largest].
|
||||
// smallest==nullptr represents a key smaller than all keys in the DB.
|
||||
// largest==nullptr represents a key largest than all keys in the DB.
|
||||
// REQUIRES: If disjoint_sorted_files, files[] contains disjoint ranges
|
||||
// in sorted order.
|
||||
extern bool SomeFileOverlapsRange(
|
||||
const InternalKeyComparator& icmp,
|
||||
bool disjoint_sorted_files,
|
||||
const std::vector<FileMetaData*>& files,
|
||||
const Slice* smallest_user_key,
|
||||
const Slice* largest_user_key);
|
||||
|
||||
// A Version is an immutable snapshot of the set of SST files that make up
// the database at one point in time, organized by level. Versions are kept
// in a doubly-linked list owned by a VersionSet and are reference-counted
// so live iterators can pin them.
class Version {
 public:
  // Append to *iters a sequence of iterators that will
  // yield the contents of this Version when merged together.
  // REQUIRES: This version has been saved (see VersionSet::SaveTo)
  void AddIterators(const ReadOptions&, const EnvOptions& soptions,
                    std::vector<Iterator*>* iters);

  // Overload that feeds the per-file/per-level iterators into a merging
  // iterator builder instead of collecting them in a vector.
  void AddIterators(const ReadOptions&, const EnvOptions& soptions,
                    MergeIteratorBuilder* merger_iter_builder);

  // Lookup the value for key. If found, store it in *val and
  // return OK. Else return a non-OK status. Fills *stats.
  // Uses *operands to store merge_operator operations to apply later
  // REQUIRES: lock is not held
  struct GetStats {
    FileMetaData* seek_file;   // file whose seek stats should be charged
    int seek_file_level;       // level of seek_file
  };
  void Get(const ReadOptions&, const LookupKey& key, std::string* val,
           Status* status, MergeContext* merge_context, GetStats* stats,
           bool* value_found = nullptr);

  // Adds "stats" into the current state. Returns true if a new
  // compaction may need to be triggered, false otherwise.
  // REQUIRES: lock is held
  bool UpdateStats(const GetStats& stats);

  // Updates internal structures that keep track of compaction scores
  // We use compaction scores to figure out which compaction to do next
  // REQUIRES: If Version is not yet saved to current_, it can be called
  // without a lock. Once a version is saved to current_, call only with
  // mutex held
  void ComputeCompactionScore(std::vector<uint64_t>& size_being_compacted);

  // Reference count management (so Versions do not disappear out from
  // under live iterators)
  void Ref();
  // Decrease reference count. Delete the object if no reference left
  // and return true. Otherwise, return false.
  bool Unref();

  // Returns true iff some level needs a compaction.
  bool NeedsCompaction() const;

  // Returns the maximum compaction score for levels 1 to max
  double MaxCompactionScore() const { return max_compaction_score_; }

  // See field declaration
  int MaxCompactionScoreLevel() const { return max_compaction_score_level_; }

  // Store in *inputs all files in "level" that overlap [begin,end].
  void GetOverlappingInputs(
      int level,
      const InternalKey* begin,  // nullptr means before all keys
      const InternalKey* end,    // nullptr means after all keys
      std::vector<FileMetaData*>* inputs,
      int hint_index = -1,         // index of overlap file
      int* file_index = nullptr);  // return index of overlap file

  // Binary-search variant of GetOverlappingInputs for levels whose files
  // are disjoint and sorted.
  void GetOverlappingInputsBinarySearch(
      int level,
      const Slice& begin,  // nullptr means before all keys
      const Slice& end,    // nullptr means after all keys
      std::vector<FileMetaData*>* inputs,
      int hint_index,    // index of overlap file
      int* file_index);  // return index of overlap file

  // Extend an overlap set outward from a known overlapping file.
  void ExtendOverlappingInputs(
      int level,
      const Slice& begin,  // nullptr means before all keys
      const Slice& end,    // nullptr means after all keys
      std::vector<FileMetaData*>* inputs,
      unsigned int index);  // start extending from this index

  // Returns true iff some file in the specified level overlaps
  // some part of [*smallest_user_key,*largest_user_key].
  // smallest_user_key==NULL represents a key smaller than all keys in the DB.
  // largest_user_key==NULL represents a key larger than all keys in the DB.
  bool OverlapInLevel(int level,
                      const Slice* smallest_user_key,
                      const Slice* largest_user_key);

  // Returns true iff the first or last file in inputs contains
  // an overlapping user key to the file "just outside" of it (i.e.
  // just after the last file, or just before the first file)
  // REQUIRES: "*inputs" is a sorted list of non-overlapping files
  bool HasOverlappingUserKey(const std::vector<FileMetaData*>* inputs,
                             int level);

  // Return the level at which we should place a new memtable compaction
  // result that covers the range [smallest_user_key,largest_user_key].
  int PickLevelForMemTableOutput(const Slice& smallest_user_key,
                                 const Slice& largest_user_key);

  int NumberLevels() const { return num_levels_; }

  // REQUIRES: lock is held
  int NumLevelFiles(int level) const { return files_[level].size(); }

  // Return the combined file size of all files at the specified level.
  int64_t NumLevelBytes(int level) const;

  // Return a human-readable short (single-line) summary of the number
  // of files per level. Uses *scratch as backing store.
  struct LevelSummaryStorage {
    char buffer[100];
  };
  struct FileSummaryStorage {
    char buffer[1000];
  };
  const char* LevelSummary(LevelSummaryStorage* scratch) const;
  // Return a human-readable short (single-line) summary of files
  // in a specified level. Uses *scratch as backing store.
  const char* LevelFileSummary(FileSummaryStorage* scratch, int level) const;

  // Return the maximum overlapping data (in bytes) at next level for any
  // file at a level >= 1.
  int64_t MaxNextLevelOverlappingBytes();

  // Add all files listed in the current version to *live.
  void AddLiveFiles(std::set<uint64_t>* live);

  // Return a human readable string that describes this version's contents.
  std::string DebugString(bool hex = false) const;

  // Returns the version number of this version
  uint64_t GetVersionNumber() const { return version_number_; }

  // REQUIRES: lock is held
  // On success, *props will be populated with all SSTables' table properties.
  // The keys of `props` are the sst file name, the values of `props` are the
  // tables' properties, represented as shared_ptr.
  Status GetPropertiesOfAllTables(TablePropertiesCollection* props);

  // used to sort files by size
  struct Fsize {
    int index;           // index of the file in files_[level]
    FileMetaData* file;  // the file itself
  };

 private:
  friend class Compaction;
  friend class VersionSet;
  friend class DBImpl;
  friend class ColumnFamilyData;
  friend class CompactionPicker;
  friend class LevelCompactionPicker;
  friend class UniversalCompactionPicker;
  friend class FIFOCompactionPicker;
  friend class ForwardIterator;

  class LevelFileNumIterator;
  class LevelFileIteratorState;

  // Whether a prefix-bloom check rules out internal_prefix for the file
  // currently under level_iter — see implementation for the exact contract.
  bool PrefixMayMatch(const ReadOptions& options, Iterator* level_iter,
                      const Slice& internal_prefix) const;

  // Sort all files for this version based on their file size and
  // record results in files_by_size_. The largest files are listed first.
  void UpdateFilesBySize();

  ColumnFamilyData* cfd_;  // ColumnFamilyData to which this Version belongs
  const InternalKeyComparator* internal_comparator_;
  const Comparator* user_comparator_;
  TableCache* table_cache_;
  const MergeOperator* merge_operator_;
  Logger* info_log_;
  Statistics* db_statistics_;
  VersionSet* vset_;  // VersionSet to which this Version belongs
  Version* next_;     // Next version in linked list
  Version* prev_;     // Previous version in linked list
  int refs_;          // Number of live refs to this version
  int num_levels_;    // Number of levels

  // List of files per level, files in each level are arranged
  // in increasing order of keys
  std::vector<FileMetaData*>* files_;

  // A list for the same set of files that are stored in files_,
  // but files in each level are now sorted based on file
  // size. The file with the largest size is at the front.
  // This vector stores the index of the file from files_.
  std::vector<std::vector<int>> files_by_size_;

  // An index into files_by_size_ that specifies the first
  // file that is not yet compacted
  std::vector<int> next_file_to_compact_by_size_;

  // Only the first few entries of files_by_size_ are sorted.
  // There is no need to sort all the files because it is likely
  // that on a running system, we need to look at only the first
  // few largest files because a new version is created every few
  // seconds/minutes (because of concurrent compactions).
  static const int number_of_files_to_sort_ = 50;

  // Next file to compact based on seek stats.
  FileMetaData* file_to_compact_;
  int file_to_compact_level_;

  // Level that should be compacted next and its compaction score.
  // Score < 1 means compaction is not strictly needed. These fields
  // are initialized by Finalize().
  // The most critical level to be compacted is listed first
  // These are used to pick the best compaction level
  std::vector<double> compaction_score_;
  std::vector<int> compaction_level_;
  double max_compaction_score_;     // max score in l1 to ln-1
  int max_compaction_score_level_;  // level on which max score occurs

  // A version number that uniquely represents this version. This is
  // used for debugging and logging purposes only.
  uint64_t version_number_;

  // Only VersionSet (a friend) creates Versions.
  Version(ColumnFamilyData* cfd, VersionSet* vset, uint64_t version_number = 0);
  FileIndexer file_indexer_;

  ~Version();

  // re-initializes the index that is used to offset into files_by_size_
  // to find the next compaction candidate file.
  void ResetNextCompactionIndex(int level) {
    next_file_to_compact_by_size_[level] = 0;
  }

  // No copying allowed
  Version(const Version&);
  void operator=(const Version&);
};
|
||||
|
||||
// VersionSet owns the list of Versions for every column family, the MANIFEST
// file that persists version edits, and the file/sequence number counters.
class VersionSet {
 public:
  VersionSet(const std::string& dbname, const DBOptions* options,
             const EnvOptions& storage_options, Cache* table_cache);
  ~VersionSet();

  // Apply *edit to the current version to form a new descriptor that
  // is both saved to persistent state and installed as the new
  // current version. Will release *mu while actually writing to the file.
  // column_family_options has to be set if edit is column family add
  // REQUIRES: *mu is held on entry.
  // REQUIRES: no other thread concurrently calls LogAndApply()
  Status LogAndApply(ColumnFamilyData* column_family_data, VersionEdit* edit,
                     port::Mutex* mu, Directory* db_directory = nullptr,
                     bool new_descriptor_log = false,
                     const ColumnFamilyOptions* column_family_options =
                         nullptr);

  // Recover the last saved descriptor from persistent storage.
  // If read_only == true, Recover() will not complain if some column families
  // are not opened
  Status Recover(const std::vector<ColumnFamilyDescriptor>& column_families,
                 bool read_only = false);

  // Reads a manifest file and returns a list of column families in
  // column_families.
  static Status ListColumnFamilies(std::vector<std::string>* column_families,
                                   const std::string& dbname, Env* env);

#ifndef ROCKSDB_LITE
  // Try to reduce the number of levels. This call is valid when
  // only one level from the new max level to the old
  // max level containing files.
  // The call is static, since number of levels is immutable during
  // the lifetime of a RocksDB instance. It reduces number of levels
  // in a DB by applying changes to manifest.
  // For example, a db currently has 7 levels [0-6], and a call to
  // to reduce to 5 [0-4] can only be executed when only one level
  // among [4-6] contains files.
  static Status ReduceNumberOfLevels(const std::string& dbname,
                                     const Options* options,
                                     const EnvOptions& storage_options,
                                     int new_levels);

  // printf contents (for debugging)
  Status DumpManifest(Options& options, std::string& manifestFileName,
                      bool verbose, bool hex = false);

#endif  // ROCKSDB_LITE

  // Return the current manifest file number
  uint64_t ManifestFileNumber() const { return manifest_file_number_; }

  uint64_t PendingManifestFileNumber() const {
    return pending_manifest_file_number_;
  }

  // Allocate and return a new file number
  uint64_t NewFileNumber() { return next_file_number_++; }

  // Arrange to reuse "file_number" unless a newer file number has
  // already been allocated.
  // REQUIRES: "file_number" was returned by a call to NewFileNumber().
  void ReuseFileNumber(uint64_t file_number) {
    if (next_file_number_ == file_number + 1) {
      next_file_number_ = file_number;
    }
  }

  // Return the last sequence number.
  uint64_t LastSequence() const {
    return last_sequence_.load(std::memory_order_acquire);
  }

  // Set the last sequence number to s.
  void SetLastSequence(uint64_t s) {
    assert(s >= last_sequence_);
    last_sequence_.store(s, std::memory_order_release);
  }

  // Mark the specified file number as used.
  void MarkFileNumberUsed(uint64_t number);

  // Return the log file number for the log file that is currently
  // being compacted, or zero if there is no such log file.
  uint64_t PrevLogNumber() const { return prev_log_number_; }

  // Returns the minimum log number such that all
  // log numbers less than or equal to it can be deleted
  uint64_t MinLogNumber() const {
    uint64_t min_log_num = std::numeric_limits<uint64_t>::max();
    for (auto cfd : *column_family_set_) {
      if (min_log_num > cfd->GetLogNumber()) {
        min_log_num = cfd->GetLogNumber();
      }
    }
    return min_log_num;
  }

  // Create an iterator that reads over the compaction inputs for "*c".
  // The caller should delete the iterator when no longer needed.
  Iterator* MakeInputIterator(Compaction* c);

  // Add all files listed in any live version to *live.
  void AddLiveFiles(std::vector<uint64_t>* live_list);

  // Return the approximate offset in the database of the data for
  // "key" as of version "v".
  uint64_t ApproximateOffsetOf(Version* v, const InternalKey& key);

  // Return the size of the current manifest file
  uint64_t ManifestFileSize() const { return manifest_file_size_; }

  // verify that the files that we started with for a compaction
  // still exist in the current version and in the same original level.
  // This ensures that a concurrent compaction did not erroneously
  // pick the same files to compact.
  bool VerifyCompactionFileConsistency(Compaction* c);

  Status GetMetadataForFile(uint64_t number, int* filelevel,
                            FileMetaData** metadata, ColumnFamilyData** cfd);

  void GetLiveFilesMetaData(
      std::vector<LiveFileMetaData> *metadata);

  void GetObsoleteFiles(std::vector<FileMetaData*>* files);

  ColumnFamilySet* GetColumnFamilySet() { return column_family_set_.get(); }

 private:
  class Builder;
  struct ManifestWriter;

  friend class Version;

  // Forwards log-read corruption into a Status the caller owns; keeps only
  // the first error seen.
  struct LogReporter : public log::Reader::Reporter {
    Status* status;
    virtual void Corruption(size_t bytes, const Status& s) {
      if (this->status->ok()) *this->status = s;
    }
  };

  // Save current contents to *log
  Status WriteSnapshot(log::Writer* log);

  void AppendVersion(ColumnFamilyData* column_family_data, Version* v);

  bool ManifestContains(uint64_t manifest_file_number,
                        const std::string& record) const;

  ColumnFamilyData* CreateColumnFamily(const ColumnFamilyOptions& options,
                                       VersionEdit* edit);

  std::unique_ptr<ColumnFamilySet> column_family_set_;

  Env* const env_;
  const std::string dbname_;
  const DBOptions* const options_;
  uint64_t next_file_number_;
  uint64_t manifest_file_number_;
  uint64_t pending_manifest_file_number_;
  std::atomic<uint64_t> last_sequence_;
  uint64_t prev_log_number_;  // 0 or backing store for memtable being compacted

  // Opened lazily
  unique_ptr<log::Writer> descriptor_log_;

  // generates a increasing version number for every new version
  uint64_t current_version_number_;

  // Queue of writers to the manifest file
  std::deque<ManifestWriter*> manifest_writers_;

  // Current size of manifest file
  uint64_t manifest_file_size_;

  std::vector<FileMetaData*> obsolete_files_;

  // storage options for all reads and writes except compactions
  const EnvOptions& storage_options_;

  // storage options used for compactions. This is a copy of
  // storage_options_ but with readaheads set to readahead_compactions_.
  const EnvOptions storage_options_compactions_;

  // No copying allowed
  VersionSet(const VersionSet&);
  void operator=(const VersionSet&);

  void LogAndApplyCFHelper(VersionEdit* edit);
  void LogAndApplyHelper(ColumnFamilyData* cfd, Builder* b, Version* v,
                         VersionEdit* edit, port::Mutex* mu);
};
|
||||
|
||||
} // namespace rocksdb
|
||||
184
db/version_set_test.cc
Normal file
184
db/version_set_test.cc
Normal file
@@ -0,0 +1,184 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include "db/version_set.h"
|
||||
#include "util/logging.h"
|
||||
#include "util/testharness.h"
|
||||
#include "util/testutil.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
class FindFileTest {
|
||||
public:
|
||||
std::vector<FileMetaData*> files_;
|
||||
bool disjoint_sorted_files_;
|
||||
|
||||
FindFileTest() : disjoint_sorted_files_(true) { }
|
||||
|
||||
~FindFileTest() {
|
||||
for (unsigned int i = 0; i < files_.size(); i++) {
|
||||
delete files_[i];
|
||||
}
|
||||
}
|
||||
|
||||
void Add(const char* smallest, const char* largest,
|
||||
SequenceNumber smallest_seq = 100,
|
||||
SequenceNumber largest_seq = 100) {
|
||||
FileMetaData* f = new FileMetaData;
|
||||
f->number = files_.size() + 1;
|
||||
f->smallest = InternalKey(smallest, smallest_seq, kTypeValue);
|
||||
f->largest = InternalKey(largest, largest_seq, kTypeValue);
|
||||
files_.push_back(f);
|
||||
}
|
||||
|
||||
int Find(const char* key) {
|
||||
InternalKey target(key, 100, kTypeValue);
|
||||
InternalKeyComparator cmp(BytewiseComparator());
|
||||
return FindFile(cmp, files_, target.Encode());
|
||||
}
|
||||
|
||||
bool Overlaps(const char* smallest, const char* largest) {
|
||||
InternalKeyComparator cmp(BytewiseComparator());
|
||||
Slice s(smallest != nullptr ? smallest : "");
|
||||
Slice l(largest != nullptr ? largest : "");
|
||||
return SomeFileOverlapsRange(cmp, disjoint_sorted_files_, files_,
|
||||
(smallest != nullptr ? &s : nullptr),
|
||||
(largest != nullptr ? &l : nullptr));
|
||||
}
|
||||
};
|
||||
|
||||
TEST(FindFileTest, Empty) {
  // With no files registered, every lookup lands at index 0 and no range
  // can overlap anything.
  ASSERT_EQ(0, Find("foo"));
  const char* bounds[][2] = {
      {"a", "z"}, {nullptr, "z"}, {"a", nullptr}, {nullptr, nullptr}};
  for (auto& b : bounds) {
    ASSERT_TRUE(!Overlaps(b[0], b[1]));
  }
}
|
||||
|
||||
TEST(FindFileTest, Single) {
  Add("p", "q");

  // Keys at or below "q" map to file 0; anything past "q" falls off the end.
  struct { const char* key; int expected; } finds[] = {
      {"a", 0}, {"p", 0}, {"p1", 0}, {"q", 0}, {"q1", 1}, {"z", 1}};
  for (auto& f : finds) {
    ASSERT_EQ(f.expected, Find(f.key));
  }

  // Ranges strictly outside ["p","q"] must not overlap.
  ASSERT_TRUE(!Overlaps("a", "b"));
  ASSERT_TRUE(!Overlaps("z1", "z2"));

  // Any range touching ["p","q"] must overlap.
  const char* hits[][2] = {
      {"a", "p"}, {"a", "q"}, {"a", "z"}, {"p", "p1"}, {"p", "q"},
      {"p", "z"}, {"p1", "p2"}, {"p1", "z"}, {"q", "q"}, {"q", "q1"}};
  for (auto& h : hits) {
    ASSERT_TRUE(Overlaps(h[0], h[1]));
  }

  // Open-ended (nullptr) boundaries.
  ASSERT_TRUE(!Overlaps(nullptr, "j"));
  ASSERT_TRUE(!Overlaps("r", nullptr));
  ASSERT_TRUE(Overlaps(nullptr, "p"));
  ASSERT_TRUE(Overlaps(nullptr, "p1"));
  ASSERT_TRUE(Overlaps("q", nullptr));
  ASSERT_TRUE(Overlaps(nullptr, nullptr));
}
|
||||
|
||||
|
||||
TEST(FindFileTest, Multiple) {
  Add("150", "200");
  Add("200", "250");
  Add("300", "350");
  Add("400", "450");

  // FindFile picks the first file whose largest key is >= the probe.
  struct { const char* key; int expected; } finds[] = {
      {"100", 0}, {"150", 0}, {"151", 0}, {"199", 0}, {"200", 0},
      {"201", 1}, {"249", 1}, {"250", 1},
      {"251", 2}, {"299", 2}, {"300", 2}, {"349", 2}, {"350", 2},
      {"351", 3}, {"400", 3}, {"450", 3},
      {"451", 4}};
  for (auto& f : finds) {
    ASSERT_EQ(f.expected, Find(f.key));
  }

  // Ranges that fall into the gaps between files.
  const char* misses[][2] = {
      {"100", "149"}, {"251", "299"}, {"451", "500"}, {"351", "399"}};
  for (auto& m : misses) {
    ASSERT_TRUE(!Overlaps(m[0], m[1]));
  }

  // Ranges that touch at least one file.
  const char* hits[][2] = {
      {"100", "150"}, {"100", "200"}, {"100", "300"}, {"100", "400"},
      {"100", "500"}, {"375", "400"}, {"450", "450"}, {"450", "500"}};
  for (auto& h : hits) {
    ASSERT_TRUE(Overlaps(h[0], h[1]));
  }
}
|
||||
|
||||
TEST(FindFileTest, MultipleNullBoundaries) {
  Add("150", "200");
  Add("200", "250");
  Add("300", "350");
  Add("400", "450");

  // Open-ended ranges that still miss everything.
  ASSERT_TRUE(!Overlaps(nullptr, "149"));
  ASSERT_TRUE(!Overlaps("451", nullptr));
  // Fully unbounded range always overlaps.
  ASSERT_TRUE(Overlaps(nullptr, nullptr));

  // (-inf, hi] ranges that reach the first file or beyond.
  const char* upper_bounds[] = {"150", "199", "200", "201", "400", "800"};
  for (const char* hi : upper_bounds) {
    ASSERT_TRUE(Overlaps(nullptr, hi));
  }

  // [lo, +inf) ranges that reach the last file or earlier.
  const char* lower_bounds[] = {"100", "200", "449", "450"};
  for (const char* lo : lower_bounds) {
    ASSERT_TRUE(Overlaps(lo, nullptr));
  }
}
|
||||
|
||||
TEST(FindFileTest, OverlapSequenceChecks) {
  // A single file whose smallest and largest bounds share the user key
  // "200" but carry different sequence numbers.
  Add("200", "200", 5000, 3000);

  const char* misses[][2] = {{"199", "199"}, {"201", "300"}};
  for (auto& m : misses) {
    ASSERT_TRUE(!Overlaps(m[0], m[1]));
  }

  const char* hits[][2] = {{"200", "200"}, {"190", "200"}, {"200", "210"}};
  for (auto& h : hits) {
    ASSERT_TRUE(Overlaps(h[0], h[1]));
  }
}
|
||||
|
||||
TEST(FindFileTest, OverlappingFiles) {
  Add("150", "600");
  Add("400", "500");
  // Files intentionally overlap, so the disjoint-sorted fast path is off.
  disjoint_sorted_files_ = false;

  const char* misses[][2] = {{"100", "149"}, {"601", "700"}};
  for (auto& m : misses) {
    ASSERT_TRUE(!Overlaps(m[0], m[1]));
  }

  const char* hits[][2] = {
      {"100", "150"}, {"100", "200"}, {"100", "300"}, {"100", "400"},
      {"100", "500"}, {"375", "400"}, {"450", "450"}, {"450", "500"},
      {"450", "700"}, {"600", "700"}};
  for (auto& h : hits) {
    ASSERT_TRUE(Overlaps(h[0], h[1]));
  }
}
|
||||
|
||||
} // namespace rocksdb
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
return rocksdb::test::RunAllTests();
|
||||
}
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user