Squashed 'src/rocksdb/' content from commit 457bae6

git-subtree-dir: src/rocksdb
git-subtree-split: 457bae6911
This commit is contained in:
Vinnie Falco
2014-06-04 16:10:38 -07:00
commit 8514b88974
379 changed files with 108179 additions and 0 deletions

10
.arcconfig Normal file
View File

@@ -0,0 +1,10 @@
{
"project_id" : "rocksdb",
"conduit_uri" : "https://reviews.facebook.net/",
"copyright_holder" : "Facebook",
"load" : [
"linters"
],
"lint.engine" : "FacebookFbcodeLintEngine",
"lint.engine.single.linter" : "FbcodeCppLinter"
}

5
.clang-format Normal file
View File

@@ -0,0 +1,5 @@
# Complete list of style options can be found at:
# http://clang.llvm.org/docs/ClangFormatStyleOptions.html
---
BasedOnStyle: Google
...

34
.gitignore vendored Normal file
View File

@@ -0,0 +1,34 @@
TARGETS
build_config.mk
*.a
*.arc
*.d
*.dylib*
*.gcda
*.gcno
*.o
*.so
*.so.*
*_test
*_bench
*_stress
*.out
*.class
*.jar
*.*jnilib*
*.d-e
*.o-*
*.swp
ldb
manifest_dump
sst_dump
util/build_version.cc
build_tools/VALGRIND_LOGS/
coverage/COVERAGE_REPORT
.gdbhistory
.phutil_module_cache
tags
java/*.log
java/include/org_rocksdb_*.h

20
.travis.yml Normal file
View File

@@ -0,0 +1,20 @@
language: cpp
compiler: gcc
before_install:
# As of this writing (10 May 2014) the Travis build environment is Ubuntu 12.04,
# which needs the following ugly dependency incantations to build RocksDB:
- sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test
- sudo apt-get update -qq
- sudo apt-get install -y -qq gcc-4.8 g++-4.8 zlib1g-dev libbz2-dev libsnappy-dev libjemalloc-dev
- sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-4.8 50
- sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-4.8 50
- wget https://gflags.googlecode.com/files/libgflags0_2.0-1_amd64.deb
- sudo dpkg -i libgflags0_2.0-1_amd64.deb
- wget https://gflags.googlecode.com/files/libgflags-dev_2.0-1_amd64.deb
- sudo dpkg -i libgflags-dev_2.0-1_amd64.deb
# Lousy hack to disable use and testing of fallocate, which doesn't behave quite
# as EnvPosixTest::AllocateTest expects within the Travis OpenVZ environment.
- sed -i "s/fallocate(/HACK_NO_fallocate(/" build_tools/build_detect_platform
script: make check -j8
notifications:
email: false

20
CONTRIBUTING.md Normal file
View File

@@ -0,0 +1,20 @@
# Contributing to RocksDB
## Contributor License Agreement ("CLA")
In order to accept your pull request, we need you to submit a CLA. You
only need to do this once, so if you've done this for another Facebook
open source project, you're good to go. If you are submitting a pull
request for the first time, just let us know that you have completed
the CLA and we can cross-check with your GitHub username.
Complete your CLA here: <https://code.facebook.com/cla>
If you don't have a Facebook account, we can send you a PDF that you can
sign offline. Send us an e-mail or create a new github issue to
request the CLA in PDF format.
## License
By contributing to RocksDB, you agree that your contributions will be
licensed under the [BSD License](LICENSE).

89
HISTORY.md Normal file
View File

@@ -0,0 +1,89 @@
# Rocksdb Change Log
## 3.1.0 (05/21/2014)
### Public API changes
* Replaced ColumnFamilyOptions::table_properties_collectors with ColumnFamilyOptions::table_properties_collector_factories
### New Features
* Hash index for block-based table will be materialized and reconstructed more efficiently. Previously hash index is constructed by scanning the whole table during every table open.
* FIFO compaction style
## 3.0.0 (05/05/2014)
### Public API changes
* Added _LEVEL to all InfoLogLevel enums
* Deprecated ReadOptions.prefix and ReadOptions.prefix_seek. Seek() defaults to prefix-based seek when Options.prefix_extractor is supplied. More detail is documented in https://github.com/facebook/rocksdb/wiki/Prefix-Seek-API-Changes
* MemTableRepFactory::CreateMemTableRep() takes info logger as an extra parameter.
### New Features
* Column family support
* Added an option to use different checksum functions in BlockBasedTableOptions
* Added ApplyToAllCacheEntries() function to Cache
## 2.8.0 (04/04/2014)
* Removed arena.h from public header files.
* By default, checksums are verified on every read from database
* Change default value of several options, including: paranoid_checks=true, max_open_files=5000, level0_slowdown_writes_trigger=20, level0_stop_writes_trigger=24, disable_seek_compaction=true, max_background_flushes=1 and allow_mmap_writes=false
* Added is_manual_compaction to CompactionFilter::Context
* Added "virtual void WaitForJoin()" in class Env. Default operation is no-op.
* Removed BackupEngine::DeleteBackupsNewerThan() function
* Added new option -- verify_checksums_in_compaction
* Changed Options.prefix_extractor from raw pointer to shared_ptr (take ownership)
Changed HashSkipListRepFactory and HashLinkListRepFactory constructor to not take SliceTransform object (use Options.prefix_extractor implicitly)
* Added Env::GetThreadPoolQueueLen(), which returns the waiting queue length of thread pools
* Added a command "checkconsistency" in ldb tool, which checks
if file system state matches DB state (file existence and file sizes)
* Separate options related to block based table to a new struct BlockBasedTableOptions.
* WriteBatch has a new function Count() to return total size in the batch, and Data() now returns a reference instead of a copy
* Add more counters to perf context.
* Supports several more DB properties: compaction-pending, background-errors and cur-size-active-mem-table.
### New Features
* If we find one truncated record at the end of the MANIFEST or WAL files,
we will ignore it. We assume that writers of these records were interrupted
and that we can safely ignore it.
* A new SST format "PlainTable" is added, which is optimized for memory-only workloads. It can be created through NewPlainTableFactory() or NewTotalOrderPlainTableFactory().
* A new mem table implementation hash linked list optimizing for the case that there are only few keys for each prefix, which can be created through NewHashLinkListRepFactory().
* Merge operator supports a new function PartialMergeMulti() to allow users to do partial merges against multiple operands.
* Now compaction filter has a V2 interface. It buffers the kv-pairs sharing the same key prefix, process them in batches, and return the batched results back to DB. The new interface uses a new structure CompactionFilterContext for the same purpose as CompactionFilter::Context in V1.
* Geo-spatial support for locations and radial-search.
## 2.7.0 (01/28/2014)
### Public API changes
* Renamed `StackableDB::GetRawDB()` to `StackableDB::GetBaseDB()`.
* Renamed `WriteBatch::Data()` `const std::string& Data() const`.
* Renamed class `TableStats` to `TableProperties`.
* Deleted class `PrefixHashRepFactory`. Please use `NewHashSkipListRepFactory()` instead.
* Supported multi-threaded `EnableFileDeletions()` and `DisableFileDeletions()`.
* Added `DB::GetOptions()`.
* Added `DB::GetDbIdentity()`.
### New Features
* Added [BackupableDB](https://github.com/facebook/rocksdb/wiki/How-to-backup-RocksDB%3F)
* Implemented [TailingIterator](https://github.com/facebook/rocksdb/wiki/Tailing-Iterator), a special type of iterator that
doesn't create a snapshot (can be used to read newly inserted data)
and is optimized for doing sequential reads.
* Added property block for table, which allows (1) a table to store
its metadata and (2) end user to collect and store properties they
are interested in.
* Enabled caching index and filter block in block cache (turned off by default).
* Supported error report when doing manual compaction.
* Supported additional Linux platform flavors and Mac OS.
* Put with `SliceParts` - Variant of `Put()` that gathers output like `writev(2)`
* Bug fixes and code refactor for compatibility with upcoming Column
Family feature.
### Performance Improvements
* Huge benchmark performance improvements by multiple efforts. For example, increase in readonly QPS from about 530k in 2.6 release to 1.1 million in 2.7 [1]
* Speeding up a way RocksDB deleted obsolete files - no longer listing the whole directory under a lock -- decrease in p99
* Use raw pointer instead of shared pointer for statistics: [5b825d](https://github.com/facebook/rocksdb/commit/5b825d6964e26ec3b4bb6faa708ebb1787f1d7bd) -- huge increase in performance -- shared pointers are slow
* Optimized locking for `Get()` -- [1fdb3f](https://github.com/facebook/rocksdb/commit/1fdb3f7dc60e96394e3e5b69a46ede5d67fb976c) -- 1.5x QPS increase for some workloads
* Cache speedup - [e8d40c3](https://github.com/facebook/rocksdb/commit/e8d40c31b3cca0c3e1ae9abe9b9003b1288026a9)
* Implemented autovector, which allocates first N elements on stack. Most of vectors in RocksDB are small. Also, we never want to allocate heap objects while holding a mutex. -- [c01676e4](https://github.com/facebook/rocksdb/commit/c01676e46d3be08c3c140361ef1f5884f47d3b3c)
* Lots of efforts to move malloc, memcpy and IO outside of locks

84
INSTALL.md Normal file
View File

@@ -0,0 +1,84 @@
## Compilation
RocksDB's library should be able to compile without any dependency installed,
although we recommend installing some compression libraries (see below).
We do depend on newer gcc with C++11 support.
There are few options when compiling RocksDB:
* [recommended] `make static_lib` will compile librocksdb.a, RocksDB static library.
* `make shared_lib` will compile librocksdb.so, RocksDB shared library.
* `make check` will compile and run all the unit tests
* `make all` will compile our static library, and all our tools and unit tests. Our tools
depend on gflags. You will need to have gflags installed to run `make all`.
## Dependencies
* You can link RocksDB with following compression libraries:
- [zlib](http://www.zlib.net/) - a library for data compression.
- [bzip2](http://www.bzip.org/) - a library for data compression.
- [snappy](https://code.google.com/p/snappy/) - a library for fast
data compression.
* All our tools depend on:
- [gflags](https://code.google.com/p/gflags/) - a library that handles
command line flags processing. You can compile rocksdb library even
if you don't have gflags installed.
## Supported platforms
* **Linux - Ubuntu**
* Upgrade your gcc to version at least 4.7 to get C++11 support.
* Install gflags. First, try: `sudo apt-get install libgflags-dev`
If this doesn't work and you're using Ubuntu, here's a nice tutorial:
(http://askubuntu.com/questions/312173/installing-gflags-12-04)
* Install snappy. This is usually as easy as:
`sudo apt-get install libsnappy-dev`.
* Install zlib. Try: `sudo apt-get install zlib1g-dev`.
* Install bzip2: `sudo apt-get install libbz2-dev`.
* **Linux - CentOS**
* Upgrade your gcc to version at least 4.7 to get C++11 support:
`yum install gcc47-c++`
* Install gflags:
wget https://gflags.googlecode.com/files/gflags-2.0-no-svn-files.tar.gz
tar -xzvf gflags-2.0-no-svn-files.tar.gz
cd gflags-2.0
./configure && make && sudo make install
* Install snappy:
wget https://snappy.googlecode.com/files/snappy-1.1.1.tar.gz
tar -xzvf snappy-1.1.1.tar.gz
cd snappy-1.1.1
./configure && make && sudo make install
* Install zlib:
sudo yum install zlib
sudo yum install zlib-devel
* Install bzip2:
sudo yum install bzip2
sudo yum install bzip2-devel
* **OS X**:
* Install latest C++ compiler that supports C++ 11:
* Update XCode: run `xcode-select --install` (or install it from XCode App's settting).
* Install via [homebrew](http://brew.sh/).
* If you're first time developer in MacOS, you still need to run: `xcode-select --install` in your command line.
* run `brew tap homebrew/dupes; brew install gcc47 --use-llvm` to install gcc 4.7 (or higher).
* Install zlib, bzip2 and snappy libraries for compression.
* Install gflags. We have included a script
`build_tools/mac-install-gflags.sh`, which should automatically install it.
If you installed gflags by other means (for example, `brew install gflags`),
please set `LIBRARY_PATH` and `CPATH` accordingly.
* Please note that some of the optimizations/features are disabled in OSX.
We did not run any production workloads on it.
* **iOS**:
* Run: `TARGET_OS=IOS make static_lib`

BIN
LICENSE Normal file

Binary file not shown.

523
Makefile Normal file
View File

@@ -0,0 +1,523 @@
# Copyright (c) 2011 The LevelDB Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file. See the AUTHORS file for names of contributors.
# Inherit some settings from environment variables, if available
INSTALL_PATH ?= $(CURDIR)
#-----------------------------------------------
ifneq ($(MAKECMDGOALS),dbg)
OPT += -O2 -fno-omit-frame-pointer -momit-leaf-frame-pointer
else
# intentionally left blank
endif
ifeq ($(MAKECMDGOALS),shared_lib)
OPT += -DNDEBUG
endif
ifeq ($(MAKECMDGOALS),static_lib)
OPT += -DNDEBUG
endif
#-----------------------------------------------
# detect what platform we're building on
$(shell (export ROCKSDB_ROOT="$(CURDIR)"; "$(CURDIR)/build_tools/build_detect_platform" "$(CURDIR)/build_config.mk"))
# this file is generated by the previous line to set build flags and sources
include build_config.mk
ifneq ($(PLATFORM), IOS)
CFLAGS += -g
CXXFLAGS += -g
else
# no debug info for IOS, that will make our library big
OPT += -DNDEBUG
endif
# ASAN doesn't work well with jemalloc. If we're compiling with ASAN, we should use regular malloc.
ifdef COMPILE_WITH_ASAN
# ASAN compile flags
EXEC_LDFLAGS += -fsanitize=address
PLATFORM_CCFLAGS += -fsanitize=address
PLATFORM_CXXFLAGS += -fsanitize=address
else
# if we're not compiling with ASAN, use jemalloc
EXEC_LDFLAGS := $(JEMALLOC_LIB) $(EXEC_LDFLAGS)
PLATFORM_CXXFLAGS += $(JEMALLOC_INCLUDE) -DHAVE_JEMALLOC
PLATFORM_CCFLAGS += $(JEMALLOC_INCLUDE) -DHAVE_JEMALLOC
endif
WARNING_FLAGS = -Wall -Werror
CFLAGS += $(WARNING_FLAGS) -I. -I./include $(PLATFORM_CCFLAGS) $(OPT)
CXXFLAGS += $(WARNING_FLAGS) -I. -I./include $(PLATFORM_CXXFLAGS) $(OPT) -Woverloaded-virtual
LDFLAGS += $(PLATFORM_LDFLAGS)
LIBOBJECTS = $(SOURCES:.cc=.o)
LIBOBJECTS += $(SOURCESCPP:.cpp=.o)
MEMENVOBJECTS = $(MEMENV_SOURCES:.cc=.o)
TESTUTIL = ./util/testutil.o
TESTHARNESS = ./util/testharness.o $(TESTUTIL)
BENCHHARNESS = ./util/benchharness.o
VALGRIND_ERROR = 2
VALGRIND_DIR = build_tools/VALGRIND_LOGS
VALGRIND_VER := $(join $(VALGRIND_VER),valgrind)
VALGRIND_OPTS = --error-exitcode=$(VALGRIND_ERROR) --leak-check=full
TESTS = \
db_test \
block_hash_index_test \
autovector_test \
column_family_test \
table_properties_collector_test \
arena_test \
auto_roll_logger_test \
benchharness_test \
block_test \
bloom_test \
dynamic_bloom_test \
c_test \
cache_test \
coding_test \
corruption_test \
crc32c_test \
dbformat_test \
env_test \
blob_store_test \
filelock_test \
filename_test \
filter_block_test \
histogram_test \
log_test \
manual_compaction_test \
memenv_test \
merge_test \
redis_test \
reduce_levels_test \
plain_table_db_test \
prefix_test \
simple_table_db_test \
skiplist_test \
stringappend_test \
ttl_test \
backupable_db_test \
version_edit_test \
version_set_test \
file_indexer_test \
write_batch_test\
deletefile_test \
table_test \
thread_local_test \
geodb_test
TOOLS = \
sst_dump \
db_sanity_test \
db_stress \
ldb \
db_repl_stress \
blob_store_bench
PROGRAMS = db_bench signal_test table_reader_bench log_and_apply_bench $(TOOLS)
# The library name is configurable since we are maintaining libraries of both
# debug/release mode.
ifeq ($(LIBNAME),)
LIBNAME=librocksdb
endif
LIBRARY = ${LIBNAME}.a
MEMENVLIBRARY = libmemenv.a
default: all
#-----------------------------------------------
# Create platform independent shared libraries.
#-----------------------------------------------
ifneq ($(PLATFORM_SHARED_EXT),)
ifneq ($(PLATFORM_SHARED_VERSIONED),true)
SHARED1 = ${LIBNAME}.$(PLATFORM_SHARED_EXT)
SHARED2 = $(SHARED1)
SHARED3 = $(SHARED1)
SHARED = $(SHARED1)
else
# Update db.h if you change these.
SHARED_MAJOR = 3
SHARED_MINOR = 2
SHARED1 = ${LIBNAME}.$(PLATFORM_SHARED_EXT)
SHARED2 = $(SHARED1).$(SHARED_MAJOR)
SHARED3 = $(SHARED1).$(SHARED_MAJOR).$(SHARED_MINOR)
SHARED = $(SHARED1) $(SHARED2) $(SHARED3)
$(SHARED1): $(SHARED3)
ln -fs $(SHARED3) $(SHARED1)
$(SHARED2): $(SHARED3)
ln -fs $(SHARED3) $(SHARED2)
endif
$(SHARED3):
$(CXX) $(PLATFORM_SHARED_LDFLAGS)$(SHARED2) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) $(SOURCES) $(LDFLAGS) -o $@
endif # PLATFORM_SHARED_EXT
.PHONY: blackbox_crash_test check clean coverage crash_test ldb_tests \
release tags valgrind_check whitebox_crash_test format static_lib shared_lib all \
dbg
all: $(LIBRARY) $(PROGRAMS) $(TESTS)
static_lib: $(LIBRARY)
shared_lib: $(SHARED)
dbg: $(LIBRARY) $(PROGRAMS) $(TESTS)
# creates static library and programs
release:
$(MAKE) clean
OPT="-DNDEBUG -O2" $(MAKE) static_lib $(PROGRAMS) -j32
coverage:
$(MAKE) clean
COVERAGEFLAGS="-fprofile-arcs -ftest-coverage" LDFLAGS+="-lgcov" $(MAKE) all check -j32
(cd coverage; ./coverage_test.sh)
# Delete intermediate files
find . -type f -regex ".*\.\(\(gcda\)\|\(gcno\)\)" -exec rm {} \;
check: $(TESTS) ldb
for t in $(TESTS); do echo "***** Running $$t"; ./$$t || exit 1; done
python tools/ldb_test.py
ldb_tests: ldb
python tools/ldb_test.py
crash_test: whitebox_crash_test blackbox_crash_test
blackbox_crash_test: db_stress
python -u tools/db_crashtest.py
whitebox_crash_test: db_stress
python -u tools/db_crashtest2.py
asan_check:
$(MAKE) clean
COMPILE_WITH_ASAN=1 $(MAKE) check -j32
$(MAKE) clean
asan_crash_test:
$(MAKE) clean
COMPILE_WITH_ASAN=1 $(MAKE) crash_test
$(MAKE) clean
valgrind_check: all $(PROGRAMS) $(TESTS)
mkdir -p $(VALGRIND_DIR)
echo TESTS THAT HAVE VALGRIND ERRORS > $(VALGRIND_DIR)/valgrind_failed_tests; \
echo TIMES in seconds TAKEN BY TESTS ON VALGRIND > $(VALGRIND_DIR)/valgrind_tests_times; \
for t in $(filter-out skiplist_test,$(TESTS)); do \
stime=`date '+%s'`; \
$(VALGRIND_VER) $(VALGRIND_OPTS) ./$$t; \
if [ $$? -eq $(VALGRIND_ERROR) ] ; then \
echo $$t >> $(VALGRIND_DIR)/valgrind_failed_tests; \
fi; \
etime=`date '+%s'`; \
echo $$t $$((etime - stime)) >> $(VALGRIND_DIR)/valgrind_tests_times; \
done
clean:
-rm -f $(PROGRAMS) $(TESTS) $(LIBRARY) $(SHARED) $(MEMENVLIBRARY) build_config.mk
-rm -rf ios-x86/* ios-arm/*
-find . -name "*.[od]" -exec rm {} \;
-find . -type f -regex ".*\.\(\(gcda\)\|\(gcno\)\)" -exec rm {} \;
tags:
ctags * -R
cscope -b `find . -name '*.cc'` `find . -name '*.h'`
format:
build_tools/format-diff.sh
# ---------------------------------------------------------------------------
# Unit tests and tools
# ---------------------------------------------------------------------------
$(LIBRARY): $(LIBOBJECTS)
rm -f $@
$(AR) -rs $@ $(LIBOBJECTS)
db_bench: db/db_bench.o $(LIBOBJECTS) $(TESTUTIL)
$(CXX) db/db_bench.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
block_hash_index_test: table/block_hash_index_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) table/block_hash_index_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
db_stress: tools/db_stress.o $(LIBOBJECTS) $(TESTUTIL)
$(CXX) tools/db_stress.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
db_sanity_test: tools/db_sanity_test.o $(LIBOBJECTS) $(TESTUTIL)
$(CXX) tools/db_sanity_test.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
db_repl_stress: tools/db_repl_stress.o $(LIBOBJECTS) $(TESTUTIL)
$(CXX) tools/db_repl_stress.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
blob_store_bench: tools/blob_store_bench.o $(LIBOBJECTS) $(TESTUTIL)
$(CXX) tools/blob_store_bench.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
signal_test: util/signal_test.o $(LIBOBJECTS)
$(CXX) util/signal_test.o $(LIBOBJECTS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
arena_test: util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
autovector_test: util/autovector_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) util/autovector_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
column_family_test: db/column_family_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) db/column_family_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
table_properties_collector_test: db/table_properties_collector_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) db/table_properties_collector_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
bloom_test: util/bloom_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) util/bloom_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
dynamic_bloom_test: util/dynamic_bloom_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) util/dynamic_bloom_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
c_test: db/c_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) db/c_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
cache_test: util/cache_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) util/cache_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
coding_test: util/coding_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) util/coding_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
blob_store_test: util/blob_store_test.o $(LIBOBJECTS) $(TESTHARNESS) $(TESTUTIL)
$(CXX) util/blob_store_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o$@ $(LDFLAGS) $(COVERAGEFLAGS)
stringappend_test: utilities/merge_operators/string_append/stringappend_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) utilities/merge_operators/string_append/stringappend_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
redis_test: utilities/redis/redis_lists_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) utilities/redis/redis_lists_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
benchharness_test: util/benchharness_test.o $(LIBOBJECTS) $(TESTHARNESS) $(BENCHHARNESS)
$(CXX) util/benchharness_test.o $(LIBOBJECTS) $(TESTHARNESS) $(BENCHHARNESS) $(EXEC_LDFLAGS) -o$@ $(LDFLAGS) $(COVERAGEFLAGS)
histogram_test: util/histogram_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) util/histogram_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o$@ $(LDFLAGS) $(COVERAGEFLAGS)
thread_local_test: util/thread_local_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) util/thread_local_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
corruption_test: db/corruption_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) db/corruption_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
crc32c_test: util/crc32c_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) util/crc32c_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
db_test: db/db_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) db/db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
log_write_bench: util/log_write_bench.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) util/log_write_bench.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) -pg
plain_table_db_test: db/plain_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) db/plain_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
simple_table_db_test: db/simple_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) db/simple_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
table_reader_bench: table/table_reader_bench.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) table/table_reader_bench.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) -pg
log_and_apply_bench: db/log_and_apply_bench.o $(LIBOBJECTS) $(TESTHARNESS) $(BENCHHARNESS)
$(CXX) db/log_and_apply_bench.o $(LIBOBJECTS) $(TESTHARNESS) $(BENCHHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) -pg
perf_context_test: db/perf_context_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) db/perf_context_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS)
prefix_test: db/prefix_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) db/prefix_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS)
backupable_db_test: utilities/backupable/backupable_db_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) utilities/backupable/backupable_db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
ttl_test: utilities/ttl/ttl_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) utilities/ttl/ttl_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
dbformat_test: db/dbformat_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) db/dbformat_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
env_test: util/env_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) util/env_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
filename_test: db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
filter_block_test: table/filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) table/filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
log_test: db/log_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) db/log_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
table_test: table/table_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) table/table_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
block_test: table/block_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) table/block_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
skiplist_test: db/skiplist_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) db/skiplist_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
version_edit_test: db/version_edit_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) db/version_edit_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
version_set_test: db/version_set_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) db/version_set_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
file_indexer_test : db/file_indexer_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) db/file_indexer_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
reduce_levels_test: tools/reduce_levels_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) tools/reduce_levels_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
write_batch_test: db/write_batch_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) db/write_batch_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
merge_test: db/merge_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) db/merge_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
deletefile_test: db/deletefile_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) db/deletefile_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS)
geodb_test: utilities/geodb/geodb_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) utilities/geodb/geodb_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
$(MEMENVLIBRARY) : $(MEMENVOBJECTS)
rm -f $@
$(AR) -rs $@ $(MEMENVOBJECTS)
memenv_test : helpers/memenv/memenv_test.o $(MEMENVOBJECTS) $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) helpers/memenv/memenv_test.o $(MEMENVOBJECTS) $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
manual_compaction_test: util/manual_compaction_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) util/manual_compaction_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
rocksdb_shell: tools/shell/ShellContext.o tools/shell/ShellState.o tools/shell/LeveldbShell.o tools/shell/DBClientProxy.o tools/shell/ShellContext.h tools/shell/ShellState.h tools/shell/DBClientProxy.h $(LIBOBJECTS)
$(CXX) tools/shell/ShellContext.o tools/shell/ShellState.o tools/shell/LeveldbShell.o tools/shell/DBClientProxy.o $(LIBOBJECTS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
DBClientProxy_test: tools/shell/test/DBClientProxyTest.o tools/shell/DBClientProxy.o $(LIBRARY)
$(CXX) tools/shell/test/DBClientProxyTest.o tools/shell/DBClientProxy.o $(LIBRARY) $(EXEC_LDFLAGS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
filelock_test: util/filelock_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) util/filelock_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
auto_roll_logger_test: util/auto_roll_logger_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) util/auto_roll_logger_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
sst_dump: tools/sst_dump.o $(LIBOBJECTS)
$(CXX) tools/sst_dump.o $(LIBOBJECTS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
ldb: tools/ldb.o $(LIBOBJECTS)
$(CXX) tools/ldb.o $(LIBOBJECTS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
# ---------------------------------------------------------------------------
# Jni stuff
# ---------------------------------------------------------------------------
JNI_NATIVE_SOURCES = ./java/rocksjni/*.cc
JAVA_INCLUDE = -I$(JAVA_HOME)/include/ -I$(JAVA_HOME)/include/linux
ROCKSDBJNILIB = ./java/librocksdbjni.so
ifeq ($(PLATFORM), OS_MACOSX)
ROCKSDBJNILIB = ./java/librocksdbjni.jnilib
JAVA_INCLUDE = -I/System/Library/Frameworks/JavaVM.framework/Headers/
endif
rocksdbjava: clean
OPT="-fPIC -DNDEBUG -O2" $(MAKE) $(LIBRARY) -j32
cd java;$(MAKE) java;
rm -f $(ROCKSDBJNILIB)
$(CXX) $(CXXFLAGS) -I./java/. $(JAVA_INCLUDE) -shared -fPIC -o $(ROCKSDBJNILIB) $(JNI_NATIVE_SOURCES) $(LIBOBJECTS) $(LDFLAGS) $(COVERAGEFLAGS)
jclean:
cd java;$(MAKE) clean;
rm -f $(ROCKSDBJNILIB)
jtest:
cd java;$(MAKE) sample;$(MAKE) test;
jdb_bench:
cd java;$(MAKE) db_bench;
# ---------------------------------------------------------------------------
# Platform-specific compilation
# ---------------------------------------------------------------------------
ifeq ($(PLATFORM), IOS)
# For iOS, create universal object files to be used on both the simulator and
# a device.
PLATFORMSROOT=/Applications/Xcode.app/Contents/Developer/Platforms
SIMULATORROOT=$(PLATFORMSROOT)/iPhoneSimulator.platform/Developer
DEVICEROOT=$(PLATFORMSROOT)/iPhoneOS.platform/Developer
IOSVERSION=$(shell defaults read $(PLATFORMSROOT)/iPhoneOS.platform/version CFBundleShortVersionString)
.cc.o:
mkdir -p ios-x86/$(dir $@)
$(CXX) $(CXXFLAGS) -isysroot $(SIMULATORROOT)/SDKs/iPhoneSimulator$(IOSVERSION).sdk -arch i686 -arch x86_64 -c $< -o ios-x86/$@
mkdir -p ios-arm/$(dir $@)
xcrun -sdk iphoneos $(CXX) $(CXXFLAGS) -isysroot $(DEVICEROOT)/SDKs/iPhoneOS$(IOSVERSION).sdk -arch armv6 -arch armv7 -arch armv7s -arch arm64 -c $< -o ios-arm/$@
lipo ios-x86/$@ ios-arm/$@ -create -output $@
.c.o:
mkdir -p ios-x86/$(dir $@)
$(CC) $(CFLAGS) -isysroot $(SIMULATORROOT)/SDKs/iPhoneSimulator$(IOSVERSION).sdk -arch i686 -arch x86_64 -c $< -o ios-x86/$@
mkdir -p ios-arm/$(dir $@)
xcrun -sdk iphoneos $(CC) $(CFLAGS) -isysroot $(DEVICEROOT)/SDKs/iPhoneOS$(IOSVERSION).sdk -arch armv6 -arch armv7 -arch armv7s -arch arm64 -c $< -o ios-arm/$@
lipo ios-x86/$@ ios-arm/$@ -create -output $@
else
.cc.o:
$(CXX) $(CXXFLAGS) -c $< -o $@ $(COVERAGEFLAGS)
.c.o:
$(CC) $(CFLAGS) -c $< -o $@
endif
# ---------------------------------------------------------------------------
# Source files dependencies detection
# ---------------------------------------------------------------------------
# Add proper dependency support so changing a .h file forces a .cc file to
# rebuild.
# The .d file indicates .cc file's dependencies on .h files. We generate such
# dependency by g++'s -MM option, whose output is a make dependency rule.
# The sed command makes sure the "target" file in the generated .d file has
# the correct path prefix.
%.d: %.cc
$(CXX) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) -MM $< -o $@
ifeq ($(PLATFORM), OS_MACOSX)
@sed -i '' -e 's,.*:,$*.o:,' $@
else
@sed -i -e 's,.*:,$*.o:,' $@
endif
DEPFILES = $(filter-out util/build_version.d,$(SOURCES:.cc=.d))
depend: $(DEPFILES)
# if the make goal is either "clean" or "format", we shouldn't
# try to import the *.d files.
# TODO(kailiu) The unfamiliarity of Make's conditions leads to the ugly
# working solution.
ifneq ($(MAKECMDGOALS),clean)
ifneq ($(MAKECMDGOALS),format)
ifneq ($(MAKECMDGOALS),jclean)
ifneq ($(MAKECMDGOALS),jtest)
-include $(DEPFILES)
endif
endif
endif
endif

23
PATENTS Normal file
View File

@@ -0,0 +1,23 @@
Additional Grant of Patent Rights
“Software” means the rocksdb software distributed by Facebook, Inc.
Facebook hereby grants you a perpetual, worldwide, royalty-free,
non-exclusive, irrevocable (subject to the termination provision below)
license under any rights in any patent claims owned by Facebook, to make,
have made, use, sell, offer to sell, import, and otherwise transfer the
Software. For avoidance of doubt, no license is granted under Facebooks
rights in any patent claims that are infringed by (i) modifications to the
Software made by you or a third party, or (ii) the Software in combination
with any software or other technology provided by you or a third party.
The license granted hereunder will terminate, automatically and without
notice, for anyone that makes any claim (including by filing any lawsuit,
assertion or other action) alleging (a) direct, indirect, or contributory
infringement or inducement to infringe any patent: (i) by Facebook or any
of its subsidiaries or affiliates, whether or not such claim is related
to the Software, (ii) by any party if such claim arises in whole or in
part from any software, product or service of Facebook or any of its
subsidiaries or affiliates, whether or not such claim is related to the
Software, or (iii) by any party relating to the Software; or (b) that
any right in any patent claim of Facebook is invalid or unenforceable.

26
README.md Normal file
View File

@@ -0,0 +1,26 @@
## RocksDB: A Persistent Key-Value Store for Flash and RAM Storage
[![Build Status](https://travis-ci.org/facebook/rocksdb.svg?branch=master)](https://travis-ci.org/facebook/rocksdb)
RocksDB is developed and maintained by Facebook Database Engineering Team.
It is built on on earlier work on LevelDB by Sanjay Ghemawat (sanjay@google.com)
and Jeff Dean (jeff@google.com)
This code is a library that forms the core building block for a fast
key value server, especially suited for storing data on flash drives.
It has an Log-Structured-Merge-Database (LSM) design with flexible tradeoffs
between Write-Amplification-Factor (WAF), Read-Amplification-Factor (RAF)
and Space-Amplification-Factor (SAF). It has multi-threaded compactions,
making it specially suitable for storing multiple terabytes of data in a
single database.
Start with example usage here: https://github.com/facebook/rocksdb/tree/master/examples
See [doc/index.html](https://github.com/facebook/rocksdb/blob/master/doc/index.html) and
[github wiki](https://github.com/facebook/rocksdb/wiki) for more explanation.
The public interface is in `include/`. Callers should not include or
rely on the details of any other header files in this package. Those
internal APIs may be changed without warning.
Design discussions are conducted in https://www.facebook.com/groups/rocksdb.dev/

20
ROCKSDB_LITE.md Normal file
View File

@@ -0,0 +1,20 @@
# RocksDBLite
RocksDBLite is a project focused on mobile use cases, which don't need a lot of fancy things we've built for server workloads and they are very sensitive to binary size. For that reason, we added a compile flag ROCKSDB_LITE that comments out a lot of the nonessential code and keeps the binary lean.
Some examples of the features disabled by ROCKSDB_LITE:
* compiled-in support for LDB tool
* No backupable DB
* No support for replication (which we provide in form of TrasactionalIterator)
* No advanced monitoring tools
* No special-purpose memtables that are highly optimized for specific use cases
When adding a new big feature to RocksDB, please add ROCKSDB_LITE compile guard if:
* Nobody from mobile really needs your feature,
* Your feature is adding a lot of weight to the binary.
Don't add ROCKSDB_LITE compile guard if:
* It would introduce a lot of code complexity. Compile guards make code harder to read. It's a trade-off.
* Your feature is not adding a lot of weight.
If unsure, ask. :)

320
build_tools/build_detect_platform Executable file
View File

@@ -0,0 +1,320 @@
#!/bin/sh
#
# Detects OS we're compiling on and outputs a file specified by the first
# argument, which in turn gets read while processing Makefile.
#
# The output will set the following variables:
# CC C Compiler path
# CXX C++ Compiler path
# PLATFORM_LDFLAGS Linker flags
# PLATFORM_SHARED_EXT Extension for shared libraries
# PLATFORM_SHARED_LDFLAGS Flags for building shared library
# PLATFORM_SHARED_CFLAGS Flags for compiling objects for shared library
# PLATFORM_CCFLAGS C compiler flags
# PLATFORM_CXXFLAGS C++ compiler flags. Will contain:
# PLATFORM_SHARED_VERSIONED Set to 'true' if platform supports versioned
# shared libraries, empty otherwise.
#
# The PLATFORM_CCFLAGS and PLATFORM_CXXFLAGS might include the following:
#
# -DLEVELDB_PLATFORM_POSIX if cstdatomic is present
# -DLEVELDB_PLATFORM_NOATOMIC if it is not
# -DSNAPPY if the Snappy library is present
# -DLZ4 if the LZ4 library is present
#
# Using gflags in rocksdb:
# Our project depends on gflags, which requires users to take some extra steps
# before they can compile the whole repository:
# 1. Install gflags. You may download it from here:
# https://code.google.com/p/gflags/
# 2. Once install, add the include path/lib path for gflags to CPATH and
# LIBRARY_PATH respectively. If installed with default mode, the
# lib and include path will be /usr/local/lib and /usr/local/include
# Mac user can do this by running build_tools/mac-install-gflags.sh
OUTPUT=$1
if test -z "$OUTPUT"; then
echo "usage: $0 <output-filename>" >&2
exit 1
fi
# we depend on C++11
PLATFORM_CXXFLAGS="-std=c++11"
# we currently depend on POSIX platform
COMMON_FLAGS="-DROCKSDB_PLATFORM_POSIX"
# Default to fbcode gcc on internal fb machines
if [ -d /mnt/gvfs/third-party -a -z "$CXX" ]; then
FBCODE_BUILD="true"
if [ -z "$USE_CLANG" ]; then
CENTOS_VERSION=`rpm -q --qf "%{VERSION}" \
$(rpm -q --whatprovides redhat-release)`
if [ "$CENTOS_VERSION" = "6" ]; then
source "$PWD/build_tools/fbcode.gcc481.sh"
else
source "$PWD/build_tools/fbcode.gcc471.sh"
fi
else
source "$PWD/build_tools/fbcode.clang31.sh"
fi
fi
# Delete existing output, if it exists
rm -f "$OUTPUT"
touch "$OUTPUT"
if test -z "$CC"; then
CC=cc
fi
if test -z "$CXX"; then
CXX=g++
fi
# Detect OS
if test -z "$TARGET_OS"; then
TARGET_OS=`uname -s`
fi
COMMON_FLAGS="$COMMON_FLAGS ${CFLAGS}"
CROSS_COMPILE=
PLATFORM_CCFLAGS=
PLATFORM_CXXFLAGS="$PLATFORM_CXXFLAGS ${CXXFLAGS}"
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS"
PLATFORM_SHARED_EXT="so"
PLATFORM_SHARED_LDFLAGS="-shared -Wl,-soname -Wl,"
PLATFORM_SHARED_CFLAGS="-fPIC"
PLATFORM_SHARED_VERSIONED=false
# generic port files (working on all platform by #ifdef) go directly in /port
GENERIC_PORT_FILES=`cd "$ROCKSDB_ROOT"; find port -name '*.cc' | tr "\n" " "`
# On GCC, we pick libc's memcmp over GCC's memcmp via -fno-builtin-memcmp
case "$TARGET_OS" in
Darwin)
PLATFORM=OS_MACOSX
COMMON_FLAGS="$COMMON_FLAGS -DOS_MACOSX"
PLATFORM_SHARED_EXT=dylib
PLATFORM_SHARED_LDFLAGS="-dynamiclib -install_name "
# PORT_FILES=port/darwin/darwin_specific.cc
;;
IOS)
PLATFORM=IOS
COMMON_FLAGS="$COMMON_FLAGS -DOS_MACOSX -DIOS_CROSS_COMPILE -DROCKSDB_LITE"
PLATFORM_SHARED_EXT=dylib
PLATFORM_SHARED_LDFLAGS="-dynamiclib -install_name "
CROSS_COMPILE=true
;;
Linux)
PLATFORM=OS_LINUX
COMMON_FLAGS="$COMMON_FLAGS -DOS_LINUX"
if [ -z "$USE_CLANG" ]; then
COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp"
fi
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread -lrt"
# PORT_FILES=port/linux/linux_specific.cc
;;
SunOS)
PLATFORM=OS_SOLARIS
COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_SOLARIS"
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread -lrt"
# PORT_FILES=port/sunos/sunos_specific.cc
;;
FreeBSD)
PLATFORM=OS_FREEBSD
COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_FREEBSD"
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread"
# PORT_FILES=port/freebsd/freebsd_specific.cc
;;
NetBSD)
PLATFORM=OS_NETBSD
COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_NETBSD"
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread -lgcc_s"
# PORT_FILES=port/netbsd/netbsd_specific.cc
;;
OpenBSD)
PLATFORM=OS_OPENBSD
COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_OPENBSD"
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -pthread"
# PORT_FILES=port/openbsd/openbsd_specific.cc
;;
DragonFly)
PLATFORM=OS_DRAGONFLYBSD
COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_DRAGONFLYBSD"
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread"
# PORT_FILES=port/dragonfly/dragonfly_specific.cc
;;
OS_ANDROID_CROSSCOMPILE)
PLATFORM=OS_ANDROID
COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_ANDROID -DLEVELDB_PLATFORM_POSIX"
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS " # All pthread features are in the Android C library
# PORT_FILES=port/android/android.cc
CROSS_COMPILE=true
;;
*)
echo "Unknown platform!" >&2
exit 1
esac
if test -z "$DO_NOT_RUN_BUILD_DETECT_VERSION"; then
"$PWD/build_tools/build_detect_version"
fi
# We want to make a list of all cc files within util, db, table, and helpers
# except for the test and benchmark files. By default, find will output a list
# of all files matching either rule, so we need to append -print to make the
# prune take effect.
DIRS="util db table utilities"
set -f # temporarily disable globbing so that our patterns arent expanded
PRUNE_TEST="-name *test*.cc -prune"
PRUNE_BENCH="-name *bench*.cc -prune"
PORTABLE_FILES=`cd "$ROCKSDB_ROOT"; find $DIRS $PRUNE_TEST -o $PRUNE_BENCH -o -name '*.cc' -print | sort | tr "\n" " "`
PORTABLE_CPP=`cd "$ROCKSDB_ROOT"; find $DIRS $PRUNE_TEST -o $PRUNE_BENCH -o -name '*.cpp' -print | sort | tr "\n" " "`
set +f # re-enable globbing
# The sources consist of the portable files, plus the platform-specific port
# file.
echo "SOURCES=$PORTABLE_FILES $GENERIC_PORT_FILES $PORT_FILES" >> "$OUTPUT"
echo "SOURCESCPP=$PORTABLE_CPP" >> "$OUTPUT"
echo "MEMENV_SOURCES=helpers/memenv/memenv.cc" >> "$OUTPUT"
if [ "$CROSS_COMPILE" = "true" -o "$FBCODE_BUILD" = "true" ]; then
# Cross-compiling; do not try any compilation tests.
# Also don't need any compilation tests if compiling on fbcode
true
else
# If -std=c++0x works, use <atomic>. Otherwise use port_posix.h.
$CXX $CFLAGS -std=c++0x -x c++ - -o /dev/null 2>/dev/null <<EOF
#include <atomic>
int main() {}
EOF
if [ "$?" = 0 ]; then
COMMON_FLAGS="$COMMON_FLAGS -DROCKSDB_ATOMIC_PRESENT"
fi
# Test whether fallocate is available
$CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null <<EOF
#include <fcntl.h>
int main() {
int fd = open("/dev/null", 0);
fallocate(fd, 0, 0, 1024);
}
EOF
if [ "$?" = 0 ]; then
COMMON_FLAGS="$COMMON_FLAGS -DROCKSDB_FALLOCATE_PRESENT"
fi
# Test whether Snappy library is installed
# http://code.google.com/p/snappy/
$CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null <<EOF
#include <snappy.h>
int main() {}
EOF
if [ "$?" = 0 ]; then
COMMON_FLAGS="$COMMON_FLAGS -DSNAPPY"
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lsnappy"
fi
# Test whether gflags library is installed
# http://code.google.com/p/gflags/
# check if the namespace is gflags
$CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null <<EOF
#include <gflags/gflags.h>
using namespace gflags;
int main() {}
EOF
if [ "$?" = 0 ]; then
COMMON_FLAGS="$COMMON_FLAGS -DGFLAGS=gflags"
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lgflags"
fi
# check if namespace is google
$CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null <<EOF
#include <gflags/gflags.h>
using namespace google;
int main() {}
EOF
if [ "$?" = 0 ]; then
COMMON_FLAGS="$COMMON_FLAGS -DGFLAGS=google"
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lgflags"
fi
# Test whether zlib library is installed
$CXX $CFLAGS $COMMON_FLAGS -x c++ - -o /dev/null 2>/dev/null <<EOF
#include <zlib.h>
int main() {}
EOF
if [ "$?" = 0 ]; then
COMMON_FLAGS="$COMMON_FLAGS -DZLIB"
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lz"
fi
# Test whether bzip library is installed
$CXX $CFLAGS $COMMON_FLAGS -x c++ - -o /dev/null 2>/dev/null <<EOF
#include <bzlib.h>
int main() {}
EOF
if [ "$?" = 0 ]; then
COMMON_FLAGS="$COMMON_FLAGS -DBZIP2"
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lbz2"
fi
# Test whether lz4 library is installed
$CXX $CFLAGS $COMMON_FLAGS -x c++ - -o /dev/null 2>/dev/null <<EOF
#include <lz4.h>
#include <lz4hc.h>
int main() {}
EOF
if [ "$?" = 0 ]; then
COMMON_FLAGS="$COMMON_FLAGS -DLZ4"
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -llz4"
fi
# Test whether tcmalloc is available
$CXX $CFLAGS -x c++ - -o /dev/null -ltcmalloc 2>/dev/null <<EOF
int main() {}
EOF
if [ "$?" = 0 ]; then
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -ltcmalloc"
fi
fi
# shall we use HDFS?
if test "$USE_HDFS"; then
if test -z "$JAVA_HOME"; then
echo "JAVA_HOME has to be set for HDFS usage."
exit 1
fi
HDFS_CCFLAGS="$HDFS_CCFLAGS -I$JAVA_HOME/include -I$JAVA_HOME/include/linux -DUSE_HDFS"
HDFS_LDFLAGS="$HDFS_LDFLAGS -Wl,--no-whole-archive -lhdfs -L$JAVA_HOME/jre/lib/amd64"
HDFS_LDFLAGS="$HDFS_LDFLAGS -L$JAVA_HOME/jre/lib/amd64/server -L$GLIBC_RUNTIME_PATH/lib"
HDFS_LDFLAGS="$HDFS_LDFLAGS -ldl -lverify -ljava -ljvm"
COMMON_FLAGS="$COMMON_FLAGS $HDFS_CCFLAGS"
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS $HDFS_LDFLAGS"
fi
# if Intel SSE instruction set is supported, set USE_SSE=" -msse -msse4.2 "
COMMON_FLAGS="$COMMON_FLAGS $USE_SSE"
PLATFORM_CCFLAGS="$PLATFORM_CCFLAGS $COMMON_FLAGS"
PLATFORM_CXXFLAGS="$PLATFORM_CXXFLAGS $COMMON_FLAGS"
VALGRIND_VER="$VALGRIND_VER"
echo "CC=$CC" >> "$OUTPUT"
echo "CXX=$CXX" >> "$OUTPUT"
echo "PLATFORM=$PLATFORM" >> "$OUTPUT"
echo "PLATFORM_LDFLAGS=$PLATFORM_LDFLAGS" >> "$OUTPUT"
echo "VALGRIND_VER=$VALGRIND_VER" >> "$OUTPUT"
echo "PLATFORM_CCFLAGS=$PLATFORM_CCFLAGS" >> "$OUTPUT"
echo "PLATFORM_CXXFLAGS=$PLATFORM_CXXFLAGS" >> "$OUTPUT"
echo "PLATFORM_SHARED_CFLAGS=$PLATFORM_SHARED_CFLAGS" >> "$OUTPUT"
echo "PLATFORM_SHARED_EXT=$PLATFORM_SHARED_EXT" >> "$OUTPUT"
echo "PLATFORM_SHARED_LDFLAGS=$PLATFORM_SHARED_LDFLAGS" >> "$OUTPUT"
echo "PLATFORM_SHARED_VERSIONED=$PLATFORM_SHARED_VERSIONED" >> "$OUTPUT"
echo "EXEC_LDFLAGS=$EXEC_LDFLAGS" >> "$OUTPUT"
echo "JEMALLOC_INCLUDE=$JEMALLOC_INCLUDE" >> "$OUTPUT"
echo "JEMALLOC_LIB=$JEMALLOC_LIB" >> "$OUTPUT"

View File

@@ -0,0 +1,22 @@
#!/bin/sh
#
# Record the version of the source that we are compiling.
# We keep a record of the git revision in util/version.cc. This source file
# is then built as a regular source file as part of the compilation process.
# One can run "strings executable_filename | grep _build_" to find the version of
# the source that we used to build the executable file.
OUTFILE="$PWD/util/build_version.cc"
GIT_SHA=""
if command -v git >/dev/null 2>&1; then
GIT_SHA=$(git rev-parse HEAD 2>/dev/null)
fi
cat > "${OUTFILE}" <<EOF
#include "build_version.h"
const char* rocksdb_build_git_sha = "rocksdb_build_git_sha:${GIT_SHA}";
const char* rocksdb_build_git_datetime = "rocksdb_build_git_datetime:$(date)";
const char* rocksdb_build_compile_date = __DATE__;
const char* rocksdb_build_compile_time = __TIME__;
EOF

View File

@@ -0,0 +1,74 @@
#!/bin/sh
#
# Set environment variables so that we can compile leveldb using
# fbcode settings. It uses the latest g++ compiler and also
# uses jemalloc
TOOLCHAIN_REV=fbe3b095a4cc4a3713730050d182b7b4a80c342f
TOOLCHAIN_EXECUTABLES="/mnt/gvfs/third-party/$TOOLCHAIN_REV/centos5.2-native"
TOOLCHAIN_LIB_BASE="/mnt/gvfs/third-party/$TOOLCHAIN_REV/gcc-4.7.1-glibc-2.14.1"
TOOL_JEMALLOC=jemalloc-3.3.1/9202ce3
GLIBC_RUNTIME_PATH=/usr/local/fbcode/gcc-4.7.1-glibc-2.14.1
# location of libgcc
LIBGCC_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.7.1/afc21dc/include"
LIBGCC_LIBS=" -L $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.7.1/afc21dc/libs"
# location of glibc
GLIBC_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/glibc/glibc-2.14.1/99df8fc/include"
GLIBC_LIBS=" -L $TOOLCHAIN_LIB_BASE/glibc/glibc-2.14.1/99df8fc/lib"
# location of snappy headers and libraries
SNAPPY_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/snappy/snappy-1.0.3/7518bbe/include"
SNAPPY_LIBS=" $TOOLCHAIN_LIB_BASE/snappy/snappy-1.0.3/7518bbe/lib/libsnappy.a"
# location of zlib headers and libraries
ZLIB_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/zlib/zlib-1.2.5/91ddd43/include"
ZLIB_LIBS=" $TOOLCHAIN_LIB_BASE/zlib/zlib-1.2.5/91ddd43/lib/libz.a"
# location of gflags headers and libraries
GFLAGS_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/gflags/gflags-1.6/91ddd43/include"
GFLAGS_LIBS=" $TOOLCHAIN_LIB_BASE/gflags/gflags-1.6/91ddd43/lib/libgflags.a"
# location of bzip headers and libraries
BZIP_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/bzip2/bzip2-1.0.6/91ddd43/include"
BZIP_LIBS=" $TOOLCHAIN_LIB_BASE/bzip2/bzip2-1.0.6/91ddd43/lib/libbz2.a"
# location of gflags headers and libraries
GFLAGS_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/gflags/gflags-1.6/91ddd43/include"
GFLAGS_LIBS=" $TOOLCHAIN_LIB_BASE/gflags/gflags-1.6/91ddd43/lib/libgflags.a"
# use Intel SSE support for checksum calculations
export USE_SSE=" -msse -msse4.2 "
CC="$TOOLCHAIN_EXECUTABLES/clang/clang-3.2/0b7c69d/bin/clang $CLANG_INCLUDES"
CXX="$TOOLCHAIN_EXECUTABLES/clang/clang-3.2/0b7c69d/bin/clang++ $CLANG_INCLUDES $JINCLUDE $SNAPPY_INCLUDE $ZLIB_INCLUDE $BZIP_INCLUDE $GFLAGS_INCLUDE"
AR=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ar
RANLIB=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ranlib
CFLAGS="-B$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin -nostdlib "
CFLAGS+=" -nostdinc -isystem $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.7.1/afc21dc/include/c++/4.7.1 "
CFLAGS+=" -isystem $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.7.1/afc21dc/include/c++/4.7.1/x86_64-facebook-linux "
CFLAGS+=" -isystem $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.7.1/afc21dc/include/c++/4.7.1/backward "
CFLAGS+=" -isystem $TOOLCHAIN_LIB_BASE/glibc/glibc-2.14.1/99df8fc/include "
CFLAGS+=" -isystem $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.7.1/afc21dc/include "
CFLAGS+=" -isystem $TOOLCHAIN_LIB_BASE/clang/clang-3.2/0b7c69d/lib/clang/3.2/include "
CFLAGS+=" -isystem $TOOLCHAIN_LIB_BASE/kernel-headers/kernel-headers-3.2.18_70_fbk11_00129_gc8882d0/da39a3e/include/linux "
CFLAGS+=" -isystem $TOOLCHAIN_LIB_BASE/kernel-headers/kernel-headers-3.2.18_70_fbk11_00129_gc8882d0/da39a3e/include "
CFLAGS+=" -Wall -Wno-sign-compare -Wno-unused-variable -Winvalid-pch -Wno-deprecated -Woverloaded-virtual"
CFLAGS+=" $LIBGCC_INCLUDE $GLIBC_INCLUDE"
CXXFLAGS="$CFLAGS -nostdinc++"
CFLAGS+=" -I $TOOLCHAIN_LIB_BASE/jemalloc/$TOOL_JEMALLOC/include -DHAVE_JEMALLOC"
EXEC_LDFLAGS=" -Wl,--whole-archive $TOOLCHAIN_LIB_BASE/jemalloc/$TOOL_JEMALLOC/lib/libjemalloc.a"
EXEC_LDFLAGS+=" -Wl,--no-whole-archive $TOOLCHAIN_LIB_BASE/libunwind/libunwind-1.0.1/350336c/lib/libunwind.a"
EXEC_LDFLAGS+=" $HDFSLIB $SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $GFLAGS_LIBS"
EXEC_LDFLAGS+=" -Wl,--dynamic-linker,$GLIBC_RUNTIME_PATH/lib/ld-linux-x86-64.so.2"
EXEC_LDFLAGS+=" -B$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin"
PLATFORM_LDFLAGS="$LIBGCC_LIBS $GLIBC_LIBS "
EXEC_LDFLAGS_SHARED="$SNAPPY_LIBS $ZLIB_LIBS $GFLAGS_LIBS"
export CC CXX AR RANLIB CFLAGS EXEC_LDFLAGS EXEC_LDFLAGS_SHARED

View File

@@ -0,0 +1,70 @@
#!/bin/sh
#
# Set environment variables so that we can compile leveldb using
# fbcode settings. It uses the latest g++ compiler and also
# uses jemalloc
TOOLCHAIN_REV=fbe3b095a4cc4a3713730050d182b7b4a80c342f
TOOLCHAIN_EXECUTABLES="/mnt/gvfs/third-party/$TOOLCHAIN_REV/centos5.2-native"
TOOLCHAIN_LIB_BASE="/mnt/gvfs/third-party/$TOOLCHAIN_REV/gcc-4.7.1-glibc-2.14.1"
TOOL_JEMALLOC=jemalloc-3.3.1/9202ce3
# location of libhdfs libraries
if test "$USE_HDFS"; then
JAVA_HOME="/usr/local/jdk-6u22-64"
JINCLUDE="-I$JAVA_HOME/include -I$JAVA_HOME/include/linux"
GLIBC_RUNTIME_PATH="/usr/local/fbcode/gcc-4.7.1-glibc-2.14.1"
HDFSLIB=" -Wl,--no-whole-archive hdfs/libhdfs.a -L$JAVA_HOME/jre/lib/amd64 "
HDFSLIB+=" -L$JAVA_HOME/jre/lib/amd64/server -L$GLIBC_RUNTIME_PATH/lib "
HDFSLIB+=" -ldl -lverify -ljava -ljvm "
fi
# location of libgcc
LIBGCC_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.7.1/afc21dc/include"
LIBGCC_LIBS=" -L $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.7.1/afc21dc/libs"
# location of glibc
GLIBC_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/glibc/glibc-2.14.1/99df8fc/include"
GLIBC_LIBS=" -L $TOOLCHAIN_LIB_BASE/glibc/glibc-2.14.1/99df8fc/lib"
# location of snappy headers and libraries
SNAPPY_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/snappy/snappy-1.0.3/7518bbe/include"
SNAPPY_LIBS=" $TOOLCHAIN_LIB_BASE/snappy/snappy-1.0.3/7518bbe/lib/libsnappy.a"
# location of zlib headers and libraries
ZLIB_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/zlib/zlib-1.2.5/91ddd43/include"
ZLIB_LIBS=" $TOOLCHAIN_LIB_BASE/zlib/zlib-1.2.5/91ddd43/lib/libz.a"
# location of bzip headers and libraries
BZIP_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/bzip2/bzip2-1.0.6/91ddd43/include"
BZIP_LIBS=" $TOOLCHAIN_LIB_BASE/bzip2/bzip2-1.0.6/91ddd43/lib/libbz2.a"
# location of gflags headers and libraries
GFLAGS_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/gflags/gflags-1.6/91ddd43/include"
GFLAGS_LIBS=" $TOOLCHAIN_LIB_BASE/gflags/gflags-1.6/91ddd43/lib/libgflags.a"
# use Intel SSE support for checksum calculations
export USE_SSE=" -msse -msse4.2 "
CC="$TOOLCHAIN_EXECUTABLES/gcc/gcc-4.7.1-glibc-2.14.1/bin/gcc"
CXX="$TOOLCHAIN_EXECUTABLES/gcc/gcc-4.7.1-glibc-2.14.1/bin/g++ $JINCLUDE $SNAPPY_INCLUDE $ZLIB_INCLUDE $BZIP_INCLUDE $GFLAGS_INCLUDE"
AR=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ar
RANLIB=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ranlib
CFLAGS="-B$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/gold -m64 -mtune=generic"
CFLAGS+=" -I $TOOLCHAIN_LIB_BASE/jemalloc/$TOOL_JEMALLOC/include -DHAVE_JEMALLOC"
CFLAGS+=" $LIBGCC_INCLUDE $GLIBC_INCLUDE"
CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_ATOMIC_PRESENT -DROCKSDB_FALLOCATE_PRESENT"
CFLAGS+=" -DSNAPPY -DGFLAGS=google -DZLIB -DBZIP2"
EXEC_LDFLAGS=" -Wl,--whole-archive $TOOLCHAIN_LIB_BASE/jemalloc/$TOOL_JEMALLOC/lib/libjemalloc.a"
EXEC_LDFLAGS+=" -Wl,--no-whole-archive $TOOLCHAIN_LIB_BASE/libunwind/libunwind-1.0.1/350336c/lib/libunwind.a"
EXEC_LDFLAGS+=" $HDFSLIB $SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $GFLAGS_LIBS"
PLATFORM_LDFLAGS="$LIBGCC_LIBS $GLIBC_LIBS "
EXEC_LDFLAGS_SHARED="$SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $GFLAGS_LIBS"
VALGRIND_VER="$TOOLCHAIN_LIB_BASE/valgrind/valgrind-3.8.1/91ddd43/bin/"
export CC CXX AR RANLIB CFLAGS EXEC_LDFLAGS EXEC_LDFLAGS_SHARED VALGRIND_VER

View File

@@ -0,0 +1,81 @@
#!/bin/sh
#
# Set environment variables so that we can compile rocksdb using
# fbcode settings. It uses the latest g++ compiler and also
# uses jemalloc
TOOLCHAIN_REV=53dc1fe83f84e9145b9ffb81b81aa7f6a49c87cc
CENTOS_VERSION=`rpm -q --qf "%{VERSION}" $(rpm -q --whatprovides redhat-release)`
if [ "$CENTOS_VERSION" = "6" ]; then
TOOLCHAIN_EXECUTABLES="/mnt/gvfs/third-party/$TOOLCHAIN_REV/centos6-native"
else
TOOLCHAIN_EXECUTABLES="/mnt/gvfs/third-party/$TOOLCHAIN_REV/centos5.2-native"
fi
TOOLCHAIN_LIB_BASE="/mnt/gvfs/third-party/$TOOLCHAIN_REV/gcc-4.8.1-glibc-2.17"
# location of libhdfs libraries
if test "$USE_HDFS"; then
JAVA_HOME="/usr/local/jdk-6u22-64"
JINCLUDE="-I$JAVA_HOME/include -I$JAVA_HOME/include/linux"
GLIBC_RUNTIME_PATH="/usr/local/fbcode/gcc-4.8.1-glibc-2.17"
HDFSLIB=" -Wl,--no-whole-archive hdfs/libhdfs.a -L$JAVA_HOME/jre/lib/amd64 "
HDFSLIB+=" -L$JAVA_HOME/jre/lib/amd64/server -L$GLIBC_RUNTIME_PATH/lib "
HDFSLIB+=" -ldl -lverify -ljava -ljvm "
fi
# location of libgcc
LIBGCC_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.8.1/8aac7fc/include"
LIBGCC_LIBS=" -L $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.8.1/8aac7fc/libs"
# location of glibc
GLIBC_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/glibc/glibc-2.17/99df8fc/include"
GLIBC_LIBS=" -L $TOOLCHAIN_LIB_BASE/glibc/glibc-2.17/99df8fc/lib"
# location of snappy headers and libraries
SNAPPY_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/snappy/snappy-1.0.3/43d84e2/include"
SNAPPY_LIBS=" $TOOLCHAIN_LIB_BASE/snappy/snappy-1.0.3/43d84e2/lib/libsnappy.a"
# location of zlib headers and libraries
ZLIB_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/zlib/zlib-1.2.5/c3f970a/include"
ZLIB_LIBS=" $TOOLCHAIN_LIB_BASE/zlib/zlib-1.2.5/c3f970a/lib/libz.a"
# location of bzip headers and libraries
BZIP_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/bzip2/bzip2-1.0.6/c3f970a/include"
BZIP_LIBS=" $TOOLCHAIN_LIB_BASE/bzip2/bzip2-1.0.6/c3f970a/lib/libbz2.a"
LZ4_REV=065ec7e38fe83329031f6668c43bef83eff5808b
LZ4_INCLUDE=" -I /mnt/gvfs/third-party2/lz4/$LZ4_REV/r108/gcc-4.8.1-glibc-2.17/c3f970a/include"
LZ4_LIBS=" /mnt/gvfs/third-party2/lz4/$LZ4_REV/r108/gcc-4.8.1-glibc-2.17/c3f970a/lib/liblz4.a"
# location of gflags headers and libraries
GFLAGS_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/gflags/gflags-1.6/c3f970a/include"
GFLAGS_LIBS=" $TOOLCHAIN_LIB_BASE/gflags/gflags-1.6/c3f970a/lib/libgflags.a"
# location of jemalloc
JEMALLOC_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/jemalloc/jemalloc-3.4.1/4d53c6f/include/"
JEMALLOC_LIB=" -Wl,--whole-archive $TOOLCHAIN_LIB_BASE/jemalloc/jemalloc-3.4.1/4d53c6f/lib/libjemalloc.a"
# use Intel SSE support for checksum calculations
export USE_SSE=" -msse -msse4.2 "
CC="$TOOLCHAIN_EXECUTABLES/gcc/gcc-4.8.1/cc6c9dc/bin/gcc"
CXX="$TOOLCHAIN_EXECUTABLES/gcc/gcc-4.8.1/cc6c9dc/bin/g++ $JINCLUDE $SNAPPY_INCLUDE $ZLIB_INCLUDE $BZIP_INCLUDE $LZ4_INCLUDE $GFLAGS_INCLUDE"
AR=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ar
RANLIB=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ranlib
CFLAGS="-B$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/gold -m64 -mtune=generic"
CFLAGS+=" $LIBGCC_INCLUDE $GLIBC_INCLUDE"
CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_ATOMIC_PRESENT -DROCKSDB_FALLOCATE_PRESENT"
CFLAGS+=" -DSNAPPY -DGFLAGS=google -DZLIB -DBZIP2 -DLZ4"
EXEC_LDFLAGS="-Wl,--dynamic-linker,/usr/local/fbcode/gcc-4.8.1-glibc-2.17/lib/ld.so"
EXEC_LDFLAGS+=" -Wl,--no-whole-archive $TOOLCHAIN_LIB_BASE/libunwind/libunwind-1.0.1/675d945/lib/libunwind.a"
EXEC_LDFLAGS+=" $HDFSLIB $SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $GFLAGS_LIBS"
PLATFORM_LDFLAGS="$LIBGCC_LIBS $GLIBC_LIBS "
EXEC_LDFLAGS_SHARED="$SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $GFLAGS_LIBS"
VALGRIND_VER="$TOOLCHAIN_LIB_BASE/valgrind/valgrind-3.8.1/c3f970a/bin/"
export CC CXX AR RANLIB CFLAGS EXEC_LDFLAGS EXEC_LDFLAGS_SHARED VALGRIND_VER JEMALLOC_LIB JEMALLOC_INCLUDE

107
build_tools/format-diff.sh Executable file
View File

@@ -0,0 +1,107 @@
#!/bin/bash
# If clang_format_diff.py command is not specfied, we assume we are able to
# access directly without any path.
if [ -z $CLANG_FORMAT_DIFF ]
then
CLANG_FORMAT_DIFF="clang-format-diff.py"
fi
# Check clang-format-diff.py
if ! which $CLANG_FORMAT_DIFF &> /dev/null
then
echo "You didn't have clang-format-diff.py available in your computer!"
echo "You can download it by running: "
echo " curl http://goo.gl/iUW1u2"
exit 128
fi
# Check argparse, a library that clang-format-diff.py requires.
python 2>/dev/null << EOF
import argparse
EOF
if [ "$?" != 0 ]
then
echo "To run clang-format-diff.py, we'll need the library "argparse" to be"
echo "installed. You can try either of the follow ways to install it:"
echo " 1. Manually download argparse: https://pypi.python.org/pypi/argparse"
echo " 2. easy_install argparse (if you have easy_install)"
echo " 3. pip install argparse (if you have pip)"
exit 129
fi
# TODO(kailiu) following work is not complete since we still need to figure
# out how to add the modified files done pre-commit hook to git's commit index.
#
# Check if this script has already been added to pre-commit hook.
# Will suggest user to add this script to pre-commit hook if their pre-commit
# is empty.
# PRE_COMMIT_SCRIPT_PATH="`git rev-parse --show-toplevel`/.git/hooks/pre-commit"
# if ! ls $PRE_COMMIT_SCRIPT_PATH &> /dev/null
# then
# echo "Would you like to add this script to pre-commit hook, which will do "
# echo -n "the format check for all the affected lines before you check in (y/n):"
# read add_to_hook
# if [ "$add_to_hook" == "y" ]
# then
# ln -s `git rev-parse --show-toplevel`/build_tools/format-diff.sh $PRE_COMMIT_SCRIPT_PATH
# fi
# fi
set -e
uncommitted_code=`git diff HEAD`
# If there's no uncommitted changes, we assume user are doing post-commit
# format check, in which case we'll check the modified lines from latest commit.
# Otherwise, we'll check format of the uncommitted code only.
if [ -z "$uncommitted_code" ]
then
# Check the format of last commit
diffs=$(git diff -U0 HEAD^ | $CLANG_FORMAT_DIFF -p 1)
else
# Check the format of uncommitted lines,
diffs=$(git diff -U0 HEAD | $CLANG_FORMAT_DIFF -p 1)
fi
if [ -z "$diffs" ]
then
echo "Nothing needs to be reformatted!"
exit 0
fi
# Highlight the insertion/deletion from the clang-format-diff.py's output
COLOR_END="\033[0m"
COLOR_RED="\033[0;31m"
COLOR_GREEN="\033[0;32m"
echo -e "Detect lines that doesn't follow the format rules:\r"
# Add the color to the diff. lines added will be green; lines removed will be red.
echo "$diffs" |
sed -e "s/\(^-.*$\)/`echo -e \"$COLOR_RED\1$COLOR_END\"`/" |
sed -e "s/\(^+.*$\)/`echo -e \"$COLOR_GREEN\1$COLOR_END\"`/"
echo -e "Would you like to fix the format automatically (y/n): \c"
# Make sure under any mode, we can read user input.
exec < /dev/tty
read to_fix
if [ "$to_fix" != "y" ]
then
exit 1
fi
# Do in-place format adjustment.
git diff -U0 HEAD^ | $CLANG_FORMAT_DIFF -i -p 1
echo "Files reformatted!"
# Amend to last commit if user do the post-commit format check
if [ -z "$uncommitted_code" ]; then
echo -e "Would you like to amend the changes to last commit (`git log HEAD --oneline | head -1`)? (y/n): \c"
read to_amend
if [ "$to_amend" == "y" ]
then
git commit -a --amend --reuse-message HEAD
echo "Amended to last commit"
fi
fi

View File

@@ -0,0 +1,25 @@
#!/bin/sh
# Install gflags for mac developers.
set -e
DIR=`mktemp -d /tmp/rocksdb_gflags_XXXX`
cd $DIR
wget https://gflags.googlecode.com/files/gflags-2.0.tar.gz
tar xvfz gflags-2.0.tar.gz
cd gflags-2.0
./configure
make
make install
# Add include/lib path for g++
echo 'export LIBRARY_PATH+=":/usr/local/lib"' >> ~/.bash_profile
echo 'export CPATH+=":/usr/local/include"' >> ~/.bash_profile
echo ""
echo "-----------------------------------------------------------------------------"
echo "| Installation Completed |"
echo "-----------------------------------------------------------------------------"
echo "Please run `. ~/bash_profile` to be able to compile with gflags"

46
build_tools/make_new_version.sh Executable file
View File

@@ -0,0 +1,46 @@
#!/bin/bash
# Copyright (c) 2013, Facebook, Inc. All rights reserved.
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree. An additional grant
# of patent rights can be found in the PATENTS file in the same directory.
set -e
if [ -z "$GIT" ]
then
GIT="git"
fi
# Print out the colored progress info so that it can be brainlessly
# distinguished by users.
function title() {
echo -e "\033[1;32m$*\033[0m"
}
usage="Create new RocksDB version and prepare it for the release process\n"
usage+="USAGE: ./make_new_version.sh <version>"
# -- Pre-check
if [[ $# < 1 ]]; then
echo -e $usage
exit 1
fi
ROCKSDB_VERSION=$1
GIT_BRANCH=`git rev-parse --abbrev-ref HEAD`
echo $GIT_BRANCH
if [ $GIT_BRANCH != "master" ]; then
echo "Error: Current branch is '$GIT_BRANCH', Please switch to master branch."
exit 1
fi
title "Adding new tag for this release ..."
BRANCH="$ROCKSDB_VERSION.fb"
$GIT co -b $BRANCH
# Setting up the proxy for remote repo access
title "Pushing new branch to remote repo ..."
git push origin --set-upstream $BRANCH
title "Branch $BRANCH is pushed to github;"

View File

@@ -0,0 +1,330 @@
#!/bin/bash
set -e
NUM=10000000
if [ $# -eq 1 ];then
DATA_DIR=$1
elif [ $# -eq 2 ];then
DATA_DIR=$1
STAT_FILE=$2
fi
# On the production build servers, set data and stat
# files/directories not in /tmp or else the tempdir cleaning
# scripts will make you very unhappy.
DATA_DIR=${DATA_DIR:-$(mktemp -t -d rocksdb_XXXX)}
STAT_FILE=${STAT_FILE:-$(mktemp -t -u rocksdb_test_stats_XXXX)}
function cleanup {
rm -rf $DATA_DIR
rm -f $STAT_FILE.fillseq
rm -f $STAT_FILE.readrandom
rm -f $STAT_FILE.overwrite
rm -f $STAT_FILE.memtablefillreadrandom
}
trap cleanup EXIT
if [ -z $GIT_BRANCH ]; then
git_br=`git rev-parse --abbrev-ref HEAD`
else
git_br=$(basename $GIT_BRANCH)
fi
if [ $git_br == "master" ]; then
git_br=""
else
git_br="."$git_br
fi
make release
# measure fillseq + fill up the DB for overwrite benchmark
./db_bench \
--benchmarks=fillseq \
--db=$DATA_DIR \
--use_existing_db=0 \
--bloom_bits=10 \
--num=$NUM \
--writes=$NUM \
--cache_size=6442450944 \
--cache_numshardbits=6 \
--table_cache_numshardbits=4 \
--open_files=55000 \
--statistics=1 \
--histogram=1 \
--disable_data_sync=1 \
--disable_wal=1 \
--sync=0 > ${STAT_FILE}.fillseq
# measure overwrite performance
./db_bench \
--benchmarks=overwrite \
--db=$DATA_DIR \
--use_existing_db=1 \
--bloom_bits=10 \
--num=$NUM \
--writes=$((NUM / 10)) \
--cache_size=6442450944 \
--cache_numshardbits=6 \
--table_cache_numshardbits=4 \
--open_files=55000 \
--statistics=1 \
--histogram=1 \
--disable_data_sync=1 \
--disable_wal=1 \
--sync=0 \
--threads=8 > ${STAT_FILE}.overwrite
# fill up the db for readrandom benchmark (1GB total size)
./db_bench \
--benchmarks=fillseq \
--db=$DATA_DIR \
--use_existing_db=0 \
--bloom_bits=10 \
--num=$NUM \
--writes=$NUM \
--cache_size=6442450944 \
--cache_numshardbits=6 \
--table_cache_numshardbits=4 \
--open_files=55000 \
--statistics=1 \
--histogram=1 \
--disable_data_sync=1 \
--disable_wal=1 \
--sync=0 \
--threads=1 > /dev/null
# measure readrandom with 6GB block cache
./db_bench \
--benchmarks=readrandom \
--db=$DATA_DIR \
--use_existing_db=1 \
--bloom_bits=10 \
--num=$NUM \
--reads=$((NUM / 5)) \
--cache_size=6442450944 \
--cache_numshardbits=6 \
--table_cache_numshardbits=4 \
--open_files=55000 \
--disable_seek_compaction=1 \
--statistics=1 \
--histogram=1 \
--disable_data_sync=1 \
--disable_wal=1 \
--sync=0 \
--threads=16 > ${STAT_FILE}.readrandom
# measure readrandom with 6GB block cache and tailing iterator
./db_bench \
--benchmarks=readrandom \
--db=$DATA_DIR \
--use_existing_db=1 \
--bloom_bits=10 \
--num=$NUM \
--reads=$((NUM / 5)) \
--cache_size=6442450944 \
--cache_numshardbits=6 \
--table_cache_numshardbits=4 \
--open_files=55000 \
--disable_seek_compaction=1 \
--use_tailing_iterator=1 \
--statistics=1 \
--histogram=1 \
--disable_data_sync=1 \
--disable_wal=1 \
--sync=0 \
--threads=16 > ${STAT_FILE}.readrandomtailing
# measure readrandom with 100MB block cache
./db_bench \
--benchmarks=readrandom \
--db=$DATA_DIR \
--use_existing_db=1 \
--bloom_bits=10 \
--num=$NUM \
--reads=$((NUM / 5)) \
--cache_size=104857600 \
--cache_numshardbits=6 \
--table_cache_numshardbits=4 \
--open_files=55000 \
--disable_seek_compaction=1 \
--statistics=1 \
--histogram=1 \
--disable_data_sync=1 \
--disable_wal=1 \
--sync=0 \
--threads=16 > ${STAT_FILE}.readrandomsmallblockcache
# measure readrandom with 8k data in memtable
./db_bench \
--benchmarks=overwrite,readrandom \
--db=$DATA_DIR \
--use_existing_db=1 \
--bloom_bits=10 \
--num=$NUM \
--reads=$((NUM / 5)) \
--writes=512 \
--cache_size=6442450944 \
--cache_numshardbits=6 \
--table_cache_numshardbits=4 \
--write_buffer_size=1000000000 \
--open_files=55000 \
--disable_seek_compaction=1 \
--statistics=1 \
--histogram=1 \
--disable_data_sync=1 \
--disable_wal=1 \
--sync=0 \
--threads=16 > ${STAT_FILE}.readrandom_mem_sst
# fill up the db for readrandom benchmark with filluniquerandom (1GB total size)
./db_bench \
--benchmarks=filluniquerandom \
--db=$DATA_DIR \
--use_existing_db=0 \
--bloom_bits=10 \
--num=$((NUM / 4)) \
--writes=$((NUM / 4)) \
--cache_size=6442450944 \
--cache_numshardbits=6 \
--table_cache_numshardbits=4 \
--open_files=55000 \
--statistics=1 \
--histogram=1 \
--disable_data_sync=1 \
--disable_wal=1 \
--sync=0 \
--threads=1 > /dev/null
# dummy test just to compact the data
./db_bench \
--benchmarks=readrandom \
--db=$DATA_DIR \
--use_existing_db=1 \
--bloom_bits=10 \
--num=$((NUM / 1000)) \
--reads=$((NUM / 1000)) \
--cache_size=6442450944 \
--cache_numshardbits=6 \
--table_cache_numshardbits=4 \
--open_files=55000 \
--statistics=1 \
--histogram=1 \
--disable_data_sync=1 \
--disable_wal=1 \
--sync=0 \
--threads=16 > /dev/null
# measure readrandom after load with filluniquerandom with 6GB block cache
./db_bench \
--benchmarks=readrandom \
--db=$DATA_DIR \
--use_existing_db=1 \
--bloom_bits=10 \
--num=$((NUM / 4)) \
--reads=$((NUM / 4)) \
--cache_size=6442450944 \
--cache_numshardbits=6 \
--table_cache_numshardbits=4 \
--open_files=55000 \
--disable_seek_compaction=1 \
--disable_auto_compactions=1 \
--statistics=1 \
--histogram=1 \
--disable_data_sync=1 \
--disable_wal=1 \
--sync=0 \
--threads=16 > ${STAT_FILE}.readrandom_filluniquerandom
# measure readwhilewriting after load with filluniquerandom with 6GB block cache
./db_bench \
--benchmarks=readwhilewriting \
--db=$DATA_DIR \
--use_existing_db=1 \
--bloom_bits=10 \
--num=$((NUM / 4)) \
--reads=$((NUM / 4)) \
--writes_per_second=1000 \
--write_buffer_size=100000000 \
--cache_size=6442450944 \
--cache_numshardbits=6 \
--table_cache_numshardbits=4 \
--open_files=55000 \
--disable_seek_compaction=1 \
--statistics=1 \
--histogram=1 \
--disable_data_sync=1 \
--disable_wal=1 \
--sync=0 \
--threads=16 > ${STAT_FILE}.readwhilewriting
# measure memtable performance -- none of the data gets flushed to disk
./db_bench \
--benchmarks=fillrandom,readrandom, \
--db=$DATA_DIR \
--use_existing_db=0 \
--num=$((NUM / 10)) \
--reads=$NUM \
--cache_size=6442450944 \
--cache_numshardbits=6 \
--table_cache_numshardbits=4 \
--write_buffer_size=1000000000 \
--open_files=55000 \
--disable_seek_compaction=1 \
--statistics=1 \
--histogram=1 \
--disable_data_sync=1 \
--disable_wal=1 \
--sync=0 \
--value_size=10 \
--threads=16 > ${STAT_FILE}.memtablefillreadrandom
# send data to ods
function send_to_ods {
key="$1"
value="$2"
if [ -z $JENKINS_HOME ]; then
# running on devbox, just print out the values
echo $1 $2
return
fi
if [ -z "$value" ];then
echo >&2 "ERROR: Key $key doesn't have a value."
return
fi
curl -s "https://www.intern.facebook.com/intern/agent/ods_set.php?entity=rocksdb_build$git_br&key=$key&value=$value" \
--connect-timeout 60
}
function send_benchmark_to_ods {
bench="$1"
bench_key="$2"
file="$3"
QPS=$(grep $bench $file | awk '{print $5}')
P50_MICROS=$(grep $bench $file -A 6 | grep "Percentiles" | awk '{print $3}' )
P75_MICROS=$(grep $bench $file -A 6 | grep "Percentiles" | awk '{print $5}' )
P99_MICROS=$(grep $bench $file -A 6 | grep "Percentiles" | awk '{print $7}' )
send_to_ods rocksdb.build.$bench_key.qps $QPS
send_to_ods rocksdb.build.$bench_key.p50_micros $P50_MICROS
send_to_ods rocksdb.build.$bench_key.p75_micros $P75_MICROS
send_to_ods rocksdb.build.$bench_key.p99_micros $P99_MICROS
}
send_benchmark_to_ods overwrite overwrite $STAT_FILE.overwrite
send_benchmark_to_ods fillseq fillseq $STAT_FILE.fillseq
send_benchmark_to_ods readrandom readrandom $STAT_FILE.readrandom
send_benchmark_to_ods readrandom readrandom_tailing $STAT_FILE.readrandomtailing
send_benchmark_to_ods readrandom readrandom_smallblockcache $STAT_FILE.readrandomsmallblockcache
send_benchmark_to_ods readrandom readrandom_memtable_sst $STAT_FILE.readrandom_mem_sst
send_benchmark_to_ods readrandom readrandom_fillunique_random $STAT_FILE.readrandom_filluniquerandom
send_benchmark_to_ods fillrandom memtablefillrandom $STAT_FILE.memtablefillreadrandom
send_benchmark_to_ods readrandom memtablereadrandom $STAT_FILE.memtablefillreadrandom
send_benchmark_to_ods readwhilewriting readwhilewriting $STAT_FILE.readwhilewriting

15
build_tools/valgrind_test.sh Executable file
View File

@@ -0,0 +1,15 @@
#!/bin/bash
#A shell script for Jenknis to run valgrind on rocksdb tests
#Returns 0 on success when there are no failed tests
VALGRIND_DIR=build_tools/VALGRIND_LOGS
make clean
make -j$(nproc) valgrind_check
NUM_FAILED_TESTS=$((`wc -l $VALGRIND_DIR/valgrind_failed_tests | awk '{print $1}'` - 1))
if [ $NUM_FAILED_TESTS -lt 1 ]; then
echo No tests have valgrind errors
exit 0
else
cat $VALGRIND_DIR/valgrind_failed_tests
exit 1
fi

78
coverage/coverage_test.sh Executable file
View File

@@ -0,0 +1,78 @@
#!/bin/bash
# Exit on error.
set -e
if [ -n "$USE_CLANG" ]; then
echo "Error: Coverage test is supported only for gcc."
exit 1
fi
ROOT=".."
# Fetch right version of gcov
if [ -d /mnt/gvfs/third-party -a -z "$CXX" ]; then
source $ROOT/build_tools/fbcode.gcc471.sh
GCOV=$TOOLCHAIN_EXECUTABLES/gcc/gcc-4.7.1/cc6c9dc/bin/gcov
else
GCOV=$(which gcov)
fi
COVERAGE_DIR="$PWD/COVERAGE_REPORT"
mkdir -p $COVERAGE_DIR
# Find all gcno files to generate the coverage report
GCNO_FILES=`find $ROOT -name "*.gcno"`
$GCOV --preserve-paths --relative-only --no-output $GCNO_FILES 2>/dev/null |
# Parse the raw gcov report to more human readable form.
python $ROOT/coverage/parse_gcov_output.py |
# Write the output to both stdout and report file.
tee $COVERAGE_DIR/coverage_report_all.txt &&
echo -e "Generated coverage report for all files: $COVERAGE_DIR/coverage_report_all.txt\n"
# TODO: we also need to get the files of the latest commits.
# Get the most recently committed files.
LATEST_FILES=`
git show --pretty="format:" --name-only HEAD |
grep -v "^$" |
paste -s -d,`
RECENT_REPORT=$COVERAGE_DIR/coverage_report_recent.txt
echo -e "Recently updated files: $LATEST_FILES\n" > $RECENT_REPORT
$GCOV --preserve-paths --relative-only --no-output $GCNO_FILES 2>/dev/null |
python $ROOT/coverage/parse_gcov_output.py -interested-files $LATEST_FILES |
tee -a $RECENT_REPORT &&
echo -e "Generated coverage report for recently updated files: $RECENT_REPORT\n"
# Unless otherwise specified, we'll not generate html report by default
if [ -z "$HTML" ]; then
exit 0
fi
# Generate the html report. If we cannot find lcov in this machine, we'll simply
# skip this step.
echo "Generating the html coverage report..."
LCOV=$(which lcov || true 2>/dev/null)
if [ -z $LCOV ]
then
echo "Skip: Cannot find lcov to generate the html report."
exit 0
fi
LCOV_VERSION=$(lcov -v | grep 1.1 || true)
if [ $LCOV_VERSION ]
then
echo "Not supported lcov version. Expect lcov 1.1."
exit 0
fi
(cd $ROOT; lcov --no-external \
--capture \
--directory $PWD \
--gcov-tool $GCOV \
--output-file $COVERAGE_DIR/coverage.info)
genhtml $COVERAGE_DIR/coverage.info -o $COVERAGE_DIR
echo "HTML Coverage report is generated in $COVERAGE_DIR"

View File

@@ -0,0 +1,118 @@
import optparse
import re
import sys
from optparse import OptionParser
# the gcov report follows certain pattern. Each file will have two lines
# of report, from which we can extract the file name, total lines and coverage
# percentage.
def parse_gcov_report(gcov_input):
per_file_coverage = {}
total_coverage = None
for line in sys.stdin:
line = line.strip()
# --First line of the coverage report (with file name in it)?
match_obj = re.match("^File '(.*)'$", line)
if match_obj:
# fetch the file name from the first line of the report.
current_file = match_obj.group(1)
continue
# -- Second line of the file report (with coverage percentage)
match_obj = re.match("^Lines executed:(.*)% of (.*)", line)
if match_obj:
coverage = float(match_obj.group(1))
lines = int(match_obj.group(2))
if current_file is not None:
per_file_coverage[current_file] = (coverage, lines)
current_file = None
else:
# If current_file is not set, we reach the last line of report,
# which contains the summarized coverage percentage.
total_coverage = (coverage, lines)
continue
# If the line's pattern doesn't fall into the above categories. We
# can simply ignore them since they're either empty line or doesn't
# find executable lines of the given file.
current_file = None
return per_file_coverage, total_coverage
def get_option_parser():
usage = "Parse the gcov output and generate more human-readable code " +\
"coverage report."
parser = OptionParser(usage)
parser.add_option(
"--interested-files", "-i",
dest="filenames",
help="Comma separated files names. if specified, we will display " +
"the coverage report only for interested source files. " +
"Otherwise we will display the coverage report for all " +
"source files."
)
return parser
def display_file_coverage(per_file_coverage, total_coverage):
# To print out auto-adjustable column, we need to know the longest
# length of file names.
max_file_name_length = max(
len(fname) for fname in per_file_coverage.keys()
)
# -- Print header
# size of separator is determined by 3 column sizes:
# file name, coverage percentage and lines.
header_template = \
"%" + str(max_file_name_length) + "s\t%s\t%s"
separator = "-" * (max_file_name_length + 10 + 20)
print header_template % ("Filename", "Coverage", "Lines")
print separator
# -- Print body
# template for printing coverage report for each file.
record_template = "%" + str(max_file_name_length) + "s\t%5.2f%%\t%10d"
for fname, coverage_info in per_file_coverage.items():
coverage, lines = coverage_info
print record_template % (fname, coverage, lines)
# -- Print footer
if total_coverage:
print separator
print record_template % ("Total", total_coverage[0], total_coverage[1])
def report_coverage():
parser = get_option_parser()
(options, args) = parser.parse_args()
interested_files = set()
if options.filenames is not None:
interested_files = set(f.strip() for f in options.filenames.split(','))
# To make things simple, right now we only read gcov report from the input
per_file_coverage, total_coverage = parse_gcov_report(sys.stdin)
# Check if we need to display coverage info for interested files.
if len(interested_files):
per_file_coverage = dict(
(fname, per_file_coverage[fname]) for fname in interested_files
if fname in per_file_coverage
)
# If we only interested in several files, it makes no sense to report
# the total_coverage
total_coverage = None
if not len(per_file_coverage):
print >> sys.stderr, "Cannot find coverage info for the given files."
return
display_file_coverage(per_file_coverage, total_coverage)
if __name__ == "__main__":
report_coverage()

224
db/builder.cc Normal file
View File

@@ -0,0 +1,224 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/builder.h"
#include "db/dbformat.h"
#include "db/filename.h"
#include "db/merge_helper.h"
#include "db/table_cache.h"
#include "db/version_edit.h"
#include "rocksdb/db.h"
#include "rocksdb/env.h"
#include "rocksdb/iterator.h"
#include "rocksdb/options.h"
#include "rocksdb/table.h"
#include "table/block_based_table_builder.h"
#include "util/stop_watch.h"
namespace rocksdb {
class TableFactory;
TableBuilder* NewTableBuilder(const Options& options,
const InternalKeyComparator& internal_comparator,
WritableFile* file,
CompressionType compression_type) {
return options.table_factory->NewTableBuilder(options, internal_comparator,
file, compression_type);
}
Status BuildTable(const std::string& dbname, Env* env, const Options& options,
const EnvOptions& soptions, TableCache* table_cache,
Iterator* iter, FileMetaData* meta,
const InternalKeyComparator& internal_comparator,
const SequenceNumber newest_snapshot,
const SequenceNumber earliest_seqno_in_memtable,
const CompressionType compression) {
Status s;
meta->file_size = 0;
meta->smallest_seqno = meta->largest_seqno = 0;
iter->SeekToFirst();
// If the sequence number of the smallest entry in the memtable is
// smaller than the most recent snapshot, then we do not trigger
// removal of duplicate/deleted keys as part of this builder.
bool purge = options.purge_redundant_kvs_while_flush;
if (earliest_seqno_in_memtable <= newest_snapshot) {
purge = false;
}
std::string fname = TableFileName(dbname, meta->number);
if (iter->Valid()) {
unique_ptr<WritableFile> file;
s = env->NewWritableFile(fname, &file, soptions);
if (!s.ok()) {
return s;
}
TableBuilder* builder =
NewTableBuilder(options, internal_comparator, file.get(), compression);
// the first key is the smallest key
Slice key = iter->key();
meta->smallest.DecodeFrom(key);
meta->smallest_seqno = GetInternalKeySeqno(key);
meta->largest_seqno = meta->smallest_seqno;
MergeHelper merge(internal_comparator.user_comparator(),
options.merge_operator.get(), options.info_log.get(),
options.min_partial_merge_operands,
true /* internal key corruption is not ok */);
if (purge) {
// Ugly walkaround to avoid compiler error for release build
bool ok __attribute__((unused)) = true;
// Will write to builder if current key != prev key
ParsedInternalKey prev_ikey;
std::string prev_key;
bool is_first_key = true; // Also write if this is the very first key
while (iter->Valid()) {
bool iterator_at_next = false;
// Get current key
ParsedInternalKey this_ikey;
Slice key = iter->key();
Slice value = iter->value();
// In-memory key corruption is not ok;
// TODO: find a clean way to treat in memory key corruption
ok = ParseInternalKey(key, &this_ikey);
assert(ok);
assert(this_ikey.sequence >= earliest_seqno_in_memtable);
// If the key is the same as the previous key (and it is not the
// first key), then we skip it, since it is an older version.
// Otherwise we output the key and mark it as the "new" previous key.
if (!is_first_key && !internal_comparator.user_comparator()->Compare(
prev_ikey.user_key, this_ikey.user_key)) {
// seqno within the same key are in decreasing order
assert(this_ikey.sequence < prev_ikey.sequence);
} else {
is_first_key = false;
if (this_ikey.type == kTypeMerge) {
// Handle merge-type keys using the MergeHelper
// TODO: pass statistics to MergeUntil
merge.MergeUntil(iter, 0 /* don't worry about snapshot */);
iterator_at_next = true;
if (merge.IsSuccess()) {
// Merge completed correctly.
// Add the resulting merge key/value and continue to next
builder->Add(merge.key(), merge.value());
prev_key.assign(merge.key().data(), merge.key().size());
ok = ParseInternalKey(Slice(prev_key), &prev_ikey);
assert(ok);
} else {
// Merge did not find a Put/Delete.
// Can not compact these merges into a kValueType.
// Write them out one-by-one. (Proceed back() to front())
const std::deque<std::string>& keys = merge.keys();
const std::deque<std::string>& values = merge.values();
assert(keys.size() == values.size() && keys.size() >= 1);
std::deque<std::string>::const_reverse_iterator key_iter;
std::deque<std::string>::const_reverse_iterator value_iter;
for (key_iter=keys.rbegin(), value_iter = values.rbegin();
key_iter != keys.rend() && value_iter != values.rend();
++key_iter, ++value_iter) {
builder->Add(Slice(*key_iter), Slice(*value_iter));
}
// Sanity check. Both iterators should end at the same time
assert(key_iter == keys.rend() && value_iter == values.rend());
prev_key.assign(keys.front());
ok = ParseInternalKey(Slice(prev_key), &prev_ikey);
assert(ok);
}
} else {
// Handle Put/Delete-type keys by simply writing them
builder->Add(key, value);
prev_key.assign(key.data(), key.size());
ok = ParseInternalKey(Slice(prev_key), &prev_ikey);
assert(ok);
}
}
if (!iterator_at_next) iter->Next();
}
// The last key is the largest key
meta->largest.DecodeFrom(Slice(prev_key));
SequenceNumber seqno = GetInternalKeySeqno(Slice(prev_key));
meta->smallest_seqno = std::min(meta->smallest_seqno, seqno);
meta->largest_seqno = std::max(meta->largest_seqno, seqno);
} else {
for (; iter->Valid(); iter->Next()) {
Slice key = iter->key();
meta->largest.DecodeFrom(key);
builder->Add(key, iter->value());
SequenceNumber seqno = GetInternalKeySeqno(key);
meta->smallest_seqno = std::min(meta->smallest_seqno, seqno);
meta->largest_seqno = std::max(meta->largest_seqno, seqno);
}
}
// Finish and check for builder errors
if (s.ok()) {
s = builder->Finish();
if (s.ok()) {
meta->file_size = builder->FileSize();
assert(meta->file_size > 0);
}
} else {
builder->Abandon();
}
delete builder;
// Finish and check for file errors
if (s.ok() && !options.disableDataSync) {
if (options.use_fsync) {
StopWatch sw(env, options.statistics.get(), TABLE_SYNC_MICROS);
s = file->Fsync();
} else {
StopWatch sw(env, options.statistics.get(), TABLE_SYNC_MICROS);
s = file->Sync();
}
}
if (s.ok()) {
s = file->Close();
}
if (s.ok()) {
// Verify that the table is usable
Iterator* it = table_cache->NewIterator(ReadOptions(), soptions,
internal_comparator, *meta);
s = it->status();
delete it;
}
}
// Check for input iterator errors
if (!iter->status().ok()) {
s = iter->status();
}
if (s.ok() && meta->file_size > 0) {
// Keep it
} else {
env->DeleteFile(fname);
}
return s;
}
} // namespace rocksdb

45
db/builder.h Normal file
View File

@@ -0,0 +1,45 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#pragma once
#include "rocksdb/comparator.h"
#include "rocksdb/status.h"
#include "rocksdb/types.h"
#include "rocksdb/options.h"
namespace rocksdb {
struct Options;
struct FileMetaData;
class Env;
struct EnvOptions;
class Iterator;
class TableCache;
class VersionEdit;
class TableBuilder;
class WritableFile;
extern TableBuilder* NewTableBuilder(
const Options& options, const InternalKeyComparator& internal_comparator,
WritableFile* file, CompressionType compression_type);
// Build a Table file from the contents of *iter. The generated file
// will be named according to meta->number. On success, the rest of
// *meta will be filled with metadata about the generated table.
// If no data is present in *iter, meta->file_size will be set to
// zero, and no Table file will be produced.
extern Status BuildTable(const std::string& dbname, Env* env,
const Options& options, const EnvOptions& soptions,
TableCache* table_cache, Iterator* iter,
FileMetaData* meta,
const InternalKeyComparator& internal_comparator,
const SequenceNumber newest_snapshot,
const SequenceNumber earliest_seqno_in_memtable,
const CompressionType compression);
} // namespace rocksdb

1476
db/c.cc Normal file

File diff suppressed because it is too large Load Diff

494
db/c_test.c Normal file
View File

@@ -0,0 +1,494 @@
/* Copyright (c) 2011 The LevelDB Authors. All rights reserved.
Use of this source code is governed by a BSD-style license that can be
found in the LICENSE file. See the AUTHORS file for names of contributors. */
#include "rocksdb/c.h"
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>
const char* phase = "";
static char dbname[200];
static void StartPhase(const char* name) {
fprintf(stderr, "=== Test %s\n", name);
phase = name;
}
static const char* GetTempDir(void) {
const char* ret = getenv("TEST_TMPDIR");
if (ret == NULL || ret[0] == '\0')
ret = "/tmp";
return ret;
}
#define CheckNoError(err) \
if ((err) != NULL) { \
fprintf(stderr, "%s:%d: %s: %s\n", __FILE__, __LINE__, phase, (err)); \
abort(); \
}
#define CheckCondition(cond) \
if (!(cond)) { \
fprintf(stderr, "%s:%d: %s: %s\n", __FILE__, __LINE__, phase, #cond); \
abort(); \
}
static void CheckEqual(const char* expected, const char* v, size_t n) {
if (expected == NULL && v == NULL) {
// ok
} else if (expected != NULL && v != NULL && n == strlen(expected) &&
memcmp(expected, v, n) == 0) {
// ok
return;
} else {
fprintf(stderr, "%s: expected '%s', got '%s'\n",
phase,
(expected ? expected : "(null)"),
(v ? v : "(null"));
abort();
}
}
static void Free(char** ptr) {
if (*ptr) {
free(*ptr);
*ptr = NULL;
}
}
static void CheckGet(
rocksdb_t* db,
const rocksdb_readoptions_t* options,
const char* key,
const char* expected) {
char* err = NULL;
size_t val_len;
char* val;
val = rocksdb_get(db, options, key, strlen(key), &val_len, &err);
CheckNoError(err);
CheckEqual(expected, val, val_len);
Free(&val);
}
static void CheckIter(rocksdb_iterator_t* iter,
const char* key, const char* val) {
size_t len;
const char* str;
str = rocksdb_iter_key(iter, &len);
CheckEqual(key, str, len);
str = rocksdb_iter_value(iter, &len);
CheckEqual(val, str, len);
}
// Callback from rocksdb_writebatch_iterate()
static void CheckPut(void* ptr,
const char* k, size_t klen,
const char* v, size_t vlen) {
int* state = (int*) ptr;
CheckCondition(*state < 2);
switch (*state) {
case 0:
CheckEqual("bar", k, klen);
CheckEqual("b", v, vlen);
break;
case 1:
CheckEqual("box", k, klen);
CheckEqual("c", v, vlen);
break;
}
(*state)++;
}
// Callback from rocksdb_writebatch_iterate()
static void CheckDel(void* ptr, const char* k, size_t klen) {
int* state = (int*) ptr;
CheckCondition(*state == 2);
CheckEqual("bar", k, klen);
(*state)++;
}
static void CmpDestroy(void* arg) { }
static int CmpCompare(void* arg, const char* a, size_t alen,
const char* b, size_t blen) {
int n = (alen < blen) ? alen : blen;
int r = memcmp(a, b, n);
if (r == 0) {
if (alen < blen) r = -1;
else if (alen > blen) r = +1;
}
return r;
}
static const char* CmpName(void* arg) {
return "foo";
}
// Custom filter policy
static unsigned char fake_filter_result = 1;
static void FilterDestroy(void* arg) { }
static const char* FilterName(void* arg) {
return "TestFilter";
}
static char* FilterCreate(
void* arg,
const char* const* key_array, const size_t* key_length_array,
int num_keys,
size_t* filter_length) {
*filter_length = 4;
char* result = malloc(4);
memcpy(result, "fake", 4);
return result;
}
static unsigned char FilterKeyMatch(
void* arg,
const char* key, size_t length,
const char* filter, size_t filter_length) {
CheckCondition(filter_length == 4);
CheckCondition(memcmp(filter, "fake", 4) == 0);
return fake_filter_result;
}
// Custom merge operator
static void MergeOperatorDestroy(void* arg) { }
static const char* MergeOperatorName(void* arg) {
return "TestMergeOperator";
}
static char* MergeOperatorFullMerge(
void* arg,
const char* key, size_t key_length,
const char* existing_value, size_t existing_value_length,
const char* const* operands_list, const size_t* operands_list_length,
int num_operands,
unsigned char* success, size_t* new_value_length) {
*new_value_length = 4;
*success = 1;
char* result = malloc(4);
memcpy(result, "fake", 4);
return result;
}
static char* MergeOperatorPartialMerge(
void* arg,
const char* key, size_t key_length,
const char* const* operands_list, const size_t* operands_list_length,
int num_operands,
unsigned char* success, size_t* new_value_length) {
*new_value_length = 4;
*success = 1;
char* result = malloc(4);
memcpy(result, "fake", 4);
return result;
}
int main(int argc, char** argv) {
rocksdb_t* db;
rocksdb_comparator_t* cmp;
rocksdb_cache_t* cache;
rocksdb_env_t* env;
rocksdb_options_t* options;
rocksdb_readoptions_t* roptions;
rocksdb_writeoptions_t* woptions;
char* err = NULL;
int run = -1;
snprintf(dbname, sizeof(dbname),
"%s/rocksdb_c_test-%d",
GetTempDir(),
((int) geteuid()));
StartPhase("create_objects");
cmp = rocksdb_comparator_create(NULL, CmpDestroy, CmpCompare, CmpName);
env = rocksdb_create_default_env();
cache = rocksdb_cache_create_lru(100000);
options = rocksdb_options_create();
rocksdb_options_set_comparator(options, cmp);
rocksdb_options_set_error_if_exists(options, 1);
rocksdb_options_set_cache(options, cache);
rocksdb_options_set_env(options, env);
rocksdb_options_set_info_log(options, NULL);
rocksdb_options_set_write_buffer_size(options, 100000);
rocksdb_options_set_paranoid_checks(options, 1);
rocksdb_options_set_max_open_files(options, 10);
rocksdb_options_set_block_size(options, 1024);
rocksdb_options_set_block_restart_interval(options, 8);
rocksdb_options_set_compression(options, rocksdb_no_compression);
rocksdb_options_set_compression_options(options, -14, -1, 0);
int compression_levels[] = {rocksdb_no_compression, rocksdb_no_compression,
rocksdb_no_compression, rocksdb_no_compression};
rocksdb_options_set_compression_per_level(options, compression_levels, 4);
roptions = rocksdb_readoptions_create();
rocksdb_readoptions_set_verify_checksums(roptions, 1);
rocksdb_readoptions_set_fill_cache(roptions, 0);
woptions = rocksdb_writeoptions_create();
rocksdb_writeoptions_set_sync(woptions, 1);
StartPhase("destroy");
rocksdb_destroy_db(options, dbname, &err);
Free(&err);
StartPhase("open_error");
db = rocksdb_open(options, dbname, &err);
CheckCondition(err != NULL);
Free(&err);
StartPhase("open");
rocksdb_options_set_create_if_missing(options, 1);
db = rocksdb_open(options, dbname, &err);
CheckNoError(err);
CheckGet(db, roptions, "foo", NULL);
StartPhase("put");
rocksdb_put(db, woptions, "foo", 3, "hello", 5, &err);
CheckNoError(err);
CheckGet(db, roptions, "foo", "hello");
StartPhase("compactall");
rocksdb_compact_range(db, NULL, 0, NULL, 0);
CheckGet(db, roptions, "foo", "hello");
StartPhase("compactrange");
rocksdb_compact_range(db, "a", 1, "z", 1);
CheckGet(db, roptions, "foo", "hello");
StartPhase("writebatch");
{
rocksdb_writebatch_t* wb = rocksdb_writebatch_create();
rocksdb_writebatch_put(wb, "foo", 3, "a", 1);
rocksdb_writebatch_clear(wb);
rocksdb_writebatch_put(wb, "bar", 3, "b", 1);
rocksdb_writebatch_put(wb, "box", 3, "c", 1);
rocksdb_writebatch_delete(wb, "bar", 3);
rocksdb_write(db, woptions, wb, &err);
CheckNoError(err);
CheckGet(db, roptions, "foo", "hello");
CheckGet(db, roptions, "bar", NULL);
CheckGet(db, roptions, "box", "c");
int pos = 0;
rocksdb_writebatch_iterate(wb, &pos, CheckPut, CheckDel);
CheckCondition(pos == 3);
rocksdb_writebatch_destroy(wb);
}
StartPhase("iter");
{
rocksdb_iterator_t* iter = rocksdb_create_iterator(db, roptions);
CheckCondition(!rocksdb_iter_valid(iter));
rocksdb_iter_seek_to_first(iter);
CheckCondition(rocksdb_iter_valid(iter));
CheckIter(iter, "box", "c");
rocksdb_iter_next(iter);
CheckIter(iter, "foo", "hello");
rocksdb_iter_prev(iter);
CheckIter(iter, "box", "c");
rocksdb_iter_prev(iter);
CheckCondition(!rocksdb_iter_valid(iter));
rocksdb_iter_seek_to_last(iter);
CheckIter(iter, "foo", "hello");
rocksdb_iter_seek(iter, "b", 1);
CheckIter(iter, "box", "c");
rocksdb_iter_get_error(iter, &err);
CheckNoError(err);
rocksdb_iter_destroy(iter);
}
StartPhase("approximate_sizes");
{
int i;
int n = 20000;
char keybuf[100];
char valbuf[100];
uint64_t sizes[2];
const char* start[2] = { "a", "k00000000000000010000" };
size_t start_len[2] = { 1, 21 };
const char* limit[2] = { "k00000000000000010000", "z" };
size_t limit_len[2] = { 21, 1 };
rocksdb_writeoptions_set_sync(woptions, 0);
for (i = 0; i < n; i++) {
snprintf(keybuf, sizeof(keybuf), "k%020d", i);
snprintf(valbuf, sizeof(valbuf), "v%020d", i);
rocksdb_put(db, woptions, keybuf, strlen(keybuf), valbuf, strlen(valbuf),
&err);
CheckNoError(err);
}
rocksdb_approximate_sizes(db, 2, start, start_len, limit, limit_len, sizes);
CheckCondition(sizes[0] > 0);
CheckCondition(sizes[1] > 0);
}
StartPhase("property");
{
char* prop = rocksdb_property_value(db, "nosuchprop");
CheckCondition(prop == NULL);
prop = rocksdb_property_value(db, "rocksdb.stats");
CheckCondition(prop != NULL);
Free(&prop);
}
StartPhase("snapshot");
{
const rocksdb_snapshot_t* snap;
snap = rocksdb_create_snapshot(db);
rocksdb_delete(db, woptions, "foo", 3, &err);
CheckNoError(err);
rocksdb_readoptions_set_snapshot(roptions, snap);
CheckGet(db, roptions, "foo", "hello");
rocksdb_readoptions_set_snapshot(roptions, NULL);
CheckGet(db, roptions, "foo", NULL);
rocksdb_release_snapshot(db, snap);
}
StartPhase("repair");
{
// If we do not compact here, then the lazy deletion of
// files (https://reviews.facebook.net/D6123) would leave
// around deleted files and the repair process will find
// those files and put them back into the database.
rocksdb_compact_range(db, NULL, 0, NULL, 0);
rocksdb_close(db);
rocksdb_options_set_create_if_missing(options, 0);
rocksdb_options_set_error_if_exists(options, 0);
rocksdb_repair_db(options, dbname, &err);
CheckNoError(err);
db = rocksdb_open(options, dbname, &err);
CheckNoError(err);
CheckGet(db, roptions, "foo", NULL);
CheckGet(db, roptions, "bar", NULL);
CheckGet(db, roptions, "box", "c");
rocksdb_options_set_create_if_missing(options, 1);
rocksdb_options_set_error_if_exists(options, 1);
}
StartPhase("filter");
for (run = 0; run < 2; run++) {
// First run uses custom filter, second run uses bloom filter
CheckNoError(err);
rocksdb_filterpolicy_t* policy;
if (run == 0) {
policy = rocksdb_filterpolicy_create(
NULL, FilterDestroy, FilterCreate, FilterKeyMatch, NULL, FilterName);
} else {
policy = rocksdb_filterpolicy_create_bloom(10);
}
// Create new database
rocksdb_close(db);
rocksdb_destroy_db(options, dbname, &err);
rocksdb_options_set_filter_policy(options, policy);
db = rocksdb_open(options, dbname, &err);
CheckNoError(err);
rocksdb_put(db, woptions, "foo", 3, "foovalue", 8, &err);
CheckNoError(err);
rocksdb_put(db, woptions, "bar", 3, "barvalue", 8, &err);
CheckNoError(err);
rocksdb_compact_range(db, NULL, 0, NULL, 0);
fake_filter_result = 1;
CheckGet(db, roptions, "foo", "foovalue");
CheckGet(db, roptions, "bar", "barvalue");
if (phase == 0) {
// Must not find value when custom filter returns false
fake_filter_result = 0;
CheckGet(db, roptions, "foo", NULL);
CheckGet(db, roptions, "bar", NULL);
fake_filter_result = 1;
CheckGet(db, roptions, "foo", "foovalue");
CheckGet(db, roptions, "bar", "barvalue");
}
rocksdb_options_set_filter_policy(options, NULL);
rocksdb_filterpolicy_destroy(policy);
}
StartPhase("merge_operator");
{
rocksdb_mergeoperator_t* merge_operator;
merge_operator = rocksdb_mergeoperator_create(
NULL, MergeOperatorDestroy, MergeOperatorFullMerge,
MergeOperatorPartialMerge, NULL, MergeOperatorName);
// Create new database
rocksdb_close(db);
rocksdb_destroy_db(options, dbname, &err);
rocksdb_options_set_merge_operator(options, merge_operator);
db = rocksdb_open(options, dbname, &err);
CheckNoError(err);
rocksdb_put(db, woptions, "foo", 3, "foovalue", 8, &err);
CheckNoError(err);
CheckGet(db, roptions, "foo", "foovalue");
rocksdb_merge(db, woptions, "foo", 3, "barvalue", 8, &err);
CheckNoError(err);
CheckGet(db, roptions, "foo", "fake");
// Merge of a non-existing value
rocksdb_merge(db, woptions, "bar", 3, "barvalue", 8, &err);
CheckNoError(err);
CheckGet(db, roptions, "bar", "fake");
}
StartPhase("prefix");
{
// Create new database
rocksdb_close(db);
rocksdb_destroy_db(options, dbname, &err);
rocksdb_filterpolicy_t* policy = rocksdb_filterpolicy_create_bloom(10);
rocksdb_options_set_filter_policy(options, policy);
rocksdb_options_set_prefix_extractor(options, rocksdb_slicetransform_create_fixed_prefix(3));
rocksdb_options_set_hash_skip_list_rep(options, 50000, 4, 4);
rocksdb_options_set_plain_table_factory(options, 4, 10, 0.75, 16);
db = rocksdb_open(options, dbname, &err);
CheckNoError(err);
rocksdb_put(db, woptions, "foo1", 4, "foo", 3, &err);
CheckNoError(err);
rocksdb_put(db, woptions, "foo2", 4, "foo", 3, &err);
CheckNoError(err);
rocksdb_put(db, woptions, "foo3", 4, "foo", 3, &err);
CheckNoError(err);
rocksdb_put(db, woptions, "bar1", 4, "bar", 3, &err);
CheckNoError(err);
rocksdb_put(db, woptions, "bar2", 4, "bar", 3, &err);
CheckNoError(err);
rocksdb_put(db, woptions, "bar3", 4, "bar", 3, &err);
CheckNoError(err);
rocksdb_iterator_t* iter = rocksdb_create_iterator(db, roptions);
CheckCondition(!rocksdb_iter_valid(iter));
rocksdb_iter_seek(iter, "bar", 3);
rocksdb_iter_get_error(iter, &err);
CheckNoError(err);
CheckCondition(rocksdb_iter_valid(iter));
CheckIter(iter, "bar1", "bar");
rocksdb_iter_next(iter);
CheckIter(iter, "bar2", "bar");
rocksdb_iter_next(iter);
CheckIter(iter, "bar3", "bar");
rocksdb_iter_get_error(iter, &err);
CheckNoError(err);
rocksdb_iter_destroy(iter);
rocksdb_filterpolicy_destroy(policy);
}
StartPhase("cleanup");
rocksdb_close(db);
rocksdb_options_destroy(options);
rocksdb_readoptions_destroy(roptions);
rocksdb_writeoptions_destroy(woptions);
rocksdb_cache_destroy(cache);
rocksdb_comparator_destroy(cmp);
rocksdb_env_destroy(env);
fprintf(stderr, "PASS\n");
return 0;
}

604
db/column_family.cc Normal file
View File

@@ -0,0 +1,604 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/column_family.h"
#include <vector>
#include <string>
#include <algorithm>
#include <limits>
#include "db/db_impl.h"
#include "db/version_set.h"
#include "db/internal_stats.h"
#include "db/compaction_picker.h"
#include "db/table_properties_collector.h"
#include "util/autovector.h"
#include "util/hash_skiplist_rep.h"
namespace rocksdb {
ColumnFamilyHandleImpl::ColumnFamilyHandleImpl(ColumnFamilyData* cfd,
DBImpl* db, port::Mutex* mutex)
: cfd_(cfd), db_(db), mutex_(mutex) {
if (cfd_ != nullptr) {
cfd_->Ref();
}
}
ColumnFamilyHandleImpl::~ColumnFamilyHandleImpl() {
if (cfd_ != nullptr) {
DBImpl::DeletionState deletion_state;
mutex_->Lock();
if (cfd_->Unref()) {
delete cfd_;
}
db_->FindObsoleteFiles(deletion_state, false, true);
mutex_->Unlock();
if (deletion_state.HaveSomethingToDelete()) {
db_->PurgeObsoleteFiles(deletion_state);
}
}
}
uint32_t ColumnFamilyHandleImpl::GetID() const { return cfd()->GetID(); }
namespace {
// Fix user-supplied options to be reasonable
template <class T, class V>
static void ClipToRange(T* ptr, V minvalue, V maxvalue) {
if (static_cast<V>(*ptr) > maxvalue) *ptr = maxvalue;
if (static_cast<V>(*ptr) < minvalue) *ptr = minvalue;
}
} // anonymous namespace
ColumnFamilyOptions SanitizeOptions(const InternalKeyComparator* icmp,
const InternalFilterPolicy* ipolicy,
const ColumnFamilyOptions& src) {
ColumnFamilyOptions result = src;
result.comparator = icmp;
result.filter_policy = (src.filter_policy != nullptr) ? ipolicy : nullptr;
#ifdef OS_MACOSX
// TODO(icanadi) make write_buffer_size uint64_t instead of size_t
ClipToRange(&result.write_buffer_size, ((size_t)64) << 10, ((size_t)1) << 30);
#else
ClipToRange(&result.write_buffer_size,
((size_t)64) << 10, ((size_t)64) << 30);
#endif
// if user sets arena_block_size, we trust user to use this value. Otherwise,
// calculate a proper value from writer_buffer_size;
if (result.arena_block_size <= 0) {
result.arena_block_size = result.write_buffer_size / 10;
}
result.min_write_buffer_number_to_merge =
std::min(result.min_write_buffer_number_to_merge,
result.max_write_buffer_number - 1);
if (result.block_cache == nullptr && !result.no_block_cache) {
result.block_cache = NewLRUCache(8 << 20);
}
result.compression_per_level = src.compression_per_level;
if (result.block_size_deviation < 0 || result.block_size_deviation > 100) {
result.block_size_deviation = 0;
}
if (result.max_mem_compaction_level >= result.num_levels) {
result.max_mem_compaction_level = result.num_levels - 1;
}
if (result.soft_rate_limit > result.hard_rate_limit) {
result.soft_rate_limit = result.hard_rate_limit;
}
if (!result.prefix_extractor) {
assert(result.memtable_factory);
Slice name = result.memtable_factory->Name();
if (name.compare("HashSkipListRepFactory") == 0 ||
name.compare("HashLinkListRepFactory") == 0) {
result.memtable_factory = std::make_shared<SkipListFactory>();
}
}
// -- Sanitize the table properties collector
// All user defined properties collectors will be wrapped by
// UserKeyTablePropertiesCollector since for them they only have the
// knowledge of the user keys; internal keys are invisible to them.
auto& collector_factories = result.table_properties_collector_factories;
for (size_t i = 0; i < result.table_properties_collector_factories.size();
++i) {
assert(collector_factories[i]);
collector_factories[i] =
std::make_shared<UserKeyTablePropertiesCollectorFactory>(
collector_factories[i]);
}
// Add collector to collect internal key statistics
collector_factories.push_back(
std::make_shared<InternalKeyPropertiesCollectorFactory>());
if (result.compaction_style == kCompactionStyleFIFO) {
result.num_levels = 1;
// since we delete level0 files in FIFO compaction when there are too many
// of them, these options don't really mean anything
result.level0_file_num_compaction_trigger = std::numeric_limits<int>::max();
result.level0_slowdown_writes_trigger = std::numeric_limits<int>::max();
result.level0_stop_writes_trigger = std::numeric_limits<int>::max();
}
return result;
}
int SuperVersion::dummy = 0;
void* const SuperVersion::kSVInUse = &SuperVersion::dummy;
void* const SuperVersion::kSVObsolete = nullptr;
SuperVersion::~SuperVersion() {
for (auto td : to_delete) {
delete td;
}
}
SuperVersion* SuperVersion::Ref() {
refs.fetch_add(1, std::memory_order_relaxed);
return this;
}
bool SuperVersion::Unref() {
// fetch_sub returns the previous value of ref
uint32_t previous_refs = refs.fetch_sub(1, std::memory_order_relaxed);
assert(previous_refs > 0);
return previous_refs == 1;
}
void SuperVersion::Cleanup() {
assert(refs.load(std::memory_order_relaxed) == 0);
imm->Unref(&to_delete);
MemTable* m = mem->Unref();
if (m != nullptr) {
to_delete.push_back(m);
}
current->Unref();
}
void SuperVersion::Init(MemTable* new_mem, MemTableListVersion* new_imm,
Version* new_current) {
mem = new_mem;
imm = new_imm;
current = new_current;
mem->Ref();
imm->Ref();
current->Ref();
refs.store(1, std::memory_order_relaxed);
}
namespace {
void SuperVersionUnrefHandle(void* ptr) {
// UnrefHandle is called when a thread exists or a ThreadLocalPtr gets
// destroyed. When former happens, the thread shouldn't see kSVInUse.
// When latter happens, we are in ~ColumnFamilyData(), no get should happen as
// well.
SuperVersion* sv = static_cast<SuperVersion*>(ptr);
if (sv->Unref()) {
sv->db_mutex->Lock();
sv->Cleanup();
sv->db_mutex->Unlock();
delete sv;
}
}
} // anonymous namespace
ColumnFamilyData::ColumnFamilyData(const std::string& dbname, uint32_t id,
const std::string& name,
Version* dummy_versions, Cache* table_cache,
const ColumnFamilyOptions& options,
const DBOptions* db_options,
const EnvOptions& storage_options,
ColumnFamilySet* column_family_set)
: id_(id),
name_(name),
dummy_versions_(dummy_versions),
current_(nullptr),
refs_(0),
dropped_(false),
internal_comparator_(options.comparator),
internal_filter_policy_(options.filter_policy),
options_(*db_options, SanitizeOptions(&internal_comparator_,
&internal_filter_policy_, options)),
mem_(nullptr),
imm_(options_.min_write_buffer_number_to_merge),
super_version_(nullptr),
super_version_number_(0),
local_sv_(new ThreadLocalPtr(&SuperVersionUnrefHandle)),
next_(nullptr),
prev_(nullptr),
log_number_(0),
need_slowdown_for_num_level0_files_(false),
column_family_set_(column_family_set) {
Ref();
// if dummy_versions is nullptr, then this is a dummy column family.
if (dummy_versions != nullptr) {
internal_stats_.reset(new InternalStats(
options_.num_levels, db_options->env, db_options->statistics.get()));
table_cache_.reset(
new TableCache(dbname, &options_, storage_options, table_cache));
if (options_.compaction_style == kCompactionStyleUniversal) {
compaction_picker_.reset(
new UniversalCompactionPicker(&options_, &internal_comparator_));
} else if (options_.compaction_style == kCompactionStyleLevel) {
compaction_picker_.reset(
new LevelCompactionPicker(&options_, &internal_comparator_));
} else {
assert(options_.compaction_style == kCompactionStyleFIFO);
compaction_picker_.reset(
new FIFOCompactionPicker(&options_, &internal_comparator_));
}
Log(options_.info_log, "Options for column family \"%s\":\n",
name.c_str());
const ColumnFamilyOptions* cf_options = &options_;
cf_options->Dump(options_.info_log.get());
}
}
// DB mutex held
ColumnFamilyData::~ColumnFamilyData() {
assert(refs_ == 0);
// remove from linked list
auto prev = prev_;
auto next = next_;
prev->next_ = next;
next->prev_ = prev;
// it's nullptr for dummy CFD
if (column_family_set_ != nullptr) {
// remove from column_family_set
column_family_set_->RemoveColumnFamily(this);
}
if (current_ != nullptr) {
current_->Unref();
}
if (super_version_ != nullptr) {
// Release SuperVersion reference kept in ThreadLocalPtr.
// This must be done outside of mutex_ since unref handler can lock mutex.
super_version_->db_mutex->Unlock();
local_sv_.reset();
super_version_->db_mutex->Lock();
bool is_last_reference __attribute__((unused));
is_last_reference = super_version_->Unref();
assert(is_last_reference);
super_version_->Cleanup();
delete super_version_;
super_version_ = nullptr;
}
if (dummy_versions_ != nullptr) {
// List must be empty
assert(dummy_versions_->next_ == dummy_versions_);
delete dummy_versions_;
}
if (mem_ != nullptr) {
delete mem_->Unref();
}
autovector<MemTable*> to_delete;
imm_.current()->Unref(&to_delete);
for (MemTable* m : to_delete) {
delete m;
}
}
const EnvOptions* ColumnFamilyData::soptions() const {
return &(column_family_set_->storage_options_);
}
void ColumnFamilyData::SetCurrent(Version* current) {
current_ = current;
need_slowdown_for_num_level0_files_ =
(options_.level0_slowdown_writes_trigger >= 0 &&
current_->NumLevelFiles(0) >= options_.level0_slowdown_writes_trigger);
}
void ColumnFamilyData::CreateNewMemtable() {
assert(current_ != nullptr);
if (mem_ != nullptr) {
delete mem_->Unref();
}
mem_ = new MemTable(internal_comparator_, options_);
mem_->Ref();
}
Compaction* ColumnFamilyData::PickCompaction(LogBuffer* log_buffer) {
return compaction_picker_->PickCompaction(current_, log_buffer);
}
Compaction* ColumnFamilyData::CompactRange(int input_level, int output_level,
const InternalKey* begin,
const InternalKey* end,
InternalKey** compaction_end) {
return compaction_picker_->CompactRange(current_, input_level, output_level,
begin, end, compaction_end);
}
SuperVersion* ColumnFamilyData::GetReferencedSuperVersion(
port::Mutex* db_mutex) {
SuperVersion* sv = nullptr;
if (LIKELY(column_family_set_->db_options_->allow_thread_local)) {
sv = GetThreadLocalSuperVersion(db_mutex);
sv->Ref();
if (!ReturnThreadLocalSuperVersion(sv)) {
sv->Unref();
}
} else {
db_mutex->Lock();
sv = super_version_->Ref();
db_mutex->Unlock();
}
return sv;
}
SuperVersion* ColumnFamilyData::GetThreadLocalSuperVersion(
port::Mutex* db_mutex) {
SuperVersion* sv = nullptr;
// The SuperVersion is cached in thread local storage to avoid acquiring
// mutex when SuperVersion does not change since the last use. When a new
// SuperVersion is installed, the compaction or flush thread cleans up
// cached SuperVersion in all existing thread local storage. To avoid
// acquiring mutex for this operation, we use atomic Swap() on the thread
// local pointer to guarantee exclusive access. If the thread local pointer
// is being used while a new SuperVersion is installed, the cached
// SuperVersion can become stale. In that case, the background thread would
// have swapped in kSVObsolete. We re-check the value at when returning
// SuperVersion back to thread local, with an atomic compare and swap.
// The superversion will need to be released if detected to be stale.
void* ptr = local_sv_->Swap(SuperVersion::kSVInUse);
// Invariant:
// (1) Scrape (always) installs kSVObsolete in ThreadLocal storage
// (2) the Swap above (always) installs kSVInUse, ThreadLocal storage
// should only keep kSVInUse before ReturnThreadLocalSuperVersion call
// (if no Scrape happens).
assert(ptr != SuperVersion::kSVInUse);
sv = static_cast<SuperVersion*>(ptr);
if (sv == SuperVersion::kSVObsolete ||
sv->version_number != super_version_number_.load()) {
RecordTick(options_.statistics.get(), NUMBER_SUPERVERSION_ACQUIRES);
SuperVersion* sv_to_delete = nullptr;
if (sv && sv->Unref()) {
RecordTick(options_.statistics.get(), NUMBER_SUPERVERSION_CLEANUPS);
db_mutex->Lock();
// NOTE: underlying resources held by superversion (sst files) might
// not be released until the next background job.
sv->Cleanup();
sv_to_delete = sv;
} else {
db_mutex->Lock();
}
sv = super_version_->Ref();
db_mutex->Unlock();
delete sv_to_delete;
}
assert(sv != nullptr);
return sv;
}
bool ColumnFamilyData::ReturnThreadLocalSuperVersion(SuperVersion* sv) {
assert(sv != nullptr);
// Put the SuperVersion back
void* expected = SuperVersion::kSVInUse;
if (local_sv_->CompareAndSwap(static_cast<void*>(sv), expected)) {
// When we see kSVInUse in the ThreadLocal, we are sure ThreadLocal
// storage has not been altered and no Scrape has happend. The
// SuperVersion is still current.
return true;
} else {
// ThreadLocal scrape happened in the process of this GetImpl call (after
// thread local Swap() at the beginning and before CompareAndSwap()).
// This means the SuperVersion it holds is obsolete.
assert(expected == SuperVersion::kSVObsolete);
}
return false;
}
SuperVersion* ColumnFamilyData::InstallSuperVersion(
SuperVersion* new_superversion, port::Mutex* db_mutex) {
new_superversion->db_mutex = db_mutex;
new_superversion->Init(mem_, imm_.current(), current_);
SuperVersion* old_superversion = super_version_;
super_version_ = new_superversion;
++super_version_number_;
super_version_->version_number = super_version_number_;
// Reset SuperVersions cached in thread local storage
if (column_family_set_->db_options_->allow_thread_local) {
ResetThreadLocalSuperVersions();
}
if (old_superversion != nullptr && old_superversion->Unref()) {
old_superversion->Cleanup();
return old_superversion; // will let caller delete outside of mutex
}
return nullptr;
}
void ColumnFamilyData::ResetThreadLocalSuperVersions() {
autovector<void*> sv_ptrs;
local_sv_->Scrape(&sv_ptrs, SuperVersion::kSVObsolete);
for (auto ptr : sv_ptrs) {
assert(ptr);
if (ptr == SuperVersion::kSVInUse) {
continue;
}
auto sv = static_cast<SuperVersion*>(ptr);
if (sv->Unref()) {
sv->Cleanup();
delete sv;
}
}
}
ColumnFamilySet::ColumnFamilySet(const std::string& dbname,
const DBOptions* db_options,
const EnvOptions& storage_options,
Cache* table_cache)
: max_column_family_(0),
dummy_cfd_(new ColumnFamilyData(dbname, 0, "", nullptr, nullptr,
ColumnFamilyOptions(), db_options,
storage_options_, nullptr)),
default_cfd_cache_(nullptr),
db_name_(dbname),
db_options_(db_options),
storage_options_(storage_options),
table_cache_(table_cache),
spin_lock_(ATOMIC_FLAG_INIT) {
// initialize linked list
dummy_cfd_->prev_ = dummy_cfd_;
dummy_cfd_->next_ = dummy_cfd_;
}
ColumnFamilySet::~ColumnFamilySet() {
while (column_family_data_.size() > 0) {
// cfd destructor will delete itself from column_family_data_
auto cfd = column_family_data_.begin()->second;
cfd->Unref();
delete cfd;
}
dummy_cfd_->Unref();
delete dummy_cfd_;
}
ColumnFamilyData* ColumnFamilySet::GetDefault() const {
assert(default_cfd_cache_ != nullptr);
return default_cfd_cache_;
}
ColumnFamilyData* ColumnFamilySet::GetColumnFamily(uint32_t id) const {
auto cfd_iter = column_family_data_.find(id);
if (cfd_iter != column_family_data_.end()) {
return cfd_iter->second;
} else {
return nullptr;
}
}
ColumnFamilyData* ColumnFamilySet::GetColumnFamily(const std::string& name)
const {
auto cfd_iter = column_families_.find(name);
if (cfd_iter != column_families_.end()) {
auto cfd = GetColumnFamily(cfd_iter->second);
assert(cfd != nullptr);
return cfd;
} else {
return nullptr;
}
}
uint32_t ColumnFamilySet::GetNextColumnFamilyID() {
return ++max_column_family_;
}
uint32_t ColumnFamilySet::GetMaxColumnFamily() { return max_column_family_; }
void ColumnFamilySet::UpdateMaxColumnFamily(uint32_t new_max_column_family) {
max_column_family_ = std::max(new_max_column_family, max_column_family_);
}
size_t ColumnFamilySet::NumberOfColumnFamilies() const {
return column_families_.size();
}
// under a DB mutex
ColumnFamilyData* ColumnFamilySet::CreateColumnFamily(
const std::string& name, uint32_t id, Version* dummy_versions,
const ColumnFamilyOptions& options) {
assert(column_families_.find(name) == column_families_.end());
ColumnFamilyData* new_cfd =
new ColumnFamilyData(db_name_, id, name, dummy_versions, table_cache_,
options, db_options_, storage_options_, this);
Lock();
column_families_.insert({name, id});
column_family_data_.insert({id, new_cfd});
Unlock();
max_column_family_ = std::max(max_column_family_, id);
// add to linked list
new_cfd->next_ = dummy_cfd_;
auto prev = dummy_cfd_->prev_;
new_cfd->prev_ = prev;
prev->next_ = new_cfd;
dummy_cfd_->prev_ = new_cfd;
if (id == 0) {
default_cfd_cache_ = new_cfd;
}
return new_cfd;
}
void ColumnFamilySet::Lock() {
// spin lock
while (spin_lock_.test_and_set(std::memory_order_acquire)) {
}
}
void ColumnFamilySet::Unlock() { spin_lock_.clear(std::memory_order_release); }
// REQUIRES: DB mutex held
void ColumnFamilySet::FreeDeadColumnFamilies() {
autovector<ColumnFamilyData*> to_delete;
for (auto cfd = dummy_cfd_->next_; cfd != dummy_cfd_; cfd = cfd->next_) {
if (cfd->refs_ == 0) {
to_delete.push_back(cfd);
}
}
for (auto cfd : to_delete) {
// this is very rare, so it's not a problem that we do it under a mutex
delete cfd;
}
}
// under a DB mutex
void ColumnFamilySet::RemoveColumnFamily(ColumnFamilyData* cfd) {
auto cfd_iter = column_family_data_.find(cfd->GetID());
assert(cfd_iter != column_family_data_.end());
Lock();
column_family_data_.erase(cfd_iter);
column_families_.erase(cfd->GetName());
Unlock();
}
bool ColumnFamilyMemTablesImpl::Seek(uint32_t column_family_id) {
if (column_family_id == 0) {
// optimization for common case
current_ = column_family_set_->GetDefault();
} else {
// maybe outside of db mutex, should lock
column_family_set_->Lock();
current_ = column_family_set_->GetColumnFamily(column_family_id);
column_family_set_->Unlock();
}
handle_.SetCFD(current_);
return current_ != nullptr;
}
uint64_t ColumnFamilyMemTablesImpl::GetLogNumber() const {
assert(current_ != nullptr);
return current_->GetLogNumber();
}
MemTable* ColumnFamilyMemTablesImpl::GetMemTable() const {
assert(current_ != nullptr);
return current_->mem();
}
const Options* ColumnFamilyMemTablesImpl::GetOptions() const {
assert(current_ != nullptr);
return current_->options();
}
ColumnFamilyHandle* ColumnFamilyMemTablesImpl::GetColumnFamilyHandle() {
assert(current_ != nullptr);
return &handle_;
}
} // namespace rocksdb

419
db/column_family.h Normal file
View File

@@ -0,0 +1,419 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#pragma once
#include <unordered_map>
#include <string>
#include <vector>
#include <atomic>
#include "rocksdb/options.h"
#include "rocksdb/db.h"
#include "rocksdb/env.h"
#include "db/memtable_list.h"
#include "db/write_batch_internal.h"
#include "db/table_cache.h"
#include "util/thread_local.h"
namespace rocksdb {
class Version;
class VersionSet;
class MemTable;
class MemTableListVersion;
class CompactionPicker;
class Compaction;
class InternalKey;
class InternalStats;
class ColumnFamilyData;
class DBImpl;
class LogBuffer;
// ColumnFamilyHandleImpl is the class that clients use to access different
// column families. It has non-trivial destructor, which gets called when client
// is done using the column family
class ColumnFamilyHandleImpl : public ColumnFamilyHandle {
public:
// create while holding the mutex
ColumnFamilyHandleImpl(ColumnFamilyData* cfd, DBImpl* db, port::Mutex* mutex);
// destroy without mutex
virtual ~ColumnFamilyHandleImpl();
virtual ColumnFamilyData* cfd() const { return cfd_; }
virtual uint32_t GetID() const;
private:
ColumnFamilyData* cfd_;
DBImpl* db_;
port::Mutex* mutex_;
};
// Does not ref-count ColumnFamilyData
// We use this dummy ColumnFamilyHandleImpl because sometimes MemTableInserter
// calls DBImpl methods. When this happens, MemTableInserter need access to
// ColumnFamilyHandle (same as the client would need). In that case, we feed
// MemTableInserter dummy ColumnFamilyHandle and enable it to call DBImpl
// methods
class ColumnFamilyHandleInternal : public ColumnFamilyHandleImpl {
public:
ColumnFamilyHandleInternal()
: ColumnFamilyHandleImpl(nullptr, nullptr, nullptr) {}
void SetCFD(ColumnFamilyData* cfd) { internal_cfd_ = cfd; }
virtual ColumnFamilyData* cfd() const override { return internal_cfd_; }
private:
ColumnFamilyData* internal_cfd_;
};
// holds references to memtable, all immutable memtables and version
struct SuperVersion {
MemTable* mem;
MemTableListVersion* imm;
Version* current;
std::atomic<uint32_t> refs;
// We need to_delete because during Cleanup(), imm->Unref() returns
// all memtables that we need to free through this vector. We then
// delete all those memtables outside of mutex, during destruction
autovector<MemTable*> to_delete;
// Version number of the current SuperVersion
uint64_t version_number;
port::Mutex* db_mutex;
// should be called outside the mutex
SuperVersion() = default;
~SuperVersion();
SuperVersion* Ref();
bool Unref();
// call these two methods with db mutex held
// Cleanup unrefs mem, imm and current. Also, it stores all memtables
// that needs to be deleted in to_delete vector. Unrefing those
// objects needs to be done in the mutex
void Cleanup();
void Init(MemTable* new_mem, MemTableListVersion* new_imm,
Version* new_current);
// The value of dummy is not actually used. kSVInUse takes its address as a
// mark in the thread local storage to indicate the SuperVersion is in use
// by thread. This way, the value of kSVInUse is guaranteed to have no
// conflict with SuperVersion object address and portable on different
// platform.
static int dummy;
static void* const kSVInUse;
static void* const kSVObsolete;
};
extern ColumnFamilyOptions SanitizeOptions(const InternalKeyComparator* icmp,
const InternalFilterPolicy* ipolicy,
const ColumnFamilyOptions& src);
class ColumnFamilySet;
// This class keeps all the data that a column family needs. It's mosly dumb and
// used just to provide access to metadata.
// Most methods require DB mutex held, unless otherwise noted
class ColumnFamilyData {
public:
~ColumnFamilyData();
// thread-safe
uint32_t GetID() const { return id_; }
// thread-safe
const std::string& GetName() const { return name_; }
void Ref() { ++refs_; }
// will just decrease reference count to 0, but will not delete it. returns
// true if the ref count was decreased to zero. in that case, it can be
// deleted by the caller immediatelly, or later, by calling
// FreeDeadColumnFamilies()
bool Unref() {
assert(refs_ > 0);
return --refs_ == 0;
}
// This can only be called from single-threaded VersionSet::LogAndApply()
// After dropping column family no other operation on that column family
// will be executed. All the files and memory will be, however, kept around
// until client drops the column family handle. That way, client can still
// access data from dropped column family.
// Column family can be dropped and still alive. In that state:
// *) Column family is not included in the iteration.
// *) Compaction and flush is not executed on the dropped column family.
// *) Client can continue writing and reading from column family. However, all
// writes stay in the current memtable.
// When the dropped column family is unreferenced, then we:
// *) delete all memory associated with that column family
// *) delete all the files associated with that column family
void SetDropped() {
// can't drop default CF
assert(id_ != 0);
dropped_ = true;
}
bool IsDropped() const { return dropped_; }
// thread-safe
int NumberLevels() const { return options_.num_levels; }
void SetLogNumber(uint64_t log_number) { log_number_ = log_number; }
uint64_t GetLogNumber() const { return log_number_; }
// thread-safe
const Options* options() const { return &options_; }
const EnvOptions* soptions() const;
InternalStats* internal_stats() { return internal_stats_.get(); }
MemTableList* imm() { return &imm_; }
MemTable* mem() { return mem_; }
Version* current() { return current_; }
Version* dummy_versions() { return dummy_versions_; }
void SetMemtable(MemTable* new_mem) { mem_ = new_mem; }
void SetCurrent(Version* current);
void CreateNewMemtable();
TableCache* table_cache() const { return table_cache_.get(); }
// See documentation in compaction_picker.h
Compaction* PickCompaction(LogBuffer* log_buffer);
Compaction* CompactRange(int input_level, int output_level,
const InternalKey* begin, const InternalKey* end,
InternalKey** compaction_end);
CompactionPicker* compaction_picker() { return compaction_picker_.get(); }
// thread-safe
const Comparator* user_comparator() const {
return internal_comparator_.user_comparator();
}
// thread-safe
const InternalKeyComparator& internal_comparator() const {
return internal_comparator_;
}
SuperVersion* GetSuperVersion() { return super_version_; }
// thread-safe
// Return a already referenced SuperVersion to be used safely.
SuperVersion* GetReferencedSuperVersion(port::Mutex* db_mutex);
// thread-safe
// Get SuperVersion stored in thread local storage. If it does not exist,
// get a reference from a current SuperVersion.
SuperVersion* GetThreadLocalSuperVersion(port::Mutex* db_mutex);
// Try to return SuperVersion back to thread local storage. Retrun true on
// success and false on failure. It fails when the thread local storage
// contains anything other than SuperVersion::kSVInUse flag.
bool ReturnThreadLocalSuperVersion(SuperVersion* sv);
// thread-safe
uint64_t GetSuperVersionNumber() const {
return super_version_number_.load();
}
// will return a pointer to SuperVersion* if previous SuperVersion
// if its reference count is zero and needs deletion or nullptr if not
// As argument takes a pointer to allocated SuperVersion to enable
// the clients to allocate SuperVersion outside of mutex.
SuperVersion* InstallSuperVersion(SuperVersion* new_superversion,
port::Mutex* db_mutex);
void ResetThreadLocalSuperVersions();
// A Flag indicating whether write needs to slowdown because of there are
// too many number of level0 files.
bool NeedSlowdownForNumLevel0Files() const {
return need_slowdown_for_num_level0_files_;
}
private:
friend class ColumnFamilySet;
ColumnFamilyData(const std::string& dbname, uint32_t id,
const std::string& name, Version* dummy_versions,
Cache* table_cache, const ColumnFamilyOptions& options,
const DBOptions* db_options,
const EnvOptions& storage_options,
ColumnFamilySet* column_family_set);
uint32_t id_;
const std::string name_;
Version* dummy_versions_; // Head of circular doubly-linked list of versions.
Version* current_; // == dummy_versions->prev_
int refs_; // outstanding references to ColumnFamilyData
bool dropped_; // true if client dropped it
const InternalKeyComparator internal_comparator_;
const InternalFilterPolicy internal_filter_policy_;
Options const options_;
std::unique_ptr<TableCache> table_cache_;
std::unique_ptr<InternalStats> internal_stats_;
MemTable* mem_;
MemTableList imm_;
SuperVersion* super_version_;
// An ordinal representing the current SuperVersion. Updated by
// InstallSuperVersion(), i.e. incremented every time super_version_
// changes.
std::atomic<uint64_t> super_version_number_;
// Thread's local copy of SuperVersion pointer
// This needs to be destructed before mutex_
std::unique_ptr<ThreadLocalPtr> local_sv_;
// pointers for a circular linked list. we use it to support iterations
// that can be concurrent with writes
ColumnFamilyData* next_;
ColumnFamilyData* prev_;
// This is the earliest log file number that contains data from this
// Column Family. All earlier log files must be ignored and not
// recovered from
uint64_t log_number_;
// A flag indicating whether we should delay writes because
// we have too many level 0 files
bool need_slowdown_for_num_level0_files_;
// An object that keeps all the compaction stats
// and picks the next compaction
std::unique_ptr<CompactionPicker> compaction_picker_;
ColumnFamilySet* column_family_set_;
};
// ColumnFamilySet has interesting thread-safety requirements
// * CreateColumnFamily() or RemoveColumnFamily() -- need to protect by DB
// mutex. Inside, column_family_data_ and column_families_ will be protected
// by Lock() and Unlock(). CreateColumnFamily() should ONLY be called from
// VersionSet::LogAndApply() in the normal runtime. It is also called
// during Recovery and in DumpManifest(). RemoveColumnFamily() is called
// from ColumnFamilyData destructor
// * Iteration -- hold DB mutex, but you can release it in the body of
// iteration. If you release DB mutex in body, reference the column
// family before the mutex and unreference after you unlock, since the column
// family might get dropped when the DB mutex is released
// * GetDefault() -- thread safe
// * GetColumnFamily() -- either inside of DB mutex or call Lock() <-> Unlock()
// * GetNextColumnFamilyID(), GetMaxColumnFamily(), UpdateMaxColumnFamily(),
// NumberOfColumnFamilies -- inside of DB mutex
class ColumnFamilySet {
public:
// ColumnFamilySet supports iteration
class iterator {
public:
explicit iterator(ColumnFamilyData* cfd)
: current_(cfd) {}
iterator& operator++() {
// dummy is never dead or dropped, so this will never be infinite
do {
current_ = current_->next_;
} while (current_->refs_ == 0 || current_->IsDropped());
return *this;
}
bool operator!=(const iterator& other) {
return this->current_ != other.current_;
}
ColumnFamilyData* operator*() { return current_; }
private:
ColumnFamilyData* current_;
};
ColumnFamilySet(const std::string& dbname, const DBOptions* db_options,
const EnvOptions& storage_options, Cache* table_cache);
~ColumnFamilySet();
ColumnFamilyData* GetDefault() const;
// GetColumnFamily() calls return nullptr if column family is not found
ColumnFamilyData* GetColumnFamily(uint32_t id) const;
ColumnFamilyData* GetColumnFamily(const std::string& name) const;
// this call will return the next available column family ID. it guarantees
// that there is no column family with id greater than or equal to the
// returned value in the current running instance or anytime in RocksDB
// instance history.
uint32_t GetNextColumnFamilyID();
uint32_t GetMaxColumnFamily();
void UpdateMaxColumnFamily(uint32_t new_max_column_family);
size_t NumberOfColumnFamilies() const;
ColumnFamilyData* CreateColumnFamily(const std::string& name, uint32_t id,
Version* dummy_version,
const ColumnFamilyOptions& options);
iterator begin() { return iterator(dummy_cfd_->next_); }
iterator end() { return iterator(dummy_cfd_); }
void Lock();
void Unlock();
// REQUIRES: DB mutex held
// Don't call while iterating over ColumnFamilySet
void FreeDeadColumnFamilies();
private:
friend class ColumnFamilyData;
// helper function that gets called from cfd destructor
// REQUIRES: DB mutex held
void RemoveColumnFamily(ColumnFamilyData* cfd);
// column_families_ and column_family_data_ need to be protected:
// * when mutating: 1. DB mutex locked first, 2. spinlock locked second
// * when reading, either: 1. lock DB mutex, or 2. lock spinlock
// (if both, respect the ordering to avoid deadlock!)
std::unordered_map<std::string, uint32_t> column_families_;
std::unordered_map<uint32_t, ColumnFamilyData*> column_family_data_;
uint32_t max_column_family_;
ColumnFamilyData* dummy_cfd_;
// We don't hold the refcount here, since default column family always exists
// We are also not responsible for cleaning up default_cfd_cache_. This is
// just a cache that makes common case (accessing default column family)
// faster
ColumnFamilyData* default_cfd_cache_;
const std::string db_name_;
const DBOptions* const db_options_;
const EnvOptions storage_options_;
Cache* table_cache_;
std::atomic_flag spin_lock_;
};
// We use ColumnFamilyMemTablesImpl to provide WriteBatch a way to access
// memtables of different column families (specified by ID in the write batch)
class ColumnFamilyMemTablesImpl : public ColumnFamilyMemTables {
public:
explicit ColumnFamilyMemTablesImpl(ColumnFamilySet* column_family_set)
: column_family_set_(column_family_set), current_(nullptr) {}
// sets current_ to ColumnFamilyData with column_family_id
// returns false if column family doesn't exist
bool Seek(uint32_t column_family_id) override;
// Returns log number of the selected column family
uint64_t GetLogNumber() const override;
// REQUIRES: Seek() called first
virtual MemTable* GetMemTable() const override;
// Returns options for selected column family
// REQUIRES: Seek() called first
virtual const Options* GetOptions() const override;
// Returns column family handle for the selected column family
virtual ColumnFamilyHandle* GetColumnFamilyHandle() override;
private:
ColumnFamilySet* column_family_set_;
ColumnFamilyData* current_;
ColumnFamilyHandleInternal handle_;
};
} // namespace rocksdb

977
db/column_family_test.cc Normal file
View File

@@ -0,0 +1,977 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include <algorithm>
#include <vector>
#include <string>
#include "db/db_impl.h"
#include "rocksdb/env.h"
#include "rocksdb/db.h"
#include "util/testharness.h"
#include "util/testutil.h"
#include "util/coding.h"
#include "utilities/merge_operators.h"
namespace rocksdb {
namespace {
std::string RandomString(Random* rnd, int len) {
std::string r;
test::RandomString(rnd, len, &r);
return r;
}
} // anonymous namespace
// counts how many operations were performed
class EnvCounter : public EnvWrapper {
public:
explicit EnvCounter(Env* base)
: EnvWrapper(base), num_new_writable_file_(0) {}
int GetNumberOfNewWritableFileCalls() {
return num_new_writable_file_;
}
Status NewWritableFile(const std::string& f, unique_ptr<WritableFile>* r,
const EnvOptions& soptions) {
++num_new_writable_file_;
return EnvWrapper::NewWritableFile(f, r, soptions);
}
private:
int num_new_writable_file_;
};
class ColumnFamilyTest {
public:
ColumnFamilyTest() : rnd_(139) {
env_ = new EnvCounter(Env::Default());
dbname_ = test::TmpDir() + "/column_family_test";
db_options_.create_if_missing = true;
db_options_.env = env_;
DestroyDB(dbname_, Options(db_options_, column_family_options_));
}
~ColumnFamilyTest() {
delete env_;
}
void Close() {
for (auto h : handles_) {
delete h;
}
handles_.clear();
names_.clear();
delete db_;
db_ = nullptr;
}
Status TryOpen(std::vector<std::string> cf,
std::vector<ColumnFamilyOptions> options = {}) {
std::vector<ColumnFamilyDescriptor> column_families;
names_.clear();
for (size_t i = 0; i < cf.size(); ++i) {
column_families.push_back(ColumnFamilyDescriptor(
cf[i], options.size() == 0 ? column_family_options_ : options[i]));
names_.push_back(cf[i]);
}
return DB::Open(db_options_, dbname_, column_families, &handles_, &db_);
}
Status OpenReadOnly(std::vector<std::string> cf,
std::vector<ColumnFamilyOptions> options = {}) {
std::vector<ColumnFamilyDescriptor> column_families;
names_.clear();
for (size_t i = 0; i < cf.size(); ++i) {
column_families.push_back(ColumnFamilyDescriptor(
cf[i], options.size() == 0 ? column_family_options_ : options[i]));
names_.push_back(cf[i]);
}
return DB::OpenForReadOnly(db_options_, dbname_, column_families, &handles_,
&db_);
}
void AssertOpenReadOnly(std::vector<std::string> cf,
std::vector<ColumnFamilyOptions> options = {}) {
ASSERT_OK(OpenReadOnly(cf, options));
}
void Open(std::vector<std::string> cf,
std::vector<ColumnFamilyOptions> options = {}) {
ASSERT_OK(TryOpen(cf, options));
}
void Open() {
Open({"default"});
}
DBImpl* dbfull() { return reinterpret_cast<DBImpl*>(db_); }
int GetProperty(int cf, std::string property) {
std::string value;
ASSERT_TRUE(dbfull()->GetProperty(handles_[cf], property, &value));
return std::stoi(value);
}
void Destroy() {
for (auto h : handles_) {
delete h;
}
handles_.clear();
names_.clear();
delete db_;
db_ = nullptr;
ASSERT_OK(DestroyDB(dbname_, Options(db_options_, column_family_options_)));
}
void CreateColumnFamilies(
const std::vector<std::string>& cfs,
const std::vector<ColumnFamilyOptions> options = {}) {
int cfi = handles_.size();
handles_.resize(cfi + cfs.size());
names_.resize(cfi + cfs.size());
for (size_t i = 0; i < cfs.size(); ++i) {
ASSERT_OK(db_->CreateColumnFamily(
options.size() == 0 ? column_family_options_ : options[i], cfs[i],
&handles_[cfi]));
names_[cfi] = cfs[i];
cfi++;
}
}
void Reopen(const std::vector<ColumnFamilyOptions> options = {}) {
std::vector<std::string> names;
for (auto name : names_) {
if (name != "") {
names.push_back(name);
}
}
Close();
assert(options.size() == 0 || names.size() == options.size());
Open(names, options);
}
void CreateColumnFamiliesAndReopen(const std::vector<std::string>& cfs) {
CreateColumnFamilies(cfs);
Reopen();
}
void DropColumnFamilies(const std::vector<int>& cfs) {
for (auto cf : cfs) {
ASSERT_OK(db_->DropColumnFamily(handles_[cf]));
delete handles_[cf];
handles_[cf] = nullptr;
names_[cf] = "";
}
}
void PutRandomData(int cf, int num, int key_value_size) {
for (int i = 0; i < num; ++i) {
// 10 bytes for key, rest is value
ASSERT_OK(Put(cf, test::RandomKey(&rnd_, 10),
RandomString(&rnd_, key_value_size - 10)));
}
}
void WaitForFlush(int cf) {
ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[cf]));
}
void WaitForCompaction() { ASSERT_OK(dbfull()->TEST_WaitForCompact()); }
Status Put(int cf, const std::string& key, const std::string& value) {
return db_->Put(WriteOptions(), handles_[cf], Slice(key), Slice(value));
}
Status Merge(int cf, const std::string& key, const std::string& value) {
return db_->Merge(WriteOptions(), handles_[cf], Slice(key), Slice(value));
}
Status Flush(int cf) {
return db_->Flush(FlushOptions(), handles_[cf]);
}
std::string Get(int cf, const std::string& key) {
ReadOptions options;
options.verify_checksums = true;
std::string result;
Status s = db_->Get(options, handles_[cf], Slice(key), &result);
if (s.IsNotFound()) {
result = "NOT_FOUND";
} else if (!s.ok()) {
result = s.ToString();
}
return result;
}
void CompactAll(int cf) {
ASSERT_OK(db_->CompactRange(handles_[cf], nullptr, nullptr));
}
void Compact(int cf, const Slice& start, const Slice& limit) {
ASSERT_OK(db_->CompactRange(handles_[cf], &start, &limit));
}
int NumTableFilesAtLevel(int level, int cf) {
return GetProperty(cf,
"rocksdb.num-files-at-level" + std::to_string(level));
}
// Return spread of files per level
std::string FilesPerLevel(int cf) {
std::string result;
int last_non_zero_offset = 0;
for (int level = 0; level < dbfull()->NumberLevels(handles_[cf]); level++) {
int f = NumTableFilesAtLevel(level, cf);
char buf[100];
snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f);
result += buf;
if (f > 0) {
last_non_zero_offset = result.size();
}
}
result.resize(last_non_zero_offset);
return result;
}
int CountLiveFiles() {
std::vector<LiveFileMetaData> metadata;
db_->GetLiveFilesMetaData(&metadata);
return static_cast<int>(metadata.size());
}
// Do n memtable flushes, each of which produces an sstable
// covering the range [small,large].
void MakeTables(int cf, int n, const std::string& small,
const std::string& large) {
for (int i = 0; i < n; i++) {
ASSERT_OK(Put(cf, small, "begin"));
ASSERT_OK(Put(cf, large, "end"));
ASSERT_OK(db_->Flush(FlushOptions(), handles_[cf]));
}
}
int CountLiveLogFiles() {
int micros_wait_for_log_deletion = 20000;
env_->SleepForMicroseconds(micros_wait_for_log_deletion);
int ret = 0;
VectorLogPtr wal_files;
Status s;
// GetSortedWalFiles is a flakey function -- it gets all the wal_dir
// children files and then later checks for their existance. if some of the
// log files doesn't exist anymore, it reports an error. it does all of this
// without DB mutex held, so if a background process deletes the log file
// while the function is being executed, it returns an error. We retry the
// function 10 times to avoid the error failing the test
for (int retries = 0; retries < 10; ++retries) {
wal_files.clear();
s = db_->GetSortedWalFiles(wal_files);
if (s.ok()) {
break;
}
}
ASSERT_OK(s);
for (const auto& wal : wal_files) {
if (wal->Type() == kAliveLogFile) {
++ret;
}
}
return ret;
}
void AssertNumberOfImmutableMemtables(std::vector<int> num_per_cf) {
assert(num_per_cf.size() == handles_.size());
for (size_t i = 0; i < num_per_cf.size(); ++i) {
ASSERT_EQ(num_per_cf[i],
GetProperty(i, "rocksdb.num-immutable-mem-table"));
}
}
void CopyFile(const std::string& source, const std::string& destination,
uint64_t size = 0) {
const EnvOptions soptions;
unique_ptr<SequentialFile> srcfile;
ASSERT_OK(env_->NewSequentialFile(source, &srcfile, soptions));
unique_ptr<WritableFile> destfile;
ASSERT_OK(env_->NewWritableFile(destination, &destfile, soptions));
if (size == 0) {
// default argument means copy everything
ASSERT_OK(env_->GetFileSize(source, &size));
}
char buffer[4096];
Slice slice;
while (size > 0) {
uint64_t one = std::min(uint64_t(sizeof(buffer)), size);
ASSERT_OK(srcfile->Read(one, &slice, buffer));
ASSERT_OK(destfile->Append(slice));
size -= slice.size();
}
ASSERT_OK(destfile->Close());
}
std::vector<ColumnFamilyHandle*> handles_;
std::vector<std::string> names_;
ColumnFamilyOptions column_family_options_;
DBOptions db_options_;
std::string dbname_;
DB* db_ = nullptr;
EnvCounter* env_;
Random rnd_;
};
TEST(ColumnFamilyTest, DontReuseColumnFamilyID) {
for (int iter = 0; iter < 3; ++iter) {
Open();
CreateColumnFamilies({"one", "two", "three"});
for (size_t i = 0; i < handles_.size(); ++i) {
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(handles_[i]);
ASSERT_EQ(i, cfh->GetID());
}
if (iter == 1) {
Reopen();
}
DropColumnFamilies({3});
Reopen();
if (iter == 2) {
// this tests if max_column_family is correctly persisted with
// WriteSnapshot()
Reopen();
}
CreateColumnFamilies({"three2"});
// ID 3 that was used for dropped column family "three" should not be reused
auto cfh3 = reinterpret_cast<ColumnFamilyHandleImpl*>(handles_[3]);
ASSERT_EQ(4U, cfh3->GetID());
Close();
Destroy();
}
}
TEST(ColumnFamilyTest, AddDrop) {
Open();
CreateColumnFamilies({"one", "two", "three"});
ASSERT_EQ("NOT_FOUND", Get(1, "fodor"));
ASSERT_EQ("NOT_FOUND", Get(2, "fodor"));
DropColumnFamilies({2});
ASSERT_EQ("NOT_FOUND", Get(1, "fodor"));
CreateColumnFamilies({"four"});
ASSERT_EQ("NOT_FOUND", Get(3, "fodor"));
ASSERT_OK(Put(1, "fodor", "mirko"));
ASSERT_EQ("mirko", Get(1, "fodor"));
ASSERT_EQ("NOT_FOUND", Get(3, "fodor"));
Close();
ASSERT_TRUE(TryOpen({"default"}).IsInvalidArgument());
Open({"default", "one", "three", "four"});
DropColumnFamilies({1});
Reopen();
Close();
std::vector<std::string> families;
ASSERT_OK(DB::ListColumnFamilies(db_options_, dbname_, &families));
sort(families.begin(), families.end());
ASSERT_TRUE(families ==
std::vector<std::string>({"default", "four", "three"}));
}
TEST(ColumnFamilyTest, DropTest) {
// first iteration - dont reopen DB before dropping
// second iteration - reopen DB before dropping
for (int iter = 0; iter < 2; ++iter) {
Open({"default"});
CreateColumnFamiliesAndReopen({"pikachu"});
for (int i = 0; i < 100; ++i) {
ASSERT_OK(Put(1, std::to_string(i), "bar" + std::to_string(i)));
}
ASSERT_OK(Flush(1));
if (iter == 1) {
Reopen();
}
ASSERT_EQ("bar1", Get(1, "1"));
ASSERT_EQ(CountLiveFiles(), 1);
DropColumnFamilies({1});
// make sure that all files are deleted when we drop the column family
ASSERT_EQ(CountLiveFiles(), 0);
Destroy();
}
}
TEST(ColumnFamilyTest, WriteBatchFailure) {
Open();
CreateColumnFamiliesAndReopen({"one", "two"});
WriteBatch batch;
batch.Put(handles_[1], Slice("non-existing"), Slice("column-family"));
ASSERT_OK(db_->Write(WriteOptions(), &batch));
DropColumnFamilies({1});
Status s = db_->Write(WriteOptions(), &batch);
ASSERT_TRUE(s.IsInvalidArgument());
Close();
}
TEST(ColumnFamilyTest, ReadWrite) {
Open();
CreateColumnFamiliesAndReopen({"one", "two"});
ASSERT_OK(Put(0, "foo", "v1"));
ASSERT_OK(Put(0, "bar", "v2"));
ASSERT_OK(Put(1, "mirko", "v3"));
ASSERT_OK(Put(0, "foo", "v2"));
ASSERT_OK(Put(2, "fodor", "v5"));
for (int iter = 0; iter <= 3; ++iter) {
ASSERT_EQ("v2", Get(0, "foo"));
ASSERT_EQ("v2", Get(0, "bar"));
ASSERT_EQ("v3", Get(1, "mirko"));
ASSERT_EQ("v5", Get(2, "fodor"));
ASSERT_EQ("NOT_FOUND", Get(0, "fodor"));
ASSERT_EQ("NOT_FOUND", Get(1, "fodor"));
ASSERT_EQ("NOT_FOUND", Get(2, "foo"));
if (iter <= 1) {
Reopen();
}
}
Close();
}
TEST(ColumnFamilyTest, IgnoreRecoveredLog) {
std::string backup_logs = dbname_ + "/backup_logs";
// delete old files in backup_logs directory
ASSERT_OK(env_->CreateDirIfMissing(dbname_));
ASSERT_OK(env_->CreateDirIfMissing(backup_logs));
std::vector<std::string> old_files;
env_->GetChildren(backup_logs, &old_files);
for (auto& file : old_files) {
if (file != "." && file != "..") {
env_->DeleteFile(backup_logs + "/" + file);
}
}
column_family_options_.merge_operator =
MergeOperators::CreateUInt64AddOperator();
db_options_.wal_dir = dbname_ + "/logs";
Destroy();
Open();
CreateColumnFamilies({"cf1", "cf2"});
// fill up the DB
std::string one, two, three;
PutFixed64(&one, 1);
PutFixed64(&two, 2);
PutFixed64(&three, 3);
ASSERT_OK(Merge(0, "foo", one));
ASSERT_OK(Merge(1, "mirko", one));
ASSERT_OK(Merge(0, "foo", one));
ASSERT_OK(Merge(2, "bla", one));
ASSERT_OK(Merge(2, "fodor", one));
ASSERT_OK(Merge(0, "bar", one));
ASSERT_OK(Merge(2, "bla", one));
ASSERT_OK(Merge(1, "mirko", two));
ASSERT_OK(Merge(1, "franjo", one));
// copy the logs to backup
std::vector<std::string> logs;
env_->GetChildren(db_options_.wal_dir, &logs);
for (auto& log : logs) {
if (log != ".." && log != ".") {
CopyFile(db_options_.wal_dir + "/" + log, backup_logs + "/" + log);
}
}
// recover the DB
Close();
// 1. check consistency
// 2. copy the logs from backup back to WAL dir. if the recovery happens
// again on the same log files, this should lead to incorrect results
// due to applying merge operator twice
// 3. check consistency
for (int iter = 0; iter < 2; ++iter) {
// assert consistency
Open({"default", "cf1", "cf2"});
ASSERT_EQ(two, Get(0, "foo"));
ASSERT_EQ(one, Get(0, "bar"));
ASSERT_EQ(three, Get(1, "mirko"));
ASSERT_EQ(one, Get(1, "franjo"));
ASSERT_EQ(one, Get(2, "fodor"));
ASSERT_EQ(two, Get(2, "bla"));
Close();
if (iter == 0) {
// copy the logs from backup back to wal dir
for (auto& log : logs) {
if (log != ".." && log != ".") {
CopyFile(backup_logs + "/" + log, db_options_.wal_dir + "/" + log);
}
}
}
}
}
TEST(ColumnFamilyTest, FlushTest) {
Open();
CreateColumnFamiliesAndReopen({"one", "two"});
ASSERT_OK(Put(0, "foo", "v1"));
ASSERT_OK(Put(0, "bar", "v2"));
ASSERT_OK(Put(1, "mirko", "v3"));
ASSERT_OK(Put(0, "foo", "v2"));
ASSERT_OK(Put(2, "fodor", "v5"));
for (int i = 0; i < 3; ++i) {
Flush(i);
}
Reopen();
for (int iter = 0; iter <= 2; ++iter) {
ASSERT_EQ("v2", Get(0, "foo"));
ASSERT_EQ("v2", Get(0, "bar"));
ASSERT_EQ("v3", Get(1, "mirko"));
ASSERT_EQ("v5", Get(2, "fodor"));
ASSERT_EQ("NOT_FOUND", Get(0, "fodor"));
ASSERT_EQ("NOT_FOUND", Get(1, "fodor"));
ASSERT_EQ("NOT_FOUND", Get(2, "foo"));
if (iter <= 1) {
Reopen();
}
}
Close();
}
// Makes sure that obsolete log files get deleted
TEST(ColumnFamilyTest, LogDeletionTest) {
db_options_.max_total_wal_size = std::numeric_limits<uint64_t>::max();
column_family_options_.write_buffer_size = 100000; // 100KB
Open();
CreateColumnFamilies({"one", "two", "three", "four"});
// Each bracket is one log file. if number is in (), it means
// we don't need it anymore (it's been flushed)
// []
ASSERT_EQ(CountLiveLogFiles(), 0);
PutRandomData(0, 1, 100);
// [0]
PutRandomData(1, 1, 100);
// [0, 1]
PutRandomData(1, 1000, 100);
WaitForFlush(1);
// [0, (1)] [1]
ASSERT_EQ(CountLiveLogFiles(), 2);
PutRandomData(0, 1, 100);
// [0, (1)] [0, 1]
ASSERT_EQ(CountLiveLogFiles(), 2);
PutRandomData(2, 1, 100);
// [0, (1)] [0, 1, 2]
PutRandomData(2, 1000, 100);
WaitForFlush(2);
// [0, (1)] [0, 1, (2)] [2]
ASSERT_EQ(CountLiveLogFiles(), 3);
PutRandomData(2, 1000, 100);
WaitForFlush(2);
// [0, (1)] [0, 1, (2)] [(2)] [2]
ASSERT_EQ(CountLiveLogFiles(), 4);
PutRandomData(3, 1, 100);
// [0, (1)] [0, 1, (2)] [(2)] [2, 3]
PutRandomData(1, 1, 100);
// [0, (1)] [0, 1, (2)] [(2)] [1, 2, 3]
ASSERT_EQ(CountLiveLogFiles(), 4);
PutRandomData(1, 1000, 100);
WaitForFlush(1);
// [0, (1)] [0, (1), (2)] [(2)] [(1), 2, 3] [1]
ASSERT_EQ(CountLiveLogFiles(), 5);
PutRandomData(0, 1000, 100);
WaitForFlush(0);
// [(0), (1)] [(0), (1), (2)] [(2)] [(1), 2, 3] [1, (0)] [0]
// delete obsolete logs -->
// [(1), 2, 3] [1, (0)] [0]
ASSERT_EQ(CountLiveLogFiles(), 3);
PutRandomData(0, 1000, 100);
WaitForFlush(0);
// [(1), 2, 3] [1, (0)], [(0)] [0]
ASSERT_EQ(CountLiveLogFiles(), 4);
PutRandomData(1, 1000, 100);
WaitForFlush(1);
// [(1), 2, 3] [(1), (0)] [(0)] [0, (1)] [1]
ASSERT_EQ(CountLiveLogFiles(), 5);
PutRandomData(2, 1000, 100);
WaitForFlush(2);
// [(1), (2), 3] [(1), (0)] [(0)] [0, (1)] [1, (2)], [2]
ASSERT_EQ(CountLiveLogFiles(), 6);
PutRandomData(3, 1000, 100);
WaitForFlush(3);
// [(1), (2), (3)] [(1), (0)] [(0)] [0, (1)] [1, (2)], [2, (3)] [3]
// delete obsolete logs -->
// [0, (1)] [1, (2)], [2, (3)] [3]
ASSERT_EQ(CountLiveLogFiles(), 4);
Close();
}
// Makes sure that obsolete log files get deleted
TEST(ColumnFamilyTest, DifferentWriteBufferSizes) {
// disable flushing stale column families
db_options_.max_total_wal_size = std::numeric_limits<uint64_t>::max();
Open();
CreateColumnFamilies({"one", "two", "three"});
ColumnFamilyOptions default_cf, one, two, three;
// setup options. all column families have max_write_buffer_number setup to 10
// "default" -> 100KB memtable, start flushing immediatelly
// "one" -> 200KB memtable, start flushing with two immutable memtables
// "two" -> 1MB memtable, start flushing with three immutable memtables
// "three" -> 90KB memtable, start flushing with four immutable memtables
default_cf.write_buffer_size = 100000;
default_cf.max_write_buffer_number = 10;
default_cf.min_write_buffer_number_to_merge = 1;
one.write_buffer_size = 200000;
one.max_write_buffer_number = 10;
one.min_write_buffer_number_to_merge = 2;
two.write_buffer_size = 1000000;
two.max_write_buffer_number = 10;
two.min_write_buffer_number_to_merge = 3;
three.write_buffer_size = 90000;
three.max_write_buffer_number = 10;
three.min_write_buffer_number_to_merge = 4;
Reopen({default_cf, one, two, three});
int micros_wait_for_flush = 10000;
PutRandomData(0, 100, 1000);
WaitForFlush(0);
AssertNumberOfImmutableMemtables({0, 0, 0, 0});
ASSERT_EQ(CountLiveLogFiles(), 1);
PutRandomData(1, 200, 1000);
env_->SleepForMicroseconds(micros_wait_for_flush);
AssertNumberOfImmutableMemtables({0, 1, 0, 0});
ASSERT_EQ(CountLiveLogFiles(), 2);
PutRandomData(2, 1000, 1000);
env_->SleepForMicroseconds(micros_wait_for_flush);
AssertNumberOfImmutableMemtables({0, 1, 1, 0});
ASSERT_EQ(CountLiveLogFiles(), 3);
PutRandomData(2, 1000, 1000);
env_->SleepForMicroseconds(micros_wait_for_flush);
AssertNumberOfImmutableMemtables({0, 1, 2, 0});
ASSERT_EQ(CountLiveLogFiles(), 4);
PutRandomData(3, 90, 1000);
env_->SleepForMicroseconds(micros_wait_for_flush);
AssertNumberOfImmutableMemtables({0, 1, 2, 1});
ASSERT_EQ(CountLiveLogFiles(), 5);
PutRandomData(3, 90, 1000);
env_->SleepForMicroseconds(micros_wait_for_flush);
AssertNumberOfImmutableMemtables({0, 1, 2, 2});
ASSERT_EQ(CountLiveLogFiles(), 6);
PutRandomData(3, 90, 1000);
env_->SleepForMicroseconds(micros_wait_for_flush);
AssertNumberOfImmutableMemtables({0, 1, 2, 3});
ASSERT_EQ(CountLiveLogFiles(), 7);
PutRandomData(0, 100, 1000);
WaitForFlush(0);
AssertNumberOfImmutableMemtables({0, 1, 2, 3});
ASSERT_EQ(CountLiveLogFiles(), 8);
PutRandomData(2, 100, 10000);
WaitForFlush(2);
AssertNumberOfImmutableMemtables({0, 1, 0, 3});
ASSERT_EQ(CountLiveLogFiles(), 9);
PutRandomData(3, 90, 1000);
WaitForFlush(3);
AssertNumberOfImmutableMemtables({0, 1, 0, 0});
ASSERT_EQ(CountLiveLogFiles(), 10);
PutRandomData(3, 90, 1000);
env_->SleepForMicroseconds(micros_wait_for_flush);
AssertNumberOfImmutableMemtables({0, 1, 0, 1});
ASSERT_EQ(CountLiveLogFiles(), 11);
PutRandomData(1, 200, 1000);
WaitForFlush(1);
AssertNumberOfImmutableMemtables({0, 0, 0, 1});
ASSERT_EQ(CountLiveLogFiles(), 5);
PutRandomData(3, 90*6, 1000);
WaitForFlush(3);
AssertNumberOfImmutableMemtables({0, 0, 0, 0});
ASSERT_EQ(CountLiveLogFiles(), 12);
PutRandomData(0, 100, 1000);
WaitForFlush(0);
AssertNumberOfImmutableMemtables({0, 0, 0, 0});
ASSERT_EQ(CountLiveLogFiles(), 12);
PutRandomData(2, 3*100, 10000);
WaitForFlush(2);
AssertNumberOfImmutableMemtables({0, 0, 0, 0});
ASSERT_EQ(CountLiveLogFiles(), 12);
PutRandomData(1, 2*200, 1000);
WaitForFlush(1);
AssertNumberOfImmutableMemtables({0, 0, 0, 0});
ASSERT_EQ(CountLiveLogFiles(), 7);
Close();
}
TEST(ColumnFamilyTest, DifferentMergeOperators) {
Open();
CreateColumnFamilies({"first", "second"});
ColumnFamilyOptions default_cf, first, second;
first.merge_operator = MergeOperators::CreateUInt64AddOperator();
second.merge_operator = MergeOperators::CreateStringAppendOperator();
Reopen({default_cf, first, second});
std::string one, two, three;
PutFixed64(&one, 1);
PutFixed64(&two, 2);
PutFixed64(&three, 3);
ASSERT_OK(Put(0, "foo", two));
ASSERT_OK(Put(0, "foo", one));
ASSERT_TRUE(Merge(0, "foo", two).IsNotSupported());
ASSERT_EQ(Get(0, "foo"), one);
ASSERT_OK(Put(1, "foo", two));
ASSERT_OK(Put(1, "foo", one));
ASSERT_OK(Merge(1, "foo", two));
ASSERT_EQ(Get(1, "foo"), three);
ASSERT_OK(Put(2, "foo", two));
ASSERT_OK(Put(2, "foo", one));
ASSERT_OK(Merge(2, "foo", two));
ASSERT_EQ(Get(2, "foo"), one + "," + two);
Close();
}
TEST(ColumnFamilyTest, DifferentCompactionStyles) {
Open();
CreateColumnFamilies({"one", "two"});
ColumnFamilyOptions default_cf, one, two;
db_options_.max_open_files = 20; // only 10 files in file cache
db_options_.disableDataSync = true;
default_cf.compaction_style = kCompactionStyleLevel;
default_cf.num_levels = 3;
default_cf.write_buffer_size = 64 << 10; // 64KB
default_cf.target_file_size_base = 30 << 10;
default_cf.filter_policy = nullptr;
default_cf.no_block_cache = true;
default_cf.source_compaction_factor = 100;
default_cf.disable_seek_compaction = false;
one.compaction_style = kCompactionStyleUniversal;
// trigger compaction if there are >= 4 files
one.level0_file_num_compaction_trigger = 4;
one.write_buffer_size = 100000;
two.compaction_style = kCompactionStyleLevel;
two.num_levels = 4;
two.max_mem_compaction_level = 0;
two.level0_file_num_compaction_trigger = 3;
two.write_buffer_size = 100000;
Reopen({default_cf, one, two});
// SETUP column family "default" - test read compaction
ASSERT_EQ("", FilesPerLevel(0));
PutRandomData(0, 1, 4096);
ASSERT_OK(Flush(0));
ASSERT_EQ("0,0,1", FilesPerLevel(0));
// write 8MB
PutRandomData(0, 2000, 4096);
ASSERT_OK(Flush(0));
// clear levels 0 and 1
dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[0]);
dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[0]);
ASSERT_EQ(NumTableFilesAtLevel(0, 0), 0);
ASSERT_EQ(NumTableFilesAtLevel(1, 0), 0);
// write some new keys into level 0 and 1
PutRandomData(0, 1024, 512);
ASSERT_OK(Flush(0));
WaitForCompaction();
PutRandomData(0, 10, 512);
ASSERT_OK(Flush(0));
// remember number of files in each level
int l1 = NumTableFilesAtLevel(0, 0);
int l2 = NumTableFilesAtLevel(1, 0);
int l3 = NumTableFilesAtLevel(2, 0);
ASSERT_NE(l1, 0);
ASSERT_NE(l2, 0);
ASSERT_NE(l3, 0);
// SETUP column family "one" -- universal style
for (int i = 0; i < one.level0_file_num_compaction_trigger - 1; ++i) {
PutRandomData(1, 11, 10000);
WaitForFlush(1);
ASSERT_EQ(std::to_string(i + 1), FilesPerLevel(1));
}
// SETUP column family "two" -- level style with 4 levels
for (int i = 0; i < two.level0_file_num_compaction_trigger - 1; ++i) {
PutRandomData(2, 15, 10000);
WaitForFlush(2);
ASSERT_EQ(std::to_string(i + 1), FilesPerLevel(2));
}
// TRIGGER compaction "default"
// read a bunch of times, trigger read compaction
for (int i = 0; i < 200000; ++i) {
Get(0, std::to_string(i));
}
// TRIGGER compaction "one"
PutRandomData(1, 12, 10000);
// TRIGGER compaction "two"
PutRandomData(2, 10, 10000);
// WAIT for compactions
WaitForCompaction();
// VERIFY compaction "default"
// verify that the number of files have decreased
// in some level, indicating that there was a compaction
ASSERT_TRUE(NumTableFilesAtLevel(0, 0) < l1 ||
NumTableFilesAtLevel(1, 0) < l2 ||
NumTableFilesAtLevel(2, 0) < l3);
// VERIFY compaction "one"
ASSERT_EQ("1", FilesPerLevel(1));
// VERIFY compaction "two"
ASSERT_EQ("0,1", FilesPerLevel(2));
CompactAll(2);
ASSERT_EQ("0,1", FilesPerLevel(2));
Close();
}
namespace {
std::string IterStatus(Iterator* iter) {
std::string result;
if (iter->Valid()) {
result = iter->key().ToString() + "->" + iter->value().ToString();
} else {
result = "(invalid)";
}
return result;
}
} // anonymous namespace
TEST(ColumnFamilyTest, NewIteratorsTest) {
// iter == 0 -- no tailing
// iter == 2 -- tailing
for (int iter = 0; iter < 2; ++iter) {
Open();
CreateColumnFamiliesAndReopen({"one", "two"});
ASSERT_OK(Put(0, "a", "b"));
ASSERT_OK(Put(1, "b", "a"));
ASSERT_OK(Put(2, "c", "m"));
ASSERT_OK(Put(2, "v", "t"));
std::vector<Iterator*> iterators;
ReadOptions options;
options.tailing = (iter == 1);
ASSERT_OK(db_->NewIterators(options, handles_, &iterators));
for (auto it : iterators) {
it->SeekToFirst();
}
ASSERT_EQ(IterStatus(iterators[0]), "a->b");
ASSERT_EQ(IterStatus(iterators[1]), "b->a");
ASSERT_EQ(IterStatus(iterators[2]), "c->m");
ASSERT_OK(Put(1, "x", "x"));
for (auto it : iterators) {
it->Next();
}
ASSERT_EQ(IterStatus(iterators[0]), "(invalid)");
if (iter == 0) {
// no tailing
ASSERT_EQ(IterStatus(iterators[1]), "(invalid)");
} else {
// tailing
ASSERT_EQ(IterStatus(iterators[1]), "x->x");
}
ASSERT_EQ(IterStatus(iterators[2]), "v->t");
for (auto it : iterators) {
delete it;
}
Destroy();
}
}
TEST(ColumnFamilyTest, ReadOnlyDBTest) {
Open();
CreateColumnFamiliesAndReopen({"one", "two", "three", "four"});
ASSERT_OK(Put(1, "foo", "bla"));
ASSERT_OK(Put(2, "foo", "blabla"));
ASSERT_OK(Put(3, "foo", "blablabla"));
ASSERT_OK(Put(4, "foo", "blablablabla"));
DropColumnFamilies({2});
Close();
// open only a subset of column families
AssertOpenReadOnly({"default", "one", "four"});
ASSERT_EQ("NOT_FOUND", Get(0, "foo"));
ASSERT_EQ("bla", Get(1, "foo"));
ASSERT_EQ("blablablabla", Get(2, "foo"));
Close();
// can't open dropped column family
Status s = OpenReadOnly({"default", "one", "two"});
ASSERT_TRUE(!s.ok());
// Can't open without specifying default column family
s = OpenReadOnly({"one", "four"});
ASSERT_TRUE(!s.ok());
}
TEST(ColumnFamilyTest, DontRollEmptyLogs) {
Open();
CreateColumnFamiliesAndReopen({"one", "two", "three", "four"});
for (size_t i = 0; i < handles_.size(); ++i) {
PutRandomData(i, 10, 100);
}
int num_writable_file_start = env_->GetNumberOfNewWritableFileCalls();
// this will trigger the flushes
ASSERT_OK(db_->Write(WriteOptions(), nullptr));
for (int i = 0; i < 4; ++i) {
dbfull()->TEST_WaitForFlushMemTable(handles_[i]);
}
int total_new_writable_files =
env_->GetNumberOfNewWritableFileCalls() - num_writable_file_start;
ASSERT_EQ(static_cast<size_t>(total_new_writable_files), handles_.size() + 1);
Close();
}
TEST(ColumnFamilyTest, FlushStaleColumnFamilies) {
Open();
CreateColumnFamilies({"one", "two"});
ColumnFamilyOptions default_cf, one, two;
default_cf.write_buffer_size = 100000; // small write buffer size
default_cf.disable_auto_compactions = true;
one.disable_auto_compactions = true;
two.disable_auto_compactions = true;
db_options_.max_total_wal_size = 210000;
Reopen({default_cf, one, two});
PutRandomData(2, 1, 10); // 10 bytes
for (int i = 0; i < 2; ++i) {
PutRandomData(0, 100, 1000); // flush
WaitForFlush(0);
ASSERT_EQ(i + 1, CountLiveFiles());
}
// third flush. now, CF [two] should be detected as stale and flushed
// column family 1 should not be flushed since it's empty
PutRandomData(0, 100, 1000); // flush
WaitForFlush(0);
WaitForFlush(2);
// 3 files for default column families, 1 file for column family [two], zero
// files for column family [one], because it's empty
ASSERT_EQ(4, CountLiveFiles());
Close();
}
} // namespace rocksdb
int main(int argc, char** argv) {
return rocksdb::test::RunAllTests();
}

253
db/compaction.cc Normal file
View File

@@ -0,0 +1,253 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/compaction.h"
#define __STDC_FORMAT_MACROS
#include <inttypes.h>
#include <vector>
#include "db/column_family.h"
#include "util/logging.h"
namespace rocksdb {
static uint64_t TotalFileSize(const std::vector<FileMetaData*>& files) {
uint64_t sum = 0;
for (size_t i = 0; i < files.size() && files[i]; i++) {
sum += files[i]->file_size;
}
return sum;
}
Compaction::Compaction(Version* input_version, int level, int out_level,
uint64_t target_file_size,
uint64_t max_grandparent_overlap_bytes,
bool seek_compaction, bool enable_compression,
bool deletion_compaction)
: level_(level),
out_level_(out_level),
max_output_file_size_(target_file_size),
max_grandparent_overlap_bytes_(max_grandparent_overlap_bytes),
input_version_(input_version),
number_levels_(input_version_->NumberLevels()),
cfd_(input_version_->cfd_),
seek_compaction_(seek_compaction),
enable_compression_(enable_compression),
deletion_compaction_(deletion_compaction),
grandparent_index_(0),
seen_key_(false),
overlapped_bytes_(0),
base_index_(-1),
parent_index_(-1),
score_(0),
bottommost_level_(false),
is_full_compaction_(false),
is_manual_compaction_(false),
level_ptrs_(std::vector<size_t>(number_levels_)) {
cfd_->Ref();
input_version_->Ref();
edit_ = new VersionEdit();
edit_->SetColumnFamily(cfd_->GetID());
for (int i = 0; i < number_levels_; i++) {
level_ptrs_[i] = 0;
}
}
Compaction::~Compaction() {
delete edit_;
if (input_version_ != nullptr) {
input_version_->Unref();
}
if (cfd_ != nullptr) {
if (cfd_->Unref()) {
delete cfd_;
}
}
}
bool Compaction::IsTrivialMove() const {
// Avoid a move if there is lots of overlapping grandparent data.
// Otherwise, the move could create a parent file that will require
// a very expensive merge later on.
// If level_== out_level_, the purpose is to force compaction filter to be
// applied to that level, and thus cannot be a trivia move.
return (level_ != out_level_ &&
num_input_files(0) == 1 &&
num_input_files(1) == 0 &&
TotalFileSize(grandparents_) <= max_grandparent_overlap_bytes_);
}
bool Compaction::IsDeletionCompaction() const { return deletion_compaction_; }
void Compaction::AddInputDeletions(VersionEdit* edit) {
for (int which = 0; which < 2; which++) {
for (size_t i = 0; i < inputs_[which].size(); i++) {
edit->DeleteFile(level_ + which, inputs_[which][i]->number);
}
}
}
bool Compaction::IsBaseLevelForKey(const Slice& user_key) {
assert(cfd_->options()->compaction_style != kCompactionStyleFIFO);
if (cfd_->options()->compaction_style == kCompactionStyleUniversal) {
return bottommost_level_;
}
// Maybe use binary search to find right entry instead of linear search?
const Comparator* user_cmp = cfd_->user_comparator();
for (int lvl = level_ + 2; lvl < number_levels_; lvl++) {
const std::vector<FileMetaData*>& files = input_version_->files_[lvl];
for (; level_ptrs_[lvl] < files.size(); ) {
FileMetaData* f = files[level_ptrs_[lvl]];
if (user_cmp->Compare(user_key, f->largest.user_key()) <= 0) {
// We've advanced far enough
if (user_cmp->Compare(user_key, f->smallest.user_key()) >= 0) {
// Key falls in this file's range, so definitely not base level
return false;
}
break;
}
level_ptrs_[lvl]++;
}
}
return true;
}
bool Compaction::ShouldStopBefore(const Slice& internal_key) {
// Scan to find earliest grandparent file that contains key.
const InternalKeyComparator* icmp = &cfd_->internal_comparator();
while (grandparent_index_ < grandparents_.size() &&
icmp->Compare(internal_key,
grandparents_[grandparent_index_]->largest.Encode()) > 0) {
if (seen_key_) {
overlapped_bytes_ += grandparents_[grandparent_index_]->file_size;
}
assert(grandparent_index_ + 1 >= grandparents_.size() ||
icmp->Compare(grandparents_[grandparent_index_]->largest.Encode(),
grandparents_[grandparent_index_+1]->smallest.Encode())
< 0);
grandparent_index_++;
}
seen_key_ = true;
if (overlapped_bytes_ > max_grandparent_overlap_bytes_) {
// Too much overlap for current output; start new output
overlapped_bytes_ = 0;
return true;
} else {
return false;
}
}
// Mark (or clear) each file that is being compacted
void Compaction::MarkFilesBeingCompacted(bool value) {
for (int i = 0; i < 2; i++) {
std::vector<FileMetaData*> v = inputs_[i];
for (unsigned int j = 0; j < inputs_[i].size(); j++) {
assert(value ? !inputs_[i][j]->being_compacted :
inputs_[i][j]->being_compacted);
inputs_[i][j]->being_compacted = value;
}
}
}
// Is this compaction producing files at the bottommost level?
void Compaction::SetupBottomMostLevel(bool isManual) {
assert(cfd_->options()->compaction_style != kCompactionStyleFIFO);
if (cfd_->options()->compaction_style == kCompactionStyleUniversal) {
// If universal compaction style is used and manual
// compaction is occuring, then we are guaranteed that
// all files will be picked in a single compaction
// run. We can safely set bottommost_level_ = true.
// If it is not manual compaction, then bottommost_level_
// is already set when the Compaction was created.
if (isManual) {
bottommost_level_ = true;
}
return;
}
bottommost_level_ = true;
for (int i = output_level() + 1; i < number_levels_; i++) {
if (input_version_->NumLevelFiles(i) > 0) {
bottommost_level_ = false;
break;
}
}
}
void Compaction::ReleaseInputs() {
if (input_version_ != nullptr) {
input_version_->Unref();
input_version_ = nullptr;
}
if (cfd_ != nullptr) {
if (cfd_->Unref()) {
delete cfd_;
}
cfd_ = nullptr;
}
}
void Compaction::ReleaseCompactionFiles(Status status) {
cfd_->compaction_picker()->ReleaseCompactionFiles(this, status);
}
void Compaction::ResetNextCompactionIndex() {
input_version_->ResetNextCompactionIndex(level_);
}
namespace {
int InputSummary(const std::vector<FileMetaData*>& files, char* output,
int len) {
*output = '\0';
int write = 0;
for (unsigned int i = 0; i < files.size(); i++) {
int sz = len - write;
int ret;
char sztxt[16];
AppendHumanBytes(files.at(i)->file_size, sztxt, 16);
ret = snprintf(output + write, sz, "%" PRIu64 "(%s) ", files.at(i)->number,
sztxt);
if (ret < 0 || ret >= sz) break;
write += ret;
}
// if files.size() is non-zero, overwrite the last space
return write - !!files.size();
}
} // namespace
void Compaction::Summary(char* output, int len) {
int write =
snprintf(output, len, "Base version %" PRIu64
" Base level %d, seek compaction:%d, inputs: [",
input_version_->GetVersionNumber(), level_, seek_compaction_);
if (write < 0 || write >= len) {
return;
}
write += InputSummary(inputs_[0], output + write, len - write);
if (write < 0 || write >= len) {
return;
}
write += snprintf(output + write, len - write, "], [");
if (write < 0 || write >= len) {
return;
}
write += InputSummary(inputs_[1], output + write, len - write);
if (write < 0 || write >= len) {
return;
}
snprintf(output + write, len - write, "]");
}
} // namespace rocksdb

158
db/compaction.h Normal file
View File

@@ -0,0 +1,158 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#pragma once
#include "db/version_set.h"
namespace rocksdb {
class Version;
class ColumnFamilyData;
// A Compaction encapsulates information about a compaction.
class Compaction {
public:
~Compaction();
// Return the level that is being compacted. Inputs from "level"
// will be merged.
int level() const { return level_; }
// Outputs will go to this level
int output_level() const { return out_level_; }
// Return the object that holds the edits to the descriptor done
// by this compaction.
VersionEdit* edit() { return edit_; }
// "which" must be either 0 or 1
int num_input_files(int which) const { return inputs_[which].size(); }
// Returns input version of the compaction
Version* input_version() const { return input_version_; }
ColumnFamilyData* column_family_data() const { return cfd_; }
// Return the ith input file at "level()+which" ("which" must be 0 or 1).
FileMetaData* input(int which, int i) const { return inputs_[which][i]; }
std::vector<FileMetaData*>* inputs(int which) { return &inputs_[which]; }
// Maximum size of files to build during this compaction.
uint64_t MaxOutputFileSize() const { return max_output_file_size_; }
// Whether compression will be enabled for compaction outputs
bool enable_compression() const { return enable_compression_; }
// Is this a trivial compaction that can be implemented by just
// moving a single input file to the next level (no merging or splitting)
bool IsTrivialMove() const;
// If true, just delete all files in inputs_[0]
bool IsDeletionCompaction() const;
// Add all inputs to this compaction as delete operations to *edit.
void AddInputDeletions(VersionEdit* edit);
// Returns true if the information we have available guarantees that
// the compaction is producing data in "level+1" for which no data exists
// in levels greater than "level+1".
bool IsBaseLevelForKey(const Slice& user_key);
// Returns true iff we should stop building the current output
// before processing "internal_key".
bool ShouldStopBefore(const Slice& internal_key);
// Release the input version for the compaction, once the compaction
// is successful.
void ReleaseInputs();
// Clear all files to indicate that they are not being compacted
// Delete this compaction from the list of running compactions.
void ReleaseCompactionFiles(Status status);
void Summary(char* output, int len);
// Return the score that was used to pick this compaction run.
double score() const { return score_; }
// Is this compaction creating a file in the bottom most level?
bool BottomMostLevel() { return bottommost_level_; }
// Does this compaction include all sst files?
bool IsFullCompaction() { return is_full_compaction_; }
// Was this compaction triggered manually by the client?
bool IsManualCompaction() { return is_manual_compaction_; }
private:
friend class CompactionPicker;
friend class UniversalCompactionPicker;
friend class FIFOCompactionPicker;
friend class LevelCompactionPicker;
Compaction(Version* input_version, int level, int out_level,
uint64_t target_file_size, uint64_t max_grandparent_overlap_bytes,
bool seek_compaction = false, bool enable_compression = true,
bool deletion_compaction = false);
int level_;
int out_level_; // levels to which output files are stored
uint64_t max_output_file_size_;
uint64_t max_grandparent_overlap_bytes_;
Version* input_version_;
VersionEdit* edit_;
int number_levels_;
ColumnFamilyData* cfd_;
bool seek_compaction_;
bool enable_compression_;
// if true, just delete files in inputs_[0]
bool deletion_compaction_;
// Each compaction reads inputs from "level_" and "level_+1"
std::vector<FileMetaData*> inputs_[2]; // The two sets of inputs
// State used to check for number of of overlapping grandparent files
// (parent == level_ + 1, grandparent == level_ + 2)
std::vector<FileMetaData*> grandparents_;
size_t grandparent_index_; // Index in grandparent_starts_
bool seen_key_; // Some output key has been seen
uint64_t overlapped_bytes_; // Bytes of overlap between current output
// and grandparent files
int base_index_; // index of the file in files_[level_]
int parent_index_; // index of some file with same range in files_[level_+1]
double score_; // score that was used to pick this compaction.
// Is this compaction creating a file in the bottom most level?
bool bottommost_level_;
// Does this compaction include all sst files?
bool is_full_compaction_;
// Is this compaction requested by the client?
bool is_manual_compaction_;
// level_ptrs_ holds indices into input_version_->levels_: our state
// is that we are positioned at one of the file ranges for each
// higher level than the ones involved in this compaction (i.e. for
// all L >= level_ + 2).
std::vector<size_t> level_ptrs_;
// mark (or clear) all files that are being compacted
void MarkFilesBeingCompacted(bool);
// Initialize whether compaction producing files at the bottommost level
void SetupBottomMostLevel(bool isManual);
// In case of compaction error, reset the nextIndex that is used
// to pick up the next file to be compacted from files_by_size_
void ResetNextCompactionIndex();
};
} // namespace rocksdb

960
db/compaction_picker.cc Normal file
View File

@@ -0,0 +1,960 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/compaction_picker.h"
#define __STDC_FORMAT_MACROS
#include <inttypes.h>
#include <limits>
#include "util/log_buffer.h"
#include "util/statistics.h"
namespace rocksdb {
namespace {
uint64_t TotalFileSize(const std::vector<FileMetaData*>& files) {
uint64_t sum = 0;
for (size_t i = 0; i < files.size() && files[i]; i++) {
sum += files[i]->file_size;
}
return sum;
}
// Multiple two operands. If they overflow, return op1.
uint64_t MultiplyCheckOverflow(uint64_t op1, int op2) {
if (op1 == 0) {
return 0;
}
if (op2 <= 0) {
return op1;
}
uint64_t casted_op2 = (uint64_t) op2;
if (std::numeric_limits<uint64_t>::max() / op1 < casted_op2) {
return op1;
}
return op1 * casted_op2;
}
} // anonymous namespace
CompactionPicker::CompactionPicker(const Options* options,
const InternalKeyComparator* icmp)
: compactions_in_progress_(options->num_levels),
options_(options),
num_levels_(options->num_levels),
icmp_(icmp) {
max_file_size_.reset(new uint64_t[NumberLevels()]);
level_max_bytes_.reset(new uint64_t[NumberLevels()]);
int target_file_size_multiplier = options_->target_file_size_multiplier;
int max_bytes_multiplier = options_->max_bytes_for_level_multiplier;
for (int i = 0; i < NumberLevels(); i++) {
if (i == 0 && options_->compaction_style == kCompactionStyleUniversal) {
max_file_size_[i] = ULLONG_MAX;
level_max_bytes_[i] = options_->max_bytes_for_level_base;
} else if (i > 1) {
max_file_size_[i] = MultiplyCheckOverflow(max_file_size_[i - 1],
target_file_size_multiplier);
level_max_bytes_[i] = MultiplyCheckOverflow(
MultiplyCheckOverflow(level_max_bytes_[i - 1], max_bytes_multiplier),
options_->max_bytes_for_level_multiplier_additional[i - 1]);
} else {
max_file_size_[i] = options_->target_file_size_base;
level_max_bytes_[i] = options_->max_bytes_for_level_base;
}
}
}
CompactionPicker::~CompactionPicker() {}
void CompactionPicker::SizeBeingCompacted(std::vector<uint64_t>& sizes) {
for (int level = 0; level < NumberLevels() - 1; level++) {
uint64_t total = 0;
for (auto c : compactions_in_progress_[level]) {
assert(c->level() == level);
for (int i = 0; i < c->num_input_files(0); i++) {
total += c->input(0,i)->file_size;
}
}
sizes[level] = total;
}
}
// Clear all files to indicate that they are not being compacted
// Delete this compaction from the list of running compactions.
void CompactionPicker::ReleaseCompactionFiles(Compaction* c, Status status) {
c->MarkFilesBeingCompacted(false);
compactions_in_progress_[c->level()].erase(c);
if (!status.ok()) {
c->ResetNextCompactionIndex();
}
}
uint64_t CompactionPicker::MaxFileSizeForLevel(int level) const {
assert(level >= 0);
assert(level < NumberLevels());
return max_file_size_[level];
}
uint64_t CompactionPicker::MaxGrandParentOverlapBytes(int level) {
uint64_t result = MaxFileSizeForLevel(level);
result *= options_->max_grandparent_overlap_factor;
return result;
}
double CompactionPicker::MaxBytesForLevel(int level) {
// Note: the result for level zero is not really used since we set
// the level-0 compaction threshold based on number of files.
assert(level >= 0);
assert(level < NumberLevels());
return level_max_bytes_[level];
}
void CompactionPicker::GetRange(const std::vector<FileMetaData*>& inputs,
InternalKey* smallest, InternalKey* largest) {
assert(!inputs.empty());
smallest->Clear();
largest->Clear();
for (size_t i = 0; i < inputs.size(); i++) {
FileMetaData* f = inputs[i];
if (i == 0) {
*smallest = f->smallest;
*largest = f->largest;
} else {
if (icmp_->Compare(f->smallest, *smallest) < 0) {
*smallest = f->smallest;
}
if (icmp_->Compare(f->largest, *largest) > 0) {
*largest = f->largest;
}
}
}
}
void CompactionPicker::GetRange(const std::vector<FileMetaData*>& inputs1,
const std::vector<FileMetaData*>& inputs2,
InternalKey* smallest, InternalKey* largest) {
std::vector<FileMetaData*> all = inputs1;
all.insert(all.end(), inputs2.begin(), inputs2.end());
GetRange(all, smallest, largest);
}
bool CompactionPicker::ExpandWhileOverlapping(Compaction* c) {
// If inputs are empty then there is nothing to expand.
if (!c || c->inputs_[0].empty()) {
return true;
}
// GetOverlappingInputs will always do the right thing for level-0.
// So we don't need to do any expansion if level == 0.
if (c->level() == 0) {
return true;
}
const int level = c->level();
InternalKey smallest, largest;
// Keep expanding c->inputs_[0] until we are sure that there is a
// "clean cut" boundary between the files in input and the surrounding files.
// This will ensure that no parts of a key are lost during compaction.
int hint_index = -1;
size_t old_size;
do {
old_size = c->inputs_[0].size();
GetRange(c->inputs_[0], &smallest, &largest);
c->inputs_[0].clear();
c->input_version_->GetOverlappingInputs(
level, &smallest, &largest, &c->inputs_[0], hint_index, &hint_index);
} while(c->inputs_[0].size() > old_size);
// Get the new range
GetRange(c->inputs_[0], &smallest, &largest);
// If, after the expansion, there are files that are already under
// compaction, then we must drop/cancel this compaction.
int parent_index = -1;
if (c->inputs_[0].empty()) {
Log(options_->info_log,
"[%s] ExpandWhileOverlapping() failure because zero input files",
c->column_family_data()->GetName().c_str());
}
if (c->inputs_[0].empty() || FilesInCompaction(c->inputs_[0]) ||
(c->level() != c->output_level() &&
ParentRangeInCompaction(c->input_version_, &smallest, &largest, level,
&parent_index))) {
c->inputs_[0].clear();
c->inputs_[1].clear();
return false;
}
return true;
}
uint64_t CompactionPicker::ExpandedCompactionByteSizeLimit(int level) {
uint64_t result = MaxFileSizeForLevel(level);
result *= options_->expanded_compaction_factor;
return result;
}
// Returns true if any one of specified files are being compacted
bool CompactionPicker::FilesInCompaction(std::vector<FileMetaData*>& files) {
for (unsigned int i = 0; i < files.size(); i++) {
if (files[i]->being_compacted) {
return true;
}
}
return false;
}
// Returns true if any one of the parent files are being compacted
bool CompactionPicker::ParentRangeInCompaction(Version* version,
const InternalKey* smallest,
const InternalKey* largest,
int level, int* parent_index) {
std::vector<FileMetaData*> inputs;
assert(level + 1 < NumberLevels());
version->GetOverlappingInputs(level + 1, smallest, largest, &inputs,
*parent_index, parent_index);
return FilesInCompaction(inputs);
}
// Populates the set of inputs from "level+1" that overlap with "level".
// Will also attempt to expand "level" if that doesn't expand "level+1"
// or cause "level" to include a file for compaction that has an overlapping
// user-key with another file.
void CompactionPicker::SetupOtherInputs(Compaction* c) {
// If inputs are empty, then there is nothing to expand.
// If both input and output levels are the same, no need to consider
// files at level "level+1"
if (c->inputs_[0].empty() || c->level() == c->output_level()) {
return;
}
const int level = c->level();
InternalKey smallest, largest;
// Get the range one last time.
GetRange(c->inputs_[0], &smallest, &largest);
// Populate the set of next-level files (inputs_[1]) to include in compaction
c->input_version_->GetOverlappingInputs(level + 1, &smallest, &largest,
&c->inputs_[1], c->parent_index_,
&c->parent_index_);
// Get entire range covered by compaction
InternalKey all_start, all_limit;
GetRange(c->inputs_[0], c->inputs_[1], &all_start, &all_limit);
// See if we can further grow the number of inputs in "level" without
// changing the number of "level+1" files we pick up. We also choose NOT
// to expand if this would cause "level" to include some entries for some
// user key, while excluding other entries for the same user key. This
// can happen when one user key spans multiple files.
if (!c->inputs_[1].empty()) {
std::vector<FileMetaData*> expanded0;
c->input_version_->GetOverlappingInputs(
level, &all_start, &all_limit, &expanded0, c->base_index_, nullptr);
const uint64_t inputs0_size = TotalFileSize(c->inputs_[0]);
const uint64_t inputs1_size = TotalFileSize(c->inputs_[1]);
const uint64_t expanded0_size = TotalFileSize(expanded0);
uint64_t limit = ExpandedCompactionByteSizeLimit(level);
if (expanded0.size() > c->inputs_[0].size() &&
inputs1_size + expanded0_size < limit &&
!FilesInCompaction(expanded0) &&
!c->input_version_->HasOverlappingUserKey(&expanded0, level)) {
InternalKey new_start, new_limit;
GetRange(expanded0, &new_start, &new_limit);
std::vector<FileMetaData*> expanded1;
c->input_version_->GetOverlappingInputs(level + 1, &new_start, &new_limit,
&expanded1, c->parent_index_,
&c->parent_index_);
if (expanded1.size() == c->inputs_[1].size() &&
!FilesInCompaction(expanded1)) {
Log(options_->info_log,
"[%s] Expanding@%lu %lu+%lu (%lu+%lu bytes) to %lu+%lu (%lu+%lu "
"bytes)\n",
c->column_family_data()->GetName().c_str(), (unsigned long)level,
(unsigned long)(c->inputs_[0].size()),
(unsigned long)(c->inputs_[1].size()), (unsigned long)inputs0_size,
(unsigned long)inputs1_size, (unsigned long)(expanded0.size()),
(unsigned long)(expanded1.size()), (unsigned long)expanded0_size,
(unsigned long)inputs1_size);
smallest = new_start;
largest = new_limit;
c->inputs_[0] = expanded0;
c->inputs_[1] = expanded1;
GetRange(c->inputs_[0], c->inputs_[1], &all_start, &all_limit);
}
}
}
// Compute the set of grandparent files that overlap this compaction
// (parent == level+1; grandparent == level+2)
if (level + 2 < NumberLevels()) {
c->input_version_->GetOverlappingInputs(level + 2, &all_start, &all_limit,
&c->grandparents_);
}
}
Compaction* CompactionPicker::CompactRange(Version* version, int input_level,
int output_level,
const InternalKey* begin,
const InternalKey* end,
InternalKey** compaction_end) {
// CompactionPickerFIFO has its own implementation of compact range
assert(options_->compaction_style != kCompactionStyleFIFO);
std::vector<FileMetaData*> inputs;
bool covering_the_whole_range = true;
// All files are 'overlapping' in universal style compaction.
// We have to compact the entire range in one shot.
if (options_->compaction_style == kCompactionStyleUniversal) {
begin = nullptr;
end = nullptr;
}
version->GetOverlappingInputs(input_level, begin, end, &inputs);
if (inputs.empty()) {
return nullptr;
}
// Avoid compacting too much in one shot in case the range is large.
// But we cannot do this for level-0 since level-0 files can overlap
// and we must not pick one file and drop another older file if the
// two files overlap.
if (input_level > 0) {
const uint64_t limit =
MaxFileSizeForLevel(input_level) * options_->source_compaction_factor;
uint64_t total = 0;
for (size_t i = 0; i + 1 < inputs.size(); ++i) {
uint64_t s = inputs[i]->file_size;
total += s;
if (total >= limit) {
**compaction_end = inputs[i + 1]->smallest;
covering_the_whole_range = false;
inputs.resize(i + 1);
break;
}
}
}
Compaction* c = new Compaction(version, input_level, output_level,
MaxFileSizeForLevel(output_level),
MaxGrandParentOverlapBytes(input_level));
c->inputs_[0] = inputs;
if (ExpandWhileOverlapping(c) == false) {
delete c;
Log(options_->info_log,
"[%s] Could not compact due to expansion failure.\n",
version->cfd_->GetName().c_str());
return nullptr;
}
SetupOtherInputs(c);
if (covering_the_whole_range) {
*compaction_end = nullptr;
}
// These files that are to be manaully compacted do not trample
// upon other files because manual compactions are processed when
// the system has a max of 1 background compaction thread.
c->MarkFilesBeingCompacted(true);
// Is this compaction creating a file at the bottommost level
c->SetupBottomMostLevel(true);
c->is_manual_compaction_ = true;
return c;
}
Compaction* LevelCompactionPicker::PickCompaction(Version* version,
LogBuffer* log_buffer) {
Compaction* c = nullptr;
int level = -1;
// Compute the compactions needed. It is better to do it here
// and also in LogAndApply(), otherwise the values could be stale.
std::vector<uint64_t> size_being_compacted(NumberLevels() - 1);
SizeBeingCompacted(size_being_compacted);
version->ComputeCompactionScore(size_being_compacted);
// We prefer compactions triggered by too much data in a level over
// the compactions triggered by seeks.
//
// Find the compactions by size on all levels.
for (int i = 0; i < NumberLevels() - 1; i++) {
assert(i == 0 ||
version->compaction_score_[i] <= version->compaction_score_[i - 1]);
level = version->compaction_level_[i];
if ((version->compaction_score_[i] >= 1)) {
c = PickCompactionBySize(version, level, version->compaction_score_[i]);
if (ExpandWhileOverlapping(c) == false) {
delete c;
c = nullptr;
} else {
break;
}
}
}
// Find compactions needed by seeks
FileMetaData* f = version->file_to_compact_;
if (c == nullptr && f != nullptr && !f->being_compacted) {
level = version->file_to_compact_level_;
int parent_index = -1;
// Only allow one level 0 compaction at a time.
// Do not pick this file if its parents at level+1 are being compacted.
if (level != 0 || compactions_in_progress_[0].empty()) {
if (!ParentRangeInCompaction(version, &f->smallest, &f->largest, level,
&parent_index)) {
c = new Compaction(version, level, level + 1,
MaxFileSizeForLevel(level + 1),
MaxGrandParentOverlapBytes(level), true);
c->inputs_[0].push_back(f);
c->parent_index_ = parent_index;
c->input_version_->file_to_compact_ = nullptr;
if (ExpandWhileOverlapping(c) == false) {
return nullptr;
}
}
}
}
if (c == nullptr) {
return nullptr;
}
// Two level 0 compaction won't run at the same time, so don't need to worry
// about files on level 0 being compacted.
if (level == 0) {
assert(compactions_in_progress_[0].empty());
InternalKey smallest, largest;
GetRange(c->inputs_[0], &smallest, &largest);
// Note that the next call will discard the file we placed in
// c->inputs_[0] earlier and replace it with an overlapping set
// which will include the picked file.
c->inputs_[0].clear();
c->input_version_->GetOverlappingInputs(0, &smallest, &largest,
&c->inputs_[0]);
// If we include more L0 files in the same compaction run it can
// cause the 'smallest' and 'largest' key to get extended to a
// larger range. So, re-invoke GetRange to get the new key range
GetRange(c->inputs_[0], &smallest, &largest);
if (ParentRangeInCompaction(c->input_version_, &smallest, &largest, level,
&c->parent_index_)) {
delete c;
return nullptr;
}
assert(!c->inputs_[0].empty());
}
// Setup "level+1" files (inputs_[1])
SetupOtherInputs(c);
// mark all the files that are being compacted
c->MarkFilesBeingCompacted(true);
// Is this compaction creating a file at the bottommost level
c->SetupBottomMostLevel(false);
// remember this currently undergoing compaction
compactions_in_progress_[level].insert(c);
return c;
}
Compaction* LevelCompactionPicker::PickCompactionBySize(Version* version,
int level,
double score) {
Compaction* c = nullptr;
// level 0 files are overlapping. So we cannot pick more
// than one concurrent compactions at this level. This
// could be made better by looking at key-ranges that are
// being compacted at level 0.
if (level == 0 && compactions_in_progress_[level].size() == 1) {
return nullptr;
}
assert(level >= 0);
assert(level + 1 < NumberLevels());
c = new Compaction(version, level, level + 1, MaxFileSizeForLevel(level + 1),
MaxGrandParentOverlapBytes(level));
c->score_ = score;
// Pick the largest file in this level that is not already
// being compacted
std::vector<int>& file_size = c->input_version_->files_by_size_[level];
// record the first file that is not yet compacted
int nextIndex = -1;
for (unsigned int i = c->input_version_->next_file_to_compact_by_size_[level];
i < file_size.size(); i++) {
int index = file_size[i];
FileMetaData* f = c->input_version_->files_[level][index];
// check to verify files are arranged in descending size
assert((i == file_size.size() - 1) ||
(i >= Version::number_of_files_to_sort_ - 1) ||
(f->file_size >=
c->input_version_->files_[level][file_size[i + 1]]->file_size));
// do not pick a file to compact if it is being compacted
// from n-1 level.
if (f->being_compacted) {
continue;
}
// remember the startIndex for the next call to PickCompaction
if (nextIndex == -1) {
nextIndex = i;
}
// Do not pick this file if its parents at level+1 are being compacted.
// Maybe we can avoid redoing this work in SetupOtherInputs
int parent_index = -1;
if (ParentRangeInCompaction(c->input_version_, &f->smallest, &f->largest,
level, &parent_index)) {
continue;
}
c->inputs_[0].push_back(f);
c->base_index_ = index;
c->parent_index_ = parent_index;
break;
}
if (c->inputs_[0].empty()) {
delete c;
c = nullptr;
}
// store where to start the iteration in the next call to PickCompaction
version->next_file_to_compact_by_size_[level] = nextIndex;
return c;
}
// Universal style of compaction. Pick files that are contiguous in
// time-range to compact.
//
Compaction* UniversalCompactionPicker::PickCompaction(Version* version,
LogBuffer* log_buffer) {
int level = 0;
double score = version->compaction_score_[0];
if ((version->files_[level].size() <
(unsigned int)options_->level0_file_num_compaction_trigger)) {
LogToBuffer(log_buffer, "[%s] Universal: nothing to do\n",
version->cfd_->GetName().c_str());
return nullptr;
}
Version::FileSummaryStorage tmp;
LogToBuffer(log_buffer, "[%s] Universal: candidate files(%zu): %s\n",
version->cfd_->GetName().c_str(), version->files_[level].size(),
version->LevelFileSummary(&tmp, 0));
// Check for size amplification first.
Compaction* c;
if ((c = PickCompactionUniversalSizeAmp(version, score, log_buffer)) !=
nullptr) {
LogToBuffer(log_buffer, "[%s] Universal: compacting for size amp\n",
version->cfd_->GetName().c_str());
} else {
// Size amplification is within limits. Try reducing read
// amplification while maintaining file size ratios.
unsigned int ratio = options_->compaction_options_universal.size_ratio;
if ((c = PickCompactionUniversalReadAmp(version, score, ratio, UINT_MAX,
log_buffer)) != nullptr) {
LogToBuffer(log_buffer, "[%s] Universal: compacting for size ratio\n",
version->cfd_->GetName().c_str());
} else {
// Size amplification and file size ratios are within configured limits.
// If max read amplification is exceeding configured limits, then force
// compaction without looking at filesize ratios and try to reduce
// the number of files to fewer than level0_file_num_compaction_trigger.
unsigned int num_files = version->files_[level].size() -
options_->level0_file_num_compaction_trigger;
if ((c = PickCompactionUniversalReadAmp(
version, score, UINT_MAX, num_files, log_buffer)) != nullptr) {
LogToBuffer(log_buffer, "[%s] Universal: compacting for file num\n",
version->cfd_->GetName().c_str());
}
}
}
if (c == nullptr) {
return nullptr;
}
assert(c->inputs_[0].size() > 1);
// validate that all the chosen files are non overlapping in time
FileMetaData* newerfile __attribute__((unused)) = nullptr;
for (unsigned int i = 0; i < c->inputs_[0].size(); i++) {
FileMetaData* f = c->inputs_[0][i];
assert (f->smallest_seqno <= f->largest_seqno);
assert(newerfile == nullptr ||
newerfile->smallest_seqno > f->largest_seqno);
newerfile = f;
}
// The files are sorted from newest first to oldest last.
std::vector<int>& file_by_time = c->input_version_->files_by_size_[level];
// Is the earliest file part of this compaction?
int last_index = file_by_time[file_by_time.size()-1];
FileMetaData* last_file = c->input_version_->files_[level][last_index];
if (c->inputs_[0][c->inputs_[0].size()-1] == last_file) {
c->bottommost_level_ = true;
}
// update statistics
MeasureTime(options_->statistics.get(), NUM_FILES_IN_SINGLE_COMPACTION,
c->inputs_[0].size());
// mark all the files that are being compacted
c->MarkFilesBeingCompacted(true);
// remember this currently undergoing compaction
compactions_in_progress_[level].insert(c);
// Record whether this compaction includes all sst files.
// For now, it is only relevant in universal compaction mode.
c->is_full_compaction_ =
(c->inputs_[0].size() == c->input_version_->files_[0].size());
return c;
}
//
// Consider compaction files based on their size differences with
// the next file in time order.
//
Compaction* UniversalCompactionPicker::PickCompactionUniversalReadAmp(
Version* version, double score, unsigned int ratio,
unsigned int max_number_of_files_to_compact, LogBuffer* log_buffer) {
int level = 0;
unsigned int min_merge_width =
options_->compaction_options_universal.min_merge_width;
unsigned int max_merge_width =
options_->compaction_options_universal.max_merge_width;
// The files are sorted from newest first to oldest last.
std::vector<int>& file_by_time = version->files_by_size_[level];
FileMetaData* f = nullptr;
bool done = false;
int start_index = 0;
unsigned int candidate_count = 0;
assert(file_by_time.size() == version->files_[level].size());
unsigned int max_files_to_compact = std::min(max_merge_width,
max_number_of_files_to_compact);
min_merge_width = std::max(min_merge_width, 2U);
// Considers a candidate file only if it is smaller than the
// total size accumulated so far.
for (unsigned int loop = 0; loop < file_by_time.size(); loop++) {
candidate_count = 0;
// Skip files that are already being compacted
for (f = nullptr; loop < file_by_time.size(); loop++) {
int index = file_by_time[loop];
f = version->files_[level][index];
if (!f->being_compacted) {
candidate_count = 1;
break;
}
LogToBuffer(
log_buffer, "[%s] Universal: file %lu[%d] being compacted, skipping",
version->cfd_->GetName().c_str(), (unsigned long)f->number, loop);
f = nullptr;
}
// This file is not being compacted. Consider it as the
// first candidate to be compacted.
uint64_t candidate_size = f != nullptr? f->file_size : 0;
if (f != nullptr) {
LogToBuffer(
log_buffer, "[%s] Universal: Possible candidate file %lu[%d].",
version->cfd_->GetName().c_str(), (unsigned long)f->number, loop);
}
// Check if the suceeding files need compaction.
for (unsigned int i = loop+1;
candidate_count < max_files_to_compact && i < file_by_time.size();
i++) {
int index = file_by_time[i];
FileMetaData* f = version->files_[level][index];
if (f->being_compacted) {
break;
}
// Pick files if the total/last candidate file size (increased by the
// specified ratio) is still larger than the next candidate file.
// candidate_size is the total size of files picked so far with the
// default kCompactionStopStyleTotalSize; with
// kCompactionStopStyleSimilarSize, it's simply the size of the last
// picked file.
uint64_t sz = (candidate_size * (100L + ratio)) /100;
if (sz < f->file_size) {
break;
}
if (options_->compaction_options_universal.stop_style == kCompactionStopStyleSimilarSize) {
// Similar-size stopping rule: also check the last picked file isn't
// far larger than the next candidate file.
sz = (f->file_size * (100L + ratio)) / 100;
if (sz < candidate_size) {
// If the small file we've encountered begins a run of similar-size
// files, we'll pick them up on a future iteration of the outer
// loop. If it's some lonely straggler, it'll eventually get picked
// by the last-resort read amp strategy which disregards size ratios.
break;
}
candidate_size = f->file_size;
} else { // default kCompactionStopStyleTotalSize
candidate_size += f->file_size;
}
candidate_count++;
}
// Found a series of consecutive files that need compaction.
if (candidate_count >= (unsigned int)min_merge_width) {
start_index = loop;
done = true;
break;
} else {
for (unsigned int i = loop;
i < loop + candidate_count && i < file_by_time.size(); i++) {
int index = file_by_time[i];
FileMetaData* f = version->files_[level][index];
LogToBuffer(log_buffer,
"[%s] Universal: Skipping file %lu[%d] with size %lu %d\n",
version->cfd_->GetName().c_str(), (unsigned long)f->number,
i, (unsigned long)f->file_size, f->being_compacted);
}
}
}
if (!done || candidate_count <= 1) {
return nullptr;
}
unsigned int first_index_after = start_index + candidate_count;
// Compression is enabled if files compacted earlier already reached
// size ratio of compression.
bool enable_compression = true;
int ratio_to_compress =
options_->compaction_options_universal.compression_size_percent;
if (ratio_to_compress >= 0) {
uint64_t total_size = version->NumLevelBytes(level);
uint64_t older_file_size = 0;
for (unsigned int i = file_by_time.size() - 1; i >= first_index_after;
i--) {
older_file_size += version->files_[level][file_by_time[i]]->file_size;
if (older_file_size * 100L >= total_size * (long) ratio_to_compress) {
enable_compression = false;
break;
}
}
}
Compaction* c =
new Compaction(version, level, level, MaxFileSizeForLevel(level),
LLONG_MAX, false, enable_compression);
c->score_ = score;
for (unsigned int i = start_index; i < first_index_after; i++) {
int index = file_by_time[i];
FileMetaData* f = c->input_version_->files_[level][index];
c->inputs_[0].push_back(f);
LogToBuffer(log_buffer,
"[%s] Universal: Picking file %lu[%d] with size %lu\n",
version->cfd_->GetName().c_str(), (unsigned long)f->number, i,
(unsigned long)f->file_size);
}
return c;
}
// Look at overall size amplification. If size amplification
// exceeeds the configured value, then do a compaction
// of the candidate files all the way upto the earliest
// base file (overrides configured values of file-size ratios,
// min_merge_width and max_merge_width).
//
Compaction* UniversalCompactionPicker::PickCompactionUniversalSizeAmp(
Version* version, double score, LogBuffer* log_buffer) {
int level = 0;
// percentage flexibilty while reducing size amplification
uint64_t ratio = options_->compaction_options_universal.
max_size_amplification_percent;
// The files are sorted from newest first to oldest last.
std::vector<int>& file_by_time = version->files_by_size_[level];
assert(file_by_time.size() == version->files_[level].size());
unsigned int candidate_count = 0;
uint64_t candidate_size = 0;
unsigned int start_index = 0;
FileMetaData* f = nullptr;
// Skip files that are already being compacted
for (unsigned int loop = 0; loop < file_by_time.size() - 1; loop++) {
int index = file_by_time[loop];
f = version->files_[level][index];
if (!f->being_compacted) {
start_index = loop; // Consider this as the first candidate.
break;
}
LogToBuffer(log_buffer,
"[%s] Universal: skipping file %lu[%d] compacted %s",
version->cfd_->GetName().c_str(), (unsigned long)f->number,
loop, " cannot be a candidate to reduce size amp.\n");
f = nullptr;
}
if (f == nullptr) {
return nullptr; // no candidate files
}
LogToBuffer(log_buffer, "[%s] Universal: First candidate file %lu[%d] %s",
version->cfd_->GetName().c_str(), (unsigned long)f->number,
start_index, " to reduce size amp.\n");
// keep adding up all the remaining files
for (unsigned int loop = start_index; loop < file_by_time.size() - 1;
loop++) {
int index = file_by_time[loop];
f = version->files_[level][index];
if (f->being_compacted) {
LogToBuffer(
log_buffer, "[%s] Universal: Possible candidate file %lu[%d] %s.",
version->cfd_->GetName().c_str(), (unsigned long)f->number, loop,
" is already being compacted. No size amp reduction possible.\n");
return nullptr;
}
candidate_size += f->file_size;
candidate_count++;
}
if (candidate_count == 0) {
return nullptr;
}
// size of earliest file
int index = file_by_time[file_by_time.size() - 1];
uint64_t earliest_file_size = version->files_[level][index]->file_size;
// size amplification = percentage of additional size
if (candidate_size * 100 < ratio * earliest_file_size) {
LogToBuffer(
log_buffer,
"[%s] Universal: size amp not needed. newer-files-total-size %lu "
"earliest-file-size %lu",
version->cfd_->GetName().c_str(), (unsigned long)candidate_size,
(unsigned long)earliest_file_size);
return nullptr;
} else {
LogToBuffer(log_buffer,
"[%s] Universal: size amp needed. newer-files-total-size %lu "
"earliest-file-size %lu",
version->cfd_->GetName().c_str(), (unsigned long)candidate_size,
(unsigned long)earliest_file_size);
}
assert(start_index >= 0 && start_index < file_by_time.size() - 1);
// create a compaction request
// We always compact all the files, so always compress.
Compaction* c =
new Compaction(version, level, level, MaxFileSizeForLevel(level),
LLONG_MAX, false, true);
c->score_ = score;
for (unsigned int loop = start_index; loop < file_by_time.size(); loop++) {
int index = file_by_time[loop];
f = c->input_version_->files_[level][index];
c->inputs_[0].push_back(f);
LogToBuffer(log_buffer,
"[%s] Universal: size amp picking file %lu[%d] with size %lu",
version->cfd_->GetName().c_str(), (unsigned long)f->number,
index, (unsigned long)f->file_size);
}
return c;
}
Compaction* FIFOCompactionPicker::PickCompaction(Version* version,
LogBuffer* log_buffer) {
assert(version->NumberLevels() == 1);
uint64_t total_size = 0;
for (const auto& file : version->files_[0]) {
total_size += file->file_size;
}
if (total_size <= options_->compaction_options_fifo.max_table_files_size ||
version->files_[0].size() == 0) {
// total size not exceeded
LogToBuffer(log_buffer,
"[%s] FIFO compaction: nothing to do. Total size %" PRIu64
", max size %" PRIu64 "\n",
version->cfd_->GetName().c_str(), total_size,
options_->compaction_options_fifo.max_table_files_size);
return nullptr;
}
if (compactions_in_progress_[0].size() > 0) {
LogToBuffer(log_buffer,
"[%s] FIFO compaction: Already executing compaction. No need "
"to run parallel compactions since compactions are very fast",
version->cfd_->GetName().c_str());
return nullptr;
}
Compaction* c = new Compaction(version, 0, 0, 0, 0, false, false,
true /* is deletion compaction */);
// delete old files (FIFO)
for (auto ritr = version->files_[0].rbegin();
ritr != version->files_[0].rend(); ++ritr) {
auto f = *ritr;
total_size -= f->file_size;
c->inputs_[0].push_back(f);
char tmp_fsize[16];
AppendHumanBytes(f->file_size, tmp_fsize, sizeof(tmp_fsize));
LogToBuffer(log_buffer, "[%s] FIFO compaction: picking file %" PRIu64
" with size %s for deletion",
version->cfd_->GetName().c_str(), f->number, tmp_fsize);
if (total_size <= options_->compaction_options_fifo.max_table_files_size) {
break;
}
}
c->MarkFilesBeingCompacted(true);
compactions_in_progress_[0].insert(c);
return c;
}
Compaction* FIFOCompactionPicker::CompactRange(Version* version,
int input_level,
int output_level,
const InternalKey* begin,
const InternalKey* end,
InternalKey** compaction_end) {
assert(input_level == 0);
assert(output_level == 0);
*compaction_end = nullptr;
LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, options_->info_log.get());
auto c = PickCompaction(version, &log_buffer);
log_buffer.FlushBufferToLog();
return c;
}
} // namespace rocksdb

181
db/compaction_picker.h Normal file
View File

@@ -0,0 +1,181 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#pragma once
#include "db/version_set.h"
#include "db/compaction.h"
#include "rocksdb/status.h"
#include "rocksdb/options.h"
#include "rocksdb/env.h"
#include <vector>
#include <memory>
#include <set>
namespace rocksdb {
class LogBuffer;
class Compaction;
class Version;
class CompactionPicker {
public:
CompactionPicker(const Options* options, const InternalKeyComparator* icmp);
virtual ~CompactionPicker();
// Pick level and inputs for a new compaction.
// Returns nullptr if there is no compaction to be done.
// Otherwise returns a pointer to a heap-allocated object that
// describes the compaction. Caller should delete the result.
virtual Compaction* PickCompaction(Version* version,
LogBuffer* log_buffer) = 0;
// Return a compaction object for compacting the range [begin,end] in
// the specified level. Returns nullptr if there is nothing in that
// level that overlaps the specified range. Caller should delete
// the result.
//
// The returned Compaction might not include the whole requested range.
// In that case, compaction_end will be set to the next key that needs
// compacting. In case the compaction will compact the whole range,
// compaction_end will be set to nullptr.
// Client is responsible for compaction_end storage -- when called,
// *compaction_end should point to valid InternalKey!
virtual Compaction* CompactRange(Version* version, int input_level,
int output_level, const InternalKey* begin,
const InternalKey* end,
InternalKey** compaction_end);
// Free up the files that participated in a compaction
void ReleaseCompactionFiles(Compaction* c, Status status);
// Return the total amount of data that is undergoing
// compactions per level
void SizeBeingCompacted(std::vector<uint64_t>& sizes);
// Returns maximum total overlap bytes with grandparent
// level (i.e., level+2) before we stop building a single
// file in level->level+1 compaction.
uint64_t MaxGrandParentOverlapBytes(int level);
// Returns maximum total bytes of data on a given level.
double MaxBytesForLevel(int level);
// Get the max file size in a given level.
uint64_t MaxFileSizeForLevel(int level) const;
protected:
int NumberLevels() const { return num_levels_; }
// Stores the minimal range that covers all entries in inputs in
// *smallest, *largest.
// REQUIRES: inputs is not empty
void GetRange(const std::vector<FileMetaData*>& inputs, InternalKey* smallest,
InternalKey* largest);
// Stores the minimal range that covers all entries in inputs1 and inputs2
// in *smallest, *largest.
// REQUIRES: inputs is not empty
void GetRange(const std::vector<FileMetaData*>& inputs1,
const std::vector<FileMetaData*>& inputs2,
InternalKey* smallest, InternalKey* largest);
// Add more files to the inputs on "level" to make sure that
// no newer version of a key is compacted to "level+1" while leaving an older
// version in a "level". Otherwise, any Get() will search "level" first,
// and will likely return an old/stale value for the key, since it always
// searches in increasing order of level to find the value. This could
// also scramble the order of merge operands. This function should be
// called any time a new Compaction is created, and its inputs_[0] are
// populated.
//
// Will return false if it is impossible to apply this compaction.
bool ExpandWhileOverlapping(Compaction* c);
uint64_t ExpandedCompactionByteSizeLimit(int level);
// Returns true if any one of the specified files are being compacted
bool FilesInCompaction(std::vector<FileMetaData*>& files);
// Returns true if any one of the parent files are being compacted
bool ParentRangeInCompaction(Version* version, const InternalKey* smallest,
const InternalKey* largest, int level,
int* index);
void SetupOtherInputs(Compaction* c);
// record all the ongoing compactions for all levels
std::vector<std::set<Compaction*>> compactions_in_progress_;
// Per-level target file size.
std::unique_ptr<uint64_t[]> max_file_size_;
// Per-level max bytes
std::unique_ptr<uint64_t[]> level_max_bytes_;
const Options* const options_;
private:
int num_levels_;
const InternalKeyComparator* const icmp_;
};
class UniversalCompactionPicker : public CompactionPicker {
public:
UniversalCompactionPicker(const Options* options,
const InternalKeyComparator* icmp)
: CompactionPicker(options, icmp) {}
virtual Compaction* PickCompaction(Version* version,
LogBuffer* log_buffer) override;
private:
// Pick Universal compaction to limit read amplification
Compaction* PickCompactionUniversalReadAmp(Version* version, double score,
unsigned int ratio,
unsigned int num_files,
LogBuffer* log_buffer);
// Pick Universal compaction to limit space amplification.
Compaction* PickCompactionUniversalSizeAmp(Version* version, double score,
LogBuffer* log_buffer);
};
class LevelCompactionPicker : public CompactionPicker {
public:
LevelCompactionPicker(const Options* options,
const InternalKeyComparator* icmp)
: CompactionPicker(options, icmp) {}
virtual Compaction* PickCompaction(Version* version,
LogBuffer* log_buffer) override;
private:
// For the specfied level, pick a compaction.
// Returns nullptr if there is no compaction to be done.
// If level is 0 and there is already a compaction on that level, this
// function will return nullptr.
Compaction* PickCompactionBySize(Version* version, int level, double score);
};
class FIFOCompactionPicker : public CompactionPicker {
public:
FIFOCompactionPicker(const Options* options,
const InternalKeyComparator* icmp)
: CompactionPicker(options, icmp) {}
virtual Compaction* PickCompaction(Version* version,
LogBuffer* log_buffer) override;
virtual Compaction* CompactRange(Version* version, int input_level,
int output_level, const InternalKey* begin,
const InternalKey* end,
InternalKey** compaction_end) override;
};
} // namespace rocksdb

440
db/corruption_test.cc Normal file
View File

@@ -0,0 +1,440 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "rocksdb/db.h"
#include <errno.h>
#include <fcntl.h>
#include <sys/stat.h>
#include <sys/types.h>
#include "rocksdb/cache.h"
#include "rocksdb/env.h"
#include "rocksdb/table.h"
#include "rocksdb/write_batch.h"
#include "db/db_impl.h"
#include "db/filename.h"
#include "db/log_format.h"
#include "db/version_set.h"
#include "util/logging.h"
#include "util/testharness.h"
#include "util/testutil.h"
namespace rocksdb {
static const int kValueSize = 1000;
class CorruptionTest {
public:
test::ErrorEnv env_;
std::string dbname_;
shared_ptr<Cache> tiny_cache_;
Options options_;
DB* db_;
CorruptionTest() {
tiny_cache_ = NewLRUCache(100);
options_.env = &env_;
dbname_ = test::TmpDir() + "/corruption_test";
DestroyDB(dbname_, options_);
db_ = nullptr;
options_.create_if_missing = true;
options_.block_size_deviation = 0; // make unit test pass for now
Reopen();
options_.create_if_missing = false;
}
~CorruptionTest() {
delete db_;
DestroyDB(dbname_, Options());
}
Status TryReopen(Options* options = nullptr) {
delete db_;
db_ = nullptr;
Options opt = (options ? *options : options_);
opt.env = &env_;
opt.block_cache = tiny_cache_;
opt.block_size_deviation = 0;
opt.arena_block_size = 4096;
return DB::Open(opt, dbname_, &db_);
}
void Reopen(Options* options = nullptr) {
ASSERT_OK(TryReopen(options));
}
void RepairDB() {
delete db_;
db_ = nullptr;
ASSERT_OK(::rocksdb::RepairDB(dbname_, options_));
}
void Build(int n) {
std::string key_space, value_space;
WriteBatch batch;
for (int i = 0; i < n; i++) {
//if ((i % 100) == 0) fprintf(stderr, "@ %d of %d\n", i, n);
Slice key = Key(i, &key_space);
batch.Clear();
batch.Put(key, Value(i, &value_space));
ASSERT_OK(db_->Write(WriteOptions(), &batch));
}
}
void Check(int min_expected, int max_expected) {
unsigned int next_expected = 0;
int missed = 0;
int bad_keys = 0;
int bad_values = 0;
int correct = 0;
std::string value_space;
// Do not verify checksums. If we verify checksums then the
// db itself will raise errors because data is corrupted.
// Instead, we want the reads to be successful and this test
// will detect whether the appropriate corruptions have
// occured.
Iterator* iter = db_->NewIterator(ReadOptions(false, true));
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
uint64_t key;
Slice in(iter->key());
if (!ConsumeDecimalNumber(&in, &key) ||
!in.empty() ||
key < next_expected) {
bad_keys++;
continue;
}
missed += (key - next_expected);
next_expected = key + 1;
if (iter->value() != Value(key, &value_space)) {
bad_values++;
} else {
correct++;
}
}
delete iter;
fprintf(stderr,
"expected=%d..%d; got=%d; bad_keys=%d; bad_values=%d; missed=%d\n",
min_expected, max_expected, correct, bad_keys, bad_values, missed);
ASSERT_LE(min_expected, correct);
ASSERT_GE(max_expected, correct);
}
void CorruptFile(const std::string fname, int offset, int bytes_to_corrupt) {
struct stat sbuf;
if (stat(fname.c_str(), &sbuf) != 0) {
const char* msg = strerror(errno);
ASSERT_TRUE(false) << fname << ": " << msg;
}
if (offset < 0) {
// Relative to end of file; make it absolute
if (-offset > sbuf.st_size) {
offset = 0;
} else {
offset = sbuf.st_size + offset;
}
}
if (offset > sbuf.st_size) {
offset = sbuf.st_size;
}
if (offset + bytes_to_corrupt > sbuf.st_size) {
bytes_to_corrupt = sbuf.st_size - offset;
}
// Do it
std::string contents;
Status s = ReadFileToString(Env::Default(), fname, &contents);
ASSERT_TRUE(s.ok()) << s.ToString();
for (int i = 0; i < bytes_to_corrupt; i++) {
contents[i + offset] ^= 0x80;
}
s = WriteStringToFile(Env::Default(), contents, fname);
ASSERT_TRUE(s.ok()) << s.ToString();
}
void Corrupt(FileType filetype, int offset, int bytes_to_corrupt) {
// Pick file to corrupt
std::vector<std::string> filenames;
ASSERT_OK(env_.GetChildren(dbname_, &filenames));
uint64_t number;
FileType type;
std::string fname;
int picked_number = -1;
for (unsigned int i = 0; i < filenames.size(); i++) {
if (ParseFileName(filenames[i], &number, &type) &&
type == filetype &&
static_cast<int>(number) > picked_number) { // Pick latest file
fname = dbname_ + "/" + filenames[i];
picked_number = number;
}
}
ASSERT_TRUE(!fname.empty()) << filetype;
CorruptFile(fname, offset, bytes_to_corrupt);
}
// corrupts exactly one file at level `level`. if no file found at level,
// asserts
void CorruptTableFileAtLevel(int level, int offset, int bytes_to_corrupt) {
std::vector<LiveFileMetaData> metadata;
db_->GetLiveFilesMetaData(&metadata);
for (const auto& m : metadata) {
if (m.level == level) {
CorruptFile(dbname_ + "/" + m.name, offset, bytes_to_corrupt);
return;
}
}
ASSERT_TRUE(false) << "no file found at level";
}
int Property(const std::string& name) {
std::string property;
int result;
if (db_->GetProperty(name, &property) &&
sscanf(property.c_str(), "%d", &result) == 1) {
return result;
} else {
return -1;
}
}
// Return the ith key
Slice Key(int i, std::string* storage) {
char buf[100];
snprintf(buf, sizeof(buf), "%016d", i);
storage->assign(buf, strlen(buf));
return Slice(*storage);
}
// Return the value to associate with the specified key
Slice Value(int k, std::string* storage) {
Random r(k);
return test::RandomString(&r, kValueSize, storage);
}
};
TEST(CorruptionTest, Recovery) {
Build(100);
Check(100, 100);
Corrupt(kLogFile, 19, 1); // WriteBatch tag for first record
Corrupt(kLogFile, log::kBlockSize + 1000, 1); // Somewhere in second block
Reopen();
// The 64 records in the first two log blocks are completely lost.
Check(36, 36);
}
TEST(CorruptionTest, RecoverWriteError) {
env_.writable_file_error_ = true;
Status s = TryReopen();
ASSERT_TRUE(!s.ok());
}
TEST(CorruptionTest, NewFileErrorDuringWrite) {
// Do enough writing to force minor compaction
env_.writable_file_error_ = true;
const int num = 3 + (Options().write_buffer_size / kValueSize);
std::string value_storage;
Status s;
bool failed = false;
for (int i = 0; i < num; i++) {
WriteBatch batch;
batch.Put("a", Value(100, &value_storage));
s = db_->Write(WriteOptions(), &batch);
if (!s.ok()) {
failed = true;
}
ASSERT_TRUE(!failed || !s.ok());
}
ASSERT_TRUE(!s.ok());
ASSERT_GE(env_.num_writable_file_errors_, 1);
env_.writable_file_error_ = false;
Reopen();
}
TEST(CorruptionTest, TableFile) {
Build(100);
DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
dbi->TEST_FlushMemTable();
dbi->TEST_CompactRange(0, nullptr, nullptr);
dbi->TEST_CompactRange(1, nullptr, nullptr);
Corrupt(kTableFile, 100, 1);
Check(99, 99);
}
TEST(CorruptionTest, TableFileIndexData) {
Build(10000); // Enough to build multiple Tables
DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
dbi->TEST_FlushMemTable();
Corrupt(kTableFile, -2000, 500);
Reopen();
Check(5000, 9999);
}
TEST(CorruptionTest, MissingDescriptor) {
Build(1000);
RepairDB();
Reopen();
Check(1000, 1000);
}
TEST(CorruptionTest, SequenceNumberRecovery) {
ASSERT_OK(db_->Put(WriteOptions(), "foo", "v1"));
ASSERT_OK(db_->Put(WriteOptions(), "foo", "v2"));
ASSERT_OK(db_->Put(WriteOptions(), "foo", "v3"));
ASSERT_OK(db_->Put(WriteOptions(), "foo", "v4"));
ASSERT_OK(db_->Put(WriteOptions(), "foo", "v5"));
RepairDB();
Reopen();
std::string v;
ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
ASSERT_EQ("v5", v);
// Write something. If sequence number was not recovered properly,
// it will be hidden by an earlier write.
ASSERT_OK(db_->Put(WriteOptions(), "foo", "v6"));
ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
ASSERT_EQ("v6", v);
Reopen();
ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
ASSERT_EQ("v6", v);
}
TEST(CorruptionTest, CorruptedDescriptor) {
ASSERT_OK(db_->Put(WriteOptions(), "foo", "hello"));
DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
dbi->TEST_FlushMemTable();
dbi->TEST_CompactRange(0, nullptr, nullptr);
Corrupt(kDescriptorFile, 0, 1000);
Status s = TryReopen();
ASSERT_TRUE(!s.ok());
RepairDB();
Reopen();
std::string v;
ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
ASSERT_EQ("hello", v);
}
TEST(CorruptionTest, CompactionInputError) {
Build(10);
DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
dbi->TEST_FlushMemTable();
const int last = dbi->MaxMemCompactionLevel();
ASSERT_EQ(1, Property("rocksdb.num-files-at-level" + NumberToString(last)));
Corrupt(kTableFile, 100, 1);
Check(9, 9);
// Force compactions by writing lots of values
Build(10000);
Check(10000, 10000);
}
TEST(CorruptionTest, CompactionInputErrorParanoid) {
Options options;
options.paranoid_checks = true;
options.write_buffer_size = 131072;
options.max_write_buffer_number = 2;
Reopen(&options);
DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
// Fill levels >= 1 so memtable flush outputs to level 0
for (int level = 1; level < dbi->NumberLevels(); level++) {
dbi->Put(WriteOptions(), "", "begin");
dbi->Put(WriteOptions(), "~", "end");
dbi->TEST_FlushMemTable();
}
options.max_mem_compaction_level = 0;
Reopen(&options);
dbi = reinterpret_cast<DBImpl*>(db_);
Build(10);
dbi->TEST_FlushMemTable();
dbi->TEST_WaitForCompact();
ASSERT_EQ(1, Property("rocksdb.num-files-at-level0"));
CorruptTableFileAtLevel(0, 100, 1);
Check(9, 9);
// Write must eventually fail because of corrupted table
Status s;
std::string tmp1, tmp2;
bool failed = false;
for (int i = 0; i < 10000; i++) {
s = db_->Put(WriteOptions(), Key(i, &tmp1), Value(i, &tmp2));
if (!s.ok()) {
failed = true;
}
// if one write failed, every subsequent write must fail, too
ASSERT_TRUE(!failed || !s.ok()) << "write did not fail in a corrupted db";
}
ASSERT_TRUE(!s.ok()) << "write did not fail in corrupted paranoid db";
}
TEST(CorruptionTest, UnrelatedKeys) {
Build(10);
DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
dbi->TEST_FlushMemTable();
Corrupt(kTableFile, 100, 1);
std::string tmp1, tmp2;
ASSERT_OK(db_->Put(WriteOptions(), Key(1000, &tmp1), Value(1000, &tmp2)));
std::string v;
ASSERT_OK(db_->Get(ReadOptions(), Key(1000, &tmp1), &v));
ASSERT_EQ(Value(1000, &tmp2).ToString(), v);
dbi->TEST_FlushMemTable();
ASSERT_OK(db_->Get(ReadOptions(), Key(1000, &tmp1), &v));
ASSERT_EQ(Value(1000, &tmp2).ToString(), v);
}
TEST(CorruptionTest, FileSystemStateCorrupted) {
for (int iter = 0; iter < 2; ++iter) {
Options options;
options.paranoid_checks = true;
options.create_if_missing = true;
Reopen(&options);
Build(10);
ASSERT_OK(db_->Flush(FlushOptions()));
DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
std::vector<LiveFileMetaData> metadata;
dbi->GetLiveFilesMetaData(&metadata);
ASSERT_GT(metadata.size(), size_t(0));
std::string filename = dbname_ + metadata[0].name;
delete db_;
db_ = nullptr;
if (iter == 0) { // corrupt file size
unique_ptr<WritableFile> file;
env_.NewWritableFile(filename, &file, EnvOptions());
file->Append(Slice("corrupted sst"));
file.reset();
} else { // delete the file
env_.DeleteFile(filename);
}
Status x = TryReopen(&options);
ASSERT_TRUE(x.IsCorruption());
DestroyDB(dbname_, options_);
Reopen(&options);
}
}
} // namespace rocksdb
int main(int argc, char** argv) {
return rocksdb::test::RunAllTests();
}

2642
db/db_bench.cc Normal file

File diff suppressed because it is too large Load Diff

179
db/db_filesnapshot.cc Normal file
View File

@@ -0,0 +1,179 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2012 Facebook.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef ROCKSDB_LITE
#define __STDC_FORMAT_MACROS
#include <inttypes.h>
#include <algorithm>
#include <string>
#include <stdint.h>
#include "db/db_impl.h"
#include "db/filename.h"
#include "db/version_set.h"
#include "rocksdb/db.h"
#include "rocksdb/env.h"
#include "port/port.h"
#include "util/mutexlock.h"
#include "util/sync_point.h"
namespace rocksdb {
Status DBImpl::DisableFileDeletions() {
MutexLock l(&mutex_);
++disable_delete_obsolete_files_;
if (disable_delete_obsolete_files_ == 1) {
Log(options_.info_log, "File Deletions Disabled");
} else {
Log(options_.info_log,
"File Deletions Disabled, but already disabled. Counter: %d",
disable_delete_obsolete_files_);
}
return Status::OK();
}
Status DBImpl::EnableFileDeletions(bool force) {
DeletionState deletion_state;
bool should_purge_files = false;
{
MutexLock l(&mutex_);
if (force) {
// if force, we need to enable file deletions right away
disable_delete_obsolete_files_ = 0;
} else if (disable_delete_obsolete_files_ > 0) {
--disable_delete_obsolete_files_;
}
if (disable_delete_obsolete_files_ == 0) {
Log(options_.info_log, "File Deletions Enabled");
should_purge_files = true;
FindObsoleteFiles(deletion_state, true);
} else {
Log(options_.info_log,
"File Deletions Enable, but not really enabled. Counter: %d",
disable_delete_obsolete_files_);
}
}
if (should_purge_files) {
PurgeObsoleteFiles(deletion_state);
}
LogFlush(options_.info_log);
return Status::OK();
}
Status DBImpl::GetLiveFiles(std::vector<std::string>& ret,
uint64_t* manifest_file_size,
bool flush_memtable) {
*manifest_file_size = 0;
mutex_.Lock();
if (flush_memtable) {
// flush all dirty data to disk.
Status status;
for (auto cfd : *versions_->GetColumnFamilySet()) {
cfd->Ref();
mutex_.Unlock();
status = FlushMemTable(cfd, FlushOptions());
mutex_.Lock();
cfd->Unref();
if (!status.ok()) {
break;
}
}
versions_->GetColumnFamilySet()->FreeDeadColumnFamilies();
if (!status.ok()) {
mutex_.Unlock();
Log(options_.info_log, "Cannot Flush data %s\n",
status.ToString().c_str());
return status;
}
}
// Make a set of all of the live *.sst files
std::set<uint64_t> live;
for (auto cfd : *versions_->GetColumnFamilySet()) {
cfd->current()->AddLiveFiles(&live);
}
ret.clear();
ret.reserve(live.size() + 2); //*.sst + CURRENT + MANIFEST
// create names of the live files. The names are not absolute
// paths, instead they are relative to dbname_;
for (auto live_file : live) {
ret.push_back(TableFileName("", live_file));
}
ret.push_back(CurrentFileName(""));
ret.push_back(DescriptorFileName("", versions_->ManifestFileNumber()));
// find length of manifest file while holding the mutex lock
*manifest_file_size = versions_->ManifestFileSize();
mutex_.Unlock();
return Status::OK();
}
Status DBImpl::GetSortedWalFiles(VectorLogPtr& files) {
// First get sorted files in db dir, then get sorted files from archived
// dir, to avoid a race condition where a log file is moved to archived
// dir in between.
Status s;
// list wal files in main db dir.
VectorLogPtr logs;
s = GetSortedWalsOfType(options_.wal_dir, logs, kAliveLogFile);
if (!s.ok()) {
return s;
}
// Reproduce the race condition where a log file is moved
// to archived dir, between these two sync points, used in
// (DBTest,TransactionLogIteratorRace)
TEST_SYNC_POINT("DBImpl::GetSortedWalFiles:1");
TEST_SYNC_POINT("DBImpl::GetSortedWalFiles:2");
files.clear();
// list wal files in archive dir.
std::string archivedir = ArchivalDirectory(options_.wal_dir);
if (env_->FileExists(archivedir)) {
s = GetSortedWalsOfType(archivedir, files, kArchivedLogFile);
if (!s.ok()) {
return s;
}
}
uint64_t latest_archived_log_number = 0;
if (!files.empty()) {
latest_archived_log_number = files.back()->LogNumber();
Log(options_.info_log, "Latest Archived log: %" PRIu64,
latest_archived_log_number);
}
files.reserve(files.size() + logs.size());
for (auto& log : logs) {
if (log->LogNumber() > latest_archived_log_number) {
files.push_back(std::move(log));
} else {
// When the race condition happens, we could see the
// same log in both db dir and archived dir. Simply
// ignore the one in db dir. Note that, if we read
// archived dir first, we would have missed the log file.
Log(options_.info_log, "%s already moved to archive",
log->PathName().c_str());
}
}
return s;
}
}
#endif // ROCKSDB_LITE

4703
db/db_impl.cc Normal file

File diff suppressed because it is too large Load Diff

635
db/db_impl.h Normal file
View File

@@ -0,0 +1,635 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#pragma once
#include <atomic>
#include <deque>
#include <set>
#include <utility>
#include <vector>
#include <string>
#include "db/dbformat.h"
#include "db/log_writer.h"
#include "db/snapshot.h"
#include "db/column_family.h"
#include "db/version_edit.h"
#include "memtable_list.h"
#include "port/port.h"
#include "rocksdb/db.h"
#include "rocksdb/env.h"
#include "rocksdb/memtablerep.h"
#include "rocksdb/transaction_log.h"
#include "util/autovector.h"
#include "util/stats_logger.h"
#include "util/thread_local.h"
#include "db/internal_stats.h"
namespace rocksdb {
class MemTable;
class TableCache;
class Version;
class VersionEdit;
class VersionSet;
class CompactionFilterV2;
class Arena;
class DBImpl : public DB {
public:
DBImpl(const DBOptions& options, const std::string& dbname);
virtual ~DBImpl();
// Implementations of the DB interface
using DB::Put;
virtual Status Put(const WriteOptions& options,
ColumnFamilyHandle* column_family, const Slice& key,
const Slice& value);
using DB::Merge;
virtual Status Merge(const WriteOptions& options,
ColumnFamilyHandle* column_family, const Slice& key,
const Slice& value);
using DB::Delete;
virtual Status Delete(const WriteOptions& options,
ColumnFamilyHandle* column_family, const Slice& key);
using DB::Write;
virtual Status Write(const WriteOptions& options, WriteBatch* updates);
using DB::Get;
virtual Status Get(const ReadOptions& options,
ColumnFamilyHandle* column_family, const Slice& key,
std::string* value);
using DB::MultiGet;
virtual std::vector<Status> MultiGet(
const ReadOptions& options,
const std::vector<ColumnFamilyHandle*>& column_family,
const std::vector<Slice>& keys, std::vector<std::string>* values);
virtual Status CreateColumnFamily(const ColumnFamilyOptions& options,
const std::string& column_family,
ColumnFamilyHandle** handle);
virtual Status DropColumnFamily(ColumnFamilyHandle* column_family);
// Returns false if key doesn't exist in the database and true if it may.
// If value_found is not passed in as null, then return the value if found in
// memory. On return, if value was found, then value_found will be set to true
// , otherwise false.
using DB::KeyMayExist;
virtual bool KeyMayExist(const ReadOptions& options,
ColumnFamilyHandle* column_family, const Slice& key,
std::string* value, bool* value_found = nullptr);
using DB::NewIterator;
virtual Iterator* NewIterator(const ReadOptions& options,
ColumnFamilyHandle* column_family);
virtual Status NewIterators(
const ReadOptions& options,
const std::vector<ColumnFamilyHandle*>& column_families,
std::vector<Iterator*>* iterators);
virtual const Snapshot* GetSnapshot();
virtual void ReleaseSnapshot(const Snapshot* snapshot);
using DB::GetProperty;
virtual bool GetProperty(ColumnFamilyHandle* column_family,
const Slice& property, std::string* value);
using DB::GetApproximateSizes;
virtual void GetApproximateSizes(ColumnFamilyHandle* column_family,
const Range* range, int n, uint64_t* sizes);
using DB::CompactRange;
virtual Status CompactRange(ColumnFamilyHandle* column_family,
const Slice* begin, const Slice* end,
bool reduce_level = false, int target_level = -1);
using DB::NumberLevels;
virtual int NumberLevels(ColumnFamilyHandle* column_family);
using DB::MaxMemCompactionLevel;
virtual int MaxMemCompactionLevel(ColumnFamilyHandle* column_family);
using DB::Level0StopWriteTrigger;
virtual int Level0StopWriteTrigger(ColumnFamilyHandle* column_family);
virtual const std::string& GetName() const;
virtual Env* GetEnv() const;
using DB::GetOptions;
virtual const Options& GetOptions(ColumnFamilyHandle* column_family) const;
using DB::Flush;
virtual Status Flush(const FlushOptions& options,
ColumnFamilyHandle* column_family);
virtual SequenceNumber GetLatestSequenceNumber() const;
#ifndef ROCKSDB_LITE
virtual Status DisableFileDeletions();
virtual Status EnableFileDeletions(bool force);
// All the returned filenames start with "/"
virtual Status GetLiveFiles(std::vector<std::string>&,
uint64_t* manifest_file_size,
bool flush_memtable = true);
virtual Status GetSortedWalFiles(VectorLogPtr& files);
virtual Status GetUpdatesSince(
SequenceNumber seq_number, unique_ptr<TransactionLogIterator>* iter,
const TransactionLogIterator::ReadOptions&
read_options = TransactionLogIterator::ReadOptions());
virtual Status DeleteFile(std::string name);
virtual void GetLiveFilesMetaData(std::vector<LiveFileMetaData>* metadata);
#endif // ROCKSDB_LITE
// checks if all live files exist on file system and that their file sizes
// match to our in-memory records
virtual Status CheckConsistency();
virtual Status GetDbIdentity(std::string& identity);
Status RunManualCompaction(ColumnFamilyData* cfd, int input_level,
int output_level, const Slice* begin,
const Slice* end);
#ifndef ROCKSDB_LITE
// Extra methods (for testing) that are not in the public DB interface
// Implemented in db_impl_debug.cc
// Compact any files in the named level that overlap [*begin, *end]
Status TEST_CompactRange(int level, const Slice* begin, const Slice* end,
ColumnFamilyHandle* column_family = nullptr);
// Force current memtable contents to be flushed.
Status TEST_FlushMemTable(bool wait = true);
// Wait for memtable compaction
Status TEST_WaitForFlushMemTable(ColumnFamilyHandle* column_family = nullptr);
// Wait for any compaction
Status TEST_WaitForCompact();
// Return an internal iterator over the current state of the database.
// The keys of this iterator are internal keys (see format.h).
// The returned iterator should be deleted when no longer needed.
Iterator* TEST_NewInternalIterator(ColumnFamilyHandle* column_family =
nullptr);
// Return the maximum overlapping data (in bytes) at next level for any
// file at a level >= 1.
int64_t TEST_MaxNextLevelOverlappingBytes(ColumnFamilyHandle* column_family =
nullptr);
// Return the current manifest file no.
uint64_t TEST_Current_Manifest_FileNo();
// Trigger's a background call for testing.
void TEST_PurgeObsoleteteWAL();
// get total level0 file size. Only for testing.
uint64_t TEST_GetLevel0TotalSize();
void TEST_SetDefaultTimeToCheck(uint64_t default_interval_to_delete_obsolete_WAL)
{
default_interval_to_delete_obsolete_WAL_ = default_interval_to_delete_obsolete_WAL;
}
void TEST_GetFilesMetaData(ColumnFamilyHandle* column_family,
std::vector<std::vector<FileMetaData>>* metadata);
Status TEST_ReadFirstRecord(const WalFileType type, const uint64_t number,
SequenceNumber* sequence);
Status TEST_ReadFirstLine(const std::string& fname, SequenceNumber* sequence);
#endif // NDEBUG
// needed for CleanupIteratorState
struct DeletionState {
inline bool HaveSomethingToDelete() const {
return candidate_files.size() ||
sst_delete_files.size() ||
log_delete_files.size();
}
// a list of all files that we'll consider deleting
// (every once in a while this is filled up with all files
// in the DB directory)
std::vector<std::string> candidate_files;
// the list of all live sst files that cannot be deleted
std::vector<uint64_t> sst_live;
// a list of sst files that we need to delete
std::vector<FileMetaData*> sst_delete_files;
// a list of log files that we need to delete
std::vector<uint64_t> log_delete_files;
// a list of memtables to be free
autovector<MemTable*> memtables_to_free;
autovector<SuperVersion*> superversions_to_free;
SuperVersion* new_superversion; // if nullptr no new superversion
// the current manifest_file_number, log_number and prev_log_number
// that corresponds to the set of files in 'live'.
uint64_t manifest_file_number, pending_manifest_file_number, log_number,
prev_log_number;
explicit DeletionState(bool create_superversion = false) {
manifest_file_number = 0;
pending_manifest_file_number = 0;
log_number = 0;
prev_log_number = 0;
new_superversion = create_superversion ? new SuperVersion() : nullptr;
}
~DeletionState() {
// free pending memtables
for (auto m : memtables_to_free) {
delete m;
}
// free superversions
for (auto s : superversions_to_free) {
delete s;
}
// if new_superversion was not used, it will be non-nullptr and needs
// to be freed here
delete new_superversion;
}
};
// Returns the list of live files in 'live' and the list
// of all files in the filesystem in 'candidate_files'.
// If force == false and the last call was less than
// options_.delete_obsolete_files_period_micros microseconds ago,
// it will not fill up the deletion_state
void FindObsoleteFiles(DeletionState& deletion_state,
bool force,
bool no_full_scan = false);
// Diffs the files listed in filenames and those that do not
// belong to live files are posibly removed. Also, removes all the
// files in sst_delete_files and log_delete_files.
// It is not necessary to hold the mutex when invoking this method.
void PurgeObsoleteFiles(DeletionState& deletion_state);
ColumnFamilyHandle* DefaultColumnFamily() const;
protected:
Env* const env_;
const std::string dbname_;
unique_ptr<VersionSet> versions_;
const DBOptions options_;
Iterator* NewInternalIterator(const ReadOptions&, ColumnFamilyData* cfd,
SuperVersion* super_version,
Arena* arena = nullptr);
private:
friend class DB;
friend class InternalStats;
#ifndef ROCKSDB_LITE
friend class TailingIterator;
friend class ForwardIterator;
#endif
friend struct SuperVersion;
struct CompactionState;
struct Writer;
Status NewDB();
// Recover the descriptor from persistent storage. May do a significant
// amount of work to recover recently logged updates. Any changes to
// be made to the descriptor are added to *edit.
Status Recover(const std::vector<ColumnFamilyDescriptor>& column_families,
bool read_only = false, bool error_if_log_file_exist = false);
void MaybeIgnoreError(Status* s) const;
const Status CreateArchivalDirectory();
// Delete any unneeded files and stale in-memory entries.
void DeleteObsoleteFiles();
// Flush the in-memory write buffer to storage. Switches to a new
// log-file/memtable and writes a new descriptor iff successful.
Status FlushMemTableToOutputFile(ColumnFamilyData* cfd, bool* madeProgress,
DeletionState& deletion_state,
LogBuffer* log_buffer);
Status RecoverLogFile(uint64_t log_number, SequenceNumber* max_sequence,
bool read_only);
// The following two methods are used to flush a memtable to
// storage. The first one is used atdatabase RecoveryTime (when the
// database is opened) and is heavyweight because it holds the mutex
// for the entire period. The second method WriteLevel0Table supports
// concurrent flush memtables to storage.
Status WriteLevel0TableForRecovery(ColumnFamilyData* cfd, MemTable* mem,
VersionEdit* edit);
Status WriteLevel0Table(ColumnFamilyData* cfd, autovector<MemTable*>& mems,
VersionEdit* edit, uint64_t* filenumber,
LogBuffer* log_buffer);
uint64_t SlowdownAmount(int n, double bottom, double top);
// TODO(icanadi) free superversion_to_free and old_log outside of mutex
Status MakeRoomForWrite(ColumnFamilyData* cfd,
bool force /* flush even if there is room? */,
autovector<SuperVersion*>* superversions_to_free,
autovector<log::Writer*>* logs_to_free);
void BuildBatchGroup(Writer** last_writer,
autovector<WriteBatch*>* write_batch_group);
// Force current memtable contents to be flushed.
Status FlushMemTable(ColumnFamilyData* cfd, const FlushOptions& options);
// Wait for memtable flushed
Status WaitForFlushMemTable(ColumnFamilyData* cfd);
void MaybeScheduleLogDBDeployStats();
#ifndef ROCKSDB_LITE
static void BGLogDBDeployStats(void* db);
void LogDBDeployStats();
#endif // ROCKSDB_LITE
void MaybeScheduleFlushOrCompaction();
static void BGWorkCompaction(void* db);
static void BGWorkFlush(void* db);
void BackgroundCallCompaction();
void BackgroundCallFlush();
Status BackgroundCompaction(bool* madeProgress, DeletionState& deletion_state,
LogBuffer* log_buffer);
Status BackgroundFlush(bool* madeProgress, DeletionState& deletion_state,
LogBuffer* log_buffer);
void CleanupCompaction(CompactionState* compact, Status status);
Status DoCompactionWork(CompactionState* compact,
DeletionState& deletion_state,
LogBuffer* log_buffer);
// This function is called as part of compaction. It enables Flush process to
// preempt compaction, since it's higher prioirty
// Returns: micros spent executing
uint64_t CallFlushDuringCompaction(ColumnFamilyData* cfd,
DeletionState& deletion_state,
LogBuffer* log_buffer);
// Call compaction filter if is_compaction_v2 is not true. Then iterate
// through input and compact the kv-pairs
Status ProcessKeyValueCompaction(
SequenceNumber visible_at_tip,
SequenceNumber earliest_snapshot,
SequenceNumber latest_snapshot,
DeletionState& deletion_state,
bool bottommost_level,
int64_t& imm_micros,
Iterator* input,
CompactionState* compact,
bool is_compaction_v2,
LogBuffer* log_buffer);
// Call compaction_filter_v2->Filter() on kv-pairs in compact
void CallCompactionFilterV2(CompactionState* compact,
CompactionFilterV2* compaction_filter_v2);
Status OpenCompactionOutputFile(CompactionState* compact);
Status FinishCompactionOutputFile(CompactionState* compact, Iterator* input);
Status InstallCompactionResults(CompactionState* compact,
LogBuffer* log_buffer);
void AllocateCompactionOutputFileNumbers(CompactionState* compact);
void ReleaseCompactionUnusedFileNumbers(CompactionState* compact);
#ifdef ROCKSDB_LITE
void PurgeObsoleteWALFiles() {
// this function is used for archiving WAL files. we don't need this in
// ROCKSDB_LITE
}
#else
void PurgeObsoleteWALFiles();
Status GetSortedWalsOfType(const std::string& path,
VectorLogPtr& log_files,
WalFileType type);
// Requires: all_logs should be sorted with earliest log file first
// Retains all log files in all_logs which contain updates with seq no.
// Greater Than or Equal to the requested SequenceNumber.
Status RetainProbableWalFiles(VectorLogPtr& all_logs,
const SequenceNumber target);
Status ReadFirstRecord(const WalFileType type, const uint64_t number,
SequenceNumber* sequence);
Status ReadFirstLine(const std::string& fname, SequenceNumber* sequence);
#endif // ROCKSDB_LITE
void PrintStatistics();
// dump rocksdb.stats to LOG
void MaybeDumpStats();
// Return true if the current db supports snapshot. If the current
// DB does not support snapshot, then calling GetSnapshot() will always
// return nullptr.
//
// @see GetSnapshot()
virtual bool IsSnapshotSupported() const;
// Return the minimum empty level that could hold the total data in the
// input level. Return the input level, if such level could not be found.
int FindMinimumEmptyLevelFitting(ColumnFamilyData* cfd, int level);
// Move the files in the input level to the target level.
// If target_level < 0, automatically calculate the minimum level that could
// hold the data set.
Status ReFitLevel(ColumnFamilyData* cfd, int level, int target_level = -1);
// table_cache_ provides its own synchronization
std::shared_ptr<Cache> table_cache_;
// Lock over the persistent DB state. Non-nullptr iff successfully acquired.
FileLock* db_lock_;
// State below is protected by mutex_
port::Mutex mutex_;
port::AtomicPointer shutting_down_;
// This condition variable is signaled on these conditions:
// * whenever bg_compaction_scheduled_ goes down to 0
// * if bg_manual_only_ > 0, whenever a compaction finishes, even if it hasn't
// made any progress
// * whenever a compaction made any progress
// * whenever bg_flush_scheduled_ value decreases (i.e. whenever a flush is
// done, even if it didn't make any progress)
// * whenever there is an error in background flush or compaction
// * whenever bg_logstats_scheduled_ turns to false
port::CondVar bg_cv_;
uint64_t logfile_number_;
unique_ptr<log::Writer> log_;
bool log_empty_;
ColumnFamilyHandleImpl* default_cf_handle_;
unique_ptr<ColumnFamilyMemTablesImpl> column_family_memtables_;
struct LogFileNumberSize {
explicit LogFileNumberSize(uint64_t _number)
: number(_number), size(0), getting_flushed(false) {}
void AddSize(uint64_t new_size) { size += new_size; }
uint64_t number;
uint64_t size;
bool getting_flushed;
};
std::deque<LogFileNumberSize> alive_log_files_;
uint64_t total_log_size_;
// only used for dynamically adjusting max_total_wal_size. it is a sum of
// [write_buffer_size * max_write_buffer_number] over all column families
uint64_t max_total_in_memory_state_;
std::string host_name_;
std::unique_ptr<Directory> db_directory_;
// Queue of writers.
std::deque<Writer*> writers_;
WriteBatch tmp_batch_;
SnapshotList snapshots_;
// cache for ReadFirstRecord() calls
std::unordered_map<uint64_t, SequenceNumber> read_first_record_cache_;
port::Mutex read_first_record_cache_mutex_;
// Set of table files to protect from deletion because they are
// part of ongoing compactions.
std::set<uint64_t> pending_outputs_;
// At least one compaction or flush job is pending but not yet scheduled
// because of the max background thread limit.
bool bg_schedule_needed_;
// count how many background compactions are running or have been scheduled
int bg_compaction_scheduled_;
// If non-zero, MaybeScheduleFlushOrCompaction() will only schedule manual
// compactions (if manual_compaction_ is not null). This mechanism enables
// manual compactions to wait until all other compactions are finished.
int bg_manual_only_;
// number of background memtable flush jobs, submitted to the HIGH pool
int bg_flush_scheduled_;
// Has a background stats log thread scheduled?
bool bg_logstats_scheduled_;
// Information for a manual compaction
struct ManualCompaction {
ColumnFamilyData* cfd;
int input_level;
int output_level;
bool done;
Status status;
bool in_progress; // compaction request being processed?
const InternalKey* begin; // nullptr means beginning of key range
const InternalKey* end; // nullptr means end of key range
InternalKey tmp_storage; // Used to keep track of compaction progress
};
ManualCompaction* manual_compaction_;
// Have we encountered a background error in paranoid mode?
Status bg_error_;
std::unique_ptr<StatsLogger> logger_;
int64_t volatile last_log_ts;
// shall we disable deletion of obsolete files
// if 0 the deletion is enabled.
// if non-zero, files will not be getting deleted
// This enables two different threads to call
// EnableFileDeletions() and DisableFileDeletions()
// without any synchronization
int disable_delete_obsolete_files_;
// last time when DeleteObsoleteFiles was invoked
uint64_t delete_obsolete_files_last_run_;
// last time when PurgeObsoleteWALFiles ran.
uint64_t purge_wal_files_last_run_;
// last time stats were dumped to LOG
std::atomic<uint64_t> last_stats_dump_time_microsec_;
// obsolete files will be deleted every this seconds if ttl deletion is
// enabled and archive size_limit is disabled.
uint64_t default_interval_to_delete_obsolete_WAL_;
bool flush_on_destroy_; // Used when disableWAL is true.
static const int KEEP_LOG_FILE_NUM = 1000;
std::string db_absolute_path_;
// count of the number of contiguous delaying writes
int delayed_writes_;
// The options to access storage files
const EnvOptions storage_options_;
// A value of true temporarily disables scheduling of background work
bool bg_work_gate_closed_;
// Guard against multiple concurrent refitting
bool refitting_level_;
// Indicate DB was opened successfully
bool opened_successfully_;
// No copying allowed
DBImpl(const DBImpl&);
void operator=(const DBImpl&);
// dump the delayed_writes_ to the log file and reset counter.
void DelayLoggingAndReset();
// Return the earliest snapshot where seqno is visible.
// Store the snapshot right before that, if any, in prev_snapshot
inline SequenceNumber findEarliestVisibleSnapshot(
SequenceNumber in,
std::vector<SequenceNumber>& snapshots,
SequenceNumber* prev_snapshot);
// Background threads call this function, which is just a wrapper around
// the cfd->InstallSuperVersion() function. Background threads carry
// deletion_state which can have new_superversion already allocated.
void InstallSuperVersion(ColumnFamilyData* cfd,
DeletionState& deletion_state);
#ifndef ROCKSDB_LITE
using DB::GetPropertiesOfAllTables;
virtual Status GetPropertiesOfAllTables(ColumnFamilyHandle* column_family,
TablePropertiesCollection* props)
override;
#endif // ROCKSDB_LITE
// Function that Get and KeyMayExist call with no_io true or false
// Note: 'value_found' from KeyMayExist propagates here
Status GetImpl(const ReadOptions& options, ColumnFamilyHandle* column_family,
const Slice& key, std::string* value,
bool* value_found = nullptr);
};
// Sanitize db options. The caller should delete result.info_log if
// it is not equal to src.info_log.
extern Options SanitizeOptions(const std::string& db,
const InternalKeyComparator* icmp,
const InternalFilterPolicy* ipolicy,
const Options& src);
extern DBOptions SanitizeOptions(const std::string& db, const DBOptions& src);
// Determine compression type, based on user options, level of the output
// file and whether compression is disabled.
// If enable_compression is false, then compression is always disabled no
// matter what the values of the other two parameters are.
// Otherwise, the compression type is determined based on options and level.
CompressionType GetCompressionType(const Options& options, int level,
const bool enable_compression);
// Determine compression type for L0 file written by memtable flush.
CompressionType GetCompressionFlush(const Options& options);
} // namespace rocksdb

133
db/db_impl_debug.cc Normal file
View File

@@ -0,0 +1,133 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#ifndef ROCKSDB_LITE
#include "db/db_impl.h"
namespace rocksdb {
void DBImpl::TEST_PurgeObsoleteteWAL() { PurgeObsoleteWALFiles(); }
uint64_t DBImpl::TEST_GetLevel0TotalSize() {
MutexLock l(&mutex_);
return default_cf_handle_->cfd()->current()->NumLevelBytes(0);
}
Iterator* DBImpl::TEST_NewInternalIterator(ColumnFamilyHandle* column_family) {
ColumnFamilyData* cfd;
if (column_family == nullptr) {
cfd = default_cf_handle_->cfd();
} else {
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
cfd = cfh->cfd();
}
mutex_.Lock();
SuperVersion* super_version = cfd->GetSuperVersion()->Ref();
mutex_.Unlock();
ReadOptions roptions;
return NewInternalIterator(roptions, cfd, super_version);
}
int64_t DBImpl::TEST_MaxNextLevelOverlappingBytes(
ColumnFamilyHandle* column_family) {
ColumnFamilyData* cfd;
if (column_family == nullptr) {
cfd = default_cf_handle_->cfd();
} else {
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
cfd = cfh->cfd();
}
MutexLock l(&mutex_);
return cfd->current()->MaxNextLevelOverlappingBytes();
}
void DBImpl::TEST_GetFilesMetaData(
ColumnFamilyHandle* column_family,
std::vector<std::vector<FileMetaData>>* metadata) {
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
auto cfd = cfh->cfd();
MutexLock l(&mutex_);
metadata->resize(NumberLevels());
for (int level = 0; level < NumberLevels(); level++) {
const std::vector<FileMetaData*>& files = cfd->current()->files_[level];
(*metadata)[level].clear();
for (const auto& f : files) {
(*metadata)[level].push_back(*f);
}
}
}
uint64_t DBImpl::TEST_Current_Manifest_FileNo() {
return versions_->ManifestFileNumber();
}
Status DBImpl::TEST_CompactRange(int level, const Slice* begin,
const Slice* end,
ColumnFamilyHandle* column_family) {
ColumnFamilyData* cfd;
if (column_family == nullptr) {
cfd = default_cf_handle_->cfd();
} else {
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
cfd = cfh->cfd();
}
int output_level =
(cfd->options()->compaction_style == kCompactionStyleUniversal ||
cfd->options()->compaction_style == kCompactionStyleFIFO)
? level
: level + 1;
return RunManualCompaction(cfd, level, output_level, begin, end);
}
Status DBImpl::TEST_FlushMemTable(bool wait) {
FlushOptions fo;
fo.wait = wait;
return FlushMemTable(default_cf_handle_->cfd(), fo);
}
Status DBImpl::TEST_WaitForFlushMemTable(ColumnFamilyHandle* column_family) {
ColumnFamilyData* cfd;
if (column_family == nullptr) {
cfd = default_cf_handle_->cfd();
} else {
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
cfd = cfh->cfd();
}
return WaitForFlushMemTable(cfd);
}
Status DBImpl::TEST_WaitForCompact() {
// Wait until the compaction completes
// TODO: a bug here. This function actually does not necessarily
// wait for compact. It actually waits for scheduled compaction
// OR flush to finish.
MutexLock l(&mutex_);
while ((bg_compaction_scheduled_ || bg_flush_scheduled_) && bg_error_.ok()) {
bg_cv_.Wait();
}
return bg_error_;
}
Status DBImpl::TEST_ReadFirstRecord(const WalFileType type,
const uint64_t number,
SequenceNumber* sequence) {
return ReadFirstRecord(type, number, sequence);
}
Status DBImpl::TEST_ReadFirstLine(const std::string& fname,
SequenceNumber* sequence) {
return ReadFirstLine(fname, sequence);
}
} // namespace rocksdb
#endif // ROCKSDB_LITE

154
db/db_impl_readonly.cc Normal file
View File

@@ -0,0 +1,154 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2012 Facebook. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "db/db_impl_readonly.h"
#include "db/db_impl.h"
#include <algorithm>
#include <set>
#include <string>
#include <stdint.h>
#include <stdio.h>
#include <vector>
#include <algorithm>
#include "db/db_iter.h"
#include "db/dbformat.h"
#include "db/filename.h"
#include "db/log_reader.h"
#include "db/log_writer.h"
#include "db/memtable.h"
#include "db/merge_context.h"
#include "db/table_cache.h"
#include "db/version_set.h"
#include "db/write_batch_internal.h"
#include "rocksdb/db.h"
#include "rocksdb/env.h"
#include "rocksdb/status.h"
#include "rocksdb/table.h"
#include "rocksdb/merge_operator.h"
#include "port/port.h"
#include "table/block.h"
#include "table/merger.h"
#include "table/two_level_iterator.h"
#include "util/coding.h"
#include "util/logging.h"
#include "util/build_version.h"
namespace rocksdb {
DBImplReadOnly::DBImplReadOnly(const DBOptions& options,
const std::string& dbname)
: DBImpl(options, dbname) {
Log(options_.info_log, "Opening the db in read only mode");
}
DBImplReadOnly::~DBImplReadOnly() {
}
// Implementations of the DB interface
Status DBImplReadOnly::Get(const ReadOptions& options,
ColumnFamilyHandle* column_family, const Slice& key,
std::string* value) {
Status s;
SequenceNumber snapshot = versions_->LastSequence();
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
auto cfd = cfh->cfd();
SuperVersion* super_version = cfd->GetSuperVersion();
MergeContext merge_context;
LookupKey lkey(key, snapshot);
if (super_version->mem->Get(lkey, value, &s, merge_context,
*cfd->options())) {
} else {
Version::GetStats stats;
super_version->current->Get(options, lkey, value, &s, &merge_context,
&stats);
}
return s;
}
Iterator* DBImplReadOnly::NewIterator(const ReadOptions& options,
ColumnFamilyHandle* column_family) {
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
auto cfd = cfh->cfd();
SuperVersion* super_version = cfd->GetSuperVersion()->Ref();
SequenceNumber latest_snapshot = versions_->LastSequence();
Iterator* internal_iter = NewInternalIterator(options, cfd, super_version);
return NewDBIterator(
env_, *cfd->options(), cfd->user_comparator(), internal_iter,
(options.snapshot != nullptr
? reinterpret_cast<const SnapshotImpl*>(options.snapshot)->number_
: latest_snapshot));
}
Status DB::OpenForReadOnly(const Options& options, const std::string& dbname,
DB** dbptr, bool error_if_log_file_exist) {
*dbptr = nullptr;
DBOptions db_options(options);
ColumnFamilyOptions cf_options(options);
std::vector<ColumnFamilyDescriptor> column_families;
column_families.push_back(
ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options));
std::vector<ColumnFamilyHandle*> handles;
Status s =
DB::OpenForReadOnly(db_options, dbname, column_families, &handles, dbptr);
if (s.ok()) {
assert(handles.size() == 1);
// i can delete the handle since DBImpl is always holding a
// reference to default column family
delete handles[0];
}
return s;
}
Status DB::OpenForReadOnly(
const DBOptions& db_options, const std::string& dbname,
const std::vector<ColumnFamilyDescriptor>& column_families,
std::vector<ColumnFamilyHandle*>* handles, DB** dbptr,
bool error_if_log_file_exist) {
*dbptr = nullptr;
handles->clear();
DBImplReadOnly* impl = new DBImplReadOnly(db_options, dbname);
impl->mutex_.Lock();
Status s = impl->Recover(column_families, true /* read only */,
error_if_log_file_exist);
if (s.ok()) {
// set column family handles
for (auto cf : column_families) {
auto cfd =
impl->versions_->GetColumnFamilySet()->GetColumnFamily(cf.name);
if (cfd == nullptr) {
s = Status::InvalidArgument("Column family not found: ", cf.name);
break;
}
handles->push_back(new ColumnFamilyHandleImpl(cfd, impl, &impl->mutex_));
}
}
if (s.ok()) {
for (auto cfd : *impl->versions_->GetColumnFamilySet()) {
delete cfd->InstallSuperVersion(new SuperVersion(), &impl->mutex_);
}
}
impl->mutex_.Unlock();
if (s.ok()) {
*dbptr = impl;
} else {
for (auto h : *handles) {
delete h;
}
handles->clear();
delete impl;
}
return s;
}
} // namespace rocksdb

103
db/db_impl_readonly.h Normal file
View File

@@ -0,0 +1,103 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2012 Facebook. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#pragma once
#include "db/db_impl.h"
#include <deque>
#include <set>
#include <vector>
#include <string>
#include "db/dbformat.h"
#include "db/log_writer.h"
#include "db/snapshot.h"
#include "rocksdb/db.h"
#include "rocksdb/env.h"
#include "port/port.h"
#include "util/stats_logger.h"
namespace rocksdb {
class DBImplReadOnly : public DBImpl {
public:
DBImplReadOnly(const DBOptions& options, const std::string& dbname);
virtual ~DBImplReadOnly();
// Implementations of the DB interface
using DB::Get;
virtual Status Get(const ReadOptions& options,
ColumnFamilyHandle* column_family, const Slice& key,
std::string* value);
// TODO: Implement ReadOnly MultiGet?
using DBImpl::NewIterator;
virtual Iterator* NewIterator(const ReadOptions&,
ColumnFamilyHandle* column_family);
virtual Status NewIterators(
const ReadOptions& options,
const std::vector<ColumnFamilyHandle*>& column_family,
std::vector<Iterator*>* iterators) {
// TODO
return Status::NotSupported("Not supported yet.");
}
using DBImpl::Put;
virtual Status Put(const WriteOptions& options,
ColumnFamilyHandle* column_family, const Slice& key,
const Slice& value) {
return Status::NotSupported("Not supported operation in read only mode.");
}
using DBImpl::Merge;
virtual Status Merge(const WriteOptions& options,
ColumnFamilyHandle* column_family, const Slice& key,
const Slice& value) {
return Status::NotSupported("Not supported operation in read only mode.");
}
using DBImpl::Delete;
virtual Status Delete(const WriteOptions& options,
ColumnFamilyHandle* column_family, const Slice& key) {
return Status::NotSupported("Not supported operation in read only mode.");
}
virtual Status Write(const WriteOptions& options, WriteBatch* updates) {
return Status::NotSupported("Not supported operation in read only mode.");
}
using DBImpl::CompactRange;
virtual Status CompactRange(ColumnFamilyHandle* column_family,
const Slice* begin, const Slice* end,
bool reduce_level = false,
int target_level = -1) {
return Status::NotSupported("Not supported operation in read only mode.");
}
virtual Status DisableFileDeletions() {
return Status::NotSupported("Not supported operation in read only mode.");
}
virtual Status EnableFileDeletions(bool force) {
return Status::NotSupported("Not supported operation in read only mode.");
}
virtual Status GetLiveFiles(std::vector<std::string>&,
uint64_t* manifest_file_size,
bool flush_memtable = true) {
return Status::NotSupported("Not supported operation in read only mode.");
}
using DBImpl::Flush;
virtual Status Flush(const FlushOptions& options,
ColumnFamilyHandle* column_family) {
return Status::NotSupported("Not supported operation in read only mode.");
}
private:
friend class DB;
// No copying allowed
DBImplReadOnly(const DBImplReadOnly&);
void operator=(const DBImplReadOnly&);
};
}

517
db/db_iter.cc Normal file
View File

@@ -0,0 +1,517 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/db_iter.h"
#include <stdexcept>
#include <deque>
#include "db/filename.h"
#include "db/dbformat.h"
#include "rocksdb/env.h"
#include "rocksdb/options.h"
#include "rocksdb/iterator.h"
#include "rocksdb/merge_operator.h"
#include "port/port.h"
#include "util/arena.h"
#include "util/logging.h"
#include "util/mutexlock.h"
#include "util/perf_context_imp.h"
namespace rocksdb {
#if 0
static void DumpInternalIter(Iterator* iter) {
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
ParsedInternalKey k;
if (!ParseInternalKey(iter->key(), &k)) {
fprintf(stderr, "Corrupt '%s'\n", EscapeString(iter->key()).c_str());
} else {
fprintf(stderr, "@ '%s'\n", k.DebugString().c_str());
}
}
}
#endif
// Memtables and sstables that make the DB representation contain
// (userkey,seq,type) => uservalue entries. DBIter
// combines multiple entries for the same userkey found in the DB
// representation into a single entry while accounting for sequence
// numbers, deletion markers, overwrites, etc.
class DBIter: public Iterator {
public:
// The following is grossly complicated. TODO: clean it up
// Which direction is the iterator currently moving?
// (1) When moving forward, the internal iterator is positioned at
// the exact entry that yields this->key(), this->value()
// (2) When moving backwards, the internal iterator is positioned
// just before all entries whose user key == this->key().
enum Direction {
kForward,
kReverse
};
DBIter(Env* env, const Options& options, const Comparator* cmp,
Iterator* iter, SequenceNumber s, bool arena_mode)
: arena_mode_(arena_mode),
env_(env),
logger_(options.info_log.get()),
user_comparator_(cmp),
user_merge_operator_(options.merge_operator.get()),
iter_(iter),
sequence_(s),
direction_(kForward),
valid_(false),
current_entry_is_merged_(false),
statistics_(options.statistics.get()) {
RecordTick(statistics_, NO_ITERATORS, 1);
max_skip_ = options.max_sequential_skip_in_iterations;
}
virtual ~DBIter() {
RecordTick(statistics_, NO_ITERATORS, -1);
if (!arena_mode_) {
delete iter_;
} else {
iter_->~Iterator();
}
}
virtual void SetIter(Iterator* iter) {
assert(iter_ == nullptr);
iter_ = iter;
}
virtual bool Valid() const { return valid_; }
virtual Slice key() const {
assert(valid_);
return saved_key_.GetKey();
}
virtual Slice value() const {
assert(valid_);
return (direction_ == kForward && !current_entry_is_merged_) ?
iter_->value() : saved_value_;
}
virtual Status status() const {
if (status_.ok()) {
return iter_->status();
} else {
return status_;
}
}
virtual void Next();
virtual void Prev();
virtual void Seek(const Slice& target);
virtual void SeekToFirst();
virtual void SeekToLast();
private:
inline void FindNextUserEntry(bool skipping);
void FindNextUserEntryInternal(bool skipping);
void FindPrevUserEntry();
bool ParseKey(ParsedInternalKey* key);
void MergeValuesNewToOld();
inline void ClearSavedValue() {
if (saved_value_.capacity() > 1048576) {
std::string empty;
swap(empty, saved_value_);
} else {
saved_value_.clear();
}
}
bool arena_mode_;
Env* const env_;
Logger* logger_;
const Comparator* const user_comparator_;
const MergeOperator* const user_merge_operator_;
Iterator* iter_;
SequenceNumber const sequence_;
Status status_;
IterKey saved_key_; // == current key when direction_==kReverse
std::string saved_value_; // == current raw value when direction_==kReverse
Direction direction_;
bool valid_;
bool current_entry_is_merged_;
Statistics* statistics_;
uint64_t max_skip_;
// No copying allowed
DBIter(const DBIter&);
void operator=(const DBIter&);
};
inline bool DBIter::ParseKey(ParsedInternalKey* ikey) {
if (!ParseInternalKey(iter_->key(), ikey)) {
status_ = Status::Corruption("corrupted internal key in DBIter");
Log(logger_, "corrupted internal key in DBIter: %s",
iter_->key().ToString(true).c_str());
return false;
} else {
return true;
}
}
void DBIter::Next() {
assert(valid_);
if (direction_ == kReverse) { // Switch directions?
direction_ = kForward;
// iter_ is pointing just before the entries for this->key(),
// so advance into the range of entries for this->key() and then
// use the normal skipping code below.
if (!iter_->Valid()) {
iter_->SeekToFirst();
} else {
iter_->Next();
}
if (!iter_->Valid()) {
valid_ = false;
saved_key_.Clear();
return;
}
}
// If the current value is merged, we might already hit end of iter_
if (!iter_->Valid()) {
valid_ = false;
return;
}
FindNextUserEntry(true /* skipping the current user key */);
}
// PRE: saved_key_ has the current user key if skipping
// POST: saved_key_ should have the next user key if valid_,
// if the current entry is a result of merge
// current_entry_is_merged_ => true
// saved_value_ => the merged value
//
// NOTE: In between, saved_key_ can point to a user key that has
// a delete marker
inline void DBIter::FindNextUserEntry(bool skipping) {
PERF_TIMER_AUTO(find_next_user_entry_time);
FindNextUserEntryInternal(skipping);
PERF_TIMER_STOP(find_next_user_entry_time);
}
// Actual implementation of DBIter::FindNextUserEntry()
void DBIter::FindNextUserEntryInternal(bool skipping) {
// Loop until we hit an acceptable entry to yield
assert(iter_->Valid());
assert(direction_ == kForward);
current_entry_is_merged_ = false;
uint64_t num_skipped = 0;
do {
ParsedInternalKey ikey;
if (ParseKey(&ikey) && ikey.sequence <= sequence_) {
if (skipping &&
user_comparator_->Compare(ikey.user_key, saved_key_.GetKey()) <= 0) {
num_skipped++; // skip this entry
PERF_COUNTER_ADD(internal_key_skipped_count, 1);
} else {
skipping = false;
switch (ikey.type) {
case kTypeDeletion:
// Arrange to skip all upcoming entries for this key since
// they are hidden by this deletion.
saved_key_.SetKey(ikey.user_key);
skipping = true;
num_skipped = 0;
PERF_COUNTER_ADD(internal_delete_skipped_count, 1);
break;
case kTypeValue:
valid_ = true;
saved_key_.SetKey(ikey.user_key);
return;
case kTypeMerge:
// By now, we are sure the current ikey is going to yield a value
saved_key_.SetKey(ikey.user_key);
current_entry_is_merged_ = true;
valid_ = true;
MergeValuesNewToOld(); // Go to a different state machine
return;
default:
assert(false);
break;
}
}
}
// If we have sequentially iterated via numerous keys and still not
// found the next user-key, then it is better to seek so that we can
// avoid too many key comparisons. We seek to the last occurence of
// our current key by looking for sequence number 0.
if (skipping && num_skipped > max_skip_) {
num_skipped = 0;
std::string last_key;
AppendInternalKey(&last_key, ParsedInternalKey(saved_key_.GetKey(), 0,
kValueTypeForSeek));
iter_->Seek(last_key);
RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);
} else {
iter_->Next();
}
} while (iter_->Valid());
valid_ = false;
}
// Merge values of the same user key starting from the current iter_ position
// Scan from the newer entries to older entries.
// PRE: iter_->key() points to the first merge type entry
// saved_key_ stores the user key
// POST: saved_value_ has the merged value for the user key
// iter_ points to the next entry (or invalid)
void DBIter::MergeValuesNewToOld() {
if (!user_merge_operator_) {
Log(logger_, "Options::merge_operator is null.");
throw std::logic_error("DBIter::MergeValuesNewToOld() with"
" Options::merge_operator null");
}
// Start the merge process by pushing the first operand
std::deque<std::string> operands;
operands.push_front(iter_->value().ToString());
std::string merge_result; // Temporary string to hold merge result later
ParsedInternalKey ikey;
for (iter_->Next(); iter_->Valid(); iter_->Next()) {
if (!ParseKey(&ikey)) {
// skip corrupted key
continue;
}
if (user_comparator_->Compare(ikey.user_key, saved_key_.GetKey()) != 0) {
// hit the next user key, stop right here
break;
}
if (kTypeDeletion == ikey.type) {
// hit a delete with the same user key, stop right here
// iter_ is positioned after delete
iter_->Next();
break;
}
if (kTypeValue == ikey.type) {
// hit a put, merge the put value with operands and store the
// final result in saved_value_. We are done!
// ignore corruption if there is any.
const Slice value = iter_->value();
user_merge_operator_->FullMerge(ikey.user_key, &value, operands,
&saved_value_, logger_);
// iter_ is positioned after put
iter_->Next();
return;
}
if (kTypeMerge == ikey.type) {
// hit a merge, add the value as an operand and run associative merge.
// when complete, add result to operands and continue.
const Slice& value = iter_->value();
operands.push_front(value.ToString());
}
}
// we either exhausted all internal keys under this user key, or hit
// a deletion marker.
// feed null as the existing value to the merge operator, such that
// client can differentiate this scenario and do things accordingly.
user_merge_operator_->FullMerge(saved_key_.GetKey(), nullptr, operands,
&saved_value_, logger_);
}
void DBIter::Prev() {
assert(valid_);
// Throw an exception now if merge_operator is provided
// TODO: support backward iteration
if (user_merge_operator_) {
Log(logger_, "Prev not supported yet if merge_operator is provided");
throw std::logic_error("DBIter::Prev backward iteration not supported"
" if merge_operator is provided");
}
if (direction_ == kForward) { // Switch directions?
// iter_ is pointing at the current entry. Scan backwards until
// the key changes so we can use the normal reverse scanning code.
assert(iter_->Valid()); // Otherwise valid_ would have been false
saved_key_.SetKey(ExtractUserKey(iter_->key()));
while (true) {
iter_->Prev();
if (!iter_->Valid()) {
valid_ = false;
saved_key_.Clear();
ClearSavedValue();
return;
}
if (user_comparator_->Compare(ExtractUserKey(iter_->key()),
saved_key_.GetKey()) < 0) {
break;
}
}
direction_ = kReverse;
}
FindPrevUserEntry();
}
void DBIter::FindPrevUserEntry() {
assert(direction_ == kReverse);
uint64_t num_skipped = 0;
ValueType value_type = kTypeDeletion;
bool saved_key_valid = true;
if (iter_->Valid()) {
do {
ParsedInternalKey ikey;
if (ParseKey(&ikey) && ikey.sequence <= sequence_) {
if ((value_type != kTypeDeletion) &&
user_comparator_->Compare(ikey.user_key, saved_key_.GetKey()) < 0) {
// We encountered a non-deleted value in entries for previous keys,
break;
}
value_type = ikey.type;
if (value_type == kTypeDeletion) {
saved_key_.Clear();
ClearSavedValue();
saved_key_valid = false;
} else {
Slice raw_value = iter_->value();
if (saved_value_.capacity() > raw_value.size() + 1048576) {
std::string empty;
swap(empty, saved_value_);
}
saved_key_.SetKey(ExtractUserKey(iter_->key()));
saved_value_.assign(raw_value.data(), raw_value.size());
}
} else {
// In the case of ikey.sequence > sequence_, we might have already
// iterated to a different user key.
saved_key_valid = false;
}
num_skipped++;
// If we have sequentially iterated via numerous keys and still not
// found the prev user-key, then it is better to seek so that we can
// avoid too many key comparisons. We seek to the first occurence of
// our current key by looking for max sequence number.
if (saved_key_valid && num_skipped > max_skip_) {
num_skipped = 0;
std::string last_key;
AppendInternalKey(&last_key, ParsedInternalKey(saved_key_.GetKey(),
kMaxSequenceNumber,
kValueTypeForSeek));
iter_->Seek(last_key);
RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);
} else {
iter_->Prev();
}
} while (iter_->Valid());
}
if (value_type == kTypeDeletion) {
// End
valid_ = false;
saved_key_.Clear();
ClearSavedValue();
direction_ = kForward;
} else {
valid_ = true;
}
}
void DBIter::Seek(const Slice& target) {
saved_key_.Clear();
// now savved_key is used to store internal key.
saved_key_.SetInternalKey(target, sequence_);
PERF_TIMER_AUTO(seek_internal_seek_time);
iter_->Seek(saved_key_.GetKey());
PERF_TIMER_STOP(seek_internal_seek_time);
if (iter_->Valid()) {
direction_ = kForward;
ClearSavedValue();
FindNextUserEntry(false /*not skipping */);
} else {
valid_ = false;
}
}
void DBIter::SeekToFirst() {
direction_ = kForward;
ClearSavedValue();
PERF_TIMER_AUTO(seek_internal_seek_time);
iter_->SeekToFirst();
PERF_TIMER_STOP(seek_internal_seek_time);
if (iter_->Valid()) {
FindNextUserEntry(false /* not skipping */);
} else {
valid_ = false;
}
}
void DBIter::SeekToLast() {
// Throw an exception for now if merge_operator is provided
// TODO: support backward iteration
if (user_merge_operator_) {
Log(logger_, "SeekToLast not supported yet if merge_operator is provided");
throw std::logic_error("DBIter::SeekToLast: backward iteration not"
" supported if merge_operator is provided");
}
direction_ = kReverse;
ClearSavedValue();
PERF_TIMER_AUTO(seek_internal_seek_time);
iter_->SeekToLast();
PERF_TIMER_STOP(seek_internal_seek_time);
FindPrevUserEntry();
}
Iterator* NewDBIterator(Env* env, const Options& options,
const Comparator* user_key_comparator,
Iterator* internal_iter,
const SequenceNumber& sequence) {
return new DBIter(env, options, user_key_comparator, internal_iter, sequence,
false);
}
ArenaWrappedDBIter::~ArenaWrappedDBIter() { db_iter_->~DBIter(); }
void ArenaWrappedDBIter::SetDBIter(DBIter* iter) { db_iter_ = iter; }
void ArenaWrappedDBIter::SetIterUnderDBIter(Iterator* iter) {
static_cast<DBIter*>(db_iter_)->SetIter(iter);
}
inline bool ArenaWrappedDBIter::Valid() const { return db_iter_->Valid(); }
inline void ArenaWrappedDBIter::SeekToFirst() { db_iter_->SeekToFirst(); }
inline void ArenaWrappedDBIter::SeekToLast() { db_iter_->SeekToLast(); }
inline void ArenaWrappedDBIter::Seek(const Slice& target) {
db_iter_->Seek(target);
}
inline void ArenaWrappedDBIter::Next() { db_iter_->Next(); }
inline void ArenaWrappedDBIter::Prev() { db_iter_->Prev(); }
inline Slice ArenaWrappedDBIter::key() const { return db_iter_->key(); }
inline Slice ArenaWrappedDBIter::value() const { return db_iter_->value(); }
inline Status ArenaWrappedDBIter::status() const { return db_iter_->status(); }
void ArenaWrappedDBIter::RegisterCleanup(CleanupFunction function, void* arg1,
void* arg2) {
db_iter_->RegisterCleanup(function, arg1, arg2);
}
ArenaWrappedDBIter* NewArenaWrappedDbIterator(
Env* env, const Options& options, const Comparator* user_key_comparator,
const SequenceNumber& sequence) {
ArenaWrappedDBIter* iter = new ArenaWrappedDBIter();
Arena* arena = iter->GetArena();
auto mem = arena->AllocateAligned(sizeof(DBIter));
DBIter* db_iter = new (mem)
DBIter(env, options, user_key_comparator, nullptr, sequence, true);
iter->SetDBIter(db_iter);
return iter;
}
} // namespace rocksdb

73
db/db_iter.h Normal file
View File

@@ -0,0 +1,73 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#pragma once
#include <stdint.h>
#include "rocksdb/db.h"
#include "db/dbformat.h"
#include "util/arena.h"
#include "util/autovector.h"
namespace rocksdb {
class Arena;
class DBIter;
// Return a new iterator that converts internal keys (yielded by
// "*internal_iter") that were live at the specified "sequence" number
// into appropriate user keys.
extern Iterator* NewDBIterator(
Env* env,
const Options& options,
const Comparator *user_key_comparator,
Iterator* internal_iter,
const SequenceNumber& sequence);
// A wrapper iterator which wraps DB Iterator and the arena, with which the DB
// iterator is supposed be allocated. This class is used as an entry point of
// a iterator hierarchy whose memory can be allocated inline. In that way,
// accessing the iterator tree can be more cache friendly. It is also faster
// to allocate.
class ArenaWrappedDBIter : public Iterator {
public:
virtual ~ArenaWrappedDBIter();
// Get the arena to be used to allocate memory for DBIter to be wrapped,
// as well as child iterators in it.
virtual Arena* GetArena() { return &arena_; }
// Set the DB Iterator to be wrapped
virtual void SetDBIter(DBIter* iter);
// Set the internal iterator wrapped inside the DB Iterator. Usually it is
// a merging iterator.
virtual void SetIterUnderDBIter(Iterator* iter);
virtual bool Valid() const override;
virtual void SeekToFirst() override;
virtual void SeekToLast() override;
virtual void Seek(const Slice& target) override;
virtual void Next() override;
virtual void Prev() override;
virtual Slice key() const override;
virtual Slice value() const override;
virtual Status status() const override;
void RegisterCleanup(CleanupFunction function, void* arg1, void* arg2);
private:
DBIter* db_iter_;
Arena arena_;
};
// Generate the arena wrapped iterator class.
extern ArenaWrappedDBIter* NewArenaWrappedDbIterator(
Env* env, const Options& options, const Comparator* user_key_comparator,
const SequenceNumber& sequence);
} // namespace rocksdb

95
db/db_stats_logger.cc Normal file
View File

@@ -0,0 +1,95 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/db_impl.h"
#include <string>
#include <stdint.h>
#include <stdio.h>
#include "db/version_set.h"
#include "rocksdb/db.h"
#include "rocksdb/env.h"
#include "port/port.h"
#include "util/mutexlock.h"
namespace rocksdb {
void DBImpl::MaybeScheduleLogDBDeployStats() {
// we did say maybe
#ifndef ROCKSDB_LITE
// There is a lock in the actual logger.
if (!logger_ || options_.db_stats_log_interval < 0
|| host_name_.empty()) {
return;
}
if(bg_logstats_scheduled_ || shutting_down_.Acquire_Load()) {
// Already scheduled
} else {
int64_t current_ts = 0;
Status st = env_->GetCurrentTime(&current_ts);
if (!st.ok()) {
return;
}
if ((current_ts - last_log_ts) < options_.db_stats_log_interval) {
return;
}
last_log_ts = current_ts;
bg_logstats_scheduled_ = true;
env_->Schedule(&DBImpl::BGLogDBDeployStats, this);
}
}
void DBImpl::BGLogDBDeployStats(void* db) {
DBImpl* db_inst = reinterpret_cast<DBImpl*>(db);
db_inst->LogDBDeployStats();
}
void DBImpl::LogDBDeployStats() {
mutex_.Lock();
if (shutting_down_.Acquire_Load()) {
bg_logstats_scheduled_ = false;
bg_cv_.SignalAll();
mutex_.Unlock();
return;
}
char tmp_ver[100];
sprintf(tmp_ver, "%d.%d", kMajorVersion, kMinorVersion);
std::string version_info(tmp_ver);
uint64_t file_total_size = 0;
uint32_t file_total_num = 0;
Version* current = default_cf_handle_->cfd()->current();
for (int i = 0; i < current->NumberLevels(); i++) {
file_total_num += current->NumLevelFiles(i);
file_total_size += current->NumLevelBytes(i);
}
Version::LevelSummaryStorage scratch;
const char* file_num_summary = current->LevelSummary(&scratch);
std::string file_num_per_level(file_num_summary);
std::string data_size_per_level(file_num_summary);
mutex_.Unlock();
int64_t unix_ts;
env_->GetCurrentTime(&unix_ts);
logger_->Log_Deploy_Stats(version_info, host_name_,
db_absolute_path_, file_total_size, file_total_num, file_num_per_level,
data_size_per_level, unix_ts);
mutex_.Lock();
bg_logstats_scheduled_ = false;
bg_cv_.SignalAll();
mutex_.Unlock();
#endif
}
}

6852
db/db_test.cc Normal file

File diff suppressed because it is too large Load Diff

169
db/dbformat.cc Normal file
View File

@@ -0,0 +1,169 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/dbformat.h"
#include <stdio.h>
#include "port/port.h"
#include "util/coding.h"
#include "util/perf_context_imp.h"
namespace rocksdb {
uint64_t PackSequenceAndType(uint64_t seq, ValueType t) {
assert(seq <= kMaxSequenceNumber);
assert(t <= kValueTypeForSeek);
return (seq << 8) | t;
}
void AppendInternalKey(std::string* result, const ParsedInternalKey& key) {
result->append(key.user_key.data(), key.user_key.size());
PutFixed64(result, PackSequenceAndType(key.sequence, key.type));
}
std::string ParsedInternalKey::DebugString(bool hex) const {
char buf[50];
snprintf(buf, sizeof(buf), "' @ %llu : %d",
(unsigned long long) sequence,
int(type));
std::string result = "'";
result += user_key.ToString(hex);
result += buf;
return result;
}
std::string InternalKey::DebugString(bool hex) const {
std::string result;
ParsedInternalKey parsed;
if (ParseInternalKey(rep_, &parsed)) {
result = parsed.DebugString(hex);
} else {
result = "(bad)";
result.append(EscapeString(rep_));
}
return result;
}
const char* InternalKeyComparator::Name() const {
return name_.c_str();
}
int InternalKeyComparator::Compare(const Slice& akey, const Slice& bkey) const {
// Order by:
// increasing user key (according to user-supplied comparator)
// decreasing sequence number
// decreasing type (though sequence# should be enough to disambiguate)
int r = user_comparator_->Compare(ExtractUserKey(akey), ExtractUserKey(bkey));
PERF_COUNTER_ADD(user_key_comparison_count, 1);
if (r == 0) {
const uint64_t anum = DecodeFixed64(akey.data() + akey.size() - 8);
const uint64_t bnum = DecodeFixed64(bkey.data() + bkey.size() - 8);
if (anum > bnum) {
r = -1;
} else if (anum < bnum) {
r = +1;
}
}
return r;
}
int InternalKeyComparator::Compare(const ParsedInternalKey& a,
const ParsedInternalKey& b) const {
// Order by:
// increasing user key (according to user-supplied comparator)
// decreasing sequence number
// decreasing type (though sequence# should be enough to disambiguate)
int r = user_comparator_->Compare(a.user_key, b.user_key);
PERF_COUNTER_ADD(user_key_comparison_count, 1);
if (r == 0) {
if (a.sequence > b.sequence) {
r = -1;
} else if (a.sequence < b.sequence) {
r = +1;
} else if (a.type > b.type) {
r = -1;
} else if (a.type < b.type) {
r = +1;
}
}
return r;
}
void InternalKeyComparator::FindShortestSeparator(
std::string* start,
const Slice& limit) const {
// Attempt to shorten the user portion of the key
Slice user_start = ExtractUserKey(*start);
Slice user_limit = ExtractUserKey(limit);
std::string tmp(user_start.data(), user_start.size());
user_comparator_->FindShortestSeparator(&tmp, user_limit);
if (tmp.size() < user_start.size() &&
user_comparator_->Compare(user_start, tmp) < 0) {
// User key has become shorter physically, but larger logically.
// Tack on the earliest possible number to the shortened user key.
PutFixed64(&tmp, PackSequenceAndType(kMaxSequenceNumber,kValueTypeForSeek));
assert(this->Compare(*start, tmp) < 0);
assert(this->Compare(tmp, limit) < 0);
start->swap(tmp);
}
}
void InternalKeyComparator::FindShortSuccessor(std::string* key) const {
Slice user_key = ExtractUserKey(*key);
std::string tmp(user_key.data(), user_key.size());
user_comparator_->FindShortSuccessor(&tmp);
if (tmp.size() < user_key.size() &&
user_comparator_->Compare(user_key, tmp) < 0) {
// User key has become shorter physically, but larger logically.
// Tack on the earliest possible number to the shortened user key.
PutFixed64(&tmp, PackSequenceAndType(kMaxSequenceNumber,kValueTypeForSeek));
assert(this->Compare(*key, tmp) < 0);
key->swap(tmp);
}
}
const char* InternalFilterPolicy::Name() const {
return user_policy_->Name();
}
void InternalFilterPolicy::CreateFilter(const Slice* keys, int n,
std::string* dst) const {
// We rely on the fact that the code in table.cc does not mind us
// adjusting keys[].
Slice* mkey = const_cast<Slice*>(keys);
for (int i = 0; i < n; i++) {
mkey[i] = ExtractUserKey(keys[i]);
// TODO(sanjay): Suppress dups?
}
user_policy_->CreateFilter(keys, n, dst);
}
bool InternalFilterPolicy::KeyMayMatch(const Slice& key, const Slice& f) const {
return user_policy_->KeyMayMatch(ExtractUserKey(key), f);
}
LookupKey::LookupKey(const Slice& user_key, SequenceNumber s) {
size_t usize = user_key.size();
size_t needed = usize + 13; // A conservative estimate
char* dst;
if (needed <= sizeof(space_)) {
dst = space_;
} else {
dst = new char[needed];
}
start_ = dst;
dst = EncodeVarint32(dst, usize + 8);
kstart_ = dst;
memcpy(dst, user_key.data(), usize);
dst += usize;
EncodeFixed64(dst, PackSequenceAndType(s, kValueTypeForSeek));
dst += 8;
end_ = dst;
}
} // namespace rocksdb

345
db/dbformat.h Normal file
View File

@@ -0,0 +1,345 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#pragma once
#include <stdio.h>
#include "rocksdb/comparator.h"
#include "rocksdb/db.h"
#include "rocksdb/filter_policy.h"
#include "rocksdb/slice.h"
#include "rocksdb/slice_transform.h"
#include "rocksdb/table.h"
#include "rocksdb/types.h"
#include "util/coding.h"
#include "util/logging.h"
namespace rocksdb {
class InternalKey;
// Value types encoded as the last component of internal keys.
// DO NOT CHANGE THESE ENUM VALUES: they are embedded in the on-disk
// data structures.
// The highest bit of the value type needs to be reserved to SST tables
// for them to do more flexible encoding.
enum ValueType : unsigned char {
kTypeDeletion = 0x0,
kTypeValue = 0x1,
kTypeMerge = 0x2,
// Following types are used only in write ahead logs. They are not used in
// memtables or sst files:
kTypeLogData = 0x3,
kTypeColumnFamilyDeletion = 0x4,
kTypeColumnFamilyValue = 0x5,
kTypeColumnFamilyMerge = 0x6,
kMaxValue = 0x7F
};
// kValueTypeForSeek defines the ValueType that should be passed when
// constructing a ParsedInternalKey object for seeking to a particular
// sequence number (since we sort sequence numbers in decreasing order
// and the value type is embedded as the low 8 bits in the sequence
// number in internal keys, we need to use the highest-numbered
// ValueType, not the lowest).
static const ValueType kValueTypeForSeek = kTypeMerge;
// We leave eight bits empty at the bottom so a type and sequence#
// can be packed together into 64-bits.
static const SequenceNumber kMaxSequenceNumber =
((0x1ull << 56) - 1);
struct ParsedInternalKey {
Slice user_key;
SequenceNumber sequence;
ValueType type;
ParsedInternalKey() { } // Intentionally left uninitialized (for speed)
ParsedInternalKey(const Slice& u, const SequenceNumber& seq, ValueType t)
: user_key(u), sequence(seq), type(t) { }
std::string DebugString(bool hex = false) const;
};
// Return the length of the encoding of "key".
inline size_t InternalKeyEncodingLength(const ParsedInternalKey& key) {
return key.user_key.size() + 8;
}
extern uint64_t PackSequenceAndType(uint64_t seq, ValueType t);
// Append the serialization of "key" to *result.
extern void AppendInternalKey(std::string* result,
const ParsedInternalKey& key);
// Attempt to parse an internal key from "internal_key". On success,
// stores the parsed data in "*result", and returns true.
//
// On error, returns false, leaves "*result" in an undefined state.
extern bool ParseInternalKey(const Slice& internal_key,
ParsedInternalKey* result);
// Returns the user key portion of an internal key.
inline Slice ExtractUserKey(const Slice& internal_key) {
assert(internal_key.size() >= 8);
return Slice(internal_key.data(), internal_key.size() - 8);
}
inline ValueType ExtractValueType(const Slice& internal_key) {
assert(internal_key.size() >= 8);
const size_t n = internal_key.size();
uint64_t num = DecodeFixed64(internal_key.data() + n - 8);
unsigned char c = num & 0xff;
return static_cast<ValueType>(c);
}
// A comparator for internal keys that uses a specified comparator for
// the user key portion and breaks ties by decreasing sequence number.
class InternalKeyComparator : public Comparator {
private:
const Comparator* user_comparator_;
std::string name_;
public:
explicit InternalKeyComparator(const Comparator* c) : user_comparator_(c),
name_("rocksdb.InternalKeyComparator:" +
std::string(user_comparator_->Name())) {
}
virtual ~InternalKeyComparator() {}
virtual const char* Name() const;
virtual int Compare(const Slice& a, const Slice& b) const;
virtual void FindShortestSeparator(
std::string* start,
const Slice& limit) const;
virtual void FindShortSuccessor(std::string* key) const;
const Comparator* user_comparator() const { return user_comparator_; }
int Compare(const InternalKey& a, const InternalKey& b) const;
int Compare(const ParsedInternalKey& a, const ParsedInternalKey& b) const;
};
// Filter policy wrapper that converts from internal keys to user keys
class InternalFilterPolicy : public FilterPolicy {
private:
const FilterPolicy* const user_policy_;
public:
explicit InternalFilterPolicy(const FilterPolicy* p) : user_policy_(p) { }
virtual const char* Name() const;
virtual void CreateFilter(const Slice* keys, int n, std::string* dst) const;
virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const;
};
// Modules in this directory should keep internal keys wrapped inside
// the following class instead of plain strings so that we do not
// incorrectly use string comparisons instead of an InternalKeyComparator.
class InternalKey {
private:
std::string rep_;
public:
InternalKey() { } // Leave rep_ as empty to indicate it is invalid
InternalKey(const Slice& user_key, SequenceNumber s, ValueType t) {
AppendInternalKey(&rep_, ParsedInternalKey(user_key, s, t));
}
bool Valid() const {
ParsedInternalKey parsed;
return ParseInternalKey(Slice(rep_), &parsed);
}
void DecodeFrom(const Slice& s) { rep_.assign(s.data(), s.size()); }
Slice Encode() const {
assert(!rep_.empty());
return rep_;
}
Slice user_key() const { return ExtractUserKey(rep_); }
void SetFrom(const ParsedInternalKey& p) {
rep_.clear();
AppendInternalKey(&rep_, p);
}
void Clear() { rep_.clear(); }
std::string DebugString(bool hex = false) const;
};
inline int InternalKeyComparator::Compare(
const InternalKey& a, const InternalKey& b) const {
return Compare(a.Encode(), b.Encode());
}
inline bool ParseInternalKey(const Slice& internal_key,
ParsedInternalKey* result) {
const size_t n = internal_key.size();
if (n < 8) return false;
uint64_t num = DecodeFixed64(internal_key.data() + n - 8);
unsigned char c = num & 0xff;
result->sequence = num >> 8;
result->type = static_cast<ValueType>(c);
assert(result->type <= ValueType::kMaxValue);
result->user_key = Slice(internal_key.data(), n - 8);
return (c <= static_cast<unsigned char>(kValueTypeForSeek));
}
// Update the sequence number in the internal key
inline void UpdateInternalKey(char* internal_key,
const size_t internal_key_size,
uint64_t seq, ValueType t) {
assert(internal_key_size >= 8);
char* seqtype = internal_key + internal_key_size - 8;
uint64_t newval = (seq << 8) | t;
EncodeFixed64(seqtype, newval);
}
// Get the sequence number from the internal key
inline uint64_t GetInternalKeySeqno(const Slice& internal_key) {
const size_t n = internal_key.size();
assert(n >= 8);
uint64_t num = DecodeFixed64(internal_key.data() + n - 8);
return num >> 8;
}
// A helper class useful for DBImpl::Get()
class LookupKey {
public:
// Initialize *this for looking up user_key at a snapshot with
// the specified sequence number.
LookupKey(const Slice& user_key, SequenceNumber sequence);
~LookupKey();
// Return a key suitable for lookup in a MemTable.
Slice memtable_key() const { return Slice(start_, end_ - start_); }
// Return an internal key (suitable for passing to an internal iterator)
Slice internal_key() const { return Slice(kstart_, end_ - kstart_); }
// Return the user key
Slice user_key() const { return Slice(kstart_, end_ - kstart_ - 8); }
private:
// We construct a char array of the form:
// klength varint32 <-- start_
// userkey char[klength] <-- kstart_
// tag uint64
// <-- end_
// The array is a suitable MemTable key.
// The suffix starting with "userkey" can be used as an InternalKey.
const char* start_;
const char* kstart_;
const char* end_;
char space_[200]; // Avoid allocation for short keys
// No copying allowed
LookupKey(const LookupKey&);
void operator=(const LookupKey&);
};
inline LookupKey::~LookupKey() {
if (start_ != space_) delete[] start_;
}
class IterKey {
public:
IterKey() : key_(space_), buf_size_(sizeof(space_)), key_size_(0) {}
~IterKey() { ResetBuffer(); }
Slice GetKey() const { return Slice(key_, key_size_); }
void Clear() { key_size_ = 0; }
void SetKey(const Slice& key) {
size_t size = key.size();
EnlargeBufferIfNeeded(size);
memcpy(key_, key.data(), size);
key_size_ = size;
}
void SetInternalKey(const Slice& user_key, SequenceNumber s,
ValueType value_type = kValueTypeForSeek) {
size_t usize = user_key.size();
EnlargeBufferIfNeeded(usize + sizeof(uint64_t));
memcpy(key_, user_key.data(), usize);
EncodeFixed64(key_ + usize, PackSequenceAndType(s, value_type));
key_size_ = usize + sizeof(uint64_t);
}
void SetInternalKey(const ParsedInternalKey& parsed_key) {
SetInternalKey(parsed_key.user_key, parsed_key.sequence, parsed_key.type);
}
private:
char* key_;
size_t buf_size_;
size_t key_size_;
char space_[32]; // Avoid allocation for short keys
void ResetBuffer() {
if (key_ != nullptr && key_ != space_) {
delete[] key_;
}
key_ = space_;
buf_size_ = sizeof(space_);
key_size_ = 0;
}
// Enlarge the buffer size if needed based on key_size.
// By default, static allocated buffer is used. Once there is a key
// larger than the static allocated buffer, another buffer is dynamically
// allocated, until a larger key buffer is requested. In that case, we
// reallocate buffer and delete the old one.
void EnlargeBufferIfNeeded(size_t key_size) {
// If size is smaller than buffer size, continue using current buffer,
// or the static allocated one, as default
if (key_size > buf_size_) {
// Need to enlarge the buffer.
ResetBuffer();
key_ = new char[key_size];
buf_size_ = key_size;
}
}
// No copying allowed
IterKey(const IterKey&) = delete;
void operator=(const IterKey&) = delete;
};
class InternalKeySliceTransform : public SliceTransform {
public:
explicit InternalKeySliceTransform(const SliceTransform* transform)
: transform_(transform) {}
virtual const char* Name() const { return transform_->Name(); }
virtual Slice Transform(const Slice& src) const {
auto user_key = ExtractUserKey(src);
return transform_->Transform(user_key);
}
virtual bool InDomain(const Slice& src) const {
auto user_key = ExtractUserKey(src);
return transform_->InDomain(user_key);
}
virtual bool InRange(const Slice& dst) const {
auto user_key = ExtractUserKey(dst);
return transform_->InRange(user_key);
}
const SliceTransform* user_prefix_extractor() const { return transform_; }
private:
// Like comparator, InternalKeySliceTransform will not take care of the
// deletion of transform_
const SliceTransform* const transform_;
};
} // namespace rocksdb

117
db/dbformat_test.cc Normal file
View File

@@ -0,0 +1,117 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/dbformat.h"
#include "util/logging.h"
#include "util/testharness.h"
namespace rocksdb {
static std::string IKey(const std::string& user_key,
uint64_t seq,
ValueType vt) {
std::string encoded;
AppendInternalKey(&encoded, ParsedInternalKey(user_key, seq, vt));
return encoded;
}
static std::string Shorten(const std::string& s, const std::string& l) {
std::string result = s;
InternalKeyComparator(BytewiseComparator()).FindShortestSeparator(&result, l);
return result;
}
static std::string ShortSuccessor(const std::string& s) {
std::string result = s;
InternalKeyComparator(BytewiseComparator()).FindShortSuccessor(&result);
return result;
}
static void TestKey(const std::string& key,
uint64_t seq,
ValueType vt) {
std::string encoded = IKey(key, seq, vt);
Slice in(encoded);
ParsedInternalKey decoded("", 0, kTypeValue);
ASSERT_TRUE(ParseInternalKey(in, &decoded));
ASSERT_EQ(key, decoded.user_key.ToString());
ASSERT_EQ(seq, decoded.sequence);
ASSERT_EQ(vt, decoded.type);
ASSERT_TRUE(!ParseInternalKey(Slice("bar"), &decoded));
}
class FormatTest { };
TEST(FormatTest, InternalKey_EncodeDecode) {
const char* keys[] = { "", "k", "hello", "longggggggggggggggggggggg" };
const uint64_t seq[] = {
1, 2, 3,
(1ull << 8) - 1, 1ull << 8, (1ull << 8) + 1,
(1ull << 16) - 1, 1ull << 16, (1ull << 16) + 1,
(1ull << 32) - 1, 1ull << 32, (1ull << 32) + 1
};
for (unsigned int k = 0; k < sizeof(keys) / sizeof(keys[0]); k++) {
for (unsigned int s = 0; s < sizeof(seq) / sizeof(seq[0]); s++) {
TestKey(keys[k], seq[s], kTypeValue);
TestKey("hello", 1, kTypeDeletion);
}
}
}
TEST(FormatTest, InternalKeyShortSeparator) {
// When user keys are same
ASSERT_EQ(IKey("foo", 100, kTypeValue),
Shorten(IKey("foo", 100, kTypeValue),
IKey("foo", 99, kTypeValue)));
ASSERT_EQ(IKey("foo", 100, kTypeValue),
Shorten(IKey("foo", 100, kTypeValue),
IKey("foo", 101, kTypeValue)));
ASSERT_EQ(IKey("foo", 100, kTypeValue),
Shorten(IKey("foo", 100, kTypeValue),
IKey("foo", 100, kTypeValue)));
ASSERT_EQ(IKey("foo", 100, kTypeValue),
Shorten(IKey("foo", 100, kTypeValue),
IKey("foo", 100, kTypeDeletion)));
// When user keys are misordered
ASSERT_EQ(IKey("foo", 100, kTypeValue),
Shorten(IKey("foo", 100, kTypeValue),
IKey("bar", 99, kTypeValue)));
// When user keys are different, but correctly ordered
ASSERT_EQ(IKey("g", kMaxSequenceNumber, kValueTypeForSeek),
Shorten(IKey("foo", 100, kTypeValue),
IKey("hello", 200, kTypeValue)));
// When start user key is prefix of limit user key
ASSERT_EQ(IKey("foo", 100, kTypeValue),
Shorten(IKey("foo", 100, kTypeValue),
IKey("foobar", 200, kTypeValue)));
// When limit user key is prefix of start user key
ASSERT_EQ(IKey("foobar", 100, kTypeValue),
Shorten(IKey("foobar", 100, kTypeValue),
IKey("foo", 200, kTypeValue)));
}
TEST(FormatTest, InternalKeyShortestSuccessor) {
ASSERT_EQ(IKey("g", kMaxSequenceNumber, kValueTypeForSeek),
ShortSuccessor(IKey("foo", 100, kTypeValue)));
ASSERT_EQ(IKey("\xff\xff", 100, kTypeValue),
ShortSuccessor(IKey("\xff\xff", 100, kTypeValue)));
}
} // namespace rocksdb
int main(int argc, char** argv) {
return rocksdb::test::RunAllTests();
}

295
db/deletefile_test.cc Normal file
View File

@@ -0,0 +1,295 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "rocksdb/db.h"
#include "db/db_impl.h"
#include "db/filename.h"
#include "db/version_set.h"
#include "db/write_batch_internal.h"
#include "util/testharness.h"
#include "util/testutil.h"
#include "rocksdb/env.h"
#include "rocksdb/transaction_log.h"
#include <vector>
#include <stdlib.h>
#include <map>
#include <string>
namespace rocksdb {
class DeleteFileTest {
public:
std::string dbname_;
Options options_;
DB* db_;
Env* env_;
int numlevels_;
DeleteFileTest() {
db_ = nullptr;
env_ = Env::Default();
options_.write_buffer_size = 1024*1024*1000;
options_.target_file_size_base = 1024*1024*1000;
options_.max_bytes_for_level_base = 1024*1024*1000;
options_.WAL_ttl_seconds = 300; // Used to test log files
options_.WAL_size_limit_MB = 1024; // Used to test log files
dbname_ = test::TmpDir() + "/deletefile_test";
options_.wal_dir = dbname_ + "/wal_files";
// clean up all the files that might have been there before
std::vector<std::string> old_files;
env_->GetChildren(dbname_, &old_files);
for (auto file : old_files) {
env_->DeleteFile(dbname_ + "/" + file);
}
env_->GetChildren(options_.wal_dir, &old_files);
for (auto file : old_files) {
env_->DeleteFile(options_.wal_dir + "/" + file);
}
DestroyDB(dbname_, options_);
numlevels_ = 7;
ASSERT_OK(ReopenDB(true));
}
Status ReopenDB(bool create) {
delete db_;
if (create) {
DestroyDB(dbname_, options_);
}
db_ = nullptr;
options_.create_if_missing = create;
return DB::Open(options_, dbname_, &db_);
}
void CloseDB() {
delete db_;
}
void AddKeys(int numkeys, int startkey = 0) {
WriteOptions options;
options.sync = false;
ReadOptions roptions;
for (int i = startkey; i < (numkeys + startkey) ; i++) {
std::string temp = std::to_string(i);
Slice key(temp);
Slice value(temp);
ASSERT_OK(db_->Put(options, key, value));
}
}
int numKeysInLevels(
std::vector<LiveFileMetaData> &metadata,
std::vector<int> *keysperlevel = nullptr) {
if (keysperlevel != nullptr) {
keysperlevel->resize(numlevels_);
}
int numKeys = 0;
for (size_t i = 0; i < metadata.size(); i++) {
int startkey = atoi(metadata[i].smallestkey.c_str());
int endkey = atoi(metadata[i].largestkey.c_str());
int numkeysinfile = (endkey - startkey + 1);
numKeys += numkeysinfile;
if (keysperlevel != nullptr) {
(*keysperlevel)[(int)metadata[i].level] += numkeysinfile;
}
fprintf(stderr, "level %d name %s smallest %s largest %s\n",
metadata[i].level, metadata[i].name.c_str(),
metadata[i].smallestkey.c_str(),
metadata[i].largestkey.c_str());
}
return numKeys;
}
void CreateTwoLevels() {
AddKeys(50000, 10000);
DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
ASSERT_OK(dbi->TEST_FlushMemTable());
ASSERT_OK(dbi->TEST_WaitForFlushMemTable());
AddKeys(50000, 10000);
ASSERT_OK(dbi->TEST_FlushMemTable());
ASSERT_OK(dbi->TEST_WaitForFlushMemTable());
}
void CheckFileTypeCounts(std::string& dir,
int required_log,
int required_sst,
int required_manifest) {
std::vector<std::string> filenames;
env_->GetChildren(dir, &filenames);
int log_cnt = 0, sst_cnt = 0, manifest_cnt = 0;
for (auto file : filenames) {
uint64_t number;
FileType type;
if (ParseFileName(file, &number, &type)) {
log_cnt += (type == kLogFile);
sst_cnt += (type == kTableFile);
manifest_cnt += (type == kDescriptorFile);
}
}
ASSERT_EQ(required_log, log_cnt);
ASSERT_EQ(required_sst, sst_cnt);
ASSERT_EQ(required_manifest, manifest_cnt);
}
};
TEST(DeleteFileTest, AddKeysAndQueryLevels) {
CreateTwoLevels();
std::vector<LiveFileMetaData> metadata;
std::vector<int> keysinlevel;
db_->GetLiveFilesMetaData(&metadata);
std::string level1file = "";
int level1keycount = 0;
std::string level2file = "";
int level2keycount = 0;
int level1index = 0;
int level2index = 1;
ASSERT_EQ((int)metadata.size(), 2);
if (metadata[0].level == 2) {
level1index = 1;
level2index = 0;
}
level1file = metadata[level1index].name;
int startkey = atoi(metadata[level1index].smallestkey.c_str());
int endkey = atoi(metadata[level1index].largestkey.c_str());
level1keycount = (endkey - startkey + 1);
level2file = metadata[level2index].name;
startkey = atoi(metadata[level2index].smallestkey.c_str());
endkey = atoi(metadata[level2index].largestkey.c_str());
level2keycount = (endkey - startkey + 1);
// COntrolled setup. Levels 1 and 2 should both have 50K files.
// This is a little fragile as it depends on the current
// compaction heuristics.
ASSERT_EQ(level1keycount, 50000);
ASSERT_EQ(level2keycount, 50000);
Status status = db_->DeleteFile("0.sst");
ASSERT_TRUE(status.IsInvalidArgument());
// intermediate level files cannot be deleted.
status = db_->DeleteFile(level1file);
ASSERT_TRUE(status.IsInvalidArgument());
// Lowest level file deletion should succeed.
ASSERT_OK(db_->DeleteFile(level2file));
CloseDB();
}
TEST(DeleteFileTest, PurgeObsoleteFilesTest) {
CreateTwoLevels();
// there should be only one (empty) log file because CreateTwoLevels()
// flushes the memtables to disk
CheckFileTypeCounts(options_.wal_dir, 1, 0, 0);
// 2 ssts, 1 manifest
CheckFileTypeCounts(dbname_, 0, 2, 1);
std::string first("0"), last("999999");
Slice first_slice(first), last_slice(last);
db_->CompactRange(&first_slice, &last_slice, true, 2);
// 1 sst after compaction
CheckFileTypeCounts(dbname_, 0, 1, 1);
// this time, we keep an iterator alive
ReopenDB(true);
Iterator *itr = 0;
CreateTwoLevels();
itr = db_->NewIterator(ReadOptions());
db_->CompactRange(&first_slice, &last_slice, true, 2);
// 3 sst after compaction with live iterator
CheckFileTypeCounts(dbname_, 0, 3, 1);
delete itr;
// 1 sst after iterator deletion
CheckFileTypeCounts(dbname_, 0, 1, 1);
CloseDB();
}
TEST(DeleteFileTest, DeleteFileWithIterator) {
CreateTwoLevels();
ReadOptions options;
Iterator* it = db_->NewIterator(options);
std::vector<LiveFileMetaData> metadata;
db_->GetLiveFilesMetaData(&metadata);
std::string level2file = "";
ASSERT_EQ((int)metadata.size(), 2);
if (metadata[0].level == 1) {
level2file = metadata[1].name;
} else {
level2file = metadata[0].name;
}
Status status = db_->DeleteFile(level2file);
fprintf(stdout, "Deletion status %s: %s\n",
level2file.c_str(), status.ToString().c_str());
ASSERT_TRUE(status.ok());
it->SeekToFirst();
int numKeysIterated = 0;
while(it->Valid()) {
numKeysIterated++;
it->Next();
}
ASSERT_EQ(numKeysIterated, 50000);
delete it;
CloseDB();
}
TEST(DeleteFileTest, DeleteLogFiles) {
AddKeys(10, 0);
VectorLogPtr logfiles;
db_->GetSortedWalFiles(logfiles);
ASSERT_GT(logfiles.size(), 0UL);
// Take the last log file which is expected to be alive and try to delete it
// Should not succeed because live logs are not allowed to be deleted
std::unique_ptr<LogFile> alive_log = std::move(logfiles.back());
ASSERT_EQ(alive_log->Type(), kAliveLogFile);
ASSERT_TRUE(env_->FileExists(options_.wal_dir + "/" + alive_log->PathName()));
fprintf(stdout, "Deleting alive log file %s\n",
alive_log->PathName().c_str());
ASSERT_TRUE(!db_->DeleteFile(alive_log->PathName()).ok());
ASSERT_TRUE(env_->FileExists(options_.wal_dir + "/" + alive_log->PathName()));
logfiles.clear();
// Call Flush to bring about a new working log file and add more keys
// Call Flush again to flush out memtable and move alive log to archived log
// and try to delete the archived log file
FlushOptions fopts;
db_->Flush(fopts);
AddKeys(10, 0);
db_->Flush(fopts);
db_->GetSortedWalFiles(logfiles);
ASSERT_GT(logfiles.size(), 0UL);
std::unique_ptr<LogFile> archived_log = std::move(logfiles.front());
ASSERT_EQ(archived_log->Type(), kArchivedLogFile);
ASSERT_TRUE(env_->FileExists(options_.wal_dir + "/" +
archived_log->PathName()));
fprintf(stdout, "Deleting archived log file %s\n",
archived_log->PathName().c_str());
ASSERT_OK(db_->DeleteFile(archived_log->PathName()));
ASSERT_TRUE(!env_->FileExists(options_.wal_dir + "/" +
archived_log->PathName()));
CloseDB();
}
} //namespace rocksdb
int main(int argc, char** argv) {
return rocksdb::test::RunAllTests();
}

202
db/file_indexer.cc Normal file
View File

@@ -0,0 +1,202 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/file_indexer.h"
#include <algorithm>
#include "rocksdb/comparator.h"
#include "db/version_edit.h"
namespace rocksdb {
FileIndexer::FileIndexer(const uint32_t num_levels,
const Comparator* ucmp)
: num_levels_(num_levels),
ucmp_(ucmp),
next_level_index_(num_levels),
level_rb_(num_levels, -1) {
}
uint32_t FileIndexer::NumLevelIndex() {
return next_level_index_.size();
}
uint32_t FileIndexer::LevelIndexSize(uint32_t level) {
return next_level_index_[level].size();
}
void FileIndexer::GetNextLevelIndex(
const uint32_t level, const uint32_t file_index, const int cmp_smallest,
const int cmp_largest, int32_t* left_bound, int32_t* right_bound) {
assert(level > 0);
// Last level, no hint
if (level == num_levels_ - 1) {
*left_bound = 0;
*right_bound = -1;
return;
}
assert(level < num_levels_ - 1);
assert(static_cast<int32_t>(file_index) <= level_rb_[level]);
const auto& index = next_level_index_[level][file_index];
if (cmp_smallest < 0) {
*left_bound = (level > 0 && file_index > 0) ?
next_level_index_[level][file_index - 1].largest_lb : 0;
*right_bound = index.smallest_rb;
} else if (cmp_smallest == 0) {
*left_bound = index.smallest_lb;
*right_bound = index.smallest_rb;
} else if (cmp_smallest > 0 && cmp_largest < 0) {
*left_bound = index.smallest_lb;
*right_bound = index.largest_rb;
} else if (cmp_largest == 0) {
*left_bound = index.largest_lb;
*right_bound = index.largest_rb;
} else if (cmp_largest > 0) {
*left_bound = index.largest_lb;
*right_bound = level_rb_[level + 1];
} else {
assert(false);
}
assert(*left_bound >= 0);
assert(*left_bound <= *right_bound + 1);
assert(*right_bound <= level_rb_[level + 1]);
}
void FileIndexer::ClearIndex() {
for (uint32_t level = 1; level < num_levels_; ++level) {
next_level_index_[level].clear();
}
}
void FileIndexer::UpdateIndex(std::vector<FileMetaData*>* const files) {
if (files == nullptr) {
return;
}
// L1 - Ln-1
for (uint32_t level = 1; level < num_levels_ - 1; ++level) {
const auto& upper_files = files[level];
const int32_t upper_size = upper_files.size();
const auto& lower_files = files[level + 1];
level_rb_[level] = upper_files.size() - 1;
if (upper_size == 0) {
continue;
}
auto& index = next_level_index_[level];
index.resize(upper_size);
CalculateLB(upper_files, lower_files, &index,
[this](const FileMetaData* a, const FileMetaData* b) -> int {
return ucmp_->Compare(a->smallest.user_key(), b->largest.user_key());
},
[](IndexUnit* index, int32_t f_idx) {
index->smallest_lb = f_idx;
});
CalculateLB(upper_files, lower_files, &index,
[this](const FileMetaData* a, const FileMetaData* b) -> int {
return ucmp_->Compare(a->largest.user_key(), b->largest.user_key());
},
[](IndexUnit* index, int32_t f_idx) {
index->largest_lb = f_idx;
});
CalculateRB(upper_files, lower_files, &index,
[this](const FileMetaData* a, const FileMetaData* b) -> int {
return ucmp_->Compare(a->smallest.user_key(), b->smallest.user_key());
},
[](IndexUnit* index, int32_t f_idx) {
index->smallest_rb = f_idx;
});
CalculateRB(upper_files, lower_files, &index,
[this](const FileMetaData* a, const FileMetaData* b) -> int {
return ucmp_->Compare(a->largest.user_key(), b->smallest.user_key());
},
[](IndexUnit* index, int32_t f_idx) {
index->largest_rb = f_idx;
});
}
level_rb_[num_levels_ - 1] = files[num_levels_ - 1].size() - 1;
}
void FileIndexer::CalculateLB(const std::vector<FileMetaData*>& upper_files,
const std::vector<FileMetaData*>& lower_files,
std::vector<IndexUnit>* index,
std::function<int(const FileMetaData*, const FileMetaData*)> cmp_op,
std::function<void(IndexUnit*, int32_t)> set_index) {
const int32_t upper_size = upper_files.size();
const int32_t lower_size = lower_files.size();
int32_t upper_idx = 0;
int32_t lower_idx = 0;
while (upper_idx < upper_size && lower_idx < lower_size) {
int cmp = cmp_op(upper_files[upper_idx], lower_files[lower_idx]);
if (cmp == 0) {
set_index(&(*index)[upper_idx], lower_idx);
++upper_idx;
++lower_idx;
} else if (cmp > 0) {
// Lower level's file (largest) is smaller, a key won't hit in that
// file. Move to next lower file
++lower_idx;
} else {
// Lower level's file becomes larger, update the index, and
// move to the next upper file
set_index(&(*index)[upper_idx], lower_idx);
++upper_idx;
}
}
while (upper_idx < upper_size) {
// Lower files are exhausted, that means the remaining upper files are
// greater than any lower files. Set the index to be the lower level size.
set_index(&(*index)[upper_idx], lower_size);
++upper_idx;
}
}
void FileIndexer::CalculateRB(const std::vector<FileMetaData*>& upper_files,
const std::vector<FileMetaData*>& lower_files,
std::vector<IndexUnit>* index,
std::function<int(const FileMetaData*, const FileMetaData*)> cmp_op,
std::function<void(IndexUnit*, int32_t)> set_index) {
const int32_t upper_size = upper_files.size();
const int32_t lower_size = lower_files.size();
int32_t upper_idx = upper_size - 1;
int32_t lower_idx = lower_size - 1;
while (upper_idx >= 0 && lower_idx >= 0) {
int cmp = cmp_op(upper_files[upper_idx], lower_files[lower_idx]);
if (cmp == 0) {
set_index(&(*index)[upper_idx], lower_idx);
--upper_idx;
--lower_idx;
} else if (cmp < 0) {
// Lower level's file (smallest) is larger, a key won't hit in that
// file. Move to next lower file.
--lower_idx;
} else {
// Lower level's file becomes smaller, update the index, and move to
// the next the upper file
set_index(&(*index)[upper_idx], lower_idx);
--upper_idx;
}
}
while (upper_idx >= 0) {
// Lower files are exhausted, that means the remaining upper files are
// smaller than any lower files. Set it to -1.
set_index(&(*index)[upper_idx], -1);
--upper_idx;
}
}
} // namespace rocksdb

129
db/file_indexer.h Normal file
View File

@@ -0,0 +1,129 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#pragma once
#include <cstdint>
#include <functional>
#include <limits>
#include <vector>
namespace rocksdb {
class Comparator;
struct FileMetaData;
// The file tree structure in Version is prebuilt and the range of each file
// is known. On Version::Get(), it uses binary search to find a potential file
// and then check if a target key can be found in the file by comparing the key
// to each file's smallest and largest key. The results of these comparisions
// can be reused beyond checking if a key falls into a file's range.
// With some pre-calculated knowledge, each key comparision that has been done
// can serve as a hint to narrow down further searches: if a key compared to
// be smaller than a file's smallest or largest, that comparison can be used
// to find out the right bound of next binary search. Similarly, if a key
// compared to be larger than a file's smallest or largest, it can be utilized
// to find out the left bound of next binary search.
// With these hints: it can greatly reduce the range of binary search,
// especially for bottom levels, given that one file most likely overlaps with
// only N files from level below (where N is max_bytes_for_level_multiplier).
// So on level L, we will only look at ~N files instead of N^L files on the
// naive approach.
class FileIndexer {
public:
FileIndexer(const uint32_t num_levels, const Comparator* ucmp);
uint32_t NumLevelIndex();
uint32_t LevelIndexSize(uint32_t level);
// Return a file index range in the next level to search for a key based on
// smallest and largest key comparision for the current file specified by
// level and file_index. When *left_index < *right_index, both index should
// be valid and fit in the vector size.
void GetNextLevelIndex(
const uint32_t level, const uint32_t file_index, const int cmp_smallest,
const int cmp_largest, int32_t* left_bound, int32_t* right_bound);
void ClearIndex();
void UpdateIndex(std::vector<FileMetaData*>* const files);
enum {
kLevelMaxIndex = std::numeric_limits<int32_t>::max()
};
private:
const uint32_t num_levels_;
const Comparator* ucmp_;
struct IndexUnit {
IndexUnit()
: smallest_lb(0), largest_lb(0), smallest_rb(-1), largest_rb(-1) {}
// During file search, a key is compared against smallest and largest
// from a FileMetaData. It can have 3 possible outcomes:
// (1) key is smaller than smallest, implying it is also smaller than
// larger. Precalculated index based on "smallest < smallest" can
// be used to provide right bound.
// (2) key is in between smallest and largest.
// Precalculated index based on "smallest > greatest" can be used to
// provide left bound.
// Precalculated index based on "largest < smallest" can be used to
// provide right bound.
// (3) key is larger than largest, implying it is also larger than smallest.
// Precalculated index based on "largest > largest" can be used to
// provide left bound.
//
// As a result, we will need to do:
// Compare smallest (<=) and largest keys from upper level file with
// smallest key from lower level to get a right bound.
// Compare smallest (>=) and largest keys from upper level file with
// largest key from lower level to get a left bound.
//
// Example:
// level 1: [50 - 60]
// level 2: [1 - 40], [45 - 55], [58 - 80]
// A key 35, compared to be less than 50, 3rd file on level 2 can be
// skipped according to rule (1). LB = 0, RB = 1.
// A key 53, sits in the middle 50 and 60. 1st file on level 2 can be
// skipped according to rule (2)-a, but the 3rd file cannot be skipped
// because 60 is greater than 58. LB = 1, RB = 2.
// A key 70, compared to be larger than 60. 1st and 2nd file can be skipped
// according to rule (3). LB = 2, RB = 2.
//
// Point to a left most file in a lower level that may contain a key,
// which compares greater than smallest of a FileMetaData (upper level)
int32_t smallest_lb;
// Point to a left most file in a lower level that may contain a key,
// which compares greater than largest of a FileMetaData (upper level)
int32_t largest_lb;
// Point to a right most file in a lower level that may contain a key,
// which compares smaller than smallest of a FileMetaData (upper level)
int32_t smallest_rb;
// Point to a right most file in a lower level that may contain a key,
// which compares smaller than largest of a FileMetaData (upper level)
int32_t largest_rb;
};
void CalculateLB(const std::vector<FileMetaData*>& upper_files,
const std::vector<FileMetaData*>& lower_files,
std::vector<IndexUnit>* index,
std::function<int(const FileMetaData*, const FileMetaData*)> cmp_op,
std::function<void(IndexUnit*, int32_t)> set_index);
void CalculateRB(const std::vector<FileMetaData*>& upper_files,
const std::vector<FileMetaData*>& lower_files,
std::vector<IndexUnit>* index,
std::function<int(const FileMetaData*, const FileMetaData*)> cmp_op,
std::function<void(IndexUnit*, int32_t)> set_index);
std::vector<std::vector<IndexUnit>> next_level_index_;
std::vector<int32_t> level_rb_;
};
} // namespace rocksdb

330
db/file_indexer_test.cc Normal file
View File

@@ -0,0 +1,330 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include <string>
#include "db/file_indexer.h"
#include "db/dbformat.h"
#include "db/version_edit.h"
#include "rocksdb/comparator.h"
#include "util/testharness.h"
#include "util/testutil.h"
namespace rocksdb {
class IntComparator : public Comparator {
public:
int Compare(const Slice& a, const Slice& b) const {
assert(a.size() == 8);
assert(b.size() == 8);
return *reinterpret_cast<const int64_t*>(a.data()) -
*reinterpret_cast<const int64_t*>(b.data());
}
const char* Name() const {
return "IntComparator";
}
void FindShortestSeparator(std::string* start, const Slice& limit) const {}
void FindShortSuccessor(std::string* key) const {}
};
struct FileIndexerTest {
public:
FileIndexerTest() :
kNumLevels(4), indexer(kNumLevels, &ucmp),
files(new std::vector<FileMetaData*>[kNumLevels]) {
}
~FileIndexerTest() {
Reset();
delete[] files;
}
void AddFile(int level, int64_t smallest, int64_t largest) {
auto* f = new FileMetaData();
f->smallest = IntKey(smallest);
f->largest = IntKey(largest);
files[level].push_back(f);
}
InternalKey IntKey(int64_t v) {
return InternalKey(Slice(reinterpret_cast<char*>(&v), 8), 0, kTypeValue);
}
void Reset() {
for (uint32_t i = 0; i < kNumLevels; ++i) {
for (auto* f : files[i]) {
delete f;
}
files[i].clear();
}
indexer.ClearIndex();
}
void GetNextLevelIndex(const uint32_t level, const uint32_t file_index,
const int cmp_smallest, const int cmp_largest, int32_t* left_index,
int32_t* right_index) {
*left_index = 100;
*right_index = 100;
indexer.GetNextLevelIndex(level, file_index, cmp_smallest, cmp_largest,
left_index, right_index);
}
const uint32_t kNumLevels;
IntComparator ucmp;
FileIndexer indexer;
std::vector<FileMetaData*>* files;
};
TEST(FileIndexerTest, next_level_hint) {
for (uint32_t i = 0; i < kNumLevels; ++i) {
ASSERT_EQ(0U, indexer.LevelIndexSize(i));
}
// Case 1: no overlap, files are on the left of next level files
// level 1
AddFile(1, 100, 200);
AddFile(1, 300, 400);
AddFile(1, 500, 600);
// level 2
AddFile(2, 1500, 1600);
AddFile(2, 1601, 1699);
AddFile(2, 1700, 1800);
// level 3
AddFile(3, 2500, 2600);
AddFile(3, 2601, 2699);
AddFile(3, 2700, 2800);
indexer.UpdateIndex(files);
int32_t left = 100;
int32_t right = 100;
for (uint32_t level = 1; level < 3; ++level) {
for (uint32_t f = 0; f < 3; ++f) {
GetNextLevelIndex(level, f, -1, -1, &left, &right);
ASSERT_EQ(0, left);
ASSERT_EQ(-1, right);
GetNextLevelIndex(level, f, 0, -1, &left, &right);
ASSERT_EQ(0, left);
ASSERT_EQ(-1, right);
GetNextLevelIndex(level, f, 1, -1, &left, &right);
ASSERT_EQ(0, left);
ASSERT_EQ(-1, right);
GetNextLevelIndex(level, f, 1, 0, &left, &right);
ASSERT_EQ(0, left);
ASSERT_EQ(-1, right);
GetNextLevelIndex(level, f, 1, 1, &left, &right);
ASSERT_EQ(0, left);
ASSERT_EQ(2, right);
}
}
// Case 2: no overlap, files are on the right of next level files
Reset();
for (uint32_t i = 1; i < kNumLevels; ++i) {
ASSERT_EQ(0U, indexer.LevelIndexSize(i));
}
// level 1
AddFile(1, 2100, 2200);
AddFile(1, 2300, 2400);
AddFile(1, 2500, 2600);
// level 2
AddFile(2, 1500, 1600);
AddFile(2, 1501, 1699);
AddFile(2, 1700, 1800);
// level 3
AddFile(3, 500, 600);
AddFile(3, 501, 699);
AddFile(3, 700, 800);
indexer.UpdateIndex(files);
for (uint32_t level = 1; level < 3; ++level) {
for (uint32_t f = 0; f < 3; ++f) {
GetNextLevelIndex(level, f, -1, -1, &left, &right);
ASSERT_EQ(f == 0 ? 0 : 3, left);
ASSERT_EQ(2, right);
GetNextLevelIndex(level, f, 0, -1, &left, &right);
ASSERT_EQ(3, left);
ASSERT_EQ(2, right);
GetNextLevelIndex(level, f, 1, -1, &left, &right);
ASSERT_EQ(3, left);
ASSERT_EQ(2, right);
GetNextLevelIndex(level, f, 1, -1, &left, &right);
ASSERT_EQ(3, left);
ASSERT_EQ(2, right);
GetNextLevelIndex(level, f, 1, 0, &left, &right);
ASSERT_EQ(3, left);
ASSERT_EQ(2, right);
GetNextLevelIndex(level, f, 1, 1, &left, &right);
ASSERT_EQ(3, left);
ASSERT_EQ(2, right);
}
}
// Case 3: empty L2
Reset();
for (uint32_t i = 1; i < kNumLevels; ++i) {
ASSERT_EQ(0U, indexer.LevelIndexSize(i));
}
// level 1
AddFile(1, 2100, 2200);
AddFile(1, 2300, 2400);
AddFile(1, 2500, 2600);
// level 3
AddFile(3, 500, 600);
AddFile(3, 501, 699);
AddFile(3, 700, 800);
indexer.UpdateIndex(files);
for (uint32_t f = 0; f < 3; ++f) {
GetNextLevelIndex(1, f, -1, -1, &left, &right);
ASSERT_EQ(0, left);
ASSERT_EQ(-1, right);
GetNextLevelIndex(1, f, 0, -1, &left, &right);
ASSERT_EQ(0, left);
ASSERT_EQ(-1, right);
GetNextLevelIndex(1, f, 1, -1, &left, &right);
ASSERT_EQ(0, left);
ASSERT_EQ(-1, right);
GetNextLevelIndex(1, f, 1, -1, &left, &right);
ASSERT_EQ(0, left);
ASSERT_EQ(-1, right);
GetNextLevelIndex(1, f, 1, 0, &left, &right);
ASSERT_EQ(0, left);
ASSERT_EQ(-1, right);
GetNextLevelIndex(1, f, 1, 1, &left, &right);
ASSERT_EQ(0, left);
ASSERT_EQ(-1, right);
}
// Case 4: mixed
Reset();
for (uint32_t i = 1; i < kNumLevels; ++i) {
ASSERT_EQ(0U, indexer.LevelIndexSize(i));
}
// level 1
AddFile(1, 100, 200);
AddFile(1, 250, 400);
AddFile(1, 450, 500);
// level 2
AddFile(2, 100, 150); // 0
AddFile(2, 200, 250); // 1
AddFile(2, 251, 300); // 2
AddFile(2, 301, 350); // 3
AddFile(2, 500, 600); // 4
// level 3
AddFile(3, 0, 50);
AddFile(3, 100, 200);
AddFile(3, 201, 250);
indexer.UpdateIndex(files);
// level 1, 0
GetNextLevelIndex(1, 0, -1, -1, &left, &right);
ASSERT_EQ(0, left);
ASSERT_EQ(0, right);
GetNextLevelIndex(1, 0, 0, -1, &left, &right);
ASSERT_EQ(0, left);
ASSERT_EQ(0, right);
GetNextLevelIndex(1, 0, 1, -1, &left, &right);
ASSERT_EQ(0, left);
ASSERT_EQ(1, right);
GetNextLevelIndex(1, 0, 1, 0, &left, &right);
ASSERT_EQ(1, left);
ASSERT_EQ(1, right);
GetNextLevelIndex(1, 0, 1, 1, &left, &right);
ASSERT_EQ(1, left);
ASSERT_EQ(4, right);
// level 1, 1
GetNextLevelIndex(1, 1, -1, -1, &left, &right);
ASSERT_EQ(1, left);
ASSERT_EQ(1, right);
GetNextLevelIndex(1, 1, 0, -1, &left, &right);
ASSERT_EQ(1, left);
ASSERT_EQ(1, right);
GetNextLevelIndex(1, 1, 1, -1, &left, &right);
ASSERT_EQ(1, left);
ASSERT_EQ(3, right);
GetNextLevelIndex(1, 1, 1, 0, &left, &right);
ASSERT_EQ(4, left);
ASSERT_EQ(3, right);
GetNextLevelIndex(1, 1, 1, 1, &left, &right);
ASSERT_EQ(4, left);
ASSERT_EQ(4, right);
// level 1, 2
GetNextLevelIndex(1, 2, -1, -1, &left, &right);
ASSERT_EQ(4, left);
ASSERT_EQ(3, right);
GetNextLevelIndex(1, 2, 0, -1, &left, &right);
ASSERT_EQ(4, left);
ASSERT_EQ(3, right);
GetNextLevelIndex(1, 2, 1, -1, &left, &right);
ASSERT_EQ(4, left);
ASSERT_EQ(4, right);
GetNextLevelIndex(1, 2, 1, 0, &left, &right);
ASSERT_EQ(4, left);
ASSERT_EQ(4, right);
GetNextLevelIndex(1, 2, 1, 1, &left, &right);
ASSERT_EQ(4, left);
ASSERT_EQ(4, right);
// level 2, 0
GetNextLevelIndex(2, 0, -1, -1, &left, &right);
ASSERT_EQ(0, left);
ASSERT_EQ(1, right);
GetNextLevelIndex(2, 0, 0, -1, &left, &right);
ASSERT_EQ(1, left);
ASSERT_EQ(1, right);
GetNextLevelIndex(2, 0, 1, -1, &left, &right);
ASSERT_EQ(1, left);
ASSERT_EQ(1, right);
GetNextLevelIndex(2, 0, 1, 0, &left, &right);
ASSERT_EQ(1, left);
ASSERT_EQ(1, right);
GetNextLevelIndex(2, 0, 1, 1, &left, &right);
ASSERT_EQ(1, left);
ASSERT_EQ(2, right);
// level 2, 1
GetNextLevelIndex(2, 1, -1, -1, &left, &right);
ASSERT_EQ(1, left);
ASSERT_EQ(1, right);
GetNextLevelIndex(2, 1, 0, -1, &left, &right);
ASSERT_EQ(1, left);
ASSERT_EQ(1, right);
GetNextLevelIndex(2, 1, 1, -1, &left, &right);
ASSERT_EQ(1, left);
ASSERT_EQ(2, right);
GetNextLevelIndex(2, 1, 1, 0, &left, &right);
ASSERT_EQ(2, left);
ASSERT_EQ(2, right);
GetNextLevelIndex(2, 1, 1, 1, &left, &right);
ASSERT_EQ(2, left);
ASSERT_EQ(2, right);
// level 2, [2 - 4], no overlap
for (uint32_t f = 2; f <= 4; ++f) {
GetNextLevelIndex(2, f, -1, -1, &left, &right);
ASSERT_EQ(f == 2 ? 2 : 3, left);
ASSERT_EQ(2, right);
GetNextLevelIndex(2, f, 0, -1, &left, &right);
ASSERT_EQ(3, left);
ASSERT_EQ(2, right);
GetNextLevelIndex(2, f, 1, -1, &left, &right);
ASSERT_EQ(3, left);
ASSERT_EQ(2, right);
GetNextLevelIndex(2, f, 1, 0, &left, &right);
ASSERT_EQ(3, left);
ASSERT_EQ(2, right);
GetNextLevelIndex(2, f, 1, 1, &left, &right);
ASSERT_EQ(3, left);
ASSERT_EQ(2, right);
}
}
} // namespace rocksdb
int main(int argc, char** argv) {
return rocksdb::test::RunAllTests();
}

266
db/filename.cc Normal file
View File

@@ -0,0 +1,266 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/filename.h"
#include <ctype.h>
#include <stdio.h>
#include "db/dbformat.h"
#include "rocksdb/env.h"
#include "util/logging.h"
namespace rocksdb {
// Given a path, flatten the path name by replacing all chars not in
// {[0-9,a-z,A-Z,-,_,.]} with _. And append '\0' at the end.
// Return the number of chars stored in dest not including the trailing '\0'.
static int FlattenPath(const std::string& path, char* dest, int len) {
int write_idx = 0;
int i = 0;
int src_len = path.size();
while (i < src_len && write_idx < len - 1) {
if ((path[i] >= 'a' && path[i] <= 'z') ||
(path[i] >= '0' && path[i] <= '9') ||
(path[i] >= 'A' && path[i] <= 'Z') ||
path[i] == '-' ||
path[i] == '.' ||
path[i] == '_'){
dest[write_idx++] = path[i];
} else {
if (i > 0)
dest[write_idx++] = '_';
}
i++;
}
dest[write_idx] = '\0';
return write_idx;
}
static std::string MakeFileName(const std::string& name, uint64_t number,
const char* suffix) {
char buf[100];
snprintf(buf, sizeof(buf), "/%06llu.%s",
static_cast<unsigned long long>(number),
suffix);
return name + buf;
}
std::string LogFileName(const std::string& name, uint64_t number) {
assert(number > 0);
return MakeFileName(name, number, "log");
}
std::string ArchivalDirectory(const std::string& dir) {
return dir + "/" + ARCHIVAL_DIR;
}
std::string ArchivedLogFileName(const std::string& name, uint64_t number) {
assert(number > 0);
return MakeFileName(name + "/" + ARCHIVAL_DIR, number, "log");
}
std::string TableFileName(const std::string& name, uint64_t number) {
assert(number > 0);
return MakeFileName(name, number, "sst");
}
std::string DescriptorFileName(const std::string& dbname, uint64_t number) {
assert(number > 0);
char buf[100];
snprintf(buf, sizeof(buf), "/MANIFEST-%06llu",
static_cast<unsigned long long>(number));
return dbname + buf;
}
std::string CurrentFileName(const std::string& dbname) {
return dbname + "/CURRENT";
}
std::string LockFileName(const std::string& dbname) {
return dbname + "/LOCK";
}
std::string TempFileName(const std::string& dbname, uint64_t number) {
return MakeFileName(dbname, number, "dbtmp");
}
std::string InfoLogFileName(const std::string& dbname,
const std::string& db_path, const std::string& log_dir) {
if (log_dir.empty())
return dbname + "/LOG";
char flatten_db_path[256];
FlattenPath(db_path, flatten_db_path, 256);
return log_dir + "/" + flatten_db_path + "_LOG";
}
// Return the name of the old info log file for "dbname".
std::string OldInfoLogFileName(const std::string& dbname, uint64_t ts,
const std::string& db_path, const std::string& log_dir) {
char buf[50];
snprintf(buf, sizeof(buf), "%llu", static_cast<unsigned long long>(ts));
if (log_dir.empty())
return dbname + "/LOG.old." + buf;
char flatten_db_path[256];
FlattenPath(db_path, flatten_db_path, 256);
return log_dir + "/" + flatten_db_path + "_LOG.old." + buf;
}
std::string MetaDatabaseName(const std::string& dbname, uint64_t number) {
char buf[100];
snprintf(buf, sizeof(buf), "/METADB-%llu",
static_cast<unsigned long long>(number));
return dbname + buf;
}
std::string IdentityFileName(const std::string& dbname) {
return dbname + "/IDENTITY";
}
// Owned filenames have the form:
// dbname/IDENTITY
// dbname/CURRENT
// dbname/LOCK
// dbname/LOG
// dbname/LOG.old.[0-9]+
// dbname/MANIFEST-[0-9]+
// dbname/[0-9]+.(log|sst)
// dbname/METADB-[0-9]+
// Disregards / at the beginning
bool ParseFileName(const std::string& fname,
uint64_t* number,
FileType* type,
WalFileType* log_type) {
Slice rest(fname);
if (fname.length() > 1 && fname[0] == '/') {
rest.remove_prefix(1);
}
if (rest == "IDENTITY") {
*number = 0;
*type = kIdentityFile;
} else if (rest == "CURRENT") {
*number = 0;
*type = kCurrentFile;
} else if (rest == "LOCK") {
*number = 0;
*type = kDBLockFile;
} else if (rest == "LOG" || rest == "LOG.old") {
*number = 0;
*type = kInfoLogFile;
} else if (rest.starts_with("LOG.old.")) {
uint64_t ts_suffix;
// sizeof also counts the trailing '\0'.
rest.remove_prefix(sizeof("LOG.old.") - 1);
if (!ConsumeDecimalNumber(&rest, &ts_suffix)) {
return false;
}
*number = ts_suffix;
*type = kInfoLogFile;
} else if (rest.starts_with("MANIFEST-")) {
rest.remove_prefix(strlen("MANIFEST-"));
uint64_t num;
if (!ConsumeDecimalNumber(&rest, &num)) {
return false;
}
if (!rest.empty()) {
return false;
}
*type = kDescriptorFile;
*number = num;
} else if (rest.starts_with("METADB-")) {
rest.remove_prefix(strlen("METADB-"));
uint64_t num;
if (!ConsumeDecimalNumber(&rest, &num)) {
return false;
}
if (!rest.empty()) {
return false;
}
*type = kMetaDatabase;
*number = num;
} else {
// Avoid strtoull() to keep filename format independent of the
// current locale
bool archive_dir_found = false;
if (rest.starts_with(ARCHIVAL_DIR)) {
if (rest.size() <= ARCHIVAL_DIR.size()) {
return false;
}
rest.remove_prefix(ARCHIVAL_DIR.size() + 1); // Add 1 to remove / also
if (log_type) {
*log_type = kArchivedLogFile;
}
archive_dir_found = true;
}
uint64_t num;
if (!ConsumeDecimalNumber(&rest, &num)) {
return false;
}
Slice suffix = rest;
if (suffix == Slice(".log")) {
*type = kLogFile;
if (log_type && !archive_dir_found) {
*log_type = kAliveLogFile;
}
} else if (archive_dir_found) {
return false; // Archive dir can contain only log files
} else if (suffix == Slice(".sst")) {
*type = kTableFile;
} else if (suffix == Slice(".dbtmp")) {
*type = kTempFile;
} else {
return false;
}
*number = num;
}
return true;
}
Status SetCurrentFile(Env* env, const std::string& dbname,
uint64_t descriptor_number,
Directory* directory_to_fsync) {
// Remove leading "dbname/" and add newline to manifest file name
std::string manifest = DescriptorFileName(dbname, descriptor_number);
Slice contents = manifest;
assert(contents.starts_with(dbname + "/"));
contents.remove_prefix(dbname.size() + 1);
std::string tmp = TempFileName(dbname, descriptor_number);
Status s = WriteStringToFile(env, contents.ToString() + "\n", tmp, true);
if (s.ok()) {
s = env->RenameFile(tmp, CurrentFileName(dbname));
}
if (s.ok()) {
if (directory_to_fsync != nullptr) {
directory_to_fsync->Fsync();
}
} else {
env->DeleteFile(tmp);
}
return s;
}
Status SetIdentityFile(Env* env, const std::string& dbname) {
std::string id = env->GenerateUniqueId();
assert(!id.empty());
// Reserve the filename dbname/000000.dbtmp for the temporary identity file
std::string tmp = TempFileName(dbname, 0);
Status s = WriteStringToFile(env, id, tmp, true);
if (s.ok()) {
s = env->RenameFile(tmp, IdentityFileName(dbname));
}
if (!s.ok()) {
env->DeleteFile(tmp);
}
return s;
}
} // namespace rocksdb

110
db/filename.h Normal file
View File

@@ -0,0 +1,110 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// File names used by DB code
#pragma once
#include <stdint.h>
#include <string>
#include "rocksdb/slice.h"
#include "rocksdb/status.h"
#include "rocksdb/transaction_log.h"
#include "port/port.h"
namespace rocksdb {
class Env;
class Directory;
enum FileType {
kLogFile,
kDBLockFile,
kTableFile,
kDescriptorFile,
kCurrentFile,
kTempFile,
kInfoLogFile, // Either the current one, or an old one
kMetaDatabase,
kIdentityFile
};
// Return the name of the log file with the specified number
// in the db named by "dbname". The result will be prefixed with
// "dbname".
extern std::string LogFileName(const std::string& dbname, uint64_t number);
static const std::string ARCHIVAL_DIR = "archive";
extern std::string ArchivalDirectory(const std::string& dbname);
// Return the name of the archived log file with the specified number
// in the db named by "dbname". The result will be prefixed with "dbname".
extern std::string ArchivedLogFileName(const std::string& dbname,
uint64_t num);
// Return the name of the sstable with the specified number
// in the db named by "dbname". The result will be prefixed with
// "dbname".
extern std::string TableFileName(const std::string& dbname, uint64_t number);
// Return the name of the descriptor file for the db named by
// "dbname" and the specified incarnation number. The result will be
// prefixed with "dbname".
extern std::string DescriptorFileName(const std::string& dbname,
uint64_t number);
// Return the name of the current file. This file contains the name
// of the current manifest file. The result will be prefixed with
// "dbname".
extern std::string CurrentFileName(const std::string& dbname);
// Return the name of the lock file for the db named by
// "dbname". The result will be prefixed with "dbname".
extern std::string LockFileName(const std::string& dbname);
// Return the name of a temporary file owned by the db named "dbname".
// The result will be prefixed with "dbname".
extern std::string TempFileName(const std::string& dbname, uint64_t number);
// Return the name of the info log file for "dbname".
extern std::string InfoLogFileName(const std::string& dbname,
const std::string& db_path="", const std::string& log_dir="");
// Return the name of the old info log file for "dbname".
extern std::string OldInfoLogFileName(const std::string& dbname, uint64_t ts,
const std::string& db_path="", const std::string& log_dir="");
// Return the name to use for a metadatabase. The result will be prefixed with
// "dbname".
extern std::string MetaDatabaseName(const std::string& dbname,
uint64_t number);
// Return the name of the Identity file which stores a unique number for the db
// that will get regenerated if the db loses all its data and is recreated fresh
// either from a backup-image or empty
extern std::string IdentityFileName(const std::string& dbname);
// If filename is a rocksdb file, store the type of the file in *type.
// The number encoded in the filename is stored in *number. If the
// filename was successfully parsed, returns true. Else return false.
extern bool ParseFileName(const std::string& filename,
uint64_t* number,
FileType* type,
WalFileType* log_type = nullptr);
// Make the CURRENT file point to the descriptor file with the
// specified number.
extern Status SetCurrentFile(Env* env, const std::string& dbname,
uint64_t descriptor_number,
Directory* directory_to_fsync);
// Make the IDENTITY file for the db
extern Status SetIdentityFile(Env* env, const std::string& dbname);
} // namespace rocksdb

140
db/filename_test.cc Normal file
View File

@@ -0,0 +1,140 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/filename.h"
#include "db/dbformat.h"
#include "port/port.h"
#include "util/logging.h"
#include "util/testharness.h"
namespace rocksdb {
class FileNameTest { };
TEST(FileNameTest, Parse) {
Slice db;
FileType type;
uint64_t number;
// Successful parses
static struct {
const char* fname;
uint64_t number;
FileType type;
} cases[] = {
{ "100.log", 100, kLogFile },
{ "0.log", 0, kLogFile },
{ "0.sst", 0, kTableFile },
{ "CURRENT", 0, kCurrentFile },
{ "LOCK", 0, kDBLockFile },
{ "MANIFEST-2", 2, kDescriptorFile },
{ "MANIFEST-7", 7, kDescriptorFile },
{ "METADB-2", 2, kMetaDatabase },
{ "METADB-7", 7, kMetaDatabase },
{ "LOG", 0, kInfoLogFile },
{ "LOG.old", 0, kInfoLogFile },
{ "18446744073709551615.log", 18446744073709551615ull, kLogFile },
};
for (unsigned int i = 0; i < sizeof(cases) / sizeof(cases[0]); i++) {
std::string f = cases[i].fname;
ASSERT_TRUE(ParseFileName(f, &number, &type)) << f;
ASSERT_EQ(cases[i].type, type) << f;
ASSERT_EQ(cases[i].number, number) << f;
}
// Errors
static const char* errors[] = {
"",
"foo",
"foo-dx-100.log",
".log",
"",
"manifest",
"CURREN",
"CURRENTX",
"MANIFES",
"MANIFEST",
"MANIFEST-",
"XMANIFEST-3",
"MANIFEST-3x",
"META",
"METADB",
"METADB-",
"XMETADB-3",
"METADB-3x",
"LOC",
"LOCKx",
"LO",
"LOGx",
"18446744073709551616.log",
"184467440737095516150.log",
"100",
"100.",
"100.lop"
};
for (unsigned int i = 0; i < sizeof(errors) / sizeof(errors[0]); i++) {
std::string f = errors[i];
ASSERT_TRUE(!ParseFileName(f, &number, &type)) << f;
};
}
TEST(FileNameTest, Construction) {
uint64_t number;
FileType type;
std::string fname;
fname = CurrentFileName("foo");
ASSERT_EQ("foo/", std::string(fname.data(), 4));
ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
ASSERT_EQ(0U, number);
ASSERT_EQ(kCurrentFile, type);
fname = LockFileName("foo");
ASSERT_EQ("foo/", std::string(fname.data(), 4));
ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
ASSERT_EQ(0U, number);
ASSERT_EQ(kDBLockFile, type);
fname = LogFileName("foo", 192);
ASSERT_EQ("foo/", std::string(fname.data(), 4));
ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
ASSERT_EQ(192U, number);
ASSERT_EQ(kLogFile, type);
fname = TableFileName("bar", 200);
ASSERT_EQ("bar/", std::string(fname.data(), 4));
ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
ASSERT_EQ(200U, number);
ASSERT_EQ(kTableFile, type);
fname = DescriptorFileName("bar", 100);
ASSERT_EQ("bar/", std::string(fname.data(), 4));
ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
ASSERT_EQ(100U, number);
ASSERT_EQ(kDescriptorFile, type);
fname = TempFileName("tmp", 999);
ASSERT_EQ("tmp/", std::string(fname.data(), 4));
ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
ASSERT_EQ(999U, number);
ASSERT_EQ(kTempFile, type);
fname = MetaDatabaseName("met", 100);
ASSERT_EQ("met/", std::string(fname.data(), 4));
ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
ASSERT_EQ(100U, number);
ASSERT_EQ(kMetaDatabase, type);
}
} // namespace rocksdb
int main(int argc, char** argv) {
return rocksdb::test::RunAllTests();
}

383
db/forward_iterator.cc Normal file
View File

@@ -0,0 +1,383 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#ifndef ROCKSDB_LITE
#include "db/forward_iterator.h"
#include <string>
#include <utility>
#include <limits>
#include "db/db_impl.h"
#include "db/db_iter.h"
#include "db/column_family.h"
#include "rocksdb/env.h"
#include "rocksdb/slice.h"
#include "rocksdb/slice_transform.h"
#include "table/merger.h"
#include "db/dbformat.h"
namespace rocksdb {
// Usage:
// LevelIterator iter;
// iter.SetFileIndex(file_index);
// iter.Seek(target);
// iter.Next()
class LevelIterator : public Iterator {
public:
LevelIterator(const ColumnFamilyData* const cfd,
const ReadOptions& read_options,
const std::vector<FileMetaData*>& files)
: cfd_(cfd), read_options_(read_options), files_(files), valid_(false),
file_index_(std::numeric_limits<uint32_t>::max()) {}
void SetFileIndex(uint32_t file_index) {
assert(file_index < files_.size());
if (file_index != file_index_) {
file_index_ = file_index;
file_iter_.reset(cfd_->table_cache()->NewIterator(
read_options_, *(cfd_->soptions()), cfd_->internal_comparator(),
*(files_[file_index_]), nullptr /* table_reader_ptr */, false));
}
valid_ = false;
}
void SeekToLast() override {
status_ = Status::NotSupported("LevelIterator::SeekToLast()");
valid_ = false;
}
void Prev() {
status_ = Status::NotSupported("LevelIterator::Prev()");
valid_ = false;
}
bool Valid() const override {
return valid_;
}
void SeekToFirst() override {
SetFileIndex(0);
file_iter_->SeekToFirst();
valid_ = file_iter_->Valid();
}
void Seek(const Slice& internal_key) override {
assert(file_iter_ != nullptr);
file_iter_->Seek(internal_key);
valid_ = file_iter_->Valid();
assert(valid_);
}
void Next() override {
assert(valid_);
file_iter_->Next();
while (!file_iter_->Valid()) {
if (file_index_ + 1 >= files_.size()) {
valid_ = false;
return;
}
SetFileIndex(file_index_ + 1);
file_iter_->SeekToFirst();
}
valid_ = file_iter_->Valid();
}
Slice key() const override {
assert(valid_);
return file_iter_->key();
}
Slice value() const override {
assert(valid_);
return file_iter_->value();
}
Status status() const override {
return status_;
}
private:
const ColumnFamilyData* const cfd_;
const ReadOptions& read_options_;
const std::vector<FileMetaData*>& files_;
bool valid_;
uint32_t file_index_;
Status status_;
std::unique_ptr<Iterator> file_iter_;
};
ForwardIterator::ForwardIterator(DBImpl* db, const ReadOptions& read_options,
ColumnFamilyData* cfd)
: db_(db),
read_options_(read_options),
cfd_(cfd),
prefix_extractor_(cfd->options()->prefix_extractor.get()),
user_comparator_(cfd->user_comparator()),
immutable_min_heap_(MinIterComparator(&cfd_->internal_comparator())),
sv_(nullptr),
mutable_iter_(nullptr),
current_(nullptr),
valid_(false),
is_prev_set_(false) {}
ForwardIterator::~ForwardIterator() {
Cleanup();
}
void ForwardIterator::Cleanup() {
delete mutable_iter_;
for (auto* m : imm_iters_) {
delete m;
}
imm_iters_.clear();
for (auto* f : l0_iters_) {
delete f;
}
l0_iters_.clear();
for (auto* l : level_iters_) {
delete l;
}
level_iters_.clear();
if (sv_ != nullptr && sv_->Unref()) {
DBImpl::DeletionState deletion_state;
db_->mutex_.Lock();
sv_->Cleanup();
db_->FindObsoleteFiles(deletion_state, false, true);
db_->mutex_.Unlock();
delete sv_;
if (deletion_state.HaveSomethingToDelete()) {
db_->PurgeObsoleteFiles(deletion_state);
}
}
}
bool ForwardIterator::Valid() const {
return valid_;
}
void ForwardIterator::SeekToFirst() {
if (sv_ == nullptr ||
sv_ ->version_number != cfd_->GetSuperVersionNumber()) {
RebuildIterators();
}
SeekInternal(Slice(), true);
}
void ForwardIterator::Seek(const Slice& internal_key) {
if (sv_ == nullptr ||
sv_ ->version_number != cfd_->GetSuperVersionNumber()) {
RebuildIterators();
}
SeekInternal(internal_key, false);
}
void ForwardIterator::SeekInternal(const Slice& internal_key,
bool seek_to_first) {
// mutable
seek_to_first ? mutable_iter_->SeekToFirst() :
mutable_iter_->Seek(internal_key);
// immutable
// TODO(ljin): NeedToSeekImmutable has negative impact on performance
// if it turns to need to seek immutable often. We probably want to have
// an option to turn it off.
if (seek_to_first || NeedToSeekImmutable(internal_key)) {
{
auto tmp = MinIterHeap(MinIterComparator(&cfd_->internal_comparator()));
immutable_min_heap_.swap(tmp);
}
for (auto* m : imm_iters_) {
seek_to_first ? m->SeekToFirst() : m->Seek(internal_key);
if (m->Valid()) {
immutable_min_heap_.push(m);
}
}
auto* files = sv_->current->files_;
for (uint32_t i = 0; i < files[0].size(); ++i) {
if (seek_to_first) {
l0_iters_[i]->SeekToFirst();
} else {
// If the target key passes over the larget key, we are sure Next()
// won't go over this file.
if (user_comparator_->Compare(ExtractUserKey(internal_key),
files[0][i]->largest.user_key()) > 0) {
continue;
}
l0_iters_[i]->Seek(internal_key);
}
if (l0_iters_[i]->Valid()) {
immutable_min_heap_.push(l0_iters_[i]);
}
}
for (int32_t level = 1; level < sv_->current->NumberLevels(); ++level) {
if (files[level].empty()) {
continue;
}
assert(level_iters_[level - 1] != nullptr);
uint32_t f_idx = 0;
if (!seek_to_first) {
f_idx = FindFileInRange(
files[level], internal_key, 0, files[level].size());
}
if (f_idx < files[level].size()) {
level_iters_[level - 1]->SetFileIndex(f_idx);
seek_to_first ? level_iters_[level - 1]->SeekToFirst() :
level_iters_[level - 1]->Seek(internal_key);
if (level_iters_[level - 1]->Valid()) {
immutable_min_heap_.push(level_iters_[level - 1]);
}
}
}
if (seek_to_first || immutable_min_heap_.empty()) {
is_prev_set_ = false;
} else {
prev_key_.SetKey(internal_key);
is_prev_set_ = true;
}
}
UpdateCurrent();
}
void ForwardIterator::Next() {
assert(valid_);
if (sv_ == nullptr ||
sv_ ->version_number != cfd_->GetSuperVersionNumber()) {
std::string current_key = key().ToString();
Slice old_key(current_key.data(), current_key.size());
RebuildIterators();
SeekInternal(old_key, false);
if (!valid_ || key().compare(old_key) != 0) {
return;
}
} else if (current_ != mutable_iter_) {
// It is going to advance immutable iterator
prev_key_.SetKey(current_->key());
is_prev_set_ = true;
}
current_->Next();
if (current_->Valid() && current_ != mutable_iter_) {
immutable_min_heap_.push(current_);
}
UpdateCurrent();
}
Slice ForwardIterator::key() const {
assert(valid_);
return current_->key();
}
Slice ForwardIterator::value() const {
assert(valid_);
return current_->value();
}
Status ForwardIterator::status() const {
if (!status_.ok()) {
return status_;
} else if (!mutable_iter_->status().ok()) {
return mutable_iter_->status();
}
return Status::OK();
}
void ForwardIterator::RebuildIterators() {
// Clean up
Cleanup();
// New
sv_ = cfd_->GetReferencedSuperVersion(&(db_->mutex_));
mutable_iter_ = sv_->mem->NewIterator(read_options_);
sv_->imm->AddIterators(read_options_, &imm_iters_);
const auto& l0_files = sv_->current->files_[0];
l0_iters_.reserve(l0_files.size());
for (const auto* l0 : l0_files) {
l0_iters_.push_back(cfd_->table_cache()->NewIterator(
read_options_, *cfd_->soptions(), cfd_->internal_comparator(), *l0));
}
level_iters_.reserve(sv_->current->NumberLevels() - 1);
for (int32_t level = 1; level < sv_->current->NumberLevels(); ++level) {
if (sv_->current->files_[level].empty()) {
level_iters_.push_back(nullptr);
} else {
level_iters_.push_back(new LevelIterator(cfd_, read_options_,
sv_->current->files_[level]));
}
}
current_ = nullptr;
is_prev_set_ = false;
}
void ForwardIterator::UpdateCurrent() {
if (immutable_min_heap_.empty() && !mutable_iter_->Valid()) {
current_ = nullptr;
} else if (immutable_min_heap_.empty()) {
current_ = mutable_iter_;
} else if (!mutable_iter_->Valid()) {
current_ = immutable_min_heap_.top();
immutable_min_heap_.pop();
} else {
current_ = immutable_min_heap_.top();
assert(current_ != nullptr);
assert(current_->Valid());
int cmp = cfd_->internal_comparator().InternalKeyComparator::Compare(
mutable_iter_->key(), current_->key()) > 0;
assert(cmp != 0);
if (cmp > 0) {
immutable_min_heap_.pop();
} else {
current_ = mutable_iter_;
}
}
valid_ = (current_ != nullptr);
if (!status_.ok()) {
status_ = Status::OK();
}
}
bool ForwardIterator::NeedToSeekImmutable(const Slice& target) {
if (!is_prev_set_) {
return true;
}
Slice prev_key = prev_key_.GetKey();
if (prefix_extractor_ && prefix_extractor_->Transform(target).compare(
prefix_extractor_->Transform(prev_key)) != 0) {
return true;
}
if (cfd_->internal_comparator().InternalKeyComparator::Compare(
prev_key, target) >= 0) {
return true;
}
if (immutable_min_heap_.empty() ||
cfd_->internal_comparator().InternalKeyComparator::Compare(
target, current_ == mutable_iter_ ? immutable_min_heap_.top()->key()
: current_->key()) > 0) {
return true;
}
return false;
}
uint32_t ForwardIterator::FindFileInRange(
const std::vector<FileMetaData*>& files, const Slice& internal_key,
uint32_t left, uint32_t right) {
while (left < right) {
uint32_t mid = (left + right) / 2;
const FileMetaData* f = files[mid];
if (cfd_->internal_comparator().InternalKeyComparator::Compare(
f->largest.Encode(), internal_key) < 0) {
// Key at "mid.largest" is < "target". Therefore all
// files at or before "mid" are uninteresting.
left = mid + 1;
} else {
// Key at "mid.largest" is >= "target". Therefore all files
// after "mid" are uninteresting.
right = mid;
}
}
return right;
}
} // namespace rocksdb
#endif // ROCKSDB_LITE

105
db/forward_iterator.h Normal file
View File

@@ -0,0 +1,105 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#pragma once
#ifndef ROCKSDB_LITE
#include <string>
#include <vector>
#include <queue>
#include "rocksdb/db.h"
#include "rocksdb/iterator.h"
#include "rocksdb/options.h"
#include "db/dbformat.h"
namespace rocksdb {
class DBImpl;
class Env;
struct SuperVersion;
class ColumnFamilyData;
class LevelIterator;
struct FileMetaData;
class MinIterComparator {
public:
explicit MinIterComparator(const Comparator* comparator) :
comparator_(comparator) {}
bool operator()(Iterator* a, Iterator* b) {
return comparator_->Compare(a->key(), b->key()) > 0;
}
private:
const Comparator* comparator_;
};
typedef std::priority_queue<Iterator*,
std::vector<Iterator*>,
MinIterComparator> MinIterHeap;
/**
* ForwardIterator is a special type of iterator that only supports Seek()
* and Next(). It is expected to perform better than TailingIterator by
* removing the encapsulation and making all information accessible within
* the iterator. At the current implementation, snapshot is taken at the
* time Seek() is called. The Next() followed do not see new values after.
*/
class ForwardIterator : public Iterator {
public:
ForwardIterator(DBImpl* db, const ReadOptions& read_options,
ColumnFamilyData* cfd);
virtual ~ForwardIterator();
void SeekToLast() override {
status_ = Status::NotSupported("ForwardIterator::SeekToLast()");
valid_ = false;
}
void Prev() {
status_ = Status::NotSupported("ForwardIterator::Prev");
valid_ = false;
}
virtual bool Valid() const override;
void SeekToFirst() override;
virtual void Seek(const Slice& target) override;
virtual void Next() override;
virtual Slice key() const override;
virtual Slice value() const override;
virtual Status status() const override;
private:
void Cleanup();
void RebuildIterators();
void SeekInternal(const Slice& internal_key, bool seek_to_first);
void UpdateCurrent();
bool NeedToSeekImmutable(const Slice& internal_key);
uint32_t FindFileInRange(
const std::vector<FileMetaData*>& files, const Slice& internal_key,
uint32_t left, uint32_t right);
DBImpl* const db_;
const ReadOptions read_options_;
ColumnFamilyData* const cfd_;
const SliceTransform* const prefix_extractor_;
const Comparator* user_comparator_;
MinIterHeap immutable_min_heap_;
SuperVersion* sv_;
Iterator* mutable_iter_;
std::vector<Iterator*> imm_iters_;
std::vector<Iterator*> l0_iters_;
std::vector<LevelIterator*> level_iters_;
Iterator* current_;
// internal iterator status
Status status_;
bool valid_;
IterKey prev_key_;
bool is_prev_set_;
};
} // namespace rocksdb
#endif // ROCKSDB_LITE

369
db/internal_stats.cc Normal file
View File

@@ -0,0 +1,369 @@
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/internal_stats.h"
#include "db/column_family.h"
#include <vector>
namespace rocksdb {
DBPropertyType GetPropertyType(const Slice& property) {
Slice in = property;
Slice prefix("rocksdb.");
if (!in.starts_with(prefix)) return kUnknown;
in.remove_prefix(prefix.size());
if (in.starts_with("num-files-at-level")) {
return kNumFilesAtLevel;
} else if (in == "levelstats") {
return kLevelStats;
} else if (in == "stats") {
return kStats;
} else if (in == "sstables") {
return kSsTables;
} else if (in == "num-immutable-mem-table") {
return kNumImmutableMemTable;
} else if (in == "mem-table-flush-pending") {
return kMemtableFlushPending;
} else if (in == "compaction-pending") {
return kCompactionPending;
} else if (in == "background-errors") {
return kBackgroundErrors;
} else if (in == "cur-size-active-mem-table") {
return kCurSizeActiveMemTable;
} else if (in == "num-entries-active-mem-table") {
return kNumEntriesInMutableMemtable;
} else if (in == "num-entries-imm-mem-tables") {
return kNumEntriesInImmutableMemtable;
}
return kUnknown;
}
bool InternalStats::GetProperty(DBPropertyType property_type,
const Slice& property, std::string* value,
ColumnFamilyData* cfd) {
Version* current = cfd->current();
Slice in = property;
switch (property_type) {
case kNumFilesAtLevel: {
in.remove_prefix(strlen("rocksdb.num-files-at-level"));
uint64_t level;
bool ok = ConsumeDecimalNumber(&in, &level) && in.empty();
if (!ok || (int)level >= number_levels_) {
return false;
} else {
char buf[100];
snprintf(buf, sizeof(buf), "%d",
current->NumLevelFiles(static_cast<int>(level)));
*value = buf;
return true;
}
}
case kLevelStats: {
char buf[1000];
snprintf(buf, sizeof(buf),
"Level Files Size(MB)\n"
"--------------------\n");
value->append(buf);
for (int level = 0; level < number_levels_; level++) {
snprintf(buf, sizeof(buf), "%3d %8d %8.0f\n", level,
current->NumLevelFiles(level),
current->NumLevelBytes(level) / 1048576.0);
value->append(buf);
}
return true;
}
case kStats: {
char buf[1000];
uint64_t wal_bytes = 0;
uint64_t wal_synced = 0;
uint64_t user_bytes_written = 0;
uint64_t write_other = 0;
uint64_t write_self = 0;
uint64_t write_with_wal = 0;
uint64_t total_bytes_written = 0;
uint64_t total_bytes_read = 0;
uint64_t micros_up = env_->NowMicros() - started_at_;
// Add "+1" to make sure seconds_up is > 0 and avoid NaN later
double seconds_up = (micros_up + 1) / 1000000.0;
uint64_t total_slowdown = 0;
uint64_t total_slowdown_count = 0;
uint64_t interval_bytes_written = 0;
uint64_t interval_bytes_read = 0;
uint64_t interval_bytes_new = 0;
double interval_seconds_up = 0;
if (statistics_) {
wal_bytes = statistics_->getTickerCount(WAL_FILE_BYTES);
wal_synced = statistics_->getTickerCount(WAL_FILE_SYNCED);
user_bytes_written = statistics_->getTickerCount(BYTES_WRITTEN);
write_other = statistics_->getTickerCount(WRITE_DONE_BY_OTHER);
write_self = statistics_->getTickerCount(WRITE_DONE_BY_SELF);
write_with_wal = statistics_->getTickerCount(WRITE_WITH_WAL);
}
snprintf(
buf, sizeof(buf),
" Compactions\n"
"Level Files Size(MB) Score Time(sec) Read(MB) Write(MB) Rn(MB) "
" "
"Rnp1(MB) Wnew(MB) RW-Amplify Read(MB/s) Write(MB/s) Rn "
"Rnp1 "
" Wnp1 NewW Count msComp msStall Ln-stall Stall-cnt\n"
"--------------------------------------------------------------------"
"--"
"--------------------------------------------------------------------"
"--"
"----------------------------------------------------------------\n");
value->append(buf);
for (int level = 0; level < number_levels_; level++) {
int files = current->NumLevelFiles(level);
if (compaction_stats_[level].micros > 0 || files > 0) {
int64_t bytes_read = compaction_stats_[level].bytes_readn +
compaction_stats_[level].bytes_readnp1;
int64_t bytes_new = compaction_stats_[level].bytes_written -
compaction_stats_[level].bytes_readnp1;
double amplify =
(compaction_stats_[level].bytes_readn == 0)
? 0.0
: (compaction_stats_[level].bytes_written +
compaction_stats_[level].bytes_readnp1 +
compaction_stats_[level].bytes_readn) /
(double)compaction_stats_[level].bytes_readn;
total_bytes_read += bytes_read;
total_bytes_written += compaction_stats_[level].bytes_written;
uint64_t stalls = level == 0 ? (stall_counts_[LEVEL0_SLOWDOWN] +
stall_counts_[LEVEL0_NUM_FILES] +
stall_counts_[MEMTABLE_COMPACTION])
: stall_leveln_slowdown_count_[level];
double stall_us = level == 0 ? (stall_micros_[LEVEL0_SLOWDOWN] +
stall_micros_[LEVEL0_NUM_FILES] +
stall_micros_[MEMTABLE_COMPACTION])
: stall_leveln_slowdown_[level];
snprintf(buf, sizeof(buf),
"%3d %8d %8.0f %5.1f %9.0f %9.0f %9.0f %9.0f %9.0f %9.0f "
"%10.1f %9.1f %11.1f %8d %8d %8d %8d %8d %8d %9.1f %9.1f "
"%9lu\n",
level, files, current->NumLevelBytes(level) / 1048576.0,
current->NumLevelBytes(level) /
cfd->compaction_picker()->MaxBytesForLevel(level),
compaction_stats_[level].micros / 1e6,
bytes_read / 1048576.0,
compaction_stats_[level].bytes_written / 1048576.0,
compaction_stats_[level].bytes_readn / 1048576.0,
compaction_stats_[level].bytes_readnp1 / 1048576.0,
bytes_new / 1048576.0, amplify,
// +1 to avoid division by 0
(bytes_read / 1048576.0) /
((compaction_stats_[level].micros + 1) / 1000000.0),
(compaction_stats_[level].bytes_written / 1048576.0) /
((compaction_stats_[level].micros + 1) / 1000000.0),
compaction_stats_[level].files_in_leveln,
compaction_stats_[level].files_in_levelnp1,
compaction_stats_[level].files_out_levelnp1,
compaction_stats_[level].files_out_levelnp1 -
compaction_stats_[level].files_in_levelnp1,
compaction_stats_[level].count,
(int)((double)compaction_stats_[level].micros / 1000.0 /
(compaction_stats_[level].count + 1)),
(double)stall_us / 1000.0 / (stalls + 1),
stall_us / 1000000.0, (unsigned long)stalls);
total_slowdown += stall_leveln_slowdown_[level];
total_slowdown_count += stall_leveln_slowdown_count_[level];
value->append(buf);
}
}
interval_bytes_new = user_bytes_written - last_stats_.ingest_bytes_;
interval_bytes_read =
total_bytes_read - last_stats_.compaction_bytes_read_;
interval_bytes_written =
total_bytes_written - last_stats_.compaction_bytes_written_;
interval_seconds_up = seconds_up - last_stats_.seconds_up_;
snprintf(buf, sizeof(buf), "Uptime(secs): %.1f total, %.1f interval\n",
seconds_up, interval_seconds_up);
value->append(buf);
snprintf(buf, sizeof(buf),
"Writes cumulative: %llu total, %llu batches, "
"%.1f per batch, %.2f ingest GB\n",
(unsigned long long)(write_other + write_self),
(unsigned long long)write_self,
(write_other + write_self) / (double)(write_self + 1),
user_bytes_written / (1048576.0 * 1024));
value->append(buf);
snprintf(buf, sizeof(buf),
"WAL cumulative: %llu WAL writes, %llu WAL syncs, "
"%.2f writes per sync, %.2f GB written\n",
(unsigned long long)write_with_wal,
(unsigned long long)wal_synced,
write_with_wal / (double)(wal_synced + 1),
wal_bytes / (1048576.0 * 1024));
value->append(buf);
snprintf(buf, sizeof(buf),
"Compaction IO cumulative (GB): "
"%.2f new, %.2f read, %.2f write, %.2f read+write\n",
user_bytes_written / (1048576.0 * 1024),
total_bytes_read / (1048576.0 * 1024),
total_bytes_written / (1048576.0 * 1024),
(total_bytes_read + total_bytes_written) / (1048576.0 * 1024));
value->append(buf);
snprintf(
buf, sizeof(buf),
"Compaction IO cumulative (MB/sec): "
"%.1f new, %.1f read, %.1f write, %.1f read+write\n",
user_bytes_written / 1048576.0 / seconds_up,
total_bytes_read / 1048576.0 / seconds_up,
total_bytes_written / 1048576.0 / seconds_up,
(total_bytes_read + total_bytes_written) / 1048576.0 / seconds_up);
value->append(buf);
// +1 to avoid divide by 0 and NaN
snprintf(
buf, sizeof(buf),
"Amplification cumulative: %.1f write, %.1f compaction\n",
(double)(total_bytes_written + wal_bytes) / (user_bytes_written + 1),
(double)(total_bytes_written + total_bytes_read + wal_bytes) /
(user_bytes_written + 1));
value->append(buf);
uint64_t interval_write_other = write_other - last_stats_.write_other_;
uint64_t interval_write_self = write_self - last_stats_.write_self_;
snprintf(buf, sizeof(buf),
"Writes interval: %llu total, %llu batches, "
"%.1f per batch, %.1f ingest MB\n",
(unsigned long long)(interval_write_other + interval_write_self),
(unsigned long long)interval_write_self,
(double)(interval_write_other + interval_write_self) /
(interval_write_self + 1),
(user_bytes_written - last_stats_.ingest_bytes_) / 1048576.0);
value->append(buf);
uint64_t interval_write_with_wal =
write_with_wal - last_stats_.write_with_wal_;
uint64_t interval_wal_synced = wal_synced - last_stats_.wal_synced_;
uint64_t interval_wal_bytes = wal_bytes - last_stats_.wal_bytes_;
snprintf(buf, sizeof(buf),
"WAL interval: %llu WAL writes, %llu WAL syncs, "
"%.2f writes per sync, %.2f MB written\n",
(unsigned long long)interval_write_with_wal,
(unsigned long long)interval_wal_synced,
interval_write_with_wal / (double)(interval_wal_synced + 1),
interval_wal_bytes / (1048576.0 * 1024));
value->append(buf);
snprintf(buf, sizeof(buf),
"Compaction IO interval (MB): "
"%.2f new, %.2f read, %.2f write, %.2f read+write\n",
interval_bytes_new / 1048576.0, interval_bytes_read / 1048576.0,
interval_bytes_written / 1048576.0,
(interval_bytes_read + interval_bytes_written) / 1048576.0);
value->append(buf);
snprintf(buf, sizeof(buf),
"Compaction IO interval (MB/sec): "
"%.1f new, %.1f read, %.1f write, %.1f read+write\n",
interval_bytes_new / 1048576.0 / interval_seconds_up,
interval_bytes_read / 1048576.0 / interval_seconds_up,
interval_bytes_written / 1048576.0 / interval_seconds_up,
(interval_bytes_read + interval_bytes_written) / 1048576.0 /
interval_seconds_up);
value->append(buf);
// +1 to avoid divide by 0 and NaN
snprintf(
buf, sizeof(buf),
"Amplification interval: %.1f write, %.1f compaction\n",
(double)(interval_bytes_written + wal_bytes) /
(interval_bytes_new + 1),
(double)(interval_bytes_written + interval_bytes_read + wal_bytes) /
(interval_bytes_new + 1));
value->append(buf);
snprintf(buf, sizeof(buf),
"Stalls(secs): %.3f level0_slowdown, %.3f level0_numfiles, "
"%.3f memtable_compaction, %.3f leveln_slowdown\n",
stall_micros_[LEVEL0_SLOWDOWN] / 1000000.0,
stall_micros_[LEVEL0_NUM_FILES] / 1000000.0,
stall_micros_[MEMTABLE_COMPACTION] / 1000000.0,
total_slowdown / 1000000.0);
value->append(buf);
snprintf(buf, sizeof(buf),
"Stalls(count): %lu level0_slowdown, %lu level0_numfiles, "
"%lu memtable_compaction, %lu leveln_slowdown\n",
(unsigned long)stall_counts_[LEVEL0_SLOWDOWN],
(unsigned long)stall_counts_[LEVEL0_NUM_FILES],
(unsigned long)stall_counts_[MEMTABLE_COMPACTION],
(unsigned long)total_slowdown_count);
value->append(buf);
last_stats_.compaction_bytes_read_ = total_bytes_read;
last_stats_.compaction_bytes_written_ = total_bytes_written;
last_stats_.ingest_bytes_ = user_bytes_written;
last_stats_.seconds_up_ = seconds_up;
last_stats_.wal_bytes_ = wal_bytes;
last_stats_.wal_synced_ = wal_synced;
last_stats_.write_with_wal_ = write_with_wal;
last_stats_.write_other_ = write_other;
last_stats_.write_self_ = write_self;
return true;
}
case kSsTables:
*value = current->DebugString();
return true;
case kNumImmutableMemTable:
*value = std::to_string(cfd->imm()->size());
return true;
case kMemtableFlushPending:
// Return number of mem tables that are ready to flush (made immutable)
*value = std::to_string(cfd->imm()->IsFlushPending() ? 1 : 0);
return true;
case kCompactionPending:
// 1 if the system already determines at least one compacdtion is needed.
// 0 otherwise,
*value = std::to_string(current->NeedsCompaction() ? 1 : 0);
return true;
case kBackgroundErrors:
// Accumulated number of errors in background flushes or compactions.
*value = std::to_string(GetBackgroundErrorCount());
return true;
case kCurSizeActiveMemTable:
// Current size of the active memtable
*value = std::to_string(cfd->mem()->ApproximateMemoryUsage());
return true;
case kNumEntriesInMutableMemtable:
// Current size of the active memtable
*value = std::to_string(cfd->mem()->GetNumEntries());
return true;
case kNumEntriesInImmutableMemtable:
// Current size of the active memtable
*value = std::to_string(cfd->imm()->current()->GetTotalNumEntries());
return true;
default:
return false;
}
}
} // namespace rocksdb

187
db/internal_stats.h Normal file
View File

@@ -0,0 +1,187 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
#pragma once
#include "rocksdb/statistics.h"
#include "util/statistics.h"
#include "db/version_set.h"
#include <vector>
#include <string>
class ColumnFamilyData;
namespace rocksdb {
class MemTableList;
class DBImpl;
enum DBPropertyType {
kNumFilesAtLevel, // Number of files at a specific level
kLevelStats, // Return number of files and total sizes of each level
kStats, // Return general statitistics of DB
kSsTables, // Return a human readable string of current SST files
kNumImmutableMemTable, // Return number of immutable mem tables
kMemtableFlushPending, // Return 1 if mem table flushing is pending,
// otherwise 0.
kCompactionPending, // Return 1 if a compaction is pending. Otherwise 0.
kBackgroundErrors, // Return accumulated background errors encountered.
kCurSizeActiveMemTable, // Return current size of the active memtable
kNumEntriesInMutableMemtable, // Return number of entries in the mutable
// memtable.
kNumEntriesInImmutableMemtable, // Return sum of number of entries in all
// the immutable mem tables.
kUnknown,
};
extern DBPropertyType GetPropertyType(const Slice& property);
class InternalStats {
public:
enum WriteStallType {
LEVEL0_SLOWDOWN,
MEMTABLE_COMPACTION,
LEVEL0_NUM_FILES,
WRITE_STALLS_ENUM_MAX,
};
InternalStats(int num_levels, Env* env, Statistics* statistics)
: compaction_stats_(num_levels),
stall_micros_(WRITE_STALLS_ENUM_MAX, 0),
stall_counts_(WRITE_STALLS_ENUM_MAX, 0),
stall_leveln_slowdown_(num_levels, 0),
stall_leveln_slowdown_count_(num_levels, 0),
bg_error_count_(0),
number_levels_(num_levels),
statistics_(statistics),
env_(env),
started_at_(env->NowMicros()) {}
// Per level compaction stats. compaction_stats_[level] stores the stats for
// compactions that produced data for the specified "level".
struct CompactionStats {
uint64_t micros;
// Bytes read from level N during compaction between levels N and N+1
int64_t bytes_readn;
// Bytes read from level N+1 during compaction between levels N and N+1
int64_t bytes_readnp1;
// Total bytes written during compaction between levels N and N+1
int64_t bytes_written;
// Files read from level N during compaction between levels N and N+1
int files_in_leveln;
// Files read from level N+1 during compaction between levels N and N+1
int files_in_levelnp1;
// Files written during compaction between levels N and N+1
int files_out_levelnp1;
// Number of compactions done
int count;
CompactionStats()
: micros(0),
bytes_readn(0),
bytes_readnp1(0),
bytes_written(0),
files_in_leveln(0),
files_in_levelnp1(0),
files_out_levelnp1(0),
count(0) {}
void Add(const CompactionStats& c) {
this->micros += c.micros;
this->bytes_readn += c.bytes_readn;
this->bytes_readnp1 += c.bytes_readnp1;
this->bytes_written += c.bytes_written;
this->files_in_leveln += c.files_in_leveln;
this->files_in_levelnp1 += c.files_in_levelnp1;
this->files_out_levelnp1 += c.files_out_levelnp1;
this->count += 1;
}
};
void AddCompactionStats(int level, const CompactionStats& stats) {
compaction_stats_[level].Add(stats);
}
void RecordWriteStall(WriteStallType write_stall_type, uint64_t micros) {
stall_micros_[write_stall_type] += micros;
stall_counts_[write_stall_type]++;
}
void RecordLevelNSlowdown(int level, uint64_t micros) {
stall_leveln_slowdown_[level] += micros;
stall_leveln_slowdown_count_[level] += micros;
}
uint64_t GetBackgroundErrorCount() const { return bg_error_count_; }
uint64_t BumpAndGetBackgroundErrorCount() { return ++bg_error_count_; }
bool GetProperty(DBPropertyType property_type, const Slice& property,
std::string* value, ColumnFamilyData* cfd);
private:
std::vector<CompactionStats> compaction_stats_;
// Used to compute per-interval statistics
struct StatsSnapshot {
uint64_t compaction_bytes_read_; // Bytes read by compaction
uint64_t compaction_bytes_written_; // Bytes written by compaction
uint64_t ingest_bytes_; // Bytes written by user
uint64_t wal_bytes_; // Bytes written to WAL
uint64_t wal_synced_; // Number of times WAL is synced
uint64_t write_with_wal_; // Number of writes that request WAL
// These count the number of writes processed by the calling thread or
// another thread.
uint64_t write_other_;
uint64_t write_self_;
double seconds_up_;
StatsSnapshot()
: compaction_bytes_read_(0),
compaction_bytes_written_(0),
ingest_bytes_(0),
wal_bytes_(0),
wal_synced_(0),
write_with_wal_(0),
write_other_(0),
write_self_(0),
seconds_up_(0) {}
};
// Counters from the previous time per-interval stats were computed
StatsSnapshot last_stats_;
// These count the number of microseconds for which MakeRoomForWrite stalls.
std::vector<uint64_t> stall_micros_;
std::vector<uint64_t> stall_counts_;
std::vector<uint64_t> stall_leveln_slowdown_;
std::vector<uint64_t> stall_leveln_slowdown_count_;
// Total number of background errors encountered. Every time a flush task
// or compaction task fails, this counter is incremented. The failure can
// be caused by any possible reason, including file system errors, out of
// resources, or input file corruption. Failing when retrying the same flush
// or compaction will cause the counter to increase too.
uint64_t bg_error_count_;
int number_levels_;
Statistics* statistics_;
Env* env_;
uint64_t started_at_;
};
} // namespace rocksdb

79
db/log_and_apply_bench.cc Normal file
View File

@@ -0,0 +1,79 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#include <vector>
#include "util/testharness.h"
#include "util/benchharness.h"
#include "db/version_set.h"
#include "util/mutexlock.h"
namespace rocksdb {
std::string MakeKey(unsigned int num) {
char buf[30];
snprintf(buf, sizeof(buf), "%016u", num);
return std::string(buf);
}
void BM_LogAndApply(int iters, int num_base_files) {
VersionSet* vset;
ColumnFamilyData* default_cfd;
uint64_t fnum = 1;
port::Mutex mu;
MutexLock l(&mu);
BENCHMARK_SUSPEND {
std::string dbname = test::TmpDir() + "/rocksdb_test_benchmark";
ASSERT_OK(DestroyDB(dbname, Options()));
DB* db = nullptr;
Options opts;
opts.create_if_missing = true;
Status s = DB::Open(opts, dbname, &db);
ASSERT_OK(s);
ASSERT_TRUE(db != nullptr);
delete db;
db = nullptr;
Options options;
EnvOptions sopt;
vset = new VersionSet(dbname, &options, sopt, nullptr);
std::vector<ColumnFamilyDescriptor> dummy;
dummy.push_back(ColumnFamilyDescriptor());
ASSERT_OK(vset->Recover(dummy));
default_cfd = vset->GetColumnFamilySet()->GetDefault();
VersionEdit vbase;
for (int i = 0; i < num_base_files; i++) {
InternalKey start(MakeKey(2 * fnum), 1, kTypeValue);
InternalKey limit(MakeKey(2 * fnum + 1), 1, kTypeDeletion);
vbase.AddFile(2, ++fnum, 1 /* file size */, start, limit, 1, 1);
}
ASSERT_OK(vset->LogAndApply(default_cfd, &vbase, &mu));
}
for (int i = 0; i < iters; i++) {
VersionEdit vedit;
vedit.DeleteFile(2, fnum);
InternalKey start(MakeKey(2 * fnum), 1, kTypeValue);
InternalKey limit(MakeKey(2 * fnum + 1), 1, kTypeDeletion);
vedit.AddFile(2, ++fnum, 1 /* file size */, start, limit, 1, 1);
vset->LogAndApply(default_cfd, &vedit, &mu);
}
}
BENCHMARK_NAMED_PARAM(BM_LogAndApply, 1000_iters_1_file, 1000, 1)
BENCHMARK_NAMED_PARAM(BM_LogAndApply, 1000_iters_100_files, 1000, 100)
BENCHMARK_NAMED_PARAM(BM_LogAndApply, 1000_iters_10000_files, 1000, 10000)
BENCHMARK_NAMED_PARAM(BM_LogAndApply, 100_iters_100000_files, 100, 100000)
} // namespace rocksdb
int main(int argc, char** argv) {
rocksdb::benchmark::RunBenchmarks();
return 0;
}

35
db/log_format.h Normal file
View File

@@ -0,0 +1,35 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// Log format information shared by reader and writer.
// See ../doc/log_format.txt for more detail.
#pragma once
namespace rocksdb {
namespace log {
enum RecordType {
// Zero is reserved for preallocated files
kZeroType = 0,
kFullType = 1,
// For fragments
kFirstType = 2,
kMiddleType = 3,
kLastType = 4
};
static const int kMaxRecordType = kLastType;
static const unsigned int kBlockSize = 32768;
// Header is checksum (4 bytes), type (1 byte), length (2 bytes).
static const int kHeaderSize = 4 + 1 + 2;
} // namespace log
} // namespace rocksdb

339
db/log_reader.cc Normal file
View File

@@ -0,0 +1,339 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/log_reader.h"
#include <stdio.h>
#include "rocksdb/env.h"
#include "util/coding.h"
#include "util/crc32c.h"
namespace rocksdb {
namespace log {
Reader::Reporter::~Reporter() {
}
Reader::Reader(unique_ptr<SequentialFile>&& file, Reporter* reporter,
bool checksum, uint64_t initial_offset)
: file_(std::move(file)),
reporter_(reporter),
checksum_(checksum),
backing_store_(new char[kBlockSize]),
buffer_(),
eof_(false),
read_error_(false),
eof_offset_(0),
last_record_offset_(0),
end_of_buffer_offset_(0),
initial_offset_(initial_offset) {
}
Reader::~Reader() {
delete[] backing_store_;
}
bool Reader::SkipToInitialBlock() {
size_t offset_in_block = initial_offset_ % kBlockSize;
uint64_t block_start_location = initial_offset_ - offset_in_block;
// Don't search a block if we'd be in the trailer
if (offset_in_block > kBlockSize - 6) {
offset_in_block = 0;
block_start_location += kBlockSize;
}
end_of_buffer_offset_ = block_start_location;
// Skip to start of first block that can contain the initial record
if (block_start_location > 0) {
Status skip_status = file_->Skip(block_start_location);
if (!skip_status.ok()) {
ReportDrop(block_start_location, skip_status);
return false;
}
}
return true;
}
bool Reader::ReadRecord(Slice* record, std::string* scratch) {
if (last_record_offset_ < initial_offset_) {
if (!SkipToInitialBlock()) {
return false;
}
}
scratch->clear();
record->clear();
bool in_fragmented_record = false;
// Record offset of the logical record that we're reading
// 0 is a dummy value to make compilers happy
uint64_t prospective_record_offset = 0;
Slice fragment;
while (true) {
uint64_t physical_record_offset = end_of_buffer_offset_ - buffer_.size();
const unsigned int record_type = ReadPhysicalRecord(&fragment);
switch (record_type) {
case kFullType:
if (in_fragmented_record) {
// Handle bug in earlier versions of log::Writer where
// it could emit an empty kFirstType record at the tail end
// of a block followed by a kFullType or kFirstType record
// at the beginning of the next block.
if (scratch->empty()) {
in_fragmented_record = false;
} else {
ReportCorruption(scratch->size(), "partial record without end(1)");
}
}
prospective_record_offset = physical_record_offset;
scratch->clear();
*record = fragment;
last_record_offset_ = prospective_record_offset;
return true;
case kFirstType:
if (in_fragmented_record) {
// Handle bug in earlier versions of log::Writer where
// it could emit an empty kFirstType record at the tail end
// of a block followed by a kFullType or kFirstType record
// at the beginning of the next block.
if (scratch->empty()) {
in_fragmented_record = false;
} else {
ReportCorruption(scratch->size(), "partial record without end(2)");
}
}
prospective_record_offset = physical_record_offset;
scratch->assign(fragment.data(), fragment.size());
in_fragmented_record = true;
break;
case kMiddleType:
if (!in_fragmented_record) {
ReportCorruption(fragment.size(),
"missing start of fragmented record(1)");
} else {
scratch->append(fragment.data(), fragment.size());
}
break;
case kLastType:
if (!in_fragmented_record) {
ReportCorruption(fragment.size(),
"missing start of fragmented record(2)");
} else {
scratch->append(fragment.data(), fragment.size());
*record = Slice(*scratch);
last_record_offset_ = prospective_record_offset;
return true;
}
break;
case kEof:
if (in_fragmented_record) {
// This can be caused by the writer dying immediately after
// writing a physical record but before completing the next; don't
// treat it as a corruption, just ignore the entire logical record.
scratch->clear();
}
return false;
case kBadRecord:
if (in_fragmented_record) {
ReportCorruption(scratch->size(), "error in middle of record");
in_fragmented_record = false;
scratch->clear();
}
break;
default: {
char buf[40];
snprintf(buf, sizeof(buf), "unknown record type %u", record_type);
ReportCorruption(
(fragment.size() + (in_fragmented_record ? scratch->size() : 0)),
buf);
in_fragmented_record = false;
scratch->clear();
break;
}
}
}
return false;
}
uint64_t Reader::LastRecordOffset() {
return last_record_offset_;
}
void Reader::UnmarkEOF() {
if (read_error_) {
return;
}
eof_ = false;
if (eof_offset_ == 0) {
return;
}
// If the EOF was in the middle of a block (a partial block was read) we have
// to read the rest of the block as ReadPhysicalRecord can only read full
// blocks and expects the file position indicator to be aligned to the start
// of a block.
//
// consumed_bytes + buffer_size() + remaining == kBlockSize
size_t consumed_bytes = eof_offset_ - buffer_.size();
size_t remaining = kBlockSize - eof_offset_;
// backing_store_ is used to concatenate what is left in buffer_ and
// the remainder of the block. If buffer_ already uses backing_store_,
// we just append the new data.
if (buffer_.data() != backing_store_ + consumed_bytes) {
// Buffer_ does not use backing_store_ for storage.
// Copy what is left in buffer_ to backing_store.
memmove(backing_store_ + consumed_bytes, buffer_.data(), buffer_.size());
}
Slice read_buffer;
Status status = file_->Read(remaining, &read_buffer,
backing_store_ + eof_offset_);
size_t added = read_buffer.size();
end_of_buffer_offset_ += added;
if (!status.ok()) {
if (added > 0) {
ReportDrop(added, status);
}
read_error_ = true;
return;
}
if (read_buffer.data() != backing_store_ + eof_offset_) {
// Read did not write to backing_store_
memmove(backing_store_ + eof_offset_, read_buffer.data(),
read_buffer.size());
}
buffer_ = Slice(backing_store_ + consumed_bytes,
eof_offset_ + added - consumed_bytes);
if (added < remaining) {
eof_ = true;
eof_offset_ += added;
} else {
eof_offset_ = 0;
}
}
void Reader::ReportCorruption(size_t bytes, const char* reason) {
ReportDrop(bytes, Status::Corruption(reason));
}
void Reader::ReportDrop(size_t bytes, const Status& reason) {
if (reporter_ != nullptr &&
end_of_buffer_offset_ - buffer_.size() - bytes >= initial_offset_) {
reporter_->Corruption(bytes, reason);
}
}
unsigned int Reader::ReadPhysicalRecord(Slice* result) {
while (true) {
if (buffer_.size() < (size_t)kHeaderSize) {
if (!eof_ && !read_error_) {
// Last read was a full read, so this is a trailer to skip
buffer_.clear();
Status status = file_->Read(kBlockSize, &buffer_, backing_store_);
end_of_buffer_offset_ += buffer_.size();
if (!status.ok()) {
buffer_.clear();
ReportDrop(kBlockSize, status);
read_error_ = true;
return kEof;
} else if (buffer_.size() < (size_t)kBlockSize) {
eof_ = true;
eof_offset_ = buffer_.size();
}
continue;
} else {
// Note that if buffer_ is non-empty, we have a truncated header at the
// end of the file, which can be caused by the writer crashing in the
// middle of writing the header. Instead of considering this an error,
// just report EOF.
buffer_.clear();
return kEof;
}
}
// Parse the header
const char* header = buffer_.data();
const uint32_t a = static_cast<uint32_t>(header[4]) & 0xff;
const uint32_t b = static_cast<uint32_t>(header[5]) & 0xff;
const unsigned int type = header[6];
const uint32_t length = a | (b << 8);
if (kHeaderSize + length > buffer_.size()) {
size_t drop_size = buffer_.size();
buffer_.clear();
if (!eof_) {
ReportCorruption(drop_size, "bad record length");
return kBadRecord;
}
// If the end of the file has been reached without reading |length| bytes
// of payload, assume the writer died in the middle of writing the record.
// Don't report a corruption.
return kEof;
}
if (type == kZeroType && length == 0) {
// Skip zero length record without reporting any drops since
// such records are produced by the mmap based writing code in
// env_posix.cc that preallocates file regions.
// NOTE: this should never happen in DB written by new RocksDB versions,
// since we turn off mmap writes to manifest and log files
buffer_.clear();
return kBadRecord;
}
// Check crc
if (checksum_) {
uint32_t expected_crc = crc32c::Unmask(DecodeFixed32(header));
uint32_t actual_crc = crc32c::Value(header + 6, 1 + length);
if (actual_crc != expected_crc) {
// Drop the rest of the buffer since "length" itself may have
// been corrupted and if we trust it, we could find some
// fragment of a real log record that just happens to look
// like a valid log record.
size_t drop_size = buffer_.size();
buffer_.clear();
ReportCorruption(drop_size, "checksum mismatch");
return kBadRecord;
}
}
buffer_.remove_prefix(kHeaderSize + length);
// Skip physical record that started before initial_offset_
if (end_of_buffer_offset_ - buffer_.size() - kHeaderSize - length <
initial_offset_) {
result->clear();
return kBadRecord;
}
*result = Slice(header + kHeaderSize, length);
return type;
}
}
} // namespace log
} // namespace rocksdb

130
db/log_reader.h Normal file
View File

@@ -0,0 +1,130 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#pragma once
#include <memory>
#include <stdint.h>
#include "db/log_format.h"
#include "rocksdb/slice.h"
#include "rocksdb/status.h"
namespace rocksdb {
class SequentialFile;
using std::unique_ptr;
namespace log {
class Reader {
public:
// Interface for reporting errors.
class Reporter {
public:
virtual ~Reporter();
// Some corruption was detected. "size" is the approximate number
// of bytes dropped due to the corruption.
virtual void Corruption(size_t bytes, const Status& status) = 0;
};
// Create a reader that will return log records from "*file".
// "*file" must remain live while this Reader is in use.
//
// If "reporter" is non-nullptr, it is notified whenever some data is
// dropped due to a detected corruption. "*reporter" must remain
// live while this Reader is in use.
//
// If "checksum" is true, verify checksums if available.
//
// The Reader will start reading at the first record located at physical
// position >= initial_offset within the file.
Reader(unique_ptr<SequentialFile>&& file, Reporter* reporter,
bool checksum, uint64_t initial_offset);
~Reader();
// Read the next record into *record. Returns true if read
// successfully, false if we hit end of the input. May use
// "*scratch" as temporary storage. The contents filled in *record
// will only be valid until the next mutating operation on this
// reader or the next mutation to *scratch.
bool ReadRecord(Slice* record, std::string* scratch);
// Returns the physical offset of the last record returned by ReadRecord.
//
// Undefined before the first call to ReadRecord.
uint64_t LastRecordOffset();
// returns true if the reader has encountered an eof condition.
bool IsEOF() {
return eof_;
}
// when we know more data has been written to the file. we can use this
// function to force the reader to look again in the file.
// Also aligns the file position indicator to the start of the next block
// by reading the rest of the data from the EOF position to the end of the
// block that was partially read.
void UnmarkEOF();
SequentialFile* file() { return file_.get(); }
private:
const unique_ptr<SequentialFile> file_;
Reporter* const reporter_;
bool const checksum_;
char* const backing_store_;
Slice buffer_;
bool eof_; // Last Read() indicated EOF by returning < kBlockSize
bool read_error_; // Error occurred while reading from file
// Offset of the file position indicator within the last block when an
// EOF was detected.
size_t eof_offset_;
// Offset of the last record returned by ReadRecord.
uint64_t last_record_offset_;
// Offset of the first location past the end of buffer_.
uint64_t end_of_buffer_offset_;
// Offset at which to start looking for the first record to return
uint64_t const initial_offset_;
// Extend record types with the following special values
enum {
kEof = kMaxRecordType + 1,
// Returned whenever we find an invalid physical record.
// Currently there are three situations in which this happens:
// * The record has an invalid CRC (ReadPhysicalRecord reports a drop)
// * The record is a 0-length record (No drop is reported)
// * The record is below constructor's initial_offset (No drop is reported)
kBadRecord = kMaxRecordType + 2
};
// Skips all blocks that are completely before "initial_offset_".
//
// Returns true on success. Handles reporting.
bool SkipToInitialBlock();
// Return type, or one of the preceding special values
unsigned int ReadPhysicalRecord(Slice* result);
// Reports dropped bytes to the reporter.
// buffer_ must be updated to remove the dropped bytes prior to invocation.
void ReportCorruption(size_t bytes, const char* reason);
void ReportDrop(size_t bytes, const Status& reason);
// No copying allowed
Reader(const Reader&);
void operator=(const Reader&);
};
} // namespace log
} // namespace rocksdb

689
db/log_test.cc Normal file
View File

@@ -0,0 +1,689 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/log_reader.h"
#include "db/log_writer.h"
#include "rocksdb/env.h"
#include "util/coding.h"
#include "util/crc32c.h"
#include "util/random.h"
#include "util/testharness.h"
namespace rocksdb {
namespace log {
// Construct a string of the specified length made out of the supplied
// partial string.
static std::string BigString(const std::string& partial_string, size_t n) {
std::string result;
while (result.size() < n) {
result.append(partial_string);
}
result.resize(n);
return result;
}
// Construct a string from a number
static std::string NumberString(int n) {
char buf[50];
snprintf(buf, sizeof(buf), "%d.", n);
return std::string(buf);
}
// Return a skewed potentially long string
static std::string RandomSkewedString(int i, Random* rnd) {
return BigString(NumberString(i), rnd->Skewed(17));
}
class LogTest {
private:
class StringDest : public WritableFile {
public:
std::string contents_;
explicit StringDest(Slice& reader_contents) :
WritableFile(),
contents_(""),
reader_contents_(reader_contents),
last_flush_(0) {
reader_contents_ = Slice(contents_.data(), 0);
};
virtual Status Close() { return Status::OK(); }
virtual Status Flush() {
ASSERT_TRUE(reader_contents_.size() <= last_flush_);
size_t offset = last_flush_ - reader_contents_.size();
reader_contents_ = Slice(
contents_.data() + offset,
contents_.size() - offset);
last_flush_ = contents_.size();
return Status::OK();
}
virtual Status Sync() { return Status::OK(); }
virtual Status Append(const Slice& slice) {
contents_.append(slice.data(), slice.size());
return Status::OK();
}
void Drop(size_t bytes) {
contents_.resize(contents_.size() - bytes);
reader_contents_ = Slice(
reader_contents_.data(), reader_contents_.size() - bytes);
last_flush_ = contents_.size();
}
private:
Slice& reader_contents_;
size_t last_flush_;
};
class StringSource : public SequentialFile {
public:
Slice& contents_;
bool force_error_;
size_t force_error_position_;
bool force_eof_;
size_t force_eof_position_;
bool returned_partial_;
explicit StringSource(Slice& contents) :
contents_(contents),
force_error_(false),
force_error_position_(0),
force_eof_(false),
force_eof_position_(0),
returned_partial_(false) { }
virtual Status Read(size_t n, Slice* result, char* scratch) {
ASSERT_TRUE(!returned_partial_) << "must not Read() after eof/error";
if (force_error_) {
if (force_error_position_ >= n) {
force_error_position_ -= n;
} else {
*result = Slice(contents_.data(), force_error_position_);
contents_.remove_prefix(force_error_position_);
force_error_ = false;
returned_partial_ = true;
return Status::Corruption("read error");
}
}
if (contents_.size() < n) {
n = contents_.size();
returned_partial_ = true;
}
if (force_eof_) {
if (force_eof_position_ >= n) {
force_eof_position_ -= n;
} else {
force_eof_ = false;
n = force_eof_position_;
returned_partial_ = true;
}
}
// By using scratch we ensure that caller has control over the
// lifetime of result.data()
memcpy(scratch, contents_.data(), n);
*result = Slice(scratch, n);
contents_.remove_prefix(n);
return Status::OK();
}
virtual Status Skip(uint64_t n) {
if (n > contents_.size()) {
contents_.clear();
return Status::NotFound("in-memory file skipepd past end");
}
contents_.remove_prefix(n);
return Status::OK();
}
};
class ReportCollector : public Reader::Reporter {
public:
size_t dropped_bytes_;
std::string message_;
ReportCollector() : dropped_bytes_(0) { }
virtual void Corruption(size_t bytes, const Status& status) {
dropped_bytes_ += bytes;
message_.append(status.ToString());
}
};
std::string& dest_contents() {
auto dest = dynamic_cast<StringDest*>(writer_.file());
assert(dest);
return dest->contents_;
}
const std::string& dest_contents() const {
auto dest = dynamic_cast<const StringDest*>(writer_.file());
assert(dest);
return dest->contents_;
}
void reset_source_contents() {
auto src = dynamic_cast<StringSource*>(reader_.file());
assert(src);
src->contents_ = dest_contents();
}
Slice reader_contents_;
unique_ptr<StringDest> dest_holder_;
unique_ptr<StringSource> source_holder_;
ReportCollector report_;
Writer writer_;
Reader reader_;
// Record metadata for testing initial offset functionality
static size_t initial_offset_record_sizes_[];
static uint64_t initial_offset_last_record_offsets_[];
public:
LogTest() : reader_contents_(),
dest_holder_(new StringDest(reader_contents_)),
source_holder_(new StringSource(reader_contents_)),
writer_(std::move(dest_holder_)),
reader_(std::move(source_holder_), &report_, true/*checksum*/,
0/*initial_offset*/) {
}
void Write(const std::string& msg) {
writer_.AddRecord(Slice(msg));
}
size_t WrittenBytes() const {
return dest_contents().size();
}
std::string Read() {
std::string scratch;
Slice record;
if (reader_.ReadRecord(&record, &scratch)) {
return record.ToString();
} else {
return "EOF";
}
}
void IncrementByte(int offset, int delta) {
dest_contents()[offset] += delta;
}
void SetByte(int offset, char new_byte) {
dest_contents()[offset] = new_byte;
}
void ShrinkSize(int bytes) {
auto dest = dynamic_cast<StringDest*>(writer_.file());
assert(dest);
dest->Drop(bytes);
}
void FixChecksum(int header_offset, int len) {
// Compute crc of type/len/data
uint32_t crc = crc32c::Value(&dest_contents()[header_offset+6], 1 + len);
crc = crc32c::Mask(crc);
EncodeFixed32(&dest_contents()[header_offset], crc);
}
void ForceError(size_t position = 0) {
auto src = dynamic_cast<StringSource*>(reader_.file());
src->force_error_ = true;
src->force_error_position_ = position;
}
size_t DroppedBytes() const {
return report_.dropped_bytes_;
}
std::string ReportMessage() const {
return report_.message_;
}
void ForceEOF(size_t position = 0) {
auto src = dynamic_cast<StringSource*>(reader_.file());
src->force_eof_ = true;
src->force_eof_position_ = position;
}
void UnmarkEOF() {
auto src = dynamic_cast<StringSource*>(reader_.file());
src->returned_partial_ = false;
reader_.UnmarkEOF();
}
bool IsEOF() {
return reader_.IsEOF();
}
// Returns OK iff recorded error message contains "msg"
std::string MatchError(const std::string& msg) const {
if (report_.message_.find(msg) == std::string::npos) {
return report_.message_;
} else {
return "OK";
}
}
void WriteInitialOffsetLog() {
for (int i = 0; i < 4; i++) {
std::string record(initial_offset_record_sizes_[i],
static_cast<char>('a' + i));
Write(record);
}
}
void CheckOffsetPastEndReturnsNoRecords(uint64_t offset_past_end) {
WriteInitialOffsetLog();
unique_ptr<StringSource> source(new StringSource(reader_contents_));
unique_ptr<Reader> offset_reader(
new Reader(std::move(source), &report_, true/*checksum*/,
WrittenBytes() + offset_past_end));
Slice record;
std::string scratch;
ASSERT_TRUE(!offset_reader->ReadRecord(&record, &scratch));
}
void CheckInitialOffsetRecord(uint64_t initial_offset,
int expected_record_offset) {
WriteInitialOffsetLog();
unique_ptr<StringSource> source(new StringSource(reader_contents_));
unique_ptr<Reader> offset_reader(
new Reader(std::move(source), &report_, true/*checksum*/,
initial_offset));
Slice record;
std::string scratch;
ASSERT_TRUE(offset_reader->ReadRecord(&record, &scratch));
ASSERT_EQ(initial_offset_record_sizes_[expected_record_offset],
record.size());
ASSERT_EQ(initial_offset_last_record_offsets_[expected_record_offset],
offset_reader->LastRecordOffset());
ASSERT_EQ((char)('a' + expected_record_offset), record.data()[0]);
}
};
size_t LogTest::initial_offset_record_sizes_[] =
{10000, // Two sizable records in first block
10000,
2 * log::kBlockSize - 1000, // Span three blocks
1};
uint64_t LogTest::initial_offset_last_record_offsets_[] =
{0,
kHeaderSize + 10000,
2 * (kHeaderSize + 10000),
2 * (kHeaderSize + 10000) +
(2 * log::kBlockSize - 1000) + 3 * kHeaderSize};
TEST(LogTest, Empty) {
ASSERT_EQ("EOF", Read());
}
TEST(LogTest, ReadWrite) {
Write("foo");
Write("bar");
Write("");
Write("xxxx");
ASSERT_EQ("foo", Read());
ASSERT_EQ("bar", Read());
ASSERT_EQ("", Read());
ASSERT_EQ("xxxx", Read());
ASSERT_EQ("EOF", Read());
ASSERT_EQ("EOF", Read()); // Make sure reads at eof work
}
TEST(LogTest, ManyBlocks) {
for (int i = 0; i < 100000; i++) {
Write(NumberString(i));
}
for (int i = 0; i < 100000; i++) {
ASSERT_EQ(NumberString(i), Read());
}
ASSERT_EQ("EOF", Read());
}
TEST(LogTest, Fragmentation) {
Write("small");
Write(BigString("medium", 50000));
Write(BigString("large", 100000));
ASSERT_EQ("small", Read());
ASSERT_EQ(BigString("medium", 50000), Read());
ASSERT_EQ(BigString("large", 100000), Read());
ASSERT_EQ("EOF", Read());
}
TEST(LogTest, MarginalTrailer) {
// Make a trailer that is exactly the same length as an empty record.
const int n = kBlockSize - 2*kHeaderSize;
Write(BigString("foo", n));
ASSERT_EQ((unsigned int)(kBlockSize - kHeaderSize), WrittenBytes());
Write("");
Write("bar");
ASSERT_EQ(BigString("foo", n), Read());
ASSERT_EQ("", Read());
ASSERT_EQ("bar", Read());
ASSERT_EQ("EOF", Read());
}
TEST(LogTest, MarginalTrailer2) {
// Make a trailer that is exactly the same length as an empty record.
const int n = kBlockSize - 2*kHeaderSize;
Write(BigString("foo", n));
ASSERT_EQ((unsigned int)(kBlockSize - kHeaderSize), WrittenBytes());
Write("bar");
ASSERT_EQ(BigString("foo", n), Read());
ASSERT_EQ("bar", Read());
ASSERT_EQ("EOF", Read());
ASSERT_EQ(0U, DroppedBytes());
ASSERT_EQ("", ReportMessage());
}
TEST(LogTest, ShortTrailer) {
const int n = kBlockSize - 2*kHeaderSize + 4;
Write(BigString("foo", n));
ASSERT_EQ((unsigned int)(kBlockSize - kHeaderSize + 4), WrittenBytes());
Write("");
Write("bar");
ASSERT_EQ(BigString("foo", n), Read());
ASSERT_EQ("", Read());
ASSERT_EQ("bar", Read());
ASSERT_EQ("EOF", Read());
}
TEST(LogTest, AlignedEof) {
const int n = kBlockSize - 2*kHeaderSize + 4;
Write(BigString("foo", n));
ASSERT_EQ((unsigned int)(kBlockSize - kHeaderSize + 4), WrittenBytes());
ASSERT_EQ(BigString("foo", n), Read());
ASSERT_EQ("EOF", Read());
}
TEST(LogTest, RandomRead) {
const int N = 500;
Random write_rnd(301);
for (int i = 0; i < N; i++) {
Write(RandomSkewedString(i, &write_rnd));
}
Random read_rnd(301);
for (int i = 0; i < N; i++) {
ASSERT_EQ(RandomSkewedString(i, &read_rnd), Read());
}
ASSERT_EQ("EOF", Read());
}
// Tests of all the error paths in log_reader.cc follow:
TEST(LogTest, ReadError) {
Write("foo");
ForceError();
ASSERT_EQ("EOF", Read());
ASSERT_EQ((unsigned int)kBlockSize, DroppedBytes());
ASSERT_EQ("OK", MatchError("read error"));
}
TEST(LogTest, BadRecordType) {
Write("foo");
// Type is stored in header[6]
IncrementByte(6, 100);
FixChecksum(0, 3);
ASSERT_EQ("EOF", Read());
ASSERT_EQ(3U, DroppedBytes());
ASSERT_EQ("OK", MatchError("unknown record type"));
}
TEST(LogTest, TruncatedTrailingRecordIsIgnored) {
Write("foo");
ShrinkSize(4); // Drop all payload as well as a header byte
ASSERT_EQ("EOF", Read());
// Truncated last record is ignored, not treated as an error
ASSERT_EQ(0U, DroppedBytes());
ASSERT_EQ("", ReportMessage());
}
TEST(LogTest, BadLength) {
const int kPayloadSize = kBlockSize - kHeaderSize;
Write(BigString("bar", kPayloadSize));
Write("foo");
// Least significant size byte is stored in header[4].
IncrementByte(4, 1);
ASSERT_EQ("foo", Read());
ASSERT_EQ(kBlockSize, DroppedBytes());
ASSERT_EQ("OK", MatchError("bad record length"));
}
TEST(LogTest, BadLengthAtEndIsIgnored) {
Write("foo");
ShrinkSize(1);
ASSERT_EQ("EOF", Read());
ASSERT_EQ(0U, DroppedBytes());
ASSERT_EQ("", ReportMessage());
}
TEST(LogTest, ChecksumMismatch) {
Write("foo");
IncrementByte(0, 10);
ASSERT_EQ("EOF", Read());
ASSERT_EQ(10U, DroppedBytes());
ASSERT_EQ("OK", MatchError("checksum mismatch"));
}
TEST(LogTest, UnexpectedMiddleType) {
Write("foo");
SetByte(6, kMiddleType);
FixChecksum(0, 3);
ASSERT_EQ("EOF", Read());
ASSERT_EQ(3U, DroppedBytes());
ASSERT_EQ("OK", MatchError("missing start"));
}
TEST(LogTest, UnexpectedLastType) {
Write("foo");
SetByte(6, kLastType);
FixChecksum(0, 3);
ASSERT_EQ("EOF", Read());
ASSERT_EQ(3U, DroppedBytes());
ASSERT_EQ("OK", MatchError("missing start"));
}
TEST(LogTest, UnexpectedFullType) {
Write("foo");
Write("bar");
SetByte(6, kFirstType);
FixChecksum(0, 3);
ASSERT_EQ("bar", Read());
ASSERT_EQ("EOF", Read());
ASSERT_EQ(3U, DroppedBytes());
ASSERT_EQ("OK", MatchError("partial record without end"));
}
TEST(LogTest, UnexpectedFirstType) {
Write("foo");
Write(BigString("bar", 100000));
SetByte(6, kFirstType);
FixChecksum(0, 3);
ASSERT_EQ(BigString("bar", 100000), Read());
ASSERT_EQ("EOF", Read());
ASSERT_EQ(3U, DroppedBytes());
ASSERT_EQ("OK", MatchError("partial record without end"));
}
TEST(LogTest, MissingLastIsIgnored) {
Write(BigString("bar", kBlockSize));
// Remove the LAST block, including header.
ShrinkSize(14);
ASSERT_EQ("EOF", Read());
ASSERT_EQ("", ReportMessage());
ASSERT_EQ(0U, DroppedBytes());
}
TEST(LogTest, PartialLastIsIgnored) {
Write(BigString("bar", kBlockSize));
// Cause a bad record length in the LAST block.
ShrinkSize(1);
ASSERT_EQ("EOF", Read());
ASSERT_EQ("", ReportMessage());
ASSERT_EQ(0U, DroppedBytes());
}
TEST(LogTest, ErrorJoinsRecords) {
// Consider two fragmented records:
// first(R1) last(R1) first(R2) last(R2)
// where the middle two fragments disappear. We do not want
// first(R1),last(R2) to get joined and returned as a valid record.
// Write records that span two blocks
Write(BigString("foo", kBlockSize));
Write(BigString("bar", kBlockSize));
Write("correct");
// Wipe the middle block
for (unsigned int offset = kBlockSize; offset < 2*kBlockSize; offset++) {
SetByte(offset, 'x');
}
ASSERT_EQ("correct", Read());
ASSERT_EQ("EOF", Read());
const unsigned int dropped = DroppedBytes();
ASSERT_LE(dropped, 2*kBlockSize + 100);
ASSERT_GE(dropped, 2*kBlockSize);
}
TEST(LogTest, ReadStart) {
CheckInitialOffsetRecord(0, 0);
}
TEST(LogTest, ReadSecondOneOff) {
CheckInitialOffsetRecord(1, 1);
}
TEST(LogTest, ReadSecondTenThousand) {
CheckInitialOffsetRecord(10000, 1);
}
TEST(LogTest, ReadSecondStart) {
CheckInitialOffsetRecord(10007, 1);
}
TEST(LogTest, ReadThirdOneOff) {
CheckInitialOffsetRecord(10008, 2);
}
TEST(LogTest, ReadThirdStart) {
CheckInitialOffsetRecord(20014, 2);
}
TEST(LogTest, ReadFourthOneOff) {
CheckInitialOffsetRecord(20015, 3);
}
TEST(LogTest, ReadFourthFirstBlockTrailer) {
CheckInitialOffsetRecord(log::kBlockSize - 4, 3);
}
TEST(LogTest, ReadFourthMiddleBlock) {
CheckInitialOffsetRecord(log::kBlockSize + 1, 3);
}
TEST(LogTest, ReadFourthLastBlock) {
CheckInitialOffsetRecord(2 * log::kBlockSize + 1, 3);
}
TEST(LogTest, ReadFourthStart) {
CheckInitialOffsetRecord(
2 * (kHeaderSize + 1000) + (2 * log::kBlockSize - 1000) + 3 * kHeaderSize,
3);
}
TEST(LogTest, ReadEnd) {
CheckOffsetPastEndReturnsNoRecords(0);
}
TEST(LogTest, ReadPastEnd) {
CheckOffsetPastEndReturnsNoRecords(5);
}
TEST(LogTest, ClearEofSingleBlock) {
Write("foo");
Write("bar");
ForceEOF(3 + kHeaderSize + 2);
ASSERT_EQ("foo", Read());
UnmarkEOF();
ASSERT_EQ("bar", Read());
ASSERT_TRUE(IsEOF());
ASSERT_EQ("EOF", Read());
Write("xxx");
UnmarkEOF();
ASSERT_EQ("xxx", Read());
ASSERT_TRUE(IsEOF());
}
TEST(LogTest, ClearEofMultiBlock) {
size_t num_full_blocks = 5;
size_t n = (kBlockSize - kHeaderSize) * num_full_blocks + 25;
Write(BigString("foo", n));
Write(BigString("bar", n));
ForceEOF(n + num_full_blocks * kHeaderSize + 10);
ASSERT_EQ(BigString("foo", n), Read());
ASSERT_TRUE(IsEOF());
UnmarkEOF();
ASSERT_EQ(BigString("bar", n), Read());
ASSERT_TRUE(IsEOF());
Write(BigString("xxx", n));
UnmarkEOF();
ASSERT_EQ(BigString("xxx", n), Read());
ASSERT_TRUE(IsEOF());
}
TEST(LogTest, ClearEofError) {
// If an error occurs during Read() in UnmarkEOF(), the records contained
// in the buffer should be returned on subsequent calls of ReadRecord()
// until no more full records are left, whereafter ReadRecord() should return
// false to indicate that it cannot read any further.
Write("foo");
Write("bar");
UnmarkEOF();
ASSERT_EQ("foo", Read());
ASSERT_TRUE(IsEOF());
Write("xxx");
ForceError(0);
UnmarkEOF();
ASSERT_EQ("bar", Read());
ASSERT_EQ("EOF", Read());
}
TEST(LogTest, ClearEofError2) {
Write("foo");
Write("bar");
UnmarkEOF();
ASSERT_EQ("foo", Read());
Write("xxx");
ForceError(3);
UnmarkEOF();
ASSERT_EQ("bar", Read());
ASSERT_EQ("EOF", Read());
ASSERT_EQ(3U, DroppedBytes());
ASSERT_EQ("OK", MatchError("read error"));
}
} // namespace log
} // namespace rocksdb
int main(int argc, char** argv) {
return rocksdb::test::RunAllTests();
}

108
db/log_writer.cc Normal file
View File

@@ -0,0 +1,108 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/log_writer.h"
#include <stdint.h>
#include "rocksdb/env.h"
#include "util/coding.h"
#include "util/crc32c.h"
namespace rocksdb {
namespace log {
Writer::Writer(unique_ptr<WritableFile>&& dest)
: dest_(std::move(dest)),
block_offset_(0) {
for (int i = 0; i <= kMaxRecordType; i++) {
char t = static_cast<char>(i);
type_crc_[i] = crc32c::Value(&t, 1);
}
}
Writer::~Writer() {
}
Status Writer::AddRecord(const Slice& slice) {
const char* ptr = slice.data();
size_t left = slice.size();
// Fragment the record if necessary and emit it. Note that if slice
// is empty, we still want to iterate once to emit a single
// zero-length record
Status s;
bool begin = true;
do {
const int leftover = kBlockSize - block_offset_;
assert(leftover >= 0);
if (leftover < kHeaderSize) {
// Switch to a new block
if (leftover > 0) {
// Fill the trailer (literal below relies on kHeaderSize being 7)
assert(kHeaderSize == 7);
dest_->Append(Slice("\x00\x00\x00\x00\x00\x00", leftover));
}
block_offset_ = 0;
}
// Invariant: we never leave < kHeaderSize bytes in a block.
assert(kBlockSize - block_offset_ - kHeaderSize >= 0);
const size_t avail = kBlockSize - block_offset_ - kHeaderSize;
const size_t fragment_length = (left < avail) ? left : avail;
RecordType type;
const bool end = (left == fragment_length);
if (begin && end) {
type = kFullType;
} else if (begin) {
type = kFirstType;
} else if (end) {
type = kLastType;
} else {
type = kMiddleType;
}
s = EmitPhysicalRecord(type, ptr, fragment_length);
ptr += fragment_length;
left -= fragment_length;
begin = false;
} while (s.ok() && left > 0);
return s;
}
Status Writer::EmitPhysicalRecord(RecordType t, const char* ptr, size_t n) {
assert(n <= 0xffff); // Must fit in two bytes
assert(block_offset_ + kHeaderSize + n <= kBlockSize);
// Format the header
char buf[kHeaderSize];
buf[4] = static_cast<char>(n & 0xff);
buf[5] = static_cast<char>(n >> 8);
buf[6] = static_cast<char>(t);
// Compute the crc of the record type and the payload.
uint32_t crc = crc32c::Extend(type_crc_[t], ptr, n);
crc = crc32c::Mask(crc); // Adjust for storage
EncodeFixed32(buf, crc);
// Write the header and the payload
Status s = dest_->Append(Slice(buf, kHeaderSize));
if (s.ok()) {
s = dest_->Append(Slice(ptr, n));
if (s.ok()) {
s = dest_->Flush();
}
}
block_offset_ += kHeaderSize + n;
return s;
}
} // namespace log
} // namespace rocksdb

55
db/log_writer.h Normal file
View File

@@ -0,0 +1,55 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#pragma once
#include <memory>
#include <stdint.h>
#include "db/log_format.h"
#include "rocksdb/slice.h"
#include "rocksdb/status.h"
namespace rocksdb {
class WritableFile;
using std::unique_ptr;
namespace log {
class Writer {
public:
// Create a writer that will append data to "*dest".
// "*dest" must be initially empty.
// "*dest" must remain live while this Writer is in use.
explicit Writer(unique_ptr<WritableFile>&& dest);
~Writer();
Status AddRecord(const Slice& slice);
WritableFile* file() { return dest_.get(); }
const WritableFile* file() const { return dest_.get(); }
private:
unique_ptr<WritableFile> dest_;
int block_offset_; // Current offset in block
// crc32c values for all supported record types. These are
// pre-computed to reduce the overhead of computing the crc of the
// record type stored in the header.
uint32_t type_crc_[kMaxRecordType + 1];
Status EmitPhysicalRecord(RecordType type, const char* ptr, size_t length);
// No copying allowed
Writer(const Writer&);
void operator=(const Writer&);
};
} // namespace log
} // namespace rocksdb

620
db/memtable.cc Normal file
View File

@@ -0,0 +1,620 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/memtable.h"
#include <memory>
#include <algorithm>
#include <limits>
#include "db/dbformat.h"
#include "db/merge_context.h"
#include "rocksdb/comparator.h"
#include "rocksdb/env.h"
#include "rocksdb/iterator.h"
#include "rocksdb/merge_operator.h"
#include "rocksdb/slice_transform.h"
#include "table/merger.h"
#include "util/arena.h"
#include "util/coding.h"
#include "util/murmurhash.h"
#include "util/mutexlock.h"
#include "util/perf_context_imp.h"
#include "util/statistics.h"
#include "util/stop_watch.h"
namespace rocksdb {
MemTable::MemTable(const InternalKeyComparator& cmp, const Options& options)
: comparator_(cmp),
refs_(0),
kArenaBlockSize(OptimizeBlockSize(options.arena_block_size)),
kWriteBufferSize(options.write_buffer_size),
arena_(options.arena_block_size),
table_(options.memtable_factory->CreateMemTableRep(
comparator_, &arena_, options.prefix_extractor.get(),
options.info_log.get())),
num_entries_(0),
flush_in_progress_(false),
flush_completed_(false),
file_number_(0),
first_seqno_(0),
mem_next_logfile_number_(0),
locks_(options.inplace_update_support ? options.inplace_update_num_locks
: 0),
prefix_extractor_(options.prefix_extractor.get()),
should_flush_(ShouldFlushNow()) {
// if should_flush_ == true without an entry inserted, something must have
// gone wrong already.
assert(!should_flush_);
if (prefix_extractor_ && options.memtable_prefix_bloom_bits > 0) {
prefix_bloom_.reset(new DynamicBloom(
options.memtable_prefix_bloom_bits, options.bloom_locality,
options.memtable_prefix_bloom_probes, nullptr,
options.memtable_prefix_bloom_huge_page_tlb_size,
options.info_log.get()));
}
}
MemTable::~MemTable() {
assert(refs_ == 0);
}
size_t MemTable::ApproximateMemoryUsage() {
size_t arena_usage = arena_.ApproximateMemoryUsage();
size_t table_usage = table_->ApproximateMemoryUsage();
// let MAX_USAGE = std::numeric_limits<size_t>::max()
// then if arena_usage + total_usage >= MAX_USAGE, return MAX_USAGE.
// the following variation is to avoid numeric overflow.
if (arena_usage >= std::numeric_limits<size_t>::max() - table_usage) {
return std::numeric_limits<size_t>::max();
}
// otherwise, return the actual usage
return arena_usage + table_usage;
}
bool MemTable::ShouldFlushNow() const {
// In a lot of times, we cannot allocate arena blocks that exactly matches the
// buffer size. Thus we have to decide if we should over-allocate or
// under-allocate.
// This constant avariable can be interpreted as: if we still have more than
// "kAllowOverAllocationRatio * kArenaBlockSize" space left, we'd try to over
// allocate one more block.
const double kAllowOverAllocationRatio = 0.6;
// If arena still have room for new block allocation, we can safely say it
// shouldn't flush.
auto allocated_memory =
table_->ApproximateMemoryUsage() + arena_.MemoryAllocatedBytes();
// if we can still allocate one more block without exceeding the
// over-allocation ratio, then we should not flush.
if (allocated_memory + kArenaBlockSize <
kWriteBufferSize + kArenaBlockSize * kAllowOverAllocationRatio) {
return false;
}
// if user keeps adding entries that exceeds kWriteBufferSize, we need to
// flush earlier even though we still have much available memory left.
if (allocated_memory >
kWriteBufferSize + kArenaBlockSize * kAllowOverAllocationRatio) {
return true;
}
// In this code path, Arena has already allocated its "last block", which
// means the total allocatedmemory size is either:
// (1) "moderately" over allocated the memory (no more than `0.6 * arena
// block size`. Or,
// (2) the allocated memory is less than write buffer size, but we'll stop
// here since if we allocate a new arena block, we'll over allocate too much
// more (half of the arena block size) memory.
//
// In either case, to avoid over-allocate, the last block will stop allocation
// when its usage reaches a certain ratio, which we carefully choose "0.75
// full" as the stop condition because it addresses the following issue with
// great simplicity: What if the next inserted entry's size is
// bigger than AllocatedAndUnused()?
//
// The answer is: if the entry size is also bigger than 0.25 *
// kArenaBlockSize, a dedicated block will be allocated for it; otherwise
// arena will anyway skip the AllocatedAndUnused() and allocate a new, empty
// and regular block. In either case, we *overly* over-allocated.
//
// Therefore, setting the last block to be at most "0.75 full" avoids both
// cases.
//
// NOTE: the average percentage of waste space of this approach can be counted
// as: "arena block size * 0.25 / write buffer size". User who specify a small
// write buffer size and/or big arena block size may suffer.
return arena_.AllocatedAndUnused() < kArenaBlockSize / 4;
}
int MemTable::KeyComparator::operator()(const char* prefix_len_key1,
const char* prefix_len_key2) const {
// Internal keys are encoded as length-prefixed strings.
Slice k1 = GetLengthPrefixedSlice(prefix_len_key1);
Slice k2 = GetLengthPrefixedSlice(prefix_len_key2);
return comparator.Compare(k1, k2);
}
int MemTable::KeyComparator::operator()(const char* prefix_len_key,
const Slice& key)
const {
// Internal keys are encoded as length-prefixed strings.
Slice a = GetLengthPrefixedSlice(prefix_len_key);
return comparator.Compare(a, key);
}
Slice MemTableRep::UserKey(const char* key) const {
Slice slice = GetLengthPrefixedSlice(key);
return Slice(slice.data(), slice.size() - 8);
}
KeyHandle MemTableRep::Allocate(const size_t len, char** buf) {
*buf = arena_->Allocate(len);
return static_cast<KeyHandle>(*buf);
}
// Encode a suitable internal key target for "target" and return it.
// Uses *scratch as scratch space, and the returned pointer will point
// into this scratch space.
const char* EncodeKey(std::string* scratch, const Slice& target) {
scratch->clear();
PutVarint32(scratch, target.size());
scratch->append(target.data(), target.size());
return scratch->data();
}
class MemTableIterator: public Iterator {
public:
MemTableIterator(const MemTable& mem, const ReadOptions& options,
bool enforce_total_order, Arena* arena)
: bloom_(nullptr),
prefix_extractor_(mem.prefix_extractor_),
valid_(false),
arena_mode_(arena != nullptr) {
if (prefix_extractor_ != nullptr && !enforce_total_order) {
bloom_ = mem.prefix_bloom_.get();
iter_ = mem.table_->GetDynamicPrefixIterator(arena);
} else {
iter_ = mem.table_->GetIterator(arena);
}
}
~MemTableIterator() {
if (arena_mode_) {
iter_->~Iterator();
} else {
delete iter_;
}
}
virtual bool Valid() const { return valid_; }
virtual void Seek(const Slice& k) {
if (bloom_ != nullptr &&
!bloom_->MayContain(prefix_extractor_->Transform(ExtractUserKey(k)))) {
valid_ = false;
return;
}
iter_->Seek(k, nullptr);
valid_ = iter_->Valid();
}
virtual void SeekToFirst() {
iter_->SeekToFirst();
valid_ = iter_->Valid();
}
virtual void SeekToLast() {
iter_->SeekToLast();
valid_ = iter_->Valid();
}
virtual void Next() {
assert(Valid());
iter_->Next();
valid_ = iter_->Valid();
}
virtual void Prev() {
assert(Valid());
iter_->Prev();
valid_ = iter_->Valid();
}
virtual Slice key() const {
assert(Valid());
return GetLengthPrefixedSlice(iter_->key());
}
virtual Slice value() const {
assert(Valid());
Slice key_slice = GetLengthPrefixedSlice(iter_->key());
return GetLengthPrefixedSlice(key_slice.data() + key_slice.size());
}
virtual Status status() const { return Status::OK(); }
private:
DynamicBloom* bloom_;
const SliceTransform* const prefix_extractor_;
MemTableRep::Iterator* iter_;
bool valid_;
bool arena_mode_;
// No copying allowed
MemTableIterator(const MemTableIterator&);
void operator=(const MemTableIterator&);
};
Iterator* MemTable::NewIterator(const ReadOptions& options,
bool enforce_total_order, Arena* arena) {
if (arena == nullptr) {
return new MemTableIterator(*this, options, enforce_total_order, nullptr);
} else {
auto mem = arena->AllocateAligned(sizeof(MemTableIterator));
return new (mem)
MemTableIterator(*this, options, enforce_total_order, arena);
}
}
port::RWMutex* MemTable::GetLock(const Slice& key) {
static murmur_hash hash;
return &locks_[hash(key) % locks_.size()];
}
void MemTable::Add(SequenceNumber s, ValueType type,
const Slice& key, /* user key */
const Slice& value) {
// Format of an entry is concatenation of:
// key_size : varint32 of internal_key.size()
// key bytes : char[internal_key.size()]
// value_size : varint32 of value.size()
// value bytes : char[value.size()]
size_t key_size = key.size();
size_t val_size = value.size();
size_t internal_key_size = key_size + 8;
const size_t encoded_len =
VarintLength(internal_key_size) + internal_key_size +
VarintLength(val_size) + val_size;
char* buf = nullptr;
KeyHandle handle = table_->Allocate(encoded_len, &buf);
assert(buf != nullptr);
char* p = EncodeVarint32(buf, internal_key_size);
memcpy(p, key.data(), key_size);
p += key_size;
EncodeFixed64(p, (s << 8) | type);
p += 8;
p = EncodeVarint32(p, val_size);
memcpy(p, value.data(), val_size);
assert((unsigned)(p + val_size - buf) == (unsigned)encoded_len);
table_->Insert(handle);
num_entries_++;
if (prefix_bloom_) {
assert(prefix_extractor_);
prefix_bloom_->Add(prefix_extractor_->Transform(key));
}
// The first sequence number inserted into the memtable
assert(first_seqno_ == 0 || s > first_seqno_);
if (first_seqno_ == 0) {
first_seqno_ = s;
}
should_flush_ = ShouldFlushNow();
}
// Callback from MemTable::Get()
namespace {
struct Saver {
Status* status;
const LookupKey* key;
bool* found_final_value; // Is value set correctly? Used by KeyMayExist
bool* merge_in_progress;
std::string* value;
const MergeOperator* merge_operator;
// the merge operations encountered;
MergeContext* merge_context;
MemTable* mem;
Logger* logger;
Statistics* statistics;
bool inplace_update_support;
};
} // namespace
static bool SaveValue(void* arg, const char* entry) {
Saver* s = reinterpret_cast<Saver*>(arg);
MergeContext* merge_context = s->merge_context;
const MergeOperator* merge_operator = s->merge_operator;
assert(s != nullptr && merge_context != nullptr);
// entry format is:
// klength varint32
// userkey char[klength-8]
// tag uint64
// vlength varint32
// value char[vlength]
// Check that it belongs to same user key. We do not check the
// sequence number since the Seek() call above should have skipped
// all entries with overly large sequence numbers.
uint32_t key_length;
const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
if (s->mem->GetInternalKeyComparator().user_comparator()->Compare(
Slice(key_ptr, key_length - 8), s->key->user_key()) == 0) {
// Correct user key
const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8);
switch (static_cast<ValueType>(tag & 0xff)) {
case kTypeValue: {
if (s->inplace_update_support) {
s->mem->GetLock(s->key->user_key())->ReadLock();
}
Slice v = GetLengthPrefixedSlice(key_ptr + key_length);
*(s->status) = Status::OK();
if (*(s->merge_in_progress)) {
assert(merge_operator);
if (!merge_operator->FullMerge(s->key->user_key(), &v,
merge_context->GetOperands(), s->value,
s->logger)) {
RecordTick(s->statistics, NUMBER_MERGE_FAILURES);
*(s->status) =
Status::Corruption("Error: Could not perform merge.");
}
} else {
s->value->assign(v.data(), v.size());
}
if (s->inplace_update_support) {
s->mem->GetLock(s->key->user_key())->Unlock();
}
*(s->found_final_value) = true;
return false;
}
case kTypeDeletion: {
if (*(s->merge_in_progress)) {
assert(merge_operator);
*(s->status) = Status::OK();
if (!merge_operator->FullMerge(s->key->user_key(), nullptr,
merge_context->GetOperands(), s->value,
s->logger)) {
RecordTick(s->statistics, NUMBER_MERGE_FAILURES);
*(s->status) =
Status::Corruption("Error: Could not perform merge.");
}
} else {
*(s->status) = Status::NotFound();
}
*(s->found_final_value) = true;
return false;
}
case kTypeMerge: {
std::string merge_result; // temporary area for merge results later
Slice v = GetLengthPrefixedSlice(key_ptr + key_length);
*(s->merge_in_progress) = true;
merge_context->PushOperand(v);
return true;
}
default:
assert(false);
return true;
}
}
// s->state could be Corrupt, merge or notfound
return false;
}
bool MemTable::Get(const LookupKey& key, std::string* value, Status* s,
MergeContext& merge_context, const Options& options) {
PERF_TIMER_AUTO(get_from_memtable_time);
Slice user_key = key.user_key();
bool found_final_value = false;
bool merge_in_progress = s->IsMergeInProgress();
if (prefix_bloom_ &&
!prefix_bloom_->MayContain(prefix_extractor_->Transform(user_key))) {
// iter is null if prefix bloom says the key does not exist
} else {
Saver saver;
saver.status = s;
saver.found_final_value = &found_final_value;
saver.merge_in_progress = &merge_in_progress;
saver.key = &key;
saver.value = value;
saver.status = s;
saver.mem = this;
saver.merge_context = &merge_context;
saver.merge_operator = options.merge_operator.get();
saver.logger = options.info_log.get();
saver.inplace_update_support = options.inplace_update_support;
saver.statistics = options.statistics.get();
table_->Get(key, &saver, SaveValue);
}
// No change to value, since we have not yet found a Put/Delete
if (!found_final_value && merge_in_progress) {
*s = Status::MergeInProgress("");
}
PERF_TIMER_STOP(get_from_memtable_time);
PERF_COUNTER_ADD(get_from_memtable_count, 1);
return found_final_value;
}
void MemTable::Update(SequenceNumber seq,
const Slice& key,
const Slice& value) {
LookupKey lkey(key, seq);
Slice mem_key = lkey.memtable_key();
std::unique_ptr<MemTableRep::Iterator> iter(
table_->GetIterator(lkey.user_key()));
iter->Seek(lkey.internal_key(), mem_key.data());
if (iter->Valid()) {
// entry format is:
// key_length varint32
// userkey char[klength-8]
// tag uint64
// vlength varint32
// value char[vlength]
// Check that it belongs to same user key. We do not check the
// sequence number since the Seek() call above should have skipped
// all entries with overly large sequence numbers.
const char* entry = iter->key();
uint32_t key_length = 0;
const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
if (comparator_.comparator.user_comparator()->Compare(
Slice(key_ptr, key_length - 8), lkey.user_key()) == 0) {
// Correct user key
const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8);
switch (static_cast<ValueType>(tag & 0xff)) {
case kTypeValue: {
Slice prev_value = GetLengthPrefixedSlice(key_ptr + key_length);
uint32_t prev_size = prev_value.size();
uint32_t new_size = value.size();
// Update value, if new value size <= previous value size
if (new_size <= prev_size ) {
char* p = EncodeVarint32(const_cast<char*>(key_ptr) + key_length,
new_size);
WriteLock wl(GetLock(lkey.user_key()));
memcpy(p, value.data(), value.size());
assert((unsigned)((p + value.size()) - entry) ==
(unsigned)(VarintLength(key_length) + key_length +
VarintLength(value.size()) + value.size()));
return;
}
}
default:
// If the latest value is kTypeDeletion, kTypeMerge or kTypeLogData
// we don't have enough space for update inplace
Add(seq, kTypeValue, key, value);
return;
}
}
}
// key doesn't exist
Add(seq, kTypeValue, key, value);
}
bool MemTable::UpdateCallback(SequenceNumber seq,
const Slice& key,
const Slice& delta,
const Options& options) {
LookupKey lkey(key, seq);
Slice memkey = lkey.memtable_key();
std::unique_ptr<MemTableRep::Iterator> iter(
table_->GetIterator(lkey.user_key()));
iter->Seek(lkey.internal_key(), memkey.data());
if (iter->Valid()) {
// entry format is:
// key_length varint32
// userkey char[klength-8]
// tag uint64
// vlength varint32
// value char[vlength]
// Check that it belongs to same user key. We do not check the
// sequence number since the Seek() call above should have skipped
// all entries with overly large sequence numbers.
const char* entry = iter->key();
uint32_t key_length = 0;
const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
if (comparator_.comparator.user_comparator()->Compare(
Slice(key_ptr, key_length - 8), lkey.user_key()) == 0) {
// Correct user key
const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8);
switch (static_cast<ValueType>(tag & 0xff)) {
case kTypeValue: {
Slice prev_value = GetLengthPrefixedSlice(key_ptr + key_length);
uint32_t prev_size = prev_value.size();
char* prev_buffer = const_cast<char*>(prev_value.data());
uint32_t new_prev_size = prev_size;
std::string str_value;
WriteLock wl(GetLock(lkey.user_key()));
auto status = options.inplace_callback(prev_buffer, &new_prev_size,
delta, &str_value);
if (status == UpdateStatus::UPDATED_INPLACE) {
// Value already updated by callback.
assert(new_prev_size <= prev_size);
if (new_prev_size < prev_size) {
// overwrite the new prev_size
char* p = EncodeVarint32(const_cast<char*>(key_ptr) + key_length,
new_prev_size);
if (VarintLength(new_prev_size) < VarintLength(prev_size)) {
// shift the value buffer as well.
memcpy(p, prev_buffer, new_prev_size);
}
}
RecordTick(options.statistics.get(), NUMBER_KEYS_UPDATED);
should_flush_ = ShouldFlushNow();
return true;
} else if (status == UpdateStatus::UPDATED) {
Add(seq, kTypeValue, key, Slice(str_value));
RecordTick(options.statistics.get(), NUMBER_KEYS_WRITTEN);
should_flush_ = ShouldFlushNow();
return true;
} else if (status == UpdateStatus::UPDATE_FAILED) {
// No action required. Return.
should_flush_ = ShouldFlushNow();
return true;
}
}
default:
break;
}
}
}
// If the latest value is not kTypeValue
// or key doesn't exist
return false;
}
size_t MemTable::CountSuccessiveMergeEntries(const LookupKey& key) {
Slice memkey = key.memtable_key();
// A total ordered iterator is costly for some memtablerep (prefix aware
// reps). By passing in the user key, we allow efficient iterator creation.
// The iterator only needs to be ordered within the same user key.
std::unique_ptr<MemTableRep::Iterator> iter(
table_->GetIterator(key.user_key()));
iter->Seek(key.internal_key(), memkey.data());
size_t num_successive_merges = 0;
for (; iter->Valid(); iter->Next()) {
const char* entry = iter->key();
uint32_t key_length = 0;
const char* iter_key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
if (comparator_.comparator.user_comparator()->Compare(
Slice(iter_key_ptr, key_length - 8), key.user_key()) != 0) {
break;
}
const uint64_t tag = DecodeFixed64(iter_key_ptr + key_length - 8);
if (static_cast<ValueType>(tag & 0xff) != kTypeMerge) {
break;
}
++num_successive_merges;
}
return num_successive_merges;
}
void MemTableRep::Get(const LookupKey& k, void* callback_args,
bool (*callback_func)(void* arg, const char* entry)) {
auto iter = GetIterator(k.user_key());
for (iter->Seek(k.internal_key(), k.memtable_key().data());
iter->Valid() && callback_func(callback_args, iter->key());
iter->Next()) {
}
}
} // namespace rocksdb

222
db/memtable.h Normal file
View File

@@ -0,0 +1,222 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#pragma once
#include <string>
#include <memory>
#include <deque>
#include "db/dbformat.h"
#include "db/skiplist.h"
#include "db/version_edit.h"
#include "rocksdb/db.h"
#include "rocksdb/memtablerep.h"
#include "util/arena.h"
#include "util/dynamic_bloom.h"
namespace rocksdb {
class Arena;
class Mutex;
class MemTableIterator;
class MergeContext;
class MemTable {
public:
struct KeyComparator : public MemTableRep::KeyComparator {
const InternalKeyComparator comparator;
explicit KeyComparator(const InternalKeyComparator& c) : comparator(c) { }
virtual int operator()(const char* prefix_len_key1,
const char* prefix_len_key2) const;
virtual int operator()(const char* prefix_len_key,
const Slice& key) const override;
};
// MemTables are reference counted. The initial reference count
// is zero and the caller must call Ref() at least once.
explicit MemTable(const InternalKeyComparator& comparator,
const Options& options);
~MemTable();
// Increase reference count.
void Ref() { ++refs_; }
// Drop reference count.
// If the refcount goes to zero return this memtable, otherwise return null
MemTable* Unref() {
--refs_;
assert(refs_ >= 0);
if (refs_ <= 0) {
return this;
}
return nullptr;
}
// Returns an estimate of the number of bytes of data in use by this
// data structure.
//
// REQUIRES: external synchronization to prevent simultaneous
// operations on the same MemTable.
size_t ApproximateMemoryUsage();
// This method heuristically determines if the memtable should continue to
// host more data.
bool ShouldFlush() const { return should_flush_; }
// Return an iterator that yields the contents of the memtable.
//
// The caller must ensure that the underlying MemTable remains live
// while the returned iterator is live. The keys returned by this
// iterator are internal keys encoded by AppendInternalKey in the
// db/dbformat.{h,cc} module.
//
// By default, it returns an iterator for prefix seek if prefix_extractor
// is configured in Options.
// arena: If not null, the arena needs to be used to allocate the Iterator.
// Calling ~Iterator of the iterator will destroy all the states but
// those allocated in arena.
Iterator* NewIterator(const ReadOptions& options,
bool enforce_total_order = false,
Arena* arena = nullptr);
// Add an entry into memtable that maps key to value at the
// specified sequence number and with the specified type.
// Typically value will be empty if type==kTypeDeletion.
void Add(SequenceNumber seq, ValueType type,
const Slice& key,
const Slice& value);
// If memtable contains a value for key, store it in *value and return true.
// If memtable contains a deletion for key, store a NotFound() error
// in *status and return true.
// If memtable contains Merge operation as the most recent entry for a key,
// and the merge process does not stop (not reaching a value or delete),
// prepend the current merge operand to *operands.
// store MergeInProgress in s, and return false.
// Else, return false.
bool Get(const LookupKey& key, std::string* value, Status* s,
MergeContext& merge_context, const Options& options);
// Attempts to update the new_value inplace, else does normal Add
// Pseudocode
// if key exists in current memtable && prev_value is of type kTypeValue
// if new sizeof(new_value) <= sizeof(prev_value)
// update inplace
// else add(key, new_value)
// else add(key, new_value)
void Update(SequenceNumber seq,
const Slice& key,
const Slice& value);
// If prev_value for key exits, attempts to update it inplace.
// else returns false
// Pseudocode
// if key exists in current memtable && prev_value is of type kTypeValue
// new_value = delta(prev_value)
// if sizeof(new_value) <= sizeof(prev_value)
// update inplace
// else add(key, new_value)
// else return false
bool UpdateCallback(SequenceNumber seq,
const Slice& key,
const Slice& delta,
const Options& options);
// Returns the number of successive merge entries starting from the newest
// entry for the key up to the last non-merge entry or last entry for the
// key in the memtable.
size_t CountSuccessiveMergeEntries(const LookupKey& key);
// Get total number of entries in the mem table.
uint64_t GetNumEntries() const { return num_entries_; }
// Returns the edits area that is needed for flushing the memtable
VersionEdit* GetEdits() { return &edit_; }
// Returns the sequence number of the first element that was inserted
// into the memtable
SequenceNumber GetFirstSequenceNumber() { return first_seqno_; }
// Returns the next active logfile number when this memtable is about to
// be flushed to storage
uint64_t GetNextLogNumber() { return mem_next_logfile_number_; }
// Sets the next active logfile number when this memtable is about to
// be flushed to storage
void SetNextLogNumber(uint64_t num) { mem_next_logfile_number_ = num; }
// Notify the underlying storage that no more items will be added
void MarkImmutable() { table_->MarkReadOnly(); }
// return true if the current MemTableRep supports merge operator.
bool IsMergeOperatorSupported() const {
return table_->IsMergeOperatorSupported();
}
// return true if the current MemTableRep supports snapshots.
bool IsSnapshotSupported() const { return table_->IsSnapshotSupported(); }
// Get the lock associated for the key
port::RWMutex* GetLock(const Slice& key);
const InternalKeyComparator& GetInternalKeyComparator() const {
return comparator_.comparator;
}
const Arena& TEST_GetArena() const { return arena_; }
private:
// Dynamically check if we can add more incoming entries.
bool ShouldFlushNow() const;
friend class MemTableIterator;
friend class MemTableBackwardIterator;
friend class MemTableList;
KeyComparator comparator_;
int refs_;
const size_t kArenaBlockSize;
const size_t kWriteBufferSize;
Arena arena_;
unique_ptr<MemTableRep> table_;
uint64_t num_entries_;
// These are used to manage memtable flushes to storage
bool flush_in_progress_; // started the flush
bool flush_completed_; // finished the flush
uint64_t file_number_; // filled up after flush is complete
// The updates to be applied to the transaction log when this
// memtable is flushed to storage.
VersionEdit edit_;
// The sequence number of the kv that was inserted first
SequenceNumber first_seqno_;
// The log files earlier than this number can be deleted.
uint64_t mem_next_logfile_number_;
// rw locks for inplace updates
std::vector<port::RWMutex> locks_;
// No copying allowed
MemTable(const MemTable&);
void operator=(const MemTable&);
const SliceTransform* const prefix_extractor_;
std::unique_ptr<DynamicBloom> prefix_bloom_;
// a flag indicating if a memtable has met the criteria to flush
bool should_flush_;
};
extern const char* EncodeKey(std::string* scratch, const Slice& target);
} // namespace rocksdb

286
db/memtable_list.cc Normal file
View File

@@ -0,0 +1,286 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
#include "db/memtable_list.h"
#include <string>
#include "rocksdb/db.h"
#include "db/memtable.h"
#include "db/version_set.h"
#include "rocksdb/env.h"
#include "rocksdb/iterator.h"
#include "table/merger.h"
#include "util/coding.h"
#include "util/log_buffer.h"
namespace rocksdb {
class InternalKeyComparator;
class Mutex;
class VersionSet;
MemTableListVersion::MemTableListVersion(MemTableListVersion* old) {
if (old != nullptr) {
memlist_ = old->memlist_;
size_ = old->size_;
for (auto& m : memlist_) {
m->Ref();
}
}
}
void MemTableListVersion::Ref() { ++refs_; }
void MemTableListVersion::Unref(autovector<MemTable*>* to_delete) {
assert(refs_ >= 1);
--refs_;
if (refs_ == 0) {
// if to_delete is equal to nullptr it means we're confident
// that refs_ will not be zero
assert(to_delete != nullptr);
for (const auto& m : memlist_) {
MemTable* x = m->Unref();
if (x != nullptr) {
to_delete->push_back(x);
}
}
delete this;
}
}
int MemTableListVersion::size() const { return size_; }
// Returns the total number of memtables in the list
int MemTableList::size() const {
assert(num_flush_not_started_ <= current_->size_);
return current_->size_;
}
// Search all the memtables starting from the most recent one.
// Return the most recent value found, if any.
// Operands stores the list of merge operations to apply, so far.
bool MemTableListVersion::Get(const LookupKey& key, std::string* value,
Status* s, MergeContext& merge_context,
const Options& options) {
for (auto& memtable : memlist_) {
if (memtable->Get(key, value, s, merge_context, options)) {
return true;
}
}
return false;
}
void MemTableListVersion::AddIterators(const ReadOptions& options,
std::vector<Iterator*>* iterator_list) {
for (auto& m : memlist_) {
iterator_list->push_back(m->NewIterator(options));
}
}
void MemTableListVersion::AddIterators(
const ReadOptions& options, MergeIteratorBuilder* merge_iter_builder) {
for (auto& m : memlist_) {
merge_iter_builder->AddIterator(
m->NewIterator(options, merge_iter_builder->GetArena()));
}
}
uint64_t MemTableListVersion::GetTotalNumEntries() const {
uint64_t total_num = 0;
for (auto& m : memlist_) {
total_num += m->GetNumEntries();
}
return total_num;
}
// caller is responsible for referencing m
void MemTableListVersion::Add(MemTable* m) {
assert(refs_ == 1); // only when refs_ == 1 is MemTableListVersion mutable
memlist_.push_front(m);
++size_;
}
// caller is responsible for unreferencing m
void MemTableListVersion::Remove(MemTable* m) {
assert(refs_ == 1); // only when refs_ == 1 is MemTableListVersion mutable
memlist_.remove(m);
--size_;
}
// Returns true if there is at least one memtable on which flush has
// not yet started.
bool MemTableList::IsFlushPending() const {
if ((flush_requested_ && num_flush_not_started_ >= 1) ||
(num_flush_not_started_ >= min_write_buffer_number_to_merge_)) {
assert(imm_flush_needed.NoBarrier_Load() != nullptr);
return true;
}
return false;
}
// Returns the memtables that need to be flushed.
void MemTableList::PickMemtablesToFlush(autovector<MemTable*>* ret) {
const auto& memlist = current_->memlist_;
for (auto it = memlist.rbegin(); it != memlist.rend(); ++it) {
MemTable* m = *it;
if (!m->flush_in_progress_) {
assert(!m->flush_completed_);
num_flush_not_started_--;
if (num_flush_not_started_ == 0) {
imm_flush_needed.Release_Store(nullptr);
}
m->flush_in_progress_ = true; // flushing will start very soon
ret->push_back(m);
}
}
flush_requested_ = false; // start-flush request is complete
}
void MemTableList::RollbackMemtableFlush(const autovector<MemTable*>& mems,
uint64_t file_number,
std::set<uint64_t>* pending_outputs) {
assert(!mems.empty());
// If the flush was not successful, then just reset state.
// Maybe a suceeding attempt to flush will be successful.
for (MemTable* m : mems) {
assert(m->flush_in_progress_);
assert(m->file_number_ == 0);
m->flush_in_progress_ = false;
m->flush_completed_ = false;
m->edit_.Clear();
num_flush_not_started_++;
}
pending_outputs->erase(file_number);
imm_flush_needed.Release_Store(reinterpret_cast<void *>(1));
}
// Record a successful flush in the manifest file
Status MemTableList::InstallMemtableFlushResults(
ColumnFamilyData* cfd, const autovector<MemTable*>& mems, VersionSet* vset,
port::Mutex* mu, Logger* info_log, uint64_t file_number,
std::set<uint64_t>& pending_outputs, autovector<MemTable*>* to_delete,
Directory* db_directory, LogBuffer* log_buffer) {
mu->AssertHeld();
// flush was sucessful
for (size_t i = 0; i < mems.size(); ++i) {
// All the edits are associated with the first memtable of this batch.
assert(i == 0 || mems[i]->GetEdits()->NumEntries() == 0);
mems[i]->flush_completed_ = true;
mems[i]->file_number_ = file_number;
}
// if some other thread is already commiting, then return
Status s;
if (commit_in_progress_) {
return s;
}
// Only a single thread can be executing this piece of code
commit_in_progress_ = true;
// scan all memtables from the earliest, and commit those
// (in that order) that have finished flushing. Memetables
// are always committed in the order that they were created.
while (!current_->memlist_.empty() && s.ok()) {
MemTable* m = current_->memlist_.back(); // get the last element
if (!m->flush_completed_) {
break;
}
LogToBuffer(log_buffer, "[%s] Level-0 commit table #%lu started",
cfd->GetName().c_str(), (unsigned long)m->file_number_);
// this can release and reacquire the mutex.
s = vset->LogAndApply(cfd, &m->edit_, mu, db_directory);
// we will be changing the version in the next code path,
// so we better create a new one, since versions are immutable
InstallNewVersion();
// All the later memtables that have the same filenum
// are part of the same batch. They can be committed now.
uint64_t mem_id = 1; // how many memtables has been flushed.
do {
if (s.ok()) { // commit new state
LogToBuffer(log_buffer,
"[%s] Level-0 commit table #%lu: memtable #%lu done",
cfd->GetName().c_str(), (unsigned long)m->file_number_,
(unsigned long)mem_id);
current_->Remove(m);
assert(m->file_number_ > 0);
// pending_outputs can be cleared only after the newly created file
// has been written to a committed version so that other concurrently
// executing compaction threads do not mistakenly assume that this
// file is not live.
pending_outputs.erase(m->file_number_);
if (m->Unref() != nullptr) {
to_delete->push_back(m);
}
} else {
//commit failed. setup state so that we can flush again.
Log(info_log,
"Level-0 commit table #%lu: memtable #%lu failed",
(unsigned long)m->file_number_,
(unsigned long)mem_id);
m->flush_completed_ = false;
m->flush_in_progress_ = false;
m->edit_.Clear();
num_flush_not_started_++;
pending_outputs.erase(m->file_number_);
m->file_number_ = 0;
imm_flush_needed.Release_Store((void *)1);
}
++mem_id;
} while (!current_->memlist_.empty() && (m = current_->memlist_.back()) &&
m->file_number_ == file_number);
}
commit_in_progress_ = false;
return s;
}
// New memtables are inserted at the front of the list.
void MemTableList::Add(MemTable* m) {
assert(current_->size_ >= num_flush_not_started_);
InstallNewVersion();
// this method is used to move mutable memtable into an immutable list.
// since mutable memtable is already refcounted by the DBImpl,
// and when moving to the imutable list we don't unref it,
// we don't have to ref the memtable here. we just take over the
// reference from the DBImpl.
current_->Add(m);
m->MarkImmutable();
num_flush_not_started_++;
if (num_flush_not_started_ == 1) {
imm_flush_needed.Release_Store((void *)1);
}
}
// Returns an estimate of the number of bytes of data in use.
size_t MemTableList::ApproximateMemoryUsage() {
size_t size = 0;
for (auto& memtable : current_->memlist_) {
size += memtable->ApproximateMemoryUsage();
}
return size;
}
void MemTableList::InstallNewVersion() {
if (current_->refs_ == 1) {
// we're the only one using the version, just keep using it
} else {
// somebody else holds the current version, we need to create new one
MemTableListVersion* version = current_;
current_ = new MemTableListVersion(current_);
current_->Ref();
version->Unref();
}
}
} // namespace rocksdb

156
db/memtable_list.h Normal file
View File

@@ -0,0 +1,156 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
#pragma once
#include <string>
#include <list>
#include <vector>
#include <set>
#include <deque>
#include "rocksdb/db.h"
#include "rocksdb/options.h"
#include "rocksdb/iterator.h"
#include "db/dbformat.h"
#include "db/skiplist.h"
#include "db/memtable.h"
#include "rocksdb/db.h"
#include "rocksdb/iterator.h"
#include "rocksdb/options.h"
#include "util/autovector.h"
#include "util/log_buffer.h"
namespace rocksdb {
class ColumnFamilyData;
class InternalKeyComparator;
class Mutex;
class MergeIteratorBuilder;
// keeps a list of immutable memtables in a vector. the list is immutable
// if refcount is bigger than one. It is used as a state for Get() and
// Iterator code paths
class MemTableListVersion {
public:
explicit MemTableListVersion(MemTableListVersion* old = nullptr);
void Ref();
void Unref(autovector<MemTable*>* to_delete = nullptr);
int size() const;
// Search all the memtables starting from the most recent one.
// Return the most recent value found, if any.
bool Get(const LookupKey& key, std::string* value, Status* s,
MergeContext& merge_context, const Options& options);
void AddIterators(const ReadOptions& options,
std::vector<Iterator*>* iterator_list);
void AddIterators(const ReadOptions& options,
MergeIteratorBuilder* merge_iter_builder);
uint64_t GetTotalNumEntries() const;
private:
// REQUIRE: m is mutable memtable
void Add(MemTable* m);
// REQUIRE: m is mutable memtable
void Remove(MemTable* m);
friend class MemTableList;
std::list<MemTable*> memlist_;
int size_ = 0;
int refs_ = 0;
};
// This class stores references to all the immutable memtables.
// The memtables are flushed to L0 as soon as possible and in
// any order. If there are more than one immutable memtable, their
// flushes can occur concurrently. However, they are 'committed'
// to the manifest in FIFO order to maintain correctness and
// recoverability from a crash.
class MemTableList {
public:
// A list of memtables.
explicit MemTableList(int min_write_buffer_number_to_merge)
: min_write_buffer_number_to_merge_(min_write_buffer_number_to_merge),
current_(new MemTableListVersion()),
num_flush_not_started_(0),
commit_in_progress_(false),
flush_requested_(false) {
imm_flush_needed.Release_Store(nullptr);
current_->Ref();
}
~MemTableList() {}
MemTableListVersion* current() { return current_; }
// so that background threads can detect non-nullptr pointer to
// determine whether there is anything more to start flushing.
port::AtomicPointer imm_flush_needed;
// Returns the total number of memtables in the list
int size() const;
// Returns true if there is at least one memtable on which flush has
// not yet started.
bool IsFlushPending() const;
// Returns the earliest memtables that needs to be flushed. The returned
// memtables are guaranteed to be in the ascending order of created time.
void PickMemtablesToFlush(autovector<MemTable*>* mems);
// Reset status of the given memtable list back to pending state so that
// they can get picked up again on the next round of flush.
void RollbackMemtableFlush(const autovector<MemTable*>& mems,
uint64_t file_number,
std::set<uint64_t>* pending_outputs);
// Commit a successful flush in the manifest file
Status InstallMemtableFlushResults(ColumnFamilyData* cfd,
const autovector<MemTable*>& m,
VersionSet* vset, port::Mutex* mu,
Logger* info_log, uint64_t file_number,
std::set<uint64_t>& pending_outputs,
autovector<MemTable*>* to_delete,
Directory* db_directory,
LogBuffer* log_buffer);
// New memtables are inserted at the front of the list.
// Takes ownership of the referenced held on *m by the caller of Add().
void Add(MemTable* m);
// Returns an estimate of the number of bytes of data in use.
size_t ApproximateMemoryUsage();
// Request a flush of all existing memtables to storage
void FlushRequested() { flush_requested_ = true; }
// Copying allowed
// MemTableList(const MemTableList&);
// void operator=(const MemTableList&);
private:
// DB mutex held
void InstallNewVersion();
int min_write_buffer_number_to_merge_;
MemTableListVersion* current_;
// the number of elements that still need flushing
int num_flush_not_started_;
// committing in progress
bool commit_in_progress_;
// Requested a flush of all memtables to storage
bool flush_requested_;
};
} // namespace rocksdb

69
db/merge_context.h Normal file
View File

@@ -0,0 +1,69 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
#pragma once
#include "db/dbformat.h"
#include "rocksdb/slice.h"
#include <string>
#include <deque>
namespace rocksdb {
const std::deque<std::string> empty_operand_list;
// The merge context for merging a user key.
// When doing a Get(), DB will create such a class and pass it when
// issuing Get() operation to memtables and version_set. The operands
// will be fetched from the context when issuing partial of full merge.
class MergeContext {
public:
// Clear all the operands
void Clear() {
if (operand_list) {
operand_list->clear();
}
}
// Replace all operands with merge_result, which are expected to be the
// merge result of them.
void PushPartialMergeResult(std::string& merge_result) {
assert (operand_list);
operand_list->clear();
operand_list->push_front(std::move(merge_result));
}
// Push a merge operand
void PushOperand(const Slice& operand_slice) {
Initialize();
operand_list->push_front(operand_slice.ToString());
}
// return total number of operands in the list
size_t GetNumOperands() const {
if (!operand_list) {
return 0;
}
return operand_list->size();
}
// Get the operand at the index.
Slice GetOperand(int index) const {
assert (operand_list);
return (*operand_list)[index];
}
// Return all the operands.
const std::deque<std::string>& GetOperands() const {
if (!operand_list) {
return empty_operand_list;
}
return *operand_list;
}
private:
void Initialize() {
if (!operand_list) {
operand_list.reset(new std::deque<std::string>());
}
}
std::unique_ptr<std::deque<std::string>> operand_list;
};
} // namespace rocksdb

209
db/merge_helper.cc Normal file
View File

@@ -0,0 +1,209 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
#include "merge_helper.h"
#include "db/dbformat.h"
#include "rocksdb/comparator.h"
#include "rocksdb/db.h"
#include "rocksdb/merge_operator.h"
#include "util/statistics.h"
#include <string>
#include <stdio.h>
namespace rocksdb {
// PRE: iter points to the first merge type entry
// POST: iter points to the first entry beyond the merge process (or the end)
// keys_, operands_ are updated to reflect the merge result.
// keys_ stores the list of keys encountered while merging.
// operands_ stores the list of merge operands encountered while merging.
// keys_[i] corresponds to operands_[i] for each i.
void MergeHelper::MergeUntil(Iterator* iter, SequenceNumber stop_before,
bool at_bottom, Statistics* stats, int* steps) {
// Get a copy of the internal key, before it's invalidated by iter->Next()
// Also maintain the list of merge operands seen.
keys_.clear();
operands_.clear();
keys_.push_front(iter->key().ToString());
operands_.push_front(iter->value().ToString());
success_ = false; // Will become true if we hit Put/Delete or bottom
// We need to parse the internal key again as the parsed key is
// backed by the internal key!
// Assume no internal key corruption as it has been successfully parsed
// by the caller.
// Invariant: keys_.back() will not change. Hence, orig_ikey is always valid.
ParsedInternalKey orig_ikey;
ParseInternalKey(keys_.back(), &orig_ikey);
bool hit_the_next_user_key = false;
std::string merge_result; // Temporary value for merge results
if (steps) {
++(*steps);
}
for (iter->Next(); iter->Valid(); iter->Next()) {
ParsedInternalKey ikey;
assert(operands_.size() >= 1); // Should be invariants!
assert(keys_.size() == operands_.size());
if (!ParseInternalKey(iter->key(), &ikey)) {
// stop at corrupted key
if (assert_valid_internal_key_) {
assert(!"corrupted internal key is not expected");
}
break;
}
if (user_comparator_->Compare(ikey.user_key, orig_ikey.user_key) != 0) {
// hit a different user key, stop right here
hit_the_next_user_key = true;
break;
}
if (stop_before && ikey.sequence <= stop_before) {
// hit an entry that's visible by the previous snapshot, can't touch that
break;
}
// At this point we are guaranteed that we need to process this key.
if (kTypeDeletion == ikey.type) {
// hit a delete
// => merge nullptr with operands_
// => store result in operands_.back() (and update keys_.back())
// => change the entry type to kTypeValue for keys_.back()
// We are done! Return a success if the merge passes.
success_ = user_merge_operator_->FullMerge(ikey.user_key, nullptr,
operands_, &merge_result,
logger_);
// We store the result in keys_.back() and operands_.back()
// if nothing went wrong (i.e.: no operand corruption on disk)
if (success_) {
std::string& key = keys_.back(); // The original key encountered
orig_ikey.type = kTypeValue;
UpdateInternalKey(&key[0], key.size(),
orig_ikey.sequence, orig_ikey.type);
swap(operands_.back(), merge_result);
} else {
RecordTick(stats, NUMBER_MERGE_FAILURES);
}
// move iter to the next entry (before doing anything else)
iter->Next();
if (steps) {
++(*steps);
}
return;
}
if (kTypeValue == ikey.type) {
// hit a put
// => merge the put value with operands_
// => store result in operands_.back() (and update keys_.back())
// => change the entry type to kTypeValue for keys_.back()
// We are done! Success!
const Slice value = iter->value();
success_ = user_merge_operator_->FullMerge(ikey.user_key, &value,
operands_, &merge_result,
logger_);
// We store the result in keys_.back() and operands_.back()
// if nothing went wrong (i.e.: no operand corruption on disk)
if (success_) {
std::string& key = keys_.back(); // The original key encountered
orig_ikey.type = kTypeValue;
UpdateInternalKey(&key[0], key.size(),
orig_ikey.sequence, orig_ikey.type);
swap(operands_.back(), merge_result);
} else {
RecordTick(stats, NUMBER_MERGE_FAILURES);
}
// move iter to the next entry
iter->Next();
if (steps) {
++(*steps);
}
return;
}
if (kTypeMerge == ikey.type) {
// hit a merge
// => merge the operand into the front of the operands_ list
// => use the user's associative merge function to determine how.
// => then continue because we haven't yet seen a Put/Delete.
assert(!operands_.empty()); // Should have at least one element in it
// keep queuing keys and operands until we either meet a put / delete
// request or later did a partial merge.
keys_.push_front(iter->key().ToString());
operands_.push_front(iter->value().ToString());
if (steps) {
++(*steps);
}
}
}
// We are sure we have seen this key's entire history if we are at the
// last level and exhausted all internal keys of this user key.
// NOTE: !iter->Valid() does not necessarily mean we hit the
// beginning of a user key, as versions of a user key might be
// split into multiple files (even files on the same level)
// and some files might not be included in the compaction/merge.
//
// There are also cases where we have seen the root of history of this
// key without being sure of it. Then, we simply miss the opportunity
// to combine the keys. Since VersionSet::SetupOtherInputs() always makes
// sure that all merge-operands on the same level get compacted together,
// this will simply lead to these merge operands moving to the next level.
//
// So, we only perform the following logic (to merge all operands together
// without a Put/Delete) if we are certain that we have seen the end of key.
bool surely_seen_the_beginning = hit_the_next_user_key && at_bottom;
if (surely_seen_the_beginning) {
// do a final merge with nullptr as the existing value and say
// bye to the merge type (it's now converted to a Put)
assert(kTypeMerge == orig_ikey.type);
assert(operands_.size() >= 1);
assert(operands_.size() == keys_.size());
success_ = user_merge_operator_->FullMerge(orig_ikey.user_key, nullptr,
operands_, &merge_result,
logger_);
if (success_) {
std::string& key = keys_.back(); // The original key encountered
orig_ikey.type = kTypeValue;
UpdateInternalKey(&key[0], key.size(),
orig_ikey.sequence, orig_ikey.type);
// The final value() is always stored in operands_.back()
swap(operands_.back(),merge_result);
} else {
RecordTick(stats, NUMBER_MERGE_FAILURES);
// Do nothing if not success_. Leave keys() and operands() as they are.
}
} else {
// We haven't seen the beginning of the key nor a Put/Delete.
// Attempt to use the user's associative merge function to
// merge the stacked merge operands into a single operand.
if (operands_.size() >= 2 &&
operands_.size() >= min_partial_merge_operands_ &&
user_merge_operator_->PartialMergeMulti(
orig_ikey.user_key,
std::deque<Slice>(operands_.begin(), operands_.end()),
&merge_result, logger_)) {
// Merging of operands (associative merge) was successful.
// Replace operands with the merge result
operands_.clear();
operands_.push_front(std::move(merge_result));
keys_.erase(keys_.begin(), keys_.end() - 1);
}
}
}
} // namespace rocksdb

105
db/merge_helper.h Normal file
View File

@@ -0,0 +1,105 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
#ifndef MERGE_HELPER_H
#define MERGE_HELPER_H
#include "db/dbformat.h"
#include "rocksdb/slice.h"
#include <string>
#include <deque>
namespace rocksdb {
class Comparator;
class Iterator;
class Logger;
class MergeOperator;
class Statistics;
class MergeHelper {
public:
MergeHelper(const Comparator* user_comparator,
const MergeOperator* user_merge_operator, Logger* logger,
unsigned min_partial_merge_operands,
bool assert_valid_internal_key)
: user_comparator_(user_comparator),
user_merge_operator_(user_merge_operator),
logger_(logger),
min_partial_merge_operands_(min_partial_merge_operands),
assert_valid_internal_key_(assert_valid_internal_key),
keys_(),
operands_(),
success_(false) {}
// Merge entries until we hit
// - a corrupted key
// - a Put/Delete,
// - a different user key,
// - a specific sequence number (snapshot boundary),
// or - the end of iteration
// iter: (IN) points to the first merge type entry
// (OUT) points to the first entry not included in the merge process
// stop_before: (IN) a sequence number that merge should not cross.
// 0 means no restriction
// at_bottom: (IN) true if the iterator covers the bottem level, which means
// we could reach the start of the history of this user key.
void MergeUntil(Iterator* iter, SequenceNumber stop_before = 0,
bool at_bottom = false, Statistics* stats = nullptr,
int* steps = nullptr);
// Query the merge result
// These are valid until the next MergeUntil call
// If the merging was successful:
// - IsSuccess() will be true
// - key() will have the latest sequence number of the merges.
// The type will be Put or Merge. See IMPORTANT 1 note, below.
// - value() will be the result of merging all the operands together
// - The user should ignore keys() and values().
//
// IMPORTANT 1: the key type could change after the MergeUntil call.
// Put/Delete + Merge + ... + Merge => Put
// Merge + ... + Merge => Merge
//
// If the merge operator is not associative, and if a Put/Delete is not found
// then the merging will be unsuccessful. In this case:
// - IsSuccess() will be false
// - keys() contains the list of internal keys seen in order of iteration.
// - values() contains the list of values (merges) seen in the same order.
// values() is parallel to keys() so that the first entry in
// keys() is the key associated with the first entry in values()
// and so on. These lists will be the same length.
// All of these pairs will be merges over the same user key.
// See IMPORTANT 2 note below.
// - The user should ignore key() and value().
//
// IMPORTANT 2: The entries were traversed in order from BACK to FRONT.
// So keys().back() was the first key seen by iterator.
// TODO: Re-style this comment to be like the first one
bool IsSuccess() { return success_; }
Slice key() { assert(success_); return Slice(keys_.back()); }
Slice value() { assert(success_); return Slice(operands_.back()); }
const std::deque<std::string>& keys() { assert(!success_); return keys_; }
const std::deque<std::string>& values() {
assert(!success_); return operands_;
}
private:
const Comparator* user_comparator_;
const MergeOperator* user_merge_operator_;
Logger* logger_;
unsigned min_partial_merge_operands_;
bool assert_valid_internal_key_; // enforce no internal key corruption?
// the scratch area that holds the result of MergeUntil
// valid up to the next MergeUntil call
std::deque<std::string> keys_; // Keeps track of the sequence of keys seen
std::deque<std::string> operands_; // Parallel with keys_; stores the values
bool success_;
};
} // namespace rocksdb
#endif

77
db/merge_operator.cc Normal file
View File

@@ -0,0 +1,77 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
/**
* Back-end implementation details specific to the Merge Operator.
*/
#include "rocksdb/merge_operator.h"
namespace rocksdb {
// The default implementation of PartialMergeMulti, which invokes
// PartialMerge multiple times internally and merges two operands at
// a time.
bool MergeOperator::PartialMergeMulti(const Slice& key,
const std::deque<Slice>& operand_list,
std::string* new_value,
Logger* logger) const {
assert(operand_list.size() >= 2);
// Simply loop through the operands
std::string temp_value;
Slice temp_slice(operand_list[0]);
for (size_t i = 1; i < operand_list.size(); ++i) {
auto& operand = operand_list[i];
if (!PartialMerge(key, temp_slice, operand, &temp_value, logger)) {
return false;
}
swap(temp_value, *new_value);
temp_slice = Slice(*new_value);
}
// The result will be in *new_value. All merges succeeded.
return true;
}
// Given a "real" merge from the library, call the user's
// associative merge function one-by-one on each of the operands.
// NOTE: It is assumed that the client's merge-operator will handle any errors.
bool AssociativeMergeOperator::FullMerge(
const Slice& key,
const Slice* existing_value,
const std::deque<std::string>& operand_list,
std::string* new_value,
Logger* logger) const {
// Simply loop through the operands
Slice temp_existing;
std::string temp_value;
for (const auto& operand : operand_list) {
Slice value(operand);
if (!Merge(key, existing_value, value, &temp_value, logger)) {
return false;
}
swap(temp_value, *new_value);
temp_existing = Slice(*new_value);
existing_value = &temp_existing;
}
// The result will be in *new_value. All merges succeeded.
return true;
}
// Call the user defined simple merge on the operands;
// NOTE: It is assumed that the client's merge-operator will handle any errors.
bool AssociativeMergeOperator::PartialMerge(
const Slice& key,
const Slice& left_operand,
const Slice& right_operand,
std::string* new_value,
Logger* logger) const {
return Merge(key, &left_operand, right_operand, new_value, logger);
}
} // namespace rocksdb

472
db/merge_test.cc Normal file
View File

@@ -0,0 +1,472 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
#include <assert.h>
#include <memory>
#include <iostream>
#include "rocksdb/cache.h"
#include "rocksdb/comparator.h"
#include "rocksdb/db.h"
#include "rocksdb/env.h"
#include "rocksdb/merge_operator.h"
#include "db/dbformat.h"
#include "db/db_impl.h"
#include "db/write_batch_internal.h"
#include "utilities/merge_operators.h"
#include "util/testharness.h"
#include "utilities/db_ttl.h"
using namespace std;
using namespace rocksdb;
namespace {
int numMergeOperatorCalls;
void resetNumMergeOperatorCalls() {
numMergeOperatorCalls = 0;
}
int num_partial_merge_calls;
void resetNumPartialMergeCalls() {
num_partial_merge_calls = 0;
}
}
class CountMergeOperator : public AssociativeMergeOperator {
public:
CountMergeOperator() {
mergeOperator_ = MergeOperators::CreateUInt64AddOperator();
}
virtual bool Merge(const Slice& key,
const Slice* existing_value,
const Slice& value,
std::string* new_value,
Logger* logger) const override {
++numMergeOperatorCalls;
if (existing_value == nullptr) {
new_value->assign(value.data(), value.size());
return true;
}
return mergeOperator_->PartialMerge(
key,
*existing_value,
value,
new_value,
logger);
}
virtual bool PartialMergeMulti(const Slice& key,
const std::deque<Slice>& operand_list,
std::string* new_value, Logger* logger) const {
++num_partial_merge_calls;
return mergeOperator_->PartialMergeMulti(key, operand_list, new_value,
logger);
}
virtual const char* Name() const override {
return "UInt64AddOperator";
}
private:
std::shared_ptr<MergeOperator> mergeOperator_;
};
namespace {
std::shared_ptr<DB> OpenDb(const string& dbname, const bool ttl = false,
const size_t max_successive_merges = 0,
const uint32_t min_partial_merge_operands = 2) {
DB* db;
Options options;
options.create_if_missing = true;
options.merge_operator = std::make_shared<CountMergeOperator>();
options.max_successive_merges = max_successive_merges;
options.min_partial_merge_operands = min_partial_merge_operands;
Status s;
DestroyDB(dbname, Options());
if (ttl) {
cout << "Opening database with TTL\n";
DBWithTTL* db_with_ttl;
s = DBWithTTL::Open(options, dbname, &db_with_ttl);
db = db_with_ttl;
} else {
s = DB::Open(options, dbname, &db);
}
if (!s.ok()) {
cerr << s.ToString() << endl;
assert(false);
}
return std::shared_ptr<DB>(db);
}
} // namespace
// Imagine we are maintaining a set of uint64 counters.
// Each counter has a distinct name. And we would like
// to support four high level operations:
// set, add, get and remove
// This is a quick implementation without a Merge operation.
class Counters {
protected:
std::shared_ptr<DB> db_;
WriteOptions put_option_;
ReadOptions get_option_;
WriteOptions delete_option_;
uint64_t default_;
public:
explicit Counters(std::shared_ptr<DB> db, uint64_t defaultCount = 0)
: db_(db),
put_option_(),
get_option_(),
delete_option_(),
default_(defaultCount) {
assert(db_);
}
virtual ~Counters() {}
// public interface of Counters.
// All four functions return false
// if the underlying level db operation failed.
// mapped to a levedb Put
bool set(const string& key, uint64_t value) {
// just treat the internal rep of int64 as the string
Slice slice((char *)&value, sizeof(value));
auto s = db_->Put(put_option_, key, slice);
if (s.ok()) {
return true;
} else {
cerr << s.ToString() << endl;
return false;
}
}
// mapped to a rocksdb Delete
bool remove(const string& key) {
auto s = db_->Delete(delete_option_, key);
if (s.ok()) {
return true;
} else {
cerr << s.ToString() << std::endl;
return false;
}
}
// mapped to a rocksdb Get
bool get(const string& key, uint64_t *value) {
string str;
auto s = db_->Get(get_option_, key, &str);
if (s.IsNotFound()) {
// return default value if not found;
*value = default_;
return true;
} else if (s.ok()) {
// deserialization
if (str.size() != sizeof(uint64_t)) {
cerr << "value corruption\n";
return false;
}
*value = DecodeFixed64(&str[0]);
return true;
} else {
cerr << s.ToString() << std::endl;
return false;
}
}
// 'add' is implemented as get -> modify -> set
// An alternative is a single merge operation, see MergeBasedCounters
virtual bool add(const string& key, uint64_t value) {
uint64_t base = default_;
return get(key, &base) && set(key, base + value);
}
// convenience functions for testing
void assert_set(const string& key, uint64_t value) {
assert(set(key, value));
}
void assert_remove(const string& key) {
assert(remove(key));
}
uint64_t assert_get(const string& key) {
uint64_t value = default_;
int result = get(key, &value);
assert(result);
if (result == 0) exit(1); // Disable unused variable warning.
return value;
}
void assert_add(const string& key, uint64_t value) {
int result = add(key, value);
assert(result);
if (result == 0) exit(1); // Disable unused variable warning.
}
};
// Implement 'add' directly with the new Merge operation
class MergeBasedCounters : public Counters {
private:
WriteOptions merge_option_; // for merge
public:
explicit MergeBasedCounters(std::shared_ptr<DB> db, uint64_t defaultCount = 0)
: Counters(db, defaultCount),
merge_option_() {
}
// mapped to a rocksdb Merge operation
virtual bool add(const string& key, uint64_t value) override {
char encoded[sizeof(uint64_t)];
EncodeFixed64(encoded, value);
Slice slice(encoded, sizeof(uint64_t));
auto s = db_->Merge(merge_option_, key, slice);
if (s.ok()) {
return true;
} else {
cerr << s.ToString() << endl;
return false;
}
}
};
namespace {
void dumpDb(DB* db) {
auto it = unique_ptr<Iterator>(db->NewIterator(ReadOptions()));
for (it->SeekToFirst(); it->Valid(); it->Next()) {
uint64_t value = DecodeFixed64(it->value().data());
cout << it->key().ToString() << ": " << value << endl;
}
assert(it->status().ok()); // Check for any errors found during the scan
}
void testCounters(Counters& counters, DB* db, bool test_compaction) {
FlushOptions o;
o.wait = true;
counters.assert_set("a", 1);
if (test_compaction) db->Flush(o);
assert(counters.assert_get("a") == 1);
counters.assert_remove("b");
// defaut value is 0 if non-existent
assert(counters.assert_get("b") == 0);
counters.assert_add("a", 2);
if (test_compaction) db->Flush(o);
// 1+2 = 3
assert(counters.assert_get("a")== 3);
dumpDb(db);
std::cout << "1\n";
// 1+...+49 = ?
uint64_t sum = 0;
for (int i = 1; i < 50; i++) {
counters.assert_add("b", i);
sum += i;
}
assert(counters.assert_get("b") == sum);
std::cout << "2\n";
dumpDb(db);
std::cout << "3\n";
if (test_compaction) {
db->Flush(o);
cout << "Compaction started ...\n";
db->CompactRange(nullptr, nullptr);
cout << "Compaction ended\n";
dumpDb(db);
assert(counters.assert_get("a")== 3);
assert(counters.assert_get("b") == sum);
}
}
void testSuccessiveMerge(
Counters& counters, int max_num_merges, int num_merges) {
counters.assert_remove("z");
uint64_t sum = 0;
for (int i = 1; i <= num_merges; ++i) {
resetNumMergeOperatorCalls();
counters.assert_add("z", i);
sum += i;
if (i % (max_num_merges + 1) == 0) {
assert(numMergeOperatorCalls == max_num_merges + 1);
} else {
assert(numMergeOperatorCalls == 0);
}
resetNumMergeOperatorCalls();
assert(counters.assert_get("z") == sum);
assert(numMergeOperatorCalls == i % (max_num_merges + 1));
}
}
void testPartialMerge(Counters* counters, DB* db, int max_merge, int min_merge,
int count) {
FlushOptions o;
o.wait = true;
// Test case 1: partial merge should be called when the number of merge
// operands exceeds the threshold.
uint64_t tmp_sum = 0;
resetNumPartialMergeCalls();
for (int i = 1; i <= count; i++) {
counters->assert_add("b", i);
tmp_sum += i;
}
db->Flush(o);
db->CompactRange(nullptr, nullptr);
ASSERT_EQ(tmp_sum, counters->assert_get("b"));
if (count > max_merge) {
// in this case, FullMerge should be called instead.
ASSERT_EQ(num_partial_merge_calls, 0);
} else {
// if count >= min_merge, then partial merge should be called once.
ASSERT_EQ((count >= min_merge), (num_partial_merge_calls == 1));
}
// Test case 2: partial merge should not be called when a put is found.
resetNumPartialMergeCalls();
tmp_sum = 0;
db->Put(rocksdb::WriteOptions(), "c", "10");
for (int i = 1; i <= count; i++) {
counters->assert_add("c", i);
tmp_sum += i;
}
db->Flush(o);
db->CompactRange(nullptr, nullptr);
ASSERT_EQ(tmp_sum, counters->assert_get("c"));
ASSERT_EQ(num_partial_merge_calls, 0);
}
void testSingleBatchSuccessiveMerge(
DB* db,
int max_num_merges,
int num_merges) {
assert(num_merges > max_num_merges);
Slice key("BatchSuccessiveMerge");
uint64_t merge_value = 1;
Slice merge_value_slice((char *)&merge_value, sizeof(merge_value));
// Create the batch
WriteBatch batch;
for (int i = 0; i < num_merges; ++i) {
batch.Merge(key, merge_value_slice);
}
// Apply to memtable and count the number of merges
resetNumMergeOperatorCalls();
{
Status s = db->Write(WriteOptions(), &batch);
assert(s.ok());
}
assert(numMergeOperatorCalls ==
num_merges - (num_merges % (max_num_merges + 1)));
// Get the value
resetNumMergeOperatorCalls();
string get_value_str;
{
Status s = db->Get(ReadOptions(), key, &get_value_str);
assert(s.ok());
}
assert(get_value_str.size() == sizeof(uint64_t));
uint64_t get_value = DecodeFixed64(&get_value_str[0]);
ASSERT_EQ(get_value, num_merges * merge_value);
ASSERT_EQ(numMergeOperatorCalls, (num_merges % (max_num_merges + 1)));
}
void runTest(int argc, const string& dbname, const bool use_ttl = false) {
auto db = OpenDb(dbname, use_ttl);
{
cout << "Test read-modify-write counters... \n";
Counters counters(db, 0);
testCounters(counters, db.get(), true);
}
bool compact = false;
if (argc > 1) {
compact = true;
cout << "Turn on Compaction\n";
}
{
cout << "Test merge-based counters... \n";
MergeBasedCounters counters(db, 0);
testCounters(counters, db.get(), compact);
}
DestroyDB(dbname, Options());
db.reset();
{
cout << "Test merge in memtable... \n";
size_t max_merge = 5;
auto db = OpenDb(dbname, use_ttl, max_merge);
MergeBasedCounters counters(db, 0);
testCounters(counters, db.get(), compact);
testSuccessiveMerge(counters, max_merge, max_merge * 2);
testSingleBatchSuccessiveMerge(db.get(), 5, 7);
DestroyDB(dbname, Options());
}
{
cout << "Test Partial-Merge\n";
size_t max_merge = 100;
for (uint32_t min_merge = 5; min_merge < 25; min_merge += 5) {
for (uint32_t count = min_merge - 1; count <= min_merge + 1; count++) {
auto db = OpenDb(dbname, use_ttl, max_merge, min_merge);
MergeBasedCounters counters(db, 0);
testPartialMerge(&counters, db.get(), max_merge, min_merge, count);
DestroyDB(dbname, Options());
}
{
auto db = OpenDb(dbname, use_ttl, max_merge, min_merge);
MergeBasedCounters counters(db, 0);
testPartialMerge(&counters, db.get(), max_merge, min_merge,
min_merge * 10);
DestroyDB(dbname, Options());
}
}
}
}
} // namespace
int main(int argc, char *argv[]) {
//TODO: Make this test like a general rocksdb unit-test
runTest(argc, test::TmpDir() + "/merge_testdb");
runTest(argc, test::TmpDir() + "/merge_testdbttl", true); // Run test on TTL database
printf("Passed all tests!\n");
return 0;
}

358
db/perf_context_test.cc Normal file
View File

@@ -0,0 +1,358 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
#include <algorithm>
#include <iostream>
#include <vector>
#include "/usr/include/valgrind/callgrind.h"
#include "rocksdb/db.h"
#include "rocksdb/perf_context.h"
#include "rocksdb/slice_transform.h"
#include "rocksdb/memtablerep.h"
#include "util/histogram.h"
#include "util/stop_watch.h"
#include "util/testharness.h"
bool FLAGS_random_key = false;
bool FLAGS_use_set_based_memetable = false;
int FLAGS_total_keys = 100;
int FLAGS_write_buffer_size = 1000000000;
int FLAGS_max_write_buffer_number = 8;
int FLAGS_min_write_buffer_number_to_merge = 7;
// Path to the database on file system
const std::string kDbName = rocksdb::test::TmpDir() + "/perf_context_test";
namespace rocksdb {
std::shared_ptr<DB> OpenDb() {
DB* db;
Options options;
options.create_if_missing = true;
options.write_buffer_size = FLAGS_write_buffer_size;
options.max_write_buffer_number = FLAGS_max_write_buffer_number;
options.min_write_buffer_number_to_merge =
FLAGS_min_write_buffer_number_to_merge;
if (FLAGS_use_set_based_memetable) {
auto prefix_extractor = rocksdb::NewFixedPrefixTransform(0);
options.memtable_factory.reset(
NewHashSkipListRepFactory(prefix_extractor));
}
Status s = DB::Open(options, kDbName, &db);
ASSERT_OK(s);
return std::shared_ptr<DB>(db);
}
class PerfContextTest { };
TEST(PerfContextTest, SeekIntoDeletion) {
DestroyDB(kDbName, Options());
auto db = OpenDb();
WriteOptions write_options;
ReadOptions read_options;
for (int i = 0; i < FLAGS_total_keys; ++i) {
std::string key = "k" + std::to_string(i);
std::string value = "v" + std::to_string(i);
db->Put(write_options, key, value);
}
for (int i = 0; i < FLAGS_total_keys -1 ; ++i) {
std::string key = "k" + std::to_string(i);
db->Delete(write_options, key);
}
HistogramImpl hist_get;
HistogramImpl hist_get_time;
for (int i = 0; i < FLAGS_total_keys - 1; ++i) {
std::string key = "k" + std::to_string(i);
std::string value;
perf_context.Reset();
StopWatchNano timer(Env::Default(), true);
auto status = db->Get(read_options, key, &value);
auto elapsed_nanos = timer.ElapsedNanos();
ASSERT_TRUE(status.IsNotFound());
hist_get.Add(perf_context.user_key_comparison_count);
hist_get_time.Add(elapsed_nanos);
}
std::cout << "Get uesr key comparison: \n" << hist_get.ToString()
<< "Get time: \n" << hist_get_time.ToString();
HistogramImpl hist_seek_to_first;
std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
perf_context.Reset();
StopWatchNano timer(Env::Default(), true);
iter->SeekToFirst();
hist_seek_to_first.Add(perf_context.user_key_comparison_count);
auto elapsed_nanos = timer.ElapsedNanos();
std::cout << "SeekToFirst uesr key comparison: \n" << hist_seek_to_first.ToString()
<< "ikey skipped: " << perf_context.internal_key_skipped_count << "\n"
<< "idelete skipped: " << perf_context.internal_delete_skipped_count << "\n"
<< "elapsed: " << elapsed_nanos << "\n";
HistogramImpl hist_seek;
for (int i = 0; i < FLAGS_total_keys; ++i) {
std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
std::string key = "k" + std::to_string(i);
perf_context.Reset();
StopWatchNano timer(Env::Default(), true);
iter->Seek(key);
auto elapsed_nanos = timer.ElapsedNanos();
hist_seek.Add(perf_context.user_key_comparison_count);
std::cout << "seek cmp: " << perf_context.user_key_comparison_count
<< " ikey skipped " << perf_context.internal_key_skipped_count
<< " idelete skipped " << perf_context.internal_delete_skipped_count
<< " elapsed: " << elapsed_nanos << "ns\n";
perf_context.Reset();
ASSERT_TRUE(iter->Valid());
StopWatchNano timer2(Env::Default(), true);
iter->Next();
auto elapsed_nanos2 = timer2.ElapsedNanos();
std::cout << "next cmp: " << perf_context.user_key_comparison_count
<< "elapsed: " << elapsed_nanos2 << "ns\n";
}
std::cout << "Seek uesr key comparison: \n" << hist_seek.ToString();
}
TEST(PerfContextTest, StopWatchNanoOverhead) {
// profile the timer cost by itself!
const int kTotalIterations = 1000000;
std::vector<uint64_t> timings(kTotalIterations);
StopWatchNano timer(Env::Default(), true);
for (auto& timing : timings) {
timing = timer.ElapsedNanos(true /* reset */);
}
HistogramImpl histogram;
for (const auto timing : timings) {
histogram.Add(timing);
}
std::cout << histogram.ToString();
}
TEST(PerfContextTest, StopWatchOverhead) {
// profile the timer cost by itself!
const int kTotalIterations = 1000000;
std::vector<uint64_t> timings(kTotalIterations);
StopWatch timer(Env::Default());
for (auto& timing : timings) {
timing = timer.ElapsedMicros();
}
HistogramImpl histogram;
uint64_t prev_timing = 0;
for (const auto timing : timings) {
histogram.Add(timing - prev_timing);
prev_timing = timing;
}
std::cout << histogram.ToString();
}
void ProfileKeyComparison() {
DestroyDB(kDbName, Options()); // Start this test with a fresh DB
auto db = OpenDb();
WriteOptions write_options;
ReadOptions read_options;
HistogramImpl hist_put;
HistogramImpl hist_get;
HistogramImpl hist_get_snapshot;
HistogramImpl hist_get_memtable;
HistogramImpl hist_get_post_process;
HistogramImpl hist_num_memtable_checked;
HistogramImpl hist_write_pre_post;
HistogramImpl hist_write_wal_time;
HistogramImpl hist_write_memtable_time;
std::cout << "Inserting " << FLAGS_total_keys << " key/value pairs\n...\n";
std::vector<int> keys;
for (int i = 0; i < FLAGS_total_keys; ++i) {
keys.push_back(i);
}
if (FLAGS_random_key) {
std::random_shuffle(keys.begin(), keys.end());
}
for (const int i : keys) {
std::string key = "k" + std::to_string(i);
std::string value = "v" + std::to_string(i);
perf_context.Reset();
db->Put(write_options, key, value);
hist_write_pre_post.Add(perf_context.write_pre_and_post_process_time);
hist_write_wal_time.Add(perf_context.write_wal_time);
hist_write_memtable_time.Add(perf_context.write_memtable_time);
hist_put.Add(perf_context.user_key_comparison_count);
perf_context.Reset();
db->Get(read_options, key, &value);
hist_get_snapshot.Add(perf_context.get_snapshot_time);
hist_get_memtable.Add(perf_context.get_from_memtable_time);
hist_num_memtable_checked.Add(perf_context.get_from_memtable_count);
hist_get_post_process.Add(perf_context.get_post_process_time);
hist_get.Add(perf_context.user_key_comparison_count);
}
std::cout << "Put uesr key comparison: \n" << hist_put.ToString()
<< "Get uesr key comparison: \n" << hist_get.ToString();
std::cout << "Put(): Pre and Post Process Time: \n"
<< hist_write_pre_post.ToString()
<< " Writing WAL time: \n"
<< hist_write_wal_time.ToString() << "\n"
<< " Writing Mem Table time: \n"
<< hist_write_memtable_time.ToString() << "\n";
std::cout << "Get(): Time to get snapshot: \n"
<< hist_get_snapshot.ToString()
<< " Time to get value from memtables: \n"
<< hist_get_memtable.ToString() << "\n"
<< " Number of memtables checked: \n"
<< hist_num_memtable_checked.ToString() << "\n"
<< " Time to post process: \n"
<< hist_get_post_process.ToString() << "\n";
}
TEST(PerfContextTest, KeyComparisonCount) {
SetPerfLevel(kEnableCount);
ProfileKeyComparison();
SetPerfLevel(kDisable);
ProfileKeyComparison();
SetPerfLevel(kEnableTime);
ProfileKeyComparison();
}
// make perf_context_test
// export ROCKSDB_TESTS=PerfContextTest.SeekKeyComparison
// For one memtable:
// ./perf_context_test --write_buffer_size=500000 --total_keys=10000
// For two memtables:
// ./perf_context_test --write_buffer_size=250000 --total_keys=10000
// Specify --random_key=1 to shuffle the key before insertion
// Results show that, for sequential insertion, worst-case Seek Key comparison
// is close to the total number of keys (linear), when there is only one
// memtable. When there are two memtables, even the avg Seek Key comparison
// starts to become linear to the input size.
TEST(PerfContextTest, SeekKeyComparison) {
DestroyDB(kDbName, Options());
auto db = OpenDb();
WriteOptions write_options;
ReadOptions read_options;
std::cout << "Inserting " << FLAGS_total_keys << " key/value pairs\n...\n";
std::vector<int> keys;
for (int i = 0; i < FLAGS_total_keys; ++i) {
keys.push_back(i);
}
if (FLAGS_random_key) {
std::random_shuffle(keys.begin(), keys.end());
}
HistogramImpl hist_put_time;
HistogramImpl hist_wal_time;
HistogramImpl hist_time_diff;
SetPerfLevel(kEnableTime);
StopWatchNano timer(Env::Default());
for (const int i : keys) {
std::string key = "k" + std::to_string(i);
std::string value = "v" + std::to_string(i);
perf_context.Reset();
timer.Start();
db->Put(write_options, key, value);
auto put_time = timer.ElapsedNanos();
hist_put_time.Add(put_time);
hist_wal_time.Add(perf_context.write_wal_time);
hist_time_diff.Add(put_time - perf_context.write_wal_time);
}
std::cout << "Put time:\n" << hist_put_time.ToString()
<< "WAL time:\n" << hist_wal_time.ToString()
<< "time diff:\n" << hist_time_diff.ToString();
HistogramImpl hist_seek;
HistogramImpl hist_next;
for (int i = 0; i < FLAGS_total_keys; ++i) {
std::string key = "k" + std::to_string(i);
std::string value = "v" + std::to_string(i);
std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
perf_context.Reset();
iter->Seek(key);
ASSERT_TRUE(iter->Valid());
ASSERT_EQ(iter->value().ToString(), value);
hist_seek.Add(perf_context.user_key_comparison_count);
}
std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
for (iter->SeekToFirst(); iter->Valid();) {
perf_context.Reset();
iter->Next();
hist_next.Add(perf_context.user_key_comparison_count);
}
std::cout << "Seek:\n" << hist_seek.ToString()
<< "Next:\n" << hist_next.ToString();
}
}
int main(int argc, char** argv) {
for (int i = 1; i < argc; i++) {
int n;
char junk;
if (sscanf(argv[i], "--write_buffer_size=%d%c", &n, &junk) == 1) {
FLAGS_write_buffer_size = n;
}
if (sscanf(argv[i], "--total_keys=%d%c", &n, &junk) == 1) {
FLAGS_total_keys = n;
}
if (sscanf(argv[i], "--random_key=%d%c", &n, &junk) == 1 &&
(n == 0 || n == 1)) {
FLAGS_random_key = n;
}
if (sscanf(argv[i], "--use_set_based_memetable=%d%c", &n, &junk) == 1 &&
(n == 0 || n == 1)) {
FLAGS_use_set_based_memetable = n;
}
}
std::cout << kDbName << "\n";
rocksdb::test::RunAllTests();
return 0;
}

853
db/plain_table_db_test.cc Normal file
View File

@@ -0,0 +1,853 @@
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include <algorithm>
#include <set>
#include "db/db_impl.h"
#include "db/filename.h"
#include "db/version_set.h"
#include "db/write_batch_internal.h"
#include "rocksdb/cache.h"
#include "rocksdb/compaction_filter.h"
#include "rocksdb/db.h"
#include "rocksdb/env.h"
#include "rocksdb/filter_policy.h"
#include "rocksdb/slice_transform.h"
#include "rocksdb/table.h"
#include "table/meta_blocks.h"
#include "table/plain_table_factory.h"
#include "table/plain_table_reader.h"
#include "util/hash.h"
#include "util/logging.h"
#include "util/mutexlock.h"
#include "util/testharness.h"
#include "util/testutil.h"
#include "utilities/merge_operators.h"
using std::unique_ptr;
namespace rocksdb {
class PlainTableDBTest {
protected:
private:
std::string dbname_;
Env* env_;
DB* db_;
Options last_options_;
public:
PlainTableDBTest() : env_(Env::Default()) {
dbname_ = test::TmpDir() + "/plain_table_db_test";
ASSERT_OK(DestroyDB(dbname_, Options()));
db_ = nullptr;
Reopen();
}
~PlainTableDBTest() {
delete db_;
ASSERT_OK(DestroyDB(dbname_, Options()));
}
// Return the current option configuration.
Options CurrentOptions() {
Options options;
options.table_factory.reset(NewPlainTableFactory(16, 2, 0.8, 3));
options.prefix_extractor.reset(NewFixedPrefixTransform(8));
options.allow_mmap_reads = true;
return options;
}
DBImpl* dbfull() {
return reinterpret_cast<DBImpl*>(db_);
}
void Reopen(Options* options = nullptr) {
ASSERT_OK(TryReopen(options));
}
void Close() {
delete db_;
db_ = nullptr;
}
void DestroyAndReopen(Options* options = nullptr) {
//Destroy using last options
Destroy(&last_options_);
ASSERT_OK(TryReopen(options));
}
void Destroy(Options* options) {
delete db_;
db_ = nullptr;
ASSERT_OK(DestroyDB(dbname_, *options));
}
Status PureReopen(Options* options, DB** db) {
return DB::Open(*options, dbname_, db);
}
Status TryReopen(Options* options = nullptr) {
delete db_;
db_ = nullptr;
Options opts;
if (options != nullptr) {
opts = *options;
} else {
opts = CurrentOptions();
opts.create_if_missing = true;
}
last_options_ = opts;
return DB::Open(opts, dbname_, &db_);
}
Status Put(const Slice& k, const Slice& v) {
return db_->Put(WriteOptions(), k, v);
}
Status Delete(const std::string& k) {
return db_->Delete(WriteOptions(), k);
}
std::string Get(const std::string& k, const Snapshot* snapshot = nullptr) {
ReadOptions options;
options.snapshot = snapshot;
std::string result;
Status s = db_->Get(options, k, &result);
if (s.IsNotFound()) {
result = "NOT_FOUND";
} else if (!s.ok()) {
result = s.ToString();
}
return result;
}
int NumTableFilesAtLevel(int level) {
std::string property;
ASSERT_TRUE(
db_->GetProperty("rocksdb.num-files-at-level" + NumberToString(level),
&property));
return atoi(property.c_str());
}
// Return spread of files per level
std::string FilesPerLevel() {
std::string result;
int last_non_zero_offset = 0;
for (int level = 0; level < db_->NumberLevels(); level++) {
int f = NumTableFilesAtLevel(level);
char buf[100];
snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f);
result += buf;
if (f > 0) {
last_non_zero_offset = result.size();
}
}
result.resize(last_non_zero_offset);
return result;
}
std::string IterStatus(Iterator* iter) {
std::string result;
if (iter->Valid()) {
result = iter->key().ToString() + "->" + iter->value().ToString();
} else {
result = "(invalid)";
}
return result;
}
};
TEST(PlainTableDBTest, Empty) {
ASSERT_TRUE(dbfull() != nullptr);
ASSERT_EQ("NOT_FOUND", Get("0000000000000foo"));
}
class TestPlainTableReader : public PlainTableReader {
public:
TestPlainTableReader(const EnvOptions& storage_options,
const InternalKeyComparator& icomparator,
uint64_t file_size, int bloom_bits_per_key,
double hash_table_ratio, size_t index_sparseness,
const TableProperties* table_properties,
unique_ptr<RandomAccessFile>&& file,
const Options& options, bool* expect_bloom_not_match)
: PlainTableReader(options, std::move(file), storage_options, icomparator,
file_size, bloom_bits_per_key, hash_table_ratio,
index_sparseness, table_properties, 2 * 1024 * 1024),
expect_bloom_not_match_(expect_bloom_not_match) {
Status s = PopulateIndex(const_cast<TableProperties*>(table_properties));
ASSERT_TRUE(s.ok());
}
virtual ~TestPlainTableReader() {}
private:
virtual bool MatchBloom(uint32_t hash) const override {
bool ret = PlainTableReader::MatchBloom(hash);
ASSERT_TRUE(!*expect_bloom_not_match_ || !ret);
return ret;
}
bool* expect_bloom_not_match_;
};
extern const uint64_t kPlainTableMagicNumber;
class TestPlainTableFactory : public PlainTableFactory {
public:
explicit TestPlainTableFactory(bool* expect_bloom_not_match,
uint32_t user_key_len, int bloom_bits_per_key,
double hash_table_ratio,
size_t index_sparseness,
size_t huge_page_tlb_size)
: PlainTableFactory(user_key_len, user_key_len, hash_table_ratio,
index_sparseness, huge_page_tlb_size),
bloom_bits_per_key_(bloom_bits_per_key),
hash_table_ratio_(hash_table_ratio),
index_sparseness_(index_sparseness),
expect_bloom_not_match_(expect_bloom_not_match) {}
Status NewTableReader(const Options& options, const EnvOptions& soptions,
const InternalKeyComparator& internal_comparator,
unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
unique_ptr<TableReader>* table) const override {
TableProperties* props = nullptr;
auto s = ReadTableProperties(file.get(), file_size, kPlainTableMagicNumber,
options.env, options.info_log.get(), &props);
ASSERT_TRUE(s.ok());
std::unique_ptr<PlainTableReader> new_reader(new TestPlainTableReader(
soptions, internal_comparator, file_size, bloom_bits_per_key_,
hash_table_ratio_, index_sparseness_, props, std::move(file), options,
expect_bloom_not_match_));
*table = std::move(new_reader);
return s;
}
private:
int bloom_bits_per_key_;
double hash_table_ratio_;
size_t index_sparseness_;
bool* expect_bloom_not_match_;
};
TEST(PlainTableDBTest, Flush) {
for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024;
huge_page_tlb_size += 2 * 1024 * 1024) {
for (int bloom_bits = 0; bloom_bits <= 117; bloom_bits += 117) {
for (int total_order = 0; total_order <= 1; total_order++) {
Options options = CurrentOptions();
options.create_if_missing = true;
// Set only one bucket to force bucket conflict.
// Test index interval for the same prefix to be 1, 2 and 4
if (total_order) {
options.table_factory.reset(NewTotalOrderPlainTableFactory(
16, bloom_bits, 2, huge_page_tlb_size));
} else {
options.table_factory.reset(NewPlainTableFactory(
16, bloom_bits, 0.75, 16, huge_page_tlb_size));
}
DestroyAndReopen(&options);
ASSERT_OK(Put("1000000000000foo", "v1"));
ASSERT_OK(Put("0000000000000bar", "v2"));
ASSERT_OK(Put("1000000000000foo", "v3"));
dbfull()->TEST_FlushMemTable();
TablePropertiesCollection ptc;
reinterpret_cast<DB*>(dbfull())->GetPropertiesOfAllTables(&ptc);
ASSERT_EQ(1U, ptc.size());
auto row = ptc.begin();
auto tp = row->second;
ASSERT_EQ(total_order ? "4" : "12", (tp->user_collected_properties).at(
"plain_table_hash_table_size"));
ASSERT_EQ(total_order ? "9" : "0", (tp->user_collected_properties).at(
"plain_table_sub_index_size"));
ASSERT_EQ("v3", Get("1000000000000foo"));
ASSERT_EQ("v2", Get("0000000000000bar"));
}
}
}
}
TEST(PlainTableDBTest, Flush2) {
for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024;
huge_page_tlb_size += 2 * 1024 * 1024) {
for (int bloom_bits = 0; bloom_bits <= 117; bloom_bits += 117) {
for (int total_order = 0; total_order <= 1; total_order++) {
bool expect_bloom_not_match = false;
Options options = CurrentOptions();
options.create_if_missing = true;
// Set only one bucket to force bucket conflict.
// Test index interval for the same prefix to be 1, 2 and 4
if (total_order) {
options.prefix_extractor = nullptr;
options.table_factory.reset(
new TestPlainTableFactory(&expect_bloom_not_match, 16, bloom_bits,
0, 2, huge_page_tlb_size));
} else {
options.table_factory.reset(
new TestPlainTableFactory(&expect_bloom_not_match, 16, bloom_bits,
0.75, 16, huge_page_tlb_size));
}
DestroyAndReopen(&options);
ASSERT_OK(Put("0000000000000bar", "b"));
ASSERT_OK(Put("1000000000000foo", "v1"));
dbfull()->TEST_FlushMemTable();
ASSERT_OK(Put("1000000000000foo", "v2"));
dbfull()->TEST_FlushMemTable();
ASSERT_EQ("v2", Get("1000000000000foo"));
ASSERT_OK(Put("0000000000000eee", "v3"));
dbfull()->TEST_FlushMemTable();
ASSERT_EQ("v3", Get("0000000000000eee"));
ASSERT_OK(Delete("0000000000000bar"));
dbfull()->TEST_FlushMemTable();
ASSERT_EQ("NOT_FOUND", Get("0000000000000bar"));
ASSERT_OK(Put("0000000000000eee", "v5"));
ASSERT_OK(Put("9000000000000eee", "v5"));
dbfull()->TEST_FlushMemTable();
ASSERT_EQ("v5", Get("0000000000000eee"));
// Test Bloom Filter
if (bloom_bits > 0) {
// Neither key nor value should exist.
expect_bloom_not_match = true;
ASSERT_EQ("NOT_FOUND", Get("5_not00000000bar"));
// Key doesn't exist any more but prefix exists.
if (total_order) {
ASSERT_EQ("NOT_FOUND", Get("1000000000000not"));
ASSERT_EQ("NOT_FOUND", Get("0000000000000not"));
}
expect_bloom_not_match = false;
}
}
}
}
}
TEST(PlainTableDBTest, Iterator) {
for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024;
huge_page_tlb_size += 2 * 1024 * 1024) {
for (int bloom_bits = 0; bloom_bits <= 117; bloom_bits += 117) {
for (int total_order = 0; total_order <= 1; total_order++) {
bool expect_bloom_not_match = false;
Options options = CurrentOptions();
options.create_if_missing = true;
// Set only one bucket to force bucket conflict.
// Test index interval for the same prefix to be 1, 2 and 4
if (total_order) {
options.prefix_extractor = nullptr;
options.table_factory.reset(
new TestPlainTableFactory(&expect_bloom_not_match, 16, bloom_bits,
0, 2, huge_page_tlb_size));
} else {
options.table_factory.reset(
new TestPlainTableFactory(&expect_bloom_not_match, 16, bloom_bits,
0.75, 16, huge_page_tlb_size));
}
DestroyAndReopen(&options);
ASSERT_OK(Put("1000000000foo002", "v_2"));
ASSERT_OK(Put("0000000000000bar", "random"));
ASSERT_OK(Put("1000000000foo001", "v1"));
ASSERT_OK(Put("3000000000000bar", "bar_v"));
ASSERT_OK(Put("1000000000foo003", "v__3"));
ASSERT_OK(Put("1000000000foo004", "v__4"));
ASSERT_OK(Put("1000000000foo005", "v__5"));
ASSERT_OK(Put("1000000000foo007", "v__7"));
ASSERT_OK(Put("1000000000foo008", "v__8"));
dbfull()->TEST_FlushMemTable();
ASSERT_EQ("v1", Get("1000000000foo001"));
ASSERT_EQ("v__3", Get("1000000000foo003"));
Iterator* iter = dbfull()->NewIterator(ReadOptions());
iter->Seek("1000000000foo000");
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("1000000000foo001", iter->key().ToString());
ASSERT_EQ("v1", iter->value().ToString());
iter->Next();
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("1000000000foo002", iter->key().ToString());
ASSERT_EQ("v_2", iter->value().ToString());
iter->Next();
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("1000000000foo003", iter->key().ToString());
ASSERT_EQ("v__3", iter->value().ToString());
iter->Next();
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("1000000000foo004", iter->key().ToString());
ASSERT_EQ("v__4", iter->value().ToString());
iter->Seek("3000000000000bar");
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("3000000000000bar", iter->key().ToString());
ASSERT_EQ("bar_v", iter->value().ToString());
iter->Seek("1000000000foo000");
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("1000000000foo001", iter->key().ToString());
ASSERT_EQ("v1", iter->value().ToString());
iter->Seek("1000000000foo005");
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("1000000000foo005", iter->key().ToString());
ASSERT_EQ("v__5", iter->value().ToString());
iter->Seek("1000000000foo006");
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("1000000000foo007", iter->key().ToString());
ASSERT_EQ("v__7", iter->value().ToString());
iter->Seek("1000000000foo008");
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("1000000000foo008", iter->key().ToString());
ASSERT_EQ("v__8", iter->value().ToString());
if (total_order == 0) {
iter->Seek("1000000000foo009");
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("3000000000000bar", iter->key().ToString());
}
// Test Bloom Filter
if (bloom_bits > 0) {
if (!total_order) {
// Neither key nor value should exist.
expect_bloom_not_match = true;
iter->Seek("2not000000000bar");
ASSERT_TRUE(!iter->Valid());
ASSERT_EQ("NOT_FOUND", Get("2not000000000bar"));
expect_bloom_not_match = false;
} else {
expect_bloom_not_match = true;
ASSERT_EQ("NOT_FOUND", Get("2not000000000bar"));
expect_bloom_not_match = false;
}
}
delete iter;
}
}
}
}
namespace {
std::string MakeLongKey(size_t length, char c) {
return std::string(length, c);
}
} // namespace
TEST(PlainTableDBTest, IteratorLargeKeys) {
Options options = CurrentOptions();
options.table_factory.reset(NewTotalOrderPlainTableFactory(0, 0, 16));
options.create_if_missing = true;
options.prefix_extractor.reset();
DestroyAndReopen(&options);
std::string key_list[] = {
MakeLongKey(30, '0'),
MakeLongKey(16, '1'),
MakeLongKey(32, '2'),
MakeLongKey(60, '3'),
MakeLongKey(90, '4'),
MakeLongKey(50, '5'),
MakeLongKey(26, '6')
};
for (size_t i = 0; i < 7; i++) {
ASSERT_OK(Put(key_list[i], std::to_string(i)));
}
dbfull()->TEST_FlushMemTable();
Iterator* iter = dbfull()->NewIterator(ReadOptions());
iter->Seek(key_list[0]);
for (size_t i = 0; i < 7; i++) {
ASSERT_TRUE(iter->Valid());
ASSERT_EQ(key_list[i], iter->key().ToString());
ASSERT_EQ(std::to_string(i), iter->value().ToString());
iter->Next();
}
ASSERT_TRUE(!iter->Valid());
delete iter;
}
// A test comparator which compare two strings in this way:
// (1) first compare prefix of 8 bytes in alphabet order,
// (2) if two strings share the same prefix, sort the other part of the string
// in the reverse alphabet order.
class SimpleSuffixReverseComparator : public Comparator {
public:
SimpleSuffixReverseComparator() {}
virtual const char* Name() const { return "SimpleSuffixReverseComparator"; }
virtual int Compare(const Slice& a, const Slice& b) const {
Slice prefix_a = Slice(a.data(), 8);
Slice prefix_b = Slice(b.data(), 8);
int prefix_comp = prefix_a.compare(prefix_b);
if (prefix_comp != 0) {
return prefix_comp;
} else {
Slice suffix_a = Slice(a.data() + 8, a.size() - 8);
Slice suffix_b = Slice(b.data() + 8, b.size() - 8);
return -(suffix_a.compare(suffix_b));
}
}
virtual void FindShortestSeparator(std::string* start,
const Slice& limit) const {}
virtual void FindShortSuccessor(std::string* key) const {}
};
TEST(PlainTableDBTest, IteratorReverseSuffixComparator) {
Options options = CurrentOptions();
options.create_if_missing = true;
// Set only one bucket to force bucket conflict.
// Test index interval for the same prefix to be 1, 2 and 4
SimpleSuffixReverseComparator comp;
options.comparator = &comp;
DestroyAndReopen(&options);
ASSERT_OK(Put("1000000000foo002", "v_2"));
ASSERT_OK(Put("0000000000000bar", "random"));
ASSERT_OK(Put("1000000000foo001", "v1"));
ASSERT_OK(Put("3000000000000bar", "bar_v"));
ASSERT_OK(Put("1000000000foo003", "v__3"));
ASSERT_OK(Put("1000000000foo004", "v__4"));
ASSERT_OK(Put("1000000000foo005", "v__5"));
ASSERT_OK(Put("1000000000foo007", "v__7"));
ASSERT_OK(Put("1000000000foo008", "v__8"));
dbfull()->TEST_FlushMemTable();
ASSERT_EQ("v1", Get("1000000000foo001"));
ASSERT_EQ("v__3", Get("1000000000foo003"));
Iterator* iter = dbfull()->NewIterator(ReadOptions());
iter->Seek("1000000000foo009");
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("1000000000foo008", iter->key().ToString());
ASSERT_EQ("v__8", iter->value().ToString());
iter->Next();
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("1000000000foo007", iter->key().ToString());
ASSERT_EQ("v__7", iter->value().ToString());
iter->Next();
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("1000000000foo005", iter->key().ToString());
ASSERT_EQ("v__5", iter->value().ToString());
iter->Next();
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("1000000000foo004", iter->key().ToString());
ASSERT_EQ("v__4", iter->value().ToString());
iter->Seek("3000000000000bar");
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("3000000000000bar", iter->key().ToString());
ASSERT_EQ("bar_v", iter->value().ToString());
iter->Seek("1000000000foo005");
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("1000000000foo005", iter->key().ToString());
ASSERT_EQ("v__5", iter->value().ToString());
iter->Seek("1000000000foo006");
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("1000000000foo005", iter->key().ToString());
ASSERT_EQ("v__5", iter->value().ToString());
iter->Seek("1000000000foo008");
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("1000000000foo008", iter->key().ToString());
ASSERT_EQ("v__8", iter->value().ToString());
iter->Seek("1000000000foo000");
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("3000000000000bar", iter->key().ToString());
delete iter;
}
TEST(PlainTableDBTest, HashBucketConflict) {
for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024;
huge_page_tlb_size += 2 * 1024 * 1024) {
for (unsigned char i = 1; i <= 3; i++) {
Options options = CurrentOptions();
options.create_if_missing = true;
// Set only one bucket to force bucket conflict.
// Test index interval for the same prefix to be 1, 2 and 4
options.table_factory.reset(
NewTotalOrderPlainTableFactory(16, 0, 2 ^ i, huge_page_tlb_size));
DestroyAndReopen(&options);
ASSERT_OK(Put("5000000000000fo0", "v1"));
ASSERT_OK(Put("5000000000000fo1", "v2"));
ASSERT_OK(Put("5000000000000fo2", "v"));
ASSERT_OK(Put("2000000000000fo0", "v3"));
ASSERT_OK(Put("2000000000000fo1", "v4"));
ASSERT_OK(Put("2000000000000fo2", "v"));
ASSERT_OK(Put("2000000000000fo3", "v"));
dbfull()->TEST_FlushMemTable();
ASSERT_EQ("v1", Get("5000000000000fo0"));
ASSERT_EQ("v2", Get("5000000000000fo1"));
ASSERT_EQ("v3", Get("2000000000000fo0"));
ASSERT_EQ("v4", Get("2000000000000fo1"));
ASSERT_EQ("NOT_FOUND", Get("5000000000000bar"));
ASSERT_EQ("NOT_FOUND", Get("2000000000000bar"));
ASSERT_EQ("NOT_FOUND", Get("5000000000000fo8"));
ASSERT_EQ("NOT_FOUND", Get("2000000000000fo8"));
ReadOptions ro;
Iterator* iter = dbfull()->NewIterator(ro);
iter->Seek("5000000000000fo0");
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("5000000000000fo0", iter->key().ToString());
iter->Next();
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("5000000000000fo1", iter->key().ToString());
iter->Seek("5000000000000fo1");
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("5000000000000fo1", iter->key().ToString());
iter->Seek("2000000000000fo0");
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("2000000000000fo0", iter->key().ToString());
iter->Next();
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("2000000000000fo1", iter->key().ToString());
iter->Seek("2000000000000fo1");
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("2000000000000fo1", iter->key().ToString());
iter->Seek("2000000000000bar");
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("2000000000000fo0", iter->key().ToString());
iter->Seek("5000000000000bar");
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("5000000000000fo0", iter->key().ToString());
iter->Seek("2000000000000fo8");
ASSERT_TRUE(!iter->Valid() ||
options.comparator->Compare(iter->key(), "20000001") > 0);
iter->Seek("5000000000000fo8");
ASSERT_TRUE(!iter->Valid());
iter->Seek("1000000000000fo2");
ASSERT_TRUE(!iter->Valid());
iter->Seek("3000000000000fo2");
ASSERT_TRUE(!iter->Valid());
iter->Seek("8000000000000fo2");
ASSERT_TRUE(!iter->Valid());
delete iter;
}
}
}
TEST(PlainTableDBTest, HashBucketConflictReverseSuffixComparator) {
for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024;
huge_page_tlb_size += 2 * 1024 * 1024) {
for (unsigned char i = 1; i <= 3; i++) {
Options options = CurrentOptions();
options.create_if_missing = true;
SimpleSuffixReverseComparator comp;
options.comparator = &comp;
// Set only one bucket to force bucket conflict.
// Test index interval for the same prefix to be 1, 2 and 4
options.table_factory.reset(
NewTotalOrderPlainTableFactory(16, 0, 2 ^ i, huge_page_tlb_size));
DestroyAndReopen(&options);
ASSERT_OK(Put("5000000000000fo0", "v1"));
ASSERT_OK(Put("5000000000000fo1", "v2"));
ASSERT_OK(Put("5000000000000fo2", "v"));
ASSERT_OK(Put("2000000000000fo0", "v3"));
ASSERT_OK(Put("2000000000000fo1", "v4"));
ASSERT_OK(Put("2000000000000fo2", "v"));
ASSERT_OK(Put("2000000000000fo3", "v"));
dbfull()->TEST_FlushMemTable();
ASSERT_EQ("v1", Get("5000000000000fo0"));
ASSERT_EQ("v2", Get("5000000000000fo1"));
ASSERT_EQ("v3", Get("2000000000000fo0"));
ASSERT_EQ("v4", Get("2000000000000fo1"));
ASSERT_EQ("NOT_FOUND", Get("5000000000000bar"));
ASSERT_EQ("NOT_FOUND", Get("2000000000000bar"));
ASSERT_EQ("NOT_FOUND", Get("5000000000000fo8"));
ASSERT_EQ("NOT_FOUND", Get("2000000000000fo8"));
ReadOptions ro;
Iterator* iter = dbfull()->NewIterator(ro);
iter->Seek("5000000000000fo1");
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("5000000000000fo1", iter->key().ToString());
iter->Next();
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("5000000000000fo0", iter->key().ToString());
iter->Seek("5000000000000fo1");
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("5000000000000fo1", iter->key().ToString());
iter->Seek("2000000000000fo1");
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("2000000000000fo1", iter->key().ToString());
iter->Next();
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("2000000000000fo0", iter->key().ToString());
iter->Seek("2000000000000fo1");
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("2000000000000fo1", iter->key().ToString());
iter->Seek("2000000000000var");
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("2000000000000fo3", iter->key().ToString());
iter->Seek("5000000000000var");
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("5000000000000fo2", iter->key().ToString());
std::string seek_key = "2000000000000bar";
iter->Seek(seek_key);
ASSERT_TRUE(!iter->Valid() ||
options.prefix_extractor->Transform(iter->key()) !=
options.prefix_extractor->Transform(seek_key));
iter->Seek("1000000000000fo2");
ASSERT_TRUE(!iter->Valid());
iter->Seek("3000000000000fo2");
ASSERT_TRUE(!iter->Valid());
iter->Seek("8000000000000fo2");
ASSERT_TRUE(!iter->Valid());
delete iter;
}
}
}
TEST(PlainTableDBTest, NonExistingKeyToNonEmptyBucket) {
Options options = CurrentOptions();
options.create_if_missing = true;
// Set only one bucket to force bucket conflict.
// Test index interval for the same prefix to be 1, 2 and 4
options.table_factory.reset(NewTotalOrderPlainTableFactory(16, 0, 5));
DestroyAndReopen(&options);
ASSERT_OK(Put("5000000000000fo0", "v1"));
ASSERT_OK(Put("5000000000000fo1", "v2"));
ASSERT_OK(Put("5000000000000fo2", "v3"));
dbfull()->TEST_FlushMemTable();
ASSERT_EQ("v1", Get("5000000000000fo0"));
ASSERT_EQ("v2", Get("5000000000000fo1"));
ASSERT_EQ("v3", Get("5000000000000fo2"));
ASSERT_EQ("NOT_FOUND", Get("8000000000000bar"));
ASSERT_EQ("NOT_FOUND", Get("1000000000000bar"));
Iterator* iter = dbfull()->NewIterator(ReadOptions());
iter->Seek("5000000000000bar");
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("5000000000000fo0", iter->key().ToString());
iter->Seek("5000000000000fo8");
ASSERT_TRUE(!iter->Valid());
iter->Seek("1000000000000fo2");
ASSERT_TRUE(!iter->Valid());
iter->Seek("8000000000000fo2");
ASSERT_TRUE(!iter->Valid());
delete iter;
}
static std::string Key(int i) {
char buf[100];
snprintf(buf, sizeof(buf), "key_______%06d", i);
return std::string(buf);
}
static std::string RandomString(Random* rnd, int len) {
std::string r;
test::RandomString(rnd, len, &r);
return r;
}
TEST(PlainTableDBTest, CompactionTrigger) {
Options options = CurrentOptions();
options.write_buffer_size = 100 << 10; //100KB
options.num_levels = 3;
options.max_mem_compaction_level = 0;
options.level0_file_num_compaction_trigger = 3;
Reopen(&options);
Random rnd(301);
for (int num = 0; num < options.level0_file_num_compaction_trigger - 1;
num++) {
std::vector<std::string> values;
// Write 120KB (12 values, each 10K)
for (int i = 0; i < 12; i++) {
values.push_back(RandomString(&rnd, 10000));
ASSERT_OK(Put(Key(i), values[i]));
}
dbfull()->TEST_WaitForFlushMemTable();
ASSERT_EQ(NumTableFilesAtLevel(0), num + 1);
}
//generate one more file in level-0, and should trigger level-0 compaction
std::vector<std::string> values;
for (int i = 0; i < 12; i++) {
values.push_back(RandomString(&rnd, 10000));
ASSERT_OK(Put(Key(i), values[i]));
}
dbfull()->TEST_WaitForCompact();
ASSERT_EQ(NumTableFilesAtLevel(0), 0);
ASSERT_EQ(NumTableFilesAtLevel(1), 1);
}
} // namespace rocksdb
int main(int argc, char** argv) {
return rocksdb::test::RunAllTests();
}

499
db/prefix_test.cc Normal file
View File

@@ -0,0 +1,499 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#ifndef GFLAGS
#include <cstdio>
int main() {
fprintf(stderr, "Please install gflags to run rocksdb tools\n");
return 1;
}
#else
#include <algorithm>
#include <iostream>
#include <vector>
#include <gflags/gflags.h>
#include "rocksdb/comparator.h"
#include "rocksdb/db.h"
#include "rocksdb/perf_context.h"
#include "rocksdb/slice_transform.h"
#include "rocksdb/memtablerep.h"
#include "util/histogram.h"
#include "util/stop_watch.h"
#include "util/testharness.h"
using GFLAGS::ParseCommandLineFlags;
DEFINE_bool(trigger_deadlock, false,
"issue delete in range scan to trigger PrefixHashMap deadlock");
DEFINE_uint64(bucket_count, 100000, "number of buckets");
DEFINE_uint64(num_locks, 10001, "number of locks");
DEFINE_bool(random_prefix, false, "randomize prefix");
DEFINE_uint64(total_prefixes, 100000, "total number of prefixes");
DEFINE_uint64(items_per_prefix, 1, "total number of values per prefix");
DEFINE_int64(write_buffer_size, 33554432, "");
DEFINE_int64(max_write_buffer_number, 2, "");
DEFINE_int64(min_write_buffer_number_to_merge, 1, "");
DEFINE_int32(skiplist_height, 4, "");
DEFINE_int32(memtable_prefix_bloom_bits, 10000000, "");
DEFINE_int32(memtable_prefix_bloom_probes, 10, "");
DEFINE_int32(memtable_prefix_bloom_huge_page_tlb_size, 2 * 1024 * 1024, "");
DEFINE_int32(value_size, 40, "");
// Path to the database on file system
const std::string kDbName = rocksdb::test::TmpDir() + "/prefix_test";
namespace rocksdb {
struct TestKey {
uint64_t prefix;
uint64_t sorted;
TestKey(uint64_t prefix, uint64_t sorted) : prefix(prefix), sorted(sorted) {}
};
// return a slice backed by test_key
inline Slice TestKeyToSlice(const TestKey& test_key) {
return Slice((const char*)&test_key, sizeof(test_key));
}
inline const TestKey* SliceToTestKey(const Slice& slice) {
return (const TestKey*)slice.data();
}
class TestKeyComparator : public Comparator {
public:
// Compare needs to be aware of the possibility of a and/or b is
// prefix only
virtual int Compare(const Slice& a, const Slice& b) const {
const TestKey* key_a = SliceToTestKey(a);
const TestKey* key_b = SliceToTestKey(b);
if (key_a->prefix != key_b->prefix) {
if (key_a->prefix < key_b->prefix) return -1;
if (key_a->prefix > key_b->prefix) return 1;
} else {
ASSERT_TRUE(key_a->prefix == key_b->prefix);
// note, both a and b could be prefix only
if (a.size() != b.size()) {
// one of them is prefix
ASSERT_TRUE(
(a.size() == sizeof(uint64_t) && b.size() == sizeof(TestKey)) ||
(b.size() == sizeof(uint64_t) && a.size() == sizeof(TestKey)));
if (a.size() < b.size()) return -1;
if (a.size() > b.size()) return 1;
} else {
// both a and b are prefix
if (a.size() == sizeof(uint64_t)) {
return 0;
}
// both a and b are whole key
ASSERT_TRUE(a.size() == sizeof(TestKey) && b.size() == sizeof(TestKey));
if (key_a->sorted < key_b->sorted) return -1;
if (key_a->sorted > key_b->sorted) return 1;
if (key_a->sorted == key_b->sorted) return 0;
}
}
return 0;
}
virtual const char* Name() const override {
return "TestKeyComparator";
}
virtual void FindShortestSeparator(
std::string* start,
const Slice& limit) const {
}
virtual void FindShortSuccessor(std::string* key) const {}
};
namespace {
void PutKey(DB* db, WriteOptions write_options, uint64_t prefix,
uint64_t suffix, const Slice& value) {
TestKey test_key(prefix, suffix);
Slice key = TestKeyToSlice(test_key);
ASSERT_OK(db->Put(write_options, key, value));
}
void SeekIterator(Iterator* iter, uint64_t prefix, uint64_t suffix) {
TestKey test_key(prefix, suffix);
Slice key = TestKeyToSlice(test_key);
iter->Seek(key);
}
const std::string kNotFoundResult = "NOT_FOUND";
std::string Get(DB* db, const ReadOptions& read_options, uint64_t prefix,
uint64_t suffix) {
TestKey test_key(prefix, suffix);
Slice key = TestKeyToSlice(test_key);
std::string result;
Status s = db->Get(read_options, key, &result);
if (s.IsNotFound()) {
result = kNotFoundResult;
} else if (!s.ok()) {
result = s.ToString();
}
return result;
}
} // namespace
class PrefixTest {
public:
std::shared_ptr<DB> OpenDb() {
DB* db;
options.create_if_missing = true;
options.write_buffer_size = FLAGS_write_buffer_size;
options.max_write_buffer_number = FLAGS_max_write_buffer_number;
options.min_write_buffer_number_to_merge =
FLAGS_min_write_buffer_number_to_merge;
options.memtable_prefix_bloom_bits = FLAGS_memtable_prefix_bloom_bits;
options.memtable_prefix_bloom_probes = FLAGS_memtable_prefix_bloom_probes;
options.memtable_prefix_bloom_huge_page_tlb_size =
FLAGS_memtable_prefix_bloom_huge_page_tlb_size;
Status s = DB::Open(options, kDbName, &db);
ASSERT_OK(s);
return std::shared_ptr<DB>(db);
}
void FirstOption() {
option_config_ = kBegin;
}
bool NextOptions(int bucket_count) {
// skip some options
option_config_++;
if (option_config_ < kEnd) {
options.prefix_extractor.reset(NewFixedPrefixTransform(8));
switch(option_config_) {
case kHashSkipList:
options.memtable_factory.reset(
NewHashSkipListRepFactory(bucket_count, FLAGS_skiplist_height));
return true;
case kHashLinkList:
options.memtable_factory.reset(
NewHashLinkListRepFactory(bucket_count));
return true;
case kHashLinkListHugePageTlb:
options.memtable_factory.reset(
NewHashLinkListRepFactory(bucket_count, 2 * 1024 * 1024));
return true;
default:
return false;
}
}
return false;
}
PrefixTest() : option_config_(kBegin) {
options.comparator = new TestKeyComparator();
}
~PrefixTest() {
delete options.comparator;
}
protected:
enum OptionConfig {
kBegin,
kHashSkipList,
kHashLinkList,
kHashLinkListHugePageTlb,
kEnd
};
int option_config_;
Options options;
};
TEST(PrefixTest, TestResult) {
for (int num_buckets = 1; num_buckets <= 2; num_buckets++) {
FirstOption();
while (NextOptions(num_buckets)) {
std::cout << "*** Mem table: " << options.memtable_factory->Name()
<< " number of buckets: " << num_buckets
<< std::endl;
DestroyDB(kDbName, Options());
auto db = OpenDb();
WriteOptions write_options;
ReadOptions read_options;
// 1. Insert one row.
Slice v16("v16");
PutKey(db.get(), write_options, 1, 6, v16);
std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
SeekIterator(iter.get(), 1, 6);
ASSERT_TRUE(iter->Valid());
ASSERT_TRUE(v16 == iter->value());
SeekIterator(iter.get(), 1, 5);
ASSERT_TRUE(iter->Valid());
ASSERT_TRUE(v16 == iter->value());
SeekIterator(iter.get(), 1, 5);
ASSERT_TRUE(iter->Valid());
ASSERT_TRUE(v16 == iter->value());
iter->Next();
ASSERT_TRUE(!iter->Valid());
SeekIterator(iter.get(), 2, 0);
ASSERT_TRUE(!iter->Valid());
ASSERT_EQ(v16.ToString(), Get(db.get(), read_options, 1, 6));
ASSERT_EQ(kNotFoundResult, Get(db.get(), read_options, 1, 5));
ASSERT_EQ(kNotFoundResult, Get(db.get(), read_options, 1, 7));
ASSERT_EQ(kNotFoundResult, Get(db.get(), read_options, 0, 6));
ASSERT_EQ(kNotFoundResult, Get(db.get(), read_options, 2, 6));
// 2. Insert an entry for the same prefix as the last entry in the bucket.
Slice v17("v17");
PutKey(db.get(), write_options, 1, 7, v17);
iter.reset(db->NewIterator(read_options));
SeekIterator(iter.get(), 1, 7);
ASSERT_TRUE(iter->Valid());
ASSERT_TRUE(v17 == iter->value());
SeekIterator(iter.get(), 1, 6);
ASSERT_TRUE(iter->Valid());
ASSERT_TRUE(v16 == iter->value());
iter->Next();
ASSERT_TRUE(iter->Valid());
ASSERT_TRUE(v17 == iter->value());
iter->Next();
ASSERT_TRUE(!iter->Valid());
SeekIterator(iter.get(), 2, 0);
ASSERT_TRUE(!iter->Valid());
// 3. Insert an entry for the same prefix as the head of the bucket.
Slice v15("v15");
PutKey(db.get(), write_options, 1, 5, v15);
iter.reset(db->NewIterator(read_options));
SeekIterator(iter.get(), 1, 7);
ASSERT_TRUE(iter->Valid());
ASSERT_TRUE(v17 == iter->value());
SeekIterator(iter.get(), 1, 5);
ASSERT_TRUE(iter->Valid());
ASSERT_TRUE(v15 == iter->value());
iter->Next();
ASSERT_TRUE(iter->Valid());
ASSERT_TRUE(v16 == iter->value());
iter->Next();
ASSERT_TRUE(iter->Valid());
ASSERT_TRUE(v17 == iter->value());
SeekIterator(iter.get(), 1, 5);
ASSERT_TRUE(iter->Valid());
ASSERT_TRUE(v15 == iter->value());
ASSERT_EQ(v15.ToString(), Get(db.get(), read_options, 1, 5));
ASSERT_EQ(v16.ToString(), Get(db.get(), read_options, 1, 6));
ASSERT_EQ(v17.ToString(), Get(db.get(), read_options, 1, 7));
// 4. Insert an entry with a larger prefix
Slice v22("v22");
PutKey(db.get(), write_options, 2, 2, v22);
iter.reset(db->NewIterator(read_options));
SeekIterator(iter.get(), 2, 2);
ASSERT_TRUE(iter->Valid());
ASSERT_TRUE(v22 == iter->value());
SeekIterator(iter.get(), 2, 0);
ASSERT_TRUE(iter->Valid());
ASSERT_TRUE(v22 == iter->value());
SeekIterator(iter.get(), 1, 5);
ASSERT_TRUE(iter->Valid());
ASSERT_TRUE(v15 == iter->value());
SeekIterator(iter.get(), 1, 7);
ASSERT_TRUE(iter->Valid());
ASSERT_TRUE(v17 == iter->value());
// 5. Insert an entry with a smaller prefix
Slice v02("v02");
PutKey(db.get(), write_options, 0, 2, v02);
iter.reset(db->NewIterator(read_options));
SeekIterator(iter.get(), 0, 2);
ASSERT_TRUE(iter->Valid());
ASSERT_TRUE(v02 == iter->value());
SeekIterator(iter.get(), 0, 0);
ASSERT_TRUE(iter->Valid());
ASSERT_TRUE(v02 == iter->value());
SeekIterator(iter.get(), 2, 0);
ASSERT_TRUE(iter->Valid());
ASSERT_TRUE(v22 == iter->value());
SeekIterator(iter.get(), 1, 5);
ASSERT_TRUE(iter->Valid());
ASSERT_TRUE(v15 == iter->value());
SeekIterator(iter.get(), 1, 7);
ASSERT_TRUE(iter->Valid());
ASSERT_TRUE(v17 == iter->value());
// 6. Insert to the beginning and the end of the first prefix
Slice v13("v13");
Slice v18("v18");
PutKey(db.get(), write_options, 1, 3, v13);
PutKey(db.get(), write_options, 1, 8, v18);
iter.reset(db->NewIterator(read_options));
SeekIterator(iter.get(), 1, 7);
ASSERT_TRUE(iter->Valid());
ASSERT_TRUE(v17 == iter->value());
SeekIterator(iter.get(), 1, 3);
ASSERT_TRUE(iter->Valid());
ASSERT_TRUE(v13 == iter->value());
iter->Next();
ASSERT_TRUE(iter->Valid());
ASSERT_TRUE(v15 == iter->value());
iter->Next();
ASSERT_TRUE(iter->Valid());
ASSERT_TRUE(v16 == iter->value());
iter->Next();
ASSERT_TRUE(iter->Valid());
ASSERT_TRUE(v17 == iter->value());
iter->Next();
ASSERT_TRUE(iter->Valid());
ASSERT_TRUE(v18 == iter->value());
SeekIterator(iter.get(), 0, 0);
ASSERT_TRUE(iter->Valid());
ASSERT_TRUE(v02 == iter->value());
SeekIterator(iter.get(), 2, 0);
ASSERT_TRUE(iter->Valid());
ASSERT_TRUE(v22 == iter->value());
ASSERT_EQ(v22.ToString(), Get(db.get(), read_options, 2, 2));
ASSERT_EQ(v02.ToString(), Get(db.get(), read_options, 0, 2));
ASSERT_EQ(v13.ToString(), Get(db.get(), read_options, 1, 3));
ASSERT_EQ(v15.ToString(), Get(db.get(), read_options, 1, 5));
ASSERT_EQ(v16.ToString(), Get(db.get(), read_options, 1, 6));
ASSERT_EQ(v17.ToString(), Get(db.get(), read_options, 1, 7));
ASSERT_EQ(v18.ToString(), Get(db.get(), read_options, 1, 8));
}
}
}
TEST(PrefixTest, DynamicPrefixIterator) {
while (NextOptions(FLAGS_bucket_count)) {
std::cout << "*** Mem table: " << options.memtable_factory->Name()
<< std::endl;
DestroyDB(kDbName, Options());
auto db = OpenDb();
WriteOptions write_options;
ReadOptions read_options;
std::vector<uint64_t> prefixes;
for (uint64_t i = 0; i < FLAGS_total_prefixes; ++i) {
prefixes.push_back(i);
}
if (FLAGS_random_prefix) {
std::random_shuffle(prefixes.begin(), prefixes.end());
}
HistogramImpl hist_put_time;
HistogramImpl hist_put_comparison;
// insert x random prefix, each with y continuous element.
for (auto prefix : prefixes) {
for (uint64_t sorted = 0; sorted < FLAGS_items_per_prefix; sorted++) {
TestKey test_key(prefix, sorted);
Slice key = TestKeyToSlice(test_key);
std::string value(FLAGS_value_size, 0);
perf_context.Reset();
StopWatchNano timer(Env::Default(), true);
ASSERT_OK(db->Put(write_options, key, value));
hist_put_time.Add(timer.ElapsedNanos());
hist_put_comparison.Add(perf_context.user_key_comparison_count);
}
}
std::cout << "Put key comparison: \n" << hist_put_comparison.ToString()
<< "Put time: \n" << hist_put_time.ToString();
// test seek existing keys
HistogramImpl hist_seek_time;
HistogramImpl hist_seek_comparison;
std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
for (auto prefix : prefixes) {
TestKey test_key(prefix, FLAGS_items_per_prefix / 2);
Slice key = TestKeyToSlice(test_key);
std::string value = "v" + std::to_string(0);
perf_context.Reset();
StopWatchNano timer(Env::Default(), true);
auto key_prefix = options.prefix_extractor->Transform(key);
uint64_t total_keys = 0;
for (iter->Seek(key);
iter->Valid() && iter->key().starts_with(key_prefix);
iter->Next()) {
if (FLAGS_trigger_deadlock) {
std::cout << "Behold the deadlock!\n";
db->Delete(write_options, iter->key());
}
total_keys++;
}
hist_seek_time.Add(timer.ElapsedNanos());
hist_seek_comparison.Add(perf_context.user_key_comparison_count);
ASSERT_EQ(total_keys, FLAGS_items_per_prefix - FLAGS_items_per_prefix/2);
}
std::cout << "Seek key comparison: \n"
<< hist_seek_comparison.ToString()
<< "Seek time: \n"
<< hist_seek_time.ToString();
// test non-existing keys
HistogramImpl hist_no_seek_time;
HistogramImpl hist_no_seek_comparison;
for (auto prefix = FLAGS_total_prefixes;
prefix < FLAGS_total_prefixes + 10000;
prefix++) {
TestKey test_key(prefix, 0);
Slice key = TestKeyToSlice(test_key);
perf_context.Reset();
StopWatchNano timer(Env::Default(), true);
iter->Seek(key);
hist_no_seek_time.Add(timer.ElapsedNanos());
hist_no_seek_comparison.Add(perf_context.user_key_comparison_count);
ASSERT_TRUE(!iter->Valid());
}
std::cout << "non-existing Seek key comparison: \n"
<< hist_no_seek_comparison.ToString()
<< "non-existing Seek time: \n"
<< hist_no_seek_time.ToString();
}
}
}
int main(int argc, char** argv) {
ParseCommandLineFlags(&argc, &argv, true);
std::cout << kDbName << "\n";
rocksdb::test::RunAllTests();
return 0;
}
#endif // GFLAGS

403
db/repair.cc Normal file
View File

@@ -0,0 +1,403 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// We recover the contents of the descriptor from the other files we find.
// (1) Any log files are first converted to tables
// (2) We scan every table to compute
// (a) smallest/largest for the table
// (b) largest sequence number in the table
// (3) We generate descriptor contents:
// - log number is set to zero
// - next-file-number is set to 1 + largest file number we found
// - last-sequence-number is set to largest sequence# found across
// all tables (see 2c)
// - compaction pointers are cleared
// - every table file is added at level 0
//
// Possible optimization 1:
// (a) Compute total size and use to pick appropriate max-level M
// (b) Sort tables by largest sequence# in the table
// (c) For each table: if it overlaps earlier table, place in level-0,
// else place in level-M.
// Possible optimization 2:
// Store per-table metadata (smallest, largest, largest-seq#, ...)
// in the table's meta section to speed up ScanTable.
#ifndef ROCKSDB_LITE
#include "db/builder.h"
#include "db/db_impl.h"
#include "db/dbformat.h"
#include "db/filename.h"
#include "db/log_reader.h"
#include "db/log_writer.h"
#include "db/memtable.h"
#include "db/table_cache.h"
#include "db/version_edit.h"
#include "db/write_batch_internal.h"
#include "rocksdb/comparator.h"
#include "rocksdb/db.h"
#include "rocksdb/env.h"
namespace rocksdb {
namespace {
class Repairer {
public:
Repairer(const std::string& dbname, const Options& options)
: dbname_(dbname),
env_(options.env),
icmp_(options.comparator),
ipolicy_(options.filter_policy),
options_(SanitizeOptions(dbname, &icmp_, &ipolicy_, options)),
raw_table_cache_(
// TableCache can be small since we expect each table to be opened
// once.
NewLRUCache(10, options_.table_cache_numshardbits,
options_.table_cache_remove_scan_count_limit)),
next_file_number_(1) {
table_cache_ = new TableCache(dbname_, &options_, storage_options_,
raw_table_cache_.get());
edit_ = new VersionEdit();
}
~Repairer() {
delete table_cache_;
raw_table_cache_.reset();
delete edit_;
}
Status Run() {
Status status = FindFiles();
if (status.ok()) {
ConvertLogFilesToTables();
ExtractMetaData();
status = WriteDescriptor();
}
if (status.ok()) {
unsigned long long bytes = 0;
for (size_t i = 0; i < tables_.size(); i++) {
bytes += tables_[i].meta.file_size;
}
Log(options_.info_log,
"**** Repaired rocksdb %s; "
"recovered %d files; %llu bytes. "
"Some data may have been lost. "
"****",
dbname_.c_str(),
static_cast<int>(tables_.size()),
bytes);
}
return status;
}
private:
struct TableInfo {
FileMetaData meta;
SequenceNumber min_sequence;
SequenceNumber max_sequence;
};
std::string const dbname_;
Env* const env_;
InternalKeyComparator const icmp_;
InternalFilterPolicy const ipolicy_;
Options const options_;
std::shared_ptr<Cache> raw_table_cache_;
TableCache* table_cache_;
VersionEdit* edit_;
std::vector<std::string> manifests_;
std::vector<uint64_t> table_numbers_;
std::vector<uint64_t> logs_;
std::vector<TableInfo> tables_;
uint64_t next_file_number_;
const EnvOptions storage_options_;
Status FindFiles() {
std::vector<std::string> filenames;
Status status = env_->GetChildren(dbname_, &filenames);
if (!status.ok()) {
return status;
}
if (filenames.empty()) {
return Status::Corruption(dbname_, "repair found no files");
}
uint64_t number;
FileType type;
for (size_t i = 0; i < filenames.size(); i++) {
if (ParseFileName(filenames[i], &number, &type)) {
if (type == kDescriptorFile) {
manifests_.push_back(filenames[i]);
} else {
if (number + 1 > next_file_number_) {
next_file_number_ = number + 1;
}
if (type == kLogFile) {
logs_.push_back(number);
} else if (type == kTableFile) {
table_numbers_.push_back(number);
} else {
// Ignore other files
}
}
}
}
return status;
}
void ConvertLogFilesToTables() {
for (size_t i = 0; i < logs_.size(); i++) {
std::string logname = LogFileName(dbname_, logs_[i]);
Status status = ConvertLogToTable(logs_[i]);
if (!status.ok()) {
Log(options_.info_log, "Log #%llu: ignoring conversion error: %s",
(unsigned long long) logs_[i],
status.ToString().c_str());
}
ArchiveFile(logname);
}
}
Status ConvertLogToTable(uint64_t log) {
struct LogReporter : public log::Reader::Reporter {
Env* env;
std::shared_ptr<Logger> info_log;
uint64_t lognum;
virtual void Corruption(size_t bytes, const Status& s) {
// We print error messages for corruption, but continue repairing.
Log(info_log, "Log #%llu: dropping %d bytes; %s",
(unsigned long long) lognum,
static_cast<int>(bytes),
s.ToString().c_str());
}
};
// Open the log file
std::string logname = LogFileName(dbname_, log);
unique_ptr<SequentialFile> lfile;
Status status = env_->NewSequentialFile(logname, &lfile, storage_options_);
if (!status.ok()) {
return status;
}
// Create the log reader.
LogReporter reporter;
reporter.env = env_;
reporter.info_log = options_.info_log;
reporter.lognum = log;
// We intentially make log::Reader do checksumming so that
// corruptions cause entire commits to be skipped instead of
// propagating bad information (like overly large sequence
// numbers).
log::Reader reader(std::move(lfile), &reporter, false/*do not checksum*/,
0/*initial_offset*/);
// Read all the records and add to a memtable
std::string scratch;
Slice record;
WriteBatch batch;
MemTable* mem = new MemTable(icmp_, options_);
auto cf_mems_default = new ColumnFamilyMemTablesDefault(mem, &options_);
mem->Ref();
int counter = 0;
while (reader.ReadRecord(&record, &scratch)) {
if (record.size() < 12) {
reporter.Corruption(
record.size(), Status::Corruption("log record too small"));
continue;
}
WriteBatchInternal::SetContents(&batch, record);
status = WriteBatchInternal::InsertInto(&batch, cf_mems_default);
if (status.ok()) {
counter += WriteBatchInternal::Count(&batch);
} else {
Log(options_.info_log, "Log #%llu: ignoring %s",
(unsigned long long) log,
status.ToString().c_str());
status = Status::OK(); // Keep going with rest of file
}
}
// Do not record a version edit for this conversion to a Table
// since ExtractMetaData() will also generate edits.
FileMetaData meta;
meta.number = next_file_number_++;
ReadOptions ro;
Iterator* iter = mem->NewIterator(ro, true /* enforce_total_order */);
status = BuildTable(dbname_, env_, options_, storage_options_, table_cache_,
iter, &meta, icmp_, 0, 0, kNoCompression);
delete iter;
delete mem->Unref();
delete cf_mems_default;
mem = nullptr;
if (status.ok()) {
if (meta.file_size > 0) {
table_numbers_.push_back(meta.number);
}
}
Log(options_.info_log, "Log #%llu: %d ops saved to Table #%llu %s",
(unsigned long long) log,
counter,
(unsigned long long) meta.number,
status.ToString().c_str());
return status;
}
void ExtractMetaData() {
for (size_t i = 0; i < table_numbers_.size(); i++) {
TableInfo t;
t.meta.number = table_numbers_[i];
Status status = ScanTable(&t);
if (!status.ok()) {
std::string fname = TableFileName(dbname_, table_numbers_[i]);
Log(options_.info_log, "Table #%llu: ignoring %s",
(unsigned long long) table_numbers_[i],
status.ToString().c_str());
ArchiveFile(fname);
} else {
tables_.push_back(t);
}
}
}
Status ScanTable(TableInfo* t) {
std::string fname = TableFileName(dbname_, t->meta.number);
int counter = 0;
Status status = env_->GetFileSize(fname, &t->meta.file_size);
if (status.ok()) {
FileMetaData dummy_meta(t->meta.number, t->meta.file_size);
Iterator* iter = table_cache_->NewIterator(
ReadOptions(), storage_options_, icmp_, dummy_meta);
bool empty = true;
ParsedInternalKey parsed;
t->min_sequence = 0;
t->max_sequence = 0;
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
Slice key = iter->key();
if (!ParseInternalKey(key, &parsed)) {
Log(options_.info_log, "Table #%llu: unparsable key %s",
(unsigned long long) t->meta.number,
EscapeString(key).c_str());
continue;
}
counter++;
if (empty) {
empty = false;
t->meta.smallest.DecodeFrom(key);
}
t->meta.largest.DecodeFrom(key);
if (parsed.sequence < t->min_sequence) {
t->min_sequence = parsed.sequence;
}
if (parsed.sequence > t->max_sequence) {
t->max_sequence = parsed.sequence;
}
}
if (!iter->status().ok()) {
status = iter->status();
}
delete iter;
}
Log(options_.info_log, "Table #%llu: %d entries %s",
(unsigned long long) t->meta.number,
counter,
status.ToString().c_str());
return status;
}
Status WriteDescriptor() {
std::string tmp = TempFileName(dbname_, 1);
unique_ptr<WritableFile> file;
Status status = env_->NewWritableFile(
tmp, &file, env_->OptimizeForManifestWrite(storage_options_));
if (!status.ok()) {
return status;
}
SequenceNumber max_sequence = 0;
for (size_t i = 0; i < tables_.size(); i++) {
if (max_sequence < tables_[i].max_sequence) {
max_sequence = tables_[i].max_sequence;
}
}
edit_->SetComparatorName(icmp_.user_comparator()->Name());
edit_->SetLogNumber(0);
edit_->SetNextFile(next_file_number_);
edit_->SetLastSequence(max_sequence);
for (size_t i = 0; i < tables_.size(); i++) {
// TODO(opt): separate out into multiple levels
const TableInfo& t = tables_[i];
edit_->AddFile(0, t.meta.number, t.meta.file_size,
t.meta.smallest, t.meta.largest,
t.min_sequence, t.max_sequence);
}
//fprintf(stderr, "NewDescriptor:\n%s\n", edit_.DebugString().c_str());
{
log::Writer log(std::move(file));
std::string record;
edit_->EncodeTo(&record);
status = log.AddRecord(record);
}
if (!status.ok()) {
env_->DeleteFile(tmp);
} else {
// Discard older manifests
for (size_t i = 0; i < manifests_.size(); i++) {
ArchiveFile(dbname_ + "/" + manifests_[i]);
}
// Install new manifest
status = env_->RenameFile(tmp, DescriptorFileName(dbname_, 1));
if (status.ok()) {
status = SetCurrentFile(env_, dbname_, 1, nullptr);
} else {
env_->DeleteFile(tmp);
}
}
return status;
}
void ArchiveFile(const std::string& fname) {
// Move into another directory. E.g., for
// dir/foo
// rename to
// dir/lost/foo
const char* slash = strrchr(fname.c_str(), '/');
std::string new_dir;
if (slash != nullptr) {
new_dir.assign(fname.data(), slash - fname.data());
}
new_dir.append("/lost");
env_->CreateDir(new_dir); // Ignore error
std::string new_file = new_dir;
new_file.append("/");
new_file.append((slash == nullptr) ? fname.c_str() : slash + 1);
Status s = env_->RenameFile(fname, new_file);
Log(options_.info_log, "Archiving %s: %s\n",
fname.c_str(), s.ToString().c_str());
}
};
} // namespace
Status RepairDB(const std::string& dbname, const Options& options) {
Repairer repairer(dbname, options);
return repairer.Run();
}
} // namespace rocksdb
#endif // ROCKSDB_LITE

800
db/simple_table_db_test.cc Normal file
View File

@@ -0,0 +1,800 @@
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include <algorithm>
#include <set>
#include "rocksdb/db.h"
#include "rocksdb/filter_policy.h"
#include "db/db_impl.h"
#include "db/filename.h"
#include "db/version_set.h"
#include "db/write_batch_internal.h"
#include "rocksdb/statistics.h"
#include "rocksdb/cache.h"
#include "rocksdb/compaction_filter.h"
#include "rocksdb/env.h"
#include "rocksdb/table.h"
#include "rocksdb/table_properties.h"
#include "table/table_builder.h"
#include "util/hash.h"
#include "util/logging.h"
#include "util/mutexlock.h"
#include "util/testharness.h"
#include "util/testutil.h"
#include "utilities/merge_operators.h"
using std::unique_ptr;
// IS THIS FILE STILL NEEDED?
namespace rocksdb {
// SimpleTable is a simple table format for UNIT TEST ONLY. It is not built
// as production quality.
// SimpleTable requires the input key size to be fixed 16 bytes, value cannot
// be longer than 150000 bytes and stored data on disk in this format:
// +--------------------------------------------+ <= key1 offset
// | key1 | value_size (4 bytes) | |
// +----------------------------------------+ |
// | value1 |
// | |
// +----------------------------------------+---+ <= key2 offset
// | key2 | value_size (4 bytes) | |
// +----------------------------------------+ |
// | value2 |
// | |
// | ...... |
// +-----------------+--------------------------+ <= index_block_offset
// | key1 | key1 offset (8 bytes) |
// +-----------------+--------------------------+
// | key2 | key2 offset (8 bytes) |
// +-----------------+--------------------------+
// | key3 | key3 offset (8 bytes) |
// +-----------------+--------------------------+
// | ...... |
// +-----------------+------------+-------------+
// | index_block_offset (8 bytes) |
// +------------------------------+
// SimpleTable is a simple table format for UNIT TEST ONLY. It is not built
// as production quality.
class SimpleTableReader: public TableReader {
public:
// Attempt to open the table that is stored in bytes [0..file_size)
// of "file", and read the metadata entries necessary to allow
// retrieving data from the table.
//
// If successful, returns ok and sets "*table" to the newly opened
// table. The client should delete "*table" when no longer needed.
// If there was an error while initializing the table, sets "*table"
// to nullptr and returns a non-ok status. Does not take ownership of
// "*source", but the client must ensure that "source" remains live
// for the duration of the returned table's lifetime.
//
// *file must remain live while this Table is in use.
static Status Open(const Options& options, const EnvOptions& soptions,
unique_ptr<RandomAccessFile> && file, uint64_t file_size,
unique_ptr<TableReader>* table_reader);
Iterator* NewIterator(const ReadOptions&, Arena* arena) override;
Status Get(const ReadOptions&, const Slice& key, void* arg,
bool (*handle_result)(void* arg, const ParsedInternalKey& k,
const Slice& v, bool),
void (*mark_key_may_exist)(void*) = nullptr) override;
uint64_t ApproximateOffsetOf(const Slice& key) override;
void SetupForCompaction() override;
std::shared_ptr<const TableProperties> GetTableProperties() const override;
~SimpleTableReader();
private:
struct Rep;
Rep* rep_;
explicit SimpleTableReader(Rep* rep) {
rep_ = rep;
}
friend class TableCache;
friend class SimpleTableIterator;
Status GetOffset(const Slice& target, uint64_t* offset);
// No copying allowed
explicit SimpleTableReader(const TableReader&) = delete;
void operator=(const TableReader&) = delete;
};
// Iterator to iterate SimpleTable
class SimpleTableIterator: public Iterator {
public:
explicit SimpleTableIterator(SimpleTableReader* table);
~SimpleTableIterator();
bool Valid() const;
void SeekToFirst();
void SeekToLast();
void Seek(const Slice& target);
void Next();
void Prev();
Slice key() const;
Slice value() const;
Status status() const;
private:
SimpleTableReader* table_;
uint64_t offset_;
uint64_t next_offset_;
Slice key_;
Slice value_;
char tmp_str_[4];
char* key_str_;
char* value_str_;
int value_str_len_;
Status status_;
// No copying allowed
SimpleTableIterator(const SimpleTableIterator&) = delete;
void operator=(const Iterator&) = delete;
};
struct SimpleTableReader::Rep {
~Rep() {
}
Rep(const EnvOptions& storage_options, uint64_t index_start_offset,
int num_entries) :
soptions(storage_options), index_start_offset(index_start_offset),
num_entries(num_entries) {
}
Options options;
const EnvOptions& soptions;
Status status;
unique_ptr<RandomAccessFile> file;
uint64_t index_start_offset;
int num_entries;
std::shared_ptr<TableProperties> table_properties;
const static int user_key_size = 16;
const static int offset_length = 8;
const static int key_footer_len = 8;
static int GetInternalKeyLength() {
return user_key_size + key_footer_len;
}
};
SimpleTableReader::~SimpleTableReader() {
delete rep_;
}
Status SimpleTableReader::Open(const Options& options,
const EnvOptions& soptions,
unique_ptr<RandomAccessFile> && file,
uint64_t size,
unique_ptr<TableReader>* table_reader) {
char footer_space[Rep::offset_length];
Slice footer_input;
Status s = file->Read(size - Rep::offset_length, Rep::offset_length,
&footer_input, footer_space);
if (s.ok()) {
uint64_t index_start_offset = DecodeFixed64(footer_space);
int num_entries = (size - Rep::offset_length - index_start_offset)
/ (Rep::GetInternalKeyLength() + Rep::offset_length);
SimpleTableReader::Rep* rep = new SimpleTableReader::Rep(soptions,
index_start_offset,
num_entries);
rep->file = std::move(file);
rep->options = options;
table_reader->reset(new SimpleTableReader(rep));
}
return s;
}
void SimpleTableReader::SetupForCompaction() {
}
std::shared_ptr<const TableProperties> SimpleTableReader::GetTableProperties()
const {
return rep_->table_properties;
}
Iterator* SimpleTableReader::NewIterator(const ReadOptions& options,
Arena* arena) {
if (arena == nullptr) {
return new SimpleTableIterator(this);
} else {
auto mem = arena->AllocateAligned(sizeof(SimpleTableIterator));
return new (mem) SimpleTableIterator(this);
}
}
Status SimpleTableReader::GetOffset(const Slice& target, uint64_t* offset) {
uint32_t left = 0;
uint32_t right = rep_->num_entries - 1;
char key_chars[Rep::GetInternalKeyLength()];
Slice tmp_slice;
uint32_t target_offset = 0;
while (left <= right) {
uint32_t mid = (left + right + 1) / 2;
uint64_t offset_to_read = rep_->index_start_offset
+ (Rep::GetInternalKeyLength() + Rep::offset_length) * mid;
Status s = rep_->file->Read(offset_to_read, Rep::GetInternalKeyLength(),
&tmp_slice, key_chars);
if (!s.ok()) {
return s;
}
InternalKeyComparator ikc(rep_->options.comparator);
int compare_result = ikc.Compare(tmp_slice, target);
if (compare_result < 0) {
if (left == right) {
target_offset = right + 1;
break;
}
left = mid;
} else {
if (left == right) {
target_offset = left;
break;
}
right = mid - 1;
}
}
if (target_offset >= (uint32_t) rep_->num_entries) {
*offset = rep_->index_start_offset;
return Status::OK();
}
char value_offset_chars[Rep::offset_length];
int64_t offset_for_value_offset = rep_->index_start_offset
+ (Rep::GetInternalKeyLength() + Rep::offset_length) * target_offset
+ Rep::GetInternalKeyLength();
Status s = rep_->file->Read(offset_for_value_offset, Rep::offset_length,
&tmp_slice, value_offset_chars);
if (s.ok()) {
*offset = DecodeFixed64(value_offset_chars);
}
return s;
}
Status SimpleTableReader::Get(const ReadOptions& options, const Slice& k,
void* arg,
bool (*saver)(void*, const ParsedInternalKey&,
const Slice&, bool),
void (*mark_key_may_exist)(void*)) {
Status s;
SimpleTableIterator* iter = new SimpleTableIterator(this);
for (iter->Seek(k); iter->Valid(); iter->Next()) {
ParsedInternalKey parsed_key;
if (!ParseInternalKey(iter->key(), &parsed_key)) {
return Status::Corruption(Slice());
}
if (!(*saver)(arg, parsed_key, iter->value(), true)) {
break;
}
}
s = iter->status();
delete iter;
return s;
}
uint64_t SimpleTableReader::ApproximateOffsetOf(const Slice& key) {
return 0;
}
SimpleTableIterator::SimpleTableIterator(SimpleTableReader* table) :
table_(table) {
key_str_ = new char[SimpleTableReader::Rep::GetInternalKeyLength()];
value_str_len_ = -1;
SeekToFirst();
}
SimpleTableIterator::~SimpleTableIterator() {
delete[] key_str_;
if (value_str_len_ >= 0) {
delete[] value_str_;
}
}
bool SimpleTableIterator::Valid() const {
return offset_ < table_->rep_->index_start_offset;
}
void SimpleTableIterator::SeekToFirst() {
next_offset_ = 0;
Next();
}
void SimpleTableIterator::SeekToLast() {
assert(false);
}
void SimpleTableIterator::Seek(const Slice& target) {
Status s = table_->GetOffset(target, &next_offset_);
if (!s.ok()) {
status_ = s;
}
Next();
}
void SimpleTableIterator::Next() {
offset_ = next_offset_;
if (offset_ >= table_->rep_->index_start_offset) {
return;
}
Slice result;
int internal_key_size = SimpleTableReader::Rep::GetInternalKeyLength();
Status s = table_->rep_->file->Read(next_offset_, internal_key_size, &result,
key_str_);
next_offset_ += internal_key_size;
key_ = result;
Slice value_size_slice;
s = table_->rep_->file->Read(next_offset_, 4, &value_size_slice, tmp_str_);
next_offset_ += 4;
uint32_t value_size = DecodeFixed32(tmp_str_);
Slice value_slice;
if ((int) value_size > value_str_len_) {
if (value_str_len_ >= 0) {
delete[] value_str_;
}
value_str_ = new char[value_size];
value_str_len_ = value_size;
}
s = table_->rep_->file->Read(next_offset_, value_size, &value_slice,
value_str_);
next_offset_ += value_size;
value_ = value_slice;
}
void SimpleTableIterator::Prev() {
assert(false);
}
Slice SimpleTableIterator::key() const {
Log(table_->rep_->options.info_log, "key!!!!");
return key_;
}
Slice SimpleTableIterator::value() const {
return value_;
}
Status SimpleTableIterator::status() const {
return status_;
}
class SimpleTableBuilder: public TableBuilder {
public:
// Create a builder that will store the contents of the table it is
// building in *file. Does not close the file. It is up to the
// caller to close the file after calling Finish(). The output file
// will be part of level specified by 'level'. A value of -1 means
// that the caller does not know which level the output file will reside.
SimpleTableBuilder(const Options& options, WritableFile* file,
CompressionType compression_type);
// REQUIRES: Either Finish() or Abandon() has been called.
~SimpleTableBuilder();
// Add key,value to the table being constructed.
// REQUIRES: key is after any previously added key according to comparator.
// REQUIRES: Finish(), Abandon() have not been called
void Add(const Slice& key, const Slice& value) override;
// Return non-ok iff some error has been detected.
Status status() const override;
// Finish building the table. Stops using the file passed to the
// constructor after this function returns.
// REQUIRES: Finish(), Abandon() have not been called
Status Finish() override;
// Indicate that the contents of this builder should be abandoned. Stops
// using the file passed to the constructor after this function returns.
// If the caller is not going to call Finish(), it must call Abandon()
// before destroying this builder.
// REQUIRES: Finish(), Abandon() have not been called
void Abandon() override;
// Number of calls to Add() so far.
uint64_t NumEntries() const override;
// Size of the file generated so far. If invoked after a successful
// Finish() call, returns the size of the final generated file.
uint64_t FileSize() const override;
private:
struct Rep;
Rep* rep_;
// No copying allowed
SimpleTableBuilder(const SimpleTableBuilder&) = delete;
void operator=(const SimpleTableBuilder&) = delete;
};
struct SimpleTableBuilder::Rep {
Options options;
WritableFile* file;
uint64_t offset = 0;
Status status;
uint64_t num_entries = 0;
bool closed = false; // Either Finish() or Abandon() has been called.
const static int user_key_size = 16;
const static int offset_length = 8;
const static int key_footer_len = 8;
static int GetInternalKeyLength() {
return user_key_size + key_footer_len;
}
std::string index;
Rep(const Options& opt, WritableFile* f) :
options(opt), file(f) {
}
~Rep() {
}
};
SimpleTableBuilder::SimpleTableBuilder(const Options& options,
WritableFile* file,
CompressionType compression_type) :
rep_(new SimpleTableBuilder::Rep(options, file)) {
}
SimpleTableBuilder::~SimpleTableBuilder() {
delete (rep_);
}
void SimpleTableBuilder::Add(const Slice& key, const Slice& value) {
assert((int ) key.size() == Rep::GetInternalKeyLength());
// Update index
rep_->index.append(key.data(), key.size());
PutFixed64(&(rep_->index), rep_->offset);
// Write key-value pair
rep_->file->Append(key);
rep_->offset += Rep::GetInternalKeyLength();
std::string size;
int value_size = value.size();
PutFixed32(&size, value_size);
Slice sizeSlice(size);
rep_->file->Append(sizeSlice);
rep_->file->Append(value);
rep_->offset += value_size + 4;
rep_->num_entries++;
}
Status SimpleTableBuilder::status() const {
return Status::OK();
}
Status SimpleTableBuilder::Finish() {
Rep* r = rep_;
assert(!r->closed);
r->closed = true;
uint64_t index_offset = rep_->offset;
Slice index_slice(rep_->index);
rep_->file->Append(index_slice);
rep_->offset += index_slice.size();
std::string index_offset_str;
PutFixed64(&index_offset_str, index_offset);
Slice foot_slice(index_offset_str);
rep_->file->Append(foot_slice);
rep_->offset += foot_slice.size();
return Status::OK();
}
void SimpleTableBuilder::Abandon() {
rep_->closed = true;
}
uint64_t SimpleTableBuilder::NumEntries() const {
return rep_->num_entries;
}
uint64_t SimpleTableBuilder::FileSize() const {
return rep_->offset;
}
class SimpleTableFactory: public TableFactory {
public:
~SimpleTableFactory() {
}
SimpleTableFactory() {
}
const char* Name() const override {
return "SimpleTable";
}
Status NewTableReader(const Options& options, const EnvOptions& soptions,
const InternalKeyComparator& internal_key,
unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
unique_ptr<TableReader>* table_reader) const;
TableBuilder* NewTableBuilder(const Options& options,
const InternalKeyComparator& internal_key,
WritableFile* file,
CompressionType compression_type) const;
};
Status SimpleTableFactory::NewTableReader(
const Options& options, const EnvOptions& soptions,
const InternalKeyComparator& internal_key,
unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
unique_ptr<TableReader>* table_reader) const {
return SimpleTableReader::Open(options, soptions, std::move(file), file_size,
table_reader);
}
TableBuilder* SimpleTableFactory::NewTableBuilder(
const Options& options, const InternalKeyComparator& internal_key,
WritableFile* file, CompressionType compression_type) const {
return new SimpleTableBuilder(options, file, compression_type);
}
class SimpleTableDBTest {
protected:
public:
std::string dbname_;
Env* env_;
DB* db_;
Options last_options_;
SimpleTableDBTest() :
env_(Env::Default()) {
dbname_ = test::TmpDir() + "/simple_table_db_test";
ASSERT_OK(DestroyDB(dbname_, Options()));
db_ = nullptr;
Reopen();
}
~SimpleTableDBTest() {
delete db_;
ASSERT_OK(DestroyDB(dbname_, Options()));
}
// Return the current option configuration.
Options CurrentOptions() {
Options options;
options.table_factory.reset(new SimpleTableFactory());
return options;
}
DBImpl* dbfull() {
return reinterpret_cast<DBImpl*>(db_);
}
void Reopen(Options* options = nullptr) {
ASSERT_OK(TryReopen(options));
}
void Close() {
delete db_;
db_ = nullptr;
}
void DestroyAndReopen(Options* options = nullptr) {
//Destroy using last options
Destroy(&last_options_);
ASSERT_OK(TryReopen(options));
}
void Destroy(Options* options) {
delete db_;
db_ = nullptr;
ASSERT_OK(DestroyDB(dbname_, *options));
}
Status PureReopen(Options* options, DB** db) {
return DB::Open(*options, dbname_, db);
}
Status TryReopen(Options* options = nullptr) {
delete db_;
db_ = nullptr;
Options opts;
if (options != nullptr) {
opts = *options;
} else {
opts = CurrentOptions();
opts.create_if_missing = true;
}
last_options_ = opts;
return DB::Open(opts, dbname_, &db_);
}
Status Put(const Slice& k, const Slice& v) {
return db_->Put(WriteOptions(), k, v);
}
Status Delete(const std::string& k) {
return db_->Delete(WriteOptions(), k);
}
std::string Get(const std::string& k, const Snapshot* snapshot = nullptr) {
ReadOptions options;
options.snapshot = snapshot;
std::string result;
Status s = db_->Get(options, k, &result);
if (s.IsNotFound()) {
result = "NOT_FOUND";
} else if (!s.ok()) {
result = s.ToString();
}
return result;
}
int NumTableFilesAtLevel(int level) {
std::string property;
ASSERT_TRUE(
db_->GetProperty("rocksdb.num-files-at-level" + NumberToString(level),
&property));
return atoi(property.c_str());
}
// Return spread of files per level
std::string FilesPerLevel() {
std::string result;
int last_non_zero_offset = 0;
for (int level = 0; level < db_->NumberLevels(); level++) {
int f = NumTableFilesAtLevel(level);
char buf[100];
snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f);
result += buf;
if (f > 0) {
last_non_zero_offset = result.size();
}
}
result.resize(last_non_zero_offset);
return result;
}
std::string IterStatus(Iterator* iter) {
std::string result;
if (iter->Valid()) {
result = iter->key().ToString() + "->" + iter->value().ToString();
} else {
result = "(invalid)";
}
return result;
}
};
TEST(SimpleTableDBTest, Empty) {
ASSERT_TRUE(db_ != nullptr);
ASSERT_EQ("NOT_FOUND", Get("0000000000000foo"));
}
TEST(SimpleTableDBTest, ReadWrite) {
ASSERT_OK(Put("0000000000000foo", "v1"));
ASSERT_EQ("v1", Get("0000000000000foo"));
ASSERT_OK(Put("0000000000000bar", "v2"));
ASSERT_OK(Put("0000000000000foo", "v3"));
ASSERT_EQ("v3", Get("0000000000000foo"));
ASSERT_EQ("v2", Get("0000000000000bar"));
}
TEST(SimpleTableDBTest, Flush) {
ASSERT_OK(Put("0000000000000foo", "v1"));
ASSERT_OK(Put("0000000000000bar", "v2"));
ASSERT_OK(Put("0000000000000foo", "v3"));
dbfull()->TEST_FlushMemTable();
ASSERT_EQ("v3", Get("0000000000000foo"));
ASSERT_EQ("v2", Get("0000000000000bar"));
}
TEST(SimpleTableDBTest, Flush2) {
ASSERT_OK(Put("0000000000000bar", "b"));
ASSERT_OK(Put("0000000000000foo", "v1"));
dbfull()->TEST_FlushMemTable();
ASSERT_OK(Put("0000000000000foo", "v2"));
dbfull()->TEST_FlushMemTable();
ASSERT_EQ("v2", Get("0000000000000foo"));
ASSERT_OK(Put("0000000000000eee", "v3"));
dbfull()->TEST_FlushMemTable();
ASSERT_EQ("v3", Get("0000000000000eee"));
ASSERT_OK(Delete("0000000000000bar"));
dbfull()->TEST_FlushMemTable();
ASSERT_EQ("NOT_FOUND", Get("0000000000000bar"));
ASSERT_OK(Put("0000000000000eee", "v5"));
dbfull()->TEST_FlushMemTable();
ASSERT_EQ("v5", Get("0000000000000eee"));
}
static std::string Key(int i) {
char buf[100];
snprintf(buf, sizeof(buf), "key_______%06d", i);
return std::string(buf);
}
static std::string RandomString(Random* rnd, int len) {
std::string r;
test::RandomString(rnd, len, &r);
return r;
}
TEST(SimpleTableDBTest, CompactionTrigger) {
Options options = CurrentOptions();
options.write_buffer_size = 100 << 10; //100KB
options.num_levels = 3;
options.max_mem_compaction_level = 0;
options.level0_file_num_compaction_trigger = 3;
Reopen(&options);
Random rnd(301);
for (int num = 0; num < options.level0_file_num_compaction_trigger - 1;
num++) {
std::vector<std::string> values;
// Write 120KB (12 values, each 10K)
for (int i = 0; i < 12; i++) {
values.push_back(RandomString(&rnd, 10000));
ASSERT_OK(Put(Key(i), values[i]));
}
dbfull()->TEST_WaitForFlushMemTable();
ASSERT_EQ(NumTableFilesAtLevel(0), num + 1);
}
//generate one more file in level-0, and should trigger level-0 compaction
std::vector<std::string> values;
for (int i = 0; i < 12; i++) {
values.push_back(RandomString(&rnd, 10000));
ASSERT_OK(Put(Key(i), values[i]));
}
dbfull()->TEST_WaitForCompact();
ASSERT_EQ(NumTableFilesAtLevel(0), 0);
ASSERT_EQ(NumTableFilesAtLevel(1), 1);
}
} // namespace rocksdb
int main(int argc, char** argv) {
return rocksdb::test::RunAllTests();
}

429
db/skiplist.h Normal file
View File

@@ -0,0 +1,429 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// Thread safety
// -------------
//
// Writes require external synchronization, most likely a mutex.
// Reads require a guarantee that the SkipList will not be destroyed
// while the read is in progress. Apart from that, reads progress
// without any internal locking or synchronization.
//
// Invariants:
//
// (1) Allocated nodes are never deleted until the SkipList is
// destroyed. This is trivially guaranteed by the code since we
// never delete any skip list nodes.
//
// (2) The contents of a Node except for the next/prev pointers are
// immutable after the Node has been linked into the SkipList.
// Only Insert() modifies the list, and it is careful to initialize
// a node and use release-stores to publish the nodes in one or
// more lists.
//
// ... prev vs. next pointer ordering ...
//
#pragma once
#include <assert.h>
#include <stdlib.h>
#include "util/arena.h"
#include "port/port.h"
#include "util/arena.h"
#include "util/random.h"
namespace rocksdb {
template<typename Key, class Comparator>
class SkipList {
private:
struct Node;
public:
// Create a new SkipList object that will use "cmp" for comparing keys,
// and will allocate memory using "*arena". Objects allocated in the arena
// must remain allocated for the lifetime of the skiplist object.
explicit SkipList(Comparator cmp, Arena* arena,
int32_t max_height = 12, int32_t branching_factor = 4);
// Insert key into the list.
// REQUIRES: nothing that compares equal to key is currently in the list.
void Insert(const Key& key);
// Returns true iff an entry that compares equal to key is in the list.
bool Contains(const Key& key) const;
// Iteration over the contents of a skip list
class Iterator {
public:
// Initialize an iterator over the specified list.
// The returned iterator is not valid.
explicit Iterator(const SkipList* list);
// Change the underlying skiplist used for this iterator
// This enables us not changing the iterator without deallocating
// an old one and then allocating a new one
void SetList(const SkipList* list);
// Returns true iff the iterator is positioned at a valid node.
bool Valid() const;
// Returns the key at the current position.
// REQUIRES: Valid()
const Key& key() const;
// Advances to the next position.
// REQUIRES: Valid()
void Next();
// Advances to the previous position.
// REQUIRES: Valid()
void Prev();
// Advance to the first entry with a key >= target
void Seek(const Key& target);
// Position at the first entry in list.
// Final state of iterator is Valid() iff list is not empty.
void SeekToFirst();
// Position at the last entry in list.
// Final state of iterator is Valid() iff list is not empty.
void SeekToLast();
private:
const SkipList* list_;
Node* node_;
// Intentionally copyable
};
private:
const int32_t kMaxHeight_;
const int32_t kBranching_;
// Immutable after construction
Comparator const compare_;
Arena* const arena_; // Arena used for allocations of nodes
Node* const head_;
// Modified only by Insert(). Read racily by readers, but stale
// values are ok.
port::AtomicPointer max_height_; // Height of the entire list
// Used for optimizing sequential insert patterns
Node** prev_;
int32_t prev_height_;
inline int GetMaxHeight() const {
return static_cast<int>(
reinterpret_cast<intptr_t>(max_height_.NoBarrier_Load()));
}
// Read/written only by Insert().
Random rnd_;
Node* NewNode(const Key& key, int height);
int RandomHeight();
bool Equal(const Key& a, const Key& b) const { return (compare_(a, b) == 0); }
// Return true if key is greater than the data stored in "n"
bool KeyIsAfterNode(const Key& key, Node* n) const;
// Return the earliest node that comes at or after key.
// Return nullptr if there is no such node.
//
// If prev is non-nullptr, fills prev[level] with pointer to previous
// node at "level" for every level in [0..max_height_-1].
Node* FindGreaterOrEqual(const Key& key, Node** prev) const;
// Return the latest node with a key < key.
// Return head_ if there is no such node.
Node* FindLessThan(const Key& key) const;
// Return the last node in the list.
// Return head_ if list is empty.
Node* FindLast() const;
// No copying allowed
SkipList(const SkipList&);
void operator=(const SkipList&);
};
// Implementation details follow
template<typename Key, class Comparator>
struct SkipList<Key, Comparator>::Node {
explicit Node(const Key& k) : key(k) { }
Key const key;
// Accessors/mutators for links. Wrapped in methods so we can
// add the appropriate barriers as necessary.
Node* Next(int n) {
assert(n >= 0);
// Use an 'acquire load' so that we observe a fully initialized
// version of the returned Node.
return reinterpret_cast<Node*>(next_[n].Acquire_Load());
}
void SetNext(int n, Node* x) {
assert(n >= 0);
// Use a 'release store' so that anybody who reads through this
// pointer observes a fully initialized version of the inserted node.
next_[n].Release_Store(x);
}
// No-barrier variants that can be safely used in a few locations.
Node* NoBarrier_Next(int n) {
assert(n >= 0);
return reinterpret_cast<Node*>(next_[n].NoBarrier_Load());
}
void NoBarrier_SetNext(int n, Node* x) {
assert(n >= 0);
next_[n].NoBarrier_Store(x);
}
private:
// Array of length equal to the node height. next_[0] is lowest level link.
port::AtomicPointer next_[1];
};
template<typename Key, class Comparator>
typename SkipList<Key, Comparator>::Node*
SkipList<Key, Comparator>::NewNode(const Key& key, int height) {
char* mem = arena_->AllocateAligned(
sizeof(Node) + sizeof(port::AtomicPointer) * (height - 1));
return new (mem) Node(key);
}
template<typename Key, class Comparator>
inline SkipList<Key, Comparator>::Iterator::Iterator(const SkipList* list) {
SetList(list);
}
template<typename Key, class Comparator>
inline void SkipList<Key, Comparator>::Iterator::SetList(const SkipList* list) {
list_ = list;
node_ = nullptr;
}
template<typename Key, class Comparator>
inline bool SkipList<Key, Comparator>::Iterator::Valid() const {
return node_ != nullptr;
}
template<typename Key, class Comparator>
inline const Key& SkipList<Key, Comparator>::Iterator::key() const {
assert(Valid());
return node_->key;
}
template<typename Key, class Comparator>
inline void SkipList<Key, Comparator>::Iterator::Next() {
assert(Valid());
node_ = node_->Next(0);
}
template<typename Key, class Comparator>
inline void SkipList<Key, Comparator>::Iterator::Prev() {
// Instead of using explicit "prev" links, we just search for the
// last node that falls before key.
assert(Valid());
node_ = list_->FindLessThan(node_->key);
if (node_ == list_->head_) {
node_ = nullptr;
}
}
template<typename Key, class Comparator>
inline void SkipList<Key, Comparator>::Iterator::Seek(const Key& target) {
node_ = list_->FindGreaterOrEqual(target, nullptr);
}
template<typename Key, class Comparator>
inline void SkipList<Key, Comparator>::Iterator::SeekToFirst() {
node_ = list_->head_->Next(0);
}
template<typename Key, class Comparator>
inline void SkipList<Key, Comparator>::Iterator::SeekToLast() {
node_ = list_->FindLast();
if (node_ == list_->head_) {
node_ = nullptr;
}
}
template<typename Key, class Comparator>
int SkipList<Key, Comparator>::RandomHeight() {
// Increase height with probability 1 in kBranching
int height = 1;
while (height < kMaxHeight_ && ((rnd_.Next() % kBranching_) == 0)) {
height++;
}
assert(height > 0);
assert(height <= kMaxHeight_);
return height;
}
template<typename Key, class Comparator>
bool SkipList<Key, Comparator>::KeyIsAfterNode(const Key& key, Node* n) const {
// nullptr n is considered infinite
return (n != nullptr) && (compare_(n->key, key) < 0);
}
template<typename Key, class Comparator>
typename SkipList<Key, Comparator>::Node* SkipList<Key, Comparator>::
FindGreaterOrEqual(const Key& key, Node** prev) const {
// Use prev as an optimization hint and fallback to slow path
if (prev && !KeyIsAfterNode(key, prev[0]->Next(0))) {
Node* x = prev[0];
Node* next = x->Next(0);
if ((x == head_) || KeyIsAfterNode(key, x)) {
// Adjust all relevant insertion points to the previous entry
for (int i = 1; i < prev_height_; i++) {
prev[i] = x;
}
return next;
}
}
// Normal lookup
Node* x = head_;
int level = GetMaxHeight() - 1;
while (true) {
Node* next = x->Next(level);
// Make sure the lists are sorted.
// If x points to head_ or next points nullptr, it is trivially satisfied.
assert((x == head_) || (next == nullptr) || KeyIsAfterNode(next->key, x));
if (KeyIsAfterNode(key, next)) {
// Keep searching in this list
x = next;
} else {
if (prev != nullptr) prev[level] = x;
if (level == 0) {
return next;
} else {
// Switch to next list
level--;
}
}
}
}
template<typename Key, class Comparator>
typename SkipList<Key, Comparator>::Node*
SkipList<Key, Comparator>::FindLessThan(const Key& key) const {
Node* x = head_;
int level = GetMaxHeight() - 1;
while (true) {
assert(x == head_ || compare_(x->key, key) < 0);
Node* next = x->Next(level);
if (next == nullptr || compare_(next->key, key) >= 0) {
if (level == 0) {
return x;
} else {
// Switch to next list
level--;
}
} else {
x = next;
}
}
}
template<typename Key, class Comparator>
typename SkipList<Key, Comparator>::Node* SkipList<Key, Comparator>::FindLast()
const {
Node* x = head_;
int level = GetMaxHeight() - 1;
while (true) {
Node* next = x->Next(level);
if (next == nullptr) {
if (level == 0) {
return x;
} else {
// Switch to next list
level--;
}
} else {
x = next;
}
}
}
template<typename Key, class Comparator>
SkipList<Key, Comparator>::SkipList(const Comparator cmp, Arena* arena,
int32_t max_height,
int32_t branching_factor)
: kMaxHeight_(max_height),
kBranching_(branching_factor),
compare_(cmp),
arena_(arena),
head_(NewNode(0 /* any key will do */, max_height)),
max_height_(reinterpret_cast<void*>(1)),
prev_height_(1),
rnd_(0xdeadbeef) {
assert(kMaxHeight_ > 0);
assert(kBranching_ > 0);
// Allocate the prev_ Node* array, directly from the passed-in arena.
// prev_ does not need to be freed, as its life cycle is tied up with
// the arena as a whole.
prev_ = (Node**) arena_->AllocateAligned(sizeof(Node*) * kMaxHeight_);
for (int i = 0; i < kMaxHeight_; i++) {
head_->SetNext(i, nullptr);
prev_[i] = head_;
}
}
template<typename Key, class Comparator>
void SkipList<Key, Comparator>::Insert(const Key& key) {
// TODO(opt): We can use a barrier-free variant of FindGreaterOrEqual()
// here since Insert() is externally synchronized.
Node* x = FindGreaterOrEqual(key, prev_);
// Our data structure does not allow duplicate insertion
assert(x == nullptr || !Equal(key, x->key));
int height = RandomHeight();
if (height > GetMaxHeight()) {
for (int i = GetMaxHeight(); i < height; i++) {
prev_[i] = head_;
}
//fprintf(stderr, "Change height from %d to %d\n", max_height_, height);
// It is ok to mutate max_height_ without any synchronization
// with concurrent readers. A concurrent reader that observes
// the new value of max_height_ will see either the old value of
// new level pointers from head_ (nullptr), or a new value set in
// the loop below. In the former case the reader will
// immediately drop to the next level since nullptr sorts after all
// keys. In the latter case the reader will use the new node.
max_height_.NoBarrier_Store(reinterpret_cast<void*>(height));
}
x = NewNode(key, height);
for (int i = 0; i < height; i++) {
// NoBarrier_SetNext() suffices since we will add a barrier when
// we publish a pointer to "x" in prev[i].
x->NoBarrier_SetNext(i, prev_[i]->NoBarrier_Next(i));
prev_[i]->SetNext(i, x);
}
prev_[0] = x;
prev_height_ = height;
}
template<typename Key, class Comparator>
bool SkipList<Key, Comparator>::Contains(const Key& key) const {
Node* x = FindGreaterOrEqual(key, nullptr);
if (x != nullptr && Equal(key, x->key)) {
return true;
} else {
return false;
}
}
} // namespace rocksdb

383
db/skiplist_test.cc Normal file
View File

@@ -0,0 +1,383 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/skiplist.h"
#include <set>
#include "rocksdb/env.h"
#include "util/arena.h"
#include "util/hash.h"
#include "util/random.h"
#include "util/testharness.h"
namespace rocksdb {
typedef uint64_t Key;
struct TestComparator {
int operator()(const Key& a, const Key& b) const {
if (a < b) {
return -1;
} else if (a > b) {
return +1;
} else {
return 0;
}
}
};
class SkipTest { };
TEST(SkipTest, Empty) {
Arena arena;
TestComparator cmp;
SkipList<Key, TestComparator> list(cmp, &arena);
ASSERT_TRUE(!list.Contains(10));
SkipList<Key, TestComparator>::Iterator iter(&list);
ASSERT_TRUE(!iter.Valid());
iter.SeekToFirst();
ASSERT_TRUE(!iter.Valid());
iter.Seek(100);
ASSERT_TRUE(!iter.Valid());
iter.SeekToLast();
ASSERT_TRUE(!iter.Valid());
}
TEST(SkipTest, InsertAndLookup) {
const int N = 2000;
const int R = 5000;
Random rnd(1000);
std::set<Key> keys;
Arena arena;
TestComparator cmp;
SkipList<Key, TestComparator> list(cmp, &arena);
for (int i = 0; i < N; i++) {
Key key = rnd.Next() % R;
if (keys.insert(key).second) {
list.Insert(key);
}
}
for (int i = 0; i < R; i++) {
if (list.Contains(i)) {
ASSERT_EQ(keys.count(i), 1U);
} else {
ASSERT_EQ(keys.count(i), 0U);
}
}
// Simple iterator tests
{
SkipList<Key, TestComparator>::Iterator iter(&list);
ASSERT_TRUE(!iter.Valid());
iter.Seek(0);
ASSERT_TRUE(iter.Valid());
ASSERT_EQ(*(keys.begin()), iter.key());
iter.SeekToFirst();
ASSERT_TRUE(iter.Valid());
ASSERT_EQ(*(keys.begin()), iter.key());
iter.SeekToLast();
ASSERT_TRUE(iter.Valid());
ASSERT_EQ(*(keys.rbegin()), iter.key());
}
// Forward iteration test
for (int i = 0; i < R; i++) {
SkipList<Key, TestComparator>::Iterator iter(&list);
iter.Seek(i);
// Compare against model iterator
std::set<Key>::iterator model_iter = keys.lower_bound(i);
for (int j = 0; j < 3; j++) {
if (model_iter == keys.end()) {
ASSERT_TRUE(!iter.Valid());
break;
} else {
ASSERT_TRUE(iter.Valid());
ASSERT_EQ(*model_iter, iter.key());
++model_iter;
iter.Next();
}
}
}
// Backward iteration test
{
SkipList<Key, TestComparator>::Iterator iter(&list);
iter.SeekToLast();
// Compare against model iterator
for (std::set<Key>::reverse_iterator model_iter = keys.rbegin();
model_iter != keys.rend();
++model_iter) {
ASSERT_TRUE(iter.Valid());
ASSERT_EQ(*model_iter, iter.key());
iter.Prev();
}
ASSERT_TRUE(!iter.Valid());
}
}
// We want to make sure that with a single writer and multiple
// concurrent readers (with no synchronization other than when a
// reader's iterator is created), the reader always observes all the
// data that was present in the skip list when the iterator was
// constructor. Because insertions are happening concurrently, we may
// also observe new values that were inserted since the iterator was
// constructed, but we should never miss any values that were present
// at iterator construction time.
//
// We generate multi-part keys:
// <key,gen,hash>
// where:
// key is in range [0..K-1]
// gen is a generation number for key
// hash is hash(key,gen)
//
// The insertion code picks a random key, sets gen to be 1 + the last
// generation number inserted for that key, and sets hash to Hash(key,gen).
//
// At the beginning of a read, we snapshot the last inserted
// generation number for each key. We then iterate, including random
// calls to Next() and Seek(). For every key we encounter, we
// check that it is either expected given the initial snapshot or has
// been concurrently added since the iterator started.
class ConcurrentTest {
private:
static const uint32_t K = 4;
static uint64_t key(Key key) { return (key >> 40); }
static uint64_t gen(Key key) { return (key >> 8) & 0xffffffffu; }
static uint64_t hash(Key key) { return key & 0xff; }
static uint64_t HashNumbers(uint64_t k, uint64_t g) {
uint64_t data[2] = { k, g };
return Hash(reinterpret_cast<char*>(data), sizeof(data), 0);
}
static Key MakeKey(uint64_t k, uint64_t g) {
assert(sizeof(Key) == sizeof(uint64_t));
assert(k <= K); // We sometimes pass K to seek to the end of the skiplist
assert(g <= 0xffffffffu);
return ((k << 40) | (g << 8) | (HashNumbers(k, g) & 0xff));
}
static bool IsValidKey(Key k) {
return hash(k) == (HashNumbers(key(k), gen(k)) & 0xff);
}
static Key RandomTarget(Random* rnd) {
switch (rnd->Next() % 10) {
case 0:
// Seek to beginning
return MakeKey(0, 0);
case 1:
// Seek to end
return MakeKey(K, 0);
default:
// Seek to middle
return MakeKey(rnd->Next() % K, 0);
}
}
// Per-key generation
struct State {
port::AtomicPointer generation[K];
void Set(int k, intptr_t v) {
generation[k].Release_Store(reinterpret_cast<void*>(v));
}
intptr_t Get(int k) {
return reinterpret_cast<intptr_t>(generation[k].Acquire_Load());
}
State() {
for (unsigned int k = 0; k < K; k++) {
Set(k, 0);
}
}
};
// Current state of the test
State current_;
Arena arena_;
// SkipList is not protected by mu_. We just use a single writer
// thread to modify it.
SkipList<Key, TestComparator> list_;
public:
ConcurrentTest() : list_(TestComparator(), &arena_) {}
// REQUIRES: External synchronization
void WriteStep(Random* rnd) {
const uint32_t k = rnd->Next() % K;
const intptr_t g = current_.Get(k) + 1;
const Key key = MakeKey(k, g);
list_.Insert(key);
current_.Set(k, g);
}
void ReadStep(Random* rnd) {
// Remember the initial committed state of the skiplist.
State initial_state;
for (unsigned int k = 0; k < K; k++) {
initial_state.Set(k, current_.Get(k));
}
Key pos = RandomTarget(rnd);
SkipList<Key, TestComparator>::Iterator iter(&list_);
iter.Seek(pos);
while (true) {
Key current;
if (!iter.Valid()) {
current = MakeKey(K, 0);
} else {
current = iter.key();
ASSERT_TRUE(IsValidKey(current)) << current;
}
ASSERT_LE(pos, current) << "should not go backwards";
// Verify that everything in [pos,current) was not present in
// initial_state.
while (pos < current) {
ASSERT_LT(key(pos), K) << pos;
// Note that generation 0 is never inserted, so it is ok if
// <*,0,*> is missing.
ASSERT_TRUE((gen(pos) == 0U) ||
(gen(pos) > (uint64_t)initial_state.Get(key(pos)))
) << "key: " << key(pos)
<< "; gen: " << gen(pos)
<< "; initgen: "
<< initial_state.Get(key(pos));
// Advance to next key in the valid key space
if (key(pos) < key(current)) {
pos = MakeKey(key(pos) + 1, 0);
} else {
pos = MakeKey(key(pos), gen(pos) + 1);
}
}
if (!iter.Valid()) {
break;
}
if (rnd->Next() % 2) {
iter.Next();
pos = MakeKey(key(pos), gen(pos) + 1);
} else {
Key new_target = RandomTarget(rnd);
if (new_target > pos) {
pos = new_target;
iter.Seek(new_target);
}
}
}
}
};
const uint32_t ConcurrentTest::K;
// Simple test that does single-threaded testing of the ConcurrentTest
// scaffolding.
TEST(SkipTest, ConcurrentWithoutThreads) {
ConcurrentTest test;
Random rnd(test::RandomSeed());
for (int i = 0; i < 10000; i++) {
test.ReadStep(&rnd);
test.WriteStep(&rnd);
}
}
class TestState {
public:
ConcurrentTest t_;
int seed_;
port::AtomicPointer quit_flag_;
enum ReaderState {
STARTING,
RUNNING,
DONE
};
explicit TestState(int s)
: seed_(s),
quit_flag_(nullptr),
state_(STARTING),
state_cv_(&mu_) {}
void Wait(ReaderState s) {
mu_.Lock();
while (state_ != s) {
state_cv_.Wait();
}
mu_.Unlock();
}
void Change(ReaderState s) {
mu_.Lock();
state_ = s;
state_cv_.Signal();
mu_.Unlock();
}
private:
port::Mutex mu_;
ReaderState state_;
port::CondVar state_cv_;
};
static void ConcurrentReader(void* arg) {
TestState* state = reinterpret_cast<TestState*>(arg);
Random rnd(state->seed_);
int64_t reads = 0;
state->Change(TestState::RUNNING);
while (!state->quit_flag_.Acquire_Load()) {
state->t_.ReadStep(&rnd);
++reads;
}
state->Change(TestState::DONE);
}
static void RunConcurrent(int run) {
const int seed = test::RandomSeed() + (run * 100);
Random rnd(seed);
const int N = 1000;
const int kSize = 1000;
for (int i = 0; i < N; i++) {
if ((i % 100) == 0) {
fprintf(stderr, "Run %d of %d\n", i, N);
}
TestState state(seed + 1);
Env::Default()->Schedule(ConcurrentReader, &state);
state.Wait(TestState::RUNNING);
for (int i = 0; i < kSize; i++) {
state.t_.WriteStep(&rnd);
}
state.quit_flag_.Release_Store(&state); // Any non-nullptr arg will do
state.Wait(TestState::DONE);
}
}
TEST(SkipTest, Concurrent1) { RunConcurrent(1); }
TEST(SkipTest, Concurrent2) { RunConcurrent(2); }
TEST(SkipTest, Concurrent3) { RunConcurrent(3); }
TEST(SkipTest, Concurrent4) { RunConcurrent(4); }
TEST(SkipTest, Concurrent5) { RunConcurrent(5); }
} // namespace rocksdb
int main(int argc, char** argv) {
return rocksdb::test::RunAllTests();
}

86
db/snapshot.h Normal file
View File

@@ -0,0 +1,86 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#pragma once
#include "rocksdb/db.h"
namespace rocksdb {
class SnapshotList;
// Snapshots are kept in a doubly-linked list in the DB.
// Each SnapshotImpl corresponds to a particular sequence number.
class SnapshotImpl : public Snapshot {
public:
SequenceNumber number_; // const after creation
private:
friend class SnapshotList;
// SnapshotImpl is kept in a doubly-linked circular list
SnapshotImpl* prev_;
SnapshotImpl* next_;
SnapshotList* list_; // just for sanity checks
};
class SnapshotList {
public:
SnapshotList() {
list_.prev_ = &list_;
list_.next_ = &list_;
list_.number_ = 0xFFFFFFFFL; // placeholder marker, for debugging
}
bool empty() const { return list_.next_ == &list_; }
SnapshotImpl* oldest() const { assert(!empty()); return list_.next_; }
SnapshotImpl* newest() const { assert(!empty()); return list_.prev_; }
const SnapshotImpl* New(SequenceNumber seq) {
SnapshotImpl* s = new SnapshotImpl;
s->number_ = seq;
s->list_ = this;
s->next_ = &list_;
s->prev_ = list_.prev_;
s->prev_->next_ = s;
s->next_->prev_ = s;
return s;
}
void Delete(const SnapshotImpl* s) {
assert(s->list_ == this);
s->prev_->next_ = s->next_;
s->next_->prev_ = s->prev_;
delete s;
}
// retrieve all snapshot numbers. They are sorted in ascending order.
void getAll(std::vector<SequenceNumber>& ret) {
if (empty()) return;
SnapshotImpl* s = &list_;
while (s->next_ != &list_) {
ret.push_back(s->next_->number_);
s = s ->next_;
}
}
// get the sequence number of the most recent snapshot
const SequenceNumber GetNewest() {
if (empty()) {
return 0;
}
return newest()->number_;
}
private:
// Dummy head of doubly-linked list of snapshots
SnapshotImpl list_;
};
} // namespace rocksdb

198
db/table_cache.cc Normal file
View File

@@ -0,0 +1,198 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/table_cache.h"
#include "db/filename.h"
#include "db/version_edit.h"
#include "rocksdb/statistics.h"
#include "table/iterator_wrapper.h"
#include "table/table_reader.h"
#include "util/coding.h"
#include "util/stop_watch.h"
namespace rocksdb {
static void DeleteEntry(const Slice& key, void* value) {
TableReader* table_reader = reinterpret_cast<TableReader*>(value);
delete table_reader;
}
static void UnrefEntry(void* arg1, void* arg2) {
Cache* cache = reinterpret_cast<Cache*>(arg1);
Cache::Handle* h = reinterpret_cast<Cache::Handle*>(arg2);
cache->Release(h);
}
static Slice GetSliceForFileNumber(uint64_t* file_number) {
return Slice(reinterpret_cast<const char*>(file_number),
sizeof(*file_number));
}
TableCache::TableCache(const std::string& dbname, const Options* options,
const EnvOptions& storage_options, Cache* const cache)
: env_(options->env),
dbname_(dbname),
options_(options),
storage_options_(storage_options),
cache_(cache) {}
TableCache::~TableCache() {
}
TableReader* TableCache::GetTableReaderFromHandle(Cache::Handle* handle) {
return reinterpret_cast<TableReader*>(cache_->Value(handle));
}
void TableCache::ReleaseHandle(Cache::Handle* handle) {
cache_->Release(handle);
}
Status TableCache::FindTable(const EnvOptions& toptions,
const InternalKeyComparator& internal_comparator,
uint64_t file_number, uint64_t file_size,
Cache::Handle** handle, bool* table_io,
const bool no_io) {
Status s;
Slice key = GetSliceForFileNumber(&file_number);
*handle = cache_->Lookup(key);
if (*handle == nullptr) {
if (no_io) { // Dont do IO and return a not-found status
return Status::Incomplete("Table not found in table_cache, no_io is set");
}
if (table_io != nullptr) {
*table_io = true; // we had to do IO from storage
}
std::string fname = TableFileName(dbname_, file_number);
unique_ptr<RandomAccessFile> file;
unique_ptr<TableReader> table_reader;
s = env_->NewRandomAccessFile(fname, &file, toptions);
RecordTick(options_->statistics.get(), NO_FILE_OPENS);
if (s.ok()) {
if (options_->advise_random_on_open) {
file->Hint(RandomAccessFile::RANDOM);
}
StopWatch sw(env_, options_->statistics.get(), TABLE_OPEN_IO_MICROS);
s = options_->table_factory->NewTableReader(
*options_, toptions, internal_comparator, std::move(file), file_size,
&table_reader);
}
if (!s.ok()) {
assert(table_reader == nullptr);
RecordTick(options_->statistics.get(), NO_FILE_ERRORS);
// We do not cache error results so that if the error is transient,
// or somebody repairs the file, we recover automatically.
} else {
assert(file.get() == nullptr);
*handle = cache_->Insert(key, table_reader.release(), 1, &DeleteEntry);
}
}
return s;
}
Iterator* TableCache::NewIterator(const ReadOptions& options,
const EnvOptions& toptions,
const InternalKeyComparator& icomparator,
const FileMetaData& file_meta,
TableReader** table_reader_ptr,
bool for_compaction, Arena* arena) {
if (table_reader_ptr != nullptr) {
*table_reader_ptr = nullptr;
}
TableReader* table_reader = file_meta.table_reader;
Cache::Handle* handle = nullptr;
Status s;
if (table_reader == nullptr) {
s = FindTable(toptions, icomparator, file_meta.number, file_meta.file_size,
&handle, nullptr, options.read_tier == kBlockCacheTier);
if (!s.ok()) {
return NewErrorIterator(s, arena);
}
table_reader = GetTableReaderFromHandle(handle);
}
Iterator* result = table_reader->NewIterator(options, arena);
if (handle != nullptr) {
result->RegisterCleanup(&UnrefEntry, cache_, handle);
}
if (table_reader_ptr != nullptr) {
*table_reader_ptr = table_reader;
}
if (for_compaction) {
table_reader->SetupForCompaction();
}
return result;
}
Status TableCache::Get(const ReadOptions& options,
const InternalKeyComparator& internal_comparator,
const FileMetaData& file_meta, const Slice& k, void* arg,
bool (*saver)(void*, const ParsedInternalKey&,
const Slice&, bool),
bool* table_io, void (*mark_key_may_exist)(void*)) {
TableReader* t = file_meta.table_reader;
Status s;
Cache::Handle* handle = nullptr;
if (!t) {
s = FindTable(storage_options_, internal_comparator, file_meta.number,
file_meta.file_size, &handle, table_io,
options.read_tier == kBlockCacheTier);
if (s.ok()) {
t = GetTableReaderFromHandle(handle);
}
}
if (s.ok()) {
s = t->Get(options, k, arg, saver, mark_key_may_exist);
if (handle != nullptr) {
ReleaseHandle(handle);
}
} else if (options.read_tier && s.IsIncomplete()) {
// Couldnt find Table in cache but treat as kFound if no_io set
(*mark_key_may_exist)(arg);
return Status::OK();
}
return s;
}
Status TableCache::GetTableProperties(
const EnvOptions& toptions,
const InternalKeyComparator& internal_comparator,
const FileMetaData& file_meta,
std::shared_ptr<const TableProperties>* properties, bool no_io) {
Status s;
auto table_reader = file_meta.table_reader;
// table already been pre-loaded?
if (table_reader) {
*properties = table_reader->GetTableProperties();
return s;
}
bool table_io;
Cache::Handle* table_handle = nullptr;
s = FindTable(toptions, internal_comparator, file_meta.number,
file_meta.file_size, &table_handle, &table_io, no_io);
if (!s.ok()) {
return s;
}
assert(table_handle);
auto table = GetTableReaderFromHandle(table_handle);
*properties = table->GetTableProperties();
ReleaseHandle(table_handle);
return s;
}
void TableCache::Evict(Cache* cache, uint64_t file_number) {
cache->Erase(GetSliceForFileNumber(&file_number));
}
} // namespace rocksdb

95
db/table_cache.h Normal file
View File

@@ -0,0 +1,95 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// Thread-safe (provides internal synchronization)
#pragma once
#include <string>
#include <stdint.h>
#include "db/dbformat.h"
#include "port/port.h"
#include "rocksdb/cache.h"
#include "rocksdb/env.h"
#include "rocksdb/table.h"
#include "table/table_reader.h"
namespace rocksdb {
class Env;
class Arena;
struct FileMetaData;
// TODO(sdong): try to come up with a better API to pass the file information
// other than simply passing FileMetaData.
class TableCache {
public:
TableCache(const std::string& dbname, const Options* options,
const EnvOptions& storage_options, Cache* cache);
~TableCache();
// Return an iterator for the specified file number (the corresponding
// file length must be exactly "file_size" bytes). If "tableptr" is
// non-nullptr, also sets "*tableptr" to point to the Table object
// underlying the returned iterator, or nullptr if no Table object underlies
// the returned iterator. The returned "*tableptr" object is owned by
// the cache and should not be deleted, and is valid for as long as the
// returned iterator is live.
Iterator* NewIterator(const ReadOptions& options, const EnvOptions& toptions,
const InternalKeyComparator& internal_comparator,
const FileMetaData& file_meta,
TableReader** table_reader_ptr = nullptr,
bool for_compaction = false, Arena* arena = nullptr);
// If a seek to internal key "k" in specified file finds an entry,
// call (*handle_result)(arg, found_key, found_value) repeatedly until
// it returns false.
Status Get(const ReadOptions& options,
const InternalKeyComparator& internal_comparator,
const FileMetaData& file_meta, const Slice& k, void* arg,
bool (*handle_result)(void*, const ParsedInternalKey&,
const Slice&, bool),
bool* table_io, void (*mark_key_may_exist)(void*) = nullptr);
// Evict any entry for the specified file number
static void Evict(Cache* cache, uint64_t file_number);
// Find table reader
Status FindTable(const EnvOptions& toptions,
const InternalKeyComparator& internal_comparator,
uint64_t file_number, uint64_t file_size, Cache::Handle**,
bool* table_io = nullptr, const bool no_io = false);
// Get TableReader from a cache handle.
TableReader* GetTableReaderFromHandle(Cache::Handle* handle);
// Get the table properties of a given table.
// @no_io: indicates if we should load table to the cache if it is not present
// in table cache yet.
// @returns: `properties` will be reset on success. Please note that we will
// return Status::Incomplete() if table is not present in cache and
// we set `no_io` to be true.
Status GetTableProperties(const EnvOptions& toptions,
const InternalKeyComparator& internal_comparator,
const FileMetaData& file_meta,
std::shared_ptr<const TableProperties>* properties,
bool no_io = false);
// Release the handle from a cache
void ReleaseHandle(Cache::Handle* handle);
private:
Env* const env_;
const std::string dbname_;
const Options* options_;
const EnvOptions& storage_options_;
Cache* const cache_;
};
} // namespace rocksdb

View File

@@ -0,0 +1,83 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#include "db/table_properties_collector.h"
#include "db/dbformat.h"
#include "util/coding.h"
namespace rocksdb {
Status InternalKeyPropertiesCollector::Add(
const Slice& key, const Slice& value) {
ParsedInternalKey ikey;
if (!ParseInternalKey(key, &ikey)) {
return Status::InvalidArgument("Invalid internal key");
}
if (ikey.type == ValueType::kTypeDeletion) {
++deleted_keys_;
}
return Status::OK();
}
Status InternalKeyPropertiesCollector::Finish(
UserCollectedProperties* properties) {
assert(properties);
assert(properties->find(
InternalKeyTablePropertiesNames::kDeletedKeys) == properties->end());
std::string val;
PutVarint64(&val, deleted_keys_);
properties->insert({ InternalKeyTablePropertiesNames::kDeletedKeys, val });
return Status::OK();
}
UserCollectedProperties
InternalKeyPropertiesCollector::GetReadableProperties() const {
return {
{ "kDeletedKeys", std::to_string(deleted_keys_) }
};
}
Status UserKeyTablePropertiesCollector::Add(
const Slice& key, const Slice& value) {
ParsedInternalKey ikey;
if (!ParseInternalKey(key, &ikey)) {
return Status::InvalidArgument("Invalid internal key");
}
return collector_->Add(ikey.user_key, value);
}
Status UserKeyTablePropertiesCollector::Finish(
UserCollectedProperties* properties) {
return collector_->Finish(properties);
}
UserCollectedProperties
UserKeyTablePropertiesCollector::GetReadableProperties() const {
return collector_->GetReadableProperties();
}
const std::string InternalKeyTablePropertiesNames::kDeletedKeys
= "rocksdb.deleted.keys";
uint64_t GetDeletedKeys(
const UserCollectedProperties& props) {
auto pos = props.find(InternalKeyTablePropertiesNames::kDeletedKeys);
if (pos == props.end()) {
return 0;
}
Slice raw = pos->second;
uint64_t val = 0;
return GetVarint64(&raw, &val) ? val : 0;
}
} // namespace rocksdb

View File

@@ -0,0 +1,95 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// This file defines a collection of statistics collectors.
#pragma once
#include "rocksdb/table_properties.h"
#include <memory>
#include <string>
#include <vector>
namespace rocksdb {
struct InternalKeyTablePropertiesNames {
static const std::string kDeletedKeys;
};
// Collecting the statistics for internal keys. Visible only by internal
// rocksdb modules.
class InternalKeyPropertiesCollector : public TablePropertiesCollector {
public:
virtual Status Add(const Slice& key, const Slice& value) override;
virtual Status Finish(UserCollectedProperties* properties) override;
virtual const char* Name() const override {
return "InternalKeyPropertiesCollector";
}
UserCollectedProperties GetReadableProperties() const override;
private:
uint64_t deleted_keys_ = 0;
};
class InternalKeyPropertiesCollectorFactory
: public TablePropertiesCollectorFactory {
public:
virtual TablePropertiesCollector* CreateTablePropertiesCollector() {
return new InternalKeyPropertiesCollector();
}
virtual const char* Name() const override {
return "InternalKeyPropertiesCollectorFactory";
}
};
// When rocksdb creates a new table, it will encode all "user keys" into
// "internal keys", which contains meta information of a given entry.
//
// This class extracts user key from the encoded internal key when Add() is
// invoked.
class UserKeyTablePropertiesCollector : public TablePropertiesCollector {
public:
// transfer of ownership
explicit UserKeyTablePropertiesCollector(TablePropertiesCollector* collector)
: collector_(collector) {}
virtual ~UserKeyTablePropertiesCollector() {}
virtual Status Add(const Slice& key, const Slice& value) override;
virtual Status Finish(UserCollectedProperties* properties) override;
virtual const char* Name() const override { return collector_->Name(); }
UserCollectedProperties GetReadableProperties() const override;
protected:
std::unique_ptr<TablePropertiesCollector> collector_;
};
class UserKeyTablePropertiesCollectorFactory
: public TablePropertiesCollectorFactory {
public:
explicit UserKeyTablePropertiesCollectorFactory(
std::shared_ptr<TablePropertiesCollectorFactory> user_collector_factory)
: user_collector_factory_(user_collector_factory) {}
virtual TablePropertiesCollector* CreateTablePropertiesCollector() {
return new UserKeyTablePropertiesCollector(
user_collector_factory_->CreateTablePropertiesCollector());
}
virtual const char* Name() const override {
return user_collector_factory_->Name();
}
private:
std::shared_ptr<TablePropertiesCollectorFactory> user_collector_factory_;
};
} // namespace rocksdb

View File

@@ -0,0 +1,313 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#include <map>
#include <memory>
#include <string>
#include "db/db_impl.h"
#include "db/dbformat.h"
#include "db/table_properties_collector.h"
#include "rocksdb/table.h"
#include "table/block_based_table_factory.h"
#include "table/meta_blocks.h"
#include "table/plain_table_factory.h"
#include "table/table_builder.h"
#include "util/coding.h"
#include "util/testharness.h"
#include "util/testutil.h"
namespace rocksdb {
class TablePropertiesTest {
};
// TODO(kailiu) the following classes should be moved to some more general
// places, so that other tests can also make use of them.
// `FakeWritableFile` and `FakeRandomeAccessFile` bypass the real file system
// and therefore enable us to quickly setup the tests.
class FakeWritableFile : public WritableFile {
public:
~FakeWritableFile() { }
const std::string& contents() const { return contents_; }
virtual Status Close() { return Status::OK(); }
virtual Status Flush() { return Status::OK(); }
virtual Status Sync() { return Status::OK(); }
virtual Status Append(const Slice& data) {
contents_.append(data.data(), data.size());
return Status::OK();
}
private:
std::string contents_;
};
class FakeRandomeAccessFile : public RandomAccessFile {
public:
explicit FakeRandomeAccessFile(const Slice& contents)
: contents_(contents.data(), contents.size()) {
}
virtual ~FakeRandomeAccessFile() { }
uint64_t Size() const { return contents_.size(); }
virtual Status Read(uint64_t offset, size_t n, Slice* result,
char* scratch) const {
if (offset > contents_.size()) {
return Status::InvalidArgument("invalid Read offset");
}
if (offset + n > contents_.size()) {
n = contents_.size() - offset;
}
memcpy(scratch, &contents_[offset], n);
*result = Slice(scratch, n);
return Status::OK();
}
private:
std::string contents_;
};
class DumbLogger : public Logger {
public:
virtual void Logv(const char* format, va_list ap) { }
virtual size_t GetLogFileSize() const { return 0; }
};
// Utilities test functions
namespace {
void MakeBuilder(const Options& options,
const InternalKeyComparator& internal_comparator,
std::unique_ptr<FakeWritableFile>* writable,
std::unique_ptr<TableBuilder>* builder) {
writable->reset(new FakeWritableFile);
builder->reset(options.table_factory->NewTableBuilder(
options, internal_comparator, writable->get(), options.compression));
}
} // namespace
// Collects keys that starts with "A" in a table.
class RegularKeysStartWithA: public TablePropertiesCollector {
public:
const char* Name() const { return "RegularKeysStartWithA"; }
Status Finish(UserCollectedProperties* properties) {
std::string encoded;
PutVarint32(&encoded, count_);
*properties = UserCollectedProperties {
{ "TablePropertiesTest", "Rocksdb" },
{ "Count", encoded }
};
return Status::OK();
}
Status Add(const Slice& user_key, const Slice& value) {
// simply asssume all user keys are not empty.
if (user_key.data()[0] == 'A') {
++count_;
}
return Status::OK();
}
virtual UserCollectedProperties GetReadableProperties() const {
return UserCollectedProperties{};
}
private:
uint32_t count_ = 0;
};
class RegularKeysStartWithAFactory : public TablePropertiesCollectorFactory {
public:
virtual TablePropertiesCollector* CreateTablePropertiesCollector() {
return new RegularKeysStartWithA();
}
const char* Name() const { return "RegularKeysStartWithA"; }
};
extern uint64_t kBlockBasedTableMagicNumber;
extern uint64_t kPlainTableMagicNumber;
namespace {
void TestCustomizedTablePropertiesCollector(
uint64_t magic_number, bool encode_as_internal, const Options& options,
const InternalKeyComparator& internal_comparator) {
// make sure the entries will be inserted with order.
std::map<std::string, std::string> kvs = {
{"About ", "val5"}, // starts with 'A'
{"Abstract", "val2"}, // starts with 'A'
{"Around ", "val7"}, // starts with 'A'
{"Beyond ", "val3"},
{"Builder ", "val1"},
{"Cancel ", "val4"},
{"Find ", "val6"},
};
// -- Step 1: build table
std::unique_ptr<TableBuilder> builder;
std::unique_ptr<FakeWritableFile> writable;
MakeBuilder(options, internal_comparator, &writable, &builder);
for (const auto& kv : kvs) {
if (encode_as_internal) {
InternalKey ikey(kv.first, 0, ValueType::kTypeValue);
builder->Add(ikey.Encode(), kv.second);
} else {
builder->Add(kv.first, kv.second);
}
}
ASSERT_OK(builder->Finish());
// -- Step 2: Read properties
FakeRandomeAccessFile readable(writable->contents());
TableProperties* props;
Status s = ReadTableProperties(
&readable,
writable->contents().size(),
magic_number,
Env::Default(),
nullptr,
&props
);
std::unique_ptr<TableProperties> props_guard(props);
ASSERT_OK(s);
auto user_collected = props->user_collected_properties;
ASSERT_EQ("Rocksdb", user_collected.at("TablePropertiesTest"));
uint32_t starts_with_A = 0;
Slice key(user_collected.at("Count"));
ASSERT_TRUE(GetVarint32(&key, &starts_with_A));
ASSERT_EQ(3u, starts_with_A);
}
} // namespace
TEST(TablePropertiesTest, CustomizedTablePropertiesCollector) {
// Test properties collectors with internal keys or regular keys
// for block based table
for (bool encode_as_internal : { true, false }) {
Options options;
std::shared_ptr<TablePropertiesCollectorFactory> collector_factory(
new RegularKeysStartWithAFactory());
if (encode_as_internal) {
options.table_properties_collector_factories.emplace_back(
new UserKeyTablePropertiesCollectorFactory(collector_factory));
} else {
options.table_properties_collector_factories.resize(1);
options.table_properties_collector_factories[0] = collector_factory;
}
test::PlainInternalKeyComparator ikc(options.comparator);
TestCustomizedTablePropertiesCollector(kBlockBasedTableMagicNumber,
encode_as_internal, options, ikc);
}
// test plain table
Options options;
options.table_properties_collector_factories.emplace_back(
new RegularKeysStartWithAFactory());
options.table_factory = std::make_shared<PlainTableFactory>(8, 8, 0);
test::PlainInternalKeyComparator ikc(options.comparator);
TestCustomizedTablePropertiesCollector(kPlainTableMagicNumber, true, options,
ikc);
}
namespace {
void TestInternalKeyPropertiesCollector(
uint64_t magic_number,
bool sanitized,
std::shared_ptr<TableFactory> table_factory) {
InternalKey keys[] = {
InternalKey("A ", 0, ValueType::kTypeValue),
InternalKey("B ", 0, ValueType::kTypeValue),
InternalKey("C ", 0, ValueType::kTypeValue),
InternalKey("W ", 0, ValueType::kTypeDeletion),
InternalKey("X ", 0, ValueType::kTypeDeletion),
InternalKey("Y ", 0, ValueType::kTypeDeletion),
InternalKey("Z ", 0, ValueType::kTypeDeletion),
};
std::unique_ptr<TableBuilder> builder;
std::unique_ptr<FakeWritableFile> writable;
Options options;
test::PlainInternalKeyComparator pikc(options.comparator);
options.table_factory = table_factory;
if (sanitized) {
options.table_properties_collector_factories.emplace_back(
new RegularKeysStartWithAFactory());
// with sanitization, even regular properties collector will be able to
// handle internal keys.
auto comparator = options.comparator;
// HACK: Set options.info_log to avoid writing log in
// SanitizeOptions().
options.info_log = std::make_shared<DumbLogger>();
options = SanitizeOptions("db", // just a place holder
&pikc, nullptr, // don't care filter policy
options);
options.comparator = comparator;
} else {
options.table_properties_collector_factories = {
std::make_shared<InternalKeyPropertiesCollectorFactory>()};
}
for (int iter = 0; iter < 2; ++iter) {
MakeBuilder(options, pikc, &writable, &builder);
for (const auto& k : keys) {
builder->Add(k.Encode(), "val");
}
ASSERT_OK(builder->Finish());
FakeRandomeAccessFile readable(writable->contents());
TableProperties* props;
Status s =
ReadTableProperties(&readable, writable->contents().size(),
magic_number, Env::Default(), nullptr, &props);
ASSERT_OK(s);
std::unique_ptr<TableProperties> props_guard(props);
auto user_collected = props->user_collected_properties;
uint64_t deleted = GetDeletedKeys(user_collected);
ASSERT_EQ(4u, deleted);
if (sanitized) {
uint32_t starts_with_A = 0;
Slice key(user_collected.at("Count"));
ASSERT_TRUE(GetVarint32(&key, &starts_with_A));
ASSERT_EQ(1u, starts_with_A);
}
}
}
} // namespace
TEST(TablePropertiesTest, InternalKeyPropertiesCollector) {
TestInternalKeyPropertiesCollector(
kBlockBasedTableMagicNumber,
true /* sanitize */,
std::make_shared<BlockBasedTableFactory>()
);
TestInternalKeyPropertiesCollector(
kBlockBasedTableMagicNumber,
true /* not sanitize */,
std::make_shared<BlockBasedTableFactory>()
);
TestInternalKeyPropertiesCollector(
kPlainTableMagicNumber,
false /* not sanitize */,
std::make_shared<PlainTableFactory>(8, 8, 0)
);
}
} // namespace rocksdb
int main(int argc, char** argv) {
return rocksdb::test::RunAllTests();
}

221
db/tailing_iter.cc Normal file
View File

@@ -0,0 +1,221 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#ifndef ROCKSDB_LITE
#include "db/tailing_iter.h"
#include <string>
#include <utility>
#include <vector>
#include "db/db_impl.h"
#include "db/db_iter.h"
#include "db/column_family.h"
#include "rocksdb/env.h"
#include "rocksdb/slice.h"
#include "rocksdb/slice_transform.h"
#include "table/merger.h"
namespace rocksdb {
TailingIterator::TailingIterator(Env* const env, DBImpl* db,
const ReadOptions& read_options, ColumnFamilyData* cfd)
: env_(env),
db_(db),
read_options_(read_options),
cfd_(cfd),
super_version_(nullptr),
current_(nullptr),
status_(Status::InvalidArgument("Seek() not called on this iterator")) {}
TailingIterator::~TailingIterator() {
Cleanup();
}
bool TailingIterator::Valid() const {
return current_ != nullptr;
}
void TailingIterator::SeekToFirst() {
if (!IsCurrentVersion()) {
CreateIterators();
}
mutable_->SeekToFirst();
immutable_->SeekToFirst();
UpdateCurrent();
}
void TailingIterator::Seek(const Slice& target) {
if (!IsCurrentVersion()) {
CreateIterators();
}
mutable_->Seek(target);
// We maintain the interval (prev_key_, immutable_->key()] such that there
// are no records with keys within that range in immutable_ other than
// immutable_->key(). Since immutable_ can't change in this version, we don't
// need to do a seek if 'target' belongs to that interval (i.e. immutable_ is
// already at the correct position)!
//
// If prefix seek is used and immutable_ is not valid, seek if target has a
// different prefix than prev_key.
//
// prev_key_ is updated by Next(). SeekImmutable() sets prev_key_ to
// 'target' -- in this case, prev_key_ is included in the interval, so
// prev_inclusive_ has to be set.
const Comparator* cmp = cfd_->user_comparator();
if (!is_prev_set_ || cmp->Compare(prev_key_, target) >= !is_prev_inclusive_ ||
(immutable_->Valid() && cmp->Compare(target, immutable_->key()) > 0) ||
(cfd_->options()->prefix_extractor != nullptr && !IsSamePrefix(target))) {
SeekImmutable(target);
}
UpdateCurrent();
}
void TailingIterator::Next() {
assert(Valid());
if (!IsCurrentVersion()) {
// save the current key, create new iterators and then seek
std::string current_key = key().ToString();
Slice key_slice(current_key.data(), current_key.size());
CreateIterators();
Seek(key_slice);
if (!Valid() || key().compare(key_slice) != 0) {
// record with current_key no longer exists
return;
}
} else if (current_ == immutable_.get()) {
// immutable iterator is advanced -- update prev_key_
prev_key_ = key().ToString();
is_prev_inclusive_ = false;
is_prev_set_ = true;
}
current_->Next();
UpdateCurrent();
}
Slice TailingIterator::key() const {
assert(Valid());
return current_->key();
}
Slice TailingIterator::value() const {
assert(Valid());
return current_->value();
}
Status TailingIterator::status() const {
if (!status_.ok()) {
return status_;
} else if (!mutable_->status().ok()) {
return mutable_->status();
} else {
return immutable_->status();
}
}
void TailingIterator::Prev() {
status_ = Status::NotSupported("This iterator doesn't support Prev()");
}
void TailingIterator::SeekToLast() {
status_ = Status::NotSupported("This iterator doesn't support SeekToLast()");
}
void TailingIterator::Cleanup() {
// Release old super version if necessary
mutable_.reset();
immutable_.reset();
if (super_version_ != nullptr && super_version_->Unref()) {
DBImpl::DeletionState deletion_state;
db_->mutex_.Lock();
super_version_->Cleanup();
db_->FindObsoleteFiles(deletion_state, false, true);
db_->mutex_.Unlock();
delete super_version_;
if (deletion_state.HaveSomethingToDelete()) {
db_->PurgeObsoleteFiles(deletion_state);
}
}
}
void TailingIterator::CreateIterators() {
Cleanup();
super_version_= cfd_->GetReferencedSuperVersion(&(db_->mutex_));
Iterator* mutable_iter = super_version_->mem->NewIterator(read_options_);
// create a DBIter that only uses memtable content; see NewIterator()
mutable_.reset(
NewDBIterator(env_, *cfd_->options(), cfd_->user_comparator(),
mutable_iter, kMaxSequenceNumber));
std::vector<Iterator*> list;
super_version_->imm->AddIterators(read_options_, &list);
super_version_->current->AddIterators(
read_options_, *cfd_->soptions(), &list);
Iterator* immutable_iter =
NewMergingIterator(&cfd_->internal_comparator(), &list[0], list.size());
// create a DBIter that only uses memtable content; see NewIterator()
immutable_.reset(
NewDBIterator(env_, *cfd_->options(), cfd_->user_comparator(),
immutable_iter, kMaxSequenceNumber));
current_ = nullptr;
is_prev_set_ = false;
}
void TailingIterator::UpdateCurrent() {
current_ = nullptr;
if (mutable_->Valid()) {
current_ = mutable_.get();
}
const Comparator* cmp = cfd_->user_comparator();
if (immutable_->Valid() &&
(current_ == nullptr ||
cmp->Compare(immutable_->key(), current_->key()) < 0)) {
current_ = immutable_.get();
}
if (!status_.ok()) {
// reset status that was set by Prev() or SeekToLast()
status_ = Status::OK();
}
}
bool TailingIterator::IsCurrentVersion() const {
return super_version_ != nullptr &&
super_version_->version_number == cfd_->GetSuperVersionNumber();
}
bool TailingIterator::IsSamePrefix(const Slice& target) const {
const SliceTransform* extractor = cfd_->options()->prefix_extractor.get();
assert(extractor);
assert(is_prev_set_);
return extractor->Transform(target)
.compare(extractor->Transform(prev_key_)) == 0;
}
void TailingIterator::SeekImmutable(const Slice& target) {
prev_key_ = target.ToString();
is_prev_inclusive_ = true;
is_prev_set_ = true;
immutable_->Seek(target);
}
} // namespace rocksdb
#endif // ROCKSDB_LITE

97
db/tailing_iter.h Normal file
View File

@@ -0,0 +1,97 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#pragma once
#ifndef ROCKSDB_LITE
#include <string>
#include "rocksdb/db.h"
#include "rocksdb/iterator.h"
#include "rocksdb/options.h"
namespace rocksdb {
class DBImpl;
class Env;
struct SuperVersion;
class ColumnFamilyData;
/**
* TailingIterator is a special type of iterator that doesn't use an (implicit)
* snapshot. In other words, it can be used to read data that was added to the
* db after the iterator had been created.
*
* TailingIterator is optimized for sequential reading. It doesn't support
* Prev() and SeekToLast() operations.
*/
class TailingIterator : public Iterator {
public:
TailingIterator(Env* const env, DBImpl* db, const ReadOptions& read_options,
ColumnFamilyData* cfd);
virtual ~TailingIterator();
virtual bool Valid() const override;
virtual void SeekToFirst() override;
virtual void SeekToLast() override;
virtual void Seek(const Slice& target) override;
virtual void Next() override;
virtual void Prev() override;
virtual Slice key() const override;
virtual Slice value() const override;
virtual Status status() const override;
private:
void Cleanup();
Env* const env_;
DBImpl* const db_;
const ReadOptions read_options_;
ColumnFamilyData* const cfd_;
SuperVersion* super_version_;
// TailingIterator merges the contents of the two iterators below (one using
// mutable memtable contents only, other over SSTs and immutable memtables).
// See DBIter::GetTailingIteratorPair().
std::unique_ptr<Iterator> mutable_;
std::unique_ptr<Iterator> immutable_;
// points to either mutable_ or immutable_
Iterator* current_;
// key that precedes immutable iterator's current key
std::string prev_key_;
// unless prev_set is true, prev_key/prev_head is not valid and shouldn't be
// used; reset by createIterators()
bool is_prev_set_;
// prev_key_ was set by SeekImmutable(), which means that the interval of
// keys covered by immutable_ is [prev_key_, current], i.e. it includes the
// left endpoint
bool is_prev_inclusive_;
// internal iterator status
Status status_;
// check if this iterator's version matches DB's version
bool IsCurrentVersion() const;
// check if SeekImmutable() is needed due to target having a different prefix
// than prev_key_ (used when in prefix seek mode)
bool IsSamePrefix(const Slice& target) const;
// creates mutable_ and immutable_ iterators and updates version_number_
void CreateIterators();
// set current_ to be one of the iterators with the smallest key
void UpdateCurrent();
// seek on immutable_ and update prev_key
void SeekImmutable(const Slice& target);
};
} // namespace rocksdb
#endif // ROCKSDB_LITE

262
db/transaction_log_impl.cc Normal file
View File

@@ -0,0 +1,262 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#ifndef ROCKSDB_LITE
#include "db/transaction_log_impl.h"
#include "db/write_batch_internal.h"
namespace rocksdb {
TransactionLogIteratorImpl::TransactionLogIteratorImpl(
const std::string& dir, const DBOptions* options,
const TransactionLogIterator::ReadOptions& read_options,
const EnvOptions& soptions, const SequenceNumber seq,
std::unique_ptr<VectorLogPtr> files, DBImpl const* const dbimpl)
: dir_(dir),
options_(options),
read_options_(read_options),
soptions_(soptions),
startingSequenceNumber_(seq),
files_(std::move(files)),
started_(false),
isValid_(false),
currentFileIndex_(0),
currentBatchSeq_(0),
currentLastSeq_(0),
dbimpl_(dbimpl) {
assert(files_ != nullptr);
assert(dbimpl_ != nullptr);
reporter_.env = options_->env;
reporter_.info_log = options_->info_log.get();
SeekToStartSequence(); // Seek till starting sequence
}
Status TransactionLogIteratorImpl::OpenLogFile(
const LogFile* logFile,
unique_ptr<SequentialFile>* file) {
Env* env = options_->env;
if (logFile->Type() == kArchivedLogFile) {
std::string fname = ArchivedLogFileName(dir_, logFile->LogNumber());
return env->NewSequentialFile(fname, file, soptions_);
} else {
std::string fname = LogFileName(dir_, logFile->LogNumber());
Status status = env->NewSequentialFile(fname, file, soptions_);
if (!status.ok()) {
// If cannot open file in DB directory.
// Try the archive dir, as it could have moved in the meanwhile.
fname = ArchivedLogFileName(dir_, logFile->LogNumber());
status = env->NewSequentialFile(fname, file, soptions_);
}
return status;
}
}
BatchResult TransactionLogIteratorImpl::GetBatch() {
assert(isValid_); // cannot call in a non valid state.
BatchResult result;
result.sequence = currentBatchSeq_;
result.writeBatchPtr = std::move(currentBatch_);
return result;
}
Status TransactionLogIteratorImpl::status() {
return currentStatus_;
}
bool TransactionLogIteratorImpl::Valid() {
return started_ && isValid_;
}
bool TransactionLogIteratorImpl::RestrictedRead(
Slice* record,
std::string* scratch) {
// Don't read if no more complete entries to read from logs
if (currentLastSeq_ >= dbimpl_->GetLatestSequenceNumber()) {
return false;
}
return currentLogReader_->ReadRecord(record, scratch);
}
void TransactionLogIteratorImpl::SeekToStartSequence(
uint64_t startFileIndex,
bool strict) {
std::string scratch;
Slice record;
started_ = false;
isValid_ = false;
if (files_->size() <= startFileIndex) {
return;
}
Status s = OpenLogReader(files_->at(startFileIndex).get());
if (!s.ok()) {
currentStatus_ = s;
reporter_.Info(currentStatus_.ToString().c_str());
return;
}
while (RestrictedRead(&record, &scratch)) {
if (record.size() < 12) {
reporter_.Corruption(
record.size(), Status::Corruption("very small log record"));
continue;
}
UpdateCurrentWriteBatch(record);
if (currentLastSeq_ >= startingSequenceNumber_) {
if (strict && currentBatchSeq_ != startingSequenceNumber_) {
currentStatus_ = Status::Corruption("Gap in sequence number. Could not "
"seek to required sequence number");
reporter_.Info(currentStatus_.ToString().c_str());
return;
} else if (strict) {
reporter_.Info("Could seek required sequence number. Iterator will "
"continue.");
}
isValid_ = true;
started_ = true; // set started_ as we could seek till starting sequence
return;
} else {
isValid_ = false;
}
}
// Could not find start sequence in first file. Normally this must be the
// only file. Otherwise log the error and let the iterator return next entry
// If strict is set, we want to seek exactly till the start sequence and it
// should have been present in the file we scanned above
if (strict) {
currentStatus_ = Status::Corruption("Gap in sequence number. Could not "
"seek to required sequence number");
reporter_.Info(currentStatus_.ToString().c_str());
} else if (files_->size() != 1) {
currentStatus_ = Status::Corruption("Start sequence was not found, "
"skipping to the next available");
reporter_.Info(currentStatus_.ToString().c_str());
// Let NextImpl find the next available entry. started_ remains false
// because we don't want to check for gaps while moving to start sequence
NextImpl(true);
}
}
void TransactionLogIteratorImpl::Next() {
return NextImpl(false);
}
void TransactionLogIteratorImpl::NextImpl(bool internal) {
std::string scratch;
Slice record;
isValid_ = false;
if (!internal && !started_) {
// Runs every time until we can seek to the start sequence
return SeekToStartSequence();
}
while(true) {
assert(currentLogReader_);
if (currentLogReader_->IsEOF()) {
currentLogReader_->UnmarkEOF();
}
while (RestrictedRead(&record, &scratch)) {
if (record.size() < 12) {
reporter_.Corruption(
record.size(), Status::Corruption("very small log record"));
continue;
} else {
// started_ should be true if called by application
assert(internal || started_);
// started_ should be false if called internally
assert(!internal || !started_);
UpdateCurrentWriteBatch(record);
if (internal && !started_) {
started_ = true;
}
return;
}
}
// Open the next file
if (currentFileIndex_ < files_->size() - 1) {
++currentFileIndex_;
Status status =OpenLogReader(files_->at(currentFileIndex_).get());
if (!status.ok()) {
isValid_ = false;
currentStatus_ = status;
return;
}
} else {
isValid_ = false;
if (currentLastSeq_ == dbimpl_->GetLatestSequenceNumber()) {
currentStatus_ = Status::OK();
} else {
currentStatus_ = Status::Corruption("NO MORE DATA LEFT");
}
return;
}
}
}
bool TransactionLogIteratorImpl::IsBatchExpected(
const WriteBatch* batch,
const SequenceNumber expectedSeq) {
assert(batch);
SequenceNumber batchSeq = WriteBatchInternal::Sequence(batch);
if (batchSeq != expectedSeq) {
char buf[200];
snprintf(buf, sizeof(buf),
"Discontinuity in log records. Got seq=%lu, Expected seq=%lu, "
"Last flushed seq=%lu.Log iterator will reseek the correct "
"batch.",
(unsigned long)batchSeq,
(unsigned long)expectedSeq,
(unsigned long)dbimpl_->GetLatestSequenceNumber());
reporter_.Info(buf);
return false;
}
return true;
}
void TransactionLogIteratorImpl::UpdateCurrentWriteBatch(const Slice& record) {
std::unique_ptr<WriteBatch> batch(new WriteBatch());
WriteBatchInternal::SetContents(batch.get(), record);
SequenceNumber expectedSeq = currentLastSeq_ + 1;
// If the iterator has started, then confirm that we get continuous batches
if (started_ && !IsBatchExpected(batch.get(), expectedSeq)) {
// Seek to the batch having expected sequence number
if (expectedSeq < files_->at(currentFileIndex_)->StartSequence()) {
// Expected batch must lie in the previous log file
// Avoid underflow.
if (currentFileIndex_ != 0) {
currentFileIndex_--;
}
}
startingSequenceNumber_ = expectedSeq;
// currentStatus_ will be set to Ok if reseek succeeds
currentStatus_ = Status::NotFound("Gap in sequence numbers");
return SeekToStartSequence(currentFileIndex_, true);
}
currentBatchSeq_ = WriteBatchInternal::Sequence(batch.get());
currentLastSeq_ = currentBatchSeq_ +
WriteBatchInternal::Count(batch.get()) - 1;
// currentBatchSeq_ can only change here
assert(currentLastSeq_ <= dbimpl_->GetLatestSequenceNumber());
currentBatch_ = move(batch);
isValid_ = true;
currentStatus_ = Status::OK();
}
Status TransactionLogIteratorImpl::OpenLogReader(const LogFile* logFile) {
unique_ptr<SequentialFile> file;
Status status = OpenLogFile(logFile, &file);
if (!status.ok()) {
return status;
}
assert(file);
currentLogReader_.reset(new log::Reader(std::move(file), &reporter_,
read_options_.verify_checksums_, 0));
return Status::OK();
}
} // namespace rocksdb
#endif // ROCKSDB_LITE

120
db/transaction_log_impl.h Normal file
View File

@@ -0,0 +1,120 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#ifndef ROCKSDB_LITE
#pragma once
#include <vector>
#include "rocksdb/env.h"
#include "rocksdb/options.h"
#include "rocksdb/types.h"
#include "rocksdb/transaction_log.h"
#include "db/db_impl.h"
#include "db/log_reader.h"
#include "db/filename.h"
namespace rocksdb {
struct LogReporter : public log::Reader::Reporter {
Env* env;
Logger* info_log;
virtual void Corruption(size_t bytes, const Status& s) {
Log(info_log, "dropping %zu bytes; %s", bytes, s.ToString().c_str());
}
virtual void Info(const char* s) {
Log(info_log, "%s", s);
}
};
class LogFileImpl : public LogFile {
public:
LogFileImpl(uint64_t logNum, WalFileType logType, SequenceNumber startSeq,
uint64_t sizeBytes) :
logNumber_(logNum),
type_(logType),
startSequence_(startSeq),
sizeFileBytes_(sizeBytes) {
}
std::string PathName() const {
if (type_ == kArchivedLogFile) {
return ArchivedLogFileName("", logNumber_);
}
return LogFileName("", logNumber_);
}
uint64_t LogNumber() const { return logNumber_; }
WalFileType Type() const { return type_; }
SequenceNumber StartSequence() const { return startSequence_; }
uint64_t SizeFileBytes() const { return sizeFileBytes_; }
bool operator < (const LogFile& that) const {
return LogNumber() < that.LogNumber();
}
private:
uint64_t logNumber_;
WalFileType type_;
SequenceNumber startSequence_;
uint64_t sizeFileBytes_;
};
class TransactionLogIteratorImpl : public TransactionLogIterator {
public:
TransactionLogIteratorImpl(
const std::string& dir, const DBOptions* options,
const TransactionLogIterator::ReadOptions& read_options,
const EnvOptions& soptions, const SequenceNumber seqNum,
std::unique_ptr<VectorLogPtr> files, DBImpl const* const dbimpl);
virtual bool Valid();
virtual void Next();
virtual Status status();
virtual BatchResult GetBatch();
private:
const std::string& dir_;
const DBOptions* options_;
const TransactionLogIterator::ReadOptions read_options_;
const EnvOptions& soptions_;
SequenceNumber startingSequenceNumber_;
std::unique_ptr<VectorLogPtr> files_;
bool started_;
bool isValid_; // not valid when it starts of.
Status currentStatus_;
size_t currentFileIndex_;
std::unique_ptr<WriteBatch> currentBatch_;
unique_ptr<log::Reader> currentLogReader_;
Status OpenLogFile(const LogFile* logFile, unique_ptr<SequentialFile>* file);
LogReporter reporter_;
SequenceNumber currentBatchSeq_; // sequence number at start of current batch
SequenceNumber currentLastSeq_; // last sequence in the current batch
DBImpl const * const dbimpl_; // The db on whose log files this iterates
// Reads from transaction log only if the writebatch record has been written
bool RestrictedRead(Slice* record, std::string* scratch);
// Seeks to startingSequenceNumber reading from startFileIndex in files_.
// If strict is set,then must get a batch starting with startingSequenceNumber
void SeekToStartSequence(uint64_t startFileIndex = 0, bool strict = false);
// Implementation of Next. SeekToStartSequence calls it internally with
// internal=true to let it find next entry even if it has to jump gaps because
// the iterator may start off from the first available entry but promises to
// be continuous after that
void NextImpl(bool internal = false);
// Check if batch is expected, else return false
bool IsBatchExpected(const WriteBatch* batch, SequenceNumber expectedSeq);
// Update current batch if a continuous batch is found, else return false
void UpdateCurrentWriteBatch(const Slice& record);
Status OpenLogReader(const LogFile* file);
};
} // namespace rocksdb
#endif // ROCKSDB_LITE

364
db/version_edit.cc Normal file
View File

@@ -0,0 +1,364 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/version_edit.h"
#include "db/version_set.h"
#include "util/coding.h"
#include "rocksdb/slice.h"
namespace rocksdb {
// Tag numbers for serialized VersionEdit. These numbers are written to
// disk and should not be changed.
enum Tag {
kComparator = 1,
kLogNumber = 2,
kNextFileNumber = 3,
kLastSequence = 4,
kCompactPointer = 5,
kDeletedFile = 6,
kNewFile = 7,
// 8 was used for large value refs
kPrevLogNumber = 9,
// these are new formats divergent from open source leveldb
kNewFile2 = 100, // store smallest & largest seqno
kColumnFamily = 200, // specify column family for version edit
kColumnFamilyAdd = 201,
kColumnFamilyDrop = 202,
kMaxColumnFamily = 203,
};
void VersionEdit::Clear() {
comparator_.clear();
max_level_ = 0;
log_number_ = 0;
prev_log_number_ = 0;
last_sequence_ = 0;
next_file_number_ = 0;
max_column_family_ = 0;
has_comparator_ = false;
has_log_number_ = false;
has_prev_log_number_ = false;
has_next_file_number_ = false;
has_last_sequence_ = false;
has_max_column_family_ = false;
deleted_files_.clear();
new_files_.clear();
column_family_ = 0;
is_column_family_add_ = 0;
is_column_family_drop_ = 0;
column_family_name_.clear();
}
void VersionEdit::EncodeTo(std::string* dst) const {
if (has_comparator_) {
PutVarint32(dst, kComparator);
PutLengthPrefixedSlice(dst, comparator_);
}
if (has_log_number_) {
PutVarint32(dst, kLogNumber);
PutVarint64(dst, log_number_);
}
if (has_prev_log_number_) {
PutVarint32(dst, kPrevLogNumber);
PutVarint64(dst, prev_log_number_);
}
if (has_next_file_number_) {
PutVarint32(dst, kNextFileNumber);
PutVarint64(dst, next_file_number_);
}
if (has_last_sequence_) {
PutVarint32(dst, kLastSequence);
PutVarint64(dst, last_sequence_);
}
if (has_max_column_family_) {
PutVarint32(dst, kMaxColumnFamily);
PutVarint32(dst, max_column_family_);
}
for (const auto& deleted : deleted_files_) {
PutVarint32(dst, kDeletedFile);
PutVarint32(dst, deleted.first /* level */);
PutVarint64(dst, deleted.second /* file number */);
}
for (size_t i = 0; i < new_files_.size(); i++) {
const FileMetaData& f = new_files_[i].second;
PutVarint32(dst, kNewFile2);
PutVarint32(dst, new_files_[i].first); // level
PutVarint64(dst, f.number);
PutVarint64(dst, f.file_size);
PutLengthPrefixedSlice(dst, f.smallest.Encode());
PutLengthPrefixedSlice(dst, f.largest.Encode());
PutVarint64(dst, f.smallest_seqno);
PutVarint64(dst, f.largest_seqno);
}
// 0 is default and does not need to be explicitly written
if (column_family_ != 0) {
PutVarint32(dst, kColumnFamily);
PutVarint32(dst, column_family_);
}
if (is_column_family_add_) {
PutVarint32(dst, kColumnFamilyAdd);
PutLengthPrefixedSlice(dst, Slice(column_family_name_));
}
if (is_column_family_drop_) {
PutVarint32(dst, kColumnFamilyDrop);
}
}
static bool GetInternalKey(Slice* input, InternalKey* dst) {
Slice str;
if (GetLengthPrefixedSlice(input, &str)) {
dst->DecodeFrom(str);
return dst->Valid();
} else {
return false;
}
}
bool VersionEdit::GetLevel(Slice* input, int* level, const char** msg) {
uint32_t v;
if (GetVarint32(input, &v)) {
*level = v;
if (max_level_ < *level) {
max_level_ = *level;
}
return true;
} else {
return false;
}
}
Status VersionEdit::DecodeFrom(const Slice& src) {
Clear();
Slice input = src;
const char* msg = nullptr;
uint32_t tag;
// Temporary storage for parsing
int level;
uint64_t number;
FileMetaData f;
Slice str;
InternalKey key;
while (msg == nullptr && GetVarint32(&input, &tag)) {
switch (tag) {
case kComparator:
if (GetLengthPrefixedSlice(&input, &str)) {
comparator_ = str.ToString();
has_comparator_ = true;
} else {
msg = "comparator name";
}
break;
case kLogNumber:
if (GetVarint64(&input, &log_number_)) {
has_log_number_ = true;
} else {
msg = "log number";
}
break;
case kPrevLogNumber:
if (GetVarint64(&input, &prev_log_number_)) {
has_prev_log_number_ = true;
} else {
msg = "previous log number";
}
break;
case kNextFileNumber:
if (GetVarint64(&input, &next_file_number_)) {
has_next_file_number_ = true;
} else {
msg = "next file number";
}
break;
case kLastSequence:
if (GetVarint64(&input, &last_sequence_)) {
has_last_sequence_ = true;
} else {
msg = "last sequence number";
}
break;
case kMaxColumnFamily:
if (GetVarint32(&input, &max_column_family_)) {
has_max_column_family_ = true;
} else {
msg = "max column family";
}
break;
case kCompactPointer:
if (GetLevel(&input, &level, &msg) &&
GetInternalKey(&input, &key)) {
// we don't use compact pointers anymore,
// but we should not fail if they are still
// in manifest
} else {
if (!msg) {
msg = "compaction pointer";
}
}
break;
case kDeletedFile:
if (GetLevel(&input, &level, &msg) &&
GetVarint64(&input, &number)) {
deleted_files_.insert(std::make_pair(level, number));
} else {
if (!msg) {
msg = "deleted file";
}
}
break;
case kNewFile:
if (GetLevel(&input, &level, &msg) &&
GetVarint64(&input, &f.number) &&
GetVarint64(&input, &f.file_size) &&
GetInternalKey(&input, &f.smallest) &&
GetInternalKey(&input, &f.largest)) {
new_files_.push_back(std::make_pair(level, f));
} else {
if (!msg) {
msg = "new-file entry";
}
}
break;
case kNewFile2:
if (GetLevel(&input, &level, &msg) &&
GetVarint64(&input, &f.number) &&
GetVarint64(&input, &f.file_size) &&
GetInternalKey(&input, &f.smallest) &&
GetInternalKey(&input, &f.largest) &&
GetVarint64(&input, &f.smallest_seqno) &&
GetVarint64(&input, &f.largest_seqno) ) {
new_files_.push_back(std::make_pair(level, f));
} else {
if (!msg) {
msg = "new-file2 entry";
}
}
break;
case kColumnFamily:
if (!GetVarint32(&input, &column_family_)) {
if (!msg) {
msg = "set column family id";
}
}
break;
case kColumnFamilyAdd:
if (GetLengthPrefixedSlice(&input, &str)) {
is_column_family_add_ = true;
column_family_name_ = str.ToString();
} else {
if (!msg) {
msg = "column family add";
}
}
break;
case kColumnFamilyDrop:
is_column_family_drop_ = true;
break;
default:
msg = "unknown tag";
break;
}
}
if (msg == nullptr && !input.empty()) {
msg = "invalid tag";
}
Status result;
if (msg != nullptr) {
result = Status::Corruption("VersionEdit", msg);
}
return result;
}
std::string VersionEdit::DebugString(bool hex_key) const {
std::string r;
r.append("VersionEdit {");
if (has_comparator_) {
r.append("\n Comparator: ");
r.append(comparator_);
}
if (has_log_number_) {
r.append("\n LogNumber: ");
AppendNumberTo(&r, log_number_);
}
if (has_prev_log_number_) {
r.append("\n PrevLogNumber: ");
AppendNumberTo(&r, prev_log_number_);
}
if (has_next_file_number_) {
r.append("\n NextFile: ");
AppendNumberTo(&r, next_file_number_);
}
if (has_last_sequence_) {
r.append("\n LastSeq: ");
AppendNumberTo(&r, last_sequence_);
}
for (DeletedFileSet::const_iterator iter = deleted_files_.begin();
iter != deleted_files_.end();
++iter) {
r.append("\n DeleteFile: ");
AppendNumberTo(&r, iter->first);
r.append(" ");
AppendNumberTo(&r, iter->second);
}
for (size_t i = 0; i < new_files_.size(); i++) {
const FileMetaData& f = new_files_[i].second;
r.append("\n AddFile: ");
AppendNumberTo(&r, new_files_[i].first);
r.append(" ");
AppendNumberTo(&r, f.number);
r.append(" ");
AppendNumberTo(&r, f.file_size);
r.append(" ");
r.append(f.smallest.DebugString(hex_key));
r.append(" .. ");
r.append(f.largest.DebugString(hex_key));
}
r.append("\n ColumnFamily: ");
AppendNumberTo(&r, column_family_);
if (is_column_family_add_) {
r.append("\n ColumnFamilyAdd: ");
r.append(column_family_name_);
}
if (is_column_family_drop_) {
r.append("\n ColumnFamilyDrop");
}
if (has_max_column_family_) {
r.append("\n MaxColumnFamily: ");
AppendNumberTo(&r, max_column_family_);
}
r.append("\n}\n");
return r;
}
} // namespace rocksdb

176
db/version_edit.h Normal file
View File

@@ -0,0 +1,176 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#pragma once
#include <set>
#include <utility>
#include <vector>
#include <string>
#include "rocksdb/cache.h"
#include "db/dbformat.h"
namespace rocksdb {
class VersionSet;
struct FileMetaData {
int refs;
int allowed_seeks; // Seeks allowed until compaction
uint64_t number;
uint64_t file_size; // File size in bytes
InternalKey smallest; // Smallest internal key served by table
InternalKey largest; // Largest internal key served by table
bool being_compacted; // Is this file undergoing compaction?
SequenceNumber smallest_seqno;// The smallest seqno in this file
SequenceNumber largest_seqno; // The largest seqno in this file
// Needs to be disposed when refs becomes 0.
Cache::Handle* table_reader_handle;
// Table reader in table_reader_handle
TableReader* table_reader;
FileMetaData(uint64_t number, uint64_t file_size)
: refs(0),
allowed_seeks(1 << 30),
number(number),
file_size(file_size),
being_compacted(false),
table_reader_handle(nullptr),
table_reader(nullptr) {}
FileMetaData() : FileMetaData(0, 0) {}
};
class VersionEdit {
public:
VersionEdit() { Clear(); }
~VersionEdit() { }
void Clear();
void SetComparatorName(const Slice& name) {
has_comparator_ = true;
comparator_ = name.ToString();
}
void SetLogNumber(uint64_t num) {
has_log_number_ = true;
log_number_ = num;
}
void SetPrevLogNumber(uint64_t num) {
has_prev_log_number_ = true;
prev_log_number_ = num;
}
void SetNextFile(uint64_t num) {
has_next_file_number_ = true;
next_file_number_ = num;
}
void SetLastSequence(SequenceNumber seq) {
has_last_sequence_ = true;
last_sequence_ = seq;
}
void SetMaxColumnFamily(uint32_t max_column_family) {
has_max_column_family_ = true;
max_column_family_ = max_column_family;
}
// Add the specified file at the specified number.
// REQUIRES: This version has not been saved (see VersionSet::SaveTo)
// REQUIRES: "smallest" and "largest" are smallest and largest keys in file
void AddFile(int level, uint64_t file,
uint64_t file_size,
const InternalKey& smallest,
const InternalKey& largest,
const SequenceNumber& smallest_seqno,
const SequenceNumber& largest_seqno) {
assert(smallest_seqno <= largest_seqno);
FileMetaData f;
f.number = file;
f.file_size = file_size;
f.smallest = smallest;
f.largest = largest;
f.smallest_seqno = smallest_seqno;
f.largest_seqno = largest_seqno;
new_files_.push_back(std::make_pair(level, f));
}
// Delete the specified "file" from the specified "level".
void DeleteFile(int level, uint64_t file) {
deleted_files_.insert({level, file});
}
// Number of edits
int NumEntries() {
return new_files_.size() + deleted_files_.size();
}
bool IsColumnFamilyManipulation() {
return is_column_family_add_ || is_column_family_drop_;
}
void SetColumnFamily(uint32_t column_family_id) {
column_family_ = column_family_id;
}
// set column family ID by calling SetColumnFamily()
void AddColumnFamily(const std::string& name) {
assert(!is_column_family_drop_);
assert(!is_column_family_add_);
assert(NumEntries() == 0);
is_column_family_add_ = true;
column_family_name_ = name;
}
// set column family ID by calling SetColumnFamily()
void DropColumnFamily() {
assert(!is_column_family_drop_);
assert(!is_column_family_add_);
assert(NumEntries() == 0);
is_column_family_drop_ = true;
}
void EncodeTo(std::string* dst) const;
Status DecodeFrom(const Slice& src);
std::string DebugString(bool hex_key = false) const;
private:
friend class VersionSet;
typedef std::set< std::pair<int, uint64_t>> DeletedFileSet;
bool GetLevel(Slice* input, int* level, const char** msg);
int max_level_;
std::string comparator_;
uint64_t log_number_;
uint64_t prev_log_number_;
uint64_t next_file_number_;
uint32_t max_column_family_;
SequenceNumber last_sequence_;
bool has_comparator_;
bool has_log_number_;
bool has_prev_log_number_;
bool has_next_file_number_;
bool has_last_sequence_;
bool has_max_column_family_;
DeletedFileSet deleted_files_;
std::vector<std::pair<int, FileMetaData>> new_files_;
// Each version edit record should have column_family_id set
// If it's not set, it is default (0)
uint32_t column_family_;
// a version edit can be either column_family add or
// column_family drop. If it's column family add,
// it also includes column family name.
bool is_column_family_drop_;
bool is_column_family_add_;
std::string column_family_name_;
};
} // namespace rocksdb

65
db/version_edit_test.cc Normal file
View File

@@ -0,0 +1,65 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/version_edit.h"
#include "util/testharness.h"
namespace rocksdb {
static void TestEncodeDecode(const VersionEdit& edit) {
std::string encoded, encoded2;
edit.EncodeTo(&encoded);
VersionEdit parsed;
Status s = parsed.DecodeFrom(encoded);
ASSERT_TRUE(s.ok()) << s.ToString();
parsed.EncodeTo(&encoded2);
ASSERT_EQ(encoded, encoded2);
}
class VersionEditTest { };
TEST(VersionEditTest, EncodeDecode) {
static const uint64_t kBig = 1ull << 50;
VersionEdit edit;
for (int i = 0; i < 4; i++) {
TestEncodeDecode(edit);
edit.AddFile(3, kBig + 300 + i, kBig + 400 + i,
InternalKey("foo", kBig + 500 + i, kTypeValue),
InternalKey("zoo", kBig + 600 + i, kTypeDeletion),
kBig + 500 + i,
kBig + 600 + i);
edit.DeleteFile(4, kBig + 700 + i);
}
edit.SetComparatorName("foo");
edit.SetLogNumber(kBig + 100);
edit.SetNextFile(kBig + 200);
edit.SetLastSequence(kBig + 1000);
TestEncodeDecode(edit);
}
TEST(VersionEditTest, ColumnFamilyTest) {
VersionEdit edit;
edit.SetColumnFamily(2);
edit.AddColumnFamily("column_family");
edit.SetMaxColumnFamily(5);
TestEncodeDecode(edit);
edit.Clear();
edit.SetColumnFamily(3);
edit.DropColumnFamily();
TestEncodeDecode(edit);
}
} // namespace rocksdb
int main(int argc, char** argv) {
return rocksdb::test::RunAllTests();
}

2822
db/version_set.cc Normal file

File diff suppressed because it is too large Load Diff

499
db/version_set.h Normal file
View File

@@ -0,0 +1,499 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// The representation of a DBImpl consists of a set of Versions. The
// newest version is called "current". Older versions may be kept
// around to provide a consistent view to live iterators.
//
// Each Version keeps track of a set of Table files per level. The
// entire set of versions is maintained in a VersionSet.
//
// Version,VersionSet are thread-compatible, but require external
// synchronization on all accesses.
#pragma once
#include <map>
#include <memory>
#include <set>
#include <vector>
#include <deque>
#include <atomic>
#include <limits>
#include "db/dbformat.h"
#include "db/version_edit.h"
#include "port/port.h"
#include "db/table_cache.h"
#include "db/compaction.h"
#include "db/compaction_picker.h"
#include "db/column_family.h"
#include "db/log_reader.h"
#include "db/file_indexer.h"
namespace rocksdb {
namespace log { class Writer; }
class Compaction;
class CompactionPicker;
class Iterator;
class LogBuffer;
class LookupKey;
class MemTable;
class Version;
class VersionSet;
class MergeContext;
class ColumnFamilyData;
class ColumnFamilySet;
class TableCache;
class MergeIteratorBuilder;
// Return the smallest index i such that files[i]->largest >= key.
// Return files.size() if there is no such file.
// REQUIRES: "files" contains a sorted list of non-overlapping files.
extern int FindFile(const InternalKeyComparator& icmp,
const std::vector<FileMetaData*>& files,
const Slice& key);
// Returns true iff some file in "files" overlaps the user key range
// [*smallest,*largest].
// smallest==nullptr represents a key smaller than all keys in the DB.
// largest==nullptr represents a key largest than all keys in the DB.
// REQUIRES: If disjoint_sorted_files, files[] contains disjoint ranges
// in sorted order.
extern bool SomeFileOverlapsRange(
const InternalKeyComparator& icmp,
bool disjoint_sorted_files,
const std::vector<FileMetaData*>& files,
const Slice* smallest_user_key,
const Slice* largest_user_key);
class Version {
public:
// Append to *iters a sequence of iterators that will
// yield the contents of this Version when merged together.
// REQUIRES: This version has been saved (see VersionSet::SaveTo)
void AddIterators(const ReadOptions&, const EnvOptions& soptions,
std::vector<Iterator*>* iters);
void AddIterators(const ReadOptions&, const EnvOptions& soptions,
MergeIteratorBuilder* merger_iter_builder);
// Lookup the value for key. If found, store it in *val and
// return OK. Else return a non-OK status. Fills *stats.
// Uses *operands to store merge_operator operations to apply later
// REQUIRES: lock is not held
struct GetStats {
FileMetaData* seek_file;
int seek_file_level;
};
void Get(const ReadOptions&, const LookupKey& key, std::string* val,
Status* status, MergeContext* merge_context, GetStats* stats,
bool* value_found = nullptr);
// Adds "stats" into the current state. Returns true if a new
// compaction may need to be triggered, false otherwise.
// REQUIRES: lock is held
bool UpdateStats(const GetStats& stats);
// Updates internal structures that keep track of compaction scores
// We use compaction scores to figure out which compaction to do next
// REQUIRES: If Version is not yet saved to current_, it can be called without
// a lock. Once a version is saved to current_, call only with mutex held
void ComputeCompactionScore(std::vector<uint64_t>& size_being_compacted);
// Reference count management (so Versions do not disappear out from
// under live iterators)
void Ref();
// Decrease reference count. Delete the object if no reference left
// and return true. Otherwise, return false.
bool Unref();
// Returns true iff some level needs a compaction.
bool NeedsCompaction() const;
// Returns the maxmimum compaction score for levels 1 to max
double MaxCompactionScore() const { return max_compaction_score_; }
// See field declaration
int MaxCompactionScoreLevel() const { return max_compaction_score_level_; }
void GetOverlappingInputs(
int level,
const InternalKey* begin, // nullptr means before all keys
const InternalKey* end, // nullptr means after all keys
std::vector<FileMetaData*>* inputs,
int hint_index = -1, // index of overlap file
int* file_index = nullptr); // return index of overlap file
void GetOverlappingInputsBinarySearch(
int level,
const Slice& begin, // nullptr means before all keys
const Slice& end, // nullptr means after all keys
std::vector<FileMetaData*>* inputs,
int hint_index, // index of overlap file
int* file_index); // return index of overlap file
void ExtendOverlappingInputs(
int level,
const Slice& begin, // nullptr means before all keys
const Slice& end, // nullptr means after all keys
std::vector<FileMetaData*>* inputs,
unsigned int index); // start extending from this index
// Returns true iff some file in the specified level overlaps
// some part of [*smallest_user_key,*largest_user_key].
// smallest_user_key==NULL represents a key smaller than all keys in the DB.
// largest_user_key==NULL represents a key largest than all keys in the DB.
bool OverlapInLevel(int level,
const Slice* smallest_user_key,
const Slice* largest_user_key);
// Returns true iff the first or last file in inputs contains
// an overlapping user key to the file "just outside" of it (i.e.
// just after the last file, or just before the first file)
// REQUIRES: "*inputs" is a sorted list of non-overlapping files
bool HasOverlappingUserKey(const std::vector<FileMetaData*>* inputs,
int level);
// Return the level at which we should place a new memtable compaction
// result that covers the range [smallest_user_key,largest_user_key].
int PickLevelForMemTableOutput(const Slice& smallest_user_key,
const Slice& largest_user_key);
int NumberLevels() const { return num_levels_; }
// REQUIRES: lock is held
int NumLevelFiles(int level) const { return files_[level].size(); }
// Return the combined file size of all files at the specified level.
int64_t NumLevelBytes(int level) const;
// Return a human-readable short (single-line) summary of the number
// of files per level. Uses *scratch as backing store.
struct LevelSummaryStorage {
char buffer[100];
};
struct FileSummaryStorage {
char buffer[1000];
};
const char* LevelSummary(LevelSummaryStorage* scratch) const;
// Return a human-readable short (single-line) summary of files
// in a specified level. Uses *scratch as backing store.
const char* LevelFileSummary(FileSummaryStorage* scratch, int level) const;
// Return the maximum overlapping data (in bytes) at next level for any
// file at a level >= 1.
int64_t MaxNextLevelOverlappingBytes();
// Add all files listed in the current version to *live.
void AddLiveFiles(std::set<uint64_t>* live);
// Return a human readable string that describes this version's contents.
std::string DebugString(bool hex = false) const;
// Returns the version nuber of this version
uint64_t GetVersionNumber() const { return version_number_; }
// REQUIRES: lock is held
// On success, *props will be populated with all SSTables' table properties.
// The keys of `props` are the sst file name, the values of `props` are the
// tables' propertis, represented as shared_ptr.
Status GetPropertiesOfAllTables(TablePropertiesCollection* props);
// used to sort files by size
struct Fsize {
int index;
FileMetaData* file;
};
private:
friend class Compaction;
friend class VersionSet;
friend class DBImpl;
friend class ColumnFamilyData;
friend class CompactionPicker;
friend class LevelCompactionPicker;
friend class UniversalCompactionPicker;
friend class FIFOCompactionPicker;
friend class ForwardIterator;
class LevelFileNumIterator;
class LevelFileIteratorState;
bool PrefixMayMatch(const ReadOptions& options, Iterator* level_iter,
const Slice& internal_prefix) const;
// Sort all files for this version based on their file size and
// record results in files_by_size_. The largest files are listed first.
void UpdateFilesBySize();
ColumnFamilyData* cfd_; // ColumnFamilyData to which this Version belongs
const InternalKeyComparator* internal_comparator_;
const Comparator* user_comparator_;
TableCache* table_cache_;
const MergeOperator* merge_operator_;
Logger* info_log_;
Statistics* db_statistics_;
VersionSet* vset_; // VersionSet to which this Version belongs
Version* next_; // Next version in linked list
Version* prev_; // Previous version in linked list
int refs_; // Number of live refs to this version
int num_levels_; // Number of levels
// List of files per level, files in each level are arranged
// in increasing order of keys
std::vector<FileMetaData*>* files_;
// A list for the same set of files that are stored in files_,
// but files in each level are now sorted based on file
// size. The file with the largest size is at the front.
// This vector stores the index of the file from files_.
std::vector<std::vector<int>> files_by_size_;
// An index into files_by_size_ that specifies the first
// file that is not yet compacted
std::vector<int> next_file_to_compact_by_size_;
// Only the first few entries of files_by_size_ are sorted.
// There is no need to sort all the files because it is likely
// that on a running system, we need to look at only the first
// few largest files because a new version is created every few
// seconds/minutes (because of concurrent compactions).
static const int number_of_files_to_sort_ = 50;
// Next file to compact based on seek stats.
FileMetaData* file_to_compact_;
int file_to_compact_level_;
// Level that should be compacted next and its compaction score.
// Score < 1 means compaction is not strictly needed. These fields
// are initialized by Finalize().
// The most critical level to be compacted is listed first
// These are used to pick the best compaction level
std::vector<double> compaction_score_;
std::vector<int> compaction_level_;
double max_compaction_score_; // max score in l1 to ln-1
int max_compaction_score_level_; // level on which max score occurs
// A version number that uniquely represents this version. This is
// used for debugging and logging purposes only.
uint64_t version_number_;
Version(ColumnFamilyData* cfd, VersionSet* vset, uint64_t version_number = 0);
FileIndexer file_indexer_;
~Version();
// re-initializes the index that is used to offset into files_by_size_
// to find the next compaction candidate file.
void ResetNextCompactionIndex(int level) {
next_file_to_compact_by_size_[level] = 0;
}
// No copying allowed
Version(const Version&);
void operator=(const Version&);
};
class VersionSet {
public:
VersionSet(const std::string& dbname, const DBOptions* options,
const EnvOptions& storage_options, Cache* table_cache);
~VersionSet();
// Apply *edit to the current version to form a new descriptor that
// is both saved to persistent state and installed as the new
// current version. Will release *mu while actually writing to the file.
// column_family_options has to be set if edit is column family add
// REQUIRES: *mu is held on entry.
// REQUIRES: no other thread concurrently calls LogAndApply()
Status LogAndApply(ColumnFamilyData* column_family_data, VersionEdit* edit,
port::Mutex* mu, Directory* db_directory = nullptr,
bool new_descriptor_log = false,
const ColumnFamilyOptions* column_family_options =
nullptr);
// Recover the last saved descriptor from persistent storage.
// If read_only == true, Recover() will not complain if some column families
// are not opened
Status Recover(const std::vector<ColumnFamilyDescriptor>& column_families,
bool read_only = false);
// Reads a manifest file and returns a list of column families in
// column_families.
static Status ListColumnFamilies(std::vector<std::string>* column_families,
const std::string& dbname, Env* env);
#ifndef ROCKSDB_LITE
// Try to reduce the number of levels. This call is valid when
// only one level from the new max level to the old
// max level containing files.
// The call is static, since number of levels is immutable during
// the lifetime of a RocksDB instance. It reduces number of levels
// in a DB by applying changes to manifest.
// For example, a db currently has 7 levels [0-6], and a call to
// to reduce to 5 [0-4] can only be executed when only one level
// among [4-6] contains files.
static Status ReduceNumberOfLevels(const std::string& dbname,
const Options* options,
const EnvOptions& storage_options,
int new_levels);
// printf contents (for debugging)
Status DumpManifest(Options& options, std::string& manifestFileName,
bool verbose, bool hex = false);
#endif // ROCKSDB_LITE
// Return the current manifest file number
uint64_t ManifestFileNumber() const { return manifest_file_number_; }
uint64_t PendingManifestFileNumber() const {
return pending_manifest_file_number_;
}
// Allocate and return a new file number
uint64_t NewFileNumber() { return next_file_number_++; }
// Arrange to reuse "file_number" unless a newer file number has
// already been allocated.
// REQUIRES: "file_number" was returned by a call to NewFileNumber().
void ReuseFileNumber(uint64_t file_number) {
if (next_file_number_ == file_number + 1) {
next_file_number_ = file_number;
}
}
// Return the last sequence number.
uint64_t LastSequence() const {
return last_sequence_.load(std::memory_order_acquire);
}
// Set the last sequence number to s.
void SetLastSequence(uint64_t s) {
assert(s >= last_sequence_);
last_sequence_.store(s, std::memory_order_release);
}
// Mark the specified file number as used.
void MarkFileNumberUsed(uint64_t number);
// Return the log file number for the log file that is currently
// being compacted, or zero if there is no such log file.
uint64_t PrevLogNumber() const { return prev_log_number_; }
// Returns the minimum log number such that all
// log numbers less than or equal to it can be deleted
uint64_t MinLogNumber() const {
uint64_t min_log_num = std::numeric_limits<uint64_t>::max();
for (auto cfd : *column_family_set_) {
if (min_log_num > cfd->GetLogNumber()) {
min_log_num = cfd->GetLogNumber();
}
}
return min_log_num;
}
// Create an iterator that reads over the compaction inputs for "*c".
// The caller should delete the iterator when no longer needed.
Iterator* MakeInputIterator(Compaction* c);
// Add all files listed in any live version to *live.
void AddLiveFiles(std::vector<uint64_t>* live_list);
// Return the approximate offset in the database of the data for
// "key" as of version "v".
uint64_t ApproximateOffsetOf(Version* v, const InternalKey& key);
// Return the size of the current manifest file
uint64_t ManifestFileSize() const { return manifest_file_size_; }
// verify that the files that we started with for a compaction
// still exist in the current version and in the same original level.
// This ensures that a concurrent compaction did not erroneously
// pick the same files to compact.
bool VerifyCompactionFileConsistency(Compaction* c);
Status GetMetadataForFile(uint64_t number, int* filelevel,
FileMetaData** metadata, ColumnFamilyData** cfd);
void GetLiveFilesMetaData(
std::vector<LiveFileMetaData> *metadata);
void GetObsoleteFiles(std::vector<FileMetaData*>* files);
ColumnFamilySet* GetColumnFamilySet() { return column_family_set_.get(); }
private:
class Builder;
struct ManifestWriter;
friend class Version;
struct LogReporter : public log::Reader::Reporter {
Status* status;
virtual void Corruption(size_t bytes, const Status& s) {
if (this->status->ok()) *this->status = s;
}
};
// Save current contents to *log
Status WriteSnapshot(log::Writer* log);
void AppendVersion(ColumnFamilyData* column_family_data, Version* v);
bool ManifestContains(uint64_t manifest_file_number,
const std::string& record) const;
ColumnFamilyData* CreateColumnFamily(const ColumnFamilyOptions& options,
VersionEdit* edit);
std::unique_ptr<ColumnFamilySet> column_family_set_;
Env* const env_;
const std::string dbname_;
const DBOptions* const options_;
uint64_t next_file_number_;
uint64_t manifest_file_number_;
uint64_t pending_manifest_file_number_;
std::atomic<uint64_t> last_sequence_;
uint64_t prev_log_number_; // 0 or backing store for memtable being compacted
// Opened lazily
unique_ptr<log::Writer> descriptor_log_;
// generates a increasing version number for every new version
uint64_t current_version_number_;
// Queue of writers to the manifest file
std::deque<ManifestWriter*> manifest_writers_;
// Current size of manifest file
uint64_t manifest_file_size_;
std::vector<FileMetaData*> obsolete_files_;
// storage options for all reads and writes except compactions
const EnvOptions& storage_options_;
// storage options used for compactions. This is a copy of
// storage_options_ but with readaheads set to readahead_compactions_.
const EnvOptions storage_options_compactions_;
// No copying allowed
VersionSet(const VersionSet&);
void operator=(const VersionSet&);
void LogAndApplyCFHelper(VersionEdit* edit);
void LogAndApplyHelper(ColumnFamilyData* cfd, Builder* b, Version* v,
VersionEdit* edit, port::Mutex* mu);
};
} // namespace rocksdb

184
db/version_set_test.cc Normal file
View File

@@ -0,0 +1,184 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/version_set.h"
#include "util/logging.h"
#include "util/testharness.h"
#include "util/testutil.h"
namespace rocksdb {
class FindFileTest {
public:
std::vector<FileMetaData*> files_;
bool disjoint_sorted_files_;
FindFileTest() : disjoint_sorted_files_(true) { }
~FindFileTest() {
for (unsigned int i = 0; i < files_.size(); i++) {
delete files_[i];
}
}
void Add(const char* smallest, const char* largest,
SequenceNumber smallest_seq = 100,
SequenceNumber largest_seq = 100) {
FileMetaData* f = new FileMetaData;
f->number = files_.size() + 1;
f->smallest = InternalKey(smallest, smallest_seq, kTypeValue);
f->largest = InternalKey(largest, largest_seq, kTypeValue);
files_.push_back(f);
}
int Find(const char* key) {
InternalKey target(key, 100, kTypeValue);
InternalKeyComparator cmp(BytewiseComparator());
return FindFile(cmp, files_, target.Encode());
}
bool Overlaps(const char* smallest, const char* largest) {
InternalKeyComparator cmp(BytewiseComparator());
Slice s(smallest != nullptr ? smallest : "");
Slice l(largest != nullptr ? largest : "");
return SomeFileOverlapsRange(cmp, disjoint_sorted_files_, files_,
(smallest != nullptr ? &s : nullptr),
(largest != nullptr ? &l : nullptr));
}
};
TEST(FindFileTest, Empty) {
ASSERT_EQ(0, Find("foo"));
ASSERT_TRUE(! Overlaps("a", "z"));
ASSERT_TRUE(! Overlaps(nullptr, "z"));
ASSERT_TRUE(! Overlaps("a", nullptr));
ASSERT_TRUE(! Overlaps(nullptr, nullptr));
}
TEST(FindFileTest, Single) {
Add("p", "q");
ASSERT_EQ(0, Find("a"));
ASSERT_EQ(0, Find("p"));
ASSERT_EQ(0, Find("p1"));
ASSERT_EQ(0, Find("q"));
ASSERT_EQ(1, Find("q1"));
ASSERT_EQ(1, Find("z"));
ASSERT_TRUE(! Overlaps("a", "b"));
ASSERT_TRUE(! Overlaps("z1", "z2"));
ASSERT_TRUE(Overlaps("a", "p"));
ASSERT_TRUE(Overlaps("a", "q"));
ASSERT_TRUE(Overlaps("a", "z"));
ASSERT_TRUE(Overlaps("p", "p1"));
ASSERT_TRUE(Overlaps("p", "q"));
ASSERT_TRUE(Overlaps("p", "z"));
ASSERT_TRUE(Overlaps("p1", "p2"));
ASSERT_TRUE(Overlaps("p1", "z"));
ASSERT_TRUE(Overlaps("q", "q"));
ASSERT_TRUE(Overlaps("q", "q1"));
ASSERT_TRUE(! Overlaps(nullptr, "j"));
ASSERT_TRUE(! Overlaps("r", nullptr));
ASSERT_TRUE(Overlaps(nullptr, "p"));
ASSERT_TRUE(Overlaps(nullptr, "p1"));
ASSERT_TRUE(Overlaps("q", nullptr));
ASSERT_TRUE(Overlaps(nullptr, nullptr));
}
TEST(FindFileTest, Multiple) {
Add("150", "200");
Add("200", "250");
Add("300", "350");
Add("400", "450");
ASSERT_EQ(0, Find("100"));
ASSERT_EQ(0, Find("150"));
ASSERT_EQ(0, Find("151"));
ASSERT_EQ(0, Find("199"));
ASSERT_EQ(0, Find("200"));
ASSERT_EQ(1, Find("201"));
ASSERT_EQ(1, Find("249"));
ASSERT_EQ(1, Find("250"));
ASSERT_EQ(2, Find("251"));
ASSERT_EQ(2, Find("299"));
ASSERT_EQ(2, Find("300"));
ASSERT_EQ(2, Find("349"));
ASSERT_EQ(2, Find("350"));
ASSERT_EQ(3, Find("351"));
ASSERT_EQ(3, Find("400"));
ASSERT_EQ(3, Find("450"));
ASSERT_EQ(4, Find("451"));
ASSERT_TRUE(! Overlaps("100", "149"));
ASSERT_TRUE(! Overlaps("251", "299"));
ASSERT_TRUE(! Overlaps("451", "500"));
ASSERT_TRUE(! Overlaps("351", "399"));
ASSERT_TRUE(Overlaps("100", "150"));
ASSERT_TRUE(Overlaps("100", "200"));
ASSERT_TRUE(Overlaps("100", "300"));
ASSERT_TRUE(Overlaps("100", "400"));
ASSERT_TRUE(Overlaps("100", "500"));
ASSERT_TRUE(Overlaps("375", "400"));
ASSERT_TRUE(Overlaps("450", "450"));
ASSERT_TRUE(Overlaps("450", "500"));
}
TEST(FindFileTest, MultipleNullBoundaries) {
Add("150", "200");
Add("200", "250");
Add("300", "350");
Add("400", "450");
ASSERT_TRUE(! Overlaps(nullptr, "149"));
ASSERT_TRUE(! Overlaps("451", nullptr));
ASSERT_TRUE(Overlaps(nullptr, nullptr));
ASSERT_TRUE(Overlaps(nullptr, "150"));
ASSERT_TRUE(Overlaps(nullptr, "199"));
ASSERT_TRUE(Overlaps(nullptr, "200"));
ASSERT_TRUE(Overlaps(nullptr, "201"));
ASSERT_TRUE(Overlaps(nullptr, "400"));
ASSERT_TRUE(Overlaps(nullptr, "800"));
ASSERT_TRUE(Overlaps("100", nullptr));
ASSERT_TRUE(Overlaps("200", nullptr));
ASSERT_TRUE(Overlaps("449", nullptr));
ASSERT_TRUE(Overlaps("450", nullptr));
}
TEST(FindFileTest, OverlapSequenceChecks) {
Add("200", "200", 5000, 3000);
ASSERT_TRUE(! Overlaps("199", "199"));
ASSERT_TRUE(! Overlaps("201", "300"));
ASSERT_TRUE(Overlaps("200", "200"));
ASSERT_TRUE(Overlaps("190", "200"));
ASSERT_TRUE(Overlaps("200", "210"));
}
TEST(FindFileTest, OverlappingFiles) {
Add("150", "600");
Add("400", "500");
disjoint_sorted_files_ = false;
ASSERT_TRUE(! Overlaps("100", "149"));
ASSERT_TRUE(! Overlaps("601", "700"));
ASSERT_TRUE(Overlaps("100", "150"));
ASSERT_TRUE(Overlaps("100", "200"));
ASSERT_TRUE(Overlaps("100", "300"));
ASSERT_TRUE(Overlaps("100", "400"));
ASSERT_TRUE(Overlaps("100", "500"));
ASSERT_TRUE(Overlaps("375", "400"));
ASSERT_TRUE(Overlaps("450", "450"));
ASSERT_TRUE(Overlaps("450", "500"));
ASSERT_TRUE(Overlaps("450", "700"));
ASSERT_TRUE(Overlaps("600", "700"));
}
} // namespace rocksdb
int main(int argc, char** argv) {
return rocksdb::test::RunAllTests();
}

Some files were not shown because too many files have changed in this diff Show More