Files
rippled/external/rocksdb/patches/9.7.3-0001-memory-leak.patch
Bart d167d4864f refactor: Updates Conan dependencies: RocksDB (#5335)
Updates RocksDB to version 9.7.3, the latest version supported in Conan 1.x. A patch for 9.7.4 that fixes a memory leak is included.
2025-03-18 11:25:48 -04:00

320 lines
12 KiB
Diff

diff --git a/HISTORY.md b/HISTORY.md
index 36d472229..05ad1a202 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -1,6 +1,10 @@
# Rocksdb Change Log
> NOTE: Entries for next release do not go here. Follow instructions in `unreleased_history/README.txt`
+## 9.7.4 (10/31/2024)
+### Bug Fixes
+* Fix a leak of obsolete blob files left open until DB::Close(). This bug was introduced in version 9.4.0.
+
## 9.7.3 (10/16/2024)
### Behavior Changes
* OPTIONS file to be loaded by remote worker is now preserved so that it does not get purged by the primary host. A similar technique as how we are preserving new SST files from getting purged is used for this. min_options_file_numbers_ is tracked like pending_outputs_ is tracked.
diff --git a/db/blob/blob_file_cache.cc b/db/blob/blob_file_cache.cc
index 5f340aadf..1b9faa238 100644
--- a/db/blob/blob_file_cache.cc
+++ b/db/blob/blob_file_cache.cc
@@ -42,6 +42,7 @@ Status BlobFileCache::GetBlobFileReader(
assert(blob_file_reader);
assert(blob_file_reader->IsEmpty());
+ // NOTE: sharing same Cache with table_cache
const Slice key = GetSliceForKey(&blob_file_number);
assert(cache_);
@@ -98,4 +99,13 @@ Status BlobFileCache::GetBlobFileReader(
return Status::OK();
}
+void BlobFileCache::Evict(uint64_t blob_file_number) {
+ // NOTE: sharing same Cache with table_cache
+ const Slice key = GetSliceForKey(&blob_file_number);
+
+ assert(cache_);
+
+ cache_.get()->Erase(key);
+}
+
} // namespace ROCKSDB_NAMESPACE
diff --git a/db/blob/blob_file_cache.h b/db/blob/blob_file_cache.h
index 740e67ada..6858d012b 100644
--- a/db/blob/blob_file_cache.h
+++ b/db/blob/blob_file_cache.h
@@ -36,6 +36,15 @@ class BlobFileCache {
uint64_t blob_file_number,
CacheHandleGuard<BlobFileReader>* blob_file_reader);
+ // Called when a blob file is obsolete to ensure it is removed from the cache
+ // to avoid effectively leaking the open file and assicated memory
+ void Evict(uint64_t blob_file_number);
+
+ // Used to identify cache entries for blob files (not normally useful)
+ static const Cache::CacheItemHelper* GetHelper() {
+ return CacheInterface::GetBasicHelper();
+ }
+
private:
using CacheInterface =
BasicTypedCacheInterface<BlobFileReader, CacheEntryRole::kMisc>;
diff --git a/db/column_family.h b/db/column_family.h
index e4b7adde8..86637736a 100644
--- a/db/column_family.h
+++ b/db/column_family.h
@@ -401,6 +401,7 @@ class ColumnFamilyData {
SequenceNumber earliest_seq);
TableCache* table_cache() const { return table_cache_.get(); }
+ BlobFileCache* blob_file_cache() const { return blob_file_cache_.get(); }
BlobSource* blob_source() const { return blob_source_.get(); }
// See documentation in compaction_picker.h
diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc
index 261593423..06573ac2e 100644
--- a/db/db_impl/db_impl.cc
+++ b/db/db_impl/db_impl.cc
@@ -659,8 +659,9 @@ Status DBImpl::CloseHelper() {
// We need to release them before the block cache is destroyed. The block
// cache may be destroyed inside versions_.reset(), when column family data
// list is destroyed, so leaving handles in table cache after
- // versions_.reset() may cause issues.
- // Here we clean all unreferenced handles in table cache.
+ // versions_.reset() may cause issues. Here we clean all unreferenced handles
+ // in table cache, and (for certain builds/conditions) assert that no obsolete
+ // files are hanging around unreferenced (leak) in the table/blob file cache.
// Now we assume all user queries have finished, so only version set itself
// can possibly hold the blocks from block cache. After releasing unreferenced
// handles here, only handles held by version set left and inside
@@ -668,6 +669,9 @@ Status DBImpl::CloseHelper() {
// time a handle is released, we erase it from the cache too. By doing that,
// we can guarantee that after versions_.reset(), table cache is empty
// so the cache can be safely destroyed.
+#ifndef NDEBUG
+ TEST_VerifyNoObsoleteFilesCached(/*db_mutex_already_held=*/true);
+#endif // !NDEBUG
table_cache_->EraseUnRefEntries();
for (auto& txn_entry : recovered_transactions_) {
@@ -3227,6 +3231,8 @@ Status DBImpl::MultiGetImpl(
s = Status::Aborted();
break;
}
+ // This could be a long-running operation
+ ROCKSDB_THREAD_YIELD_HOOK();
}
// Post processing (decrement reference counts and record statistics)
diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h
index 5e4fa310b..ccc0abfa7 100644
--- a/db/db_impl/db_impl.h
+++ b/db/db_impl/db_impl.h
@@ -1241,9 +1241,14 @@ class DBImpl : public DB {
static Status TEST_ValidateOptions(const DBOptions& db_options) {
return ValidateOptions(db_options);
}
-
#endif // NDEBUG
+ // In certain configurations, verify that the table/blob file cache only
+ // contains entries for live files, to check for effective leaks of open
+ // files. This can only be called when purging of obsolete files has
+ // "settled," such as during parts of DB Close().
+ void TEST_VerifyNoObsoleteFilesCached(bool db_mutex_already_held) const;
+
// persist stats to column family "_persistent_stats"
void PersistStats();
diff --git a/db/db_impl/db_impl_debug.cc b/db/db_impl/db_impl_debug.cc
index 790a50d7a..67f5b4aaf 100644
--- a/db/db_impl/db_impl_debug.cc
+++ b/db/db_impl/db_impl_debug.cc
@@ -9,6 +9,7 @@
#ifndef NDEBUG
+#include "db/blob/blob_file_cache.h"
#include "db/column_family.h"
#include "db/db_impl/db_impl.h"
#include "db/error_handler.h"
@@ -328,5 +329,49 @@ size_t DBImpl::TEST_EstimateInMemoryStatsHistorySize() const {
InstrumentedMutexLock l(&const_cast<DBImpl*>(this)->stats_history_mutex_);
return EstimateInMemoryStatsHistorySize();
}
+
+void DBImpl::TEST_VerifyNoObsoleteFilesCached(
+ bool db_mutex_already_held) const {
+ // This check is somewhat expensive and obscure to make a part of every
+ // unit test in every build variety. Thus, we only enable it for ASAN builds.
+ if (!kMustFreeHeapAllocations) {
+ return;
+ }
+
+ std::optional<InstrumentedMutexLock> l;
+ if (db_mutex_already_held) {
+ mutex_.AssertHeld();
+ } else {
+ l.emplace(&mutex_);
+ }
+
+ std::vector<uint64_t> live_files;
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ if (cfd->IsDropped()) {
+ continue;
+ }
+ // Sneakily add both SST and blob files to the same list
+ cfd->current()->AddLiveFiles(&live_files, &live_files);
+ }
+ std::sort(live_files.begin(), live_files.end());
+
+ auto fn = [&live_files](const Slice& key, Cache::ObjectPtr, size_t,
+ const Cache::CacheItemHelper* helper) {
+ if (helper != BlobFileCache::GetHelper()) {
+ // Skip non-blob files for now
+ // FIXME: diagnose and fix the leaks of obsolete SST files revealed in
+ // unit tests.
+ return;
+ }
+ // See TableCache and BlobFileCache
+ assert(key.size() == sizeof(uint64_t));
+ uint64_t file_number;
+ GetUnaligned(reinterpret_cast<const uint64_t*>(key.data()), &file_number);
+ // Assert file is in sorted live_files
+ assert(
+ std::binary_search(live_files.begin(), live_files.end(), file_number));
+ };
+ table_cache_->ApplyToAllEntries(fn, {});
+}
} // namespace ROCKSDB_NAMESPACE
#endif // NDEBUG
diff --git a/db/db_iter.cc b/db/db_iter.cc
index e02586377..bf4749eb9 100644
--- a/db/db_iter.cc
+++ b/db/db_iter.cc
@@ -540,6 +540,8 @@ bool DBIter::FindNextUserEntryInternal(bool skipping_saved_key,
} else {
iter_.Next();
}
+ // This could be a long-running operation due to tombstones, etc.
+ ROCKSDB_THREAD_YIELD_HOOK();
} while (iter_.Valid());
valid_ = false;
diff --git a/db/table_cache.cc b/db/table_cache.cc
index 71fc29c32..8a5be75e8 100644
--- a/db/table_cache.cc
+++ b/db/table_cache.cc
@@ -164,6 +164,7 @@ Status TableCache::GetTableReader(
}
Cache::Handle* TableCache::Lookup(Cache* cache, uint64_t file_number) {
+ // NOTE: sharing same Cache with BlobFileCache
Slice key = GetSliceForFileNumber(&file_number);
return cache->Lookup(key);
}
@@ -179,6 +180,7 @@ Status TableCache::FindTable(
size_t max_file_size_for_l0_meta_pin, Temperature file_temperature) {
PERF_TIMER_GUARD_WITH_CLOCK(find_table_nanos, ioptions_.clock);
uint64_t number = file_meta.fd.GetNumber();
+ // NOTE: sharing same Cache with BlobFileCache
Slice key = GetSliceForFileNumber(&number);
*handle = cache_.Lookup(key);
TEST_SYNC_POINT_CALLBACK("TableCache::FindTable:0",
diff --git a/db/version_builder.cc b/db/version_builder.cc
index ed8ab8214..c98f53f42 100644
--- a/db/version_builder.cc
+++ b/db/version_builder.cc
@@ -24,6 +24,7 @@
#include <vector>
#include "cache/cache_reservation_manager.h"
+#include "db/blob/blob_file_cache.h"
#include "db/blob/blob_file_meta.h"
#include "db/dbformat.h"
#include "db/internal_stats.h"
@@ -744,12 +745,9 @@ class VersionBuilder::Rep {
return Status::Corruption("VersionBuilder", oss.str());
}
- // Note: we use C++11 for now but in C++14, this could be done in a more
- // elegant way using generalized lambda capture.
- VersionSet* const vs = version_set_;
- const ImmutableCFOptions* const ioptions = ioptions_;
-
- auto deleter = [vs, ioptions](SharedBlobFileMetaData* shared_meta) {
+ auto deleter = [vs = version_set_, ioptions = ioptions_,
+ bc = cfd_ ? cfd_->blob_file_cache()
+ : nullptr](SharedBlobFileMetaData* shared_meta) {
if (vs) {
assert(ioptions);
assert(!ioptions->cf_paths.empty());
@@ -758,6 +756,9 @@ class VersionBuilder::Rep {
vs->AddObsoleteBlobFile(shared_meta->GetBlobFileNumber(),
ioptions->cf_paths.front().path);
}
+ if (bc) {
+ bc->Evict(shared_meta->GetBlobFileNumber());
+ }
delete shared_meta;
};
@@ -766,7 +767,7 @@ class VersionBuilder::Rep {
blob_file_number, blob_file_addition.GetTotalBlobCount(),
blob_file_addition.GetTotalBlobBytes(),
blob_file_addition.GetChecksumMethod(),
- blob_file_addition.GetChecksumValue(), deleter);
+ blob_file_addition.GetChecksumValue(), std::move(deleter));
mutable_blob_file_metas_.emplace(
blob_file_number, MutableBlobFileMetaData(std::move(shared_meta)));
diff --git a/db/version_set.h b/db/version_set.h
index 9336782b1..024f869e7 100644
--- a/db/version_set.h
+++ b/db/version_set.h
@@ -1514,7 +1514,6 @@ class VersionSet {
void GetLiveFilesMetaData(std::vector<LiveFileMetaData>* metadata);
void AddObsoleteBlobFile(uint64_t blob_file_number, std::string path) {
- // TODO: Erase file from BlobFileCache?
obsolete_blob_files_.emplace_back(blob_file_number, std::move(path));
}
diff --git a/include/rocksdb/version.h b/include/rocksdb/version.h
index 2a19796b8..0afa2cab1 100644
--- a/include/rocksdb/version.h
+++ b/include/rocksdb/version.h
@@ -13,7 +13,7 @@
// minor or major version number planned for release.
#define ROCKSDB_MAJOR 9
#define ROCKSDB_MINOR 7
-#define ROCKSDB_PATCH 3
+#define ROCKSDB_PATCH 4
// Do not use these. We made the mistake of declaring macros starting with
// double underscore. Now we have to live with our choice. We'll deprecate these
diff --git a/port/port.h b/port/port.h
index 13aa56d47..141716e5b 100644
--- a/port/port.h
+++ b/port/port.h
@@ -19,3 +19,19 @@
#elif defined(OS_WIN)
#include "port/win/port_win.h"
#endif
+
+#ifdef OS_LINUX
+// A temporary hook into long-running RocksDB threads to support modifying their
+// priority etc. This should become a public API hook once the requirements
+// are better understood.
+extern "C" void RocksDbThreadYield() __attribute__((__weak__));
+#define ROCKSDB_THREAD_YIELD_HOOK() \
+ { \
+ if (RocksDbThreadYield) { \
+ RocksDbThreadYield(); \
+ } \
+ }
+#else
+#define ROCKSDB_THREAD_YIELD_HOOK() \
+ {}
+#endif