Fix: Resolve slow test on macOS pipeline (#5392)

Using std::barrier performs extremely poorly (~1 hour vs ~1 minute to run the test suite) in certain macOS environments.
To unblock our macOS CI pipeline, std::barrier has been replaced with a custom mutex-based barrier (Barrier) that significantly improves performance without compromising correctness.
This commit is contained in:
Valentin Balaschenko
2025-05-16 11:31:51 +01:00
committed by GitHub
parent c3e9380fb4
commit 380ba9f1c1
2 changed files with 68 additions and 16 deletions

View File

@@ -71,6 +71,9 @@ jobs:
nproc --version nproc --version
echo -n "nproc returns: " echo -n "nproc returns: "
nproc nproc
system_profiler SPHardwareDataType
sysctl -n hw.logicalcpu
clang --version
- name: configure Conan - name: configure Conan
run : | run : |
conan profile new default --detect || true conan profile new default --detect || true
@@ -89,9 +92,8 @@ jobs:
generator: ${{ matrix.generator }} generator: ${{ matrix.generator }}
configuration: ${{ matrix.configuration }} configuration: ${{ matrix.configuration }}
cmake-args: "-Dassert=TRUE -Dwerr=TRUE ${{ matrix.cmake-args }}" cmake-args: "-Dassert=TRUE -Dwerr=TRUE ${{ matrix.cmake-args }}"
# TODO: Temporary disabled tests - name: test
# - name: test run: |
# run: | n=$(nproc)
# n=$(nproc) echo "Using $n test jobs"
# echo "Using $n test jobs" ${build_dir}/rippled --unittest --unittest-jobs $n
# ${build_dir}/rippled --unittest --unittest-jobs $n

View File

@@ -9,6 +9,7 @@
#include <atomic> #include <atomic>
#include <barrier> #include <barrier>
#include <chrono> #include <chrono>
#include <condition_variable>
#include <latch> #include <latch>
#include <optional> #include <optional>
#include <random> #include <random>
@@ -19,6 +20,55 @@
namespace ripple { namespace ripple {
namespace tests { namespace tests {
/**
Experimentally, we discovered that using std::barrier performs extremely
poorly (~1 hour vs ~1 minute to run the test suite) in certain macOS
environments. To unblock our macOS CI pipeline, we replaced std::barrier with a
custom mutex-based barrier (Barrier) that significantly improves performance
without compromising correctness. For future reference, if we ever consider
reintroducing std::barrier, the following configuration is known to exhibit the
problem:
Model Name: Mac mini
Model Identifier: Mac14,3
Model Number: Z16K000R4LL/A
Chip: Apple M2
Total Number of Cores: 8 (4 performance and 4 efficiency)
Memory: 24 GB
System Firmware Version: 11881.41.5
OS Loader Version: 11881.1.1
Apple clang version 16.0.0 (clang-1600.0.26.3)
Target: arm64-apple-darwin24.0.0
Thread model: posix
*/
struct Barrier
{
std::mutex mtx;
std::condition_variable cv;
int count;
int const initial;
Barrier(int n) : count(n), initial(n)
{
}
void
arrive_and_wait()
{
std::unique_lock lock(mtx);
if (--count == 0)
{
count = initial;
cv.notify_all();
}
else
{
cv.wait(lock, [&] { return count == initial; });
}
}
};
namespace { namespace {
enum class TrackedState : std::uint8_t { enum class TrackedState : std::uint8_t {
uninitialized, uninitialized,
@@ -500,9 +550,9 @@ public:
constexpr int loopIters = 2 * 1024; constexpr int loopIters = 2 * 1024;
constexpr int numThreads = 16; constexpr int numThreads = 16;
std::vector<SharedIntrusive<TIBase>> toClone; std::vector<SharedIntrusive<TIBase>> toClone;
std::barrier loopStartSyncPoint{numThreads}; Barrier loopStartSyncPoint{numThreads};
std::barrier postCreateToCloneSyncPoint{numThreads}; Barrier postCreateToCloneSyncPoint{numThreads};
std::barrier postCreateVecOfPointersSyncPoint{numThreads}; Barrier postCreateVecOfPointersSyncPoint{numThreads};
auto engines = [&]() -> std::vector<std::default_random_engine> { auto engines = [&]() -> std::vector<std::default_random_engine> {
std::random_device rd; std::random_device rd;
std::vector<std::default_random_engine> result; std::vector<std::default_random_engine> result;
@@ -628,10 +678,10 @@ public:
constexpr int flipPointersLoopIters = 256; constexpr int flipPointersLoopIters = 256;
constexpr int numThreads = 16; constexpr int numThreads = 16;
std::vector<SharedIntrusive<TIBase>> toClone; std::vector<SharedIntrusive<TIBase>> toClone;
std::barrier loopStartSyncPoint{numThreads}; Barrier loopStartSyncPoint{numThreads};
std::barrier postCreateToCloneSyncPoint{numThreads}; Barrier postCreateToCloneSyncPoint{numThreads};
std::barrier postCreateVecOfPointersSyncPoint{numThreads}; Barrier postCreateVecOfPointersSyncPoint{numThreads};
std::barrier postFlipPointersLoopSyncPoint{numThreads}; Barrier postFlipPointersLoopSyncPoint{numThreads};
auto engines = [&]() -> std::vector<std::default_random_engine> { auto engines = [&]() -> std::vector<std::default_random_engine> {
std::random_device rd; std::random_device rd;
std::vector<std::default_random_engine> result; std::vector<std::default_random_engine> result;
@@ -761,9 +811,9 @@ public:
constexpr int lockWeakLoopIters = 256; constexpr int lockWeakLoopIters = 256;
constexpr int numThreads = 16; constexpr int numThreads = 16;
std::vector<SharedIntrusive<TIBase>> toLock; std::vector<SharedIntrusive<TIBase>> toLock;
std::barrier loopStartSyncPoint{numThreads}; Barrier loopStartSyncPoint{numThreads};
std::barrier postCreateToLockSyncPoint{numThreads}; Barrier postCreateToLockSyncPoint{numThreads};
std::barrier postLockWeakLoopSyncPoint{numThreads}; Barrier postLockWeakLoopSyncPoint{numThreads};
// lockAndDestroy creates weak pointers from the strong pointer // lockAndDestroy creates weak pointers from the strong pointer
// and runs a loop that locks the weak pointer. At the end of the loop // and runs a loop that locks the weak pointer. At the end of the loop