From 380ba9f1c16cd1214462f1caa2370e73d7530f81 Mon Sep 17 00:00:00 2001 From: Valentin Balaschenko <13349202+vlntb@users.noreply.github.com> Date: Fri, 16 May 2025 11:31:51 +0100 Subject: [PATCH] Fix: Resolve slow test on macOS pipeline (#5392) Using std::barrier performs extremely poorly (~1 hour vs ~1 minute to run the test suite) in certain macOS environments. To unblock our macOS CI pipeline, std::barrier has been replaced with a custom mutex-based barrier (Barrier) that significantly improves performance without compromising correctness. --- .github/workflows/macos.yml | 14 +++-- src/test/basics/IntrusiveShared_test.cpp | 70 ++++++++++++++++++++---- 2 files changed, 68 insertions(+), 16 deletions(-) diff --git a/.github/workflows/macos.yml b/.github/workflows/macos.yml index 905df7e83d..63d54175ea 100644 --- a/.github/workflows/macos.yml +++ b/.github/workflows/macos.yml @@ -71,6 +71,9 @@ jobs: nproc --version echo -n "nproc returns: " nproc + system_profiler SPHardwareDataType + sysctl -n hw.logicalcpu + clang --version - name: configure Conan run : | conan profile new default --detect || true @@ -89,9 +92,8 @@ jobs: generator: ${{ matrix.generator }} configuration: ${{ matrix.configuration }} cmake-args: "-Dassert=TRUE -Dwerr=TRUE ${{ matrix.cmake-args }}" - # TODO: Temporary disabled tests - # - name: test - # run: | - # n=$(nproc) - # echo "Using $n test jobs" - # ${build_dir}/rippled --unittest --unittest-jobs $n + - name: test + run: | + n=$(nproc) + echo "Using $n test jobs" + ${build_dir}/rippled --unittest --unittest-jobs $n diff --git a/src/test/basics/IntrusiveShared_test.cpp b/src/test/basics/IntrusiveShared_test.cpp index fe0cdba777..736cc47345 100644 --- a/src/test/basics/IntrusiveShared_test.cpp +++ b/src/test/basics/IntrusiveShared_test.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -19,6 +20,55 @@ namespace ripple { namespace tests { +/** +Experimentally, we discovered that using std::barrier performs extremely +poorly (~1 hour vs ~1 minute to run the test suite) in certain macOS +environments. To unblock our macOS CI pipeline, we replaced std::barrier with a +custom mutex-based barrier (Barrier) that significantly improves performance +without compromising correctness. For future reference, if we ever consider +reintroducing std::barrier, the following configuration is known to exhibit the +problem: + + Model Name: Mac mini + Model Identifier: Mac14,3 + Model Number: Z16K000R4LL/A + Chip: Apple M2 + Total Number of Cores: 8 (4 performance and 4 efficiency) + Memory: 24 GB + System Firmware Version: 11881.41.5 + OS Loader Version: 11881.1.1 + Apple clang version 16.0.0 (clang-1600.0.26.3) + Target: arm64-apple-darwin24.0.0 + Thread model: posix + + */ +struct Barrier +{ + std::mutex mtx; + std::condition_variable cv; + int count; + int const initial; + + Barrier(int n) : count(n), initial(n) + { + } + + void + arrive_and_wait() + { + std::unique_lock lock(mtx); + if (--count == 0) + { + count = initial; + cv.notify_all(); + } + else + { + cv.wait(lock, [&] { return count == initial; }); + } + } +}; + namespace { enum class TrackedState : std::uint8_t { uninitialized, @@ -500,9 +550,9 @@ public: constexpr int loopIters = 2 * 1024; constexpr int numThreads = 16; std::vector> toClone; - std::barrier loopStartSyncPoint{numThreads}; - std::barrier postCreateToCloneSyncPoint{numThreads}; - std::barrier postCreateVecOfPointersSyncPoint{numThreads}; + Barrier loopStartSyncPoint{numThreads}; + Barrier postCreateToCloneSyncPoint{numThreads}; + Barrier postCreateVecOfPointersSyncPoint{numThreads}; auto engines = [&]() -> std::vector { std::random_device rd; std::vector result; @@ -628,10 +678,10 @@ public: constexpr int flipPointersLoopIters = 256; constexpr int numThreads = 16; std::vector> toClone; - std::barrier loopStartSyncPoint{numThreads}; - std::barrier postCreateToCloneSyncPoint{numThreads}; - std::barrier postCreateVecOfPointersSyncPoint{numThreads}; - std::barrier postFlipPointersLoopSyncPoint{numThreads}; + Barrier loopStartSyncPoint{numThreads}; + Barrier postCreateToCloneSyncPoint{numThreads}; + Barrier postCreateVecOfPointersSyncPoint{numThreads}; + Barrier postFlipPointersLoopSyncPoint{numThreads}; auto engines = [&]() -> std::vector { std::random_device rd; std::vector result; @@ -761,9 +811,9 @@ public: constexpr int lockWeakLoopIters = 256; constexpr int numThreads = 16; std::vector> toLock; - std::barrier loopStartSyncPoint{numThreads}; - std::barrier postCreateToLockSyncPoint{numThreads}; - std::barrier postLockWeakLoopSyncPoint{numThreads}; + Barrier loopStartSyncPoint{numThreads}; + Barrier postCreateToLockSyncPoint{numThreads}; + Barrier postLockWeakLoopSyncPoint{numThreads}; // lockAndDestroy creates weak pointers from the strong pointer // and runs a loop that locks the weak pointer. At the end of the loop