From c791cae1ec627bd34b29894eb9ea5ef78b36ce1e Mon Sep 17 00:00:00 2001 From: Sergey Kuznetsov Date: Wed, 11 Mar 2026 18:06:12 +0000 Subject: [PATCH] test: Fix flaky subscribe tests (#6510) Subscribe tests have a problem that there is no way to synchronize application running in background threads and test threads. Threads are communicating via websocket messages. When the code is compiled in debug mode with code coverage enabled it executes quite slow, so receiving websocket messages by the client in subscribe tests may time out. This change does 2 things to fix the problem: * Increases timeout for receiving a websocket message. * Decreases the number of tests running in parallel. While testing the fix for subscribe test another flaky test in ledger replay was found, which has also been addressed. --- .../workflows/reusable-build-test-config.yml | 2 ++ src/test/app/LedgerReplay_test.cpp | 24 ++++++++++++++++--- src/test/rpc/Subscribe_test.cpp | 16 ++++++++----- 3 files changed, 33 insertions(+), 9 deletions(-) diff --git a/.github/workflows/reusable-build-test-config.yml b/.github/workflows/reusable-build-test-config.yml index 66f46709cd..83ece81919 100644 --- a/.github/workflows/reusable-build-test-config.yml +++ b/.github/workflows/reusable-build-test-config.yml @@ -230,6 +230,8 @@ jobs: BUILD_NPROC: ${{ steps.nproc.outputs.nproc }} run: | set -o pipefail + # Coverage builds are slower due to instrumentation; use fewer parallel jobs to avoid flakiness + [ "$COVERAGE_ENABLED" = "true" ] && BUILD_NPROC=$(( BUILD_NPROC - 2 )) ./xrpld --unittest --unittest-jobs "${BUILD_NPROC}" 2>&1 | tee unittest.log - name: Show test failure summary diff --git a/src/test/app/LedgerReplay_test.cpp b/src/test/app/LedgerReplay_test.cpp index 5568a90d03..4428e82ea9 100644 --- a/src/test/app/LedgerReplay_test.cpp +++ b/src/test/app/LedgerReplay_test.cpp @@ -1128,9 +1128,27 @@ struct LedgerReplayer_test : public beast::unit_test::suite BEAST_EXPECT(net.client.waitAndCheckStatus( finalHash, totalReplay, TaskStatus::Completed, TaskStatus::Completed, deltaStatuses)); - // sweep - net.client.replayer.sweep(); - BEAST_EXPECT(net.client.countsAsExpected(0, 0, 0)); + // sweep() cleans up skipLists_ and deltas_ by removing entries whose + // weak_ptr can no longer be locked. Those weak_ptrs expire only when the + // last shared_ptr holder releases the sub-task. The sole owner is the + // LedgerReplayTask, but a JobQueue worker thread may still hold a + // temporary shared_ptr to a sub-task (from wptr.lock()) while executing + // the timer job that drove the task to completion. If sweep() runs before + // that thread unwinds, the weak_ptr is still lockable and the map entry + // is not removed. We retry until the worker thread finishes. + auto waitForSweep = [&net]() { + for (auto numAttempts = 0; numAttempts < 20; ++numAttempts) + { + net.client.replayer.sweep(); + if (net.client.countsAsExpected(0, 0, 0)) + { + return true; + } + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + } + return false; + }; + BEAST_EXPECT(waitForSweep()); } void diff --git a/src/test/rpc/Subscribe_test.cpp b/src/test/rpc/Subscribe_test.cpp index 414bceefd7..9a24980d49 100644 --- a/src/test/rpc/Subscribe_test.cpp +++ b/src/test/rpc/Subscribe_test.cpp @@ -802,13 +802,17 @@ public: * return {true, true} if received numReplies replies and also * received a tx with the account_history_tx_first == true */ - auto getTxHash = [](WSClient& wsc, IdxHashVec& v, int numReplies) -> std::pair { + auto getTxHash = [](WSClient& wsc, + IdxHashVec& v, + int numReplies, + std::chrono::milliseconds timeout = + std::chrono::milliseconds{5000}) -> std::pair { bool first_flag = false; for (int i = 0; i < numReplies; ++i) { std::uint32_t idx{0}; - auto reply = wsc.getMsg(100ms); + auto reply = wsc.getMsg(timeout); if (reply) { auto r = *reply; @@ -982,7 +986,7 @@ public: BEAST_EXPECT(goodSubRPC(jv)); sendPayments(env, env.master, alice, 1, 1); - r = getTxHash(*wscTxHistory, vec, 1); + r = getTxHash(*wscTxHistory, vec, 1, 10ms); BEAST_EXPECT(!r.first); } { @@ -1001,7 +1005,7 @@ public: return; IdxHashVec genesisFullHistoryVec; BEAST_EXPECT(env.syncClose()); - if (!BEAST_EXPECT(!getTxHash(*wscTxHistory, genesisFullHistoryVec, 1).first)) + if (!BEAST_EXPECT(!getTxHash(*wscTxHistory, genesisFullHistoryVec, 1, 10ms).first)) return; /* @@ -1161,7 +1165,7 @@ public: { // take out existing txns from the stream IdxHashVec tempVec; - getTxHash(*ws, tempVec, 100); + getTxHash(*ws, tempVec, 100, 1000ms); } auto count = mixedPayments(); @@ -1195,7 +1199,7 @@ public: { // take out existing txns from the stream IdxHashVec tempVec; - getTxHash(*wscLong, tempVec, 100); + getTxHash(*wscLong, tempVec, 100, 1000ms); } // repeat the payments many rounds