From 2ce3b3f1fdab8197ef7448e48ec2202ee69e8ea9 Mon Sep 17 00:00:00 2001 From: Nicholas Dudfield Date: Fri, 20 Feb 2026 16:54:06 +0700 Subject: [PATCH] fix: make partial sync poll-wait opt-in via partialSyncWait flag Previously finishFetch() entered the poll-wait loop for any coroutine context, causing unit tests to spin for 30s on missing nodes with no network to deliver them. Now requires explicit setPartialSyncWait(true) from partial sync code paths (RPCHelpers, SubmitAndWait). --- src/ripple/basics/LocalValue.h | 23 +++++ src/ripple/rpc/handlers/SubmitAndWait.cpp | 3 +- src/ripple/rpc/impl/RPCHelpers.cpp | 2 + src/ripple/shamap/impl/SHAMap.cpp | 107 +++++++++++----------- 4 files changed, 83 insertions(+), 52 deletions(-) diff --git a/src/ripple/basics/LocalValue.h b/src/ripple/basics/LocalValue.h index a12432ae3..dc4a70ef3 100644 --- a/src/ripple/basics/LocalValue.h +++ b/src/ripple/basics/LocalValue.h @@ -36,6 +36,10 @@ struct LocalValues bool onCoro = true; void* coroPtr = nullptr; // Pointer to owning JobQueue::Coro (if any) + // When true, SHAMap::finishFetch() will poll-wait for missing nodes + // instead of returning empty. Only set by partial sync code paths. + bool partialSyncWait = false; + // Configurable timeout for SHAMap node fetching during partial sync. // Zero means use the default (30s). RPC handlers can set this to // customize poll-wait behavior. @@ -145,6 +149,25 @@ getCurrentCoroPtr() return nullptr; } +// Return the partialSyncWait flag; false if no local values or not onCoro. +inline bool +isPartialSyncWaitEnabled() +{ + auto lvs = detail::getLocalValues().get(); + if (lvs && lvs->onCoro) + return lvs->partialSyncWait; + return false; +} + +// Set the partialSyncWait flag; no-op if no local values or not onCoro. +inline void +setPartialSyncWait(bool enabled) +{ + auto lvs = detail::getLocalValues().get(); + if (lvs && lvs->onCoro) + lvs->partialSyncWait = enabled; +} + // Get the configured fetch timeout for current coroutine context. 
// Returns 0ms if not in a coroutine or no custom timeout set. inline std::chrono::milliseconds diff --git a/src/ripple/rpc/handlers/SubmitAndWait.cpp b/src/ripple/rpc/handlers/SubmitAndWait.cpp index 77da9a6a1..ee60a3127 100644 --- a/src/ripple/rpc/handlers/SubmitAndWait.cpp +++ b/src/ripple/rpc/handlers/SubmitAndWait.cpp @@ -119,7 +119,8 @@ doSubmitAndWait(RPC::JsonContext& context) timeout = std::chrono::seconds(t); } - // Set coroutine-local fetch timeout for SHAMap operations + // Enable partial sync wait for SHAMap operations + setPartialSyncWait(true); setCoroFetchTimeout( std::chrono::duration_cast(timeout / 2)); diff --git a/src/ripple/rpc/impl/RPCHelpers.cpp b/src/ripple/rpc/impl/RPCHelpers.cpp index fe3ed18ab..3e7d5df18 100644 --- a/src/ripple/rpc/impl/RPCHelpers.cpp +++ b/src/ripple/rpc/impl/RPCHelpers.cpp @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -695,6 +696,7 @@ getLedger(T& ledger, LedgerShortcut shortcut, Context& context) if (hash.isNonZero()) { + setPartialSyncWait(true); ledger = context.app.getInboundLedgers().getPartialLedger(hash); // If no InboundLedger exists yet, trigger acquisition and wait if (!ledger) diff --git a/src/ripple/shamap/impl/SHAMap.cpp b/src/ripple/shamap/impl/SHAMap.cpp index 2ad7c37da..aaaae1c13 100644 --- a/src/ripple/shamap/impl/SHAMap.cpp +++ b/src/ripple/shamap/impl/SHAMap.cpp @@ -189,64 +189,69 @@ SHAMap::finishFetch( f_.missingNodeAcquireBySeq(ledgerSeq_, hash.as_uint256()); } - // If we're in a coroutine context, poll-wait for the node - if (auto* coro = static_cast(getCurrentCoroPtr())) - { - using namespace std::chrono; - constexpr auto pollInterval = 50ms; - constexpr auto defaultTimeout = 30s; - // Use coroutine-local timeout if set, otherwise default - auto coroTimeout = getCoroFetchTimeout(); - auto timeout = - coroTimeout.count() > 0 ? 
coroTimeout : defaultTimeout; - auto const deadline = steady_clock::now() + timeout; - - // Linear backoff for re-requests: 50ms, 100ms, 150ms... up to - // 2s - auto nextRequestDelay = 50ms; - constexpr auto maxRequestDelay = 2000ms; - constexpr auto backoffStep = 50ms; - auto nextRequestTime = steady_clock::now() + nextRequestDelay; - - JLOG(journal_.debug()) - << "finishFetch: waiting for node " << hash; - - while (steady_clock::now() < deadline) + // If partial sync wait is enabled, poll-wait for the node + if (isPartialSyncWaitEnabled()) + if (auto* coro = + static_cast(getCurrentCoroPtr())) { - // Sleep for the poll interval (yields coroutine, frees job - // thread) - coro->sleepFor(pollInterval); + using namespace std::chrono; + constexpr auto pollInterval = 50ms; + constexpr auto defaultTimeout = 30s; + // Use coroutine-local timeout if set, otherwise default + auto coroTimeout = getCoroFetchTimeout(); + auto timeout = + coroTimeout.count() > 0 ? coroTimeout : defaultTimeout; + auto const deadline = steady_clock::now() + timeout; - // Try to fetch from cache/db again - if (auto obj = f_.db().fetchNodeObject( - hash.as_uint256(), ledgerSeq_)) + // Linear backoff for re-requests: 50ms, 100ms, 150ms... 
up + // to 2s + auto nextRequestDelay = 50ms; + constexpr auto maxRequestDelay = 2000ms; + constexpr auto backoffStep = 50ms; + auto nextRequestTime = + steady_clock::now() + nextRequestDelay; + + JLOG(journal_.debug()) + << "finishFetch: waiting for node " << hash; + + while (steady_clock::now() < deadline) { - JLOG(journal_.debug()) - << "finishFetch: got node " << hash; - auto node = SHAMapTreeNode::makeFromPrefix( - makeSlice(obj->getData()), hash); - if (node) - canonicalize(hash, node); - return node; + // Sleep for the poll interval (yields coroutine, frees + // job thread) + coro->sleepFor(pollInterval); + + // Try to fetch from cache/db again + if (auto obj = f_.db().fetchNodeObject( + hash.as_uint256(), ledgerSeq_)) + { + JLOG(journal_.debug()) + << "finishFetch: got node " << hash; + auto node = SHAMapTreeNode::makeFromPrefix( + makeSlice(obj->getData()), hash); + if (node) + canonicalize(hash, node); + return node; + } + + // Re-request with priority using linear backoff + auto now = steady_clock::now(); + if (now >= nextRequestTime) + { + f_.missingNodeAcquireBySeq( + ledgerSeq_, + hash.as_uint256(), + true /*prioritize*/); + // Increase delay for next request (linear backoff) + if (nextRequestDelay < maxRequestDelay) + nextRequestDelay += backoffStep; + nextRequestTime = now + nextRequestDelay; + } } - // Re-request with priority using linear backoff - auto now = steady_clock::now(); - if (now >= nextRequestTime) - { - f_.missingNodeAcquireBySeq( - ledgerSeq_, hash.as_uint256(), true /*prioritize*/); - // Increase delay for next request (linear backoff) - if (nextRequestDelay < maxRequestDelay) - nextRequestDelay += backoffStep; - nextRequestTime = now + nextRequestDelay; - } + JLOG(journal_.warn()) + << "finishFetch: timeout waiting for node " << hash; } - JLOG(journal_.warn()) - << "finishFetch: timeout waiting for node " << hash; - } - return {}; }