fix: make partial sync poll-wait opt-in via partialSyncWait flag

Previously, finishFetch() entered the poll-wait loop in any coroutine
context, causing unit tests to spin for 30s on missing nodes with no
network to deliver them. The poll-wait now requires an explicit
setPartialSyncWait(true) call from the partial sync code paths
(RPCHelpers, SubmitAndWait).
This commit is contained in:
Nicholas Dudfield
2026-02-20 16:54:06 +07:00
parent fe56844871
commit 2ce3b3f1fd
4 changed files with 83 additions and 52 deletions

View File

@@ -36,6 +36,10 @@ struct LocalValues
bool onCoro = true;
void* coroPtr = nullptr; // Pointer to owning JobQueue::Coro (if any)
// When true, SHAMap::finishFetch() will poll-wait for missing nodes
// instead of returning empty. Only set by partial sync code paths.
bool partialSyncWait = false;
// Configurable timeout for SHAMap node fetching during partial sync.
// Zero means use the default (30s). RPC handlers can set this to
// customize poll-wait behavior.
@@ -145,6 +149,25 @@ getCurrentCoroPtr()
return nullptr;
}
// Report whether partial sync wait is switched on for the current
// coroutine context. Yields false when detail::getLocalValues() holds no
// object or when that object is not flagged as running on a coroutine.
inline bool
isPartialSyncWaitEnabled()
{
    auto const* values = detail::getLocalValues().get();
    return values != nullptr && values->onCoro && values->partialSyncWait;
}
// Turn partial sync wait on or off for the current coroutine context.
// The request is silently ignored when detail::getLocalValues() holds no
// object or when that object is not flagged as running on a coroutine.
inline void
setPartialSyncWait(bool enabled)
{
    auto* const values = detail::getLocalValues().get();
    if (values == nullptr || !values->onCoro)
        return;

    values->partialSyncWait = enabled;
}
// Get the configured fetch timeout for current coroutine context.
// Returns 0ms if not in a coroutine or no custom timeout set.
inline std::chrono::milliseconds

View File

@@ -119,7 +119,8 @@ doSubmitAndWait(RPC::JsonContext& context)
timeout = std::chrono::seconds(t);
}
// Set coroutine-local fetch timeout for SHAMap operations
// Enable partial sync wait for SHAMap operations
setPartialSyncWait(true);
setCoroFetchTimeout(
std::chrono::duration_cast<std::chrono::milliseconds>(timeout / 2));

View File

@@ -25,6 +25,7 @@
#include <ripple/app/paths/TrustLine.h>
#include <ripple/app/rdb/RelationalDatabase.h>
#include <ripple/app/tx/impl/details/NFTokenUtils.h>
#include <ripple/basics/LocalValue.h>
#include <ripple/ledger/View.h>
#include <ripple/net/RPCErr.h>
#include <ripple/protocol/AccountID.h>
@@ -695,6 +696,7 @@ getLedger(T& ledger, LedgerShortcut shortcut, Context& context)
if (hash.isNonZero())
{
setPartialSyncWait(true);
ledger = context.app.getInboundLedgers().getPartialLedger(hash);
// If no InboundLedger exists yet, trigger acquisition and wait
if (!ledger)

View File

@@ -189,64 +189,69 @@ SHAMap::finishFetch(
f_.missingNodeAcquireBySeq(ledgerSeq_, hash.as_uint256());
}
// If we're in a coroutine context, poll-wait for the node
if (auto* coro = static_cast<JobQueue::Coro*>(getCurrentCoroPtr()))
{
using namespace std::chrono;
constexpr auto pollInterval = 50ms;
constexpr auto defaultTimeout = 30s;
// Use coroutine-local timeout if set, otherwise default
auto coroTimeout = getCoroFetchTimeout();
auto timeout =
coroTimeout.count() > 0 ? coroTimeout : defaultTimeout;
auto const deadline = steady_clock::now() + timeout;
// Linear backoff for re-requests: 50ms, 100ms, 150ms... up to
// 2s
auto nextRequestDelay = 50ms;
constexpr auto maxRequestDelay = 2000ms;
constexpr auto backoffStep = 50ms;
auto nextRequestTime = steady_clock::now() + nextRequestDelay;
JLOG(journal_.debug())
<< "finishFetch: waiting for node " << hash;
while (steady_clock::now() < deadline)
// If partial sync wait is enabled, poll-wait for the node
if (isPartialSyncWaitEnabled())
if (auto* coro =
static_cast<JobQueue::Coro*>(getCurrentCoroPtr()))
{
// Sleep for the poll interval (yields coroutine, frees job
// thread)
coro->sleepFor(pollInterval);
using namespace std::chrono;
constexpr auto pollInterval = 50ms;
constexpr auto defaultTimeout = 30s;
// Use coroutine-local timeout if set, otherwise default
auto coroTimeout = getCoroFetchTimeout();
auto timeout =
coroTimeout.count() > 0 ? coroTimeout : defaultTimeout;
auto const deadline = steady_clock::now() + timeout;
// Try to fetch from cache/db again
if (auto obj = f_.db().fetchNodeObject(
hash.as_uint256(), ledgerSeq_))
// Linear backoff for re-requests: 50ms, 100ms, 150ms... up
// to 2s
auto nextRequestDelay = 50ms;
constexpr auto maxRequestDelay = 2000ms;
constexpr auto backoffStep = 50ms;
auto nextRequestTime =
steady_clock::now() + nextRequestDelay;
JLOG(journal_.debug())
<< "finishFetch: waiting for node " << hash;
while (steady_clock::now() < deadline)
{
JLOG(journal_.debug())
<< "finishFetch: got node " << hash;
auto node = SHAMapTreeNode::makeFromPrefix(
makeSlice(obj->getData()), hash);
if (node)
canonicalize(hash, node);
return node;
// Sleep for the poll interval (yields coroutine, frees
// job thread)
coro->sleepFor(pollInterval);
// Try to fetch from cache/db again
if (auto obj = f_.db().fetchNodeObject(
hash.as_uint256(), ledgerSeq_))
{
JLOG(journal_.debug())
<< "finishFetch: got node " << hash;
auto node = SHAMapTreeNode::makeFromPrefix(
makeSlice(obj->getData()), hash);
if (node)
canonicalize(hash, node);
return node;
}
// Re-request with priority using linear backoff
auto now = steady_clock::now();
if (now >= nextRequestTime)
{
f_.missingNodeAcquireBySeq(
ledgerSeq_,
hash.as_uint256(),
true /*prioritize*/);
// Increase delay for next request (linear backoff)
if (nextRequestDelay < maxRequestDelay)
nextRequestDelay += backoffStep;
nextRequestTime = now + nextRequestDelay;
}
}
// Re-request with priority using linear backoff
auto now = steady_clock::now();
if (now >= nextRequestTime)
{
f_.missingNodeAcquireBySeq(
ledgerSeq_, hash.as_uint256(), true /*prioritize*/);
// Increase delay for next request (linear backoff)
if (nextRequestDelay < maxRequestDelay)
nextRequestDelay += backoffStep;
nextRequestTime = now + nextRequestDelay;
}
JLOG(journal_.warn())
<< "finishFetch: timeout waiting for node " << hash;
}
JLOG(journal_.warn())
<< "finishFetch: timeout waiting for node " << hash;
}
return {};
}