mirror of
https://github.com/Xahau/xahaud.git
synced 2026-04-29 15:37:46 +00:00
fix: make partial sync poll-wait opt-in via partialSyncWait flag
Previously, finishFetch() entered the poll-wait loop for any coroutine context, which caused unit tests to spin for 30s on missing nodes when there was no network to deliver them. The poll-wait now requires an explicit setPartialSyncWait(true) call from the partial sync code paths (RPCHelpers, SubmitAndWait).
This commit is contained in:
@@ -36,6 +36,10 @@ struct LocalValues
|
||||
bool onCoro = true;
|
||||
void* coroPtr = nullptr; // Pointer to owning JobQueue::Coro (if any)
|
||||
|
||||
// When true, SHAMap::finishFetch() will poll-wait for missing nodes
|
||||
// instead of returning empty. Only set by partial sync code paths.
|
||||
bool partialSyncWait = false;
|
||||
|
||||
// Configurable timeout for SHAMap node fetching during partial sync.
|
||||
// Zero means use the default (30s). RPC handlers can set this to
|
||||
// customize poll-wait behavior.
|
||||
@@ -145,6 +149,25 @@ getCurrentCoroPtr()
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// Check if partial sync wait is enabled for the current coroutine context.
|
||||
inline bool
|
||||
isPartialSyncWaitEnabled()
|
||||
{
|
||||
auto lvs = detail::getLocalValues().get();
|
||||
if (lvs && lvs->onCoro)
|
||||
return lvs->partialSyncWait;
|
||||
return false;
|
||||
}
|
||||
|
||||
// Enable/disable partial sync wait for the current coroutine context.
|
||||
inline void
|
||||
setPartialSyncWait(bool enabled)
|
||||
{
|
||||
auto lvs = detail::getLocalValues().get();
|
||||
if (lvs && lvs->onCoro)
|
||||
lvs->partialSyncWait = enabled;
|
||||
}
|
||||
|
||||
// Get the configured fetch timeout for current coroutine context.
|
||||
// Returns 0ms if not in a coroutine or no custom timeout set.
|
||||
inline std::chrono::milliseconds
|
||||
|
||||
@@ -119,7 +119,8 @@ doSubmitAndWait(RPC::JsonContext& context)
|
||||
timeout = std::chrono::seconds(t);
|
||||
}
|
||||
|
||||
// Set coroutine-local fetch timeout for SHAMap operations
|
||||
// Enable partial sync wait for SHAMap operations
|
||||
setPartialSyncWait(true);
|
||||
setCoroFetchTimeout(
|
||||
std::chrono::duration_cast<std::chrono::milliseconds>(timeout / 2));
|
||||
|
||||
|
||||
@@ -25,6 +25,7 @@
|
||||
#include <ripple/app/paths/TrustLine.h>
|
||||
#include <ripple/app/rdb/RelationalDatabase.h>
|
||||
#include <ripple/app/tx/impl/details/NFTokenUtils.h>
|
||||
#include <ripple/basics/LocalValue.h>
|
||||
#include <ripple/ledger/View.h>
|
||||
#include <ripple/net/RPCErr.h>
|
||||
#include <ripple/protocol/AccountID.h>
|
||||
@@ -695,6 +696,7 @@ getLedger(T& ledger, LedgerShortcut shortcut, Context& context)
|
||||
|
||||
if (hash.isNonZero())
|
||||
{
|
||||
setPartialSyncWait(true);
|
||||
ledger = context.app.getInboundLedgers().getPartialLedger(hash);
|
||||
// If no InboundLedger exists yet, trigger acquisition and wait
|
||||
if (!ledger)
|
||||
|
||||
@@ -189,64 +189,69 @@ SHAMap::finishFetch(
|
||||
f_.missingNodeAcquireBySeq(ledgerSeq_, hash.as_uint256());
|
||||
}
|
||||
|
||||
// If we're in a coroutine context, poll-wait for the node
|
||||
if (auto* coro = static_cast<JobQueue::Coro*>(getCurrentCoroPtr()))
|
||||
{
|
||||
using namespace std::chrono;
|
||||
constexpr auto pollInterval = 50ms;
|
||||
constexpr auto defaultTimeout = 30s;
|
||||
// Use coroutine-local timeout if set, otherwise default
|
||||
auto coroTimeout = getCoroFetchTimeout();
|
||||
auto timeout =
|
||||
coroTimeout.count() > 0 ? coroTimeout : defaultTimeout;
|
||||
auto const deadline = steady_clock::now() + timeout;
|
||||
|
||||
// Linear backoff for re-requests: 50ms, 100ms, 150ms... up to
|
||||
// 2s
|
||||
auto nextRequestDelay = 50ms;
|
||||
constexpr auto maxRequestDelay = 2000ms;
|
||||
constexpr auto backoffStep = 50ms;
|
||||
auto nextRequestTime = steady_clock::now() + nextRequestDelay;
|
||||
|
||||
JLOG(journal_.debug())
|
||||
<< "finishFetch: waiting for node " << hash;
|
||||
|
||||
while (steady_clock::now() < deadline)
|
||||
// If partial sync wait is enabled, poll-wait for the node
|
||||
if (isPartialSyncWaitEnabled())
|
||||
if (auto* coro =
|
||||
static_cast<JobQueue::Coro*>(getCurrentCoroPtr()))
|
||||
{
|
||||
// Sleep for the poll interval (yields coroutine, frees job
|
||||
// thread)
|
||||
coro->sleepFor(pollInterval);
|
||||
using namespace std::chrono;
|
||||
constexpr auto pollInterval = 50ms;
|
||||
constexpr auto defaultTimeout = 30s;
|
||||
// Use coroutine-local timeout if set, otherwise default
|
||||
auto coroTimeout = getCoroFetchTimeout();
|
||||
auto timeout =
|
||||
coroTimeout.count() > 0 ? coroTimeout : defaultTimeout;
|
||||
auto const deadline = steady_clock::now() + timeout;
|
||||
|
||||
// Try to fetch from cache/db again
|
||||
if (auto obj = f_.db().fetchNodeObject(
|
||||
hash.as_uint256(), ledgerSeq_))
|
||||
// Linear backoff for re-requests: 50ms, 100ms, 150ms... up
|
||||
// to 2s
|
||||
auto nextRequestDelay = 50ms;
|
||||
constexpr auto maxRequestDelay = 2000ms;
|
||||
constexpr auto backoffStep = 50ms;
|
||||
auto nextRequestTime =
|
||||
steady_clock::now() + nextRequestDelay;
|
||||
|
||||
JLOG(journal_.debug())
|
||||
<< "finishFetch: waiting for node " << hash;
|
||||
|
||||
while (steady_clock::now() < deadline)
|
||||
{
|
||||
JLOG(journal_.debug())
|
||||
<< "finishFetch: got node " << hash;
|
||||
auto node = SHAMapTreeNode::makeFromPrefix(
|
||||
makeSlice(obj->getData()), hash);
|
||||
if (node)
|
||||
canonicalize(hash, node);
|
||||
return node;
|
||||
// Sleep for the poll interval (yields coroutine, frees
|
||||
// job thread)
|
||||
coro->sleepFor(pollInterval);
|
||||
|
||||
// Try to fetch from cache/db again
|
||||
if (auto obj = f_.db().fetchNodeObject(
|
||||
hash.as_uint256(), ledgerSeq_))
|
||||
{
|
||||
JLOG(journal_.debug())
|
||||
<< "finishFetch: got node " << hash;
|
||||
auto node = SHAMapTreeNode::makeFromPrefix(
|
||||
makeSlice(obj->getData()), hash);
|
||||
if (node)
|
||||
canonicalize(hash, node);
|
||||
return node;
|
||||
}
|
||||
|
||||
// Re-request with priority using linear backoff
|
||||
auto now = steady_clock::now();
|
||||
if (now >= nextRequestTime)
|
||||
{
|
||||
f_.missingNodeAcquireBySeq(
|
||||
ledgerSeq_,
|
||||
hash.as_uint256(),
|
||||
true /*prioritize*/);
|
||||
// Increase delay for next request (linear backoff)
|
||||
if (nextRequestDelay < maxRequestDelay)
|
||||
nextRequestDelay += backoffStep;
|
||||
nextRequestTime = now + nextRequestDelay;
|
||||
}
|
||||
}
|
||||
|
||||
// Re-request with priority using linear backoff
|
||||
auto now = steady_clock::now();
|
||||
if (now >= nextRequestTime)
|
||||
{
|
||||
f_.missingNodeAcquireBySeq(
|
||||
ledgerSeq_, hash.as_uint256(), true /*prioritize*/);
|
||||
// Increase delay for next request (linear backoff)
|
||||
if (nextRequestDelay < maxRequestDelay)
|
||||
nextRequestDelay += backoffStep;
|
||||
nextRequestTime = now + nextRequestDelay;
|
||||
}
|
||||
JLOG(journal_.warn())
|
||||
<< "finishFetch: timeout waiting for node " << hash;
|
||||
}
|
||||
|
||||
JLOG(journal_.warn())
|
||||
<< "finishFetch: timeout waiting for node " << hash;
|
||||
}
|
||||
|
||||
return {};
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user