mirror of https://github.com/Xahau/xahaud.git (synced 2025-11-04 18:55:49 +00:00)
Allow server to stabilize after online delete health check failure
@@ -1140,17 +1140,10 @@
 # The online delete process checks periodically
 # that rippled is still in sync with the network,
 # and that the validated ledger is less than
-# 'age_threshold_seconds' old. By default, if it
-# is not the online delete process aborts and
-# tries again later. If 'recovery_wait_seconds'
-# is set and rippled is out of sync, but likely to
-# recover quickly, then online delete will wait
-# this number of seconds for rippled to get back
-# into sync before it aborts.
-# Set this value if the node is otherwise staying
-# in sync, or recovering quickly, but the online
-# delete process is unable to finish.
-# Default is unset.
+# 'age_threshold_seconds' old. If not, then continue
+# sleeping for this number of seconds and
+# checking until healthy.
+# Default is 5.
 #
 # Optional keys for Cassandra:
 #
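For reference, a sketch of how these keys might sit together in a [node_db] stanza. The type, path, and online_delete values below are illustrative only; the 60s and 5s values match the ageThreshold_{60} and recoveryWaitTime_{5} defaults in the header change further down.

[node_db]
type=NuDB
path=/var/lib/rippled/db/nudb
online_delete=2000
advisory_delete=0
age_threshold_seconds=60
recovery_wait_seconds=5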
@@ -138,7 +138,7 @@ SHAMapStoreImp::SHAMapStoreImp(
     if (get_if_exists(section, "age_threshold_seconds", temp))
         ageThreshold_ = std::chrono::seconds{temp};
     if (get_if_exists(section, "recovery_wait_seconds", temp))
-        recoveryWaitTime_.emplace(std::chrono::seconds{temp});
+        recoveryWaitTime_ = std::chrono::seconds{temp};

     get_if_exists(section, "advisory_delete", advisoryDelete_);

@@ -268,7 +268,7 @@ SHAMapStoreImp::copyNode(std::uint64_t& nodeCount, SHAMapTreeNode const& node)
         true);
     if (!(++nodeCount % checkHealthInterval_))
     {
-        if (health())
+        if (stopping())
             return false;
     }

@@ -326,7 +326,7 @@ SHAMapStoreImp::run()

         bool const readyToRotate =
             validatedSeq >= lastRotated + deleteInterval_ &&
-            canDelete_ >= lastRotated - 1 && !health();
+            canDelete_ >= lastRotated - 1 && !stopping();

         // Make sure we don't delete ledgers currently being
         // imported into the ShardStore
@@ -358,15 +358,8 @@ SHAMapStoreImp::run()
                 << ledgerMaster_->getValidatedLedgerAge().count() << 's';

             clearPrior(lastRotated);
-            switch (health())
-            {
-                case Health::stopping:
-                    return;
-                case Health::unhealthy:
-                    continue;
-                case Health::ok:
-                default:;
-            }
+            if (stopping())
+                return;

             JLOG(journal_.debug()) << "copying ledger " << validatedSeq;
             std::uint64_t nodeCount = 0;
@@ -375,30 +368,16 @@ SHAMapStoreImp::run()
                 this,
                 std::ref(nodeCount),
                 std::placeholders::_1));
-            switch (health())
-            {
-                case Health::stopping:
-                    return;
-                case Health::unhealthy:
-                    continue;
-                case Health::ok:
-                default:;
-            }
+            if (stopping())
+                return;
             // Only log if we completed without a "health" abort
             JLOG(journal_.debug()) << "copied ledger " << validatedSeq
                                    << " nodecount " << nodeCount;

             JLOG(journal_.debug()) << "freshening caches";
             freshenCaches();
-            switch (health())
-            {
-                case Health::stopping:
-                    return;
-                case Health::unhealthy:
-                    continue;
-                case Health::ok:
-                default:;
-            }
+            if (stopping())
+                return;
             // Only log if we completed without a "health" abort
             JLOG(journal_.debug()) << validatedSeq << " freshened caches";

@@ -408,15 +387,8 @@ SHAMapStoreImp::run()
                 << validatedSeq << " new backend " << newBackend->getName();

             clearCaches(validatedSeq);
-            switch (health())
-            {
-                case Health::stopping:
-                    return;
-                case Health::unhealthy:
-                    continue;
-                case Health::ok:
-                default:;
-            }
+            if (stopping())
+                return;

             lastRotated = validatedSeq;

@@ -580,7 +552,7 @@ SHAMapStoreImp::clearSql(
         min = *m;
     }

-    if (min > lastRotated || health() != Health::ok)
+    if (min > lastRotated || stopping())
         return;
     if (min == lastRotated)
     {
@@ -601,11 +573,11 @@ SHAMapStoreImp::clearSql(
         JLOG(journal_.trace())
             << "End: Delete up to " << deleteBatch_ << " rows with LedgerSeq < "
             << min << " from: " << TableName;
-        if (health())
+        if (stopping())
            return;
        if (min < lastRotated)
            std::this_thread::sleep_for(backOff_);
-        if (health())
+        if (stopping())
            return;
    }
    JLOG(journal_.debug()) << "finished deleting from: " << TableName;
@@ -645,7 +617,7 @@ SHAMapStoreImp::clearPrior(LedgerIndex lastRotated)
     ledgerMaster_->clearPriorLedgers(lastRotated);
     JLOG(journal_.trace()) << "End: Clear internal ledgers up to "
                            << lastRotated;
-    if (health())
+    if (stopping())
         return;

     RelationalDBInterfaceSqlite* iface =
@@ -661,7 +633,7 @@ SHAMapStoreImp::clearPrior(LedgerIndex lastRotated)
         [&iface](LedgerIndex min) -> void {
             iface->deleteBeforeLedgerSeq(min);
         });
-    if (health())
+    if (stopping())
         return;

     if (!app_.config().useTxTables())
@@ -676,7 +648,7 @@ SHAMapStoreImp::clearPrior(LedgerIndex lastRotated)
         [&iface](LedgerIndex min) -> void {
             iface->deleteTransactionsBeforeLedgerSeq(min);
         });
-    if (health())
+    if (stopping())
         return;

     clearSql(
@@ -688,52 +660,30 @@ SHAMapStoreImp::clearPrior(LedgerIndex lastRotated)
         [&iface](LedgerIndex min) -> void {
             iface->deleteAccountTransactionsBeforeLedgerSeq(min);
         });
-    if (health())
+    if (stopping())
         return;
 }

-SHAMapStoreImp::Health
-SHAMapStoreImp::health()
+bool
+SHAMapStoreImp::stopping()
 {
-    {
-        std::lock_guard lock(mutex_);
-        if (stop_)
-            return Health::stopping;
-    }
-    if (!netOPs_)
-        return Health::ok;
-    assert(deleteInterval_);
-
-    if (healthy_)
-    {
-        auto age = ledgerMaster_->getValidatedLedgerAge();
-        OperatingMode mode = netOPs_->getOperatingMode();
-        if (recoveryWaitTime_ && mode == OperatingMode::SYNCING &&
-            age < ageThreshold_)
-        {
-            JLOG(journal_.warn())
-                << "Waiting " << recoveryWaitTime_->count()
-                << "s for node to get back into sync with network. state: "
-                << app_.getOPs().strOperatingMode(mode, false) << ". age "
-                << age.count() << 's';
-            std::this_thread::sleep_for(*recoveryWaitTime_);
-
-            age = ledgerMaster_->getValidatedLedgerAge();
-            mode = netOPs_->getOperatingMode();
-        }
-        if (mode != OperatingMode::FULL || age > ageThreshold_)
-        {
-            JLOG(journal_.warn()) << "Not deleting. state: "
-                                  << app_.getOPs().strOperatingMode(mode, false)
-                                  << ". age " << age.count() << 's';
-            healthy_ = false;
-        }
-    }
-
-    if (healthy_)
-        return Health::ok;
-    else
-        return Health::unhealthy;
+    auto age = ledgerMaster_->getValidatedLedgerAge();
+    OperatingMode mode = netOPs_->getOperatingMode();
+    std::unique_lock lock(mutex_);
+    while (!stop_ && (mode != OperatingMode::FULL || age > ageThreshold_))
+    {
+        lock.unlock();
+        JLOG(journal_.warn()) << "Waiting " << recoveryWaitTime_.count()
+                              << "s for node to stabilize. state: "
+                              << app_.getOPs().strOperatingMode(mode, false)
+                              << ". age " << age.count() << 's';
+        std::this_thread::sleep_for(recoveryWaitTime_);
+        age = ledgerMaster_->getValidatedLedgerAge();
+        mode = netOPs_->getOperatingMode();
+        lock.lock();
+    }
+
+    return stop_;
 }

 void
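The rewritten function above drops the three-state Health result in favor of a blocking wait: the online-delete thread holds off, releasing mutex_ while it sleeps, until the node is back in FULL mode with a sufficiently fresh validated ledger, or until a stop is requested. A self-contained sketch of that unlock-while-sleeping pattern, with hypothetical stand-ins (inSync, ledgerAgeSeconds, waitUntilStableOrStopping) for the real netOPs_/ledgerMaster_ sources:

#include <atomic>
#include <chrono>
#include <iostream>
#include <mutex>
#include <thread>

// Hypothetical stand-ins for the operating-mode and ledger-age sources.
std::atomic<bool> inSync{false};
std::atomic<int> ledgerAgeSeconds{120};

std::mutex mutex_;
bool stop_ = false;  // guarded by mutex_, set by the shutdown path

// Hold the calling thread until the node looks stable or a stop is
// requested; returns true if the caller should shut down.
bool waitUntilStableOrStopping(
    std::chrono::seconds recoveryWait,
    std::chrono::seconds ageThreshold)
{
    std::unique_lock lock(mutex_);
    while (!stop_ &&
           (!inSync ||
            std::chrono::seconds{ledgerAgeSeconds.load()} > ageThreshold))
    {
        // Release the mutex while sleeping so the shutdown path can set stop_.
        lock.unlock();
        std::cout << "waiting " << recoveryWait.count()
                  << "s for the node to stabilize\n";
        std::this_thread::sleep_for(recoveryWait);
        lock.lock();
    }
    return stop_;
}

int main()
{
    // Simulate the node recovering a few seconds after startup.
    std::thread recovery([] {
        std::this_thread::sleep_for(std::chrono::seconds{3});
        ledgerAgeSeconds = 10;
        inSync = true;
    });

    bool const mustStop = waitUntilStableOrStopping(
        std::chrono::seconds{1}, std::chrono::seconds{60});
    std::cout << (mustStop ? "stopping\n" : "stable, continuing\n");
    recovery.join();
}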
@@ -40,8 +40,6 @@ class NetworkOPs;
 class SHAMapStoreImp : public SHAMapStore
 {
 private:
-    enum Health : std::uint8_t { ok = 0, stopping, unhealthy };
-
     class SavedStateDB
     {
     public:
@@ -106,12 +104,12 @@ private:
     std::uint32_t deleteBatch_ = 100;
     std::chrono::milliseconds backOff_{100};
     std::chrono::seconds ageThreshold_{60};
-    /// If set, and the node is out of sync during an
+    /// If the node is out of sync during an
     /// online_delete health check, sleep the thread
-    /// for this time and check again so the node can
-    /// recover.
+    /// for this time, and continue checking until
+    /// recovery.
     /// See also: "recovery_wait_seconds" in rippled-example.cfg
-    std::optional<std::chrono::seconds> recoveryWaitTime_;
+    std::chrono::seconds recoveryWaitTime_{5};

     // these do not exist upon SHAMapStore creation, but do exist
     // as of run() or before
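The member's type change, from std::optional<std::chrono::seconds> to a plain std::chrono::seconds with a 5-second default, is also why the logging call site changes from recoveryWaitTime_->count() to recoveryWaitTime_.count(). A minimal standalone illustration (not from the codebase):

#include <chrono>
#include <iostream>
#include <optional>

int main()
{
    // Before: optional and unset by default, so call sites had to test and
    // dereference it (recoveryWaitTime_->count() in the old health()).
    std::optional<std::chrono::seconds> oldWait;
    if (oldWait)
        std::cout << "old wait: " << oldWait->count() << "s\n";

    // After: a plain duration with a 5-second default, used directly
    // (recoveryWaitTime_.count() in the new stopping()).
    std::chrono::seconds newWait{5};
    std::cout << "new wait: " << newWait.count() << "s\n";
}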
@@ -201,7 +199,7 @@ private:
         {
             dbRotating_->fetchNodeObject(
                 key, 0, NodeStore::FetchType::synchronous, true);
-            if (!(++check % checkHealthInterval_) && health())
+            if (!(++check % checkHealthInterval_) && stopping())
                 return true;
         }

@@ -225,16 +223,15 @@ private:
     void
     clearPrior(LedgerIndex lastRotated);

-    // If rippled is not healthy, defer rotate-delete.
-    // If already unhealthy, do not change state on further check.
-    // Assume that, once unhealthy, a necessary step has been
-    // aborted, so the online-delete process needs to restart
-    // at next ledger.
-    // If recoveryWaitTime_ is set, this may sleep to give rippled
-    // time to recover, so never call it from any thread other than
-    // the main "run()".
-    Health
-    health();
+    /**
+     * This is a health check for online deletion that waits until rippled is
+     * stable before returning. If the server is stopping, then it returns
+     * "true" to inform the caller to allow the server to stop.
+     *
+     * @return Whether the server is stopping.
+     */
+    bool
+    stopping();

 public:
     void
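Taken together with the .cpp hunks above, the calling pattern becomes a series of stopping() checkpoints between the phases of a rotation pass. A condensed, hypothetical sketch of that shape (stub helpers, not the real run() loop):

#include <iostream>

// Stubs standing in for the real members; only the checkpoint shape matters.
// The real stopping() blocks until the node is stable or shutdown begins.
bool stopping() { return false; }
void clearPrior() {}
void copyLedger() {}
void rotateBackends() {}

// One rotation pass after this commit: every phase is followed by a
// stopping() checkpoint, and there is no longer an "unhealthy" outcome
// that abandons the pass and retries at a later ledger.
void rotateOnce()
{
    clearPrior();
    if (stopping())
        return;

    copyLedger();
    if (stopping())
        return;

    rotateBackends();
    std::cout << "rotation pass complete\n";
}

int main()
{
    rotateOnce();
}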