Allow server to stabilize after online delete health check failure

This commit is contained in:
Mark Travis
2022-04-05 18:02:53 -07:00
committed by manojsdoshi
parent dfe69f1b76
commit 5aedb0e07a
3 changed files with 52 additions and 112 deletions

View File

@@ -1140,17 +1140,10 @@
# The online delete process checks periodically
# that rippled is still in sync with the network,
# and that the validated ledger is less than
# 'age_threshold_seconds' old. By default, if it
# is not the online delete process aborts and
# tries again later. If 'recovery_wait_seconds'
# is set and rippled is out of sync, but likely to
# recover quickly, then online delete will wait
# this number of seconds for rippled to get back
# into sync before it aborts.
# Set this value if the node is otherwise staying
# in sync, or recovering quickly, but the online
# delete process is unable to finish.
# Default is unset.
# 'age_threshold_seconds' old. If it is not, the online
# delete process sleeps for this number of seconds and
# checks again until the node is healthy.
# Default is 5.
#
# Optional keys for Cassandra:
#

View File

@@ -138,7 +138,7 @@ SHAMapStoreImp::SHAMapStoreImp(
if (get_if_exists(section, "age_threshold_seconds", temp))
ageThreshold_ = std::chrono::seconds{temp};
if (get_if_exists(section, "recovery_wait_seconds", temp))
recoveryWaitTime_.emplace(std::chrono::seconds{temp});
recoveryWaitTime_ = std::chrono::seconds{temp};
get_if_exists(section, "advisory_delete", advisoryDelete_);
@@ -268,7 +268,7 @@ SHAMapStoreImp::copyNode(std::uint64_t& nodeCount, SHAMapTreeNode const& node)
true);
if (!(++nodeCount % checkHealthInterval_))
{
if (health())
if (stopping())
return false;
}
@@ -326,7 +326,7 @@ SHAMapStoreImp::run()
bool const readyToRotate =
validatedSeq >= lastRotated + deleteInterval_ &&
canDelete_ >= lastRotated - 1 && !health();
canDelete_ >= lastRotated - 1 && !stopping();
// Make sure we don't delete ledgers currently being
// imported into the ShardStore
@@ -358,15 +358,8 @@ SHAMapStoreImp::run()
<< ledgerMaster_->getValidatedLedgerAge().count() << 's';
clearPrior(lastRotated);
switch (health())
{
case Health::stopping:
if (stopping())
return;
case Health::unhealthy:
continue;
case Health::ok:
default:;
}
JLOG(journal_.debug()) << "copying ledger " << validatedSeq;
std::uint64_t nodeCount = 0;
@@ -375,30 +368,16 @@ SHAMapStoreImp::run()
this,
std::ref(nodeCount),
std::placeholders::_1));
switch (health())
{
case Health::stopping:
if (stopping())
return;
case Health::unhealthy:
continue;
case Health::ok:
default:;
}
// Only log if we completed without a "health" abort
JLOG(journal_.debug()) << "copied ledger " << validatedSeq
<< " nodecount " << nodeCount;
JLOG(journal_.debug()) << "freshening caches";
freshenCaches();
switch (health())
{
case Health::stopping:
if (stopping())
return;
case Health::unhealthy:
continue;
case Health::ok:
default:;
}
// Only log if we completed without a "health" abort
JLOG(journal_.debug()) << validatedSeq << " freshened caches";
@@ -408,15 +387,8 @@ SHAMapStoreImp::run()
<< validatedSeq << " new backend " << newBackend->getName();
clearCaches(validatedSeq);
switch (health())
{
case Health::stopping:
if (stopping())
return;
case Health::unhealthy:
continue;
case Health::ok:
default:;
}
lastRotated = validatedSeq;
@@ -580,7 +552,7 @@ SHAMapStoreImp::clearSql(
min = *m;
}
if (min > lastRotated || health() != Health::ok)
if (min > lastRotated || stopping())
return;
if (min == lastRotated)
{
@@ -601,11 +573,11 @@ SHAMapStoreImp::clearSql(
JLOG(journal_.trace())
<< "End: Delete up to " << deleteBatch_ << " rows with LedgerSeq < "
<< min << " from: " << TableName;
if (health())
if (stopping())
return;
if (min < lastRotated)
std::this_thread::sleep_for(backOff_);
if (health())
if (stopping())
return;
}
JLOG(journal_.debug()) << "finished deleting from: " << TableName;
@@ -645,7 +617,7 @@ SHAMapStoreImp::clearPrior(LedgerIndex lastRotated)
ledgerMaster_->clearPriorLedgers(lastRotated);
JLOG(journal_.trace()) << "End: Clear internal ledgers up to "
<< lastRotated;
if (health())
if (stopping())
return;
RelationalDBInterfaceSqlite* iface =
@@ -661,7 +633,7 @@ SHAMapStoreImp::clearPrior(LedgerIndex lastRotated)
[&iface](LedgerIndex min) -> void {
iface->deleteBeforeLedgerSeq(min);
});
if (health())
if (stopping())
return;
if (!app_.config().useTxTables())
@@ -676,7 +648,7 @@ SHAMapStoreImp::clearPrior(LedgerIndex lastRotated)
[&iface](LedgerIndex min) -> void {
iface->deleteTransactionsBeforeLedgerSeq(min);
});
if (health())
if (stopping())
return;
clearSql(
@@ -688,52 +660,30 @@ SHAMapStoreImp::clearPrior(LedgerIndex lastRotated)
[&iface](LedgerIndex min) -> void {
iface->deleteAccountTransactionsBeforeLedgerSeq(min);
});
if (health())
if (stopping())
return;
}
SHAMapStoreImp::Health
SHAMapStoreImp::health()
bool
SHAMapStoreImp::stopping()
{
{
std::lock_guard lock(mutex_);
if (stop_)
return Health::stopping;
}
if (!netOPs_)
return Health::ok;
assert(deleteInterval_);
if (healthy_)
{
auto age = ledgerMaster_->getValidatedLedgerAge();
OperatingMode mode = netOPs_->getOperatingMode();
if (recoveryWaitTime_ && mode == OperatingMode::SYNCING &&
age < ageThreshold_)
std::unique_lock lock(mutex_);
while (!stop_ && (mode != OperatingMode::FULL || age > ageThreshold_))
{
JLOG(journal_.warn())
<< "Waiting " << recoveryWaitTime_->count()
<< "s for node to get back into sync with network. state: "
<< app_.getOPs().strOperatingMode(mode, false) << ". age "
<< age.count() << 's';
std::this_thread::sleep_for(*recoveryWaitTime_);
age = ledgerMaster_->getValidatedLedgerAge();
mode = netOPs_->getOperatingMode();
}
if (mode != OperatingMode::FULL || age > ageThreshold_)
{
JLOG(journal_.warn()) << "Not deleting. state: "
lock.unlock();
JLOG(journal_.warn()) << "Waiting " << recoveryWaitTime_.count()
<< "s for node to stabilize. state: "
<< app_.getOPs().strOperatingMode(mode, false)
<< ". age " << age.count() << 's';
healthy_ = false;
}
std::this_thread::sleep_for(recoveryWaitTime_);
age = ledgerMaster_->getValidatedLedgerAge();
mode = netOPs_->getOperatingMode();
lock.lock();
}
if (healthy_)
return Health::ok;
else
return Health::unhealthy;
return stop_;
}
void

View File

@@ -40,8 +40,6 @@ class NetworkOPs;
class SHAMapStoreImp : public SHAMapStore
{
private:
enum Health : std::uint8_t { ok = 0, stopping, unhealthy };
class SavedStateDB
{
public:
@@ -106,12 +104,12 @@ private:
std::uint32_t deleteBatch_ = 100;
std::chrono::milliseconds backOff_{100};
std::chrono::seconds ageThreshold_{60};
/// If set, and the node is out of sync during an
/// If the node is out of sync during an
/// online_delete health check, sleep the thread
/// for this time and check again so the node can
/// recover.
/// for this time, and continue checking until
/// recovery.
/// See also: "recovery_wait_seconds" in rippled-example.cfg
std::optional<std::chrono::seconds> recoveryWaitTime_;
std::chrono::seconds recoveryWaitTime_{5};
// these do not exist upon SHAMapStore creation, but do exist
// as of run() or before
@@ -201,7 +199,7 @@ private:
{
dbRotating_->fetchNodeObject(
key, 0, NodeStore::FetchType::synchronous, true);
if (!(++check % checkHealthInterval_) && health())
if (!(++check % checkHealthInterval_) && stopping())
return true;
}
@@ -225,16 +223,15 @@ private:
void
clearPrior(LedgerIndex lastRotated);
// If rippled is not healthy, defer rotate-delete.
// If already unhealthy, do not change state on further check.
// Assume that, once unhealthy, a necessary step has been
// aborted, so the online-delete process needs to restart
// at next ledger.
// If recoveryWaitTime_ is set, this may sleep to give rippled
// time to recover, so never call it from any thread other than
// the main "run()".
Health
health();
/**
* This is a health check for online deletion that waits until rippled is
* stable before returning. If the server is stopping, then it returns
* "true" to inform the caller to allow the server to stop.
*
* @return Whether the server is stopping.
*/
bool
stopping();
public:
void