Allow server to stabilize after online delete health check failure

This commit is contained in:
Mark Travis
2022-04-05 18:02:53 -07:00
committed by manojsdoshi
parent dfe69f1b76
commit 5aedb0e07a
3 changed files with 52 additions and 112 deletions

View File

@@ -1140,17 +1140,10 @@
# The online delete process checks periodically
# that rippled is still in sync with the network,
# and that the validated ledger is less than
# 'age_threshold_seconds' old. By default, if it
# is not the online delete process aborts and
# tries again later. If 'recovery_wait_seconds'
# is set and rippled is out of sync, but likely to
# recover quickly, then online delete will wait
# this number of seconds for rippled to get back
# into sync before it aborts.
# Set this value if the node is otherwise staying
# in sync, or recovering quickly, but the online
# delete process is unable to finish.
# Default is unset.
# 'age_threshold_seconds' old. If it is not, the online
# delete process sleeps for this number of seconds and
# checks again until the node is healthy.
# Default is 5.
#
# Optional keys for Cassandra:
#

View File

@@ -138,7 +138,7 @@ SHAMapStoreImp::SHAMapStoreImp(
if (get_if_exists(section, "age_threshold_seconds", temp))
ageThreshold_ = std::chrono::seconds{temp};
if (get_if_exists(section, "recovery_wait_seconds", temp))
recoveryWaitTime_.emplace(std::chrono::seconds{temp});
recoveryWaitTime_ = std::chrono::seconds{temp};
get_if_exists(section, "advisory_delete", advisoryDelete_);
@@ -268,7 +268,7 @@ SHAMapStoreImp::copyNode(std::uint64_t& nodeCount, SHAMapTreeNode const& node)
true);
if (!(++nodeCount % checkHealthInterval_))
{
if (health())
if (stopping())
return false;
}
@@ -326,7 +326,7 @@ SHAMapStoreImp::run()
bool const readyToRotate =
validatedSeq >= lastRotated + deleteInterval_ &&
canDelete_ >= lastRotated - 1 && !health();
canDelete_ >= lastRotated - 1 && !stopping();
// Make sure we don't delete ledgers currently being
// imported into the ShardStore
@@ -358,15 +358,8 @@ SHAMapStoreImp::run()
<< ledgerMaster_->getValidatedLedgerAge().count() << 's';
clearPrior(lastRotated);
switch (health())
{
case Health::stopping:
if (stopping())
return;
case Health::unhealthy:
continue;
case Health::ok:
default:;
}
JLOG(journal_.debug()) << "copying ledger " << validatedSeq;
std::uint64_t nodeCount = 0;
@@ -375,30 +368,16 @@ SHAMapStoreImp::run()
this,
std::ref(nodeCount),
std::placeholders::_1));
switch (health())
{
case Health::stopping:
if (stopping())
return;
case Health::unhealthy:
continue;
case Health::ok:
default:;
}
// Only log if we completed without a "health" abort
JLOG(journal_.debug()) << "copied ledger " << validatedSeq
<< " nodecount " << nodeCount;
JLOG(journal_.debug()) << "freshening caches";
freshenCaches();
switch (health())
{
case Health::stopping:
if (stopping())
return;
case Health::unhealthy:
continue;
case Health::ok:
default:;
}
// Only log if we completed without a "health" abort
JLOG(journal_.debug()) << validatedSeq << " freshened caches";
@@ -408,15 +387,8 @@ SHAMapStoreImp::run()
<< validatedSeq << " new backend " << newBackend->getName();
clearCaches(validatedSeq);
switch (health())
{
case Health::stopping:
if (stopping())
return;
case Health::unhealthy:
continue;
case Health::ok:
default:;
}
lastRotated = validatedSeq;
@@ -580,7 +552,7 @@ SHAMapStoreImp::clearSql(
min = *m;
}
if (min > lastRotated || health() != Health::ok)
if (min > lastRotated || stopping())
return;
if (min == lastRotated)
{
@@ -601,11 +573,11 @@ SHAMapStoreImp::clearSql(
JLOG(journal_.trace())
<< "End: Delete up to " << deleteBatch_ << " rows with LedgerSeq < "
<< min << " from: " << TableName;
if (health())
if (stopping())
return;
if (min < lastRotated)
std::this_thread::sleep_for(backOff_);
if (health())
if (stopping())
return;
}
JLOG(journal_.debug()) << "finished deleting from: " << TableName;
@@ -645,7 +617,7 @@ SHAMapStoreImp::clearPrior(LedgerIndex lastRotated)
ledgerMaster_->clearPriorLedgers(lastRotated);
JLOG(journal_.trace()) << "End: Clear internal ledgers up to "
<< lastRotated;
if (health())
if (stopping())
return;
RelationalDBInterfaceSqlite* iface =
@@ -661,7 +633,7 @@ SHAMapStoreImp::clearPrior(LedgerIndex lastRotated)
[&iface](LedgerIndex min) -> void {
iface->deleteBeforeLedgerSeq(min);
});
if (health())
if (stopping())
return;
if (!app_.config().useTxTables())
@@ -676,7 +648,7 @@ SHAMapStoreImp::clearPrior(LedgerIndex lastRotated)
[&iface](LedgerIndex min) -> void {
iface->deleteTransactionsBeforeLedgerSeq(min);
});
if (health())
if (stopping())
return;
clearSql(
@@ -688,52 +660,30 @@ SHAMapStoreImp::clearPrior(LedgerIndex lastRotated)
[&iface](LedgerIndex min) -> void {
iface->deleteAccountTransactionsBeforeLedgerSeq(min);
});
if (health())
if (stopping())
return;
}
SHAMapStoreImp::Health
SHAMapStoreImp::health()
bool
SHAMapStoreImp::stopping()
{
{
std::lock_guard lock(mutex_);
if (stop_)
return Health::stopping;
}
if (!netOPs_)
return Health::ok;
assert(deleteInterval_);
if (healthy_)
{
auto age = ledgerMaster_->getValidatedLedgerAge();
OperatingMode mode = netOPs_->getOperatingMode();
if (recoveryWaitTime_ && mode == OperatingMode::SYNCING &&
age < ageThreshold_)
std::unique_lock lock(mutex_);
while (!stop_ && (mode != OperatingMode::FULL || age > ageThreshold_))
{
JLOG(journal_.warn())
<< "Waiting " << recoveryWaitTime_->count()
<< "s for node to get back into sync with network. state: "
<< app_.getOPs().strOperatingMode(mode, false) << ". age "
<< age.count() << 's';
std::this_thread::sleep_for(*recoveryWaitTime_);
age = ledgerMaster_->getValidatedLedgerAge();
mode = netOPs_->getOperatingMode();
}
if (mode != OperatingMode::FULL || age > ageThreshold_)
{
JLOG(journal_.warn()) << "Not deleting. state: "
lock.unlock();
JLOG(journal_.warn()) << "Waiting " << recoveryWaitTime_.count()
<< "s for node to stabilize. state: "
<< app_.getOPs().strOperatingMode(mode, false)
<< ". age " << age.count() << 's';
healthy_ = false;
}
std::this_thread::sleep_for(recoveryWaitTime_);
age = ledgerMaster_->getValidatedLedgerAge();
mode = netOPs_->getOperatingMode();
lock.lock();
}
if (healthy_)
return Health::ok;
else
return Health::unhealthy;
return stop_;
}
void

View File

@@ -40,8 +40,6 @@ class NetworkOPs;
class SHAMapStoreImp : public SHAMapStore
{
private:
enum Health : std::uint8_t { ok = 0, stopping, unhealthy };
class SavedStateDB
{
public:
@@ -106,12 +104,12 @@ private:
std::uint32_t deleteBatch_ = 100;
std::chrono::milliseconds backOff_{100};
std::chrono::seconds ageThreshold_{60};
/// If set, and the node is out of sync during an
/// If the node is out of sync during an
/// online_delete health check, sleep the thread
/// for this time and check again so the node can
/// recover.
/// for this time, and continue checking until
/// recovery.
/// See also: "recovery_wait_seconds" in rippled-example.cfg
std::optional<std::chrono::seconds> recoveryWaitTime_;
std::chrono::seconds recoveryWaitTime_{5};
// these do not exist upon SHAMapStore creation, but do exist
// as of run() or before
@@ -201,7 +199,7 @@ private:
{
dbRotating_->fetchNodeObject(
key, 0, NodeStore::FetchType::synchronous, true);
if (!(++check % checkHealthInterval_) && health())
if (!(++check % checkHealthInterval_) && stopping())
return true;
}
@@ -225,16 +223,15 @@ private:
void
clearPrior(LedgerIndex lastRotated);
// If rippled is not healthy, defer rotate-delete.
// If already unhealthy, do not change state on further check.
// Assume that, once unhealthy, a necessary step has been
// aborted, so the online-delete process needs to restart
// at next ledger.
// If recoveryWaitTime_ is set, this may sleep to give rippled
// time to recover, so never call it from any thread other than
// the main "run()".
Health
health();
/**
* This is a health check for online deletion that waits until rippled is
* stable before returning. If the server is stopping, then it returns
* "true" to inform the caller to allow the server to stop.
*
* @return Whether the server is stopping.
*/
bool
stopping();
public:
void