From 5096cc1f5fd2386152188090784c4daf1afbe749 Mon Sep 17 00:00:00 2001 From: seelabs Date: Wed, 4 Dec 2019 10:58:23 -0800 Subject: [PATCH] Increase the deadlock detection timeout: It's possible an overloaded job queue is causing false alarms on the deadlock detector. Log a fatal message after 90s, declare a logic error after 600s. --- src/ripple/app/main/LoadManager.cpp | 43 ++++++++++++++++++++++------- 1 file changed, 33 insertions(+), 10 deletions(-) diff --git a/src/ripple/app/main/LoadManager.cpp b/src/ripple/app/main/LoadManager.cpp index 6a80803365..d2f19349dc 100644 --- a/src/ripple/app/main/LoadManager.cpp +++ b/src/ripple/app/main/LoadManager.cpp @@ -126,26 +126,49 @@ void LoadManager::run () auto const timeSpentDeadlocked = duration_cast(steady_clock::now() - deadLock); - auto const reportingIntervalSeconds = 10s; + constexpr auto reportingIntervalSeconds = 10s; + constexpr auto deadlockFatalLogMessageTimeLimit = 90s; + constexpr auto deadlockLogicErrorTimeLimit = 600s; if (armed && (timeSpentDeadlocked >= reportingIntervalSeconds)) { - // Report the deadlocked condition every 10 seconds + + // Report the deadlocked condition every reportingIntervalSeconds if ((timeSpentDeadlocked % reportingIntervalSeconds) == 0s) { - JLOG(journal_.warn()) - << "Server stalled for " - << timeSpentDeadlocked.count() << " seconds."; + if (timeSpentDeadlocked < deadlockFatalLogMessageTimeLimit) + { + JLOG(journal_.warn()) + << "Server stalled for " + << timeSpentDeadlocked.count() << " seconds."; + } + else + { + JLOG(journal_.fatal()) + << "Deadlock detected. Deadlocked time: " + << timeSpentDeadlocked.count() << "s"; + if (app_.getJobQueue().isOverloaded()) + { + JLOG(journal_.fatal()) + << app_.getJobQueue().getJson(0); + } + } } - // If we go over 90 seconds spent deadlocked, it means that + // If we go over deadlockLogicErrorTimeLimit spent deadlocked, it means that // the deadlock resolution code has failed, which qualifies // as undefined behavior.
// - constexpr auto deadlockTimeLimit = 90s; - assert (timeSpentDeadlocked < deadlockTimeLimit); - - if (timeSpentDeadlocked >= deadlockTimeLimit) + if (timeSpentDeadlocked >= deadlockLogicErrorTimeLimit) + { + JLOG(journal_.fatal()) + << "LogicError: Deadlock detected. Deadlocked time: " + << timeSpentDeadlocked.count() << "s"; + if (app_.getJobQueue().isOverloaded()) + { + JLOG(journal_.fatal()) << app_.getJobQueue().getJson(0); + } LogicError("Deadlock detected"); + } } }