Increase the deadlock detection timeout:

An overloaded job queue may be causing false alarms in the deadlock
detector. Log a fatal message after 90s, and declare a logic error after 600s.
This commit is contained in:
seelabs
2019-12-04 10:58:23 -08:00
committed by Nik Bougalis
parent 2aa11fa41d
commit 5096cc1f5f

View File

@@ -126,26 +126,49 @@ void LoadManager::run ()
auto const timeSpentDeadlocked = auto const timeSpentDeadlocked =
duration_cast<seconds>(steady_clock::now() - deadLock); duration_cast<seconds>(steady_clock::now() - deadLock);
auto const reportingIntervalSeconds = 10s; constexpr auto reportingIntervalSeconds = 10s;
constexpr auto deadlockFatalLogMessageTimeLimit = 90s;
constexpr auto deadlockLogicErrorTimeLimit = 600s;
if (armed && (timeSpentDeadlocked >= reportingIntervalSeconds)) if (armed && (timeSpentDeadlocked >= reportingIntervalSeconds))
{ {
// Report the deadlocked condition every 10 seconds
// Report the deadlocked condition every reportingIntervalSeconds
if ((timeSpentDeadlocked % reportingIntervalSeconds) == 0s) if ((timeSpentDeadlocked % reportingIntervalSeconds) == 0s)
{ {
JLOG(journal_.warn()) if (timeSpentDeadlocked < deadlockFatalLogMessageTimeLimit)
<< "Server stalled for " {
<< timeSpentDeadlocked.count() << " seconds."; JLOG(journal_.warn())
<< "Server stalled for "
<< timeSpentDeadlocked.count() << " seconds.";
}
else
{
JLOG(journal_.fatal())
<< "Deadlock detected. Deadlocked time: "
<< timeSpentDeadlocked.count() << "s";
if (app_.getJobQueue().isOverloaded())
{
JLOG(journal_.fatal())
<< app_.getJobQueue().getJson(0);
}
}
} }
// If we go over 90 seconds spent deadlocked, it means that // If we go over the deadlockTimeLimit spent deadlocked, it means that
// the deadlock resolution code has failed, which qualifies // the deadlock resolution code has failed, which qualifies
// as undefined behavior. // as undefined behavior.
// //
constexpr auto deadlockTimeLimit = 90s; if (timeSpentDeadlocked >= deadlockLogicErrorTimeLimit)
assert (timeSpentDeadlocked < deadlockTimeLimit); {
JLOG(journal_.fatal())
if (timeSpentDeadlocked >= deadlockTimeLimit) << "LogicError: Deadlock detected. Deadlocked time: "
<< timeSpentDeadlocked.count() << "s";
if (app_.getJobQueue().isOverloaded())
{
JLOG(journal_.fatal()) << app_.getJobQueue().getJson(0);
}
LogicError("Deadlock detected"); LogicError("Deadlock detected");
}
} }
} }