Report job queue data if a deadlock is detected

This commit is contained in:
Nik Bougalis
2021-11-16 10:06:23 -08:00
parent 35e0ab4280
commit d36024394d

View File

@@ -108,86 +108,86 @@ LoadManager::run()
while (true) while (true)
{ {
t += 1s;
std::unique_lock sl(mutex_);
if (cv_.wait_until(sl, t, [this] { return stop_; }))
break;
// Copy out shared data under a lock. Use copies outside lock.
auto const deadLock = deadLock_;
auto const armed = armed_;
sl.unlock();
// Measure the amount of time we have been deadlocked, in seconds.
using namespace std::chrono;
auto const timeSpentDeadlocked =
duration_cast<seconds>(steady_clock::now() - deadLock);
constexpr auto reportingIntervalSeconds = 10s;
constexpr auto deadlockFatalLogMessageTimeLimit = 90s;
constexpr auto deadlockLogicErrorTimeLimit = 600s;
if (armed && (timeSpentDeadlocked >= reportingIntervalSeconds))
{ {
t += 1s; // Report the deadlocked condition every
std::unique_lock sl(mutex_); // reportingIntervalSeconds
if (cv_.wait_until(sl, t, [this] { return stop_; })) if ((timeSpentDeadlocked % reportingIntervalSeconds) == 0s)
{ {
break; if (timeSpentDeadlocked < deadlockFatalLogMessageTimeLimit)
}
// Copy out shared data under a lock. Use copies outside lock.
auto const deadLock = deadLock_;
auto const armed = armed_;
sl.unlock();
// Measure the amount of time we have been deadlocked, in seconds.
using namespace std::chrono;
auto const timeSpentDeadlocked =
duration_cast<seconds>(steady_clock::now() - deadLock);
constexpr auto reportingIntervalSeconds = 10s;
constexpr auto deadlockFatalLogMessageTimeLimit = 90s;
constexpr auto deadlockLogicErrorTimeLimit = 600s;
if (armed && (timeSpentDeadlocked >= reportingIntervalSeconds))
{
// Report the deadlocked condition every
// reportingIntervalSeconds
if ((timeSpentDeadlocked % reportingIntervalSeconds) == 0s)
{ {
if (timeSpentDeadlocked < deadlockFatalLogMessageTimeLimit) JLOG(journal_.warn())
{ << "Server stalled for " << timeSpentDeadlocked.count()
JLOG(journal_.warn()) << " seconds.";
<< "Server stalled for "
<< timeSpentDeadlocked.count() << " seconds.";
}
else
{
JLOG(journal_.fatal())
<< "Deadlock detected. Deadlocked time: "
<< timeSpentDeadlocked.count() << "s";
if (app_.getJobQueue().isOverloaded())
{
JLOG(journal_.fatal())
<< app_.getJobQueue().getJson(0);
}
}
}
// If we go over the deadlockTimeLimit spent deadlocked, it
// means that the deadlock resolution code has failed, which
// qualifies as undefined behavior.
//
if (timeSpentDeadlocked >= deadlockLogicErrorTimeLimit)
{
JLOG(journal_.fatal())
<< "LogicError: Deadlock detected. Deadlocked time: "
<< timeSpentDeadlocked.count() << "s";
if (app_.getJobQueue().isOverloaded()) if (app_.getJobQueue().isOverloaded())
{ {
JLOG(journal_.fatal()) << app_.getJobQueue().getJson(0); JLOG(journal_.warn()) << app_.getJobQueue().getJson(0);
} }
LogicError("Deadlock detected"); }
else
{
JLOG(journal_.fatal())
<< "Deadlock detected. Deadlocked time: "
<< timeSpentDeadlocked.count() << "s";
JLOG(journal_.fatal())
<< "JobQueue: " << app_.getJobQueue().getJson(0);
} }
} }
}
bool change = false; // If we go over the deadlockTimeLimit spent deadlocked, it
if (app_.getJobQueue().isOverloaded()) // means that the deadlock resolution code has failed, which
{ // qualifies as undefined behavior.
JLOG(journal_.info()) << app_.getJobQueue().getJson(0); //
change = app_.getFeeTrack().raiseLocalFee(); if (timeSpentDeadlocked >= deadlockLogicErrorTimeLimit)
} {
else JLOG(journal_.fatal())
{ << "LogicError: Deadlock detected. Deadlocked time: "
change = app_.getFeeTrack().lowerLocalFee(); << timeSpentDeadlocked.count() << "s";
JLOG(journal_.fatal())
<< "JobQueue: " << app_.getJobQueue().getJson(0);
LogicError("Deadlock detected");
}
} }
}
if (change) bool change;
{
// VFALCO TODO replace this with a Listener / observer and if (app_.getJobQueue().isOverloaded())
// subscribe in NetworkOPs or Application. {
app_.getOPs().reportFeeChange(); JLOG(journal_.info()) << "Raising local fee (JQ overload): "
} << app_.getJobQueue().getJson(0);
change = app_.getFeeTrack().raiseLocalFee();
}
else
{
change = app_.getFeeTrack().lowerLocalFee();
}
if (change)
{
// VFALCO TODO replace this with a Listener / observer and
// subscribe in NetworkOPs or Application.
app_.getOPs().reportFeeChange();
} }
} }