mirror of
https://github.com/XRPLF/rippled.git
synced 2025-11-24 13:05:53 +00:00
Report job queue data if a deadlock is detected
This commit is contained in:
@@ -108,86 +108,86 @@ LoadManager::run()
|
|||||||
|
|
||||||
while (true)
|
while (true)
|
||||||
{
|
{
|
||||||
|
t += 1s;
|
||||||
|
|
||||||
|
std::unique_lock sl(mutex_);
|
||||||
|
if (cv_.wait_until(sl, t, [this] { return stop_; }))
|
||||||
|
break;
|
||||||
|
|
||||||
|
// Copy out shared data under a lock. Use copies outside lock.
|
||||||
|
auto const deadLock = deadLock_;
|
||||||
|
auto const armed = armed_;
|
||||||
|
sl.unlock();
|
||||||
|
|
||||||
|
// Measure the amount of time we have been deadlocked, in seconds.
|
||||||
|
using namespace std::chrono;
|
||||||
|
auto const timeSpentDeadlocked =
|
||||||
|
duration_cast<seconds>(steady_clock::now() - deadLock);
|
||||||
|
|
||||||
|
constexpr auto reportingIntervalSeconds = 10s;
|
||||||
|
constexpr auto deadlockFatalLogMessageTimeLimit = 90s;
|
||||||
|
constexpr auto deadlockLogicErrorTimeLimit = 600s;
|
||||||
|
|
||||||
|
if (armed && (timeSpentDeadlocked >= reportingIntervalSeconds))
|
||||||
{
|
{
|
||||||
t += 1s;
|
// Report the deadlocked condition every
|
||||||
std::unique_lock sl(mutex_);
|
// reportingIntervalSeconds
|
||||||
if (cv_.wait_until(sl, t, [this] { return stop_; }))
|
if ((timeSpentDeadlocked % reportingIntervalSeconds) == 0s)
|
||||||
{
|
{
|
||||||
break;
|
if (timeSpentDeadlocked < deadlockFatalLogMessageTimeLimit)
|
||||||
}
|
|
||||||
// Copy out shared data under a lock. Use copies outside lock.
|
|
||||||
auto const deadLock = deadLock_;
|
|
||||||
auto const armed = armed_;
|
|
||||||
sl.unlock();
|
|
||||||
|
|
||||||
// Measure the amount of time we have been deadlocked, in seconds.
|
|
||||||
using namespace std::chrono;
|
|
||||||
auto const timeSpentDeadlocked =
|
|
||||||
duration_cast<seconds>(steady_clock::now() - deadLock);
|
|
||||||
|
|
||||||
constexpr auto reportingIntervalSeconds = 10s;
|
|
||||||
constexpr auto deadlockFatalLogMessageTimeLimit = 90s;
|
|
||||||
constexpr auto deadlockLogicErrorTimeLimit = 600s;
|
|
||||||
if (armed && (timeSpentDeadlocked >= reportingIntervalSeconds))
|
|
||||||
{
|
|
||||||
// Report the deadlocked condition every
|
|
||||||
// reportingIntervalSeconds
|
|
||||||
if ((timeSpentDeadlocked % reportingIntervalSeconds) == 0s)
|
|
||||||
{
|
{
|
||||||
if (timeSpentDeadlocked < deadlockFatalLogMessageTimeLimit)
|
JLOG(journal_.warn())
|
||||||
{
|
<< "Server stalled for " << timeSpentDeadlocked.count()
|
||||||
JLOG(journal_.warn())
|
<< " seconds.";
|
||||||
<< "Server stalled for "
|
|
||||||
<< timeSpentDeadlocked.count() << " seconds.";
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
JLOG(journal_.fatal())
|
|
||||||
<< "Deadlock detected. Deadlocked time: "
|
|
||||||
<< timeSpentDeadlocked.count() << "s";
|
|
||||||
if (app_.getJobQueue().isOverloaded())
|
|
||||||
{
|
|
||||||
JLOG(journal_.fatal())
|
|
||||||
<< app_.getJobQueue().getJson(0);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// If we go over the deadlockTimeLimit spent deadlocked, it
|
|
||||||
// means that the deadlock resolution code has failed, which
|
|
||||||
// qualifies as undefined behavior.
|
|
||||||
//
|
|
||||||
if (timeSpentDeadlocked >= deadlockLogicErrorTimeLimit)
|
|
||||||
{
|
|
||||||
JLOG(journal_.fatal())
|
|
||||||
<< "LogicError: Deadlock detected. Deadlocked time: "
|
|
||||||
<< timeSpentDeadlocked.count() << "s";
|
|
||||||
if (app_.getJobQueue().isOverloaded())
|
if (app_.getJobQueue().isOverloaded())
|
||||||
{
|
{
|
||||||
JLOG(journal_.fatal()) << app_.getJobQueue().getJson(0);
|
JLOG(journal_.warn()) << app_.getJobQueue().getJson(0);
|
||||||
}
|
}
|
||||||
LogicError("Deadlock detected");
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
JLOG(journal_.fatal())
|
||||||
|
<< "Deadlock detected. Deadlocked time: "
|
||||||
|
<< timeSpentDeadlocked.count() << "s";
|
||||||
|
JLOG(journal_.fatal())
|
||||||
|
<< "JobQueue: " << app_.getJobQueue().getJson(0);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
bool change = false;
|
// If we go over the deadlockTimeLimit spent deadlocked, it
|
||||||
if (app_.getJobQueue().isOverloaded())
|
// means that the deadlock resolution code has failed, which
|
||||||
{
|
// qualifies as undefined behavior.
|
||||||
JLOG(journal_.info()) << app_.getJobQueue().getJson(0);
|
//
|
||||||
change = app_.getFeeTrack().raiseLocalFee();
|
if (timeSpentDeadlocked >= deadlockLogicErrorTimeLimit)
|
||||||
}
|
{
|
||||||
else
|
JLOG(journal_.fatal())
|
||||||
{
|
<< "LogicError: Deadlock detected. Deadlocked time: "
|
||||||
change = app_.getFeeTrack().lowerLocalFee();
|
<< timeSpentDeadlocked.count() << "s";
|
||||||
|
JLOG(journal_.fatal())
|
||||||
|
<< "JobQueue: " << app_.getJobQueue().getJson(0);
|
||||||
|
LogicError("Deadlock detected");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (change)
|
bool change;
|
||||||
{
|
|
||||||
// VFALCO TODO replace this with a Listener / observer and
|
if (app_.getJobQueue().isOverloaded())
|
||||||
// subscribe in NetworkOPs or Application.
|
{
|
||||||
app_.getOPs().reportFeeChange();
|
JLOG(journal_.info()) << "Raising local fee (JQ overload): "
|
||||||
}
|
<< app_.getJobQueue().getJson(0);
|
||||||
|
change = app_.getFeeTrack().raiseLocalFee();
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
change = app_.getFeeTrack().lowerLocalFee();
|
||||||
|
}
|
||||||
|
|
||||||
|
if (change)
|
||||||
|
{
|
||||||
|
// VFALCO TODO replace this with a Listener / observer and
|
||||||
|
// subscribe in NetworkOPs or Application.
|
||||||
|
app_.getOPs().reportFeeChange();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user