Merge branch 'develop' into vault

This commit is contained in:
Bronek Kozicki
2025-03-17 12:48:28 +00:00
6 changed files with 49 additions and 46 deletions

View File

@@ -420,6 +420,7 @@
# - r.ripple.com 51235
# - sahyadri.isrdc.in 51235
# - hubs.xrpkuwait.com 51235
# - hub.xrpl-commons.org 51235
#
# Examples:
#

View File

@@ -1555,10 +1555,10 @@ ApplicationImp::run()
if (!config_->standalone())
{
// VFALCO NOTE This seems unnecessary. If we properly refactor the load
// manager then the deadlock detector can just always be
// manager then the stall detector can just always be
// "armed"
//
getLoadManager().activateDeadlockDetector();
getLoadManager().activateStallDetector();
}
{

View File

@@ -32,7 +32,7 @@
namespace ripple {
LoadManager::LoadManager(Application& app, beast::Journal journal)
: app_(app), journal_(journal), deadLock_(), armed_(false)
: app_(app), journal_(journal), lastHeartbeat_(), armed_(false)
{
}
@@ -53,19 +53,19 @@ LoadManager::~LoadManager()
//------------------------------------------------------------------------------
void
LoadManager::activateDeadlockDetector()
LoadManager::activateStallDetector()
{
std::lock_guard sl(mutex_);
armed_ = true;
deadLock_ = std::chrono::steady_clock::now();
lastHeartbeat_ = std::chrono::steady_clock::now();
}
void
LoadManager::resetDeadlockDetector()
LoadManager::heartbeat()
{
auto const detector_start = std::chrono::steady_clock::now();
auto const heartbeat = std::chrono::steady_clock::now();
std::lock_guard sl(mutex_);
deadLock_ = detector_start;
lastHeartbeat_ = heartbeat;
}
//------------------------------------------------------------------------------
@@ -118,63 +118,62 @@ LoadManager::run()
break;
// Copy out shared data under a lock. Use copies outside lock.
auto const deadLock = deadLock_;
auto const lastHeartbeat = lastHeartbeat_;
auto const armed = armed_;
sl.unlock();
// Measure the amount of time we have been deadlocked, in seconds.
// Measure the amount of time we have been stalled, in seconds.
using namespace std::chrono;
auto const timeSpentDeadlocked =
duration_cast<seconds>(steady_clock::now() - deadLock);
auto const timeSpentStalled =
duration_cast<seconds>(steady_clock::now() - lastHeartbeat);
constexpr auto reportingIntervalSeconds = 10s;
constexpr auto deadlockFatalLogMessageTimeLimit = 90s;
constexpr auto deadlockLogicErrorTimeLimit = 600s;
constexpr auto stallFatalLogMessageTimeLimit = 90s;
constexpr auto stallLogicErrorTimeLimit = 600s;
if (armed && (timeSpentDeadlocked >= reportingIntervalSeconds))
if (armed && (timeSpentStalled >= reportingIntervalSeconds))
{
// Report the deadlocked condition every
// reportingIntervalSeconds
if ((timeSpentDeadlocked % reportingIntervalSeconds) == 0s)
// Report the stalled condition every reportingIntervalSeconds
if ((timeSpentStalled % reportingIntervalSeconds) == 0s)
{
if (timeSpentDeadlocked < deadlockFatalLogMessageTimeLimit)
if (timeSpentStalled < stallFatalLogMessageTimeLimit)
{
JLOG(journal_.warn())
<< "Server stalled for " << timeSpentDeadlocked.count()
<< "Server stalled for " << timeSpentStalled.count()
<< " seconds.";
if (app_.getJobQueue().isOverloaded())
{
JLOG(journal_.warn()) << app_.getJobQueue().getJson(0);
JLOG(journal_.warn())
<< "JobQueue: " << app_.getJobQueue().getJson(0);
}
}
else
{
JLOG(journal_.fatal())
<< "Deadlock detected. Deadlocked time: "
<< timeSpentDeadlocked.count() << "s";
<< "Server stalled for " << timeSpentStalled.count()
<< " seconds.";
JLOG(journal_.fatal())
<< "JobQueue: " << app_.getJobQueue().getJson(0);
}
}
// If we go over the deadlockTimeLimit spent deadlocked, it
// means that the deadlock resolution code has failed, which
// qualifies as undefined behavior.
//
if (timeSpentDeadlocked >= deadlockLogicErrorTimeLimit)
// If we go over the stallLogicErrorTimeLimit spent stalled, it
// means that the stall resolution code has failed, which qualifies
// as a LogicError
if (timeSpentStalled >= stallLogicErrorTimeLimit)
{
JLOG(journal_.fatal())
<< "LogicError: Deadlock detected. Deadlocked time: "
<< timeSpentDeadlocked.count() << "s";
<< "LogicError: Fatal server stall detected. Stalled time: "
<< timeSpentStalled.count() << "s";
JLOG(journal_.fatal())
<< "JobQueue: " << app_.getJobQueue().getJson(0);
LogicError("Deadlock detected");
LogicError("Fatal server stall detected");
}
}
}
bool change;
bool change = false;
if (app_.getJobQueue().isOverloaded())
{
JLOG(journal_.info()) << "Raising local fee (JQ overload): "

View File

@@ -58,28 +58,28 @@ public:
*/
~LoadManager();
/** Turn on deadlock detection.
/** Turn on stall detection.
The deadlock detector begins in a disabled state. After this function
is called, it will report deadlocks using a separate thread whenever
The stall detector begins in a disabled state. After this function
is called, it will report stalls using a separate thread whenever
heartbeat() is not called at least once per 10 seconds.
@see resetDeadlockDetector
@see heartbeat
*/
// VFALCO NOTE it seems that the deadlock detector has an "armed" state
// VFALCO NOTE it seems that the stall detector has an "armed" state
// to prevent it from going off during program startup if
// there's a lengthy initialization operation taking place?
//
void
activateDeadlockDetector();
activateStallDetector();
/** Reset the deadlock detection timer.
/** Reset the stall detection timer.
A dedicated thread monitors the deadlock timer, and if too much
A dedicated thread monitors the stall timer, and if too much
time passes it will produce log warnings.
*/
void
resetDeadlockDetector();
heartbeat();
//--------------------------------------------------------------------------
@@ -98,12 +98,12 @@ private:
beast::Journal const journal_;
std::thread thread_;
std::mutex mutex_; // Guards deadLock_, armed_, cv_
std::mutex mutex_; // Guards lastHeartbeat_, armed_, cv_
std::condition_variable cv_;
bool stop_ = false;
std::chrono::steady_clock::time_point
deadLock_; // Detect server deadlocks.
// Detect server stalls
std::chrono::steady_clock::time_point lastHeartbeat_;
bool armed_;
friend std::unique_ptr<LoadManager>

View File

@@ -1015,7 +1015,7 @@ NetworkOPsImp::processHeartbeatTimer()
// VFALCO NOTE This is for diagnosing a crash on exit
LoadManager& mgr(app_.getLoadManager());
mgr.resetDeadlockDetector();
mgr.heartbeat();
std::size_t const numPeers = app_.overlay().size();

View File

@@ -503,6 +503,9 @@ OverlayImpl::start()
// Pool of servers operated by @Xrpkuwait - https://xrpkuwait.com
bootstrapIps.push_back("hubs.xrpkuwait.com 51235");
// Pool of servers operated by XRPL Commons - https://xrpl-commons.org
bootstrapIps.push_back("hub.xrpl-commons.org 51235");
}
m_resolver.resolve(