Merge branch 'develop' into vault

This commit is contained in:
Bronek Kozicki
2025-03-17 12:48:28 +00:00
6 changed files with 49 additions and 46 deletions

View File

@@ -420,6 +420,7 @@
# - r.ripple.com 51235 # - r.ripple.com 51235
# - sahyadri.isrdc.in 51235 # - sahyadri.isrdc.in 51235
# - hubs.xrpkuwait.com 51235 # - hubs.xrpkuwait.com 51235
# - hub.xrpl-commons.org 51235
# #
# Examples: # Examples:
# #

View File

@@ -1555,10 +1555,10 @@ ApplicationImp::run()
if (!config_->standalone()) if (!config_->standalone())
{ {
// VFALCO NOTE This seems unnecessary. If we properly refactor the load // VFALCO NOTE This seems unnecessary. If we properly refactor the load
// manager then the deadlock detector can just always be // manager then the stall detector can just always be
// "armed" // "armed"
// //
getLoadManager().activateDeadlockDetector(); getLoadManager().activateStallDetector();
} }
{ {

View File

@@ -32,7 +32,7 @@
namespace ripple { namespace ripple {
LoadManager::LoadManager(Application& app, beast::Journal journal) LoadManager::LoadManager(Application& app, beast::Journal journal)
: app_(app), journal_(journal), deadLock_(), armed_(false) : app_(app), journal_(journal), lastHeartbeat_(), armed_(false)
{ {
} }
@@ -53,19 +53,19 @@ LoadManager::~LoadManager()
//------------------------------------------------------------------------------ //------------------------------------------------------------------------------
void void
LoadManager::activateDeadlockDetector() LoadManager::activateStallDetector()
{ {
std::lock_guard sl(mutex_); std::lock_guard sl(mutex_);
armed_ = true; armed_ = true;
deadLock_ = std::chrono::steady_clock::now(); lastHeartbeat_ = std::chrono::steady_clock::now();
} }
void void
LoadManager::resetDeadlockDetector() LoadManager::heartbeat()
{ {
auto const detector_start = std::chrono::steady_clock::now(); auto const heartbeat = std::chrono::steady_clock::now();
std::lock_guard sl(mutex_); std::lock_guard sl(mutex_);
deadLock_ = detector_start; lastHeartbeat_ = heartbeat;
} }
//------------------------------------------------------------------------------ //------------------------------------------------------------------------------
@@ -118,63 +118,62 @@ LoadManager::run()
break; break;
// Copy out shared data under a lock. Use copies outside lock. // Copy out shared data under a lock. Use copies outside lock.
auto const deadLock = deadLock_; auto const lastHeartbeat = lastHeartbeat_;
auto const armed = armed_; auto const armed = armed_;
sl.unlock(); sl.unlock();
// Measure the amount of time we have been deadlocked, in seconds. // Measure the amount of time we have been stalled, in seconds.
using namespace std::chrono; using namespace std::chrono;
auto const timeSpentDeadlocked = auto const timeSpentStalled =
duration_cast<seconds>(steady_clock::now() - deadLock); duration_cast<seconds>(steady_clock::now() - lastHeartbeat);
constexpr auto reportingIntervalSeconds = 10s; constexpr auto reportingIntervalSeconds = 10s;
constexpr auto deadlockFatalLogMessageTimeLimit = 90s; constexpr auto stallFatalLogMessageTimeLimit = 90s;
constexpr auto deadlockLogicErrorTimeLimit = 600s; constexpr auto stallLogicErrorTimeLimit = 600s;
if (armed && (timeSpentDeadlocked >= reportingIntervalSeconds)) if (armed && (timeSpentStalled >= reportingIntervalSeconds))
{ {
// Report the deadlocked condition every // Report the stalled condition every reportingIntervalSeconds
// reportingIntervalSeconds if ((timeSpentStalled % reportingIntervalSeconds) == 0s)
if ((timeSpentDeadlocked % reportingIntervalSeconds) == 0s)
{ {
if (timeSpentDeadlocked < deadlockFatalLogMessageTimeLimit) if (timeSpentStalled < stallFatalLogMessageTimeLimit)
{ {
JLOG(journal_.warn()) JLOG(journal_.warn())
<< "Server stalled for " << timeSpentDeadlocked.count() << "Server stalled for " << timeSpentStalled.count()
<< " seconds."; << " seconds.";
if (app_.getJobQueue().isOverloaded()) if (app_.getJobQueue().isOverloaded())
{ {
JLOG(journal_.warn()) << app_.getJobQueue().getJson(0); JLOG(journal_.warn())
<< "JobQueue: " << app_.getJobQueue().getJson(0);
} }
} }
else else
{ {
JLOG(journal_.fatal()) JLOG(journal_.fatal())
<< "Deadlock detected. Deadlocked time: " << "Server stalled for " << timeSpentStalled.count()
<< timeSpentDeadlocked.count() << "s"; << " seconds.";
JLOG(journal_.fatal()) JLOG(journal_.fatal())
<< "JobQueue: " << app_.getJobQueue().getJson(0); << "JobQueue: " << app_.getJobQueue().getJson(0);
} }
} }
// If we go over the deadlockTimeLimit spent deadlocked, it // If we go over the stallLogicErrorTimeLimit spent stalled, it
// means that the deadlock resolution code has failed, which // means that the stall resolution code has failed, which qualifies
// qualifies as undefined behavior. // as a LogicError
// if (timeSpentStalled >= stallLogicErrorTimeLimit)
if (timeSpentDeadlocked >= deadlockLogicErrorTimeLimit)
{ {
JLOG(journal_.fatal()) JLOG(journal_.fatal())
<< "LogicError: Deadlock detected. Deadlocked time: " << "LogicError: Fatal server stall detected. Stalled time: "
<< timeSpentDeadlocked.count() << "s"; << timeSpentStalled.count() << "s";
JLOG(journal_.fatal()) JLOG(journal_.fatal())
<< "JobQueue: " << app_.getJobQueue().getJson(0); << "JobQueue: " << app_.getJobQueue().getJson(0);
LogicError("Deadlock detected"); LogicError("Fatal server stall detected");
} }
} }
} }
bool change; bool change = false;
if (app_.getJobQueue().isOverloaded()) if (app_.getJobQueue().isOverloaded())
{ {
JLOG(journal_.info()) << "Raising local fee (JQ overload): " JLOG(journal_.info()) << "Raising local fee (JQ overload): "

View File

@@ -58,28 +58,28 @@ public:
*/ */
~LoadManager(); ~LoadManager();
/** Turn on deadlock detection. /** Turn on stall detection.
The deadlock detector begins in a disabled state. After this function The stall detector begins in a disabled state. After this function
is called, it will report deadlocks using a separate thread whenever is called, it will report stalls using a separate thread whenever
the reset function is not called at least once per 10 seconds. the reset function is not called at least once per 10 seconds.
@see resetDeadlockDetector @see heartbeat
*/ */
// VFALCO NOTE it seems that the deadlock detector has an "armed" state // VFALCO NOTE it seems that the stall detector has an "armed" state
// to prevent it from going off during program startup if // to prevent it from going off during program startup if
// there's a lengthy initialization operation taking place? // there's a lengthy initialization operation taking place?
// //
void void
activateDeadlockDetector(); activateStallDetector();
/** Reset the deadlock detection timer. /** Reset the stall detection timer.
A dedicated thread monitors the deadlock timer, and if too much A dedicated thread monitors the stall timer, and if too much
time passes it will produce log warnings. time passes it will produce log warnings.
*/ */
void void
resetDeadlockDetector(); heartbeat();
//-------------------------------------------------------------------------- //--------------------------------------------------------------------------
@@ -98,12 +98,12 @@ private:
beast::Journal const journal_; beast::Journal const journal_;
std::thread thread_; std::thread thread_;
std::mutex mutex_; // Guards deadLock_, armed_, cv_ std::mutex mutex_; // Guards lastHeartbeat_, armed_, cv_
std::condition_variable cv_; std::condition_variable cv_;
bool stop_ = false; bool stop_ = false;
std::chrono::steady_clock::time_point // Detect server stalls
deadLock_; // Detect server deadlocks. std::chrono::steady_clock::time_point lastHeartbeat_;
bool armed_; bool armed_;
friend std::unique_ptr<LoadManager> friend std::unique_ptr<LoadManager>

View File

@@ -1015,7 +1015,7 @@ NetworkOPsImp::processHeartbeatTimer()
// VFALCO NOTE This is for diagnosing a crash on exit // VFALCO NOTE This is for diagnosing a crash on exit
LoadManager& mgr(app_.getLoadManager()); LoadManager& mgr(app_.getLoadManager());
mgr.resetDeadlockDetector(); mgr.heartbeat();
std::size_t const numPeers = app_.overlay().size(); std::size_t const numPeers = app_.overlay().size();

View File

@@ -503,6 +503,9 @@ OverlayImpl::start()
// Pool of servers operated by @Xrpkuwait - https://xrpkuwait.com // Pool of servers operated by @Xrpkuwait - https://xrpkuwait.com
bootstrapIps.push_back("hubs.xrpkuwait.com 51235"); bootstrapIps.push_back("hubs.xrpkuwait.com 51235");
// Pool of servers operated by XRPL Commons - https://xrpl-commons.org
bootstrapIps.push_back("hub.xrpl-commons.org 51235");
} }
m_resolver.resolve( m_resolver.resolve(