diff --git a/cfg/rippled-example.cfg b/cfg/rippled-example.cfg
index ee9fdbd274..8fb7d00875 100644
--- a/cfg/rippled-example.cfg
+++ b/cfg/rippled-example.cfg
@@ -420,6 +420,7 @@
 # - r.ripple.com 51235
 # - sahyadri.isrdc.in 51235
 # - hubs.xrpkuwait.com 51235
+# - hub.xrpl-commons.org 51235
 #
 # Examples:
 #
diff --git a/src/xrpld/app/main/Application.cpp b/src/xrpld/app/main/Application.cpp
index 3088c0d56f..aa502b4143 100644
--- a/src/xrpld/app/main/Application.cpp
+++ b/src/xrpld/app/main/Application.cpp
@@ -1555,10 +1555,10 @@ ApplicationImp::run()
     if (!config_->standalone())
     {
         // VFALCO NOTE This seems unnecessary. If we properly refactor the load
-        //             manager then the deadlock detector can just always be
+        //             manager then the stall detector can just always be
         //             "armed"
         //
-        getLoadManager().activateDeadlockDetector();
+        getLoadManager().activateStallDetector();
     }

     {
diff --git a/src/xrpld/app/main/LoadManager.cpp b/src/xrpld/app/main/LoadManager.cpp
index d4cd3572cc..9ae9e44a3a 100644
--- a/src/xrpld/app/main/LoadManager.cpp
+++ b/src/xrpld/app/main/LoadManager.cpp
@@ -32,7 +32,7 @@ namespace ripple {

 LoadManager::LoadManager(Application& app, beast::Journal journal)
-    : app_(app), journal_(journal), deadLock_(), armed_(false)
+    : app_(app), journal_(journal), lastHeartbeat_(), armed_(false)
 {
 }

@@ -53,19 +53,19 @@ LoadManager::~LoadManager()
 //------------------------------------------------------------------------------

 void
-LoadManager::activateDeadlockDetector()
+LoadManager::activateStallDetector()
 {
     std::lock_guard sl(mutex_);
     armed_ = true;
-    deadLock_ = std::chrono::steady_clock::now();
+    lastHeartbeat_ = std::chrono::steady_clock::now();
 }

 void
-LoadManager::resetDeadlockDetector()
+LoadManager::heartbeat()
 {
-    auto const detector_start = std::chrono::steady_clock::now();
+    auto const heartbeat = std::chrono::steady_clock::now();
     std::lock_guard sl(mutex_);
-    deadLock_ = detector_start;
+    lastHeartbeat_ = heartbeat;
 }

 //------------------------------------------------------------------------------
@@ -118,63 +118,62 @@ LoadManager::run()
             break;

         // Copy out shared data under a lock. Use copies outside lock.
-        auto const deadLock = deadLock_;
+        auto const lastHeartbeat = lastHeartbeat_;
         auto const armed = armed_;
         sl.unlock();

-        // Measure the amount of time we have been deadlocked, in seconds.
+        // Measure the amount of time we have been stalled, in seconds.
         using namespace std::chrono;
-        auto const timeSpentDeadlocked =
-            duration_cast<seconds>(steady_clock::now() - deadLock);
+        auto const timeSpentStalled =
+            duration_cast<seconds>(steady_clock::now() - lastHeartbeat);

         constexpr auto reportingIntervalSeconds = 10s;
-        constexpr auto deadlockFatalLogMessageTimeLimit = 90s;
-        constexpr auto deadlockLogicErrorTimeLimit = 600s;
+        constexpr auto stallFatalLogMessageTimeLimit = 90s;
+        constexpr auto stallLogicErrorTimeLimit = 600s;

-        if (armed && (timeSpentDeadlocked >= reportingIntervalSeconds))
+        if (armed && (timeSpentStalled >= reportingIntervalSeconds))
         {
-            // Report the deadlocked condition every
-            // reportingIntervalSeconds
-            if ((timeSpentDeadlocked % reportingIntervalSeconds) == 0s)
+            // Report the stalled condition every reportingIntervalSeconds
+            if ((timeSpentStalled % reportingIntervalSeconds) == 0s)
             {
-                if (timeSpentDeadlocked < deadlockFatalLogMessageTimeLimit)
+                if (timeSpentStalled < stallFatalLogMessageTimeLimit)
                 {
                     JLOG(journal_.warn())
-                        << "Server stalled for " << timeSpentDeadlocked.count()
+                        << "Server stalled for " << timeSpentStalled.count()
                         << " seconds.";
+
                     if (app_.getJobQueue().isOverloaded())
                     {
-                        JLOG(journal_.warn()) << app_.getJobQueue().getJson(0);
+                        JLOG(journal_.warn())
+                            << "JobQueue: " << app_.getJobQueue().getJson(0);
                     }
                 }
                 else
                 {
                     JLOG(journal_.fatal())
-                        << "Deadlock detected. Deadlocked time: "
-                        << timeSpentDeadlocked.count() << "s";
+                        << "Server stalled for " << timeSpentStalled.count()
+                        << " seconds.";
                     JLOG(journal_.fatal())
                         << "JobQueue: " << app_.getJobQueue().getJson(0);
                 }
             }

-            // If we go over the deadlockTimeLimit spent deadlocked, it
-            // means that the deadlock resolution code has failed, which
-            // qualifies as undefined behavior.
-            //
-            if (timeSpentDeadlocked >= deadlockLogicErrorTimeLimit)
+            // If we go over the stallLogicErrorTimeLimit spent stalled, it
+            // means that the stall resolution code has failed, which qualifies
+            // as a LogicError
+            if (timeSpentStalled >= stallLogicErrorTimeLimit)
             {
                 JLOG(journal_.fatal())
-                    << "LogicError: Deadlock detected. Deadlocked time: "
-                    << timeSpentDeadlocked.count() << "s";
+                    << "LogicError: Fatal server stall detected. Stalled time: "
+                    << timeSpentStalled.count() << "s";
                 JLOG(journal_.fatal())
                     << "JobQueue: " << app_.getJobQueue().getJson(0);
-                LogicError("Deadlock detected");
+                LogicError("Fatal server stall detected");
             }
         }
     }

-    bool change;
-
+    bool change = false;
     if (app_.getJobQueue().isOverloaded())
     {
         JLOG(journal_.info()) << "Raising local fee (JQ overload): "
diff --git a/src/xrpld/app/main/LoadManager.h b/src/xrpld/app/main/LoadManager.h
index 9ec02ce34f..166c40538c 100644
--- a/src/xrpld/app/main/LoadManager.h
+++ b/src/xrpld/app/main/LoadManager.h
@@ -58,28 +58,28 @@ public:
     */
     ~LoadManager();

-    /** Turn on deadlock detection.
+    /** Turn on stall detection.

-        The deadlock detector begins in a disabled state. After this function
-        is called, it will report deadlocks using a separate thread whenever
+        The stall detector begins in a disabled state. After this function
+        is called, it will report stalls using a separate thread whenever
         the reset function is not called at least once per 10 seconds.

-        @see resetDeadlockDetector
+        @see resetStallDetector
     */
-    // VFALCO NOTE it seems that the deadlock detector has an "armed" state
+    // VFALCO NOTE it seems that the stall detector has an "armed" state
    //             to prevent it from going off during program startup if
    //             there's a lengthy initialization operation taking place?
    //
     void
-    activateDeadlockDetector();
+    activateStallDetector();

-    /** Reset the deadlock detection timer.
+    /** Reset the stall detection timer.

-        A dedicated thread monitors the deadlock timer, and if too much
+        A dedicated thread monitors the stall timer, and if too much
         time passes it will produce log warnings.
     */
     void
-    resetDeadlockDetector();
+    heartbeat();

     //--------------------------------------------------------------------------

@@ -98,12 +98,12 @@ private:
     beast::Journal const journal_;

     std::thread thread_;
-    std::mutex mutex_;  // Guards deadLock_, armed_, cv_
+    std::mutex mutex_;  // Guards lastHeartbeat_, armed_, cv_
     std::condition_variable cv_;
     bool stop_ = false;

-    std::chrono::steady_clock::time_point
-        deadLock_;  // Detect server deadlocks.
+    // Detect server stalls
+    std::chrono::steady_clock::time_point lastHeartbeat_;
     bool armed_;

     friend std::unique_ptr<LoadManager>
diff --git a/src/xrpld/app/misc/NetworkOPs.cpp b/src/xrpld/app/misc/NetworkOPs.cpp
index 550f6e4485..db297f743b 100644
--- a/src/xrpld/app/misc/NetworkOPs.cpp
+++ b/src/xrpld/app/misc/NetworkOPs.cpp
@@ -1015,7 +1015,7 @@ NetworkOPsImp::processHeartbeatTimer()

     // VFALCO NOTE This is for diagnosing a crash on exit
     LoadManager& mgr(app_.getLoadManager());
-    mgr.resetDeadlockDetector();
+    mgr.heartbeat();

     std::size_t const numPeers = app_.overlay().size();
diff --git a/src/xrpld/overlay/detail/OverlayImpl.cpp b/src/xrpld/overlay/detail/OverlayImpl.cpp
index b3a3395eed..2209414d9f 100644
--- a/src/xrpld/overlay/detail/OverlayImpl.cpp
+++ b/src/xrpld/overlay/detail/OverlayImpl.cpp
@@ -503,6 +503,9 @@ OverlayImpl::start()

         // Pool of servers operated by @Xrpkuwait - https://xrpkuwait.com
         bootstrapIps.push_back("hubs.xrpkuwait.com 51235");
+
+        // Pool of servers operated by XRPL Commons - https://xrpl-commons.org
+        bootstrapIps.push_back("hub.xrpl-commons.org 51235");
     }

     m_resolver.resolve(
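
The LoadManager changes above are, at bottom, a rename of a classic watchdog: worker threads publish a "last heartbeat" timestamp, and a monitor thread raises the alarm when that timestamp goes stale, which signals a stalled server rather than a proven lock-ordering deadlock. The sketch below illustrates the same pattern in minimal, self-contained standard C++ (C++17). It is not rippled code: the Watchdog class name, the one-second polling cadence, the fixed 10-second threshold, and the std::cerr logging are illustrative stand-ins for LoadManager, its condition-variable loop, and the JLOG/JobQueue diagnostics.

#include <chrono>
#include <condition_variable>
#include <iostream>
#include <mutex>
#include <thread>

// Hypothetical stand-in for LoadManager's stall detector; illustration only.
class Watchdog
{
public:
    Watchdog() : thread_([this] { run(); })
    {
    }

    ~Watchdog()
    {
        {
            std::lock_guard lock(mutex_);
            stop_ = true;
        }
        cv_.notify_all();
        thread_.join();
    }

    // Mirrors activateStallDetector(): arm the detector and start counting
    // from "now".
    void
    arm()
    {
        std::lock_guard lock(mutex_);
        armed_ = true;
        lastHeartbeat_ = std::chrono::steady_clock::now();
    }

    // Mirrors heartbeat(): the monitored thread calls this periodically to
    // prove it is still making progress.
    void
    heartbeat()
    {
        std::lock_guard lock(mutex_);
        lastHeartbeat_ = std::chrono::steady_clock::now();
    }

private:
    void
    run()
    {
        using namespace std::chrono;
        std::unique_lock lock(mutex_);
        // Wake once per second; wait_for returns true only once stop_ is set.
        while (!cv_.wait_for(lock, 1s, [this] { return stop_; }))
        {
            // Copy out shared data under the lock; use the copies outside it.
            auto const lastHeartbeat = lastHeartbeat_;
            auto const armed = armed_;
            lock.unlock();

            auto const stalled =
                duration_cast<seconds>(steady_clock::now() - lastHeartbeat);
            if (armed && stalled >= 10s)
                std::cerr << "Server stalled for " << stalled.count()
                          << " seconds.\n";

            lock.lock();
        }
    }

    std::mutex mutex_;  // Guards lastHeartbeat_, armed_, stop_
    std::condition_variable cv_;
    bool stop_ = false;
    bool armed_ = false;
    std::chrono::steady_clock::time_point lastHeartbeat_ =
        std::chrono::steady_clock::now();
    std::thread thread_;  // Declared last so every member above outlives it
};

int main()
{
    Watchdog dog;
    dog.arm();
    // A healthy component heartbeats well inside the 10-second window; stop
    // calling heartbeat() and the monitor begins reporting a stall.
    for (int i = 0; i != 5; ++i)
    {
        std::this_thread::sleep_for(std::chrono::seconds(2));
        dog.heartbeat();
    }
}

As in the patched LoadManager::run(), the monitor copies the shared state while holding the mutex and releases it before logging, so a slow log sink never blocks the threads that call heartbeat(). The "armed" flag serves the same purpose the header comment describes: it keeps the detector quiet during lengthy startup work that produces no heartbeats yet.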