Detect server deadlocks and trigger the auto-restart mechanism.

This commit is contained in:
JoelKatz
2013-05-08 15:21:22 -07:00
parent 34a15c2baa
commit 04b0f968fd
3 changed files with 28 additions and 1 deletion

View File

@@ -22,7 +22,7 @@ int upTime()
LoadManager::LoadManager(int creditRate, int creditLimit, int debitWarn, int debitLimit) : LoadManager::LoadManager(int creditRate, int creditLimit, int debitWarn, int debitLimit) :
mCreditRate(creditRate), mCreditLimit(creditLimit), mDebitWarn(debitWarn), mDebitLimit(debitLimit), mCreditRate(creditRate), mCreditLimit(creditLimit), mDebitWarn(debitWarn), mDebitLimit(debitLimit),
mShutdown(false), mUptime(0), mCosts(LT_MAX) mShutdown(false), mUptime(0), mDeadLock(0), mCosts(LT_MAX)
{ {
addLoadCost(LoadCost(LT_InvalidRequest, -10, LC_CPU | LC_Network)); addLoadCost(LoadCost(LT_InvalidRequest, -10, LC_CPU | LC_Network));
addLoadCost(LoadCost(LT_RequestNoReply, -1, LC_CPU | LC_Disk)); addLoadCost(LoadCost(LT_RequestNoReply, -1, LC_CPU | LC_Disk));
@@ -67,6 +67,11 @@ LoadManager::~LoadManager()
while (1); while (1);
} }
// Heartbeat for the deadlock detector: while holding mLock, records the
// current mUptime value in mDeadLock. The load manager thread compares
// mUptime against mDeadLock to measure how long it has been since this
// was last called.
void LoadManager::noDeadLock()
{
	boost::mutex::scoped_lock guard(mLock);
	mDeadLock = mUptime;
}
int LoadManager::getCreditRate() const int LoadManager::getCreditRate() const
{ {
@@ -321,6 +326,11 @@ int LoadManager::getUptime()
return mUptime; return mUptime;
} }
// Writes a warning-level log entry reporting the stall duration.
// dlTime is the value inserted into the message as the number of seconds.
// Kept as a free function so it can be bound and run on its own thread.
static void LogDeadLock(int dlTime)
{
	cLog(lsWARNING)
		<< "Server stalled for "
		<< dlTime
		<< " seconds.";
}
void LoadManager::threadEntry() void LoadManager::threadEntry()
{ {
NameThread("loadmgr"); NameThread("loadmgr");
@@ -335,6 +345,18 @@ void LoadManager::threadEntry()
return; return;
} }
++mUptime; ++mUptime;
int dlTime = mUptime - mDeadLock;
if (dlTime >= 10)
{
if ((dlTime % 10) == 0)
{
boost::thread(BIND_TYPE(&LogDeadLock, dlTime)).detach();
}
assert (dlTime < 180);
}
} }
bool change; bool change;

View File

@@ -106,6 +106,8 @@ protected:
int mUptime; int mUptime;
int mSpace2[4]; int mSpace2[4];
int mDeadLock; // Detect server deadlocks
mutable boost::mutex mLock; mutable boost::mutex mLock;
void canonicalize(LoadSource&, int upTime) const; void canonicalize(LoadSource&, int upTime) const;
@@ -141,6 +143,7 @@ public:
int getCost(LoadType t) { return mCosts[static_cast<int>(t)].mCost; } int getCost(LoadType t) { return mCosts[static_cast<int>(t)].mCost; }
int getUptime(); int getUptime();
void noDeadLock();
}; };
class LoadFeeTrack class LoadFeeTrack

View File

@@ -593,6 +593,8 @@ void NetworkOPs::checkState(const boost::system::error_code& result)
{ {
ScopedLock sl(theApp->getMasterLock()); ScopedLock sl(theApp->getMasterLock());
theApp->getLoadManager().noDeadLock();
std::vector<Peer::pointer> peerList = theApp->getConnectionPool().getPeerVector(); std::vector<Peer::pointer> peerList = theApp->getConnectionPool().getPeerVector();
// do we have sufficient peers? If not, we are disconnected. // do we have sufficient peers? If not, we are disconnected.