Improve watchdog restart logic:

Stop attempting to restart the server after five consecutive
restarts fail to remain operational for at least ten seconds.
This commit is contained in:
Nik Bougalis
2015-12-28 14:26:46 -08:00
parent ff6c9e329f
commit fee19390f5
2 changed files with 32 additions and 10 deletions

View File

@@ -299,7 +299,7 @@ int run (int argc, char** argv)
std::string logMe = DoSustain ();
if (!logMe.empty ())
std::cerr << logMe;
std::cerr << logMe << std::endl;
}
// Run the unit tests if requested.

View File

@@ -70,6 +70,17 @@ std::string StopSustain ()
return "Terminating monitor";
}
// Reports whether the child process `pid` is still running.
//
// First calls waitpid() on `pid` with the caller-supplied `options`
// (0 blocks until the child changes state; WNOHANG polls and returns
// immediately), then probes whether the process can still be signalled.
//
// Returns false if waitpid() fails or if the process no longer exists;
// returns true otherwise.
static
bool checkChild(pid_t pid, int options)
{
    int status;

    if (waitpid (pid, &status, options) == -1)
        return false;

    // Signal 0 performs only the existence/permission check and delivers
    // nothing. `options` carries waitpid() flags and must never be used as
    // the signal number: a non-zero flag such as WNOHANG would be sent to
    // the child as a real signal.
    return kill (pid, 0) == 0;
}
std::string DoSustain ()
{
pManager = getpid ();
@@ -78,7 +89,11 @@ std::string DoSustain ()
signal (SIGUSR1, pass_signal);
signal (SIGUSR2, pass_signal);
for (auto childCount = 1;; ++childCount)
// Number of times the child has exited in less than
// 15 seconds.
int fastExit = 0;
for (auto childCount = 1; ; ++childCount)
{
pChild = fork ();
@@ -100,18 +115,25 @@ std::string DoSustain ()
sleep (sleepBeforeWaiting);
for (;;)
// If the child has already terminated, count this
// as a fast exit and an indication that something
// went wrong:
if (!checkChild (pChild, WNOHANG))
{
int i;
waitpid (pChild, &i, 0);
if (kill (pChild, 0))
break;
sleep (sleepBetweenWaits);
if (++fastExit == 5)
_exit (0);
}
else
{
fastExit = 0;
while (checkChild (pChild, 0))
sleep(sleepBetweenWaits);
auto pc = std::to_string (pChild);
rename ("core", ("core." + pc).c_str ());
}
}
}
#else