Improve watchdog restart logic:

Stop attempting to restart the server after five consecutive
restarts fail to remain operational for at least ten seconds.
This commit is contained in:
Nik Bougalis
2015-12-28 14:26:46 -08:00
parent ff6c9e329f
commit fee19390f5
2 changed files with 32 additions and 10 deletions

View File

@@ -299,7 +299,7 @@ int run (int argc, char** argv)
std::string logMe = DoSustain (); std::string logMe = DoSustain ();
if (!logMe.empty ()) if (!logMe.empty ())
std::cerr << logMe; std::cerr << logMe << std::endl;
} }
// Run the unit tests if requested. // Run the unit tests if requested.

View File

@@ -70,6 +70,17 @@ std::string StopSustain ()
return "Terminating monitor"; return "Terminating monitor";
} }
// Report whether the monitored child process is still running.
//
// pid      process id of the child to examine
// options  flags forwarded to waitpid(): WNOHANG to poll without blocking,
//          0 to block until the child changes state
//
// Returns true while the child still exists. Returns false when waitpid()
// fails, or when the child has gone away (a terminated child is reaped by
// the waitpid() call, after which the existence probe below fails).
static
bool checkChild(pid_t pid, int options)
{
    int status = 0;

    if (waitpid (pid, &status, options) == -1)
        return false;

    // Signal 0 delivers nothing; it only tests that the process exists.
    // `options` holds waitpid() flags and must never be used as the signal
    // number: a non-zero flag such as WNOHANG would be delivered to the
    // child as a real signal.
    return kill (pid, 0) == 0;
}
std::string DoSustain () std::string DoSustain ()
{ {
pManager = getpid (); pManager = getpid ();
@@ -78,7 +89,11 @@ std::string DoSustain ()
signal (SIGUSR1, pass_signal); signal (SIGUSR1, pass_signal);
signal (SIGUSR2, pass_signal); signal (SIGUSR2, pass_signal);
for (auto childCount = 1;; ++childCount) // Number of times the child has exited in less than
// 15 seconds.
int fastExit = 0;
for (auto childCount = 1; ; ++childCount)
{ {
pChild = fork (); pChild = fork ();
@@ -100,17 +115,24 @@ std::string DoSustain ()
sleep (sleepBeforeWaiting); sleep (sleepBeforeWaiting);
for (;;) // If the child has already terminated count this
// as a fast exit and an indication that something
// went wrong:
if (!checkChild (pChild, WNOHANG))
{ {
int i; if (++fastExit == 5)
waitpid (pChild, &i, 0); _exit (0);
if (kill (pChild, 0))
break;
sleep (sleepBetweenWaits);
} }
else
{
fastExit = 0;
auto pc = std::to_string (pChild); while (checkChild (pChild, 0))
rename ("core", ("core." + pc).c_str ()); sleep(sleepBetweenWaits);
auto pc = std::to_string (pChild);
rename ("core", ("core." + pc).c_str ());
}
} }
} }