Remove the built-in "sustain" watchdog:

The built-in watchdog is simplistic and can, sometimes, cause problems
especially on systems that have the ability to automatically start and
monitor processes.

This commit removes the sustain system entirely, changes the handling
of the SIGTERM signal to properly terminate the process and improves
the error message reported to the user when the command line used to
start `rippled` is incorrect and malformed.
This commit is contained in:
Nik Bougalis
2019-03-30 14:00:00 -05:00
parent 74f9edef07
commit 62a3f33d72
6 changed files with 38 additions and 268 deletions

View File

@@ -434,7 +434,6 @@ target_sources (rippled PRIVATE
src/ripple/basics/impl/BasicConfig.cpp
src/ripple/basics/impl/PerfLogImp.cpp
src/ripple/basics/impl/ResolverAsio.cpp
src/ripple/basics/impl/Sustain.cpp
src/ripple/basics/impl/UptimeClock.cpp
src/ripple/basics/impl/make_SSLContext.cpp
src/ripple/basics/impl/mulDiv.cpp

View File

@@ -1,12 +1,12 @@
[Unit]
Description=Ripple Daemon
After=network-online.target
Wants=network-online.target
[Service]
Type=simple
ExecStart=/opt/ripple/bin/rippled --net --silent --conf /etc/opt/ripple/rippled.cfg
# Default KillSignal can be used if/when rippled handles SIGTERM
KillSignal=SIGINT
Restart=no
Restart=on-failure
User=rippled
Group=rippled
LimitNOFILE=65536

View File

@@ -46,7 +46,6 @@
#include <ripple/basics/ByteUtilities.h>
#include <ripple/basics/PerfLog.h>
#include <ripple/basics/ResolverAsio.h>
#include <ripple/basics/Sustain.h>
#include <ripple/basics/safe_cast.h>
#include <ripple/beast/asio/io_latency_probe.h>
#include <ripple/beast/core/LexicalCast.h>
@@ -73,10 +72,8 @@
#include <condition_variable>
#include <cstring>
#include <fstream>
#include <iostream>
#include <mutex>
#include <sstream>
namespace ripple {
@@ -1081,26 +1078,6 @@ public:
return true;
}
void
signalled(const boost::system::error_code& ec, int signal_number)
{
if (ec == boost::asio::error::operation_aborted)
{
// Indicates the signal handler has been aborted
// do nothing
}
else if (ec)
{
JLOG(m_journal.error()) << "Received signal: " << signal_number
<< " with error: " << ec.message();
}
else
{
JLOG(m_journal.debug()) << "Received signal: " << signal_number;
signalStop();
}
}
//--------------------------------------------------------------------------
//
// Stoppable
@@ -1392,21 +1369,31 @@ private:
//------------------------------------------------------------------------------
// VFALCO TODO Break this function up into many small initialization segments.
// Or better yet refactor these initializations into RAII classes
// which are members of the Application object.
//
// TODO Break this up into smaller, more digestible initialization segments.
bool
ApplicationImp::setup()
{
// We want to intercept and wait for CTRL-C to terminate the process
// We want to intercept CTRL-C and the standard termination signal SIGTERM
// and terminate the process. This handler will NEVER be invoked twice.
//
// Note that async_wait is "one-shot": for each call, the handler will be
// invoked exactly once, either when one of the registered signals in the
// signal set occurs or the signal set is cancelled. Subsequent signals are
// effectively ignored (technically, they are queued up, waiting for a call
// to async_wait).
m_signals.add(SIGINT);
m_signals.add(SIGTERM);
m_signals.async_wait(
[this](boost::system::error_code const& ec, int signum) {
// Indicates the signal handler has been aborted; do nothing
if (ec == boost::asio::error::operation_aborted)
return;
m_signals.async_wait(std::bind(
&ApplicationImp::signalled,
this,
std::placeholders::_1,
std::placeholders::_2));
JLOG(m_journal.info()) << "Received signal " << signum;
if (signum == SIGTERM || signum == SIGINT)
signalStop();
});
assert(mTxnDB == nullptr);
@@ -1805,17 +1792,20 @@ ApplicationImp::run()
JLOG(m_journal.info()) << "Received shutdown request";
stop(m_journal);
JLOG(m_journal.info()) << "Done.";
StopSustain();
}
void
ApplicationImp::signalStop()
{
// Unblock the main thread (which is sitting in run()).
//
// When we get C++20 this can use std::latch.
std::lock_guard lk{mut_};
isTimeToStop = true;
cv_.notify_all();
if (!isTimeToStop)
{
isTimeToStop = true;
cv_.notify_all();
}
}
bool

View File

@@ -21,7 +21,6 @@
#include <ripple/app/main/DBInit.h>
#include <ripple/basics/Log.h>
#include <ripple/basics/StringUtilities.h>
#include <ripple/basics/Sustain.h>
#include <ripple/basics/contract.h>
#include <ripple/beast/clock/basic_seconds_clock.h>
#include <ripple/beast/core/CurrentThreadName.h>
@@ -32,7 +31,6 @@
#include <ripple/json/to_string.h>
#include <ripple/net/RPCCall.h>
#include <ripple/protocol/BuildInfo.h>
#include <ripple/protocol/digest.h>
#include <ripple/resource/Fees.h>
#include <ripple/rpc/RPCHandler.h>
@@ -308,7 +306,8 @@ run(int argc, char** argv)
{
using namespace std;
beast::setCurrentThreadName("rippled: main");
beast::setCurrentThreadName(
"rippled: main " + BuildInfo::getVersionString());
po::variables_map vm;
@@ -334,7 +333,7 @@ run(int argc, char** argv)
gen.add_options()(
"conf", po::value<std::string>(), "Specify the configuration file.")(
"debug", "Enable normally suppressed debug logging")(
"fg", "Run in the foreground.")("help,h", "Display this message.")(
"help,h", "Display this message.")(
"quorum",
po::value<std::size_t>(),
"Override the minimum validation quorum.")(
@@ -416,7 +415,8 @@ run(int argc, char** argv)
"purpose, "
"so this option is not needed for users")(
"unittest-child",
"For internal use only when spawning child unit test processes.");
"For internal use only when spawning child unit test processes.")(
"fg", "Deprecated: server always in foreground mode.");
// Interpret positional arguments as --parameters.
po::positional_options_description p;
@@ -439,10 +439,10 @@ run(int argc, char** argv)
vm);
po::notify(vm); // Invoke option notify functions.
}
catch (std::exception const&)
catch (std::exception const& ex)
{
std::cerr << "rippled: Incorrect command line syntax." << std::endl;
std::cerr << "Use '--help' for a list of options." << std::endl;
std::cerr << "rippled: " << ex.what() << std::endl;
std::cerr << "Try 'rippled --help' for a list of options." << std::endl;
return 1;
}
@@ -708,18 +708,8 @@ run(int argc, char** argv)
if (!adjustDescriptorLimit(1024, logs->journal("Application")))
return -1;
if (HaveSustain() && !vm.count("fg") && !config->standalone())
{
auto const ret = DoSustain();
if (!ret.empty())
std::cerr << "Watchdog: " << ret << std::endl;
}
if (vm.count("debug"))
{
setDebugLogSink(logs->makeSink("Debug", beast::severities::kTrace));
}
auto timeKeeper = make_TimeKeeper(logs->journal("TimeKeeper"));
@@ -727,19 +717,13 @@ run(int argc, char** argv)
std::move(config), std::move(logs), std::move(timeKeeper));
if (!app->setup())
{
StopSustain();
return -1;
}
// With our configuration parsed, ensure we have
// enough file descriptors available:
if (!adjustDescriptorLimit(
app->fdRequired(), app->logs().journal("Application")))
{
StopSustain();
return -1;
}
// Start the server
app->doStart(true /*start timers*/);

View File

@@ -1,38 +0,0 @@
//------------------------------------------------------------------------------
/*
This file is part of rippled: https://github.com/ripple/rippled
Copyright (c) 2012, 2013 Ripple Labs Inc.
Permission to use, copy, modify, and/or distribute this software for any
purpose with or without fee is hereby granted, provided that the above
copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
ANY SPECIAL , DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
//==============================================================================
#ifndef RIPPLE_BASICS_SUSTAIN_H_INCLUDED
#define RIPPLE_BASICS_SUSTAIN_H_INCLUDED
#include <string>
namespace ripple {
// "Sustain" is a system for a buddy process that monitors the main process
// and relaunches it on a fault.
bool
HaveSustain();
std::string
StopSustain();
std::string
DoSustain();
} // namespace ripple
#endif

View File

@@ -1,165 +0,0 @@
//------------------------------------------------------------------------------
/*
This file is part of rippled: https://github.com/ripple/rippled
Copyright (c) 2012, 2013 Ripple Labs Inc.
Permission to use, copy, modify, and/or distribute this software for any
purpose with or without fee is hereby granted, provided that the above
copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
ANY SPECIAL , DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
//==============================================================================
#include <ripple/basics/Sustain.h>
#include <ripple/basics/safe_cast.h>
#include <ripple/beast/core/CurrentThreadName.h>
#include <boost/format.hpp>
// For Sustain Linux variants
// VFALCO TODO Rewrite Sustain to use beast::Process
#ifdef __linux__
#include <sys/prctl.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>
#endif
#ifdef __FreeBSD__
#include <sys/types.h>
#include <sys/wait.h>
#endif
namespace ripple {
#ifdef __unix__
static auto const sleepBeforeWaiting = 10;
static auto const sleepBetweenWaits = 1;
static pid_t pManager = safe_cast<pid_t>(0);
static pid_t pChild = safe_cast<pid_t>(0);
static void
pass_signal(int a)
{
kill(pChild, a);
}
static void
stop_manager(int)
{
kill(pChild, SIGINT);
_exit(0);
}
bool
HaveSustain()
{
return true;
}
std::string
StopSustain()
{
if (getppid() != pManager)
return "";
kill(pManager, SIGHUP);
return "Terminating monitor";
}
static bool
checkChild(pid_t pid, int options)
{
int i;
if (waitpid(pChild, &i, options) == -1)
return false;
return kill(pChild, 0) == 0;
}
std::string
DoSustain()
{
pManager = getpid();
signal(SIGINT, stop_manager);
signal(SIGHUP, stop_manager);
signal(SIGUSR1, pass_signal);
signal(SIGUSR2, pass_signal);
// Number of times the child has exited in less than
// 15 seconds.
int fastExit = 0;
for (auto childCount = 1;; ++childCount)
{
pChild = fork();
if (pChild == -1)
_exit(0);
auto cc = std::to_string(childCount);
if (pChild == 0)
{
beast::setCurrentThreadName("rippled: main");
signal(SIGINT, SIG_DFL);
signal(SIGHUP, SIG_DFL);
signal(SIGUSR1, SIG_DFL);
signal(SIGUSR2, SIG_DFL);
return "Launching child " + cc;
}
beast::setCurrentThreadName(("rippled: #" + cc).c_str());
sleep(sleepBeforeWaiting);
// If the child has already terminated count this
// as a fast exit and an indication that something
// went wrong:
if (!checkChild(pChild, WNOHANG))
{
if (++fastExit == 5)
_exit(0);
}
else
{
fastExit = 0;
while (checkChild(pChild, 0))
sleep(sleepBetweenWaits);
(void)rename("core", ("core." + std::to_string(pChild)).c_str());
}
}
}
#else
bool
HaveSustain()
{
return false;
}
std::string
DoSustain()
{
return "";
}
std::string
StopSustain()
{
return "";
}
#endif
} // namespace ripple