Performance logging and counters:

  * Tally and duration counters for Job Queue tasks and RPC calls,
    optionally rendered by server_info and server_state, and
    optionally printed to a distinct log file.
    - Tally each Job Queue task as it is queued, starts, and
      finishes running. Track total duration queued and running.
    - Tally each RPC call as it starts and either finishes
      successfully or throws an exception. Track total running
      duration for each.
    - Tally each Job Queue task as it is queued, starts, and
      finishes running. Track total duration queued and running.
    - Tally each RPC call as it starts and either finishes
      successfully or throws an exception. Track total running
      duration for each.
  * Track currently executing Job Queue tasks and RPC methods
    along with durations.
  * JSON-formatted performance log file, written by a dedicated
    thread, containing the data described above.
  * New optional parameter, "counters", for server_info and
    server_state. If set, render Job Queue and RPC call counters
    as well as currently executing tasks.
  * New configuration section, "[perf]", to optionally control
    performance logging to a file.
  * Support optional sub-second periods when rendering human-readable
    time points.
This commit is contained in:
Mark Travis
2018-01-13 04:02:43 -08:00
committed by Nikolaos D. Bougalis
parent ef3bc92b82
commit 8eb8c77886
45 changed files with 10379 additions and 577 deletions

View File

@@ -89,7 +89,9 @@ public:
private:
boost::filesystem::path CONFIG_FILE;
public:
boost::filesystem::path CONFIG_DIR;
private:
boost::filesystem::path DEBUG_LOGFILE;
void load ();

View File

@@ -31,6 +31,11 @@
namespace ripple {
namespace perf
{
class PerfLog;
}
class Logs;
struct Coro_create_t
{
@@ -132,7 +137,8 @@ public:
using JobFunction = std::function <void(Job&)>;
JobQueue (beast::insight::Collector::ptr const& collector,
Stoppable& parent, beast::Journal journal, Logs& logs);
Stoppable& parent, beast::Journal journal, Logs& logs,
perf::PerfLog& perfLog);
~JobQueue ();
/** Adds a job to the JobQueue.
@@ -226,18 +232,13 @@ private:
Job::CancelCallback m_cancelCallback;
// Statistics tracking
perf::PerfLog& perfLog_;
beast::insight::Collector::ptr m_collector;
beast::insight::Gauge job_count;
beast::insight::Hook hook;
std::condition_variable cv_;
static JobTypes const& getJobTypes()
{
static JobTypes types;
return types;
}
void collect();
JobTypeData& getJobTypeData (JobType type);
@@ -304,14 +305,6 @@ private:
// <none>
void finishJob (JobType type);
template <class Rep, class Period>
void on_dequeue (JobType type,
std::chrono::duration <Rep, Period> const& value);
template <class Rep, class Period>
void on_execute (JobType type,
std::chrono::duration <Rep, Period> const& value);
// Runs the next appropriate waiting Job.
//
// Pre-conditions:
@@ -322,7 +315,7 @@ private:
//
// Invariants:
// <none>
void processTask () override;
void processTask (int instance) override;
// Returns the limit of running jobs for the given job type.
// For jobs with no limit, we return the largest int. Hopefully that

View File

@@ -48,7 +48,7 @@ public:
bool special, std::chrono::milliseconds avgLatency,
std::chrono::milliseconds peakLatency)
: m_type (type)
, m_name (name)
, m_name (std::move(name))
, m_limit (limit)
, m_special (special)
, m_avgLatency (avgLatency)
@@ -62,7 +62,7 @@ public:
return m_type;
}
std::string name () const
std::string const& name () const
{
return m_name;
}

View File

@@ -23,6 +23,9 @@
#include <ripple/core/Job.h>
#include <ripple/core/JobTypeInfo.h>
#include <map>
#include <string>
#include <type_traits>
#include <unordered_map>
namespace ripple
{
@@ -33,6 +36,7 @@ public:
using Map = std::map <JobType, JobTypeInfo>;
using const_iterator = Map::const_iterator;
private:
JobTypes ()
: m_unknown (jtINVALID, "invalid", 0, true, std::chrono::milliseconds{0},
std::chrono::milliseconds{0})
@@ -56,7 +60,7 @@ add( jtADVANCE, "advanceLedger", maxLimit, false, 0ms, 0m
add( jtPUBLEDGER, "publishNewLedger", maxLimit, false, 3000ms, 4500ms);
add( jtTXN_DATA, "fetchTxnData", 1, false, 0ms, 0ms);
add( jtWAL, "writeAhead", maxLimit, false, 1000ms, 2500ms);
add( jtVALIDATION_t, "trustedValidation", maxLimit, false, 500ms, 1500ms);
add( jtVALIDATION_t, "trustedValidation", maxLimit, false, 500ms, 1500ms);
add( jtWRITE, "writeObjects", maxLimit, false, 1750ms, 2500ms);
add( jtACCEPT, "acceptLedger", maxLimit, false, 0ms, 0ms);
add( jtPROPOSAL_t, "trustedProposal", maxLimit, false, 100ms, 500ms);
@@ -79,6 +83,13 @@ add( jtNS_WRITE, "WriteNode", 0, true, 0ms, 0m
}
public:
static JobTypes const& instance()
{
static JobTypes const types;
return types;
}
JobTypeInfo const& get (JobType jt) const
{
Map::const_iterator const iter (m_map.find (jt));
@@ -95,6 +106,11 @@ add( jtNS_WRITE, "WriteNode", 0, true, 0ms, 0m
return m_unknown;
}
Map::size_type size () const
{
return m_map.size();
}
const_iterator begin () const
{
return m_map.cbegin ();

View File

@@ -130,7 +130,7 @@ class RootStoppable;
For stoppables which are only considered stopped when all of their
children have stopped, and their own internal logic indicates a stop, it
will be necessary to perform special actions in onChildrenStopped(). The
funtion areChildrenStopped() can be used after children have stopped,
function areChildrenStopped() can be used after children have stopped,
but before the Stoppable logic itself has stopped, to determine if the
stoppable's logic is a true stop.

View File

@@ -19,18 +19,21 @@
#include <ripple/core/JobQueue.h>
#include <ripple/basics/contract.h>
#include <ripple/basics/PerfLog.h>
namespace ripple {
JobQueue::JobQueue (beast::insight::Collector::ptr const& collector,
Stoppable& parent, beast::Journal journal, Logs& logs)
Stoppable& parent, beast::Journal journal, Logs& logs,
perf::PerfLog& perfLog)
: Stoppable ("JobQueue", parent)
, m_journal (journal)
, m_lastJob (0)
, m_invalidJobData (getJobTypes ().getInvalid (), collector, logs)
, m_invalidJobData (JobTypes::instance().getInvalid (), collector, logs)
, m_processCount (0)
, m_workers (*this, "JobQueue", 0)
, m_workers (*this, perfLog, "JobQueue", 0)
, m_cancelCallback (std::bind (&Stoppable::isStopping, this))
, perfLog_ (perfLog)
, m_collector (collector)
{
hook = m_collector->make_hook (std::bind (&JobQueue::collect, this));
@@ -39,7 +42,7 @@ JobQueue::JobQueue (beast::insight::Collector::ptr const& collector,
{
std::lock_guard <std::mutex> lock (m_mutex);
for (auto const& x : getJobTypes ())
for (auto const& x : JobTypes::instance())
{
JobTypeInfo const& jt = x.second;
@@ -329,6 +332,7 @@ JobQueue::queueJob (Job const& job, std::lock_guard <std::mutex> const& lock)
JobType const type (job.getType ());
assert (type != jtINVALID);
assert (m_jobSet.find (job) != m_jobSet.end ());
perfLog_.jobQueue(type);
JobTypeData& data (getJobTypeData (type));
@@ -398,34 +402,13 @@ JobQueue::finishJob (JobType type)
--data.running;
}
template <class Rep, class Period>
void JobQueue::on_dequeue (JobType type,
std::chrono::duration <Rep, Period> const& value)
{
using namespace std::chrono;
auto const ms = ceil<milliseconds>(value);
if (ms >= 10ms)
getJobTypeData (type).dequeue.notify (ms);
}
template <class Rep, class Period>
void JobQueue::on_execute (JobType type,
std::chrono::duration <Rep, Period> const& value)
{
using namespace std::chrono;
auto const ms (ceil <milliseconds> (value));
if (ms >= 10ms)
getJobTypeData (type).execute.notify (ms);
}
void
JobQueue::processTask ()
JobQueue::processTask (int instance)
{
JobType type;
{
using namespace std::chrono;
Job::clock_type::time_point const start_time (
Job::clock_type::now());
{
@@ -438,10 +421,18 @@ JobQueue::processTask ()
type = job.getType();
JobTypeData& data(getJobTypeData(type));
JLOG(m_journal.trace()) << "Doing " << data.name () << " job";
on_dequeue (job.getType (), start_time - job.queue_time ());
auto const us = date::ceil<microseconds>(
start_time - job.queue_time());
perfLog_.jobStart(type, us, start_time, instance);
if (us >= 10ms)
getJobTypeData(type).dequeue.notify(us);
job.doJob ();
}
on_execute(type, Job::clock_type::now() - start_time);
auto const us (
date::ceil<microseconds>(Job::clock_type::now() - start_time));
perfLog_.jobFinish(type, us, instance);
if (us >= 10ms)
getJobTypeData(type).execute.notify(us);
}
{
@@ -462,7 +453,7 @@ JobQueue::processTask ()
int
JobQueue::getJobLimit (JobType type)
{
JobTypeInfo const& j (getJobTypes ().get (type));
JobTypeInfo const& j (JobTypes::instance().get (type));
assert (j.type () != jtINVALID);
return j.limit ();

View File

@@ -19,7 +19,7 @@
#include <ripple/basics/Log.h>
#include <ripple/basics/UptimeTimer.h>
#include <ripple/beast/clock/chrono_util.h>
#include <ripple/basics/date.h>
#include <ripple/core/LoadMonitor.h>
namespace ripple {
@@ -108,14 +108,15 @@ void LoadMonitor::addLoadSample (LoadEvent const& s)
auto const total = s.runTime() + s.waitTime();
// Don't include "jitter" as part of the latency
auto const latency = total < 2ms ? 0ms : round<milliseconds>(total);
auto const latency = total < 2ms ? 0ms : date::round<milliseconds>(total);
if (latency > 500ms)
{
auto mj = (latency > 1s) ? j_.warn() : j_.info();
JLOG (mj) << "Job: " << s.name() <<
" run: " << round<milliseconds>(s.runTime()).count() << "ms" <<
" wait: " << round<milliseconds>(s.waitTime()).count() << "ms";
" run: " << date::round<milliseconds>(s.runTime()).count() <<
"ms" << " wait: " <<
date::round<milliseconds>(s.waitTime()).count() << "ms";
}
addSamples (1, latency);

View File

@@ -18,6 +18,7 @@
//==============================================================================
#include <ripple/core/impl/Workers.h>
#include <ripple/basics/PerfLog.h>
#include <ripple/beast/core/CurrentThreadName.h>
#include <cassert>
@@ -25,9 +26,11 @@ namespace ripple {
Workers::Workers (
Callback& callback,
perf::PerfLog& perfLog,
std::string const& threadNames,
int numberOfThreads)
: m_callback (callback)
, perfLog_ (perfLog)
, m_threadNames (threadNames)
, m_allPaused (true, true)
, m_semaphore (0)
@@ -57,12 +60,14 @@ int Workers::getNumberOfThreads () const noexcept
//
void Workers::setNumberOfThreads (int numberOfThreads)
{
static int instance {0};
if (m_numberOfThreads != numberOfThreads)
{
perfLog_.resizeJobs(numberOfThreads);
if (numberOfThreads > m_numberOfThreads)
{
// Increasing the number of working threads
int const amount = numberOfThreads - m_numberOfThreads;
for (int i = 0; i < amount; ++i)
@@ -79,15 +84,14 @@ void Workers::setNumberOfThreads (int numberOfThreads)
}
else
{
worker = new Worker (*this, m_threadNames);
worker = new Worker (*this, m_threadNames, instance++);
m_everyone.push_front (worker);
}
}
}
else if (numberOfThreads < m_numberOfThreads)
else
{
// Decreasing the number of working threads
int const amount = m_numberOfThreads - numberOfThreads;
for (int i = 0; i < amount; ++i)
@@ -142,9 +146,11 @@ void Workers::deleteWorkers (beast::LockFreeStack <Worker>& stack)
//------------------------------------------------------------------------------
Workers::Worker::Worker (Workers& workers, std::string const& threadName)
Workers::Worker::Worker (Workers& workers, std::string const& threadName,
int const instance)
: m_workers {workers}
, threadName_ {threadName}
, instance_ {instance}
, wakeCount_ {0}
, shouldExit_ {false}
{
@@ -216,7 +222,7 @@ void Workers::Worker::run ()
// unblocked in order to process a task.
//
++m_workers.m_runningTaskCount;
m_workers.m_callback.processTask ();
m_workers.m_callback.processTask (instance_);
--m_workers.m_runningTaskCount;
}

View File

@@ -31,6 +31,11 @@
namespace ripple {
namespace perf
{
class PerfLog;
}
/** A group of threads that process tasks.
*/
class Workers
@@ -45,9 +50,11 @@ public:
that you only process one task from inside your callback. Each
call to addTask will result in exactly one call to processTask.
@param instance The worker thread instance.
@see Workers::addTask
*/
virtual void processTask () = 0;
virtual void processTask (int instance) = 0;
};
/** Create the object.
@@ -58,6 +65,7 @@ public:
@param threadNames The name given to each created worker thread.
*/
explicit Workers (Callback& callback,
perf::PerfLog& perfLog,
std::string const& threadNames = "Worker",
int numberOfThreads =
static_cast<int>(std::thread::hardware_concurrency()));
@@ -126,7 +134,9 @@ private:
, public beast::LockFreeStack <Worker, PausedTag>::Node
{
public:
Worker (Workers& workers, std::string const& threadName);
Worker (Workers& workers,
std::string const& threadName,
int const instance);
~Worker ();
@@ -138,6 +148,7 @@ private:
private:
Workers& m_workers;
std::string const threadName_;
int const instance_;
std::thread thread_;
std::mutex mutex_;
@@ -151,6 +162,7 @@ private:
private:
Callback& m_callback;
perf::PerfLog& perfLog_;
std::string m_threadNames; // The name to give each thread
beast::WaitableEvent m_allPaused; // signaled when all threads paused
semaphore m_semaphore; // each pending task is 1 resource