proper parallel extraction

This commit is contained in:
CJ Cobb
2021-03-24 05:33:27 -04:00
parent a7a71ec0c7
commit 6eb87bfaff
4 changed files with 65 additions and 57 deletions

View File

@@ -758,7 +758,7 @@ CassandraBackend::open()
query = {};
query << " update " << tablePrefix << "ledger_range"
<< " set sequence = ? where is_latest = ? if sequence != ?";
<< " set sequence = ? where is_latest = ?";
if (!updateLedgerRange_.prepareStatement(query, session_.get()))
continue;

View File

@@ -751,13 +751,11 @@ public:
CassandraStatement statement{updateLedgerRange_};
statement.bindInt(ledgerSequence_);
statement.bindBoolean(false);
statement.bindInt(ledgerSequence_);
executeSyncWrite(statement);
}
CassandraStatement statement{updateLedgerRange_};
statement.bindInt(ledgerSequence_);
statement.bindBoolean(true);
statement.bindInt(ledgerSequence_);
return executeSyncUpdate(statement);
}
void

View File

@@ -110,7 +110,7 @@ class ThreadSafeQueue
public:
/// @param maxSize maximum size of the queue. Calls that would cause the
/// queue to exceed this size will block until free space is available
explicit ThreadSafeQueue(uint32_t maxSize) : maxSize_(maxSize)
ThreadSafeQueue(uint32_t maxSize) : maxSize_(maxSize)
{
}

View File

@@ -317,7 +317,7 @@ ReportingETL::buildNextLedger(org::xrpl::rpc::v1::GetLedgerResponse& rawData)
// Database must be populated when this starts
std::optional<uint32_t>
ReportingETL::runETLPipeline(uint32_t startSequence, int offset)
ReportingETL::runETLPipeline(uint32_t startSequence, int numExtractors)
{
/*
* Behold, mortals! This function spawns three separate threads, which talk
@@ -356,68 +356,80 @@ ReportingETL::runETLPipeline(uint32_t startSequence, int offset)
std::optional<uint32_t> lastPublishedSequence;
constexpr uint32_t maxQueueSize = 1000;
auto begin = std::chrono::system_clock::now();
using QueueType =
ThreadSafeQueue<std::optional<org::xrpl::rpc::v1::GetLedgerResponse>>;
std::vector<std::shared_ptr<QueueType>> queues;
ThreadSafeQueue<std::optional<org::xrpl::rpc::v1::GetLedgerResponse>>
transformQueue{maxQueueSize};
// Selects the queue that corresponds to a given ledger sequence. The index
// into `queues` is (sequence - startSequence) % numExtractors, so
// consecutive sequences cycle through the queues in order.
// `queues`, `startSequence` and `numExtractors` are captured by reference.
// NOTE(review): `sequence - startSequence` is unsigned arithmetic; a
// sequence below startSequence would wrap around. Presumably callers never
// pass one; confirm.
auto getNext = [&queues, &startSequence, &numExtractors](
uint32_t sequence) -> std::shared_ptr<QueueType> {
// NOTE(review): writes the computed queue index to stdout on every call,
// with no separator or newline. Looks like leftover debug output; confirm.
std::cout << std::to_string((sequence - startSequence) % numExtractors);
return queues[(sequence - startSequence) % numExtractors];
};
std::vector<std::thread> threads;
for (size_t i = 1; i < numExtractors + 1; ++i)
{
auto transformQueue = std::make_shared<QueueType>(maxQueueSize);
queues.push_back(transformQueue);
std::cout << "added to queues";
std::thread extracter{[this,
&startSequence,
&writeConflict,
&transformQueue,
&offset]() {
threads.emplace_back(
[this, &startSequence, &writeConflict, transformQueue, &i]() {
beast::setCurrentThreadName("rippled: ReportingETL extract");
uint32_t currentSequence = startSequence + offset;
uint32_t currentSequence = startSequence + i;
// there are two stopping conditions here.
// First, if there is a write conflict in the load thread, the ETL
// mechanism should stop.
// The other stopping condition is if the entire server is shutting
// down. This can be detected in a variety of ways. See the comment
// at the top of the function
// First, if there is a write conflict in the load thread, the
// ETL mechanism should stop. The other stopping condition is if
// the entire server is shutting down. This can be detected in a
// variety of ways. See the comment at the top of the function
while (networkValidatedLedgers_.waitUntilValidatedByNetwork(
currentSequence) &&
!writeConflict && !isStopping())
{
auto start = std::chrono::system_clock::now();
std::optional<org::xrpl::rpc::v1::GetLedgerResponse> fetchResponse{
fetchLedgerDataAndDiff(currentSequence)};
std::optional<org::xrpl::rpc::v1::GetLedgerResponse>
fetchResponse{fetchLedgerDataAndDiff(currentSequence)};
auto end = std::chrono::system_clock::now();
auto time = ((end - start).count()) / 1000000000.0;
auto tps =
fetchResponse->transactions_list().transactions_size() / time;
fetchResponse->transactions_list().transactions_size() /
time;
BOOST_LOG_TRIVIAL(info) << "Extract phase time = " << time
<< " . Extract phase tps = " << tps;
// if the fetch is unsuccessful, stop. fetchLedger only returns
// false if the server is shutting down, or if the ledger was
// found in the database (which means another process already
// wrote the ledger that this process was trying to extract;
// this is a form of a write conflict). Otherwise,
// fetchLedgerDataAndDiff will keep trying to fetch the
// specified ledger until successful
// if the fetch is unsuccessful, stop. fetchLedger only
// returns false if the server is shutting down, or if the
// ledger was found in the database (which means another
// process already wrote the ledger that this process was
// trying to extract; this is a form of a write conflict).
// Otherwise, fetchLedgerDataAndDiff will keep trying to
// fetch the specified ledger until successful
if (!fetchResponse)
{
break;
}
transformQueue.push(std::move(fetchResponse));
currentSequence += offset;
transformQueue->push(std::move(fetchResponse));
currentSequence += i;
}
// empty optional tells the transformer to shut down
transformQueue.push({});
}};
transformQueue->push({});
});
}
std::thread transformer{[this,
&writeConflict,
&transformQueue,
&startSequence,
&getNext,
&lastPublishedSequence]() {
beast::setCurrentThreadName("rippled: ReportingETL transform");
uint32_t currentSequence = startSequence;
while (!writeConflict)
{
std::optional<org::xrpl::rpc::v1::GetLedgerResponse> fetchResponse{
transformQueue.pop()};
getNext(currentSequence)->pop()};
// if fetchResponse is an empty optional, the extracter thread
// has stopped and the transformer should stop as well
if (!fetchResponse)
@@ -467,7 +479,8 @@ ReportingETL::runETLPipeline(uint32_t startSequence, int offset)
}};
// wait for all of the threads to stop
extracter.join();
for (auto& t : threads)
t.join();
transformer.join();
auto end = std::chrono::system_clock::now();
BOOST_LOG_TRIVIAL(debug)
@@ -600,10 +613,7 @@ ReportingETL::monitor()
// doContinousETLPipelined returns the most recent sequence
// published, or an empty optional if no sequence was published
std::optional<uint32_t> lastPublished = nextSequence;
for (size_t i = 0; i < 10; ++i)
{
runETLPipeline(nextSequence, i);
}
runETLPipeline(nextSequence, 10);
BOOST_LOG_TRIVIAL(info)
<< __func__ << " : "
<< "Aborting ETL. Falling back to publishing";