proper parallel extraction

2025-11-19 11:15:50 +00:00 · 2021-03-24 05:33:27 -04:00
parent a7a71ec0c7
commit 6eb87bfaff
4 changed files with 65 additions and 57 deletions
--- a/reporting/CassandraBackend.cpp
+++ b/reporting/CassandraBackend.cpp
@@ -758,7 +758,7 @@ CassandraBackend::open()

        query = {};
        query << " update " << tablePrefix << "ledger_range"
-              << " set sequence = ? where is_latest = ? if sequence != ?";
+              << " set sequence = ? where is_latest = ?";
        if (!updateLedgerRange_.prepareStatement(query, session_.get()))
            continue;

--- a/reporting/CassandraBackend.h
+++ b/reporting/CassandraBackend.h
@@ -751,13 +751,11 @@ public:
            CassandraStatement statement{updateLedgerRange_};
            statement.bindInt(ledgerSequence_);
            statement.bindBoolean(false);
-            statement.bindInt(ledgerSequence_);
            executeSyncWrite(statement);
        }
        CassandraStatement statement{updateLedgerRange_};
        statement.bindInt(ledgerSequence_);
        statement.bindBoolean(true);
-        statement.bindInt(ledgerSequence_);
        return executeSyncUpdate(statement);
    }
    void
--- a/reporting/ETLHelpers.h
+++ b/reporting/ETLHelpers.h
@@ -110,7 +110,7 @@ class ThreadSafeQueue
 public:
    /// @param maxSize maximum size of the queue. Calls that would cause the
    /// queue to exceed this size will block until free space is available
-    explicit ThreadSafeQueue(uint32_t maxSize) : maxSize_(maxSize)
+    ThreadSafeQueue(uint32_t maxSize) : maxSize_(maxSize)
    {
    }

--- a/reporting/ReportingETL.cpp
+++ b/reporting/ReportingETL.cpp
@@ -317,7 +317,7 @@ ReportingETL::buildNextLedger(org::xrpl::rpc::v1::GetLedgerResponse& rawData)

 // Database must be populated when this starts
 std::optional<uint32_t>
-ReportingETL::runETLPipeline(uint32_t startSequence, int offset)
+ReportingETL::runETLPipeline(uint32_t startSequence, int numExtractors)
 {
    /*
     * Behold, mortals! This function spawns three separate threads, which talk
@@ -356,68 +356,80 @@ ReportingETL::runETLPipeline(uint32_t startSequence, int offset)
    std::optional<uint32_t> lastPublishedSequence;
    constexpr uint32_t maxQueueSize = 1000;
    auto begin = std::chrono::system_clock::now();
+    using QueueType =
+        ThreadSafeQueue<std::optional<org::xrpl::rpc::v1::GetLedgerResponse>>;
+    std::vector<std::shared_ptr<QueueType>> queues;

-    ThreadSafeQueue<std::optional<org::xrpl::rpc::v1::GetLedgerResponse>>
-        transformQueue{maxQueueSize};
+    auto getNext = [&queues, &startSequence, &numExtractors](
+                       uint32_t sequence) -> std::shared_ptr<QueueType> {
+        // ledger `sequence` maps to queue (sequence - startSequence) % N
+        return queues[(sequence - startSequence) % numExtractors];
+    };
+    std::vector<std::thread> threads;
+    for (int i = 0; i < numExtractors; ++i)  // extractor i feeds queues[i]
+    {
+        auto transformQueue = std::make_shared<QueueType>(maxQueueSize);
+        queues.push_back(transformQueue);

-    std::thread extracter{[this,
-                           &startSequence,
-                           &writeConflict,
-                           &transformQueue,
-                           &offset]() {
+        threads.emplace_back(
+            [this, &startSequence, &writeConflict, transformQueue, i,
+             numExtractors]() {  // i by value: the loop keeps mutating it
                beast::setCurrentThreadName("rippled: ReportingETL extract");
-        uint32_t currentSequence = startSequence + offset;
+                uint32_t currentSequence = startSequence + i;

                // there are two stopping conditions here.
-        // First, if there is a write conflict in the load thread, the ETL
-        // mechanism should stop.
-        // The other stopping condition is if the entire server is shutting
-        // down. This can be detected in a variety of ways. See the comment
-        // at the top of the function
+                // First, if there is a write conflict in the load thread, the
+                // ETL mechanism should stop. The other stopping condition is if
+                // the entire server is shutting down. This can be detected in a
+                // variety of ways. See the comment at the top of the function
                while (networkValidatedLedgers_.waitUntilValidatedByNetwork(
                           currentSequence) &&
                       !writeConflict && !isStopping())
                {
                    auto start = std::chrono::system_clock::now();
-            std::optional<org::xrpl::rpc::v1::GetLedgerResponse> fetchResponse{
-                fetchLedgerDataAndDiff(currentSequence)};
+                    std::optional<org::xrpl::rpc::v1::GetLedgerResponse>
+                        fetchResponse{fetchLedgerDataAndDiff(currentSequence)};
                    auto end = std::chrono::system_clock::now();

                    auto time = ((end - start).count()) / 1000000000.0;
                    auto tps =
-                fetchResponse->transactions_list().transactions_size() / time;
+                        fetchResponse->transactions_list().transactions_size() /
+                        time;

                    BOOST_LOG_TRIVIAL(info) << "Extract phase time = " << time
                                            << " . Extract phase tps = " << tps;
-            // if the fetch is unsuccessful, stop. fetchLedger only returns
-            // false if the server is shutting down, or if the ledger was
-            // found in the database (which means another process already
-            // wrote the ledger that this process was trying to extract;
-            // this is a form of a write conflict). Otherwise,
-            // fetchLedgerDataAndDiff will keep trying to fetch the
-            // specified ledger until successful
+                    // if the fetch is unsuccessful, stop. fetchLedger only
+                    // returns false if the server is shutting down, or if the
+                    // ledger was found in the database (which means another
+                    // process already wrote the ledger that this process was
+                    // trying to extract; this is a form of a write conflict).
+                    // Otherwise, fetchLedgerDataAndDiff will keep trying to
+                    // fetch the specified ledger until successful
                    if (!fetchResponse)
                    {
                        break;
                    }

-            transformQueue.push(std::move(fetchResponse));
-            currentSequence += offset;
+                    transformQueue->push(std::move(fetchResponse));
+                    currentSequence += numExtractors;  // stride past siblings
                }
                // empty optional tells the transformer to shut down
-        transformQueue.push({});
-    }};
+                transformQueue->push({});
+            });
+    }

    std::thread transformer{[this,
                             &writeConflict,
-                             &transformQueue,
+                             &startSequence,
+                             &getNext,
                             &lastPublishedSequence]() {
        beast::setCurrentThreadName("rippled: ReportingETL transform");
+        uint32_t currentSequence = startSequence;

        while (!writeConflict)
        {
            std::optional<org::xrpl::rpc::v1::GetLedgerResponse> fetchResponse{
-                transformQueue.pop()};
+                getNext(currentSequence)->pop()};
            // if fetchResponse is an empty optional, the extracter thread
            // has stopped and the transformer should stop as well
            if (!fetchResponse)
@@ -467,7 +479,8 @@ ReportingETL::runETLPipeline(uint32_t startSequence, int offset)
    }};

    // wait for all of the threads to stop
-    extracter.join();
+    for (auto& t : threads)
+        t.join();
    transformer.join();
    auto end = std::chrono::system_clock::now();
    BOOST_LOG_TRIVIAL(debug)
@@ -600,10 +613,7 @@ ReportingETL::monitor()
            // doContinousETLPipelined returns the most recent sequence
            // published empty optional if no sequence was published
            std::optional<uint32_t> lastPublished = nextSequence;
-            for (size_t i = 0; i < 10; ++i)
-            {
-                runETLPipeline(nextSequence, i);
-            }
+            runETLPipeline(nextSequence, 10);
            BOOST_LOG_TRIVIAL(info)
                << __func__ << " : "
                << "Aborting ETL. Falling back to publishing";