From a72e5a180fe58d14d22a371c393a4085de15b6c6 Mon Sep 17 00:00:00 2001 From: Sergey Kuznetsov Date: Wed, 21 Jan 2026 11:41:26 +0000 Subject: [PATCH] feat: Choose writer by cluster communication (#2830) Fixes #1974 --- src/app/ClioApplication.cpp | 27 +- src/app/Stopper.hpp | 10 +- src/cluster/Backend.cpp | 141 ++++++ src/cluster/Backend.hpp | 147 ++++++ src/cluster/CMakeLists.txt | 4 +- src/cluster/ClioNode.cpp | 42 +- src/cluster/ClioNode.hpp | 37 +- src/cluster/ClusterCommunicationService.cpp | 167 +------ src/cluster/ClusterCommunicationService.hpp | 99 +--- src/cluster/Concepts.hpp | 39 ++ ...cationServiceInterface.hpp => Metrics.cpp} | 45 +- src/cluster/Metrics.hpp | 76 ++++ src/cluster/WriterDecider.cpp | 98 ++++ src/cluster/WriterDecider.hpp | 75 +++ src/cluster/impl/RepeatedTask.hpp | 104 +++++ src/etl/CMakeLists.txt | 1 + src/etl/ETLService.cpp | 106 +++-- src/etl/ETLService.hpp | 16 +- src/etl/SystemState.hpp | 71 ++- src/etl/WriterState.cpp | 88 ++++ src/etl/WriterState.hpp | 193 ++++++++ src/etl/impl/LedgerPublisher.hpp | 17 +- src/etl/impl/Loading.cpp | 5 +- src/util/Channel.hpp | 108 ++++- src/util/Spawn.hpp | 1 + src/util/async/Concepts.hpp | 21 + .../async/context/BasicExecutionContext.hpp | 18 +- src/util/async/context/impl/Execution.hpp | 29 +- tests/common/util/MockBackendTestFixture.hpp | 1 + tests/common/util/MockWriterState.hpp | 40 ++ tests/unit/CMakeLists.txt | 6 + tests/unit/app/StopperTests.cpp | 10 + tests/unit/cluster/BackendTests.cpp | 351 ++++++++++++++ tests/unit/cluster/ClioNodeTests.cpp | 172 ++++++- .../ClusterCommunicationServiceTests.cpp | 304 ++++++------- tests/unit/cluster/MetricsTests.cpp | 179 ++++++++ tests/unit/cluster/RepeatedTaskTests.cpp | 223 +++++++++ tests/unit/cluster/WriterDeciderTests.cpp | 314 +++++++++++++ tests/unit/etl/ETLServiceTests.cpp | 428 +++++++++++++++++- tests/unit/etl/LedgerPublisherTests.cpp | 4 +- tests/unit/etl/LoadingTests.cpp | 56 +++ tests/unit/etl/SystemStateTests.cpp | 73 +++ tests/unit/etl/WriterStateTests.cpp | 162 +++++++ tests/unit/util/ChannelTests.cpp | 8 +- 44 files changed, 3579 insertions(+), 537 deletions(-) create mode 100644 src/cluster/Backend.cpp create mode 100644 src/cluster/Backend.hpp create mode 100644 src/cluster/Concepts.hpp rename src/cluster/{ClusterCommunicationServiceInterface.hpp => Metrics.cpp} (60%) create mode 100644 src/cluster/Metrics.hpp create mode 100644 src/cluster/WriterDecider.cpp create mode 100644 src/cluster/WriterDecider.hpp create mode 100644 src/cluster/impl/RepeatedTask.hpp create mode 100644 src/etl/WriterState.cpp create mode 100644 src/etl/WriterState.hpp create mode 100644 tests/common/util/MockWriterState.hpp create mode 100644 tests/unit/cluster/BackendTests.cpp create mode 100644 tests/unit/cluster/MetricsTests.cpp create mode 100644 tests/unit/cluster/RepeatedTaskTests.cpp create mode 100644 tests/unit/cluster/WriterDeciderTests.cpp create mode 100644 tests/unit/etl/SystemStateTests.cpp create mode 100644 tests/unit/etl/WriterStateTests.cpp diff --git a/src/app/ClioApplication.cpp b/src/app/ClioApplication.cpp index 2cd4803ee..436813d14 100644 --- a/src/app/ClioApplication.cpp +++ b/src/app/ClioApplication.cpp @@ -29,6 +29,8 @@ #include "etl/ETLService.hpp" #include "etl/LoadBalancer.hpp" #include "etl/NetworkValidatedLedgers.hpp" +#include "etl/SystemState.hpp" +#include "etl/WriterState.hpp" #include "feed/SubscriptionManager.hpp" #include "migration/MigrationInspectorFactory.hpp" #include "rpc/Counters.hpp" @@ -121,7 +123,11 @@ ClioApplication::run(bool const useNgWebServer) // Interface to the database auto backend = data::makeBackend(config_, cache); - cluster::ClusterCommunicationService clusterCommunicationService{backend}; + auto systemState = etl::SystemState::makeSystemState(config_); + + cluster::ClusterCommunicationService clusterCommunicationService{ + backend, std::make_unique(systemState) + }; clusterCommunicationService.run(); auto const amendmentCenter = std::make_shared(backend); @@ -151,7 +157,9 @@ ClioApplication::run(bool const useNgWebServer) ); // ETL is responsible for writing and publishing to streams. In read-only mode, ETL only publishes - auto etl = etl::ETLService::makeETLService(config_, ctx, backend, subscriptions, balancer, ledgers); + auto etl = etl::ETLService::makeETLService( + config_, std::move(systemState), ctx, backend, subscriptions, balancer, ledgers + ); auto workQueue = rpc::WorkQueue::makeWorkQueue(config_); auto counters = rpc::Counters::makeCounters(workQueue); @@ -197,7 +205,16 @@ ClioApplication::run(bool const useNgWebServer) } appStopper_.setOnStop( - Stopper::makeOnStopCallback(httpServer.value(), *balancer, *etl, *subscriptions, *backend, cacheSaver, ioc) + Stopper::makeOnStopCallback( + httpServer.value(), + *balancer, + *etl, + *subscriptions, + *backend, + cacheSaver, + clusterCommunicationService, + ioc + ) ); // Blocks until stopped. @@ -213,7 +230,9 @@ ClioApplication::run(bool const useNgWebServer) auto const httpServer = web::makeHttpServer(config_, ioc, dosGuard, handler, cache); appStopper_.setOnStop( - Stopper::makeOnStopCallback(*httpServer, *balancer, *etl, *subscriptions, *backend, cacheSaver, ioc) + Stopper::makeOnStopCallback( + *httpServer, *balancer, *etl, *subscriptions, *backend, cacheSaver, clusterCommunicationService, ioc + ) ); // Blocks until stopped. diff --git a/src/app/Stopper.hpp b/src/app/Stopper.hpp index 190dd5df9..b21ffc487 100644 --- a/src/app/Stopper.hpp +++ b/src/app/Stopper.hpp @@ -19,6 +19,7 @@ #pragma once +#include "cluster/Concepts.hpp" #include "data/BackendInterface.hpp" #include "data/LedgerCacheSaver.hpp" #include "etl/ETLServiceInterface.hpp" @@ -82,10 +83,14 @@ public: * @param subscriptions The subscription manager to stop. * @param backend The backend to stop. * @param cacheSaver The ledger cache saver + * @param clusterCommunicationService The cluster communication service to stop. * @param ioc The io_context to stop. * @return The callback to be called on application stop. */ - template + template < + web::SomeServer ServerType, + data::SomeLedgerCacheSaver LedgerCacheSaverType, + cluster::SomeClusterCommunicationService ClusterCommunicationServiceType> static std::function makeOnStopCallback( ServerType& server, @@ -94,6 +99,7 @@ public: feed::SubscriptionManagerInterface& subscriptions, data::BackendInterface& backend, LedgerCacheSaverType& cacheSaver, + ClusterCommunicationServiceType& clusterCommunicationService, boost::asio::io_context& ioc ) { @@ -111,6 +117,8 @@ public: }); coroutineGroup.asyncWait(yield); + clusterCommunicationService.stop(); + etl.stop(); LOG(util::LogService::info()) << "ETL stopped"; diff --git a/src/cluster/Backend.cpp b/src/cluster/Backend.cpp new file mode 100644 index 000000000..78a50cd84 --- /dev/null +++ b/src/cluster/Backend.cpp @@ -0,0 +1,141 @@ +//------------------------------------------------------------------------------ +/* + This file is part of clio: https://github.com/XRPLF/clio + Copyright (c) 2025, the clio developers. + + Permission to use, copy, modify, and distribute this software for any + purpose with or without fee is hereby granted, provided that the above + copyright notice and this permission notice appear in all copies. + + THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +*/ +//============================================================================== + +#include "cluster/Backend.hpp" + +#include "cluster/ClioNode.hpp" +#include "data/BackendInterface.hpp" +#include "etl/WriterState.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace cluster { + +Backend::Backend( + boost::asio::thread_pool& ctx, + std::shared_ptr backend, + std::unique_ptr writerState, + std::chrono::steady_clock::duration readInterval, + std::chrono::steady_clock::duration writeInterval +) + : backend_(std::move(backend)) + , writerState_(std::move(writerState)) + , readerTask_(readInterval, ctx) + , writerTask_(writeInterval, ctx) + , selfUuid_(std::make_shared(boost::uuids::random_generator{}())) +{ +} + +void +Backend::run() +{ + readerTask_.run([this](boost::asio::yield_context yield) { + auto clusterData = doRead(yield); + onNewState_(selfUuid_, std::make_shared(std::move(clusterData))); + }); + + writerTask_.run([this]() { doWrite(); }); +} + +Backend::~Backend() +{ + stop(); +} + +void +Backend::stop() +{ + readerTask_.stop(); + writerTask_.stop(); +} + +ClioNode::CUuid +Backend::selfId() const +{ + return selfUuid_; +} + +Backend::ClusterData +Backend::doRead(boost::asio::yield_context yield) +{ + BackendInterface::ClioNodesDataFetchResult expectedResult; + try { + expectedResult = backend_->fetchClioNodesData(yield); + } catch (...) { + expectedResult = std::unexpected{"Failed to fetch Clio nodes data"}; + } + + if (!expectedResult.has_value()) { + return std::unexpected{std::move(expectedResult).error()}; + } + + std::vector otherNodesData; + for (auto const& [uuid, nodeDataStr] : expectedResult.value()) { + if (uuid == *selfUuid_) { + continue; + } + + boost::system::error_code errorCode; + auto const json = boost::json::parse(nodeDataStr, errorCode); + if (errorCode.failed()) { + return std::unexpected{fmt::format("Error parsing json from DB: {}", nodeDataStr)}; + } + + auto expectedNodeData = boost::json::try_value_to(json); + if (expectedNodeData.has_error()) { + return std::unexpected{fmt::format("Error converting json to ClioNode: {}", nodeDataStr)}; + } + *expectedNodeData->uuid = uuid; + otherNodesData.push_back(std::move(expectedNodeData).value()); + } + otherNodesData.push_back(ClioNode::from(selfUuid_, *writerState_)); + return otherNodesData; +} + +void +Backend::doWrite() +{ + auto const selfData = ClioNode::from(selfUuid_, *writerState_); + boost::json::value jsonValue{}; + boost::json::value_from(selfData, jsonValue); + backend_->writeNodeMessage(*selfData.uuid, boost::json::serialize(jsonValue.as_object())); +} + +} // namespace cluster diff --git a/src/cluster/Backend.hpp b/src/cluster/Backend.hpp new file mode 100644 index 000000000..41ea73f83 --- /dev/null +++ b/src/cluster/Backend.hpp @@ -0,0 +1,147 @@ +//------------------------------------------------------------------------------ +/* + This file is part of clio: https://github.com/XRPLF/clio + Copyright (c) 2025, the clio developers. + + Permission to use, copy, modify, and distribute this software for any + purpose with or without fee is hereby granted, provided that the above + copyright notice and this permission notice appear in all copies. + + THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +*/ +//============================================================================== + +#pragma once + +#include "cluster/ClioNode.hpp" +#include "cluster/impl/RepeatedTask.hpp" +#include "data/BackendInterface.hpp" +#include "etl/WriterState.hpp" +#include "util/log/Logger.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace cluster { + +/** + * @brief Backend communication handler for cluster state synchronization. + * + * This class manages reading and writing cluster state information to/from the backend database. + * It periodically reads the state of other nodes in the cluster and writes the current node's state, + * enabling cluster-wide coordination and awareness. + */ +class Backend { +public: + /** @brief Type representing cluster data result - either a vector of nodes or an error message */ + using ClusterData = std::expected, std::string>; + +private: + util::Logger log_{"ClusterCommunication"}; + + std::shared_ptr backend_; + std::unique_ptr writerState_; + + impl::RepeatedTask readerTask_; + impl::RepeatedTask writerTask_; + + ClioNode::Uuid selfUuid_; + + boost::signals2::signal)> onNewState_; + +public: + /** + * @brief Construct a Backend communication handler. + * + * @param ctx The execution context for asynchronous operations + * @param backend Interface to the backend database + * @param writerState State indicating whether this node is writing to the database + * @param readInterval How often to read cluster state from the backend + * @param writeInterval How often to write this node's state to the backend + */ + Backend( + boost::asio::thread_pool& ctx, + std::shared_ptr backend, + std::unique_ptr writerState, + std::chrono::steady_clock::duration readInterval, + std::chrono::steady_clock::duration writeInterval + ); + + ~Backend(); + + Backend(Backend&&) = delete; + Backend& + operator=(Backend&&) = delete; + Backend(Backend const&) = delete; + Backend& + operator=(Backend const&) = delete; + + /** + * @brief Start the backend read and write tasks. + * + * Begins periodic reading of cluster state from the backend and writing of this node's state. + */ + void + run(); + + /** + * @brief Stop the backend read and write tasks. + * + * Stops all periodic tasks and waits for them to complete. + */ + void + stop(); + + /** + * @brief Subscribe to new cluster state notifications. + * + * @tparam S Callable type accepting (ClioNode::cUUID, ClusterData) + * @param s Subscriber callback to be invoked when new cluster state is available + * @return A connection object that can be used to unsubscribe + */ + template + requires std::invocable> + boost::signals2::connection + subscribeToNewState(S&& s) + { + return onNewState_.connect(s); + } + + /** + * @brief Get the UUID of this node in the cluster. + * + * @return The UUID of this node. + */ + ClioNode::CUuid + selfId() const; + +private: + ClusterData + doRead(boost::asio::yield_context yield); + + void + doWrite(); +}; + +} // namespace cluster diff --git a/src/cluster/CMakeLists.txt b/src/cluster/CMakeLists.txt index defd5853e..f6460bb23 100644 --- a/src/cluster/CMakeLists.txt +++ b/src/cluster/CMakeLists.txt @@ -1,5 +1,7 @@ add_library(clio_cluster) -target_sources(clio_cluster PRIVATE ClioNode.cpp ClusterCommunicationService.cpp) +target_sources( + clio_cluster PRIVATE Backend.cpp ClioNode.cpp ClusterCommunicationService.cpp Metrics.cpp WriterDecider.cpp +) target_link_libraries(clio_cluster PRIVATE clio_util clio_data) diff --git a/src/cluster/ClioNode.cpp b/src/cluster/ClioNode.cpp index e28585a90..a9dc528e6 100644 --- a/src/cluster/ClioNode.cpp +++ b/src/cluster/ClioNode.cpp @@ -19,6 +19,7 @@ #include "cluster/ClioNode.hpp" +#include "etl/WriterState.hpp" #include "util/TimeUtils.hpp" #include @@ -26,39 +27,72 @@ #include #include +#include +#include #include #include #include #include +#include namespace cluster { namespace { -struct Fields { +struct JsonFields { static constexpr std::string_view const kUPDATE_TIME = "update_time"; + static constexpr std::string_view const kDB_ROLE = "db_role"; }; } // namespace +ClioNode +ClioNode::from(ClioNode::Uuid uuid, etl::WriterStateInterface const& writerState) +{ + auto const dbRole = [&writerState]() { + if (writerState.isReadOnly()) { + return ClioNode::DbRole::ReadOnly; + } + if (writerState.isFallback()) { + return ClioNode::DbRole::Fallback; + } + if (writerState.isLoadingCache()) { + return ClioNode::DbRole::LoadingCache; + } + + return writerState.isWriting() ? ClioNode::DbRole::Writer : ClioNode::DbRole::NotWriter; + }(); + return ClioNode{.uuid = std::move(uuid), .updateTime = std::chrono::system_clock::now(), .dbRole = dbRole}; +} + void tag_invoke(boost::json::value_from_tag, boost::json::value& jv, ClioNode const& node) { jv = { - {Fields::kUPDATE_TIME, util::systemTpToUtcStr(node.updateTime, ClioNode::kTIME_FORMAT)}, + {JsonFields::kUPDATE_TIME, util::systemTpToUtcStr(node.updateTime, ClioNode::kTIME_FORMAT)}, + {JsonFields::kDB_ROLE, static_cast(node.dbRole)} }; } ClioNode tag_invoke(boost::json::value_to_tag, boost::json::value const& jv) { - auto const& updateTimeStr = jv.as_object().at(Fields::kUPDATE_TIME).as_string(); + auto const& updateTimeStr = jv.as_object().at(JsonFields::kUPDATE_TIME).as_string(); auto const updateTime = util::systemTpFromUtcStr(std::string(updateTimeStr), ClioNode::kTIME_FORMAT); if (!updateTime.has_value()) { throw std::runtime_error("Failed to parse update time"); } - return ClioNode{.uuid = std::make_shared(), .updateTime = updateTime.value()}; + auto const dbRoleValue = jv.as_object().at(JsonFields::kDB_ROLE).as_int64(); + if (dbRoleValue > static_cast(ClioNode::DbRole::MAX)) + throw std::runtime_error("Invalid db_role value"); + + return ClioNode{ + // Json data doesn't contain uuid so leaving it empty here. It will be filled outside of this parsing + .uuid = std::make_shared(), + .updateTime = updateTime.value(), + .dbRole = static_cast(dbRoleValue) + }; } } // namespace cluster diff --git a/src/cluster/ClioNode.hpp b/src/cluster/ClioNode.hpp index a350a3715..ea2a83e97 100644 --- a/src/cluster/ClioNode.hpp +++ b/src/cluster/ClioNode.hpp @@ -19,6 +19,8 @@ #pragma once +#include "etl/WriterState.hpp" + #include #include #include @@ -37,16 +39,37 @@ struct ClioNode { */ static constexpr char const* kTIME_FORMAT = "%Y-%m-%dT%H:%M:%SZ"; - // enum class WriterRole { - // ReadOnly, - // NotWriter, - // Writer - // }; + /** + * @brief Database role of a node in the cluster. + * + * Roles are used to coordinate which node writes to the database: + * - ReadOnly: Node is configured to never write (strict read-only mode) + * - NotWriter: Node can write but is currently not the designated writer + * - Writer: Node is actively writing to the database + * - Fallback: Node is using the fallback writer decision mechanism + * + * When any node in the cluster is in Fallback mode, the entire cluster switches + * from the cluster communication mechanism to the slower but more reliable + * database-based conflict detection mechanism. + */ + enum class DbRole { ReadOnly = 0, LoadingCache = 1, NotWriter = 2, Writer = 3, Fallback = 4, MAX = 4 }; - std::shared_ptr uuid; ///< The UUID of the node. + using Uuid = std::shared_ptr; + using CUuid = std::shared_ptr; + + Uuid uuid; ///< The UUID of the node. std::chrono::system_clock::time_point updateTime; ///< The time the data about the node was last updated. + DbRole dbRole; ///< The database role of the node - // WriterRole writerRole; + /** + * @brief Create a ClioNode from writer state. + * + * @param uuid The UUID of the node + * @param writerState The writer state to determine the node's database role + * @return A ClioNode with the current time and role derived from writerState + */ + static ClioNode + from(Uuid uuid, etl::WriterStateInterface const& writerState); }; void diff --git a/src/cluster/ClusterCommunicationService.cpp b/src/cluster/ClusterCommunicationService.cpp index a0aba56b9..79f89836e 100644 --- a/src/cluster/ClusterCommunicationService.cpp +++ b/src/cluster/ClusterCommunicationService.cpp @@ -19,11 +19,8 @@ #include "cluster/ClusterCommunicationService.hpp" -#include "cluster/ClioNode.hpp" #include "data/BackendInterface.hpp" -#include "util/Assert.hpp" -#include "util/Spawn.hpp" -#include "util/log/Logger.hpp" +#include "etl/WriterState.hpp" #include #include @@ -41,76 +38,32 @@ #include #include -#include #include -#include #include -#include - -namespace { -constexpr auto kTOTAL_WORKERS = 2uz; // 1 reading and 1 writing worker (coroutines) -} // namespace namespace cluster { ClusterCommunicationService::ClusterCommunicationService( std::shared_ptr backend, + std::unique_ptr writerState, std::chrono::steady_clock::duration readInterval, std::chrono::steady_clock::duration writeInterval ) - : backend_(std::move(backend)) - , readInterval_(readInterval) - , writeInterval_(writeInterval) - , finishedCountdown_(kTOTAL_WORKERS) - , selfData_{ClioNode{ - .uuid = std::make_shared(boost::uuids::random_generator{}()), - .updateTime = std::chrono::system_clock::time_point{} - }} + : backend_(ctx_, std::move(backend), writerState->clone(), readInterval, writeInterval) + , writerDecider_(ctx_, std::move(writerState)) { - nodesInClusterMetric_.set(1); // The node always sees itself - isHealthy_ = true; } void ClusterCommunicationService::run() { - ASSERT(not running_ and not stopped_, "Can only be ran once"); - running_ = true; - - util::spawn(strand_, [this](boost::asio::yield_context yield) { - boost::asio::steady_timer timer(yield.get_executor()); - boost::system::error_code ec; - - while (running_) { - timer.expires_after(readInterval_); - auto token = cancelSignal_.slot(); - timer.async_wait(boost::asio::bind_cancellation_slot(token, yield[ec])); - - if (ec == boost::asio::error::operation_aborted or not running_) - break; - - doRead(yield); - } - - finishedCountdown_.count_down(1); + backend_.subscribeToNewState([this](auto&&... args) { + metrics_.onNewState(std::forward(args)...); }); - - util::spawn(strand_, [this](boost::asio::yield_context yield) { - boost::asio::steady_timer timer(yield.get_executor()); - boost::system::error_code ec; - - while (running_) { - doWrite(); - timer.expires_after(writeInterval_); - auto token = cancelSignal_.slot(); - timer.async_wait(boost::asio::bind_cancellation_slot(token, yield[ec])); - - if (ec == boost::asio::error::operation_aborted or not running_) - break; - } - - finishedCountdown_.count_down(1); + backend_.subscribeToNewState([this](auto&&... args) { + writerDecider_.onNewState(std::forward(args)...); }); + backend_.run(); } ClusterCommunicationService::~ClusterCommunicationService() @@ -121,107 +74,7 @@ ClusterCommunicationService::~ClusterCommunicationService() void ClusterCommunicationService::stop() { - if (stopped_) - return; - - stopped_ = true; - - // for ASAN to see through concurrency correctly we need to exit all coroutines before joining the ctx - running_ = false; - - // cancelSignal_ is not thread safe so we execute emit on the same strand - boost::asio::spawn( - strand_, [this](auto&&) { cancelSignal_.emit(boost::asio::cancellation_type::all); }, boost::asio::use_future - ) - .wait(); - finishedCountdown_.wait(); - - ctx_.join(); -} - -std::shared_ptr -ClusterCommunicationService::selfUuid() const -{ - // Uuid never changes so it is safe to copy it without using strand_ - return selfData_.uuid; -} - -ClioNode -ClusterCommunicationService::selfData() const -{ - ClioNode result{}; - util::spawn(strand_, [this, &result](boost::asio::yield_context) { result = selfData_; }); - return result; -} - -std::expected, std::string> -ClusterCommunicationService::clusterData() const -{ - if (not isHealthy_) { - return std::unexpected{"Service is not healthy"}; - } - std::vector result; - util::spawn(strand_, [this, &result](boost::asio::yield_context) { - result = otherNodesData_; - result.push_back(selfData_); - }); - return result; -} - -void -ClusterCommunicationService::doRead(boost::asio::yield_context yield) -{ - otherNodesData_.clear(); - - BackendInterface::ClioNodesDataFetchResult expectedResult; - try { - expectedResult = backend_->fetchClioNodesData(yield); - } catch (...) { - expectedResult = std::unexpected{"Failed to fecth Clio nodes data"}; - } - - if (!expectedResult.has_value()) { - LOG(log_.error()) << "Failed to fetch nodes data"; - isHealthy_ = false; - return; - } - - // Create a new vector here to not have partially parsed data in otherNodesData_ - std::vector otherNodesData; - for (auto const& [uuid, nodeDataStr] : expectedResult.value()) { - if (uuid == *selfData_.uuid) { - continue; - } - - boost::system::error_code errorCode; - auto const json = boost::json::parse(nodeDataStr, errorCode); - if (errorCode.failed()) { - LOG(log_.error()) << "Error parsing json from DB: " << nodeDataStr; - isHealthy_ = false; - return; - } - - auto expectedNodeData = boost::json::try_value_to(json); - if (expectedNodeData.has_error()) { - LOG(log_.error()) << "Error converting json to ClioNode: " << json; - isHealthy_ = false; - return; - } - *expectedNodeData->uuid = uuid; - otherNodesData.push_back(std::move(expectedNodeData).value()); - } - otherNodesData_ = std::move(otherNodesData); - nodesInClusterMetric_.set(otherNodesData_.size() + 1); - isHealthy_ = true; -} - -void -ClusterCommunicationService::doWrite() -{ - selfData_.updateTime = std::chrono::system_clock::now(); - boost::json::value jsonValue{}; - boost::json::value_from(selfData_, jsonValue); - backend_->writeNodeMessage(*selfData_.uuid, boost::json::serialize(jsonValue.as_object())); + backend_.stop(); } } // namespace cluster diff --git a/src/cluster/ClusterCommunicationService.hpp b/src/cluster/ClusterCommunicationService.hpp index 3814271e6..f73a7cd32 100644 --- a/src/cluster/ClusterCommunicationService.hpp +++ b/src/cluster/ClusterCommunicationService.hpp @@ -19,13 +19,12 @@ #pragma once -#include "cluster/ClioNode.hpp" -#include "cluster/ClusterCommunicationServiceInterface.hpp" +#include "cluster/Backend.hpp" +#include "cluster/Concepts.hpp" +#include "cluster/Metrics.hpp" +#include "cluster/WriterDecider.hpp" #include "data/BackendInterface.hpp" -#include "util/log/Logger.hpp" -#include "util/prometheus/Bool.hpp" -#include "util/prometheus/Gauge.hpp" -#include "util/prometheus/Prometheus.hpp" +#include "etl/WriterState.hpp" #include #include @@ -33,67 +32,49 @@ #include #include -#include #include -#include #include -#include -#include namespace cluster { /** * @brief Service to post and read messages to/from the cluster. It uses a backend to communicate with the cluster. */ -class ClusterCommunicationService : public ClusterCommunicationServiceInterface { - util::prometheus::GaugeInt& nodesInClusterMetric_ = PrometheusService::gaugeInt( - "cluster_nodes_total_number", - {}, - "Total number of nodes this node can detect in the cluster." - ); - util::prometheus::Bool isHealthy_ = PrometheusService::boolMetric( - "cluster_communication_is_healthy", - {}, - "Whether cluster communication service is operating healthy (1 - healthy, 0 - we have a problem)" - ); - +class ClusterCommunicationService : public ClusterCommunicationServiceTag { // TODO: Use util::async::CoroExecutionContext after https://github.com/XRPLF/clio/issues/1973 is implemented boost::asio::thread_pool ctx_{1}; - boost::asio::strand strand_ = boost::asio::make_strand(ctx_); - - util::Logger log_{"ClusterCommunication"}; - - std::shared_ptr backend_; - - std::chrono::steady_clock::duration readInterval_; - std::chrono::steady_clock::duration writeInterval_; - - boost::asio::cancellation_signal cancelSignal_; - std::latch finishedCountdown_; - std::atomic_bool running_ = false; - bool stopped_ = false; - - ClioNode selfData_; - std::vector otherNodesData_; + Backend backend_; + Metrics metrics_; + WriterDecider writerDecider_; public: - static constexpr std::chrono::milliseconds kDEFAULT_READ_INTERVAL{2100}; - static constexpr std::chrono::milliseconds kDEFAULT_WRITE_INTERVAL{1200}; + static constexpr std::chrono::milliseconds kDEFAULT_READ_INTERVAL{1000}; + static constexpr std::chrono::milliseconds kDEFAULT_WRITE_INTERVAL{1000}; + /** * @brief Construct a new Cluster Communication Service object. * * @param backend The backend to use for communication. + * @param writerState The state showing whether clio is writing to the database. * @param readInterval The interval to read messages from the cluster. * @param writeInterval The interval to write messages to the cluster. */ ClusterCommunicationService( std::shared_ptr backend, + std::unique_ptr writerState, std::chrono::steady_clock::duration readInterval = kDEFAULT_READ_INTERVAL, std::chrono::steady_clock::duration writeInterval = kDEFAULT_WRITE_INTERVAL ); ~ClusterCommunicationService() override; + ClusterCommunicationService(ClusterCommunicationService&&) = delete; + ClusterCommunicationService(ClusterCommunicationService const&) = delete; + ClusterCommunicationService& + operator=(ClusterCommunicationService&&) = delete; + ClusterCommunicationService& + operator=(ClusterCommunicationService const&) = delete; + /** * @brief Start the service. */ @@ -105,44 +86,6 @@ public: */ void stop(); - - ClusterCommunicationService(ClusterCommunicationService&&) = delete; - ClusterCommunicationService(ClusterCommunicationService const&) = delete; - ClusterCommunicationService& - operator=(ClusterCommunicationService&&) = delete; - ClusterCommunicationService& - operator=(ClusterCommunicationService const&) = delete; - - /** - * @brief Get the UUID of the current node. - * - * @return The UUID of the current node. - */ - std::shared_ptr - selfUuid() const; - - /** - * @brief Get the data of the current node. - * - * @return The data of the current node. - */ - ClioNode - selfData() const override; - - /** - * @brief Get the data of all nodes in the cluster (including self). - * - * @return The data of all nodes in the cluster or error if the service is not healthy. - */ - std::expected, std::string> - clusterData() const override; - -private: - void - doRead(boost::asio::yield_context yield); - - void - doWrite(); }; } // namespace cluster diff --git a/src/cluster/Concepts.hpp b/src/cluster/Concepts.hpp new file mode 100644 index 000000000..340cb0c62 --- /dev/null +++ b/src/cluster/Concepts.hpp @@ -0,0 +1,39 @@ +//------------------------------------------------------------------------------ +/* + This file is part of clio: https://github.com/XRPLF/clio + Copyright (c) 2025, the clio developers. + + Permission to use, copy, modify, and distribute this software for any + purpose with or without fee is hereby granted, provided that the above + copyright notice and this permission notice appear in all copies. + + THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +*/ +//============================================================================== + +#pragma once + +#include + +namespace cluster { + +/** + * @brief Tag type for cluster communication service implementations. + * + * This tag is used to identify types that implement cluster communication functionality. + * Types should inherit from this tag to be recognized as cluster communication services. + */ +struct ClusterCommunicationServiceTag { + virtual ~ClusterCommunicationServiceTag() = default; +}; + +template +concept SomeClusterCommunicationService = std::derived_from; + +} // namespace cluster diff --git a/src/cluster/ClusterCommunicationServiceInterface.hpp b/src/cluster/Metrics.cpp similarity index 60% rename from src/cluster/ClusterCommunicationServiceInterface.hpp rename to src/cluster/Metrics.cpp index 6e79460c5..371a7a203 100644 --- a/src/cluster/ClusterCommunicationServiceInterface.hpp +++ b/src/cluster/Metrics.cpp @@ -17,38 +17,31 @@ */ //============================================================================== -#pragma once +#include "cluster/Metrics.hpp" +#include "cluster/Backend.hpp" #include "cluster/ClioNode.hpp" -#include -#include -#include +#include namespace cluster { -/** - * @brief Interface for the cluster communication service. - */ -class ClusterCommunicationServiceInterface { -public: - virtual ~ClusterCommunicationServiceInterface() = default; +Metrics::Metrics() +{ + nodesInClusterMetric_.set(1); // The node always sees itself + isHealthy_ = true; +} - /** - * @brief Get the data of the current node. - * - * @return The data of the current node. - */ - [[nodiscard]] virtual ClioNode - selfData() const = 0; - - /** - * @brief Get the data of all nodes in the cluster (including self). - * - * @return The data of all nodes in the cluster or error if the service is not healthy. - */ - [[nodiscard]] virtual std::expected, std::string> - clusterData() const = 0; -}; +void +Metrics::onNewState(ClioNode::CUuid, std::shared_ptr clusterData) +{ + if (clusterData->has_value()) { + isHealthy_ = true; + nodesInClusterMetric_.set(clusterData->value().size()); + } else { + isHealthy_ = false; + nodesInClusterMetric_.set(1); + } +} } // namespace cluster diff --git a/src/cluster/Metrics.hpp b/src/cluster/Metrics.hpp new file mode 100644 index 000000000..affea41e7 --- /dev/null +++ b/src/cluster/Metrics.hpp @@ -0,0 +1,76 @@ +//------------------------------------------------------------------------------ +/* + This file is part of clio: https://github.com/XRPLF/clio + Copyright (c) 2025, the clio developers. + + Permission to use, copy, modify, and distribute this software for any + purpose with or without fee is hereby granted, provided that the above + copyright notice and this permission notice appear in all copies. + + THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +*/ +//============================================================================== + +#pragma once + +#include "cluster/Backend.hpp" +#include "cluster/ClioNode.hpp" +#include "util/prometheus/Bool.hpp" +#include "util/prometheus/Gauge.hpp" +#include "util/prometheus/Prometheus.hpp" + +#include + +namespace cluster { + +/** + * @brief Manages Prometheus metrics for cluster communication and node tracking. + * + * This class tracks cluster-related metrics including: + * - Total number of nodes detected in the cluster + * - Health status of cluster communication + */ +class Metrics { + /** @brief Gauge tracking the total number of nodes visible in the cluster */ + util::prometheus::GaugeInt& nodesInClusterMetric_ = PrometheusService::gaugeInt( + "cluster_nodes_total_number", + {}, + "Total number of nodes this node can detect in the cluster." + ); + + /** @brief Boolean metric indicating whether cluster communication is healthy */ + util::prometheus::Bool isHealthy_ = PrometheusService::boolMetric( + "cluster_communication_is_healthy", + {}, + "Whether cluster communication service is operating healthy (1 - healthy, 0 - we have a problem)" + ); + +public: + /** + * @brief Constructs a Metrics instance and initializes metrics. + * + * Sets the initial node count to 1 (self) and marks communication as healthy. + */ + Metrics(); + + /** + * @brief Updates metrics based on new cluster state. + * + * This callback is invoked when cluster state changes. It updates: + * - Health status based on whether cluster data is available + * - Node count to reflect the current cluster size + * + * @param uuid The UUID of the node (unused in current implementation) + * @param clusterData Shared pointer to the current cluster data; may be empty if communication failed + */ + void + onNewState(ClioNode::CUuid uuid, std::shared_ptr clusterData); +}; + +} // namespace cluster diff --git a/src/cluster/WriterDecider.cpp b/src/cluster/WriterDecider.cpp new file mode 100644 index 000000000..4713ca807 --- /dev/null +++ b/src/cluster/WriterDecider.cpp @@ -0,0 +1,98 @@ +//------------------------------------------------------------------------------ +/* + This file is part of clio: https://github.com/XRPLF/clio + Copyright (c) 2025, the clio developers. + + Permission to use, copy, modify, and distribute this software for any + purpose with or without fee is hereby granted, provided that the above + copyright notice and this permission notice appear in all copies. + + THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +*/ +//============================================================================== + +#include "cluster/WriterDecider.hpp" + +#include "cluster/Backend.hpp" +#include "cluster/ClioNode.hpp" +#include "etl/WriterState.hpp" +#include "util/Assert.hpp" +#include "util/Spawn.hpp" + +#include + +#include +#include +#include +#include + +namespace cluster { + +WriterDecider::WriterDecider(boost::asio::thread_pool& ctx, std::unique_ptr writerState) + : ctx_(ctx), writerState_(std::move(writerState)) +{ +} + +void +WriterDecider::onNewState(ClioNode::CUuid selfId, std::shared_ptr clusterData) +{ + if (not clusterData->has_value()) + return; + + util::spawn( + ctx_, + [writerState = writerState_->clone(), + selfId = std::move(selfId), + clusterData = clusterData->value()](auto&&) mutable { + auto const selfData = + std::ranges::find_if(clusterData, [&selfId](ClioNode const& node) { return node.uuid == selfId; }); + ASSERT(selfData != clusterData.end(), "Self data should always be in the cluster data"); + + if (selfData->dbRole == ClioNode::DbRole::Fallback) { + return; + } + + if (selfData->dbRole == ClioNode::DbRole::ReadOnly) { + writerState->giveUpWriting(); + return; + } + + // If any node in the cluster is in Fallback mode, the entire cluster must switch + // to the fallback writer decision mechanism for consistency + if (std::ranges::any_of(clusterData, [](ClioNode const& node) { + return node.dbRole == ClioNode::DbRole::Fallback; + })) { + writerState->setWriterDecidingFallback(); + return; + } + + // We are not ReadOnly and there is no Fallback in the cluster + std::ranges::sort(clusterData, [](ClioNode const& lhs, ClioNode const& rhs) { + return *lhs.uuid < *rhs.uuid; + }); + + auto const it = std::ranges::find_if(clusterData, [](ClioNode const& node) { + return node.dbRole == ClioNode::DbRole::NotWriter or node.dbRole == ClioNode::DbRole::Writer; + }); + + if (it == clusterData.end()) { + // No writer nodes in the cluster yet + return; + } + + if (*it->uuid == *selfId) { + writerState->startWriting(); + } else { + writerState->giveUpWriting(); + } + } + ); +} + +} // namespace cluster diff --git a/src/cluster/WriterDecider.hpp b/src/cluster/WriterDecider.hpp new file mode 100644 index 000000000..0a3c0bfce --- /dev/null +++ b/src/cluster/WriterDecider.hpp @@ -0,0 +1,75 @@ +//------------------------------------------------------------------------------ +/* + This file is part of clio: https://github.com/XRPLF/clio + Copyright (c) 2025, the clio developers. + + Permission to use, copy, modify, and distribute this software for any + purpose with or without fee is hereby granted, provided that the above + copyright notice and this permission notice appear in all copies. + + THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +*/ +//============================================================================== + +#pragma once + +#include "cluster/Backend.hpp" +#include "cluster/ClioNode.hpp" +#include "etl/WriterState.hpp" + +#include + +#include + +namespace cluster { + +/** + * @brief Decides which node in the cluster should be the writer based on cluster state. + * + * This class monitors cluster state changes and determines whether the current node + * should act as the writer to the database. The decision is made by: + * 1. Sorting all nodes by UUID for deterministic ordering + * 2. Selecting the first node that is allowed to write (not ReadOnly) + * 3. Activating writing on this node if it's the current node, otherwise deactivating + * + * This ensures only one node in the cluster actively writes to the database at a time. + */ +class WriterDecider { + /** @brief Thread pool for spawning asynchronous tasks */ + boost::asio::thread_pool& ctx_; + + /** @brief Interface for controlling the writer state of this node */ + std::unique_ptr writerState_; + +public: + /** + * @brief Constructs a WriterDecider. + * + * @param ctx Thread pool for executing asynchronous operations + * @param writerState Writer state interface for controlling write operations + */ + WriterDecider(boost::asio::thread_pool& ctx, std::unique_ptr writerState); + + /** + * @brief Handles cluster state changes and decides whether this node should be the writer. + * + * This method is called when cluster state changes. It asynchronously: + * - Sorts all nodes by UUID to establish a deterministic order + * - Identifies the first node allowed to write (not ReadOnly) + * - Activates writing if this node is selected, otherwise deactivates writing + * - Logs a warning if no nodes in the cluster are allowed to write + * + * @param selfId The UUID of the current node + * @param clusterData Shared pointer to current cluster data; may be empty if communication failed + */ + void + onNewState(ClioNode::CUuid selfId, std::shared_ptr clusterData); +}; + +} // namespace cluster diff --git a/src/cluster/impl/RepeatedTask.hpp b/src/cluster/impl/RepeatedTask.hpp new file mode 100644 index 000000000..9037c6a47 --- /dev/null +++ b/src/cluster/impl/RepeatedTask.hpp @@ -0,0 +1,104 @@ +//------------------------------------------------------------------------------ +/* + This file is part of clio: https://github.com/XRPLF/clio + Copyright (c) 2025, the clio developers. + + Permission to use, copy, modify, and distribute this software for any + purpose with or without fee is hereby granted, provided that the above + copyright notice and this permission notice appear in all copies. + + THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +*/ +//============================================================================== + +#pragma once + +#include "util/Assert.hpp" +#include "util/Spawn.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace cluster::impl { + +// TODO: Try to replace util::Repeat by this. https://github.com/XRPLF/clio/issues/2926 +template +class RepeatedTask { + std::chrono::steady_clock::duration interval_; + boost::asio::strand strand_; + + enum class State { Running, Stopped }; + std::atomic state_ = State::Stopped; + + std::binary_semaphore semaphore_{0}; + boost::asio::steady_timer timer_; + +public: + RepeatedTask(std::chrono::steady_clock::duration interval, Context& ctx) + : interval_(interval), strand_(boost::asio::make_strand(ctx)), timer_(strand_) + { + } + + ~RepeatedTask() + { + stop(); + } + + template + requires std::invocable or std::invocable + void + run(Fn&& f) + { + ASSERT(state_ == State::Stopped, "Can only be ran once"); + state_ = State::Running; + util::spawn(strand_, [this, f = std::forward(f)](boost::asio::yield_context yield) { + boost::system::error_code ec; + + while (state_ == State::Running) { + timer_.expires_after(interval_); + timer_.async_wait(yield[ec]); + + if (ec or state_ != State::Running) + break; + + if constexpr (std::invocable) { + f(yield); + } else { + f(); + } + } + + semaphore_.release(); + }); + } + + void + stop() + { + if (auto expected = State::Running; not state_.compare_exchange_strong(expected, State::Stopped)) + return; // Already stopped or not started + + boost::asio::spawn(strand_, [this](auto&&) { timer_.cancel(); }, boost::asio::use_future).wait(); + semaphore_.acquire(); + } +}; + +} // namespace cluster::impl diff --git a/src/etl/CMakeLists.txt b/src/etl/CMakeLists.txt index d6e8557db..466b0cec1 100644 --- a/src/etl/CMakeLists.txt +++ b/src/etl/CMakeLists.txt @@ -11,6 +11,7 @@ target_sources( NetworkValidatedLedgers.cpp NFTHelpers.cpp Source.cpp + WriterState.cpp impl/AmendmentBlockHandler.cpp impl/AsyncGrpcCall.cpp impl/Extraction.cpp diff --git a/src/etl/ETLService.cpp b/src/etl/ETLService.cpp index 6f8c0d4ee..433721c28 100644 --- a/src/etl/ETLService.cpp +++ b/src/etl/ETLService.cpp @@ -78,6 +78,7 @@ namespace etl { std::shared_ptr ETLService::makeETLService( util::config::ClioConfigDefinition const& config, + std::shared_ptr state, util::async::AnyExecutionContext ctx, std::shared_ptr backend, std::shared_ptr subscriptions, @@ -87,9 +88,6 @@ ETLService::makeETLService( { std::shared_ptr ret; - auto state = std::make_shared(); - state->isStrictReadonly = config.get("read_only"); - auto fetcher = std::make_shared(backend, balancer); auto extractor = std::make_shared(fetcher); auto publisher = std::make_shared(ctx, backend, subscriptions, *state); @@ -173,6 +171,7 @@ ETLService::ETLService( , state_(std::move(state)) , startSequence_(config.get().maybeValue("start_sequence")) , finishSequence_(config.get().maybeValue("finish_sequence")) + , writeCommandStrand_(ctx_.makeStrand()) { ASSERT(not state_->isWriting, "ETL should never start in writer mode"); @@ -213,14 +212,13 @@ ETLService::run() return; } - auto nextSequence = rng->maxSequence + 1; - if (backend_->cache().latestLedgerSequence() != 0) { - nextSequence = backend_->cache().latestLedgerSequence(); - } - + auto const nextSequence = syncCacheWithDb(); LOG(log_.debug()) << "Database is populated. Starting monitor loop. sequence = " << nextSequence; + startMonitor(nextSequence); + state_->isLoadingCache = false; + // If we are a writer as the result of loading the initial ledger - start loading if (state_->isWriting) startLoading(nextSequence); @@ -232,6 +230,13 @@ ETLService::stop() { LOG(log_.info()) << "Stop called"; + systemStateWriteCommandSubscription_.disconnect(); + auto count = runningWriteCommandHandlers_.load(); + while (count != 0) { + runningWriteCommandHandlers_.wait(count); // Blocks until value changes + count = runningWriteCommandHandlers_.load(); + } + if (mainLoop_) mainLoop_->wait(); if (taskMan_) @@ -343,35 +348,77 @@ ETLService::loadInitialLedgerIfNeeded() return rng; } +uint32_t +ETLService::syncCacheWithDb() +{ + auto rng = backend_->hardFetchLedgerRangeNoThrow(); + + while (not backend_->cache().isDisabled() and rng->maxSequence > backend_->cache().latestLedgerSequence()) { + LOG(log_.info()) << "Syncing cache with DB. DB latest seq: " << rng->maxSequence + << ". Cache latest seq: " << backend_->cache().latestLedgerSequence(); + for (auto seq = backend_->cache().latestLedgerSequence(); seq <= rng->maxSequence; ++seq) { + LOG(log_.info()) << "ETLService (via syncCacheWithDb) got new seq from db: " << seq; + updateCache(seq); + } + rng = backend_->hardFetchLedgerRangeNoThrow(); + } + return rng->maxSequence + 1; +} + +void +ETLService::updateCache(uint32_t seq) +{ + auto const cacheNeedsUpdate = backend_->cache().latestLedgerSequence() < seq; + auto const backendRange = backend_->fetchLedgerRange(); + auto const backendNeedsUpdate = backendRange.has_value() and backendRange->maxSequence < seq; + + if (cacheNeedsUpdate) { + auto const diff = data::synchronousAndRetryOnTimeout([this, seq](auto yield) { + return backend_->fetchLedgerDiff(seq, yield); + }); + cacheUpdater_->update(seq, diff); + } + + if (backendNeedsUpdate) + backend_->updateRange(seq); + + publisher_->publish(seq, {}); +} + void ETLService::startMonitor(uint32_t seq) { monitor_ = monitorProvider_->make(ctx_, backend_, ledgers_, seq); + systemStateWriteCommandSubscription_ = + state_->writeCommandSignal.connect([this](SystemState::WriteCommand command) { + ++runningWriteCommandHandlers_; + writeCommandStrand_.submit([this, command]() { + switch (command) { + case etl::SystemState::WriteCommand::StartWriting: + attemptTakeoverWriter(); + break; + case etl::SystemState::WriteCommand::StopWriting: + giveUpWriter(); + break; + } + --runningWriteCommandHandlers_; + runningWriteCommandHandlers_.notify_one(); + }); + }); + monitorNewSeqSubscription_ = monitor_->subscribeToNewSequence([this](uint32_t seq) { LOG(log_.info()) << "ETLService (via Monitor) got new seq from db: " << seq; - - if (state_->writeConflict) { - LOG(log_.info()) << "Got a write conflict; Giving up writer seat immediately"; - giveUpWriter(); - } - - if (not state_->isWriting) { - auto const diff = data::synchronousAndRetryOnTimeout([this, seq](auto yield) { - return backend_->fetchLedgerDiff(seq, yield); - }); - - cacheUpdater_->update(seq, diff); - backend_->updateRange(seq); - } - - publisher_->publish(seq, {}); + updateCache(seq); }); monitorDbStalledSubscription_ = monitor_->subscribeToDbStalled([this]() { LOG(log_.warn()) << "ETLService received DbStalled signal from Monitor"; + // Database stall detected - no writer has been active for 10 seconds + // This triggers the fallback mechanism and attempts to become the writer if (not state_->isStrictReadonly and not state_->isWriting) - attemptTakeoverWriter(); + state_->writeCommandSignal(SystemState::WriteCommand::StartWriting); + state_->isWriterDecidingFallback = true; }); monitor_->run(); @@ -394,6 +441,13 @@ ETLService::attemptTakeoverWriter() auto rng = backend_->hardFetchLedgerRangeNoThrow(); ASSERT(rng.has_value(), "Ledger range can't be null"); + if (backend_->cache().latestLedgerSequence() != rng->maxSequence) { + LOG(log_.info()) << "Wanted to take over the ETL writer seat but LedgerCache is outdated"; + // Give ETL time to update LedgerCache. This method will be called because ClusterCommunication will likely to + // continue sending StartWriting signal every 1 second + return; + } + state_->isWriting = true; // switch to writer LOG(log_.info()) << "Taking over the ETL writer seat"; startLoading(rng->maxSequence + 1); @@ -404,7 +458,7 @@ ETLService::giveUpWriter() { ASSERT(not state_->isStrictReadonly, "This should only happen on writer nodes"); state_->isWriting = false; - state_->writeConflict = false; + LOG(log_.info()) << "Giving up writer seat"; taskMan_ = nullptr; } diff --git a/src/etl/ETLService.hpp b/src/etl/ETLService.hpp index 45185d4be..5effa8eec 100644 --- a/src/etl/ETLService.hpp +++ b/src/etl/ETLService.hpp @@ -52,6 +52,7 @@ #include "feed/SubscriptionManagerInterface.hpp" #include "util/async/AnyExecutionContext.hpp" #include "util/async/AnyOperation.hpp" +#include "util/async/AnyStrand.hpp" #include "util/config/ConfigDefinition.hpp" #include "util/log/Logger.hpp" @@ -69,12 +70,12 @@ #include #include +#include #include #include #include #include #include -#include namespace etl { @@ -117,6 +118,9 @@ class ETLService : public ETLServiceInterface { boost::signals2::scoped_connection monitorNewSeqSubscription_; boost::signals2::scoped_connection monitorDbStalledSubscription_; + boost::signals2::scoped_connection systemStateWriteCommandSubscription_; + util::async::AnyStrand writeCommandStrand_; + std::atomic runningWriteCommandHandlers_{0}; std::optional> mainLoop_; @@ -127,6 +131,7 @@ public: * Creates and runs the ETL service. * * @param config The configuration to use + * @param state The system state tracking object * @param ctx Execution context for asynchronous operations * @param backend BackendInterface implementation * @param subscriptions Subscription manager @@ -137,6 +142,7 @@ public: static std::shared_ptr makeETLService( util::config::ClioConfigDefinition const& config, + std::shared_ptr state, util::async::AnyExecutionContext ctx, std::shared_ptr backend, std::shared_ptr subscriptions, @@ -160,7 +166,7 @@ public: * @param initialLoadObserver The observer for initial data loading * @param taskManagerProvider The provider of the task manager instance * @param monitorProvider The provider of the monitor instance - * @param state System state tracking object + * @param state The system state tracking object */ ETLService( util::async::AnyExecutionContext ctx, @@ -206,6 +212,12 @@ private: std::optional loadInitialLedgerIfNeeded(); + [[nodiscard]] uint32_t + syncCacheWithDb(); + + void + updateCache(uint32_t seq); + void startMonitor(uint32_t seq); diff --git a/src/etl/SystemState.hpp b/src/etl/SystemState.hpp index 7f841665f..5f0f12f65 100644 --- a/src/etl/SystemState.hpp +++ b/src/etl/SystemState.hpp @@ -19,11 +19,16 @@ #pragma once +#include "util/config/ConfigDefinition.hpp" +#include "util/log/Logger.hpp" #include "util/prometheus/Bool.hpp" #include "util/prometheus/Label.hpp" #include "util/prometheus/Prometheus.hpp" -#include +#include +#include + +#include namespace etl { @@ -31,6 +36,25 @@ namespace etl { * @brief Represents the state of the ETL subsystem. */ struct SystemState { + SystemState() + { + isLoadingCache = true; + } + + /** + * @brief Factory method to create a SystemState instance. + * + * @param config The configuration to use for initializing the system state + * @return A shared pointer to the newly created SystemState + */ + static std::shared_ptr + makeSystemState(util::config::ClioConfigDefinition const& config) + { + auto state = std::make_shared(); + state->isStrictReadonly = config.get("read_only"); + return state; + } + /** * @brief Whether the process is in strict read-only mode. * @@ -50,8 +74,31 @@ struct SystemState { "Whether the process is writing to the database" ); - std::atomic_bool isStopping = false; /**< @brief Whether the software is stopping. */ - std::atomic_bool writeConflict = false; /**< @brief Whether a write conflict was detected. */ + /** @brief Whether the process is still loading cache after startup. */ + util::prometheus::Bool isLoadingCache = PrometheusService::boolMetric( + "etl_loading_cache", + util::prometheus::Labels{}, + "Whether etl is loading cache after clio startup" + ); + + /** + * @brief Commands for controlling the ETL writer state. + * + * These commands are emitted via writeCommandSignal to coordinate writer state transitions across components. + */ + enum class WriteCommand { + StartWriting, /**< Request to attempt taking over as the ETL writer */ + StopWriting /**< Request to give up the ETL writer role (e.g., due to write conflict) */ + }; + + /** + * @brief Signal for coordinating ETL writer state transitions. + * + * This signal allows components to request changes to the writer state without direct coupling. + * - Emitted with StartWriting when database stalls and node should attempt to become writer + * - Emitted with StopWriting when write conflicts are detected + */ + boost::signals2::signal writeCommandSignal; /** * @brief Whether clio detected an amendment block. @@ -77,6 +124,24 @@ struct SystemState { util::prometheus::Labels{}, "Whether clio detected a corruption that needs manual attention" ); + + /** + * @brief Whether the cluster is using the fallback writer decision mechanism. + * + * The fallback mechanism is triggered when: + * - The database stalls for 10 seconds (detected by Monitor), indicating no active writer + * - A write conflict is detected, indicating multiple nodes attempting to write simultaneously + * + * When fallback mode is active, the cluster stops using the cluster communication mechanism + * (TTL-based role announcements) and relies on the slower but more reliable database-based + * conflict detection. This flag propagates across the cluster - if any node enters fallback + * mode, all nodes in the cluster will switch to fallback mode. + */ + util::prometheus::Bool isWriterDecidingFallback = PrometheusService::boolMetric( + "etl_writing_deciding_fallback", + util::prometheus::Labels{}, + "Whether the cluster is using the fallback writer decision mechanism" + ); }; } // namespace etl diff --git a/src/etl/WriterState.cpp b/src/etl/WriterState.cpp new file mode 100644 index 000000000..8947febd4 --- /dev/null +++ b/src/etl/WriterState.cpp @@ -0,0 +1,88 @@ +//------------------------------------------------------------------------------ +/* + This file is part of clio: https://github.com/XRPLF/clio + Copyright (c) 2025, the clio developers. + + Permission to use, copy, modify, and distribute this software for any + purpose with or without fee is hereby granted, provided that the above + copyright notice and this permission notice appear in all copies. + + THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +*/ +//============================================================================== + +#include "etl/WriterState.hpp" + +#include "etl/SystemState.hpp" + +#include +#include + +namespace etl { + +WriterState::WriterState(std::shared_ptr state) : systemState_(std::move(state)) +{ +} + +bool +WriterState::isReadOnly() const +{ + return systemState_->isStrictReadonly; +} + +bool +WriterState::isWriting() const +{ + return systemState_->isWriting; +} + +void +WriterState::startWriting() +{ + if (isWriting()) + return; + + systemState_->writeCommandSignal(SystemState::WriteCommand::StartWriting); +} + +void +WriterState::giveUpWriting() +{ + if (not isWriting()) + return; + + systemState_->writeCommandSignal(SystemState::WriteCommand::StopWriting); +} + +void +WriterState::setWriterDecidingFallback() +{ + systemState_->isWriterDecidingFallback = true; +} + +bool +WriterState::isFallback() const +{ + return systemState_->isWriterDecidingFallback; +} + +bool +WriterState::isLoadingCache() const +{ + return systemState_->isLoadingCache; +} + +std::unique_ptr +WriterState::clone() const +{ + auto c = WriterState(*this); + return std::make_unique(std::move(c)); +} + +} // namespace etl diff --git a/src/etl/WriterState.hpp b/src/etl/WriterState.hpp new file mode 100644 index 000000000..b30c5162a --- /dev/null +++ b/src/etl/WriterState.hpp @@ -0,0 +1,193 @@ +//------------------------------------------------------------------------------ +/* + This file is part of clio: https://github.com/XRPLF/clio + Copyright (c) 2025, the clio developers. + + Permission to use, copy, modify, and distribute this software for any + purpose with or without fee is hereby granted, provided that the above + copyright notice and this permission notice appear in all copies. + + THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +*/ +//============================================================================== + +#pragma once + +#include "etl/SystemState.hpp" + +#include + +namespace etl { + +/** + * @brief Interface for managing writer state in the ETL subsystem. + * + * This interface provides methods to query and control whether the ETL process + * is actively writing to the database. Implementations should coordinate with + * the ETL system state to manage write responsibilities. + */ +class WriterStateInterface { +public: + virtual ~WriterStateInterface() = default; + + /** + * @brief Check if the ETL process is in strict read-only mode. + * @return true if the process is in strict read-only mode, false otherwise + */ + [[nodiscard]] virtual bool + isReadOnly() const = 0; + + /** + * @brief Check if the ETL process is currently writing to the database. + * @return true if the process is writing, false otherwise + */ + [[nodiscard]] virtual bool + isWriting() const = 0; + + /** + * @brief Request to start writing to the database. + * + * This method signals that the process should take over writing responsibilities. + * The actual transition to writing state may not be immediate. + */ + virtual void + startWriting() = 0; + + /** + * @brief Request to stop writing to the database. + * + * This method signals that the process should give up writing responsibilities. + * The actual transition from writing state may not be immediate. + */ + virtual void + giveUpWriting() = 0; + + /** + * @brief Check if the cluster is using the fallback writer decision mechanism. + * + * @return true if the cluster has switched to fallback mode, false otherwise + */ + [[nodiscard]] virtual bool + isFallback() const = 0; + + /** + * @brief Switch the cluster to the fallback writer decision mechanism. + * + * This method is called when the cluster needs to transition from the cluster + * communication mechanism to the slower but more reliable fallback mechanism. + * Once set, this flag propagates to all nodes in the cluster through the + * ClioNode DbRole::Fallback state. + */ + virtual void + setWriterDecidingFallback() = 0; + + /** + * @brief Whether clio is still loading cache after startup. + * + * @return true if clio is still loading cache, false otherwise. + */ + [[nodiscard]] virtual bool + isLoadingCache() const = 0; + + /** + * @brief Create a clone of this writer state. + * + * Creates a new instance of the writer state with the same underlying system state. + * This is used when spawning operations that need their own writer state instance + * while sharing the same system state. + * + * @return A unique pointer to the cloned writer state. + */ + [[nodiscard]] virtual std::unique_ptr + clone() const = 0; +}; + +/** + * @brief Implementation of WriterStateInterface that manages ETL writer state. + * + * This class coordinates with SystemState to manage whether the ETL process + * is actively writing to the database. It provides methods to query the current + * writing state and request transitions between writing and non-writing states. + */ +class WriterState : public WriterStateInterface { +private: + std::shared_ptr systemState_; /**< @brief Shared system state for ETL coordination */ + +public: + /** + * @brief Construct a WriterState with the given system state. + * @param state Shared pointer to the system state for coordination + */ + WriterState(std::shared_ptr state); + + bool + isReadOnly() const override; + + /** + * @brief Check if the ETL process is currently writing to the database. + * @return true if the process is writing, false otherwise + */ + bool + isWriting() const override; + + /** + * @brief Request to start writing to the database. + * + * If already writing, this method does nothing. Otherwise, it sets the + * shouldTakeoverWriting flag in the system state to signal the request. + */ + void + startWriting() override; + + /** + * @brief Request to stop writing to the database. + * + * If not currently writing, this method does nothing. Otherwise, it sets the + * shouldGiveUpWriter flag in the system state to signal the request. + */ + void + giveUpWriting() override; + + /** + * @brief Switch the cluster to the fallback writer decision mechanism. + * + * Sets the isWriterDecidingFallback flag in the system state, which will be + * propagated to other nodes in the cluster through the ClioNode DbRole::Fallback state. + */ + void + setWriterDecidingFallback() override; + + /** + * @brief Check if the cluster is using the fallback writer decision mechanism. + * + * @return true if the cluster has switched to fallback mode, false otherwise + */ + bool + isFallback() const override; + + /** + * @brief Whether clio is still loading cache after startup. + * + * @return true if clio is still loading cache, false otherwise. + */ + bool + isLoadingCache() const override; + + /** + * @brief Create a clone of this writer state. + * + * Creates a new WriterState instance sharing the same system state. + * + * @return A unique pointer to the cloned writer state. + */ + std::unique_ptr + clone() const override; +}; + +} // namespace etl diff --git a/src/etl/impl/LedgerPublisher.hpp b/src/etl/impl/LedgerPublisher.hpp index 0b48ca3f6..1db50299b 100644 --- a/src/etl/impl/LedgerPublisher.hpp +++ b/src/etl/impl/LedgerPublisher.hpp @@ -45,6 +45,7 @@ #include #include +#include #include #include #include @@ -76,6 +77,8 @@ class LedgerPublisher : public LedgerPublisherInterface { util::async::AnyStrand publishStrand_; + std::atomic_bool stop_{false}; + std::shared_ptr backend_; std::shared_ptr subscriptions_; std::reference_wrapper state_; // shared state for ETL @@ -125,7 +128,7 @@ public: { LOG(log_.info()) << "Attempting to publish ledger = " << ledgerSequence; size_t numAttempts = 0; - while (not state_.get().isStopping) { + while (not stop_) { auto range = backend_->hardFetchLedgerRangeNoThrow(); if (!range || range->maxSequence < ledgerSequence) { @@ -258,6 +261,18 @@ public: return *lastPublishedSequence_.lock(); } + /** + * @brief Stops publishing + * + * @note This is a basic implementation to satisfy tests. This will be improved in + * https://github.com/XRPLF/clio/issues/2833 + */ + void + stop() + { + stop_ = true; + } + private: void setLastClose(std::chrono::time_point lastCloseTime) diff --git a/src/etl/impl/Loading.cpp b/src/etl/impl/Loading.cpp index 59d2d0a9c..4b1e24458 100644 --- a/src/etl/impl/Loading.cpp +++ b/src/etl/impl/Loading.cpp @@ -75,7 +75,10 @@ Loader::load(model::LedgerData const& data) << "; took " << duration << "ms"; if (not success) { - state_->writeConflict = true; + // Write conflict detected - another node wrote to the database + // This triggers the fallback mechanism and stops this node from writing + state_->writeCommandSignal(SystemState::WriteCommand::StopWriting); + state_->isWriterDecidingFallback = true; LOG(log_.warn()) << "Another node wrote a ledger into the DB - we have a write conflict"; return std::unexpected(LoaderError::WriteConflict); } diff --git a/src/util/Channel.hpp b/src/util/Channel.hpp index 9b6b6c8ed..af4fbbeed 100644 --- a/src/util/Channel.hpp +++ b/src/util/Channel.hpp @@ -19,6 +19,8 @@ #pragma once +#include "util/async/Concepts.hpp" + #include #include #include @@ -42,15 +44,36 @@ struct ChannelInstantiated; } // namespace detail #endif +/** + * @brief Specifies the producer concurrency model for a Channel. + */ +enum class ProducerType { + Single, /**< Only one Sender can exist (non-copyable). Uses direct Guard ownership for zero overhead. */ + Multi /**< Multiple Senders can exist (copyable). Uses shared_ptr for shared ownership. */ +}; + +/** + * @brief Specifies the consumer concurrency model for a Channel. + */ +enum class ConsumerType { + Single, /**< Only one Receiver can exist (non-copyable). Uses direct Guard ownership for zero overhead. */ + Multi /**< Multiple Receivers can exist (copyable). Uses shared_ptr for shared ownership. */ +}; + /** * @brief Represents a go-like channel, a multi-producer (Sender) multi-consumer (Receiver) thread-safe data pipe. * @note Use INSTANTIATE_CHANNEL_FOR_CLANG macro when using this class. See docs at the bottom of the file for more * details. * * @tparam T The type of data the channel transfers + * @tparam P ProducerType::Multi (default) for multi-producer or ProducerType::Single for single-producer + * @tparam C ConsumerType::Multi (default) for multi-consumer or ConsumerType::Single for single-consumer */ -template +template class Channel { + static constexpr bool kIS_MULTI_PRODUCER = (P == ProducerType::Multi); + static constexpr bool kIS_MULTI_CONSUMER = (C == ConsumerType::Multi); + private: class ControlBlock { using InternalChannelType = boost::asio::experimental::concurrent_channel; @@ -58,7 +81,16 @@ private: InternalChannelType ch_; public: - ControlBlock(auto&& context, std::size_t capacity) : executor_(context.get_executor()), ch_(context, capacity) + template + requires(not async::SomeExecutionContext) + ControlBlock(ContextType&& context, std::size_t capacity) + : executor_(context.get_executor()), ch_(context, capacity) + { + } + + template + ControlBlock(ContextType&& context, std::size_t capacity) + : executor_(context.getExecutor().get_executor()), ch_(context.getExecutor(), capacity) { } @@ -101,30 +133,54 @@ private: } }; +public: /** * @brief The sending end of a channel. * - * Sender is copyable and movable. The channel remains open as long as at least one Sender exists. + * Sender is movable. For multi-producer channels, Sender is also copyable. + * The channel remains open as long as at least one Sender exists. * When all Sender instances are destroyed, the channel is closed and receivers will receive std::nullopt. */ class Sender { std::shared_ptr shared_; - std::shared_ptr guard_; + std::conditional_t, Guard> guard_; + + friend class Channel; - public: /** * @brief Constructs a Sender from a shared control block. * @param shared The shared control block managing the channel state */ Sender(std::shared_ptr shared) - : shared_(std::move(shared)), guard_(std::make_shared(shared_)) {}; + : shared_(shared), guard_([shared = std::move(shared)]() { + if constexpr (kIS_MULTI_PRODUCER) { + return std::make_shared(std::move(shared)); + } else { + return Guard{std::move(shared)}; + } + }()) + { + } + public: Sender(Sender&&) = default; - Sender(Sender const&) = default; + Sender(Sender const&) + requires kIS_MULTI_PRODUCER + = default; + Sender(Sender const&) + requires(!kIS_MULTI_PRODUCER) + = delete; + Sender& operator=(Sender&&) = default; Sender& - operator=(Sender const&) = default; + operator=(Sender const&) + requires kIS_MULTI_PRODUCER + = default; + Sender& + operator=(Sender const&) + requires(!kIS_MULTI_PRODUCER) + = delete; /** * @brief Asynchronously sends data through the channel using a coroutine. @@ -202,27 +258,50 @@ private: /** * @brief The receiving end of a channel. * - * Receiver is copyable and movable. Multiple receivers can consume from the same channel concurrently. + * Receiver is movable. For multi-consumer channels, Receiver is also copyable. + * Multiple receivers can consume from the same multi-consumer channel concurrently. * When all Receiver instances are destroyed, the channel is closed and senders will fail to send. */ class Receiver { std::shared_ptr shared_; - std::shared_ptr guard_; + std::conditional_t, Guard> guard_; + + friend class Channel; - public: /** * @brief Constructs a Receiver from a shared control block. * @param shared The shared control block managing the channel state */ Receiver(std::shared_ptr shared) - : shared_(std::move(shared)), guard_(std::make_shared(shared_)) {}; + : shared_(shared), guard_([shared = std::move(shared)]() { + if constexpr (kIS_MULTI_CONSUMER) { + return std::make_shared(std::move(shared)); + } else { + return Guard{std::move(shared)}; + } + }()) + { + } + public: Receiver(Receiver&&) = default; - Receiver(Receiver const&) = default; + Receiver(Receiver const&) + requires kIS_MULTI_CONSUMER + = default; + Receiver(Receiver const&) + requires(!kIS_MULTI_CONSUMER) + = delete; + Receiver& operator=(Receiver&&) = default; Receiver& - operator=(Receiver const&) = default; + operator=(Receiver const&) + requires kIS_MULTI_CONSUMER + = default; + Receiver& + operator=(Receiver const&) + requires(!kIS_MULTI_CONSUMER) + = delete; /** * @brief Attempts to receive data from the channel without blocking. @@ -297,7 +376,6 @@ private: } }; -public: /** * @brief Factory function to create channel components. * @param context A supported context type (either io_context or thread_pool) diff --git a/src/util/Spawn.hpp b/src/util/Spawn.hpp index bdb90bc57..79229cf9e 100644 --- a/src/util/Spawn.hpp +++ b/src/util/Spawn.hpp @@ -22,6 +22,7 @@ #include #include +#include #include #include diff --git a/src/util/async/Concepts.hpp b/src/util/async/Concepts.hpp index 7a50999f5..1bde21e6e 100644 --- a/src/util/async/Concepts.hpp +++ b/src/util/async/Concepts.hpp @@ -29,6 +29,27 @@ namespace util::async { +/** + * @brief Tag type for identifying execution context types. + * + * Types that inherit from this tag can be detected using the SomeExecutionContext concept. + * This allows generic code to differentiate between raw Boost.Asio contexts and wrapped execution contexts. + */ +struct ExecutionContextTag { + virtual ~ExecutionContextTag() = default; +}; + +/** + * @brief Concept that identifies types derived from ExecutionContextTag. + * + * This concept is used to detect custom execution context wrappers (like BasicExecutionContext) + * and distinguish them from raw Boost.Asio contexts (io_context, thread_pool, etc.). + * + * @tparam T The type to check + */ +template +concept SomeExecutionContext = std::derived_from, ExecutionContextTag>; + /** * @brief Specifies the interface for an entity that can be stopped */ diff --git a/src/util/async/context/BasicExecutionContext.hpp b/src/util/async/context/BasicExecutionContext.hpp index 5fab8fdfc..be8a6a200 100644 --- a/src/util/async/context/BasicExecutionContext.hpp +++ b/src/util/async/context/BasicExecutionContext.hpp @@ -129,7 +129,7 @@ template < typename DispatcherType, typename TimerContextProvider = impl::SelfContextProvider, typename ErrorHandlerType = impl::DefaultErrorHandler> -class BasicExecutionContext { +class BasicExecutionContext : public ExecutionContextTag { ContextType context_; /** @cond */ @@ -182,7 +182,7 @@ public: /** * @brief Stops the underlying thread pool. */ - ~BasicExecutionContext() + ~BasicExecutionContext() override { stop(); } @@ -402,6 +402,20 @@ public: { context_.join(); } + + /** + * @brief Get the underlying executor. + * + * Provides access to the wrapped executor for cases where the execution context + * needs to interact with components that require explicit executor access (like Channel). + * + * @return Reference to the underlying executor + */ + typename ContextType::Executor& + getExecutor() + { + return context_.getExecutor(); + } }; /** diff --git a/src/util/async/context/impl/Execution.hpp b/src/util/async/context/impl/Execution.hpp index 020773ba5..ffc32cd45 100644 --- a/src/util/async/context/impl/Execution.hpp +++ b/src/util/async/context/impl/Execution.hpp @@ -36,17 +36,26 @@ struct SpawnDispatchStrategy { { auto op = outcome.getOperation(); - util::spawn( - ctx.getExecutor(), - [outcome = std::forward(outcome), fn = std::forward(fn)](auto yield) mutable { - if constexpr (SomeStoppableOutcome) { - auto& stopSource = outcome.getStopSource(); - std::invoke(std::forward(fn), outcome, stopSource, stopSource[yield]); - } else { + if constexpr (SomeStoppableOutcome) { + util::spawn( + ctx.getExecutor(), + [outcome = std::forward(outcome), fn = std::forward(fn)](auto yield) mutable { + if constexpr (SomeStoppableOutcome) { + auto& stopSource = outcome.getStopSource(); + std::invoke(std::forward(fn), outcome, stopSource, stopSource[yield]); + } else { + std::invoke(std::forward(fn), outcome); + } + } + ); + } else { + boost::asio::post( + ctx.getExecutor(), + [outcome = std::forward(outcome), fn = std::forward(fn)]() mutable { std::invoke(std::forward(fn), outcome); } - } - ); + ); + } return op; } @@ -55,7 +64,7 @@ struct SpawnDispatchStrategy { static void post(ContextType& ctx, FnType&& fn) { - util::spawn(ctx.getExecutor(), [fn = std::forward(fn)](auto) mutable { + boost::asio::post(ctx.getExecutor(), [fn = std::forward(fn)]() mutable { std::invoke(std::forward(fn)); }); } diff --git a/tests/common/util/MockBackendTestFixture.hpp b/tests/common/util/MockBackendTestFixture.hpp index 06c84e372..59c6d89df 100644 --- a/tests/common/util/MockBackendTestFixture.hpp +++ b/tests/common/util/MockBackendTestFixture.hpp @@ -25,6 +25,7 @@ #include "util/config/ConfigDefinition.hpp" #include +#include #include diff --git a/tests/common/util/MockWriterState.hpp b/tests/common/util/MockWriterState.hpp new file mode 100644 index 000000000..a1821d599 --- /dev/null +++ b/tests/common/util/MockWriterState.hpp @@ -0,0 +1,40 @@ +//------------------------------------------------------------------------------ +/* + This file is part of clio: https://github.com/XRPLF/clio + Copyright (c) 2025, the clio developers. + + Permission to use, copy, modify, and distribute this software for any + purpose with or without fee is hereby granted, provided that the above + copyright notice and this permission notice appear in all copies. + + THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +*/ +//============================================================================== + +#pragma once + +#include "etl/WriterState.hpp" + +#include + +#include + +struct MockWriterStateBase : public etl::WriterStateInterface { + MOCK_METHOD(bool, isReadOnly, (), (const, override)); + MOCK_METHOD(bool, isWriting, (), (const, override)); + MOCK_METHOD(void, startWriting, (), (override)); + MOCK_METHOD(void, giveUpWriting, (), (override)); + MOCK_METHOD(void, setWriterDecidingFallback, (), (override)); + MOCK_METHOD(bool, isFallback, (), (const, override)); + MOCK_METHOD(bool, isLoadingCache, (), (const, override)); + MOCK_METHOD(std::unique_ptr, clone, (), (const, override)); +}; + +using MockWriterState = testing::StrictMock; +using NiceMockWriterState = testing::NiceMock; diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt index ba3b08f2f..600d307f5 100644 --- a/tests/unit/CMakeLists.txt +++ b/tests/unit/CMakeLists.txt @@ -21,8 +21,12 @@ target_sources( data/impl/LedgerCacheFileTests.cpp data/impl/OutputFileTests.cpp # Cluster + cluster/BackendTests.cpp cluster/ClioNodeTests.cpp cluster/ClusterCommunicationServiceTests.cpp + cluster/MetricsTests.cpp + cluster/RepeatedTaskTests.cpp + cluster/WriterDeciderTests.cpp # ETL etl/AmendmentBlockHandlerTests.cpp etl/CacheLoaderSettingsTests.cpp @@ -46,7 +50,9 @@ target_sources( etl/SchedulingTests.cpp etl/SourceImplTests.cpp etl/SubscriptionSourceTests.cpp + etl/SystemStateTests.cpp etl/TaskManagerTests.cpp + etl/WriterStateTests.cpp etl/ext/CoreTests.cpp etl/ext/CacheTests.cpp etl/ext/MPTTests.cpp diff --git a/tests/unit/app/StopperTests.cpp b/tests/unit/app/StopperTests.cpp index a9a03eb56..eb3f9b070 100644 --- a/tests/unit/app/StopperTests.cpp +++ b/tests/unit/app/StopperTests.cpp @@ -17,6 +17,7 @@ */ //============================================================================== #include "app/Stopper.hpp" +#include "cluster/Concepts.hpp" #include "util/AsioContextTestFixture.hpp" #include "util/MockBackend.hpp" #include "util/MockETLService.hpp" @@ -87,6 +88,10 @@ struct StopperMakeCallbackTest : util::prometheus::WithPrometheus, SyncAsioConte MOCK_METHOD(void, waitToFinish, ()); }; + struct MockClusterCommunicationService : cluster::ClusterCommunicationServiceTag { + MOCK_METHOD(void, stop, (), ()); + }; + protected: testing::StrictMock serverMock_; testing::StrictMock loadBalancerMock_; @@ -94,6 +99,7 @@ protected: testing::StrictMock subscriptionManagerMock_; testing::StrictMock backendMock_{util::config::ClioConfigDefinition{}}; testing::StrictMock cacheSaverMock_; + testing::StrictMock clusterCommunicationServiceMock_; boost::asio::io_context ioContextToStop_; bool @@ -115,6 +121,7 @@ TEST_F(StopperMakeCallbackTest, makeCallbackTest) subscriptionManagerMock_, backendMock_, cacheSaverMock_, + clusterCommunicationServiceMock_, ioContextToStop_ ); @@ -122,6 +129,9 @@ TEST_F(StopperMakeCallbackTest, makeCallbackTest) EXPECT_CALL(cacheSaverMock_, save).InSequence(s1).WillOnce([this]() { EXPECT_FALSE(isContextStopped()); }); EXPECT_CALL(serverMock_, stop).InSequence(s1).WillOnce([this]() { EXPECT_FALSE(isContextStopped()); }); EXPECT_CALL(loadBalancerMock_, stop).InSequence(s2).WillOnce([this]() { EXPECT_FALSE(isContextStopped()); }); + EXPECT_CALL(clusterCommunicationServiceMock_, stop).InSequence(s1, s2).WillOnce([this]() { + EXPECT_FALSE(isContextStopped()); + }); EXPECT_CALL(etlServiceMock_, stop).InSequence(s1, s2).WillOnce([this]() { EXPECT_FALSE(isContextStopped()); }); EXPECT_CALL(subscriptionManagerMock_, stop).InSequence(s1, s2).WillOnce([this]() { EXPECT_FALSE(isContextStopped()); diff --git a/tests/unit/cluster/BackendTests.cpp b/tests/unit/cluster/BackendTests.cpp new file mode 100644 index 000000000..869ff1828 --- /dev/null +++ b/tests/unit/cluster/BackendTests.cpp @@ -0,0 +1,351 @@ +//------------------------------------------------------------------------------ +/* + This file is part of clio: https://github.com/XRPLF/clio + Copyright (c) 2025, the clio developers. + + Permission to use, copy, modify, and distribute this software for any + purpose with or without fee is hereby granted, provided that the above + copyright notice and this permission notice appear in all copies. + + THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +*/ +//============================================================================== + +#include "cluster/Backend.hpp" +#include "cluster/ClioNode.hpp" +#include "data/BackendInterface.hpp" +#include "util/MockBackendTestFixture.hpp" +#include "util/MockPrometheus.hpp" +#include "util/MockWriterState.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace cluster; + +struct ClusterBackendTest : util::prometheus::WithPrometheus, MockBackendTestStrict { + ~ClusterBackendTest() override + { + ctx.stop(); + ctx.join(); + } + + boost::asio::thread_pool ctx; + std::unique_ptr writerState = std::make_unique(); + MockWriterState& writerStateRef = *writerState; + testing::StrictMock)>> + callbackMock; + std::binary_semaphore semaphore{0}; + + class SemaphoreReleaseGuard { + std::binary_semaphore& semaphore_; + + public: + SemaphoreReleaseGuard(std::binary_semaphore& s) : semaphore_(s) + { + } + ~SemaphoreReleaseGuard() + { + semaphore_.release(); + } + }; +}; + +TEST_F(ClusterBackendTest, SubscribeToNewState) +{ + Backend clusterBackend{ + ctx, backend_, std::move(writerState), std::chrono::milliseconds(1), std::chrono::milliseconds(1) + }; + + clusterBackend.subscribeToNewState(callbackMock.AsStdFunction()); + + EXPECT_CALL(*backend_, fetchClioNodesData) + .Times(testing::AtLeast(1)) + .WillRepeatedly(testing::Return(BackendInterface::ClioNodesDataFetchResult{})); + EXPECT_CALL(*backend_, writeNodeMessage).Times(testing::AtLeast(1)); + EXPECT_CALL(writerStateRef, isReadOnly).Times(testing::AtLeast(1)).WillRepeatedly(testing::Return(true)); + EXPECT_CALL(callbackMock, Call) + .Times(testing::AtLeast(1)) + .WillRepeatedly([this](ClioNode::CUuid selfId, std::shared_ptr clusterData) { + SemaphoreReleaseGuard guard{semaphore}; + ASSERT_TRUE(clusterData->has_value()); + EXPECT_EQ(clusterData->value().size(), 1); + auto const& nodeData = clusterData->value().front(); + EXPECT_EQ(nodeData.uuid, selfId); + EXPECT_EQ(nodeData.dbRole, ClioNode::DbRole::ReadOnly); + EXPECT_LE(nodeData.updateTime, std::chrono::system_clock::now()); + }); + + clusterBackend.run(); + semaphore.acquire(); +} + +TEST_F(ClusterBackendTest, Stop) +{ + Backend clusterBackend{ + ctx, backend_, std::move(writerState), std::chrono::milliseconds(1), std::chrono::milliseconds(1) + }; + + EXPECT_CALL(*backend_, fetchClioNodesData) + .Times(testing::AtLeast(1)) + .WillRepeatedly(testing::Return(BackendInterface::ClioNodesDataFetchResult{})); + EXPECT_CALL(*backend_, writeNodeMessage).Times(testing::AtLeast(1)); + EXPECT_CALL(writerStateRef, isReadOnly).Times(testing::AtLeast(1)).WillRepeatedly(testing::Return(true)); + + clusterBackend.run(); + std::this_thread::sleep_for(std::chrono::milliseconds{20}); + clusterBackend.stop(); + + testing::Mock::VerifyAndClearExpectations(&(*backend_)); + // Wait to make sure there is no new calls of mockDbBackend + std::this_thread::sleep_for(std::chrono::milliseconds{20}); +} + +TEST_F(ClusterBackendTest, FetchClioNodesDataThrowsException) +{ + Backend clusterBackend{ + ctx, backend_, std::move(writerState), std::chrono::milliseconds(1), std::chrono::milliseconds(1) + }; + + clusterBackend.subscribeToNewState(callbackMock.AsStdFunction()); + + EXPECT_CALL(*backend_, fetchClioNodesData) + .Times(testing::AtLeast(1)) + .WillRepeatedly(testing::Throw(std::runtime_error("Database connection failed"))); + EXPECT_CALL(*backend_, writeNodeMessage).Times(testing::AtLeast(1)); + EXPECT_CALL(writerStateRef, isReadOnly).Times(testing::AtLeast(1)).WillRepeatedly(testing::Return(true)); + EXPECT_CALL(callbackMock, Call) + .Times(testing::AtLeast(1)) + .WillRepeatedly([this](ClioNode::CUuid, std::shared_ptr clusterData) { + SemaphoreReleaseGuard guard{semaphore}; + ASSERT_FALSE(clusterData->has_value()); + EXPECT_EQ(clusterData->error(), "Failed to fetch Clio nodes data"); + }); + + clusterBackend.run(); + semaphore.acquire(); +} + +TEST_F(ClusterBackendTest, FetchClioNodesDataReturnsDataWithOtherNodes) +{ + Backend clusterBackend{ + ctx, backend_, std::move(writerState), std::chrono::milliseconds(1), std::chrono::milliseconds(1) + }; + + clusterBackend.subscribeToNewState(callbackMock.AsStdFunction()); + + auto const otherUuid = boost::uuids::random_generator{}(); + auto const otherNodeJson = R"JSON({ + "db_role": 3, + "update_time": "2025-01-15T10:30:00Z" + })JSON"; + + EXPECT_CALL(*backend_, fetchClioNodesData) + .Times(testing::AtLeast(1)) + .WillRepeatedly( + testing::Return( + BackendInterface::ClioNodesDataFetchResult{ + std::vector>{{otherUuid, otherNodeJson}} + } + ) + ); + EXPECT_CALL(*backend_, writeNodeMessage).Times(testing::AtLeast(1)); + EXPECT_CALL(writerStateRef, isReadOnly).Times(testing::AtLeast(1)).WillRepeatedly(testing::Return(false)); + EXPECT_CALL(writerStateRef, isFallback).Times(testing::AtLeast(1)).WillRepeatedly(testing::Return(false)); + EXPECT_CALL(writerStateRef, isLoadingCache).Times(testing::AtLeast(1)).WillRepeatedly(testing::Return(false)); + EXPECT_CALL(writerStateRef, isWriting).Times(testing::AtLeast(1)).WillRepeatedly(testing::Return(false)); + EXPECT_CALL(callbackMock, Call) + .Times(testing::AtLeast(1)) + .WillRepeatedly([&](ClioNode::CUuid selfId, std::shared_ptr clusterData) { + SemaphoreReleaseGuard guard{semaphore}; + ASSERT_TRUE(clusterData->has_value()) << clusterData->error(); + EXPECT_EQ(clusterData->value().size(), 2); + EXPECT_EQ(selfId, clusterBackend.selfId()); + + bool foundSelf = false; + bool foundOther = false; + + for (auto const& node : clusterData->value()) { + if (*node.uuid == *selfId) { + foundSelf = true; + EXPECT_EQ(node.dbRole, ClioNode::DbRole::NotWriter); + } else if (*node.uuid == otherUuid) { + foundOther = true; + EXPECT_EQ(node.dbRole, ClioNode::DbRole::Writer); + } + EXPECT_LE(node.updateTime, std::chrono::system_clock::now()); + } + + EXPECT_TRUE(foundSelf); + EXPECT_TRUE(foundOther); + }); + + clusterBackend.run(); + semaphore.acquire(); +} + +TEST_F(ClusterBackendTest, FetchClioNodesDataReturnsOnlySelfData) +{ + Backend clusterBackend{ + ctx, backend_, std::move(writerState), std::chrono::milliseconds(1), std::chrono::milliseconds(1) + }; + + clusterBackend.subscribeToNewState(callbackMock.AsStdFunction()); + + auto const selfNodeJson = R"JSON({ + "db_role": 1, + "update_time": "2025-01-16T10:30:00Z" + })JSON"; + + EXPECT_CALL(*backend_, fetchClioNodesData).Times(testing::AtLeast(1)).WillRepeatedly([&]() { + return BackendInterface::ClioNodesDataFetchResult{ + std::vector>{{*clusterBackend.selfId(), selfNodeJson}} + }; + }); + EXPECT_CALL(*backend_, writeNodeMessage).Times(testing::AtLeast(1)); + EXPECT_CALL(writerStateRef, isReadOnly).Times(testing::AtLeast(1)).WillRepeatedly(testing::Return(true)); + EXPECT_CALL(callbackMock, Call) + .Times(testing::AtLeast(1)) + .WillRepeatedly([this](ClioNode::CUuid selfId, std::shared_ptr clusterData) { + SemaphoreReleaseGuard guard{semaphore}; + ASSERT_TRUE(clusterData->has_value()); + EXPECT_EQ(clusterData->value().size(), 1); + auto const& nodeData = clusterData->value().front(); + EXPECT_EQ(nodeData.uuid, selfId); + EXPECT_EQ(nodeData.dbRole, ClioNode::DbRole::ReadOnly); + EXPECT_LE(nodeData.updateTime, std::chrono::system_clock::now()); + }); + + clusterBackend.run(); + semaphore.acquire(); +} + +TEST_F(ClusterBackendTest, FetchClioNodesDataReturnsInvalidJson) +{ + Backend clusterBackend{ + ctx, backend_, std::move(writerState), std::chrono::milliseconds(1), std::chrono::milliseconds(1) + }; + + clusterBackend.subscribeToNewState(callbackMock.AsStdFunction()); + + auto const otherUuid = boost::uuids::random_generator{}(); + auto const invalidJson = "{ invalid json"; + + EXPECT_CALL(*backend_, fetchClioNodesData) + .Times(testing::AtLeast(1)) + .WillRepeatedly( + testing::Return( + BackendInterface::ClioNodesDataFetchResult{ + std::vector>{{otherUuid, invalidJson}} + } + ) + ); + EXPECT_CALL(*backend_, writeNodeMessage).Times(testing::AtLeast(1)); + EXPECT_CALL(writerStateRef, isReadOnly).Times(testing::AtLeast(1)).WillRepeatedly(testing::Return(true)); + EXPECT_CALL(callbackMock, Call) + .Times(testing::AtLeast(1)) + .WillRepeatedly([this, invalidJson](ClioNode::CUuid, std::shared_ptr clusterData) { + SemaphoreReleaseGuard guard{semaphore}; + ASSERT_FALSE(clusterData->has_value()); + EXPECT_THAT(clusterData->error(), testing::HasSubstr("Error parsing json from DB")); + EXPECT_THAT(clusterData->error(), testing::HasSubstr(invalidJson)); + }); + + clusterBackend.run(); + semaphore.acquire(); +} + +TEST_F(ClusterBackendTest, FetchClioNodesDataReturnsValidJsonButCannotConvertToClioNode) +{ + Backend clusterBackend{ + ctx, backend_, std::move(writerState), std::chrono::milliseconds(1), std::chrono::milliseconds(1) + }; + + clusterBackend.subscribeToNewState(callbackMock.AsStdFunction()); + + auto const otherUuid = boost::uuids::random_generator{}(); + // Valid JSON but missing required field 'db_role' + auto const validJsonMissingField = R"JSON({ + "update_time": "2025-01-16T10:30:00Z" + })JSON"; + + EXPECT_CALL(*backend_, fetchClioNodesData) + .Times(testing::AtLeast(1)) + .WillRepeatedly( + testing::Return( + BackendInterface::ClioNodesDataFetchResult{ + std::vector>{{otherUuid, validJsonMissingField}} + } + ) + ); + EXPECT_CALL(*backend_, writeNodeMessage).Times(testing::AtLeast(1)); + EXPECT_CALL(writerStateRef, isReadOnly).Times(testing::AtLeast(1)).WillRepeatedly(testing::Return(true)); + EXPECT_CALL(callbackMock, Call) + .Times(testing::AtLeast(1)) + .WillRepeatedly([this](ClioNode::CUuid, std::shared_ptr clusterData) { + SemaphoreReleaseGuard guard{semaphore}; + ASSERT_FALSE(clusterData->has_value()); + EXPECT_THAT(clusterData->error(), testing::HasSubstr("Error converting json to ClioNode")); + }); + + clusterBackend.run(); + semaphore.acquire(); +} + +TEST_F(ClusterBackendTest, WriteNodeMessageWritesSelfDataWithRecentTimestampAndDbRole) +{ + Backend clusterBackend{ + ctx, backend_, std::move(writerState), std::chrono::milliseconds(1), std::chrono::milliseconds(1) + }; + + auto const beforeRun = std::chrono::floor(std::chrono::system_clock::now()); + + EXPECT_CALL(*backend_, fetchClioNodesData) + .Times(testing::AtLeast(1)) + .WillRepeatedly(testing::Return(BackendInterface::ClioNodesDataFetchResult{})); + EXPECT_CALL(writerStateRef, isReadOnly).Times(testing::AtLeast(1)).WillRepeatedly(testing::Return(false)); + EXPECT_CALL(writerStateRef, isFallback).Times(testing::AtLeast(1)).WillRepeatedly(testing::Return(false)); + EXPECT_CALL(writerStateRef, isLoadingCache).Times(testing::AtLeast(1)).WillRepeatedly(testing::Return(false)); + EXPECT_CALL(writerStateRef, isWriting).Times(testing::AtLeast(1)).WillRepeatedly(testing::Return(false)); + EXPECT_CALL(*backend_, writeNodeMessage) + .Times(testing::AtLeast(1)) + .WillRepeatedly([&](boost::uuids::uuid const& uuid, std::string message) { + SemaphoreReleaseGuard guard{semaphore}; + auto const afterWrite = std::chrono::system_clock::now(); + + EXPECT_EQ(uuid, *clusterBackend.selfId()); + auto const json = boost::json::parse(message); + auto const node = boost::json::try_value_to(json); + ASSERT_TRUE(node.has_value()); + EXPECT_EQ(node->dbRole, ClioNode::DbRole::NotWriter); + EXPECT_GE(node->updateTime, beforeRun); + EXPECT_LE(node->updateTime, afterWrite); + }); + + clusterBackend.run(); + semaphore.acquire(); +} diff --git a/tests/unit/cluster/ClioNodeTests.cpp b/tests/unit/cluster/ClioNodeTests.cpp index 6cfe9c924..634fbe444 100644 --- a/tests/unit/cluster/ClioNodeTests.cpp +++ b/tests/unit/cluster/ClioNodeTests.cpp @@ -18,6 +18,8 @@ //============================================================================== #include "cluster/ClioNode.hpp" +#include "util/MockWriterState.hpp" +#include "util/NameGenerator.hpp" #include "util/TimeUtils.hpp" #include @@ -26,9 +28,11 @@ #include #include #include +#include #include #include +#include #include #include #include @@ -44,44 +48,44 @@ struct ClioNodeTest : testing::Test { TEST_F(ClioNodeTest, Serialization) { - // Create a ClioNode with test data ClioNode const node{ - .uuid = std::make_shared(boost::uuids::random_generator()()), .updateTime = updateTime + .uuid = std::make_shared(boost::uuids::random_generator()()), + .updateTime = updateTime, + .dbRole = ClioNode::DbRole::Writer }; - // Serialize to JSON boost::json::value jsonValue; EXPECT_NO_THROW(boost::json::value_from(node, jsonValue)); - // Verify JSON structure ASSERT_TRUE(jsonValue.is_object()) << jsonValue; auto const& obj = jsonValue.as_object(); - // Check update_time exists and is a string EXPECT_TRUE(obj.contains("update_time")); EXPECT_TRUE(obj.at("update_time").is_string()); + + EXPECT_TRUE(obj.contains("db_role")); + EXPECT_TRUE(obj.at("db_role").is_number()); + EXPECT_EQ(obj.at("db_role").as_int64(), static_cast(node.dbRole)); } TEST_F(ClioNodeTest, Deserialization) { - boost::json::value const jsonValue = {{"update_time", updateTimeStr}}; + boost::json::value const jsonValue = {{"update_time", updateTimeStr}, {"db_role", 1}}; - // Deserialize to ClioNode - ClioNode node{.uuid = std::make_shared(), .updateTime = {}}; - EXPECT_NO_THROW(node = boost::json::value_to(jsonValue)); + ClioNode node{ + .uuid = std::make_shared(), .updateTime = {}, .dbRole = ClioNode::DbRole::ReadOnly + }; + ASSERT_NO_THROW(node = boost::json::value_to(jsonValue)); - // Verify deserialized data EXPECT_NE(node.uuid, nullptr); EXPECT_EQ(*node.uuid, boost::uuids::uuid{}); EXPECT_EQ(node.updateTime, updateTime); + EXPECT_EQ(node.dbRole, ClioNode::DbRole::LoadingCache); } TEST_F(ClioNodeTest, DeserializationInvalidTime) { - // Prepare an invalid time format boost::json::value const jsonValue{"update_time", "invalid_format"}; - - // Expect an exception during deserialization EXPECT_THROW(boost::json::value_to(jsonValue), std::runtime_error); } @@ -93,3 +97,145 @@ TEST_F(ClioNodeTest, DeserializationMissingTime) // Expect an exception EXPECT_THROW(boost::json::value_to(jsonValue), std::runtime_error); } + +struct ClioNodeDbRoleTestBundle { + std::string testName; + ClioNode::DbRole role; +}; + +struct ClioNodeDbRoleTest : ClioNodeTest, testing::WithParamInterface {}; + +INSTANTIATE_TEST_SUITE_P( + AllDbRoles, + ClioNodeDbRoleTest, + testing::Values( + ClioNodeDbRoleTestBundle{.testName = "ReadOnly", .role = ClioNode::DbRole::ReadOnly}, + ClioNodeDbRoleTestBundle{.testName = "LoadingCache", .role = ClioNode::DbRole::LoadingCache}, + ClioNodeDbRoleTestBundle{.testName = "NotWriter", .role = ClioNode::DbRole::NotWriter}, + ClioNodeDbRoleTestBundle{.testName = "Writer", .role = ClioNode::DbRole::Writer}, + ClioNodeDbRoleTestBundle{.testName = "Fallback", .role = ClioNode::DbRole::Fallback} + ), + tests::util::kNAME_GENERATOR +); + +TEST_P(ClioNodeDbRoleTest, Serialization) +{ + auto const param = GetParam(); + ClioNode const node{ + .uuid = std::make_shared(boost::uuids::random_generator()()), + .updateTime = updateTime, + .dbRole = param.role + }; + auto const jsonValue = boost::json::value_from(node); + EXPECT_EQ(jsonValue.as_object().at("db_role").as_int64(), static_cast(param.role)); +} + +TEST_P(ClioNodeDbRoleTest, Deserialization) +{ + auto const param = GetParam(); + boost::json::value const jsonValue = { + {"update_time", updateTimeStr}, {"db_role", static_cast(param.role)} + }; + auto const node = boost::json::value_to(jsonValue); + EXPECT_EQ(node.dbRole, param.role); +} + +TEST_F(ClioNodeDbRoleTest, DeserializationInvalidDbRole) +{ + boost::json::value const jsonValue = {{"update_time", updateTimeStr}, {"db_role", 10}}; + EXPECT_THROW(boost::json::value_to(jsonValue), std::runtime_error); +} + +TEST_F(ClioNodeDbRoleTest, DeserializationMissingDbRole) +{ + boost::json::value const jsonValue = {{"update_time", updateTimeStr}}; + EXPECT_THROW(boost::json::value_to(jsonValue), std::runtime_error); +} + +struct ClioNodeFromTestBundle { + std::string testName; + bool readOnly; + bool fallback; + bool loadingCache; + bool writing; + ClioNode::DbRole expectedRole; +}; + +struct ClioNodeFromTest : ClioNodeTest, testing::WithParamInterface { + std::shared_ptr uuid = std::make_shared(boost::uuids::random_generator()()); + + MockWriterState writerState; +}; + +INSTANTIATE_TEST_SUITE_P( + AllWriterStates, + ClioNodeFromTest, + testing::Values( + ClioNodeFromTestBundle{ + .testName = "ReadOnly", + .readOnly = true, + .fallback = false, + .loadingCache = false, + .writing = false, + .expectedRole = ClioNode::DbRole::ReadOnly + }, + ClioNodeFromTestBundle{ + .testName = "Fallback", + .readOnly = false, + .fallback = true, + .loadingCache = false, + .writing = false, + .expectedRole = ClioNode::DbRole::Fallback + }, + ClioNodeFromTestBundle{ + .testName = "LoadingCache", + .readOnly = false, + .fallback = false, + .loadingCache = true, + .writing = false, + .expectedRole = ClioNode::DbRole::LoadingCache + }, + ClioNodeFromTestBundle{ + .testName = "NotWriterNotReadOnly", + .readOnly = false, + .fallback = false, + .loadingCache = false, + .writing = false, + .expectedRole = ClioNode::DbRole::NotWriter + }, + ClioNodeFromTestBundle{ + .testName = "Writer", + .readOnly = false, + .fallback = false, + .loadingCache = false, + .writing = true, + .expectedRole = ClioNode::DbRole::Writer + } + ), + tests::util::kNAME_GENERATOR +); + +TEST_P(ClioNodeFromTest, FromWriterState) +{ + auto const& param = GetParam(); + + EXPECT_CALL(writerState, isReadOnly()).WillOnce(testing::Return(param.readOnly)); + if (not param.readOnly) { + EXPECT_CALL(writerState, isFallback()).WillOnce(testing::Return(param.fallback)); + if (not param.fallback) { + EXPECT_CALL(writerState, isLoadingCache()).WillOnce(testing::Return(param.loadingCache)); + if (not param.loadingCache) { + EXPECT_CALL(writerState, isWriting()).WillOnce(testing::Return(param.writing)); + } + } + } + + auto const beforeTime = std::chrono::system_clock::now(); + auto const node = ClioNode::from(uuid, writerState); + auto const afterTime = std::chrono::system_clock::now(); + + EXPECT_EQ(node.uuid, uuid); + EXPECT_EQ(node.dbRole, param.expectedRole); + EXPECT_GE(node.updateTime, beforeTime); + EXPECT_LE(node.updateTime, afterTime); +} diff --git a/tests/unit/cluster/ClusterCommunicationServiceTests.cpp b/tests/unit/cluster/ClusterCommunicationServiceTests.cpp index 544d7e3db..87d965145 100644 --- a/tests/unit/cluster/ClusterCommunicationServiceTests.cpp +++ b/tests/unit/cluster/ClusterCommunicationServiceTests.cpp @@ -22,207 +22,197 @@ #include "data/BackendInterface.hpp" #include "util/MockBackendTestFixture.hpp" #include "util/MockPrometheus.hpp" -#include "util/TimeUtils.hpp" -#include "util/prometheus/Bool.hpp" -#include "util/prometheus/Gauge.hpp" +#include "util/MockWriterState.hpp" #include "util/prometheus/Prometheus.hpp" -#include +#include #include -#include -#include #include -#include #include -#include #include #include #include +#include #include -#include +#include #include -#include +#include #include +#include #include #include using namespace cluster; -namespace { -std::vector const kOTHER_NODES_DATA = { - ClioNode{ - .uuid = std::make_shared(boost::uuids::random_generator()()), - .updateTime = util::systemTpFromUtcStr("2015-05-15T12:00:00Z", ClioNode::kTIME_FORMAT).value() - }, - ClioNode{ - .uuid = std::make_shared(boost::uuids::random_generator()()), - .updateTime = util::systemTpFromUtcStr("2015-05-15T12:00:01Z", ClioNode::kTIME_FORMAT).value() - }, -}; -} // namespace +struct ClusterCommunicationServiceTest : util::prometheus::WithPrometheus, MockBackendTest { + std::unique_ptr writerState = std::make_unique(); + NiceMockWriterState& writerStateRef = *writerState; -struct ClusterCommunicationServiceTest : util::prometheus::WithPrometheus, MockBackendTestStrict { - ClusterCommunicationService clusterCommunicationService{ - backend_, - std::chrono::milliseconds{5}, - std::chrono::milliseconds{9} - }; + static constexpr std::chrono::milliseconds kSHORT_INTERVAL{1}; - util::prometheus::GaugeInt& nodesInClusterMetric = PrometheusService::gaugeInt("cluster_nodes_total_number", {}); - util::prometheus::Bool isHealthyMetric = PrometheusService::boolMetric("cluster_communication_is_healthy", {}); - - std::mutex mtx; - std::condition_variable cv; - - void - notify() + static boost::uuids::uuid + makeUuid(uint8_t value) { - std::unique_lock const lock{mtx}; - cv.notify_one(); + boost::uuids::uuid uuid{}; + std::ranges::fill(uuid, value); + return uuid; } - void - wait() + static ClioNode + makeNode(boost::uuids::uuid const& uuid, ClioNode::DbRole role) { - std::unique_lock lock{mtx}; - cv.wait_until(lock, std::chrono::steady_clock::now() + std::chrono::milliseconds{100}); + return ClioNode{ + .uuid = std::make_shared(uuid), + .updateTime = std::chrono::system_clock::now(), + .dbRole = role + }; + } + + static std::string + nodeToJson(ClioNode const& node) + { + boost::json::value v = boost::json::value_from(node); + return boost::json::serialize(v); + } + + ClusterCommunicationServiceTest() + { + ON_CALL(writerStateRef, clone()).WillByDefault(testing::Invoke([]() { + auto state = std::make_unique(); + ON_CALL(*state, isReadOnly()).WillByDefault(testing::Return(false)); + ON_CALL(*state, isWriting()).WillByDefault(testing::Return(true)); + return state; + })); + ON_CALL(writerStateRef, isReadOnly()).WillByDefault(testing::Return(false)); + ON_CALL(writerStateRef, isWriting()).WillByDefault(testing::Return(true)); + } + + static bool + waitForSignal(std::binary_semaphore& sem, std::chrono::milliseconds timeout = std::chrono::milliseconds{1000}) + { + return sem.try_acquire_for(timeout); } }; -TEST_F(ClusterCommunicationServiceTest, Write) +TEST_F(ClusterCommunicationServiceTest, BackendReadsAndWritesData) { - auto const selfUuid = *clusterCommunicationService.selfUuid(); + auto const otherUuid = makeUuid(0x02); + std::binary_semaphore fetchSemaphore{0}; + std::binary_semaphore writeSemaphore{0}; - auto const nowStr = util::systemTpToUtcStr(std::chrono::system_clock::now(), ClioNode::kTIME_FORMAT); - auto const nowStrPrefix = nowStr.substr(0, nowStr.size() - 3); + BackendInterface::ClioNodesDataFetchResult fetchResult{std::vector>{ + {otherUuid, nodeToJson(makeNode(otherUuid, ClioNode::DbRole::Writer))} + }}; - EXPECT_CALL(*backend_, writeNodeMessage(selfUuid, testing::_)).WillOnce([&](auto&&, std::string const& jsonStr) { - auto const jv = boost::json::parse(jsonStr); - ASSERT_TRUE(jv.is_object()); - auto const& obj = jv.as_object(); - ASSERT_TRUE(obj.contains("update_time")); - ASSERT_TRUE(obj.at("update_time").is_string()); - EXPECT_THAT(std::string{obj.at("update_time").as_string()}, testing::StartsWith(nowStrPrefix)); + ON_CALL(*backend_, fetchClioNodesData).WillByDefault(testing::Invoke([&](auto) { + fetchSemaphore.release(); + return fetchResult; + })); - notify(); - }); + ON_CALL(*backend_, writeNodeMessage).WillByDefault(testing::Invoke([&](auto, auto) { writeSemaphore.release(); })); - clusterCommunicationService.run(); - wait(); - // destructor of clusterCommunicationService calls .stop() + ClusterCommunicationService service{backend_, std::move(writerState), kSHORT_INTERVAL, kSHORT_INTERVAL}; + + service.run(); + + EXPECT_TRUE(waitForSignal(fetchSemaphore)); + EXPECT_TRUE(waitForSignal(writeSemaphore)); + + service.stop(); } -TEST_F(ClusterCommunicationServiceTest, Read_FetchFailed) +TEST_F(ClusterCommunicationServiceTest, MetricsGetsNewStateFromBackend) { - EXPECT_TRUE(isHealthyMetric); - EXPECT_CALL(*backend_, writeNodeMessage).Times(2).WillOnce([](auto&&, auto&&) {}).WillOnce([this](auto&&, auto&&) { - notify(); - }); - EXPECT_CALL(*backend_, fetchClioNodesData).WillRepeatedly([](auto&&) { return std::unexpected{"Failed"}; }); + auto const otherUuid = makeUuid(0x02); + std::binary_semaphore writerActionSemaphore{0}; - clusterCommunicationService.run(); - wait(); - // call .stop() manually so that workers exit before expectations are called more times than we want - clusterCommunicationService.stop(); + BackendInterface::ClioNodesDataFetchResult fetchResult{std::vector>{ + {otherUuid, nodeToJson(makeNode(otherUuid, ClioNode::DbRole::Writer))} + }}; - EXPECT_FALSE(isHealthyMetric); + ON_CALL(*backend_, fetchClioNodesData).WillByDefault(testing::Invoke([&](auto) { return fetchResult; })); + + ON_CALL(writerStateRef, clone()).WillByDefault(testing::Invoke([&]() mutable { + auto state = std::make_unique(); + ON_CALL(*state, startWriting()).WillByDefault(testing::Invoke([&]() { writerActionSemaphore.release(); })); + ON_CALL(*state, giveUpWriting()).WillByDefault(testing::Invoke([&]() { writerActionSemaphore.release(); })); + return state; + })); + + auto& nodesInClusterMetric = PrometheusService::gaugeInt("cluster_nodes_total_number", {}); + auto isHealthyMetric = PrometheusService::boolMetric("cluster_communication_is_healthy", {}); + + ClusterCommunicationService service{backend_, std::move(writerState), kSHORT_INTERVAL, kSHORT_INTERVAL}; + + service.run(); + + // WriterDecider is called after metrics are updated so we could use it as a signal to stop + EXPECT_TRUE(waitForSignal(writerActionSemaphore)); + + service.stop(); + + EXPECT_EQ(nodesInClusterMetric.value(), 2); + EXPECT_TRUE(static_cast(isHealthyMetric)); } -TEST_F(ClusterCommunicationServiceTest, Read_FetchThrew) +TEST_F(ClusterCommunicationServiceTest, WriterDeciderCallsWriterStateMethodsAccordingly) { - EXPECT_TRUE(isHealthyMetric); - EXPECT_CALL(*backend_, writeNodeMessage).Times(2).WillOnce([](auto&&, auto&&) {}).WillOnce([this](auto&&, auto&&) { - notify(); - }); - EXPECT_CALL(*backend_, fetchClioNodesData).WillRepeatedly(testing::Throw(data::DatabaseTimeout{})); + auto const smallerUuid = makeUuid(0x00); + std::binary_semaphore fetchSemaphore{0}; + std::binary_semaphore writerActionSemaphore{0}; - clusterCommunicationService.run(); - wait(); - clusterCommunicationService.stop(); + BackendInterface::ClioNodesDataFetchResult fetchResult{std::vector>{ + {smallerUuid, nodeToJson(makeNode(smallerUuid, ClioNode::DbRole::Writer))} + }}; - EXPECT_FALSE(isHealthyMetric); - EXPECT_FALSE(clusterCommunicationService.clusterData().has_value()); + ON_CALL(*backend_, fetchClioNodesData).WillByDefault(testing::Invoke([&](auto) { + fetchSemaphore.release(); + return fetchResult; + })); + + ON_CALL(*backend_, writeNodeMessage).WillByDefault(testing::Return()); + + ON_CALL(writerStateRef, clone()).WillByDefault(testing::Invoke([&]() mutable { + auto state = std::make_unique(); + ON_CALL(*state, startWriting()).WillByDefault(testing::Invoke([&]() { writerActionSemaphore.release(); })); + ON_CALL(*state, giveUpWriting()).WillByDefault(testing::Invoke([&]() { writerActionSemaphore.release(); })); + return state; + })); + + ClusterCommunicationService service{backend_, std::move(writerState), kSHORT_INTERVAL, kSHORT_INTERVAL}; + + service.run(); + + EXPECT_TRUE(waitForSignal(fetchSemaphore)); + EXPECT_TRUE(waitForSignal(writerActionSemaphore)); + + service.stop(); } -TEST_F(ClusterCommunicationServiceTest, Read_GotInvalidJson) +TEST_F(ClusterCommunicationServiceTest, StopHaltsBackendOperations) { - EXPECT_TRUE(isHealthyMetric); - EXPECT_CALL(*backend_, writeNodeMessage).Times(2).WillOnce([](auto&&, auto&&) {}).WillOnce([this](auto&&, auto&&) { - notify(); - }); - EXPECT_CALL(*backend_, fetchClioNodesData).WillRepeatedly([](auto&&) { - return std::vector>{ - {boost::uuids::random_generator()(), "invalid json"} - }; - }); + std::atomic backendOperationsCount{0}; + std::binary_semaphore fetchSemaphore{0}; - clusterCommunicationService.run(); - wait(); - clusterCommunicationService.stop(); + BackendInterface::ClioNodesDataFetchResult fetchResult{std::vector>{}}; - EXPECT_FALSE(isHealthyMetric); - EXPECT_FALSE(clusterCommunicationService.clusterData().has_value()); -} - -TEST_F(ClusterCommunicationServiceTest, Read_GotInvalidNodeData) -{ - EXPECT_TRUE(isHealthyMetric); - EXPECT_CALL(*backend_, writeNodeMessage).Times(2).WillOnce([](auto&&, auto&&) {}).WillOnce([this](auto&&, auto&&) { - notify(); - }); - EXPECT_CALL(*backend_, fetchClioNodesData).WillRepeatedly([](auto&&) { - return std::vector>{{boost::uuids::random_generator()(), "{}"}}; - }); - - clusterCommunicationService.run(); - wait(); - clusterCommunicationService.stop(); - - EXPECT_FALSE(isHealthyMetric); - EXPECT_FALSE(clusterCommunicationService.clusterData().has_value()); -} - -TEST_F(ClusterCommunicationServiceTest, Read_Success) -{ - EXPECT_TRUE(isHealthyMetric); - EXPECT_EQ(nodesInClusterMetric.value(), 1); - - EXPECT_CALL(*backend_, writeNodeMessage).Times(2).WillOnce([](auto&&, auto&&) {}).WillOnce([this](auto&&, auto&&) { - auto const clusterData = clusterCommunicationService.clusterData(); - ASSERT_TRUE(clusterData.has_value()); - ASSERT_EQ(clusterData->size(), kOTHER_NODES_DATA.size() + 1); - for (auto const& node : kOTHER_NODES_DATA) { - auto const it = - std::ranges::find_if(*clusterData, [&](ClioNode const& n) { return *(n.uuid) == *(node.uuid); }); - EXPECT_NE(it, clusterData->cend()) << boost::uuids::to_string(*node.uuid); - } - auto const selfUuid = clusterCommunicationService.selfUuid(); - auto const it = - std::ranges::find_if(*clusterData, [&selfUuid](ClioNode const& node) { return node.uuid == selfUuid; }); - EXPECT_NE(it, clusterData->end()); - - notify(); - }); - - EXPECT_CALL(*backend_, fetchClioNodesData).WillRepeatedly([this](auto&&) { - auto const selfUuid = clusterCommunicationService.selfUuid(); - std::vector> result = { - {*selfUuid, R"JSON({"update_time": "2015-05-15:12:00:00"})JSON"}, - }; - - for (auto const& node : kOTHER_NODES_DATA) { - boost::json::value jsonValue; - boost::json::value_from(node, jsonValue); - result.emplace_back(*node.uuid, boost::json::serialize(jsonValue)); - } - return result; - }); - - clusterCommunicationService.run(); - wait(); - clusterCommunicationService.stop(); - - EXPECT_TRUE(isHealthyMetric); - EXPECT_EQ(nodesInClusterMetric.value(), 3); + ON_CALL(*backend_, fetchClioNodesData).WillByDefault(testing::Invoke([&](auto) { + backendOperationsCount++; + fetchSemaphore.release(); + return fetchResult; + })); + ON_CALL(*backend_, writeNodeMessage).WillByDefault(testing::Invoke([&](auto&&, auto&&) { + backendOperationsCount++; + })); + + ClusterCommunicationService service{backend_, std::move(writerState), kSHORT_INTERVAL, kSHORT_INTERVAL}; + + service.run(); + EXPECT_TRUE(waitForSignal(fetchSemaphore)); + service.stop(); + + auto const countAfterStop = backendOperationsCount.load(); + std::this_thread::sleep_for(std::chrono::milliseconds{50}); + EXPECT_EQ(backendOperationsCount.load(), countAfterStop); } diff --git a/tests/unit/cluster/MetricsTests.cpp b/tests/unit/cluster/MetricsTests.cpp new file mode 100644 index 000000000..d747ddfcf --- /dev/null +++ b/tests/unit/cluster/MetricsTests.cpp @@ -0,0 +1,179 @@ +//------------------------------------------------------------------------------ +/* + This file is part of clio: https://github.com/XRPLF/clio + Copyright (c) 2025, the clio developers. + + Permission to use, copy, modify, and distribute this software for any + purpose with or without fee is hereby granted, provided that the above + copyright notice and this permission notice appear in all copies. + + THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +*/ +//============================================================================== + +#include "cluster/Backend.hpp" +#include "cluster/ClioNode.hpp" +#include "cluster/Metrics.hpp" +#include "util/MockPrometheus.hpp" +#include "util/prometheus/Gauge.hpp" + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +using namespace cluster; +using namespace util::prometheus; +using namespace testing; + +struct MetricsTest : WithMockPrometheus { + std::shared_ptr uuid1 = + std::make_shared(boost::uuids::random_generator()()); + std::shared_ptr uuid2 = + std::make_shared(boost::uuids::random_generator()()); + std::shared_ptr uuid3 = + std::make_shared(boost::uuids::random_generator()()); +}; + +TEST_F(MetricsTest, InitializesMetricsOnConstruction) +{ + auto& nodesInClusterMock = makeMock("cluster_nodes_total_number", ""); + auto& isHealthyMock = makeMock("cluster_communication_is_healthy", ""); + + EXPECT_CALL(nodesInClusterMock, set(1)); + EXPECT_CALL(isHealthyMock, set(1)); + + Metrics metrics; +} + +TEST_F(MetricsTest, OnNewStateWithValidClusterData) +{ + auto& nodesInClusterMock = makeMock("cluster_nodes_total_number", ""); + auto& isHealthyMock = makeMock("cluster_communication_is_healthy", ""); + + EXPECT_CALL(nodesInClusterMock, set(1)); + EXPECT_CALL(isHealthyMock, set(1)); + + Metrics metrics; + + ClioNode node1{.uuid = uuid1, .updateTime = std::chrono::system_clock::now(), .dbRole = ClioNode::DbRole::Writer}; + ClioNode node2{.uuid = uuid2, .updateTime = std::chrono::system_clock::now(), .dbRole = ClioNode::DbRole::ReadOnly}; + ClioNode node3{ + .uuid = uuid3, .updateTime = std::chrono::system_clock::now(), .dbRole = ClioNode::DbRole::NotWriter + }; + + std::vector nodes = {node1, node2, node3}; + Backend::ClusterData clusterData = std::expected, std::string>(nodes); + auto sharedClusterData = std::make_shared(clusterData); + + EXPECT_CALL(isHealthyMock, set(1)); + EXPECT_CALL(nodesInClusterMock, set(3)); + + metrics.onNewState(uuid1, sharedClusterData); +} + +TEST_F(MetricsTest, OnNewStateWithEmptyClusterData) +{ + auto& nodesInClusterMock = makeMock("cluster_nodes_total_number", ""); + auto& isHealthyMock = makeMock("cluster_communication_is_healthy", ""); + + EXPECT_CALL(nodesInClusterMock, set(1)); + EXPECT_CALL(isHealthyMock, set(1)); + + Metrics metrics; + + std::vector nodes = {}; + Backend::ClusterData clusterData = std::expected, std::string>(nodes); + auto sharedClusterData = std::make_shared(clusterData); + + EXPECT_CALL(isHealthyMock, set(1)); + EXPECT_CALL(nodesInClusterMock, set(0)); + + metrics.onNewState(uuid1, sharedClusterData); +} + +TEST_F(MetricsTest, OnNewStateWithFailedClusterData) +{ + auto& nodesInClusterMock = makeMock("cluster_nodes_total_number", ""); + auto& isHealthyMock = makeMock("cluster_communication_is_healthy", ""); + + EXPECT_CALL(nodesInClusterMock, set(1)); + EXPECT_CALL(isHealthyMock, set(1)); + + Metrics metrics; + + Backend::ClusterData clusterData = + std::expected, std::string>(std::unexpected("Connection failed")); + auto sharedClusterData = std::make_shared(clusterData); + + EXPECT_CALL(isHealthyMock, set(0)); + EXPECT_CALL(nodesInClusterMock, set(1)); + + metrics.onNewState(uuid1, sharedClusterData); +} + +TEST_F(MetricsTest, OnNewStateWithSingleNode) +{ + auto& nodesInClusterMock = makeMock("cluster_nodes_total_number", ""); + auto& isHealthyMock = makeMock("cluster_communication_is_healthy", ""); + + EXPECT_CALL(nodesInClusterMock, set(1)); + EXPECT_CALL(isHealthyMock, set(1)); + + Metrics metrics; + + ClioNode node1{.uuid = uuid1, .updateTime = std::chrono::system_clock::now(), .dbRole = ClioNode::DbRole::Writer}; + + std::vector nodes = {node1}; + Backend::ClusterData clusterData = std::expected, std::string>(nodes); + auto sharedClusterData = std::make_shared(clusterData); + + EXPECT_CALL(isHealthyMock, set(1)); + EXPECT_CALL(nodesInClusterMock, set(1)); + + metrics.onNewState(uuid1, sharedClusterData); +} + +TEST_F(MetricsTest, OnNewStateRecoveryFromFailure) +{ + auto& nodesInClusterMock = makeMock("cluster_nodes_total_number", ""); + auto& isHealthyMock = makeMock("cluster_communication_is_healthy", ""); + + EXPECT_CALL(nodesInClusterMock, set(1)); + EXPECT_CALL(isHealthyMock, set(1)); + + Metrics metrics; + + Backend::ClusterData clusterData1 = + std::expected, std::string>(std::unexpected("Connection timeout")); + auto sharedClusterData1 = std::make_shared(clusterData1); + + EXPECT_CALL(isHealthyMock, set(0)); + EXPECT_CALL(nodesInClusterMock, set(1)); + + metrics.onNewState(uuid1, sharedClusterData1); + + ClioNode node1{.uuid = uuid1, .updateTime = std::chrono::system_clock::now(), .dbRole = ClioNode::DbRole::Writer}; + ClioNode node2{.uuid = uuid2, .updateTime = std::chrono::system_clock::now(), .dbRole = ClioNode::DbRole::ReadOnly}; + + std::vector nodes = {node1, node2}; + Backend::ClusterData clusterData2 = std::expected, std::string>(nodes); + auto sharedClusterData2 = std::make_shared(clusterData2); + + EXPECT_CALL(isHealthyMock, set(1)); + EXPECT_CALL(nodesInClusterMock, set(2)); + + metrics.onNewState(uuid2, sharedClusterData2); +} diff --git a/tests/unit/cluster/RepeatedTaskTests.cpp b/tests/unit/cluster/RepeatedTaskTests.cpp new file mode 100644 index 000000000..aa399a372 --- /dev/null +++ b/tests/unit/cluster/RepeatedTaskTests.cpp @@ -0,0 +1,223 @@ +//------------------------------------------------------------------------------ +/* + This file is part of clio: https://github.com/XRPLF/clio + Copyright (c) 2025, the clio developers. + + Permission to use, copy, modify, and distribute this software for any + purpose with or without fee is hereby granted, provided that the above + copyright notice and this permission notice appear in all copies. + + THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +*/ +//============================================================================== + +#include "cluster/impl/RepeatedTask.hpp" +#include "util/AsioContextTestFixture.hpp" + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +using namespace cluster::impl; +using namespace testing; + +struct RepeatedTaskTest : AsyncAsioContextTest { + static constexpr auto kTIMEOUT = std::chrono::seconds{5}; +}; + +template +struct RepeatedTaskTypedTest : RepeatedTaskTest { + std::atomic_int32_t callCount{0}; + std::binary_semaphore semaphore{0}; + testing::StrictMock mockFn; + + void + expectCalls(int const expectedCalls) + { + callCount = 0; + + EXPECT_CALL(mockFn, Call).Times(AtLeast(expectedCalls)).WillRepeatedly([this, expectedCalls](auto&&...) { + ++callCount; + if (callCount >= expectedCalls) { + semaphore.release(); + } + }); + } +}; + +namespace { + +using TypesToTest = Types, MockFunction>; + +} // namespace + +TYPED_TEST_SUITE(RepeatedTaskTypedTest, TypesToTest); + +TYPED_TEST(RepeatedTaskTypedTest, CallsFunctionRepeatedly) +{ + RepeatedTask task(std::chrono::milliseconds(1), this->ctx_); + + this->expectCalls(3); + + task.run(this->mockFn.AsStdFunction()); + + EXPECT_TRUE(this->semaphore.try_acquire_for(TestFixture::kTIMEOUT)); + + task.stop(); +} + +TYPED_TEST(RepeatedTaskTypedTest, StopsImmediately) +{ + auto const interval = std::chrono::seconds(5); + RepeatedTask task(interval, this->ctx_); + + task.run(this->mockFn.AsStdFunction()); + + std::this_thread::sleep_for(std::chrono::milliseconds(5)); + + auto start = std::chrono::steady_clock::now(); + task.stop(); + EXPECT_LT(std::chrono::steady_clock::now() - start, interval); +} + +TYPED_TEST(RepeatedTaskTypedTest, MultipleStops) +{ + RepeatedTask task(std::chrono::milliseconds(1), this->ctx_); + + this->expectCalls(3); + + task.run(this->mockFn.AsStdFunction()); + + EXPECT_TRUE(this->semaphore.try_acquire_for(TestFixture::kTIMEOUT)); + + task.stop(); + task.stop(); + task.stop(); +} + +TYPED_TEST(RepeatedTaskTypedTest, DestructorStopsTask) +{ + this->expectCalls(3); + + { + RepeatedTask task(std::chrono::milliseconds(1), this->ctx_); + + task.run(this->mockFn.AsStdFunction()); + + EXPECT_TRUE(this->semaphore.try_acquire_for(TestFixture::kTIMEOUT)); + + // Destructor will call stop() + } + + auto const countAfterDestruction = this->callCount.load(); + + // Wait a bit - no more calls should happen + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + + EXPECT_EQ(this->callCount, countAfterDestruction); +} + +TYPED_TEST(RepeatedTaskTypedTest, StopWithoutRunIsNoOp) +{ + RepeatedTask task(std::chrono::milliseconds(1), this->ctx_); + + // Should not crash or hang + task.stop(); +} + +TEST_F(RepeatedTaskTest, MultipleTasksRunConcurrently) +{ + StrictMock> mockFn1; + StrictMock> mockFn2; + + RepeatedTask task1(std::chrono::milliseconds(1), ctx_); + RepeatedTask task2(std::chrono::milliseconds(2), ctx_); + + std::atomic_int32_t callCount1{0}; + std::atomic_int32_t callCount2{0}; + std::binary_semaphore semaphore1{0}; + std::binary_semaphore semaphore2{0}; + + EXPECT_CALL(mockFn1, Call).Times(AtLeast(10)).WillRepeatedly([&]() { + if (++callCount1 >= 10) { + semaphore1.release(); + } + }); + + EXPECT_CALL(mockFn2, Call).Times(AtLeast(5)).WillRepeatedly([&]() { + if (++callCount2 >= 5) { + semaphore2.release(); + } + }); + + task1.run(mockFn1.AsStdFunction()); + task2.run(mockFn2.AsStdFunction()); + + EXPECT_TRUE(semaphore1.try_acquire_for(kTIMEOUT)); + EXPECT_TRUE(semaphore2.try_acquire_for(kTIMEOUT)); + + task1.stop(); + task2.stop(); +} + +TYPED_TEST(RepeatedTaskTypedTest, TaskStateTransitionsCorrectly) +{ + RepeatedTask task(std::chrono::milliseconds(1), this->ctx_); + + task.stop(); // Should be no-op + + this->expectCalls(3); + + task.run(this->mockFn.AsStdFunction()); + + EXPECT_TRUE(this->semaphore.try_acquire_for(TestFixture::kTIMEOUT)); + + task.stop(); + + // Stop again should be no-op + task.stop(); +} + +TEST_F(RepeatedTaskTest, FunctionCanAccessYieldContext) +{ + StrictMock> mockFn; + std::atomic_bool yieldContextUsed = false; + std::binary_semaphore semaphore{0}; + + RepeatedTask task(std::chrono::milliseconds(1), ctx_); + + EXPECT_CALL(mockFn, Call).Times(AtLeast(1)).WillRepeatedly([&](boost::asio::yield_context yield) { + if (yieldContextUsed) + return; + + // Use the yield context to verify it's valid + boost::asio::steady_timer timer(yield.get_executor()); + timer.expires_after(std::chrono::milliseconds(1)); + boost::system::error_code ec; + timer.async_wait(yield[ec]); + EXPECT_FALSE(ec) << ec.message(); + yieldContextUsed = true; + semaphore.release(); + }); + + task.run(mockFn.AsStdFunction()); + + EXPECT_TRUE(semaphore.try_acquire_for(kTIMEOUT)); + + task.stop(); + + EXPECT_TRUE(yieldContextUsed); +} diff --git a/tests/unit/cluster/WriterDeciderTests.cpp b/tests/unit/cluster/WriterDeciderTests.cpp new file mode 100644 index 000000000..c3e6bbe39 --- /dev/null +++ b/tests/unit/cluster/WriterDeciderTests.cpp @@ -0,0 +1,314 @@ +//------------------------------------------------------------------------------ +/* + This file is part of clio: https://github.com/XRPLF/clio + Copyright (c) 2025, the clio developers. + + Permission to use, copy, modify, and distribute this software for any + purpose with or without fee is hereby granted, provided that the above + copyright notice and this permission notice appear in all copies. + + THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +*/ +//============================================================================== + +#include "cluster/Backend.hpp" +#include "cluster/ClioNode.hpp" +#include "cluster/WriterDecider.hpp" +#include "util/MockWriterState.hpp" + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +using namespace cluster; + +enum class ExpectedAction { StartWriting, GiveUpWriting, NoAction, SetFallback }; + +struct WriterDeciderTestParams { + std::string testName; + uint8_t selfUuidValue; + std::vector> nodes; + ExpectedAction expectedAction; + bool useEmptyClusterData = false; +}; + +struct WriterDeciderTest : testing::TestWithParam { + ~WriterDeciderTest() override + { + ctx.stop(); + ctx.join(); + } + + boost::asio::thread_pool ctx{1}; + std::unique_ptr writerState = std::make_unique(); + MockWriterState& writerStateRef = *writerState; + + static ClioNode + makeNode(boost::uuids::uuid const& uuid, ClioNode::DbRole role) + { + return ClioNode{ + .uuid = std::make_shared(uuid), + .updateTime = std::chrono::system_clock::now(), + .dbRole = role + }; + } + + static boost::uuids::uuid + makeUuid(uint8_t value) + { + boost::uuids::uuid uuid{}; + std::ranges::fill(uuid, value); + return uuid; + } +}; + +TEST_P(WriterDeciderTest, WriterSelection) +{ + auto const& params = GetParam(); + + auto const selfUuid = makeUuid(params.selfUuidValue); + + WriterDecider decider{ctx, std::move(writerState)}; + + auto clonedState = std::make_unique(); + + // Set up expectations based on expected action + switch (params.expectedAction) { + case ExpectedAction::StartWriting: + EXPECT_CALL(*clonedState, startWriting()); + EXPECT_CALL(writerStateRef, clone()).WillOnce(testing::Return(testing::ByMove(std::move(clonedState)))); + break; + case ExpectedAction::GiveUpWriting: + EXPECT_CALL(*clonedState, giveUpWriting()); + EXPECT_CALL(writerStateRef, clone()).WillOnce(testing::Return(testing::ByMove(std::move(clonedState)))); + break; + case ExpectedAction::SetFallback: + EXPECT_CALL(*clonedState, setWriterDecidingFallback()); + EXPECT_CALL(writerStateRef, clone()).WillOnce(testing::Return(testing::ByMove(std::move(clonedState)))); + break; + case ExpectedAction::NoAction: + if (not params.useEmptyClusterData) { + // For all-ReadOnly case, we still clone but don't call any action + EXPECT_CALL(writerStateRef, clone()).WillOnce(testing::Return(testing::ByMove(std::move(clonedState)))); + } + // For empty cluster data, clone is never called + break; + } + + std::shared_ptr clusterData; + ClioNode::CUuid selfIdPtr; + + if (params.useEmptyClusterData) { + clusterData = std::make_shared(std::unexpected(std::string("Communication failed"))); + selfIdPtr = std::make_shared(selfUuid); + } else { + std::vector nodes; + nodes.reserve(params.nodes.size()); + for (auto const& [uuidValue, role] : params.nodes) { + auto node = makeNode(makeUuid(uuidValue), role); + if (uuidValue == params.selfUuidValue) { + selfIdPtr = node.uuid; // Use the same shared_ptr as in the node + } + nodes.push_back(std::move(node)); + } + clusterData = std::make_shared(std::move(nodes)); + } + + decider.onNewState(selfIdPtr, clusterData); + + ctx.join(); +} + +INSTANTIATE_TEST_SUITE_P( + WriterDeciderTests, + WriterDeciderTest, + testing::Values( + WriterDeciderTestParams{ + .testName = "SelfNodeIsSelectedAsWriter", + .selfUuidValue = 0x01, + .nodes = {{0x01, ClioNode::DbRole::Writer}, {0x02, ClioNode::DbRole::Writer}}, + .expectedAction = ExpectedAction::StartWriting + }, + WriterDeciderTestParams{ + .testName = "OtherNodeIsSelectedAsWriter", + .selfUuidValue = 0x02, + .nodes = {{0x01, ClioNode::DbRole::Writer}, {0x02, ClioNode::DbRole::Writer}}, + .expectedAction = ExpectedAction::GiveUpWriting + }, + WriterDeciderTestParams{ + .testName = "NodesAreSortedByUUID", + .selfUuidValue = 0x02, + .nodes = + {{0x03, ClioNode::DbRole::Writer}, {0x02, ClioNode::DbRole::Writer}, {0x01, ClioNode::DbRole::Writer}}, + .expectedAction = ExpectedAction::GiveUpWriting + }, + WriterDeciderTestParams{ + .testName = "FirstNodeAfterReadOnlyIsNotSelf", + .selfUuidValue = 0x03, + .nodes = + {{0x01, ClioNode::DbRole::ReadOnly}, + {0x02, ClioNode::DbRole::Writer}, + {0x03, ClioNode::DbRole::NotWriter}}, + .expectedAction = ExpectedAction::GiveUpWriting + }, + WriterDeciderTestParams{ + .testName = "FirstNodeAfterReadOnlyIsSelf", + .selfUuidValue = 0x02, + .nodes = + {{0x01, ClioNode::DbRole::ReadOnly}, + {0x02, ClioNode::DbRole::Writer}, + {0x03, ClioNode::DbRole::NotWriter}}, + .expectedAction = ExpectedAction::StartWriting + }, + WriterDeciderTestParams{ + .testName = "AllNodesReadOnlyGiveUpWriting", + .selfUuidValue = 0x01, + .nodes = {{0x01, ClioNode::DbRole::ReadOnly}, {0x02, ClioNode::DbRole::ReadOnly}}, + .expectedAction = ExpectedAction::GiveUpWriting + }, + WriterDeciderTestParams{ + .testName = "EmptyClusterDataNoActionTaken", + .selfUuidValue = 0x01, + .nodes = {}, + .expectedAction = ExpectedAction::NoAction, + .useEmptyClusterData = true + }, + WriterDeciderTestParams{ + .testName = "SingleNodeClusterSelfIsWriter", + .selfUuidValue = 0x01, + .nodes = {{0x01, ClioNode::DbRole::Writer}}, + .expectedAction = ExpectedAction::StartWriting + }, + WriterDeciderTestParams{ + .testName = "NotWriterRoleIsSelectedWhenNoWriterRole", + .selfUuidValue = 0x01, + .nodes = {{0x01, ClioNode::DbRole::NotWriter}, {0x02, ClioNode::DbRole::NotWriter}}, + .expectedAction = ExpectedAction::StartWriting + }, + WriterDeciderTestParams{ + .testName = "MixedRolesFirstNonReadOnlyIsSelected", + .selfUuidValue = 0x03, + .nodes = + {{0x01, ClioNode::DbRole::ReadOnly}, + {0x02, ClioNode::DbRole::Writer}, + {0x03, ClioNode::DbRole::NotWriter}, + {0x04, ClioNode::DbRole::Writer}}, + .expectedAction = ExpectedAction::GiveUpWriting + }, + WriterDeciderTestParams{ + .testName = "ShuffledNodesAreSortedCorrectly", + .selfUuidValue = 0x04, + .nodes = + {{0x04, ClioNode::DbRole::Writer}, + {0x01, ClioNode::DbRole::Writer}, + {0x03, ClioNode::DbRole::Writer}, + {0x02, ClioNode::DbRole::Writer}}, + .expectedAction = ExpectedAction::GiveUpWriting + }, + WriterDeciderTestParams{ + .testName = "ShuffledNodesWithReadOnlySelfIsSelected", + .selfUuidValue = 0x03, + .nodes = + {{0x05, ClioNode::DbRole::Writer}, + {0x01, ClioNode::DbRole::ReadOnly}, + {0x04, ClioNode::DbRole::Writer}, + {0x03, ClioNode::DbRole::Writer}, + {0x02, ClioNode::DbRole::ReadOnly}}, + .expectedAction = ExpectedAction::StartWriting + }, + WriterDeciderTestParams{ + .testName = "SelfIsFallbackNoActionTaken", + .selfUuidValue = 0x01, + .nodes = {{0x01, ClioNode::DbRole::Fallback}, {0x02, ClioNode::DbRole::Writer}}, + .expectedAction = ExpectedAction::NoAction + }, + WriterDeciderTestParams{ + .testName = "OtherNodeIsFallbackSetsFallbackMode", + .selfUuidValue = 0x01, + .nodes = {{0x01, ClioNode::DbRole::Writer}, {0x02, ClioNode::DbRole::Fallback}}, + .expectedAction = ExpectedAction::SetFallback + }, + WriterDeciderTestParams{ + .testName = "SelfIsReadOnlyOthersAreFallbackGiveUpWriting", + .selfUuidValue = 0x01, + .nodes = {{0x01, ClioNode::DbRole::ReadOnly}, {0x02, ClioNode::DbRole::Fallback}}, + .expectedAction = ExpectedAction::GiveUpWriting + }, + WriterDeciderTestParams{ + .testName = "MultipleFallbackNodesSelfNotFallbackSetsFallback", + .selfUuidValue = 0x03, + .nodes = + {{0x01, ClioNode::DbRole::Fallback}, + {0x02, ClioNode::DbRole::Fallback}, + {0x03, ClioNode::DbRole::Writer}}, + .expectedAction = ExpectedAction::SetFallback + }, + WriterDeciderTestParams{ + .testName = "MixedRolesWithOneFallbackSetsFallback", + .selfUuidValue = 0x02, + .nodes = + {{0x01, ClioNode::DbRole::Writer}, + {0x02, ClioNode::DbRole::NotWriter}, + {0x03, ClioNode::DbRole::Fallback}, + {0x04, ClioNode::DbRole::Writer}}, + .expectedAction = ExpectedAction::SetFallback + }, + WriterDeciderTestParams{ + .testName = "SelfIsLoadingCacheOtherIsWriter", + .selfUuidValue = 0x01, + .nodes = {{0x01, ClioNode::DbRole::LoadingCache}, {0x02, ClioNode::DbRole::Writer}}, + .expectedAction = ExpectedAction::GiveUpWriting + }, + WriterDeciderTestParams{ + .testName = "OtherNodeIsLoadingCacheSkipToNextWriter", + .selfUuidValue = 0x02, + .nodes = + {{0x01, ClioNode::DbRole::LoadingCache}, + {0x02, ClioNode::DbRole::Writer}, + {0x03, ClioNode::DbRole::NotWriter}}, + .expectedAction = ExpectedAction::StartWriting + }, + WriterDeciderTestParams{ + .testName = "AllNodesLoadingCacheNoActionTaken", + .selfUuidValue = 0x01, + .nodes = {{0x01, ClioNode::DbRole::LoadingCache}, {0x02, ClioNode::DbRole::LoadingCache}}, + .expectedAction = ExpectedAction::NoAction + }, + WriterDeciderTestParams{ + .testName = "MixedWithLoadingCacheReadOnlyFirstNonReadOnlyNonLoadingCacheSelected", + .selfUuidValue = 0x03, + .nodes = + {{0x01, ClioNode::DbRole::ReadOnly}, + {0x02, ClioNode::DbRole::LoadingCache}, + {0x03, ClioNode::DbRole::Writer}, + {0x04, ClioNode::DbRole::NotWriter}}, + .expectedAction = ExpectedAction::StartWriting + }, + WriterDeciderTestParams{ + .testName = "LoadingCacheBeforeWriterSkipsLoadingCache", + .selfUuidValue = 0x04, + .nodes = + {{0x01, ClioNode::DbRole::LoadingCache}, + {0x02, ClioNode::DbRole::LoadingCache}, + {0x03, ClioNode::DbRole::Writer}, + {0x04, ClioNode::DbRole::NotWriter}}, + .expectedAction = ExpectedAction::GiveUpWriting + } + ), + [](testing::TestParamInfo const& info) { return info.param.testName; } +); diff --git a/tests/unit/etl/ETLServiceTests.cpp b/tests/unit/etl/ETLServiceTests.cpp index 253009459..91625600f 100644 --- a/tests/unit/etl/ETLServiceTests.cpp +++ b/tests/unit/etl/ETLServiceTests.cpp @@ -216,6 +216,10 @@ protected: std::shared_ptr> monitorProvider_ = std::make_shared>(); std::shared_ptr systemState_ = std::make_shared(); + testing::StrictMock> mockWriteSignalCommandCallback_; + boost::signals2::scoped_connection writeCommandConnection_{ + systemState_->writeCommandSignal.connect(mockWriteSignalCommandCallback_.AsStdFunction()) + }; etl::ETLService service_{ ctx_, @@ -300,6 +304,7 @@ TEST_F(ETLServiceTests, RunWithEmptyDatabase) auto mockTaskManager = std::make_unique>(); auto& mockTaskManagerRef = *mockTaskManager; auto ledgerData = createTestData(kSEQ); + EXPECT_TRUE(systemState_->isLoadingCache); testing::Sequence const s; EXPECT_CALL(*backend_, hardFetchLedgerRange).InSequence(s).WillOnce(testing::Return(std::nullopt)); @@ -308,25 +313,61 @@ TEST_F(ETLServiceTests, RunWithEmptyDatabase) EXPECT_CALL(*balancer_, loadInitialLedger(kSEQ, testing::_, testing::_)) .WillOnce(testing::Return(std::vector{})); EXPECT_CALL(*loader_, loadInitialLedger).WillOnce(testing::Return(ripple::LedgerHeader{})); - EXPECT_CALL(*backend_, hardFetchLedgerRange) - .InSequence(s) - .WillOnce(testing::Return(data::LedgerRange{.minSequence = 1, .maxSequence = kSEQ})); + // In syncCacheWithDb() + EXPECT_CALL(*backend_, hardFetchLedgerRange).Times(2).InSequence(s).WillRepeatedly([this]() { + backend_->cache().update({}, kSEQ, false); + return data::LedgerRange{.minSequence = 1, .maxSequence = kSEQ}; + }); EXPECT_CALL(mockTaskManagerRef, run); - EXPECT_CALL(*taskManagerProvider_, make(testing::_, testing::_, kSEQ + 1, testing::_)) - .WillOnce(testing::Return(std::unique_ptr(mockTaskManager.release()))); - EXPECT_CALL(*monitorProvider_, make(testing::_, testing::_, testing::_, testing::_, testing::_)) - .WillOnce([](auto, auto, auto, auto, auto) { return std::make_unique>(); }); + EXPECT_CALL(*taskManagerProvider_, make(testing::_, testing::_, kSEQ + 1, testing::_)).WillOnce([&](auto&&...) { + EXPECT_FALSE(systemState_->isLoadingCache); + return std::unique_ptr(mockTaskManager.release()); + }); + EXPECT_CALL(*monitorProvider_, make(testing::_, testing::_, testing::_, kSEQ + 1, testing::_)) + .WillOnce([this](auto, auto, auto, auto, auto) { + EXPECT_TRUE(systemState_->isLoadingCache); + return std::make_unique>(); + }); service_.run(); } TEST_F(ETLServiceTests, RunWithPopulatedDatabase) { + EXPECT_TRUE(systemState_->isLoadingCache); + backend_->cache().update({}, kSEQ, false); EXPECT_CALL(*backend_, hardFetchLedgerRange) .WillRepeatedly(testing::Return(data::LedgerRange{.minSequence = 1, .maxSequence = kSEQ})); - EXPECT_CALL(*monitorProvider_, make).WillOnce([](auto, auto, auto, auto, auto) { - return std::make_unique>(); - }); + EXPECT_CALL(*monitorProvider_, make(testing::_, testing::_, testing::_, kSEQ + 1, testing::_)) + .WillOnce([this](auto, auto, auto, auto, auto) { + EXPECT_TRUE(systemState_->isLoadingCache); + return std::make_unique>(); + }); + EXPECT_CALL(*ledgers_, getMostRecent()).WillRepeatedly(testing::Return(kSEQ)); + EXPECT_CALL(*cacheLoader_, load(kSEQ)); + + service_.run(); +} + +TEST_F(ETLServiceTests, SyncCacheWithDbBeforeStartingMonitor) +{ + EXPECT_TRUE(systemState_->isLoadingCache); + backend_->cache().update({}, kSEQ - 2, false); + EXPECT_CALL(*backend_, hardFetchLedgerRange) + .WillRepeatedly(testing::Return(data::LedgerRange{.minSequence = 1, .maxSequence = kSEQ})); + + EXPECT_CALL(*backend_, fetchLedgerDiff(kSEQ - 1, testing::_)); + EXPECT_CALL(*cacheUpdater_, update(kSEQ - 1, std::vector())) + .WillOnce([this](auto const seq, auto&&...) { backend_->cache().update({}, seq, false); }); + EXPECT_CALL(*backend_, fetchLedgerDiff(kSEQ, testing::_)); + EXPECT_CALL(*cacheUpdater_, update(kSEQ, std::vector())) + .WillOnce([this](auto const seq, auto&&...) { backend_->cache().update({}, seq, false); }); + + EXPECT_CALL(*monitorProvider_, make(testing::_, testing::_, testing::_, kSEQ + 1, testing::_)) + .WillOnce([this](auto, auto, auto, auto, auto) { + EXPECT_TRUE(systemState_->isLoadingCache); + return std::make_unique>(); + }); EXPECT_CALL(*ledgers_, getMostRecent()).WillRepeatedly(testing::Return(kSEQ)); EXPECT_CALL(*cacheLoader_, load(kSEQ)); @@ -364,19 +405,22 @@ TEST_F(ETLServiceTests, HandlesWriteConflictInMonitorSubscription) EXPECT_CALL(mockMonitorRef, subscribeToDbStalled); EXPECT_CALL(mockMonitorRef, run); + // Set cache to be in sync with DB to avoid syncCacheWithDb loop + backend_->cache().update({}, kSEQ, false); EXPECT_CALL(*backend_, hardFetchLedgerRange) - .WillOnce(testing::Return(data::LedgerRange{.minSequence = 1, .maxSequence = kSEQ})); + .Times(2) + .WillRepeatedly(testing::Return(data::LedgerRange{.minSequence = 1, .maxSequence = kSEQ})); EXPECT_CALL(*ledgers_, getMostRecent()).WillOnce(testing::Return(kSEQ)); EXPECT_CALL(*cacheLoader_, load(kSEQ)); service_.run(); - systemState_->writeConflict = true; + writeCommandConnection_.disconnect(); + systemState_->writeCommandSignal(etl::SystemState::WriteCommand::StopWriting); EXPECT_CALL(*publisher_, publish(kSEQ + 1, testing::_, testing::_)); ASSERT_TRUE(capturedCallback); capturedCallback(kSEQ + 1); - EXPECT_FALSE(systemState_->writeConflict); EXPECT_FALSE(systemState_->isWriting); } @@ -397,8 +441,11 @@ TEST_F(ETLServiceTests, NormalFlowInMonitorSubscription) EXPECT_CALL(mockMonitorRef, subscribeToDbStalled); EXPECT_CALL(mockMonitorRef, run); + // Set cache to be in sync with DB to avoid syncCacheWithDb loop + backend_->cache().update({}, kSEQ, false); EXPECT_CALL(*backend_, hardFetchLedgerRange) - .WillOnce(testing::Return(data::LedgerRange{.minSequence = 1, .maxSequence = kSEQ})); + .Times(2) + .WillRepeatedly(testing::Return(data::LedgerRange{.minSequence = 1, .maxSequence = kSEQ})); EXPECT_CALL(*ledgers_, getMostRecent()).WillOnce(testing::Return(kSEQ)); EXPECT_CALL(*cacheLoader_, load(kSEQ)); @@ -424,13 +471,19 @@ TEST_F(ETLServiceTests, AttemptTakeoverWriter) return std::move(mockMonitor); }); - EXPECT_CALL(mockMonitorRef, subscribeToNewSequence); + std::function onNewSeqCallback; + EXPECT_CALL(mockMonitorRef, subscribeToNewSequence).WillOnce([&onNewSeqCallback](auto cb) { + onNewSeqCallback = std::move(cb); + return boost::signals2::scoped_connection{}; + }); EXPECT_CALL(mockMonitorRef, subscribeToDbStalled).WillOnce([&capturedDbStalledCallback](auto callback) { capturedDbStalledCallback = callback; return boost::signals2::scoped_connection{}; }); EXPECT_CALL(mockMonitorRef, run); + // Set cache to be in sync with DB to avoid syncCacheWithDb loop + backend_->cache().update({}, kSEQ, false); EXPECT_CALL(*backend_, hardFetchLedgerRange) .WillRepeatedly(testing::Return(data::LedgerRange{.minSequence = 1, .maxSequence = kSEQ})); EXPECT_CALL(*ledgers_, getMostRecent()).WillOnce(testing::Return(kSEQ)); @@ -447,10 +500,14 @@ TEST_F(ETLServiceTests, AttemptTakeoverWriter) EXPECT_CALL(*taskManagerProvider_, make(testing::_, testing::_, kSEQ + 1, testing::_)) .WillOnce(testing::Return(std::move(mockTaskManager))); - ASSERT_TRUE(capturedDbStalledCallback); - capturedDbStalledCallback(); + EXPECT_CALL(mockWriteSignalCommandCallback_, Call(etl::SystemState::WriteCommand::StartWriting)); - EXPECT_TRUE(systemState_->isWriting); // should attempt to become writer + ASSERT_TRUE(capturedDbStalledCallback); + EXPECT_FALSE(systemState_->isWriting); // will attempt to become writer after new sequence appears but not yet + EXPECT_FALSE(systemState_->isWriterDecidingFallback); + capturedDbStalledCallback(); + EXPECT_TRUE(systemState_->isWriting); // should attempt to become writer + EXPECT_TRUE(systemState_->isWriterDecidingFallback); // fallback mode activated } TEST_F(ETLServiceTests, GiveUpWriterAfterWriteConflict) @@ -470,22 +527,25 @@ TEST_F(ETLServiceTests, GiveUpWriterAfterWriteConflict) EXPECT_CALL(mockMonitorRef, subscribeToDbStalled); EXPECT_CALL(mockMonitorRef, run); + // Set cache to be in sync with DB to avoid syncCacheWithDb loop + backend_->cache().update({}, kSEQ, false); EXPECT_CALL(*backend_, hardFetchLedgerRange) - .WillOnce(testing::Return(data::LedgerRange{.minSequence = 1, .maxSequence = kSEQ})); + .Times(2) + .WillRepeatedly(testing::Return(data::LedgerRange{.minSequence = 1, .maxSequence = kSEQ})); EXPECT_CALL(*ledgers_, getMostRecent()).WillOnce(testing::Return(kSEQ)); EXPECT_CALL(*cacheLoader_, load(kSEQ)); service_.run(); systemState_->isWriting = true; - systemState_->writeConflict = true; // got a write conflict along the way + writeCommandConnection_.disconnect(); + systemState_->writeCommandSignal(etl::SystemState::WriteCommand::StopWriting); EXPECT_CALL(*publisher_, publish(kSEQ + 1, testing::_, testing::_)); ASSERT_TRUE(capturedCallback); capturedCallback(kSEQ + 1); - EXPECT_FALSE(systemState_->isWriting); // gives up writing - EXPECT_FALSE(systemState_->writeConflict); // and removes write conflict flag + EXPECT_FALSE(systemState_->isWriting); // gives up writing } TEST_F(ETLServiceTests, CancelledLoadInitialLedger) @@ -539,3 +599,327 @@ TEST_F(ETLServiceTests, RunStopsIfInitialLoadIsCancelledByBalancer) EXPECT_FALSE(service_.isAmendmentBlocked()); EXPECT_FALSE(service_.isCorruptionDetected()); } + +TEST_F(ETLServiceTests, DbStalledDoesNotTriggerSignalWhenStrictReadonly) +{ + auto mockMonitor = std::make_unique>(); + auto& mockMonitorRef = *mockMonitor; + std::function capturedDbStalledCallback; + + EXPECT_CALL(*monitorProvider_, make).WillOnce([&mockMonitor](auto, auto, auto, auto, auto) { + return std::move(mockMonitor); + }); + EXPECT_CALL(mockMonitorRef, subscribeToNewSequence); + EXPECT_CALL(mockMonitorRef, subscribeToDbStalled).WillOnce([&capturedDbStalledCallback](auto callback) { + capturedDbStalledCallback = callback; + return boost::signals2::scoped_connection{}; + }); + EXPECT_CALL(mockMonitorRef, run); + + // Set cache to be in sync with DB to avoid syncCacheWithDb loop + backend_->cache().update({}, kSEQ, false); + EXPECT_CALL(*backend_, hardFetchLedgerRange) + .WillRepeatedly(testing::Return(data::LedgerRange{.minSequence = 1, .maxSequence = kSEQ})); + EXPECT_CALL(*ledgers_, getMostRecent()).WillOnce(testing::Return(kSEQ)); + EXPECT_CALL(*cacheLoader_, load(kSEQ)); + + service_.run(); + systemState_->isStrictReadonly = true; // strict readonly mode + systemState_->isWriting = false; + + // No signal should be emitted because node is in strict readonly mode + // But fallback flag should still be set + + ASSERT_TRUE(capturedDbStalledCallback); + EXPECT_FALSE(systemState_->isWriterDecidingFallback); + capturedDbStalledCallback(); + EXPECT_TRUE(systemState_->isWriterDecidingFallback); // fallback mode activated even in readonly +} + +TEST_F(ETLServiceTests, DbStalledDoesNotTriggerSignalWhenAlreadyWriting) +{ + auto mockMonitor = std::make_unique>(); + auto& mockMonitorRef = *mockMonitor; + std::function capturedDbStalledCallback; + + EXPECT_CALL(*monitorProvider_, make).WillOnce([&mockMonitor](auto, auto, auto, auto, auto) { + return std::move(mockMonitor); + }); + EXPECT_CALL(mockMonitorRef, subscribeToNewSequence); + EXPECT_CALL(mockMonitorRef, subscribeToDbStalled).WillOnce([&capturedDbStalledCallback](auto callback) { + capturedDbStalledCallback = callback; + return boost::signals2::scoped_connection{}; + }); + EXPECT_CALL(mockMonitorRef, run); + + // Set cache to be in sync with DB to avoid syncCacheWithDb loop + backend_->cache().update({}, kSEQ, false); + EXPECT_CALL(*backend_, hardFetchLedgerRange) + .WillRepeatedly(testing::Return(data::LedgerRange{.minSequence = 1, .maxSequence = kSEQ})); + EXPECT_CALL(*ledgers_, getMostRecent()).WillOnce(testing::Return(kSEQ)); + EXPECT_CALL(*cacheLoader_, load(kSEQ)); + + service_.run(); + systemState_->isStrictReadonly = false; + systemState_->isWriting = true; // already writing + + // No signal should be emitted because node is already writing + // But fallback flag should still be set + + ASSERT_TRUE(capturedDbStalledCallback); + EXPECT_FALSE(systemState_->isWriterDecidingFallback); + capturedDbStalledCallback(); + EXPECT_TRUE(systemState_->isWriterDecidingFallback); // fallback mode activated +} + +TEST_F(ETLServiceTests, CacheUpdatesDependOnActualCacheState_WriterMode) +{ + auto mockMonitor = std::make_unique>(); + auto& mockMonitorRef = *mockMonitor; + std::function capturedCallback; + + EXPECT_CALL(*monitorProvider_, make).WillOnce([&mockMonitor](auto, auto, auto, auto, auto) { + return std::move(mockMonitor); + }); + EXPECT_CALL(mockMonitorRef, subscribeToNewSequence).WillOnce([&capturedCallback](auto callback) { + capturedCallback = callback; + return boost::signals2::scoped_connection{}; + }); + EXPECT_CALL(mockMonitorRef, subscribeToDbStalled); + EXPECT_CALL(mockMonitorRef, run); + + // Set cache to be in sync with DB initially to avoid syncCacheWithDb loop + backend_->cache().update({}, kSEQ, false); + EXPECT_CALL(*backend_, hardFetchLedgerRange) + .WillRepeatedly(testing::Return(data::LedgerRange{.minSequence = 1, .maxSequence = kSEQ})); + EXPECT_CALL(*ledgers_, getMostRecent()).WillOnce(testing::Return(kSEQ)); + EXPECT_CALL(*cacheLoader_, load(kSEQ)); + + service_.run(); + systemState_->isWriting = true; // In writer mode + + // Simulate cache is behind (e.g., update failed previously) + // Cache latestLedgerSequence returns kSEQ (behind the new seq kSEQ + 1) + std::vector const emptyObjs = {}; + backend_->cache().update(emptyObjs, kSEQ); // Set cache to kSEQ + + std::vector const dummyDiff = {}; + EXPECT_CALL(*backend_, fetchLedgerDiff(kSEQ + 1, testing::_)).WillOnce(testing::Return(dummyDiff)); + + // Cache should be updated even though we're in writer mode + EXPECT_CALL(*cacheUpdater_, update(kSEQ + 1, testing::A const&>())); + + EXPECT_CALL(*publisher_, publish(kSEQ + 1, testing::_, testing::_)); + + ASSERT_TRUE(capturedCallback); + capturedCallback(kSEQ + 1); +} + +TEST_F(ETLServiceTests, OnlyCacheUpdatesWhenBackendIsCurrent) +{ + auto mockMonitor = std::make_unique>(); + auto& mockMonitorRef = *mockMonitor; + std::function capturedCallback; + // Set cache to be in sync with DB initially to avoid syncCacheWithDb loop + backend_->cache().update({}, kSEQ, false); + + EXPECT_CALL(*monitorProvider_, make).WillOnce([&mockMonitor](auto, auto, auto, auto, auto) { + return std::move(mockMonitor); + }); + EXPECT_CALL(mockMonitorRef, subscribeToNewSequence).WillOnce([&capturedCallback](auto callback) { + capturedCallback = callback; + return boost::signals2::scoped_connection{}; + }); + EXPECT_CALL(mockMonitorRef, subscribeToDbStalled); + EXPECT_CALL(mockMonitorRef, run); + + // Set backend range to be at kSEQ + 1 (already current) + EXPECT_CALL(*backend_, hardFetchLedgerRange) + .WillOnce(testing::Return(data::LedgerRange{.minSequence = 1, .maxSequence = kSEQ})) + .WillOnce(testing::Return(data::LedgerRange{.minSequence = 1, .maxSequence = kSEQ})) + .WillRepeatedly(testing::Return(data::LedgerRange{.minSequence = 1, .maxSequence = kSEQ + 1})); + EXPECT_CALL(*ledgers_, getMostRecent()).WillOnce(testing::Return(kSEQ)); + EXPECT_CALL(*cacheLoader_, load(kSEQ)); + + service_.run(); + systemState_->isWriting = false; + + // Cache is behind (at kSEQ) + std::vector const emptyObjs = {}; + backend_->cache().update(emptyObjs, kSEQ); + + std::vector const dummyDiff = {}; + EXPECT_CALL(*backend_, fetchLedgerDiff(kSEQ + 1, testing::_)).WillOnce(testing::Return(dummyDiff)); + EXPECT_CALL(*cacheUpdater_, update(kSEQ + 1, testing::A const&>())); + + EXPECT_CALL(*publisher_, publish(kSEQ + 1, testing::_, testing::_)); + + ASSERT_TRUE(capturedCallback); + capturedCallback(kSEQ + 1); +} + +TEST_F(ETLServiceTests, NoUpdatesWhenBothCacheAndBackendAreCurrent) +{ + auto mockMonitor = std::make_unique>(); + auto& mockMonitorRef = *mockMonitor; + std::function capturedCallback; + // Set cache to be in sync with DB initially to avoid syncCacheWithDb loop + backend_->cache().update({}, kSEQ, false); + + EXPECT_CALL(*monitorProvider_, make).WillOnce([&mockMonitor](auto, auto, auto, auto, auto) { + return std::move(mockMonitor); + }); + EXPECT_CALL(mockMonitorRef, subscribeToNewSequence).WillOnce([&capturedCallback](auto callback) { + capturedCallback = callback; + return boost::signals2::scoped_connection{}; + }); + EXPECT_CALL(mockMonitorRef, subscribeToDbStalled); + EXPECT_CALL(mockMonitorRef, run); + + // Set backend range to be at kSEQ + 1 (already current) + EXPECT_CALL(*backend_, hardFetchLedgerRange) + .WillOnce(testing::Return(data::LedgerRange{.minSequence = 1, .maxSequence = kSEQ})) + .WillOnce(testing::Return(data::LedgerRange{.minSequence = 1, .maxSequence = kSEQ})) + .WillRepeatedly(testing::Return(data::LedgerRange{.minSequence = 1, .maxSequence = kSEQ + 1})); + EXPECT_CALL(*ledgers_, getMostRecent()).WillOnce(testing::Return(kSEQ)); + EXPECT_CALL(*cacheLoader_, load(kSEQ)); + + service_.run(); + + // Cache is current (at kSEQ + 1) + std::vector const emptyObjs = {}; + backend_->cache().update(emptyObjs, kSEQ + 1); + + // Neither should be updated + EXPECT_CALL(*backend_, fetchLedgerDiff).Times(0); + EXPECT_CALL(*cacheUpdater_, update(testing::_, testing::A const&>())).Times(0); + + EXPECT_CALL(*publisher_, publish(kSEQ + 1, testing::_, testing::_)); + + ASSERT_TRUE(capturedCallback); + capturedCallback(kSEQ + 1); +} + +TEST_F(ETLServiceTests, StopWaitsForWriteCommandHandlersToComplete) +{ + auto mockMonitor = std::make_unique>(); + // Set cache to be in sync with DB to avoid syncCacheWithDb loop + backend_->cache().update({}, kSEQ, false); + + EXPECT_CALL(*monitorProvider_, make).WillOnce([&mockMonitor](auto, auto, auto, auto, auto) { + return std::move(mockMonitor); + }); + + EXPECT_CALL(*backend_, hardFetchLedgerRange) + .WillRepeatedly(testing::Return(data::LedgerRange{.minSequence = 1, .maxSequence = kSEQ})); + EXPECT_CALL(*ledgers_, getMostRecent()).WillOnce(testing::Return(kSEQ)); + EXPECT_CALL(*cacheLoader_, load(kSEQ)); + + service_.run(); + systemState_->isStrictReadonly = false; + + auto mockTaskManager = std::make_unique>(); + + EXPECT_CALL(mockWriteSignalCommandCallback_, Call(etl::SystemState::WriteCommand::StartWriting)); + EXPECT_CALL(*taskManagerProvider_, make(testing::_, testing::_, kSEQ + 1, testing::_)) + .WillOnce(testing::Return(std::move(mockTaskManager))); + + // Emit a command + systemState_->writeCommandSignal(etl::SystemState::WriteCommand::StartWriting); + + // The test context processes operations synchronously, so the handler should have run + // Stop should wait for the handler to complete and disconnect the subscription + service_.stop(); + + // Verify stop() returned, meaning all handlers completed + SUCCEED(); +} + +TEST_F(ETLServiceTests, WriteConflictIsHandledImmediately_NotDelayed) +{ + // This test verifies that write conflicts are handled immediately via signal, + // not delayed until the next sequence notification (the old behavior) + + auto mockMonitor = std::make_unique>(); + auto& mockMonitorRef = *mockMonitor; + std::function capturedNewSeqCallback; + + EXPECT_CALL(*monitorProvider_, make).WillOnce([&mockMonitor](auto, auto, auto, auto, auto) { + return std::move(mockMonitor); + }); + EXPECT_CALL(mockMonitorRef, subscribeToNewSequence).WillOnce([&capturedNewSeqCallback](auto callback) { + capturedNewSeqCallback = callback; + return boost::signals2::scoped_connection{}; + }); + EXPECT_CALL(mockMonitorRef, subscribeToDbStalled); + EXPECT_CALL(mockMonitorRef, run); + + // Set cache to be in sync with DB to avoid syncCacheWithDb loop + backend_->cache().update({}, kSEQ, false); + EXPECT_CALL(*backend_, hardFetchLedgerRange) + .WillRepeatedly(testing::Return(data::LedgerRange{.minSequence = 1, .maxSequence = kSEQ})); + EXPECT_CALL(*ledgers_, getMostRecent()).WillOnce(testing::Return(kSEQ)); + EXPECT_CALL(*cacheLoader_, load(kSEQ)); + + service_.run(); + systemState_->isWriting = true; + + // Emit StopWriting signal (simulating write conflict from Loader) + EXPECT_CALL(mockWriteSignalCommandCallback_, Call(etl::SystemState::WriteCommand::StopWriting)); + systemState_->writeCommandSignal(etl::SystemState::WriteCommand::StopWriting); + + // The test context processes operations synchronously, so the handler should have run immediately + // Verify that isWriting is immediately set to false + EXPECT_FALSE(systemState_->isWriting); +} + +TEST_F(ETLServiceTests, WriteCommandsAreSerializedOnStrand) +{ + auto mockMonitor = std::make_unique>(); + + EXPECT_CALL(*monitorProvider_, make).WillOnce([&mockMonitor](auto, auto, auto, auto, auto) { + return std::move(mockMonitor); + }); + + // Set cache to be in sync with DB to avoid syncCacheWithDb loop + backend_->cache().update({}, kSEQ, false); + EXPECT_CALL(*backend_, hardFetchLedgerRange) + .WillRepeatedly(testing::Return(data::LedgerRange{.minSequence = 1, .maxSequence = kSEQ})); + EXPECT_CALL(*ledgers_, getMostRecent()).WillOnce(testing::Return(kSEQ)); + EXPECT_CALL(*cacheLoader_, load(kSEQ)); + + service_.run(); + systemState_->isStrictReadonly = false; + systemState_->isWriting = false; + + auto mockTaskManager1 = std::make_unique>(); + auto mockTaskManager2 = std::make_unique>(); + + // Set up expectations for the sequence of write commands + // The signals should be processed in order: StartWriting, StopWriting, StartWriting + { + testing::InSequence seq; + + // First StartWriting + EXPECT_CALL(mockWriteSignalCommandCallback_, Call(etl::SystemState::WriteCommand::StartWriting)); + EXPECT_CALL(*taskManagerProvider_, make(testing::_, testing::_, kSEQ + 1, testing::_)) + .WillOnce(testing::Return(std::move(mockTaskManager1))); + + // Then StopWriting + EXPECT_CALL(mockWriteSignalCommandCallback_, Call(etl::SystemState::WriteCommand::StopWriting)); + + // Finally second StartWriting + EXPECT_CALL(mockWriteSignalCommandCallback_, Call(etl::SystemState::WriteCommand::StartWriting)); + EXPECT_CALL(*taskManagerProvider_, make(testing::_, testing::_, kSEQ + 1, testing::_)) + .WillOnce(testing::Return(std::move(mockTaskManager2))); + } + + // Emit multiple signals rapidly - they should be serialized on the strand + systemState_->writeCommandSignal(etl::SystemState::WriteCommand::StartWriting); + systemState_->writeCommandSignal(etl::SystemState::WriteCommand::StopWriting); + systemState_->writeCommandSignal(etl::SystemState::WriteCommand::StartWriting); + + // The test context processes operations synchronously, so all signals should have been processed + // Final state should be writing (last signal was StartWriting) + EXPECT_TRUE(systemState_->isWriting); +} diff --git a/tests/unit/etl/LedgerPublisherTests.cpp b/tests/unit/etl/LedgerPublisherTests.cpp index e4d422a9d..5b3c73d0f 100644 --- a/tests/unit/etl/LedgerPublisherTests.cpp +++ b/tests/unit/etl/LedgerPublisherTests.cpp @@ -216,15 +216,14 @@ TEST_F(ETLLedgerPublisherTest, PublishLedgerHeaderCloseTimeGreaterThanNow) TEST_F(ETLLedgerPublisherTest, PublishLedgerSeqStopIsTrue) { auto dummyState = etl::SystemState{}; - dummyState.isStopping = true; auto publisher = impl::LedgerPublisher(ctx, backend_, mockSubscriptionManagerPtr, dummyState); + publisher.stop(); EXPECT_FALSE(publisher.publish(kSEQ, {})); } TEST_F(ETLLedgerPublisherTest, PublishLedgerSeqMaxAttempt) { auto dummyState = etl::SystemState{}; - dummyState.isStopping = false; auto publisher = impl::LedgerPublisher(ctx, backend_, mockSubscriptionManagerPtr, dummyState); static constexpr auto kMAX_ATTEMPT = 2; @@ -238,7 +237,6 @@ TEST_F(ETLLedgerPublisherTest, PublishLedgerSeqMaxAttempt) TEST_F(ETLLedgerPublisherTest, PublishLedgerSeqStopIsFalse) { auto dummyState = etl::SystemState{}; - dummyState.isStopping = false; auto publisher = impl::LedgerPublisher(ctx, backend_, mockSubscriptionManagerPtr, dummyState); LedgerRange const range{.minSequence = kSEQ, .maxSequence = kSEQ}; diff --git a/tests/unit/etl/LoadingTests.cpp b/tests/unit/etl/LoadingTests.cpp index 143f915a2..e65f77060 100644 --- a/tests/unit/etl/LoadingTests.cpp +++ b/tests/unit/etl/LoadingTests.cpp @@ -188,3 +188,59 @@ TEST_F(LoadingAssertTest, LoadInitialLedgerHasDataInDB) EXPECT_CLIO_ASSERT_FAIL({ [[maybe_unused]] auto unused = loader_.loadInitialLedger(data); }); } + +TEST_F(LoadingTests, LoadWriteConflictEmitsStopWritingSignal) +{ + state_->isWriting = true; // writer is active + auto const data = createTestData(); + testing::StrictMock> mockSignalCallback; + + auto connection = state_->writeCommandSignal.connect(mockSignalCallback.AsStdFunction()); + + EXPECT_CALL(*mockRegistryPtr_, dispatch(data)); + EXPECT_CALL(*backend_, doFinishWrites()).WillOnce(testing::Return(false)); // simulate write conflict + EXPECT_CALL(mockSignalCallback, Call(etl::SystemState::WriteCommand::StopWriting)); + + EXPECT_FALSE(state_->isWriterDecidingFallback); + + auto result = loader_.load(data); + EXPECT_FALSE(result.has_value()); + EXPECT_EQ(result.error(), etl::LoaderError::WriteConflict); + EXPECT_TRUE(state_->isWriterDecidingFallback); +} + +TEST_F(LoadingTests, LoadSuccessDoesNotEmitSignal) +{ + state_->isWriting = true; // writer is active + auto const data = createTestData(); + testing::StrictMock> mockSignalCallback; + + auto connection = state_->writeCommandSignal.connect(mockSignalCallback.AsStdFunction()); + + EXPECT_CALL(*mockRegistryPtr_, dispatch(data)); + EXPECT_CALL(*backend_, doFinishWrites()).WillOnce(testing::Return(true)); // success + // No signal should be emitted on success + + EXPECT_FALSE(state_->isWriterDecidingFallback); + + auto result = loader_.load(data); + EXPECT_TRUE(result.has_value()); + EXPECT_FALSE(state_->isWriterDecidingFallback); +} + +TEST_F(LoadingTests, LoadWhenNotWritingDoesNotCheckConflict) +{ + state_->isWriting = false; // not a writer + auto const data = createTestData(); + testing::StrictMock> mockSignalCallback; + + auto connection = state_->writeCommandSignal.connect(mockSignalCallback.AsStdFunction()); + + EXPECT_CALL(*mockRegistryPtr_, dispatch(data)); + // doFinishWrites should not be called when not writing + EXPECT_CALL(*backend_, doFinishWrites()).Times(0); + // No signal should be emitted + + auto result = loader_.load(data); + EXPECT_TRUE(result.has_value()); +} diff --git a/tests/unit/etl/SystemStateTests.cpp b/tests/unit/etl/SystemStateTests.cpp new file mode 100644 index 000000000..aeebd9d66 --- /dev/null +++ b/tests/unit/etl/SystemStateTests.cpp @@ -0,0 +1,73 @@ +//------------------------------------------------------------------------------ +/* + This file is part of clio: https://github.com/XRPLF/clio + Copyright (c) 2026, the clio developers. + + Permission to use, copy, modify, and distribute this software for any + purpose with or without fee is hereby granted, provided that the above + copyright notice and this permission notice appear in all copies. + + THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +*/ +//============================================================================== + +#include "etl/SystemState.hpp" +#include "util/MockPrometheus.hpp" +#include "util/config/ConfigDefinition.hpp" +#include "util/config/ConfigFileJson.hpp" +#include "util/config/ConfigValue.hpp" +#include "util/config/Types.hpp" + +#include +#include +#include +#include + +#include + +using namespace etl; +using namespace util::config; + +struct SystemStateTest : util::prometheus::WithPrometheus {}; + +TEST_F(SystemStateTest, InitialValuesAreCorrect) +{ + auto state = SystemState{}; + + EXPECT_FALSE(state.isStrictReadonly); + EXPECT_FALSE(state.isWriting); + EXPECT_TRUE(state.isLoadingCache); + EXPECT_FALSE(state.isAmendmentBlocked); + EXPECT_FALSE(state.isCorruptionDetected); + EXPECT_FALSE(state.isWriterDecidingFallback); +} + +struct SystemStateReadOnlyTest : util::prometheus::WithPrometheus, testing::WithParamInterface {}; + +TEST_P(SystemStateReadOnlyTest, MakeSystemStateWithReadOnly) +{ + auto const readOnlyValue = GetParam(); + auto const configJson = boost::json::parse(fmt::format(R"JSON({{"read_only": {}}})JSON", readOnlyValue)); + + auto config = ClioConfigDefinition{{{"read_only", ConfigValue{ConfigType::Boolean}}}}; + auto const configFile = ConfigFileJson{configJson.as_object()}; + auto const errors = config.parse(configFile); + ASSERT_FALSE(errors.has_value()); + + auto state = SystemState::makeSystemState(config); + + EXPECT_EQ(state->isStrictReadonly, readOnlyValue); + EXPECT_FALSE(state->isWriting); + EXPECT_TRUE(state->isLoadingCache); + EXPECT_FALSE(state->isAmendmentBlocked); + EXPECT_FALSE(state->isCorruptionDetected); + EXPECT_FALSE(state->isWriterDecidingFallback); +} + +INSTANTIATE_TEST_SUITE_P(SystemStateTest, SystemStateReadOnlyTest, testing::Values(true, false)); diff --git a/tests/unit/etl/WriterStateTests.cpp b/tests/unit/etl/WriterStateTests.cpp new file mode 100644 index 000000000..872d578fd --- /dev/null +++ b/tests/unit/etl/WriterStateTests.cpp @@ -0,0 +1,162 @@ +//------------------------------------------------------------------------------ +/* + This file is part of clio: https://github.com/XRPLF/clio + Copyright (c) 2025, the clio developers. + + Permission to use, copy, modify, and distribute this software for any + purpose with or without fee is hereby granted, provided that the above + copyright notice and this permission notice appear in all copies. + + THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +*/ +//============================================================================== + +#include "etl/SystemState.hpp" +#include "etl/WriterState.hpp" +#include "util/MockPrometheus.hpp" + +#include +#include + +#include + +using namespace etl; +using namespace testing; + +struct WriterStateTest : util::prometheus::WithPrometheus { + std::shared_ptr systemState = std::make_shared(); + StrictMock> mockWriteCommand; + WriterState writerState{systemState}; + + WriterStateTest() + { + systemState->writeCommandSignal.connect(mockWriteCommand.AsStdFunction()); + } +}; + +TEST_F(WriterStateTest, IsWritingReturnsSystemStateValue) +{ + systemState->isWriting = false; + EXPECT_FALSE(writerState.isWriting()); + + systemState->isWriting = true; + EXPECT_TRUE(writerState.isWriting()); +} + +TEST_F(WriterStateTest, StartWritingEmitsStartWritingCommand) +{ + systemState->isWriting = false; + + EXPECT_CALL(mockWriteCommand, Call(SystemState::WriteCommand::StartWriting)); + + writerState.startWriting(); +} + +TEST_F(WriterStateTest, StartWritingDoesNothingWhenAlreadyWriting) +{ + systemState->isWriting = true; + + // No EXPECT_CALL - StrictMock will fail if any command is emitted + + writerState.startWriting(); +} + +TEST_F(WriterStateTest, GiveUpWritingEmitsStopWritingCommand) +{ + systemState->isWriting = true; + + EXPECT_CALL(mockWriteCommand, Call(SystemState::WriteCommand::StopWriting)); + + writerState.giveUpWriting(); +} + +TEST_F(WriterStateTest, GiveUpWritingDoesNothingWhenNotWriting) +{ + systemState->isWriting = false; + + // No EXPECT_CALL - StrictMock will fail if any command is emitted + + writerState.giveUpWriting(); +} + +TEST_F(WriterStateTest, IsFallbackReturnsFalseByDefault) +{ + EXPECT_FALSE(writerState.isFallback()); +} + +TEST_F(WriterStateTest, SetWriterDecidingFallbackSetsFlag) +{ + EXPECT_FALSE(systemState->isWriterDecidingFallback); + + writerState.setWriterDecidingFallback(); + + EXPECT_TRUE(systemState->isWriterDecidingFallback); +} + +TEST_F(WriterStateTest, IsFallbackReturnsSystemStateValue) +{ + systemState->isWriterDecidingFallback = false; + EXPECT_FALSE(writerState.isFallback()); + + systemState->isWriterDecidingFallback = true; + EXPECT_TRUE(writerState.isFallback()); +} + +TEST_F(WriterStateTest, IsReadOnlyReturnsSystemStateValue) +{ + systemState->isStrictReadonly = false; + EXPECT_FALSE(writerState.isReadOnly()); + + systemState->isStrictReadonly = true; + EXPECT_TRUE(writerState.isReadOnly()); +} + +TEST_F(WriterStateTest, IsLoadingCacheReturnsSystemStateValue) +{ + systemState->isLoadingCache = false; + EXPECT_FALSE(writerState.isLoadingCache()); + + systemState->isLoadingCache = true; + EXPECT_TRUE(writerState.isLoadingCache()); +} + +TEST_F(WriterStateTest, CloneCreatesNewInstanceWithSameSystemState) +{ + systemState->isWriting = true; + systemState->isStrictReadonly = true; + systemState->isLoadingCache = false; + + auto cloned = writerState.clone(); + + ASSERT_NE(cloned.get(), &writerState); + EXPECT_TRUE(cloned->isWriting()); + EXPECT_TRUE(cloned->isReadOnly()); + EXPECT_FALSE(cloned->isLoadingCache()); +} + +TEST_F(WriterStateTest, ClonedInstanceSharesSystemState) +{ + auto cloned = writerState.clone(); + + systemState->isWriting = true; + + EXPECT_TRUE(writerState.isWriting()); + EXPECT_TRUE(cloned->isWriting()); + + systemState->isWriting = false; + + EXPECT_FALSE(writerState.isWriting()); + EXPECT_FALSE(cloned->isWriting()); + + EXPECT_FALSE(writerState.isFallback()); + EXPECT_FALSE(cloned->isFallback()); + cloned->setWriterDecidingFallback(); + EXPECT_TRUE(writerState.isFallback()); + EXPECT_TRUE(cloned->isFallback()); +} diff --git a/tests/unit/util/ChannelTests.cpp b/tests/unit/util/ChannelTests.cpp index 13809c453..3630ee175 100644 --- a/tests/unit/util/ChannelTests.cpp +++ b/tests/unit/util/ChannelTests.cpp @@ -187,7 +187,7 @@ TEST_P(ChannelSpawnTest, MultipleSendersMultipleReceivers) context_.withExecutor([this](auto& executor) { auto [sender, receiver] = util::Channel::create(executor, 10); util::Mutex> receivedValues; - std::vector receivers(kNUM_RECEIVERS, receiver); + std::vector receivers(kNUM_RECEIVERS, receiver); for (auto receiverId = 0uz; receiverId < kNUM_RECEIVERS; ++receiverId) { util::spawn( @@ -402,7 +402,7 @@ TEST_P(ChannelCallbackTest, MultipleSendersMultipleReceivers) context_.withExecutor([this](auto& executor) { auto [sender, receiver] = util::Channel::create(executor, 10); util::Mutex> receivedValues; - std::vector receivers(kNUM_RECEIVERS, receiver); + std::vector receivers(kNUM_RECEIVERS, receiver); for (auto receiverId = 0uz; receiverId < kNUM_RECEIVERS; ++receiverId) { auto& receiverRef = receivers[receiverId]; @@ -528,8 +528,8 @@ TEST_P(ChannelCallbackTest, TryMethodsWithClosedChannel) context_.withExecutor([this](auto& executor) { std::atomic_bool testCompleted{false}; auto [sender, receiver] = util::Channel::create(executor, 3); - auto receiverPtr = std::make_shared(std::move(receiver)); - auto senderPtr = std::make_shared>(std::move(sender)); + auto receiverPtr = std::make_shared::Receiver>(std::move(receiver)); + auto senderPtr = std::make_shared::Sender>>(std::move(sender)); boost::asio::post(executor, [receiverPtr, senderPtr, &testCompleted]() { EXPECT_TRUE(senderPtr->value().trySend(100));