mirror of
https://github.com/XRPLF/clio.git
synced 2026-06-06 02:06:58 +00:00
feat: Recover from fallback writer state (#3000)
Add a timer and new `FallbackRecovery` state to try to recover from `Fallback` writer state every hour. Fixes #2997
This commit is contained in:
@@ -21,24 +21,48 @@
|
||||
|
||||
#include "cluster/Backend.hpp"
|
||||
#include "cluster/ClioNode.hpp"
|
||||
#include "cluster/impl/FallbackRecoveryTimer.hpp"
|
||||
#include "etl/WriterState.hpp"
|
||||
#include "util/Assert.hpp"
|
||||
#include "util/Spawn.hpp"
|
||||
|
||||
#include <boost/asio/error.hpp>
|
||||
#include <boost/asio/thread_pool.hpp>
|
||||
|
||||
#include <algorithm>
|
||||
#include <chrono>
|
||||
#include <memory>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
namespace cluster {
|
||||
|
||||
WriterDecider::WriterDecider(
|
||||
boost::asio::thread_pool& ctx,
|
||||
namespace {
|
||||
|
||||
void
|
||||
startFallbackRecoveryTimer(
|
||||
impl::FallbackRecoveryTimer& fallbackRecoveryTimer,
|
||||
std::unique_ptr<etl::WriterStateInterface> writerState
|
||||
)
|
||||
: ctx_(ctx), writerState_(std::move(writerState))
|
||||
{
|
||||
fallbackRecoveryTimer.start( //
|
||||
[ws = std::move(writerState)](boost::system::error_code ec) mutable {
|
||||
if (ec == boost::asio::error::operation_aborted)
|
||||
return;
|
||||
ASSERT(!ec, "Unexpected error {}: {}", ec.value(), ec.to_string());
|
||||
ws->setFallbackRecovery(true);
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
WriterDecider::WriterDecider(
|
||||
boost::asio::thread_pool& ctx,
|
||||
std::unique_ptr<etl::WriterStateInterface> writerState,
|
||||
std::chrono::steady_clock::duration recoveryTime
|
||||
)
|
||||
: ctx_(ctx), writerState_(std::move(writerState)), fallbackRecoveryTimer_(ctx, recoveryTime)
|
||||
{
|
||||
}
|
||||
|
||||
@@ -55,27 +79,53 @@ WriterDecider::onNewState(
|
||||
ctx_,
|
||||
[writerState = writerState_->clone(),
|
||||
selfId = std::move(selfId),
|
||||
fallbackRecoveryTimer = fallbackRecoveryTimer_,
|
||||
clusterData = clusterData->value()](auto&&) mutable {
|
||||
auto const selfData = std::ranges::find_if(
|
||||
clusterData, [&selfId](ClioNode const& node) { return node.uuid == selfId; }
|
||||
);
|
||||
ASSERT(selfData != clusterData.end(), "Self data should always be in the cluster data");
|
||||
|
||||
if (selfData->dbRole == ClioNode::DbRole::Fallback) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (selfData->dbRole == ClioNode::DbRole::ReadOnly) {
|
||||
writerState->giveUpWriting();
|
||||
return;
|
||||
}
|
||||
|
||||
if (selfData->dbRole == ClioNode::DbRole::Fallback) {
|
||||
auto const clusterInFallbackRecoveryState =
|
||||
std::ranges::any_of(clusterData, [](ClioNode const& node) {
|
||||
return node.dbRole == ClioNode::DbRole::FallbackRecovery;
|
||||
});
|
||||
if (clusterInFallbackRecoveryState) {
|
||||
writerState->setFallbackRecovery(true);
|
||||
fallbackRecoveryTimer.cancel();
|
||||
} else if (not fallbackRecoveryTimer.isRunning()) {
|
||||
startFallbackRecoveryTimer(fallbackRecoveryTimer, std::move(writerState));
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
if (selfData->dbRole == ClioNode::DbRole::FallbackRecovery) {
|
||||
auto const clusterIsReadyToRecover =
|
||||
not std::ranges::any_of(clusterData, [](ClioNode const& node) {
|
||||
return node.dbRole == ClioNode::DbRole::Fallback;
|
||||
});
|
||||
if (clusterIsReadyToRecover) {
|
||||
writerState->giveUpWriting();
|
||||
writerState->setFallbackRecovery(false);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
// If any node in the cluster is in Fallback mode, the entire cluster must switch
|
||||
// to the fallback writer decision mechanism for consistency
|
||||
if (std::ranges::any_of(clusterData, [](ClioNode const& node) {
|
||||
auto const clusterInFallbackState =
|
||||
std::ranges::any_of(clusterData, [](ClioNode const& node) {
|
||||
return node.dbRole == ClioNode::DbRole::Fallback;
|
||||
})) {
|
||||
});
|
||||
if (clusterInFallbackState) {
|
||||
writerState->setWriterDecidingFallback();
|
||||
startFallbackRecoveryTimer(fallbackRecoveryTimer, std::move(writerState));
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user