Pause for lagging validators.
@@ -919,6 +919,37 @@ RCLConsensus::Adaptor::preStartRound(RCLCxLedger const & prevLgr)
    return validating_ && synced;
}

bool
RCLConsensus::Adaptor::haveValidated() const
{
    return ledgerMaster_.haveValidated();
}

LedgerIndex
RCLConsensus::Adaptor::getValidLedgerIndex() const
{
    return ledgerMaster_.getValidLedgerIndex();
}

std::pair<std::size_t, hash_set<RCLConsensus::Adaptor::NodeKey_t>>
RCLConsensus::Adaptor::getQuorumKeys() const
{
    return app_.validators().getQuorumKeys();
}

std::size_t
RCLConsensus::Adaptor::laggards(Ledger_t::Seq const seq,
    hash_set<RCLConsensus::Adaptor::NodeKey_t>& trustedKeys) const
{
    return app_.getValidations().laggards(seq, trustedKeys);
}

bool
RCLConsensus::Adaptor::validator() const
{
    return !valPublic_.empty();
}

void
RCLConsensus::startRound(
    NetClock::time_point const& now,

@@ -85,6 +85,7 @@ class RCLConsensus
public:
    using Ledger_t = RCLCxLedger;
    using NodeID_t = NodeID;
    using NodeKey_t = PublicKey;
    using TxSet_t = RCLTxSet;
    using PeerPosition_t = RCLCxPeerPos;

@@ -131,6 +132,26 @@ class RCLConsensus
    bool
    preStartRound(RCLCxLedger const & prevLedger);

    bool
    haveValidated() const;

    LedgerIndex
    getValidLedgerIndex() const;

    std::pair<std::size_t, hash_set<NodeKey_t>>
    getQuorumKeys() const;

    std::size_t
    laggards(Ledger_t::Seq const seq,
        hash_set<NodeKey_t>& trustedKeys) const;

    /** Whether I am a validator.
     *
     * @return whether I am a validator.
     */
    bool
    validator() const;

    /** Consensus simulation parameters
     */
    ConsensusParms const&

@@ -248,6 +248,13 @@ public:

    std::size_t getFetchPackCacheSize () const;

    //! Whether we have ever fully validated a ledger.
    bool
    haveValidated()
    {
        return !mValidLedger.empty();
    }

private:
    using ScopedLockType = std::lock_guard <std::recursive_mutex>;
    using ScopedUnlockType = GenericScopedUnlock <std::recursive_mutex>;

@@ -330,9 +337,6 @@ private:

    std::unique_ptr <detail::LedgerCleaner> mLedgerCleaner;

    uint256 mLastValidateHash;
    std::uint32_t mLastValidateSeq {0};

    // Publish thread is running.
    bool mAdvanceThread {false};

@@ -864,9 +864,6 @@ LedgerMaster::checkAccept (
        << "Advancing accepted ledger to " << ledger->info().seq
        << " with >= " << minVal << " validations";

    mLastValidateHash = ledger->info().hash;
    mLastValidateSeq = ledger->info().seq;

    ledger->setValidated();
    ledger->setFull();
    setValidLedger(ledger);

@@ -357,6 +357,19 @@ public:
    Json::Value
    getJson() const;

    using QuorumKeys = std::pair<std::size_t const, hash_set<PublicKey>>;
    /** Get the quorum and all of the trusted keys.
     *
     * @return quorum and keys.
     */
    QuorumKeys
    getQuorumKeys() const
    {
        std::shared_lock<std::shared_timed_mutex> read_lock{mutex_};
        return {quorum_, trustedKeys_};
    }

private:
    /** Check response for trusted valid published list

@@ -29,6 +29,8 @@
#include <ripple/consensus/LedgerTiming.h>
#include <ripple/consensus/DisputedTx.h>
#include <ripple/json/json_writer.h>
#include <boost/logic/tribool.hpp>
#include <sstream>

namespace ripple {

@@ -474,6 +476,31 @@ private:
    void
    phaseEstablish();

    /** Evaluate whether pausing increases likelihood of validation.
     *
     * As a validator that has previously synced to the network, if our most
     * recent locally-validated ledger did not also achieve
     * full validation, then consider pausing for awhile based on
     * the state of other validators.
     *
     * Pausing may be beneficial in this situation if at least one validator
     * is known to be on a sequence number earlier than ours. The minimum
     * number of validators on the same sequence number does not guarantee
     * consensus, and waiting for all validators may be too time-consuming.
     * Therefore, a variable threshold is enacted based on the number
     * of ledgers past network validation that we are on. For the first phase,
     * the threshold is also the minimum required for quorum. For the last,
     * no online validators can have a lower sequence number. For intermediate
     * phases, the threshold is linear between the minimum required for
     * quorum and 100%. For example, with 3 total phases and a quorum of
     * 80%, the 2nd phase would be 90%. Once the final phase is reached,
     * if consensus still fails to occur, the cycle is begun again at phase 1.
     *
     * @return Whether to pause to wait for lagging proposers.
     */
    bool
    shouldPause() const;

    // Close the open ledger and establish initial position.
    void
    closeLedger();

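The linear threshold progression described in the comment above can be written as a single formula. As an editorial sketch (the symbols q for the quorum fraction, p for the current phase, and P for the final phase index are shorthand introduced here, not names from the code):

\[ \mathrm{threshold}(p) = q + (1 - q)\,\frac{p}{P}, \qquad p = 0, 1, \dots, P \]

With the comment's example of three phases (P = 2) and an 80% quorum (q = 0.8), the thresholds are 80%, 90%, and 100%; consensus proceeds without pausing only while the fraction of trusted validators known to be on our sequence or later is at least threshold(p).
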
@@ -1108,6 +1135,124 @@ Consensus<Adaptor>::phaseOpen()
    }
}

template <class Adaptor>
bool
Consensus<Adaptor>::shouldPause() const
{
    auto const& parms = adaptor_.parms();
    std::uint32_t const ahead (previousLedger_.seq() -
        std::min(adaptor_.getValidLedgerIndex(), previousLedger_.seq()));
    auto quorumKeys = adaptor_.getQuorumKeys();
    auto const& quorum = quorumKeys.first;
    auto& trustedKeys = quorumKeys.second;
    std::size_t const totalValidators = trustedKeys.size();
    std::size_t laggards = adaptor_.laggards(previousLedger_.seq(),
        trustedKeys);
    std::size_t const offline = trustedKeys.size();

    std::stringstream vars;
    vars << " (working seq: " << previousLedger_.seq() << ", "
        << "validated seq: " << adaptor_.getValidLedgerIndex() << ", "
        << "am validator: " << adaptor_.validator() << ", "
        << "have validated: " << adaptor_.haveValidated() << ", "
        << "roundTime: " << result_->roundTime.read().count() << ", "
        << "max consensus time: " << parms.ledgerMAX_CONSENSUS.count() << ", "
        << "validators: " << totalValidators << ", "
        << "laggards: " << laggards << ", "
        << "offline: " << offline << ", "
        << "quorum: " << quorum << ")";

    if (!ahead ||
        !laggards ||
        !totalValidators ||
        !adaptor_.validator() ||
        !adaptor_.haveValidated() ||
        result_->roundTime.read() > parms.ledgerMAX_CONSENSUS)
    {
        j_.debug() << "not pausing" << vars.str();
        return false;
    }

    bool willPause = false;

    /** Maximum phase with distinct thresholds to determine how
     * many validators must be on our same ledger sequence number.
     * The threshold for the 1st (0) phase is >= the minimum number that
     * can achieve quorum. Threshold for the maximum phase is 100%
     * of all trusted validators. Progression from min to max phase is
     * simply linear. If there are 5 phases (maxPausePhase = 4)
     * and minimum quorum is 80%, then thresholds progress as follows:
     * 0: >=80%
     * 1: >=85%
     * 2: >=90%
     * 3: >=95%
     * 4: =100%
     */
    constexpr static std::size_t maxPausePhase = 4;

    /**
     * No particular threshold guarantees consensus. Lower thresholds
     * are easier to achieve than higher, but higher thresholds are
     * more likely to reach consensus. Cycle through the phases if
     * lack of synchronization continues.
     *
     * Current information indicates that no phase is likely to be intrinsically
     * better than any other: the lower the threshold, the less likely that
     * up-to-date nodes will be able to reach consensus without the laggards.
     * But the higher the threshold, the longer the likely resulting pause.
     * 100% is slightly less desirable in the long run because the potential
     * of a single dawdling peer to slow down everything else. So if we
     * accept that no phase is any better than any other phase, but that
     * all of them will potentially enable us to arrive at consensus, cycling
     * through them seems to be appropriate. Further, if we do reach the
     * point of having to cycle back around, then it's likely that something
     * else out of the scope of this delay mechanism is wrong with the
     * network.
     */
    std::size_t const phase = (ahead - 1) % (maxPausePhase + 1);

    // validators that remain after the laggards() function are considered
    // offline, and should be considered as laggards for purposes of
    // evaluating whether the threshold for non-laggards has been reached.
    switch (phase)
    {
        case 0:
            // Laggards and offline shouldn't preclude consensus.
            if (laggards + offline > totalValidators - quorum)
                willPause = true;
            break;
        case maxPausePhase:
            // No tolerance.
            willPause = true;
            break;
        default:
            // Ensure that sufficient validators are known to be not lagging.
            // Their sufficiently most recent validation sequence was equal to
            // or greater than our own.
            //
            // The threshold is the amount required for quorum plus
            // the proportion of the remainder based on number of intermediate
            // phases between 0 and max.
            float const nonLaggards = totalValidators - (laggards + offline);
            float const quorumRatio =
                static_cast<float>(quorum) / totalValidators;
            float const allowedDissent = 1.0f - quorumRatio;
            float const phaseFactor = static_cast<float>(phase) / maxPausePhase;

            if (nonLaggards / totalValidators <
                quorumRatio + (allowedDissent * phaseFactor))
            {
                willPause = true;
            }
    }

    if (willPause)
        j_.warn() << "pausing" << vars.str();
    else
        j_.debug() << "not pausing" << vars.str();
    return willPause;
}

template <class Adaptor>
void
Consensus<Adaptor>::phaseEstablish()

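To make the intermediate-phase arithmetic above concrete, here is a minimal standalone C++ sketch (editorial, not part of this commit; the pauseThreshold name and the hard-coded example values are assumptions) that reproduces the 80/85/90/95/100% progression for maxPausePhase = 4 and an 80-of-100 quorum:

#include <cstddef>
#include <cstdio>

// Fraction of trusted validators that must be caught up (neither lagging nor
// offline) for consensus to proceed without pausing at the given phase.
float
pauseThreshold(std::size_t quorum, std::size_t totalValidators,
    std::size_t phase, std::size_t maxPausePhase)
{
    float const quorumRatio = static_cast<float>(quorum) / totalValidators;
    float const allowedDissent = 1.0f - quorumRatio;
    float const phaseFactor = static_cast<float>(phase) / maxPausePhase;
    return quorumRatio + allowedDissent * phaseFactor;
}

int main()
{
    // 5 phases (maxPausePhase = 4) with an 80-of-100 quorum, as in the
    // comment above: prints 0.80, 0.85, 0.90, 0.95, 1.00.
    for (std::size_t phase = 0; phase <= 4; ++phase)
        std::printf("phase %zu: %.2f\n",
            phase, pauseThreshold(80, 100, phase, 4));
    return 0;
}

Note that the switch above only mirrors this formula in the default case: phase 0 uses an equivalent count-based check, and the final phase pauses unconditionally, which is safe because the early return already guarantees laggards is nonzero by that point.
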
@@ -1130,8 +1275,8 @@ Consensus<Adaptor>::phaseEstablish()

    updateOurPositions();

    // Nothing to do if we don't have consensus.
    if (!haveConsensus())
    // Nothing to do if too many laggards or we don't have consensus.
    if (shouldPause() || !haveConsensus())
        return;

    if (!haveCloseTimeConsensus_)

@@ -82,6 +82,15 @@ struct ConsensusParms
    std::chrono::milliseconds ledgerMIN_CONSENSUS =
        std::chrono::milliseconds {1950};

    /** The maximum amount of time to spend pausing for laggards.
     *
     * This should be sufficiently less than validationFRESHNESS so that
     * validators don't appear to be offline that are merely waiting for
     * laggards.
     */
    std::chrono::milliseconds ledgerMAX_CONSENSUS =
        std::chrono::seconds {10};

    //! Minimum number of seconds to wait to ensure others have computed the LCL
    std::chrono::milliseconds ledgerMIN_CLOSE = std::chrono::seconds {2};

@@ -26,6 +26,7 @@
#include <ripple/beast/container/aged_container_utility.h>
#include <ripple/beast/container/aged_unordered_map.h>
#include <ripple/consensus/LedgerTrie.h>
#include <ripple/protocol/PublicKey.h>
#include <boost/optional.hpp>
#include <mutex>
#include <utility>

@@ -73,6 +74,16 @@ struct ValidationParms
        for a reasonable interval.
    */
    std::chrono::seconds validationSET_EXPIRES = std::chrono::minutes{10};

    /** How long we consider a validation fresh.
     *
     * The number of seconds since a validation has been seen for it to
     * be considered to accurately represent a live proposer's most recent
     * validation. This value should be sufficiently higher than
     * ledgerMAX_CONSENSUS such that validators who are waiting for
     * laggards are not considered offline.
     */
    std::chrono::seconds validationFRESHNESS = std::chrono::seconds {20};
};

/** Enforce validation increasing sequence requirement.

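The ledgerMAX_CONSENSUS comment earlier in this diff and the validationFRESHNESS comment above describe a coupling between the two defaults: a pause must end well before a waiting validator's last validation goes stale. A minimal editorial sketch of that constraint, using only the two default values shown in this diff (the check itself is illustrative, not something the commit adds):

#include <cassert>
#include <chrono>

int main()
{
    using namespace std::chrono;
    // Defaults from ConsensusParms and ValidationParms in this diff.
    milliseconds const ledgerMAX_CONSENSUS = seconds{10};
    seconds const validationFRESHNESS = seconds{20};

    // Per the comments above, the pause ceiling must stay below the
    // freshness window so a validator that is merely waiting for laggards
    // is not counted as offline by laggards().
    assert(ledgerMAX_CONSENSUS < validationFRESHNESS);
    return 0;
}
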
@@ -278,6 +289,7 @@ class Validations
    using ID = typename Ledger::ID;
    using Seq = typename Ledger::Seq;
    using NodeID = typename Validation::NodeID;
    using NodeKey = typename Validation::NodeKey;

    using WrappedValidationType = std::decay_t<
        std::result_of_t<decltype (&Validation::unwrap)(Validation)>>;

@@ -966,6 +978,42 @@ public:

        adaptor_.flush(std::move(flushed));
    }

    /** Return quantity of lagging proposers, and remove online proposers
     * for purposes of evaluating whether to pause.
     *
     * Laggards are the trusted proposers whose sequence number is lower
     * than the sequence number from which our current pending proposal
     * is based. Proposers from whom we have not received a validation for
     * awhile are considered offline.
     *
     * Note: the trusted flag is not used in this evaluation because it's made
     * redundant by checking the list of proposers.
     *
     * @param seq Our current sequence number.
     * @param trustedKeys Public keys of trusted proposers.
     * @return Quantity of laggards.
     */
    std::size_t
    laggards(Seq const seq, hash_set<NodeKey>& trustedKeys)
    {
        std::size_t laggards = 0;

        current(ScopedLock{mutex_},
            [](std::size_t) {},
            [&](NodeID const&, Validation const& v) {
                if (adaptor_.now() <
                    v.seenTime() + parms_.validationFRESHNESS &&
                    trustedKeys.find(v.key()) != trustedKeys.end())
                {
                    trustedKeys.erase(v.key());
                    if (seq > v.seq())
                        ++laggards;
                }
            });

        return laggards;
    }
};

} // namespace ripple

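As a standalone illustration of the counting rule in laggards() above, the following editorial sketch (the simplified Val struct and the countLaggards name are assumptions, not rippled types) shows how fresh validations are removed from trustedKeys and why the keys that remain are the ones shouldPause() later counts as offline:

#include <cstddef>
#include <unordered_set>
#include <vector>

// Hypothetical flattened view of a validation: who sent it, for which ledger
// sequence, and whether it was seen within validationFRESHNESS.
struct Val
{
    int key;
    unsigned seq;
    bool fresh;
};

std::size_t
countLaggards(unsigned mySeq, std::unordered_set<int>& trustedKeys,
    std::vector<Val> const& current)
{
    std::size_t laggards = 0;
    for (auto const& v : current)
    {
        // A fresh validation from a still-listed trusted key proves the
        // proposer is online, so remove it from the set; if it validated an
        // older sequence than the ledger we are working from, it is a laggard.
        if (v.fresh && trustedKeys.erase(v.key) && mySeq > v.seq)
            ++laggards;
    }
    // Whatever keys are still in trustedKeys were not seen recently.
    return laggards;
}

Passing trustedKeys by reference is what lets shouldPause() read trustedKeys.size() afterwards as the offline count.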