diff --git a/src/ripple/app/consensus/RCLConsensus.cpp b/src/ripple/app/consensus/RCLConsensus.cpp
index 6a4ec04c53..a1930bd694 100644
--- a/src/ripple/app/consensus/RCLConsensus.cpp
+++ b/src/ripple/app/consensus/RCLConsensus.cpp
@@ -919,6 +919,37 @@ RCLConsensus::Adaptor::preStartRound(RCLCxLedger const & prevLgr)
     return validating_ && synced;
 }
 
+bool
+RCLConsensus::Adaptor::haveValidated() const
+{
+    return ledgerMaster_.haveValidated();
+}
+
+LedgerIndex
+RCLConsensus::Adaptor::getValidLedgerIndex() const
+{
+    return ledgerMaster_.getValidLedgerIndex();
+}
+
+std::pair<std::size_t, hash_set<PublicKey>>
+RCLConsensus::Adaptor::getQuorumKeys() const
+{
+    return app_.validators().getQuorumKeys();
+}
+
+std::size_t
+RCLConsensus::Adaptor::laggards(Ledger_t::Seq const seq,
+    hash_set<PublicKey>& trustedKeys) const
+{
+    return app_.getValidations().laggards(seq, trustedKeys);
+}
+
+bool
+RCLConsensus::Adaptor::validator() const
+{
+    return !valPublic_.empty();
+}
+
 void
 RCLConsensus::startRound(
     NetClock::time_point const& now,
diff --git a/src/ripple/app/consensus/RCLConsensus.h b/src/ripple/app/consensus/RCLConsensus.h
index 3b1207fe0d..48894f02b0 100644
--- a/src/ripple/app/consensus/RCLConsensus.h
+++ b/src/ripple/app/consensus/RCLConsensus.h
@@ -85,6 +85,7 @@ class RCLConsensus
 public:
     using Ledger_t = RCLCxLedger;
     using NodeID_t = NodeID;
+    using NodeKey_t = PublicKey;
    using TxSet_t = RCLTxSet;
    using PeerPosition_t = RCLCxPeerPos;
 
@@ -131,6 +132,26 @@ class RCLConsensus
     bool
     preStartRound(RCLCxLedger const & prevLedger);
 
+    bool
+    haveValidated() const;
+
+    LedgerIndex
+    getValidLedgerIndex() const;
+
+    std::pair<std::size_t, hash_set<NodeKey_t>>
+    getQuorumKeys() const;
+
+    std::size_t
+    laggards(Ledger_t::Seq const seq,
+        hash_set<NodeKey_t>& trustedKeys) const;
+
+    /** Whether the node is running as a validator.
+     *
+     * @return Whether a validator key is configured.
+     */
+    bool
+    validator() const;
+
     /** Consensus simulation parameters
      */
     ConsensusParms const&
diff --git a/src/ripple/app/ledger/LedgerMaster.h b/src/ripple/app/ledger/LedgerMaster.h
index 4f90c082e4..b090de4a00 100644
--- a/src/ripple/app/ledger/LedgerMaster.h
+++ b/src/ripple/app/ledger/LedgerMaster.h
@@ -248,6 +248,13 @@ public:
 
     std::size_t getFetchPackCacheSize () const;
 
+    //! Whether we have ever fully validated a ledger.
+    bool
+    haveValidated()
+    {
+        return !mValidLedger.empty();
+    }
+
 private:
     using ScopedLockType = std::lock_guard <std::recursive_mutex>;
     using ScopedUnlockType = GenericScopedUnlock <std::recursive_mutex>;
@@ -330,9 +337,6 @@ private:
 
     std::unique_ptr<LedgerCleaner> mLedgerCleaner;
 
-    uint256 mLastValidateHash;
-    std::uint32_t mLastValidateSeq {0};
-
     // Publish thread is running.
     bool mAdvanceThread {false};
 
diff --git a/src/ripple/app/ledger/impl/LedgerMaster.cpp b/src/ripple/app/ledger/impl/LedgerMaster.cpp
index 5be97b47db..ef7867a1bc 100644
--- a/src/ripple/app/ledger/impl/LedgerMaster.cpp
+++ b/src/ripple/app/ledger/impl/LedgerMaster.cpp
@@ -864,9 +864,6 @@ LedgerMaster::checkAccept (
             << "Advancing accepted ledger to " << ledger->info().seq
             << " with >= " << minVal << " validations";
 
-        mLastValidateHash = ledger->info().hash;
-        mLastValidateSeq = ledger->info().seq;
-
         ledger->setValidated();
         ledger->setFull();
         setValidLedger(ledger);
diff --git a/src/ripple/app/misc/ValidatorList.h b/src/ripple/app/misc/ValidatorList.h
index 61585a659b..b837266c9f 100644
--- a/src/ripple/app/misc/ValidatorList.h
+++ b/src/ripple/app/misc/ValidatorList.h
@@ -357,6 +357,19 @@ public:
     Json::Value
     getJson() const;
 
+    using QuorumKeys = std::pair<std::size_t, hash_set<PublicKey>>;
+    /** Get the quorum and all of the trusted keys.
+     *
+     * @return quorum and keys.
+     */
+    QuorumKeys
+    getQuorumKeys() const
+    {
+        std::shared_lock read_lock{mutex_};
+        return {quorum_, trustedKeys_};
+    }
+
+
 private:
 
     /** Check response for trusted valid published list
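A detail worth noting in ValidatorList::getQuorumKeys above is that it returns copies taken under a shared lock, so shouldPause() can consume the key set (and laggards() can erase from it) without holding the ValidatorList mutex during consensus evaluation. Below is a minimal standalone sketch of that pattern; the class and type names are invented stand-ins (std::string for PublicKey, std::unordered_set for hash_set), not rippled code:

    #include <cstddef>
    #include <shared_mutex>
    #include <string>
    #include <unordered_set>
    #include <utility>

    // Invented stand-ins: std::string for ripple::PublicKey,
    // std::unordered_set for ripple::hash_set.
    class TrustedKeys
    {
        mutable std::shared_mutex mutex_;
        std::size_t quorum_ = 0;
        std::unordered_set<std::string> trustedKeys_;

    public:
        // Copy the quorum and keys under a shared (reader) lock: the caller
        // gets a consistent snapshot it may freely mutate afterwards, e.g.
        // erase entries from it, without holding the validator-list lock.
        std::pair<std::size_t, std::unordered_set<std::string>>
        getQuorumKeys() const
        {
            std::shared_lock read_lock{mutex_};
            return {quorum_, trustedKeys_};
        }
    };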
diff --git a/src/ripple/consensus/Consensus.h b/src/ripple/consensus/Consensus.h
index 29873959d1..7c69afcf70 100644
--- a/src/ripple/consensus/Consensus.h
+++ b/src/ripple/consensus/Consensus.h
@@ -29,6 +29,8 @@
 #include <ripple/consensus/DisputedTx.h>
 #include <ripple/consensus/LedgerTiming.h>
 #include <ripple/json/json_writer.h>
+#include <algorithm>
+#include <sstream>
 
 namespace ripple {
 
@@ -474,6 +476,31 @@ private:
     void
     phaseEstablish();
 
+    /** Evaluate whether pausing increases the likelihood of validation.
+     *
+     * As a validator that has previously synced to the network, if our
+     * most recent locally-validated ledger did not also achieve full
+     * validation, then consider pausing for a while based on the state
+     * of other validators.
+     *
+     * Pausing may be beneficial in this situation if at least one
+     * validator is known to be on a sequence number earlier than ours.
+     * No particular number of validators on our sequence number
+     * guarantees consensus, and waiting for all validators may be too
+     * time-consuming. Therefore, a variable threshold is enacted based
+     * on the number of ledgers past network validation that we are on.
+     * For the first phase, the threshold is the minimum required for
+     * quorum. For the last phase, no online validators may be on a lower
+     * sequence number. For intermediate phases, the threshold is linear
+     * between the minimum required for quorum and 100%. For example,
+     * with 3 total phases and a quorum of 80%, the 2nd phase would be
+     * 90%. Once the final phase is reached, if consensus still fails to
+     * occur, the cycle begins again at the first phase.
+     *
+     * @return Whether to pause to wait for lagging proposers.
+     */
+    bool
+    shouldPause() const;
+
     // Close the open ledger and establish initial position.
     void
     closeLedger();
@@ -1108,6 +1135,124 @@ Consensus<Adaptor>::phaseOpen()
     }
 }
 
+template <class Adaptor>
+bool
+Consensus<Adaptor>::shouldPause() const
+{
+    auto const& parms = adaptor_.parms();
+    std::uint32_t const ahead(previousLedger_.seq() -
+        std::min(adaptor_.getValidLedgerIndex(), previousLedger_.seq()));
+    auto quorumKeys = adaptor_.getQuorumKeys();
+    auto const& quorum = quorumKeys.first;
+    auto& trustedKeys = quorumKeys.second;
+    std::size_t const totalValidators = trustedKeys.size();
+    std::size_t laggards = adaptor_.laggards(previousLedger_.seq(),
+        trustedKeys);
+    std::size_t const offline = trustedKeys.size();
+
+    std::stringstream vars;
+    vars << " (working seq: " << previousLedger_.seq() << ", "
+         << "validated seq: " << adaptor_.getValidLedgerIndex() << ", "
+         << "am validator: " << adaptor_.validator() << ", "
+         << "have validated: " << adaptor_.haveValidated() << ", "
+         << "roundTime: " << result_->roundTime.read().count() << ", "
+         << "max consensus time: " << parms.ledgerMAX_CONSENSUS.count()
+         << ", "
+         << "validators: " << totalValidators << ", "
+         << "laggards: " << laggards << ", "
+         << "offline: " << offline << ", "
+         << "quorum: " << quorum << ")";
+
+    if (!ahead ||
+        !laggards ||
+        !totalValidators ||
+        !adaptor_.validator() ||
+        !adaptor_.haveValidated() ||
+        result_->roundTime.read() > parms.ledgerMAX_CONSENSUS)
+    {
+        j_.debug() << "not pausing" << vars.str();
+        return false;
+    }
+
+    bool willPause = false;
+
+    /** Maximum phase with distinct thresholds to determine how many
+     * validators must be on our same ledger sequence number. The
+     * threshold for the 1st (0) phase is >= the minimum number that can
+     * achieve quorum. The threshold for the maximum phase is 100% of all
+     * trusted validators. Progression from the min to the max phase is
+     * simply linear. If there are 5 phases (maxPausePhase = 4) and the
+     * minimum quorum is 80%, then the thresholds progress as follows:
+     * 0: >=80%
+     * 1: >=85%
+     * 2: >=90%
+     * 3: >=95%
+     * 4: =100%
+     */
+    constexpr static std::size_t maxPausePhase = 4;
+
+    /**
+     * No particular threshold guarantees consensus. Lower thresholds are
+     * easier to achieve than higher ones, but higher thresholds are more
+     * likely to reach consensus. Cycle through the phases if the lack of
+     * synchronization continues.
+     *
+     * Current information indicates that no phase is likely to be
+     * intrinsically better than any other: the lower the threshold, the
+     * less likely that up-to-date nodes will be able to reach consensus
+     * without the laggards. But the higher the threshold, the longer the
+     * likely resulting pause. 100% is slightly less desirable in the
+     * long run because of the potential for a single dawdling peer to
+     * slow everything else down. So if we accept that no phase is any
+     * better than any other, but that each of them will potentially
+     * enable us to arrive at consensus, cycling through them seems
+     * appropriate. Further, if we do reach the point of having to cycle
+     * back around, then it's likely that something else, out of the
+     * scope of this delay mechanism, is wrong with the network.
+     */
+    std::size_t const phase = (ahead - 1) % (maxPausePhase + 1);
+
+    // Validators that remain in the set after the laggards() call are
+    // considered offline, and should be treated as laggards for purposes
+    // of evaluating whether the threshold for non-laggards has been
+    // reached.
+    switch (phase)
+    {
+        case 0:
+            // Laggards and offline shouldn't preclude consensus.
+            if (laggards + offline > totalValidators - quorum)
+                willPause = true;
+            break;
+        case maxPausePhase:
+            // No tolerance.
+            willPause = true;
+            break;
+        default:
+            // Ensure that sufficient validators are known not to be
+            // lagging: their most recent fresh validation was for a
+            // sequence equal to or greater than our own.
+            //
+            // The threshold is the amount required for quorum plus a
+            // proportion of the remainder based on the number of
+            // intermediate phases between 0 and max.
+            float const nonLaggards = totalValidators - (laggards + offline);
+            float const quorumRatio =
+                static_cast<float>(quorum) / totalValidators;
+            float const allowedDissent = 1.0f - quorumRatio;
+            float const phaseFactor =
+                static_cast<float>(phase) / maxPausePhase;
+
+            if (nonLaggards / totalValidators <
+                quorumRatio + (allowedDissent * phaseFactor))
+            {
+                willPause = true;
+            }
+    }
+
+    if (willPause)
+        j_.warn() << "pausing" << vars.str();
+    else
+        j_.debug() << "not pausing" << vars.str();
+
+    return willPause;
+}
+
 template <class Adaptor>
 void
 Consensus<Adaptor>::phaseEstablish()
@@ -1130,8 +1275,8 @@ Consensus<Adaptor>::phaseEstablish()
 
     updateOurPositions();
 
-    // Nothing to do if we don't have consensus.
-    if (!haveConsensus())
+    // Nothing to do if too many laggards or we don't have consensus.
+    if (shouldPause() || !haveConsensus())
        return;
 
     if (!haveCloseTimeConsensus_)
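To make the phase arithmetic in shouldPause concrete, here is a small self-contained sketch that tabulates the non-laggard threshold per phase. The validator counts (32 total, quorum of 26) are assumed example values, not taken from the patch, and note that the real code special-cases phase 0 (a count comparison) and the final phase (an unconditional pause); only the intermediate phases use the linear formula:

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>

    int main()
    {
        constexpr std::size_t maxPausePhase = 4;     // as in the patch
        constexpr std::size_t totalValidators = 32;  // assumed example value
        constexpr std::size_t quorum = 26;           // assumed example (~80%)

        float const quorumRatio = static_cast<float>(quorum) / totalValidators;
        float const allowedDissent = 1.0f - quorumRatio;

        // 'ahead' is how many ledgers our working sequence is past the last
        // fully validated one; it selects the phase and wraps around after
        // the final phase, restarting the cycle.
        for (std::uint32_t ahead = 1; ahead <= 10; ++ahead)
        {
            std::size_t const phase = (ahead - 1) % (maxPausePhase + 1);
            float const threshold = quorumRatio +
                allowedDissent * (static_cast<float>(phase) / maxPausePhase);
            std::printf("ahead=%2u phase=%zu threshold=%5.1f%%\n",
                static_cast<unsigned>(ahead), phase, threshold * 100.0);
        }
        return 0;
    }

With these numbers the thresholds print as roughly 81.2%, 85.9%, 90.6%, 95.3%, and 100%, then wrap back to 81.2% at ahead=6, matching the cycling behavior the comments describe.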
diff --git a/src/ripple/consensus/ConsensusParms.h b/src/ripple/consensus/ConsensusParms.h
index d273788116..f456426e82 100644
--- a/src/ripple/consensus/ConsensusParms.h
+++ b/src/ripple/consensus/ConsensusParms.h
@@ -82,6 +82,15 @@ struct ConsensusParms
     std::chrono::milliseconds ledgerMIN_CONSENSUS =
         std::chrono::milliseconds {1950};
 
+    /** The maximum amount of time to spend pausing for laggards.
+     *
+     * This should be sufficiently less than validationFRESHNESS so that
+     * validators that are merely waiting for laggards do not appear to
+     * be offline.
+     */
+    std::chrono::milliseconds ledgerMAX_CONSENSUS =
+        std::chrono::seconds {10};
+
     //! Minimum number of seconds to wait to ensure others have computed the LCL
     std::chrono::milliseconds ledgerMIN_CLOSE = std::chrono::seconds {2};
 
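The constraint between ledgerMAX_CONSENSUS and the validationFRESHNESS parameter introduced in Validations.h below can be checked mechanically. A minimal sketch, restating the two default values locally rather than referencing rippled headers:

    #include <chrono>

    // Defaults restated from the patch: a round may pause for up to 10
    // seconds, and a validation counts as fresh for 20 seconds after it
    // is seen.
    constexpr std::chrono::milliseconds ledgerMAX_CONSENSUS{
        std::chrono::seconds{10}};
    constexpr std::chrono::seconds validationFRESHNESS{20};

    // A validator waiting for laggards emits no new validations while
    // paused; keeping the pause cap well under the freshness window means
    // its last validation still looks live, so peers do not count it as
    // offline.
    static_assert(ledgerMAX_CONSENSUS < validationFRESHNESS,
        "the pause cap must stay below the validation freshness window");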
diff --git a/src/ripple/consensus/Validations.h b/src/ripple/consensus/Validations.h
index 0a499f19ea..719e353ee7 100644
--- a/src/ripple/consensus/Validations.h
+++ b/src/ripple/consensus/Validations.h
@@ -26,6 +26,7 @@
 #include <ripple/basics/Log.h>
 #include <ripple/basics/chrono.h>
 #include <ripple/beast/container/aged_container_utility.h>
+#include <ripple/basics/UnorderedContainers.h>
 #include <ripple/beast/container/aged_unordered_map.h>
 #include <ripple/consensus/LedgerTrie.h>
 #include <boost/optional.hpp>
@@ -73,6 +74,16 @@ struct ValidationParms
        for a reasonable interval.
     */
     std::chrono::seconds validationSET_EXPIRES = std::chrono::minutes{10};
+
+    /** How long we consider a validation fresh.
+     *
+     * The number of seconds since a validation has been seen for it to
+     * be considered to accurately represent a live proposer's most
+     * recent validation. This value should be sufficiently higher than
+     * ledgerMAX_CONSENSUS so that validators who are waiting for
+     * laggards are not considered offline.
+     */
+    std::chrono::seconds validationFRESHNESS = std::chrono::seconds {20};
 };
 
 /** Enforce validation increasing sequence requirement.
@@ -278,6 +289,7 @@ class Validations
     using ID = typename Ledger::ID;
     using Seq = typename Ledger::Seq;
     using NodeID = typename Validation::NodeID;
+    using NodeKey = typename Validation::NodeKey;
     using WrappedValidationType = std::decay_t<
         std::result_of_t<decltype (&Validation::unwrap)(Validation)>>;
 
@@ -966,6 +978,42 @@ public:
         adaptor_.flush(std::move(flushed));
     }
+
+    /** Return the quantity of lagging proposers, and remove online
+     * proposers for purposes of evaluating whether to pause.
+     *
+     * Laggards are the trusted proposers whose sequence number is lower
+     * than the sequence number on which our current pending proposal is
+     * based. Proposers from whom we have not received a validation for a
+     * while are considered offline.
+     *
+     * Note: the trusted flag is not used in this evaluation because it
+     * is made redundant by checking the list of proposers.
+     *
+     * @param seq Our current sequence number.
+     * @param trustedKeys Public keys of trusted proposers.
+     * @return Quantity of laggards.
+     */
+    std::size_t
+    laggards(Seq const seq, hash_set<NodeKey>& trustedKeys)
+    {
+        std::size_t laggards = 0;
+
+        current(
+            ScopedLock{mutex_},
+            [](std::size_t) {},
+            [&](NodeID const&, Validation const& v) {
+                if (adaptor_.now() <
+                        v.seenTime() + parms_.validationFRESHNESS &&
+                    trustedKeys.find(v.key()) != trustedKeys.end())
+                {
+                    trustedKeys.erase(v.key());
+                    if (seq > v.seq())
+                        ++laggards;
+                }
+            });
+
+        return laggards;
+    }
 };
 
 }  // namespace ripple
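A toy model of the bookkeeping laggards() performs: every trusted key starts in the set, keys seen with a fresh validation are removed (counting those whose sequence trails ours), and the leftovers are what shouldPause treats as offline. The sketch below uses invented names (Seen, countLaggards) and plain std containers in place of the Validations internals:

    #include <cstddef>
    #include <cstdint>
    #include <unordered_map>
    #include <unordered_set>

    struct Seen
    {
        std::uint32_t seq;  // sequence of this proposer's latest validation
        bool fresh;         // seen within validationFRESHNESS?
    };

    // Returns the laggard count; 'trusted' shrinks in place, so afterwards
    // its size is the number of proposers to treat as offline.
    std::size_t
    countLaggards(std::uint32_t ourSeq,
        std::unordered_set<std::uint64_t>& trusted,
        std::unordered_map<std::uint64_t, Seen> const& current)
    {
        std::size_t laggards = 0;
        for (auto const& [key, v] : current)
        {
            // Only a fresh validation from a still-trusted key counts as
            // online; remove it from the set and check whether it trails us.
            if (v.fresh && trusted.erase(key) == 1)
            {
                if (ourSeq > v.seq)
                    ++laggards;
            }
        }
        return laggards;
    }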
diff --git a/src/test/consensus/Consensus_test.cpp b/src/test/consensus/Consensus_test.cpp
index 657e5d5403..6075a320d9 100644
--- a/src/test/consensus/Consensus_test.cpp
+++ b/src/test/consensus/Consensus_test.cpp
@@ -996,6 +996,125 @@ public:
         }
     }
 
+    // Helper collector for testPauseForLaggards.
+    // This will remove the ledgerAccept delay used to initially create
+    // the slow vs. fast validator groups.
+    struct UndoDelay
+    {
+        csf::PeerGroup& g;
+
+        UndoDelay(csf::PeerGroup& a) : g(a)
+        {
+        }
+
+        template <class E>
+        void
+        on(csf::PeerID, csf::SimTime, E const&)
+        {
+        }
+
+        void
+        on(csf::PeerID who, csf::SimTime, csf::AcceptLedger const& e)
+        {
+            for (csf::Peer* p : g)
+            {
+                if (p->id == who)
+                    p->delays.ledgerAccept = std::chrono::seconds{0};
+            }
+        }
+    };
+
+    void
+    testPauseForLaggards()
+    {
+        using namespace csf;
+        using namespace std::chrono;
+
+        // Test that validators that jump ahead of the network slow down.
+
+        // We engineer the following validated ledger history scenario:
+        //
+        //     / --> B1 --> C1 --> ... --> G1   "ahead"
+        //    A
+        //     \ --> B2 --> C2                  "behind"
+        //
+        // After validating a common ledger A, a set of "behind" validators
+        // briefly runs slower and validates the lower chain of ledgers.
+        // The "ahead" validators run at normal speed and run ahead,
+        // validating the upper chain of ledgers.
+        //
+        // Due to the uncommitted support definition of the preferred
+        // branch protocol, even if the "behind" validators are a majority,
+        // the "ahead" validators cannot jump to the proper branch until
+        // the "behind" validators catch up to the same sequence number.
+        // For this test to succeed, the "ahead" validators need to briefly
+        // slow down consensus.
+
+        ConsensusParms const parms{};
+        Sim sim;
+        SimDuration delay =
+            date::round<milliseconds>(0.2 * parms.ledgerGRANULARITY);
+
+        PeerGroup behind = sim.createGroup(3);
+        PeerGroup ahead = sim.createGroup(2);
+        PeerGroup network = ahead + behind;
+
+        hash_set<PeerKey> trustedKeys;
+        for (Peer* p : network)
+            trustedKeys.insert(p->key);
+        for (Peer* p : network)
+            p->trustedKeys = trustedKeys;
+
+        network.trustAndConnect(network, delay);
+
+        // Initial seed round to set prior state
+        sim.run(1);
+
+        // Have the "behind" group initially take a really long time to
+        // accept a ledger after ending deliberation
+        for (Peer* p : behind)
+            p->delays.ledgerAccept = 20s;
+
+        // Use the collector to revert the delay after the single slow
+        // ledger is generated
+        UndoDelay undoDelay{behind};
+        sim.collectors.add(undoDelay);
+
+#if 0
+        // Have all beast::journal output printed to stdout
+        for (Peer* p : network)
+            p->sink.threshold(beast::severities::kAll);
+
+        // Print ledger accept and fully validated events to stdout
+        StreamCollector sc{std::cout};
+        sim.collectors.add(sc);
+#endif
+        // Run the simulation for 100 seconds of simulation time
+        std::chrono::nanoseconds const simDuration = 100s;
+
+        // Simulate clients submitting 1 tx every 5 seconds to a random
+        // validator
+        Rate const rate{1, 5s};
+        auto peerSelector = makeSelector(
+            network.begin(),
+            network.end(),
+            std::vector<double>(network.size(), 1.),
+            sim.rng);
+        auto txSubmitter = makeSubmitter(
+            ConstantDistribution{rate.inv()},
+            sim.scheduler.now(),
+            sim.scheduler.now() + simDuration,
+            peerSelector,
+            sim.scheduler,
+            sim.rng);
+
+        // Run simulation
+        sim.run(simDuration);
+
+        // Verify that the network recovered
+        BEAST_EXPECT(sim.synchronized());
+    }
+
     void
     run() override
     {
@@ -1011,6 +1130,7 @@ public:
         testFork();
         testHubNetwork();
         testPreferredByBranch();
+        testPauseForLaggards();
     }
 };
 
diff --git a/src/test/csf/Peer.h b/src/test/csf/Peer.h
index 7b56c8c254..c57bb4cee2 100644
--- a/src/test/csf/Peer.h
+++ b/src/test/csf/Peer.h
@@ -22,6 +22,7 @@
 #include <boost/container/flat_map.hpp>
 #include <boost/container/flat_set.hpp>
 #include <algorithm>
+#include <ripple/basics/UnorderedContainers.h>
 #include <test/csf/CollectorRef.h>
 #include <test/csf/Scheduler.h>
 #include <test/csf/TrustGraph.h>
@@ -165,9 +166,11 @@ struct Peer
     //! Type definitions for generic consensus
     using Ledger_t = Ledger;
     using NodeID_t = PeerID;
+    using NodeKey_t = PeerKey;
     using TxSet_t = TxSet;
     using PeerPosition_t = Position;
     using Result = ConsensusResult<Peer>;
+    using NodeKey = Validation::NodeKey;
 
     //! Logging support that prefixes messages with the peer ID
     beast::WrappedSink sink;
@@ -239,10 +242,7 @@ struct Peer
     //! Whether to simulate running as validator or a tracking node
     bool runAsValidator = true;
 
-    //! Enforce invariants on validation sequence numbers
-    SeqEnforcer<Ledger::Seq> seqEnforcer;
-
-    //TODO: Consider removing these two, they are only a convenience for tests
+    // TODO: Consider removing these two, they are only a convenience for tests
     // Number of proposers in the prior round
     std::size_t prevProposers = 0;
     // Duration of prior round
@@ -252,6 +252,8 @@ struct Peer
     // TODO: Use the logic in ValidatorList to set this dynamically
     std::size_t quorum = 0;
 
+    hash_set<NodeKey> trustedKeys;
+
     // Simulation parameters
     ConsensusParms consensusParms;
 
@@ -574,8 +576,7 @@ struct Peer
 
         // Can only send one validated ledger per seq
         if (runAsValidator && isCompatible && !consensusFail &&
-            seqEnforcer(
-                scheduler.now(), newLedger.seq(), validations.parms()))
+            validations.canValidateSeq(newLedger.seq()))
         {
             bool isFull = proposing;
 
@@ -837,6 +838,39 @@ struct Peer
         return addTrustedValidation(v);
     }
 
+    bool
+    haveValidated() const
+    {
+        return fullyValidatedLedger.seq() > Ledger::Seq{0};
+    }
+
+    Ledger::Seq
+    getValidLedgerIndex() const
+    {
+        return earliestAllowedSeq();
+    }
+
+    std::pair<std::size_t, hash_set<NodeKey_t>>
+    getQuorumKeys()
+    {
+        hash_set<NodeKey_t> keys;
+        for (auto const& p : trustGraph.trustedPeers(this))
+            keys.insert(p->key);
+        return {quorum, keys};
+    }
+
+    std::size_t
+    laggards(Ledger::Seq const seq, hash_set<NodeKey>& trustedKeys)
+    {
+        return validations.laggards(seq, trustedKeys);
+    }
+
+    bool
+    validator() const
+    {
+        return runAsValidator;
+    }
+
     //--------------------------------------------------------------------------
     // A locally submitted transaction
     void
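Taken together, these changes grow the implicit adaptor concept consumed by Consensus<Adaptor>: both RCLConsensus::Adaptor and the csf Peer test double above now provide the same five hooks. The declaration-only sketch below restates just that added surface; the struct name and the simplified type aliases are invented for illustration, not part of the patch:

    #include <cstddef>
    #include <cstdint>
    #include <unordered_set>
    #include <utility>

    // Simplified stand-ins for Ledger_t::Seq and hash_set<NodeKey_t>.
    using LedgerIndex = std::uint32_t;
    using KeySet = std::unordered_set<std::uint64_t>;

    // Hooks Consensus<Adaptor>::shouldPause() now calls on its adaptor.
    struct PauseHooks
    {
        bool haveValidated() const;               // ever fully validated?
        LedgerIndex getValidLedgerIndex() const;  // last fully validated seq
        std::pair<std::size_t, KeySet> getQuorumKeys() const;
        std::size_t laggards(LedgerIndex seq, KeySet& trustedKeys) const;
        bool validator() const;                   // configured as validator?
    };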