From e2f562ae91e07691842880d09ecce31da330b376 Mon Sep 17 00:00:00 2001 From: Mark Travis Date: Fri, 22 Mar 2024 13:22:29 -0700 Subject: [PATCH] Don't reach consensus as quickly if no other proposals seen: (#4763) This fixes a case where a peer can desync under a certain timing circumstance--if it reaches a certain point in consensus before it receives proposals. This was noticed under high transaction volumes. Namely, the desync occurs when we arrive at the point of deciding whether consensus is reached after the minimum establish phase duration has elapsed but before having received any proposals. This could be caused by finishing the previous round slightly faster and/or having some delay in receiving proposals. Existing behavior arrives at consensus immediately after the minimum establish duration with no proposals. This causes us to desync because we then close a non-validated ledger. The change in this PR causes us to wait for a configured threshold before making the decision to arrive at consensus with no proposals. This allows validators to catch up and for brief delays in receiving proposals to be absorbed. There should be no drawback since, with no proposals coming in, we needn't be in a huge rush to jump ahead. --- src/ripple/consensus/Consensus.cpp | 39 ++++++++++++++++++++++----- src/ripple/consensus/Consensus.h | 2 +- src/ripple/consensus/ConsensusParms.h | 2 +- src/test/consensus/Consensus_test.cpp | 9 +++++-- 4 files changed, 42 insertions(+), 10 deletions(-) diff --git a/src/ripple/consensus/Consensus.cpp b/src/ripple/consensus/Consensus.cpp index 1b08859c8..cc1f84270 100644 --- a/src/ripple/consensus/Consensus.cpp +++ b/src/ripple/consensus/Consensus.cpp @@ -87,11 +87,24 @@ checkConsensusReached( std::size_t agreeing, std::size_t total, bool count_self, - std::size_t minConsensusPct) + std::size_t minConsensusPct, + bool reachedMax) { - // If we are alone, we have a consensus + // If we are alone for too long, we have consensus. 
+ // Delaying consensus like this avoids a circumstance where a peer + // gets ahead of proposers insofar as it has not received any proposals. + // This could happen if there's a slowdown in receiving proposals. Reaching + // consensus prematurely in this way means that the peer will likely desync. + // The check for reachedMax should allow plenty of time for proposals to + // arrive, and there should be no downside. If a peer is truly not + // receiving any proposals, then there should be no hurry. There's + // really nowhere to go. if (total == 0) - return true; + { + if (reachedMax) + return true; + return false; + } if (count_self) { @@ -120,7 +133,13 @@ checkConsensus( << prevProposers << " agree=" << currentAgree << " validated=" << currentFinished << " time=" << currentAgreeTime.count() << "/" - << previousAgreeTime.count(); + << previousAgreeTime.count() << " proposing? " << proposing + << " minimum duration to reach consensus: " + << parms.ledgerMIN_CONSENSUS.count() << "ms" + << " max consensus time " + << parms.ledgerMAX_CONSENSUS.count() << "s" + << " minimum consensus percentage: " + << parms.minCONSENSUS_PCT; if (currentAgreeTime <= parms.ledgerMIN_CONSENSUS) return ConsensusState::No; @@ -139,7 +158,11 @@ checkConsensus( // Have we, together with the nodes on our UNL list, reached the threshold // to declare consensus? if (checkConsensusReached( - currentAgree, currentProposers, proposing, parms.minCONSENSUS_PCT)) + currentAgree, + currentProposers, + proposing, + parms.minCONSENSUS_PCT, + currentAgreeTime > parms.ledgerMAX_CONSENSUS)) { JLOG(j.debug()) << "normal consensus"; return ConsensusState::Yes; @@ -148,7 +171,11 @@ checkConsensus( // Have sufficient nodes on our UNL list moved on and reached the threshold // to declare consensus? 
if (checkConsensusReached( - currentFinished, currentProposers, false, parms.minCONSENSUS_PCT)) + currentFinished, + currentProposers, + false, + parms.minCONSENSUS_PCT, + currentAgreeTime > parms.ledgerMAX_CONSENSUS)) { JLOG(j.warn()) << "We see no consensus, but 80% of nodes have moved on"; return ConsensusState::MovedOn; diff --git a/src/ripple/consensus/Consensus.h b/src/ripple/consensus/Consensus.h index df5ec01ce..dfd706683 100644 --- a/src/ripple/consensus/Consensus.h +++ b/src/ripple/consensus/Consensus.h @@ -1155,7 +1155,7 @@ Consensus::shouldPause() const std::size_t const offline = trustedKeys.size(); std::stringstream vars; - vars << " (working seq: " << previousLedger_.seq() << ", " + vars << " consensuslog (working seq: " << previousLedger_.seq() << ", " << "validated seq: " << adaptor_.getValidLedgerIndex() << ", " << "am validator: " << adaptor_.validator() << ", " << "have validated: " << adaptor_.haveValidated() << ", " diff --git a/src/ripple/consensus/ConsensusParms.h b/src/ripple/consensus/ConsensusParms.h index 542b3644b..a0b6c6be8 100644 --- a/src/ripple/consensus/ConsensusParms.h +++ b/src/ripple/consensus/ConsensusParms.h @@ -86,7 +86,7 @@ struct ConsensusParms * validators don't appear to be offline that are merely waiting for * laggards. */ - std::chrono::milliseconds ledgerMAX_CONSENSUS = std::chrono::seconds{10}; + std::chrono::milliseconds ledgerMAX_CONSENSUS = std::chrono::seconds{15}; //! 
Minimum number of seconds to wait to ensure others have computed the LCL std::chrono::milliseconds ledgerMIN_CLOSE = std::chrono::seconds{2}; diff --git a/src/test/consensus/Consensus_test.cpp b/src/test/consensus/Consensus_test.cpp index 1c19ff070..5c7dc2626 100644 --- a/src/test/consensus/Consensus_test.cpp +++ b/src/test/consensus/Consensus_test.cpp @@ -109,10 +109,15 @@ public: ConsensusState::MovedOn == checkConsensus(10, 2, 1, 8, 3s, 10s, p, true, journal_)); - // No peers makes it easy to agree + // If no peers, don't agree until time has passed. + BEAST_EXPECT( + ConsensusState::No == + checkConsensus(0, 0, 0, 0, 3s, 10s, p, true, journal_)); + + // Agree if no peers and enough time has passed. BEAST_EXPECT( ConsensusState::Yes == - checkConsensus(0, 0, 0, 0, 3s, 10s, p, true, journal_)); + checkConsensus(0, 0, 0, 0, 3s, 16s, p, true, journal_)); } void