Consensus deadlock fixes and reliability improvements. (#187)

* Consensus time vote improvements.
* Skipped self proposal in Observer mode.
* Added docker cluster script params.
* Added state/lcl sync abandon threshold.
* Added pubkey display for cluster script.
* Added proposal latency log.
* Added sync completion check to prevent deadlocks.
This commit is contained in:
Ravin Perera
2020-12-08 15:17:27 +05:30
committed by GitHub
parent 7bf0475b6f
commit fe9e276f8d
14 changed files with 318 additions and 199 deletions

View File

@@ -103,19 +103,18 @@ namespace consensus
// A consensus round consists of 4 stages (0,1,2,3).
// For a given stage, this function may get visited multiple times due to time-wait conditions.
uint64_t stage_start = 0;
if (!wait_and_proceed_stage(stage_start))
if (!wait_and_proceed_stage())
return 0; // This means the stage has been reset.
LOG_DEBUG << "Started stage " << std::to_string(ctx.stage);
// We consider stage start time as the current discreet time throughout the stage.
ctx.time_now = stage_start;
// Throughout consensus, we continously update and prune the candidate proposals for newly
// arived ones and expired ones.
revise_candidate_proposals();
// If possible, switch back to proposer mode before stage processing.
check_sync_completion();
// Get current lcl and state.
std::string lcl = ledger::ctx.get_lcl();
const uint64_t lcl_seq_no = ledger::ctx.get_seq_no();
@@ -164,9 +163,7 @@ namespace consensus
// If we are in sync, vote and get the final winning votes.
// This is the consensus proposal which makes it into the ledger and contract execution
const p2p::proposal p = create_stage123_proposal(STAGE3_THRESHOLD, votes, lcl, unl_count, state, unl_hash);
// Update the unl with the unl changeset that subjected to the consensus.
unl::apply_changeset(p.unl_changeset.additions, p.unl_changeset.removals);
broadcast_proposal(p);
// Update the ledger and execute the contract using the consensus proposal.
if (update_ledger_and_execute_contract(p, lcl, state) == -1)
@@ -229,6 +226,16 @@ namespace consensus
return false;
}
/**
* Checks whether we can switch back from currently ongoing observer-mode sync operation
* that has been completed.
*/
void check_sync_completion()
{
if (conf::cfg.operating_mode == conf::OPERATING_MODE::OBSERVER && !state_sync::ctx.is_syncing && !ledger::sync_ctx.is_syncing)
conf::change_operating_mode(conf::OPERATING_MODE::PROPOSER);
}
/**
* Moves proposals collected from the network into candidate proposals and
* cleans up any outdated proposals from the candidate set.
@@ -247,24 +254,17 @@ namespace consensus
// Add propsals of new nodes and replace proposals from old nodes to reflect current status of nodes.
for (const auto &proposal : collected_proposals)
{
auto prop_itr = ctx.candidate_proposals.find(proposal.pubkey);
if (prop_itr != ctx.candidate_proposals.end())
{
ctx.candidate_proposals.erase(prop_itr);
ctx.candidate_proposals.emplace(proposal.pubkey, std::move(proposal));
}
else
{
ctx.candidate_proposals.emplace(proposal.pubkey, std::move(proposal));
}
ctx.candidate_proposals.erase(proposal.pubkey); // Erase if already exists.
ctx.candidate_proposals.emplace(proposal.pubkey, std::move(proposal));
}
// Prune any outdated proposals.
auto itr = ctx.candidate_proposals.begin();
const uint64_t time_now = util::get_epoch_milliseconds();
while (itr != ctx.candidate_proposals.end())
{
const p2p::proposal &cp = itr->second;
const uint64_t time_diff = (ctx.time_now > cp.sent_timestamp) ? (ctx.time_now - cp.sent_timestamp) : 0;
const uint64_t time_diff = (time_now > cp.sent_timestamp) ? (time_now - cp.sent_timestamp) : 0;
const int8_t stage_diff = ctx.stage - cp.stage;
// only consider recent proposals and proposals from previous stage and current stage.
@@ -277,7 +277,8 @@ namespace consensus
<< " ts:" << std::to_string(cp.time)
<< " lcl:" << cp.lcl.substr(0, 15)
<< " state:" << cp.state
<< " [from:" << ((cp.pubkey == conf::cfg.pubkey) ? "self" : util::get_hex(cp.pubkey, 1, 5)) << "]";
<< " [from:" << ((cp.pubkey == conf::cfg.pubkey) ? "self" : util::get_hex(cp.pubkey, 1, 5)) << "]"
<< "(" << std::to_string(cp.recv_timestamp > cp.sent_timestamp ? cp.recv_timestamp - cp.sent_timestamp : 0) << "ms)";
if (keep_candidate)
++itr;
@@ -290,7 +291,7 @@ namespace consensus
* Syncrhonise the stage/round time for fixed intervals and reset the stage.
* @return True if consensus can proceed in the current round. False if stage is reset.
*/
bool wait_and_proceed_stage(uint64_t &stage_start)
bool wait_and_proceed_stage()
{
// Here, nodes try to synchronise nodes stages using network clock.
// We devide universal time to windows of equal size of roundtime. Each round must be synced with the
@@ -299,23 +300,24 @@ namespace consensus
const uint64_t now = util::get_epoch_milliseconds();
// Rrounds are discreet windows of roundtime.
// This gets the start time of current round window. Stage 0 must start in the next round window.
const uint64_t current_round_start = (((uint64_t)(now / conf::cfg.roundtime)) * conf::cfg.roundtime);
if (ctx.stage == 0)
{
// This gets the start time of current round window. Stage 0 must start in the window after that.
const uint64_t previous_round_start = (((uint64_t)(now / conf::cfg.roundtime)) * conf::cfg.roundtime);
// Stage 0 must start in the next round window.
// (This makes sure stage 3 gets whichever the remaining time in the round after stages 0,1,2)
stage_start = current_round_start + conf::cfg.roundtime;
const uint64_t to_wait = stage_start - now;
ctx.round_start_time = previous_round_start + conf::cfg.roundtime;
const uint64_t to_wait = ctx.round_start_time - now;
LOG_DEBUG << "Waiting " << to_wait << "ms for next round stage 0";
LOG_DEBUG << "Waiting " << to_wait << "ms for next round stage 0.";
util::sleep(to_wait);
return true;
}
else
{
stage_start = current_round_start + (ctx.stage * ctx.stage_time);
const uint64_t stage_start = ctx.round_start_time + (ctx.stage * ctx.stage_time);
// Compute stage time wait.
// Node wait between stages to collect enough proposals from previous stages from other nodes.
@@ -325,7 +327,7 @@ namespace consensus
// it will join in next round. Otherwise it will continue particapating in this round.
if (to_wait < ctx.stage_reset_wait_threshold) //todo: self claculating/adjusting network delay
{
LOG_DEBUG << "Missed stage " << std::to_string(ctx.stage) << " window. Resetting to stage 0";
LOG_DEBUG << "Missed stage " << std::to_string(ctx.stage) << " window. Resetting to stage 0.";
ctx.stage = 1;
return false;
}
@@ -515,51 +517,54 @@ namespace consensus
{
// This is the proposal that stage 0 votes on.
// We report our own values in stage 0.
p2p::proposal stg_prop;
stg_prop.time = ctx.time_now;
stg_prop.stage = 0;
stg_prop.lcl = lcl;
stg_prop.state = state;
stg_prop.unl_hash = unl_hash;
crypto::random_bytes(stg_prop.nonce, ROUND_NONCE_SIZE);
p2p::proposal p;
p.time = ctx.round_start_time;
p.stage = 0;
p.lcl = lcl;
p.state = state;
p.unl_hash = unl_hash;
crypto::random_bytes(p.nonce, ROUND_NONCE_SIZE);
// Populate the proposal with set of candidate user pubkeys.
stg_prop.users.swap(ctx.candidate_users);
p.users.swap(ctx.candidate_users);
// Populate the proposal with hashes of user inputs.
for (const auto &[hash, cand_input] : ctx.candidate_user_inputs)
stg_prop.hash_inputs.emplace(hash);
p.hash_inputs.emplace(hash);
// Populate the proposal with hashes of user outputs.
for (const auto &[hash, cand_output] : ctx.candidate_user_outputs)
stg_prop.hash_outputs.emplace(hash);
p.hash_outputs.emplace(hash);
// Populate the proposal wil unl changeset.
stg_prop.unl_changeset = ctx.candidate_unl_changeset;
// Populate the proposal with unl changeset.
p.unl_changeset = ctx.candidate_unl_changeset;
return stg_prop;
return p;
}
p2p::proposal create_stage123_proposal(const float_t vote_threshold, vote_counter &votes, std::string_view lcl, const size_t unl_count, const hpfs::h32 state, std::string_view unl_hash)
{
// The proposal to be emited at the end of this stage.
p2p::proposal stg_prop;
stg_prop.stage = ctx.stage;
stg_prop.state = state;
p2p::proposal p;
p.stage = ctx.stage;
p.state = state;
// we always vote for our current lcl and state regardless of what other peers are saying
// if there's a fork condition we will either request history and state from
// We always vote for our current lcl and state regardless of what other peers are saying.
// If there's a fork condition we will either request history and state from
// our peers or we will halt depending on level of consensus on the sides of the fork.
stg_prop.lcl = lcl;
p.lcl = lcl;
stg_prop.unl_hash = unl_hash;
// We always votr for our current unl hash.
p.unl_hash = unl_hash;
const uint64_t time_now = util::get_epoch_milliseconds();
// Vote for rest of the proposal fields by looking at candidate proposals.
for (const auto &[pubkey, cp] : ctx.candidate_proposals)
{
// Vote for times.
// Everyone votes on an arbitrary time, as long as it's not in the future and within the round time.
if (ctx.time_now > cp.time && (ctx.time_now - cp.time) <= conf::cfg.roundtime)
// Everyone votes on the discreet time, as long as it's not in the future and within 2 round times.
if (time_now > cp.time && (time_now - cp.time) <= (conf::cfg.roundtime * 2))
increment(votes.time, cp.time);
// Vote for round nonce.
@@ -600,32 +605,32 @@ namespace consensus
// Add user pubkeys which have votes over stage threshold to proposal.
for (const auto &[pubkey, numvotes] : votes.users)
if (numvotes >= required_votes || (ctx.stage == 1 && numvotes > 0))
stg_prop.users.emplace(pubkey);
p.users.emplace(pubkey);
// Add inputs which have votes over stage threshold to proposal.
for (const auto &[hash, numvotes] : votes.inputs)
if (numvotes >= required_votes || (ctx.stage == 1 && numvotes > 0))
stg_prop.hash_inputs.emplace(hash);
p.hash_inputs.emplace(hash);
// Add outputs which have votes over stage threshold to proposal.
for (const auto &[hash, numvotes] : votes.outputs)
if (numvotes >= required_votes)
stg_prop.hash_outputs.emplace(hash);
p.hash_outputs.emplace(hash);
// For the unl changeset reset required votes for majority votes.
// For the unl changeset, reset required votes for majority votes.
required_votes = ceil(MAJORITY_THRESHOLD * unl_count);
// Add unl additions which have votes over majority threshold to proposal.
for (const auto &[pubkey, numvotes] : votes.unl_additions)
if (numvotes >= required_votes)
stg_prop.unl_changeset.additions.emplace(pubkey);
p.unl_changeset.additions.emplace(pubkey);
// Add unl removals which have votes over majority threshold to proposal.
for (const auto &[pubkey, numvotes] : votes.unl_removals)
if (numvotes >= required_votes)
stg_prop.unl_changeset.removals.emplace(pubkey);
p.unl_changeset.removals.emplace(pubkey);
// time is voted on a simple sorted (highest to lowest) and majority basis, since there will always be disagreement.
// time is voted on a simple sorted (highest to lowest) and majority basis.
uint32_t highest_time_vote = 0;
for (auto itr = votes.time.rbegin(); itr != votes.time.rend(); ++itr)
{
@@ -635,9 +640,12 @@ namespace consensus
if (numvotes > highest_time_vote)
{
highest_time_vote = numvotes;
stg_prop.time = time;
p.time = time;
}
}
// If final time happens to be 0 (this can happen if there were no proposals to vote for), we set the time manually.
if (p.time == 0)
p.time = ctx.round_start_time;
// Round nonce is voted on a simple sorted (highest to lowest) and majority basis, since there will always be disagreement.
uint32_t highest_nonce_vote = 0;
@@ -648,31 +656,29 @@ namespace consensus
if (numvotes > highest_nonce_vote)
{
highest_time_vote = numvotes;
stg_prop.nonce = nonce;
highest_nonce_vote = numvotes;
p.nonce = nonce;
}
}
return stg_prop;
return p;
}
/**
* Broadcasts the given proposal to all connected peers if in PROPOSER mode. Otherwise
* only send to self in OBSERVER mode.
* Broadcasts the given proposal to all connected peers if in PROPOSER mode. Does not send in OBSERVER mode.
* @return 0 on success. -1 if no peers to broadcast.
*/
void broadcast_proposal(const p2p::proposal &p)
{
// In observer mode, we do not send out proposals.
if (conf::cfg.operating_mode == conf::OPERATING_MODE::OBSERVER)
return;
flatbuffers::FlatBufferBuilder fbuf(1024);
p2pmsg::create_msg_from_proposal(fbuf, p);
p2p::broadcast_message(fbuf, true, false, !conf::cfg.is_consensus_public);
// In observer mode, we only send out the proposal to ourselves.
if (conf::cfg.operating_mode == conf::OPERATING_MODE::OBSERVER)
p2p::send_message_to_self(fbuf);
else
p2p::broadcast_message(fbuf, true, false, !conf::cfg.is_consensus_public);
LOG_DEBUG << "Proposed u/i/o:" << p.users.size()
LOG_DEBUG << "Proposed <s" << std::to_string(p.stage) << "> u/i/o:" << p.users.size()
<< "/" << p.hash_inputs.size()
<< "/" << p.hash_outputs.size()
<< " ts:" << std::to_string(p.time)
@@ -843,7 +849,8 @@ namespace consensus
}
}
// Clear candidate unl changset after consensus rounds are completed.
// Update the unl with the unl changeset that subjected to the consensus.
unl::apply_changeset(cons_prop.unl_changeset.additions, cons_prop.unl_changeset.removals);
ctx.candidate_unl_changeset.clear();
// Send any output from the previous consensus round to locally connected users.