Large cluster optimizations. (#348)

* Added sync log to streamer.
* Fixed ledger closing attempt while syncing.
* Added diagnostic contract.
* Reset to stage 0 on unreliable votes.
* Reduced peer msg age threshold.
* Added health tracking.
* Weakly-connected detection improvement.
* Increased version 0.5.1.
* Improved client lib server version check.
* Added health logging support to text client.
* Added weakly connected status in status response.
* Increased max peers limits when serializing.
* Local docker cluster manual ip.
* Updated vultr script vm region order.
* Sync status reporting improvement.
* Added milliseconds to logging.
This commit is contained in:
Ravin Perera
2021-09-17 11:53:49 +05:30
committed by GitHub
parent c686745c81
commit 6dc0776b56
32 changed files with 720 additions and 86 deletions

View File

@@ -92,6 +92,9 @@ namespace consensus
break;
}
if (ctx.stage == 0)
status::emit_proposal_health();
if (consensus() == -1)
{
LOG_ERROR << "Consensus thread exited due to an error.";
@@ -119,7 +122,7 @@ namespace consensus
revise_candidate_proposals(ctx.vote_status == VOTES_SYNCED);
// Attempt to close the ledger after scanning last round stage 3 proposals.
if (ctx.stage == 0)
if (ctx.stage == 0 && ctx.vote_status == VOTES_SYNCED)
attempt_ledger_close();
// Get current lcl, state, patch, primary shard and raw shard info.
@@ -156,7 +159,7 @@ namespace consensus
const size_t unl_count = unl::count();
vote_counter votes;
// Check whether we are in sync with other nodes using proposals.
// Check whether we are in sync with other nodes using the proposals we received.
{
int new_sync_status = check_sync_status(unl_count, votes, lcl_id);
@@ -174,7 +177,7 @@ namespace consensus
}
// Update the node's status if we went from in-sync to not-in-sync. We will report back as being in-sync only when ledger is created.
if (ctx.vote_status == VOTES_SYNCED && new_sync_status != VOTES_SYNCED)
if (new_sync_status == VOTES_DESYNC)
status::sync_status_changed(false);
// This marks entering into a new sync cycle.
@@ -195,9 +198,10 @@ namespace consensus
if (ctx.vote_status == VOTES_UNRELIABLE)
{
ctx.stage = 0;
ctx.unreliable_votes_attempts++;
// If we get too many consecative unreliable vote rounds, then we perform time config sniffing just in case the unreliable votes
// If we get too many consecutive unreliable vote rounds, then we perform time config sniffing just in case the unreliable votes
// are caused because our roundtime config information is different from other nodes.
if (ctx.unreliable_votes_attempts >= MAX_UNRELIABLE_VOTES_ATTEMPTS)
{
@@ -208,21 +212,21 @@ namespace consensus
else
{
ctx.unreliable_votes_attempts = 0;
}
if (ctx.vote_status == VOTES_SYNCED)
{
// If we are in sync, vote and broadcast the winning votes to next stage.
const p2p::proposal p = create_stage123_proposal(votes, unl_count, state_hash, patch_hash, last_primary_shard_id, last_raw_shard_id);
broadcast_proposal(p);
// This marks the moment we finish a sync cycle. We are in stage 1 and we detect that our votes are in sync.
if (ctx.stage == 1 && ctx.sync_ongoing)
if (ctx.vote_status == VOTES_SYNCED)
{
// Clear any sync recovery pending state if we enter stage 1 while being in sync.
ctx.sync_ongoing = false;
status::sync_status_changed(true);
LOG_DEBUG << "Sync recovery completed.";
// If we are in sync, vote and broadcast the winning votes to next stage.
const p2p::proposal p = create_stage123_proposal(votes, unl_count, state_hash, patch_hash, last_primary_shard_id, last_raw_shard_id);
broadcast_proposal(p);
// This marks the moment we finish a sync cycle. We are in stage 1 and we just detected that our votes are in sync.
if (ctx.stage == 1 && ctx.sync_ongoing)
{
// Clear any sync recovery pending state if we enter stage 1 while being in sync.
ctx.sync_ongoing = false;
status::sync_status_changed(true);
LOG_DEBUG << "Sync recovery completed.";
}
}
}
@@ -447,7 +451,7 @@ namespace consensus
/**
* Moves proposals collected from the network into candidate proposals and
* cleans up any outdated proposals from the candidate set.
* @param in_sync Whether the node is currently on sync or not. We relax the pruning criteria if we are not in sync.
* @param in_sync Whether the node is currently in sync or not. We relax the pruning criteria if we are not in sync.
*/
void revise_candidate_proposals(const bool in_sync)
{
@@ -459,6 +463,8 @@ namespace consensus
collected_proposals.splice(collected_proposals.end(), p2p::ctx.collected_msgs.proposals);
}
status::report_proposal_batch(collected_proposals);
// Prune incoming proposals if they are older than existing proposal from same node.
{
auto itr = collected_proposals.begin();