Large cluster optimizations. (#348)

* Added sync log to streamer.
* Fixed ledger closing attempt while syncing.
* Added diagnostic contract.
* Reset to stage 0 on unreliable votes.
* Reduced peer message age threshold.
* Added health tracking.
* Improved weakly-connected detection.
* Bumped version to 0.5.1.
* Improved client lib server version check.
* Added health logging support to text client.
* Added weakly-connected status to status response.
* Increased max peer limits when serializing.
* Local Docker cluster manual IP.
* Updated Vultr script VM region order.
* Improved sync status reporting.
* Added milliseconds to logging.
2026-04-29 15:37:59 +00:00 · 2021-09-17 11:53:49 +05:30
parent c686745c81
commit 6dc0776b56
32 changed files with 720 additions and 86 deletions
--- a/src/status.cpp
+++ b/src/status.cpp
@@ -2,6 +2,7 @@
 #include "util/sequence_hash.hpp"
 #include "ledger/ledger_common.hpp"
 #include "conf.hpp"
+#include "p2p/p2p.hpp"

 namespace status
 {
@@ -20,6 +21,11 @@ namespace status

    std::shared_mutex peers_mutex;
    std::set<conf::peer_ip_port> peers; // Known ip:port pairs for connection verified peers.
+    std::atomic<size_t> peer_count{0};                // Cached size of 'peers'; updated when the peer set is replaced with a different size.
+    std::atomic<bool> weakly_connected{false};        // Last value recorded by set_weakly_connected().
+    std::atomic<int16_t> available_mesh_capacity{-1}; // Last value recorded by set_available_mesh_capacity(); starts at -1.
+
+    proposal_health phealth{};                        // Proposal stats filled by report_proposal_batch() and queued by emit_proposal_health().

    //----- Ledger status

@@ -33,7 +39,7 @@ namespace status
    void ledger_created(const util::sequence_hash &ledger_id, const ledger::ledger_record &ledger)
    {
        // If currently not-in-sync, report it as in-sync when a ledger is created.
-        if (in_sync != 1)
+        if (in_sync.load() != 1)
            sync_status_changed(true);

        std::unique_lock lock(ledger_mutex);
@@ -44,8 +50,12 @@ namespace status

    void sync_status_changed(const bool new_in_sync)
    {
-        in_sync = new_in_sync ? 1 : 0;
-        event_queue.try_enqueue(sync_status_change_event{new_in_sync});
+        const int new_value = new_in_sync ? 1 : 0;
+        if (new_value != in_sync.load())
+        {
+            in_sync = new_value;
+            event_queue.try_enqueue(sync_status_change_event{new_in_sync});
+        }
    }

    const util::sequence_hash get_lcl_id()
@@ -56,7 +66,7 @@ namespace status

    const bool is_in_sync()
    {
-        return in_sync == 1;
+        return in_sync.load() == 1;
    }

    const ledger::ledger_record get_last_ledger()
@@ -93,6 +103,14 @@ namespace status
    {
        std::unique_lock lock(peers_mutex);
        peers = std::move(updated_peers);
+
+        if (peers.size() != peer_count)
+        {
+            peer_count = peers.size();
+
+            if (conf::cfg.health.connectivity_stats)
+                event_queue.try_enqueue(connectivity_health{peer_count.load(), weakly_connected.load()});
+        }
    }

    const std::set<conf::peer_ip_port> get_peers()
@@ -101,4 +119,90 @@ namespace status
        return peers;
    }

+    const size_t get_peers_count() // Returns the cached number of known peers.
+    {
+        return peer_count; // Implicit conversion performs the atomic load.
+    }
+
+    void set_weakly_connected(const bool is_weakly_connected) // Records the flag; acts only when the value changes.
+    {
+        if (weakly_connected.load() == is_weakly_connected)
+            return; // Unchanged, so there is nothing to record or report.
+
+        weakly_connected.store(is_weakly_connected);
+
+        if (conf::cfg.health.connectivity_stats) // Connectivity events are queued only when enabled in config.
+            event_queue.try_enqueue(connectivity_health{peer_count.load(), weakly_connected.load()});
+    }
+
+    const bool get_weakly_connected() // Returns the most recently recorded weakly-connected flag.
+    {
+        return weakly_connected; // Implicit conversion performs the atomic load.
+    }
+
+    void set_available_mesh_capacity(const int16_t new_capacity) // Overwrites the stored capacity unconditionally.
+    {
+        available_mesh_capacity.store(new_capacity);
+    }
+
+    const int16_t get_available_mesh_capacity() // Returns the last stored capacity (-1 until one is set).
+    {
+        return available_mesh_capacity; // Implicit conversion performs the atomic load.
+    }
+
+    //----- Node health
+
+    void report_proposal_batch(const std::list<p2p::proposal> &proposals) // Rebuilds phealth from one batch of proposals.
+    {
+        if (!conf::cfg.health.proposal_stats)
+            return; // Proposal stats are disabled in config; phealth is left untouched.
+        // Reset every statistic before processing this batch.
+        phealth.comm_latency_min = UINT64_MAX;
+        phealth.comm_latency_max = 0;
+        phealth.comm_latency_avg = 0;
+        phealth.read_latency_min = UINT64_MAX;
+        phealth.read_latency_max = 0;
+        phealth.read_latency_avg = 0;
+        phealth.batch_size = proposals.size();
+
+        if (phealth.batch_size == 0)
+            return; // Empty batch: statistics keep the reset values above (min stays UINT64_MAX).
+
+        const uint64_t now = util::get_epoch_milliseconds(); // One reference time for the whole batch.
+        uint64_t total_comm_latency = 0;
+        uint64_t total_read_latency = 0;
+
+        for (const p2p::proposal &p : proposals)
+        {
+            const uint64_t comm_latency = (p.sent_timestamp < p.recv_timestamp) ? (p.recv_timestamp - p.sent_timestamp) : 0;
+            const uint64_t read_latency = (p.recv_timestamp < now) ? (now - p.recv_timestamp) : 0; // Clamped like comm_latency so the unsigned subtraction cannot wrap.
+
+            total_comm_latency += comm_latency;
+            total_read_latency += read_latency;
+
+            if (comm_latency < phealth.comm_latency_min)
+                phealth.comm_latency_min = comm_latency;
+
+            if (comm_latency > phealth.comm_latency_max)
+                phealth.comm_latency_max = comm_latency;
+
+            if (read_latency < phealth.read_latency_min)
+                phealth.read_latency_min = read_latency;
+
+            if (read_latency > phealth.read_latency_max)
+                phealth.read_latency_max = read_latency;
+        }
+
+        phealth.comm_latency_avg = total_comm_latency / phealth.batch_size; // Integer division; batch_size is non-zero here.
+        phealth.read_latency_avg = total_read_latency / phealth.batch_size;
+    }
+
+    void emit_proposal_health()
+    {
+        // Queue the current phealth values as an event, but only when proposal stats are enabled in config.
+        const bool stats_enabled = conf::cfg.health.proposal_stats;
+        if (stats_enabled)
+            event_queue.try_enqueue(phealth);
+    }
+
 } // namespace status