diff --git a/include/xrpl/server/detail/Door.h b/include/xrpl/server/detail/Door.h index e273ca791f..b72bc9bdea 100644 --- a/include/xrpl/server/detail/Door.h +++ b/include/xrpl/server/detail/Door.h @@ -90,11 +90,11 @@ private: acceptor_type acceptor_; boost::asio::strand strand_; bool ssl_{ - port_.protocol.count("https") > 0 || port_.protocol.count("wss") > 0 || - port_.protocol.count("wss2") > 0 || port_.protocol.count("peer") > 0}; + port_.protocol.contains("https") || port_.protocol.contains("wss") || + port_.protocol.contains("wss2") || port_.protocol.contains("peer")}; bool plain_{ - port_.protocol.count("http") > 0 || port_.protocol.count("ws") > 0 || - (port_.protocol.count("ws2") != 0u)}; + port_.protocol.contains("http") || port_.protocol.contains("ws") || + (port_.protocol.contains("ws2"))}; static constexpr std::chrono::milliseconds kInitialAcceptDelay{50}; static constexpr std::chrono::milliseconds kMaxAcceptDelay{2000}; std::chrono::milliseconds acceptDelay_{kInitialAcceptDelay}; diff --git a/src/xrpld/overlay/detail/ConnectAttempt.cpp b/src/xrpld/overlay/detail/ConnectAttempt.cpp index 1167e19ca3..295f4f5497 100644 --- a/src/xrpld/overlay/detail/ConnectAttempt.cpp +++ b/src/xrpld/overlay/detail/ConnectAttempt.cpp @@ -2,6 +2,7 @@ #include #include +#include #include #include #include @@ -28,19 +29,17 @@ #include #include #include -#include #include #include #include #include #include -#include #include #include #include #include -#include +#include #include #include @@ -52,7 +51,7 @@ ConnectAttempt::ConnectAttempt( endpoint_type remoteEndpoint, Resource::Consumer usage, shared_context const& context, - std::uint32_t id, + Peer::id_t id, std::shared_ptr const& slot, beast::Journal journal, OverlayImpl& overlay) @@ -65,7 +64,6 @@ ConnectAttempt::ConnectAttempt( , usage_(usage) , strand_(boost::asio::make_strand(ioContext)) , timer_(ioContext) - , stepTimer_(ioContext) , streamPtr_( std::make_unique( socket_type(std::forward(ioContext)), @@ -78,10 +76,9 @@ ConnectAttempt::ConnectAttempt( ConnectAttempt::~ConnectAttempt() { - // slot_ will be null if we successfully connected - // and transferred ownership to a PeerImp if (slot_ != nullptr) overlay_.peerFinder().onClosed(slot_); + JLOG(journal_.trace()) << "~ConnectAttempt"; } void @@ -92,30 +89,17 @@ ConnectAttempt::stop() boost::asio::post(strand_, std::bind(&ConnectAttempt::stop, shared_from_this())); return; } - - if (!socket_.is_open()) - return; - - JLOG(journal_.debug()) << "stop: Stop"; - - shutdown(); + if (socket_.is_open()) + { + JLOG(journal_.debug()) << "Stop"; + } + close(); } void ConnectAttempt::run() { - if (!strand_.running_in_this_thread()) - { - boost::asio::post(strand_, std::bind(&ConnectAttempt::run, shared_from_this())); - return; - } - - JLOG(journal_.debug()) << "run: connecting to " << remoteEndpoint_; - - ioPending_ = true; - - // Allow up to connectTimeout_ seconds to establish remote peer connection - setTimer(ConnectionStep::TcpConnect); + setTimer(); stream_.next_layer().async_connect( remoteEndpoint_, @@ -126,73 +110,6 @@ ConnectAttempt::run() //------------------------------------------------------------------------------ -void -ConnectAttempt::shutdown() -{ - XRPL_ASSERT( - strand_.running_in_this_thread(), "xrpl::ConnectAttempt::shutdown: strand in this thread"); - - if (!socket_.is_open()) - return; - - shutdown_ = true; - boost::beast::get_lowest_layer(stream_).cancel(); - - tryAsyncShutdown(); -} - -void -ConnectAttempt::tryAsyncShutdown() -{ - XRPL_ASSERT( - strand_.running_in_this_thread(), - "xrpl::ConnectAttempt::tryAsyncShutdown : strand in this thread"); - - if (!shutdown_ || currentStep_ == ConnectionStep::ShutdownStarted) - return; - - if (ioPending_) - return; - - // gracefully shutdown the SSL socket, performing a shutdown handshake - if (currentStep_ != ConnectionStep::TcpConnect && currentStep_ != ConnectionStep::TlsHandshake) - { - setTimer(ConnectionStep::ShutdownStarted); - stream_.async_shutdown(bind_executor( - strand_, - std::bind(&ConnectAttempt::onShutdown, shared_from_this(), std::placeholders::_1))); - return; - } - - close(); -} - -void -ConnectAttempt::onShutdown(error_code ec) -{ - cancelTimer(); - - if (ec) - { - // - eof: the stream was cleanly closed - // - operation_aborted: an expired timer (slow shutdown) - // - stream_truncated: the tcp connection closed (no handshake) it could - // occur if a peer does not perform a graceful disconnect - // - broken_pipe: the peer is gone - // - application data after close notify: benign SSL shutdown condition - bool const shouldLog = - (ec != boost::asio::error::eof && ec != boost::asio::error::operation_aborted && - ec.message().find("application data after close notify") == std::string::npos); - - if (shouldLog) - { - JLOG(journal_.debug()) << "onShutdown: " << ec.message(); - } - } - - close(); -} - void ConnectAttempt::close() { @@ -201,93 +118,50 @@ ConnectAttempt::close() if (!socket_.is_open()) return; - cancelTimer(); + try + { + timer_.cancel(); + socket_.close(); + } + catch (boost::system::system_error const&) // NOLINT(bugprone-empty-catch) + { + // ignored + } - error_code ec; - socket_.close(ec); // NOLINT(bugprone-unused-return-value) + JLOG(journal_.debug()) << "Closed"; } void ConnectAttempt::fail(std::string const& reason) { JLOG(journal_.debug()) << reason; - shutdown(); + close(); } void ConnectAttempt::fail(std::string const& name, error_code ec) { JLOG(journal_.debug()) << name << ": " << ec.message(); - shutdown(); + close(); } void -ConnectAttempt::setTimer(ConnectionStep step) +ConnectAttempt::setTimer() { - currentStep_ = step; - - // Set global timer (only if not already set) - if (timer_.expiry() == std::chrono::steady_clock::time_point{}) - { - try - { - timer_.expires_after(kConnectTimeout); - timer_.async_wait( - boost::asio::bind_executor( - strand_, - std::bind( - &ConnectAttempt::onTimer, shared_from_this(), std::placeholders::_1))); - } - catch (std::exception const& ex) - { - JLOG(journal_.error()) << "setTimer (global): " << ex.what(); - close(); - return; - } - } - - // Set step-specific timer try { - std::chrono::seconds stepTimeout; - switch (step) - { - case ConnectionStep::TcpConnect: - stepTimeout = StepTimeouts::kTcpConnect; - break; - case ConnectionStep::TlsHandshake: - stepTimeout = StepTimeouts::kTlsHandshake; - break; - case ConnectionStep::HttpWrite: - stepTimeout = StepTimeouts::kHttpWrite; - break; - case ConnectionStep::HttpRead: - stepTimeout = StepTimeouts::kHttpRead; - break; - case ConnectionStep::ShutdownStarted: - stepTimeout = StepTimeouts::kTlsShutdown; - break; - case ConnectionStep::Complete: - case ConnectionStep::Init: - return; // No timer needed for init or complete step - } - - // call to expires_after cancels previous timer - stepTimer_.expires_after(stepTimeout); - stepTimer_.async_wait( - boost::asio::bind_executor( - strand_, - std::bind(&ConnectAttempt::onTimer, shared_from_this(), std::placeholders::_1))); - - JLOG(journal_.trace()) << "setTimer: " << stepToString(step) - << " timeout=" << stepTimeout.count() << "s"; + timer_.expires_after(std::chrono::seconds(15)); } - catch (std::exception const& ex) + catch (boost::system::system_error const& e) { - JLOG(journal_.error()) << "setTimer (step " << stepToString(step) << "): " << ex.what(); - close(); + JLOG(journal_.error()) << "setTimer: " << e.code(); return; } + + timer_.async_wait( + boost::asio::bind_executor( + strand_, + std::bind(&ConnectAttempt::onTimer, shared_from_this(), std::placeholders::_1))); } void @@ -296,7 +170,6 @@ ConnectAttempt::cancelTimer() try { timer_.cancel(); - stepTimer_.cancel(); } catch (boost::system::system_error const&) // NOLINT(bugprone-empty-catch) { @@ -321,40 +194,18 @@ ConnectAttempt::onTimer(error_code ec) close(); return; } - - // Determine which timer expired by checking their expiry times - auto const now = std::chrono::steady_clock::now(); - bool const globalExpired = (timer_.expiry() <= now); - bool const stepExpired = (stepTimer_.expiry() <= now); - - if (globalExpired) - { - JLOG(journal_.debug()) << "onTimer: Global timeout; step: " << stepToString(currentStep_); - } - else if (stepExpired) - { - JLOG(journal_.debug()) << "onTimer: Step timeout; step: " << stepToString(currentStep_); - } - else - { - JLOG(journal_.warn()) << "onTimer: Unexpected timer callback"; - } - - close(); + fail("Timeout"); } void ConnectAttempt::onConnect(error_code ec) { - ioPending_ = false; + cancelTimer(); if (ec) { if (ec == boost::asio::error::operation_aborted) - { - tryAsyncShutdown(); return; - } fail("onConnect", ec); return; @@ -371,15 +222,7 @@ ConnectAttempt::onConnect(error_code ec) return; } - if (shutdown_) - { - tryAsyncShutdown(); - return; - } - - ioPending_ = true; - - setTimer(ConnectionStep::TlsHandshake); + setTimer(); stream_.set_verify_mode(boost::asio::ssl::verify_none); stream_.async_handshake( @@ -392,15 +235,14 @@ ConnectAttempt::onConnect(error_code ec) void ConnectAttempt::onHandshake(error_code ec) { - ioPending_ = false; + cancelTimer(); + if (!socket_.is_open()) + return; if (ec) { if (ec == boost::asio::error::operation_aborted) - { - tryAsyncShutdown(); return; - } fail("onHandshake", ec); return; @@ -413,21 +255,18 @@ ConnectAttempt::onHandshake(error_code ec) return; } - setTimer(ConnectionStep::HttpWrite); - - // check if we connected to ourselves if (!overlay_.peerFinder().onConnected( slot_, beast::IPAddressConversion::fromAsio(localEndpoint))) { - fail("Self connection"); + fail("Duplicate connection"); return; } auto const sharedValue = makeSharedValue(*streamPtr_, journal_); if (!sharedValue) { - shutdown(); - return; // makeSharedValue logs + close(); // makeSharedValue logs + return; } req_ = makeRequest( @@ -445,14 +284,7 @@ ConnectAttempt::onHandshake(error_code ec) remoteEndpoint_.address(), app_); - if (shutdown_) - { - tryAsyncShutdown(); - return; - } - - ioPending_ = true; - + setTimer(); boost::beast::http::async_write( stream_, req_, @@ -464,30 +296,20 @@ ConnectAttempt::onHandshake(error_code ec) void ConnectAttempt::onWrite(error_code ec) { - ioPending_ = false; + cancelTimer(); + + if (!socket_.is_open()) + return; if (ec) { if (ec == boost::asio::error::operation_aborted) - { - tryAsyncShutdown(); return; - } fail("onWrite", ec); return; } - if (shutdown_) - { - tryAsyncShutdown(); - return; - } - - ioPending_ = true; - - setTimer(ConnectionStep::HttpRead); - boost::beast::http::async_read( stream_, readBuf_, @@ -501,21 +323,24 @@ void ConnectAttempt::onRead(error_code ec) { cancelTimer(); - ioPending_ = false; - currentStep_ = ConnectionStep::Complete; + + if (!socket_.is_open()) + return; if (ec) { + if (ec == boost::asio::error::operation_aborted) + return; + if (ec == boost::asio::error::eof) { JLOG(journal_.debug()) << "EOF"; - shutdown(); - return; - } - - if (ec == boost::asio::error::operation_aborted) - { - tryAsyncShutdown(); + setTimer(); + stream_.async_shutdown( + boost::asio::bind_executor( + strand_, + std::bind( + &ConnectAttempt::onShutdown, shared_from_this(), std::placeholders::_1))); return; } @@ -523,13 +348,25 @@ ConnectAttempt::onRead(error_code ec) return; } - if (shutdown_) + processResponse(); +} + +void +ConnectAttempt::onShutdown(error_code ec) +{ + cancelTimer(); + if (!ec) { - tryAsyncShutdown(); + close(); return; } - processResponse(); + if (ec != boost::asio::error::eof) + { + fail("onShutdown", ec); + return; + } + close(); } //-------------------------------------------------------------------------- @@ -537,71 +374,47 @@ ConnectAttempt::onRead(error_code ec) void ConnectAttempt::processResponse() { - if (!OverlayImpl::isPeerUpgrade(response_)) + if (response_.result() == boost::beast::http::status::service_unavailable) { - // A peer may respond with service_unavailable and a list of alternative - // peers to connect to, a differing status code is unexpected - if (response_.result() != boost::beast::http::status::service_unavailable) - { - JLOG(journal_.warn()) << "Unable to upgrade to peer protocol: " << response_.result() - << " (" << response_.reason() << ")"; - shutdown(); - return; - } - - // Parse response body to determine if this is a redirect or other - // service unavailable - std::string responseBody; - responseBody.reserve(boost::asio::buffer_size(response_.body().data())); + json::Value json; + json::Reader r; + std::string s; + s.reserve(boost::asio::buffer_size(response_.body().data())); for (auto const buffer : response_.body().data()) { - responseBody.append( - static_cast(buffer.data()), boost::asio::buffer_size(buffer)); + s.append(static_cast(buffer.data()), boost::asio::buffer_size(buffer)); } - - json::Value json; - json::Reader reader; - auto const isValidJson = reader.parse(responseBody, json); - - // Check if this is a redirect response (contains peer-ips field) - auto const isRedirect = isValidJson && json.isObject() && json.isMember("peer-ips"); - - if (!isRedirect) + auto const success = r.parse(s, json); + if (success) { - JLOG(journal_.warn()) << "processResponse: " << remoteEndpoint_ - << " failed to upgrade to peer protocol: " << response_.result() - << " (" << response_.reason() << ")"; - - shutdown(); - return; + if (json.isObject() && json.isMember("peer-ips")) + { + json::Value const& ips = json["peer-ips"]; + if (ips.isArray()) + { + std::vector eps; + eps.reserve(ips.size()); + for (auto const& v : ips) + { + if (v.isString()) + { + error_code ec; + auto const ep = parseEndpoint(v.asString(), ec); + if (!ec) + eps.push_back(ep); + } + } + overlay_.peerFinder().onRedirects(remoteEndpoint_, eps); + } + } } + } - json::Value const& peerIps = json["peer-ips"]; - if (!peerIps.isArray()) - { - fail("processResponse: invalid peer-ips format"); - return; - } - - // Extract and validate peer endpoints - std::vector redirectEndpoints; - redirectEndpoints.reserve(peerIps.size()); - - for (auto const& ipValue : peerIps) - { - if (!ipValue.isString()) - continue; - - error_code ec; - auto const endpoint = parseEndpoint(ipValue.asString(), ec); - if (!ec) - redirectEndpoints.push_back(endpoint); - } - - // Notify PeerFinder about the redirect redirectEndpoints may be empty - overlay_.peerFinder().onRedirects(remoteEndpoint_, redirectEndpoints); - - fail("processResponse: failed to connect to peer: redirected"); + if (!OverlayImpl::isPeerUpgrade(response_)) + { + JLOG(journal_.info()) << "Unable to upgrade to peer protocol: " << response_.result() + << " (" << response_.reason() << ")"; + close(); return; } @@ -625,8 +438,8 @@ ConnectAttempt::processResponse() auto const sharedValue = makeSharedValue(*streamPtr_, journal_); if (!sharedValue) { - shutdown(); - return; // makeSharedValue logs + close(); // makeSharedValue logs + return; } try @@ -641,30 +454,21 @@ ConnectAttempt::processResponse() usage_.setPublicKey(publicKey); - JLOG(journal_.debug()) << "Protocol: " << to_string(*negotiatedProtocol); JLOG(journal_.info()) << "Public Key: " << toBase58(TokenType::NodePublic, publicKey); + JLOG(journal_.debug()) << "Protocol: " << to_string(*negotiatedProtocol); + auto const member = app_.getCluster().member(publicKey); if (member) { JLOG(journal_.info()) << "Cluster name: " << *member; } - auto const result = overlay_.peerFinder().activate(slot_, publicKey, member.has_value()); + auto const result = + overlay_.peerFinder().activate(slot_, publicKey, static_cast(member)); if (result != PeerFinder::Result::Success) { - std::stringstream ss; - ss << "Outbound Connect Attempt " << remoteEndpoint_ << " " << to_string(result); - fail(ss.str()); - return; - } - - if (!socket_.is_open()) - return; - - if (shutdown_) - { - tryAsyncShutdown(); + fail("Outbound " + std::string(to_string(result))); return; } diff --git a/src/xrpld/overlay/detail/ConnectAttempt.h b/src/xrpld/overlay/detail/ConnectAttempt.h index 949e6accbf..aba224d5c7 100644 --- a/src/xrpld/overlay/detail/ConnectAttempt.h +++ b/src/xrpld/overlay/detail/ConnectAttempt.h @@ -1,42 +1,13 @@ #pragma once +#include #include -#include +#include namespace xrpl { -/** - * @class ConnectAttempt - * @brief Manages outbound peer connection attempts with comprehensive timeout - * handling - * - * The ConnectAttempt class handles the complete lifecycle of establishing an - * outbound connection to a peer in the XRPL network. It implements a - * sophisticated dual-timer system that provides both global timeout protection - * and per-step timeout diagnostics. - * - * The connection establishment follows these steps: - * 1. **TCP Connect**: Establish basic network connection - * 2. **TLS Handshake**: Negotiate SSL/TLS encryption - * 3. **HTTP Write**: Send peer handshake request - * 4. **HTTP Read**: Receive and validate peer response - * 5. **Complete**: Connection successfully established - * - * Uses a hybrid timeout approach: - * - **Global Timer**: Hard limit (20s) for entire connection process - * - **Step Timers**: Individual timeouts for each connection phase - * - * - All errors result in connection termination - * - * All operations are serialized using boost::asio::strand to ensure thread - * safety. The class is designed to be used exclusively within the ASIO event - * loop. - * - * @note This class should not be used directly. It is managed by OverlayImpl - * as part of the peer discovery and connection management system. - * - */ +/** Manages an outbound connection attempt. */ class ConnectAttempt : public OverlayImpl::Child, public std::enable_shared_from_this { @@ -45,95 +16,29 @@ private: using endpoint_type = boost::asio::ip::tcp::endpoint; using request_type = boost::beast::http::request; using response_type = boost::beast::http::response; + using socket_type = boost::asio::ip::tcp::socket; using middle_type = boost::beast::tcp_stream; using stream_type = boost::beast::ssl_stream; using shared_context = std::shared_ptr; - /** - * @enum ConnectionStep - * @brief Represents the current phase of the connection establishment - * process - * - * Used for tracking progress and providing detailed timeout diagnostics. - * Each step has its own timeout value defined in StepTimeouts. - */ - enum class ConnectionStep { - Init, // Initial state, nothing started - TcpConnect, // Establishing TCP connection to remote peer - TlsHandshake, // Performing SSL/TLS handshake - HttpWrite, // Sending HTTP upgrade request - HttpRead, // Reading HTTP upgrade response - Complete, // Connection successfully established - ShutdownStarted // Connection shutdown has started - }; - - // A timeout for connection process, greater than all step timeouts - static constexpr std::chrono::seconds kConnectTimeout{25}; - - /** - * @struct StepTimeouts - * @brief Defines timeout values for each connection step - * - * These timeouts are designed to detect slow individual phases while - * allowing the global timeout to enforce the overall time limit. - */ - struct StepTimeouts - { - // TCP connection timeout - static constexpr std::chrono::seconds kTcpConnect{8}; - // SSL handshake timeout - static constexpr std::chrono::seconds kTlsHandshake{8}; - // HTTP write timeout - static constexpr std::chrono::seconds kHttpWrite{3}; - // HTTP read timeout - static constexpr std::chrono::seconds kHttpRead{3}; - // SSL shutdown timeout - static constexpr std::chrono::seconds kTlsShutdown{2}; - }; - - // Core application and networking components Application& app_; - Peer::id_t const id_; + std::uint32_t const id_; beast::WrappedSink sink_; beast::Journal const journal_; endpoint_type remoteEndpoint_; Resource::Consumer usage_; - boost::asio::strand strand_; boost::asio::basic_waitable_timer timer_; - boost::asio::basic_waitable_timer stepTimer_; - - std::unique_ptr streamPtr_; // SSL stream (owned) + std::unique_ptr streamPtr_; socket_type& socket_; stream_type& stream_; boost::beast::multi_buffer readBuf_; - response_type response_; std::shared_ptr slot_; request_type req_; - bool shutdown_ = false; // Shutdown has been initiated - bool ioPending_ = false; // Async I/O operation in progress - ConnectionStep currentStep_ = ConnectionStep::Init; - public: - /** - * @brief Construct a new ConnectAttempt object - * - * @param app Application context providing configuration and services - * @param ioContext ASIO I/O context for async operations - * @param remoteEndpoint Target peer endpoint to connect to - * @param usage Resource usage tracker for rate limiting - * @param context Shared SSL context for encryption - * @param id Unique peer identifier for this connection attempt - * @param slot PeerFinder slot representing this connection - * @param journal Logging interface for diagnostics - * @param overlay Parent overlay manager - * - * @note The constructor only initializes the object. Call run() to begin - * the actual connection attempt. - */ ConnectAttempt( Application& app, boost::asio::io_context& ioContext, @@ -147,111 +52,38 @@ public: ~ConnectAttempt() override; - /** - * @brief Stop the connection attempt - * - * This method is thread-safe and can be called from any thread. - */ void stop() override; - /** - * @brief Begin the connection attempt - * - * This method is thread-safe and posts to the strand if needed. - */ void run(); private: - /** - * @brief Set timers for the specified connection step - * - * @param step The connection step to set timers for - * - * Sets both the step-specific timer and the global timer (if not already - * set). - */ void - setTimer(ConnectionStep step); - - /** - * @brief Cancel both global and step timers - * - * Used during cleanup and when connection completes successfully. - * Exceptions from timer cancellation are safely ignored. - */ + close(); + void + fail(std::string const& reason); + void + fail(std::string const& name, error_code ec); + void + setTimer(); void cancelTimer(); - - /** - * @brief Handle timer expiration events - * - * @param ec Error code from timer operation - * - * Determines which timer expired (global vs step) and logs appropriate - * diagnostic information before terminating the connection. - */ void onTimer(error_code ec); - - // Connection phase handlers void - onConnect(error_code ec); // TCP connection completion handler + onConnect(error_code ec); void - onHandshake(error_code ec); // TLS handshake completion handler + onHandshake(error_code ec); void - onWrite(error_code ec); // HTTP write completion handler + onWrite(error_code ec); void - onRead(error_code ec); // HTTP read completion handler - - // Error and cleanup handlers + onRead(error_code ec); void - fail(std::string const& reason); // Fail with custom reason - void - fail(std::string const& name, error_code ec); // Fail with system error - void - shutdown(); // Initiate graceful shutdown - void - tryAsyncShutdown(); // Attempt async SSL shutdown - void - onShutdown(error_code ec); // SSL shutdown completion handler - void - close(); // Force close socket - - /** - * @brief Process the HTTP upgrade response from peer - * - * Validates the peer's response, extracts protocol information, - * verifies handshake, and either creates a PeerImp or handles - * redirect responses. - */ + onShutdown(error_code ec); void processResponse(); - static std::string - stepToString(ConnectionStep step) - { - switch (step) - { - case ConnectionStep::Init: - return "Init"; - case ConnectionStep::TcpConnect: - return "TcpConnect"; - case ConnectionStep::TlsHandshake: - return "TlsHandshake"; - case ConnectionStep::HttpWrite: - return "HttpWrite"; - case ConnectionStep::HttpRead: - return "HttpRead"; - case ConnectionStep::Complete: - return "Complete"; - case ConnectionStep::ShutdownStarted: - return "ShutdownStarted"; - } - return "Unknown"; - }; - template static boost::asio::ip::tcp::endpoint parseEndpoint(std::string const& s, boost::system::error_code& ec) diff --git a/src/xrpld/overlay/detail/PeerImp.cpp b/src/xrpld/overlay/detail/PeerImp.cpp index 23e1785f6b..325f8ba038 100644 --- a/src/xrpld/overlay/detail/PeerImp.cpp +++ b/src/xrpld/overlay/detail/PeerImp.cpp @@ -74,7 +74,7 @@ #include #include #include -#include +#include #include @@ -110,9 +110,6 @@ constexpr std::chrono::milliseconds kPeerHighLatency{300}; /** How often we PING the peer to check for latency and sendq probe */ constexpr std::chrono::seconds kPeerTimerInterval{60}; -/** The timeout for a shutdown timer */ -constexpr std::chrono::seconds kShutdownTimerInterval{5}; - } // namespace // TODO: Remove this exclusion once unit tests are added after the hotfix @@ -270,13 +267,7 @@ PeerImp::stop() if (!socket_.is_open()) return; - // The rationale for using different severity levels is that - // outbound connections are under our control and may be logged - // at a higher level, but inbound connections are more numerous and - // uncontrolled so to prevent log flooding the severity is reduced. - JLOG(journal_.debug()) << "stop: Stop"; - - shutdown(); + close(); } //------------------------------------------------------------------------------ @@ -289,17 +280,13 @@ PeerImp::send(std::shared_ptr const& m) post(strand_, std::bind(&PeerImp::send, shared_from_this(), m)); return; } - + if (gracefulClose_) + return; + if (detaching_) + return; if (!socket_.is_open()) return; - // we are in progress of closing the connection - if (shutdown_) - { - tryAsyncShutdown(); - return; - } - auto validator = m->getValidatorKey(); if (validator && !squelch_.expireSquelch(*validator)) { @@ -338,7 +325,6 @@ PeerImp::send(std::shared_ptr const& m) if (sendqSize != 0) return; - writePending_ = true; boost::asio::async_write( stream_, boost::asio::buffer(sendQueue_.front()->getBuffer(compressionEnabled_)), @@ -621,16 +607,20 @@ PeerImp::hasRange(std::uint32_t uMin, std::uint32_t uMax) //------------------------------------------------------------------------------ void -PeerImp::fail(std::string const& name, error_code ec) +PeerImp::close() { - XRPL_ASSERT(strand_.running_in_this_thread(), "xrpl::PeerImp::fail : strand in this thread"); - + XRPL_ASSERT(strand_.running_in_this_thread(), "xrpl::PeerImp::close : strand in this thread"); if (!socket_.is_open()) return; - JLOG(journal_.warn()) << name << ": " << ec.message(); + detaching_ = true; // DEPRECATED - shutdown(); + cancelTimer(); + error_code ec; + socket_.close(ec); // NOLINT(bugprone-unused-return-value) + + overlay_.incPeerDisconnect(); + JLOG((inbound_ ? journal_.debug() : journal_.info())) << "close: Closed"; } void @@ -644,123 +634,71 @@ PeerImp::fail(std::string const& reason) (void (Peer::*)(std::string const&))&PeerImp::fail, shared_from_this(), reason)); return; } - - if (!socket_.is_open()) - return; - - // Call to name() locks, log only if the message will be outputted - if (journal_.active(beast::Severity::Warning)) + if (journal_.active(beast::Severity::Warning) && socket_.is_open()) { std::string const n = name(); JLOG(journal_.warn()) << n << " failed: " << reason; } - - shutdown(); + close(); } void -PeerImp::tryAsyncShutdown() +PeerImp::fail(std::string const& name, error_code ec) { - XRPL_ASSERT( - strand_.running_in_this_thread(), - "xrpl::PeerImp::tryAsyncShutdown : strand in this thread"); - - if (!shutdown_ || shutdownStarted_) + XRPL_ASSERT(strand_.running_in_this_thread(), "xrpl::PeerImp::fail : strand in this thread"); + if (!socket_.is_open()) return; - if (readPending_ || writePending_) - return; - - shutdownStarted_ = true; - - setTimer(kShutdownTimerInterval); - - // gracefully shutdown the SSL socket, performing a shutdown handshake - stream_.async_shutdown(bind_executor( - strand_, std::bind(&PeerImp::onShutdown, shared_from_this(), std::placeholders::_1))); -} - -void -PeerImp::shutdown() -{ - XRPL_ASSERT(strand_.running_in_this_thread(), "xrpl::PeerImp::shutdown: strand in this thread"); - - if (!socket_.is_open() || shutdown_) - return; - - shutdown_ = true; - - boost::beast::get_lowest_layer(stream_).cancel(); - - tryAsyncShutdown(); -} - -void -PeerImp::onShutdown(error_code ec) -{ - cancelTimer(); - if (ec) - { - // - eof: the stream was cleanly closed - // - operation_aborted: an expired timer (slow shutdown) - // - stream_truncated: the tcp connection closed (no handshake) it could - // occur if a peer does not perform a graceful disconnect - // - broken_pipe: the peer is gone - bool const shouldLog = - (ec != boost::asio::error::eof && ec != boost::asio::error::operation_aborted && - ec.message().find("application data after close notify") == std::string::npos); - - if (shouldLog) - { - JLOG(journal_.debug()) << "onShutdown: " << ec.message(); - } - } + JLOG(journal_.warn()) << name << ": " << ec.message(); close(); } void -PeerImp::close() +PeerImp::gracefulClose() { - XRPL_ASSERT(strand_.running_in_this_thread(), "xrpl::PeerImp::close : strand in this thread"); - - if (!socket_.is_open()) + XRPL_ASSERT( + strand_.running_in_this_thread(), "xrpl::PeerImp::gracefulClose : strand in this thread"); + XRPL_ASSERT(socket_.is_open(), "xrpl::PeerImp::gracefulClose : socket is open"); + XRPL_ASSERT(!gracefulClose_, "xrpl::PeerImp::gracefulClose : socket is not closing"); + gracefulClose_ = true; + if (!sendQueue_.empty()) return; - - cancelTimer(); - - error_code ec; - socket_.close(ec); // NOLINT(bugprone-unused-return-value) - - overlay_.incPeerDisconnect(); - - // The rationale for using different severity levels is that - // outbound connections are under our control and may be logged - // at a higher level, but inbound connections are more numerous and - // uncontrolled so to prevent log flooding the severity is reduced. - JLOG((inbound_ ? journal_.debug() : journal_.info())) << "close: Closed"; + setTimer(); + stream_.async_shutdown(bind_executor( + strand_, std::bind(&PeerImp::onShutdown, shared_from_this(), std::placeholders::_1))); } -//------------------------------------------------------------------------------ - void -PeerImp::setTimer(std::chrono::seconds interval) +PeerImp::setTimer() { try { - timer_.expires_after(interval); + timer_.expires_after(kPeerTimerInterval); } - catch (std::exception const& ex) + catch (boost::system::system_error const& e) { - JLOG(journal_.error()) << "setTimer: " << ex.what(); - shutdown(); + JLOG(journal_.error()) << "setTimer: " << e.code(); return; } - timer_.async_wait(bind_executor( strand_, std::bind(&PeerImp::onTimer, shared_from_this(), std::placeholders::_1))); } +// convenience for ignoring the error code +void +PeerImp::cancelTimer() noexcept +{ + try + { + timer_.cancel(); + } + catch (boost::system::system_error const&) // NOLINT(bugprone-empty-catch) + { + // ignored + } +} + //------------------------------------------------------------------------------ std::string @@ -774,14 +712,11 @@ PeerImp::makePrefix(std::string const& fingerprint) void PeerImp::onTimer(error_code const& ec) { - XRPL_ASSERT(strand_.running_in_this_thread(), "xrpl::PeerImp::onTimer : strand in this thread"); - if (!socket_.is_open()) return; if (ec) { - // do not initiate shutdown, timers are frequently cancelled if (ec == boost::asio::error::operation_aborted) return; @@ -791,15 +726,6 @@ PeerImp::onTimer(error_code const& ec) return; } - // the timer expired before the shutdown completed - // force close the connection - if (shutdown_) - { - JLOG(journal_.debug()) << "onTimer: shutdown timer expired"; - close(); - return; - } - if (largeSendq_++ >= Tuning::kSendqIntervals) { fail("Large send queue"); @@ -840,20 +766,32 @@ PeerImp::onTimer(error_code const& ec) send(std::make_shared(message, protocol::mtPING)); - setTimer(kPeerTimerInterval); + setTimer(); } void -PeerImp::cancelTimer() noexcept +PeerImp::onShutdown(error_code ec) { - try + cancelTimer(); + + if (ec) { - timer_.cancel(); - } - catch (std::exception const& ex) - { - JLOG(journal_.error()) << "cancelTimer: " << ex.what(); + // - eof: the stream was cleanly closed + // - operation_aborted: an expired timer (slow shutdown) + // - stream_truncated: the tcp connection closed (no handshake) it could + // occur if a peer does not perform a graceful disconnect + // - broken_pipe: the peer is gone + bool const shouldLog = + (ec != boost::asio::error::eof && ec != boost::asio::error::operation_aborted && + ec.message().find("application data after close notify") == std::string::npos); + + if (shouldLog) + { + JLOG(journal_.debug()) << "onShutdown: " << ec.message(); + } } + + close(); } //------------------------------------------------------------------------------ @@ -862,15 +800,6 @@ PeerImp::doAccept() { XRPL_ASSERT(readBuffer_.size() == 0, "xrpl::PeerImp::doAccept : empty read buffer"); - JLOG(journal_.debug()) << "doAccept"; - - // a shutdown was initiated before the handshake, there is nothing to do - if (shutdown_) - { - tryAsyncShutdown(); - return; - } - auto const sharedValue = makeSharedValue(*streamPtr_, journal_); // This shouldn't fail since we already computed @@ -881,7 +810,7 @@ PeerImp::doAccept() return; } - JLOG(journal_.debug()) << "Protocol: " << to_string(protocol_); + JLOG(journal_.info()) << "Protocol: " << to_string(protocol_); if (auto member = app_.getCluster().member(publicKey_)) { @@ -921,16 +850,15 @@ PeerImp::doAccept() error_code ec, std::size_t bytesTransferred) { if (!socket_.is_open()) return; - if (ec == boost::asio::error::operation_aborted) - { - tryAsyncShutdown(); - return; - } if (ec) { + if (ec == boost::asio::error::operation_aborted) + return; + fail("onWriteResponse", ec); return; } + if (writeBuffer->size() == bytesTransferred) { doProtocolStart(); @@ -961,13 +889,6 @@ PeerImp::domain() const void PeerImp::doProtocolStart() { - // a shutdown was initiated before the handshare, there is nothing to do - if (shutdown_) - { - tryAsyncShutdown(); - return; - } - onReadMessage(error_code(), 0); // Send all the validator lists that have been loaded @@ -999,45 +920,31 @@ PeerImp::doProtocolStart() if (auto m = overlay_.getManifestsMessage()) send(m); - setTimer(kPeerTimerInterval); + setTimer(); } // Called repeatedly with protocol message data void PeerImp::onReadMessage(error_code ec, std::size_t bytesTransferred) { - XRPL_ASSERT( - strand_.running_in_this_thread(), "xrpl::PeerImp::onReadMessage : strand in this thread"); - - readPending_ = false; - if (!socket_.is_open()) return; if (ec) { + if (ec == boost::asio::error::operation_aborted) + return; + if (ec == boost::asio::error::eof) { - JLOG(journal_.debug()) << "EOF"; - shutdown(); - return; - } - - if (ec == boost::asio::error::operation_aborted) - { - tryAsyncShutdown(); + JLOG(journal_.info()) << "EOF"; + gracefulClose(); return; } fail("onReadMessage", ec); return; } - // we started shutdown, no reason to process further data - if (shutdown_) - { - tryAsyncShutdown(); - return; - } if (auto stream = journal_.trace()) { @@ -1062,34 +969,23 @@ PeerImp::onReadMessage(error_code ec, std::size_t bytesTransferred) 350ms, journal_); - if (!socket_.is_open()) - return; - - // the error_code is produced by invokeProtocolMessage - // it could be due to a bad message if (ec) { fail("onReadMessage", ec); return; } + if (!socket_.is_open()) + return; + + if (gracefulClose_) + return; + if (bytesConsumed == 0) break; - readBuffer_.consume(bytesConsumed); } - // check if a shutdown was initiated while processing messages - if (shutdown_) - { - tryAsyncShutdown(); - return; - } - - readPending_ = true; - - XRPL_ASSERT(!shutdownStarted_, "xrpl::PeerImp::onReadMessage : shutdown started"); - // Timeout on writes only stream_.async_read_some( readBuffer_.prepare(std::max(Tuning::kReadBufferBytes, hint)), @@ -1105,26 +1001,17 @@ PeerImp::onReadMessage(error_code ec, std::size_t bytesTransferred) void PeerImp::onWriteMessage(error_code ec, std::size_t bytesTransferred) { - XRPL_ASSERT( - strand_.running_in_this_thread(), "xrpl::PeerImp::onWriteMessage : strand in this thread"); - - writePending_ = false; - if (!socket_.is_open()) return; if (ec) { if (ec == boost::asio::error::operation_aborted) - { - tryAsyncShutdown(); return; - } fail("onWriteMessage", ec); return; } - if (auto stream = journal_.trace()) { stream << "onWriteMessage: " @@ -1135,18 +1022,8 @@ PeerImp::onWriteMessage(error_code ec, std::size_t bytesTransferred) XRPL_ASSERT(!sendQueue_.empty(), "xrpl::PeerImp::onWriteMessage : non-empty send buffer"); sendQueue_.pop(); - - if (shutdown_) - { - tryAsyncShutdown(); - return; - } - if (!sendQueue_.empty()) { - writePending_ = true; - XRPL_ASSERT(!shutdownStarted_, "xrpl::PeerImp::onWriteMessage : shutdown started"); - // Timeout on writes only boost::asio::async_write( stream_, @@ -1160,6 +1037,13 @@ PeerImp::onWriteMessage(error_code ec, std::size_t bytesTransferred) std::placeholders::_2))); return; } + + if (gracefulClose_) + { + stream_.async_shutdown(bind_executor( + strand_, std::bind(&PeerImp::onShutdown, shared_from_this(), std::placeholders::_1))); + return; + } } //------------------------------------------------------------------------------ diff --git a/src/xrpld/overlay/detail/PeerImp.h b/src/xrpld/overlay/detail/PeerImp.h index dca70d88bd..f5d87371be 100644 --- a/src/xrpld/overlay/detail/PeerImp.h +++ b/src/xrpld/overlay/detail/PeerImp.h @@ -30,68 +30,6 @@ namespace xrpl { struct ValidatorBlobInfo; class SHAMap; -/** - * @class PeerImp - * @brief This class manages established peer-to-peer connections, handles - message exchange, monitors connection health, and graceful shutdown. - * - - * The PeerImp shutdown mechanism is a multi-stage process - * designed to ensure graceful connection termination while handling ongoing - * I/O operations safely. The shutdown can be initiated from multiple points - * and follows a deterministic state machine. - * - * The shutdown process can be triggered from several entry points: - * - **External requests**: `stop()` method called by overlay management - * - **Error conditions**: `fail(error_code)` or `fail(string)` on protocol - * violations - * - **Timer expiration**: Various timeout scenarios (ping timeout, large send - * queue) - * - **Connection health**: Peer tracking divergence or unknown state timeouts - * - * The shutdown follows this progression: - * - * Normal Operation → shutdown() → tryAsyncShutdown() → onShutdown() → close() - * ↓ ↓ ↓ ↓ - * Set shutdown_ SSL graceful Timer cancel Socket close - * Cancel timer shutdown start & cleanup & metrics - * 5s safety timer Set shutdownStarted_ update - * - * Two primary flags coordinate the shutdown process: - * - `shutdown_`: Set when shutdown is requested - * - `shutdownStarted_`: Set when SSL shutdown begins - * - * The shutdown mechanism carefully coordinates with ongoing read/write - * operations: - * - * **Read Operations (`onReadMessage`)**: - * - Checks `shutdown_` flag after processing each message batch - * - If shutdown initiated during processing, calls `tryAsyncShutdown()` - * - * **Write Operations (`onWriteMessage`)**: - * - Checks `shutdown_` flag before queuing new writes - * - Calls `tryAsyncShutdown()` when shutdown flag detected - * - * Multiple timers require coordination during shutdown: - * 1. **Peer Timer**: Regular ping/pong timer cancelled immediately in - * `shutdown()` - * 2. **Shutdown Timer**: 5-second safety timer ensures shutdown completion - * 3. **Operation Cancellation**: All pending async operations are cancelled - * - * The shutdown implements fallback mechanisms: - * - **Graceful Path**: SSL shutdown → Socket close → Cleanup - * - **Forced Path**: If SSL shutdown fails or times out, proceeds to socket - * close - * - **Safety Timer**: 5-second timeout prevents hanging shutdowns - * - * All shutdown operations are serialized through the boost::asio::strand to - * ensure thread safety. The strand guarantees that shutdown state changes - * and I/O operation callbacks are executed sequentially. - * - * @note This class requires careful coordination between async operations, - * timer management, and shutdown procedures to ensure no resource leaks - * or hanging connections in high-throughput networking scenarios. - */ class PeerImp : public Peer, public std::enable_shared_from_this, public OverlayImpl::Child { public: @@ -121,8 +59,6 @@ private: socket_type& socket_; stream_type& stream_; boost::asio::strand strand_; - - // Multi-purpose timer for peer activity monitoring and shutdown safety waitable_timer timer_; // Updated at each stage of the connection process to reflect @@ -139,6 +75,7 @@ private: std::atomic tracking_; clock_type::time_point trackingTime_; + bool detaching_ = false; // Node public key of peer. PublicKey const publicKey_; std::string name_; @@ -216,19 +153,7 @@ private: http_response_type response_; boost::beast::http::fields const& headers_; std::queue> sendQueue_; - - // Primary shutdown flag set when shutdown is requested - bool shutdown_ = false; - - // SSL shutdown coordination flag - bool shutdownStarted_ = false; - - // Indicates a read operation is currently pending - bool readPending_ = false; - - // Indicates a write operation is currently pending - bool writePending_ = false; - + bool gracefulClose_ = false; int largeSendq_ = 0; std::unique_ptr loadEvent_; // The highest sequence of each PublisherList that has @@ -476,6 +401,9 @@ public: bool isHighLatency() const override; + void + fail(std::string const& reason); + bool compressionEnabled() const override { @@ -489,129 +417,32 @@ public: } private: - /** - * @brief Handles a failure associated with a specific error code. - * - * This function is called when an operation fails with an error code. It - * logs the warning message and gracefully shutdowns the connection. - * - * The function will do nothing if the connection is already closed or if a - * shutdown is already in progress. - * - * @param name The name of the operation that failed (e.g., "read", - * "write"). - * @param ec The error code associated with the failure. - * @note This function must be called from within the object's strand. - */ - void - fail(std::string const& name, error_code ec); - - /** - * @brief Handles a failure described by a reason string. - * - * This overload is used for logical errors or protocol violations not - * associated with a specific error code. It logs a warning with the - * given reason, then initiates a graceful shutdown. - * - * The function will do nothing if the connection is already closed or if a - * shutdown is already in progress. - * - * @param reason A descriptive string explaining the reason for the failure. - * @note This function must be called from within the object's strand. - */ - void - fail(std::string const& reason); - - /** @brief Initiates the peer disconnection sequence. - * - * This is the primary entry point to start closing a peer connection. It - * marks the peer for shutdown and cancels any outstanding asynchronous - * operations. This cancellation allows the graceful shutdown to proceed - * once the handlers for the cancelled operations have completed. - * - * @note This method must be called on the peer's strand. - */ - void - shutdown(); - - /** @brief Attempts to perform a graceful SSL shutdown if conditions are - * met. - * - * This helper function checks if the peer is in a state where a graceful - * SSL shutdown can be performed (i.e., shutdown has been requested and no - * I/O operations are currently in progress). - * - * @note This method must be called on the peer's strand. - */ - void - tryAsyncShutdown(); - - /** - * @brief Handles the completion of the asynchronous SSL shutdown. - * - * This function is the callback for the `async_shutdown` operation started - * in `shutdown()`. Its first action is to cancel the timer. It - * then inspects the error code to determine the outcome. - * - * Regardless of the result, this function proceeds to call `close()` to - * ensure the underlying socket is fully closed. - * - * @param ec The error code resulting from the `async_shutdown` operation. - */ - void - onShutdown(error_code ec); - - /** - * @brief Forcibly closes the underlying socket connection. - * - * This function provides the final, non-graceful shutdown of the peer - * connection. It ensures any pending timers are cancelled and then - * immediately closes the TCP socket, bypassing the SSL shutdown handshake. - * - * After closing, it notifies the overlay manager of the disconnection. - * - * @note This function must be called from within the object's strand. - */ void close(); - /** - * @brief Sets and starts the peer timer. - * - * This function starts timer, which is used to detect inactivity - * and prevent stalled connections. It sets the timer to expire after the - * predefined `peerTimerInterval`. - * - * @note This function will terminate the connection in case of any errors. - */ void - setTimer(std::chrono::seconds interval); + fail(std::string const& name, error_code ec); - /** - * @brief Handles the expiration of the peer activity timer. - * - * This callback is invoked when the timer set by `setTimer` expires. It - * watches the peer connection, checking for various timeout and health - * conditions. - * - * @param ec The error code associated with the timer's expiration. - * `operation_aborted` is expected if the timer was cancelled. - */ void - onTimer(error_code const& ec); + gracefulClose(); + + void + setTimer(); - /** - * @brief Cancels any pending wait on the peer activity timer. - * - * This function is called to stop the timer. It gracefully manages any - * errors that might occur during the cancellation process. - */ void cancelTimer() noexcept; static std::string makePrefix(std::string const& fingerprint); + // Called when the timer wait completes + void + onTimer(boost::system::error_code const& ec); + + // Called when SSL shutdown completes + void + onShutdown(error_code ec); + void doAccept();