refactor: Load balancer forwarding metrics (#2103)

This commit is contained in:
Ayaz Salikhov
2025-05-14 11:43:31 +01:00
committed by GitHub
parent a369381594
commit 243e174f1e
3 changed files with 111 additions and 37 deletions

View File

@@ -61,14 +61,10 @@
#include <vector> #include <vector>
using namespace util::config; using namespace util::config;
using namespace util::prometheus; using util::prometheus::Labels;
namespace etl { namespace etl {
namespace {
std::vector<std::int64_t> const kHISTOGRAM_BUCKETS{1, 2, 5, 10, 20, 50, 100, 200, 500, 700, 1000};
} // namespace
std::shared_ptr<etlng::LoadBalancerInterface> std::shared_ptr<etlng::LoadBalancerInterface>
LoadBalancer::makeLoadBalancer( LoadBalancer::makeLoadBalancer(
ClioConfigDefinition const& config, ClioConfigDefinition const& config,
@@ -92,27 +88,33 @@ LoadBalancer::LoadBalancer(
std::shared_ptr<NetworkValidatedLedgersInterface> validatedLedgers, std::shared_ptr<NetworkValidatedLedgersInterface> validatedLedgers,
SourceFactory sourceFactory SourceFactory sourceFactory
) )
: forwardedDurationHistogram_(PrometheusService::histogramInt( : forwardingCounters_{
"lb_forwarded_duration_milliseconds_histogram", .successDuration = PrometheusService::counterInt(
Labels(), "forwarding_duration_milliseconds_counter",
kHISTOGRAM_BUCKETS, Labels({util::prometheus::Label{"status", "success"}}),
"The duration of processing forwarded requests" "The duration of processing successful forwarded requests"
)) ),
, forwardedRetryCounter_(PrometheusService::counterInt( .failDuration = PrometheusService::counterInt(
"lb_forwarded_retry_counter", "forwarding_duration_milliseconds_counter",
Labels(), Labels({util::prometheus::Label{"status", "fail"}}),
"The number of retries before a forwarded request was successful. Initial attempt excluded" "The duration of processing failed forwarded requests"
)) ),
, cacheTriedCounter_(PrometheusService::counterInt( .retries = PrometheusService::counterInt(
"lb_cache_tried_counter", "forwarding_retries_counter",
Labels(), Labels(),
"The number of requests that we tried to serve from the cache" "The number of retries before a forwarded request was successful. Initial attempt excluded"
)) ),
, cacheMissCounter_(PrometheusService::counterInt( .cacheHit = PrometheusService::counterInt(
"lb_cache_miss_counter", "forwarding_cache_hit_counter",
Labels(), Labels(),
"The number of requests that were not served from the cache" "The number of requests that we served from the cache"
)) ),
.cacheMiss = PrometheusService::counterInt(
"forwarding_cache_miss_counter",
Labels(),
"The number of requests that were not served from the cache"
)
}
{ {
auto const forwardingCacheTimeout = config.get<float>("forwarding.cache_timeout"); auto const forwardingCacheTimeout = config.get<float>("forwarding.cache_timeout");
if (forwardingCacheTimeout > 0.f) { if (forwardingCacheTimeout > 0.f) {
@@ -266,11 +268,11 @@ LoadBalancer::forwardToRippled(
auto const cmd = boost::json::value_to<std::string>(request.at("command")); auto const cmd = boost::json::value_to<std::string>(request.at("command"));
if (forwardingCache_ and forwardingCache_->shouldCache(cmd)) { if (forwardingCache_ and forwardingCache_->shouldCache(cmd)) {
++cacheTriedCounter_.get(); bool servedFromCache = true;
auto updater = auto updater =
[this, &request, &clientIp, isAdmin](boost::asio::yield_context yield [this, &request, &clientIp, &servedFromCache, isAdmin](boost::asio::yield_context yield
) -> std::expected<util::ResponseExpirationCache::EntryData, util::ResponseExpirationCache::Error> { ) -> std::expected<util::ResponseExpirationCache::EntryData, util::ResponseExpirationCache::Error> {
servedFromCache = false;
auto result = forwardToRippledImpl(request, clientIp, isAdmin, yield); auto result = forwardToRippledImpl(request, clientIp, isAdmin, yield);
if (result.has_value()) { if (result.has_value()) {
return util::ResponseExpirationCache::EntryData{ return util::ResponseExpirationCache::EntryData{
@@ -288,6 +290,9 @@ LoadBalancer::forwardToRippled(
std::move(updater), std::move(updater),
[](util::ResponseExpirationCache::EntryData const& entry) { return not entry.response.contains("error"); } [](util::ResponseExpirationCache::EntryData const& entry) { return not entry.response.contains("error"); }
); );
if (servedFromCache) {
++forwardingCounters_.cacheHit.get();
}
if (result.has_value()) { if (result.has_value()) {
return std::move(result).value(); return std::move(result).value();
} }
@@ -395,7 +400,7 @@ LoadBalancer::forwardToRippledImpl(
boost::asio::yield_context yield boost::asio::yield_context yield
) )
{ {
++cacheMissCounter_.get(); ++forwardingCounters_.cacheMiss.get();
ASSERT(not sources_.empty(), "ETL sources must be configured to forward requests."); ASSERT(not sources_.empty(), "ETL sources must be configured to forward requests.");
std::size_t sourceIdx = util::Random::uniform(0ul, sources_.size() - 1); std::size_t sourceIdx = util::Random::uniform(0ul, sources_.size() - 1);
@@ -409,13 +414,14 @@ LoadBalancer::forwardToRippledImpl(
while (numAttempts < sources_.size()) { while (numAttempts < sources_.size()) {
auto [res, duration] = auto [res, duration] =
util::timed([&]() { return sources_[sourceIdx]->forwardToRippled(request, clientIp, xUserValue, yield); }); util::timed([&]() { return sources_[sourceIdx]->forwardToRippled(request, clientIp, xUserValue, yield); });
forwardedDurationHistogram_.get().observe(duration);
if (res) { if (res) {
forwardingCounters_.successDuration.get() += duration;
response = std::move(res).value(); response = std::move(res).value();
break; break;
} }
++forwardedRetryCounter_.get(); forwardingCounters_.failDuration.get() += duration;
++forwardingCounters_.retries.get();
error = std::max(error, res.error()); // Choose the best result between all sources error = std::max(error, res.error()); // Choose the best result between all sources
sourceIdx = (sourceIdx + 1) % sources_.size(); sourceIdx = (sourceIdx + 1) % sources_.size();

View File

@@ -33,7 +33,6 @@
#include "util/log/Logger.hpp" #include "util/log/Logger.hpp"
#include "util/newconfig/ConfigDefinition.hpp" #include "util/newconfig/ConfigDefinition.hpp"
#include "util/prometheus/Counter.hpp" #include "util/prometheus/Counter.hpp"
#include "util/prometheus/Histogram.hpp"
#include <boost/asio.hpp> #include <boost/asio.hpp>
#include <boost/asio/io_context.hpp> #include <boost/asio/io_context.hpp>
@@ -94,10 +93,13 @@ private:
std::uint32_t downloadRanges_ = std::uint32_t downloadRanges_ =
kDEFAULT_DOWNLOAD_RANGES; /*< The number of markers to use when downloading initial ledger */ kDEFAULT_DOWNLOAD_RANGES; /*< The number of markers to use when downloading initial ledger */
std::reference_wrapper<util::prometheus::HistogramInt> forwardedDurationHistogram_; struct ForwardingCounters {
std::reference_wrapper<util::prometheus::CounterInt> forwardedRetryCounter_; std::reference_wrapper<util::prometheus::CounterInt> successDuration;
std::reference_wrapper<util::prometheus::CounterInt> cacheTriedCounter_; std::reference_wrapper<util::prometheus::CounterInt> failDuration;
std::reference_wrapper<util::prometheus::CounterInt> cacheMissCounter_; std::reference_wrapper<util::prometheus::CounterInt> retries;
std::reference_wrapper<util::prometheus::CounterInt> cacheHit;
std::reference_wrapper<util::prometheus::CounterInt> cacheMiss;
} forwardingCounters_;
// Using mutex instead of atomic_bool because choosing a new source to // Using mutex instead of atomic_bool because choosing a new source to
// forward messages should be done with a mutual exclusion otherwise there will be a race condition // forward messages should be done with a mutual exclusion otherwise there will be a race condition

View File

@@ -59,6 +59,7 @@
using namespace etl; using namespace etl;
using namespace util::config; using namespace util::config;
using testing::Return; using testing::Return;
using namespace util::prometheus;
constexpr static auto const kTWO_SOURCES_LEDGER_RESPONSE = R"({ constexpr static auto const kTWO_SOURCES_LEDGER_RESPONSE = R"({
"etl_sources": [ "etl_sources": [
@@ -641,6 +642,71 @@ TEST_F(LoadBalancerForwardToRippledTests, source0Fails)
}); });
} }
// Fixture for the forwarding-metrics tests: reuses the LoadBalancerForwardToRippledTests setup and
// additionally mixes in WithMockPrometheus. The tests using this fixture obtain mock counters through
// makeMock<CounterInt>(name, labels) — presumably supplied by WithMockPrometheus; confirm against its header.
struct LoadBalancerForwardToRippledPrometheusTests : LoadBalancerForwardToRippledTests, WithMockPrometheus {};
// Checks the forwarding counters when the forwarding cache is enabled: the same "server_info" request is
// sent four times, and the test expects exactly one cache miss, three cache hits, one success-duration
// update and a single call to the underlying source.
TEST_F(LoadBalancerForwardToRippledPrometheusTests, forwardingCacheEnabled)
{
// Set forwarding.cache_timeout to 10 before the load balancer is built from configJson_.
configJson_.as_object()["forwarding"] = boost::json::object{{"cache_timeout", 10.}};
EXPECT_CALL(sourceFactory_, makeSource).Times(2);
auto loadBalancer = makeLoadBalancer();
// Local request (distinct from the fixture's request_) using the "server_info" command.
auto const request = boost::json::object{{"command", "server_info"}};
// Look up the mock counters by metric name and label string; the duration counter is the
// status="success" series of "forwarding_duration_milliseconds_counter".
auto& cacheHitCounter = makeMock<CounterInt>("forwarding_cache_hit_counter", "");
auto& cacheMissCounter = makeMock<CounterInt>("forwarding_cache_miss_counter", "");
auto& successDurationCounter =
makeMock<CounterInt>("forwarding_duration_milliseconds_counter", "{status=\"success\"}");
// An EXPECT_CALL without an explicit cardinality or action must match exactly once.
EXPECT_CALL(cacheMissCounter, add(1));
EXPECT_CALL(cacheHitCounter, add(1)).Times(3);
// The measured duration is not deterministic, so any added value is accepted.
EXPECT_CALL(successDurationCounter, add(testing::_));
// Only source 0 carries an expectation, and only for one call.
// NOTE(review): this relies on source 0 being the one picked for forwarding — presumably the fixture
// pins the source selection; confirm.
EXPECT_CALL(
sourceFactory_.sourceAt(0),
forwardToRippled(request, clientIP_, LoadBalancer::kUSER_FORWARDING_X_USER_VALUE, testing::_)
)
.WillOnce(Return(response_));
// Four identical non-admin requests; every one must return response_.
runSpawn([&](boost::asio::yield_context yield) {
EXPECT_EQ(loadBalancer->forwardToRippled(request, clientIP_, false, yield), response_);
EXPECT_EQ(loadBalancer->forwardToRippled(request, clientIP_, false, yield), response_);
EXPECT_EQ(loadBalancer->forwardToRippled(request, clientIP_, false, yield), response_);
EXPECT_EQ(loadBalancer->forwardToRippled(request, clientIP_, false, yield), response_);
});
}
// Checks the forwarding counters when the first source fails and the second one succeeds: a single
// request is expected to produce one cache miss, one retry, one fail-duration update and one
// success-duration update.
TEST_F(LoadBalancerForwardToRippledPrometheusTests, source0Fails)
{
EXPECT_CALL(sourceFactory_, makeSource).Times(2);
auto loadBalancer = makeLoadBalancer();
// Look up the mock counters by metric name and label string; both duration counters share the name
// "forwarding_duration_milliseconds_counter" and differ only in the status label.
auto& cacheMissCounter = makeMock<CounterInt>("forwarding_cache_miss_counter", "");
auto& retriesCounter = makeMock<CounterInt>("forwarding_retries_counter", "");
auto& successDurationCounter =
makeMock<CounterInt>("forwarding_duration_milliseconds_counter", "{status=\"success\"}");
auto& failDurationCounter = makeMock<CounterInt>("forwarding_duration_milliseconds_counter", "{status=\"fail\"}");
// Each EXPECT_CALL below has no explicit cardinality, so each counter must be updated exactly once.
EXPECT_CALL(cacheMissCounter, add(1));
EXPECT_CALL(retriesCounter, add(1));
// Measured durations are not deterministic, so any added value is accepted.
EXPECT_CALL(successDurationCounter, add(testing::_));
EXPECT_CALL(failDurationCounter, add(testing::_));
// Source 0 fails with EtlConnectionError ...
EXPECT_CALL(
sourceFactory_.sourceAt(0),
forwardToRippled(request_, clientIP_, LoadBalancer::kUSER_FORWARDING_X_USER_VALUE, testing::_)
)
.WillOnce(Return(std::unexpected{rpc::ClioError::EtlConnectionError}));
// ... and source 1 then answers with response_.
// NOTE(review): this relies on source 0 being tried first — presumably the fixture pins the source
// selection; confirm.
EXPECT_CALL(
sourceFactory_.sourceAt(1),
forwardToRippled(request_, clientIP_, LoadBalancer::kUSER_FORWARDING_X_USER_VALUE, testing::_)
)
.WillOnce(Return(response_));
// A single non-admin request using the fixture's request_; the caller still receives response_.
runSpawn([&](boost::asio::yield_context yield) {
EXPECT_EQ(loadBalancer->forwardToRippled(request_, clientIP_, false, yield), response_);
});
}
struct LoadBalancerForwardToRippledErrorTestBundle { struct LoadBalancerForwardToRippledErrorTestBundle {
std::string testName; std::string testName;
rpc::ClioError firstSourceError; rpc::ClioError firstSourceError;