mirror of
https://github.com/XRPLF/rippled.git
synced 2026-06-03 08:46:46 +00:00
fix(telemetry): address code review issues in OTelCollector
- Fix use-after-free: extract the gauge callback into a static function and call RemoveCallback() in ~OTelGaugeImpl() before unregistering from the collector.
- Use memory_order_acq_rel on the callHooks() debounce CAS to establish a proper happens-before relationship between hook invocations.
- Add an explicit 2s timeout to ForceFlush() in the destructor to prevent blocking indefinitely when the OTLP endpoint is unreachable at shutdown.
- Add the OTLP receiver to the metrics pipeline so native OTel metrics from xrpld are actually received by the collector.
- Remove the stale health check port from docker-compose (the extension was removed from the collector config).
- Clarify fallback docs: the StatsD path requires re-enabling the receiver and port.
- Fix comments: Counter uses uint64_t, not int64_t; the gauge clamps to [0, INT64_MAX], not [0, UINT64_MAX].

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -17,7 +17,7 @@
|
||||
* OTelCounterImpl / OTelGaugeImpl / OTelEventImpl / OTelMeterImpl
|
||||
* | | | |
|
||||
* v v v v
|
||||
* OTel Counter<uint64> ObservableGauge Histogram<double> Counter<uint64>
|
||||
* Counter<uint64_t> ObservableGauge Histogram<double> Counter<uint64_t>
|
||||
* | | | |
|
||||
* +--------------------+----------------+--------------+
|
||||
* |
|
||||
@@ -123,7 +123,7 @@ private:
|
||||
/**
|
||||
* @brief OTel-backed implementation of beast::insight::CounterImpl.
|
||||
*
|
||||
* Wraps an OTel Counter<int64_t> instrument. Each increment() call
|
||||
* Wraps an OTel Counter<uint64_t> instrument. Each increment() call
|
||||
* is forwarded directly to the OTel counter's Add() method. The
|
||||
* PeriodicMetricReader collects and exports the accumulated delta.
|
||||
*
|
||||
@@ -239,7 +239,7 @@ public:
|
||||
/**
|
||||
* @brief Increment (or decrement) the gauge by a signed amount.
|
||||
*
|
||||
* Clamps the result to [0, UINT64_MAX] to match StatsDGaugeImpl
|
||||
* Clamps the result to [0, INT64_MAX] to match StatsDGaugeImpl
|
||||
* behavior.
|
||||
*
|
||||
* @param amount Signed amount to add to the current value.
|
||||
@@ -254,6 +254,10 @@ public:
|
||||
int64_t
|
||||
currentValue() const;
|
||||
|
||||
/** Static callback registered with the OTel SDK observable gauge. */
|
||||
static void
|
||||
gaugeCallback(opentelemetry::metrics::ObserverResult result, void* state);
|
||||
|
||||
private:
|
||||
OTelGaugeImpl&
|
||||
operator=(OTelGaugeImpl const&);
|
||||
@@ -578,27 +582,25 @@ OTelGaugeImpl::OTelGaugeImpl(
|
||||
: m_gauge(meter->CreateInt64ObservableGauge(name)), m_collector(collector)
|
||||
{
|
||||
m_collector->addGauge(this);
|
||||
m_gauge->AddCallback(gaugeCallback, this);
|
||||
}
|
||||
|
||||
// Register the async callback that the SDK calls during collection.
|
||||
// Before reading the gauge value, invoke all registered hooks so that
|
||||
// hook handlers (e.g. NetworkOPs State_Accounting) have a chance to
|
||||
// update gauge values. callHooks() uses a debounce timestamp so hooks
|
||||
// run at most once per collection cycle even with many gauges.
|
||||
m_gauge->AddCallback(
|
||||
[](opentelemetry::metrics::ObserverResult result, void* state) {
|
||||
auto* self = static_cast<OTelGaugeImpl*>(state);
|
||||
self->m_collector->callHooks();
|
||||
if (auto intResult = opentelemetry::nostd::get_if<opentelemetry::nostd::shared_ptr<
|
||||
opentelemetry::metrics::ObserverResultT<int64_t>>>(&result))
|
||||
{
|
||||
(*intResult)->Observe(self->currentValue());
|
||||
}
|
||||
},
|
||||
this);
|
||||
void
|
||||
OTelGaugeImpl::gaugeCallback(opentelemetry::metrics::ObserverResult result, void* state)
|
||||
{
|
||||
auto* self = static_cast<OTelGaugeImpl*>(state);
|
||||
self->m_collector->callHooks();
|
||||
if (auto intResult = opentelemetry::nostd::get_if<
|
||||
opentelemetry::nostd::shared_ptr<opentelemetry::metrics::ObserverResultT<int64_t>>>(
|
||||
&result))
|
||||
{
|
||||
(*intResult)->Observe(self->currentValue());
|
||||
}
|
||||
}
|
||||
|
||||
OTelGaugeImpl::~OTelGaugeImpl()
|
||||
{
|
||||
m_gauge->RemoveCallback(gaugeCallback, this);
|
||||
m_collector->removeGauge(this);
|
||||
}
|
||||
|
||||
@@ -720,8 +722,7 @@ OTelCollectorImp::~OTelCollectorImp()
|
||||
m_journal.info() << "OTelCollector shutting down";
|
||||
if (m_provider)
|
||||
{
|
||||
// ForceFlush to export any pending metrics before shutdown.
|
||||
m_provider->ForceFlush();
|
||||
m_provider->ForceFlush(std::chrono::milliseconds(2000));
|
||||
m_provider->Shutdown();
|
||||
}
|
||||
if (m_journal.info())
|
||||
@@ -783,10 +784,10 @@ OTelCollectorImp::callHooks()
|
||||
auto now = std::chrono::duration_cast<std::chrono::milliseconds>(
|
||||
std::chrono::steady_clock::now().time_since_epoch())
|
||||
.count();
|
||||
auto last = m_lastHookCallMs.load(std::memory_order_relaxed);
|
||||
auto last = m_lastHookCallMs.load(std::memory_order_acquire);
|
||||
if (now - last < 500)
|
||||
return;
|
||||
if (!m_lastHookCallMs.compare_exchange_strong(last, now, std::memory_order_relaxed))
|
||||
if (!m_lastHookCallMs.compare_exchange_strong(last, now, std::memory_order_acq_rel))
|
||||
return; // Another thread won the race.
|
||||
|
||||
std::lock_guard lock(m_mutex);
|
||||
|
||||
Reference in New Issue
Block a user