fix(telemetry): address review findings and PR #6437 comments

Critical fixes:
- Restore accidentally removed mallocTrim call and MallocTrim.h include
- Add missing shouldTraceLedger() to interface and all implementations
- Derive networkId/networkType from config_->NETWORK_ID (0=mainnet,
  1=testnet, 2=devnet) instead of leaving defaults unpopulated
- Clamp sampling_ratio to [0.0, 1.0] in config parser

PR comment fixes:
- Rename rippled -> xrpld in service name defaults, getTracer() calls,
  Docker network, comments, and docs/build/telemetry.md
- Remove exporter config option (only otlp_http supported)
- Add trace_ledger and service_name to example config
- Clarify head-based sampling semantics in config comments
- Add filter descriptions for span intrinsic filters in Grafana datasource
- Add inline comments to Docker Compose services

Docker/config improvements:
- Remove deprecated version: "3.8" from docker-compose.yml
- Pin images: collector 0.121.0, grafana 11.5.2
- Add health_check extension to otel-collector-config.yaml
- Comment out Tempo metrics_generator remote_write (no Prometheus service)
- Add Prometheus datasource caveat in Grafana datasource config

Other:
- Revert unrelated formatting changes in ServiceRegistry.h
- Change Conan telemetry default to False (matches CMake OFF)
- Add CLAUDE.md-required docs (ASCII diagrams, usage examples,
  @note thread-safety) to Telemetry.h and SpanGuard.h

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Pratik Mankawde
2026-04-16 17:07:01 +01:00
parent 8a49f2c8c0
commit 4830715d57
14 changed files with 264 additions and 69 deletions

View File

@@ -76,6 +76,12 @@ public:
return false;
}
bool
shouldTraceLedger() const override
{
return false;
}
#ifdef XRPL_ENABLE_TELEMETRY
opentelemetry::nostd::shared_ptr<opentelemetry::trace::Tracer>
getTracer(std::string_view) override

View File

@@ -93,6 +93,12 @@ public:
return false;
}
bool
shouldTraceLedger() const override
{
return false;
}
opentelemetry::nostd::shared_ptr<trace_api::Tracer>
getTracer(std::string_view) override
{
@@ -241,6 +247,12 @@ public:
return setup_.tracePeer;
}
bool
shouldTraceLedger() const override
{
return setup_.traceLedger;
}
opentelemetry::nostd::shared_ptr<trace_api::Tracer>
getTracer(std::string_view name) override
{
@@ -252,7 +264,7 @@ public:
opentelemetry::nostd::shared_ptr<trace_api::Span>
startSpan(std::string_view name, trace_api::SpanKind kind) override
{
auto tracer = getTracer("rippled");
auto tracer = getTracer("xrpld");
trace_api::StartSpanOptions opts;
opts.kind = kind;
return tracer->StartSpan(std::string(name), opts);
@@ -264,7 +276,7 @@ public:
opentelemetry::context::Context const& parentContext,
trace_api::SpanKind kind) override
{
auto tracer = getTracer("rippled");
auto tracer = getTracer("xrpld");
trace_api::StartSpanOptions opts;
opts.kind = kind;
opts.parent = parentContext;

View File

@@ -9,23 +9,49 @@
#include <xrpl/telemetry/Telemetry.h>
#include <algorithm>
namespace xrpl {
namespace telemetry {
namespace {
/** Derive a human-readable network type label from the numeric network ID.
@param networkId The network identifier from [network_id] config.
@return "mainnet", "testnet", "devnet", or "unknown" for other values.
*/
std::string
networkTypeFromId(std::uint32_t networkId)
{
switch (networkId)
{
case 0:
return "mainnet";
case 1:
return "testnet";
case 2:
return "devnet";
default:
return "unknown";
}
}
} // namespace
Telemetry::Setup
setup_Telemetry(
Section const& section,
std::string const& nodePublicKey,
std::string const& version)
std::string const& version,
std::uint32_t networkId)
{
Telemetry::Setup setup;
setup.enabled = section.value_or<int>("enabled", 0) != 0;
setup.serviceName = section.value_or<std::string>("service_name", "rippled");
setup.serviceName = section.value_or<std::string>("service_name", "xrpld");
setup.serviceVersion = version;
setup.serviceInstanceId = section.value_or<std::string>("service_instance_id", nodePublicKey);
setup.exporterType = section.value_or<std::string>("exporter", "otlp_http");
setup.exporterEndpoint =
section.value_or<std::string>("endpoint", "http://localhost:4318/v1/traces");
@@ -33,12 +59,16 @@ setup_Telemetry(
setup.tlsCertPath = section.value_or<std::string>("tls_ca_cert", "");
setup.samplingRatio = section.value_or<double>("sampling_ratio", 1.0);
setup.samplingRatio = std::clamp(setup.samplingRatio, 0.0, 1.0);
setup.batchSize = section.value_or<std::uint32_t>("batch_size", 512u);
setup.batchDelay =
std::chrono::milliseconds{section.value_or<std::uint32_t>("batch_delay_ms", 5000u)};
setup.maxQueueSize = section.value_or<std::uint32_t>("max_queue_size", 2048u);
setup.networkId = networkId;
setup.networkType = networkTypeFromId(networkId);
setup.traceTransactions = section.value_or<int>("trace_transactions", 1) != 0;
setup.traceConsensus = section.value_or<int>("trace_consensus", 1) != 0;
setup.traceRpc = section.value_or<int>("trace_rpc", 1) != 0;

View File

@@ -31,6 +31,7 @@
#include <xrpld/shamap/NodeFamily.h>
#include <xrpl/basics/ByteUtilities.h>
#include <xrpl/basics/MallocTrim.h>
#include <xrpl/basics/ResolverAsio.h>
#include <xrpl/basics/random.h>
#include <xrpl/beast/asio/io_latency_probe.h>
@@ -265,7 +266,8 @@ public:
telemetry::setup_Telemetry(
config_->section("telemetry"),
"", // Updated later via setServiceInstanceId()
BuildInfo::getVersionString()),
BuildInfo::getVersionString(),
config_->NETWORK_ID),
logs_->journal("Telemetry")))
, m_txMaster(*this)
@@ -1075,6 +1077,8 @@ public:
<< "; size after: " << cachedSLEs_.size();
}
mallocTrim("doSweep", m_journal);
// Set timer to do another sweep later.
setSweepTimer();
}