From 010ac78fc35f8b250534fd1658ad5ebeabfe1939 Mon Sep 17 00:00:00 2001 From: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> Date: Tue, 17 Mar 2026 10:58:56 +0000 Subject: [PATCH] Fix MetricsRegistry: add rippled_ prefix and Resource attributes The MetricsRegistry's OTel MeterProvider was missing Resource attributes (service.name, service.instance.id) causing all 6 nodes' metrics to merge into a single "unknown_service" in Prometheus with no exported_instance label for per-node filtering. Additionally, instrument names lacked the rippled_ prefix that dashboards and integration tests expect (e.g. "job_queued_total" should be "rippled_job_queued_total" to match the beast::insight naming convention). Changes: - Add Resource with service.name and service.instance.id to MeterProvider - Prefix all instrument names with rippled_ (counters, histograms, gauges) - Update start() signature to accept instanceId parameter - Pass service_instance_id from [telemetry] config in Application::start() Co-Authored-By: Claude Opus 4.6 --- src/xrpld/app/main/Application.cpp | 10 ++++- src/xrpld/telemetry/MetricsRegistry.cpp | 57 ++++++++++++++++--------- src/xrpld/telemetry/MetricsRegistry.h | 28 ++++++------ 3 files changed, 62 insertions(+), 33 deletions(-) diff --git a/src/xrpld/app/main/Application.cpp b/src/xrpld/app/main/Application.cpp index 996381832b..37cf25ead9 100644 --- a/src/xrpld/app/main/Application.cpp +++ b/src/xrpld/app/main/Application.cpp @@ -1517,7 +1517,15 @@ ApplicationImp::start(bool withTimers) auto const& section = config_->section("telemetry"); std::string endpoint = "http://localhost:4318/v1/metrics"; set(endpoint, "metrics_endpoint", section); - metricsRegistry_->start(endpoint); + + // Pass the service_instance_id so the MeterProvider Resource + // carries it, giving Prometheus an exported_instance label. + std::string instanceId; + set(instanceId, "service_instance_id", section); + if (instanceId.empty() && nodeIdentity_) + instanceId = toBase58(TokenType::NodePublic, nodeIdentity_->first); + + metricsRegistry_->start(endpoint, instanceId); } } diff --git a/src/xrpld/telemetry/MetricsRegistry.cpp b/src/xrpld/telemetry/MetricsRegistry.cpp index 7729fef464..3b7af52e17 100644 --- a/src/xrpld/telemetry/MetricsRegistry.cpp +++ b/src/xrpld/telemetry/MetricsRegistry.cpp @@ -41,9 +41,12 @@ #include #include #include +#include +#include namespace metric_sdk = opentelemetry::sdk::metrics; namespace otlp_http = opentelemetry::exporter::otlp; +namespace resource = opentelemetry::sdk::resource; #endif // XRPL_ENABLE_TELEMETRY @@ -61,13 +64,14 @@ MetricsRegistry::~MetricsRegistry() } void -MetricsRegistry::start(std::string const& endpoint) +MetricsRegistry::start(std::string const& endpoint, std::string const& instanceId) { #ifdef XRPL_ENABLE_TELEMETRY if (!enabled_) return; - JLOG(journal_.info()) << "MetricsRegistry: starting, endpoint=" << endpoint; + JLOG(journal_.info()) << "MetricsRegistry: starting, endpoint=" << endpoint + << ", instanceId=" << instanceId; // Configure OTLP/HTTP metric exporter. otlp_http::OtlpHttpMetricExporterOptions exporterOpts; @@ -81,8 +85,17 @@ MetricsRegistry::start(std::string const& endpoint) auto reader = metric_sdk::PeriodicExportingMetricReaderFactory::Create(std::move(exporter), readerOpts); - // Create MeterProvider and attach the reader. - provider_ = std::make_shared(); + // Configure resource attributes so Prometheus exported_instance labels + // distinguish metrics from different nodes (matches OTelCollector setup). + resource::ResourceAttributes attrs; + attrs[resource::SemanticConventions::kServiceName] = "rippled"; + if (!instanceId.empty()) + attrs[resource::SemanticConventions::kServiceInstanceId] = instanceId; + auto resourceAttrs = resource::Resource::Create(attrs); + + // Create MeterProvider with resource, then attach the metric reader. + provider_ = metric_sdk::MeterProviderFactory::Create( + std::make_unique(), resourceAttrs); provider_->AddMetricReader(std::move(reader)); // Get a meter for all rippled instruments. @@ -91,23 +104,26 @@ MetricsRegistry::start(std::string const& endpoint) // --- Create synchronous instruments --- // RPC per-method counters and histogram. - rpcStartedCounter_ = - meter_->CreateUInt64Counter("rpc_method_started_total", "Total RPC method calls started"); + rpcStartedCounter_ = meter_->CreateUInt64Counter( + "rippled_rpc_method_started_total", "Total RPC method calls started"); rpcFinishedCounter_ = meter_->CreateUInt64Counter( - "rpc_method_finished_total", "Total RPC method calls completed successfully"); + "rippled_rpc_method_finished_total", "Total RPC method calls completed successfully"); rpcErroredCounter_ = meter_->CreateUInt64Counter( - "rpc_method_errored_total", "Total RPC method calls that errored"); + "rippled_rpc_method_errored_total", "Total RPC method calls that errored"); rpcDurationHistogram_ = meter_->CreateDoubleHistogram( - "rpc_method_duration_us", "RPC method execution time in microseconds"); + "rippled_rpc_method_duration_us", "RPC method execution time in microseconds"); // Job queue per-type counters and histograms. - jobQueuedCounter_ = meter_->CreateUInt64Counter("job_queued_total", "Total jobs enqueued"); - jobStartedCounter_ = meter_->CreateUInt64Counter("job_started_total", "Total jobs started"); - jobFinishedCounter_ = meter_->CreateUInt64Counter("job_finished_total", "Total jobs completed"); + jobQueuedCounter_ = + meter_->CreateUInt64Counter("rippled_job_queued_total", "Total jobs enqueued"); + jobStartedCounter_ = + meter_->CreateUInt64Counter("rippled_job_started_total", "Total jobs started"); + jobFinishedCounter_ = + meter_->CreateUInt64Counter("rippled_job_finished_total", "Total jobs completed"); jobQueuedDurationHistogram_ = meter_->CreateDoubleHistogram( - "job_queued_duration_us", "Time jobs spent waiting in the queue (microseconds)"); + "rippled_job_queued_duration_us", "Time jobs spent waiting in the queue (microseconds)"); jobRunningDurationHistogram_ = meter_->CreateDoubleHistogram( - "job_running_duration_us", "Job execution time in microseconds"); + "rippled_job_running_duration_us", "Job execution time in microseconds"); // Register all observable (async) gauges. registerAsyncGauges(); @@ -253,7 +269,7 @@ MetricsRegistry::registerAsyncGauges() { // --- Task 9.2: Cache hit rate and size gauges --- cacheHitRateGauge_ = - meter_->CreateDoubleObservableGauge("cache_metrics", "Cache hit rates and sizes"); + meter_->CreateDoubleObservableGauge("rippled_cache_metrics", "Cache hit rates and sizes"); cacheHitRateGauge_->AddCallback( [](opentelemetry::metrics::ObserverResult result, void* state) { auto* self = static_cast(state); @@ -307,7 +323,8 @@ MetricsRegistry::registerAsyncGauges() this); // --- Task 9.3: TxQ metrics gauges --- - txqGauge_ = meter_->CreateDoubleObservableGauge("txq_metrics", "Transaction queue metrics"); + txqGauge_ = + meter_->CreateDoubleObservableGauge("rippled_txq_metrics", "Transaction queue metrics"); txqGauge_->AddCallback( [](opentelemetry::metrics::ObserverResult result, void* state) { auto* self = static_cast(state); @@ -349,7 +366,7 @@ MetricsRegistry::registerAsyncGauges() // --- Task 9.6: Counted object instance gauges --- objectCountGauge_ = meter_->CreateInt64ObservableGauge( - "object_count", "Live instance counts for key internal object types"); + "rippled_object_count", "Live instance counts for key internal object types"); objectCountGauge_->AddCallback( [](opentelemetry::metrics::ObserverResult result, void* /* state */) { try @@ -373,8 +390,8 @@ MetricsRegistry::registerAsyncGauges() this); // --- Task 9.7: Load factor breakdown gauges --- - loadFactorGauge_ = - meter_->CreateDoubleObservableGauge("load_factor_metrics", "Fee load factor breakdown"); + loadFactorGauge_ = meter_->CreateDoubleObservableGauge( + "rippled_load_factor_metrics", "Fee load factor breakdown"); loadFactorGauge_->AddCallback( [](opentelemetry::metrics::ObserverResult result, void* state) { auto* self = static_cast(state); @@ -444,7 +461,7 @@ MetricsRegistry::registerAsyncGauges() // libxrpl nodestore code — the MetricsRegistry reads the existing atomic // counters from Database via its public accessors. nodeStoreGauge_ = meter_->CreateInt64ObservableGauge( - "nodestore_state", "NodeStore I/O counters, queue depth, and write load"); + "rippled_nodestore_state", "NodeStore I/O counters, queue depth, and write load"); nodeStoreGauge_->AddCallback( [](opentelemetry::metrics::ObserverResult result, void* state) { auto* self = static_cast(state); diff --git a/src/xrpld/telemetry/MetricsRegistry.h b/src/xrpld/telemetry/MetricsRegistry.h index be977901d2..e6d39892b1 100644 --- a/src/xrpld/telemetry/MetricsRegistry.h +++ b/src/xrpld/telemetry/MetricsRegistry.h @@ -23,15 +23,15 @@ | +-- OtlpHttpMetricExporter | +-- Counters / Histograms (synchronous instruments) - | +-- rpc_method_started_total - | +-- rpc_method_finished_total - | +-- rpc_method_errored_total - | +-- rpc_method_duration_us (Histogram) - | +-- job_queued_total - | +-- job_started_total - | +-- job_finished_total - | +-- job_queued_duration_us (Histogram) - | +-- job_running_duration_us (Histogram) + | +-- rippled_rpc_method_started_total + | +-- rippled_rpc_method_finished_total + | +-- rippled_rpc_method_errored_total + | +-- rippled_rpc_method_duration_us (Histogram) + | +-- rippled_job_queued_total + | +-- rippled_job_started_total + | +-- rippled_job_finished_total + | +-- rippled_job_queued_duration_us (Histogram) + | +-- rippled_job_running_duration_us (Histogram) | +-- Observable Gauges (async callbacks, polled by reader) +-- Cache hit rates (SLE, ledger, AL) @@ -150,11 +150,15 @@ public: /** Initialize the OTel metrics pipeline and register all instruments. - @param endpoint OTLP/HTTP endpoint URL for metric export - (e.g. "http://localhost:4318/v1/metrics"). + @param endpoint OTLP/HTTP endpoint URL for metric export + (e.g. "http://localhost:4318/v1/metrics"). + @param instanceId Value for the service.instance.id resource + attribute. When non-empty, Prometheus metrics + carry an exported_instance label for per-node + filtering. */ void - start(std::string const& endpoint); + start(std::string const& endpoint, std::string const& instanceId = {}); /** Flush pending metrics and shut down the pipeline. */ void