Fix MetricsRegistry: add rippled_ prefix and Resource attributes

The MetricsRegistry's OTel MeterProvider was missing Resource attributes
(service.name, service.instance.id), causing the metrics from all 6 nodes
to merge into a single "unknown_service" in Prometheus, with no
exported_instance label for per-node filtering.

Additionally, instrument names lacked the rippled_ prefix that dashboards
and integration tests expect (e.g. "job_queued_total" should be
"rippled_job_queued_total" to match the beast::insight naming convention).

Changes:
- Add Resource with service.name and service.instance.id to MeterProvider
- Prefix all instrument names with rippled_ (counters, histograms, gauges)
- Update MetricsRegistry::start() signature to accept an optional
  instanceId parameter (defaults to empty, so existing callers still compile)
- Pass service_instance_id from [telemetry] config in ApplicationImp::start(),
  falling back to the base58-encoded node public key when it is not set

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Pratik Mankawde
2026-03-17 10:58:56 +00:00
parent a2651262be
commit 010ac78fc3
3 changed files with 62 additions and 33 deletions

View File

@@ -1517,7 +1517,15 @@ ApplicationImp::start(bool withTimers)
auto const& section = config_->section("telemetry");
std::string endpoint = "http://localhost:4318/v1/metrics";
set(endpoint, "metrics_endpoint", section);
metricsRegistry_->start(endpoint);
// Pass the service_instance_id so the MeterProvider Resource
// carries it, giving Prometheus an exported_instance label.
std::string instanceId;
set(instanceId, "service_instance_id", section);
if (instanceId.empty() && nodeIdentity_)
instanceId = toBase58(TokenType::NodePublic, nodeIdentity_->first);
metricsRegistry_->start(endpoint, instanceId);
}
}

View File

@@ -41,9 +41,12 @@
#include <opentelemetry/sdk/metrics/export/periodic_exporting_metric_reader_options.h>
#include <opentelemetry/sdk/metrics/meter_provider.h>
#include <opentelemetry/sdk/metrics/meter_provider_factory.h>
#include <opentelemetry/sdk/resource/resource.h>
#include <opentelemetry/sdk/resource/semantic_conventions.h>
namespace metric_sdk = opentelemetry::sdk::metrics;
namespace otlp_http = opentelemetry::exporter::otlp;
namespace resource = opentelemetry::sdk::resource;
#endif // XRPL_ENABLE_TELEMETRY
@@ -61,13 +64,14 @@ MetricsRegistry::~MetricsRegistry()
}
void
MetricsRegistry::start(std::string const& endpoint)
MetricsRegistry::start(std::string const& endpoint, std::string const& instanceId)
{
#ifdef XRPL_ENABLE_TELEMETRY
if (!enabled_)
return;
JLOG(journal_.info()) << "MetricsRegistry: starting, endpoint=" << endpoint;
JLOG(journal_.info()) << "MetricsRegistry: starting, endpoint=" << endpoint
<< ", instanceId=" << instanceId;
// Configure OTLP/HTTP metric exporter.
otlp_http::OtlpHttpMetricExporterOptions exporterOpts;
@@ -81,8 +85,17 @@ MetricsRegistry::start(std::string const& endpoint)
auto reader =
metric_sdk::PeriodicExportingMetricReaderFactory::Create(std::move(exporter), readerOpts);
// Create MeterProvider and attach the reader.
provider_ = std::make_shared<metric_sdk::MeterProvider>();
// Configure resource attributes so Prometheus exported_instance labels
// distinguish metrics from different nodes (matches OTelCollector setup).
resource::ResourceAttributes attrs;
attrs[resource::SemanticConventions::kServiceName] = "rippled";
if (!instanceId.empty())
attrs[resource::SemanticConventions::kServiceInstanceId] = instanceId;
auto resourceAttrs = resource::Resource::Create(attrs);
// Create MeterProvider with resource, then attach the metric reader.
provider_ = metric_sdk::MeterProviderFactory::Create(
std::make_unique<metric_sdk::ViewRegistry>(), resourceAttrs);
provider_->AddMetricReader(std::move(reader));
// Get a meter for all rippled instruments.
@@ -91,23 +104,26 @@ MetricsRegistry::start(std::string const& endpoint)
// --- Create synchronous instruments ---
// RPC per-method counters and histogram.
rpcStartedCounter_ =
meter_->CreateUInt64Counter("rpc_method_started_total", "Total RPC method calls started");
rpcStartedCounter_ = meter_->CreateUInt64Counter(
"rippled_rpc_method_started_total", "Total RPC method calls started");
rpcFinishedCounter_ = meter_->CreateUInt64Counter(
"rpc_method_finished_total", "Total RPC method calls completed successfully");
"rippled_rpc_method_finished_total", "Total RPC method calls completed successfully");
rpcErroredCounter_ = meter_->CreateUInt64Counter(
"rpc_method_errored_total", "Total RPC method calls that errored");
"rippled_rpc_method_errored_total", "Total RPC method calls that errored");
rpcDurationHistogram_ = meter_->CreateDoubleHistogram(
"rpc_method_duration_us", "RPC method execution time in microseconds");
"rippled_rpc_method_duration_us", "RPC method execution time in microseconds");
// Job queue per-type counters and histograms.
jobQueuedCounter_ = meter_->CreateUInt64Counter("job_queued_total", "Total jobs enqueued");
jobStartedCounter_ = meter_->CreateUInt64Counter("job_started_total", "Total jobs started");
jobFinishedCounter_ = meter_->CreateUInt64Counter("job_finished_total", "Total jobs completed");
jobQueuedCounter_ =
meter_->CreateUInt64Counter("rippled_job_queued_total", "Total jobs enqueued");
jobStartedCounter_ =
meter_->CreateUInt64Counter("rippled_job_started_total", "Total jobs started");
jobFinishedCounter_ =
meter_->CreateUInt64Counter("rippled_job_finished_total", "Total jobs completed");
jobQueuedDurationHistogram_ = meter_->CreateDoubleHistogram(
"job_queued_duration_us", "Time jobs spent waiting in the queue (microseconds)");
"rippled_job_queued_duration_us", "Time jobs spent waiting in the queue (microseconds)");
jobRunningDurationHistogram_ = meter_->CreateDoubleHistogram(
"job_running_duration_us", "Job execution time in microseconds");
"rippled_job_running_duration_us", "Job execution time in microseconds");
// Register all observable (async) gauges.
registerAsyncGauges();
@@ -253,7 +269,7 @@ MetricsRegistry::registerAsyncGauges()
{
// --- Task 9.2: Cache hit rate and size gauges ---
cacheHitRateGauge_ =
meter_->CreateDoubleObservableGauge("cache_metrics", "Cache hit rates and sizes");
meter_->CreateDoubleObservableGauge("rippled_cache_metrics", "Cache hit rates and sizes");
cacheHitRateGauge_->AddCallback(
[](opentelemetry::metrics::ObserverResult result, void* state) {
auto* self = static_cast<MetricsRegistry*>(state);
@@ -307,7 +323,8 @@ MetricsRegistry::registerAsyncGauges()
this);
// --- Task 9.3: TxQ metrics gauges ---
txqGauge_ = meter_->CreateDoubleObservableGauge("txq_metrics", "Transaction queue metrics");
txqGauge_ =
meter_->CreateDoubleObservableGauge("rippled_txq_metrics", "Transaction queue metrics");
txqGauge_->AddCallback(
[](opentelemetry::metrics::ObserverResult result, void* state) {
auto* self = static_cast<MetricsRegistry*>(state);
@@ -349,7 +366,7 @@ MetricsRegistry::registerAsyncGauges()
// --- Task 9.6: Counted object instance gauges ---
objectCountGauge_ = meter_->CreateInt64ObservableGauge(
"object_count", "Live instance counts for key internal object types");
"rippled_object_count", "Live instance counts for key internal object types");
objectCountGauge_->AddCallback(
[](opentelemetry::metrics::ObserverResult result, void* /* state */) {
try
@@ -373,8 +390,8 @@ MetricsRegistry::registerAsyncGauges()
this);
// --- Task 9.7: Load factor breakdown gauges ---
loadFactorGauge_ =
meter_->CreateDoubleObservableGauge("load_factor_metrics", "Fee load factor breakdown");
loadFactorGauge_ = meter_->CreateDoubleObservableGauge(
"rippled_load_factor_metrics", "Fee load factor breakdown");
loadFactorGauge_->AddCallback(
[](opentelemetry::metrics::ObserverResult result, void* state) {
auto* self = static_cast<MetricsRegistry*>(state);
@@ -444,7 +461,7 @@ MetricsRegistry::registerAsyncGauges()
// libxrpl nodestore code — the MetricsRegistry reads the existing atomic
// counters from Database via its public accessors.
nodeStoreGauge_ = meter_->CreateInt64ObservableGauge(
"nodestore_state", "NodeStore I/O counters, queue depth, and write load");
"rippled_nodestore_state", "NodeStore I/O counters, queue depth, and write load");
nodeStoreGauge_->AddCallback(
[](opentelemetry::metrics::ObserverResult result, void* state) {
auto* self = static_cast<MetricsRegistry*>(state);

View File

@@ -23,15 +23,15 @@
| +-- OtlpHttpMetricExporter
|
+-- Counters / Histograms (synchronous instruments)
| +-- rpc_method_started_total
| +-- rpc_method_finished_total
| +-- rpc_method_errored_total
| +-- rpc_method_duration_us (Histogram)
| +-- job_queued_total
| +-- job_started_total
| +-- job_finished_total
| +-- job_queued_duration_us (Histogram)
| +-- job_running_duration_us (Histogram)
| +-- rippled_rpc_method_started_total
| +-- rippled_rpc_method_finished_total
| +-- rippled_rpc_method_errored_total
| +-- rippled_rpc_method_duration_us (Histogram)
| +-- rippled_job_queued_total
| +-- rippled_job_started_total
| +-- rippled_job_finished_total
| +-- rippled_job_queued_duration_us (Histogram)
| +-- rippled_job_running_duration_us (Histogram)
|
+-- Observable Gauges (async callbacks, polled by reader)
+-- Cache hit rates (SLE, ledger, AL)
@@ -150,11 +150,15 @@ public:
/** Initialize the OTel metrics pipeline and register all instruments.
@param endpoint OTLP/HTTP endpoint URL for metric export
(e.g. "http://localhost:4318/v1/metrics").
@param endpoint OTLP/HTTP endpoint URL for metric export
(e.g. "http://localhost:4318/v1/metrics").
@param instanceId Value for the service.instance.id resource
attribute. When non-empty, Prometheus metrics
carry an exported_instance label for per-node
filtering.
*/
void
start(std::string const& endpoint);
start(std::string const& endpoint, std::string const& instanceId = {});
/** Flush pending metrics and shut down the pipeline. */
void