diff --git a/docker/telemetry/grafana/dashboards/xrpld-job-queue.json b/docker/telemetry/grafana/dashboards/xrpld-job-queue.json index b237501aed..0f206eff82 100644 --- a/docker/telemetry/grafana/dashboards/xrpld-job-queue.json +++ b/docker/telemetry/grafana/dashboards/xrpld-job-queue.json @@ -169,7 +169,7 @@ } }, { - "title": "Job Queue Wait Time (P50, P95, P99)", + "title": "Job Queue Wait Time", "description": "Histogram quantiles for time jobs spend waiting in the queue before execution starts. High values indicate thread pool saturation.", "type": "timeseries", "gridPos": { @@ -189,22 +189,8 @@ "datasource": { "type": "prometheus" }, - "expr": "histogram_quantile(0.50, sum by (le, exported_instance) (rate(xrpld_job_queued_duration_us_bucket{exported_instance=~\"$node\", job_type=~\"$job_type\"}[5m])))", - "legendFormat": "P50 [{{exported_instance}}]" - }, - { - "datasource": { - "type": "prometheus" - }, - "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(xrpld_job_queued_duration_us_bucket{exported_instance=~\"$node\", job_type=~\"$job_type\"}[5m])))", - "legendFormat": "P95 [{{exported_instance}}]" - }, - { - "datasource": { - "type": "prometheus" - }, - "expr": "histogram_quantile(0.99, sum by (le, exported_instance) (rate(xrpld_job_queued_duration_us_bucket{exported_instance=~\"$node\", job_type=~\"$job_type\"}[5m])))", - "legendFormat": "P99 [{{exported_instance}}]" + "expr": "histogram_quantile(1.0, sum by (le, exported_instance) (rate(xrpld_job_queued_duration_us_bucket{exported_instance=~\"$node\", job_type=~\"$job_type\"}[5m])))", + "legendFormat": "Wait [{{exported_instance}}]" } ], "fieldConfig": { @@ -228,7 +214,7 @@ } }, { - "title": "Job Execution Time (P50, P95, P99)", + "title": "Job Execution Time", "description": "Histogram quantiles for actual job execution time. 
High values indicate expensive operations or resource contention.", "type": "timeseries", "gridPos": { @@ -248,22 +234,8 @@ "datasource": { "type": "prometheus" }, - "expr": "histogram_quantile(0.50, sum by (le, exported_instance) (rate(xrpld_job_running_duration_us_bucket{exported_instance=~\"$node\", job_type=~\"$job_type\"}[5m])))", - "legendFormat": "P50 [{{exported_instance}}]" - }, - { - "datasource": { - "type": "prometheus" - }, - "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(xrpld_job_running_duration_us_bucket{exported_instance=~\"$node\", job_type=~\"$job_type\"}[5m])))", - "legendFormat": "P95 [{{exported_instance}}]" - }, - { - "datasource": { - "type": "prometheus" - }, - "expr": "histogram_quantile(0.99, sum by (le, exported_instance) (rate(xrpld_job_running_duration_us_bucket{exported_instance=~\"$node\", job_type=~\"$job_type\"}[5m])))", - "legendFormat": "P99 [{{exported_instance}}]" + "expr": "histogram_quantile(1.0, sum by (le, exported_instance) (rate(xrpld_job_running_duration_us_bucket{exported_instance=~\"$node\", job_type=~\"$job_type\"}[5m])))", + "legendFormat": "Exec [{{exported_instance}}]" } ], "fieldConfig": { @@ -287,7 +259,7 @@ } }, { - "title": "Per-Job-Type Execution Time (P95)", + "title": "Per-Job-Type Execution Time", "description": "Maximum (quantile 1.0) execution time broken down by job type. 
Identifies the slowest job types.", "type": "timeseries", "gridPos": { @@ -312,7 +284,7 @@ "datasource": { "type": "prometheus" }, - "expr": "topk(10, histogram_quantile(0.95, sum by (le, job_type, exported_instance) (rate(xrpld_job_running_duration_us_bucket{exported_instance=~\"$node\", job_type=~\"$job_type\"}[5m]))))", + "expr": "topk(10, histogram_quantile(1.0, sum by (le, job_type, exported_instance) (rate(xrpld_job_running_duration_us_bucket{exported_instance=~\"$node\", job_type=~\"$job_type\"}[5m]))))", "legendFormat": "{{job_type}} [{{exported_instance}}]" } ], diff --git a/docker/telemetry/grafana/dashboards/xrpld-rpc-perf-otel.json b/docker/telemetry/grafana/dashboards/xrpld-rpc-perf-otel.json index 319dd56313..b1ce4a1941 100644 --- a/docker/telemetry/grafana/dashboards/xrpld-rpc-perf-otel.json +++ b/docker/telemetry/grafana/dashboards/xrpld-rpc-perf-otel.json @@ -169,7 +169,7 @@ } }, { - "title": "RPC Latency (P50, P95, P99) - All Methods", + "title": "RPC Latency - All Methods", "description": "Histogram quantiles for RPC execution time across all methods. 
Sourced from rpc_method_duration_us histogram.", "type": "timeseries", "gridPos": { @@ -189,22 +189,8 @@ "datasource": { "type": "prometheus" }, - "expr": "histogram_quantile(0.50, sum by (le, exported_instance) (rate(xrpld_rpc_method_duration_us_bucket{exported_instance=~\"$node\", method=~\"$method\"}[5m])))", - "legendFormat": "P50 [{{exported_instance}}]" - }, - { - "datasource": { - "type": "prometheus" - }, - "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(xrpld_rpc_method_duration_us_bucket{exported_instance=~\"$node\", method=~\"$method\"}[5m])))", - "legendFormat": "P95 [{{exported_instance}}]" - }, - { - "datasource": { - "type": "prometheus" - }, - "expr": "histogram_quantile(0.99, sum by (le, exported_instance) (rate(xrpld_rpc_method_duration_us_bucket{exported_instance=~\"$node\", method=~\"$method\"}[5m])))", - "legendFormat": "P99 [{{exported_instance}}]" + "expr": "histogram_quantile(1.0, sum by (le, exported_instance) (rate(xrpld_rpc_method_duration_us_bucket{exported_instance=~\"$node\", method=~\"$method\"}[5m])))", + "legendFormat": "Latency [{{exported_instance}}]" } ], "fieldConfig": { @@ -228,7 +214,7 @@ } }, { - "title": "Per-Method Latency P95 (Top 10 Slowest)", + "title": "Per-Method Latency (Top 10 Slowest)", "description": "Maximum (quantile 1.0) execution time per method. 
Identifies the slowest RPC endpoints.", "type": "timeseries", "gridPos": { @@ -253,7 +239,7 @@ "datasource": { "type": "prometheus" }, - "expr": "topk(10, histogram_quantile(0.95, sum by (le, method, exported_instance) (rate(xrpld_rpc_method_duration_us_bucket{exported_instance=~\"$node\", method=~\"$method\"}[5m]))))", + "expr": "topk(10, histogram_quantile(1.0, sum by (le, method, exported_instance) (rate(xrpld_rpc_method_duration_us_bucket{exported_instance=~\"$node\", method=~\"$method\"}[5m]))))", "legendFormat": "{{method}} [{{exported_instance}}]" } ], diff --git a/src/xrpld/telemetry/MetricsRegistry.cpp b/src/xrpld/telemetry/MetricsRegistry.cpp index 97708e2255..79e76ea737 100644 --- a/src/xrpld/telemetry/MetricsRegistry.cpp +++ b/src/xrpld/telemetry/MetricsRegistry.cpp @@ -50,9 +50,14 @@ #include #include #include +#include #include #include +#include #include +#include +#include +#include #include #include #include @@ -73,6 +78,64 @@ namespace metric_sdk = opentelemetry::sdk::metrics; namespace otlp_http = opentelemetry::exporter::otlp; namespace resource = opentelemetry::sdk::resource; +namespace { + +// Microsecond-valued duration histogram instrument names. Each is +// referenced twice — once to register the explicit-bucket view and once +// to create the instrument — so they are named constants to keep the two +// sites in sync (a mismatch would silently drop the bucket override). +constexpr char kJobQueuedDurationUs[] = "xrpld_job_queued_duration_us"; +constexpr char kJobRunningDurationUs[] = "xrpld_job_running_duration_us"; +constexpr char kRpcMethodDurationUs[] = "xrpld_rpc_method_duration_us"; + +/** Register an explicit-bucket histogram view for a microsecond-valued + * instrument. + * + * The SDK's default histogram buckets top out at 10,000 (10 ms when the + * values are microseconds), so any duration above 10 ms saturates and + * every quantile reads as 10 ms. 
Job wait/run times and RPC latencies + * routinely exceed that, so we install boundaries spanning 100 µs to + * 60 s to capture the real distribution. + * + * @param views The registry to add the view to. + * @param name Instrument name to match (e.g. "xrpld_job_running_duration_us"). + */ +void +addMicrosecondHistogramView(metric_sdk::ViewRegistry& views, std::string const& name) +{ + // Boundaries in microseconds: 100µs, 500µs, 1ms, 5ms, 10ms, 25ms, 50ms, + // 100ms, 250ms, 500ms, 1s, 2.5s, 5s, 10s, 30s, 60s. Covers sub-millisecond + // jobs through multi-second stalls without saturating. + auto config = std::make_shared(); + config->boundaries_ = { + 100.0, + 500.0, + 1'000.0, + 5'000.0, + 10'000.0, + 25'000.0, + 50'000.0, + 100'000.0, + 250'000.0, + 500'000.0, + 1'000'000.0, + 2'500'000.0, + 5'000'000.0, + 10'000'000.0, + 30'000'000.0, + 60'000'000.0}; + + auto selector = metric_sdk::InstrumentSelectorFactory::Create( + metric_sdk::InstrumentType::kHistogram, name, ""); + auto meterSelector = metric_sdk::MeterSelectorFactory::Create("xrpld", "1.0.0", ""); + auto view = + metric_sdk::ViewFactory::Create(name, "", metric_sdk::AggregationType::kHistogram, config); + + views.AddView(std::move(selector), std::move(meterSelector), std::move(view)); +} + +} // namespace + #endif // XRPL_ENABLE_TELEMETRY namespace xrpl::telemetry { @@ -124,9 +187,16 @@ MetricsRegistry::start(std::string const& endpoint, std::string const& instanceI attrs[opentelemetry::semconv::service::kServiceInstanceId] = instanceId; auto resourceAttrs = resource::Resource::Create(attrs); + // Build a view registry with explicit microsecond buckets for the + // duration histograms. Without this they use the SDK default buckets + // (max 10,000 = 10 ms), saturating every quantile at 10 ms. 
+ auto views = std::make_unique(); + addMicrosecondHistogramView(*views, kJobQueuedDurationUs); + addMicrosecondHistogramView(*views, kJobRunningDurationUs); + addMicrosecondHistogramView(*views, kRpcMethodDurationUs); + // Create MeterProvider with resource, then attach the metric reader. - provider_ = metric_sdk::MeterProviderFactory::Create( - std::make_unique(), resourceAttrs); + provider_ = metric_sdk::MeterProviderFactory::Create(std::move(views), resourceAttrs); provider_->AddMetricReader(std::move(reader)); // Get a meter for all xrpld instruments. @@ -142,7 +212,7 @@ MetricsRegistry::start(std::string const& endpoint, std::string const& instanceI rpcErroredCounter_ = meter_->CreateUInt64Counter( "xrpld_rpc_method_errored_total", "Total RPC method calls that errored"); rpcDurationHistogram_ = meter_->CreateDoubleHistogram( - "xrpld_rpc_method_duration_us", "RPC method execution time in microseconds"); + kRpcMethodDurationUs, "RPC method execution time in microseconds"); // Job queue per-type counters and histograms. jobQueuedCounter_ = @@ -152,9 +222,9 @@ MetricsRegistry::start(std::string const& endpoint, std::string const& instanceI jobFinishedCounter_ = meter_->CreateUInt64Counter("xrpld_job_finished_total", "Total jobs completed"); jobQueuedDurationHistogram_ = meter_->CreateDoubleHistogram( - "xrpld_job_queued_duration_us", "Time jobs spent waiting in the queue (microseconds)"); - jobRunningDurationHistogram_ = meter_->CreateDoubleHistogram( - "xrpld_job_running_duration_us", "Job execution time in microseconds"); + kJobQueuedDurationUs, "Time jobs spent waiting in the queue (microseconds)"); + jobRunningDurationHistogram_ = + meter_->CreateDoubleHistogram(kJobRunningDurationUs, "Job execution time in microseconds"); // --- External dashboard parity counters (Task 7.14) --- ledgersClosedCounter_ = meter_->CreateUInt64Counter(