From 9289cb671d767f38e79a77922a4f5af49ae61683 Mon Sep 17 00:00:00 2001 From: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> Date: Tue, 10 Mar 2026 16:20:54 +0000 Subject: [PATCH] Phase 9: Internal Metric Instrumentation Gap Fill (Tasks 9.1-9.10) Implement ~50 OTel metrics covering NodeStore I/O, cache hit rates, TxQ state, PerfLog per-RPC/per-job counters, CountedObject instances, and load factor breakdown via MetricsRegistry. Core implementation: - MetricsRegistry class with synchronous instruments (Counter, Histogram) for RPC and Job metrics, and ObservableGauge callbacks for cache, TxQ, CountedObject, LoadFactor, and NodeStore state polling. - ServiceRegistry extended with getMetricsRegistry() virtual method. - Application wires MetricsRegistry lifecycle (create/start/stop). - PerfLogImp instrumented to emit OTel metrics on RPC and Job events. Dashboards & observability: - 3 new Grafana dashboards: RPC Performance, Job Queue, Fee Market/TxQ. - Extended statsd-node-health dashboard with NodeStore, Cache, and CountedObject panels. - 10 alerting rules added to telemetry-runbook.md. - Integration test extended with 12 OTel metric validation checks. Documentation: - 09-data-collection-reference.md updated with Phase 9 metric tables. - Unit tests for MetricsRegistry disabled-path (no-op) behavior. All OTel SDK code guarded with #ifdef XRPL_ENABLE_TELEMETRY. Co-Authored-By: Claude Opus 4.6 --- .../09-data-collection-reference.md | 124 ++++- .../dashboards/rippled-fee-market.json | 317 ++++++++++++ .../grafana/dashboards/rippled-job-queue.json | 324 ++++++++++++ .../grafana/dashboards/rippled-rpc-perf.json | 333 ++++++++++++ .../dashboards/system-node-health.json | 310 +++++++++++- docker/telemetry/integration-test.sh | 48 ++ docs/telemetry-runbook.md | 68 ++- include/xrpl/core/ServiceRegistry.h | 9 +- src/test/telemetry/MetricsRegistry_test.cpp | 374 ++++++++++++++ src/xrpld/app/main/Application.cpp | 29 ++ src/xrpld/perflog/detail/PerfLogImp.cpp | 44 +- src/xrpld/telemetry/MetricsRegistry.cpp | 473 ++++++++++++++++++ src/xrpld/telemetry/MetricsRegistry.h | 280 +++++++++++ 13 files changed, 2722 insertions(+), 11 deletions(-) create mode 100644 docker/telemetry/grafana/dashboards/rippled-fee-market.json create mode 100644 docker/telemetry/grafana/dashboards/rippled-job-queue.json create mode 100644 docker/telemetry/grafana/dashboards/rippled-rpc-perf.json create mode 100644 src/test/telemetry/MetricsRegistry_test.cpp create mode 100644 src/xrpld/telemetry/MetricsRegistry.cpp create mode 100644 src/xrpld/telemetry/MetricsRegistry.h diff --git a/OpenTelemetryPlan/09-data-collection-reference.md b/OpenTelemetryPlan/09-data-collection-reference.md index 587cafec73..0f897c360b 100644 --- a/OpenTelemetryPlan/09-data-collection-reference.md +++ b/OpenTelemetryPlan/09-data-collection-reference.md @@ -11,6 +11,7 @@ graph LR subgraph rippledNode["rippled Node"] A["Trace Macros
XRPL_TRACE_SPAN
(OTLP/HTTP exporter)"] B["beast::insight
OTel native metrics
(OTLP/HTTP exporter)"] + C["MetricsRegistry
OTel SDK metrics
(OTLP/HTTP exporter)"] end subgraph collector["OTel Collector :4317 / :4318"] @@ -33,11 +34,12 @@ graph LR end subgraph viz["Visualization"] - F["Grafana :3000
10 dashboards"] + F["Grafana :3000
13 dashboards"] end A -->|"OTLP/HTTP :4318
(traces + attributes)"| R1 B -->|"OTLP/HTTP :4318
(gauges, counters, histograms)"| R1 + C -->|"OTLP/HTTP :4318
(counters, histograms,
observable gauges)"| R1 BP -->|"OTLP/gRPC :4317"| D BP -->|"OTLP/gRPC"| T @@ -751,6 +753,126 @@ Phase 11 builds a custom OTel Collector receiver (Go) that polls rippled's admin | `xrpl_orderbook_ask_depth` | Gauge | `pair=""` | Total ask volume | | `xrpl_orderbook_spread` | Gauge | `pair=""` | Best bid-ask spread | +### Phase 9: OTel SDK-Exported Metrics (MetricsRegistry) + +Phase 9 introduces the `MetricsRegistry` class (`src/xrpld/telemetry/MetricsRegistry.h/.cpp`) +which registers metrics directly with the OpenTelemetry Metrics SDK. These are exported +via OTLP/HTTP to the OTel Collector and scraped by Prometheus. + +#### NodeStore I/O (Observable Gauge — `nodestore_state`) + +| Prometheus Metric | Type | Labels | Description | +| ------------------------------------------------------ | ----- | -------- | ------------------------------------ | +| `rippled_nodestore_state{metric="node_reads_total"}` | Gauge | `metric` | Cumulative NodeStore read operations | +| `rippled_nodestore_state{metric="node_reads_hit"}` | Gauge | `metric` | Reads served from cache | +| `rippled_nodestore_state{metric="node_writes"}` | Gauge | `metric` | Cumulative write operations | +| `rippled_nodestore_state{metric="node_written_bytes"}` | Gauge | `metric` | Cumulative bytes written | +| `rippled_nodestore_state{metric="node_read_bytes"}` | Gauge | `metric` | Cumulative bytes read | +| `rippled_nodestore_state{metric="write_load"}` | Gauge | `metric` | Current write load score | +| `rippled_nodestore_state{metric="read_queue"}` | Gauge | `metric` | Items in read prefetch queue | + +#### Cache Hit Rates & Sizes (Observable Gauge — `cache_metrics`) + +| Prometheus Metric | Type | Labels | Description | +| ----------------------------------------------------- | ----- | -------- | ----------------------------- | +| `rippled_cache_metrics{metric="SLE_hit_rate"}` | Gauge | `metric` | SLE cache hit rate (0.0-1.0) | +| `rippled_cache_metrics{metric="ledger_hit_rate"}` | Gauge | `metric` | Ledger cache hit rate | +| `rippled_cache_metrics{metric="AL_hit_rate"}` | Gauge | `metric` | AcceptedLedger cache hit rate | +| `rippled_cache_metrics{metric="treenode_cache_size"}` | Gauge | `metric` | SHAMap TreeNode cache entries | +| `rippled_cache_metrics{metric="treenode_track_size"}` | Gauge | `metric` | Tracked tree nodes | +| `rippled_cache_metrics{metric="fullbelow_size"}` | Gauge | `metric` | FullBelow cache entries | + +#### Transaction Queue (Observable Gauge — `txq_metrics`) + +| Prometheus Metric | Type | Labels | Description | +| ------------------------------------------------------------ | ----- | -------- | -------------------------------- | +| `rippled_txq_metrics{metric="txq_count"}` | Gauge | `metric` | Transactions currently in queue | +| `rippled_txq_metrics{metric="txq_max_size"}` | Gauge | `metric` | Maximum queue capacity | +| `rippled_txq_metrics{metric="txq_in_ledger"}` | Gauge | `metric` | Transactions in open ledger | +| `rippled_txq_metrics{metric="txq_per_ledger"}` | Gauge | `metric` | Expected transactions per ledger | +| `rippled_txq_metrics{metric="txq_reference_fee_level"}` | Gauge | `metric` | Reference fee level | +| `rippled_txq_metrics{metric="txq_min_processing_fee_level"}` | Gauge | `metric` | Minimum fee to get processed | +| `rippled_txq_metrics{metric="txq_med_fee_level"}` | Gauge | `metric` | Median fee level in queue | +| `rippled_txq_metrics{metric="txq_open_ledger_fee_level"}` | Gauge | `metric` | Open ledger fee escalation level | + +#### Per-RPC Method Metrics (Synchronous Counters/Histogram) + +| Prometheus Metric | Type | Labels | Description | +| ----------------------------------- | --------- | ----------------- | -------------------------------- | +| `rippled_rpc_method_started_total` | Counter | `method=""` | RPC calls started | +| `rippled_rpc_method_finished_total` | Counter | `method=""` | RPC calls completed successfully | +| `rippled_rpc_method_errored_total` | Counter | `method=""` | RPC calls that errored | +| `rippled_rpc_method_duration_us` | Histogram | `method=""` | Execution time distribution (us) | + +#### Per-Job-Type Metrics (Synchronous Counters/Histogram) + +| Prometheus Metric | Type | Labels | Description | +| --------------------------------- | --------- | ------------------- | --------------------------------- | +| `rippled_job_queued_total` | Counter | `job_type=""` | Jobs enqueued | +| `rippled_job_started_total` | Counter | `job_type=""` | Jobs started | +| `rippled_job_finished_total` | Counter | `job_type=""` | Jobs completed | +| `rippled_job_queued_duration_us` | Histogram | `job_type=""` | Queue wait time distribution (us) | +| `rippled_job_running_duration_us` | Histogram | `job_type=""` | Execution time distribution (us) | + +#### Counted Object Instances (Observable Gauge — `object_count`) + +| Prometheus Metric | Type | Labels | Description | +| ---------------------------------------------- | ----- | --------------- | ------------------------------ | +| `rippled_object_count{type="Transaction"}` | Gauge | `type=""` | Live Transaction objects | +| `rippled_object_count{type="Ledger"}` | Gauge | `type=""` | Live Ledger objects | +| `rippled_object_count{type="NodeObject"}` | Gauge | `type=""` | Live NodeObject instances | +| `rippled_object_count{type="STTx"}` | Gauge | `type=""` | Serialized transaction objects | +| `rippled_object_count{type="STLedgerEntry"}` | Gauge | `type=""` | Serialized ledger entries | +| `rippled_object_count{type="InboundLedger"}` | Gauge | `type=""` | Ledgers being fetched | +| `rippled_object_count{type="Pathfinder"}` | Gauge | `type=""` | Active pathfinding operations | +| `rippled_object_count{type="PathRequest"}` | Gauge | `type=""` | Active path requests | +| `rippled_object_count{type="HashRouterEntry"}` | Gauge | `type=""` | Hash router entries | + +#### Load Factor Breakdown (Observable Gauge — `load_factor_metrics`) + +| Prometheus Metric | Type | Labels | Description | +| ------------------------------------------------------------------ | ----- | -------- | --------------------------------------- | +| `rippled_load_factor_metrics{metric="load_factor"}` | Gauge | `metric` | Combined transaction cost multiplier | +| `rippled_load_factor_metrics{metric="load_factor_server"}` | Gauge | `metric` | Server + cluster + network contribution | +| `rippled_load_factor_metrics{metric="load_factor_local"}` | Gauge | `metric` | Local server load only | +| `rippled_load_factor_metrics{metric="load_factor_net"}` | Gauge | `metric` | Network-wide load estimate | +| `rippled_load_factor_metrics{metric="load_factor_cluster"}` | Gauge | `metric` | Cluster peer load | +| `rippled_load_factor_metrics{metric="load_factor_fee_escalation"}` | Gauge | `metric` | Open ledger fee escalation | +| `rippled_load_factor_metrics{metric="load_factor_fee_queue"}` | Gauge | `metric` | Queue entry fee level | + +#### Prometheus Query Examples (Phase 9) + +```promql +# NodeStore cache hit ratio +rippled_nodestore_state{metric="node_reads_hit"} / rippled_nodestore_state{metric="node_reads_total"} + +# RPC error rate for server_info +rate(rippled_rpc_method_errored_total{method="server_info"}[5m]) + +# Job queue wait time p95 +histogram_quantile(0.95, sum by (le) (rate(rippled_job_queued_duration_us_bucket[5m]))) + +# TxQ utilization percentage +rippled_txq_metrics{metric="txq_count"} / rippled_txq_metrics{metric="txq_max_size"} + +# High load factor alert candidate +rippled_load_factor_metrics{metric="load_factor"} > 5 +``` + +### New Grafana Dashboards (Phase 9) + +| Dashboard | UID | Data Source | Key Panels | +| ---------------------- | -------------------- | ----------- | --------------------------------------------------------- | +| Fee Market & TxQ | `rippled-fee-market` | Prometheus | TxQ depth/capacity, fee levels, load factor breakdown | +| Job Queue Analysis | `rippled-job-queue` | Prometheus | Per-job rates, queue wait times, execution times | +| RPC Performance (OTel) | `rippled-rpc-perf` | Prometheus | Per-method call rates, error rates, latency distributions | + +### Updated Grafana Dashboards (Phase 9) + +| Dashboard | UID | New Panels Added | +| -------------------- | ---------------------------- | ------------------------------------------------------ | +| Node Health (StatsD) | `rippled-statsd-node-health` | NodeStore I/O, cache hit rates, object instance counts | + ### New Grafana Dashboards (Phase 11) | Dashboard | UID | Data Source | Key Panels | diff --git a/docker/telemetry/grafana/dashboards/rippled-fee-market.json b/docker/telemetry/grafana/dashboards/rippled-fee-market.json new file mode 100644 index 0000000000..7ff6dd65c3 --- /dev/null +++ b/docker/telemetry/grafana/dashboards/rippled-fee-market.json @@ -0,0 +1,317 @@ +{ + "annotations": { + "list": [] + }, + "description": "Fee market dynamics: TxQ depth/capacity, fee escalation levels, and load factor breakdown. Sourced from OTel MetricsRegistry observable gauges (Phase 9).", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [], + "panels": [ + { + "title": "Transaction Queue Depth", + "description": "Current number of transactions waiting in the queue vs. maximum capacity. Sourced from MetricsRegistry txq_metrics observable gauge with metric=txq_count and metric=txq_max_size.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_txq_metrics{metric=\"txq_count\"}", + "legendFormat": "Queue Depth" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_txq_metrics{metric=\"txq_max_size\"}", + "legendFormat": "Max Capacity" + } + ], + "fieldConfig": { + "defaults": { + "unit": "short", + "custom": { + "drawStyle": "line", + "lineWidth": 2, + "fillOpacity": 10 + }, + "color": { + "mode": "palette-classic" + } + }, + "overrides": [] + } + }, + { + "title": "Transactions Per Ledger", + "description": "Transactions in the current open ledger vs. expected per-ledger count. Sourced from txq_metrics with metric=txq_in_ledger and metric=txq_per_ledger.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_txq_metrics{metric=\"txq_in_ledger\"}", + "legendFormat": "In Ledger" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_txq_metrics{metric=\"txq_per_ledger\"}", + "legendFormat": "Expected Per Ledger" + } + ], + "fieldConfig": { + "defaults": { + "unit": "short", + "custom": { + "drawStyle": "line", + "lineWidth": 2, + "fillOpacity": 10 + }, + "color": { + "mode": "palette-classic" + } + }, + "overrides": [] + } + }, + { + "title": "Fee Escalation Levels", + "description": "Fee levels that control transaction queue admission. Reference fee level is the baseline; open ledger fee level triggers escalation. Sourced from txq_metrics observable gauge.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 8 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_txq_metrics{metric=\"txq_reference_fee_level\"}", + "legendFormat": "Reference Fee Level" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_txq_metrics{metric=\"txq_min_processing_fee_level\"}", + "legendFormat": "Min Processing Fee Level" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_txq_metrics{metric=\"txq_med_fee_level\"}", + "legendFormat": "Median Fee Level" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_txq_metrics{metric=\"txq_open_ledger_fee_level\"}", + "legendFormat": "Open Ledger Fee Level" + } + ], + "fieldConfig": { + "defaults": { + "unit": "short", + "custom": { + "drawStyle": "line", + "lineWidth": 2, + "fillOpacity": 5, + "scaleDistribution": { + "type": "log", + "log": 2 + } + }, + "color": { + "mode": "palette-classic" + } + }, + "overrides": [] + } + }, + { + "title": "Load Factor Breakdown", + "description": "Decomposed load factor components: server (max of local, net, cluster), fee escalation, fee queue, and combined. Values are unitless multipliers where 1.0 = no load. Sourced from load_factor_metrics observable gauge.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 16 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_load_factor_metrics{metric=\"load_factor\"}", + "legendFormat": "Combined Load Factor" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_load_factor_metrics{metric=\"load_factor_server\"}", + "legendFormat": "Server" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_load_factor_metrics{metric=\"load_factor_fee_escalation\"}", + "legendFormat": "Fee Escalation" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_load_factor_metrics{metric=\"load_factor_fee_queue\"}", + "legendFormat": "Fee Queue" + } + ], + "fieldConfig": { + "defaults": { + "unit": "short", + "custom": { + "drawStyle": "line", + "lineWidth": 2, + "fillOpacity": 5 + }, + "color": { + "mode": "palette-classic" + }, + "thresholds": { + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 2 + }, + { + "color": "red", + "value": 10 + } + ] + } + }, + "overrides": [] + } + }, + { + "title": "Load Factor Components", + "description": "Individual load factor contributors: local server load, network load, and cluster load. Only differ from 1.0 under load conditions. Sourced from load_factor_metrics observable gauge.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 16 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_load_factor_metrics{metric=\"load_factor_local\"}", + "legendFormat": "Local" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_load_factor_metrics{metric=\"load_factor_net\"}", + "legendFormat": "Network" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_load_factor_metrics{metric=\"load_factor_cluster\"}", + "legendFormat": "Cluster" + } + ], + "fieldConfig": { + "defaults": { + "unit": "short", + "custom": { + "drawStyle": "line", + "lineWidth": 2, + "fillOpacity": 5 + }, + "color": { + "mode": "palette-classic" + } + }, + "overrides": [] + } + } + ], + "schemaVersion": 39, + "tags": ["rippled", "otel", "fee-market"], + "templating": { + "list": [] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "rippled - Fee Market & TxQ", + "uid": "rippled-fee-market", + "version": 1 +} diff --git a/docker/telemetry/grafana/dashboards/rippled-job-queue.json b/docker/telemetry/grafana/dashboards/rippled-job-queue.json new file mode 100644 index 0000000000..1f3a30ca75 --- /dev/null +++ b/docker/telemetry/grafana/dashboards/rippled-job-queue.json @@ -0,0 +1,324 @@ +{ + "annotations": { + "list": [] + }, + "description": "Job queue analysis: per-job-type throughput rates, queue wait times, and execution times. Sourced from OTel MetricsRegistry synchronous counters and histograms (Phase 9).", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [], + "panels": [ + { + "title": "Job Throughput Rate (per second)", + "description": "Rate of jobs queued, started, and finished across all job types. Computed as rate() over the OTel counter values. High queue rates with low finish rates indicate backlog.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 0 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum(rate(rippled_job_queued_total[5m]))", + "legendFormat": "Queued/s" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum(rate(rippled_job_started_total[5m]))", + "legendFormat": "Started/s" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum(rate(rippled_job_finished_total[5m]))", + "legendFormat": "Finished/s" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "custom": { + "drawStyle": "line", + "lineWidth": 2, + "fillOpacity": 10 + }, + "color": { + "mode": "palette-classic" + } + }, + "overrides": [] + } + }, + { + "title": "Per-Job-Type Queued Rate", + "description": "Rate of jobs queued broken down by job_type label. Identifies which job types contribute most to queue activity.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + }, + "legend": { + "displayMode": "table", + "placement": "right", + "calcs": ["mean", "max"] + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "topk(10, rate(rippled_job_queued_total[5m]))", + "legendFormat": "{{job_type}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "custom": { + "drawStyle": "line", + "lineWidth": 1, + "fillOpacity": 5 + }, + "color": { + "mode": "palette-classic" + } + }, + "overrides": [] + } + }, + { + "title": "Per-Job-Type Finish Rate", + "description": "Rate of jobs completing broken down by job_type. Compare with queued rate to identify backlog per type.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + }, + "legend": { + "displayMode": "table", + "placement": "right", + "calcs": ["mean", "max"] + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "topk(10, rate(rippled_job_finished_total[5m]))", + "legendFormat": "{{job_type}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "custom": { + "drawStyle": "line", + "lineWidth": 1, + "fillOpacity": 5 + }, + "color": { + "mode": "palette-classic" + } + }, + "overrides": [] + } + }, + { + "title": "Job Queue Wait Time (p50, p95, p99)", + "description": "Histogram quantiles for time jobs spend waiting in the queue before execution starts. High values indicate thread pool saturation.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 16 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "histogram_quantile(0.50, sum by (le) (rate(rippled_job_queued_duration_us_bucket[5m])))", + "legendFormat": "p50" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "histogram_quantile(0.95, sum by (le) (rate(rippled_job_queued_duration_us_bucket[5m])))", + "legendFormat": "p95" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "histogram_quantile(0.99, sum by (le) (rate(rippled_job_queued_duration_us_bucket[5m])))", + "legendFormat": "p99" + } + ], + "fieldConfig": { + "defaults": { + "unit": "us", + "custom": { + "drawStyle": "line", + "lineWidth": 2, + "fillOpacity": 5 + }, + "color": { + "mode": "palette-classic" + } + }, + "overrides": [] + } + }, + { + "title": "Job Execution Time (p50, p95, p99)", + "description": "Histogram quantiles for actual job execution time. High values indicate expensive operations or resource contention.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 16 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "histogram_quantile(0.50, sum by (le) (rate(rippled_job_running_duration_us_bucket[5m])))", + "legendFormat": "p50" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "histogram_quantile(0.95, sum by (le) (rate(rippled_job_running_duration_us_bucket[5m])))", + "legendFormat": "p95" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "histogram_quantile(0.99, sum by (le) (rate(rippled_job_running_duration_us_bucket[5m])))", + "legendFormat": "p99" + } + ], + "fieldConfig": { + "defaults": { + "unit": "us", + "custom": { + "drawStyle": "line", + "lineWidth": 2, + "fillOpacity": 5 + }, + "color": { + "mode": "palette-classic" + } + }, + "overrides": [] + } + }, + { + "title": "Per-Job-Type Execution Time (p95)", + "description": "95th percentile execution time broken down by job type. Identifies the slowest job types.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 24 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + }, + "legend": { + "displayMode": "table", + "placement": "right", + "calcs": ["mean", "max"] + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "topk(10, histogram_quantile(0.95, sum by (le, job_type) (rate(rippled_job_running_duration_us_bucket[5m]))))", + "legendFormat": "{{job_type}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "us", + "custom": { + "drawStyle": "line", + "lineWidth": 1, + "fillOpacity": 5 + }, + "color": { + "mode": "palette-classic" + } + }, + "overrides": [] + } + } + ], + "schemaVersion": 39, + "tags": ["rippled", "otel", "job-queue"], + "templating": { + "list": [] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "rippled - Job Queue Analysis", + "uid": "rippled-job-queue", + "version": 1 +} diff --git a/docker/telemetry/grafana/dashboards/rippled-rpc-perf.json b/docker/telemetry/grafana/dashboards/rippled-rpc-perf.json new file mode 100644 index 0000000000..d26eae7a6f --- /dev/null +++ b/docker/telemetry/grafana/dashboards/rippled-rpc-perf.json @@ -0,0 +1,333 @@ +{ + "annotations": { + "list": [] + }, + "description": "Per-RPC-method performance: call rates, error rates, and latency distributions. Sourced from OTel MetricsRegistry synchronous counters and histograms (Phase 9).", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [], + "panels": [ + { + "title": "RPC Call Rate (all methods)", + "description": "Aggregate rate of RPC calls started, finished, and errored across all methods. Computed as rate() over OTel counters.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 0 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum(rate(rippled_rpc_method_started_total[5m]))", + "legendFormat": "Started/s" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum(rate(rippled_rpc_method_finished_total[5m]))", + "legendFormat": "Finished/s" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum(rate(rippled_rpc_method_errored_total[5m]))", + "legendFormat": "Errored/s" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "custom": { + "drawStyle": "line", + "lineWidth": 2, + "fillOpacity": 10 + }, + "color": { + "mode": "palette-classic" + } + }, + "overrides": [] + } + }, + { + "title": "Per-Method Call Rate (Top 10)", + "description": "Per-method RPC call rate, showing the 10 most active methods. Useful for identifying hot paths.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + }, + "legend": { + "displayMode": "table", + "placement": "right", + "calcs": ["mean", "max"] + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "topk(10, rate(rippled_rpc_method_started_total[5m]))", + "legendFormat": "{{method}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "custom": { + "drawStyle": "line", + "lineWidth": 1, + "fillOpacity": 5 + }, + "color": { + "mode": "palette-classic" + } + }, + "overrides": [] + } + }, + { + "title": "Per-Method Error Rate (Top 10)", + "description": "Per-method RPC error rate. Non-zero values warrant investigation. Common culprits: invalid parameters, resource exhaustion.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + }, + "legend": { + "displayMode": "table", + "placement": "right", + "calcs": ["mean", "max"] + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "topk(10, rate(rippled_rpc_method_errored_total[5m]))", + "legendFormat": "{{method}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "custom": { + "drawStyle": "line", + "lineWidth": 1, + "fillOpacity": 5 + }, + "color": { + "mode": "palette-classic" + } + }, + "overrides": [] + } + }, + { + "title": "RPC Latency (p50, p95, p99) - All Methods", + "description": "Histogram quantiles for RPC execution time across all methods. Sourced from rpc_method_duration_us histogram.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 16 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "histogram_quantile(0.50, sum by (le) (rate(rippled_rpc_method_duration_us_bucket[5m])))", + "legendFormat": "p50" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "histogram_quantile(0.95, sum by (le) (rate(rippled_rpc_method_duration_us_bucket[5m])))", + "legendFormat": "p95" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "histogram_quantile(0.99, sum by (le) (rate(rippled_rpc_method_duration_us_bucket[5m])))", + "legendFormat": "p99" + } + ], + "fieldConfig": { + "defaults": { + "unit": "us", + "custom": { + "drawStyle": "line", + "lineWidth": 2, + "fillOpacity": 5 + }, + "color": { + "mode": "palette-classic" + } + }, + "overrides": [] + } + }, + { + "title": "Per-Method Latency p95 (Top 10 Slowest)", + "description": "95th percentile execution time per method. Identifies the slowest RPC endpoints.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 16 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + }, + "legend": { + "displayMode": "table", + "placement": "right", + "calcs": ["mean", "max"] + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "topk(10, histogram_quantile(0.95, sum by (le, method) (rate(rippled_rpc_method_duration_us_bucket[5m]))))", + "legendFormat": "{{method}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "us", + "custom": { + "drawStyle": "line", + "lineWidth": 1, + "fillOpacity": 5 + }, + "color": { + "mode": "palette-classic" + } + }, + "overrides": [] + } + }, + { + "title": "RPC Error Ratio by Method", + "description": "Error ratio (errors / total started) per method. Values above 0.05 (5%) warrant investigation.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 24 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + }, + "legend": { + "displayMode": "table", + "placement": "right", + "calcs": ["mean", "max"] + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "topk(10, rate(rippled_rpc_method_errored_total[5m]) / (rate(rippled_rpc_method_started_total[5m]) > 0))", + "legendFormat": "{{method}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percentunit", + "min": 0, + "max": 1, + "custom": { + "drawStyle": "line", + "lineWidth": 1, + "fillOpacity": 5 + }, + "color": { + "mode": "palette-classic" + }, + "thresholds": { + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 0.05 + }, + { + "color": "red", + "value": 0.25 + } + ] + } + }, + "overrides": [] + } + } + ], + "schemaVersion": 39, + "tags": ["rippled", "otel", "rpc"], + "templating": { + "list": [] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "rippled - RPC Performance (OTel)", + "uid": "rippled-rpc-perf", + "version": 1 +} diff --git a/docker/telemetry/grafana/dashboards/system-node-health.json b/docker/telemetry/grafana/dashboards/system-node-health.json index 3fa987210c..0b46b45a6d 100644 --- a/docker/telemetry/grafana/dashboards/system-node-health.json +++ b/docker/telemetry/grafana/dashboards/system-node-health.json @@ -399,10 +399,318 @@ }, "overrides": [] } + }, + { + "title": "--- OTel: NodeStore I/O ---", + "type": "row", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 32 + }, + "collapsed": false, + "panels": [] + }, + { + "title": "NodeStore Read/Write Totals", + "description": "Cumulative NodeStore read and write operation counts. Sourced from MetricsRegistry nodestore_state observable gauge with metric=node_reads_total, node_writes, node_reads_hit.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 33 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_nodestore_state{metric=\"node_reads_total\"}", + "legendFormat": "Reads Total" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_nodestore_state{metric=\"node_reads_hit\"}", + "legendFormat": "Reads Hit (cache)" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_nodestore_state{metric=\"node_writes\"}", + "legendFormat": "Writes Total" + } + ], + "fieldConfig": { + "defaults": { + "unit": "short", + "custom": { + "drawStyle": "line", + "lineWidth": 2, + "fillOpacity": 10 + }, + "color": { + "mode": "palette-classic" + } + }, + "overrides": [] + } + }, + { + "title": "NodeStore Write Load & Read Queue", + "description": "Instantaneous write load score and read queue depth. High write load indicates backend pressure. High read queue indicates prefetch thread saturation.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 33 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_nodestore_state{metric=\"write_load\"}", + "legendFormat": "Write Load" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_nodestore_state{metric=\"read_queue\"}", + "legendFormat": "Read Queue" + } + ], + "fieldConfig": { + "defaults": { + "unit": "short", + "custom": { + "drawStyle": "line", + "lineWidth": 2, + "fillOpacity": 10 + }, + "color": { + "mode": "palette-classic" + }, + "thresholds": { + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 100 + }, + { + "color": "red", + "value": 1000 + } + ] + } + }, + "overrides": [] + } + }, + { + "title": "--- OTel: Cache Hit Rates ---", + "type": "row", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 41 + }, + "collapsed": false, + "panels": [] + }, + { + "title": "Cache Hit Rates", + "description": "Hit rates for SLE cache, Ledger cache, and AcceptedLedger cache. Values from 0.0 to 1.0. Low values indicate cache thrashing. Sourced from MetricsRegistry cache_metrics observable gauge.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 42 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_cache_metrics{metric=\"SLE_hit_rate\"}", + "legendFormat": "SLE Hit Rate" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_cache_metrics{metric=\"ledger_hit_rate\"}", + "legendFormat": "Ledger Hit Rate" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_cache_metrics{metric=\"AL_hit_rate\"}", + "legendFormat": "AcceptedLedger Hit Rate" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percentunit", + "min": 0, + "max": 1, + "custom": { + "drawStyle": "line", + "lineWidth": 2, + "fillOpacity": 10 + }, + "color": { + "mode": "palette-classic" + } + }, + "overrides": [] + } + }, + { + "title": "Cache Sizes", + "description": "TreeNode cache size, TreeNode track size, and FullBelow cache size. Sourced from MetricsRegistry cache_metrics observable gauge.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 42 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_cache_metrics{metric=\"treenode_cache_size\"}", + "legendFormat": "TreeNode Cache" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_cache_metrics{metric=\"treenode_track_size\"}", + "legendFormat": "TreeNode Track" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_cache_metrics{metric=\"fullbelow_size\"}", + "legendFormat": "FullBelow" + } + ], + "fieldConfig": { + "defaults": { + "unit": "short", + "custom": { + "drawStyle": "line", + "lineWidth": 2, + "fillOpacity": 10 + }, + "color": { + "mode": "palette-classic" + } + }, + "overrides": [] + } + }, + { + "title": "--- OTel: Object Instance Counts ---", + "type": "row", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 50 + }, + "collapsed": false, + "panels": [] + }, + { + "title": "Object Instance Counts", + "description": "Live instance counts for key internal object types tracked by CountedObject. Sourced from MetricsRegistry object_count observable gauge. High counts may indicate memory pressure or object leaks.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 51 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + }, + "legend": { + "displayMode": "table", + "placement": "right", + "calcs": ["last", "max"] + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "topk(15, rippled_object_count)", + "legendFormat": "{{type}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "short", + "custom": { + "drawStyle": "line", + "lineWidth": 1, + "fillOpacity": 5 + }, + "color": { + "mode": "palette-classic" + } + }, + "overrides": [] + } } ], "schemaVersion": 39, - "tags": ["rippled", "statsd", "node-health", "telemetry"], + "tags": ["rippled", "statsd", "otel", "node-health", "telemetry"], "templating": { "list": [ { diff --git a/docker/telemetry/integration-test.sh b/docker/telemetry/integration-test.sh index 5e6696a33c..09a0032c51 100755 --- a/docker/telemetry/integration-test.sh +++ b/docker/telemetry/integration-test.sh @@ -353,6 +353,7 @@ trace_transactions=1 trace_consensus=1 trace_peer=1 trace_ledger=1 +metrics_endpoint=http://localhost:4318/v1/metrics [insight] server=otel @@ -636,6 +637,53 @@ else fail "StatsD port 8125 appears to be listening (should not be needed)" fi +# --------------------------------------------------------------------------- +# Step 10c: Verify Phase 9 OTel SDK Metrics +# --------------------------------------------------------------------------- +log "" +log "--- Phase 9: OTel SDK Metrics (MetricsRegistry) ---" +log "Waiting 15s for OTel metric export + Prometheus scrape..." +sleep 15 + +check_otel_metric() { + local metric_name="$1" + local result + result=$(curl -sf "$PROM/api/v1/query?query=$metric_name" \ + | jq '.data.result | length' 2>/dev/null || echo 0) + if [ "$result" -gt 0 ]; then + ok "OTel: $metric_name ($result series)" + else + fail "OTel: $metric_name (0 series)" + fi +} + +# Task 9.1: NodeStore I/O +check_otel_metric 'rippled_nodestore_state{metric="node_reads_total"}' +check_otel_metric 'rippled_nodestore_state{metric="write_load"}' + +# Task 9.2: Cache hit rates +check_otel_metric 'rippled_cache_metrics{metric="SLE_hit_rate"}' +check_otel_metric 'rippled_cache_metrics{metric="treenode_cache_size"}' + +# Task 9.3: TxQ metrics +check_otel_metric 'rippled_txq_metrics{metric="txq_count"}' +check_otel_metric 'rippled_txq_metrics{metric="txq_reference_fee_level"}' + +# Task 9.4: Per-RPC metrics +check_otel_metric "rippled_rpc_method_started_total" +check_otel_metric "rippled_rpc_method_finished_total" + +# Task 9.5: Per-job metrics +check_otel_metric "rippled_job_queued_total" +check_otel_metric "rippled_job_finished_total" + +# Task 9.6: Counted object instances +check_otel_metric "rippled_object_count" + +# Task 9.7: Load factor breakdown +check_otel_metric 'rippled_load_factor_metrics{metric="load_factor"}' +check_otel_metric 'rippled_load_factor_metrics{metric="load_factor_server"}' + # --------------------------------------------------------------------------- # Step 11: Summary # --------------------------------------------------------------------------- diff --git a/docs/telemetry-runbook.md b/docs/telemetry-runbook.md index b47487b14e..b5a5acdb0e 100644 --- a/docs/telemetry-runbook.md +++ b/docs/telemetry-runbook.md @@ -231,7 +231,7 @@ When using StatsD, uncomment the `statsd` receiver in `otel-collector-config.yam ## Grafana Dashboards -Eight dashboards are pre-provisioned in `docker/telemetry/grafana/dashboards/`: +Thirteen dashboards are pre-provisioned in `docker/telemetry/grafana/dashboards/`: ### RPC Performance (`rippled-rpc-perf`) @@ -403,8 +403,74 @@ count_over_time({job="rippled"} |= "trace_id=" [5m]) 4. Open Grafana at http://localhost:3000 -> Explore -> Loki and search for `{job="rippled"} |= "trace_id="`. 5. Click the TraceID link to navigate to the corresponding trace in Tempo. +## Phase 9: OTel Metrics Alerting Rules + +The following alerting rules are recommended for the Phase 9 OTel SDK metrics. +Add to your Prometheus alerting rules configuration. + +### NodeStore + +| Alert Name | Severity | Condition | For | Description | +| --------------------------- | -------- | ---------------------------------------------------- | --- | ------------------------------------------------------- | +| `NodeStoreHighWriteLoad` | Warning | `rippled_nodestore_state{metric="write_load"} > 100` | 5m | NodeStore backend is under sustained write pressure | +| `NodeStoreReadQueueBacklog` | Warning | `rippled_nodestore_state{metric="read_queue"} > 500` | 5m | Prefetch thread pool is saturated; reads are backing up | + +### Cache + +| Alert Name | Severity | Condition | For | Description | +| ----------------------- | -------- | ------------------------------------------------------- | --- | ------------------------------------------------------ | +| `SLECacheHitRateLow` | Warning | `rippled_cache_metrics{metric="SLE_hit_rate"} < 0.5` | 10m | SLE cache is thrashing; consider increasing cache size | +| `LedgerCacheHitRateLow` | Warning | `rippled_cache_metrics{metric="ledger_hit_rate"} < 0.5` | 10m | Ledger cache hit rate is degraded | + +### Transaction Queue + +| Alert Name | Severity | Condition | For | Description | +| ---------------------- | -------- | ---------------------------------------------------------------------------------------------------------------------- | --- | -------------------------------------------------- | +| `TxQNearCapacity` | Warning | `rippled_txq_metrics{metric="txq_count"} / rippled_txq_metrics{metric="txq_max_size"} > 0.8` | 5m | TxQ is >80% full; transactions may be rejected | +| `TxQHighFeeEscalation` | Warning | `rippled_txq_metrics{metric="txq_open_ledger_fee_level"} / rippled_txq_metrics{metric="txq_reference_fee_level"} > 10` | 5m | Fee escalation is 10x above reference; high demand | + +### Load Factor + +| Alert Name | Severity | Condition | For | Description | +| --------------------- | -------- | -------------------------------------------------------------- | --- | -------------------------------------------------------------- | +| `HighLoadFactor` | Warning | `rippled_load_factor_metrics{metric="load_factor"} > 5` | 10m | Combined load factor is elevated; transactions cost 5x+ normal | +| `HighLocalLoadFactor` | Critical | `rippled_load_factor_metrics{metric="load_factor_local"} > 10` | 5m | Local server load is critically elevated | + +### RPC Performance + +| Alert Name | Severity | Condition | For | Description | +| ------------------ | -------- | ---------------------------------------------------------------------------------------------------------- | --- | --------------------------------- | +| `HighRPCErrorRate` | Warning | `sum(rate(rippled_rpc_method_errored_total[5m])) / sum(rate(rippled_rpc_method_started_total[5m])) > 0.05` | 5m | >5% of RPC calls are erroring | +| `SlowRPCLatency` | Warning | `histogram_quantile(0.95, sum by (le) (rate(rippled_rpc_method_duration_us_bucket[5m]))) > 5000000` | 5m | RPC p95 latency exceeds 5 seconds | + +### Job Queue + +| Alert Name | Severity | Condition | For | Description | +| ------------------ | -------- | ----------------------------------------------------------------------------------------------------- | --- | ---------------------------------------------------- | +| `JobQueueBacklog` | Warning | `sum(rate(rippled_job_queued_total[5m])) - sum(rate(rippled_job_finished_total[5m])) > 100` | 5m | Jobs are being queued faster than they're completing | +| `SlowJobExecution` | Warning | `histogram_quantile(0.95, sum by (le) (rate(rippled_job_running_duration_us_bucket[5m]))) > 10000000` | 5m | Job execution p95 exceeds 10 seconds | + ## Troubleshooting +### No OTel SDK metrics in Prometheus + +1. Verify `enabled=1` in the `[telemetry]` config section +2. Check that `metrics_endpoint` points to the OTel Collector's HTTP receiver + (default: `http://localhost:4318/v1/metrics`) +3. Check rippled logs for `MetricsRegistry: started successfully` message +4. Verify the OTel Collector is configured with an OTLP receiver and Prometheus exporter +5. Check Prometheus targets page for the collector scrape target + +### Cache hit rates are zero + +Cache hit rates may be zero during startup before caches are warmed. Wait for the +node to reach `Full` operating mode and process several ledgers before investigating. + +### NodeStore I/O counters not incrementing + +NodeStore counters are cumulative and may appear flat if the node is idle. Submit +some transactions or RPC requests to generate I/O activity. + ### No traces appearing in Jaeger 1. Check rippled logs for `Telemetry starting` message diff --git a/include/xrpl/core/ServiceRegistry.h b/include/xrpl/core/ServiceRegistry.h index 200b1bf59c..86f92ce774 100644 --- a/include/xrpl/core/ServiceRegistry.h +++ b/include/xrpl/core/ServiceRegistry.h @@ -21,7 +21,8 @@ class PerfLog; } namespace telemetry { class Telemetry; -} +class MetricsRegistry; +} // namespace telemetry // This is temporary until we migrate all code to use ServiceRegistry. class Application; @@ -211,6 +212,12 @@ public: virtual telemetry::Telemetry& getTelemetry() = 0; + /** Return the MetricsRegistry, or nullptr if telemetry is disabled. + Used by PerfLog and other hot paths to record OTel metrics. + */ + virtual telemetry::MetricsRegistry* + getMetricsRegistry() = 0; + // Configuration and state virtual bool isStopping() const = 0; diff --git a/src/test/telemetry/MetricsRegistry_test.cpp b/src/test/telemetry/MetricsRegistry_test.cpp new file mode 100644 index 0000000000..29877d4604 --- /dev/null +++ b/src/test/telemetry/MetricsRegistry_test.cpp @@ -0,0 +1,374 @@ +/** Unit tests for MetricsRegistry. + + Tests cover: + - Construction with telemetry disabled (no-op behavior). + - start()/stop() lifecycle when disabled. + - Synchronous instrument recording methods do not crash when disabled. + - Double stop() is safe. + + NOTE: Tests that exercise the OTel SDK path require XRPL_ENABLE_TELEMETRY + to be defined at build time (telemetry=ON). The no-op path tests run + unconditionally. +*/ + +#include + +#include +#include + +namespace xrpl { +namespace test { + +/** Minimal mock ServiceRegistry for MetricsRegistry testing. + + Only the getMetricsRegistry() call is used in the tests; other methods + are not invoked because the registry is disabled (enabled=false) so no + gauge callbacks execute. + + All pure virtual methods throw to catch accidental calls during tests. +*/ +class MockServiceRegistry : public ServiceRegistry +{ + [[noreturn]] void + throwUnimplemented() const + { + Throw("MockServiceRegistry: method not implemented"); + } + +public: + // ServiceRegistry interface — stubs that should never be called. + CollectorManager& + getCollectorManager() override + { + throwUnimplemented(); + } + Family& + getNodeFamily() override + { + throwUnimplemented(); + } + TimeKeeper& + timeKeeper() override + { + throwUnimplemented(); + } + JobQueue& + getJobQueue() override + { + throwUnimplemented(); + } + NodeCache& + getTempNodeCache() override + { + throwUnimplemented(); + } + CachedSLEs& + cachedSLEs() override + { + throwUnimplemented(); + } + NetworkIDService& + getNetworkIDService() override + { + throwUnimplemented(); + } + AmendmentTable& + getAmendmentTable() override + { + throwUnimplemented(); + } + HashRouter& + getHashRouter() override + { + throwUnimplemented(); + } + LoadFeeTrack& + getFeeTrack() override + { + throwUnimplemented(); + } + LoadManager& + getLoadManager() override + { + throwUnimplemented(); + } + RCLValidations& + getValidations() override + { + throwUnimplemented(); + } + ValidatorList& + validators() override + { + throwUnimplemented(); + } + ValidatorSite& + validatorSites() override + { + throwUnimplemented(); + } + ManifestCache& + validatorManifests() override + { + throwUnimplemented(); + } + ManifestCache& + publisherManifests() override + { + throwUnimplemented(); + } + Overlay& + overlay() override + { + throwUnimplemented(); + } + Cluster& + cluster() override + { + throwUnimplemented(); + } + PeerReservationTable& + peerReservations() override + { + throwUnimplemented(); + } + Resource::Manager& + getResourceManager() override + { + throwUnimplemented(); + } + NodeStore::Database& + getNodeStore() override + { + throwUnimplemented(); + } + SHAMapStore& + getSHAMapStore() override + { + throwUnimplemented(); + } + RelationalDatabase& + getRelationalDatabase() override + { + throwUnimplemented(); + } + InboundLedgers& + getInboundLedgers() override + { + throwUnimplemented(); + } + InboundTransactions& + getInboundTransactions() override + { + throwUnimplemented(); + } + TaggedCache& + getAcceptedLedgerCache() override + { + throwUnimplemented(); + } + LedgerMaster& + getLedgerMaster() override + { + throwUnimplemented(); + } + LedgerCleaner& + getLedgerCleaner() override + { + throwUnimplemented(); + } + LedgerReplayer& + getLedgerReplayer() override + { + throwUnimplemented(); + } + PendingSaves& + pendingSaves() override + { + throwUnimplemented(); + } + OpenLedger& + openLedger() override + { + throwUnimplemented(); + } + OpenLedger const& + openLedger() const override + { + throwUnimplemented(); + } + NetworkOPs& + getOPs() override + { + throwUnimplemented(); + } + OrderBookDB& + getOrderBookDB() override + { + throwUnimplemented(); + } + TransactionMaster& + getMasterTransaction() override + { + throwUnimplemented(); + } + TxQ& + getTxQ() override + { + throwUnimplemented(); + } + PathRequests& + getPathRequests() override + { + throwUnimplemented(); + } + ServerHandler& + getServerHandler() override + { + throwUnimplemented(); + } + perf::PerfLog& + getPerfLog() override + { + throwUnimplemented(); + } + telemetry::Telemetry& + getTelemetry() override + { + throwUnimplemented(); + } + telemetry::MetricsRegistry* + getMetricsRegistry() override + { + return nullptr; + } + bool + isStopping() const override + { + return false; + } + beast::Journal + journal(std::string const&) override + { + return beast::Journal(beast::Journal::getNullSink()); + } + boost::asio::io_context& + getIOContext() override + { + throwUnimplemented(); + } + Logs& + logs() override + { + throwUnimplemented(); + } + std::optional const& + trapTxID() const override + { + static std::optional const empty; + return empty; + } + DatabaseCon& + getWalletDB() override + { + throwUnimplemented(); + } + Application& + app() override + { + throwUnimplemented(); + } +}; + +class MetricsRegistry_test : public beast::unit_test::suite +{ + void + testDisabledConstruction() + { + testcase("Disabled construction"); + + MockServiceRegistry mockApp; + beast::Journal j(beast::Journal::getNullSink()); + + // Construct with enabled=false; should be a no-op. + telemetry::MetricsRegistry registry(false, mockApp, j); + BEAST_EXPECT(!registry.isEnabled()); + } + + void + testDisabledStartStop() + { + testcase("Disabled start/stop"); + + MockServiceRegistry mockApp; + beast::Journal j(beast::Journal::getNullSink()); + + telemetry::MetricsRegistry registry(false, mockApp, j); + + // start() and stop() should be no-ops when disabled. + registry.start("http://localhost:4318/v1/metrics"); + registry.stop(); + + // Double stop should be safe. + registry.stop(); + + pass(); + } + + void + testDisabledRecording() + { + testcase("Disabled recording methods"); + + MockServiceRegistry mockApp; + beast::Journal j(beast::Journal::getNullSink()); + + telemetry::MetricsRegistry registry(false, mockApp, j); + registry.start("http://localhost:4318/v1/metrics"); + + // All recording methods should be no-ops (not crash). + registry.recordRpcStarted("server_info"); + registry.recordRpcFinished("server_info", 1000); + registry.recordRpcErrored("ledger", 500); + registry.recordJobQueued("ledgerData"); + registry.recordJobStarted("ledgerData", 200); + registry.recordJobFinished("ledgerData", 3000); + + registry.stop(); + + pass(); + } + + void + testDestructorStops() + { + testcase("Destructor calls stop"); + + MockServiceRegistry mockApp; + beast::Journal j(beast::Journal::getNullSink()); + + { + // Let the destructor handle cleanup. + telemetry::MetricsRegistry registry(false, mockApp, j); + registry.start("http://localhost:4318/v1/metrics"); + } + + // If we get here without crash, the destructor handled stop. + pass(); + } + +public: + void + run() override + { + testDisabledConstruction(); + testDisabledStartStop(); + testDisabledRecording(); + testDestructorStops(); + } +}; + +BEAST_DEFINE_TESTSUITE(MetricsRegistry, telemetry, ripple); + +} // namespace test +} // namespace xrpl diff --git a/src/xrpld/app/main/Application.cpp b/src/xrpld/app/main/Application.cpp index 2a4efa7d10..996381832b 100644 --- a/src/xrpld/app/main/Application.cpp +++ b/src/xrpld/app/main/Application.cpp @@ -29,6 +29,7 @@ #include #include #include +#include #include #include @@ -148,6 +149,9 @@ public: beast::Journal m_journal; std::unique_ptr perfLog_; std::unique_ptr telemetry_; + /// OTel metrics registry for gap-fill metrics (counters, histograms, + /// observable gauges). Created after telemetry_ during setup(). + std::unique_ptr metricsRegistry_; Application::MutexType m_masterMutex; // Required by the SHAMapStore @@ -633,6 +637,12 @@ public: return *telemetry_; } + telemetry::MetricsRegistry* + getMetricsRegistry() override + { + return metricsRegistry_.get(); + } + NodeCache& getTempNodeCache() override { @@ -1282,6 +1292,11 @@ ApplicationImp::setup(boost::program_options::variables_map const& cmdline) if (!config_->section("telemetry").exists("service_instance_id")) telemetry_->setServiceInstanceId(toBase58(TokenType::NodePublic, nodeIdentity_->first)); + // Create the OTel MetricsRegistry for gap-fill metrics (counters, + // histograms, observable gauges). It is started later in start(). + metricsRegistry_ = std::make_unique( + telemetry_->isEnabled(), *this, logs_->journal("MetricsRegistry")); + if (!cluster_->load(config().section(SECTION_CLUSTER_NODES))) { JLOG(m_journal.fatal()) << "Invalid entry in cluster configuration."; @@ -1494,6 +1509,16 @@ ApplicationImp::start(bool withTimers) ledgerCleaner_->start(); perfLog_->start(); telemetry_->start(); + + // Start the metrics pipeline after telemetry; the endpoint uses the + // same base URL but the /v1/metrics path. + if (metricsRegistry_) + { + auto const& section = config_->section("telemetry"); + std::string endpoint = "http://localhost:4318/v1/metrics"; + set(endpoint, "metrics_endpoint", section); + metricsRegistry_->start(endpoint); + } } void @@ -1584,6 +1609,10 @@ ApplicationImp::run() ledgerCleaner_->stop(); m_nodeStore->stop(); perfLog_->stop(); + // Stop metrics pipeline before telemetry — gauge callbacks reference + // Application services that may be shutting down. + if (metricsRegistry_) + metricsRegistry_->stop(); // Telemetry must stop last among trace-producing components. // serverHandler_, overlay_, and jobQueue_ are already stopped above, // so no threads should be calling startSpan() at this point. diff --git a/src/xrpld/perflog/detail/PerfLogImp.cpp b/src/xrpld/perflog/detail/PerfLogImp.cpp index d1267953e4..ddbafe8198 100644 --- a/src/xrpld/perflog/detail/PerfLogImp.cpp +++ b/src/xrpld/perflog/detail/PerfLogImp.cpp @@ -1,9 +1,11 @@ #include +#include #include #include #include #include +#include #include #include @@ -314,6 +316,10 @@ PerfLogImp::rpcStart(std::string const& method, std::uint64_t const requestId) } std::lock_guard lock(counters_.methodsMutex_); counters_.methods_[requestId] = {counter->first.c_str(), steady_clock::now()}; + + // Task 9.4: Record RPC start in OTel metrics pipeline. + if (auto* mr = app_.getMetricsRegistry()) + mr->recordRpcStarted(method); } void @@ -343,13 +349,25 @@ PerfLogImp::rpcEnd(std::string const& method, std::uint64_t const requestId, boo // LCOV_EXCL_STOP } } - std::lock_guard lock(counter->second.mutex); - if (finish) - ++counter->second.value.finished; - else - ++counter->second.value.errored; - counter->second.value.duration += - std::chrono::duration_cast(steady_clock::now() - startTime); + auto const duration = std::chrono::duration_cast(steady_clock::now() - startTime); + { + std::lock_guard lock(counter->second.mutex); + if (finish) + ++counter->second.value.finished; + else + ++counter->second.value.errored; + counter->second.value.duration += duration; + } + + // Task 9.4: Record RPC completion/error in OTel metrics pipeline. + if (auto* mr = app_.getMetricsRegistry()) + { + auto const durUs = duration.count(); + if (finish) + mr->recordRpcFinished(method, durUs); + else + mr->recordRpcErrored(method, durUs); + } } void @@ -365,6 +383,10 @@ PerfLogImp::jobQueue(JobType const type) } std::lock_guard lock(counter->second.mutex); ++counter->second.value.queued; + + // Task 9.5: Record job enqueue in OTel metrics pipeline. + if (auto* mr = app_.getMetricsRegistry()) + mr->recordJobQueued(JobTypes::name(type)); } void @@ -391,6 +413,10 @@ PerfLogImp::jobStart( std::lock_guard lock(counters_.jobsMutex_); if (instance >= 0 && instance < counters_.jobs_.size()) counters_.jobs_[instance] = {type, startTime}; + + // Task 9.5: Record job start in OTel metrics pipeline. + if (auto* mr = app_.getMetricsRegistry()) + mr->recordJobStarted(JobTypes::name(type), dur.count()); } void @@ -413,6 +439,10 @@ PerfLogImp::jobFinish(JobType const type, microseconds dur, int instance) std::lock_guard lock(counters_.jobsMutex_); if (instance >= 0 && instance < counters_.jobs_.size()) counters_.jobs_[instance] = {jtINVALID, steady_time_point()}; + + // Task 9.5: Record job finish in OTel metrics pipeline. + if (auto* mr = app_.getMetricsRegistry()) + mr->recordJobFinished(JobTypes::name(type), dur.count()); } void diff --git a/src/xrpld/telemetry/MetricsRegistry.cpp b/src/xrpld/telemetry/MetricsRegistry.cpp new file mode 100644 index 0000000000..25b9dcd6d3 --- /dev/null +++ b/src/xrpld/telemetry/MetricsRegistry.cpp @@ -0,0 +1,473 @@ +/** MetricsRegistry implementation — OpenTelemetry metric instruments for rippled. + + This file contains: + - Construction / destruction logic for the OTel MeterProvider pipeline. + - Synchronous instrument creation (counters, histograms) for RPC, job + queue, and NodeStore I/O metrics. + - Observable gauge callback registration for cache hit rates, TxQ state, + CountedObject instances, load factors, and NodeStore queue depth. + - No-op stubs when XRPL_ENABLE_TELEMETRY is not defined. +*/ + +#include + +#ifdef XRPL_ENABLE_TELEMETRY + +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace metric_api = opentelemetry::metrics; +namespace metric_sdk = opentelemetry::sdk::metrics; +namespace otlp_http = opentelemetry::exporter::otlp; + +#endif // XRPL_ENABLE_TELEMETRY + +namespace xrpl { +namespace telemetry { + +MetricsRegistry::MetricsRegistry(bool enabled, ServiceRegistry& app, beast::Journal journal) + : enabled_(enabled), app_(app), journal_(journal) +{ +} + +MetricsRegistry::~MetricsRegistry() +{ + stop(); +} + +void +MetricsRegistry::start(std::string const& endpoint) +{ +#ifdef XRPL_ENABLE_TELEMETRY + if (!enabled_) + return; + + JLOG(journal_.info()) << "MetricsRegistry: starting, endpoint=" << endpoint; + + // Configure OTLP/HTTP metric exporter. + otlp_http::OtlpHttpMetricExporterOptions exporterOpts; + exporterOpts.url = endpoint; + auto exporter = otlp_http::OtlpHttpMetricExporterFactory::Create(exporterOpts); + + // Configure periodic reader with 10-second export interval. + metric_sdk::PeriodicExportingMetricReaderOptions readerOpts; + readerOpts.export_interval_millis = std::chrono::milliseconds(10000); + readerOpts.export_timeout_millis = std::chrono::milliseconds(5000); + auto reader = + metric_sdk::PeriodicExportingMetricReaderFactory::Create(std::move(exporter), readerOpts); + + // Create MeterProvider and attach the reader. + provider_ = std::make_shared(); + provider_->AddMetricReader(std::move(reader)); + + // Get a meter for all rippled instruments. + meter_ = provider_->GetMeter("rippled", "1.0.0"); + + // --- Create synchronous instruments --- + + // RPC per-method counters and histogram. + rpcStartedCounter_ = + meter_->CreateUInt64Counter("rpc_method_started_total", "Total RPC method calls started"); + rpcFinishedCounter_ = meter_->CreateUInt64Counter( + "rpc_method_finished_total", "Total RPC method calls completed successfully"); + rpcErroredCounter_ = meter_->CreateUInt64Counter( + "rpc_method_errored_total", "Total RPC method calls that errored"); + rpcDurationHistogram_ = meter_->CreateDoubleHistogram( + "rpc_method_duration_us", "RPC method execution time in microseconds"); + + // Job queue per-type counters and histograms. + jobQueuedCounter_ = meter_->CreateUInt64Counter("job_queued_total", "Total jobs enqueued"); + jobStartedCounter_ = meter_->CreateUInt64Counter("job_started_total", "Total jobs started"); + jobFinishedCounter_ = meter_->CreateUInt64Counter("job_finished_total", "Total jobs completed"); + jobQueuedDurationHistogram_ = meter_->CreateDoubleHistogram( + "job_queued_duration_us", "Time jobs spent waiting in the queue (microseconds)"); + jobRunningDurationHistogram_ = meter_->CreateDoubleHistogram( + "job_running_duration_us", "Job execution time in microseconds"); + + // Register all observable (async) gauges. + registerAsyncGauges(); + + JLOG(journal_.info()) << "MetricsRegistry: started successfully"; +#else + (void)endpoint; +#endif // XRPL_ENABLE_TELEMETRY +} + +void +MetricsRegistry::stop() +{ +#ifdef XRPL_ENABLE_TELEMETRY + if (!provider_) + return; + + JLOG(journal_.info()) << "MetricsRegistry: stopping"; + + // Force-flush any pending metrics, then destroy the provider. + // This stops the PeriodicExportingMetricReader, which in turn + // stops invoking observable gauge callbacks. No explicit + // RemoveCallback is needed — the provider destruction handles it. + provider_->ForceFlush(); + provider_.reset(); + + JLOG(journal_.info()) << "MetricsRegistry: stopped"; +#endif // XRPL_ENABLE_TELEMETRY +} + +// ----------------------------------------------------------------- +// Synchronous instrument recording — RPC metrics (Task 9.4) +// ----------------------------------------------------------------- + +void +MetricsRegistry::recordRpcStarted(std::string_view method) +{ +#ifdef XRPL_ENABLE_TELEMETRY + if (!enabled_ || !rpcStartedCounter_) + return; + rpcStartedCounter_->Add(1, {{"method", std::string(method)}}); +#else + (void)method; +#endif +} + +void +MetricsRegistry::recordRpcFinished(std::string_view method, std::int64_t durationUs) +{ +#ifdef XRPL_ENABLE_TELEMETRY + if (!enabled_ || !rpcFinishedCounter_) + return; + rpcFinishedCounter_->Add(1, {{"method", std::string(method)}}); + if (rpcDurationHistogram_) + rpcDurationHistogram_->Record( + static_cast(durationUs), {{"method", std::string(method)}}); +#else + (void)method; + (void)durationUs; +#endif +} + +void +MetricsRegistry::recordRpcErrored(std::string_view method, std::int64_t durationUs) +{ +#ifdef XRPL_ENABLE_TELEMETRY + if (!enabled_ || !rpcErroredCounter_) + return; + rpcErroredCounter_->Add(1, {{"method", std::string(method)}}); + if (rpcDurationHistogram_) + rpcDurationHistogram_->Record( + static_cast(durationUs), {{"method", std::string(method)}}); +#else + (void)method; + (void)durationUs; +#endif +} + +// ----------------------------------------------------------------- +// Synchronous instrument recording — Job Queue metrics (Task 9.5) +// ----------------------------------------------------------------- + +void +MetricsRegistry::recordJobQueued(std::string_view jobType) +{ +#ifdef XRPL_ENABLE_TELEMETRY + if (!enabled_ || !jobQueuedCounter_) + return; + jobQueuedCounter_->Add(1, {{"job_type", std::string(jobType)}}); +#else + (void)jobType; +#endif +} + +void +MetricsRegistry::recordJobStarted(std::string_view jobType, std::int64_t queuedDurUs) +{ +#ifdef XRPL_ENABLE_TELEMETRY + if (!enabled_ || !jobStartedCounter_) + return; + jobStartedCounter_->Add(1, {{"job_type", std::string(jobType)}}); + if (jobQueuedDurationHistogram_) + jobQueuedDurationHistogram_->Record( + static_cast(queuedDurUs), {{"job_type", std::string(jobType)}}); +#else + (void)jobType; + (void)queuedDurUs; +#endif +} + +void +MetricsRegistry::recordJobFinished(std::string_view jobType, std::int64_t runningDurUs) +{ +#ifdef XRPL_ENABLE_TELEMETRY + if (!enabled_ || !jobFinishedCounter_) + return; + jobFinishedCounter_->Add(1, {{"job_type", std::string(jobType)}}); + if (jobRunningDurationHistogram_) + jobRunningDurationHistogram_->Record( + static_cast(runningDurUs), {{"job_type", std::string(jobType)}}); +#else + (void)jobType; + (void)runningDurUs; +#endif +} + +// ----------------------------------------------------------------- +// Observable gauge callbacks (Tasks 9.1, 9.2, 9.3, 9.6, 9.7) +// ----------------------------------------------------------------- + +#ifdef XRPL_ENABLE_TELEMETRY + +void +MetricsRegistry::registerAsyncGauges() +{ + // --- Task 9.2: Cache hit rate and size gauges --- + cacheHitRateGauge_ = + meter_->CreateDoubleObservableGauge("cache_metrics", "Cache hit rates and sizes"); + cacheHitRateGauge_->AddCallback( + [](opentelemetry::metrics::ObserverResult result, void* state) { + auto* self = static_cast(state); + auto& app = self->app_; + + try + { + // SLE cache hit rate (0.0 - 1.0). + auto sleRate = app.cachedSLEs().rate(); + opentelemetry::nostd::get>>(result) + ->Observe(sleRate, {{"metric", "SLE_hit_rate"}}); + + // Ledger cache hit rate. + auto ledgerRate = app.getLedgerMaster().getCacheHitRate(); + opentelemetry::nostd::get>>(result) + ->Observe(ledgerRate, {{"metric", "ledger_hit_rate"}}); + + // AcceptedLedger cache hit rate. + auto alRate = app.getAcceptedLedgerCache().getHitRate(); + opentelemetry::nostd::get>>(result) + ->Observe(alRate, {{"metric", "AL_hit_rate"}}); + + // TreeNode cache size. + auto tnCacheSize = app.getNodeFamily().getTreeNodeCache()->getCacheSize(); + opentelemetry::nostd::get>>(result) + ->Observe( + static_cast(tnCacheSize), {{"metric", "treenode_cache_size"}}); + + // TreeNode track size. + auto tnTrackSize = app.getNodeFamily().getTreeNodeCache()->getTrackSize(); + opentelemetry::nostd::get>>(result) + ->Observe( + static_cast(tnTrackSize), {{"metric", "treenode_track_size"}}); + + // FullBelow cache size. + auto fbSize = app.getNodeFamily().getFullBelowCache()->size(); + opentelemetry::nostd::get>>(result) + ->Observe(static_cast(fbSize), {{"metric", "fullbelow_size"}}); + } + catch (...) + { + // Silently skip if services are not yet ready. + } + }, + this); + + // --- Task 9.3: TxQ metrics gauges --- + txqGauge_ = meter_->CreateDoubleObservableGauge("txq_metrics", "Transaction queue metrics"); + txqGauge_->AddCallback( + [](opentelemetry::metrics::ObserverResult result, void* state) { + auto* self = static_cast(state); + auto& app = self->app_; + + try + { + auto const metrics = app.getTxQ().getMetrics(*app.openLedger().current()); + + auto observe = [&](char const* name, double value) { + opentelemetry::nostd::get>>(result) + ->Observe(value, {{"metric", name}}); + }; + + observe("txq_count", static_cast(metrics.txCount)); + observe( + "txq_max_size", + metrics.txQMaxSize ? static_cast(*metrics.txQMaxSize) : 0.0); + observe("txq_in_ledger", static_cast(metrics.txInLedger)); + observe("txq_per_ledger", static_cast(metrics.txPerLedger)); + observe( + "txq_reference_fee_level", + static_cast(metrics.referenceFeeLevel.fee())); + observe( + "txq_min_processing_fee_level", + static_cast(metrics.minProcessingFeeLevel.fee())); + observe("txq_med_fee_level", static_cast(metrics.medFeeLevel.fee())); + observe( + "txq_open_ledger_fee_level", + static_cast(metrics.openLedgerFeeLevel.fee())); + } + catch (...) + { + // Silently skip if TxQ or OpenLedger are not yet ready. + } + }, + this); + + // --- Task 9.6: Counted object instance gauges --- + objectCountGauge_ = meter_->CreateInt64ObservableGauge( + "object_count", "Live instance counts for key internal object types"); + objectCountGauge_->AddCallback( + [](opentelemetry::metrics::ObserverResult result, void* /* state */) { + try + { + // Iterate through all CountedObject types via the linked + // list in CountedObjects. We report all types with count + // > 0, filtering to the key types of interest. + auto counts = CountedObjects::getInstance().getCounts(0); + for (auto const& [name, count] : counts) + { + opentelemetry::nostd::get>>(result) + ->Observe(static_cast(count), {{"type", name}}); + } + } + catch (...) + { + // Silently skip on error. + } + }, + this); + + // --- Task 9.7: Load factor breakdown gauges --- + loadFactorGauge_ = + meter_->CreateDoubleObservableGauge("load_factor_metrics", "Fee load factor breakdown"); + loadFactorGauge_->AddCallback( + [](opentelemetry::metrics::ObserverResult result, void* state) { + auto* self = static_cast(state); + auto& app = self->app_; + + try + { + auto& feeTrack = app.getFeeTrack(); + auto const loadBase = static_cast(feeTrack.getLoadBase()); + + auto observe = [&](char const* name, double value) { + opentelemetry::nostd::get>>(result) + ->Observe(value, {{"metric", name}}); + }; + + // Combined load factor (server component). + observe( + "load_factor_server", static_cast(feeTrack.getLoadFactor()) / loadBase); + + // Individual factor components. + observe( + "load_factor_local", static_cast(feeTrack.getLocalFee()) / loadBase); + observe("load_factor_net", static_cast(feeTrack.getRemoteFee()) / loadBase); + observe( + "load_factor_cluster", + static_cast(feeTrack.getClusterFee()) / loadBase); + + // Fee escalation factors from TxQ. + auto const metrics = app.getTxQ().getMetrics(*app.openLedger().current()); + auto refLevel = static_cast(metrics.referenceFeeLevel.fee()); + if (refLevel > 0) + { + observe( + "load_factor_fee_escalation", + static_cast(metrics.openLedgerFeeLevel.fee()) / refLevel); + observe( + "load_factor_fee_queue", + static_cast(metrics.minProcessingFeeLevel.fee()) / refLevel); + } + + // Combined load factor (max of server and fee escalation). + auto const loadFactorServer = feeTrack.getLoadFactor(); + auto const loadBaseServer = feeTrack.getLoadBase(); + double combined = static_cast(loadFactorServer) / loadBase; + if (refLevel > 0) + { + double feeEscalation = static_cast(metrics.openLedgerFeeLevel.fee()) * + loadBaseServer / refLevel; + if (feeEscalation > static_cast(loadFactorServer)) + { + combined = feeEscalation / loadBase; + } + } + observe("load_factor", combined); + } + catch (...) + { + // Silently skip if services are not yet ready. + } + }, + this); + + // --- Task 9.1: NodeStore I/O gauges --- + // The cumulative counters (reads, writes, bytes) are also exposed here + // as observable gauges. This avoids adding an xrpld dependency into the + // libxrpl nodestore code — the MetricsRegistry reads the existing atomic + // counters from Database via its public accessors. + nodeStoreGauge_ = meter_->CreateInt64ObservableGauge( + "nodestore_state", "NodeStore I/O counters, queue depth, and write load"); + nodeStoreGauge_->AddCallback( + [](opentelemetry::metrics::ObserverResult result, void* state) { + auto* self = static_cast(state); + auto& app = self->app_; + + try + { + auto& db = app.getNodeStore(); + + auto observe = [&](char const* name, int64_t value) { + opentelemetry::nostd::get>>(result) + ->Observe(value, {{"metric", name}}); + }; + + // Cumulative counters (monotonically increasing). + observe("node_reads_total", static_cast(db.getFetchTotalCount())); + observe("node_reads_hit", static_cast(db.getFetchHitCount())); + observe("node_writes", static_cast(db.getStoreCount())); + observe("node_written_bytes", static_cast(db.getStoreSize())); + observe("node_read_bytes", static_cast(db.getFetchSize())); + + // Write load score (instantaneous). + observe("write_load", static_cast(db.getWriteLoad())); + + // Read queue depth (instantaneous). + Json::Value obj(Json::objectValue); + db.getCountsJson(obj); + if (obj.isMember("read_queue")) + { + observe("read_queue", static_cast(obj["read_queue"].asUInt())); + } + } + catch (...) + { + // Silently skip on error. + } + }, + this); +} + +#endif // XRPL_ENABLE_TELEMETRY + +} // namespace telemetry +} // namespace xrpl diff --git a/src/xrpld/telemetry/MetricsRegistry.h b/src/xrpld/telemetry/MetricsRegistry.h new file mode 100644 index 0000000000..be977901d2 --- /dev/null +++ b/src/xrpld/telemetry/MetricsRegistry.h @@ -0,0 +1,280 @@ +#pragma once + +/** Central OTel Metrics Registry for rippled. + + Owns all OpenTelemetry metric instruments (counters, histograms, + observable gauges) that are NOT already covered by the beast::insight + StatsD pipeline. The instruments are created once at startup and polled + by the OTel PeriodicExportingMetricReader at a configurable interval + (default 10 s). + + When XRPL_ENABLE_TELEMETRY is **not** defined, this class compiles to a + lightweight no-op: every public method is an empty inline. + + Dependency / ownership diagram (ASCII): + + Application + | + +-- MetricsRegistry (unique_ptr, created in setup(), started/stopped with telemetry) + | + +-- OTel MeterProvider (owns reader + exporter) + | | + | +-- PeriodicExportingMetricReader + | +-- OtlpHttpMetricExporter + | + +-- Counters / Histograms (synchronous instruments) + | +-- rpc_method_started_total + | +-- rpc_method_finished_total + | +-- rpc_method_errored_total + | +-- rpc_method_duration_us (Histogram) + | +-- job_queued_total + | +-- job_started_total + | +-- job_finished_total + | +-- job_queued_duration_us (Histogram) + | +-- job_running_duration_us (Histogram) + | + +-- Observable Gauges (async callbacks, polled by reader) + +-- Cache hit rates (SLE, ledger, AL) + +-- TreeNode / FullBelow sizes + +-- TxQ metrics + +-- CountedObject counts + +-- Load factor breakdown + +-- NodeStore I/O gauges + + Control-flow for async gauges: + + PeriodicExportingMetricReader (background thread, 10 s tick) + | + v + OTel SDK invokes registered ObservableGauge callbacks + | + v + Each callback reads current value from Application services + (e.g. app.getTxQ().getMetrics(), app.getFeeTrack().getLoadFactor()) + | + v + Result set is exported via OTLP/HTTP to the collector + + Control-flow for synchronous instruments: + + PerfLogImp::rpcStart/rpcEnd/jobQueue/jobStart/jobFinish + | + v + MetricsRegistry::recordRpc*(method, ...) / recordJob*(type, ...) + | + v + OTel Counter::Add() or Histogram::Record() + | + v + Periodically flushed by the MetricReader + + Example usage: + + @code + // In Application::setup(), after telemetry_ is created: + metricsRegistry_ = std::make_unique( + telemetry_->isEnabled(), app, journal); + metricsRegistry_->start(setup.exporterEndpoint); + + // In PerfLogImp::rpcStart(): + if (auto* mr = app_.getMetricsRegistry()) + mr->recordRpcStarted("server_info"); + + // In PerfLogImp::rpcEnd(): + if (auto* mr = app_.getMetricsRegistry()) + { + mr->recordRpcFinished("server_info", durationUs); + // or: mr->recordRpcErrored("server_info", durationUs); + } + + // In PerfLogImp::jobQueue(): + if (auto* mr = app_.getMetricsRegistry()) + mr->recordJobQueued("ledgerData"); + + // Shutdown: + metricsRegistry_->stop(); + @endcode + + Caveats: + - The MetricsRegistry must be created AFTER the Telemetry object because + it reads isEnabled() to decide whether to initialize the OTel SDK. + - Observable gauge callbacks capture a reference to the Application; the + Application must outlive the MetricsRegistry (guaranteed because + MetricsRegistry is stopped before Application teardown). + - If a new CountedObject type is added, it will NOT appear automatically + in the object_count gauge; the callback iterates a fixed list. + - Adding a new synchronous instrument requires updating both the header + and the .cpp, then calling the new record*() method from the + instrumentation site. +*/ + +#include + +#include +#include +#include +#include + +#ifdef XRPL_ENABLE_TELEMETRY +#include +#include +#include +#include +#endif + +namespace xrpl { + +class ServiceRegistry; + +namespace telemetry { + +class MetricsRegistry +{ +public: + /** Construct a MetricsRegistry. + + @param enabled Whether OTel metric export is active. When false, + all methods become no-ops. + @param app Reference to the ServiceRegistry (Application) for + reading current metric values in gauge callbacks. + @param journal Journal for log output. + */ + MetricsRegistry(bool enabled, ServiceRegistry& app, beast::Journal journal); + + ~MetricsRegistry(); + + /// Non-copyable, non-movable. + MetricsRegistry(MetricsRegistry const&) = delete; + MetricsRegistry& + operator=(MetricsRegistry const&) = delete; + + /** Initialize the OTel metrics pipeline and register all instruments. + + @param endpoint OTLP/HTTP endpoint URL for metric export + (e.g. "http://localhost:4318/v1/metrics"). + */ + void + start(std::string const& endpoint); + + /** Flush pending metrics and shut down the pipeline. */ + void + stop(); + + /** @return true if the registry is actively exporting metrics. */ + bool + isEnabled() const noexcept + { + return enabled_; + } + + // ----------------------------------------------------------------- + // Synchronous instrument recording (called from PerfLog hot paths) + // ----------------------------------------------------------------- + + /** Record an RPC method call start. + @param method The RPC method name (e.g. "server_info"). + */ + void + recordRpcStarted(std::string_view method); + + /** Record an RPC method call completion. + @param method The RPC method name. + @param durationUs Execution time in microseconds. + */ + void + recordRpcFinished(std::string_view method, std::int64_t durationUs); + + /** Record an RPC method call error. + @param method The RPC method name. + @param durationUs Execution time in microseconds. + */ + void + recordRpcErrored(std::string_view method, std::int64_t durationUs); + + /** Record a job enqueued event. + @param jobType The job type name (e.g. "ledgerData"). + */ + void + recordJobQueued(std::string_view jobType); + + /** Record a job start event. + @param jobType The job type name. + @param queuedDurUs Time the job spent waiting in the queue (us). + */ + void + recordJobStarted(std::string_view jobType, std::int64_t queuedDurUs); + + /** Record a job finish event. + @param jobType The job type name. + @param runningDurUs Execution time in microseconds. + */ + void + recordJobFinished(std::string_view jobType, std::int64_t runningDurUs); + +private: + /// Master enable flag; when false all methods are no-ops. + bool const enabled_; + + /// Reference to Application services for gauge callbacks. + ServiceRegistry& app_; + + /// Journal for logging. + beast::Journal const journal_; + +#ifdef XRPL_ENABLE_TELEMETRY + /// The SDK MeterProvider that owns the export pipeline. + std::shared_ptr provider_; + + /// The Meter used to create all instruments. + opentelemetry::nostd::shared_ptr meter_; + + // --- Synchronous instruments (RPC) --- + /// Counter: rpc_method_started_total{method=""} + opentelemetry::nostd::unique_ptr> rpcStartedCounter_; + /// Counter: rpc_method_finished_total{method=""} + opentelemetry::nostd::unique_ptr> rpcFinishedCounter_; + /// Counter: rpc_method_errored_total{method=""} + opentelemetry::nostd::unique_ptr> rpcErroredCounter_; + /// Histogram: rpc_method_duration_us{method=""} + opentelemetry::nostd::unique_ptr> + rpcDurationHistogram_; + + // --- Synchronous instruments (Job Queue) --- + /// Counter: job_queued_total{job_type=""} + opentelemetry::nostd::unique_ptr> jobQueuedCounter_; + /// Counter: job_started_total{job_type=""} + opentelemetry::nostd::unique_ptr> jobStartedCounter_; + /// Counter: job_finished_total{job_type=""} + opentelemetry::nostd::unique_ptr> jobFinishedCounter_; + /// Histogram: job_queued_duration_us{job_type=""} + opentelemetry::nostd::unique_ptr> + jobQueuedDurationHistogram_; + /// Histogram: job_running_duration_us{job_type=""} + opentelemetry::nostd::unique_ptr> + jobRunningDurationHistogram_; + + // --- Observable gauges (registered via callbacks) --- + // Handles are stored so we can remove callbacks on shutdown. + /// Observable gauges for cache hit rates and sizes. + opentelemetry::nostd::shared_ptr + cacheHitRateGauge_; + /// Observable gauges for TxQ metrics. + opentelemetry::nostd::shared_ptr txqGauge_; + /// Observable gauges for counted object instances. + opentelemetry::nostd::shared_ptr + objectCountGauge_; + /// Observable gauges for load factor breakdown. + opentelemetry::nostd::shared_ptr loadFactorGauge_; + /// Observable gauges for NodeStore write_load and read_queue. + opentelemetry::nostd::shared_ptr nodeStoreGauge_; + + /** Register all observable gauge callbacks with the OTel SDK. + Called once during start(). + */ + void + registerAsyncGauges(); +#endif // XRPL_ENABLE_TELEMETRY +}; + +} // namespace telemetry +} // namespace xrpl