diff --git a/OpenTelemetryPlan/09-data-collection-reference.md b/OpenTelemetryPlan/09-data-collection-reference.md
index 587cafec73..0f897c360b 100644
--- a/OpenTelemetryPlan/09-data-collection-reference.md
+++ b/OpenTelemetryPlan/09-data-collection-reference.md
@@ -11,6 +11,7 @@ graph LR
subgraph rippledNode["rippled Node"]
A["Trace Macros
XRPL_TRACE_SPAN
(OTLP/HTTP exporter)"]
B["beast::insight
OTel native metrics
(OTLP/HTTP exporter)"]
+ C["MetricsRegistry
OTel SDK metrics
(OTLP/HTTP exporter)"]
end
subgraph collector["OTel Collector :4317 / :4318"]
@@ -33,11 +34,12 @@ graph LR
end
subgraph viz["Visualization"]
- F["Grafana :3000
10 dashboards"]
+ F["Grafana :3000
13 dashboards"]
end
A -->|"OTLP/HTTP :4318
(traces + attributes)"| R1
B -->|"OTLP/HTTP :4318
(gauges, counters, histograms)"| R1
+ C -->|"OTLP/HTTP :4318
(counters, histograms,
observable gauges)"| R1
BP -->|"OTLP/gRPC :4317"| D
BP -->|"OTLP/gRPC"| T
@@ -751,6 +753,126 @@ Phase 11 builds a custom OTel Collector receiver (Go) that polls rippled's admin
| `xrpl_orderbook_ask_depth` | Gauge | `pair=""` | Total ask volume |
| `xrpl_orderbook_spread` | Gauge | `pair=""` | Best bid-ask spread |
+### Phase 9: OTel SDK-Exported Metrics (MetricsRegistry)
+
+Phase 9 introduces the `MetricsRegistry` class (`src/xrpld/telemetry/MetricsRegistry.h/.cpp`)
+which registers metrics directly with the OpenTelemetry Metrics SDK. These are exported
+via OTLP/HTTP to the OTel Collector and scraped by Prometheus.
+
+#### NodeStore I/O (Observable Gauge — `nodestore_state`)
+
+| Prometheus Metric | Type | Labels | Description |
+| ------------------------------------------------------ | ----- | -------- | ------------------------------------ |
+| `rippled_nodestore_state{metric="node_reads_total"}` | Gauge | `metric` | Cumulative NodeStore read operations |
+| `rippled_nodestore_state{metric="node_reads_hit"}` | Gauge | `metric` | Reads served from cache |
+| `rippled_nodestore_state{metric="node_writes"}` | Gauge | `metric` | Cumulative write operations |
+| `rippled_nodestore_state{metric="node_written_bytes"}` | Gauge | `metric` | Cumulative bytes written |
+| `rippled_nodestore_state{metric="node_read_bytes"}` | Gauge | `metric` | Cumulative bytes read |
+| `rippled_nodestore_state{metric="write_load"}` | Gauge | `metric` | Current write load score |
+| `rippled_nodestore_state{metric="read_queue"}` | Gauge | `metric` | Items in read prefetch queue |
+
+#### Cache Hit Rates & Sizes (Observable Gauge — `cache_metrics`)
+
+| Prometheus Metric | Type | Labels | Description |
+| ----------------------------------------------------- | ----- | -------- | ----------------------------- |
+| `rippled_cache_metrics{metric="SLE_hit_rate"}` | Gauge | `metric` | SLE cache hit rate (0.0-1.0) |
+| `rippled_cache_metrics{metric="ledger_hit_rate"}` | Gauge | `metric` | Ledger cache hit rate |
+| `rippled_cache_metrics{metric="AL_hit_rate"}` | Gauge | `metric` | AcceptedLedger cache hit rate |
+| `rippled_cache_metrics{metric="treenode_cache_size"}` | Gauge | `metric` | SHAMap TreeNode cache entries |
+| `rippled_cache_metrics{metric="treenode_track_size"}` | Gauge | `metric` | Tracked tree nodes |
+| `rippled_cache_metrics{metric="fullbelow_size"}` | Gauge | `metric` | FullBelow cache entries |
+
+#### Transaction Queue (Observable Gauge — `txq_metrics`)
+
+| Prometheus Metric | Type | Labels | Description |
+| ------------------------------------------------------------ | ----- | -------- | -------------------------------- |
+| `rippled_txq_metrics{metric="txq_count"}` | Gauge | `metric` | Transactions currently in queue |
+| `rippled_txq_metrics{metric="txq_max_size"}` | Gauge | `metric` | Maximum queue capacity |
+| `rippled_txq_metrics{metric="txq_in_ledger"}` | Gauge | `metric` | Transactions in open ledger |
+| `rippled_txq_metrics{metric="txq_per_ledger"}` | Gauge | `metric` | Expected transactions per ledger |
+| `rippled_txq_metrics{metric="txq_reference_fee_level"}` | Gauge | `metric` | Reference fee level |
+| `rippled_txq_metrics{metric="txq_min_processing_fee_level"}` | Gauge | `metric` | Minimum fee to get processed |
+| `rippled_txq_metrics{metric="txq_med_fee_level"}` | Gauge | `metric` | Median fee level in queue |
+| `rippled_txq_metrics{metric="txq_open_ledger_fee_level"}` | Gauge | `metric` | Open ledger fee escalation level |
+
+#### Per-RPC Method Metrics (Synchronous Counters/Histogram)
+
+| Prometheus Metric | Type | Labels | Description |
+| ----------------------------------- | --------- | ----------------- | -------------------------------- |
+| `rippled_rpc_method_started_total` | Counter | `method=""` | RPC calls started |
+| `rippled_rpc_method_finished_total` | Counter | `method=""` | RPC calls completed successfully |
+| `rippled_rpc_method_errored_total` | Counter | `method=""` | RPC calls that errored |
+| `rippled_rpc_method_duration_us` | Histogram | `method=""` | Execution time distribution (us) |
+
+#### Per-Job-Type Metrics (Synchronous Counters/Histogram)
+
+| Prometheus Metric | Type | Labels | Description |
+| --------------------------------- | --------- | ------------------- | --------------------------------- |
+| `rippled_job_queued_total` | Counter | `job_type=""` | Jobs enqueued |
+| `rippled_job_started_total` | Counter | `job_type=""` | Jobs started |
+| `rippled_job_finished_total` | Counter | `job_type=""` | Jobs completed |
+| `rippled_job_queued_duration_us` | Histogram | `job_type=""` | Queue wait time distribution (us) |
+| `rippled_job_running_duration_us` | Histogram | `job_type=""` | Execution time distribution (us) |
+
+#### Counted Object Instances (Observable Gauge — `object_count`)
+
+| Prometheus Metric | Type | Labels | Description |
+| ---------------------------------------------- | ----- | --------------- | ------------------------------ |
+| `rippled_object_count{type="Transaction"}` | Gauge | `type=""` | Live Transaction objects |
+| `rippled_object_count{type="Ledger"}` | Gauge | `type=""` | Live Ledger objects |
+| `rippled_object_count{type="NodeObject"}` | Gauge | `type=""` | Live NodeObject instances |
+| `rippled_object_count{type="STTx"}` | Gauge | `type=""` | Serialized transaction objects |
+| `rippled_object_count{type="STLedgerEntry"}` | Gauge | `type=""` | Serialized ledger entries |
+| `rippled_object_count{type="InboundLedger"}` | Gauge | `type=""` | Ledgers being fetched |
+| `rippled_object_count{type="Pathfinder"}` | Gauge | `type=""` | Active pathfinding operations |
+| `rippled_object_count{type="PathRequest"}` | Gauge | `type=""` | Active path requests |
+| `rippled_object_count{type="HashRouterEntry"}` | Gauge | `type=""` | Hash router entries |
+
+#### Load Factor Breakdown (Observable Gauge — `load_factor_metrics`)
+
+| Prometheus Metric | Type | Labels | Description |
+| ------------------------------------------------------------------ | ----- | -------- | --------------------------------------- |
+| `rippled_load_factor_metrics{metric="load_factor"}` | Gauge | `metric` | Combined transaction cost multiplier |
+| `rippled_load_factor_metrics{metric="load_factor_server"}` | Gauge | `metric` | Server + cluster + network contribution |
+| `rippled_load_factor_metrics{metric="load_factor_local"}` | Gauge | `metric` | Local server load only |
+| `rippled_load_factor_metrics{metric="load_factor_net"}` | Gauge | `metric` | Network-wide load estimate |
+| `rippled_load_factor_metrics{metric="load_factor_cluster"}` | Gauge | `metric` | Cluster peer load |
+| `rippled_load_factor_metrics{metric="load_factor_fee_escalation"}` | Gauge | `metric` | Open ledger fee escalation |
+| `rippled_load_factor_metrics{metric="load_factor_fee_queue"}` | Gauge | `metric` | Queue entry fee level |
+
+#### Prometheus Query Examples (Phase 9)
+
+```promql
+# NodeStore cache hit ratio
+rippled_nodestore_state{metric="node_reads_hit"} / rippled_nodestore_state{metric="node_reads_total"}
+
+# RPC error rate for server_info
+rate(rippled_rpc_method_errored_total{method="server_info"}[5m])
+
+# Job queue wait time p95
+histogram_quantile(0.95, sum by (le) (rate(rippled_job_queued_duration_us_bucket[5m])))
+
+# TxQ utilization percentage
+rippled_txq_metrics{metric="txq_count"} / rippled_txq_metrics{metric="txq_max_size"}
+
+# High load factor alert candidate
+rippled_load_factor_metrics{metric="load_factor"} > 5
+```
+
+### New Grafana Dashboards (Phase 9)
+
+| Dashboard | UID | Data Source | Key Panels |
+| ---------------------- | -------------------- | ----------- | --------------------------------------------------------- |
+| Fee Market & TxQ | `rippled-fee-market` | Prometheus | TxQ depth/capacity, fee levels, load factor breakdown |
+| Job Queue Analysis | `rippled-job-queue` | Prometheus | Per-job rates, queue wait times, execution times |
+| RPC Performance (OTel) | `rippled-rpc-perf` | Prometheus | Per-method call rates, error rates, latency distributions |
+
+### Updated Grafana Dashboards (Phase 9)
+
+| Dashboard | UID | New Panels Added |
+| -------------------- | ---------------------------- | ------------------------------------------------------ |
+| Node Health (StatsD) | `rippled-statsd-node-health` | NodeStore I/O, cache hit rates, object instance counts |
+
### New Grafana Dashboards (Phase 11)
| Dashboard | UID | Data Source | Key Panels |
diff --git a/docker/telemetry/grafana/dashboards/rippled-fee-market.json b/docker/telemetry/grafana/dashboards/rippled-fee-market.json
new file mode 100644
index 0000000000..7ff6dd65c3
--- /dev/null
+++ b/docker/telemetry/grafana/dashboards/rippled-fee-market.json
@@ -0,0 +1,317 @@
+{
+ "annotations": {
+ "list": []
+ },
+ "description": "Fee market dynamics: TxQ depth/capacity, fee escalation levels, and load factor breakdown. Sourced from OTel MetricsRegistry observable gauges (Phase 9).",
+ "editable": true,
+ "fiscalYearStartMonth": 0,
+ "graphTooltip": 1,
+ "id": null,
+ "links": [],
+ "panels": [
+ {
+ "title": "Transaction Queue Depth",
+ "description": "Current number of transactions waiting in the queue vs. maximum capacity. Sourced from MetricsRegistry txq_metrics observable gauge with metric=txq_count and metric=txq_max_size.",
+ "type": "timeseries",
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 0,
+ "y": 0
+ },
+ "options": {
+ "tooltip": {
+ "mode": "multi",
+ "sort": "desc"
+ }
+ },
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus"
+ },
+ "expr": "rippled_txq_metrics{metric=\"txq_count\"}",
+ "legendFormat": "Queue Depth"
+ },
+ {
+ "datasource": {
+ "type": "prometheus"
+ },
+ "expr": "rippled_txq_metrics{metric=\"txq_max_size\"}",
+ "legendFormat": "Max Capacity"
+ }
+ ],
+ "fieldConfig": {
+ "defaults": {
+ "unit": "short",
+ "custom": {
+ "drawStyle": "line",
+ "lineWidth": 2,
+ "fillOpacity": 10
+ },
+ "color": {
+ "mode": "palette-classic"
+ }
+ },
+ "overrides": []
+ }
+ },
+ {
+ "title": "Transactions Per Ledger",
+ "description": "Transactions in the current open ledger vs. expected per-ledger count. Sourced from txq_metrics with metric=txq_in_ledger and metric=txq_per_ledger.",
+ "type": "timeseries",
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 12,
+ "y": 0
+ },
+ "options": {
+ "tooltip": {
+ "mode": "multi",
+ "sort": "desc"
+ }
+ },
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus"
+ },
+ "expr": "rippled_txq_metrics{metric=\"txq_in_ledger\"}",
+ "legendFormat": "In Ledger"
+ },
+ {
+ "datasource": {
+ "type": "prometheus"
+ },
+ "expr": "rippled_txq_metrics{metric=\"txq_per_ledger\"}",
+ "legendFormat": "Expected Per Ledger"
+ }
+ ],
+ "fieldConfig": {
+ "defaults": {
+ "unit": "short",
+ "custom": {
+ "drawStyle": "line",
+ "lineWidth": 2,
+ "fillOpacity": 10
+ },
+ "color": {
+ "mode": "palette-classic"
+ }
+ },
+ "overrides": []
+ }
+ },
+ {
+ "title": "Fee Escalation Levels",
+ "description": "Fee levels that control transaction queue admission. Reference fee level is the baseline; open ledger fee level triggers escalation. Sourced from txq_metrics observable gauge.",
+ "type": "timeseries",
+ "gridPos": {
+ "h": 8,
+ "w": 24,
+ "x": 0,
+ "y": 8
+ },
+ "options": {
+ "tooltip": {
+ "mode": "multi",
+ "sort": "desc"
+ }
+ },
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus"
+ },
+ "expr": "rippled_txq_metrics{metric=\"txq_reference_fee_level\"}",
+ "legendFormat": "Reference Fee Level"
+ },
+ {
+ "datasource": {
+ "type": "prometheus"
+ },
+ "expr": "rippled_txq_metrics{metric=\"txq_min_processing_fee_level\"}",
+ "legendFormat": "Min Processing Fee Level"
+ },
+ {
+ "datasource": {
+ "type": "prometheus"
+ },
+ "expr": "rippled_txq_metrics{metric=\"txq_med_fee_level\"}",
+ "legendFormat": "Median Fee Level"
+ },
+ {
+ "datasource": {
+ "type": "prometheus"
+ },
+ "expr": "rippled_txq_metrics{metric=\"txq_open_ledger_fee_level\"}",
+ "legendFormat": "Open Ledger Fee Level"
+ }
+ ],
+ "fieldConfig": {
+ "defaults": {
+ "unit": "short",
+ "custom": {
+ "drawStyle": "line",
+ "lineWidth": 2,
+ "fillOpacity": 5,
+ "scaleDistribution": {
+ "type": "log",
+ "log": 2
+ }
+ },
+ "color": {
+ "mode": "palette-classic"
+ }
+ },
+ "overrides": []
+ }
+ },
+ {
+ "title": "Load Factor Breakdown",
+ "description": "Decomposed load factor components: server (max of local, net, cluster), fee escalation, fee queue, and combined. Values are unitless multipliers where 1.0 = no load. Sourced from load_factor_metrics observable gauge.",
+ "type": "timeseries",
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 0,
+ "y": 16
+ },
+ "options": {
+ "tooltip": {
+ "mode": "multi",
+ "sort": "desc"
+ }
+ },
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus"
+ },
+ "expr": "rippled_load_factor_metrics{metric=\"load_factor\"}",
+ "legendFormat": "Combined Load Factor"
+ },
+ {
+ "datasource": {
+ "type": "prometheus"
+ },
+ "expr": "rippled_load_factor_metrics{metric=\"load_factor_server\"}",
+ "legendFormat": "Server"
+ },
+ {
+ "datasource": {
+ "type": "prometheus"
+ },
+ "expr": "rippled_load_factor_metrics{metric=\"load_factor_fee_escalation\"}",
+ "legendFormat": "Fee Escalation"
+ },
+ {
+ "datasource": {
+ "type": "prometheus"
+ },
+ "expr": "rippled_load_factor_metrics{metric=\"load_factor_fee_queue\"}",
+ "legendFormat": "Fee Queue"
+ }
+ ],
+ "fieldConfig": {
+ "defaults": {
+ "unit": "short",
+ "custom": {
+ "drawStyle": "line",
+ "lineWidth": 2,
+ "fillOpacity": 5
+ },
+ "color": {
+ "mode": "palette-classic"
+ },
+ "thresholds": {
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ },
+ {
+ "color": "yellow",
+ "value": 2
+ },
+ {
+ "color": "red",
+ "value": 10
+ }
+ ]
+ }
+ },
+ "overrides": []
+ }
+ },
+ {
+ "title": "Load Factor Components",
+ "description": "Individual load factor contributors: local server load, network load, and cluster load. Only differ from 1.0 under load conditions. Sourced from load_factor_metrics observable gauge.",
+ "type": "timeseries",
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 12,
+ "y": 16
+ },
+ "options": {
+ "tooltip": {
+ "mode": "multi",
+ "sort": "desc"
+ }
+ },
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus"
+ },
+ "expr": "rippled_load_factor_metrics{metric=\"load_factor_local\"}",
+ "legendFormat": "Local"
+ },
+ {
+ "datasource": {
+ "type": "prometheus"
+ },
+ "expr": "rippled_load_factor_metrics{metric=\"load_factor_net\"}",
+ "legendFormat": "Network"
+ },
+ {
+ "datasource": {
+ "type": "prometheus"
+ },
+ "expr": "rippled_load_factor_metrics{metric=\"load_factor_cluster\"}",
+ "legendFormat": "Cluster"
+ }
+ ],
+ "fieldConfig": {
+ "defaults": {
+ "unit": "short",
+ "custom": {
+ "drawStyle": "line",
+ "lineWidth": 2,
+ "fillOpacity": 5
+ },
+ "color": {
+ "mode": "palette-classic"
+ }
+ },
+ "overrides": []
+ }
+ }
+ ],
+ "schemaVersion": 39,
+ "tags": ["rippled", "otel", "fee-market"],
+ "templating": {
+ "list": []
+ },
+ "time": {
+ "from": "now-1h",
+ "to": "now"
+ },
+ "timepicker": {},
+ "timezone": "browser",
+ "title": "rippled - Fee Market & TxQ",
+ "uid": "rippled-fee-market",
+ "version": 1
+}
diff --git a/docker/telemetry/grafana/dashboards/rippled-job-queue.json b/docker/telemetry/grafana/dashboards/rippled-job-queue.json
new file mode 100644
index 0000000000..1f3a30ca75
--- /dev/null
+++ b/docker/telemetry/grafana/dashboards/rippled-job-queue.json
@@ -0,0 +1,324 @@
+{
+ "annotations": {
+ "list": []
+ },
+ "description": "Job queue analysis: per-job-type throughput rates, queue wait times, and execution times. Sourced from OTel MetricsRegistry synchronous counters and histograms (Phase 9).",
+ "editable": true,
+ "fiscalYearStartMonth": 0,
+ "graphTooltip": 1,
+ "id": null,
+ "links": [],
+ "panels": [
+ {
+ "title": "Job Throughput Rate (per second)",
+ "description": "Rate of jobs queued, started, and finished across all job types. Computed as rate() over the OTel counter values. High queue rates with low finish rates indicate backlog.",
+ "type": "timeseries",
+ "gridPos": {
+ "h": 8,
+ "w": 24,
+ "x": 0,
+ "y": 0
+ },
+ "options": {
+ "tooltip": {
+ "mode": "multi",
+ "sort": "desc"
+ }
+ },
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus"
+ },
+ "expr": "sum(rate(rippled_job_queued_total[5m]))",
+ "legendFormat": "Queued/s"
+ },
+ {
+ "datasource": {
+ "type": "prometheus"
+ },
+ "expr": "sum(rate(rippled_job_started_total[5m]))",
+ "legendFormat": "Started/s"
+ },
+ {
+ "datasource": {
+ "type": "prometheus"
+ },
+ "expr": "sum(rate(rippled_job_finished_total[5m]))",
+ "legendFormat": "Finished/s"
+ }
+ ],
+ "fieldConfig": {
+ "defaults": {
+ "unit": "ops",
+ "custom": {
+ "drawStyle": "line",
+ "lineWidth": 2,
+ "fillOpacity": 10
+ },
+ "color": {
+ "mode": "palette-classic"
+ }
+ },
+ "overrides": []
+ }
+ },
+ {
+ "title": "Per-Job-Type Queued Rate",
+ "description": "Rate of jobs queued broken down by job_type label. Identifies which job types contribute most to queue activity.",
+ "type": "timeseries",
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 0,
+ "y": 8
+ },
+ "options": {
+ "tooltip": {
+ "mode": "multi",
+ "sort": "desc"
+ },
+ "legend": {
+ "displayMode": "table",
+ "placement": "right",
+ "calcs": ["mean", "max"]
+ }
+ },
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus"
+ },
+ "expr": "topk(10, rate(rippled_job_queued_total[5m]))",
+ "legendFormat": "{{job_type}}"
+ }
+ ],
+ "fieldConfig": {
+ "defaults": {
+ "unit": "ops",
+ "custom": {
+ "drawStyle": "line",
+ "lineWidth": 1,
+ "fillOpacity": 5
+ },
+ "color": {
+ "mode": "palette-classic"
+ }
+ },
+ "overrides": []
+ }
+ },
+ {
+ "title": "Per-Job-Type Finish Rate",
+ "description": "Rate of jobs completing broken down by job_type. Compare with queued rate to identify backlog per type.",
+ "type": "timeseries",
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 12,
+ "y": 8
+ },
+ "options": {
+ "tooltip": {
+ "mode": "multi",
+ "sort": "desc"
+ },
+ "legend": {
+ "displayMode": "table",
+ "placement": "right",
+ "calcs": ["mean", "max"]
+ }
+ },
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus"
+ },
+ "expr": "topk(10, rate(rippled_job_finished_total[5m]))",
+ "legendFormat": "{{job_type}}"
+ }
+ ],
+ "fieldConfig": {
+ "defaults": {
+ "unit": "ops",
+ "custom": {
+ "drawStyle": "line",
+ "lineWidth": 1,
+ "fillOpacity": 5
+ },
+ "color": {
+ "mode": "palette-classic"
+ }
+ },
+ "overrides": []
+ }
+ },
+ {
+ "title": "Job Queue Wait Time (p50, p95, p99)",
+ "description": "Histogram quantiles for time jobs spend waiting in the queue before execution starts. High values indicate thread pool saturation.",
+ "type": "timeseries",
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 0,
+ "y": 16
+ },
+ "options": {
+ "tooltip": {
+ "mode": "multi",
+ "sort": "desc"
+ }
+ },
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus"
+ },
+ "expr": "histogram_quantile(0.50, sum by (le) (rate(rippled_job_queued_duration_us_bucket[5m])))",
+ "legendFormat": "p50"
+ },
+ {
+ "datasource": {
+ "type": "prometheus"
+ },
+ "expr": "histogram_quantile(0.95, sum by (le) (rate(rippled_job_queued_duration_us_bucket[5m])))",
+ "legendFormat": "p95"
+ },
+ {
+ "datasource": {
+ "type": "prometheus"
+ },
+ "expr": "histogram_quantile(0.99, sum by (le) (rate(rippled_job_queued_duration_us_bucket[5m])))",
+ "legendFormat": "p99"
+ }
+ ],
+ "fieldConfig": {
+ "defaults": {
+ "unit": "us",
+ "custom": {
+ "drawStyle": "line",
+ "lineWidth": 2,
+ "fillOpacity": 5
+ },
+ "color": {
+ "mode": "palette-classic"
+ }
+ },
+ "overrides": []
+ }
+ },
+ {
+ "title": "Job Execution Time (p50, p95, p99)",
+ "description": "Histogram quantiles for actual job execution time. High values indicate expensive operations or resource contention.",
+ "type": "timeseries",
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 12,
+ "y": 16
+ },
+ "options": {
+ "tooltip": {
+ "mode": "multi",
+ "sort": "desc"
+ }
+ },
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus"
+ },
+ "expr": "histogram_quantile(0.50, sum by (le) (rate(rippled_job_running_duration_us_bucket[5m])))",
+ "legendFormat": "p50"
+ },
+ {
+ "datasource": {
+ "type": "prometheus"
+ },
+ "expr": "histogram_quantile(0.95, sum by (le) (rate(rippled_job_running_duration_us_bucket[5m])))",
+ "legendFormat": "p95"
+ },
+ {
+ "datasource": {
+ "type": "prometheus"
+ },
+ "expr": "histogram_quantile(0.99, sum by (le) (rate(rippled_job_running_duration_us_bucket[5m])))",
+ "legendFormat": "p99"
+ }
+ ],
+ "fieldConfig": {
+ "defaults": {
+ "unit": "us",
+ "custom": {
+ "drawStyle": "line",
+ "lineWidth": 2,
+ "fillOpacity": 5
+ },
+ "color": {
+ "mode": "palette-classic"
+ }
+ },
+ "overrides": []
+ }
+ },
+ {
+ "title": "Per-Job-Type Execution Time (p95)",
+ "description": "95th percentile execution time broken down by job type. Identifies the slowest job types.",
+ "type": "timeseries",
+ "gridPos": {
+ "h": 8,
+ "w": 24,
+ "x": 0,
+ "y": 24
+ },
+ "options": {
+ "tooltip": {
+ "mode": "multi",
+ "sort": "desc"
+ },
+ "legend": {
+ "displayMode": "table",
+ "placement": "right",
+ "calcs": ["mean", "max"]
+ }
+ },
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus"
+ },
+ "expr": "topk(10, histogram_quantile(0.95, sum by (le, job_type) (rate(rippled_job_running_duration_us_bucket[5m]))))",
+ "legendFormat": "{{job_type}}"
+ }
+ ],
+ "fieldConfig": {
+ "defaults": {
+ "unit": "us",
+ "custom": {
+ "drawStyle": "line",
+ "lineWidth": 1,
+ "fillOpacity": 5
+ },
+ "color": {
+ "mode": "palette-classic"
+ }
+ },
+ "overrides": []
+ }
+ }
+ ],
+ "schemaVersion": 39,
+ "tags": ["rippled", "otel", "job-queue"],
+ "templating": {
+ "list": []
+ },
+ "time": {
+ "from": "now-1h",
+ "to": "now"
+ },
+ "timepicker": {},
+ "timezone": "browser",
+ "title": "rippled - Job Queue Analysis",
+ "uid": "rippled-job-queue",
+ "version": 1
+}
diff --git a/docker/telemetry/grafana/dashboards/rippled-rpc-perf.json b/docker/telemetry/grafana/dashboards/rippled-rpc-perf.json
new file mode 100644
index 0000000000..d26eae7a6f
--- /dev/null
+++ b/docker/telemetry/grafana/dashboards/rippled-rpc-perf.json
@@ -0,0 +1,333 @@
+{
+ "annotations": {
+ "list": []
+ },
+ "description": "Per-RPC-method performance: call rates, error rates, and latency distributions. Sourced from OTel MetricsRegistry synchronous counters and histograms (Phase 9).",
+ "editable": true,
+ "fiscalYearStartMonth": 0,
+ "graphTooltip": 1,
+ "id": null,
+ "links": [],
+ "panels": [
+ {
+ "title": "RPC Call Rate (all methods)",
+ "description": "Aggregate rate of RPC calls started, finished, and errored across all methods. Computed as rate() over OTel counters.",
+ "type": "timeseries",
+ "gridPos": {
+ "h": 8,
+ "w": 24,
+ "x": 0,
+ "y": 0
+ },
+ "options": {
+ "tooltip": {
+ "mode": "multi",
+ "sort": "desc"
+ }
+ },
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus"
+ },
+ "expr": "sum(rate(rippled_rpc_method_started_total[5m]))",
+ "legendFormat": "Started/s"
+ },
+ {
+ "datasource": {
+ "type": "prometheus"
+ },
+ "expr": "sum(rate(rippled_rpc_method_finished_total[5m]))",
+ "legendFormat": "Finished/s"
+ },
+ {
+ "datasource": {
+ "type": "prometheus"
+ },
+ "expr": "sum(rate(rippled_rpc_method_errored_total[5m]))",
+ "legendFormat": "Errored/s"
+ }
+ ],
+ "fieldConfig": {
+ "defaults": {
+ "unit": "ops",
+ "custom": {
+ "drawStyle": "line",
+ "lineWidth": 2,
+ "fillOpacity": 10
+ },
+ "color": {
+ "mode": "palette-classic"
+ }
+ },
+ "overrides": []
+ }
+ },
+ {
+ "title": "Per-Method Call Rate (Top 10)",
+ "description": "Per-method RPC call rate, showing the 10 most active methods. Useful for identifying hot paths.",
+ "type": "timeseries",
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 0,
+ "y": 8
+ },
+ "options": {
+ "tooltip": {
+ "mode": "multi",
+ "sort": "desc"
+ },
+ "legend": {
+ "displayMode": "table",
+ "placement": "right",
+ "calcs": ["mean", "max"]
+ }
+ },
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus"
+ },
+ "expr": "topk(10, rate(rippled_rpc_method_started_total[5m]))",
+ "legendFormat": "{{method}}"
+ }
+ ],
+ "fieldConfig": {
+ "defaults": {
+ "unit": "ops",
+ "custom": {
+ "drawStyle": "line",
+ "lineWidth": 1,
+ "fillOpacity": 5
+ },
+ "color": {
+ "mode": "palette-classic"
+ }
+ },
+ "overrides": []
+ }
+ },
+ {
+ "title": "Per-Method Error Rate (Top 10)",
+ "description": "Per-method RPC error rate. Non-zero values warrant investigation. Common culprits: invalid parameters, resource exhaustion.",
+ "type": "timeseries",
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 12,
+ "y": 8
+ },
+ "options": {
+ "tooltip": {
+ "mode": "multi",
+ "sort": "desc"
+ },
+ "legend": {
+ "displayMode": "table",
+ "placement": "right",
+ "calcs": ["mean", "max"]
+ }
+ },
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus"
+ },
+ "expr": "topk(10, rate(rippled_rpc_method_errored_total[5m]))",
+ "legendFormat": "{{method}}"
+ }
+ ],
+ "fieldConfig": {
+ "defaults": {
+ "unit": "ops",
+ "custom": {
+ "drawStyle": "line",
+ "lineWidth": 1,
+ "fillOpacity": 5
+ },
+ "color": {
+ "mode": "palette-classic"
+ }
+ },
+ "overrides": []
+ }
+ },
+ {
+ "title": "RPC Latency (p50, p95, p99) - All Methods",
+ "description": "Histogram quantiles for RPC execution time across all methods. Sourced from rpc_method_duration_us histogram.",
+ "type": "timeseries",
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 0,
+ "y": 16
+ },
+ "options": {
+ "tooltip": {
+ "mode": "multi",
+ "sort": "desc"
+ }
+ },
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus"
+ },
+ "expr": "histogram_quantile(0.50, sum by (le) (rate(rippled_rpc_method_duration_us_bucket[5m])))",
+ "legendFormat": "p50"
+ },
+ {
+ "datasource": {
+ "type": "prometheus"
+ },
+ "expr": "histogram_quantile(0.95, sum by (le) (rate(rippled_rpc_method_duration_us_bucket[5m])))",
+ "legendFormat": "p95"
+ },
+ {
+ "datasource": {
+ "type": "prometheus"
+ },
+ "expr": "histogram_quantile(0.99, sum by (le) (rate(rippled_rpc_method_duration_us_bucket[5m])))",
+ "legendFormat": "p99"
+ }
+ ],
+ "fieldConfig": {
+ "defaults": {
+ "unit": "us",
+ "custom": {
+ "drawStyle": "line",
+ "lineWidth": 2,
+ "fillOpacity": 5
+ },
+ "color": {
+ "mode": "palette-classic"
+ }
+ },
+ "overrides": []
+ }
+ },
+ {
+ "title": "Per-Method Latency p95 (Top 10 Slowest)",
+ "description": "95th percentile execution time per method. Identifies the slowest RPC endpoints.",
+ "type": "timeseries",
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 12,
+ "y": 16
+ },
+ "options": {
+ "tooltip": {
+ "mode": "multi",
+ "sort": "desc"
+ },
+ "legend": {
+ "displayMode": "table",
+ "placement": "right",
+ "calcs": ["mean", "max"]
+ }
+ },
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus"
+ },
+ "expr": "topk(10, histogram_quantile(0.95, sum by (le, method) (rate(rippled_rpc_method_duration_us_bucket[5m]))))",
+ "legendFormat": "{{method}}"
+ }
+ ],
+ "fieldConfig": {
+ "defaults": {
+ "unit": "us",
+ "custom": {
+ "drawStyle": "line",
+ "lineWidth": 1,
+ "fillOpacity": 5
+ },
+ "color": {
+ "mode": "palette-classic"
+ }
+ },
+ "overrides": []
+ }
+ },
+ {
+ "title": "RPC Error Ratio by Method",
+ "description": "Error ratio (errors / total started) per method. Values above 0.05 (5%) warrant investigation.",
+ "type": "timeseries",
+ "gridPos": {
+ "h": 8,
+ "w": 24,
+ "x": 0,
+ "y": 24
+ },
+ "options": {
+ "tooltip": {
+ "mode": "multi",
+ "sort": "desc"
+ },
+ "legend": {
+ "displayMode": "table",
+ "placement": "right",
+ "calcs": ["mean", "max"]
+ }
+ },
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus"
+ },
+ "expr": "topk(10, rate(rippled_rpc_method_errored_total[5m]) / (rate(rippled_rpc_method_started_total[5m]) > 0))",
+ "legendFormat": "{{method}}"
+ }
+ ],
+ "fieldConfig": {
+ "defaults": {
+ "unit": "percentunit",
+ "min": 0,
+ "max": 1,
+ "custom": {
+ "drawStyle": "line",
+ "lineWidth": 1,
+ "fillOpacity": 5
+ },
+ "color": {
+ "mode": "palette-classic"
+ },
+ "thresholds": {
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ },
+ {
+ "color": "yellow",
+ "value": 0.05
+ },
+ {
+ "color": "red",
+ "value": 0.25
+ }
+ ]
+ }
+ },
+ "overrides": []
+ }
+ }
+ ],
+ "schemaVersion": 39,
+ "tags": ["rippled", "otel", "rpc"],
+ "templating": {
+ "list": []
+ },
+ "time": {
+ "from": "now-1h",
+ "to": "now"
+ },
+ "timepicker": {},
+ "timezone": "browser",
+ "title": "rippled - RPC Performance (OTel)",
+ "uid": "rippled-rpc-perf",
+ "version": 1
+}
diff --git a/docker/telemetry/grafana/dashboards/system-node-health.json b/docker/telemetry/grafana/dashboards/system-node-health.json
index 3fa987210c..0b46b45a6d 100644
--- a/docker/telemetry/grafana/dashboards/system-node-health.json
+++ b/docker/telemetry/grafana/dashboards/system-node-health.json
@@ -399,10 +399,318 @@
},
"overrides": []
}
+ },
+ {
+ "title": "--- OTel: NodeStore I/O ---",
+ "type": "row",
+ "gridPos": {
+ "h": 1,
+ "w": 24,
+ "x": 0,
+ "y": 32
+ },
+ "collapsed": false,
+ "panels": []
+ },
+ {
+ "title": "NodeStore Read/Write Totals",
+ "description": "Cumulative NodeStore read and write operation counts. Sourced from MetricsRegistry nodestore_state observable gauge with metric=node_reads_total, node_writes, node_reads_hit.",
+ "type": "timeseries",
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 0,
+ "y": 33
+ },
+ "options": {
+ "tooltip": {
+ "mode": "multi",
+ "sort": "desc"
+ }
+ },
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus"
+ },
+ "expr": "rippled_nodestore_state{metric=\"node_reads_total\"}",
+ "legendFormat": "Reads Total"
+ },
+ {
+ "datasource": {
+ "type": "prometheus"
+ },
+ "expr": "rippled_nodestore_state{metric=\"node_reads_hit\"}",
+ "legendFormat": "Reads Hit (cache)"
+ },
+ {
+ "datasource": {
+ "type": "prometheus"
+ },
+ "expr": "rippled_nodestore_state{metric=\"node_writes\"}",
+ "legendFormat": "Writes Total"
+ }
+ ],
+ "fieldConfig": {
+ "defaults": {
+ "unit": "short",
+ "custom": {
+ "drawStyle": "line",
+ "lineWidth": 2,
+ "fillOpacity": 10
+ },
+ "color": {
+ "mode": "palette-classic"
+ }
+ },
+ "overrides": []
+ }
+ },
+ {
+ "title": "NodeStore Write Load & Read Queue",
+ "description": "Instantaneous write load score and read queue depth. High write load indicates backend pressure. High read queue indicates prefetch thread saturation.",
+ "type": "timeseries",
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 12,
+ "y": 33
+ },
+ "options": {
+ "tooltip": {
+ "mode": "multi",
+ "sort": "desc"
+ }
+ },
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus"
+ },
+ "expr": "rippled_nodestore_state{metric=\"write_load\"}",
+ "legendFormat": "Write Load"
+ },
+ {
+ "datasource": {
+ "type": "prometheus"
+ },
+ "expr": "rippled_nodestore_state{metric=\"read_queue\"}",
+ "legendFormat": "Read Queue"
+ }
+ ],
+ "fieldConfig": {
+ "defaults": {
+ "unit": "short",
+ "custom": {
+ "drawStyle": "line",
+ "lineWidth": 2,
+ "fillOpacity": 10
+ },
+ "color": {
+ "mode": "palette-classic"
+ },
+ "thresholds": {
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ },
+ {
+ "color": "yellow",
+ "value": 100
+ },
+ {
+ "color": "red",
+ "value": 1000
+ }
+ ]
+ }
+ },
+ "overrides": []
+ }
+ },
+ {
+ "title": "--- OTel: Cache Hit Rates ---",
+ "type": "row",
+ "gridPos": {
+ "h": 1,
+ "w": 24,
+ "x": 0,
+ "y": 41
+ },
+ "collapsed": false,
+ "panels": []
+ },
+ {
+ "title": "Cache Hit Rates",
+ "description": "Hit rates for SLE cache, Ledger cache, and AcceptedLedger cache. Values from 0.0 to 1.0. Low values indicate cache thrashing. Sourced from MetricsRegistry cache_metrics observable gauge.",
+ "type": "timeseries",
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 0,
+ "y": 42
+ },
+ "options": {
+ "tooltip": {
+ "mode": "multi",
+ "sort": "desc"
+ }
+ },
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus"
+ },
+ "expr": "rippled_cache_metrics{metric=\"SLE_hit_rate\"}",
+ "legendFormat": "SLE Hit Rate"
+ },
+ {
+ "datasource": {
+ "type": "prometheus"
+ },
+ "expr": "rippled_cache_metrics{metric=\"ledger_hit_rate\"}",
+ "legendFormat": "Ledger Hit Rate"
+ },
+ {
+ "datasource": {
+ "type": "prometheus"
+ },
+ "expr": "rippled_cache_metrics{metric=\"AL_hit_rate\"}",
+ "legendFormat": "AcceptedLedger Hit Rate"
+ }
+ ],
+ "fieldConfig": {
+ "defaults": {
+ "unit": "percentunit",
+ "min": 0,
+ "max": 1,
+ "custom": {
+ "drawStyle": "line",
+ "lineWidth": 2,
+ "fillOpacity": 10
+ },
+ "color": {
+ "mode": "palette-classic"
+ }
+ },
+ "overrides": []
+ }
+ },
+ {
+ "title": "Cache Sizes",
+ "description": "TreeNode cache size, TreeNode track size, and FullBelow cache size. Sourced from MetricsRegistry cache_metrics observable gauge.",
+ "type": "timeseries",
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 12,
+ "y": 42
+ },
+ "options": {
+ "tooltip": {
+ "mode": "multi",
+ "sort": "desc"
+ }
+ },
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus"
+ },
+ "expr": "rippled_cache_metrics{metric=\"treenode_cache_size\"}",
+ "legendFormat": "TreeNode Cache"
+ },
+ {
+ "datasource": {
+ "type": "prometheus"
+ },
+ "expr": "rippled_cache_metrics{metric=\"treenode_track_size\"}",
+ "legendFormat": "TreeNode Track"
+ },
+ {
+ "datasource": {
+ "type": "prometheus"
+ },
+ "expr": "rippled_cache_metrics{metric=\"fullbelow_size\"}",
+ "legendFormat": "FullBelow"
+ }
+ ],
+ "fieldConfig": {
+ "defaults": {
+ "unit": "short",
+ "custom": {
+ "drawStyle": "line",
+ "lineWidth": 2,
+ "fillOpacity": 10
+ },
+ "color": {
+ "mode": "palette-classic"
+ }
+ },
+ "overrides": []
+ }
+ },
+ {
+ "title": "--- OTel: Object Instance Counts ---",
+ "type": "row",
+ "gridPos": {
+ "h": 1,
+ "w": 24,
+ "x": 0,
+ "y": 50
+ },
+ "collapsed": false,
+ "panels": []
+ },
+ {
+ "title": "Object Instance Counts",
+ "description": "Live instance counts for key internal object types tracked by CountedObject. Sourced from MetricsRegistry object_count observable gauge. High counts may indicate memory pressure or object leaks.",
+ "type": "timeseries",
+ "gridPos": {
+ "h": 8,
+ "w": 24,
+ "x": 0,
+ "y": 51
+ },
+ "options": {
+ "tooltip": {
+ "mode": "multi",
+ "sort": "desc"
+ },
+ "legend": {
+ "displayMode": "table",
+ "placement": "right",
+ "calcs": ["last", "max"]
+ }
+ },
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus"
+ },
+ "expr": "topk(15, rippled_object_count)",
+ "legendFormat": "{{type}}"
+ }
+ ],
+ "fieldConfig": {
+ "defaults": {
+ "unit": "short",
+ "custom": {
+ "drawStyle": "line",
+ "lineWidth": 1,
+ "fillOpacity": 5
+ },
+ "color": {
+ "mode": "palette-classic"
+ }
+ },
+ "overrides": []
+ }
}
],
"schemaVersion": 39,
- "tags": ["rippled", "statsd", "node-health", "telemetry"],
+ "tags": ["rippled", "statsd", "otel", "node-health", "telemetry"],
"templating": {
"list": [
{
diff --git a/docker/telemetry/integration-test.sh b/docker/telemetry/integration-test.sh
index 5e6696a33c..09a0032c51 100755
--- a/docker/telemetry/integration-test.sh
+++ b/docker/telemetry/integration-test.sh
@@ -353,6 +353,7 @@ trace_transactions=1
trace_consensus=1
trace_peer=1
trace_ledger=1
+metrics_endpoint=http://localhost:4318/v1/metrics
[insight]
server=otel
@@ -636,6 +637,53 @@ else
fail "StatsD port 8125 appears to be listening (should not be needed)"
fi
+# ---------------------------------------------------------------------------
+# Step 10c: Verify Phase 9 OTel SDK Metrics
+# ---------------------------------------------------------------------------
+log ""
+log "--- Phase 9: OTel SDK Metrics (MetricsRegistry) ---"
+log "Waiting 15s for OTel metric export + Prometheus scrape..."
+sleep 15
+
+check_otel_metric() {
+ local metric_name="$1"
+ local result
+ result=$(curl -sf "$PROM/api/v1/query?query=$metric_name" \
+ | jq '.data.result | length' 2>/dev/null || echo 0)
+ if [ "$result" -gt 0 ]; then
+ ok "OTel: $metric_name ($result series)"
+ else
+ fail "OTel: $metric_name (0 series)"
+ fi
+}
+
+# Task 9.1: NodeStore I/O
+check_otel_metric 'rippled_nodestore_state{metric="node_reads_total"}'
+check_otel_metric 'rippled_nodestore_state{metric="write_load"}'
+
+# Task 9.2: Cache hit rates
+check_otel_metric 'rippled_cache_metrics{metric="SLE_hit_rate"}'
+check_otel_metric 'rippled_cache_metrics{metric="treenode_cache_size"}'
+
+# Task 9.3: TxQ metrics
+check_otel_metric 'rippled_txq_metrics{metric="txq_count"}'
+check_otel_metric 'rippled_txq_metrics{metric="txq_reference_fee_level"}'
+
+# Task 9.4: Per-RPC metrics
+check_otel_metric "rippled_rpc_method_started_total"
+check_otel_metric "rippled_rpc_method_finished_total"
+
+# Task 9.5: Per-job metrics
+check_otel_metric "rippled_job_queued_total"
+check_otel_metric "rippled_job_finished_total"
+
+# Task 9.6: Counted object instances
+check_otel_metric "rippled_object_count"
+
+# Task 9.7: Load factor breakdown
+check_otel_metric 'rippled_load_factor_metrics{metric="load_factor"}'
+check_otel_metric 'rippled_load_factor_metrics{metric="load_factor_server"}'
+
# ---------------------------------------------------------------------------
# Step 11: Summary
# ---------------------------------------------------------------------------
diff --git a/docs/telemetry-runbook.md b/docs/telemetry-runbook.md
index b47487b14e..b5a5acdb0e 100644
--- a/docs/telemetry-runbook.md
+++ b/docs/telemetry-runbook.md
@@ -231,7 +231,7 @@ When using StatsD, uncomment the `statsd` receiver in `otel-collector-config.yam
## Grafana Dashboards
-Eight dashboards are pre-provisioned in `docker/telemetry/grafana/dashboards/`:
+Thirteen dashboards are pre-provisioned in `docker/telemetry/grafana/dashboards/`:
### RPC Performance (`rippled-rpc-perf`)
@@ -403,8 +403,74 @@ count_over_time({job="rippled"} |= "trace_id=" [5m])
4. Open Grafana at http://localhost:3000 -> Explore -> Loki and search for `{job="rippled"} |= "trace_id="`.
5. Click the TraceID link to navigate to the corresponding trace in Tempo.
+## Phase 9: OTel Metrics Alerting Rules
+
+The following alerting rules are recommended for the Phase 9 OTel SDK metrics.
+Add to your Prometheus alerting rules configuration.
+
+### NodeStore
+
+| Alert Name | Severity | Condition | For | Description |
+| --------------------------- | -------- | ---------------------------------------------------- | --- | ------------------------------------------------------- |
+| `NodeStoreHighWriteLoad` | Warning | `rippled_nodestore_state{metric="write_load"} > 100` | 5m | NodeStore backend is under sustained write pressure |
+| `NodeStoreReadQueueBacklog` | Warning | `rippled_nodestore_state{metric="read_queue"} > 500` | 5m | Prefetch thread pool is saturated; reads are backing up |
+
+### Cache
+
+| Alert Name | Severity | Condition | For | Description |
+| ----------------------- | -------- | ------------------------------------------------------- | --- | ------------------------------------------------------ |
+| `SLECacheHitRateLow` | Warning | `rippled_cache_metrics{metric="SLE_hit_rate"} < 0.5` | 10m | SLE cache is thrashing; consider increasing cache size |
+| `LedgerCacheHitRateLow` | Warning | `rippled_cache_metrics{metric="ledger_hit_rate"} < 0.5` | 10m | Ledger cache hit rate is degraded |
+
+### Transaction Queue
+
+| Alert Name | Severity | Condition | For | Description |
+| ---------------------- | -------- | ---------------------------------------------------------------------------------------------------------------------- | --- | -------------------------------------------------- |
+| `TxQNearCapacity` | Warning | `rippled_txq_metrics{metric="txq_count"} / rippled_txq_metrics{metric="txq_max_size"} > 0.8` | 5m | TxQ is >80% full; transactions may be rejected |
+| `TxQHighFeeEscalation` | Warning | `rippled_txq_metrics{metric="txq_open_ledger_fee_level"} / rippled_txq_metrics{metric="txq_reference_fee_level"} > 10` | 5m | Fee escalation is 10x above reference; high demand |
+
+### Load Factor
+
+| Alert Name | Severity | Condition | For | Description |
+| --------------------- | -------- | -------------------------------------------------------------- | --- | -------------------------------------------------------------- |
+| `HighLoadFactor` | Warning | `rippled_load_factor_metrics{metric="load_factor"} > 5` | 10m | Combined load factor is elevated; transactions cost 5x+ normal |
+| `HighLocalLoadFactor` | Critical | `rippled_load_factor_metrics{metric="load_factor_local"} > 10` | 5m | Local server load is critically elevated |
+
+### RPC Performance
+
+| Alert Name | Severity | Condition | For | Description |
+| ------------------ | -------- | ---------------------------------------------------------------------------------------------------------- | --- | --------------------------------- |
+| `HighRPCErrorRate` | Warning | `sum(rate(rippled_rpc_method_errored_total[5m])) / sum(rate(rippled_rpc_method_started_total[5m])) > 0.05` | 5m | >5% of RPC calls are erroring |
+| `SlowRPCLatency` | Warning | `histogram_quantile(0.95, sum by (le) (rate(rippled_rpc_method_duration_us_bucket[5m]))) > 5000000` | 5m | RPC p95 latency exceeds 5 seconds |
+
+### Job Queue
+
+| Alert Name | Severity | Condition | For | Description |
+| ------------------ | -------- | ----------------------------------------------------------------------------------------------------- | --- | ---------------------------------------------------- |
+| `JobQueueBacklog` | Warning | `sum(rate(rippled_job_queued_total[5m])) - sum(rate(rippled_job_finished_total[5m])) > 100` | 5m | Jobs are being queued faster than they're completing |
+| `SlowJobExecution` | Warning | `histogram_quantile(0.95, sum by (le) (rate(rippled_job_running_duration_us_bucket[5m]))) > 10000000` | 5m | Job execution p95 exceeds 10 seconds |
+
## Troubleshooting
+### No OTel SDK metrics in Prometheus
+
+1. Verify `enabled=1` in the `[telemetry]` config section
+2. Check that `metrics_endpoint` points to the OTel Collector's HTTP receiver
+ (default: `http://localhost:4318/v1/metrics`)
+3. Check rippled logs for `MetricsRegistry: started successfully` message
+4. Verify the OTel Collector is configured with an OTLP receiver and Prometheus exporter
+5. Check Prometheus targets page for the collector scrape target
+
+### Cache hit rates are zero
+
+Cache hit rates may be zero during startup before caches are warmed. Wait for the
+node to reach `Full` operating mode and process several ledgers before investigating.
+
+### NodeStore I/O counters not incrementing
+
+NodeStore counters are cumulative and may appear flat if the node is idle. Submit
+some transactions or RPC requests to generate I/O activity.
+
### No traces appearing in Jaeger
1. Check rippled logs for `Telemetry starting` message
diff --git a/include/xrpl/core/ServiceRegistry.h b/include/xrpl/core/ServiceRegistry.h
index 200b1bf59c..86f92ce774 100644
--- a/include/xrpl/core/ServiceRegistry.h
+++ b/include/xrpl/core/ServiceRegistry.h
@@ -21,7 +21,8 @@ class PerfLog;
}
namespace telemetry {
class Telemetry;
-}
+class MetricsRegistry;
+} // namespace telemetry
// This is temporary until we migrate all code to use ServiceRegistry.
class Application;
@@ -211,6 +212,12 @@ public:
virtual telemetry::Telemetry&
getTelemetry() = 0;
+ /** Return the MetricsRegistry, or nullptr if telemetry is disabled.
+ Used by PerfLog and other hot paths to record OTel metrics.
+ */
+ virtual telemetry::MetricsRegistry*
+ getMetricsRegistry() = 0;
+
// Configuration and state
virtual bool
isStopping() const = 0;
diff --git a/src/test/telemetry/MetricsRegistry_test.cpp b/src/test/telemetry/MetricsRegistry_test.cpp
new file mode 100644
index 0000000000..29877d4604
--- /dev/null
+++ b/src/test/telemetry/MetricsRegistry_test.cpp
@@ -0,0 +1,374 @@
+/** Unit tests for MetricsRegistry.
+
+ Tests cover:
+ - Construction with telemetry disabled (no-op behavior).
+ - start()/stop() lifecycle when disabled.
+ - Synchronous instrument recording methods do not crash when disabled.
+ - Double stop() is safe.
+
+ NOTE: Tests that exercise the OTel SDK path require XRPL_ENABLE_TELEMETRY
+ to be defined at build time (telemetry=ON). The no-op path tests run
+ unconditionally.
+*/
+
+#include
+
+#include
+#include
+
+namespace xrpl {
+namespace test {
+
+/** Minimal mock ServiceRegistry for MetricsRegistry testing.
+
+ Only the getMetricsRegistry() call is used in the tests; other methods
+ are not invoked because the registry is disabled (enabled=false) so no
+ gauge callbacks execute.
+
+ All pure virtual methods throw to catch accidental calls during tests.
+*/
+class MockServiceRegistry : public ServiceRegistry
+{
+ [[noreturn]] void
+ throwUnimplemented() const
+ {
+ Throw("MockServiceRegistry: method not implemented");
+ }
+
+public:
+ // ServiceRegistry interface — stubs that should never be called.
+ CollectorManager&
+ getCollectorManager() override
+ {
+ throwUnimplemented();
+ }
+ Family&
+ getNodeFamily() override
+ {
+ throwUnimplemented();
+ }
+ TimeKeeper&
+ timeKeeper() override
+ {
+ throwUnimplemented();
+ }
+ JobQueue&
+ getJobQueue() override
+ {
+ throwUnimplemented();
+ }
+ NodeCache&
+ getTempNodeCache() override
+ {
+ throwUnimplemented();
+ }
+ CachedSLEs&
+ cachedSLEs() override
+ {
+ throwUnimplemented();
+ }
+ NetworkIDService&
+ getNetworkIDService() override
+ {
+ throwUnimplemented();
+ }
+ AmendmentTable&
+ getAmendmentTable() override
+ {
+ throwUnimplemented();
+ }
+ HashRouter&
+ getHashRouter() override
+ {
+ throwUnimplemented();
+ }
+ LoadFeeTrack&
+ getFeeTrack() override
+ {
+ throwUnimplemented();
+ }
+ LoadManager&
+ getLoadManager() override
+ {
+ throwUnimplemented();
+ }
+ RCLValidations&
+ getValidations() override
+ {
+ throwUnimplemented();
+ }
+ ValidatorList&
+ validators() override
+ {
+ throwUnimplemented();
+ }
+ ValidatorSite&
+ validatorSites() override
+ {
+ throwUnimplemented();
+ }
+ ManifestCache&
+ validatorManifests() override
+ {
+ throwUnimplemented();
+ }
+ ManifestCache&
+ publisherManifests() override
+ {
+ throwUnimplemented();
+ }
+ Overlay&
+ overlay() override
+ {
+ throwUnimplemented();
+ }
+ Cluster&
+ cluster() override
+ {
+ throwUnimplemented();
+ }
+ PeerReservationTable&
+ peerReservations() override
+ {
+ throwUnimplemented();
+ }
+ Resource::Manager&
+ getResourceManager() override
+ {
+ throwUnimplemented();
+ }
+ NodeStore::Database&
+ getNodeStore() override
+ {
+ throwUnimplemented();
+ }
+ SHAMapStore&
+ getSHAMapStore() override
+ {
+ throwUnimplemented();
+ }
+ RelationalDatabase&
+ getRelationalDatabase() override
+ {
+ throwUnimplemented();
+ }
+ InboundLedgers&
+ getInboundLedgers() override
+ {
+ throwUnimplemented();
+ }
+ InboundTransactions&
+ getInboundTransactions() override
+ {
+ throwUnimplemented();
+ }
+ TaggedCache&
+ getAcceptedLedgerCache() override
+ {
+ throwUnimplemented();
+ }
+ LedgerMaster&
+ getLedgerMaster() override
+ {
+ throwUnimplemented();
+ }
+ LedgerCleaner&
+ getLedgerCleaner() override
+ {
+ throwUnimplemented();
+ }
+ LedgerReplayer&
+ getLedgerReplayer() override
+ {
+ throwUnimplemented();
+ }
+ PendingSaves&
+ pendingSaves() override
+ {
+ throwUnimplemented();
+ }
+ OpenLedger&
+ openLedger() override
+ {
+ throwUnimplemented();
+ }
+ OpenLedger const&
+ openLedger() const override
+ {
+ throwUnimplemented();
+ }
+ NetworkOPs&
+ getOPs() override
+ {
+ throwUnimplemented();
+ }
+ OrderBookDB&
+ getOrderBookDB() override
+ {
+ throwUnimplemented();
+ }
+ TransactionMaster&
+ getMasterTransaction() override
+ {
+ throwUnimplemented();
+ }
+ TxQ&
+ getTxQ() override
+ {
+ throwUnimplemented();
+ }
+ PathRequests&
+ getPathRequests() override
+ {
+ throwUnimplemented();
+ }
+ ServerHandler&
+ getServerHandler() override
+ {
+ throwUnimplemented();
+ }
+ perf::PerfLog&
+ getPerfLog() override
+ {
+ throwUnimplemented();
+ }
+ telemetry::Telemetry&
+ getTelemetry() override
+ {
+ throwUnimplemented();
+ }
+ telemetry::MetricsRegistry*
+ getMetricsRegistry() override
+ {
+ return nullptr;
+ }
+ bool
+ isStopping() const override
+ {
+ return false;
+ }
+ beast::Journal
+ journal(std::string const&) override
+ {
+ return beast::Journal(beast::Journal::getNullSink());
+ }
+ boost::asio::io_context&
+ getIOContext() override
+ {
+ throwUnimplemented();
+ }
+ Logs&
+ logs() override
+ {
+ throwUnimplemented();
+ }
+ std::optional const&
+ trapTxID() const override
+ {
+ static std::optional const empty;
+ return empty;
+ }
+ DatabaseCon&
+ getWalletDB() override
+ {
+ throwUnimplemented();
+ }
+ Application&
+ app() override
+ {
+ throwUnimplemented();
+ }
+};
+
+class MetricsRegistry_test : public beast::unit_test::suite
+{
+ void
+ testDisabledConstruction()
+ {
+ testcase("Disabled construction");
+
+ MockServiceRegistry mockApp;
+ beast::Journal j(beast::Journal::getNullSink());
+
+ // Construct with enabled=false; should be a no-op.
+ telemetry::MetricsRegistry registry(false, mockApp, j);
+ BEAST_EXPECT(!registry.isEnabled());
+ }
+
+ void
+ testDisabledStartStop()
+ {
+ testcase("Disabled start/stop");
+
+ MockServiceRegistry mockApp;
+ beast::Journal j(beast::Journal::getNullSink());
+
+ telemetry::MetricsRegistry registry(false, mockApp, j);
+
+ // start() and stop() should be no-ops when disabled.
+ registry.start("http://localhost:4318/v1/metrics");
+ registry.stop();
+
+ // Double stop should be safe.
+ registry.stop();
+
+ pass();
+ }
+
+ void
+ testDisabledRecording()
+ {
+ testcase("Disabled recording methods");
+
+ MockServiceRegistry mockApp;
+ beast::Journal j(beast::Journal::getNullSink());
+
+ telemetry::MetricsRegistry registry(false, mockApp, j);
+ registry.start("http://localhost:4318/v1/metrics");
+
+ // All recording methods should be no-ops (not crash).
+ registry.recordRpcStarted("server_info");
+ registry.recordRpcFinished("server_info", 1000);
+ registry.recordRpcErrored("ledger", 500);
+ registry.recordJobQueued("ledgerData");
+ registry.recordJobStarted("ledgerData", 200);
+ registry.recordJobFinished("ledgerData", 3000);
+
+ registry.stop();
+
+ pass();
+ }
+
+ void
+ testDestructorStops()
+ {
+ testcase("Destructor calls stop");
+
+ MockServiceRegistry mockApp;
+ beast::Journal j(beast::Journal::getNullSink());
+
+ {
+ // Let the destructor handle cleanup.
+ telemetry::MetricsRegistry registry(false, mockApp, j);
+ registry.start("http://localhost:4318/v1/metrics");
+ }
+
+ // If we get here without crash, the destructor handled stop.
+ pass();
+ }
+
+public:
+ void
+ run() override
+ {
+ testDisabledConstruction();
+ testDisabledStartStop();
+ testDisabledRecording();
+ testDestructorStops();
+ }
+};
+
+BEAST_DEFINE_TESTSUITE(MetricsRegistry, telemetry, ripple);
+
+} // namespace test
+} // namespace xrpl
diff --git a/src/xrpld/app/main/Application.cpp b/src/xrpld/app/main/Application.cpp
index 2a4efa7d10..996381832b 100644
--- a/src/xrpld/app/main/Application.cpp
+++ b/src/xrpld/app/main/Application.cpp
@@ -29,6 +29,7 @@
#include
#include
#include
+#include
#include
#include
@@ -148,6 +149,9 @@ public:
beast::Journal m_journal;
std::unique_ptr perfLog_;
std::unique_ptr telemetry_;
+ /// OTel metrics registry for gap-fill metrics (counters, histograms,
+ /// observable gauges). Created after telemetry_ during setup().
+ std::unique_ptr metricsRegistry_;
Application::MutexType m_masterMutex;
// Required by the SHAMapStore
@@ -633,6 +637,12 @@ public:
return *telemetry_;
}
+ telemetry::MetricsRegistry*
+ getMetricsRegistry() override
+ {
+ return metricsRegistry_.get();
+ }
+
NodeCache&
getTempNodeCache() override
{
@@ -1282,6 +1292,11 @@ ApplicationImp::setup(boost::program_options::variables_map const& cmdline)
if (!config_->section("telemetry").exists("service_instance_id"))
telemetry_->setServiceInstanceId(toBase58(TokenType::NodePublic, nodeIdentity_->first));
+ // Create the OTel MetricsRegistry for gap-fill metrics (counters,
+ // histograms, observable gauges). It is started later in start().
+ metricsRegistry_ = std::make_unique(
+ telemetry_->isEnabled(), *this, logs_->journal("MetricsRegistry"));
+
if (!cluster_->load(config().section(SECTION_CLUSTER_NODES)))
{
JLOG(m_journal.fatal()) << "Invalid entry in cluster configuration.";
@@ -1494,6 +1509,16 @@ ApplicationImp::start(bool withTimers)
ledgerCleaner_->start();
perfLog_->start();
telemetry_->start();
+
+ // Start the metrics pipeline after telemetry; the endpoint uses the
+ // same base URL but the /v1/metrics path.
+ if (metricsRegistry_)
+ {
+ auto const& section = config_->section("telemetry");
+ std::string endpoint = "http://localhost:4318/v1/metrics";
+ set(endpoint, "metrics_endpoint", section);
+ metricsRegistry_->start(endpoint);
+ }
}
void
@@ -1584,6 +1609,10 @@ ApplicationImp::run()
ledgerCleaner_->stop();
m_nodeStore->stop();
perfLog_->stop();
+ // Stop metrics pipeline before telemetry — gauge callbacks reference
+ // Application services that may be shutting down.
+ if (metricsRegistry_)
+ metricsRegistry_->stop();
// Telemetry must stop last among trace-producing components.
// serverHandler_, overlay_, and jobQueue_ are already stopped above,
// so no threads should be calling startSpan() at this point.
diff --git a/src/xrpld/perflog/detail/PerfLogImp.cpp b/src/xrpld/perflog/detail/PerfLogImp.cpp
index d1267953e4..ddbafe8198 100644
--- a/src/xrpld/perflog/detail/PerfLogImp.cpp
+++ b/src/xrpld/perflog/detail/PerfLogImp.cpp
@@ -1,9 +1,11 @@
#include
+#include
#include
#include
#include
#include
+#include
#include
#include
@@ -314,6 +316,10 @@ PerfLogImp::rpcStart(std::string const& method, std::uint64_t const requestId)
}
std::lock_guard lock(counters_.methodsMutex_);
counters_.methods_[requestId] = {counter->first.c_str(), steady_clock::now()};
+
+ // Task 9.4: Record RPC start in OTel metrics pipeline.
+ if (auto* mr = app_.getMetricsRegistry())
+ mr->recordRpcStarted(method);
}
void
@@ -343,13 +349,25 @@ PerfLogImp::rpcEnd(std::string const& method, std::uint64_t const requestId, boo
// LCOV_EXCL_STOP
}
}
- std::lock_guard lock(counter->second.mutex);
- if (finish)
- ++counter->second.value.finished;
- else
- ++counter->second.value.errored;
- counter->second.value.duration +=
- std::chrono::duration_cast(steady_clock::now() - startTime);
+ auto const duration = std::chrono::duration_cast(steady_clock::now() - startTime);
+ {
+ std::lock_guard lock(counter->second.mutex);
+ if (finish)
+ ++counter->second.value.finished;
+ else
+ ++counter->second.value.errored;
+ counter->second.value.duration += duration;
+ }
+
+ // Task 9.4: Record RPC completion/error in OTel metrics pipeline.
+ if (auto* mr = app_.getMetricsRegistry())
+ {
+ auto const durUs = duration.count();
+ if (finish)
+ mr->recordRpcFinished(method, durUs);
+ else
+ mr->recordRpcErrored(method, durUs);
+ }
}
void
@@ -365,6 +383,10 @@ PerfLogImp::jobQueue(JobType const type)
}
std::lock_guard lock(counter->second.mutex);
++counter->second.value.queued;
+
+ // Task 9.5: Record job enqueue in OTel metrics pipeline.
+ if (auto* mr = app_.getMetricsRegistry())
+ mr->recordJobQueued(JobTypes::name(type));
}
void
@@ -391,6 +413,10 @@ PerfLogImp::jobStart(
std::lock_guard lock(counters_.jobsMutex_);
if (instance >= 0 && instance < counters_.jobs_.size())
counters_.jobs_[instance] = {type, startTime};
+
+ // Task 9.5: Record job start in OTel metrics pipeline.
+ if (auto* mr = app_.getMetricsRegistry())
+ mr->recordJobStarted(JobTypes::name(type), dur.count());
}
void
@@ -413,6 +439,10 @@ PerfLogImp::jobFinish(JobType const type, microseconds dur, int instance)
std::lock_guard lock(counters_.jobsMutex_);
if (instance >= 0 && instance < counters_.jobs_.size())
counters_.jobs_[instance] = {jtINVALID, steady_time_point()};
+
+ // Task 9.5: Record job finish in OTel metrics pipeline.
+ if (auto* mr = app_.getMetricsRegistry())
+ mr->recordJobFinished(JobTypes::name(type), dur.count());
}
void
diff --git a/src/xrpld/telemetry/MetricsRegistry.cpp b/src/xrpld/telemetry/MetricsRegistry.cpp
new file mode 100644
index 0000000000..25b9dcd6d3
--- /dev/null
+++ b/src/xrpld/telemetry/MetricsRegistry.cpp
@@ -0,0 +1,473 @@
+/** MetricsRegistry implementation — OpenTelemetry metric instruments for rippled.
+
+ This file contains:
+ - Construction / destruction logic for the OTel MeterProvider pipeline.
+ - Synchronous instrument creation (counters, histograms) for RPC, job
+ queue, and NodeStore I/O metrics.
+ - Observable gauge callback registration for cache hit rates, TxQ state,
+ CountedObject instances, load factors, and NodeStore queue depth.
+ - No-op stubs when XRPL_ENABLE_TELEMETRY is not defined.
+*/
+
+#include
+
+#ifdef XRPL_ENABLE_TELEMETRY
+
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace metric_api = opentelemetry::metrics;
+namespace metric_sdk = opentelemetry::sdk::metrics;
+namespace otlp_http = opentelemetry::exporter::otlp;
+
+#endif // XRPL_ENABLE_TELEMETRY
+
+namespace xrpl {
+namespace telemetry {
+
+MetricsRegistry::MetricsRegistry(bool enabled, ServiceRegistry& app, beast::Journal journal)
+ : enabled_(enabled), app_(app), journal_(journal)
+{
+}
+
+MetricsRegistry::~MetricsRegistry()
+{
+ stop();
+}
+
+void
+MetricsRegistry::start(std::string const& endpoint)
+{
+#ifdef XRPL_ENABLE_TELEMETRY
+ if (!enabled_)
+ return;
+
+ JLOG(journal_.info()) << "MetricsRegistry: starting, endpoint=" << endpoint;
+
+ // Configure OTLP/HTTP metric exporter.
+ otlp_http::OtlpHttpMetricExporterOptions exporterOpts;
+ exporterOpts.url = endpoint;
+ auto exporter = otlp_http::OtlpHttpMetricExporterFactory::Create(exporterOpts);
+
+ // Configure periodic reader with 10-second export interval.
+ metric_sdk::PeriodicExportingMetricReaderOptions readerOpts;
+ readerOpts.export_interval_millis = std::chrono::milliseconds(10000);
+ readerOpts.export_timeout_millis = std::chrono::milliseconds(5000);
+ auto reader =
+ metric_sdk::PeriodicExportingMetricReaderFactory::Create(std::move(exporter), readerOpts);
+
+ // Create MeterProvider and attach the reader.
+ provider_ = std::make_shared();
+ provider_->AddMetricReader(std::move(reader));
+
+ // Get a meter for all rippled instruments.
+ meter_ = provider_->GetMeter("rippled", "1.0.0");
+
+ // --- Create synchronous instruments ---
+
+ // RPC per-method counters and histogram.
+ rpcStartedCounter_ =
+ meter_->CreateUInt64Counter("rpc_method_started_total", "Total RPC method calls started");
+ rpcFinishedCounter_ = meter_->CreateUInt64Counter(
+ "rpc_method_finished_total", "Total RPC method calls completed successfully");
+ rpcErroredCounter_ = meter_->CreateUInt64Counter(
+ "rpc_method_errored_total", "Total RPC method calls that errored");
+ rpcDurationHistogram_ = meter_->CreateDoubleHistogram(
+ "rpc_method_duration_us", "RPC method execution time in microseconds");
+
+ // Job queue per-type counters and histograms.
+ jobQueuedCounter_ = meter_->CreateUInt64Counter("job_queued_total", "Total jobs enqueued");
+ jobStartedCounter_ = meter_->CreateUInt64Counter("job_started_total", "Total jobs started");
+ jobFinishedCounter_ = meter_->CreateUInt64Counter("job_finished_total", "Total jobs completed");
+ jobQueuedDurationHistogram_ = meter_->CreateDoubleHistogram(
+ "job_queued_duration_us", "Time jobs spent waiting in the queue (microseconds)");
+ jobRunningDurationHistogram_ = meter_->CreateDoubleHistogram(
+ "job_running_duration_us", "Job execution time in microseconds");
+
+ // Register all observable (async) gauges.
+ registerAsyncGauges();
+
+ JLOG(journal_.info()) << "MetricsRegistry: started successfully";
+#else
+ (void)endpoint;
+#endif // XRPL_ENABLE_TELEMETRY
+}
+
+void
+MetricsRegistry::stop()
+{
+#ifdef XRPL_ENABLE_TELEMETRY
+ if (!provider_)
+ return;
+
+ JLOG(journal_.info()) << "MetricsRegistry: stopping";
+
+ // Force-flush any pending metrics, then destroy the provider.
+ // This stops the PeriodicExportingMetricReader, which in turn
+ // stops invoking observable gauge callbacks. No explicit
+ // RemoveCallback is needed — the provider destruction handles it.
+ provider_->ForceFlush();
+ provider_.reset();
+
+ JLOG(journal_.info()) << "MetricsRegistry: stopped";
+#endif // XRPL_ENABLE_TELEMETRY
+}
+
+// -----------------------------------------------------------------
+// Synchronous instrument recording — RPC metrics (Task 9.4)
+// -----------------------------------------------------------------
+
+void
+MetricsRegistry::recordRpcStarted(std::string_view method)
+{
+#ifdef XRPL_ENABLE_TELEMETRY
+ if (!enabled_ || !rpcStartedCounter_)
+ return;
+ rpcStartedCounter_->Add(1, {{"method", std::string(method)}});
+#else
+ (void)method;
+#endif
+}
+
+void
+MetricsRegistry::recordRpcFinished(std::string_view method, std::int64_t durationUs)
+{
+#ifdef XRPL_ENABLE_TELEMETRY
+ if (!enabled_ || !rpcFinishedCounter_)
+ return;
+ rpcFinishedCounter_->Add(1, {{"method", std::string(method)}});
+ if (rpcDurationHistogram_)
+ rpcDurationHistogram_->Record(
+ static_cast(durationUs), {{"method", std::string(method)}});
+#else
+ (void)method;
+ (void)durationUs;
+#endif
+}
+
+void
+MetricsRegistry::recordRpcErrored(std::string_view method, std::int64_t durationUs)
+{
+#ifdef XRPL_ENABLE_TELEMETRY
+ if (!enabled_ || !rpcErroredCounter_)
+ return;
+ rpcErroredCounter_->Add(1, {{"method", std::string(method)}});
+ if (rpcDurationHistogram_)
+ rpcDurationHistogram_->Record(
+ static_cast(durationUs), {{"method", std::string(method)}});
+#else
+ (void)method;
+ (void)durationUs;
+#endif
+}
+
+// -----------------------------------------------------------------
+// Synchronous instrument recording — Job Queue metrics (Task 9.5)
+// -----------------------------------------------------------------
+
+void
+MetricsRegistry::recordJobQueued(std::string_view jobType)
+{
+#ifdef XRPL_ENABLE_TELEMETRY
+ if (!enabled_ || !jobQueuedCounter_)
+ return;
+ jobQueuedCounter_->Add(1, {{"job_type", std::string(jobType)}});
+#else
+ (void)jobType;
+#endif
+}
+
+void
+MetricsRegistry::recordJobStarted(std::string_view jobType, std::int64_t queuedDurUs)
+{
+#ifdef XRPL_ENABLE_TELEMETRY
+ if (!enabled_ || !jobStartedCounter_)
+ return;
+ jobStartedCounter_->Add(1, {{"job_type", std::string(jobType)}});
+ if (jobQueuedDurationHistogram_)
+ jobQueuedDurationHistogram_->Record(
+ static_cast(queuedDurUs), {{"job_type", std::string(jobType)}});
+#else
+ (void)jobType;
+ (void)queuedDurUs;
+#endif
+}
+
+void
+MetricsRegistry::recordJobFinished(std::string_view jobType, std::int64_t runningDurUs)
+{
+#ifdef XRPL_ENABLE_TELEMETRY
+ if (!enabled_ || !jobFinishedCounter_)
+ return;
+ jobFinishedCounter_->Add(1, {{"job_type", std::string(jobType)}});
+ if (jobRunningDurationHistogram_)
+ jobRunningDurationHistogram_->Record(
+ static_cast(runningDurUs), {{"job_type", std::string(jobType)}});
+#else
+ (void)jobType;
+ (void)runningDurUs;
+#endif
+}
+
+// -----------------------------------------------------------------
+// Observable gauge callbacks (Tasks 9.1, 9.2, 9.3, 9.6, 9.7)
+// -----------------------------------------------------------------
+
+#ifdef XRPL_ENABLE_TELEMETRY
+
+void
+MetricsRegistry::registerAsyncGauges()
+{
+ // --- Task 9.2: Cache hit rate and size gauges ---
+ cacheHitRateGauge_ =
+ meter_->CreateDoubleObservableGauge("cache_metrics", "Cache hit rates and sizes");
+ cacheHitRateGauge_->AddCallback(
+ [](opentelemetry::metrics::ObserverResult result, void* state) {
+ auto* self = static_cast(state);
+ auto& app = self->app_;
+
+ try
+ {
+ // SLE cache hit rate (0.0 - 1.0).
+ auto sleRate = app.cachedSLEs().rate();
+ opentelemetry::nostd::get>>(result)
+ ->Observe(sleRate, {{"metric", "SLE_hit_rate"}});
+
+ // Ledger cache hit rate.
+ auto ledgerRate = app.getLedgerMaster().getCacheHitRate();
+ opentelemetry::nostd::get>>(result)
+ ->Observe(ledgerRate, {{"metric", "ledger_hit_rate"}});
+
+ // AcceptedLedger cache hit rate.
+ auto alRate = app.getAcceptedLedgerCache().getHitRate();
+ opentelemetry::nostd::get>>(result)
+ ->Observe(alRate, {{"metric", "AL_hit_rate"}});
+
+ // TreeNode cache size.
+ auto tnCacheSize = app.getNodeFamily().getTreeNodeCache()->getCacheSize();
+ opentelemetry::nostd::get>>(result)
+ ->Observe(
+ static_cast(tnCacheSize), {{"metric", "treenode_cache_size"}});
+
+ // TreeNode track size.
+ auto tnTrackSize = app.getNodeFamily().getTreeNodeCache()->getTrackSize();
+ opentelemetry::nostd::get>>(result)
+ ->Observe(
+ static_cast(tnTrackSize), {{"metric", "treenode_track_size"}});
+
+ // FullBelow cache size.
+ auto fbSize = app.getNodeFamily().getFullBelowCache()->size();
+ opentelemetry::nostd::get>>(result)
+ ->Observe(static_cast(fbSize), {{"metric", "fullbelow_size"}});
+ }
+ catch (...)
+ {
+ // Silently skip if services are not yet ready.
+ }
+ },
+ this);
+
+ // --- Task 9.3: TxQ metrics gauges ---
+ txqGauge_ = meter_->CreateDoubleObservableGauge("txq_metrics", "Transaction queue metrics");
+ txqGauge_->AddCallback(
+ [](opentelemetry::metrics::ObserverResult result, void* state) {
+ auto* self = static_cast(state);
+ auto& app = self->app_;
+
+ try
+ {
+ auto const metrics = app.getTxQ().getMetrics(*app.openLedger().current());
+
+ auto observe = [&](char const* name, double value) {
+ opentelemetry::nostd::get>>(result)
+ ->Observe(value, {{"metric", name}});
+ };
+
+ observe("txq_count", static_cast(metrics.txCount));
+ observe(
+ "txq_max_size",
+ metrics.txQMaxSize ? static_cast(*metrics.txQMaxSize) : 0.0);
+ observe("txq_in_ledger", static_cast(metrics.txInLedger));
+ observe("txq_per_ledger", static_cast(metrics.txPerLedger));
+ observe(
+ "txq_reference_fee_level",
+ static_cast(metrics.referenceFeeLevel.fee()));
+ observe(
+ "txq_min_processing_fee_level",
+ static_cast(metrics.minProcessingFeeLevel.fee()));
+ observe("txq_med_fee_level", static_cast(metrics.medFeeLevel.fee()));
+ observe(
+ "txq_open_ledger_fee_level",
+ static_cast(metrics.openLedgerFeeLevel.fee()));
+ }
+ catch (...)
+ {
+ // Silently skip if TxQ or OpenLedger are not yet ready.
+ }
+ },
+ this);
+
+ // --- Task 9.6: Counted object instance gauges ---
+ objectCountGauge_ = meter_->CreateInt64ObservableGauge(
+ "object_count", "Live instance counts for key internal object types");
+ objectCountGauge_->AddCallback(
+ [](opentelemetry::metrics::ObserverResult result, void* /* state */) {
+ try
+ {
+ // Iterate through all CountedObject types via the linked
+ // list in CountedObjects. We report all types with count
+ // > 0, filtering to the key types of interest.
+ auto counts = CountedObjects::getInstance().getCounts(0);
+ for (auto const& [name, count] : counts)
+ {
+ opentelemetry::nostd::get>>(result)
+ ->Observe(static_cast(count), {{"type", name}});
+ }
+ }
+ catch (...)
+ {
+ // Silently skip on error.
+ }
+ },
+ this);
+
+ // --- Task 9.7: Load factor breakdown gauges ---
+ loadFactorGauge_ =
+ meter_->CreateDoubleObservableGauge("load_factor_metrics", "Fee load factor breakdown");
+ loadFactorGauge_->AddCallback(
+ [](opentelemetry::metrics::ObserverResult result, void* state) {
+ auto* self = static_cast(state);
+ auto& app = self->app_;
+
+ try
+ {
+ auto& feeTrack = app.getFeeTrack();
+ auto const loadBase = static_cast(feeTrack.getLoadBase());
+
+ auto observe = [&](char const* name, double value) {
+ opentelemetry::nostd::get>>(result)
+ ->Observe(value, {{"metric", name}});
+ };
+
+ // Combined load factor (server component).
+ observe(
+ "load_factor_server", static_cast(feeTrack.getLoadFactor()) / loadBase);
+
+ // Individual factor components.
+ observe(
+ "load_factor_local", static_cast(feeTrack.getLocalFee()) / loadBase);
+ observe("load_factor_net", static_cast(feeTrack.getRemoteFee()) / loadBase);
+ observe(
+ "load_factor_cluster",
+ static_cast(feeTrack.getClusterFee()) / loadBase);
+
+ // Fee escalation factors from TxQ.
+ auto const metrics = app.getTxQ().getMetrics(*app.openLedger().current());
+ auto refLevel = static_cast(metrics.referenceFeeLevel.fee());
+ if (refLevel > 0)
+ {
+ observe(
+ "load_factor_fee_escalation",
+ static_cast(metrics.openLedgerFeeLevel.fee()) / refLevel);
+ observe(
+ "load_factor_fee_queue",
+ static_cast(metrics.minProcessingFeeLevel.fee()) / refLevel);
+ }
+
+ // Combined load factor (max of server and fee escalation).
+ auto const loadFactorServer = feeTrack.getLoadFactor();
+ auto const loadBaseServer = feeTrack.getLoadBase();
+ double combined = static_cast(loadFactorServer) / loadBase;
+ if (refLevel > 0)
+ {
+ double feeEscalation = static_cast(metrics.openLedgerFeeLevel.fee()) *
+ loadBaseServer / refLevel;
+ if (feeEscalation > static_cast(loadFactorServer))
+ {
+ combined = feeEscalation / loadBase;
+ }
+ }
+ observe("load_factor", combined);
+ }
+ catch (...)
+ {
+ // Silently skip if services are not yet ready.
+ }
+ },
+ this);
+
+ // --- Task 9.1: NodeStore I/O gauges ---
+ // The cumulative counters (reads, writes, bytes) are also exposed here
+ // as observable gauges. This avoids adding an xrpld dependency into the
+ // libxrpl nodestore code — the MetricsRegistry reads the existing atomic
+ // counters from Database via its public accessors.
+ nodeStoreGauge_ = meter_->CreateInt64ObservableGauge(
+ "nodestore_state", "NodeStore I/O counters, queue depth, and write load");
+ nodeStoreGauge_->AddCallback(
+ [](opentelemetry::metrics::ObserverResult result, void* state) {
+ auto* self = static_cast(state);
+ auto& app = self->app_;
+
+ try
+ {
+ auto& db = app.getNodeStore();
+
+ auto observe = [&](char const* name, int64_t value) {
+ opentelemetry::nostd::get>>(result)
+ ->Observe(value, {{"metric", name}});
+ };
+
+ // Cumulative counters (monotonically increasing).
+ observe("node_reads_total", static_cast(db.getFetchTotalCount()));
+ observe("node_reads_hit", static_cast(db.getFetchHitCount()));
+ observe("node_writes", static_cast(db.getStoreCount()));
+ observe("node_written_bytes", static_cast(db.getStoreSize()));
+ observe("node_read_bytes", static_cast(db.getFetchSize()));
+
+ // Write load score (instantaneous).
+ observe("write_load", static_cast(db.getWriteLoad()));
+
+ // Read queue depth (instantaneous).
+ Json::Value obj(Json::objectValue);
+ db.getCountsJson(obj);
+ if (obj.isMember("read_queue"))
+ {
+ observe("read_queue", static_cast(obj["read_queue"].asUInt()));
+ }
+ }
+ catch (...)
+ {
+ // Silently skip on error.
+ }
+ },
+ this);
+}
+
+#endif // XRPL_ENABLE_TELEMETRY
+
+} // namespace telemetry
+} // namespace xrpl
diff --git a/src/xrpld/telemetry/MetricsRegistry.h b/src/xrpld/telemetry/MetricsRegistry.h
new file mode 100644
index 0000000000..be977901d2
--- /dev/null
+++ b/src/xrpld/telemetry/MetricsRegistry.h
@@ -0,0 +1,280 @@
+#pragma once
+
+/** Central OTel Metrics Registry for rippled.
+
+ Owns all OpenTelemetry metric instruments (counters, histograms,
+ observable gauges) that are NOT already covered by the beast::insight
+ StatsD pipeline. The instruments are created once at startup and polled
+ by the OTel PeriodicExportingMetricReader at a configurable interval
+ (default 10 s).
+
+ When XRPL_ENABLE_TELEMETRY is **not** defined, this class compiles to a
+ lightweight no-op: every public method is an empty inline.
+
+ Dependency / ownership diagram (ASCII):
+
+ Application
+ |
+ +-- MetricsRegistry (unique_ptr, created in setup(), started/stopped with telemetry)
+ |
+ +-- OTel MeterProvider (owns reader + exporter)
+ | |
+ | +-- PeriodicExportingMetricReader
+ | +-- OtlpHttpMetricExporter
+ |
+ +-- Counters / Histograms (synchronous instruments)
+ | +-- rpc_method_started_total
+ | +-- rpc_method_finished_total
+ | +-- rpc_method_errored_total
+ | +-- rpc_method_duration_us (Histogram)
+ | +-- job_queued_total
+ | +-- job_started_total
+ | +-- job_finished_total
+ | +-- job_queued_duration_us (Histogram)
+ | +-- job_running_duration_us (Histogram)
+ |
+ +-- Observable Gauges (async callbacks, polled by reader)
+ +-- Cache hit rates (SLE, ledger, AL)
+ +-- TreeNode / FullBelow sizes
+ +-- TxQ metrics
+ +-- CountedObject counts
+ +-- Load factor breakdown
+ +-- NodeStore I/O gauges
+
+ Control-flow for async gauges:
+
+ PeriodicExportingMetricReader (background thread, 10 s tick)
+ |
+ v
+ OTel SDK invokes registered ObservableGauge callbacks
+ |
+ v
+ Each callback reads current value from Application services
+ (e.g. app.getTxQ().getMetrics(), app.getFeeTrack().getLoadFactor())
+ |
+ v
+ Result set is exported via OTLP/HTTP to the collector
+
+ Control-flow for synchronous instruments:
+
+ PerfLogImp::rpcStart/rpcEnd/jobQueue/jobStart/jobFinish
+ |
+ v
+ MetricsRegistry::recordRpc*(method, ...) / recordJob*(type, ...)
+ |
+ v
+ OTel Counter::Add() or Histogram::Record()
+ |
+ v
+ Periodically flushed by the MetricReader
+
+ Example usage:
+
+ @code
+ // In Application::setup(), after telemetry_ is created:
+ metricsRegistry_ = std::make_unique(
+ telemetry_->isEnabled(), app, journal);
+ metricsRegistry_->start(setup.exporterEndpoint);
+
+ // In PerfLogImp::rpcStart():
+ if (auto* mr = app_.getMetricsRegistry())
+ mr->recordRpcStarted("server_info");
+
+ // In PerfLogImp::rpcEnd():
+ if (auto* mr = app_.getMetricsRegistry())
+ {
+ mr->recordRpcFinished("server_info", durationUs);
+ // or: mr->recordRpcErrored("server_info", durationUs);
+ }
+
+ // In PerfLogImp::jobQueue():
+ if (auto* mr = app_.getMetricsRegistry())
+ mr->recordJobQueued("ledgerData");
+
+ // Shutdown:
+ metricsRegistry_->stop();
+ @endcode
+
+ Caveats:
+ - The MetricsRegistry must be created AFTER the Telemetry object because
+ it reads isEnabled() to decide whether to initialize the OTel SDK.
+ - Observable gauge callbacks capture a reference to the Application; the
+ Application must outlive the MetricsRegistry (guaranteed because
+ MetricsRegistry is stopped before Application teardown).
+ - If a new CountedObject type is added, it will NOT appear automatically
+ in the object_count gauge; the callback iterates a fixed list.
+ - Adding a new synchronous instrument requires updating both the header
+ and the .cpp, then calling the new record*() method from the
+ instrumentation site.
+*/
+
+#include
+
+#include
+#include
+#include
+#include
+
+#ifdef XRPL_ENABLE_TELEMETRY
+#include
+#include
+#include
+#include
+#endif
+
+namespace xrpl {
+
+class ServiceRegistry;
+
+namespace telemetry {
+
+class MetricsRegistry
+{
+public:
+ /** Construct a MetricsRegistry.
+
+ @param enabled Whether OTel metric export is active. When false,
+ all methods become no-ops.
+ @param app Reference to the ServiceRegistry (Application) for
+ reading current metric values in gauge callbacks.
+ @param journal Journal for log output.
+ */
+ MetricsRegistry(bool enabled, ServiceRegistry& app, beast::Journal journal);
+
+ ~MetricsRegistry();
+
+ /// Non-copyable, non-movable.
+ MetricsRegistry(MetricsRegistry const&) = delete;
+ MetricsRegistry&
+ operator=(MetricsRegistry const&) = delete;
+
+ /** Initialize the OTel metrics pipeline and register all instruments.
+
+ @param endpoint OTLP/HTTP endpoint URL for metric export
+ (e.g. "http://localhost:4318/v1/metrics").
+ */
+ void
+ start(std::string const& endpoint);
+
+ /** Flush pending metrics and shut down the pipeline. */
+ void
+ stop();
+
+ /** @return true if the registry is actively exporting metrics. */
+ bool
+ isEnabled() const noexcept
+ {
+ return enabled_;
+ }
+
+ // -----------------------------------------------------------------
+ // Synchronous instrument recording (called from PerfLog hot paths)
+ // -----------------------------------------------------------------
+
+ /** Record an RPC method call start.
+ @param method The RPC method name (e.g. "server_info").
+ */
+ void
+ recordRpcStarted(std::string_view method);
+
+ /** Record an RPC method call completion.
+ @param method The RPC method name.
+ @param durationUs Execution time in microseconds.
+ */
+ void
+ recordRpcFinished(std::string_view method, std::int64_t durationUs);
+
+ /** Record an RPC method call error.
+ @param method The RPC method name.
+ @param durationUs Execution time in microseconds.
+ */
+ void
+ recordRpcErrored(std::string_view method, std::int64_t durationUs);
+
+ /** Record a job enqueued event.
+ @param jobType The job type name (e.g. "ledgerData").
+ */
+ void
+ recordJobQueued(std::string_view jobType);
+
+ /** Record a job start event.
+ @param jobType The job type name.
+ @param queuedDurUs Time the job spent waiting in the queue (us).
+ */
+ void
+ recordJobStarted(std::string_view jobType, std::int64_t queuedDurUs);
+
+ /** Record a job finish event.
+ @param jobType The job type name.
+ @param runningDurUs Execution time in microseconds.
+ */
+ void
+ recordJobFinished(std::string_view jobType, std::int64_t runningDurUs);
+
+private:
+ /// Master enable flag; when false all methods are no-ops.
+ bool const enabled_;
+
+ /// Reference to Application services for gauge callbacks.
+ ServiceRegistry& app_;
+
+ /// Journal for logging.
+ beast::Journal const journal_;
+
+#ifdef XRPL_ENABLE_TELEMETRY
+ /// The SDK MeterProvider that owns the export pipeline.
+ std::shared_ptr provider_;
+
+ /// The Meter used to create all instruments.
+ opentelemetry::nostd::shared_ptr meter_;
+
+ // --- Synchronous instruments (RPC) ---
+ /// Counter: rpc_method_started_total{method=""}
+ opentelemetry::nostd::unique_ptr> rpcStartedCounter_;
+ /// Counter: rpc_method_finished_total{method=""}
+ opentelemetry::nostd::unique_ptr> rpcFinishedCounter_;
+ /// Counter: rpc_method_errored_total{method=""}
+ opentelemetry::nostd::unique_ptr> rpcErroredCounter_;
+ /// Histogram: rpc_method_duration_us{method=""}
+ opentelemetry::nostd::unique_ptr>
+ rpcDurationHistogram_;
+
+ // --- Synchronous instruments (Job Queue) ---
+ /// Counter: job_queued_total{job_type=""}
+ opentelemetry::nostd::unique_ptr> jobQueuedCounter_;
+ /// Counter: job_started_total{job_type=""}
+ opentelemetry::nostd::unique_ptr> jobStartedCounter_;
+ /// Counter: job_finished_total{job_type=""}
+ opentelemetry::nostd::unique_ptr> jobFinishedCounter_;
+ /// Histogram: job_queued_duration_us{job_type=""}
+ opentelemetry::nostd::unique_ptr>
+ jobQueuedDurationHistogram_;
+ /// Histogram: job_running_duration_us{job_type=""}
+ opentelemetry::nostd::unique_ptr>
+ jobRunningDurationHistogram_;
+
+ // --- Observable gauges (registered via callbacks) ---
+ // Handles are stored so we can remove callbacks on shutdown.
+ /// Observable gauges for cache hit rates and sizes.
+ opentelemetry::nostd::shared_ptr
+ cacheHitRateGauge_;
+ /// Observable gauges for TxQ metrics.
+ opentelemetry::nostd::shared_ptr txqGauge_;
+ /// Observable gauges for counted object instances.
+ opentelemetry::nostd::shared_ptr
+ objectCountGauge_;
+ /// Observable gauges for load factor breakdown.
+ opentelemetry::nostd::shared_ptr loadFactorGauge_;
+ /// Observable gauges for NodeStore write_load and read_queue.
+ opentelemetry::nostd::shared_ptr nodeStoreGauge_;
+
+ /** Register all observable gauge callbacks with the OTel SDK.
+ Called once during start().
+ */
+ void
+ registerAsyncGauges();
+#endif // XRPL_ENABLE_TELEMETRY
+};
+
+} // namespace telemetry
+} // namespace xrpl