mirror of
https://github.com/XRPLF/rippled.git
synced 2026-04-29 15:37:57 +00:00
Phase 9: Internal Metric Instrumentation Gap Fill (Tasks 9.1-9.10)
Implement ~50 OTel metrics covering NodeStore I/O, cache hit rates, TxQ state, PerfLog per-RPC/per-job counters, CountedObject instances, and load factor breakdown via MetricsRegistry. Core implementation: - MetricsRegistry class with synchronous instruments (Counter, Histogram) for RPC and Job metrics, and ObservableGauge callbacks for cache, TxQ, CountedObject, LoadFactor, and NodeStore state polling. - ServiceRegistry extended with getMetricsRegistry() virtual method. - Application wires MetricsRegistry lifecycle (create/start/stop). - PerfLogImp instrumented to emit OTel metrics on RPC and Job events. Dashboards & observability: - 3 new Grafana dashboards: RPC Performance, Job Queue, Fee Market/TxQ. - Extended statsd-node-health dashboard with NodeStore, Cache, and CountedObject panels. - 10 alerting rules added to telemetry-runbook.md. - Integration test extended with 12 OTel metric validation checks. Documentation: - 09-data-collection-reference.md updated with Phase 9 metric tables. - Unit tests for MetricsRegistry disabled-path (no-op) behavior. All OTel SDK code guarded with #ifdef XRPL_ENABLE_TELEMETRY. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -11,6 +11,7 @@ graph LR
|
||||
subgraph rippledNode["rippled Node"]
|
||||
A["Trace Macros<br/>XRPL_TRACE_SPAN<br/>(OTLP/HTTP exporter)"]
|
||||
B["beast::insight<br/>OTel native metrics<br/>(OTLP/HTTP exporter)"]
|
||||
C["MetricsRegistry<br/>OTel SDK metrics<br/>(OTLP/HTTP exporter)"]
|
||||
end
|
||||
|
||||
subgraph collector["OTel Collector :4317 / :4318"]
|
||||
@@ -33,11 +34,12 @@ graph LR
|
||||
end
|
||||
|
||||
subgraph viz["Visualization"]
|
||||
F["Grafana :3000<br/>10 dashboards"]
|
||||
F["Grafana :3000<br/>13 dashboards"]
|
||||
end
|
||||
|
||||
A -->|"OTLP/HTTP :4318<br/>(traces + attributes)"| R1
|
||||
B -->|"OTLP/HTTP :4318<br/>(gauges, counters, histograms)"| R1
|
||||
C -->|"OTLP/HTTP :4318<br/>(counters, histograms,<br/>observable gauges)"| R1
|
||||
|
||||
BP -->|"OTLP/gRPC :4317"| D
|
||||
BP -->|"OTLP/gRPC"| T
|
||||
@@ -751,6 +753,126 @@ Phase 11 builds a custom OTel Collector receiver (Go) that polls rippled's admin
|
||||
| `xrpl_orderbook_ask_depth` | Gauge | `pair="<base/quote>"` | Total ask volume |
|
||||
| `xrpl_orderbook_spread` | Gauge | `pair="<base/quote>"` | Best bid-ask spread |
|
||||
|
||||
### Phase 9: OTel SDK-Exported Metrics (MetricsRegistry)
|
||||
|
||||
Phase 9 introduces the `MetricsRegistry` class (`src/xrpld/telemetry/MetricsRegistry.h/.cpp`)
|
||||
which registers metrics directly with the OpenTelemetry Metrics SDK. These are exported
|
||||
via OTLP/HTTP to the OTel Collector and scraped by Prometheus.
|
||||
|
||||
#### NodeStore I/O (Observable Gauge — `nodestore_state`)
|
||||
|
||||
| Prometheus Metric | Type | Labels | Description |
|
||||
| ------------------------------------------------------ | ----- | -------- | ------------------------------------ |
|
||||
| `rippled_nodestore_state{metric="node_reads_total"}` | Gauge | `metric` | Cumulative NodeStore read operations |
|
||||
| `rippled_nodestore_state{metric="node_reads_hit"}` | Gauge | `metric` | Reads served from cache |
|
||||
| `rippled_nodestore_state{metric="node_writes"}` | Gauge | `metric` | Cumulative write operations |
|
||||
| `rippled_nodestore_state{metric="node_written_bytes"}` | Gauge | `metric` | Cumulative bytes written |
|
||||
| `rippled_nodestore_state{metric="node_read_bytes"}` | Gauge | `metric` | Cumulative bytes read |
|
||||
| `rippled_nodestore_state{metric="write_load"}` | Gauge | `metric` | Current write load score |
|
||||
| `rippled_nodestore_state{metric="read_queue"}` | Gauge | `metric` | Items in read prefetch queue |
|
||||
|
||||
#### Cache Hit Rates & Sizes (Observable Gauge — `cache_metrics`)
|
||||
|
||||
| Prometheus Metric | Type | Labels | Description |
|
||||
| ----------------------------------------------------- | ----- | -------- | ----------------------------- |
|
||||
| `rippled_cache_metrics{metric="SLE_hit_rate"}` | Gauge | `metric` | SLE cache hit rate (0.0-1.0) |
|
||||
| `rippled_cache_metrics{metric="ledger_hit_rate"}` | Gauge | `metric` | Ledger cache hit rate |
|
||||
| `rippled_cache_metrics{metric="AL_hit_rate"}` | Gauge | `metric` | AcceptedLedger cache hit rate |
|
||||
| `rippled_cache_metrics{metric="treenode_cache_size"}` | Gauge | `metric` | SHAMap TreeNode cache entries |
|
||||
| `rippled_cache_metrics{metric="treenode_track_size"}` | Gauge | `metric` | Tracked tree nodes |
|
||||
| `rippled_cache_metrics{metric="fullbelow_size"}` | Gauge | `metric` | FullBelow cache entries |
|
||||
|
||||
#### Transaction Queue (Observable Gauge — `txq_metrics`)
|
||||
|
||||
| Prometheus Metric | Type | Labels | Description |
|
||||
| ------------------------------------------------------------ | ----- | -------- | -------------------------------- |
|
||||
| `rippled_txq_metrics{metric="txq_count"}` | Gauge | `metric` | Transactions currently in queue |
|
||||
| `rippled_txq_metrics{metric="txq_max_size"}` | Gauge | `metric` | Maximum queue capacity |
|
||||
| `rippled_txq_metrics{metric="txq_in_ledger"}` | Gauge | `metric` | Transactions in open ledger |
|
||||
| `rippled_txq_metrics{metric="txq_per_ledger"}` | Gauge | `metric` | Expected transactions per ledger |
|
||||
| `rippled_txq_metrics{metric="txq_reference_fee_level"}` | Gauge | `metric` | Reference fee level |
|
||||
| `rippled_txq_metrics{metric="txq_min_processing_fee_level"}` | Gauge | `metric` | Minimum fee to get processed |
|
||||
| `rippled_txq_metrics{metric="txq_med_fee_level"}` | Gauge | `metric` | Median fee level in queue |
|
||||
| `rippled_txq_metrics{metric="txq_open_ledger_fee_level"}` | Gauge | `metric` | Open ledger fee escalation level |
|
||||
|
||||
#### Per-RPC Method Metrics (Synchronous Counters/Histogram)
|
||||
|
||||
| Prometheus Metric | Type | Labels | Description |
|
||||
| ----------------------------------- | --------- | ----------------- | -------------------------------- |
|
||||
| `rippled_rpc_method_started_total` | Counter | `method="<name>"` | RPC calls started |
|
||||
| `rippled_rpc_method_finished_total` | Counter | `method="<name>"` | RPC calls completed successfully |
|
||||
| `rippled_rpc_method_errored_total` | Counter | `method="<name>"` | RPC calls that errored |
|
||||
| `rippled_rpc_method_duration_us` | Histogram | `method="<name>"` | Execution time distribution (us) |
|
||||
|
||||
#### Per-Job-Type Metrics (Synchronous Counters/Histogram)
|
||||
|
||||
| Prometheus Metric | Type | Labels | Description |
|
||||
| --------------------------------- | --------- | ------------------- | --------------------------------- |
|
||||
| `rippled_job_queued_total` | Counter | `job_type="<name>"` | Jobs enqueued |
|
||||
| `rippled_job_started_total` | Counter | `job_type="<name>"` | Jobs started |
|
||||
| `rippled_job_finished_total` | Counter | `job_type="<name>"` | Jobs completed |
|
||||
| `rippled_job_queued_duration_us` | Histogram | `job_type="<name>"` | Queue wait time distribution (us) |
|
||||
| `rippled_job_running_duration_us` | Histogram | `job_type="<name>"` | Execution time distribution (us) |
|
||||
|
||||
#### Counted Object Instances (Observable Gauge — `object_count`)
|
||||
|
||||
| Prometheus Metric | Type | Labels | Description |
|
||||
| ---------------------------------------------- | ----- | --------------- | ------------------------------ |
|
||||
| `rippled_object_count{type="Transaction"}` | Gauge | `type="<name>"` | Live Transaction objects |
|
||||
| `rippled_object_count{type="Ledger"}` | Gauge | `type="<name>"` | Live Ledger objects |
|
||||
| `rippled_object_count{type="NodeObject"}` | Gauge | `type="<name>"` | Live NodeObject instances |
|
||||
| `rippled_object_count{type="STTx"}` | Gauge | `type="<name>"` | Serialized transaction objects |
|
||||
| `rippled_object_count{type="STLedgerEntry"}` | Gauge | `type="<name>"` | Serialized ledger entries |
|
||||
| `rippled_object_count{type="InboundLedger"}` | Gauge | `type="<name>"` | Ledgers being fetched |
|
||||
| `rippled_object_count{type="Pathfinder"}` | Gauge | `type="<name>"` | Active pathfinding operations |
|
||||
| `rippled_object_count{type="PathRequest"}` | Gauge | `type="<name>"` | Active path requests |
|
||||
| `rippled_object_count{type="HashRouterEntry"}` | Gauge | `type="<name>"` | Hash router entries |
|
||||
|
||||
#### Load Factor Breakdown (Observable Gauge — `load_factor_metrics`)
|
||||
|
||||
| Prometheus Metric | Type | Labels | Description |
|
||||
| ------------------------------------------------------------------ | ----- | -------- | --------------------------------------- |
|
||||
| `rippled_load_factor_metrics{metric="load_factor"}` | Gauge | `metric` | Combined transaction cost multiplier |
|
||||
| `rippled_load_factor_metrics{metric="load_factor_server"}` | Gauge | `metric` | Server + cluster + network contribution |
|
||||
| `rippled_load_factor_metrics{metric="load_factor_local"}` | Gauge | `metric` | Local server load only |
|
||||
| `rippled_load_factor_metrics{metric="load_factor_net"}` | Gauge | `metric` | Network-wide load estimate |
|
||||
| `rippled_load_factor_metrics{metric="load_factor_cluster"}` | Gauge | `metric` | Cluster peer load |
|
||||
| `rippled_load_factor_metrics{metric="load_factor_fee_escalation"}` | Gauge | `metric` | Open ledger fee escalation |
|
||||
| `rippled_load_factor_metrics{metric="load_factor_fee_queue"}` | Gauge | `metric` | Queue entry fee level |
|
||||
|
||||
#### Prometheus Query Examples (Phase 9)
|
||||
|
||||
```promql
|
||||
# NodeStore cache hit ratio
|
||||
rippled_nodestore_state{metric="node_reads_hit"} / rippled_nodestore_state{metric="node_reads_total"}
|
||||
|
||||
# RPC error rate for server_info
|
||||
rate(rippled_rpc_method_errored_total{method="server_info"}[5m])
|
||||
|
||||
# Job queue wait time p95
|
||||
histogram_quantile(0.95, sum by (le) (rate(rippled_job_queued_duration_us_bucket[5m])))
|
||||
|
||||
# TxQ utilization percentage
|
||||
rippled_txq_metrics{metric="txq_count"} / rippled_txq_metrics{metric="txq_max_size"}
|
||||
|
||||
# High load factor alert candidate
|
||||
rippled_load_factor_metrics{metric="load_factor"} > 5
|
||||
```
|
||||
|
||||
### New Grafana Dashboards (Phase 9)
|
||||
|
||||
| Dashboard | UID | Data Source | Key Panels |
|
||||
| ---------------------- | -------------------- | ----------- | --------------------------------------------------------- |
|
||||
| Fee Market & TxQ | `rippled-fee-market` | Prometheus | TxQ depth/capacity, fee levels, load factor breakdown |
|
||||
| Job Queue Analysis | `rippled-job-queue` | Prometheus | Per-job rates, queue wait times, execution times |
|
||||
| RPC Performance (OTel) | `rippled-rpc-perf` | Prometheus | Per-method call rates, error rates, latency distributions |
|
||||
|
||||
### Updated Grafana Dashboards (Phase 9)
|
||||
|
||||
| Dashboard | UID | New Panels Added |
|
||||
| -------------------- | ---------------------------- | ------------------------------------------------------ |
|
||||
| Node Health (StatsD) | `rippled-statsd-node-health` | NodeStore I/O, cache hit rates, object instance counts |
|
||||
|
||||
### New Grafana Dashboards (Phase 11)
|
||||
|
||||
| Dashboard | UID | Data Source | Key Panels |
|
||||
|
||||
317
docker/telemetry/grafana/dashboards/rippled-fee-market.json
Normal file
317
docker/telemetry/grafana/dashboards/rippled-fee-market.json
Normal file
@@ -0,0 +1,317 @@
|
||||
{
|
||||
"annotations": {
|
||||
"list": []
|
||||
},
|
||||
"description": "Fee market dynamics: TxQ depth/capacity, fee escalation levels, and load factor breakdown. Sourced from OTel MetricsRegistry observable gauges (Phase 9).",
|
||||
"editable": true,
|
||||
"fiscalYearStartMonth": 0,
|
||||
"graphTooltip": 1,
|
||||
"id": null,
|
||||
"links": [],
|
||||
"panels": [
|
||||
{
|
||||
"title": "Transaction Queue Depth",
|
||||
"description": "Current number of transactions waiting in the queue vs. maximum capacity. Sourced from MetricsRegistry txq_metrics observable gauge with metric=txq_count and metric=txq_max_size.",
|
||||
"type": "timeseries",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 0
|
||||
},
|
||||
"options": {
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "rippled_txq_metrics{metric=\"txq_count\"}",
|
||||
"legendFormat": "Queue Depth"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "rippled_txq_metrics{metric=\"txq_max_size\"}",
|
||||
"legendFormat": "Max Capacity"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short",
|
||||
"custom": {
|
||||
"drawStyle": "line",
|
||||
"lineWidth": 2,
|
||||
"fillOpacity": 10
|
||||
},
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Transactions Per Ledger",
|
||||
"description": "Transactions in the current open ledger vs. expected per-ledger count. Sourced from txq_metrics with metric=txq_in_ledger and metric=txq_per_ledger.",
|
||||
"type": "timeseries",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 0
|
||||
},
|
||||
"options": {
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "rippled_txq_metrics{metric=\"txq_in_ledger\"}",
|
||||
"legendFormat": "In Ledger"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "rippled_txq_metrics{metric=\"txq_per_ledger\"}",
|
||||
"legendFormat": "Expected Per Ledger"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short",
|
||||
"custom": {
|
||||
"drawStyle": "line",
|
||||
"lineWidth": 2,
|
||||
"fillOpacity": 10
|
||||
},
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Fee Escalation Levels",
|
||||
"description": "Fee levels that control transaction queue admission. Reference fee level is the baseline; open ledger fee level triggers escalation. Sourced from txq_metrics observable gauge.",
|
||||
"type": "timeseries",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 8
|
||||
},
|
||||
"options": {
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "rippled_txq_metrics{metric=\"txq_reference_fee_level\"}",
|
||||
"legendFormat": "Reference Fee Level"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "rippled_txq_metrics{metric=\"txq_min_processing_fee_level\"}",
|
||||
"legendFormat": "Min Processing Fee Level"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "rippled_txq_metrics{metric=\"txq_med_fee_level\"}",
|
||||
"legendFormat": "Median Fee Level"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "rippled_txq_metrics{metric=\"txq_open_ledger_fee_level\"}",
|
||||
"legendFormat": "Open Ledger Fee Level"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short",
|
||||
"custom": {
|
||||
"drawStyle": "line",
|
||||
"lineWidth": 2,
|
||||
"fillOpacity": 5,
|
||||
"scaleDistribution": {
|
||||
"type": "log",
|
||||
"log": 2
|
||||
}
|
||||
},
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Load Factor Breakdown",
|
||||
"description": "Decomposed load factor components: server (max of local, net, cluster), fee escalation, fee queue, and combined. Values are unitless multipliers where 1.0 = no load. Sourced from load_factor_metrics observable gauge.",
|
||||
"type": "timeseries",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 16
|
||||
},
|
||||
"options": {
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "rippled_load_factor_metrics{metric=\"load_factor\"}",
|
||||
"legendFormat": "Combined Load Factor"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "rippled_load_factor_metrics{metric=\"load_factor_server\"}",
|
||||
"legendFormat": "Server"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "rippled_load_factor_metrics{metric=\"load_factor_fee_escalation\"}",
|
||||
"legendFormat": "Fee Escalation"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "rippled_load_factor_metrics{metric=\"load_factor_fee_queue\"}",
|
||||
"legendFormat": "Fee Queue"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short",
|
||||
"custom": {
|
||||
"drawStyle": "line",
|
||||
"lineWidth": 2,
|
||||
"fillOpacity": 5
|
||||
},
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"thresholds": {
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
"value": 2
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 10
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Load Factor Components",
|
||||
"description": "Individual load factor contributors: local server load, network load, and cluster load. Only differ from 1.0 under load conditions. Sourced from load_factor_metrics observable gauge.",
|
||||
"type": "timeseries",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 16
|
||||
},
|
||||
"options": {
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "rippled_load_factor_metrics{metric=\"load_factor_local\"}",
|
||||
"legendFormat": "Local"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "rippled_load_factor_metrics{metric=\"load_factor_net\"}",
|
||||
"legendFormat": "Network"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "rippled_load_factor_metrics{metric=\"load_factor_cluster\"}",
|
||||
"legendFormat": "Cluster"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short",
|
||||
"custom": {
|
||||
"drawStyle": "line",
|
||||
"lineWidth": 2,
|
||||
"fillOpacity": 5
|
||||
},
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
}
|
||||
}
|
||||
],
|
||||
"schemaVersion": 39,
|
||||
"tags": ["rippled", "otel", "fee-market"],
|
||||
"templating": {
|
||||
"list": []
|
||||
},
|
||||
"time": {
|
||||
"from": "now-1h",
|
||||
"to": "now"
|
||||
},
|
||||
"timepicker": {},
|
||||
"timezone": "browser",
|
||||
"title": "rippled - Fee Market & TxQ",
|
||||
"uid": "rippled-fee-market",
|
||||
"version": 1
|
||||
}
|
||||
324
docker/telemetry/grafana/dashboards/rippled-job-queue.json
Normal file
324
docker/telemetry/grafana/dashboards/rippled-job-queue.json
Normal file
@@ -0,0 +1,324 @@
|
||||
{
|
||||
"annotations": {
|
||||
"list": []
|
||||
},
|
||||
"description": "Job queue analysis: per-job-type throughput rates, queue wait times, and execution times. Sourced from OTel MetricsRegistry synchronous counters and histograms (Phase 9).",
|
||||
"editable": true,
|
||||
"fiscalYearStartMonth": 0,
|
||||
"graphTooltip": 1,
|
||||
"id": null,
|
||||
"links": [],
|
||||
"panels": [
|
||||
{
|
||||
"title": "Job Throughput Rate (per second)",
|
||||
"description": "Rate of jobs queued, started, and finished across all job types. Computed as rate() over the OTel counter values. High queue rates with low finish rates indicate backlog.",
|
||||
"type": "timeseries",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 0
|
||||
},
|
||||
"options": {
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "sum(rate(rippled_job_queued_total[5m]))",
|
||||
"legendFormat": "Queued/s"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "sum(rate(rippled_job_started_total[5m]))",
|
||||
"legendFormat": "Started/s"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "sum(rate(rippled_job_finished_total[5m]))",
|
||||
"legendFormat": "Finished/s"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "ops",
|
||||
"custom": {
|
||||
"drawStyle": "line",
|
||||
"lineWidth": 2,
|
||||
"fillOpacity": 10
|
||||
},
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Per-Job-Type Queued Rate",
|
||||
"description": "Rate of jobs queued broken down by job_type label. Identifies which job types contribute most to queue activity.",
|
||||
"type": "timeseries",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 8
|
||||
},
|
||||
"options": {
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
},
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"calcs": ["mean", "max"]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "topk(10, rate(rippled_job_queued_total[5m]))",
|
||||
"legendFormat": "{{job_type}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "ops",
|
||||
"custom": {
|
||||
"drawStyle": "line",
|
||||
"lineWidth": 1,
|
||||
"fillOpacity": 5
|
||||
},
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Per-Job-Type Finish Rate",
|
||||
"description": "Rate of jobs completing broken down by job_type. Compare with queued rate to identify backlog per type.",
|
||||
"type": "timeseries",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 8
|
||||
},
|
||||
"options": {
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
},
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"calcs": ["mean", "max"]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "topk(10, rate(rippled_job_finished_total[5m]))",
|
||||
"legendFormat": "{{job_type}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "ops",
|
||||
"custom": {
|
||||
"drawStyle": "line",
|
||||
"lineWidth": 1,
|
||||
"fillOpacity": 5
|
||||
},
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Job Queue Wait Time (p50, p95, p99)",
|
||||
"description": "Histogram quantiles for time jobs spend waiting in the queue before execution starts. High values indicate thread pool saturation.",
|
||||
"type": "timeseries",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 16
|
||||
},
|
||||
"options": {
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "histogram_quantile(0.50, sum by (le) (rate(rippled_job_queued_duration_us_bucket[5m])))",
|
||||
"legendFormat": "p50"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "histogram_quantile(0.95, sum by (le) (rate(rippled_job_queued_duration_us_bucket[5m])))",
|
||||
"legendFormat": "p95"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "histogram_quantile(0.99, sum by (le) (rate(rippled_job_queued_duration_us_bucket[5m])))",
|
||||
"legendFormat": "p99"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "us",
|
||||
"custom": {
|
||||
"drawStyle": "line",
|
||||
"lineWidth": 2,
|
||||
"fillOpacity": 5
|
||||
},
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Job Execution Time (p50, p95, p99)",
|
||||
"description": "Histogram quantiles for actual job execution time. High values indicate expensive operations or resource contention.",
|
||||
"type": "timeseries",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 16
|
||||
},
|
||||
"options": {
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "histogram_quantile(0.50, sum by (le) (rate(rippled_job_running_duration_us_bucket[5m])))",
|
||||
"legendFormat": "p50"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "histogram_quantile(0.95, sum by (le) (rate(rippled_job_running_duration_us_bucket[5m])))",
|
||||
"legendFormat": "p95"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "histogram_quantile(0.99, sum by (le) (rate(rippled_job_running_duration_us_bucket[5m])))",
|
||||
"legendFormat": "p99"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "us",
|
||||
"custom": {
|
||||
"drawStyle": "line",
|
||||
"lineWidth": 2,
|
||||
"fillOpacity": 5
|
||||
},
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Per-Job-Type Execution Time (p95)",
|
||||
"description": "95th percentile execution time broken down by job type. Identifies the slowest job types.",
|
||||
"type": "timeseries",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 24
|
||||
},
|
||||
"options": {
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
},
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"calcs": ["mean", "max"]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "topk(10, histogram_quantile(0.95, sum by (le, job_type) (rate(rippled_job_running_duration_us_bucket[5m]))))",
|
||||
"legendFormat": "{{job_type}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "us",
|
||||
"custom": {
|
||||
"drawStyle": "line",
|
||||
"lineWidth": 1,
|
||||
"fillOpacity": 5
|
||||
},
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
}
|
||||
}
|
||||
],
|
||||
"schemaVersion": 39,
|
||||
"tags": ["rippled", "otel", "job-queue"],
|
||||
"templating": {
|
||||
"list": []
|
||||
},
|
||||
"time": {
|
||||
"from": "now-1h",
|
||||
"to": "now"
|
||||
},
|
||||
"timepicker": {},
|
||||
"timezone": "browser",
|
||||
"title": "rippled - Job Queue Analysis",
|
||||
"uid": "rippled-job-queue",
|
||||
"version": 1
|
||||
}
|
||||
333
docker/telemetry/grafana/dashboards/rippled-rpc-perf.json
Normal file
333
docker/telemetry/grafana/dashboards/rippled-rpc-perf.json
Normal file
@@ -0,0 +1,333 @@
|
||||
{
|
||||
"annotations": {
|
||||
"list": []
|
||||
},
|
||||
"description": "Per-RPC-method performance: call rates, error rates, and latency distributions. Sourced from OTel MetricsRegistry synchronous counters and histograms (Phase 9).",
|
||||
"editable": true,
|
||||
"fiscalYearStartMonth": 0,
|
||||
"graphTooltip": 1,
|
||||
"id": null,
|
||||
"links": [],
|
||||
"panels": [
|
||||
{
|
||||
"title": "RPC Call Rate (all methods)",
|
||||
"description": "Aggregate rate of RPC calls started, finished, and errored across all methods. Computed as rate() over OTel counters.",
|
||||
"type": "timeseries",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 0
|
||||
},
|
||||
"options": {
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "sum(rate(rippled_rpc_method_started_total[5m]))",
|
||||
"legendFormat": "Started/s"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "sum(rate(rippled_rpc_method_finished_total[5m]))",
|
||||
"legendFormat": "Finished/s"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "sum(rate(rippled_rpc_method_errored_total[5m]))",
|
||||
"legendFormat": "Errored/s"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "ops",
|
||||
"custom": {
|
||||
"drawStyle": "line",
|
||||
"lineWidth": 2,
|
||||
"fillOpacity": 10
|
||||
},
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Per-Method Call Rate (Top 10)",
|
||||
"description": "Per-method RPC call rate, showing the 10 most active methods. Useful for identifying hot paths.",
|
||||
"type": "timeseries",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 8
|
||||
},
|
||||
"options": {
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
},
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"calcs": ["mean", "max"]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "topk(10, rate(rippled_rpc_method_started_total[5m]))",
|
||||
"legendFormat": "{{method}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "ops",
|
||||
"custom": {
|
||||
"drawStyle": "line",
|
||||
"lineWidth": 1,
|
||||
"fillOpacity": 5
|
||||
},
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Per-Method Error Rate (Top 10)",
|
||||
"description": "Per-method RPC error rate. Non-zero values warrant investigation. Common culprits: invalid parameters, resource exhaustion.",
|
||||
"type": "timeseries",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 8
|
||||
},
|
||||
"options": {
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
},
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"calcs": ["mean", "max"]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "topk(10, rate(rippled_rpc_method_errored_total[5m]))",
|
||||
"legendFormat": "{{method}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "ops",
|
||||
"custom": {
|
||||
"drawStyle": "line",
|
||||
"lineWidth": 1,
|
||||
"fillOpacity": 5
|
||||
},
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "RPC Latency (p50, p95, p99) - All Methods",
|
||||
"description": "Histogram quantiles for RPC execution time across all methods. Sourced from rpc_method_duration_us histogram.",
|
||||
"type": "timeseries",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 16
|
||||
},
|
||||
"options": {
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "histogram_quantile(0.50, sum by (le) (rate(rippled_rpc_method_duration_us_bucket[5m])))",
|
||||
"legendFormat": "p50"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "histogram_quantile(0.95, sum by (le) (rate(rippled_rpc_method_duration_us_bucket[5m])))",
|
||||
"legendFormat": "p95"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "histogram_quantile(0.99, sum by (le) (rate(rippled_rpc_method_duration_us_bucket[5m])))",
|
||||
"legendFormat": "p99"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "us",
|
||||
"custom": {
|
||||
"drawStyle": "line",
|
||||
"lineWidth": 2,
|
||||
"fillOpacity": 5
|
||||
},
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Per-Method Latency p95 (Top 10 Slowest)",
|
||||
"description": "95th percentile execution time per method. Identifies the slowest RPC endpoints.",
|
||||
"type": "timeseries",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 16
|
||||
},
|
||||
"options": {
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
},
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"calcs": ["mean", "max"]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "topk(10, histogram_quantile(0.95, sum by (le, method) (rate(rippled_rpc_method_duration_us_bucket[5m]))))",
|
||||
"legendFormat": "{{method}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "us",
|
||||
"custom": {
|
||||
"drawStyle": "line",
|
||||
"lineWidth": 1,
|
||||
"fillOpacity": 5
|
||||
},
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "RPC Error Ratio by Method",
|
||||
"description": "Error ratio (errors / total started) per method. Values above 0.05 (5%) warrant investigation.",
|
||||
"type": "timeseries",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 24
|
||||
},
|
||||
"options": {
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
},
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"calcs": ["mean", "max"]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "topk(10, rate(rippled_rpc_method_errored_total[5m]) / (rate(rippled_rpc_method_started_total[5m]) > 0))",
|
||||
"legendFormat": "{{method}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percentunit",
|
||||
"min": 0,
|
||||
"max": 1,
|
||||
"custom": {
|
||||
"drawStyle": "line",
|
||||
"lineWidth": 1,
|
||||
"fillOpacity": 5
|
||||
},
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"thresholds": {
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
"value": 0.05
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 0.25
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
}
|
||||
}
|
||||
],
|
||||
"schemaVersion": 39,
|
||||
"tags": ["rippled", "otel", "rpc"],
|
||||
"templating": {
|
||||
"list": []
|
||||
},
|
||||
"time": {
|
||||
"from": "now-1h",
|
||||
"to": "now"
|
||||
},
|
||||
"timepicker": {},
|
||||
"timezone": "browser",
|
||||
"title": "rippled - RPC Performance (OTel)",
|
||||
"uid": "rippled-rpc-perf",
|
||||
"version": 1
|
||||
}
|
||||
@@ -399,10 +399,318 @@
|
||||
},
|
||||
"overrides": []
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "--- OTel: NodeStore I/O ---",
|
||||
"type": "row",
|
||||
"gridPos": {
|
||||
"h": 1,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 32
|
||||
},
|
||||
"collapsed": false,
|
||||
"panels": []
|
||||
},
|
||||
{
|
||||
"title": "NodeStore Read/Write Totals",
|
||||
"description": "Cumulative NodeStore read and write operation counts. Sourced from MetricsRegistry nodestore_state observable gauge with metric=node_reads_total, node_writes, node_reads_hit.",
|
||||
"type": "timeseries",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 33
|
||||
},
|
||||
"options": {
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "rippled_nodestore_state{metric=\"node_reads_total\"}",
|
||||
"legendFormat": "Reads Total"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "rippled_nodestore_state{metric=\"node_reads_hit\"}",
|
||||
"legendFormat": "Reads Hit (cache)"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "rippled_nodestore_state{metric=\"node_writes\"}",
|
||||
"legendFormat": "Writes Total"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short",
|
||||
"custom": {
|
||||
"drawStyle": "line",
|
||||
"lineWidth": 2,
|
||||
"fillOpacity": 10
|
||||
},
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "NodeStore Write Load & Read Queue",
|
||||
"description": "Instantaneous write load score and read queue depth. High write load indicates backend pressure. High read queue indicates prefetch thread saturation.",
|
||||
"type": "timeseries",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 33
|
||||
},
|
||||
"options": {
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "rippled_nodestore_state{metric=\"write_load\"}",
|
||||
"legendFormat": "Write Load"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "rippled_nodestore_state{metric=\"read_queue\"}",
|
||||
"legendFormat": "Read Queue"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short",
|
||||
"custom": {
|
||||
"drawStyle": "line",
|
||||
"lineWidth": 2,
|
||||
"fillOpacity": 10
|
||||
},
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"thresholds": {
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
"value": 100
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 1000
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "--- OTel: Cache Hit Rates ---",
|
||||
"type": "row",
|
||||
"gridPos": {
|
||||
"h": 1,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 41
|
||||
},
|
||||
"collapsed": false,
|
||||
"panels": []
|
||||
},
|
||||
{
|
||||
"title": "Cache Hit Rates",
|
||||
"description": "Hit rates for SLE cache, Ledger cache, and AcceptedLedger cache. Values from 0.0 to 1.0. Low values indicate cache thrashing. Sourced from MetricsRegistry cache_metrics observable gauge.",
|
||||
"type": "timeseries",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 42
|
||||
},
|
||||
"options": {
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "rippled_cache_metrics{metric=\"SLE_hit_rate\"}",
|
||||
"legendFormat": "SLE Hit Rate"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "rippled_cache_metrics{metric=\"ledger_hit_rate\"}",
|
||||
"legendFormat": "Ledger Hit Rate"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "rippled_cache_metrics{metric=\"AL_hit_rate\"}",
|
||||
"legendFormat": "AcceptedLedger Hit Rate"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percentunit",
|
||||
"min": 0,
|
||||
"max": 1,
|
||||
"custom": {
|
||||
"drawStyle": "line",
|
||||
"lineWidth": 2,
|
||||
"fillOpacity": 10
|
||||
},
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Cache Sizes",
|
||||
"description": "TreeNode cache size, TreeNode track size, and FullBelow cache size. Sourced from MetricsRegistry cache_metrics observable gauge.",
|
||||
"type": "timeseries",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 42
|
||||
},
|
||||
"options": {
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "rippled_cache_metrics{metric=\"treenode_cache_size\"}",
|
||||
"legendFormat": "TreeNode Cache"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "rippled_cache_metrics{metric=\"treenode_track_size\"}",
|
||||
"legendFormat": "TreeNode Track"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "rippled_cache_metrics{metric=\"fullbelow_size\"}",
|
||||
"legendFormat": "FullBelow"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short",
|
||||
"custom": {
|
||||
"drawStyle": "line",
|
||||
"lineWidth": 2,
|
||||
"fillOpacity": 10
|
||||
},
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "--- OTel: Object Instance Counts ---",
|
||||
"type": "row",
|
||||
"gridPos": {
|
||||
"h": 1,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 50
|
||||
},
|
||||
"collapsed": false,
|
||||
"panels": []
|
||||
},
|
||||
{
|
||||
"title": "Object Instance Counts",
|
||||
"description": "Live instance counts for key internal object types tracked by CountedObject<T>. Sourced from MetricsRegistry object_count observable gauge. High counts may indicate memory pressure or object leaks.",
|
||||
"type": "timeseries",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 51
|
||||
},
|
||||
"options": {
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
},
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"calcs": ["last", "max"]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "topk(15, rippled_object_count)",
|
||||
"legendFormat": "{{type}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short",
|
||||
"custom": {
|
||||
"drawStyle": "line",
|
||||
"lineWidth": 1,
|
||||
"fillOpacity": 5
|
||||
},
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
}
|
||||
}
|
||||
],
|
||||
"schemaVersion": 39,
|
||||
"tags": ["rippled", "statsd", "node-health", "telemetry"],
|
||||
"tags": ["rippled", "statsd", "otel", "node-health", "telemetry"],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
|
||||
@@ -353,6 +353,7 @@ trace_transactions=1
|
||||
trace_consensus=1
|
||||
trace_peer=1
|
||||
trace_ledger=1
|
||||
metrics_endpoint=http://localhost:4318/v1/metrics
|
||||
|
||||
[insight]
|
||||
server=otel
|
||||
@@ -636,6 +637,53 @@ else
|
||||
fail "StatsD port 8125 appears to be listening (should not be needed)"
|
||||
fi
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Step 10c: Verify Phase 9 OTel SDK Metrics
|
||||
# ---------------------------------------------------------------------------
|
||||
log ""
|
||||
log "--- Phase 9: OTel SDK Metrics (MetricsRegistry) ---"
|
||||
log "Waiting 15s for OTel metric export + Prometheus scrape..."
|
||||
sleep 15
|
||||
|
||||
check_otel_metric() {
|
||||
local metric_name="$1"
|
||||
local result
|
||||
result=$(curl -sf "$PROM/api/v1/query?query=$metric_name" \
|
||||
| jq '.data.result | length' 2>/dev/null || echo 0)
|
||||
if [ "$result" -gt 0 ]; then
|
||||
ok "OTel: $metric_name ($result series)"
|
||||
else
|
||||
fail "OTel: $metric_name (0 series)"
|
||||
fi
|
||||
}
|
||||
|
||||
# Task 9.1: NodeStore I/O
|
||||
check_otel_metric 'rippled_nodestore_state{metric="node_reads_total"}'
|
||||
check_otel_metric 'rippled_nodestore_state{metric="write_load"}'
|
||||
|
||||
# Task 9.2: Cache hit rates
|
||||
check_otel_metric 'rippled_cache_metrics{metric="SLE_hit_rate"}'
|
||||
check_otel_metric 'rippled_cache_metrics{metric="treenode_cache_size"}'
|
||||
|
||||
# Task 9.3: TxQ metrics
|
||||
check_otel_metric 'rippled_txq_metrics{metric="txq_count"}'
|
||||
check_otel_metric 'rippled_txq_metrics{metric="txq_reference_fee_level"}'
|
||||
|
||||
# Task 9.4: Per-RPC metrics
|
||||
check_otel_metric "rippled_rpc_method_started_total"
|
||||
check_otel_metric "rippled_rpc_method_finished_total"
|
||||
|
||||
# Task 9.5: Per-job metrics
|
||||
check_otel_metric "rippled_job_queued_total"
|
||||
check_otel_metric "rippled_job_finished_total"
|
||||
|
||||
# Task 9.6: Counted object instances
|
||||
check_otel_metric "rippled_object_count"
|
||||
|
||||
# Task 9.7: Load factor breakdown
|
||||
check_otel_metric 'rippled_load_factor_metrics{metric="load_factor"}'
|
||||
check_otel_metric 'rippled_load_factor_metrics{metric="load_factor_server"}'
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Step 11: Summary
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@@ -231,7 +231,7 @@ When using StatsD, uncomment the `statsd` receiver in `otel-collector-config.yam
|
||||
|
||||
## Grafana Dashboards
|
||||
|
||||
Eight dashboards are pre-provisioned in `docker/telemetry/grafana/dashboards/`:
|
||||
Thirteen dashboards are pre-provisioned in `docker/telemetry/grafana/dashboards/`:
|
||||
|
||||
### RPC Performance (`rippled-rpc-perf`)
|
||||
|
||||
@@ -403,8 +403,74 @@ count_over_time({job="rippled"} |= "trace_id=" [5m])
|
||||
4. Open Grafana at http://localhost:3000 -> Explore -> Loki and search for `{job="rippled"} |= "trace_id="`.
|
||||
5. Click the TraceID link to navigate to the corresponding trace in Tempo.
|
||||
|
||||
## Phase 9: OTel Metrics Alerting Rules
|
||||
|
||||
The following alerting rules are recommended for the Phase 9 OTel SDK metrics.
|
||||
Add to your Prometheus alerting rules configuration.
|
||||
|
||||
### NodeStore
|
||||
|
||||
| Alert Name | Severity | Condition | For | Description |
|
||||
| --------------------------- | -------- | ---------------------------------------------------- | --- | ------------------------------------------------------- |
|
||||
| `NodeStoreHighWriteLoad` | Warning | `rippled_nodestore_state{metric="write_load"} > 100` | 5m | NodeStore backend is under sustained write pressure |
|
||||
| `NodeStoreReadQueueBacklog` | Warning | `rippled_nodestore_state{metric="read_queue"} > 500` | 5m | Prefetch thread pool is saturated; reads are backing up |
|
||||
|
||||
### Cache
|
||||
|
||||
| Alert Name | Severity | Condition | For | Description |
|
||||
| ----------------------- | -------- | ------------------------------------------------------- | --- | ------------------------------------------------------ |
|
||||
| `SLECacheHitRateLow` | Warning | `rippled_cache_metrics{metric="SLE_hit_rate"} < 0.5` | 10m | SLE cache is thrashing; consider increasing cache size |
|
||||
| `LedgerCacheHitRateLow` | Warning | `rippled_cache_metrics{metric="ledger_hit_rate"} < 0.5` | 10m | Ledger cache hit rate is degraded |
|
||||
|
||||
### Transaction Queue
|
||||
|
||||
| Alert Name | Severity | Condition | For | Description |
|
||||
| ---------------------- | -------- | ---------------------------------------------------------------------------------------------------------------------- | --- | -------------------------------------------------- |
|
||||
| `TxQNearCapacity` | Warning | `rippled_txq_metrics{metric="txq_count"} / rippled_txq_metrics{metric="txq_max_size"} > 0.8` | 5m | TxQ is >80% full; transactions may be rejected |
|
||||
| `TxQHighFeeEscalation` | Warning | `rippled_txq_metrics{metric="txq_open_ledger_fee_level"} / rippled_txq_metrics{metric="txq_reference_fee_level"} > 10` | 5m | Fee escalation is 10x above reference; high demand |
|
||||
|
||||
### Load Factor
|
||||
|
||||
| Alert Name | Severity | Condition | For | Description |
|
||||
| --------------------- | -------- | -------------------------------------------------------------- | --- | -------------------------------------------------------------- |
|
||||
| `HighLoadFactor` | Warning | `rippled_load_factor_metrics{metric="load_factor"} > 5` | 10m | Combined load factor is elevated; transactions cost 5x+ normal |
|
||||
| `HighLocalLoadFactor` | Critical | `rippled_load_factor_metrics{metric="load_factor_local"} > 10` | 5m | Local server load is critically elevated |
|
||||
|
||||
### RPC Performance
|
||||
|
||||
| Alert Name | Severity | Condition | For | Description |
|
||||
| ------------------ | -------- | ---------------------------------------------------------------------------------------------------------- | --- | --------------------------------- |
|
||||
| `HighRPCErrorRate` | Warning | `sum(rate(rippled_rpc_method_errored_total[5m])) / sum(rate(rippled_rpc_method_started_total[5m])) > 0.05` | 5m | >5% of RPC calls are erroring |
|
||||
| `SlowRPCLatency` | Warning | `histogram_quantile(0.95, sum by (le) (rate(rippled_rpc_method_duration_us_bucket[5m]))) > 5000000` | 5m | RPC p95 latency exceeds 5 seconds |
|
||||
|
||||
### Job Queue
|
||||
|
||||
| Alert Name | Severity | Condition | For | Description |
|
||||
| ------------------ | -------- | ----------------------------------------------------------------------------------------------------- | --- | ---------------------------------------------------- |
|
||||
| `JobQueueBacklog` | Warning | `sum(rate(rippled_job_queued_total[5m])) - sum(rate(rippled_job_finished_total[5m])) > 100` | 5m | Jobs are being queued faster than they're completing |
|
||||
| `SlowJobExecution` | Warning | `histogram_quantile(0.95, sum by (le) (rate(rippled_job_running_duration_us_bucket[5m]))) > 10000000` | 5m | Job execution p95 exceeds 10 seconds |
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### No OTel SDK metrics in Prometheus
|
||||
|
||||
1. Verify `enabled=1` in the `[telemetry]` config section
|
||||
2. Check that `metrics_endpoint` points to the OTel Collector's HTTP receiver
|
||||
(default: `http://localhost:4318/v1/metrics`)
|
||||
3. Check rippled logs for `MetricsRegistry: started successfully` message
|
||||
4. Verify the OTel Collector is configured with an OTLP receiver and Prometheus exporter
|
||||
5. Check Prometheus targets page for the collector scrape target
|
||||
|
||||
### Cache hit rates are zero
|
||||
|
||||
Cache hit rates may be zero during startup before caches are warmed. Wait for the
|
||||
node to reach `Full` operating mode and process several ledgers before investigating.
|
||||
|
||||
### NodeStore I/O counters not incrementing
|
||||
|
||||
NodeStore counters are cumulative and may appear flat if the node is idle. Submit
|
||||
some transactions or RPC requests to generate I/O activity.
|
||||
|
||||
### No traces appearing in Jaeger
|
||||
|
||||
1. Check rippled logs for `Telemetry starting` message
|
||||
|
||||
@@ -21,7 +21,8 @@ class PerfLog;
|
||||
}
|
||||
namespace telemetry {
|
||||
class Telemetry;
|
||||
}
|
||||
class MetricsRegistry;
|
||||
} // namespace telemetry
|
||||
|
||||
// This is temporary until we migrate all code to use ServiceRegistry.
|
||||
class Application;
|
||||
@@ -211,6 +212,12 @@ public:
|
||||
virtual telemetry::Telemetry&
|
||||
getTelemetry() = 0;
|
||||
|
||||
/** Return the MetricsRegistry, or nullptr if telemetry is disabled.
|
||||
Used by PerfLog and other hot paths to record OTel metrics.
|
||||
*/
|
||||
virtual telemetry::MetricsRegistry*
|
||||
getMetricsRegistry() = 0;
|
||||
|
||||
// Configuration and state
|
||||
virtual bool
|
||||
isStopping() const = 0;
|
||||
|
||||
374
src/test/telemetry/MetricsRegistry_test.cpp
Normal file
374
src/test/telemetry/MetricsRegistry_test.cpp
Normal file
@@ -0,0 +1,374 @@
|
||||
/** Unit tests for MetricsRegistry.
|
||||
|
||||
Tests cover:
|
||||
- Construction with telemetry disabled (no-op behavior).
|
||||
- start()/stop() lifecycle when disabled.
|
||||
- Synchronous instrument recording methods do not crash when disabled.
|
||||
- Double stop() is safe.
|
||||
|
||||
NOTE: Tests that exercise the OTel SDK path require XRPL_ENABLE_TELEMETRY
|
||||
to be defined at build time (telemetry=ON). The no-op path tests run
|
||||
unconditionally.
|
||||
*/
|
||||
|
||||
#include <xrpld/telemetry/MetricsRegistry.h>
|
||||
|
||||
#include <xrpl/beast/unit_test.h>
|
||||
#include <xrpl/core/ServiceRegistry.h>
|
||||
|
||||
namespace xrpl {
|
||||
namespace test {
|
||||
|
||||
/** Minimal mock ServiceRegistry for MetricsRegistry testing.
|
||||
|
||||
Only the getMetricsRegistry() call is used in the tests; other methods
|
||||
are not invoked because the registry is disabled (enabled=false) so no
|
||||
gauge callbacks execute.
|
||||
|
||||
All pure virtual methods throw to catch accidental calls during tests.
|
||||
*/
|
||||
class MockServiceRegistry : public ServiceRegistry
|
||||
{
|
||||
[[noreturn]] void
|
||||
throwUnimplemented() const
|
||||
{
|
||||
Throw<std::logic_error>("MockServiceRegistry: method not implemented");
|
||||
}
|
||||
|
||||
public:
|
||||
// ServiceRegistry interface — stubs that should never be called.
|
||||
CollectorManager&
|
||||
getCollectorManager() override
|
||||
{
|
||||
throwUnimplemented();
|
||||
}
|
||||
Family&
|
||||
getNodeFamily() override
|
||||
{
|
||||
throwUnimplemented();
|
||||
}
|
||||
TimeKeeper&
|
||||
timeKeeper() override
|
||||
{
|
||||
throwUnimplemented();
|
||||
}
|
||||
JobQueue&
|
||||
getJobQueue() override
|
||||
{
|
||||
throwUnimplemented();
|
||||
}
|
||||
NodeCache&
|
||||
getTempNodeCache() override
|
||||
{
|
||||
throwUnimplemented();
|
||||
}
|
||||
CachedSLEs&
|
||||
cachedSLEs() override
|
||||
{
|
||||
throwUnimplemented();
|
||||
}
|
||||
NetworkIDService&
|
||||
getNetworkIDService() override
|
||||
{
|
||||
throwUnimplemented();
|
||||
}
|
||||
AmendmentTable&
|
||||
getAmendmentTable() override
|
||||
{
|
||||
throwUnimplemented();
|
||||
}
|
||||
HashRouter&
|
||||
getHashRouter() override
|
||||
{
|
||||
throwUnimplemented();
|
||||
}
|
||||
LoadFeeTrack&
|
||||
getFeeTrack() override
|
||||
{
|
||||
throwUnimplemented();
|
||||
}
|
||||
LoadManager&
|
||||
getLoadManager() override
|
||||
{
|
||||
throwUnimplemented();
|
||||
}
|
||||
RCLValidations&
|
||||
getValidations() override
|
||||
{
|
||||
throwUnimplemented();
|
||||
}
|
||||
ValidatorList&
|
||||
validators() override
|
||||
{
|
||||
throwUnimplemented();
|
||||
}
|
||||
ValidatorSite&
|
||||
validatorSites() override
|
||||
{
|
||||
throwUnimplemented();
|
||||
}
|
||||
ManifestCache&
|
||||
validatorManifests() override
|
||||
{
|
||||
throwUnimplemented();
|
||||
}
|
||||
ManifestCache&
|
||||
publisherManifests() override
|
||||
{
|
||||
throwUnimplemented();
|
||||
}
|
||||
Overlay&
|
||||
overlay() override
|
||||
{
|
||||
throwUnimplemented();
|
||||
}
|
||||
Cluster&
|
||||
cluster() override
|
||||
{
|
||||
throwUnimplemented();
|
||||
}
|
||||
PeerReservationTable&
|
||||
peerReservations() override
|
||||
{
|
||||
throwUnimplemented();
|
||||
}
|
||||
Resource::Manager&
|
||||
getResourceManager() override
|
||||
{
|
||||
throwUnimplemented();
|
||||
}
|
||||
NodeStore::Database&
|
||||
getNodeStore() override
|
||||
{
|
||||
throwUnimplemented();
|
||||
}
|
||||
SHAMapStore&
|
||||
getSHAMapStore() override
|
||||
{
|
||||
throwUnimplemented();
|
||||
}
|
||||
RelationalDatabase&
|
||||
getRelationalDatabase() override
|
||||
{
|
||||
throwUnimplemented();
|
||||
}
|
||||
InboundLedgers&
|
||||
getInboundLedgers() override
|
||||
{
|
||||
throwUnimplemented();
|
||||
}
|
||||
InboundTransactions&
|
||||
getInboundTransactions() override
|
||||
{
|
||||
throwUnimplemented();
|
||||
}
|
||||
TaggedCache<uint256, AcceptedLedger>&
|
||||
getAcceptedLedgerCache() override
|
||||
{
|
||||
throwUnimplemented();
|
||||
}
|
||||
LedgerMaster&
|
||||
getLedgerMaster() override
|
||||
{
|
||||
throwUnimplemented();
|
||||
}
|
||||
LedgerCleaner&
|
||||
getLedgerCleaner() override
|
||||
{
|
||||
throwUnimplemented();
|
||||
}
|
||||
LedgerReplayer&
|
||||
getLedgerReplayer() override
|
||||
{
|
||||
throwUnimplemented();
|
||||
}
|
||||
PendingSaves&
|
||||
pendingSaves() override
|
||||
{
|
||||
throwUnimplemented();
|
||||
}
|
||||
OpenLedger&
|
||||
openLedger() override
|
||||
{
|
||||
throwUnimplemented();
|
||||
}
|
||||
OpenLedger const&
|
||||
openLedger() const override
|
||||
{
|
||||
throwUnimplemented();
|
||||
}
|
||||
NetworkOPs&
|
||||
getOPs() override
|
||||
{
|
||||
throwUnimplemented();
|
||||
}
|
||||
OrderBookDB&
|
||||
getOrderBookDB() override
|
||||
{
|
||||
throwUnimplemented();
|
||||
}
|
||||
TransactionMaster&
|
||||
getMasterTransaction() override
|
||||
{
|
||||
throwUnimplemented();
|
||||
}
|
||||
TxQ&
|
||||
getTxQ() override
|
||||
{
|
||||
throwUnimplemented();
|
||||
}
|
||||
PathRequests&
|
||||
getPathRequests() override
|
||||
{
|
||||
throwUnimplemented();
|
||||
}
|
||||
ServerHandler&
|
||||
getServerHandler() override
|
||||
{
|
||||
throwUnimplemented();
|
||||
}
|
||||
perf::PerfLog&
|
||||
getPerfLog() override
|
||||
{
|
||||
throwUnimplemented();
|
||||
}
|
||||
telemetry::Telemetry&
|
||||
getTelemetry() override
|
||||
{
|
||||
throwUnimplemented();
|
||||
}
|
||||
telemetry::MetricsRegistry*
|
||||
getMetricsRegistry() override
|
||||
{
|
||||
return nullptr;
|
||||
}
|
||||
bool
|
||||
isStopping() const override
|
||||
{
|
||||
return false;
|
||||
}
|
||||
beast::Journal
|
||||
journal(std::string const&) override
|
||||
{
|
||||
return beast::Journal(beast::Journal::getNullSink());
|
||||
}
|
||||
boost::asio::io_context&
|
||||
getIOContext() override
|
||||
{
|
||||
throwUnimplemented();
|
||||
}
|
||||
Logs&
|
||||
logs() override
|
||||
{
|
||||
throwUnimplemented();
|
||||
}
|
||||
std::optional<uint256> const&
|
||||
trapTxID() const override
|
||||
{
|
||||
static std::optional<uint256> const empty;
|
||||
return empty;
|
||||
}
|
||||
DatabaseCon&
|
||||
getWalletDB() override
|
||||
{
|
||||
throwUnimplemented();
|
||||
}
|
||||
Application&
|
||||
app() override
|
||||
{
|
||||
throwUnimplemented();
|
||||
}
|
||||
};
|
||||
|
||||
class MetricsRegistry_test : public beast::unit_test::suite
|
||||
{
|
||||
void
|
||||
testDisabledConstruction()
|
||||
{
|
||||
testcase("Disabled construction");
|
||||
|
||||
MockServiceRegistry mockApp;
|
||||
beast::Journal j(beast::Journal::getNullSink());
|
||||
|
||||
// Construct with enabled=false; should be a no-op.
|
||||
telemetry::MetricsRegistry registry(false, mockApp, j);
|
||||
BEAST_EXPECT(!registry.isEnabled());
|
||||
}
|
||||
|
||||
void
|
||||
testDisabledStartStop()
|
||||
{
|
||||
testcase("Disabled start/stop");
|
||||
|
||||
MockServiceRegistry mockApp;
|
||||
beast::Journal j(beast::Journal::getNullSink());
|
||||
|
||||
telemetry::MetricsRegistry registry(false, mockApp, j);
|
||||
|
||||
// start() and stop() should be no-ops when disabled.
|
||||
registry.start("http://localhost:4318/v1/metrics");
|
||||
registry.stop();
|
||||
|
||||
// Double stop should be safe.
|
||||
registry.stop();
|
||||
|
||||
pass();
|
||||
}
|
||||
|
||||
void
|
||||
testDisabledRecording()
|
||||
{
|
||||
testcase("Disabled recording methods");
|
||||
|
||||
MockServiceRegistry mockApp;
|
||||
beast::Journal j(beast::Journal::getNullSink());
|
||||
|
||||
telemetry::MetricsRegistry registry(false, mockApp, j);
|
||||
registry.start("http://localhost:4318/v1/metrics");
|
||||
|
||||
// All recording methods should be no-ops (not crash).
|
||||
registry.recordRpcStarted("server_info");
|
||||
registry.recordRpcFinished("server_info", 1000);
|
||||
registry.recordRpcErrored("ledger", 500);
|
||||
registry.recordJobQueued("ledgerData");
|
||||
registry.recordJobStarted("ledgerData", 200);
|
||||
registry.recordJobFinished("ledgerData", 3000);
|
||||
|
||||
registry.stop();
|
||||
|
||||
pass();
|
||||
}
|
||||
|
||||
void
|
||||
testDestructorStops()
|
||||
{
|
||||
testcase("Destructor calls stop");
|
||||
|
||||
MockServiceRegistry mockApp;
|
||||
beast::Journal j(beast::Journal::getNullSink());
|
||||
|
||||
{
|
||||
// Let the destructor handle cleanup.
|
||||
telemetry::MetricsRegistry registry(false, mockApp, j);
|
||||
registry.start("http://localhost:4318/v1/metrics");
|
||||
}
|
||||
|
||||
// If we get here without crash, the destructor handled stop.
|
||||
pass();
|
||||
}
|
||||
|
||||
public:
|
||||
void
|
||||
run() override
|
||||
{
|
||||
testDisabledConstruction();
|
||||
testDisabledStartStop();
|
||||
testDisabledRecording();
|
||||
testDestructorStops();
|
||||
}
|
||||
};
|
||||
|
||||
BEAST_DEFINE_TESTSUITE(MetricsRegistry, telemetry, ripple);
|
||||
|
||||
} // namespace test
|
||||
} // namespace xrpl
|
||||
@@ -29,6 +29,7 @@
|
||||
#include <xrpld/overlay/PeerSet.h>
|
||||
#include <xrpld/overlay/make_Overlay.h>
|
||||
#include <xrpld/shamap/NodeFamily.h>
|
||||
#include <xrpld/telemetry/MetricsRegistry.h>
|
||||
|
||||
#include <xrpl/basics/ByteUtilities.h>
|
||||
#include <xrpl/basics/ResolverAsio.h>
|
||||
@@ -148,6 +149,9 @@ public:
|
||||
beast::Journal m_journal;
|
||||
std::unique_ptr<perf::PerfLog> perfLog_;
|
||||
std::unique_ptr<telemetry::Telemetry> telemetry_;
|
||||
/// OTel metrics registry for gap-fill metrics (counters, histograms,
|
||||
/// observable gauges). Created after telemetry_ during setup().
|
||||
std::unique_ptr<telemetry::MetricsRegistry> metricsRegistry_;
|
||||
Application::MutexType m_masterMutex;
|
||||
|
||||
// Required by the SHAMapStore
|
||||
@@ -633,6 +637,12 @@ public:
|
||||
return *telemetry_;
|
||||
}
|
||||
|
||||
telemetry::MetricsRegistry*
|
||||
getMetricsRegistry() override
|
||||
{
|
||||
return metricsRegistry_.get();
|
||||
}
|
||||
|
||||
NodeCache&
|
||||
getTempNodeCache() override
|
||||
{
|
||||
@@ -1282,6 +1292,11 @@ ApplicationImp::setup(boost::program_options::variables_map const& cmdline)
|
||||
if (!config_->section("telemetry").exists("service_instance_id"))
|
||||
telemetry_->setServiceInstanceId(toBase58(TokenType::NodePublic, nodeIdentity_->first));
|
||||
|
||||
// Create the OTel MetricsRegistry for gap-fill metrics (counters,
|
||||
// histograms, observable gauges). It is started later in start().
|
||||
metricsRegistry_ = std::make_unique<telemetry::MetricsRegistry>(
|
||||
telemetry_->isEnabled(), *this, logs_->journal("MetricsRegistry"));
|
||||
|
||||
if (!cluster_->load(config().section(SECTION_CLUSTER_NODES)))
|
||||
{
|
||||
JLOG(m_journal.fatal()) << "Invalid entry in cluster configuration.";
|
||||
@@ -1494,6 +1509,16 @@ ApplicationImp::start(bool withTimers)
|
||||
ledgerCleaner_->start();
|
||||
perfLog_->start();
|
||||
telemetry_->start();
|
||||
|
||||
// Start the metrics pipeline after telemetry; the endpoint uses the
|
||||
// same base URL but the /v1/metrics path.
|
||||
if (metricsRegistry_)
|
||||
{
|
||||
auto const& section = config_->section("telemetry");
|
||||
std::string endpoint = "http://localhost:4318/v1/metrics";
|
||||
set(endpoint, "metrics_endpoint", section);
|
||||
metricsRegistry_->start(endpoint);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
@@ -1584,6 +1609,10 @@ ApplicationImp::run()
|
||||
ledgerCleaner_->stop();
|
||||
m_nodeStore->stop();
|
||||
perfLog_->stop();
|
||||
// Stop metrics pipeline before telemetry — gauge callbacks reference
|
||||
// Application services that may be shutting down.
|
||||
if (metricsRegistry_)
|
||||
metricsRegistry_->stop();
|
||||
// Telemetry must stop last among trace-producing components.
|
||||
// serverHandler_, overlay_, and jobQueue_ are already stopped above,
|
||||
// so no threads should be calling startSpan() at this point.
|
||||
|
||||
@@ -1,9 +1,11 @@
|
||||
#include <xrpld/perflog/detail/PerfLogImp.h>
|
||||
#include <xrpld/telemetry/MetricsRegistry.h>
|
||||
|
||||
#include <xrpl/basics/BasicConfig.h>
|
||||
#include <xrpl/beast/core/CurrentThreadName.h>
|
||||
#include <xrpl/beast/utility/Journal.h>
|
||||
#include <xrpl/core/JobTypes.h>
|
||||
#include <xrpl/core/ServiceRegistry.h>
|
||||
#include <xrpl/json/json_writer.h>
|
||||
|
||||
#include <atomic>
|
||||
@@ -314,6 +316,10 @@ PerfLogImp::rpcStart(std::string const& method, std::uint64_t const requestId)
|
||||
}
|
||||
std::lock_guard lock(counters_.methodsMutex_);
|
||||
counters_.methods_[requestId] = {counter->first.c_str(), steady_clock::now()};
|
||||
|
||||
// Task 9.4: Record RPC start in OTel metrics pipeline.
|
||||
if (auto* mr = app_.getMetricsRegistry())
|
||||
mr->recordRpcStarted(method);
|
||||
}
|
||||
|
||||
void
|
||||
@@ -343,13 +349,25 @@ PerfLogImp::rpcEnd(std::string const& method, std::uint64_t const requestId, boo
|
||||
// LCOV_EXCL_STOP
|
||||
}
|
||||
}
|
||||
std::lock_guard lock(counter->second.mutex);
|
||||
if (finish)
|
||||
++counter->second.value.finished;
|
||||
else
|
||||
++counter->second.value.errored;
|
||||
counter->second.value.duration +=
|
||||
std::chrono::duration_cast<microseconds>(steady_clock::now() - startTime);
|
||||
auto const duration = std::chrono::duration_cast<microseconds>(steady_clock::now() - startTime);
|
||||
{
|
||||
std::lock_guard lock(counter->second.mutex);
|
||||
if (finish)
|
||||
++counter->second.value.finished;
|
||||
else
|
||||
++counter->second.value.errored;
|
||||
counter->second.value.duration += duration;
|
||||
}
|
||||
|
||||
// Task 9.4: Record RPC completion/error in OTel metrics pipeline.
|
||||
if (auto* mr = app_.getMetricsRegistry())
|
||||
{
|
||||
auto const durUs = duration.count();
|
||||
if (finish)
|
||||
mr->recordRpcFinished(method, durUs);
|
||||
else
|
||||
mr->recordRpcErrored(method, durUs);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
@@ -365,6 +383,10 @@ PerfLogImp::jobQueue(JobType const type)
|
||||
}
|
||||
std::lock_guard lock(counter->second.mutex);
|
||||
++counter->second.value.queued;
|
||||
|
||||
// Task 9.5: Record job enqueue in OTel metrics pipeline.
|
||||
if (auto* mr = app_.getMetricsRegistry())
|
||||
mr->recordJobQueued(JobTypes::name(type));
|
||||
}
|
||||
|
||||
void
|
||||
@@ -391,6 +413,10 @@ PerfLogImp::jobStart(
|
||||
std::lock_guard lock(counters_.jobsMutex_);
|
||||
if (instance >= 0 && instance < counters_.jobs_.size())
|
||||
counters_.jobs_[instance] = {type, startTime};
|
||||
|
||||
// Task 9.5: Record job start in OTel metrics pipeline.
|
||||
if (auto* mr = app_.getMetricsRegistry())
|
||||
mr->recordJobStarted(JobTypes::name(type), dur.count());
|
||||
}
|
||||
|
||||
void
|
||||
@@ -413,6 +439,10 @@ PerfLogImp::jobFinish(JobType const type, microseconds dur, int instance)
|
||||
std::lock_guard lock(counters_.jobsMutex_);
|
||||
if (instance >= 0 && instance < counters_.jobs_.size())
|
||||
counters_.jobs_[instance] = {jtINVALID, steady_time_point()};
|
||||
|
||||
// Task 9.5: Record job finish in OTel metrics pipeline.
|
||||
if (auto* mr = app_.getMetricsRegistry())
|
||||
mr->recordJobFinished(JobTypes::name(type), dur.count());
|
||||
}
|
||||
|
||||
void
|
||||
|
||||
473
src/xrpld/telemetry/MetricsRegistry.cpp
Normal file
473
src/xrpld/telemetry/MetricsRegistry.cpp
Normal file
@@ -0,0 +1,473 @@
|
||||
/** MetricsRegistry implementation — OpenTelemetry metric instruments for rippled.
|
||||
|
||||
This file contains:
|
||||
- Construction / destruction logic for the OTel MeterProvider pipeline.
|
||||
- Synchronous instrument creation (counters, histograms) for RPC, job
|
||||
queue, and NodeStore I/O metrics.
|
||||
- Observable gauge callback registration for cache hit rates, TxQ state,
|
||||
CountedObject instances, load factors, and NodeStore queue depth.
|
||||
- No-op stubs when XRPL_ENABLE_TELEMETRY is not defined.
|
||||
*/
|
||||
|
||||
#include <xrpld/telemetry/MetricsRegistry.h>
|
||||
|
||||
#ifdef XRPL_ENABLE_TELEMETRY
|
||||
|
||||
#include <xrpld/app/ledger/AcceptedLedger.h>
|
||||
#include <xrpld/app/ledger/LedgerMaster.h>
|
||||
#include <xrpld/app/misc/TxQ.h>
|
||||
|
||||
#include <xrpl/basics/CountedObject.h>
|
||||
#include <xrpl/core/ServiceRegistry.h>
|
||||
#include <xrpl/nodestore/Database.h>
|
||||
#include <xrpl/server/LoadFeeTrack.h>
|
||||
|
||||
#include <opentelemetry/exporters/otlp/otlp_http_metric_exporter_factory.h>
|
||||
#include <opentelemetry/exporters/otlp/otlp_http_metric_exporter_options.h>
|
||||
#include <opentelemetry/metrics/provider.h>
|
||||
#include <opentelemetry/sdk/metrics/export/periodic_exporting_metric_reader_factory.h>
|
||||
#include <opentelemetry/sdk/metrics/export/periodic_exporting_metric_reader_options.h>
|
||||
#include <opentelemetry/sdk/metrics/meter_provider.h>
|
||||
#include <opentelemetry/sdk/metrics/meter_provider_factory.h>
|
||||
|
||||
namespace metric_api = opentelemetry::metrics;
|
||||
namespace metric_sdk = opentelemetry::sdk::metrics;
|
||||
namespace otlp_http = opentelemetry::exporter::otlp;
|
||||
|
||||
#endif // XRPL_ENABLE_TELEMETRY
|
||||
|
||||
namespace xrpl {
|
||||
namespace telemetry {
|
||||
|
||||
MetricsRegistry::MetricsRegistry(bool enabled, ServiceRegistry& app, beast::Journal journal)
|
||||
: enabled_(enabled), app_(app), journal_(journal)
|
||||
{
|
||||
}
|
||||
|
||||
MetricsRegistry::~MetricsRegistry()
|
||||
{
|
||||
stop();
|
||||
}
|
||||
|
||||
void
|
||||
MetricsRegistry::start(std::string const& endpoint)
|
||||
{
|
||||
#ifdef XRPL_ENABLE_TELEMETRY
|
||||
if (!enabled_)
|
||||
return;
|
||||
|
||||
JLOG(journal_.info()) << "MetricsRegistry: starting, endpoint=" << endpoint;
|
||||
|
||||
// Configure OTLP/HTTP metric exporter.
|
||||
otlp_http::OtlpHttpMetricExporterOptions exporterOpts;
|
||||
exporterOpts.url = endpoint;
|
||||
auto exporter = otlp_http::OtlpHttpMetricExporterFactory::Create(exporterOpts);
|
||||
|
||||
// Configure periodic reader with 10-second export interval.
|
||||
metric_sdk::PeriodicExportingMetricReaderOptions readerOpts;
|
||||
readerOpts.export_interval_millis = std::chrono::milliseconds(10000);
|
||||
readerOpts.export_timeout_millis = std::chrono::milliseconds(5000);
|
||||
auto reader =
|
||||
metric_sdk::PeriodicExportingMetricReaderFactory::Create(std::move(exporter), readerOpts);
|
||||
|
||||
// Create MeterProvider and attach the reader.
|
||||
provider_ = std::make_shared<metric_sdk::MeterProvider>();
|
||||
provider_->AddMetricReader(std::move(reader));
|
||||
|
||||
// Get a meter for all rippled instruments.
|
||||
meter_ = provider_->GetMeter("rippled", "1.0.0");
|
||||
|
||||
// --- Create synchronous instruments ---
|
||||
|
||||
// RPC per-method counters and histogram.
|
||||
rpcStartedCounter_ =
|
||||
meter_->CreateUInt64Counter("rpc_method_started_total", "Total RPC method calls started");
|
||||
rpcFinishedCounter_ = meter_->CreateUInt64Counter(
|
||||
"rpc_method_finished_total", "Total RPC method calls completed successfully");
|
||||
rpcErroredCounter_ = meter_->CreateUInt64Counter(
|
||||
"rpc_method_errored_total", "Total RPC method calls that errored");
|
||||
rpcDurationHistogram_ = meter_->CreateDoubleHistogram(
|
||||
"rpc_method_duration_us", "RPC method execution time in microseconds");
|
||||
|
||||
// Job queue per-type counters and histograms.
|
||||
jobQueuedCounter_ = meter_->CreateUInt64Counter("job_queued_total", "Total jobs enqueued");
|
||||
jobStartedCounter_ = meter_->CreateUInt64Counter("job_started_total", "Total jobs started");
|
||||
jobFinishedCounter_ = meter_->CreateUInt64Counter("job_finished_total", "Total jobs completed");
|
||||
jobQueuedDurationHistogram_ = meter_->CreateDoubleHistogram(
|
||||
"job_queued_duration_us", "Time jobs spent waiting in the queue (microseconds)");
|
||||
jobRunningDurationHistogram_ = meter_->CreateDoubleHistogram(
|
||||
"job_running_duration_us", "Job execution time in microseconds");
|
||||
|
||||
// Register all observable (async) gauges.
|
||||
registerAsyncGauges();
|
||||
|
||||
JLOG(journal_.info()) << "MetricsRegistry: started successfully";
|
||||
#else
|
||||
(void)endpoint;
|
||||
#endif // XRPL_ENABLE_TELEMETRY
|
||||
}
|
||||
|
||||
void
|
||||
MetricsRegistry::stop()
|
||||
{
|
||||
#ifdef XRPL_ENABLE_TELEMETRY
|
||||
if (!provider_)
|
||||
return;
|
||||
|
||||
JLOG(journal_.info()) << "MetricsRegistry: stopping";
|
||||
|
||||
// Force-flush any pending metrics, then destroy the provider.
|
||||
// This stops the PeriodicExportingMetricReader, which in turn
|
||||
// stops invoking observable gauge callbacks. No explicit
|
||||
// RemoveCallback is needed — the provider destruction handles it.
|
||||
provider_->ForceFlush();
|
||||
provider_.reset();
|
||||
|
||||
JLOG(journal_.info()) << "MetricsRegistry: stopped";
|
||||
#endif // XRPL_ENABLE_TELEMETRY
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------
|
||||
// Synchronous instrument recording — RPC metrics (Task 9.4)
|
||||
// -----------------------------------------------------------------
|
||||
|
||||
void
|
||||
MetricsRegistry::recordRpcStarted(std::string_view method)
|
||||
{
|
||||
#ifdef XRPL_ENABLE_TELEMETRY
|
||||
if (!enabled_ || !rpcStartedCounter_)
|
||||
return;
|
||||
rpcStartedCounter_->Add(1, {{"method", std::string(method)}});
|
||||
#else
|
||||
(void)method;
|
||||
#endif
|
||||
}
|
||||
|
||||
void
|
||||
MetricsRegistry::recordRpcFinished(std::string_view method, std::int64_t durationUs)
|
||||
{
|
||||
#ifdef XRPL_ENABLE_TELEMETRY
|
||||
if (!enabled_ || !rpcFinishedCounter_)
|
||||
return;
|
||||
rpcFinishedCounter_->Add(1, {{"method", std::string(method)}});
|
||||
if (rpcDurationHistogram_)
|
||||
rpcDurationHistogram_->Record(
|
||||
static_cast<double>(durationUs), {{"method", std::string(method)}});
|
||||
#else
|
||||
(void)method;
|
||||
(void)durationUs;
|
||||
#endif
|
||||
}
|
||||
|
||||
void
|
||||
MetricsRegistry::recordRpcErrored(std::string_view method, std::int64_t durationUs)
|
||||
{
|
||||
#ifdef XRPL_ENABLE_TELEMETRY
|
||||
if (!enabled_ || !rpcErroredCounter_)
|
||||
return;
|
||||
rpcErroredCounter_->Add(1, {{"method", std::string(method)}});
|
||||
if (rpcDurationHistogram_)
|
||||
rpcDurationHistogram_->Record(
|
||||
static_cast<double>(durationUs), {{"method", std::string(method)}});
|
||||
#else
|
||||
(void)method;
|
||||
(void)durationUs;
|
||||
#endif
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------
|
||||
// Synchronous instrument recording — Job Queue metrics (Task 9.5)
|
||||
// -----------------------------------------------------------------
|
||||
|
||||
void
|
||||
MetricsRegistry::recordJobQueued(std::string_view jobType)
|
||||
{
|
||||
#ifdef XRPL_ENABLE_TELEMETRY
|
||||
if (!enabled_ || !jobQueuedCounter_)
|
||||
return;
|
||||
jobQueuedCounter_->Add(1, {{"job_type", std::string(jobType)}});
|
||||
#else
|
||||
(void)jobType;
|
||||
#endif
|
||||
}
|
||||
|
||||
void
|
||||
MetricsRegistry::recordJobStarted(std::string_view jobType, std::int64_t queuedDurUs)
|
||||
{
|
||||
#ifdef XRPL_ENABLE_TELEMETRY
|
||||
if (!enabled_ || !jobStartedCounter_)
|
||||
return;
|
||||
jobStartedCounter_->Add(1, {{"job_type", std::string(jobType)}});
|
||||
if (jobQueuedDurationHistogram_)
|
||||
jobQueuedDurationHistogram_->Record(
|
||||
static_cast<double>(queuedDurUs), {{"job_type", std::string(jobType)}});
|
||||
#else
|
||||
(void)jobType;
|
||||
(void)queuedDurUs;
|
||||
#endif
|
||||
}
|
||||
|
||||
void
|
||||
MetricsRegistry::recordJobFinished(std::string_view jobType, std::int64_t runningDurUs)
|
||||
{
|
||||
#ifdef XRPL_ENABLE_TELEMETRY
|
||||
if (!enabled_ || !jobFinishedCounter_)
|
||||
return;
|
||||
jobFinishedCounter_->Add(1, {{"job_type", std::string(jobType)}});
|
||||
if (jobRunningDurationHistogram_)
|
||||
jobRunningDurationHistogram_->Record(
|
||||
static_cast<double>(runningDurUs), {{"job_type", std::string(jobType)}});
|
||||
#else
|
||||
(void)jobType;
|
||||
(void)runningDurUs;
|
||||
#endif
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------
|
||||
// Observable gauge callbacks (Tasks 9.1, 9.2, 9.3, 9.6, 9.7)
|
||||
// -----------------------------------------------------------------
|
||||
|
||||
#ifdef XRPL_ENABLE_TELEMETRY
|
||||
|
||||
void
|
||||
MetricsRegistry::registerAsyncGauges()
|
||||
{
|
||||
// --- Task 9.2: Cache hit rate and size gauges ---
|
||||
cacheHitRateGauge_ =
|
||||
meter_->CreateDoubleObservableGauge("cache_metrics", "Cache hit rates and sizes");
|
||||
cacheHitRateGauge_->AddCallback(
|
||||
[](opentelemetry::metrics::ObserverResult result, void* state) {
|
||||
auto* self = static_cast<MetricsRegistry*>(state);
|
||||
auto& app = self->app_;
|
||||
|
||||
try
|
||||
{
|
||||
// SLE cache hit rate (0.0 - 1.0).
|
||||
auto sleRate = app.cachedSLEs().rate();
|
||||
opentelemetry::nostd::get<opentelemetry::nostd::shared_ptr<
|
||||
opentelemetry::metrics::ObserverResultT<double>>>(result)
|
||||
->Observe(sleRate, {{"metric", "SLE_hit_rate"}});
|
||||
|
||||
// Ledger cache hit rate.
|
||||
auto ledgerRate = app.getLedgerMaster().getCacheHitRate();
|
||||
opentelemetry::nostd::get<opentelemetry::nostd::shared_ptr<
|
||||
opentelemetry::metrics::ObserverResultT<double>>>(result)
|
||||
->Observe(ledgerRate, {{"metric", "ledger_hit_rate"}});
|
||||
|
||||
// AcceptedLedger cache hit rate.
|
||||
auto alRate = app.getAcceptedLedgerCache().getHitRate();
|
||||
opentelemetry::nostd::get<opentelemetry::nostd::shared_ptr<
|
||||
opentelemetry::metrics::ObserverResultT<double>>>(result)
|
||||
->Observe(alRate, {{"metric", "AL_hit_rate"}});
|
||||
|
||||
// TreeNode cache size.
|
||||
auto tnCacheSize = app.getNodeFamily().getTreeNodeCache()->getCacheSize();
|
||||
opentelemetry::nostd::get<opentelemetry::nostd::shared_ptr<
|
||||
opentelemetry::metrics::ObserverResultT<double>>>(result)
|
||||
->Observe(
|
||||
static_cast<double>(tnCacheSize), {{"metric", "treenode_cache_size"}});
|
||||
|
||||
// TreeNode track size.
|
||||
auto tnTrackSize = app.getNodeFamily().getTreeNodeCache()->getTrackSize();
|
||||
opentelemetry::nostd::get<opentelemetry::nostd::shared_ptr<
|
||||
opentelemetry::metrics::ObserverResultT<double>>>(result)
|
||||
->Observe(
|
||||
static_cast<double>(tnTrackSize), {{"metric", "treenode_track_size"}});
|
||||
|
||||
// FullBelow cache size.
|
||||
auto fbSize = app.getNodeFamily().getFullBelowCache()->size();
|
||||
opentelemetry::nostd::get<opentelemetry::nostd::shared_ptr<
|
||||
opentelemetry::metrics::ObserverResultT<double>>>(result)
|
||||
->Observe(static_cast<double>(fbSize), {{"metric", "fullbelow_size"}});
|
||||
}
|
||||
catch (...)
|
||||
{
|
||||
// Silently skip if services are not yet ready.
|
||||
}
|
||||
},
|
||||
this);
|
||||
|
||||
// --- Task 9.3: TxQ metrics gauges ---
|
||||
txqGauge_ = meter_->CreateDoubleObservableGauge("txq_metrics", "Transaction queue metrics");
|
||||
txqGauge_->AddCallback(
|
||||
[](opentelemetry::metrics::ObserverResult result, void* state) {
|
||||
auto* self = static_cast<MetricsRegistry*>(state);
|
||||
auto& app = self->app_;
|
||||
|
||||
try
|
||||
{
|
||||
auto const metrics = app.getTxQ().getMetrics(*app.openLedger().current());
|
||||
|
||||
auto observe = [&](char const* name, double value) {
|
||||
opentelemetry::nostd::get<opentelemetry::nostd::shared_ptr<
|
||||
opentelemetry::metrics::ObserverResultT<double>>>(result)
|
||||
->Observe(value, {{"metric", name}});
|
||||
};
|
||||
|
||||
observe("txq_count", static_cast<double>(metrics.txCount));
|
||||
observe(
|
||||
"txq_max_size",
|
||||
metrics.txQMaxSize ? static_cast<double>(*metrics.txQMaxSize) : 0.0);
|
||||
observe("txq_in_ledger", static_cast<double>(metrics.txInLedger));
|
||||
observe("txq_per_ledger", static_cast<double>(metrics.txPerLedger));
|
||||
observe(
|
||||
"txq_reference_fee_level",
|
||||
static_cast<double>(metrics.referenceFeeLevel.fee()));
|
||||
observe(
|
||||
"txq_min_processing_fee_level",
|
||||
static_cast<double>(metrics.minProcessingFeeLevel.fee()));
|
||||
observe("txq_med_fee_level", static_cast<double>(metrics.medFeeLevel.fee()));
|
||||
observe(
|
||||
"txq_open_ledger_fee_level",
|
||||
static_cast<double>(metrics.openLedgerFeeLevel.fee()));
|
||||
}
|
||||
catch (...)
|
||||
{
|
||||
// Silently skip if TxQ or OpenLedger are not yet ready.
|
||||
}
|
||||
},
|
||||
this);
|
||||
|
||||
// --- Task 9.6: Counted object instance gauges ---
|
||||
objectCountGauge_ = meter_->CreateInt64ObservableGauge(
|
||||
"object_count", "Live instance counts for key internal object types");
|
||||
objectCountGauge_->AddCallback(
|
||||
[](opentelemetry::metrics::ObserverResult result, void* /* state */) {
|
||||
try
|
||||
{
|
||||
// Iterate through all CountedObject types via the linked
|
||||
// list in CountedObjects. We report all types with count
|
||||
// > 0, filtering to the key types of interest.
|
||||
auto counts = CountedObjects::getInstance().getCounts(0);
|
||||
for (auto const& [name, count] : counts)
|
||||
{
|
||||
opentelemetry::nostd::get<opentelemetry::nostd::shared_ptr<
|
||||
opentelemetry::metrics::ObserverResultT<int64_t>>>(result)
|
||||
->Observe(static_cast<int64_t>(count), {{"type", name}});
|
||||
}
|
||||
}
|
||||
catch (...)
|
||||
{
|
||||
// Silently skip on error.
|
||||
}
|
||||
},
|
||||
this);
|
||||
|
||||
// --- Task 9.7: Load factor breakdown gauges ---
|
||||
loadFactorGauge_ =
|
||||
meter_->CreateDoubleObservableGauge("load_factor_metrics", "Fee load factor breakdown");
|
||||
loadFactorGauge_->AddCallback(
|
||||
[](opentelemetry::metrics::ObserverResult result, void* state) {
|
||||
auto* self = static_cast<MetricsRegistry*>(state);
|
||||
auto& app = self->app_;
|
||||
|
||||
try
|
||||
{
|
||||
auto& feeTrack = app.getFeeTrack();
|
||||
auto const loadBase = static_cast<double>(feeTrack.getLoadBase());
|
||||
|
||||
auto observe = [&](char const* name, double value) {
|
||||
opentelemetry::nostd::get<opentelemetry::nostd::shared_ptr<
|
||||
opentelemetry::metrics::ObserverResultT<double>>>(result)
|
||||
->Observe(value, {{"metric", name}});
|
||||
};
|
||||
|
||||
// Combined load factor (server component).
|
||||
observe(
|
||||
"load_factor_server", static_cast<double>(feeTrack.getLoadFactor()) / loadBase);
|
||||
|
||||
// Individual factor components.
|
||||
observe(
|
||||
"load_factor_local", static_cast<double>(feeTrack.getLocalFee()) / loadBase);
|
||||
observe("load_factor_net", static_cast<double>(feeTrack.getRemoteFee()) / loadBase);
|
||||
observe(
|
||||
"load_factor_cluster",
|
||||
static_cast<double>(feeTrack.getClusterFee()) / loadBase);
|
||||
|
||||
// Fee escalation factors from TxQ.
|
||||
auto const metrics = app.getTxQ().getMetrics(*app.openLedger().current());
|
||||
auto refLevel = static_cast<double>(metrics.referenceFeeLevel.fee());
|
||||
if (refLevel > 0)
|
||||
{
|
||||
observe(
|
||||
"load_factor_fee_escalation",
|
||||
static_cast<double>(metrics.openLedgerFeeLevel.fee()) / refLevel);
|
||||
observe(
|
||||
"load_factor_fee_queue",
|
||||
static_cast<double>(metrics.minProcessingFeeLevel.fee()) / refLevel);
|
||||
}
|
||||
|
||||
// Combined load factor (max of server and fee escalation).
|
||||
auto const loadFactorServer = feeTrack.getLoadFactor();
|
||||
auto const loadBaseServer = feeTrack.getLoadBase();
|
||||
double combined = static_cast<double>(loadFactorServer) / loadBase;
|
||||
if (refLevel > 0)
|
||||
{
|
||||
double feeEscalation = static_cast<double>(metrics.openLedgerFeeLevel.fee()) *
|
||||
loadBaseServer / refLevel;
|
||||
if (feeEscalation > static_cast<double>(loadFactorServer))
|
||||
{
|
||||
combined = feeEscalation / loadBase;
|
||||
}
|
||||
}
|
||||
observe("load_factor", combined);
|
||||
}
|
||||
catch (...)
|
||||
{
|
||||
// Silently skip if services are not yet ready.
|
||||
}
|
||||
},
|
||||
this);
|
||||
|
||||
// --- Task 9.1: NodeStore I/O gauges ---
|
||||
// The cumulative counters (reads, writes, bytes) are also exposed here
|
||||
// as observable gauges. This avoids adding an xrpld dependency into the
|
||||
// libxrpl nodestore code — the MetricsRegistry reads the existing atomic
|
||||
// counters from Database via its public accessors.
|
||||
nodeStoreGauge_ = meter_->CreateInt64ObservableGauge(
|
||||
"nodestore_state", "NodeStore I/O counters, queue depth, and write load");
|
||||
nodeStoreGauge_->AddCallback(
|
||||
[](opentelemetry::metrics::ObserverResult result, void* state) {
|
||||
auto* self = static_cast<MetricsRegistry*>(state);
|
||||
auto& app = self->app_;
|
||||
|
||||
try
|
||||
{
|
||||
auto& db = app.getNodeStore();
|
||||
|
||||
auto observe = [&](char const* name, int64_t value) {
|
||||
opentelemetry::nostd::get<opentelemetry::nostd::shared_ptr<
|
||||
opentelemetry::metrics::ObserverResultT<int64_t>>>(result)
|
||||
->Observe(value, {{"metric", name}});
|
||||
};
|
||||
|
||||
// Cumulative counters (monotonically increasing).
|
||||
observe("node_reads_total", static_cast<int64_t>(db.getFetchTotalCount()));
|
||||
observe("node_reads_hit", static_cast<int64_t>(db.getFetchHitCount()));
|
||||
observe("node_writes", static_cast<int64_t>(db.getStoreCount()));
|
||||
observe("node_written_bytes", static_cast<int64_t>(db.getStoreSize()));
|
||||
observe("node_read_bytes", static_cast<int64_t>(db.getFetchSize()));
|
||||
|
||||
// Write load score (instantaneous).
|
||||
observe("write_load", static_cast<int64_t>(db.getWriteLoad()));
|
||||
|
||||
// Read queue depth (instantaneous).
|
||||
Json::Value obj(Json::objectValue);
|
||||
db.getCountsJson(obj);
|
||||
if (obj.isMember("read_queue"))
|
||||
{
|
||||
observe("read_queue", static_cast<int64_t>(obj["read_queue"].asUInt()));
|
||||
}
|
||||
}
|
||||
catch (...)
|
||||
{
|
||||
// Silently skip on error.
|
||||
}
|
||||
},
|
||||
this);
|
||||
}
|
||||
|
||||
#endif // XRPL_ENABLE_TELEMETRY
|
||||
|
||||
} // namespace telemetry
|
||||
} // namespace xrpl
|
||||
280
src/xrpld/telemetry/MetricsRegistry.h
Normal file
280
src/xrpld/telemetry/MetricsRegistry.h
Normal file
@@ -0,0 +1,280 @@
|
||||
#pragma once
|
||||
|
||||
/** Central OTel Metrics Registry for rippled.
|
||||
|
||||
Owns all OpenTelemetry metric instruments (counters, histograms,
|
||||
observable gauges) that are NOT already covered by the beast::insight
|
||||
StatsD pipeline. The instruments are created once at startup and polled
|
||||
by the OTel PeriodicExportingMetricReader at a configurable interval
|
||||
(default 10 s).
|
||||
|
||||
When XRPL_ENABLE_TELEMETRY is **not** defined, this class compiles to a
|
||||
lightweight no-op: every public method is an empty inline.
|
||||
|
||||
Dependency / ownership diagram (ASCII):
|
||||
|
||||
Application
|
||||
|
|
||||
+-- MetricsRegistry (unique_ptr, created in setup(), started/stopped with telemetry)
|
||||
|
|
||||
+-- OTel MeterProvider (owns reader + exporter)
|
||||
| |
|
||||
| +-- PeriodicExportingMetricReader
|
||||
| +-- OtlpHttpMetricExporter
|
||||
|
|
||||
+-- Counters / Histograms (synchronous instruments)
|
||||
| +-- rpc_method_started_total
|
||||
| +-- rpc_method_finished_total
|
||||
| +-- rpc_method_errored_total
|
||||
| +-- rpc_method_duration_us (Histogram)
|
||||
| +-- job_queued_total
|
||||
| +-- job_started_total
|
||||
| +-- job_finished_total
|
||||
| +-- job_queued_duration_us (Histogram)
|
||||
| +-- job_running_duration_us (Histogram)
|
||||
|
|
||||
+-- Observable Gauges (async callbacks, polled by reader)
|
||||
+-- Cache hit rates (SLE, ledger, AL)
|
||||
+-- TreeNode / FullBelow sizes
|
||||
+-- TxQ metrics
|
||||
+-- CountedObject counts
|
||||
+-- Load factor breakdown
|
||||
+-- NodeStore I/O gauges
|
||||
|
||||
Control-flow for async gauges:
|
||||
|
||||
PeriodicExportingMetricReader (background thread, 10 s tick)
|
||||
|
|
||||
v
|
||||
OTel SDK invokes registered ObservableGauge callbacks
|
||||
|
|
||||
v
|
||||
Each callback reads current value from Application services
|
||||
(e.g. app.getTxQ().getMetrics(), app.getFeeTrack().getLoadFactor())
|
||||
|
|
||||
v
|
||||
Result set is exported via OTLP/HTTP to the collector
|
||||
|
||||
Control-flow for synchronous instruments:
|
||||
|
||||
PerfLogImp::rpcStart/rpcEnd/jobQueue/jobStart/jobFinish
|
||||
|
|
||||
v
|
||||
MetricsRegistry::recordRpc*(method, ...) / recordJob*(type, ...)
|
||||
|
|
||||
v
|
||||
OTel Counter::Add() or Histogram::Record()
|
||||
|
|
||||
v
|
||||
Periodically flushed by the MetricReader
|
||||
|
||||
Example usage:
|
||||
|
||||
@code
|
||||
// In Application::setup(), after telemetry_ is created:
|
||||
metricsRegistry_ = std::make_unique<telemetry::MetricsRegistry>(
|
||||
telemetry_->isEnabled(), app, journal);
|
||||
metricsRegistry_->start(setup.exporterEndpoint);
|
||||
|
||||
// In PerfLogImp::rpcStart():
|
||||
if (auto* mr = app_.getMetricsRegistry())
|
||||
mr->recordRpcStarted("server_info");
|
||||
|
||||
// In PerfLogImp::rpcEnd():
|
||||
if (auto* mr = app_.getMetricsRegistry())
|
||||
{
|
||||
mr->recordRpcFinished("server_info", durationUs);
|
||||
// or: mr->recordRpcErrored("server_info", durationUs);
|
||||
}
|
||||
|
||||
// In PerfLogImp::jobQueue():
|
||||
if (auto* mr = app_.getMetricsRegistry())
|
||||
mr->recordJobQueued("ledgerData");
|
||||
|
||||
// Shutdown:
|
||||
metricsRegistry_->stop();
|
||||
@endcode
|
||||
|
||||
Caveats:
|
||||
- The MetricsRegistry must be created AFTER the Telemetry object because
|
||||
it reads isEnabled() to decide whether to initialize the OTel SDK.
|
||||
- Observable gauge callbacks capture a reference to the Application; the
|
||||
Application must outlive the MetricsRegistry (guaranteed because
|
||||
MetricsRegistry is stopped before Application teardown).
|
||||
- If a new CountedObject type is added, it will NOT appear automatically
|
||||
in the object_count gauge; the callback iterates a fixed list.
|
||||
- Adding a new synchronous instrument requires updating both the header
|
||||
and the .cpp, then calling the new record*() method from the
|
||||
instrumentation site.
|
||||
*/
|
||||
|
||||
#include <xrpl/beast/utility/Journal.h>
|
||||
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
|
||||
#ifdef XRPL_ENABLE_TELEMETRY
|
||||
#include <opentelemetry/metrics/meter.h>
|
||||
#include <opentelemetry/metrics/meter_provider.h>
|
||||
#include <opentelemetry/metrics/observer_result.h>
|
||||
#include <opentelemetry/sdk/metrics/meter_provider.h>
|
||||
#endif
|
||||
|
||||
namespace xrpl {
|
||||
|
||||
class ServiceRegistry;
|
||||
|
||||
namespace telemetry {
|
||||
|
||||
class MetricsRegistry
|
||||
{
|
||||
public:
|
||||
/** Construct a MetricsRegistry.
|
||||
|
||||
@param enabled Whether OTel metric export is active. When false,
|
||||
all methods become no-ops.
|
||||
@param app Reference to the ServiceRegistry (Application) for
|
||||
reading current metric values in gauge callbacks.
|
||||
@param journal Journal for log output.
|
||||
*/
|
||||
MetricsRegistry(bool enabled, ServiceRegistry& app, beast::Journal journal);
|
||||
|
||||
~MetricsRegistry();
|
||||
|
||||
/// Non-copyable, non-movable.
|
||||
MetricsRegistry(MetricsRegistry const&) = delete;
|
||||
MetricsRegistry&
|
||||
operator=(MetricsRegistry const&) = delete;
|
||||
|
||||
/** Initialize the OTel metrics pipeline and register all instruments.
|
||||
|
||||
@param endpoint OTLP/HTTP endpoint URL for metric export
|
||||
(e.g. "http://localhost:4318/v1/metrics").
|
||||
*/
|
||||
void
|
||||
start(std::string const& endpoint);
|
||||
|
||||
/** Flush pending metrics and shut down the pipeline. */
|
||||
void
|
||||
stop();
|
||||
|
||||
/** @return true if the registry is actively exporting metrics. */
|
||||
bool
|
||||
isEnabled() const noexcept
|
||||
{
|
||||
return enabled_;
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------
|
||||
// Synchronous instrument recording (called from PerfLog hot paths)
|
||||
// -----------------------------------------------------------------
|
||||
|
||||
/** Record an RPC method call start.
|
||||
@param method The RPC method name (e.g. "server_info").
|
||||
*/
|
||||
void
|
||||
recordRpcStarted(std::string_view method);
|
||||
|
||||
/** Record an RPC method call completion.
|
||||
@param method The RPC method name.
|
||||
@param durationUs Execution time in microseconds.
|
||||
*/
|
||||
void
|
||||
recordRpcFinished(std::string_view method, std::int64_t durationUs);
|
||||
|
||||
/** Record an RPC method call error.
|
||||
@param method The RPC method name.
|
||||
@param durationUs Execution time in microseconds.
|
||||
*/
|
||||
void
|
||||
recordRpcErrored(std::string_view method, std::int64_t durationUs);
|
||||
|
||||
/** Record a job enqueued event.
|
||||
@param jobType The job type name (e.g. "ledgerData").
|
||||
*/
|
||||
void
|
||||
recordJobQueued(std::string_view jobType);
|
||||
|
||||
/** Record a job start event.
|
||||
@param jobType The job type name.
|
||||
@param queuedDurUs Time the job spent waiting in the queue (us).
|
||||
*/
|
||||
void
|
||||
recordJobStarted(std::string_view jobType, std::int64_t queuedDurUs);
|
||||
|
||||
/** Record a job finish event.
|
||||
@param jobType The job type name.
|
||||
@param runningDurUs Execution time in microseconds.
|
||||
*/
|
||||
void
|
||||
recordJobFinished(std::string_view jobType, std::int64_t runningDurUs);
|
||||
|
||||
private:
|
||||
/// Master enable flag; when false all methods are no-ops.
|
||||
bool const enabled_;
|
||||
|
||||
/// Reference to Application services for gauge callbacks.
|
||||
ServiceRegistry& app_;
|
||||
|
||||
/// Journal for logging.
|
||||
beast::Journal const journal_;
|
||||
|
||||
#ifdef XRPL_ENABLE_TELEMETRY
|
||||
/// The SDK MeterProvider that owns the export pipeline.
|
||||
std::shared_ptr<opentelemetry::sdk::metrics::MeterProvider> provider_;
|
||||
|
||||
/// The Meter used to create all instruments.
|
||||
opentelemetry::nostd::shared_ptr<opentelemetry::metrics::Meter> meter_;
|
||||
|
||||
// --- Synchronous instruments (RPC) ---
|
||||
/// Counter: rpc_method_started_total{method="<name>"}
|
||||
opentelemetry::nostd::unique_ptr<opentelemetry::metrics::Counter<uint64_t>> rpcStartedCounter_;
|
||||
/// Counter: rpc_method_finished_total{method="<name>"}
|
||||
opentelemetry::nostd::unique_ptr<opentelemetry::metrics::Counter<uint64_t>> rpcFinishedCounter_;
|
||||
/// Counter: rpc_method_errored_total{method="<name>"}
|
||||
opentelemetry::nostd::unique_ptr<opentelemetry::metrics::Counter<uint64_t>> rpcErroredCounter_;
|
||||
/// Histogram: rpc_method_duration_us{method="<name>"}
|
||||
opentelemetry::nostd::unique_ptr<opentelemetry::metrics::Histogram<double>>
|
||||
rpcDurationHistogram_;
|
||||
|
||||
// --- Synchronous instruments (Job Queue) ---
|
||||
/// Counter: job_queued_total{job_type="<name>"}
|
||||
opentelemetry::nostd::unique_ptr<opentelemetry::metrics::Counter<uint64_t>> jobQueuedCounter_;
|
||||
/// Counter: job_started_total{job_type="<name>"}
|
||||
opentelemetry::nostd::unique_ptr<opentelemetry::metrics::Counter<uint64_t>> jobStartedCounter_;
|
||||
/// Counter: job_finished_total{job_type="<name>"}
|
||||
opentelemetry::nostd::unique_ptr<opentelemetry::metrics::Counter<uint64_t>> jobFinishedCounter_;
|
||||
/// Histogram: job_queued_duration_us{job_type="<name>"}
|
||||
opentelemetry::nostd::unique_ptr<opentelemetry::metrics::Histogram<double>>
|
||||
jobQueuedDurationHistogram_;
|
||||
/// Histogram: job_running_duration_us{job_type="<name>"}
|
||||
opentelemetry::nostd::unique_ptr<opentelemetry::metrics::Histogram<double>>
|
||||
jobRunningDurationHistogram_;
|
||||
|
||||
// --- Observable gauges (registered via callbacks) ---
|
||||
// Handles are stored so we can remove callbacks on shutdown.
|
||||
/// Observable gauges for cache hit rates and sizes.
|
||||
opentelemetry::nostd::shared_ptr<opentelemetry::metrics::ObservableInstrument>
|
||||
cacheHitRateGauge_;
|
||||
/// Observable gauges for TxQ metrics.
|
||||
opentelemetry::nostd::shared_ptr<opentelemetry::metrics::ObservableInstrument> txqGauge_;
|
||||
/// Observable gauges for counted object instances.
|
||||
opentelemetry::nostd::shared_ptr<opentelemetry::metrics::ObservableInstrument>
|
||||
objectCountGauge_;
|
||||
/// Observable gauges for load factor breakdown.
|
||||
opentelemetry::nostd::shared_ptr<opentelemetry::metrics::ObservableInstrument> loadFactorGauge_;
|
||||
/// Observable gauges for NodeStore write_load and read_queue.
|
||||
opentelemetry::nostd::shared_ptr<opentelemetry::metrics::ObservableInstrument> nodeStoreGauge_;
|
||||
|
||||
/** Register all observable gauge callbacks with the OTel SDK.
|
||||
Called once during start().
|
||||
*/
|
||||
void
|
||||
registerAsyncGauges();
|
||||
#endif // XRPL_ENABLE_TELEMETRY
|
||||
};
|
||||
|
||||
} // namespace telemetry
|
||||
} // namespace xrpl
|
||||
Reference in New Issue
Block a user