From f3a095ab653fdbbf1bf9f1fc9a94837d0552c535 Mon Sep 17 00:00:00 2001 From: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> Date: Thu, 14 May 2026 16:09:48 +0100 Subject: [PATCH 1/8] docs(telemetry): align Phase 1a plan docs with Phase 1b implementation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase-1a plan documents advertised OTLP/gRPC on port 4317 as the default exporter, four unparsed [telemetry] config keys, and "Phase 4a Complete" status with exit-criteria checkboxes marked done. Every downstream branch through Phase 5 ships only OTLP/HTTP on port 4318 via OtlpHttpExporterFactory, never parses the advertised keys, and the Phase 4 work is not yet delivered. Fixes: - 02-design-decisions.md: flip §2.1.1 SDK dependency recommendations to OTLP/HTTP (shipped) with OTLP/gRPC marked Future. Update §2.2 architecture diagram and text from OTLP/gRPC:4317 to OTLP/HTTP:4318. Rewrite §2.2.1 as "OTLP/HTTP (Shipped)" and §2.2.2 as "OTLP/gRPC (Future Work — Planned Upgrade)" with a concrete checklist (Conan dep, config parsing, factory branch, runbook/dashboard updates) for landing the gRPC transport later. - 05-configuration-reference.md: drop the fabricated exporter/otlp_grpc key and the :4317 default from the sample config block and the options-summary table. Move trace_pathfind, trace_txq, trace_validator, trace_amendment into a new "Planned (not yet implemented)" table citing the phase that will add each one. Keep the example config minimal so copy-paste does not produce a silently-ignored stanza. - 06-implementation-phases.md: reset Phase 4 Exit Criteria checkboxes from [x] to [ ] (Phase 4 is not shipped at Phase-1a time). Rename "Phase 4a Complete" to "Phase 4a Plan" and describe the work as future. Replace the broken forward link to Phase4_taskList.md (introduced in the Phase 2 PR) with a sentence pointing readers to where that spec will land. 
Renumber the final section 6.12 to 6.11 so it sits directly after 6.10; section 6.11 ("Effort Summary") was intentionally removed in earlier edits. --- OpenTelemetryPlan/02-design-decisions.md | 74 ++++++++++++------- .../05-configuration-reference.md | 70 ++++++++++-------- OpenTelemetryPlan/06-implementation-phases.md | 20 ++--- 3 files changed, 97 insertions(+), 67 deletions(-) diff --git a/OpenTelemetryPlan/02-design-decisions.md b/OpenTelemetryPlan/02-design-decisions.md index fe87fc78db..184b40bca2 100644 --- a/OpenTelemetryPlan/02-design-decisions.md +++ b/OpenTelemetryPlan/02-design-decisions.md @@ -13,13 +13,13 @@ **Primary Choice**: OpenTelemetry C++ SDK (`opentelemetry-cpp`) -| Component | Purpose | Required | -| --------------------------------------- | ---------------------- | ----------- | -| `opentelemetry-cpp::api` | Tracing API headers | Yes | -| `opentelemetry-cpp::sdk` | SDK implementation | Yes | -| `opentelemetry-cpp::ext` | Extensions (exporters) | Yes | -| `opentelemetry-cpp::otlp_grpc_exporter` | OTLP/gRPC export | Recommended | -| `opentelemetry-cpp::otlp_http_exporter` | OTLP/HTTP export | Alternative | +| Component | Purpose | Required | +| --------------------------------------- | ---------------------- | ------------------------- | +| `opentelemetry-cpp::api` | Tracing API headers | Yes | +| `opentelemetry-cpp::sdk` | SDK implementation | Yes | +| `opentelemetry-cpp::ext` | Extensions (exporters) | Yes | +| `opentelemetry-cpp::otlp_http_exporter` | OTLP/HTTP export | Yes (shipped in Phase 1b) | +| `opentelemetry-cpp::otlp_grpc_exporter` | OTLP/gRPC export | Future (not yet wired up) | ### 2.1.2 Instrumentation Strategy @@ -51,9 +51,9 @@ flowchart TB elastic["Elastic
APM"] end - node1 -->|"OTLP/gRPC
:4317"| collector - node2 -->|"OTLP/gRPC
:4317"| collector - node3 -->|"OTLP/gRPC
:4317"| collector + node1 -->|"OTLP/HTTP
:4318"| collector + node2 -->|"OTLP/HTTP
:4318"| collector + node3 -->|"OTLP/HTTP
:4318"| collector collector --> tempo collector --> elastic @@ -65,27 +65,15 @@ flowchart TB **Reading the diagram:** -- **xrpld Nodes (blue)**: The source of telemetry data. Each xrpld node exports spans via OTLP/gRPC on port 4317. +- **xrpld Nodes (blue)**: The source of telemetry data. Each xrpld node exports spans via OTLP/HTTP on port 4318 (the only exporter shipped in Phase 1b). - **OpenTelemetry Collector (red)**: The central aggregation point that receives spans from all nodes. Can run as a sidecar (per-node) or standalone (shared). Handles batching, filtering, and routing. - **Observability Backends (green)**: The storage and visualization destinations. Tempo is the recommended backend for both development and production, and Elastic APM is an alternative. The Collector routes to one or more backends. -- **Arrows (nodes to collector to backends)**: The data pipeline -- spans flow from nodes to the Collector over gRPC, then the Collector fans out to the configured backends. +- **Arrows (nodes to collector to backends)**: The data pipeline -- spans flow from nodes to the Collector over HTTP, then the Collector fans out to the configured backends. -### 2.2.1 OTLP/gRPC (Recommended) +### 2.2.1 OTLP/HTTP (Shipped in Phase 1b) ```cpp -// Configuration for OTLP over gRPC -namespace otlp = opentelemetry::exporter::otlp; - -otlp::OtlpGrpcExporterOptions opts; -opts.endpoint = "localhost:4317"; -opts.useTls = true; -opts.sslCaCertPath = "/path/to/ca.crt"; -``` - -### 2.2.2 OTLP/HTTP (Alternative) - -```cpp -// Configuration for OTLP over HTTP +// Configuration for OTLP over HTTP (the only exporter currently wired up). namespace otlp = opentelemetry::exporter::otlp; otlp::OtlpHttpExporterOptions opts; @@ -93,6 +81,40 @@ opts.url = "http://localhost:4318/v1/traces"; opts.content_type = otlp::HttpRequestContentType::kJson; // or kBinary ``` +### 2.2.2 OTLP/gRPC (Future Work — Planned Upgrade) + +OTLP/gRPC is planned as a future upgrade from the HTTP exporter. 
The gRPC +transport offers lower per-span overhead and tighter back-pressure semantics +than HTTP/JSON, making it attractive for production deployments once the HTTP +path is validated in earlier phases. + +Required to land this upgrade: + +1. Add `opentelemetry-cpp::otlp_grpc_exporter` to the Conan recipe (the + dependency already exists but is not linked in Phase 1b builds). +2. Extend `TelemetryConfig.cpp` to parse an `exporter` key (`otlp_http` + default, `otlp_grpc` opt-in) and a gRPC endpoint override. +3. In `Telemetry::start()` branch on the parsed exporter type and construct + either `OtlpHttpExporterFactory::Create(httpOpts)` or + `OtlpGrpcExporterFactory::Create(grpcOpts)` accordingly. +4. Update the runbook and dashboards to document the alternate port and TLS + settings. + +Example Phase 1b+ gRPC configuration (when wired up): + +```cpp +// Configuration for OTLP over gRPC (future work). +namespace otlp = opentelemetry::exporter::otlp; + +otlp::OtlpGrpcExporterOptions opts; +opts.endpoint = ":4317"; +opts.use_ssl_credentials = true; +opts.ssl_credentials_cacert_path = "/path/to/ca.crt"; +``` + +Until that work lands, `OtlpGrpcExporterOptions` is **not** used by any code +path in Phase 1b through Phase 5. + --- ## 2.3 Span Naming Conventions diff --git a/OpenTelemetryPlan/05-configuration-reference.md b/OpenTelemetryPlan/05-configuration-reference.md index 56627c3b6c..6c7161be7b 100644 --- a/OpenTelemetryPlan/05-configuration-reference.md +++ b/OpenTelemetryPlan/05-configuration-reference.md @@ -26,11 +26,10 @@ Add to `cfg/xrpld-example.cfg`: # # Enable/disable telemetry (default: 0 = disabled) # enabled=1 # -# # Exporter type: "otlp_grpc" (default), "otlp_http", or "none" -# exporter=otlp_grpc -# -# # OTLP endpoint (default: localhost:4317 for gRPC, localhost:4318 for HTTP) -# endpoint=localhost:4317 +# # OTLP endpoint (default: http://localhost:4318/v1/traces - OTLP/HTTP) +# # Note: only OTLP/HTTP is shipped in Phase 1b. 
OTLP/gRPC support is +# # planned as future work and is not yet parsed by TelemetryConfig.cpp. +# endpoint=http://localhost:4318/v1/traces # # # Use TLS for exporter connection (default: 0) # use_tls=0 @@ -56,10 +55,12 @@ Add to `cfg/xrpld-example.cfg`: # trace_rpc=1 # RPC request handling # trace_peer=0 # Peer messages (high volume, disabled by default) # trace_ledger=1 # Ledger acquisition and building -# trace_pathfind=1 # Path computation (can be expensive) -# trace_txq=1 # Transaction queue and fee escalation -# trace_validator=0 # Validator list and manifest updates (low volume) -# trace_amendment=0 # Amendment voting (very low volume) +# +# # Planned (not yet parsed by TelemetryConfig.cpp): +# # trace_pathfind=1 # Path computation (Phase 2) +# # trace_txq=1 # Transaction queue (Phase 3) +# # trace_validator=0 # Validator list / manifest (future) +# # trace_amendment=0 # Amendment voting (future) # # # Service identification (automatically detected if not specified) # # service_name=xrpld @@ -71,28 +72,35 @@ enabled=0 ### 5.1.2 Configuration Options Summary -| Option | Type | Default | Description | -| --------------------- | ------ | ---------------- | ----------------------------------------- | -| `enabled` | bool | `false` | Enable/disable telemetry | -| `exporter` | string | `"otlp_grpc"` | Exporter type: otlp_grpc, otlp_http, none | -| `endpoint` | string | `localhost:4317` | OTLP collector endpoint | -| `use_tls` | bool | `false` | Enable TLS for exporter connection | -| `tls_ca_cert` | string | `""` | Path to CA certificate file | -| `sampling_ratio` | float | `1.0` | Sampling ratio (0.0-1.0) | -| `batch_size` | uint | `512` | Spans per export batch | -| `batch_delay_ms` | uint | `5000` | Max delay before sending batch (ms) | -| `max_queue_size` | uint | `2048` | Maximum queued spans | -| `trace_transactions` | bool | `true` | Enable transaction tracing | -| `trace_consensus` | bool | `true` | Enable consensus tracing | -| `trace_rpc` | bool | `true` | 
Enable RPC tracing | -| `trace_peer` | bool | `false` | Enable peer message tracing (high volume) | -| `trace_ledger` | bool | `true` | Enable ledger tracing | -| `trace_pathfind` | bool | `true` | Enable path computation tracing | -| `trace_txq` | bool | `true` | Enable transaction queue tracing | -| `trace_validator` | bool | `false` | Enable validator list/manifest tracing | -| `trace_amendment` | bool | `false` | Enable amendment voting tracing | -| `service_name` | string | `"xrpld"` | Service name for traces | -| `service_instance_id` | string | `` | Instance identifier | +| Option | Type | Default | Description | +| --------------------- | ------ | --------------------------------- | ----------------------------------------- | +| `enabled` | bool | `false` | Enable/disable telemetry | +| `endpoint` | string | `http://localhost:4318/v1/traces` | OTLP/HTTP collector endpoint | +| `use_tls` | bool | `false` | Enable TLS for exporter connection | +| `tls_ca_cert` | string | `""` | Path to CA certificate file | +| `sampling_ratio` | float | `1.0` | Sampling ratio (0.0-1.0) | +| `batch_size` | uint | `512` | Spans per export batch | +| `batch_delay_ms` | uint | `5000` | Max delay before sending batch (ms) | +| `max_queue_size` | uint | `2048` | Maximum queued spans | +| `trace_transactions` | bool | `true` | Enable transaction tracing | +| `trace_consensus` | bool | `true` | Enable consensus tracing | +| `trace_rpc` | bool | `true` | Enable RPC tracing | +| `trace_peer` | bool | `false` | Enable peer message tracing (high volume) | +| `trace_ledger` | bool | `true` | Enable ledger tracing | +| `service_name` | string | `"xrpld"` | Service name for traces | +| `service_instance_id` | string | `` | Instance identifier | + +**Planned (not yet implemented)**: the following options appear in the design +documents but are not parsed by `TelemetryConfig.cpp` in Phase 1b and later +phases. 
They will be added as the corresponding subsystems are instrumented: + +| Option | Planned Phase | Purpose | +| ----------------- | ------------- | ---------------------------------------- | +| `exporter` | Future | Select between OTLP/HTTP and OTLP/gRPC | +| `trace_pathfind` | Phase 2 | Path computation tracing toggle | +| `trace_txq` | Phase 3 | Transaction queue tracing toggle | +| `trace_validator` | Future | Validator list / manifest update tracing | +| `trace_amendment` | Future | Amendment voting tracing | --- diff --git a/OpenTelemetryPlan/06-implementation-phases.md b/OpenTelemetryPlan/06-implementation-phases.md index ccf1fd54d4..12eea9c67b 100644 --- a/OpenTelemetryPlan/06-implementation-phases.md +++ b/OpenTelemetryPlan/06-implementation-phases.md @@ -166,22 +166,21 @@ gantt ### Exit Criteria -- [x] Complete consensus round traces -- [x] Phase transitions visible -- [x] Proposals and validations traced -- [x] No impact on consensus timing +- [ ] Complete consensus round traces +- [ ] Phase transitions visible +- [ ] Proposals and validations traced +- [ ] No impact on consensus timing - [ ] Multi-validator test network validated -### Implementation Status — Phase 4a Complete +### Implementation Status — Phase 4a Plan -Phase 4a (establish-phase gap fill & cross-node correlation) adds: +Phase 4a (establish-phase gap fill & cross-node correlation) will add: - **Deterministic trace ID** derived from `previousLedger.id()` so all validators in the same round share the same `trace_id` (switchable via `consensus_trace_strategy` config: `"deterministic"` or `"attribute"`). See [Configuration Reference](./05-configuration-reference.md) for full - configuration options. The `consensus_trace_strategy` option will be - documented in the configuration reference as part of Phase 4a implementation. + configuration options. - **Round lifecycle spans**: `consensus.round` with round-to-round span links. 
- **Establish phase**: `consensus.establish`, `consensus.update_positions` (with `dispute.resolve` events), `consensus.check` (with threshold tracking). @@ -192,7 +191,8 @@ Phase 4a (establish-phase gap fill & cross-node correlation) adds: (`startRoundTracing`, `createValidationSpan`, `startEstablishTracing`, `updateEstablishTracing`, `endEstablishTracing`). -See [Phase4_taskList.md](./Phase4_taskList.md) for the full spec and implementation notes. +The `Phase4_taskList.md` spec document is introduced in the Phase 2 PR (#6424) +and will contain the full task breakdown and implementation notes. --- @@ -490,7 +490,7 @@ Clear, measurable criteria for each phase. --- -## 6.12 Recommended Implementation Order +## 6.11 Recommended Implementation Order Based on ROI analysis, implement in this exact order: From dec8b0a9a1120cfa93a3e38f86193dfc17ffaa7c Mon Sep 17 00:00:00 2001 From: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> Date: Thu, 14 May 2026 16:34:58 +0100 Subject: [PATCH 2/8] docs(telemetry): fix stale RPC span names + drop volatile line numbers in runbook - RPC Spans table: `rpc.request` was documented but the code actually emits `rpc.http_request`. Listed the actual emitted names (`rpc.http_request`, `rpc.ws_upgrade`, `rpc.ws_message`, `rpc.process`) and their parent/child relationship. - Drop `:<line>` suffixes from Source File columns in both RPC and Transaction span tables. Line numbers drift with every refactor; the filename is enough for operators to grep. - Summary table: replace the never-emitted `rpc.request` row with the real entry points so `span_name=` filters in PromQL / TraceQL match. 
--- docs/telemetry-runbook.md | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/docs/telemetry-runbook.md b/docs/telemetry-runbook.md index b700b85073..52febc148a 100644 --- a/docs/telemetry-runbook.md +++ b/docs/telemetry-runbook.md @@ -64,19 +64,20 @@ All spans instrumented in xrpld, grouped by subsystem: ### RPC Spans (Phase 2) -| Span Name | Source File | Attributes | Description | -| -------------------- | --------------------- | -------------------------------- | -------------------------------------------------- | -| `rpc.request` | ServerHandler.cpp:271 | — | Top-level HTTP RPC request | -| `rpc.process` | ServerHandler.cpp:573 | — | RPC processing (child of rpc.request) | -| `rpc.ws_message` | ServerHandler.cpp:384 | — | WebSocket RPC message | -| `rpc.command.` | RPCHandler.cpp:161 | `command`, `version`, `rpc_role` | Per-command span (e.g., `rpc.command.server_info`) | +| Span Name | Source File | Attributes | Description | +| -------------------- | ----------------- | -------------------------------- | ----------------------------------------------------- | +| `rpc.http_request` | ServerHandler.cpp | — | Top-level HTTP RPC request | +| `rpc.ws_upgrade` | ServerHandler.cpp | — | WebSocket upgrade handshake | +| `rpc.ws_message` | ServerHandler.cpp | — | WebSocket RPC message | +| `rpc.process` | ServerHandler.cpp | — | RPC processing (child of rpc.http_request/ws_message) | +| `rpc.command.` | RPCHandler.cpp | `command`, `version`, `rpc_role` | Per-command span (e.g., `rpc.command.server_info`) | ### Transaction Spans (Phase 3) -| Span Name | Source File | Attributes | Description | -| ------------ | ------------------- | ------------------------------------------------------------------------- | ------------------------------------- | -| `tx.process` | NetworkOPs.cpp:1227 | `xrpl.tx.hash`, `local`, `path` | Transaction submission and processing | -| `tx.receive` | PeerImp.cpp:1273 | `xrpl.peer.id`, 
`xrpl.tx.hash`, `peer_version`, `suppressed`, `tx_status` | Transaction received from peer relay | +| Span Name | Source File | Attributes | Description | +| ------------ | -------------- | ------------------------------------------------------------------------- | ------------------------------------- | +| `tx.process` | NetworkOPs.cpp | `xrpl.tx.hash`, `local`, `path` | Transaction submission and processing | +| `tx.receive` | PeerImp.cpp | `xrpl.peer.id`, `xrpl.tx.hash`, `peer_version`, `suppressed`, `tx_status` | Transaction received from peer relay | ### Transaction Queue Spans (Phase 3) @@ -295,7 +296,9 @@ Three dashboards are pre-provisioned in `docker/telemetry/grafana/dashboards/`: | Span Name | Prometheus Metric Filter | Grafana Dashboard | | ------------------------------ | -------------------------------------------- | --------------------------------------------- | -| `rpc.request` | `{span_name="rpc.request"}` | -- (available but not paneled) | +| `rpc.http_request` | `{span_name="rpc.http_request"}` | -- (available but not paneled) | +| `rpc.ws_upgrade` | `{span_name="rpc.ws_upgrade"}` | -- (available but not paneled) | +| `rpc.ws_message` | `{span_name="rpc.ws_message"}` | -- (available but not paneled) | | `rpc.process` | `{span_name="rpc.process"}` | -- (available but not paneled) | | `rpc.command.*` | `{span_name=~"rpc.command.*"}` | RPC Performance (all 4 panels) | | `tx.process` | `{span_name="tx.process"}` | Transaction Overview (3 panels) | From 44cdc8133ed92c9917969f513dbbbf6d41932279 Mon Sep 17 00:00:00 2001 From: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> Date: Thu, 14 May 2026 16:51:14 +0100 Subject: [PATCH 3/8] =?UTF-8?q?fix(telemetry):=20phase-6=20dashboards=20?= =?UTF-8?q?=E2=80=94=20rename=20UIDs,=20add=20$node=20filter,=20drop=20lin?= =?UTF-8?q?e=20numbers?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase-6 introduces ledger-operations, peer-network, and the five 
StatsD dashboards. Align them with the rest of the chain: - Rename dashboard UIDs from `rippled-*` to `xrpld-*` so the provisioned UIDs match the post-rename-script documentation (`docs.sh` rewrites .md but not .json, so the two drifted). The runbook references `xrpld-rpc-perf`, `xrpld-transactions`, etc.; now the JSON matches. - Add the `$node` template variable + `exported_instance=~"$node"` filter to every target in the five `statsd-*` dashboards. Mirrors the pattern already used by consensus-health, ledger-operations, and peer-network per the project rule that every dashboard must support per-node filtering. - Strip `:<line>` (and `:NN-NN` range) suffixes from C++ file references in every dashboard panel description and in docker/telemetry/TESTING.md. Line numbers drift on every refactor; the filename alone is enough to grep. - Replace stale `rpc.request` entries with the real emitted span names (`rpc.http_request`, `rpc.ws_upgrade`, `rpc.ws_message`, `rpc.process`) in TESTING.md so operators can copy-paste the filters and hit real traces. - Also drop the `:706` line ref from the `StatsDCollector.cpp` callout in `06-implementation-phases.md`. 
--- OpenTelemetryPlan/06-implementation-phases.md | 2 +- docker/telemetry/TESTING.md | 39 ++--- .../grafana/dashboards/ledger-operations.json | 10 +- .../grafana/dashboards/peer-network.json | 6 +- .../dashboards/statsd-ledger-data-sync.json | 103 ++++++++----- .../dashboards/statsd-network-traffic.json | 85 +++++++---- .../dashboards/statsd-node-health.json | 140 ++++++++++-------- .../statsd-overlay-traffic-detail.json | 97 +++++++----- .../dashboards/statsd-rpc-pathfinding.json | 71 +++++---- 9 files changed, 329 insertions(+), 224 deletions(-) diff --git a/OpenTelemetryPlan/06-implementation-phases.md b/OpenTelemetryPlan/06-implementation-phases.md index 208de9346f..9300daf14e 100644 --- a/OpenTelemetryPlan/06-implementation-phases.md +++ b/OpenTelemetryPlan/06-implementation-phases.md @@ -350,7 +350,7 @@ xrpld has a mature metrics framework (`beast::insight`) that emits StatsD-format ### Wire Format Fix (Task 6.1) — DEFERRED -The `StatsDMeterImpl` in `StatsDCollector.cpp:706` sends metrics with `|m` suffix, which is non-standard StatsD. The OTel StatsD receiver silently drops these. Fix: change `|m` to `|c` (counter), which is semantically correct since meters are increment-only counters. Only 2 metrics are affected (`warn`, `drop` in Resource Manager). +The `StatsDMeterImpl` in `StatsDCollector.cpp` sends metrics with `|m` suffix, which is non-standard StatsD. The OTel StatsD receiver silently drops these. Fix: change `|m` to `|c` (counter), which is semantically correct since meters are increment-only counters. Only 2 metrics are affected (`warn`, `drop` in Resource Manager). **Status**: Deferred as a separate change — this is a breaking change for any StatsD backend that previously consumed the custom `|m` type. The Resource Warnings and Resource Drops dashboard panels will show no data until this fix is applied. 
diff --git a/docker/telemetry/TESTING.md b/docker/telemetry/TESTING.md index 45a2541c0d..1346f2d49c 100644 --- a/docker/telemetry/TESTING.md +++ b/docker/telemetry/TESTING.md @@ -376,25 +376,26 @@ See the "Verification Queries" section below. All 16 production span names instrumented across Phases 2-5: -| Span Name | Source File | Phase | Key Attributes | How to Trigger | -| --------------------------- | --------------------- | ----- | ---------------------------------------------------------------------------------------- | ------------------------- | -| `rpc.request` | ServerHandler.cpp:271 | 2 | -- | Any HTTP RPC call | -| `rpc.process` | ServerHandler.cpp:573 | 2 | -- | Any HTTP RPC call | -| `rpc.ws_message` | ServerHandler.cpp:384 | 2 | -- | WebSocket RPC message | -| `rpc.command.` | RPCHandler.cpp:161 | 2 | `xrpl.rpc.command`, `xrpl.rpc.version`, `xrpl.rpc.role` | Any RPC command | -| `tx.process` | NetworkOPs.cpp:1227 | 3 | `xrpl.tx.hash`, `xrpl.tx.local`, `xrpl.tx.path` | Submit transaction | -| `tx.receive` | PeerImp.cpp:1273 | 3 | `xrpl.peer.id` | Peer relays transaction | -| `consensus.proposal.send` | RCLConsensus.cpp:177 | 4 | `xrpl.consensus.round` | Consensus proposing phase | -| `consensus.ledger_close` | RCLConsensus.cpp:282 | 4 | `xrpl.consensus.ledger.seq`, `xrpl.consensus.mode` | Ledger close event | -| `consensus.accept` | RCLConsensus.cpp:395 | 4 | `xrpl.consensus.proposers`, `xrpl.consensus.round_time_ms` | Ledger accepted | -| `consensus.validation.send` | RCLConsensus.cpp:753 | 4 | `xrpl.consensus.ledger.seq`, `xrpl.consensus.proposing` | Validation sent | -| `consensus.accept.apply` | RCLConsensus.cpp:453 | 4 | `xrpl.consensus.close_time`, `close_time_correct`, `close_resolution_ms`, `state` | Ledger apply + close time | -| `tx.apply` | BuildLedger.cpp:88 | 5 | `xrpl.ledger.tx_count`, `xrpl.ledger.tx_failed` | Ledger close (tx set) | -| `ledger.build` | BuildLedger.cpp:31 | 5 | `xrpl.ledger.seq`, `xrpl.ledger.close_time`, 
`close_time_correct`, `close_resolution_ms` | Ledger build | -| `ledger.validate` | LedgerMaster.cpp:915 | 5 | `xrpl.ledger.seq`, `xrpl.ledger.validations` | Ledger validated | -| `ledger.store` | LedgerMaster.cpp:409 | 5 | `xrpl.ledger.seq` | Ledger stored | -| `peer.proposal.receive` | PeerImp.cpp:1667 | 5 | `xrpl.peer.id`, `xrpl.peer.proposal.trusted` | Peer sends proposal | -| `peer.validation.receive` | PeerImp.cpp:2264 | 5 | `xrpl.peer.id`, `xrpl.peer.validation.trusted` | Peer sends validation | +| Span Name | Source File | Phase | Key Attributes | How to Trigger | +| --------------------------- | ----------------- | ----- | ---------------------------------------------------------------------------------------- | ------------------------- | +| `rpc.http_request` | ServerHandler.cpp | 2 | -- | Any HTTP RPC call | +| `rpc.ws_upgrade` | ServerHandler.cpp | 2 | -- | WebSocket upgrade | +| `rpc.ws_message` | ServerHandler.cpp | 2 | -- | WebSocket RPC message | +| `rpc.process` | ServerHandler.cpp | 2 | -- | RPC processing | +| `rpc.command.` | RPCHandler.cpp | 2 | `xrpl.rpc.command`, `xrpl.rpc.version`, `xrpl.rpc.role` | Any RPC command | +| `tx.process` | NetworkOPs.cpp | 3 | `xrpl.tx.hash`, `xrpl.tx.local`, `xrpl.tx.path` | Submit transaction | +| `tx.receive` | PeerImp.cpp | 3 | `xrpl.peer.id` | Peer relays transaction | +| `consensus.proposal.send` | RCLConsensus.cpp | 4 | `xrpl.consensus.round` | Consensus proposing phase | +| `consensus.ledger_close` | RCLConsensus.cpp | 4 | `xrpl.consensus.ledger.seq`, `xrpl.consensus.mode` | Ledger close event | +| `consensus.accept` | RCLConsensus.cpp | 4 | `xrpl.consensus.proposers`, `xrpl.consensus.round_time_ms` | Ledger accepted | +| `consensus.validation.send` | RCLConsensus.cpp | 4 | `xrpl.consensus.ledger.seq`, `xrpl.consensus.proposing` | Validation sent | +| `consensus.accept.apply` | RCLConsensus.cpp | 4 | `xrpl.consensus.close_time`, `close_time_correct`, `close_resolution_ms`, `state` | Ledger apply + close 
time | +| `tx.apply` | BuildLedger.cpp | 5 | `xrpl.ledger.tx_count`, `xrpl.ledger.tx_failed` | Ledger close (tx set) | +| `ledger.build` | BuildLedger.cpp | 5 | `xrpl.ledger.seq`, `xrpl.ledger.close_time`, `close_time_correct`, `close_resolution_ms` | Ledger build | +| `ledger.validate` | LedgerMaster.cpp | 5 | `xrpl.ledger.seq`, `xrpl.ledger.validations` | Ledger validated | +| `ledger.store` | LedgerMaster.cpp | 5 | `xrpl.ledger.seq` | Ledger stored | +| `peer.proposal.receive` | PeerImp.cpp | 5 | `xrpl.peer.id`, `xrpl.peer.proposal.trusted` | Peer sends proposal | +| `peer.validation.receive` | PeerImp.cpp | 5 | `xrpl.peer.id`, `xrpl.peer.validation.trusted` | Peer sends validation | --- diff --git a/docker/telemetry/grafana/dashboards/ledger-operations.json b/docker/telemetry/grafana/dashboards/ledger-operations.json index c9c8c5efc3..55e5de7716 100644 --- a/docker/telemetry/grafana/dashboards/ledger-operations.json +++ b/docker/telemetry/grafana/dashboards/ledger-operations.json @@ -10,7 +10,7 @@ "panels": [ { "title": "Ledger Build Rate", - "description": "Rate at which new ledgers are being built. The ledger.build span (BuildLedger.cpp:31) wraps the entire buildLedgerImpl() function which creates a new ledger from a parent, applies transactions, flushes SHAMap nodes, and sets the accepted state. Should match the consensus close rate (~0.25/sec on mainnet with ~4s rounds).", + "description": "Rate at which new ledgers are being built. The ledger.build span (BuildLedger.cpp) wraps the entire buildLedgerImpl() function which creates a new ledger from a parent, applies transactions, flushes SHAMap nodes, and sets the accepted state. Should match the consensus close rate (~0.25/sec on mainnet with ~4s rounds).", "type": "stat", "gridPos": { "h": 8, @@ -88,7 +88,7 @@ }, { "title": "Ledger Validation Rate", - "description": "Rate at which ledgers pass the validation threshold and are accepted as fully validated. 
The ledger.validate span (LedgerMaster.cpp:915) fires in checkAccept() only after the ledger receives sufficient trusted validations (>= quorum). Records xrpl.ledger.seq and validations (the number of validations received).", + "description": "Rate at which ledgers pass the validation threshold and are accepted as fully validated. The ledger.validate span (LedgerMaster.cpp) fires in checkAccept() only after the ledger receives sufficient trusted validations (>= quorum). Records xrpl.ledger.seq and validations (the number of validations received).", "type": "stat", "gridPos": { "h": 8, @@ -156,7 +156,7 @@ }, { "title": "Transaction Apply Duration", - "description": "p95 and p50 duration of applying the consensus transaction set during ledger building. The tx.apply span (BuildLedger.cpp:88) wraps applyTransactions() which iterates through the CanonicalTXSet with multiple retry passes. Records tx_count (successful) and tx_failed (failed) as attributes.", + "description": "p95 and p50 duration of applying the consensus transaction set during ledger building. The tx.apply span (BuildLedger.cpp) wraps applyTransactions() which iterates through the CanonicalTXSet with multiple retry passes. Records tx_count (successful) and tx_failed (failed) as attributes.", "type": "timeseries", "gridPos": { "h": 8, @@ -241,7 +241,7 @@ }, { "title": "Ledger Store Rate", - "description": "Rate at which ledgers are stored into the ledger history. The ledger.store span (LedgerMaster.cpp:409) wraps storeLedger() which inserts the ledger into the LedgerHistory cache. Records xrpl.ledger.seq. Should match the ledger build rate under normal operation.", + "description": "Rate at which ledgers are stored into the ledger history. The ledger.store span (LedgerMaster.cpp) wraps storeLedger() which inserts the ledger into the LedgerHistory cache. Records xrpl.ledger.seq. 
Should match the ledger build rate under normal operation.", "type": "stat", "gridPos": { "h": 8, @@ -349,5 +349,5 @@ "to": "now" }, "title": "Ledger Operations", - "uid": "rippled-ledger-ops" + "uid": "xrpld-ledger-ops" } diff --git a/docker/telemetry/grafana/dashboards/peer-network.json b/docker/telemetry/grafana/dashboards/peer-network.json index 0fd6e6048f..9907efd46d 100644 --- a/docker/telemetry/grafana/dashboards/peer-network.json +++ b/docker/telemetry/grafana/dashboards/peer-network.json @@ -11,7 +11,7 @@ "panels": [ { "title": "Peer Proposal Receive Rate", - "description": "Rate of consensus proposals received from network peers. The peer.proposal.receive span (PeerImp.cpp:1667) fires in onMessage(TMProposeSet) for each incoming proposal. Records xrpl.peer.id (sending peer) and proposal_trusted (whether the proposer is in our UNL). Requires trace_peer=1 in the telemetry config.", + "description": "Rate of consensus proposals received from network peers. The peer.proposal.receive span (PeerImp.cpp) fires in onMessage(TMProposeSet) for each incoming proposal. Records xrpl.peer.id (sending peer) and proposal_trusted (whether the proposer is in our UNL). Requires trace_peer=1 in the telemetry config.", "type": "timeseries", "gridPos": { "h": 8, @@ -50,7 +50,7 @@ }, { "title": "Peer Validation Receive Rate", - "description": "Rate of ledger validations received from network peers. The peer.validation.receive span (PeerImp.cpp:2264) fires in onMessage(TMValidation) for each incoming validation message. Records xrpl.peer.id (sending peer) and validation_trusted (whether the validator is trusted). Requires trace_peer=1 in the telemetry config.", + "description": "Rate of ledger validations received from network peers. The peer.validation.receive span (PeerImp.cpp) fires in onMessage(TMValidation) for each incoming validation message. Records xrpl.peer.id (sending peer) and validation_trusted (whether the validator is trusted). 
Requires trace_peer=1 in the telemetry config.", "type": "timeseries", "gridPos": { "h": 8, @@ -223,5 +223,5 @@ "to": "now" }, "title": "Peer Network", - "uid": "rippled-peer-net" + "uid": "xrpld-peer-net" } diff --git a/docker/telemetry/grafana/dashboards/statsd-ledger-data-sync.json b/docker/telemetry/grafana/dashboards/statsd-ledger-data-sync.json index 502d78e7aa..a7ad93a142 100644 --- a/docker/telemetry/grafana/dashboards/statsd-ledger-data-sync.json +++ b/docker/telemetry/grafana/dashboards/statsd-ledger-data-sync.json @@ -30,56 +30,56 @@ "datasource": { "type": "prometheus" }, - "expr": "rippled_ledger_data_get_Bytes_In", + "expr": "rippled_ledger_data_get_Bytes_In{exported_instance=~\"$node\"}", "legendFormat": "Ledger Data Get" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_ledger_data_share_Bytes_In", + "expr": "rippled_ledger_data_share_Bytes_In{exported_instance=~\"$node\"}", "legendFormat": "Ledger Data Share" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_ledger_data_Transaction_Set_candidate_get_Bytes_In", + "expr": "rippled_ledger_data_Transaction_Set_candidate_get_Bytes_In{exported_instance=~\"$node\"}", "legendFormat": "TX Set Candidate Get" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_ledger_data_Transaction_Set_candidate_share_Bytes_In", + "expr": "rippled_ledger_data_Transaction_Set_candidate_share_Bytes_In{exported_instance=~\"$node\"}", "legendFormat": "TX Set Candidate Share" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_ledger_data_Transaction_Node_get_Bytes_In", + "expr": "rippled_ledger_data_Transaction_Node_get_Bytes_In{exported_instance=~\"$node\"}", "legendFormat": "TX Node Get" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_ledger_data_Transaction_Node_share_Bytes_In", + "expr": "rippled_ledger_data_Transaction_Node_share_Bytes_In{exported_instance=~\"$node\"}", "legendFormat": "TX Node Share" }, { "datasource": { "type": "prometheus" }, - "expr": 
"rippled_ledger_data_Account_State_Node_get_Bytes_In", + "expr": "rippled_ledger_data_Account_State_Node_get_Bytes_In{exported_instance=~\"$node\"}", "legendFormat": "Account State Node Get" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_ledger_data_Account_State_Node_share_Bytes_In", + "expr": "rippled_ledger_data_Account_State_Node_share_Bytes_In{exported_instance=~\"$node\"}", "legendFormat": "Account State Node Share" } ], @@ -118,56 +118,56 @@ "datasource": { "type": "prometheus" }, - "expr": "rippled_ledger_share_Bytes_In", + "expr": "rippled_ledger_share_Bytes_In{exported_instance=~\"$node\"}", "legendFormat": "Ledger Share In" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_ledger_get_Bytes_In", + "expr": "rippled_ledger_get_Bytes_In{exported_instance=~\"$node\"}", "legendFormat": "Ledger Get In" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_ledger_Transaction_Set_candidate_share_Bytes_In", + "expr": "rippled_ledger_Transaction_Set_candidate_share_Bytes_In{exported_instance=~\"$node\"}", "legendFormat": "TX Set Candidate Share" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_ledger_Transaction_Set_candidate_get_Bytes_In", + "expr": "rippled_ledger_Transaction_Set_candidate_get_Bytes_In{exported_instance=~\"$node\"}", "legendFormat": "TX Set Candidate Get" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_ledger_Transaction_node_share_Bytes_In", + "expr": "rippled_ledger_Transaction_node_share_Bytes_In{exported_instance=~\"$node\"}", "legendFormat": "TX Node Share" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_ledger_Transaction_node_get_Bytes_In", + "expr": "rippled_ledger_Transaction_node_get_Bytes_In{exported_instance=~\"$node\"}", "legendFormat": "TX Node Get" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_ledger_Account_State_node_share_Bytes_In", + "expr": "rippled_ledger_Account_State_node_share_Bytes_In{exported_instance=~\"$node\"}", 
"legendFormat": "Account State Share" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_ledger_Account_State_node_get_Bytes_In", + "expr": "rippled_ledger_Account_State_node_get_Bytes_In{exported_instance=~\"$node\"}", "legendFormat": "Account State Get" } ], @@ -206,56 +206,56 @@ "datasource": { "type": "prometheus" }, - "expr": "rippled_getobject_Ledger_get_Bytes_In", + "expr": "rippled_getobject_Ledger_get_Bytes_In{exported_instance=~\"$node\"}", "legendFormat": "Ledger Get" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_getobject_Ledger_share_Bytes_In", + "expr": "rippled_getobject_Ledger_share_Bytes_In{exported_instance=~\"$node\"}", "legendFormat": "Ledger Share" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_getobject_Transaction_get_Bytes_In", + "expr": "rippled_getobject_Transaction_get_Bytes_In{exported_instance=~\"$node\"}", "legendFormat": "Transaction Get" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_getobject_Transaction_share_Bytes_In", + "expr": "rippled_getobject_Transaction_share_Bytes_In{exported_instance=~\"$node\"}", "legendFormat": "Transaction Share" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_getobject_Transaction_node_get_Bytes_In", + "expr": "rippled_getobject_Transaction_node_get_Bytes_In{exported_instance=~\"$node\"}", "legendFormat": "TX Node Get" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_getobject_Transaction_node_share_Bytes_In", + "expr": "rippled_getobject_Transaction_node_share_Bytes_In{exported_instance=~\"$node\"}", "legendFormat": "TX Node Share" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_getobject_Account_State_node_get_Bytes_In", + "expr": "rippled_getobject_Account_State_node_get_Bytes_In{exported_instance=~\"$node\"}", "legendFormat": "Account State Get" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_getobject_Account_State_node_share_Bytes_In", + "expr": 
"rippled_getobject_Account_State_node_share_Bytes_In{exported_instance=~\"$node\"}", "legendFormat": "Account State Share" } ], @@ -294,49 +294,49 @@ "datasource": { "type": "prometheus" }, - "expr": "rippled_getobject_CAS_get_Bytes_In", + "expr": "rippled_getobject_CAS_get_Bytes_In{exported_instance=~\"$node\"}", "legendFormat": "CAS Get" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_getobject_CAS_share_Bytes_In", + "expr": "rippled_getobject_CAS_share_Bytes_In{exported_instance=~\"$node\"}", "legendFormat": "CAS Share" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_getobject_Fetch_Pack_share_Bytes_In", + "expr": "rippled_getobject_Fetch_Pack_share_Bytes_In{exported_instance=~\"$node\"}", "legendFormat": "Fetch Pack Share" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_getobject_Fetch_Pack_get_Bytes_In", + "expr": "rippled_getobject_Fetch_Pack_get_Bytes_In{exported_instance=~\"$node\"}", "legendFormat": "Fetch Pack Get" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_getobject_Transactions_get_Bytes_In", + "expr": "rippled_getobject_Transactions_get_Bytes_In{exported_instance=~\"$node\"}", "legendFormat": "Transactions Get" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_getobject_get_Bytes_In", + "expr": "rippled_getobject_get_Bytes_In{exported_instance=~\"$node\"}", "legendFormat": "Aggregate Get" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_getobject_share_Bytes_In", + "expr": "rippled_getobject_share_Bytes_In{exported_instance=~\"$node\"}", "legendFormat": "Aggregate Share" } ], @@ -375,49 +375,49 @@ "datasource": { "type": "prometheus" }, - "expr": "rippled_getobject_Ledger_get_Messages_In", + "expr": "rippled_getobject_Ledger_get_Messages_In{exported_instance=~\"$node\"}", "legendFormat": "Ledger Get" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_getobject_Transaction_get_Messages_In", + "expr": 
"rippled_getobject_Transaction_get_Messages_In{exported_instance=~\"$node\"}", "legendFormat": "Transaction Get" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_getobject_Transaction_node_get_Messages_In", + "expr": "rippled_getobject_Transaction_node_get_Messages_In{exported_instance=~\"$node\"}", "legendFormat": "TX Node Get" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_getobject_Account_State_node_get_Messages_In", + "expr": "rippled_getobject_Account_State_node_get_Messages_In{exported_instance=~\"$node\"}", "legendFormat": "Account State Get" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_getobject_CAS_get_Messages_In", + "expr": "rippled_getobject_CAS_get_Messages_In{exported_instance=~\"$node\"}", "legendFormat": "CAS Get" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_getobject_Fetch_Pack_get_Messages_In", + "expr": "rippled_getobject_Fetch_Pack_get_Messages_In{exported_instance=~\"$node\"}", "legendFormat": "Fetch Pack Get" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_getobject_Transactions_get_Messages_In", + "expr": "rippled_getobject_Transactions_get_Messages_In{exported_instance=~\"$node\"}", "legendFormat": "Transactions Get" } ], @@ -463,7 +463,7 @@ "datasource": { "type": "prometheus" }, - "expr": "topk(20, {__name__=~\"rippled_.*_Bytes_In\", __name__!~\"rippled_total_.*\"})", + "expr": "topk{exported_instance=~\"$node\"}(20, {__name__=~\"rippled_.*_Bytes_In\", __name__!~\"rippled_total_.*\"})", "legendFormat": "{{__name__}}" } ], @@ -495,12 +495,33 @@ "schemaVersion": 39, "tags": ["rippled", "statsd", "ledger", "sync", "telemetry"], "templating": { - "list": [] + "list": [ + { + "name": "node", + "label": "Node", + "description": "Filter by xrpld node (service.instance.id \u2014 e.g. 
Node-1)", + "type": "query", + "query": "label_values(exported_instance)", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "includeAll": true, + "allValue": ".*", + "current": { + "text": "All", + "value": "$__all" + }, + "multi": true, + "refresh": 2, + "sort": 1 + } + ] }, "time": { "from": "now-1h", "to": "now" }, "title": "Ledger Data & Sync (StatsD)", - "uid": "rippled-statsd-ledger-sync" + "uid": "xrpld-statsd-ledger-sync" } diff --git a/docker/telemetry/grafana/dashboards/statsd-network-traffic.json b/docker/telemetry/grafana/dashboards/statsd-network-traffic.json index d4bfbddaa9..9cec7ff4d1 100644 --- a/docker/telemetry/grafana/dashboards/statsd-network-traffic.json +++ b/docker/telemetry/grafana/dashboards/statsd-network-traffic.json @@ -11,7 +11,7 @@ "panels": [ { "title": "Active Peers", - "description": "Number of active inbound and outbound peer connections. Sourced from Peer_Finder.Active_Inbound_Peers and Peer_Finder.Active_Outbound_Peers gauges (PeerfinderManager.cpp:214-215). A healthy mainnet node typically has 10-21 outbound and 0-85 inbound peers depending on configuration.", + "description": "Number of active inbound and outbound peer connections. Sourced from Peer_Finder.Active_Inbound_Peers and Peer_Finder.Active_Outbound_Peers gauges (PeerfinderManager.cpp). 
A healthy mainnet node typically has 10-21 outbound and 0-85 inbound peers depending on configuration.", "type": "timeseries", "gridPos": { "h": 8, @@ -30,14 +30,14 @@ "datasource": { "type": "prometheus" }, - "expr": "rippled_Peer_Finder_Active_Inbound_Peers", + "expr": "rippled_Peer_Finder_Active_Inbound_Peers{exported_instance=~\"$node\"}", "legendFormat": "Inbound Peers" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_Peer_Finder_Active_Outbound_Peers", + "expr": "rippled_Peer_Finder_Active_Outbound_Peers{exported_instance=~\"$node\"}", "legendFormat": "Outbound Peers" } ], @@ -57,7 +57,7 @@ }, { "title": "Peer Disconnects", - "description": "Cumulative count of peer disconnections. Sourced from the Overlay.Peer_Disconnects gauge (OverlayImpl.h:557). A rising trend indicates network instability, aggressive peer management, or resource exhaustion causing connection drops.", + "description": "Cumulative count of peer disconnections. Sourced from the Overlay.Peer_Disconnects gauge (OverlayImpl.h). A rising trend indicates network instability, aggressive peer management, or resource exhaustion causing connection drops.", "type": "timeseries", "gridPos": { "h": 8, @@ -76,7 +76,7 @@ "datasource": { "type": "prometheus" }, - "expr": "rippled_Overlay_Peer_Disconnects", + "expr": "rippled_Overlay_Peer_Disconnects{exported_instance=~\"$node\"}", "legendFormat": "Disconnects" } ], @@ -96,7 +96,7 @@ }, { "title": "Total Network Bytes", - "description": "Rate of total bytes sent and received across all peer connections. Sourced from the total.Bytes_In and total.Bytes_Out traffic category gauges (OverlayImpl.h:535-548). Wrapped in rate() to show throughput rather than cumulative counter values.", + "description": "Rate of total bytes sent and received across all peer connections. Sourced from the total.Bytes_In and total.Bytes_Out traffic category gauges (OverlayImpl.h). 
Wrapped in rate() to show throughput rather than cumulative counter values.", "type": "timeseries", "gridPos": { "h": 8, @@ -115,14 +115,14 @@ "datasource": { "type": "prometheus" }, - "expr": "rate(rippled_total_Bytes_In[5m])", + "expr": "rate(rippled_total_Bytes_In{exported_instance=~\"$node\"}[5m])", "legendFormat": "Bytes In" }, { "datasource": { "type": "prometheus" }, - "expr": "rate(rippled_total_Bytes_Out[5m])", + "expr": "rate(rippled_total_Bytes_Out{exported_instance=~\"$node\"}[5m])", "legendFormat": "Bytes Out" } ], @@ -142,7 +142,7 @@ }, { "title": "Total Network Messages", - "description": "Total messages sent and received across all peer connections. Sourced from the total.Messages_In and total.Messages_Out traffic category gauges (OverlayImpl.h:535-548). Shows the overall message throughput of the overlay network.", + "description": "Total messages sent and received across all peer connections. Sourced from the total.Messages_In and total.Messages_Out traffic category gauges (OverlayImpl.h). 
Shows the overall message throughput of the overlay network.", "type": "timeseries", "gridPos": { "h": 8, @@ -161,14 +161,14 @@ "datasource": { "type": "prometheus" }, - "expr": "rippled_total_Messages_In", + "expr": "rippled_total_Messages_In{exported_instance=~\"$node\"}", "legendFormat": "Messages In" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_total_Messages_Out", + "expr": "rippled_total_Messages_Out{exported_instance=~\"$node\"}", "legendFormat": "Messages Out" } ], @@ -207,21 +207,21 @@ "datasource": { "type": "prometheus" }, - "expr": "rippled_transactions_Messages_In", + "expr": "rippled_transactions_Messages_In{exported_instance=~\"$node\"}", "legendFormat": "TX Messages In" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_transactions_Messages_Out", + "expr": "rippled_transactions_Messages_Out{exported_instance=~\"$node\"}", "legendFormat": "TX Messages Out" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_transactions_duplicate_Messages_In", + "expr": "rippled_transactions_duplicate_Messages_In{exported_instance=~\"$node\"}", "legendFormat": "TX Duplicate In" } ], @@ -260,28 +260,28 @@ "datasource": { "type": "prometheus" }, - "expr": "rippled_proposals_Messages_In", + "expr": "rippled_proposals_Messages_In{exported_instance=~\"$node\"}", "legendFormat": "Proposals In" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_proposals_Messages_Out", + "expr": "rippled_proposals_Messages_Out{exported_instance=~\"$node\"}", "legendFormat": "Proposals Out" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_proposals_untrusted_Messages_In", + "expr": "rippled_proposals_untrusted_Messages_In{exported_instance=~\"$node\"}", "legendFormat": "Untrusted In" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_proposals_duplicate_Messages_In", + "expr": "rippled_proposals_duplicate_Messages_In{exported_instance=~\"$node\"}", "legendFormat": "Duplicate In" } ], @@ -320,28 +320,28 @@ 
"datasource": { "type": "prometheus" }, - "expr": "rippled_validations_Messages_In", + "expr": "rippled_validations_Messages_In{exported_instance=~\"$node\"}", "legendFormat": "Validations In" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_validations_Messages_Out", + "expr": "rippled_validations_Messages_Out{exported_instance=~\"$node\"}", "legendFormat": "Validations Out" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_validations_untrusted_Messages_In", + "expr": "rippled_validations_untrusted_Messages_In{exported_instance=~\"$node\"}", "legendFormat": "Untrusted In" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_validations_duplicate_Messages_In", + "expr": "rippled_validations_duplicate_Messages_In{exported_instance=~\"$node\"}", "legendFormat": "Duplicate In" } ], @@ -380,7 +380,7 @@ "datasource": { "type": "prometheus" }, - "expr": "topk(10, {__name__=~\"rippled_.*_Bytes_In\", __name__!~\"rippled_total_.*\"})", + "expr": "topk{exported_instance=~\"$node\"}(10, {__name__=~\"rippled_.*_Bytes_In\", __name__!~\"rippled_total_.*\"})", "legendFormat": "{{__name__}}" } ], @@ -677,42 +677,42 @@ "datasource": { "type": "prometheus" }, - "expr": "rate(rippled_transactions_duplicate_Bytes_In[5m])", + "expr": "rate{exported_instance=~\"$node\"}(rippled_transactions_duplicate_Bytes_In[5m])", "legendFormat": "TX Duplicate In" }, { "datasource": { "type": "prometheus" }, - "expr": "rate(rippled_transactions_duplicate_Bytes_Out[5m])", + "expr": "rate{exported_instance=~\"$node\"}(rippled_transactions_duplicate_Bytes_Out[5m])", "legendFormat": "TX Duplicate Out" }, { "datasource": { "type": "prometheus" }, - "expr": "rate(rippled_proposals_duplicate_Bytes_In[5m])", + "expr": "rate{exported_instance=~\"$node\"}(rippled_proposals_duplicate_Bytes_In[5m])", "legendFormat": "Proposals Duplicate In" }, { "datasource": { "type": "prometheus" }, - "expr": "rate(rippled_proposals_duplicate_Bytes_Out[5m])", + "expr": 
"rate{exported_instance=~\"$node\"}(rippled_proposals_duplicate_Bytes_Out[5m])", "legendFormat": "Proposals Duplicate Out" }, { "datasource": { "type": "prometheus" }, - "expr": "rate(rippled_validations_duplicate_Bytes_In[5m])", + "expr": "rate{exported_instance=~\"$node\"}(rippled_validations_duplicate_Bytes_In[5m])", "legendFormat": "Validations Duplicate In" }, { "datasource": { "type": "prometheus" }, - "expr": "rate(rippled_validations_duplicate_Bytes_Out[5m])", + "expr": "rate{exported_instance=~\"$node\"}(rippled_validations_duplicate_Bytes_Out[5m])", "legendFormat": "Validations Duplicate Out" } ], @@ -751,7 +751,7 @@ "datasource": { "type": "prometheus" }, - "expr": "topk(15, rate({__name__=~\"rippled_.*_Bytes_In\", __name__!~\"rippled_total_.*\"}[5m]))", + "expr": "topk{exported_instance=~\"$node\"}(15, rate({__name__=~\"rippled_.*_Bytes_In\", __name__!~\"rippled_total_.*\"}[5m]))", "legendFormat": "{{__name__}}" } ], @@ -773,12 +773,33 @@ "schemaVersion": 39, "tags": ["rippled", "statsd", "network", "telemetry"], "templating": { - "list": [] + "list": [ + { + "name": "node", + "label": "Node", + "description": "Filter by xrpld node (service.instance.id \u2014 e.g. 
Node-1)", + "type": "query", + "query": "label_values(exported_instance)", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "includeAll": true, + "allValue": ".*", + "current": { + "text": "All", + "value": "$__all" + }, + "multi": true, + "refresh": 2, + "sort": 1 + } + ] }, "time": { "from": "now-1h", "to": "now" }, "title": "Network Traffic (StatsD)", - "uid": "rippled-statsd-network" + "uid": "xrpld-statsd-network" } diff --git a/docker/telemetry/grafana/dashboards/statsd-node-health.json b/docker/telemetry/grafana/dashboards/statsd-node-health.json index 3676c32fc7..82d7f808ef 100644 --- a/docker/telemetry/grafana/dashboards/statsd-node-health.json +++ b/docker/telemetry/grafana/dashboards/statsd-node-health.json @@ -11,7 +11,7 @@ "panels": [ { "title": "Validated Ledger Age", - "description": "Age of the most recently validated ledger in seconds. Sourced from the LedgerMaster.Validated_Ledger_Age gauge (LedgerMaster.h:373) which is updated every collection interval via the insight hook. Values above 20s indicate the node is falling behind the network.", + "description": "Age of the most recently validated ledger in seconds. Sourced from the LedgerMaster.Validated_Ledger_Age gauge (LedgerMaster.h) which is updated every collection interval via the insight hook. Values above 20s indicate the node is falling behind the network.", "type": "stat", "gridPos": { "h": 8, @@ -30,7 +30,7 @@ "datasource": { "type": "prometheus" }, - "expr": "rippled_LedgerMaster_Validated_Ledger_Age", + "expr": "rippled_LedgerMaster_Validated_Ledger_Age{exported_instance=~\"$node\"}", "legendFormat": "Validated Age" } ], @@ -59,7 +59,7 @@ }, { "title": "Published Ledger Age", - "description": "Age of the most recently published ledger in seconds. Sourced from the LedgerMaster.Published_Ledger_Age gauge (LedgerMaster.h:374). Published ledger age should track close to validated ledger age. 
A growing gap indicates publish pipeline backlog.", + "description": "Age of the most recently published ledger in seconds. Sourced from the LedgerMaster.Published_Ledger_Age gauge (LedgerMaster.h). Published ledger age should track close to validated ledger age. A growing gap indicates publish pipeline backlog.", "type": "stat", "gridPos": { "h": 8, @@ -78,7 +78,7 @@ "datasource": { "type": "prometheus" }, - "expr": "rippled_LedgerMaster_Published_Ledger_Age", + "expr": "rippled_LedgerMaster_Published_Ledger_Age{exported_instance=~\"$node\"}", "legendFormat": "Published Age" } ], @@ -107,7 +107,7 @@ }, { "title": "Operating Mode Duration", - "description": "Cumulative time spent in each operating mode (Disconnected, Connected, Syncing, Tracking, Full). Sourced from State_Accounting.*_duration gauges (NetworkOPs.cpp:774-778). A healthy node should spend the vast majority of time in Full mode.", + "description": "Cumulative time spent in each operating mode (Disconnected, Connected, Syncing, Tracking, Full). Sourced from State_Accounting.*_duration gauges (NetworkOPs.cpp). 
A healthy node should spend the vast majority of time in Full mode.", "type": "timeseries", "gridPos": { "h": 8, @@ -126,35 +126,35 @@ "datasource": { "type": "prometheus" }, - "expr": "rippled_State_Accounting_Full_duration", + "expr": "rippled_State_Accounting_Full_duration{exported_instance=~\"$node\"}", "legendFormat": "Full" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_State_Accounting_Tracking_duration", + "expr": "rippled_State_Accounting_Tracking_duration{exported_instance=~\"$node\"}", "legendFormat": "Tracking" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_State_Accounting_Syncing_duration", + "expr": "rippled_State_Accounting_Syncing_duration{exported_instance=~\"$node\"}", "legendFormat": "Syncing" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_State_Accounting_Connected_duration", + "expr": "rippled_State_Accounting_Connected_duration{exported_instance=~\"$node\"}", "legendFormat": "Connected" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_State_Accounting_Disconnected_duration", + "expr": "rippled_State_Accounting_Disconnected_duration{exported_instance=~\"$node\"}", "legendFormat": "Disconnected" } ], @@ -174,7 +174,7 @@ }, { "title": "Operating Mode Transitions", - "description": "Count of transitions into each operating mode. Sourced from State_Accounting.*_transitions gauges (NetworkOPs.cpp:780-786). Frequent transitions out of Full mode indicate instability. Transitions to Disconnected or Syncing warrant investigation.", + "description": "Count of transitions into each operating mode. Sourced from State_Accounting.*_transitions gauges (NetworkOPs.cpp). Frequent transitions out of Full mode indicate instability. 
Transitions to Disconnected or Syncing warrant investigation.", "type": "timeseries", "gridPos": { "h": 8, @@ -193,35 +193,35 @@ "datasource": { "type": "prometheus" }, - "expr": "rippled_State_Accounting_Full_transitions", + "expr": "rippled_State_Accounting_Full_transitions{exported_instance=~\"$node\"}", "legendFormat": "Full" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_State_Accounting_Tracking_transitions", + "expr": "rippled_State_Accounting_Tracking_transitions{exported_instance=~\"$node\"}", "legendFormat": "Tracking" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_State_Accounting_Syncing_transitions", + "expr": "rippled_State_Accounting_Syncing_transitions{exported_instance=~\"$node\"}", "legendFormat": "Syncing" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_State_Accounting_Connected_transitions", + "expr": "rippled_State_Accounting_Connected_transitions{exported_instance=~\"$node\"}", "legendFormat": "Connected" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_State_Accounting_Disconnected_transitions", + "expr": "rippled_State_Accounting_Disconnected_transitions{exported_instance=~\"$node\"}", "legendFormat": "Disconnected" } ], @@ -241,7 +241,7 @@ }, { "title": "I/O Latency", - "description": "P95 and P50 of the I/O service loop latency in milliseconds. Sourced from the ios_latency event (Application.cpp:438) which measures how long it takes for the io_context to process a timer callback. Values above 10ms are logged; above 500ms trigger warnings. High values indicate thread pool saturation or blocking operations.", + "description": "P95 and P50 of the I/O service loop latency in milliseconds. Sourced from the ios_latency event (Application.cpp) which measures how long it takes for the io_context to process a timer callback. Values above 10ms are logged; above 500ms trigger warnings. 
High values indicate thread pool saturation or blocking operations.", "type": "timeseries", "gridPos": { "h": 8, @@ -260,14 +260,14 @@ "datasource": { "type": "prometheus" }, - "expr": "rippled_ios_latency{quantile=\"0.95\"}", + "expr": "rippled_ios_latency{exported_instance=~\"$node\", quantile=\"0.95\"}", "legendFormat": "P95 I/O Latency" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_ios_latency{quantile=\"0.5\"}", + "expr": "rippled_ios_latency{exported_instance=~\"$node\", quantile=\"0.5\"}", "legendFormat": "P50 I/O Latency" } ], @@ -287,7 +287,7 @@ }, { "title": "Job Queue Depth", - "description": "Current number of jobs waiting in the job queue. Sourced from the job_count gauge (JobQueue.cpp:26). A sustained high value indicates the node cannot process work fast enough — common during ledger replay or heavy RPC load.", + "description": "Current number of jobs waiting in the job queue. Sourced from the job_count gauge (JobQueue.cpp). A sustained high value indicates the node cannot process work fast enough \u2014 common during ledger replay or heavy RPC load.", "type": "timeseries", "gridPos": { "h": 8, @@ -306,7 +306,7 @@ "datasource": { "type": "prometheus" }, - "expr": "rippled_job_count", + "expr": "rippled_job_count{exported_instance=~\"$node\"}", "legendFormat": "Job Queue Depth" } ], @@ -326,7 +326,7 @@ }, { "title": "Ledger Fetch Rate", - "description": "Rate of ledger fetch requests initiated by the node. Sourced from the ledger_fetches counter (InboundLedgers.cpp:44) which increments each time the node requests a ledger from a peer. High rates indicate the node is catching up or missing ledgers.", + "description": "Rate of ledger fetch requests initiated by the node. Sourced from the ledger_fetches counter (InboundLedgers.cpp) which increments each time the node requests a ledger from a peer. 
High rates indicate the node is catching up or missing ledgers.", "type": "stat", "gridPos": { "h": 8, @@ -345,7 +345,7 @@ "datasource": { "type": "prometheus" }, - "expr": "rate(rippled_ledger_fetches_total[5m])", + "expr": "rate(rippled_ledger_fetches_total{exported_instance=~\"$node\"}[5m])", "legendFormat": "Fetches / Sec" } ], @@ -358,7 +358,7 @@ }, { "title": "Ledger History Mismatches", - "description": "Rate of ledger history hash mismatches. Sourced from the ledger.history.mismatch counter (LedgerHistory.cpp:16) which increments when a built ledger hash does not match the expected validated hash. Non-zero values indicate consensus divergence or database corruption.", + "description": "Rate of ledger history hash mismatches. Sourced from the ledger.history.mismatch counter (LedgerHistory.cpp) which increments when a built ledger hash does not match the expected validated hash. Non-zero values indicate consensus divergence or database corruption.", "type": "stat", "gridPos": { "h": 8, @@ -377,7 +377,7 @@ "datasource": { "type": "prometheus" }, - "expr": "rate(rippled_ledger_history_mismatch_total[5m])", + "expr": "rate(rippled_ledger_history_mismatch_total{exported_instance=~\"$node\"}[5m])", "legendFormat": "Mismatches / Sec" } ], @@ -402,7 +402,7 @@ }, { "title": "Key Jobs Execution Time", - "description": "Execution time for critical job types at the selected quantile. Sourced from per-job-type events in JobTypeData (JobTypeData.h:48). Shows how long key consensus, transaction, and maintenance jobs take to execute. Spikes indicate processing bottlenecks.", + "description": "Execution time for critical job types at the selected quantile. Sourced from per-job-type events in JobTypeData (JobTypeData.h). Shows how long key consensus, transaction, and maintenance jobs take to execute. 
Spikes indicate processing bottlenecks.", "type": "timeseries", "gridPos": { "h": 8, @@ -421,77 +421,77 @@ "datasource": { "type": "prometheus" }, - "expr": "rippled_acceptLedger{quantile=\"$quantile\"}", + "expr": "rippled_acceptLedger{exported_instance=~\"$node\", quantile=\"$quantile\"}", "legendFormat": "Accept Ledger [{{quantile}}]" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_advanceLedger{quantile=\"$quantile\"}", + "expr": "rippled_advanceLedger{exported_instance=~\"$node\", quantile=\"$quantile\"}", "legendFormat": "Advance Ledger [{{quantile}}]" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_transaction{quantile=\"$quantile\"}", + "expr": "rippled_transaction{exported_instance=~\"$node\", quantile=\"$quantile\"}", "legendFormat": "Transaction [{{quantile}}]" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_writeObjects{quantile=\"$quantile\"}", + "expr": "rippled_writeObjects{exported_instance=~\"$node\", quantile=\"$quantile\"}", "legendFormat": "Write Objects [{{quantile}}]" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_heartbeat{quantile=\"$quantile\"}", + "expr": "rippled_heartbeat{exported_instance=~\"$node\", quantile=\"$quantile\"}", "legendFormat": "Heartbeat [{{quantile}}]" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_sweep{quantile=\"$quantile\"}", + "expr": "rippled_sweep{exported_instance=~\"$node\", quantile=\"$quantile\"}", "legendFormat": "Sweep [{{quantile}}]" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_trustedValidation{quantile=\"$quantile\"}", + "expr": "rippled_trustedValidation{exported_instance=~\"$node\", quantile=\"$quantile\"}", "legendFormat": "Trusted Validation [{{quantile}}]" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_trustedProposal{quantile=\"$quantile\"}", + "expr": "rippled_trustedProposal{exported_instance=~\"$node\", quantile=\"$quantile\"}", "legendFormat": "Trusted Proposal [{{quantile}}]" }, 
{ "datasource": { "type": "prometheus" }, - "expr": "rippled_publishNewLedger{quantile=\"$quantile\"}", + "expr": "rippled_publishNewLedger{exported_instance=~\"$node\", quantile=\"$quantile\"}", "legendFormat": "Publish New Ledger [{{quantile}}]" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_clientRPC{quantile=\"$quantile\"}", + "expr": "rippled_clientRPC{exported_instance=~\"$node\", quantile=\"$quantile\"}", "legendFormat": "Client RPC [{{quantile}}]" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_ledgerData{quantile=\"$quantile\"}", + "expr": "rippled_ledgerData{exported_instance=~\"$node\", quantile=\"$quantile\"}", "legendFormat": "Ledger Data [{{quantile}}]" } ], @@ -511,7 +511,7 @@ }, { "title": "Key Jobs Dequeue Wait Time", - "description": "Time spent waiting in the job queue before execution for critical job types. Sourced from per-job-type dequeue events (JobTypeData.h:47). High dequeue times indicate the job queue is backlogged and jobs are waiting too long to be scheduled.", + "description": "Time spent waiting in the job queue before execution for critical job types. Sourced from per-job-type dequeue events (JobTypeData.h). 
High dequeue times indicate the job queue is backlogged and jobs are waiting too long to be scheduled.", "type": "timeseries", "gridPos": { "h": 8, @@ -530,77 +530,77 @@ "datasource": { "type": "prometheus" }, - "expr": "rippled_acceptLedger_q{quantile=\"$quantile\"}", + "expr": "rippled_acceptLedger_q{exported_instance=~\"$node\", quantile=\"$quantile\"}", "legendFormat": "Accept Ledger [{{quantile}}]" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_advanceLedger_q{quantile=\"$quantile\"}", + "expr": "rippled_advanceLedger_q{exported_instance=~\"$node\", quantile=\"$quantile\"}", "legendFormat": "Advance Ledger [{{quantile}}]" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_transaction_q{quantile=\"$quantile\"}", + "expr": "rippled_transaction_q{exported_instance=~\"$node\", quantile=\"$quantile\"}", "legendFormat": "Transaction [{{quantile}}]" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_writeObjects_q{quantile=\"$quantile\"}", + "expr": "rippled_writeObjects_q{exported_instance=~\"$node\", quantile=\"$quantile\"}", "legendFormat": "Write Objects [{{quantile}}]" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_heartbeat_q{quantile=\"$quantile\"}", + "expr": "rippled_heartbeat_q{exported_instance=~\"$node\", quantile=\"$quantile\"}", "legendFormat": "Heartbeat [{{quantile}}]" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_sweep_q{quantile=\"$quantile\"}", + "expr": "rippled_sweep_q{exported_instance=~\"$node\", quantile=\"$quantile\"}", "legendFormat": "Sweep [{{quantile}}]" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_trustedValidation_q{quantile=\"$quantile\"}", + "expr": "rippled_trustedValidation_q{exported_instance=~\"$node\", quantile=\"$quantile\"}", "legendFormat": "Trusted Validation [{{quantile}}]" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_trustedProposal_q{quantile=\"$quantile\"}", + "expr": 
"rippled_trustedProposal_q{exported_instance=~\"$node\", quantile=\"$quantile\"}", "legendFormat": "Trusted Proposal [{{quantile}}]" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_publishNewLedger_q{quantile=\"$quantile\"}", + "expr": "rippled_publishNewLedger_q{exported_instance=~\"$node\", quantile=\"$quantile\"}", "legendFormat": "Publish New Ledger [{{quantile}}]" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_clientRPC_q{quantile=\"$quantile\"}", + "expr": "rippled_clientRPC_q{exported_instance=~\"$node\", quantile=\"$quantile\"}", "legendFormat": "Client RPC [{{quantile}}]" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_ledgerData_q{quantile=\"$quantile\"}", + "expr": "rippled_ledgerData_q{exported_instance=~\"$node\", quantile=\"$quantile\"}", "legendFormat": "Ledger Data [{{quantile}}]" } ], @@ -620,7 +620,7 @@ }, { "title": "FullBelowCache Size", - "description": "Number of entries in the FullBelowCache. Sourced from the TaggedCache size gauge (TaggedCache.h:183) for the Node family full below cache (NodeFamily.cpp:29). This cache tracks which SHAMap nodes have all children present locally, avoiding redundant fetches during ledger acquisition.", + "description": "Number of entries in the FullBelowCache. Sourced from the TaggedCache size gauge (TaggedCache.h) for the Node family full below cache (NodeFamily.cpp). This cache tracks which SHAMap nodes have all children present locally, avoiding redundant fetches during ledger acquisition.", "type": "timeseries", "gridPos": { "h": 8, @@ -639,7 +639,7 @@ "datasource": { "type": "prometheus" }, - "expr": "rippled_Node_family_full_below_cache_size", + "expr": "rippled_Node_family_full_below_cache_size{exported_instance=~\"$node\"}", "legendFormat": "FullBelowCache Size" } ], @@ -659,7 +659,7 @@ }, { "title": "FullBelowCache Hit Rate", - "description": "Hit rate percentage for the FullBelowCache. Sourced from the TaggedCache hit_rate gauge (TaggedCache.h:184). 
A high hit rate means the node is efficiently reusing cached knowledge about complete SHAMap subtrees. Low hit rates during steady state warrant investigation.", + "description": "Hit rate percentage for the FullBelowCache. Sourced from the TaggedCache hit_rate gauge (TaggedCache.h). A high hit rate means the node is efficiently reusing cached knowledge about complete SHAMap subtrees. Low hit rates during steady state warrant investigation.", "type": "gauge", "gridPos": { "h": 8, @@ -678,7 +678,7 @@ "datasource": { "type": "prometheus" }, - "expr": "rippled_Node_family_full_below_cache_hit_rate", + "expr": "rippled_Node_family_full_below_cache_hit_rate{exported_instance=~\"$node\"}", "legendFormat": "Hit Rate" } ], @@ -728,7 +728,7 @@ "datasource": { "type": "prometheus" }, - "expr": "rippled_LedgerMaster_Published_Ledger_Age - rippled_LedgerMaster_Validated_Ledger_Age", + "expr": "rippled_LedgerMaster_Published_Ledger_Age{exported_instance=~\"$node\"} - rippled_LedgerMaster_Validated_Ledger_Age{exported_instance=~\"$node\"}", "legendFormat": "Publish Gap" } ], @@ -757,7 +757,7 @@ }, { "title": "State Duration Rate (Full vs Tracking)", - "description": "Rate of change of time spent in Full and Tracking operating modes, normalized to seconds. Sourced from State_Accounting duration gauges (NetworkOPs.cpp:774-778). In steady state the Full duration rate should be close to 1.0 (gaining one second of Full-mode time per wall-clock second). A drop below 1.0 means the node is spending time in other modes.", + "description": "Rate of change of time spent in Full and Tracking operating modes, normalized to seconds. Sourced from State_Accounting duration gauges (NetworkOPs.cpp). In steady state the Full duration rate should be close to 1.0 (gaining one second of Full-mode time per wall-clock second).
A drop below 1.0 means the node is spending time in other modes.", "type": "timeseries", "gridPos": { "h": 8, @@ -776,14 +776,14 @@ "datasource": { "type": "prometheus" }, - "expr": "rate(rippled_State_Accounting_Full_duration[5m]) / 1000000", + "expr": "rate(rippled_State_Accounting_Full_duration{exported_instance=~\"$node\"}[5m]) / 1000000", "legendFormat": "Full Mode Rate" }, { "datasource": { "type": "prometheus" }, - "expr": "rate(rippled_State_Accounting_Tracking_duration[5m]) / 1000000", + "expr": "rate(rippled_State_Accounting_Tracking_duration{exported_instance=~\"$node\"}[5m]) / 1000000", "legendFormat": "Tracking Mode Rate" } ], @@ -822,7 +822,7 @@ "datasource": { "type": "prometheus" }, - "expr": "{__name__=~\"rippled_(makeFetchPack|publishAcqLedger|untrustedValidation|manifest|localTransaction|ledgerReplayRequest|ledgerRequest|untrustedProposal|ledgerReplayTask|ledgerData|clientCommand|clientSubscribe|clientFeeChange|clientConsensus|clientAccountHistory|clientRPC|clientWebsocket|RPC|updatePaths|transaction|batch|advanceLedger|publishNewLedger|fetchTxnData|writeAhead|trustedValidation|writeObjects|acceptLedger|trustedProposal|sweep|clusterReport|heartbeat|administration|handleHaveTransactions|doTransactions)\", quantile=\"$quantile\"}", + "expr": "{__name__=~\"rippled_(makeFetchPack|publishAcqLedger|untrustedValidation|manifest|localTransaction|ledgerReplayRequest|ledgerRequest|untrustedProposal|ledgerReplayTask|ledgerData|clientCommand|clientSubscribe|clientFeeChange|clientConsensus|clientAccountHistory|clientRPC|clientWebsocket|RPC|updatePaths|transaction|batch|advanceLedger|publishNewLedger|fetchTxnData|writeAhead|trustedValidation|writeObjects|acceptLedger|trustedProposal|sweep|clusterReport|heartbeat|administration|handleHaveTransactions|doTransactions)\", exported_instance=~\"$node\", quantile=\"$quantile\"}", "legendFormat": "{{__name__}} [{{quantile}}]" } ], @@ -861,7 +861,7 @@ "datasource": { "type": "prometheus" }, - "expr":
"{__name__=~\"rippled_(makeFetchPack_q|publishAcqLedger_q|untrustedValidation_q|manifest_q|localTransaction_q|ledgerReplayRequest_q|ledgerRequest_q|untrustedProposal_q|ledgerReplayTask_q|ledgerData_q|clientCommand_q|clientSubscribe_q|clientFeeChange_q|clientConsensus_q|clientAccountHistory_q|clientRPC_q|clientWebsocket_q|RPC_q|updatePaths_q|transaction_q|batch_q|advanceLedger_q|publishNewLedger_q|fetchTxnData_q|writeAhead_q|trustedValidation_q|writeObjects_q|acceptLedger_q|trustedProposal_q|sweep_q|clusterReport_q|heartbeat_q|administration_q|handleHaveTransactions_q|doTransactions_q)\", quantile=\"$quantile\"}", + "expr": "{__name__=~\"rippled_(makeFetchPack_q|publishAcqLedger_q|untrustedValidation_q|manifest_q|localTransaction_q|ledgerReplayRequest_q|ledgerRequest_q|untrustedProposal_q|ledgerReplayTask_q|ledgerData_q|clientCommand_q|clientSubscribe_q|clientFeeChange_q|clientConsensus_q|clientAccountHistory_q|clientRPC_q|clientWebsocket_q|RPC_q|updatePaths_q|transaction_q|batch_q|advanceLedger_q|publishNewLedger_q|fetchTxnData_q|writeAhead_q|trustedValidation_q|writeObjects_q|acceptLedger_q|trustedProposal_q|sweep_q|clusterReport_q|heartbeat_q|administration_q|handleHaveTransactions_q|doTransactions_q)\", exported_instance=~\"$node\", quantile=\"$quantile\"}", "legendFormat": "{{__name__}} [{{quantile}}]" } ], @@ -884,6 +884,26 @@ "tags": ["rippled", "statsd", "node-health", "telemetry"], "templating": { "list": [ + { + "name": "node", + "label": "Node", + "description": "Filter by xrpld node (service.instance.id \u2014 e.g.
Node-1)", + "type": "query", + "query": "label_values(exported_instance)", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "includeAll": true, + "allValue": ".*", + "current": { + "text": "All", + "value": "$__all" + }, + "multi": true, + "refresh": 2, + "sort": 1 + }, { "name": "quantile", "label": "Quantile", @@ -926,5 +946,5 @@ "to": "now" }, "title": "Node Health (StatsD)", - "uid": "rippled-statsd-node-health" + "uid": "xrpld-statsd-node-health" } diff --git a/docker/telemetry/grafana/dashboards/statsd-overlay-traffic-detail.json b/docker/telemetry/grafana/dashboards/statsd-overlay-traffic-detail.json index a09a2b5d17..25e8dcbf19 100644 --- a/docker/telemetry/grafana/dashboards/statsd-overlay-traffic-detail.json +++ b/docker/telemetry/grafana/dashboards/statsd-overlay-traffic-detail.json @@ -30,42 +30,42 @@ "datasource": { "type": "prometheus" }, - "expr": "rippled_squelch_Messages_In", + "expr": "rippled_squelch_Messages_In{exported_instance=~\"$node\"}", "legendFormat": "Squelch In" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_squelch_Messages_Out", + "expr": "rippled_squelch_Messages_Out{exported_instance=~\"$node\"}", "legendFormat": "Squelch Out" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_squelch_suppressed_Messages_In", + "expr": "rippled_squelch_suppressed_Messages_In{exported_instance=~\"$node\"}", "legendFormat": "Suppressed In" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_squelch_suppressed_Messages_Out", + "expr": "rippled_squelch_suppressed_Messages_Out{exported_instance=~\"$node\"}", "legendFormat": "Suppressed Out" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_squelch_ignored_Messages_In", + "expr": "rippled_squelch_ignored_Messages_In{exported_instance=~\"$node\"}", "legendFormat": "Ignored In" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_squelch_ignored_Messages_Out", + "expr": 
"rippled_squelch_ignored_Messages_Out{exported_instance=~\"$node\"}", "legendFormat": "Ignored Out" } ], @@ -104,42 +104,42 @@ "datasource": { "type": "prometheus" }, - "expr": "rippled_overhead_Bytes_In", + "expr": "rippled_overhead_Bytes_In{exported_instance=~\"$node\"}", "legendFormat": "Base Overhead In" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_overhead_Bytes_Out", + "expr": "rippled_overhead_Bytes_Out{exported_instance=~\"$node\"}", "legendFormat": "Base Overhead Out" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_overhead_cluster_Bytes_In", + "expr": "rippled_overhead_cluster_Bytes_In{exported_instance=~\"$node\"}", "legendFormat": "Cluster In" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_overhead_cluster_Bytes_Out", + "expr": "rippled_overhead_cluster_Bytes_Out{exported_instance=~\"$node\"}", "legendFormat": "Cluster Out" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_overhead_manifest_Bytes_In", + "expr": "rippled_overhead_manifest_Bytes_In{exported_instance=~\"$node\"}", "legendFormat": "Manifest In" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_overhead_manifest_Bytes_Out", + "expr": "rippled_overhead_manifest_Bytes_Out{exported_instance=~\"$node\"}", "legendFormat": "Manifest Out" } ], @@ -178,28 +178,28 @@ "datasource": { "type": "prometheus" }, - "expr": "rippled_validator_lists_Bytes_In", + "expr": "rippled_validator_lists_Bytes_In{exported_instance=~\"$node\"}", "legendFormat": "Bytes In" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_validator_lists_Bytes_Out", + "expr": "rippled_validator_lists_Bytes_Out{exported_instance=~\"$node\"}", "legendFormat": "Bytes Out" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_validator_lists_Messages_In", + "expr": "rippled_validator_lists_Messages_In{exported_instance=~\"$node\"}", "legendFormat": "Messages In" }, { "datasource": { "type": "prometheus" }, - "expr": 
"rippled_validator_lists_Messages_Out", + "expr": "rippled_validator_lists_Messages_Out{exported_instance=~\"$node\"}", "legendFormat": "Messages Out" } ], @@ -255,28 +255,28 @@ "datasource": { "type": "prometheus" }, - "expr": "rippled_set_get_Bytes_In", + "expr": "rippled_set_get_Bytes_In{exported_instance=~\"$node\"}", "legendFormat": "Set Get In" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_set_get_Bytes_Out", + "expr": "rippled_set_get_Bytes_Out{exported_instance=~\"$node\"}", "legendFormat": "Set Get Out" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_set_share_Bytes_In", + "expr": "rippled_set_share_Bytes_In{exported_instance=~\"$node\"}", "legendFormat": "Set Share In" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_set_share_Bytes_Out", + "expr": "rippled_set_share_Bytes_Out{exported_instance=~\"$node\"}", "legendFormat": "Set Share Out" } ], @@ -315,28 +315,28 @@ "datasource": { "type": "prometheus" }, - "expr": "rippled_have_transactions_Messages_In", + "expr": "rippled_have_transactions_Messages_In{exported_instance=~\"$node\"}", "legendFormat": "Have TX In" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_have_transactions_Messages_Out", + "expr": "rippled_have_transactions_Messages_Out{exported_instance=~\"$node\"}", "legendFormat": "Have TX Out" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_requested_transactions_Messages_In", + "expr": "rippled_requested_transactions_Messages_In{exported_instance=~\"$node\"}", "legendFormat": "Requested TX In" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_requested_transactions_Messages_Out", + "expr": "rippled_requested_transactions_Messages_Out{exported_instance=~\"$node\"}", "legendFormat": "Requested TX Out" } ], @@ -375,28 +375,28 @@ "datasource": { "type": "prometheus" }, - "expr": "rippled_unknown_Bytes_In", + "expr": "rippled_unknown_Bytes_In{exported_instance=~\"$node\"}", "legendFormat": "Unknown Bytes 
In" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_unknown_Bytes_Out", + "expr": "rippled_unknown_Bytes_Out{exported_instance=~\"$node\"}", "legendFormat": "Unknown Bytes Out" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_unknown_Messages_In", + "expr": "rippled_unknown_Messages_In{exported_instance=~\"$node\"}", "legendFormat": "Unknown Messages In" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_unknown_Messages_Out", + "expr": "rippled_unknown_Messages_Out{exported_instance=~\"$node\"}", "legendFormat": "Unknown Messages Out" } ], @@ -452,28 +452,28 @@ "datasource": { "type": "prometheus" }, - "expr": "rippled_proof_path_request_Bytes_In", + "expr": "rippled_proof_path_request_Bytes_In{exported_instance=~\"$node\"}", "legendFormat": "Request Bytes In" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_proof_path_request_Bytes_Out", + "expr": "rippled_proof_path_request_Bytes_Out{exported_instance=~\"$node\"}", "legendFormat": "Request Bytes Out" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_proof_path_response_Bytes_In", + "expr": "rippled_proof_path_response_Bytes_In{exported_instance=~\"$node\"}", "legendFormat": "Response Bytes In" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_proof_path_response_Bytes_Out", + "expr": "rippled_proof_path_response_Bytes_Out{exported_instance=~\"$node\"}", "legendFormat": "Response Bytes Out" } ], @@ -512,28 +512,28 @@ "datasource": { "type": "prometheus" }, - "expr": "rippled_replay_delta_request_Bytes_In", + "expr": "rippled_replay_delta_request_Bytes_In{exported_instance=~\"$node\"}", "legendFormat": "Request Bytes In" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_replay_delta_request_Bytes_Out", + "expr": "rippled_replay_delta_request_Bytes_Out{exported_instance=~\"$node\"}", "legendFormat": "Request Bytes Out" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_replay_delta_response_Bytes_In", + 
"expr": "rippled_replay_delta_response_Bytes_In{exported_instance=~\"$node\"}", "legendFormat": "Response Bytes In" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_replay_delta_response_Bytes_Out", + "expr": "rippled_replay_delta_response_Bytes_Out{exported_instance=~\"$node\"}", "legendFormat": "Response Bytes Out" } ], @@ -555,12 +555,33 @@ "schemaVersion": 39, "tags": ["rippled", "statsd", "overlay", "network", "telemetry"], "templating": { - "list": [] + "list": [ + { + "name": "node", + "label": "Node", + "description": "Filter by xrpld node (service.instance.id \u2014 e.g. Node-1)", + "type": "query", + "query": "label_values(exported_instance)", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "includeAll": true, + "allValue": ".*", + "current": { + "text": "All", + "value": "$__all" + }, + "multi": true, + "refresh": 2, + "sort": 1 + } + ] }, "time": { "from": "now-1h", "to": "now" }, "title": "Overlay Traffic Detail (StatsD)", - "uid": "rippled-statsd-overlay-detail" + "uid": "xrpld-statsd-overlay-detail" } diff --git a/docker/telemetry/grafana/dashboards/statsd-rpc-pathfinding.json b/docker/telemetry/grafana/dashboards/statsd-rpc-pathfinding.json index 10bf1575e3..2470564859 100644 --- a/docker/telemetry/grafana/dashboards/statsd-rpc-pathfinding.json +++ b/docker/telemetry/grafana/dashboards/statsd-rpc-pathfinding.json @@ -11,7 +11,7 @@ "panels": [ { "title": "RPC Request Rate (StatsD)", - "description": "Rate of RPC requests as counted by the beast::insight counter. Sourced from rpc.requests (ServerHandler.cpp:108) which increments on every HTTP and WebSocket RPC request. Compare with the span-based rpc.request rate in the RPC Performance dashboard for cross-validation.", + "description": "Rate of RPC requests as counted by the beast::insight counter. Sourced from rpc.requests (ServerHandler.cpp) which increments on every HTTP and WebSocket RPC request. 
Compare with the span-based rpc.request rate in the RPC Performance dashboard for cross-validation.", "type": "stat", "gridPos": { "h": 8, @@ -30,7 +30,7 @@ "datasource": { "type": "prometheus" }, - "expr": "rate(rippled_rpc_requests_total[5m])", + "expr": "rate(rippled_rpc_requests_total{exported_instance=~\"$node\"}[5m])", "legendFormat": "Requests / Sec" } ], @@ -43,7 +43,7 @@ }, { "title": "RPC Response Time (StatsD)", - "description": "P95 and P50 of RPC response time from the beast::insight timer. Sourced from the rpc.time event (ServerHandler.cpp:110) which records elapsed milliseconds for each RPC response. This measures the full HTTP handler time, not just command execution. Compare with span-based rpc.request duration.", + "description": "P95 and P50 of RPC response time from the beast::insight timer. Sourced from the rpc.time event (ServerHandler.cpp) which records elapsed milliseconds for each RPC response. This measures the full HTTP handler time, not just command execution. Compare with span-based rpc.request duration.", "type": "timeseries", "gridPos": { "h": 8, @@ -62,14 +62,14 @@ "datasource": { "type": "prometheus" }, - "expr": "rippled_rpc_time{quantile=\"0.95\"}", + "expr": "rippled_rpc_time{exported_instance=~\"$node\", quantile=\"0.95\"}", "legendFormat": "P95 Response Time" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_rpc_time{quantile=\"0.5\"}", + "expr": "rippled_rpc_time{exported_instance=~\"$node\", quantile=\"0.5\"}", "legendFormat": "P50 Response Time" } ], @@ -89,7 +89,7 @@ }, { "title": "RPC Response Size", - "description": "P95 and P50 of RPC response payload size in bytes. Sourced from the rpc.size event (ServerHandler.cpp:109) which records the byte length of each RPC JSON response. Large responses may indicate expensive queries (e.g. account_tx with many results) or API misuse.", + "description": "P95 and P50 of RPC response payload size in bytes.
Sourced from the rpc.size event (ServerHandler.cpp) which records the byte length of each RPC JSON response. Large responses may indicate expensive queries (e.g. account_tx with many results) or API misuse.", "type": "timeseries", "gridPos": { "h": 8, @@ -108,14 +108,14 @@ "datasource": { "type": "prometheus" }, - "expr": "rippled_rpc_size{quantile=\"0.95\"}", + "expr": "rippled_rpc_size{exported_instance=~\"$node\", quantile=\"0.95\"}", "legendFormat": "P95 Response Size" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_rpc_size{quantile=\"0.5\"}", + "expr": "rippled_rpc_size{exported_instance=~\"$node\", quantile=\"0.5\"}", "legendFormat": "P50 Response Size" } ], @@ -135,7 +135,7 @@ }, { "title": "RPC Response Time Distribution", - "description": "Distribution of RPC response times from the beast::insight timer showing P50, P90, P95, and P99 quantiles. Sourced from the rpc.time event (ServerHandler.cpp:110). Useful for detecting bimodal latency or long-tail requests.", + "description": "Distribution of RPC response times from the beast::insight timer showing P50, P90, P95, and P99 quantiles. Sourced from the rpc.time event (ServerHandler.cpp). 
Useful for detecting bimodal latency or long-tail requests.", "type": "timeseries", "gridPos": { "h": 8, @@ -154,28 +154,28 @@ "datasource": { "type": "prometheus" }, - "expr": "rippled_rpc_time{quantile=\"0.5\"}", + "expr": "rippled_rpc_time{exported_instance=~\"$node\", quantile=\"0.5\"}", "legendFormat": "P50" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_rpc_time{quantile=\"0.9\"}", + "expr": "rippled_rpc_time{exported_instance=~\"$node\", quantile=\"0.9\"}", "legendFormat": "P90" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_rpc_time{quantile=\"0.95\"}", + "expr": "rippled_rpc_time{exported_instance=~\"$node\", quantile=\"0.95\"}", "legendFormat": "P95" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_rpc_time{quantile=\"0.99\"}", + "expr": "rippled_rpc_time{exported_instance=~\"$node\", quantile=\"0.99\"}", "legendFormat": "P99" } ], @@ -195,7 +195,7 @@ }, { "title": "Pathfinding Fast Duration", - "description": "P95 and P50 of fast pathfinding execution time. Sourced from the pathfind_fast event (PathRequests.h:23) which records the duration of the fast pathfinding algorithm. Fast pathfinding uses a simplified search that trades accuracy for speed.", + "description": "P95 and P50 of fast pathfinding execution time. Sourced from the pathfind_fast event (PathRequests.h) which records the duration of the fast pathfinding algorithm. 
Fast pathfinding uses a simplified search that trades accuracy for speed.", "type": "timeseries", "gridPos": { "h": 8, @@ -214,14 +214,14 @@ "datasource": { "type": "prometheus" }, - "expr": "rippled_pathfind_fast{quantile=\"0.95\"}", + "expr": "rippled_pathfind_fast{exported_instance=~\"$node\", quantile=\"0.95\"}", "legendFormat": "P95 Fast Pathfind" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_pathfind_fast{quantile=\"0.5\"}", + "expr": "rippled_pathfind_fast{exported_instance=~\"$node\", quantile=\"0.5\"}", "legendFormat": "P50 Fast Pathfind" } ], @@ -241,7 +241,7 @@ }, { "title": "Pathfinding Full Duration", - "description": "P95 and P50 of full pathfinding execution time. Sourced from the pathfind_full event (PathRequests.h:24) which records the duration of the exhaustive pathfinding search. Full pathfinding is more expensive and can take significantly longer than fast mode.", + "description": "P95 and P50 of full pathfinding execution time. Sourced from the pathfind_full event (PathRequests.h) which records the duration of the exhaustive pathfinding search. Full pathfinding is more expensive and can take significantly longer than fast mode.", "type": "timeseries", "gridPos": { "h": 8, @@ -260,14 +260,14 @@ "datasource": { "type": "prometheus" }, - "expr": "rippled_pathfind_full{quantile=\"0.95\"}", + "expr": "rippled_pathfind_full{exported_instance=~\"$node\", quantile=\"0.95\"}", "legendFormat": "P95 Full Pathfind" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_pathfind_full{quantile=\"0.5\"}", + "expr": "rippled_pathfind_full{exported_instance=~\"$node\", quantile=\"0.5\"}", "legendFormat": "P50 Full Pathfind" } ], @@ -287,7 +287,7 @@ }, { "title": "Resource Warnings Rate", - "description": "Rate of resource warning events from the Resource Manager. Sourced from the warn meter (Logic.h:33) which increments when a consumer (peer or RPC client) exceeds the warning threshold for resource usage. 
A rising rate indicates aggressive clients that may need throttling. NOTE: This panel will show no data until the |m -> |c fix is applied in StatsDCollector.cpp:706 (Phase 6 Task 6.1).", + "description": "Rate of resource warning events from the Resource Manager. Sourced from the warn meter (Logic.h) which increments when a consumer (peer or RPC client) exceeds the warning threshold for resource usage. A rising rate indicates aggressive clients that may need throttling. NOTE: This panel will show no data until the |m -> |c fix is applied in StatsDCollector.cpp (Phase 6 Task 6.1).", "type": "stat", "gridPos": { "h": 8, @@ -306,7 +306,7 @@ "datasource": { "type": "prometheus" }, - "expr": "rate(rippled_warn_total[5m])", + "expr": "rate(rippled_warn_total{exported_instance=~\"$node\"}[5m])", "legendFormat": "Warnings / Sec" } ], @@ -335,7 +335,7 @@ }, { "title": "Resource Drops Rate", - "description": "Rate of resource drop events from the Resource Manager. Sourced from the drop meter (Logic.h:34) which increments when a consumer is disconnected or blocked due to excessive resource usage. Non-zero values mean the node is actively rejecting abusive connections. NOTE: This panel will show no data until the |m -> |c fix is applied in StatsDCollector.cpp:706 (Phase 6 Task 6.1).", + "description": "Rate of resource drop events from the Resource Manager. Sourced from the drop meter (Logic.h) which increments when a consumer is disconnected or blocked due to excessive resource usage. Non-zero values mean the node is actively rejecting abusive connections.
NOTE: This panel will show no data until the |m -> |c fix is applied in StatsDCollector.cpp (Phase 6 Task 6.1).", "type": "stat", "gridPos": { "h": 8, @@ -354,7 +354,7 @@ "datasource": { "type": "prometheus" }, - "expr": "rate(rippled_drop_total[5m])", + "expr": "rate(rippled_drop_total{exported_instance=~\"$node\"}[5m])", "legendFormat": "Drops / Sec" } ], @@ -385,12 +385,33 @@ "schemaVersion": 39, "tags": ["rippled", "statsd", "rpc", "pathfinding", "telemetry"], "templating": { - "list": [] + "list": [ + { + "name": "node", + "label": "Node", + "description": "Filter by xrpld node (service.instance.id \u2014 e.g. Node-1)", + "type": "query", + "query": "label_values(exported_instance)", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "includeAll": true, + "allValue": ".*", + "current": { + "text": "All", + "value": "$__all" + }, + "multi": true, + "refresh": 2, + "sort": 1 + } + ] }, "time": { "from": "now-1h", "to": "now" }, "title": "RPC & Pathfinding (StatsD)", - "uid": "rippled-statsd-rpc" + "uid": "xrpld-statsd-rpc" } From a789f6ccf588341d27b58baabf7bce24b2462a18 Mon Sep 17 00:00:00 2001 From: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> Date: Thu, 14 May 2026 16:53:40 +0100 Subject: [PATCH 4/8] docs(telemetry): fix stale rpc.request refs + drop unparsed exporter key in TESTING.md Follow-up to the dashboard cleanup on this branch. Caught additional sites in TESTING.md that still reference the never-emitted `rpc.request` span: - TraceQL query examples in Step 5 "Verify traces in Tempo" now filter on `name="rpc.http_request"` (the real emitted name). - Expected-spans table replaces `rpc.request` with `rpc.http_request`. - Query loop under the Prometheus verification section now iterates over the full set of emitted RPC entry-point names (`rpc.http_request`, `rpc.ws_upgrade`, `rpc.ws_message`, `rpc.process`). Also drop `exporter=otlp_http` from the sample telemetry config block.
`TelemetryConfig.cpp` does not parse an `exporter` key in any phase through Phase 8; only OTLP/HTTP is wired up, so the line is either a silently ignored no-op or misleading documentation. --- docker/telemetry/TESTING.md | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/docker/telemetry/TESTING.md b/docker/telemetry/TESTING.md index 1346f2d49c..fd20121f4b 100644 --- a/docker/telemetry/TESTING.md +++ b/docker/telemetry/TESTING.md @@ -123,7 +123,7 @@ curl -s "$TEMPO/api/v2/search/tag/resource.service.name/values" | jq '.tagValues # Check RPC spans curl -s "$TEMPO/api/search" \ - --data-urlencode 'q={resource.service.name="xrpld" && name="rpc.request"}' \ + --data-urlencode 'q={resource.service.name="xrpld" && name="rpc.http_request"}' \ --data-urlencode 'limit=5' | jq '.traces | length' curl -s "$TEMPO/api/search" \ @@ -159,7 +159,7 @@ rm -rf data/ | Span Name | Expected | Notes | | --------------------------- | -------- | ----------------------------- | -| `rpc.request` | Yes | Every HTTP RPC call | +| `rpc.http_request` | Yes | Every HTTP RPC call | | `rpc.process` | Yes | Every RPC processing | | `rpc.command.server_info` | Yes | server_info RPC | | `rpc.command.server_state` | Yes | server_state RPC | @@ -285,7 +285,6 @@ online_delete=256 [telemetry] enabled=1 endpoint=http://localhost:4318/v1/traces -exporter=otlp_http sampling_ratio=1.0 batch_size=512 batch_delay_ms=2000 @@ -412,7 +411,7 @@ TEMPO="http://localhost:3200" curl -s "$TEMPO/api/v2/search/tag/resource.service.name/values" | jq '.tagValues[].value' # Query traces by operation -for op in "rpc.request" "rpc.process" \ +for op in "rpc.http_request" "rpc.ws_upgrade" "rpc.ws_message" "rpc.process" \ "rpc.command.server_info" "rpc.command.server_state" "rpc.command.ledger" \ "tx.process" "tx.receive" "tx.apply" \ "consensus.proposal.send" "consensus.ledger_close" \ From 1a36ef4b0fb75519c56e525731090d43d58140de Mon Sep 17 00:00:00 2001 From: Pratik Mankawde 
<3397372+pratikmankawde@users.noreply.github.com> Date: Thu, 14 May 2026 16:58:47 +0100 Subject: [PATCH 5/8] fix(telemetry): rename remaining rippled-* dashboard UIDs + fix stale rpc.request span filter Follow-up to the phase-6 dashboard cleanup. The three dashboards introduced by commit f6105ece98 (consensus-health, rpc-performance, transaction-overview) were missed in the initial UID rename and still carried `rippled-*` UIDs plus line-number refs in panel descriptions. - UIDs: `rippled-consensus` -> `xrpld-consensus`, `rippled-rpc-perf` -> `xrpld-rpc-perf`, `rippled-transactions` -> `xrpld-transactions`, matching the post-`docs.sh`-rename runbook and the other dashboards in this PR. - Strip `:` suffixes from `ServerHandler.cpp`, `RCLConsensus.cpp`, `NetworkOPs.cpp`, etc. references in panel descriptions. Line numbers drift on every refactor; the filename is enough to grep. - Fix the Overall RPC Throughput panel: two targets filtered on `span_name="rpc.request"` (never emitted) instead of `span_name="rpc.http_request"` (the real emitted name). The panel would have shown zero data until this fix. --- .../telemetry/grafana/dashboards/consensus-health.json | 10 +++++----- .../telemetry/grafana/dashboards/rpc-performance.json | 10 +++++----- .../grafana/dashboards/transaction-overview.json | 10 +++++----- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/docker/telemetry/grafana/dashboards/consensus-health.json b/docker/telemetry/grafana/dashboards/consensus-health.json index d40f42cc58..2d4597533b 100644 --- a/docker/telemetry/grafana/dashboards/consensus-health.json +++ b/docker/telemetry/grafana/dashboards/consensus-health.json @@ -10,7 +10,7 @@ "panels": [ { "title": "Consensus Round Duration", - "description": "p95 and p50 duration of consensus accept rounds. The consensus.accept span (RCLConsensus.cpp:395) measures the time to process an accepted ledger including transaction application and state finalization. 
The span carries proposers and round_time_ms attributes. Normal range is 3-6 seconds on mainnet.", + "description": "p95 and p50 duration of consensus accept rounds. The consensus.accept span (RCLConsensus.cpp) measures the time to process an accepted ledger including transaction application and state finalization. The span carries proposers and round_time_ms attributes. Normal range is 3-6 seconds on mainnet.", "type": "timeseries", "gridPos": { "h": 8, @@ -56,7 +56,7 @@ }, { "title": "Consensus Proposals Sent Rate", - "description": "Rate at which this node sends consensus proposals to the network. Sourced from the consensus.proposal.send span (RCLConsensus.cpp:177) which fires each time the node proposes a transaction set. The span carries xrpl.consensus.round identifying the consensus round number. A healthy proposing node should show steady proposal output.", + "description": "Rate at which this node sends consensus proposals to the network. Sourced from the consensus.proposal.send span (RCLConsensus.cpp) which fires each time the node proposes a transaction set. The span carries xrpl.consensus.round identifying the consensus round number. A healthy proposing node should show steady proposal output.", "type": "timeseries", "gridPos": { "h": 8, @@ -95,7 +95,7 @@ }, { "title": "Ledger Close Duration", - "description": "p95 duration of the ledger close event. The consensus.ledger_close span (RCLConsensus.cpp:282) measures the time from when consensus triggers a ledger close to completion. Carries xrpl.ledger.seq and xrpl.consensus.mode attributes. Compare with Consensus Round Duration to understand how close timing relates to overall round time.", + "description": "p95 duration of the ledger close event. The consensus.ledger_close span (RCLConsensus.cpp) measures the time from when consensus triggers a ledger close to completion. Carries xrpl.ledger.seq and xrpl.consensus.mode attributes. 
Compare with Consensus Round Duration to understand how close timing relates to overall round time.", "type": "timeseries", "gridPos": { "h": 8, @@ -134,7 +134,7 @@ }, { "title": "Validation Send Rate", - "description": "Rate at which this node sends ledger validations to the network. Sourced from the consensus.validation.send span (RCLConsensus.cpp:753). Each validation confirms the node has fully validated a ledger. The span carries xrpl.ledger.seq and proposing. Should closely track the ledger close rate when the node is healthy.", + "description": "Rate at which this node sends ledger validations to the network. Sourced from the consensus.validation.send span (RCLConsensus.cpp). Each validation confirms the node has fully validated a ledger. The span carries xrpl.ledger.seq and proposing. Should closely track the ledger close rate when the node is healthy.", "type": "stat", "gridPos": { "h": 8, @@ -772,5 +772,5 @@ "to": "now" }, "title": "Consensus Health", - "uid": "rippled-consensus" + "uid": "xrpld-consensus" } diff --git a/docker/telemetry/grafana/dashboards/rpc-performance.json b/docker/telemetry/grafana/dashboards/rpc-performance.json index 7834ec4029..a90983b2e8 100644 --- a/docker/telemetry/grafana/dashboards/rpc-performance.json +++ b/docker/telemetry/grafana/dashboards/rpc-performance.json @@ -166,7 +166,7 @@ }, { "title": "Overall RPC Throughput", - "description": "Aggregate RPC throughput showing two layers of the request pipeline. rpc.request is the outer HTTP handler (ServerHandler.cpp:271) that accepts incoming connections. rpc.process is the inner processing layer (ServerHandler.cpp:573) that parses and dispatches. A gap between the two indicates requests being queued or rejected before processing.", + "description": "Aggregate RPC throughput showing two layers of the request pipeline. rpc.http_request is the outer HTTP handler (ServerHandler.cpp) that accepts incoming connections. 
rpc.process is the inner processing layer (ServerHandler.cpp) that parses and dispatches. A gap between the two indicates requests being queued or rejected before processing.", "type": "timeseries", "gridPos": { "h": 8, @@ -185,8 +185,8 @@ "datasource": { "type": "prometheus" }, - "expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", command=~\"$command\", span_name=\"rpc.request\"}[5m]))", - "legendFormat": "rpc.request / Sec [{{exported_instance}}]" + "expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", command=~\"$command\", span_name=\"rpc.http_request\"}[5m]))", + "legendFormat": "rpc.http_request / Sec [{{exported_instance}}]" }, { "datasource": { @@ -290,7 +290,7 @@ }, { "title": "WebSocket Message Rate", - "description": "Rate of incoming WebSocket RPC messages processed by the server. Sourced from the rpc.ws_message span (ServerHandler.cpp:384). Only active when clients connect via WebSocket instead of HTTP. Zero is normal if only HTTP RPC is in use.", + "description": "Rate of incoming WebSocket RPC messages processed by the server. Sourced from the rpc.ws_message span (ServerHandler.cpp). Only active when clients connect via WebSocket instead of HTTP. Zero is normal if only HTTP RPC is in use.", "type": "stat", "gridPos": { "h": 8, @@ -372,5 +372,5 @@ "to": "now" }, "title": "RPC Performance", - "uid": "rippled-rpc-perf" + "uid": "xrpld-rpc-perf" } diff --git a/docker/telemetry/grafana/dashboards/transaction-overview.json b/docker/telemetry/grafana/dashboards/transaction-overview.json index edcb4d872e..0de4adf8a3 100644 --- a/docker/telemetry/grafana/dashboards/transaction-overview.json +++ b/docker/telemetry/grafana/dashboards/transaction-overview.json @@ -10,7 +10,7 @@ "panels": [ { "title": "Transaction Processing Rate", - "description": "Rate of transactions entering the processing pipeline. 
tx.process (NetworkOPs.cpp:1227) fires when a transaction is submitted locally or received from a peer and enters processTransaction(). tx.receive (PeerImp.cpp:1273) fires when a raw transaction message arrives from a peer before deduplication.", + "description": "Rate of transactions entering the processing pipeline. tx.process (NetworkOPs.cpp) fires when a transaction is submitted locally or received from a peer and enters processTransaction(). tx.receive (PeerImp.cpp) fires when a raw transaction message arrives from a peer before deduplication.", "type": "timeseries", "gridPos": { "h": 8, @@ -128,7 +128,7 @@ }, { "title": "Transaction Receive vs Suppressed", - "description": "Total rate of raw transaction messages received from peers (tx.receive span from PeerImp.cpp:1273). This fires before deduplication via the HashRouter, so the difference between tx.receive and tx.process reflects suppressed duplicate transactions.", + "description": "Total rate of raw transaction messages received from peers (tx.receive span from PeerImp.cpp). This fires before deduplication via the HashRouter, so the difference between tx.receive and tx.process reflects suppressed duplicate transactions.", "type": "timeseries", "gridPos": { "h": 8, @@ -197,7 +197,7 @@ }, { "title": "Transaction Apply Duration per Ledger", - "description": "p95 and p50 latency of applying the consensus transaction set to a new ledger. The tx.apply span (BuildLedger.cpp:88) wraps the applyTransactions() function that iterates through the CanonicalTXSet and applies each transaction to the OpenView. Long durations indicate heavy transaction sets or expensive transaction processing.", + "description": "p95 and p50 latency of applying the consensus transaction set to a new ledger. The tx.apply span (BuildLedger.cpp) wraps the applyTransactions() function that iterates through the CanonicalTXSet and applies each transaction to the OpenView. 
Long durations indicate heavy transaction sets or expensive transaction processing.", "type": "timeseries", "gridPos": { "h": 8, @@ -243,7 +243,7 @@ }, { "title": "Peer Transaction Receive Rate", - "description": "Rate of transaction messages received from network peers. Sourced from the tx.receive span (PeerImp.cpp:1273) which fires in the onMessage(TMTransaction) handler. High rates may indicate network-wide transaction volume spikes or peer flooding.", + "description": "Rate of transaction messages received from network peers. Sourced from the tx.receive span (PeerImp.cpp) which fires in the onMessage(TMTransaction) handler. High rates may indicate network-wide transaction volume spikes or peer flooding.", "type": "timeseries", "gridPos": { "h": 8, @@ -380,5 +380,5 @@ "to": "now" }, "title": "Transaction Overview", - "uid": "rippled-transactions" + "uid": "xrpld-transactions" } From 92bc0b24b883dfcad6cb976ef6cd9f6492901394 Mon Sep 17 00:00:00 2001 From: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> Date: Thu, 14 May 2026 16:59:43 +0100 Subject: [PATCH 6/8] docs(telemetry): drop volatile line numbers from Phase 4 span-catalog table MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 4 added a span catalog in `06-implementation-phases.md` listing the source location for each consensus span. Line numbers `Consensus.h:707`, `RCLConsensus.cpp:232/341/492/541/900` drift on every refactor and would become stale PR after PR. Filename alone is enough for operators to grep — the RCLConsensus.cpp spans are already unambiguous from the span name itself. 
--- OpenTelemetryPlan/06-implementation-phases.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/OpenTelemetryPlan/06-implementation-phases.md b/OpenTelemetryPlan/06-implementation-phases.md index 8611d53adc..1c68cd5808 100644 --- a/OpenTelemetryPlan/06-implementation-phases.md +++ b/OpenTelemetryPlan/06-implementation-phases.md @@ -179,14 +179,14 @@ SHAMap tracing are not implemented. ### Spans Produced -| Span Name | Location | Attributes | -| --------------------------- | ---------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `consensus.phase.open` | `Consensus.h:707` | _(none)_ | -| `consensus.proposal.send` | `RCLConsensus.cpp:232` | `xrpl.consensus.round` | -| `consensus.ledger_close` | `RCLConsensus.cpp:341` | `xrpl.consensus.ledger.seq`, `xrpl.consensus.mode` | -| `consensus.accept` | `RCLConsensus.cpp:492` | `xrpl.consensus.proposers`, `xrpl.consensus.round_time_ms`, `xrpl.consensus.quorum` | -| `consensus.accept.apply` | `RCLConsensus.cpp:541` | `xrpl.consensus.close_time`, `close_time_correct`, `close_resolution_ms`, `state`, `proposing`, `round_time_ms`, `ledger.seq`, `parent_close_time`, `close_time_self`, `close_time_vote_bins`, `resolution_direction` | -| `consensus.validation.send` | `RCLConsensus.cpp:900` | `xrpl.consensus.ledger.seq`, `xrpl.consensus.proposing` | +| Span Name | Location | Attributes | +| --------------------------- | ------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `consensus.phase.open` | `Consensus.h` | _(none)_ | +| `consensus.proposal.send` | `RCLConsensus.cpp` | `xrpl.consensus.round` | +| `consensus.ledger_close` | 
`RCLConsensus.cpp` | `xrpl.consensus.ledger.seq`, `xrpl.consensus.mode` | +| `consensus.accept` | `RCLConsensus.cpp` | `xrpl.consensus.proposers`, `xrpl.consensus.round_time_ms`, `xrpl.consensus.quorum` | +| `consensus.accept.apply` | `RCLConsensus.cpp` | `xrpl.consensus.close_time`, `close_time_correct`, `close_resolution_ms`, `state`, `proposing`, `round_time_ms`, `ledger.seq`, `parent_close_time`, `close_time_self`, `close_time_vote_bins`, `resolution_direction` | +| `consensus.validation.send` | `RCLConsensus.cpp` | `xrpl.consensus.ledger.seq`, `xrpl.consensus.proposing` | ### Exit Criteria From 145b1469d6e3aba1517fb01850e49218ed0043ed Mon Sep 17 00:00:00 2001 From: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> Date: Thu, 14 May 2026 17:11:25 +0100 Subject: [PATCH 7/8] fix(telemetry): rename phase-9 dashboard JSON files rippled-* -> xrpld-* File renames to match the post-docs.sh project-wide rename + the UID rename applied in the previous commit. Five phase-9 dashboards are affected: - rippled-fee-market.json -> xrpld-fee-market.json - rippled-job-queue.json -> xrpld-job-queue.json - rippled-peer-quality.json -> xrpld-peer-quality.json - rippled-rpc-perf.json -> xrpld-rpc-perf-otel.json - rippled-validator-health.json-> xrpld-validator-health.json `rippled-rpc-perf.json` is renamed to `xrpld-rpc-perf-otel.json` (rather than `xrpld-rpc-perf.json`) to avoid colliding with the phase-6 `rpc-performance.json` dashboard which also uses the `xrpld-rpc-perf` UID. The new filename matches its now-unique `xrpld-rpc-perf-otel` UID that was set in the merge commit. 
--- .../dashboards/{rippled-fee-market.json => xrpld-fee-market.json} | 0 .../dashboards/{rippled-job-queue.json => xrpld-job-queue.json} | 0 .../{rippled-peer-quality.json => xrpld-peer-quality.json} | 0 .../{rippled-rpc-perf.json => xrpld-rpc-perf-otel.json} | 0 ...{rippled-validator-health.json => xrpld-validator-health.json} | 0 5 files changed, 0 insertions(+), 0 deletions(-) rename docker/telemetry/grafana/dashboards/{rippled-fee-market.json => xrpld-fee-market.json} (100%) rename docker/telemetry/grafana/dashboards/{rippled-job-queue.json => xrpld-job-queue.json} (100%) rename docker/telemetry/grafana/dashboards/{rippled-peer-quality.json => xrpld-peer-quality.json} (100%) rename docker/telemetry/grafana/dashboards/{rippled-rpc-perf.json => xrpld-rpc-perf-otel.json} (100%) rename docker/telemetry/grafana/dashboards/{rippled-validator-health.json => xrpld-validator-health.json} (100%) diff --git a/docker/telemetry/grafana/dashboards/rippled-fee-market.json b/docker/telemetry/grafana/dashboards/xrpld-fee-market.json similarity index 100% rename from docker/telemetry/grafana/dashboards/rippled-fee-market.json rename to docker/telemetry/grafana/dashboards/xrpld-fee-market.json diff --git a/docker/telemetry/grafana/dashboards/rippled-job-queue.json b/docker/telemetry/grafana/dashboards/xrpld-job-queue.json similarity index 100% rename from docker/telemetry/grafana/dashboards/rippled-job-queue.json rename to docker/telemetry/grafana/dashboards/xrpld-job-queue.json diff --git a/docker/telemetry/grafana/dashboards/rippled-peer-quality.json b/docker/telemetry/grafana/dashboards/xrpld-peer-quality.json similarity index 100% rename from docker/telemetry/grafana/dashboards/rippled-peer-quality.json rename to docker/telemetry/grafana/dashboards/xrpld-peer-quality.json diff --git a/docker/telemetry/grafana/dashboards/rippled-rpc-perf.json b/docker/telemetry/grafana/dashboards/xrpld-rpc-perf-otel.json similarity index 100% rename from 
docker/telemetry/grafana/dashboards/rippled-rpc-perf.json rename to docker/telemetry/grafana/dashboards/xrpld-rpc-perf-otel.json diff --git a/docker/telemetry/grafana/dashboards/rippled-validator-health.json b/docker/telemetry/grafana/dashboards/xrpld-validator-health.json similarity index 100% rename from docker/telemetry/grafana/dashboards/rippled-validator-health.json rename to docker/telemetry/grafana/dashboards/xrpld-validator-health.json From 2735e4ac782d8b9eff427832ea7dfbc8733c9e4d Mon Sep 17 00:00:00 2001 From: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> Date: Thu, 14 May 2026 17:20:52 +0100 Subject: [PATCH 8/8] fix(telemetry): detach metrics gauge callbacks before Application services stop MetricsRegistry observable-gauge callbacks run on the OTel reader thread and read live state from nodeStore_, overlay_, networkOPs_, ledgerMaster, inboundLedgers, loadManager, and others. The old shutdown sequence called metricsRegistry_->stop() AFTER all those services were already stopped, which left a race window between each service's stop() and the final provider_->ForceFlush() during which a callback could dereference already-stopped service state. The try/catch guards in each callback mitigated crashes but not reads from freed members. - Add MetricsRegistry::detachCallbacks() that sets an atomic callbacksDetached_ with release ordering. Idempotent. - Guard every ObservableGauge callback entry with an acquire-load of the same flag and return early if it is set. Covers all 15 registered callbacks (cacheHitRate, txq, objectCount, loadFactor, nodeStore, serverInfo, buildInfo, completeLedgers, dbMetrics, validatorHealth, peerQuality, ledgerEconomy, stateTracking, storageDetail, validationAgreement). 
- Application::run() shutdown sequence now calls metricsRegistry_->detachCallbacks() right after m_loadManager->stop() and BEFORE m_shaMapStore, m_jobQueue, overlay_, grpcServer_, m_networkOPs, serverHandler_, m_ledgerReplayer, m_inboundTransactions, m_inboundLedgers, ledgerCleaner_, m_nodeStore, perfLog_ are stopped. The acquire/release pair guarantees subsequent reader-thread ticks see the detach before they dereference stopped services. - metricsRegistry_->stop() keeps setting the flag as a belt-and-suspenders defense in case a future caller forgets to detach first. - Drop the misleading "No explicit RemoveCallback is needed" comment from stop(); provider destruction alone does not beat the reader thread to already-freed state. The objectCountGauge callback previously discarded its state pointer via `void* /* state */`; restore the state argument so it can access self->callbacksDetached_ too. --- src/xrpld/app/main/Application.cpp | 13 +++++++ src/xrpld/telemetry/MetricsRegistry.cpp | 50 +++++++++++++++++++++++-- src/xrpld/telemetry/MetricsRegistry.h | 34 ++++++++++++++++- 3 files changed, 93 insertions(+), 4 deletions(-) diff --git a/src/xrpld/app/main/Application.cpp b/src/xrpld/app/main/Application.cpp index 8d4c620fae..44a280af85 100644 --- a/src/xrpld/app/main/Application.cpp +++ b/src/xrpld/app/main/Application.cpp @@ -1686,6 +1686,19 @@ ApplicationImp::run() // The order of these stop calls is delicate. // Re-ordering them risks undefined behavior. m_loadManager->stop(); + + // Detach MetricsRegistry observable-gauge callbacks BEFORE stopping + // any service the callbacks read from. The callbacks run on the OTel + // reader thread and touch nodeStore_, overlay_, networkOPs_, + // ledgerMaster, inboundLedgers, etc. A final tick that fires after + // one of those services has shut down would dereference dangling + // state. detachCallbacks() flips an atomic flag every callback + // acquire-loads at its entry, so subsequent ticks become no-ops. 
+ // The final provider teardown still happens in metricsRegistry_->stop() + // farther down. + if (metricsRegistry_) + metricsRegistry_->detachCallbacks(); + m_shaMapStore->stop(); m_jobQueue->stop(); if (overlay_) diff --git a/src/xrpld/telemetry/MetricsRegistry.cpp b/src/xrpld/telemetry/MetricsRegistry.cpp index d13486f07c..2e348f58c0 100644 --- a/src/xrpld/telemetry/MetricsRegistry.cpp +++ b/src/xrpld/telemetry/MetricsRegistry.cpp @@ -176,6 +176,15 @@ MetricsRegistry::start(std::string const& endpoint, std::string const& instanceI #endif // XRPL_ENABLE_TELEMETRY } +void +MetricsRegistry::detachCallbacks() noexcept +{ +#ifdef XRPL_ENABLE_TELEMETRY + // Release so every subsequent callback acquire-load sees true. + callbacksDetached_.store(true, std::memory_order_release); +#endif // XRPL_ENABLE_TELEMETRY +} + void MetricsRegistry::stop() { @@ -185,10 +194,16 @@ MetricsRegistry::stop() JLOG(journal_.info()) << "MetricsRegistry: stopping"; + // Belt-and-suspenders: detachCallbacks() should have already been + // called by Application shutdown before any service the callbacks + // observe was stopped. Setting the flag here is redundant for a + // correct caller but protects against a future caller that forgets + // to detach first. + callbacksDetached_.store(true, std::memory_order_release); + // Force-flush any pending metrics, then destroy the provider. // This stops the PeriodicExportingMetricReader, which in turn - // stops invoking observable gauge callbacks. No explicit - // RemoveCallback is needed — the provider destruction handles it. + // stops invoking observable gauge callbacks. 
provider_->ForceFlush(); provider_.reset(); @@ -344,6 +359,8 @@ MetricsRegistry::registerCacheHitRateGauge() cacheHitRateGauge_->AddCallback( [](opentelemetry::metrics::ObserverResult result, void* state) { auto* self = static_cast(state); + if (self->callbacksDetached_.load(std::memory_order_acquire)) + return; auto& app = self->app_; try @@ -413,6 +430,8 @@ MetricsRegistry::registerTxqGauge() txqGauge_->AddCallback( [](opentelemetry::metrics::ObserverResult result, void* state) { auto* self = static_cast(state); + if (self->callbacksDetached_.load(std::memory_order_acquire)) + return; auto& app = self->app_; try @@ -457,7 +476,10 @@ MetricsRegistry::registerObjectCountGauge() objectCountGauge_ = meter_->CreateInt64ObservableGauge( "xrpld_object_count", "Live instance counts for key internal object types"); objectCountGauge_->AddCallback( - [](opentelemetry::metrics::ObserverResult result, void* /* state */) { + [](opentelemetry::metrics::ObserverResult result, void* state) { + auto* self = static_cast(state); + if (self->callbacksDetached_.load(std::memory_order_acquire)) + return; try { // Iterate through all CountedObject types via the linked @@ -488,6 +510,8 @@ MetricsRegistry::registerLoadFactorGauge() loadFactorGauge_->AddCallback( [](opentelemetry::metrics::ObserverResult result, void* state) { auto* self = static_cast(state); + if (self->callbacksDetached_.load(std::memory_order_acquire)) + return; auto& app = self->app_; try @@ -562,6 +586,8 @@ MetricsRegistry::registerNodeStoreGauge() nodeStoreGauge_->AddCallback( [](opentelemetry::metrics::ObserverResult result, void* state) { auto* self = static_cast(state); + if (self->callbacksDetached_.load(std::memory_order_acquire)) + return; auto& app = self->app_; try @@ -633,6 +659,8 @@ MetricsRegistry::registerServerInfoGauge() serverInfoGauge_->AddCallback( [](opentelemetry::metrics::ObserverResult result, void* state) { auto* self = static_cast(state); + if 
(self->callbacksDetached_.load(std::memory_order_acquire)) + return; auto& app = self->app_; try @@ -721,6 +749,8 @@ MetricsRegistry::registerCompleteLedgersGauge() completeLedgersGauge_->AddCallback( [](opentelemetry::metrics::ObserverResult result, void* state) { auto* self = static_cast(state); + if (self->callbacksDetached_.load(std::memory_order_acquire)) + return; auto& app = self->app_; try @@ -778,6 +808,8 @@ MetricsRegistry::registerDbMetricsGauge() dbMetricsGauge_->AddCallback( [](opentelemetry::metrics::ObserverResult result, void* state) { auto* self = static_cast(state); + if (self->callbacksDetached_.load(std::memory_order_acquire)) + return; auto& app = self->app_; try @@ -815,6 +847,8 @@ MetricsRegistry::registerValidatorHealthGauge() validatorHealthGauge_->AddCallback( [](opentelemetry::metrics::ObserverResult result, void* state) { auto* self = static_cast(state); + if (self->callbacksDetached_.load(std::memory_order_acquire)) + return; auto& app = self->app_; try @@ -862,6 +896,8 @@ MetricsRegistry::registerPeerQualityGauge() peerQualityGauge_->AddCallback( [](opentelemetry::metrics::ObserverResult result, void* state) { auto* self = static_cast(state); + if (self->callbacksDetached_.load(std::memory_order_acquire)) + return; auto& app = self->app_; try @@ -941,6 +977,8 @@ MetricsRegistry::registerLedgerEconomyGauge() ledgerEconomyGauge_->AddCallback( [](opentelemetry::metrics::ObserverResult result, void* state) { auto* self = static_cast(state); + if (self->callbacksDetached_.load(std::memory_order_acquire)) + return; auto& app = self->app_; try @@ -999,6 +1037,8 @@ MetricsRegistry::registerStateTrackingGauge() stateTrackingGauge_->AddCallback( [](opentelemetry::metrics::ObserverResult result, void* state) { auto* self = static_cast(state); + if (self->callbacksDetached_.load(std::memory_order_acquire)) + return; auto& app = self->app_; try @@ -1046,6 +1086,8 @@ MetricsRegistry::registerStorageDetailGauge() storageDetailGauge_->AddCallback( 
[](opentelemetry::metrics::ObserverResult result, void* state) { auto* self = static_cast(state); + if (self->callbacksDetached_.load(std::memory_order_acquire)) + return; auto& app = self->app_; try @@ -1083,6 +1125,8 @@ MetricsRegistry::registerValidationAgreementGauge() validationAgreementGauge_->AddCallback( [](opentelemetry::metrics::ObserverResult result, void* state) { auto* self = static_cast(state); + if (self->callbacksDetached_.load(std::memory_order_acquire)) + return; try { diff --git a/src/xrpld/telemetry/MetricsRegistry.h b/src/xrpld/telemetry/MetricsRegistry.h index 4d2cf11d1b..9a59fb28dd 100644 --- a/src/xrpld/telemetry/MetricsRegistry.h +++ b/src/xrpld/telemetry/MetricsRegistry.h @@ -129,6 +129,7 @@ #include +#include #include #include #include @@ -231,7 +232,30 @@ public: void start(std::string const& endpoint, std::string const& instanceId = {}); - /** Flush pending metrics and shut down the pipeline. */ + /** Detach all ObservableGauge callbacks so they no-op on the next + reader-thread tick. + + Must be called BEFORE any Application service that the callbacks + read (nodeStore, overlay, networkOPs, ledgerMaster, etc.) is + stopped. The flag is checked with acquire ordering at the top of + every callback; together with the release store here it + guarantees that once `detachCallbacks()` returns, no subsequent + callback invocation will dereference an already-stopped service. + + Idempotent. Safe to call multiple times. Safe to call before + `start()` (has no effect). The actual SDK-level provider + shutdown still happens in `stop()`. + */ + void + detachCallbacks() noexcept; + + /** Flush pending metrics and shut down the pipeline. + + @pre `detachCallbacks()` should have been called earlier in the + shutdown sequence; otherwise there is a narrow race between + the final reader-thread tick and the destruction of + Application services that the gauge callbacks read from. + */ void stop(); @@ -354,6 +378,14 @@ private: /// Journal for logging. 
beast::Journal const journal_; + + /// Set by detachCallbacks() during shutdown so every ObservableGauge + /// callback returns early before reading Application services that + /// may already be stopped. Checked with memory_order_acquire at the + /// top of each callback to pair with the memory_order_release store + /// in detachCallbacks(). + std::atomic callbacksDetached_{false}; + /// The SDK MeterProvider that owns the export pipeline. std::shared_ptr provider_;