Merge branch 'pratik/otel-phase9-metric-gap-fill' into pratik/otel-phase10-workload-validation

2026-07-23 23:20:33 +00:00 · 2026-06-05 12:50:09 +01:00
parent 5b53ac99be f37a4a1022
commit db5b93e2c4
19 changed files with 761 additions and 426 deletions
--- a/docker/telemetry/grafana/dashboards/consensus-health.json
+++ b/docker/telemetry/grafana/dashboards/consensus-health.json
@@ -10,7 +10,7 @@
  "panels": [
    {
      "title": "Consensus Round Duration",
-      "description": "p95 and p50 duration of consensus accept rounds. The consensus.accept span (RCLConsensus.cpp) measures the time to process an accepted ledger including transaction application and state finalization. The span carries proposers and round_time_ms attributes. Normal range is 3-6 seconds on mainnet.",
+      "description": "p95 duration of consensus accept rounds. The consensus.accept span (RCLConsensus.cpp) measures the time to process an accepted ledger including transaction application and state finalization. The span carries proposers and round_time_ms attributes. Normal range is 3-6 seconds on mainnet.",
      "type": "timeseries",
      "gridPos": {
        "h": 8,
@@ -31,13 +31,6 @@
          },
          "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", consensus_mode=~\"$consensus_mode\", span_name=\"consensus.accept\"}[5m])))",
          "legendFormat": "P95 Round Duration [{{exported_instance}}]"
-        },
-        {
-          "datasource": {
-            "type": "prometheus"
-          },
-          "expr": "histogram_quantile(0.50, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", consensus_mode=~\"$consensus_mode\", span_name=\"consensus.accept\"}[5m])))",
-          "legendFormat": "P50 Round Duration [{{exported_instance}}]"
        }
      ],
      "fieldConfig": {
@@ -181,13 +174,6 @@
          },
          "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", consensus_mode=~\"$consensus_mode\", span_name=\"consensus.accept.apply\"}[5m])))",
          "legendFormat": "P95 Apply Duration [{{exported_instance}}]"
-        },
-        {
-          "datasource": {
-            "type": "prometheus"
-          },
-          "expr": "histogram_quantile(0.50, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", consensus_mode=~\"$consensus_mode\", span_name=\"consensus.accept.apply\"}[5m])))",
-          "legendFormat": "P50 Apply Duration [{{exported_instance}}]"
        }
      ],
      "fieldConfig": {
@@ -745,7 +731,7 @@
    },
    {
      "title": "Consensus Round Duration (Full Round)",
-      "description": "p95/p50 duration of the full consensus round. The consensus.round span (RCLConsensus.cpp startRound) wraps an entire round end-to-end. Filterable by consensus mode. This is the single most important consensus-health signal; rising round time precedes ledger-age alarms.",
+      "description": "p95 duration of the full consensus round. The consensus.round span (RCLConsensus.cpp startRound) wraps an entire round end-to-end. Filterable by consensus mode. This is the single most important consensus-health signal; rising round time precedes ledger-age alarms.",
      "type": "timeseries",
      "gridPos": {
        "h": 8,
@@ -766,13 +752,6 @@
          },
          "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", consensus_mode=~\"$consensus_mode\", span_name=\"consensus.round\"}[5m])))",
          "legendFormat": "P95 Round [{{exported_instance}}]"
-        },
-        {
-          "datasource": {
-            "type": "prometheus"
-          },
-          "expr": "histogram_quantile(0.50, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", consensus_mode=~\"$consensus_mode\", span_name=\"consensus.round\"}[5m])))",
-          "legendFormat": "P50 Round [{{exported_instance}}]"
        }
      ],
      "fieldConfig": {
@@ -837,7 +816,7 @@
    },
    {
      "title": "Position Update Duration",
-      "description": "p95/p50 duration of the consensus.update_positions span, which tallies disputes and updates this node's position each round. Long durations indicate heavy dispute resolution or slow convergence on close time.",
+      "description": "p95 duration of the consensus.update_positions span, which tallies disputes and updates this node's position each round. Long durations indicate heavy dispute resolution or slow convergence on close time.",
      "type": "timeseries",
      "gridPos": {
        "h": 8,
@@ -858,13 +837,6 @@
          },
          "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"consensus.update_positions\"}[5m])))",
          "legendFormat": "P95 Update [{{exported_instance}}]"
-        },
-        {
-          "datasource": {
-            "type": "prometheus"
-          },
-          "expr": "histogram_quantile(0.5, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"consensus.update_positions\"}[5m])))",
-          "legendFormat": "P50 Update [{{exported_instance}}]"
        }
      ],
      "fieldConfig": {
--- a/docker/telemetry/grafana/dashboards/ledger-operations.json
+++ b/docker/telemetry/grafana/dashboards/ledger-operations.json
@@ -42,7 +42,7 @@
    },
    {
      "title": "Ledger Build Duration",
-      "description": "p95 and p50 duration of ledger builds. Measures the full buildLedgerImpl() call including transaction application, SHAMap flushing, and ledger acceptance. The span records xrpl.ledger.seq as an attribute. Long build times indicate expensive transaction sets or I/O pressure from SHAMap flushes.",
+      "description": "p95 duration of ledger builds. Measures the full buildLedgerImpl() call including transaction application, SHAMap flushing, and ledger acceptance. The span records xrpl.ledger.seq as an attribute. Long build times indicate expensive transaction sets or I/O pressure from SHAMap flushes.",
      "type": "timeseries",
      "gridPos": {
        "h": 8,
@@ -63,13 +63,6 @@
          },
          "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"ledger.build\"}[5m])))",
          "legendFormat": "P95 Build Duration [{{exported_instance}}]"
-        },
-        {
-          "datasource": {
-            "type": "prometheus"
-          },
-          "expr": "histogram_quantile(0.50, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"ledger.build\"}[5m])))",
-          "legendFormat": "P50 Build Duration [{{exported_instance}}]"
        }
      ],
      "fieldConfig": {
@@ -156,7 +149,7 @@
    },
    {
      "title": "Transaction Apply Duration",
-      "description": "p95 and p50 duration of applying the consensus transaction set during ledger building. The tx.apply span (BuildLedger.cpp) wraps applyTransactions() which iterates through the CanonicalTXSet with multiple retry passes. Records tx_count (successful) and tx_failed (failed) as attributes.",
+      "description": "p95 duration of applying the consensus transaction set during ledger building. The tx.apply span (BuildLedger.cpp) wraps applyTransactions() which iterates through the CanonicalTXSet with multiple retry passes. Records tx_count (successful) and tx_failed (failed) as attributes.",
      "type": "timeseries",
      "gridPos": {
        "h": 8,
@@ -177,13 +170,6 @@
          },
          "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"tx.apply\"}[5m])))",
          "legendFormat": "P95 tx.apply [{{exported_instance}}]"
-        },
-        {
-          "datasource": {
-            "type": "prometheus"
-          },
-          "expr": "histogram_quantile(0.50, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"tx.apply\"}[5m])))",
-          "legendFormat": "P50 tx.apply [{{exported_instance}}]"
        }
      ],
      "fieldConfig": {
--- a/docker/telemetry/grafana/dashboards/system-node-health.json
+++ b/docker/telemetry/grafana/dashboards/system-node-health.json
@@ -243,7 +243,7 @@
    },
    {
      "title": "I/O Latency",
-      "description": "P95 and P50 of the I/O service loop latency in milliseconds. Sourced from the ios_latency event (Application.cpp) which measures how long it takes for the io_context to process a timer callback. Values above 10ms are logged; above 500ms trigger warnings. High values indicate thread pool saturation or blocking operations.",
+      "description": "P95 of the I/O service loop latency in milliseconds. Sourced from the ios_latency event (Application.cpp) which measures how long it takes for the io_context to process a timer callback. Values above 10ms are logged; above 500ms trigger warnings. High values indicate thread pool saturation or blocking operations.",
      "type": "timeseries",
      "gridPos": {
        "h": 8,
@@ -264,13 +264,6 @@
          },
          "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(xrpld_ios_latency_milliseconds_bucket{exported_instance=~\"$node\"}[5m])))",
          "legendFormat": "P95 I/O Latency [{{exported_instance}}]"
-        },
-        {
-          "datasource": {
-            "type": "prometheus"
-          },
-          "expr": "histogram_quantile(0.50, sum by (le, exported_instance) (rate(xrpld_ios_latency_milliseconds_bucket{exported_instance=~\"$node\"}[5m])))",
-          "legendFormat": "P50 I/O Latency [{{exported_instance}}]"
        }
      ],
      "fieldConfig": {
@@ -2054,7 +2047,7 @@
    },
    {
      "title": "Ledger Acquire Duration (Inbound Fetch)",
-      "description": "p95/p50 duration of the ledger.acquire span (InboundLedger): how long it takes to fetch a missing ledger from peers. A spike signals the node is falling behind or recovering from a fork. Populated under back-fill / sync activity.",
+      "description": "p95 duration of the ledger.acquire span (InboundLedger): how long it takes to fetch a missing ledger from peers. A spike signals the node is falling behind or recovering from a fork. Populated under back-fill / sync activity.",
      "type": "timeseries",
      "gridPos": {
        "h": 8,
@@ -2075,13 +2068,6 @@
          },
          "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"ledger.acquire\"}[5m])))",
          "legendFormat": "P95 Acquire [{{exported_instance}}]"
-        },
-        {
-          "datasource": {
-            "type": "prometheus"
-          },
-          "expr": "histogram_quantile(0.50, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"ledger.acquire\"}[5m])))",
-          "legendFormat": "P50 Acquire [{{exported_instance}}]"
        }
      ],
      "fieldConfig": {
--- a/docker/telemetry/grafana/dashboards/system-rpc-pathfinding.json
+++ b/docker/telemetry/grafana/dashboards/system-rpc-pathfinding.json
@@ -43,7 +43,7 @@
    },
    {
      "title": "RPC Response Time (System Metrics)",
-      "description": "P95 and P50 of RPC response time from the beast::insight timer. Sourced from the rpc.time event (ServerHandler.cpp) which records elapsed milliseconds for each RPC response. This measures the full HTTP handler time, not just command execution. Compare with span-based rpc.request duration.",
+      "description": "P95 of RPC response time from the beast::insight timer. Sourced from the rpc.time event (ServerHandler.cpp) which records elapsed milliseconds for each RPC response. This measures the full HTTP handler time, not just command execution. Compare with span-based rpc.request duration.",
      "type": "timeseries",
      "gridPos": {
        "h": 8,
@@ -64,13 +64,6 @@
          },
          "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(xrpld_rpc_time_milliseconds_bucket{exported_instance=~\"$node\"}[5m])))",
          "legendFormat": "P95 Response Time [{{exported_instance}}]"
-        },
-        {
-          "datasource": {
-            "type": "prometheus"
-          },
-          "expr": "histogram_quantile(0.5, sum by (le, exported_instance) (rate(xrpld_rpc_time_milliseconds_bucket{exported_instance=~\"$node\"}[5m])))",
-          "legendFormat": "P50 Response Time [{{exported_instance}}]"
        }
      ],
      "fieldConfig": {
@@ -89,7 +82,7 @@
    },
    {
      "title": "RPC Response Size",
-      "description": "P95 and P50 of RPC response payload size in bytes. Sourced from the rpc.size event (ServerHandler.cpp) which records the byte length of each RPC JSON response. Large responses may indicate expensive queries (e.g. account_tx with many results) or API misuse.",
+      "description": "P95 of RPC response payload size in bytes. Sourced from the rpc.size event (ServerHandler.cpp) which records the byte length of each RPC JSON response. Large responses may indicate expensive queries (e.g. account_tx with many results) or API misuse.",
      "type": "timeseries",
      "gridPos": {
        "h": 8,
@@ -110,13 +103,6 @@
          },
          "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(xrpld_rpc_size_milliseconds_bucket{exported_instance=~\"$node\"}[5m])))",
          "legendFormat": "P95 Response Size [{{exported_instance}}]"
-        },
-        {
-          "datasource": {
-            "type": "prometheus"
-          },
-          "expr": "histogram_quantile(0.5, sum by (le, exported_instance) (rate(xrpld_rpc_size_milliseconds_bucket{exported_instance=~\"$node\"}[5m])))",
-          "legendFormat": "P50 Response Size [{{exported_instance}}]"
        }
      ],
      "fieldConfig": {
@@ -135,7 +121,7 @@
    },
    {
      "title": "RPC Response Time Distribution",
-      "description": "Distribution of RPC response times from the beast::insight timer showing P50, P90, P95, and P99 quantiles. Sourced from the rpc.time event (ServerHandler.cpp). Useful for detecting bimodal latency or long-tail requests.",
+      "description": "Distribution of RPC response times from the beast::insight timer showing P90, P95, and P99 quantiles. Sourced from the rpc.time event (ServerHandler.cpp). Useful for detecting bimodal latency or long-tail requests.",
      "type": "timeseries",
      "gridPos": {
        "h": 8,
@@ -150,13 +136,6 @@
        }
      },
      "targets": [
-        {
-          "datasource": {
-            "type": "prometheus"
-          },
-          "expr": "histogram_quantile(0.5, sum by (le, exported_instance) (rate(xrpld_rpc_time_milliseconds_bucket{exported_instance=~\"$node\"}[5m])))",
-          "legendFormat": "P50 [{{exported_instance}}]"
-        },
        {
          "datasource": {
            "type": "prometheus"
@@ -195,7 +174,7 @@
    },
    {
      "title": "Pathfinding Fast Duration",
-      "description": "P95 and P50 of fast pathfinding execution time. Sourced from the pathfind_fast event (PathRequests.h) which records the duration of the fast pathfinding algorithm. Fast pathfinding uses a simplified search that trades accuracy for speed.",
+      "description": "P95 of fast pathfinding execution time. Sourced from the pathfind_fast event (PathRequests.h) which records the duration of the fast pathfinding algorithm. Fast pathfinding uses a simplified search that trades accuracy for speed.",
      "type": "timeseries",
      "gridPos": {
        "h": 8,
@@ -216,13 +195,6 @@
          },
          "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(xrpld_pathfind_fast_milliseconds_bucket{exported_instance=~\"$node\"}[5m])))",
          "legendFormat": "P95 Fast Pathfind [{{exported_instance}}]"
-        },
-        {
-          "datasource": {
-            "type": "prometheus"
-          },
-          "expr": "histogram_quantile(0.5, sum by (le, exported_instance) (rate(xrpld_pathfind_fast_milliseconds_bucket{exported_instance=~\"$node\"}[5m])))",
-          "legendFormat": "P50 Fast Pathfind [{{exported_instance}}]"
        }
      ],
      "fieldConfig": {
@@ -241,7 +213,7 @@
    },
    {
      "title": "Pathfinding Full Duration",
-      "description": "P95 and P50 of full pathfinding execution time. Sourced from the pathfind_full event (PathRequests.h) which records the duration of the exhaustive pathfinding search. Full pathfinding is more expensive and can take significantly longer than fast mode.",
+      "description": "P95 of full pathfinding execution time. Sourced from the pathfind_full event (PathRequests.h) which records the duration of the exhaustive pathfinding search. Full pathfinding is more expensive and can take significantly longer than fast mode.",
      "type": "timeseries",
      "gridPos": {
        "h": 8,
@@ -262,13 +234,6 @@
          },
          "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(xrpld_pathfind_full_milliseconds_bucket{exported_instance=~\"$node\"}[5m])))",
          "legendFormat": "P95 Full Pathfind [{{exported_instance}}]"
-        },
-        {
-          "datasource": {
-            "type": "prometheus"
-          },
-          "expr": "histogram_quantile(0.5, sum by (le, exported_instance) (rate(xrpld_pathfind_full_milliseconds_bucket{exported_instance=~\"$node\"}[5m])))",
-          "legendFormat": "P50 Full Pathfind [{{exported_instance}}]"
        }
      ],
      "fieldConfig": {
@@ -500,7 +465,7 @@
    },
    {
      "title": "Pathfinding Compute Duration (Spans)",
-      "description": "p95/p50 of the pathfind.compute span, the per-request path computation. Complements the StatsD pathfind_fast/full timers with span-level visibility. Populated under pathfinding (book/path) RPC load.",
+      "description": "p95 of the pathfind.compute span, the per-request path computation. Complements the StatsD pathfind_fast/full timers with span-level visibility. Populated under pathfinding (book/path) RPC load.",
      "type": "timeseries",
      "gridPos": {
        "h": 8,
@@ -521,13 +486,6 @@
          },
          "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"pathfind.compute\"}[5m])))",
          "legendFormat": "P95 Compute [{{exported_instance}}]"
-        },
-        {
-          "datasource": {
-            "type": "prometheus"
-          },
-          "expr": "histogram_quantile(0.50, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"pathfind.compute\"}[5m])))",
-          "legendFormat": "P50 Compute [{{exported_instance}}]"
        }
      ],
      "fieldConfig": {
--- a/docker/telemetry/grafana/dashboards/transaction-overview.json
+++ b/docker/telemetry/grafana/dashboards/transaction-overview.json
@@ -56,7 +56,7 @@
    },
    {
      "title": "Transaction Processing Latency by Type",
-      "description": "Per-transaction-type processing latency (p95 and p50). Filter with $tx_type variable above.",
+      "description": "Per-transaction-type processing latency (p95). Filter with $tx_type variable above.",
      "type": "timeseries",
      "gridPos": {
        "h": 8,
@@ -82,13 +82,6 @@
          },
          "expr": "histogram_quantile(0.95, sum by (le, tx_type, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"tx.process\", tx_type=~\"$tx_type\"}[5m])))",
          "legendFormat": "P95 {{tx_type}} [{{exported_instance}}]"
-        },
-        {
-          "datasource": {
-            "type": "prometheus"
-          },
-          "expr": "histogram_quantile(0.50, sum by (le, tx_type, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"tx.process\", tx_type=~\"$tx_type\"}[5m])))",
-          "legendFormat": "P50 {{tx_type}} [{{exported_instance}}]"
        }
      ],
      "fieldConfig": {
@@ -208,7 +201,7 @@
    },
    {
      "title": "Transaction Apply Duration per Ledger",
-      "description": "p95 and p50 latency of applying the consensus transaction set to a new ledger. The tx.apply span (BuildLedger.cpp) wraps the applyTransactions() function that iterates through the CanonicalTXSet and applies each transaction to the OpenView. Long durations indicate heavy transaction sets or expensive transaction processing.",
+      "description": "p95 latency of applying the consensus transaction set to a new ledger. The tx.apply span (BuildLedger.cpp) wraps the applyTransactions() function that iterates through the CanonicalTXSet and applies each transaction to the OpenView. Long durations indicate heavy transaction sets or expensive transaction processing.",
      "type": "timeseries",
      "gridPos": {
        "h": 8,
@@ -229,13 +222,6 @@
          },
          "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"tx.apply\"}[5m])))",
          "legendFormat": "P95 tx.apply [{{exported_instance}}]"
-        },
-        {
-          "datasource": {
-            "type": "prometheus"
-          },
-          "expr": "histogram_quantile(0.50, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"tx.apply\"}[5m])))",
-          "legendFormat": "P50 tx.apply [{{exported_instance}}]"
        }
      ],
      "fieldConfig": {
@@ -587,7 +573,7 @@
    },
    {
      "title": "Queue Accept (Drain) Duration per Ledger",
-      "description": "p95/p50 duration of the txq.accept span, which drains queued transactions into a newly closed ledger. Rising drain time signals queue pressure at ledger close.",
+      "description": "p95 duration of the txq.accept span, which drains queued transactions into a newly closed ledger. Rising drain time signals queue pressure at ledger close.",
      "type": "timeseries",
      "gridPos": {
        "h": 8,
@@ -608,13 +594,6 @@
          },
          "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"txq.accept\"}[5m])))",
          "legendFormat": "P95 Drain [{{exported_instance}}]"
-        },
-        {
-          "datasource": {
-            "type": "prometheus"
-          },
-          "expr": "histogram_quantile(0.50, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"txq.accept\"}[5m])))",
-          "legendFormat": "P50 Drain [{{exported_instance}}]"
        }
      ],
      "fieldConfig": {
@@ -669,6 +648,138 @@
        },
        "overrides": []
      }
+    },
+    {
+      "title": "Tx Apply Pipeline Rate by Stage",
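+      "description": "Span rate for each apply-pipeline stage (preflight, preclaim, apply), taken from the tx.preflight, tx.preclaim, and tx.transactor spans. A drop in rate from one stage to the next shows where transactions are filtered out. Requires stage to be configured as a spanmetrics dimension. Filter with the Apply Stage variable.",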
+      "type": "timeseries",
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 0,
+        "y": 64
+      },
+      "options": {
+        "tooltip": {
+          "mode": "multi",
+          "sort": "desc"
+        },
+        "legend": {
+          "displayMode": "table",
+          "placement": "right",
+          "calcs": ["mean", "max"]
+        }
+      },
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus"
+          },
+          "expr": "sum by (stage, exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=~\"tx.preflight|tx.preclaim|tx.transactor\", stage=~\"$stage\"}[5m]))",
+          "legendFormat": "{{stage}} [{{exported_instance}}]"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "ops",
+          "custom": {
+            "axisLabel": "Spans / Sec",
+            "spanNulls": true,
+            "insertNulls": false,
+            "showPoints": "auto",
+            "pointSize": 3
+          }
+        },
+        "overrides": []
+      }
+    },
+    {
+      "title": "Tx Apply Pipeline Latency by Stage (p95)",
+      "description": "95th-percentile duration of each apply-pipeline stage. Isolates which stage (preflight, preclaim, apply) dominates transaction processing time.",
+      "type": "timeseries",
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 12,
+        "y": 64
+      },
+      "options": {
+        "tooltip": {
+          "mode": "multi",
+          "sort": "desc"
+        },
+        "legend": {
+          "displayMode": "table",
+          "placement": "right",
+          "calcs": ["mean", "max"]
+        }
+      },
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus"
+          },
+          "expr": "histogram_quantile(0.95, sum by (le, stage, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=~\"tx.preflight|tx.preclaim|tx.transactor\", stage=~\"$stage\"}[5m])))",
+          "legendFormat": "P95 {{stage}} [{{exported_instance}}]"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "ms",
+          "custom": {
+            "axisLabel": "Duration (ms)",
+            "spanNulls": true,
+            "insertNulls": false,
+            "showPoints": "auto",
+            "pointSize": 3
+          }
+        },
+        "overrides": []
+      }
+    },
+    {
+      "title": "Tx Apply Pipeline Failure Rate by Stage",
+      "description": "Rate of apply-pipeline spans that carry a ter_result other than tesSUCCESS, split by stage. Spans with no ter_result label are excluded. Shows whether failures concentrate in preflight, preclaim, or apply. Filters on ter_result rather than span status because a failing ter code completes the span normally; only thrown exceptions set an error status.",
+      "type": "timeseries",
+      "gridPos": {
+        "h": 8,
+        "w": 24,
+        "x": 0,
+        "y": 72
+      },
+      "options": {
+        "tooltip": {
+          "mode": "multi",
+          "sort": "desc"
+        },
+        "legend": {
+          "displayMode": "table",
+          "placement": "right",
+          "calcs": ["mean", "max"]
+        }
+      },
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus"
+          },
+          "expr": "sum by (stage, exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=~\"tx.preflight|tx.preclaim|tx.transactor\", stage=~\"$stage\", ter_result!~\"tesSUCCESS|\"}[5m]))",
+          "legendFormat": "{{stage}} [{{exported_instance}}]"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "ops",
+          "custom": {
+            "axisLabel": "Failed Spans / Sec",
+            "spanNulls": true,
+            "insertNulls": false,
+            "showPoints": "auto",
+            "pointSize": 3
+          }
+        },
+        "overrides": []
+      }
    }
  ],
  "schemaVersion": 39,
@@ -768,6 +879,24 @@
        },
        "sort": 1,
        "label": "Queue Status"
+      },
+      {
+        "name": "stage",
+        "type": "query",
+        "datasource": {
+          "type": "prometheus"
+        },
+        "query": "label_values(traces_span_metrics_calls_total{span_name=~\"tx.preflight|tx.preclaim|tx.transactor\", stage!=\"\"}, stage)",
+        "refresh": 2,
+        "includeAll": true,
+        "multi": true,
+        "allValue": ".*",
+        "current": {
+          "text": "All",
+          "value": "$__all"
+        },
+        "sort": 1,
+        "label": "Apply Stage"
      }
    ]
  },
--- a/docker/telemetry/grafana/provisioning/datasources/tempo.yaml
+++ b/docker/telemetry/grafana/provisioning/datasources/tempo.yaml
@@ -3,12 +3,10 @@
 # Access Grafana at http://localhost:3000, then use Explore -> Tempo
 # to browse xrpld traces using TraceQL.
 #
-# Search filters provide pre-configured dropdowns in the Explore UI.
-# Each phase adds filters for the span attributes it introduces.
-# Phase 1b (infra): Base filters — node identity, service, span name, status.
-# Phase 2 (RPC):    RPC command, status, role filters.
-# Phase 3 (TX):     Transaction hash, local/peer origin, status.
-# Phase 4 (Cons):   Consensus mode, round, ledger sequence, close time.
+# Search filters provide quick-start dropdowns in the Explore UI for the most
+# common investigation entry points. This is not an exhaustive attribute list —
+# use TraceQL autocomplete or see OpenTelemetryPlan/09-data-collection-reference.md §4
+# for the full attribute inventory and example queries.

 apiVersion: 1

@@ -40,177 +38,45 @@ datasources:
        spanEndTimeShift: "1h"
      search:
        filters:
-          # --- Node identification filters ---
-          # service.name: logical service name (default: "xrpld").
-          #   Useful when running multiple service types in the same collector.
-          - id: service-name
-            tag: service.name
-            operator: "="
-            scope: resource
-            type: static
-          # service.instance.id: unique node identifier — configurable via
-          #   the service_instance_id setting in [telemetry], defaults to the
-          #   node's public key. E.g. "Node-1" or "nHB1X37...".
+          # service.instance.id: unique node identifier — the node's public key by default, or the service_instance_id value from [telemetry].
          - id: node-id
            tag: service.instance.id
            operator: "="
            scope: resource
            type: static
-          # service.version: xrpld build version (e.g., "2.4.0-b1").
-          #   Filter traces from specific software releases.
-          - id: node-version
-            tag: service.version
-            operator: "="
-            scope: resource
-            type: dynamic
-          # xrpl.network.id: numeric network identifier
-          #   (0 = mainnet, 1 = testnet, 2 = devnet, etc.).
-          #   Derived from the [network_id] config section.
-          - id: network-id
-            tag: xrpl.network.id
-            operator: "="
-            scope: resource
-            type: dynamic
-          # xrpl.network.type: human-readable network name derived from
-          #   network ID ("mainnet", "testnet", "devnet", "unknown").
-          - id: network-type
-            tag: xrpl.network.type
-            operator: "="
-            scope: resource
-            type: static
-          # --- Span intrinsic filters ---
-          # name: the span operation name (e.g., "rpc.command.server_info").
-          #   Use to find traces for a specific RPC command or subsystem.
+          # name: span operation name (e.g., "rpc.command.server_info").
          - id: span-name
            tag: name
            operator: "="
            scope: intrinsic
            type: static
          # status: span completion status ("ok", "error", "unset").
-          #   Filter for failed operations to diagnose errors.
          - id: span-status
            tag: status
            operator: "="
            scope: intrinsic
            type: static
-          # duration: span wall-clock duration. Use with ">" operator
-          #   to find slow operations (e.g., duration > 500ms).
-          - id: span-duration
-            tag: duration
-            operator: ">"
-            scope: intrinsic
-            type: static
-          # Phase 2: RPC tracing filters
+          # command: RPC command name (e.g., "server_info", "submit").
          - id: rpc-command
            tag: command
            operator: "="
            scope: span
            type: static
-          - id: rpc-status
-            tag: rpc_status
-            operator: "="
-            scope: span
-            type: dynamic
-          - id: rpc-role
-            tag: rpc_role
-            operator: "="
-            scope: span
-            type: dynamic
-          # Phase 3: Transaction tracing filters
+          # tx_hash: transaction hash — direct lookup for a known transaction.
          - id: tx-hash
            tag: tx_hash
            operator: "="
            scope: span
            type: static
-          - id: tx-origin
-            tag: local
-            operator: "="
-            scope: span
-            type: dynamic
-          - id: tx-status
-            tag: tx_status
-            operator: "="
-            scope: span
-            type: dynamic
-          # Phase 4: Consensus tracing filters
-          - id: consensus-mode
-            tag: xrpl.consensus.mode
+          # tx_type: transaction type (e.g., "Payment", "OfferCreate").
+          - id: tx-type
+            tag: tx_type
            operator: "="
            scope: span
            type: static
-          - id: consensus-round
-            tag: xrpl.consensus.round
-            operator: "="
-            scope: span
-            type: dynamic
-          - id: consensus-ledger-seq
-            tag: xrpl.ledger.seq
+          # ledger_hash: ledger hash — scope all spans to a specific closed ledger.
+          - id: ledger-hash
+            tag: ledger_hash
            operator: "="
            scope: span
            type: static
-          - id: consensus-close-time-correct
-            tag: close_time_correct
-            operator: "="
-            scope: span
-            type: dynamic
-          - id: consensus-state
-            tag: consensus_state
-            operator: "="
-            scope: span
-            type: dynamic
-          - id: consensus-close-resolution
-            tag: close_resolution_ms
-            operator: "="
-            scope: span
-            type: dynamic
-          - id: consensus-proposers
-            tag: proposers
-            operator: "="
-            scope: span
-            type: dynamic
-          - id: consensus-result
-            tag: consensus_result
-            operator: "="
-            scope: span
-            type: dynamic
-          - id: consensus-mode-old
-            tag: mode_old
-            operator: "="
-            scope: span
-            type: dynamic
-          - id: consensus-mode-new
-            tag: mode_new
-            operator: "="
-            scope: span
-            type: dynamic
-          - id: consensus-ledger-id
-            tag: xrpl.consensus.ledger_id
-            operator: "="
-            scope: span
-            type: static
-          # Phase 3/4: Additional transaction and queue filters
-          - id: tx-path
-            tag: path
-            operator: "="
-            scope: span
-            type: dynamic
-          - id: tx-suppressed
-            tag: suppressed
-            operator: "="
-            scope: span
-            type: dynamic
-          - id: peer-version
-            tag: peer_version
-            operator: "="
-            scope: span
-            type: dynamic
-          - id: txq-status
-            tag: txq_status
-            operator: "="
-            scope: span
-            type: dynamic
-          - id: txq-ter-code
-            tag: ter_code
-            operator: "="
-            scope: span
-            type: dynamic
--- a/docker/telemetry/otel-collector-config.yaml
+++ b/docker/telemetry/otel-collector-config.yaml
@@ -94,6 +94,9 @@ connectors:
      - name: validation_trusted
      - name: tx_type
      - name: ter_result
+      # Apply-pipeline stage (preflight|preclaim|apply) — breaks down the RED
+      # metrics of the tx.preflight/tx.preclaim/tx.transactor spans by stage.
+      - name: stage
      - name: txq_status
      - name: load_type
      - name: is_batch