Merge branch 'pratik/otel-phase9-metric-gap-fill' into pratik/otel-phase10-workload-validation

2026-07-24 15:40:26 +00:00 · 2026-06-05 12:50:09 +01:00
parent 5b53ac99be f37a4a1022
commit db5b93e2c4
19 changed files with 761 additions and 426 deletions
--- a/.github/scripts/levelization/results/loops.txt
+++ b/.github/scripts/levelization/results/loops.txt
@@ -20,7 +20,7 @@ Loop: xrpld.app xrpld.shamap
  xrpld.shamap > xrpld.app

 Loop: xrpld.app xrpld.telemetry
-  xrpld.telemetry == xrpld.app
+  xrpld.telemetry ~= xrpld.app

 Loop: xrpld.overlay xrpld.rpc
  xrpld.rpc ~= xrpld.overlay
--- a/.github/scripts/levelization/results/ordering.txt
+++ b/.github/scripts/levelization/results/ordering.txt
@@ -242,6 +242,7 @@ xrpl.tx > xrpl.basics
 xrpl.tx > xrpl.core
 xrpl.tx > xrpl.ledger
 xrpl.tx > xrpl.protocol
+xrpl.tx > xrpl.telemetry
 xrpld.app > test.unit_test
 xrpld.app > xrpl.basics
 xrpld.app > xrpl.core
--- a/OpenTelemetryPlan/09-data-collection-reference.md
+++ b/OpenTelemetryPlan/09-data-collection-reference.md
@@ -101,13 +101,23 @@ Controlled by `trace_rpc=1` in `[telemetry]` config.

 Controlled by `trace_transactions=1` in `[telemetry]` config.

-| Span Name    | Parent         | Source File     | Description                                                       |
-| ------------ | -------------- | --------------- | ----------------------------------------------------------------- |
-| `tx.process` | —              | NetworkOPs.cpp  | Transaction submission entry point (local or peer-relayed)        |
-| `tx.receive` | —              | PeerImp.cpp     | Raw transaction received from peer overlay (before deduplication) |
-| `tx.apply`   | `ledger.build` | BuildLedger.cpp | Transaction set applied to new ledger during consensus            |
+| Span Name       | Parent         | Source File     | Description                                                       |
+| --------------- | -------------- | --------------- | ----------------------------------------------------------------- |
+| `tx.process`    | —              | NetworkOPs.cpp  | Transaction submission entry point (local or peer-relayed)        |
+| `tx.receive`    | —              | PeerImp.cpp     | Raw transaction received from peer overlay (before deduplication) |
+| `tx.apply`      | `ledger.build` | BuildLedger.cpp | Transaction set applied to new ledger during consensus            |
+| `tx.preflight`  | —              | applySteps.cpp  | Stateless checks stage (`stage=preflight`)                        |
+| `tx.preclaim`   | —              | applySteps.cpp  | Ledger-aware checks stage before fee claim (`stage=preclaim`)     |
+| `tx.transactor` | —              | Transactor.cpp  | Apply stage — the transactor runs (`stage=apply`)                 |
+
+The three apply-pipeline spans share a deterministic `trace_id` derived from
+`txID[0:16]`, so the preflight, preclaim, and transactor spans of a single
+transaction are grouped under one trace even though they run sequentially and
+often on different threads. A transaction that hard-fails preflight or preclaim
+never reaches the later spans — the `stage` attribute identifies where it stopped.

 **Where to find**: Tempo → TraceQL: `{resource.service.name="xrpld" && name=~"tx.process|tx.receive"}`
+or, for the apply pipeline: `{resource.service.name="xrpld" && name=~"tx.preflight|tx.preclaim|tx.transactor"}`

 **Grafana dashboard**: _Transaction Overview_ (`xrpld-transactions`)

@@ -179,13 +189,19 @@ Every span can carry key-value attributes that provide context for filtering and

 #### Transaction Attributes

-| Attribute            | Type    | Set On                     | Description                                          |
-| -------------------- | ------- | -------------------------- | ---------------------------------------------------- |
-| `xrpl.tx.hash`       | string  | `tx.process`, `tx.receive` | Transaction hash (hex-encoded)                       |
-| `xrpl.tx.local`      | boolean | `tx.process`               | `true` if locally submitted, `false` if peer-relayed |
-| `xrpl.tx.path`       | string  | `tx.process`               | Submission path: `"sync"` or `"async"`               |
-| `xrpl.tx.suppressed` | boolean | `tx.receive`               | `true` if transaction was suppressed (duplicate)     |
-| `xrpl.tx.status`     | string  | `tx.receive`               | Transaction status (e.g., `"known_bad"`)             |
+| Attribute           | Type    | Set On                                         | Description                                                           |
+| ------------------- | ------- | ---------------------------------------------- | --------------------------------------------------------------------- |
+| `xrpl.tx.hash`      | string  | `tx.process`, `tx.receive`                     | Transaction hash (hex-encoded)                                        |
+| `local`             | boolean | `tx.process`                                   | `true` if locally submitted, `false` if peer-relayed                  |
+| `path`              | string  | `tx.process`                                   | Submission path: `"sync"` or `"async"`                                |
+| `suppressed`        | boolean | `tx.receive`                                   | `true` if transaction was suppressed (duplicate)                      |
+| `tx_status`         | string  | `tx.receive`                                   | Transaction status (e.g., `"known_bad"`)                              |
+| `xrpl.peer.id`      | int64   | `tx.receive`                                   | Peer identifier (also set on peer spans)                              |
+| `xrpl.peer.version` | string  | `tx.receive`                                   | Peer protocol version string                                          |
+| `stage`             | string  | `tx.preflight`, `tx.preclaim`, `tx.transactor` | Apply-pipeline stage: `preflight`, `preclaim`, or `apply`             |
+| `tx_type`           | string  | `tx.preflight`, `tx.preclaim`, `tx.transactor` | Transaction type name (e.g., `Payment`)                               |
+| `ter_result`        | string  | `tx.preflight`, `tx.preclaim`, `tx.transactor` | Engine result token for that stage (e.g., `tesSUCCESS`, `terPRE_SEQ`) |
+| `applied`           | boolean | `tx.transactor`                                | `true` if the transaction was applied to the ledger                   |

 **Tempo query**: `{span.xrpl.tx.hash="<hash>"}` to trace a specific transaction across nodes.

@@ -250,14 +266,25 @@ The OTel Collector's SpanMetrics connector automatically generates RED (Rate, Er

 **Additional dimension labels** (configured in `otel-collector-config.yaml`):

-| Span Attribute                 | Prometheus Label               | Applies To                |
-| ------------------------------ | ------------------------------ | ------------------------- |
-| `command`                      | `xrpl_rpc_command`             | `rpc.command.*`           |
-| `rpc_status`                   | `xrpl_rpc_status`              | `rpc.command.*`           |
-| `xrpl.consensus.mode`          | `xrpl_consensus_mode`          | `consensus.ledger_close`  |
-| `xrpl.tx.local`                | `xrpl_tx_local`                | `tx.process`              |
-| `xrpl.peer.proposal.trusted`   | `xrpl_peer_proposal_trusted`   | `peer.proposal.receive`   |
-| `xrpl.peer.validation.trusted` | `xrpl_peer_validation_trusted` | `peer.validation.receive` |
+| Span Attribute        | Prometheus Label               | Applies To                                     |
+| --------------------- | ------------------------------ | ---------------------------------------------- |
+| `command`             | `xrpl_rpc_command`             | `rpc.command.*`                                |
+| `rpc_status`          | `xrpl_rpc_status`              | `rpc.command.*`                                |
+| `xrpl.consensus.mode` | `xrpl_consensus_mode`          | `consensus.ledger_close`                       |
+| `local`               | `xrpl_tx_local`                | `tx.process`                                   |
+| `proposal_trusted`    | `xrpl_peer_proposal_trusted`   | `peer.proposal.receive`                        |
+| `validation_trusted`  | `xrpl_peer_validation_trusted` | `peer.validation.receive`                      |
+| `stage`               | `stage`                        | `tx.preflight`, `tx.preclaim`, `tx.transactor` |
+
+The `stage` dimension (3 values: `preflight`, `preclaim`, `apply`) turns the
+apply-pipeline spans into per-stage RED metrics without requiring any native
+instruments — the _Transaction Overview_ dashboard charts rate, p95 latency, and failure rate by stage.
+
+> **Sampling caveat**: span-derived metrics inherit the **tracer head-sampling**
+> ratio (`sampling_ratio` in `[telemetry]`, via `TraceIdRatioBasedSampler`). At
+> `sampling_ratio < 1.0` the stage RED metrics undercount proportionally — they
+> reflect sampled traces, not the full transaction volume. Native StatsD/meter
+> metrics are not sampled. Account for this when reading absolute stage rates.

 **Where to query**: Prometheus → `traces_span_metrics_calls_total{span_name="rpc.command.server_info"}`

--- a/OpenTelemetryPlan/Phase3_taskList.md
+++ b/OpenTelemetryPlan/Phase3_taskList.md
@@ -474,17 +474,22 @@ This gives the best of both worlds: guaranteed cross-node correlation via determ

 **Attributes added**:

-| Span            | Attribute        | Type   | Source                                                              |
-| --------------- | ---------------- | ------ | ------------------------------------------------------------------- |
-| `tx.process`    | `tx_type`        | string | `TxFormats::getInstance().findByType(stx->getTxnType())->getName()` |
-| `tx.process`    | `fee`            | int64  | `stx->getFieldAmount(sfFee).xrp().drops()`                          |
-| `tx.process`    | `sequence`       | int64  | `stx->getSeqProxy().value()`                                        |
-| `tx.process`    | `ter_result`     | string | `transToken(e.result)` (set after batch application)                |
-| `tx.process`    | `applied`        | bool   | `e.applied` (set after batch application)                           |
-| `tx.receive`    | `tx_type`        | string | `TxFormats::getInstance().findByType(stx->getTxnType())->getName()` |
-| `txq.enqueue`   | `tx_type`        | string | same pattern as above                                               |
-| `txq.accept.tx` | `txq_status`     | string | `applied` / `failed` / `retried`                                    |
-| `txq.accept`    | `ledger_changed` | bool   | set at end of accept loop                                           |
+| Span              | Attribute            | Type   | Source                                                              |
+| ----------------- | -------------------- | ------ | ------------------------------------------------------------------- |
+| `tx.process`      | `tx_type`            | string | `TxFormats::getInstance().findByType(stx->getTxnType())->getName()` |
+| `tx.process`      | `fee`                | int64  | `stx->getFieldAmount(sfFee).xrp().drops()`                          |
+| `tx.process`      | `sequence`           | int64  | `stx->getSeqProxy().value()`                                        |
+| `tx.process`      | `ter_result`         | string | `transToken(e.result)` (set after batch application)                |
+| `tx.process`      | `applied`            | bool   | `e.applied` (set after batch application)                           |
+| `tx.receive`      | `tx_type`            | string | `TxFormats::getInstance().findByType(stx->getTxnType())->getName()` |
+| `txq.enqueue`     | `tx_type`            | string | same pattern as above                                               |
+| `txq.enqueue`     | `txq_status`         | string | `queued` / `applied_direct` / `applied` / `rejected`                |
+| `txq.enqueue`     | `fee_level_paid`     | int64  | `getFeeLevelPaid(view, *tx).value()`                                |
+| `txq.enqueue`     | `required_fee_level` | int64  | `getRequiredFeeLevel(...).value()`                                  |
+| `txq.batch_clear` | `num_cleared`        | int64  | queued txs cleared ahead of the applying tx                         |
+| `txq.cleanup`     | `expired_count`      | int64  | entries dropped for passed `LastLedgerSequence`                     |
+| `txq.accept.tx`   | `txq_status`         | string | `applied` / `failed` / `retried`                                    |
+| `txq.accept`      | `ledger_changed`     | bool   | set at end of accept loop                                           |

 **New attr keys**: `TxSpanNames.h` (`txType`, `fee`, `sequence`, `terResult`, `applied`), `TxQSpanNames.h` (`txType`).

--- a/docker/telemetry/grafana/dashboards/consensus-health.json
+++ b/docker/telemetry/grafana/dashboards/consensus-health.json
@@ -10,7 +10,7 @@
  "panels": [
    {
      "title": "Consensus Round Duration",
-      "description": "p95 and p50 duration of consensus accept rounds. The consensus.accept span (RCLConsensus.cpp) measures the time to process an accepted ledger including transaction application and state finalization. The span carries proposers and round_time_ms attributes. Normal range is 3-6 seconds on mainnet.",
+      "description": "p95 duration of consensus accept rounds. The consensus.accept span (RCLConsensus.cpp) measures the time to process an accepted ledger including transaction application and state finalization. The span carries proposers and round_time_ms attributes. Normal range is 3-6 seconds on mainnet.",
      "type": "timeseries",
      "gridPos": {
        "h": 8,
@@ -31,13 +31,6 @@
          },
          "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", consensus_mode=~\"$consensus_mode\", span_name=\"consensus.accept\"}[5m])))",
          "legendFormat": "P95 Round Duration [{{exported_instance}}]"
-        },
-        {
-          "datasource": {
-            "type": "prometheus"
-          },
-          "expr": "histogram_quantile(0.50, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", consensus_mode=~\"$consensus_mode\", span_name=\"consensus.accept\"}[5m])))",
-          "legendFormat": "P50 Round Duration [{{exported_instance}}]"
        }
      ],
      "fieldConfig": {
@@ -181,13 +174,6 @@
          },
          "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", consensus_mode=~\"$consensus_mode\", span_name=\"consensus.accept.apply\"}[5m])))",
          "legendFormat": "P95 Apply Duration [{{exported_instance}}]"
-        },
-        {
-          "datasource": {
-            "type": "prometheus"
-          },
-          "expr": "histogram_quantile(0.50, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", consensus_mode=~\"$consensus_mode\", span_name=\"consensus.accept.apply\"}[5m])))",
-          "legendFormat": "P50 Apply Duration [{{exported_instance}}]"
        }
      ],
      "fieldConfig": {
@@ -745,7 +731,7 @@
    },
    {
      "title": "Consensus Round Duration (Full Round)",
-      "description": "p95/p50 duration of the full consensus round. The consensus.round span (RCLConsensus.cpp startRound) wraps an entire round end-to-end. Filterable by consensus mode. This is the single most important consensus-health signal; rising round time precedes ledger-age alarms.",
+      "description": "p95 duration of the full consensus round. The consensus.round span (RCLConsensus.cpp startRound) wraps an entire round end-to-end. Filterable by consensus mode. This is the single most important consensus-health signal; rising round time precedes ledger-age alarms.",
      "type": "timeseries",
      "gridPos": {
        "h": 8,
@@ -766,13 +752,6 @@
          },
          "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", consensus_mode=~\"$consensus_mode\", span_name=\"consensus.round\"}[5m])))",
          "legendFormat": "P95 Round [{{exported_instance}}]"
-        },
-        {
-          "datasource": {
-            "type": "prometheus"
-          },
-          "expr": "histogram_quantile(0.50, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", consensus_mode=~\"$consensus_mode\", span_name=\"consensus.round\"}[5m])))",
-          "legendFormat": "P50 Round [{{exported_instance}}]"
        }
      ],
      "fieldConfig": {
@@ -837,7 +816,7 @@
    },
    {
      "title": "Position Update Duration",
-      "description": "p95/p50 duration of the consensus.update_positions span, which tallies disputes and updates this node's position each round. Long durations indicate heavy dispute resolution or slow convergence on close time.",
+      "description": "p95 duration of the consensus.update_positions span, which tallies disputes and updates this node's position each round. Long durations indicate heavy dispute resolution or slow convergence on close time.",
      "type": "timeseries",
      "gridPos": {
        "h": 8,
@@ -858,13 +837,6 @@
          },
          "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"consensus.update_positions\"}[5m])))",
          "legendFormat": "P95 Update [{{exported_instance}}]"
-        },
-        {
-          "datasource": {
-            "type": "prometheus"
-          },
-          "expr": "histogram_quantile(0.5, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"consensus.update_positions\"}[5m])))",
-          "legendFormat": "P50 Update [{{exported_instance}}]"
        }
      ],
      "fieldConfig": {
--- a/docker/telemetry/grafana/dashboards/ledger-operations.json
+++ b/docker/telemetry/grafana/dashboards/ledger-operations.json
@@ -42,7 +42,7 @@
    },
    {
      "title": "Ledger Build Duration",
-      "description": "p95 and p50 duration of ledger builds. Measures the full buildLedgerImpl() call including transaction application, SHAMap flushing, and ledger acceptance. The span records xrpl.ledger.seq as an attribute. Long build times indicate expensive transaction sets or I/O pressure from SHAMap flushes.",
+      "description": "p95 duration of ledger builds. Measures the full buildLedgerImpl() call including transaction application, SHAMap flushing, and ledger acceptance. The span records xrpl.ledger.seq as an attribute. Long build times indicate expensive transaction sets or I/O pressure from SHAMap flushes.",
      "type": "timeseries",
      "gridPos": {
        "h": 8,
@@ -63,13 +63,6 @@
          },
          "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"ledger.build\"}[5m])))",
          "legendFormat": "P95 Build Duration [{{exported_instance}}]"
-        },
-        {
-          "datasource": {
-            "type": "prometheus"
-          },
-          "expr": "histogram_quantile(0.50, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"ledger.build\"}[5m])))",
-          "legendFormat": "P50 Build Duration [{{exported_instance}}]"
        }
      ],
      "fieldConfig": {
@@ -156,7 +149,7 @@
    },
    {
      "title": "Transaction Apply Duration",
-      "description": "p95 and p50 duration of applying the consensus transaction set during ledger building. The tx.apply span (BuildLedger.cpp) wraps applyTransactions() which iterates through the CanonicalTXSet with multiple retry passes. Records tx_count (successful) and tx_failed (failed) as attributes.",
+      "description": "p95 duration of applying the consensus transaction set during ledger building. The tx.apply span (BuildLedger.cpp) wraps applyTransactions() which iterates through the CanonicalTXSet with multiple retry passes. Records tx_count (successful) and tx_failed (failed) as attributes.",
      "type": "timeseries",
      "gridPos": {
        "h": 8,
@@ -177,13 +170,6 @@
          },
          "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"tx.apply\"}[5m])))",
          "legendFormat": "P95 tx.apply [{{exported_instance}}]"
-        },
-        {
-          "datasource": {
-            "type": "prometheus"
-          },
-          "expr": "histogram_quantile(0.50, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"tx.apply\"}[5m])))",
-          "legendFormat": "P50 tx.apply [{{exported_instance}}]"
        }
      ],
      "fieldConfig": {
--- a/docker/telemetry/grafana/dashboards/system-node-health.json
+++ b/docker/telemetry/grafana/dashboards/system-node-health.json
@@ -243,7 +243,7 @@
    },
    {
      "title": "I/O Latency",
-      "description": "P95 and P50 of the I/O service loop latency in milliseconds. Sourced from the ios_latency event (Application.cpp) which measures how long it takes for the io_context to process a timer callback. Values above 10ms are logged; above 500ms trigger warnings. High values indicate thread pool saturation or blocking operations.",
+      "description": "P95 of the I/O service loop latency in milliseconds. Sourced from the ios_latency event (Application.cpp) which measures how long it takes for the io_context to process a timer callback. Values above 10ms are logged; above 500ms trigger warnings. High values indicate thread pool saturation or blocking operations.",
      "type": "timeseries",
      "gridPos": {
        "h": 8,
@@ -264,13 +264,6 @@
          },
          "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(xrpld_ios_latency_milliseconds_bucket{exported_instance=~\"$node\"}[5m])))",
          "legendFormat": "P95 I/O Latency [{{exported_instance}}]"
-        },
-        {
-          "datasource": {
-            "type": "prometheus"
-          },
-          "expr": "histogram_quantile(0.50, sum by (le, exported_instance) (rate(xrpld_ios_latency_milliseconds_bucket{exported_instance=~\"$node\"}[5m])))",
-          "legendFormat": "P50 I/O Latency [{{exported_instance}}]"
        }
      ],
      "fieldConfig": {
@@ -2054,7 +2047,7 @@
    },
    {
      "title": "Ledger Acquire Duration (Inbound Fetch)",
-      "description": "p95/p50 duration of the ledger.acquire span (InboundLedger): how long it takes to fetch a missing ledger from peers. A spike signals the node is falling behind or recovering from a fork. Populated under back-fill / sync activity.",
+      "description": "p95 duration of the ledger.acquire span (InboundLedger): how long it takes to fetch a missing ledger from peers. A spike signals the node is falling behind or recovering from a fork. Populated under back-fill / sync activity.",
      "type": "timeseries",
      "gridPos": {
        "h": 8,
@@ -2075,13 +2068,6 @@
          },
          "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"ledger.acquire\"}[5m])))",
          "legendFormat": "P95 Acquire [{{exported_instance}}]"
-        },
-        {
-          "datasource": {
-            "type": "prometheus"
-          },
-          "expr": "histogram_quantile(0.50, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"ledger.acquire\"}[5m])))",
-          "legendFormat": "P50 Acquire [{{exported_instance}}]"
        }
      ],
      "fieldConfig": {
--- a/docker/telemetry/grafana/dashboards/system-rpc-pathfinding.json
+++ b/docker/telemetry/grafana/dashboards/system-rpc-pathfinding.json
@@ -43,7 +43,7 @@
    },
    {
      "title": "RPC Response Time (System Metrics)",
-      "description": "P95 and P50 of RPC response time from the beast::insight timer. Sourced from the rpc.time event (ServerHandler.cpp) which records elapsed milliseconds for each RPC response. This measures the full HTTP handler time, not just command execution. Compare with span-based rpc.request duration.",
+      "description": "P95 of RPC response time from the beast::insight timer. Sourced from the rpc.time event (ServerHandler.cpp) which records elapsed milliseconds for each RPC response. This measures the full HTTP handler time, not just command execution. Compare with span-based rpc.request duration.",
      "type": "timeseries",
      "gridPos": {
        "h": 8,
@@ -64,13 +64,6 @@
          },
          "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(xrpld_rpc_time_milliseconds_bucket{exported_instance=~\"$node\"}[5m])))",
          "legendFormat": "P95 Response Time [{{exported_instance}}]"
-        },
-        {
-          "datasource": {
-            "type": "prometheus"
-          },
-          "expr": "histogram_quantile(0.5, sum by (le, exported_instance) (rate(xrpld_rpc_time_milliseconds_bucket{exported_instance=~\"$node\"}[5m])))",
-          "legendFormat": "P50 Response Time [{{exported_instance}}]"
        }
      ],
      "fieldConfig": {
@@ -89,7 +82,7 @@
    },
    {
      "title": "RPC Response Size",
-      "description": "P95 and P50 of RPC response payload size in bytes. Sourced from the rpc.size event (ServerHandler.cpp) which records the byte length of each RPC JSON response. Large responses may indicate expensive queries (e.g. account_tx with many results) or API misuse.",
+      "description": "P95 of RPC response payload size in bytes. Sourced from the rpc.size event (ServerHandler.cpp) which records the byte length of each RPC JSON response. Large responses may indicate expensive queries (e.g. account_tx with many results) or API misuse.",
      "type": "timeseries",
      "gridPos": {
        "h": 8,
@@ -110,13 +103,6 @@
          },
          "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(xrpld_rpc_size_milliseconds_bucket{exported_instance=~\"$node\"}[5m])))",
          "legendFormat": "P95 Response Size [{{exported_instance}}]"
-        },
-        {
-          "datasource": {
-            "type": "prometheus"
-          },
-          "expr": "histogram_quantile(0.5, sum by (le, exported_instance) (rate(xrpld_rpc_size_milliseconds_bucket{exported_instance=~\"$node\"}[5m])))",
-          "legendFormat": "P50 Response Size [{{exported_instance}}]"
        }
      ],
      "fieldConfig": {
@@ -135,7 +121,7 @@
    },
    {
      "title": "RPC Response Time Distribution",
-      "description": "Distribution of RPC response times from the beast::insight timer showing P50, P90, P95, and P99 quantiles. Sourced from the rpc.time event (ServerHandler.cpp). Useful for detecting bimodal latency or long-tail requests.",
+      "description": "Distribution of RPC response times from the beast::insight timer showing P90, P95, and P99 quantiles. Sourced from the rpc.time event (ServerHandler.cpp). Useful for detecting bimodal latency or long-tail requests.",
      "type": "timeseries",
      "gridPos": {
        "h": 8,
@@ -150,13 +136,6 @@
        }
      },
      "targets": [
-        {
-          "datasource": {
-            "type": "prometheus"
-          },
-          "expr": "histogram_quantile(0.5, sum by (le, exported_instance) (rate(xrpld_rpc_time_milliseconds_bucket{exported_instance=~\"$node\"}[5m])))",
-          "legendFormat": "P50 [{{exported_instance}}]"
-        },
        {
          "datasource": {
            "type": "prometheus"
@@ -195,7 +174,7 @@
    },
    {
      "title": "Pathfinding Fast Duration",
-      "description": "P95 and P50 of fast pathfinding execution time. Sourced from the pathfind_fast event (PathRequests.h) which records the duration of the fast pathfinding algorithm. Fast pathfinding uses a simplified search that trades accuracy for speed.",
+      "description": "P95 of fast pathfinding execution time. Sourced from the pathfind_fast event (PathRequests.h) which records the duration of the fast pathfinding algorithm. Fast pathfinding uses a simplified search that trades accuracy for speed.",
      "type": "timeseries",
      "gridPos": {
        "h": 8,
@@ -216,13 +195,6 @@
          },
          "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(xrpld_pathfind_fast_milliseconds_bucket{exported_instance=~\"$node\"}[5m])))",
          "legendFormat": "P95 Fast Pathfind [{{exported_instance}}]"
-        },
-        {
-          "datasource": {
-            "type": "prometheus"
-          },
-          "expr": "histogram_quantile(0.5, sum by (le, exported_instance) (rate(xrpld_pathfind_fast_milliseconds_bucket{exported_instance=~\"$node\"}[5m])))",
-          "legendFormat": "P50 Fast Pathfind [{{exported_instance}}]"
        }
      ],
      "fieldConfig": {
@@ -241,7 +213,7 @@
    },
    {
      "title": "Pathfinding Full Duration",
-      "description": "P95 and P50 of full pathfinding execution time. Sourced from the pathfind_full event (PathRequests.h) which records the duration of the exhaustive pathfinding search. Full pathfinding is more expensive and can take significantly longer than fast mode.",
+      "description": "P95 of full pathfinding execution time. Sourced from the pathfind_full event (PathRequests.h) which records the duration of the exhaustive pathfinding search. Full pathfinding is more expensive and can take significantly longer than fast mode.",
      "type": "timeseries",
      "gridPos": {
        "h": 8,
@@ -262,13 +234,6 @@
          },
          "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(xrpld_pathfind_full_milliseconds_bucket{exported_instance=~\"$node\"}[5m])))",
          "legendFormat": "P95 Full Pathfind [{{exported_instance}}]"
-        },
-        {
-          "datasource": {
-            "type": "prometheus"
-          },
-          "expr": "histogram_quantile(0.5, sum by (le, exported_instance) (rate(xrpld_pathfind_full_milliseconds_bucket{exported_instance=~\"$node\"}[5m])))",
-          "legendFormat": "P50 Full Pathfind [{{exported_instance}}]"
        }
      ],
      "fieldConfig": {
@@ -500,7 +465,7 @@
    },
    {
      "title": "Pathfinding Compute Duration (Spans)",
-      "description": "p95/p50 of the pathfind.compute span, the per-request path computation. Complements the StatsD pathfind_fast/full timers with span-level visibility. Populated under pathfinding (book/path) RPC load.",
+      "description": "p95 of the pathfind.compute span, the per-request path computation. Complements the StatsD pathfind_fast/full timers with span-level visibility. Populated under pathfinding (book/path) RPC load.",
      "type": "timeseries",
      "gridPos": {
        "h": 8,
@@ -521,13 +486,6 @@
          },
          "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"pathfind.compute\"}[5m])))",
          "legendFormat": "P95 Compute [{{exported_instance}}]"
-        },
-        {
-          "datasource": {
-            "type": "prometheus"
-          },
-          "expr": "histogram_quantile(0.50, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"pathfind.compute\"}[5m])))",
-          "legendFormat": "P50 Compute [{{exported_instance}}]"
        }
      ],
      "fieldConfig": {
--- a/docker/telemetry/grafana/dashboards/transaction-overview.json
+++ b/docker/telemetry/grafana/dashboards/transaction-overview.json
@@ -56,7 +56,7 @@
    },
    {
      "title": "Transaction Processing Latency by Type",
-      "description": "Per-transaction-type processing latency (p95 and p50). Filter with $tx_type variable above.",
+      "description": "Per-transaction-type processing latency (p95). Filter with $tx_type variable above.",
      "type": "timeseries",
      "gridPos": {
        "h": 8,
@@ -82,13 +82,6 @@
          },
          "expr": "histogram_quantile(0.95, sum by (le, tx_type, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"tx.process\", tx_type=~\"$tx_type\"}[5m])))",
          "legendFormat": "P95 {{tx_type}} [{{exported_instance}}]"
-        },
-        {
-          "datasource": {
-            "type": "prometheus"
-          },
-          "expr": "histogram_quantile(0.50, sum by (le, tx_type, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"tx.process\", tx_type=~\"$tx_type\"}[5m])))",
-          "legendFormat": "P50 {{tx_type}} [{{exported_instance}}]"
        }
      ],
      "fieldConfig": {
@@ -208,7 +201,7 @@
    },
    {
      "title": "Transaction Apply Duration per Ledger",
-      "description": "p95 and p50 latency of applying the consensus transaction set to a new ledger. The tx.apply span (BuildLedger.cpp) wraps the applyTransactions() function that iterates through the CanonicalTXSet and applies each transaction to the OpenView. Long durations indicate heavy transaction sets or expensive transaction processing.",
+      "description": "p95 latency of applying the consensus transaction set to a new ledger. The tx.apply span (BuildLedger.cpp) wraps the applyTransactions() function that iterates through the CanonicalTXSet and applies each transaction to the OpenView. Long durations indicate heavy transaction sets or expensive transaction processing.",
      "type": "timeseries",
      "gridPos": {
        "h": 8,
@@ -229,13 +222,6 @@
          },
          "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"tx.apply\"}[5m])))",
          "legendFormat": "P95 tx.apply [{{exported_instance}}]"
-        },
-        {
-          "datasource": {
-            "type": "prometheus"
-          },
-          "expr": "histogram_quantile(0.50, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"tx.apply\"}[5m])))",
-          "legendFormat": "P50 tx.apply [{{exported_instance}}]"
        }
      ],
      "fieldConfig": {
@@ -587,7 +573,7 @@
    },
    {
      "title": "Queue Accept (Drain) Duration per Ledger",
-      "description": "p95/p50 duration of the txq.accept span, which drains queued transactions into a newly closed ledger. Rising drain time signals queue pressure at ledger close.",
+      "description": "p95 duration of the txq.accept span, which drains queued transactions into a newly closed ledger. Rising drain time signals queue pressure at ledger close.",
      "type": "timeseries",
      "gridPos": {
        "h": 8,
@@ -608,13 +594,6 @@
          },
          "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"txq.accept\"}[5m])))",
          "legendFormat": "P95 Drain [{{exported_instance}}]"
-        },
-        {
-          "datasource": {
-            "type": "prometheus"
-          },
-          "expr": "histogram_quantile(0.50, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"txq.accept\"}[5m])))",
-          "legendFormat": "P50 Drain [{{exported_instance}}]"
        }
      ],
      "fieldConfig": {
@@ -669,6 +648,138 @@
        },
        "overrides": []
      }
+    },
+    {
+      "title": "Tx Apply Pipeline Rate by Stage",
+      "description": "Span rate for each apply-pipeline stage (preflight, preclaim, apply). A drop between stages shows where transactions are filtered out. Requires the stage dimension in spanmetrics.",
+      "type": "timeseries",
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 0,
+        "y": 64
+      },
+      "options": {
+        "tooltip": {
+          "mode": "multi",
+          "sort": "desc"
+        },
+        "legend": {
+          "displayMode": "table",
+          "placement": "right",
+          "calcs": ["mean", "max"]
+        }
+      },
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus"
+          },
+          "expr": "sum by (stage, exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=~\"tx.preflight|tx.preclaim|tx.transactor\", stage=~\"$stage\"}[5m]))",
+          "legendFormat": "{{stage}} [{{exported_instance}}]"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "ops",
+          "custom": {
+            "axisLabel": "Spans / Sec",
+            "spanNulls": true,
+            "insertNulls": false,
+            "showPoints": "auto",
+            "pointSize": 3
+          }
+        },
+        "overrides": []
+      }
+    },
+    {
+      "title": "Tx Apply Pipeline Latency by Stage (p95)",
+      "description": "95th-percentile duration of each apply-pipeline stage. Isolates which stage (preflight, preclaim, apply) dominates transaction processing time.",
+      "type": "timeseries",
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 12,
+        "y": 64
+      },
+      "options": {
+        "tooltip": {
+          "mode": "multi",
+          "sort": "desc"
+        },
+        "legend": {
+          "displayMode": "table",
+          "placement": "right",
+          "calcs": ["mean", "max"]
+        }
+      },
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus"
+          },
+          "expr": "histogram_quantile(0.95, sum by (le, stage, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=~\"tx.preflight|tx.preclaim|tx.transactor\", stage=~\"$stage\"}[5m])))",
+          "legendFormat": "P95 {{stage}} [{{exported_instance}}]"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "ms",
+          "custom": {
+            "axisLabel": "Duration (ms)",
+            "spanNulls": true,
+            "insertNulls": false,
+            "showPoints": "auto",
+            "pointSize": 3
+          }
+        },
+        "overrides": []
+      }
+    },
+    {
+      "title": "Tx Apply Pipeline Failure Rate by Stage",
+      "description": "Rate of apply-pipeline spans whose ter_result is not tesSUCCESS, split by stage. Shows whether failures concentrate in preflight, preclaim, or apply. Filters on ter_result rather than span status because a failing ter code completes the span normally; only thrown exceptions set an error status.",
+      "type": "timeseries",
+      "gridPos": {
+        "h": 8,
+        "w": 24,
+        "x": 0,
+        "y": 72
+      },
+      "options": {
+        "tooltip": {
+          "mode": "multi",
+          "sort": "desc"
+        },
+        "legend": {
+          "displayMode": "table",
+          "placement": "right",
+          "calcs": ["mean", "max"]
+        }
+      },
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus"
+          },
+          "expr": "sum by (stage, exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=~\"tx.preflight|tx.preclaim|tx.transactor\", stage=~\"$stage\", ter_result!~\"tesSUCCESS|\"}[5m]))",
+          "legendFormat": "{{stage}} [{{exported_instance}}]"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "ops",
+          "custom": {
+            "axisLabel": "Failed Spans / Sec",
+            "spanNulls": true,
+            "insertNulls": false,
+            "showPoints": "auto",
+            "pointSize": 3
+          }
+        },
+        "overrides": []
+      }
    }
  ],
  "schemaVersion": 39,
@@ -768,6 +879,24 @@
        },
        "sort": 1,
        "label": "Queue Status"
+      },
+      {
+        "name": "stage",
+        "type": "query",
+        "datasource": {
+          "type": "prometheus"
+        },
+        "query": "label_values(traces_span_metrics_calls_total{span_name=~\"tx.preflight|tx.preclaim|tx.transactor\", stage!=\"\"}, stage)",
+        "refresh": 2,
+        "includeAll": true,
+        "multi": true,
+        "allValue": ".*",
+        "current": {
+          "text": "All",
+          "value": "$__all"
+        },
+        "sort": 1,
+        "label": "Apply Stage"
      }
    ]
  },
--- a/docker/telemetry/grafana/provisioning/datasources/tempo.yaml
+++ b/docker/telemetry/grafana/provisioning/datasources/tempo.yaml
@@ -3,12 +3,10 @@
 # Access Grafana at http://localhost:3000, then use Explore -> Tempo
 # to browse xrpld traces using TraceQL.
 #
-# Search filters provide pre-configured dropdowns in the Explore UI.
-# Each phase adds filters for the span attributes it introduces.
-# Phase 1b (infra): Base filters — node identity, service, span name, status.
-# Phase 2 (RPC):    RPC command, status, role filters.
-# Phase 3 (TX):     Transaction hash, local/peer origin, status.
-# Phase 4 (Cons):   Consensus mode, round, ledger sequence, close time.
+# Search filters provide quick-start dropdowns in the Explore UI for the most
+# common investigation entry points. This is not an exhaustive attribute list —
+# use TraceQL autocomplete or see OpenTelemetryPlan/09-data-collection-reference.md §4
+# for the full attribute inventory and example queries.

 apiVersion: 1

@@ -40,177 +38,45 @@ datasources:
        spanEndTimeShift: "1h"
      search:
        filters:
-          # --- Node identification filters ---
-          # service.name: logical service name (default: "xrpld").
-          #   Useful when running multiple service types in the same collector.
-          - id: service-name
-            tag: service.name
-            operator: "="
-            scope: resource
-            type: static
-          # service.instance.id: unique node identifier — configurable via
-          #   the service_instance_id setting in [telemetry], defaults to the
-          #   node's public key. E.g. "Node-1" or "nHB1X37...".
+          # service.instance.id: unique node identifier (public key or configured name).
          - id: node-id
            tag: service.instance.id
            operator: "="
            scope: resource
            type: static
-          # service.version: xrpld build version (e.g., "2.4.0-b1").
-          #   Filter traces from specific software releases.
-          - id: node-version
-            tag: service.version
-            operator: "="
-            scope: resource
-            type: dynamic
-          # xrpl.network.id: numeric network identifier
-          #   (0 = mainnet, 1 = testnet, 2 = devnet, etc.).
-          #   Derived from the [network_id] config section.
-          - id: network-id
-            tag: xrpl.network.id
-            operator: "="
-            scope: resource
-            type: dynamic
-          # xrpl.network.type: human-readable network name derived from
-          #   network ID ("mainnet", "testnet", "devnet", "unknown").
-          - id: network-type
-            tag: xrpl.network.type
-            operator: "="
-            scope: resource
-            type: static
-          # --- Span intrinsic filters ---
-          # name: the span operation name (e.g., "rpc.command.server_info").
-          #   Use to find traces for a specific RPC command or subsystem.
+          # name: span operation name (e.g., "rpc.command.server_info").
          - id: span-name
            tag: name
            operator: "="
            scope: intrinsic
            type: static
          # status: span completion status ("ok", "error", "unset").
-          #   Filter for failed operations to diagnose errors.
          - id: span-status
            tag: status
            operator: "="
            scope: intrinsic
            type: static
-          # duration: span wall-clock duration. Use with ">" operator
-          #   to find slow operations (e.g., duration > 500ms).
-          - id: span-duration
-            tag: duration
-            operator: ">"
-            scope: intrinsic
-            type: static
-          # Phase 2: RPC tracing filters
+          # command: RPC command name (e.g., "server_info", "submit").
          - id: rpc-command
            tag: command
            operator: "="
            scope: span
            type: static
-          - id: rpc-status
-            tag: rpc_status
-            operator: "="
-            scope: span
-            type: dynamic
-          - id: rpc-role
-            tag: rpc_role
-            operator: "="
-            scope: span
-            type: dynamic
-          # Phase 3: Transaction tracing filters
+          # tx_hash: transaction hash — direct lookup for a known transaction.
          - id: tx-hash
            tag: tx_hash
            operator: "="
            scope: span
            type: static
-          - id: tx-origin
-            tag: local
-            operator: "="
-            scope: span
-            type: dynamic
-          - id: tx-status
-            tag: tx_status
-            operator: "="
-            scope: span
-            type: dynamic
-          # Phase 4: Consensus tracing filters
-          - id: consensus-mode
-            tag: xrpl.consensus.mode
+          # tx_type: transaction type (e.g., "Payment", "OfferCreate").
+          - id: tx-type
+            tag: tx_type
            operator: "="
            scope: span
            type: static
-          - id: consensus-round
-            tag: xrpl.consensus.round
-            operator: "="
-            scope: span
-            type: dynamic
-          - id: consensus-ledger-seq
-            tag: xrpl.ledger.seq
+          # ledger_hash: ledger hash — scope all spans to a specific closed ledger.
+          - id: ledger-hash
+            tag: ledger_hash
            operator: "="
            scope: span
            type: static
-          - id: consensus-close-time-correct
-            tag: close_time_correct
-            operator: "="
-            scope: span
-            type: dynamic
-          - id: consensus-state
-            tag: consensus_state
-            operator: "="
-            scope: span
-            type: dynamic
-          - id: consensus-close-resolution
-            tag: close_resolution_ms
-            operator: "="
-            scope: span
-            type: dynamic
-          - id: consensus-proposers
-            tag: proposers
-            operator: "="
-            scope: span
-            type: dynamic
-          - id: consensus-result
-            tag: consensus_result
-            operator: "="
-            scope: span
-            type: dynamic
-          - id: consensus-mode-old
-            tag: mode_old
-            operator: "="
-            scope: span
-            type: dynamic
-          - id: consensus-mode-new
-            tag: mode_new
-            operator: "="
-            scope: span
-            type: dynamic
-          - id: consensus-ledger-id
-            tag: xrpl.consensus.ledger_id
-            operator: "="
-            scope: span
-            type: static
-          # Phase 3/4: Additional transaction and queue filters
-          - id: tx-path
-            tag: path
-            operator: "="
-            scope: span
-            type: dynamic
-          - id: tx-suppressed
-            tag: suppressed
-            operator: "="
-            scope: span
-            type: dynamic
-          - id: peer-version
-            tag: peer_version
-            operator: "="
-            scope: span
-            type: dynamic
-          - id: txq-status
-            tag: txq_status
-            operator: "="
-            scope: span
-            type: dynamic
-          - id: txq-ter-code
-            tag: ter_code
-            operator: "="
-            scope: span
-            type: dynamic
--- a/docker/telemetry/otel-collector-config.yaml
+++ b/docker/telemetry/otel-collector-config.yaml
@@ -94,6 +94,9 @@ connectors:
      - name: validation_trusted
      - name: tx_type
      - name: ter_result
+      # Apply-pipeline stage (preflight|preclaim|apply) — splits the
+      # tx.preflight/tx.preclaim/tx.transactor span RED metrics per stage.
+      - name: stage
      - name: txq_status
      - name: load_type
      - name: is_batch
--- a/docs/telemetry-runbook.md
+++ b/docs/telemetry-runbook.md
@@ -75,11 +75,20 @@ All spans instrumented in xrpld, grouped by subsystem:

 ### Transaction Spans (Phase 3)

-| Span Name    | Source File     | Attributes                                                                        | Description                           |
-| ------------ | --------------- | --------------------------------------------------------------------------------- | ------------------------------------- |
-| `tx.process` | NetworkOPs.cpp  | `tx_hash`, `local`, `path`, `tx_type`, `fee`, `sequence`, `ter_result`, `applied` | Transaction submission and processing |
-| `tx.receive` | PeerImp.cpp     | `peer_id`, `tx_hash`, `tx_type`, `peer_version`, `suppressed`, `tx_status`        | Transaction received from peer relay  |
-| `tx.apply`   | BuildLedger.cpp | `ledger_seq`, `tx_count`, `tx_failed`                                             | Transaction set applied per ledger    |
+| Span Name       | Source File     | Attributes                                                                        | Description                           |
+| --------------- | --------------- | --------------------------------------------------------------------------------- | ------------------------------------- |
+| `tx.process`    | NetworkOPs.cpp  | `tx_hash`, `local`, `path`, `tx_type`, `fee`, `sequence`, `ter_result`, `applied` | Transaction submission and processing |
+| `tx.receive`    | PeerImp.cpp     | `peer_id`, `tx_hash`, `tx_type`, `peer_version`, `suppressed`, `tx_status`        | Transaction received from peer relay  |
+| `tx.apply`      | BuildLedger.cpp | `ledger_seq`, `tx_count`, `tx_failed`                                             | Transaction set applied per ledger    |
+| `tx.preflight`  | applySteps.cpp  | `stage`, `tx_type`, `ter_result`                                                  | Stateless checks stage                |
+| `tx.preclaim`   | applySteps.cpp  | `stage`, `tx_type`, `ter_result`                                                  | Ledger-aware checks stage             |
+| `tx.transactor` | Transactor.cpp  | `stage`, `tx_type`, `ter_result`, `applied`                                       | Apply stage (transactor runs)         |
+
+The three apply-pipeline spans (`tx.preflight`, `tx.preclaim`, `tx.transactor`)
+share a deterministic `trace_id` derived from `txID[0:16]`, so they group under one
+trace per transaction. The `stage` attribute (`preflight` / `preclaim` /
+`apply`) drives the collector spanmetrics `stage` dimension, giving per-stage
+RED metrics on the _Transaction Overview_ dashboard.

 ### Transaction Queue Spans (Phase 3)

@@ -119,21 +128,23 @@ All spans instrumented in xrpld, grouped by subsystem:

 #### Close Time Queries (Tempo TraceQL)

+Span attributes are filtered with `span.<attr>` inside `{}`. Combine conditions with `&&`.
+
 ```
 # Find rounds where validators disagreed on close time
-{name="consensus.accept.apply"} | close_time_correct = false
+{name="consensus.accept.apply" && span.close_time_correct = false}

 # Find consensus failures (moved_on)
-{name="consensus.accept.apply"} | consensus_state = "moved_on"
+{name="consensus.accept.apply" && span.consensus_state = "moved_on"}

 # Find slow ledger applications (>5s)
-{name="consensus.accept.apply"} | duration > 5s
+{name="consensus.accept.apply" && duration > 5000ms}

 # Find specific ledger's consensus details
-{name="consensus.accept.apply"} | ledger_seq = 92345678
+{name="consensus.accept.apply" && span.ledger_seq = 92345678}

 # Find all spans in a consensus round (deterministic trace strategy)
-{name="consensus.round"} | consensus_round_id = <round_id>
+{name="consensus.round" && span.consensus_round_id = "<round_id>"}

 # Find dispute resolutions
 {name="consensus.update_positions"} >> {event:name="dispute.resolve"}
@@ -160,127 +171,246 @@ All spans instrumented in xrpld, grouped by subsystem:

 This section shows what questions you can answer using the span attributes, with example Tempo TraceQL queries.

+**TraceQL syntax note:** span attributes must be referenced with the `span.` prefix inside `{}`.
+Conditions are combined with `&&`. The `|` pipeline operator is not supported on this Tempo version.
+
+```
+# General pattern
+{name="<span-name>" && span.<attr> = <value> && span.<attr2> != <value2>}
+
+# Duration filter (no prefix needed)
+{name="<span-name>" && duration > 500ms}
+
+# Regex match
+{name="<span-name>" && span.<attr> =~ "<pattern>.*"}
+
+# Multiple span names
+{name = "<span-a>" || name = "<span-b>"}
+
+# Name regex
+{name =~ "<pattern>.*" && span.<attr> = <value>}
+
+# Structural: find parent spans that have a matching child/event
+{name="<parent>"} >> {event:name="<event-name>"}
+```
+
 ### Transaction Workflow Analysis

 ```
-# Find all AMM transactions (AMMDeposit, AMMWithdraw, AMMCreate, etc.)
-{name="tx.process"} | tx_type =~ "AMM.*"
+# Find all AMM transactions (e.g. AMMDeposit, AMMWithdraw, AMMVote)
+{name="tx.process" && span.tx_type =~ "AMM.*"}
+
+# Find a specific AMM operation
+{name="tx.process" && span.tx_type = "AMMDeposit"}
+{name="tx.process" && span.tx_type = "AMMWithdraw"}
+{name="tx.process" && span.tx_type = "AMMVote"}

 # Find Payment transactions that failed
-{name="tx.process"} | tx_type = "Payment" && ter_result != "tesSUCCESS"
+{name="tx.process" && span.tx_type = "Payment" && span.ter_result != "tesSUCCESS"}
+
+# Find Payment failures due to path issues
+{name="tx.process" && span.tx_type = "Payment" && span.ter_result =~ "tecPATH.*"}

 # Compare latency of different transaction types
-{name="tx.process"} | tx_type = "OfferCreate"
-{name="tx.process"} | tx_type = "Payment"
+{name="tx.process" && span.tx_type = "OfferCreate"}
+{name="tx.process" && span.tx_type = "Payment"}

 # Find high-fee transactions (fee > 1 XRP = 1000000 drops)
-{name="tx.process"} | fee > 1000000
+{name="tx.process" && span.fee > 1000000}

 # Find transactions that were not applied
-{name="tx.process"} | applied = false
+{name="tx.process" && span.applied = false}

-# Trace a specific transaction by type across the network
-{name=~"tx\\..*"} | tx_type = "NFTokenMint"
+# Find NFTokenMint across tx and txq spans
+{name =~ "tx.*|txq.*" && span.tx_type = "NFTokenMint"}
+
+# Find all NFT-related activity
+{name =~ "tx.*|txq.*" && span.tx_type =~ "NFToken.*"}
+
+# Find TrustSet transactions (IOU trust lines)
+{name="tx.process" && span.tx_type = "TrustSet"}
+
+# Find oracle price updates
+{name="tx.process" && span.tx_type = "OracleSet"}
 ```

+### DEX (OfferCreate / OfferCancel)
+
+```
+# All DEX offer creates
+{name="tx.process" && span.tx_type = "OfferCreate"}
+
+# Offers killed (ImmediateOrCancel/FillOrKill with no fill)
+{name="tx.process" && span.tx_type = "OfferCreate" && span.ter_result = "tecKILLED"}
+
+# Offers that failed due to insufficient funds
+{name="tx.process" && span.tx_type = "OfferCreate" && span.ter_result = "tecUNFUNDED_OFFER"}
+
+# Offers failed due to insufficient reserve to place the offer
+{name="tx.process" && span.tx_type = "OfferCreate" && span.ter_result = "tecINSUF_RESERVE_OFFER"}
+
+# Offer cancellations
+{name="tx.process" && span.tx_type = "OfferCancel"}
+
+# OfferCreate transactions received from peers (cross-node relay)
+{name="tx.receive" && span.tx_type = "OfferCreate"}
+```
+
+### Apply Pipeline by Stage
+
+```
+# Spans for all three stages (preflight -> preclaim -> apply); each transaction's stages share one trace
+{name=~"tx.preflight|tx.preclaim|tx.transactor"}
+
+# Transactions that failed at the preclaim stage
+{name="tx.preclaim" && span.ter_result != "tesSUCCESS"}
+
+# Transactions that hard-failed preflight (never reached preclaim/apply)
+{name="tx.preflight" && span.ter_result != "tesSUCCESS"}
+```
+
+PromQL on the span-derived metrics (dashboard: _Transaction Overview_):
+
+```
+# Per-stage throughput — the funnel preflight >= preclaim >= apply
+sum by (stage) (rate(traces_span_metrics_calls_total{span_name=~"tx.preflight|tx.preclaim|tx.transactor"}[5m]))
+
+# Per-stage p95 latency
+histogram_quantile(0.95, sum by (le, stage) (rate(traces_span_metrics_duration_milliseconds_bucket{span_name=~"tx.preflight|tx.preclaim|tx.transactor"}[5m])))
+
+# Per-stage failure rate (ter_result != tesSUCCESS; a failing ter completes the
+# span normally, so filter on the attribute, not status_code which only flags exceptions)
+sum by (stage) (rate(traces_span_metrics_calls_total{span_name=~"tx.preflight|tx.preclaim|tx.transactor", ter_result!~"tesSUCCESS|"}[5m]))
+```
+
+> **Alerting**: a rising `tx.preflight` / `tx.preclaim` failure rate points to
+> malformed or stale-sequence submissions (often spam or a misbehaving client);
+> a rising `tx.transactor` failure rate points to apply-time problems. Alert per
+> stage rather than on a single aggregate so the failing stage is obvious.
+
+> **Sampling caveat**: these stage metrics are span-derived and inherit the
+> **tracer head-sampling** ratio (`sampling_ratio`). At `sampling_ratio < 1.0`
+> they undercount proportionally — treat them as relative trends, not absolute
+> transaction counts. Native StatsD metrics are unsampled.
+
 ### Transaction Queue Health

 ```
 # Find transactions rejected from the queue
-{name="txq.accept_tx"} | txq_status = "failed"
+{name="txq.accept_tx" && span.txq_status = "failed"}

-# Which transaction types get queued most often?
-{name="txq.enqueue"} | tx_type = "Payment"
-{name="txq.enqueue"} | tx_type = "OfferCreate"
-
-# Find ledger closes that applied queued transactions
-{name="txq.accept"} | ledger_changed = true
+# Find transactions being retried
+{name="txq.accept_tx" && span.txq_status = "retried"}

 # Find transactions that exhausted retries
-{name="txq.accept_tx"} | txq_status = "retried" && retries_remaining = 0
+{name="txq.accept_tx" && span.txq_status = "retried" && span.retries_remaining = 0}
+
+# Which transaction types get queued most often?
+{name="txq.enqueue" && span.tx_type = "Payment"}
+{name="txq.enqueue" && span.tx_type = "OfferCreate"}
+{name="txq.enqueue" && span.tx_type =~ "NFToken.*"}
+
+# Find ledger closes that applied queued transactions
+{name="txq.accept" && span.ledger_changed = true}
 ```

 ### RPC Debugging

 ```
 # Find batch RPC requests
-{name="rpc.process"} | is_batch = true
+{name="rpc.process" && span.is_batch = true}

 # Find large RPC payloads (>100KB)
-{name="rpc.http_request"} | request_payload_size > 100000
+{name="rpc.http_request" && span.request_payload_size > 100000}

 # Find resource-heavy RPC commands (by load_type)
-{name=~"rpc.command.*"} | load_type = "exceptioned RPC"
+{name =~ "rpc.command.*" && span.load_type = "exceptioned RPC"}

 # Find a specific WebSocket command
-{name="rpc.ws_message"} | command = "subscribe"
+{name="rpc.ws_message" && span.command = "subscribe"}
+
+# Find server_info calls
+{name="rpc.command.server_info"}

 # Find slow pathfinding with many source assets
-{name="pathfind.discover"} | pathfind_num_source_assets > 10
+{name="pathfind.discover" && span.pathfind_num_source_assets > 10}
 ```

 ### PathFinding Performance

 ```
 # Find pathfinding for specific currencies
-{name="pathfind.compute"} | pathfind_dest_currency = "USD"
+{name="pathfind.compute" && span.pathfind_dest_currency = "USD"}

 # Find expensive pathfinding (many source assets to explore)
-{name="pathfind.discover"} | pathfind_num_source_assets > 20
+{name="pathfind.discover" && span.pathfind_num_source_assets > 20}

-# Find large pathfinding requests
-{name="pathfind.compute"} | duration > 1s
+# Find slow pathfinding requests
+{name="pathfind.compute" && duration > 1000ms}
 ```

 ### Consensus Health

 ```
 # Find rounds where consensus timed out (expired)
-{name="consensus.accept"} | consensus_state = "expired"
+{name="consensus.accept" && span.consensus_state = "expired"}

 # Find rounds where we moved on without full agreement
-{name="consensus.accept"} | consensus_state = "moved_on"
+{name="consensus.accept" && span.consensus_state = "moved_on"}

 # Find rounds with many disputes
-{name="consensus.accept"} | disputes_count > 5
+{name="consensus.accept" && span.disputes_count > 5}
+
+# Find slow consensus rounds (>5s)
+{name="consensus.accept" && span.round_time_ms > 5000}

 # Find bow-out proposals (node resigned from round)
-{name="consensus.proposal.send"} | is_bow_out = true
+{name="consensus.proposal.send" && span.is_bow_out = true}

 # Correlate validation with its ledger
-{name="consensus.validation.send"} | ledger_hash = "<hash>"
+{name="consensus.validation.send" && span.ledger_hash = "<hash>"}

 # Find rounds where validators disagreed on close time
-{name="consensus.accept.apply"} | close_time_correct = false
+{name="consensus.accept.apply" && span.close_time_correct = false}
+
+# Find both validation send and receive (compare sender vs receiver latency)
+{name = "consensus.validation.send" || name = "consensus.validation.receive"}
 ```

 ### Cross-Subsystem Correlation

 ```
 # Follow a transaction from receive through queue to ledger
-{name=~"tx\\..*|txq\\..*"} | tx_type = "Payment" && duration > 500ms
+{name =~ "tx.*|txq.*" && span.tx_type = "Payment" && duration > 500ms}

-# Find all NFT-related activity
-{name=~"tx\\..*|txq\\..*"} | tx_type =~ "NFToken.*"
+# Find all NFT-related activity across tx and txq spans
+{name =~ "tx.*|txq.*" && span.tx_type =~ "NFToken.*"}

-# Find consensus rounds with slow transactions
-{name="consensus.accept"} | round_time_ms > 5000
+# Find all AMM activity across tx and txq spans
+{name =~ "tx.*|txq.*" && span.tx_type =~ "AMM.*"}
+
+# Find cross-node transaction receives (no errors)
+{name="tx.receive" && status != error}
 ```

 ### Where to Look (Quick Reference)

-| Question                            | Span                        | Key Attributes                 |
-| ----------------------------------- | --------------------------- | ------------------------------ |
-| "Which tx type is slowest?"         | `tx.process`                | `tx_type` + duration           |
-| "Why was my tx rejected?"           | `tx.process`                | `ter_result`, `applied`        |
-| "Is the TxQ backing up?"            | `txq.accept`                | `queue_size`, `ledger_changed` |
-| "Why was my tx dropped from queue?" | `txq.accept_tx`             | `txq_status`, `ter_code`       |
-| "Are batch requests a problem?"     | `rpc.process`               | `is_batch`, `batch_size`       |
-| "Which RPC is expensive?"           | `rpc.command.*`             | `load_type`, duration          |
-| "Did consensus stall?"              | `consensus.check`           | `consensus_stalled`            |
-| "Was consensus outcome normal?"     | `consensus.accept`          | `consensus_state`              |
-| "Did a validator bow out?"          | `consensus.proposal.send`   | `is_bow_out`                   |
-| "Which ledger was validated?"       | `consensus.validation.send` | `ledger_hash`                  |
+| Question                            | Span                        | Key Attributes                           |
+| ----------------------------------- | --------------------------- | ---------------------------------------- |
+| "Which tx type is slowest?"         | `tx.process`                | `span.tx_type` + duration                |
+| "Why was my tx rejected?"           | `tx.process`                | `span.ter_result`, `span.applied`        |
+| "What AMM operations happened?"     | `tx.process`                | `span.tx_type =~ "AMM.*"`                |
+| "What DEX offers failed?"           | `tx.process`                | `span.tx_type`, `span.ter_result`        |
+| "What NFT activity occurred?"       | `tx.process`, `txq.enqueue` | `span.tx_type =~ "NFToken.*"`            |
+| "Is the TxQ backing up?"            | `txq.accept`                | `span.queue_size`, `span.ledger_changed` |
+| "Why was my tx dropped from queue?" | `txq.accept_tx`             | `span.txq_status`, `span.ter_code`       |
+| "Are batch requests a problem?"     | `rpc.process`               | `span.is_batch`, `span.batch_size`       |
+| "Which RPC is expensive?"           | `rpc.command.*`             | `span.load_type`, duration               |
+| "Did consensus reach threshold?"    | `consensus.check`           | `span.consensus_result`                  |
+| "Was consensus outcome normal?"     | `consensus.accept`          | `span.consensus_state`                   |
+| "Did a validator bow out?"          | `consensus.proposal.send`   | `span.is_bow_out`                        |
+| "Which ledger was validated?"       | `consensus.validation.send` | `span.ledger_hash`                       |
+| "Did close time agreement fail?"    | `consensus.accept.apply`    | `span.close_time_correct`                |

 ---

@@ -349,20 +479,20 @@ all its normal attributes, it just lacks a cross-node parent link.
 ### Example Tempo Queries

 ```
-# Find cross-node transaction traces (tx.process -> tx.receive across nodes)
-{name="tx.receive"} && status != error
+# Find cross-node transaction traces (tx.receive spans with no errors)
+{name="tx.receive" && status != error}

 # Find proposals received with cross-node parent context
-{name="consensus.proposal.receive"} && nestedSetParent > 0
+{name="consensus.proposal.receive"}

 # Trace a transaction across the network by its hash
-{name=~"tx\\..*"} | tx_hash = "<hash>"
+{name =~ "tx.*" && span.tx_hash = "<hash>"}

 # Find all spans in a cross-node consensus trace
-{rootServiceName="xrpld"} | consensus_round_id = <round_id>
+{resource.service.name="xrpld" && span.consensus_round_id = "<round_id>"}

 # Compare latency between sender and receiver for validations
-{name="consensus.validation.send" || name="consensus.validation.receive"}
+{name = "consensus.validation.send" || name = "consensus.validation.receive"}
 ```

 ## Prometheus Metrics (Spanmetrics)
@@ -672,21 +802,26 @@ Log files are ingested by the OTel Collector's `filelog` receiver, which tails `

 ### LogQL Query Examples

+The OTel Collector emits logs to Loki with `service_name="xrpld"` (not `job="xrpld"`).
+
 ```logql
 # Find all logs for a specific trace
-{job="xrpld"} |= "trace_id=abc123def456789012345678abcdef01"
+{service_name="xrpld"} |= "trace_id=abc123def456789012345678abcdef01"

 # Error logs with trace context (log lines with ERR severity that have a trace_id)
-{job="xrpld"} |= "ERR" |= "trace_id="
+{service_name="xrpld"} |= "ERR" |= "trace_id="

 # All logs from a specific partition that were emitted during a span
-{job="xrpld"} |= "LedgerMaster" | regexp `trace_id=(?P<trace_id>[a-f0-9]+)` | trace_id != ""
+{service_name="xrpld"} |= "LedgerMaster" | regexp `trace_id=(?P<trace_id>[a-f0-9]+)` | trace_id != ""
+
+# Logs from a specific subsystem during a span (e.g. LedgerConsensus)
+{service_name="xrpld"} |= "LedgerConsensus" |= "trace_id="

 # Logs from the last hour containing trace context
-{job="xrpld"} |= "trace_id=" | regexp `(?P<partition>\S+):(?P<sev>\S+)\s+trace_id=(?P<tid>[a-f0-9]+)`
+{service_name="xrpld"} |= "trace_id=" | regexp `(?P<partition>\S+):(?P<sev>\S+)\s+trace_id=(?P<tid>[a-f0-9]+)`

 # Count of traced vs untraced log lines
-count_over_time({job="xrpld"} |= "trace_id=" [5m])
+count_over_time({service_name="xrpld"} |= "trace_id=" [5m])
 ```

 ### Verifying Log Correlation
@@ -694,7 +829,7 @@ count_over_time({job="xrpld"} |= "trace_id=" [5m])
 1. Start the observability stack and xrpld with telemetry enabled.
 2. Send an RPC request: `curl http://localhost:5005 -d '{"method":"server_info"}'`
 3. Check the debug.log for `trace_id=` entries: `grep trace_id= /path/to/debug.log`
-4. Open Grafana at http://localhost:3000 -> Explore -> Loki and search for `{job="xrpld"} |= "trace_id="`.
+4. Open Grafana at http://localhost:3000 -> Explore -> Loki and search for `{service_name="xrpld"} |= "trace_id="`.
 5. Click the TraceID link to navigate to the corresponding trace in Tempo.

 ## Troubleshooting
--- a/include/xrpl/tx/detail/TxApplySpanNames.h
+++ b/include/xrpl/tx/detail/TxApplySpanNames.h
@@ -0,0 +1,109 @@
+#pragma once
+
+/** Compile-time span name constants for the transaction apply pipeline.
+ *
+ *  Defines the span names and attribute keys used by the three apply-pipeline
+ *  stages — preflight, preclaim, and transactor (apply) — that run inside the
+ *  library (`src/libxrpl/tx/`). Built on the StaticStr/join() primitives from
+ *  <xrpl/telemetry/SpanNames.h>.
+ *
+ *  Why a separate header from TxSpanNames.h:
+ *  TxSpanNames.h lives under src/xrpld/ (daemon) and serves the overlay/app
+ *  lifecycle spans (tx.receive, tx.process). Library code (applySteps.cpp,
+ *  Transactor.cpp) must not depend on daemon headers, so the apply-pipeline
+ *  constants live here instead. The attribute strings ("tx_type",
+ *  "ter_result", "applied") intentionally match TxSpanNames.h so the collector
+ *  spanmetrics connector aggregates both sets under the same dimensions.
+ *
+ *  Span hierarchy (deterministic trace_id derived from txID[0:16]):
+ *
+ *  The three stages run sequentially and often on different threads, so they
+ *  do not auto-parent. Each uses a hash-derived trace_id keyed on the same
+ *  transaction id, placing all three under one trace without context
+ *  propagation. A transaction that hard-fails preflight or preclaim never
+ *  reaches the transactor span — the stage attribute identifies where it
+ *  stopped.
+ *
+ *    +-----------------------------------------------------------+
+ *    | trace_id = txID[0:16]                                     |
+ *    |                                                           |
+ *    |  +-------------------+   +------------------+   +-------+ |
+ *    |  | tx.preflight      |   | tx.preclaim      |   | tx.   | |
+ *    |  | stage=preflight   |-->| stage=preclaim   |-->| trans | |
+ *    |  | tx_type           |   | tx_type          |   | actor | |
+ *    |  | ter_result        |   | ter_result       |   | stage=| |
+ *    |  +-------------------+   +------------------+   | apply | |
+ *    |   stateless checks       ledger-aware checks    +-------+ |
+ *    |   (signature, fields)    (sequence, fee)        applies   |
+ *    +-----------------------------------------------------------+
+ *
+ *  Usage:
+ *  @code
+ *      #include <xrpl/tx/detail/TxApplySpanNames.h>
+ *      using namespace telemetry;
+ *
+ *      // preflight() / preclaim() use hashSpan with a full span name:
+ *      auto span = SpanGuard::hashSpan(
+ *          TraceCategory::Transactions, tx_apply_span::preflight,
+ *          txID.data(), txID.kBytes);
+ *      span.setAttribute(tx_apply_span::attr::stage, tx_apply_span::val::preflight);
+ *      span.setAttribute(tx_apply_span::attr::terResult, transToken(ter).c_str());
+ *  @endcode
+ *
+ *  @code
+ *      // Transactor::operator() uses span() with prefix + suffix:
+ *      auto span = SpanGuard::span(
+ *          TraceCategory::Transactions, seg::tx, tx_apply_span::op::transactor);
+ *      span.setAttribute(tx_apply_span::attr::stage, tx_apply_span::val::apply);
+ *  @endcode
+ */
+
+#include <xrpl/telemetry/SpanNames.h>
+
+namespace xrpl::telemetry::tx_apply_span {
+
+// ===== Span operation suffixes =============================================
+
+namespace op {
+/// "preflight" — stateless transaction checks (suffix form).
+inline constexpr auto preflight = makeStr("preflight");
+/// "preclaim" — ledger-aware checks before fee claim (suffix form).
+inline constexpr auto preclaim = makeStr("preclaim");
+/// "transactor" — the apply stage (suffix form, used with span()).
+inline constexpr auto transactor = makeStr("transactor");
+}  // namespace op
+
+// ===== Full span names (tx.<op>) ===========================================
+
+/// "tx.preflight" — full name for hashSpan() at the preflight stage.
+inline constexpr auto preflight = join(seg::tx, op::preflight);
+/// "tx.preclaim" — full name for hashSpan() at the preclaim stage.
+inline constexpr auto preclaim = join(seg::tx, op::preclaim);
+
+// ===== Attribute keys ======================================================
+
+namespace attr {
+/// "stage" — which apply-pipeline stage this span represents. Drives the
+/// collector spanmetrics `stage` dimension for per-stage RED metrics.
+inline constexpr auto stage = makeStr("stage");
+/// "tx_type" — transaction type name (e.g., "Payment", "OfferCreate").
+/// Matches tx_span::attr::txType so both share the spanmetrics dimension.
+inline constexpr auto txType = makeStr("tx_type");
+/// "ter_result" — engine result code after the stage (e.g., "tesSUCCESS").
+inline constexpr auto terResult = makeStr("ter_result");
+/// "applied" — whether the transaction was applied to the ledger (apply only).
+inline constexpr auto applied = makeStr("applied");
+}  // namespace attr
+
+// ===== Attribute values (stage names) ======================================
+
+namespace val {
+/// "preflight" — value of the stage attribute on tx.preflight.
+inline constexpr auto preflight = makeStr("preflight");
+/// "preclaim" — value of the stage attribute on tx.preclaim.
+inline constexpr auto preclaim = makeStr("preclaim");
+/// "apply" — value of the stage attribute on tx.transactor.
+inline constexpr auto apply = makeStr("apply");
+}  // namespace val
+
+}  // namespace xrpl::telemetry::tx_apply_span
--- a/src/libxrpl/tx/Transactor.cpp
+++ b/src/libxrpl/tx/Transactor.cpp
@@ -44,6 +44,7 @@
 #include <xrpl/tx/SignerEntries.h>
 #include <xrpl/tx/apply.h>
 #include <xrpl/tx/applySteps.h>
+#include <xrpl/tx/detail/TxApplySpanNames.h>

 #include <cstddef>
 #include <cstdint>
@@ -1199,9 +1200,11 @@ Transactor::operator()()
    auto span = telemetry::SpanGuard::span(
        telemetry::TraceCategory::Transactions,
        telemetry::seg::tx,
-        telemetry::makeStr("transactor"));
+        telemetry::tx_apply_span::op::transactor);
+    // "apply" — the third apply-pipeline stage, after preflight and preclaim.
+    span.setAttribute(telemetry::tx_apply_span::attr::stage, telemetry::tx_apply_span::val::apply);
    if (auto const* fmt = TxFormats::getInstance().findByType(ctx_.tx.getTxnType()))
-        span.setAttribute("tx_type", fmt->getName().c_str());
+        span.setAttribute(telemetry::tx_apply_span::attr::txType, fmt->getName().c_str());

    JLOG(j_.trace()) << "apply: " << ctx_.tx.getTransactionID();

@@ -1429,8 +1432,8 @@ Transactor::operator()()

    JLOG(j_.trace()) << (applied ? "applied " : "not applied ") << transToken(result);

-    span.setAttribute("ter_result", transToken(result).c_str());
-    span.setAttribute("applied", applied);
+    span.setAttribute(telemetry::tx_apply_span::attr::terResult, transToken(result).c_str());
+    span.setAttribute(telemetry::tx_apply_span::attr::applied, applied);

    return {result, applied, metadata};
 }
--- a/src/libxrpl/tx/applySteps.cpp
+++ b/src/libxrpl/tx/applySteps.cpp
@@ -13,13 +13,16 @@
 #include <xrpl/protocol/SeqProxy.h>
 #include <xrpl/protocol/TER.h>
 #include <xrpl/protocol/XRPAmount.h>
+#include <xrpl/telemetry/SpanGuard.h>
 #include <xrpl/tx/ApplyContext.h>
 #include <xrpl/tx/Transactor.h>
+#include <xrpl/tx/detail/TxApplySpanNames.h>

 #include <cstdint>
 #include <exception>
 #include <memory>
 #include <optional>
+#include <string_view>
 #include <utility>
 #pragma push_macro("TRANSACTION")
 #undef TRANSACTION
@@ -51,6 +54,47 @@ struct UnknownTxnType : std::exception
    }
 };

+/** Look up the human-readable transaction type name for span attributes.
+ *  Returns nullptr if the type is unknown so the caller can skip the
+ *  attribute rather than emit an empty value.
+ */
+char const*
+txTypeName(TxType txnType)
+{
+    if (auto const* fmt = TxFormats::getInstance().findByType(txnType))
+        return fmt->getName().c_str();
+    return nullptr;
+}
+
+/** Create a deterministic-trace span for an apply-pipeline stage.
+ *
+ *  The trace_id is derived from txID[0:16] so the preflight, preclaim, and
+ *  transactor spans of one transaction share a trace even though they run
+ *  sequentially and often on different threads. Sets the stage and tx_type
+ *  attributes; the caller adds ter_result after the stage runs. Together they
+ *  drive the collector spanmetrics dimensions. A no-op when telemetry is disabled.
+ *
+ *  @param name   Full span name (tx_apply_span::preflight / ::preclaim).
+ *  @param stage  Stage attribute value (tx_apply_span::val::*).
+ *  @param tx     The transaction supplying the id and type.
+ */
+[[nodiscard]] telemetry::SpanGuard
+makeStageSpan(std::string_view name, std::string_view stage, STTx const& tx)
+{
+    auto const txID = tx.getTransactionID();
+    auto span = telemetry::SpanGuard::hashSpan(
+        telemetry::TraceCategory::Transactions, name, txID.data(), txID.kBytes);
+    // Guard the type lookup behind the active check: preflight runs for every
+    // transaction, so findByType() must not run when tracing is off/disabled.
+    if (span)
+    {
+        span.setAttribute(telemetry::tx_apply_span::attr::stage, stage);
+        if (char const* typeName = txTypeName(tx.getTxnType()))
+            span.setAttribute(telemetry::tx_apply_span::attr::txType, typeName);
+    }
+    return span;
+}
+
 // Call a lambda with the concrete transaction type as a template parameter
 // throw an "UnknownTxnType" exception on error
 template <class F>
@@ -133,82 +177,122 @@ consequencesHelper(PreflightContext const& ctx)
 static std::pair<NotTEC, TxConsequences>
 invokePreflight(PreflightContext const& ctx)
 {
+    // Trace the preflight stage. The span shares the transaction's
+    // deterministic trace_id so it correlates with preclaim and transactor.
+    auto span = makeStageSpan(
+        telemetry::tx_apply_span::preflight, telemetry::tx_apply_span::val::preflight, ctx.tx);
    try
    {
-        return withTxnType(ctx.rules, ctx.tx.getTxnType(), [&]<typename T>() {
+        auto result = withTxnType(ctx.rules, ctx.tx.getTxnType(), [&]<typename T>() {
            auto const tec = Transactor::invokePreflight<T>(ctx);
            return std::make_pair(
                tec, isTesSuccess(tec) ? consequencesHelper<T>(ctx) : TxConsequences{tec});
        });
+        if (span)
+        {
+            span.setAttribute(
+                telemetry::tx_apply_span::attr::terResult, transToken(result.first).c_str());
+        }
+        return result;
    }
    catch (UnknownTxnType const& e)
    {
        // Should never happen
        // LCOV_EXCL_START
        JLOG(ctx.j.fatal()) << "Unknown transaction type in preflight: " << e.txnType;
+        span.recordException(e);
        UNREACHABLE("xrpl::invokePreflight : unknown transaction type");
        return {temUNKNOWN, TxConsequences{temUNKNOWN}};
        // LCOV_EXCL_STOP
    }
+    catch (std::exception const& e)
+    {
+        // The caller's preflight() maps this to tefEXCEPTION. Record it on the
+        // span before unwinding so per-stage error counts include exceptions.
+        span.setAttribute(
+            telemetry::tx_apply_span::attr::terResult, transToken(tefEXCEPTION).c_str());
+        span.recordException(e);
+        throw;
+    }
 }

 static TER
 invokePreclaim(PreclaimContext const& ctx)
 {
+    // Trace the preclaim stage under the transaction's deterministic trace_id.
+    auto span = makeStageSpan(
+        telemetry::tx_apply_span::preclaim, telemetry::tx_apply_span::val::preclaim, ctx.tx);
    try
    {
        // use name hiding to accomplish compile-time polymorphism of static
        // class functions for Transactor and derived classes.
-        return withTxnType(ctx.view.rules(), ctx.tx.getTxnType(), [&]<typename T>() -> TER {
-            // preclaim functionality is divided into two sections:
-            // 1. Up to and including the signature check: returns NotTEC.
-            //    All transaction checks before and including checkSign
-            //    MUST return NotTEC, or something more restrictive.
-            //    Allowing tec results in these steps risks theft or
-            //    destruction of funds, as a fee will be charged before the
-            //    signature is checked.
-            // 2. After the signature check: returns TER.
+        TER const preclaimTer =
+            withTxnType(ctx.view.rules(), ctx.tx.getTxnType(), [&]<typename T>() -> TER {
+                // preclaim functionality is divided into two sections:
+                // 1. Up to and including the signature check: returns NotTEC.
+                //    All transaction checks before and including checkSign
+                //    MUST return NotTEC, or something more restrictive.
+                //    Allowing tec results in these steps risks theft or
+                //    destruction of funds, as a fee will be charged before the
+                //    signature is checked.
+                // 2. After the signature check: returns TER.

-            // If the transactor requires a valid account and the
-            // transaction doesn't list one, preflight will have already
-            // a flagged a failure.
-            auto const id = ctx.tx.getAccountID(sfAccount);
+                // If the transactor requires a valid account and the
+                // transaction doesn't list one, preflight will have already
+                // a flagged a failure.
+                auto const id = ctx.tx.getAccountID(sfAccount);

-            if (id != beast::kZero)
-            {
-                if (NotTEC const preSigResult = [&]() -> NotTEC {
-                        if (NotTEC const result = T::checkSeqProxy(ctx.view, ctx.tx, ctx.j))
-                            return result;
+                if (id != beast::kZero)
+                {
+                    if (NotTEC const preSigResult = [&]() -> NotTEC {
+                            if (NotTEC const result = T::checkSeqProxy(ctx.view, ctx.tx, ctx.j))
+                                return result;

-                        if (NotTEC const result = T::checkPriorTxAndLastLedger(ctx))
-                            return result;
+                            if (NotTEC const result = T::checkPriorTxAndLastLedger(ctx))
+                                return result;

-                        if (NotTEC const result = T::checkPermission(ctx.view, ctx.tx))
-                            return result;
+                            if (NotTEC const result = T::checkPermission(ctx.view, ctx.tx))
+                                return result;

-                        if (NotTEC const result = T::checkSign(ctx))
-                            return result;
+                            if (NotTEC const result = T::checkSign(ctx))
+                                return result;

-                        return tesSUCCESS;
-                    }())
-                    return preSigResult;
+                            return tesSUCCESS;
+                        }())
+                        return preSigResult;

-                if (TER const result = T::checkFee(ctx, calculateBaseFee(ctx.view, ctx.tx)))
-                    return result;
-            }
+                    if (TER const result = T::checkFee(ctx, calculateBaseFee(ctx.view, ctx.tx)))
+                        return result;
+                }

-            return T::preclaim(ctx);
-        });
+                return T::preclaim(ctx);
+            });
+        if (span)
+        {
+            span.setAttribute(
+                telemetry::tx_apply_span::attr::terResult, transToken(preclaimTer).c_str());
+        }
+        return preclaimTer;
    }
    catch (UnknownTxnType const& e)
    {
        // Should never happen
        // LCOV_EXCL_START
        JLOG(ctx.j.fatal()) << "Unknown transaction type in preclaim: " << e.txnType;
+        span.recordException(e);
        UNREACHABLE("xrpl::invokePreclaim : unknown transaction type");
        return temUNKNOWN;
        // LCOV_EXCL_STOP
    }
+    catch (std::exception const& e)
+    {
+        // The caller's preclaim() maps this to tefEXCEPTION. Record it on the
+        // span before unwinding so per-stage error counts include exceptions.
+        span.setAttribute(
+            telemetry::tx_apply_span::attr::terResult, transToken(tefEXCEPTION).c_str());
+        span.recordException(e);
+        throw;
+    }
 }

 /**
--- a/src/tests/libxrpl/telemetry/TxApplySpanNames.cpp
+++ b/src/tests/libxrpl/telemetry/TxApplySpanNames.cpp
@@ -0,0 +1,52 @@
+#include <xrpl/tx/detail/TxApplySpanNames.h>
+
+#include <gtest/gtest.h>
+
+#include <string_view>
+
+/** Contract tests for the transaction apply-pipeline span constants.
+ *
+ *  The span names and attribute keys in TxApplySpanNames.h are a cross-component
+ *  contract: the collector spanmetrics connector aggregates on these exact
+ *  strings (dimensions tx_type, ter_result, stage) and the Grafana
+ *  transaction-overview dashboard queries them. A silent rename here would
+ *  break per-stage metrics with no compile error, so these tests pin the
+ *  literal values. They need no telemetry runtime and run in every build.
+ */
+
+using namespace xrpl::telemetry;
+
+TEST(TxApplySpanNames, span_names_are_dot_qualified)
+{
+    // Full span names feed SpanGuard::hashSpan() in applySteps.cpp.
+    EXPECT_EQ(std::string_view(tx_apply_span::preflight), "tx.preflight");
+    EXPECT_EQ(std::string_view(tx_apply_span::preclaim), "tx.preclaim");
+}
+
+TEST(TxApplySpanNames, operation_suffixes)
+{
+    // Suffix used with SpanGuard::span(cat, seg::tx, suffix) in Transactor.cpp.
+    EXPECT_EQ(std::string_view(tx_apply_span::op::preflight), "preflight");
+    EXPECT_EQ(std::string_view(tx_apply_span::op::preclaim), "preclaim");
+    EXPECT_EQ(std::string_view(tx_apply_span::op::transactor), "transactor");
+}
+
+TEST(TxApplySpanNames, attribute_keys_match_collector_dimensions)
+{
+    // These keys MUST match docker/telemetry/otel-collector-config.yaml
+    // spanmetrics dimensions and TxSpanNames.h (so both span sets aggregate
+    // under one dimension).
+    EXPECT_EQ(std::string_view(tx_apply_span::attr::stage), "stage");
+    EXPECT_EQ(std::string_view(tx_apply_span::attr::txType), "tx_type");
+    EXPECT_EQ(std::string_view(tx_apply_span::attr::terResult), "ter_result");
+    EXPECT_EQ(std::string_view(tx_apply_span::attr::applied), "applied");
+}
+
+TEST(TxApplySpanNames, stage_values_are_the_three_pipeline_stages)
+{
+    // The stage attribute carries exactly these three values; they become the
+    // spanmetrics `stage` dimension cardinality (3) and the dashboard filter.
+    EXPECT_EQ(std::string_view(tx_apply_span::val::preflight), "preflight");
+    EXPECT_EQ(std::string_view(tx_apply_span::val::preclaim), "preclaim");
+    EXPECT_EQ(std::string_view(tx_apply_span::val::apply), "apply");
+}
--- a/src/xrpld/app/ledger/LedgerHistory.cpp
+++ b/src/xrpld/app/ledger/LedgerHistory.cpp
@@ -27,6 +27,7 @@
 #include <memory>
 #include <mutex>
 #include <optional>
+#include <string_view>
 #include <utility>
 #include <vector>

--- a/src/xrpld/app/misc/detail/TxQ.cpp
+++ b/src/xrpld/app/misc/detail/TxQ.cpp
@@ -608,7 +608,8 @@ TxQ::tryClearAccountQueueUpThruTx(
    if (txResult.applied)
    {
        // All of the queued transactions applied, so remove them from the
-        // queue.
+        // queue.  `dist` queued txs preceded the current one in the batch.
+        span.setAttribute(txq_span::attr::numCleared, static_cast<std::int64_t>(dist));
        endTxIter = erase(accountIter->second, beginTxIter, endTxIter);
        // If `tx` is replacing a queued tx, delete that one, too.
        if (endTxIter != accountIter->second.transactions.end() && endTxIter->first == tSeqProx)
@@ -745,6 +746,9 @@ TxQ::apply(
    span.setAttribute(txq_span::attr::txHash, to_string(tx->getTransactionID()).c_str());
    if (auto const* fmt = TxFormats::getInstance().findByType(tx->getTxnType()))
        span.setAttribute(txq_span::attr::txType, fmt->getName().c_str());
+    // Default outcome; overridden below on the direct-apply, applied, and queued
+    // paths. Every other early return leaves the tx rejected from the queue.
+    span.setAttribute(txq_span::attr::txqStatus, txq_span::val::rejected);

    NumberSO const stNumberSO{view.rules().enabled(fixUniversalNumber)};

@@ -758,7 +762,10 @@ TxQ::apply(
    // See if the transaction paid a high enough fee that it can go straight
    // into the ledger.
    if (auto directApplied = tryDirectApply(app, view, tx, flags, j))
+    {
+        span.setAttribute(txq_span::attr::txqStatus, txq_span::val::appliedDirect);
        return *directApplied;
+    }

    if ((flags & TapDryRun) != 0u)
        return {telCAN_NOT_QUEUE, false};
@@ -885,6 +892,10 @@ TxQ::apply(
    auto const metricsSnapshot = feeMetrics_.getSnapshot();
    auto const feeLevelPaid = getFeeLevelPaid(view, *tx);
    auto const requiredFeeLevel = getRequiredFeeLevel(view, flags, metricsSnapshot, lock);
+    span.setAttribute(
+        txq_span::attr::feeLevelPaid, static_cast<std::int64_t>(feeLevelPaid.value()));
+    span.setAttribute(
+        txq_span::attr::requiredFeeLevel, static_cast<std::int64_t>(requiredFeeLevel.value()));

    // Is there a blocker already in the account's queue?  If so, don't
    // allow additional transactions in the queue.
@@ -1218,6 +1229,7 @@ TxQ::apply(
            /* Can't erase (*replacedTxIter) here because success
                implies that it has already been deleted.
            */
+            span.setAttribute(txq_span::attr::txqStatus, txq_span::val::applied);
            return result;
        }
    }
@@ -1337,6 +1349,7 @@ TxQ::apply(
                     << " to queue."
                     << " Flags: " << flags;

+    span.setAttribute(txq_span::attr::txqStatus, txq_span::val::queued);
    return {terQUEUED, false};
 }

@@ -1372,6 +1385,7 @@ TxQ::processClosedLedger(Application& app, ReadView const& view, bool timeLeap)

    // Remove any queued candidates whose LastLedgerSequence has gone by.
    auto* const metrics = app.getMetricsRegistry();
+    std::int64_t expiredCount = 0;
    for (auto candidateIter = byFee_.begin(); candidateIter != byFee_.end();)
    {
        if (candidateIter->lastValid && *candidateIter->lastValid <= ledgerSeq)
@@ -1382,12 +1396,14 @@ TxQ::processClosedLedger(Application& app, ReadView const& view, bool timeLeap)
            // escalating fee and were never included before expiry.
            if (metrics != nullptr)
                metrics->incrementTxqExpired();
+            ++expiredCount;
        }
        else
        {
            ++candidateIter;
        }
    }
+    span.setAttribute(txq_span::attr::expiredCount, expiredCount);

    // Remove any TxQAccounts that don't have candidates
    // under them
--- a/src/xrpld/app/misc/detail/TxQSpanNames.h
+++ b/src/xrpld/app/misc/detail/TxQSpanNames.h
@@ -15,12 +15,14 @@
 *    |  +--------------------------------------------------+ |
 *    |  | txq.enqueue                                      | |
 *    |  | TxQ::apply()                                     | |
- *    |  | attrs: tx_hash, status, fee_level                | |
+ *    |  | attrs: tx_hash, tx_type, txq_status,             | |
+ *    |  |        fee_level_paid, required_fee_level        | |
 *    |  |                                                  | |
 *    |  |  +-------------------+ +----------------------+  | |
 *    |  |  | txq.apply_direct  | | txq.batch_clear      |  | |
 *    |  |  | tryDirectApply()  | | tryClearAccount...() |  | |
- *    |  |  +-------------------+ +----------------------+  | |
+ *    |  |  +-------------------+ | attrs: num_cleared   |  | |
+ *    |  |                        +----------------------+  | |
 *    |  +--------------------------------------------------+ |
 *    +-------------------------------------------------------+
 *