diff --git a/.github/scripts/levelization/results/ordering.txt b/.github/scripts/levelization/results/ordering.txt index 8111c48bf5..f7019a422f 100644 --- a/.github/scripts/levelization/results/ordering.txt +++ b/.github/scripts/levelization/results/ordering.txt @@ -242,6 +242,7 @@ xrpl.tx > xrpl.basics xrpl.tx > xrpl.core xrpl.tx > xrpl.ledger xrpl.tx > xrpl.protocol +xrpl.tx > xrpl.telemetry xrpld.app > test.unit_test xrpld.app > xrpl.basics xrpld.app > xrpl.core diff --git a/OpenTelemetryPlan/09-data-collection-reference.md b/OpenTelemetryPlan/09-data-collection-reference.md index 75444cc9e1..fc59db1024 100644 --- a/OpenTelemetryPlan/09-data-collection-reference.md +++ b/OpenTelemetryPlan/09-data-collection-reference.md @@ -101,13 +101,23 @@ Controlled by `trace_rpc=1` in `[telemetry]` config. Controlled by `trace_transactions=1` in `[telemetry]` config. -| Span Name | Parent | Source File | Description | -| ------------ | -------------- | --------------- | ----------------------------------------------------------------- | -| `tx.process` | — | NetworkOPs.cpp | Transaction submission entry point (local or peer-relayed) | -| `tx.receive` | — | PeerImp.cpp | Raw transaction received from peer overlay (before deduplication) | -| `tx.apply` | `ledger.build` | BuildLedger.cpp | Transaction set applied to new ledger during consensus | +| Span Name | Parent | Source File | Description | +| --------------- | -------------- | --------------- | ----------------------------------------------------------------- | +| `tx.process` | — | NetworkOPs.cpp | Transaction submission entry point (local or peer-relayed) | +| `tx.receive` | — | PeerImp.cpp | Raw transaction received from peer overlay (before deduplication) | +| `tx.apply` | `ledger.build` | BuildLedger.cpp | Transaction set applied to new ledger during consensus | +| `tx.preflight` | — | applySteps.cpp | Stateless checks stage (`stage=preflight`) | +| `tx.preclaim` | — | applySteps.cpp | Ledger-aware checks stage before fee claim (`stage=preclaim`) | +| `tx.transactor` | — | Transactor.cpp | Apply stage — the transactor runs (`stage=apply`) | + +The three apply-pipeline spans share a deterministic `trace_id` derived from +`txID[0:16]`, so preflight, preclaim, and transactor for one transaction group +under a single trace even though they run sequentially and often on different +threads. A transaction that hard-fails preflight or preclaim never reaches the +later spans — the `stage` attribute identifies where it stopped. **Where to find**: Tempo → TraceQL: `{resource.service.name="xrpld" && name=~"tx.process|tx.receive"}` +or, for the apply pipeline: `{resource.service.name="xrpld" && name=~"tx.preflight|tx.preclaim|tx.transactor"}` **Grafana dashboard**: _Transaction Overview_ (`xrpld-transactions`) @@ -179,13 +189,19 @@ Every span can carry key-value attributes that provide context for filtering and #### Transaction Attributes -| Attribute | Type | Set On | Description | -| -------------------- | ------- | -------------------------- | ---------------------------------------------------- | -| `xrpl.tx.hash` | string | `tx.process`, `tx.receive` | Transaction hash (hex-encoded) | -| `xrpl.tx.local` | boolean | `tx.process` | `true` if locally submitted, `false` if peer-relayed | -| `xrpl.tx.path` | string | `tx.process` | Submission path: `"sync"` or `"async"` | -| `xrpl.tx.suppressed` | boolean | `tx.receive` | `true` if transaction was suppressed (duplicate) | -| `xrpl.tx.status` | string | `tx.receive` | Transaction status (e.g., `"known_bad"`) | +| Attribute | Type | Set On | Description | +| ------------------- | ------- | ---------------------------------------------- | --------------------------------------------------------------------- | +| `xrpl.tx.hash` | string | `tx.process`, `tx.receive` | Transaction hash (hex-encoded) | +| `local` | boolean | `tx.process` | `true` if locally submitted, `false` if peer-relayed | +| `path` | string | `tx.process` | Submission path: `"sync"` or `"async"` | +| `suppressed` | boolean | `tx.receive` | `true` if transaction was suppressed (duplicate) | +| `tx_status` | string | `tx.receive` | Transaction status (e.g., `"known_bad"`) | +| `xrpl.peer.id` | int64 | `tx.receive` | Peer identifier (also set on peer spans) | +| `xrpl.peer.version` | string | `tx.receive` | Peer protocol version string | +| `stage` | string | `tx.preflight`, `tx.preclaim`, `tx.transactor` | Apply-pipeline stage: `preflight`, `preclaim`, or `apply` | +| `tx_type` | string | `tx.preflight`, `tx.preclaim`, `tx.transactor` | Transaction type name (e.g., `Payment`) | +| `ter_result` | string | `tx.preflight`, `tx.preclaim`, `tx.transactor` | Engine result token for that stage (e.g., `tesSUCCESS`, `terPRE_SEQ`) | +| `applied` | boolean | `tx.transactor` | `true` if the transaction was applied to the ledger | **Tempo query**: `{span.xrpl.tx.hash=""}` to trace a specific transaction across nodes. @@ -250,14 +266,25 @@ The OTel Collector's SpanMetrics connector automatically generates RED (Rate, Er **Additional dimension labels** (configured in `otel-collector-config.yaml`): -| Span Attribute | Prometheus Label | Applies To | -| ------------------------------ | ------------------------------ | ------------------------- | -| `command` | `xrpl_rpc_command` | `rpc.command.*` | -| `rpc_status` | `xrpl_rpc_status` | `rpc.command.*` | -| `xrpl.consensus.mode` | `xrpl_consensus_mode` | `consensus.ledger_close` | -| `xrpl.tx.local` | `xrpl_tx_local` | `tx.process` | -| `xrpl.peer.proposal.trusted` | `xrpl_peer_proposal_trusted` | `peer.proposal.receive` | -| `xrpl.peer.validation.trusted` | `xrpl_peer_validation_trusted` | `peer.validation.receive` | +| Span Attribute | Prometheus Label | Applies To | +| --------------------- | ------------------------------ | ---------------------------------------------- | +| `command` | `xrpl_rpc_command` | `rpc.command.*` | +| `rpc_status` | `xrpl_rpc_status` | `rpc.command.*` | +| `xrpl.consensus.mode` | `xrpl_consensus_mode` | `consensus.ledger_close` | +| `local` | `xrpl_tx_local` | `tx.process` | +| `proposal_trusted` | `xrpl_peer_proposal_trusted` | `peer.proposal.receive` | +| `validation_trusted` | `xrpl_peer_validation_trusted` | `peer.validation.receive` | +| `stage` | `stage` | `tx.preflight`, `tx.preclaim`, `tx.transactor` | + +The `stage` dimension (3 values: `preflight`, `preclaim`, `apply`) turns the +apply-pipeline spans into per-stage RED metrics with no native instruments — the +_Transaction Overview_ dashboard charts rate, p95 latency, and failure rate by stage. + +> **Sampling caveat**: span-derived metrics inherit the **tracer head-sampling** +> ratio (`sampling_ratio` in `[telemetry]`, via `TraceIdRatioBasedSampler`). At +> `sampling_ratio < 1.0` the stage RED metrics undercount proportionally — they +> reflect sampled traces, not the full transaction volume. Native StatsD/meter +> metrics do not sample. Account for this when reading absolute stage rates. **Where to query**: Prometheus → `traces_span_metrics_calls_total{span_name="rpc.command.server_info"}` diff --git a/OpenTelemetryPlan/Phase3_taskList.md b/OpenTelemetryPlan/Phase3_taskList.md index 1d7d181f0e..fbfeda5f0f 100644 --- a/OpenTelemetryPlan/Phase3_taskList.md +++ b/OpenTelemetryPlan/Phase3_taskList.md @@ -474,17 +474,22 @@ This gives the best of both worlds: guaranteed cross-node correlation via determ **Attributes added**: -| Span | Attribute | Type | Source | -| --------------- | ---------------- | ------ | ------------------------------------------------------------------- | -| `tx.process` | `tx_type` | string | `TxFormats::getInstance().findByType(stx->getTxnType())->getName()` | -| `tx.process` | `fee` | int64 | `stx->getFieldAmount(sfFee).xrp().drops()` | -| `tx.process` | `sequence` | int64 | `stx->getSeqProxy().value()` | -| `tx.process` | `ter_result` | string | `transToken(e.result)` (set after batch application) | -| `tx.process` | `applied` | bool | `e.applied` (set after batch application) | -| `tx.receive` | `tx_type` | string | `TxFormats::getInstance().findByType(stx->getTxnType())->getName()` | -| `txq.enqueue` | `tx_type` | string | same pattern as above | -| `txq.accept.tx` | `txq_status` | string | `applied` / `failed` / `retried` | -| `txq.accept` | `ledger_changed` | bool | set at end of accept loop | +| Span | Attribute | Type | Source | +| ----------------- | -------------------- | ------ | ------------------------------------------------------------------- | +| `tx.process` | `tx_type` | string | `TxFormats::getInstance().findByType(stx->getTxnType())->getName()` | +| `tx.process` | `fee` | int64 | `stx->getFieldAmount(sfFee).xrp().drops()` | +| `tx.process` | `sequence` | int64 | `stx->getSeqProxy().value()` | +| `tx.process` | `ter_result` | string | `transToken(e.result)` (set after batch application) | +| `tx.process` | `applied` | bool | `e.applied` (set after batch application) | +| `tx.receive` | `tx_type` | string | `TxFormats::getInstance().findByType(stx->getTxnType())->getName()` | +| `txq.enqueue` | `tx_type` | string | same pattern as above | +| `txq.enqueue` | `txq_status` | string | `queued` / `applied_direct` / `applied` / `rejected` | +| `txq.enqueue` | `fee_level_paid` | int64 | `getFeeLevelPaid(view, *tx).value()` | +| `txq.enqueue` | `required_fee_level` | int64 | `getRequiredFeeLevel(...).value()` | +| `txq.batch_clear` | `num_cleared` | int64 | queued txs cleared ahead of the applying tx | +| `txq.cleanup` | `expired_count` | int64 | entries dropped for passed `LastLedgerSequence` | +| `txq.accept.tx` | `txq_status` | string | `applied` / `failed` / `retried` | +| `txq.accept` | `ledger_changed` | bool | set at end of accept loop | **New attr keys**: `TxSpanNames.h` (`txType`, `fee`, `sequence`, `terResult`, `applied`), `TxQSpanNames.h` (`txType`). diff --git a/docker/telemetry/grafana/dashboards/transaction-overview.json b/docker/telemetry/grafana/dashboards/transaction-overview.json index b1c8f64bd2..3f15bb2d8f 100644 --- a/docker/telemetry/grafana/dashboards/transaction-overview.json +++ b/docker/telemetry/grafana/dashboards/transaction-overview.json @@ -648,6 +648,138 @@ }, "overrides": [] } + }, + { + "title": "Tx Apply Pipeline Rate by Stage", + "description": "Span rate for each apply-pipeline stage (preflight, preclaim, apply). A drop between stages shows where transactions are filtered out. Requires the stage dimension in spanmetrics.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 64 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + }, + "legend": { + "displayMode": "table", + "placement": "right", + "calcs": ["mean", "max"] + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum by (stage, exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=~\"tx.preflight|tx.preclaim|tx.transactor\", stage=~\"$stage\"}[5m]))", + "legendFormat": "{{stage}} [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "custom": { + "axisLabel": "Spans / Sec", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "Tx Apply Pipeline Latency by Stage (p95)", + "description": "95th-percentile duration of each apply-pipeline stage. Isolates which stage (preflight, preclaim, apply) dominates transaction processing time.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 64 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + }, + "legend": { + "displayMode": "table", + "placement": "right", + "calcs": ["mean", "max"] + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "histogram_quantile(0.95, sum by (le, stage, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=~\"tx.preflight|tx.preclaim|tx.transactor\", stage=~\"$stage\"}[5m])))", + "legendFormat": "P95 {{stage}} [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ms", + "custom": { + "axisLabel": "Duration (ms)", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "Tx Apply Pipeline Failure Rate by Stage", + "description": "Rate of apply-pipeline spans whose ter_result is not tesSUCCESS, split by stage. Shows whether failures concentrate in preflight, preclaim, or apply. Filters on ter_result rather than span status because a failing ter code completes the span normally; only thrown exceptions set an error status.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 72 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + }, + "legend": { + "displayMode": "table", + "placement": "right", + "calcs": ["mean", "max"] + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum by (stage, exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=~\"tx.preflight|tx.preclaim|tx.transactor\", stage=~\"$stage\", ter_result!~\"tesSUCCESS|\"}[5m]))", + "legendFormat": "{{stage}} [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "custom": { + "axisLabel": "Failed Spans / Sec", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } } ], "schemaVersion": 39, @@ -747,6 +879,24 @@ }, "sort": 1, "label": "Queue Status" + }, + { + "name": "stage", + "type": "query", + "datasource": { + "type": "prometheus" + }, + "query": "label_values(traces_span_metrics_calls_total{span_name=~\"tx.preflight|tx.preclaim|tx.transactor\", stage!=\"\"}, stage)", + "refresh": 2, + "includeAll": true, + "multi": true, + "allValue": ".*", + "current": { + "text": "All", + "value": "$__all" + }, + "sort": 1, + "label": "Apply Stage" } ] }, diff --git a/docker/telemetry/grafana/provisioning/datasources/tempo.yaml b/docker/telemetry/grafana/provisioning/datasources/tempo.yaml index 3c94482283..95771d12e5 100644 --- a/docker/telemetry/grafana/provisioning/datasources/tempo.yaml +++ b/docker/telemetry/grafana/provisioning/datasources/tempo.yaml @@ -3,12 +3,10 @@ # Access Grafana at http://localhost:3000, then use Explore -> Tempo # to browse xrpld traces using TraceQL. # -# Search filters provide pre-configured dropdowns in the Explore UI. -# Each phase adds filters for the span attributes it introduces. -# Phase 1b (infra): Base filters — node identity, service, span name, status. -# Phase 2 (RPC): RPC command, status, role filters. -# Phase 3 (TX): Transaction hash, local/peer origin, status. -# Phase 4 (Cons): Consensus mode, round, ledger sequence, close time. +# Search filters provide quick-start dropdowns in the Explore UI for the most +# common investigation entry points. This is not an exhaustive attribute list — +# use TraceQL autocomplete or see OpenTelemetryPlan/09-data-collection-reference.md §4 +# for the full attribute inventory and example queries. apiVersion: 1 @@ -40,177 +38,45 @@ datasources: spanEndTimeShift: "1h" search: filters: - # --- Node identification filters --- - # service.name: logical service name (default: "xrpld"). - # Useful when running multiple service types in the same collector. - - id: service-name - tag: service.name - operator: "=" - scope: resource - type: static - # service.instance.id: unique node identifier — configurable via - # the service_instance_id setting in [telemetry], defaults to the - # node's public key. E.g. "Node-1" or "nHB1X37...". + # service.instance.id: unique node identifier (public key or configured name). - id: node-id tag: service.instance.id operator: "=" scope: resource type: static - # service.version: xrpld build version (e.g., "2.4.0-b1"). - # Filter traces from specific software releases. - - id: node-version - tag: service.version - operator: "=" - scope: resource - type: dynamic - # xrpl.network.id: numeric network identifier - # (0 = mainnet, 1 = testnet, 2 = devnet, etc.). - # Derived from the [network_id] config section. - - id: network-id - tag: xrpl.network.id - operator: "=" - scope: resource - type: dynamic - # xrpl.network.type: human-readable network name derived from - # network ID ("mainnet", "testnet", "devnet", "unknown"). - - id: network-type - tag: xrpl.network.type - operator: "=" - scope: resource - type: static - # --- Span intrinsic filters --- - # name: the span operation name (e.g., "rpc.command.server_info"). - # Use to find traces for a specific RPC command or subsystem. + # name: span operation name (e.g., "rpc.command.server_info"). - id: span-name tag: name operator: "=" scope: intrinsic type: static # status: span completion status ("ok", "error", "unset"). - # Filter for failed operations to diagnose errors. - id: span-status tag: status operator: "=" scope: intrinsic type: static - # duration: span wall-clock duration. Use with ">" operator - # to find slow operations (e.g., duration > 500ms). - - id: span-duration - tag: duration - operator: ">" - scope: intrinsic - type: static - # Phase 2: RPC tracing filters + # command: RPC command name (e.g., "server_info", "submit"). - id: rpc-command tag: command operator: "=" scope: span type: static - - id: rpc-status - tag: rpc_status - operator: "=" - scope: span - type: dynamic - - id: rpc-role - tag: rpc_role - operator: "=" - scope: span - type: dynamic - # Phase 3: Transaction tracing filters + # tx_hash: transaction hash — direct lookup for a known transaction. - id: tx-hash tag: tx_hash operator: "=" scope: span type: static - - id: tx-origin - tag: local - operator: "=" - scope: span - type: dynamic - - id: tx-status - tag: tx_status - operator: "=" - scope: span - type: dynamic - # Phase 4: Consensus tracing filters - - id: consensus-mode - tag: xrpl.consensus.mode + # tx_type: transaction type (e.g., "Payment", "OfferCreate"). + - id: tx-type + tag: tx_type operator: "=" scope: span type: static - - id: consensus-round - tag: xrpl.consensus.round - operator: "=" - scope: span - type: dynamic - - id: consensus-ledger-seq - tag: xrpl.ledger.seq + # ledger_hash: ledger hash — scope all spans to a specific closed ledger. + - id: ledger-hash + tag: ledger_hash operator: "=" scope: span type: static - - id: consensus-close-time-correct - tag: close_time_correct - operator: "=" - scope: span - type: dynamic - - id: consensus-state - tag: consensus_state - operator: "=" - scope: span - type: dynamic - - id: consensus-close-resolution - tag: close_resolution_ms - operator: "=" - scope: span - type: dynamic - - id: consensus-proposers - tag: proposers - operator: "=" - scope: span - type: dynamic - - id: consensus-result - tag: consensus_result - operator: "=" - scope: span - type: dynamic - - id: consensus-mode-old - tag: mode_old - operator: "=" - scope: span - type: dynamic - - id: consensus-mode-new - tag: mode_new - operator: "=" - scope: span - type: dynamic - - id: consensus-ledger-id - tag: xrpl.consensus.ledger_id - operator: "=" - scope: span - type: static - # Phase 3/4: Additional transaction and queue filters - - id: tx-path - tag: path - operator: "=" - scope: span - type: dynamic - - id: tx-suppressed - tag: suppressed - operator: "=" - scope: span - type: dynamic - - id: peer-version - tag: peer_version - operator: "=" - scope: span - type: dynamic - - id: txq-status - tag: txq_status - operator: "=" - scope: span - type: dynamic - - id: txq-ter-code - tag: ter_code - operator: "=" - scope: span - type: dynamic diff --git a/docker/telemetry/otel-collector-config.yaml b/docker/telemetry/otel-collector-config.yaml index 988c1f3d20..dbc2cf34e8 100644 --- a/docker/telemetry/otel-collector-config.yaml +++ b/docker/telemetry/otel-collector-config.yaml @@ -94,6 +94,9 @@ connectors: - name: validation_trusted - name: tx_type - name: ter_result + # Apply-pipeline stage (preflight|preclaim|apply) — splits the + # tx.preflight/tx.preclaim/tx.transactor span RED metrics per stage. + - name: stage - name: txq_status - name: load_type - name: is_batch diff --git a/docs/telemetry-runbook.md b/docs/telemetry-runbook.md index 29b65682e6..cae4b804a7 100644 --- a/docs/telemetry-runbook.md +++ b/docs/telemetry-runbook.md @@ -75,11 +75,20 @@ All spans instrumented in xrpld, grouped by subsystem: ### Transaction Spans (Phase 3) -| Span Name | Source File | Attributes | Description | -| ------------ | --------------- | --------------------------------------------------------------------------------- | ------------------------------------- | -| `tx.process` | NetworkOPs.cpp | `tx_hash`, `local`, `path`, `tx_type`, `fee`, `sequence`, `ter_result`, `applied` | Transaction submission and processing | -| `tx.receive` | PeerImp.cpp | `peer_id`, `tx_hash`, `tx_type`, `peer_version`, `suppressed`, `tx_status` | Transaction received from peer relay | -| `tx.apply` | BuildLedger.cpp | `ledger_seq`, `tx_count`, `tx_failed` | Transaction set applied per ledger | +| Span Name | Source File | Attributes | Description | +| --------------- | --------------- | --------------------------------------------------------------------------------- | ------------------------------------- | +| `tx.process` | NetworkOPs.cpp | `tx_hash`, `local`, `path`, `tx_type`, `fee`, `sequence`, `ter_result`, `applied` | Transaction submission and processing | +| `tx.receive` | PeerImp.cpp | `peer_id`, `tx_hash`, `tx_type`, `peer_version`, `suppressed`, `tx_status` | Transaction received from peer relay | +| `tx.apply` | BuildLedger.cpp | `ledger_seq`, `tx_count`, `tx_failed` | Transaction set applied per ledger | +| `tx.preflight` | applySteps.cpp | `stage`, `tx_type`, `ter_result` | Stateless checks stage | +| `tx.preclaim` | applySteps.cpp | `stage`, `tx_type`, `ter_result` | Ledger-aware checks stage | +| `tx.transactor` | Transactor.cpp | `stage`, `tx_type`, `ter_result`, `applied` | Apply stage (transactor runs) | + +The three apply-pipeline spans (`tx.preflight`, `tx.preclaim`, `tx.transactor`) +share a deterministic `trace_id` from `txID[0:16]`, so they group under one +trace per transaction. The `stage` attribute (`preflight` / `preclaim` / +`apply`) drives the collector spanmetrics `stage` dimension, giving per-stage +RED metrics on the _Transaction Overview_ dashboard. ### Transaction Queue Spans (Phase 3) @@ -247,6 +256,43 @@ Conditions are combined with `&&`. The `|` pipeline operator is not supported on {name="tx.receive" && span.tx_type = "OfferCreate"} ``` +### Apply Pipeline by Stage + +``` +# All three stages of one transaction (preflight -> preclaim -> apply) +{name=~"tx.preflight|tx.preclaim|tx.transactor"} + +# Transactions that failed at the preclaim stage +{name="tx.preclaim"} | ter_result != "tesSUCCESS" + +# Transactions that hard-failed preflight (never reached preclaim/apply) +{name="tx.preflight"} | ter_result != "tesSUCCESS" +``` + +PromQL on the span-derived metrics (dashboard: _Transaction Overview_): + +``` +# Per-stage throughput — the funnel preflight >= preclaim >= apply +sum by (stage) (rate(traces_span_metrics_calls_total{span_name=~"tx.preflight|tx.preclaim|tx.transactor"}[5m])) + +# Per-stage p95 latency +histogram_quantile(0.95, sum by (le, stage) (rate(traces_span_metrics_duration_milliseconds_bucket{span_name=~"tx.preflight|tx.preclaim|tx.transactor"}[5m]))) + +# Per-stage failure rate (ter_result != tesSUCCESS; a failing ter completes the +# span normally, so filter on the attribute, not status_code which only flags exceptions) +sum by (stage) (rate(traces_span_metrics_calls_total{span_name=~"tx.preflight|tx.preclaim|tx.transactor", ter_result!~"tesSUCCESS|"}[5m])) +``` + +> **Alerting**: a rising `tx.preflight` / `tx.preclaim` failure rate points to +> malformed or stale-sequence submissions (often spam or a misbehaving client); +> a rising `tx.transactor` failure rate points to apply-time problems. Alert per +> stage rather than on a single aggregate so the failing stage is obvious. + +> **Sampling caveat**: these stage metrics are span-derived and inherit the +> **tracer head-sampling** ratio (`sampling_ratio`). At `sampling_ratio < 1.0` +> they undercount proportionally — treat them as relative trends, not absolute +> transaction counts. Native StatsD metrics are unsampled. + ### Transaction Queue Health ``` diff --git a/include/xrpl/tx/detail/TxApplySpanNames.h b/include/xrpl/tx/detail/TxApplySpanNames.h new file mode 100644 index 0000000000..c007ca3c03 --- /dev/null +++ b/include/xrpl/tx/detail/TxApplySpanNames.h @@ -0,0 +1,109 @@ +#pragma once + +/** Compile-time span name constants for the transaction apply pipeline. + * + * Defines the span names and attribute keys used by the three apply-pipeline + * stages — preflight, preclaim, and transactor (apply) — that run inside the + * library (`src/libxrpl/tx/`). Built on the StaticStr/join() primitives from + * . + * + * Why a separate header from TxSpanNames.h: + * TxSpanNames.h lives under src/xrpld/ (daemon) and serves the overlay/app + * lifecycle spans (tx.receive, tx.process). Library code (applySteps.cpp, + * Transactor.cpp) must not depend on daemon headers, so the apply-pipeline + * constants live here instead. The attribute strings ("tx_type", + * "ter_result", "applied") intentionally match TxSpanNames.h so the collector + * spanmetrics connector aggregates both sets under the same dimensions. + * + * Span hierarchy (deterministic trace_id derived from txID[0:16]): + * + * The three stages run sequentially and often on different threads, so they + * do not auto-parent. Each uses a hash-derived trace_id keyed on the same + * transaction id, placing all three under one trace without context + * propagation. A transaction that hard-fails preflight or preclaim never + * reaches the transactor span — the stage attribute identifies where it + * stopped. + * + * +-----------------------------------------------------------+ + * | trace_id = txID[0:16] | + * | | + * | +-------------------+ +------------------+ +-------+ | + * | | tx.preflight | | tx.preclaim | | tx. | | + * | | stage=preflight |-->| stage=preclaim |-->| trans | | + * | | tx_type | | tx_type | | actor | | + * | | ter_result | | ter_result | | stage=| | + * | +-------------------+ +------------------+ | apply | | + * | stateless checks ledger-aware checks +-------+ | + * | (signature, fields) (sequence, fee) applies | + * +-----------------------------------------------------------+ + * + * Usage: + * @code + * #include + * using namespace telemetry; + * + * // preflight() / preclaim() use hashSpan with a full span name: + * auto span = SpanGuard::hashSpan( + * TraceCategory::Transactions, tx_apply_span::preflight, + * txID.data(), txID.kBytes); + * span.setAttribute(tx_apply_span::attr::stage, tx_apply_span::val::preflight); + * span.setAttribute(tx_apply_span::attr::terResult, transToken(ter).c_str()); + * @endcode + * + * @code + * // Transactor::operator() uses span() with prefix + suffix: + * auto span = SpanGuard::span( + * TraceCategory::Transactions, seg::tx, tx_apply_span::op::transactor); + * span.setAttribute(tx_apply_span::attr::stage, tx_apply_span::val::apply); + * @endcode + */ + +#include + +namespace xrpl::telemetry::tx_apply_span { + +// ===== Span operation suffixes ============================================= + +namespace op { +/// "preflight" — stateless transaction checks (suffix form). +inline constexpr auto preflight = makeStr("preflight"); +/// "preclaim" — ledger-aware checks before fee claim (suffix form). +inline constexpr auto preclaim = makeStr("preclaim"); +/// "transactor" — the apply stage (suffix form, used with span()). +inline constexpr auto transactor = makeStr("transactor"); +} // namespace op + +// ===== Full span names (tx.) =========================================== + +/// "tx.preflight" — full name for hashSpan() at the preflight stage. +inline constexpr auto preflight = join(seg::tx, op::preflight); +/// "tx.preclaim" — full name for hashSpan() at the preclaim stage. +inline constexpr auto preclaim = join(seg::tx, op::preclaim); + +// ===== Attribute keys ====================================================== + +namespace attr { +/// "stage" — which apply-pipeline stage this span represents. Drives the +/// collector spanmetrics `stage` dimension for per-stage RED metrics. +inline constexpr auto stage = makeStr("stage"); +/// "tx_type" — transaction type name (e.g., "Payment", "OfferCreate"). +/// Matches tx_span::attr::txType so both share the spanmetrics dimension. +inline constexpr auto txType = makeStr("tx_type"); +/// "ter_result" — engine result code after the stage (e.g., "tesSUCCESS"). +inline constexpr auto terResult = makeStr("ter_result"); +/// "applied" — whether the transaction was applied to the ledger (apply only). +inline constexpr auto applied = makeStr("applied"); +} // namespace attr + +// ===== Attribute values (stage names) ====================================== + +namespace val { +/// "preflight" — value of the stage attribute on tx.preflight. +inline constexpr auto preflight = makeStr("preflight"); +/// "preclaim" — value of the stage attribute on tx.preclaim. +inline constexpr auto preclaim = makeStr("preclaim"); +/// "apply" — value of the stage attribute on tx.transactor. +inline constexpr auto apply = makeStr("apply"); +} // namespace val + +} // namespace xrpl::telemetry::tx_apply_span diff --git a/src/libxrpl/tx/Transactor.cpp b/src/libxrpl/tx/Transactor.cpp index 5df0504004..b3917e85d9 100644 --- a/src/libxrpl/tx/Transactor.cpp +++ b/src/libxrpl/tx/Transactor.cpp @@ -44,6 +44,7 @@ #include #include #include +#include #include #include @@ -1199,9 +1200,11 @@ Transactor::operator()() auto span = telemetry::SpanGuard::span( telemetry::TraceCategory::Transactions, telemetry::seg::tx, - telemetry::makeStr("transactor")); + telemetry::tx_apply_span::op::transactor); + // "apply" — the third apply-pipeline stage, after preflight and preclaim. + span.setAttribute(telemetry::tx_apply_span::attr::stage, telemetry::tx_apply_span::val::apply); if (auto const* fmt = TxFormats::getInstance().findByType(ctx_.tx.getTxnType())) - span.setAttribute("tx_type", fmt->getName().c_str()); + span.setAttribute(telemetry::tx_apply_span::attr::txType, fmt->getName().c_str()); JLOG(j_.trace()) << "apply: " << ctx_.tx.getTransactionID(); @@ -1429,8 +1432,8 @@ Transactor::operator()() JLOG(j_.trace()) << (applied ? "applied " : "not applied ") << transToken(result); - span.setAttribute("ter_result", transToken(result).c_str()); - span.setAttribute("applied", applied); + span.setAttribute(telemetry::tx_apply_span::attr::terResult, transToken(result).c_str()); + span.setAttribute(telemetry::tx_apply_span::attr::applied, applied); return {result, applied, metadata}; } diff --git a/src/libxrpl/tx/applySteps.cpp b/src/libxrpl/tx/applySteps.cpp index 217fdd717f..fa2021a215 100644 --- a/src/libxrpl/tx/applySteps.cpp +++ b/src/libxrpl/tx/applySteps.cpp @@ -13,13 +13,16 @@ #include #include #include +#include #include #include +#include #include #include #include #include +#include #include #pragma push_macro("TRANSACTION") #undef TRANSACTION @@ -51,6 +54,47 @@ struct UnknownTxnType : std::exception } }; +/** Look up the human-readable transaction type name for span attributes. + * Returns nullptr if the type is unknown so the caller can skip the + * attribute rather than emit an empty value. + */ +char const* +txTypeName(TxType txnType) +{ + if (auto const* fmt = TxFormats::getInstance().findByType(txnType)) + return fmt->getName().c_str(); + return nullptr; +} + +/** Create a deterministic-trace span for an apply-pipeline stage. + * + * The trace_id is derived from txID[0:16] so the preflight, preclaim, and + * transactor spans of one transaction share a trace even though they run + * sequentially and often on different threads. Sets the stage, tx_type, and + * (after the stage runs) ter_result attributes that drive the collector + * spanmetrics dimensions. A no-op when telemetry is disabled. + * + * @param name Full span name (tx_apply_span::preflight / ::preclaim). + * @param stage Stage attribute value (tx_apply_span::val::*). + * @param tx The transaction supplying the id and type. + */ +[[nodiscard]] telemetry::SpanGuard +makeStageSpan(std::string_view name, std::string_view stage, STTx const& tx) +{ + auto const txID = tx.getTransactionID(); + auto span = telemetry::SpanGuard::hashSpan( + telemetry::TraceCategory::Transactions, name, txID.data(), txID.kBytes); + // Guard the type lookup behind the active check: preflight runs for every + // transaction, so findByType() must not run when tracing is off/disabled. + if (span) + { + span.setAttribute(telemetry::tx_apply_span::attr::stage, stage); + if (char const* typeName = txTypeName(tx.getTxnType())) + span.setAttribute(telemetry::tx_apply_span::attr::txType, typeName); + } + return span; +} + // Call a lambda with the concrete transaction type as a template parameter // throw an "UnknownTxnType" exception on error template @@ -133,82 +177,122 @@ consequencesHelper(PreflightContext const& ctx) static std::pair invokePreflight(PreflightContext const& ctx) { + // Trace the preflight stage. The span shares the transaction's + // deterministic trace_id so it correlates with preclaim and transactor. + auto span = makeStageSpan( + telemetry::tx_apply_span::preflight, telemetry::tx_apply_span::val::preflight, ctx.tx); try { - return withTxnType(ctx.rules, ctx.tx.getTxnType(), [&]() { + auto result = withTxnType(ctx.rules, ctx.tx.getTxnType(), [&]() { auto const tec = Transactor::invokePreflight(ctx); return std::make_pair( tec, isTesSuccess(tec) ? consequencesHelper(ctx) : TxConsequences{tec}); }); + if (span) + { + span.setAttribute( + telemetry::tx_apply_span::attr::terResult, transToken(result.first).c_str()); + } + return result; } catch (UnknownTxnType const& e) { // Should never happen // LCOV_EXCL_START JLOG(ctx.j.fatal()) << "Unknown transaction type in preflight: " << e.txnType; + span.recordException(e); UNREACHABLE("xrpl::invokePreflight : unknown transaction type"); return {temUNKNOWN, TxConsequences{temUNKNOWN}}; // LCOV_EXCL_STOP } + catch (std::exception const& e) + { + // The caller's preflight() maps this to tefEXCEPTION. Record it on the + // span before unwinding so per-stage error counts include exceptions. + span.setAttribute( + telemetry::tx_apply_span::attr::terResult, transToken(tefEXCEPTION).c_str()); + span.recordException(e); + throw; + } } static TER invokePreclaim(PreclaimContext const& ctx) { + // Trace the preclaim stage under the transaction's deterministic trace_id. + auto span = makeStageSpan( + telemetry::tx_apply_span::preclaim, telemetry::tx_apply_span::val::preclaim, ctx.tx); try { // use name hiding to accomplish compile-time polymorphism of static // class functions for Transactor and derived classes. - return withTxnType(ctx.view.rules(), ctx.tx.getTxnType(), [&]() -> TER { - // preclaim functionality is divided into two sections: - // 1. Up to and including the signature check: returns NotTEC. - // All transaction checks before and including checkSign - // MUST return NotTEC, or something more restrictive. - // Allowing tec results in these steps risks theft or - // destruction of funds, as a fee will be charged before the - // signature is checked. - // 2. After the signature check: returns TER. + TER const preclaimTer = + withTxnType(ctx.view.rules(), ctx.tx.getTxnType(), [&]() -> TER { + // preclaim functionality is divided into two sections: + // 1. Up to and including the signature check: returns NotTEC. + // All transaction checks before and including checkSign + // MUST return NotTEC, or something more restrictive. + // Allowing tec results in these steps risks theft or + // destruction of funds, as a fee will be charged before the + // signature is checked. + // 2. After the signature check: returns TER. - // If the transactor requires a valid account and the - // transaction doesn't list one, preflight will have already - // a flagged a failure. - auto const id = ctx.tx.getAccountID(sfAccount); + // If the transactor requires a valid account and the + // transaction doesn't list one, preflight will have already + // a flagged a failure. + auto const id = ctx.tx.getAccountID(sfAccount); - if (id != beast::kZero) - { - if (NotTEC const preSigResult = [&]() -> NotTEC { - if (NotTEC const result = T::checkSeqProxy(ctx.view, ctx.tx, ctx.j)) - return result; + if (id != beast::kZero) + { + if (NotTEC const preSigResult = [&]() -> NotTEC { + if (NotTEC const result = T::checkSeqProxy(ctx.view, ctx.tx, ctx.j)) + return result; - if (NotTEC const result = T::checkPriorTxAndLastLedger(ctx)) - return result; + if (NotTEC const result = T::checkPriorTxAndLastLedger(ctx)) + return result; - if (NotTEC const result = T::checkPermission(ctx.view, ctx.tx)) - return result; + if (NotTEC const result = T::checkPermission(ctx.view, ctx.tx)) + return result; - if (NotTEC const result = T::checkSign(ctx)) - return result; + if (NotTEC const result = T::checkSign(ctx)) + return result; - return tesSUCCESS; - }()) - return preSigResult; + return tesSUCCESS; + }()) + return preSigResult; - if (TER const result = T::checkFee(ctx, calculateBaseFee(ctx.view, ctx.tx))) - return result; - } + if (TER const result = T::checkFee(ctx, calculateBaseFee(ctx.view, ctx.tx))) + return result; + } - return T::preclaim(ctx); - }); + return T::preclaim(ctx); + }); + if (span) + { + span.setAttribute( + telemetry::tx_apply_span::attr::terResult, transToken(preclaimTer).c_str()); + } + return preclaimTer; } catch (UnknownTxnType const& e) { // Should never happen // LCOV_EXCL_START JLOG(ctx.j.fatal()) << "Unknown transaction type in preclaim: " << e.txnType; + span.recordException(e); UNREACHABLE("xrpl::invokePreclaim : unknown transaction type"); return temUNKNOWN; // LCOV_EXCL_STOP } + catch (std::exception const& e) + { + // The caller's preclaim() maps this to tefEXCEPTION. Record it on the + // span before unwinding so per-stage error counts include exceptions. + span.setAttribute( + telemetry::tx_apply_span::attr::terResult, transToken(tefEXCEPTION).c_str()); + span.recordException(e); + throw; + } } /** diff --git a/src/tests/libxrpl/telemetry/TxApplySpanNames.cpp b/src/tests/libxrpl/telemetry/TxApplySpanNames.cpp new file mode 100644 index 0000000000..170fef3ae4 --- /dev/null +++ b/src/tests/libxrpl/telemetry/TxApplySpanNames.cpp @@ -0,0 +1,52 @@ +#include + +#include + +#include + +/** Contract tests for the transaction apply-pipeline span constants. + * + * The span names and attribute keys in TxApplySpanNames.h are a cross-component + * contract: the collector spanmetrics connector aggregates on these exact + * strings (dimensions tx_type, ter_result, stage) and the Grafana + * transaction-overview dashboard queries them. A silent rename here would + * break per-stage metrics with no compile error, so these tests pin the + * literal values. They need no telemetry runtime and run in every build. + */ + +using namespace xrpl::telemetry; + +TEST(TxApplySpanNames, span_names_are_dot_qualified) +{ + // Full span names feed SpanGuard::hashSpan() in applySteps.cpp. + EXPECT_EQ(std::string_view(tx_apply_span::preflight), "tx.preflight"); + EXPECT_EQ(std::string_view(tx_apply_span::preclaim), "tx.preclaim"); +} + +TEST(TxApplySpanNames, operation_suffixes) +{ + // Suffix used with SpanGuard::span(cat, seg::tx, suffix) in Transactor.cpp. + EXPECT_EQ(std::string_view(tx_apply_span::op::preflight), "preflight"); + EXPECT_EQ(std::string_view(tx_apply_span::op::preclaim), "preclaim"); + EXPECT_EQ(std::string_view(tx_apply_span::op::transactor), "transactor"); +} + +TEST(TxApplySpanNames, attribute_keys_match_collector_dimensions) +{ + // These keys MUST match docker/telemetry/otel-collector-config.yaml + // spanmetrics dimensions and TxSpanNames.h (so both span sets aggregate + // under one dimension). + EXPECT_EQ(std::string_view(tx_apply_span::attr::stage), "stage"); + EXPECT_EQ(std::string_view(tx_apply_span::attr::txType), "tx_type"); + EXPECT_EQ(std::string_view(tx_apply_span::attr::terResult), "ter_result"); + EXPECT_EQ(std::string_view(tx_apply_span::attr::applied), "applied"); +} + +TEST(TxApplySpanNames, stage_values_are_the_three_pipeline_stages) +{ + // The stage attribute carries exactly these three values; they become the + // spanmetrics `stage` dimension cardinality (3) and the dashboard filter. + EXPECT_EQ(std::string_view(tx_apply_span::val::preflight), "preflight"); + EXPECT_EQ(std::string_view(tx_apply_span::val::preclaim), "preclaim"); + EXPECT_EQ(std::string_view(tx_apply_span::val::apply), "apply"); +} diff --git a/src/xrpld/app/misc/detail/TxQ.cpp b/src/xrpld/app/misc/detail/TxQ.cpp index ad14d6ae5f..714db9314c 100644 --- a/src/xrpld/app/misc/detail/TxQ.cpp +++ b/src/xrpld/app/misc/detail/TxQ.cpp @@ -608,7 +608,8 @@ TxQ::tryClearAccountQueueUpThruTx( if (txResult.applied) { // All of the queued transactions applied, so remove them from the - // queue. + // queue. `dist` queued txs preceded the current one in the batch. + span.setAttribute(txq_span::attr::numCleared, static_cast(dist)); endTxIter = erase(accountIter->second, beginTxIter, endTxIter); // If `tx` is replacing a queued tx, delete that one, too. if (endTxIter != accountIter->second.transactions.end() && endTxIter->first == tSeqProx) @@ -745,6 +746,9 @@ TxQ::apply( span.setAttribute(txq_span::attr::txHash, to_string(tx->getTransactionID()).c_str()); if (auto const* fmt = TxFormats::getInstance().findByType(tx->getTxnType())) span.setAttribute(txq_span::attr::txType, fmt->getName().c_str()); + // Default outcome; overridden below on the direct-apply and queued paths. + // Every other early return leaves the tx rejected from the queue. + span.setAttribute(txq_span::attr::txqStatus, txq_span::val::rejected); NumberSO const stNumberSO{view.rules().enabled(fixUniversalNumber)}; @@ -758,7 +762,10 @@ TxQ::apply( // See if the transaction paid a high enough fee that it can go straight // into the ledger. if (auto directApplied = tryDirectApply(app, view, tx, flags, j)) + { + span.setAttribute(txq_span::attr::txqStatus, txq_span::val::appliedDirect); return *directApplied; + } if ((flags & TapDryRun) != 0u) return {telCAN_NOT_QUEUE, false}; @@ -885,6 +892,10 @@ TxQ::apply( auto const metricsSnapshot = feeMetrics_.getSnapshot(); auto const feeLevelPaid = getFeeLevelPaid(view, *tx); auto const requiredFeeLevel = getRequiredFeeLevel(view, flags, metricsSnapshot, lock); + span.setAttribute( + txq_span::attr::feeLevelPaid, static_cast(feeLevelPaid.value())); + span.setAttribute( + txq_span::attr::requiredFeeLevel, static_cast(requiredFeeLevel.value())); // Is there a blocker already in the account's queue? If so, don't // allow additional transactions in the queue. @@ -1218,6 +1229,7 @@ TxQ::apply( /* Can't erase (*replacedTxIter) here because success implies that it has already been deleted. */ + span.setAttribute(txq_span::attr::txqStatus, txq_span::val::applied); return result; } } @@ -1337,6 +1349,7 @@ TxQ::apply( << " to queue." << " Flags: " << flags; + span.setAttribute(txq_span::attr::txqStatus, txq_span::val::queued); return {terQUEUED, false}; } @@ -1372,6 +1385,7 @@ TxQ::processClosedLedger(Application& app, ReadView const& view, bool timeLeap) // Remove any queued candidates whose LastLedgerSequence has gone by. auto* const metrics = app.getMetricsRegistry(); + std::int64_t expiredCount = 0; for (auto candidateIter = byFee_.begin(); candidateIter != byFee_.end();) { if (candidateIter->lastValid && *candidateIter->lastValid <= ledgerSeq) @@ -1382,12 +1396,14 @@ TxQ::processClosedLedger(Application& app, ReadView const& view, bool timeLeap) // escalating fee and were never included before expiry. if (metrics != nullptr) metrics->incrementTxqExpired(); + ++expiredCount; } else { ++candidateIter; } } + span.setAttribute(txq_span::attr::expiredCount, expiredCount); // Remove any TxQAccounts that don't have candidates // under them diff --git a/src/xrpld/app/misc/detail/TxQSpanNames.h b/src/xrpld/app/misc/detail/TxQSpanNames.h index 9292ba1e7c..3f8f86aa30 100644 --- a/src/xrpld/app/misc/detail/TxQSpanNames.h +++ b/src/xrpld/app/misc/detail/TxQSpanNames.h @@ -15,12 +15,14 @@ * | +--------------------------------------------------+ | * | | txq.enqueue | | * | | TxQ::apply() | | - * | | attrs: tx_hash, status, fee_level | | + * | | attrs: tx_hash, tx_type, txq_status, | | + * | | fee_level_paid, required_fee_level | | * | | | | * | | +-------------------+ +----------------------+ | | * | | | txq.apply_direct | | txq.batch_clear | | | * | | | tryDirectApply() | | tryClearAccount...() | | | - * | | +-------------------+ +----------------------+ | | + * | | +-------------------+ | attrs: num_cleared | | | + * | | +----------------------+ | | * | +--------------------------------------------------+ | * +-------------------------------------------------------+ *