Merge branch 'pratik/otel-phase6-statsd' into pratik/otel-phase7-native-metrics

# Conflicts:
#	OpenTelemetryPlan/09-data-collection-reference.md
This commit is contained in:
Pratik Mankawde
2026-06-05 12:48:31 +01:00
12 changed files with 574 additions and 76 deletions

View File

@@ -242,6 +242,7 @@ xrpl.tx > xrpl.basics
xrpl.tx > xrpl.core
xrpl.tx > xrpl.ledger
xrpl.tx > xrpl.protocol
xrpl.tx > xrpl.telemetry
xrpld.app > test.unit_test
xrpld.app > xrpl.basics
xrpld.app > xrpl.core

View File

@@ -99,13 +99,23 @@ Controlled by `trace_rpc=1` in `[telemetry]` config.
Controlled by `trace_transactions=1` in `[telemetry]` config.
| Span Name | Parent | Source File | Description |
| ------------ | -------------- | --------------- | ----------------------------------------------------------------- |
| `tx.process` | — | NetworkOPs.cpp | Transaction submission entry point (local or peer-relayed) |
| `tx.receive` | — | PeerImp.cpp | Raw transaction received from peer overlay (before deduplication) |
| `tx.apply` | `ledger.build` | BuildLedger.cpp | Transaction set applied to new ledger during consensus |
| Span Name | Parent | Source File | Description |
| --------------- | -------------- | --------------- | ----------------------------------------------------------------- |
| `tx.process` | — | NetworkOPs.cpp | Transaction submission entry point (local or peer-relayed) |
| `tx.receive` | — | PeerImp.cpp | Raw transaction received from peer overlay (before deduplication) |
| `tx.apply` | `ledger.build` | BuildLedger.cpp | Transaction set applied to new ledger during consensus |
| `tx.preflight` | — | applySteps.cpp | Stateless checks stage (`stage=preflight`) |
| `tx.preclaim` | — | applySteps.cpp | Ledger-aware checks stage before fee claim (`stage=preclaim`) |
| `tx.transactor` | — | Transactor.cpp | Apply stage — the transactor runs (`stage=apply`) |
The three apply-pipeline spans share a deterministic `trace_id` derived from
`txID[0:16]`, so preflight, preclaim, and transactor for one transaction group
under a single trace even though they run sequentially and often on different
threads. A transaction that hard-fails preflight or preclaim never reaches the
later spans — the `stage` attribute identifies where it stopped.
**Where to find**: Tempo → TraceQL: `{resource.service.name="xrpld" && name=~"tx.process|tx.receive"}`
or, for the apply pipeline: `{resource.service.name="xrpld" && name=~"tx.preflight|tx.preclaim|tx.transactor"}`
**Grafana dashboard**: _Transaction Overview_ (`xrpld-transactions`)
@@ -177,13 +187,19 @@ Every span can carry key-value attributes that provide context for filtering and
#### Transaction Attributes
| Attribute | Type | Set On | Description |
| -------------------- | ------- | -------------------------- | ---------------------------------------------------- |
| `xrpl.tx.hash` | string | `tx.process`, `tx.receive` | Transaction hash (hex-encoded) |
| `xrpl.tx.local` | boolean | `tx.process` | `true` if locally submitted, `false` if peer-relayed |
| `xrpl.tx.path` | string | `tx.process` | Submission path: `"sync"` or `"async"` |
| `xrpl.tx.suppressed` | boolean | `tx.receive` | `true` if transaction was suppressed (duplicate) |
| `xrpl.tx.status` | string | `tx.receive` | Transaction status (e.g., `"known_bad"`) |
| Attribute | Type | Set On | Description |
| ------------------- | ------- | ---------------------------------------------- | --------------------------------------------------------------------- |
| `xrpl.tx.hash` | string | `tx.process`, `tx.receive` | Transaction hash (hex-encoded) |
| `local` | boolean | `tx.process` | `true` if locally submitted, `false` if peer-relayed |
| `path` | string | `tx.process` | Submission path: `"sync"` or `"async"` |
| `suppressed` | boolean | `tx.receive` | `true` if transaction was suppressed (duplicate) |
| `tx_status` | string | `tx.receive` | Transaction status (e.g., `"known_bad"`) |
| `xrpl.peer.id` | int64 | `tx.receive` | Peer identifier (also set on peer spans) |
| `xrpl.peer.version` | string | `tx.receive` | Peer protocol version string |
| `stage` | string | `tx.preflight`, `tx.preclaim`, `tx.transactor` | Apply-pipeline stage: `preflight`, `preclaim`, or `apply` |
| `tx_type` | string | `tx.preflight`, `tx.preclaim`, `tx.transactor` | Transaction type name (e.g., `Payment`) |
| `ter_result` | string | `tx.preflight`, `tx.preclaim`, `tx.transactor` | Engine result token for that stage (e.g., `tesSUCCESS`, `terPRE_SEQ`) |
| `applied` | boolean | `tx.transactor` | `true` if the transaction was applied to the ledger |
**Tempo query**: `{span.xrpl.tx.hash="<hash>"}` to trace a specific transaction across nodes.
@@ -248,14 +264,25 @@ The OTel Collector's SpanMetrics connector automatically generates RED (Rate, Er
**Additional dimension labels** (configured in `otel-collector-config.yaml`):
| Span Attribute | Prometheus Label | Applies To |
| ------------------------------ | ------------------------------ | ------------------------- |
| `command` | `xrpl_rpc_command` | `rpc.command.*` |
| `rpc_status` | `xrpl_rpc_status` | `rpc.command.*` |
| `xrpl.consensus.mode` | `xrpl_consensus_mode` | `consensus.ledger_close` |
| `xrpl.tx.local` | `xrpl_tx_local` | `tx.process` |
| `xrpl.peer.proposal.trusted` | `xrpl_peer_proposal_trusted` | `peer.proposal.receive` |
| `xrpl.peer.validation.trusted` | `xrpl_peer_validation_trusted` | `peer.validation.receive` |
| Span Attribute | Prometheus Label | Applies To |
| --------------------- | ------------------------------ | ---------------------------------------------- |
| `command` | `command` | `rpc.command.*` |
| `rpc_status` | `rpc_status` | `rpc.command.*` |
| `xrpl.consensus.mode` | `xrpl_consensus_mode` | `consensus.ledger_close` |
| `local` | `local` | `tx.process` |
| `proposal_trusted` | `proposal_trusted` | `peer.proposal.receive` |
| `validation_trusted` | `validation_trusted` | `peer.validation.receive` |
| `stage` | `stage` | `tx.preflight`, `tx.preclaim`, `tx.transactor` |
The `stage` dimension (3 values: `preflight`, `preclaim`, `apply`) turns the
apply-pipeline spans into per-stage RED metrics with no native instruments — the
_Transaction Overview_ dashboard charts rate, p95 latency, and failure rate by stage.
> **Sampling caveat**: span-derived metrics inherit the **tracer head-sampling**
> ratio (`sampling_ratio` in `[telemetry]`, via `TraceIdRatioBasedSampler`). At
> `sampling_ratio < 1.0` the stage RED metrics undercount proportionally — they
> reflect sampled traces, not the full transaction volume. Native StatsD/meter
> metrics do not sample. Account for this when reading absolute stage rates.
**Where to query**: Prometheus → `traces_span_metrics_calls_total{span_name="rpc.command.server_info"}`

View File

@@ -474,17 +474,22 @@ This gives the best of both worlds: guaranteed cross-node correlation via determ
**Attributes added**:
| Span | Attribute | Type | Source |
| --------------- | ---------------- | ------ | ------------------------------------------------------------------- |
| `tx.process` | `tx_type` | string | `TxFormats::getInstance().findByType(stx->getTxnType())->getName()` |
| `tx.process` | `fee` | int64 | `stx->getFieldAmount(sfFee).xrp().drops()` |
| `tx.process` | `sequence` | int64 | `stx->getSeqProxy().value()` |
| `tx.process` | `ter_result` | string | `transToken(e.result)` (set after batch application) |
| `tx.process` | `applied` | bool | `e.applied` (set after batch application) |
| `tx.receive` | `tx_type` | string | `TxFormats::getInstance().findByType(stx->getTxnType())->getName()` |
| `txq.enqueue` | `tx_type` | string | same pattern as above |
| `txq.accept.tx` | `txq_status` | string | `applied` / `failed` / `retried` |
| `txq.accept` | `ledger_changed` | bool | set at end of accept loop |
| Span | Attribute | Type | Source |
| ----------------- | -------------------- | ------ | ------------------------------------------------------------------- |
| `tx.process` | `tx_type` | string | `TxFormats::getInstance().findByType(stx->getTxnType())->getName()` |
| `tx.process` | `fee` | int64 | `stx->getFieldAmount(sfFee).xrp().drops()` |
| `tx.process` | `sequence` | int64 | `stx->getSeqProxy().value()` |
| `tx.process` | `ter_result` | string | `transToken(e.result)` (set after batch application) |
| `tx.process` | `applied` | bool | `e.applied` (set after batch application) |
| `tx.receive` | `tx_type` | string | `TxFormats::getInstance().findByType(stx->getTxnType())->getName()` |
| `txq.enqueue` | `tx_type` | string | same pattern as above |
| `txq.enqueue` | `txq_status` | string | `queued` / `applied_direct` / `applied` / `rejected` |
| `txq.enqueue` | `fee_level_paid` | int64 | `getFeeLevelPaid(view, *tx).value()` |
| `txq.enqueue` | `required_fee_level` | int64 | `getRequiredFeeLevel(...).value()` |
| `txq.batch_clear` | `num_cleared` | int64 | queued txs cleared ahead of the applying tx |
| `txq.cleanup` | `expired_count` | int64 | entries dropped for passed `LastLedgerSequence` |
| `txq.accept.tx` | `txq_status` | string | `applied` / `failed` / `retried` |
| `txq.accept` | `ledger_changed` | bool | set at end of accept loop |
**New attr keys**: `TxSpanNames.h` (`txType`, `fee`, `sequence`, `terResult`, `applied`), `TxQSpanNames.h` (`txType`).

View File

@@ -669,6 +669,138 @@
},
"overrides": []
}
},
{
"title": "Tx Apply Pipeline Rate by Stage",
"description": "Span rate for each apply-pipeline stage (preflight, preclaim, apply). A drop between stages shows where transactions are filtered out. Requires the stage dimension in spanmetrics.",
"type": "timeseries",
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 64
},
"options": {
"tooltip": {
"mode": "multi",
"sort": "desc"
},
"legend": {
"displayMode": "table",
"placement": "right",
"calcs": ["mean", "max"]
}
},
"targets": [
{
"datasource": {
"type": "prometheus"
},
"expr": "sum by (stage, exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=~\"tx.preflight|tx.preclaim|tx.transactor\", stage=~\"$stage\"}[5m]))",
"legendFormat": "{{stage}} [{{exported_instance}}]"
}
],
"fieldConfig": {
"defaults": {
"unit": "ops",
"custom": {
"axisLabel": "Spans / Sec",
"spanNulls": true,
"insertNulls": false,
"showPoints": "auto",
"pointSize": 3
}
},
"overrides": []
}
},
{
"title": "Tx Apply Pipeline Latency by Stage (p95)",
"description": "95th-percentile duration of each apply-pipeline stage. Isolates which stage (preflight, preclaim, apply) dominates transaction processing time.",
"type": "timeseries",
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 64
},
"options": {
"tooltip": {
"mode": "multi",
"sort": "desc"
},
"legend": {
"displayMode": "table",
"placement": "right",
"calcs": ["mean", "max"]
}
},
"targets": [
{
"datasource": {
"type": "prometheus"
},
"expr": "histogram_quantile(0.95, sum by (le, stage, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=~\"tx.preflight|tx.preclaim|tx.transactor\", stage=~\"$stage\"}[5m])))",
"legendFormat": "P95 {{stage}} [{{exported_instance}}]"
}
],
"fieldConfig": {
"defaults": {
"unit": "ms",
"custom": {
"axisLabel": "Duration (ms)",
"spanNulls": true,
"insertNulls": false,
"showPoints": "auto",
"pointSize": 3
}
},
"overrides": []
}
},
{
"title": "Tx Apply Pipeline Failure Rate by Stage",
"description": "Rate of apply-pipeline spans whose ter_result is not tesSUCCESS, split by stage. Shows whether failures concentrate in preflight, preclaim, or apply. Filters on ter_result rather than span status because a failing ter code completes the span normally; only thrown exceptions set an error status.",
"type": "timeseries",
"gridPos": {
"h": 8,
"w": 24,
"x": 0,
"y": 72
},
"options": {
"tooltip": {
"mode": "multi",
"sort": "desc"
},
"legend": {
"displayMode": "table",
"placement": "right",
"calcs": ["mean", "max"]
}
},
"targets": [
{
"datasource": {
"type": "prometheus"
},
"expr": "sum by (stage, exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=~\"tx.preflight|tx.preclaim|tx.transactor\", stage=~\"$stage\", ter_result!~\"tesSUCCESS|\"}[5m]))",
"legendFormat": "{{stage}} [{{exported_instance}}]"
}
],
"fieldConfig": {
"defaults": {
"unit": "ops",
"custom": {
"axisLabel": "Failed Spans / Sec",
"spanNulls": true,
"insertNulls": false,
"showPoints": "auto",
"pointSize": 3
}
},
"overrides": []
}
}
],
"schemaVersion": 39,
@@ -768,6 +900,24 @@
},
"sort": 1,
"label": "Queue Status"
},
{
"name": "stage",
"type": "query",
"datasource": {
"type": "prometheus"
},
"query": "label_values(traces_span_metrics_calls_total{span_name=~\"tx.preflight|tx.preclaim|tx.transactor\", stage!=\"\"}, stage)",
"refresh": 2,
"includeAll": true,
"multi": true,
"allValue": ".*",
"current": {
"text": "All",
"value": "$__all"
},
"sort": 1,
"label": "Apply Stage"
}
]
},

View File

@@ -41,6 +41,9 @@ connectors:
- name: validation_trusted
- name: tx_type
- name: ter_result
# Apply-pipeline stage (preflight|preclaim|apply) — splits the
# tx.preflight/tx.preclaim/tx.transactor span RED metrics per stage.
- name: stage
- name: txq_status
- name: consensus_state
- name: load_type

View File

@@ -74,11 +74,20 @@ All spans instrumented in xrpld, grouped by subsystem:
### Transaction Spans (Phase 3)
| Span Name | Source File | Attributes | Description |
| ------------ | --------------- | --------------------------------------------------------------------------------- | ------------------------------------- |
| `tx.process` | NetworkOPs.cpp | `tx_hash`, `local`, `path`, `tx_type`, `fee`, `sequence`, `ter_result`, `applied` | Transaction submission and processing |
| `tx.receive` | PeerImp.cpp | `peer_id`, `tx_hash`, `tx_type`, `peer_version`, `suppressed`, `tx_status` | Transaction received from peer relay |
| `tx.apply` | BuildLedger.cpp | `ledger_seq`, `tx_count`, `tx_failed` | Transaction set applied per ledger |
| Span Name | Source File | Attributes | Description |
| --------------- | --------------- | --------------------------------------------------------------------------------- | ------------------------------------- |
| `tx.process` | NetworkOPs.cpp | `tx_hash`, `local`, `path`, `tx_type`, `fee`, `sequence`, `ter_result`, `applied` | Transaction submission and processing |
| `tx.receive` | PeerImp.cpp | `peer_id`, `tx_hash`, `tx_type`, `peer_version`, `suppressed`, `tx_status` | Transaction received from peer relay |
| `tx.apply` | BuildLedger.cpp | `ledger_seq`, `tx_count`, `tx_failed` | Transaction set applied per ledger |
| `tx.preflight` | applySteps.cpp | `stage`, `tx_type`, `ter_result` | Stateless checks stage |
| `tx.preclaim` | applySteps.cpp | `stage`, `tx_type`, `ter_result` | Ledger-aware checks stage |
| `tx.transactor` | Transactor.cpp | `stage`, `tx_type`, `ter_result`, `applied` | Apply stage (transactor runs) |
The three apply-pipeline spans (`tx.preflight`, `tx.preclaim`, `tx.transactor`)
share a deterministic `trace_id` from `txID[0:16]`, so they group under one
trace per transaction. The `stage` attribute (`preflight` / `preclaim` /
`apply`) drives the collector spanmetrics `stage` dimension, giving per-stage
RED metrics on the _Transaction Overview_ dashboard.
### Transaction Queue Spans (Phase 3)
@@ -182,6 +191,43 @@ This section shows what questions you can answer using the span attributes, with
{name=~"tx\\..*"} | tx_type = "NFTokenMint"
```
### Apply Pipeline by Stage
```
# All apply-pipeline spans (preflight -> preclaim -> apply); the three spans of one transaction group under a single trace
{name=~"tx.preflight|tx.preclaim|tx.transactor"}
# Transactions that failed at the preclaim stage
{name="tx.preclaim"} | ter_result != "tesSUCCESS"
# Transactions that hard-failed preflight (never reached preclaim/apply)
{name="tx.preflight"} | ter_result != "tesSUCCESS"
```
PromQL on the span-derived metrics (dashboard: _Transaction Overview_):
```
# Per-stage throughput — the funnel preflight >= preclaim >= apply
sum by (stage) (rate(traces_span_metrics_calls_total{span_name=~"tx.preflight|tx.preclaim|tx.transactor"}[5m]))
# Per-stage p95 latency
histogram_quantile(0.95, sum by (le, stage) (rate(traces_span_metrics_duration_milliseconds_bucket{span_name=~"tx.preflight|tx.preclaim|tx.transactor"}[5m])))
# Per-stage failure rate (ter_result != tesSUCCESS; a failing ter completes the
# span normally, so filter on the attribute, not status_code which only flags exceptions)
sum by (stage) (rate(traces_span_metrics_calls_total{span_name=~"tx.preflight|tx.preclaim|tx.transactor", ter_result!~"tesSUCCESS|"}[5m]))
```
> **Alerting**: a rising `tx.preflight` / `tx.preclaim` failure rate points to
> malformed or stale-sequence submissions (often spam or a misbehaving client);
> a rising `tx.transactor` failure rate points to apply-time problems. Alert per
> stage rather than on a single aggregate so the failing stage is obvious.
> **Sampling caveat**: these stage metrics are span-derived and inherit the
> **tracer head-sampling** ratio (`sampling_ratio`). At `sampling_ratio < 1.0`
> they undercount proportionally — treat them as relative trends, not absolute
> transaction counts. Native StatsD metrics are unsampled.
### Transaction Queue Health
```

View File

@@ -0,0 +1,109 @@
#pragma once
/** Compile-time span name constants for the transaction apply pipeline.
*
* Defines the span names and attribute keys used by the three apply-pipeline
* stages — preflight, preclaim, and transactor (apply) — that run inside the
* library (`src/libxrpl/tx/`). Built on the StaticStr/join() primitives from
* <xrpl/telemetry/SpanNames.h>.
*
* Why a separate header from TxSpanNames.h:
* TxSpanNames.h lives under src/xrpld/ (daemon) and serves the overlay/app
* lifecycle spans (tx.receive, tx.process). Library code (applySteps.cpp,
* Transactor.cpp) must not depend on daemon headers, so the apply-pipeline
* constants live here instead. The attribute strings ("tx_type",
* "ter_result", "applied") intentionally match TxSpanNames.h so the collector
* spanmetrics connector aggregates both sets under the same dimensions.
*
* Span hierarchy (deterministic trace_id derived from txID[0:16]):
*
* The three stages run sequentially and often on different threads, so they
* do not auto-parent. Each uses a hash-derived trace_id keyed on the same
* transaction id, placing all three under one trace without context
* propagation. A transaction that hard-fails preflight or preclaim never
* reaches the transactor span — the stage attribute identifies where it
* stopped.
*
* +-----------------------------------------------------------+
* | trace_id = txID[0:16] |
* | |
* | +-------------------+ +------------------+ +-------+ |
* | | tx.preflight | | tx.preclaim | | tx. | |
* | | stage=preflight |-->| stage=preclaim |-->| trans | |
* | | tx_type | | tx_type | | actor | |
* | | ter_result | | ter_result | | stage=| |
* | +-------------------+ +------------------+ | apply | |
* | stateless checks ledger-aware checks +-------+ |
* | (signature, fields) (sequence, fee) applies |
* +-----------------------------------------------------------+
*
* Usage:
* @code
* #include <xrpl/tx/detail/TxApplySpanNames.h>
* using namespace telemetry;
*
* // preflight() / preclaim() use hashSpan with a full span name:
* auto span = SpanGuard::hashSpan(
* TraceCategory::Transactions, tx_apply_span::preflight,
* txID.data(), txID.kBytes);
* span.setAttribute(tx_apply_span::attr::stage, tx_apply_span::val::preflight);
* span.setAttribute(tx_apply_span::attr::terResult, transToken(ter).c_str());
* @endcode
*
* @code
* // Transactor::operator() uses span() with prefix + suffix:
* auto span = SpanGuard::span(
* TraceCategory::Transactions, seg::tx, tx_apply_span::op::transactor);
* span.setAttribute(tx_apply_span::attr::stage, tx_apply_span::val::apply);
* @endcode
*/
#include <xrpl/telemetry/SpanNames.h>
namespace xrpl::telemetry::tx_apply_span {
// ===== Span operation suffixes =============================================
// Bare operation names. They are either joined with seg::tx below to form a
// full span name, or passed as the suffix argument of SpanGuard::span().
namespace op {
/// "preflight" — stateless transaction checks (suffix form).
inline constexpr auto preflight = makeStr("preflight");
/// "preclaim" — ledger-aware checks before fee claim (suffix form).
inline constexpr auto preclaim = makeStr("preclaim");
/// "transactor" — the apply stage (suffix form, used with span()).
inline constexpr auto transactor = makeStr("transactor");
} // namespace op
// ===== Full span names (tx.<op>) ===========================================
// Only preflight and preclaim have a pre-joined full name: they are created
// with hashSpan(), which takes the complete span name. The transactor stage is
// created with span(category, seg::tx, op::transactor), so it needs only the
// suffix and has no full-name constant here.
/// "tx.preflight" — full name for hashSpan() at the preflight stage.
inline constexpr auto preflight = join(seg::tx, op::preflight);
/// "tx.preclaim" — full name for hashSpan() at the preclaim stage.
inline constexpr auto preclaim = join(seg::tx, op::preclaim);
// ===== Attribute keys ======================================================
// These key strings are matched literally by the collector spanmetrics
// dimensions; renaming one changes the emitted metric labels.
namespace attr {
/// "stage" — which apply-pipeline stage this span represents. Drives the
/// collector spanmetrics `stage` dimension for per-stage RED metrics.
inline constexpr auto stage = makeStr("stage");
/// "tx_type" — transaction type name (e.g., "Payment", "OfferCreate").
/// Matches tx_span::attr::txType so both share the spanmetrics dimension.
inline constexpr auto txType = makeStr("tx_type");
/// "ter_result" — engine result code after the stage (e.g., "tesSUCCESS").
inline constexpr auto terResult = makeStr("ter_result");
/// "applied" — whether the transaction was applied to the ledger (apply only).
inline constexpr auto applied = makeStr("applied");
} // namespace attr
// ===== Attribute values (stage names) ======================================
// The values carried by attr::stage. Note the transactor span reports the
// stage value "apply", not "transactor".
namespace val {
/// "preflight" — value of the stage attribute on tx.preflight.
inline constexpr auto preflight = makeStr("preflight");
/// "preclaim" — value of the stage attribute on tx.preclaim.
inline constexpr auto preclaim = makeStr("preclaim");
/// "apply" — value of the stage attribute on tx.transactor.
inline constexpr auto apply = makeStr("apply");
} // namespace val
} // namespace xrpl::telemetry::tx_apply_span

View File

@@ -44,6 +44,7 @@
#include <xrpl/tx/SignerEntries.h>
#include <xrpl/tx/apply.h>
#include <xrpl/tx/applySteps.h>
#include <xrpl/tx/detail/TxApplySpanNames.h>
#include <cstddef>
#include <cstdint>
@@ -1199,9 +1200,11 @@ Transactor::operator()()
auto span = telemetry::SpanGuard::span(
telemetry::TraceCategory::Transactions,
telemetry::seg::tx,
telemetry::makeStr("transactor"));
telemetry::tx_apply_span::op::transactor);
// "apply" — the third apply-pipeline stage, after preflight and preclaim.
span.setAttribute(telemetry::tx_apply_span::attr::stage, telemetry::tx_apply_span::val::apply);
if (auto const* fmt = TxFormats::getInstance().findByType(ctx_.tx.getTxnType()))
span.setAttribute("tx_type", fmt->getName().c_str());
span.setAttribute(telemetry::tx_apply_span::attr::txType, fmt->getName().c_str());
JLOG(j_.trace()) << "apply: " << ctx_.tx.getTransactionID();
@@ -1429,8 +1432,8 @@ Transactor::operator()()
JLOG(j_.trace()) << (applied ? "applied " : "not applied ") << transToken(result);
span.setAttribute("ter_result", transToken(result).c_str());
span.setAttribute("applied", applied);
span.setAttribute(telemetry::tx_apply_span::attr::terResult, transToken(result).c_str());
span.setAttribute(telemetry::tx_apply_span::attr::applied, applied);
return {result, applied, metadata};
}

View File

@@ -13,13 +13,16 @@
#include <xrpl/protocol/SeqProxy.h>
#include <xrpl/protocol/TER.h>
#include <xrpl/protocol/XRPAmount.h>
#include <xrpl/telemetry/SpanGuard.h>
#include <xrpl/tx/ApplyContext.h>
#include <xrpl/tx/Transactor.h>
#include <xrpl/tx/detail/TxApplySpanNames.h>
#include <cstdint>
#include <exception>
#include <memory>
#include <optional>
#include <string_view>
#include <utility>
#pragma push_macro("TRANSACTION")
#undef TRANSACTION
@@ -51,6 +54,47 @@ struct UnknownTxnType : std::exception
}
};
/** Resolve a transaction type to its registered format name.
 *
 * Used to fill the tx_type span attribute.
 *
 * @param txnType Transaction type to look up in TxFormats.
 * @return Pointer to the name's C string, or nullptr when TxFormats has no
 *         entry for the type, letting the caller omit the attribute instead
 *         of writing an empty value.
 */
char const*
txTypeName(TxType txnType)
{
    auto const* const format = TxFormats::getInstance().findByType(txnType);
    return format != nullptr ? format->getName().c_str() : nullptr;
}
/** Open the span for one apply-pipeline stage of a transaction.
 *
 * The span is created with SpanGuard::hashSpan() keyed on the transaction id,
 * so stages of the same transaction that are created this way land in the
 * same trace without any context propagation. When the returned guard is
 * active, the stage and (if the type is known) tx_type attributes are set
 * here; the caller adds ter_result once the stage has run.
 *
 * @param name  Full span name (tx_apply_span::preflight / ::preclaim).
 * @param stage Stage attribute value (tx_apply_span::val::*).
 * @param tx    Transaction that supplies the id and the type.
 * @return The span guard; it tests false when no span is being recorded.
 */
[[nodiscard]] telemetry::SpanGuard
makeStageSpan(std::string_view name, std::string_view stage, STTx const& tx)
{
    namespace keys = telemetry::tx_apply_span::attr;

    auto const id = tx.getTransactionID();
    auto guard = telemetry::SpanGuard::hashSpan(
        telemetry::TraceCategory::Transactions, name, id.data(), id.kBytes);

    // This runs for every transaction, so the format lookup behind
    // txTypeName() is only paid for when the span is actually active.
    if (guard)
    {
        guard.setAttribute(keys::stage, stage);
        if (char const* typeName = txTypeName(tx.getTxnType()))
            guard.setAttribute(keys::txType, typeName);
    }
    return guard;
}
// Call a lambda with the concrete transaction type as a template parameter
// throw an "UnknownTxnType" exception on error
template <class F>
@@ -133,82 +177,122 @@ consequencesHelper(PreflightContext const& ctx)
/** Run the type-specific preflight for ctx.tx inside a "tx.preflight" span.
 *
 * Dispatches on the transaction type through withTxnType() and returns the
 * preflight result code paired with the transaction's consequences. When the
 * code is not a success, the consequences are constructed from that code
 * instead of being computed by consequencesHelper().
 *
 * The result token is written to the span's ter_result attribute on the
 * normal path. A std::exception escaping the stage is recorded on the span
 * with ter_result = tefEXCEPTION and then rethrown.
 */
static std::pair<NotTEC, TxConsequences>
invokePreflight(PreflightContext const& ctx)
{
// Trace the preflight stage. The span shares the transaction's
// deterministic trace_id so it correlates with preclaim and transactor.
auto span = makeStageSpan(
telemetry::tx_apply_span::preflight, telemetry::tx_apply_span::val::preflight, ctx.tx);
try
{
return withTxnType(ctx.rules, ctx.tx.getTxnType(), [&]<typename T>() {
auto result = withTxnType(ctx.rules, ctx.tx.getTxnType(), [&]<typename T>() {
auto const tec = Transactor::invokePreflight<T>(ctx);
return std::make_pair(
tec, isTesSuccess(tec) ? consequencesHelper<T>(ctx) : TxConsequences{tec});
});
// Only format the result token when the span is active.
if (span)
{
span.setAttribute(
telemetry::tx_apply_span::attr::terResult, transToken(result.first).c_str());
}
return result;
}
catch (UnknownTxnType const& e)
{
// Should never happen
// NOTE(review): this path records the exception but sets no ter_result
// attribute, unlike the std::exception handler below, so a filter on
// ter_result will not see it.
// LCOV_EXCL_START
JLOG(ctx.j.fatal()) << "Unknown transaction type in preflight: " << e.txnType;
span.recordException(e);
UNREACHABLE("xrpl::invokePreflight : unknown transaction type");
return {temUNKNOWN, TxConsequences{temUNKNOWN}};
// LCOV_EXCL_STOP
}
catch (std::exception const& e)
{
// The caller's preflight() maps this to tefEXCEPTION. Record it on the
// span before unwinding so per-stage error counts include exceptions.
span.setAttribute(
telemetry::tx_apply_span::attr::terResult, transToken(tefEXCEPTION).c_str());
span.recordException(e);
throw;
}
}
/** Run the type-specific preclaim checks for ctx.tx inside a "tx.preclaim" span.
 *
 * Dispatches on the transaction type through withTxnType(). For a non-zero
 * account it runs, in order, checkSeqProxy, checkPriorTxAndLastLedger,
 * checkPermission and checkSign (all NotTEC), then checkFee, and returns the
 * first non-success result; otherwise it returns T::preclaim(ctx).
 *
 * The result token is written to the span's ter_result attribute on the
 * normal path. A std::exception escaping the stage is recorded on the span
 * with ter_result = tefEXCEPTION and then rethrown.
 */
static TER
invokePreclaim(PreclaimContext const& ctx)
{
// Trace the preclaim stage under the transaction's deterministic trace_id.
auto span = makeStageSpan(
telemetry::tx_apply_span::preclaim, telemetry::tx_apply_span::val::preclaim, ctx.tx);
try
{
// use name hiding to accomplish compile-time polymorphism of static
// class functions for Transactor and derived classes.
return withTxnType(ctx.view.rules(), ctx.tx.getTxnType(), [&]<typename T>() -> TER {
// preclaim functionality is divided into two sections:
// 1. Up to and including the signature check: returns NotTEC.
// All transaction checks before and including checkSign
// MUST return NotTEC, or something more restrictive.
// Allowing tec results in these steps risks theft or
// destruction of funds, as a fee will be charged before the
// signature is checked.
// 2. After the signature check: returns TER.
TER const preclaimTer =
withTxnType(ctx.view.rules(), ctx.tx.getTxnType(), [&]<typename T>() -> TER {
// preclaim functionality is divided into two sections:
// 1. Up to and including the signature check: returns NotTEC.
// All transaction checks before and including checkSign
// MUST return NotTEC, or something more restrictive.
// Allowing tec results in these steps risks theft or
// destruction of funds, as a fee will be charged before the
// signature is checked.
// 2. After the signature check: returns TER.
// If the transactor requires a valid account and the
// transaction doesn't list one, preflight will have already
// flagged a failure.
auto const id = ctx.tx.getAccountID(sfAccount);
// If the transactor requires a valid account and the
// transaction doesn't list one, preflight will have already
// flagged a failure.
auto const id = ctx.tx.getAccountID(sfAccount);
if (id != beast::kZero)
{
if (NotTEC const preSigResult = [&]() -> NotTEC {
if (NotTEC const result = T::checkSeqProxy(ctx.view, ctx.tx, ctx.j))
return result;
if (id != beast::kZero)
{
if (NotTEC const preSigResult = [&]() -> NotTEC {
if (NotTEC const result = T::checkSeqProxy(ctx.view, ctx.tx, ctx.j))
return result;
if (NotTEC const result = T::checkPriorTxAndLastLedger(ctx))
return result;
if (NotTEC const result = T::checkPriorTxAndLastLedger(ctx))
return result;
if (NotTEC const result = T::checkPermission(ctx.view, ctx.tx))
return result;
if (NotTEC const result = T::checkPermission(ctx.view, ctx.tx))
return result;
if (NotTEC const result = T::checkSign(ctx))
return result;
if (NotTEC const result = T::checkSign(ctx))
return result;
return tesSUCCESS;
}())
return preSigResult;
return tesSUCCESS;
}())
return preSigResult;
if (TER const result = T::checkFee(ctx, calculateBaseFee(ctx.view, ctx.tx)))
return result;
}
if (TER const result = T::checkFee(ctx, calculateBaseFee(ctx.view, ctx.tx)))
return result;
}
return T::preclaim(ctx);
});
return T::preclaim(ctx);
});
// Only format the result token when the span is active.
if (span)
{
span.setAttribute(
telemetry::tx_apply_span::attr::terResult, transToken(preclaimTer).c_str());
}
return preclaimTer;
}
catch (UnknownTxnType const& e)
{
// Should never happen
// NOTE(review): this path records the exception but sets no ter_result
// attribute, unlike the std::exception handler below, so a filter on
// ter_result will not see it.
// LCOV_EXCL_START
JLOG(ctx.j.fatal()) << "Unknown transaction type in preclaim: " << e.txnType;
span.recordException(e);
UNREACHABLE("xrpl::invokePreclaim : unknown transaction type");
return temUNKNOWN;
// LCOV_EXCL_STOP
}
catch (std::exception const& e)
{
// The caller's preclaim() maps this to tefEXCEPTION. Record it on the
// span before unwinding so per-stage error counts include exceptions.
span.setAttribute(
telemetry::tx_apply_span::attr::terResult, transToken(tefEXCEPTION).c_str());
span.recordException(e);
throw;
}
}
/**

View File

@@ -0,0 +1,52 @@
#include <xrpl/tx/detail/TxApplySpanNames.h>
#include <gtest/gtest.h>
#include <string_view>
/** Contract tests for the transaction apply-pipeline span constants.
*
* The span names and attribute keys in TxApplySpanNames.h are a cross-component
* contract: the collector spanmetrics connector aggregates on these exact
* strings (dimensions tx_type, ter_result, stage) and the Grafana
* transaction-overview dashboard queries them. A silent rename here would
* break per-stage metrics with no compile error, so these tests pin the
* literal values. They need no telemetry runtime and run in every build.
*/
using namespace xrpl::telemetry;
TEST(TxApplySpanNames, span_names_are_dot_qualified)
{
    // applySteps.cpp passes these pre-joined "tx.<op>" names straight to
    // SpanGuard::hashSpan(), so the joined form is what must stay fixed.
    namespace spans = tx_apply_span;

    EXPECT_EQ(std::string_view(spans::preflight), "tx.preflight");
    EXPECT_EQ(std::string_view(spans::preclaim), "tx.preclaim");
}
TEST(TxApplySpanNames, operation_suffixes)
{
    // Transactor.cpp builds its span name from seg::tx plus one of these
    // suffixes via SpanGuard::span(cat, seg::tx, suffix).
    namespace suffix = tx_apply_span::op;

    EXPECT_EQ(std::string_view(suffix::preflight), "preflight");
    EXPECT_EQ(std::string_view(suffix::preclaim), "preclaim");
    EXPECT_EQ(std::string_view(suffix::transactor), "transactor");
}
TEST(TxApplySpanNames, attribute_keys_match_collector_dimensions)
{
    // The spanmetrics dimensions in
    // docker/telemetry/otel-collector-config.yaml and the keys in
    // TxSpanNames.h use these exact strings; both span sets only aggregate
    // under one dimension while the keys stay identical.
    namespace key = tx_apply_span::attr;

    EXPECT_EQ(std::string_view(key::stage), "stage");
    EXPECT_EQ(std::string_view(key::txType), "tx_type");
    EXPECT_EQ(std::string_view(key::terResult), "ter_result");
    EXPECT_EQ(std::string_view(key::applied), "applied");
}
TEST(TxApplySpanNames, stage_values_are_the_three_pipeline_stages)
{
    // Exactly three values are carried by the stage attribute. They set the
    // cardinality (3) of the spanmetrics `stage` dimension and are what the
    // dashboard filters on.
    struct StageCase
    {
        std::string_view actual;
        std::string_view expected;
    };

    StageCase const stageCases[] = {
        {std::string_view(tx_apply_span::val::preflight), "preflight"},
        {std::string_view(tx_apply_span::val::preclaim), "preclaim"},
        {std::string_view(tx_apply_span::val::apply), "apply"},
    };

    for (auto const& stageCase : stageCases)
        EXPECT_EQ(stageCase.actual, stageCase.expected);
}

View File

@@ -607,7 +607,8 @@ TxQ::tryClearAccountQueueUpThruTx(
if (txResult.applied)
{
// All of the queued transactions applied, so remove them from the
// queue. `dist` queued txs preceded the current one in the batch.
span.setAttribute(txq_span::attr::numCleared, static_cast<std::int64_t>(dist));
endTxIter = erase(accountIter->second, beginTxIter, endTxIter);
// If `tx` is replacing a queued tx, delete that one, too.
if (endTxIter != accountIter->second.transactions.end() && endTxIter->first == tSeqProx)
@@ -744,6 +745,9 @@ TxQ::apply(
span.setAttribute(txq_span::attr::txHash, to_string(tx->getTransactionID()).c_str());
if (auto const* fmt = TxFormats::getInstance().findByType(tx->getTxnType()))
span.setAttribute(txq_span::attr::txType, fmt->getName().c_str());
// Default outcome; overridden below on the direct-apply and queued paths.
// Every other early return leaves the tx rejected from the queue.
span.setAttribute(txq_span::attr::txqStatus, txq_span::val::rejected);
NumberSO const stNumberSO{view.rules().enabled(fixUniversalNumber)};
@@ -757,7 +761,10 @@ TxQ::apply(
// See if the transaction paid a high enough fee that it can go straight
// into the ledger.
if (auto directApplied = tryDirectApply(app, view, tx, flags, j))
{
span.setAttribute(txq_span::attr::txqStatus, txq_span::val::appliedDirect);
return *directApplied;
}
if ((flags & TapDryRun) != 0u)
return {telCAN_NOT_QUEUE, false};
@@ -884,6 +891,10 @@ TxQ::apply(
auto const metricsSnapshot = feeMetrics_.getSnapshot();
auto const feeLevelPaid = getFeeLevelPaid(view, *tx);
auto const requiredFeeLevel = getRequiredFeeLevel(view, flags, metricsSnapshot, lock);
span.setAttribute(
txq_span::attr::feeLevelPaid, static_cast<std::int64_t>(feeLevelPaid.value()));
span.setAttribute(
txq_span::attr::requiredFeeLevel, static_cast<std::int64_t>(requiredFeeLevel.value()));
// Is there a blocker already in the account's queue? If so, don't
// allow additional transactions in the queue.
@@ -1217,6 +1228,7 @@ TxQ::apply(
/* Can't erase (*replacedTxIter) here because success
implies that it has already been deleted.
*/
span.setAttribute(txq_span::attr::txqStatus, txq_span::val::applied);
return result;
}
}
@@ -1332,6 +1344,7 @@ TxQ::apply(
<< " to queue."
<< " Flags: " << flags;
span.setAttribute(txq_span::attr::txqStatus, txq_span::val::queued);
return {terQUEUED, false};
}
@@ -1366,18 +1379,21 @@ TxQ::processClosedLedger(Application& app, ReadView const& view, bool timeLeap)
maxSize_ = std::max(snapshot.txnsExpected * setup_.ledgersInQueue, setup_.queueSizeMin);
// Remove any queued candidates whose LastLedgerSequence has gone by.
std::int64_t expiredCount = 0;
for (auto candidateIter = byFee_.begin(); candidateIter != byFee_.end();)
{
if (candidateIter->lastValid && *candidateIter->lastValid <= ledgerSeq)
{
byAccount_.at(candidateIter->account).dropPenalty = true;
candidateIter = erase(candidateIter);
++expiredCount;
}
else
{
++candidateIter;
}
}
span.setAttribute(txq_span::attr::expiredCount, expiredCount);
// Remove any TxQAccounts that don't have candidates
// under them

View File

@@ -15,12 +15,14 @@
* | +--------------------------------------------------+ |
* | | txq.enqueue | |
* | | TxQ::apply() | |
* | | attrs: tx_hash, tx_type, txq_status, | |
* | | fee_level_paid, required_fee_level | |
* | | | |
* | | +-------------------+ +----------------------+ | |
* | | | txq.apply_direct | | txq.batch_clear | | |
* | | | tryDirectApply() | | tryClearAccount...() | | |
* | | +-------------------+ | attrs: num_cleared | | |
* | | +----------------------+ | |
* | +--------------------------------------------------+ |
* +-------------------------------------------------------+
*