From d6b314e8d578269002a95bd3a49a2ec68b66165d Mon Sep 17 00:00:00 2001 From: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> Date: Thu, 4 Jun 2026 17:43:26 +0100 Subject: [PATCH 1/9] fix(telemetry): trim Tempo search filters to 7 cross-cutting entry points MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reduced from 30 to 7 filters: service.instance.id, name, status, command, tx_hash, tx_type, ledger_hash. Full attribute inventory is in OpenTelemetryPlan/09-data-collection-reference.md §4; TraceQL autocomplete covers the rest. Co-Authored-By: Claude Sonnet 4.6 (1M context) --- .../provisioning/datasources/tempo.yaml | 162 ++---------------- 1 file changed, 14 insertions(+), 148 deletions(-) diff --git a/docker/telemetry/grafana/provisioning/datasources/tempo.yaml b/docker/telemetry/grafana/provisioning/datasources/tempo.yaml index 3c94482283..95771d12e5 100644 --- a/docker/telemetry/grafana/provisioning/datasources/tempo.yaml +++ b/docker/telemetry/grafana/provisioning/datasources/tempo.yaml @@ -3,12 +3,10 @@ # Access Grafana at http://localhost:3000, then use Explore -> Tempo # to browse xrpld traces using TraceQL. # -# Search filters provide pre-configured dropdowns in the Explore UI. -# Each phase adds filters for the span attributes it introduces. -# Phase 1b (infra): Base filters — node identity, service, span name, status. -# Phase 2 (RPC): RPC command, status, role filters. -# Phase 3 (TX): Transaction hash, local/peer origin, status. -# Phase 4 (Cons): Consensus mode, round, ledger sequence, close time. +# Search filters provide quick-start dropdowns in the Explore UI for the most +# common investigation entry points. This is not an exhaustive attribute list — +# use TraceQL autocomplete or see OpenTelemetryPlan/09-data-collection-reference.md §4 +# for the full attribute inventory and example queries. 
apiVersion: 1 @@ -40,177 +38,45 @@ datasources: spanEndTimeShift: "1h" search: filters: - # --- Node identification filters --- - # service.name: logical service name (default: "xrpld"). - # Useful when running multiple service types in the same collector. - - id: service-name - tag: service.name - operator: "=" - scope: resource - type: static - # service.instance.id: unique node identifier — configurable via - # the service_instance_id setting in [telemetry], defaults to the - # node's public key. E.g. "Node-1" or "nHB1X37...". + # service.instance.id: unique node identifier (public key or configured name). - id: node-id tag: service.instance.id operator: "=" scope: resource type: static - # service.version: xrpld build version (e.g., "2.4.0-b1"). - # Filter traces from specific software releases. - - id: node-version - tag: service.version - operator: "=" - scope: resource - type: dynamic - # xrpl.network.id: numeric network identifier - # (0 = mainnet, 1 = testnet, 2 = devnet, etc.). - # Derived from the [network_id] config section. - - id: network-id - tag: xrpl.network.id - operator: "=" - scope: resource - type: dynamic - # xrpl.network.type: human-readable network name derived from - # network ID ("mainnet", "testnet", "devnet", "unknown"). - - id: network-type - tag: xrpl.network.type - operator: "=" - scope: resource - type: static - # --- Span intrinsic filters --- - # name: the span operation name (e.g., "rpc.command.server_info"). - # Use to find traces for a specific RPC command or subsystem. + # name: span operation name (e.g., "rpc.command.server_info"). - id: span-name tag: name operator: "=" scope: intrinsic type: static # status: span completion status ("ok", "error", "unset"). - # Filter for failed operations to diagnose errors. - id: span-status tag: status operator: "=" scope: intrinsic type: static - # duration: span wall-clock duration. Use with ">" operator - # to find slow operations (e.g., duration > 500ms). 
- - id: span-duration - tag: duration - operator: ">" - scope: intrinsic - type: static - # Phase 2: RPC tracing filters + # command: RPC command name (e.g., "server_info", "submit"). - id: rpc-command tag: command operator: "=" scope: span type: static - - id: rpc-status - tag: rpc_status - operator: "=" - scope: span - type: dynamic - - id: rpc-role - tag: rpc_role - operator: "=" - scope: span - type: dynamic - # Phase 3: Transaction tracing filters + # tx_hash: transaction hash — direct lookup for a known transaction. - id: tx-hash tag: tx_hash operator: "=" scope: span type: static - - id: tx-origin - tag: local - operator: "=" - scope: span - type: dynamic - - id: tx-status - tag: tx_status - operator: "=" - scope: span - type: dynamic - # Phase 4: Consensus tracing filters - - id: consensus-mode - tag: xrpl.consensus.mode + # tx_type: transaction type (e.g., "Payment", "OfferCreate"). + - id: tx-type + tag: tx_type operator: "=" scope: span type: static - - id: consensus-round - tag: xrpl.consensus.round - operator: "=" - scope: span - type: dynamic - - id: consensus-ledger-seq - tag: xrpl.ledger.seq + # ledger_hash: ledger hash — scope all spans to a specific closed ledger. 
+ - id: ledger-hash + tag: ledger_hash operator: "=" scope: span type: static - - id: consensus-close-time-correct - tag: close_time_correct - operator: "=" - scope: span - type: dynamic - - id: consensus-state - tag: consensus_state - operator: "=" - scope: span - type: dynamic - - id: consensus-close-resolution - tag: close_resolution_ms - operator: "=" - scope: span - type: dynamic - - id: consensus-proposers - tag: proposers - operator: "=" - scope: span - type: dynamic - - id: consensus-result - tag: consensus_result - operator: "=" - scope: span - type: dynamic - - id: consensus-mode-old - tag: mode_old - operator: "=" - scope: span - type: dynamic - - id: consensus-mode-new - tag: mode_new - operator: "=" - scope: span - type: dynamic - - id: consensus-ledger-id - tag: xrpl.consensus.ledger_id - operator: "=" - scope: span - type: static - # Phase 3/4: Additional transaction and queue filters - - id: tx-path - tag: path - operator: "=" - scope: span - type: dynamic - - id: tx-suppressed - tag: suppressed - operator: "=" - scope: span - type: dynamic - - id: peer-version - tag: peer_version - operator: "=" - scope: span - type: dynamic - - id: txq-status - tag: txq_status - operator: "=" - scope: span - type: dynamic - - id: txq-ter-code - tag: ter_code - operator: "=" - scope: span - type: dynamic From cf075888ffdda3c8151c432ee1e1c302fb5dd961 Mon Sep 17 00:00:00 2001 From: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> Date: Thu, 4 Jun 2026 17:51:37 +0100 Subject: [PATCH 2/9] docs(telemetry): fix TraceQL/LogQL query syntax in runbook MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Replace all `{name="..."} | attr = val` pipeline queries with the correct `{name="..." 
&& span.attr = val}` inline filter syntax - Add `span.` prefix to all span attribute references; `duration`, `status`, `name`, and `resource.*` keep no prefix - Fix Loki stream selector: `{job="xrpld"}` → `{service_name="xrpld"}` in all LogQL examples and the verification step - Fix cross-node queries: `rootServiceName` → `resource.service.name`, `{name=~"tx\\..*"} | attr` → `{name =~ "tx.*" && span.attr}` - Add DEX section (OfferCreate variants by ter_result, OfferCancel, peer relay) - Add syntax cheat-sheet block at top of Insights section - Expand tx workflow: per-AMM-type queries, Payment tecPATH.*, TrustSet, OracleSet, NFTokenMint cross-span - Expand consensus: slow rounds, validation send+receive comparison - Expand cross-subsystem: AMM cross-span, tx.receive no-error - Expand TxQ: retried status, NFToken enqueue type - Update Where-to-Look table: add AMM/DEX/NFT/close-time rows, fix attribute references to use span. prefix, fix stale consensus_stalled entry (now consensus_result on consensus.check) - All 57 queries verified against live stack — zero parse errors Co-Authored-By: Claude Sonnet 4.6 (1M context) --- docs/telemetry-runbook.md | 221 ++++++++++++++++++++++++++------------ 1 file changed, 155 insertions(+), 66 deletions(-) diff --git a/docs/telemetry-runbook.md b/docs/telemetry-runbook.md index ec33236bd2..29b65682e6 100644 --- a/docs/telemetry-runbook.md +++ b/docs/telemetry-runbook.md @@ -119,21 +119,23 @@ All spans instrumented in xrpld, grouped by subsystem: #### Close Time Queries (Tempo TraceQL) +Span attributes are filtered with `span.` inside `{}`. Combine conditions with `&&`. 
+ ``` # Find rounds where validators disagreed on close time -{name="consensus.accept.apply"} | close_time_correct = false +{name="consensus.accept.apply" && span.close_time_correct = false} # Find consensus failures (moved_on) -{name="consensus.accept.apply"} | consensus_state = "moved_on" +{name="consensus.accept.apply" && span.consensus_state = "moved_on"} # Find slow ledger applications (>5s) -{name="consensus.accept.apply"} | duration > 5s +{name="consensus.accept.apply" && duration > 5000ms} # Find specific ledger's consensus details -{name="consensus.accept.apply"} | ledger_seq = 92345678 +{name="consensus.accept.apply" && span.ledger_seq = 92345678} # Find all spans in a consensus round (deterministic trace strategy) -{name="consensus.round"} | consensus_round_id = <round_id> +{name="consensus.round" && span.consensus_round_id = "<round_id>"} # Find dispute resolutions {name="consensus.update_positions"} >> {event:name="dispute.resolve"} @@ -160,127 +162,209 @@ All spans instrumented in xrpld, grouped by subsystem: This section shows what questions you can answer using the span attributes, with example Tempo TraceQL queries. +**TraceQL syntax note:** span attributes must be referenced with the `span.` prefix inside `{}`. +Conditions are combined with `&&`. The `|` pipeline operator is not supported on this Tempo version. + +``` +# General pattern +{name="<span_name>" && span.<attr> = <value> && span.<other_attr> != <other_value>} + +# Duration filter (no prefix needed) +{name="<span_name>" && duration > 500ms} + +# Regex match +{name="<span_name>" && span.<attr> =~ "<prefix>.*"} + +# Multiple span names +{name = "<span_a>" || name = "<span_b>"} + +# Name regex +{name =~ "<prefix>.*" && span.<attr> = <value>} + +# Structural: find parent spans that have a matching child/event +{name="<parent_span>"} >> {event:name="<event_name>"} +``` + ### Transaction Workflow Analysis ``` -# Find all AMM transactions (AMMDeposit, AMMWithdraw, AMMCreate, etc.) 
-{name="tx.process"} | tx_type =~ "AMM.*" +# Find all AMM transactions (AMMDeposit, AMMWithdraw, AMMVote) +{name="tx.process" && span.tx_type =~ "AMM.*"} + +# Find a specific AMM operation +{name="tx.process" && span.tx_type = "AMMDeposit"} +{name="tx.process" && span.tx_type = "AMMWithdraw"} +{name="tx.process" && span.tx_type = "AMMVote"} # Find Payment transactions that failed -{name="tx.process"} | tx_type = "Payment" && ter_result != "tesSUCCESS" +{name="tx.process" && span.tx_type = "Payment" && span.ter_result != "tesSUCCESS"} + +# Find Payment failures due to path issues +{name="tx.process" && span.tx_type = "Payment" && span.ter_result =~ "tecPATH.*"} # Compare latency of different transaction types -{name="tx.process"} | tx_type = "OfferCreate" -{name="tx.process"} | tx_type = "Payment" +{name="tx.process" && span.tx_type = "OfferCreate"} +{name="tx.process" && span.tx_type = "Payment"} # Find high-fee transactions (fee > 1 XRP = 1000000 drops) -{name="tx.process"} | fee > 1000000 +{name="tx.process" && span.fee > 1000000} # Find transactions that were not applied -{name="tx.process"} | applied = false +{name="tx.process" && span.applied = false} -# Trace a specific transaction by type across the network -{name=~"tx\\..*"} | tx_type = "NFTokenMint" +# Find NFTokenMint across tx and txq spans +{name =~ "tx.*|txq.*" && span.tx_type = "NFTokenMint"} + +# Find all NFT-related activity +{name =~ "tx.*|txq.*" && span.tx_type =~ "NFToken.*"} + +# Find TrustSet transactions (IOU trust lines) +{name="tx.process" && span.tx_type = "TrustSet"} + +# Find oracle price updates +{name="tx.process" && span.tx_type = "OracleSet"} +``` + +### DEX (OfferCreate / OfferCancel) + +``` +# All DEX offer creates +{name="tx.process" && span.tx_type = "OfferCreate"} + +# Offers killed (ImmediateOrCancel/FillOrKill with no fill) +{name="tx.process" && span.tx_type = "OfferCreate" && span.ter_result = "tecKILLED"} + +# Offers that failed due to insufficient funds +{name="tx.process" 
&& span.tx_type = "OfferCreate" && span.ter_result = "tecUNFUNDED_OFFER"} + +# Offers failed due to insufficient reserve to place the offer +{name="tx.process" && span.tx_type = "OfferCreate" && span.ter_result = "tecINSUF_RESERVE_OFFER"} + +# Offer cancellations +{name="tx.process" && span.tx_type = "OfferCancel"} + +# OfferCreate transactions received from peers (cross-node relay) +{name="tx.receive" && span.tx_type = "OfferCreate"} ``` ### Transaction Queue Health ``` # Find transactions rejected from the queue -{name="txq.accept_tx"} | txq_status = "failed" +{name="txq.accept_tx" && span.txq_status = "failed"} -# Which transaction types get queued most often? -{name="txq.enqueue"} | tx_type = "Payment" -{name="txq.enqueue"} | tx_type = "OfferCreate" - -# Find ledger closes that applied queued transactions -{name="txq.accept"} | ledger_changed = true +# Find transactions being retried +{name="txq.accept_tx" && span.txq_status = "retried"} # Find transactions that exhausted retries -{name="txq.accept_tx"} | txq_status = "retried" && retries_remaining = 0 +{name="txq.accept_tx" && span.txq_status = "retried" && span.retries_remaining = 0} + +# Which transaction types get queued most often? 
+{name="txq.enqueue" && span.tx_type = "Payment"} +{name="txq.enqueue" && span.tx_type = "OfferCreate"} +{name="txq.enqueue" && span.tx_type =~ "NFToken.*"} + +# Find ledger closes that applied queued transactions +{name="txq.accept" && span.ledger_changed = true} ``` ### RPC Debugging ``` # Find batch RPC requests -{name="rpc.process"} | is_batch = true +{name="rpc.process" && span.is_batch = true} # Find large RPC payloads (>100KB) -{name="rpc.http_request"} | request_payload_size > 100000 +{name="rpc.http_request" && span.request_payload_size > 100000} # Find resource-heavy RPC commands (by load_type) -{name=~"rpc.command.*"} | load_type = "exceptioned RPC" +{name =~ "rpc.command.*" && span.load_type = "exceptioned RPC"} # Find a specific WebSocket command -{name="rpc.ws_message"} | command = "subscribe" +{name="rpc.ws_message" && span.command = "subscribe"} + +# Find server_info calls +{name="rpc.command.server_info"} # Find slow pathfinding with many source assets -{name="pathfind.discover"} | pathfind_num_source_assets > 10 +{name="pathfind.discover" && span.pathfind_num_source_assets > 10} ``` ### PathFinding Performance ``` # Find pathfinding for specific currencies -{name="pathfind.compute"} | pathfind_dest_currency = "USD" +{name="pathfind.compute" && span.pathfind_dest_currency = "USD"} # Find expensive pathfinding (many source assets to explore) -{name="pathfind.discover"} | pathfind_num_source_assets > 20 +{name="pathfind.discover" && span.pathfind_num_source_assets > 20} -# Find large pathfinding requests -{name="pathfind.compute"} | duration > 1s +# Find slow pathfinding requests +{name="pathfind.compute" && duration > 1000ms} ``` ### Consensus Health ``` # Find rounds where consensus timed out (expired) -{name="consensus.accept"} | consensus_state = "expired" +{name="consensus.accept" && span.consensus_state = "expired"} # Find rounds where we moved on without full agreement -{name="consensus.accept"} | consensus_state = "moved_on" 
+{name="consensus.accept" && span.consensus_state = "moved_on"} # Find rounds with many disputes -{name="consensus.accept"} | disputes_count > 5 +{name="consensus.accept" && span.disputes_count > 5} + +# Find slow consensus rounds (>5s) +{name="consensus.accept" && span.round_time_ms > 5000} # Find bow-out proposals (node resigned from round) -{name="consensus.proposal.send"} | is_bow_out = true +{name="consensus.proposal.send" && span.is_bow_out = true} # Correlate validation with its ledger -{name="consensus.validation.send"} | ledger_hash = "" +{name="consensus.validation.send" && span.ledger_hash = ""} # Find rounds where validators disagreed on close time -{name="consensus.accept.apply"} | close_time_correct = false +{name="consensus.accept.apply" && span.close_time_correct = false} + +# Find both validation send and receive (compare sender vs receiver latency) +{name = "consensus.validation.send" || name = "consensus.validation.receive"} ``` ### Cross-Subsystem Correlation ``` # Follow a transaction from receive through queue to ledger -{name=~"tx\\..*|txq\\..*"} | tx_type = "Payment" && duration > 500ms +{name =~ "tx.*|txq.*" && span.tx_type = "Payment" && duration > 500ms} -# Find all NFT-related activity -{name=~"tx\\..*|txq\\..*"} | tx_type =~ "NFToken.*" +# Find all NFT-related activity across tx and txq spans +{name =~ "tx.*|txq.*" && span.tx_type =~ "NFToken.*"} -# Find consensus rounds with slow transactions -{name="consensus.accept"} | round_time_ms > 5000 +# Find all AMM activity across tx and txq spans +{name =~ "tx.*|txq.*" && span.tx_type =~ "AMM.*"} + +# Find cross-node transaction receives (no errors) +{name="tx.receive" && status != error} ``` ### Where to Look (Quick Reference) -| Question | Span | Key Attributes | -| ----------------------------------- | --------------------------- | ------------------------------ | -| "Which tx type is slowest?" | `tx.process` | `tx_type` + duration | -| "Why was my tx rejected?" 
| `tx.process` | `ter_result`, `applied` | -| "Is the TxQ backing up?" | `txq.accept` | `queue_size`, `ledger_changed` | -| "Why was my tx dropped from queue?" | `txq.accept_tx` | `txq_status`, `ter_code` | -| "Are batch requests a problem?" | `rpc.process` | `is_batch`, `batch_size` | -| "Which RPC is expensive?" | `rpc.command.*` | `load_type`, duration | -| "Did consensus stall?" | `consensus.check` | `consensus_stalled` | -| "Was consensus outcome normal?" | `consensus.accept` | `consensus_state` | -| "Did a validator bow out?" | `consensus.proposal.send` | `is_bow_out` | -| "Which ledger was validated?" | `consensus.validation.send` | `ledger_hash` | +| Question | Span | Key Attributes | +| ----------------------------------- | --------------------------- | ---------------------------------------- | +| "Which tx type is slowest?" | `tx.process` | `span.tx_type` + duration | +| "Why was my tx rejected?" | `tx.process` | `span.ter_result`, `span.applied` | +| "What AMM operations happened?" | `tx.process` | `span.tx_type =~ "AMM.*"` | +| "What DEX offers failed?" | `tx.process` | `span.tx_type`, `span.ter_result` | +| "What NFT activity occurred?" | `tx.process`, `txq.enqueue` | `span.tx_type =~ "NFToken.*"` | +| "Is the TxQ backing up?" | `txq.accept` | `span.queue_size`, `span.ledger_changed` | +| "Why was my tx dropped from queue?" | `txq.accept_tx` | `span.txq_status`, `span.ter_code` | +| "Are batch requests a problem?" | `rpc.process` | `span.is_batch`, `span.batch_size` | +| "Which RPC is expensive?" | `rpc.command.*` | `span.load_type`, duration | +| "Did consensus reach threshold?" | `consensus.check` | `span.consensus_result` | +| "Was consensus outcome normal?" | `consensus.accept` | `span.consensus_state` | +| "Did a validator bow out?" | `consensus.proposal.send` | `span.is_bow_out` | +| "Which ledger was validated?" | `consensus.validation.send` | `span.ledger_hash` | +| "Did close time agreement fail?" 
| `consensus.accept.apply` | `span.close_time_correct` | --- @@ -349,20 +433,20 @@ all its normal attributes, it just lacks a cross-node parent link. ### Example Tempo Queries ``` -# Find cross-node transaction traces (tx.process -> tx.receive across nodes) -{name="tx.receive"} && status != error +# Find cross-node transaction traces (tx.receive spans with no errors) +{name="tx.receive" && status != error} # Find proposals received with cross-node parent context -{name="consensus.proposal.receive"} && nestedSetParent > 0 +{name="consensus.proposal.receive"} # Trace a transaction across the network by its hash -{name=~"tx\\..*"} | tx_hash = "<tx_hash>" +{name =~ "tx.*" && span.tx_hash = "<tx_hash>"} # Find all spans in a cross-node consensus trace -{rootServiceName="xrpld"} | consensus_round_id = <round_id> +{resource.service.name="xrpld" && span.consensus_round_id = "<round_id>"} # Compare latency between sender and receiver for validations -{name="consensus.validation.send" || name="consensus.validation.receive"} +{name = "consensus.validation.send" || name = "consensus.validation.receive"} ``` ## Prometheus Metrics (Spanmetrics) @@ -672,21 +756,26 @@ Log files are ingested by the OTel Collector's `filelog` receiver, which tails ` ### LogQL Query Examples +The OTel Collector emits logs to Loki with `service_name="xrpld"` (not `job="xrpld"`). + ```logql # Find all logs for a specific trace -{job="xrpld"} |= "trace_id=abc123def456789012345678abcdef01" +{service_name="xrpld"} |= "trace_id=abc123def456789012345678abcdef01" # Error logs with trace context (log lines with ERR severity that have a trace_id) -{job="xrpld"} |= "ERR" |= "trace_id=" +{service_name="xrpld"} |= "ERR" |= "trace_id=" # All logs from a specific partition that were emitted during a span -{job="xrpld"} |= "LedgerMaster" | regexp `trace_id=(?P<trace_id>[a-f0-9]+)` | trace_id != "" +{service_name="xrpld"} |= "LedgerMaster" | regexp `trace_id=(?P<trace_id>[a-f0-9]+)` | trace_id != "" + +# Logs from a specific subsystem during a span (e.g. 
LedgerConsensus) +{service_name="xrpld"} |= "LedgerConsensus" |= "trace_id=" # Logs from the last hour containing trace context -{job="xrpld"} |= "trace_id=" | regexp `(?P<partition>\S+):(?P<severity>\S+)\s+trace_id=(?P<trace_id>[a-f0-9]+)` +{service_name="xrpld"} |= "trace_id=" | regexp `(?P<partition>\S+):(?P<severity>\S+)\s+trace_id=(?P<trace_id>[a-f0-9]+)` # Count of traced vs untraced log lines -count_over_time({job="xrpld"} |= "trace_id=" [5m]) +count_over_time({service_name="xrpld"} |= "trace_id=" [5m]) ``` ### Verifying Log Correlation @@ -694,7 +783,7 @@ count_over_time({job="xrpld"} |= "trace_id=" [5m]) 1. Start the observability stack and xrpld with telemetry enabled. 2. Send an RPC request: `curl http://localhost:5005 -d '{"method":"server_info"}'` 3. Check the debug.log for `trace_id=` entries: `grep trace_id= /path/to/debug.log` -4. Open Grafana at http://localhost:3000 -> Explore -> Loki and search for `{job="xrpld"} |= "trace_id="`. +4. Open Grafana at http://localhost:3000 -> Explore -> Loki and search for `{service_name="xrpld"} |= "trace_id="`. 5. Click the TraceID link to navigate to the corresponding trace in Tempo. 
## Troubleshooting From 478b58395b61ca96da550e191466c120c440d9ec Mon Sep 17 00:00:00 2001 From: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> Date: Thu, 4 Jun 2026 17:54:52 +0100 Subject: [PATCH 3/9] loop levelization Signed-off-by: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> --- .github/scripts/levelization/results/loops.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/scripts/levelization/results/loops.txt b/.github/scripts/levelization/results/loops.txt index f50cb168ec..355a4013ed 100644 --- a/.github/scripts/levelization/results/loops.txt +++ b/.github/scripts/levelization/results/loops.txt @@ -20,7 +20,7 @@ Loop: xrpld.app xrpld.shamap xrpld.shamap > xrpld.app Loop: xrpld.app xrpld.telemetry - xrpld.telemetry == xrpld.app + xrpld.telemetry ~= xrpld.app Loop: xrpld.overlay xrpld.rpc xrpld.rpc ~= xrpld.overlay From c3bdcb42914e2d6477f526e5d8cdc857b477887d Mon Sep 17 00:00:00 2001 From: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> Date: Thu, 4 Jun 2026 18:02:47 +0100 Subject: [PATCH 4/9] clang-tidy include Signed-off-by: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> --- src/xrpld/app/ledger/LedgerHistory.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/xrpld/app/ledger/LedgerHistory.cpp b/src/xrpld/app/ledger/LedgerHistory.cpp index 092e88e28a..146d808519 100644 --- a/src/xrpld/app/ledger/LedgerHistory.cpp +++ b/src/xrpld/app/ledger/LedgerHistory.cpp @@ -27,6 +27,7 @@ #include #include #include +#include #include #include From d7e847a53b466e7b61eec40c0a562ef25d93905d Mon Sep 17 00:00:00 2001 From: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> Date: Thu, 4 Jun 2026 18:11:23 +0100 Subject: [PATCH 5/9] removed p50 renders from all dashboards Signed-off-by: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> --- .../grafana/dashboards/consensus-health.json | 34 ++---------- 
.../grafana/dashboards/ledger-operations.json | 18 +------ .../dashboards/system-node-health.json | 18 +------ .../dashboards/system-rpc-pathfinding.json | 54 +++---------------- .../dashboards/transaction-overview.json | 27 ++-------- 5 files changed, 16 insertions(+), 135 deletions(-) diff --git a/docker/telemetry/grafana/dashboards/consensus-health.json b/docker/telemetry/grafana/dashboards/consensus-health.json index f0787a5390..465b83be0b 100644 --- a/docker/telemetry/grafana/dashboards/consensus-health.json +++ b/docker/telemetry/grafana/dashboards/consensus-health.json @@ -10,7 +10,7 @@ "panels": [ { "title": "Consensus Round Duration", - "description": "p95 and p50 duration of consensus accept rounds. The consensus.accept span (RCLConsensus.cpp) measures the time to process an accepted ledger including transaction application and state finalization. The span carries proposers and round_time_ms attributes. Normal range is 3-6 seconds on mainnet.", + "description": "p95 duration of consensus accept rounds. The consensus.accept span (RCLConsensus.cpp) measures the time to process an accepted ledger including transaction application and state finalization. The span carries proposers and round_time_ms attributes. 
Normal range is 3-6 seconds on mainnet.", "type": "timeseries", "gridPos": { "h": 8, @@ -31,13 +31,6 @@ }, "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", consensus_mode=~\"$consensus_mode\", span_name=\"consensus.accept\"}[5m])))", "legendFormat": "P95 Round Duration [{{exported_instance}}]" - }, - { - "datasource": { - "type": "prometheus" - }, - "expr": "histogram_quantile(0.50, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", consensus_mode=~\"$consensus_mode\", span_name=\"consensus.accept\"}[5m])))", - "legendFormat": "P50 Round Duration [{{exported_instance}}]" } ], "fieldConfig": { @@ -181,13 +174,6 @@ }, "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", consensus_mode=~\"$consensus_mode\", span_name=\"consensus.accept.apply\"}[5m])))", "legendFormat": "P95 Apply Duration [{{exported_instance}}]" - }, - { - "datasource": { - "type": "prometheus" - }, - "expr": "histogram_quantile(0.50, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", consensus_mode=~\"$consensus_mode\", span_name=\"consensus.accept.apply\"}[5m])))", - "legendFormat": "P50 Apply Duration [{{exported_instance}}]" } ], "fieldConfig": { @@ -745,7 +731,7 @@ }, { "title": "Consensus Round Duration (Full Round)", - "description": "p95/p50 duration of the full consensus round. The consensus.round span (RCLConsensus.cpp startRound) wraps an entire round end-to-end. Filterable by consensus mode. This is the single most important consensus-health signal; rising round time precedes ledger-age alarms.", + "description": "p95 duration of the full consensus round. The consensus.round span (RCLConsensus.cpp startRound) wraps an entire round end-to-end. 
Filterable by consensus mode. This is the single most important consensus-health signal; rising round time precedes ledger-age alarms.", "type": "timeseries", "gridPos": { "h": 8, @@ -766,13 +752,6 @@ }, "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", consensus_mode=~\"$consensus_mode\", span_name=\"consensus.round\"}[5m])))", "legendFormat": "P95 Round [{{exported_instance}}]" - }, - { - "datasource": { - "type": "prometheus" - }, - "expr": "histogram_quantile(0.50, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", consensus_mode=~\"$consensus_mode\", span_name=\"consensus.round\"}[5m])))", - "legendFormat": "P50 Round [{{exported_instance}}]" } ], "fieldConfig": { @@ -837,7 +816,7 @@ }, { "title": "Position Update Duration", - "description": "p95/p50 duration of the consensus.update_positions span, which tallies disputes and updates this node's position each round. Long durations indicate heavy dispute resolution or slow convergence on close time.", + "description": "p95 duration of the consensus.update_positions span, which tallies disputes and updates this node's position each round. 
Long durations indicate heavy dispute resolution or slow convergence on close time.", "type": "timeseries", "gridPos": { "h": 8, @@ -858,13 +837,6 @@ }, "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"consensus.update_positions\"}[5m])))", "legendFormat": "P95 Update [{{exported_instance}}]" - }, - { - "datasource": { - "type": "prometheus" - }, - "expr": "histogram_quantile(0.5, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"consensus.update_positions\"}[5m])))", - "legendFormat": "P50 Update [{{exported_instance}}]" } ], "fieldConfig": { diff --git a/docker/telemetry/grafana/dashboards/ledger-operations.json b/docker/telemetry/grafana/dashboards/ledger-operations.json index 6bbd6d17d0..6ac980bb90 100644 --- a/docker/telemetry/grafana/dashboards/ledger-operations.json +++ b/docker/telemetry/grafana/dashboards/ledger-operations.json @@ -42,7 +42,7 @@ }, { "title": "Ledger Build Duration", - "description": "p95 and p50 duration of ledger builds. Measures the full buildLedgerImpl() call including transaction application, SHAMap flushing, and ledger acceptance. The span records xrpl.ledger.seq as an attribute. Long build times indicate expensive transaction sets or I/O pressure from SHAMap flushes.", + "description": "p95 duration of ledger builds. Measures the full buildLedgerImpl() call including transaction application, SHAMap flushing, and ledger acceptance. The span records xrpl.ledger.seq as an attribute. 
Long build times indicate expensive transaction sets or I/O pressure from SHAMap flushes.", "type": "timeseries", "gridPos": { "h": 8, @@ -63,13 +63,6 @@ }, "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"ledger.build\"}[5m])))", "legendFormat": "P95 Build Duration [{{exported_instance}}]" - }, - { - "datasource": { - "type": "prometheus" - }, - "expr": "histogram_quantile(0.50, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"ledger.build\"}[5m])))", - "legendFormat": "P50 Build Duration [{{exported_instance}}]" } ], "fieldConfig": { @@ -156,7 +149,7 @@ }, { "title": "Transaction Apply Duration", - "description": "p95 and p50 duration of applying the consensus transaction set during ledger building. The tx.apply span (BuildLedger.cpp) wraps applyTransactions() which iterates through the CanonicalTXSet with multiple retry passes. Records tx_count (successful) and tx_failed (failed) as attributes.", + "description": "p95 duration of applying the consensus transaction set during ledger building. The tx.apply span (BuildLedger.cpp) wraps applyTransactions() which iterates through the CanonicalTXSet with multiple retry passes. 
Records tx_count (successful) and tx_failed (failed) as attributes.", "type": "timeseries", "gridPos": { "h": 8, @@ -177,13 +170,6 @@ }, "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"tx.apply\"}[5m])))", "legendFormat": "P95 tx.apply [{{exported_instance}}]" - }, - { - "datasource": { - "type": "prometheus" - }, - "expr": "histogram_quantile(0.50, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"tx.apply\"}[5m])))", - "legendFormat": "P50 tx.apply [{{exported_instance}}]" } ], "fieldConfig": { diff --git a/docker/telemetry/grafana/dashboards/system-node-health.json b/docker/telemetry/grafana/dashboards/system-node-health.json index c52b61368d..9fa958ad27 100644 --- a/docker/telemetry/grafana/dashboards/system-node-health.json +++ b/docker/telemetry/grafana/dashboards/system-node-health.json @@ -243,7 +243,7 @@ }, { "title": "I/O Latency", - "description": "P95 and P50 of the I/O service loop latency in milliseconds. Sourced from the ios_latency event (Application.cpp) which measures how long it takes for the io_context to process a timer callback. Values above 10ms are logged; above 500ms trigger warnings. High values indicate thread pool saturation or blocking operations.", + "description": "P95 of the I/O service loop latency in milliseconds. Sourced from the ios_latency event (Application.cpp) which measures how long it takes for the io_context to process a timer callback. Values above 10ms are logged; above 500ms trigger warnings. 
High values indicate thread pool saturation or blocking operations.", "type": "timeseries", "gridPos": { "h": 8, @@ -264,13 +264,6 @@ }, "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(xrpld_ios_latency_milliseconds_bucket{exported_instance=~\"$node\"}[5m])))", "legendFormat": "P95 I/O Latency [{{exported_instance}}]" - }, - { - "datasource": { - "type": "prometheus" - }, - "expr": "histogram_quantile(0.50, sum by (le, exported_instance) (rate(xrpld_ios_latency_milliseconds_bucket{exported_instance=~\"$node\"}[5m])))", - "legendFormat": "P50 I/O Latency [{{exported_instance}}]" } ], "fieldConfig": { @@ -2054,7 +2047,7 @@ }, { "title": "Ledger Acquire Duration (Inbound Fetch)", - "description": "p95/p50 duration of the ledger.acquire span (InboundLedger): how long it takes to fetch a missing ledger from peers. A spike signals the node is falling behind or recovering from a fork. Populated under back-fill / sync activity.", + "description": "p95 duration of the ledger.acquire span (InboundLedger): how long it takes to fetch a missing ledger from peers. A spike signals the node is falling behind or recovering from a fork. 
Populated under back-fill / sync activity.", "type": "timeseries", "gridPos": { "h": 8, @@ -2075,13 +2068,6 @@ }, "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"ledger.acquire\"}[5m])))", "legendFormat": "P95 Acquire [{{exported_instance}}]" - }, - { - "datasource": { - "type": "prometheus" - }, - "expr": "histogram_quantile(0.50, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"ledger.acquire\"}[5m])))", - "legendFormat": "P50 Acquire [{{exported_instance}}]" } ], "fieldConfig": { diff --git a/docker/telemetry/grafana/dashboards/system-rpc-pathfinding.json b/docker/telemetry/grafana/dashboards/system-rpc-pathfinding.json index 198eceae42..3e2a7651df 100644 --- a/docker/telemetry/grafana/dashboards/system-rpc-pathfinding.json +++ b/docker/telemetry/grafana/dashboards/system-rpc-pathfinding.json @@ -43,7 +43,7 @@ }, { "title": "RPC Response Time (System Metrics)", - "description": "P95 and P50 of RPC response time from the beast::insight timer. Sourced from the rpc.time event (ServerHandler.cpp) which records elapsed milliseconds for each RPC response. This measures the full HTTP handler time, not just command execution. Compare with span-based rpc.request duration.", + "description": "P95 of RPC response time from the beast::insight timer. Sourced from the rpc.time event (ServerHandler.cpp) which records elapsed milliseconds for each RPC response. This measures the full HTTP handler time, not just command execution. 
Compare with span-based rpc.request duration.", "type": "timeseries", "gridPos": { "h": 8, @@ -64,13 +64,6 @@ }, "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(xrpld_rpc_time_milliseconds_bucket{exported_instance=~\"$node\"}[5m])))", "legendFormat": "P95 Response Time [{{exported_instance}}]" - }, - { - "datasource": { - "type": "prometheus" - }, - "expr": "histogram_quantile(0.5, sum by (le, exported_instance) (rate(xrpld_rpc_time_milliseconds_bucket{exported_instance=~\"$node\"}[5m])))", - "legendFormat": "P50 Response Time [{{exported_instance}}]" } ], "fieldConfig": { @@ -89,7 +82,7 @@ }, { "title": "RPC Response Size", - "description": "P95 and P50 of RPC response payload size in bytes. Sourced from the rpc.size event (ServerHandler.cpp) which records the byte length of each RPC JSON response. Large responses may indicate expensive queries (e.g. account_tx with many results) or API misuse.", + "description": "P95 of RPC response payload size in bytes. Sourced from the rpc.size event (ServerHandler.cpp) which records the byte length of each RPC JSON response. Large responses may indicate expensive queries (e.g. account_tx with many results) or API misuse.", "type": "timeseries", "gridPos": { "h": 8, @@ -110,13 +103,6 @@ }, "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(xrpld_rpc_size_milliseconds_bucket{exported_instance=~\"$node\"}[5m])))", "legendFormat": "P95 Response Size [{{exported_instance}}]" - }, - { - "datasource": { - "type": "prometheus" - }, - "expr": "histogram_quantile(0.5, sum by (le, exported_instance) (rate(xrpld_rpc_size_milliseconds_bucket{exported_instance=~\"$node\"}[5m])))", - "legendFormat": "P50 Response Size [{{exported_instance}}]" } ], "fieldConfig": { @@ -135,7 +121,7 @@ }, { "title": "RPC Response Time Distribution", - "description": "Distribution of RPC response times from the beast::insight timer showing P50, P90, P95, and P99 quantiles. 
Sourced from the rpc.time event (ServerHandler.cpp). Useful for detecting bimodal latency or long-tail requests.", + "description": "Distribution of RPC response times from the beast::insight timer showing P90, P95, and P99 quantiles. Sourced from the rpc.time event (ServerHandler.cpp). Useful for detecting bimodal latency or long-tail requests.", "type": "timeseries", "gridPos": { "h": 8, @@ -150,13 +136,6 @@ } }, "targets": [ - { - "datasource": { - "type": "prometheus" - }, - "expr": "histogram_quantile(0.5, sum by (le, exported_instance) (rate(xrpld_rpc_time_milliseconds_bucket{exported_instance=~\"$node\"}[5m])))", - "legendFormat": "P50 [{{exported_instance}}]" - }, { "datasource": { "type": "prometheus" @@ -195,7 +174,7 @@ }, { "title": "Pathfinding Fast Duration", - "description": "P95 and P50 of fast pathfinding execution time. Sourced from the pathfind_fast event (PathRequests.h) which records the duration of the fast pathfinding algorithm. Fast pathfinding uses a simplified search that trades accuracy for speed.", + "description": "P95 of fast pathfinding execution time. Sourced from the pathfind_fast event (PathRequests.h) which records the duration of the fast pathfinding algorithm. Fast pathfinding uses a simplified search that trades accuracy for speed.", "type": "timeseries", "gridPos": { "h": 8, @@ -216,13 +195,6 @@ }, "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(xrpld_pathfind_fast_milliseconds_bucket{exported_instance=~\"$node\"}[5m])))", "legendFormat": "P95 Fast Pathfind [{{exported_instance}}]" - }, - { - "datasource": { - "type": "prometheus" - }, - "expr": "histogram_quantile(0.5, sum by (le, exported_instance) (rate(xrpld_pathfind_fast_milliseconds_bucket{exported_instance=~\"$node\"}[5m])))", - "legendFormat": "P50 Fast Pathfind [{{exported_instance}}]" } ], "fieldConfig": { @@ -241,7 +213,7 @@ }, { "title": "Pathfinding Full Duration", - "description": "P95 and P50 of full pathfinding execution time. 
Sourced from the pathfind_full event (PathRequests.h) which records the duration of the exhaustive pathfinding search. Full pathfinding is more expensive and can take significantly longer than fast mode.", + "description": "P95 of full pathfinding execution time. Sourced from the pathfind_full event (PathRequests.h) which records the duration of the exhaustive pathfinding search. Full pathfinding is more expensive and can take significantly longer than fast mode.", "type": "timeseries", "gridPos": { "h": 8, @@ -262,13 +234,6 @@ }, "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(xrpld_pathfind_full_milliseconds_bucket{exported_instance=~\"$node\"}[5m])))", "legendFormat": "P95 Full Pathfind [{{exported_instance}}]" - }, - { - "datasource": { - "type": "prometheus" - }, - "expr": "histogram_quantile(0.5, sum by (le, exported_instance) (rate(xrpld_pathfind_full_milliseconds_bucket{exported_instance=~\"$node\"}[5m])))", - "legendFormat": "P50 Full Pathfind [{{exported_instance}}]" } ], "fieldConfig": { @@ -500,7 +465,7 @@ }, { "title": "Pathfinding Compute Duration (Spans)", - "description": "p95/p50 of the pathfind.compute span, the per-request path computation. Complements the StatsD pathfind_fast/full timers with span-level visibility. Populated under pathfinding (book/path) RPC load.", + "description": "p95 of the pathfind.compute span, the per-request path computation. Complements the StatsD pathfind_fast/full timers with span-level visibility. 
Populated under pathfinding (book/path) RPC load.", "type": "timeseries", "gridPos": { "h": 8, @@ -521,13 +486,6 @@ }, "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"pathfind.compute\"}[5m])))", "legendFormat": "P95 Compute [{{exported_instance}}]" - }, - { - "datasource": { - "type": "prometheus" - }, - "expr": "histogram_quantile(0.50, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"pathfind.compute\"}[5m])))", - "legendFormat": "P50 Compute [{{exported_instance}}]" } ], "fieldConfig": { diff --git a/docker/telemetry/grafana/dashboards/transaction-overview.json b/docker/telemetry/grafana/dashboards/transaction-overview.json index 3e699b8fdf..b1c8f64bd2 100644 --- a/docker/telemetry/grafana/dashboards/transaction-overview.json +++ b/docker/telemetry/grafana/dashboards/transaction-overview.json @@ -56,7 +56,7 @@ }, { "title": "Transaction Processing Latency by Type", - "description": "Per-transaction-type processing latency (p95 and p50). Filter with $tx_type variable above.", + "description": "Per-transaction-type processing latency (p95). 
Filter with $tx_type variable above.", "type": "timeseries", "gridPos": { "h": 8, @@ -82,13 +82,6 @@ }, "expr": "histogram_quantile(0.95, sum by (le, tx_type, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"tx.process\", tx_type=~\"$tx_type\"}[5m])))", "legendFormat": "P95 {{tx_type}} [{{exported_instance}}]" - }, - { - "datasource": { - "type": "prometheus" - }, - "expr": "histogram_quantile(0.50, sum by (le, tx_type, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"tx.process\", tx_type=~\"$tx_type\"}[5m])))", - "legendFormat": "P50 {{tx_type}} [{{exported_instance}}]" } ], "fieldConfig": { @@ -208,7 +201,7 @@ }, { "title": "Transaction Apply Duration per Ledger", - "description": "p95 and p50 latency of applying the consensus transaction set to a new ledger. The tx.apply span (BuildLedger.cpp) wraps the applyTransactions() function that iterates through the CanonicalTXSet and applies each transaction to the OpenView. Long durations indicate heavy transaction sets or expensive transaction processing.", + "description": "p95 latency of applying the consensus transaction set to a new ledger. The tx.apply span (BuildLedger.cpp) wraps the applyTransactions() function that iterates through the CanonicalTXSet and applies each transaction to the OpenView. 
Long durations indicate heavy transaction sets or expensive transaction processing.", "type": "timeseries", "gridPos": { "h": 8, @@ -229,13 +222,6 @@ }, "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"tx.apply\"}[5m])))", "legendFormat": "P95 tx.apply [{{exported_instance}}]" - }, - { - "datasource": { - "type": "prometheus" - }, - "expr": "histogram_quantile(0.50, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"tx.apply\"}[5m])))", - "legendFormat": "P50 tx.apply [{{exported_instance}}]" } ], "fieldConfig": { @@ -587,7 +573,7 @@ }, { "title": "Queue Accept (Drain) Duration per Ledger", - "description": "p95/p50 duration of the txq.accept span, which drains queued transactions into a newly closed ledger. Rising drain time signals queue pressure at ledger close.", + "description": "p95 duration of the txq.accept span, which drains queued transactions into a newly closed ledger. 
Rising drain time signals queue pressure at ledger close.", "type": "timeseries", "gridPos": { "h": 8, @@ -608,13 +594,6 @@ }, "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"txq.accept\"}[5m])))", "legendFormat": "P95 Drain [{{exported_instance}}]" - }, - { - "datasource": { - "type": "prometheus" - }, - "expr": "histogram_quantile(0.50, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"txq.accept\"}[5m])))", - "legendFormat": "P50 Drain [{{exported_instance}}]" } ], "fieldConfig": { From 6428c9f13cb167ca1295efae6b53fef8775bf49b Mon Sep 17 00:00:00 2001 From: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> Date: Fri, 5 Jun 2026 11:11:55 +0100 Subject: [PATCH 6/9] feat(telemetry): add preflight/preclaim stage spans and stage attribute The tx.transactor span covered only the apply stage; preflight and preclaim had no telemetry, so a transaction that hard-failed those stages produced no apply-pipeline span and per-stage latency/failure was invisible. Add tx.preflight and tx.preclaim spans in applySteps.cpp via a makeStageSpan() helper using SpanGuard::hashSpan, so all three stages share a deterministic trace_id derived from txID[0:16] even though they run sequentially and often cross-thread. Each span carries stage, tx_type, and ter_result; exceptions are recorded as tefEXCEPTION before the public wrappers map them. The type lookup is guarded behind the span-active check so it costs nothing when tracing is off. Add a stage="apply" attribute to the tx.transactor span and move its three hardcoded attribute strings to a new library-safe header include/xrpl/tx/detail/TxApplySpanNames.h, which mirrors the daemon-side TxSpanNames.h strings so the collector spanmetrics connector aggregates both span sets under one dimension set. 
A constants-contract test pins the span-name, attribute-key, and stage-value strings; span content stays covered by the docker integration test, as the rest of the telemetry suite is. Co-Authored-By: Claude Opus 4.8 --- .../scripts/levelization/results/ordering.txt | 1 + include/xrpl/tx/detail/TxApplySpanNames.h | 109 +++++++++++++ src/libxrpl/tx/Transactor.cpp | 11 +- src/libxrpl/tx/applySteps.cpp | 146 ++++++++++++++---- .../libxrpl/telemetry/TxApplySpanNames.cpp | 52 +++++++ 5 files changed, 282 insertions(+), 37 deletions(-) create mode 100644 include/xrpl/tx/detail/TxApplySpanNames.h create mode 100644 src/tests/libxrpl/telemetry/TxApplySpanNames.cpp diff --git a/.github/scripts/levelization/results/ordering.txt b/.github/scripts/levelization/results/ordering.txt index 61b3d4ea3d..63e46c779d 100644 --- a/.github/scripts/levelization/results/ordering.txt +++ b/.github/scripts/levelization/results/ordering.txt @@ -240,6 +240,7 @@ xrpl.tx > xrpl.basics xrpl.tx > xrpl.core xrpl.tx > xrpl.ledger xrpl.tx > xrpl.protocol +xrpl.tx > xrpl.telemetry xrpld.app > test.unit_test xrpld.app > xrpl.basics xrpld.app > xrpl.core diff --git a/include/xrpl/tx/detail/TxApplySpanNames.h b/include/xrpl/tx/detail/TxApplySpanNames.h new file mode 100644 index 0000000000..c007ca3c03 --- /dev/null +++ b/include/xrpl/tx/detail/TxApplySpanNames.h @@ -0,0 +1,109 @@ +#pragma once + +/** Compile-time span name constants for the transaction apply pipeline. + * + * Defines the span names and attribute keys used by the three apply-pipeline + * stages — preflight, preclaim, and transactor (apply) — that run inside the + * library (`src/libxrpl/tx/`). Built on the StaticStr/join() primitives from + * . + * + * Why a separate header from TxSpanNames.h: + * TxSpanNames.h lives under src/xrpld/ (daemon) and serves the overlay/app + * lifecycle spans (tx.receive, tx.process). 
Library code (applySteps.cpp, + * Transactor.cpp) must not depend on daemon headers, so the apply-pipeline + * constants live here instead. The attribute strings ("tx_type", + * "ter_result", "applied") intentionally match TxSpanNames.h so the collector + * spanmetrics connector aggregates both sets under the same dimensions. + * + * Span hierarchy (deterministic trace_id derived from txID[0:16]): + * + * The three stages run sequentially and often on different threads, so they + * do not auto-parent. Each uses a hash-derived trace_id keyed on the same + * transaction id, placing all three under one trace without context + * propagation. A transaction that hard-fails preflight or preclaim never + * reaches the transactor span — the stage attribute identifies where it + * stopped. + * + * +-----------------------------------------------------------+ + * | trace_id = txID[0:16] | + * | | + * | +-------------------+ +------------------+ +-------+ | + * | | tx.preflight | | tx.preclaim | | tx. 
| | + * | | stage=preflight |-->| stage=preclaim |-->| trans | | + * | | tx_type | | tx_type | | actor | | + * | | ter_result | | ter_result | | stage=| | + * | +-------------------+ +------------------+ | apply | | + * | stateless checks ledger-aware checks +-------+ | + * | (signature, fields) (sequence, fee) applies | + * +-----------------------------------------------------------+ + * + * Usage: + * @code + * #include + * using namespace telemetry; + * + * // preflight() / preclaim() use hashSpan with a full span name: + * auto span = SpanGuard::hashSpan( + * TraceCategory::Transactions, tx_apply_span::preflight, + * txID.data(), txID.kBytes); + * span.setAttribute(tx_apply_span::attr::stage, tx_apply_span::val::preflight); + * span.setAttribute(tx_apply_span::attr::terResult, transToken(ter).c_str()); + * @endcode + * + * @code + * // Transactor::operator() uses span() with prefix + suffix: + * auto span = SpanGuard::span( + * TraceCategory::Transactions, seg::tx, tx_apply_span::op::transactor); + * span.setAttribute(tx_apply_span::attr::stage, tx_apply_span::val::apply); + * @endcode + */ + +#include + +namespace xrpl::telemetry::tx_apply_span { + +// ===== Span operation suffixes ============================================= + +namespace op { +/// "preflight" — stateless transaction checks (suffix form). +inline constexpr auto preflight = makeStr("preflight"); +/// "preclaim" — ledger-aware checks before fee claim (suffix form). +inline constexpr auto preclaim = makeStr("preclaim"); +/// "transactor" — the apply stage (suffix form, used with span()). +inline constexpr auto transactor = makeStr("transactor"); +} // namespace op + +// ===== Full span names (tx.) =========================================== + +/// "tx.preflight" — full name for hashSpan() at the preflight stage. +inline constexpr auto preflight = join(seg::tx, op::preflight); +/// "tx.preclaim" — full name for hashSpan() at the preclaim stage. 
+inline constexpr auto preclaim = join(seg::tx, op::preclaim); + +// ===== Attribute keys ====================================================== + +namespace attr { +/// "stage" — which apply-pipeline stage this span represents. Drives the +/// collector spanmetrics `stage` dimension for per-stage RED metrics. +inline constexpr auto stage = makeStr("stage"); +/// "tx_type" — transaction type name (e.g., "Payment", "OfferCreate"). +/// Matches tx_span::attr::txType so both share the spanmetrics dimension. +inline constexpr auto txType = makeStr("tx_type"); +/// "ter_result" — engine result code after the stage (e.g., "tesSUCCESS"). +inline constexpr auto terResult = makeStr("ter_result"); +/// "applied" — whether the transaction was applied to the ledger (apply only). +inline constexpr auto applied = makeStr("applied"); +} // namespace attr + +// ===== Attribute values (stage names) ====================================== + +namespace val { +/// "preflight" — value of the stage attribute on tx.preflight. +inline constexpr auto preflight = makeStr("preflight"); +/// "preclaim" — value of the stage attribute on tx.preclaim. +inline constexpr auto preclaim = makeStr("preclaim"); +/// "apply" — value of the stage attribute on tx.transactor. +inline constexpr auto apply = makeStr("apply"); +} // namespace val + +} // namespace xrpl::telemetry::tx_apply_span diff --git a/src/libxrpl/tx/Transactor.cpp b/src/libxrpl/tx/Transactor.cpp index 5df0504004..b3917e85d9 100644 --- a/src/libxrpl/tx/Transactor.cpp +++ b/src/libxrpl/tx/Transactor.cpp @@ -44,6 +44,7 @@ #include #include #include +#include #include #include @@ -1199,9 +1200,11 @@ Transactor::operator()() auto span = telemetry::SpanGuard::span( telemetry::TraceCategory::Transactions, telemetry::seg::tx, - telemetry::makeStr("transactor")); + telemetry::tx_apply_span::op::transactor); + // "apply" — the third apply-pipeline stage, after preflight and preclaim. 
+ span.setAttribute(telemetry::tx_apply_span::attr::stage, telemetry::tx_apply_span::val::apply); if (auto const* fmt = TxFormats::getInstance().findByType(ctx_.tx.getTxnType())) - span.setAttribute("tx_type", fmt->getName().c_str()); + span.setAttribute(telemetry::tx_apply_span::attr::txType, fmt->getName().c_str()); JLOG(j_.trace()) << "apply: " << ctx_.tx.getTransactionID(); @@ -1429,8 +1432,8 @@ Transactor::operator()() JLOG(j_.trace()) << (applied ? "applied " : "not applied ") << transToken(result); - span.setAttribute("ter_result", transToken(result).c_str()); - span.setAttribute("applied", applied); + span.setAttribute(telemetry::tx_apply_span::attr::terResult, transToken(result).c_str()); + span.setAttribute(telemetry::tx_apply_span::attr::applied, applied); return {result, applied, metadata}; } diff --git a/src/libxrpl/tx/applySteps.cpp b/src/libxrpl/tx/applySteps.cpp index 217fdd717f..efe4e457af 100644 --- a/src/libxrpl/tx/applySteps.cpp +++ b/src/libxrpl/tx/applySteps.cpp @@ -13,13 +13,16 @@ #include #include #include +#include #include #include +#include #include #include #include #include +#include #include #pragma push_macro("TRANSACTION") #undef TRANSACTION @@ -51,6 +54,47 @@ struct UnknownTxnType : std::exception } }; +/** Look up the human-readable transaction type name for span attributes. + * Returns nullptr if the type is unknown so the caller can skip the + * attribute rather than emit an empty value. + */ +char const* +txTypeName(TxType txnType) +{ + if (auto const* fmt = TxFormats::getInstance().findByType(txnType)) + return fmt->getName().c_str(); + return nullptr; +} + +/** Create a deterministic-trace span for an apply-pipeline stage. + * + * The trace_id is derived from txID[0:16] so the preflight, preclaim, and + * transactor spans of one transaction share a trace even though they run + * sequentially and often on different threads. 
Sets the stage and tx_type + * attributes here; the caller adds ter_result after the stage runs. These drive the + * collector spanmetrics dimensions. A no-op when telemetry is disabled. + * + * @param name Full span name (tx_apply_span::preflight / ::preclaim). + * @param stage Stage attribute value (tx_apply_span::val::*). + * @param tx The transaction supplying the id and type. + */ +[[nodiscard]] telemetry::SpanGuard +makeStageSpan(std::string_view name, std::string_view stage, STTx const& tx) +{ + auto const txID = tx.getTransactionID(); + auto span = telemetry::SpanGuard::hashSpan( + telemetry::TraceCategory::Transactions, name, txID.data(), txID.kBytes); + // Guard the type lookup behind the active check: preflight runs for every + // transaction, so findByType() must not run when tracing is off/disabled. + if (span) + { + span.setAttribute(telemetry::tx_apply_span::attr::stage, stage); + if (char const* typeName = txTypeName(tx.getTxnType())) + span.setAttribute(telemetry::tx_apply_span::attr::txType, typeName); + } + return span; +} + // Call a lambda with the concrete transaction type as a template parameter // throw an "UnknownTxnType" exception on error template @@ -133,82 +177,118 @@ consequencesHelper(PreflightContext const& ctx) static std::pair invokePreflight(PreflightContext const& ctx) { + // Trace the preflight stage. The span shares the transaction's + // deterministic trace_id so it correlates with preclaim and transactor. + auto span = makeStageSpan( + telemetry::tx_apply_span::preflight, telemetry::tx_apply_span::val::preflight, ctx.tx); try { - return withTxnType(ctx.rules, ctx.tx.getTxnType(), [&]() { + auto result = withTxnType(ctx.rules, ctx.tx.getTxnType(), [&]() { auto const tec = Transactor::invokePreflight(ctx); return std::make_pair( tec, isTesSuccess(tec) ? 
consequencesHelper(ctx) : TxConsequences{tec}); }); + if (span) + span.setAttribute( + telemetry::tx_apply_span::attr::terResult, transToken(result.first).c_str()); + return result; } catch (UnknownTxnType const& e) { // Should never happen // LCOV_EXCL_START JLOG(ctx.j.fatal()) << "Unknown transaction type in preflight: " << e.txnType; + span.recordException(e); UNREACHABLE("xrpl::invokePreflight : unknown transaction type"); return {temUNKNOWN, TxConsequences{temUNKNOWN}}; // LCOV_EXCL_STOP } + catch (std::exception const& e) + { + // The caller's preflight() maps this to tefEXCEPTION. Record it on the + // span before unwinding so per-stage error counts include exceptions. + span.setAttribute( + telemetry::tx_apply_span::attr::terResult, transToken(tefEXCEPTION).c_str()); + span.recordException(e); + throw; + } } static TER invokePreclaim(PreclaimContext const& ctx) { + // Trace the preclaim stage under the transaction's deterministic trace_id. + auto span = makeStageSpan( + telemetry::tx_apply_span::preclaim, telemetry::tx_apply_span::val::preclaim, ctx.tx); try { // use name hiding to accomplish compile-time polymorphism of static // class functions for Transactor and derived classes. - return withTxnType(ctx.view.rules(), ctx.tx.getTxnType(), [&]() -> TER { - // preclaim functionality is divided into two sections: - // 1. Up to and including the signature check: returns NotTEC. - // All transaction checks before and including checkSign - // MUST return NotTEC, or something more restrictive. - // Allowing tec results in these steps risks theft or - // destruction of funds, as a fee will be charged before the - // signature is checked. - // 2. After the signature check: returns TER. + TER const preclaimTer = + withTxnType(ctx.view.rules(), ctx.tx.getTxnType(), [&]() -> TER { + // preclaim functionality is divided into two sections: + // 1. Up to and including the signature check: returns NotTEC. 
+ // All transaction checks before and including checkSign + // MUST return NotTEC, or something more restrictive. + // Allowing tec results in these steps risks theft or + // destruction of funds, as a fee will be charged before the + // signature is checked. + // 2. After the signature check: returns TER. - // If the transactor requires a valid account and the - // transaction doesn't list one, preflight will have already - // a flagged a failure. - auto const id = ctx.tx.getAccountID(sfAccount); + // If the transactor requires a valid account and the + // transaction doesn't list one, preflight will have already + // flagged a failure. + auto const id = ctx.tx.getAccountID(sfAccount); - if (id != beast::kZero) - { - if (NotTEC const preSigResult = [&]() -> NotTEC { - if (NotTEC const result = T::checkSeqProxy(ctx.view, ctx.tx, ctx.j)) - return result; + if (id != beast::kZero) + { + if (NotTEC const preSigResult = [&]() -> NotTEC { + if (NotTEC const result = T::checkSeqProxy(ctx.view, ctx.tx, ctx.j)) + return result; - if (NotTEC const result = T::checkPriorTxAndLastLedger(ctx)) - return result; + if (NotTEC const result = T::checkPriorTxAndLastLedger(ctx)) + return result; - if (NotTEC const result = T::checkPermission(ctx.view, ctx.tx)) - return result; + if (NotTEC const result = T::checkPermission(ctx.view, ctx.tx)) + return result; - if (NotTEC const result = T::checkSign(ctx)) - return result; + if (NotTEC const result = T::checkSign(ctx)) + return result; - return tesSUCCESS; - }()) - return preSigResult; + return tesSUCCESS; + }()) + return preSigResult; - if (TER const result = T::checkFee(ctx, calculateBaseFee(ctx.view, ctx.tx))) - return result; - } + if (TER const result = T::checkFee(ctx, calculateBaseFee(ctx.view, ctx.tx))) + return result; + } - return T::preclaim(ctx); - }); + return T::preclaim(ctx); + }); + if (span) + span.setAttribute( + telemetry::tx_apply_span::attr::terResult, transToken(preclaimTer).c_str()); + return preclaimTer; } 
catch (UnknownTxnType const& e) { // Should never happen // LCOV_EXCL_START JLOG(ctx.j.fatal()) << "Unknown transaction type in preclaim: " << e.txnType; + span.recordException(e); UNREACHABLE("xrpl::invokePreclaim : unknown transaction type"); return temUNKNOWN; // LCOV_EXCL_STOP } + catch (std::exception const& e) + { + // The caller's preclaim() maps this to tefEXCEPTION. Record it on the + // span before unwinding so per-stage error counts include exceptions. + span.setAttribute( + telemetry::tx_apply_span::attr::terResult, transToken(tefEXCEPTION).c_str()); + span.recordException(e); + throw; + } } /** diff --git a/src/tests/libxrpl/telemetry/TxApplySpanNames.cpp b/src/tests/libxrpl/telemetry/TxApplySpanNames.cpp new file mode 100644 index 0000000000..170fef3ae4 --- /dev/null +++ b/src/tests/libxrpl/telemetry/TxApplySpanNames.cpp @@ -0,0 +1,52 @@ +#include + +#include + +#include + +/** Contract tests for the transaction apply-pipeline span constants. + * + * The span names and attribute keys in TxApplySpanNames.h are a cross-component + * contract: the collector spanmetrics connector aggregates on these exact + * strings (dimensions tx_type, ter_result, stage) and the Grafana + * transaction-overview dashboard queries them. A silent rename here would + * break per-stage metrics with no compile error, so these tests pin the + * literal values. They need no telemetry runtime and run in every build. + */ + +using namespace xrpl::telemetry; + +TEST(TxApplySpanNames, span_names_are_dot_qualified) +{ + // Full span names feed SpanGuard::hashSpan() in applySteps.cpp. + EXPECT_EQ(std::string_view(tx_apply_span::preflight), "tx.preflight"); + EXPECT_EQ(std::string_view(tx_apply_span::preclaim), "tx.preclaim"); +} + +TEST(TxApplySpanNames, operation_suffixes) +{ + // Suffix used with SpanGuard::span(cat, seg::tx, suffix) in Transactor.cpp. 
+ EXPECT_EQ(std::string_view(tx_apply_span::op::preflight), "preflight"); + EXPECT_EQ(std::string_view(tx_apply_span::op::preclaim), "preclaim"); + EXPECT_EQ(std::string_view(tx_apply_span::op::transactor), "transactor"); +} + +TEST(TxApplySpanNames, attribute_keys_match_collector_dimensions) +{ + // These keys MUST match docker/telemetry/otel-collector-config.yaml + // spanmetrics dimensions and TxSpanNames.h (so both span sets aggregate + // under one dimension). + EXPECT_EQ(std::string_view(tx_apply_span::attr::stage), "stage"); + EXPECT_EQ(std::string_view(tx_apply_span::attr::txType), "tx_type"); + EXPECT_EQ(std::string_view(tx_apply_span::attr::terResult), "ter_result"); + EXPECT_EQ(std::string_view(tx_apply_span::attr::applied), "applied"); +} + +TEST(TxApplySpanNames, stage_values_are_the_three_pipeline_stages) +{ + // The stage attribute carries exactly these three values; they become the + // spanmetrics `stage` dimension cardinality (3) and the dashboard filter. + EXPECT_EQ(std::string_view(tx_apply_span::val::preflight), "preflight"); + EXPECT_EQ(std::string_view(tx_apply_span::val::preclaim), "preclaim"); + EXPECT_EQ(std::string_view(tx_apply_span::val::apply), "apply"); +} From 6a16dfa823c896b3237168d2825d1d099f33c016 Mon Sep 17 00:00:00 2001 From: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> Date: Fri, 5 Jun 2026 11:25:29 +0100 Subject: [PATCH 7/9] clang-tidy and formatting changes Signed-off-by: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> --- src/libxrpl/tx/applySteps.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/libxrpl/tx/applySteps.cpp b/src/libxrpl/tx/applySteps.cpp index efe4e457af..fa2021a215 100644 --- a/src/libxrpl/tx/applySteps.cpp +++ b/src/libxrpl/tx/applySteps.cpp @@ -189,8 +189,10 @@ invokePreflight(PreflightContext const& ctx) tec, isTesSuccess(tec) ? 
consequencesHelper(ctx) : TxConsequences{tec}); }); if (span) + { span.setAttribute( telemetry::tx_apply_span::attr::terResult, transToken(result.first).c_str()); + } return result; } catch (UnknownTxnType const& e) @@ -266,8 +268,10 @@ invokePreclaim(PreclaimContext const& ctx) return T::preclaim(ctx); }); if (span) + { span.setAttribute( telemetry::tx_apply_span::attr::terResult, transToken(preclaimTer).c_str()); + } return preclaimTer; } catch (UnknownTxnType const& e) From 3df7e9cba680c0a5812c8b5f3fe2d8be10b5946a Mon Sep 17 00:00:00 2001 From: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> Date: Fri, 5 Jun 2026 11:42:33 +0100 Subject: [PATCH 8/9] code review changes and wire unused attributes Signed-off-by: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> --- OpenTelemetryPlan/Phase3_taskList.md | 27 ++++++++++++++---------- src/xrpld/app/misc/detail/TxQ.cpp | 18 +++++++++++++++- src/xrpld/app/misc/detail/TxQSpanNames.h | 6 ++++-- 3 files changed, 37 insertions(+), 14 deletions(-) diff --git a/OpenTelemetryPlan/Phase3_taskList.md b/OpenTelemetryPlan/Phase3_taskList.md index 55b00690ea..5cb723d878 100644 --- a/OpenTelemetryPlan/Phase3_taskList.md +++ b/OpenTelemetryPlan/Phase3_taskList.md @@ -474,17 +474,22 @@ This gives the best of both worlds: guaranteed cross-node correlation via determ **Attributes added**: -| Span | Attribute | Type | Source | -| --------------- | ---------------- | ------ | ------------------------------------------------------------------- | -| `tx.process` | `tx_type` | string | `TxFormats::getInstance().findByType(stx->getTxnType())->getName()` | -| `tx.process` | `fee` | int64 | `stx->getFieldAmount(sfFee).xrp().drops()` | -| `tx.process` | `sequence` | int64 | `stx->getSeqProxy().value()` | -| `tx.process` | `ter_result` | string | `transToken(e.result)` (set after batch application) | -| `tx.process` | `applied` | bool | `e.applied` (set after batch application) | -| `tx.receive` | `tx_type` | 
string | `TxFormats::getInstance().findByType(stx->getTxnType())->getName()` | -| `txq.enqueue` | `tx_type` | string | same pattern as above | -| `txq.accept.tx` | `txq_status` | string | `applied` / `failed` / `retried` | -| `txq.accept` | `ledger_changed` | bool | set at end of accept loop | +| Span | Attribute | Type | Source | +| ----------------- | -------------------- | ------ | ------------------------------------------------------------------- | +| `tx.process` | `tx_type` | string | `TxFormats::getInstance().findByType(stx->getTxnType())->getName()` | +| `tx.process` | `fee` | int64 | `stx->getFieldAmount(sfFee).xrp().drops()` | +| `tx.process` | `sequence` | int64 | `stx->getSeqProxy().value()` | +| `tx.process` | `ter_result` | string | `transToken(e.result)` (set after batch application) | +| `tx.process` | `applied` | bool | `e.applied` (set after batch application) | +| `tx.receive` | `tx_type` | string | `TxFormats::getInstance().findByType(stx->getTxnType())->getName()` | +| `txq.enqueue` | `tx_type` | string | same pattern as above | +| `txq.enqueue` | `txq_status` | string | `queued` / `applied_direct` / `applied` / `rejected` | +| `txq.enqueue` | `fee_level_paid` | int64 | `getFeeLevelPaid(view, *tx).value()` | +| `txq.enqueue` | `required_fee_level` | int64 | `getRequiredFeeLevel(...).value()` | +| `txq.batch_clear` | `num_cleared` | int64 | queued txs cleared ahead of the applying tx | +| `txq.cleanup` | `expired_count` | int64 | entries dropped for passed `LastLedgerSequence` | +| `txq.accept.tx` | `txq_status` | string | `applied` / `failed` / `retried` | +| `txq.accept` | `ledger_changed` | bool | set at end of accept loop | **New attr keys**: `TxSpanNames.h` (`txType`, `fee`, `sequence`, `terResult`, `applied`), `TxQSpanNames.h` (`txType`). 
diff --git a/src/xrpld/app/misc/detail/TxQ.cpp b/src/xrpld/app/misc/detail/TxQ.cpp index 352bef6bd9..2a6f00385f 100644 --- a/src/xrpld/app/misc/detail/TxQ.cpp +++ b/src/xrpld/app/misc/detail/TxQ.cpp @@ -607,7 +607,8 @@ TxQ::tryClearAccountQueueUpThruTx( if (txResult.applied) { // All of the queued transactions applied, so remove them from the - // queue. + // queue. `dist` queued txs preceded the current one in the batch. + span.setAttribute(txq_span::attr::numCleared, static_cast<std::int64_t>(dist)); endTxIter = erase(accountIter->second, beginTxIter, endTxIter); // If `tx` is replacing a queued tx, delete that one, too. if (endTxIter != accountIter->second.transactions.end() && endTxIter->first == tSeqProx) @@ -744,6 +745,9 @@ TxQ::apply( span.setAttribute(txq_span::attr::txHash, to_string(tx->getTransactionID()).c_str()); if (auto const* fmt = TxFormats::getInstance().findByType(tx->getTxnType())) span.setAttribute(txq_span::attr::txType, fmt->getName().c_str()); + // Default outcome; overridden below on the direct-apply, batch-clear + // (applied), and queued paths. Any other early return leaves it rejected. + span.setAttribute(txq_span::attr::txqStatus, txq_span::val::rejected); NumberSO const stNumberSO{view.rules().enabled(fixUniversalNumber)}; @@ -757,7 +761,10 @@ TxQ::apply( // See if the transaction paid a high enough fee that it can go straight // into the ledger.
if (auto directApplied = tryDirectApply(app, view, tx, flags, j)) + { + span.setAttribute(txq_span::attr::txqStatus, txq_span::val::appliedDirect); return *directApplied; + } if ((flags & TapDryRun) != 0u) return {telCAN_NOT_QUEUE, false}; @@ -884,6 +891,10 @@ TxQ::apply( auto const metricsSnapshot = feeMetrics_.getSnapshot(); auto const feeLevelPaid = getFeeLevelPaid(view, *tx); auto const requiredFeeLevel = getRequiredFeeLevel(view, flags, metricsSnapshot, lock); + span.setAttribute( + txq_span::attr::feeLevelPaid, static_cast<std::int64_t>(feeLevelPaid.value())); + span.setAttribute( + txq_span::attr::requiredFeeLevel, static_cast<std::int64_t>(requiredFeeLevel.value())); // Is there a blocker already in the account's queue? If so, don't // allow additional transactions in the queue. @@ -1217,6 +1228,7 @@ TxQ::apply( /* Can't erase (*replacedTxIter) here because success implies that it has already been deleted. */ + span.setAttribute(txq_span::attr::txqStatus, txq_span::val::applied); return result; } } @@ -1332,6 +1344,7 @@ TxQ::apply( << " to queue." << " Flags: " << flags; + span.setAttribute(txq_span::attr::txqStatus, txq_span::val::queued); return {terQUEUED, false}; } @@ -1366,18 +1379,21 @@ TxQ::processClosedLedger(Application& app, ReadView const& view, bool timeLeap) maxSize_ = std::max(snapshot.txnsExpected * setup_.ledgersInQueue, setup_.queueSizeMin); // Remove any queued candidates whose LastLedgerSequence has gone by.
+ std::int64_t expiredCount = 0; for (auto candidateIter = byFee_.begin(); candidateIter != byFee_.end();) { if (candidateIter->lastValid && *candidateIter->lastValid <= ledgerSeq) { byAccount_.at(candidateIter->account).dropPenalty = true; candidateIter = erase(candidateIter); + ++expiredCount; } else { ++candidateIter; } } + span.setAttribute(txq_span::attr::expiredCount, expiredCount); // Remove any TxQAccounts that don't have candidates // under them diff --git a/src/xrpld/app/misc/detail/TxQSpanNames.h b/src/xrpld/app/misc/detail/TxQSpanNames.h index 9292ba1e7c..3f8f86aa30 100644 --- a/src/xrpld/app/misc/detail/TxQSpanNames.h +++ b/src/xrpld/app/misc/detail/TxQSpanNames.h @@ -15,12 +15,14 @@ * | +--------------------------------------------------+ | * | | txq.enqueue | | * | | TxQ::apply() | | - * | | attrs: tx_hash, status, fee_level | | + * | | attrs: tx_hash, tx_type, txq_status, | | + * | | fee_level_paid, required_fee_level | | * | | | | * | | +-------------------+ +----------------------+ | | * | | | txq.apply_direct | | txq.batch_clear | | | * | | | tryDirectApply() | | tryClearAccount...() | | | - * | | +-------------------+ +----------------------+ | | + * | | +-------------------+ | attrs: num_cleared | | | + * | | +----------------------+ | | * | +--------------------------------------------------+ | * +-------------------------------------------------------+ * From 3167a49f41e25877c938268716ea2b7024292dac Mon Sep 17 00:00:00 2001 From: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> Date: Fri, 5 Jun 2026 12:42:53 +0100 Subject: [PATCH 9/9] feat(telemetry): derive per-stage tx metrics from apply-pipeline spans MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wire the apply-pipeline stage spans (tx.preflight, tx.preclaim, tx.transactor) added on phase-3 through the observability stack so the spanmetrics connector produces per-stage RED metrics without any native instruments. 
- collector: add the `stage` dimension to the spanmetrics connector so the three stages split into separate metric series (3 bounded values). - dashboard: add a "Tx Apply Pipeline" section to transaction-overview with rate, p95 latency, and failure-rate panels grouped by stage, plus a `stage` template variable. Panels follow the existing config (node filter, exported_instance legends, Title Case, axis labels). - The failure panel filters ter_result != tesSUCCESS rather than span status, because a failing ter code completes the span normally — only thrown exceptions set an error status. This matches the existing "Transaction Results by Type" panel convention. - docs: document the spans, attributes, and stage dimension in the data collection reference and runbook, including the sampling caveat that span-derived metrics inherit tracer head-sampling and undercount at sampling_ratio < 1. Co-Authored-By: Claude Opus 4.8 --- .../09-data-collection-reference.md | 69 +++++--- .../dashboards/transaction-overview.json | 150 ++++++++++++++++++ docker/telemetry/otel-collector-config.yaml | 3 + docs/telemetry-runbook.md | 56 ++++++- 4 files changed, 251 insertions(+), 27 deletions(-) diff --git a/OpenTelemetryPlan/09-data-collection-reference.md b/OpenTelemetryPlan/09-data-collection-reference.md index 2c08c8f9da..0008ac5713 100644 --- a/OpenTelemetryPlan/09-data-collection-reference.md +++ b/OpenTelemetryPlan/09-data-collection-reference.md @@ -102,13 +102,23 @@ Controlled by `trace_rpc=1` in `[telemetry]` config. Controlled by `trace_transactions=1` in `[telemetry]` config. 
-| Span Name | Parent | Source File | Description | -| ------------ | -------------- | --------------- | ----------------------------------------------------------------- | -| `tx.process` | — | NetworkOPs.cpp | Transaction submission entry point (local or peer-relayed) | -| `tx.receive` | — | PeerImp.cpp | Raw transaction received from peer overlay (before deduplication) | -| `tx.apply` | `ledger.build` | BuildLedger.cpp | Transaction set applied to new ledger during consensus | +| Span Name | Parent | Source File | Description | +| --------------- | -------------- | --------------- | ----------------------------------------------------------------- | +| `tx.process` | — | NetworkOPs.cpp | Transaction submission entry point (local or peer-relayed) | +| `tx.receive` | — | PeerImp.cpp | Raw transaction received from peer overlay (before deduplication) | +| `tx.apply` | `ledger.build` | BuildLedger.cpp | Transaction set applied to new ledger during consensus | +| `tx.preflight` | — | applySteps.cpp | Stateless checks stage (`stage=preflight`) | +| `tx.preclaim` | — | applySteps.cpp | Ledger-aware checks stage before fee claim (`stage=preclaim`) | +| `tx.transactor` | — | Transactor.cpp | Apply stage — the transactor runs (`stage=apply`) | + +The three apply-pipeline spans share a deterministic `trace_id` derived from +`txID[0:16]`, so preflight, preclaim, and transactor for one transaction group +under a single trace even though they run sequentially and often on different +threads. A transaction that hard-fails preflight or preclaim never reaches the +later spans — the `stage` attribute identifies where it stopped. 
**Where to find**: Tempo → TraceQL: `{resource.service.name="xrpld" && name=~"tx.process|tx.receive"}` +or, for the apply pipeline: `{resource.service.name="xrpld" && name=~"tx.preflight|tx.preclaim|tx.transactor"}` **Grafana dashboard**: _Transaction Overview_ (`xrpld-transactions`) @@ -229,15 +239,19 @@ Every span can carry key-value attributes that provide context for filtering and #### Transaction Attributes -| Attribute | Type | Set On | Description | -| ------------------- | ------- | -------------------------- | ---------------------------------------------------- | -| `xrpl.tx.hash` | string | `tx.process`, `tx.receive` | Transaction hash (hex-encoded) | -| `local` | boolean | `tx.process` | `true` if locally submitted, `false` if peer-relayed | -| `path` | string | `tx.process` | Submission path: `"sync"` or `"async"` | -| `suppressed` | boolean | `tx.receive` | `true` if transaction was suppressed (duplicate) | -| `tx_status` | string | `tx.receive` | Transaction status (e.g., `"known_bad"`) | -| `xrpl.peer.id` | int64 | `tx.receive` | Peer identifier (also set on peer spans) | -| `xrpl.peer.version` | string | `tx.receive` | Peer protocol version string | +| Attribute | Type | Set On | Description | +| ------------------- | ------- | ---------------------------------------------- | --------------------------------------------------------------------- | +| `xrpl.tx.hash` | string | `tx.process`, `tx.receive` | Transaction hash (hex-encoded) | +| `local` | boolean | `tx.process` | `true` if locally submitted, `false` if peer-relayed | +| `path` | string | `tx.process` | Submission path: `"sync"` or `"async"` | +| `suppressed` | boolean | `tx.receive` | `true` if transaction was suppressed (duplicate) | +| `tx_status` | string | `tx.receive` | Transaction status (e.g., `"known_bad"`) | +| `xrpl.peer.id` | int64 | `tx.receive` | Peer identifier (also set on peer spans) | +| `xrpl.peer.version` | string | `tx.receive` | Peer protocol version string | +| 
`stage` | string | `tx.preflight`, `tx.preclaim`, `tx.transactor` | Apply-pipeline stage: `preflight`, `preclaim`, or `apply` | +| `tx_type` | string | `tx.preflight`, `tx.preclaim`, `tx.transactor` | Transaction type name (e.g., `Payment`) | +| `ter_result` | string | `tx.preflight`, `tx.preclaim`, `tx.transactor` | Engine result token for that stage (e.g., `tesSUCCESS`, `terPRE_SEQ`) | +| `applied` | boolean | `tx.transactor` | `true` if the transaction was applied to the ledger | **Tempo query**: `{span.xrpl.tx.hash="<tx_hash>"}` to trace a specific transaction across nodes. @@ -375,14 +389,25 @@ The OTel Collector's SpanMetrics connector automatically generates RED (Rate, Er **Additional dimension labels** (configured in `otel-collector-config.yaml`): -| Span Attribute | Prometheus Label | Applies To | -| --------------------- | ------------------------------ | ------------------------- | -| `command` | `xrpl_rpc_command` | `rpc.command.*` | -| `rpc_status` | `xrpl_rpc_status` | `rpc.command.*` | -| `xrpl.consensus.mode` | `xrpl_consensus_mode` | `consensus.ledger_close` | -| `local` | `xrpl_tx_local` | `tx.process` | -| `proposal_trusted` | `xrpl_peer_proposal_trusted` | `peer.proposal.receive` | -| `validation_trusted` | `xrpl_peer_validation_trusted` | `peer.validation.receive` | +| Span Attribute | Prometheus Label | Applies To | +| --------------------- | ------------------------------ | ---------------------------------------------- | +| `command` | `xrpl_rpc_command` | `rpc.command.*` | +| `rpc_status` | `xrpl_rpc_status` | `rpc.command.*` | +| `xrpl.consensus.mode` | `xrpl_consensus_mode` | `consensus.ledger_close` | +| `local` | `xrpl_tx_local` | `tx.process` | +| `proposal_trusted` | `xrpl_peer_proposal_trusted` | `peer.proposal.receive` | +| `validation_trusted` | `xrpl_peer_validation_trusted` | `peer.validation.receive` | +| `stage` | `stage` | `tx.preflight`, `tx.preclaim`, `tx.transactor` | + +The `stage` dimension (3 values: `preflight`, `preclaim`,
`apply`) turns the +apply-pipeline spans into per-stage RED metrics with no native instruments — the +_Transaction Overview_ dashboard charts rate, p95 latency, and failure rate by stage. + +> **Sampling caveat**: span-derived metrics inherit the **tracer head-sampling** +> ratio (`sampling_ratio` in `[telemetry]`, via `TraceIdRatioBasedSampler`). At +> `sampling_ratio < 1.0` the stage RED metrics undercount proportionally — they +> reflect sampled traces, not the full transaction volume. Native StatsD/meter +> metrics do not sample. Account for this when reading absolute stage rates. **Where to query**: Prometheus → `traces_span_metrics_calls_total{span_name="rpc.command.server_info"}` diff --git a/docker/telemetry/grafana/dashboards/transaction-overview.json b/docker/telemetry/grafana/dashboards/transaction-overview.json index ab17be236c..5423a8e4fb 100644 --- a/docker/telemetry/grafana/dashboards/transaction-overview.json +++ b/docker/telemetry/grafana/dashboards/transaction-overview.json @@ -669,6 +669,138 @@ }, "overrides": [] } + }, + { + "title": "Tx Apply Pipeline Rate by Stage", + "description": "Span rate for each apply-pipeline stage (preflight, preclaim, apply). A drop between stages shows where transactions are filtered out. 
Requires the stage dimension in spanmetrics.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 64 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + }, + "legend": { + "displayMode": "table", + "placement": "right", + "calcs": ["mean", "max"] + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum by (stage, exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=~\"tx.preflight|tx.preclaim|tx.transactor\", stage=~\"$stage\"}[5m]))", + "legendFormat": "{{stage}} [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "custom": { + "axisLabel": "Spans / Sec", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "Tx Apply Pipeline Latency by Stage (p95)", + "description": "95th-percentile duration of each apply-pipeline stage. Isolates which stage (preflight, preclaim, apply) dominates transaction processing time.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 64 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + }, + "legend": { + "displayMode": "table", + "placement": "right", + "calcs": ["mean", "max"] + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "histogram_quantile(0.95, sum by (le, stage, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=~\"tx.preflight|tx.preclaim|tx.transactor\", stage=~\"$stage\"}[5m])))", + "legendFormat": "P95 {{stage}} [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ms", + "custom": { + "axisLabel": "Duration (ms)", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "Tx Apply Pipeline Failure Rate by Stage", + "description": "Rate 
of apply-pipeline spans whose ter_result is not tesSUCCESS, split by stage. Shows whether failures concentrate in preflight, preclaim, or apply. Filters on ter_result rather than span status because a failing ter code completes the span normally; only thrown exceptions set an error status.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 72 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + }, + "legend": { + "displayMode": "table", + "placement": "right", + "calcs": ["mean", "max"] + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum by (stage, exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=~\"tx.preflight|tx.preclaim|tx.transactor\", stage=~\"$stage\", ter_result!~\"tesSUCCESS|\"}[5m]))", + "legendFormat": "{{stage}} [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "custom": { + "axisLabel": "Failed Spans / Sec", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } } ], "schemaVersion": 39, @@ -768,6 +900,24 @@ }, "sort": 1, "label": "Queue Status" + }, + { + "name": "stage", + "type": "query", + "datasource": { + "type": "prometheus" + }, + "query": "label_values(traces_span_metrics_calls_total{span_name=~\"tx.preflight|tx.preclaim|tx.transactor\", stage!=\"\"}, stage)", + "refresh": 2, + "includeAll": true, + "multi": true, + "allValue": ".*", + "current": { + "text": "All", + "value": "$__all" + }, + "sort": 1, + "label": "Apply Stage" } ] }, diff --git a/docker/telemetry/otel-collector-config.yaml b/docker/telemetry/otel-collector-config.yaml index 36112253b8..01a36e3375 100644 --- a/docker/telemetry/otel-collector-config.yaml +++ b/docker/telemetry/otel-collector-config.yaml @@ -59,6 +59,9 @@ connectors: - name: validation_trusted - name: tx_type - name: ter_result + # Apply-pipeline stage (preflight|preclaim|apply) — 
splits the + # tx.preflight/tx.preclaim/tx.transactor span RED metrics per stage. + - name: stage - name: txq_status - name: consensus_state - name: load_type diff --git a/docs/telemetry-runbook.md b/docs/telemetry-runbook.md index b7fc0605b3..39324f5f56 100644 --- a/docs/telemetry-runbook.md +++ b/docs/telemetry-runbook.md @@ -74,11 +74,20 @@ All spans instrumented in xrpld, grouped by subsystem: ### Transaction Spans (Phase 3) -| Span Name | Source File | Attributes | Description | -| ------------ | --------------- | --------------------------------------------------------------------------------- | ------------------------------------- | -| `tx.process` | NetworkOPs.cpp | `tx_hash`, `local`, `path`, `tx_type`, `fee`, `sequence`, `ter_result`, `applied` | Transaction submission and processing | -| `tx.receive` | PeerImp.cpp | `peer_id`, `tx_hash`, `tx_type`, `peer_version`, `suppressed`, `tx_status` | Transaction received from peer relay | -| `tx.apply` | BuildLedger.cpp | `ledger_seq`, `tx_count`, `tx_failed` | Transaction set applied per ledger | +| Span Name | Source File | Attributes | Description | +| --------------- | --------------- | --------------------------------------------------------------------------------- | ------------------------------------- | +| `tx.process` | NetworkOPs.cpp | `tx_hash`, `local`, `path`, `tx_type`, `fee`, `sequence`, `ter_result`, `applied` | Transaction submission and processing | +| `tx.receive` | PeerImp.cpp | `peer_id`, `tx_hash`, `tx_type`, `peer_version`, `suppressed`, `tx_status` | Transaction received from peer relay | +| `tx.apply` | BuildLedger.cpp | `ledger_seq`, `tx_count`, `tx_failed` | Transaction set applied per ledger | +| `tx.preflight` | applySteps.cpp | `stage`, `tx_type`, `ter_result` | Stateless checks stage | +| `tx.preclaim` | applySteps.cpp | `stage`, `tx_type`, `ter_result` | Ledger-aware checks stage | +| `tx.transactor` | Transactor.cpp | `stage`, `tx_type`, `ter_result`, `applied` | Apply stage 
(transactor runs) | + +The three apply-pipeline spans (`tx.preflight`, `tx.preclaim`, `tx.transactor`) +share a deterministic `trace_id` from `txID[0:16]`, so they group under one +trace per transaction. The `stage` attribute (`preflight` / `preclaim` / +`apply`) drives the collector spanmetrics `stage` dimension, giving per-stage +RED metrics on the _Transaction Overview_ dashboard. ### Transaction Queue Spans (Phase 3) @@ -182,6 +191,43 @@ This section shows what questions you can answer using the span attributes, with {name=~"tx\\..*"} | tx_type = "NFTokenMint" ``` +### Apply Pipeline by Stage + +``` +# All three stages of one transaction (preflight -> preclaim -> apply) +{name=~"tx.preflight|tx.preclaim|tx.transactor"} + +# Transactions that failed at the preclaim stage +{name="tx.preclaim"} | ter_result != "tesSUCCESS" + +# Transactions that hard-failed preflight (never reached preclaim/apply) +{name="tx.preflight"} | ter_result != "tesSUCCESS" +``` + +PromQL on the span-derived metrics (dashboard: _Transaction Overview_): + +``` +# Per-stage throughput — the funnel preflight >= preclaim >= apply +sum by (stage) (rate(traces_span_metrics_calls_total{span_name=~"tx.preflight|tx.preclaim|tx.transactor"}[5m])) + +# Per-stage p95 latency +histogram_quantile(0.95, sum by (le, stage) (rate(traces_span_metrics_duration_milliseconds_bucket{span_name=~"tx.preflight|tx.preclaim|tx.transactor"}[5m]))) + +# Per-stage failure rate (ter_result != tesSUCCESS; a failing ter completes the +# span normally, so filter on the attribute, not status_code which only flags exceptions) +sum by (stage) (rate(traces_span_metrics_calls_total{span_name=~"tx.preflight|tx.preclaim|tx.transactor", ter_result!~"tesSUCCESS|"}[5m])) +``` + +> **Alerting**: a rising `tx.preflight` / `tx.preclaim` failure rate points to +> malformed or stale-sequence submissions (often spam or a misbehaving client); +> a rising `tx.transactor` failure rate points to apply-time problems. 
Alert per +> stage rather than on a single aggregate so the failing stage is obvious. + +> **Sampling caveat**: these stage metrics are span-derived and inherit the +> **tracer head-sampling** ratio (`sampling_ratio`). At `sampling_ratio < 1.0` +> they undercount proportionally — treat them as relative trends, not absolute +> transaction counts. Native StatsD metrics are unsampled. + ### Transaction Queue Health ```