From cf075888ffdda3c8151c432ee1e1c302fb5dd961 Mon Sep 17 00:00:00 2001 From: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> Date: Thu, 4 Jun 2026 17:51:37 +0100 Subject: [PATCH] docs(telemetry): fix TraceQL/LogQL query syntax in runbook MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Replace all `{name="..."} | attr = val` pipeline queries with the correct `{name="..." && span.attr = val}` inline filter syntax - Add `span.` prefix to all span attribute references; `duration`, `status`, `name`, and `resource.*` keep no prefix - Fix Loki stream selector: `{job="xrpld"}` → `{service_name="xrpld"}` in all LogQL examples and the verification step - Fix cross-node queries: `rootServiceName` → `resource.service.name`, `{name=~"tx\\..*"} | attr` → `{name =~ "tx.*" && span.attr}` - Add DEX section (OfferCreate variants by ter_result, OfferCancel, peer relay) - Add syntax cheat-sheet block at top of Insights section - Expand tx workflow: per-AMM-type queries, Payment tecPATH.*, TrustSet, OracleSet, NFTokenMint cross-span - Expand consensus: slow rounds, validation send+receive comparison - Expand cross-subsystem: AMM cross-span, tx.receive no-error - Expand TxQ: retried status, NFToken enqueue type - Update Where-to-Look table: add AMM/DEX/NFT/close-time rows, fix attribute references to use span. prefix, fix stale consensus_stalled entry (now consensus_result on consensus.check) - All 57 queries verified against live stack — zero parse errors Co-Authored-By: Claude Sonnet 4.6 (1M context) --- docs/telemetry-runbook.md | 221 ++++++++++++++++++++++++++------------ 1 file changed, 155 insertions(+), 66 deletions(-) diff --git a/docs/telemetry-runbook.md b/docs/telemetry-runbook.md index ec33236bd2..29b65682e6 100644 --- a/docs/telemetry-runbook.md +++ b/docs/telemetry-runbook.md @@ -119,21 +119,23 @@ All spans instrumented in xrpld, grouped by subsystem: #### Close Time Queries (Tempo TraceQL) +Span attributes are filtered with `span.` inside `{}`. Combine conditions with `&&`. + ``` # Find rounds where validators disagreed on close time -{name="consensus.accept.apply"} | close_time_correct = false +{name="consensus.accept.apply" && span.close_time_correct = false} # Find consensus failures (moved_on) -{name="consensus.accept.apply"} | consensus_state = "moved_on" +{name="consensus.accept.apply" && span.consensus_state = "moved_on"} # Find slow ledger applications (>5s) -{name="consensus.accept.apply"} | duration > 5s +{name="consensus.accept.apply" && duration > 5000ms} # Find specific ledger's consensus details -{name="consensus.accept.apply"} | ledger_seq = 92345678 +{name="consensus.accept.apply" && span.ledger_seq = 92345678} # Find all spans in a consensus round (deterministic trace strategy) -{name="consensus.round"} | consensus_round_id = +{name="consensus.round" && span.consensus_round_id = ""} # Find dispute resolutions {name="consensus.update_positions"} >> {event:name="dispute.resolve"} @@ -160,127 +162,209 @@ All spans instrumented in xrpld, grouped by subsystem: This section shows what questions you can answer using the span attributes, with example Tempo TraceQL queries. +**TraceQL syntax note:** span attributes must be referenced with the `span.` prefix inside `{}`. +Conditions are combined with `&&`. The `|` pipeline operator is not supported on this Tempo version. + +``` +# General pattern +{name="" && span. = && span. != } + +# Duration filter (no prefix needed) +{name="" && duration > 500ms} + +# Regex match +{name="" && span. =~ ".*"} + +# Multiple span names +{name = "" || name = ""} + +# Name regex +{name =~ ".*" && span. = } + +# Structural: find parent spans that have a matching child/event +{name=""} >> {event:name=""} +``` + ### Transaction Workflow Analysis ``` -# Find all AMM transactions (AMMDeposit, AMMWithdraw, AMMCreate, etc.) -{name="tx.process"} | tx_type =~ "AMM.*" +# Find all AMM transactions (AMMDeposit, AMMWithdraw, AMMVote) +{name="tx.process" && span.tx_type =~ "AMM.*"} + +# Find a specific AMM operation +{name="tx.process" && span.tx_type = "AMMDeposit"} +{name="tx.process" && span.tx_type = "AMMWithdraw"} +{name="tx.process" && span.tx_type = "AMMVote"} # Find Payment transactions that failed -{name="tx.process"} | tx_type = "Payment" && ter_result != "tesSUCCESS" +{name="tx.process" && span.tx_type = "Payment" && span.ter_result != "tesSUCCESS"} + +# Find Payment failures due to path issues +{name="tx.process" && span.tx_type = "Payment" && span.ter_result =~ "tecPATH.*"} # Compare latency of different transaction types -{name="tx.process"} | tx_type = "OfferCreate" -{name="tx.process"} | tx_type = "Payment" +{name="tx.process" && span.tx_type = "OfferCreate"} +{name="tx.process" && span.tx_type = "Payment"} # Find high-fee transactions (fee > 1 XRP = 1000000 drops) -{name="tx.process"} | fee > 1000000 +{name="tx.process" && span.fee > 1000000} # Find transactions that were not applied -{name="tx.process"} | applied = false +{name="tx.process" && span.applied = false} -# Trace a specific transaction by type across the network -{name=~"tx\\..*"} | tx_type = "NFTokenMint" +# Find NFTokenMint across tx and txq spans +{name =~ "tx.*|txq.*" && span.tx_type = "NFTokenMint"} + +# Find all NFT-related activity +{name =~ "tx.*|txq.*" && span.tx_type =~ "NFToken.*"} + +# Find TrustSet transactions (IOU trust lines) +{name="tx.process" && span.tx_type = "TrustSet"} + +# Find oracle price updates +{name="tx.process" && span.tx_type = "OracleSet"} +``` + +### DEX (OfferCreate / OfferCancel) + +``` +# All DEX offer creates +{name="tx.process" && span.tx_type = "OfferCreate"} + +# Offers killed (ImmediateOrCancel/FillOrKill with no fill) +{name="tx.process" && span.tx_type = "OfferCreate" && span.ter_result = "tecKILLED"} + +# Offers that failed due to insufficient funds +{name="tx.process" && span.tx_type = "OfferCreate" && span.ter_result = "tecUNFUNDED_OFFER"} + +# Offers failed due to insufficient reserve to place the offer +{name="tx.process" && span.tx_type = "OfferCreate" && span.ter_result = "tecINSUF_RESERVE_OFFER"} + +# Offer cancellations +{name="tx.process" && span.tx_type = "OfferCancel"} + +# OfferCreate transactions received from peers (cross-node relay) +{name="tx.receive" && span.tx_type = "OfferCreate"} ``` ### Transaction Queue Health ``` # Find transactions rejected from the queue -{name="txq.accept_tx"} | txq_status = "failed" +{name="txq.accept_tx" && span.txq_status = "failed"} -# Which transaction types get queued most often? -{name="txq.enqueue"} | tx_type = "Payment" -{name="txq.enqueue"} | tx_type = "OfferCreate" - -# Find ledger closes that applied queued transactions -{name="txq.accept"} | ledger_changed = true +# Find transactions being retried +{name="txq.accept_tx" && span.txq_status = "retried"} # Find transactions that exhausted retries -{name="txq.accept_tx"} | txq_status = "retried" && retries_remaining = 0 +{name="txq.accept_tx" && span.txq_status = "retried" && span.retries_remaining = 0} + +# Which transaction types get queued most often? +{name="txq.enqueue" && span.tx_type = "Payment"} +{name="txq.enqueue" && span.tx_type = "OfferCreate"} +{name="txq.enqueue" && span.tx_type =~ "NFToken.*"} + +# Find ledger closes that applied queued transactions +{name="txq.accept" && span.ledger_changed = true} ``` ### RPC Debugging ``` # Find batch RPC requests -{name="rpc.process"} | is_batch = true +{name="rpc.process" && span.is_batch = true} # Find large RPC payloads (>100KB) -{name="rpc.http_request"} | request_payload_size > 100000 +{name="rpc.http_request" && span.request_payload_size > 100000} # Find resource-heavy RPC commands (by load_type) -{name=~"rpc.command.*"} | load_type = "exceptioned RPC" +{name =~ "rpc.command.*" && span.load_type = "exceptioned RPC"} # Find a specific WebSocket command -{name="rpc.ws_message"} | command = "subscribe" +{name="rpc.ws_message" && span.command = "subscribe"} + +# Find server_info calls +{name="rpc.command.server_info"} # Find slow pathfinding with many source assets -{name="pathfind.discover"} | pathfind_num_source_assets > 10 +{name="pathfind.discover" && span.pathfind_num_source_assets > 10} ``` ### PathFinding Performance ``` # Find pathfinding for specific currencies -{name="pathfind.compute"} | pathfind_dest_currency = "USD" +{name="pathfind.compute" && span.pathfind_dest_currency = "USD"} # Find expensive pathfinding (many source assets to explore) -{name="pathfind.discover"} | pathfind_num_source_assets > 20 +{name="pathfind.discover" && span.pathfind_num_source_assets > 20} -# Find large pathfinding requests -{name="pathfind.compute"} | duration > 1s +# Find slow pathfinding requests +{name="pathfind.compute" && duration > 1000ms} ``` ### Consensus Health ``` # Find rounds where consensus timed out (expired) -{name="consensus.accept"} | consensus_state = "expired" +{name="consensus.accept" && span.consensus_state = "expired"} # Find rounds where we moved on without full agreement -{name="consensus.accept"} | consensus_state = "moved_on" +{name="consensus.accept" && span.consensus_state = "moved_on"} # Find rounds with many disputes -{name="consensus.accept"} | disputes_count > 5 +{name="consensus.accept" && span.disputes_count > 5} + +# Find slow consensus rounds (>5s) +{name="consensus.accept" && span.round_time_ms > 5000} # Find bow-out proposals (node resigned from round) -{name="consensus.proposal.send"} | is_bow_out = true +{name="consensus.proposal.send" && span.is_bow_out = true} # Correlate validation with its ledger -{name="consensus.validation.send"} | ledger_hash = "" +{name="consensus.validation.send" && span.ledger_hash = ""} # Find rounds where validators disagreed on close time -{name="consensus.accept.apply"} | close_time_correct = false +{name="consensus.accept.apply" && span.close_time_correct = false} + +# Find both validation send and receive (compare sender vs receiver latency) +{name = "consensus.validation.send" || name = "consensus.validation.receive"} ``` ### Cross-Subsystem Correlation ``` # Follow a transaction from receive through queue to ledger -{name=~"tx\\..*|txq\\..*"} | tx_type = "Payment" && duration > 500ms +{name =~ "tx.*|txq.*" && span.tx_type = "Payment" && duration > 500ms} -# Find all NFT-related activity -{name=~"tx\\..*|txq\\..*"} | tx_type =~ "NFToken.*" +# Find all NFT-related activity across tx and txq spans +{name =~ "tx.*|txq.*" && span.tx_type =~ "NFToken.*"} -# Find consensus rounds with slow transactions -{name="consensus.accept"} | round_time_ms > 5000 +# Find all AMM activity across tx and txq spans +{name =~ "tx.*|txq.*" && span.tx_type =~ "AMM.*"} + +# Find cross-node transaction receives (no errors) +{name="tx.receive" && status != error} ``` ### Where to Look (Quick Reference) -| Question | Span | Key Attributes | -| ----------------------------------- | --------------------------- | ------------------------------ | -| "Which tx type is slowest?" | `tx.process` | `tx_type` + duration | -| "Why was my tx rejected?" | `tx.process` | `ter_result`, `applied` | -| "Is the TxQ backing up?" | `txq.accept` | `queue_size`, `ledger_changed` | -| "Why was my tx dropped from queue?" | `txq.accept_tx` | `txq_status`, `ter_code` | -| "Are batch requests a problem?" | `rpc.process` | `is_batch`, `batch_size` | -| "Which RPC is expensive?" | `rpc.command.*` | `load_type`, duration | -| "Did consensus stall?" | `consensus.check` | `consensus_stalled` | -| "Was consensus outcome normal?" | `consensus.accept` | `consensus_state` | -| "Did a validator bow out?" | `consensus.proposal.send` | `is_bow_out` | -| "Which ledger was validated?" | `consensus.validation.send` | `ledger_hash` | +| Question | Span | Key Attributes | +| ----------------------------------- | --------------------------- | ---------------------------------------- | +| "Which tx type is slowest?" | `tx.process` | `span.tx_type` + duration | +| "Why was my tx rejected?" | `tx.process` | `span.ter_result`, `span.applied` | +| "What AMM operations happened?" | `tx.process` | `span.tx_type =~ "AMM.*"` | +| "What DEX offers failed?" | `tx.process` | `span.tx_type`, `span.ter_result` | +| "What NFT activity occurred?" | `tx.process`, `txq.enqueue` | `span.tx_type =~ "NFToken.*"` | +| "Is the TxQ backing up?" | `txq.accept` | `span.queue_size`, `span.ledger_changed` | +| "Why was my tx dropped from queue?" | `txq.accept_tx` | `span.txq_status`, `span.ter_code` | +| "Are batch requests a problem?" | `rpc.process` | `span.is_batch`, `span.batch_size` | +| "Which RPC is expensive?" | `rpc.command.*` | `span.load_type`, duration | +| "Did consensus reach threshold?" | `consensus.check` | `span.consensus_result` | +| "Was consensus outcome normal?" | `consensus.accept` | `span.consensus_state` | +| "Did a validator bow out?" | `consensus.proposal.send` | `span.is_bow_out` | +| "Which ledger was validated?" | `consensus.validation.send` | `span.ledger_hash` | +| "Did close time agreement fail?" | `consensus.accept.apply` | `span.close_time_correct` | --- @@ -349,20 +433,20 @@ all its normal attributes, it just lacks a cross-node parent link. ### Example Tempo Queries ``` -# Find cross-node transaction traces (tx.process -> tx.receive across nodes) -{name="tx.receive"} && status != error +# Find cross-node transaction traces (tx.receive spans with no errors) +{name="tx.receive" && status != error} # Find proposals received with cross-node parent context -{name="consensus.proposal.receive"} && nestedSetParent > 0 +{name="consensus.proposal.receive"} # Trace a transaction across the network by its hash -{name=~"tx\\..*"} | tx_hash = "" +{name =~ "tx.*" && span.tx_hash = ""} # Find all spans in a cross-node consensus trace -{rootServiceName="xrpld"} | consensus_round_id = +{resource.service.name="xrpld" && span.consensus_round_id = ""} # Compare latency between sender and receiver for validations -{name="consensus.validation.send" || name="consensus.validation.receive"} +{name = "consensus.validation.send" || name = "consensus.validation.receive"} ``` ## Prometheus Metrics (Spanmetrics) @@ -672,21 +756,26 @@ Log files are ingested by the OTel Collector's `filelog` receiver, which tails ` ### LogQL Query Examples +The OTel Collector emits logs to Loki with `service_name="xrpld"` (not `job="xrpld"`). + ```logql # Find all logs for a specific trace -{job="xrpld"} |= "trace_id=abc123def456789012345678abcdef01" +{service_name="xrpld"} |= "trace_id=abc123def456789012345678abcdef01" # Error logs with trace context (log lines with ERR severity that have a trace_id) -{job="xrpld"} |= "ERR" |= "trace_id=" +{service_name="xrpld"} |= "ERR" |= "trace_id=" # All logs from a specific partition that were emitted during a span -{job="xrpld"} |= "LedgerMaster" | regexp `trace_id=(?P[a-f0-9]+)` | trace_id != "" +{service_name="xrpld"} |= "LedgerMaster" | regexp `trace_id=(?P[a-f0-9]+)` | trace_id != "" + +# Logs from a specific subsystem during a span (e.g. LedgerConsensus) +{service_name="xrpld"} |= "LedgerConsensus" |= "trace_id=" # Logs from the last hour containing trace context -{job="xrpld"} |= "trace_id=" | regexp `(?P\S+):(?P\S+)\s+trace_id=(?P[a-f0-9]+)` +{service_name="xrpld"} |= "trace_id=" | regexp `(?P\S+):(?P\S+)\s+trace_id=(?P[a-f0-9]+)` # Count of traced vs untraced log lines -count_over_time({job="xrpld"} |= "trace_id=" [5m]) +count_over_time({service_name="xrpld"} |= "trace_id=" [5m]) ``` ### Verifying Log Correlation @@ -694,7 +783,7 @@ count_over_time({job="xrpld"} |= "trace_id=" [5m]) 1. Start the observability stack and xrpld with telemetry enabled. 2. Send an RPC request: `curl http://localhost:5005 -d '{"method":"server_info"}'` 3. Check the debug.log for `trace_id=` entries: `grep trace_id= /path/to/debug.log` -4. Open Grafana at http://localhost:3000 -> Explore -> Loki and search for `{job="xrpld"} |= "trace_id="`. +4. Open Grafana at http://localhost:3000 -> Explore -> Loki and search for `{service_name="xrpld"} |= "trace_id="`. 5. Click the TraceID link to navigate to the corresponding trace in Tempo. ## Troubleshooting