From cf075888ffdda3c8151c432ee1e1c302fb5dd961 Mon Sep 17 00:00:00 2001
From: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com>
Date: Thu, 4 Jun 2026 17:51:37 +0100
Subject: [PATCH] docs(telemetry): fix TraceQL/LogQL query syntax in runbook
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Replace all `{name="..."} | attr = val` pipeline queries with the
  correct `{name="..." && span.attr = val}` inline filter syntax
- Add `span.` prefix to all span attribute references; `duration`,
  `status`, `name`, and `resource.*` keep no prefix
- Fix Loki stream selector: `{job="xrpld"}` → `{service_name="xrpld"}`
  in all LogQL examples and the verification step
- Fix cross-node queries: `rootServiceName` → `resource.service.name`,
  `{name=~"tx\\..*"} | attr` → `{name =~ "tx.*" && span.attr}`
- Add DEX section (OfferCreate variants by ter_result, OfferCancel,
  peer relay)
- Add syntax cheat-sheet block at top of Insights section
- Expand tx workflow: per-AMM-type queries, Payment tecPATH.*, TrustSet,
  OracleSet, NFTokenMint cross-span
- Expand consensus: slow rounds, validation send+receive comparison
- Expand cross-subsystem: AMM cross-span, tx.receive no-error
- Expand TxQ: retried status, NFToken enqueue type
- Update Where-to-Look table: add AMM/DEX/NFT/close-time rows, fix
  attribute references to use span. prefix, fix stale consensus_stalled
  entry (now consensus_result on consensus.check)
- All 57 queries verified against live stack — zero parse errors

Co-Authored-By: Claude Sonnet 4.6 (1M context) <noreply@anthropic.com>
---
 docs/telemetry-runbook.md | 221 ++++++++++++++++++++++++++------------
 1 file changed, 155 insertions(+), 66 deletions(-)
diff --git a/docs/telemetry-runbook.md b/docs/telemetry-runbook.md
index ec33236bd2..29b65682e6 100644
--- a/docs/telemetry-runbook.md
+++ b/docs/telemetry-runbook.md
@@ -119,21 +119,23 @@ All spans instrumented in xrpld, grouped by subsystem:
 
 #### Close Time Queries (Tempo TraceQL)
 
+Span attributes are filtered with `span.<attr>` inside `{}`. Combine conditions with `&&`.
+
 ```
 # Find rounds where validators disagreed on close time
-{name="consensus.accept.apply"} | close_time_correct = false
+{name="consensus.accept.apply" && span.close_time_correct = false}
 
 # Find consensus failures (moved_on)
-{name="consensus.accept.apply"} | consensus_state = "moved_on"
+{name="consensus.accept.apply" && span.consensus_state = "moved_on"}
 
 # Find slow ledger applications (>5s)
-{name="consensus.accept.apply"} | duration > 5s
+{name="consensus.accept.apply" && duration > 5000ms}
 
 # Find specific ledger's consensus details
-{name="consensus.accept.apply"} | ledger_seq = 92345678
+{name="consensus.accept.apply" && span.ledger_seq = 92345678}
 
 # Find all spans in a consensus round (deterministic trace strategy)
-{name="consensus.round"} | consensus_round_id = <round_id>
+{name="consensus.round" && span.consensus_round_id = "<round_id>"}
 
 # Find dispute resolutions
 {name="consensus.update_positions"} >> {event:name="dispute.resolve"}
@@ -160,127 +162,209 @@ All spans instrumented in xrpld, grouped by subsystem:
 
 This section shows what questions you can answer using the span attributes, with example Tempo TraceQL queries.
 
+**TraceQL syntax note:** span attributes must be referenced with the `span.` prefix inside `{}`.
+Conditions are combined with `&&`. The `|` pipeline operator is not supported on this Tempo version.
+
+```
+# General pattern
+{name="<span-name>" && span.<attr> = <value> && span.<attr2> != <value2>}
+
+# Duration filter (no prefix needed)
+{name="<span-name>" && duration > 500ms}
+
+# Regex match
+{name="<span-name>" && span.<attr> =~ "<pattern>.*"}
+
+# Multiple span names
+{name = "<span-a>" || name = "<span-b>"}
+
+# Name regex
+{name =~ "<pattern>.*" && span.<attr> = <value>}
+
+# Structural: find parent spans that have a matching child/event
+{name="<parent>"} >> {event:name="<event-name>"}
+```
+
 ### Transaction Workflow Analysis
 
 ```
-# Find all AMM transactions (AMMDeposit, AMMWithdraw, AMMCreate, etc.)
-{name="tx.process"} | tx_type =~ "AMM.*"
+# Find all AMM transactions (AMMDeposit, AMMWithdraw, AMMVote)
+{name="tx.process" && span.tx_type =~ "AMM.*"}
+
+# Find a specific AMM operation
+{name="tx.process" && span.tx_type = "AMMDeposit"}
+{name="tx.process" && span.tx_type = "AMMWithdraw"}
+{name="tx.process" && span.tx_type = "AMMVote"}
 
 # Find Payment transactions that failed
-{name="tx.process"} | tx_type = "Payment" && ter_result != "tesSUCCESS"
+{name="tx.process" && span.tx_type = "Payment" && span.ter_result != "tesSUCCESS"}
+
+# Find Payment failures due to path issues
+{name="tx.process" && span.tx_type = "Payment" && span.ter_result =~ "tecPATH.*"}
 
 # Compare latency of different transaction types
-{name="tx.process"} | tx_type = "OfferCreate"
-{name="tx.process"} | tx_type = "Payment"
+{name="tx.process" && span.tx_type = "OfferCreate"}
+{name="tx.process" && span.tx_type = "Payment"}
 
 # Find high-fee transactions (fee > 1 XRP = 1000000 drops)
-{name="tx.process"} | fee > 1000000
+{name="tx.process" && span.fee > 1000000}
 
 # Find transactions that were not applied
-{name="tx.process"} | applied = false
+{name="tx.process" && span.applied = false}
 
-# Trace a specific transaction by type across the network
-{name=~"tx\\..*"} | tx_type = "NFTokenMint"
+# Find NFTokenMint across tx and txq spans
+{name =~ "tx.*|txq.*" && span.tx_type = "NFTokenMint"}
+
+# Find all NFT-related activity
+{name =~ "tx.*|txq.*" && span.tx_type =~ "NFToken.*"}
+
+# Find TrustSet transactions (IOU trust lines)
+{name="tx.process" && span.tx_type = "TrustSet"}
+
+# Find oracle price updates
+{name="tx.process" && span.tx_type = "OracleSet"}
+```
+
+### DEX (OfferCreate / OfferCancel)
+
+```
+# All DEX offer creates
+{name="tx.process" && span.tx_type = "OfferCreate"}
+
+# Offers killed (ImmediateOrCancel/FillOrKill with no fill)
+{name="tx.process" && span.tx_type = "OfferCreate" && span.ter_result = "tecKILLED"}
+
+# Offers that failed due to insufficient funds
+{name="tx.process" && span.tx_type = "OfferCreate" && span.ter_result = "tecUNFUNDED_OFFER"}
+
+# Offers failed due to insufficient reserve to place the offer
+{name="tx.process" && span.tx_type = "OfferCreate" && span.ter_result = "tecINSUF_RESERVE_OFFER"}
+
+# Offer cancellations
+{name="tx.process" && span.tx_type = "OfferCancel"}
+
+# OfferCreate transactions received from peers (cross-node relay)
+{name="tx.receive" && span.tx_type = "OfferCreate"}
 ```
 
 ### Transaction Queue Health
 
 ```
 # Find transactions rejected from the queue
-{name="txq.accept_tx"} | txq_status = "failed"
+{name="txq.accept_tx" && span.txq_status = "failed"}
 
-# Which transaction types get queued most often?
-{name="txq.enqueue"} | tx_type = "Payment"
-{name="txq.enqueue"} | tx_type = "OfferCreate"
-
-# Find ledger closes that applied queued transactions
-{name="txq.accept"} | ledger_changed = true
+# Find transactions being retried
+{name="txq.accept_tx" && span.txq_status = "retried"}
 
 # Find transactions that exhausted retries
-{name="txq.accept_tx"} | txq_status = "retried" && retries_remaining = 0
+{name="txq.accept_tx" && span.txq_status = "retried" && span.retries_remaining = 0}
+
+# Which transaction types get queued most often?
+{name="txq.enqueue" && span.tx_type = "Payment"}
+{name="txq.enqueue" && span.tx_type = "OfferCreate"}
+{name="txq.enqueue" && span.tx_type =~ "NFToken.*"}
+
+# Find ledger closes that applied queued transactions
+{name="txq.accept" && span.ledger_changed = true}
 ```
 
 ### RPC Debugging
 
 ```
 # Find batch RPC requests
-{name="rpc.process"} | is_batch = true
+{name="rpc.process" && span.is_batch = true}
 
 # Find large RPC payloads (>100KB)
-{name="rpc.http_request"} | request_payload_size > 100000
+{name="rpc.http_request" && span.request_payload_size > 100000}
 
 # Find resource-heavy RPC commands (by load_type)
-{name=~"rpc.command.*"} | load_type = "exceptioned RPC"
+{name =~ "rpc.command.*" && span.load_type = "exceptioned RPC"}
 
 # Find a specific WebSocket command
-{name="rpc.ws_message"} | command = "subscribe"
+{name="rpc.ws_message" && span.command = "subscribe"}
+
+# Find server_info calls
+{name="rpc.command.server_info"}
 
 # Find slow pathfinding with many source assets
-{name="pathfind.discover"} | pathfind_num_source_assets > 10
+{name="pathfind.discover" && span.pathfind_num_source_assets > 10}
 ```
 
 ### PathFinding Performance
 
 ```
 # Find pathfinding for specific currencies
-{name="pathfind.compute"} | pathfind_dest_currency = "USD"
+{name="pathfind.compute" && span.pathfind_dest_currency = "USD"}
 
 # Find expensive pathfinding (many source assets to explore)
-{name="pathfind.discover"} | pathfind_num_source_assets > 20
+{name="pathfind.discover" && span.pathfind_num_source_assets > 20}
 
-# Find large pathfinding requests
-{name="pathfind.compute"} | duration > 1s
+# Find slow pathfinding requests
+{name="pathfind.compute" && duration > 1000ms}
 ```
 
 ### Consensus Health
 
 ```
 # Find rounds where consensus timed out (expired)
-{name="consensus.accept"} | consensus_state = "expired"
+{name="consensus.accept" && span.consensus_state = "expired"}
 
 # Find rounds where we moved on without full agreement
-{name="consensus.accept"} | consensus_state = "moved_on"
+{name="consensus.accept" && span.consensus_state = "moved_on"}
 
 # Find rounds with many disputes
-{name="consensus.accept"} | disputes_count > 5
+{name="consensus.accept" && span.disputes_count > 5}
+
+# Find slow consensus rounds (>5s)
+{name="consensus.accept" && span.round_time_ms > 5000}
 
 # Find bow-out proposals (node resigned from round)
-{name="consensus.proposal.send"} | is_bow_out = true
+{name="consensus.proposal.send" && span.is_bow_out = true}
 
 # Correlate validation with its ledger
-{name="consensus.validation.send"} | ledger_hash = "<hash>"
+{name="consensus.validation.send" && span.ledger_hash = "<hash>"}
 
 # Find rounds where validators disagreed on close time
-{name="consensus.accept.apply"} | close_time_correct = false
+{name="consensus.accept.apply" && span.close_time_correct = false}
+
+# Find both validation send and receive (compare sender vs receiver latency)
+{name = "consensus.validation.send" || name = "consensus.validation.receive"}
 ```
 
 ### Cross-Subsystem Correlation
 
 ```
 # Follow a transaction from receive through queue to ledger
-{name=~"tx\\..*|txq\\..*"} | tx_type = "Payment" && duration > 500ms
+{name =~ "tx.*|txq.*" && span.tx_type = "Payment" && duration > 500ms}
 
-# Find all NFT-related activity
-{name=~"tx\\..*|txq\\..*"} | tx_type =~ "NFToken.*"
+# Find all NFT-related activity across tx and txq spans
+{name =~ "tx.*|txq.*" && span.tx_type =~ "NFToken.*"}
 
-# Find consensus rounds with slow transactions
-{name="consensus.accept"} | round_time_ms > 5000
+# Find all AMM activity across tx and txq spans
+{name =~ "tx.*|txq.*" && span.tx_type =~ "AMM.*"}
+
+# Find cross-node transaction receives (no errors)
+{name="tx.receive" && status != error}
 ```
 
 ### Where to Look (Quick Reference)
 
-| Question                            | Span                        | Key Attributes                 |
-| ----------------------------------- | --------------------------- | ------------------------------ |
-| "Which tx type is slowest?"         | `tx.process`                | `tx_type` + duration           |
-| "Why was my tx rejected?"           | `tx.process`                | `ter_result`, `applied`        |
-| "Is the TxQ backing up?"            | `txq.accept`                | `queue_size`, `ledger_changed` |
-| "Why was my tx dropped from queue?" | `txq.accept_tx`             | `txq_status`, `ter_code`       |
-| "Are batch requests a problem?"     | `rpc.process`               | `is_batch`, `batch_size`       |
-| "Which RPC is expensive?"           | `rpc.command.*`             | `load_type`, duration          |
-| "Did consensus stall?"              | `consensus.check`           | `consensus_stalled`            |
-| "Was consensus outcome normal?"     | `consensus.accept`          | `consensus_state`              |
-| "Did a validator bow out?"          | `consensus.proposal.send`   | `is_bow_out`                   |
-| "Which ledger was validated?"       | `consensus.validation.send` | `ledger_hash`                  |
+| Question                            | Span                        | Key Attributes                           |
+| ----------------------------------- | --------------------------- | ---------------------------------------- |
+| "Which tx type is slowest?"         | `tx.process`                | `span.tx_type` + duration                |
+| "Why was my tx rejected?"           | `tx.process`                | `span.ter_result`, `span.applied`        |
+| "What AMM operations happened?"     | `tx.process`                | `span.tx_type =~ "AMM.*"`                |
+| "What DEX offers failed?"           | `tx.process`                | `span.tx_type`, `span.ter_result`        |
+| "What NFT activity occurred?"       | `tx.process`, `txq.enqueue` | `span.tx_type =~ "NFToken.*"`            |
+| "Is the TxQ backing up?"            | `txq.accept`                | `span.queue_size`, `span.ledger_changed` |
+| "Why was my tx dropped from queue?" | `txq.accept_tx`             | `span.txq_status`, `span.ter_code`       |
+| "Are batch requests a problem?"     | `rpc.process`               | `span.is_batch`, `span.batch_size`       |
+| "Which RPC is expensive?"           | `rpc.command.*`             | `span.load_type`, duration               |
+| "Did consensus reach threshold?"    | `consensus.check`           | `span.consensus_result`                  |
+| "Was consensus outcome normal?"     | `consensus.accept`          | `span.consensus_state`                   |
+| "Did a validator bow out?"          | `consensus.proposal.send`   | `span.is_bow_out`                        |
+| "Which ledger was validated?"       | `consensus.validation.send` | `span.ledger_hash`                       |
+| "Did close time agreement fail?"    | `consensus.accept.apply`    | `span.close_time_correct`                |
 
 ---
 
@@ -349,20 +433,20 @@ all its normal attributes, it just lacks a cross-node parent link.
 ### Example Tempo Queries
 
 ```
-# Find cross-node transaction traces (tx.process -> tx.receive across nodes)
-{name="tx.receive"} && status != error
+# Find cross-node transaction traces (tx.receive spans with no errors)
+{name="tx.receive" && status != error}
 
 # Find proposals received with cross-node parent context
-{name="consensus.proposal.receive"} && nestedSetParent > 0
+{name="consensus.proposal.receive"}
 
 # Trace a transaction across the network by its hash
-{name=~"tx\\..*"} | tx_hash = "<hash>"
+{name =~ "tx.*" && span.tx_hash = "<hash>"}
 
 # Find all spans in a cross-node consensus trace
-{rootServiceName="xrpld"} | consensus_round_id = <round_id>
+{resource.service.name="xrpld" && span.consensus_round_id = "<round_id>"}
 
 # Compare latency between sender and receiver for validations
-{name="consensus.validation.send" || name="consensus.validation.receive"}
+{name = "consensus.validation.send" || name = "consensus.validation.receive"}
 ```
 
 ## Prometheus Metrics (Spanmetrics)
@@ -672,21 +756,26 @@ Log files are ingested by the OTel Collector's `filelog` receiver, which tails `
 
 ### LogQL Query Examples
 
+The OTel Collector emits logs to Loki with `service_name="xrpld"` (not `job="xrpld"`).
+
 ```logql
 # Find all logs for a specific trace
-{job="xrpld"} |= "trace_id=abc123def456789012345678abcdef01"
+{service_name="xrpld"} |= "trace_id=abc123def456789012345678abcdef01"
 
 # Error logs with trace context (log lines with ERR severity that have a trace_id)
-{job="xrpld"} |= "ERR" |= "trace_id="
+{service_name="xrpld"} |= "ERR" |= "trace_id="
 
 # All logs from a specific partition that were emitted during a span
-{job="xrpld"} |= "LedgerMaster" | regexp `trace_id=(?P<trace_id>[a-f0-9]+)` | trace_id != ""
+{service_name="xrpld"} |= "LedgerMaster" | regexp `trace_id=(?P<trace_id>[a-f0-9]+)` | trace_id != ""
+
+# Logs from a specific subsystem during a span (e.g. LedgerConsensus)
+{service_name="xrpld"} |= "LedgerConsensus" |= "trace_id="
 
 # Logs from the last hour containing trace context
-{job="xrpld"} |= "trace_id=" | regexp `(?P<partition>\S+):(?P<sev>\S+)\s+trace_id=(?P<tid>[a-f0-9]+)`
+{service_name="xrpld"} |= "trace_id=" | regexp `(?P<partition>\S+):(?P<sev>\S+)\s+trace_id=(?P<tid>[a-f0-9]+)`
 
 # Count of traced vs untraced log lines
-count_over_time({job="xrpld"} |= "trace_id=" [5m])
+count_over_time({service_name="xrpld"} |= "trace_id=" [5m])
 ```
 
 ### Verifying Log Correlation
@@ -694,7 +783,7 @@ count_over_time({job="xrpld"} |= "trace_id=" [5m])
 1. Start the observability stack and xrpld with telemetry enabled.
 2. Send an RPC request: `curl http://localhost:5005 -d '{"method":"server_info"}'`
 3. Check the debug.log for `trace_id=` entries: `grep trace_id= /path/to/debug.log`
-4. Open Grafana at http://localhost:3000 -> Explore -> Loki and search for `{job="xrpld"} |= "trace_id="`.
+4. Open Grafana at http://localhost:3000 -> Explore -> Loki and search for `{service_name="xrpld"} |= "trace_id="`.
 5. Click the TraceID link to navigate to the corresponding trace in Tempo.
 
 ## Troubleshooting