From 0d1d1aa0e1719db39a04a23f226100f038bb5204 Mon Sep 17 00:00:00 2001 From: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> Date: Fri, 5 Jun 2026 16:58:29 +0100 Subject: [PATCH] fix(telemetry): wire consensus close-time panels via TraceQL metrics MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The five Close Time panels (Raw Proposals, Effective/Quantized, Vote Bins & Resolution, Resolution Direction, Bin Distribution) rendered empty: they used TraceQL `| select(attr)`, which returns a trace list that a timeseries/barchart panel cannot plot. Enable TraceQL metrics in Tempo and rewrite the panels to use it: - tempo.yaml: add the local-blocks processor to the metrics generator so recent blocks are queryable via /api/metrics/query_range. Set filter_server_spans=false because the consensus spans are SPAN_KIND_INTERNAL (the default keeps only server spans, so attribute aggregations over internal spans returned nothing), and flush_to_storage=true with a traces_storage path so query_range can read the flushed blocks. - consensus-health.json: replace each panel's select() with a metrics query — quantile_over_time on the integer close-time attributes, avg_over_time for vote bins / resolution, and count_over_time by the resolution_direction and vote-bin dimensions. Set the raw/effective panels' unit to seconds (the values are Ripple-epoch seconds, which dateTimeFromNow rendered with the wrong epoch). Verified the query forms compile and return series against live internal spans; the close-time series populate once the node reaches full sync. 
Co-Authored-By: Claude Opus 4.8 --- .../grafana/dashboards/consensus-health.json | 44 ++++++++++++------- docker/telemetry/tempo.yaml | 21 +++++++-- 2 files changed, 46 insertions(+), 19 deletions(-) diff --git a/docker/telemetry/grafana/dashboards/consensus-health.json b/docker/telemetry/grafana/dashboards/consensus-health.json index 465b83be0b..c5baf674ca 100644 --- a/docker/telemetry/grafana/dashboards/consensus-health.json +++ b/docker/telemetry/grafana/dashboards/consensus-health.json @@ -392,7 +392,7 @@ }, { "title": "Close Time: Raw Proposals (Per Node)", - "description": "Each node's raw proposed close time (close_time_self) \u2014 the unrounded wall clock value at the moment the node closed its ledger. Compare across nodes to see clock drift.", + "description": "Each node's raw proposed close time (close_time_self) \u2014 the unrounded wall clock value at the moment the node closed its ledger. Compare across nodes to see clock drift. Values are Ripple-epoch seconds (since 2000-01-01).", "type": "timeseries", "gridPos": { "h": 8, @@ -402,7 +402,7 @@ }, "fieldConfig": { "defaults": { - "unit": "dateTimeFromNow", + "unit": "s", "custom": { "drawStyle": "points", "pointSize": 6, @@ -425,17 +425,19 @@ "targets": [ { "datasource": { - "type": "tempo" + "type": "tempo", + "uid": "tempo" }, "queryType": "traceql", - "query": "{name=\"consensus.accept.apply\" && resource.service.instance.id=~\"$node\" && span.close_time_correct=~\"$close_time_correct\"} | select(span.close_time_self)", + "query": "{name=\"consensus.accept.apply\" && resource.service.instance.id=~\"$node\" && span.close_time_correct=~\"$close_time_correct\"} | quantile_over_time(span.close_time_self, .5) by (resource.service.instance.id)", + "legendFormat": "{{service.instance.id}}", "refId": "A" } ] }, { "title": "Close Time: Effective / Quantized", - "description": "The consensus-agreed close time after rounding to the current resolution bin (close_time). 
This is the value written to the ledger header. All nodes in agreement produce the same value.", + "description": "The consensus-agreed close time after rounding to the current resolution bin (close_time). This is the value written to the ledger header. All nodes in agreement produce the same value. Values are Ripple-epoch seconds (since 2000-01-01).", "type": "timeseries", "gridPos": { "h": 8, @@ -445,7 +447,7 @@ }, "fieldConfig": { "defaults": { - "unit": "dateTimeFromNow", + "unit": "s", "custom": { "drawStyle": "points", "pointSize": 6, @@ -468,10 +470,12 @@ "targets": [ { "datasource": { - "type": "tempo" + "type": "tempo", + "uid": "tempo" }, "queryType": "traceql", - "query": "{name=\"consensus.accept.apply\" && resource.service.instance.id=~\"$node\" && span.close_time_correct=~\"$close_time_correct\"} | select(span.close_time)", + "query": "{name=\"consensus.accept.apply\" && resource.service.instance.id=~\"$node\" && span.close_time_correct=~\"$close_time_correct\"} | quantile_over_time(span.close_time, .5) by (resource.service.instance.id)", + "legendFormat": "{{service.instance.id}}", "refId": "A" } ] @@ -544,18 +548,22 @@ "targets": [ { "datasource": { - "type": "tempo" + "type": "tempo", + "uid": "tempo" }, "queryType": "traceql", - "query": "{name=\"consensus.accept.apply\" && resource.service.instance.id=~\"$node\" && span.close_time_correct=~\"$close_time_correct\"} | select(span.close_time_vote_bins)", + "query": "{name=\"consensus.accept.apply\" && resource.service.instance.id=~\"$node\" && span.close_time_correct=~\"$close_time_correct\"} | avg_over_time(span.close_time_vote_bins)", + "legendFormat": "Avg Vote Bins", "refId": "A" }, { "datasource": { - "type": "tempo" + "type": "tempo", + "uid": "tempo" }, "queryType": "traceql", - "query": "{name=\"consensus.accept.apply\" && resource.service.instance.id=~\"$node\" && span.close_time_correct=~\"$close_time_correct\"} | select(span.close_resolution_ms)", + "query": 
"{name=\"consensus.accept.apply\" && resource.service.instance.id=~\"$node\" && span.close_time_correct=~\"$close_time_correct\"} | avg_over_time(span.close_resolution_ms)", + "legendFormat": "Avg Resolution (ms)", "refId": "B" } ] @@ -595,10 +603,12 @@ "targets": [ { "datasource": { - "type": "tempo" + "type": "tempo", + "uid": "tempo" }, "queryType": "traceql", - "query": "{name=\"consensus.accept.apply\" && resource.service.instance.id=~\"$node\" && span.close_time_correct=~\"$close_time_correct\" && span.resolution_direction=~\"$resolution_direction\"} | select(span.resolution_direction)", + "query": "{name=\"consensus.accept.apply\" && resource.service.instance.id=~\"$node\" && span.close_time_correct=~\"$close_time_correct\" && span.resolution_direction=~\"$resolution_direction\"} | count_over_time() by (span.resolution_direction)", + "legendFormat": "{{span.resolution_direction}}", "refId": "A" } ] @@ -639,10 +649,12 @@ "targets": [ { "datasource": { - "type": "tempo" + "type": "tempo", + "uid": "tempo" }, "queryType": "traceql", - "query": "{name=\"consensus.accept.apply\" && resource.service.instance.id=~\"$node\" && span.close_time_correct=~\"$close_time_correct\"} | select(span.close_time, span.close_time_vote_bins)", + "query": "{name=\"consensus.accept.apply\" && resource.service.instance.id=~\"$node\" && span.close_time_correct=~\"$close_time_correct\"} | count_over_time() by (span.close_time_vote_bins)", + "legendFormat": "{{span.close_time_vote_bins}} bins", "refId": "A" } ] diff --git a/docker/telemetry/tempo.yaml b/docker/telemetry/tempo.yaml index 7e56f60c6d..b2997bda53 100644 --- a/docker/telemetry/tempo.yaml +++ b/docker/telemetry/tempo.yaml @@ -31,9 +31,11 @@ compactor: compaction: block_retention: 1h -# Enable metrics generator for service graph and span metrics. -# Produces RED metrics (rate, errors, duration) per service/span, -# feeding Grafana's service map visualization. 
+# Enable metrics generator for service graph, span metrics, and the +# local-blocks processor. Produces RED metrics (rate, errors, duration) per +# service/span for the service map, and keeps recent trace blocks queryable so +# TraceQL metrics queries (quantile_over_time, count_over_time, etc. via +# /api/metrics/query_range) work. metrics_generator: registry: external_labels: @@ -44,6 +46,18 @@ metrics_generator: # to enable remote_write for service graph metrics: # remote_write: # - url: http://prometheus:9090/api/v1/write + # Separate WAL that the local-blocks processor flushes traces to, for use by + # TraceQL metrics queries. Required when flush_to_storage is true. + traces_storage: + path: /var/tempo/generator/traces + processor: + local_blocks: + # xrpld consensus/transaction spans are SPAN_KIND_INTERNAL. By default + # local-blocks keeps only server spans for TraceQL metrics, so attribute + # aggregations over internal spans return nothing. Keep all spans. + filter_server_spans: false + # Flush recent blocks to traces_storage so query_range can read them. + flush_to_storage: true overrides: defaults: @@ -51,6 +65,7 @@ overrides: processors: - service-graphs - span-metrics + - local-blocks storage: trace: