From 283218896bd56a031a62d53605d20c07916c82c6 Mon Sep 17 00:00:00 2001 From: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> Date: Fri, 5 Jun 2026 18:48:31 +0100 Subject: [PATCH] fix(telemetry): use avg not quantile for close-time value panels The Raw Proposals and Effective/Quantized panels showed wrong values (e.g. 759M, 852M, even 0) against a true value of ~834M. Cause: quantile_over_time bucketizes into an exponential histogram tuned for duration distributions, so it cannot represent large absolute integers (Ripple-epoch seconds) accurately. Switch both panels to avg_over_time, which returns the correct value (verified ~833,996,7xx matching the raw span attribute). Average is also the semantically right aggregation here: close time is a single agreed value per consensus round, not a latency distribution, so a median was never meaningful. Set the unit to none rather than seconds: the value is Ripple-epoch seconds (Unix = value + 946684800) and TraceQL metrics cannot do the offset arithmetic in-query, so a duration unit would misrender it. Clarify in the description that the absolute level tracks wall-clock and the useful signal is per-node spread / raw-vs-effective gap. Co-Authored-By: Claude Opus 4.8 --- .../grafana/dashboards/consensus-health.json | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/docker/telemetry/grafana/dashboards/consensus-health.json b/docker/telemetry/grafana/dashboards/consensus-health.json index 239c7aed9f..04c4addce7 100644 --- a/docker/telemetry/grafana/dashboards/consensus-health.json +++ b/docker/telemetry/grafana/dashboards/consensus-health.json @@ -392,7 +392,7 @@ }, { "title": "Close Time: Raw Proposals (Per Node)", - "description": "Each node's raw proposed close time (close_time_self) \u2014 the unrounded wall clock value at the moment the node closed its ledger. Compare across nodes to see clock drift. 
Values are Ripple-epoch seconds (since 2000-01-01).", + "description": "Each node's raw proposed close time (close_time_self) \u2014 the unrounded wall clock value at the moment the node closed its ledger. Compare across nodes to see clock drift. Value is Ripple-epoch seconds (Unix = value + 946684800); it advances with wall-clock time, so the absolute number is large and barely changes in relative terms \u2014 watch per-node spread and raw-vs-effective gap, not the absolute level.", "type": "timeseries", "gridPos": { "h": 8, @@ -402,7 +402,7 @@ }, "fieldConfig": { "defaults": { - "unit": "s", + "unit": "none", "custom": { "drawStyle": "points", "pointSize": 6, @@ -429,7 +429,7 @@ "uid": "tempo" }, "queryType": "traceql", - "query": "{name=\"consensus.accept.apply\" && resource.service.instance.id=~\"$node\"} | quantile_over_time(span.close_time_self, .5) by (resource.service.instance.id)", + "query": "{name=\"consensus.accept.apply\" && resource.service.instance.id=~\"$node\"} | avg_over_time(span.close_time_self) by (resource.service.instance.id)", "legendFormat": "{{service.instance.id}}", "refId": "A" } @@ -437,7 +437,7 @@ }, { "title": "Close Time: Effective / Quantized", - "description": "The consensus-agreed close time after rounding to the current resolution bin (close_time). This is the value written to the ledger header. All nodes in agreement produce the same value. Values are Ripple-epoch seconds (since 2000-01-01).", + "description": "The consensus-agreed close time after rounding to the current resolution bin (close_time). This is the value written to the ledger header. All nodes in agreement produce the same value. 
Value is Ripple-epoch seconds (Unix = value + 946684800); it advances with wall-clock time, so the absolute number is large and barely changes in relative terms \u2014 watch per-node spread and raw-vs-effective gap, not the absolute level.", "type": "timeseries", "gridPos": { "h": 8, @@ -447,7 +447,7 @@ }, "fieldConfig": { "defaults": { - "unit": "s", + "unit": "none", "custom": { "drawStyle": "points", "pointSize": 6, @@ -474,7 +474,7 @@ "uid": "tempo" }, "queryType": "traceql", - "query": "{name=\"consensus.accept.apply\" && resource.service.instance.id=~\"$node\"} | quantile_over_time(span.close_time, .5) by (resource.service.instance.id)", + "query": "{name=\"consensus.accept.apply\" && resource.service.instance.id=~\"$node\"} | avg_over_time(span.close_time) by (resource.service.instance.id)", "legendFormat": "{{service.instance.id}}", "refId": "A" }