mirror of
https://github.com/XRPLF/rippled.git
synced 2026-06-05 17:56:49 +00:00
fix(telemetry): wire consensus close-time panels via TraceQL metrics
The five Close Time panels (Raw Proposals, Effective/Quantized, Vote Bins & Resolution, Resolution Direction, Bin Distribution) rendered empty: they used TraceQL `| select(attr)`, which returns a trace list that a timeseries/barchart panel cannot plot.

Enable TraceQL metrics in Tempo and rewrite the panels to use it:

- tempo.yaml: add the local-blocks processor to the metrics generator so recent blocks are queryable via /api/metrics/query_range. Set filter_server_spans=false because the consensus spans are SPAN_KIND_INTERNAL (the default keeps only server spans, so attribute aggregations over internal spans returned nothing), and flush_to_storage=true with a traces_storage path so query_range can read the flushed blocks.
- consensus-health.json: replace each panel's select() with a metrics query — quantile_over_time on the integer close-time attributes, avg_over_time for vote bins / resolution, and count_over_time by the resolution_direction and vote-bin dimensions. Set the raw/effective panels' unit to seconds (the values are Ripple-epoch seconds, which dateTimeFromNow rendered with the wrong epoch).

Verified the query forms compile and return series against live internal spans; the close-time series populate once the node reaches full sync.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -392,7 +392,7 @@
|
||||
},
|
||||
{
|
||||
"title": "Close Time: Raw Proposals (Per Node)",
|
||||
"description": "Each node's raw proposed close time (close_time_self) \u2014 the unrounded wall clock value at the moment the node closed its ledger. Compare across nodes to see clock drift.",
|
||||
"description": "Each node's raw proposed close time (close_time_self) \u2014 the unrounded wall clock value at the moment the node closed its ledger. Compare across nodes to see clock drift. Values are Ripple-epoch seconds (since 2000-01-01).",
|
||||
"type": "timeseries",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
@@ -402,7 +402,7 @@
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "dateTimeFromNow",
|
||||
"unit": "s",
|
||||
"custom": {
|
||||
"drawStyle": "points",
|
||||
"pointSize": 6,
|
||||
@@ -425,17 +425,19 @@
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "tempo"
|
||||
"type": "tempo",
|
||||
"uid": "tempo"
|
||||
},
|
||||
"queryType": "traceql",
|
||||
"query": "{name=\"consensus.accept.apply\" && resource.service.instance.id=~\"$node\" && span.close_time_correct=~\"$close_time_correct\"} | select(span.close_time_self)",
|
||||
"query": "{name=\"consensus.accept.apply\" && resource.service.instance.id=~\"$node\" && span.close_time_correct=~\"$close_time_correct\"} | quantile_over_time(span.close_time_self, .5) by (resource.service.instance.id)",
|
||||
"legendFormat": "{{service.instance.id}}",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"title": "Close Time: Effective / Quantized",
|
||||
"description": "The consensus-agreed close time after rounding to the current resolution bin (close_time). This is the value written to the ledger header. All nodes in agreement produce the same value.",
|
||||
"description": "The consensus-agreed close time after rounding to the current resolution bin (close_time). This is the value written to the ledger header. All nodes in agreement produce the same value. Values are Ripple-epoch seconds (since 2000-01-01).",
|
||||
"type": "timeseries",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
@@ -445,7 +447,7 @@
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "dateTimeFromNow",
|
||||
"unit": "s",
|
||||
"custom": {
|
||||
"drawStyle": "points",
|
||||
"pointSize": 6,
|
||||
@@ -468,10 +470,12 @@
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "tempo"
|
||||
"type": "tempo",
|
||||
"uid": "tempo"
|
||||
},
|
||||
"queryType": "traceql",
|
||||
"query": "{name=\"consensus.accept.apply\" && resource.service.instance.id=~\"$node\" && span.close_time_correct=~\"$close_time_correct\"} | select(span.close_time)",
|
||||
"query": "{name=\"consensus.accept.apply\" && resource.service.instance.id=~\"$node\" && span.close_time_correct=~\"$close_time_correct\"} | quantile_over_time(span.close_time, .5) by (resource.service.instance.id)",
|
||||
"legendFormat": "{{service.instance.id}}",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
@@ -544,18 +548,22 @@
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "tempo"
|
||||
"type": "tempo",
|
||||
"uid": "tempo"
|
||||
},
|
||||
"queryType": "traceql",
|
||||
"query": "{name=\"consensus.accept.apply\" && resource.service.instance.id=~\"$node\" && span.close_time_correct=~\"$close_time_correct\"} | select(span.close_time_vote_bins)",
|
||||
"query": "{name=\"consensus.accept.apply\" && resource.service.instance.id=~\"$node\" && span.close_time_correct=~\"$close_time_correct\"} | avg_over_time(span.close_time_vote_bins)",
|
||||
"legendFormat": "Avg Vote Bins",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "tempo"
|
||||
"type": "tempo",
|
||||
"uid": "tempo"
|
||||
},
|
||||
"queryType": "traceql",
|
||||
"query": "{name=\"consensus.accept.apply\" && resource.service.instance.id=~\"$node\" && span.close_time_correct=~\"$close_time_correct\"} | select(span.close_resolution_ms)",
|
||||
"query": "{name=\"consensus.accept.apply\" && resource.service.instance.id=~\"$node\" && span.close_time_correct=~\"$close_time_correct\"} | avg_over_time(span.close_resolution_ms)",
|
||||
"legendFormat": "Avg Resolution (ms)",
|
||||
"refId": "B"
|
||||
}
|
||||
]
|
||||
@@ -595,10 +603,12 @@
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "tempo"
|
||||
"type": "tempo",
|
||||
"uid": "tempo"
|
||||
},
|
||||
"queryType": "traceql",
|
||||
"query": "{name=\"consensus.accept.apply\" && resource.service.instance.id=~\"$node\" && span.close_time_correct=~\"$close_time_correct\" && span.resolution_direction=~\"$resolution_direction\"} | select(span.resolution_direction)",
|
||||
"query": "{name=\"consensus.accept.apply\" && resource.service.instance.id=~\"$node\" && span.close_time_correct=~\"$close_time_correct\" && span.resolution_direction=~\"$resolution_direction\"} | count_over_time() by (span.resolution_direction)",
|
||||
"legendFormat": "{{span.resolution_direction}}",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
@@ -639,10 +649,12 @@
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "tempo"
|
||||
"type": "tempo",
|
||||
"uid": "tempo"
|
||||
},
|
||||
"queryType": "traceql",
|
||||
"query": "{name=\"consensus.accept.apply\" && resource.service.instance.id=~\"$node\" && span.close_time_correct=~\"$close_time_correct\"} | select(span.close_time, span.close_time_vote_bins)",
|
||||
"query": "{name=\"consensus.accept.apply\" && resource.service.instance.id=~\"$node\" && span.close_time_correct=~\"$close_time_correct\"} | count_over_time() by (span.close_time_vote_bins)",
|
||||
"legendFormat": "{{span.close_time_vote_bins}} bins",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
|
||||
@@ -31,9 +31,11 @@ compactor:
|
||||
compaction:
|
||||
block_retention: 1h
|
||||
|
||||
# Enable metrics generator for service graph and span metrics.
|
||||
# Produces RED metrics (rate, errors, duration) per service/span,
|
||||
# feeding Grafana's service map visualization.
|
||||
# Enable metrics generator for service graph, span metrics, and the
|
||||
# local-blocks processor. Produces RED metrics (rate, errors, duration) per
|
||||
# service/span for the service map, and keeps recent trace blocks queryable so
|
||||
# TraceQL metrics queries (quantile_over_time, count_over_time, etc. via
|
||||
# /api/metrics/query_range) work.
|
||||
metrics_generator:
|
||||
registry:
|
||||
external_labels:
|
||||
@@ -44,6 +46,18 @@ metrics_generator:
|
||||
# to enable remote_write for service graph metrics:
|
||||
# remote_write:
|
||||
# - url: http://prometheus:9090/api/v1/write
|
||||
# Separate WAL to which the local-blocks processor flushes traces for metrics
|
||||
# queries. Required when flush_to_storage is true.
|
||||
traces_storage:
|
||||
path: /var/tempo/generator/traces
|
||||
processor:
|
||||
local_blocks:
|
||||
# xrpld consensus/transaction spans are SPAN_KIND_INTERNAL. By default
|
||||
# local-blocks keeps only server spans for TraceQL metrics, so attribute
|
||||
# aggregations over internal spans return nothing. Keep all spans.
|
||||
filter_server_spans: false
|
||||
# Flush recent blocks to traces_storage so query_range can read them.
|
||||
flush_to_storage: true
|
||||
|
||||
overrides:
|
||||
defaults:
|
||||
@@ -51,6 +65,7 @@ overrides:
|
||||
processors:
|
||||
- service-graphs
|
||||
- span-metrics
|
||||
- local-blocks
|
||||
|
||||
storage:
|
||||
trace:
|
||||
|
||||
Reference in New Issue
Block a user