Merge branch 'pratik/otel-phase6-statsd' into pratik/otel-phase7-native-metrics

# Conflicts:
#	OpenTelemetryPlan/09-data-collection-reference.md
This commit is contained in:
Pratik Mankawde
2026-06-05 12:48:31 +01:00
12 changed files with 574 additions and 76 deletions

View File

@@ -669,6 +669,138 @@
},
"overrides": []
}
},
{
"title": "Tx Apply Pipeline Rate by Stage",
"description": "Span rate for each apply-pipeline stage (preflight, preclaim, apply). A drop between stages shows where transactions are filtered out. Requires the stage dimension in spanmetrics.",
"type": "timeseries",
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 64
},
"options": {
"tooltip": {
"mode": "multi",
"sort": "desc"
},
"legend": {
"displayMode": "table",
"placement": "right",
"calcs": ["mean", "max"]
}
},
"targets": [
{
"datasource": {
"type": "prometheus"
},
"expr": "sum by (stage, exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=~\"tx.preflight|tx.preclaim|tx.transactor\", stage=~\"$stage\"}[5m]))",
"legendFormat": "{{stage}} [{{exported_instance}}]"
}
],
"fieldConfig": {
"defaults": {
"unit": "ops",
"custom": {
"axisLabel": "Spans / Sec",
"spanNulls": true,
"insertNulls": false,
"showPoints": "auto",
"pointSize": 3
}
},
"overrides": []
}
},
{
"title": "Tx Apply Pipeline Latency by Stage (p95)",
"description": "95th-percentile duration of each apply-pipeline stage. Isolates which stage (preflight, preclaim, apply) dominates transaction processing time.",
"type": "timeseries",
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 64
},
"options": {
"tooltip": {
"mode": "multi",
"sort": "desc"
},
"legend": {
"displayMode": "table",
"placement": "right",
"calcs": ["mean", "max"]
}
},
"targets": [
{
"datasource": {
"type": "prometheus"
},
"expr": "histogram_quantile(0.95, sum by (le, stage, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=~\"tx.preflight|tx.preclaim|tx.transactor\", stage=~\"$stage\"}[5m])))",
"legendFormat": "P95 {{stage}} [{{exported_instance}}]"
}
],
"fieldConfig": {
"defaults": {
"unit": "ms",
"custom": {
"axisLabel": "Duration (ms)",
"spanNulls": true,
"insertNulls": false,
"showPoints": "auto",
"pointSize": 3
}
},
"overrides": []
}
},
{
"title": "Tx Apply Pipeline Failure Rate by Stage",
"description": "Rate of apply-pipeline spans whose ter_result is not tesSUCCESS, split by stage. Shows whether failures concentrate in preflight, preclaim, or apply. Filters on ter_result rather than span status because a failing ter code completes the span normally; only thrown exceptions set an error status.",
"type": "timeseries",
"gridPos": {
"h": 8,
"w": 24,
"x": 0,
"y": 72
},
"options": {
"tooltip": {
"mode": "multi",
"sort": "desc"
},
"legend": {
"displayMode": "table",
"placement": "right",
"calcs": ["mean", "max"]
}
},
"targets": [
{
"datasource": {
"type": "prometheus"
},
"expr": "sum by (stage, exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=~\"tx.preflight|tx.preclaim|tx.transactor\", stage=~\"$stage\", ter_result!~\"tesSUCCESS|\"}[5m]))",
"legendFormat": "{{stage}} [{{exported_instance}}]"
}
],
"fieldConfig": {
"defaults": {
"unit": "ops",
"custom": {
"axisLabel": "Failed Spans / Sec",
"spanNulls": true,
"insertNulls": false,
"showPoints": "auto",
"pointSize": 3
}
},
"overrides": []
}
}
],
"schemaVersion": 39,
@@ -768,6 +900,24 @@
},
"sort": 1,
"label": "Queue Status"
},
{
"name": "stage",
"type": "query",
"datasource": {
"type": "prometheus"
},
"query": "label_values(traces_span_metrics_calls_total{span_name=~\"tx.preflight|tx.preclaim|tx.transactor\", stage!=\"\"}, stage)",
"refresh": 2,
"includeAll": true,
"multi": true,
"allValue": ".*",
"current": {
"text": "All",
"value": "$__all"
},
"sort": 1,
"label": "Apply Stage"
}
]
},

View File

@@ -41,6 +41,9 @@ connectors:
- name: validation_trusted
- name: tx_type
- name: ter_result
# Apply-pipeline stage (preflight|preclaim|apply) — splits the
# tx.preflight/tx.preclaim/tx.transactor span RED metrics per stage.
- name: stage
- name: txq_status
- name: consensus_state
- name: load_type