From 0e25103fdb86ec88417d5b0c8e1646eb5916fd5c Mon Sep 17 00:00:00 2001 From: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> Date: Thu, 14 May 2026 17:29:49 +0100 Subject: [PATCH] fix(telemetry): make Loki ingestion and filelog parsing work end-to-end MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three interrelated fixes in otel-collector-config.yaml; without them the Phase 8 log-trace correlation pipeline is silently broken. 1. `resource/logs` processor now upserts `job: xrpld` alongside `service.name: xrpld`. Loki 3.x OTLP ingestion renames `service.name` to the label `service_name`, so the runbook / integration-test queries (`{job="xrpld"} |= "trace_id="`) returned empty. Upserting the `job` resource attribute at the collector lets the canonical Loki label flow through unchanged. 2. `filelog` regex wraps the `partition:` capture in an optional non-capturing group. `Logs::format()` omits the `partition:` prefix when partition is empty (common for framework-level log lines); the old regex required it and silently dropped those records. 3. Timestamp parser now matches the real log format. `Logs::format()` writes microsecond-precision timestamps like `2026-Apr-15 10:30:45.123456 UTC`. The layout was `%Y-%b-%d %H:%M:%S` — missing fractional seconds and timezone — which failed strptime and dropped timestamps. New layout is `%Y-%b-%d %H:%M:%S.%f` with `location: UTC`. Also adds a block comment documenting the real log format so the next person to touch this doesn't re-introduce the same gaps.
---
 docker/telemetry/otel-collector-config.yaml | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/docker/telemetry/otel-collector-config.yaml b/docker/telemetry/otel-collector-config.yaml
index 45c40c6b55..f7d5f6d69f 100644
--- a/docker/telemetry/otel-collector-config.yaml
+++ b/docker/telemetry/otel-collector-config.yaml
@@ -38,11 +38,17 @@ receivers:
   filelog:
     include: [/var/log/rippled/*/debug.log]
     operators:
+      # Log format emitted by Logs::format() is:
+      #   YYYY-Mmm-DD HH:MM:SS.ffffff UTC <partition>:<severity> [trace_id=... span_id=...] <message>
+      # The `partition:` prefix is omitted when partition is empty, so the
+      # capture group is non-capturing optional. Fractional seconds up to 6
+      # digits are parsed via the `%f` strptime directive.
       - type: regex_parser
-        regex: '^(?P<timestamp>\S+\s+\S+)\s+\S+\s+(?P<partition>\S+):(?P<severity>\S+)\s+(?:trace_id=(?P<trace_id>[a-f0-9]+)\s+span_id=(?P<span_id>[a-f0-9]+)\s+)?(?P<message>.*)$'
+        regex: '^(?P<timestamp>\S+\s+\S+)\s+\S+\s+(?:(?P<partition>\S+):)?(?P<severity>\S+)\s+(?:trace_id=(?P<trace_id>[a-f0-9]+)\s+span_id=(?P<span_id>[a-f0-9]+)\s+)?(?P<message>.*)$'
         timestamp:
           parse_from: attributes.timestamp
-          layout: "%Y-%b-%d %H:%M:%S"
+          layout: "%Y-%b-%d %H:%M:%S.%f"
+          location: UTC
 
 processors:
   batch:
@@ -53,6 +59,15 @@ processors:
       - key: service.name
         value: xrpld
         action: upsert
+      # Loki 3.x OTLP ingestion converts `service.name` to the label
+      # `service_name`. The runbook and integration-test queries use the
+      # canonical Loki label `job` so operators can paste `{job="xrpld"}`
+      # without guessing the otel-to-loki naming convention. Upsert the
+      # `job` resource attribute here so it round-trips through OTLP
+      # into Loki as the `job` label.
+      - key: job
+        value: xrpld
+        action: upsert
 
 connectors:
   spanmetrics: