Phase 8: Implement log-trace correlation and Loki log ingestion

Task 8.1: Inject trace_id/span_id into Logs::format() when an active
OTel span exists, guarded by #ifdef XRPL_ENABLE_TELEMETRY. Uses
thread-local GetSpan()/GetContext() with <10ns overhead per call.

Task 8.2: Add Grafana Loki service (grafana/loki:2.9.0) to the Docker
Compose stack with port 3100 exposed. Add Loki as a dependency for
otel-collector and grafana services.

Task 8.3: Add filelog receiver to OTel Collector config to tail rippled
debug.log files with regex_parser extracting timestamp, partition,
severity, trace_id, span_id, and message fields. Add loki exporter and
logs pipeline. Mount rippled log directory into collector container.

Task 8.4: Add tracesToLogs config in Tempo datasource provisioning
pointing to Loki with filterByTraceID enabled, enabling one-click
trace-to-log navigation in Grafana.

Task 8.5: Add check_log_correlation() function to integration-test.sh
that greps debug.log files for trace_id pattern and cross-checks the
trace_id exists in Jaeger.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Pratik Mankawde
2026-03-10 16:04:12 +00:00
parent 4df9692611
commit 4e24569892
5 changed files with 144 additions and 3 deletions

View File

@@ -2,13 +2,16 @@
#
# Provides services for local development:
# - otel-collector: receives OTLP traces from rippled, batches and
# forwards them to Jaeger and Tempo. Listens on ports 4317 (gRPC)
# and 4318 (HTTP).
# forwards them to Jaeger and Tempo. Also tails rippled log files
# via filelog receiver and exports to Loki. Listens on ports
# 4317 (gRPC), 4318 (HTTP), and 8125 (StatsD UDP).
# - jaeger: all-in-one tracing backend with UI on port 16686.
# - tempo: Grafana Tempo tracing backend, queryable via Grafana Explore
# on port 3000. Recommended for production (S3/GCS storage, TraceQL).
# - loki: Grafana Loki log aggregation backend for centralized log
# ingestion and log-trace correlation (Phase 8).
# - grafana: dashboards on port 3000, pre-configured with Jaeger, Tempo,
# and Prometheus datasources.
# Prometheus, and Loki datasources.
#
# Usage:
# docker compose -f docker/telemetry/docker-compose.yml up -d
@@ -34,9 +37,14 @@ services:
# - "8125:8125/udp"
volumes:
- ./otel-collector-config.yaml:/etc/otel-collector-config.yaml:ro
# Phase 8: Mount rippled log directory for filelog receiver.
# The integration test writes logs to /tmp/xrpld-integration/;
# mount it read-only so the collector can tail debug.log files.
- /tmp/xrpld-integration:/var/log/rippled:ro
depends_on:
- jaeger
- tempo
- loki
networks:
- rippled-telemetry
@@ -61,6 +69,17 @@ services:
networks:
- rippled-telemetry
# Phase 8: Grafana Loki for centralized log ingestion and log-trace
# correlation. Receives logs from OTel Collector's loki exporter.
# Query logs via Grafana Explore -> Loki at http://localhost:3000.
loki:
image: grafana/loki:2.9.0
ports:
- "3100:3100"
command: -config.file=/etc/loki/local-config.yaml
networks:
- rippled-telemetry
prometheus:
image: prom/prometheus:latest
ports:
@@ -86,6 +105,7 @@ services:
- jaeger
- tempo
- prometheus
- loki
networks:
- rippled-telemetry

View File

@@ -23,6 +23,14 @@ datasources:
enabled: true
serviceMap:
datasourceUid: prometheus
# Phase 8: Trace-to-log correlation — enables one-click navigation
# from a Tempo trace to the corresponding Loki log lines. Filters
# by trace_id so only logs from the same trace are shown.
tracesToLogs:
datasourceUid: loki
filterByTraceID: true
filterBySpanID: false
tags: ["partition", "severity"]
tracesToMetrics:
datasourceUid: prometheus
spanStartTimeShift: "-1h"

View File

@@ -62,6 +62,49 @@ check_span() {
fi
}
# Phase 8: Verify trace_id injection in rippled log output.
# Greps all node debug.log files for the "trace_id=<hex> span_id=<hex>"
# pattern that Logs::format() injects when an active OTel span exists.
# Also cross-checks that a trace_id found in logs matches a trace in Jaeger.
check_log_correlation() {
log "Checking log-trace correlation..."
local total_matches=0
local sample_trace_id=""
for i in $(seq 1 "$NUM_NODES"); do
local logfile="$WORKDIR/node$i/debug.log"
if [ ! -f "$logfile" ]; then
continue
fi
local matches
matches=$(grep -c 'trace_id=[a-f0-9]\{32\} span_id=[a-f0-9]\{16\}' "$logfile" 2>/dev/null || echo 0)
total_matches=$((total_matches + matches))
# Capture the first trace_id we find for cross-referencing with Jaeger
if [ -z "$sample_trace_id" ] && [ "$matches" -gt 0 ]; then
sample_trace_id=$(grep -o 'trace_id=[a-f0-9]\{32\}' "$logfile" | head -1 | cut -d= -f2)
fi
done
if [ "$total_matches" -gt 0 ]; then
ok "Log correlation: found $total_matches log lines with trace_id"
else
fail "Log correlation: no trace_id found in any node debug.log"
fi
# Cross-check: verify the sample trace_id exists in Jaeger
if [ -n "$sample_trace_id" ]; then
local trace_found
trace_found=$(curl -sf "$JAEGER/api/traces/$sample_trace_id" \
| jq '.data | length' 2>/dev/null || echo 0)
if [ "$trace_found" -gt 0 ]; then
ok "Log-Jaeger cross-check: trace_id=$sample_trace_id found in Jaeger"
else
fail "Log-Jaeger cross-check: trace_id=$sample_trace_id NOT found in Jaeger"
fi
fi
}
cleanup() {
log "Cleaning up..."
# Kill xrpld nodes
@@ -507,6 +550,13 @@ log "--- Phase 5: Peer Spans (trace_peer=1) ---"
check_span "peer.proposal.receive"
check_span "peer.validation.receive"
# ---------------------------------------------------------------------------
# Step 9b: Verify log-trace correlation (Phase 8)
# ---------------------------------------------------------------------------
log ""
log "--- Phase 8: Log-Trace Correlation ---"
check_log_correlation
# ---------------------------------------------------------------------------
# Step 10: Verify Prometheus spanmetrics
# ---------------------------------------------------------------------------
@@ -602,6 +652,7 @@ echo ""
echo " Jaeger UI: http://localhost:16686"
echo " Grafana: http://localhost:3000"
echo " Prometheus: http://localhost:9090"
echo " Loki: http://localhost:3100"
echo ""
echo " xrpld nodes (6) are running:"
for i in $(seq 1 "$NUM_NODES"); do

View File

@@ -3,6 +3,7 @@
# Pipelines:
# traces: OTLP receiver -> batch processor -> debug + Jaeger + Tempo + spanmetrics
# metrics: OTLP receiver + spanmetrics connector -> Prometheus exporter
# logs: filelog receiver -> batch processor -> Loki exporter (Phase 8)
#
# rippled sends traces via OTLP/HTTP to port 4318. The collector batches
# them, forwards to both Jaeger and Tempo, and derives RED metrics via the
@@ -17,6 +18,12 @@
# but commented out. If you need StatsD fallback (server=statsd in
# [insight]), uncomment the statsd receiver and add it to the metrics
# pipeline receivers list.
#
# Phase 8: The filelog receiver tails rippled's debug.log files under
# /var/log/rippled/ (mounted from the host). A regex_parser operator
# extracts timestamp, partition, severity, and optional trace_id/span_id
# fields injected by Logs::format(). Parsed logs are exported to Grafana
# Loki for log-trace correlation.
receivers:
otlp:
@@ -41,6 +48,18 @@ receivers:
# observer_type: "summary"
# summary:
# percentiles: [0, 50, 90, 95, 99, 100]
# Phase 8: Filelog receiver tails rippled debug.log files for log-trace
# correlation. Extracts structured fields (timestamp, partition, severity,
# trace_id, span_id, message) via regex. The trace_id and span_id are
# optional — only present when the log was emitted within an active span.
filelog:
include: [/var/log/rippled/*/debug.log]
operators:
- type: regex_parser
regex: '^(?P<timestamp>\S+)\s+(?P<partition>\S+):(?P<severity>\S+)\s+(?:trace_id=(?P<trace_id>[a-f0-9]+)\s+span_id=(?P<span_id>[a-f0-9]+)\s+)?(?P<message>.*)$'
timestamp:
parse_from: attributes.timestamp
layout: "%Y-%b-%d %H:%M:%S.%f"
processors:
batch:
@@ -75,6 +94,11 @@ exporters:
endpoint: tempo:4317
tls:
insecure: true
# Phase 8: Loki exporter sends parsed log entries to Grafana Loki.
# Uses the native loki exporter which translates OTel log records
# into Loki's push API format with label extraction.
loki:
endpoint: http://loki:3100/loki/api/v1/push
prometheus:
endpoint: 0.0.0.0:8889
@@ -88,3 +112,9 @@ service:
receivers: [otlp, spanmetrics]
processors: [batch]
exporters: [prometheus]
# Phase 8: Log pipeline ingests rippled debug.log via filelog receiver,
# batches entries, and exports to Loki for log-trace correlation.
logs:
receivers: [filelog]
processors: [batch]
exporters: [loki]

View File

@@ -6,6 +6,14 @@
#include <boost/algorithm/string/predicate.hpp>
#include <boost/filesystem/path.hpp>
// Phase 8: OTel trace context headers for log-trace correlation.
// GetSpan() and RuntimeContext::GetCurrent() are thread-local reads
// with no locking — measured at <10ns per call.
#ifdef XRPL_ENABLE_TELEMETRY
#include <opentelemetry/context/runtime_context.h>
#include <opentelemetry/trace/provider.h>
#endif // XRPL_ENABLE_TELEMETRY
#include <chrono>
#include <cstring>
#include <fstream>
@@ -345,6 +353,30 @@ Logs::format(
break;
}
// Phase 8: Inject OTel trace context (trace_id, span_id) into log lines
// for log-trace correlation. Only appended when an active span exists.
// GetSpan() reads thread-local storage — no locks, <10ns overhead.
#ifdef XRPL_ENABLE_TELEMETRY
{
auto span =
opentelemetry::trace::GetSpan(opentelemetry::context::RuntimeContext::GetCurrent());
auto ctx = span->GetContext();
if (ctx.IsValid())
{
// Append trace context as structured key=value fields that the
// OTel Collector filelog receiver regex_parser can extract.
char traceId[33], spanId[17];
ctx.trace_id().ToLowerBase16(traceId);
ctx.span_id().ToLowerBase16(spanId);
output += "trace_id=";
output.append(traceId, 32);
output += " span_id=";
output.append(spanId, 16);
output += ' ';
}
}
#endif // XRPL_ENABLE_TELEMETRY
output += message;
// Limit the maximum length of the output