diff --git a/docker/telemetry/docker-compose.yml b/docker/telemetry/docker-compose.yml index be40c11773..ab83fb42a2 100644 --- a/docker/telemetry/docker-compose.yml +++ b/docker/telemetry/docker-compose.yml @@ -2,13 +2,16 @@ # # Provides services for local development: # - otel-collector: receives OTLP traces from rippled, batches and -# forwards them to Jaeger and Tempo. Listens on ports 4317 (gRPC) -# and 4318 (HTTP). +# forwards them to Jaeger and Tempo. Also tails rippled log files +# via filelog receiver and exports to Loki. Listens on ports +# 4317 (gRPC), 4318 (HTTP), and 8125 (StatsD UDP). # - jaeger: all-in-one tracing backend with UI on port 16686. # - tempo: Grafana Tempo tracing backend, queryable via Grafana Explore # on port 3000. Recommended for production (S3/GCS storage, TraceQL). +# - loki: Grafana Loki log aggregation backend for centralized log +# ingestion and log-trace correlation (Phase 8). # - grafana: dashboards on port 3000, pre-configured with Jaeger, Tempo, -# and Prometheus datasources. +# Prometheus, and Loki datasources. # # Usage: # docker compose -f docker/telemetry/docker-compose.yml up -d @@ -34,9 +37,14 @@ services: # - "8125:8125/udp" volumes: - ./otel-collector-config.yaml:/etc/otel-collector-config.yaml:ro + # Phase 8: Mount rippled log directory for filelog receiver. + # The integration test writes logs to /tmp/xrpld-integration/; + # mount it read-only so the collector can tail debug.log files. + - /tmp/xrpld-integration:/var/log/rippled:ro depends_on: - jaeger - tempo + - loki networks: - rippled-telemetry @@ -61,6 +69,17 @@ services: networks: - rippled-telemetry + # Phase 8: Grafana Loki for centralized log ingestion and log-trace + # correlation. Receives logs from OTel Collector's loki exporter. + # Query logs via Grafana Explore -> Loki at http://localhost:3000. + loki: + image: grafana/loki:2.9.0 + ports: + - "3100:3100" + command: -config.file=/etc/loki/local-config.yaml + networks: + - rippled-telemetry + prometheus: image: prom/prometheus:latest ports: @@ -86,6 +105,7 @@ services: - jaeger - tempo - prometheus + - loki networks: - rippled-telemetry diff --git a/docker/telemetry/grafana/provisioning/datasources/tempo.yaml b/docker/telemetry/grafana/provisioning/datasources/tempo.yaml index 838bfe0919..bcdae53278 100644 --- a/docker/telemetry/grafana/provisioning/datasources/tempo.yaml +++ b/docker/telemetry/grafana/provisioning/datasources/tempo.yaml @@ -23,6 +23,14 @@ datasources: enabled: true serviceMap: datasourceUid: prometheus + # Phase 8: Trace-to-log correlation — enables one-click navigation + # from a Tempo trace to the corresponding Loki log lines. Filters + # by trace_id so only logs from the same trace are shown. + tracesToLogs: + datasourceUid: loki + filterByTraceID: true + filterBySpanID: false + tags: ["partition", "severity"] tracesToMetrics: datasourceUid: prometheus spanStartTimeShift: "-1h" diff --git a/docker/telemetry/integration-test.sh b/docker/telemetry/integration-test.sh index 07b30b9a73..5e6696a33c 100755 --- a/docker/telemetry/integration-test.sh +++ b/docker/telemetry/integration-test.sh @@ -62,6 +62,49 @@ check_span() { fi } +# Phase 8: Verify trace_id injection in rippled log output. +# Greps all node debug.log files for the "trace_id= span_id=" +# pattern that Logs::format() injects when an active OTel span exists. +# Also cross-checks that a trace_id found in logs matches a trace in Jaeger. +check_log_correlation() { + log "Checking log-trace correlation..." + + local total_matches=0 + local sample_trace_id="" + + for i in $(seq 1 "$NUM_NODES"); do + local logfile="$WORKDIR/node$i/debug.log" + if [ ! -f "$logfile" ]; then + continue + fi + local matches + matches=$(grep -c 'trace_id=[a-f0-9]\{32\} span_id=[a-f0-9]\{16\}' "$logfile" 2>/dev/null || echo 0) + total_matches=$((total_matches + matches)) + # Capture the first trace_id we find for cross-referencing with Jaeger + if [ -z "$sample_trace_id" ] && [ "$matches" -gt 0 ]; then + sample_trace_id=$(grep -o 'trace_id=[a-f0-9]\{32\}' "$logfile" | head -1 | cut -d= -f2) + fi + done + + if [ "$total_matches" -gt 0 ]; then + ok "Log correlation: found $total_matches log lines with trace_id" + else + fail "Log correlation: no trace_id found in any node debug.log" + fi + + # Cross-check: verify the sample trace_id exists in Jaeger + if [ -n "$sample_trace_id" ]; then + local trace_found + trace_found=$(curl -sf "$JAEGER/api/traces/$sample_trace_id" \ + | jq '.data | length' 2>/dev/null || echo 0) + if [ "$trace_found" -gt 0 ]; then + ok "Log-Jaeger cross-check: trace_id=$sample_trace_id found in Jaeger" + else + fail "Log-Jaeger cross-check: trace_id=$sample_trace_id NOT found in Jaeger" + fi + fi +} + cleanup() { log "Cleaning up..." # Kill xrpld nodes @@ -507,6 +550,13 @@ log "--- Phase 5: Peer Spans (trace_peer=1) ---" check_span "peer.proposal.receive" check_span "peer.validation.receive" +# --------------------------------------------------------------------------- +# Step 9b: Verify log-trace correlation (Phase 8) +# --------------------------------------------------------------------------- +log "" +log "--- Phase 8: Log-Trace Correlation ---" +check_log_correlation + # --------------------------------------------------------------------------- # Step 10: Verify Prometheus spanmetrics # --------------------------------------------------------------------------- @@ -602,6 +652,7 @@ echo "" echo " Jaeger UI: http://localhost:16686" echo " Grafana: http://localhost:3000" echo " Prometheus: http://localhost:9090" +echo " Loki: http://localhost:3100" echo "" echo " xrpld nodes (6) are running:" for i in $(seq 1 "$NUM_NODES"); do diff --git a/docker/telemetry/otel-collector-config.yaml b/docker/telemetry/otel-collector-config.yaml index 3cd3e2b639..8ee2180a15 100644 --- a/docker/telemetry/otel-collector-config.yaml +++ b/docker/telemetry/otel-collector-config.yaml @@ -3,6 +3,7 @@ # Pipelines: # traces: OTLP receiver -> batch processor -> debug + Jaeger + Tempo + spanmetrics # metrics: OTLP receiver + spanmetrics connector -> Prometheus exporter +# logs: filelog receiver -> batch processor -> Loki exporter (Phase 8) # # rippled sends traces via OTLP/HTTP to port 4318. The collector batches # them, forwards to both Jaeger and Tempo, and derives RED metrics via the @@ -17,6 +18,12 @@ # but commented out. If you need StatsD fallback (server=statsd in # [insight]), uncomment the statsd receiver and add it to the metrics # pipeline receivers list. +# +# Phase 8: The filelog receiver tails rippled's debug.log files under +# /var/log/rippled/ (mounted from the host). A regex_parser operator +# extracts timestamp, partition, severity, and optional trace_id/span_id +# fields injected by Logs::format(). Parsed logs are exported to Grafana +# Loki for log-trace correlation. receivers: otlp: @@ -41,6 +48,18 @@ receivers: # observer_type: "summary" # summary: # percentiles: [0, 50, 90, 95, 99, 100] + # Phase 8: Filelog receiver tails rippled debug.log files for log-trace + # correlation. Extracts structured fields (timestamp, partition, severity, + # trace_id, span_id, message) via regex. The trace_id and span_id are + # optional — only present when the log was emitted within an active span. + filelog: + include: [/var/log/rippled/*/debug.log] + operators: + - type: regex_parser + regex: '^(?P\S+)\s+(?P\S+):(?P\S+)\s+(?:trace_id=(?P[a-f0-9]+)\s+span_id=(?P[a-f0-9]+)\s+)?(?P.*)$' + timestamp: + parse_from: attributes.timestamp + layout: "%Y-%b-%d %H:%M:%S.%f" processors: batch: @@ -75,6 +94,11 @@ exporters: endpoint: tempo:4317 tls: insecure: true + # Phase 8: Loki exporter sends parsed log entries to Grafana Loki. + # Uses the native loki exporter which translates OTel log records + # into Loki's push API format with label extraction. + loki: + endpoint: http://loki:3100/loki/api/v1/push prometheus: endpoint: 0.0.0.0:8889 @@ -88,3 +112,9 @@ service: receivers: [otlp, spanmetrics] processors: [batch] exporters: [prometheus] + # Phase 8: Log pipeline ingests rippled debug.log via filelog receiver, + # batches entries, and exports to Loki for log-trace correlation. + logs: + receivers: [filelog] + processors: [batch] + exporters: [loki] diff --git a/src/libxrpl/basics/Log.cpp b/src/libxrpl/basics/Log.cpp index 3e36b7d458..7ae4814031 100644 --- a/src/libxrpl/basics/Log.cpp +++ b/src/libxrpl/basics/Log.cpp @@ -6,6 +6,14 @@ #include #include +// Phase 8: OTel trace context headers for log-trace correlation. +// GetSpan() and RuntimeContext::GetCurrent() are thread-local reads +// with no locking — measured at <10ns per call. +#ifdef XRPL_ENABLE_TELEMETRY +#include +#include +#endif // XRPL_ENABLE_TELEMETRY + #include #include #include @@ -345,6 +353,30 @@ Logs::format( break; } + // Phase 8: Inject OTel trace context (trace_id, span_id) into log lines + // for log-trace correlation. Only appended when an active span exists. + // GetSpan() reads thread-local storage — no locks, <10ns overhead. +#ifdef XRPL_ENABLE_TELEMETRY + { + auto span = + opentelemetry::trace::GetSpan(opentelemetry::context::RuntimeContext::GetCurrent()); + auto ctx = span->GetContext(); + if (ctx.IsValid()) + { + // Append trace context as structured key=value fields that the + // OTel Collector filelog receiver regex_parser can extract. + char traceId[33], spanId[17]; + ctx.trace_id().ToLowerBase16(traceId); + ctx.span_id().ToLowerBase16(spanId); + output += "trace_id="; + output.append(traceId, 32); + output += " span_id="; + output.append(spanId, 16); + output += ' '; + } + } +#endif // XRPL_ENABLE_TELEMETRY + output += message; // Limit the maximum length of the output