Compare commits

...

68 Commits

Author SHA1 Message Date
Pratik Mankawde
b46ee12a19 formatting fixes
Signed-off-by: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com>
2026-06-03 14:07:21 +01:00
Pratik Mankawde
154d441ff2 Merge branch 'develop' into pratik/otel-phase1a-plan-docs 2026-06-01 11:52:46 +01:00
Michael Legleux
0fffe23abc fix: Adjust xrpld systemd service and update timer (#7374) 2026-06-01 03:33:19 +00:00
Bart
7e15621e7b release: Bump version to 3.2.0-rc3 (#7371)
Co-authored-by: Bart <11445373+bthomee@users.noreply.github.com>
2026-05-31 22:55:18 +00:00
Vito Tumas
99431d7833 fix: Pin overpayment principal reduction to exact on-grid value (#7360) 2026-05-31 22:54:23 +00:00
Ed Hennis
47365f4220 fix: Improve upward rounding edge cases for Number::operator/= (#7328)
Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Vito Tumas <5780819+Tapanito@users.noreply.github.com>
2026-05-31 00:23:29 +00:00
Bart
1599c1a672 refactor: Revert "perf: Remove unnecessary caches (#5439)" (#7359)
Co-authored-by: Bart <11445373+bthomee@users.noreply.github.com>
2026-05-30 18:48:59 +00:00
yinyiqian1
763dd503be fix: Add zero domainID check for permissionedDomain (#7362) 2026-05-30 00:16:25 +00:00
Pratik Mankawde
e1163f7180 Merge branch 'develop' into pratik/otel-phase1a-plan-docs
Signed-off-by: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com>
2026-05-29 15:30:02 +01:00
Bart
2f3558c610 ci: Run PR title and description checks on staging and release branches (#7331)
Co-authored-by: Bart <11445373+bthomee@users.noreply.github.com>
2026-05-28 14:57:29 +00:00
Ayaz Salikhov
f9551ac5ca style: Run shfmt on workflows, actions and markdown bash code (#7333) 2026-05-27 19:24:18 +00:00
Bart
1acc42313c release: Bump version to 3.2.0-rc2 (#7348) 2026-05-27 15:11:38 -04:00
Bart
396d772a15 refactor: Enable support for fixCleanup3_2_0 amendment (#7347) 2026-05-27 19:10:33 +00:00
Ayaz Salikhov
1438bf1c67 release: Bump version to 3.2.0-rc1 (#7335) 2026-05-27 13:20:57 -04:00
Ed Hennis
7da643d864 fix: Fix a rounding error at the Number::maxRep cusp (#7051)
Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Vito Tumas <5780819+Tapanito@users.noreply.github.com>
2026-05-27 15:19:20 +00:00
Ayaz Salikhov
1162371def ci: Only push docker images in XRPLF/rippled (#7330) 2026-05-26 20:03:04 +00:00
dependabot[bot]
2a0feca46b ci: [DEPENDABOT] bump docker/setup-buildx-action from 4.0.0 to 4.1.0 (#7322)
Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2026-05-26 19:36:32 +00:00
dependabot[bot]
108a4c8217 ci: [DEPENDABOT] bump codecov/codecov-action from 6.0.0 to 6.0.1 (#7321)
Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2026-05-26 19:36:21 +00:00
dependabot[bot]
4584b01bde ci: [DEPENDABOT] bump docker/build-push-action from 7.1.0 to 7.2.0 (#7320)
Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2026-05-26 19:36:13 +00:00
dependabot[bot]
7c59786565 ci: [DEPENDABOT] bump docker/metadata-action from 6.0.0 to 6.1.0 (#7319)
Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2026-05-26 19:36:00 +00:00
dependabot[bot]
9623e67b76 ci: [DEPENDABOT] bump docker/login-action from 4.1.0 to 4.2.0 (#7318)
Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2026-05-26 19:35:52 +00:00
Andrzej Budzanowski
85af406a0f fix: Update clang-tidy to include src/tests directory header check (#7307) 2026-05-26 19:35:32 +00:00
Ayaz Salikhov
ac33fb32a7 chore: Pin Python packages for codegen using uv (#7329) 2026-05-26 18:35:38 +00:00
Ayaz Salikhov
23d0812827 style: Use shfmt instead of bashate (#7326) 2026-05-26 18:28:23 +00:00
Vito Tumas
49567e7283 fix: Fix edge-case where vault-depositor may get stuck (#7139) 2026-05-26 18:18:40 +00:00
Vito Tumas
633ef4706f fix: Fix VaultInvariant and VaultDeposit precision bugs at IOU scale boundaries (#7272)
Co-authored-by: Bart <bthomee@users.noreply.github.com>
2026-05-26 16:32:44 +00:00
Ayaz Salikhov
49cb3f45a4 ci: Add clang to nix images (#7308)
Co-authored-by: semgrep-companion-app[bot] <218312740+semgrep-companion-app[bot]@users.noreply.github.com>
Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
2026-05-26 15:45:33 +00:00
Vito Tumas
22a21b175e fix: Include management-fee delta in doOverpayment assertion (#7039) 2026-05-26 14:01:52 +00:00
Pratik Mankawde
e9d885bd9b fix: Fix clang-tidy pre-commit hook to locate compile_commands.json from repo root (#7325)
Signed-off-by: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com>
2026-05-26 13:50:18 +00:00
Jingchen
a911f9089e fix: Use consistent scale for debtTotal (#7093)
Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
2026-05-24 20:44:29 +00:00
Peter Chen
e34c2667d7 fix: Skip deleted book directories and non-root modifications in ValidBookDirectory invariant (#7312) 2026-05-24 20:37:16 +00:00
Valentin Balaschenko
30de556224 fix: Address review feedback on FD/handle guarding (#5823 follow-up) (#7310) 2026-05-23 14:48:48 +00:00
Gregory Tsipenyuk
dcd2ff0b5f fix: Fix non-canonical MPT amount (#7117)
Co-authored-by: xrplf-ai-reviewer[bot] <266832837+xrplf-ai-reviewer[bot]@users.noreply.github.com>
Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
2026-05-23 06:40:26 +00:00
Bart
dfb9b8ed9a release: Bump version to 3.2.0-b7 (#7316)
Co-authored-by: Bart <11445373+bthomee@users.noreply.github.com>
2026-05-22 19:32:12 +00:00
Jingchen
179e73594a fix: Check if the MPT first loss cover can be sent to the broker before deleting the broker (#7125)
Co-authored-by: xrplf-ai-reviewer[bot] <266832837+xrplf-ai-reviewer[bot]@users.noreply.github.com>
2026-05-22 11:58:48 +00:00
Michael Legleux
15dd653e4b fix: Fix RPM prerelease ordering and start xrpld on DEB install (#7313) 2026-05-22 11:30:45 +00:00
Michael Legleux
a37afe13ff ci: Re-enable full nproc for Linux (#7315) 2026-05-22 11:30:37 +00:00
Gregory Tsipenyuk
3547a9335f fix: Add assorted MPT/DEX fixes (#7040)
Co-authored-by: xrplf-ai-reviewer[bot] <266832837+xrplf-ai-reviewer[bot]@users.noreply.github.com>
Co-authored-by: Shawn Xie <35279399+shawnxie999@users.noreply.github.com>
2026-05-21 18:29:53 +00:00
Bart
1a98182e23 refactor: Remove dead fetchBatch code (#7309)
Co-authored-by: Bart <11445373+bthomee@users.noreply.github.com>
2026-05-21 17:52:41 +00:00
Bart
79308705c5 release: Bump version to 3.2.0-b6 (#7311)
Co-authored-by: Bart <11445373+bthomee@users.noreply.github.com>
2026-05-21 17:50:59 +00:00
Vito Tumas
e24de65f42 chore: Revert graceful peer disconnection and follow-up fix (#7296) 2026-05-21 16:13:41 +00:00
Vito Tumas
7fdaa0a5ef fix: Fix IOU precision issues in LoanBrokerCover transactions (#7274) 2026-05-21 14:51:58 +00:00
Vito Tumas
795dc5e364 fix: Avoid principal-zeroing in non-final loan payments at coarse scale (#7050)
Co-authored-by: Ed Hennis <ed@ripple.com>
2026-05-21 14:46:26 +00:00
Pratik Mankawde
f6fd5ddb0a fix: Add null check (#7305)
Signed-off-by: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
2026-05-21 13:24:04 +00:00
Rithvik Reddygari
afcf6fbcdc docs: Add --parallel flag to cmake build commands in BUILD.md (#7302) 2026-05-21 06:33:19 +00:00
Shawn Xie
28cc20c816 fix: Fix wrong hybrid offer orderbook placement and update LedgerStateFix to amend ExchangeRate meta (#7087)
Co-authored-by: Peter Chen <ychen@ripple.com>
2026-05-21 06:19:04 +00:00
Alex Kremer
a830ab10ef style: More clang-tidy identifier renaming (#7290) 2026-05-20 21:31:15 +00:00
Shawn Xie
8c0080020f fix: Update pDEX invariant firing under a valid offer deletion (#7118)
Co-authored-by: Peter Chen <ychen@ripple.com>
2026-05-20 21:10:04 +00:00
yinyiqian1
9cb0740673 fix: Fix multisign and signfor to check for delegate (#7064) 2026-05-20 20:24:09 +00:00
Mayukha Vadari
242ce3e9e4 refactor: Fix sfGeneric and sfInvalid field names (#7300) 2026-05-20 19:47:59 +00:00
box4wangjing
a5d238e7d4 docs: Fix some comments to improve readability (#7122)
Signed-off-by: box4wangjing <box4wangjing@outlook.com>
Co-authored-by: Mayukha Vadari <mvadari@ripple.com>
2026-05-20 19:46:45 +00:00
Vito Tumas
9cb049276d feat: Propagate underlying MPT flags to vault shares (#7077)
Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: Denis Angell <dangell@transia.co>
Co-authored-by: Fomo <508629+shortthefomo@users.noreply.github.com>
Co-authored-by: Bart <bthomee@users.noreply.github.com>
Co-authored-by: Ayaz Salikhov <mathbunnyru@users.noreply.github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2026-05-20 19:44:09 +00:00
Vito Tumas
93ac1aa7aa fix: Disable unnecessary sanity-check in VaultDeposit (#7288) 2026-05-19 16:38:50 +00:00
dependabot[bot]
d9a3af8207 ci: [DEPENDABOT] bump actions/upload-artifact from 7.0.0 to 7.0.1 (#7286)
Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2026-05-19 16:35:38 +00:00
Ayaz Salikhov
8d1083e5ea ci: Only run reusable package in public repos (#7293) 2026-05-19 13:15:11 +00:00
Pratik Mankawde
f3a095ab65 docs(telemetry): align Phase 1a plan docs with Phase 1b implementation
Phase-1a plan documents advertised OTLP/gRPC on port 4317 as the default
exporter, four unparsed [telemetry] config keys, and "Phase 4a Complete"
status with exit-criteria checkboxes marked done. Every downstream branch
through Phase 5 ships only OTLP/HTTP on port 4318 via OtlpHttpExporterFactory,
never parses the advertised keys, and the Phase 4 work is not yet delivered.

Fixes:
- 02-design-decisions.md: flip §2.1.1 SDK dependency recommendations to
  OTLP/HTTP (shipped) with OTLP/gRPC marked Future. Update §2.2 architecture
  diagram and text from OTLP/gRPC:4317 to OTLP/HTTP:4318. Rewrite §2.2.1 as
  "OTLP/HTTP (Shipped)" and §2.2.2 as "OTLP/gRPC (Future Work — Planned
  Upgrade)" with a concrete checklist (Conan dep, config parsing, factory
  branch, runbook/dashboard updates) for landing the gRPC transport later.
- 05-configuration-reference.md: drop the fabricated exporter/otlp_grpc key
  and the :4317 default from the sample config block and the options-summary
  table. Move trace_pathfind, trace_txq, trace_validator, trace_amendment
  into a new "Planned (not yet implemented)" table citing the phase that will
  add each one. Keep the example config minimal so copy-paste does not produce
  a silently-ignored stanza.
- 06-implementation-phases.md: reset Phase 4 Exit Criteria checkboxes from
  [x] to [ ] (Phase 4 is not shipped at Phase-1a time). Rename "Phase 4a
  Complete" to "Phase 4a Plan" and describe the work as future. Replace the
  broken forward link to Phase4_taskList.md (introduced in the Phase 2 PR)
  with a sentence pointing readers to where that spec will land. Renumber
  the final section 6.12 to 6.11 so it sits directly after 6.10; section 6.11
  ("Effort Summary") was intentionally removed in earlier edits.
2026-05-14 16:09:48 +01:00
Pratik Mankawde
1fd971b78b fix(docs): apply rename scripts to OpenTelemetry plan docs
Run .github/scripts/rename/docs.sh to replace rippled → xrpld
references in all plan documentation files, fixing the check-rename
CI failure.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-28 13:57:38 +01:00
Pratik Mankawde
d6c8dec451 Merge branch 'develop' into pratik/otel-phase1a-plan-docs 2026-04-28 11:19:51 +01:00
Pratik Mankawde
30ecb32a6f Merge branch 'develop' into pratik/otel-phase1a-plan-docs 2026-04-27 19:42:09 +01:00
Pratik Mankawde
a01b274352 Merge branch 'develop' into pratik/otel-phase1a-plan-docs 2026-04-20 17:21:44 +01:00
Pratik Mankawde
193f5b39cb docs(telemetry): update plan docs for ServiceRegistry migration
Plan documents referenced Application.h and app_ for getTelemetry()
but the codebase now uses ServiceRegistry as the interface. Updated:

- 05-configuration-reference.md: getTelemetry() on ServiceRegistry,
  deferred serviceInstanceId pattern in ApplicationImp
- POC_taskList.md Task 4: target ServiceRegistry.h not Application.h,
  correct config file path and constructor pattern
- 04-code-samples.md: fix overlay() -> getOverlay(), rewrite JobQueue
  sample to reflect actual architecture (no app_ member)
- 03-implementation-strategy.md: fix file impact table path

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-16 15:37:13 +01:00
Pratik Mankawde
db8111ef7c docs(telemetry): replace Jaeger with Tempo in architecture diagram
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-16 15:00:48 +01:00
Pratik Mankawde
913a4b794c docs: correct OTel overhead estimates against SDK benchmarks
Verified CPU, memory, and network overhead calculations against
official OTel C++ SDK benchmarks (969 CI runs) and source code
analysis. Key corrections:

- Span creation: 200-500ns → 500-1000ns (SDK BM_SpanCreation median
  ~1000ns; original estimate matched API no-op, not SDK path)
- Per-TX overhead: 2.4μs → 4.0μs (2.0% vs 1.2%; still within 1-3%)
- Active span memory: ~200 bytes → ~500-800 bytes (Span wrapper +
  SpanData + std::map attribute storage)
- Static memory: ~456KB → ~8.3MB (BatchSpanProcessor worker thread
  stack ~8MB was omitted)
- Total memory ceiling: ~2.3MB → ~10MB
- Memory success metric target: <5MB → <10MB
- AddEvent: 50-80ns → 100-200ns

Added Section 3.5.4 with links to all benchmark sources.
Updated presentation.md with matching corrections.
High-level conclusions unchanged (1-3% CPU, negligible consensus).

Also includes: review fixes, cross-document consistency improvements,
additional component tracing docs (PathFinding, TxQ, Validator, etc.),
context size corrections (32 → 25 bytes).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-16 15:00:47 +01:00
Pratik Mankawde
accea17e9d moved presentation.md file
Signed-off-by: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com>
2026-04-16 15:00:47 +01:00
Pratik Mankawde
c6fa00fbe3 Remove effort estimates from implementation phases document
Strip effort/risk columns from task tables and remove the §6.9 Effort
Summary section with its pie chart and resource requirements table.
Renumber §6.10 Quick Wins → §6.9.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-16 15:00:47 +01:00
Pratik Mankawde
bfb8f4f01a Add Phase 4a implementation status to plan docs
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-16 15:00:47 +01:00
Pratik Mankawde
4b745a86b7 Appendix: add 00-tracing-fundamentals.md and POC_taskList.md to document index
Split document index into Plan Documents and Task Lists sections.
These files were introduced in this branch but missing from the index.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-16 15:00:47 +01:00
Pratik Mankawde
ddf894dcb0 Phase 1a: OpenTelemetry plan documentation
Add comprehensive planning documentation for the OpenTelemetry
distributed tracing integration:

- Tracing fundamentals and concepts
- Architecture analysis of rippled's tracing surface area
- Design decisions and trade-offs
- Implementation strategy and code samples
- Configuration reference
- Implementation phases roadmap
- Observability backend comparison
- POC task list and presentation materials

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-16 15:00:47 +01:00
349 changed files with 20891 additions and 4715 deletions

View File

@@ -191,11 +191,14 @@ CheckOptions:
readability-identifier-naming.ParameterCase: camelBack
readability-identifier-naming.FunctionCase: camelBack
readability-identifier-naming.MemberCase: camelBack
readability-identifier-naming.PrivateMemberCase: camelBack
readability-identifier-naming.PrivateMemberSuffix: _
readability-identifier-naming.ProtectedMemberCase: camelBack
readability-identifier-naming.ProtectedMemberSuffix: _
readability-identifier-naming.PublicMemberCase: camelBack
readability-identifier-naming.PublicMemberSuffix: ""
readability-identifier-naming.GlobalFunctionIgnoredRegexp: "^(to_string|hash_append|tuple_hash)$"
HeaderFilterRegex: '^.*/(test|xrpl|xrpld)/.*\.(h|hpp|ipp)$'
HeaderFilterRegex: '^.*/(tests?|xrpl|xrpld)/.*\.(h|hpp|ipp)$'
ExcludeHeaderFilterRegex: '^.*/protocol_autogen/.*\.(h|hpp)$'
WarningsAsErrors: "*"

View File

@@ -37,12 +37,12 @@ runs:
run: |
echo 'Installing dependencies.'
conan install \
--profile ci \
--build="${BUILD_OPTION}" \
--options:host='&:tests=True' \
--options:host='&:xrpld=True' \
--settings:all build_type="${BUILD_TYPE}" \
--conf:all tools.build:jobs=${BUILD_NPROC} \
--conf:all tools.build:verbosity="${LOG_VERBOSITY}" \
--conf:all tools.compilation:verbosity="${LOG_VERBOSITY}" \
.
--profile ci \
--build="${BUILD_OPTION}" \
--options:host='&:tests=True' \
--options:host='&:xrpld=True' \
--settings:all build_type="${BUILD_TYPE}" \
--conf:all tools.build:jobs=${BUILD_NPROC} \
--conf:all tools.build:verbosity="${LOG_VERBOSITY}" \
--conf:all tools.compilation:verbosity="${LOG_VERBOSITY}" \
.

View File

@@ -15,7 +15,7 @@ runs:
shell: bash
env:
VERSION: ${{ github.ref_name }}
run: echo "VERSION=${VERSION}" >> "${GITHUB_ENV}"
run: echo "VERSION=${VERSION}" >>"${GITHUB_ENV}"
# When a tag is not pushed, then the version (e.g. 1.2.3-b0) is extracted
# from the BuildInfo.cpp file and the shortened commit hash appended to it.
@@ -28,17 +28,17 @@ runs:
echo 'Extracting version from BuildInfo.cpp.'
VERSION="$(cat src/libxrpl/protocol/BuildInfo.cpp | grep "versionString =" | awk -F '"' '{print $2}')"
if [[ -z "${VERSION}" ]]; then
echo 'Unable to extract version from BuildInfo.cpp.'
exit 1
echo 'Unable to extract version from BuildInfo.cpp.'
exit 1
fi
echo 'Appending shortened commit hash to version.'
SHA='${{ github.sha }}'
VERSION="${VERSION}+${SHA:0:7}"
echo "VERSION=${VERSION}" >> "${GITHUB_ENV}"
echo "VERSION=${VERSION}" >>"${GITHUB_ENV}"
- name: Output version
id: version
shell: bash
run: echo "version=${VERSION}" >> "${GITHUB_OUTPUT}"
run: echo "version=${VERSION}" >>"${GITHUB_OUTPUT}"

403
.github/scripts/format-inline-bash.py vendored Executable file
View File

@@ -0,0 +1,403 @@
#!/usr/bin/env python3
"""
Format embedded shell snippets using the shfmt hook configured in
.pre-commit-config.yaml.
Two shapes are recognised:
* YAML workflow/action files: literal block-scalar runs (`run: |`) and
single-line runs (`run: some command`). A single-line run is upgraded to
a `run: |` block scalar if shfmt's output spans multiple lines.
* Markdown files: ``` ```bash ``` fenced code blocks.
Any block that shfmt cannot parse is skipped with a warning on stderr, so
the file is left untouched and surrounding blocks still get formatted.
For each occurrence the body is dedented, written to a temp .sh file,
formatted via `pre-commit run shfmt --files <temp>` (falling back to
`prek`), then re-indented and written back in place.
When invoked without arguments, every .yml/.yaml under .github/ plus every
.md file in the repo is scanned. When invoked with file arguments (the
pre-commit case), only those files are processed.
"""
from __future__ import annotations
import re
import shutil
import subprocess
import sys
import tempfile
from dataclasses import dataclass
from pathlib import Path
from typing import Union
REPO = Path(__file__).resolve().parents[2]
_HOOK_RUNNER = next((cmd for cmd in ("pre-commit", "prek") if shutil.which(cmd)), None)
if _HOOK_RUNNER is None:
sys.exit("error: neither `pre-commit` nor `prek` found on PATH")
RUN_BLOCK_RE = re.compile(r"^(?P<prefix>[ \t]*(?:- )?)run:[ \t]*\|[+-]?[ \t]*$")
RUN_INLINE_RE = re.compile(
r"^(?P<prefix>[ \t]*(?:- )?)run:[ \t]+" r"(?P<value>(?!\|[+-]?[ \t]*$)\S.*?)[ \t]*$"
)
MD_BASH_OPEN_RE = re.compile(r"^(?P<indent>[ ]{0,3})`{3}bash[ \t]*$")
MD_FENCE_CLOSE_RE = re.compile(r"^[ ]{0,3}`{3,}[ \t]*$")
@dataclass(frozen=True)
class BlockRun:
"""A `run: |` block scalar; `body_start:body_end` slices into `lines`."""
body_start: int
body_end: int
body_indent: int
@dataclass(frozen=True)
class InlineRun:
"""A single-line `run: value` at `line_idx`."""
line_idx: int
prefix: str
value: str
@dataclass(frozen=True)
class MdBashBlock:
"""A markdown ``` ```bash ``` fenced code block.
`body_start:body_end` slices into the file's lines; `open_line_idx`
points at the opening fence line.
"""
open_line_idx: int
body_start: int
body_end: int
body_indent: int
RunItem = Union[BlockRun, InlineRun]
def _scan_block_body(
lines: list[str], body_start: int, run_col: int
) -> tuple[int | None, int]:
"""Locate the body of a `run: |` block scalar starting at `body_start`.
Returns `(body_indent, scan_end)`. `scan_end` is the line index where the
outer scanner should resume. `body_indent` is `None` when no body is
present (the scalar is empty, or the next non-blank line has indent
`<= run_col`).
"""
body_indent: int | None = None
scan_end = len(lines)
for idx in range(body_start, len(lines)):
line = lines[idx]
if line.strip() == "":
continue
indent = len(line) - len(line.lstrip(" "))
if body_indent is None:
if indent > run_col:
body_indent = indent
else:
scan_end = idx
break
elif indent < body_indent:
scan_end = idx
break
if body_indent is not None:
while scan_end > body_start and lines[scan_end - 1].strip() == "":
scan_end -= 1
if scan_end <= body_start:
body_indent = None
return body_indent, scan_end
def find_run_blocks(lines: list[str]) -> list[RunItem]:
"""Return run items in document order."""
items: list[RunItem] = []
line_idx = 0
while line_idx < len(lines):
line = lines[line_idx]
if block_match := RUN_BLOCK_RE.match(line):
run_col = len(block_match.group("prefix"))
body_start = line_idx + 1
body_indent, scan_end = _scan_block_body(lines, body_start, run_col)
if body_indent is not None:
items.append(
BlockRun(
body_start=body_start,
body_end=scan_end,
body_indent=body_indent,
)
)
line_idx = scan_end
continue
if inline_match := RUN_INLINE_RE.match(line):
items.append(
InlineRun(
line_idx=line_idx,
prefix=inline_match.group("prefix"),
value=inline_match.group("value"),
)
)
line_idx += 1
return items
def find_md_bash_blocks(lines: list[str]) -> list[MdBashBlock]:
"""Return ``` ```bash ``` fenced code blocks in document order."""
blocks: list[MdBashBlock] = []
line_idx = 0
while line_idx < len(lines):
open_match = MD_BASH_OPEN_RE.match(lines[line_idx])
if not open_match:
line_idx += 1
continue
body_start = line_idx + 1
close_idx = next(
(
j
for j in range(body_start, len(lines))
if MD_FENCE_CLOSE_RE.match(lines[j])
),
None,
)
if close_idx is None:
line_idx = body_start
continue
body = lines[body_start:close_idx]
non_blank = [b for b in body if b.strip()]
body_indent = (
min(len(b) - len(b.lstrip(" ")) for b in non_blank)
if non_blank
else len(open_match.group("indent"))
)
blocks.append(
MdBashBlock(
open_line_idx=line_idx,
body_start=body_start,
body_end=close_idx,
body_indent=body_indent,
)
)
line_idx = close_idx + 1
return blocks
def dedent(lines: list[str], n: int) -> list[str]:
pad = " " * n
return [
(
""
if line.strip() == ""
else (line[n:] if line.startswith(pad) else line.lstrip(" "))
)
for line in lines
]
def reindent(lines: list[str], n: int) -> list[str]:
pad = " " * n
return [pad + line if line else "" for line in lines]
_SHFMT_ERR_RE = re.compile(r"\.sh:\d+:\d+:\s")
_GHA_EXPR_RE = re.compile(r"\$\{\{.*?\}\}", re.DOTALL)
_GHA_PLACEHOLDER_RE = re.compile(r"__GHA_EXPR_(\d+)__")
def _encode_gha_exprs(text: str) -> tuple[str, list[str]]:
"""Replace `${{ ... }}` expressions with bash-safe placeholder identifiers."""
exprs: list[str] = []
def repl(match: re.Match[str]) -> str:
exprs.append(match.group(0))
return f"__GHA_EXPR_{len(exprs) - 1}__"
return _GHA_EXPR_RE.sub(repl, text), exprs
def _decode_gha_exprs(text: str, exprs: list[str]) -> str:
"""Restore `${{ ... }}` expressions from placeholder identifiers."""
return _GHA_PLACEHOLDER_RE.sub(lambda m: exprs[int(m.group(1))], text)
def shfmt_via_hook(tmp_path: Path) -> tuple[bool, str]:
# `${{ ... }}` is not valid shell, so swap it for a placeholder identifier
# that shfmt can parse, then restore it after formatting.
encoded, exprs = _encode_gha_exprs(tmp_path.read_text())
if exprs:
tmp_path.write_text(encoded)
res = subprocess.run(
[_HOOK_RUNNER, "run", "shfmt", "--files", str(tmp_path)],
cwd=REPO,
capture_output=True,
text=True,
)
output = res.stdout + res.stderr
# shfmt emits parse errors as "<path>:<line>:<col>: <message>".
parse_err = bool(_SHFMT_ERR_RE.search(output))
# A non-zero exit that is neither a parse error nor pre-commit's "I had
# to modify files" signal means the hook itself failed to run (missing
# binary, install failure, bad config, ...). Surface that loudly rather
# than silently treating it as a no-op.
if (
res.returncode != 0
and not parse_err
and "files were modified by this hook" not in output
):
sys.exit(
f"error: `{_HOOK_RUNNER} run shfmt` failed with exit {res.returncode}:\n{output}"
)
if exprs and not parse_err:
tmp_path.write_text(_decode_gha_exprs(tmp_path.read_text(), exprs))
return not parse_err, output
def _skip(path: Path, where: int, kind: str, output: str) -> None:
print(
f" shfmt could not parse {kind} at {path}:{where + 1} — skipped",
file=sys.stderr,
)
print(f" {output.strip()}", file=sys.stderr)
def process_yaml_file(path: Path, tmp_path: Path) -> int:
text = path.read_text()
had_nl = text.endswith("\n")
lines = text.split("\n")
if had_nl:
lines = lines[:-1]
items = find_run_blocks(lines)
if not items:
return 0
changed = 0
# Process in reverse so earlier indices remain valid as we splice.
for item in reversed(items):
if isinstance(item, BlockRun):
body = lines[item.body_start : item.body_end]
tmp_path.write_text("\n".join(dedent(body, item.body_indent)) + "\n")
ok, output = shfmt_via_hook(tmp_path)
if not ok:
_skip(path, item.body_start, "block", output)
continue
formatted = tmp_path.read_text().rstrip("\n")
new_body = reindent(formatted.split("\n"), item.body_indent)
if new_body != body:
lines[item.body_start : item.body_end] = new_body
changed += 1
else:
tmp_path.write_text(item.value + "\n")
ok, output = shfmt_via_hook(tmp_path)
if not ok:
_skip(path, item.line_idx, "inline run", output)
continue
formatted = tmp_path.read_text().rstrip("\n")
if formatted == item.value:
continue
formatted_lines = formatted.split("\n")
if len(formatted_lines) == 1:
lines[item.line_idx] = f"{item.prefix}run: {formatted}"
else:
body_indent = len(item.prefix) + 2
lines[item.line_idx : item.line_idx + 1] = [
f"{item.prefix}run: |",
*reindent(formatted_lines, body_indent),
]
changed += 1
new_text = "\n".join(lines) + ("\n" if had_nl else "")
if new_text != text:
path.write_text(new_text)
return changed
def process_md_file(path: Path, tmp_path: Path) -> int:
text = path.read_text()
had_nl = text.endswith("\n")
lines = text.split("\n")
if had_nl:
lines = lines[:-1]
blocks = find_md_bash_blocks(lines)
if not blocks:
return 0
changed = 0
for block in reversed(blocks):
body = lines[block.body_start : block.body_end]
tmp_path.write_text("\n".join(dedent(body, block.body_indent)) + "\n")
ok, output = shfmt_via_hook(tmp_path)
if not ok:
_skip(path, block.open_line_idx, "```bash block", output)
continue
formatted = tmp_path.read_text().rstrip("\n")
formatted_lines = formatted.split("\n") if formatted else []
new_body = reindent(formatted_lines, block.body_indent)
if new_body != body:
lines[block.body_start : block.body_end] = new_body
changed += 1
new_text = "\n".join(lines) + ("\n" if had_nl else "")
if new_text != text:
path.write_text(new_text)
return changed
def process_file(path: Path, tmp_path: Path) -> int:
if path.suffix in (".yml", ".yaml"):
return process_yaml_file(path, tmp_path)
if path.suffix == ".md":
return process_md_file(path, tmp_path)
return 0
def gather_files(argv: list[str]) -> list[Path]:
"""Return YAML workflow/action files and markdown files that we should
process — either the paths in `argv` or, when `argv` is empty, every
such file in the repo (skipping `external/`)."""
if argv:
candidates: list[Path] = [
(REPO / a).resolve() if not Path(a).is_absolute() else Path(a) for a in argv
]
else:
gh = REPO / ".github"
candidates = [
*gh.rglob("*.yml"),
*gh.rglob("*.yaml"),
*(
p
for p in REPO.rglob("*.md")
if "external" not in p.relative_to(REPO).parts
),
]
return sorted(
p
for p in candidates
if p.exists()
and (
(p.suffix in (".yml", ".yaml") and ".github" in p.parts)
or p.suffix == ".md"
)
)
def main(argv: list[str]) -> int:
files = gather_files(argv)
if not files:
return 0
with tempfile.TemporaryDirectory(prefix="format-inline-bash-") as tmpdir:
tmp_path = Path(tmpdir) / "shfmt.sh"
total = 0
for f in files:
n = process_file(f, tmp_path)
if n:
print(f"{f.relative_to(REPO)}: reformatted {n} block(s)")
total += n
return 1 if total else 0
if __name__ == "__main__":
sys.exit(main(sys.argv[1:]))

0
.github/scripts/levelization/generate.py vendored Normal file → Executable file
View File

View File

@@ -6,7 +6,7 @@ set -e
# On MacOS, ensure that GNU sed is installed and available as `gsed`.
SED_COMMAND=sed
if [[ "${OSTYPE}" == 'darwin'* ]]; then
if ! command -v gsed &> /dev/null; then
if ! command -v gsed &>/dev/null; then
echo "Error: gsed is not installed. Please install it using 'brew install gnu-sed'."
exit 1
fi

View File

@@ -8,12 +8,12 @@ set -e
SED_COMMAND=sed
HEAD_COMMAND=head
if [[ "${OSTYPE}" == 'darwin'* ]]; then
if ! command -v gsed &> /dev/null; then
if ! command -v gsed &>/dev/null; then
echo "Error: gsed is not installed. Please install it using 'brew install gnu-sed'."
exit 1
fi
SED_COMMAND=gsed
if ! command -v ghead &> /dev/null; then
if ! command -v ghead &>/dev/null; then
echo "Error: ghead is not installed. Please install it using 'brew install coreutils'."
exit 1
fi
@@ -74,10 +74,10 @@ if grep -q '"xrpld"' cmake/XrplCore.cmake; then
# The script has been rerun, so just restore the name of the binary.
${SED_COMMAND} -i 's/"xrpld"/"rippled"/' cmake/XrplCore.cmake
elif ! grep -q '"rippled"' cmake/XrplCore.cmake; then
${HEAD_COMMAND} -n -1 cmake/XrplCore.cmake > cmake.tmp
echo ' # For the time being, we will keep the name of the binary as it was.' >> cmake.tmp
echo ' set_target_properties(xrpld PROPERTIES OUTPUT_NAME "rippled")' >> cmake.tmp
tail -1 cmake/XrplCore.cmake >> cmake.tmp
${HEAD_COMMAND} -n -1 cmake/XrplCore.cmake >cmake.tmp
echo ' # For the time being, we will keep the name of the binary as it was.' >>cmake.tmp
echo ' set_target_properties(xrpld PROPERTIES OUTPUT_NAME "rippled")' >>cmake.tmp
tail -1 cmake/XrplCore.cmake >>cmake.tmp
mv cmake.tmp cmake/XrplCore.cmake
fi

View File

@@ -6,7 +6,7 @@ set -e
# On MacOS, ensure that GNU sed is installed and available as `gsed`.
SED_COMMAND=sed
if [[ "${OSTYPE}" == 'darwin'* ]]; then
if ! command -v gsed &> /dev/null; then
if ! command -v gsed &>/dev/null; then
echo "Error: gsed is not installed. Please install it using 'brew install gnu-sed'."
exit 1
fi

View File

@@ -6,7 +6,7 @@ set -e
# On MacOS, ensure that GNU sed is installed and available as `gsed`.
SED_COMMAND=sed
if [[ "${OSTYPE}" == 'darwin'* ]]; then
if ! command -v gsed &> /dev/null; then
if ! command -v gsed &>/dev/null; then
echo "Error: gsed is not installed. Please install it using 'brew install gnu-sed'."
exit 1
fi
@@ -62,37 +62,37 @@ done
# restoring the verbiage that is already present in LICENSE.md. Ensure that if
# the script is run multiple times, duplicate notices are not added.
if ! grep -q 'Raw Material Software' include/xrpl/beast/core/CurrentThreadName.h; then
echo -e "// Portions of this file are from JUCE (http://www.juce.com).\n// Copyright (c) 2013 - Raw Material Software Ltd.\n// Please visit http://www.juce.com\n\n$(cat include/xrpl/beast/core/CurrentThreadName.h)" > include/xrpl/beast/core/CurrentThreadName.h
echo -e "// Portions of this file are from JUCE (http://www.juce.com).\n// Copyright (c) 2013 - Raw Material Software Ltd.\n// Please visit http://www.juce.com\n\n$(cat include/xrpl/beast/core/CurrentThreadName.h)" >include/xrpl/beast/core/CurrentThreadName.h
fi
if ! grep -q 'Dev Null' src/test/app/NetworkID_test.cpp; then
echo -e "// Copyright (c) 2020 Dev Null Productions\n\n$(cat src/test/app/NetworkID_test.cpp)" > src/test/app/NetworkID_test.cpp
echo -e "// Copyright (c) 2020 Dev Null Productions\n\n$(cat src/test/app/NetworkID_test.cpp)" >src/test/app/NetworkID_test.cpp
fi
if ! grep -q 'Dev Null' src/test/app/tx/apply_test.cpp; then
echo -e "// Copyright (c) 2020 Dev Null Productions\n\n$(cat src/test/app/tx/apply_test.cpp)" > src/test/app/tx/apply_test.cpp
echo -e "// Copyright (c) 2020 Dev Null Productions\n\n$(cat src/test/app/tx/apply_test.cpp)" >src/test/app/tx/apply_test.cpp
fi
if ! grep -q 'Dev Null' src/test/rpc/ManifestRPC_test.cpp; then
echo -e "// Copyright (c) 2020 Dev Null Productions\n\n$(cat src/test/rpc/ManifestRPC_test.cpp)" > src/test/rpc/ManifestRPC_test.cpp
echo -e "// Copyright (c) 2020 Dev Null Productions\n\n$(cat src/test/rpc/ManifestRPC_test.cpp)" >src/test/rpc/ManifestRPC_test.cpp
fi
if ! grep -q 'Dev Null' src/test/rpc/ValidatorInfo_test.cpp; then
echo -e "// Copyright (c) 2020 Dev Null Productions\n\n$(cat src/test/rpc/ValidatorInfo_test.cpp)" > src/test/rpc/ValidatorInfo_test.cpp
echo -e "// Copyright (c) 2020 Dev Null Productions\n\n$(cat src/test/rpc/ValidatorInfo_test.cpp)" >src/test/rpc/ValidatorInfo_test.cpp
fi
if ! grep -q 'Dev Null' src/xrpld/rpc/handlers/server_info/Manifest.cpp; then
echo -e "// Copyright (c) 2019 Dev Null Productions\n\n$(cat src/xrpld/rpc/handlers/server_info/Manifest.cpp)" > src/xrpld/rpc/handlers/server_info/Manifest.cpp
echo -e "// Copyright (c) 2019 Dev Null Productions\n\n$(cat src/xrpld/rpc/handlers/server_info/Manifest.cpp)" >src/xrpld/rpc/handlers/server_info/Manifest.cpp
fi
if ! grep -q 'Dev Null' src/xrpld/rpc/handlers/admin/status/ValidatorInfo.cpp; then
echo -e "// Copyright (c) 2019 Dev Null Productions\n\n$(cat src/xrpld/rpc/handlers/admin/status/ValidatorInfo.cpp)" > src/xrpld/rpc/handlers/admin/status/ValidatorInfo.cpp
echo -e "// Copyright (c) 2019 Dev Null Productions\n\n$(cat src/xrpld/rpc/handlers/admin/status/ValidatorInfo.cpp)" >src/xrpld/rpc/handlers/admin/status/ValidatorInfo.cpp
fi
if ! grep -q 'Bougalis' include/xrpl/basics/SlabAllocator.h; then
echo -e "// Copyright (c) 2022, Nikolaos D. Bougalis <nikb@bougalis.net>\n\n$(cat include/xrpl/basics/SlabAllocator.h)" > include/xrpl/basics/SlabAllocator.h # cspell: ignore Nikolaos Bougalis nikb
echo -e "// Copyright (c) 2022, Nikolaos D. Bougalis <nikb@bougalis.net>\n\n$(cat include/xrpl/basics/SlabAllocator.h)" >include/xrpl/basics/SlabAllocator.h # cspell: ignore Nikolaos Bougalis nikb
fi
if ! grep -q 'Bougalis' include/xrpl/basics/spinlock.h; then
echo -e "// Copyright (c) 2022, Nikolaos D. Bougalis <nikb@bougalis.net>\n\n$(cat include/xrpl/basics/spinlock.h)" > include/xrpl/basics/spinlock.h # cspell: ignore Nikolaos Bougalis nikb
echo -e "// Copyright (c) 2022, Nikolaos D. Bougalis <nikb@bougalis.net>\n\n$(cat include/xrpl/basics/spinlock.h)" >include/xrpl/basics/spinlock.h # cspell: ignore Nikolaos Bougalis nikb
fi
if ! grep -q 'Bougalis' include/xrpl/basics/tagged_integer.h; then
echo -e "// Copyright (c) 2014, Nikolaos D. Bougalis <nikb@bougalis.net>\n\n$(cat include/xrpl/basics/tagged_integer.h)" > include/xrpl/basics/tagged_integer.h # cspell: ignore Nikolaos Bougalis nikb
echo -e "// Copyright (c) 2014, Nikolaos D. Bougalis <nikb@bougalis.net>\n\n$(cat include/xrpl/basics/tagged_integer.h)" >include/xrpl/basics/tagged_integer.h # cspell: ignore Nikolaos Bougalis nikb
fi
if ! grep -q 'Ritchford' include/xrpl/beast/utility/Zero.h; then
echo -e "// Copyright (c) 2014, Tom Ritchford <tom@swirly.com>\n\n$(cat include/xrpl/beast/utility/Zero.h)" > include/xrpl/beast/utility/Zero.h # cspell: ignore Ritchford
echo -e "// Copyright (c) 2014, Tom Ritchford <tom@swirly.com>\n\n$(cat include/xrpl/beast/utility/Zero.h)" >include/xrpl/beast/utility/Zero.h # cspell: ignore Ritchford
fi
# Restore newlines and tabs in string literals in the affected file.

View File

@@ -6,7 +6,7 @@ set -e
# On MacOS, ensure that GNU sed is installed and available as `gsed`.
SED_COMMAND=sed
if [[ "${OSTYPE}" == 'darwin'* ]]; then
if ! command -v gsed &> /dev/null; then
if ! command -v gsed &>/dev/null; then
echo "Error: gsed is not installed. Please install it using 'brew install gnu-sed'."
exit 1
fi

View File

@@ -6,7 +6,7 @@ set -e
# On MacOS, ensure that GNU sed is installed and available as `gsed`.
SED_COMMAND=sed
if [[ "${OSTYPE}" == 'darwin'* ]]; then
if ! command -v gsed &> /dev/null; then
if ! command -v gsed &>/dev/null; then
echo "Error: gsed is not installed. Please install it using 'brew install gnu-sed'."
exit 1
fi

View File

@@ -6,7 +6,7 @@ set -e
# On MacOS, ensure that GNU sed is installed and available as `gsed`.
SED_COMMAND=sed
if [[ "${OSTYPE}" == 'darwin'* ]]; then
if ! command -v gsed &> /dev/null; then
if ! command -v gsed &>/dev/null; then
echo "Error: gsed is not installed. Please install it using 'brew install gnu-sed'."
exit 1
fi

View File

@@ -6,14 +6,16 @@ on:
- develop
paths:
- ".github/workflows/build-nix-image.yml"
- "docker/nix.Dockerfile"
- ".github/workflows/reusable-build-docker-image.yml"
- "docker/**"
- "flake.nix"
- "flake.lock"
- "nix/**"
pull_request:
paths:
- ".github/workflows/build-nix-image.yml"
- "docker/nix.Dockerfile"
- ".github/workflows/reusable-build-docker-image.yml"
- "docker/**"
- "flake.nix"
- "flake.lock"
- "nix/**"
@@ -27,75 +29,81 @@ defaults:
run:
shell: bash
env:
UBUNTU_VERSION: "20.04"
RHEL_VERSION: "9"
DEBIAN_VERSION: "bookworm"
jobs:
build:
name: Build and push Nix image (${{ matrix.distro }})
name: Build ${{ matrix.distro.name }} (${{ matrix.target.platform }})
permissions:
contents: read
packages: write
strategy:
fail-fast: false
matrix:
# The base images are the oldest supported version of each distro
# that we want to build images for.
distro:
- name: nixos
base_image: nixos/nix:latest
- name: ubuntu
base_image: ubuntu:20.04
- name: rhel
base_image: registry.access.redhat.com/ubi9/ubi:latest
- name: debian
base_image: debian:bookworm
target:
- platform: linux/amd64
runner: ubuntu-latest
- platform: linux/arm64
runner: ubuntu-24.04-arm
uses: ./.github/workflows/reusable-build-docker-image.yml
with:
image_name: ghcr.io/xrplf/xrpld/nix-${{ matrix.distro.name }}
dockerfile: docker/nix.Dockerfile
base_image: ${{ matrix.distro.base_image }}
platform: ${{ matrix.target.platform }}
runner: ${{ matrix.target.runner }}
push: ${{ github.repository == 'XRPLF/rippled' && github.event_name == 'push' }}
merge:
name: Merge ${{ matrix.distro }} manifest
needs: build
if: ${{ github.repository == 'XRPLF/rippled' && github.event_name == 'push' }}
runs-on: ubuntu-latest
permissions:
contents: read
packages: write
strategy:
fail-fast: false
matrix:
include:
- distro: nixos
- distro: ubuntu
- distro: rhel
- distro: debian
distro: [nixos, ubuntu, rhel, debian]
env:
IMAGE_NAME: ghcr.io/xrplf/xrpld/nix-${{ matrix.distro }}
steps:
- name: Checkout repository
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- name: Determine base image
id: vars
run: |
case "${{ matrix.distro }}" in
nixos)
echo "base_image=nixos/nix:latest" >> $GITHUB_OUTPUT
;;
ubuntu)
echo "base_image=ubuntu:${UBUNTU_VERSION}" >> $GITHUB_OUTPUT
;;
rhel)
echo "base_image=registry.access.redhat.com/ubi${RHEL_VERSION}/ubi:latest" >> $GITHUB_OUTPUT
;;
debian)
echo "base_image=debian:${DEBIAN_VERSION}" >> $GITHUB_OUTPUT
;;
esac
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # v4.0.0
uses: docker/setup-buildx-action@d7f5e7f509e45cec5c76c4d5afdd7de93d0b3df5 # v4.1.0
- name: Docker metadata
id: meta
uses: docker/metadata-action@80c7e94dd9b9319bd5eb7a0e0fe9291e23a2a2e9 # v6.1.0
with:
images: ${{ env.IMAGE_NAME }}
tags: |
type=sha,prefix=sha-,format=short
type=raw,value=latest
- name: Login to GitHub Container Registry
if: github.event_name == 'push'
uses: docker/login-action@4907a6ddec9925e35a0a9e82d7399ccc52663121 # v4.1.0
uses: docker/login-action@650006c6eb7dba73a995cc03b0b2d7f5ca915bee # v4.2.0
with:
registry: ghcr.io
username: ${{ github.repository_owner }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Docker metadata
id: meta
uses: docker/metadata-action@030e881283bb7a6894de51c315a6bfe6a94e05cf # v6.0.0
with:
images: ghcr.io/xrplf/ci/nix-${{ matrix.distro }}
tags: |
type=sha,prefix=sha-,format=short
type=raw,value=latest
- name: Create multi-arch manifests
run: |
for tag in $(jq -cr '.tags[]' <<<"$DOCKER_METADATA_OUTPUT_JSON"); do
docker buildx imagetools create -t "$tag" "${tag}-amd64" "${tag}-arm64"
done
- name: Build and push
uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f # v7.1.0
with:
context: .
file: docker/nix.Dockerfile
platforms: linux/amd64
push: ${{ github.event_name == 'push' }}
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}
build-args: BASE_IMAGE=${{ steps.vars.outputs.base_image }}
- name: Inspect image
run: |
docker buildx imagetools inspect "${IMAGE_NAME}:${{ steps.meta.outputs.version }}"

View File

@@ -5,8 +5,17 @@ on:
types:
- checks_requested
pull_request:
types: [opened, edited, reopened, synchronize, ready_for_review]
branches: [develop]
types:
- opened
- edited
- reopened
- synchronize
- ready_for_review
branches:
- develop
- "release-*"
- "release/*"
- "staging/*"
jobs:
check_description:
@@ -20,11 +29,11 @@ jobs:
env:
PR_BODY: ${{ github.event.pull_request.body }}
if: ${{ github.event_name == 'pull_request' }}
run: printenv PR_BODY > pr_body.md
run: printenv PR_BODY >pr_body.md
- name: Check PR description differs from template
if: ${{ github.event_name == 'pull_request' }}
run: >
python .github/scripts/check-pr-description.py
--template-file .github/pull_request_template.md
--pr-body-file pr_body.md
run: |
python .github/scripts/check-pr-description.py \
--template-file .github/pull_request_template.md \
--pr-body-file pr_body.md

View File

@@ -5,10 +5,19 @@ on:
types:
- checks_requested
pull_request:
types: [opened, edited, reopened, synchronize, ready_for_review]
branches: [develop]
types:
- opened
- edited
- reopened
- synchronize
- ready_for_review
branches:
- develop
- "release-*"
- "release/*"
- "staging/*"
jobs:
check_title:
if: ${{ github.event.pull_request.draft != true }}
uses: XRPLF/actions/.github/workflows/check-pr-title.yml@291206777251b4d493641b5afbdf7c23009d2988
uses: XRPLF/actions/.github/workflows/check-pr-title.yml@cba1f0891650baf1a9c88624dc2d72573be2eb81

View File

@@ -98,7 +98,7 @@ jobs:
READY: ${{ contains(github.event.pull_request.labels.*.name, 'Ready to merge') }}
MERGE: ${{ github.event_name == 'merge_group' }}
run: |
echo "go=${{ (env.DRAFT != 'true' && env.READY == 'true') || env.FILES == 'true' || env.MERGE == 'true' }}" >> "${GITHUB_OUTPUT}"
echo "go=${{ (env.DRAFT != 'true' && env.READY == 'true') || env.FILES == 'true' || env.MERGE == 'true' }}" >>"${GITHUB_OUTPUT}"
cat "${GITHUB_OUTPUT}"
outputs:
go: ${{ steps.go.outputs.go == 'true' }}
@@ -168,9 +168,9 @@ jobs:
PR_URL: ${{ github.event.pull_request.html_url }}
run: |
gh api --method POST -H "Accept: application/vnd.github+json" -H "X-GitHub-Api-Version: 2022-11-28" \
/repos/xrplf/clio/dispatches -f "event_type=check_libxrpl" \
-F "client_payload[ref]=${{ needs.upload-recipe.outputs.recipe_ref }}" \
-F "client_payload[pr_url]=${PR_URL}"
/repos/xrplf/clio/dispatches -f "event_type=check_libxrpl" \
-F "client_payload[ref]=${{ needs.upload-recipe.outputs.recipe_ref }}" \
-F "client_payload[pr_url]=${PR_URL}"
passed:
if: failure() || cancelled()

View File

@@ -14,7 +14,7 @@ on:
jobs:
# Call the workflow in the XRPLF/actions repo that runs the pre-commit hooks.
run-hooks:
uses: XRPLF/actions/.github/workflows/pre-commit.yml@5e942d61bf32f7557a7c159cfac4712a687b3e3a
uses: XRPLF/actions/.github/workflows/pre-commit.yml@cba1f0891650baf1a9c88624dc2d72573be2eb81
with:
runs_on: ubuntu-latest
container: '{ "image": "ghcr.io/xrplf/ci/tools-rippled-pre-commit:sha-41ec7c1" }'

View File

@@ -0,0 +1,89 @@
# Build a single-platform Docker image. On push, the image is pushed to
# GHCR with arch-suffixed tags (e.g. `:latest-amd64`, `:sha-abc-amd64`)
# so the calling workflow can stitch per-arch builds into a multi-arch
# manifest without needing to pass digests around.
name: Reusable build Docker image (single platform)
on:
workflow_call:
inputs:
image_name:
description: "Full image name without tag (e.g. 'ghcr.io/xrplf/xrpld/nix-ubuntu')"
required: true
type: string
dockerfile:
description: "Path to the Dockerfile, relative to the repository root"
required: true
type: string
base_image:
description: "Value passed to the Dockerfile as the BASE_IMAGE build arg"
required: true
type: string
platform:
description: "Docker platform string, e.g. linux/amd64"
required: true
type: string
runner:
description: "GitHub Actions runner label to build on"
required: true
type: string
push:
description: "Whether to push the image to GHCR"
required: true
type: boolean
defaults:
run:
shell: bash
jobs:
build:
name: Build (${{ inputs.platform }})
runs-on: ${{ inputs.runner }}
permissions:
contents: read
packages: write
steps:
- name: Checkout repository
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- name: Determine arch
id: vars
env:
PLATFORM: ${{ inputs.platform }}
run: |
echo "arch=${PLATFORM##*/}" >>$GITHUB_OUTPUT
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@d7f5e7f509e45cec5c76c4d5afdd7de93d0b3df5 # v4.1.0
- name: Login to GitHub Container Registry
if: inputs.push
uses: docker/login-action@650006c6eb7dba73a995cc03b0b2d7f5ca915bee # v4.2.0
with:
registry: ghcr.io
username: ${{ github.repository_owner }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Docker metadata
id: meta
uses: docker/metadata-action@80c7e94dd9b9319bd5eb7a0e0fe9291e23a2a2e9 # v6.1.0
with:
images: ${{ inputs.image_name }}
tags: |
type=sha,prefix=sha-,format=short
type=raw,value=latest
flavor: |
suffix=-${{ steps.vars.outputs.arch }},onlatest=true
- name: Build and push
uses: docker/build-push-action@f9f3042f7e2789586610d6e8b85c8f03e5195baf # v7.2.0
with:
context: .
file: ${{ inputs.dockerfile }}
platforms: ${{ inputs.platform }}
push: ${{ inputs.push }}
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}
build-args: BASE_IMAGE=${{ inputs.base_image }}

View File

@@ -113,7 +113,7 @@ jobs:
- name: Set ccache log file
if: ${{ inputs.ccache_enabled && runner.debug == '1' }}
run: echo "CCACHE_LOGFILE=${{ runner.temp }}/ccache.log" >> "${GITHUB_ENV}"
run: echo "CCACHE_LOGFILE=${{ runner.temp }}/ccache.log" >>"${GITHUB_ENV}"
- name: Print build environment
uses: XRPLF/actions/print-build-env@59dec886e4afb05a1724443af08baccbc045b574
@@ -146,11 +146,11 @@ jobs:
CMAKE_ARGS: ${{ inputs.cmake_args }}
run: |
cmake \
-G '${{ runner.os == 'Windows' && 'Visual Studio 17 2022' || 'Ninja' }}' \
-DCMAKE_TOOLCHAIN_FILE:FILEPATH=build/generators/conan_toolchain.cmake \
-DCMAKE_BUILD_TYPE="${BUILD_TYPE}" \
${CMAKE_ARGS} \
..
-G '${{ runner.os == 'Windows' && 'Visual Studio 17 2022' || 'Ninja' }}' \
-DCMAKE_TOOLCHAIN_FILE:FILEPATH=build/generators/conan_toolchain.cmake \
-DCMAKE_BUILD_TYPE="${BUILD_TYPE}" \
${CMAKE_ARGS} \
..
- name: Check protocol autogen files are up-to-date
working-directory: ${{ env.BUILD_DIR }}
@@ -172,32 +172,32 @@ jobs:
cmake --build . --target code_gen
DIFF=$(git -C .. status --porcelain -- include/xrpl/protocol_autogen src/tests/libxrpl/protocol_autogen)
if [ -n "${DIFF}" ]; then
echo "::error::Generated protocol files are out of date"
git -C .. diff -- include/xrpl/protocol_autogen src/tests/libxrpl/protocol_autogen
echo "${MESSAGE}"
exit 1
echo "::error::Generated protocol files are out of date"
git -C .. diff -- include/xrpl/protocol_autogen src/tests/libxrpl/protocol_autogen
echo "${MESSAGE}"
exit 1
fi
- name: Build the binary
working-directory: ${{ env.BUILD_DIR }}
env:
BUILD_NPROC: ${{ runner.os == 'Linux' && '16' || steps.nproc.outputs.nproc }}
BUILD_NPROC: ${{ steps.nproc.outputs.nproc }}
BUILD_TYPE: ${{ inputs.build_type }}
CMAKE_TARGET: ${{ inputs.cmake_target }}
run: |
cmake \
--build . \
--config "${BUILD_TYPE}" \
--parallel "${BUILD_NPROC}" \
--target "${CMAKE_TARGET}"
--build . \
--config "${BUILD_TYPE}" \
--parallel "${BUILD_NPROC}" \
--target "${CMAKE_TARGET}"
- name: Show ccache statistics
if: ${{ inputs.ccache_enabled }}
run: |
ccache --show-stats -vv
if [ '${{ runner.debug }}' = '1' ]; then
cat "${CCACHE_LOGFILE}"
curl ${CCACHE_REMOTE_STORAGE%|*}/status || true
cat "${CCACHE_LOGFILE}"
curl ${CCACHE_REMOTE_STORAGE%|*}/status || true
fi
- name: Upload the binary (Linux)
@@ -214,7 +214,7 @@ jobs:
working-directory: ${{ env.BUILD_DIR }}
run: |
set -o pipefail
./xrpld --definitions | python3 -m json.tool > server_definitions.json
./xrpld --definitions | python3 -m json.tool >server_definitions.json
- name: Upload server definitions
if: ${{ github.event.repository.visibility == 'public' && inputs.config_name == 'debian-bookworm-gcc-13-amd64-release' }}
@@ -231,10 +231,10 @@ jobs:
run: |
ldd ./xrpld
if [ "$(ldd ./xrpld | grep -E '(libstdc\+\+|libgcc)' | wc -l)" -eq 0 ]; then
echo 'The binary is statically linked.'
echo 'The binary is statically linked.'
else
echo 'The binary is dynamically linked.'
exit 1
echo 'The binary is dynamically linked.'
exit 1
fi
- name: Verify presence of instrumentation (Linux)
@@ -250,12 +250,12 @@ jobs:
run: |
ASAN_OPTS="include=${GITHUB_WORKSPACE}/sanitizers/suppressions/runtime-asan-options.txt:suppressions=${GITHUB_WORKSPACE}/sanitizers/suppressions/asan.supp"
if [[ "${CONFIG_NAME}" == *gcc* ]]; then
ASAN_OPTS="${ASAN_OPTS}:alloc_dealloc_mismatch=0"
ASAN_OPTS="${ASAN_OPTS}:alloc_dealloc_mismatch=0"
fi
echo "ASAN_OPTIONS=${ASAN_OPTS}" >> ${GITHUB_ENV}
echo "TSAN_OPTIONS=include=${GITHUB_WORKSPACE}/sanitizers/suppressions/runtime-tsan-options.txt:suppressions=${GITHUB_WORKSPACE}/sanitizers/suppressions/tsan.supp" >> ${GITHUB_ENV}
echo "UBSAN_OPTIONS=include=${GITHUB_WORKSPACE}/sanitizers/suppressions/runtime-ubsan-options.txt:suppressions=${GITHUB_WORKSPACE}/sanitizers/suppressions/ubsan.supp" >> ${GITHUB_ENV}
echo "LSAN_OPTIONS=include=${GITHUB_WORKSPACE}/sanitizers/suppressions/runtime-lsan-options.txt:suppressions=${GITHUB_WORKSPACE}/sanitizers/suppressions/lsan.supp" >> ${GITHUB_ENV}
echo "ASAN_OPTIONS=${ASAN_OPTS}" >>${GITHUB_ENV}
echo "TSAN_OPTIONS=include=${GITHUB_WORKSPACE}/sanitizers/suppressions/runtime-tsan-options.txt:suppressions=${GITHUB_WORKSPACE}/sanitizers/suppressions/tsan.supp" >>${GITHUB_ENV}
echo "UBSAN_OPTIONS=include=${GITHUB_WORKSPACE}/sanitizers/suppressions/runtime-ubsan-options.txt:suppressions=${GITHUB_WORKSPACE}/sanitizers/suppressions/ubsan.supp" >>${GITHUB_ENV}
echo "LSAN_OPTIONS=include=${GITHUB_WORKSPACE}/sanitizers/suppressions/runtime-lsan-options.txt:suppressions=${GITHUB_WORKSPACE}/sanitizers/suppressions/lsan.supp" >>${GITHUB_ENV}
- name: Run the separate tests
if: ${{ !inputs.build_only }}
@@ -266,9 +266,9 @@ jobs:
PARALLELISM: ${{ runner.os == 'Windows' && '1' || steps.nproc.outputs.nproc }}
run: |
ctest \
--output-on-failure \
-C "${BUILD_TYPE}" \
-j "${PARALLELISM}"
--output-on-failure \
-C "${BUILD_TYPE}" \
-j "${PARALLELISM}"
- name: Run the embedded tests
if: ${{ !inputs.build_only }}
@@ -278,7 +278,7 @@ jobs:
run: |
set -o pipefail
# Coverage builds are slower due to instrumentation; use fewer parallel jobs to avoid flakiness
[ "$COVERAGE_ENABLED" = "true" ] && BUILD_NPROC=$(( BUILD_NPROC - 2 ))
[ "$COVERAGE_ENABLED" = "true" ] && BUILD_NPROC=$((BUILD_NPROC - 2))
./xrpld --unittest --unittest-jobs "${BUILD_NPROC}" 2>&1 | tee unittest.log
- name: Show test failure summary
@@ -287,19 +287,19 @@ jobs:
WORKING_DIR: ${{ runner.os == 'Windows' && format('{0}\{1}', env.BUILD_DIR, inputs.build_type) || env.BUILD_DIR }}
run: |
if [ ! -d "${WORKING_DIR}" ]; then
echo "Working directory '${WORKING_DIR}' does not exist."
exit 0
echo "Working directory '${WORKING_DIR}' does not exist."
exit 0
fi
cd "${WORKING_DIR}"
if [ ! -f unittest.log ]; then
echo "unittest.log not found; embedded tests may not have run."
exit 0
echo "unittest.log not found; embedded tests may not have run."
exit 0
fi
if ! grep -E "failed" unittest.log; then
echo "Log present but no failure lines found in unittest.log."
echo "Log present but no failure lines found in unittest.log."
fi
- name: Debug failure (Linux)
if: ${{ failure() && runner.os == 'Linux' && !inputs.build_only }}
@@ -317,14 +317,14 @@ jobs:
BUILD_TYPE: ${{ inputs.build_type }}
run: |
cmake \
--build . \
--config "${BUILD_TYPE}" \
--parallel "${BUILD_NPROC}" \
--target coverage
--build . \
--config "${BUILD_TYPE}" \
--parallel "${BUILD_NPROC}" \
--target coverage
- name: Upload coverage report
if: ${{ github.repository == 'XRPLF/rippled' && !inputs.build_only && env.COVERAGE_ENABLED == 'true' }}
uses: codecov/codecov-action@57e3a136b779b570ffcdbf80b3bdc90e7fab3de2 # v6.0.0
uses: codecov/codecov-action@e79a6962e0d4c0c17b229090214935d2e33f8354 # v6.0.1
with:
disable_search: true
disable_telem: true

View File

@@ -38,9 +38,9 @@ jobs:
run: |
DIFF=$(git status --porcelain)
if [ -n "${DIFF}" ]; then
# Print the differences to give the contributor a hint about what to
# expect when running levelization on their own machine.
git diff
echo "${MESSAGE}"
exit 1
# Print the differences to give the contributor a hint about what to
# expect when running levelization on their own machine.
git diff
echo "${MESSAGE}"
exit 1
fi

View File

@@ -48,9 +48,9 @@ jobs:
run: |
DIFF=$(git status --porcelain)
if [ -n "${DIFF}" ]; then
# Print the differences to give the contributor a hint about what to
# expect when running the renaming scripts on their own machine.
git diff
echo "${MESSAGE}"
exit 1
# Print the differences to give the contributor a hint about what to
# expect when running the renaming scripts on their own machine.
git diff
echo "${MESSAGE}"
exit 1
fi

View File

@@ -70,13 +70,13 @@ jobs:
working-directory: ${{ env.BUILD_DIR }}
run: |
cmake \
-G 'Ninja' \
-DCMAKE_TOOLCHAIN_FILE:FILEPATH=build/generators/conan_toolchain.cmake \
-DCMAKE_BUILD_TYPE="${BUILD_TYPE}" \
-Dtests=ON \
-Dwerr=ON \
-Dxrpld=ON \
..
-G 'Ninja' \
-DCMAKE_TOOLCHAIN_FILE:FILEPATH=build/generators/conan_toolchain.cmake \
-DCMAKE_BUILD_TYPE="${BUILD_TYPE}" \
-Dtests=ON \
-Dwerr=ON \
-Dxrpld=ON \
..
# clang-tidy needs headers generated from proto files
- name: Build libxrpl.libpb
@@ -133,7 +133,7 @@ jobs:
- name: Write issue header
if: ${{ steps.run_clang_tidy.outcome != 'success' }}
run: |
cat > "${ISSUE_FILE}" <<EOF
cat >"${ISSUE_FILE}" <<EOF
## Clang-tidy Check Failed
### Clang-tidy Output:
@@ -144,30 +144,30 @@ jobs:
if: ${{ steps.run_clang_tidy.outcome != 'success' }}
run: |
if [ -f "${OUTPUT_FILE}" ]; then
# Extract lines containing 'error:', 'warning:', or 'note:'
grep -E '(error:|warning:|note:)' "${OUTPUT_FILE}" > filtered-output.txt || true
# Extract lines containing 'error:', 'warning:', or 'note:'
grep -E '(error:|warning:|note:)' "${OUTPUT_FILE}" >filtered-output.txt || true
# If filtered output is empty, use original (might be a different error format)
if [ ! -s filtered-output.txt ]; then
cp "${OUTPUT_FILE}" filtered-output.txt
fi
# If filtered output is empty, use original (might be a different error format)
if [ ! -s filtered-output.txt ]; then
cp "${OUTPUT_FILE}" filtered-output.txt
fi
# Truncate if too large
head -c 60000 filtered-output.txt >> "${ISSUE_FILE}"
if [ "$(wc -c < filtered-output.txt)" -gt 60000 ]; then
echo "" >> "${ISSUE_FILE}"
echo "... (output truncated, see artifacts for full output)" >> "${ISSUE_FILE}"
fi
# Truncate if too large
head -c 60000 filtered-output.txt >>"${ISSUE_FILE}"
if [ "$(wc -c <filtered-output.txt)" -gt 60000 ]; then
echo "" >>"${ISSUE_FILE}"
echo "... (output truncated, see artifacts for full output)" >>"${ISSUE_FILE}"
fi
rm filtered-output.txt
rm filtered-output.txt
else
echo "No output file found" >> "${ISSUE_FILE}"
echo "No output file found" >>"${ISSUE_FILE}"
fi
- name: Append issue footer
if: ${{ steps.run_clang_tidy.outcome != 'success' }}
run: |
cat >> "${ISSUE_FILE}" <<EOF
cat >>"${ISSUE_FILE}" <<EOF
\`\`\`
---
@@ -176,7 +176,7 @@ jobs:
- name: Create issue
if: ${{ steps.run_clang_tidy.outcome != 'success' && inputs.create_issue_on_failure }}
uses: XRPLF/actions/create-issue@36d450d12d301e8410c1b7936e5de70c291cbe36
uses: XRPLF/actions/create-issue@2b8bc36af85b88bca0dd7bfac2e2dc05f94ad712
with:
title: "Clang-tidy check failed"
body_file: ${{ env.ISSUE_FILE }}

View File

@@ -39,7 +39,7 @@ jobs:
id: generate
working-directory: .github/scripts/strategy-matrix
run: |
./generate.py --packaging --config=linux.json >> "${GITHUB_OUTPUT}"
./generate.py --packaging --config=linux.json >>"${GITHUB_OUTPUT}"
generate-version:
runs-on: ubuntu-latest
@@ -58,6 +58,7 @@ jobs:
package:
needs: [generate-matrix, generate-version]
if: ${{ github.event.repository.visibility == 'public' }}
strategy:
fail-fast: false
matrix: ${{ fromJson(needs.generate-matrix.outputs.matrix) }}
@@ -88,8 +89,7 @@ jobs:
run: ./package/build_pkg.sh
- name: Upload package artifact
uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
if: ${{ github.event.repository.visibility == 'public' }}
uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
with:
name: ${{ matrix.artifact_name }}-pkg-${{ needs.generate-version.outputs.version }}
path: |

View File

@@ -42,4 +42,4 @@ jobs:
env:
GENERATE_CONFIG: ${{ inputs.os != '' && format('--config={0}.json', inputs.os) || '' }}
GENERATE_OPTION: ${{ inputs.strategy_matrix == 'all' && '--all' || '' }}
run: ./generate.py ${GENERATE_OPTION} ${GENERATE_CONFIG} >> "${GITHUB_OUTPUT}"
run: ./generate.py ${GENERATE_OPTION} ${GENERATE_CONFIG} >>"${GITHUB_OUTPUT}"

View File

@@ -37,37 +37,50 @@ repos:
exclude: ^include/xrpl/protocol_autogen/(transactions|ledger_entries)/
- repo: https://github.com/pre-commit/mirrors-clang-format
rev: cd481d7b0bfb5c7b3090c21846317f9a8262e891 # frozen: v22.1.0
rev: dd18dad857d6133e90bbe478f4f2f22ec0030269 # frozen: v22.1.5
hooks:
- id: clang-format
args: [--style=file]
"types_or": [c++, c, proto]
exclude: ^include/xrpl/protocol_autogen/(transactions|ledger_entries)/
- repo: https://github.com/BlankSpruce/gersemi
rev: 0.26.0
- repo: https://github.com/BlankSpruce/gersemi-pre-commit
rev: faadd6a9d852369ca94f4d15b2404c967ba8cb01 # frozen: 0.27.6
hooks:
- id: gersemi
- repo: https://github.com/rbubley/mirrors-prettier
rev: c2bc67fe8f8f549cc489e00ba8b45aa18ee713b1 # frozen: v3.8.1
rev: 515f543f5718ebfd6ce22e16708bb32c68ff96e1 # frozen: v3.8.3
hooks:
- id: prettier
args: [--end-of-line=auto]
- repo: https://github.com/psf/black-pre-commit-mirror
rev: ea488cebbfd88a5f50b8bd95d5c829d0bb76feb8 # frozen: 26.1.0
rev: 4160603246a6b365d4a2af661c6d71b0a0f50478 # frozen: 26.5.1
hooks:
- id: black
- repo: https://github.com/openstack/bashate
rev: 5798d24d571676fc407e81df574c1ef57b520f23 # frozen: 2.1.1
- repo: https://github.com/scop/pre-commit-shfmt
rev: 05c1426671b9237fb5e1444dd63aa5731bec0dfb # frozen: v3.13.1-1
hooks:
- id: bashate
args: ["--ignore=E006"]
- id: shfmt
args: [--write, --indent=4, --case-indent=true]
- repo: local
hooks:
- id: format-inline-bash-workflows
name: "format `run:` blocks in workflows/actions"
entry: ./.github/scripts/format-inline-bash.py
language: python
files: ^\.github/(workflows|actions)/.*\.ya?ml$
- id: format-inline-bash-markdown
name: "format ```bash blocks in markdown"
entry: ./.github/scripts/format-inline-bash.py
language: python
files: \.md$
- repo: https://github.com/streetsidesoftware/cspell-cli
rev: a42085ade523f591dca134379a595e7859986445 # frozen: v9.7.0
rev: 4643f154907327ee0a2c7038f0296e0dd77d9776 # frozen: v10.0.0
hooks:
- id: cspell # Spell check changed files
exclude: |

View File

@@ -151,8 +151,8 @@ git init
git remote add origin git@github.com:XRPLF/conan-center-index.git
git sparse-checkout init
for recipe in "${recipes[@]}"; do
echo "Checking out recipe '${recipe}'..."
git sparse-checkout add recipes/${recipe}
echo "Checking out recipe '${recipe}'..."
git sparse-checkout add recipes/${recipe}
done
git fetch origin master
git checkout master
@@ -180,7 +180,7 @@ the new recipe will be automatically pulled from the official Conan Center.
If you see an error similar to the following after running `conan profile show`:
```bash
```text
ERROR: Invalid setting '17' is not a valid 'settings.compiler.version' value.
Possible values are ['5.0', '5.1', '6.0', '6.1', '7.0', '7.3', '8.0', '8.1',
'9.0', '9.1', '10.0', '11.0', '12.0', '13', '13.0', '13.1', '14', '14.0', '15',
@@ -427,16 +427,19 @@ install ccache --version 4.11.3 --allow-downgrade`.
Single-config generators:
```
cmake --build .
cmake --build . --parallel N
```
Multi-config generators:
```
cmake --build . --config Release
cmake --build . --config Debug
cmake --build . --config Release --parallel N
cmake --build . --config Debug --parallel N
```
Replace the `--parallel` parameter N with the desired number of parallel jobs. A common starting point is half of the number of available CPU
cores.
5. Test xrpld.
Single-config generators:

View File

@@ -0,0 +1,567 @@
# Distributed Tracing Fundamentals
> **Parent Document**: [OpenTelemetryPlan.md](./OpenTelemetryPlan.md)
> **Next**: [Architecture Analysis](./01-architecture-analysis.md)
---
## What is Distributed Tracing?
Distributed tracing is a method for tracking data objects as they flow through distributed systems. In a network like XRP Ledger, a single transaction touches multiple independent nodes—each with no shared memory or logging. Distributed tracing connects these dots.
**Without tracing:** You see isolated logs on each node with no way to correlate them.
**With tracing:** You see the complete journey of a transaction or an event across all nodes it touched.
---
## Actors and Actions at a Glance
### Actors
| Who (Plain English) | Technical Term |
| ---------------------------------------------- | --------------- |
| A single unit of work being tracked | Span |
| The complete journey of a request | Trace |
| Data that links spans across services | Trace Context |
| Code that creates spans and propagates context | Instrumentation |
| Service that receives and processes traces | Collector |
| Storage and visualization system | Backend (Tempo) |
| Decision logic for which traces to keep | Sampler |
### Actions
| What Happens (Plain English) | Technical Term |
| --------------------------------------- | ----------------------- |
| Start tracking a new operation | Create a Span |
| Connect a child operation to its parent | Set `parent_span_id` |
| Group all related operations together | Share a `trace_id` |
| Pass tracking data between services | Context Propagation |
| Decide whether to record a trace | Sampling (Head or Tail) |
| Send completed traces to storage | Export (OTLP) |
---
## Core Concepts
### 1. Trace
A **trace** represents the entire journey of a request through the system. It has a unique `trace_id` that stays constant across all nodes.
```
Trace ID: abc123
├── Node A: received transaction
├── Node B: relayed transaction
├── Node C: included in consensus
└── Node D: applied to ledger
```
### 2. Span
A **span** represents a single unit of work within a trace. Each span has:
| Attribute | Description | Example |
| ---------------- | -------------------------------- | -------------------------- |
| `trace_id` | Identifies the trace | `event123` |
| `span_id` | Unique identifier | `span456` |
| `parent_span_id` | Parent span (if any) | `p_span123` |
| `name` | Operation name | `rpc.submit` |
| `start_time` | When work began (local time) | `2024-01-15T10:30:00Z` |
| `end_time` | When work completed (local time) | `2024-01-15T10:30:00.050Z` |
| `attributes` | Key-value metadata | `tx.hash=ABC...` |
| `status` | OK, ERROR MSG | `OK` |
### 3. Trace Context
**Trace context** is the data that propagates between services to link spans together. It contains:
- `trace_id` - The trace this span belongs to
- `span_id` - The current span (becomes parent for child spans)
- `trace_flags` - Sampling decisions
---
## How Spans Form a Trace
Spans have parent-child relationships forming a tree structure:
```mermaid
flowchart TB
subgraph trace["Trace: abc123"]
A["tx.submit<br/>span_id: 001<br/>50ms"] --> B["tx.validate<br/>span_id: 002<br/>5ms"]
A --> C["tx.relay<br/>span_id: 003<br/>10ms"]
A --> D["tx.apply<br/>span_id: 004<br/>30ms"]
D --> E["ledger.update<br/>span_id: 005<br/>20ms"]
end
style A fill:#0d47a1,stroke:#082f6a,color:#ffffff
style B fill:#1b5e20,stroke:#0d3d14,color:#ffffff
style C fill:#1b5e20,stroke:#0d3d14,color:#ffffff
style D fill:#1b5e20,stroke:#0d3d14,color:#ffffff
style E fill:#bf360c,stroke:#8c2809,color:#ffffff
```
**Reading the diagram:**
- **tx.submit (blue, root)**: The top-level span representing the entire transaction submission; all other spans are its descendants.
- **tx.validate, tx.relay, tx.apply (green)**: Direct children of tx.submit, representing the three main stages -- validation, relay to peers, and application to the ledger.
- **ledger.update (red)**: A grandchild span nested under tx.apply, representing the actual ledger state mutation triggered by applying the transaction.
- **Arrows (parent to child)**: Each arrow indicates a parent-child span relationship where the parent's completion depends on the child finishing.
The same trace visualized as a **timeline (Gantt chart)**:
```
Time → 0ms 10ms 20ms 30ms 40ms 50ms
├───────────────────────────────────────────┤
tx.submit│▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓│
├─────┤
tx.valid │▓▓▓▓▓│
│ ├──────────┤
tx.relay │ │▓▓▓▓▓▓▓▓▓▓│
│ ├────────────────────────────┤
tx.apply │ │▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓│
│ ├──────────────────┤
ledger │ │▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓│
```
---
## Span Relationships
Spans don't always form simple parent-child trees. Distributed tracing defines several relationship types to capture different causal patterns:
### 1. Parent-Child (ChildOf)
The default relationship. The parent span **depends on** or **contains** the child span. The child runs within the scope of the parent.
```
tx.submit (parent)
├── tx.validate (child) ← parent waits for this
├── tx.relay (child) ← parent waits for this
└── tx.apply (child) ← parent waits for this
```
**When to use:** Synchronous calls, nested operations, any case where the parent's completion depends on the child.
### 2. Follows-From
A causal relationship where the first span **triggers** the second, but does **not wait** for it. The originator fires and moves on.
```
Time →
tx.receive [=======]
↓ triggers (follows-from)
tx.relay [===========] ← runs independently
```
**When to use:** Asynchronous jobs, queued work, fire-and-forget patterns. For example, a node receives a transaction and queues it for relay — the relay span _follows from_ the receive span but the receiver doesn't wait for relaying to complete.
> **OpenTracing** defined `FollowsFrom` as a first-class reference type alongside `ChildOf`.
> **OpenTelemetry** represents this using **Span Links** with descriptive attributes instead (see below).
### 3. Span Links (Cross-Trace and Non-Hierarchical)
Links connect spans that are **causally related but not in a parent-child hierarchy**. Unlike parent-child, links can cross trace boundaries.
```
Trace A Trace B
────── ──────
batch.schedule batch.execute
├─ item.enqueue (span X) ┌──► process.item
├─ item.enqueue (span Y) ───┤ (links to X, Y, Z)
├─ item.enqueue (span Z) └──►
```
**Use cases:**
| Pattern | Description |
| -------------------- | --------------------------------------------------------------------------- |
| **Batch processing** | A batch span links back to all individual spans that contributed to it |
| **Fan-in** | An aggregation span links to the multiple producer spans it merges |
| **Fan-out** | Multiple downstream spans link back to the single span that triggered them |
| **Async handoff** | A deferred job links back to the request that queued it (follows-from) |
| **Cross-trace** | Correlating spans across independent traces (e.g., retries, related events) |
**Link structure:** Each link carries the target span's context plus optional attributes:
```
Link {
trace_id: <target trace>
span_id: <target span>
attributes: { "link.description": "triggered by batch scheduler" }
}
```
### Relationship Summary
```mermaid
flowchart LR
subgraph parent_child["Parent-Child"]
direction TB
P["Parent"] --> C["Child"]
end
subgraph follows_from["Follows-From"]
direction TB
A["Span A"] -.->|triggers| B["Span B"]
end
subgraph links["Span Links"]
direction TB
X["Span X\n(Trace 1)"] -.-|link| Y["Span Y\n(Trace 2)"]
end
parent_child ~~~ follows_from ~~~ links
style P fill:#0d47a1,stroke:#082f6a,color:#ffffff
style C fill:#1b5e20,stroke:#0d3d14,color:#ffffff
style A fill:#0d47a1,stroke:#082f6a,color:#ffffff
style B fill:#bf360c,stroke:#8c2809,color:#ffffff
style X fill:#4a148c,stroke:#38006b,color:#ffffff
style Y fill:#4a148c,stroke:#38006b,color:#ffffff
```
| Relationship | Same Trace? | Dependency? | OTel Mechanism |
| ---------------- | ----------- | -------------------------- | ----------------- |
| **Parent-Child** | Yes | Parent depends on child | `parent_span_id` |
| **Follows-From** | Usually | Causal but no dependency | Link + attributes |
| **Span Link** | Either | Correlation, no dependency | Link + attributes |
---
## Trace ID Generation
A `trace_id` is a 128-bit (16-byte) identifier that groups all spans belonging to one logical operation. How it's generated determines how easily you can find and correlate traces later.
### General Approaches
#### 1. Random (W3C Default)
Generate a random 128-bit ID when a trace starts. Standard approach for most services.
```
trace_id = random_128_bits()
```
| Pros | Cons |
| --------------------------- | --------------------------------------------- |
| Simple, standard | No natural correlation to domain events |
| Guaranteed unique per trace | If propagation is lost, trace is broken |
| Works with all OTel tooling | "Find trace for TX abc" requires index lookup |
#### 2. Deterministic (Derived from Domain Data)
Compute the trace_id from a hash of a natural identifier. Every node independently derives the **same** trace_id for the same event.
```
trace_id = SHA-256(domain_identifier)[0:16] // truncate to 128 bits
```
| Pros | Cons |
| --------------------------------------------------- | ---------------------------------------------------------- |
| Propagation-resilient — same ID computed everywhere | Same event processed twice (retry) shares trace_id |
| Natural search — domain ID maps directly to trace | Non-standard (tooling assumes random) |
| No coordination needed between nodes | 256→128 bit truncation (collision risk negligible at ~2⁶⁴) |
#### 3. Hybrid (Deterministic Prefix + Random Suffix)
First 8 bytes derived from domain data, last 8 bytes random.
```
trace_id = SHA-256(domain_identifier)[0:8] || random_64_bits()
```
| Pros | Cons |
| ------------------------------------------- | ---------------------------------------- |
| Prefix search: "find all traces for TX abc" | Must propagate to maintain full trace_id |
| Unique per processing instance | More complex generation logic |
| Retries get distinct trace_ids | Partial correlation only (prefix match) |
### XRPL Workflow Analysis
XRPL has a unique advantage: its core workflows produce **globally unique 256-bit hashes** that are known on every node. This makes deterministic trace_id generation practical in ways most systems can't achieve.
#### Natural Identifiers by Workflow
| Workflow | Natural Identifier | Size | Known at Start? | Same on All Nodes? |
| ------------------- | --------------------------------- | ---------- | ----------------------------- | -------------------------------- |
| **Transaction** | Transaction hash (`tid_`) | 256-bit | Yes — computed before signing | Yes — hash of canonical tx data |
| **Consensus round** | Previous ledger hash + ledger seq | 256+32 bit | Yes — known when round opens | Yes — all validators agree |
| **Validation** | Ledger hash being validated | 256-bit | Yes — from consensus result | Yes — same closed ledger |
| **Ledger catch-up** | Target ledger hash | 256-bit | Yes — we know what to fetch | Yes — identifies ledger globally |
#### Where These Identifiers Live in Code
```
Transaction: STTx::getTransactionID() → uint256 tid_
TMTransaction::rawTransaction → recompute hash from bytes
Consensus: ConsensusProposal::prevLedger_ → uint256 (previous ledger hash)
ConsensusProposal::position_ → uint256 (TxSet hash)
LedgerHeader::seq → uint32_t (ledger sequence)
Validation: STValidation::getLedgerHash() → uint256
STValidation::getNodeID() → NodeID (160-bit)
Ledger fetch: InboundLedger constructor → uint256 hash, uint32_t seq
TMGetLedger::ledgerHash → bytes (uint256)
```
### Recommended Strategy: Workflow-Scoped Deterministic
Each workflow type derives its trace_id from its natural domain identifier:
```
Transaction trace: trace_id = SHA-256("tx" || tx_hash)[0:16]
Consensus trace: trace_id = SHA-256("cons" || prev_ledger_hash || ledger_seq)[0:16]
Ledger catch-up: trace_id = SHA-256("fetch" || target_ledger_hash)[0:16]
```
The string prefix (`"tx"`, `"cons"`, `"fetch"`) prevents collisions between workflows that might share underlying hashes.
**Why this works for XRPL:**
1. **Propagation-resilient** — Even if a P2P message drops trace context, every node independently computes the same trace_id from the same tx_hash or ledger_hash. Spans still correlate.
2. **Zero-cost search** — "Show me the trace for transaction ABC" becomes a direct lookup: compute `SHA-256("tx" || ABC)[0:16]` and query. No secondary index needed.
3. **Cross-workflow linking via Span Links** — A consensus trace links to individual transaction traces. A validation span links to the consensus trace. This connects the full picture without forcing everything into one giant trace.
### Cross-Workflow Correlation
Each workflow gets its own trace. Span Links tie them together:
```mermaid
flowchart TB
subgraph tx_trace["Transaction Trace"]
direction LR
Tn["trace_id = f(tx_hash)"]:::note --> T1["tx.receive"] --> T2["tx.validate"] --> T3["tx.relay"]
end
subgraph cons_trace["Consensus Trace"]
direction LR
Cn["trace_id = f(prev_ledger, seq)"]:::note --> C1["cons.open"] --> C2["cons.propose"] --> C3["cons.accept"]
end
subgraph val_trace["Validation"]
direction LR
Vn["spans within consensus trace"]:::note --> V1["val.create"] --> V2["val.broadcast"]
end
subgraph fetch_trace["Catch-Up Trace"]
direction LR
Fn["trace_id = f(ledger_hash)"]:::note --> F1["fetch.request"] --> F2["fetch.receive"] --> F3["fetch.apply"]
end
C1 -.-|"span link\n(tx traces)"| T3
C3 --> V1
F1 -.-|"span link\n(target ledger)"| C3
classDef note fill:none,stroke:#888,stroke-dasharray:5 5,color:#333,font-style:italic
style T1 fill:#0d47a1,stroke:#082f6a,color:#ffffff
style T2 fill:#0d47a1,stroke:#082f6a,color:#ffffff
style T3 fill:#0d47a1,stroke:#082f6a,color:#ffffff
style C1 fill:#1b5e20,stroke:#0d3d14,color:#ffffff
style C2 fill:#1b5e20,stroke:#0d3d14,color:#ffffff
style C3 fill:#1b5e20,stroke:#0d3d14,color:#ffffff
style V1 fill:#bf360c,stroke:#8c2809,color:#ffffff
style V2 fill:#bf360c,stroke:#8c2809,color:#ffffff
style F1 fill:#4a148c,stroke:#38006b,color:#ffffff
style F2 fill:#4a148c,stroke:#38006b,color:#ffffff
style F3 fill:#4a148c,stroke:#38006b,color:#ffffff
```
**Reading the diagram:**
- **Transaction Trace (blue)**: An independent trace whose `trace_id` is deterministically derived from the transaction hash. Contains receive, validate, and relay spans.
- **Consensus Trace (green)**: An independent trace whose `trace_id` is derived from the previous ledger hash and sequence number. Covers the open, propose, and accept phases.
- **Validation (red)**: Validation spans live within the consensus trace (not a separate trace). They are created after the accept phase completes.
- **Catch-Up Trace (purple)**: An independent trace for ledger acquisition, derived from the target ledger hash. Used when a node is behind and fetching missing ledgers.
- **Dotted arrows (span links)**: Cross-trace correlations. Consensus links to transaction traces it included; catch-up links to the consensus trace that produced the target ledger.
- **Solid arrow (C3 to V1)**: A parent-child relationship -- validation spans are direct children of the consensus accept span within the same trace.
**How a query flows:**
```
"Why was TX abc slow?"
1. Compute trace_id = SHA-256("tx" || abc)[0:16]
2. Find transaction trace → see it was included in consensus round N
3. Follow span link → consensus trace for round N
4. See which phase was slow (propose? accept?)
5. If a node was catching up, follow link → catch-up trace
```
### Trade-offs to Consider
| Concern | Mitigation |
| ----------------------------- | ----------------------------------------------------------------------------------------------------------------------------- |
| **Retries get same trace_id** | Add `attempt` attribute to root span; spans have unique span_ids and timestamps |
| **256→128 bit truncation** | Birthday-bound collision at ~2⁶⁴ operations — negligible for XRPL's throughput |
| **Non-standard generation** | OTel spec allows any 16-byte non-zero value; tooling works on the hex string |
| **Hash computation cost** | SHA-256 is ~0.3μs per call; XRPL already computes these hashes for other purposes |
| **Late-binding identifiers** | Ledger hash isn't known until after consensus — validation spans use ledger_seq as fallback, then link to the consensus trace |
---
## Distributed Traces Across Nodes
In distributed systems like xrpld, traces span **multiple independent nodes**. The trace context must be propagated in network messages:
```mermaid
sequenceDiagram
participant Client
participant NodeA as Node A
participant NodeB as Node B
participant NodeC as Node C
Client->>NodeA: Submit TX<br/>(no trace context)
Note over NodeA: Creates new trace<br/>trace_id: abc123<br/>span: tx.receive
NodeA->>NodeB: Relay TX<br/>(trace_id: abc123, parent: 001)
Note over NodeB: Creates child span<br/>span: tx.relay<br/>parent_span_id: 001
NodeA->>NodeC: Relay TX<br/>(trace_id: abc123, parent: 001)
Note over NodeC: Creates child span<br/>span: tx.relay<br/>parent_span_id: 001
Note over NodeA,NodeC: All spans share trace_id: abc123<br/>enabling correlation across nodes
```
**Reading the diagram:**
- **Client**: The external entity that submits a transaction. It does not carry trace context -- the trace originates at the first node.
- **Node A**: The entry point that creates a new trace (trace_id: abc123) and the root span `tx.receive`. It relays the transaction to peers with trace context attached.
- **Node B and Node C**: Peer nodes that receive the relayed transaction along with the propagated trace context. Each creates a child span under Node A's span, preserving the same `trace_id`.
- **Arrows with trace context**: The relay messages carry `trace_id` and `parent_span_id`, allowing each downstream node to link its spans back to the originating span on Node A.
---
## Context Propagation
For traces to work across nodes, **trace context must be propagated** in messages.
### What's in the Context (~26 bytes)
| Field | Size | Description |
| ------------- | -------- | ------------------------------------------------------- |
| `trace_id` | 16 bytes | Identifies the entire trace (constant across all nodes) |
| `span_id` | 8 bytes | The sender's current span (becomes parent on receiver) |
| `trace_flags` | 1 byte | Sampling decision (bit 0 = sampled; bits 1-7 reserved) |
| `trace_state` | variable | Optional vendor-specific data (typically omitted) |
### How span_id Changes at Each Hop
Only **one** `span_id` travels in the context - the sender's current span. Each node:
1. Extracts the received `span_id` and uses it as the `parent_span_id`
2. Creates a **new** `span_id` for its own span
3. Sends its own `span_id` as the parent when forwarding
```
Node A Node B Node C
────── ────── ──────
Span AAA Span BBB Span CCC
│ │ │
▼ ▼ ▼
Context out: Context out: Context out:
├─ trace_id: abc123 ├─ trace_id: abc123 ├─ trace_id: abc123
├─ span_id: AAA ──────────► ├─ span_id: BBB ──────────► ├─ span_id: CCC ──────►
└─ flags: 01 └─ flags: 01 └─ flags: 01
│ │
parent = AAA parent = BBB
```
The `trace_id` stays constant, but `span_id` **changes at every hop** to maintain the parent-child chain.
### Propagation Formats
There are two patterns:
### HTTP/RPC Headers (W3C Trace Context)
```
traceparent: 00-4bf92f3577b34da6a3ce929d0e0e4736-00f067aa0ba902b7-01
│ │ │ │
│ │ │ └── Flags (sampled)
│ │ └── Parent span ID (16 hex)
│ └── Trace ID (32 hex)
└── Version
```
### Protocol Buffers (xrpld P2P messages)
```protobuf
message TMTransaction {
bytes rawTransaction = 1;
// ... existing fields ...
// Trace context extension
bytes trace_parent = 100; // W3C traceparent
bytes trace_state = 101; // W3C tracestate
}
```
---
## Sampling
Not every trace needs to be recorded. **Sampling** reduces overhead:
### Head Sampling (at trace start)
```
Request arrives → Random 10% chance → Record or skip entire trace
```
- ✅ Low overhead
- ❌ May miss interesting traces
### Tail Sampling (after trace completes)
```
Trace completes → Collector evaluates:
- Error? → KEEP
- Slow? → KEEP
- Normal? → Sample 10%
```
- ✅ Never loses important traces
- ❌ Higher memory usage at collector
---
## Key Benefits for xrpld
| Challenge | How Tracing Helps |
| ---------------------------------- | ---------------------------------------- |
| "Where is my transaction?" | Follow trace across all nodes it touched |
| "Why was consensus slow?" | See timing breakdown of each phase |
| "Which node is the bottleneck?" | Compare span durations across nodes |
| "What happened during the outage?" | Correlate errors across the network |
---
## Glossary
| Term | Definition |
| -------------------- | ------------------------------------------------------------------- |
| **Trace** | Complete journey of a request, identified by `trace_id` |
| **Span** | Single operation within a trace |
| **Parent-Child** | Span relationship where the parent depends on the child |
| **Follows-From** | Causal relationship where originator doesn't wait for the result |
| **Span Link** | Non-hierarchical connection between spans, possibly across traces |
| **Deterministic ID** | Trace ID derived from domain data (e.g., tx_hash) instead of random |
| **Context** | Data propagated between services (`trace_id`, `span_id`, flags) |
| **Instrumentation** | Code that creates spans and propagates context |
| **Collector** | Service that receives, processes, and exports traces |
| **Backend** | Storage/visualization system (Tempo) |
| **Head Sampling** | Sampling decision at trace start |
| **Tail Sampling** | Sampling decision after trace completes |
---
_Next: [Architecture Analysis](./01-architecture-analysis.md)_ | _Back to: [Overview](./OpenTelemetryPlan.md)_

View File

@@ -0,0 +1,467 @@
# Architecture Analysis
> **Parent Document**: [OpenTelemetryPlan.md](./OpenTelemetryPlan.md)
> **Related**: [Design Decisions](./02-design-decisions.md) | [Implementation Strategy](./03-implementation-strategy.md)
---
## 1.1 Current xrpld Architecture Overview
> **WS** = WebSocket | **UNL** = Unique Node List | **TxQ** = Transaction Queue | **StatsD** = Statistics Daemon
The xrpld node software consists of several interconnected components that need instrumentation for distributed tracing:
```mermaid
flowchart TB
subgraph xrpld["xrpld Node"]
subgraph services["Core Services"]
RPC["RPC Server<br/>(HTTP/WS/gRPC)"]
Overlay["Overlay<br/>(P2P Network)"]
Consensus["Consensus<br/>(RCLConsensus)"]
ValidatorList["ValidatorList<br/>(UNL Mgmt)"]
end
JobQueue["JobQueue<br/>(Thread Pool)"]
subgraph processing["Processing Layer"]
NetworkOPs["NetworkOPs<br/>(Tx Processing)"]
LedgerMaster["LedgerMaster<br/>(Ledger Mgmt)"]
NodeStore["NodeStore<br/>(Database)"]
InboundLedgers["InboundLedgers<br/>(Ledger Sync)"]
end
subgraph appservices["Application Services"]
PathFind["PathFinding<br/>(Payment Paths)"]
TxQ["TxQ<br/>(Fee Escalation)"]
LoadMgr["LoadManager<br/>(Fee/Load)"]
end
subgraph observability["Existing Observability"]
PerfLog["PerfLog<br/>(JSON)"]
Insight["Insight<br/>(StatsD)"]
Logging["Logging<br/>(Journal)"]
end
services --> JobQueue
JobQueue --> processing
JobQueue --> appservices
end
style xrpld fill:#424242,stroke:#212121,color:#ffffff
style services fill:#1565c0,stroke:#0d47a1,color:#ffffff
style processing fill:#2e7d32,stroke:#1b5e20,color:#ffffff
style appservices fill:#6a1b9a,stroke:#4a148c,color:#ffffff
style observability fill:#e65100,stroke:#bf360c,color:#ffffff
```
**Reading the diagram:**
- **Core Services (blue)**: The entry points into xrpld -- RPC Server handles client requests, Overlay manages peer-to-peer networking, Consensus drives agreement, and ValidatorList manages trusted validators.
- **JobQueue (center)**: The asynchronous thread pool that decouples Core Services from the Processing and Application layers. All work flows through it.
- **Processing Layer (green)**: Core business logic -- NetworkOPs processes transactions, LedgerMaster manages ledger state, NodeStore handles persistence, and InboundLedgers synchronizes missing data.
- **Application Services (purple)**: Higher-level features -- PathFinding computes payment routes, TxQ manages fee-based queuing, and LoadManager tracks server load.
- **Existing Observability (orange)**: The current monitoring stack (PerfLog, Insight, Journal logging) that OpenTelemetry will complement, not replace.
- **Arrows (Services to JobQueue to layers)**: Work originates at Core Services, is enqueued onto the JobQueue, and dispatched to Processing or Application layers for execution.
---
## 1.1.1 Actors and Actions
### Actors
| Who (Plain English) | Technical Term |
| ----------------------------------------- | -------------------------- |
| Network node running XRPL software | xrpld node |
| External client submitting requests | RPC Client |
| Network neighbor sharing data | Peer (PeerImp) |
| Request handler for client queries | RPC Server (ServerHandler) |
| Command executor for specific RPC methods | RPCHandler |
| Agreement process between nodes | Consensus (RCLConsensus) |
| Transaction processing coordinator | NetworkOPs |
| Background task scheduler | JobQueue |
| Ledger state manager | LedgerMaster |
| Payment route calculator | PathFinding (Pathfinder) |
| Transaction waiting room | TxQ (Transaction Queue) |
| Fee adjustment system | LoadManager |
| Trusted validator list manager | ValidatorList |
| Protocol upgrade tracker | AmendmentTable |
| Ledger state hash tree | SHAMap |
| Persistent key-value storage | NodeStore |
### Actions
| What Happens (Plain English) | Technical Term |
| ---------------------------------------------- | ---------------------- |
| Client sends a request to a node | `rpc.request` |
| Node executes a specific RPC command | `rpc.command.*` |
| Node receives a transaction from a peer | `tx.receive` |
| Node checks if a transaction is valid | `tx.validate` |
| Node forwards a transaction to neighbors | `tx.relay` |
| Nodes agree on which transactions to include | `consensus.round` |
| Consensus progresses through phases | `consensus.phase.*` |
| Node builds a new confirmed ledger | `ledger.build` |
| Node fetches missing ledger data from peers | `ledger.acquire` |
| Node computes payment routes | `pathfind.compute` |
| Node queues a transaction for later processing | `txq.enqueue` |
| Node increases fees due to high load | `fee.escalate` |
| Node fetches the latest trusted validator list | `validator.list.fetch` |
| Node votes on a protocol amendment | `amendment.vote` |
| Node synchronizes state tree data | `shamap.sync` |
---
## 1.2 Key Components for Instrumentation
> **TxQ** = Transaction Queue | **UNL** = Unique Node List
| Component | Location | Purpose | Trace Value |
| ------------------ | ------------------------------------------ | ------------------------ | -------------------------------- |
| **Overlay** | `src/xrpld/overlay/` | P2P communication | Message propagation timing |
| **PeerImp** | `src/xrpld/overlay/detail/PeerImp.cpp` | Individual peer handling | Per-peer latency |
| **RCLConsensus** | `src/xrpld/app/consensus/RCLConsensus.cpp` | Consensus algorithm | Round timing, phase analysis |
| **NetworkOPs** | `src/xrpld/app/misc/NetworkOPs.cpp` | Transaction processing | Tx lifecycle tracking |
| **ServerHandler** | `src/xrpld/rpc/detail/ServerHandler.cpp` | RPC entry point | Request latency |
| **RPCHandler** | `src/xrpld/rpc/detail/RPCHandler.cpp` | Command execution | Per-command timing |
| **JobQueue** | `src/xrpl/core/JobQueue.h` | Async task execution | Queue wait times |
| **PathFinding** | `src/xrpld/app/paths/` | Payment path computation | Path latency, cache hits |
| **TxQ** | `src/xrpld/app/misc/TxQ.cpp` | Transaction queue/fees | Queue depth, eviction rates |
| **LoadManager** | `src/xrpld/app/main/LoadManager.cpp` | Fee escalation/load | Fee levels, load factors |
| **InboundLedgers** | `src/xrpld/app/ledger/InboundLedgers.cpp` | Ledger acquisition | Sync time, peer reliability |
| **ValidatorList** | `src/xrpld/app/misc/ValidatorList.cpp` | UNL management | List freshness, fetch failures |
| **AmendmentTable** | `src/xrpld/app/misc/AmendmentTable.cpp` | Protocol amendments | Voting status, activation events |
| **SHAMap** | `src/xrpld/shamap/` | State hash tree | Sync speed, missing nodes |
---
## 1.3 Transaction Flow Diagram
Transaction flow spans multiple nodes in the network. Each node creates linked spans to form a distributed trace:
```mermaid
sequenceDiagram
participant Client
participant PeerA as Peer A (Receive)
participant PeerB as Peer B (Relay)
participant PeerC as Peer C (Validate)
Client->>PeerA: 1. Submit TX
rect rgb(230, 245, 255)
Note over PeerA: tx.receive SPAN START
PeerA->>PeerA: HashRouter Deduplication
PeerA->>PeerA: tx.validate (child span)
end
PeerA->>PeerB: 2. Relay TX (with trace ctx)
rect rgb(230, 245, 255)
Note over PeerB: tx.receive (linked span)
end
PeerB->>PeerC: 3. Relay TX
rect rgb(230, 245, 255)
Note over PeerC: tx.receive (linked span)
PeerC->>PeerC: tx.process
end
Note over Client,PeerC: DISTRIBUTED TRACE (same trace_id: abc123)
```
**Reading the diagram:**
- **Client**: The external entity that submits a transaction to Peer A. It has no trace context -- the trace starts at the first node.
- **Peer A (Receive)**: The entry node that creates the root span `tx.receive`, runs HashRouter deduplication to avoid processing duplicates, and creates a child `tx.validate` span.
- **Peer A to Peer B arrow**: The relay message carries trace context (trace_id + parent span_id), enabling Peer B to create a linked span under the same trace.
- **Peer B (Relay)**: Receives the transaction and trace context, creates a `tx.receive` span linked to Peer A's trace, then relays onward.
- **Peer C (Validate)**: Final hop in this example. Creates a linked `tx.receive` span and runs `tx.process` to fully process the transaction.
- **Blue rectangles**: Highlight the span boundaries on each node, showing where instrumentation creates and closes spans.
### Trace Structure
```
trace_id: abc123
├── span: tx.receive (Peer A)
│ ├── span: tx.validate
│ └── span: tx.relay
├── span: tx.receive (Peer B) [parent: Peer A]
│ └── span: tx.relay
└── span: tx.receive (Peer C) [parent: Peer B]
└── span: tx.process
```
---
## 1.4 Consensus Round Flow
Consensus rounds are multi-phase operations that benefit significantly from tracing:
```mermaid
flowchart TB
subgraph round["consensus.round (root span)"]
attrs["Attributes:<br/>xrpl.consensus.ledger.seq = 12345678<br/>xrpl.consensus.mode = proposing<br/>xrpl.consensus.proposers = 35"]
subgraph open["consensus.phase.open"]
open_desc["Duration: ~3s<br/>Waiting for transactions"]
end
subgraph establish["consensus.phase.establish"]
est_attrs["proposals_received = 28<br/>disputes_resolved = 3"]
est_children["├── consensus.proposal.receive (×28)<br/>├── consensus.proposal.send (×1)<br/>└── consensus.dispute.resolve (×3)"]
end
subgraph accept["consensus.phase.accept"]
acc_attrs["transactions_applied = 150<br/>ledger.hash = DEF456..."]
acc_children["├── ledger.build<br/>└── ledger.validate"]
end
attrs --> open
open --> establish
establish --> accept
end
style round fill:#f57f17,stroke:#e65100,color:#ffffff
style open fill:#1565c0,stroke:#0d47a1,color:#ffffff
style establish fill:#2e7d32,stroke:#1b5e20,color:#ffffff
style accept fill:#c2185b,stroke:#880e4f,color:#ffffff
```
**Reading the diagram:**
- **consensus.round (orange, root span)**: The top-level span encompassing the entire consensus round, with attributes like ledger sequence, mode, and proposer count.
- **consensus.phase.open (blue)**: The first phase where the node waits (~3s) to collect incoming transactions before proposing.
- **consensus.phase.establish (green)**: The negotiation phase where validators exchange proposals, resolve disputes, and converge on a transaction set. Child spans track each proposal received/sent and each dispute resolved.
- **consensus.phase.accept (pink)**: The final phase where the agreed transaction set is applied, a new ledger is built, and the ledger is validated. Child spans cover `ledger.build` and `ledger.validate`.
- **Arrows (open to establish to accept)**: The sequential flow through the three consensus phases. Each phase must complete before the next begins.
---
## 1.5 RPC Request Flow
> **WS** = WebSocket
RPC requests support W3C Trace Context headers for distributed tracing across services:
```mermaid
flowchart TB
subgraph request["rpc.request (root span)"]
http["HTTP Request — POST /<br/>traceparent:<br/>00-abc123...-def456...-01"]
attrs["Attributes:<br/>http.method = POST<br/>net.peer.ip = 192.168.1.100<br/>xrpl.rpc.command = submit"]
subgraph enqueue["jobqueue.enqueue"]
job_attr["xrpl.job.type = jtCLIENT_RPC"]
end
subgraph command["rpc.command.submit"]
cmd_attrs["xrpl.rpc.version = 2<br/>xrpl.rpc.role = user"]
cmd_children["├── tx.deserialize<br/>├── tx.validate_local<br/>└── tx.submit_to_network"]
end
response["Response: 200 OK<br/>Duration: 45ms"]
http --> attrs
attrs --> enqueue
enqueue --> command
command --> response
end
style request fill:#2e7d32,stroke:#1b5e20,color:#ffffff
style enqueue fill:#1565c0,stroke:#0d47a1,color:#ffffff
style command fill:#e65100,stroke:#bf360c,color:#ffffff
```
**Reading the diagram:**
- **rpc.request (green, root span)**: The outermost span representing the full RPC request lifecycle, from HTTP receipt to response. Carries the W3C `traceparent` header for distributed tracing.
- **HTTP Request node**: Shows the incoming POST request with its `traceparent` header and extracted attributes (method, peer IP, command name).
- **jobqueue.enqueue (blue)**: The span covering the asynchronous handoff from the RPC thread to the JobQueue worker thread. The trace context is preserved across this async boundary.
- **rpc.command.submit (orange)**: The span for the actual command execution, with child spans for deserialization, local validation, and network submission.
- **Response node**: The final output with HTTP status and total duration, marking the end of the root span.
- **Arrows (top to bottom)**: The sequential processing pipeline -- receive request, extract attributes, enqueue job, execute command, return response.
---
## 1.6 Key Trace Points
> **TxQ** = Transaction Queue
The following table identifies priority instrumentation points across the codebase:
| Category | Span Name | File | Method | Priority |
| --------------- | ---------------------- | ---------------------- | ----------------------- | -------- |
| **Transaction** | `tx.receive` | `PeerImp.cpp` | `handleTransaction()` | High |
| **Transaction** | `tx.validate` | `NetworkOPs.cpp` | `processTransaction()` | High |
| **Transaction** | `tx.process` | `NetworkOPs.cpp` | `doTransactionSync()` | High |
| **Transaction** | `tx.relay` | `OverlayImpl.cpp` | `relay()` | Medium |
| **Consensus** | `consensus.round` | `RCLConsensus.cpp` | `startRound()` | High |
| **Consensus** | `consensus.phase.*` | `Consensus.h` | `timerEntry()` | High |
| **Consensus** | `consensus.proposal.*` | `RCLConsensus.cpp` | `peerProposal()` | Medium |
| **RPC** | `rpc.request` | `ServerHandler.cpp` | `onRequest()` | High |
| **RPC** | `rpc.command.*` | `RPCHandler.cpp` | `doCommand()` | High |
| **Peer** | `peer.connect` | `OverlayImpl.cpp` | `onHandoff()` | Low |
| **Peer** | `peer.message.*` | `PeerImp.cpp` | `onMessage()` | Low |
| **Ledger** | `ledger.acquire` | `InboundLedgers.cpp` | `acquire()` | Medium |
| **Ledger** | `ledger.build` | `RCLConsensus.cpp` | `buildLCL()` | High |
| **PathFinding** | `pathfind.request` | `PathRequest.cpp` | `doUpdate()` | High |
| **PathFinding** | `pathfind.compute` | `Pathfinder.cpp` | `findPaths()` | High |
| **TxQ** | `txq.enqueue` | `TxQ.cpp` | `apply()` | High |
| **TxQ** | `txq.apply` | `TxQ.cpp` | `processClosedLedger()` | High |
| **Fee** | `fee.escalate` | `LoadManager.cpp` | `raiseLocalFee()` | Medium |
| **Ledger** | `ledger.replay` | `LedgerReplayer.h` | `replay()` | Medium |
| **Ledger** | `ledger.delta` | `LedgerDeltaAcquire.h` | `processData()` | Medium |
| **Validator** | `validator.list.fetch` | `ValidatorList.cpp` | `verify()` | Medium |
| **Validator** | `validator.manifest` | `Manifest.cpp` | `applyManifest()` | Low |
| **Amendment** | `amendment.vote` | `AmendmentTable.cpp` | `doVoting()` | Low |
| **SHAMap** | `shamap.sync` | `SHAMap.cpp` | `fetchRoot()` | Medium |
---
## 1.7 Instrumentation Priority
> **TxQ** = Transaction Queue
```mermaid
quadrantChart
title Instrumentation Priority Matrix
x-axis Low Complexity --> High Complexity
y-axis Low Value --> High Value
quadrant-1 Implement First
quadrant-2 Plan Carefully
quadrant-3 Quick Wins
quadrant-4 Consider Later
RPC Tracing: [0.2, 0.92]
Transaction Tracing: [0.55, 0.88]
Consensus Tracing: [0.78, 0.82]
PathFinding: [0.38, 0.75]
TxQ and Fees: [0.25, 0.65]
Ledger Sync: [0.62, 0.58]
Peer Message Tracing: [0.35, 0.25]
JobQueue Tracing: [0.2, 0.48]
Validator Mgmt: [0.48, 0.42]
Amendment Tracking: [0.15, 0.32]
SHAMap Operations: [0.72, 0.45]
```
---
## 1.8 Observable Outcomes
> **TxQ** = Transaction Queue | **UNL** = Unique Node List
After implementing OpenTelemetry, operators and developers will gain visibility into the following:
### 1.8.1 What You Will See: Traces
| Trace Type | Description | Example Query in Grafana/Tempo |
| -------------------------- | ------------------------------------------------------------------------------------------- | ---------------------------------------------------- |
| **Transaction Lifecycle** | Full journey from RPC submission through validation, relay, consensus, and ledger inclusion | `{service.name="xrpld" && xrpl.tx.hash="ABC123..."}` |
| **Cross-Node Propagation** | Transaction path across multiple xrpld nodes with timing | `{xrpl.tx.relay_count > 0}` |
| **Consensus Rounds** | Complete round with all phases (open, establish, accept) | `{span.name=~"consensus.round.*"}` |
| **RPC Request Processing** | Individual command execution with timing breakdown | `{xrpl.rpc.command="account_info"}` |
| **Ledger Acquisition** | Peer-to-peer ledger data requests and responses | `{span.name="ledger.acquire"}` |
| **PathFinding Latency** | Path computation time and cache effectiveness for payment RPCs | `{span.name="pathfind.compute"}` |
| **TxQ Behavior** | Queue depth, eviction patterns, fee escalation during congestion | `{span.name=~"txq.*"}` |
| **Ledger Sync** | Full acquisition timeline including delta and transaction fetches | `{span.name=~"ledger.acquire.*"}` |
| **Validator Health** | UNL fetch success, manifest updates, stale list detection | `{span.name=~"validator.*"}` |
### 1.8.2 What You Will See: Metrics (Derived from Traces)
| Metric | Description | Dashboard Panel |
| ----------------------------- | --------------------------------------- | --------------------------- |
| **RPC Latency (p50/p95/p99)** | Response time distribution per command | Heatmap by command |
| **Transaction Throughput** | Transactions processed per second | Time series graph |
| **Consensus Round Duration** | Time to complete consensus phases | Histogram |
| **Cross-Node Latency** | Time for transaction to reach N nodes | Line chart with percentiles |
| **Error Rate** | Failed transactions/RPC calls by type | Stacked bar chart |
| **PathFinding Latency** | Path computation time per currency pair | Heatmap by currency |
| **TxQ Depth** | Queued transactions over time | Time series with thresholds |
| **Fee Escalation Level** | Current fee multiplier | Gauge with alert thresholds |
| **Ledger Sync Duration** | Time to acquire missing ledgers | Histogram |
### 1.8.3 Concrete Dashboard Examples
**Transaction Trace View (Tempo):**
```
┌────────────────────────────────────────────────────────────────────────────────┐
│ Trace: abc123... (Transaction Submission) Duration: 847ms │
├────────────────────────────────────────────────────────────────────────────────┤
│ ├── rpc.request [ServerHandler] ████░░░░░░ 45ms │
│ │ └── rpc.command.submit [RPCHandler] ████░░░░░░ 42ms │
│ │ └── tx.receive [NetworkOPs] ███░░░░░░░ 35ms │
│ │ ├── tx.validate [TxQ] █░░░░░░░░░ 8ms │
│ │ └── tx.relay [Overlay] ██░░░░░░░░ 15ms │
│ │ ├── tx.receive [Node-B] █████░░░░░ 52ms │
│ │ │ └── tx.relay [Node-B] ██░░░░░░░░ 18ms │
│ │ └── tx.receive [Node-C] ██████░░░░ 65ms │
│ └── consensus.round [RCLConsensus] ████████░░ 720ms │
│ ├── consensus.phase.open ██░░░░░░░░ 180ms │
│ ├── consensus.phase.establish █████░░░░░ 480ms │
│ └── consensus.phase.accept █░░░░░░░░░ 60ms │
└────────────────────────────────────────────────────────────────────────────────┘
```
**RPC Performance Dashboard Panel:**
```
┌─────────────────────────────────────────────────────────────┐
│ RPC Command Latency (Last 1 Hour) │
├─────────────────────────────────────────────────────────────┤
│ Command │ p50 │ p95 │ p99 │ Errors │ Rate │
│──────────────────┼────────┼────────┼────────┼────────┼──────│
│ account_info │ 12ms │ 45ms │ 89ms │ 0.1% │ 150/s│
│ submit │ 35ms │ 120ms │ 250ms │ 2.3% │ 45/s│
│ ledger │ 8ms │ 25ms │ 55ms │ 0.0% │ 80/s│
│ tx │ 15ms │ 50ms │ 100ms │ 0.5% │ 60/s│
│ server_info │ 5ms │ 12ms │ 20ms │ 0.0% │ 200/s│
└─────────────────────────────────────────────────────────────┘
```
**Consensus Health Dashboard Panel:**
```mermaid
---
config:
xyChart:
width: 1200
height: 400
plotReservedSpacePercent: 50
chartOrientation: vertical
themeVariables:
xyChart:
plotColorPalette: "#3498db"
---
xychart-beta
title "Consensus Round Duration (Last 24 Hours)"
x-axis "Time of Day (Hours)" [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24]
y-axis "Duration (seconds)" 1 --> 5
line [2.1, 2.4, 2.8, 3.2, 3.8, 4.3, 4.5, 5.0, 4.7, 4.0, 3.2, 2.6, 2.0]
```
### 1.8.4 Operator Actionable Insights
| Scenario | What You'll See | Action |
| ------------------------- | ---------------------------------------------------------------------------- | ------------------------------------------------ |
| **Slow RPC** | Span showing which phase is slow (parsing, execution, serialization) | Optimize specific code path |
| **Transaction Stuck** | Trace stops at validation; error attribute shows reason | Fix transaction parameters |
| **Consensus Delay** | Phase.establish taking too long; proposer attribute shows missing validators | Investigate network connectivity |
| **Memory Spike** | Large batch of spans correlating with memory increase | Tune batch_size or sampling |
| **Network Partition** | Traces missing cross-node links for specific peer | Check peer connectivity |
| **Path Computation Slow** | pathfind.compute span shows high latency; cache miss rate in attributes | Warm the RippleLineCache, check order book depth |
| **TxQ Full** | txq.enqueue spans show evictions; fee.escalate spans increasing | Monitor fee levels, alert operators |
| **Ledger Sync Stalled** | ledger.acquire spans timing out; peer reliability attributes show issues | Check peer connectivity, add trusted peers |
| **UNL Stale** | validator.list.fetch spans failing; last_update attribute aging | Verify validator site URLs, check DNS |
### 1.8.5 Developer Debugging Workflow
1. **Find Transaction**: Query by `xrpl.tx.hash` to get full trace
2. **Identify Bottleneck**: Look at span durations to find slowest component
3. **Check Attributes**: Review `xrpl.tx.validity`, `xrpl.rpc.status` for errors
4. **Correlate Logs**: Use `trace_id` to find related PerfLog entries
5. **Compare Nodes**: Filter by `service.instance.id` to compare behavior across nodes
---
_Next: [Design Decisions](./02-design-decisions.md)_ | _Back to: [Overview](./OpenTelemetryPlan.md)_

View File

@@ -0,0 +1,633 @@
# Design Decisions
> **Parent Document**: [OpenTelemetryPlan.md](./OpenTelemetryPlan.md)
> **Related**: [Architecture Analysis](./01-architecture-analysis.md) | [Code Samples](./04-code-samples.md)
---
## 2.1 OpenTelemetry Components
> **OTLP** = OpenTelemetry Protocol
### 2.1.1 SDK Selection
**Primary Choice**: OpenTelemetry C++ SDK (`opentelemetry-cpp`)
| Component | Purpose | Required |
| --------------------------------------- | ---------------------- | ------------------------- |
| `opentelemetry-cpp::api` | Tracing API headers | Yes |
| `opentelemetry-cpp::sdk` | SDK implementation | Yes |
| `opentelemetry-cpp::ext` | Extensions (exporters) | Yes |
| `opentelemetry-cpp::otlp_http_exporter` | OTLP/HTTP export | Yes (shipped in Phase 1b) |
| `opentelemetry-cpp::otlp_grpc_exporter` | OTLP/gRPC export | Future (not yet wired up) |
### 2.1.2 Instrumentation Strategy
**Manual Instrumentation** (recommended):
| Approach | Pros | Cons |
| ---------- | --------------------------------------------------------------- | ------------------------------------------------------- |
| **Manual** | Precise control, optimized placement, xrpld-specific attributes | More development effort |
| **Auto** | Less code, automatic coverage | Less control, potential overhead, limited customization |
---
## 2.2 Exporter Configuration
> **OTLP** = OpenTelemetry Protocol
```mermaid
flowchart TB
subgraph nodes["xrpld Nodes"]
node1["xrpld<br/>Node 1"]
node2["xrpld<br/>Node 2"]
node3["xrpld<br/>Node 3"]
end
collector["OpenTelemetry<br/>Collector<br/>(sidecar or standalone)"]
subgraph backends["Observability Backends"]
tempo["Tempo"]
elastic["Elastic<br/>APM"]
end
node1 -->|"OTLP/HTTP<br/>:4318"| collector
node2 -->|"OTLP/HTTP<br/>:4318"| collector
node3 -->|"OTLP/HTTP<br/>:4318"| collector
collector --> tempo
collector --> elastic
style nodes fill:#0d47a1,stroke:#082f6a,color:#ffffff
style backends fill:#1b5e20,stroke:#0d3d14,color:#ffffff
style collector fill:#bf360c,stroke:#8c2809,color:#ffffff
```
**Reading the diagram:**
- **xrpld Nodes (blue)**: The source of telemetry data. Each xrpld node exports spans via OTLP/HTTP on port 4318 (the only exporter shipped in Phase 1b).
- **OpenTelemetry Collector (red)**: The central aggregation point that receives spans from all nodes. Can run as a sidecar (per-node) or standalone (shared). Handles batching, filtering, and routing.
- **Observability Backends (green)**: The storage and visualization destinations. Tempo is the recommended backend for both development and production, and Elastic APM is an alternative. The Collector routes to one or more backends.
- **Arrows (nodes to collector to backends)**: The data pipeline -- spans flow from nodes to the Collector over HTTP, then the Collector fans out to the configured backends.
### 2.2.1 OTLP/HTTP (Shipped in Phase 1b)
```cpp
// Configuration for OTLP over HTTP (the only exporter currently wired up).
namespace otlp = opentelemetry::exporter::otlp;
otlp::OtlpHttpExporterOptions opts;
opts.url = "http://localhost:4318/v1/traces";
opts.content_type = otlp::HttpRequestContentType::kJson; // or kBinary
```
### 2.2.2 OTLP/gRPC (Future Work — Planned Upgrade)
OTLP/gRPC is planned as a future upgrade from the HTTP exporter. The gRPC
transport offers lower per-span overhead and tighter back-pressure semantics
than HTTP/JSON, making it attractive for production deployments once the HTTP
path is validated in earlier phases.
Required to land this upgrade:
1. Add `opentelemetry-cpp::otlp_grpc_exporter` to the Conan recipe (the
dependency already exists but is not linked in Phase 1b builds).
2. Extend `TelemetryConfig.cpp` to parse an `exporter` key (`otlp_http`
default, `otlp_grpc` opt-in) and a gRPC endpoint override.
3. In `Telemetry::start()` branch on the parsed exporter type and construct
either `OtlpHttpExporterFactory::Create(httpOpts)` or
`OtlpGrpcExporterFactory::Create(grpcOpts)` accordingly.
4. Update the runbook and dashboards to document the alternate port and TLS
settings.
Example Phase 1b+ gRPC configuration (when wired up):
```cpp
// Configuration for OTLP over gRPC (future work).
namespace otlp = opentelemetry::exporter::otlp;
otlp::OtlpGrpcExporterOptions opts;
opts.endpoint = "<otel-collector-host>:4317";
opts.use_ssl_credentials = true;
opts.ssl_credentials_cacert_path = "/path/to/ca.crt";
```
Until that work lands, `OtlpGrpcExporterOptions` is **not** used by any code
path in Phase 1b through Phase 5.
---
## 2.3 Span Naming Conventions
> **TxQ** = Transaction Queue | **UNL** = Unique Node List | **WS** = WebSocket
### 2.3.1 Naming Schema
```
<component>.<operation>[.<sub-operation>]
```
**Examples**:
- `tx.receive` - Transaction received from peer
- `consensus.phase.establish` - Consensus establish phase
- `rpc.command.server_info` - server_info RPC command
### 2.3.2 Complete Span Catalog
```yaml
# Transaction Spans
tx:
receive: "Transaction received from network"
validate: "Transaction signature/format validation"
process: "Full transaction processing"
relay: "Transaction relay to peers"
apply: "Apply transaction to ledger"
# Consensus Spans
consensus:
round: "Complete consensus round"
phase:
open: "Open phase - collecting transactions"
establish: "Establish phase - reaching agreement"
accept: "Accept phase - applying consensus"
proposal:
receive: "Receive peer proposal"
send: "Send our proposal"
validation:
receive: "Receive peer validation"
send: "Send our validation"
# RPC Spans
rpc:
request: "HTTP/WebSocket request handling"
command:
"*": "Specific RPC command (dynamic)"
# Peer Spans
peer:
connect: "Peer connection establishment"
disconnect: "Peer disconnection"
message:
send: "Send protocol message"
receive: "Receive protocol message"
# Ledger Spans
ledger:
acquire: "Ledger acquisition from network"
build: "Build new ledger"
validate: "Ledger validation"
close: "Close ledger"
replay: "Ledger replay executed"
delta: "Delta-based ledger acquired"
# PathFinding Spans
pathfind:
request: "Path request initiated"
compute: "Path computation executed"
# TxQ Spans
txq:
enqueue: "Transaction queued"
apply: "Queued transaction applied"
# Fee/Load Spans
fee:
escalate: "Fee escalation triggered"
# Validator Spans
validator:
list:
fetch: "UNL list fetched"
manifest: "Manifest update processed"
# Amendment Spans
amendment:
vote: "Amendment voting executed"
# SHAMap Spans
shamap:
sync: "State tree synchronization"
# Job Spans
job:
enqueue: "Job added to queue"
execute: "Job execution"
```
---
## 2.4 Attribute Schema
> **TxQ** = Transaction Queue | **UNL** = Unique Node List | **OTLP** = OpenTelemetry Protocol
### 2.4.1 Resource Attributes (Set Once at Startup)
```cpp
// Standard OpenTelemetry semantic conventions
resource::SemanticConventions::SERVICE_NAME = "xrpld"
resource::SemanticConventions::SERVICE_VERSION = BuildInfo::getVersionString()
resource::SemanticConventions::SERVICE_INSTANCE_ID = <node_public_key_base58>
// Custom xrpld attributes
"xrpl.network.id" = <network_id> // e.g., 0 for mainnet
"xrpl.network.type" = "mainnet" | "testnet" | "devnet" | "standalone"
"xrpl.node.type" = "validator" | "stock" | "reporting"
"xrpl.node.cluster" = <cluster_name> // If clustered
```
### 2.4.2 Span Attributes by Category
#### Transaction Attributes
```cpp
"xrpl.tx.hash" = string // Transaction hash (hex)
"xrpl.tx.type" = string // "Payment", "OfferCreate", etc.
"xrpl.tx.account" = string // Source account (redacted in prod)
"xrpl.tx.sequence" = int64 // Account sequence number
"xrpl.tx.fee" = int64 // Fee in drops
"xrpl.tx.result" = string // "tesSUCCESS", "tecPATH_DRY", etc.
"xrpl.tx.ledger_index" = int64 // Ledger containing transaction
```
#### Consensus Attributes
```cpp
"xrpl.consensus.round" = int64 // Round number
"xrpl.consensus.phase" = string // "open", "establish", "accept"
"xrpl.consensus.mode" = string // "proposing", "observing", etc.
"xrpl.consensus.proposers" = int64 // Number of proposers
"xrpl.consensus.ledger.prev" = string // Previous ledger hash
"xrpl.consensus.ledger.seq" = int64 // Ledger sequence
"xrpl.consensus.tx_count" = int64 // Transactions in consensus set
"xrpl.consensus.duration_ms" = float64 // Round duration
```
#### RPC Attributes
```cpp
"xrpl.rpc.command" = string // Command name
"xrpl.rpc.version" = int64 // API version
"xrpl.rpc.role" = string // "admin" or "user"
"xrpl.rpc.params" = string // Sanitized parameters (optional)
```
#### Peer & Message Attributes
```cpp
"xrpl.peer.id" = string // Peer public key (base58)
"xrpl.peer.address" = string // IP:port
"xrpl.peer.latency_ms" = float64 // Measured latency
"xrpl.peer.cluster" = string // Cluster name if clustered
"xrpl.message.type" = string // Protocol message type name
"xrpl.message.size_bytes" = int64 // Message size
"xrpl.message.compressed" = bool // Whether compressed
```
#### Ledger & Job Attributes
```cpp
"xrpl.ledger.hash" = string // Ledger hash
"xrpl.ledger.index" = int64 // Ledger sequence/index
"xrpl.ledger.close_time" = int64 // Close time (epoch)
"xrpl.ledger.tx_count" = int64 // Transaction count
"xrpl.job.type" = string // Job type name
"xrpl.job.queue_ms" = float64 // Time spent in queue
"xrpl.job.worker" = int64 // Worker thread ID
```
#### PathFinding Attributes
```cpp
"xrpl.pathfind.source_currency" = string // Source currency code
"xrpl.pathfind.dest_currency" = string // Destination currency code
"xrpl.pathfind.path_count" = int64 // Number of paths found
"xrpl.pathfind.cache_hit" = bool // RippleLineCache hit
```
#### TxQ Attributes
```cpp
"xrpl.txq.queue_depth" = int64 // Current queue depth
"xrpl.txq.fee_level" = int64 // Fee level of transaction
"xrpl.txq.eviction_reason" = string // Why transaction was evicted
```
#### Fee Attributes
```cpp
"xrpl.fee.load_factor" = int64 // Current load factor
"xrpl.fee.escalation_level" = int64 // Fee escalation multiplier
```
#### Validator Attributes
```cpp
"xrpl.validator.list_size" = int64 // UNL size
"xrpl.validator.list_age_sec" = int64 // Seconds since last update
```
#### Amendment Attributes
```cpp
"xrpl.amendment.name" = string // Amendment name
"xrpl.amendment.status" = string // "enabled", "vetoed", "supported"
```
#### SHAMap Attributes
```cpp
"xrpl.shamap.type" = string // "transaction", "state", "account_state"
"xrpl.shamap.missing_nodes" = int64 // Number of missing nodes during sync
"xrpl.shamap.duration_ms" = float64 // Sync duration
```
### 2.4.3 Data Collection Summary
The following table summarizes what data is collected by category:
| Category | Attributes Collected | Purpose |
| --------------- | ---------------------------------------------------------------------- | ---------------------------- |
| **Transaction** | `tx.hash`, `tx.type`, `tx.result`, `tx.fee`, `ledger_index` | Trace transaction lifecycle |
| **Consensus** | `round`, `phase`, `mode`, `proposers` (public keys), `duration_ms` | Analyze consensus timing |
| **RPC** | `command`, `version`, `status`, `duration_ms` | Monitor RPC performance |
| **Peer** | `peer.id` (public key), `latency_ms`, `message.type`, `message.size` | Network topology analysis |
| **Ledger** | `ledger.hash`, `ledger.index`, `close_time`, `tx_count` | Ledger progression tracking |
| **Job** | `job.type`, `queue_ms`, `worker` | JobQueue performance |
| **PathFinding** | `pathfind.source_currency`, `dest_currency`, `path_count`, `cache_hit` | Payment path analysis |
| **TxQ** | `txq.queue_depth`, `fee_level`, `eviction_reason` | Queue depth and fee tracking |
| **Fee** | `fee.load_factor`, `escalation_level` | Fee escalation monitoring |
| **Validator** | `validator.list_size`, `list_age_sec` | UNL health monitoring |
| **Amendment** | `amendment.name`, `status` | Protocol upgrade tracking |
| **SHAMap** | `shamap.type`, `missing_nodes`, `duration_ms` | State tree sync performance |
### 2.4.4 Privacy & Sensitive Data Policy
> **PII** = Personally Identifiable Information
OpenTelemetry instrumentation is designed to collect **operational metadata only**, never sensitive content.
#### Data NOT Collected
The following data is explicitly **excluded** from telemetry collection:
| Excluded Data | Reason |
| ----------------------- | ----------------------------------------- |
| **Private Keys** | Never exposed; not relevant to tracing |
| **Account Balances** | Financial data; privacy sensitive |
| **Transaction Amounts** | Financial data; privacy sensitive |
| **Raw TX Payloads** | May contain sensitive memo/data fields |
| **Personal Data** | No PII collected |
| **IP Addresses** | Configurable; excluded by default in prod |
#### Privacy Protection Mechanisms
| Mechanism | Description |
| ----------------------------- | ------------------------------------------------------------------------- |
| **Account Hashing** | `xrpl.tx.account` is hashed at collector level before storage |
| **Configurable Redaction** | Sensitive fields can be excluded via `[telemetry]` config section |
| **Sampling** | Only 10% of traces recorded by default, reducing data exposure |
| **Local Control** | Node operators have full control over what gets exported |
| **No Raw Payloads** | Transaction content is never recorded, only metadata (hash, type, result) |
| **Collector-Level Filtering** | Additional redaction/hashing can be configured at OTel Collector |
#### Collector-Level Data Protection
The OpenTelemetry Collector can be configured to hash or redact sensitive attributes before export:
```yaml
processors:
attributes:
actions:
# Hash account addresses before storage
- key: xrpl.tx.account
action: hash
# Remove IP addresses entirely
- key: xrpl.peer.address
action: delete
# Redact specific fields
- key: xrpl.rpc.params
action: delete
```
#### Configuration Options for Privacy
In `xrpld.cfg`, operators can control data collection granularity:
```ini
[telemetry]
enabled=1
# Disable collection of specific components
trace_transactions=1
trace_consensus=1
trace_rpc=1
trace_peer=0 # Disable peer tracing (high volume, includes addresses)
# Redact specific attributes
redact_account=1 # Hash account addresses before export
redact_peer_address=1 # Remove peer IP addresses
```
> **Note**: The `redact_account` configuration in `xrpld.cfg` controls SDK-level redaction before export, while collector-level filtering (see [Collector-Level Data Protection](#collector-level-data-protection) above) provides an additional defense-in-depth layer. Both can operate independently.
> **Key Principle**: Telemetry collects **operational metadata** (timing, counts, hashes) — never **sensitive content** (keys, balances, amounts, raw payloads).
---
## 2.5 Context Propagation Design
> **WS** = WebSocket
### 2.5.1 Propagation Boundaries
```mermaid
flowchart TB
subgraph http["HTTP/WebSocket (RPC)"]
w3c["W3C Trace Context Headers:<br/>traceparent:<br/>00-trace_id-span_id-flags<br/>tracestate: xrpld=..."]
end
subgraph protobuf["Protocol Buffers (P2P)"]
proto["message TraceContext {<br/> bytes trace_id = 1; // 16 bytes<br/> bytes span_id = 2; // 8 bytes<br/> uint32 trace_flags = 3;<br/> string trace_state = 4;<br/>}"]
end
subgraph jobqueue["JobQueue (Internal Async)"]
job["Context captured at job creation,<br/>restored at execution<br/><br/>class Job {<br/> otel::context::Context<br/> traceContext_;<br/>};"]
end
style http fill:#0d47a1,stroke:#082f6a,color:#ffffff
style protobuf fill:#1b5e20,stroke:#0d3d14,color:#ffffff
style jobqueue fill:#bf360c,stroke:#8c2809,color:#ffffff
```
**Reading the diagram:**
- **HTTP/WebSocket - RPC (blue)**: For client-facing RPC requests, trace context is propagated using the W3C `traceparent` header. This is the standard approach and works with any OTel-compatible client.
- **Protocol Buffers - P2P (green)**: For peer-to-peer messages between xrpld nodes, trace context is embedded as a protobuf `TraceContext` message carrying trace_id, span_id, flags, and optional trace_state.
- **JobQueue - Internal Async (red)**: For asynchronous work within a single node, the OTel context is captured when a job is created and restored when the job executes on a worker thread. This bridges the async gap so spans remain linked.
---
## 2.6 Integration with Existing Observability
> **OTLP** = OpenTelemetry Protocol | **WS** = WebSocket
### 2.6.1 Existing Frameworks Comparison
xrpld already has two observability mechanisms. OpenTelemetry complements (not replaces) them:
| Aspect | PerfLog | Beast Insight (StatsD) | OpenTelemetry |
| --------------------- | ----------------------------- | ---------------------------- | ------------------------- |
| **Type** | Logging | Metrics | Distributed Tracing |
| **Data** | JSON log entries | Counters, gauges, histograms | Spans with context |
| **Scope** | Single node | Single node | **Cross-node** |
| **Output** | `perf.log` file | StatsD server | OTLP Collector |
| **Question answered** | "What happened on this node?" | "How many? How fast?" | "What was the journey?" |
| **Correlation** | By timestamp | By metric name | By `trace_id` |
| **Overhead** | Low (file I/O) | Low (UDP packets) | Low-Medium (configurable) |
### 2.6.2 What Each Framework Does Best
#### PerfLog
- **Purpose**: Detailed local event logging for RPC and job execution
- **Strengths**:
- Rich JSON output with timing data
- Already integrated in RPC handlers
- File-based, no external dependencies
- **Limitations**:
- Single-node only (no cross-node correlation)
- No parent-child relationships between events
- Manual log parsing required
```json
// Example PerfLog entry
{
"time": "2024-01-15T10:30:00.123Z",
"method": "submit",
"duration_us": 1523,
"result": "tesSUCCESS"
}
```
#### Beast Insight (StatsD)
- **Purpose**: Real-time metrics for monitoring dashboards
- **Strengths**:
- Aggregated metrics (counters, gauges, histograms)
- Low overhead (UDP, fire-and-forget)
- Good for alerting thresholds
- **Limitations**:
- No request-level detail
- No causal relationships
- Single-node perspective
```cpp
// Example StatsD usage in xrpld
insight.increment("rpc.submit.count");
insight.gauge("ledger.age", age);
insight.timing("consensus.round", duration);
```
#### OpenTelemetry (NEW)
- **Purpose**: Distributed request tracing across nodes
- **Strengths**:
- **Cross-node correlation** via `trace_id`
- Parent-child span relationships
- Rich attributes per span
- Industry standard (CNCF)
- **Limitations**:
- Requires collector infrastructure
- Higher complexity than logging
```cpp
// Example OpenTelemetry span
auto span = telemetry.startSpan("tx.relay");
span->SetAttribute("tx.hash", hash);
span->SetAttribute("peer.id", peerId);
// Span automatically linked to parent via context
```
### 2.6.3 When to Use Each
| Scenario | PerfLog | StatsD | OpenTelemetry |
| --------------------------------------- | ---------- | ------ | ------------- |
| "How many TXs per second?" | ❌ | ✅ | ✅ |
| "What's the p99 RPC latency?" | ❌ | ✅ | ✅ |
| "Why was this specific TX slow?" | ⚠️ partial | ❌ | ✅ |
| "Which node delayed consensus?" | ❌ | ❌ | ✅ |
| "What happened on node X at time T?" | ✅ | ❌ | ✅ |
| "Show me the TX journey across 5 nodes" | ❌ | ❌ | ✅ |
### 2.6.4 Coexistence Strategy
```mermaid
flowchart TB
subgraph xrpld["xrpld Process"]
perflog["PerfLog<br/>(JSON to file)"]
insight["Beast Insight<br/>(StatsD)"]
otel["OpenTelemetry<br/>(Tracing)"]
end
perflog --> perffile["perf.log"]
insight --> statsd["StatsD Server"]
otel --> collector["OTLP Collector"]
perffile --> grafana["Grafana<br/>(Unified UI)"]
statsd --> grafana
collector --> grafana
style xrpld fill:#212121,stroke:#0a0a0a,color:#ffffff
style grafana fill:#bf360c,stroke:#8c2809,color:#ffffff
```
**Reading the diagram:**
- **xrpld Process (dark gray)**: The single xrpld node running all three observability frameworks side by side. Each framework operates independently with no interference.
- **PerfLog to perf.log**: PerfLog writes JSON-formatted event logs to a local file. Grafana can ingest these via Loki or a file-based datasource.
- **Beast Insight to StatsD Server**: Insight sends aggregated metrics (counters, gauges) over UDP to a StatsD server. Grafana reads from StatsD-compatible backends like Graphite or Prometheus (via StatsD exporter).
- **OpenTelemetry to OTLP Collector**: OTel exports spans over OTLP/gRPC to a Collector, which then forwards to a trace backend (Tempo).
- **Grafana (red, unified UI)**: All three data streams converge in Grafana, enabling operators to correlate logs, metrics, and traces in a single dashboard.
### 2.6.5 Correlation with PerfLog
Trace IDs can be correlated with existing PerfLog entries for comprehensive debugging:
```cpp
// In RPCHandler.cpp - correlate trace with PerfLog
Status doCommand(RPC::JsonContext& context, Json::Value& result)
{
// Start OpenTelemetry span
auto span = context.app.getTelemetry().startSpan(
"rpc.command." + context.method);
// Get trace ID for correlation
auto traceId = span->GetContext().trace_id().IsValid()
? toHex(span->GetContext().trace_id())
: "";
// Use existing PerfLog with trace correlation
auto const curId = context.app.getPerfLog().currentId();
context.app.getPerfLog().rpcStart(context.method, curId);
// Future: Add trace ID to PerfLog entry
// context.app.getPerfLog().setTraceId(curId, traceId);
try {
auto ret = handler(context, result);
context.app.getPerfLog().rpcFinish(context.method, curId);
span->SetStatus(opentelemetry::trace::StatusCode::kOk);
return ret;
} catch (std::exception const& e) {
context.app.getPerfLog().rpcError(context.method, curId);
span->RecordException(e);
span->SetStatus(opentelemetry::trace::StatusCode::kError, e.what());
throw;
}
}
```
---
_Previous: [Architecture Analysis](./01-architecture-analysis.md)_ | _Next: [Implementation Strategy](./03-implementation-strategy.md)_ | _Back to: [Overview](./OpenTelemetryPlan.md)_

View File

@@ -0,0 +1,528 @@
# Implementation Strategy
> **Parent Document**: [OpenTelemetryPlan.md](./OpenTelemetryPlan.md)
> **Related**: [Code Samples](./04-code-samples.md) | [Configuration Reference](./05-configuration-reference.md)
---
## 3.1 Directory Structure
The telemetry implementation follows xrpld's existing code organization pattern:
```
include/xrpl/
├── telemetry/
│ ├── Telemetry.h # Main telemetry interface
│ ├── TelemetryConfig.h # Configuration structures
│ ├── TraceContext.h # Context propagation utilities
│ ├── SpanGuard.h # RAII span management
│ └── SpanAttributes.h # Attribute helper functions
src/libxrpl/
├── telemetry/
│ ├── Telemetry.cpp # Implementation
│ ├── TelemetryConfig.cpp # Config parsing
│ ├── TraceContext.cpp # Context serialization
│ └── NullTelemetry.cpp # No-op implementation
src/xrpld/
├── telemetry/
│ ├── TracingInstrumentation.h # Instrumentation macros
│ └── TracingInstrumentation.cpp
```
---
## 3.2 Implementation Approach
<div align="center">
```mermaid
%%{init: {'flowchart': {'nodeSpacing': 20, 'rankSpacing': 30}}}%%
flowchart TB
subgraph phase1["Phase 1: Core"]
direction LR
sdk["SDK Integration"] ~~~ interface["Telemetry Interface"] ~~~ config["Configuration"]
end
subgraph phase2["Phase 2: RPC"]
direction LR
http["HTTP Context"] ~~~ rpc["RPC Handlers"]
end
subgraph phase3["Phase 3: P2P"]
direction LR
proto["Protobuf Context"] ~~~ tx["Transaction Relay"]
end
subgraph phase4["Phase 4: Consensus"]
direction LR
consensus["Consensus Rounds"] ~~~ proposals["Proposals"]
end
phase1 --> phase2 --> phase3 --> phase4
style phase1 fill:#1565c0,stroke:#0d47a1,color:#ffffff
style phase2 fill:#2e7d32,stroke:#1b5e20,color:#ffffff
style phase3 fill:#e65100,stroke:#bf360c,color:#ffffff
style phase4 fill:#c2185b,stroke:#880e4f,color:#ffffff
```
</div>
### Key Principles
1. **Minimal Intrusion**: Instrumentation should not alter existing control flow
2. **Zero-Cost When Disabled**: Use compile-time flags and no-op implementations
3. **Backward Compatibility**: Protocol Buffer extensions use high field numbers
4. **Graceful Degradation**: Tracing failures must not affect node operation
---
## 3.3 Performance Overhead Summary
> **OTLP** = OpenTelemetry Protocol
| Metric | Overhead | Notes |
| ------------- | ---------- | ------------------------------------------------ |
| CPU | 1-3% | Of per-transaction CPU cost (~200μs baseline) |
| Memory | ~10 MB | SDK statics + batch buffer + worker thread stack |
| Network | 10-50 KB/s | Compressed OTLP export to collector |
| Latency (p99) | <2% | With proper sampling configuration |
---
## 3.4 Detailed CPU Overhead Analysis
### 3.4.1 Per-Operation Costs
> **Note on hardware assumptions**: The costs below are based on the official OTel C++ SDK CI benchmarks
> (969 runs on GitHub Actions 2-core shared runners). On production server hardware (3+ GHz Xeon),
> expect costs at the **lower end** of each range (~30-50% improvement over CI hardware).
| Operation | Time (ns) | Frequency | Impact |
| --------------------- | --------- | ---------------------- | ---------- |
| Span creation | 500-1000 | Every traced operation | Low |
| Span end | 100-200 | Every traced operation | Low |
| SetAttribute (string) | 80-120 | 3-5 per span | Low |
| SetAttribute (int) | 40-60 | 2-3 per span | Negligible |
| AddEvent | 100-200 | 0-2 per span | Low |
| Context injection | 150-250 | Per outgoing message | Low |
| Context extraction | 100-180 | Per incoming message | Low |
| GetCurrent context | 10-20 | Thread-local access | Negligible |
**Source**: Span creation based on OTel C++ SDK `BM_SpanCreation` benchmark (AlwaysOnSampler +
SimpleSpanProcessor + InMemoryExporter), median ~1,000 ns on CI hardware. AddEvent includes
timestamp read + string copy + vector push + mutex acquisition. Context injection/extraction
confirmed by `BM_SpanCreationWithScope` benchmark delta (~160 ns).
### 3.4.2 Transaction Processing Overhead
<div align="center">
```mermaid
%%{init: {'pie': {'textPosition': 0.75}}}%%
pie showData
"tx.receive (1400ns)" : 1400
"tx.validate (1200ns)" : 1200
"tx.relay (1200ns)" : 1200
"Context inject (200ns)" : 200
```
**Transaction Tracing Overhead (~4.0μs total)**
</div>
**Overhead percentage**: 4.0 μs / 200 μs (avg tx processing) = **~2.0%**
> **Breakdown**: Each span (tx.receive, tx.validate, tx.relay) costs ~1,000 ns for creation plus
> ~200-400 ns for 3-5 attribute sets. Context injection is ~200 ns (confirmed by benchmarks).
> On production hardware, expect ~2.6 μs total (~1.3% overhead) due to faster span creation (~500-600 ns).
### 3.4.3 Consensus Round Overhead
| Operation | Count | Cost (ns) | Total |
| ---------------------- | ----- | --------- | ---------- |
| consensus.round span | 1 | ~1200 | ~1.2 μs |
| consensus.phase spans | 3 | ~1100 | ~3.3 μs |
| proposal.receive spans | ~20 | ~1100 | ~22 μs |
| proposal.send spans | ~3 | ~1100 | ~3.3 μs |
| Context operations | ~30 | ~200 | ~6 μs |
| **TOTAL** | | | **~36 μs** |
> **Why higher**: Each span costs ~1,000 ns creation + ~100-200 ns for 1-2 attributes, totaling ~1,100-1,200 ns.
> Context operations remain ~200 ns (confirmed by benchmarks). On production hardware, expect ~24 μs total.
**Overhead percentage**: 36 μs / 3s (typical round) = **~0.001%** (negligible)
### 3.4.4 RPC Request Overhead
| Operation | Cost (ns) |
| ---------------- | ------------ |
| rpc.request span | ~1200 |
| rpc.command span | ~1100 |
| Context extract | ~250 |
| Context inject | ~200 |
| **TOTAL** | **~2.75 μs** |
> **Why higher**: Each span costs ~1,000 ns creation + ~100-200 ns for attributes (command name,
> version, role). Context extract/inject costs are confirmed by OTel C++ benchmarks.
- Fast RPC (1ms): 2.75 μs / 1ms = **~0.275%**
- Slow RPC (100ms): 2.75 μs / 100ms = **~0.003%**
---
## 3.5 Memory Overhead Analysis
> **OTLP** = OpenTelemetry Protocol
### 3.5.1 Static Memory
| Component | Size | Allocated |
| ------------------------------------ | ----------- | ---------- |
| TracerProvider singleton | ~64 KB | At startup |
| BatchSpanProcessor (circular buffer) | ~16 KB | At startup |
| BatchSpanProcessor (worker thread) | ~8 MB | At startup |
| OTLP exporter (gRPC channel init) | ~256 KB | At startup |
| Propagator registry | ~8 KB | At startup |
| **Total static** | **~8.3 MB** | |
> **Why higher than earlier estimate**: The BatchSpanProcessor's circular buffer itself is only ~16 KB
> (2049 x 8-byte `AtomicUniquePtr` entries), but it spawns a dedicated worker thread whose default
> stack size on Linux is ~8 MB. The OTLP gRPC exporter allocates memory for channel stubs and TLS
> initialization. The worker thread stack dominates the static footprint.
### 3.5.2 Dynamic Memory
| Component | Size per unit | Max units | Peak |
| -------------------- | -------------- | ---------- | --------------- |
| Active span | ~500-800 bytes | 1000 | ~500-800 KB |
| Queued span (export) | ~500 bytes | 2048 | ~1 MB |
| Attribute storage | ~80 bytes | 5 per span | Included |
| Context storage | ~64 bytes | Per thread | ~6.4 KB |
| **Total dynamic** | | | **~1.5-1.8 MB** |
> **Why active spans are larger**: An active `Span` object includes the wrapper (~88 bytes: shared_ptr,
> mutex, unique_ptr to Recordable) plus `SpanData` (~250 bytes: SpanContext, timestamps, name, status,
> empty containers) plus attribute storage (~200-500 bytes for 3-5 string attributes in a `std::map`).
> Source: `sdk/src/trace/span.h` and `sdk/include/opentelemetry/sdk/trace/span_data.h`.
> Queued spans release the wrapper, keeping only `SpanData` + attributes (~500 bytes).
### 3.5.3 Memory Growth Characteristics
```mermaid
---
config:
xyChart:
width: 700
height: 400
---
xychart-beta
title "Memory Usage vs Span Rate (bounded by queue limit)"
x-axis "Spans/second" [0, 200, 400, 600, 800, 1000]
y-axis "Memory (MB)" 0 --> 12
line [8.5, 9.2, 9.6, 9.9, 10.0, 10.0]
```
**Notes**:
- Memory increases with span rate but **plateaus at queue capacity** (default 2048 spans)
- Batch export prevents unbounded growth
- At queue limit, oldest spans are dropped (not blocked)
- Maximum memory is bounded: ~8.3 MB static (dominated by worker thread stack) + 2048 queued spans x ~500 bytes (~1 MB) + active spans (~0.8 MB) ≈ **~10 MB ceiling**
- The worker thread stack (~8 MB) is virtual memory; actual RSS depends on stack usage (typically much less)
### 3.5.4 Performance Data Sources
The overhead estimates in Sections 3.3-3.5 are derived from the following sources:
| Source | What it covers | URL |
| ------------------------------------------------ | ----------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------ |
| OTel C++ SDK CI benchmarks (969 runs) | Span creation, context activation, sampler overhead | [Benchmark Dashboard](https://open-telemetry.github.io/opentelemetry-cpp/benchmarks/) |
| `api/test/trace/span_benchmark.cc` | API-level span creation (~22 ns no-op) | [Source](https://github.com/open-telemetry/opentelemetry-cpp/blob/main/api/test/trace/span_benchmark.cc) |
| `sdk/test/trace/sampler_benchmark.cc` | SDK span creation with samplers (~1,000 ns AlwaysOn) | [Source](https://github.com/open-telemetry/opentelemetry-cpp/blob/main/sdk/test/trace/sampler_benchmark.cc) |
| `sdk/include/.../span_data.h` | SpanData memory layout (~250 bytes base) | [Source](https://github.com/open-telemetry/opentelemetry-cpp/blob/main/sdk/include/opentelemetry/sdk/trace/span_data.h) |
| `sdk/src/trace/span.h` | Span wrapper memory layout (~88 bytes) | [Source](https://github.com/open-telemetry/opentelemetry-cpp/blob/main/sdk/src/trace/span.h) |
| `sdk/include/.../batch_span_processor_options.h` | Default queue size (2048), batch size (512) | [Source](https://github.com/open-telemetry/opentelemetry-cpp/blob/main/sdk/include/opentelemetry/sdk/trace/batch_span_processor_options.h) |
| `sdk/include/.../circular_buffer.h` | CircularBuffer implementation (AtomicUniquePtr array) | [Source](https://github.com/open-telemetry/opentelemetry-cpp/blob/main/sdk/include/opentelemetry/sdk/common/circular_buffer.h) |
| OTLP proto definition | Serialized span size estimation | [Proto](https://github.com/open-telemetry/opentelemetry-proto/blob/main/opentelemetry/proto/trace/v1/trace.proto) |
---
## 3.6 Network Overhead Analysis
### 3.6.1 Export Bandwidth
> **Bytes per span**: Estimates use ~500 bytes/span (conservative upper bound). OTLP protobuf analysis
> shows a typical span with 3-5 string attributes serializes to ~200-300 bytes raw; with gzip
> compression (~60-70% of raw) and batching (amortized headers), ~350 bytes/span is more realistic.
> The table uses the conservative estimate for capacity planning.
| Sampling Rate | Spans/sec | Bandwidth | Notes |
| ------------- | --------- | --------- | ---------------- |
| 100% | ~500 | ~250 KB/s | Development only |
| 10% | ~50 | ~25 KB/s | Staging |
| 1% | ~5 | ~2.5 KB/s | Production |
| Error-only | ~1 | ~0.5 KB/s | Minimal overhead |
### 3.6.2 Trace Context Propagation
| Message Type | Context Size | Messages/sec | Overhead |
| ---------------------- | ------------ | ------------ | ----------- |
| TMTransaction | 25 bytes | ~100 | ~2.5 KB/s |
| TMProposeSet | 25 bytes | ~10 | ~250 B/s |
| TMValidation | 25 bytes | ~50 | ~1.25 KB/s |
| **Total P2P overhead** | | | **~4 KB/s** |
---
## 3.7 Optimization Strategies
### 3.7.1 Sampling Strategies
#### Tail Sampling
```mermaid
flowchart TD
trace["New Trace"]
trace --> errors{"Is Error?"}
errors -->|Yes| sample["SAMPLE"]
errors -->|No| consensus{"Is Consensus?"}
consensus -->|Yes| sample
consensus -->|No| slow{"Is Slow?"}
slow -->|Yes| sample
slow -->|No| prob{"Random < 10%?"}
prob -->|Yes| sample
prob -->|No| drop["DROP"]
style sample fill:#4caf50,stroke:#388e3c,color:#fff
style drop fill:#f44336,stroke:#c62828,color:#fff
```
### 3.7.2 Batch Tuning Recommendations
| Environment | Batch Size | Batch Delay | Max Queue |
| ------------------ | ---------- | ----------- | --------- |
| Low-latency | 128 | 1000ms | 512 |
| High-throughput | 1024 | 10000ms | 8192 |
| Memory-constrained | 256 | 2000ms | 512 |
### 3.7.3 Conditional Instrumentation
```cpp
// Compile-time feature flag
#ifndef XRPL_ENABLE_TELEMETRY
// Zero-cost when disabled
#define XRPL_TRACE_SPAN(t, n) ((void)0)
#endif
// Runtime component filtering
if (telemetry.shouldTracePeer())
{
XRPL_TRACE_SPAN(telemetry, "peer.message.receive");
// ... instrumentation
}
// No overhead when component tracing disabled
```
---
## 3.8 Links to Detailed Documentation
- **[Code Samples](./04-code-samples.md)**: Complete implementation code for all components
- **[Configuration Reference](./05-configuration-reference.md)**: Configuration options and collector setup
- **[Implementation Phases](./06-implementation-phases.md)**: Detailed timeline and milestones
---
## 3.9 Code Intrusiveness Assessment
> **TxQ** = Transaction Queue
This section provides a detailed assessment of how intrusive the OpenTelemetry integration is to the existing xrpld codebase.
### 3.9.1 Files Modified Summary
| Component | Files Modified | Lines Added | Lines Changed | Architectural Impact |
| --------------------- | -------------- | ----------- | ------------- | -------------------- |
| **Core Telemetry** | 5 new files | ~800 | 0 | None (new module) |
| **Application Init** | 2 files | ~30 | ~5 | Minimal |
| **RPC Layer** | 3 files | ~80 | ~20 | Minimal |
| **Transaction Relay** | 4 files | ~120 | ~40 | Low |
| **Consensus** | 3 files | ~100 | ~30 | Low-Medium |
| **Protocol Buffers** | 1 file | ~25 | 0 | Low |
| **CMake/Build** | 3 files | ~50 | ~10 | Minimal |
| **PathFinding** | 2 | ~80 | ~5 | Minimal |
| **TxQ/Fee** | 2 | ~60 | ~5 | Minimal |
| **Validator/Amend** | 3 | ~40 | ~5 | Minimal |
| **Total** | **~28 files** | **~1,490** | **~120** | **Low** |
### 3.9.2 Detailed File Impact
```mermaid
pie title Code Changes by Component
"New Telemetry Module" : 800
"Transaction Relay" : 160
"Consensus" : 130
"RPC Layer" : 100
"PathFinding" : 80
"TxQ/Fee" : 60
"Validator/Amendment" : 40
"Application Init" : 35
"Protocol Buffers" : 25
"Build System" : 60
```
#### New Files (No Impact on Existing Code)
| File | Lines | Purpose |
| ---------------------------------------------- | ----- | -------------------- |
| `include/xrpl/telemetry/Telemetry.h` | ~160 | Main interface |
| `include/xrpl/telemetry/SpanGuard.h` | ~120 | RAII wrapper |
| `include/xrpl/telemetry/TraceContext.h` | ~80 | Context propagation |
| `src/xrpld/telemetry/TracingInstrumentation.h` | ~60 | Macros |
| `src/libxrpl/telemetry/Telemetry.cpp` | ~200 | Implementation |
| `src/libxrpl/telemetry/TelemetryConfig.cpp` | ~60 | Config parsing |
| `src/libxrpl/telemetry/NullTelemetry.cpp` | ~40 | No-op implementation |
#### Modified Files (Existing Xrpld Code)
| File | Lines Added | Lines Changed | Risk Level |
| ------------------------------------------------- | ----------- | ------------- | ---------- |
| `src/xrpld/app/main/Application.cpp` | ~15 | ~3 | Low |
| `include/xrpl/core/ServiceRegistry.h` | ~5 | ~2 | Low |
| `src/xrpld/rpc/detail/ServerHandler.cpp` | ~40 | ~10 | Low |
| `src/xrpld/rpc/handlers/*.cpp` | ~30 | ~8 | Low |
| `src/xrpld/overlay/detail/PeerImp.cpp` | ~60 | ~15 | Medium |
| `src/xrpld/overlay/detail/OverlayImpl.cpp` | ~30 | ~10 | Medium |
| `src/xrpld/app/consensus/RCLConsensus.cpp` | ~50 | ~15 | Medium |
| `src/xrpld/app/consensus/RCLConsensusAdaptor.cpp` | ~40 | ~12 | Medium |
| `src/xrpld/core/JobQueue.cpp` | ~20 | ~5 | Low |
| `src/xrpld/app/paths/PathRequest.cpp` | ~40 | ~3 | Low |
| `src/xrpld/app/paths/Pathfinder.cpp` | ~40 | ~2 | Low |
| `src/xrpld/app/misc/TxQ.cpp` | ~40 | ~3 | Low |
| `src/xrpld/app/main/LoadManager.cpp` | ~20 | ~2 | Low |
| `src/xrpld/app/misc/ValidatorList.cpp` | ~20 | ~2 | Low |
| `src/xrpld/app/misc/AmendmentTable.cpp` | ~10 | ~2 | Low |
| `src/xrpld/app/misc/Manifest.cpp` | ~10 | ~1 | Low |
| `src/xrpld/shamap/SHAMap.cpp` | ~20 | ~3 | Low |
| `src/xrpld/overlay/detail/ripple.proto` | ~25 | 0 | Low |
| `CMakeLists.txt` | ~40 | ~8 | Low |
| `cmake/FindOpenTelemetry.cmake` | ~50 | 0 | None (new) |
### 3.9.3 Risk Assessment by Component
<div align="center">
**Do First** ↖ ↗ **Plan Carefully**
```mermaid
quadrantChart
title Code Intrusiveness Risk Matrix
x-axis Low Risk --> High Risk
y-axis Low Value --> High Value
RPC Tracing: [0.2, 0.55]
Transaction Relay: [0.55, 0.85]
Consensus Tracing: [0.75, 0.92]
Peer Message Tracing: [0.85, 0.35]
JobQueue Context: [0.3, 0.42]
Ledger Acquisition: [0.48, 0.65]
PathFinding: [0.38, 0.72]
TxQ and Fees: [0.25, 0.62]
Validator Mgmt: [0.15, 0.35]
```
**Optional** ↙ ↘ **Avoid**
</div>
#### Risk Level Definitions
| Risk Level | Definition | Mitigation |
| ---------- | ---------------------------------------------------------------- | ---------------------------------- |
| **Low** | Additive changes only; no modification to existing logic | Standard code review |
| **Medium** | Minor modifications to existing functions; clear boundaries | Comprehensive unit tests |
| **High** | Changes to core logic or data structures; potential side effects | Integration tests + staged rollout |
### 3.9.4 Architectural Impact Assessment
| Aspect | Impact | Justification |
| -------------------- | ------- | -------------------------------------------------------------------------------- |
| **Data Flow** | Minimal | Read-only instrumentation; no modification to consensus or transaction data flow |
| **Threading Model** | Minimal | Context propagation uses thread-local storage (standard OTel pattern) |
| **Memory Model** | Low | Bounded queues prevent unbounded growth; RAII ensures cleanup |
| **Network Protocol** | Low | Optional fields in protobuf (high field numbers); backward compatible |
| **Configuration** | None | New config section; existing configs unaffected |
| **Build System** | Low | Optional CMake flag; builds work without OpenTelemetry |
| **Dependencies** | Low | OpenTelemetry SDK is optional; null implementation when disabled |
### 3.9.5 Backward Compatibility
| Compatibility | Status | Notes |
| --------------- | ------- | ----------------------------------------------------- |
| **Config File** | ✅ Full | New `[telemetry]` section is optional |
| **Protocol** | ✅ Full | Optional protobuf fields with high field numbers |
| **Build** | ✅ Full | `XRPL_ENABLE_TELEMETRY=OFF` produces identical binary |
| **Runtime** | ✅ Full | `enabled=0` produces zero overhead |
| **API** | ✅ Full | No changes to public RPC or P2P APIs |
### 3.9.6 Rollback Strategy
If issues are discovered after deployment:
1. **Immediate**: Set `enabled=0` in config and restart (zero code change)
2. **Quick**: Rebuild with `XRPL_ENABLE_TELEMETRY=OFF`
3. **Complete**: Revert telemetry commits (clean separation makes this easy)
### 3.9.7 Code Change Examples
**Minimal RPC Instrumentation (Low Intrusiveness):**
```cpp
// Before
void ServerHandler::onRequest(...) {
auto result = processRequest(req);
send(result);
}
// After (only ~10 lines added)
void ServerHandler::onRequest(...) {
XRPL_TRACE_RPC(app_.getTelemetry(), "rpc.request"); // +1 line
XRPL_TRACE_SET_ATTR("xrpl.rpc.command", command); // +1 line
auto result = processRequest(req);
XRPL_TRACE_SET_ATTR("xrpl.rpc.status", status); // +1 line
send(result);
}
```
**Consensus Instrumentation (Medium Intrusiveness):**
```cpp
// Before
void RCLConsensusAdaptor::startRound(...) {
// ... existing logic
}
// After (context storage required)
void RCLConsensusAdaptor::startRound(...) {
XRPL_TRACE_CONSENSUS(app_.getTelemetry(), "consensus.round");
XRPL_TRACE_SET_ATTR("xrpl.consensus.ledger.seq", seq);
// Store context for child spans in phase transitions
currentRoundContext_ = _xrpl_guard_->context(); // New member variable
// ... existing logic unchanged
}
```
---
_Previous: [Design Decisions](./02-design-decisions.md)_ | _Next: [Code Samples](./04-code-samples.md)_ | _Back to: [Overview](./OpenTelemetryPlan.md)_

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,972 @@
# Configuration Reference
> **Parent Document**: [OpenTelemetryPlan.md](./OpenTelemetryPlan.md)
> **Related**: [Code Samples](./04-code-samples.md) | [Implementation Phases](./06-implementation-phases.md)
---
## 5.1 xrpld Configuration
> **OTLP** = OpenTelemetry Protocol | **TxQ** = Transaction Queue
### 5.1.1 Configuration File Section
Add to `cfg/xrpld-example.cfg`:
```ini
# ═══════════════════════════════════════════════════════════════════════════════
# TELEMETRY (OpenTelemetry Distributed Tracing)
# ═══════════════════════════════════════════════════════════════════════════════
#
# Enables distributed tracing for transaction flow, consensus, and RPC calls.
# Traces are exported to an OpenTelemetry Collector using OTLP protocol.
#
# [telemetry]
#
# # Enable/disable telemetry (default: 0 = disabled)
# enabled=1
#
# # OTLP endpoint (default: http://localhost:4318/v1/traces - OTLP/HTTP)
# # Note: only OTLP/HTTP is shipped in Phase 1b. OTLP/gRPC support is
# # planned as future work and is not yet parsed by TelemetryConfig.cpp.
# endpoint=http://localhost:4318/v1/traces
#
# # Use TLS for exporter connection (default: 0)
# use_tls=0
#
# # Path to CA certificate for TLS (optional)
# # tls_ca_cert=/path/to/ca.crt
#
# # Sampling ratio: 0.0-1.0 (default: 1.0 = 100% sampling)
# # Use lower values in production to reduce overhead
# # Default: 1.0 (all traces). For production deployments with high
# # throughput, 0.1 (10%) is recommended to reduce overhead.
# # See Section 7.4.2 for sampling strategy details.
# sampling_ratio=0.1
#
# # Batch processor settings
# batch_size=512 # Spans per batch (default: 512)
# batch_delay_ms=5000 # Max delay before sending batch (default: 5000)
# max_queue_size=2048 # Max queued spans (default: 2048)
#
# # Component-specific tracing (default: all enabled except peer)
# trace_transactions=1 # Transaction relay and processing
# trace_consensus=1 # Consensus rounds and proposals
# trace_rpc=1 # RPC request handling
# trace_peer=0 # Peer messages (high volume, disabled by default)
# trace_ledger=1 # Ledger acquisition and building
#
# # Planned (not yet parsed by TelemetryConfig.cpp):
# # trace_pathfind=1 # Path computation (Phase 2)
# # trace_txq=1 # Transaction queue (Phase 3)
# # trace_validator=0 # Validator list / manifest (future)
# # trace_amendment=0 # Amendment voting (future)
#
# # Service identification (automatically detected if not specified)
# # service_name=xrpld
# # service_instance_id=<node_public_key>
[telemetry]
enabled=0
```
### 5.1.2 Configuration Options Summary
| Option | Type | Default | Description |
| --------------------- | ------ | --------------------------------- | ----------------------------------------- |
| `enabled` | bool | `false` | Enable/disable telemetry |
| `endpoint` | string | `http://localhost:4318/v1/traces` | OTLP/HTTP collector endpoint |
| `use_tls` | bool | `false` | Enable TLS for exporter connection |
| `tls_ca_cert` | string | `""` | Path to CA certificate file |
| `sampling_ratio` | float | `1.0` | Sampling ratio (0.0-1.0) |
| `batch_size` | uint | `512` | Spans per export batch |
| `batch_delay_ms` | uint | `5000` | Max delay before sending batch (ms) |
| `max_queue_size` | uint | `2048` | Maximum queued spans |
| `trace_transactions` | bool | `true` | Enable transaction tracing |
| `trace_consensus` | bool | `true` | Enable consensus tracing |
| `trace_rpc` | bool | `true` | Enable RPC tracing |
| `trace_peer` | bool | `false` | Enable peer message tracing (high volume) |
| `trace_ledger` | bool | `true` | Enable ledger tracing |
| `service_name` | string | `"xrpld"` | Service name for traces |
| `service_instance_id` | string | `<node_pubkey>` | Instance identifier |
**Planned (not yet implemented)**: the following options appear in the design
documents but are not parsed by `TelemetryConfig.cpp` in Phase 1b and later
phases. They will be added as the corresponding subsystems are instrumented:
| Option | Planned Phase | Purpose |
| ----------------- | ------------- | ---------------------------------------- |
| `exporter` | Future | Select between OTLP/HTTP and OTLP/gRPC |
| `trace_pathfind` | Phase 2 | Path computation tracing toggle |
| `trace_txq` | Phase 3 | Transaction queue tracing toggle |
| `trace_validator` | Future | Validator list / manifest update tracing |
| `trace_amendment` | Future | Amendment voting tracing |
---
## 5.2 Configuration Parser
> **TxQ** = Transaction Queue
```cpp
// src/libxrpl/telemetry/TelemetryConfig.cpp
#include <xrpl/telemetry/Telemetry.h>
#include <xrpl/basics/Log.h>
namespace xrpl {
namespace telemetry {
Telemetry::Setup
setup_Telemetry(
Section const& section,
std::string const& nodePublicKey,
std::string const& version)
{
Telemetry::Setup setup;
// Basic settings
setup.enabled = section.value_or("enabled", false);
setup.serviceName = section.value_or("service_name", "xrpld");
setup.serviceVersion = version;
setup.serviceInstanceId = section.value_or(
"service_instance_id", nodePublicKey);
// Exporter settings
setup.exporterType = section.value_or("exporter", "otlp_grpc");
if (setup.exporterType == "otlp_grpc")
setup.exporterEndpoint = section.value_or("endpoint", "localhost:4317");
else if (setup.exporterType == "otlp_http")
setup.exporterEndpoint = section.value_or("endpoint", "localhost:4318");
setup.useTls = section.value_or("use_tls", false);
setup.tlsCertPath = section.value_or("tls_ca_cert", "");
// Sampling
setup.samplingRatio = section.value_or("sampling_ratio", 1.0);
if (setup.samplingRatio < 0.0 || setup.samplingRatio > 1.0)
{
Throw<std::runtime_error>(
"telemetry.sampling_ratio must be between 0.0 and 1.0");
}
// Batch processor
setup.batchSize = section.value_or("batch_size", 512u);
setup.batchDelay = std::chrono::milliseconds{
section.value_or("batch_delay_ms", 5000u)};
setup.maxQueueSize = section.value_or("max_queue_size", 2048u);
// Component filtering
setup.traceTransactions = section.value_or("trace_transactions", true);
setup.traceConsensus = section.value_or("trace_consensus", true);
setup.traceRpc = section.value_or("trace_rpc", true);
setup.tracePeer = section.value_or("trace_peer", false);
setup.traceLedger = section.value_or("trace_ledger", true);
setup.tracePathfind = section.value_or("trace_pathfind", true);
setup.traceTxQ = section.value_or("trace_txq", true);
setup.traceValidator = section.value_or("trace_validator", false);
setup.traceAmendment = section.value_or("trace_amendment", false);
return setup;
}
} // namespace telemetry
} // namespace xrpl
```
---
## 5.3 Application Integration
### 5.3.1 ApplicationImp Changes
> **Deferred identity**: The node public key (`nodeIdentity_`) is not
> available during `ApplicationImp`'s member initializer list — it is
> resolved later in `setup()`. The `Telemetry` object is therefore
> constructed with an empty `serviceInstanceId` and patched via
> `setServiceInstanceId()` once `setup()` has called `getNodeIdentity()`.
```cpp
// src/xrpld/app/main/Application.cpp (modified)
#include <xrpl/telemetry/Telemetry.h>
class ApplicationImp : public Application, public BasicApp
{
// ... existing members (perfLog_, etc.) ...
// Telemetry — constructed in the member initializer list with
// an empty serviceInstanceId, patched in setup().
std::unique_ptr<telemetry::Telemetry> telemetry_;
// Member initializer list (excerpt):
// ...
// , telemetry_(
// telemetry::make_Telemetry(
// telemetry::setup_Telemetry(
// config_->section("telemetry"),
// "", // Updated later via setServiceInstanceId()
// BuildInfo::getVersionString()),
// logs_->journal("Telemetry")))
// ...
bool setup(...) override
{
// ... existing setup code ...
nodeIdentity_ = getNodeIdentity(*this, cmdline);
// Inject node identity into telemetry resource attributes,
// unless the user already set a custom service_instance_id.
if (!config_->section("telemetry").exists("service_instance_id"))
telemetry_->setServiceInstanceId(
toBase58(TokenType::NodePublic, nodeIdentity_->first));
// ... rest of setup ...
}
void start(bool withTimers) override
{
// ... existing start code ...
telemetry_->start();
}
void run() override
{
// ... existing run/shutdown code ...
telemetry_->stop();
}
telemetry::Telemetry&
getTelemetry() override
{
return *telemetry_;
}
};
```
### 5.3.2 ServiceRegistry Interface Addition
```cpp
// include/xrpl/core/ServiceRegistry.h (modified)
namespace telemetry {
class Telemetry;
} // namespace telemetry
class ServiceRegistry
{
public:
// ... existing virtual methods ...
/** Get the telemetry system for distributed tracing. */
virtual telemetry::Telemetry&
getTelemetry() = 0;
};
```
> **Note:** `Application` extends `ServiceRegistry`, so `getTelemetry()` is
> available on both. Components that hold a `ServiceRegistry&` (e.g.
> `NetworkOPsImp`) call `registry_.get().getTelemetry()`. Components that
> still hold an `Application&` (e.g. `ServerHandler`, `PeerImp`,
> `RCLConsensusAdaptor`) call `app_.getTelemetry()` directly.
---
## 5.4 CMake Integration
> **OTLP** = OpenTelemetry Protocol
### 5.4.1 Find OpenTelemetry Module
```cmake
# cmake/FindOpenTelemetry.cmake
# Find OpenTelemetry C++ SDK
#
# This module defines:
# OpenTelemetry_FOUND - System has OpenTelemetry
# OpenTelemetry::api - API library target
# OpenTelemetry::sdk - SDK library target
# OpenTelemetry::otlp_grpc_exporter - OTLP gRPC exporter target
# OpenTelemetry::otlp_http_exporter - OTLP HTTP exporter target
find_package(opentelemetry-cpp CONFIG QUIET)
if(opentelemetry-cpp_FOUND)
set(OpenTelemetry_FOUND TRUE)
# Create imported targets if not already created by config
if(NOT TARGET OpenTelemetry::api)
add_library(OpenTelemetry::api ALIAS opentelemetry-cpp::api)
endif()
if(NOT TARGET OpenTelemetry::sdk)
add_library(OpenTelemetry::sdk ALIAS opentelemetry-cpp::sdk)
endif()
if(NOT TARGET OpenTelemetry::otlp_grpc_exporter)
add_library(OpenTelemetry::otlp_grpc_exporter ALIAS
opentelemetry-cpp::otlp_grpc_exporter)
endif()
else()
# Try pkg-config fallback
find_package(PkgConfig QUIET)
if(PKG_CONFIG_FOUND)
pkg_check_modules(OTEL opentelemetry-cpp QUIET)
if(OTEL_FOUND)
set(OpenTelemetry_FOUND TRUE)
# Create imported targets from pkg-config
add_library(OpenTelemetry::api INTERFACE IMPORTED)
target_include_directories(OpenTelemetry::api INTERFACE
${OTEL_INCLUDE_DIRS})
endif()
endif()
endif()
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(OpenTelemetry
REQUIRED_VARS OpenTelemetry_FOUND)
```
### 5.4.2 CMakeLists.txt Changes
```cmake
# CMakeLists.txt (additions)
# ═══════════════════════════════════════════════════════════════════════════════
# TELEMETRY OPTIONS
# ═══════════════════════════════════════════════════════════════════════════════
option(XRPL_ENABLE_TELEMETRY
"Enable OpenTelemetry distributed tracing support" OFF)
if(XRPL_ENABLE_TELEMETRY)
find_package(OpenTelemetry REQUIRED)
# Define compile-time flag
add_compile_definitions(XRPL_ENABLE_TELEMETRY)
message(STATUS "OpenTelemetry tracing: ENABLED")
else()
message(STATUS "OpenTelemetry tracing: DISABLED")
endif()
# ═══════════════════════════════════════════════════════════════════════════════
# TELEMETRY LIBRARY
# ═══════════════════════════════════════════════════════════════════════════════
if(XRPL_ENABLE_TELEMETRY)
add_library(xrpl_telemetry
src/libxrpl/telemetry/Telemetry.cpp
src/libxrpl/telemetry/TelemetryConfig.cpp
src/libxrpl/telemetry/TraceContext.cpp
)
target_include_directories(xrpl_telemetry
PUBLIC
${CMAKE_CURRENT_SOURCE_DIR}/include
)
target_link_libraries(xrpl_telemetry
PUBLIC
OpenTelemetry::api
OpenTelemetry::sdk
OpenTelemetry::otlp_grpc_exporter
PRIVATE
xrpl_basics
)
# Add to main library dependencies
target_link_libraries(xrpld PRIVATE xrpl_telemetry)
else()
# Create null implementation library
add_library(xrpl_telemetry
src/libxrpl/telemetry/NullTelemetry.cpp
)
target_include_directories(xrpl_telemetry
PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include
)
endif()
```
---
## 5.5 OpenTelemetry Collector Configuration
> **OTLP** = OpenTelemetry Protocol | **APM** = Application Performance Monitoring
### 5.5.1 Development Configuration
```yaml
# otel-collector-dev.yaml
# Minimal configuration for local development
receivers:
otlp:
protocols:
grpc:
endpoint: 0.0.0.0:4317
http:
endpoint: 0.0.0.0:4318
processors:
batch:
timeout: 1s
send_batch_size: 100
exporters:
# Console output for debugging
logging:
verbosity: detailed
sampling_initial: 5
sampling_thereafter: 200
# Tempo for trace visualization
otlp/tempo:
endpoint: tempo:4317
tls:
insecure: true
service:
pipelines:
traces:
receivers: [otlp]
processors: [batch]
exporters: [logging, otlp/tempo]
```
### 5.5.2 Production Configuration
```yaml
# otel-collector-prod.yaml
# Production configuration with filtering, sampling, and multiple backends
receivers:
otlp:
protocols:
grpc:
endpoint: 0.0.0.0:4317
tls:
cert_file: /etc/otel/server.crt
key_file: /etc/otel/server.key
ca_file: /etc/otel/ca.crt
processors:
# Memory limiter to prevent OOM
memory_limiter:
check_interval: 1s
limit_mib: 1000
spike_limit_mib: 200
# Batch processing for efficiency
batch:
timeout: 5s
send_batch_size: 512
send_batch_max_size: 1024
# Tail-based sampling (keep errors and slow traces)
tail_sampling:
decision_wait: 10s
num_traces: 100000
expected_new_traces_per_sec: 1000
policies:
# Always keep error traces
- name: errors
type: status_code
status_code:
status_codes: [ERROR]
# Keep slow consensus rounds (>5s)
- name: slow-consensus
type: latency
latency:
threshold_ms: 5000
# Keep slow RPC requests (>1s)
- name: slow-rpc
type: and
and:
and_sub_policy:
- name: rpc-spans
type: string_attribute
string_attribute:
key: xrpl.rpc.command
values: [".*"]
enabled_regex_matching: true
- name: latency
type: latency
latency:
threshold_ms: 1000
# Probabilistic sampling for the rest
- name: probabilistic
type: probabilistic
probabilistic:
sampling_percentage: 10
# Attribute processing
attributes:
actions:
# Hash sensitive data
- key: xrpl.tx.account
action: hash
# Add deployment info
- key: deployment.environment
value: production
action: upsert
exporters:
# Grafana Tempo for long-term storage
otlp/tempo:
endpoint: tempo.monitoring:4317
tls:
insecure: false
ca_file: /etc/otel/tempo-ca.crt
# Elastic APM for correlation with logs
otlp/elastic:
endpoint: apm.elastic:8200
headers:
Authorization: "Bearer ${ELASTIC_APM_TOKEN}"
extensions:
health_check:
endpoint: 0.0.0.0:13133
zpages:
endpoint: 0.0.0.0:55679
service:
extensions: [health_check, zpages]
pipelines:
traces:
receivers: [otlp]
processors: [memory_limiter, tail_sampling, attributes, batch]
exporters: [otlp/tempo, otlp/elastic]
```
---
## 5.6 Docker Compose Development Environment
> **OTLP** = OpenTelemetry Protocol
```yaml
# docker-compose-telemetry.yaml
version: "3.8"
services:
# OpenTelemetry Collector
otel-collector:
image: otel/opentelemetry-collector-contrib:0.92.0
container_name: otel-collector
command: ["--config=/etc/otel-collector-config.yaml"]
volumes:
- ./otel-collector-dev.yaml:/etc/otel-collector-config.yaml:ro
ports:
- "4317:4317" # OTLP gRPC
- "4318:4318" # OTLP HTTP
- "13133:13133" # Health check
depends_on:
- tempo
# Tempo for trace visualization
tempo:
image: grafana/tempo:2.6.1
container_name: tempo
ports:
- "3200:3200" # Tempo HTTP API
- "4317" # OTLP gRPC (internal)
# Grafana for dashboards
grafana:
image: grafana/grafana:10.2.3
container_name: grafana
environment:
- GF_AUTH_ANONYMOUS_ENABLED=true
- GF_AUTH_ANONYMOUS_ORG_ROLE=Admin
volumes:
- ./grafana/provisioning:/etc/grafana/provisioning:ro
- ./grafana/dashboards:/var/lib/grafana/dashboards:ro
ports:
- "3000:3000"
depends_on:
- tempo
# Prometheus for metrics (optional, for correlation)
prometheus:
image: prom/prometheus:v2.48.1
container_name: prometheus
volumes:
- ./prometheus.yaml:/etc/prometheus/prometheus.yml:ro
ports:
- "9090:9090"
networks:
default:
name: xrpld-telemetry
```
---
## 5.7 Configuration Architecture
> **OTLP** = OpenTelemetry Protocol
```mermaid
flowchart TB
subgraph config["Configuration Sources"]
cfgFile["xrpld.cfg<br/>[telemetry] section"]
cmake["CMake<br/>XRPL_ENABLE_TELEMETRY"]
end
subgraph init["Initialization"]
parse["setup_Telemetry()"]
factory["make_Telemetry()"]
end
subgraph runtime["Runtime Components"]
tracer["TracerProvider"]
exporter["OTLP Exporter"]
processor["BatchProcessor"]
end
subgraph collector["Collector Pipeline"]
recv["Receivers"]
proc["Processors"]
exp["Exporters"]
end
cfgFile --> parse
cmake -->|"compile flag"| parse
parse --> factory
factory --> tracer
tracer --> processor
processor --> exporter
exporter -->|"OTLP"| recv
recv --> proc
proc --> exp
style config fill:#e3f2fd,stroke:#1976d2
style runtime fill:#e8f5e9,stroke:#388e3c
style collector fill:#fff3e0,stroke:#ff9800
```
**Reading the diagram:**
- **Configuration Sources**: `xrpld.cfg` provides runtime settings (endpoint, sampling) while the CMake flag controls whether telemetry is compiled in at all.
- **Initialization**: `setup_Telemetry()` parses config values, then `make_Telemetry()` constructs the provider, processor, and exporter objects.
- **Runtime Components**: The `TracerProvider` creates spans, the `BatchProcessor` buffers them, and the `OTLP Exporter` serializes and sends them over the wire.
- **OTLP arrow to Collector**: Trace data leaves the xrpld process via OTLP (gRPC or HTTP) and enters the external Collector pipeline.
- **Collector Pipeline**: `Receivers` ingest OTLP data, `Processors` apply sampling/filtering/enrichment, and `Exporters` forward traces to storage backends (Tempo, etc.).
---
## 5.8 Grafana Integration
> **APM** = Application Performance Monitoring
Step-by-step instructions for integrating xrpld traces with Grafana.
### 5.8.1 Data Source Configuration
#### Tempo (Recommended)
```yaml
# grafana/provisioning/datasources/tempo.yaml
apiVersion: 1
datasources:
- name: Tempo
type: tempo
access: proxy
url: http://tempo:3200
jsonData:
httpMethod: GET
tracesToLogs:
datasourceUid: loki
tags: ["service.name", "xrpl.tx.hash"]
mappedTags: [{ key: "trace_id", value: "traceID" }]
mapTagNamesEnabled: true
filterByTraceID: true
serviceMap:
datasourceUid: prometheus
nodeGraph:
enabled: true
search:
hide: false
lokiSearch:
datasourceUid: loki
```
#### Elastic APM
```yaml
# grafana/provisioning/datasources/elastic-apm.yaml
apiVersion: 1
datasources:
- name: Elasticsearch-APM
type: elasticsearch
access: proxy
url: http://elasticsearch:9200
database: "apm-*"
jsonData:
esVersion: "8.0.0"
timeField: "@timestamp"
logMessageField: message
logLevelField: log.level
```
### 5.8.2 Dashboard Provisioning
```yaml
# grafana/provisioning/dashboards/dashboards.yaml
apiVersion: 1
providers:
- name: "xrpld-dashboards"
orgId: 1
folder: "xrpld"
folderUid: "xrpld"
type: file
disableDeletion: false
updateIntervalSeconds: 30
options:
path: /var/lib/grafana/dashboards/rippled
```
### 5.8.3 Example Dashboard: RPC Performance
```json
{
"title": "xrpld RPC Performance",
"uid": "xrpld-rpc-performance",
"panels": [
{
"title": "RPC Latency by Command",
"type": "heatmap",
"datasource": "Tempo",
"targets": [
{
"queryType": "traceql",
"query": "{resource.service.name=\"xrpld\" && span.xrpl.rpc.command != \"\"} | histogram_over_time(duration) by (span.xrpl.rpc.command)"
}
],
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 }
},
{
"title": "RPC Error Rate",
"type": "timeseries",
"datasource": "Tempo",
"targets": [
{
"queryType": "traceql",
"query": "{resource.service.name=\"xrpld\" && status.code=error} | rate() by (span.xrpl.rpc.command)"
}
],
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 }
},
{
"title": "Top 10 Slowest RPC Commands",
"type": "table",
"datasource": "Tempo",
"targets": [
{
"queryType": "traceql",
"query": "{resource.service.name=\"xrpld\" && span.xrpl.rpc.command != \"\"} | avg(duration) by (span.xrpl.rpc.command) | topk(10)"
}
],
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 8 }
},
{
"title": "Recent Traces",
"type": "table",
"datasource": "Tempo",
"targets": [
{
"queryType": "traceql",
"query": "{resource.service.name=\"xrpld\"}"
}
],
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 16 }
}
]
}
```
### 5.8.4 Example Dashboard: Transaction Tracing
```json
{
"title": "xrpld Transaction Tracing",
"uid": "xrpld-tx-tracing",
"panels": [
{
"title": "Transaction Throughput",
"type": "stat",
"datasource": "Tempo",
"targets": [
{
"queryType": "traceql",
"query": "{resource.service.name=\"xrpld\" && name=\"tx.receive\"} | rate()"
}
],
"gridPos": { "h": 4, "w": 6, "x": 0, "y": 0 }
},
{
"title": "Cross-Node Relay Count",
"type": "timeseries",
"datasource": "Tempo",
"targets": [
{
"queryType": "traceql",
"query": "{resource.service.name=\"xrpld\" && name=\"tx.relay\"} | avg(span.xrpl.tx.relay_count)"
}
],
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 4 }
},
{
"title": "Transaction Validation Errors",
"type": "table",
"datasource": "Tempo",
"targets": [
{
"queryType": "traceql",
"query": "{resource.service.name=\"xrpld\" && name=\"tx.validate\" && status.code=error}"
}
],
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 4 }
}
]
}
```
### 5.8.5 TraceQL Query Examples
Common queries for xrpld traces:
```
# Find all traces for a specific transaction hash
{resource.service.name="xrpld" && span.xrpl.tx.hash="ABC123..."}
# Find slow RPC commands (>100ms)
{resource.service.name="xrpld" && name=~"rpc.command.*"} | duration > 100ms
# Find consensus rounds taking >5 seconds
{resource.service.name="xrpld" && name="consensus.round"} | duration > 5s
# Find failed transactions with error details
{resource.service.name="xrpld" && name="tx.validate" && status.code=error}
# Find transactions relayed to many peers
{resource.service.name="xrpld" && name="tx.relay"} | span.xrpl.tx.relay_count > 10
# Compare latency across nodes
{resource.service.name="xrpld" && name="rpc.command.account_info"} | avg(duration) by (resource.service.instance.id)
```
### 5.8.6 Correlation with PerfLog
To correlate OpenTelemetry traces with existing PerfLog data:
**Step 1: Configure Loki to ingest PerfLog**
```yaml
# promtail-config.yaml
scrape_configs:
- job_name: xrpld-perflog
static_configs:
- targets:
- localhost
labels:
job: xrpld
__path__: /var/log/rippled/perf*.log
pipeline_stages:
- json:
expressions:
trace_id: trace_id
ledger_seq: ledger_seq
tx_hash: tx_hash
- labels:
trace_id:
ledger_seq:
tx_hash:
```
**Step 2: Add trace_id to PerfLog entries**
Modify PerfLog to include trace_id when available:
```cpp
// In PerfLog output, add trace_id from current span context
void logPerf(Json::Value& entry) {
auto span = opentelemetry::trace::GetSpan(
opentelemetry::context::RuntimeContext::GetCurrent());
if (span && span->GetContext().IsValid()) {
char traceIdHex[33];
span->GetContext().trace_id().ToLowerBase16(traceIdHex);
entry["trace_id"] = std::string(traceIdHex, 32);
}
// ... existing logging
}
```
**Step 3: Configure Grafana trace-to-logs link**
In Tempo data source configuration, set up the derived field:
```yaml
jsonData:
tracesToLogs:
datasourceUid: loki
tags: ["trace_id", "xrpl.tx.hash"]
filterByTraceID: true
filterBySpanID: false
```
### 5.8.7 Correlation with Insight/StatsD Metrics
To correlate traces with existing Beast Insight metrics:
**Step 1: Export Insight metrics to Prometheus**
```yaml
# prometheus.yaml
scrape_configs:
- job_name: "xrpld-statsd"
static_configs:
- targets: ["statsd-exporter:9102"]
```
**Step 2: Add exemplars to metrics**
OpenTelemetry SDK automatically adds exemplars (trace IDs) to metrics when using the Prometheus exporter. This links metrics spikes to specific traces.
**Step 3: Configure Grafana metric-to-trace link**
```yaml
# In Prometheus data source
jsonData:
exemplarTraceIdDestinations:
- name: trace_id
datasourceUid: tempo
```
**Step 4: Dashboard panel with exemplars**
```json
{
"title": "RPC Latency with Trace Links",
"type": "timeseries",
"datasource": "Prometheus",
"targets": [
{
"expr": "histogram_quantile(0.99, rate(xrpld_rpc_duration_seconds_bucket[5m]))",
"exemplar": true
}
]
}
```
This allows clicking on metric data points to jump directly to the related trace.
---
_Previous: [Code Samples](./04-code-samples.md)_ | _Next: [Implementation Phases](./06-implementation-phases.md)_ | _Back to: [Overview](./OpenTelemetryPlan.md)_

View File

@@ -0,0 +1,575 @@
# Implementation Phases
> **Parent Document**: [OpenTelemetryPlan.md](./OpenTelemetryPlan.md)
> **Related**: [Configuration Reference](./05-configuration-reference.md) | [Observability Backends](./07-observability-backends.md)
---
## 6.1 Phase Overview
> **TxQ** = Transaction Queue
```mermaid
gantt
title OpenTelemetry Implementation Timeline
dateFormat YYYY-MM-DD
axisFormat Week %W
section Phase 1
Core Infrastructure :p1, 2024-01-01, 2w
SDK Integration :p1a, 2024-01-01, 4d
Telemetry Interface :p1b, after p1a, 3d
Configuration & CMake :p1c, after p1b, 3d
Unit Tests :p1d, after p1c, 2d
Buffer & Integration :p1e, after p1d, 2d
section Phase 2
RPC Tracing :p2, after p1, 2w
HTTP Context Extraction :p2a, after p1, 2d
RPC Handler Instrumentation :p2b, after p2a, 4d
PathFinding Instrumentation :p2f, after p2b, 2d
TxQ Instrumentation :p2g, after p2f, 2d
WebSocket Support :p2c, after p2g, 2d
Integration Tests :p2d, after p2c, 2d
Buffer & Review :p2e, after p2d, 4d
section Phase 3
Transaction Tracing :p3, after p2, 2w
Protocol Buffer Extension :p3a, after p2, 2d
PeerImp Instrumentation :p3b, after p3a, 3d
Fee Escalation Instrumentation :p3f, after p3b, 2d
Relay Context Propagation :p3c, after p3f, 3d
Multi-node Tests :p3d, after p3c, 2d
Buffer & Review :p3e, after p3d, 4d
section Phase 4
Consensus Tracing :p4, after p3, 2w
Consensus Round Spans :p4a, after p3, 3d
Proposal Handling :p4b, after p4a, 3d
Validator List & Manifest Tracing :p4f, after p4b, 2d
Amendment Voting Tracing :p4g, after p4f, 2d
SHAMap Sync Tracing :p4h, after p4g, 2d
Validation Tests :p4c, after p4h, 4d
Buffer & Review :p4e, after p4c, 4d
section Phase 5
Documentation & Deploy :p5, after p4, 1w
```
---
## 6.2 Phase 1: Core Infrastructure (Weeks 1-2)
**Objective**: Establish foundational telemetry infrastructure
### Tasks
| Task | Description |
| ---- | ----------------------------------------------------- |
| 1.1 | Add OpenTelemetry C++ SDK to Conan/CMake |
| 1.2 | Implement `Telemetry` interface and factory |
| 1.3 | Implement `SpanGuard` RAII wrapper |
| 1.4 | Implement configuration parser |
| 1.5 | Integrate into `ApplicationImp` |
| 1.6 | Add conditional compilation (`XRPL_ENABLE_TELEMETRY`) |
| 1.7 | Create `NullTelemetry` no-op implementation |
| 1.8 | Unit tests for core infrastructure |
### Exit Criteria
- [ ] OpenTelemetry SDK compiles and links
- [ ] Telemetry can be enabled/disabled via config
- [ ] Basic span creation works
- [ ] No performance regression when disabled
- [ ] Unit tests passing
---
## 6.3 Phase 2: RPC Tracing (Weeks 3-4)
> **TxQ** = Transaction Queue
**Objective**: Complete tracing for all RPC operations
### Tasks
| Task | Description |
| ---- | -------------------------------------------------------------------------- |
| 2.1 | Implement W3C Trace Context HTTP header extraction |
| 2.2 | Instrument `ServerHandler::onRequest()` |
| 2.3 | Instrument `RPCHandler::doCommand()` |
| 2.4 | Add RPC-specific attributes |
| 2.5 | Instrument WebSocket handler |
| 2.6 | PathFinding instrumentation (`pathfind.request`, `pathfind.compute` spans) |
| 2.7 | TxQ instrumentation (`txq.enqueue`, `txq.apply` spans) |
| 2.8 | Integration tests for RPC tracing |
| 2.9 | Performance benchmarks |
| 2.10 | Documentation |
### Exit Criteria
- [ ] All RPC commands traced
- [ ] Trace context propagates from HTTP headers
- [ ] WebSocket and HTTP both instrumented
- [ ] <1ms overhead per RPC call
- [ ] Integration tests passing
---
## 6.4 Phase 3: Transaction Tracing (Weeks 5-6)
**Objective**: Trace transaction lifecycle across network
### Tasks
| Task | Description |
| ---- | ---------------------------------------------------- |
| 3.1 | Define `TraceContext` Protocol Buffer message |
| 3.2 | Implement protobuf context serialization |
| 3.3 | Instrument `PeerImp::handleTransaction()` |
| 3.4 | Instrument `NetworkOPs::submitTransaction()` |
| 3.5 | Instrument HashRouter integration |
| 3.6 | Fee escalation instrumentation (`fee.escalate` span) |
| 3.7 | Implement relay context propagation |
| 3.8 | Integration tests (multi-node) |
| 3.9 | Performance benchmarks |
### Exit Criteria
- [ ] Transaction traces span across nodes
- [ ] Trace context in Protocol Buffer messages
- [ ] HashRouter deduplication visible in traces
- [ ] Multi-node integration tests passing
- [ ] <5% overhead on transaction throughput
---
## 6.5 Phase 4: Consensus Tracing (Weeks 7-8)
**Objective**: Full observability into consensus rounds
### Tasks
| Task | Description |
| ---- | ---------------------------------------------- |
| 4.1 | Instrument `RCLConsensusAdaptor::startRound()` |
| 4.2 | Instrument phase transitions |
| 4.3 | Instrument proposal handling |
| 4.4 | Instrument validation handling |
| 4.5 | Add consensus-specific attributes |
| 4.6 | Correlate with transaction traces |
| 4.7 | Validator list and manifest tracing |
| 4.8 | Amendment voting tracing |
| 4.9 | SHAMap sync tracing |
| 4.10 | Multi-validator integration tests |
| 4.11 | Performance validation |
### Exit Criteria
- [ ] Complete consensus round traces
- [ ] Phase transitions visible
- [ ] Proposals and validations traced
- [ ] No impact on consensus timing
- [ ] Multi-validator test network validated
### Implementation Status — Phase 4a Plan
Phase 4a (establish-phase gap fill & cross-node correlation) will add:
- **Deterministic trace ID** derived from `previousLedger.id()` so all validators
in the same round share the same `trace_id` (switchable via
`consensus_trace_strategy` config: `"deterministic"` or `"attribute"`).
See [Configuration Reference](./05-configuration-reference.md) for full
configuration options.
- **Round lifecycle spans**: `consensus.round` with round-to-round span links.
- **Establish phase**: `consensus.establish`, `consensus.update_positions` (with
`dispute.resolve` events), `consensus.check` (with threshold tracking).
- **Mode changes**: `consensus.mode_change` spans.
- **Validation**: `consensus.validation.send` with span link to round span
(thread-safe cross-thread access via `roundSpanContext_` snapshot).
- **Separation of concerns**: telemetry extracted to private helpers
(`startRoundTracing`, `createValidationSpan`, `startEstablishTracing`,
`updateEstablishTracing`, `endEstablishTracing`).
The `Phase4_taskList.md` spec document is introduced in the Phase 2 PR (#6424)
and will contain the full task breakdown and implementation notes.
---
## 6.6 Phase 5: Documentation & Deployment (Week 9)
**Objective**: Production readiness
### Tasks
| Task | Description |
| ---- | ----------------------------- |
| 5.1 | Operator runbook |
| 5.2 | Grafana dashboards |
| 5.3 | Alert definitions |
| 5.4 | Collector deployment examples |
| 5.5 | Developer documentation |
| 5.6 | Training materials |
| 5.7 | Final integration testing |
---
## 6.7 Risk Assessment
```mermaid
quadrantChart
title Risk Assessment Matrix
x-axis Low Impact --> High Impact
y-axis Low Likelihood --> High Likelihood
quadrant-1 Mitigate Immediately
quadrant-2 Plan Mitigation
quadrant-3 Accept Risk
quadrant-4 Monitor Closely
SDK Compat: [0.2, 0.18]
Protocol Chg: [0.75, 0.72]
Perf Overhead: [0.58, 0.42]
Context Prop: [0.4, 0.55]
Memory Leaks: [0.85, 0.25]
```
### Risk Details
| Risk | Likelihood | Impact | Mitigation |
| ------------------------------------ | ---------- | ------ | --------------------------------------- |
| Protocol changes break compatibility | Medium | High | Use high field numbers, optional fields |
| Performance overhead unacceptable | Medium | Medium | Sampling, conditional compilation |
| Context propagation complexity | Medium | Medium | Phased rollout, extensive testing |
| SDK compatibility issues | Low | Medium | Pin SDK version, fallback to no-op |
| Memory leaks in long-running nodes | Low | High | Memory profiling, bounded queues |
---
## 6.8 Success Metrics
| Metric | Target | Measurement |
| ------------------------ | -------------------------------------------------------------- | --------------------- |
| Trace coverage | >95% of transaction code paths (independent of sampling ratio) | Sampling verification |
| CPU overhead | <3% | Benchmark tests |
| Memory overhead | <10 MB | Memory profiling |
| Latency impact (p99) | <2% | Performance tests |
| Trace completeness | >99% spans with required attrs | Validation script |
| Cross-node trace linkage | >90% of multi-hop transactions | Integration tests |
---
## 6.9 Quick Wins and Crawl-Walk-Run Strategy
> **TxQ** = Transaction Queue
This section outlines a prioritized approach to maximize ROI with minimal initial investment.
### 6.9.1 Crawl-Walk-Run Overview
<div align="center">
```mermaid
flowchart TB
subgraph crawl["🐢 CRAWL (Week 1-2)"]
direction LR
c1[Core SDK Setup] ~~~ c2[RPC Tracing Only] ~~~ c3[PathFinding + TxQ Tracing] ~~~ c4[Single Node]
end
subgraph walk["🚶 WALK (Week 3-5)"]
direction LR
w1[Transaction Tracing] ~~~ w2[Fee Escalation Tracing] ~~~ w3[Cross-Node Context] ~~~ w4[Basic Dashboards]
end
subgraph run["🏃 RUN (Week 6-9)"]
direction LR
r1[Consensus Tracing] ~~~ r2[Validator, Amendment,<br/>SHAMap Tracing] ~~~ r3[Full Correlation] ~~~ r4[Production Deploy]
end
crawl --> walk --> run
style crawl fill:#1b5e20,stroke:#0d3d14,color:#fff
style walk fill:#bf360c,stroke:#8c2809,color:#fff
style run fill:#0d47a1,stroke:#082f6a,color:#fff
style c1 fill:#1b5e20,stroke:#0d3d14,color:#fff
style c2 fill:#1b5e20,stroke:#0d3d14,color:#fff
style c3 fill:#1b5e20,stroke:#0d3d14,color:#fff
style c4 fill:#1b5e20,stroke:#0d3d14,color:#fff
style w1 fill:#ffe0b2,stroke:#ffcc80,color:#1e293b
style w2 fill:#ffe0b2,stroke:#ffcc80,color:#1e293b
style w3 fill:#ffe0b2,stroke:#ffcc80,color:#1e293b
style w4 fill:#ffe0b2,stroke:#ffcc80,color:#1e293b
style r1 fill:#0d47a1,stroke:#082f6a,color:#fff
style r2 fill:#0d47a1,stroke:#082f6a,color:#fff
style r3 fill:#0d47a1,stroke:#082f6a,color:#fff
style r4 fill:#0d47a1,stroke:#082f6a,color:#fff
```
</div>
**Reading the diagram:**
- **CRAWL (Weeks 1-2)**: Minimal investment -- set up the SDK, instrument RPC and PathFinding/TxQ handlers, and verify on a single node. Delivers immediate latency visibility.
- **WALK (Weeks 3-5)**: Expand to transaction lifecycle tracing, fee escalation, cross-node context propagation, and basic Grafana dashboards. This is where distributed tracing starts working.
- **RUN (Weeks 6-9)**: Full consensus instrumentation, validator/amendment/SHAMap tracing, end-to-end correlation, and production deployment with sampling and alerting.
- **Arrows (crawl → walk → run)**: Each phase builds on the prior one; you cannot skip ahead because later phases depend on infrastructure established earlier.
### 6.9.2 Quick Wins (Immediate Value)
| Quick Win | Value | When to Deploy |
| ------------------------------ | ------ | -------------- |
| **RPC Command Tracing** | High | Week 2 |
| **RPC Latency Histograms** | High | Week 2 |
| **Error Rate Dashboard** | Medium | Week 2 |
| **Transaction Submit Tracing** | High | Week 3 |
| **Consensus Round Duration** | Medium | Week 6 |
### 6.9.3 CRAWL Phase (Weeks 1-2)
**Goal**: Get basic tracing working with minimal code changes.
**What You Get**:
- RPC request/response traces for all commands
- Latency breakdown per RPC command
- PathFinding and TxQ tracing (directly impacts RPC latency)
- Error visibility with stack traces
- Basic Grafana dashboard
**Code Changes**: ~15 lines in `ServerHandler.cpp`, ~40 lines in new telemetry module
**Why Start Here**:
- RPC is the lowest-risk, highest-visibility component
- PathFinding and TxQ are RPC-adjacent and directly affect latency
- Immediate value for debugging client issues
- No cross-node complexity
- Single file modification to existing code
### 6.9.4 WALK Phase (Weeks 3-5)
**Goal**: Add transaction lifecycle tracing across nodes.
**What You Get**:
- End-to-end transaction traces from submit to relay
- Fee escalation tracing within the transaction pipeline
- Cross-node correlation (see transaction path)
- HashRouter deduplication visibility
- Relay latency metrics
**Code Changes**: ~120 lines across 4 files, plus protobuf extension
**Why Do This Second**:
- Builds on RPC tracing (transactions submitted via RPC)
- Fee escalation is integral to the transaction processing pipeline
- Moderate complexity (requires context propagation)
- High value for debugging transaction issues
### 6.9.5 RUN Phase (Weeks 6-9)
**Goal**: Full observability including consensus.
**What You Get**:
- Complete consensus round visibility
- Phase transition timing
- Validator proposal tracking
- Validator list and manifest tracing
- Amendment voting tracing
- SHAMap sync tracing
- Full end-to-end traces (client → RPC → TX → consensus → ledger)
**Code Changes**: ~100 lines across 3 consensus files, plus validator/amendment/SHAMap modules
**Why Do This Last**:
- Highest complexity (consensus is critical path)
- Validator, amendment, and SHAMap components are lower priority
- Requires thorough testing
- Lower relative value (consensus issues are rarer)
### 6.9.6 ROI Prioritization Matrix
```mermaid
quadrantChart
title Implementation ROI Matrix
x-axis Low Effort --> High Effort
y-axis Low Value --> High Value
quadrant-1 Quick Wins - Do First
quadrant-2 Major Projects - Plan Carefully
quadrant-3 Nice to Have - Optional
quadrant-4 Time Sinks - Avoid
RPC Tracing: [0.15, 0.92]
TX Submit Trace: [0.3, 0.78]
TX Relay Trace: [0.5, 0.88]
Consensus Trace: [0.72, 0.72]
Peer Msg Trace: [0.85, 0.3]
Ledger Acquire: [0.55, 0.52]
```
---
## 6.10 Definition of Done
> **TxQ** = Transaction Queue | **HA** = High Availability
Clear, measurable criteria for each phase.
### 6.10.1 Phase 1: Core Infrastructure
| Criterion | Measurement | Target |
| --------------- | ---------------------------------------------------------- | ---------------------------- |
| SDK Integration | `cmake --build` succeeds with `-DXRPL_ENABLE_TELEMETRY=ON` | ✅ Compiles |
| Runtime Toggle | `enabled=0` produces zero overhead | <0.1% CPU difference |
| Span Creation | Unit test creates and exports span | Span appears in Tempo |
| Configuration | All config options parsed correctly | Config validation tests pass |
| Documentation | Developer guide exists | PR approved |
**Definition of Done**: All criteria met, PR merged, no regressions in CI.
### 6.10.2 Phase 2: RPC Tracing
| Criterion | Measurement | Target |
| ------------------ | ---------------------------------- | -------------------------- |
| Coverage | All RPC commands instrumented | 100% of commands |
| Context Extraction | traceparent header propagates | Integration test passes |
| Attributes | Command, status, duration recorded | Validation script confirms |
| Performance | RPC latency overhead | <1ms p99 |
| Dashboard | Grafana dashboard deployed | Screenshot in docs |
**Definition of Done**: RPC traces visible in Tempo for all commands, dashboard shows latency distribution.
### 6.10.3 Phase 3: Transaction Tracing
| Criterion | Measurement | Target |
| ---------------- | ------------------------------- | ---------------------------------- |
| Local Trace | Submit validate TxQ traced | Single-node test passes |
| Cross-Node | Context propagates via protobuf | Multi-node test passes |
| Relay Visibility | relay_count attribute correct | Spot check 100 txs |
| HashRouter | Deduplication visible in trace | Duplicate txs show suppressed=true |
| Performance | TX throughput overhead | <5% degradation |
**Definition of Done**: Transaction traces span 3+ nodes in test network, performance within bounds.
### 6.10.4 Phase 4: Consensus Tracing
| Criterion | Measurement | Target |
| -------------------- | ----------------------------- | ------------------------- |
| Round Tracing | startRound creates root span | Unit test passes |
| Phase Visibility | All phases have child spans | Integration test confirms |
| Proposer Attribution | Proposer ID in attributes | Spot check 50 rounds |
| Timing Accuracy | Phase durations match PerfLog | <5% variance |
| No Consensus Impact | Round timing unchanged | Performance test passes |
**Definition of Done**: Consensus rounds fully traceable, no impact on consensus timing.
### 6.10.5 Phase 5: Production Deployment
| Criterion | Measurement | Target |
| ------------ | ---------------------------- | -------------------------- |
| Collector HA | Multiple collectors deployed | No single point of failure |
| Sampling | Tail sampling configured | 10% base + errors + slow |
| Retention | Data retained per policy | 7 days hot, 30 days warm |
| Alerting | Alerts configured | Error spike, high latency |
| Runbook | Operator documentation | Approved by ops team |
| Training | Team trained | Session completed |
**Definition of Done**: Telemetry running in production, operators trained, alerts active.
### 6.10.6 Success Metrics Summary
| Phase | Primary Metric | Secondary Metric | Deadline |
| ------- | ---------------------- | --------------------------- | ------------- |
| Phase 1 | SDK compiles and runs | Zero overhead when disabled | End of Week 2 |
| Phase 2 | 100% RPC coverage | <1ms latency overhead | End of Week 4 |
| Phase 3 | Cross-node traces work | <5% throughput impact | End of Week 6 |
| Phase 4 | Consensus fully traced | No consensus timing impact | End of Week 8 |
| Phase 5 | Production deployment | Operators trained | End of Week 9 |
---
## 6.11 Recommended Implementation Order
Based on ROI analysis, implement in this exact order:
```mermaid
flowchart TB
subgraph week1["Week 1"]
t1[1. OpenTelemetry SDK<br/>Conan/CMake integration]
t2[2. Telemetry interface<br/>SpanGuard, config]
end
subgraph week2["Week 2"]
t3[3. RPC ServerHandler<br/>instrumentation]
t4[4. Basic Tempo setup<br/>for testing]
end
subgraph week3["Week 3"]
t5[5. Transaction submit<br/>tracing]
t6[6. Grafana dashboard<br/>v1]
end
subgraph week4["Week 4"]
t7[7. Protobuf context<br/>extension]
t8[8. PeerImp tx.relay<br/>instrumentation]
end
subgraph week5["Week 5"]
t9[9. Multi-node<br/>integration tests]
t10[10. Performance<br/>benchmarks]
end
subgraph week6_8["Weeks 6-8"]
t11[11. Consensus<br/>instrumentation]
t12[12. Full integration<br/>testing]
end
subgraph week9["Week 9"]
t13[13. Production<br/>deployment]
t14[14. Documentation<br/>& training]
end
t1 --> t2 --> t3 --> t4
t4 --> t5 --> t6
t6 --> t7 --> t8
t8 --> t9 --> t10
t10 --> t11 --> t12
t12 --> t13 --> t14
style week1 fill:#1b5e20,stroke:#0d3d14,color:#fff
style week2 fill:#1b5e20,stroke:#0d3d14,color:#fff
style week3 fill:#bf360c,stroke:#8c2809,color:#fff
style week4 fill:#bf360c,stroke:#8c2809,color:#fff
style week5 fill:#bf360c,stroke:#8c2809,color:#fff
style week6_8 fill:#0d47a1,stroke:#082f6a,color:#fff
style week9 fill:#4a148c,stroke:#2e0d57,color:#fff
style t1 fill:#1b5e20,stroke:#0d3d14,color:#fff
style t2 fill:#1b5e20,stroke:#0d3d14,color:#fff
style t3 fill:#1b5e20,stroke:#0d3d14,color:#fff
style t4 fill:#1b5e20,stroke:#0d3d14,color:#fff
style t5 fill:#ffe0b2,stroke:#ffcc80,color:#1e293b
style t6 fill:#ffe0b2,stroke:#ffcc80,color:#1e293b
style t7 fill:#ffe0b2,stroke:#ffcc80,color:#1e293b
style t8 fill:#ffe0b2,stroke:#ffcc80,color:#1e293b
style t9 fill:#ffe0b2,stroke:#ffcc80,color:#1e293b
style t10 fill:#ffe0b2,stroke:#ffcc80,color:#1e293b
style t11 fill:#0d47a1,stroke:#082f6a,color:#fff
style t12 fill:#0d47a1,stroke:#082f6a,color:#fff
style t13 fill:#4a148c,stroke:#2e0d57,color:#fff
style t14 fill:#4a148c,stroke:#2e0d57,color:#fff
```
**Reading the diagram:**
- **Week 1 (tasks 1-2)**: Foundation work -- integrate the OpenTelemetry SDK via Conan/CMake and build the `Telemetry` interface with `SpanGuard` and config parsing.
- **Week 2 (tasks 3-4)**: First observable output -- instrument `ServerHandler` for RPC tracing and stand up Tempo so developers can see traces immediately.
- **Weeks 3-5 (tasks 5-10)**: Transaction lifecycle -- add submit tracing, build the first Grafana dashboard, extend protobuf for cross-node context, instrument `PeerImp` relay, then validate with multi-node integration tests and performance benchmarks.
- **Weeks 6-8 (tasks 11-12)**: Consensus deep-dive -- instrument consensus rounds and phases, then run full integration testing across all instrumented paths.
- **Week 9 (tasks 13-14)**: Go-live -- deploy to production with sampling/alerting configured, and deliver documentation and operator training.
- **Arrow chain (t1 ... t14)**: Strict sequential dependency; each task's output is a prerequisite for the next.
---
_Previous: [Configuration Reference](./05-configuration-reference.md)_ | _Next: [Observability Backends](./07-observability-backends.md)_ | _Back to: [Overview](./OpenTelemetryPlan.md)_

View File

@@ -0,0 +1,641 @@
# Observability Backend Recommendations
> **Parent Document**: [OpenTelemetryPlan.md](./OpenTelemetryPlan.md)
> **Related**: [Implementation Phases](./06-implementation-phases.md) | [Appendix](./08-appendix.md)
---
## 7.1 Development/Testing Backends
> **OTLP** = OpenTelemetry Protocol
| Backend | Pros | Cons | Use Case |
| ---------- | ----------------------------------- | ---------------------- | ------------------- |
| **Tempo** | Cost-effective, Grafana integration | Requires Grafana stack | Local dev, CI, Prod |
| **Zipkin** | Simple, lightweight | Basic features | Quick prototyping |
### Quick Start with Tempo
```bash
# Start Tempo with OTLP support
docker run -d --name tempo \
-p 3200:3200 \
-p 4317:4317 \
-p 4318:4318 \
grafana/tempo:2.6.1
```
---
## 7.2 Production Backends
> **APM** = Application Performance Monitoring
| Backend | Pros | Cons | Use Case |
| ----------------- | ----------------------------------------- | ---------------------- | --------------------------- |
| **Grafana Tempo** | Cost-effective, Grafana integration | Requires Grafana stack | Most production deployments |
| **Elastic APM** | Full observability stack, log correlation | Resource intensive | Existing Elastic users |
| **Honeycomb** | Excellent query, high cardinality | SaaS cost | Deep debugging needs |
| **Datadog APM** | Full platform, easy setup | SaaS cost | Enterprise with budget |
### Backend Selection Flowchart
```mermaid
flowchart TD
start[Select Backend] --> budget{Budget<br/>Constraints?}
budget -->|Yes| oss[Open Source]
budget -->|No| saas{Prefer<br/>SaaS?}
oss --> existing{Existing<br/>Stack?}
existing -->|Grafana| tempo[Grafana Tempo]
existing -->|Elastic| elastic[Elastic APM]
existing -->|None| tempo
saas -->|Yes| enterprise{Enterprise<br/>Support?}
saas -->|No| oss
enterprise -->|Yes| datadog[Datadog APM]
enterprise -->|No| honeycomb[Honeycomb]
tempo --> final[Configure Collector]
elastic --> final
honeycomb --> final
datadog --> final
style start fill:#0f172a,stroke:#020617,color:#fff
style budget fill:#334155,stroke:#1e293b,color:#fff
style oss fill:#1e293b,stroke:#0f172a,color:#fff
style existing fill:#334155,stroke:#1e293b,color:#fff
style saas fill:#334155,stroke:#1e293b,color:#fff
style enterprise fill:#334155,stroke:#1e293b,color:#fff
style final fill:#0f172a,stroke:#020617,color:#fff
style tempo fill:#1b5e20,stroke:#0d3d14,color:#fff
style elastic fill:#bf360c,stroke:#8c2809,color:#fff
style honeycomb fill:#0d47a1,stroke:#082f6a,color:#fff
style datadog fill:#4a148c,stroke:#2e0d57,color:#fff
```
**Reading the diagram:**
- **Budget Constraints? (Yes)**: Leads to open-source options. If you already run Grafana or Elastic, pick the matching backend; otherwise default to Grafana Tempo.
- **Budget Constraints? (No) → Prefer SaaS?**: If you want a managed service, choose between Datadog (enterprise support) and Honeycomb (developer-focused). If not, fall back to open-source.
- **Terminal nodes (Tempo / Elastic / Honeycomb / Datadog)**: Each represents a concrete backend choice, all of which feed into the same final step.
- **Configure Collector**: Regardless of backend, you always finish by configuring the OTel Collector to export to your chosen destination.
---
## 7.3 Recommended Production Architecture
> **OTLP** = OpenTelemetry Protocol | **APM** = Application Performance Monitoring | **HA** = High Availability
```mermaid
flowchart TB
subgraph validators["Validator Nodes"]
v1[xrpld<br/>Validator 1]
v2[xrpld<br/>Validator 2]
end
subgraph stock["Stock Nodes"]
s1[xrpld<br/>Stock 1]
s2[xrpld<br/>Stock 2]
end
subgraph collector["OTel Collector Cluster"]
c1[Collector<br/>DC1]
c2[Collector<br/>DC2]
end
subgraph backends["Storage Backends"]
tempo[(Grafana<br/>Tempo)]
elastic[(Elastic<br/>APM)]
archive[(S3/GCS<br/>Archive)]
end
subgraph ui["Visualization"]
grafana[Grafana<br/>Dashboards]
end
v1 -->|OTLP| c1
v2 -->|OTLP| c1
s1 -->|OTLP| c2
s2 -->|OTLP| c2
c1 --> tempo
c1 --> elastic
c2 --> tempo
c2 --> archive
tempo --> grafana
elastic --> grafana
%% Note: simplified single-collector-per-DC topology shown for clarity
style validators fill:#b71c1c,stroke:#7f1d1d,color:#ffffff
style stock fill:#0d47a1,stroke:#082f6a,color:#ffffff
style collector fill:#bf360c,stroke:#8c2809,color:#ffffff
style backends fill:#1b5e20,stroke:#0d3d14,color:#ffffff
style ui fill:#4a148c,stroke:#2e0d57,color:#ffffff
```
**Reading the diagram:**
- **Validator / Stock Nodes**: All xrpld nodes emit trace data via OTLP. Validators and stock nodes are grouped separately because they may reside in different network zones.
- **Collector Cluster (DC1, DC2)**: Regional collectors receive OTLP from nodes in their datacenter, apply processing (sampling, enrichment), and fan out to multiple backends.
- **Storage Backends**: Tempo and Elastic provide queryable trace storage; S3/GCS Archive provides long-term cold storage for compliance or post-incident analysis.
- **Grafana Dashboards**: The single visualization layer that queries both Tempo and Elastic, giving operators a unified view of all traces.
- **Data flow direction**: Nodes → Collectors → Storage → Grafana. Each arrow represents a network hop; minimizing collector-to-backend hops reduces latency.
> **Note**: Production deployments should use multiple collector instances behind a load balancer for high availability. The diagram shows a simplified single-collector topology for clarity.
---
## 7.4 Architecture Considerations
### 7.4.1 Collector Placement
| Strategy | Description | Pros | Cons |
| ------------- | -------------------- | ------------------------ | ----------------------- |
| **Sidecar** | Collector per node | Isolation, simple config | Resource overhead |
| **DaemonSet** | Collector per host | Shared resources | Complexity |
| **Gateway** | Central collector(s) | Centralized processing | Single point of failure |
**Recommendation**: Use **Gateway** pattern with regional collectors for xrpld networks:
- One collector cluster per datacenter/region
- Tail-based sampling at collector level
- Multiple export destinations for redundancy
### 7.4.2 Sampling Strategy
```mermaid
flowchart LR
subgraph head["Head Sampling (Node)"]
hs[Node-level head sampling<br/>configurable, default: 100%<br/>recommended production: 10%]
end
subgraph tail["Tail Sampling (Collector)"]
ts1[Keep all errors]
ts2[Keep slow >5s]
ts3[Keep 10% rest]
end
head --> tail
ts1 --> final[Final Traces]
ts2 --> final
ts3 --> final
style head fill:#0d47a1,stroke:#082f6a,color:#fff
style tail fill:#1b5e20,stroke:#0d3d14,color:#fff
style hs fill:#0d47a1,stroke:#082f6a,color:#fff
style ts1 fill:#1b5e20,stroke:#0d3d14,color:#fff
style ts2 fill:#1b5e20,stroke:#0d3d14,color:#fff
style ts3 fill:#1b5e20,stroke:#0d3d14,color:#fff
style final fill:#bf360c,stroke:#8c2809,color:#fff
```
**Reading the diagram:**
- **Head Sampling (Node)**: The first filter -- each xrpld node decides whether to sample a trace at creation time (default 100%, recommended 10% in production). This controls the volume leaving the node.
- **Tail Sampling (Collector)**: The second filter -- the collector inspects completed traces and applies rules: keep all errors, keep anything slower than 5 seconds, and keep 10% of the remainder.
- **Arrow head → tail**: All head-sampled traces flow to the collector, where tail sampling further reduces volume while preserving the most valuable data.
- **Final Traces**: The output after both sampling stages; this is what gets stored and queried. The two-stage approach balances cost with debuggability.
### 7.4.3 Data Retention
| Environment | Hot Storage | Warm Storage | Cold Archive |
| ----------- | ----------- | ------------ | ------------ |
| Development | 24 hours | N/A | N/A |
| Staging | 7 days | N/A | N/A |
| Production | 7 days | 30 days | many years |
---
## 7.5 Integration Checklist
- [ ] Choose primary backend (Tempo recommended for cost/features)
- [ ] Deploy collector cluster with high availability
- [ ] Configure tail-based sampling for error/latency traces
- [ ] Set up Grafana dashboards for trace visualization
- [ ] Configure alerts for trace anomalies
- [ ] Establish data retention policies
- [ ] Test trace correlation with logs and metrics
---
## 7.6 Grafana Dashboard Examples
Pre-built dashboards for xrpld observability.
### 7.6.1 Consensus Health Dashboard
```json
{
"title": "xrpld Consensus Health",
"uid": "xrpld-consensus-health",
"tags": ["xrpld", "consensus", "tracing"],
"panels": [
{
"title": "Consensus Round Duration",
"type": "timeseries",
"datasource": "Tempo",
"targets": [
{
"queryType": "traceql",
"query": "{resource.service.name=\"xrpld\" && name=\"consensus.round\"} | avg(duration) by (resource.service.instance.id)"
}
],
"fieldConfig": {
"defaults": {
"unit": "ms",
"thresholds": {
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 4000 },
{ "color": "red", "value": 5000 }
]
}
}
},
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 }
},
{
"title": "Phase Duration Breakdown",
"type": "barchart",
"datasource": "Tempo",
"targets": [
{
"queryType": "traceql",
"query": "{resource.service.name=\"xrpld\" && name=~\"consensus.phase.*\"} | avg(duration) by (name)"
}
],
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 }
},
{
"title": "Proposers per Round",
"type": "stat",
"datasource": "Tempo",
"targets": [
{
"queryType": "traceql",
"query": "{resource.service.name=\"xrpld\" && name=\"consensus.round\"} | avg(span.xrpl.consensus.proposers)"
}
],
"gridPos": { "h": 4, "w": 6, "x": 0, "y": 8 }
},
{
"title": "Recent Slow Rounds (>5s)",
"type": "table",
"datasource": "Tempo",
"targets": [
{
"queryType": "traceql",
"query": "{resource.service.name=\"xrpld\" && name=\"consensus.round\"} | duration > 5s"
}
],
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 12 }
}
]
}
```
### 7.6.2 Node Overview Dashboard
```json
{
"title": "xrpld Node Overview",
"uid": "xrpld-node-overview",
"panels": [
{
"title": "Active Nodes",
"type": "stat",
"datasource": "Tempo",
"targets": [
{
"queryType": "traceql",
"query": "{resource.service.name=\"xrpld\"} | count_over_time() by (resource.service.instance.id) | count()"
}
],
"gridPos": { "h": 4, "w": 4, "x": 0, "y": 0 }
},
{
"title": "Total Transactions (1h)",
"type": "stat",
"datasource": "Tempo",
"targets": [
{
"queryType": "traceql",
"query": "{resource.service.name=\"xrpld\" && name=\"tx.receive\"} | count()"
}
],
"gridPos": { "h": 4, "w": 4, "x": 4, "y": 0 }
},
{
"title": "Error Rate",
"type": "gauge",
"datasource": "Tempo",
"targets": [
{
"queryType": "traceql",
"query": "{resource.service.name=\"xrpld\" && status.code=error} | rate() / {resource.service.name=\"xrpld\"} | rate() * 100"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent",
"max": 10,
"thresholds": {
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 1 },
{ "color": "red", "value": 5 }
]
}
}
},
"gridPos": { "h": 4, "w": 4, "x": 8, "y": 0 }
},
{
"title": "Service Map",
"type": "nodeGraph",
"datasource": "Tempo",
"gridPos": { "h": 12, "w": 12, "x": 12, "y": 0 }
}
]
}
```
### 7.6.3 Alert Rules
```yaml
# grafana/provisioning/alerting/rippled-alerts.yaml
apiVersion: 1
groups:
- name: xrpld-tracing-alerts
folder: xrpld
interval: 1m
rules:
- uid: consensus-slow
title: Consensus Round Slow
condition: A
data:
- refId: A
datasourceUid: tempo
model:
queryType: traceql
query: '{resource.service.name="xrpld" && name="consensus.round"} | avg(duration) > 5s'
# Note: Verify TraceQL aggregate queries are supported by your
# Tempo version. Aggregate alerting (e.g., avg(duration)) requires
# Tempo 2.3+ with TraceQL metrics enabled.
for: 5m
annotations:
summary: Consensus rounds taking >5 seconds
description: "Consensus duration: {{ $value }}ms"
labels:
severity: warning
- uid: rpc-error-spike
title: RPC Error Rate Spike
condition: B
data:
- refId: B
datasourceUid: tempo
model:
queryType: traceql
query: '{resource.service.name="xrpld" && name=~"rpc.command.*" && status.code=error} | rate() > 0.05'
# Note: Verify TraceQL aggregate queries are supported by your
# Tempo version. Aggregate alerting (e.g., rate()) requires
# Tempo 2.3+ with TraceQL metrics enabled.
for: 2m
annotations:
summary: RPC error rate >5%
labels:
severity: critical
- uid: tx-throughput-drop
title: Transaction Throughput Drop
condition: C
data:
- refId: C
datasourceUid: tempo
model:
queryType: traceql
query: '{resource.service.name="xrpld" && name="tx.receive"} | rate() < 10'
for: 10m
annotations:
summary: Transaction throughput below threshold
labels:
severity: warning
```
---
## 7.7 PerfLog and Insight Correlation
> **OTLP** = OpenTelemetry Protocol
How to correlate OpenTelemetry traces with existing xrpld observability.
### 7.7.1 Correlation Architecture
```mermaid
flowchart TB
subgraph xrpld["xrpld Node"]
otel[OpenTelemetry<br/>Spans]
perflog[PerfLog<br/>JSON Logs]
insight[Beast Insight<br/>StatsD Metrics]
end
subgraph collectors["Data Collection"]
otelc[OTel Collector]
promtail[Promtail/Fluentd]
statsd[StatsD Exporter]
end
subgraph storage["Storage"]
tempo[(Tempo)]
loki[(Loki)]
prom[(Prometheus)]
end
subgraph grafana["Grafana"]
traces[Trace View]
logs[Log View]
metrics[Metrics View]
corr[Correlation<br/>Panel]
end
otel -->|OTLP| otelc --> tempo
perflog -->|JSON| promtail --> loki
insight -->|StatsD| statsd --> prom
tempo --> traces
loki --> logs
prom --> metrics
traces --> corr
logs --> corr
metrics --> corr
style xrpld fill:#0d47a1,stroke:#082f6a,color:#fff
style collectors fill:#bf360c,stroke:#8c2809,color:#fff
style storage fill:#1b5e20,stroke:#0d3d14,color:#fff
style grafana fill:#4a148c,stroke:#2e0d57,color:#fff
style otel fill:#0d47a1,stroke:#082f6a,color:#fff
style perflog fill:#0d47a1,stroke:#082f6a,color:#fff
style insight fill:#0d47a1,stroke:#082f6a,color:#fff
style otelc fill:#bf360c,stroke:#8c2809,color:#fff
style promtail fill:#bf360c,stroke:#8c2809,color:#fff
style statsd fill:#bf360c,stroke:#8c2809,color:#fff
style tempo fill:#1b5e20,stroke:#0d3d14,color:#fff
style loki fill:#1b5e20,stroke:#0d3d14,color:#fff
style prom fill:#1b5e20,stroke:#0d3d14,color:#fff
style traces fill:#4a148c,stroke:#2e0d57,color:#fff
style logs fill:#4a148c,stroke:#2e0d57,color:#fff
style metrics fill:#4a148c,stroke:#2e0d57,color:#fff
style corr fill:#4a148c,stroke:#2e0d57,color:#fff
```
**Reading the diagram:**
- **xrpld Node (three sources)**: A single node emits three independent data streams -- OpenTelemetry spans, PerfLog JSON logs, and Beast Insight StatsD metrics.
- **Data Collection layer**: Each stream has its own collector -- OTel Collector for spans, Promtail/Fluentd for logs, and a StatsD exporter for metrics. They operate independently.
- **Storage layer (Tempo, Loki, Prometheus)**: Each data type lands in a purpose-built store optimized for its query patterns (trace search, log grep, metric aggregation).
- **Grafana Correlation Panel**: The key integration point -- Grafana queries all three stores and links them via shared fields (`trace_id`, `xrpl.tx.hash`, `ledger_seq`), enabling a single-pane debugging experience.
### 7.7.2 Correlation Fields
| Source | Field | Link To | Purpose |
| ----------- | --------------------------- | ------------- | -------------------------- |
| **Trace** | `trace_id` | Logs | Find log entries for trace |
| **Trace** | `xrpl.tx.hash` | Logs, Metrics | Find TX-related data |
| **Trace** | `xrpl.consensus.ledger.seq` | Logs | Find ledger-related logs |
| **PerfLog** | `trace_id` (new) | Traces | Jump to trace from log |
| **PerfLog** | `ledger_seq` | Traces | Find consensus trace |
| **Insight** | `exemplar.trace_id` | Traces | Jump from metric spike |
### 7.7.3 Example: Debugging a Slow Transaction
**Step 1: Find the trace**
```
# In Grafana Explore with Tempo
{resource.service.name="xrpld" && span.xrpl.tx.hash="ABC123..."}
```
**Step 2: Get the trace_id from the trace view**
```
Trace ID: 4bf92f3577b34da6a3ce929d0e0e4736
```
**Step 3: Find related PerfLog entries**
```
# In Grafana Explore with Loki
{job="xrpld"} |= "4bf92f3577b34da6a3ce929d0e0e4736"
```
**Step 4: Check Insight metrics for the time window**
```
# In Grafana with Prometheus
rate(xrpld_tx_applied_total[1m])
@ timestamp_from_trace
```
### 7.7.4 Unified Dashboard Example
```json
{
"title": "xrpld Unified Observability",
"uid": "xrpld-unified",
"panels": [
{
"title": "Transaction Latency (Traces)",
"type": "timeseries",
"datasource": "Tempo",
"targets": [
{
"queryType": "traceql",
"query": "{resource.service.name=\"xrpld\" && name=\"tx.receive\"} | histogram_over_time(duration)"
}
],
"gridPos": { "h": 6, "w": 8, "x": 0, "y": 0 }
},
{
"title": "Transaction Rate (Metrics)",
"type": "timeseries",
"datasource": "Prometheus",
"targets": [
{
"expr": "rate(xrpld_tx_received_total[5m])",
"legendFormat": "{{ instance }}"
}
],
"fieldConfig": {
"defaults": {
"links": [
{
"title": "View traces",
"url": "/explore?left={\"datasource\":\"Tempo\",\"query\":\"{resource.service.name=\\\"xrpld\\\" && name=\\\"tx.receive\\\"}\"}"
}
]
}
},
"gridPos": { "h": 6, "w": 8, "x": 8, "y": 0 }
},
{
"title": "Recent Logs",
"type": "logs",
"datasource": "Loki",
"targets": [
{
"expr": "{job=\"xrpld\"} | json"
}
],
"gridPos": { "h": 6, "w": 8, "x": 16, "y": 0 }
},
{
"title": "Trace Search",
"type": "table",
"datasource": "Tempo",
"targets": [
{
"queryType": "traceql",
"query": "{resource.service.name=\"xrpld\"}"
}
],
"fieldConfig": {
"overrides": [
{
"matcher": { "id": "byName", "options": "traceID" },
"properties": [
{
"id": "links",
"value": [
{
"title": "View trace",
"url": "/explore?left={\"datasource\":\"Tempo\",\"query\":\"${__value.raw}\"}"
},
{
"title": "View logs",
"url": "/explore?left={\"datasource\":\"Loki\",\"query\":\"{job=\\\"xrpld\\\"} |= \\\"${__value.raw}\\\"\"}"
}
]
}
]
}
]
},
"gridPos": { "h": 12, "w": 24, "x": 0, "y": 6 }
}
]
}
```
---
_Previous: [Implementation Phases](./06-implementation-phases.md)_ | _Next: [Appendix](./08-appendix.md)_ | _Back to: [Overview](./OpenTelemetryPlan.md)_

View File

@@ -0,0 +1,195 @@
# Appendix
> **Parent Document**: [OpenTelemetryPlan.md](./OpenTelemetryPlan.md)
> **Related**: [Observability Backends](./07-observability-backends.md)
---
## 8.1 Glossary
> **OTLP** = OpenTelemetry Protocol | **TxQ** = Transaction Queue
| Term | Definition |
| --------------------- | ---------------------------------------------------------- |
| **Span** | A unit of work with start/end time, name, and attributes |
| **Trace** | A collection of spans representing a complete request flow |
| **Trace ID** | 128-bit unique identifier for a trace |
| **Span ID** | 64-bit unique identifier for a span within a trace |
| **Context** | Carrier for trace/span IDs across boundaries |
| **Propagator** | Component that injects/extracts context |
| **Sampler** | Decides which traces to record |
| **Exporter** | Sends spans to backend |
| **Collector** | Receives, processes, and forwards telemetry |
| **OTLP** | OpenTelemetry Protocol (wire format) |
| **W3C Trace Context** | Standard HTTP headers for trace propagation |
| **Baggage** | Key-value pairs propagated across service boundaries |
| **Resource** | Entity producing telemetry (service, host, etc.) |
| **Instrumentation** | Code that creates telemetry data |
### xrpld-Specific Terms
| Term | Definition |
| ----------------- | ------------------------------------------------------------- |
| **Overlay** | P2P network layer managing peer connections |
| **Consensus** | XRP Ledger consensus algorithm (RCL) |
| **Proposal** | Validator's suggested transaction set for a ledger |
| **Validation** | Validator's signature on a closed ledger |
| **HashRouter** | Component for transaction deduplication |
| **JobQueue** | Thread pool for asynchronous task execution |
| **PerfLog** | Existing performance logging system in xrpld |
| **Beast Insight** | Existing metrics framework in xrpld |
| **PathFinding** | Payment path computation engine for cross-currency payments |
| **TxQ** | Transaction queue managing fee-based prioritization |
| **LoadManager** | Dynamic fee escalation based on network load |
| **SHAMap** | SHA-256 hash-based map (Merkle trie variant) for ledger state |
---
## 8.2 Span Hierarchy Visualization
> **TxQ** = Transaction Queue
```mermaid
flowchart TB
subgraph trace["Trace: Transaction Lifecycle"]
rpc["rpc.request<br/>(entry point)"]
validate["tx.validate"]
relay["tx.relay<br/>(parent span)"]
subgraph peers["Peer Spans"]
p1["peer.send<br/>Peer A"]
p2["peer.send<br/>Peer B"]
p3["peer.send<br/>Peer C"]
end
subgraph pathfinding["PathFinding Spans"]
pathfind["pathfind.request"]
pathcomp["pathfind.compute"]
end
consensus["consensus.round"]
apply["tx.apply"]
subgraph txqueue["TxQ Spans"]
txq["txq.enqueue"]
txqApply["txq.apply"]
end
feeCalc["fee.escalate"]
end
subgraph validators["Validator Spans"]
valFetch["validator.list.fetch"]
valManifest["validator.manifest"]
end
rpc --> validate
rpc --> pathfind
pathfind --> pathcomp
validate --> relay
relay --> p1
relay --> p2
relay --> p3
p1 -.->|"context propagation"| consensus
consensus --> apply
apply --> txq
txq --> txqApply
txq --> feeCalc
style trace fill:#0f172a,stroke:#020617,color:#fff
style peers fill:#1e3a8a,stroke:#172554,color:#fff
style pathfinding fill:#134e4a,stroke:#0f766e,color:#fff
style txqueue fill:#064e3b,stroke:#047857,color:#fff
style validators fill:#4c1d95,stroke:#6d28d9,color:#fff
style rpc fill:#1d4ed8,stroke:#1e40af,color:#fff
style validate fill:#047857,stroke:#064e3b,color:#fff
style relay fill:#047857,stroke:#064e3b,color:#fff
style p1 fill:#0e7490,stroke:#155e75,color:#fff
style p2 fill:#0e7490,stroke:#155e75,color:#fff
style p3 fill:#0e7490,stroke:#155e75,color:#fff
style consensus fill:#fef3c7,stroke:#fde68a,color:#1e293b
style apply fill:#047857,stroke:#064e3b,color:#fff
style pathfind fill:#0e7490,stroke:#155e75,color:#fff
style pathcomp fill:#0e7490,stroke:#155e75,color:#fff
style txq fill:#047857,stroke:#064e3b,color:#fff
style txqApply fill:#047857,stroke:#064e3b,color:#fff
style feeCalc fill:#047857,stroke:#064e3b,color:#fff
style valFetch fill:#6d28d9,stroke:#4c1d95,color:#fff
style valManifest fill:#6d28d9,stroke:#4c1d95,color:#fff
```
**Reading the diagram:**
- **rpc.request (blue, top)**: The entry point — every traced transaction starts as an RPC call; this root span is the parent of all downstream work.
- **tx.validate and pathfind.request (green/teal, first fork)**: The RPC request fans out into transaction validation and, for cross-currency payments, a PathFinding branch (`pathfind.request` -> `pathfind.compute`).
- **tx.relay -> Peer Spans (teal, middle)**: After validation, the transaction is relayed to peers A, B, and C in parallel; each `peer.send` is a sibling child span showing fan-out across the network.
- **context propagation (dashed arrow)**: The dotted line from `peer.send Peer A` to `consensus.round` represents the trace context crossing a node boundary — the receiving validator picks up the same `trace_id` and continues the trace.
- **consensus.round -> tx.apply -> TxQ Spans (green, lower)**: Once consensus accepts the transaction, it is applied to the ledger; the TxQ spans (`txq.enqueue`, `txq.apply`, `fee.escalate`) capture queue depth and fee escalation behavior.
- **Validator Spans (purple, detached)**: `validator.list.fetch` and `validator.manifest` are independent workflows for UNL management — they run on their own traces and are linked to consensus via Span Links, not parent-child relationships.
---
## 8.3 References
> **OTLP** = OpenTelemetry Protocol
### OpenTelemetry Resources
1. [OpenTelemetry C++ SDK](https://github.com/open-telemetry/opentelemetry-cpp)
2. [OpenTelemetry Specification](https://opentelemetry.io/docs/specs/otel/)
3. [OpenTelemetry Collector](https://opentelemetry.io/docs/collector/)
4. [OTLP Protocol Specification](https://opentelemetry.io/docs/specs/otlp/)
### Standards
5. [W3C Trace Context](https://www.w3.org/TR/trace-context/)
6. [W3C Baggage](https://www.w3.org/TR/baggage/)
7. [Protocol Buffers](https://protobuf.dev/)
### xrpld Resources
8. [xrpld Source Code](https://github.com/XRPLF/rippled)
9. [XRP Ledger Documentation](https://xrpl.org/docs/)
10. [xrpld Overlay README](https://github.com/XRPLF/rippled/blob/develop/src/xrpld/overlay/README.md)
11. [xrpld RPC README](https://github.com/XRPLF/rippled/blob/develop/src/xrpld/rpc/README.md)
12. [xrpld Consensus README](https://github.com/XRPLF/rippled/blob/develop/src/xrpld/app/consensus/README.md)
---
## 8.4 Version History
| Version | Date | Author | Changes |
| ------- | ---------- | ------ | -------------------------------------------------------------- |
| 1.0 | 2026-02-12 | - | Initial implementation plan |
| 1.1 | 2026-02-13 | - | Refactored into modular documents |
| 1.2 | 2026-03-24 | - | Review fixes: accuracy corrections, cross-document consistency |
---
## 8.5 Document Index
### Plan Documents
| Document | Description |
| ---------------------------------------------------------------- | -------------------------------------------- |
| [OpenTelemetryPlan.md](./OpenTelemetryPlan.md) | Master overview and executive summary |
| [00-tracing-fundamentals.md](./00-tracing-fundamentals.md) | Distributed tracing concepts and OTel primer |
| [01-architecture-analysis.md](./01-architecture-analysis.md) | xrpld architecture and trace points |
| [02-design-decisions.md](./02-design-decisions.md) | SDK selection, exporters, span conventions |
| [03-implementation-strategy.md](./03-implementation-strategy.md) | Directory structure, performance analysis |
| [04-code-samples.md](./04-code-samples.md) | C++ code examples for all components |
| [05-configuration-reference.md](./05-configuration-reference.md) | xrpld config, CMake, Collector configs |
| [06-implementation-phases.md](./06-implementation-phases.md) | Timeline, tasks, risks, success metrics |
| [07-observability-backends.md](./07-observability-backends.md) | Backend selection and architecture |
| [08-appendix.md](./08-appendix.md) | Glossary, references, version history |
### Task Lists
| Document | Description |
| ------------------------------------ | --------------------------------------------------- |
| [POC_taskList.md](./POC_taskList.md) | Proof-of-concept telemetry integration |
| [presentation.md](./presentation.md) | Presentation slides for OpenTelemetry plan overview |
---
_Previous: [Observability Backends](./07-observability-backends.md)_ | _Back to: [Overview](./OpenTelemetryPlan.md)_

View File

@@ -0,0 +1,230 @@
# [OpenTelemetry](00-tracing-fundamentals.md) Distributed Tracing Implementation Plan for xrpld (xrpld)
## Executive Summary
> **OTLP** = OpenTelemetry Protocol
This document provides a comprehensive implementation plan for integrating OpenTelemetry distributed tracing into the xrpld XRP Ledger node software. The plan addresses the unique challenges of a decentralized peer-to-peer system where trace context must propagate across network boundaries between independent nodes.
### Key Benefits
- **End-to-end transaction visibility**: Track transactions from submission through consensus to ledger inclusion
- **Consensus round analysis**: Understand timing and behavior of consensus phases across validators
- **RPC performance insights**: Identify slow handlers and optimize response times
- **Network topology understanding**: Visualize message propagation patterns between peers
- **Incident debugging**: Correlate events across distributed nodes during issues
### Estimated Performance Overhead
| Metric | Overhead | Notes |
| ------------- | ---------- | ----------------------------------- |
| CPU | 1-3% | Span creation and attribute setting |
| Memory | 2-5 MB | Batch buffer for pending spans |
| Network | 10-50 KB/s | Compressed OTLP export to collector |
| Latency (p99) | <2% | With proper sampling configuration |
---
## Document Structure
This implementation plan is organized into modular documents for easier navigation:
<div align="center">
```mermaid
flowchart TB
overview["📋 OpenTelemetryPlan.md<br/>(This Document)"]
subgraph fundamentals["Fundamentals"]
fund["00-tracing-fundamentals.md"]
end
subgraph analysis["Analysis & Design"]
arch["01-architecture-analysis.md"]
design["02-design-decisions.md"]
end
subgraph impl["Implementation"]
strategy["03-implementation-strategy.md"]
code["04-code-samples.md"]
config["05-configuration-reference.md"]
end
subgraph deploy["Deployment & Planning"]
phases["06-implementation-phases.md"]
backends["07-observability-backends.md"]
appendix["08-appendix.md"]
poc["POC_taskList.md"]
end
overview --> fundamentals
overview --> analysis
overview --> impl
overview --> deploy
fund --> arch
arch --> design
design --> strategy
strategy --> code
code --> config
config --> phases
phases --> backends
backends --> appendix
phases --> poc
style overview fill:#1b5e20,stroke:#0d3d14,color:#fff,stroke-width:2px
style fundamentals fill:#00695c,stroke:#004d40,color:#fff
style fund fill:#00695c,stroke:#004d40,color:#fff
style analysis fill:#0d47a1,stroke:#082f6a,color:#fff
style impl fill:#bf360c,stroke:#8c2809,color:#fff
style deploy fill:#4a148c,stroke:#2e0d57,color:#fff
style arch fill:#0d47a1,stroke:#082f6a,color:#fff
style design fill:#0d47a1,stroke:#082f6a,color:#fff
style strategy fill:#bf360c,stroke:#8c2809,color:#fff
style code fill:#bf360c,stroke:#8c2809,color:#fff
style config fill:#bf360c,stroke:#8c2809,color:#fff
style phases fill:#4a148c,stroke:#2e0d57,color:#fff
style backends fill:#4a148c,stroke:#2e0d57,color:#fff
style appendix fill:#4a148c,stroke:#2e0d57,color:#fff
style poc fill:#4a148c,stroke:#2e0d57,color:#fff
```
</div>
---
## Table of Contents
| Section | Document | Description |
| ------- | ---------------------------------------------------------- | ---------------------------------------------------------------------- |
| **0** | [Tracing Fundamentals](./00-tracing-fundamentals.md) | Distributed tracing concepts, span relationships, context propagation |
| **1** | [Architecture Analysis](./01-architecture-analysis.md) | xrpld component analysis, trace points, instrumentation priorities |
| **2** | [Design Decisions](./02-design-decisions.md) | SDK selection, exporters, span naming, attributes, context propagation |
| **3** | [Implementation Strategy](./03-implementation-strategy.md) | Directory structure, key principles, performance optimization |
| **4** | [Code Samples](./04-code-samples.md) | C++ implementation examples for core infrastructure and key modules |
| **5** | [Configuration Reference](./05-configuration-reference.md) | xrpld config, CMake integration, Collector configurations |
| **6** | [Implementation Phases](./06-implementation-phases.md) | 5-phase timeline, tasks, risks, success metrics |
| **7** | [Observability Backends](./07-observability-backends.md) | Backend selection guide and production architecture |
| **8** | [Appendix](./08-appendix.md) | Glossary, references, version history |
| **POC** | [POC Task List](./POC_taskList.md) | Proof of concept tasks for RPC tracing end-to-end demo |
---
## 0. Tracing Fundamentals
This document introduces distributed tracing concepts for readers unfamiliar with the domain. It covers what traces and spans are, how parent-child and follows-from relationships model causality, how context propagates across service boundaries, and how sampling controls data volume. It also maps these concepts to xrpld-specific scenarios like transaction relay and consensus.
➡️ **[Read Tracing Fundamentals](./00-tracing-fundamentals.md)**
---
## 1. Architecture Analysis
> **WS** = WebSocket | **TxQ** = Transaction Queue
The xrpld node consists of several key components that require instrumentation for comprehensive distributed tracing. The main areas include the RPC server (HTTP/WebSocket), Overlay P2P network, Consensus mechanism (RCLConsensus), JobQueue for async task execution, PathFinding, Transaction Queue (TxQ), fee escalation (LoadManager), ledger acquisition, validator management, and existing observability infrastructure (PerfLog, Insight/StatsD, Journal logging).
Key trace points span across transaction submission via RPC, peer-to-peer message propagation, consensus round execution, ledger building, path computation, transaction queue behavior, fee escalation, and validator health. The implementation prioritizes high-value, low-risk components first: RPC handlers provide immediate value with minimal risk, while consensus tracing requires careful implementation to avoid timing impacts.
➡️ **[Read full Architecture Analysis](./01-architecture-analysis.md)**
---
## 2. Design Decisions
> **OTLP** = OpenTelemetry Protocol | **CNCF** = Cloud Native Computing Foundation
The OpenTelemetry C++ SDK is selected for its CNCF backing, active development, and native performance characteristics. Traces are exported via OTLP/gRPC (primary) or OTLP/HTTP (fallback) to an OpenTelemetry Collector, which provides flexible routing and sampling.
Span naming follows a hierarchical `<component>.<operation>` convention (e.g., `rpc.submit`, `tx.relay`, `consensus.round`). Context propagation uses W3C Trace Context headers for HTTP and embedded Protocol Buffer fields for P2P messages. The implementation coexists with existing PerfLog and Insight observability systems through correlation IDs.
**Data Collection & Privacy**: Telemetry collects only operational metadata (timing, counts, hashes) — never sensitive content (private keys, balances, amounts, raw payloads). Privacy protection includes account hashing, configurable redaction, sampling, and collector-level filtering. Node operators retain full control over telemetry configuration.
➡️ **[Read full Design Decisions](./02-design-decisions.md)**
---
## 3. Implementation Strategy
The telemetry code is organized under `include/xrpl/telemetry/` for headers and `src/libxrpl/telemetry/` for implementation. Key principles include RAII-based span management via `SpanGuard`, conditional compilation with `XRPL_ENABLE_TELEMETRY`, and minimal runtime overhead through batch processing and efficient sampling.
Performance optimization strategies include probabilistic head sampling (10% default), tail-based sampling at the collector for errors and slow traces, batch export to reduce network overhead, and conditional instrumentation that compiles to no-ops when disabled.
➡️ **[Read full Implementation Strategy](./03-implementation-strategy.md)**
---
## 4. Code Samples
C++ implementation examples are provided for the core telemetry infrastructure and key modules:
- `Telemetry.h` - Core interface for tracer access and span creation
- `SpanGuard.h` - RAII wrapper for automatic span lifecycle management
- `TracingInstrumentation.h` - Macros for conditional instrumentation
- Protocol Buffer extensions for trace context propagation
- Module-specific instrumentation (RPC, Consensus, P2P, JobQueue)
- Remaining modules (PathFinding, TxQ, Validator, etc.) follow the same patterns
➡️ **[View all Code Samples](./04-code-samples.md)**
---
## 5. Configuration Reference
> **OTLP** = OpenTelemetry Protocol | **APM** = Application Performance Monitoring
Configuration is handled through the `[telemetry]` section in `xrpld.cfg` with options for enabling/disabling, exporter selection, endpoint configuration, sampling ratios, and component-level filtering. CMake integration includes a `XRPL_ENABLE_TELEMETRY` option for compile-time control.
OpenTelemetry Collector configurations are provided for development and production (with tail-based sampling, Tempo, and Elastic APM). Docker Compose examples enable quick local development environment setup.
➡️ **[View full Configuration Reference](./05-configuration-reference.md)**
---
## 6. Implementation Phases
The implementation spans 9 weeks across 5 phases:
| Phase | Duration | Focus | Key Deliverables |
| ----- | --------- | ------------------- | --------------------------------------------------- |
| 1 | Weeks 1-2 | Core Infrastructure | SDK integration, Telemetry interface, Configuration |
| 2 | Weeks 3-4 | RPC Tracing | HTTP context extraction, Handler instrumentation |
| 3 | Weeks 5-6 | Transaction Tracing | Protocol Buffer context, Relay propagation |
| 4 | Weeks 7-8 | Consensus Tracing | Round spans, Proposal/validation tracing |
| 5 | Week 9 | Documentation | Runbook, Dashboards, Training |
**Total Effort**: 47 person-days (2 developers working in parallel)
➡️ **[View full Implementation Phases](./06-implementation-phases.md)**
---
## 7. Observability Backends
> **APM** = Application Performance Monitoring | **GCS** = Google Cloud Storage
Grafana Tempo is recommended for all environments due to its cost-effectiveness and Grafana integration, while Elastic APM is ideal for organizations with existing Elastic infrastructure.
The recommended production architecture uses a gateway collector pattern with regional collectors performing tail-based sampling, routing traces to multiple backends (Tempo for primary storage, Elastic for log correlation, S3/GCS for long-term archive).
➡️ **[View Observability Backend Recommendations](./07-observability-backends.md)**
---
## 8. Appendix
The appendix contains a glossary of OpenTelemetry and xrpld-specific terms, references to external documentation and specifications, version history for this implementation plan, and a complete document index.
➡️ **[View Appendix](./08-appendix.md)**
---
## POC Task List
A step-by-step task list for building a minimal end-to-end proof of concept that demonstrates distributed tracing in xrpld. The POC scope is limited to RPC tracing — showing request traces flowing from xrpld through an OpenTelemetry Collector into Tempo, viewable in Grafana.
➡️ **[View POC Task List](./POC_taskList.md)**
---
_This document provides a comprehensive implementation plan for integrating OpenTelemetry distributed tracing into the xrpld XRP Ledger node software. For detailed information on any section, follow the links to the corresponding sub-documents._

View File

@@ -0,0 +1,636 @@
# OpenTelemetry POC Task List
> **Goal**: Build a minimal end-to-end proof of concept that demonstrates distributed tracing in xrpld. A successful POC will show RPC request traces flowing from xrpld through an OTel Collector into Tempo, viewable in Grafana.
>
> **Scope**: RPC tracing only (highest value, lowest risk per the [CRAWL phase](./06-implementation-phases.md#6102-quick-wins-immediate-value) in the implementation phases). No cross-node P2P context propagation or consensus tracing in the POC.
### Related Plan Documents
| Document | Relevance to POC |
| ---------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------- |
| [00-tracing-fundamentals.md](./00-tracing-fundamentals.md) | Core concepts: traces, spans, context propagation, sampling |
| [01-architecture-analysis.md](./01-architecture-analysis.md) | RPC request flow (§1.5), key trace points (§1.6), instrumentation priority (§1.7) |
| [02-design-decisions.md](./02-design-decisions.md) | SDK selection (§2.1), exporter config (§2.2), span naming (§2.3), attribute schema (§2.4), coexistence with PerfLog/Insight (§2.6) |
| [03-implementation-strategy.md](./03-implementation-strategy.md) | Directory structure (§3.1), key principles (§3.2), performance overhead (§3.3-3.6), conditional compilation (§3.7.3), code intrusiveness (§3.9) |
| [04-code-samples.md](./04-code-samples.md) | Telemetry interface (§4.1), SpanGuard (§4.2), macros (§4.3), RPC instrumentation (§4.5.3) |
| [05-configuration-reference.md](./05-configuration-reference.md) | xrpld config (§5.1), config parser (§5.2), Application integration (§5.3), CMake (§5.4), Collector config (§5.5), Docker Compose (§5.6), Grafana (§5.8) |
| [06-implementation-phases.md](./06-implementation-phases.md) | Phase 1 core tasks (§6.2), Phase 2 RPC tasks (§6.3), quick wins (§6.10), definition of done (§6.11) |
| [07-observability-backends.md](./07-observability-backends.md) | Tempo dev setup (§7.1), Grafana dashboards (§7.6), alert rules (§7.6.3) |
---
## Task 0: Docker Observability Stack Setup
> **OTLP** = OpenTelemetry Protocol
**Objective**: Stand up the backend infrastructure to receive, store, and display traces.
**What to do**:
- Create `docker/telemetry/docker-compose.yml` in the repo with three services:
1. **OpenTelemetry Collector** (`otel/opentelemetry-collector-contrib:0.92.0`)
- Expose ports `4317` (OTLP gRPC) and `4318` (OTLP HTTP)
- Expose port `13133` (health check)
- Mount a config file `docker/telemetry/otel-collector-config.yaml`
2. **Tempo** (`grafana/tempo:2.6.1`)
- Expose port `3200` (HTTP API) and `4317` (OTLP gRPC, internal)
3. **Grafana** (`grafana/grafana:latest`) — optional but useful
- Expose port `3000`
- Enable anonymous admin access for local dev (`GF_AUTH_ANONYMOUS_ENABLED=true`, `GF_AUTH_ANONYMOUS_ORG_ROLE=Admin`)
- Provision Tempo as a data source via `docker/telemetry/grafana/provisioning/datasources/tempo.yaml`
- Create `docker/telemetry/otel-collector-config.yaml`:
```yaml
receivers:
otlp:
protocols:
grpc:
endpoint: 0.0.0.0:4317
http:
endpoint: 0.0.0.0:4318
processors:
batch:
timeout: 1s
send_batch_size: 100
exporters:
logging:
verbosity: detailed
otlp/tempo:
endpoint: tempo:4317
tls:
insecure: true
service:
pipelines:
traces:
receivers: [otlp]
processors: [batch]
exporters: [logging, otlp/tempo]
```
- Create Grafana Tempo datasource provisioning file at `docker/telemetry/grafana/provisioning/datasources/tempo.yaml`:
```yaml
apiVersion: 1
datasources:
- name: Tempo
type: tempo
access: proxy
url: http://tempo:3200
```
**Verification**: Run `docker compose -f docker/telemetry/docker-compose.yml up -d`, then:
- `curl http://localhost:13133` returns healthy (Collector)
- `http://localhost:3000` opens Grafana (Tempo datasource available, no traces yet)
**Reference**:
- [05-configuration-reference.md §5.5](./05-configuration-reference.md) — Collector config (dev YAML with Tempo exporter)
- [05-configuration-reference.md §5.6](./05-configuration-reference.md) — Docker Compose development environment
- [07-observability-backends.md §7.1](./07-observability-backends.md) — Tempo quick start and backend selection
- [05-configuration-reference.md §5.8](./05-configuration-reference.md) — Grafana datasource provisioning and dashboards
---
## Task 1: Add OpenTelemetry C++ SDK Dependency
**Objective**: Make `opentelemetry-cpp` available to the build system.
**What to do**:
- Edit `conanfile.py` to add `opentelemetry-cpp` as an **optional** dependency. The gRPC otel plugin flag (`"grpc/*:otel_plugin": False`) in the existing conanfile may need to remain false — we pull the OTel SDK separately.
- Add a Conan option: `with_telemetry = [True, False]` defaulting to `False`
- When `with_telemetry` is `True`, add `opentelemetry-cpp` to `self.requires()`
- Required OTel Conan components: `opentelemetry-cpp` (which bundles api, sdk, and exporters). If the package isn't in Conan Center, consider using `FetchContent` in CMake or building from source as a fallback.
- Edit `CMakeLists.txt`:
- Add option: `option(XRPL_ENABLE_TELEMETRY "Enable OpenTelemetry tracing" OFF)`
- When ON, `find_package(opentelemetry-cpp CONFIG REQUIRED)` and add compile definition `XRPL_ENABLE_TELEMETRY`
- When OFF, do nothing (zero build impact)
- Verify the build succeeds with `-DXRPL_ENABLE_TELEMETRY=OFF` (no regressions) and with `-DXRPL_ENABLE_TELEMETRY=ON` (SDK links successfully).
**Key files**:
- `conanfile.py`
- `CMakeLists.txt`
**Reference**:
- [05-configuration-reference.md §5.4](./05-configuration-reference.md) — CMake integration, `FindOpenTelemetry.cmake`, `XRPL_ENABLE_TELEMETRY` option
- [03-implementation-strategy.md §3.2](./03-implementation-strategy.md) — Key principle: zero-cost when disabled via compile-time flags
- [02-design-decisions.md §2.1](./02-design-decisions.md) — SDK selection rationale and required OTel components
---
## Task 2: Create Core Telemetry Interface and NullTelemetry
**Objective**: Define the `Telemetry` abstract interface and a no-op implementation so the rest of the codebase can reference telemetry without hard-depending on the OTel SDK.
**What to do**:
- Create `include/xrpl/telemetry/Telemetry.h`:
- Define `namespace xrpl::telemetry`
- Define `struct Telemetry::Setup` holding: `enabled`, `exporterEndpoint`, `samplingRatio`, `serviceName`, `serviceVersion`, `serviceInstanceId`, `traceRpc`, `traceTransactions`, `traceConsensus`, `tracePeer`
- Define abstract `class Telemetry` with:
- `virtual void start() = 0;`
- `virtual void stop() = 0;`
- `virtual bool isEnabled() const = 0;`
- `virtual nostd::shared_ptr<Tracer> getTracer(string_view name = "xrpld") = 0;`
- `virtual nostd::shared_ptr<Span> startSpan(string_view name, SpanKind kind = kInternal) = 0;`
- `virtual nostd::shared_ptr<Span> startSpan(string_view name, Context const& parentContext, SpanKind kind = kInternal) = 0;`
- `virtual bool shouldTraceRpc() const = 0;`
- `virtual bool shouldTraceTransactions() const = 0;`
- `virtual bool shouldTraceConsensus() const = 0;`
- Factory: `std::unique_ptr<Telemetry> make_Telemetry(Setup const&, beast::Journal);`
- Config parser: `Telemetry::Setup setup_Telemetry(Section const&, std::string const& nodePublicKey, std::string const& version);`
- Create `include/xrpl/telemetry/SpanGuard.h`:
- RAII guard that takes an `nostd::shared_ptr<Span>`, creates a `Scope`, and calls `span->End()` in destructor.
- Convenience: `setAttribute()`, `setOk()`, `setStatus()`, `addEvent()`, `recordException()`, `context()`
- See [04-code-samples.md](./04-code-samples.md) §4.2 for the full implementation.
- Create `src/libxrpl/telemetry/NullTelemetry.cpp`:
- Implements `Telemetry` with all no-ops.
- `isEnabled()` returns `false`, `startSpan()` returns a noop span.
- This is used when `XRPL_ENABLE_TELEMETRY` is OFF or `enabled=0` in config.
- Guard all OTel SDK headers behind `#ifdef XRPL_ENABLE_TELEMETRY`. The `NullTelemetry` implementation should compile without the OTel SDK present.
**Key new files**:
- `include/xrpl/telemetry/Telemetry.h`
- `include/xrpl/telemetry/SpanGuard.h`
- `src/libxrpl/telemetry/NullTelemetry.cpp`
**Reference**:
- [04-code-samples.md §4.1](./04-code-samples.md) — Full `Telemetry` interface with `Setup` struct, lifecycle, tracer access, span creation, and component filtering methods
- [04-code-samples.md §4.2](./04-code-samples.md) — Full `SpanGuard` RAII implementation and `NullSpanGuard` no-op class
- [03-implementation-strategy.md §3.1](./03-implementation-strategy.md) — Directory structure: `include/xrpl/telemetry/` for headers, `src/libxrpl/telemetry/` for implementation
- [03-implementation-strategy.md §3.7.3](./03-implementation-strategy.md) — Conditional instrumentation and zero-cost compile-time disabled pattern
---
## Task 3: Implement OTel-Backed Telemetry
> **OTLP** = OpenTelemetry Protocol
**Objective**: Implement the real `Telemetry` class that initializes the OTel SDK, configures the OTLP exporter and batch processor, and creates tracers/spans.
**What to do**:
- Create `src/libxrpl/telemetry/Telemetry.cpp` (compiled only when `XRPL_ENABLE_TELEMETRY=ON`):
- `class TelemetryImpl : public Telemetry` that:
- In `start()`: creates a `TracerProvider` with:
- Resource attributes: `service.name`, `service.version`, `service.instance.id`
- An `OtlpHttpExporter` pointed at `setup.exporterEndpoint` (default `localhost:4318`)
- A `BatchSpanProcessor` with configurable batch size and delay
- A `TraceIdRatioBasedSampler` using `setup.samplingRatio`
- Sets the global `TracerProvider`
- In `stop()`: calls `ForceFlush()` then shuts down the provider
- In `startSpan()`: delegates to `getTracer()->StartSpan(name, ...)`
- `shouldTraceRpc()` etc. read from `Setup` fields
- Create `src/libxrpl/telemetry/TelemetryConfig.cpp`:
- `setup_Telemetry()` parses the `[telemetry]` config section from `xrpld.cfg`
- Maps config keys: `enabled`, `exporter`, `endpoint`, `sampling_ratio`, `trace_rpc`, `trace_transactions`, `trace_consensus`, `trace_peer`
- Wire `make_Telemetry()` factory:
- If `setup.enabled` is true AND `XRPL_ENABLE_TELEMETRY` is defined: return `TelemetryImpl`
- Otherwise: return `NullTelemetry`
- Add telemetry source files to CMake. When `XRPL_ENABLE_TELEMETRY=ON`, compile `Telemetry.cpp` and `TelemetryConfig.cpp` and link against `opentelemetry-cpp::api`, `opentelemetry-cpp::sdk`, `opentelemetry-cpp::otlp_grpc_exporter`. When OFF, compile only `NullTelemetry.cpp`.
**Key new files**:
- `src/libxrpl/telemetry/Telemetry.cpp`
- `src/libxrpl/telemetry/TelemetryConfig.cpp`
**Key modified files**:
- `CMakeLists.txt` (add telemetry library target)
**Reference**:
- [04-code-samples.md §4.1](./04-code-samples.md) — `Telemetry` interface that `TelemetryImpl` must implement
- [05-configuration-reference.md §5.2](./05-configuration-reference.md) — `setup_Telemetry()` config parser implementation
- [02-design-decisions.md §2.2](./02-design-decisions.md) — OTLP/gRPC exporter config (endpoint, TLS options)
- [02-design-decisions.md §2.4.1](./02-design-decisions.md) — Resource attributes: `service.name`, `service.version`, `service.instance.id`, `xrpl.network.id`
- [03-implementation-strategy.md §3.4](./03-implementation-strategy.md) — Per-operation CPU costs and overhead budget for span creation
- [03-implementation-strategy.md §3.5](./03-implementation-strategy.md) — Memory overhead: static (~456 KB) and dynamic (~1.2 MB) budgets
---
## Task 4: Integrate Telemetry into Application Lifecycle
**Objective**: Wire the `Telemetry` object into the `ServiceRegistry` / `Application` so all components can access it.
**What to do**:
- Edit `include/xrpl/core/ServiceRegistry.h`:
- Forward-declare `namespace telemetry { class Telemetry; }` inside `namespace xrpl`
- Add pure virtual method: `virtual telemetry::Telemetry& getTelemetry() = 0;`
- (`Application` extends `ServiceRegistry`, so this is automatically available on `Application` too)
- Edit `src/xrpld/app/main/Application.cpp` (the `ApplicationImp` class):
- Add member: `std::unique_ptr<telemetry::Telemetry> telemetry_;`
- In the member initializer list, construct telemetry with an empty
`serviceInstanceId` (node identity is not yet known):
```cpp
, telemetry_(
telemetry::make_Telemetry(
telemetry::setup_Telemetry(
config_->section("telemetry"),
"", // Updated later via setServiceInstanceId()
BuildInfo::getVersionString()),
logs_->journal("Telemetry")))
```
- In `setup()`, after `nodeIdentity_` is resolved, inject the node
public key as the service instance ID:
```cpp
if (!config_->section("telemetry").exists("service_instance_id"))
telemetry_->setServiceInstanceId(
toBase58(TokenType::NodePublic, nodeIdentity_->first));
```
- In `start()`: call `telemetry_->start()`
- In `run()` (shutdown path): call `telemetry_->stop()` (to flush pending spans)
- Implement `getTelemetry()` override: return `*telemetry_`
- Add `[telemetry]` section to the example config `cfg/xrpld-example.cfg`:
```ini
# [telemetry]
# enabled=1
# endpoint=http://localhost:4318/v1/traces
# sampling_ratio=1.0
# trace_rpc=1
```
> **Access patterns**: Components holding `ServiceRegistry&` (e.g.
> `NetworkOPsImp`) call `registry_.get().getTelemetry()`. Components
> holding `Application&` (e.g. `ServerHandler`, `PeerImp`,
> `RCLConsensusAdaptor`) call `app_.getTelemetry()` directly. Both
> resolve to the same `Telemetry` instance.
**Key modified files**:
- `include/xrpl/core/ServiceRegistry.h`
- `src/xrpld/app/main/Application.cpp`
- `cfg/xrpld-example.cfg` (example config)
**Reference**:
- [05-configuration-reference.md §5.3](./05-configuration-reference.md) — `ApplicationImp` changes: member declaration, constructor init, `start()`/`stop()` wiring, `getTelemetry()` override
- [05-configuration-reference.md §5.1](./05-configuration-reference.md) — `[telemetry]` config section format and all option defaults
- [03-implementation-strategy.md §3.9.2](./03-implementation-strategy.md) — File impact assessment: `Application.cpp` ~15 lines added, ~3 changed (Low risk)
---
## Task 5: Create Instrumentation Macros
**Objective**: Define convenience macros that make instrumenting code one-liners, and that compile to zero-cost no-ops when telemetry is disabled.
**What to do**:
- Create `src/xrpld/telemetry/TracingInstrumentation.h`:
- When `XRPL_ENABLE_TELEMETRY` is defined:
```cpp
#define XRPL_TRACE_SPAN(telemetry, name) \
auto _xrpl_span_ = (telemetry).startSpan(name); \
::xrpl::telemetry::SpanGuard _xrpl_guard_(_xrpl_span_)
#define XRPL_TRACE_RPC(telemetry, name) \
std::optional<::xrpl::telemetry::SpanGuard> _xrpl_guard_; \
if ((telemetry).shouldTraceRpc()) { \
_xrpl_guard_.emplace((telemetry).startSpan(name)); \
}
#define XRPL_TRACE_SET_ATTR(key, value) \
if (_xrpl_guard_.has_value()) { \
_xrpl_guard_->setAttribute(key, value); \
}
#define XRPL_TRACE_EXCEPTION(e) \
if (_xrpl_guard_.has_value()) { \
_xrpl_guard_->recordException(e); \
}
```
- When `XRPL_ENABLE_TELEMETRY` is NOT defined, all macros expand to `((void)0)`
**Key new file**:
- `src/xrpld/telemetry/TracingInstrumentation.h`
**Reference**:
- [04-code-samples.md §4.3](./04-code-samples.md) — Full macro definitions for `XRPL_TRACE_SPAN`, `XRPL_TRACE_RPC`, `XRPL_TRACE_CONSENSUS`, `XRPL_TRACE_SET_ATTR`, `XRPL_TRACE_EXCEPTION` with both enabled and disabled branches
- [03-implementation-strategy.md §3.7.3](./03-implementation-strategy.md) — Conditional instrumentation pattern: compile-time `#ifndef` and runtime `shouldTrace*()` checks
- [03-implementation-strategy.md §3.9.7](./03-implementation-strategy.md) — Before/after code examples showing minimal intrusiveness (~1-3 lines per instrumentation point)
---
## Task 6: Instrument RPC ServerHandler
> **WS** = WebSocket
**Objective**: Add tracing to the HTTP RPC entry point so every incoming RPC request creates a span.
**What to do**:
- Edit `src/xrpld/rpc/detail/ServerHandler.cpp`:
- `#include` the `TracingInstrumentation.h` header
- In `ServerHandler::onRequest(Session& session)`:
- At the top of the method, add: `XRPL_TRACE_RPC(app_.getTelemetry(), "rpc.request");`
- After the RPC command name is extracted, set attribute: `XRPL_TRACE_SET_ATTR("xrpl.rpc.command", command);`
- After the response status is known, set: `XRPL_TRACE_SET_ATTR("http.status_code", static_cast<int64_t>(statusCode));`
- Wrap error paths with: `XRPL_TRACE_EXCEPTION(e);`
- In `ServerHandler::processRequest(...)`:
- Add a child span: `XRPL_TRACE_RPC(app_.getTelemetry(), "rpc.process");`
- Set method attribute: `XRPL_TRACE_SET_ATTR("xrpl.rpc.method", request_method);`
- In `ServerHandler::onWSMessage(...)` (WebSocket path):
- Add: `XRPL_TRACE_RPC(app_.getTelemetry(), "rpc.ws.message");`
- The goal is to see spans like:
```
rpc.request
└── rpc.process
```
in Tempo/Grafana for every HTTP RPC call.
**Key modified file**:
- `src/xrpld/rpc/detail/ServerHandler.cpp` (~15-25 lines added)
**Reference**:
- [04-code-samples.md §4.5.3](./04-code-samples.md) — Complete `ServerHandler::onRequest()` instrumented code sample with W3C header extraction, span creation, attribute setting, and error handling
- [01-architecture-analysis.md §1.5](./01-architecture-analysis.md) — RPC request flow diagram: HTTP request -> attributes -> jobqueue.enqueue -> rpc.command -> response
- [01-architecture-analysis.md §1.6](./01-architecture-analysis.md) — Key trace points table: `rpc.request` in `ServerHandler.cpp::onRequest()` (Priority: High)
- [02-design-decisions.md §2.3](./02-design-decisions.md) — Span naming convention: `rpc.request`, `rpc.command.*`
- [02-design-decisions.md §2.4.2](./02-design-decisions.md) — RPC span attributes: `xrpl.rpc.command`, `xrpl.rpc.version`, `xrpl.rpc.role`, `xrpl.rpc.params`
- [03-implementation-strategy.md §3.9.2](./03-implementation-strategy.md) — File impact: `ServerHandler.cpp` ~40 lines added, ~10 changed (Low risk)
---
## Task 7: Instrument RPC Command Execution
**Objective**: Add per-command tracing inside the RPC handler so each command (e.g., `submit`, `account_info`, `server_info`) gets its own child span.
**What to do**:
- Edit `src/xrpld/rpc/detail/RPCHandler.cpp`:
- `#include` the `TracingInstrumentation.h` header
- In `doCommand(RPC::JsonContext& context, Json::Value& result)`:
- At the top: `XRPL_TRACE_RPC(context.app.getTelemetry(), "rpc.command." + context.method);`
- Set attributes:
- `XRPL_TRACE_SET_ATTR("xrpl.rpc.command", context.method);`
- `XRPL_TRACE_SET_ATTR("xrpl.rpc.version", static_cast<int64_t>(context.apiVersion));`
- `XRPL_TRACE_SET_ATTR("xrpl.rpc.role", (context.role == Role::ADMIN) ? "admin" : "user");`
- On success: `XRPL_TRACE_SET_ATTR("xrpl.rpc.status", "success");`
- On error: `XRPL_TRACE_SET_ATTR("xrpl.rpc.status", "error");` and set the error message
- After this, traces in Tempo/Grafana should look like:
```
rpc.request (xrpl.rpc.command=account_info)
└── rpc.process
└── rpc.command.account_info (xrpl.rpc.version=2, xrpl.rpc.role=user, xrpl.rpc.status=success)
```
**Key modified file**:
- `src/xrpld/rpc/detail/RPCHandler.cpp` (~15-20 lines added)
**Reference**:
- [04-code-samples.md §4.5.3](./04-code-samples.md) — `ServerHandler::onRequest()` code sample (includes child span pattern for `rpc.command.*`)
- [02-design-decisions.md §2.3](./02-design-decisions.md) — Span naming: `rpc.command.*` pattern with dynamic command name (e.g., `rpc.command.server_info`)
- [02-design-decisions.md §2.4.2](./02-design-decisions.md) — RPC attribute schema: `xrpl.rpc.command`, `xrpl.rpc.version`, `xrpl.rpc.role`, `xrpl.rpc.status`
- [01-architecture-analysis.md §1.6](./01-architecture-analysis.md) — Key trace points table: `rpc.command.*` in `RPCHandler.cpp::doCommand()` (Priority: High)
- [02-design-decisions.md §2.6.5](./02-design-decisions.md) — Correlation with PerfLog: how `doCommand()` can link trace_id with existing PerfLog entries
- [03-implementation-strategy.md §3.4.4](./03-implementation-strategy.md) — RPC request overhead budget: ~1.75 μs total per request
---
## Task 8: Build, Run, and Verify End-to-End
> **OTLP** = OpenTelemetry Protocol
**Objective**: Prove the full pipeline works: xrpld emits traces -> OTel Collector receives them -> Tempo stores them for Grafana visualization.
**What to do**:
1. **Start the Docker stack**:
```bash
docker compose -f docker/telemetry/docker-compose.yml up -d
```
Verify Collector health: `curl http://localhost:13133`
2. **Build xrpld with telemetry**:
```bash
# Adjust for your actual build workflow
conan install . --build=missing -o with_telemetry=True
cmake --preset default -DXRPL_ENABLE_TELEMETRY=ON
cmake --build --preset default
```
3. **Configure xrpld**:
Add to `xrpld.cfg` (or your local test config):
```ini
[telemetry]
enabled=1
endpoint=localhost:4317
sampling_ratio=1.0
trace_rpc=1
```
4. **Start xrpld** in standalone mode:
```bash
./rippled --conf xrpld.cfg -a --start
```
5. **Generate RPC traffic**:
```bash
# server_info
curl -s -X POST http://localhost:5005 \
-H "Content-Type: application/json" \
-d '{"method":"server_info","params":[{}]}'
# ledger
curl -s -X POST http://localhost:5005 \
-H "Content-Type: application/json" \
-d '{"method":"ledger","params":[{"ledger_index":"current"}]}'
# account_info (will error in standalone, that's fine — we trace errors too)
curl -s -X POST http://localhost:5005 \
-H "Content-Type: application/json" \
-d '{"method":"account_info","params":[{"account":"rHb9CJAWyB4rj91VRWn96DkukG4bwdtyTh"}]}'
```
6. **Verify in Grafana (Tempo)**:
- Open `http://localhost:3000`
- Navigate to Explore → select Tempo datasource
- Search for service `xrpld`
- Confirm you see traces with spans: `rpc.request` -> `rpc.process` -> `rpc.command.server_info`
- Click into a trace and verify attributes: `xrpl.rpc.command`, `xrpl.rpc.status`, `xrpl.rpc.version`
7. **Verify zero-overhead when disabled**:
- Rebuild with `XRPL_ENABLE_TELEMETRY=OFF`, or set `enabled=0` in config
- Run the same RPC calls
- Confirm no new traces appear and no errors in xrpld logs
**Verification Checklist**:
- [ ] Docker stack starts without errors
- [ ] xrpld builds with `-DXRPL_ENABLE_TELEMETRY=ON`
- [ ] xrpld starts and connects to OTel Collector (check xrpld logs for telemetry messages)
- [ ] Traces appear in Grafana/Tempo under service "xrpld"
- [ ] Span hierarchy is correct (parent-child relationships)
- [ ] Span attributes are populated (`xrpl.rpc.command`, `xrpl.rpc.status`, etc.)
- [ ] Error spans show error status and message
- [ ] Building with `XRPL_ENABLE_TELEMETRY=OFF` produces no regressions
- [ ] Setting `enabled=0` at runtime produces no traces and no errors
**Reference**:
- [06-implementation-phases.md §6.11.1](./06-implementation-phases.md) — Phase 1 definition of done: SDK compiles, runtime toggle works, span creation verified in Tempo, config validation passes
- [06-implementation-phases.md §6.11.2](./06-implementation-phases.md#6112-phase-2-rpc-tracing) — Phase 2 definition of done: 100% RPC coverage, traceparent propagation, <1ms p99 overhead, dashboard deployed
- [06-implementation-phases.md §6.8](./06-implementation-phases.md) — Success metrics: trace coverage >95%, CPU overhead <3%, memory <5 MB, latency impact <2%
- [03-implementation-strategy.md §3.9.5](./03-implementation-strategy.md) — Backward compatibility: config optional, protocol unchanged, `XRPL_ENABLE_TELEMETRY=OFF` produces identical binary
- [01-architecture-analysis.md §1.8](./01-architecture-analysis.md) — Observable outcomes: what traces, metrics, and dashboards to expect
---
## Task 9: Document POC Results and Next Steps
> **OTLP** = OpenTelemetry Protocol | **WS** = WebSocket
**Objective**: Capture findings, screenshots, and remaining work for the team.
**What to do**:
- Take screenshots of Grafana/Tempo showing:
- The service list with "xrpld"
- A trace with the full span tree
- Span detail view showing attributes
- Document any issues encountered (build issues, SDK quirks, missing attributes)
- Note performance observations (build time impact, any noticeable runtime overhead)
- Write a short summary of what the POC proves and what it doesn't cover yet:
- **Proves**: OTel SDK integrates with xrpld, OTLP export works, RPC traces visible
- **Doesn't cover**: Cross-node P2P context propagation, consensus tracing, protobuf trace context, W3C traceparent header extraction, tail-based sampling, production deployment
- Outline next steps (mapping to the full plan phases):
- [Phase 2](./06-implementation-phases.md) completion: [W3C header extraction](./02-design-decisions.md) (§2.5), WebSocket tracing, all [RPC handlers](./01-architecture-analysis.md) (§1.6)
- [Phase 3](./06-implementation-phases.md): [Protobuf `TraceContext` message](./04-code-samples.md) (§4.4), [transaction relay tracing](./04-code-samples.md) (§4.5.1) across nodes
- [Phase 4](./06-implementation-phases.md): [Consensus round and phase tracing](./04-code-samples.md) (§4.5.2)
- [Phase 5](./06-implementation-phases.md): [Production collector config](./05-configuration-reference.md) (§5.5.2), [Grafana dashboards](./07-observability-backends.md) (§7.6), [alerting](./07-observability-backends.md) (§7.6.3)
**Reference**:
- [06-implementation-phases.md §6.1](./06-implementation-phases.md) — Full 5-phase timeline overview and Gantt chart
- [06-implementation-phases.md §6.10](./06-implementation-phases.md) — Crawl-Walk-Run strategy: POC is the CRAWL phase, next steps are WALK and RUN
- [06-implementation-phases.md §6.12](./06-implementation-phases.md) — Recommended implementation order (14 steps across 9 weeks)
- [03-implementation-strategy.md §3.9](./03-implementation-strategy.md) — Code intrusiveness assessment and risk matrix for each remaining component
- [07-observability-backends.md §7.2](./07-observability-backends.md) — Production backend selection (Tempo, Elastic APM, Honeycomb, Datadog)
- [02-design-decisions.md §2.5](./02-design-decisions.md) — Context propagation design: W3C HTTP headers, protobuf P2P, JobQueue internal
- [00-tracing-fundamentals.md](./00-tracing-fundamentals.md) — Reference for team onboarding on distributed tracing concepts
---
## Summary
| Task | Description | New Files | Modified Files | Depends On |
| ---- | ------------------------------------ | --------- | -------------- | ---------- |
| 0 | Docker observability stack | 4 | 0 | — |
| 1 | OTel C++ SDK dependency | 0 | 2 | — |
| 2 | Core Telemetry interface + NullImpl | 3 | 0 | 1 |
| 3 | OTel-backed Telemetry implementation | 2 | 1 | 1, 2 |
| 4 | Application lifecycle integration | 0 | 3 | 2, 3 |
| 5 | Instrumentation macros | 1 | 0 | 2 |
| 6 | Instrument RPC ServerHandler | 0 | 1 | 4, 5 |
| 7 | Instrument RPC command execution | 0 | 1 | 4, 5 |
| 8 | End-to-end verification | 0 | 0 | 0-7 |
| 9 | Document results and next steps | 1 | 0 | 8 |
**Parallel work**: Tasks 0 and 1 can run in parallel. Tasks 2 and 5 have no dependency on each other. Tasks 6 and 7 can be done in parallel once Tasks 4 and 5 are complete.
---
## Next Steps (Post-POC)
> **OTLP** = OpenTelemetry Protocol | **WS** = WebSocket
### Metrics Pipeline for Grafana Dashboards
The current POC exports **traces only**. Grafana's Explore view can query Tempo for individual traces, but time-series charts (latency histograms, request throughput, error rates) require a **metrics pipeline**. To enable this:
1. **Add a `spanmetrics` connector** to the OTel Collector config that derives RED metrics (Rate, Errors, Duration) from trace spans automatically:
```yaml
connectors:
spanmetrics:
histogram:
explicit:
buckets: [1ms, 5ms, 10ms, 25ms, 50ms, 100ms, 250ms, 500ms, 1s, 5s]
dimensions:
- name: xrpl.rpc.command
- name: xrpl.rpc.status
exporters:
prometheus:
endpoint: 0.0.0.0:8889
service:
pipelines:
traces:
receivers: [otlp]
processors: [batch]
exporters: [debug, otlp/tempo, spanmetrics]
metrics:
receivers: [spanmetrics]
exporters: [prometheus]
```
2. **Add Prometheus** to the Docker Compose stack to scrape the collector's metrics endpoint.
3. **Add Prometheus as a Grafana datasource** and build dashboards for:
- RPC request latency (p50/p95/p99) by command
- RPC throughput (requests/sec) by command
- Error rate by command
- Span duration distribution
### Additional Instrumentation
- **W3C `traceparent` header extraction** in `ServerHandler` to support cross-service context propagation from external callers
- **WebSocket RPC tracing** in `ServerHandler::onWSMessage()`
- **Transaction relay tracing** across nodes using protobuf `TraceContext` messages
- **Consensus round and phase tracing** for validator coordination visibility
- **Ledger close tracing** to measure close-to-validated latency
### Production Hardening
- **Tail-based sampling** in the OTel Collector to reduce volume while retaining error/slow traces
- **TLS configuration** for the OTLP exporter in production deployments
- **Resource limits** on the batch processor queue to prevent unbounded memory growth
- **Health monitoring** for the telemetry pipeline itself (collector lag, export failures)
### POC Lessons Learned
Issues encountered during POC implementation that inform future work:
| Issue | Resolution | Impact on Future Work |
| -------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------- | ---------------------------------------------------------------- |
| Conan lockfile rejected `opentelemetry-cpp/1.18.0` | Used `--lockfile=""` to bypass | Lockfile must be regenerated when adding new dependencies |
| Conan package only builds OTLP HTTP exporter, not gRPC | Switched from gRPC to HTTP exporter (`localhost:4318/v1/traces`) | HTTP exporter is the default; gRPC requires custom Conan profile |
| CMake target `opentelemetry-cpp::api` etc. don't exist in Conan package | Use umbrella target `opentelemetry-cpp::opentelemetry-cpp` | Conan targets differ from upstream CMake targets |
| OTel Collector `logging` exporter deprecated | Renamed to `debug` exporter | Use `debug` in all collector configs going forward |
| Macro parameter `telemetry` collided with `::xrpl::telemetry::` namespace | Renamed macro params to `_tel_obj_`, `_span_name_` | Avoid common words as macro parameter names |
| `opentelemetry::trace::Scope` creates new context on move | Store scope as member, create once in constructor | SpanGuard move semantics need care with Scope lifecycle |
| `TracerProviderFactory::Create` returns `unique_ptr<sdk::TracerProvider>`, not `nostd::shared_ptr` | Use `std::shared_ptr` member, wrap in `nostd::shared_ptr` for global provider | OTel SDK factory return types don't match API provider types |

View File

@@ -0,0 +1,673 @@
# OpenTelemetry Distributed Tracing for xrpld
---
## Slide 1: Introduction
> **CNCF** = Cloud Native Computing Foundation
### What is OpenTelemetry?
OpenTelemetry is an open-source, CNCF-backed observability framework for distributed tracing, metrics, and logs.
### Why OpenTelemetry for xrpld?
- **End-to-End Transaction Visibility**: Track transactions from submission → consensus → ledger inclusion
- **Cross-Node Correlation**: Follow requests across multiple independent nodes using a unique `trace_id`
- **Consensus Round Analysis**: Understand timing and behavior across validators
- **Incident Debugging**: Correlate events across distributed nodes during issues
```mermaid
flowchart LR
A["Node A<br/>tx.receive<br/>trace_id: abc123"] --> B["Node B<br/>tx.relay<br/>trace_id: abc123"] --> C["Node C<br/>tx.validate<br/>trace_id: abc123"] --> D["Node D<br/>ledger.apply<br/>trace_id: abc123"]
style A fill:#1565c0,stroke:#0d47a1,color:#fff
style B fill:#2e7d32,stroke:#1b5e20,color:#fff
style C fill:#2e7d32,stroke:#1b5e20,color:#fff
style D fill:#e65100,stroke:#bf360c,color:#fff
```
**Reading the diagram:**
- **Node A (blue, leftmost)**: The originating node that first receives the transaction and assigns a new `trace_id: abc123`; this ID becomes the correlation key for the entire distributed trace.
- **Node B and Node C (green, middle)**: Relay and validation nodes — each creates its own span but carries the same `trace_id`, so their work is linked to the original submission without any central coordinator.
- **Node D (orange, rightmost)**: The final node that applies the transaction to the ledger; the trace now spans the full lifecycle from submission to ledger inclusion.
- **Left-to-right flow**: The horizontal progression shows the real-world message path — a transaction hops from node to node, and the shared `trace_id` stitches all hops into a single queryable trace.
> **Trace ID: abc123** — All nodes share the same trace, enabling cross-node correlation.
---
## Slide 2: OpenTelemetry vs Open Source Alternatives
> **CNCF** = Cloud Native Computing Foundation
| Feature | OpenTelemetry | Jaeger | Zipkin | SkyWalking | Pinpoint | Prometheus |
| ------------------- | ---------------- | ---------------- | ------------------ | ---------- | ---------- | ---------- |
| **Tracing** | YES | YES | YES | YES | YES | NO |
| **Metrics** | YES | NO | NO | YES | YES | YES |
| **Logs** | YES | NO | NO | YES | NO | NO |
| **C++ SDK** | YES Official | YES (Deprecated) | YES (Unmaintained) | NO | NO | YES |
| **Vendor Neutral** | YES Primary goal | NO | NO | NO | NO | NO |
| **Instrumentation** | Manual + Auto | Manual | Manual | Auto-first | Auto-first | Manual |
| **Backend** | Any (exporters) | Self | Self | Self | Self | Self |
| **CNCF Status** | Incubating | Graduated | NO | Incubating | NO | Graduated |
> **Why OpenTelemetry?** It's the only actively maintained, full-featured C++ option with vendor neutrality — allowing export to Tempo, Prometheus, Grafana, or any commercial backend without changing instrumentation.
---
## Slide 3: Adoption Scope — Traces Only (Current Plan)
OpenTelemetry supports three signal types: **Traces**, **Metrics**, and **Logs**. xrpld already captures metrics (StatsD via Beast Insight) and logs (Journal/PerfLog). The question is: how much of OTel do we adopt?
> **Scenario A**: Add distributed tracing. Keep StatsD for metrics and Journal for logs.
```mermaid
flowchart LR
subgraph xrpld["xrpld Process"]
direction TB
OTel["OTel SDK<br/>(Traces)"]
Insight["Beast Insight<br/>(StatsD Metrics)"]
Journal["Journal + PerfLog<br/>(Logging)"]
end
OTel -->|"OTLP"| Collector["OTel Collector"]
Insight -->|"UDP"| StatsD["StatsD Server"]
Journal -->|"File I/O"| LogFile["perf.log / debug.log"]
Collector --> Tempo["Tempo"]
StatsD --> Graphite["Graphite / Grafana"]
LogFile --> Loki["Loki (optional)"]
style xrpld fill:#424242,stroke:#212121,color:#fff
style OTel fill:#2e7d32,stroke:#1b5e20,color:#fff
style Insight fill:#1565c0,stroke:#0d47a1,color:#fff
style Journal fill:#e65100,stroke:#bf360c,color:#fff
style Collector fill:#2e7d32,stroke:#1b5e20,color:#fff
```
| Aspect | Details |
| ------------------------------ | --------------------------------------------------------------------------------------------------------------- |
| **What changes for operators** | Deploy OTel Collector + trace backend. Existing StatsD and log pipelines stay as-is. |
| **Codebase impact** | New `Telemetry` module (~1500 LOC). Beast Insight and Journal untouched. |
| **New capabilities** | Cross-node trace correlation, span-based debugging, request lifecycle visibility. |
| **What we still can't do** | Correlate metrics with specific traces natively. StatsD metrics remain fire-and-forget with no trace exemplars. |
| **Maintenance burden** | Three separate observability systems to maintain (OTel + StatsD + Journal). |
| **Risk** | Lowest — additive change, no existing systems disturbed. |
---
## Slide 4: Future Adoption — Metrics & Logs via OTel
### Scenario B: + OTel Metrics (Replace StatsD)
> Migrate StatsD to OTel Metrics API, exposing Prometheus-compatible metrics. Remove Beast Insight.
```mermaid
flowchart LR
subgraph xrpld["xrpld Process"]
direction TB
OTel["OTel SDK<br/>(Traces + Metrics)"]
Journal["Journal + PerfLog<br/>(Logging)"]
end
OTel -->|"OTLP"| Collector["OTel Collector"]
Journal -->|"File I/O"| LogFile["perf.log / debug.log"]
Collector --> Tempo["Tempo<br/>(Traces)"]
Collector --> Prom["Prometheus<br/>(Metrics)"]
LogFile --> Loki["Loki (optional)"]
style xrpld fill:#424242,stroke:#212121,color:#fff
style OTel fill:#2e7d32,stroke:#1b5e20,color:#fff
style Journal fill:#e65100,stroke:#bf360c,color:#fff
style Collector fill:#2e7d32,stroke:#1b5e20,color:#fff
```
- **Better metrics?** Yes — Prometheus gives native histograms (p50/p95/p99), multi-dimensional labels, and exemplars linking metric spikes to traces.
- **Codebase**: Remove `Beast::Insight` + `StatsDCollector` (~2000 LOC). Single SDK for traces and metrics.
- **Operator effort**: Rewrite dashboards from StatsD/Graphite queries to PromQL. Run both in parallel during transition.
- **Risk**: Medium — operators must migrate monitoring infrastructure.
### Scenario C: + OTel Logs (Full Stack)
> Also replace Journal logging with OTel Logs API. Single SDK for everything.
```mermaid
flowchart LR
subgraph xrpld["xrpld Process"]
OTel["OTel SDK<br/>(Traces + Metrics + Logs)"]
end
OTel -->|"OTLP"| Collector["OTel Collector"]
Collector --> Tempo["Tempo<br/>(Traces)"]
Collector --> Prom["Prometheus<br/>(Metrics)"]
Collector --> Loki["Loki / Elastic<br/>(Logs)"]
style xrpld fill:#424242,stroke:#212121,color:#fff
style OTel fill:#2e7d32,stroke:#1b5e20,color:#fff
style Collector fill:#2e7d32,stroke:#1b5e20,color:#fff
```
- **Structured logging**: OTel Logs API outputs structured records with `trace_id`, `span_id`, severity, and attributes by design.
- **Full correlation**: Every log line carries `trace_id`. Click trace → see logs. Click metric spike → see trace → see logs.
- **Codebase**: Remove Beast Insight (~2000 LOC) + simplify Journal/PerfLog (~3000 LOC). One dependency instead of three.
- **Risk**: Highest — `beast::Journal` is deeply embedded in every component. Large refactor. OTel C++ Logs API is newer (stable since v1.11, less battle-tested).
### Recommendation
```mermaid
flowchart LR
A["Phase 1<br/><b>Traces Only</b><br/>(Current Plan)"] --> B["Phase 2<br/><b>+ Metrics</b><br/>(Replace StatsD)"] --> C["Phase 3<br/><b>+ Logs</b><br/>(Full OTel)"]
style A fill:#2e7d32,stroke:#1b5e20,color:#fff
style B fill:#1565c0,stroke:#0d47a1,color:#fff
style C fill:#e65100,stroke:#bf360c,color:#fff
```
| Phase | Signal | Strategy | Risk |
| -------------------- | --------- | -------------------------------------------------------------- | ------ |
| **Phase 1** (now) | Traces | Add OTel traces. Keep StatsD and Journal. Prove value. | Low |
| **Phase 2** (future) | + Metrics | Migrate StatsD → Prometheus via OTel. Remove Beast Insight. | Medium |
| **Phase 3** (future) | + Logs | Adopt OTel Logs API. Align with structured logging initiative. | High |
> **Key Takeaway**: Start with traces (unique value, lowest risk), then incrementally adopt metrics and logs as the OTel infrastructure proves itself.
---
## Slide 5: Comparison with xrpld's Existing Solutions
### Current Observability Stack
| Aspect | PerfLog (JSON) | StatsD (Metrics) | OpenTelemetry (NEW) |
| --------------------- | --------------------- | --------------------- | --------------------------- |
| **Type** | Logging | Metrics | Distributed Tracing |
| **Scope** | Single node | Single node | **Cross-node** |
| **Data** | JSON log entries | Counters, gauges | Spans with context |
| **Correlation** | By timestamp | By metric name | By `trace_id` |
| **Overhead** | Low (file I/O) | Low (UDP) | Low-Medium (configurable) |
| **Question Answered** | "What happened here?" | "How many? How fast?" | **"What was the journey?"** |
### Use Case Matrix
| Scenario | PerfLog | StatsD | OpenTelemetry |
| -------------------------------- | ------- | ------ | ------------- |
| "How many TXs per second?" | ❌ | ✅ | ❌ |
| "Why was this specific TX slow?" | ⚠️ | ❌ | ✅ |
| "Which node delayed consensus?" | ❌ | ❌ | ✅ |
| "Show TX journey across 5 nodes" | ❌ | ❌ | ✅ |
> **Key Insight**: In the **traces-only** approach (Phase 1), OpenTelemetry **complements** existing systems. In future phases, OTel metrics and logs could **replace** StatsD and Journal respectively — see Slides 3-4 for the full adoption roadmap.
---
## Slide 6: Architecture
> **OTLP** = OpenTelemetry Protocol | **WS** = WebSocket
### High-Level Integration Architecture
```mermaid
flowchart TB
subgraph xrpld["xrpld Node"]
subgraph services["Core Services"]
direction LR
RPC["RPC Server<br/>(HTTP/WS)"] ~~~ Overlay["Overlay<br/>(P2P Network)"] ~~~ Consensus["Consensus<br/>(RCLConsensus)"]
end
Telemetry["Telemetry Module<br/>(OpenTelemetry SDK)"]
services --> Telemetry
end
Telemetry -->|OTLP/gRPC| Collector["OTel Collector"]
Collector --> Tempo["Grafana Tempo"]
Collector --> Elastic["Elastic APM"]
style xrpld fill:#424242,stroke:#212121,color:#fff
style services fill:#1565c0,stroke:#0d47a1,color:#fff
style Telemetry fill:#2e7d32,stroke:#1b5e20,color:#fff
style Collector fill:#e65100,stroke:#bf360c,color:#fff
```
**Reading the diagram:**
- **Core Services (blue, top)**: RPC Server, Overlay, and Consensus are the three primary components that generate trace data — they represent the entry points for client requests, peer messages, and consensus rounds respectively.
- **Telemetry Module (green, middle)**: The OpenTelemetry SDK sits below the core services and receives span data from all three; it acts as a single collection point within the xrpld process.
- **OTel Collector (orange, center)**: An external process that receives spans over OTLP/gRPC from the Telemetry Module; it decouples xrpld from backend choices and handles batching, sampling, and routing.
- **Backends (bottom row)**: Tempo and Elastic APM are interchangeable — the Collector fans out to any combination, so operators can switch backends without modifying xrpld code.
- **Top-to-bottom flow**: Data flows from instrumented code down through the SDK, out over the network to the Collector, and finally into storage/visualization backends.
### Context Propagation
```mermaid
sequenceDiagram
participant Client
participant NodeA as Node A
participant NodeB as Node B
Client->>NodeA: Submit TX (no context)
Note over NodeA: Creates trace_id: abc123<br/>span: tx.receive
NodeA->>NodeB: Relay TX<br/>(traceparent: abc123)
Note over NodeB: Links to trace_id: abc123<br/>span: tx.relay
```
- **HTTP/RPC**: W3C Trace Context headers (`traceparent`)
- **P2P Messages**: Protocol Buffer extension fields
---
## Slide 7: Implementation Plan
### 5-Phase Rollout (9 Weeks)
> **Note**: Dates shown are relative to project start, not calendar dates.
```mermaid
gantt
title Implementation Timeline
dateFormat YYYY-MM-DD
axisFormat Week %W
section Phase 1
Core Infrastructure :p1, 2024-01-01, 2w
section Phase 2
RPC Tracing :p2, after p1, 2w
section Phase 3
Transaction Tracing :p3, after p2, 2w
section Phase 4
Consensus Tracing :p4, after p3, 2w
section Phase 5
Documentation :p5, after p4, 1w
```
### Phase Details
| Phase | Focus | Key Deliverables | Effort |
| ----- | ------------------- | -------------------------------------------- | ------- |
| 1 | Core Infrastructure | SDK integration, Telemetry interface, Config | 10 days |
| 2 | RPC Tracing | HTTP context extraction, Handler spans | 10 days |
| 3 | Transaction Tracing | Protobuf context, P2P relay propagation | 10 days |
| 4 | Consensus Tracing | Round spans, Proposal/validation tracing | 10 days |
| 5 | Documentation | Runbook, Dashboards, Training | 7 days |
**Total Effort**: ~47 developer-days (2 developers)
> **Future Phases** (not in current scope): After traces are stable, OTel metrics can replace StatsD (~3 weeks), and OTel logs can replace Journal (~4 weeks, aligned with structured logging initiative). See Slides 3-4 for the full adoption roadmap.
---
## Slide 8: Performance Overhead
> **OTLP** = OpenTelemetry Protocol
### Estimated System Impact
| Metric | Overhead | Notes |
| ----------------- | ---------- | ------------------------------------------------ |
| **CPU** | 1-3% | Span creation and attribute setting |
| **Memory** | ~10 MB | SDK statics + batch buffer + worker thread stack |
| **Network** | 10-50 KB/s | Compressed OTLP export to collector |
| **Latency (p99)** | <2% | With proper sampling configuration |
#### How We Arrived at These Numbers
**Assumptions (XRPL mainnet baseline)**:
| Parameter | Value | Source |
| ------------------------- | ---------------------- | --------------------------------------------------------------------------------------------------- |
| Transaction throughput | ~25 TPS (peaks to ~50) | Mainnet average |
| Default peers per node | 21 | `peerfinder/detail/Tuning.h` (`defaultMaxPeers`) |
| Consensus round frequency | ~1 round / 3-4 seconds | `ConsensusParms.h` (`ledgerMIN_CONSENSUS=1950ms`) |
| Proposers per round | ~20-35 | Mainnet UNL size |
| P2P message rate | ~160 msgs/sec | See message breakdown below |
| Avg TX processing time | ~200 μs | Profiled baseline |
| Single span creation cost | 500-1000 ns | OTel C++ SDK benchmarks (see [3.5.4](./03-implementation-strategy.md#354-performance-data-sources)) |
**P2P message breakdown** (per node, mainnet):
| Message Type | Rate | Derivation |
| ------------- | ------------ | --------------------------------------------------------------------- |
| TMTransaction | ~100/sec | ~25 TPS × ~4 relay hops per TX, deduplicated by HashRouter |
| TMValidation | ~50/sec | ~35 validators × ~1 validation/3s round ~12/sec, plus relay fan-out |
| TMProposeSet | ~10/sec | ~35 proposers / 3s round ~12/round, clustered in establish phase |
| **Total** | **~160/sec** | **Only traced message types counted** |
**CPU (1-3%) — Calculation**:
Per-transaction tracing cost breakdown:
| Operation | Cost | Notes |
| ----------------------------------------------- | ----------- | ------------------------------------------ |
| `tx.receive` span (create + end + 4 attributes) | ~1400 ns | ~1000ns create + ~200ns end + 4×50ns attrs |
| `tx.validate` span | ~1200 ns | ~1000ns create + ~200ns for 2 attributes |
| `tx.relay` span | ~1200 ns | ~1000ns create + ~200ns for 2 attributes |
| Context injection into P2P message | ~200 ns | Serialize trace_id + span_id into protobuf |
| **Total per TX** | **~4.0 μs** | |
> **CPU overhead**: 4.0 μs / 200 μs baseline = **~2.0% per transaction**. Under high load with consensus + RPC spans overlapping, reaches ~3%. Consensus itself adds only ~36 μs per 3-second round (~0.001%), so the TX path dominates. On production server hardware (3+ GHz Xeon), span creation drops to ~500-600 ns, bringing per-TX cost to ~2.6 μs (~1.3%). See [Section 3.5.4](./03-implementation-strategy.md#354-performance-data-sources) for benchmark sources.
**Memory (~10 MB) — Calculation**:
| Component | Size | Notes |
| --------------------------------------------- | ------------------ | ------------------------------------- |
| TracerProvider + Exporter (gRPC channel init) | ~320 KB | Allocated once at startup |
| BatchSpanProcessor (circular buffer) | ~16 KB | 2049 × 8-byte AtomicUniquePtr entries |
| BatchSpanProcessor (worker thread stack) | ~8 MB | Default Linux thread stack size |
| Active spans (in-flight, max ~1000) | ~500-800 KB | ~500-800 bytes/span × 1000 concurrent |
| Export queue (batch buffer, max 2048 spans) | ~1 MB | ~500 bytes/span × 2048 queue depth |
| Thread-local context storage (~100 threads) | ~6.4 KB | ~64 bytes/thread |
| **Total** | **~10 MB ceiling** | |
> Memory plateaus once the export queue fills — the `max_queue_size=2048` config bounds growth.
> The worker thread stack (~8 MB) dominates the static footprint but is virtual memory; actual RSS
> depends on stack usage (typically much less). Active spans are larger than originally estimated
> (~500-800 bytes) because the OTel SDK `Span` object includes a mutex (~40 bytes), `SpanData`
> recordable (~250 bytes base), and `std::map`-based attribute storage (~200-500 bytes for 3-5
> string attributes). See [Section 3.5.4](./03-implementation-strategy.md#354-performance-data-sources) for source references.
**Network (10-50 KB/s) — Calculation**:
Two sources of network overhead:
**(A) OTLP span export to Collector:**
| Sampling Rate | Effective Spans/sec | Avg Span Size (compressed) | Bandwidth |
| -------------------------- | ------------------- | -------------------------- | ------------ |
| 100% (dev only) | ~500 | ~500 bytes | ~250 KB/s |
| **10% (recommended prod)** | **~50** | **~500 bytes** | **~25 KB/s** |
| 1% (minimal) | ~5 | ~500 bytes | ~2.5 KB/s |
> The ~500 spans/sec at 100% comes from: ~100 TX spans + ~160 P2P context spans + ~23 consensus spans/round + ~50 RPC spans = ~500/sec. OTLP protobuf with gzip compression yields ~500 bytes/span average.
**(B) P2P trace context overhead** (added to existing messages, always-on regardless of sampling):
| Message Type | Rate | Context Size | Bandwidth |
| ------------- | -------- | ------------ | ------------- |
| TMTransaction | ~100/sec | 29 bytes | ~2.9 KB/s |
| TMValidation | ~50/sec | 29 bytes | ~1.5 KB/s |
| TMProposeSet | ~10/sec | 29 bytes | ~0.3 KB/s |
| **Total P2P** | | | **~4.7 KB/s** |
> **Combined**: 25 KB/s (OTLP export at 10%) + 5 KB/s (P2P context) ≈ **~30 KB/s typical**. The 10-50 KB/s range covers 10-20% sampling under normal to peak mainnet load.
**Latency (<2%) — Calculation**:
| Path | Tracing Cost | Baseline | Overhead |
| ------------------------------ | ------------ | -------- | -------- |
| Fast RPC (e.g., `server_info`) | 2.75 μs | ~1 ms | 0.275% |
| Slow RPC (e.g., `path_find`) | 2.75 μs | ~100 ms | 0.003% |
| Transaction processing | 4.0 μs | ~200 μs | 2.0% |
| Consensus round | 36 μs | ~3 sec | 0.001% |
> At p99, even the worst case (TX processing at 2.0%) is within the 1-3% range. RPC and consensus overhead are negligible. On production hardware, TX overhead drops to ~1.3%.
### Per-Message Overhead (Context Propagation)
Each P2P message carries trace context with the following overhead:
| Field | Size | Description |
| ------------- | ------------- | ----------------------------------------- |
| `trace_id` | 16 bytes | Unique identifier for the entire trace |
| `span_id` | 8 bytes | Current span (becomes parent on receiver) |
| `trace_flags` | 1 byte | Sampling decision flags |
| `trace_state` | 0-4 bytes | Optional vendor-specific data |
| **Total** | **~29 bytes** | **Added per traced P2P message** |
```mermaid
flowchart LR
subgraph msg["P2P Message with Trace Context"]
A["Original Message<br/>(variable size)"] --> B["+ TraceContext<br/>(~29 bytes)"]
end
subgraph breakdown["Context Breakdown"]
C["trace_id<br/>16 bytes"]
D["span_id<br/>8 bytes"]
E["flags<br/>1 byte"]
F["state<br/>0-4 bytes"]
end
B --> breakdown
style A fill:#424242,stroke:#212121,color:#fff
style B fill:#2e7d32,stroke:#1b5e20,color:#fff
style C fill:#1565c0,stroke:#0d47a1,color:#fff
style D fill:#1565c0,stroke:#0d47a1,color:#fff
style E fill:#e65100,stroke:#bf360c,color:#fff
style F fill:#4a148c,stroke:#2e0d57,color:#fff
```
**Reading the diagram:**
- **Original Message (gray, left)**: The existing P2P message payload of variable size this is unchanged; trace context is appended, never modifying the original data.
- **+ TraceContext (green, right of message)**: The additional 29-byte context block attached to each traced message; the arrow from the original message shows it is a pure addition.
- **Context Breakdown (right subgraph)**: The four fields `trace_id` (16 bytes), `span_id` (8 bytes), `flags` (1 byte), and `state` (0-4 bytes) show exactly what is added and their individual sizes.
- **Color coding**: Blue fields (`trace_id`, `span_id`) are the core identifiers required for trace correlation; orange (`flags`) controls sampling decisions; purple (`state`) is optional vendor data typically omitted.
> **Note**: 29 bytes represents ~1-6% overhead depending on message size (500B simple TX to 5KB proposal), which is acceptable for the observability benefits provided.
### Mitigation Strategies
```mermaid
flowchart LR
A["Head Sampling<br/>10% default"] --> B["Tail Sampling<br/>Keep errors/slow"] --> C["Batch Export<br/>Reduce I/O"] --> D["Conditional Compile<br/>XRPL_ENABLE_TELEMETRY"]
style A fill:#1565c0,stroke:#0d47a1,color:#fff
style B fill:#2e7d32,stroke:#1b5e20,color:#fff
style C fill:#e65100,stroke:#bf360c,color:#fff
style D fill:#4a148c,stroke:#2e0d57,color:#fff
```
> For a detailed explanation of head vs. tail sampling, see Slide 9.
### Kill Switches (Rollback Options)
1. **Config Disable**: Set `enabled=0` in config instant disable, no restart needed for sampling
2. **Rebuild**: Compile with `XRPL_ENABLE_TELEMETRY=OFF` zero overhead (no-op)
3. **Full Revert**: Clean separation allows easy commit reversion
---
## Slide 9: Sampling Strategies — Head vs. Tail
> Sampling controls **which traces are recorded and exported**. Without sampling, every operation generates a trace — at 500+ spans/sec, this overwhelms storage and network. Sampling lets you keep the signal, discard the noise.
### Head Sampling (Decision at Start)
The sampling decision is made **when a trace begins**, before any work is done. A random number is generated; if it falls within the configured ratio, the entire trace is recorded. Otherwise, the trace is silently dropped.
```mermaid
flowchart LR
A["New Request<br/>Arrives"] --> B{"Random < 10%?"}
B -->|"Yes (1 in 10)"| C["Record Entire Trace<br/>(all spans)"]
B -->|"No (9 in 10)"| D["Drop Entire Trace<br/>(zero overhead)"]
style C fill:#2e7d32,stroke:#1b5e20,color:#fff
style D fill:#c62828,stroke:#8c2809,color:#fff
style B fill:#1565c0,stroke:#0d47a1,color:#fff
```
| Aspect | Details |
| ----------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| **Where it runs** | Inside xrpld (SDK-level). Configured via `sampling_ratio` in `xrpld.cfg`. |
| **When the decision happens** | At trace creation time before the first span is even populated. |
| **How it works** | `sampling_ratio=0.1` means each trace has a 10% probability of being recorded. Dropped traces incur near-zero overhead (no spans created, no attributes set, no export). |
| **Propagation** | Once a trace is sampled, the `trace_flags` field (1 byte in the context header) tells downstream nodes to also sample it. Unsampled traces propagate `trace_flags=0`, so downstream nodes skip them too. |
| **Pros** | Lowest overhead. Simple to configure. Predictable resource usage. |
| **Cons** | **Blind** it doesn't know if the trace will be interesting. A rare error or slow consensus round has only a 10% chance of being captured. |
| **Best for** | High-volume, steady-state traffic where most traces look similar (e.g., routine RPC requests). |
**xrpld configuration**:
```ini
[telemetry]
# Record 10% of traces (recommended for production)
sampling_ratio=0.1
```
### Tail Sampling (Decision at End)
The sampling decision is made **after the trace completes**, based on its actual content was it slow? Did it error? Was it a consensus round? This requires buffering complete traces before deciding.
```mermaid
flowchart TB
A["All Traces<br/>Buffered (100%)"] --> B["OTel Collector<br/>Evaluates Rules"]
B --> C{"Error?"}
C -->|Yes| K["KEEP"]
C -->|No| D{"Slow?<br/>(>5s consensus,<br/>>1s RPC)"}
D -->|Yes| K
D -->|No| E{"Random < 10%?"}
E -->|Yes| K
E -->|No| F["DROP"]
style K fill:#2e7d32,stroke:#1b5e20,color:#fff
style F fill:#c62828,stroke:#8c2809,color:#fff
style B fill:#1565c0,stroke:#0d47a1,color:#fff
style C fill:#e65100,stroke:#bf360c,color:#fff
style D fill:#e65100,stroke:#bf360c,color:#fff
style E fill:#4a148c,stroke:#2e0d57,color:#fff
```
| Aspect | Details |
| ----------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| **Where it runs** | In the **OTel Collector** (external process), not inside xrpld. xrpld exports 100% of traces; the Collector decides what to keep. |
| **When the decision happens** | After the Collector has received all spans for a trace (waits `decision_wait=10s` for stragglers). |
| **How it works** | Policy rules evaluate the completed trace: keep all errors, keep slow operations above a threshold, keep all consensus rounds, then probabilistically sample the rest at 10%. |
| **Pros** | **Never misses important traces**. Errors, slow requests, and consensus anomalies are always captured regardless of probability. |
| **Cons** | Higher resource usage xrpld must export 100% of spans to the Collector, which buffers them in memory before deciding. The Collector needs more RAM (configured via `num_traces` and `decision_wait`). |
| **Best for** | Production troubleshooting where you can't afford to miss errors or anomalies. |
**Collector configuration** (tail sampling rules for xrpld):
```yaml
processors:
tail_sampling:
decision_wait: 10s # Wait for all spans in a trace
num_traces: 100000 # Buffer up to 100K concurrent traces
policies:
- name: errors # Always keep error traces
type: status_code
status_code: { status_codes: [ERROR] }
- name: slow-consensus # Keep consensus rounds >5s
type: latency
latency: { threshold_ms: 5000 }
- name: slow-rpc # Keep slow RPC requests >1s
type: latency
latency: { threshold_ms: 1000 }
- name: probabilistic # Sample 10% of everything else
type: probabilistic
probabilistic: { sampling_percentage: 10 }
```
### Head vs. Tail — Side-by-Side
| | Head Sampling | Tail Sampling |
| ----------------------------- | ---------------------------------------- | ------------------------------------------------ |
| **Decision point** | Trace start (inside xrpld) | Trace end (in OTel Collector) |
| **Knows trace content?** | No (random coin flip) | Yes (evaluates completed trace) |
| **Overhead on xrpld** | Lowest (dropped traces = no-op) | Higher (must export 100% to Collector) |
| **Collector resource usage** | Low (receives only sampled traces) | Higher (buffers all traces before deciding) |
| **Captures all errors?** | No (only if trace was randomly selected) | **Yes** (error policy catches them) |
| **Captures slow operations?** | No (random) | **Yes** (latency policy catches them) |
| **Configuration** | `xrpld.cfg`: `sampling_ratio=0.1` | `otel-collector.yaml`: `tail_sampling` processor |
| **Best for** | High-throughput steady-state | Troubleshooting & anomaly detection |
### Recommended Strategy for xrpld
Use **both** in a layered approach:
```mermaid
flowchart LR
subgraph xrpld["xrpld (Head Sampling)"]
HS["sampling_ratio=1.0<br/>(export everything)"]
end
subgraph collector["OTel Collector (Tail Sampling)"]
TS["Keep: errors + slow + 10% random<br/>Drop: routine traces"]
end
subgraph storage["Backend Storage"]
ST["Only interesting traces<br/>stored long-term"]
end
xrpld -->|"100% of spans"| collector -->|"~15-20% kept"| storage
style xrpld fill:#424242,stroke:#212121,color:#fff
style collector fill:#1565c0,stroke:#0d47a1,color:#fff
style storage fill:#2e7d32,stroke:#1b5e20,color:#fff
```
> **Why this works**: xrpld exports everything (no blind drops), the Collector applies intelligent filtering (keep errors/slow/anomalies, sample the rest), and only ~15-20% of traces reach storage. If Collector resource usage becomes a concern, add head sampling at `sampling_ratio=0.5` to halve the export volume while still giving the Collector enough data for good tail-sampling decisions.
---
## Slide 10: Data Collection & Privacy
### What Data is Collected
| Category | Attributes Collected | Purpose |
| --------------- | ------------------------------------------------------------------------------------ | --------------------------- |
| **Transaction** | `tx.hash`, `tx.type`, `tx.result`, `tx.fee`, `ledger_index` | Trace transaction lifecycle |
| **Consensus** | `round`, `phase`, `mode`, `proposers` (count of proposing validators), `duration_ms` | Analyze consensus timing |
| **RPC** | `command`, `version`, `status`, `duration_ms` | Monitor RPC performance |
| **Peer** | `peer.id`(public key), `latency_ms`, `message.type`, `message.size` | Network topology analysis |
| **Ledger** | `ledger.hash`, `ledger.index`, `close_time`, `tx_count` | Ledger progression tracking |
| **Job** | `job.type`, `queue_ms`, `worker` | JobQueue performance |
### What is NOT Collected (Privacy Guarantees)
```mermaid
flowchart LR
subgraph notCollected["❌ NOT Collected"]
direction LR
A["Private Keys"] ~~~ B["Account Balances"] ~~~ C["Transaction Amounts"]
end
subgraph alsoNot["❌ Also Excluded"]
direction LR
D["IP Addresses<br/>(configurable)"] ~~~ E["Personal Data"] ~~~ F["Raw TX Payloads"]
end
style A fill:#c62828,stroke:#8c2809,color:#fff
style B fill:#c62828,stroke:#8c2809,color:#fff
style C fill:#c62828,stroke:#8c2809,color:#fff
style D fill:#c62828,stroke:#8c2809,color:#fff
style E fill:#c62828,stroke:#8c2809,color:#fff
style F fill:#c62828,stroke:#8c2809,color:#fff
```
**Reading the diagram:**
- **NOT Collected (top row, red)**: Private Keys, Account Balances, and Transaction Amounts are explicitly excluded these are financial/security-sensitive fields that telemetry never touches.
- **Also Excluded (bottom row, red)**: IP Addresses (configurable per deployment), Personal Data, and Raw TX Payloads are also excluded these protect operator and user privacy.
- **All-red styling**: Every box is styled in red to visually reinforce that these are hard exclusions, not optional the telemetry system has no code path to collect any of these fields.
- **Two-row layout**: The split between "NOT Collected" and "Also Excluded" distinguishes between financial data (top) and operational/personal data (bottom), making the privacy boundaries clear to auditors.
### Privacy Protection Mechanisms
| Mechanism | Description |
| -------------------------- | ------------------------------------------------------------- |
| **Account Hashing** | `xrpl.tx.account` is hashed at collector level before storage |
| **Configurable Redaction** | Sensitive fields can be excluded via config |
| **Sampling** | Only 10% of traces recorded by default (reduces exposure) |
| **Local Control** | Node operators control what gets exported |
| **No Raw Payloads** | Transaction content is never recorded, only metadata |
> **Key Principle**: Telemetry collects **operational metadata** (timing, counts, hashes) — never **sensitive content** (keys, balances, amounts).
---
_End of Presentation_

View File

@@ -1,8 +1,8 @@
#!/bin/bash
if [[ $# -ne 1 || "$1" == "--help" || "$1" == "-h" ]]; then
name=$( basename $0 )
cat <<- USAGE
name=$(basename $0)
cat <<-USAGE
Usage: $name <username>
Where <username> is the Github username of the upstream repo. e.g. XRPLF
@@ -14,7 +14,7 @@ fi
shift
user="$1"
# Get the origin URL. Expect it be an SSH-style URL
origin=$( git remote get-url origin )
origin=$(git remote get-url origin)
if [[ "${origin}" == "" ]]; then
echo Invalid origin remote >&2
exit 1
@@ -22,11 +22,11 @@ fi
# echo "Origin: ${origin}"
# Parse the origin
ifs_orig="${IFS}"
IFS=':' read remote originpath <<< "${origin}"
IFS=':' read remote originpath <<<"${origin}"
# echo "Remote: ${remote}, Originpath: ${originpath}"
IFS='@' read sshuser server <<< "${remote}"
IFS='@' read sshuser server <<<"${remote}"
# echo "SSHUser: ${sshuser}, Server: ${server}"
IFS='/' read originuser repo <<< "${originpath}"
IFS='/' read originuser repo <<<"${originpath}"
# echo "Originuser: ${originuser}, Repo: ${repo}"
if [[ "${sshuser}" == "" || "${server}" == "" || "${originuser}" == "" || "${repo}" == "" ]]; then
echo "Can't parse origin URL: ${origin}" >&2
@@ -35,9 +35,9 @@ fi
upstream="https://${server}/${user}/${repo}"
upstreampush="${remote}:${user}/${repo}"
upstreamgroup="upstream upstream-push"
current=$( git remote get-url upstream 2>/dev/null )
currentpush=$( git remote get-url upstream-push 2>/dev/null )
currentgroup=$( git config remotes.upstreams )
current=$(git remote get-url upstream 2>/dev/null)
currentpush=$(git remote get-url upstream-push 2>/dev/null)
currentgroup=$(git config remotes.upstreams)
if [[ "${current}" == "${upstream}" ]]; then
echo "Upstream already set up correctly. Skip"
elif [[ -n "${current}" && "${current}" != "${upstream}" && "${current}" != "${upstreampush}" ]]; then
@@ -45,9 +45,9 @@ elif [[ -n "${current}" && "${current}" != "${upstream}" && "${current}" != "${u
else
if [[ "${current}" == "${upstreampush}" ]]; then
echo "Upstream set to dangerous push URL. Update."
_run git remote rename upstream upstream-push || \
_run git remote remove upstream
currentpush=$( git remote get-url upstream-push 2>/dev/null )
_run git remote rename upstream upstream-push ||
_run git remote remove upstream
currentpush=$(git remote get-url upstream-push 2>/dev/null)
fi
_run git remote add upstream "${upstream}"
fi

View File

@@ -1,8 +1,8 @@
#!/bin/bash
if [[ $# -lt 3 || "$1" == "--help" || "$1" = "-h" ]]; then
name=$( basename $0 )
cat <<- USAGE
name=$(basename $0)
cat <<-USAGE
Usage: $name workbranch base/branch user/branch [user/branch [...]]
* workbranch will be created locally from base/branch
@@ -16,7 +16,7 @@ fi
work="$1"
shift
branches=( $( echo "${@}" | sed "s/:/\//" ) )
branches=($(echo "${@}" | sed "s/:/\//"))
base="${branches[0]}"
unset branches[0]
@@ -24,10 +24,10 @@ set -e
users=()
for b in "${branches[@]}"; do
users+=( $( echo $b | cut -d/ -f1 ) )
users+=($(echo $b | cut -d/ -f1))
done
users=( $( printf '%s\n' "${users[@]}" | sort -u ) )
users=($(printf '%s\n' "${users[@]}" | sort -u))
git fetch --multiple upstreams "${users[@]}"
git checkout -B "$work" --no-track "$base"
@@ -40,7 +40,7 @@ done
# Make sure the commits look right
git log --show-signature "$base..HEAD"
parts=( $( echo $base | sed "s/\// /" ) )
parts=($(echo $base | sed "s/\// /"))
repo="${parts[0]}"
b="${parts[1]}"
push=$repo
@@ -50,7 +50,7 @@ fi
if [[ "$repo" == "upstream" ]]; then
repo="upstreams"
fi
cat << PUSH
cat <<PUSH
-------------------------------------------------------------------
This script will not push. Verify everything is correct, then push

View File

@@ -1,8 +1,8 @@
#!/bin/bash
if [[ $# -ne 3 || "$1" == "--help" || "$1" = "-h" ]]; then
name=$( basename $0 )
cat <<- USAGE
name=$(basename $0)
cat <<-USAGE
Usage: $name workbranch base/branch version
* workbranch will be created locally from base/branch. If it exists,
@@ -16,7 +16,7 @@ fi
work="$1"
shift
base=$( echo "$1" | sed "s/:/\//" )
base=$(echo "$1" | sed "s/:/\//")
shift
version=$1
@@ -28,16 +28,16 @@ git fetch upstreams
git checkout -B "${work}" --no-track "${base}"
push=$( git rev-parse --abbrev-ref --symbolic-full-name '@{push}' \
2>/dev/null ) || true
push=$(git rev-parse --abbrev-ref --symbolic-full-name '@{push}' \
2>/dev/null) || true
if [[ "${push}" != "" ]]; then
echo "Warning: ${push} may already exist."
fi
build=$( find -name BuildInfo.cpp )
sed 's/\(^.*versionString =\).*$/\1 "'${version}'"/' ${build} > version.cpp && \
diff "${build}" version.cpp && exit 1 || \
mv -vi version.cpp ${build}
build=$(find -name BuildInfo.cpp)
sed 's/\(^.*versionString =\).*$/\1 "'${version}'"/' ${build} >version.cpp &&
diff "${build}" version.cpp && exit 1 ||
mv -vi version.cpp ${build}
git diff
@@ -47,7 +47,7 @@ git commit -S -m "Set version to ${version}"
git log --oneline --first-parent ${base}^..
cat << PUSH
cat <<PUSH
-------------------------------------------------------------------
This script will not push. Verify everything is correct, then push

View File

@@ -168,7 +168,13 @@ def main():
if not os.environ.get("TIDY"):
return 0
repo_root = Path(__file__).parent.parent
repo_root = Path(
subprocess.check_output(
["git", "rev-parse", "--show-toplevel"],
cwd=Path(__file__).parent,
text=True,
).strip()
)
files = staged_files(repo_root)
if not files:
return 0

View File

@@ -953,6 +953,21 @@
#
# Optional keys for NuDB and RocksDB:
#
# cache_size Size of cache for database records. Default is 16384.
# Setting this value to 0 will use the default value.
#
# cache_age Length of time in minutes to keep database records
# cached. Default is 5 minutes. Setting this value to
# 0 will use the default value.
#
# Note: if cache_size or cache_age is not specified,
# default values will be used for the unspecified
# parameter.
#
# Note: the cache will not be created if online_delete
# is specified, because the rotating NodeStore does
# not use this cache).
#
# fast_load Boolean. If set, load the last persisted ledger
# from disk upon process start before syncing to
# the network. This is likely to improve performance

View File

@@ -0,0 +1,13 @@
# Python dependencies for XRP Ledger code generation scripts
#
# These packages are required to run the code generation scripts that
# parse macro files and generate C++ wrapper classes.
# C preprocessor for Python - used to preprocess macro files
pcpp>=1.30
# Parser combinator library - used to parse the macro DSL
pyparsing>=3.0.0
# Template engine - used to generate C++ code from templates
Mako>=1.2.2

View File

@@ -1,13 +1,105 @@
# Python dependencies for XRP Ledger code generation scripts
#
# These packages are required to run the code generation scripts that
# parse macro files and generate C++ wrapper classes.
# C preprocessor for Python - used to preprocess macro files
pcpp>=1.30
# Parser combinator library - used to parse the macro DSL
pyparsing>=3.0.0
# Template engine - used to generate C++ code from templates
Mako>=1.2.2
# This file was autogenerated by uv via the following command:
# uv pip compile requirements.in --generate-hashes --output-file requirements.txt
mako==1.3.12 \
--hash=sha256:8f61569480282dbf557145ce441e4ba888be453c30989f879f0d652e39f53ea9 \
--hash=sha256:9f778e93289bd410bb35daadeb4fc66d95a746f0b75777b942088b7fd7af550a
# via -r requirements.in
markupsafe==3.0.3 \
--hash=sha256:0303439a41979d9e74d18ff5e2dd8c43ed6c6001fd40e5bf2e43f7bd9bbc523f \
--hash=sha256:068f375c472b3e7acbe2d5318dea141359e6900156b5b2ba06a30b169086b91a \
--hash=sha256:0bf2a864d67e76e5c9a34dc26ec616a66b9888e25e7b9460e1c76d3293bd9dbf \
--hash=sha256:0db14f5dafddbb6d9208827849fad01f1a2609380add406671a26386cdf15a19 \
--hash=sha256:0eb9ff8191e8498cca014656ae6b8d61f39da5f95b488805da4bb029cccbfbaf \
--hash=sha256:0f4b68347f8c5eab4a13419215bdfd7f8c9b19f2b25520968adfad23eb0ce60c \
--hash=sha256:1085e7fbddd3be5f89cc898938f42c0b3c711fdcb37d75221de2666af647c175 \
--hash=sha256:116bb52f642a37c115f517494ea5feb03889e04df47eeff5b130b1808ce7c219 \
--hash=sha256:12c63dfb4a98206f045aa9563db46507995f7ef6d83b2f68eda65c307c6829eb \
--hash=sha256:133a43e73a802c5562be9bbcd03d090aa5a1fe899db609c29e8c8d815c5f6de6 \
--hash=sha256:1353ef0c1b138e1907ae78e2f6c63ff67501122006b0f9abad68fda5f4ffc6ab \
--hash=sha256:15d939a21d546304880945ca1ecb8a039db6b4dc49b2c5a400387cdae6a62e26 \
--hash=sha256:177b5253b2834fe3678cb4a5f0059808258584c559193998be2601324fdeafb1 \
--hash=sha256:1872df69a4de6aead3491198eaf13810b565bdbeec3ae2dc8780f14458ec73ce \
--hash=sha256:1b4b79e8ebf6b55351f0d91fe80f893b4743f104bff22e90697db1590e47a218 \
--hash=sha256:1b52b4fb9df4eb9ae465f8d0c228a00624de2334f216f178a995ccdcf82c4634 \
--hash=sha256:1ba88449deb3de88bd40044603fafffb7bc2b055d626a330323a9ed736661695 \
--hash=sha256:1cc7ea17a6824959616c525620e387f6dd30fec8cb44f649e31712db02123dad \
--hash=sha256:218551f6df4868a8d527e3062d0fb968682fe92054e89978594c28e642c43a73 \
--hash=sha256:26a5784ded40c9e318cfc2bdb30fe164bdb8665ded9cd64d500a34fb42067b1c \
--hash=sha256:2713baf880df847f2bece4230d4d094280f4e67b1e813eec43b4c0e144a34ffe \
--hash=sha256:2a15a08b17dd94c53a1da0438822d70ebcd13f8c3a95abe3a9ef9f11a94830aa \
--hash=sha256:2f981d352f04553a7171b8e44369f2af4055f888dfb147d55e42d29e29e74559 \
--hash=sha256:32001d6a8fc98c8cb5c947787c5d08b0a50663d139f1305bac5885d98d9b40fa \
--hash=sha256:3524b778fe5cfb3452a09d31e7b5adefeea8c5be1d43c4f810ba09f2ceb29d37 \
--hash=sha256:3537e01efc9d4dccdf77221fb1cb3b8e1a38d5428920e0657ce299b20324d758 \
--hash=sha256:35add3b638a5d900e807944a078b51922212fb3dedb01633a8defc4b01a3c85f \
--hash=sha256:38664109c14ffc9e7437e86b4dceb442b0096dfe3541d7864d9cbe1da4cf36c8 \
--hash=sha256:3a7e8ae81ae39e62a41ec302f972ba6ae23a5c5396c8e60113e9066ef893da0d \
--hash=sha256:3b562dd9e9ea93f13d53989d23a7e775fdfd1066c33494ff43f5418bc8c58a5c \
--hash=sha256:457a69a9577064c05a97c41f4e65148652db078a3a509039e64d3467b9e7ef97 \
--hash=sha256:4bd4cd07944443f5a265608cc6aab442e4f74dff8088b0dfc8238647b8f6ae9a \
--hash=sha256:4e885a3d1efa2eadc93c894a21770e4bc67899e3543680313b09f139e149ab19 \
--hash=sha256:4faffd047e07c38848ce017e8725090413cd80cbc23d86e55c587bf979e579c9 \
--hash=sha256:509fa21c6deb7a7a273d629cf5ec029bc209d1a51178615ddf718f5918992ab9 \
--hash=sha256:5678211cb9333a6468fb8d8be0305520aa073f50d17f089b5b4b477ea6e67fdc \
--hash=sha256:591ae9f2a647529ca990bc681daebdd52c8791ff06c2bfa05b65163e28102ef2 \
--hash=sha256:5a7d5dc5140555cf21a6fefbdbf8723f06fcd2f63ef108f2854de715e4422cb4 \
--hash=sha256:69c0b73548bc525c8cb9a251cddf1931d1db4d2258e9599c28c07ef3580ef354 \
--hash=sha256:6b5420a1d9450023228968e7e6a9ce57f65d148ab56d2313fcd589eee96a7a50 \
--hash=sha256:722695808f4b6457b320fdc131280796bdceb04ab50fe1795cd540799ebe1698 \
--hash=sha256:729586769a26dbceff69f7a7dbbf59ab6572b99d94576a5592625d5b411576b9 \
--hash=sha256:77f0643abe7495da77fb436f50f8dab76dbc6e5fd25d39589a0f1fe6548bfa2b \
--hash=sha256:795e7751525cae078558e679d646ae45574b47ed6e7771863fcc079a6171a0fc \
--hash=sha256:7be7b61bb172e1ed687f1754f8e7484f1c8019780f6f6b0786e76bb01c2ae115 \
--hash=sha256:7c3fb7d25180895632e5d3148dbdc29ea38ccb7fd210aa27acbd1201a1902c6e \
--hash=sha256:7e68f88e5b8799aa49c85cd116c932a1ac15caaa3f5db09087854d218359e485 \
--hash=sha256:83891d0e9fb81a825d9a6d61e3f07550ca70a076484292a70fde82c4b807286f \
--hash=sha256:8485f406a96febb5140bfeca44a73e3ce5116b2501ac54fe953e488fb1d03b12 \
--hash=sha256:8709b08f4a89aa7586de0aadc8da56180242ee0ada3999749b183aa23df95025 \
--hash=sha256:8f71bc33915be5186016f675cd83a1e08523649b0e33efdb898db577ef5bb009 \
--hash=sha256:915c04ba3851909ce68ccc2b8e2cd691618c4dc4c4232fb7982bca3f41fd8c3d \
--hash=sha256:949b8d66bc381ee8b007cd945914c721d9aba8e27f71959d750a46f7c282b20b \
--hash=sha256:94c6f0bb423f739146aec64595853541634bde58b2135f27f61c1ffd1cd4d16a \
--hash=sha256:9a1abfdc021a164803f4d485104931fb8f8c1efd55bc6b748d2f5774e78b62c5 \
--hash=sha256:9b79b7a16f7fedff2495d684f2b59b0457c3b493778c9eed31111be64d58279f \
--hash=sha256:a320721ab5a1aba0a233739394eb907f8c8da5c98c9181d1161e77a0c8e36f2d \
--hash=sha256:a4afe79fb3de0b7097d81da19090f4df4f8d3a2b3adaa8764138aac2e44f3af1 \
--hash=sha256:ad2cf8aa28b8c020ab2fc8287b0f823d0a7d8630784c31e9ee5edea20f406287 \
--hash=sha256:b8512a91625c9b3da6f127803b166b629725e68af71f8184ae7e7d54686a56d6 \
--hash=sha256:bc51efed119bc9cfdf792cdeaa4d67e8f6fcccab66ed4bfdd6bde3e59bfcbb2f \
--hash=sha256:bdc919ead48f234740ad807933cdf545180bfbe9342c2bb451556db2ed958581 \
--hash=sha256:bdd37121970bfd8be76c5fb069c7751683bdf373db1ed6c010162b2a130248ed \
--hash=sha256:be8813b57049a7dc738189df53d69395eba14fb99345e0a5994914a3864c8a4b \
--hash=sha256:c0c0b3ade1c0b13b936d7970b1d37a57acde9199dc2aecc4c336773e1d86049c \
--hash=sha256:c47a551199eb8eb2121d4f0f15ae0f923d31350ab9280078d1e5f12b249e0026 \
--hash=sha256:c4ffb7ebf07cfe8931028e3e4c85f0357459a3f9f9490886198848f4fa002ec8 \
--hash=sha256:ccfcd093f13f0f0b7fdd0f198b90053bf7b2f02a3927a30e63f3ccc9df56b676 \
--hash=sha256:d2ee202e79d8ed691ceebae8e0486bd9a2cd4794cec4824e1c99b6f5009502f6 \
--hash=sha256:d53197da72cc091b024dd97249dfc7794d6a56530370992a5e1a08983ad9230e \
--hash=sha256:d6dd0be5b5b189d31db7cda48b91d7e0a9795f31430b7f271219ab30f1d3ac9d \
--hash=sha256:d88b440e37a16e651bda4c7c2b930eb586fd15ca7406cb39e211fcff3bf3017d \
--hash=sha256:de8a88e63464af587c950061a5e6a67d3632e36df62b986892331d4620a35c01 \
--hash=sha256:df2449253ef108a379b8b5d6b43f4b1a8e81a061d6537becd5582fba5f9196d7 \
--hash=sha256:e1c1493fb6e50ab01d20a22826e57520f1284df32f2d8601fdd90b6304601419 \
--hash=sha256:e1cf1972137e83c5d4c136c43ced9ac51d0e124706ee1c8aa8532c1287fa8795 \
--hash=sha256:e2103a929dfa2fcaf9bb4e7c091983a49c9ac3b19c9061b6d5427dd7d14d81a1 \
--hash=sha256:e56b7d45a839a697b5eb268c82a71bd8c7f6c94d6fd50c3d577fa39a9f1409f5 \
--hash=sha256:e8afc3f2ccfa24215f8cb28dcf43f0113ac3c37c2f0f0806d8c70e4228c5cf4d \
--hash=sha256:e8fc20152abba6b83724d7ff268c249fa196d8259ff481f3b1476383f8f24e42 \
--hash=sha256:eaa9599de571d72e2daf60164784109f19978b327a3910d3e9de8c97b5b70cfe \
--hash=sha256:ec15a59cf5af7be74194f7ab02d0f59a62bdcf1a537677ce67a2537c9b87fcda \
--hash=sha256:f190daf01f13c72eac4efd5c430a8de82489d9cff23c364c3ea822545032993e \
--hash=sha256:f34c41761022dd093b4b6896d4810782ffbabe30f2d443ff5f083e0cbbb8c737 \
--hash=sha256:f3e98bb3798ead92273dc0e5fd0f31ade220f59a266ffd8a4f6065e0a3ce0523 \
--hash=sha256:f42d0984e947b8adf7dd6dde396e720934d12c506ce84eea8476409563607591 \
--hash=sha256:f71a396b3bf33ecaa1626c255855702aca4d3d9fea5e051b41ac59a9c1c41edc \
--hash=sha256:f9e130248f4462aaa8e2552d547f36ddadbeaa573879158d721bbd33dfe4743a \
--hash=sha256:fed51ac40f757d41b7c48425901843666a6677e3e8eb0abcff09e4ba6e664f50
# via mako
pcpp==1.30 \
--hash=sha256:05fe08292b6da57f385001c891a87f40d6aa7f46787b03e8ba326d20a3297c6e \
--hash=sha256:5af9fbce55f136d7931ae915fae03c34030a3b36c496e72d9636cedc8e2543a1
# via -r requirements.in
pyparsing==3.3.2 \
--hash=sha256:850ba148bd908d7e2411587e247a1e4f0327839c40e2e5e6d05a007ecc69911d \
--hash=sha256:c777f4d763f140633dcb6d8a3eda953bf7a214dc4eff598413c070bcdc117cbc
# via -r requirements.in

View File

@@ -65,6 +65,7 @@ words:
- Btrfs
- Buildx
- canonicality
- CGNAT
- changespq
- checkme
- choco
@@ -93,6 +94,7 @@ words:
- daria
- dcmake
- dearmor
- dedented
- deleteme
- demultiplexer
- deserializaton
@@ -115,6 +117,8 @@ words:
- fmtdur
- fsanitize
- funclets
- gantt
- Gantt
- gcov
- gcovr
- ghead
@@ -160,12 +164,11 @@ words:
- mathbunnyru
- mcmodel
- MEMORYSTATUSEX
- MPTAMM
- MPTDEX
- Merkle
- Metafuncton
- misprediction
- missingok
- MPTAMM
- mptbalance
- MPTDEX
- mptflags
@@ -199,12 +202,16 @@ words:
- nonxrp
- noreplace
- noripple
- nostd
- nostdinc
- notifempty
- nudb
- nullptr
- nunl
- Nyffenegger
- onlatest
- ostr
- otelc
- pargs
- partitioner
- paychan
@@ -254,6 +261,7 @@ words:
- sfields
- shamap
- shamapitem
- shfmt
- shlibs
- sidechain
- SIGGOOD
@@ -281,6 +289,7 @@ words:
- takerpays
- ters
- TMEndpointv2
- traceql
- trixie
- tx
- txid
@@ -288,6 +297,7 @@ words:
- txjson
- txn
- txns
- txqueue
- txs
- ubsan
- UBSAN
@@ -298,6 +308,7 @@ words:
- unauthorizing
- unergonomic
- unfetched
- unfindable
- unflatten
- unfund
- unimpair
@@ -334,4 +345,5 @@ words:
- xrplf
- xxhash
- xxhasher
- CGNAT
- xychart
- zpages

48
docker/check-sanitizers.sh Executable file
View File

@@ -0,0 +1,48 @@
#!/bin/bash
# Sanity-check that the sanitizer runtimes shipped with g++/clang++ work
# end-to-end against the system loader: compile each example with both
# compilers, run it, and confirm the expected diagnostic is emitted.
set -eo pipefail
cpp_files_dir="${1:?usage: $0 <cpp_files_dir>}"
case "$(uname -m)" in
x86_64) loader=/lib64/ld-linux-x86-64.so.2 ;;
aarch64) loader=/lib/ld-linux-aarch64.so.1 ;;
*)
echo "Unsupported arch: $(uname -m)" >&2
exit 1
;;
esac
declare -A sanitize=(
[asan]="-fsanitize=address"
[tsan]="-fsanitize=thread"
[ubsan]="-fsanitize=undefined"
)
declare -A expect=(
[asan]="heap-use-after-free"
[tsan]="data race"
[ubsan]="signed integer overflow"
)
for compiler in g++ clang++; do
for name in asan tsan ubsan; do
bin="/tmp/${name}-${compiler}"
echo "=== Build ${name} with ${compiler} ==="
"$compiler" -std=c++20 -O1 -g ${sanitize[$name]} \
-Wl,--dynamic-linker=$loader \
"${cpp_files_dir}/${name}.cpp" -o "$bin"
echo "=== Run ${name}-${compiler} ==="
output=$("$bin" 2>&1) || true
echo "$output"
echo "$output" | grep -q "${expect[$name]}" ||
{
echo "expected '${expect[$name]}' from $bin"
exit 1
}
rm -f "$bin"
done
done

28
docker/cpp_files/asan.cpp Normal file
View File

@@ -0,0 +1,28 @@
#include <atomic>
#include <cstddef>
#include <iostream>
#if defined(__clang__) || defined(__GNUC__)
__attribute__((noinline))
#elif defined(_MSC_VER)
__declspec(noinline)
#endif
int
read_after_free(volatile int* array, std::size_t index)
{
std::atomic_signal_fence(std::memory_order_seq_cst);
int value = array[index];
std::atomic_signal_fence(std::memory_order_seq_cst);
return value;
}
int
main()
{
int* array = new int[5]{10, 20, 30, 40, 50};
delete[] array;
std::cout << "Value at index 2: " << read_after_free(array, 2) << std::endl;
return 0;
}

26
docker/cpp_files/tsan.cpp Normal file
View File

@@ -0,0 +1,26 @@
#include <iostream>
#include <thread>
static int kCounter = 0;
void
increment()
{
for (int i = 0; i < 100'000; ++i)
{
++kCounter;
}
}
int
main()
{
std::thread t1(increment);
std::thread t2(increment);
t1.join();
t2.join();
std::cout << "Final counter value: " << kCounter << std::endl;
return 0;
}

View File

@@ -0,0 +1,13 @@
#include <iostream>
#include <limits>
int
main()
{
int maxInt = std::numeric_limits<int>::max();
int volatile one = 1;
std::cout << "Current max: " << maxInt << std::endl;
int overflowed = maxInt + one;
std::cout << "Overflowed result: " << overflowed << std::endl;
return 0;
}

View File

@@ -45,8 +45,30 @@ COPY --from=builder /tmp/build/result /nix/ci-env
ENV PATH="/nix/ci-env/bin:$PATH"
# Externally-built dynamically-linked ELF binaries hard-code the loader path
# (e.g. /lib64/ld-linux-x86-64.so.2) in their PT_INTERP header. Copy the
# loader from the Nix store to that path when the base image doesn't already
# provide one (i.e. on nixos/nix).
RUN <<EOF
case "$(uname -m)" in
x86_64) target=/lib64/ld-linux-x86-64.so.2 ;;
aarch64) target=/lib/ld-linux-aarch64.so.1 ;;
*) echo "Unsupported arch: $(uname -m)" >&2; exit 1 ;;
esac
if [ ! -e "$target" ]; then
# Use the loader from the same glibc that gcc links libc against, so
# ld-linux and libc/libpthread share GLIBC_PRIVATE symbols at runtime.
src="$(dirname "$(gcc -print-file-name=libc.so.6)")/$(basename "$target")"
[ -e "$src" ] || { echo "ld-linux not found at $src" >&2; exit 1; }
mkdir -p "$(dirname "$target")"
cp "$src" "$target"
fi
EOF
RUN <<EOF
ccache --version
clang --version
clang++ --version
clang-format --version
cmake --version
conan --version
@@ -64,3 +86,10 @@ python3 --version
run-clang-tidy --help
vim --version
EOF
# Sanity-check that the sanitizer runtimes shipped with g++/clang++ work
# end-to-end against the system loader.
COPY docker/cpp_files/ /tmp/cpp_files/
COPY docker/check-sanitizers.sh /tmp/check-sanitizers.sh
RUN grep -qi ubuntu /etc/os-release 2>/dev/null && /tmp/check-sanitizers.sh /tmp/cpp_files || true

4
flake.lock generated
View File

@@ -15,7 +15,7 @@
"type": "indirect"
}
},
"nixpkgs-glibc231": {
"nixpkgs-custom-glibc": {
"flake": false,
"locked": {
"lastModified": 1593520194,
@@ -35,7 +35,7 @@
"root": {
"inputs": {
"nixpkgs": "nixpkgs",
"nixpkgs-glibc231": "nixpkgs-glibc231"
"nixpkgs-custom-glibc": "nixpkgs-custom-glibc"
}
}
},

View File

@@ -6,16 +6,16 @@
# version — matches the system libc on Ubuntu 20.04 LTS. Imported
# manually (flake = false) because this revision predates nixpkgs'
# own flake.nix.
nixpkgs-glibc231 = {
nixpkgs-custom-glibc = {
url = "github:NixOS/nixpkgs/9cd98386a38891d1074fc18036b842dc4416f562";
flake = false;
};
};
outputs =
{ nixpkgs, nixpkgs-glibc231, ... }:
{ nixpkgs, nixpkgs-custom-glibc, ... }:
let
forEachSystem = import ./nix/utils.nix { inherit nixpkgs nixpkgs-glibc231; };
forEachSystem = import ./nix/utils.nix { inherit nixpkgs nixpkgs-custom-glibc; };
in
{
devShells = forEachSystem (import ./nix/devshell.nix);

View File

@@ -2,12 +2,16 @@
#include <xrpl/beast/utility/instrumentation.h>
#include <array>
#include <cstdint>
#include <functional>
#include <limits>
#include <optional>
#include <ostream>
#include <set>
#include <stdexcept>
#include <string>
#include <unordered_map>
namespace xrpl {
@@ -38,17 +42,58 @@ isPowerOfTen(T value)
return logTen(value).has_value();
}
namespace detail {
/** Builds a table of the powers of 10
*
* This function is marked consteval, so it can only be run in
* a constexpr context. This assures that it is and can only be run at
* compile time. Doing it at runtime would be pretty wasteful and
* inefficient.
*/
constexpr std::size_t kInt64Digits = 20;
consteval std::array<std::uint64_t, kInt64Digits>
buildPowersOfTen()
{
std::array<std::uint64_t, kInt64Digits> result{};
std::uint64_t power = 1;
std::size_t exponent = 0;
// end the loop early so it doesn't overflow;
for (; exponent < result.size() - 1; ++exponent, power *= 10)
{
result[exponent] = power;
if (power > std::numeric_limits<std::uint64_t>::max() / 10)
throw std::logic_error("Power of 10 table is too big");
}
result[exponent] = power;
if (power < std::numeric_limits<std::uint64_t>::max() / 10)
throw std::logic_error("Power of 10 table is not big enough for the uint64_t type");
return result;
}
} // namespace detail
constexpr std::array<std::uint64_t, detail::kInt64Digits> kPowerOfTen = detail::buildPowersOfTen();
static_assert(kPowerOfTen[0] == 1);
static_assert(kPowerOfTen[1] == 10);
static_assert(kPowerOfTen[10] == 10'000'000'000);
static_assert(
isPowerOfTen(kPowerOfTen.back()) && *logTen(kPowerOfTen.back()) == detail::kInt64Digits - 1);
/** MantissaRange defines a range for the mantissa of a normalized Number.
*
* The mantissa is in the range [min, max], where
* * min is a power of 10, and
* * max = min * 10 - 1.
*
* The mantissa_scale enum indicates whether the range is "small" or "large".
* This intentionally restricts the number of MantissaRanges that can be
* instantiated to two: one for each scale.
* The MantissaScale enum indicates properties of the range: size, and some behavioral
* options. This intentionally restricts the number of unique MantissaRanges that can
* be instantiated: one for each scale.
*
* The "small" scale is based on the behavior of STAmount for IOUs. It has a min
* The "Small" scale is based on the behavior of STAmount for IOUs. It has a min
* value of 10^15, and a max value of 10^16-1. This was sufficient for
* uses before Lending Protocol was implemented, mostly related to AMM.
*
@@ -59,46 +104,100 @@ isPowerOfTen(T value)
* STNumber field type, and for internal calculations. That necessitated the
* "large" scale.
*
* The "large" scale is intended to represent all values that can be represented
* The "Large" scales are intended to represent all values that can be represented
* by an STAmount - IOUs, XRP, and MPTs. It has a min value of 10^18, and a max
* value of 10^19-1.
* value of 10^19-1. "LargeLegacy" is like "Large", but preserves
* a rounding error when a computation results in a mantissa of
* Number::kMaxRep that needs to be rounded up, but rounds down
* instead. It will maintain consistent behavior until the fixCleanup3_2_0
* amendment is enabled.
*
* Note that if the mentioned amendments are eventually retired, this class
* should be left in place, but the "small" scale option should be removed. This
* should be left in place, but the "Small" scale option should be removed. This
* will allow for future expansion beyond 64-bits if it is ever needed.
*/
struct MantissaRange
struct MantissaRange final
{
using rep = std::uint64_t;
enum class MantissaScale { Small, Large };
explicit constexpr MantissaRange(MantissaScale scale)
: min(getMin(scale)), log(logTen(min).value_or(-1)), scale(scale)
enum class MantissaScale {
Small,
// LargeLegacy can be removed when fixCleanup3_2_0 is retired
LargeLegacy,
Large,
};
// This entire enum can be removed when fixCleanup3_2_0 is retired
enum class CuspRoundingFix : bool {
Disabled = false,
Enabled = true,
};
explicit constexpr MantissaRange(MantissaScale sc) : scale(sc)
{
}
rep min;
rep max{(min * 10) - 1};
int log;
MantissaScale scale;
MantissaScale const scale;
int const log{getExponent(scale)};
rep const min{getMin(scale, log)};
rep const max{(min * 10) - 1};
CuspRoundingFix const cuspRoundingFixEnabled{isCuspFixEnabled(scale)};
static MantissaRange const&
getMantissaRange(MantissaScale scale);
static std::set<MantissaScale> const&
getAllScales();
private:
static constexpr rep
getMin(MantissaScale scale)
static constexpr int
getExponent(MantissaScale scale)
{
switch (scale)
{
case MantissaScale::Small:
return 1'000'000'000'000'000ULL;
return 15;
case MantissaScale::LargeLegacy:
case MantissaScale::Large:
return 1'000'000'000'000'000'000ULL;
return 18;
// LCOV_EXCL_START
default:
// Since this can never be called outside a non-constexpr
// context, this throw assures that the build fails if an
// If called in a constexpr context, this throw assures that the build fails if an
// invalid scale is used.
throw std::runtime_error("Unknown mantissa scale");
// LCOV_EXCL_STOP
}
}
// Keep this function for future use with different ways to compute
// the ranges.
static constexpr rep
getMin(MantissaScale scale, int exponent)
{
if (exponent < 0 || exponent >= kPowerOfTen.size())
throw std::runtime_error("Invalid exponent"); // LCOV_EXCL_LINE
return kPowerOfTen[exponent];
}
static constexpr CuspRoundingFix
isCuspFixEnabled(MantissaScale scale)
{
switch (scale)
{
case MantissaScale::Small:
case MantissaScale::LargeLegacy:
return CuspRoundingFix::Disabled;
case MantissaScale::Large:
return CuspRoundingFix::Enabled;
default:
// If called in a constexpr context, this throw assures that the build fails if an
// invalid scale is used.
throw std::runtime_error("Unknown mantissa scale"); // LCOV_EXCL_LINE
}
}
static std::unordered_map<MantissaScale, MantissaRange> const&
getRanges();
};
// Like std::integral, but only 64-bit integral types.
@@ -203,7 +302,7 @@ concept Integral64 = std::is_same_v<T, std::int64_t> || std::is_same_v<T, std::u
* amendments are enabled to determine which result to expect.
*
*/
class Number
class Number final
{
using rep = std::int64_t;
using internalrep = MantissaRange::rep;
@@ -424,49 +523,28 @@ public:
return kRange.get().log;
}
/// oneSmall is needed because the ranges are private
static constexpr Number
oneSmall();
/// oneLarge is needed because the ranges are private
static constexpr Number
oneLarge();
// And one is needed because it needs to choose between oneSmall and
// oneLarge based on the current range
static Number
one();
template <Integral64 T>
template <
auto MinMantissa,
auto MaxMantissa,
Integral64 T = std::decay_t<decltype(MinMantissa)>>
[[nodiscard]]
std::pair<T, int>
normalizeToRange(T minMantissa, T maxMantissa) const;
normalizeToRange() const;
private:
static thread_local RoundingMode mode;
// The available ranges for mantissa
static constexpr MantissaRange kSmallRange{MantissaRange::MantissaScale::Small};
static_assert(isPowerOfTen(kSmallRange.min));
static_assert(kSmallRange.min == 1'000'000'000'000'000LL);
static_assert(kSmallRange.max == 9'999'999'999'999'999LL);
static_assert(kSmallRange.log == 15);
static_assert(kSmallRange.min < kMaxRep);
static_assert(kSmallRange.max < kMaxRep);
static constexpr MantissaRange kLargeRange{MantissaRange::MantissaScale::Large};
static_assert(isPowerOfTen(kLargeRange.min));
static_assert(kLargeRange.min == 1'000'000'000'000'000'000ULL);
static_assert(kLargeRange.max == internalrep(9'999'999'999'999'999'999ULL));
static_assert(kLargeRange.log == 18);
static_assert(kLargeRange.min < kMaxRep);
static_assert(kLargeRange.max > kMaxRep);
// The range for the mantissa when normalized.
// Use reference_wrapper to avoid making copies, and prevent accidentally
// changing the values inside the range.
static thread_local std::reference_wrapper<MantissaRange const> kRange;
void
normalize();
normalize(MantissaRange const& range);
/** Normalize Number components to an arbitrary range.
*
@@ -481,7 +559,8 @@ private:
T& mantissa,
int& exponent,
internalrep const& minMantissa,
internalrep const& maxMantissa);
internalrep const& maxMantissa,
MantissaRange::CuspRoundingFix cuspRoundingFixEnabled);
template <class T>
friend void
@@ -490,7 +569,9 @@ private:
T& mantissa,
int& exponent,
MantissaRange::rep const& minMantissa,
MantissaRange::rep const& maxMantissa);
MantissaRange::rep const& maxMantissa,
MantissaRange::CuspRoundingFix cuspRoundingFixEnabled,
bool dropped);
[[nodiscard]] bool
isnormal() const noexcept;
@@ -526,7 +607,7 @@ static constexpr Number kNumZero{};
inline Number::Number(bool negative, internalrep mantissa, int exponent, Normalized)
: Number(negative, mantissa, exponent, Unchecked{})
{
normalize();
normalize(kRange);
}
inline Number::Number(internalrep mantissa, int exponent, Normalized)
@@ -696,10 +777,21 @@ Number::isnormal() const noexcept
kMinExponent <= exponent_ && exponent_ <= kMaxExponent);
}
template <Integral64 T>
template <auto MinMantissa, auto MaxMantissa, Integral64 T>
std::pair<T, int>
Number::normalizeToRange(T minMantissa, T maxMantissa) const
Number::normalizeToRange() const
{
static_assert(std::is_same_v<T, std::uint64_t> || std::is_same_v<T, std::int64_t>);
static_assert(std::is_same_v<T, std::decay_t<decltype(MinMantissa)>>);
static_assert(std::is_same_v<T, std::decay_t<decltype(MaxMantissa)>>);
auto constexpr kMIN = static_cast<T>(MinMantissa);
auto constexpr kMAX = static_cast<T>(MaxMantissa);
static_assert(kMIN > 0);
static_assert(kMIN % 10 == 0);
static_assert(isPowerOfTen(kMIN));
static_assert(kMAX % 10 == 9);
static_assert((kMAX + 1) / 10 == kMIN);
bool negative = negative_;
internalrep mantissa = mantissa_;
int exponent = exponent_;
@@ -711,7 +803,10 @@ Number::normalizeToRange(T minMantissa, T maxMantissa) const
"xrpl::Number::normalizeToRange",
"Number is non-negative for unsigned range.");
}
Number::normalize(negative, mantissa, exponent, minMantissa, maxMantissa);
// Don't need to worry about the cuspRounding fix because rounding up will never take the
// mantissa over maxMantissa with a ones digit value other than 0. 0 can safely be truncated.
Number::normalize(
negative, mantissa, exponent, kMIN, kMAX, MantissaRange::CuspRoundingFix::Disabled);
auto const sign = negative ? -1 : 1;
return std::make_pair(static_cast<T>(sign * mantissa), exponent);
@@ -763,6 +858,8 @@ to_string(MantissaRange::MantissaScale const& scale)
{
case MantissaRange::MantissaScale::Small:
return "small";
case MantissaRange::MantissaScale::LargeLegacy:
return "largeLegacy";
case MantissaRange::MantissaScale::Large:
return "large";
default:

View File

@@ -181,14 +181,14 @@ private:
beast::insight::Collector::ptr const& collector)
: hook(collector->makeHook(handler))
, size(collector->makeGauge(prefix, "size"))
, hit_rate(collector->makeGauge(prefix, "hit_rate"))
, hitRate(collector->makeGauge(prefix, "hit_rate"))
{
}
beast::insight::Hook hook;
beast::insight::Gauge size;
beast::insight::Gauge hit_rate;
beast::insight::Gauge hitRate;
std::size_t hits{0};
std::size_t misses{0};
@@ -197,16 +197,16 @@ private:
class KeyOnlyEntry
{
public:
clock_type::time_point last_access;
clock_type::time_point lastAccess;
explicit KeyOnlyEntry(clock_type::time_point const& lastAccess) : last_access(lastAccess)
explicit KeyOnlyEntry(clock_type::time_point const& lastAccess) : lastAccess(lastAccess)
{
}
void
touch(clock_type::time_point const& now)
{
last_access = now;
lastAccess = now;
}
};
@@ -214,10 +214,10 @@ private:
{
public:
shared_weak_combo_pointer_type ptr;
clock_type::time_point last_access;
clock_type::time_point lastAccess;
ValueEntry(clock_type::time_point const& lastAccess, shared_pointer_type const& ptr)
: ptr(ptr), last_access(lastAccess)
: ptr(ptr), lastAccess(lastAccess)
{
}
@@ -246,7 +246,7 @@ private:
void
touch(clock_type::time_point const& now)
{
last_access = now;
lastAccess = now;
}
};
@@ -286,13 +286,13 @@ private:
std::string name_;
// Desired number of cache entries (0 = ignore)
int const target_size_;
int const targetSize_;
// Desired maximum cache age
clock_type::duration const target_age_;
clock_type::duration const targetAge_;
// Number of items cached
int cache_count_{0};
int cacheCount_{0};
cache_type cache_; // Hold strong reference to recent objects
std::uint64_t hits_{0};
std::uint64_t misses_{0};

View File

@@ -34,8 +34,8 @@ inline TaggedCache<
, clock_(clock)
, stats_(name, std::bind(&TaggedCache::collectMetrics, this), collector)
, name_(name)
, target_size_(size)
, target_age_(expiration)
, targetSize_(size)
, targetAge_(expiration)
{
}
@@ -86,7 +86,7 @@ TaggedCache<Key, T, IsKeyCache, SharedWeakUnionPointer, SharedPointerType, Hash,
getCacheSize() const
{
std::scoped_lock const lock(mutex_);
return cache_count_;
return cacheCount_;
}
template <
@@ -139,7 +139,7 @@ TaggedCache<Key, T, IsKeyCache, SharedWeakUnionPointer, SharedPointerType, Hash,
{
std::scoped_lock const lock(mutex_);
cache_.clear();
cache_count_ = 0;
cacheCount_ = 0;
}
template <
@@ -157,7 +157,7 @@ TaggedCache<Key, T, IsKeyCache, SharedWeakUnionPointer, SharedPointerType, Hash,
{
std::scoped_lock const lock(mutex_);
cache_.clear();
cache_count_ = 0;
cacheCount_ = 0;
hits_ = 0;
misses_ = 0;
}
@@ -213,21 +213,21 @@ TaggedCache<Key, T, IsKeyCache, SharedWeakUnionPointer, SharedPointerType, Hash,
{
std::scoped_lock const lock(mutex_);
if (target_size_ == 0 || (static_cast<int>(cache_.size()) <= target_size_))
if (targetSize_ == 0 || (static_cast<int>(cache_.size()) <= targetSize_))
{
whenExpire = now - target_age_;
whenExpire = now - targetAge_;
}
else
{
whenExpire = now - (target_age_ * target_size_ / cache_.size());
whenExpire = now - (targetAge_ * targetSize_ / cache_.size());
clock_type::duration const minimumAge(std::chrono::seconds(1));
if (whenExpire > (now - minimumAge))
whenExpire = now - minimumAge;
JLOG(journal_.trace())
<< name_ << " is growing fast " << cache_.size() << " of " << target_size_
<< " aging at " << (now - whenExpire).count() << " of " << target_age_.count();
<< name_ << " is growing fast " << cache_.size() << " of " << targetSize_
<< " aging at " << (now - whenExpire).count() << " of " << targetAge_.count();
}
std::vector<std::thread> workers;
@@ -242,7 +242,7 @@ TaggedCache<Key, T, IsKeyCache, SharedWeakUnionPointer, SharedPointerType, Hash,
for (std::thread& worker : workers)
worker.join();
cache_count_ -= allRemovals;
cacheCount_ -= allRemovals;
}
// At this point allStuffToSweep will go out of scope outside the lock
// and decrement the reference count on each strong pointer.
@@ -280,7 +280,7 @@ TaggedCache<Key, T, IsKeyCache, SharedWeakUnionPointer, SharedPointerType, Hash,
if (entry.isCached())
{
--cache_count_;
--cacheCount_;
entry.ptr.convertToWeak();
ret = true;
}
@@ -317,7 +317,7 @@ TaggedCache<Key, T, IsKeyCache, SharedWeakUnionPointer, SharedPointerType, Hash,
std::piecewise_construct,
std::forward_as_tuple(key),
std::forward_as_tuple(clock_.now(), data));
++cache_count_;
++cacheCount_;
return false;
}
@@ -366,12 +366,12 @@ TaggedCache<Key, T, IsKeyCache, SharedWeakUnionPointer, SharedPointerType, Hash,
data = cachedData;
}
++cache_count_;
++cacheCount_;
return true;
}
entry.ptr = data;
++cache_count_;
++cacheCount_;
return false;
}
@@ -477,7 +477,7 @@ TaggedCache<Key, T, IsKeyCache, SharedWeakUnionPointer, SharedPointerType, Hash,
auto [it, inserted] = cache_.emplace(
std::piecewise_construct, std::forward_as_tuple(key), std::forward_as_tuple(now));
if (!inserted)
it->second.last_access = now;
it->second.lastAccess = now;
return inserted;
}
@@ -626,7 +626,7 @@ TaggedCache<Key, T, IsKeyCache, SharedWeakUnionPointer, SharedPointerType, Hash,
if (entry.isCached())
{
// independent of cache size, so not counted as a hit
++cache_count_;
++cacheCount_;
entry.touch(clock_.now());
return entry.ptr.getStrong();
}
@@ -658,7 +658,7 @@ TaggedCache<Key, T, IsKeyCache, SharedWeakUnionPointer, SharedPointerType, Hash,
if (total != 0)
hitRate = (hits_ * 100) / total;
}
stats_.hit_rate.set(hitRate);
stats_.hitRate.set(hitRate);
}
}
@@ -706,7 +706,7 @@ TaggedCache<Key, T, IsKeyCache, SharedWeakUnionPointer, SharedPointerType, Hash,
++cit;
}
}
else if (cit->second.last_access <= whenExpire)
else if (cit->second.lastAccess <= whenExpire)
{
// strong, expired
++cacheRemovals;
@@ -773,12 +773,12 @@ TaggedCache<Key, T, IsKeyCache, SharedWeakUnionPointer, SharedPointerType, Hash,
auto cit = partition.begin();
while (cit != partition.end())
{
if (cit->second.last_access > now)
if (cit->second.lastAccess > now)
{
cit->second.last_access = now;
cit->second.lastAccess = now;
++cit;
}
else if (cit->second.last_access <= whenExpire)
else if (cit->second.lastAccess <= whenExpire)
{
cit = partition.erase(cit);
}

View File

@@ -24,20 +24,20 @@ namespace xrpl {
template <class EF>
class ScopeExit
{
EF exit_function_;
bool execute_on_destruction_{true};
EF exitFunction_;
bool executeOnDestruction_{true};
public:
~ScopeExit()
{
if (execute_on_destruction_)
exit_function_();
if (executeOnDestruction_)
exitFunction_();
}
ScopeExit(ScopeExit&& rhs) noexcept(
std::is_nothrow_move_constructible_v<EF> || std::is_nothrow_copy_constructible_v<EF>)
: exit_function_{std::forward<EF>(rhs.exit_function_)}
, execute_on_destruction_{rhs.execute_on_destruction_}
: exitFunction_{std::forward<EF>(rhs.exitFunction_)}
, executeOnDestruction_{rhs.executeOnDestruction_}
{
rhs.release();
}
@@ -51,7 +51,7 @@ public:
std::enable_if_t<
!std::is_same_v<std::remove_cv_t<EFP>, ScopeExit> &&
std::is_constructible_v<EF, EFP>>* = 0) noexcept
: exit_function_{std::forward<EFP>(f)}
: exitFunction_{std::forward<EFP>(f)}
{
static_assert(std::is_nothrow_constructible_v<EF, decltype(std::forward<EFP>(f))>);
}
@@ -59,7 +59,7 @@ public:
void
release() noexcept
{
execute_on_destruction_ = false;
executeOnDestruction_ = false;
}
};
@@ -69,22 +69,22 @@ ScopeExit(EF) -> ScopeExit<EF>;
template <class EF>
class ScopeFail
{
EF exit_function_;
bool execute_on_destruction_{true};
int uncaught_on_creation_{std::uncaught_exceptions()};
EF exitFunction_;
bool executeOnDestruction_{true};
int uncaughtOnCreation_{std::uncaught_exceptions()};
public:
~ScopeFail()
{
if (execute_on_destruction_ && std::uncaught_exceptions() > uncaught_on_creation_)
exit_function_();
if (executeOnDestruction_ && std::uncaught_exceptions() > uncaughtOnCreation_)
exitFunction_();
}
ScopeFail(ScopeFail&& rhs) noexcept(
std::is_nothrow_move_constructible_v<EF> || std::is_nothrow_copy_constructible_v<EF>)
: exit_function_{std::forward<EF>(rhs.exit_function_)}
, execute_on_destruction_{rhs.execute_on_destruction_}
, uncaught_on_creation_{rhs.uncaught_on_creation_}
: exitFunction_{std::forward<EF>(rhs.exitFunction_)}
, executeOnDestruction_{rhs.executeOnDestruction_}
, uncaughtOnCreation_{rhs.uncaughtOnCreation_}
{
rhs.release();
}
@@ -98,7 +98,7 @@ public:
std::enable_if_t<
!std::is_same_v<std::remove_cv_t<EFP>, ScopeFail> &&
std::is_constructible_v<EF, EFP>>* = 0) noexcept
: exit_function_{std::forward<EFP>(f)}
: exitFunction_{std::forward<EFP>(f)}
{
static_assert(std::is_nothrow_constructible_v<EF, decltype(std::forward<EFP>(f))>);
}
@@ -106,7 +106,7 @@ public:
void
release() noexcept
{
execute_on_destruction_ = false;
executeOnDestruction_ = false;
}
};
@@ -116,22 +116,22 @@ ScopeFail(EF) -> ScopeFail<EF>;
template <class EF>
class ScopeSuccess
{
EF exit_function_;
bool execute_on_destruction_{true};
int uncaught_on_creation_{std::uncaught_exceptions()};
EF exitFunction_;
bool executeOnDestruction_{true};
int uncaughtOnCreation_{std::uncaught_exceptions()};
public:
~ScopeSuccess() noexcept(noexcept(exit_function_()))
~ScopeSuccess() noexcept(noexcept(exitFunction_()))
{
if (execute_on_destruction_ && std::uncaught_exceptions() <= uncaught_on_creation_)
exit_function_();
if (executeOnDestruction_ && std::uncaught_exceptions() <= uncaughtOnCreation_)
exitFunction_();
}
ScopeSuccess(ScopeSuccess&& rhs) noexcept(
std::is_nothrow_move_constructible_v<EF> || std::is_nothrow_copy_constructible_v<EF>)
: exit_function_{std::forward<EF>(rhs.exit_function_)}
, execute_on_destruction_{rhs.execute_on_destruction_}
, uncaught_on_creation_{rhs.uncaught_on_creation_}
: exitFunction_{std::forward<EF>(rhs.exitFunction_)}
, executeOnDestruction_{rhs.executeOnDestruction_}
, uncaughtOnCreation_{rhs.uncaughtOnCreation_}
{
rhs.release();
}
@@ -146,14 +146,14 @@ public:
!std::is_same_v<std::remove_cv_t<EFP>, ScopeSuccess> &&
std::is_constructible_v<EF, EFP>>* =
0) noexcept(std::is_nothrow_constructible_v<EF, EFP> || std::is_nothrow_constructible_v<EF, EFP&>)
: exit_function_{std::forward<EFP>(f)}
: exitFunction_{std::forward<EFP>(f)}
{
}
void
release() noexcept
{
execute_on_destruction_ = false;
executeOnDestruction_ = false;
}
};

View File

@@ -77,8 +77,8 @@ private:
std::ostream& os_;
Results results_;
SuiteResults suite_results_;
CaseResults case_results_;
SuiteResults suiteResults_;
CaseResults caseResults_;
public:
Reporter(Reporter const&) = delete;
@@ -196,22 +196,22 @@ template <class Unused>
void
Reporter<Unused>::onSuiteBegin(SuiteInfo const& info)
{
suite_results_ = SuiteResults{info.fullName()};
suiteResults_ = SuiteResults{info.fullName()};
}
template <class Unused>
void
Reporter<Unused>::onSuiteEnd()
{
results_.add(suite_results_);
results_.add(suiteResults_);
}
template <class Unused>
void
Reporter<Unused>::onCaseBegin(std::string const& name)
{
case_results_ = CaseResults(name);
os_ << suite_results_.name << (case_results_.name.empty() ? "" : (" " + case_results_.name))
caseResults_ = CaseResults(name);
os_ << suiteResults_.name << (caseResults_.name.empty() ? "" : (" " + caseResults_.name))
<< std::endl;
}
@@ -219,23 +219,23 @@ template <class Unused>
void
Reporter<Unused>::onCaseEnd()
{
suite_results_.add(case_results_);
suiteResults_.add(caseResults_);
}
template <class Unused>
void
Reporter<Unused>::onPass()
{
++case_results_.total;
++caseResults_.total;
}
template <class Unused>
void
Reporter<Unused>::onFail(std::string const& reason)
{
++case_results_.failed;
++case_results_.total;
os_ << "#" << case_results_.total << " failed" << (reason.empty() ? "" : ": ") << reason
++caseResults_.failed;
++caseResults_.total;
os_ << "#" << caseResults_.total << " failed" << (reason.empty() ? "" : ": ") << reason
<< std::endl;
}

View File

@@ -47,7 +47,7 @@ inline bool
JobQueue::Coro::post()
{
{
std::scoped_lock const lk(mutex_run_);
std::scoped_lock const lk(mutexRun_);
running_ = true;
}
@@ -58,7 +58,7 @@ JobQueue::Coro::post()
}
// The coroutine will not run. Clean up running_.
std::scoped_lock const lk(mutex_run_);
std::scoped_lock const lk(mutexRun_);
running_ = false;
cv_.notify_all();
return false;
@@ -68,7 +68,7 @@ inline void
JobQueue::Coro::resume()
{
{
std::scoped_lock const lk(mutex_run_);
std::scoped_lock const lk(mutexRun_);
running_ = true;
}
{
@@ -92,7 +92,7 @@ JobQueue::Coro::resume()
}
detail::getLocalValues().release();
detail::getLocalValues().reset(saved);
std::scoped_lock const lk(mutex_run_);
std::scoped_lock const lk(mutexRun_);
running_ = false;
cv_.notify_all();
}
@@ -127,7 +127,7 @@ JobQueue::Coro::expectEarlyExit()
inline void
JobQueue::Coro::join()
{
std::unique_lock<std::mutex> lk(mutex_run_);
std::unique_lock<std::mutex> lk(mutexRun_);
cv_.wait(lk, [this]() { return !running_; });
}

View File

@@ -127,7 +127,7 @@ private:
std::function<void()> job_;
std::shared_ptr<LoadEvent> loadEvent_;
std::string name_;
clock_type::time_point queue_time_;
clock_type::time_point queueTime_;
};
using JobCounter = ClosureCounter<void>;

View File

@@ -52,7 +52,7 @@ public:
std::string name_;
bool running_{false};
std::mutex mutex_;
std::mutex mutex_run_;
std::mutex mutexRun_;
std::condition_variable cv_;
boost::coroutines2::coroutine<void>::push_type* yield_{};
boost::coroutines2::coroutine<void>::pull_type coro_;
@@ -246,7 +246,7 @@ private:
// Statistics tracking
perf::PerfLog& perfLog_;
beast::insight::Collector::ptr collector_;
beast::insight::Gauge job_count_;
beast::insight::Gauge jobCount_;
beast::insight::Hook hook_;
std::condition_variable cv_;

View File

@@ -161,7 +161,7 @@ public:
* While the JSON spec doesn't explicitly disallow this, you should avoid
* calling this method twice with the same tag for the same object.
*
* If CHECK_JSON_WRITER is defined, this function throws an exception if if
* If CHECK_JSON_WRITER is defined, this function throws an exception if
* the tag you use has already been used in this object.
*/
template <typename Type>

View File

@@ -9,7 +9,7 @@ class BookDirs
private:
ReadView const* view_ = nullptr;
uint256 const root_;
uint256 const next_quality_;
uint256 const nextQuality_;
uint256 const key_;
std::shared_ptr<SLE const> sle_ = nullptr;
unsigned int entry_ = 0;
@@ -67,15 +67,15 @@ private:
friend class BookDirs;
const_iterator(ReadView const& view, uint256 const& root, uint256 const& dirKey)
: view_(&view), root_(root), key_(dirKey), cur_key_(dirKey)
: view_(&view), root_(root), key_(dirKey), curKey_(dirKey)
{
}
ReadView const* view_ = nullptr;
uint256 root_;
uint256 next_quality_;
uint256 nextQuality_;
uint256 key_;
uint256 cur_key_;
uint256 curKey_;
std::shared_ptr<SLE const> sle_;
unsigned int entry_ = 0;
uint256 index_;

View File

@@ -76,7 +76,7 @@ private:
// monotonic_resource_ must outlive `items_`. Make a pointer so it may be
// easily moved.
std::unique_ptr<boost::container::pmr::monotonic_buffer_resource> monotonic_resource_;
std::unique_ptr<boost::container::pmr::monotonic_buffer_resource> monotonicResource_;
txs_map txs_;
Rules rules_;
LedgerHeader header_;

View File

@@ -57,7 +57,7 @@ isVaultPseudoAccountFrozen(
ReadView const& view,
AccountID const& account,
MPTIssue const& mptShare,
int depth);
std::uint8_t depth);
[[nodiscard]] bool
isLPTokenFrozen(

View File

@@ -22,14 +22,14 @@ public:
static constexpr size_t kInitialBufferSize = kilobytes(256);
RawStateTable()
: monotonic_resource_{std::make_unique<boost::container::pmr::monotonic_buffer_resource>(
: monotonicResource_{std::make_unique<boost::container::pmr::monotonic_buffer_resource>(
kInitialBufferSize)}
, items_{monotonic_resource_.get()} {};
, items_{monotonicResource_.get()} {};
RawStateTable(RawStateTable const& rhs)
: monotonic_resource_{std::make_unique<boost::container::pmr::monotonic_buffer_resource>(
: monotonicResource_{std::make_unique<boost::container::pmr::monotonic_buffer_resource>(
kInitialBufferSize)}
, items_{rhs.items_, monotonic_resource_.get()}
, items_{rhs.items_, monotonicResource_.get()}
, dropsDestroyed_{rhs.dropsDestroyed_} {};
RawStateTable(RawStateTable&&) = default;
@@ -101,7 +101,7 @@ private:
boost::container::pmr::polymorphic_allocator<std::pair<key_type const, SleAction>>>;
// monotonic_resource_ must outlive `items_`. Make a pointer so it may be
// easily moved.
std::unique_ptr<boost::container::pmr::monotonic_buffer_resource> monotonic_resource_;
std::unique_ptr<boost::container::pmr::monotonic_buffer_resource> monotonicResource_;
items_t items_;
XRPAmount dropsDestroyed_{0};

View File

@@ -4,8 +4,38 @@
#include <xrpl/protocol/Rules.h>
#include <xrpl/protocol/st.h>
#include <string_view>
namespace xrpl {
/**
* Broker cover preclaim precision guard (fixCleanup3_2_0).
*
* Prevents a "silent sub-ULP no-op" where a deposit, withdrawal, or clawback
* amount is so small that it rounds to zero at `sfCoverAvailable`'s scale.
* Without this guard, both the pseudo trust-line and `sfCoverAvailable` would
* identically absorb the rounded zero, resulting in a successful transaction
* (tesSUCCESS) where no funds actually moved.
*
* @param view Read view (rules used for amendment gating).
* @param sleBroker The loan broker SLE (read-only).
* @param vaultAsset The underlying vault asset (the broker's cover asset).
* @param amount The effective subtraction/addition amount.
* @param j Journal for logging.
* @param logPrefix Transactor name for log diagnostics.
*
* @return `tecPRECISION_LOSS` if the request rounds to zero at cover scale.
* `tesSUCCESS` if the amendment is disabled or the request is safely supra-ULP.
*/
[[nodiscard]] TER
canApplyToBrokerCover(
ReadView const& view,
SLE::const_ref sleBroker,
Asset const& vaultAsset,
STAmount const& amount,
beast::Journal j,
std::string_view logPrefix);
// Lending protocol has dependencies, so capture them here.
bool
checkLendingProtocolDependencies(Rules const& rules, STTx const& tx);
@@ -173,6 +203,21 @@ getAssetsTotalScale(SLE::const_ref vaultSle)
return scale(vaultSle->at(sfAssetsTotal), vaultSle->at(sfAsset));
}
// Compute the minimum required broker cover, rounded consistently.
// DebtTotal is a broker-level aggregate maintained at vault scale, so the
// rounding must also use vault scale — never an individual loan's scale.
inline Number
minimumBrokerCover(Number const& debtTotal, TenthBips32 coverRateMinimum, SLE::const_ref vaultSle)
{
XRPL_ASSERT(
vaultSle && vaultSle->getType() == ltVAULT, "xrpl::minimumBrokerCover : valid Vault sle");
NumberRoundModeGuard const mg(Number::RoundingMode::Upward);
return roundToAsset(
vaultSle->at(sfAsset),
tenthBipsOfValue(debtTotal, coverRateMinimum),
getAssetsTotalScale(vaultSle));
}
TER
checkLoanGuards(
Asset const& vaultAsset,
@@ -416,6 +461,7 @@ loanAccruedInterest(
ExtendedPaymentComponents
computeOverpaymentComponents(
Rules const& rules,
Asset const& asset,
int32_t const loanScale,
Number const& overpayment,

View File

@@ -27,14 +27,18 @@ isGlobalFrozen(ReadView const& view, MPTIssue const& mptIssue);
isIndividualFrozen(ReadView const& view, AccountID const& account, MPTIssue const& mptIssue);
[[nodiscard]] bool
isFrozen(ReadView const& view, AccountID const& account, MPTIssue const& mptIssue, int depth = 0);
isFrozen(
ReadView const& view,
AccountID const& account,
MPTIssue const& mptIssue,
std::uint8_t depth = 0);
[[nodiscard]] bool
isAnyFrozen(
ReadView const& view,
std::initializer_list<AccountID> const& accounts,
MPTIssue const& mptIssue,
int depth = 0);
std::uint8_t depth = 0);
//------------------------------------------------------------------------------
//
@@ -88,7 +92,7 @@ requireAuth(
MPTIssue const& mptIssue,
AccountID const& account,
AuthType authType = AuthType::Legacy,
int depth = 0);
std::uint8_t depth = 0);
/** Enforce account has MPToken to match its authorization.
*
@@ -104,22 +108,77 @@ enforceMPTokenAuthorization(
XRPAmount const& priorBalance,
beast::Journal j);
/** Check if the destination account is allowed
* to receive MPT. Return tecNO_AUTH if it doesn't
* and tesSUCCESS otherwise.
/** Resolve the underlying asset of a vault share.
*
* Reads sfReferenceHolding from @p sleShareIssuance to determine which
* asset the vault wraps. @p sleHolding must be the SLE that
* sfReferenceHolding points to — either an ltMPTOKEN (returns its
* MPTIssue) or an ltRIPPLE_STATE (returns its low/high Issue).
*
* @pre Both SLEs must exist and @p sleHolding must be of type ltMPTOKEN
* or ltRIPPLE_STATE. Passing any other type is undefined behaviour.
* @param sleShareIssuance MPTokenIssuance SLE for the vault share token.
* @param sleHolding SLE referenced by sfReferenceHolding.
* @return The underlying Asset (MPTIssue or Issue).
*/
[[nodiscard]] Asset
assetOfHolding(SLE const& sleShareIssuance, SLE const& sleHolding);
/** Check whether @p to may receive the given MPT from @p from.
*
* The check passes when any of the following is true:
* - @p waive is WaiveMPTCanTransfer::Yes (recovery-path exemption), or
* - @p from or @p to is the issuer, or
* - lsfMPTCanTransfer is set on the MPTokenIssuance.
*
* For vault shares (MPTokenIssuances that carry sfReferenceHolding) the
* check recurses into the underlying asset's transferability. This
* recursion is defensive; vault-of-vault-shares is rejected at vault
* creation, so in practice depth never exceeds 1.
*
* @param view Ledger state to read from.
* @param mptIssue The MPT issuance being transferred.
* @param from Sending account.
* @param to Receiving account.
* @param waive WaiveMPTCanTransfer::Yes skips the lsfMPTCanTransfer
* check. Use for recovery paths (e.g. unwinding SAV or
* Lending Protocol positions after an issuer revokes
* transferability).
* @param depth Recursion depth; bounded at kMaxAssetCheckDepth.
* @return tesSUCCESS if the transfer is allowed, tecNO_AUTH otherwise.
*/
[[nodiscard]] TER
canTransfer(
ReadView const& view,
MPTIssue const& mptIssue,
AccountID const& from,
AccountID const& to);
AccountID const& to,
WaiveMPTCanTransfer waive = WaiveMPTCanTransfer::No,
std::uint8_t depth = 0);
/** Check if Asset can be traded on DEX. return tecNO_PERMISSION
* if it doesn't and tesSUCCESS otherwise.
/** Check whether @p asset may be traded on the DEX.
*
* For IOU assets the check delegates to the existing offer/AMM freeze
* logic. For MPT assets it checks lsfMPTCanTrade on the MPTokenIssuance.
* Vault shares recurse into the underlying asset's tradability via
* sfReferenceHolding; depth is bounded at kMaxAssetCheckDepth.
*
* @param view Ledger state to read from.
* @param asset The asset to check.
* @param depth Recursion depth; bounded at kMaxAssetCheckDepth.
* @return tesSUCCESS if trading is allowed, tecNO_PERMISSION otherwise.
*/
[[nodiscard]] TER
canTrade(ReadView const& view, Asset const& asset);
canTrade(ReadView const& view, Asset const& asset, std::uint8_t depth = 0);
/** Convenience to combine canTrade/Transfer. Returns tesSUCCESS if Asset is Issue.
*/
[[nodiscard]] TER
canMPTTradeAndTransfer(
ReadView const& v,
Asset const& asset,
AccountID const& from,
AccountID const& to);
//------------------------------------------------------------------------------
//
@@ -227,17 +286,4 @@ issuerFundsToSelfIssue(ReadView const& view, MPTIssue const& issue);
void
issuerSelfDebitHookMPT(ApplyView& view, MPTIssue const& issue, std::uint64_t amount);
//------------------------------------------------------------------------------
//
// MPT DEX
//
//------------------------------------------------------------------------------
/* Return true if a transaction is allowed for the specified MPT/account. The
* function checks MPTokenIssuance and MPToken objects flags to determine if the
* transaction is allowed.
*/
TER
checkMPTTxAllowed(ReadView const& v, TxType tx, Asset const& asset, AccountID const& accountID);
} // namespace xrpl

View File

@@ -93,7 +93,7 @@ isFrozen(ReadView const& view, AccountID const& account, Issue const& issue)
// Overload with depth parameter for uniformity with MPTIssue version.
// The depth parameter is ignored for IOUs since they don't have vault recursion.
[[nodiscard]] inline bool
isFrozen(ReadView const& view, AccountID const& account, Issue const& issue, int /*depth*/)
isFrozen(ReadView const& view, AccountID const& account, Issue const& issue, std::uint8_t /*depth*/)
{
return isFrozen(view, account, issue);
}
@@ -110,7 +110,7 @@ isDeepFrozen(
ReadView const& view,
AccountID const& account,
Issue const& issue,
int = 0 /*ignored*/)
std::uint8_t = 0 /*ignored*/)
{
return isDeepFrozen(view, account, issue.currency, issue.account);
}

View File

@@ -34,6 +34,15 @@ enum class WaiveTransferFee : bool { No = false, Yes };
/** Controls whether accountSend is allowed to overflow OutstandingAmount **/
enum class AllowMPTOverflow : bool { No = false, Yes };
/** Controls whether canTransfer enforces lsfMPTCanTransfer on MPTs.
*
* Default is No (enforce). Use Yes at call sites that must remain available
* even when an MPT issuer has cleared lsfMPTCanTransfer - for example,
* unwinding existing positions in SAV or the Lending Protocol. Has no
* effect on the IOU branch of canTransfer.
*/
enum class WaiveMPTCanTransfer : bool { No = false, Yes };
/* Check if MPToken (for MPT) or trust line (for IOU) exists:
* - StrongAuth - before checking if authorization is required
* - WeakAuth
@@ -54,16 +63,26 @@ enum class AuthType { StrongAuth, WeakAuth, Legacy };
[[nodiscard]] bool
isGlobalFrozen(ReadView const& view, Asset const& asset);
[[nodiscard]] TER
checkGlobalFrozen(ReadView const& view, Asset const& asset);
[[nodiscard]] bool
isIndividualFrozen(ReadView const& view, AccountID const& account, Asset const& asset);
[[nodiscard]] TER
checkIndividualFrozen(ReadView const& view, AccountID const& account, Asset const& asset);
/**
* isFrozen check is recursive for MPT shares in a vault, descending to
* assets in the vault, up to maxAssetCheckDepth recursion depth. This is
* purely defensive, as we currently do not allow such vaults to be created.
*/
[[nodiscard]] bool
isFrozen(ReadView const& view, AccountID const& account, Asset const& asset, int depth = 0);
isFrozen(
ReadView const& view,
AccountID const& account,
Asset const& asset,
std::uint8_t depth = 0);
[[nodiscard]] TER
checkFrozen(ReadView const& view, AccountID const& account, Issue const& issue);
@@ -85,14 +104,14 @@ isAnyFrozen(
ReadView const& view,
std::initializer_list<AccountID> const& accounts,
Asset const& asset,
int depth = 0);
std::uint8_t depth = 0);
[[nodiscard]] bool
isDeepFrozen(
ReadView const& view,
AccountID const& account,
MPTIssue const& mptIssue,
int depth = 0);
std::uint8_t depth = 0);
/**
* isFrozen check is recursive for MPT shares in a vault, descending to
@@ -100,7 +119,11 @@ isDeepFrozen(
* purely defensive, as we currently do not allow such vaults to be created.
*/
[[nodiscard]] bool
isDeepFrozen(ReadView const& view, AccountID const& account, Asset const& asset, int depth = 0);
isDeepFrozen(
ReadView const& view,
AccountID const& account,
Asset const& asset,
std::uint8_t depth = 0);
[[nodiscard]] TER
checkDeepFrozen(ReadView const& view, AccountID const& account, MPTIssue const& mptIssue);
@@ -234,7 +257,13 @@ requireAuth(
AuthType authType = AuthType::Legacy);
[[nodiscard]] TER
canTransfer(ReadView const& view, Asset const& asset, AccountID const& from, AccountID const& to);
canTransfer(
ReadView const& view,
Asset const& asset,
AccountID const& from,
AccountID const& to,
WaiveMPTCanTransfer waive = WaiveMPTCanTransfer::No,
std::uint8_t depth = 0);
//------------------------------------------------------------------------------
//

View File

@@ -1,5 +1,7 @@
#pragma once
#include <xrpl/ledger/ReadView.h>
#include <xrpl/protocol/AccountID.h>
#include <xrpl/protocol/STAmount.h>
#include <xrpl/protocol/STLedgerEntry.h>
@@ -43,6 +45,14 @@ sharesToAssetsDeposit(
/** Controls whether to truncate shares instead of rounding. */
enum class TruncateShares : bool { No = false, Yes = true };
/** Controls whether the withdraw conversion helpers
(assetsToSharesWithdraw and sharesToAssetsWithdraw) subtract
sfLossUnrealized from sfAssetsTotal before computing the exchange rate.
The default (No) applies the standard discounted rate; Yes is used when
the redeemer is the sole remaining shareholder.
*/
enum class WaiveUnrealizedLoss : bool { No = false, Yes = true };
/** From the perspective of a vault, return the number of shares to demand from
the depositor when they ask to withdraw a fixed amount of assets. Since
shares are MPT this number is integral, and it will be rounded to nearest
@@ -52,6 +62,8 @@ enum class TruncateShares : bool { No = false, Yes = true };
@param issuance The MPTokenIssuance SLE for the vault's shares.
@param assets The amount of assets to convert.
@param truncate Whether to truncate instead of rounding.
@param waive Whether to waive the unrealized-loss discount when computing
the exchange rate.
@return The number of shares, or nullopt on error.
*/
@@ -60,7 +72,8 @@ assetsToSharesWithdraw(
std::shared_ptr<SLE const> const& vault,
std::shared_ptr<SLE const> const& issuance,
STAmount const& assets,
TruncateShares truncate = TruncateShares::No);
TruncateShares truncate = TruncateShares::No,
WaiveUnrealizedLoss waive = WaiveUnrealizedLoss::No);
/** From the perspective of a vault, return the number of assets to give the
depositor when they redeem a fixed amount of shares. Note, since shares are
@@ -69,6 +82,8 @@ assetsToSharesWithdraw(
@param vault The vault SLE.
@param issuance The MPTokenIssuance SLE for the vault's shares.
@param shares The amount of shares to convert.
@param waive Whether to waive (i.e. not subtract) the vault's unrealized
loss when computing the exchange rate.
@return The number of assets, or nullopt on error.
*/
@@ -76,6 +91,22 @@ assetsToSharesWithdraw(
sharesToAssetsWithdraw(
std::shared_ptr<SLE const> const& vault,
std::shared_ptr<SLE const> const& issuance,
STAmount const& shares);
STAmount const& shares,
WaiveUnrealizedLoss waive = WaiveUnrealizedLoss::No);
/** Returns true iff `account` holds all of the vault's outstanding shares —
i.e. is the sole remaining shareholder. Returns false if the account
holds no shares or fewer than the total outstanding.
@param view The ledger view.
@param account The candidate sole shareholder.
@param issuance The MPTokenIssuance SLE for the vault's shares; provides
both the share MPTID and the outstanding-amount total.
*/
[[nodiscard]] bool
isSoleShareholder(
ReadView const& view,
AccountID const& account,
std::shared_ptr<SLE const> const& issuance);
} // namespace xrpl

View File

@@ -21,13 +21,13 @@ public:
bool sslVerify,
beast::Journal j,
boost::asio::ssl::context_base::method method = boost::asio::ssl::context::sslv23)
: ssl_context_{method}, j_(j), verify_{sslVerify}
: sslContext_{method}, j_(j), verify_{sslVerify}
{
boost::system::error_code ec;
if (sslVerifyFile.empty())
{
registerSSLCerts(ssl_context_, ec, j_);
registerSSLCerts(sslContext_, ec, j_);
if (ec && sslVerifyDir.empty())
{
@@ -37,12 +37,12 @@ public:
}
else
{
ssl_context_.load_verify_file(sslVerifyFile);
sslContext_.load_verify_file(sslVerifyFile);
}
if (!sslVerifyDir.empty())
{
ssl_context_.add_verify_path(sslVerifyDir, ec);
sslContext_.add_verify_path(sslVerifyDir, ec);
if (ec)
{
@@ -55,7 +55,7 @@ public:
boost::asio::ssl::context&
context()
{
return ssl_context_;
return sslContext_;
}
[[nodiscard]] bool
@@ -153,7 +153,7 @@ public:
}
private:
boost::asio::ssl::context ssl_context_;
boost::asio::ssl::context sslContext_;
beast::Journal const j_;
bool const verify_;
};

View File

@@ -83,10 +83,6 @@ public:
virtual Status
fetch(uint256 const& hash, std::shared_ptr<NodeObject>* pObject) = 0;
/** Fetch a batch synchronously. */
virtual std::pair<std::vector<std::shared_ptr<NodeObject>>, Status>
fetchBatch(std::vector<uint256> const& hashes) = 0;
/** Store a single object.
Depending on the implementation this may happen immediately
or deferred using a scheduled task.

View File

@@ -131,6 +131,10 @@ public:
std::uint32_t ledgerSeq,
std::function<void(std::shared_ptr<NodeObject> const&)>&& callback);
/** Remove expired entries from the positive and negative caches. */
virtual void
sweep() = 0;
/** Gather statistics pertaining to read and write activities.
*
* @param obj Json object reference into which to place counters.

View File

@@ -22,6 +22,32 @@ public:
beast::Journal j)
: Database(scheduler, readThreads, config, j), backend_(std::move(backend))
{
std::optional<int> cacheSize, cacheAge;
if (config.exists("cache_size"))
{
cacheSize = get<int>(config, "cache_size");
if (cacheSize.value() < 0)
Throw<std::runtime_error>("Specified negative value for cache_size");
}
if (config.exists("cache_age"))
{
cacheAge = get<int>(config, "cache_age");
if (cacheAge.value() < 0)
Throw<std::runtime_error>("Specified negative value for cache_age");
}
if (cacheSize.has_value() || cacheAge.has_value())
{
cache_ = std::make_shared<TaggedCache<uint256, NodeObject>>(
"DatabaseNodeImp",
cacheSize.value_or(0),
std::chrono::minutes(cacheAge.value_or(0)),
stopwatch(),
j);
}
XRPL_ASSERT(
backend_,
"xrpl::NodeStore::DatabaseNodeImp::DatabaseNodeImp : non-null "
@@ -67,16 +93,19 @@ public:
backend_->sync();
}
std::vector<std::shared_ptr<NodeObject>>
fetchBatch(std::vector<uint256> const& hashes);
void
asyncFetch(
uint256 const& hash,
std::uint32_t ledgerSeq,
std::function<void(std::shared_ptr<NodeObject> const&)>&& callback) override;
void
sweep() override;
private:
// Cache for database objects. This cache is not always initialized. Check
// for null before using.
std::shared_ptr<TaggedCache<uint256, NodeObject>> cache_;
// Persistent key/value storage
std::shared_ptr<Backend> backend_;

View File

@@ -55,6 +55,9 @@ public:
void
sync() override;
void
sweep() override;
private:
std::shared_ptr<Backend> writableBackend_;
std::shared_ptr<Backend> archiveBackend_;

View File

@@ -140,8 +140,8 @@ private:
using issue_hasher = std::hash<xrpl::Issue>;
using mptissue_hasher = std::hash<xrpl::MPTIssue>;
issue_hasher m_issue_hasher_;
mptissue_hasher m_mptissue_hasher_;
issue_hasher mIssueHasher_;
mptissue_hasher mMptissueHasher_;
public:
explicit hash() = default;
@@ -151,11 +151,11 @@ public:
{
return asset.visit(
[&](xrpl::Issue const& issue) {
value_type const result(m_issue_hasher_(issue));
value_type const result(mIssueHasher_(issue));
return result;
},
[&](xrpl::MPTIssue const& issue) {
value_type const result(m_mptissue_hasher_(issue));
value_type const result(mMptissueHasher_(issue));
return result;
});
}
@@ -170,8 +170,8 @@ private:
using asset_hasher = std::hash<xrpl::Asset>;
using uint256_hasher = xrpl::uint256::hasher;
asset_hasher issue_hasher_;
uint256_hasher uint256_hasher_;
asset_hasher issueHasher_;
uint256_hasher uint256Hasher_;
public:
hash() = default;
@@ -182,11 +182,11 @@ public:
value_type
operator()(argument_type const& value) const
{
value_type result(issue_hasher_(value.in));
boost::hash_combine(result, issue_hasher_(value.out));
value_type result(issueHasher_(value.in));
boost::hash_combine(result, issueHasher_(value.out));
if (value.domain)
boost::hash_combine(result, uint256_hasher_(*value.domain));
boost::hash_combine(result, uint256Hasher_(*value.domain));
return result;
}

View File

@@ -172,24 +172,24 @@ struct ErrorInfo
{
// Default ctor needed to produce an empty std::array during constexpr eval.
constexpr ErrorInfo()
: code(RpcUnknown), token("unknown"), message("An unknown error code."), http_status(200)
: code(RpcUnknown), token("unknown"), message("An unknown error code."), httpStatus(200)
{
}
constexpr ErrorInfo(ErrorCodeI code, char const* token, char const* message)
: code(code), token(token), message(message), http_status(200)
: code(code), token(token), message(message), httpStatus(200)
{
}
constexpr ErrorInfo(ErrorCodeI code, char const* token, char const* message, int httpStatus)
: code(code), token(token), message(message), http_status(httpStatus)
: code(code), token(token), message(message), httpStatus(httpStatus)
{
}
ErrorCodeI code;
json::StaticString token;
json::StaticString message;
int http_status;
int httpStatus;
};
/** Returns an ErrorInfo that reflects the error code. */

View File

@@ -122,4 +122,17 @@ private:
std::optional<Rules> saved_;
};
class NumberSO;
class NumberMantissaScaleGuard;
bool
useRulesGuards(Rules const& rules);
void
createGuards(
Rules const& rules,
std::optional<NumberSO>& stNumberSO,
std::optional<CurrentTransactionRulesGuard>& rulesGuard,
std::optional<NumberMantissaScaleGuard>& mantissaScaleGuard);
} // namespace xrpl

View File

@@ -365,8 +365,8 @@ using SF_XCHAIN_BRIDGE = TypedField<STXChainBridge>;
#define UNTYPED_SFIELD(sfName, stiSuffix, fieldValue, ...) extern SField const sfName;
#define TYPED_SFIELD(sfName, stiSuffix, fieldValue, ...) extern SF_##stiSuffix const sfName;
extern SField const kSfInvalid;
extern SField const kSfGeneric;
extern SField const sfInvalid; // NOLINT(readability-identifier-naming)
extern SField const sfGeneric; // NOLINT(readability-identifier-naming)
#include <xrpl/protocol/detail/sfields.macro>

View File

@@ -3,11 +3,13 @@
#include <xrpl/basics/CountedObject.h>
#include <xrpl/basics/LocalValue.h>
#include <xrpl/basics/Number.h>
#include <xrpl/beast/utility/Journal.h>
#include <xrpl/beast/utility/instrumentation.h>
#include <xrpl/protocol/Asset.h>
#include <xrpl/protocol/IOUAmount.h>
#include <xrpl/protocol/Issue.h>
#include <xrpl/protocol/MPTAmount.h>
#include <xrpl/protocol/Protocol.h>
#include <xrpl/protocol/SField.h>
#include <xrpl/protocol/STBase.h>
#include <xrpl/protocol/Serializer.h>
@@ -184,6 +186,23 @@ public:
[[nodiscard]] STAmount const&
value() const noexcept;
/**
* Checks if this amount evaluates to zero when constrained to a specific
* accounting scale.
* For XRP and MPT `roundToScale` is a no-op, returns true only when the amount itself is zero.
* The `scale` argument is ignored in that case.
* For IOU, the amount is rounded to the given scale using Number::RoundingMode::ToNearest mode
* and the result is checked for zero; if `scale <= exponent()`, `roundToScale` short-circuits
* and returns the value unchanged, so this returns false for any non-zero amount.
*
* @param scale The target accounting scale to evaluate against.
* @return `true` if this amount rounds to zero at the given scale, `false` otherwise.
*
* @see roundToScale
*/
[[nodiscard]] bool
isZeroAtScale(int scale) const;
//--------------------------------------------------------------------------
//
// Operators
@@ -540,7 +559,7 @@ STAmount::fromNumber(A const& a, Number const& number)
return STAmount{asset, intValue, 0, negative};
}
auto const [mantissa, exponent] = working.normalizeToRange(kMinValue, kMaxValue);
auto const [mantissa, exponent] = working.normalizeToRange<kMinValue, kMaxValue>();
return STAmount{asset, mantissa, exponent, negative};
}
@@ -575,12 +594,25 @@ STAmount::value() const noexcept
return *this;
}
inline bool
[[nodiscard]] inline bool
isLegalNet(STAmount const& value)
{
return !value.native() || (value.mantissa() <= STAmount::kMaxNativeN);
}
[[nodiscard]] inline bool
isLegalMPT(STAmount const& value)
{
return !value.holds<MPTIssue>() ||
(!value.negative() && value.exponent() == 0 && value.mantissa() <= kMaxMpTokenAmount);
}
/* Check recursively if an object has invalid MPTAmount or XRPAmount in STAmount field.
* Calls isLegalNet() and isLegalMPT().
*/
[[nodiscard]] bool
hasInvalidAmount(STBase const& field, beast::Journal j);
//------------------------------------------------------------------------------
//
// Operators

View File

@@ -24,7 +24,7 @@ public:
STBlob(SField const& f, void const* data, std::size_t size);
STBlob(SField const& f, Buffer&& b);
STBlob(SField const& n);
STBlob(SerialIter&, SField const& name = kSfGeneric);
STBlob(SerialIter&, SField const& name = sfGeneric);
[[nodiscard]] std::size_t
size() const;

View File

@@ -21,8 +21,8 @@ class STPathElement final : public CountedObject<STPathElement>
PathAsset assetID_;
AccountID issuerID_;
bool is_offer_;
std::size_t hash_value_;
bool isOffer_;
std::size_t hashValue_;
public:
// Bitwise values (typeCurrency | typeMPT)
@@ -235,9 +235,9 @@ private:
// ------------ STPathElement ------------
inline STPathElement::STPathElement() : type_(TypeNone), is_offer_(true)
inline STPathElement::STPathElement() : type_(TypeNone), isOffer_(true)
{
hash_value_ = getHash(*this);
hashValue_ = getHash(*this);
}
inline STPathElement::STPathElement(
@@ -248,11 +248,11 @@ inline STPathElement::STPathElement(
{
if (!account)
{
is_offer_ = true;
isOffer_ = true;
}
else
{
is_offer_ = false;
isOffer_ = false;
accountID_ = *account;
type_ |= TypeAccount;
XRPL_ASSERT(
@@ -272,7 +272,7 @@ inline STPathElement::STPathElement(
XRPL_ASSERT(issuerID_ != noAccount(), "xrpl::STPathElement::STPathElement : issuer is set");
}
hash_value_ = getHash(*this);
hashValue_ = getHash(*this);
}
inline STPathElement::STPathElement(
@@ -284,9 +284,9 @@ inline STPathElement::STPathElement(
, accountID_(account)
, assetID_(asset)
, issuerID_(issuer)
, is_offer_(isXRP(accountID_))
, isOffer_(isXRP(accountID_))
{
if (!is_offer_)
if (!isOffer_)
type_ |= TypeAccount;
if (forceAsset || !isXRP(assetID_))
@@ -295,7 +295,7 @@ inline STPathElement::STPathElement(
if (!isXRP(issuer))
type_ |= TypeIssuer;
hash_value_ = getHash(*this);
hashValue_ = getHash(*this);
}
inline STPathElement::STPathElement(
@@ -307,12 +307,12 @@ inline STPathElement::STPathElement(
, accountID_(account)
, assetID_(asset)
, issuerID_(issuer)
, is_offer_(isXRP(accountID_))
, isOffer_(isXRP(accountID_))
{
assetID_.visit(
[&](Currency const&) { type_ = type_ & (~Type::TypeMpt); },
[&](MPTID const&) { type_ = type_ & (~Type::TypeCurrency); });
hash_value_ = getHash(*this);
hashValue_ = getHash(*this);
}
inline auto
@@ -324,7 +324,7 @@ STPathElement::getNodeType() const
inline bool
STPathElement::isOffer() const
{
return is_offer_;
return isOffer_;
}
inline bool
@@ -404,7 +404,7 @@ STPathElement::getIssuerID() const
inline bool
STPathElement::operator==(STPathElement const& t) const
{
return (type_ & TypeAccount) == (t.type_ & TypeAccount) && hash_value_ == t.hash_value_ &&
return (type_ & TypeAccount) == (t.type_ & TypeAccount) && hashValue_ == t.hashValue_ &&
accountID_ == t.accountID_ && assetID_ == t.assetID_ && issuerID_ == t.issuerID_;
}

View File

@@ -27,7 +27,7 @@ enum class TxnSql : char {
class STTx final : public STObject, public CountedObject<STTx>
{
uint256 tid_;
TxType tx_type_;
TxType txType_;
public:
static constexpr std::size_t kMinMultiSigners = 1;
@@ -187,7 +187,7 @@ inline STTx::STTx(SerialIter&& sit) // NOLINT(cppcoreguidelines-rvalue-referenc
inline TxType
STTx::getTxnType() const
{
return tx_type_;
return txType_;
}
inline Blob

View File

@@ -15,7 +15,7 @@
// Add new amendments to the top of this list.
// Keep it sorted in reverse chronological order.
XRPL_FIX (Cleanup3_2_0, Supported::No, VoteBehavior::DefaultNo)
XRPL_FIX (Cleanup3_2_0, Supported::Yes, VoteBehavior::DefaultNo)
XRPL_FEATURE(MPTokensV2, Supported::No, VoteBehavior::DefaultNo)
XRPL_FIX (Cleanup3_1_3, Supported::Yes, VoteBehavior::DefaultYes)
XRPL_FIX (BatchInnerSigs, Supported::No, VoteBehavior::DefaultNo)

View File

@@ -400,6 +400,7 @@ LEDGER_ENTRY(ltMPTOKEN_ISSUANCE, 0x007e, MPTokenIssuance, mpt_issuance, ({
{sfPreviousTxnLgrSeq, SoeRequired},
{sfDomainID, SoeOptional},
{sfMutableFlags, SoeDefault},
{sfReferenceHolding, SoeOptional},
}))
/** A ledger object which tracks MPToken
@@ -591,7 +592,7 @@ LEDGER_ENTRY(ltLOAN, 0x0089, Loan, loan, ({
// LoanBroker.ManagementFeeRate
// The unrounded true total fee still owed to the broker.
//
// Note the the "True" values may differ significantly from the tracked
// Note the "True" values may differ significantly from the tracked
// rounded values.
{sfPaymentRemaining, SoeDefault},
{sfPeriodicPayment, SoeRequired},

View File

@@ -205,6 +205,7 @@ TYPED_SFIELD(sfParentBatchID, UINT256, 36)
TYPED_SFIELD(sfLoanBrokerID, UINT256, 37,
SField::kSmdPseudoAccount | SField::kSmdDefault)
TYPED_SFIELD(sfLoanID, UINT256, 38)
TYPED_SFIELD(sfReferenceHolding, UINT256, 39)
// number (common)
TYPED_SFIELD(sfNumber, NUMBER, 1)

View File

@@ -688,6 +688,7 @@ TRANSACTION(ttLEDGER_STATE_FIX, 53, LedgerStateFix,
({
{sfLedgerFixType, SoeRequired},
{sfOwner, SoeOptional},
{sfBookDirectory, SoeOptional},
}))
/** This transaction type creates a MPTokensIssuance instance */

View File

@@ -15,8 +15,8 @@ Generation requires a one-time setup step to create a virtual environment
and install Python dependencies, followed by running the generation target:
```bash
cmake --build . --target setup_code_gen # create venv and install dependencies (once)
cmake --build . --target code_gen # generate code
cmake --build . --target setup_code_gen # create venv and install dependencies (once)
cmake --build . --target code_gen # generate code
```
By default, `CODEGEN_VENV_DIR` points to `.venv` in the project root. The

View File

@@ -278,6 +278,30 @@ public:
{
return this->sle_->isFieldPresent(sfMutableFlags);
}
/**
* @brief Get sfReferenceHolding (SoeOptional)
* @return The field value, or std::nullopt if not present.
*/
[[nodiscard]]
protocol_autogen::Optional<SF_UINT256::type::value_type>
getReferenceHolding() const
{
if (hasReferenceHolding())
return this->sle_->at(sfReferenceHolding);
return std::nullopt;
}
/**
* @brief Check if sfReferenceHolding is present.
* @return True if the field is present, false otherwise.
*/
[[nodiscard]]
bool
hasReferenceHolding() const
{
return this->sle_->isFieldPresent(sfReferenceHolding);
}
};
/**
@@ -469,6 +493,17 @@ public:
return *this;
}
/**
* @brief Set sfReferenceHolding (SoeOptional)
* @return Reference to this builder for method chaining.
*/
MPTokenIssuanceBuilder&
setReferenceHolding(std::decay_t<typename SF_UINT256::type::value_type> const& value)
{
object_[sfReferenceHolding] = value;
return *this;
}
/**
* @brief Build and return the completed MPTokenIssuance wrapper.
* @param index The ledger entry index.

View File

@@ -83,6 +83,32 @@ public:
{
return this->tx_->isFieldPresent(sfOwner);
}
/**
* @brief Get sfBookDirectory (SoeOptional)
* @return The field value, or std::nullopt if not present.
*/
[[nodiscard]]
protocol_autogen::Optional<SF_UINT256::type::value_type>
getBookDirectory() const
{
if (hasBookDirectory())
{
return this->tx_->at(sfBookDirectory);
}
return std::nullopt;
}
/**
* @brief Check if sfBookDirectory is present.
* @return True if the field is present, false otherwise.
*/
[[nodiscard]]
bool
hasBookDirectory() const
{
return this->tx_->isFieldPresent(sfBookDirectory);
}
};
/**
@@ -149,6 +175,17 @@ public:
return *this;
}
/**
* @brief Set sfBookDirectory (SoeOptional)
* @return Reference to this builder for method chaining.
*/
LedgerStateFixBuilder&
setBookDirectory(std::decay_t<typename SF_UINT256::type::value_type> const& value)
{
object_[sfBookDirectory] = value;
return *this;
}
/**
* @brief Build and return the LedgerStateFix wrapper.
* @param publicKey The public key for signing.

View File

@@ -21,7 +21,7 @@ struct Entry : public beast::List<Entry>::Node
@param now Construction time of Entry.
*/
explicit Entry(clock_type::time_point const now)
: refcount(0), local_balance(now), remote_balance(0)
: refcount(0), localBalance(now), remoteBalance(0)
{
}
@@ -46,7 +46,7 @@ struct Entry : public beast::List<Entry>::Node
int
balance(clock_type::time_point const now)
{
return local_balance.value(now) + remote_balance;
return localBalance.value(now) + remoteBalance;
}
// Add a charge and return normalized balance
@@ -54,7 +54,7 @@ struct Entry : public beast::List<Entry>::Node
int
add(int charge, clock_type::time_point const now)
{
return local_balance.add(charge, now) + remote_balance;
return localBalance.add(charge, now) + remoteBalance;
}
// The public key of the peer
@@ -67,10 +67,10 @@ struct Entry : public beast::List<Entry>::Node
int refcount;
// Exponentially decaying balance of resource consumption
DecayingSample<kDecayWindowSeconds, clock_type> local_balance;
DecayingSample<kDecayWindowSeconds, clock_type> localBalance;
// Normalized balance contribution from imports
int remote_balance;
int remoteBalance;
// Time of the last warning
clock_type::time_point lastWarningTime;

View File

@@ -25,11 +25,11 @@ struct Key
std::size_t
operator()(Key const& v) const
{
return addr_hash_(v.address);
return addrHash_(v.address);
}
private:
beast::Uhash<> addr_hash_;
beast::Uhash<> addrHash_;
};
struct KeyEqual

View File

@@ -194,34 +194,34 @@ public:
for (auto& inboundEntry : inbound_)
{
int const localBalance = inboundEntry.local_balance.value(now);
if ((localBalance + inboundEntry.remote_balance) >= threshold)
int const localBalance = inboundEntry.localBalance.value(now);
if ((localBalance + inboundEntry.remoteBalance) >= threshold)
{
json::Value& entry = (ret[inboundEntry.toString()] = json::ValueType::Object);
entry[jss::local] = localBalance;
entry[jss::remote] = inboundEntry.remote_balance;
entry[jss::remote] = inboundEntry.remoteBalance;
entry[jss::type] = "inbound";
}
}
for (auto& outboundEntry : outbound_)
{
int const localBalance = outboundEntry.local_balance.value(now);
if ((localBalance + outboundEntry.remote_balance) >= threshold)
int const localBalance = outboundEntry.localBalance.value(now);
if ((localBalance + outboundEntry.remoteBalance) >= threshold)
{
json::Value& entry = (ret[outboundEntry.toString()] = json::ValueType::Object);
entry[jss::local] = localBalance;
entry[jss::remote] = outboundEntry.remote_balance;
entry[jss::remote] = outboundEntry.remoteBalance;
entry[jss::type] = "outbound";
}
}
for (auto& adminEntry : admin_)
{
int const localBalance = adminEntry.local_balance.value(now);
if ((localBalance + adminEntry.remote_balance) >= threshold)
int const localBalance = adminEntry.localBalance.value(now);
if ((localBalance + adminEntry.remoteBalance) >= threshold)
{
json::Value& entry = (ret[adminEntry.toString()] = json::ValueType::Object);
entry[jss::local] = localBalance;
entry[jss::remote] = adminEntry.remote_balance;
entry[jss::remote] = adminEntry.remoteBalance;
entry[jss::type] = "admin";
}
}
@@ -242,7 +242,7 @@ public:
for (auto& inboundEntry : inbound_)
{
Gossip::Item item;
item.balance = inboundEntry.local_balance.value(now);
item.balance = inboundEntry.localBalance.value(now);
if (item.balance >= kMinimumGossipBalance)
{
item.address = inboundEntry.key->address;
@@ -278,7 +278,7 @@ public:
Import::Item item;
item.balance = gossipItem.balance;
item.consumer = newInboundEndpoint(gossipItem.address);
item.consumer.entry().remote_balance += item.balance;
item.consumer.entry().remoteBalance += item.balance;
next.items.push_back(item);
}
}
@@ -295,14 +295,14 @@ public:
Import::Item item;
item.balance = gossipItem.balance;
item.consumer = newInboundEndpoint(gossipItem.address);
item.consumer.entry().remote_balance += item.balance;
item.consumer.entry().remoteBalance += item.balance;
next.items.push_back(item);
}
Import& prev(resultIt->second);
for (auto& item : prev.items)
{
item.consumer.entry().remote_balance -= item.balance;
item.consumer.entry().remoteBalance -= item.balance;
}
std::swap(next, prev);
@@ -345,7 +345,7 @@ public:
for (auto itemIter(import.items.begin()); itemIter != import.items.end();
++itemIter)
{
itemIter->consumer.entry().remote_balance -= itemIter->balance;
itemIter->consumer.entry().remoteBalance -= itemIter->balance;
}
iter = importTable_.erase(iter);
@@ -520,8 +520,8 @@ public:
item["count"] = entry.refcount;
item["name"] = entry.toString();
item["balance"] = entry.balance(now);
if (entry.remote_balance != 0)
item["remote_balance"] = entry.remote_balance;
if (entry.remoteBalance != 0)
item["remote_balance"] = entry.remoteBalance;
}
}

View File

@@ -21,7 +21,7 @@ struct Handoff
bool moved = false;
// If response is set, this determines the keep alive
bool keep_alive = false;
bool keepAlive = false;
// When set, this will be sent back
std::shared_ptr<Writer> response;

View File

@@ -30,19 +30,19 @@ struct Port
boost::asio::ip::address ip;
std::uint16_t port = 0;
std::set<std::string, boost::beast::iless> protocol;
std::vector<boost::asio::ip::network_v4> admin_nets_v4;
std::vector<boost::asio::ip::network_v6> admin_nets_v6;
std::vector<boost::asio::ip::network_v4> secure_gateway_nets_v4;
std::vector<boost::asio::ip::network_v6> secure_gateway_nets_v6;
std::vector<boost::asio::ip::network_v4> adminNetsV4;
std::vector<boost::asio::ip::network_v6> adminNetsV6;
std::vector<boost::asio::ip::network_v4> secureGatewayNetsV4;
std::vector<boost::asio::ip::network_v6> secureGatewayNetsV6;
std::string user;
std::string password;
std::string admin_user;
std::string admin_password;
std::string ssl_key;
std::string ssl_cert;
std::string ssl_chain;
std::string ssl_ciphers;
boost::beast::websocket::permessage_deflate pmd_options;
std::string adminUser;
std::string adminPassword;
std::string sslKey;
std::string sslCert;
std::string sslChain;
std::string sslCiphers;
boost::beast::websocket::permessage_deflate pmdOptions;
std::shared_ptr<boost::asio::ssl::context> context;
// How many incoming connections are allowed on this
@@ -50,7 +50,7 @@ struct Port
int limit = 0;
// Websocket disconnects if send queue exceeds this limit
std::uint16_t ws_queue_limit{};
std::uint16_t wsQueueLimit{};
// Returns `true` if any websocket protocols are specified
[[nodiscard]] bool
@@ -78,22 +78,22 @@ struct ParsedPort
std::set<std::string, boost::beast::iless> protocol;
std::string user;
std::string password;
std::string admin_user;
std::string admin_password;
std::string ssl_key;
std::string ssl_cert;
std::string ssl_chain;
std::string ssl_ciphers;
boost::beast::websocket::permessage_deflate pmd_options;
std::string adminUser;
std::string adminPassword;
std::string sslKey;
std::string sslCert;
std::string sslChain;
std::string sslCiphers;
boost::beast::websocket::permessage_deflate pmdOptions;
int limit = 0;
std::uint16_t ws_queue_limit{};
std::uint16_t wsQueueLimit{};
std::optional<boost::asio::ip::address> ip;
std::optional<std::uint16_t> port;
std::vector<boost::asio::ip::network_v4> admin_nets_v4;
std::vector<boost::asio::ip::network_v6> admin_nets_v6;
std::vector<boost::asio::ip::network_v4> secure_gateway_nets_v4;
std::vector<boost::asio::ip::network_v6> secure_gateway_nets_v6;
std::vector<boost::asio::ip::network_v4> adminNetsV4;
std::vector<boost::asio::ip::network_v6> adminNetsV6;
std::vector<boost::asio::ip::network_v4> secureGatewayNetsV4;
std::vector<boost::asio::ip::network_v6> secureGatewayNetsV6;
};
void

View File

@@ -58,13 +58,13 @@ protected:
Handler& handler_;
boost::asio::executor_work_guard<boost::asio::executor> work_;
boost::asio::strand<boost::asio::executor> strand_;
endpoint_type remote_address_;
endpoint_type remoteAddress_;
beast::Journal const journal_;
std::string id_;
std::size_t nid_;
boost::asio::streambuf read_buf_;
boost::asio::streambuf readBuf_;
http_request_type message_;
std::vector<Buffer> wq_;
std::vector<Buffer> wq2_;
@@ -73,9 +73,9 @@ protected:
bool complete_ = false;
boost::system::error_code ec_;
int request_count_ = 0;
std::size_t bytes_in_ = 0;
std::size_t bytes_out_ = 0;
int requestCount_ = 0;
std::size_t bytesIn_ = 0;
std::size_t bytesOut_ = 0;
//--------------------------------------------------------------------------
@@ -151,7 +151,7 @@ protected:
beast::IP::Endpoint
remoteAddress() override
{
return beast::IPAddressConversion::fromAsio(remote_address_);
return beast::IPAddressConversion::fromAsio(remoteAddress_);
}
http_request_type&
@@ -191,23 +191,23 @@ BaseHTTPPeer<Handler, Impl>::BaseHTTPPeer(
, handler_(handler)
, work_(boost::asio::make_work_guard(executor))
, strand_(boost::asio::make_strand(executor))
, remote_address_(std::move(remoteAddress))
, remoteAddress_(std::move(remoteAddress))
, journal_(journal)
{
read_buf_.commit(
boost::asio::buffer_copy(read_buf_.prepare(boost::asio::buffer_size(buffers)), buffers));
readBuf_.commit(
boost::asio::buffer_copy(readBuf_.prepare(boost::asio::buffer_size(buffers)), buffers));
static std::atomic<int> kSid;
nid_ = ++kSid;
id_ = std::string("#") + std::to_string(nid_) + " ";
JLOG(journal_.trace()) << id_ << "accept: " << remote_address_.address();
JLOG(journal_.trace()) << id_ << "accept: " << remoteAddress_.address();
}
template <class Handler, class Impl>
BaseHTTPPeer<Handler, Impl>::~BaseHTTPPeer()
{
handler_.onClose(session(), ec_);
JLOG(journal_.trace()) << id_ << "destroyed: " << request_count_
<< ((request_count_ == 1) ? " request" : " requests");
JLOG(journal_.trace()) << id_ << "destroyed: " << requestCount_
<< ((requestCount_ == 1) ? " request" : " requests");
}
template <class Handler, class Impl>
@@ -245,7 +245,7 @@ BaseHTTPPeer<Handler, Impl>::startTimer()
boost::beast::get_lowest_layer(impl().stream_)
.expires_after(
std::chrono::seconds(
remote_address_.address().is_loopback() ? kTimeoutSecondsLocal : kTimeoutSeconds));
remoteAddress_.address().is_loopback() ? kTimeoutSecondsLocal : kTimeoutSeconds));
}
// Convenience for discarding the error code
@@ -274,7 +274,7 @@ BaseHTTPPeer<Handler, Impl>::doRead(yield_context doYield)
complete_ = false;
error_code ec;
startTimer();
boost::beast::http::async_read(impl().stream_, read_buf_, message_, doYield[ec]);
boost::beast::http::async_read(impl().stream_, readBuf_, message_, doYield[ec]);
cancelTimer();
if (ec == boost::beast::http::error::end_of_stream)
return doClose();
@@ -296,7 +296,7 @@ BaseHTTPPeer<Handler, Impl>::onWrite(error_code const& ec, std::size_t bytesTran
return onTimer();
if (ec)
return fail(ec, "write");
bytes_out_ += bytesTransferred;
bytesOut_ += bytesTransferred;
{
std::scoped_lock const lock(mutex_);
wq2_.clear();

View File

@@ -27,7 +27,7 @@ protected:
Port const& port_;
Handler& handler_;
endpoint_type remote_address_;
endpoint_type remoteAddress_;
beast::WrappedSink sink_;
beast::Journal const j_;
@@ -65,7 +65,7 @@ BasePeer<Handler, Impl>::BasePeer(
beast::Journal journal)
: port_(port)
, handler_(handler)
, remote_address_(std::move(remoteAddress))
, remoteAddress_(std::move(remoteAddress))
, sink_(
journal.sink(),
[] {

View File

@@ -42,15 +42,15 @@ private:
/// The socket has been closed, or will close after the next write
/// finishes. Do not do any more writes, and don't try to close
/// again.
bool do_close_ = false;
bool doClose_ = false;
boost::beast::websocket::close_reason cr_;
waitable_timer timer_;
bool close_on_timer_ = false;
bool ping_active_ = false;
bool closeOnTimer_ = false;
bool pingActive_ = false;
boost::beast::websocket::ping_data payload_;
error_code ec_;
std::function<void(boost::beast::websocket::frame_type, boost::beast::string_view)>
control_callback_;
controlCallback_;
public:
template <class Body, class Headers>
@@ -85,7 +85,7 @@ public:
[[nodiscard]] boost::asio::ip::tcp::endpoint const&
remoteEndpoint() const override
{
return this->remote_address_;
return this->remoteAddress_;
}
void
@@ -173,14 +173,14 @@ BaseWSPeer<Handler, Impl>::run()
{
if (!strand_.running_in_this_thread())
return post(strand_, std::bind(&BaseWSPeer::run, impl().shared_from_this()));
impl().ws_.set_option(port().pmd_options);
impl().ws_.set_option(port().pmdOptions);
// Must manage the control callback memory outside of the `control_callback`
// function
control_callback_ =
controlCallback_ =
std::bind(&BaseWSPeer::onPingPong, this, std::placeholders::_1, std::placeholders::_2);
impl().ws_.control_callback(control_callback_);
impl().ws_.control_callback(controlCallback_);
startTimer();
close_on_timer_ = true;
closeOnTimer_ = true;
impl().ws_.set_option(boost::beast::websocket::stream_base::decorator([](auto& res) {
res.set(boost::beast::http::field::server, BuildInfo::getFullVersionString());
}));
@@ -198,9 +198,9 @@ BaseWSPeer<Handler, Impl>::send(std::shared_ptr<WSMsg> w)
{
if (!strand_.running_in_this_thread())
return post(strand_, std::bind(&BaseWSPeer::send, impl().shared_from_this(), std::move(w)));
if (do_close_)
if (doClose_)
return;
if (wq_.size() > port().ws_queue_limit)
if (wq_.size() > port().wsQueueLimit)
{
cr_.code = safeCast<decltype(cr_.code)>(boost::beast::websocket::close_code::policy_error);
cr_.reason = "Policy error: client is too slow.";
@@ -227,9 +227,9 @@ BaseWSPeer<Handler, Impl>::close(boost::beast::websocket::close_reason const& re
{
if (!strand_.running_in_this_thread())
return post(strand_, [self = impl().shared_from_this(), reason] { self->close(reason); });
if (do_close_)
if (doClose_)
return;
do_close_ = true;
doClose_ = true;
if (wq_.empty())
{
impl().ws_.async_close(
@@ -260,7 +260,7 @@ BaseWSPeer<Handler, Impl>::onWsHandshake(error_code const& ec)
{
if (ec)
return fail(ec, "on_ws_handshake");
close_on_timer_ = false;
closeOnTimer_ = false;
doRead();
}
@@ -313,7 +313,7 @@ BaseWSPeer<Handler, Impl>::onWriteFin(error_code const& ec)
if (ec)
return fail(ec, "write_fin");
wq_.pop_front();
if (do_close_)
if (doClose_)
{
impl().ws_.async_close(
cr_,
@@ -409,7 +409,7 @@ BaseWSPeer<Handler, Impl>::onPing(error_code const& ec)
{
if (ec == boost::asio::error::operation_aborted)
return;
ping_active_ = false;
pingActive_ = false;
if (!ec)
return;
fail(ec, "on_ping");
@@ -426,7 +426,7 @@ BaseWSPeer<Handler, Impl>::onPingPong(
boost::beast::string_view const p(payload_.begin());
if (payload == p)
{
close_on_timer_ = false;
closeOnTimer_ = false;
JLOG(this->j_.trace()) << "got matching pong";
}
else
@@ -444,11 +444,11 @@ BaseWSPeer<Handler, Impl>::onTimer(error_code ec)
return;
if (!ec)
{
if (!close_on_timer_ || !ping_active_)
if (!closeOnTimer_ || !pingActive_)
{
startTimer();
close_on_timer_ = true;
ping_active_ = true;
closeOnTimer_ = true;
pingActive_ = true;
// cryptographic is probably overkill..
beast::rngfill(payload_.begin(), payload_.size(), cryptoPrng());
impl().ws_.async_ping(

View File

@@ -23,7 +23,6 @@
#include <sys/resource.h>
#include <dirent.h>
#include <unistd.h>
#endif
#include <algorithm>
@@ -61,7 +60,7 @@ private:
boost::asio::io_context& ioc_;
stream_type stream_;
socket_type& socket_;
endpoint_type remote_address_;
endpoint_type remoteAddress_;
boost::asio::strand<boost::asio::io_context::executor_type> strand_;
beast::Journal const j_;
@@ -90,16 +89,19 @@ private:
acceptor_type acceptor_;
boost::asio::strand<boost::asio::io_context::executor_type> strand_;
bool ssl_{
port_.protocol.count("https") > 0 || port_.protocol.count("wss") > 0 ||
port_.protocol.count("wss2") > 0 || port_.protocol.count("peer") > 0};
port_.protocol.contains("https") || port_.protocol.contains("wss") ||
port_.protocol.contains("wss2") || port_.protocol.contains("peer")};
bool plain_{
port_.protocol.count("http") > 0 || port_.protocol.count("ws") > 0 ||
(port_.protocol.count("ws2") != 0u)};
port_.protocol.contains("http") || port_.protocol.contains("ws") ||
(port_.protocol.contains("ws2"))};
static constexpr std::chrono::milliseconds kInitialAcceptDelay{50};
static constexpr std::chrono::milliseconds kMaxAcceptDelay{2000};
std::chrono::milliseconds accept_delay_{kInitialAcceptDelay};
boost::asio::steady_timer backoff_timer_;
static constexpr double kFreeFdThreshold = 0.70;
std::chrono::milliseconds acceptDelay_{kInitialAcceptDelay};
boost::asio::steady_timer backoffTimer_;
static constexpr std::uint64_t kMaxUsedFdPercent = 70;
static constexpr std::chrono::milliseconds kFdSampleInterval{250};
clock_type::time_point fdSampleAt_;
bool cachedThrottle_{false};
struct FDStats
{
@@ -164,7 +166,7 @@ Door<Handler>::Detector::Detector(
, ioc_(ioc)
, stream_(std::move(stream))
, socket_(stream_.socket())
, remote_address_(std::move(remoteAddress))
, remoteAddress_(std::move(remoteAddress))
, strand_(boost::asio::make_strand(ioc_))
, j_(j)
{
@@ -199,18 +201,18 @@ Door<Handler>::Detector::doDetect(boost::asio::yield_context doYield)
if (ssl)
{
if (auto sp = ios().template emplace<SSLHTTPPeer<Handler>>(
port_, handler_, ioc_, j_, remote_address_, buf.data(), std::move(stream_)))
port_, handler_, ioc_, j_, remoteAddress_, buf.data(), std::move(stream_)))
sp->run();
return;
}
if (auto sp = ios().template emplace<PlainHTTPPeer<Handler>>(
port_, handler_, ioc_, j_, remote_address_, buf.data(), std::move(stream_)))
port_, handler_, ioc_, j_, remoteAddress_, buf.data(), std::move(stream_)))
sp->run();
return;
}
if (ec != boost::asio::error::operation_aborted)
{
JLOG(j_.trace()) << "Error detecting ssl: " << ec.message() << " from " << remote_address_;
JLOG(j_.trace()) << "Error detecting ssl: " << ec.message() << " from " << remoteAddress_;
}
}
@@ -279,7 +281,8 @@ Door<Handler>::Door(
, ioc_(ioContext)
, acceptor_(ioContext)
, strand_(boost::asio::make_strand(ioContext))
, backoff_timer_(ioContext)
, backoffTimer_(ioContext)
, fdSampleAt_(clock_type::now() - kFdSampleInterval)
{
reOpen();
}
@@ -302,7 +305,7 @@ Door<Handler>::close()
return boost::asio::post(
strand_, std::bind(&Door<Handler>::close, this->shared_from_this()));
}
backoff_timer_.cancel();
backoffTimer_.cancel();
error_code ec;
acceptor_.close(ec);
}
@@ -338,11 +341,11 @@ Door<Handler>::doAccept(boost::asio::yield_context doYield)
{
if (shouldThrottleForFds())
{
backoff_timer_.expires_after(accept_delay_);
JLOG(j_.warn()) << "Throttling do_accept for " << acceptDelay_.count() << "ms.";
backoffTimer_.expires_after(acceptDelay_);
boost::system::error_code tec;
backoff_timer_.async_wait(doYield[tec]);
accept_delay_ = std::min(accept_delay_ * 2, kMaxAcceptDelay);
JLOG(j_.warn()) << "Throttling do_accept for " << accept_delay_.count() << "ms.";
backoffTimer_.async_wait(doYield[tec]);
acceptDelay_ = std::min(acceptDelay_ * 2, kMaxAcceptDelay);
continue;
}
@@ -359,14 +362,17 @@ Door<Handler>::doAccept(boost::asio::yield_context doYield)
if (ec == boost::asio::error::no_descriptors ||
ec == boost::asio::error::no_buffer_space)
{
JLOG(j_.warn()) << "accept: Too many open files. Pausing for "
<< accept_delay_.count() << "ms.";
char const* const cause = (ec == boost::asio::error::no_descriptors)
? "too many open files"
: "kernel buffer space exhausted";
JLOG(j_.warn()) << "accept: " << cause << ". Pausing for " << acceptDelay_.count()
<< "ms.";
backoff_timer_.expires_after(accept_delay_);
backoffTimer_.expires_after(acceptDelay_);
boost::system::error_code tec;
backoff_timer_.async_wait(doYield[tec]);
backoffTimer_.async_wait(doYield[tec]);
accept_delay_ = std::min(accept_delay_ * 2, kMaxAcceptDelay);
acceptDelay_ = std::min(acceptDelay_ * 2, kMaxAcceptDelay);
}
else
{
@@ -375,7 +381,7 @@ Door<Handler>::doAccept(boost::asio::yield_context doYield)
continue;
}
accept_delay_ = kInitialAcceptDelay;
acceptDelay_ = kInitialAcceptDelay;
if (ssl_ && plain_)
{
@@ -428,14 +434,15 @@ Door<Handler>::shouldThrottleForFds()
#if BOOST_OS_WINDOWS
return false;
#else
auto const stats = queryFdStats();
if (!stats || stats->limit == 0)
return false;
auto const now = clock_type::now();
if (now - fdSampleAt_ < kFdSampleInterval)
return cachedThrottle_;
auto const& s = *stats;
auto const free = (s.limit > s.used) ? (s.limit - s.used) : 0ull;
double const freeRatio = static_cast<double>(free) / static_cast<double>(s.limit);
return freeRatio < kFreeFdThreshold;
fdSampleAt_ = now;
auto const stats = queryFdStats();
cachedThrottle_ =
stats && stats->limit > 0 && stats->used * 100 > stats->limit * kMaxUsedFdPercent;
return cachedThrottle_;
#endif
}

Some files were not shown because too many files have changed in this diff Show More