From d32f34d3bf863a5cb35776beacbd8a1328cfe469 Mon Sep 17 00:00:00 2001
From: Nicholas Dudfield <ndudfield@gmail.com>
Date: Tue, 3 Mar 2026 10:17:46 +0700
Subject: [PATCH] build(levelization): add fast python generator with CI parity
 check

Add Builds/levelization/levelization.py for fast local iteration and semantic comparison against canonical shell output via --compare-to.

Keep Builds/levelization/levelization.sh as the canonical path, and update the levelization workflow to fail if the Python output diverges from the shell-generated results.

Also harden interactive-shell detection in levelization.sh for portability and document local usage in README.
---
 .github/workflows/levelization.yml  |   5 +
 Builds/levelization/README.md       |   7 +
 Builds/levelization/levelization.py | 352 ++++++++++++++++++++++++++++
 Builds/levelization/levelization.sh |   2 +-
 4 files changed, 365 insertions(+), 1 deletion(-)
 create mode 100755 Builds/levelization/levelization.py

diff --git a/.github/workflows/levelization.yml b/.github/workflows/levelization.yml
index f99c1ca56..febf6071b 100644
--- a/.github/workflows/levelization.yml
+++ b/.github/workflows/levelization.yml
@@ -11,6 +11,11 @@ jobs:
     - uses: actions/checkout@v3
     - name: Check levelization
       run: Builds/levelization/levelization.sh
+    - name: Verify Python Generator Matches Canonical Script
+      run: |
+        python3 Builds/levelization/levelization.py \
+          --results-dir /tmp/levelization-py-results \
+          --compare-to Builds/levelization/results
     - name: Check for differences
       id: assert
       run: |
diff --git a/Builds/levelization/README.md b/Builds/levelization/README.md
index 4ff3a5423..0501f430d 100644
--- a/Builds/levelization/README.md
+++ b/Builds/levelization/README.md
@@ -59,6 +59,10 @@ the rippled source. The only caveat is that it runs much slower
 under Windows than in Linux. It hasn't yet been tested under MacOS.
 It generates many files of [results](results):
 
+For local iteration speed there is also [levelization.py](levelization.py),
+which generates the same artifact set much faster. The shell script remains
+canonical for CI/auditing. Both produce the following files:
+
 * `rawincludes.txt`: The raw dump of the `#includes`
 * `paths.txt`: A second dump grouping the source module
   to the destination module, deduped, and with frequency counts.
@@ -109,6 +113,9 @@ prevent false alarms and merging issues, and because it's easy to
 get those details locally.
 
 1. Run `levelization.sh`
+   * Faster local loop: `python3 Builds/levelization/levelization.py`
+   * Optional parity check against canonical shell output:
+     `python3 Builds/levelization/levelization.py --results-dir /tmp/levelization-py-results --compare-to Builds/levelization/results`
 2. Grep the modules in `paths.txt`.
    * For example, if a cycle is found `A ~= B`, simply `grep -w
      A Builds/levelization/results/paths.txt | grep -w B`
diff --git a/Builds/levelization/levelization.py b/Builds/levelization/levelization.py
new file mode 100755
index 000000000..eb48ebe8d
--- /dev/null
+++ b/Builds/levelization/levelization.py
@@ -0,0 +1,352 @@
+#!/usr/bin/env python3
+"""
+Development-oriented levelization generator.
+
+This script produces the same result artifact set as levelization.sh, but is
+much faster by doing parsing/counting in-process instead of spawning many
+external tools in tight loops.
+
+The shell script remains the canonical CI path. Use this script for local
+iteration speed, then run levelization.sh before committing if you need strict
+parity with the existing workflow.
+"""
+
+from __future__ import annotations
+
+import argparse
+import concurrent.futures
+import os
+import posixpath
+import re
+import shutil
+import time
+from collections import Counter, defaultdict
+from pathlib import Path
+
+INCLUDE_PATTERN = re.compile(r"^\s*#include.*/.*\.h")  # "#include" line with a "/" and a later ".h"
+INCLUDE_TARGET_PATTERN = re.compile(r'.*["<]([^">]+)[">].*')  # group 1: text between the " or < > delimiters
+PATHS_LINE_PATTERN = re.compile(r"^\s*(\d+)\s+(\S+)\s+(\S+)\s*$")  # groups: count, source, destination
+
+
+def dictionary_sort_key(value: str) -> str:
+    """Approximate `sort -d` behavior used by the shell script."""
+    return "".join(ch for ch in value if ch.isalnum() or ch.isspace())  # keep only alphanumerics and whitespace
+
+
+def normalize_level(value: str) -> str:  # "a/b" -> "a.b"; "a/x.h" -> "a.toplevel"
+    # Match shell behavior: if level includes a file component (contains "."),
+    # replace with dirname + "/toplevel".
+    if "." in value:
+        parent = posixpath.dirname(value) or "."  # no directory part: dirname is "", so use "."
+        value = f"{parent}/toplevel"
+    return value.replace("/", ".")  # level names use "." as the separator
+
+
+def source_level(rel_path: str) -> str:  # level of a file, from its "/"-separated relative path
+    parts = rel_path.split("/")
+    return normalize_level("/".join(parts[1:3]))  # skip parts[0]; keep the next two components
+
+
+def include_level(include_line: str) -> str | None:  # None when no include target is found
+    match = INCLUDE_TARGET_PATTERN.match(include_line)
+    if not match:
+        return None
+    include_path = match.group(1)
+    parts = include_path.split("/")
+    return normalize_level("/".join(parts[:2]))  # level comes from the first two path components
+
+
+def scan_file(path: Path, repo_root: Path) -> tuple[list[str], list[tuple[str, str]]]:
+    rel = path.relative_to(repo_root).as_posix()
+    src_level = source_level(rel)
+
+    raw_lines: list[str] = []  # "<rel>:<line>" entries, one per matching include line
+    paths: list[tuple[str, str]] = []  # (source level, included level) pairs, duplicates kept
+
+    with path.open("r", encoding="utf-8", errors="ignore") as handle:
+        for line in handle:
+            if "boost" in line:  # skips every line containing "boost", wherever it appears
+                continue
+            if not INCLUDE_PATTERN.match(line):
+                continue
+
+            line = line.rstrip("\n")
+            raw_lines.append(f"{rel}:{line}")
+
+            dst_level = include_level(line)
+            if dst_level is None:
+                continue
+            if src_level != dst_level:  # an include within the same level is not recorded as an edge
+                paths.append((src_level, dst_level))
+
+    return raw_lines, paths
+
+
+def iter_source_files(repo_root: Path) -> list[Path]:  # every regular file under include/ and src/
+    files: list[Path] = []
+    for top in ("include", "src"):
+        root = repo_root / top
+        files.extend(path for path in root.rglob("*") if path.is_file())
+    files.sort(key=lambda p: p.relative_to(repo_root).as_posix())  # repo-relative path order
+    return files
+
+
+def write_relation_db(  # writes paths.txt plus the includes/ and includedby/ directories
+    results_dir: Path,
+    edge_counts: list[tuple[tuple[str, str], int]],
+) -> tuple[dict[str, list[tuple[str, int]]], dict[str, list[tuple[str, int]]]]:
+    includes_dir = results_dir / "includes"
+    includedby_dir = results_dir / "includedby"
+    includes_dir.mkdir(parents=True, exist_ok=True)
+    includedby_dir.mkdir(parents=True, exist_ok=True)
+
+    includes: dict[str, list[tuple[str, int]]] = defaultdict(list)
+    includedby: dict[str, list[tuple[str, int]]] = defaultdict(list)
+
+    with (results_dir / "paths.txt").open("w", encoding="utf-8") as out:
+        for (src, dst), count in edge_counts:
+            out.write(f"{count:7d} {src} {dst}\n")  # count is right-aligned in a 7-character field
+            includes[src].append((dst, count))
+            includedby[dst].append((src, count))
+
+    for src, entries in includes.items():  # one file per source level, lines of "<dst> <count>"
+        with (includes_dir / src).open("w", encoding="utf-8") as out:
+            for dst, count in entries:
+                out.write(f"{dst} {count}\n")
+
+    for dst, entries in includedby.items():  # one file per included level, lines of "<src> <count>"
+        with (includedby_dir / dst).open("w", encoding="utf-8") as out:
+            for src, count in entries:
+                out.write(f"{src} {count}\n")
+
+    return includes, includedby  # entries keep the order of edge_counts
+
+
+def build_loops_and_ordering(  # returns (ordering lines, loops lines), each already newline-terminated
+    includes: dict[str, list[tuple[str, int]]],
+) -> tuple[list[str], list[str]]:
+    include_map = {
+        src: {dst: count for dst, count in entries}
+        for src, entries in includes.items()
+    }
+
+    ordering_lines: list[str] = []
+    loops_lines: list[str] = []
+
+    seen_pairs: set[tuple[str, str]] = set()  # loops already reported, as (source, include)
+
+    for source in sorted(includes.keys()):
+        for include, includefreq in includes[source]:
+            if include not in include_map:  # the included level includes nothing itself: no line emitted
+                continue
+
+            sourcefreq = include_map[include].get(source)
+            if sourcefreq is None:  # no edge back to source: a one-way dependency
+                ordering_lines.append(f"{source} > {include}\n")
+                continue
+
+            if (include, source) in seen_pairs:  # this loop was reported from the other side
+                continue
+            seen_pairs.add((source, include))
+
+            loops_lines.append(f"Loop: {source} {include}\n")
+            if includefreq - sourcefreq > 3:  # a direction is named only when counts differ by more than 3
+                loops_lines.append(f"  {source} > {include}\n\n")
+            elif sourcefreq - includefreq > 3:
+                loops_lines.append(f"  {include} > {source}\n\n")
+            elif sourcefreq == includefreq:
+                loops_lines.append(f"  {include} == {source}\n\n")
+            else:  # counts differ by 1 to 3
+                loops_lines.append(f"  {include} ~= {source}\n\n")
+
+    return ordering_lines, loops_lines
+
+
+def parse_paths(path: Path) -> dict[tuple[str, str], int]:  # {(source, destination): count}
+    out: dict[tuple[str, str], int] = {}
+    for line in path.read_text(encoding="utf-8").splitlines():
+        if not line.strip():  # blank lines are tolerated
+            continue
+        match = PATHS_LINE_PATTERN.match(line)
+        if not match:
+            raise ValueError(f"Cannot parse paths line: {line!r}")
+        count = int(match.group(1))
+        src = match.group(2)
+        dst = match.group(3)
+        out[(src, dst)] = count  # a repeated edge keeps the last count seen
+    return out
+
+
+def parse_relation_dir(path: Path) -> dict[str, Counter[str]]:  # {file name: Counter of non-empty lines}
+    out: dict[str, Counter[str]] = {}
+    if not path.exists():  # a missing directory yields an empty mapping
+        return out
+    for file in sorted(p for p in path.iterdir() if p.is_file()):
+        lines = [line for line in file.read_text(encoding="utf-8").splitlines() if line]
+        out[file.name] = Counter(lines)
+    return out
+
+
+def compare_results(generated: Path, canonical: Path) -> list[str]:  # [] means everything matched
+    mismatches: list[str] = []
+
+    # rawincludes: compare as multisets of lines (Counter) to ignore traversal order.
+    gen_raw = Counter(generated.joinpath("rawincludes.txt").read_text(encoding="utf-8").splitlines())
+    can_raw = Counter(canonical.joinpath("rawincludes.txt").read_text(encoding="utf-8").splitlines())
+    if gen_raw != can_raw:
+        mismatches.append("rawincludes.txt differs (line multiset mismatch)")
+
+    # paths: compare parsed edge->count map, ignoring ordering/whitespace.
+    gen_paths = parse_paths(generated / "paths.txt")
+    can_paths = parse_paths(canonical / "paths.txt")
+    if gen_paths != can_paths:
+        mismatches.append("paths.txt differs (edge count mismatch)")
+
+    # includes / includedby: compare per-file line multisets.
+    for rel in ("includes", "includedby"):
+        gen_rel = parse_relation_dir(generated / rel)
+        can_rel = parse_relation_dir(canonical / rel)
+        if gen_rel != can_rel:
+            mismatches.append(f"{rel}/ differs (file set or content mismatch)")
+
+    # ordering and loops are canonical artifacts; require exact bytes.
+    for name in ("ordering.txt", "loops.txt"):
+        gen_text = generated.joinpath(name).read_text(encoding="utf-8")
+        can_text = canonical.joinpath(name).read_text(encoding="utf-8")
+        if gen_text != can_text:
+            mismatches.append(f"{name} differs")
+
+    return mismatches
+
+
+def generate(results_dir: Path, repo_root: Path, workers: int) -> None:  # rebuild all artifacts, print a summary
+    if results_dir.exists():
+        shutil.rmtree(results_dir)  # removes the whole directory, whatever it contains
+    results_dir.mkdir(parents=True)
+
+    files = iter_source_files(repo_root)
+
+    raw_by_file: dict[str, list[str]] = {}
+    paths_by_file: dict[str, list[tuple[str, str]]] = {}
+
+    start = time.perf_counter()  # started after file discovery, so that cost is not in the elapsed time
+    if workers <= 1:
+        for file in files:
+            rel = file.relative_to(repo_root).as_posix()
+            raw, paths = scan_file(file, repo_root)
+            raw_by_file[rel] = raw
+            paths_by_file[rel] = paths
+    else:
+        with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as pool:
+            futures = {
+                file.relative_to(repo_root).as_posix(): pool.submit(
+                    scan_file, file, repo_root
+                )
+                for file in files
+            }
+            for rel in sorted(futures.keys()):  # gather in path order rather than completion order
+                raw, paths = futures[rel].result()
+                raw_by_file[rel] = raw
+                paths_by_file[rel] = paths
+
+    raw_lines: list[str] = []
+    raw_lines.extend(
+        line
+        for rel in sorted(raw_by_file.keys())
+        for line in raw_by_file[rel]
+    )
+    with (results_dir / "rawincludes.txt").open("w", encoding="utf-8") as out:
+        out.write("\n".join(raw_lines))
+        if raw_lines:  # end with a newline only when there is content
+            out.write("\n")
+
+    path_pairs: list[tuple[str, str]] = []
+    path_pairs.extend(
+        pair
+        for rel in sorted(paths_by_file.keys())
+        for pair in paths_by_file[rel]
+    )
+    counts = Counter(path_pairs)  # occurrences of each (source level, included level) edge
+
+    edge_counts = sorted(  # dictionary-style key first, raw level names as tie-breakers
+        counts.items(),
+        key=lambda item: (
+            dictionary_sort_key(f"{item[0][0]} {item[0][1]}"),
+            item[0][0],
+            item[0][1],
+        ),
+    )
+
+    includes, _ = write_relation_db(results_dir, edge_counts)  # also writes paths.txt
+    ordering, loops = build_loops_and_ordering(includes)
+
+    with (results_dir / "ordering.txt").open("w", encoding="utf-8") as out:
+        out.writelines(ordering)
+    with (results_dir / "loops.txt").open("w", encoding="utf-8") as out:
+        out.writelines(loops)
+
+    elapsed = time.perf_counter() - start
+    print(
+        f"levelization.py: scanned {len(files)} files, "
+        f"{len(raw_lines)} includes, {len(edge_counts)} unique paths in "
+        f"{elapsed:.2f}s"
+    )
+    print((results_dir / "ordering.txt").read_text(encoding="utf-8"), end="")  # echo ordering.txt to stdout
+    print((results_dir / "loops.txt").read_text(encoding="utf-8"), end="")  # then loops.txt
+
+
+def main() -> int:
+    """Parse arguments, generate results, optionally compare; return the exit status."""
+    script_dir = Path(__file__).resolve().parent
+    repo_root = script_dir.parents[1]
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--repo-root",
+        type=Path,
+        default=repo_root,
+        help="Repository root (defaults based on script location).",
+    )
+    parser.add_argument(
+        "--results-dir",
+        type=Path,
+        default=script_dir / "results",
+        help="Output results directory.",
+    )
+    parser.add_argument(
+        "--workers",
+        type=int,
+        default=min(32, (os.cpu_count() or 1)),
+        help="Thread count for source scanning (default: CPU count, max 32).",
+    )
+    parser.add_argument(
+        "--compare-to",
+        type=Path,
+        default=None,
+        help=(
+            "Compare generated results against this directory and exit non-zero "
+            "on mismatch (semantic comparison for rawincludes/paths/includes)."
+        ),
+    )
+    args = parser.parse_args()
+
+    generated_dir = args.results_dir.resolve()
+    canonical_dir = None if args.compare_to is None else args.compare_to.resolve()
+    if canonical_dir == generated_dir:
+        # generate() deletes results_dir first; comparing it with itself always "matches".
+        parser.error("--compare-to must not be the same directory as --results-dir")
+    generate(generated_dir, args.repo_root.resolve(), max(1, args.workers))
+
+    if canonical_dir is not None:
+        mismatches = compare_results(generated_dir, canonical_dir)
+        if mismatches:
+            print("levelization.py: mismatch against canonical results:")
+            for mismatch in mismatches:
+                print(f"  - {mismatch}")
+            return 1
+        print("levelization.py: matches canonical results")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/Builds/levelization/levelization.sh b/Builds/levelization/levelization.sh
index c18ca703f..9148390b4 100755
--- a/Builds/levelization/levelization.sh
+++ b/Builds/levelization/levelization.sh
@@ -7,7 +7,7 @@
 
 pushd $( dirname $0 )
 
-if [ -v PS1 ]
+if [[ -n "${PS1-}" ]]
 then
   # if the shell is interactive, clean up any flotsam before analyzing
   git clean -ix