mirror of
https://github.com/Xahau/xahaud.git
synced 2026-06-02 08:16:42 +00:00
build(levelization): add fast python generator with CI parity check
Add Builds/levelization/levelization.py for fast local iteration and semantic comparison against canonical shell output via --compare-to. Keep Builds/levelization/levelization.sh as canonical path, and update levelization workflow to fail if python output diverges from shell-generated results. Also harden interactive-shell detection in levelization.sh for portability and document local usage in README.
This commit is contained in:
5
.github/workflows/levelization.yml
vendored
5
.github/workflows/levelization.yml
vendored
@@ -11,6 +11,11 @@ jobs:
|
||||
- uses: actions/checkout@v3
|
||||
- name: Check levelization
|
||||
run: Builds/levelization/levelization.sh
|
||||
- name: Verify Python Generator Matches Canonical Script
|
||||
run: |
|
||||
python3 Builds/levelization/levelization.py \
|
||||
--results-dir /tmp/levelization-py-results \
|
||||
--compare-to Builds/levelization/results
|
||||
- name: Check for differences
|
||||
id: assert
|
||||
run: |
|
||||
|
||||
@@ -59,6 +59,10 @@ the rippled source. The only caveat is that it runs much slower
|
||||
under Windows than in Linux. It hasn't yet been tested under MacOS.
|
||||
It generates many files of [results](results):
|
||||
|
||||
For local iteration speed there is also
|
||||
[levelization.py](levelization.py), which generates the same artifact set much
|
||||
faster. The shell script remains canonical for CI/auditing.
|
||||
|
||||
* `rawincludes.txt`: The raw dump of the `#includes`
|
||||
* `paths.txt`: A second dump grouping the source module
|
||||
to the destination module, deduped, and with frequency counts.
|
||||
@@ -109,6 +113,9 @@ prevent false alarms and merging issues, and because it's easy to
|
||||
get those details locally.
|
||||
|
||||
1. Run `levelization.sh`
|
||||
* Faster local loop: `python3 Builds/levelization/levelization.py`
|
||||
* Optional parity check against canonical shell output:
|
||||
`python3 Builds/levelization/levelization.py --results-dir /tmp/levelization-py-results --compare-to Builds/levelization/results`
|
||||
2. Grep the modules in `paths.txt`.
|
||||
* For example, if a cycle is found `A ~= B`, simply `grep -w
|
||||
A Builds/levelization/results/paths.txt | grep -w B`
|
||||
|
||||
352
Builds/levelization/levelization.py
Executable file
352
Builds/levelization/levelization.py
Executable file
@@ -0,0 +1,352 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Development-oriented levelization generator.
|
||||
|
||||
This script produces the same result artifact set as levelization.sh, but is
|
||||
much faster by doing parsing/counting in-process instead of spawning many
|
||||
external tools in tight loops.
|
||||
|
||||
The shell script remains the canonical CI path. Use this script for local
|
||||
iteration speed, then run levelization.sh before committing if you need strict
|
||||
parity with existing workflow.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import concurrent.futures
|
||||
import os
|
||||
import posixpath
|
||||
import re
|
||||
import shutil
|
||||
import time
|
||||
from collections import Counter, defaultdict
|
||||
from pathlib import Path
|
||||
|
||||
# Matches a source line that is an `#include` whose target contains a "/"
# and a ".h" (i.e. a path-qualified header rather than a bare system header).
INCLUDE_PATTERN = re.compile(r"^\s*#include.*/.*\.h")
# Captures the text between the include delimiters (`"..."` or `<...>`).
INCLUDE_TARGET_PATTERN = re.compile(r'.*["<]([^">]+)[">].*')
# Parses one `paths.txt` row: "<count> <source-level> <destination-level>".
PATHS_LINE_PATTERN = re.compile(r"^\s*(\d+)\s+(\S+)\s+(\S+)\s*$")
|
||||
|
||||
|
||||
def dictionary_sort_key(value: str) -> str:
    """Return *value* reduced to its alphanumeric and whitespace characters.

    This approximates the `sort -d` ("dictionary order") comparison used by
    the shell script: every other character is dropped from the key.
    """
    kept = [char for char in value if char.isspace() or char.isalnum()]
    return "".join(kept)
|
||||
|
||||
|
||||
def normalize_level(value: str) -> str:
    """Convert a slash-separated level path into its dotted level name.

    A value containing "." is treated as naming a file: it is replaced by
    its directory (or "." when there is none) plus "/toplevel", matching the
    shell script. Finally every "/" becomes ".".
    """
    if "." not in value:
        return value.replace("/", ".")

    directory = posixpath.dirname(value)
    if not directory:
        directory = "."
    return (directory + "/toplevel").replace("/", ".")
|
||||
|
||||
|
||||
def source_level(rel_path: str) -> str:
    """Return the level name of a repo-relative source path.

    The level is built from the second and third "/"-separated components
    of *rel_path* (the first component is skipped), then normalized.
    """
    level_components = rel_path.split("/")[1:3]
    return normalize_level("/".join(level_components))
|
||||
|
||||
|
||||
def include_level(include_line: str) -> str | None:
    """Return the level name of the header named in an `#include` line.

    The include target is taken from between the quote/angle delimiters and
    its first two "/"-separated components form the level. Returns ``None``
    when no delimited target can be found in *include_line*.
    """
    found = INCLUDE_TARGET_PATTERN.match(include_line)
    if found is None:
        return None

    leading_components = found.group(1).split("/")[:2]
    return normalize_level("/".join(leading_components))
|
||||
|
||||
|
||||
def scan_file(path: Path, repo_root: Path) -> tuple[list[str], list[tuple[str, str]]]:
    """Collect the qualifying `#include` lines of one file.

    Returns a pair ``(raw_lines, paths)``:

    * ``raw_lines`` holds ``"<repo-relative path>:<line>"`` for every line
      that matches ``INCLUDE_PATTERN`` and does not contain "boost".
    * ``paths`` holds ``(source_level, destination_level)`` for each of
      those lines whose destination level could be determined and differs
      from the file's own level.
    """
    rel = path.relative_to(repo_root).as_posix()
    origin = source_level(rel)

    raw_lines: list[str] = []
    edges: list[tuple[str, str]] = []

    # Undecodable bytes are dropped rather than aborting the scan.
    with path.open("r", encoding="utf-8", errors="ignore") as handle:
        for text in handle:
            if "boost" in text or INCLUDE_PATTERN.match(text) is None:
                continue

            stripped = text.rstrip("\n")
            raw_lines.append(f"{rel}:{stripped}")

            target = include_level(stripped)
            if target is not None and target != origin:
                edges.append((origin, target))

    return raw_lines, edges
|
||||
|
||||
|
||||
def iter_source_files(repo_root: Path) -> list[Path]:
    """List every regular file under ``include/`` and ``src/``.

    The result is sorted by each file's POSIX-style path relative to
    *repo_root*.
    """
    discovered = [
        entry
        for subdir in ("include", "src")
        for entry in (repo_root / subdir).rglob("*")
        if entry.is_file()
    ]
    return sorted(
        discovered,
        key=lambda entry: entry.relative_to(repo_root).as_posix(),
    )
|
||||
|
||||
|
||||
def write_relation_db(
    results_dir: Path,
    edge_counts: list[tuple[tuple[str, str], int]],
) -> tuple[dict[str, list[tuple[str, int]]], dict[str, list[tuple[str, int]]]]:
    """Write ``paths.txt`` plus the ``includes/`` and ``includedby/`` trees.

    *edge_counts* is a sequence of ``((source, destination), count)`` items,
    written to ``paths.txt`` in the order given. For every source a file
    ``includes/<source>`` lists ``"<destination> <count>"`` rows, and for
    every destination a file ``includedby/<destination>`` lists
    ``"<source> <count>"`` rows.

    Returns ``(includes, includedby)``: the two adjacency tables that were
    written, each mapping a level to its ``(other_level, count)`` entries.
    """
    forward_dir = results_dir / "includes"
    reverse_dir = results_dir / "includedby"
    for directory in (forward_dir, reverse_dir):
        directory.mkdir(parents=True, exist_ok=True)

    includes: dict[str, list[tuple[str, int]]] = defaultdict(list)
    includedby: dict[str, list[tuple[str, int]]] = defaultdict(list)

    with (results_dir / "paths.txt").open("w", encoding="utf-8") as paths_file:
        for (origin, target), freq in edge_counts:
            paths_file.write(f"{freq:7d} {origin} {target}\n")
            includes[origin].append((target, freq))
            includedby[target].append((origin, freq))

    # One file per level in each direction; all "includes" files are written
    # before any "includedby" file.
    for directory, table in ((forward_dir, includes), (reverse_dir, includedby)):
        for level, neighbours in table.items():
            with (directory / level).open("w", encoding="utf-8") as handle:
                handle.writelines(f"{name} {freq}\n" for name, freq in neighbours)

    return includes, includedby
|
||||
|
||||
|
||||
def build_loops_and_ordering(
    includes: dict[str, list[tuple[str, int]]],
) -> tuple[list[str], list[str]]:
    """Derive the ``ordering.txt`` and ``loops.txt`` lines from *includes*.

    Sources are visited in sorted order. For each edge ``source -> include``
    where ``include`` itself has outgoing edges:

    * if ``include`` does not include ``source`` back, an ordering line
      ``"source > include"`` is produced;
    * otherwise the pair forms a loop, reported once (for the first
      direction encountered) together with a verdict line comparing the two
      frequencies: a difference above 3 picks a dominant direction, equal
      counts give ``==``, anything else gives ``~=``.

    Returns ``(ordering_lines, loops_lines)``, each line newline-terminated.
    """
    outgoing = {level: dict(edges) for level, edges in includes.items()}

    ordering_lines: list[str] = []
    loops_lines: list[str] = []
    reported: set[tuple[str, str]] = set()

    for source in sorted(includes):
        for include, includefreq in includes[source]:
            back_edges = outgoing.get(include)
            if back_edges is None:
                # The included level has no outgoing edges of its own.
                continue

            sourcefreq = back_edges.get(source)
            if sourcefreq is None:
                ordering_lines.append(f"{source} > {include}\n")
                continue

            # Skip the pair if it was already reported from the other side.
            if (include, source) in reported:
                continue
            reported.add((source, include))

            delta = includefreq - sourcefreq
            if delta > 3:
                verdict = f" {source} > {include}\n\n"
            elif delta < -3:
                verdict = f" {include} > {source}\n\n"
            elif delta == 0:
                verdict = f" {include} == {source}\n\n"
            else:
                verdict = f" {include} ~= {source}\n\n"

            loops_lines.extend((f"Loop: {source} {include}\n", verdict))

    return ordering_lines, loops_lines
|
||||
|
||||
|
||||
def parse_paths(path: Path) -> dict[tuple[str, str], int]:
    """Parse a ``paths.txt`` file into ``{(source, destination): count}``.

    Blank lines are skipped. If the same edge appears more than once, the
    last occurrence wins.

    Raises:
        ValueError: if a non-blank line does not match ``PATHS_LINE_PATTERN``.
    """
    edges: dict[tuple[str, str], int] = {}
    for row in path.read_text(encoding="utf-8").splitlines():
        if row.strip() == "":
            continue

        parsed = PATHS_LINE_PATTERN.match(row)
        if parsed is None:
            raise ValueError(f"Cannot parse paths line: {row!r}")

        count_text, src, dst = parsed.groups()
        edges[(src, dst)] = int(count_text)
    return edges
|
||||
|
||||
|
||||
def parse_relation_dir(path: Path) -> dict[str, Counter[str]]:
    """Load a relation directory as ``{file name: multiset of its lines}``.

    Only regular files directly inside *path* are read; empty lines are
    ignored. A *path* that does not exist yields an empty mapping.
    """
    if not path.exists():
        return {}

    regular_files = sorted(entry for entry in path.iterdir() if entry.is_file())
    return {
        entry.name: Counter(
            filter(None, entry.read_text(encoding="utf-8").splitlines())
        )
        for entry in regular_files
    }
|
||||
|
||||
|
||||
def compare_results(generated: Path, canonical: Path) -> list[str]:
    """Compare a generated results tree against a canonical one.

    The comparison is order-insensitive where ordering is incidental and
    exact where the artifact itself is canonical:

    * ``rawincludes.txt``: compared as a multiset of lines.
    * ``paths.txt``: compared as an ``edge -> count`` mapping.
    * ``includes/`` and ``includedby/``: compared as per-file line multisets.
    * ``ordering.txt`` and ``loops.txt``: compared as exact text.

    An artifact file that is missing from either tree is reported as a
    mismatch instead of aborting the comparison with ``FileNotFoundError``;
    this mirrors how a missing relation directory is already tolerated.

    Returns:
        Human-readable mismatch descriptions; empty when the trees agree.

    Raises:
        ValueError: propagated from ``parse_paths`` when a ``paths.txt``
            line cannot be parsed.
    """
    mismatches: list[str] = []

    def both_present(name: str) -> bool:
        # Record a mismatch naming every tree that lacks the artifact.
        absent = [
            str(base / name)
            for base in (generated, canonical)
            if not (base / name).is_file()
        ]
        if absent:
            mismatches.append(f"{name} missing ({', '.join(absent)})")
        return not absent

    # rawincludes: compare as sets/multisets of lines to ignore traversal order.
    if both_present("rawincludes.txt"):
        gen_raw = Counter(
            generated.joinpath("rawincludes.txt").read_text(encoding="utf-8").splitlines()
        )
        can_raw = Counter(
            canonical.joinpath("rawincludes.txt").read_text(encoding="utf-8").splitlines()
        )
        if gen_raw != can_raw:
            mismatches.append("rawincludes.txt differs (line multiset mismatch)")

    # paths: compare parsed edge->count map, ignoring ordering/whitespace.
    if both_present("paths.txt"):
        gen_paths = parse_paths(generated / "paths.txt")
        can_paths = parse_paths(canonical / "paths.txt")
        if gen_paths != can_paths:
            mismatches.append("paths.txt differs (edge count mismatch)")

    # includes / includedby: compare per-file line multisets.
    for rel in ("includes", "includedby"):
        gen_rel = parse_relation_dir(generated / rel)
        can_rel = parse_relation_dir(canonical / rel)
        if gen_rel != can_rel:
            mismatches.append(f"{rel}/ differs (file set or content mismatch)")

    # ordering and loops are canonical artifacts; require exact text.
    for name in ("ordering.txt", "loops.txt"):
        if not both_present(name):
            continue
        gen_text = generated.joinpath(name).read_text(encoding="utf-8")
        can_text = canonical.joinpath(name).read_text(encoding="utf-8")
        if gen_text != can_text:
            mismatches.append(f"{name} differs")

    return mismatches
|
||||
|
||||
|
||||
def generate(results_dir: Path, repo_root: Path, workers: int) -> None:
    """Regenerate every levelization artifact under *results_dir*.

    Any existing *results_dir* is deleted first. Source files are scanned
    sequentially when ``workers <= 1`` and through a thread pool otherwise;
    either way results are merged in sorted relative-path order. The
    function writes ``rawincludes.txt``, ``paths.txt``, the ``includes/``
    and ``includedby/`` trees, ``ordering.txt`` and ``loops.txt``, then
    prints a one-line summary followed by the ordering and loops contents.
    """
    if results_dir.exists():
        shutil.rmtree(results_dir)
    results_dir.mkdir(parents=True)

    files = iter_source_files(repo_root)

    start = time.perf_counter()
    scanned: dict[str, tuple[list[str], list[tuple[str, str]]]] = {}
    if workers <= 1:
        for source_file in files:
            rel_name = source_file.relative_to(repo_root).as_posix()
            scanned[rel_name] = scan_file(source_file, repo_root)
    else:
        with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as pool:
            pending = {
                source_file.relative_to(repo_root).as_posix(): pool.submit(
                    scan_file, source_file, repo_root
                )
                for source_file in files
            }
            # Collect in sorted order so the first failure surfaced is stable.
            for rel_name in sorted(pending):
                scanned[rel_name] = pending[rel_name].result()

    ordered_names = sorted(scanned)

    raw_lines: list[str] = [
        line for rel_name in ordered_names for line in scanned[rel_name][0]
    ]
    with (results_dir / "rawincludes.txt").open("w", encoding="utf-8") as out:
        # Every line is newline-terminated; no lines means an empty file.
        out.write("".join(f"{line}\n" for line in raw_lines))

    counts = Counter(
        pair for rel_name in ordered_names for pair in scanned[rel_name][1]
    )

    def edge_sort_key(item: tuple[tuple[str, str], int]) -> tuple[str, str, str]:
        # Dictionary-order key first, then the raw names as tie-breakers.
        (src, dst), _ = item
        return (dictionary_sort_key(f"{src} {dst}"), src, dst)

    edge_counts = sorted(counts.items(), key=edge_sort_key)

    includes, _ = write_relation_db(results_dir, edge_counts)
    ordering, loops = build_loops_and_ordering(includes)

    report_files = (("ordering.txt", ordering), ("loops.txt", loops))
    for file_name, content in report_files:
        with (results_dir / file_name).open("w", encoding="utf-8") as out:
            out.writelines(content)

    elapsed = time.perf_counter() - start
    print(
        f"levelization.py: scanned {len(files)} files, "
        f"{len(raw_lines)} includes, {len(edge_counts)} unique paths in "
        f"{elapsed:.2f}s"
    )
    for file_name, _ in report_files:
        print((results_dir / file_name).read_text(encoding="utf-8"), end="")
|
||||
|
||||
|
||||
def main() -> int:
    """Command-line entry point.

    Parses the options, regenerates the results and, when ``--compare-to``
    is given, compares them against that directory.

    Returns:
        ``1`` if a comparison was requested and found mismatches,
        otherwise ``0``.
    """
    script_dir = Path(__file__).resolve().parent
    repo_root = script_dir.parents[1]

    parser = argparse.ArgumentParser()
    option_specs = (
        (
            "--repo-root",
            {
                "type": Path,
                "default": repo_root,
                "help": "Repository root (defaults based on script location).",
            },
        ),
        (
            "--results-dir",
            {
                "type": Path,
                "default": script_dir / "results",
                "help": "Output results directory.",
            },
        ),
        (
            "--workers",
            {
                "type": int,
                "default": min(32, (os.cpu_count() or 1)),
                "help": "Thread count for source scanning (default: CPU count, max 32).",
            },
        ),
        (
            "--compare-to",
            {
                "type": Path,
                "default": None,
                "help": (
                    "Compare generated results against this directory and exit non-zero "
                    "on mismatch (semantic comparison for rawincludes/paths/includes)."
                ),
            },
        ),
    )
    for flag, options in option_specs:
        parser.add_argument(flag, **options)
    args = parser.parse_args()

    generated_dir = args.results_dir.resolve()
    generate(
        results_dir=generated_dir,
        repo_root=args.repo_root.resolve(),
        # Non-positive worker counts fall back to a single worker.
        workers=max(1, args.workers),
    )

    if args.compare_to is None:
        return 0

    mismatches = compare_results(generated_dir, args.compare_to.resolve())
    if not mismatches:
        print("levelization.py: matches canonical results")
        return 0

    print("levelization.py: mismatch against canonical results:")
    for mismatch in mismatches:
        print(f" - {mismatch}")
    return 1
|
||||
|
||||
|
||||
# Run only when executed as a script; main()'s return value becomes the
# process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|
||||
@@ -7,7 +7,7 @@
|
||||
|
||||
pushd $( dirname $0 )
|
||||
|
||||
if [ -v PS1 ]
|
||||
if [[ -n "${PS1-}" ]]
|
||||
then
|
||||
# if the shell is interactive, clean up any flotsam before analyzing
|
||||
git clean -ix
|
||||
|
||||
Reference in New Issue
Block a user