update release notes script to filter cherry-picked commits already in release build

2026-07-30 02:20:21 +00:00 · 2026-05-27 12:41:03 -07:00
parent d7456276b9
commit 52fccc0086
1 changed files with 129 additions and 22 deletions
--- a/tools/generate-release-notes.py
+++ b/tools/generate-release-notes.py
@@ -53,6 +53,15 @@ SKIP_PATTERNS = [
    re.compile(r"^Merge tag ", re.IGNORECASE),
 ]

+# Patterns for normalizing commit titles when detecting cherry-pick duplicates.
+# PR_NUM_RE matches any "(#NNNN)" PR reference; CONV_COMMIT_RE a leading "type(scope):" prefix.
+PR_NUM_RE = re.compile(r"\s*\(#\d+\)")
+CONV_COMMIT_RE = re.compile(
+    r"^(fix|feat|refactor|chore|docs|test|tests|ci|build|style|perf|revert|release|bugfix)"
+    r"(\([^)]*\))?\s*:\s*",
+    re.IGNORECASE,
+)
+

 # --- API helpers ---

@@ -164,19 +173,54 @@ def fetch_version_info(ref):


 def fetch_commits(from_ref, to_ref):
-    """Fetch all commits between two refs using the GitHub compare API."""
-    commits = []
-    page = 1
-    while True:
-        data = run_gh_rest(
-            f"repos/XRPLF/rippled/compare/{from_ref}...{to_ref}?per_page=250&page={page}"
-        )
-        batch = data.get("commits", [])
-        commits.extend(batch)
-        if len(batch) < 250:
-            break
-        page += 1
-    return commits
+    """Fetch from_ref...to_ref commits minus cherry-pick dupes, plus flagged unmatched from_ref commits."""
+
+    def key(c):  # normalized first line of the commit message, used for matching
+        t = c["commit"]["message"].split("\n")[0]
+        t = PR_NUM_RE.sub("", t)
+        t = CONV_COMMIT_RE.sub("", t)
+        return re.sub(r"\s+", " ", t).strip().lower()
+
+    def paginate(base, head):  # collect the "commits" list from every page of base...head
+        results, page = [], 1
+        while True:
+            data = run_gh_rest(
+                f"repos/XRPLF/rippled/compare/{base}...{head}?per_page=250&page={page}"
+            )
+            batch = data.get("commits", [])
+            results.extend(batch)
+            if len(batch) < 250:  # a short page is treated as the last page
+                break
+            page += 1
+        return results
+
+    incoming = paginate(from_ref, to_ref)
+    shipped = paginate(to_ref, from_ref)  # reverse compare (refs swapped)
+    incoming_keys = {key(c) for c in incoming}
+    shipped_keys = {key(c) for c in shipped}
+
+    before = len(incoming)
+    deduped = [c for c in incoming if key(c) not in shipped_keys]
+    dropped = before - len(deduped)
+    if dropped:
+        print(f"  Filtered {dropped} cherry-pick duplicates.")
+
+    # Surface reverse-compare commits whose normalized title matches nothing
+    # in the forward compare. These are either real release-branch originals
+    # or cherry-pick dupes that drifted enough to escape title matching.
+    unmatched = [
+        c for c in shipped
+        if key(c) not in incoming_keys
+        and not should_skip(c["commit"]["message"].split("\n")[0])
+    ]
+    for c in unmatched:
+        c["_potential_dupe"] = True  # flag is stored on the commit dict itself
+    if unmatched:
+        print(f"  Adding {len(unmatched)} unmatched {from_ref} commit(s) to draft "
+              f"flagged as [POTENTIAL DUPE — VERIFY].")
+    deduped.extend(unmatched)
+
+    return deduped


 def parse_features_macro(text):
@@ -542,6 +586,13 @@ def main():
    pr_shas = {}       # PR/issue number → commit SHA (for file lookups on Issues)
    pr_bodies = {}     # PR/issue number → commit body (for fallback descriptions)
    orphan_commits = []  # Commits with no PR/Issues link
+    # Potential dupe commits are kept in their own parallel buckets so they
+    # don't collide with real entries by PR number. They go through the same
+    # PR-enrichment pipeline to give reviewers full side-by-side context.
+    dupe_pr_numbers = {}
+    dupe_pr_shas = {}
+    dupe_pr_bodies = {}
+    dupe_orphan_commits = []
    authors = set()

    for commit in commits:
@@ -552,18 +603,28 @@ def main():
        author = commit["commit"]["author"]["name"]
        email = commit["commit"]["author"].get("email", "")

-        # Skip Ripple employees from credits
-        login = (commit.get("author") or {}).get("login")
-        if not email.lower().endswith("@ripple.com") and email not in EXCLUDED_EMAILS:
-            if login:
-                authors.add(f"@{login}")
-            else:
-                authors.add(author)
+        # Credits: skip Ripple employees, and never credit potential-dupe commits
+        if not commit.get("_potential_dupe"):
+            login = (commit.get("author") or {}).get("login")
+            if not email.lower().endswith("@ripple.com") and email not in EXCLUDED_EMAILS:
+                if login:
+                    authors.add(f"@{login}")
+                else:
+                    authors.add(author)

        if should_skip(message):
            continue

        pr_number = extract_pr_number(message)
+        if commit.get("_potential_dupe"):
+            if pr_number:
+                dupe_pr_numbers[pr_number] = message
+                dupe_pr_shas[pr_number] = sha
+                dupe_pr_bodies[pr_number] = body
+            else:
+                dupe_orphan_commits.append({"sha": sha, "message": message, "body": body})
+            continue
+
        if pr_number:
            pr_numbers[pr_number] = message
            pr_shas[pr_number] = sha
@@ -586,12 +647,14 @@ def main():

    print(f"Building changelog entries...")

-    # Fetch all PR details in batches via GraphQL
-    pr_details = fetch_prs_graphql(list(pr_numbers.keys()))
+    # Fetch all PR details in batches via GraphQL.
+    all_pr_numbers = list(set(pr_numbers.keys()) | set(dupe_pr_numbers.keys()))
+    pr_details = fetch_prs_graphql(all_pr_numbers)

    # Build entries, sorting amendments automatically
    amendment_entries = []
    entries = []
+    DUPE_MARKER = "[POTENTIAL DUPE — VERIFY]"
    for pr_number, commit_msg in pr_numbers.items():
        pr_data = pr_details.get(pr_number)

@@ -638,6 +701,50 @@ def main():
            entry = format_commit_entry(sha, orphan["message"], orphan["body"], files)
            entries.append(entry)

+    # Build entries for potential dupes
+    for pr_number, commit_msg in dupe_pr_numbers.items():
+        sha = dupe_pr_shas[pr_number]
+        pr_data = pr_details.get(pr_number)
+        print(f"  Building potential-dupe entry for #{pr_number} ({sha[:7]})...")
+
+        if pr_data:
+            title = f"{DUPE_MARKER} {pr_data['title']}"
+            body = pr_data.get("body", "")
+            labels = pr_data.get("labels", [])
+            files = pr_data.get("files", [])
+            link_type = pr_data.get("type", "pull")
+            if not files:
+                files = fetch_commit_files(sha)
+            if is_amendment(files) and amendment_diff:
+                entry = format_uncategorized_entry(pr_number, title, labels, body, link_type=link_type)
+                amendment_entries.append(entry)
+            else:
+                entry = format_uncategorized_entry(pr_number, title, labels, body, files, link_type)
+                entries.append(entry)
+        else:
+            # PR/Issue lookup failed — fall back to commit-only entry
+            files = fetch_commit_files(sha)
+            title = f"{DUPE_MARKER} {commit_msg}"
+            if is_amendment(files) and amendment_diff:
+                entry = format_commit_entry(sha, title, dupe_pr_bodies[pr_number])
+                amendment_entries.append(entry)
+            else:
+                entry = format_commit_entry(sha, title, dupe_pr_bodies[pr_number], files)
+                entries.append(entry)
+
+    # Potential dupe orphans (no PR link at all)
+    for orphan in dupe_orphan_commits:
+        sha = orphan["sha"]
+        print(f"  Building potential-dupe orphan entry for {sha[:7]}...")
+        files = fetch_commit_files(sha)
+        title = f"{DUPE_MARKER} {orphan['message']}"
+        if is_amendment(files) and amendment_diff:
+            entry = format_commit_entry(sha, title, orphan["body"])
+            amendment_entries.append(entry)
+        else:
+            entry = format_commit_entry(sha, title, orphan["body"], files)
+            entries.append(entry)
+
    # Generate markdown
    markdown = generate_markdown(version, args.date, amendment_diff, amendment_unchanged, amendment_entries, entries, authors, version_commit)