From 52fccc0086aeab96d5e25cd32ef040cc265ba791 Mon Sep 17 00:00:00 2001
From: Oliver Eggert <oeggert@ripple.com>
Date: Wed, 27 May 2026 12:41:03 -0700
Subject: [PATCH] update release notes script to filter cherry-picked commits
 already in release build

---
 tools/generate-release-notes.py | 151 +++++++++++++++++++++++++++-----
 1 file changed, 129 insertions(+), 22 deletions(-)

diff --git a/tools/generate-release-notes.py b/tools/generate-release-notes.py
index e89ec06676..49e5f71edb 100644
--- a/tools/generate-release-notes.py
+++ b/tools/generate-release-notes.py
@@ -53,6 +53,15 @@ SKIP_PATTERNS = [
     re.compile(r"^Merge tag ", re.IGNORECASE),
 ]
 
+# Patterns for normalizing commit titles when detecting cherry-pick duplicates.
+# Strips every "(#NNNN)" PR-number reference and a leading conventional-commit prefix.
+PR_NUM_RE = re.compile(r"\s*\(#\d+\)")
+CONV_COMMIT_RE = re.compile(
+    r"^(fix|feat|refactor|chore|docs|test|tests|ci|build|style|perf|revert|release|bugfix)"
+    r"(\([^)]*\))?\s*:\s*",
+    re.IGNORECASE,
+)
+
 
 # --- API helpers ---
 
@@ -164,19 +173,54 @@ def fetch_version_info(ref):
 
 
 def fetch_commits(from_ref, to_ref):
-    """Fetch all commits between two refs using the GitHub compare API."""
-    commits = []
-    page = 1
-    while True:
-        data = run_gh_rest(
-            f"repos/XRPLF/rippled/compare/{from_ref}...{to_ref}?per_page=250&page={page}"
-        )
-        batch = data.get("commits", [])
-        commits.extend(batch)
-        if len(batch) < 250:
-            break
-        page += 1
-    return commits
+    """Fetch commits between two refs, dropping cherry-pick dupes and flagging unmatched ones."""
+
+    def key(c):
+        t = c["commit"]["message"].split("\n")[0]
+        t = PR_NUM_RE.sub("", t)
+        t = CONV_COMMIT_RE.sub("", t)
+        return re.sub(r"\s+", " ", t).strip().lower()
+
+    def paginate(base, head):
+        results, page = [], 1
+        while True:
+            data = run_gh_rest(
+                f"repos/XRPLF/rippled/compare/{base}...{head}?per_page=250&page={page}"
+            )
+            batch = data.get("commits", [])
+            results.extend(batch)
+            if len(batch) < 250:
+                break
+            page += 1
+        return results
+
+    incoming = paginate(from_ref, to_ref)
+    shipped = paginate(to_ref, from_ref)
+    incoming_keys = {key(c) for c in incoming}
+    shipped_keys = {key(c) for c in shipped}
+
+    before = len(incoming)
+    deduped = [c for c in incoming if key(c) not in shipped_keys]
+    dropped = before - len(deduped)
+    if dropped:
+        print(f"  Filtered {dropped} cherry-pick duplicates.")
+
+    # Surface backward-diff commits with no forward-diff match. These are
+    # either real release-branch originals or cherry-pick dupes that drifted
+    # enough to escape matching.
+    unmatched = [
+        c for c in shipped
+        if key(c) not in incoming_keys
+        and not should_skip(c["commit"]["message"].split("\n")[0])
+    ]
+    for c in unmatched:
+        c["_potential_dupe"] = True
+    if unmatched:
+        print(f"  Adding {len(unmatched)} unmatched {from_ref} commit(s) to draft "
+              f"flagged as [POTENTIAL DUPE — VERIFY].")
+    deduped.extend(unmatched)
+
+    return deduped
 
 
 def parse_features_macro(text):
@@ -542,6 +586,13 @@ def main():
     pr_shas = {}       # PR/issue number → commit SHA (for file lookups on Issues)
     pr_bodies = {}     # PR/issue number → commit body (for fallback descriptions)
     orphan_commits = []  # Commits with no PR/Issues link
+    # Potential dupe commits are kept in their own parallel buckets so they
+    # don't collide with real entries by PR number. They go through the same
+    # PR-enrichment pipeline to give reviewers full side-by-side context.
+    dupe_pr_numbers = {}
+    dupe_pr_shas = {}
+    dupe_pr_bodies = {}
+    dupe_orphan_commits = []
     authors = set()
 
     for commit in commits:
@@ -552,18 +603,28 @@ def main():
         author = commit["commit"]["author"]["name"]
         email = commit["commit"]["author"].get("email", "")
 
-        # Skip Ripple employees from credits
-        login = (commit.get("author") or {}).get("login")
-        if not email.lower().endswith("@ripple.com") and email not in EXCLUDED_EMAILS:
-            if login:
-                authors.add(f"@{login}")
-            else:
-                authors.add(author)
+        # Leave Ripple employees and all potential-dupe commits out of the credits
+        if not commit.get("_potential_dupe"):
+            login = (commit.get("author") or {}).get("login")
+            if not email.lower().endswith("@ripple.com") and email not in EXCLUDED_EMAILS:
+                if login:
+                    authors.add(f"@{login}")
+                else:
+                    authors.add(author)
 
         if should_skip(message):
             continue
 
         pr_number = extract_pr_number(message)
+        if commit.get("_potential_dupe"):
+            if pr_number:
+                dupe_pr_numbers[pr_number] = message
+                dupe_pr_shas[pr_number] = sha
+                dupe_pr_bodies[pr_number] = body
+            else:
+                dupe_orphan_commits.append({"sha": sha, "message": message, "body": body})
+            continue
+
         if pr_number:
             pr_numbers[pr_number] = message
             pr_shas[pr_number] = sha
@@ -586,12 +647,14 @@ def main():
 
     print(f"Building changelog entries...")
 
-    # Fetch all PR details in batches via GraphQL
-    pr_details = fetch_prs_graphql(list(pr_numbers.keys()))
+    # Fetch all PR details in batches via GraphQL.
+    all_pr_numbers = list(set(pr_numbers.keys()) | set(dupe_pr_numbers.keys()))
+    pr_details = fetch_prs_graphql(all_pr_numbers)
 
     # Build entries, sorting amendments automatically
     amendment_entries = []
     entries = []
+    DUPE_MARKER = "[POTENTIAL DUPE — VERIFY]"
     for pr_number, commit_msg in pr_numbers.items():
         pr_data = pr_details.get(pr_number)
 
@@ -638,6 +701,50 @@ def main():
             entry = format_commit_entry(sha, orphan["message"], orphan["body"], files)
             entries.append(entry)
 
+    # Build entries for potential dupes
+    for pr_number, commit_msg in dupe_pr_numbers.items():
+        sha = dupe_pr_shas[pr_number]
+        pr_data = pr_details.get(pr_number)
+        print(f"  Building potential-dupe entry for #{pr_number} ({sha[:7]})...")
+
+        if pr_data:
+            title = f"{DUPE_MARKER} {pr_data['title']}"
+            body = pr_data.get("body", "")
+            labels = pr_data.get("labels", [])
+            files = pr_data.get("files", [])
+            link_type = pr_data.get("type", "pull")
+            if not files:
+                files = fetch_commit_files(sha)
+            if is_amendment(files) and amendment_diff:
+                entry = format_uncategorized_entry(pr_number, title, labels, body, link_type=link_type)
+                amendment_entries.append(entry)
+            else:
+                entry = format_uncategorized_entry(pr_number, title, labels, body, files, link_type)
+                entries.append(entry)
+        else:
+            # PR/Issue lookup failed — fall back to commit-only entry
+            files = fetch_commit_files(sha)
+            title = f"{DUPE_MARKER} {commit_msg}"
+            if is_amendment(files) and amendment_diff:
+                entry = format_commit_entry(sha, title, dupe_pr_bodies[pr_number])
+                amendment_entries.append(entry)
+            else:
+                entry = format_commit_entry(sha, title, dupe_pr_bodies[pr_number], files)
+                entries.append(entry)
+
+    # Potential dupe orphans (no PR link at all)
+    for orphan in dupe_orphan_commits:
+        sha = orphan["sha"]
+        print(f"  Building potential-dupe orphan entry for {sha[:7]}...")
+        files = fetch_commit_files(sha)
+        title = f"{DUPE_MARKER} {orphan['message']}"
+        if is_amendment(files) and amendment_diff:
+            entry = format_commit_entry(sha, title, orphan["body"])
+            amendment_entries.append(entry)
+        else:
+            entry = format_commit_entry(sha, title, orphan["body"], files)
+            entries.append(entry)
+
     # Generate markdown
     markdown = generate_markdown(version, args.date, amendment_diff, amendment_unchanged, amendment_entries, entries, authors, version_commit)