From d717fce47f8d61821bf42ebbc6c9ad63cc57f603 Mon Sep 17 00:00:00 2001 From: Jacob Ellis Date: Mon, 25 May 2026 12:20:02 +0930 Subject: [PATCH] fix(pulls): use merge-commit parents for diff/has_changes on merged PRs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Real-world hit: collins-platform PR #12 (123,773 additions, merged). After #234 landed, cube prv ran the local-git path but git diff --quiet origin/main...origin/feat/... returned exit 0 (no diff), so the empty-PR bail fired. False-empty. Root cause: three-dot origin/...origin/ diffs from the merge-base. For OPEN PRs, merge-base = where head branched off base. For MERGED PRs, base has absorbed head — so merge-base IS head itself, and merge-base..head is empty. GitHub's diff endpoint dodges this by diffing the merge commit's two parents (^1 = base-before-merge, ^2 = head). cube does the same locally now: - fetch_pr requests mergeCommit alongside other PR fields - PRData.merge_commit_oid carries the SHA - has_changes() / get_diff() prefer git diff ^1 ^2 when the SHA is set. Fetches by SHA so this works even when the head branch was deleted post-merge. - Falls back to three-dot, then to API, then to conservative True. Smoke test on the real PR: has_changes True (was: False, silently bailed) get_diff 130k+ lines (was: empty) Tests: 2 new (16 total). Full suite: 508/508. mypy clean. Co-Authored-By: Claude Opus 4.7 --- python/cube/github/pulls.py | 161 ++++++++++++++++++++++++++--- tests/cli/test_pulls_local_diff.py | 125 +++++++++++++++++++--- 2 files changed, 258 insertions(+), 28 deletions(-) diff --git a/python/cube/github/pulls.py b/python/cube/github/pulls.py index 225df28c..8e5995d9 100644 --- a/python/cube/github/pulls.py +++ b/python/cube/github/pulls.py @@ -33,6 +33,11 @@ class PRData: existing_reviews: list[dict] state: str = "" merged_at: str = "" + # For merged PRs, the SHA of the merge commit on the base branch. + # We use ``^1..^2`` to recover the + # PR's actual diff — the three-dot ``origin/...origin/`` + # path collapses to empty once base has absorbed head. + merge_commit_oid: str = "" # Back-compat: callers that read ``pr.diff`` directly. None until # someone calls ``get_diff()`` — the old eager-populated string is @@ -72,6 +77,16 @@ def has_changes(self) -> bool: """ if self._repo: return True + # Merged-PR path: three-dot ``origin/...origin/`` + # collapses to empty once base has absorbed head (merge commit + # moves head's content onto main; their merge-base IS head). + # The merge commit's parents — ^1 = base-before-merge, ^2 = + # PR's head — give us the real diff. ``gh pr diff`` does the + # same thing server-side; we replicate it locally with no cap. + if self.merge_commit_oid: + answer = self._diff_empty_via_merge_commit() + if answer is not None: + return not answer if self.head_branch and self.base_branch: try: result = subprocess.run( @@ -108,6 +123,47 @@ def has_changes(self) -> bool: # Fallback: assume there are changes (don't skip the review). return True + def _diff_empty_via_merge_commit(self) -> Optional[bool]: + """For merged PRs: return True iff the merge commit's two + parents are content-identical (genuinely empty PR), False if + they differ, or None if the local repo can't answer (merge + commit not fetched yet, etc — fall back to other paths).""" + oid = self.merge_commit_oid + try: + # Ensure the merge commit + its parents are present locally. + # If the merge commit's parents aren't reachable, fetch + # main + the merge commit oid itself. + subprocess.run( + ["git", "fetch", "--quiet", "origin", self.base_branch, oid], + cwd=self._cwd, + capture_output=True, + timeout=60, + ) + # Resolve parents — if either doesn't resolve we can't + # answer, fall through. + for ref in (f"{oid}^1", f"{oid}^2"): + rv = subprocess.run( + ["git", "rev-parse", "--verify", ref], + cwd=self._cwd, + capture_output=True, + timeout=10, + ) + if rv.returncode != 0: + return None + diff_check = subprocess.run( + ["git", "diff", "--quiet", f"{oid}^1", f"{oid}^2"], + cwd=self._cwd, + capture_output=True, + timeout=30, + ) + if diff_check.returncode == 0: + return True # parents identical = empty PR + if diff_check.returncode == 1: + return False # parents differ = has changes + except (OSError, subprocess.TimeoutExpired): + return None + return None + def get_diff(self, *, max_lines: int = 8000) -> str: """Return the PR's diff, capped at ``max_lines`` lines. @@ -129,6 +185,7 @@ def get_diff(self, *, max_lines: int = 8000) -> str: base_branch=self.base_branch, cwd=self._cwd, repo_flags=(["--repo", self._repo] if self._repo else []), + merge_commit_oid=self.merge_commit_oid, ) # Truncate per-call so each caller's cap is independent. lines = self._uncapped_diff.splitlines() @@ -180,7 +237,7 @@ def fetch_pr(pr_number: int, cwd: Optional[str] = None, repo: Optional[str] = No "view", str(pr_number), "--json", - "number,title,body,headRefName,baseRefName,headRefOid,reviews,state,mergedAt", + "number,title,body,headRefName,baseRefName,headRefOid,reviews,state,mergedAt,mergeCommit", ] + repo_flags, capture_output=True, @@ -207,6 +264,7 @@ def fetch_pr(pr_number: int, cwd: Optional[str] = None, repo: Optional[str] = No # The old eager ``gh pr diff`` call was hitting GitHub's 20,000-line # API cap (HTTP 406) on large PRs, aborting peer review before any # judge ran. Lazy + local-git-first removes the cap entirely. + merge_commit_obj = data.get("mergeCommit") or {} return PRData( number=data.get("number", pr_number), title=data.get("title", ""), @@ -217,6 +275,7 @@ def fetch_pr(pr_number: int, cwd: Optional[str] = None, repo: Optional[str] = No existing_reviews=data.get("reviews") or [], state=data.get("state", ""), merged_at=data.get("mergedAt") or "", + merge_commit_oid=merge_commit_obj.get("oid") or "", _cwd=cwd, _repo=repo, ) @@ -229,6 +288,7 @@ def _fetch_pr_diff( base_branch: str, cwd: Optional[str], repo_flags: list[str], + merge_commit_oid: str = "", ) -> str: """Return the PR's diff text. @@ -251,14 +311,23 @@ def _fetch_pr_diff( # diff from this repo instead of the requested one — silently # wrong output. Skip the local path entirely in that case. cross_repo = bool(repo_flags) - if head_branch and base_branch and not cross_repo: - local_diff = _try_local_git_diff( - head_branch=head_branch, - base_branch=base_branch, - cwd=cwd, - ) - if local_diff is not None: - return local_diff + if not cross_repo: + # Merged-PR-aware path: when we know the merge commit, diff + # its two parents (the GitHub-equivalent PR diff). This is + # what ``gh pr diff`` does server-side; it works for both + # open AND merged PRs whereas three-dot collapses for merged. + if merge_commit_oid: + local_diff = _try_local_diff_via_merge_commit(merge_commit_oid, base_branch, cwd) + if local_diff is not None: + return local_diff + if head_branch and base_branch: + local_diff = _try_local_git_diff( + head_branch=head_branch, + base_branch=base_branch, + cwd=cwd, + ) + if local_diff is not None: + return local_diff # API fallback. Same shape as the legacy path so callers see the # same diff format (unified diff text) on either route. Bounded @@ -299,12 +368,14 @@ def _try_local_git_diff( base_branch: str, cwd: Optional[str], ) -> Optional[str]: - """Compute the three-dot diff via local git. Returns None on any - failure so the caller falls back to the API path. + """Compute the three-dot diff via local git for OPEN PRs. + + Returns None on any failure so the caller falls back to the API + path (or to ``_try_local_diff_via_merge_commit`` for merged PRs). - Three-dot intentionally matches what ``gh pr diff`` returns (diff - from the merge-base, not the current tip of the base branch) — - keeps caller assertions stable across the two paths. + Three-dot matches what ``gh pr diff`` returns for an OPEN PR. + For MERGED PRs three-dot collapses to empty (base absorbed head) + — use ``_try_local_diff_via_merge_commit`` for those instead. Fetches origin first so refs are fresh; uses ``--no-tags`` and ``--depth=...`` aren't used here because cube's CI / dev @@ -351,6 +422,68 @@ def _try_local_git_diff( return diff_result.stdout +def _try_local_diff_via_merge_commit( + merge_commit_oid: str, + base_branch: str, + cwd: Optional[str], +) -> Optional[str]: + """Compute the PR diff for a MERGED PR via the merge commit's two + parents. Returns None when the local repo can't resolve them + (merge commit not fetched yet, etc) so the caller falls back. + + ``^1`` = base-branch tip BEFORE the merge. + ``^2`` = PR head's final tip. + ``git diff ^1 ^2`` reproduces what GitHub's diff + endpoint returns for a merged PR — including the full diff that + three-dot ``origin/...origin/`` would collapse to + empty once base has absorbed head. + + Fetches the merge commit + base by SHA so this works even when + the head branch was deleted post-merge. + """ + if not merge_commit_oid: + return None + try: + # Best-effort fetch — bring the merge commit + base ref in + # case they aren't local yet. Non-zero exit is fine; the + # rev-parse below is the real gate. + subprocess.run( + ["git", "fetch", "--quiet", "origin", base_branch, merge_commit_oid], + cwd=cwd, + capture_output=True, + timeout=60, + ) + # Both parents must resolve locally. + for ref in (f"{merge_commit_oid}^1", f"{merge_commit_oid}^2"): + rv = subprocess.run( + ["git", "rev-parse", "--verify", ref], + cwd=cwd, + capture_output=True, + timeout=10, + ) + if rv.returncode != 0: + return None + diff_result = subprocess.run( + [ + "git", + "diff", + "--no-color", + f"{merge_commit_oid}^1", + f"{merge_commit_oid}^2", + ], + cwd=cwd, + capture_output=True, + text=True, + check=False, + timeout=120, + ) + except (OSError, subprocess.TimeoutExpired): + return None + if diff_result.returncode != 0: + return None + return diff_result.stdout + + def fetch_failed_ci_logs(pr_number: int, cwd: Optional[str] = None, max_log_lines: int = 200) -> list[dict]: """Fetch logs from failed CI/GHA checks on a PR. diff --git a/tests/cli/test_pulls_local_diff.py b/tests/cli/test_pulls_local_diff.py index 77113947..51dbc899 100644 --- a/tests/cli/test_pulls_local_diff.py +++ b/tests/cli/test_pulls_local_diff.py @@ -28,20 +28,28 @@ from cube.github import pulls -def _gh_view_stub(head: str = "feat/big-branch", base: str = "main") -> str: - return json.dumps( - { - "number": 1234, - "title": "Big PR", - "body": "body", - "headRefName": head, - "baseRefName": base, - "headRefOid": "abc123", - "reviews": [], - "state": "OPEN", - "mergedAt": None, - } - ) +def _gh_view_stub( + head: str = "feat/big-branch", + base: str = "main", + *, + state: str = "OPEN", + merged_at: str = "", + merge_commit_oid: str = "", +) -> str: + payload = { + "number": 1234, + "title": "Big PR", + "body": "body", + "headRefName": head, + "baseRefName": base, + "headRefOid": "abc123", + "reviews": [], + "state": state, + "mergedAt": merged_at or None, + } + if merge_commit_oid: + payload["mergeCommit"] = {"oid": merge_commit_oid} + return json.dumps(payload) # --------------------------------------------------------------------------- @@ -390,6 +398,95 @@ def _fake_run(cmd, *args, **kwargs): ), "cross-repo fetch must pass --repo flag to gh" +def test_merged_pr_uses_merge_commit_parents_for_has_changes(monkeypatch): + """Three-dot ``origin/...origin/`` collapses to empty + once base has absorbed head — has_changes used to falsely report + a merged 123k-line PR as "no diff", bailing out of cube prv. + + Fix: when the PR is merged AND we have the merge commit SHA, + diff its parents (^1 = base-before-merge, ^2 = PR head) instead. + Matches what ``gh pr diff`` does server-side. + + Test pins: a merged PR with merge_commit_oid set produces + has_changes=True without ever consulting the (broken-for-merged) + three-dot path.""" + three_dot_called = {"v": False} + + def _fake_run(cmd, *args, **kwargs): + if cmd[:3] == ["gh", "pr", "view"]: + return subprocess.CompletedProcess( + cmd, + 0, + stdout=_gh_view_stub(state="MERGED", merged_at="2026-05-24T13:06:25Z", merge_commit_oid="abc123"), + stderr="", + ) + if cmd[:2] == ["git", "fetch"]: + return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="") + if cmd[:3] == ["git", "rev-parse", "--verify"]: + return subprocess.CompletedProcess(cmd, 0, stdout="abc123\n", stderr="") + if cmd[:3] == ["git", "diff", "--quiet"]: + # If the args are the three-dot form, flag the regression. + args_after_quiet = cmd[3:] + if any("..." in a for a in args_after_quiet): + three_dot_called["v"] = True + # Three-dot would return 0 (no diff) here — and that's + # exactly the bug we're avoiding. Returning 0 forces + # the test to fail loudly if we ever fall through. + return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="") + # Merge-commit parents path: report "has diff" so we + # confirm the contract. + return subprocess.CompletedProcess(cmd, 1, stdout="", stderr="") + return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="") + + monkeypatch.setattr(pulls, "check_gh_installed", lambda: True) + monkeypatch.setattr(subprocess, "run", _fake_run) + + data = pulls.fetch_pr(1234) + assert data.merge_commit_oid == "abc123" + assert data.has_changes() is True + assert three_dot_called["v"] is False, "merged PRs must NOT consult the three-dot path" + + +def test_merged_pr_get_diff_uses_merge_commit_parents(monkeypatch): + """Symmetric to has_changes: get_diff for a merged PR resolves + via the merge commit's parents, not three-dot. Without this, + the dedupe agent received an empty diff for any merged PR + cube was asked to review post-merge.""" + called = {"three_dot": False, "merge_commit_diff": False} + + def _fake_run(cmd, *args, **kwargs): + if cmd[:3] == ["gh", "pr", "view"]: + return subprocess.CompletedProcess( + cmd, + 0, + stdout=_gh_view_stub(state="MERGED", merged_at="2026-05-24T13:06:25Z", merge_commit_oid="abc123"), + stderr="", + ) + if cmd[:2] == ["git", "fetch"]: + return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="") + if cmd[:3] == ["git", "rev-parse", "--verify"]: + return subprocess.CompletedProcess(cmd, 0, stdout="abc123\n", stderr="") + if cmd[:3] == ["git", "diff", "--no-color"]: + args_after_no_color = cmd[3:] + if any("..." in a for a in args_after_no_color): + called["three_dot"] = True + return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="") + # Merge-commit parents form: ^1 and ^2 + if any(a.endswith("^1") for a in args_after_no_color): + called["merge_commit_diff"] = True + return subprocess.CompletedProcess(cmd, 0, stdout="diff --git a/x b/x\n+merged content\n", stderr="") + return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="") + + monkeypatch.setattr(pulls, "check_gh_installed", lambda: True) + monkeypatch.setattr(subprocess, "run", _fake_run) + + data = pulls.fetch_pr(1234) + diff = data.get_diff() + assert "merged content" in diff + assert called["merge_commit_diff"] is True + assert called["three_dot"] is False + + def test_has_changes_defaults_to_true_on_git_failure(monkeypatch): """If git can't answer (offline, no origin, etc), assume the PR has changes — better than silently skipping a real PR because the