Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
161 changes: 147 additions & 14 deletions python/cube/github/pulls.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,11 @@ class PRData:
existing_reviews: list[dict]
state: str = ""
merged_at: str = ""
# For merged PRs, the SHA of the merge commit on the base branch.
# We use ``<merge_commit>^1..<merge_commit>^2`` to recover the
# PR's actual diff — the three-dot ``origin/<base>...origin/<head>``
# path collapses to empty once base has absorbed head.
merge_commit_oid: str = ""

# Back-compat: callers that read ``pr.diff`` directly. None until
# someone calls ``get_diff()`` — the old eager-populated string is
Expand Down Expand Up @@ -72,6 +77,16 @@ def has_changes(self) -> bool:
"""
if self._repo:
return True
# Merged-PR path: three-dot ``origin/<base>...origin/<head>``
# collapses to empty once base has absorbed head (merge commit
# moves head's content onto main; their merge-base IS head).
# The merge commit's parents — ^1 = base-before-merge, ^2 =
# PR's head — give us the real diff. ``gh pr diff`` does the
# same thing server-side; we replicate it locally with no cap.
if self.merge_commit_oid:
answer = self._diff_empty_via_merge_commit()
if answer is not None:
return not answer
if self.head_branch and self.base_branch:
try:
result = subprocess.run(
Expand Down Expand Up @@ -108,6 +123,47 @@ def has_changes(self) -> bool:
# Fallback: assume there are changes (don't skip the review).
return True

def _diff_empty_via_merge_commit(self) -> Optional[bool]:
"""For merged PRs: return True iff the merge commit's two
parents are content-identical (genuinely empty PR), False if
they differ, or None if the local repo can't answer (merge
commit not fetched yet, etc — fall back to other paths)."""
oid = self.merge_commit_oid
try:
# Ensure the merge commit + its parents are present locally.
# If the merge commit's parents aren't reachable, fetch
# main + the merge commit oid itself.
subprocess.run(
["git", "fetch", "--quiet", "origin", self.base_branch, oid],
cwd=self._cwd,
capture_output=True,
timeout=60,
)
# Resolve parents — if either doesn't resolve we can't
# answer, fall through.
for ref in (f"{oid}^1", f"{oid}^2"):
rv = subprocess.run(
["git", "rev-parse", "--verify", ref],
cwd=self._cwd,
capture_output=True,
timeout=10,
)
if rv.returncode != 0:
return None
diff_check = subprocess.run(
["git", "diff", "--quiet", f"{oid}^1", f"{oid}^2"],
cwd=self._cwd,
capture_output=True,
timeout=30,
)
if diff_check.returncode == 0:
return True # parents identical = empty PR
if diff_check.returncode == 1:
return False # parents differ = has changes
except (OSError, subprocess.TimeoutExpired):
return None
return None

def get_diff(self, *, max_lines: int = 8000) -> str:
"""Return the PR's diff, capped at ``max_lines`` lines.

Expand All @@ -129,6 +185,7 @@ def get_diff(self, *, max_lines: int = 8000) -> str:
base_branch=self.base_branch,
cwd=self._cwd,
repo_flags=(["--repo", self._repo] if self._repo else []),
merge_commit_oid=self.merge_commit_oid,
)
# Truncate per-call so each caller's cap is independent.
lines = self._uncapped_diff.splitlines()
Expand Down Expand Up @@ -180,7 +237,7 @@ def fetch_pr(pr_number: int, cwd: Optional[str] = None, repo: Optional[str] = No
"view",
str(pr_number),
"--json",
"number,title,body,headRefName,baseRefName,headRefOid,reviews,state,mergedAt",
"number,title,body,headRefName,baseRefName,headRefOid,reviews,state,mergedAt,mergeCommit",
]
+ repo_flags,
capture_output=True,
Expand All @@ -207,6 +264,7 @@ def fetch_pr(pr_number: int, cwd: Optional[str] = None, repo: Optional[str] = No
# The old eager ``gh pr diff`` call was hitting GitHub's 20,000-line
# API cap (HTTP 406) on large PRs, aborting peer review before any
# judge ran. Lazy + local-git-first removes the cap entirely.
merge_commit_obj = data.get("mergeCommit") or {}
return PRData(
number=data.get("number", pr_number),
title=data.get("title", ""),
Expand All @@ -217,6 +275,7 @@ def fetch_pr(pr_number: int, cwd: Optional[str] = None, repo: Optional[str] = No
existing_reviews=data.get("reviews") or [],
state=data.get("state", ""),
merged_at=data.get("mergedAt") or "",
merge_commit_oid=merge_commit_obj.get("oid") or "",
_cwd=cwd,
_repo=repo,
)
Expand All @@ -229,6 +288,7 @@ def _fetch_pr_diff(
base_branch: str,
cwd: Optional[str],
repo_flags: list[str],
merge_commit_oid: str = "",
) -> str:
"""Return the PR's diff text.

Expand All @@ -251,14 +311,23 @@ def _fetch_pr_diff(
# diff from this repo instead of the requested one — silently
# wrong output. Skip the local path entirely in that case.
cross_repo = bool(repo_flags)
if head_branch and base_branch and not cross_repo:
local_diff = _try_local_git_diff(
head_branch=head_branch,
base_branch=base_branch,
cwd=cwd,
)
if local_diff is not None:
return local_diff
if not cross_repo:
# Merged-PR-aware path: when we know the merge commit, diff
# its two parents (the GitHub-equivalent PR diff). This is
# what ``gh pr diff`` does server-side; it works for both
# open AND merged PRs whereas three-dot collapses for merged.
if merge_commit_oid:
local_diff = _try_local_diff_via_merge_commit(merge_commit_oid, base_branch, cwd)
if local_diff is not None:
return local_diff
if head_branch and base_branch:
local_diff = _try_local_git_diff(
head_branch=head_branch,
base_branch=base_branch,
cwd=cwd,
)
if local_diff is not None:
return local_diff

# API fallback. Same shape as the legacy path so callers see the
# same diff format (unified diff text) on either route. Bounded
Expand Down Expand Up @@ -299,12 +368,14 @@ def _try_local_git_diff(
base_branch: str,
cwd: Optional[str],
) -> Optional[str]:
"""Compute the three-dot diff via local git. Returns None on any
failure so the caller falls back to the API path.
"""Compute the three-dot diff via local git for OPEN PRs.

Returns None on any failure so the caller falls back to the API
path (or to ``_try_local_diff_via_merge_commit`` for merged PRs).

Three-dot intentionally matches what ``gh pr diff`` returns (diff
from the merge-base, not the current tip of the base branch) —
keeps caller assertions stable across the two paths.
Three-dot matches what ``gh pr diff`` returns for an OPEN PR.
For MERGED PRs three-dot collapses to empty (base absorbed head)
— use ``_try_local_diff_via_merge_commit`` for those instead.

Fetches origin first so refs are fresh; uses ``--no-tags`` and
``--depth=...`` aren't used here because cube's CI / dev
Expand Down Expand Up @@ -351,6 +422,68 @@ def _try_local_git_diff(
return diff_result.stdout


def _try_local_diff_via_merge_commit(
merge_commit_oid: str,
base_branch: str,
cwd: Optional[str],
) -> Optional[str]:
"""Compute the PR diff for a MERGED PR via the merge commit's two
parents. Returns None when the local repo can't resolve them
(merge commit not fetched yet, etc) so the caller falls back.

``<merge>^1`` = base-branch tip BEFORE the merge.
``<merge>^2`` = PR head's final tip.
``git diff <merge>^1 <merge>^2`` reproduces what GitHub's diff
endpoint returns for a merged PR — including the full diff that
three-dot ``origin/<base>...origin/<head>`` would collapse to
empty once base has absorbed head.

Fetches the merge commit + base by SHA so this works even when
the head branch was deleted post-merge.
"""
if not merge_commit_oid:
return None
try:
# Best-effort fetch — bring the merge commit + base ref in
# case they aren't local yet. Non-zero exit is fine; the
# rev-parse below is the real gate.
subprocess.run(
["git", "fetch", "--quiet", "origin", base_branch, merge_commit_oid],
cwd=cwd,
capture_output=True,
timeout=60,
)
# Both parents must resolve locally.
for ref in (f"{merge_commit_oid}^1", f"{merge_commit_oid}^2"):
rv = subprocess.run(
["git", "rev-parse", "--verify", ref],
cwd=cwd,
capture_output=True,
timeout=10,
)
if rv.returncode != 0:
return None
diff_result = subprocess.run(
[
"git",
"diff",
"--no-color",
f"{merge_commit_oid}^1",
f"{merge_commit_oid}^2",
],
cwd=cwd,
capture_output=True,
text=True,
check=False,
timeout=120,
)
except (OSError, subprocess.TimeoutExpired):
return None
if diff_result.returncode != 0:
return None
return diff_result.stdout


def fetch_failed_ci_logs(pr_number: int, cwd: Optional[str] = None, max_log_lines: int = 200) -> list[dict]:
"""Fetch logs from failed CI/GHA checks on a PR.

Expand Down
125 changes: 111 additions & 14 deletions tests/cli/test_pulls_local_diff.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,20 +28,28 @@
from cube.github import pulls


def _gh_view_stub(head: str = "feat/big-branch", base: str = "main") -> str:
return json.dumps(
{
"number": 1234,
"title": "Big PR",
"body": "body",
"headRefName": head,
"baseRefName": base,
"headRefOid": "abc123",
"reviews": [],
"state": "OPEN",
"mergedAt": None,
}
)
def _gh_view_stub(
head: str = "feat/big-branch",
base: str = "main",
*,
state: str = "OPEN",
merged_at: str = "",
merge_commit_oid: str = "",
) -> str:
payload = {
"number": 1234,
"title": "Big PR",
"body": "body",
"headRefName": head,
"baseRefName": base,
"headRefOid": "abc123",
"reviews": [],
"state": state,
"mergedAt": merged_at or None,
}
if merge_commit_oid:
payload["mergeCommit"] = {"oid": merge_commit_oid}
return json.dumps(payload)


# ---------------------------------------------------------------------------
Expand Down Expand Up @@ -390,6 +398,95 @@ def _fake_run(cmd, *args, **kwargs):
), "cross-repo fetch must pass --repo flag to gh"


def test_merged_pr_uses_merge_commit_parents_for_has_changes(monkeypatch):
"""Three-dot ``origin/<base>...origin/<head>`` collapses to empty
once base has absorbed head — has_changes used to falsely report
a merged 123k-line PR as "no diff", bailing out of cube prv.

Fix: when the PR is merged AND we have the merge commit SHA,
diff its parents (^1 = base-before-merge, ^2 = PR head) instead.
Matches what ``gh pr diff`` does server-side.

Test pins: a merged PR with merge_commit_oid set produces
has_changes=True without ever consulting the (broken-for-merged)
three-dot path."""
three_dot_called = {"v": False}

def _fake_run(cmd, *args, **kwargs):
if cmd[:3] == ["gh", "pr", "view"]:
return subprocess.CompletedProcess(
cmd,
0,
stdout=_gh_view_stub(state="MERGED", merged_at="2026-05-24T13:06:25Z", merge_commit_oid="abc123"),
stderr="",
)
if cmd[:2] == ["git", "fetch"]:
return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="")
if cmd[:3] == ["git", "rev-parse", "--verify"]:
return subprocess.CompletedProcess(cmd, 0, stdout="abc123\n", stderr="")
if cmd[:3] == ["git", "diff", "--quiet"]:
# If the args are the three-dot form, flag the regression.
args_after_quiet = cmd[3:]
if any("..." in a for a in args_after_quiet):
three_dot_called["v"] = True
# Three-dot would return 0 (no diff) here — and that's
# exactly the bug we're avoiding. Returning 0 forces
# the test to fail loudly if we ever fall through.
return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="")
# Merge-commit parents path: report "has diff" so we
# confirm the contract.
return subprocess.CompletedProcess(cmd, 1, stdout="", stderr="")
return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="")

monkeypatch.setattr(pulls, "check_gh_installed", lambda: True)
monkeypatch.setattr(subprocess, "run", _fake_run)

data = pulls.fetch_pr(1234)
assert data.merge_commit_oid == "abc123"
assert data.has_changes() is True
assert three_dot_called["v"] is False, "merged PRs must NOT consult the three-dot path"


def test_merged_pr_get_diff_uses_merge_commit_parents(monkeypatch):
"""Symmetric to has_changes: get_diff for a merged PR resolves
via the merge commit's parents, not three-dot. Without this,
the dedupe agent received an empty diff for any merged PR
cube was asked to review post-merge."""
called = {"three_dot": False, "merge_commit_diff": False}

def _fake_run(cmd, *args, **kwargs):
if cmd[:3] == ["gh", "pr", "view"]:
return subprocess.CompletedProcess(
cmd,
0,
stdout=_gh_view_stub(state="MERGED", merged_at="2026-05-24T13:06:25Z", merge_commit_oid="abc123"),
stderr="",
)
if cmd[:2] == ["git", "fetch"]:
return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="")
if cmd[:3] == ["git", "rev-parse", "--verify"]:
return subprocess.CompletedProcess(cmd, 0, stdout="abc123\n", stderr="")
if cmd[:3] == ["git", "diff", "--no-color"]:
args_after_no_color = cmd[3:]
if any("..." in a for a in args_after_no_color):
called["three_dot"] = True
return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="")
# Merge-commit parents form: ^1 and ^2
if any(a.endswith("^1") for a in args_after_no_color):
called["merge_commit_diff"] = True
return subprocess.CompletedProcess(cmd, 0, stdout="diff --git a/x b/x\n+merged content\n", stderr="")
return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="")

monkeypatch.setattr(pulls, "check_gh_installed", lambda: True)
monkeypatch.setattr(subprocess, "run", _fake_run)

data = pulls.fetch_pr(1234)
diff = data.get_diff()
assert "merged content" in diff
assert called["merge_commit_diff"] is True
assert called["three_dot"] is False


def test_has_changes_defaults_to_true_on_git_failure(monkeypatch):
"""If git can't answer (offline, no origin, etc), assume the PR
has changes — better than silently skipping a real PR because the
Expand Down
Loading