diff --git a/src/repo_sync/workflows/approve_logic.py b/src/repo_sync/workflows/approve_logic.py index 73a6124..6499459 100644 --- a/src/repo_sync/workflows/approve_logic.py +++ b/src/repo_sync/workflows/approve_logic.py @@ -13,6 +13,7 @@ import json import logging import os +import subprocess import time from datetime import datetime, timezone @@ -108,17 +109,63 @@ def check_mergeability( return "unknown" -def approve_and_auto_merge(gh: GhOps, pr_number: int) -> None: - """Approve a clean PR and enable auto-merge.""" +# Retry parameters for enabling auto-merge. Enabling auto-merge can fail +# with a transient "Base branch was modified" GraphQL error when another +# PR merges into the base branch concurrently with our request. Retrying +# a few times with a short delay is enough to recover and avoids the +# merge queue getting stuck waiting for the next event to re-trigger this +# workflow. +_AUTO_MERGE_MAX_ATTEMPTS = 3 +_AUTO_MERGE_RETRY_DELAY_SECONDS = 10 +_AUTO_MERGE_RETRYABLE_ERROR = "Base branch was modified" + + +def approve_and_auto_merge( + gh: GhOps, + pr_number: int, + max_attempts: int = _AUTO_MERGE_MAX_ATTEMPTS, + retry_delay: int = _AUTO_MERGE_RETRY_DELAY_SECONDS, +) -> None: + """Approve a clean PR and enable auto-merge. + + Enabling auto-merge is retried on the transient "Base branch was + modified" GraphQL error, which happens when the base branch is updated + concurrently with our request (e.g., another sync PR in the queue + merging at the same moment). Retrying minimizes the chance of the + queue getting stuck. + """ gh._run([ "pr", "review", str(pr_number), "--repo", gh.repo, "--approve", "--body", "Clean sync — no conflicts.", ], check=False) - gh._run([ + merge_args = [ "pr", "merge", str(pr_number), "--repo", gh.repo, "--auto", "--squash", - ], check=False) + ] + for attempt in range(1, max_attempts + 1): + try: + gh._run(merge_args) + return + except subprocess.CalledProcessError as exc: + stderr = (exc.stderr or "").strip() + is_retryable = _AUTO_MERGE_RETRYABLE_ERROR in stderr + if not is_retryable or attempt == max_attempts: + # Match the original check=False behavior: log a warning and + # return rather than raising. The escalation workflow will + # eventually catch a PR stuck without auto-merge enabled. + logger.warning( + "gh %s failed (rc=%d) on attempt %d/%d: %s", + " ".join(merge_args), exc.returncode, + attempt, max_attempts, stderr, + ) + return + logger.info( + "Enabling auto-merge failed due to concurrent base-branch " + "update (attempt %d/%d). Retrying in %ds...", + attempt, max_attempts, retry_delay, + ) + time.sleep(retry_delay) def handle_conflict( diff --git a/tests/workflows/test_approve_logic.py b/tests/workflows/test_approve_logic.py new file mode 100644 index 0000000..5bc0df0 --- /dev/null +++ b/tests/workflows/test_approve_logic.py @@ -0,0 +1,114 @@ +"""Tests for repo_sync.workflows.approve_logic.""" + +from __future__ import annotations + +import subprocess +from unittest.mock import MagicMock + +import pytest + +from repo_sync.workflows import approve_logic +from repo_sync.workflows.approve_logic import approve_and_auto_merge + + +def _make_called_process_error(stderr: str) -> subprocess.CalledProcessError: + """Build a CalledProcessError matching what GhOps._run raises.""" + return subprocess.CalledProcessError( + returncode=1, + cmd=["gh", "pr", "merge"], + output="", + stderr=stderr, + ) + + +class TestApproveAndAutoMerge: + """Tests for the approve+auto-merge step, including retry behavior.""" + + def test_succeeds_on_first_attempt(self, monkeypatch: pytest.MonkeyPatch) -> None: + gh = MagicMock() + gh.repo = "owner/repo" + gh._run = MagicMock(return_value="") + + sleep_calls: list[float] = [] + monkeypatch.setattr(approve_logic.time, "sleep", sleep_calls.append) + + approve_and_auto_merge(gh, pr_number=42) + + # Approval and a single merge attempt; no retries. + assert gh._run.call_count == 2 + assert sleep_calls == [] + + def test_retries_on_base_branch_modified_then_succeeds( + self, monkeypatch: pytest.MonkeyPatch + ) -> None: + gh = MagicMock() + gh.repo = "owner/repo" + + retryable_err = _make_called_process_error( + "GraphQL: Base branch was modified. Review and try the merge " + "again. (mergePullRequest)" + ) + # First call is the approval (check=False, so it returns ""). The + # next call is the first auto-merge attempt (raises), and the third + # call is the retry (succeeds). + gh._run = MagicMock(side_effect=["", retryable_err, ""]) + + sleep_calls: list[float] = [] + monkeypatch.setattr(approve_logic.time, "sleep", sleep_calls.append) + + approve_and_auto_merge( + gh, pr_number=42, max_attempts=3, retry_delay=10 + ) + + # Approval + 2 merge attempts; one sleep of 10s between them. + assert gh._run.call_count == 3 + assert sleep_calls == [10] + + def test_gives_up_after_max_attempts_on_retryable_error( + self, monkeypatch: pytest.MonkeyPatch + ) -> None: + gh = MagicMock() + gh.repo = "owner/repo" + + retryable_err = _make_called_process_error( + "GraphQL: Base branch was modified. (mergePullRequest)" + ) + gh._run = MagicMock( + side_effect=["", retryable_err, retryable_err, retryable_err] + ) + + sleep_calls: list[float] = [] + monkeypatch.setattr(approve_logic.time, "sleep", sleep_calls.append) + + # Should NOT raise — match original check=False behavior of logging + # and returning so the queue can be unblocked by a later event. + approve_and_auto_merge( + gh, pr_number=42, max_attempts=3, retry_delay=10 + ) + + # Approval + 3 merge attempts; 2 sleeps (no sleep after the final + # attempt). + assert gh._run.call_count == 4 + assert sleep_calls == [10, 10] + + def test_does_not_retry_on_non_retryable_error( + self, monkeypatch: pytest.MonkeyPatch + ) -> None: + gh = MagicMock() + gh.repo = "owner/repo" + + non_retryable_err = _make_called_process_error( + "GraphQL: something else went wrong" + ) + gh._run = MagicMock(side_effect=["", non_retryable_err]) + + sleep_calls: list[float] = [] + monkeypatch.setattr(approve_logic.time, "sleep", sleep_calls.append) + + approve_and_auto_merge( + gh, pr_number=42, max_attempts=3, retry_delay=10 + ) + + # Approval + a single failed merge attempt; no retries, no sleeps. + assert gh._run.call_count == 2 + assert sleep_calls == []