From 073646626243af10430875aaac6e71f9f02e93c3 Mon Sep 17 00:00:00 2001 From: Suhaib Mujahid Date: Sat, 31 Jan 2026 13:17:46 -0500 Subject: [PATCH 01/10] Refactor to use PhabricatorPatch for patch retrieval Replaced usage of PhabricatorReviewData.get_patch_by_id with direct instantiation of PhabricatorPatch in both dataset creation and evaluation notebooks. --- notebooks/code_review_create_dataset.ipynb | 5 ++--- notebooks/code_review_evaluation.ipynb | 3 ++- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/notebooks/code_review_create_dataset.ipynb b/notebooks/code_review_create_dataset.ipynb index 96cdf91e42..a7d262dc01 100644 --- a/notebooks/code_review_create_dataset.ipynb +++ b/notebooks/code_review_create_dataset.ipynb @@ -94,16 +94,15 @@ "metadata": {}, "outputs": [], "source": [ - "from bugbug.tools.core.platforms.phabricator import PhabricatorReviewData\n", + "from bugbug.tools.core.platforms.phabricator import PhabricatorPatch\n", "from bugbug.tools.patch_summarization.agent import PatchSummarizationTool\n", "\n", "summarizer = PatchSummarizationTool.create()\n", - "review_data = PhabricatorReviewData()\n", "\n", "\n", "@weave.op()\n", "def summarize(diff_id):\n", - " patch = review_data.get_patch_by_id(diff_id)\n", + " patch = PhabricatorPatch(diff_id=diff_id)\n", " summaries[diff_id] = summarizer.run(patch)\n", "\n", "\n", diff --git a/notebooks/code_review_evaluation.ipynb b/notebooks/code_review_evaluation.ipynb index 3291d491e8..68a99d0cc5 100644 --- a/notebooks/code_review_evaluation.ipynb +++ b/notebooks/code_review_evaluation.ipynb @@ -63,6 +63,7 @@ "from functools import cached_property\n", "\n", "from bugbug.tools.code_review.agent import CodeReviewTool\n", + "from bugbug.tools.core.platforms.phabricator import PhabricatorPatch\n", "\n", "\n", "class CodeReviewModel(weave.Model):\n", @@ -74,7 +75,7 @@ "\n", " @weave.op()\n", " def invoke(self, diff_id: int, patch_summary: str) -> dict:\n", - " patch = self.tool.review_data.get_patch_by_id(diff_id)\n", + " patch = PhabricatorPatch(diff_id=diff_id)\n", " try:\n", " comments = self.tool.generate_review_comments(patch, patch_summary)\n", " return {\n", From aff45e8a1fc35690b53fe772a1cc1818d540b944 Mon Sep 17 00:00:00 2001 From: Suhaib Mujahid Date: Thu, 29 Jan 2026 15:25:38 -0500 Subject: [PATCH 02/10] Remove unused run_by_diff_id() and related code --- bugbug/tools/code_review/agent.py | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/bugbug/tools/code_review/agent.py b/bugbug/tools/code_review/agent.py index 92e2b3f366..8c25df2ac2 100644 --- a/bugbug/tools/code_review/agent.py +++ b/bugbug/tools/code_review/agent.py @@ -41,7 +41,7 @@ from bugbug.tools.core.data_types import InlineComment from bugbug.tools.core.exceptions import LargeDiffError, ModelResultError from bugbug.tools.core.llms import get_tokenizer -from bugbug.tools.core.platforms.base import Patch, ReviewData +from bugbug.tools.core.platforms.base import Patch logger = getLogger(__name__) @@ -84,7 +84,6 @@ def __init__( llm: BaseChatModel, patch_summarizer: PatchSummarizer, suggestion_filterer: SuggestionFilterer, - review_data: ReviewData, function_search: Optional[FunctionSearch] = None, review_comments_db: Optional["ReviewCommentsDB"] = None, show_patch_example: bool = False, @@ -95,8 +94,6 @@ def __init__( self.target_software = target_software - self.review_data = review_data - self._tokenizer = get_tokenizer( llm.model_name if hasattr(llm, "model_name") else "" ) @@ -156,11 +153,6 @@ def create(cls, **kwargs): QdrantVectorDB("diff_comments") ) - if "review_data" not in kwargs: - from bugbug.tools.core.platforms.phabricator import PhabricatorReviewData - - kwargs["review_data"] = PhabricatorReviewData() - if "llm" not in kwargs: from bugbug.tools.core.llms import create_anthropic_llm @@ -218,10 +210,6 @@ def generate_review_comments( return result["structured_response"].comments - def run_by_diff_id(self, diff_id: str | int) -> list[InlineComment] | None: - patch = self.review_data.get_patch_by_id(diff_id) - return self.run(patch) - def run(self, patch: Patch) -> list[InlineComment] | None: if self.count_tokens(patch.raw_diff) > 21000: raise LargeDiffError("The diff is too large") From cf69cc1ee2919fb49ce92b83c588164424787b55 Mon Sep 17 00:00:00 2001 From: Suhaib Mujahid Date: Fri, 30 Jan 2026 19:45:01 -0500 Subject: [PATCH 03/10] Remove unused get_review_request_by_id methods Eliminated the abstract and concrete implementations of get_review_request_by_id from ReviewData and its subclasses, as they are no longer used. --- bugbug/tools/core/platforms/base.py | 4 ---- bugbug/tools/core/platforms/phabricator.py | 7 +------ bugbug/tools/core/platforms/swarm.py | 5 +---- 3 files changed, 2 insertions(+), 14 deletions(-) diff --git a/bugbug/tools/core/platforms/base.py b/bugbug/tools/core/platforms/base.py index 0805e3b25a..309c142101 100644 --- a/bugbug/tools/core/platforms/base.py +++ b/bugbug/tools/core/platforms/base.py @@ -74,10 +74,6 @@ class ReviewData(ABC): NIT_PATTERN = re.compile(r"[^a-zA-Z0-9]nit[\s:,]", re.IGNORECASE) - @abstractmethod - def get_review_request_by_id(self, review_id: int): - raise NotImplementedError - @abstractmethod def get_patch_by_id(self, patch_id: str | int) -> Patch: raise NotImplementedError diff --git a/bugbug/tools/core/platforms/phabricator.py b/bugbug/tools/core/platforms/phabricator.py index 164491b985..8ff2b6e16f 100644 --- a/bugbug/tools/core/platforms/phabricator.py +++ b/bugbug/tools/core/platforms/phabricator.py @@ -15,7 +15,7 @@ from tqdm import tqdm from bugbug import db, phabricator, utils -from bugbug.tools.core.data_types import InlineComment, ReviewRequest +from bugbug.tools.core.data_types import InlineComment from bugbug.tools.core.platforms.base import Patch, ReviewData from bugbug.tools.core.platforms.bugzilla import Bug from bugbug.utils import get_secret @@ -646,11 +646,6 @@ def __init__(self): get_secret("PHABRICATOR_URL"), get_secret("PHABRICATOR_TOKEN") ) - def get_review_request_by_id(self, revision_id: int) -> ReviewRequest: - revisions = phabricator.get(rev_ids=[int(revision_id)]) - assert len(revisions) == 1 - return ReviewRequest(revisions[0]["fields"]["diffID"]) - @tenacity.retry( stop=tenacity.stop_after_attempt(7), wait=tenacity.wait_exponential(multiplier=2, min=2), diff --git a/bugbug/tools/core/platforms/swarm.py b/bugbug/tools/core/platforms/swarm.py index 13cca92fda..0143aa2ecc 100644 --- a/bugbug/tools/core/platforms/swarm.py +++ b/bugbug/tools/core/platforms/swarm.py @@ -9,7 +9,7 @@ from functools import cached_property from typing import Iterable -from bugbug.tools.core.data_types import InlineComment, ReviewRequest +from bugbug.tools.core.data_types import InlineComment from bugbug.tools.core.platforms.base import Patch, ReviewData from bugbug.utils import get_secret @@ -78,9 +78,6 @@ def __init__(self): "instance": get_secret("SWARM_INSTANCE"), } - def get_review_request_by_id(self, revision_id: int) -> ReviewRequest: - return ReviewRequest(revision_id) - def get_patch_by_id(self, patch_id: str | int) -> Patch: return SwarmPatch(str(patch_id), self.auth) From 4258a5693f2c4bc9fd0bfb3a8fe1f6e9eccbd256 Mon Sep 17 00:00:00 2001 From: Suhaib Mujahid Date: Fri, 30 Jan 2026 20:00:28 -0500 Subject: [PATCH 04/10] Refactor Phabricator API client usage Introduces a cached `get_phabricator_client` function to manage Phabricator API client instantiation and replaces direct usage of `phabricator.PHABRICATOR_API` with the new client throughout the module. Removes legacy secret fetching and updates environment variable handling for backward compatibility. --- bugbug/tools/core/platforms/phabricator.py | 51 +++++++++++++--------- 1 file changed, 30 insertions(+), 21 deletions(-) diff --git a/bugbug/tools/core/platforms/phabricator.py b/bugbug/tools/core/platforms/phabricator.py index 8ff2b6e16f..6bfa9a3f05 100644 --- a/bugbug/tools/core/platforms/phabricator.py +++ b/bugbug/tools/core/platforms/phabricator.py @@ -5,9 +5,10 @@ """Phabricator platform implementation for code review.""" +import os from collections import defaultdict from datetime import datetime -from functools import cached_property +from functools import cache, cached_property from logging import getLogger from typing import Iterable, Optional @@ -18,7 +19,6 @@ from bugbug.tools.core.data_types import InlineComment from bugbug.tools.core.platforms.base import Patch, ReviewData from bugbug.tools.core.platforms.bugzilla import Bug -from bugbug.utils import get_secret logger = getLogger(__name__) @@ -29,12 +29,27 @@ UNTRUSTED_CONTENT_REDACTED = "[Content from untrusted user removed for security]" +@cache +def get_phabricator_client(api_key: Optional[str] = None, url: Optional[str] = None): + """Get a cached Phabricator client instance.""" + from libmozdata.phabricator import PhabricatorAPI + + # Fallback to old environment variable names for backward compatibility + if not api_key: + api_key = os.getenv("PHABRICATOR_KEY") or os.getenv("BUGBUG_PHABRICATOR_TOKEN") + + if not url: + url = os.getenv("PHABRICATOR_URL") or os.getenv("BUGBUG_PHABRICATOR_URL") + + return PhabricatorAPI(api_key, url) + + def _get_users_info_batch_impl(user_phids: set[str]) -> dict[str, dict]: """Internal implementation for fetching user information. This function is retried by the wrapper function. """ - assert phabricator.PHABRICATOR_API is not None + phabricator = get_phabricator_client() if not user_phids: return {} @@ -42,13 +57,13 @@ def _get_users_info_batch_impl(user_phids: set[str]) -> dict[str, dict]: logger.info(f"Fetching user info for {len(user_phids)} PHIDs") # Get user names and nick - users_response = phabricator.PHABRICATOR_API.request( + users_response = phabricator.request( "user.search", constraints={"phids": list(user_phids)}, ) # Get MOCO group members - moco_response = phabricator.PHABRICATOR_API.request( + moco_response = phabricator.request( "project.search", constraints={"phids": [MOCO_GROUP_PHID]}, attachments={"members": True}, @@ -315,11 +330,11 @@ def get_old_file(self, file_path: str) -> str: @cached_property def _changesets(self) -> list[dict]: - assert phabricator.PHABRICATOR_API is not None + phabricator = get_phabricator_client() diff = self._diff_metadata - changesets = phabricator.PHABRICATOR_API.request( + changesets = phabricator.request( "differential.changeset.search", constraints={"diffPHIDs": [diff["phid"]]}, )["data"] @@ -328,8 +343,8 @@ def _changesets(self) -> list[dict]: @cached_property def raw_diff(self) -> str: - assert phabricator.PHABRICATOR_API is not None - raw_diff = phabricator.PHABRICATOR_API.load_raw_diff(self.diff_id) + phabricator = get_phabricator_client() + raw_diff = phabricator.load_raw_diff(self.diff_id) return raw_diff @@ -345,8 +360,8 @@ def _commit_available(commit_hash: str) -> bool: @cached_property def _diff_metadata(self) -> dict: - assert phabricator.PHABRICATOR_API is not None - diffs = phabricator.PHABRICATOR_API.search_diffs(diff_id=self.diff_id) + phabricator = get_phabricator_client() + diffs = phabricator.search_diffs(diff_id=self.diff_id) assert len(diffs) == 1 diff = diffs[0] @@ -399,12 +414,12 @@ def date_modified(self) -> datetime: @cached_property def _revision_metadata(self) -> dict: - assert phabricator.PHABRICATOR_API is not None + phabricator = get_phabricator_client() # We pass either the revision PHID or the revision ID, not both. revision_phid = self.revision_phid if not self._revision_id else None - revision = phabricator.PHABRICATOR_API.load_revision( + revision = phabricator.load_revision( rev_phid=revision_phid, rev_id=self._revision_id ) @@ -435,9 +450,9 @@ def revision_id(self) -> int: return self._revision_metadata["id"] def _get_transactions(self) -> list[dict]: - assert phabricator.PHABRICATOR_API is not None + phabricator = get_phabricator_client() - transactions = phabricator.PHABRICATOR_API.request( + transactions = phabricator.request( "transaction.search", objectIdentifier=self._revision_metadata["phid"], )["data"] @@ -640,12 +655,6 @@ def to_md(self) -> str: class PhabricatorReviewData(ReviewData): - def __init__(self): - super().__init__() - phabricator.set_api_key( - get_secret("PHABRICATOR_URL"), get_secret("PHABRICATOR_TOKEN") - ) - @tenacity.retry( stop=tenacity.stop_after_attempt(7), wait=tenacity.wait_exponential(multiplier=2, min=2), From a0bbac558132139e0d2eca57cf08190990be09ce Mon Sep 17 00:00:00 2001 From: Suhaib Mujahid Date: Fri, 30 Jan 2026 20:02:02 -0500 Subject: [PATCH 05/10] Move db and phabricator imports to method scope Relocated the imports of db and phabricator modules inside the get_all_inline_comments method to avoid unnecessary top-level imports. This change helps reduce import overhead. --- bugbug/tools/core/platforms/phabricator.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/bugbug/tools/core/platforms/phabricator.py b/bugbug/tools/core/platforms/phabricator.py index 6bfa9a3f05..dbb2f5e43b 100644 --- a/bugbug/tools/core/platforms/phabricator.py +++ b/bugbug/tools/core/platforms/phabricator.py @@ -15,7 +15,7 @@ import tenacity from tqdm import tqdm -from bugbug import db, phabricator, utils +from bugbug import utils from bugbug.tools.core.data_types import InlineComment from bugbug.tools.core.platforms.base import Patch, ReviewData from bugbug.tools.core.platforms.bugzilla import Bug @@ -666,6 +666,8 @@ def get_patch_by_id(self, patch_id: str | int) -> Patch: def get_all_inline_comments( self, comment_filter ) -> Iterable[tuple[int, list[InlineComment]]]: + from bugbug import db, phabricator + db.download(phabricator.REVISIONS_DB) for revision in tqdm( From 0220d45c3a6b69258d2cbdb8e275f7dde4d2e23b Mon Sep 17 00:00:00 2001 From: Suhaib Mujahid Date: Fri, 30 Jan 2026 21:30:49 -0500 Subject: [PATCH 06/10] Remove Swarm platform implementation The implementation is not complete. It can live outside bugbug. --- bugbug/tools/code_review/__init__.py | 9 --- bugbug/tools/core/platforms/swarm.py | 88 ---------------------------- 2 files changed, 97 deletions(-) delete mode 100644 bugbug/tools/core/platforms/swarm.py diff --git a/bugbug/tools/code_review/__init__.py b/bugbug/tools/code_review/__init__.py index 23f5ff7704..3d9a28481b 100644 --- a/bugbug/tools/code_review/__init__.py +++ b/bugbug/tools/code_review/__init__.py @@ -43,13 +43,6 @@ from bugbug.tools.core.platforms.phabricator import ( PhabricatorReviewData, ) -from bugbug.tools.core.platforms.swarm import SwarmReviewData - -# Legacy compatibility -review_data_classes = { - "phabricator": PhabricatorReviewData, - "swarm": SwarmReviewData, -} __all__ = [ # Agent @@ -71,6 +64,4 @@ "ReviewData", # Phabricator "PhabricatorReviewData", - # Legacy - "review_data_classes", ] diff --git a/bugbug/tools/core/platforms/swarm.py b/bugbug/tools/core/platforms/swarm.py deleted file mode 100644 index 0143aa2ecc..0000000000 --- a/bugbug/tools/core/platforms/swarm.py +++ /dev/null @@ -1,88 +0,0 @@ -# -*- coding: utf-8 -*- -# This Source Code Form is subject to the terms of the Mozilla Public -# License, v. 2.0. If a copy of the MPL was not distributed with this file, -# You can obtain one at http://mozilla.org/MPL/2.0/. - -"""Swarm platform implementation for code review.""" - -from datetime import datetime -from functools import cached_property -from typing import Iterable - -from bugbug.tools.core.data_types import InlineComment -from bugbug.tools.core.platforms.base import Patch, ReviewData -from bugbug.utils import get_secret - - -class SwarmPatch(Patch): - def __init__(self, patch_id: str, auth: dict) -> None: - self._patch_id = patch_id - self.auth = auth - self.rev_id = int(patch_id) - - @property - def patch_id(self) -> str: - return self._patch_id - - @cached_property - def _revision_metadata(self) -> dict: - import swarm - - revisions = swarm.get(self.auth, rev_ids=[self.rev_id]) - assert len(revisions) == 1 - - return revisions[0] - - @property - def raw_diff(self) -> str: - revision = self._revision_metadata - - return revision["fields"]["diff"] - - @cached_property - def base_commit_hash(self) -> str: - raise NotImplementedError - - @property - def date_created(self) -> datetime: - revision = self._revision_metadata - - return datetime.fromtimestamp(revision["fields"]["created"]) - - @cached_property - def patch_title(self) -> str: - raise NotImplementedError - - @property - def patch_description(self) -> str: - raise NotImplementedError - - @cached_property - def bug_title(self) -> str: - raise NotImplementedError - - @property - def patch_url(self) -> str: - raise NotImplementedError - - def get_old_file(self, file_path: str) -> str: - raise NotImplementedError - - -class SwarmReviewData(ReviewData): - def __init__(self): - self.auth = { - "user": get_secret("SWARM_USER"), - "password": get_secret("SWARM_PASS"), - "port": get_secret("SWARM_PORT"), - "instance": get_secret("SWARM_INSTANCE"), - } - - def get_patch_by_id(self, patch_id: str | int) -> Patch: - return SwarmPatch(str(patch_id), self.auth) - - def get_all_inline_comments( - self, comment_filter - ) -> Iterable[tuple[int, list[InlineComment]]]: - # Todo - raise NotImplementedError From 22dafb0703b8f60e1e5dd35299b87ce91e9db527 Mon Sep 17 00:00:00 2001 From: Suhaib Mujahid Date: Mon, 2 Feb 2026 22:05:18 -0500 Subject: [PATCH 07/10] Robustify scorer summaries and simplify invoke --- bugbug/tools/code_review/scorer.py | 47 +++++++++++++++----------- notebooks/code_review_evaluation.ipynb | 15 +++----- 2 files changed, 31 insertions(+), 31 deletions(-) diff --git a/bugbug/tools/code_review/scorer.py b/bugbug/tools/code_review/scorer.py index fac5aab475..5a6e78267d 100644 --- a/bugbug/tools/code_review/scorer.py +++ b/bugbug/tools/code_review/scorer.py @@ -30,16 +30,19 @@ def score( "ground_truth_valid_count": valid_comment_count, "ground_truth_invalid_count": invalid_comment_count, "ground_truth_total_count": len(ground_truth_comments), - "successful": not output["error"], } def summarize(self, score_rows: list[dict]) -> dict: """Aggregate scores across all examples.""" - total_generated = sum(r["generated_comment_count"] for r in score_rows) - total_gt_valid = sum(r["ground_truth_valid_count"] for r in score_rows) - total_gt_invalid = sum(r["ground_truth_invalid_count"] for r in score_rows) - total_gt = sum(r["ground_truth_total_count"] for r in score_rows) - error_count = sum(not r["successful"] for r in score_rows) + total_examples = len(score_rows) + total_generated = sum(r.get("generated_comment_count", 0) for r in score_rows) + total_gt_valid = sum(r.get("ground_truth_valid_count", 0) for r in score_rows) + total_gt_invalid = sum( + r.get("ground_truth_invalid_count", 0) for r in score_rows + ) + total_gt = sum(r.get("ground_truth_total_count", 0) for r in score_rows) + successful_runs = sum("generated_comment_count" in r for r in score_rows) + error_count = total_examples - successful_runs return { "total_generated_comments": total_generated, @@ -47,10 +50,10 @@ def summarize(self, score_rows: list[dict]) -> dict: "total_ground_truth_invalid": total_gt_invalid, "total_ground_truth": total_gt, "avg_generated_per_diff": ( - total_generated / len(score_rows) if score_rows else 0 + total_generated / successful_runs if successful_runs else 0 ), - "error_rate": error_count / len(score_rows) if score_rows else 0, - "num_examples": len(score_rows), + "error_rate": error_count / total_examples if total_examples else 0, + "num_examples": total_examples, } @@ -210,36 +213,40 @@ def score( } def summarize(self, score_rows: list[dict]) -> dict: - total_matched_valid = sum(r["matched_valid_count"] for r in score_rows) - total_matched_invalid = sum(r["matched_invalid_count"] for r in score_rows) - total_unmatched_gen = sum(r["unmatched_generated_count"] for r in score_rows) + total_matched_valid = sum(r.get("matched_valid_count", 0) for r in score_rows) + total_matched_invalid = sum( + r.get("matched_invalid_count", 0) for r in score_rows + ) + total_unmatched_gen = sum( + r.get("unmatched_generated_count", 0) for r in score_rows + ) total_unmatched_gt_valid = sum( - r["unmatched_ground_truth_valid_count"] for r in score_rows + r.get("unmatched_ground_truth_valid_count", 0) for r in score_rows ) total_unmatched_gt_invalid = sum( - r["unmatched_ground_truth_invalid_count"] for r in score_rows + r.get("unmatched_ground_truth_invalid_count", 0) for r in score_rows ) total_gt_valid = total_matched_valid + total_unmatched_gt_valid total_gt_invalid = total_matched_invalid + total_unmatched_gt_invalid # Filtering aggregates - total_retained = sum(r["filtering_retained_count"] for r in score_rows) - total_excluded = sum(r["filtering_excluded_count"] for r in score_rows) + total_retained = sum(r.get("filtering_retained_count", 0) for r in score_rows) + total_excluded = sum(r.get("filtering_excluded_count", 0) for r in score_rows) total_generated = total_retained + total_excluded # Filtering x Matching aggregates (use len() since values are lists) total_matched_valid_retained = sum( - len(r["matched_valid_retained"]) for r in score_rows + len(r.get("matched_valid_retained", [])) for r in score_rows ) total_matched_valid_excluded = sum( - len(r["matched_valid_excluded"]) for r in score_rows + len(r.get("matched_valid_excluded", [])) for r in score_rows ) total_matched_invalid_retained = sum( - len(r["matched_invalid_retained"]) for r in score_rows + len(r.get("matched_invalid_retained", [])) for r in score_rows ) total_matched_invalid_excluded = sum( - len(r["matched_invalid_excluded"]) for r in score_rows + len(r.get("matched_invalid_excluded", [])) for r in score_rows ) return { diff --git a/notebooks/code_review_evaluation.ipynb b/notebooks/code_review_evaluation.ipynb index 68a99d0cc5..86e499eabe 100644 --- a/notebooks/code_review_evaluation.ipynb +++ b/notebooks/code_review_evaluation.ipynb @@ -76,17 +76,10 @@ " @weave.op()\n", " def invoke(self, diff_id: int, patch_summary: str) -> dict:\n", " patch = PhabricatorPatch(diff_id=diff_id)\n", - " try:\n", - " comments = self.tool.generate_review_comments(patch, patch_summary)\n", - " return {\n", - " \"comments\": comments,\n", - " \"error\": None,\n", - " }\n", - " except Exception as e:\n", - " return {\n", - " \"comments\": [],\n", - " \"error\": str(e),\n", - " }\n", + " comments = self.tool.generate_review_comments(patch, patch_summary)\n", + " return {\n", + " \"comments\": comments,\n", + " }\n", "\n", "\n", "model = CodeReviewModel()" From aff9fa83dc2e5b1694f4b16bd86caef773b5f8a6 Mon Sep 17 00:00:00 2001 From: Suhaib Mujahid Date: Sat, 31 Jan 2026 13:24:31 -0500 Subject: [PATCH 08/10] Refactor CodeReviewTool to run async Converted generate_review_comments and run methods in CodeReviewTool to async and updated internal calls to use async/await. This prepares the tool for asynchronous operations, improving scalability and responsiveness. --- bugbug/tools/code_review/agent.py | 10 ++++++---- notebooks/code_review_evaluation.ipynb | 4 ++-- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/bugbug/tools/code_review/agent.py b/bugbug/tools/code_review/agent.py index 8c25df2ac2..fe98969df0 100644 --- a/bugbug/tools/code_review/agent.py +++ b/bugbug/tools/code_review/agent.py @@ -188,11 +188,11 @@ def generate_initial_prompt(self, patch: Patch, patch_summary: str) -> str: approved_examples=self._get_generated_examples(patch, created_before), ) - def generate_review_comments( + async def generate_review_comments( self, patch: Patch, patch_summary: str ) -> list[GeneratedReviewComment]: try: - for chunk in self.agent.stream( + async for chunk in self.agent.astream( { "messages": [ HumanMessage( @@ -210,13 +210,15 @@ def generate_review_comments( return result["structured_response"].comments - def run(self, patch: Patch) -> list[InlineComment] | None: + async def run(self, patch: Patch) -> list[InlineComment] | None: if self.count_tokens(patch.raw_diff) > 21000: raise LargeDiffError("The diff is too large") patch_summary = self.patch_summarizer.run(patch) - unfiltered_suggestions = self.generate_review_comments(patch, patch_summary) + unfiltered_suggestions = await self.generate_review_comments( + patch, patch_summary + ) if not unfiltered_suggestions: logger.info("No suggestions were generated") return [] diff --git a/notebooks/code_review_evaluation.ipynb b/notebooks/code_review_evaluation.ipynb index 86e499eabe..52214b687f 100644 --- a/notebooks/code_review_evaluation.ipynb +++ b/notebooks/code_review_evaluation.ipynb @@ -74,9 +74,9 @@ " return CodeReviewTool.create()\n", "\n", " @weave.op()\n", - " def invoke(self, diff_id: int, patch_summary: str) -> dict:\n", + " async def invoke(self, diff_id: int, patch_summary: str) -> dict:\n", " patch = PhabricatorPatch(diff_id=diff_id)\n", - " comments = self.tool.generate_review_comments(patch, patch_summary)\n", + " comments = await self.tool.generate_review_comments(patch, patch_summary)\n", " return {\n", " \"comments\": comments,\n", " }\n", From 3040105daa2db6a4f86f351db14da5bf33c6ac9b Mon Sep 17 00:00:00 2001 From: Suhaib Mujahid Date: Tue, 3 Feb 2026 20:31:30 -0500 Subject: [PATCH 09/10] Remove some embedded tests from mozilla code_search. This is needed since base_commit_hash() needs to be async, and it is not worth changing this to support that here. --- bugbug/code_search/mozilla.py | 80 ----------------------------------- 1 file changed, 80 deletions(-) diff --git a/bugbug/code_search/mozilla.py b/bugbug/code_search/mozilla.py index 1c83af4578..c3f5cb2431 100644 --- a/bugbug/code_search/mozilla.py +++ b/bugbug/code_search/mozilla.py @@ -6,7 +6,6 @@ from bugbug.code_search.parser import FunctionSearchParser from bugbug.code_search.searchfox_api import FunctionSearchSearchfoxAPI from bugbug.code_search.searchfox_data import FunctionSearchSearchfoxData -from bugbug.tools.core.platforms.phabricator import PhabricatorPatch class FunctionSearchMozilla(FunctionSearch): @@ -87,82 +86,3 @@ def get_function_by_name( register_function_search("mozilla", FunctionSearchMozilla) - - -if __name__ == "__main__": - import sys - - from libmozdata.phabricator import PhabricatorAPI - - from bugbug.utils import get_secret, get_session, get_user_agent, setup_libmozdata - - setup_libmozdata() - - phabricator = PhabricatorAPI( - get_secret("PHABRICATOR_TOKEN"), get_secret("PHABRICATOR_URL") - ) - - def get_file(commit_hash, path): - r = get_session("hgmo").get( - f"https://hg.mozilla.org/mozilla-unified/raw-file/{commit_hash}/{path}", - headers={ - "User-Agent": get_user_agent(), - }, - ) - r.raise_for_status() - return r.text - - repo_dir = sys.argv[1] - - function_search_mozilla = FunctionSearchMozilla(repo_dir, get_file, False) - - # https://phabricator.services.mozilla.com/D199272?id=811858 - patch1 = PhabricatorPatch("811858") - - # In this case, the function was not used before the patch. - print( - function_search_mozilla.get_function_by_name( - patch1.base_commit_hash, - "dom/base/nsObjectLoadingContent.cpp", - "LowerCaseEqualsASCII", - ) - ) - - # In this case, the function was used before the patch. - print( - function_search_mozilla.get_function_by_name( - patch1.base_commit_hash, - "dom/base/nsObjectLoadingContent.cpp", - "HtmlObjectContentTypeForMIMEType", - ) - ) - - # https://phabricator.services.mozilla.com/D199248?id=811740 - patch2 = PhabricatorPatch("811740") - - # In this case, it is a JS file. - print( - function_search_mozilla.get_function_by_name( - patch2.base_commit_hash, - "testing/modules/XPCShellContentUtils.sys.mjs", - "registerPathHandler", - ) - ) - - patch3 = PhabricatorPatch("721783") - - print( - function_search_mozilla.get_function_by_name( - patch3.base_commit_hash, - "dom/performance/Performance.cpp", - "Performance::MemoryPressure", - ) - ) - - patch4 = PhabricatorPatch("736446") - - function_search_mozilla.get_function_by_line( - patch4.base_commit_hash, - "browser/base/content/test/webrtc/browser_devices_select_audio_output.js", - 180, - ) From bc9141ca8a585efc72dcae96aa5f2cc985dd08c3 Mon Sep 17 00:00:00 2001 From: Suhaib Mujahid Date: Sat, 31 Jan 2026 15:21:16 -0500 Subject: [PATCH 10/10] Refactor PhabricatorPatch to use async HTTP requests Converted file retrieval methods in PhabricatorPatch and related interfaces to async, replacing synchronous HTTP calls with httpx.AsyncClient. --- bugbug/tools/code_review/langchain_tools.py | 4 +- bugbug/tools/core/connection.py | 40 +++++++++++++ bugbug/tools/core/platforms/base.py | 6 +- bugbug/tools/core/platforms/phabricator.py | 62 +++++++++++---------- requirements.txt | 2 + tests/test_phabricator_trusted_filtering.py | 10 +++- 6 files changed, 86 insertions(+), 38 deletions(-) create mode 100644 bugbug/tools/core/connection.py diff --git a/bugbug/tools/code_review/langchain_tools.py b/bugbug/tools/code_review/langchain_tools.py index 08b296028d..7a4f6cf642 100644 --- a/bugbug/tools/code_review/langchain_tools.py +++ b/bugbug/tools/code_review/langchain_tools.py @@ -20,7 +20,7 @@ class CodeReviewContext: @tool -def expand_context(file_path: str, start_line: int, end_line: int) -> str: +async def expand_context(file_path: str, start_line: int, end_line: int) -> str: """Show the content of a file between specified line numbers as it is before the patch. Be careful to not fill your context window with too much data. Request the @@ -38,7 +38,7 @@ def expand_context(file_path: str, start_line: int, end_line: int) -> str: runtime = get_runtime(CodeReviewContext) try: - file_content = runtime.context.patch.get_old_file(file_path) + file_content = await runtime.context.patch.get_old_file(file_path) except FileNotFoundError: return "File not found in the repository before the patch." diff --git a/bugbug/tools/core/connection.py b/bugbug/tools/core/connection.py new file mode 100644 index 0000000000..83168ed9dd --- /dev/null +++ b/bugbug/tools/core/connection.py @@ -0,0 +1,40 @@ +# -*- coding: utf-8 -*- +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this file, +# You can obtain one at http://mozilla.org/MPL/2.0/. + +"""Core connection utilities for bugbug tools.""" + +import os +from functools import cache + +import httpx + + +def get_user_agent() -> str: + """Get the User-Agent string from environment or default.""" + return os.getenv("USER_AGENT", "bugbug") + + +@cache +def get_http_client() -> httpx.AsyncClient: + """Get the shared HTTP client instance.""" + http_client = httpx.AsyncClient( + follow_redirects=True, + headers={ + "User-Agent": get_user_agent(), + }, + ) + + return http_client + + +async def close_http_client() -> None: + """Close the shared HTTP client instance and clear the cache.""" + if get_http_client.cache_info().currsize == 0: + # No cached client to close + return + + client = get_http_client() + get_http_client.cache_clear() + await client.aclose() diff --git a/bugbug/tools/core/platforms/base.py b/bugbug/tools/core/platforms/base.py index 309c142101..01fea6eee3 100644 --- a/bugbug/tools/core/platforms/base.py +++ b/bugbug/tools/core/platforms/base.py @@ -23,10 +23,6 @@ class Patch(ABC): @abstractmethod def patch_id(self) -> str: ... - @property - @abstractmethod - def base_commit_hash(self) -> str: ... - @property @abstractmethod def raw_diff(self) -> str: ... @@ -64,7 +60,7 @@ def patch_url(self) -> str: ... @abstractmethod - def get_old_file(self, file_path: str) -> str: + async def get_old_file(self, file_path: str) -> str: """Return the contents of a file before the patch was applied.""" ... diff --git a/bugbug/tools/core/platforms/phabricator.py b/bugbug/tools/core/platforms/phabricator.py index dbb2f5e43b..7bd5f29dd2 100644 --- a/bugbug/tools/core/platforms/phabricator.py +++ b/bugbug/tools/core/platforms/phabricator.py @@ -13,9 +13,10 @@ from typing import Iterable, Optional import tenacity +from async_lru import alru_cache from tqdm import tqdm -from bugbug import utils +from bugbug.tools.core.connection import get_http_client, get_user_agent from bugbug.tools.core.data_types import InlineComment from bugbug.tools.core.platforms.base import Patch, ReviewData from bugbug.tools.core.platforms.bugzilla import Bug @@ -30,8 +31,13 @@ @cache -def get_phabricator_client(api_key: Optional[str] = None, url: Optional[str] = None): +def get_phabricator_client( + api_key: Optional[str] = None, + url: Optional[str] = None, + user_agent: Optional[str] = None, +): """Get a cached Phabricator client instance.""" + from libmozdata.config import set_default_value from libmozdata.phabricator import PhabricatorAPI # Fallback to old environment variable names for backward compatibility @@ -41,6 +47,12 @@ def get_phabricator_client(api_key: Optional[str] = None, url: Optional[str] = N if not url: url = os.getenv("PHABRICATOR_URL") or os.getenv("BUGBUG_PHABRICATOR_URL") + if not user_agent: + user_agent = get_user_agent() + + # This is awkward since PhabricatorAPI does not accept user agent directly + set_default_value("User-Agent", "name", user_agent) + return PhabricatorAPI(api_key, url) @@ -281,7 +293,7 @@ def revision_phid(self) -> str: raise ValueError("Cannot determine revision PHID") - def _get_file_from_patch(self, file_path: str, is_before_patch: bool) -> str: + async def _get_file_from_patch(self, file_path: str, is_before_patch: bool) -> str: for changeset in self._changesets: if changeset["fields"]["path"]["displayPath"] == file_path: break @@ -291,22 +303,18 @@ def _get_file_from_patch(self, file_path: str, is_before_patch: bool) -> str: changeset_id = changeset["id"] view = "old" if is_before_patch else "new" - r = utils.get_session("phabricator_web").get( + client = get_http_client() + r = await client.get( f"https://phabricator.services.mozilla.com/differential/changeset/?view={view}&ref={changeset_id}", - headers={ - "User-Agent": utils.get_user_agent(), - }, ) r.raise_for_status() return r.text - def _get_file_from_repo(self, file_path: str, commit_hash: str) -> str: - r = utils.get_session("hgmo").get( + async def _get_file_from_repo(self, file_path: str, commit_hash: str) -> str: + client = get_http_client() + r = await client.get( f"https://hg.mozilla.org/mozilla-unified/raw-file/{commit_hash}/{file_path}", - headers={ - "User-Agent": utils.get_user_agent(), - }, ) if r.status_code == 404: @@ -317,15 +325,15 @@ def _get_file_from_repo(self, file_path: str, commit_hash: str) -> str: r.raise_for_status() return r.text - def get_old_file(self, file_path: str) -> str: + async def get_old_file(self, file_path: str) -> str: if file_path.startswith("b/") or file_path.startswith("a/"): file_path = file_path[2:] try: - return self._get_file_from_patch(file_path, is_before_patch=True) + return await self._get_file_from_patch(file_path, is_before_patch=True) except FileNotFoundError: - return self._get_file_from_repo( - file_path, commit_hash=self.base_commit_hash + return await self._get_file_from_repo( + file_path, commit_hash=await self.get_base_commit_hash() ) @cached_property @@ -349,14 +357,12 @@ def raw_diff(self) -> str: return raw_diff @staticmethod - def _commit_available(commit_hash: str) -> bool: - r = utils.get_session("hgmo").get( + async def _commit_available(commit_hash: str) -> bool: + client = get_http_client() + r = await client.get( f"https://hg.mozilla.org/mozilla-unified/json-rev/{commit_hash}", - headers={ - "User-Agent": utils.get_user_agent(), - }, ) - return r.ok + return r.is_success @cached_property def _diff_metadata(self) -> dict: @@ -367,13 +373,13 @@ def _diff_metadata(self) -> dict: return diff - @cached_property - def base_commit_hash(self) -> str: + @alru_cache + async def get_base_commit_hash(self) -> str: diff = self._diff_metadata try: base_commit_hash = diff["refs"]["base"]["identifier"] - if self._commit_available(base_commit_hash): + if await self._commit_available(base_commit_hash): return base_commit_hash except KeyError: pass @@ -382,11 +388,9 @@ def base_commit_hash(self) -> str: start_date = datetime.fromtimestamp(diff["dateCreated"] - 86400) end_date_str = end_date.strftime("%Y-%m-%d %H:%M:%S") start_date_str = start_date.strftime("%Y-%m-%d %H:%M:%S") - r = utils.get_session("hgmo").get( + client = get_http_client() + r = await client.get( f"https://hg.mozilla.org/mozilla-central/json-pushes?startdate={start_date_str}&enddate={end_date_str}&version=2&tipsonly=1", - headers={ - "User-Agent": utils.get_user_agent(), - }, ) pushes = r.json()["pushes"] closest_push = None diff --git a/requirements.txt b/requirements.txt index 5f1c2210aa..b245c1ed6f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,8 @@ amqp==5.3.1 +async-lru==2.1.0 beautifulsoup4==4.14.3 boto3==1.42.32 +httpx==0.28.1 imbalanced-learn==0.14.1 langchain==1.2.6 langchain-anthropic==1.3.1 diff --git a/tests/test_phabricator_trusted_filtering.py b/tests/test_phabricator_trusted_filtering.py index 98fb173a81..0f7689d116 100644 --- a/tests/test_phabricator_trusted_filtering.py +++ b/tests/test_phabricator_trusted_filtering.py @@ -369,7 +369,10 @@ def test_to_md_filters_untrusted_content_after_last_trusted(self): return_value="diff --git a/test.py b/test.py\n+hello" ) - with patch.object(phabricator, "PHABRICATOR_API", mock_api): + with patch( + "bugbug.tools.core.platforms.phabricator.get_phabricator_client", + return_value=mock_api, + ): patch_obj = PhabricatorPatch(diff_id=123456) md_output = patch_obj.to_md() @@ -435,7 +438,10 @@ def mock_request(method, **kwargs): return_value="diff --git a/test.py b/test.py\n+hi" ) - with patch.object(phabricator, "PHABRICATOR_API", mock_api): + with patch( + "bugbug.tools.core.platforms.phabricator.get_phabricator_client", + return_value=mock_api, + ): patch_obj = PhabricatorPatch(diff_id=123456) md_output = patch_obj.to_md()