diff --git a/bugbug/tools/code_review/agent.py b/bugbug/tools/code_review/agent.py index f04de3d860..c486f2b48f 100644 --- a/bugbug/tools/code_review/agent.py +++ b/bugbug/tools/code_review/agent.py @@ -24,6 +24,7 @@ AgentResponse, CodeReviewToolResponse, GeneratedReviewComment, + PatchScopeResponse, Skill, ) from bugbug.tools.code_review.database import ReviewCommentsDB @@ -36,6 +37,7 @@ CODE_REVIEW_TODO_PROMPT, CODE_REVIEW_TODO_TOOL_DESCRIPTION, FIRST_MESSAGE_TEMPLATE, + PATCH_SCOPE_PROMPT, STATIC_COMMENT_EXAMPLES, SYSTEM_PROMPT_TEMPLATE, TEMPLATE_COMMENT_EXAMPLE, @@ -124,6 +126,13 @@ def __init__( middleware=middleware, ) + # A separate, tool-less pass dedicated to assessing the patch's overall + # scope and suggesting a split when the patch is too large or unfocused. + self.scope_agent = create_agent( + llm, + response_format=ProviderStrategy(PatchScopeResponse), + ) + self.review_comments_db = review_comments_db self.show_patch_example = show_patch_example @@ -243,6 +252,21 @@ async def generate_review_comments( return result["structured_response"].comments, manifest + async def assess_patch_scope( + self, patch: Patch, patch_summary: str + ) -> list[GeneratedReviewComment]: + """Run a separate pass that suggests splitting an overly large/unfocused patch. + + Returns at most one comment. Empty when no split is warranted. + """ + prompt = PATCH_SCOPE_PROMPT.format( + target_software=self.target_software, + patch=format_patch_set(patch.patch_set), + patch_summary=patch_summary, + ) + result = await self.scope_agent.ainvoke({"messages": [HumanMessage(prompt)]}) + return result["structured_response"].comments[:1] + async def run(self, patch: Patch) -> CodeReviewToolResponse: if self.count_tokens(patch.raw_diff) > 21000: raise LargeDiffError("The diff is too large") @@ -258,8 +282,16 @@ async def run(self, patch: Patch) -> CodeReviewToolResponse: filtered_suggestions = self.suggestion_filterer.run(unfiltered_suggestions) + # The patch-scope suggestion is a deliberate, meta-level comment, so it + # bypasses the suggestion filterer and is always sorted last. + scope_suggestions = await self.assess_patch_scope(patch, patch_summary) + for offset, suggestion in enumerate(scope_suggestions): + suggestion.order = len(filtered_suggestions) + offset + 1 + inline_comments = list( - convert_generated_comments_to_inline(filtered_suggestions, patch.patch_set) + convert_generated_comments_to_inline( + filtered_suggestions + scope_suggestions, patch.patch_set + ) ) return CodeReviewToolResponse( @@ -268,6 +300,7 @@ async def run(self, patch: Patch) -> CodeReviewToolResponse: details={ "model": self._agent_model_name, "num_unfiltered_suggestions": len(unfiltered_suggestions), + "num_scope_suggestions": len(scope_suggestions), "external_content": external_content_manifest, }, ) diff --git a/bugbug/tools/code_review/data_types.py b/bugbug/tools/code_review/data_types.py index b6c56c1114..07d2c002fb 100644 --- a/bugbug/tools/code_review/data_types.py +++ b/bugbug/tools/code_review/data_types.py @@ -30,6 +30,17 @@ class AgentResponse(BaseModel): ) +class PatchScopeResponse(BaseModel): + """The response from the patch-scope assessment pass.""" + + comments: list[GeneratedReviewComment] = Field( + description=( + "At most one comment suggesting the patch be split into smaller, " + "independently reviewable pieces. Empty when no split is warranted." + ) + ) + + class CodeReviewToolResponse(BaseModel): """The response from the CodeReviewTool.""" diff --git a/bugbug/tools/code_review/prompts.py b/bugbug/tools/code_review/prompts.py index 8d6d94b61f..3b91556f7e 100644 --- a/bugbug/tools/code_review/prompts.py +++ b/bugbug/tools/code_review/prompts.py @@ -58,6 +58,37 @@ """ +PATCH_SCOPE_PROMPT = """You are an expert {target_software} engineer assessing whether a pull request is too large or unfocused to be reviewed well. + +Large changes are harder to review and empirically carry higher defect and regression risk: reviewer defect-detection drops sharply once a change grows past a few hundred changed lines, and at Mozilla the patches that introduce regressions tend to be the larger ones. + +Decide whether to suggest the author split this patch into smaller, independently reviewable pieces. Either of these is a reason to suggest a split: + 1. The patch bundles multiple INDEPENDENT, unrelated changes (e.g. two unrelated features, an unrelated refactor mixed with a behavior change, or several bug fixes that share no rationale) + 2. The patch is very large even if cohesive — past roughly a few hundred changed lines a single change becomes hard to review thoroughly, regardless of how unified it is + +Rules: +- Emit AT MOST ONE comment. If neither reason holds, return an empty list of comments. +- Use judgement: do NOT flag a moderately sized, cohesive change. Reserve this for patches whose size or mixed scope genuinely impedes careful review. +- A large change that has no natural seam and must land atomically is acceptable — return an empty list rather than suggesting an impractical split. +- If you do comment, name concrete seams: the distinct concerns, or the stages of a large cohesive change (e.g. land the data-model change separately from the call-site updates). +- Briefly tell the author *why*: larger patches get less thorough review and empirically introduce more bugs and regressions, so smaller patches are easier to review and safer to land. +- Anchor the comment to a representative changed line (a line that begins with `+`). Set `file` to that file's path and `code_line` to that line's number. +- Use direct, declarative language. NEVER use these banned phrases: "maybe", "might want to", "consider", "possibly", "could be", "you may want to". + +Here is a summary of the patch: + + +{patch_summary} + + +Here is the patch you need to assess: + + +{patch} + +""" + + FIRST_MESSAGE_TEMPLATE = """Here is a summary of the patch: diff --git a/tests/test_code_review.py b/tests/test_code_review.py index c571583b04..8a533766da 100644 --- a/tests/test_code_review.py +++ b/tests/test_code_review.py @@ -13,6 +13,8 @@ from bugbug.tools.code_review import data_types, langchain_tools, review_context from bugbug.tools.code_review.data_types import ( ExternalContent, + GeneratedReviewComment, + PatchScopeResponse, Skill, _strip_frontmatter, ) @@ -1258,3 +1260,95 @@ async def test_extra_context_toml_replaces_by_name(): assert len(results) == 1 assert results[0].name == ".claude/skills/dom-media-v2.md" assert results[0].body == "Updated guidelines.\n" + + +# --------------------------------------------------------------------------- +# patch-scope assessment pass (split suggestion) +# --------------------------------------------------------------------------- + + +def _make_scope_tool(scope_response): + """Build a CodeReviewTool without running __init__ (avoids create_agent). + + The scope_agent is mocked to return the given PatchScopeResponse. + """ + pytest.importorskip("langchain") + from bugbug.tools.code_review.agent import CodeReviewTool + + tool = CodeReviewTool.__new__(CodeReviewTool) + tool.target_software = "Mozilla Firefox" + tool.scope_agent = MagicMock() + tool.scope_agent.ainvoke = AsyncMock( + return_value={"structured_response": scope_response} + ) + return tool + + +def test_assess_patch_scope_returns_at_most_one_comment(): + # The model returns two comments; the pass must keep only the first. + comments = [ + GeneratedReviewComment( + file="b/f.txt", + code_line=1, + comment=f"Split this patch {i}", + explanation="bundles unrelated changes", + order=1, + ) + for i in range(2) + ] + tool = _make_scope_tool(PatchScopeResponse(comments=comments)) + + patch = make_patch("--- a/f.txt\n+++ b/f.txt\n@@ -0,0 +1,2 @@\n+a\n+b\n") + result = asyncio.run(tool.assess_patch_scope(patch, "summary")) + + assert len(result) == 1 + assert result[0].comment == "Split this patch 0" + + +def test_assess_patch_scope_returns_empty_when_no_split_warranted(): + tool = _make_scope_tool(PatchScopeResponse(comments=[])) + + patch = make_patch("--- a/f.txt\n+++ b/f.txt\n@@ -0,0 +1,2 @@\n+a\n+b\n") + result = asyncio.run(tool.assess_patch_scope(patch, "summary")) + + assert result == [] + + +def test_run_appends_scope_suggestion_last(): + pytest.importorskip("langchain") + + regular = GeneratedReviewComment( + file="b/f.txt", + code_line=1, + comment="Fix the bug", + explanation="off-by-one", + order=1, + ) + scope = GeneratedReviewComment( + file="b/f.txt", + code_line=2, + comment="Split this patch into smaller pieces", + explanation="bundles unrelated changes", + order=1, + ) + tool = _make_scope_tool(PatchScopeResponse(comments=[scope])) + + patch = make_patch("--- a/f.txt\n+++ b/f.txt\n@@ -0,0 +1,2 @@\n+a\n+b\n") + patch.raw_diff = "diff" + + tool.count_tokens = MagicMock(return_value=10) + tool._agent_model = "model-x" + tool.patch_summarizer = MagicMock() + tool.patch_summarizer.run = MagicMock(return_value="summary") + tool.generate_review_comments = AsyncMock(return_value=([regular], [])) + tool.suggestion_filterer = MagicMock() + tool.suggestion_filterer.run = MagicMock(return_value=[regular]) + + result = asyncio.run(tool.run(patch)) + + assert result.details["num_scope_suggestions"] == 1 + assert len(result.review_comments) == 2 + # The split suggestion is sorted last (order = len(filtered) + 1). + last = result.review_comments[-1] + assert last.content == "Split this patch into smaller pieces" + assert last.order == 2