From 217e6f69088c8087ac9a67c51f9890db3274e67c Mon Sep 17 00:00:00 2001 From: Evgeny Pavlov Date: Wed, 24 Jun 2026 14:21:22 -0700 Subject: [PATCH 01/15] Add ability to clone a specific commit --- .../hackbot-runtime/hackbot_runtime/config.py | 3 + .../hackbot_runtime/context.py | 3 +- .../hackbot-runtime/hackbot_runtime/source.py | 62 +++++++++++++++++-- libs/hackbot-runtime/tests/test_context.py | 26 +++++++- libs/hackbot-runtime/tests/test_source.py | 55 ++++++++++++++-- 5 files changed, 136 insertions(+), 13 deletions(-) diff --git a/libs/hackbot-runtime/hackbot_runtime/config.py b/libs/hackbot-runtime/hackbot_runtime/config.py index ef1ae6173d..3c4ec56ba2 100644 --- a/libs/hackbot-runtime/hackbot_runtime/config.py +++ b/libs/hackbot-runtime/hackbot_runtime/config.py @@ -21,6 +21,9 @@ class SourceConfig(BaseModel): # Where the checkout lands. The env var SOURCE_REPO overrides this at runtime # (the orchestrator points it at the task-local workspace). checkout_path: Path = Path("/workspace/source") + # Optional commit/branch/tag to check out instead of remote HEAD. The env var + # SOURCE_REF overrides this at runtime (per-run inputs like a failure commit). + ref: str | None = None class FirefoxConfig(BaseModel): diff --git a/libs/hackbot-runtime/hackbot_runtime/context.py b/libs/hackbot-runtime/hackbot_runtime/context.py index 8ccddfd2ea..8269de8fd1 100644 --- a/libs/hackbot-runtime/hackbot_runtime/context.py +++ b/libs/hackbot-runtime/hackbot_runtime/context.py @@ -112,7 +112,8 @@ def source_repo(self) -> Path: ) env_path = os.environ.get("SOURCE_REPO") path = Path(env_path) if env_path else self._config.source.checkout_path - ensure_source_repo(path, self._config.source.repo_url) + ref = os.environ.get("SOURCE_REF") or self._config.source.ref + ensure_source_repo(path, self._config.source.repo_url, ref) # Record where the agent starts editing, so publish_changes() can later # diff the final tree against it. 
Best-effort: a failure here must not # break the agent's access to source — it only disables change capture. diff --git a/libs/hackbot-runtime/hackbot_runtime/source.py b/libs/hackbot-runtime/hackbot_runtime/source.py index 04352876eb..0dc0b4060d 100644 --- a/libs/hackbot-runtime/hackbot_runtime/source.py +++ b/libs/hackbot-runtime/hackbot_runtime/source.py @@ -10,13 +10,27 @@ log = logging.getLogger("hackbot_runtime.source") -def ensure_source_repo(source_repo: Path, repo_url: str) -> None: +def ensure_source_repo( + source_repo: Path, repo_url: str, ref: str | None = None +) -> None: """Ensure a shallow checkout of ``repo_url`` exists at ``source_repo``. Idempotent: clones if absent, otherwise shallow-fetches and hard-resets to - the remote HEAD. Recovers from a partial checkout left by an earlier failed - run (e.g. the clone succeeded but the checkout ran out of disk). + the requested ``ref`` (``origin/HEAD`` when ``ref`` is None). Recovers from a + partial checkout left by an earlier failed run (e.g. the clone succeeded but + the checkout ran out of disk). + + When ``ref`` is set (a commit/branch/tag), the repo is pinned there — useful + for agents that must operate on a specific historical commit (e.g. a build + failure commit) rather than the tip of the default branch. """ + # Both the recovery path and the fresh clone converge on a shallow fetch of + # this ref so a pinned commit is fetchable even when it is not on HEAD. + fetch_target = ref if ref else "HEAD" + # A pinned commit needs its parent too so the commit's own diff can be + # computed (e.g. `git show `); depth=1 would fetch only the commit + # itself with no parent to diff against. + depth = "--depth=2" if ref else "--depth=1" git_dir = source_repo / ".git" if git_dir.exists(): # An earlier run killed mid-fetch (e.g. 
the container was stopped) @@ -45,9 +59,17 @@ def ensure_source_repo(source_repo: Path, repo_url: str) -> None: stdout=sys.stderr, stderr=sys.stderr, ) - log.info("updating source at %s (shallow fetch)", source_repo) + log.info("updating source at %s (shallow fetch %s)", source_repo, fetch_target) subprocess.run( - ["git", "-C", str(source_repo), "fetch", "--depth=1", "origin", "HEAD"], + [ + "git", + "-C", + str(source_repo), + "fetch", + depth, + "origin", + fetch_target, + ], check=True, stdout=sys.stderr, stderr=sys.stderr, @@ -60,6 +82,36 @@ def ensure_source_repo(source_repo: Path, repo_url: str) -> None: ) return source_repo.mkdir(parents=True, exist_ok=True) + if ref: + # A bare clone can't fetch an arbitrary commit directly, so init an empty + # repo and shallow-fetch just the requested ref. + log.info("cloning %s (shallow) to %s at ref %s", repo_url, source_repo, ref) + subprocess.run( + ["git", "init", "-q", str(source_repo)], + check=True, + stdout=sys.stderr, + stderr=sys.stderr, + ) + subprocess.run( + ["git", "-C", str(source_repo), "remote", "add", "origin", repo_url], + check=True, + stdout=sys.stderr, + stderr=sys.stderr, + ) + subprocess.run( + ["git", "-C", str(source_repo), "fetch", depth, "origin", ref], + check=True, + stdout=sys.stderr, + stderr=sys.stderr, + ) + subprocess.run( + ["git", "-C", str(source_repo), "checkout", "-q", "FETCH_HEAD"], + check=True, + stdout=sys.stderr, + stderr=sys.stderr, + ) + log.info("shallow clone complete") + return log.info("cloning %s (shallow) to %s", repo_url, source_repo) subprocess.run( ["git", "clone", "--depth=1", repo_url, str(source_repo)], diff --git a/libs/hackbot-runtime/tests/test_context.py b/libs/hackbot-runtime/tests/test_context.py index 6dea30342a..26ef0b4d1d 100644 --- a/libs/hackbot-runtime/tests/test_context.py +++ b/libs/hackbot-runtime/tests/test_context.py @@ -37,11 +37,12 @@ def test_firefox_disabled_raises(tmp_path): def test_source_repo_prepares_and_honors_env_override(tmp_path, 
monkeypatch): calls = [] - def fake_ensure(path: Path, repo_url: str) -> None: - calls.append((path, repo_url)) + def fake_ensure(path: Path, repo_url: str, ref: str | None = None) -> None: + calls.append((path, repo_url, ref)) monkeypatch.setattr("hackbot_runtime.context.ensure_source_repo", fake_ensure) monkeypatch.setenv("SOURCE_REPO", str(tmp_path / "from-env")) + monkeypatch.delenv("SOURCE_REF", raising=False) cfg = HackbotConfig( source=SourceConfig( @@ -52,7 +53,26 @@ def fake_ensure(path: Path, repo_url: str) -> None: hb = _hb(tmp_path, cfg) assert hb.source_repo == tmp_path / "from-env" - assert calls == [(tmp_path / "from-env", "https://example.com/r.git")] + assert calls == [(tmp_path / "from-env", "https://example.com/r.git", None)] + + +def test_source_repo_honors_source_ref_env(tmp_path, monkeypatch): + calls = [] + + def fake_ensure(path: Path, repo_url: str, ref: str | None = None) -> None: + calls.append((path, repo_url, ref)) + + monkeypatch.setattr("hackbot_runtime.context.ensure_source_repo", fake_ensure) + monkeypatch.delenv("SOURCE_REPO", raising=False) + monkeypatch.setenv("SOURCE_REF", "deadbeef") + + cfg = HackbotConfig( + source=SourceConfig(repo_url="r", checkout_path=Path("/from/toml")) + ) + hb = _hb(tmp_path, cfg) + + assert hb.source_repo == Path("/from/toml") + assert calls == [(Path("/from/toml"), "r", "deadbeef")] def test_source_repo_uses_toml_path_without_env(tmp_path, monkeypatch): diff --git a/libs/hackbot-runtime/tests/test_source.py b/libs/hackbot-runtime/tests/test_source.py index 4bc83d30e3..b22a91f409 100644 --- a/libs/hackbot-runtime/tests/test_source.py +++ b/libs/hackbot-runtime/tests/test_source.py @@ -6,9 +6,7 @@ from hackbot_runtime import ensure_source_repo -def _make_remote(path: Path) -> None: - subprocess.run(["git", "init", "-q", str(path)], check=True) - (path / "README.md").write_text("hello") +def _commit(path: Path, message: str) -> str: subprocess.run(["git", "-C", str(path), "add", "."], check=True) 
subprocess.run( [ @@ -22,10 +20,23 @@ def _make_remote(path: Path) -> None: "commit", "-q", "-m", - "init", + message, ], check=True, ) + rev = subprocess.run( + ["git", "-C", str(path), "rev-parse", "HEAD"], + check=True, + capture_output=True, + text=True, + ) + return rev.stdout.strip() + + +def _make_remote(path: Path) -> str: + subprocess.run(["git", "init", "-q", str(path)], check=True) + (path / "README.md").write_text("hello") + return _commit(path, "init") def test_clones_when_absent(tmp_path): @@ -45,3 +56,39 @@ def test_idempotent_update_when_present(tmp_path): # Second call takes the fetch + hard-reset branch and must still succeed. ensure_source_repo(dest, f"file://{remote}") assert (dest / "README.md").read_text() == "hello" + + +def test_pins_to_ref_when_absent(tmp_path): + remote = tmp_path / "remote" + first = _make_remote(remote) + # A second commit advances HEAD; pinning to `first` must ignore it. + (remote / "README.md").write_text("world") + _commit(remote, "second") + dest = tmp_path / "dest" + ensure_source_repo(dest, f"file://{remote}", ref=first) + head = subprocess.run( + ["git", "-C", str(dest), "rev-parse", "HEAD"], + check=True, + capture_output=True, + text=True, + ) + assert head.stdout.strip() == first + assert (dest / "README.md").read_text() == "hello" + + +def test_pinned_ref_includes_parent_for_diff(tmp_path): + remote = tmp_path / "remote" + _make_remote(remote) + (remote / "README.md").write_text("world") + second = _commit(remote, "second") + dest = tmp_path / "dest" + ensure_source_repo(dest, f"file://{remote}", ref=second) + # The parent must be present so the commit's own diff can be computed. 
+ show = subprocess.run( + ["git", "-C", str(dest), "show", second], + check=True, + capture_output=True, + text=True, + ) + assert "hello" in show.stdout + assert "world" in show.stdout From 39d82f5121a4feab05f629b7e67769454c3c7a4a Mon Sep 17 00:00:00 2001 From: Evgeny Pavlov Date: Wed, 24 Jun 2026 14:37:40 -0700 Subject: [PATCH 02/15] Improve Bugzilla error handling --- libs/agent-tools/agent_tools/bugzilla.py | 52 ++++++++++++++---------- 1 file changed, 30 insertions(+), 22 deletions(-) diff --git a/libs/agent-tools/agent_tools/bugzilla.py b/libs/agent-tools/agent_tools/bugzilla.py index 03f2bcf8a4..aa256750d3 100644 --- a/libs/agent-tools/agent_tools/bugzilla.py +++ b/libs/agent-tools/agent_tools/bugzilla.py @@ -53,6 +53,29 @@ def _bugsy_error(e: bugsy.BugsyException) -> ToolError: return ToolError(msg, payload=payload) +def _request(ctx: BugzillaContext, path: str, params: dict[str, Any] | None = None): + """Issue a Bugzilla request, normalizing every failure into a ToolError. + + bugsy only raises ``BugsyException`` for Bugzilla-level errors; a bad proxy + URL, an auth redirect, or an empty body instead surfaces as a raw + ``JSONDecodeError``/connection error. Catching those here turns an opaque + "Expecting value: line 1 column 1" into an actionable message. + """ + try: + return ctx.client.request(path, params=params or {}) + except bugsy.BugsyException as e: + raise _bugsy_error(e) from e + except Exception as e: + raise ToolError( + f"Bugzilla request to '{path}' failed: {type(e).__name__}: {e}", + payload={ + "error": "bugzilla_request_failed", + "path": path, + "message": str(e), + }, + ) from e + + @tool async def search_bugs( ctx: BugzillaContext, @@ -77,10 +100,7 @@ async def search_bugs( component, status, resolution, priority, severity, assigned_to, whiteboard, include_fields, limit. 
""" - try: - result = ctx.client.request("bug", params=params) - except bugsy.BugsyException as e: - raise _bugsy_error(e) from e + result = _request(ctx, "bug", params) bugs = result.get("bugs", []) return {"count": len(bugs), "bugs": bugs} @@ -124,12 +144,7 @@ async def get_bugs( "cf_crash_signature,url,version,op_sys,platform" ) id_csv = ",".join(str(i) for i in ids) - try: - result = ctx.client.request( - "bug", params={"id": id_csv, "include_fields": include} - ) - except bugsy.BugsyException as e: - raise _bugsy_error(e) from e + result = _request(ctx, "bug", {"id": id_csv, "include_fields": include}) bugs = result.get("bugs", []) returned = {b["id"] for b in bugs} inaccessible = [i for i in ids if i not in returned] @@ -153,6 +168,8 @@ async def get_bugs( "code": getattr(e, "code", None), "message": getattr(e, "msg", str(e)), } + except Exception as e: + payload["comments_error"] = {"message": f"{type(e).__name__}: {e}"} return payload @@ -163,10 +180,7 @@ async def get_bug_comments( bug_id: Annotated[int, Field(description="Bug ID.")], ) -> dict: """Fetch all comments for a single bug.""" - try: - result = ctx.client.request(f"bug/{bug_id}/comment") - except bugsy.BugsyException as e: - raise _bugsy_error(e) from e + result = _request(ctx, f"bug/{bug_id}/comment") comments = result.get("bugs", {}).get(str(bug_id), {}).get("comments", []) return {"bug_id": bug_id, "count": len(comments), "comments": comments} @@ -192,10 +206,7 @@ async def get_bug_attachments( base64-encoded in the 'data' field of each attachment. 
""" params = {} if include_data else {"exclude_fields": "data"} - try: - result = ctx.client.request(f"bug/{bug_id}/attachment", params=params) - except bugsy.BugsyException as e: - raise _bugsy_error(e) from e + result = _request(ctx, f"bug/{bug_id}/attachment", params) atts = result.get("bugs", {}).get(str(bug_id), []) return {"bug_id": bug_id, "count": len(atts), "attachments": atts} @@ -223,10 +234,7 @@ async def download_attachment( get_bug_attachments first to discover attachment IDs. Returns the written path, size, and content_type. """ - try: - result = ctx.client.request(f"bug/attachment/{attachment_id}") - except bugsy.BugsyException as e: - raise _bugsy_error(e) from e + result = _request(ctx, f"bug/attachment/{attachment_id}") att = result.get("attachments", {}).get(str(attachment_id)) if att is None: From 040944c439548b98b05f0529b8694e2d8d3f0de9 Mon Sep 17 00:00:00 2001 From: Evgeny Pavlov Date: Wed, 24 Jun 2026 14:45:28 -0700 Subject: [PATCH 03/15] Add build target and fix Rust path --- .../agent_tools/firefox/__init__.py | 21 ++++++++++++++----- .../firefox/tools/build_firefox.py | 20 ++++++++++++++++-- 2 files changed, 34 insertions(+), 7 deletions(-) diff --git a/libs/agent-tools/agent_tools/firefox/__init__.py b/libs/agent-tools/agent_tools/firefox/__init__.py index e371a6be2f..268efa2cab 100644 --- a/libs/agent-tools/agent_tools/firefox/__init__.py +++ b/libs/agent-tools/agent_tools/firefox/__init__.py @@ -122,17 +122,28 @@ async def build_firefox( description="MOZCONFIG to use. Optional — defaults to the configured mozconfig." ), ] = None, + target: Annotated[ + str | None, + Field( + description=( + "Optional build target, e.g. a directory like 'docshell/base'. " + "When set, only that target is built — much faster than a full " + "tree build and enough to confirm a localized fix compiles." + ) + ), + ] = None, ) -> dict: """Build Firefox using the configured mozconfig. - Slow (tens of minutes on a cold build, faster incremental). 
Returns JSON: - success (bool), build_dir (str), message (str), stdout/stderr. Only call this - if you've changed source or the binary is missing — check if the binary - exists first. + Slow on a full tree build (tens of minutes cold, faster incremental); pass a + `target` directory to build just the part you changed. Returns JSON: success + (bool), build_dir (str), message (str), stdout/stderr. Only call this if + you've changed source or the binary is missing — check if the binary exists + first. """ firefox_dir_p = Path(firefox_dir) if firefox_dir else ctx.source_dir mozconfig_p = Path(mozconfig_path) if mozconfig_path else ctx.mozconfig - return await _build_firefox(firefox_dir_p, mozconfig_p, ctx.objdir) + return await _build_firefox(firefox_dir_p, mozconfig_p, ctx.objdir, target=target) @tool diff --git a/libs/agent-tools/agent_tools/firefox/tools/build_firefox.py b/libs/agent-tools/agent_tools/firefox/tools/build_firefox.py index ea623643f7..00555f724a 100644 --- a/libs/agent-tools/agent_tools/firefox/tools/build_firefox.py +++ b/libs/agent-tools/agent_tools/firefox/tools/build_firefox.py @@ -10,6 +10,7 @@ async def build_firefox( firefox_dir: Path, mozconfig_path: Path, objdir: Path, + target: str | None = None, ) -> dict[str, Any]: """Build Firefox using a specified mozconfig. @@ -19,6 +20,9 @@ async def build_firefox( objdir: Expected build output directory (reported back on success; mozconfig actually determines where the build lands, so this should match what the mozconfig sets) + target: Optional build target (e.g. a directory like ``docshell/base``). + When set, only that target is built -- far faster than a full tree + build and enough to verify a localized fix compiles. 
Returns: Dict with build result information (success, build_dir, message, @@ -41,9 +45,21 @@ async def build_firefox( env["MOZCONFIG"] = str(mozconfig_path.resolve()) env["CLAUDECODE"] = "1" + # `mach bootstrap` installs rust under ~/.cargo/bin and clang under + # ~/.mozbuild/clang/bin, neither of which is on the default PATH. Without + # this the build fails with "Rust compiler not found" even right after a + # successful bootstrap. + home = Path.home() + toolchain_bins = [home / ".cargo" / "bin", home / ".mozbuild" / "clang" / "bin"] + env["PATH"] = os.pathsep.join( + [*(str(p) for p in toolchain_bins), env.get("PATH", "")] + ) + + mach_args = ["./mach", "build"] + if target: + mach_args.append(target) process = await asyncio.create_subprocess_exec( - "./mach", - "build", + *mach_args, cwd=firefox_dir, env=env, stdout=asyncio.subprocess.PIPE, From b043e05722778c27b18ed157020a00fadfff23aa Mon Sep 17 00:00:00 2001 From: Evgeny Pavlov Date: Wed, 24 Jun 2026 14:58:35 -0700 Subject: [PATCH 04/15] Move try_server.py to try_push.py Pure rename to preserve git history before reworking the contents for the hackbot port. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .../build-repair/hackbot_agents/build_repair/try_push.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename bugbug/tools/build_repair/try_server.py => agents/build-repair/hackbot_agents/build_repair/try_push.py (100%) diff --git a/bugbug/tools/build_repair/try_server.py b/agents/build-repair/hackbot_agents/build_repair/try_push.py similarity index 100% rename from bugbug/tools/build_repair/try_server.py rename to agents/build-repair/hackbot_agents/build_repair/try_push.py From a402729cc7143090425c0d2bf875d3649e4985ed Mon Sep 17 00:00:00 2001 From: Evgeny Pavlov Date: Wed, 24 Jun 2026 15:12:48 -0700 Subject: [PATCH 05/15] Move old build repair agent files to new locations Pure renames to preserve git history before reworking the contents for the hackbot port: agent.py -> hackbot_agents/build_repair/agent.py prompts.py -> hackbot_agents/build_repair/prompts.py build_repair_eval.py -> buildrepair_eval/eval.py Co-Authored-By: Claude Opus 4.8 (1M context) --- .../build-repair-evals/buildrepair_eval/eval.py | 0 .../build-repair/hackbot_agents}/build_repair/agent.py | 0 .../build-repair/hackbot_agents}/build_repair/prompts.py | 0 3 files changed, 0 insertions(+), 0 deletions(-) rename scripts/build_repair_eval.py => agents/build-repair-evals/buildrepair_eval/eval.py (100%) rename {bugbug/tools => agents/build-repair/hackbot_agents}/build_repair/agent.py (100%) rename {bugbug/tools => agents/build-repair/hackbot_agents}/build_repair/prompts.py (100%) diff --git a/scripts/build_repair_eval.py b/agents/build-repair-evals/buildrepair_eval/eval.py similarity index 100% rename from scripts/build_repair_eval.py rename to agents/build-repair-evals/buildrepair_eval/eval.py diff --git a/bugbug/tools/build_repair/agent.py b/agents/build-repair/hackbot_agents/build_repair/agent.py similarity index 100% rename from bugbug/tools/build_repair/agent.py rename to agents/build-repair/hackbot_agents/build_repair/agent.py diff --git 
a/bugbug/tools/build_repair/prompts.py b/agents/build-repair/hackbot_agents/build_repair/prompts.py similarity index 100% rename from bugbug/tools/build_repair/prompts.py rename to agents/build-repair/hackbot_agents/build_repair/prompts.py From 3ae452177918ed2e89073facafac3d338ca286e9 Mon Sep 17 00:00:00 2001 From: Evgeny Pavlov Date: Wed, 24 Jun 2026 15:38:54 -0700 Subject: [PATCH 06/15] Move remaining build repair files to new locations Pure renames to preserve git history before reworking the contents for the hackbot port: config.py -> hackbot_agents/build_repair/config.py eval.py -> build-repair/evals/buildrepair_eval/eval.py Co-Authored-By: Claude Opus 4.8 (1M context) --- .../evals}/buildrepair_eval/eval.py | 0 .../build-repair/hackbot_agents}/build_repair/config.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename agents/{build-repair-evals => build-repair/evals}/buildrepair_eval/eval.py (100%) rename {bugbug/tools => agents/build-repair/hackbot_agents}/build_repair/config.py (100%) diff --git a/agents/build-repair-evals/buildrepair_eval/eval.py b/agents/build-repair/evals/buildrepair_eval/eval.py similarity index 100% rename from agents/build-repair-evals/buildrepair_eval/eval.py rename to agents/build-repair/evals/buildrepair_eval/eval.py diff --git a/bugbug/tools/build_repair/config.py b/agents/build-repair/hackbot_agents/build_repair/config.py similarity index 100% rename from bugbug/tools/build_repair/config.py rename to agents/build-repair/hackbot_agents/build_repair/config.py From 387acbfbe92114607a9754594ea81538ad63c7fc Mon Sep 17 00:00:00 2001 From: Evgeny Pavlov Date: Wed, 24 Jun 2026 15:39:56 -0700 Subject: [PATCH 07/15] Migrate build repair agent to Hackbot --- .github/dependabot.yml | 19 - agents/build-repair/Dockerfile | 55 ++ agents/build-repair/compose.yml | 38 + agents/build-repair/hackbot.toml | 8 + .../hackbot_agents/build_repair/__init__.py | 0 .../hackbot_agents/build_repair/__main__.py | 48 + 
.../hackbot_agents/build_repair/agent.py | 823 ++++++------------ .../hackbot_agents/build_repair/broker.py | 99 +++ .../hackbot_agents/build_repair/config.py | 111 +-- .../hackbot_agents/build_repair/logs.py | 104 +++ .../hackbot_agents/build_repair/prompts.py | 94 +- .../hackbot_agents/build_repair/try_push.py | 295 +++---- agents/build-repair/pyproject.toml | 33 + docker-compose.yml | 1 + pyproject.toml | 2 +- services/buildrepair/Dockerfile | 34 - services/buildrepair/README.md | 64 -- services/buildrepair/docker-compose.dev.yml | 18 - services/buildrepair/pyproject.toml | 12 - services/hackbot-api/app/agents.py | 8 +- services/hackbot-api/app/schemas.py | 9 + services/hackbot-api/tests/test_agents.py | 23 +- uv.lock | 101 +++ 23 files changed, 987 insertions(+), 1012 deletions(-) create mode 100644 agents/build-repair/Dockerfile create mode 100644 agents/build-repair/compose.yml create mode 100644 agents/build-repair/hackbot.toml create mode 100644 agents/build-repair/hackbot_agents/build_repair/__init__.py create mode 100644 agents/build-repair/hackbot_agents/build_repair/__main__.py create mode 100644 agents/build-repair/hackbot_agents/build_repair/broker.py create mode 100644 agents/build-repair/hackbot_agents/build_repair/logs.py create mode 100644 agents/build-repair/pyproject.toml delete mode 100644 services/buildrepair/Dockerfile delete mode 100644 services/buildrepair/README.md delete mode 100644 services/buildrepair/docker-compose.dev.yml delete mode 100644 services/buildrepair/pyproject.toml diff --git a/.github/dependabot.yml b/.github/dependabot.yml index c666f68f3e..0c766de53a 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -72,25 +72,6 @@ updates: open-pull-requests-limit: 99 allow: - dependency-type: direct - - package-ecosystem: uv - directory: "/services/buildrepair" - schedule: - interval: weekly - day: thursday - groups: - patch: - applies-to: version-updates - patterns: - - "*" - update-types: - - patch - cooldown: - 
semver-major-days: 14 - semver-minor-days: 7 - semver-patch-days: 3 - open-pull-requests-limit: 99 - allow: - - dependency-type: direct - package-ecosystem: npm directory: "/ui/changes" schedule: diff --git a/agents/build-repair/Dockerfile b/agents/build-repair/Dockerfile new file mode 100644 index 0000000000..fa3872a15e --- /dev/null +++ b/agents/build-repair/Dockerfile @@ -0,0 +1,55 @@ +FROM python:3.12 AS builder + +COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/ + +ENV UV_PROJECT_ENVIRONMENT=/opt/venv + +WORKDIR /app + +# Install external deps without building workspace members. +RUN --mount=type=cache,target=/root/.cache/uv \ + --mount=type=bind,source=pyproject.toml,target=pyproject.toml \ + --mount=type=bind,source=uv.lock,target=uv.lock \ + --mount=type=bind,source=VERSION,target=VERSION \ + uv sync --frozen --no-dev --no-install-workspace --package hackbot-agent-build-repair + +RUN --mount=type=cache,target=/root/.cache/uv \ + --mount=type=bind,target=/app,rw \ + uv sync --locked --no-dev --no-editable --package hackbot-agent-build-repair + +FROM python:3.12 AS base + +COPY --from=builder /opt/venv /opt/venv +WORKDIR /app + +ENV PYTHONUNBUFFERED=1 +ENV PYTHONDONTWRITEBYTECODE=1 +ENV PATH="/opt/venv/bin:$PATH" + +FROM base AS agent + +# hackbot.toml lives at the agent root (not inside the package), so copy it into +# the working dir; the runtime discovers it there (cwd) at startup. +COPY agents/build-repair/hackbot.toml /app/hackbot.toml + +RUN useradd --create-home --shell /bin/bash agent \ + && mkdir -p /workspace \ + && chown agent:agent /workspace + +# `mach bootstrap` installs the toolchain here at runtime; put it on PATH so the +# agent's own `./mach build` (and the build_firefox tool) find rustc/clang. 
+ENV PATH="/home/agent/.cargo/bin:/home/agent/.mozbuild/clang/bin:${PATH}" + +USER agent + +CMD ["python", "-m", "hackbot_agents.build_repair"] + +FROM base AS broker + +RUN useradd --create-home --shell /bin/bash broker + +USER broker + +EXPOSE 8765 + +CMD ["python", "-m", "hackbot_agents.build_repair.broker"] diff --git a/agents/build-repair/compose.yml b/agents/build-repair/compose.yml new file mode 100644 index 0000000000..c6e63839a1 --- /dev/null +++ b/agents/build-repair/compose.yml @@ -0,0 +1,38 @@ +services: + build-repair-broker: + build: + context: ../.. + dockerfile: agents/build-repair/Dockerfile + target: broker + environment: + BUGZILLA_API_URL: ${BUGZILLA_API_URL} + BUGZILLA_API_KEY: ${BUGZILLA_API_KEY} + expose: + - "8765" + + build-repair-agent: + build: + context: ../.. + dockerfile: agents/build-repair/Dockerfile + target: agent + environment: + - RUN_ID + - BUG_ID=${BUG_ID:?error} + - GIT_COMMIT=${GIT_COMMIT:?error} + - FAILURE_TASKS=${FAILURE_TASKS:?error} + - RUN_TRY_PUSH=${RUN_TRY_PUSH:-false} + - BUGZILLA_MCP_URL=http://build-repair-broker:8765/mcp + - SOURCE_REPO=/workspace/firefox + - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:?error} + # No uploader locally: summary/logs/artifacts are written under + # /artifacts/, bind-mounted to the host's ~/hackbot/artifacts. + - ARTIFACTS_DIR=/artifacts + volumes: + - workspace:/workspace + - ${HOME}/hackbot/artifacts:/artifacts + depends_on: + build-repair-broker: + condition: service_started + +volumes: + workspace: diff --git a/agents/build-repair/hackbot.toml b/agents/build-repair/hackbot.toml new file mode 100644 index 0000000000..21210d4d2e --- /dev/null +++ b/agents/build-repair/hackbot.toml @@ -0,0 +1,8 @@ +[source] +repo_url = "https://github.com/mozilla-firefox/firefox.git" +checkout_path = "/workspace/firefox" +# The failure commit is supplied per run via SOURCE_REF (from the git_commit input). 
+ +[firefox] +enabled = true +objdir = "objdir-build-repair" diff --git a/agents/build-repair/hackbot_agents/build_repair/__init__.py b/agents/build-repair/hackbot_agents/build_repair/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/agents/build-repair/hackbot_agents/build_repair/__main__.py b/agents/build-repair/hackbot_agents/build_repair/__main__.py new file mode 100644 index 0000000000..bef34de4ef --- /dev/null +++ b/agents/build-repair/hackbot_agents/build_repair/__main__.py @@ -0,0 +1,48 @@ +import os + +from hackbot_runtime import HackbotContext, run_async +from pydantic_settings import BaseSettings, SettingsConfigDict + +from .agent import BuildRepairResult, run_build_repair + + +class AgentInputs(BaseSettings): + bug_id: int | None = None + git_commit: str + failure_tasks: dict[str, str] + bugzilla_mcp_url: str + run_try_push: bool = False + model: str | None = None + max_turns: int | None = None + + model_config = SettingsConfigDict(extra="ignore") + + +async def main(ctx: HackbotContext) -> BuildRepairResult: + inputs = AgentInputs() + + # The build failure lives at this commit; pin the checkout there before the + # runtime prepares the source tree (consumed in HackbotContext.source_repo). 
+ os.environ.setdefault("SOURCE_REF", inputs.git_commit) + + return await run_build_repair( + bugzilla_mcp_server={ + "type": "http", + "url": inputs.bugzilla_mcp_url, + }, + source_repo=ctx.source_repo, + fx_ctx=ctx.firefox, + bug_id=inputs.bug_id, + git_commit=inputs.git_commit, + failure_tasks=inputs.failure_tasks, + run_try_push=inputs.run_try_push, + model=inputs.model, + max_turns=inputs.max_turns, + log=ctx.log_path, + verbose=True, + publish_file=ctx.publish_file, + ) + + +if __name__ == "__main__": + run_async(main) diff --git a/agents/build-repair/hackbot_agents/build_repair/agent.py b/agents/build-repair/hackbot_agents/build_repair/agent.py index 7175bbd7c7..d32e908a22 100644 --- a/agents/build-repair/hackbot_agents/build_repair/agent.py +++ b/agents/build-repair/hackbot_agents/build_repair/agent.py @@ -3,587 +3,324 @@ # License, v. 2.0. If a copy of the MPL was not distributed with this file, # You can obtain one at http://mozilla.org/MPL/2.0/. -import subprocess -import traceback +"""Build-repair agent. + +Two-stage claude-agent-sdk agent that analyzes a Firefox build failure and +implements a fix in the source tree. The runtime checks the tree out at the +failure commit (via ``SOURCE_REF``) and collects the agent's edits into +``changes.patch``; this module only orchestrates the agent and publishes the +analysis artifacts. 
+""" + +from __future__ import annotations + +import json +import sys +import tempfile from collections.abc import Callable -from logging import getLogger from pathlib import Path -from claude_agent_sdk import ClaudeAgentOptions, ResultMessage, query -from pydantic import BaseModel, Field -from tenacity import ( - retry, - retry_if_exception, - retry_if_exception_message, - stop_after_attempt, - wait_exponential_jitter, +from agent_tools import firefox +from agent_tools.claude_sdk import build_sdk_server +from agent_tools.firefox import FirefoxContext +from claude_agent_sdk import ( + AssistantMessage, + ClaudeAgentOptions, + ClaudeSDKClient, + McpServerConfig, + ResultMessage, + ToolResultBlock, + ToolUseBlock, + UserMessage, ) +from hackbot_agents.build_repair.logs import download_failure_logs +from hackbot_agents.build_repair.try_push import TRY_TOOLS +from hackbot_runtime import AgentError, HackbotAgentResult +from hackbot_runtime.claude import Reporter -from bugbug.tools.base import GenerativeModelTool -from bugbug.tools.build_repair.config import ( +from .config import ( ADDITIONAL_DIRS, ALLOWED_TOOLS, ANALYSIS_MODEL, - FIREFOX_MCP_URL, + BUGZILLA_READ_TOOLS, + BUILD_TOOL, + FIREFOX_TOOLS, FIX_MODEL, - SANDBOX_CONFIG, - VERIFY_ALLOWED_TOOLS, - VERIFY_MODEL, + TRY_PUSH_TOOL, ) -from bugbug.tools.build_repair.prompts import ( +from .prompts import ( ANALYSIS_TEMPLATE, - EVAL_PROMPT, + BUG_ANALYSIS_STEP, + BUG_CONTEXT, FIX_TEMPLATE, - VERIFY_TEMPLATE, + TRY_PUSH_INSTRUCTIONS, ) -logger = getLogger(__name__) +TARGET_SOFTWARE = "Mozilla Firefox" -class BuildFailure(BaseModel): - """Input describing a build failure from the dataset.""" - - bug_id: int = Field(description="The ID of the bug in Bugzilla.") - bug_title: str | None = Field(default=None, description="Optional bug title.") - bug_comments: list[str] | None = Field( - default=None, description="Optional bug comments." 
- ) - git_commit: str = Field(description="Git revision to checkout.") - failure_tasks: list[dict] = Field( - description="List of {task_name, task_id, retry_id, failure_lines}." - ) +class BuildRepairResult(HackbotAgentResult): + bug_id: int | None = None + git_commit: str + summary: str = "" + analysis: str = "" + local_build_verified: bool | None = None + try_build_passed: bool | None = None + lando_job_id: str | None = None + treeherder_url: str | None = None -class UsageStats(BaseModel): - cost_usd: float = Field(default=0.0) - num_turns: int = Field(default=0) - input_tokens: int = Field(default=0) - output_tokens: int = Field(default=0) - cache_read_input_tokens: int = Field(default=0) - cache_creation_input_tokens: int = Field(default=0) - - -class AgentResponse(UsageStats): - """Output from a build repair run, including analysis, diff, cost, and build results.""" - - summary: str = Field(default="") - analysis: str = Field(default="") - diff: str = Field(default="") - error: str | None = Field(default=None) - error_traceback: str | None = Field(default=None) - failure_stage: str | None = Field(default=None) - cost_usd: float = Field(default=0.0) - num_turns: int = Field(default=0) - input_tokens: int = Field(default=0) - output_tokens: int = Field(default=0) - cache_read_input_tokens: int = Field(default=0) - cache_creation_input_tokens: int = Field(default=0) - local_build_passed: bool | None = Field(default=None) - try_build_passed: bool | None = Field(default=None) - lando_job_id: str | None = Field(default=None) - treeherder_url: str | None = Field(default=None) - stage1_transcript: list[dict] = Field(default_factory=list) - stage2_transcript: list[dict] = Field(default_factory=list) - - -class GroundTruth(BaseModel): - gh_fix_commits: list[str] = Field( - description="Git commit hashes of the ground truth fix." 
+def _result_text(block: ToolResultBlock) -> str: + if isinstance(block.content, str): + return block.content + if isinstance(block.content, list): + return "\n".join( + c.get("text", "") + for c in block.content + if isinstance(c, dict) and c.get("type") == "text" + ) + return str(block.content) + + +def _build_options( + *, + model: str | None, + effort: str, + cwd: Path, + scratch_dir: Path, + mcp_servers: dict[str, McpServerConfig], + allowed_tools: list[str], + max_turns: int | None, +) -> ClaudeAgentOptions: + # The agent always runs inside an isolated Docker container, so there is no + # sandbox and tools run without per-command permission prompts. + return ClaudeAgentOptions( + model=model, + cwd=str(cwd), + mcp_servers=mcp_servers, + allowed_tools=allowed_tools, + disallowed_tools=["AskUserQuestion", "Task"], + add_dirs=[*ADDITIONAL_DIRS, str(scratch_dir)], + permission_mode="bypassPermissions", + effort=effort, + max_turns=max_turns, + setting_sources=[], ) -class Judgment(BaseModel): - analysis_correct: bool - analysis_quality: float - analysis_explanation: str - fix_matches_ground_truth: bool - fix_quality: float - fix_explanation: str - fix_acceptance_probability: float - fix_acceptance_explanation: str +def _write_mozconfig(fx_ctx: FirefoxContext) -> None: + """Write a mozconfig mirroring the failing CI build, unless one exists. - -class VerifyResponse(UsageStats): - judgment: Judgment | None = Field(default=None) - verification_transcript: list[dict] = Field(default_factory=list) - - -class BuildRepairTool(GenerativeModelTool): - """Two-stage build repair agent using Claude Agent SDK. - - Stage 1: Analyzes the failure and produces analysis/planning/summary docs. - Stage 2: Reads the analysis and implements a fix. Skipped in analysis-only mode. - After Stage 2, commits the fix, runs ./mach build, and optionally submits to try. + Verification only means something if the local build reproduces the failure + condition. Many failures (e.g. 
a variable used only inside a stripped + ``MOZ_DIAGNOSTIC_ASSERT``) compile fine in a default Nightly-style build and + fail only in a release-milestone build with warnings-as-errors. + ``--enable-release`` leaves ``MOZ_DIAGNOSTIC_ASSERT_ENABLED`` undefined and + ``--enable-warnings-as-errors`` promotes warnings to errors, so this config + surfaces that whole class locally. """ + if fx_ctx.mozconfig.exists(): + return + fx_ctx.mozconfig.write_text( + "ac_add_options --enable-application=browser\n" + "ac_add_options --disable-debug\n" + "ac_add_options --enable-release\n" + "ac_add_options --enable-warnings-as-errors\n" + f"mk_add_options MOZ_OBJDIR={fx_ctx.objdir}\n" + ) - def __init__( - self, - target_software: str = "Mozilla Firefox", - analysis_only: bool = False, - eval_mode: bool = False, - analysis_model: str = ANALYSIS_MODEL, - fix_model: str = FIX_MODEL, - verify_model: str = VERIFY_MODEL, - ) -> None: - self.eval_mode = eval_mode - self.target_software = target_software - self.analysis_only = analysis_only - self.analysis_model = analysis_model - self.fix_model = fix_model - self.verify_model = verify_model - - @classmethod - def create(cls, **kwargs): - return cls(**kwargs) - - @staticmethod - def _usage_fields(usage: dict) -> dict: - return { - "input_tokens": usage.get("input_tokens", 0), - "output_tokens": usage.get("output_tokens", 0), - "cache_read_input_tokens": usage.get("cache_read_input_tokens", 0), - "cache_creation_input_tokens": usage.get("cache_creation_input_tokens", 0), - } - - @staticmethod - def _serialize_message(message) -> dict: - data = {"type": type(message).__name__} - if hasattr(message, "model_dump"): - data.update(message.model_dump()) - elif hasattr(message, "__dict__"): - data.update(vars(message)) - else: - data["raw"] = str(message) - return data - - async def _run_stage( - self, - stage_name: str, - prompt: str, - model: str, - options: ClaudeAgentOptions, - bug_id: int, - on_message: Callable[[str, dict], None] | None = 
None, - ) -> tuple[list[dict], float, int, dict]: - transcript: list[dict] = [] - cost = 0.0 - turns = 0 - result_data: dict = {} - usage: dict = {} - - @retry( - retry=( - retry_if_exception_message(match="Control request timeout") - | retry_if_exception_message(match="overloaded") - | retry_if_exception_message(match="529") - | retry_if_exception_message(match="exit code") - | retry_if_exception( - lambda e: isinstance(e, (TimeoutError, ConnectionError, OSError)) - ) - ), - stop=stop_after_attempt(5), - wait=wait_exponential_jitter(initial=2, max=60, jitter=5), - before_sleep=lambda rs: logger.warning( - "Bug %s: %s transient error (attempt %d/5), retrying: %s", - bug_id, - stage_name, - rs.attempt_number, - rs.outcome.exception(), - ), - reraise=True, - ) - async def _query(): - nonlocal cost, turns, usage, result_data - async for message in query(prompt=prompt, options=options): - serialized = self._serialize_message(message) - transcript.append(serialized) - logger.debug("Bug %s: %s [%s]", bug_id, stage_name, serialized["type"]) - if on_message: - on_message(stage_name, serialized) - if isinstance(message, ResultMessage): - cost += message.total_cost_usd or 0 - turns += message.num_turns or 0 - usage = getattr(message, "usage", {}) or {} - result_data = serialized - - if on_message: - on_message( - stage_name, - { - "type": "stage_start", - "prompt": prompt, - "model": model, - }, - ) - try: - await _query() - finally: - if on_message: - on_message( - stage_name, - { - "type": "stage_end", - "cost_usd": cost, - "num_turns": turns, - "result_data": result_data, - }, - ) - - return transcript, cost, turns, usage - - def _prepare_input_files(self, failure: BuildFailure, worktree_path: Path) -> None: - in_dir = worktree_path / "repair_agent" / "in" / str(failure.bug_id) - in_dir.mkdir(parents=True, exist_ok=True) - - (in_dir / "bug_description.md").write_text( - f"# Bug {failure.bug_id}: {failure.bug_title}\n\n" - + "\n\n---\n\n".join(failure.bug_comments or []) - 
) - - logs_content = "" - for task in failure.failure_tasks: - logs_content += f"## {task['task_name']} (task_id: {task['task_id']})\n\n" - logs_content += "\n".join(task["failure_lines"]) + "\n\n" - (in_dir / "build_failure_logs.md").write_text(logs_content) - - out_dir = worktree_path / "repair_agent" / "out" / str(failure.bug_id) - out_dir.mkdir(parents=True, exist_ok=True) - logger.info( - "Prepared input files for bug %s at %s (%d failure tasks)", - failure.bug_id, - in_dir, - len(failure.failure_tasks), - ) +async def run_build_repair( + *, + bugzilla_mcp_server: McpServerConfig, + source_repo: Path, + fx_ctx: FirefoxContext, + bug_id: int | None = None, + git_commit: str, + failure_tasks: dict[str, str], + run_try_push: bool = False, + model: str | None = None, + max_turns: int | None = None, + verbose: bool = False, + log: Path | None = None, + publish_file: Callable[[str, Path, str | None], str] | None = None, +) -> BuildRepairResult: + """Analyze a build failure and implement a fix in ``source_repo``. + + Returns a :class:`BuildRepairResult`; raises :class:`AgentError` if a stage + ends in an error or produces no result. 
+ """ + label = f"bug {bug_id}" if bug_id is not None else f"commit {git_commit[:12]}" + print(f"[build_repair] repairing {label} at {git_commit}", file=sys.stderr) + + scratch_dir = Path(tempfile.mkdtemp(prefix=f"build-repair-{bug_id or 'nobug'}-")) + scratch_in = scratch_dir / "in" + scratch_out = scratch_dir / "out" + scratch_in.mkdir(parents=True, exist_ok=True) + scratch_out.mkdir(parents=True, exist_ok=True) + + task_logs = await download_failure_logs(failure_tasks, scratch_in) + failure_logs = "\n".join( + f"- {name}: sanitized errors at {tl.sanitized} (start here); " + f"full log at {tl.full}" + for name, tl in task_logs.items() + ) - def _read_output(self, failure: BuildFailure, worktree_path: Path, key: str) -> str: - path = ( - worktree_path / "repair_agent" / "out" / str(failure.bug_id) / f"{key}.md" - ) - if path.exists(): - return path.read_text() - return "" + firefox_tools = [*firefox.TOOLS, *TRY_TOOLS] if run_try_push else firefox.TOOLS + firefox_server = build_sdk_server("firefox", fx_ctx, firefox_tools) + mcp_servers: dict[str, McpServerConfig] = { + "bugzilla": bugzilla_mcp_server, + "firefox": firefox_server, + } + allowed_tools = [ + *ALLOWED_TOOLS, + *BUGZILLA_READ_TOOLS, + *FIREFOX_TOOLS, + *([TRY_PUSH_TOOL] if run_try_push else []), + ] + + task_name = next(iter(failure_tasks), "") + analysis_prompt = ANALYSIS_TEMPLATE.format( + target_software=TARGET_SOFTWARE, + git_commit=git_commit, + failure_logs=failure_logs, + scratch_out=scratch_out, + bug_context=BUG_CONTEXT.format(bug_id=bug_id) if bug_id is not None else "", + bug_step=BUG_ANALYSIS_STEP.format(bug_id=bug_id) if bug_id is not None else "", + logs_num=3 if bug_id is not None else 2, + ) + fix_prompt = FIX_TEMPLATE.format( + target_software=TARGET_SOFTWARE, + scratch_out=scratch_out, + try_push=( + TRY_PUSH_INSTRUCTIONS.format(task_name=task_name) if run_try_push else "" + ), + ) - async def run( - self, - failure: BuildFailure, - worktree_path: Path, - skip_try_push: bool = False, - 
on_message: Callable[[str, dict], None] | None = None, - ) -> AgentResponse: - logger.info( - "Starting build repair for bug %s " - "(commit=%s, worktree=%s, analysis_only=%s, skip_try_push=%s)", - failure.bug_id, - failure.git_commit, - worktree_path, - self.analysis_only, - skip_try_push, - ) - self._prepare_input_files(failure, worktree_path) - - mcp_servers = {"firefox": {"type": "http", "url": FIREFOX_MCP_URL}} - disallowed = ["AskUserQuestion", "Task"] - total_cost = 0.0 - total_turns = 0 - total_usage: dict = {} - - logger.info( - "Bug %s: starting Stage 1 (analysis) with model=%s", - failure.bug_id, - self.analysis_model, - ) - stage1_options = ClaudeAgentOptions( - model=self.analysis_model, - cwd=str(worktree_path), - allowed_tools=ALLOWED_TOOLS, - disallowed_tools=disallowed, - add_dirs=ADDITIONAL_DIRS, - sandbox=SANDBOX_CONFIG, - permission_mode="acceptEdits", + total_cost = 0.0 + total_turns = 0 + # Last JSON result of each tracked tool, keyed by tool name. Lets us report + # the actual local-build / try-push outcomes instead of guessing. + captured: dict[str, dict] = {} + tracked = {BUILD_TOOL, *([TRY_PUSH_TOOL] if run_try_push else [])} + + with Reporter(verbose=verbose, log_path=log) as reporter: + # Stage 1: analysis (high effort, no source edits yet). 
+ reporter.header(f"{label}: analysis") + analysis_opts = _build_options( + model=model or ANALYSIS_MODEL, effort="high", + cwd=source_repo, + scratch_dir=scratch_dir, mcp_servers=mcp_servers, + allowed_tools=allowed_tools, + max_turns=max_turns, ) - analysis_prompt = ANALYSIS_TEMPLATE.format( - bug_id=failure.bug_id, - target_software=self.target_software, - worktree_path=worktree_path, - eval=EVAL_PROMPT if self.eval_mode else "", - ) - try: - ( - stage1_transcript, - stage1_cost, - stage1_turns, - stage1_usage, - ) = await self._run_stage( - "analysis", - analysis_prompt, - self.analysis_model, - stage1_options, - failure.bug_id, - on_message, - ) - total_cost += stage1_cost - total_turns += stage1_turns - for k, v in stage1_usage.items(): - if isinstance(v, (int, float)): - total_usage[k] = total_usage.get(k, 0) + v - except Exception as e: - logger.error( - "Bug %s: starting Stage 2 (fix) with model=%s", - failure.bug_id, - self.fix_model, - ) - return AgentResponse( - error=str(e), - error_traceback=traceback.format_exc(), - failure_stage="analysis", - cost_usd=total_cost, - num_turns=total_turns, - **self._usage_fields(total_usage), - ) - - logger.info( - "Bug %s: Stage 1 complete (cost=$%.4f, turns=%d)", - failure.bug_id, - total_cost, - total_turns, + result_msg = await _run_session( + reporter, analysis_opts, analysis_prompt, captured, tracked ) - summary = self._read_output(failure, worktree_path, "summary") - analysis = self._read_output(failure, worktree_path, "analysis") - logger.info( - "Bug %s: read output files (summary=%d chars, analysis=%d chars)", - failure.bug_id, - len(summary), - len(analysis), - ) - - if self.analysis_only: - logger.info("Bug %s: analysis-only mode, skipping Stage 2", failure.bug_id) - return AgentResponse( - summary=summary, - analysis=analysis, - cost_usd=total_cost, - num_turns=total_turns, - **self._usage_fields(total_usage), - stage1_transcript=stage1_transcript, - ) - - logger.info( - "Bug %s: starting Stage 2 (fix) 
with model=%s", - failure.bug_id, - self.fix_model, - ) - stage2_options = ClaudeAgentOptions( - model=self.fix_model, - cwd=str(worktree_path), - allowed_tools=ALLOWED_TOOLS, - disallowed_tools=disallowed, - add_dirs=ADDITIONAL_DIRS, - sandbox=SANDBOX_CONFIG, - permission_mode="acceptEdits", + _check(result_msg, label, "analysis") + total_cost += result_msg.total_cost_usd or 0.0 + total_turns += result_msg.num_turns or 0 + + # Stage 2: fix (lower effort, edits the source tree and verifies it + # builds against a mozconfig that mirrors the failing CI config). + _write_mozconfig(fx_ctx) + reporter.header(f"{label}: fix") + fix_opts = _build_options( + model=model or FIX_MODEL, effort="low", + cwd=source_repo, + scratch_dir=scratch_dir, mcp_servers=mcp_servers, + allowed_tools=allowed_tools, + max_turns=max_turns, ) - fix_prompt = FIX_TEMPLATE.format( - target_software=self.target_software, - bug_id=failure.bug_id, - worktree_path=worktree_path, - eval=EVAL_PROMPT if self.eval_mode else "", - ) - try: - ( - stage2_transcript, - stage2_cost, - stage2_turns, - stage2_usage, - ) = await self._run_stage( - "fix", - fix_prompt, - self.fix_model, - stage2_options, - failure.bug_id, - on_message, - ) - total_cost += stage2_cost - total_turns += stage2_turns - for k, v in stage2_usage.items(): - if isinstance(v, (int, float)): - total_usage[k] = total_usage.get(k, 0) + v - except Exception as e: - logger.exception( - "Bug %s: Stage 2 (fix) failed: %s", - failure.bug_id, - e, - ) - return AgentResponse( - summary=summary, - analysis=analysis, - error=str(e), - error_traceback=traceback.format_exc(), - failure_stage="fix", - cost_usd=total_cost, - num_turns=total_turns, - **self._usage_fields(total_usage), - ) - - logger.info( - "Bug %s: Stage 2 complete (cost=$%.4f, turns=%d)", - failure.bug_id, - total_cost, - total_turns, + result_msg = await _run_session( + reporter, fix_opts, fix_prompt, captured, tracked ) - - subprocess.run( - ["git", "add", "-A"], - cwd=worktree_path, 
- capture_output=True, - ) - diff_result = subprocess.run( - ["git", "diff", "--staged", "HEAD"], - cwd=worktree_path, - capture_output=True, - text=True, - ) - diff = diff_result.stdout - logger.info("Bug %s: git diff produced %d chars", failure.bug_id, len(diff)) - - if not diff.strip(): - logger.warning("Bug %s: no diff produced, returning early", failure.bug_id) - return AgentResponse( - summary=summary, - analysis=analysis, - diff=diff, - cost_usd=total_cost, - num_turns=total_turns, - **self._usage_fields(total_usage), - stage1_transcript=stage1_transcript, - stage2_transcript=stage2_transcript, - ) - - from bugbug.tools.build_repair.try_server import run_try_verification - - task_name = ( - failure.failure_tasks[0]["task_name"] if failure.failure_tasks else "" - ) - logger.info( - "Bug %s: starting try verification (task=%s, skip_try_push=%s)", - failure.bug_id, - task_name, - skip_try_push, - ) - try_result = run_try_verification( - worktree_path=worktree_path, - bug_id=failure.bug_id, - task_name=task_name, - skip_try_push=skip_try_push, - ) - - logger.info( - "Bug %s: try verification done " - "(local_build=%s, try_build=%s, lando_job=%s, " - "total_cost=$%.4f, total_turns=%d)", - failure.bug_id, - try_result.local_build_passed, - try_result.try_build_passed, - try_result.lando_job_id, - total_cost, - total_turns, - ) - return AgentResponse( - summary=summary, - analysis=analysis, - diff=diff, - cost_usd=total_cost, - num_turns=total_turns, - **self._usage_fields(total_usage), - local_build_passed=try_result.local_build_passed, - try_build_passed=try_result.try_build_passed, - lando_job_id=try_result.lando_job_id, - treeherder_url=try_result.treeherder_url, - stage1_transcript=stage1_transcript, - stage2_transcript=stage2_transcript, - ) - - @retry( - stop=stop_after_attempt(3), - wait=wait_exponential_jitter(initial=2, max=30, jitter=5), - before_sleep=lambda rs: logger.warning( - "Verification failed (attempt %d/3), retrying: %s", - rs.attempt_number, - 
rs.outcome.exception(), - ), - reraise=True, + _check(result_msg, label, "fix") + total_cost += result_msg.total_cost_usd or 0.0 + total_turns += result_msg.num_turns or 0 + + summary = _read_doc(scratch_out, "summary", publish_file) + analysis = _read_doc(scratch_out, "analysis", publish_file) + + build_result = captured.get(BUILD_TOOL) + try_result = captured.get(TRY_PUSH_TOOL, {}) + + return BuildRepairResult( + bug_id=bug_id, + git_commit=git_commit, + summary=summary, + analysis=analysis, + local_build_verified=build_result.get("success") if build_result else None, + try_build_passed=try_result.get("try_build_passed"), + lando_job_id=try_result.get("lando_job_id"), + treeherder_url=try_result.get("treeherder_url"), + num_turns=total_turns, + total_cost_usd=total_cost, ) - async def verify( - self, - failure: BuildFailure, - agent_diff: str, - ground_truth: GroundTruth, - worktree_path: Path, - on_message: Callable[[str, dict], None] | None = None, - ) -> VerifyResponse: - out_dir = worktree_path / "repair_agent" / "out" / str(failure.bug_id) - out_dir.mkdir(parents=True, exist_ok=True) - (out_dir / "agent_fix.diff").write_text(agent_diff, encoding="utf-8") - - gt_commits = " ".join(ground_truth.gh_fix_commits) - prompt = VERIFY_TEMPLATE.format( - target_software=self.target_software, - bug_id=failure.bug_id, - failure_commit=failure.git_commit, - ground_truth_commits=gt_commits, - worktree_path=worktree_path, - ) - options = ClaudeAgentOptions( - model=self.verify_model, - cwd=str(worktree_path), - allowed_tools=VERIFY_ALLOWED_TOOLS, - disallowed_tools=["AskUserQuestion", "Task"], - sandbox=SANDBOX_CONFIG, - permission_mode="acceptEdits", - effort="high", - output_format={ - "type": "json_schema", - "schema": Judgment.model_json_schema(), - }, - ) - logger.info( - "Bug %s: starting verification stage (model=%s, ground_truth=%s)", - failure.bug_id, - self.verify_model, - gt_commits, - ) +async def _run_session( + reporter: Reporter, + options: 
ClaudeAgentOptions, + prompt: str, + captured: dict[str, dict], + tracked: set[str], +) -> ResultMessage | None: + """Drive one agent session, capturing the last result of each tracked tool. - transcript, cost, turns, usage = await self._run_stage( - "verification", - prompt, - self.verify_model, - options, - failure.bug_id, - on_message, + ``captured`` is keyed by tool name and updated in place with the parsed JSON + of each successful call to a tool in ``tracked`` (e.g. the local build and + the try push), so the caller can report real outcomes. + """ + pending: dict[str, str] = {} + result_msg: ResultMessage | None = None + async with ClaudeSDKClient(options=options) as client: + await client.query(prompt) + async for msg in client.receive_response(): + reporter.message(msg) + if isinstance(msg, AssistantMessage): + for block in msg.content: + if isinstance(block, ToolUseBlock) and block.name in tracked: + pending[block.id] = block.name + elif isinstance(msg, UserMessage) and isinstance(msg.content, list): + for block in msg.content: + if ( + isinstance(block, ToolResultBlock) + and block.tool_use_id in pending + and not block.is_error + ): + name = pending.pop(block.tool_use_id) + try: + captured[name] = json.loads(_result_text(block)) + except (ValueError, TypeError): + pass + elif isinstance(msg, ResultMessage): + result_msg = msg + return result_msg + + +def _check(result_msg: ResultMessage | None, label: str, stage: str) -> None: + if result_msg is None: + raise AgentError(f"{label}: {stage} stage produced no result message") + if result_msg.is_error: + raise AgentError( + f"{label}: {stage} stage failed: {result_msg.result or result_msg.subtype}" ) - judgment: Judgment | None = None - for msg in reversed(transcript): - if msg.get("structured_output"): - judgment = Judgment.model_validate(msg["structured_output"]) - break - - if judgment is None: - result_msgs = [m for m in transcript if m.get("type") == "ResultMessage"] - raise RuntimeError( - f"Bug 
{failure.bug_id}: verification produced no structured output. " - f"Result messages: {result_msgs}" - ) - - return VerifyResponse( - judgment=judgment, - cost_usd=cost, - num_turns=turns, - verification_transcript=transcript, - **self._usage_fields(usage), - ) + +def _read_doc( + scratch_out: Path, + key: str, + publish_file: Callable[[str, Path, str | None], str] | None, +) -> str: + """Read a stage-1 output doc and, if a publisher is given, publish it.""" + path = scratch_out / f"{key}.md" + if not path.exists(): + return "" + if publish_file is not None: + publish_file(f"{key}.md", path, "text/markdown") + return path.read_text() diff --git a/agents/build-repair/hackbot_agents/build_repair/broker.py b/agents/build-repair/hackbot_agents/build_repair/broker.py new file mode 100644 index 0000000000..70b275808b --- /dev/null +++ b/agents/build-repair/hackbot_agents/build_repair/broker.py @@ -0,0 +1,99 @@ +"""Bugzilla MCP broker. + +Sidecar container that holds the Bugzilla API key and serves the +bugzilla MCP tools over HTTP. The agent process (in a sibling container +in the same Cloud Run Job task) reaches us at `127.0.0.1:<port>/mcp`. +The agent container itself binds no Bugzilla credentials. 
+""" + +import logging +from contextlib import asynccontextmanager + +import bugsy +import uvicorn +from agent_tools import bugzilla +from agent_tools.bugzilla import BugzillaContext +from agent_tools.claude_sdk import build_sdk_server +from mcp.server.streamable_http_manager import StreamableHTTPSessionManager +from pydantic import field_validator +from pydantic_settings import BaseSettings, SettingsConfigDict +from starlette.applications import Starlette +from starlette.routing import Mount + +log = logging.getLogger("bugzilla-broker") + + +class BrokerInputs(BaseSettings): + bugzilla_api_url: str + bugzilla_api_key: str + host: str = "0.0.0.0" + port: int = 8765 + + model_config = SettingsConfigDict(extra="ignore") + + @field_validator("bugzilla_api_url") + @classmethod + def _ensure_rest_base(cls, v: str) -> str: + """Bugsy expects the REST base (``.../rest``) and just appends the path. + + A bare host like ``https://bugzilla.mozilla.org`` makes every call hit + the HTML site and fail to parse as JSON, so normalize it here. + """ + v = v.rstrip("/") + return v if v.endswith("/rest") else f"{v}/rest" + + +def build_app(inputs: BrokerInputs) -> Starlette: + client = bugsy.Bugsy( + api_key=inputs.bugzilla_api_key, bugzilla_url=inputs.bugzilla_api_url + ) + ctx = BugzillaContext(client=client) + sdk_config = build_sdk_server("bugzilla", ctx, bugzilla.TOOLS) + mcp_server = sdk_config["instance"] + + manager = StreamableHTTPSessionManager(app=mcp_server, stateless=True) + + @asynccontextmanager + async def lifespan(app): + # Probe Bugzilla once at startup so a bad API URL/key surfaces here as a + # clear log line instead of an opaque JSON-decode error on every tool + # call. We stay up regardless: the agent then gets a structured error. 
+ try: + version = client.request("version").get("version") + log.info( + "bugzilla reachable at %s (version %s)", + inputs.bugzilla_api_url, + version, + ) + except Exception: + log.exception( + "bugzilla health check failed against %s -- check BUGZILLA_API_URL " + "and BUGZILLA_API_KEY; tool calls will fail until this is fixed", + inputs.bugzilla_api_url, + ) + async with manager.run(): + log.info( + "bugzilla broker ready on %s:%d (read-only)", + inputs.host, + inputs.port, + ) + yield + + async def mcp_handler(scope, receive, send): + await manager.handle_request(scope, receive, send) + + return Starlette(routes=[Mount("/mcp", app=mcp_handler)], lifespan=lifespan) + + +def main() -> None: + logging.basicConfig( + level=logging.INFO, + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", + ) + inputs = BrokerInputs() + app = build_app(inputs) + uvicorn.run(app, host=inputs.host, port=inputs.port, log_config=None) + + +if __name__ == "__main__": + main() diff --git a/agents/build-repair/hackbot_agents/build_repair/config.py b/agents/build-repair/hackbot_agents/build_repair/config.py index a3f69ef34e..b52b13b1ad 100644 --- a/agents/build-repair/hackbot_agents/build_repair/config.py +++ b/agents/build-repair/hackbot_agents/build_repair/config.py @@ -3,88 +3,49 @@ # License, v. 2.0. If a copy of the MPL was not distributed with this file, # You can obtain one at http://mozilla.org/MPL/2.0/. -from datetime import date - -from claude_agent_sdk import SandboxNetworkConfig, SandboxSettings - -ANALYSIS_MODEL = "claude-opus-4-6" -FIX_MODEL = "claude-opus-4-6" -VERIFY_MODEL = "claude-opus-4-6" -DEFAULT_MAX_TURNS = 80 -WORKTREE_BASE_DIR = "/tmp/build_repair_worktrees" -TRY_PUSH_TIMEOUT_SECONDS = 7200 -TRY_PUSH_POLL_INTERVAL_SECONDS = 60 -TREEHERDER_BASE_URL = "https://treeherder.mozilla.org" - -FIREFOX_MCP_URL = "https://mcp-dev.moz.tools/mcp" - -# Training data cutoff dates per model, for data contamination filtering. 
-# Examples with fix_commit_date before the cutoff may have been in training data. -# Source: https://platform.claude.com/docs/en/about-claude/models/overview -MODEL_CUTOFF_DATES = { - "claude-opus-4-6": date(2025, 8, 1), - "claude-sonnet-4-6": date(2026, 1, 1), - "claude-haiku-4-5-20251001": date(2025, 7, 1), - "claude-sonnet-4-5-20250929": date(2025, 7, 1), - "claude-opus-4-5-20251101": date(2025, 8, 1), - "claude-opus-4-1-20250805": date(2025, 3, 1), - "claude-sonnet-4-20250514": date(2025, 3, 1), - "claude-3-7-sonnet-20250219": date(2024, 11, 1), - "claude-opus-4-20250514": date(2025, 3, 1), -} +"""Models and tool allowlist for the build-repair agent.""" + +ANALYSIS_MODEL = "claude-opus-4-8" +FIX_MODEL = "claude-opus-4-8" + +# Bugzilla MCP tool names as exposed to the agent (mcp__<server>__<tool>). +BUGZILLA_READ_TOOLS = [ + "mcp__bugzilla__search_bugs", + "mcp__bugzilla__get_bugs", + "mcp__bugzilla__get_bug_comments", + "mcp__bugzilla__get_bug_attachments", + "mcp__bugzilla__download_attachment", +] -VERIFY_ALLOWED_TOOLS = [ - "Read", - "Bash(git show:*)", - "Bash(git log:*)", - "Bash(git diff:*)", - "Bash(find:*)", - "Bash(grep:*)", - "WebFetch(domain:firefox-source-docs.mozilla.org)", - "WebFetch(domain:searchfox.org)", +# In-process Firefox build/test MCP tools. +BUILD_TOOL = "mcp__firefox__build_firefox" +FIREFOX_TOOLS = [ + BUILD_TOOL, + "mcp__firefox__bootstrap_firefox", + "mcp__firefox__evaluate_testcase", + "mcp__firefox__evaluate_js_shell", ] +# Optional try-server tool, wired only when run_try_push is enabled. +TRY_PUSH_TOOL = "mcp__firefox__submit_try_push" + +# The agent always runs inside an isolated Docker container, so there is no +# sandbox and tools run without per-command permission prompts (see +# permission_mode="bypassPermissions" in agent.py). This is just the set of +# built-in tools the agent is allowed to call alongside the MCP servers. 
ALLOWED_TOOLS = [ - "Edit(~/.mozbuild)", - "Edit(~/.cache/uv)", - "Bash(./mach build:*)", - "Bash(./mach clobber:*)", - "Bash(./mach configure:*)", - "Bash(./mach run:*)", - "Bash(./mach test:*)", - "Bash(./mach wpt:*)", - "Bash(./mach lint:*)", - "Bash(./mach format:*)", - "Bash(./mach clang-format:*)", - "Bash(./mach try:*)", - "Bash(./mach help:*)", - "Bash(./mach vendor:*)", - "Bash(./mach bootstrap:*)", - "Bash(./mach artifact:*)", - "Bash(clang++:*)", - "Bash(rm:*)", - "Bash(timeout:*)", - "Bash(find:*)", - "Bash(grep:*)", - "Bash(tee:*)", - "Bash(kill:*)", - "Bash(searchfox-cli:*)", - "Bash(treeherder-cli:*)", - "Bash(jj:*)", - "WebFetch(domain:firefox-source-docs.mozilla.org)", - "WebFetch(domain:treeherder.mozilla.org)", - "WebFetch(domain:searchfox.org)", - "WebFetch(o1069899.ingest.sentry.io)", + "Read", + "Grep", + "Glob", + "Bash", + "Edit", + "Write", + "MultiEdit", + "WebFetch", + "WebSearch", ] ADDITIONAL_DIRS = [ "~/.mozbuild", "~/.cache/uv/", ] - -SANDBOX_CONFIG = SandboxSettings( - enabled=True, - autoAllowBashIfSandboxed=True, - allowUnsandboxedCommands=False, - network=SandboxNetworkConfig(allowLocalBinding=True), -) diff --git a/agents/build-repair/hackbot_agents/build_repair/logs.py b/agents/build-repair/hackbot_agents/build_repair/logs.py new file mode 100644 index 0000000000..e0c22875b0 --- /dev/null +++ b/agents/build-repair/hackbot_agents/build_repair/logs.py @@ -0,0 +1,104 @@ +# -*- coding: utf-8 -*- +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this file, +# You can obtain one at http://mozilla.org/MPL/2.0/. + +"""Download and sanitize Taskcluster build-failure logs. + +The agent is given a mapping of ``task-name -> Taskcluster task ID``. Before the +Claude SDK is invoked we fetch each task's ``live_backing.log`` and write two +files to the scratch dir: the full log and a sanitized companion that keeps only +the ``ERROR -`` / ``FATAL -`` lines. 
The agent is told to start from the +sanitized log (so its context isn't drowned by tens of MB of build output) and +fall back to the full log for surrounding detail. +""" + +from __future__ import annotations + +import asyncio +import logging +import re +from pathlib import Path +from typing import NamedTuple + +import requests + +logger = logging.getLogger(__name__) + +ARTIFACT_URL = ( + "https://firefoxci.taskcluster-artifacts.net/" + "{task_id}/{run_id}/public/logs/live_backing.log" +) +RUN_ID = 0 +_HEADERS = {"User-Agent": "hackbot-build-repair/1.0"} +_TIMEOUT = 120 +_MAX_LINES = 2000 + +_ERROR_RE = re.compile(r"(?:ERROR|FATAL) -") + + +class TaskLogs(NamedTuple): + """Paths to the two log files written for one failing task.""" + + sanitized: Path + full: Path + + +def _safe_filename(task_name: str) -> str: + return re.sub(r"[^A-Za-z0-9._-]+", "_", task_name).strip("_") or "task" + + +def sanitize_log(text: str) -> str: + """Keep only ``ERROR -`` / ``FATAL -`` lines, deduping consecutive repeats and capping size.""" + kept: list[str] = [] + previous: str | None = None + for line in text.splitlines(): + if not _ERROR_RE.search(line): + continue + stripped = line.rstrip() + if stripped == previous: + continue + previous = stripped + kept.append(stripped) + if len(kept) >= _MAX_LINES: + kept.append(f"... 
(truncated at {_MAX_LINES} error lines)") + break + return "\n".join(kept) + + +def _fetch_and_write(task_name: str, task_id: str, dest_dir: Path) -> TaskLogs: + safe = _safe_filename(task_name) + full_path = dest_dir / f"{safe}.log" + sanitized_path = dest_dir / f"{safe}.errors.txt" + url = ARTIFACT_URL.format(task_id=task_id, run_id=RUN_ID) + try: + resp = requests.get(url, headers=_HEADERS, timeout=_TIMEOUT) + resp.raise_for_status() + full_path.write_text(resp.text) + sanitized = sanitize_log(resp.text) + sanitized_path.write_text( + sanitized if sanitized else f"(no ERROR -/FATAL - lines matched in {url})\n" + ) + except requests.exceptions.RequestException as exc: + logger.warning("Failed to download log for %s (%s): %s", task_name, url, exc) + note = f"(failed to download {url}: {exc})\n" + full_path.write_text(note) + sanitized_path.write_text(note) + return TaskLogs(sanitized=sanitized_path, full=full_path) + + +async def download_failure_logs( + failure_tasks: dict[str, str], dest_dir: Path +) -> dict[str, TaskLogs]: + """Download the full log and write a sanitized companion for each task concurrently. + + Returns a mapping of task name to its :class:`TaskLogs` (sanitized + full paths). + """ + names = list(failure_tasks) + logs = await asyncio.gather( + *( + asyncio.to_thread(_fetch_and_write, name, failure_tasks[name], dest_dir) + for name in names + ) + ) + return dict(zip(names, logs)) diff --git a/agents/build-repair/hackbot_agents/build_repair/prompts.py b/agents/build-repair/hackbot_agents/build_repair/prompts.py index ee166620c2..7e8516b7e1 100644 --- a/agents/build-repair/hackbot_agents/build_repair/prompts.py +++ b/agents/build-repair/hackbot_agents/build_repair/prompts.py @@ -3,75 +3,57 @@ # License, v. 2.0. If a copy of the MPL was not distributed with this file, # You can obtain one at http://mozilla.org/MPL/2.0/. 
-"""Prompt templates for build repair agent.""" +"""Prompt templates for the build-repair agent.""" ANALYSIS_TEMPLATE = """You are an expert {target_software} engineer tasked with analyzing and fixing a build failure. -Investigate why the last commit broke {target_software} build. - -The last commit attempted to fix a bug from Bugzilla. - +Investigate why commit {git_commit} broke the {target_software} build. The source tree +is already checked out at that commit (your working directory). +{bug_context} Analyze the following: -1. Git diff for the last commit -2. Bugzilla bug description -3. Taskcluster build failure logs -The files with bug description and logs are located at {worktree_path}/repair_agent/in/{bug_id} +1. The git diff of commit {git_commit} (use `git show {git_commit}`). +{bug_step}{logs_num}. The Taskcluster build failure logs. Each failing task has a sanitized log (only the ERROR -/FATAL - lines) and the full log. Start from the sanitized log -- it usually pinpoints the failing file and line. The full log can be tens of thousands of lines, so grep it for that file/line rather than reading it sequentially: +{failure_logs} Create three separate documents: -1. {worktree_path}/repair_agent/out/{bug_id}/analysis.md with your detailed analysis on what caused the issues -2. {worktree_path}/repair_agent/out/{bug_id}/planning.md with a fixing plan -3. {worktree_path}/repair_agent/out/{bug_id}/summary.md with a brief one paragraph summary of analysis and planning that can point a developer in the right direction - -Do not prompt to edit those documents. -{eval} +1. {scratch_out}/analysis.md with your detailed analysis of what caused the failure +2. {scratch_out}/planning.md with a fixing plan +3. {scratch_out}/summary.md with a brief one-paragraph summary of the analysis and plan + that can point a developer in the right direction -Do not write any code yet. Work fully autonomously, do not ask any questions. +Do not prompt to edit those documents. 
Do not write any code yet. Work fully +autonomously and do not ask any questions. """ -FIX_TEMPLATE = """You are an expert {target_software} engineer tasked with analyzing and fixing a build failure. +BUG_CONTEXT = "\nThe commit attempted to fix Bugzilla bug {bug_id}.\n" -Read the following files and implement a fix of the failure: -1. {worktree_path}/repair_agent/out/{bug_id}/analysis.md with your detailed analysis on what caused the issues -2. {worktree_path}/repair_agent/out/{bug_id}/planning.md with a fixing plan -{eval} +BUG_ANALYSIS_STEP = ( + "2. The Bugzilla bug: fetch bug {bug_id}'s description and comments with the " + "`mcp__bugzilla__get_bugs` tool (ids=[{bug_id}], include_comments=true). If " + "it returns an error, note it and continue with the diff and logs.\n" +) -Do not prompt to edit files. Work fully autonomously, do not ask any questions. Use all allowed tools without prompting. -""" - -EVAL_PROMPT = """ -Do not request bug info from Bugzilla or Phabricator. Use only the provided file with bug description. -Do not look at git commits other than the specified last commit. -""" +FIX_TEMPLATE = """You are an expert {target_software} engineer tasked with fixing a build failure. -VERIFY_TEMPLATE = """You are an expert {target_software} code reviewer evaluating an automated build repair agent's work. +Read your earlier analysis and implement the fix directly in the source tree: +1. {scratch_out}/analysis.md -- your analysis of what caused the failure +2. {scratch_out}/planning.md -- your fixing plan -Examine the relevant commits using git: -- Failure commit (broke the build): {failure_commit} -- Ground truth fix commit(s) (the real fix that was landed): {ground_truth_commits} +Edit the source files in the working directory to repair the build. A mozconfig +that mirrors the failing CI configuration (release milestone, warnings-as-errors) +is already set up. 
Verify the fix compiles with the build_firefox tool, passing +the directory of the file you changed as `target` (e.g. 'docshell/base') for a +fast, focused build -- prefer this over a full tree build. If the build reports a +missing toolchain (e.g. rustc or clang), run the bootstrap_firefox tool once and +then build again. Verify via the build_firefox tool rather than a raw `./mach +build` so the build result is recorded. +{try_push} -Inspect each commit's changes and read the repair agent's input/output files: -- {worktree_path}/repair_agent/in/{bug_id}/bug_description.md -- {worktree_path}/repair_agent/in/{bug_id}/build_failure_logs.md -- {worktree_path}/repair_agent/out/{bug_id}/analysis.md -- {worktree_path}/repair_agent/out/{bug_id}/summary.md -- {worktree_path}/repair_agent/out/{bug_id}/agent_fix.diff (may be empty if no fix was produced) - -Evaluate the agent's work on two dimensions: - -ANALYSIS: -- Did the agent correctly identify the root cause of the build failure? -- How thorough and accurate is the analysis? - -FIX: -- Does the agent's fix address the same files/functions as the ground truth? -- Is the fix semantically equivalent or close to the ground truth? -- Would the fix be acceptable in code review as-is? - -Guidelines: -- If agent_fix.diff is empty, set fix_matches_ground_truth=false, fix_quality=0.0, fix_acceptance_probability=0.0 -- A fix can be correct even if it differs syntactically from ground truth -- focus on semantic equivalence -- analysis_correct should be true if the agent found the right root cause, even if the explanation is imperfect -- Be calibrated: 0.5 means genuinely uncertain, not a default score +Do not prompt to edit files. Work fully autonomously, do not ask any questions. +Use all allowed tools without prompting. +""" -Work autonomously, do not ask questions. 
+TRY_PUSH_INSTRUCTIONS = """ +Once the fix builds locally, validate it on CI: call the submit_try_push tool with the +failing task name ('{task_name}') to push to the try server and report the build result. """ diff --git a/agents/build-repair/hackbot_agents/build_repair/try_push.py b/agents/build-repair/hackbot_agents/build_repair/try_push.py index c71ad0c854..b5d6729a27 100644 --- a/agents/build-repair/hackbot_agents/build_repair/try_push.py +++ b/agents/build-repair/hackbot_agents/build_repair/try_push.py @@ -1,137 +1,67 @@ -# -*- coding: utf-8 -*- -# This Source Code Form is subject to the terms of the Mozilla Public -# License, v. 2.0. If a copy of the MPL was not distributed with this file, -# You can obtain one at http://mozilla.org/MPL/2.0/. +"""Optional Firefox try-server push tool. +Submits the current source checkout to the Firefox try server via ``./mach try`` +and, optionally, polls Treeherder for the build result. Exposed as a separate +``TRY_TOOLS`` list (not the default firefox ``TOOLS``) so an agent only gains the +capability when it explicitly wires it in — a try push is an outward-facing +action that not every run should perform. 
+""" + +from __future__ import annotations + +import asyncio import logging -import os import re import subprocess import time -from dataclasses import dataclass -from logging import getLogger from pathlib import Path +from typing import Annotated, Any import requests +from agent_tools.registry import ToolError, tool, tools_in +from pydantic import Field -from bugbug.tools.build_repair.config import ( - TREEHERDER_BASE_URL, - TRY_PUSH_POLL_INTERVAL_SECONDS, - TRY_PUSH_TIMEOUT_SECONDS, -) - -logger = getLogger(__name__) +logger = logging.getLogger(__name__) -_HEADERS = {"User-Agent": "bugbug-build-repair-eval/1.0"} +TREEHERDER_BASE_URL = "https://treeherder.mozilla.org" +_HEADERS = {"User-Agent": "hackbot-build-repair/1.0"} _LANDO_JOB_ID_RE = re.compile(r"landoCommitID=([A-Za-z0-9_-]+)") -def _mach_env(worktree_path: Path) -> dict[str, str]: - env = os.environ.copy() - env["MOZBUILD_STATE_PATH"] = str(worktree_path / ".mozbuild") - return env - - -@dataclass -class TryPushResult: - """Result of local build verification and optional try push submission.""" - - local_build_passed: bool - try_build_passed: bool | None - lando_job_id: str | None - treeherder_url: str | None - - -def _commit_fix(worktree_path: Path, bug_id: int) -> None: - logger.info("Committing fix for bug %s in %s", bug_id, worktree_path) - subprocess.run( - ["git", "add", "-A"], - cwd=worktree_path, - check=True, - ) +def _commit_all(source_dir: Path) -> None: + """Commit the working tree so ``./mach try`` has a commit to push.""" + subprocess.run(["git", "add", "-A"], cwd=source_dir, check=True) subprocess.run( [ "git", "-c", - "user.name=bugbug", + "user.name=hackbot", "-c", - "user.email=bugbug@mozilla.com", + "user.email=hackbot@mozilla.com", "commit", + "--allow-empty", "-m", - f"Build repair fix for bug {bug_id}", + "Build repair candidate fix", ], - cwd=worktree_path, + cwd=source_dir, check=True, ) - logger.info("Bug %s: fix committed", bug_id) -def _run_subprocess( - cmd: list[str], 
worktree_path: Path, capture: bool -) -> subprocess.CompletedProcess[str]: - if capture: - return subprocess.run( - cmd, - cwd=worktree_path, - env=_mach_env(worktree_path), - capture_output=True, - text=True, - ) - return subprocess.run( - cmd, - cwd=worktree_path, - env=_mach_env(worktree_path), - text=True, - ) - - -def _run_local_build(worktree_path: Path) -> bool: - capture = not logger.isEnabledFor(logging.DEBUG) - - logger.info("Running bootstrap in %s", worktree_path) - result = _run_subprocess( - ["./mach", "--no-interactive", "bootstrap"], worktree_path, capture - ) - if result.returncode != 0: - if capture and result.stderr: - logger.warning("Bootstrap stderr:\n%s", result.stderr[-2000:]) - raise RuntimeError( - f"Local bootstrap failed with return code {result.returncode}" - ) - - logger.info("Running local build in %s", worktree_path) - result = _run_subprocess(["./mach", "build"], worktree_path, capture) - passed = result.returncode == 0 - status = "passed" if passed else "failed" - logger.info("Local build %s (returncode=%s)", status, result.returncode) - if not passed and capture and result.stderr: - logger.warning("Build stderr:\n%s", result.stderr[-2000:]) - return passed - - -def _submit_try(worktree_path: Path, task_name: str) -> tuple[str | None, str | None]: - logger.info("Submitting try push for task=%s in %s", task_name, worktree_path) +def _submit_try(source_dir: Path, task_name: str) -> tuple[str | None, str | None]: result = subprocess.run( ["./mach", "try", "fuzzy", "--query", task_name], - cwd=worktree_path, + cwd=source_dir, capture_output=True, text=True, - env=_mach_env(worktree_path), ) stdout = result.stdout + result.stderr - logger.debug("Try push output: %s", stdout) match = _LANDO_JOB_ID_RE.search(stdout) if not match: logger.warning("Could not parse Lando job ID from try output: %s", stdout) return None, None - lando_job_id = match.group(1) treeherder_url = f"{TREEHERDER_BASE_URL}/jobs?repo=try&landoCommitID={lando_job_id}" - 
logger.info( - "Try push submitted: lando_job_id=%s, treeherder=%s", - lando_job_id, - treeherder_url, - ) return lando_job_id, treeherder_url @@ -152,7 +82,7 @@ def _get_push_revision(lando_job_id: str) -> str | None: return None -def _get_push_by_revision(revision: str) -> dict | None: +def _get_push_id(revision: str) -> int | None: try: resp = requests.get( f"{TREEHERDER_BASE_URL}/api/project/try/push/", @@ -162,7 +92,7 @@ def _get_push_by_revision(revision: str) -> dict | None: ) resp.raise_for_status() results = resp.json().get("results", []) - return results[0] if results else None + return results[0]["id"] if results else None except Exception: logger.exception("Error fetching push by revision %s", revision) return None @@ -187,111 +117,100 @@ def _get_build_job_result(push_id: int, task_name: str) -> str | None: return None -def _poll_treeherder(lando_job_id: str, task_name: str) -> bool | None: - logger.info( - "Polling Treeherder for lando_job_id=%s, task=%s (timeout=%ss, interval=%ss)", - lando_job_id, - task_name, - TRY_PUSH_TIMEOUT_SECONDS, - TRY_PUSH_POLL_INTERVAL_SECONDS, - ) - deadline = time.monotonic() + TRY_PUSH_TIMEOUT_SECONDS +def _poll_treeherder( + lando_job_id: str, task_name: str, timeout_seconds: int, interval_seconds: int +) -> bool | None: + deadline = time.monotonic() + timeout_seconds push_id: int | None = None - poll_count = 0 - while time.monotonic() < deadline: - poll_count += 1 if push_id is None: revision = _get_push_revision(lando_job_id) if revision: - logger.info( - "Resolved revision=%s for lando_job_id=%s", revision, lando_job_id - ) - push = _get_push_by_revision(revision) - if push: - push_id = push["id"] - logger.info( - "Resolved push_id=%s for revision=%s", push_id, revision - ) - + push_id = _get_push_id(revision) if push_id is not None: result = _get_build_job_result(push_id, task_name) - logger.debug( - "Poll #%s: job result=%s for push_id=%s", poll_count, result, push_id - ) if result == "success": - logger.info("Try 
build succeeded for lando_job_id=%s", lando_job_id) return True if result in ("busted", "testfailed", "exception"): - logger.info( - "Try build failed (%s) for lando_job_id=%s", result, lando_job_id - ) return False - else: - logger.debug( - "Poll #%s: push not yet available for lando_job_id=%s", - poll_count, - lando_job_id, - ) - time.sleep(TRY_PUSH_POLL_INTERVAL_SECONDS) - - logger.warning( - "Try push polling timed out after %s polls for lando job %s", - poll_count, - lando_job_id, - ) + time.sleep(interval_seconds) + logger.warning("Try push polling timed out for lando job %s", lando_job_id) return None -def run_try_verification( - worktree_path: Path, - bug_id: int, +def run_try_push( + source_dir: Path, task_name: str, - skip_try_push: bool = False, -) -> TryPushResult: - logger.info( - "Starting try verification for bug %s (task=%s, skip_try_push=%s)", - bug_id, - task_name, - skip_try_push, - ) - _commit_fix(worktree_path, bug_id) - - local_passed = _run_local_build(worktree_path) - if not local_passed: - logger.warning("Bug %s: local build failed, skipping try push", bug_id) - return TryPushResult( - local_build_passed=False, - try_build_passed=None, - lando_job_id=None, - treeherder_url=None, - ) - - if skip_try_push: - logger.info( - "Bug %s: local build passed, skipping try push as requested", bug_id + poll: bool, + timeout_seconds: int, + interval_seconds: int, +) -> dict[str, Any]: + """Commit the working tree, submit a try push, and optionally poll for the result.""" + _commit_all(source_dir) + lando_job_id, treeherder_url = _submit_try(source_dir, task_name) + if not lando_job_id: + raise ToolError( + "Try push submission failed: no Lando job id in ./mach try output", + payload={"error": "try_submit_failed"}, ) - return TryPushResult( - local_build_passed=True, - try_build_passed=None, - lando_job_id=None, - treeherder_url=None, + result: dict[str, Any] = { + "submitted": True, + "lando_job_id": lando_job_id, + "treeherder_url": treeherder_url, + 
"try_build_passed": None, + } + if poll: + result["try_build_passed"] = _poll_treeherder( + lando_job_id, task_name, timeout_seconds, interval_seconds ) + return result + + +@tool +async def submit_try_push( + ctx, + task_name: Annotated[ + str, + Field( + description=( + "Treeherder task name to build/select on try, e.g. " + "'build-linux64/opt'. The failing task is the natural choice." + ) + ), + ], + poll: Annotated[ + bool, + Field( + description=( + "Poll Treeherder until the build job completes (up to timeout) " + "and report pass/fail. If false, submit and return immediately." + ) + ), + ] = True, + timeout_seconds: Annotated[ + int, Field(description="Max seconds to poll Treeherder (default 7200).") + ] = 7200, + poll_interval_seconds: Annotated[ + int, Field(description="Seconds between Treeherder polls (default 60).") + ] = 60, +) -> dict: + """Submit the current Firefox checkout to the try server and check the build. + + Commits the working tree as a candidate fix, runs ``./mach try fuzzy --query + `` to push it, and (when ``poll`` is true) watches Treeherder for + the named build job. Returns JSON: submitted (bool), lando_job_id (str), + treeherder_url (str), try_build_passed (bool|null — null when polling was + skipped or timed out). Slow: a try build can take well over an hour, so only + call this once you are confident the fix builds locally. 
+ """ + return await asyncio.to_thread( + run_try_push, + ctx.source_dir, + task_name, + poll, + timeout_seconds, + poll_interval_seconds, + ) - lando_job_id, treeherder_url = _submit_try(worktree_path, task_name) - if not lando_job_id: - logger.warning("Bug %s: try push submission failed, no lando job ID", bug_id) - return TryPushResult( - local_build_passed=True, - try_build_passed=None, - lando_job_id=None, - treeherder_url=None, - ) - try_passed = _poll_treeherder(lando_job_id, task_name) - return TryPushResult( - local_build_passed=True, - try_build_passed=try_passed, - lando_job_id=lando_job_id, - treeherder_url=treeherder_url, - ) +TRY_TOOLS = tools_in(__name__) diff --git a/agents/build-repair/pyproject.toml b/agents/build-repair/pyproject.toml new file mode 100644 index 0000000000..eb91fa3132 --- /dev/null +++ b/agents/build-repair/pyproject.toml @@ -0,0 +1,33 @@ +[project] +name = "hackbot-agent-build-repair" +version = "0.1.0" +description = "Cloud Run Job image that runs the build-repair agent for hackbot-api" +requires-python = ">=3.12" +dependencies = [ + "hackbot-runtime[claude-sdk]", + "agent-tools[bugzilla,firefox]", + "bugsy", + "claude-agent-sdk>=0.1.30", + "mcp>=1.0.0", + "starlette>=0.36.0", + "uvicorn>=0.27.0", + "requests" +] + +[project.optional-dependencies] +eval = [ + "weave", + "wandb", + "tenacity", +] + +[tool.uv.sources] +hackbot-runtime = { workspace = true } +agent-tools = { workspace = true } + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.hatch.build.targets.wheel] +packages = ["hackbot_agents", "evals/buildrepair_eval"] diff --git a/docker-compose.yml b/docker-compose.yml index cc534a3242..eedad036da 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -4,6 +4,7 @@ version: "3.8" include: - path: agents/bug-fix/compose.yml + - path: agents/build-repair/compose.yml services: bugbug-base: diff --git a/pyproject.toml b/pyproject.toml index 38fd0f8b90..075989ecea 100644 --- 
a/pyproject.toml +++ b/pyproject.toml @@ -128,7 +128,7 @@ include = ["/bugbug", "/scripts", "/VERSION"] packages = ["bugbug", "scripts"] [tool.uv.workspace] -members = ["http_service", "services/hackbot-api", "agents/bug-fix", "libs/hackbot-runtime", "libs/agent-tools"] +members = ["http_service", "services/hackbot-api", "agents/bug-fix", "agents/build-repair", "libs/hackbot-runtime", "libs/agent-tools"] [tool.uv.sources] hackbot-runtime = { workspace = true } diff --git a/services/buildrepair/Dockerfile b/services/buildrepair/Dockerfile deleted file mode 100644 index 178537d664..0000000000 --- a/services/buildrepair/Dockerfile +++ /dev/null @@ -1,34 +0,0 @@ -# Load the base image by running this from the Firefox repo: -# ./mach taskgraph load-image --task-id aQDejwXUQsSHxvwE2qQcQg -FROM debian12-amd64-build - -WORKDIR /app - -RUN apt-get update && \ - apt-get install -y git nodejs npm build-essential zlib1g-dev \ - libncurses5-dev libgdbm-dev libnss3-dev libssl-dev libreadline-dev \ - libffi-dev libsqlite3-dev wget libbz2-dev && \ - rm -rf /var/lib/apt/lists/* - -# bugbug requires Python 3.12 and there's no package for Debian 12 -RUN wget https://www.python.org/ftp/python/3.12.8/Python-3.12.8.tgz && \ - tar -xf Python-3.12.8.tgz && \ - cd Python-3.12.8 && \ - ./configure --enable-optimizations --prefix=/usr/local && \ - make -j$(nproc) && \ - make install && \ - cd .. && rm -rf Python-3.12.8 Python-3.12.8.tgz - -RUN python3.12 -m venv /opt/venv -ENV PATH="/opt/venv/bin:$PATH" - -COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv - -COPY requirements.txt /app/ -RUN uv pip install -r /app/requirements.txt - -COPY . 
/app - -ENV PYTHONPATH=/app -ENV PYTHONUNBUFFERED=1 -ENV FIREFOX_GIT_REPO=/workspace/firefox \ No newline at end of file diff --git a/services/buildrepair/README.md b/services/buildrepair/README.md deleted file mode 100644 index 3fd6603b0b..0000000000 --- a/services/buildrepair/README.md +++ /dev/null @@ -1,64 +0,0 @@ -# Build Repair Agent - -It can automatically analyze a build failure in Firefox and propose a fix. - -## Evaluation - -Weights and Biases Weave [dashboard](https://wandb.ai/moz-bugbug/bugbug-build-repair-eval/weave/evaluations). - -To run locally: - -1. Clone Firefox to a separate directory - -2. Prepare the Docker image - -Pull the base Docker image to build Firefox from Taskcluster. -From the Firefox repo run: - -```bash -./mach taskgraph load-image --task-id aQDejwXUQsSHxvwE2qQcQg -``` - -Make sure to have enough resources available for the Docker engine (at least 16gb RAM and 128GB disk, better 256GB). - -3. Set environment variables - -```bash -# Full path to the Firefox repo -export FIREFOX_GIT_REPO=$(pwd) -export ANTHROPIC_API_KEY= -export WANDB_API_KEY= -# If on Mac with ARM CPU -export DOCKER_DEFAULT_PLATFORM=linux/amd64 -``` - -4. `cd` to this repo - -5. (Optional) Prebuild the Docker image and use `image: build-repair-debian-base` in `docker-compose.dev.yml` - -```bash -docker build -t build-repair-debian-base -f services/buildrepair/Dockerfile . -``` - -6. Attach to the container by running: - -```bash -docker compose -f services/buildrepair/docker-compose.dev.yml run build-repair -``` - -7. Run the evaluation script. - -To test: - -```bash -/opt/venv/bin/python scripts/build_repair_eval.py --no-try-push --limit 1 -``` - -To run full evaluation (with 3 trials): - -```bash -/opt/venv/bin/python scripts/build_repair_eval.py --no-try-push --parallellism 8 --trials 3 -``` - -It will run each of 85 examples from the evaluation dataset 3 times. -It will build Firefox each time with the proposed fix, then write results to Weave. 
diff --git a/services/buildrepair/docker-compose.dev.yml b/services/buildrepair/docker-compose.dev.yml deleted file mode 100644 index ddd2d67364..0000000000 --- a/services/buildrepair/docker-compose.dev.yml +++ /dev/null @@ -1,18 +0,0 @@ -services: - build-repair: - # To minimize rebuilding use `DOCKER_DEFAULT_PLATFORM=linux/amd64 docker build -t build-repair-debian-base -f services/buildrepair/Dockerfile .` - # and replace the "build" section with: - # image: build-repair-debian-base - build: - context: ../.. - dockerfile: services/buildrepair/Dockerfile - volumes: - - ../../:/app # live code editing - - ${FIREFOX_GIT_REPO}:/workspace/firefox # Firefox repo - - build-repair-tmp:/tmp/build_repair_worktrees - environment: - - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY} - - WANDB_API_KEY=${WANDB_API_KEY} # for weave - - FIREFOX_GIT_REPO=/workspace/firefox -volumes: - build-repair-tmp: diff --git a/services/buildrepair/pyproject.toml b/services/buildrepair/pyproject.toml deleted file mode 100644 index 621735a0f9..0000000000 --- a/services/buildrepair/pyproject.toml +++ /dev/null @@ -1,12 +0,0 @@ -[project] -name = "bugbug-build-repair" -dynamic = ["version"] -description = "BugBug Build Repair Agent" -requires-python = ">=3.12" -dependencies = [ - "bugbug", -] - -[tool.uv.sources] -bugbug = { path = "../..", editable = true } - diff --git a/services/hackbot-api/app/agents.py b/services/hackbot-api/app/agents.py index 5802f6f511..e364e31922 100644 --- a/services/hackbot-api/app/agents.py +++ b/services/hackbot-api/app/agents.py @@ -4,7 +4,7 @@ from pydantic import BaseModel -from app.schemas import BugFixInputs +from app.schemas import BugFixInputs, BuildRepairInputs @dataclass(frozen=True) @@ -48,4 +48,10 @@ def model_to_env(inputs: BaseModel) -> dict[str, str]: job_name="hackbot-agent-bug-fix", input_schema=BugFixInputs, ), + "build-repair": AgentSpec( + name="build-repair", + description="Analyze a Firefox build failure at a specific commit and produce a candidate fix 
patch.", + job_name="hackbot-agent-build-repair", + input_schema=BuildRepairInputs, + ), } diff --git a/services/hackbot-api/app/schemas.py b/services/hackbot-api/app/schemas.py index 36ad0f9b17..411d56216c 100644 --- a/services/hackbot-api/app/schemas.py +++ b/services/hackbot-api/app/schemas.py @@ -67,3 +67,12 @@ class BugFixInputs(BaseModel): model: str | None = None max_turns: int | None = None effort: str | None = None + + +class BuildRepairInputs(BaseModel): + bug_id: int | None = None + git_commit: str + failure_tasks: dict[str, str] + run_try_push: bool = False + model: str | None = None + max_turns: int | None = None diff --git a/services/hackbot-api/tests/test_agents.py b/services/hackbot-api/tests/test_agents.py index c99c9d4689..6e06f94412 100644 --- a/services/hackbot-api/tests/test_agents.py +++ b/services/hackbot-api/tests/test_agents.py @@ -1,7 +1,9 @@ """Tests for the agent registry and generic env serialization.""" +import json + from app.agents import AGENT_REGISTRY, model_to_env -from app.schemas import BugFixInputs +from app.schemas import BugFixInputs, BuildRepairInputs def test_model_to_env_uppercases_and_stringifies(): @@ -30,3 +32,22 @@ def test_bug_fix_registry_uses_default_env_serializer(): # No hand-written build_env: the router falls back to model_to_env. 
assert spec.build_env is None assert spec.input_schema is BugFixInputs + + +def test_build_repair_registry_entry(): + spec = AGENT_REGISTRY["build-repair"] + assert spec.build_env is None + assert spec.input_schema is BuildRepairInputs + assert spec.job_name == "hackbot-agent-build-repair" + + +def test_model_to_env_json_encodes_failure_tasks_and_bool(): + tasks = {"build-linux64/opt": "OyF95j0oQ-CF_YuBM1b7vg"} + env = model_to_env( + BuildRepairInputs( + bug_id=1, git_commit="deadbeef", failure_tasks=tasks, run_try_push=True + ) + ) + assert env["GIT_COMMIT"] == "deadbeef" + assert json.loads(env["FAILURE_TASKS"]) == tasks + assert env["RUN_TRY_PUSH"] == "True" diff --git a/uv.lock b/uv.lock index 515eabce8f..b45074881c 100644 --- a/uv.lock +++ b/uv.lock @@ -22,6 +22,7 @@ members = [ "bugbug", "bugbug-http-service", "hackbot-agent-bug-fix", + "hackbot-agent-build-repair", "hackbot-api", "hackbot-runtime", ] @@ -1739,6 +1740,30 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/22/70/037e1fed8c40b185c8769e137c3be2ab3c19766471462514cd09d0eb022e/fxpoppet-0.4.1-py3-none-any.whl", hash = "sha256:f8d75e5a3b128aa7e78f6a93c2c60443f163694607027809c1acb279c754aaef", size = 49288, upload-time = "2025-11-19T21:58:39.789Z" }, ] +[[package]] +name = "gitdb" +version = "4.0.12" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "smmap" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/72/94/63b0fc47eb32792c7ba1fe1b694daec9a63620db1e313033d18140c2320a/gitdb-4.0.12.tar.gz", hash = "sha256:5ef71f855d191a3326fcfbc0d5da835f26b13fbcba60c32c21091c349ffdb571", size = 394684, upload-time = "2025-01-02T07:20:46.413Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a0/61/5c78b91c3143ed5c14207f463aecfc8f9dbb5092fb2869baf37c273b2705/gitdb-4.0.12-py3-none-any.whl", hash = "sha256:67073e15955400952c6565cc3e707c554a4eea2e428946f7a4c162fab9bd9bcf", size = 62794, upload-time = "2025-01-02T07:20:43.624Z" }, +] + +[[package]] 
+name = "gitpython" +version = "3.1.50" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "gitdb" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/33/f6/354ae6491228b5eb40e10d89c4d13c651fe1cf7556e35ebdded50cff57ce/gitpython-3.1.50.tar.gz", hash = "sha256:80da2d12504d52e1f998772dc5baf6e553f8d2fcfe1fcc226c9d9a2ee3372dcc", size = 219798, upload-time = "2026-05-06T04:01:26.571Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/20/7a/1c6e3562dfd8950adbb11ffbc65d21e7c89d01a6e4f137fa981056de25c5/gitpython-3.1.50-py3-none-any.whl", hash = "sha256:d352abe2908d07355014abdd21ddf798c2a961469239afec4962e9da884858f9", size = 212507, upload-time = "2026-05-06T04:01:23.799Z" }, +] + [[package]] name = "google-api-core" version = "2.30.3" @@ -2156,6 +2181,44 @@ requires-dist = [ { name = "uvicorn", specifier = ">=0.27.0" }, ] +[[package]] +name = "hackbot-agent-build-repair" +version = "0.1.0" +source = { editable = "agents/build-repair" } +dependencies = [ + { name = "agent-tools", extra = ["bugzilla", "firefox"] }, + { name = "bugsy" }, + { name = "claude-agent-sdk" }, + { name = "hackbot-runtime", extra = ["claude-sdk"] }, + { name = "mcp" }, + { name = "requests" }, + { name = "starlette" }, + { name = "uvicorn" }, +] + +[package.optional-dependencies] +eval = [ + { name = "tenacity" }, + { name = "wandb" }, + { name = "weave" }, +] + +[package.metadata] +requires-dist = [ + { name = "agent-tools", extras = ["bugzilla", "firefox"], editable = "libs/agent-tools" }, + { name = "bugsy" }, + { name = "claude-agent-sdk", specifier = ">=0.1.30" }, + { name = "hackbot-runtime", extras = ["claude-sdk"], editable = "libs/hackbot-runtime" }, + { name = "mcp", specifier = ">=1.0.0" }, + { name = "requests" }, + { name = "starlette", specifier = ">=0.36.0" }, + { name = "tenacity", marker = "extra == 'eval'" }, + { name = "uvicorn", specifier = ">=0.27.0" }, + { name = "wandb", marker = "extra == 'eval'" }, + { name = "weave", 
marker = "extra == 'eval'" }, +] +provides-extras = ["eval"] + [[package]] name = "hackbot-api" version = "0.1.0" @@ -5823,6 +5886,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/14/78/0f68b93564b8c6b6987a0696c582ba2591a381ab2f733a501909e949f241/smart_open-7.6.1-py3-none-any.whl", hash = "sha256:b4de6aebef023aca91cc9fb372052e1343ba3f152de215bd22391a663e3ddd21", size = 64845, upload-time = "2026-05-09T06:23:35.386Z" }, ] +[[package]] +name = "smmap" +version = "5.0.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/1f/ea/49c993d6dfdd7338c9b1000a0f36817ed7ec84577ae2e52f890d1a4ff909/smmap-5.0.3.tar.gz", hash = "sha256:4d9debb8b99007ae47165abc08670bd74cb74b5227dda7f643eccc4e9eb5642c", size = 22506, upload-time = "2026-03-09T03:43:26.1Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c1/d4/59e74daffcb57a07668852eeeb6035af9f32cbfd7a1d2511f17d2fe6a738/smmap-5.0.3-py3-none-any.whl", hash = "sha256:c106e05d5a61449cf6ba9a1e650227ecfb141590d2a98412103ff35d89fc7b2f", size = 24390, upload-time = "2026-03-09T03:43:24.361Z" }, +] + [[package]] name = "sniffio" version = "1.3.1" @@ -6492,6 +6564,35 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/66/c3/f8b216cbd742e5b84c40f045204c764ccb7524d2aeab021054ec69446b0a/w3lib-2.4.1-py3-none-any.whl", hash = "sha256:40930132907e68de906a5b89331ab8c8ff4f01bd35b5539ef7896017d814138d", size = 21695, upload-time = "2026-03-20T09:50:26.187Z" }, ] +[[package]] +name = "wandb" +version = "0.27.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "click" }, + { name = "gitpython" }, + { name = "packaging" }, + { name = "platformdirs" }, + { name = "protobuf" }, + { name = "pydantic" }, + { name = "pyyaml" }, + { name = "requests" }, + { name = "sentry-sdk" }, + { name = "typing-extensions" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/14/a2/53ca062f430178e3af48ebc137396481d0ee885fb94a554c0df464cd8afa/wandb-0.27.2.tar.gz", hash = "sha256:c81ff93ab63f4dabc5a27b90ac3d12310fbfa6a14ca99201626921c99b2845be", size = 40300451, upload-time = "2026-06-06T01:47:02.74Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/4b/95/18d3625558667b459d91c19630f7cecfbc133f87f5b144a7fb755e473e8c/wandb-0.27.2-py3-none-macosx_12_0_arm64.whl", hash = "sha256:978400b3c4b7d97e927c32264453da5e4a0040a3468d5b77a00d9c480613f370", size = 23990048, upload-time = "2026-06-06T01:46:38.902Z" }, + { url = "https://files.pythonhosted.org/packages/43/14/72c26f67b0b6cb307cbb76659465c6ab7d99ea27c268d1b4f5aa82c4d8e5/wandb-0.27.2-py3-none-macosx_12_0_x86_64.whl", hash = "sha256:de8099f02f540c743069617db7d034511a64c193748783aa6d2d98310918d170", size = 25165812, upload-time = "2026-06-06T01:46:42.068Z" }, + { url = "https://files.pythonhosted.org/packages/b5/a3/f9fe31ca72b4f5854d1e488403d6310783127a6b7e267c28577e9bd51b43/wandb-0.27.2-py3-none-manylinux_2_28_aarch64.whl", hash = "sha256:e0779592410215a2762063c3585d3dcad73c7dca9cb6d63c4dcc1588267c1392", size = 24554366, upload-time = "2026-06-06T01:46:44.57Z" }, + { url = "https://files.pythonhosted.org/packages/76/e2/7a5064aba235ddb855b8c2250e07e6187fcc8382332e237e545d4de094ee/wandb-0.27.2-py3-none-manylinux_2_28_x86_64.whl", hash = "sha256:55bfebf4d382116a8e9610848cadc0de50d406bacd3d0a390d12dabde196f009", size = 26380293, upload-time = "2026-06-06T01:46:47.487Z" }, + { url = "https://files.pythonhosted.org/packages/ed/4c/0c845edac5ff0fd0930e881bec2569f2e2af2a4fc873249855600546eee0/wandb-0.27.2-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:566aa2fcd67d2a23c08713da75e9daf82f30f7136af76763ef1d7db3d901d940", size = 24728823, upload-time = "2026-06-06T01:46:49.887Z" }, + { url = "https://files.pythonhosted.org/packages/e1/7a/a6f7a02a0e6bf73e163b61caca03aaba3452836a02dbe2b64f9e1a3c6afc/wandb-0.27.2-py3-none-musllinux_1_2_x86_64.whl", 
hash = "sha256:41158181fc5b691438b3d04fee0a8c061e3f1f407a3258096afbebfe1db24e72", size = 26691957, upload-time = "2026-06-06T01:46:52.384Z" }, + { url = "https://files.pythonhosted.org/packages/e0/9b/aa94eb8265b0c55dc6c3e435c11241b3f885c7a1720718046efd7cbd8361/wandb-0.27.2-py3-none-win32.whl", hash = "sha256:5c55fad8c7be9d345dcebdc9dc10f7d2ac5af5bede62acbcd79a412ccaf48c87", size = 24151396, upload-time = "2026-06-06T01:46:54.793Z" }, + { url = "https://files.pythonhosted.org/packages/bf/0b/9e442779f5f24baaca044daf7546a735f41a811886b84ae12740d51b7f9d/wandb-0.27.2-py3-none-win_amd64.whl", hash = "sha256:87204d4fe40fbd9a1fe89a05927ce4ddb8be34d8210045457819fb4a35e0bcea", size = 24151404, upload-time = "2026-06-06T01:46:56.994Z" }, + { url = "https://files.pythonhosted.org/packages/aa/3a/01ab3afc1f6f962df93db34be8183147c9821e4dce82dc03313fb8d08635/wandb-0.27.2-py3-none-win_arm64.whl", hash = "sha256:32ed7456f40443c971e95dd63704d840fce66c24f88049a9bda8a09dfe85effe", size = 22063373, upload-time = "2026-06-06T01:47:00.263Z" }, +] + [[package]] name = "wasabi" version = "1.1.3" From 3c626dd92b39a216ff213d5c283cb24b32651824 Mon Sep 17 00:00:00 2001 From: Evgeny Pavlov Date: Wed, 24 Jun 2026 17:52:34 -0700 Subject: [PATCH 08/15] Move eval.py to evals/ via rename Preserves file history; the prior move staged it as add+delete. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- agents/build-repair/evals/{buildrepair_eval => }/eval.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename agents/build-repair/evals/{buildrepair_eval => }/eval.py (100%) diff --git a/agents/build-repair/evals/buildrepair_eval/eval.py b/agents/build-repair/evals/eval.py similarity index 100% rename from agents/build-repair/evals/buildrepair_eval/eval.py rename to agents/build-repair/evals/eval.py From d68aa6c6b8a512094c5c643cb28152476f2a60ee Mon Sep 17 00:00:00 2001 From: Evgeny Pavlov Date: Wed, 24 Jun 2026 17:52:42 -0700 Subject: [PATCH 09/15] Rewrite eval.py for Hackbot migration Co-Authored-By: Claude Opus 4.8 (1M context) --- agents/build-repair/evals/eval.py | 388 ++++++++---------------------- 1 file changed, 104 insertions(+), 284 deletions(-) diff --git a/agents/build-repair/evals/eval.py b/agents/build-repair/evals/eval.py index 0dff16178e..679422c69d 100644 --- a/agents/build-repair/evals/eval.py +++ b/agents/build-repair/evals/eval.py @@ -3,228 +3,86 @@ # License, v. 2.0. If a copy of the MPL was not distributed with this file, # You can obtain one at http://mozilla.org/MPL/2.0/. -"""Standalone CLI for build repair evaluation. +"""Build-repair evaluation harness. + +Runs the ported hackbot build-repair agent (``run_build_repair``) over a Weave +dataset of Firefox build failures, then scores its output: deterministic build +verification plus an LLM-as-a-judge comparison to the landed fix. 
Usage: - python scripts/build_repair_eval.py - python scripts/build_repair_eval.py --analysis-only - python scripts/build_repair_eval.py --trials 3 - python scripts/build_repair_eval.py --limit 5 - python scripts/build_repair_eval.py --parallelism 4 - python scripts/build_repair_eval.py --no-try-push - python scripts/build_repair_eval.py --verbose + python -m evals.eval --no-try-push --limit 1 + python -m evals.eval --trials 3 --parallelism 8 """ +from __future__ import annotations + import argparse import asyncio -import json import logging import os +import subprocess +import tempfile import uuid -from datetime import datetime from functools import cached_property -from typing import Any +from pathlib import Path +import bugsy import weave - -from bugbug.tools.build_repair.agent import ( - AgentResponse, - BuildFailure, - BuildRepairTool, - GroundTruth, -) -from bugbug.tools.build_repair.config import MODEL_CUTOFF_DATES -from bugbug.tools.build_repair.scorer import ( +from agent_tools import bugzilla +from agent_tools.bugzilla import BugzillaContext +from agent_tools.claude_sdk import build_sdk_server +from agent_tools.firefox import FirefoxContext +from agent_tools.firefox.tools.build_firefox import build_firefox +from hackbot_agents.build_repair.agent import run_build_repair +from hackbot_agents.build_repair.config import ANALYSIS_MODEL, FIX_MODEL + +from .scorer import ( BasicMetricsScorer, BuildPassRateScorer, LLMFixMatchingScorer, ) -from bugbug.tools.build_repair.worktree import WorktreeManager +from .verify import VERIFY_MODEL, GroundTruth, is_data_contaminated, run_verify +from .worktree import WorktreeManager logger = logging.getLogger(__name__) -# TODO: replace with native tracing for Anthropic Agents SDK when released by W&B - -def _attr(obj, key, default=None): - if isinstance(obj, dict): - return obj.get(key, default) - return getattr(obj, key, default) +def _collect_diff(worktree_path: Path, base_commit: str) -> str: + subprocess.run(["git", "add", 
"-A"], cwd=worktree_path, capture_output=True) + result = subprocess.run( + ["git", "diff", "--staged", base_commit], + cwd=worktree_path, + capture_output=True, + text=True, + ) + return result.stdout -def _to_chat_message(data: dict) -> dict | None: - """Convert a serialized claude_agent_sdk message to OpenAI chat format. +def _bugzilla_server(): + """Bugzilla MCP server for the agent. - Content blocks may be dicts (from model_dump) or dataclass instances - (from vars), so we use _attr() for uniform access. + Prefer the broker (``BUGZILLA_MCP_URL``) so the eval container holds no + Bugzilla credentials -- same isolation as production. Falls back to an + in-process server for local runs without a broker. """ - msg_type = data.get("type", "") - - if msg_type == "AssistantMessage": - blocks = data.get("content", []) - text_parts = [] - tool_calls = [] - for block in blocks: - text = _attr(block, "text") - if text is not None: - text_parts.append(text) - continue - name = _attr(block, "name") - block_id = _attr(block, "id") - if name is not None and block_id is not None: - tool_calls.append( - { - "id": block_id, - "type": "function", - "function": { - "name": name, - "arguments": json.dumps(_attr(block, "input", {})), - }, - } - ) - if not text_parts and not tool_calls: - return None - msg: dict = {"role": "assistant"} - if text_parts: - msg["content"] = "\n".join(text_parts) - if tool_calls: - msg["tool_calls"] = tool_calls - return msg - - if msg_type == "UserMessage": - content = data.get("content", "") - if isinstance(content, list): - for block in content: - tool_use_id = _attr(block, "tool_use_id") - if tool_use_id: - block_content = _attr(block, "content", "") - return { - "role": "tool", - "tool_call_id": tool_use_id, - "content": str(block_content) if block_content else "", - } - - return None - - -@weave.op(kind="llm") -def trace_llm_stage( - stage: str, - messages: list[dict], - model: str, - result_data: dict | None = None, -) -> dict: - last_assistant = 
"" - for msg in reversed(messages): - if msg.get("role") == "assistant" and msg.get("content"): - last_assistant = msg["content"] - break - - result: dict[str, Any] = { - "model": model, - "choices": [ - { - "message": {"role": "assistant", "content": last_assistant}, - } - ], - } - if result_data: - raw_usage = result_data.get("usage", {}) or {} - input_tokens = raw_usage.get("input_tokens", 0) - output_tokens = raw_usage.get("output_tokens", 0) - result["usage"] = { - "prompt_tokens": input_tokens, - "completion_tokens": output_tokens, - "total_tokens": input_tokens + output_tokens, - "cache_read_input_tokens": raw_usage.get("cache_read_input_tokens", 0), - "cache_creation_input_tokens": raw_usage.get( - "cache_creation_input_tokens", 0 - ), - "total_cost_usd": result_data.get("total_cost_usd", 0), - "num_turns": result_data.get("num_turns", 0), - } - return result - - -# Per-token costs in USD (standard, non-cached rates). -# Weave uses these for its built-in cost UI; the SDK's total_cost_usd -# (which accounts for cache pricing) is tracked separately as the authoritative cost. 
-ANTHROPIC_TOKEN_COSTS: dict[str, tuple[float, float]] = { - "claude-opus-4-6": (15.0e-6, 75.0e-6), - "claude-sonnet-4-6": (3.0e-6, 15.0e-6), - "claude-haiku-4-5-20251001": (0.8e-6, 4.0e-6), - "claude-sonnet-4-5-20250929": (3.0e-6, 15.0e-6), - "claude-opus-4-5-20251101": (15.0e-6, 75.0e-6), - "claude-opus-4-1-20250805": (15.0e-6, 75.0e-6), - "claude-sonnet-4-20250514": (3.0e-6, 15.0e-6), - "claude-3-7-sonnet-20250219": (3.0e-6, 15.0e-6), - "claude-opus-4-20250514": (15.0e-6, 75.0e-6), -} - - -def _register_model_costs(client) -> None: - for model_id, (prompt_cost, completion_cost) in ANTHROPIC_TOKEN_COSTS.items(): - try: - client.add_cost( - llm_id=model_id, - prompt_token_cost=prompt_cost, - completion_token_cost=completion_cost, - ) - except Exception as e: - logger.debug("Could not register cost for %s: %s", model_id, e) - - -def _make_weave_callback(): - stages: dict[str, dict] = {} - - def on_message(stage: str, data: dict) -> None: - msg_type = data["type"] - if msg_type == "stage_start": - messages = [] - if "system_prompt" in data: - messages.append({"role": "system", "content": data["system_prompt"]}) - messages.append({"role": "user", "content": data["prompt"]}) - - stages[stage] = { - "model": data["model"], - "messages": messages, - } - elif msg_type == "stage_end": - if stage in stages: - s = stages.pop(stage) - trace_llm_stage( - stage=stage, - messages=s["messages"], - model=s["model"], - result_data=data.get("result_data") or None, - ) - else: - if stage in stages: - chat_msg = _to_chat_message(data) - if chat_msg: - stages[stage]["messages"].append(chat_msg) - - return on_message - - -class BuildRepairError(Exception): - """Raised when the agent completes but reports an error.""" - - def __init__(self, output: dict): - self.output = output - super().__init__(output.get("error", "Unknown error")) + mcp_url = os.environ.get("BUGZILLA_MCP_URL") + if mcp_url: + return {"type": "http", "url": mcp_url} + client = bugsy.Bugsy( + 
bugzilla_url=os.environ.get( + "BUGZILLA_API_URL", "https://bugzilla.mozilla.org/rest" + ), + api_key=os.environ.get("BUGZILLA_API_KEY"), + ) + return build_sdk_server("bugzilla", BugzillaContext(client=client), bugzilla.TOOLS) class BuildRepairModel(weave.Model): - """Weave Model wrapper that creates a worktree per example and runs BuildRepairTool.""" + """Weave Model: one worktree per example, runs the ported build-repair agent.""" firefox_repo: str - analysis_only: bool = False no_try_push: bool = False - - @cached_property - def tool(self) -> BuildRepairTool: - return BuildRepairTool.create(analysis_only=self.analysis_only, eval_mode=True) + judge_model: str = VERIFY_MODEL @cached_property def worktree_mgr(self) -> WorktreeManager: @@ -241,78 +99,64 @@ async def invoke( fix_commit_date: str, **kwargs, ) -> dict: - wt_name = f"bug-{bug_id}-{uuid.uuid4().hex[:8]}" - logger.info( - "Invoking bug %s (commit=%s, %s failures)", - bug_id, - gh_failure_commits[0][:12], - len(failures), - ) - - worktree_created = False - try: - cutoff = max( - MODEL_CUTOFF_DATES[self.tool.analysis_model], - MODEL_CUTOFF_DATES[self.tool.fix_model], + if is_data_contaminated(fix_commit_date, ANALYSIS_MODEL, FIX_MODEL): + logger.warning( + "Skipping bug %s: fix date %s precedes model cutoff", + bug_id, + fix_commit_date, ) - if datetime.fromisoformat(fix_commit_date).date() < cutoff: - logger.warning( - "Skipping bug %s: fix date %s is before model cutoff %s", - bug_id, - fix_commit_date, - cutoff, - ) - raise ValueError("skipped_data_contamination") - - worktree_path = self.worktree_mgr.create(gh_failure_commits[0], wt_name) - worktree_created = True + raise ValueError("skipped_data_contamination") - on_message = _make_weave_callback() - failure = BuildFailure( + failure_commit = gh_failure_commits[0] + wt_name = f"bug-{bug_id}-{uuid.uuid4().hex[:8]}" + worktree_path = self.worktree_mgr.create(failure_commit, wt_name) + try: + fx_ctx = FirefoxContext.from_source_repo(worktree_path) + 
result = await run_build_repair( + bugzilla_mcp_server=_bugzilla_server(), + source_repo=worktree_path, + fx_ctx=fx_ctx, bug_id=bug_id, - bug_title=pre_fix_bug["title"], - bug_comments=pre_fix_bug["comments"], - git_commit=gh_failure_commits[0], - failure_tasks=failures, - ) - result: AgentResponse = await self.tool.run( - failure, - worktree_path=worktree_path, - skip_try_push=self.no_try_push, - on_message=on_message, - ) - logger.info( - "Bug %s completed: error=%s, diff_len=%s, cost=$%.4f, turns=%s, " - "local_build=%s, try_build=%s", - bug_id, - result.error, - len(result.diff), - result.cost_usd, - result.num_turns, - result.local_build_passed, - result.try_build_passed, + git_commit=failure_commit, + failure_tasks={f["task_name"]: f["task_id"] for f in failures}, + run_try_push=not self.no_try_push, ) - output = result.model_dump() + diff = _collect_diff(worktree_path, failure_commit) + output: dict = { + "error": None, + "diff": diff, + "cost_usd": result.total_cost_usd or 0.0, + "num_turns": result.num_turns, + "local_build_passed": None, + "try_build_passed": result.try_build_passed, + } - if result.analysis or result.summary: - ground_truth = GroundTruth(gh_fix_commits=gh_fix_commits) - verify_result = await self.tool.verify( - failure, - result.diff, - ground_truth, - worktree_path, - on_message, + if diff.strip(): + build_result = await build_firefox( + worktree_path, fx_ctx.mozconfig, fx_ctx.objdir ) - output["verify"] = verify_result.model_dump() + output["local_build_passed"] = build_result["success"] - if result.error: - raise BuildRepairError(output) + scratch_out = Path(tempfile.mkdtemp(prefix=f"verify-{bug_id}-")) + (scratch_out / "analysis.md").write_text(result.analysis) + (scratch_out / "summary.md").write_text(result.summary) + judgment, judge_cost = await run_verify( + worktree_path=worktree_path, + scratch_out=scratch_out, + bug_id=bug_id, + failure_commit=failure_commit, + ground_truth=GroundTruth(gh_fix_commits=gh_fix_commits), + 
agent_diff=diff, + model=self.judge_model, + ) + output["verify"] = { + "judgment": judgment.model_dump(), + "cost_usd": judge_cost, + } return output finally: - if worktree_created: - logger.info("Bug %s: cleaning up worktree %s", bug_id, wt_name) - self.worktree_mgr.cleanup(wt_name) + self.worktree_mgr.cleanup(wt_name) def main() -> None: @@ -322,7 +166,7 @@ def main() -> None: parser.add_argument("--parallelism", type=int, default=8) parser.add_argument("--firefox-repo", default=os.environ.get("FIREFOX_GIT_REPO")) parser.add_argument("--dataset", default="build_repair_one_commit_eval") - parser.add_argument("--analysis-only", action="store_true") + parser.add_argument("--judge-model", default=VERIFY_MODEL) parser.add_argument("--no-try-push", action="store_true") parser.add_argument("--verbose", action="store_true", help="Enable DEBUG logging") args = parser.parse_args() @@ -330,52 +174,28 @@ def main() -> None: if not args.firefox_repo: parser.error("--firefox-repo or FIREFOX_GIT_REPO env var is required") - log_level = logging.DEBUG if args.verbose else logging.INFO logging.basicConfig( - level=log_level, + level=logging.DEBUG if args.verbose else logging.INFO, format="%(asctime)s %(levelname)s %(name)s: %(message)s", ) - if not args.verbose: - logging.getLogger("httpx").setLevel(logging.WARNING) - logging.getLogger("httpcore").setLevel(logging.WARNING) - logging.getLogger("hgitaly").setLevel(logging.WARNING) - logging.getLogger("urllib3").setLevel(logging.WARNING) - - logger.info( - "Starting evaluation: dataset=%s, limit=%s, trials=%s, parallelism=%s, " - "analysis_only=%s, no_try_push=%s, firefox_repo=%s", - args.dataset, - args.limit, - args.trials, - args.parallelism, - args.analysis_only, - args.no_try_push, - args.firefox_repo, - ) os.environ["WEAVE_PARALLELISM"] = str(args.parallelism) - os.environ["WEAVE_LOG_LEVEL"] = "INFO" if args.verbose else "WARNING" - client = weave.init("bugbug-build-repair-eval") - _register_model_costs(client) + 
weave.init("bugbug-build-repair-eval") dataset = weave.ref(args.dataset).get() - logger.info("Loaded dataset %s with %s rows", args.dataset, len(dataset.rows)) if args.limit: dataset.rows = dataset.rows[: args.limit] - logger.info("Limited to %s rows", len(dataset.rows)) + logger.info("Loaded dataset %s (%s rows)", args.dataset, len(dataset.rows)) scorers = [ BasicMetricsScorer(num_trials=args.trials), + BuildPassRateScorer(num_trials=args.trials), LLMFixMatchingScorer(num_trials=args.trials), ] - if not args.analysis_only: - scorers.insert(1, BuildPassRateScorer(num_trials=args.trials)) - logger.info("Scorers: %s", [type(s).__name__ for s in scorers]) - model = BuildRepairModel( firefox_repo=args.firefox_repo, - analysis_only=args.analysis_only, no_try_push=args.no_try_push, + judge_model=args.judge_model, ) evaluation = weave.Evaluation( name="build-repair", From f92c51667798a4bcfca7b8155083a414ea238420 Mon Sep 17 00:00:00 2001 From: Evgeny Pavlov Date: Wed, 24 Jun 2026 17:53:52 -0700 Subject: [PATCH 10/15] Migrate build repair evals to Hackbot --- agents/build-repair/Dockerfile | 30 ++++ agents/build-repair/README.md | 52 ++++++ agents/build-repair/compose.yml | 34 +++- .../build-repair/evals}/__init__.py | 0 .../build-repair/evals}/scorer.py | 18 -- agents/build-repair/evals/verify.py | 162 ++++++++++++++++++ .../build-repair/evals}/worktree.py | 11 +- agents/build-repair/pyproject.toml | 2 +- docker-compose.yml | 1 + 9 files changed, 286 insertions(+), 24 deletions(-) create mode 100644 agents/build-repair/README.md rename {bugbug/tools/build_repair => agents/build-repair/evals}/__init__.py (100%) rename {bugbug/tools/build_repair => agents/build-repair/evals}/scorer.py (87%) create mode 100644 agents/build-repair/evals/verify.py rename {bugbug/tools/build_repair => agents/build-repair/evals}/worktree.py (88%) diff --git a/agents/build-repair/Dockerfile b/agents/build-repair/Dockerfile index fa3872a15e..a6de686d05 100644 --- a/agents/build-repair/Dockerfile 
+++ b/agents/build-repair/Dockerfile @@ -53,3 +53,33 @@ USER broker EXPOSE 8765 CMD ["python", "-m", "hackbot_agents.build_repair.broker"] + +# Evaluation image: the production agent image plus the eval-only Python deps +# (weave, wandb, tenacity). Deriving FROM agent means the harness runs the agent +# in the identical production runtime (same user, HOME, PATH, mach toolchain). The +# prod `agent` target never pulls these deps, since this stage builds only when +# targeted. +FROM agent AS eval + +USER root +COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/ + +# Add the `eval` extra into the existing venv. `--no-install-workspace` avoids +# rebuilding the already-installed agent package; `--inexact` keeps it (and every +# other prod package) rather than pruning it as extraneous. +RUN --mount=type=cache,target=/root/.cache/uv \ + --mount=type=bind,source=pyproject.toml,target=pyproject.toml \ + --mount=type=bind,source=uv.lock,target=uv.lock \ + --mount=type=bind,source=VERSION,target=VERSION \ + UV_PROJECT_ENVIRONMENT=/opt/venv \ + uv sync --frozen --no-dev --no-install-workspace --inexact --extra eval --package hackbot-agent-build-repair + +# The harness creates worktrees and runs `./mach build` against a bind-mounted +# Firefox checkout owned by a different uid; allow git to operate on it. +RUN git config --system --add safe.directory '*' + +USER agent +ENV FIREFOX_GIT_REPO=/firefox + +ENTRYPOINT ["python", "-m", "evals.eval"] +CMD ["--no-try-push", "--limit", "1"] diff --git a/agents/build-repair/README.md b/agents/build-repair/README.md new file mode 100644 index 0000000000..d0546cfc1d --- /dev/null +++ b/agents/build-repair/README.md @@ -0,0 +1,52 @@ +# Build Repair Agent + +Two-stage Claude agent that diagnoses a Firefox build failure and edits the source +tree to fix it. Agent logic in `hackbot_agents/build_repair/`; the Weave eval +harness in `evals/`. 
+ +Run the Docker commands below from this folder, with secrets in a local `.env` +(`ANTHROPIC_API_KEY`, `BUGZILLA_API_KEY`, plus `WANDB_API_KEY` for evals). + +## Test the agent + +```sh +BUG_ID=1987675 GIT_COMMIT=5477e3882d4e18f93de9f56b31e90533fd23b0d1 \ +FAILURE_TASKS='{"build-linux":"XyU4b_BIRdO_IeK6z_kcQg"}' \ + docker compose up build-repair-agent --build +``` + +Artifacts are written to `~/hackbot/artifacts/`. + +## Run evals + +Each dataset row is a Firefox build failure; per trial the harness runs the agent +on a git worktree at the failure commit, builds the fix, and LLM-judges it against +the landed commits. Needs a bootstrapped Firefox checkout. + +Local: + +```sh +FIREFOX_GIT_REPO=/path/to/firefox \ + uv run --package hackbot-agent-build-repair --extra eval \ + python -m evals.eval --no-try-push --limit 1 +``` + +Docker (reuses the broker, so no Bugzilla creds in the eval container): + +```sh +FIREFOX_GIT_REPO=/path/to/firefox \ + docker compose run --rm build-repair-eval --no-try-push --limit 1 +``` + +Flags: `--trials N`, `--parallelism N`, `--judge-model `, `--dataset `, +`--no-try-push`, `--verbose`. + +The agent reads the bug live from Bugzilla, so the harness skips examples whose fix +landed before the production model's training cutoff (`MODEL_CUTOFF_DATES` in +`evals/verify.py`) to avoid contamination. + +## W&B metrics + +`weave.init` + `weave.Evaluation` log success and diff rates, local and try build +pass rates, LLM fix-matching (analysis/fix quality, ground-truth match, +acceptance), and `total_cost_usd`. diff --git a/agents/build-repair/compose.yml b/agents/build-repair/compose.yml index c6e63839a1..ce91cfabe8 100644 --- a/agents/build-repair/compose.yml +++ b/agents/build-repair/compose.yml @@ -15,11 +15,15 @@ services: context: ../.. 
dockerfile: agents/build-repair/Dockerfile target: agent + # Per-run inputs are not `:?`-required: the eval service shares this file and + # Compose interpolates every service regardless of which one is started, so a + # required var here would break `run build-repair-eval`. pydantic AgentInputs + # still validates them at runtime. environment: - RUN_ID - - BUG_ID=${BUG_ID:?error} - - GIT_COMMIT=${GIT_COMMIT:?error} - - FAILURE_TASKS=${FAILURE_TASKS:?error} + - BUG_ID=${BUG_ID:-} + - GIT_COMMIT=${GIT_COMMIT:-} + - FAILURE_TASKS=${FAILURE_TASKS:-} - RUN_TRY_PUSH=${RUN_TRY_PUSH:-false} - BUGZILLA_MCP_URL=http://build-repair-broker:8765/mcp - SOURCE_REPO=/workspace/firefox @@ -34,5 +38,29 @@ services: build-repair-broker: condition: service_started + # Evaluation harness (profile-gated so it stays out of the default lifecycle). + # Reuses the broker for Bugzilla, so this container holds no Bugzilla creds -- + # same isolation as the agent. The Firefox checkout is bind-mounted (faster than + # cloning); the harness creates worktrees from it and builds each fix. Run with: + # FIREFOX_GIT_REPO=/path docker compose --env-file .env \ + # -f agents/build-repair/compose.yml run --rm build-repair-eval \ + # --no-try-push --limit 1 + build-repair-eval: + profiles: ["eval"] + build: + context: ../.. 
+ dockerfile: agents/build-repair/Dockerfile + target: eval + environment: + - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:?error} + - WANDB_API_KEY=${WANDB_API_KEY:-} + - BUGZILLA_MCP_URL=http://build-repair-broker:8765/mcp + - FIREFOX_GIT_REPO=/firefox + volumes: + - ${FIREFOX_GIT_REPO:-/firefox}:/firefox + depends_on: + build-repair-broker: + condition: service_started + volumes: workspace: diff --git a/bugbug/tools/build_repair/__init__.py b/agents/build-repair/evals/__init__.py similarity index 100% rename from bugbug/tools/build_repair/__init__.py rename to agents/build-repair/evals/__init__.py diff --git a/bugbug/tools/build_repair/scorer.py b/agents/build-repair/evals/scorer.py similarity index 87% rename from bugbug/tools/build_repair/scorer.py rename to agents/build-repair/evals/scorer.py index da513ab13d..29943640be 100644 --- a/bugbug/tools/build_repair/scorer.py +++ b/agents/build-repair/evals/scorer.py @@ -56,27 +56,17 @@ def score(self, output: dict | None) -> dict: "has_diff": False, "cost_usd": 0, "num_turns": 0, - "input_tokens": 0, - "output_tokens": 0, - "cache_read_input_tokens": 0, - "cache_creation_input_tokens": 0, } return { "successful": output.get("error") is None, "has_diff": bool(output.get("diff", "").strip()), "cost_usd": output.get("cost_usd", 0), "num_turns": output.get("num_turns", 0), - "input_tokens": output.get("input_tokens", 0), - "output_tokens": output.get("output_tokens", 0), - "cache_read_input_tokens": output.get("cache_read_input_tokens", 0), - "cache_creation_input_tokens": output.get("cache_creation_input_tokens", 0), } def summarize(self, score_rows: list[dict]) -> dict: n = len(score_rows) costs = [r.get("cost_usd", 0) for r in score_rows] - input_toks = [r.get("input_tokens", 0) for r in score_rows] - output_toks = [r.get("output_tokens", 0) for r in score_rows] summary = { "success_rate": sum(r.get("successful", False) for r in score_rows) / n if n @@ -86,14 +76,6 @@ def summarize(self, score_rows: list[dict]) -> dict: 
else 0, "avg_cost_usd": sum(costs) / n if n else 0, "total_cost_usd": sum(costs), - "total_input_tokens": sum(input_toks), - "total_output_tokens": sum(output_toks), - "total_cache_read_tokens": sum( - r.get("cache_read_input_tokens", 0) for r in score_rows - ), - "total_cache_creation_tokens": sum( - r.get("cache_creation_input_tokens", 0) for r in score_rows - ), "num_examples": n, } if self.num_trials > 1: diff --git a/agents/build-repair/evals/verify.py b/agents/build-repair/evals/verify.py new file mode 100644 index 0000000000..f448d68948 --- /dev/null +++ b/agents/build-repair/evals/verify.py @@ -0,0 +1,162 @@ +# -*- coding: utf-8 -*- +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this file, +# You can obtain one at http://mozilla.org/MPL/2.0/. + +"""LLM-as-a-judge verification of a build-repair fix against ground truth. + +Split out of the production agent: this is an evaluation concern. It reads the +agent's artifacts in a worktree and the real landed fix commits, then asks Claude +to score the analysis and the fix. +""" + +from __future__ import annotations + +from datetime import date +from logging import getLogger +from pathlib import Path + +from claude_agent_sdk import ClaudeAgentOptions, ResultMessage, query +from pydantic import BaseModel +from tenacity import retry, stop_after_attempt, wait_exponential_jitter + +logger = getLogger(__name__) + +VERIFY_MODEL = "claude-opus-4-8" + +# Training-data cutoff per model, for data-contamination filtering. Examples with +# a fix_commit_date before the cutoff may have appeared in training data. 
+# Source: https://platform.claude.com/docs/en/about-claude/models/overview +MODEL_CUTOFF_DATES = { + "claude-opus-4-8": date(2026, 1, 1), + "claude-opus-4-6": date(2025, 8, 1), + "claude-sonnet-4-6": date(2026, 1, 1), + "claude-haiku-4-5-20251001": date(2025, 7, 1), + "claude-sonnet-4-5-20250929": date(2025, 7, 1), + "claude-opus-4-5-20251101": date(2025, 8, 1), + "claude-opus-4-1-20250805": date(2025, 3, 1), + "claude-sonnet-4-20250514": date(2025, 3, 1), + "claude-3-7-sonnet-20250219": date(2024, 11, 1), + "claude-opus-4-20250514": date(2025, 3, 1), +} + +VERIFY_ALLOWED_TOOLS = [ + "Read", + "Bash(git show:*)", + "Bash(git log:*)", + "Bash(git diff:*)", + "Bash(find:*)", + "Bash(grep:*)", + "WebFetch(domain:firefox-source-docs.mozilla.org)", + "WebFetch(domain:searchfox.org)", +] + +VERIFY_TEMPLATE = """You are an expert {target_software} code reviewer evaluating an automated build repair agent's work. + +Examine the relevant commits using git: +- Failure commit (broke the build): {failure_commit} +- Ground truth fix commit(s) (the real fix that was landed): {ground_truth_commits} + +Inspect each commit's changes and read the repair agent's output files: +- {scratch_out}/analysis.md +- {scratch_out}/summary.md +- {scratch_out}/agent_fix.diff (may be empty if no fix was produced) + +Evaluate the agent's work on two dimensions: + +ANALYSIS: +- Did the agent correctly identify the root cause of the build failure? +- How thorough and accurate is the analysis? + +FIX: +- Does the agent's fix address the same files/functions as the ground truth? +- Is the fix semantically equivalent or close to the ground truth? +- Would the fix be acceptable in code review as-is? 
+ +Guidelines: +- If agent_fix.diff is empty, set fix_matches_ground_truth=false, fix_quality=0.0, fix_acceptance_probability=0.0 +- A fix can be correct even if it differs syntactically from ground truth -- focus on semantic equivalence +- analysis_correct should be true if the agent found the right root cause, even if the explanation is imperfect +- Be calibrated: 0.5 means genuinely uncertain, not a default score + +Work autonomously, do not ask questions. +""" + + +class GroundTruth(BaseModel): + gh_fix_commits: list[str] + + +class Judgment(BaseModel): + analysis_correct: bool + analysis_quality: float + analysis_explanation: str + fix_matches_ground_truth: bool + fix_quality: float + fix_explanation: str + fix_acceptance_probability: float + fix_acceptance_explanation: str + + +def is_data_contaminated(fix_commit_date: str, *models: str) -> bool: + """True when the fix predates the latest training cutoff of the given models. + + Conservative across the models that could have memorized the landed fix: skip + the example if it predates any of their cutoffs (i.e. the latest one). + """ + cutoffs = [c for m in models if (c := MODEL_CUTOFF_DATES.get(m)) is not None] + if not cutoffs: + return False + return date.fromisoformat(fix_commit_date[:10]) < max(cutoffs) + + +@retry( + stop=stop_after_attempt(3), + wait=wait_exponential_jitter(initial=2, max=30, jitter=5), + reraise=True, +) +async def run_verify( + *, + worktree_path: Path, + scratch_out: Path, + bug_id: int, + failure_commit: str, + ground_truth: GroundTruth, + agent_diff: str, + target_software: str = "Mozilla Firefox", + model: str = VERIFY_MODEL, +) -> tuple[Judgment, float]: + """Judge the agent's analysis and fix. 
Returns (judgment, cost_usd).""" + scratch_out.mkdir(parents=True, exist_ok=True) + (scratch_out / "agent_fix.diff").write_text(agent_diff, encoding="utf-8") + + prompt = VERIFY_TEMPLATE.format( + target_software=target_software, + failure_commit=failure_commit, + ground_truth_commits=" ".join(ground_truth.gh_fix_commits), + scratch_out=scratch_out, + ) + options = ClaudeAgentOptions( + model=model, + cwd=str(worktree_path), + allowed_tools=VERIFY_ALLOWED_TOOLS, + disallowed_tools=["AskUserQuestion", "Task"], + permission_mode="acceptEdits", + effort="high", + output_format={"type": "json_schema", "schema": Judgment.model_json_schema()}, + ) + + judgment: Judgment | None = None + cost = 0.0 + async for message in query(prompt=prompt, options=options): + if isinstance(message, ResultMessage): + cost += message.total_cost_usd or 0.0 + structured = getattr(message, "structured_output", None) + if structured: + judgment = Judgment.model_validate(structured) + elif message.result: + judgment = Judgment.model_validate_json(message.result) + + if judgment is None: + raise RuntimeError(f"bug {bug_id}: verification produced no structured output") + return judgment, cost diff --git a/bugbug/tools/build_repair/worktree.py b/agents/build-repair/evals/worktree.py similarity index 88% rename from bugbug/tools/build_repair/worktree.py rename to agents/build-repair/evals/worktree.py index 1fe2980738..5cb60529a2 100644 --- a/bugbug/tools/build_repair/worktree.py +++ b/agents/build-repair/evals/worktree.py @@ -3,14 +3,21 @@ # License, v. 2.0. If a copy of the MPL was not distributed with this file, # You can obtain one at http://mozilla.org/MPL/2.0/. +"""Git worktree management for parallel evaluation trials. + +Each trial runs the agent against an isolated checkout of the Firefox repo at a +specific failure commit, so trials don't conflict. (Production runs are already +isolated per container, so the agent itself needs no worktrees.) 
+""" + import subprocess from logging import getLogger from pathlib import Path -from bugbug.tools.build_repair.config import WORKTREE_BASE_DIR - logger = getLogger(__name__) +WORKTREE_BASE_DIR = "/tmp/build_repair_worktrees" + class WorktreeManager: """Manages git worktrees for parallel evaluation runs against a Firefox repo.""" diff --git a/agents/build-repair/pyproject.toml b/agents/build-repair/pyproject.toml index eb91fa3132..669aac824b 100644 --- a/agents/build-repair/pyproject.toml +++ b/agents/build-repair/pyproject.toml @@ -30,4 +30,4 @@ requires = ["hatchling"] build-backend = "hatchling.build" [tool.hatch.build.targets.wheel] -packages = ["hackbot_agents", "evals/buildrepair_eval"] +packages = ["hackbot_agents", "evals"] diff --git a/docker-compose.yml b/docker-compose.yml index eedad036da..e8577231fe 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -5,6 +5,7 @@ version: "3.8" include: - path: agents/bug-fix/compose.yml - path: agents/build-repair/compose.yml + - path: agents/build-repair/compose.eval.yml services: bugbug-base: From 11750c6f578368c52559efb24707f7cf5ec87bdf Mon Sep 17 00:00:00 2001 From: Evgeny Pavlov Date: Thu, 25 Jun 2026 10:00:32 -0700 Subject: [PATCH 11/15] Remove old file from docker compose --- docker-compose.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/docker-compose.yml b/docker-compose.yml index e8577231fe..eedad036da 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -5,7 +5,6 @@ version: "3.8" include: - path: agents/bug-fix/compose.yml - path: agents/build-repair/compose.yml - - path: agents/build-repair/compose.eval.yml services: bugbug-base: From 84e1a2a6fa62ddbdf55e50ca1ada2b1a959eae8f Mon Sep 17 00:00:00 2001 From: Evgeny Pavlov Date: Thu, 25 Jun 2026 10:22:40 -0700 Subject: [PATCH 12/15] Add eval todo --- agents/build-repair/evals/eval.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/agents/build-repair/evals/eval.py b/agents/build-repair/evals/eval.py index 679422c69d..f41c1c19d7 100644 
--- a/agents/build-repair/evals/eval.py +++ b/agents/build-repair/evals/eval.py @@ -92,6 +92,9 @@ def worktree_mgr(self) -> WorktreeManager: async def invoke( self, bug_id: int, + # Bug fields before the fix. This field is part of the dataset. + # The new Hackbot agent is not using it. It pulls the Bugzilla bug itself. + # TODO: investigate how to hide the fix in evals for the new agent pre_fix_bug: dict, gh_failure_commits: list[str], gh_fix_commits: list[str], From dbac63fbd3ab40cd689d7d9243871ba16b15b827 Mon Sep 17 00:00:00 2001 From: Evgeny Pavlov Date: Thu, 25 Jun 2026 13:55:05 -0700 Subject: [PATCH 13/15] Update readme --- agents/build-repair/README.md | 62 ++++++++++++++++++++++++++++------- 1 file changed, 51 insertions(+), 11 deletions(-) diff --git a/agents/build-repair/README.md b/agents/build-repair/README.md index d0546cfc1d..a2c000adee 100644 --- a/agents/build-repair/README.md +++ b/agents/build-repair/README.md @@ -4,9 +4,30 @@ Two-stage Claude agent that diagnoses a Firefox build failure and edits the sour tree to fix it. Agent logic in `hackbot_agents/build_repair/`; the Weave eval harness in `evals/`. -Run the Docker commands below from this folder, with secrets in a local `.env` +Run the Docker commands below from the repo root, with secrets in a local `.env` (`ANTHROPIC_API_KEY`, `BUGZILLA_API_KEY`, plus `WANDB_API_KEY` for evals). +The second stage attempts building Firefox to verify the fix and iterate on it if it fails. +It also optionally bootstraps Firefox build if needed. 
+ +## Input + +- `BUG_ID` - Optional Bugzilla bug ID +- `GIT_COMMIT` - Firefox Git commit that failed the build +- `FAILURE_TASKS` - a dictionary of failed Taskcluster tasks {task_name: taskcluster_task_id} + +## Output + +First stage - analysis: + +- `summary.md` - a quick summary for a developer +- `analysis.md` - detailed analysis +- `planning.md` - intermediate file that outlines fixing steps for the second stage + +Second stage - fixing: + +- A patch in Hackbot format + ## Test the agent ```sh @@ -17,13 +38,17 @@ FAILURE_TASKS='{"build-linux":"XyU4b_BIRdO_IeK6z_kcQg"}' \ Artifacts are written to `~/hackbot/artifacts/`. -## Run evals +## Evaluation + +The evaluation dataset is prepared with [build_repair_create_dataset.ipynb](../../notebooks/build_repair_create_dataset.ipynb) and saved to Weights and Biases Weave. -Each dataset row is a Firefox build failure; per trial the harness runs the agent +### Run evals + +Each dataset row is a Firefox build failure. The harness runs the agent on a git worktree at the failure commit, builds the fix, and LLM-judges it against the landed commits. Needs a bootstrapped Firefox checkout. -Local: +Local (use only for debugging as new agent is not sandboxed): ```sh FIREFOX_GIT_REPO=/path/to/firefox \ @@ -31,22 +56,37 @@ FIREFOX_GIT_REPO=/path/to/firefox \ python -m evals.eval --no-try-push --limit 1 ``` -Docker (reuses the broker, so no Bugzilla creds in the eval container): +Docker (reuses the broker container, so no Bugzilla creds passed to the eval container): ```sh FIREFOX_GIT_REPO=/path/to/firefox \ - docker compose run --rm build-repair-eval --no-try-push --limit 1 + docker compose --env-file .env -f agents/build-repair/compose.yml run --rm --build build-repair-eval --no-try-push --limit 1 ``` -Flags: `--trials N`, `--parallelism N`, `--judge-model `, `--dataset `, -`--no-try-push`, `--verbose`. 
+Flags: + +`--trials N` - the number of times to run each example + +`--parallelism N` - the number of runs to parallelize with Weave + +`--judge-model ` - Claude model ID for LLM-as-a-judge + +`--dataset ` - Weave dataset name -The agent reads the bug live from Bugzilla, so the harness skips examples whose fix +`--no-try-push` - do not run TRY push to verify the results, only local build + +`--verbose` - debugging log level + +The harness skips examples whose fix landed before the production model's training cutoff (`MODEL_CUTOFF_DATES` in `evals/verify.py`) to avoid contamination. -## W&B metrics +Change the models in [config.py](hackbot_agents/build_repair/config.py) to older ones (`claude-opus-4-6`) to test on older datasets. -`weave.init` + `weave.Evaluation` log success and diff rates, local and try build +### W&B metrics + +`weave.init` + `weave.Evaluation` log success and diff rates, local and TRY build pass rates, LLM fix-matching (analysis/fix quality, ground-truth match, acceptance), and `total_cost_usd`. 
+ +See https://wandb.ai/moz-bugbug/bugbug-build-repair-eval/weave/evaluations From b2f6b7494cc2347a692eecc5e76ce71b0d1e1762 Mon Sep 17 00:00:00 2001 From: Evgeny Pavlov Date: Thu, 25 Jun 2026 14:25:18 -0700 Subject: [PATCH 14/15] Remove services --- .github/dependabot.yml | 38 -------------------------------------- 1 file changed, 38 deletions(-) diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 0c766de53a..0ce7a61d73 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -34,44 +34,6 @@ updates: open-pull-requests-limit: 99 allow: - dependency-type: direct - - package-ecosystem: uv - directory: "/services/mcp" - schedule: - interval: weekly - day: tuesday - groups: - patch: - applies-to: version-updates - patterns: - - "*" - update-types: - - patch - cooldown: - semver-major-days: 14 - semver-minor-days: 7 - semver-patch-days: 3 - open-pull-requests-limit: 99 - allow: - - dependency-type: direct - - package-ecosystem: uv - directory: "/services/reviewhelper-api" - schedule: - interval: weekly - day: wednesday - groups: - patch: - applies-to: version-updates - patterns: - - "*" - update-types: - - patch - cooldown: - semver-major-days: 14 - semver-minor-days: 7 - semver-patch-days: 3 - open-pull-requests-limit: 99 - allow: - - dependency-type: direct - package-ecosystem: npm directory: "/ui/changes" schedule: From d07ac26eaa2a4545dd26efce3608efc4d4e39d82 Mon Sep 17 00:00:00 2001 From: Evgeny Pavlov Date: Thu, 25 Jun 2026 17:26:53 -0700 Subject: [PATCH 15/15] Remove unnecessary git config --- agents/build-repair/Dockerfile | 4 ---- 1 file changed, 4 deletions(-) diff --git a/agents/build-repair/Dockerfile b/agents/build-repair/Dockerfile index a6de686d05..e1890dc740 100644 --- a/agents/build-repair/Dockerfile +++ b/agents/build-repair/Dockerfile @@ -74,10 +74,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \ UV_PROJECT_ENVIRONMENT=/opt/venv \ uv sync --frozen --no-dev --no-install-workspace --inexact --extra eval --package 
hackbot-agent-build-repair -# The harness creates worktrees and runs `./mach build` against a bind-mounted -# Firefox checkout owned by a different uid; allow git to operate on it. -RUN git config --system --add safe.directory '*' - USER agent ENV FIREFOX_GIT_REPO=/firefox