From 217e6f69088c8087ac9a67c51f9890db3274e67c Mon Sep 17 00:00:00 2001
From: Evgeny Pavlov <epavlov@mozilla.com>
Date: Wed, 24 Jun 2026 14:21:22 -0700
Subject: [PATCH 01/15] Add ability to clone a specific commit

---
 .../hackbot-runtime/hackbot_runtime/config.py |  3 +
 .../hackbot_runtime/context.py                |  3 +-
 .../hackbot-runtime/hackbot_runtime/source.py | 62 +++++++++++++++++--
 libs/hackbot-runtime/tests/test_context.py    | 26 +++++++-
 libs/hackbot-runtime/tests/test_source.py     | 55 ++++++++++++++--
 5 files changed, 136 insertions(+), 13 deletions(-)
diff --git a/libs/hackbot-runtime/hackbot_runtime/config.py b/libs/hackbot-runtime/hackbot_runtime/config.py
index ef1ae6173d..3c4ec56ba2 100644
--- a/libs/hackbot-runtime/hackbot_runtime/config.py
+++ b/libs/hackbot-runtime/hackbot_runtime/config.py
@@ -21,6 +21,9 @@ class SourceConfig(BaseModel):
     # Where the checkout lands. The env var SOURCE_REPO overrides this at runtime
     # (the orchestrator points it at the task-local workspace).
     checkout_path: Path = Path("/workspace/source")
+    # Optional commit/branch/tag to check out instead of remote HEAD. The env var
+    # SOURCE_REF overrides this at runtime (per-run inputs like a failure commit).
+    ref: str | None = None
 
 
 class FirefoxConfig(BaseModel):
diff --git a/libs/hackbot-runtime/hackbot_runtime/context.py b/libs/hackbot-runtime/hackbot_runtime/context.py
index 8ccddfd2ea..8269de8fd1 100644
--- a/libs/hackbot-runtime/hackbot_runtime/context.py
+++ b/libs/hackbot-runtime/hackbot_runtime/context.py
@@ -112,7 +112,8 @@ def source_repo(self) -> Path:
             )
         env_path = os.environ.get("SOURCE_REPO")
         path = Path(env_path) if env_path else self._config.source.checkout_path
-        ensure_source_repo(path, self._config.source.repo_url)
+        ref = os.environ.get("SOURCE_REF") or self._config.source.ref
+        ensure_source_repo(path, self._config.source.repo_url, ref)
         # Record where the agent starts editing, so publish_changes() can later
         # diff the final tree against it. Best-effort: a failure here must not
         # break the agent's access to source — it only disables change capture.
diff --git a/libs/hackbot-runtime/hackbot_runtime/source.py b/libs/hackbot-runtime/hackbot_runtime/source.py
index 04352876eb..0dc0b4060d 100644
--- a/libs/hackbot-runtime/hackbot_runtime/source.py
+++ b/libs/hackbot-runtime/hackbot_runtime/source.py
@@ -10,13 +10,27 @@
 log = logging.getLogger("hackbot_runtime.source")
 
 
-def ensure_source_repo(source_repo: Path, repo_url: str) -> None:
+def ensure_source_repo(
+    source_repo: Path, repo_url: str, ref: str | None = None
+) -> None:
     """Ensure a shallow checkout of ``repo_url`` exists at ``source_repo``.
 
     Idempotent: clones if absent, otherwise shallow-fetches and hard-resets to
-    the remote HEAD. Recovers from a partial checkout left by an earlier failed
-    run (e.g. the clone succeeded but the checkout ran out of disk).
+    the requested ``ref`` (``origin/HEAD`` when ``ref`` is None). Recovers from a
+    partial checkout left by an earlier failed run (e.g. the clone succeeded but
+    the checkout ran out of disk).
+
+    When ``ref`` is set (a commit/branch/tag), the repo is pinned there — useful
+    for agents that must operate on a specific historical commit (e.g. a build
+    failure commit) rather than the tip of the default branch.
     """
+    # When a ref is pinned, both the existing-checkout path and the fresh clone
+    # shallow-fetch it, so the commit is available even when it is not HEAD.
+    fetch_target = ref if ref else "HEAD"
+    # A pinned commit needs its parent too so the commit's own diff can be
+    # computed (e.g. `git show <commit>`); depth=1 would fetch only the commit
+    # itself with no parent to diff against.
+    depth = "--depth=2" if ref else "--depth=1"
     git_dir = source_repo / ".git"
     if git_dir.exists():
         # An earlier run killed mid-fetch (e.g. the container was stopped)
@@ -45,9 +59,17 @@ def ensure_source_repo(source_repo: Path, repo_url: str) -> None:
                 stdout=sys.stderr,
                 stderr=sys.stderr,
             )
-        log.info("updating source at %s (shallow fetch)", source_repo)
+        log.info("updating source at %s (shallow fetch %s)", source_repo, fetch_target)
         subprocess.run(
-            ["git", "-C", str(source_repo), "fetch", "--depth=1", "origin", "HEAD"],
+            [
+                "git",
+                "-C",
+                str(source_repo),
+                "fetch",
+                depth,
+                "origin",
+                fetch_target,
+            ],
             check=True,
             stdout=sys.stderr,
             stderr=sys.stderr,
@@ -60,6 +82,36 @@ def ensure_source_repo(source_repo: Path, repo_url: str) -> None:
         )
         return
     source_repo.mkdir(parents=True, exist_ok=True)
+    if ref:
+        # A plain `git clone` cannot target an arbitrary commit, so init an empty
+        # repo and shallow-fetch just the requested ref.
+        log.info("cloning %s (shallow) to %s at ref %s", repo_url, source_repo, ref)
+        subprocess.run(
+            ["git", "init", "-q", str(source_repo)],
+            check=True,
+            stdout=sys.stderr,
+            stderr=sys.stderr,
+        )
+        subprocess.run(
+            ["git", "-C", str(source_repo), "remote", "add", "origin", repo_url],
+            check=True,
+            stdout=sys.stderr,
+            stderr=sys.stderr,
+        )
+        subprocess.run(
+            ["git", "-C", str(source_repo), "fetch", depth, "origin", ref],
+            check=True,
+            stdout=sys.stderr,
+            stderr=sys.stderr,
+        )
+        subprocess.run(
+            ["git", "-C", str(source_repo), "checkout", "-q", "FETCH_HEAD"],
+            check=True,
+            stdout=sys.stderr,
+            stderr=sys.stderr,
+        )
+        log.info("shallow clone complete")
+        return
     log.info("cloning %s (shallow) to %s", repo_url, source_repo)
     subprocess.run(
         ["git", "clone", "--depth=1", repo_url, str(source_repo)],
diff --git a/libs/hackbot-runtime/tests/test_context.py b/libs/hackbot-runtime/tests/test_context.py
index 6dea30342a..26ef0b4d1d 100644
--- a/libs/hackbot-runtime/tests/test_context.py
+++ b/libs/hackbot-runtime/tests/test_context.py
@@ -37,11 +37,12 @@ def test_firefox_disabled_raises(tmp_path):
 def test_source_repo_prepares_and_honors_env_override(tmp_path, monkeypatch):
     calls = []
 
-    def fake_ensure(path: Path, repo_url: str) -> None:
-        calls.append((path, repo_url))
+    def fake_ensure(path: Path, repo_url: str, ref: str | None = None) -> None:
+        calls.append((path, repo_url, ref))
 
     monkeypatch.setattr("hackbot_runtime.context.ensure_source_repo", fake_ensure)
     monkeypatch.setenv("SOURCE_REPO", str(tmp_path / "from-env"))
+    monkeypatch.delenv("SOURCE_REF", raising=False)
 
     cfg = HackbotConfig(
         source=SourceConfig(
@@ -52,7 +53,26 @@ def fake_ensure(path: Path, repo_url: str) -> None:
     hb = _hb(tmp_path, cfg)
 
     assert hb.source_repo == tmp_path / "from-env"
-    assert calls == [(tmp_path / "from-env", "https://example.com/r.git")]
+    assert calls == [(tmp_path / "from-env", "https://example.com/r.git", None)]
+
+
+def test_source_repo_honors_source_ref_env(tmp_path, monkeypatch):
+    calls = []
+
+    def fake_ensure(path: Path, repo_url: str, ref: str | None = None) -> None:
+        calls.append((path, repo_url, ref))
+
+    monkeypatch.setattr("hackbot_runtime.context.ensure_source_repo", fake_ensure)
+    monkeypatch.delenv("SOURCE_REPO", raising=False)
+    monkeypatch.setenv("SOURCE_REF", "deadbeef")
+
+    cfg = HackbotConfig(
+        source=SourceConfig(repo_url="r", checkout_path=Path("/from/toml"))
+    )
+    hb = _hb(tmp_path, cfg)
+
+    assert hb.source_repo == Path("/from/toml")
+    assert calls == [(Path("/from/toml"), "r", "deadbeef")]
 
 
 def test_source_repo_uses_toml_path_without_env(tmp_path, monkeypatch):
diff --git a/libs/hackbot-runtime/tests/test_source.py b/libs/hackbot-runtime/tests/test_source.py
index 4bc83d30e3..b22a91f409 100644
--- a/libs/hackbot-runtime/tests/test_source.py
+++ b/libs/hackbot-runtime/tests/test_source.py
@@ -6,9 +6,7 @@
 from hackbot_runtime import ensure_source_repo
 
 
-def _make_remote(path: Path) -> None:
-    subprocess.run(["git", "init", "-q", str(path)], check=True)
-    (path / "README.md").write_text("hello")
+def _commit(path: Path, message: str) -> str:
     subprocess.run(["git", "-C", str(path), "add", "."], check=True)
     subprocess.run(
         [
@@ -22,10 +20,23 @@ def _make_remote(path: Path) -> None:
             "commit",
             "-q",
             "-m",
-            "init",
+            message,
         ],
         check=True,
     )
+    rev = subprocess.run(
+        ["git", "-C", str(path), "rev-parse", "HEAD"],
+        check=True,
+        capture_output=True,
+        text=True,
+    )
+    return rev.stdout.strip()
+
+
+def _make_remote(path: Path) -> str:
+    subprocess.run(["git", "init", "-q", str(path)], check=True)
+    (path / "README.md").write_text("hello")
+    return _commit(path, "init")
 
 
 def test_clones_when_absent(tmp_path):
@@ -45,3 +56,39 @@ def test_idempotent_update_when_present(tmp_path):
     # Second call takes the fetch + hard-reset branch and must still succeed.
     ensure_source_repo(dest, f"file://{remote}")
     assert (dest / "README.md").read_text() == "hello"
+
+
+def test_pins_to_ref_when_absent(tmp_path):
+    remote = tmp_path / "remote"
+    first = _make_remote(remote)
+    # A second commit advances HEAD; pinning to `first` must ignore it.
+    (remote / "README.md").write_text("world")
+    _commit(remote, "second")
+    dest = tmp_path / "dest"
+    ensure_source_repo(dest, f"file://{remote}", ref=first)
+    head = subprocess.run(
+        ["git", "-C", str(dest), "rev-parse", "HEAD"],
+        check=True,
+        capture_output=True,
+        text=True,
+    )
+    assert head.stdout.strip() == first
+    assert (dest / "README.md").read_text() == "hello"
+
+
+def test_pinned_ref_includes_parent_for_diff(tmp_path):
+    remote = tmp_path / "remote"
+    _make_remote(remote)
+    (remote / "README.md").write_text("world")
+    second = _commit(remote, "second")
+    dest = tmp_path / "dest"
+    ensure_source_repo(dest, f"file://{remote}", ref=second)
+    # The parent must be present so the commit's own diff can be computed.
+    show = subprocess.run(
+        ["git", "-C", str(dest), "show", second],
+        check=True,
+        capture_output=True,
+        text=True,
+    )
+    assert "hello" in show.stdout
+    assert "world" in show.stdout

From 39d82f5121a4feab05f629b7e67769454c3c7a4a Mon Sep 17 00:00:00 2001
From: Evgeny Pavlov <epavlov@mozilla.com>
Date: Wed, 24 Jun 2026 14:37:40 -0700
Subject: [PATCH 02/15] Improve Bugzilla error handling

---
 libs/agent-tools/agent_tools/bugzilla.py | 52 ++++++++++++++----------
 1 file changed, 30 insertions(+), 22 deletions(-)

diff --git a/libs/agent-tools/agent_tools/bugzilla.py b/libs/agent-tools/agent_tools/bugzilla.py
index 03f2bcf8a4..aa256750d3 100644
--- a/libs/agent-tools/agent_tools/bugzilla.py
+++ b/libs/agent-tools/agent_tools/bugzilla.py
@@ -53,6 +53,29 @@ def _bugsy_error(e: bugsy.BugsyException) -> ToolError:
     return ToolError(msg, payload=payload)
 
 
+def _request(ctx: BugzillaContext, path: str, params: dict[str, Any] | None = None):
+    """Issue a Bugzilla request, normalizing every failure into a ToolError.
+
+    bugsy only raises ``BugsyException`` for Bugzilla-level errors; a bad proxy
+    URL, an auth redirect, or an empty body instead surfaces as a raw
+    ``JSONDecodeError``/connection error. Catching those here turns an opaque
+    "Expecting value: line 1 column 1" into an actionable message.
+    """
+    try:
+        return ctx.client.request(path, params=params or {})
+    except bugsy.BugsyException as e:
+        raise _bugsy_error(e) from e
+    except Exception as e:
+        raise ToolError(
+            f"Bugzilla request to '{path}' failed: {type(e).__name__}: {e}",
+            payload={
+                "error": "bugzilla_request_failed",
+                "path": path,
+                "message": str(e),
+            },
+        ) from e
+
+
 @tool
 async def search_bugs(
     ctx: BugzillaContext,
@@ -77,10 +100,7 @@ async def search_bugs(
     component, status, resolution, priority, severity, assigned_to, whiteboard,
     include_fields, limit.
     """
-    try:
-        result = ctx.client.request("bug", params=params)
-    except bugsy.BugsyException as e:
-        raise _bugsy_error(e) from e
+    result = _request(ctx, "bug", params)
     bugs = result.get("bugs", [])
     return {"count": len(bugs), "bugs": bugs}
 
@@ -124,12 +144,7 @@ async def get_bugs(
         "cf_crash_signature,url,version,op_sys,platform"
     )
     id_csv = ",".join(str(i) for i in ids)
-    try:
-        result = ctx.client.request(
-            "bug", params={"id": id_csv, "include_fields": include}
-        )
-    except bugsy.BugsyException as e:
-        raise _bugsy_error(e) from e
+    result = _request(ctx, "bug", {"id": id_csv, "include_fields": include})
     bugs = result.get("bugs", [])
     returned = {b["id"] for b in bugs}
     inaccessible = [i for i in ids if i not in returned]
@@ -153,6 +168,8 @@ async def get_bugs(
                 "code": getattr(e, "code", None),
                 "message": getattr(e, "msg", str(e)),
             }
+        except Exception as e:
+            payload["comments_error"] = {"message": f"{type(e).__name__}: {e}"}
 
     return payload
 
@@ -163,10 +180,7 @@ async def get_bug_comments(
     bug_id: Annotated[int, Field(description="Bug ID.")],
 ) -> dict:
     """Fetch all comments for a single bug."""
-    try:
-        result = ctx.client.request(f"bug/{bug_id}/comment")
-    except bugsy.BugsyException as e:
-        raise _bugsy_error(e) from e
+    result = _request(ctx, f"bug/{bug_id}/comment")
     comments = result.get("bugs", {}).get(str(bug_id), {}).get("comments", [])
     return {"bug_id": bug_id, "count": len(comments), "comments": comments}
 
@@ -192,10 +206,7 @@ async def get_bug_attachments(
     base64-encoded in the 'data' field of each attachment.
     """
     params = {} if include_data else {"exclude_fields": "data"}
-    try:
-        result = ctx.client.request(f"bug/{bug_id}/attachment", params=params)
-    except bugsy.BugsyException as e:
-        raise _bugsy_error(e) from e
+    result = _request(ctx, f"bug/{bug_id}/attachment", params)
     atts = result.get("bugs", {}).get(str(bug_id), [])
     return {"bug_id": bug_id, "count": len(atts), "attachments": atts}
 
@@ -223,10 +234,7 @@ async def download_attachment(
     get_bug_attachments first to discover attachment IDs. Returns the written
     path, size, and content_type.
     """
-    try:
-        result = ctx.client.request(f"bug/attachment/{attachment_id}")
-    except bugsy.BugsyException as e:
-        raise _bugsy_error(e) from e
+    result = _request(ctx, f"bug/attachment/{attachment_id}")
 
     att = result.get("attachments", {}).get(str(attachment_id))
     if att is None:

From 040944c439548b98b05f0529b8694e2d8d3f0de9 Mon Sep 17 00:00:00 2001
From: Evgeny Pavlov <epavlov@mozilla.com>
Date: Wed, 24 Jun 2026 14:45:28 -0700
Subject: [PATCH 03/15] Add build target and fix Rust path

---
 .../agent_tools/firefox/__init__.py           | 21 ++++++++++++++-----
 .../firefox/tools/build_firefox.py            | 20 ++++++++++++++++--
 2 files changed, 34 insertions(+), 7 deletions(-)

diff --git a/libs/agent-tools/agent_tools/firefox/__init__.py b/libs/agent-tools/agent_tools/firefox/__init__.py
index e371a6be2f..268efa2cab 100644
--- a/libs/agent-tools/agent_tools/firefox/__init__.py
+++ b/libs/agent-tools/agent_tools/firefox/__init__.py
@@ -122,17 +122,28 @@ async def build_firefox(
             description="MOZCONFIG to use. Optional — defaults to the configured mozconfig."
         ),
     ] = None,
+    target: Annotated[
+        str | None,
+        Field(
+            description=(
+                "Optional build target, e.g. a directory like 'docshell/base'. "
+                "When set, only that target is built — much faster than a full "
+                "tree build and enough to confirm a localized fix compiles."
+            )
+        ),
+    ] = None,
 ) -> dict:
     """Build Firefox using the configured mozconfig.
 
-    Slow (tens of minutes on a cold build, faster incremental). Returns JSON:
-    success (bool), build_dir (str), message (str), stdout/stderr. Only call this
-    if you've changed source or the binary is missing — check if the binary
-    exists first.
+    Slow on a full tree build (tens of minutes cold, faster incremental); pass a
+    `target` directory to build just the part you changed. Returns JSON: success
+    (bool), build_dir (str), message (str), stdout/stderr. Only call this if
+    you've changed source or the binary is missing — check if the binary exists
+    first.
     """
     firefox_dir_p = Path(firefox_dir) if firefox_dir else ctx.source_dir
     mozconfig_p = Path(mozconfig_path) if mozconfig_path else ctx.mozconfig
-    return await _build_firefox(firefox_dir_p, mozconfig_p, ctx.objdir)
+    return await _build_firefox(firefox_dir_p, mozconfig_p, ctx.objdir, target=target)
 
 
 @tool
diff --git a/libs/agent-tools/agent_tools/firefox/tools/build_firefox.py b/libs/agent-tools/agent_tools/firefox/tools/build_firefox.py
index ea623643f7..00555f724a 100644
--- a/libs/agent-tools/agent_tools/firefox/tools/build_firefox.py
+++ b/libs/agent-tools/agent_tools/firefox/tools/build_firefox.py
@@ -10,6 +10,7 @@ async def build_firefox(
     firefox_dir: Path,
     mozconfig_path: Path,
     objdir: Path,
+    target: str | None = None,
 ) -> dict[str, Any]:
     """Build Firefox using a specified mozconfig.
 
@@ -19,6 +20,9 @@ async def build_firefox(
         objdir: Expected build output directory (reported back on success;
             mozconfig actually determines where the build lands, so this
             should match what the mozconfig sets)
+        target: Optional build target (e.g. a directory like ``docshell/base``).
+            When set, only that target is built -- far faster than a full tree
+            build and enough to verify a localized fix compiles.
 
     Returns:
         Dict with build result information (success, build_dir, message,
@@ -41,9 +45,21 @@ async def build_firefox(
         env["MOZCONFIG"] = str(mozconfig_path.resolve())
         env["CLAUDECODE"] = "1"
 
+        # `mach bootstrap` installs rust under ~/.cargo/bin and clang under
+        # ~/.mozbuild/clang/bin, neither of which is on the default PATH. Without
+        # this the build fails with "Rust compiler not found" even right after a
+        # successful bootstrap.
+        home = Path.home()
+        toolchain_bins = [home / ".cargo" / "bin", home / ".mozbuild" / "clang" / "bin"]
+        env["PATH"] = os.pathsep.join(
+            [*(str(p) for p in toolchain_bins), env.get("PATH", "")]
+        )
+
+        mach_args = ["./mach", "build"]
+        if target:
+            mach_args.append(target)
         process = await asyncio.create_subprocess_exec(
-            "./mach",
-            "build",
+            *mach_args,
             cwd=firefox_dir,
             env=env,
             stdout=asyncio.subprocess.PIPE,

From b043e05722778c27b18ed157020a00fadfff23aa Mon Sep 17 00:00:00 2001
From: Evgeny Pavlov <epavlov@mozilla.com>
Date: Wed, 24 Jun 2026 14:58:35 -0700
Subject: [PATCH 04/15] Move try_server.py to try_push.py

Pure rename to preserve git history before reworking the contents
for the hackbot port.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../build-repair/hackbot_agents/build_repair/try_push.py          | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename bugbug/tools/build_repair/try_server.py => agents/build-repair/hackbot_agents/build_repair/try_push.py (100%)

diff --git a/bugbug/tools/build_repair/try_server.py b/agents/build-repair/hackbot_agents/build_repair/try_push.py
similarity index 100%
rename from bugbug/tools/build_repair/try_server.py
rename to agents/build-repair/hackbot_agents/build_repair/try_push.py

From a402729cc7143090425c0d2bf875d3649e4985ed Mon Sep 17 00:00:00 2001
From: Evgeny Pavlov <epavlov@mozilla.com>
Date: Wed, 24 Jun 2026 15:12:48 -0700
Subject: [PATCH 05/15] Move old build repair agent files to new locations

Pure renames to preserve git history before reworking the contents
for the hackbot port:
  agent.py             -> hackbot_agents/build_repair/agent.py
  prompts.py           -> hackbot_agents/build_repair/prompts.py
  build_repair_eval.py -> buildrepair_eval/eval.py

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../build-repair-evals/buildrepair_eval/eval.py                   | 0
 .../build-repair/hackbot_agents}/build_repair/agent.py            | 0
 .../build-repair/hackbot_agents}/build_repair/prompts.py          | 0
 3 files changed, 0 insertions(+), 0 deletions(-)
 rename scripts/build_repair_eval.py => agents/build-repair-evals/buildrepair_eval/eval.py (100%)
 rename {bugbug/tools => agents/build-repair/hackbot_agents}/build_repair/agent.py (100%)
 rename {bugbug/tools => agents/build-repair/hackbot_agents}/build_repair/prompts.py (100%)

diff --git a/scripts/build_repair_eval.py b/agents/build-repair-evals/buildrepair_eval/eval.py
similarity index 100%
rename from scripts/build_repair_eval.py
rename to agents/build-repair-evals/buildrepair_eval/eval.py
diff --git a/bugbug/tools/build_repair/agent.py b/agents/build-repair/hackbot_agents/build_repair/agent.py
similarity index 100%
rename from bugbug/tools/build_repair/agent.py
rename to agents/build-repair/hackbot_agents/build_repair/agent.py
diff --git a/bugbug/tools/build_repair/prompts.py b/agents/build-repair/hackbot_agents/build_repair/prompts.py
similarity index 100%
rename from bugbug/tools/build_repair/prompts.py
rename to agents/build-repair/hackbot_agents/build_repair/prompts.py

From 3ae452177918ed2e89073facafac3d338ca286e9 Mon Sep 17 00:00:00 2001
From: Evgeny Pavlov <epavlov@mozilla.com>
Date: Wed, 24 Jun 2026 15:38:54 -0700
Subject: [PATCH 06/15] Move remaining build repair files to new locations

Pure renames to preserve git history before reworking the contents
for the hackbot port:
  config.py -> hackbot_agents/build_repair/config.py
  eval.py   -> build-repair/evals/buildrepair_eval/eval.py

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../evals}/buildrepair_eval/eval.py                               | 0
 .../build-repair/hackbot_agents}/build_repair/config.py           | 0
 2 files changed, 0 insertions(+), 0 deletions(-)
 rename agents/{build-repair-evals => build-repair/evals}/buildrepair_eval/eval.py (100%)
 rename {bugbug/tools => agents/build-repair/hackbot_agents}/build_repair/config.py (100%)

diff --git a/agents/build-repair-evals/buildrepair_eval/eval.py b/agents/build-repair/evals/buildrepair_eval/eval.py
similarity index 100%
rename from agents/build-repair-evals/buildrepair_eval/eval.py
rename to agents/build-repair/evals/buildrepair_eval/eval.py
diff --git a/bugbug/tools/build_repair/config.py b/agents/build-repair/hackbot_agents/build_repair/config.py
similarity index 100%
rename from bugbug/tools/build_repair/config.py
rename to agents/build-repair/hackbot_agents/build_repair/config.py

From 387acbfbe92114607a9754594ea81538ad63c7fc Mon Sep 17 00:00:00 2001
From: Evgeny Pavlov <epavlov@mozilla.com>
Date: Wed, 24 Jun 2026 15:39:56 -0700
Subject: [PATCH 07/15] Migrate build repair agent to Hackbot

---
 .github/dependabot.yml                        |  19 -
 agents/build-repair/Dockerfile                |  55 ++
 agents/build-repair/compose.yml               |  38 +
 agents/build-repair/hackbot.toml              |   8 +
 .../hackbot_agents/build_repair/__init__.py   |   0
 .../hackbot_agents/build_repair/__main__.py   |  48 +
 .../hackbot_agents/build_repair/agent.py      | 823 ++++++------------
 .../hackbot_agents/build_repair/broker.py     |  99 +++
 .../hackbot_agents/build_repair/config.py     | 111 +--
 .../hackbot_agents/build_repair/logs.py       | 104 +++
 .../hackbot_agents/build_repair/prompts.py    |  94 +-
 .../hackbot_agents/build_repair/try_push.py   | 295 +++----
 agents/build-repair/pyproject.toml            |  33 +
 docker-compose.yml                            |   1 +
 pyproject.toml                                |   2 +-
 services/buildrepair/Dockerfile               |  34 -
 services/buildrepair/README.md                |  64 --
 services/buildrepair/docker-compose.dev.yml   |  18 -
 services/buildrepair/pyproject.toml           |  12 -
 services/hackbot-api/app/agents.py            |   8 +-
 services/hackbot-api/app/schemas.py           |   9 +
 services/hackbot-api/tests/test_agents.py     |  23 +-
 uv.lock                                       | 101 +++
 23 files changed, 987 insertions(+), 1012 deletions(-)
 create mode 100644 agents/build-repair/Dockerfile
 create mode 100644 agents/build-repair/compose.yml
 create mode 100644 agents/build-repair/hackbot.toml
 create mode 100644 agents/build-repair/hackbot_agents/build_repair/__init__.py
 create mode 100644 agents/build-repair/hackbot_agents/build_repair/__main__.py
 create mode 100644 agents/build-repair/hackbot_agents/build_repair/broker.py
 create mode 100644 agents/build-repair/hackbot_agents/build_repair/logs.py
 create mode 100644 agents/build-repair/pyproject.toml
 delete mode 100644 services/buildrepair/Dockerfile
 delete mode 100644 services/buildrepair/README.md
 delete mode 100644 services/buildrepair/docker-compose.dev.yml
 delete mode 100644 services/buildrepair/pyproject.toml

diff --git a/.github/dependabot.yml b/.github/dependabot.yml
index c666f68f3e..0c766de53a 100644
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@@ -72,25 +72,6 @@ updates:
     open-pull-requests-limit: 99
     allow:
       - dependency-type: direct
-  - package-ecosystem: uv
-    directory: "/services/buildrepair"
-    schedule:
-      interval: weekly
-      day: thursday
-    groups:
-      patch:
-        applies-to: version-updates
-        patterns:
-          - "*"
-        update-types:
-          - patch
-    cooldown:
-      semver-major-days: 14
-      semver-minor-days: 7
-      semver-patch-days: 3
-    open-pull-requests-limit: 99
-    allow:
-      - dependency-type: direct
   - package-ecosystem: npm
     directory: "/ui/changes"
     schedule:
diff --git a/agents/build-repair/Dockerfile b/agents/build-repair/Dockerfile
new file mode 100644
index 0000000000..fa3872a15e
--- /dev/null
+++ b/agents/build-repair/Dockerfile
@@ -0,0 +1,55 @@
+FROM python:3.12 AS builder
+
+COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
+
+ENV UV_PROJECT_ENVIRONMENT=/opt/venv
+
+WORKDIR /app
+
+# Install external deps without building workspace members.
+RUN --mount=type=cache,target=/root/.cache/uv \
+    --mount=type=bind,source=pyproject.toml,target=pyproject.toml \
+    --mount=type=bind,source=uv.lock,target=uv.lock \
+    --mount=type=bind,source=VERSION,target=VERSION \
+    uv sync --frozen --no-dev --no-install-workspace --package hackbot-agent-build-repair
+
+RUN --mount=type=cache,target=/root/.cache/uv \
+    --mount=type=bind,target=/app,rw \
+    uv sync --locked --no-dev --no-editable --package hackbot-agent-build-repair
+
+FROM python:3.12 AS base
+
+COPY --from=builder /opt/venv /opt/venv
+WORKDIR /app
+
+ENV PYTHONUNBUFFERED=1
+ENV PYTHONDONTWRITEBYTECODE=1
+ENV PATH="/opt/venv/bin:$PATH"
+
+FROM base AS agent
+
+# hackbot.toml lives at the agent root (not inside the package), so copy it into
+# the working dir; the runtime discovers it there (cwd) at startup.
+COPY agents/build-repair/hackbot.toml /app/hackbot.toml
+
+RUN useradd --create-home --shell /bin/bash agent \
+    && mkdir -p /workspace \
+    && chown agent:agent /workspace
+
+# `mach bootstrap` installs the toolchain here at runtime; put it on PATH so the
+# agent's own `./mach build` (and the build_firefox tool) find rustc/clang.
+ENV PATH="/home/agent/.cargo/bin:/home/agent/.mozbuild/clang/bin:${PATH}"
+
+USER agent
+
+CMD ["python", "-m", "hackbot_agents.build_repair"]
+
+FROM base AS broker
+
+RUN useradd --create-home --shell /bin/bash broker
+
+USER broker
+
+EXPOSE 8765
+
+CMD ["python", "-m", "hackbot_agents.build_repair.broker"]
diff --git a/agents/build-repair/compose.yml b/agents/build-repair/compose.yml
new file mode 100644
index 0000000000..c6e63839a1
--- /dev/null
+++ b/agents/build-repair/compose.yml
@@ -0,0 +1,38 @@
+services:
+  build-repair-broker:
+    build:
+      context: ../..
+      dockerfile: agents/build-repair/Dockerfile
+      target: broker
+    environment:
+      BUGZILLA_API_URL: ${BUGZILLA_API_URL}
+      BUGZILLA_API_KEY: ${BUGZILLA_API_KEY}
+    expose:
+      - "8765"
+
+  build-repair-agent:
+    build:
+      context: ../..
+      dockerfile: agents/build-repair/Dockerfile
+      target: agent
+    environment:
+      - RUN_ID
+      - BUG_ID=${BUG_ID:?error}
+      - GIT_COMMIT=${GIT_COMMIT:?error}
+      - FAILURE_TASKS=${FAILURE_TASKS:?error}
+      - RUN_TRY_PUSH=${RUN_TRY_PUSH:-false}
+      - BUGZILLA_MCP_URL=http://build-repair-broker:8765/mcp
+      - SOURCE_REPO=/workspace/firefox
+      - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:?error}
+      # No uploader locally: summary/logs/artifacts are written under
+      # /artifacts/<run_id>, bind-mounted to the host's ~/hackbot/artifacts.
+      - ARTIFACTS_DIR=/artifacts
+    volumes:
+      - workspace:/workspace
+      - ${HOME}/hackbot/artifacts:/artifacts
+    depends_on:
+      build-repair-broker:
+        condition: service_started
+
+volumes:
+  workspace:
diff --git a/agents/build-repair/hackbot.toml b/agents/build-repair/hackbot.toml
new file mode 100644
index 0000000000..21210d4d2e
--- /dev/null
+++ b/agents/build-repair/hackbot.toml
@@ -0,0 +1,8 @@
+[source]
+repo_url = "https://github.com/mozilla-firefox/firefox.git"
+checkout_path = "/workspace/firefox"
+# The failure commit is supplied per run via SOURCE_REF (from the git_commit input).
+
+[firefox]
+enabled = true
+objdir = "objdir-build-repair"
diff --git a/agents/build-repair/hackbot_agents/build_repair/__init__.py b/agents/build-repair/hackbot_agents/build_repair/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/agents/build-repair/hackbot_agents/build_repair/__main__.py b/agents/build-repair/hackbot_agents/build_repair/__main__.py
new file mode 100644
index 0000000000..bef34de4ef
--- /dev/null
+++ b/agents/build-repair/hackbot_agents/build_repair/__main__.py
@@ -0,0 +1,48 @@
+import os
+
+from hackbot_runtime import HackbotContext, run_async
+from pydantic_settings import BaseSettings, SettingsConfigDict
+
+from .agent import BuildRepairResult, run_build_repair
+
+
+class AgentInputs(BaseSettings):
+    bug_id: int | None = None
+    git_commit: str
+    failure_tasks: dict[str, str]
+    bugzilla_mcp_url: str
+    run_try_push: bool = False
+    model: str | None = None
+    max_turns: int | None = None
+
+    model_config = SettingsConfigDict(extra="ignore")
+
+
+async def main(ctx: HackbotContext) -> BuildRepairResult:
+    inputs = AgentInputs()
+
+    # The build failure lives at this commit; pin the checkout there before the
+    # runtime prepares the source tree (consumed in HackbotContext.source_repo).
+    os.environ.setdefault("SOURCE_REF", inputs.git_commit)
+
+    return await run_build_repair(
+        bugzilla_mcp_server={
+            "type": "http",
+            "url": inputs.bugzilla_mcp_url,
+        },
+        source_repo=ctx.source_repo,
+        fx_ctx=ctx.firefox,
+        bug_id=inputs.bug_id,
+        git_commit=inputs.git_commit,
+        failure_tasks=inputs.failure_tasks,
+        run_try_push=inputs.run_try_push,
+        model=inputs.model,
+        max_turns=inputs.max_turns,
+        log=ctx.log_path,
+        verbose=True,
+        publish_file=ctx.publish_file,
+    )
+
+
+if __name__ == "__main__":
+    run_async(main)
diff --git a/agents/build-repair/hackbot_agents/build_repair/agent.py b/agents/build-repair/hackbot_agents/build_repair/agent.py
index 7175bbd7c7..d32e908a22 100644
--- a/agents/build-repair/hackbot_agents/build_repair/agent.py
+++ b/agents/build-repair/hackbot_agents/build_repair/agent.py
@@ -3,587 +3,324 @@
 # License, v. 2.0. If a copy of the MPL was not distributed with this file,
 # You can obtain one at http://mozilla.org/MPL/2.0/.
 
-import subprocess
-import traceback
+"""Build-repair agent.
+
+Two-stage claude-agent-sdk agent that analyzes a Firefox build failure and
+implements a fix in the source tree. The runtime checks the tree out at the
+failure commit (via ``SOURCE_REF``) and collects the agent's edits into
+``changes.patch``; this module only orchestrates the agent and publishes the
+analysis artifacts.
+"""
+
+from __future__ import annotations
+
+import json
+import sys
+import tempfile
 from collections.abc import Callable
-from logging import getLogger
 from pathlib import Path
 
-from claude_agent_sdk import ClaudeAgentOptions, ResultMessage, query
-from pydantic import BaseModel, Field
-from tenacity import (
-    retry,
-    retry_if_exception,
-    retry_if_exception_message,
-    stop_after_attempt,
-    wait_exponential_jitter,
+from agent_tools import firefox
+from agent_tools.claude_sdk import build_sdk_server
+from agent_tools.firefox import FirefoxContext
+from claude_agent_sdk import (
+    AssistantMessage,
+    ClaudeAgentOptions,
+    ClaudeSDKClient,
+    McpServerConfig,
+    ResultMessage,
+    ToolResultBlock,
+    ToolUseBlock,
+    UserMessage,
 )
+from hackbot_agents.build_repair.logs import download_failure_logs
+from hackbot_agents.build_repair.try_push import TRY_TOOLS
+from hackbot_runtime import AgentError, HackbotAgentResult
+from hackbot_runtime.claude import Reporter
 
-from bugbug.tools.base import GenerativeModelTool
-from bugbug.tools.build_repair.config import (
+from .config import (
     ADDITIONAL_DIRS,
     ALLOWED_TOOLS,
     ANALYSIS_MODEL,
-    FIREFOX_MCP_URL,
+    BUGZILLA_READ_TOOLS,
+    BUILD_TOOL,
+    FIREFOX_TOOLS,
     FIX_MODEL,
-    SANDBOX_CONFIG,
-    VERIFY_ALLOWED_TOOLS,
-    VERIFY_MODEL,
+    TRY_PUSH_TOOL,
 )
-from bugbug.tools.build_repair.prompts import (
+from .prompts import (
     ANALYSIS_TEMPLATE,
-    EVAL_PROMPT,
+    BUG_ANALYSIS_STEP,
+    BUG_CONTEXT,
     FIX_TEMPLATE,
-    VERIFY_TEMPLATE,
+    TRY_PUSH_INSTRUCTIONS,
 )
 
-logger = getLogger(__name__)
+TARGET_SOFTWARE = "Mozilla Firefox"
 
 
-class BuildFailure(BaseModel):
-    """Input describing a build failure from the dataset."""
-
-    bug_id: int = Field(description="The ID of the bug in Bugzilla.")
-    bug_title: str | None = Field(default=None, description="Optional bug title.")
-    bug_comments: list[str] | None = Field(
-        default=None, description="Optional bug comments."
-    )
-    git_commit: str = Field(description="Git revision to checkout.")
-    failure_tasks: list[dict] = Field(
-        description="List of {task_name, task_id, retry_id, failure_lines}."
-    )
+class BuildRepairResult(HackbotAgentResult):
+    bug_id: int | None = None
+    git_commit: str
+    summary: str = ""  # text of summary.md from the scratch out dir, "" if absent
+    analysis: str = ""  # text of analysis.md from the scratch out dir, "" if absent
+    local_build_verified: bool | None = None  # "success" of the captured BUILD_TOOL result
+    try_build_passed: bool | None = None  # from the captured TRY_PUSH_TOOL result
+    lando_job_id: str | None = None  # from the captured TRY_PUSH_TOOL result
+    treeherder_url: str | None = None  # from the captured TRY_PUSH_TOOL result
 
 
-class UsageStats(BaseModel):
-    cost_usd: float = Field(default=0.0)
-    num_turns: int = Field(default=0)
-    input_tokens: int = Field(default=0)
-    output_tokens: int = Field(default=0)
-    cache_read_input_tokens: int = Field(default=0)
-    cache_creation_input_tokens: int = Field(default=0)
-
-
-class AgentResponse(UsageStats):
-    """Output from a build repair run, including analysis, diff, cost, and build results."""
-
-    summary: str = Field(default="")
-    analysis: str = Field(default="")
-    diff: str = Field(default="")
-    error: str | None = Field(default=None)
-    error_traceback: str | None = Field(default=None)
-    failure_stage: str | None = Field(default=None)
-    cost_usd: float = Field(default=0.0)
-    num_turns: int = Field(default=0)
-    input_tokens: int = Field(default=0)
-    output_tokens: int = Field(default=0)
-    cache_read_input_tokens: int = Field(default=0)
-    cache_creation_input_tokens: int = Field(default=0)
-    local_build_passed: bool | None = Field(default=None)
-    try_build_passed: bool | None = Field(default=None)
-    lando_job_id: str | None = Field(default=None)
-    treeherder_url: str | None = Field(default=None)
-    stage1_transcript: list[dict] = Field(default_factory=list)
-    stage2_transcript: list[dict] = Field(default_factory=list)
-
-
-class GroundTruth(BaseModel):
-    gh_fix_commits: list[str] = Field(
-        description="Git commit hashes of the ground truth fix."
+def _result_text(block: ToolResultBlock) -> str:
+    """Return a tool result's content as text, joining the text parts of a list."""
+    payload = block.content
+    if isinstance(payload, str):
+        return payload
+    if not isinstance(payload, list):
+        return str(payload)
+    parts = (p for p in payload if isinstance(p, dict))
+    texts = [p.get("text", "") for p in parts if p.get("type") == "text"]
+    return "\n".join(texts)
+
+
+def _build_options(
+    *,
+    model: str | None,
+    effort: str,
+    cwd: Path,
+    scratch_dir: Path,
+    mcp_servers: dict[str, McpServerConfig],
+    allowed_tools: list[str],
+    max_turns: int | None,
+) -> ClaudeAgentOptions:
+    # Options for one agent stage. The agent always runs inside an isolated Docker
+    # container, so there is no sandbox and tools run without permission prompts.
+    return ClaudeAgentOptions(
+        model=model,
+        cwd=str(cwd),
+        mcp_servers=mcp_servers,
+        allowed_tools=allowed_tools,
+        disallowed_tools=["AskUserQuestion", "Task"],
+        add_dirs=[*ADDITIONAL_DIRS, str(scratch_dir)],  # shared dirs plus the scratch dir
+        permission_mode="bypassPermissions",
+        effort=effort,
+        max_turns=max_turns,
+        setting_sources=[],  # presumably: load no on-disk settings — confirm in SDK docs
     )
 
 
-class Judgment(BaseModel):
-    analysis_correct: bool
-    analysis_quality: float
-    analysis_explanation: str
-    fix_matches_ground_truth: bool
-    fix_quality: float
-    fix_explanation: str
-    fix_acceptance_probability: float
-    fix_acceptance_explanation: str
+def _write_mozconfig(fx_ctx: FirefoxContext) -> None:
+    """Write a mozconfig mirroring the failing CI build, unless one exists.
 
-
-class VerifyResponse(UsageStats):
-    judgment: Judgment | None = Field(default=None)
-    verification_transcript: list[dict] = Field(default_factory=list)
-
-
-class BuildRepairTool(GenerativeModelTool):
-    """Two-stage build repair agent using Claude Agent SDK.
-
-    Stage 1: Analyzes the failure and produces analysis/planning/summary docs.
-    Stage 2: Reads the analysis and implements a fix. Skipped in analysis-only mode.
-    After Stage 2, commits the fix, runs ./mach build, and optionally submits to try.
+    Verification only means something if the local build reproduces the failure
+    condition. Many failures (e.g. a variable used only inside a stripped
+    ``MOZ_DIAGNOSTIC_ASSERT``) compile fine in a default Nightly-style build and
+    fail only in a release-milestone build with warnings-as-errors.
+    ``--enable-release`` leaves ``MOZ_DIAGNOSTIC_ASSERT_ENABLED`` undefined and
+    ``--enable-warnings-as-errors`` promotes warnings to errors, so this config
+    surfaces that whole class locally.
     """
+    if fx_ctx.mozconfig.exists():
+        return  # never overwrite a mozconfig that is already in place
+    fx_ctx.mozconfig.write_text(
+        "ac_add_options --enable-application=browser\n"
+        "ac_add_options --disable-debug\n"
+        "ac_add_options --enable-release\n"
+        "ac_add_options --enable-warnings-as-errors\n"
+        f"mk_add_options MOZ_OBJDIR={fx_ctx.objdir}\n"
+    )
 
-    def __init__(
-        self,
-        target_software: str = "Mozilla Firefox",
-        analysis_only: bool = False,
-        eval_mode: bool = False,
-        analysis_model: str = ANALYSIS_MODEL,
-        fix_model: str = FIX_MODEL,
-        verify_model: str = VERIFY_MODEL,
-    ) -> None:
-        self.eval_mode = eval_mode
-        self.target_software = target_software
-        self.analysis_only = analysis_only
-        self.analysis_model = analysis_model
-        self.fix_model = fix_model
-        self.verify_model = verify_model
-
-    @classmethod
-    def create(cls, **kwargs):
-        return cls(**kwargs)
-
-    @staticmethod
-    def _usage_fields(usage: dict) -> dict:
-        return {
-            "input_tokens": usage.get("input_tokens", 0),
-            "output_tokens": usage.get("output_tokens", 0),
-            "cache_read_input_tokens": usage.get("cache_read_input_tokens", 0),
-            "cache_creation_input_tokens": usage.get("cache_creation_input_tokens", 0),
-        }
-
-    @staticmethod
-    def _serialize_message(message) -> dict:
-        data = {"type": type(message).__name__}
-        if hasattr(message, "model_dump"):
-            data.update(message.model_dump())
-        elif hasattr(message, "__dict__"):
-            data.update(vars(message))
-        else:
-            data["raw"] = str(message)
-        return data
-
-    async def _run_stage(
-        self,
-        stage_name: str,
-        prompt: str,
-        model: str,
-        options: ClaudeAgentOptions,
-        bug_id: int,
-        on_message: Callable[[str, dict], None] | None = None,
-    ) -> tuple[list[dict], float, int, dict]:
-        transcript: list[dict] = []
-        cost = 0.0
-        turns = 0
-        result_data: dict = {}
-        usage: dict = {}
-
-        @retry(
-            retry=(
-                retry_if_exception_message(match="Control request timeout")
-                | retry_if_exception_message(match="overloaded")
-                | retry_if_exception_message(match="529")
-                | retry_if_exception_message(match="exit code")
-                | retry_if_exception(
-                    lambda e: isinstance(e, (TimeoutError, ConnectionError, OSError))
-                )
-            ),
-            stop=stop_after_attempt(5),
-            wait=wait_exponential_jitter(initial=2, max=60, jitter=5),
-            before_sleep=lambda rs: logger.warning(
-                "Bug %s: %s transient error (attempt %d/5), retrying: %s",
-                bug_id,
-                stage_name,
-                rs.attempt_number,
-                rs.outcome.exception(),
-            ),
-            reraise=True,
-        )
-        async def _query():
-            nonlocal cost, turns, usage, result_data
-            async for message in query(prompt=prompt, options=options):
-                serialized = self._serialize_message(message)
-                transcript.append(serialized)
-                logger.debug("Bug %s: %s [%s]", bug_id, stage_name, serialized["type"])
-                if on_message:
-                    on_message(stage_name, serialized)
-                if isinstance(message, ResultMessage):
-                    cost += message.total_cost_usd or 0
-                    turns += message.num_turns or 0
-                    usage = getattr(message, "usage", {}) or {}
-                    result_data = serialized
-
-        if on_message:
-            on_message(
-                stage_name,
-                {
-                    "type": "stage_start",
-                    "prompt": prompt,
-                    "model": model,
-                },
-            )
-        try:
-            await _query()
-        finally:
-            if on_message:
-                on_message(
-                    stage_name,
-                    {
-                        "type": "stage_end",
-                        "cost_usd": cost,
-                        "num_turns": turns,
-                        "result_data": result_data,
-                    },
-                )
-
-        return transcript, cost, turns, usage
-
-    def _prepare_input_files(self, failure: BuildFailure, worktree_path: Path) -> None:
-        in_dir = worktree_path / "repair_agent" / "in" / str(failure.bug_id)
-        in_dir.mkdir(parents=True, exist_ok=True)
-
-        (in_dir / "bug_description.md").write_text(
-            f"# Bug {failure.bug_id}: {failure.bug_title}\n\n"
-            + "\n\n---\n\n".join(failure.bug_comments or [])
-        )
-
-        logs_content = ""
-        for task in failure.failure_tasks:
-            logs_content += f"## {task['task_name']} (task_id: {task['task_id']})\n\n"
-            logs_content += "\n".join(task["failure_lines"]) + "\n\n"
-        (in_dir / "build_failure_logs.md").write_text(logs_content)
-
-        out_dir = worktree_path / "repair_agent" / "out" / str(failure.bug_id)
-        out_dir.mkdir(parents=True, exist_ok=True)
 
-        logger.info(
-            "Prepared input files for bug %s at %s (%d failure tasks)",
-            failure.bug_id,
-            in_dir,
-            len(failure.failure_tasks),
-        )
+async def run_build_repair(
+    *,
+    bugzilla_mcp_server: McpServerConfig,
+    source_repo: Path,
+    fx_ctx: FirefoxContext,
+    bug_id: int | None = None,
+    git_commit: str,
+    failure_tasks: dict[str, str],
+    run_try_push: bool = False,
+    model: str | None = None,
+    max_turns: int | None = None,
+    verbose: bool = False,
+    log: Path | None = None,
+    publish_file: Callable[[str, Path, str | None], str] | None = None,
+) -> BuildRepairResult:
+    """Analyze a build failure and implement a fix in ``source_repo``.
+
+    Returns a :class:`BuildRepairResult`; raises :class:`AgentError` if a stage
+    ends in an error or produces no result.
+    """
+    label = f"bug {bug_id}" if bug_id is not None else f"commit {git_commit[:12]}"
+    print(f"[build_repair] repairing {label} at {git_commit}", file=sys.stderr)
+
+    scratch_dir = Path(tempfile.mkdtemp(prefix=f"build-repair-{bug_id or 'nobug'}-"))
+    scratch_in = scratch_dir / "in"
+    scratch_out = scratch_dir / "out"
+    scratch_in.mkdir(parents=True, exist_ok=True)
+    scratch_out.mkdir(parents=True, exist_ok=True)
+
+    task_logs = await download_failure_logs(failure_tasks, scratch_in)
+    failure_logs = "\n".join(
+        f"- {name}: sanitized errors at {tl.sanitized} (start here); "
+        f"full log at {tl.full}"
+        for name, tl in task_logs.items()
+    )
 
-    def _read_output(self, failure: BuildFailure, worktree_path: Path, key: str) -> str:
-        path = (
-            worktree_path / "repair_agent" / "out" / str(failure.bug_id) / f"{key}.md"
-        )
-        if path.exists():
-            return path.read_text()
-        return ""
+    firefox_tools = [*firefox.TOOLS, *TRY_TOOLS] if run_try_push else firefox.TOOLS
+    firefox_server = build_sdk_server("firefox", fx_ctx, firefox_tools)
+    mcp_servers: dict[str, McpServerConfig] = {
+        "bugzilla": bugzilla_mcp_server,
+        "firefox": firefox_server,
+    }
+    allowed_tools = [
+        *ALLOWED_TOOLS,
+        *BUGZILLA_READ_TOOLS,
+        *FIREFOX_TOOLS,
+        *([TRY_PUSH_TOOL] if run_try_push else []),
+    ]
+
+    task_name = next(iter(failure_tasks), "")  # first task name, "" if there are none
+    analysis_prompt = ANALYSIS_TEMPLATE.format(
+        target_software=TARGET_SOFTWARE,
+        git_commit=git_commit,
+        failure_logs=failure_logs,
+        scratch_out=scratch_out,
+        bug_context=BUG_CONTEXT.format(bug_id=bug_id) if bug_id is not None else "",
+        bug_step=BUG_ANALYSIS_STEP.format(bug_id=bug_id) if bug_id is not None else "",
+        logs_num=3 if bug_id is not None else 2,  # presumably a prompt step number — confirm
+    )
+    fix_prompt = FIX_TEMPLATE.format(
+        target_software=TARGET_SOFTWARE,
+        scratch_out=scratch_out,
+        try_push=(
+            TRY_PUSH_INSTRUCTIONS.format(task_name=task_name) if run_try_push else ""
+        ),
+    )
 
-    async def run(
-        self,
-        failure: BuildFailure,
-        worktree_path: Path,
-        skip_try_push: bool = False,
-        on_message: Callable[[str, dict], None] | None = None,
-    ) -> AgentResponse:
-        logger.info(
-            "Starting build repair for bug %s "
-            "(commit=%s, worktree=%s, analysis_only=%s, skip_try_push=%s)",
-            failure.bug_id,
-            failure.git_commit,
-            worktree_path,
-            self.analysis_only,
-            skip_try_push,
-        )
-        self._prepare_input_files(failure, worktree_path)
-
-        mcp_servers = {"firefox": {"type": "http", "url": FIREFOX_MCP_URL}}
-        disallowed = ["AskUserQuestion", "Task"]
-        total_cost = 0.0
-        total_turns = 0
-        total_usage: dict = {}
-
-        logger.info(
-            "Bug %s: starting Stage 1 (analysis) with model=%s",
-            failure.bug_id,
-            self.analysis_model,
-        )
-        stage1_options = ClaudeAgentOptions(
-            model=self.analysis_model,
-            cwd=str(worktree_path),
-            allowed_tools=ALLOWED_TOOLS,
-            disallowed_tools=disallowed,
-            add_dirs=ADDITIONAL_DIRS,
-            sandbox=SANDBOX_CONFIG,
-            permission_mode="acceptEdits",
+    total_cost = 0.0
+    total_turns = 0
+    # Last JSON result of each tracked tool, keyed by tool name. Lets us report
+    # the actual local-build / try-push outcomes instead of guessing.
+    captured: dict[str, dict] = {}
+    tracked = {BUILD_TOOL, *([TRY_PUSH_TOOL] if run_try_push else [])}
+
+    with Reporter(verbose=verbose, log_path=log) as reporter:
+        # Stage 1: analysis (high effort, no source edits yet).
+        reporter.header(f"{label}: analysis")
+        analysis_opts = _build_options(
+            model=model or ANALYSIS_MODEL,
             effort="high",
+            cwd=source_repo,
+            scratch_dir=scratch_dir,
             mcp_servers=mcp_servers,
+            allowed_tools=allowed_tools,
+            max_turns=max_turns,
         )
-        analysis_prompt = ANALYSIS_TEMPLATE.format(
-            bug_id=failure.bug_id,
-            target_software=self.target_software,
-            worktree_path=worktree_path,
-            eval=EVAL_PROMPT if self.eval_mode else "",
-        )
-        try:
-            (
-                stage1_transcript,
-                stage1_cost,
-                stage1_turns,
-                stage1_usage,
-            ) = await self._run_stage(
-                "analysis",
-                analysis_prompt,
-                self.analysis_model,
-                stage1_options,
-                failure.bug_id,
-                on_message,
-            )
-            total_cost += stage1_cost
-            total_turns += stage1_turns
-            for k, v in stage1_usage.items():
-                if isinstance(v, (int, float)):
-                    total_usage[k] = total_usage.get(k, 0) + v
-        except Exception as e:
-            logger.error(
-                "Bug %s: starting Stage 2 (fix) with model=%s",
-                failure.bug_id,
-                self.fix_model,
-            )
-            return AgentResponse(
-                error=str(e),
-                error_traceback=traceback.format_exc(),
-                failure_stage="analysis",
-                cost_usd=total_cost,
-                num_turns=total_turns,
-                **self._usage_fields(total_usage),
-            )
-
-        logger.info(
-            "Bug %s: Stage 1 complete (cost=$%.4f, turns=%d)",
-            failure.bug_id,
-            total_cost,
-            total_turns,
+        result_msg = await _run_session(
+            reporter, analysis_opts, analysis_prompt, captured, tracked
         )
-        summary = self._read_output(failure, worktree_path, "summary")
-        analysis = self._read_output(failure, worktree_path, "analysis")
-        logger.info(
-            "Bug %s: read output files (summary=%d chars, analysis=%d chars)",
-            failure.bug_id,
-            len(summary),
-            len(analysis),
-        )
-
-        if self.analysis_only:
-            logger.info("Bug %s: analysis-only mode, skipping Stage 2", failure.bug_id)
-            return AgentResponse(
-                summary=summary,
-                analysis=analysis,
-                cost_usd=total_cost,
-                num_turns=total_turns,
-                **self._usage_fields(total_usage),
-                stage1_transcript=stage1_transcript,
-            )
-
-        logger.info(
-            "Bug %s: starting Stage 2 (fix) with model=%s",
-            failure.bug_id,
-            self.fix_model,
-        )
-        stage2_options = ClaudeAgentOptions(
-            model=self.fix_model,
-            cwd=str(worktree_path),
-            allowed_tools=ALLOWED_TOOLS,
-            disallowed_tools=disallowed,
-            add_dirs=ADDITIONAL_DIRS,
-            sandbox=SANDBOX_CONFIG,
-            permission_mode="acceptEdits",
+        _check(result_msg, label, "analysis")  # raises AgentError if the stage failed
+        total_cost += result_msg.total_cost_usd or 0.0
+        total_turns += result_msg.num_turns or 0
+
+        # Stage 2: fix (lower effort, edits the source tree and verifies it
+        # builds against a mozconfig that mirrors the failing CI config).
+        _write_mozconfig(fx_ctx)
+        reporter.header(f"{label}: fix")
+        fix_opts = _build_options(
+            model=model or FIX_MODEL,
             effort="low",
+            cwd=source_repo,
+            scratch_dir=scratch_dir,
             mcp_servers=mcp_servers,
+            allowed_tools=allowed_tools,
+            max_turns=max_turns,
         )
-        fix_prompt = FIX_TEMPLATE.format(
-            target_software=self.target_software,
-            bug_id=failure.bug_id,
-            worktree_path=worktree_path,
-            eval=EVAL_PROMPT if self.eval_mode else "",
-        )
-        try:
-            (
-                stage2_transcript,
-                stage2_cost,
-                stage2_turns,
-                stage2_usage,
-            ) = await self._run_stage(
-                "fix",
-                fix_prompt,
-                self.fix_model,
-                stage2_options,
-                failure.bug_id,
-                on_message,
-            )
-            total_cost += stage2_cost
-            total_turns += stage2_turns
-            for k, v in stage2_usage.items():
-                if isinstance(v, (int, float)):
-                    total_usage[k] = total_usage.get(k, 0) + v
-        except Exception as e:
-            logger.exception(
-                "Bug %s: Stage 2 (fix) failed: %s",
-                failure.bug_id,
-                e,
-            )
-            return AgentResponse(
-                summary=summary,
-                analysis=analysis,
-                error=str(e),
-                error_traceback=traceback.format_exc(),
-                failure_stage="fix",
-                cost_usd=total_cost,
-                num_turns=total_turns,
-                **self._usage_fields(total_usage),
-            )
-
-        logger.info(
-            "Bug %s: Stage 2 complete (cost=$%.4f, turns=%d)",
-            failure.bug_id,
-            total_cost,
-            total_turns,
+        result_msg = await _run_session(
+            reporter, fix_opts, fix_prompt, captured, tracked
         )
-
-        subprocess.run(
-            ["git", "add", "-A"],
-            cwd=worktree_path,
-            capture_output=True,
-        )
-        diff_result = subprocess.run(
-            ["git", "diff", "--staged", "HEAD"],
-            cwd=worktree_path,
-            capture_output=True,
-            text=True,
-        )
-        diff = diff_result.stdout
-        logger.info("Bug %s: git diff produced %d chars", failure.bug_id, len(diff))
-
-        if not diff.strip():
-            logger.warning("Bug %s: no diff produced, returning early", failure.bug_id)
-            return AgentResponse(
-                summary=summary,
-                analysis=analysis,
-                diff=diff,
-                cost_usd=total_cost,
-                num_turns=total_turns,
-                **self._usage_fields(total_usage),
-                stage1_transcript=stage1_transcript,
-                stage2_transcript=stage2_transcript,
-            )
-
-        from bugbug.tools.build_repair.try_server import run_try_verification
-
-        task_name = (
-            failure.failure_tasks[0]["task_name"] if failure.failure_tasks else ""
-        )
-        logger.info(
-            "Bug %s: starting try verification (task=%s, skip_try_push=%s)",
-            failure.bug_id,
-            task_name,
-            skip_try_push,
-        )
-        try_result = run_try_verification(
-            worktree_path=worktree_path,
-            bug_id=failure.bug_id,
-            task_name=task_name,
-            skip_try_push=skip_try_push,
-        )
-
-        logger.info(
-            "Bug %s: try verification done "
-            "(local_build=%s, try_build=%s, lando_job=%s, "
-            "total_cost=$%.4f, total_turns=%d)",
-            failure.bug_id,
-            try_result.local_build_passed,
-            try_result.try_build_passed,
-            try_result.lando_job_id,
-            total_cost,
-            total_turns,
-        )
-        return AgentResponse(
-            summary=summary,
-            analysis=analysis,
-            diff=diff,
-            cost_usd=total_cost,
-            num_turns=total_turns,
-            **self._usage_fields(total_usage),
-            local_build_passed=try_result.local_build_passed,
-            try_build_passed=try_result.try_build_passed,
-            lando_job_id=try_result.lando_job_id,
-            treeherder_url=try_result.treeherder_url,
-            stage1_transcript=stage1_transcript,
-            stage2_transcript=stage2_transcript,
-        )
-
-    @retry(
-        stop=stop_after_attempt(3),
-        wait=wait_exponential_jitter(initial=2, max=30, jitter=5),
-        before_sleep=lambda rs: logger.warning(
-            "Verification failed (attempt %d/3), retrying: %s",
-            rs.attempt_number,
-            rs.outcome.exception(),
-        ),
-        reraise=True,
+        _check(result_msg, label, "fix")  # raises AgentError if the stage failed
+        total_cost += result_msg.total_cost_usd or 0.0
+        total_turns += result_msg.num_turns or 0
+
+    summary = _read_doc(scratch_out, "summary", publish_file)
+    analysis = _read_doc(scratch_out, "analysis", publish_file)
+
+    build_result = captured.get(BUILD_TOOL)  # None if no build result was captured
+    try_result = captured.get(TRY_PUSH_TOOL, {})  # {} if no try push was captured
+
+    return BuildRepairResult(
+        bug_id=bug_id,
+        git_commit=git_commit,
+        summary=summary,
+        analysis=analysis,
+        local_build_verified=build_result.get("success") if build_result else None,
+        try_build_passed=try_result.get("try_build_passed"),
+        lando_job_id=try_result.get("lando_job_id"),
+        treeherder_url=try_result.get("treeherder_url"),
+        num_turns=total_turns,
+        total_cost_usd=total_cost,
     )
-    async def verify(
-        self,
-        failure: BuildFailure,
-        agent_diff: str,
-        ground_truth: GroundTruth,
-        worktree_path: Path,
-        on_message: Callable[[str, dict], None] | None = None,
-    ) -> VerifyResponse:
-        out_dir = worktree_path / "repair_agent" / "out" / str(failure.bug_id)
-        out_dir.mkdir(parents=True, exist_ok=True)
-        (out_dir / "agent_fix.diff").write_text(agent_diff, encoding="utf-8")
-
-        gt_commits = " ".join(ground_truth.gh_fix_commits)
-        prompt = VERIFY_TEMPLATE.format(
-            target_software=self.target_software,
-            bug_id=failure.bug_id,
-            failure_commit=failure.git_commit,
-            ground_truth_commits=gt_commits,
-            worktree_path=worktree_path,
-        )
 
-        options = ClaudeAgentOptions(
-            model=self.verify_model,
-            cwd=str(worktree_path),
-            allowed_tools=VERIFY_ALLOWED_TOOLS,
-            disallowed_tools=["AskUserQuestion", "Task"],
-            sandbox=SANDBOX_CONFIG,
-            permission_mode="acceptEdits",
-            effort="high",
-            output_format={
-                "type": "json_schema",
-                "schema": Judgment.model_json_schema(),
-            },
-        )
 
-        logger.info(
-            "Bug %s: starting verification stage (model=%s, ground_truth=%s)",
-            failure.bug_id,
-            self.verify_model,
-            gt_commits,
-        )
+async def _run_session(
+    reporter: Reporter,
+    options: ClaudeAgentOptions,
+    prompt: str,
+    captured: dict[str, dict],
+    tracked: set[str],
+) -> ResultMessage | None:
+    """Drive one agent session, capturing the last result of each tracked tool.
 
-        transcript, cost, turns, usage = await self._run_stage(
-            "verification",
-            prompt,
-            self.verify_model,
-            options,
-            failure.bug_id,
-            on_message,
+    ``captured`` (tool name -> result) is updated in place with the JSON object
+    returned by each successful ``tracked`` tool call; non-object output is
+    skipped. Returns the last ``ResultMessage`` seen, or ``None`` if none came.
+    """
+    pending: dict[str, str] = {}
+    result_msg: ResultMessage | None = None
+    async with ClaudeSDKClient(options=options) as client:
+        await client.query(prompt)
+        async for msg in client.receive_response():
+            reporter.message(msg)
+            if isinstance(msg, AssistantMessage):
+                for block in msg.content:
+                    if isinstance(block, ToolUseBlock) and block.name in tracked:
+                        pending[block.id] = block.name
+            elif isinstance(msg, UserMessage) and isinstance(msg.content, list):
+                for block in msg.content:
+                    if not isinstance(block, ToolResultBlock) or block.is_error:
+                        continue
+                    if (name := pending.pop(block.tool_use_id, None)) is None:
+                        continue
+                    try:
+                        parsed = json.loads(_result_text(block))
+                    except (ValueError, TypeError):
+                        continue  # not JSON: keep any earlier captured result
+                    if isinstance(parsed, dict):  # callers use .get() on these
+                        captured[name] = parsed
+            elif isinstance(msg, ResultMessage):
+                result_msg = msg
+    return result_msg
+
+
+def _check(result_msg: ResultMessage | None, label: str, stage: str) -> None:
+    if result_msg is None:  # the session never yielded a ResultMessage
+        raise AgentError(f"{label}: {stage} stage produced no result message")
+    if result_msg.is_error:  # the stage's result is flagged as an error
+        raise AgentError(
+            f"{label}: {stage} stage failed: {result_msg.result or result_msg.subtype}"
         )
 
-        judgment: Judgment | None = None
-        for msg in reversed(transcript):
-            if msg.get("structured_output"):
-                judgment = Judgment.model_validate(msg["structured_output"])
-                break
-
-        if judgment is None:
-            result_msgs = [m for m in transcript if m.get("type") == "ResultMessage"]
-            raise RuntimeError(
-                f"Bug {failure.bug_id}: verification produced no structured output. "
-                f"Result messages: {result_msgs}"
-            )
-
-        return VerifyResponse(
-            judgment=judgment,
-            cost_usd=cost,
-            num_turns=turns,
-            verification_transcript=transcript,
-            **self._usage_fields(usage),
-        )
+
+def _read_doc(
+    scratch_out: Path,
+    key: str,
+    publish_file: Callable[[str, Path, str | None], str] | None,
+) -> str:
+    """Read ``<key>.md`` from ``scratch_out`` as UTF-8 and optionally publish it."""
+    path = scratch_out / f"{key}.md"
+    if not path.exists():
+        return ""  # the agent did not write this doc
+    if publish_file is not None:
+        publish_file(f"{key}.md", path, "text/markdown")
+    return path.read_text(encoding="utf-8")  # do not depend on the locale encoding
diff --git a/agents/build-repair/hackbot_agents/build_repair/broker.py b/agents/build-repair/hackbot_agents/build_repair/broker.py
new file mode 100644
index 0000000000..70b275808b
--- /dev/null
+++ b/agents/build-repair/hackbot_agents/build_repair/broker.py
@@ -0,0 +1,99 @@
+"""Bugzilla MCP broker.
+
+Sidecar container that holds the Bugzilla API key and serves the
+bugzilla MCP tools over HTTP. The agent process (in a sibling container
+in the same Cloud Run Job task) reaches us at `127.0.0.1:<port>/mcp`.
+The agent container itself binds no Bugzilla credentials.
+"""
+
+import logging
+from contextlib import asynccontextmanager
+
+import bugsy
+import uvicorn
+from agent_tools import bugzilla
+from agent_tools.bugzilla import BugzillaContext
+from agent_tools.claude_sdk import build_sdk_server
+from mcp.server.streamable_http_manager import StreamableHTTPSessionManager
+from pydantic import field_validator
+from pydantic_settings import BaseSettings, SettingsConfigDict
+from starlette.applications import Starlette
+from starlette.routing import Mount
+
+log = logging.getLogger("bugzilla-broker")
+
+
+class BrokerInputs(BaseSettings):
+    bugzilla_api_url: str  # normalized by the validator below to end in "/rest"
+    bugzilla_api_key: str  # handed to bugsy.Bugsy(api_key=...) in build_app
+    host: str = "0.0.0.0"  # bind address passed to uvicorn.run in main()
+    port: int = 8765  # bind port passed to uvicorn.run in main()
+
+    model_config = SettingsConfigDict(extra="ignore")  # unknown inputs are ignored
+
+    @field_validator("bugzilla_api_url")
+    @classmethod
+    def _ensure_rest_base(cls, v: str) -> str:
+        """Bugsy expects the REST base (``.../rest``) and just appends the path.
+
+        A bare host like ``https://bugzilla.mozilla.org`` makes every call hit
+        the HTML site and fail to parse as JSON, so normalize it here.
+        """
+        v = v.rstrip("/")  # tolerate trailing slashes before checking the suffix
+        return v if v.endswith("/rest") else f"{v}/rest"
+
+
+def build_app(inputs: BrokerInputs) -> Starlette:
+    client = bugsy.Bugsy(
+        api_key=inputs.bugzilla_api_key, bugzilla_url=inputs.bugzilla_api_url
+    )
+    ctx = BugzillaContext(client=client)
+    sdk_config = build_sdk_server("bugzilla", ctx, bugzilla.TOOLS)
+    mcp_server = sdk_config["instance"]  # the server object from the SDK config dict
+
+    manager = StreamableHTTPSessionManager(app=mcp_server, stateless=True)
+
+    @asynccontextmanager
+    async def lifespan(app):
+        # Probe Bugzilla once at startup so a bad API URL/key surfaces here as a
+        # clear log line instead of an opaque JSON-decode error on every tool
+        # call. We stay up regardless: the agent then gets a structured error.
+        try:
+            version = client.request("version").get("version")
+            log.info(
+                "bugzilla reachable at %s (version %s)",
+                inputs.bugzilla_api_url,
+                version,
+            )
+        except Exception:  # deliberately broad: startup must survive any probe failure
+            log.exception(
+                "bugzilla health check failed against %s -- check BUGZILLA_API_URL "
+                "and BUGZILLA_API_KEY; tool calls will fail until this is fixed",
+                inputs.bugzilla_api_url,
+            )
+        async with manager.run():
+            log.info(
+                "bugzilla broker ready on %s:%d (read-only)",
+                inputs.host,
+                inputs.port,
+            )
+            yield  # the app serves requests while manager.run() stays entered
+
+    async def mcp_handler(scope, receive, send):  # plain ASGI callable mounted at /mcp
+        await manager.handle_request(scope, receive, send)
+
+    return Starlette(routes=[Mount("/mcp", app=mcp_handler)], lifespan=lifespan)
+
+
+def main() -> None:
+    logging.basicConfig(
+        level=logging.INFO,
+        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+    )
+    inputs = BrokerInputs()  # settings come from the environment (BUGZILLA_API_URL, ...)
+    app = build_app(inputs)
+    uvicorn.run(app, host=inputs.host, port=inputs.port, log_config=None)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/agents/build-repair/hackbot_agents/build_repair/config.py b/agents/build-repair/hackbot_agents/build_repair/config.py
index a3f69ef34e..b52b13b1ad 100644
--- a/agents/build-repair/hackbot_agents/build_repair/config.py
+++ b/agents/build-repair/hackbot_agents/build_repair/config.py
@@ -3,88 +3,49 @@
 # License, v. 2.0. If a copy of the MPL was not distributed with this file,
 # You can obtain one at http://mozilla.org/MPL/2.0/.
 
-from datetime import date
-
-from claude_agent_sdk import SandboxNetworkConfig, SandboxSettings
-
-ANALYSIS_MODEL = "claude-opus-4-6"
-FIX_MODEL = "claude-opus-4-6"
-VERIFY_MODEL = "claude-opus-4-6"
-DEFAULT_MAX_TURNS = 80
-WORKTREE_BASE_DIR = "/tmp/build_repair_worktrees"
-TRY_PUSH_TIMEOUT_SECONDS = 7200
-TRY_PUSH_POLL_INTERVAL_SECONDS = 60
-TREEHERDER_BASE_URL = "https://treeherder.mozilla.org"
-
-FIREFOX_MCP_URL = "https://mcp-dev.moz.tools/mcp"
-
-# Training data cutoff dates per model, for data contamination filtering.
-# Examples with fix_commit_date before the cutoff may have been in training data.
-# Source: https://platform.claude.com/docs/en/about-claude/models/overview
-MODEL_CUTOFF_DATES = {
-    "claude-opus-4-6": date(2025, 8, 1),
-    "claude-sonnet-4-6": date(2026, 1, 1),
-    "claude-haiku-4-5-20251001": date(2025, 7, 1),
-    "claude-sonnet-4-5-20250929": date(2025, 7, 1),
-    "claude-opus-4-5-20251101": date(2025, 8, 1),
-    "claude-opus-4-1-20250805": date(2025, 3, 1),
-    "claude-sonnet-4-20250514": date(2025, 3, 1),
-    "claude-3-7-sonnet-20250219": date(2024, 11, 1),
-    "claude-opus-4-20250514": date(2025, 3, 1),
-}
+"""Models and tool allowlist for the build-repair agent."""
+
+ANALYSIS_MODEL = "claude-opus-4-8"
+FIX_MODEL = "claude-opus-4-8"
+
+# Bugzilla MCP tool names as exposed to the agent (mcp__<server>__<tool>).
+BUGZILLA_READ_TOOLS = [
+    "mcp__bugzilla__search_bugs",
+    "mcp__bugzilla__get_bugs",  # prompts.BUG_ANALYSIS_STEP names this tool verbatim
+    "mcp__bugzilla__get_bug_comments",
+    "mcp__bugzilla__get_bug_attachments",
+    "mcp__bugzilla__download_attachment",
+]
 
-VERIFY_ALLOWED_TOOLS = [
-    "Read",
-    "Bash(git show:*)",
-    "Bash(git log:*)",
-    "Bash(git diff:*)",
-    "Bash(find:*)",
-    "Bash(grep:*)",
-    "WebFetch(domain:firefox-source-docs.mozilla.org)",
-    "WebFetch(domain:searchfox.org)",
+# In-process Firefox build/test MCP tools.
+BUILD_TOOL = "mcp__firefox__build_firefox"
+FIREFOX_TOOLS = [
+    BUILD_TOOL,
+    "mcp__firefox__bootstrap_firefox",
+    "mcp__firefox__evaluate_testcase",
+    "mcp__firefox__evaluate_js_shell",
 ]
 
+# Optional try-server tool, wired only when run_try_push is enabled.
+TRY_PUSH_TOOL = "mcp__firefox__submit_try_push"
+
+# The agent always runs inside an isolated Docker container, so there is no
+# sandbox and tools run without per-command permission prompts (see
+# permission_mode="bypassPermissions" in agent.py). This is just the set of
+# built-in tools the agent is allowed to call alongside the MCP servers.
 ALLOWED_TOOLS = [
-    "Edit(~/.mozbuild)",
-    "Edit(~/.cache/uv)",
-    "Bash(./mach build:*)",
-    "Bash(./mach clobber:*)",
-    "Bash(./mach configure:*)",
-    "Bash(./mach run:*)",
-    "Bash(./mach test:*)",
-    "Bash(./mach wpt:*)",
-    "Bash(./mach lint:*)",
-    "Bash(./mach format:*)",
-    "Bash(./mach clang-format:*)",
-    "Bash(./mach try:*)",
-    "Bash(./mach help:*)",
-    "Bash(./mach vendor:*)",
-    "Bash(./mach bootstrap:*)",
-    "Bash(./mach artifact:*)",
-    "Bash(clang++:*)",
-    "Bash(rm:*)",
-    "Bash(timeout:*)",
-    "Bash(find:*)",
-    "Bash(grep:*)",
-    "Bash(tee:*)",
-    "Bash(kill:*)",
-    "Bash(searchfox-cli:*)",
-    "Bash(treeherder-cli:*)",
-    "Bash(jj:*)",
-    "WebFetch(domain:firefox-source-docs.mozilla.org)",
-    "WebFetch(domain:treeherder.mozilla.org)",
-    "WebFetch(domain:searchfox.org)",
-    "WebFetch(o1069899.ingest.sentry.io)",
+    "Read",
+    "Grep",
+    "Glob",
+    "Bash",
+    "Edit",
+    "Write",
+    "MultiEdit",
+    "WebFetch",
+    "WebSearch",
 ]
 
 ADDITIONAL_DIRS = [
     "~/.mozbuild",
     "~/.cache/uv/",
 ]
-
-SANDBOX_CONFIG = SandboxSettings(
-    enabled=True,
-    autoAllowBashIfSandboxed=True,
-    allowUnsandboxedCommands=False,
-    network=SandboxNetworkConfig(allowLocalBinding=True),
-)
diff --git a/agents/build-repair/hackbot_agents/build_repair/logs.py b/agents/build-repair/hackbot_agents/build_repair/logs.py
new file mode 100644
index 0000000000..e0c22875b0
--- /dev/null
+++ b/agents/build-repair/hackbot_agents/build_repair/logs.py
@@ -0,0 +1,104 @@
+# -*- coding: utf-8 -*-
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this file,
+# You can obtain one at http://mozilla.org/MPL/2.0/.
+
+"""Download and sanitize Taskcluster build-failure logs.
+
+The agent is given a mapping of ``task-name -> Taskcluster task ID``. Before the
+Claude SDK is invoked we fetch each task's ``live_backing.log`` and write two
+files to the scratch dir: the full log and a sanitized companion that keeps only
+the ``ERROR -`` / ``FATAL -`` lines. The agent is told to start from the
+sanitized log (so its context isn't drowned by tens of MB of build output) and
+fall back to the full log for surrounding detail.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import logging
+import re
+from pathlib import Path
+from typing import NamedTuple
+
+import requests
+
+logger = logging.getLogger(__name__)
+
+ARTIFACT_URL = (
+    "https://firefoxci.taskcluster-artifacts.net/"
+    "{task_id}/{run_id}/public/logs/live_backing.log"
+)
+RUN_ID = 0
+_HEADERS = {"User-Agent": "hackbot-build-repair/1.0"}
+_TIMEOUT = 120
+_MAX_LINES = 2000
+
+_ERROR_RE = re.compile(r"(?:ERROR|FATAL) -")
+
+
+class TaskLogs(NamedTuple):
+    """Paths to the two log files written for one failing task."""
+
+    sanitized: Path  # "<name>.errors.txt": only ERROR -/FATAL - lines, or a note
+    full: Path  # "<name>.log": the complete downloaded log, or a failure note
+
+
+def _safe_filename(task_name: str) -> str:  # "build-linux64/opt" -> "build-linux64_opt"
+    return re.sub(r"[^A-Za-z0-9._-]+", "_", task_name).strip("_") or "task"
+
+
+def sanitize_log(text: str) -> str:
+    """Keep only ``ERROR -`` / ``FATAL -`` lines, deduping consecutive repeats and capping size."""
+    kept: list[str] = []
+    previous: str | None = None
+    for line in text.splitlines():
+        if not _ERROR_RE.search(line):
+            continue
+        stripped = line.rstrip()
+        if stripped == previous:
+            continue
+        if len(kept) >= _MAX_LINES:  # cap reached and one more line would be kept
+            kept.append(f"... (truncated at {_MAX_LINES} error lines)")
+            break
+        previous = stripped
+        kept.append(stripped)
+    return "\n".join(kept)
+
+
+def _fetch_and_write(task_name: str, task_id: str, dest_dir: Path) -> TaskLogs:
+    """Download one task's log and write the full + sanitized files as UTF-8."""
+    safe = _safe_filename(task_name)
+    full_path = dest_dir / f"{safe}.log"
+    sanitized_path = dest_dir / f"{safe}.errors.txt"
+    url = ARTIFACT_URL.format(task_id=task_id, run_id=RUN_ID)
+    try:
+        resp = requests.get(url, headers=_HEADERS, timeout=_TIMEOUT)
+        resp.raise_for_status()
+        full_path.write_text(resp.text, encoding="utf-8")
+        errors = sanitize_log(resp.text)
+        placeholder = f"(no ERROR -/FATAL - lines matched in {url})\n"
+        sanitized_path.write_text(errors or placeholder, encoding="utf-8")
+    except requests.exceptions.RequestException as exc:
+        logger.warning("Failed to download log for %s (%s): %s", task_name, url, exc)
+        note = f"(failed to download {url}: {exc})\n"
+        full_path.write_text(note, encoding="utf-8")
+        sanitized_path.write_text(note, encoding="utf-8")
+    return TaskLogs(sanitized=sanitized_path, full=full_path)
+
+
+async def download_failure_logs(
+    failure_tasks: dict[str, str], dest_dir: Path
+) -> dict[str, TaskLogs]:
+    """Fetch every failing task's log concurrently, one worker thread per task.
+
+    Each task gets a full log plus a sanitized companion in ``dest_dir``; the
+    result maps task name to its :class:`TaskLogs` (sanitized + full paths).
+    """
+    tasks = list(failure_tasks.items())
+    fetches = [
+        asyncio.to_thread(_fetch_and_write, name, task_id, dest_dir)
+        for name, task_id in tasks
+    ]
+    results = await asyncio.gather(*fetches)
+    return {name: logs for (name, _), logs in zip(tasks, results)}
diff --git a/agents/build-repair/hackbot_agents/build_repair/prompts.py b/agents/build-repair/hackbot_agents/build_repair/prompts.py
index ee166620c2..7e8516b7e1 100644
--- a/agents/build-repair/hackbot_agents/build_repair/prompts.py
+++ b/agents/build-repair/hackbot_agents/build_repair/prompts.py
@@ -3,75 +3,57 @@
 # License, v. 2.0. If a copy of the MPL was not distributed with this file,
 # You can obtain one at http://mozilla.org/MPL/2.0/.
 
-"""Prompt templates for build repair agent."""
+"""Prompt templates for the build-repair agent."""
 
 ANALYSIS_TEMPLATE = """You are an expert {target_software} engineer tasked with analyzing and fixing a build failure.
 
-Investigate why the last commit broke {target_software} build.
-
-The last commit attempted to fix a bug from Bugzilla.
-
+Investigate why commit {git_commit} broke the {target_software} build. The source tree
+is already checked out at that commit (your working directory).
+{bug_context}
 Analyze the following:
-1. Git diff for the last commit
-2. Bugzilla bug description
-3. Taskcluster build failure logs
-The files with bug description and logs are located at {worktree_path}/repair_agent/in/{bug_id}
+1. The git diff of commit {git_commit} (use `git show {git_commit}`).
+{bug_step}{logs_num}. The Taskcluster build failure logs. Each failing task has a sanitized log (only the ERROR -/FATAL - lines) and the full log. Start from the sanitized log -- it usually pinpoints the failing file and line. The full log can be tens of thousands of lines, so grep it for that file/line rather than reading it sequentially:
+{failure_logs}
 
 Create three separate documents:
-1. {worktree_path}/repair_agent/out/{bug_id}/analysis.md with your detailed analysis on what caused the issues
-2. {worktree_path}/repair_agent/out/{bug_id}/planning.md with a fixing plan
-3. {worktree_path}/repair_agent/out/{bug_id}/summary.md with a brief one paragraph summary of analysis and planning that can point a developer in the right direction
-
-Do not prompt to edit those documents.
-{eval}
+1. {scratch_out}/analysis.md with your detailed analysis of what caused the failure
+2. {scratch_out}/planning.md with a fixing plan
+3. {scratch_out}/summary.md with a brief one-paragraph summary of the analysis and plan
+   that can point a developer in the right direction
 
-Do not write any code yet. Work fully autonomously, do not ask any questions.
+Do not prompt to edit those documents. Do not write any code yet. Work fully
+autonomously and do not ask any questions.
 """
 
-FIX_TEMPLATE = """You are an expert {target_software} engineer tasked with analyzing and fixing a build failure.
+BUG_CONTEXT = "\nThe commit attempted to fix Bugzilla bug {bug_id}.\n"
 
-Read the following files and implement a fix of the failure:
-1. {worktree_path}/repair_agent/out/{bug_id}/analysis.md with your detailed analysis on what caused the issues
-2. {worktree_path}/repair_agent/out/{bug_id}/planning.md with a fixing plan
-{eval}
+BUG_ANALYSIS_STEP = (  # presumably fills {bug_step} in ANALYSIS_TEMPLATE -- confirm
+    "2. The Bugzilla bug: fetch bug {bug_id}'s description and comments with the "
+    "`mcp__bugzilla__get_bugs` tool (ids=[{bug_id}], include_comments=true). If "
+    "it returns an error, note it and continue with the diff and logs.\n"
+)
 
-Do not prompt to edit files. Work fully autonomously, do not ask any questions. Use all allowed tools without prompting.
-"""
-
-EVAL_PROMPT = """
-Do not request bug info from Bugzilla or Phabricator. Use only the provided file with bug description.
-Do not look at git commits other than the specified last commit.
-"""
+FIX_TEMPLATE = """You are an expert {target_software} engineer tasked with fixing a build failure.
 
-VERIFY_TEMPLATE = """You are an expert {target_software} code reviewer evaluating an automated build repair agent's work.
+Read your earlier analysis and implement the fix directly in the source tree:
+1. {scratch_out}/analysis.md -- your analysis of what caused the failure
+2. {scratch_out}/planning.md -- your fixing plan
 
-Examine the relevant commits using git:
-- Failure commit (broke the build): {failure_commit}
-- Ground truth fix commit(s) (the real fix that was landed): {ground_truth_commits}
+Edit the source files in the working directory to repair the build. A mozconfig
+that mirrors the failing CI configuration (release milestone, warnings-as-errors)
+is already set up. Verify the fix compiles with the build_firefox tool, passing
+the directory of the file you changed as `target` (e.g. 'docshell/base') for a
+fast, focused build -- prefer this over a full tree build. If the build reports a
+missing toolchain (e.g. rustc or clang), run the bootstrap_firefox tool once and
+then build again. Verify via the build_firefox tool rather than a raw `./mach
+build` so the build result is recorded.
+{try_push}
 
-Inspect each commit's changes and read the repair agent's input/output files:
-- {worktree_path}/repair_agent/in/{bug_id}/bug_description.md
-- {worktree_path}/repair_agent/in/{bug_id}/build_failure_logs.md
-- {worktree_path}/repair_agent/out/{bug_id}/analysis.md
-- {worktree_path}/repair_agent/out/{bug_id}/summary.md
-- {worktree_path}/repair_agent/out/{bug_id}/agent_fix.diff (may be empty if no fix was produced)
-
-Evaluate the agent's work on two dimensions:
-
-ANALYSIS:
-- Did the agent correctly identify the root cause of the build failure?
-- How thorough and accurate is the analysis?
-
-FIX:
-- Does the agent's fix address the same files/functions as the ground truth?
-- Is the fix semantically equivalent or close to the ground truth?
-- Would the fix be acceptable in code review as-is?
-
-Guidelines:
-- If agent_fix.diff is empty, set fix_matches_ground_truth=false, fix_quality=0.0, fix_acceptance_probability=0.0
-- A fix can be correct even if it differs syntactically from ground truth -- focus on semantic equivalence
-- analysis_correct should be true if the agent found the right root cause, even if the explanation is imperfect
-- Be calibrated: 0.5 means genuinely uncertain, not a default score
+Do not prompt to edit files. Work fully autonomously, do not ask any questions.
+Use all allowed tools without prompting.
+"""
 
-Work autonomously, do not ask questions.
+TRY_PUSH_INSTRUCTIONS = """
+Once the fix builds locally, validate it on CI: call the submit_try_push tool with the
+failing task name ('{task_name}') to push to the try server and report the build result.
 """
diff --git a/agents/build-repair/hackbot_agents/build_repair/try_push.py b/agents/build-repair/hackbot_agents/build_repair/try_push.py
index c71ad0c854..b5d6729a27 100644
--- a/agents/build-repair/hackbot_agents/build_repair/try_push.py
+++ b/agents/build-repair/hackbot_agents/build_repair/try_push.py
@@ -1,137 +1,67 @@
-# -*- coding: utf-8 -*-
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this file,
-# You can obtain one at http://mozilla.org/MPL/2.0/.
+"""Optional Firefox try-server push tool.
 
+Submits the current source checkout to the Firefox try server via ``./mach try``
+and, optionally, polls Treeherder for the build result. Exposed as a separate
+``TRY_TOOLS`` list (not the default firefox ``TOOLS``) so an agent only gains the
+capability when it explicitly wires it in — a try push is an outward-facing
+action that not every run should perform.
+"""
+
+from __future__ import annotations
+
+import asyncio
 import logging
-import os
 import re
 import subprocess
 import time
-from dataclasses import dataclass
-from logging import getLogger
 from pathlib import Path
+from typing import Annotated, Any
 
 import requests
+from agent_tools.registry import ToolError, tool, tools_in
+from pydantic import Field
 
-from bugbug.tools.build_repair.config import (
-    TREEHERDER_BASE_URL,
-    TRY_PUSH_POLL_INTERVAL_SECONDS,
-    TRY_PUSH_TIMEOUT_SECONDS,
-)
-
-logger = getLogger(__name__)
+logger = logging.getLogger(__name__)
 
-_HEADERS = {"User-Agent": "bugbug-build-repair-eval/1.0"}
+TREEHERDER_BASE_URL = "https://treeherder.mozilla.org"
+_HEADERS = {"User-Agent": "hackbot-build-repair/1.0"}
 _LANDO_JOB_ID_RE = re.compile(r"landoCommitID=([A-Za-z0-9_-]+)")
 
 
-def _mach_env(worktree_path: Path) -> dict[str, str]:
-    env = os.environ.copy()
-    env["MOZBUILD_STATE_PATH"] = str(worktree_path / ".mozbuild")
-    return env
-
-
-@dataclass
-class TryPushResult:
-    """Result of local build verification and optional try push submission."""
-
-    local_build_passed: bool
-    try_build_passed: bool | None
-    lando_job_id: str | None
-    treeherder_url: str | None
-
-
-def _commit_fix(worktree_path: Path, bug_id: int) -> None:
-    logger.info("Committing fix for bug %s in %s", bug_id, worktree_path)
-    subprocess.run(
-        ["git", "add", "-A"],
-        cwd=worktree_path,
-        check=True,
-    )
+def _commit_all(source_dir: Path) -> None:
+    """Commit the working tree so ``./mach try`` has a commit to push."""
+    subprocess.run(["git", "add", "-A"], cwd=source_dir, check=True)  # stage everything
     subprocess.run(
         [
             "git",
             "-c",
-            "user.name=bugbug",
+            "user.name=hackbot",  # identity is supplied per command via -c
             "-c",
-            "user.email=bugbug@mozilla.com",
+            "user.email=hackbot@mozilla.com",
             "commit",
+            "--allow-empty",  # still produce a commit when nothing was changed
             "-m",
-            f"Build repair fix for bug {bug_id}",
+            "Build repair candidate fix",
         ],
-        cwd=worktree_path,
+        cwd=source_dir,
         check=True,
     )
-    logger.info("Bug %s: fix committed", bug_id)
 
 
-def _run_subprocess(
-    cmd: list[str], worktree_path: Path, capture: bool
-) -> subprocess.CompletedProcess[str]:
-    if capture:
-        return subprocess.run(
-            cmd,
-            cwd=worktree_path,
-            env=_mach_env(worktree_path),
-            capture_output=True,
-            text=True,
-        )
-    return subprocess.run(
-        cmd,
-        cwd=worktree_path,
-        env=_mach_env(worktree_path),
-        text=True,
-    )
-
-
-def _run_local_build(worktree_path: Path) -> bool:
-    capture = not logger.isEnabledFor(logging.DEBUG)
-
-    logger.info("Running bootstrap in %s", worktree_path)
-    result = _run_subprocess(
-        ["./mach", "--no-interactive", "bootstrap"], worktree_path, capture
-    )
-    if result.returncode != 0:
-        if capture and result.stderr:
-            logger.warning("Bootstrap stderr:\n%s", result.stderr[-2000:])
-        raise RuntimeError(
-            f"Local bootstrap failed with return code {result.returncode}"
-        )
-
-    logger.info("Running local build in %s", worktree_path)
-    result = _run_subprocess(["./mach", "build"], worktree_path, capture)
-    passed = result.returncode == 0
-    status = "passed" if passed else "failed"
-    logger.info("Local build %s (returncode=%s)", status, result.returncode)
-    if not passed and capture and result.stderr:
-        logger.warning("Build stderr:\n%s", result.stderr[-2000:])
-    return passed
-
-
-def _submit_try(worktree_path: Path, task_name: str) -> tuple[str | None, str | None]:
-    logger.info("Submitting try push for task=%s in %s", task_name, worktree_path)
+def _submit_try(source_dir: Path, task_name: str) -> tuple[str | None, str | None]:
     result = subprocess.run(
         ["./mach", "try", "fuzzy", "--query", task_name],
-        cwd=worktree_path,
+        cwd=source_dir,  # "./mach" is looked up inside the checkout
         capture_output=True,
         text=True,
-        env=_mach_env(worktree_path),
     )
     stdout = result.stdout + result.stderr
-    logger.debug("Try push output: %s", stdout)
     match = _LANDO_JOB_ID_RE.search(stdout)
     if not match:
         logger.warning("Could not parse Lando job ID from try output: %s", stdout)
         return None, None
-
     lando_job_id = match.group(1)
     treeherder_url = f"{TREEHERDER_BASE_URL}/jobs?repo=try&landoCommitID={lando_job_id}"
-    logger.info(
-        "Try push submitted: lando_job_id=%s, treeherder=%s",
-        lando_job_id,
-        treeherder_url,
-    )
     return lando_job_id, treeherder_url
 
 
@@ -152,7 +82,7 @@ def _get_push_revision(lando_job_id: str) -> str | None:
     return None
 
 
-def _get_push_by_revision(revision: str) -> dict | None:
+def _get_push_id(revision: str) -> int | None:
     try:
         resp = requests.get(
             f"{TREEHERDER_BASE_URL}/api/project/try/push/",
@@ -162,7 +92,7 @@ def _get_push_by_revision(revision: str) -> dict | None:
         )
         resp.raise_for_status()
         results = resp.json().get("results", [])
-        return results[0] if results else None
+        return results[0]["id"] if results else None
     except Exception:
         logger.exception("Error fetching push by revision %s", revision)
     return None
@@ -187,111 +117,100 @@ def _get_build_job_result(push_id: int, task_name: str) -> str | None:
     return None
 
 
-def _poll_treeherder(lando_job_id: str, task_name: str) -> bool | None:
-    logger.info(
-        "Polling Treeherder for lando_job_id=%s, task=%s (timeout=%ss, interval=%ss)",
-        lando_job_id,
-        task_name,
-        TRY_PUSH_TIMEOUT_SECONDS,
-        TRY_PUSH_POLL_INTERVAL_SECONDS,
-    )
-    deadline = time.monotonic() + TRY_PUSH_TIMEOUT_SECONDS
+def _poll_treeherder(
+    lando_job_id: str, task_name: str, timeout_seconds: int, interval_seconds: int
+) -> bool | None:
+    deadline = time.monotonic() + timeout_seconds  # wall-clock changes can't skew this
     push_id: int | None = None
-    poll_count = 0
-
     while time.monotonic() < deadline:
-        poll_count += 1
         if push_id is None:
             revision = _get_push_revision(lando_job_id)
             if revision:
-                logger.info(
-                    "Resolved revision=%s for lando_job_id=%s", revision, lando_job_id
-                )
-                push = _get_push_by_revision(revision)
-                if push:
-                    push_id = push["id"]
-                    logger.info(
-                        "Resolved push_id=%s for revision=%s", push_id, revision
-                    )
-
+                push_id = _get_push_id(revision)  # None until Treeherder lists it
         if push_id is not None:
             result = _get_build_job_result(push_id, task_name)
-            logger.debug(
-                "Poll #%s: job result=%s for push_id=%s", poll_count, result, push_id
-            )
             if result == "success":
-                logger.info("Try build succeeded for lando_job_id=%s", lando_job_id)
                 return True
             if result in ("busted", "testfailed", "exception"):
-                logger.info(
-                    "Try build failed (%s) for lando_job_id=%s", result, lando_job_id
-                )
                 return False
-        else:
-            logger.debug(
-                "Poll #%s: push not yet available for lando_job_id=%s",
-                poll_count,
-                lando_job_id,
-            )
-        time.sleep(TRY_PUSH_POLL_INTERVAL_SECONDS)
-
-    logger.warning(
-        "Try push polling timed out after %s polls for lando job %s",
-        poll_count,
-        lando_job_id,
-    )
+        time.sleep(interval_seconds)  # no verdict yet: wait, then poll again
+    logger.warning("Try push polling timed out for lando job %s", lando_job_id)
     return None
 
 
-def run_try_verification(
-    worktree_path: Path,
-    bug_id: int,
+def run_try_push(
+    source_dir: Path,
     task_name: str,
-    skip_try_push: bool = False,
-) -> TryPushResult:
-    logger.info(
-        "Starting try verification for bug %s (task=%s, skip_try_push=%s)",
-        bug_id,
-        task_name,
-        skip_try_push,
-    )
-    _commit_fix(worktree_path, bug_id)
-
-    local_passed = _run_local_build(worktree_path)
-    if not local_passed:
-        logger.warning("Bug %s: local build failed, skipping try push", bug_id)
-        return TryPushResult(
-            local_build_passed=False,
-            try_build_passed=None,
-            lando_job_id=None,
-            treeherder_url=None,
-        )
-
-    if skip_try_push:
-        logger.info(
-            "Bug %s: local build passed, skipping try push as requested", bug_id
+    poll: bool,
+    timeout_seconds: int,
+    interval_seconds: int,
+) -> dict[str, Any]:
+    """Commit the working tree, submit a try push, and optionally poll for the result."""
+    _commit_all(source_dir)  # mach try needs a commit to push
+    lando_job_id, treeherder_url = _submit_try(source_dir, task_name)
+    if not lando_job_id:
+        raise ToolError(
+            "Try push submission failed: no Lando job id in ./mach try output",
+            payload={"error": "try_submit_failed"},
         )
-        return TryPushResult(
-            local_build_passed=True,
-            try_build_passed=None,
-            lando_job_id=None,
-            treeherder_url=None,
+    result: dict[str, Any] = {
+        "submitted": True,
+        "lando_job_id": lando_job_id,
+        "treeherder_url": treeherder_url,
+        "try_build_passed": None,  # stays None unless polling yields a verdict
+    }
+    if poll:
+        result["try_build_passed"] = _poll_treeherder(
+            lando_job_id, task_name, timeout_seconds, interval_seconds
         )
+    return result
+
+
+@tool
+async def submit_try_push(
+    ctx,
+    task_name: Annotated[
+        str,
+        Field(
+            description=(
+                "Treeherder task name to build/select on try, e.g. "
+                "'build-linux64/opt'. The failing task is the natural choice."
+            )
+        ),
+    ],
+    poll: Annotated[
+        bool,
+        Field(
+            description=(
+                "Poll Treeherder until the build job completes (up to timeout) "
+                "and report pass/fail. If false, submit and return immediately."
+            )
+        ),
+    ] = True,
+    timeout_seconds: Annotated[
+        int, Field(description="Max seconds to poll Treeherder (default 7200).")
+    ] = 7200,
+    poll_interval_seconds: Annotated[
+        int, Field(description="Seconds between polls (default 60, minimum 1).")
+    ] = 60,
+) -> dict:
+    """Submit the current Firefox checkout to the try server and check the build.
+
+    Commits the working tree as a candidate fix, runs ``./mach try fuzzy --query
+    <task_name>`` to push it, and (when ``poll`` is true) watches Treeherder for
+    the named build job. Returns JSON: submitted (bool), lando_job_id (str),
+    treeherder_url (str), try_build_passed (bool|null — null when polling was
+    skipped or timed out). Slow: a try build can take well over an hour, so only
+    call this once you are confident the fix builds locally.
+    """
+    return await asyncio.to_thread(
+        run_try_push,
+        ctx.source_dir,
+        task_name,
+        poll,
+        timeout_seconds,
+        max(poll_interval_seconds, 1),  # 0 would spin without pause; negative raises
+    )
 
-    lando_job_id, treeherder_url = _submit_try(worktree_path, task_name)
-    if not lando_job_id:
-        logger.warning("Bug %s: try push submission failed, no lando job ID", bug_id)
-        return TryPushResult(
-            local_build_passed=True,
-            try_build_passed=None,
-            lando_job_id=None,
-            treeherder_url=None,
-        )
 
-    try_passed = _poll_treeherder(lando_job_id, task_name)
-    return TryPushResult(
-        local_build_passed=True,
-        try_build_passed=try_passed,
-        lando_job_id=lando_job_id,
-        treeherder_url=treeherder_url,
-    )
+TRY_TOOLS = tools_in(__name__)
diff --git a/agents/build-repair/pyproject.toml b/agents/build-repair/pyproject.toml
new file mode 100644
index 0000000000..eb91fa3132
--- /dev/null
+++ b/agents/build-repair/pyproject.toml
@@ -0,0 +1,33 @@
+[project]
+name = "hackbot-agent-build-repair"
+version = "0.1.0"
+description = "Cloud Run Job image that runs the build-repair agent for hackbot-api"
+requires-python = ">=3.12"
+dependencies = [
+    "hackbot-runtime[claude-sdk]",
+    "agent-tools[bugzilla,firefox]",
+    "bugsy",
+    "claude-agent-sdk>=0.1.30",
+    "mcp>=1.0.0",
+    "starlette>=0.36.0",
+    "uvicorn>=0.27.0",
+    "requests"
+]
+
+[project.optional-dependencies]
+eval = [
+    "weave",
+    "wandb",
+    "tenacity",
+]
+
+[tool.uv.sources]
+hackbot-runtime = { workspace = true }
+agent-tools = { workspace = true }
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[tool.hatch.build.targets.wheel]
+packages = ["hackbot_agents", "evals/buildrepair_eval"]
diff --git a/docker-compose.yml b/docker-compose.yml
index cc534a3242..eedad036da 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -4,6 +4,7 @@ version: "3.8"
 
 include:
   - path: agents/bug-fix/compose.yml
+  - path: agents/build-repair/compose.yml
 
 services:
   bugbug-base:
diff --git a/pyproject.toml b/pyproject.toml
index 38fd0f8b90..075989ecea 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -128,7 +128,7 @@ include = ["/bugbug", "/scripts", "/VERSION"]
 packages = ["bugbug", "scripts"]
 
 [tool.uv.workspace]
-members = ["http_service", "services/hackbot-api", "agents/bug-fix", "libs/hackbot-runtime", "libs/agent-tools"]
+members = ["http_service", "services/hackbot-api", "agents/bug-fix", "agents/build-repair", "libs/hackbot-runtime", "libs/agent-tools"]
 
 [tool.uv.sources]
 hackbot-runtime = { workspace = true }
diff --git a/services/buildrepair/Dockerfile b/services/buildrepair/Dockerfile
deleted file mode 100644
index 178537d664..0000000000
--- a/services/buildrepair/Dockerfile
+++ /dev/null
@@ -1,34 +0,0 @@
-# Load the base image by running this from the Firefox repo:
-# ./mach taskgraph load-image --task-id aQDejwXUQsSHxvwE2qQcQg
-FROM debian12-amd64-build
-
-WORKDIR /app
-
-RUN apt-get update && \
-    apt-get install -y git nodejs npm build-essential zlib1g-dev \
-    libncurses5-dev libgdbm-dev libnss3-dev libssl-dev libreadline-dev \
-    libffi-dev libsqlite3-dev wget libbz2-dev && \
-    rm -rf /var/lib/apt/lists/*
-
-# bugbug requires Python 3.12 and there's no package for Debian 12
-RUN wget https://www.python.org/ftp/python/3.12.8/Python-3.12.8.tgz && \
-    tar -xf Python-3.12.8.tgz && \
-    cd Python-3.12.8 && \
-    ./configure --enable-optimizations --prefix=/usr/local && \
-    make -j$(nproc) && \
-    make install && \
-    cd .. && rm -rf Python-3.12.8 Python-3.12.8.tgz
-
-RUN python3.12 -m venv /opt/venv
-ENV PATH="/opt/venv/bin:$PATH"
-
-COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv
-
-COPY requirements.txt /app/
-RUN uv pip install -r /app/requirements.txt
-
-COPY . /app
-
-ENV PYTHONPATH=/app
-ENV PYTHONUNBUFFERED=1
-ENV FIREFOX_GIT_REPO=/workspace/firefox
\ No newline at end of file
diff --git a/services/buildrepair/README.md b/services/buildrepair/README.md
deleted file mode 100644
index 3fd6603b0b..0000000000
--- a/services/buildrepair/README.md
+++ /dev/null
@@ -1,64 +0,0 @@
-# Build Repair Agent
-
-It can automatically analyze a build failure in Firefox and propose a fix.
-
-## Evaluation
-
-Weights and Biases Weave [dashboard](https://wandb.ai/moz-bugbug/bugbug-build-repair-eval/weave/evaluations).
-
-To run locally:
-
-1. Clone Firefox to a separate directory
-
-2. Prepare the Docker image
-
-Pull the base Docker image to build Firefox from Taskcluster.
-From the Firefox repo run:
-
-```bash
-./mach taskgraph load-image --task-id aQDejwXUQsSHxvwE2qQcQg
-```
-
-Make sure to have enough resources available for the Docker engine (at least 16gb RAM and 128GB disk, better 256GB).
-
-3. Set environment variables
-
-```bash
-# Full path to the Firefox repo
-export FIREFOX_GIT_REPO=$(pwd)
-export ANTHROPIC_API_KEY=<The key to run Agents SDK>
-export WANDB_API_KEY=<Weights and Biases key for Weave>
-# If on Mac with ARM CPU
-export DOCKER_DEFAULT_PLATFORM=linux/amd64
-```
-
-4. `cd` to this repo
-
-5. (Optional) Prebuild the Docker image and use `image: build-repair-debian-base` in `docker-compose.dev.yml`
-
-```bash
-docker build -t build-repair-debian-base -f services/buildrepair/Dockerfile .
-```
-
-6. Attach to the container by running:
-
-```bash
-docker compose -f services/buildrepair/docker-compose.dev.yml run build-repair
-```
-
-7. Run the evaluation script.
-
-To test:
-
-```bash
-/opt/venv/bin/python scripts/build_repair_eval.py --no-try-push --limit 1
-```
-
-To run full evaluation (with 3 trials):
-
-```bash
-/opt/venv/bin/python scripts/build_repair_eval.py --no-try-push --parallellism 8 --trials 3
-```
-
-It will run each of 85 examples from the evaluation dataset 3 times.
-It will build Firefox each time with the proposed fix, then write results to Weave.
diff --git a/services/buildrepair/docker-compose.dev.yml b/services/buildrepair/docker-compose.dev.yml
deleted file mode 100644
index ddd2d67364..0000000000
--- a/services/buildrepair/docker-compose.dev.yml
+++ /dev/null
@@ -1,18 +0,0 @@
-services:
-  build-repair:
-    # To minimize rebuilding use `DOCKER_DEFAULT_PLATFORM=linux/amd64 docker build -t build-repair-debian-base -f services/buildrepair/Dockerfile .`
-    # and replace the "build" section with:
-    #  image: build-repair-debian-base
-    build:
-      context: ../..
-      dockerfile: services/buildrepair/Dockerfile
-    volumes:
-      - ../../:/app # live code editing
-      - ${FIREFOX_GIT_REPO}:/workspace/firefox # Firefox repo
-      - build-repair-tmp:/tmp/build_repair_worktrees
-    environment:
-      - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY}
-      - WANDB_API_KEY=${WANDB_API_KEY} # for weave
-      - FIREFOX_GIT_REPO=/workspace/firefox
-volumes:
-  build-repair-tmp:
diff --git a/services/buildrepair/pyproject.toml b/services/buildrepair/pyproject.toml
deleted file mode 100644
index 621735a0f9..0000000000
--- a/services/buildrepair/pyproject.toml
+++ /dev/null
@@ -1,12 +0,0 @@
-[project]
-name = "bugbug-build-repair"
-dynamic = ["version"]
-description = "BugBug Build Repair Agent"
-requires-python = ">=3.12"
-dependencies = [
-    "bugbug",
-]
-
-[tool.uv.sources]
-bugbug = { path = "../..", editable = true }
-
diff --git a/services/hackbot-api/app/agents.py b/services/hackbot-api/app/agents.py
index 5802f6f511..e364e31922 100644
--- a/services/hackbot-api/app/agents.py
+++ b/services/hackbot-api/app/agents.py
@@ -4,7 +4,7 @@
 
 from pydantic import BaseModel
 
-from app.schemas import BugFixInputs
+from app.schemas import BugFixInputs, BuildRepairInputs
 
 
 @dataclass(frozen=True)
@@ -48,4 +48,10 @@ def model_to_env(inputs: BaseModel) -> dict[str, str]:
         job_name="hackbot-agent-bug-fix",
         input_schema=BugFixInputs,
     ),
+    "build-repair": AgentSpec(
+        name="build-repair",
+        description="Analyze a Firefox build failure at a specific commit and produce a candidate fix patch.",
+        job_name="hackbot-agent-build-repair",
+        input_schema=BuildRepairInputs,
+    ),
 }
diff --git a/services/hackbot-api/app/schemas.py b/services/hackbot-api/app/schemas.py
index 36ad0f9b17..411d56216c 100644
--- a/services/hackbot-api/app/schemas.py
+++ b/services/hackbot-api/app/schemas.py
@@ -67,3 +67,12 @@ class BugFixInputs(BaseModel):
     model: str | None = None
     max_turns: int | None = None
     effort: str | None = None
+
+
+class BuildRepairInputs(BaseModel):
+    bug_id: int | None = None  # optional; presumably the tracking bug id -- confirm
+    git_commit: str  # commit at which the build failure is analyzed
+    failure_tasks: dict[str, str]  # failing task name -> task id
+    run_try_push: bool = False  # try push is opt-in
+    model: str | None = None  # NOTE(review): None presumably = agent default
+    max_turns: int | None = None  # NOTE(review): None presumably = agent default
diff --git a/services/hackbot-api/tests/test_agents.py b/services/hackbot-api/tests/test_agents.py
index c99c9d4689..6e06f94412 100644
--- a/services/hackbot-api/tests/test_agents.py
+++ b/services/hackbot-api/tests/test_agents.py
@@ -1,7 +1,9 @@
 """Tests for the agent registry and generic env serialization."""
 
+import json
+
 from app.agents import AGENT_REGISTRY, model_to_env
-from app.schemas import BugFixInputs
+from app.schemas import BugFixInputs, BuildRepairInputs
 
 
 def test_model_to_env_uppercases_and_stringifies():
@@ -30,3 +32,22 @@ def test_bug_fix_registry_uses_default_env_serializer():
     # No hand-written build_env: the router falls back to model_to_env.
     assert spec.build_env is None
     assert spec.input_schema is BugFixInputs
+
+
+def test_build_repair_registry_entry():
+    entry = AGENT_REGISTRY["build-repair"]
+    assert entry.job_name == "hackbot-agent-build-repair"
+    assert entry.input_schema is BuildRepairInputs
+    assert entry.build_env is None  # no hand-written build_env for this agent
+
+
+def test_model_to_env_json_encodes_failure_tasks_and_bool():
+    """Dict inputs come out JSON-encoded and bools as their ``str()`` form."""
+    tasks = {"build-linux64/opt": "OyF95j0oQ-CF_YuBM1b7vg"}
+    inputs = BuildRepairInputs(
+        bug_id=1, git_commit="deadbeef", failure_tasks=tasks, run_try_push=True
+    )
+    env = model_to_env(inputs)
+    assert env["RUN_TRY_PUSH"] == "True"
+    assert json.loads(env["FAILURE_TASKS"]) == tasks
+    assert env["GIT_COMMIT"] == "deadbeef"
diff --git a/uv.lock b/uv.lock
index 515eabce8f..b45074881c 100644
--- a/uv.lock
+++ b/uv.lock
@@ -22,6 +22,7 @@ members = [
     "bugbug",
     "bugbug-http-service",
     "hackbot-agent-bug-fix",
+    "hackbot-agent-build-repair",
     "hackbot-api",
     "hackbot-runtime",
 ]
@@ -1739,6 +1740,30 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/22/70/037e1fed8c40b185c8769e137c3be2ab3c19766471462514cd09d0eb022e/fxpoppet-0.4.1-py3-none-any.whl", hash = "sha256:f8d75e5a3b128aa7e78f6a93c2c60443f163694607027809c1acb279c754aaef", size = 49288, upload-time = "2025-11-19T21:58:39.789Z" },
 ]
 
+[[package]]
+name = "gitdb"
+version = "4.0.12"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "smmap" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/72/94/63b0fc47eb32792c7ba1fe1b694daec9a63620db1e313033d18140c2320a/gitdb-4.0.12.tar.gz", hash = "sha256:5ef71f855d191a3326fcfbc0d5da835f26b13fbcba60c32c21091c349ffdb571", size = 394684, upload-time = "2025-01-02T07:20:46.413Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/a0/61/5c78b91c3143ed5c14207f463aecfc8f9dbb5092fb2869baf37c273b2705/gitdb-4.0.12-py3-none-any.whl", hash = "sha256:67073e15955400952c6565cc3e707c554a4eea2e428946f7a4c162fab9bd9bcf", size = 62794, upload-time = "2025-01-02T07:20:43.624Z" },
+]
+
+[[package]]
+name = "gitpython"
+version = "3.1.50"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "gitdb" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/33/f6/354ae6491228b5eb40e10d89c4d13c651fe1cf7556e35ebdded50cff57ce/gitpython-3.1.50.tar.gz", hash = "sha256:80da2d12504d52e1f998772dc5baf6e553f8d2fcfe1fcc226c9d9a2ee3372dcc", size = 219798, upload-time = "2026-05-06T04:01:26.571Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/20/7a/1c6e3562dfd8950adbb11ffbc65d21e7c89d01a6e4f137fa981056de25c5/gitpython-3.1.50-py3-none-any.whl", hash = "sha256:d352abe2908d07355014abdd21ddf798c2a961469239afec4962e9da884858f9", size = 212507, upload-time = "2026-05-06T04:01:23.799Z" },
+]
+
 [[package]]
 name = "google-api-core"
 version = "2.30.3"
@@ -2156,6 +2181,44 @@ requires-dist = [
     { name = "uvicorn", specifier = ">=0.27.0" },
 ]
 
+[[package]]
+name = "hackbot-agent-build-repair"
+version = "0.1.0"
+source = { editable = "agents/build-repair" }
+dependencies = [
+    { name = "agent-tools", extra = ["bugzilla", "firefox"] },
+    { name = "bugsy" },
+    { name = "claude-agent-sdk" },
+    { name = "hackbot-runtime", extra = ["claude-sdk"] },
+    { name = "mcp" },
+    { name = "requests" },
+    { name = "starlette" },
+    { name = "uvicorn" },
+]
+
+[package.optional-dependencies]
+eval = [
+    { name = "tenacity" },
+    { name = "wandb" },
+    { name = "weave" },
+]
+
+[package.metadata]
+requires-dist = [
+    { name = "agent-tools", extras = ["bugzilla", "firefox"], editable = "libs/agent-tools" },
+    { name = "bugsy" },
+    { name = "claude-agent-sdk", specifier = ">=0.1.30" },
+    { name = "hackbot-runtime", extras = ["claude-sdk"], editable = "libs/hackbot-runtime" },
+    { name = "mcp", specifier = ">=1.0.0" },
+    { name = "requests" },
+    { name = "starlette", specifier = ">=0.36.0" },
+    { name = "tenacity", marker = "extra == 'eval'" },
+    { name = "uvicorn", specifier = ">=0.27.0" },
+    { name = "wandb", marker = "extra == 'eval'" },
+    { name = "weave", marker = "extra == 'eval'" },
+]
+provides-extras = ["eval"]
+
 [[package]]
 name = "hackbot-api"
 version = "0.1.0"
@@ -5823,6 +5886,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/14/78/0f68b93564b8c6b6987a0696c582ba2591a381ab2f733a501909e949f241/smart_open-7.6.1-py3-none-any.whl", hash = "sha256:b4de6aebef023aca91cc9fb372052e1343ba3f152de215bd22391a663e3ddd21", size = 64845, upload-time = "2026-05-09T06:23:35.386Z" },
 ]
 
+[[package]]
+name = "smmap"
+version = "5.0.3"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/1f/ea/49c993d6dfdd7338c9b1000a0f36817ed7ec84577ae2e52f890d1a4ff909/smmap-5.0.3.tar.gz", hash = "sha256:4d9debb8b99007ae47165abc08670bd74cb74b5227dda7f643eccc4e9eb5642c", size = 22506, upload-time = "2026-03-09T03:43:26.1Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/c1/d4/59e74daffcb57a07668852eeeb6035af9f32cbfd7a1d2511f17d2fe6a738/smmap-5.0.3-py3-none-any.whl", hash = "sha256:c106e05d5a61449cf6ba9a1e650227ecfb141590d2a98412103ff35d89fc7b2f", size = 24390, upload-time = "2026-03-09T03:43:24.361Z" },
+]
+
 [[package]]
 name = "sniffio"
 version = "1.3.1"
@@ -6492,6 +6564,35 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/66/c3/f8b216cbd742e5b84c40f045204c764ccb7524d2aeab021054ec69446b0a/w3lib-2.4.1-py3-none-any.whl", hash = "sha256:40930132907e68de906a5b89331ab8c8ff4f01bd35b5539ef7896017d814138d", size = 21695, upload-time = "2026-03-20T09:50:26.187Z" },
 ]
 
+[[package]]
+name = "wandb"
+version = "0.27.2"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "click" },
+    { name = "gitpython" },
+    { name = "packaging" },
+    { name = "platformdirs" },
+    { name = "protobuf" },
+    { name = "pydantic" },
+    { name = "pyyaml" },
+    { name = "requests" },
+    { name = "sentry-sdk" },
+    { name = "typing-extensions" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/14/a2/53ca062f430178e3af48ebc137396481d0ee885fb94a554c0df464cd8afa/wandb-0.27.2.tar.gz", hash = "sha256:c81ff93ab63f4dabc5a27b90ac3d12310fbfa6a14ca99201626921c99b2845be", size = 40300451, upload-time = "2026-06-06T01:47:02.74Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/4b/95/18d3625558667b459d91c19630f7cecfbc133f87f5b144a7fb755e473e8c/wandb-0.27.2-py3-none-macosx_12_0_arm64.whl", hash = "sha256:978400b3c4b7d97e927c32264453da5e4a0040a3468d5b77a00d9c480613f370", size = 23990048, upload-time = "2026-06-06T01:46:38.902Z" },
+    { url = "https://files.pythonhosted.org/packages/43/14/72c26f67b0b6cb307cbb76659465c6ab7d99ea27c268d1b4f5aa82c4d8e5/wandb-0.27.2-py3-none-macosx_12_0_x86_64.whl", hash = "sha256:de8099f02f540c743069617db7d034511a64c193748783aa6d2d98310918d170", size = 25165812, upload-time = "2026-06-06T01:46:42.068Z" },
+    { url = "https://files.pythonhosted.org/packages/b5/a3/f9fe31ca72b4f5854d1e488403d6310783127a6b7e267c28577e9bd51b43/wandb-0.27.2-py3-none-manylinux_2_28_aarch64.whl", hash = "sha256:e0779592410215a2762063c3585d3dcad73c7dca9cb6d63c4dcc1588267c1392", size = 24554366, upload-time = "2026-06-06T01:46:44.57Z" },
+    { url = "https://files.pythonhosted.org/packages/76/e2/7a5064aba235ddb855b8c2250e07e6187fcc8382332e237e545d4de094ee/wandb-0.27.2-py3-none-manylinux_2_28_x86_64.whl", hash = "sha256:55bfebf4d382116a8e9610848cadc0de50d406bacd3d0a390d12dabde196f009", size = 26380293, upload-time = "2026-06-06T01:46:47.487Z" },
+    { url = "https://files.pythonhosted.org/packages/ed/4c/0c845edac5ff0fd0930e881bec2569f2e2af2a4fc873249855600546eee0/wandb-0.27.2-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:566aa2fcd67d2a23c08713da75e9daf82f30f7136af76763ef1d7db3d901d940", size = 24728823, upload-time = "2026-06-06T01:46:49.887Z" },
+    { url = "https://files.pythonhosted.org/packages/e1/7a/a6f7a02a0e6bf73e163b61caca03aaba3452836a02dbe2b64f9e1a3c6afc/wandb-0.27.2-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:41158181fc5b691438b3d04fee0a8c061e3f1f407a3258096afbebfe1db24e72", size = 26691957, upload-time = "2026-06-06T01:46:52.384Z" },
+    { url = "https://files.pythonhosted.org/packages/e0/9b/aa94eb8265b0c55dc6c3e435c11241b3f885c7a1720718046efd7cbd8361/wandb-0.27.2-py3-none-win32.whl", hash = "sha256:5c55fad8c7be9d345dcebdc9dc10f7d2ac5af5bede62acbcd79a412ccaf48c87", size = 24151396, upload-time = "2026-06-06T01:46:54.793Z" },
+    { url = "https://files.pythonhosted.org/packages/bf/0b/9e442779f5f24baaca044daf7546a735f41a811886b84ae12740d51b7f9d/wandb-0.27.2-py3-none-win_amd64.whl", hash = "sha256:87204d4fe40fbd9a1fe89a05927ce4ddb8be34d8210045457819fb4a35e0bcea", size = 24151404, upload-time = "2026-06-06T01:46:56.994Z" },
+    { url = "https://files.pythonhosted.org/packages/aa/3a/01ab3afc1f6f962df93db34be8183147c9821e4dce82dc03313fb8d08635/wandb-0.27.2-py3-none-win_arm64.whl", hash = "sha256:32ed7456f40443c971e95dd63704d840fce66c24f88049a9bda8a09dfe85effe", size = 22063373, upload-time = "2026-06-06T01:47:00.263Z" },
+]
+
 [[package]]
 name = "wasabi"
 version = "1.1.3"

From 3c626dd92b39a216ff213d5c283cb24b32651824 Mon Sep 17 00:00:00 2001
From: Evgeny Pavlov <epavlov@mozilla.com>
Date: Wed, 24 Jun 2026 17:52:34 -0700
Subject: [PATCH 08/15] Move eval.py to evals/ via rename

Preserves file history; the prior move staged it as add+delete.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 agents/build-repair/evals/{buildrepair_eval => }/eval.py | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename agents/build-repair/evals/{buildrepair_eval => }/eval.py (100%)

diff --git a/agents/build-repair/evals/buildrepair_eval/eval.py b/agents/build-repair/evals/eval.py
similarity index 100%
rename from agents/build-repair/evals/buildrepair_eval/eval.py
rename to agents/build-repair/evals/eval.py

From d68aa6c6b8a512094c5c643cb28152476f2a60ee Mon Sep 17 00:00:00 2001
From: Evgeny Pavlov <epavlov@mozilla.com>
Date: Wed, 24 Jun 2026 17:52:42 -0700
Subject: [PATCH 09/15] Rewrite eval.py for Hackbot migration

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 agents/build-repair/evals/eval.py | 388 ++++++++----------------------
 1 file changed, 104 insertions(+), 284 deletions(-)

diff --git a/agents/build-repair/evals/eval.py b/agents/build-repair/evals/eval.py
index 0dff16178e..679422c69d 100644
--- a/agents/build-repair/evals/eval.py
+++ b/agents/build-repair/evals/eval.py
@@ -3,228 +3,86 @@
 # License, v. 2.0. If a copy of the MPL was not distributed with this file,
 # You can obtain one at http://mozilla.org/MPL/2.0/.
 
-"""Standalone CLI for build repair evaluation.
+"""Build-repair evaluation harness.
+
+Runs the ported hackbot build-repair agent (``run_build_repair``) over a Weave
+dataset of Firefox build failures, then scores its output: deterministic build
+verification plus an LLM-as-a-judge comparison to the landed fix.
 
 Usage:
-    python scripts/build_repair_eval.py
-    python scripts/build_repair_eval.py --analysis-only
-    python scripts/build_repair_eval.py --trials 3
-    python scripts/build_repair_eval.py --limit 5
-    python scripts/build_repair_eval.py --parallelism 4
-    python scripts/build_repair_eval.py --no-try-push
-    python scripts/build_repair_eval.py --verbose
+    python -m evals.eval --no-try-push --limit 1
+    python -m evals.eval --trials 3 --parallelism 8
 """
 
+from __future__ import annotations
+
 import argparse
 import asyncio
-import json
 import logging
 import os
+import subprocess
+import tempfile
 import uuid
-from datetime import datetime
 from functools import cached_property
-from typing import Any
+from pathlib import Path
 
+import bugsy
 import weave
-
-from bugbug.tools.build_repair.agent import (
-    AgentResponse,
-    BuildFailure,
-    BuildRepairTool,
-    GroundTruth,
-)
-from bugbug.tools.build_repair.config import MODEL_CUTOFF_DATES
-from bugbug.tools.build_repair.scorer import (
+from agent_tools import bugzilla
+from agent_tools.bugzilla import BugzillaContext
+from agent_tools.claude_sdk import build_sdk_server
+from agent_tools.firefox import FirefoxContext
+from agent_tools.firefox.tools.build_firefox import build_firefox
+from hackbot_agents.build_repair.agent import run_build_repair
+from hackbot_agents.build_repair.config import ANALYSIS_MODEL, FIX_MODEL
+
+from .scorer import (
     BasicMetricsScorer,
     BuildPassRateScorer,
     LLMFixMatchingScorer,
 )
-from bugbug.tools.build_repair.worktree import WorktreeManager
+from .verify import VERIFY_MODEL, GroundTruth, is_data_contaminated, run_verify
+from .worktree import WorktreeManager
 
 logger = logging.getLogger(__name__)
 
-# TODO: replace with native tracing for Anthropic Agents SDK when released by W&B
 
-
-def _attr(obj, key, default=None):
-    if isinstance(obj, dict):
-        return obj.get(key, default)
-    return getattr(obj, key, default)
+def _collect_diff(worktree_path: Path, base_commit: str) -> str:
+    """Stage all worktree changes and return the staged diff against base_commit.
+
+    Raises subprocess.CalledProcessError if git fails, so a git error is never
+    mistaken for an empty diff (i.e. "the agent changed nothing")."""
+    git = ["git", "-C", str(worktree_path)]
+    subprocess.run([*git, "add", "-A"], capture_output=True, check=True)
+    cmd = [*git, "diff", "--staged", base_commit]
+    return subprocess.run(cmd, capture_output=True, text=True, check=True).stdout
 
 
-def _to_chat_message(data: dict) -> dict | None:
-    """Convert a serialized claude_agent_sdk message to OpenAI chat format.
+def _bugzilla_server():
+    """Bugzilla MCP server for the agent.
 
-    Content blocks may be dicts (from model_dump) or dataclass instances
-    (from vars), so we use _attr() for uniform access.
+    Prefer the broker (``BUGZILLA_MCP_URL``) so the eval container holds no
+    Bugzilla credentials -- same isolation as production. Falls back to an
+    in-process server for local runs without a broker.
     """
-    msg_type = data.get("type", "")
-
-    if msg_type == "AssistantMessage":
-        blocks = data.get("content", [])
-        text_parts = []
-        tool_calls = []
-        for block in blocks:
-            text = _attr(block, "text")
-            if text is not None:
-                text_parts.append(text)
-                continue
-            name = _attr(block, "name")
-            block_id = _attr(block, "id")
-            if name is not None and block_id is not None:
-                tool_calls.append(
-                    {
-                        "id": block_id,
-                        "type": "function",
-                        "function": {
-                            "name": name,
-                            "arguments": json.dumps(_attr(block, "input", {})),
-                        },
-                    }
-                )
-        if not text_parts and not tool_calls:
-            return None
-        msg: dict = {"role": "assistant"}
-        if text_parts:
-            msg["content"] = "\n".join(text_parts)
-        if tool_calls:
-            msg["tool_calls"] = tool_calls
-        return msg
-
-    if msg_type == "UserMessage":
-        content = data.get("content", "")
-        if isinstance(content, list):
-            for block in content:
-                tool_use_id = _attr(block, "tool_use_id")
-                if tool_use_id:
-                    block_content = _attr(block, "content", "")
-                    return {
-                        "role": "tool",
-                        "tool_call_id": tool_use_id,
-                        "content": str(block_content) if block_content else "",
-                    }
-
-    return None
-
-
-@weave.op(kind="llm")
-def trace_llm_stage(
-    stage: str,
-    messages: list[dict],
-    model: str,
-    result_data: dict | None = None,
-) -> dict:
-    last_assistant = ""
-    for msg in reversed(messages):
-        if msg.get("role") == "assistant" and msg.get("content"):
-            last_assistant = msg["content"]
-            break
-
-    result: dict[str, Any] = {
-        "model": model,
-        "choices": [
-            {
-                "message": {"role": "assistant", "content": last_assistant},
-            }
-        ],
-    }
-    if result_data:
-        raw_usage = result_data.get("usage", {}) or {}
-        input_tokens = raw_usage.get("input_tokens", 0)
-        output_tokens = raw_usage.get("output_tokens", 0)
-        result["usage"] = {
-            "prompt_tokens": input_tokens,
-            "completion_tokens": output_tokens,
-            "total_tokens": input_tokens + output_tokens,
-            "cache_read_input_tokens": raw_usage.get("cache_read_input_tokens", 0),
-            "cache_creation_input_tokens": raw_usage.get(
-                "cache_creation_input_tokens", 0
-            ),
-            "total_cost_usd": result_data.get("total_cost_usd", 0),
-            "num_turns": result_data.get("num_turns", 0),
-        }
-    return result
-
-
-# Per-token costs in USD (standard, non-cached rates).
-# Weave uses these for its built-in cost UI; the SDK's total_cost_usd
-# (which accounts for cache pricing) is tracked separately as the authoritative cost.
-ANTHROPIC_TOKEN_COSTS: dict[str, tuple[float, float]] = {
-    "claude-opus-4-6": (15.0e-6, 75.0e-6),
-    "claude-sonnet-4-6": (3.0e-6, 15.0e-6),
-    "claude-haiku-4-5-20251001": (0.8e-6, 4.0e-6),
-    "claude-sonnet-4-5-20250929": (3.0e-6, 15.0e-6),
-    "claude-opus-4-5-20251101": (15.0e-6, 75.0e-6),
-    "claude-opus-4-1-20250805": (15.0e-6, 75.0e-6),
-    "claude-sonnet-4-20250514": (3.0e-6, 15.0e-6),
-    "claude-3-7-sonnet-20250219": (3.0e-6, 15.0e-6),
-    "claude-opus-4-20250514": (15.0e-6, 75.0e-6),
-}
-
-
-def _register_model_costs(client) -> None:
-    for model_id, (prompt_cost, completion_cost) in ANTHROPIC_TOKEN_COSTS.items():
-        try:
-            client.add_cost(
-                llm_id=model_id,
-                prompt_token_cost=prompt_cost,
-                completion_token_cost=completion_cost,
-            )
-        except Exception as e:
-            logger.debug("Could not register cost for %s: %s", model_id, e)
-
-
-def _make_weave_callback():
-    stages: dict[str, dict] = {}
-
-    def on_message(stage: str, data: dict) -> None:
-        msg_type = data["type"]
-        if msg_type == "stage_start":
-            messages = []
-            if "system_prompt" in data:
-                messages.append({"role": "system", "content": data["system_prompt"]})
-            messages.append({"role": "user", "content": data["prompt"]})
-
-            stages[stage] = {
-                "model": data["model"],
-                "messages": messages,
-            }
-        elif msg_type == "stage_end":
-            if stage in stages:
-                s = stages.pop(stage)
-                trace_llm_stage(
-                    stage=stage,
-                    messages=s["messages"],
-                    model=s["model"],
-                    result_data=data.get("result_data") or None,
-                )
-        else:
-            if stage in stages:
-                chat_msg = _to_chat_message(data)
-                if chat_msg:
-                    stages[stage]["messages"].append(chat_msg)
-
-    return on_message
-
-
-class BuildRepairError(Exception):
-    """Raised when the agent completes but reports an error."""
-
-    def __init__(self, output: dict):
-        self.output = output
-        super().__init__(output.get("error", "Unknown error"))
+    mcp_url = os.environ.get("BUGZILLA_MCP_URL")
+    if mcp_url:
+        return {"type": "http", "url": mcp_url}
+    client = bugsy.Bugsy(
+        bugzilla_url=os.environ.get(
+            "BUGZILLA_API_URL", "https://bugzilla.mozilla.org/rest"
+        ),
+        api_key=os.environ.get("BUGZILLA_API_KEY"),
+    )
+    return build_sdk_server("bugzilla", BugzillaContext(client=client), bugzilla.TOOLS)
 
 
 class BuildRepairModel(weave.Model):
-    """Weave Model wrapper that creates a worktree per example and runs BuildRepairTool."""
+    """Weave Model: one worktree per example, runs the ported build-repair agent."""
 
     firefox_repo: str
-    analysis_only: bool = False
     no_try_push: bool = False
-
-    @cached_property
-    def tool(self) -> BuildRepairTool:
-        return BuildRepairTool.create(analysis_only=self.analysis_only, eval_mode=True)
+    judge_model: str = VERIFY_MODEL
 
     @cached_property
     def worktree_mgr(self) -> WorktreeManager:
@@ -241,78 +99,64 @@ async def invoke(
         fix_commit_date: str,
         **kwargs,
     ) -> dict:
-        wt_name = f"bug-{bug_id}-{uuid.uuid4().hex[:8]}"
-        logger.info(
-            "Invoking bug %s (commit=%s, %s failures)",
-            bug_id,
-            gh_failure_commits[0][:12],
-            len(failures),
-        )
-
-        worktree_created = False
-        try:
-            cutoff = max(
-                MODEL_CUTOFF_DATES[self.tool.analysis_model],
-                MODEL_CUTOFF_DATES[self.tool.fix_model],
+        if is_data_contaminated(fix_commit_date, ANALYSIS_MODEL, FIX_MODEL):
+            logger.warning(
+                "Skipping bug %s: fix date %s precedes model cutoff",
+                bug_id,
+                fix_commit_date,
             )
-            if datetime.fromisoformat(fix_commit_date).date() < cutoff:
-                logger.warning(
-                    "Skipping bug %s: fix date %s is before model cutoff %s",
-                    bug_id,
-                    fix_commit_date,
-                    cutoff,
-                )
-                raise ValueError("skipped_data_contamination")
-
-            worktree_path = self.worktree_mgr.create(gh_failure_commits[0], wt_name)
-            worktree_created = True
+            raise ValueError("skipped_data_contamination")
 
-            on_message = _make_weave_callback()
-            failure = BuildFailure(
+        failure_commit = gh_failure_commits[0]
+        wt_name = f"bug-{bug_id}-{uuid.uuid4().hex[:8]}"
+        worktree_path = self.worktree_mgr.create(failure_commit, wt_name)
+        try:
+            fx_ctx = FirefoxContext.from_source_repo(worktree_path)
+            result = await run_build_repair(
+                bugzilla_mcp_server=_bugzilla_server(),
+                source_repo=worktree_path,
+                fx_ctx=fx_ctx,
                 bug_id=bug_id,
-                bug_title=pre_fix_bug["title"],
-                bug_comments=pre_fix_bug["comments"],
-                git_commit=gh_failure_commits[0],
-                failure_tasks=failures,
-            )
-            result: AgentResponse = await self.tool.run(
-                failure,
-                worktree_path=worktree_path,
-                skip_try_push=self.no_try_push,
-                on_message=on_message,
-            )
-            logger.info(
-                "Bug %s completed: error=%s, diff_len=%s, cost=$%.4f, turns=%s, "
-                "local_build=%s, try_build=%s",
-                bug_id,
-                result.error,
-                len(result.diff),
-                result.cost_usd,
-                result.num_turns,
-                result.local_build_passed,
-                result.try_build_passed,
+                git_commit=failure_commit,
+                failure_tasks={f["task_name"]: f["task_id"] for f in failures},
+                run_try_push=not self.no_try_push,
             )
 
-            output = result.model_dump()
+            diff = _collect_diff(worktree_path, failure_commit)
+            output: dict = {
+                "error": None,
+                "diff": diff,
+                "cost_usd": result.total_cost_usd or 0.0,
+                "num_turns": result.num_turns,
+                "local_build_passed": None,
+                "try_build_passed": result.try_build_passed,
+            }
 
-            if result.analysis or result.summary:
-                ground_truth = GroundTruth(gh_fix_commits=gh_fix_commits)
-                verify_result = await self.tool.verify(
-                    failure,
-                    result.diff,
-                    ground_truth,
-                    worktree_path,
-                    on_message,
+            if diff.strip():
+                build_result = await build_firefox(
+                    worktree_path, fx_ctx.mozconfig, fx_ctx.objdir
                 )
-                output["verify"] = verify_result.model_dump()
+                output["local_build_passed"] = build_result["success"]
 
-            if result.error:
-                raise BuildRepairError(output)
+            scratch_out = Path(tempfile.mkdtemp(prefix=f"verify-{bug_id}-"))
+            (scratch_out / "analysis.md").write_text(result.analysis)
+            (scratch_out / "summary.md").write_text(result.summary)
+            judgment, judge_cost = await run_verify(
+                worktree_path=worktree_path,
+                scratch_out=scratch_out,
+                bug_id=bug_id,
+                failure_commit=failure_commit,
+                ground_truth=GroundTruth(gh_fix_commits=gh_fix_commits),
+                agent_diff=diff,
+                model=self.judge_model,
+            )
+            output["verify"] = {
+                "judgment": judgment.model_dump(),
+                "cost_usd": judge_cost,
+            }
             return output
         finally:
-            if worktree_created:
-                logger.info("Bug %s: cleaning up worktree %s", bug_id, wt_name)
-                self.worktree_mgr.cleanup(wt_name)
+            self.worktree_mgr.cleanup(wt_name)
 
 
 def main() -> None:
@@ -322,7 +166,7 @@ def main() -> None:
     parser.add_argument("--parallelism", type=int, default=8)
     parser.add_argument("--firefox-repo", default=os.environ.get("FIREFOX_GIT_REPO"))
     parser.add_argument("--dataset", default="build_repair_one_commit_eval")
-    parser.add_argument("--analysis-only", action="store_true")
+    parser.add_argument("--judge-model", default=VERIFY_MODEL)
     parser.add_argument("--no-try-push", action="store_true")
     parser.add_argument("--verbose", action="store_true", help="Enable DEBUG logging")
     args = parser.parse_args()
@@ -330,52 +174,28 @@ def main() -> None:
     if not args.firefox_repo:
         parser.error("--firefox-repo or FIREFOX_GIT_REPO env var is required")
 
-    log_level = logging.DEBUG if args.verbose else logging.INFO
     logging.basicConfig(
-        level=log_level,
+        level=logging.DEBUG if args.verbose else logging.INFO,
         format="%(asctime)s %(levelname)s %(name)s: %(message)s",
     )
-    if not args.verbose:
-        logging.getLogger("httpx").setLevel(logging.WARNING)
-        logging.getLogger("httpcore").setLevel(logging.WARNING)
-        logging.getLogger("hgitaly").setLevel(logging.WARNING)
-        logging.getLogger("urllib3").setLevel(logging.WARNING)
-
-    logger.info(
-        "Starting evaluation: dataset=%s, limit=%s, trials=%s, parallelism=%s, "
-        "analysis_only=%s, no_try_push=%s, firefox_repo=%s",
-        args.dataset,
-        args.limit,
-        args.trials,
-        args.parallelism,
-        args.analysis_only,
-        args.no_try_push,
-        args.firefox_repo,
-    )
 
     os.environ["WEAVE_PARALLELISM"] = str(args.parallelism)
-    os.environ["WEAVE_LOG_LEVEL"] = "INFO" if args.verbose else "WARNING"
-    client = weave.init("bugbug-build-repair-eval")
-    _register_model_costs(client)
+    weave.init("bugbug-build-repair-eval")
 
     dataset = weave.ref(args.dataset).get()
-    logger.info("Loaded dataset %s with %s rows", args.dataset, len(dataset.rows))
     if args.limit:
         dataset.rows = dataset.rows[: args.limit]
-        logger.info("Limited to %s rows", len(dataset.rows))
+    logger.info("Loaded dataset %s (%s rows)", args.dataset, len(dataset.rows))
 
     scorers = [
         BasicMetricsScorer(num_trials=args.trials),
+        BuildPassRateScorer(num_trials=args.trials),
         LLMFixMatchingScorer(num_trials=args.trials),
     ]
-    if not args.analysis_only:
-        scorers.insert(1, BuildPassRateScorer(num_trials=args.trials))
-    logger.info("Scorers: %s", [type(s).__name__ for s in scorers])
-
     model = BuildRepairModel(
         firefox_repo=args.firefox_repo,
-        analysis_only=args.analysis_only,
         no_try_push=args.no_try_push,
+        judge_model=args.judge_model,
     )
     evaluation = weave.Evaluation(
         name="build-repair",

From f92c51667798a4bcfca7b8155083a414ea238420 Mon Sep 17 00:00:00 2001
From: Evgeny Pavlov <epavlov@mozilla.com>
Date: Wed, 24 Jun 2026 17:53:52 -0700
Subject: [PATCH 10/15] Migrate build repair evals to Hackbot

---
 agents/build-repair/Dockerfile                |  30 ++++
 agents/build-repair/README.md                 |  52 ++++++
 agents/build-repair/compose.yml               |  34 +++-
 .../build-repair/evals}/__init__.py           |   0
 .../build-repair/evals}/scorer.py             |  18 --
 agents/build-repair/evals/verify.py           | 162 ++++++++++++++++++
 .../build-repair/evals}/worktree.py           |  11 +-
 agents/build-repair/pyproject.toml            |   2 +-
 docker-compose.yml                            |   1 +
 9 files changed, 286 insertions(+), 24 deletions(-)
 create mode 100644 agents/build-repair/README.md
 rename {bugbug/tools/build_repair => agents/build-repair/evals}/__init__.py (100%)
 rename {bugbug/tools/build_repair => agents/build-repair/evals}/scorer.py (87%)
 create mode 100644 agents/build-repair/evals/verify.py
 rename {bugbug/tools/build_repair => agents/build-repair/evals}/worktree.py (88%)

diff --git a/agents/build-repair/Dockerfile b/agents/build-repair/Dockerfile
index fa3872a15e..a6de686d05 100644
--- a/agents/build-repair/Dockerfile
+++ b/agents/build-repair/Dockerfile
@@ -53,3 +53,33 @@ USER broker
 EXPOSE 8765
 
 CMD ["python", "-m", "hackbot_agents.build_repair.broker"]
+
+# Evaluation image: the production agent image plus the eval-only Python deps
+# (weave, wandb, tenacity). Deriving FROM agent means the harness runs the agent
+# in the identical production runtime (same user, HOME, PATH, mach toolchain). The
+# prod `agent` target never pulls these deps, since this stage builds only when
+# targeted.
+FROM agent AS eval
+
+USER root
+COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
+
+# Add the `eval` extra into the existing venv. `--no-install-workspace` avoids
+# rebuilding the already-installed agent package; `--inexact` keeps it (and every
+# other prod package) rather than pruning it as extraneous.
+RUN --mount=type=cache,target=/root/.cache/uv \
+    --mount=type=bind,source=pyproject.toml,target=pyproject.toml \
+    --mount=type=bind,source=uv.lock,target=uv.lock \
+    --mount=type=bind,source=VERSION,target=VERSION \
+    UV_PROJECT_ENVIRONMENT=/opt/venv \
+    uv sync --frozen --no-dev --no-install-workspace --inexact --extra eval --package hackbot-agent-build-repair
+
+# The harness creates worktrees and runs `./mach build` against a bind-mounted
+# Firefox checkout owned by a different uid; allow git to operate on it.
+RUN git config --system --add safe.directory '*'
+
+USER agent
+ENV FIREFOX_GIT_REPO=/firefox
+
+ENTRYPOINT ["python", "-m", "evals.eval"]
+CMD ["--no-try-push", "--limit", "1"]
diff --git a/agents/build-repair/README.md b/agents/build-repair/README.md
new file mode 100644
index 0000000000..d0546cfc1d
--- /dev/null
+++ b/agents/build-repair/README.md
@@ -0,0 +1,52 @@
+# Build Repair Agent
+
+Two-stage Claude agent that diagnoses a Firefox build failure and edits the source
+tree to fix it. Agent logic in `hackbot_agents/build_repair/`; the Weave eval
+harness in `evals/`.
+
+Run the Docker commands below from this folder, with secrets in a local `.env`
+(`ANTHROPIC_API_KEY`, `BUGZILLA_API_KEY`, plus `WANDB_API_KEY` for evals).
+
+## Test the agent
+
+```sh
+BUG_ID=1987675 GIT_COMMIT=5477e3882d4e18f93de9f56b31e90533fd23b0d1 \
+FAILURE_TASKS='{"build-linux":"XyU4b_BIRdO_IeK6z_kcQg"}' \
+  docker compose up build-repair-agent --build
+```
+
+Artifacts are written to `~/hackbot/artifacts/`.
+
+## Run evals
+
+Each dataset row is a Firefox build failure; per trial the harness runs the agent
+on a git worktree at the failure commit, builds the fix, and LLM-judges it against
+the landed commits. Needs a bootstrapped Firefox checkout.
+
+Local:
+
+```sh
+FIREFOX_GIT_REPO=/path/to/firefox \
+  uv run --package hackbot-agent-build-repair --extra eval \
+  python -m evals.eval --no-try-push --limit 1
+```
+
+Docker (reuses the broker, so no Bugzilla creds in the eval container):
+
+```sh
+FIREFOX_GIT_REPO=/path/to/firefox \
+  docker compose run --rm build-repair-eval --no-try-push --limit 1
+```
+
+Flags: `--trials N`, `--parallelism N`, `--judge-model <id>`, `--dataset <ref>`,
+`--no-try-push`, `--verbose`.
+
+The agent reads the bug live from Bugzilla, so the harness skips examples whose fix
+landed before the production model's training cutoff (`MODEL_CUTOFF_DATES` in
+`evals/verify.py`) to avoid contamination.
+
+## W&B metrics
+
+`weave.init` + `weave.Evaluation` log success and diff rates, local and try build
+pass rates, LLM fix-matching (analysis/fix quality, ground-truth match,
+acceptance), and `total_cost_usd`.
diff --git a/agents/build-repair/compose.yml b/agents/build-repair/compose.yml
index c6e63839a1..ce91cfabe8 100644
--- a/agents/build-repair/compose.yml
+++ b/agents/build-repair/compose.yml
@@ -15,11 +15,15 @@ services:
       context: ../..
       dockerfile: agents/build-repair/Dockerfile
       target: agent
+    # Per-run inputs are not `:?`-required: the eval service shares this file and
+    # Compose interpolates every service regardless of which one is started, so a
+    # required var here would break `run build-repair-eval`. pydantic AgentInputs
+    # still validates them at runtime.
     environment:
       - RUN_ID
-      - BUG_ID=${BUG_ID:?error}
-      - GIT_COMMIT=${GIT_COMMIT:?error}
-      - FAILURE_TASKS=${FAILURE_TASKS:?error}
+      - BUG_ID=${BUG_ID:-}
+      - GIT_COMMIT=${GIT_COMMIT:-}
+      - FAILURE_TASKS=${FAILURE_TASKS:-}
       - RUN_TRY_PUSH=${RUN_TRY_PUSH:-false}
       - BUGZILLA_MCP_URL=http://build-repair-broker:8765/mcp
       - SOURCE_REPO=/workspace/firefox
@@ -34,5 +38,29 @@ services:
       build-repair-broker:
         condition: service_started
 
+  # Evaluation harness (profile-gated so it stays out of the default lifecycle).
+  # Reuses the broker for Bugzilla, so this container holds no Bugzilla creds --
+  # same isolation as the agent. The Firefox checkout is bind-mounted (faster than
+  # cloning); the harness creates worktrees from it and builds each fix. Run with:
+  #   FIREFOX_GIT_REPO=/path docker compose --env-file .env \
+  #     -f agents/build-repair/compose.yml run --rm build-repair-eval \
+  #     --no-try-push --limit 1
+  build-repair-eval:
+    profiles: ["eval"]
+    build:
+      context: ../..
+      dockerfile: agents/build-repair/Dockerfile
+      target: eval
+    environment:
+      - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:?error}
+      - WANDB_API_KEY=${WANDB_API_KEY:-}
+      - BUGZILLA_MCP_URL=http://build-repair-broker:8765/mcp
+      - FIREFOX_GIT_REPO=/firefox
+    volumes:
+      - ${FIREFOX_GIT_REPO:-/firefox}:/firefox
+    depends_on:
+      build-repair-broker:
+        condition: service_started
+
 volumes:
   workspace:
diff --git a/bugbug/tools/build_repair/__init__.py b/agents/build-repair/evals/__init__.py
similarity index 100%
rename from bugbug/tools/build_repair/__init__.py
rename to agents/build-repair/evals/__init__.py
diff --git a/bugbug/tools/build_repair/scorer.py b/agents/build-repair/evals/scorer.py
similarity index 87%
rename from bugbug/tools/build_repair/scorer.py
rename to agents/build-repair/evals/scorer.py
index da513ab13d..29943640be 100644
--- a/bugbug/tools/build_repair/scorer.py
+++ b/agents/build-repair/evals/scorer.py
@@ -56,27 +56,17 @@ def score(self, output: dict | None) -> dict:
                 "has_diff": False,
                 "cost_usd": 0,
                 "num_turns": 0,
-                "input_tokens": 0,
-                "output_tokens": 0,
-                "cache_read_input_tokens": 0,
-                "cache_creation_input_tokens": 0,
             }
         return {
             "successful": output.get("error") is None,
             "has_diff": bool(output.get("diff", "").strip()),
             "cost_usd": output.get("cost_usd", 0),
             "num_turns": output.get("num_turns", 0),
-            "input_tokens": output.get("input_tokens", 0),
-            "output_tokens": output.get("output_tokens", 0),
-            "cache_read_input_tokens": output.get("cache_read_input_tokens", 0),
-            "cache_creation_input_tokens": output.get("cache_creation_input_tokens", 0),
         }
 
     def summarize(self, score_rows: list[dict]) -> dict:
         n = len(score_rows)
         costs = [r.get("cost_usd", 0) for r in score_rows]
-        input_toks = [r.get("input_tokens", 0) for r in score_rows]
-        output_toks = [r.get("output_tokens", 0) for r in score_rows]
         summary = {
             "success_rate": sum(r.get("successful", False) for r in score_rows) / n
             if n
@@ -86,14 +76,6 @@ def summarize(self, score_rows: list[dict]) -> dict:
             else 0,
             "avg_cost_usd": sum(costs) / n if n else 0,
             "total_cost_usd": sum(costs),
-            "total_input_tokens": sum(input_toks),
-            "total_output_tokens": sum(output_toks),
-            "total_cache_read_tokens": sum(
-                r.get("cache_read_input_tokens", 0) for r in score_rows
-            ),
-            "total_cache_creation_tokens": sum(
-                r.get("cache_creation_input_tokens", 0) for r in score_rows
-            ),
             "num_examples": n,
         }
         if self.num_trials > 1:
diff --git a/agents/build-repair/evals/verify.py b/agents/build-repair/evals/verify.py
new file mode 100644
index 0000000000..f448d68948
--- /dev/null
+++ b/agents/build-repair/evals/verify.py
@@ -0,0 +1,162 @@
+# -*- coding: utf-8 -*-
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this file,
+# You can obtain one at http://mozilla.org/MPL/2.0/.
+
+"""LLM-as-a-judge verification of a build-repair fix against ground truth.
+
+Split out of the production agent: this is an evaluation concern. It reads the
+agent's artifacts in a worktree and the real landed fix commits, then asks Claude
+to score the analysis and the fix.
+"""
+
+from __future__ import annotations
+
+from datetime import date
+from logging import getLogger
+from pathlib import Path
+
+from claude_agent_sdk import ClaudeAgentOptions, ResultMessage, query
+from pydantic import BaseModel
+from tenacity import retry, stop_after_attempt, wait_exponential_jitter
+
+logger = getLogger(__name__)
+
+VERIFY_MODEL = "claude-opus-4-8"
+
+# Training-data cutoff per model, for data-contamination filtering. Examples with
+# a fix_commit_date before the cutoff may have appeared in training data.
+# Source: https://platform.claude.com/docs/en/about-claude/models/overview
+MODEL_CUTOFF_DATES = {
+    "claude-opus-4-8": date(2026, 1, 1),
+    "claude-opus-4-6": date(2025, 8, 1),
+    "claude-sonnet-4-6": date(2026, 1, 1),
+    "claude-haiku-4-5-20251001": date(2025, 7, 1),
+    "claude-sonnet-4-5-20250929": date(2025, 7, 1),
+    "claude-opus-4-5-20251101": date(2025, 8, 1),
+    "claude-opus-4-1-20250805": date(2025, 3, 1),
+    "claude-sonnet-4-20250514": date(2025, 3, 1),
+    "claude-3-7-sonnet-20250219": date(2024, 11, 1),
+    "claude-opus-4-20250514": date(2025, 3, 1),
+}
+
+VERIFY_ALLOWED_TOOLS = [
+    "Read",
+    "Bash(git show:*)",
+    "Bash(git log:*)",
+    "Bash(git diff:*)",
+    "Bash(find:*)",
+    "Bash(grep:*)",
+    "WebFetch(domain:firefox-source-docs.mozilla.org)",
+    "WebFetch(domain:searchfox.org)",
+]
+
+VERIFY_TEMPLATE = """You are an expert {target_software} code reviewer evaluating an automated build repair agent's work.
+
+Examine the relevant commits using git:
+- Failure commit (broke the build): {failure_commit}
+- Ground truth fix commit(s) (the real fix that was landed): {ground_truth_commits}
+
+Inspect each commit's changes and read the repair agent's output files:
+- {scratch_out}/analysis.md
+- {scratch_out}/summary.md
+- {scratch_out}/agent_fix.diff (may be empty if no fix was produced)
+
+Evaluate the agent's work on two dimensions:
+
+ANALYSIS:
+- Did the agent correctly identify the root cause of the build failure?
+- How thorough and accurate is the analysis?
+
+FIX:
+- Does the agent's fix address the same files/functions as the ground truth?
+- Is the fix semantically equivalent or close to the ground truth?
+- Would the fix be acceptable in code review as-is?
+
+Guidelines:
+- If agent_fix.diff is empty, set fix_matches_ground_truth=false, fix_quality=0.0, fix_acceptance_probability=0.0
+- A fix can be correct even if it differs syntactically from ground truth -- focus on semantic equivalence
+- analysis_correct should be true if the agent found the right root cause, even if the explanation is imperfect
+- Be calibrated: 0.5 means genuinely uncertain, not a default score
+
+Work autonomously, do not ask questions.
+"""
+
+
+class GroundTruth(BaseModel):
+    gh_fix_commits: list[str]
+
+
+class Judgment(BaseModel):
+    analysis_correct: bool
+    analysis_quality: float
+    analysis_explanation: str
+    fix_matches_ground_truth: bool
+    fix_quality: float
+    fix_explanation: str
+    fix_acceptance_probability: float
+    fix_acceptance_explanation: str
+
+
+def is_data_contaminated(fix_commit_date: str, *models: str) -> bool:
+    """True when the fix predates the latest training cutoff of the given models.
+
+    Conservative across the models that could have memorized the landed fix: skip
+    the example if it predates any of their cutoffs (i.e. the latest one).
+    """
+    cutoffs = [c for m in models if (c := MODEL_CUTOFF_DATES.get(m)) is not None]
+    if not cutoffs:
+        return False
+    return date.fromisoformat(fix_commit_date[:10]) < max(cutoffs)
+
+
+@retry(
+    stop=stop_after_attempt(3),
+    wait=wait_exponential_jitter(initial=2, max=30, jitter=5),
+    reraise=True,
+)
+async def run_verify(
+    *,
+    worktree_path: Path,
+    scratch_out: Path,
+    bug_id: int,
+    failure_commit: str,
+    ground_truth: GroundTruth,
+    agent_diff: str,
+    target_software: str = "Mozilla Firefox",
+    model: str = VERIFY_MODEL,
+) -> tuple[Judgment, float]:
+    """Judge the agent's analysis and fix. Returns (judgment, cost_usd)."""
+    scratch_out.mkdir(parents=True, exist_ok=True)
+    (scratch_out / "agent_fix.diff").write_text(agent_diff, encoding="utf-8")
+
+    prompt = VERIFY_TEMPLATE.format(
+        target_software=target_software,
+        failure_commit=failure_commit,
+        ground_truth_commits=" ".join(ground_truth.gh_fix_commits),
+        scratch_out=scratch_out,
+    )
+    options = ClaudeAgentOptions(
+        model=model,
+        cwd=str(worktree_path),
+        allowed_tools=VERIFY_ALLOWED_TOOLS,
+        disallowed_tools=["AskUserQuestion", "Task"],
+        permission_mode="acceptEdits",
+        effort="high",
+        output_format={"type": "json_schema", "schema": Judgment.model_json_schema()},
+    )
+
+    judgment: Judgment | None = None
+    cost = 0.0
+    async for message in query(prompt=prompt, options=options):
+        if isinstance(message, ResultMessage):
+            cost += message.total_cost_usd or 0.0
+            structured = getattr(message, "structured_output", None)
+            if structured:
+                judgment = Judgment.model_validate(structured)
+            elif message.result:
+                judgment = Judgment.model_validate_json(message.result)
+
+    if judgment is None:
+        raise RuntimeError(f"bug {bug_id}: verification produced no structured output")
+    return judgment, cost
diff --git a/bugbug/tools/build_repair/worktree.py b/agents/build-repair/evals/worktree.py
similarity index 88%
rename from bugbug/tools/build_repair/worktree.py
rename to agents/build-repair/evals/worktree.py
index 1fe2980738..5cb60529a2 100644
--- a/bugbug/tools/build_repair/worktree.py
+++ b/agents/build-repair/evals/worktree.py
@@ -3,14 +3,21 @@
 # License, v. 2.0. If a copy of the MPL was not distributed with this file,
 # You can obtain one at http://mozilla.org/MPL/2.0/.
 
+"""Git worktree management for parallel evaluation trials.
+
+Each trial runs the agent against an isolated checkout of the Firefox repo at a
+specific failure commit, so trials don't conflict. (Production runs are already
+isolated per container, so the agent itself needs no worktrees.)
+"""
+
 import subprocess
 from logging import getLogger
 from pathlib import Path
 
-from bugbug.tools.build_repair.config import WORKTREE_BASE_DIR
-
 logger = getLogger(__name__)
 
+WORKTREE_BASE_DIR = "/tmp/build_repair_worktrees"
+
 
 class WorktreeManager:
     """Manages git worktrees for parallel evaluation runs against a Firefox repo."""
diff --git a/agents/build-repair/pyproject.toml b/agents/build-repair/pyproject.toml
index eb91fa3132..669aac824b 100644
--- a/agents/build-repair/pyproject.toml
+++ b/agents/build-repair/pyproject.toml
@@ -30,4 +30,4 @@ requires = ["hatchling"]
 build-backend = "hatchling.build"
 
 [tool.hatch.build.targets.wheel]
-packages = ["hackbot_agents", "evals/buildrepair_eval"]
+packages = ["hackbot_agents", "evals"]
diff --git a/docker-compose.yml b/docker-compose.yml
index eedad036da..e8577231fe 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -5,6 +5,7 @@ version: "3.8"
 include:
   - path: agents/bug-fix/compose.yml
   - path: agents/build-repair/compose.yml
+  - path: agents/build-repair/compose.eval.yml
 
 services:
   bugbug-base:

From 11750c6f578368c52559efb24707f7cf5ec87bdf Mon Sep 17 00:00:00 2001
From: Evgeny Pavlov <epavlov@mozilla.com>
Date: Thu, 25 Jun 2026 10:00:32 -0700
Subject: [PATCH 11/15] Remove old file from docker compose

---
 docker-compose.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/docker-compose.yml b/docker-compose.yml
index e8577231fe..eedad036da 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -5,7 +5,6 @@ version: "3.8"
 include:
   - path: agents/bug-fix/compose.yml
   - path: agents/build-repair/compose.yml
-  - path: agents/build-repair/compose.eval.yml
 
 services:
   bugbug-base:

From 84e1a2a6fa62ddbdf55e50ca1ada2b1a959eae8f Mon Sep 17 00:00:00 2001
From: Evgeny Pavlov <epavlov@mozilla.com>
Date: Thu, 25 Jun 2026 10:22:40 -0700
Subject: [PATCH 12/15] Add eval todo

---
 agents/build-repair/evals/eval.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/agents/build-repair/evals/eval.py b/agents/build-repair/evals/eval.py
index 679422c69d..f41c1c19d7 100644
--- a/agents/build-repair/evals/eval.py
+++ b/agents/build-repair/evals/eval.py
@@ -92,6 +92,9 @@ def worktree_mgr(self) -> WorktreeManager:
     async def invoke(
         self,
         bug_id: int,
+        # Bug fields before the fix. This field is a part of the dataset.
+        # The new Hackbot agent is not using it. It pulls the Bugzilla bug itself.
+        # TODO: investigate how to hide the fix in evals for the new agent
         pre_fix_bug: dict,
         gh_failure_commits: list[str],
         gh_fix_commits: list[str],

From dbac63fbd3ab40cd689d7d9243871ba16b15b827 Mon Sep 17 00:00:00 2001
From: Evgeny Pavlov <epavlov@mozilla.com>
Date: Thu, 25 Jun 2026 13:55:05 -0700
Subject: [PATCH 13/15] Update readme

---
 agents/build-repair/README.md | 62 ++++++++++++++++++++++++++++-------
 1 file changed, 51 insertions(+), 11 deletions(-)

diff --git a/agents/build-repair/README.md b/agents/build-repair/README.md
index d0546cfc1d..a2c000adee 100644
--- a/agents/build-repair/README.md
+++ b/agents/build-repair/README.md
@@ -4,9 +4,30 @@ Two-stage Claude agent that diagnoses a Firefox build failure and edits the sour
 tree to fix it. Agent logic in `hackbot_agents/build_repair/`; the Weave eval
 harness in `evals/`.
 
-Run the Docker commands below from this folder, with secrets in a local `.env`
+Run the Docker commands below from the repo root, with secrets in a local `.env`
 (`ANTHROPIC_API_KEY`, `BUGZILLA_API_KEY`, plus `WANDB_API_KEY` for evals).
 
+The second stage attempts to build Firefox to verify the fix and iterates on the fix if the build fails.
+It can also bootstrap the Firefox build if needed.
+
+## Input
+
+- `BUG_ID` - Optional Bugzilla bug ID
+- `GIT_COMMIT` - Firefox Git commit that failed the build
+- `FAILURE_TASKS` - a dictionary of failed Taskcluster tasks {task_name: taskcluster_task_id}
+
+## Output
+
+First stage - analysis:
+
+- `summary.md` - a quick summary for a developer
+- `analysis.md` - detailed analysis
+- `planning.md` - intermediate file that outlines fixing steps for the second stage
+
+Second stage - fixing:
+
+- A patch in Hackbot format
+
 ## Test the agent
 
 ```sh
@@ -17,13 +38,17 @@ FAILURE_TASKS='{"build-linux":"XyU4b_BIRdO_IeK6z_kcQg"}' \
 
 Artifacts are written to `~/hackbot/artifacts/`.
 
-## Run evals
+## Evaluation
+
+The evaluation dataset is prepared with [build_repair_create_dataset.ipynb](../../notebooks/build_repair_create_dataset.ipynb) and saved to Weights and Biases Weave.
 
-Each dataset row is a Firefox build failure; per trial the harness runs the agent
+### Run evals
+
+Each dataset row is a Firefox build failure. The harness runs the agent
 on a git worktree at the failure commit, builds the fix, and LLM-judges it against
 the landed commits. Needs a bootstrapped Firefox checkout.
 
-Local:
+Local (use only for debugging, as the new agent is not sandboxed):
 
 ```sh
 FIREFOX_GIT_REPO=/path/to/firefox \
@@ -31,22 +56,37 @@ FIREFOX_GIT_REPO=/path/to/firefox \
   python -m evals.eval --no-try-push --limit 1
 ```
 
-Docker (reuses the broker, so no Bugzilla creds in the eval container):
+Docker (reuses the broker container, so no Bugzilla creds passed to the eval container):
 
 ```sh
 FIREFOX_GIT_REPO=/path/to/firefox \
-  docker compose run --rm build-repair-eval --no-try-push --limit 1
+  docker compose --env-file .env -f agents/build-repair/compose.yml run --rm --build build-repair-eval --no-try-push --limit 1
 ```
 
-Flags: `--trials N`, `--parallelism N`, `--judge-model <id>`, `--dataset <ref>`,
-`--no-try-push`, `--verbose`.
+Flags:
+
+`--trials N` - the number of times to run each example
+
+`--parallelism N` - the number of runs Weave executes in parallel
+
+`--judge-model <id>` - Claude model ID for LLM-as-a-judge
+
+`--dataset <ref>` - Weave dataset name
 
-The agent reads the bug live from Bugzilla, so the harness skips examples whose fix
+`--no-try-push` - do not run a TRY push to verify the results; only build locally
+
+`--verbose` - debugging log level
+
+The harness skips examples whose fix
 landed before the production model's training cutoff (`MODEL_CUTOFF_DATES` in
 `evals/verify.py`) to avoid contamination.
 
-## W&B metrics
+Change the models in [config.py](hackbot_agents/build_repair/config.py) to older ones (e.g. `claude-opus-4-6`) to test on older datasets.
 
-`weave.init` + `weave.Evaluation` log success and diff rates, local and try build
+### W&B metrics
+
+`weave.init` + `weave.Evaluation` log success and diff rates, local and TRY build
 pass rates, LLM fix-matching (analysis/fix quality, ground-truth match,
 acceptance), and `total_cost_usd`.
+
+See https://wandb.ai/moz-bugbug/bugbug-build-repair-eval/weave/evaluations

From b2f6b7494cc2347a692eecc5e76ce71b0d1e1762 Mon Sep 17 00:00:00 2001
From: Evgeny Pavlov <epavlov@mozilla.com>
Date: Thu, 25 Jun 2026 14:25:18 -0700
Subject: [PATCH 14/15] Remove services

---
 .github/dependabot.yml | 38 --------------------------------------
 1 file changed, 38 deletions(-)

diff --git a/.github/dependabot.yml b/.github/dependabot.yml
index 0c766de53a..0ce7a61d73 100644
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@@ -34,44 +34,6 @@ updates:
     open-pull-requests-limit: 99
     allow:
       - dependency-type: direct
-  - package-ecosystem: uv
-    directory: "/services/mcp"
-    schedule:
-      interval: weekly
-      day: tuesday
-    groups:
-      patch:
-        applies-to: version-updates
-        patterns:
-          - "*"
-        update-types:
-          - patch
-    cooldown:
-      semver-major-days: 14
-      semver-minor-days: 7
-      semver-patch-days: 3
-    open-pull-requests-limit: 99
-    allow:
-      - dependency-type: direct
-  - package-ecosystem: uv
-    directory: "/services/reviewhelper-api"
-    schedule:
-      interval: weekly
-      day: wednesday
-    groups:
-      patch:
-        applies-to: version-updates
-        patterns:
-          - "*"
-        update-types:
-          - patch
-    cooldown:
-      semver-major-days: 14
-      semver-minor-days: 7
-      semver-patch-days: 3
-    open-pull-requests-limit: 99
-    allow:
-      - dependency-type: direct
   - package-ecosystem: npm
     directory: "/ui/changes"
     schedule:

From d07ac26eaa2a4545dd26efce3608efc4d4e39d82 Mon Sep 17 00:00:00 2001
From: Evgeny Pavlov <epavlov@mozilla.com>
Date: Thu, 25 Jun 2026 17:26:53 -0700
Subject: [PATCH 15/15] Remove unnecessary git config

---
 agents/build-repair/Dockerfile | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/agents/build-repair/Dockerfile b/agents/build-repair/Dockerfile
index a6de686d05..e1890dc740 100644
--- a/agents/build-repair/Dockerfile
+++ b/agents/build-repair/Dockerfile
@@ -74,10 +74,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     UV_PROJECT_ENVIRONMENT=/opt/venv \
     uv sync --frozen --no-dev --no-install-workspace --inexact --extra eval --package hackbot-agent-build-repair
 
-# The harness creates worktrees and runs `./mach build` against a bind-mounted
-# Firefox checkout owned by a different uid; allow git to operate on it.
-RUN git config --system --add safe.directory '*'
-
 USER agent
 ENV FIREFOX_GIT_REPO=/firefox