From 9650bf46c6d3cad910364ec15758131cc038db3d Mon Sep 17 00:00:00 2001
From: John Pangas <swiftyxswaggy@outlook.com>
Date: Thu, 25 Jun 2026 20:19:42 -0600
Subject: [PATCH 1/3] Add test plan agent

---
 agents/test-plan-generator/Dockerfile         |  50 ++++++++
 agents/test-plan-generator/compose.yml        |  19 +++
 agents/test-plan-generator/hackbot.toml       |   3 +
 .../test_plan_generator/__init__.py           |   1 +
 .../test_plan_generator/__main__.py           |  36 ++++++
 .../test_plan_generator/agent.py              | 120 ++++++++++++++++++
 .../test_plan_generator/config.py             |  35 +++++
 .../test_plan_generator/devtools_mcp.py       |  29 +++++
 .../test_plan_generator/firefox_install.py    |  41 ++++++
 .../test_plan_generator/prompts/system.md     |  80 ++++++++++++
 .../test_plan_generator/result.py             |  99 +++++++++++++++
 agents/test-plan-generator/pyproject.toml     |  22 ++++
 docker-compose.yml                            |   1 +
 services/hackbot-api/app/agents.py            |  15 ++-
 services/hackbot-api/app/schemas.py           |   8 ++
 services/hackbot-api/tests/test_agents.py     |  36 +++++-
 uv.lock                                       |  22 ++++
 17 files changed, 615 insertions(+), 2 deletions(-)
 create mode 100644 agents/test-plan-generator/Dockerfile
 create mode 100644 agents/test-plan-generator/compose.yml
 create mode 100644 agents/test-plan-generator/hackbot.toml
 create mode 100644 agents/test-plan-generator/hackbot_agents/test_plan_generator/__init__.py
 create mode 100644 agents/test-plan-generator/hackbot_agents/test_plan_generator/__main__.py
 create mode 100644 agents/test-plan-generator/hackbot_agents/test_plan_generator/agent.py
 create mode 100644 agents/test-plan-generator/hackbot_agents/test_plan_generator/config.py
 create mode 100644 agents/test-plan-generator/hackbot_agents/test_plan_generator/devtools_mcp.py
 create mode 100644 agents/test-plan-generator/hackbot_agents/test_plan_generator/firefox_install.py
 create mode 100644 agents/test-plan-generator/hackbot_agents/test_plan_generator/prompts/system.md
 create mode 100644 agents/test-plan-generator/hackbot_agents/test_plan_generator/result.py
 create mode 100644 agents/test-plan-generator/pyproject.toml

diff --git a/agents/test-plan-generator/Dockerfile b/agents/test-plan-generator/Dockerfile
new file mode 100644
index 0000000000..ef082714da
--- /dev/null
+++ b/agents/test-plan-generator/Dockerfile
@@ -0,0 +1,50 @@
+FROM python:3.12 AS builder
+
+COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
+
+ENV UV_PROJECT_ENVIRONMENT=/opt/venv
+
+WORKDIR /app
+
+# Install external deps without building workspace members.
+RUN --mount=type=cache,target=/root/.cache/uv \
+    --mount=type=bind,source=pyproject.toml,target=pyproject.toml \
+    --mount=type=bind,source=uv.lock,target=uv.lock \
+    --mount=type=bind,source=VERSION,target=VERSION \
+    uv sync --frozen --no-dev --no-install-workspace --package hackbot-agent-test-plan-generator
+
+RUN --mount=type=cache,target=/root/.cache/uv \
+    --mount=type=bind,target=/app,rw \
+    uv sync --locked --no-dev --no-editable --package hackbot-agent-test-plan-generator
+
+FROM python:3.12 AS agent
+
+COPY --from=builder /opt/venv /opt/venv
+WORKDIR /app
+
+ENV PYTHONUNBUFFERED=1
+ENV PYTHONDONTWRITEBYTECODE=1
+ENV PATH="/opt/venv/bin:$PATH"
+
+# The Firefox DevTools MCP server is launched through npx. Firefox itself is
+# downloaded at agent startup, so the image only needs Node/npm and the shared
+# libraries required by headless Firefox.
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends \
+        nodejs npm \
+        ca-certificates \
+        libgtk-3-0 libdbus-glib-1-2 libx11-xcb1 libxtst6 libxt6 \
+        libasound2 libpci3 \
+    && rm -rf /var/lib/apt/lists/*
+
+# hackbot.toml lives at the agent root (not inside the package), so copy it into
+# the working dir; the runtime discovers it there (cwd) at startup.
+COPY agents/test-plan-generator/hackbot.toml /app/hackbot.toml
+
+RUN useradd --create-home --shell /bin/bash agent \
+    && mkdir -p /workspace \
+    && chown agent:agent /workspace
+
+USER agent
+
+CMD ["python", "-m", "hackbot_agents.test_plan_generator"]
diff --git a/agents/test-plan-generator/compose.yml b/agents/test-plan-generator/compose.yml
new file mode 100644
index 0000000000..ee4b076023
--- /dev/null
+++ b/agents/test-plan-generator/compose.yml
@@ -0,0 +1,19 @@
+services:
+  test-plan-generator-agent:
+    build:
+      context: ../..
+      dockerfile: agents/test-plan-generator/Dockerfile
+      target: agent
+    environment:
+      - RUN_ID
+      - FEATURE
+      - FEATURE_DETAILS
+      - MODEL
+      - MAX_TURNS
+      - EFFORT
+      - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:?error}
+      # No uploader locally: summary/logs/attachments are written under
+      # /artifacts/<run_id>, bind-mounted to the host's ~/hackbot/artifacts.
+      - ARTIFACTS_DIR=/artifacts
+    volumes:
+      - ${HOME}/hackbot/artifacts:/artifacts
diff --git a/agents/test-plan-generator/hackbot.toml b/agents/test-plan-generator/hackbot.toml
new file mode 100644
index 0000000000..5912e57781
--- /dev/null
+++ b/agents/test-plan-generator/hackbot.toml
@@ -0,0 +1,3 @@
+# test-plan-generator needs no platform prep: no [source] checkout and no
+# [firefox] build. It downloads a fresh Firefox Nightly at startup and drives it
+# through the Firefox DevTools MCP server.
diff --git a/agents/test-plan-generator/hackbot_agents/test_plan_generator/__init__.py b/agents/test-plan-generator/hackbot_agents/test_plan_generator/__init__.py
new file mode 100644
index 0000000000..8b13789179
--- /dev/null
+++ b/agents/test-plan-generator/hackbot_agents/test_plan_generator/__init__.py
@@ -0,0 +1 @@
+
diff --git a/agents/test-plan-generator/hackbot_agents/test_plan_generator/__main__.py b/agents/test-plan-generator/hackbot_agents/test_plan_generator/__main__.py
new file mode 100644
index 0000000000..9b4e5b3357
--- /dev/null
+++ b/agents/test-plan-generator/hackbot_agents/test_plan_generator/__main__.py
@@ -0,0 +1,36 @@
+from hackbot_runtime import HackbotContext, run_async
+from pydantic_settings import BaseSettings, SettingsConfigDict
+
+from .agent import TestPlanGeneratorResult, run_test_plan_generator
+from .firefox_install import install_firefox_nightly
+
+
+class AgentInputs(BaseSettings):
+    feature: str
+    feature_details: str
+    model: str | None = None
+    max_turns: int | None = None
+    effort: str | None = None
+
+    model_config = SettingsConfigDict(extra="ignore")
+
+
+async def main(ctx: HackbotContext) -> TestPlanGeneratorResult:
+    inputs = AgentInputs()
+
+    firefox_path = str(install_firefox_nightly())
+
+    return await run_test_plan_generator(
+        feature=inputs.feature,
+        feature_details=inputs.feature_details,
+        model=inputs.model,
+        max_turns=inputs.max_turns,
+        effort=inputs.effort,
+        firefox_path=firefox_path,
+        log=ctx.log_path,
+        verbose=True,
+    )
+
+
+if __name__ == "__main__":
+    run_async(main)
diff --git a/agents/test-plan-generator/hackbot_agents/test_plan_generator/agent.py b/agents/test-plan-generator/hackbot_agents/test_plan_generator/agent.py
new file mode 100644
index 0000000000..20f93a0283
--- /dev/null
+++ b/agents/test-plan-generator/hackbot_agents/test_plan_generator/agent.py
@@ -0,0 +1,120 @@
+"""Firefox QA test-plan generator and executor."""
+
+from __future__ import annotations
+
+import logging
+from pathlib import Path
+
+from claude_agent_sdk import (
+    ClaudeAgentOptions,
+    ClaudeSDKClient,
+    McpServerConfig,
+    ResultMessage,
+)
+from hackbot_runtime import AgentError, HackbotAgentResult
+from hackbot_runtime.claude import Reporter
+
+from .config import DEVTOOLS_TOOLS
+from .devtools_mcp import build_devtools_server
+from .result import (
+    RESULT_SERVER_NAME,
+    SUBMIT_RESULT_TOOL,
+    ResultCollector,
+    TestPlanResult,
+    build_result_server,
+)
+
+HERE = Path(__file__).resolve().parent
+
+logger = logging.getLogger("test-plan-generator")
+
+
+class TestPlanGeneratorResult(HackbotAgentResult):
+    result: TestPlanResult | None = None
+
+
+def load_system_prompt() -> str:
+    return (HERE / "prompts" / "system.md").read_text()
+
+
+def build_user_prompt(feature: str, feature_details: str) -> str:
+    return (
+        "Generate and run a Firefox QA test plan for this feature.\n\n"
+        f"Feature:\n{feature}\n\n"
+        f"Feature details:\n{feature_details}\n\n"
+        "Follow the required workflow exactly: generate 10 cases first, run "
+        "them in order, stop each case on first failed step, and submit the "
+        "structured result."
+    )
+
+
+async def run_test_plan_generator(
+    *,
+    feature: str,
+    feature_details: str,
+    model: str | None = None,
+    max_turns: int | None = None,
+    effort: str | None = None,
+    firefox_path: str | None = None,
+    verbose: bool = False,
+    log: Path | None = None,
+) -> TestPlanGeneratorResult:
+    """Generate and run a Firefox QA test plan for one feature."""
+    logger.info("generating Firefox QA test plan for %s", feature)
+
+    devtools_server = build_devtools_server(
+        firefox_path=Path(firefox_path) if firefox_path else None,
+        headless=True,
+        enable_script=True,
+    )
+
+    result_collector = ResultCollector()
+    result_server = build_result_server(result_collector)
+
+    mcp_servers: dict[str, McpServerConfig] = {
+        "firefox-devtools": devtools_server,
+        RESULT_SERVER_NAME: result_server,
+    }
+
+    options = ClaudeAgentOptions(
+        system_prompt=load_system_prompt(),
+        mcp_servers=mcp_servers,
+        permission_mode="bypassPermissions",
+        allowed_tools=[
+            *DEVTOOLS_TOOLS,
+            SUBMIT_RESULT_TOOL,
+        ],
+        model=model,
+        max_turns=max_turns,
+        **({"effort": effort} if effort else {}),
+        setting_sources=[],
+        max_buffer_size=10 * 1024 * 1024,
+    )
+
+    result_msg: ResultMessage | None = None
+    with Reporter(verbose=verbose, log_path=log) as reporter:
+        reporter.header(feature)
+        async with ClaudeSDKClient(options=options) as client:
+            await client.query(build_user_prompt(feature, feature_details))
+            async for msg in client.receive_response():
+                reporter.message(msg)
+                if isinstance(msg, ResultMessage):
+                    result_msg = msg
+
+    if result_msg is None:
+        raise AgentError(f"{feature}: agent produced no result message")
+    if result_msg.is_error:
+        raise AgentError(
+            f"{feature} test-plan generation failed: "
+            f"{result_msg.result or result_msg.subtype}"
+        )
+    if result_collector.result is None:
+        raise AgentError(
+            f"{feature}: agent finished without submitting a result via submit_result"
+        )
+
+    return TestPlanGeneratorResult(
+        result=result_collector.result,
+        num_turns=result_msg.num_turns,
+        total_cost_usd=result_msg.total_cost_usd,
+    )
diff --git a/agents/test-plan-generator/hackbot_agents/test_plan_generator/config.py b/agents/test-plan-generator/hackbot_agents/test_plan_generator/config.py
new file mode 100644
index 0000000000..64000350d1
--- /dev/null
+++ b/agents/test-plan-generator/hackbot_agents/test_plan_generator/config.py
@@ -0,0 +1,35 @@
+# Firefox DevTools MCP tools (@mozilla/firefox-devtools-mcp-moz), exposed under
+# the "firefox-devtools" server name. The QA agent may choose content/page tools
+# for web-page behavior or chrome-context tools for Firefox browser UI.
+DEVTOOLS_TOOLS = [
+    "mcp__firefox-devtools__list_pages",
+    "mcp__firefox-devtools__new_page",
+    "mcp__firefox-devtools__navigate_page",
+    "mcp__firefox-devtools__select_page",
+    "mcp__firefox-devtools__close_page",
+    "mcp__firefox-devtools__take_snapshot",
+    "mcp__firefox-devtools__resolve_uid_to_selector",
+    "mcp__firefox-devtools__clear_snapshot",
+    "mcp__firefox-devtools__click_by_uid",
+    "mcp__firefox-devtools__hover_by_uid",
+    "mcp__firefox-devtools__fill_by_uid",
+    "mcp__firefox-devtools__fill_form_by_uid",
+    "mcp__firefox-devtools__drag_by_uid_to_uid",
+    "mcp__firefox-devtools__upload_file_by_uid",
+    "mcp__firefox-devtools__list_console_messages",
+    "mcp__firefox-devtools__clear_console_messages",
+    "mcp__firefox-devtools__list_network_requests",
+    "mcp__firefox-devtools__get_network_request",
+    "mcp__firefox-devtools__screenshot_page",
+    "mcp__firefox-devtools__screenshot_by_uid",
+    "mcp__firefox-devtools__evaluate_script",
+    "mcp__firefox-devtools__accept_dialog",
+    "mcp__firefox-devtools__dismiss_dialog",
+    "mcp__firefox-devtools__navigate_history",
+    "mcp__firefox-devtools__set_viewport_size",
+    "mcp__firefox-devtools__get_firefox_info",
+    "mcp__firefox-devtools__get_firefox_output",
+    "mcp__firefox-devtools__list_chrome_contexts",
+    "mcp__firefox-devtools__select_chrome_context",
+    "mcp__firefox-devtools__evaluate_chrome_script",
+]
diff --git a/agents/test-plan-generator/hackbot_agents/test_plan_generator/devtools_mcp.py b/agents/test-plan-generator/hackbot_agents/test_plan_generator/devtools_mcp.py
new file mode 100644
index 0000000000..cd0744f5b2
--- /dev/null
+++ b/agents/test-plan-generator/hackbot_agents/test_plan_generator/devtools_mcp.py
@@ -0,0 +1,29 @@
+from __future__ import annotations
+
+from pathlib import Path
+
+from claude_agent_sdk.types import McpStdioServerConfig
+
+PACKAGE = "@mozilla/firefox-devtools-mcp-moz"
+
+
+def build_devtools_server(
+    firefox_path: Path | None = None,
+    *,
+    headless: bool = True,
+    enable_script: bool = True,
+) -> McpStdioServerConfig:
+    """Build the stdio config for the Firefox DevTools MCP server."""
+    args = [PACKAGE]
+    if headless:
+        args.append("--headless")
+    if enable_script:
+        args.append("--enable-script")
+    if firefox_path is not None:
+        args += ["--firefox-path", str(firefox_path)]
+
+    return McpStdioServerConfig(
+        command="npx",
+        args=args,
+        env={"MOZ_REMOTE_ALLOW_SYSTEM_ACCESS": "1"},
+    )
diff --git a/agents/test-plan-generator/hackbot_agents/test_plan_generator/firefox_install.py b/agents/test-plan-generator/hackbot_agents/test_plan_generator/firefox_install.py
new file mode 100644
index 0000000000..0ea2d8731c
--- /dev/null
+++ b/agents/test-plan-generator/hackbot_agents/test_plan_generator/firefox_install.py
@@ -0,0 +1,54 @@
+"""Download and install a prebuilt Firefox Nightly for the agent to drive."""
+
+from __future__ import annotations
+
+import logging
+import platform
+import shutil
+from pathlib import Path
+
+import mozdownload
+import mozinstall
+
+INSTALL_DIR = Path.home() / "firefox"
+BRANCH = "mozilla-central"
+
+logger = logging.getLogger("test-plan-generator")
+
+
+def install_firefox_nightly() -> Path:
+    """Download a daily Firefox build from ``BRANCH`` and install it.
+
+    Any existing ``INSTALL_DIR`` is deleted first so each call starts from a
+    clean directory. The downloaded archive is removed once installed.
+
+    Returns:
+        Path to the installed ``firefox`` binary.
+    """
+    # Request the arm64 build explicitly on ARM hosts; ``None`` leaves the
+    # platform choice to mozdownload.
+    mozdownload_platform = (
+        "linux-arm64" if platform.machine() in ("aarch64", "arm64") else None
+    )
+
+    if INSTALL_DIR.exists():
+        shutil.rmtree(INSTALL_DIR)
+    INSTALL_DIR.mkdir(parents=True)
+
+    logger.info("downloading Firefox Nightly...")
+    scraper = mozdownload.FactoryScraper(
+        "daily",
+        branch=BRANCH,
+        platform=mozdownload_platform,
+        destination=str(INSTALL_DIR),
+    )
+    archive = scraper.download()
+
+    install_folder = mozinstall.install(archive, str(INSTALL_DIR))
+    # The archive is only needed for installation; drop it so it does not keep
+    # occupying the container filesystem for the rest of the run.
+    Path(archive).unlink(missing_ok=True)
+    binary = Path(mozinstall.get_binary(install_folder, "firefox"))
+
+    logger.info("installed Firefox at %s", binary)
+    return binary
diff --git a/agents/test-plan-generator/hackbot_agents/test_plan_generator/prompts/system.md b/agents/test-plan-generator/hackbot_agents/test_plan_generator/prompts/system.md
new file mode 100644
index 0000000000..f584d6c739
--- /dev/null
+++ b/agents/test-plan-generator/hackbot_agents/test_plan_generator/prompts/system.md
@@ -0,0 +1,80 @@
+You are a Firefox QA test-plan generation and execution agent.
+
+Your job is to generate test cases from the provided Firefox feature details,
+run them in Firefox through the available DevTools MCP tools, and report only
+pass/fail/unsuitable results. Do not diagnose, fix, patch, or propose changes.
+
+## Required workflow
+
+1. Generate exactly 10 test cases before running any case.
+2. Each test case must have:
+   - A short title.
+   - A selected execution context: `chrome` or `content`.
+   - Optional preconditions only when they are truly needed.
+   - 1 to 6 concise, ordered test steps.
+3. Run the 10 test cases in order.
+4. Submit the final structured result with `submit_result`.
+
+## Context selection
+
+Choose the context per test case.
+
+Use `content` for normal website or web-page behavior.
+
+Use `chrome` for Firefox browser UI, preferences, toolbar, bookmarks/history,
+downloads UI, browser menus, browser panels, PDF viewer chrome interactions, or
+any case where you are unsure which context is correct.
+
+### Content context rules
+
+- Use page/content tools such as creating or selecting pages, navigating,
+  snapshots, UID interactions, console/network inspection, screenshots, and
+  `evaluate_script`.
+- Do not use chrome-context tools for a content-context case.
+
+### Chrome context rules
+
+- Your first two DevTools MCP actions for a chrome-context case must be:
+  1. `list_chrome_contexts`
+  2. `select_chrome_context` for the target browser window
+- Use `evaluate_chrome_script` for browser UI interaction.
+- Wrap JavaScript in an immediately invoked function expression that explicitly
+  returns a value, for example:
+
+```javascript
+(() => {
+  return gBrowser.tabs.length;
+})();
+```
+
+- Do not mix content-context tools into a chrome-context case unless a generated
+  test step explicitly needs a web page as test data.
+
+## Execution rules
+
+- Execute steps exactly in the order generated.
+- Do not skip, reorder, combine, or rewrite steps after generation.
+- Call only the tools needed for the current step.
+- If a step fails, mark that step failed, mark the case failed, stop that case,
+  and move to the next case.
+- Do not try alternate approaches to make a failing step pass.
+- Do not debug or explain root cause.
+- Do not propose fixes.
+
+## Unsuitable cases
+
+Mark a case as `unsuitable` only if it requires:
+
+- Restarting Firefox during the test flow.
+- Pixel-perfect or visual comparison.
+- Installing external apps beyond basic add-ons.
+- Confirming real hardware behavior such as microphone, camera, or printer.
+- Changing, verifying, or interacting with OS/system settings.
+- Changing, verifying, or interacting with the system desktop or OS UI.
+- Firefox Sync, cross-device verification, or account-sync behavior.
+
+## Reporting
+
+The final answer must be submitted through `submit_result` exactly once. A prose
+message is not enough. Include exactly 10 generated test cases and exactly 10
+case results.
diff --git a/agents/test-plan-generator/hackbot_agents/test_plan_generator/result.py b/agents/test-plan-generator/hackbot_agents/test_plan_generator/result.py
new file mode 100644
index 0000000000..d8a36a048f
--- /dev/null
+++ b/agents/test-plan-generator/hackbot_agents/test_plan_generator/result.py
@@ -0,0 +1,99 @@
+"""Structured result reporting for the test-plan-generator agent."""
+
+from __future__ import annotations
+
+from typing import Literal
+
+from claude_agent_sdk import McpServerConfig, create_sdk_mcp_server, tool
+from pydantic import BaseModel, Field, ValidationError, model_validator
+
+RESULT_SERVER_NAME = "test-plan-generator"
+SUBMIT_RESULT_TOOL = f"mcp__{RESULT_SERVER_NAME}__submit_result"
+
+
+class GeneratedTestCase(BaseModel):
+    id: int = Field(description="Sequential case id from 1 through 10.")
+    title: str
+    context: Literal["chrome", "content"]
+    preconditions: str | None = None
+    steps: list[str] = Field(
+        description="Concise test steps for this case; between 1 and 6 steps."
+    )
+
+    @model_validator(mode="after")
+    def _validate_steps(self) -> "GeneratedTestCase":
+        if not 1 <= len(self.steps) <= 6:
+            raise ValueError("each generated test case must have 1 to 6 steps")
+        return self
+
+
+class StepResult(BaseModel):
+    step_number: int
+    status: Literal["passed", "failed", "not_run"]
+    observation: str
+
+
+class TestCaseResult(BaseModel):
+    id: int
+    status: Literal["passed", "failed", "unsuitable"]
+    step_results: list[StepResult]
+    summary: str
+
+
+class TestPlanResult(BaseModel):
+    feature: str
+    generated_test_cases: list[GeneratedTestCase]
+    results: list[TestCaseResult]
+    summary: str
+
+    @model_validator(mode="after")
+    def _validate_result(self) -> "TestPlanResult":
+        if len(self.generated_test_cases) != 10:
+            raise ValueError("generated_test_cases must contain exactly 10 cases")
+
+        case_ids = [case.id for case in self.generated_test_cases]
+        if case_ids != list(range(1, 11)):
+            raise ValueError("generated test case ids must be 1 through 10")
+
+        result_ids = [result.id for result in self.results]
+        if result_ids != list(range(1, 11)):
+            raise ValueError(
+                "results must contain one result for each case id 1 through 10"
+            )
+
+        return self
+
+
+SUBMIT_RESULT_SCHEMA = {
+    **TestPlanResult.model_json_schema(),
+    "additionalProperties": False,
+}
+
+
+class ResultCollector:
+    """Holds the result submitted by the agent, if any."""
+
+    def __init__(self) -> None:
+        self.result: TestPlanResult | None = None
+
+
+def build_result_server(collector: ResultCollector) -> McpServerConfig:
+    """Build an in-process MCP server exposing the ``submit_result`` tool."""
+
+    @tool(
+        "submit_result",
+        "Submit the final generated Firefox QA test plan and execution result. "
+        "Call exactly once, after all 10 test cases have been generated and run.",
+        SUBMIT_RESULT_SCHEMA,
+    )
+    async def submit_result(args: dict) -> dict:
+        try:
+            collector.result = TestPlanResult.model_validate(args)
+        except ValidationError as exc:
+            return {
+                "content": [{"type": "text", "text": f"Invalid result: {exc}"}],
+                "is_error": True,
+            }
+        return {"content": [{"type": "text", "text": "Result recorded."}]}
+
+    return create_sdk_mcp_server(name=RESULT_SERVER_NAME, tools=[submit_result])
diff --git a/agents/test-plan-generator/pyproject.toml b/agents/test-plan-generator/pyproject.toml
new file mode 100644
index 0000000000..7aeaaf5718
--- /dev/null
+++ b/agents/test-plan-generator/pyproject.toml
@@ -0,0 +1,22 @@
+[project]
+name = "hackbot-agent-test-plan-generator"
+version = "0.1.0"
+description = "Cloud Run Job image that generates and runs Firefox QA test plans"
+requires-python = ">=3.12"
+dependencies = [
+    "hackbot-runtime[claude-sdk]",
+    "claude-agent-sdk>=0.1.30",
+    "mcp>=1.0.0",
+    "mozdownload",
+    "mozinstall",
+]
+
+[tool.uv.sources]
+hackbot-runtime = { workspace = true }
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[tool.hatch.build.targets.wheel]
+packages = ["hackbot_agents"]
diff --git a/docker-compose.yml b/docker-compose.yml
index cc534a3242..4a0477037a 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -4,6 +4,7 @@ version: "3.8"
 
 include:
   - path: agents/bug-fix/compose.yml
+  - path: agents/test-plan-generator/compose.yml
 
 services:
   bugbug-base:
diff --git a/services/hackbot-api/app/agents.py b/services/hackbot-api/app/agents.py
index f5d75f72bc..0d365bcb5a 100644
--- a/services/hackbot-api/app/agents.py
+++ b/services/hackbot-api/app/agents.py
@@ -4,7 +4,11 @@
 
 from pydantic import BaseModel
 
-from app.schemas import AutowebcompatReproInputs, BugFixInputs
+from app.schemas import (
+    AutowebcompatReproInputs,
+    BugFixInputs,
+    TestPlanGeneratorInputs,
+)
 
 
 @dataclass(frozen=True)
@@ -57,4 +61,13 @@ def model_to_env(inputs: BaseModel) -> dict[str, str]:
         job_name="hackbot-agent-autowebcompat-repro",
         input_schema=AutowebcompatReproInputs,
     ),
+    "test-plan-generator": AgentSpec(
+        name="test-plan-generator",
+        description=(
+            "Generate 10 concise Firefox QA test cases from feature details, "
+            "run them in Firefox through DevTools MCP, and report pass/fail results."
+        ),
+        job_name="hackbot-agent-test-plan-generator",
+        input_schema=TestPlanGeneratorInputs,
+    ),
 }
diff --git a/services/hackbot-api/app/schemas.py b/services/hackbot-api/app/schemas.py
index f54d133d4b..8e01996453 100644
--- a/services/hackbot-api/app/schemas.py
+++ b/services/hackbot-api/app/schemas.py
@@ -81,3 +81,11 @@ def _require_subject(self) -> "AutowebcompatReproInputs":
         if self.bug_data is None and self.bug_id is None:
             raise ValueError("provide at least one of bug_data or bug_id")
         return self
+
+
+class TestPlanGeneratorInputs(BaseModel):
+    feature: str
+    feature_details: str
+    model: str | None = None
+    max_turns: int | None = None
+    effort: str | None = None
diff --git a/services/hackbot-api/tests/test_agents.py b/services/hackbot-api/tests/test_agents.py
index c99c9d4689..7f0d8abf6a 100644
--- a/services/hackbot-api/tests/test_agents.py
+++ b/services/hackbot-api/tests/test_agents.py
@@ -1,7 +1,14 @@
 """Tests for the agent registry and generic env serialization."""
 
+import pytest
 from app.agents import AGENT_REGISTRY, model_to_env
-from app.schemas import BugFixInputs
+from app.schemas import (
+    BugFixInputs,
+)
+from app.schemas import (
+    TestPlanGeneratorInputs as PlanGeneratorInputs,
+)
+from pydantic import ValidationError
 
 
 def test_model_to_env_uppercases_and_stringifies():
@@ -30,3 +37,30 @@ def test_bug_fix_registry_uses_default_env_serializer():
     # No hand-written build_env: the router falls back to model_to_env.
     assert spec.build_env is None
     assert spec.input_schema is BugFixInputs
+
+
+def test_test_plan_generator_inputs_require_feature_details():
+    with pytest.raises(ValidationError):
+        PlanGeneratorInputs(feature="Bookmarks and History")
+
+
+def test_test_plan_generator_env_serialization():
+    env = model_to_env(
+        PlanGeneratorInputs(
+            feature="Bookmarks and History",
+            feature_details="Bookmarks toolbar behavior",
+        )
+    )
+
+    assert env == {
+        "FEATURE": "Bookmarks and History",
+        "FEATURE_DETAILS": "Bookmarks toolbar behavior",
+    }
+
+
+def test_test_plan_generator_registry_uses_default_env_serializer():
+    spec = AGENT_REGISTRY["test-plan-generator"]
+
+    assert spec.build_env is None
+    assert spec.job_name == "hackbot-agent-test-plan-generator"
+    assert spec.input_schema is PlanGeneratorInputs
diff --git a/uv.lock b/uv.lock
index bb6173d69e..4c432bcdb0 100644
--- a/uv.lock
+++ b/uv.lock
@@ -24,6 +24,7 @@ members = [
     "bugbug-mcp",
     "hackbot-agent-autowebcompat-repro",
     "hackbot-agent-bug-fix",
+    "hackbot-agent-test-plan-generator",
     "hackbot-api",
     "hackbot-runtime",
     "reviewhelper-api",
@@ -2417,6 +2418,27 @@ requires-dist = [
     { name = "uvicorn", specifier = ">=0.27.0" },
 ]
 
+[[package]]
+name = "hackbot-agent-test-plan-generator"
+version = "0.1.0"
+source = { editable = "agents/test-plan-generator" }
+dependencies = [
+    { name = "claude-agent-sdk" },
+    { name = "hackbot-runtime", extra = ["claude-sdk"] },
+    { name = "mcp" },
+    { name = "mozdownload" },
+    { name = "mozinstall" },
+]
+
+[package.metadata]
+requires-dist = [
+    { name = "claude-agent-sdk", specifier = ">=0.1.30" },
+    { name = "hackbot-runtime", extras = ["claude-sdk"], editable = "libs/hackbot-runtime" },
+    { name = "mcp", specifier = ">=1.0.0" },
+    { name = "mozdownload" },
+    { name = "mozinstall" },
+]
+
 [[package]]
 name = "hackbot-api"
 version = "0.1.0"

From dad4cbb67105d97373178d35b81ab91afc4f0370 Mon Sep 17 00:00:00 2001
From: John Pangas <swiftyxswaggy@outlook.com>
Date: Fri, 26 Jun 2026 01:37:11 -0600
Subject: [PATCH 2/3] Rename test-plan-generator inputs and add test_scope

---
 agents/test-plan-generator/compose.yml        |  5 ++-
 .../test_plan_generator/__main__.py           | 10 +++--
 .../test_plan_generator/agent.py              | 33 +++++++++++------
 .../test_plan_generator/prompts/system.md     | 37 ++++++++++++++++---
 services/hackbot-api/app/schemas.py           |  5 ++-
 services/hackbot-api/tests/test_agents.py     | 17 ++++++---
 6 files changed, 76 insertions(+), 31 deletions(-)

diff --git a/agents/test-plan-generator/compose.yml b/agents/test-plan-generator/compose.yml
index ee4b076023..987bafbb12 100644
--- a/agents/test-plan-generator/compose.yml
+++ b/agents/test-plan-generator/compose.yml
@@ -6,8 +6,9 @@ services:
       target: agent
     environment:
       - RUN_ID
-      - FEATURE
-      - FEATURE_DETAILS
+      - FEATURE_NAME
+      - FEATURE_DESCRIPTION
+      - TEST_SCOPE
       - MODEL
       - MAX_TURNS
       - EFFORT
diff --git a/agents/test-plan-generator/hackbot_agents/test_plan_generator/__main__.py b/agents/test-plan-generator/hackbot_agents/test_plan_generator/__main__.py
index 9b4e5b3357..af67eac60b 100644
--- a/agents/test-plan-generator/hackbot_agents/test_plan_generator/__main__.py
+++ b/agents/test-plan-generator/hackbot_agents/test_plan_generator/__main__.py
@@ -6,8 +6,9 @@
 
 
 class AgentInputs(BaseSettings):
-    feature: str
-    feature_details: str
+    feature_name: str
+    feature_description: str
+    test_scope: str
     model: str | None = None
     max_turns: int | None = None
     effort: str | None = None
@@ -21,8 +22,9 @@ async def main(ctx: HackbotContext) -> TestPlanGeneratorResult:
     firefox_path = str(install_firefox_nightly())
 
     return await run_test_plan_generator(
-        feature=inputs.feature,
-        feature_details=inputs.feature_details,
+        feature_name=inputs.feature_name,
+        feature_description=inputs.feature_description,
+        test_scope=inputs.test_scope,
         model=inputs.model,
         max_turns=inputs.max_turns,
         effort=inputs.effort,
diff --git a/agents/test-plan-generator/hackbot_agents/test_plan_generator/agent.py b/agents/test-plan-generator/hackbot_agents/test_plan_generator/agent.py
index 20f93a0283..0e4b81128d 100644
--- a/agents/test-plan-generator/hackbot_agents/test_plan_generator/agent.py
+++ b/agents/test-plan-generator/hackbot_agents/test_plan_generator/agent.py
@@ -37,11 +37,16 @@ def load_system_prompt() -> str:
     return (HERE / "prompts" / "system.md").read_text()
 
 
-def build_user_prompt(feature: str, feature_details: str) -> str:
+def build_user_prompt(
+    feature_name: str, feature_description: str, test_scope: str
+) -> str:
     return (
-        "Generate and run a Firefox QA test plan for this feature.\n\n"
-        f"Feature:\n{feature}\n\n"
-        f"Feature details:\n{feature_details}\n\n"
+        "Generate and run a Firefox QA test plan from these inputs.\n\n"
+        f"Feature name:\n{feature_name}\n\n"
+        f"Feature description:\n{feature_description}\n\n"
+        f"Test scope:\n{test_scope}\n\n"
+        "Use the provided feature name as the structured result feature. The "
+        "generated test cases must stay within the test scope.\n\n"
         "Follow the required workflow exactly: generate 10 cases first, run "
         "them in order, stop each case on first failed step, and submit the "
         "structured result."
@@ -50,8 +55,9 @@ def build_user_prompt(feature: str, feature_details: str) -> str:
 
 async def run_test_plan_generator(
     *,
-    feature: str,
-    feature_details: str,
+    feature_name: str,
+    feature_description: str,
+    test_scope: str,
     model: str | None = None,
     max_turns: int | None = None,
     effort: str | None = None,
@@ -60,7 +66,8 @@ async def run_test_plan_generator(
     log: Path | None = None,
 ) -> TestPlanGeneratorResult:
     """Generate and run a Firefox QA test plan for one feature."""
-    logger.info("generating Firefox QA test plan for %s", feature)
+    subject = feature_name
+    logger.info("generating Firefox QA test plan for %s", subject)
 
     devtools_server = build_devtools_server(
         firefox_path=Path(firefox_path) if firefox_path else None,
@@ -93,24 +100,26 @@ async def run_test_plan_generator(
 
     result_msg: ResultMessage | None = None
     with Reporter(verbose=verbose, log_path=log) as reporter:
-        reporter.header(feature)
+        reporter.header(subject)
         async with ClaudeSDKClient(options=options) as client:
-            await client.query(build_user_prompt(feature, feature_details))
+            await client.query(
+                build_user_prompt(feature_name, feature_description, test_scope)
+            )
             async for msg in client.receive_response():
                 reporter.message(msg)
                 if isinstance(msg, ResultMessage):
                     result_msg = msg
 
     if result_msg is None:
-        raise AgentError(f"{feature}: agent produced no result message")
+        raise AgentError(f"{subject}: agent produced no result message")
     if result_msg.is_error:
         raise AgentError(
-            f"{feature} test-plan generation failed: "
+            f"{subject} test-plan generation failed: "
             f"{result_msg.result or result_msg.subtype}"
         )
     if result_collector.result is None:
         raise AgentError(
-            f"{feature}: agent finished without submitting a result via submit_result"
+            f"{subject}: agent finished without submitting a result via submit_result"
         )
 
     return TestPlanGeneratorResult(
diff --git a/agents/test-plan-generator/hackbot_agents/test_plan_generator/prompts/system.md b/agents/test-plan-generator/hackbot_agents/test_plan_generator/prompts/system.md
index f584d6c739..45c5913362 100644
--- a/agents/test-plan-generator/hackbot_agents/test_plan_generator/prompts/system.md
+++ b/agents/test-plan-generator/hackbot_agents/test_plan_generator/prompts/system.md
@@ -1,17 +1,21 @@
 You are a Firefox QA test-plan generation and execution agent.
 
-Your job is to generate test cases from the provided Firefox feature details,
-run them in Firefox through the available DevTools MCP tools, and report only
-pass/fail/unsuitable results. Do not diagnose, fix, patch, or propose changes.
+Your job is to generate test cases from the provided Firefox feature name,
+feature description, and test scope, run them in Firefox through the available
+DevTools MCP tools, and report only pass/fail/unsuitable results. Do not
+diagnose, fix, patch, or propose changes.
 
 ## Required workflow
 
 1. Generate exactly 10 test cases before running any case.
+   - Use only the feature name, feature description, and test scope as your
+     source material.
+   - Use the provided feature name as the structured result feature.
+   - Do not generate cases outside the test scope.
 2. Each test case must have:
    - A short title.
    - A selected execution context: `chrome` or `content`.
-   - Optional preconditions only when they are truly needed.
-   - 1 to 6 concise, ordered test steps.
+   - Use concise ordered test steps.
 3. Run the 10 test cases in order.
 4. Submit the final structured result with `submit_result`.
 
@@ -61,6 +65,29 @@ any case where you are unsure which context is correct.
 - Do not debug or explain root cause.
 - Do not propose fixes.
 
+## Test case style examples
+
+Use concise, manual-QA-style titles and steps like these. These are examples of
+tone and granularity only; do not generate these exact cases unless they are
+inside the requested test scope.
+
+Example test case: Verify that the user can add multiple Highlights to the text inside the PDF file
+
+Example test steps:
+
+1. Click the Highlight button.
+2. Add several Highlights to any text inside the PDF file.
+3. Save or Print the PDF file and reopen the file in a new Tab.
+
+Example test case: Ensure that rich entities are shown in Address Bar history if
+the user interacted with them
+
+Example test steps:
+
+1. Click inside the Address Bar and select the Google search shortcut.
+2. Select a rich suggestion and press Enter.
+3. Open a new tab and type the first letters of the previously searched term.
+
 ## Unsuitable cases
 
 Mark a case as `unsuitable` only if it requires:
diff --git a/services/hackbot-api/app/schemas.py b/services/hackbot-api/app/schemas.py
index 8e01996453..d38cee7ce7 100644
--- a/services/hackbot-api/app/schemas.py
+++ b/services/hackbot-api/app/schemas.py
@@ -84,8 +84,9 @@ def _require_subject(self) -> "AutowebcompatReproInputs":
 
 
 class TestPlanGeneratorInputs(BaseModel):
-    feature: str
-    feature_details: str
+    feature_name: str
+    feature_description: str
+    test_scope: str
     model: str | None = None
     max_turns: int | None = None
     effort: str | None = None
diff --git a/services/hackbot-api/tests/test_agents.py b/services/hackbot-api/tests/test_agents.py
index 7f0d8abf6a..6816efbe49 100644
--- a/services/hackbot-api/tests/test_agents.py
+++ b/services/hackbot-api/tests/test_agents.py
@@ -39,22 +39,27 @@ def test_bug_fix_registry_uses_default_env_serializer():
     assert spec.input_schema is BugFixInputs
 
 
-def test_test_plan_generator_inputs_require_feature_details():
+def test_test_plan_generator_inputs_require_feature_description():
     with pytest.raises(ValidationError):
-        PlanGeneratorInputs(feature="Bookmarks and History")
+        PlanGeneratorInputs(
+            feature_name="Bookmarks and History",
+            test_scope="Bookmarks toolbar behavior.",
+        )
 
 
 def test_test_plan_generator_env_serialization():
     env = model_to_env(
         PlanGeneratorInputs(
-            feature="Bookmarks and History",
-            feature_details="Bookmarks toolbar behavior",
+            feature_name="Bookmarks and History",
+            feature_description="Bookmarks and history controls in Firefox.",
+            test_scope="Bookmarks toolbar behavior.",
         )
     )
 
     assert env == {
-        "FEATURE": "Bookmarks and History",
-        "FEATURE_DETAILS": "Bookmarks toolbar behavior",
+        "FEATURE_NAME": "Bookmarks and History",
+        "FEATURE_DESCRIPTION": "Bookmarks and history controls in Firefox.",
+        "TEST_SCOPE": "Bookmarks toolbar behavior.",
     }
 
 

From a8a09b0d0b8a4238a90718d92d957e4dc51466d2 Mon Sep 17 00:00:00 2001
From: John Pangas <swiftyxswaggy@outlook.com>
Date: Fri, 26 Jun 2026 01:55:00 -0600
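Subject: [PATCH 3/3] Cap the generated test cases at 20

Allow 1 to 20 generated test cases instead of exactly 10, and require a
failure reason for failed steps and for failed or unsuitable cases.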
---
 .../test_plan_generator/agent.py              |  6 +--
 .../test_plan_generator/prompts/system.md     | 20 ++++++---
 .../test_plan_generator/result.py             | 44 +++++++++++++++----
 3 files changed, 53 insertions(+), 17 deletions(-)

diff --git a/agents/test-plan-generator/hackbot_agents/test_plan_generator/agent.py b/agents/test-plan-generator/hackbot_agents/test_plan_generator/agent.py
index 0e4b81128d..a680f52d17 100644
--- a/agents/test-plan-generator/hackbot_agents/test_plan_generator/agent.py
+++ b/agents/test-plan-generator/hackbot_agents/test_plan_generator/agent.py
@@ -47,9 +47,9 @@ def build_user_prompt(
         f"Test scope:\n{test_scope}\n\n"
         "Use the provided feature name as the structured result feature. The "
         "generated test cases must stay within the test scope.\n\n"
-        "Follow the required workflow exactly: generate 10 cases first, run "
-        "them in order, stop each case on first failed step, and submit the "
-        "structured result."
+        "Follow the required workflow exactly: generate the appropriate number "
+        "of cases first, with no more than 20 cases, run them in order, stop "
+        "each case on first failed step, and submit the structured result."
     )
 
 
diff --git a/agents/test-plan-generator/hackbot_agents/test_plan_generator/prompts/system.md b/agents/test-plan-generator/hackbot_agents/test_plan_generator/prompts/system.md
index 45c5913362..dd79d0b203 100644
--- a/agents/test-plan-generator/hackbot_agents/test_plan_generator/prompts/system.md
+++ b/agents/test-plan-generator/hackbot_agents/test_plan_generator/prompts/system.md
@@ -7,7 +7,8 @@ diagnose, fix, patch, or propose changes.
 
 ## Required workflow
 
-1. Generate exactly 10 test cases before running any case.
+1. Generate between 1 and 20 test cases, as many as the test scope warrants,
+   before running any case.
    - Use only the feature name, feature description, and test scope as your
      source material.
    - Use the provided feature name as the structured result feature.
@@ -16,7 +17,7 @@ diagnose, fix, patch, or propose changes.
    - A short title.
    - A selected execution context: `chrome` or `content`.
    - Use concise ordered test steps.
-3. Run the 10 test cases in order.
+3. Run the test cases and test steps in order.
 4. Submit the final structured result with `submit_result`.
 
 ## Context selection
@@ -61,9 +62,13 @@ any case where you are unsure which context is correct.
 - Call only the tools needed for the current step.
 - If a step fails, mark that step failed, mark the case failed, stop that case,
   and move to the next case.
+- When a step fails, include a concise failure reason based only on observed
+  behavior. The reason should help developers understand what failed later, but
+  it must not speculate beyond the evidence or propose a fix.
+- When a case fails or is unsuitable, include a concise case-level failure
+  reason.
 - Do not try alternate approaches to make a failing step pass.
-- Do not debug or explain root cause.
-- Do not propose fixes.
+- Do not debug deeply, infer root cause, or propose fixes.
 
 ## Test case style examples
 
@@ -103,5 +108,8 @@ Mark a case as `unsuitable` only if it requires:
 ## Reporting
 
 The final answer must be submitted through `submit_result` exactly once. A prose
-message is not enough. Include exactly 10 generated test cases and exactly 10
-case results.
+message is not enough. Include one case result for every generated test case.
+
+For failed steps, set `failure_reason` to a short explanation of the observed
+failure. For failed or unsuitable cases, set the case-level `failure_reason` as
+well. Leave `failure_reason` empty for passed steps and passed cases.
diff --git a/agents/test-plan-generator/hackbot_agents/test_plan_generator/result.py b/agents/test-plan-generator/hackbot_agents/test_plan_generator/result.py
index d8a36a048f..50c702709c 100644
--- a/agents/test-plan-generator/hackbot_agents/test_plan_generator/result.py
+++ b/agents/test-plan-generator/hackbot_agents/test_plan_generator/result.py
@@ -12,7 +12,7 @@
 
 
 class GeneratedTestCase(BaseModel):
-    id: int = Field(description="Sequential case id from 1 through 10.")
+    id: int = Field(description="Sequential case id starting at 1.")
     title: str
     context: Literal["chrome", "content"]
     preconditions: str | None = None
@@ -31,6 +31,19 @@ class StepResult(BaseModel):
     step_number: int
     status: Literal["passed", "failed", "not_run"]
     observation: str
+    failure_reason: str | None = Field(
+        default=None,
+        description=(
+            "Required when status is failed. A concise reason why the step failed, "
+            "based only on what was observed during execution."
+        ),
+    )
+
+    @model_validator(mode="after")
+    def _validate_failure_reason(self) -> "StepResult":
+        if self.status == "failed" and not self.failure_reason:
+            raise ValueError("failed steps must include failure_reason")
+        return self
 
 
 class TestCaseResult(BaseModel):
@@ -38,6 +51,19 @@ class TestCaseResult(BaseModel):
     status: Literal["passed", "failed", "unsuitable"]
     step_results: list[StepResult]
     summary: str
+    failure_reason: str | None = Field(
+        default=None,
+        description=(
+            "Required when status is failed or unsuitable. A concise reason why "
+            "the case failed or could not be run, useful for later developer review."
+        ),
+    )
+
+    @model_validator(mode="after")
+    def _validate_failure_reason(self) -> "TestCaseResult":
+        if self.status in {"failed", "unsuitable"} and not self.failure_reason:
+            raise ValueError("failed or unsuitable cases must include failure_reason")
+        return self
 
 
 class TestPlanResult(BaseModel):
@@ -48,17 +74,19 @@ class TestPlanResult(BaseModel):
 
     @model_validator(mode="after")
     def _validate_result(self) -> "TestPlanResult":
-        if len(self.generated_test_cases) != 10:
-            raise ValueError("generated_test_cases must contain exactly 10 cases")
+        case_count = len(self.generated_test_cases)
+        if not 1 <= case_count <= 20:
+            raise ValueError("generated_test_cases must contain 1 to 20 cases")
 
         case_ids = [case.id for case in self.generated_test_cases]
-        if case_ids != list(range(1, 11)):
-            raise ValueError("generated test case ids must be 1 through 10")
+        expected_ids = list(range(1, case_count + 1))
+        if case_ids != expected_ids:
+            raise ValueError("generated test case ids must be sequential starting at 1")
 
         result_ids = [result.id for result in self.results]
-        if result_ids != list(range(1, 11)):
+        if result_ids != expected_ids:
             raise ValueError(
-                "results must contain one result for each case id 1 through 10"
+                "results must contain one result for each generated test case id"
             )
 
         return self
@@ -83,7 +111,7 @@ def build_result_server(collector: ResultCollector) -> McpServerConfig:
     @tool(
         "submit_result",
         "Submit the final generated Firefox QA test plan and execution result. "
-        "Call exactly once, after all 10 test cases have been generated and run.",
+        "Call exactly once, after all generated test cases have been run.",
         SUBMIT_RESULT_SCHEMA,
     )
     async def submit_result(args: dict) -> dict: