From 9650bf46c6d3cad910364ec15758131cc038db3d Mon Sep 17 00:00:00 2001 From: John Pangas Date: Thu, 25 Jun 2026 20:19:42 -0600 Subject: [PATCH 1/3] Add test plan agent --- agents/test-plan-generator/Dockerfile | 50 ++++++++ agents/test-plan-generator/compose.yml | 19 +++ agents/test-plan-generator/hackbot.toml | 3 + .../test_plan_generator/__init__.py | 1 + .../test_plan_generator/__main__.py | 36 ++++++ .../test_plan_generator/agent.py | 120 ++++++++++++++++++ .../test_plan_generator/config.py | 35 +++++ .../test_plan_generator/devtools_mcp.py | 29 +++++ .../test_plan_generator/firefox_install.py | 41 ++++++ .../test_plan_generator/prompts/system.md | 80 ++++++++++++ .../test_plan_generator/result.py | 99 +++++++++++++++ agents/test-plan-generator/pyproject.toml | 22 ++++ docker-compose.yml | 1 + services/hackbot-api/app/agents.py | 15 ++- services/hackbot-api/app/schemas.py | 8 ++ services/hackbot-api/tests/test_agents.py | 36 +++++- uv.lock | 22 ++++ 17 files changed, 615 insertions(+), 2 deletions(-) create mode 100644 agents/test-plan-generator/Dockerfile create mode 100644 agents/test-plan-generator/compose.yml create mode 100644 agents/test-plan-generator/hackbot.toml create mode 100644 agents/test-plan-generator/hackbot_agents/test_plan_generator/__init__.py create mode 100644 agents/test-plan-generator/hackbot_agents/test_plan_generator/__main__.py create mode 100644 agents/test-plan-generator/hackbot_agents/test_plan_generator/agent.py create mode 100644 agents/test-plan-generator/hackbot_agents/test_plan_generator/config.py create mode 100644 agents/test-plan-generator/hackbot_agents/test_plan_generator/devtools_mcp.py create mode 100644 agents/test-plan-generator/hackbot_agents/test_plan_generator/firefox_install.py create mode 100644 agents/test-plan-generator/hackbot_agents/test_plan_generator/prompts/system.md create mode 100644 agents/test-plan-generator/hackbot_agents/test_plan_generator/result.py create mode 100644 
agents/test-plan-generator/pyproject.toml diff --git a/agents/test-plan-generator/Dockerfile b/agents/test-plan-generator/Dockerfile new file mode 100644 index 0000000000..ef082714da --- /dev/null +++ b/agents/test-plan-generator/Dockerfile @@ -0,0 +1,50 @@ +FROM python:3.12 AS builder + +COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/ + +ENV UV_PROJECT_ENVIRONMENT=/opt/venv + +WORKDIR /app + +# Install external deps without building workspace members. +RUN --mount=type=cache,target=/root/.cache/uv \ + --mount=type=bind,source=pyproject.toml,target=pyproject.toml \ + --mount=type=bind,source=uv.lock,target=uv.lock \ + --mount=type=bind,source=VERSION,target=VERSION \ + uv sync --frozen --no-dev --no-install-workspace --package hackbot-agent-test-plan-generator + +RUN --mount=type=cache,target=/root/.cache/uv \ + --mount=type=bind,target=/app,rw \ + uv sync --locked --no-dev --no-editable --package hackbot-agent-test-plan-generator + +FROM python:3.12 AS agent + +COPY --from=builder /opt/venv /opt/venv +WORKDIR /app + +ENV PYTHONUNBUFFERED=1 +ENV PYTHONDONTWRITEBYTECODE=1 +ENV PATH="/opt/venv/bin:$PATH" + +# The Firefox DevTools MCP server is launched through npx. Firefox itself is +# downloaded at agent startup, so the image only needs Node/npm and the shared +# libraries required by headless Firefox. +RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + nodejs npm \ + ca-certificates \ + libgtk-3-0 libdbus-glib-1-2 libx11-xcb1 libxtst6 libxt6 \ + libasound2 libpci3 \ + && rm -rf /var/lib/apt/lists/* + +# hackbot.toml lives at the agent root (not inside the package), so copy it into +# the working dir; the runtime discovers it there (cwd) at startup. 
+COPY agents/test-plan-generator/hackbot.toml /app/hackbot.toml + +RUN useradd --create-home --shell /bin/bash agent \ + && mkdir -p /workspace \ + && chown agent:agent /workspace + +USER agent + +CMD ["python", "-m", "hackbot_agents.test_plan_generator"] diff --git a/agents/test-plan-generator/compose.yml b/agents/test-plan-generator/compose.yml new file mode 100644 index 0000000000..ee4b076023 --- /dev/null +++ b/agents/test-plan-generator/compose.yml @@ -0,0 +1,19 @@ +services: + test-plan-generator-agent: + build: + context: ../.. + dockerfile: agents/test-plan-generator/Dockerfile + target: agent + environment: + - RUN_ID + - FEATURE + - FEATURE_DETAILS + - MODEL + - MAX_TURNS + - EFFORT + - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:?error} + # No uploader locally: summary/logs/attachments are written under + # /artifacts/, bind-mounted to the host's ~/hackbot/artifacts. + - ARTIFACTS_DIR=/artifacts + volumes: + - ${HOME}/hackbot/artifacts:/artifacts diff --git a/agents/test-plan-generator/hackbot.toml b/agents/test-plan-generator/hackbot.toml new file mode 100644 index 0000000000..5912e57781 --- /dev/null +++ b/agents/test-plan-generator/hackbot.toml @@ -0,0 +1,3 @@ +# test-plan-generator needs no platform prep: no [source] checkout and no +# [firefox] build. It downloads a fresh Firefox Nightly at startup and drives it +# through the Firefox DevTools MCP server. 
diff --git a/agents/test-plan-generator/hackbot_agents/test_plan_generator/__init__.py b/agents/test-plan-generator/hackbot_agents/test_plan_generator/__init__.py new file mode 100644 index 0000000000..8b13789179 --- /dev/null +++ b/agents/test-plan-generator/hackbot_agents/test_plan_generator/__init__.py @@ -0,0 +1 @@ + diff --git a/agents/test-plan-generator/hackbot_agents/test_plan_generator/__main__.py b/agents/test-plan-generator/hackbot_agents/test_plan_generator/__main__.py new file mode 100644 index 0000000000..9b4e5b3357 --- /dev/null +++ b/agents/test-plan-generator/hackbot_agents/test_plan_generator/__main__.py @@ -0,0 +1,36 @@ +from hackbot_runtime import HackbotContext, run_async +from pydantic_settings import BaseSettings, SettingsConfigDict + +from .agent import TestPlanGeneratorResult, run_test_plan_generator +from .firefox_install import install_firefox_nightly + + +class AgentInputs(BaseSettings): + feature: str + feature_details: str + model: str | None = None + max_turns: int | None = None + effort: str | None = None + + model_config = SettingsConfigDict(extra="ignore") + + +async def main(ctx: HackbotContext) -> TestPlanGeneratorResult: + inputs = AgentInputs() + + firefox_path = str(install_firefox_nightly()) + + return await run_test_plan_generator( + feature=inputs.feature, + feature_details=inputs.feature_details, + model=inputs.model, + max_turns=inputs.max_turns, + effort=inputs.effort, + firefox_path=firefox_path, + log=ctx.log_path, + verbose=True, + ) + + +if __name__ == "__main__": + run_async(main) diff --git a/agents/test-plan-generator/hackbot_agents/test_plan_generator/agent.py b/agents/test-plan-generator/hackbot_agents/test_plan_generator/agent.py new file mode 100644 index 0000000000..20f93a0283 --- /dev/null +++ b/agents/test-plan-generator/hackbot_agents/test_plan_generator/agent.py @@ -0,0 +1,120 @@ +"""Firefox QA test-plan generator and executor.""" + +from __future__ import annotations + +import logging +from pathlib 
import Path + +from claude_agent_sdk import ( + ClaudeAgentOptions, + ClaudeSDKClient, + McpServerConfig, + ResultMessage, +) +from hackbot_runtime import AgentError, HackbotAgentResult +from hackbot_runtime.claude import Reporter + +from .config import DEVTOOLS_TOOLS +from .devtools_mcp import build_devtools_server +from .result import ( + RESULT_SERVER_NAME, + SUBMIT_RESULT_TOOL, + ResultCollector, + TestPlanResult, + build_result_server, +) + +HERE = Path(__file__).resolve().parent + +logger = logging.getLogger("test-plan-generator") + + +class TestPlanGeneratorResult(HackbotAgentResult): + result: TestPlanResult | None = None + + +def load_system_prompt() -> str: + return (HERE / "prompts" / "system.md").read_text() + + +def build_user_prompt(feature: str, feature_details: str) -> str: + return ( + "Generate and run a Firefox QA test plan for this feature.\n\n" + f"Feature:\n{feature}\n\n" + f"Feature details:\n{feature_details}\n\n" + "Follow the required workflow exactly: generate 10 cases first, run " + "them in order, stop each case on first failed step, and submit the " + "structured result." 
+ ) + + +async def run_test_plan_generator( + *, + feature: str, + feature_details: str, + model: str | None = None, + max_turns: int | None = None, + effort: str | None = None, + firefox_path: str | None = None, + verbose: bool = False, + log: Path | None = None, +) -> TestPlanGeneratorResult: + """Generate and run a Firefox QA test plan for one feature.""" + logger.info("generating Firefox QA test plan for %s", feature) + + devtools_server = build_devtools_server( + firefox_path=Path(firefox_path) if firefox_path else None, + headless=True, + enable_script=True, + ) + + result_collector = ResultCollector() + result_server = build_result_server(result_collector) + + mcp_servers: dict[str, McpServerConfig] = { + "firefox-devtools": devtools_server, + RESULT_SERVER_NAME: result_server, + } + + options = ClaudeAgentOptions( + system_prompt=load_system_prompt(), + mcp_servers=mcp_servers, + permission_mode="bypassPermissions", + allowed_tools=[ + *DEVTOOLS_TOOLS, + SUBMIT_RESULT_TOOL, + ], + model=model, + max_turns=max_turns, + **({"effort": effort} if effort else {}), + setting_sources=[], + max_buffer_size=10 * 1024 * 1024, + ) + + result_msg: ResultMessage | None = None + with Reporter(verbose=verbose, log_path=log) as reporter: + reporter.header(feature) + async with ClaudeSDKClient(options=options) as client: + await client.query(build_user_prompt(feature, feature_details)) + async for msg in client.receive_response(): + reporter.message(msg) + if isinstance(msg, ResultMessage): + result_msg = msg + + if result_msg is None: + raise AgentError(f"{feature}: agent produced no result message") + if result_msg.is_error: + raise AgentError( + f"{feature} test-plan generation failed: " + f"{result_msg.result or result_msg.subtype}" + ) + if result_collector.result is None: + raise AgentError( + f"{feature}: agent finished without submitting a result via submit_result" + ) + + return TestPlanGeneratorResult( + result=result_collector.result, + 
num_turns=result_msg.num_turns, + total_cost_usd=result_msg.total_cost_usd, + ) diff --git a/agents/test-plan-generator/hackbot_agents/test_plan_generator/config.py b/agents/test-plan-generator/hackbot_agents/test_plan_generator/config.py new file mode 100644 index 0000000000..64000350d1 --- /dev/null +++ b/agents/test-plan-generator/hackbot_agents/test_plan_generator/config.py @@ -0,0 +1,35 @@ +# Firefox DevTools MCP tools (@mozilla/firefox-devtools-mcp-moz), exposed under +# the "firefox-devtools" server name. The QA agent may choose content/page tools +# for web-page behavior or chrome-context tools for Firefox browser UI. +DEVTOOLS_TOOLS = [ + "mcp__firefox-devtools__list_pages", + "mcp__firefox-devtools__new_page", + "mcp__firefox-devtools__navigate_page", + "mcp__firefox-devtools__select_page", + "mcp__firefox-devtools__close_page", + "mcp__firefox-devtools__take_snapshot", + "mcp__firefox-devtools__resolve_uid_to_selector", + "mcp__firefox-devtools__clear_snapshot", + "mcp__firefox-devtools__click_by_uid", + "mcp__firefox-devtools__hover_by_uid", + "mcp__firefox-devtools__fill_by_uid", + "mcp__firefox-devtools__fill_form_by_uid", + "mcp__firefox-devtools__drag_by_uid_to_uid", + "mcp__firefox-devtools__upload_file_by_uid", + "mcp__firefox-devtools__list_console_messages", + "mcp__firefox-devtools__clear_console_messages", + "mcp__firefox-devtools__list_network_requests", + "mcp__firefox-devtools__get_network_request", + "mcp__firefox-devtools__screenshot_page", + "mcp__firefox-devtools__screenshot_by_uid", + "mcp__firefox-devtools__evaluate_script", + "mcp__firefox-devtools__accept_dialog", + "mcp__firefox-devtools__dismiss_dialog", + "mcp__firefox-devtools__navigate_history", + "mcp__firefox-devtools__set_viewport_size", + "mcp__firefox-devtools__get_firefox_info", + "mcp__firefox-devtools__get_firefox_output", + "mcp__firefox-devtools__list_chrome_contexts", + "mcp__firefox-devtools__select_chrome_context", + "mcp__firefox-devtools__evaluate_chrome_script", 
+] diff --git a/agents/test-plan-generator/hackbot_agents/test_plan_generator/devtools_mcp.py b/agents/test-plan-generator/hackbot_agents/test_plan_generator/devtools_mcp.py new file mode 100644 index 0000000000..cd0744f5b2 --- /dev/null +++ b/agents/test-plan-generator/hackbot_agents/test_plan_generator/devtools_mcp.py @@ -0,0 +1,29 @@ +from __future__ import annotations + +from pathlib import Path + +from claude_agent_sdk.types import McpStdioServerConfig + +PACKAGE = "@mozilla/firefox-devtools-mcp-moz" + + +def build_devtools_server( + firefox_path: Path | None = None, + *, + headless: bool = True, + enable_script: bool = True, +) -> McpStdioServerConfig: + """Build the stdio config for the Firefox DevTools MCP server.""" + args = [PACKAGE] + if headless: + args.append("--headless") + if enable_script: + args.append("--enable-script") + if firefox_path is not None: + args += ["--firefox-path", str(firefox_path)] + + return McpStdioServerConfig( + command="npx", + args=args, + env={"MOZ_REMOTE_ALLOW_SYSTEM_ACCESS": "1"}, + ) diff --git a/agents/test-plan-generator/hackbot_agents/test_plan_generator/firefox_install.py b/agents/test-plan-generator/hackbot_agents/test_plan_generator/firefox_install.py new file mode 100644 index 0000000000..0ea2d8731c --- /dev/null +++ b/agents/test-plan-generator/hackbot_agents/test_plan_generator/firefox_install.py @@ -0,0 +1,41 @@ +"""Download and install a prebuilt Firefox Nightly for the agent to drive.""" + +from __future__ import annotations + +import logging +import platform +import shutil +from pathlib import Path + +import mozdownload +import mozinstall + +INSTALL_DIR = Path.home() / "firefox" +BRANCH = "mozilla-central" + +logger = logging.getLogger("test-plan-generator") + + +def install_firefox_nightly() -> Path: + mozdownload_platform = ( + "linux-arm64" if platform.machine() in ("aarch64", "arm64") else None + ) + + if INSTALL_DIR.exists(): + shutil.rmtree(INSTALL_DIR) + INSTALL_DIR.mkdir(parents=True) + + 
logger.info("downloading Firefox Nightly...") + scraper = mozdownload.FactoryScraper( + "daily", + branch=BRANCH, + platform=mozdownload_platform, + destination=str(INSTALL_DIR), + ) + archive = scraper.download() + + install_folder = mozinstall.install(archive, str(INSTALL_DIR)) + binary = Path(mozinstall.get_binary(install_folder, "firefox")) + + logger.info("installed Firefox at %s", binary) + return binary diff --git a/agents/test-plan-generator/hackbot_agents/test_plan_generator/prompts/system.md b/agents/test-plan-generator/hackbot_agents/test_plan_generator/prompts/system.md new file mode 100644 index 0000000000..f584d6c739 --- /dev/null +++ b/agents/test-plan-generator/hackbot_agents/test_plan_generator/prompts/system.md @@ -0,0 +1,80 @@ +You are a Firefox QA test-plan generation and execution agent. + +Your job is to generate test cases from the provided Firefox feature details, +run them in Firefox through the available DevTools MCP tools, and report only +pass/fail/unsuitable results. Do not diagnose, fix, patch, or propose changes. + +## Required workflow + +1. Generate exactly 10 test cases before running any case. +2. Each test case must have: + - A short title. + - A selected execution context: `chrome` or `content`. + - Optional preconditions only when they are truly needed. + - 1 to 6 concise, ordered test steps. +3. Run the 10 test cases in order. +4. Submit the final structured result with `submit_result`. + +## Context selection + +Choose the context per test case. + +Use `content` for normal website or web-page behavior. + +Use `chrome` for Firefox browser UI, preferences, toolbar, bookmarks/history, +downloads UI, browser menus, browser panels, PDF viewer chrome interactions, or +any case where you are unsure which context is correct. + +### Content context rules + +- Use page/content tools such as creating or selecting pages, navigating, + snapshots, UID interactions, console/network inspection, screenshots, and + `evaluate_script`. 
+- Do not use chrome-context tools for a content-context case. + +### Chrome context rules + +- Your first two DevTools MCP actions for a chrome-context case must be: + 1. `list_chrome_contexts` + 2. `select_chrome_context` for the target browser window +- Use `evaluate_chrome_script` for browser UI interaction. +- Wrap JavaScript in an immediately invoked function expression that explicitly + returns a value, for example: + +```javascript +(() => { + return gBrowser.tabs.length; +})(); +``` + +- Do not mix content-context tools into a chrome-context case unless a generated + test step explicitly needs a web page as test data. + +## Execution rules + +- Execute steps exactly in the order generated. +- Do not skip, reorder, combine, or rewrite steps after generation. +- Call only the tools needed for the current step. +- If a step fails, mark that step failed, mark the case failed, stop that case, + and move to the next case. +- Do not try alternate approaches to make a failing step pass. +- Do not debug or explain root cause. +- Do not propose fixes. + +## Unsuitable cases + +Mark a case as `unsuitable` only if it requires: + +- Restarting Firefox during the test flow. +- Pixel-perfect or visual comparison. +- Installing external apps beyond basic add-ons. +- Confirming real hardware behavior such as microphone, camera, or printer. +- Changing, verifying, or interacting with OS/system settings. +- Changing, verifying, or interacting with the system desktop or OS UI. +- Firefox Sync, cross-device verification, or account-sync behavior. + +## Reporting + +The final answer must be submitted through `submit_result` exactly once. A prose +message is not enough. Include exactly 10 generated test cases and exactly 10 +case results. 
diff --git a/agents/test-plan-generator/hackbot_agents/test_plan_generator/result.py b/agents/test-plan-generator/hackbot_agents/test_plan_generator/result.py new file mode 100644 index 0000000000..d8a36a048f --- /dev/null +++ b/agents/test-plan-generator/hackbot_agents/test_plan_generator/result.py @@ -0,0 +1,99 @@ +"""Structured result reporting for the test-plan-generator agent.""" + +from __future__ import annotations + +from typing import Literal + +from claude_agent_sdk import McpServerConfig, create_sdk_mcp_server, tool +from pydantic import BaseModel, Field, ValidationError, model_validator + +RESULT_SERVER_NAME = "test-plan-generator" +SUBMIT_RESULT_TOOL = f"mcp__{RESULT_SERVER_NAME}__submit_result" + + +class GeneratedTestCase(BaseModel): + id: int = Field(description="Sequential case id from 1 through 10.") + title: str + context: Literal["chrome", "content"] + preconditions: str | None = None + steps: list[str] = Field( + description="Concise test steps for this case; between 1 and 6 steps." 
+ ) + + @model_validator(mode="after") + def _validate_steps(self) -> "GeneratedTestCase": + if not 1 <= len(self.steps) <= 6: + raise ValueError("each generated test case must have 1 to 6 steps") + return self + + +class StepResult(BaseModel): + step_number: int + status: Literal["passed", "failed", "not_run"] + observation: str + + +class TestCaseResult(BaseModel): + id: int + status: Literal["passed", "failed", "unsuitable"] + step_results: list[StepResult] + summary: str + + +class TestPlanResult(BaseModel): + feature: str + generated_test_cases: list[GeneratedTestCase] + results: list[TestCaseResult] + summary: str + + @model_validator(mode="after") + def _validate_result(self) -> "TestPlanResult": + if len(self.generated_test_cases) != 10: + raise ValueError("generated_test_cases must contain exactly 10 cases") + + case_ids = [case.id for case in self.generated_test_cases] + if case_ids != list(range(1, 11)): + raise ValueError("generated test case ids must be 1 through 10") + + result_ids = [result.id for result in self.results] + if result_ids != list(range(1, 11)): + raise ValueError( + "results must contain one result for each case id 1 through 10" + ) + + return self + + +SUBMIT_RESULT_SCHEMA = { + **TestPlanResult.model_json_schema(), + "additionalProperties": False, +} + + +class ResultCollector: + """Holds the result submitted by the agent, if any.""" + + def __init__(self) -> None: + self.result: TestPlanResult | None = None + + +def build_result_server(collector: ResultCollector) -> McpServerConfig: + """Build an in-process MCP server exposing the ``submit_result`` tool.""" + + @tool( + "submit_result", + "Submit the final generated Firefox QA test plan and execution result. 
" + "Call exactly once, after all 10 test cases have been generated and run.", + SUBMIT_RESULT_SCHEMA, + ) + async def submit_result(args: dict) -> dict: + try: + collector.result = TestPlanResult.model_validate(args) + except ValidationError as exc: + return { + "content": [{"type": "text", "text": f"Invalid result: {exc}"}], + "is_error": True, + } + return {"content": [{"type": "text", "text": "Result recorded."}]} + + return create_sdk_mcp_server(name=RESULT_SERVER_NAME, tools=[submit_result]) diff --git a/agents/test-plan-generator/pyproject.toml b/agents/test-plan-generator/pyproject.toml new file mode 100644 index 0000000000..7aeaaf5718 --- /dev/null +++ b/agents/test-plan-generator/pyproject.toml @@ -0,0 +1,22 @@ +[project] +name = "hackbot-agent-test-plan-generator" +version = "0.1.0" +description = "Cloud Run Job image that generates and runs Firefox QA test plans" +requires-python = ">=3.12" +dependencies = [ + "hackbot-runtime[claude-sdk]", + "claude-agent-sdk>=0.1.30", + "mcp>=1.0.0", + "mozdownload", + "mozinstall", +] + +[tool.uv.sources] +hackbot-runtime = { workspace = true } + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.hatch.build.targets.wheel] +packages = ["hackbot_agents"] diff --git a/docker-compose.yml b/docker-compose.yml index cc534a3242..4a0477037a 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -4,6 +4,7 @@ version: "3.8" include: - path: agents/bug-fix/compose.yml + - path: agents/test-plan-generator/compose.yml services: bugbug-base: diff --git a/services/hackbot-api/app/agents.py b/services/hackbot-api/app/agents.py index f5d75f72bc..0d365bcb5a 100644 --- a/services/hackbot-api/app/agents.py +++ b/services/hackbot-api/app/agents.py @@ -4,7 +4,11 @@ from pydantic import BaseModel -from app.schemas import AutowebcompatReproInputs, BugFixInputs +from app.schemas import ( + AutowebcompatReproInputs, + BugFixInputs, + TestPlanGeneratorInputs, +) @dataclass(frozen=True) @@ -57,4 +61,13 
@@ def model_to_env(inputs: BaseModel) -> dict[str, str]: job_name="hackbot-agent-autowebcompat-repro", input_schema=AutowebcompatReproInputs, ), + "test-plan-generator": AgentSpec( + name="test-plan-generator", + description=( + "Generate 10 concise Firefox QA test cases from feature details, " + "run them in Firefox through DevTools MCP, and report pass/fail results." + ), + job_name="hackbot-agent-test-plan-generator", + input_schema=TestPlanGeneratorInputs, + ), } diff --git a/services/hackbot-api/app/schemas.py b/services/hackbot-api/app/schemas.py index f54d133d4b..8e01996453 100644 --- a/services/hackbot-api/app/schemas.py +++ b/services/hackbot-api/app/schemas.py @@ -81,3 +81,11 @@ def _require_subject(self) -> "AutowebcompatReproInputs": if self.bug_data is None and self.bug_id is None: raise ValueError("provide at least one of bug_data or bug_id") return self + + +class TestPlanGeneratorInputs(BaseModel): + feature: str + feature_details: str + model: str | None = None + max_turns: int | None = None + effort: str | None = None diff --git a/services/hackbot-api/tests/test_agents.py b/services/hackbot-api/tests/test_agents.py index c99c9d4689..7f0d8abf6a 100644 --- a/services/hackbot-api/tests/test_agents.py +++ b/services/hackbot-api/tests/test_agents.py @@ -1,7 +1,14 @@ """Tests for the agent registry and generic env serialization.""" +import pytest from app.agents import AGENT_REGISTRY, model_to_env -from app.schemas import BugFixInputs +from app.schemas import ( + BugFixInputs, +) +from app.schemas import ( + TestPlanGeneratorInputs as PlanGeneratorInputs, +) +from pydantic import ValidationError def test_model_to_env_uppercases_and_stringifies(): @@ -30,3 +37,30 @@ def test_bug_fix_registry_uses_default_env_serializer(): # No hand-written build_env: the router falls back to model_to_env. 
assert spec.build_env is None assert spec.input_schema is BugFixInputs + + +def test_test_plan_generator_inputs_require_feature_details(): + with pytest.raises(ValidationError): + PlanGeneratorInputs(feature="Bookmarks and History") + + +def test_test_plan_generator_env_serialization(): + env = model_to_env( + PlanGeneratorInputs( + feature="Bookmarks and History", + feature_details="Bookmarks toolbar behavior", + ) + ) + + assert env == { + "FEATURE": "Bookmarks and History", + "FEATURE_DETAILS": "Bookmarks toolbar behavior", + } + + +def test_test_plan_generator_registry_uses_default_env_serializer(): + spec = AGENT_REGISTRY["test-plan-generator"] + + assert spec.build_env is None + assert spec.job_name == "hackbot-agent-test-plan-generator" + assert spec.input_schema is PlanGeneratorInputs diff --git a/uv.lock b/uv.lock index bb6173d69e..4c432bcdb0 100644 --- a/uv.lock +++ b/uv.lock @@ -24,6 +24,7 @@ members = [ "bugbug-mcp", "hackbot-agent-autowebcompat-repro", "hackbot-agent-bug-fix", + "hackbot-agent-test-plan-generator", "hackbot-api", "hackbot-runtime", "reviewhelper-api", @@ -2417,6 +2418,27 @@ requires-dist = [ { name = "uvicorn", specifier = ">=0.27.0" }, ] +[[package]] +name = "hackbot-agent-test-plan-generator" +version = "0.1.0" +source = { editable = "agents/test-plan-generator" } +dependencies = [ + { name = "claude-agent-sdk" }, + { name = "hackbot-runtime", extra = ["claude-sdk"] }, + { name = "mcp" }, + { name = "mozdownload" }, + { name = "mozinstall" }, +] + +[package.metadata] +requires-dist = [ + { name = "claude-agent-sdk", specifier = ">=0.1.30" }, + { name = "hackbot-runtime", extras = ["claude-sdk"], editable = "libs/hackbot-runtime" }, + { name = "mcp", specifier = ">=1.0.0" }, + { name = "mozdownload" }, + { name = "mozinstall" }, +] + [[package]] name = "hackbot-api" version = "0.1.0" From dad4cbb67105d97373178d35b81ab91afc4f0370 Mon Sep 17 00:00:00 2001 From: John Pangas Date: Fri, 26 Jun 2026 01:37:11 -0600 Subject: [PATCH 2/3] Edit 
the inputs --- agents/test-plan-generator/compose.yml | 5 ++- .../test_plan_generator/__main__.py | 10 +++-- .../test_plan_generator/agent.py | 33 +++++++++++------ .../test_plan_generator/prompts/system.md | 37 ++++++++++++++++--- services/hackbot-api/app/schemas.py | 5 ++- services/hackbot-api/tests/test_agents.py | 17 ++++++--- 6 files changed, 76 insertions(+), 31 deletions(-) diff --git a/agents/test-plan-generator/compose.yml b/agents/test-plan-generator/compose.yml index ee4b076023..987bafbb12 100644 --- a/agents/test-plan-generator/compose.yml +++ b/agents/test-plan-generator/compose.yml @@ -6,8 +6,9 @@ services: target: agent environment: - RUN_ID - - FEATURE - - FEATURE_DETAILS + - FEATURE_NAME + - FEATURE_DESCRIPTION + - TEST_SCOPE - MODEL - MAX_TURNS - EFFORT diff --git a/agents/test-plan-generator/hackbot_agents/test_plan_generator/__main__.py b/agents/test-plan-generator/hackbot_agents/test_plan_generator/__main__.py index 9b4e5b3357..af67eac60b 100644 --- a/agents/test-plan-generator/hackbot_agents/test_plan_generator/__main__.py +++ b/agents/test-plan-generator/hackbot_agents/test_plan_generator/__main__.py @@ -6,8 +6,9 @@ class AgentInputs(BaseSettings): - feature: str - feature_details: str + feature_name: str + feature_description: str + test_scope: str model: str | None = None max_turns: int | None = None effort: str | None = None @@ -21,8 +22,9 @@ async def main(ctx: HackbotContext) -> TestPlanGeneratorResult: firefox_path = str(install_firefox_nightly()) return await run_test_plan_generator( - feature=inputs.feature, - feature_details=inputs.feature_details, + feature_name=inputs.feature_name, + feature_description=inputs.feature_description, + test_scope=inputs.test_scope, model=inputs.model, max_turns=inputs.max_turns, effort=inputs.effort, diff --git a/agents/test-plan-generator/hackbot_agents/test_plan_generator/agent.py b/agents/test-plan-generator/hackbot_agents/test_plan_generator/agent.py index 20f93a0283..0e4b81128d 100644 --- 
a/agents/test-plan-generator/hackbot_agents/test_plan_generator/agent.py +++ b/agents/test-plan-generator/hackbot_agents/test_plan_generator/agent.py @@ -37,11 +37,16 @@ def load_system_prompt() -> str: return (HERE / "prompts" / "system.md").read_text() -def build_user_prompt(feature: str, feature_details: str) -> str: +def build_user_prompt( + feature_name: str, feature_description: str, test_scope: str +) -> str: return ( - "Generate and run a Firefox QA test plan for this feature.\n\n" - f"Feature:\n{feature}\n\n" - f"Feature details:\n{feature_details}\n\n" + "Generate and run a Firefox QA test plan from these inputs.\n\n" + f"Feature name:\n{feature_name}\n\n" + f"Feature description:\n{feature_description}\n\n" + f"Test scope:\n{test_scope}\n\n" + "Use the provided feature name as the structured result feature. The " + "generated test cases must stay within the test scope.\n\n" "Follow the required workflow exactly: generate 10 cases first, run " "them in order, stop each case on first failed step, and submit the " "structured result." 
@@ -50,8 +55,9 @@ def build_user_prompt(feature: str, feature_details: str) -> str: async def run_test_plan_generator( *, - feature: str, - feature_details: str, + feature_name: str, + feature_description: str, + test_scope: str, model: str | None = None, max_turns: int | None = None, effort: str | None = None, @@ -60,7 +66,8 @@ async def run_test_plan_generator( log: Path | None = None, ) -> TestPlanGeneratorResult: """Generate and run a Firefox QA test plan for one feature.""" - logger.info("generating Firefox QA test plan for %s", feature) + subject = feature_name + logger.info("generating Firefox QA test plan for %s", subject) devtools_server = build_devtools_server( firefox_path=Path(firefox_path) if firefox_path else None, @@ -93,24 +100,26 @@ async def run_test_plan_generator( result_msg: ResultMessage | None = None with Reporter(verbose=verbose, log_path=log) as reporter: - reporter.header(feature) + reporter.header(subject) async with ClaudeSDKClient(options=options) as client: - await client.query(build_user_prompt(feature, feature_details)) + await client.query( + build_user_prompt(feature_name, feature_description, test_scope) + ) async for msg in client.receive_response(): reporter.message(msg) if isinstance(msg, ResultMessage): result_msg = msg if result_msg is None: - raise AgentError(f"{feature}: agent produced no result message") + raise AgentError(f"{subject}: agent produced no result message") if result_msg.is_error: raise AgentError( - f"{feature} test-plan generation failed: " + f"{subject} test-plan generation failed: " f"{result_msg.result or result_msg.subtype}" ) if result_collector.result is None: raise AgentError( - f"{feature}: agent finished without submitting a result via submit_result" + f"{subject}: agent finished without submitting a result via submit_result" ) return TestPlanGeneratorResult( diff --git a/agents/test-plan-generator/hackbot_agents/test_plan_generator/prompts/system.md 
b/agents/test-plan-generator/hackbot_agents/test_plan_generator/prompts/system.md index f584d6c739..45c5913362 100644 --- a/agents/test-plan-generator/hackbot_agents/test_plan_generator/prompts/system.md +++ b/agents/test-plan-generator/hackbot_agents/test_plan_generator/prompts/system.md @@ -1,17 +1,21 @@ You are a Firefox QA test-plan generation and execution agent. -Your job is to generate test cases from the provided Firefox feature details, -run them in Firefox through the available DevTools MCP tools, and report only -pass/fail/unsuitable results. Do not diagnose, fix, patch, or propose changes. +Your job is to generate test cases from the provided Firefox feature name, +feature description, and test scope, run them in Firefox through the available +DevTools MCP tools, and report only pass/fail/unsuitable results. Do not +diagnose, fix, patch, or propose changes. ## Required workflow 1. Generate exactly 10 test cases before running any case. + - Use only the feature name, feature description, and test scope as your + source material. + - Use the provided feature name as the structured result feature. + - Do not generate cases outside the test scope. 2. Each test case must have: - A short title. - A selected execution context: `chrome` or `content`. - - Optional preconditions only when they are truly needed. - - 1 to 6 concise, ordered test steps. + - Use concise ordered test steps. 3. Run the 10 test cases in order. 4. Submit the final structured result with `submit_result`. @@ -61,6 +65,29 @@ any case where you are unsure which context is correct. - Do not debug or explain root cause. - Do not propose fixes. +## Test case style examples + +Use concise, manual-QA-style titles and steps like these. These are examples of +tone and granularity only; do not generate these exact cases unless they are +inside the requested test scope. + +Example test case: Verify that the user can add multiple Highlights to the text inside the PDF file + +Example test steps: + +1. 
Click the Highlight button. +2. Add several Highlights to any text inside the PDF file. +3. Save or Print the PDF file and reopen the file in a new Tab. + +Example test case: Ensure that rich entities are shown in Address Bar history if +the user interacted with them + +Example test steps: + +1. Click inside the Address Bar, select the google search shortcut. +2. Select a rich suggestion and press enter. +3. Open a new tab and type the first letters of the previously searched term. + ## Unsuitable cases Mark a case as `unsuitable` only if it requires: diff --git a/services/hackbot-api/app/schemas.py b/services/hackbot-api/app/schemas.py index 8e01996453..d38cee7ce7 100644 --- a/services/hackbot-api/app/schemas.py +++ b/services/hackbot-api/app/schemas.py @@ -84,8 +84,9 @@ def _require_subject(self) -> "AutowebcompatReproInputs": class TestPlanGeneratorInputs(BaseModel): - feature: str - feature_details: str + feature_name: str + feature_description: str + test_scope: str model: str | None = None max_turns: int | None = None effort: str | None = None diff --git a/services/hackbot-api/tests/test_agents.py b/services/hackbot-api/tests/test_agents.py index 7f0d8abf6a..6816efbe49 100644 --- a/services/hackbot-api/tests/test_agents.py +++ b/services/hackbot-api/tests/test_agents.py @@ -39,22 +39,27 @@ def test_bug_fix_registry_uses_default_env_serializer(): assert spec.input_schema is BugFixInputs -def test_test_plan_generator_inputs_require_feature_details(): +def test_test_plan_generator_inputs_require_feature_description(): with pytest.raises(ValidationError): - PlanGeneratorInputs(feature="Bookmarks and History") + PlanGeneratorInputs( + feature_name="Bookmarks and History", + test_scope="Bookmarks toolbar behavior.", + ) def test_test_plan_generator_env_serialization(): env = model_to_env( PlanGeneratorInputs( - feature="Bookmarks and History", - feature_details="Bookmarks toolbar behavior", + feature_name="Bookmarks and History", + feature_description="Bookmarks 
and history controls in Firefox.", + test_scope="Bookmarks toolbar behavior.", ) ) assert env == { - "FEATURE": "Bookmarks and History", - "FEATURE_DETAILS": "Bookmarks toolbar behavior", + "FEATURE_NAME": "Bookmarks and History", + "FEATURE_DESCRIPTION": "Bookmarks and history controls in Firefox.", + "TEST_SCOPE": "Bookmarks toolbar behavior.", } From a8a09b0d0b8a4238a90718d92d957e4dc51466d2 Mon Sep 17 00:00:00 2001 From: John Pangas Date: Fri, 26 Jun 2026 01:55:00 -0600 Subject: [PATCH 3/3] Cap the generated test cases --- .../test_plan_generator/agent.py | 6 +-- .../test_plan_generator/prompts/system.md | 20 ++++++--- .../test_plan_generator/result.py | 44 +++++++++++++++---- 3 files changed, 53 insertions(+), 17 deletions(-) diff --git a/agents/test-plan-generator/hackbot_agents/test_plan_generator/agent.py b/agents/test-plan-generator/hackbot_agents/test_plan_generator/agent.py index 0e4b81128d..a680f52d17 100644 --- a/agents/test-plan-generator/hackbot_agents/test_plan_generator/agent.py +++ b/agents/test-plan-generator/hackbot_agents/test_plan_generator/agent.py @@ -47,9 +47,9 @@ def build_user_prompt( f"Test scope:\n{test_scope}\n\n" "Use the provided feature name as the structured result feature. The " "generated test cases must stay within the test scope.\n\n" - "Follow the required workflow exactly: generate 10 cases first, run " - "them in order, stop each case on first failed step, and submit the " - "structured result." + "Follow the required workflow exactly: generate the appropriate number " + "of cases first, with no more than 20 cases, run them in order, stop " + "each case on first failed step, and submit the structured result." 
) diff --git a/agents/test-plan-generator/hackbot_agents/test_plan_generator/prompts/system.md b/agents/test-plan-generator/hackbot_agents/test_plan_generator/prompts/system.md index 45c5913362..dd79d0b203 100644 --- a/agents/test-plan-generator/hackbot_agents/test_plan_generator/prompts/system.md +++ b/agents/test-plan-generator/hackbot_agents/test_plan_generator/prompts/system.md @@ -7,7 +7,8 @@ diagnose, fix, patch, or propose changes. ## Required workflow -1. Generate exactly 10 test cases before running any case. +1. Generate the appropriate number of test cases before running any case. + Generate no more than 20 test cases. - Use only the feature name, feature description, and test scope as your source material. - Use the provided feature name as the structured result feature. @@ -16,7 +17,7 @@ diagnose, fix, patch, or propose changes. - A short title. - A selected execution context: `chrome` or `content`. - Use concise ordered test steps. -3. Run the 10 test cases in order. +3. Run the test cases and test steps in order. 4. Submit the final structured result with `submit_result`. ## Context selection @@ -61,9 +62,13 @@ any case where you are unsure which context is correct. - Call only the tools needed for the current step. - If a step fails, mark that step failed, mark the case failed, stop that case, and move to the next case. +- When a step fails, include a concise failure reason based only on observed + behavior. The reason should help developers understand what failed later, but + it must not speculate beyond the evidence or propose a fix. +- When a case fails or is unsuitable, include a concise case-level failure + reason. - Do not try alternate approaches to make a failing step pass. -- Do not debug or explain root cause. -- Do not propose fixes. +- Do not debug deeply, infer root cause, or propose fixes. 
## Test case style examples @@ -103,5 +108,8 @@ Mark a case as `unsuitable` only if it requires: ## Reporting The final answer must be submitted through `submit_result` exactly once. A prose -message is not enough. Include exactly 10 generated test cases and exactly 10 -case results. +message is not enough. Include one case result for every generated test case. + +For failed steps, set `failure_reason` to a short explanation of the observed +failure. For failed or unsuitable cases, set the case-level `failure_reason` as +well. Leave `failure_reason` empty for passed steps and passed cases. diff --git a/agents/test-plan-generator/hackbot_agents/test_plan_generator/result.py b/agents/test-plan-generator/hackbot_agents/test_plan_generator/result.py index d8a36a048f..50c702709c 100644 --- a/agents/test-plan-generator/hackbot_agents/test_plan_generator/result.py +++ b/agents/test-plan-generator/hackbot_agents/test_plan_generator/result.py @@ -12,7 +12,7 @@ class GeneratedTestCase(BaseModel): - id: int = Field(description="Sequential case id from 1 through 10.") + id: int = Field(description="Sequential case id starting at 1.") title: str context: Literal["chrome", "content"] preconditions: str | None = None @@ -31,6 +31,19 @@ class StepResult(BaseModel): step_number: int status: Literal["passed", "failed", "not_run"] observation: str + failure_reason: str | None = Field( + default=None, + description=( + "Required when status is failed. A concise reason why the step failed, " + "based only on what was observed during execution." 
+ ), + ) + + @model_validator(mode="after") + def _validate_failure_reason(self) -> "StepResult": + if self.status == "failed" and not self.failure_reason: + raise ValueError("failed steps must include failure_reason") + return self class TestCaseResult(BaseModel): @@ -38,6 +51,19 @@ class TestCaseResult(BaseModel): status: Literal["passed", "failed", "unsuitable"] step_results: list[StepResult] summary: str + failure_reason: str | None = Field( + default=None, + description=( + "Required when status is failed or unsuitable. A concise reason why " + "the case failed or could not be run, useful for later developer review." + ), + ) + + @model_validator(mode="after") + def _validate_failure_reason(self) -> "TestCaseResult": + if self.status in {"failed", "unsuitable"} and not self.failure_reason: + raise ValueError("failed or unsuitable cases must include failure_reason") + return self class TestPlanResult(BaseModel): @@ -48,17 +74,19 @@ class TestPlanResult(BaseModel): @model_validator(mode="after") def _validate_result(self) -> "TestPlanResult": - if len(self.generated_test_cases) != 10: - raise ValueError("generated_test_cases must contain exactly 10 cases") + case_count = len(self.generated_test_cases) + if not 1 <= case_count <= 20: + raise ValueError("generated_test_cases must contain 1 to 20 cases") case_ids = [case.id for case in self.generated_test_cases] - if case_ids != list(range(1, 11)): - raise ValueError("generated test case ids must be 1 through 10") + expected_ids = list(range(1, case_count + 1)) + if case_ids != expected_ids: + raise ValueError("generated test case ids must be sequential starting at 1") result_ids = [result.id for result in self.results] - if result_ids != list(range(1, 11)): + if result_ids != expected_ids: raise ValueError( - "results must contain one result for each case id 1 through 10" + "results must contain one result for each generated test case id" ) return self @@ -83,7 +111,7 @@ def build_result_server(collector: 
ResultCollector) -> McpServerConfig: @tool( "submit_result", "Submit the final generated Firefox QA test plan and execution result. " - "Call exactly once, after all 10 test cases have been generated and run.", + "Call exactly once, after all generated test cases have been run.", SUBMIT_RESULT_SCHEMA, ) async def submit_result(args: dict) -> dict: